/*
 * Copyright (c) 2023 NetInt
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#ifdef __linux__
#include <sys/ioctl.h>
#include <sys/sysinfo.h>
#endif
#include <sys/time.h>
#include <getopt.h>
#include <pthread.h>
#include <signal.h>

#include <ni_rsrc_api.h>

#include <libavcodec/avcodec.h>
#if LIBAVCODEC_VERSION_MAJOR >= 59
#include <libavcodec/bsf.h>
#endif

#include <libavfilter/buffersink.h>
#include <libavfilter/buffersrc.h>

#include <libavformat/avformat.h>

#include <libavutil/opt.h>
#include <libavutil/imgutils.h>
#include <libavutil/time.h>
#include <libavutil/avassert.h>
#include <libavutil/pixdesc.h>


/*
 * container_of - recover a pointer to the enclosing structure from a pointer
 * to one of its members.
 *   ptr:    pointer to the member
 *   type:   type of the enclosing structure
 *   member: name of the member inside type
 *
 * The member's byte offset is obtained from its address in a null-based
 * object and subtracted from ptr.
 */
#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - (size_t)&(((type *)0)->member)))

/* list_entry - container_of() under the name used for list links. */
#define list_entry(ptr, type, member) \
    container_of(ptr, type, member)

/*
 * list_first_entry - the node whose link follows the list head ptr.
 * On an empty list head->next is the head itself, so the result is not a
 * valid node; callers must make sure the list is non-empty first.
 */
#define list_first_entry(ptr, type, field)  list_entry((ptr)->next, type, field)

/*
 * Severity levels, ordered from most to least verbose.
 * NOTE(review): the only reference in the code shown here is the
 * commented-out debug_level variable; logging goes through av_log().
 */
enum {
    DEBUG_LEVEL_DEBUG,
    DEBUG_LEVEL_INFO,
    DEBUG_LEVEL_WARN,
    DEBUG_LEVEL_ERROR,
};

/*
 * Link of a circular doubly-linked list. An initialized, empty head points
 * to itself in both directions; nodes embed one of these as a member.
 */
struct list_head {
    struct list_head *next, *prev;
};

/* One queued input frame waiting for the filter thread. */
struct input_frame_node {
    AVFrame *frame;             /* NULL is treated by the filter thread as a flush request */
    int input_idx;              /* index of the buffer source the frame is fed to */
    struct list_head link;      /* membership in filter_thread_context.input_frame_list */
};

/* State of the filter thread and its queue of pending input frames. */
struct filter_thread_context {
    pthread_t tid;
    struct tile_codec_context *codec_ctx;   /* back pointer to the shared context */
    int nb_input_frames;                    /* number of nodes in input_frame_list */
    struct list_head input_frame_list;      /* queue of struct input_frame_node */
    pthread_mutex_t inst_mtx;               /* protects input_frame_list / nb_input_frames */
    pthread_cond_t inst_cond;               /* waited on by the filter thread while the queue is empty */
    int frame_queue_depth;                  /* presumably the queue length limit; not used in the code shown here */
};

/* State of the repack thread that consumes the per-tile packet queues. */
struct repack_thread_context {
    pthread_t tid;
    struct tile_codec_context *codec_ctx;   /* back pointer to the shared context */
    pthread_mutex_t inst_mtx;               /* held while signalling inst_cond */
    pthread_cond_t inst_cond;               /* signalled when a packet is queued and when an encode thread exits */
    bool need_repack;                       /* NOTE(review): not read in the code shown here */
};

/* One filtered frame queued for a tile encoder. */
struct tiled_frame_node {
    AVFrame *frame;             /* NULL asks the encode thread to flush the encoder */
    struct list_head link;      /* membership in codec_instance.tiled_frame_list */
};

/* One encoded packet queued on a tile's output list. */
struct tiled_packet_node {
    AVPacket pkt;               /* holds the packet data moved in by packet_dispatch() */
    struct list_head link;      /* membership in codec_instance.tiled_packet_list */
};

/*
 * Geometry of one tile. x and y are passed to the raw-to-tile bitstream
 * filter as its "x"/"y" options; w and h are presumably the tile size in
 * pixels (not referenced in the code shown here).
 */
typedef struct TileInfo {
    int w, h, x, y;
} TileInfo;

/*
 * State of one tile encoder: the encoder context, its raw-to-tile bitstream
 * filter, optional dump files, and the per-tile frame (input) and packet
 * (output) queues.
 */
struct codec_instance {
    int id;                             /* instance number; used in log lines and dump file names */
    FILE *encoder_fp;                   /* dump of raw encoder output; only opened when logging is enabled */
    FILE *rawtotile_fp;                 /* dump of raw-to-tile filter output; only opened when logging is enabled */
    AVPacket pkt;                       /* scratch packet the encode thread passes to do_encode() */
    const AVCodec *encoder;
    AVCodecContext *enc_ctx;
    const char *codec_params;           /* applied as the encoder's "xcoder-params" option when non-NULL */
    size_t frame_number;                /* number of packets received from the encoder so far */
    TileInfo tile_info;
    AVBSFContext *raw_to_tile_bsf_ctx;

    int nb_tiled_frames;                /* number of nodes in tiled_frame_list */
    int nb_tiled_packets;               /* number of nodes in tiled_packet_list */
    struct list_head tiled_frame_list;  /* queue of struct tiled_frame_node */
    struct list_head tiled_packet_list; /* queue of struct tiled_packet_node */

    pthread_mutex_t pkt_mtx;            /* protects tiled_packet_list / nb_tiled_packets */
    pthread_mutex_t frm_mtx;            /* protects tiled_frame_list / nb_tiled_frames */
    bool need_repack;                   /* true: run packets through the raw-to-tile filter; false: queue them as-is */
    int frame_queue_depth;              /* frame_dispatch() yields while nb_tiled_frames exceeds this */
};

/* Work item telling an encode thread which instance has a frame queued. */
struct codec_instance_node {
    struct codec_instance *inst;
    struct list_head link;      /* membership in tile_thread_context.inst_list */
};

/* State of one encode thread and its queue of pending work items. */
struct tile_thread_context {
    pthread_t tid;
    int nb_instances;                       /* number of nodes in inst_list */
    struct tile_codec_context *codec_ctx;   /* back pointer to the shared context */
    struct list_head inst_list;             /* queue of struct codec_instance_node */
    pthread_mutex_t inst_mtx;               /* protects inst_list / nb_instances */
    pthread_cond_t inst_cond;               /* signalled by frame_dispatch() when work is queued */
};

/*
 * Process-wide state shared by the filter, encode and repack threads.
 * A single instance is reachable through the file-scope pointer
 * p_tile_codec_ctx.
 */
struct tile_codec_context {
    struct filter_thread_context *filter_worker;
    int active_filter_worker;           /* incremented by the filter thread once it is running */
    struct codec_instance *instances;   /* array indexed by tile; nb_inputs entries are iterated */
    struct tile_thread_context *workers;/* array of encode threads, indexed modulo total_threads */
    int active_workers;                 /* number of encode threads currently running */
    int total_threads;                  /* number of encode threads expected to start */
    int finish;                         /* set by the SIGINT handler; makes worker loops exit */
    int input_finish;                   /* checked by the filter thread; set outside the code shown here */
    int filter_finish;                  /* set once filtering is done; lets encode threads exit when idle */
    int split_row;                      /* "row" option of the raw-to-tile filter */
    int split_col;                      /* "column" option of the raw-to-tile filter */
    int video_width;                    /* "width" option of the raw-to-tile filter */
    int video_height;                   /* "height" option of the raw-to-tile filter */
    int nb_inputs;
    int nb_encoders;
    int nb_frames;
    AVRational sar;                     /* sample aspect ratio read from the input video stream */
    pthread_mutex_t mutex;              /* protects active_workers / active_filter_worker */
    pthread_cond_t cond;                /* signalled when worker threads report ready */
    TileInfo *tile_info;
    AVFormatContext *ifmt_ctx;
    AVFormatContext *ofmt_ctx;          /* set by output_file_open() */
    AVFilterGraph *filter_graph;        /* set by filter_graph_open() */
    AVFilterContext **bufsrc_ctxes;     /* buffer sources, allocated with nb_inputs slots */
    AVFilterContext **bufsink_ctxes;    /* buffer sinks, allocated with nb_inputs slots */
    AVBSFContext *tile_repack_bsf_ctx;  /* set by tile_repack_bsf_init() */

    struct repack_thread_context *repack_worker;
    int active_repack_worker;
};

//static int debug_level = DEBUG_LEVEL_DEBUG;
/* The single shared context; read by the worker threads and the SIGINT handler. */
static struct tile_codec_context *p_tile_codec_ctx;

/* Make list an empty list: both links point back at the head itself. */
static inline void INIT_LIST_HEAD(struct list_head *list)
{
    list->prev = list;
    list->next = list;
}

/* Return non-zero when the list headed by head contains no nodes. */
static inline int list_empty(const struct list_head *head)
{
    const struct list_head *first = head->next;

    return first == head;
}

/* Splice node between two links that are known to be adjacent. */
static inline void __list_add(struct list_head *node, struct list_head *prev, struct list_head *next)
{
    /* Wire up the new node first, then redirect its neighbours to it. */
    node->prev = prev;
    node->next = next;
    prev->next = node;
    next->prev = node;
}

/* Insert _new right after head, i.e. at the front of the list. */
static inline void list_add(struct list_head *_new, struct list_head *head)
{
    struct list_head *first = head->next;

    __list_add(_new, head, first);
}

/* Insert _new right before head, i.e. at the back of the list. */
static inline void list_add_tail(struct list_head *_new, struct list_head *head)
{
    struct list_head *last = head->prev;

    __list_add(_new, last, head);
}

/* Unlink entry by joining its two neighbours; entry's own links are left as-is. */
static inline void __list_del(struct list_head *entry)
{
    struct list_head *after = entry->next;
    struct list_head *before = entry->prev;

    after->prev = before;
    before->next = after;
}

/* Remove entry from its list and clear its links so stale use is detectable. */
static inline void list_del(struct list_head *entry)
{
    __list_del(entry);
    entry->prev = NULL;
    entry->next = NULL;
}

/*
 * SIGINT handler: ask all worker loops to stop by raising the shared
 * finish flag.
 *
 * Only async-signal-safe operations are used here. The notice is emitted
 * with write(2) because av_log() formats through stdio and may take locks,
 * which is not safe inside a signal handler. errno is preserved so the
 * interrupted code does not observe a value changed by the handler.
 */
static void sigint_handler(int signo)
{
    static const char notice[] = "sigint_handler().\n";
    int saved_errno = errno;

    (void) signo;

    /* The signal may arrive before the context has been allocated. */
    if (p_tile_codec_ctx) {
        p_tile_codec_ctx->finish = 1;
    }

    if (write(STDERR_FILENO, notice, sizeof(notice) - 1) < 0) {
        /* Nothing useful can be done about a failed diagnostic write. */
    }

    errno = saved_errno;
}

/* Print the command-line option summary to stdout. */
static void usage(void)
{
    static const char help_text[] =
        "Usage: \n"
        "-h | --help                           help info.\n"
        "-s | --size                            [width]x[height] of yuv.\n"
        "-p | --pix_fmt                       pixel format of yuv.\n"
        "-x | --xcoder_params             xcoder parameters.\n"
        "-i | --inputfile                       [tile index]:[input yuv file] (up to 4 inputs).\n"
        "-c | --codec                         codec name.\n"
        "-o | --output_file                   output filename.\n"
        "-t | --tiles                            [column]x[row] of tiles.\n"
        "-j | --threads                        number of threads to use, default value is number of processors.\n"
        "-q | --frame_queue_depth       frame queue depth, default value is 3 (max 10).\n"
        "-f | --pad_filter                     pad input frame to 64x64 aligned instead of cropping input frame.\n"
        "-v | --verbose                       print verbose logs.\n"
        "-b | --bitrate                        bitrate.\n"
        "-r | --fps                             framerate.\n"
        "-g | --log                             output each of the encoded bitstreams before tile conversion.\n";

    /* The text contains no conversion specifiers, so it is written verbatim. */
    fputs(help_text, stdout);
}

/*
 * Log and print throughput figures for a finished run.
 *
 *   t_total:      total elapsed time (the * 1000 scaling implies milliseconds)
 *   t_read:       part of t_total spent reading input
 *   frame_number: number of frames processed
 *
 * "Average fps" is computed over the whole run; "Encoder only fps" excludes
 * the read time.
 */
static void print_report(double t_total, double t_read, int64_t frame_number)
{
    /* Scale in floating point so a large frame count cannot overflow int64_t. */
    double frames_x1000 = (double) frame_number * 1000.0;
    double fps_total = frames_x1000 / t_total;
    double fps_encoder = frames_x1000 / (t_total - t_read);

    /* frame_number is int64_t: it must be printed with PRId64, not %d. */
    av_log(NULL, AV_LOG_ERROR, "t_read %f total_time2 %f frame_number %"PRId64" total_framerate2 %f framerate2 %f -- \n",
                    t_read, t_total, frame_number, fps_total, fps_encoder);

    fprintf(stderr, "Encoder only fps=%3.f Average fps=%3.f \n", fps_encoder, fps_total);

    fflush(stderr);
}

/*
 * Return 1 when every tile instance has at least one packet queued, and 0
 * as soon as one instance with an empty packet queue is found. Each queue
 * is inspected under its own pkt_mtx.
 */
static int tile_packet_available(void)
{
    int idx = 0;

    while (idx < p_tile_codec_ctx->nb_inputs) {
        struct codec_instance *cur = &p_tile_codec_ctx->instances[idx];
        int queue_is_empty;

        pthread_mutex_lock(&cur->pkt_mtx);
        queue_is_empty = list_empty(&cur->tiled_packet_list);
        pthread_mutex_unlock(&cur->pkt_mtx);

        if (queue_is_empty) {
            return 0;
        }
        idx++;
    }

    return 1;
}

/*
 * Queue an encoded packet on a tile's output list and wake the repack
 * thread.
 *
 * The packet contents are moved into a newly allocated list node, so
 * packet is left blank on return. Allocation failure aborts through
 * av_assert0().
 */
static void packet_dispatch(struct repack_thread_context *repack_worker,
                           struct codec_instance *inst, AVPacket *packet)
{
    struct tiled_packet_node *node;

    av_log(NULL, AV_LOG_DEBUG, "### %s line %d %s: \n", __FILE__, __LINE__, __func__);

    node = malloc(sizeof *node);
    av_assert0(node);
    av_packet_move_ref(&node->pkt, packet);

    /* Append to the tile's packet queue. */
    pthread_mutex_lock(&inst->pkt_mtx);
    list_add_tail(&node->link, &inst->tiled_packet_list);
    inst->nb_tiled_packets += 1;
    pthread_mutex_unlock(&inst->pkt_mtx);

    /* Tell the repack thread there is something new to look at. */
    pthread_mutex_lock(&repack_worker->inst_mtx);
    pthread_cond_signal(&repack_worker->inst_cond);
    pthread_mutex_unlock(&repack_worker->inst_mtx);
}


#define LEN_OUTPKT_HEADER 96
/*
 * Pass one encoded packet towards the repack thread.
 *
 * When inst->need_repack is set the packet is sent through the instance's
 * raw-to-tile bitstream filter and every packet the filter produces is
 * (optionally) appended to outfile and queued with packet_dispatch().
 * Otherwise the input packet is queued unchanged.
 *
 *   inst:    tile instance the packet belongs to
 *   in_pkt:  packet to process; its contents are consumed
 *   outfile: optional dump file for filtered packets, may be NULL
 *
 * Returns 0 on the pass-through path. On the filter path it returns the
 * status that ended the receive loop: AVERROR(EAGAIN) once the filter is
 * drained, AVERROR_EOF at end of stream, or another negative error code.
 */
static int raw_to_tile_bsf(struct codec_instance *inst, AVPacket *in_pkt, FILE *outfile)
{
    int ret = 0;                /* the pass-through path has no other status to report */
    AVPacket out_pkt = { 0 };   /* the bsf API expects a clean packet to fill */
    AVBSFContext *bsf_ctx;
    struct repack_thread_context *repack_worker = p_tile_codec_ctx->repack_worker;

    av_log(NULL, AV_LOG_DEBUG, "### %s line %d %s: \n", __FILE__, __LINE__, __func__);

    if (in_pkt) {
        av_log(NULL, AV_LOG_INFO, "%d: send in_pkt: size %d,dts %"PRId64",pts %"PRId64",sd elems %d\n",
               inst->id, in_pkt->size, in_pkt->dts, in_pkt->pts, in_pkt->side_data_elems);
    }

    if (inst->need_repack)
    {
        bsf_ctx = inst->raw_to_tile_bsf_ctx;
        ret = av_bsf_send_packet(bsf_ctx, in_pkt);
        if (ret < 0) {
            av_log(NULL, AV_LOG_ERROR, "failed to send packet to bsf\n");
            goto out;
        }

        /* Drain everything the filter can produce for this input. */
        for (; ;) {
            ret = av_bsf_receive_packet(bsf_ctx, &out_pkt);  //hevc_rawtotile_filter()
            if (ret == AVERROR(EAGAIN)) {
                break;
            } else if (ret < 0) {
                if (ret == AVERROR_EOF)
                    av_log(NULL, AV_LOG_ERROR, "eof reach\n");
                else
                    av_log(NULL, AV_LOG_ERROR, "failed to receive packet %p from bsf: %s\n", (void *) in_pkt, av_err2str(ret));
                break;
            }

            av_log(NULL, AV_LOG_DEBUG, "### %s line %d %s: %d: recv out_pkt: size %d,dts %"PRId64",pts %"PRId64",sd elems %d\n",
                    __FILE__, __LINE__, __func__, inst->id, out_pkt.size, out_pkt.dts, out_pkt.pts, out_pkt.side_data_elems);

            if (outfile) {
                    fwrite(out_pkt.data, 1, out_pkt.size, outfile);
            }
            /* Moves the data out of out_pkt, leaving it clean for the next round. */
            packet_dispatch(repack_worker, inst, &out_pkt);
        }
    }
    else if (in_pkt)
    {
        /* No filtering requested: queue the encoder packet as-is. */
        packet_dispatch(repack_worker, inst, in_pkt);
    }

out:
    return ret;
}

/*
 * Receive every packet the encoder currently has available and hand each
 * one to raw_to_tile_bsf(), optionally appending the raw packet to outfile.
 *
 * Returns the status that stopped the loop: AVERROR(EAGAIN) when the
 * encoder needs more input, AVERROR_EOF when it is fully flushed, or
 * another negative error code.
 */
static int do_encode_drain_packets(struct codec_instance *inst, AVPacket *pkt, FILE *outfile)
{
    int ret;

    for (; ;) {
        ret = avcodec_receive_packet(inst->enc_ctx, pkt);
        if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
            return ret;
        } else if (ret < 0) {
            fprintf(stderr, "Error during encoding (%d)\n", ret);
            return ret;
        }

        /* frame_number is size_t, hence %zu. */
        av_log(NULL, AV_LOG_DEBUG, "### %s line %d %s: frame_number %zu pkt->size %d\n", __FILE__, __LINE__, __func__, inst->frame_number, pkt->size);

        inst->frame_number++;
        if (outfile) {
            fwrite(pkt->data, 1, pkt->size, outfile);
        }

        raw_to_tile_bsf(inst, pkt, inst->rawtotile_fp);

        av_packet_unref(pkt);
    }
}

/*
 * Submit one frame to the instance's encoder and process all packets that
 * become available.
 *
 *   inst:    tile instance owning the encoder
 *   frame:   frame to encode, or NULL to start flushing the encoder
 *   pkt:     scratch packet used for receiving encoder output
 *   outfile: optional dump file for raw encoder output, may be NULL
 *
 * Returns AVERROR(EAGAIN) when the encoder wants more input, AVERROR_EOF
 * when it has been flushed, or another negative error code on failure.
 */
static int do_encode(struct codec_instance *inst, AVFrame *frame, AVPacket *pkt, FILE *outfile)
{
    int ret;
    AVCodecContext *enc_ctx;

    av_log(NULL, AV_LOG_DEBUG, "### %s line %d %s: \n", __FILE__, __LINE__, __func__);

    /* send the frame to the encoder */
    if (frame) {
        av_log(NULL, AV_LOG_INFO, "%d: Send frame %3"PRId64", type=%d\n", inst->id, frame->pts, frame->pict_type);
    }

    enc_ctx = inst->enc_ctx;
    for (; ;) {
        ret = avcodec_send_frame(enc_ctx, frame);
        if (ret != AVERROR(EAGAIN)) {
            break;
        }

        /*
         * EAGAIN means the encoder cannot take input until its output has
         * been read; retrying the send without draining can spin forever.
         */
        ret = do_encode_drain_packets(inst, pkt, outfile);
        if (ret < 0 && ret != AVERROR(EAGAIN) && ret != AVERROR_EOF) {
            return ret;
        }
    }

    if (ret == AVERROR_EOF) {
        return ret;
    } else if (ret < 0) {
        fprintf(stderr, "Error sending a frame for encoding (%d)\n", ret);
        return ret;
    }

    return do_encode_drain_packets(inst, pkt, outfile);
}

/*
 * Create and initialise the "hevc_rawtotile" bitstream filter for one tile
 * instance.
 *
 * The filter options are built from the shared context (full picture size
 * and tile grid) and the instance's tile position; the filter's input
 * parameters are taken from inst->enc_ctx, which must already be open.
 *
 * On success the filter is stored in inst->raw_to_tile_bsf_ctx and 0 is
 * returned. On failure a negative AVERROR code is returned, the partially
 * built filter is freed and inst is left untouched.
 */
static int raw_to_tile_bsf_init(struct codec_instance *inst)
{
    int ret, video_width, video_height, split_row, split_col;
    char options[128] = { 0 };
    AVBSFContext *bsf_ctx = NULL;

    av_log(NULL, AV_LOG_INFO, "ready to init bsf\n");

    video_width = p_tile_codec_ctx->video_width;
    video_height = p_tile_codec_ctx->video_height;
    split_row = p_tile_codec_ctx->split_row;
    split_col = p_tile_codec_ctx->split_col;
    snprintf(options, sizeof(options), "%s_rawtotile=width=%d:height=%d:column=%d:row=%d:x=%d:y=%d",
             "hevc", video_width, video_height, split_col, split_row, inst->tile_info.x, inst->tile_info.y);
    av_log(NULL, AV_LOG_ERROR, "raw_to_tile_bsf_init options: %s\n", options);

    ret = av_bsf_list_parse_str(options, &bsf_ctx);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "failed to allocate bsf\n");
        return ret;
    }

    /* Fill the filter's input parameters straight from the encoder context. */
    ret = avcodec_parameters_from_context(bsf_ctx->par_in, inst->enc_ctx);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "failed to copy encoder parameters to bsf: %s\n", av_err2str(ret));
        goto fail;
    }

    ret = av_bsf_init(bsf_ctx);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "Error initialize bitstream filter = %s\n", av_err2str(ret));
        goto fail;
    }

    inst->raw_to_tile_bsf_ctx = bsf_ctx;
    av_log(NULL, AV_LOG_INFO, "bsf init done\n");

    return 0;

fail:
    /* Do not leak the filter when it could not be brought up. */
    av_bsf_free(&bsf_ctx);
    return ret;
}

/*
 * Create and initialise the "hevc_tile_repack" bitstream filter.
 *
 *   stream:   stream whose codec parameters become the filter's input
 *             parameters
 *   tile_num: value of the filter's "tile_num" option
 *
 * On success the filter is stored in p_tile_codec_ctx->tile_repack_bsf_ctx
 * and 0 is returned. On failure a negative AVERROR code is returned and the
 * partially built filter is freed.
 */
static int tile_repack_bsf_init(AVStream *stream, int tile_num)
{
    int ret;
    AVBSFContext *bsf_ctx = NULL;
    char options[128] = { 0 };

    snprintf(options, sizeof(options), "%s_tile_repack=tile_num=%d",
                "hevc", tile_num);
    av_log(NULL, AV_LOG_ERROR, "tile_repack_bsf_init options: %s\n", options);

    ret = av_bsf_list_parse_str(options, &bsf_ctx);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "failed to parse tile repack bsf\n");
        return ret;
    }

    ret = avcodec_parameters_copy(bsf_ctx->par_in, stream->codecpar);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "failed to copy stream parameters to tile repack bsf\n");
        goto fail;
    }

    ret = av_bsf_init(bsf_ctx);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "Error initialize bitstream filter\n");
        goto fail;
    }

    p_tile_codec_ctx->tile_repack_bsf_ctx = bsf_ctx;
    av_log(NULL, AV_LOG_INFO, "tile repack bsf init done\n");

    return 0;

fail:
    /* Do not leak the filter when it could not be brought up. */
    av_bsf_free(&bsf_ctx);
    return ret;
}


// encoder threads APIs

/*
 * Block the caller until the number of running encode threads has reached
 * total_threads. Each encode thread bumps active_workers under the shared
 * mutex when it starts.
 */
static void encode_thread_sync(void)
{
    pthread_mutex_t *lock = &p_tile_codec_ctx->mutex;

    pthread_mutex_lock(lock);
    for (; ;) {
        if (p_tile_codec_ctx->active_workers >= p_tile_codec_ctx->total_threads) {
            break;
        }
        pthread_cond_wait(&p_tile_codec_ctx->cond, lock);
    }
    pthread_mutex_unlock(lock);
}

/*
 * Entry point of an encode thread.
 *
 * arg is the thread's struct tile_thread_context. The thread reports itself
 * ready, then repeatedly takes one work item from its inst_list, pops the
 * oldest frame from that instance's tiled_frame_list and encodes it with
 * do_encode(). A NULL frame flushes the encoder. The loop ends when
 * ctx->finish is set or when the work list is empty and filtering has
 * finished. On exit the thread decrements active_workers and wakes the
 * repack thread. Always returns NULL.
 */
static void *encode_routine(void *arg)
{
    struct tile_thread_context *worker = (struct tile_thread_context *) arg;
    struct tiled_frame_node *tfn;
    struct codec_instance_node *cin;
    struct codec_instance *inst;
    struct tile_codec_context *ctx = worker->codec_ctx;
    AVFrame *frame;
    struct repack_thread_context *repack_worker;

    /* NOTE(review): pthread_t is printed with %ld; that assumes an integer pthread_t. */
    av_log(NULL, AV_LOG_INFO, "tile_thread_context %ld enter.\n", worker->tid);

    // I am ready
    pthread_mutex_lock(&ctx->mutex);
    ctx->active_workers++;
    /* The last thread to arrive releases whoever waits in encode_thread_sync(). */
    if (ctx->active_workers == ctx->total_threads) {
        pthread_cond_signal(&ctx->cond);
    }
    pthread_mutex_unlock(&ctx->mutex);

    for (; ;) {
        pthread_mutex_lock(&worker->inst_mtx);

        /*
         * Sleep until work is queued or filtering is done.
         * NOTE(review): ctx->finish is not part of this wait condition and
         * the visible writers of finish/filter_finish do not signal
         * inst_cond; confirm a wakeup is issued elsewhere on shutdown.
         */
        while (list_empty(&worker->inst_list) && !ctx->filter_finish) {
            pthread_cond_wait(&worker->inst_cond, &worker->inst_mtx);
        }

        if (ctx->finish || (list_empty(&worker->inst_list) && ctx->filter_finish)) {
            // End of all frame encoding including flushing.
            pthread_mutex_unlock(&worker->inst_mtx);
            break;
        }

        /* Take the oldest work item; it names the instance that has a frame queued. */
        cin = list_first_entry(&worker->inst_list, struct codec_instance_node, link);
        av_assert0(cin != NULL && cin->inst != NULL);
        list_del(&cin->link);
        inst = cin->inst;
        worker->nb_instances--;

        pthread_mutex_unlock(&worker->inst_mtx);

        /* frame_dispatch() queues the frame before the work item, so one is expected here. */
        pthread_mutex_lock(&inst->frm_mtx);
        tfn = list_first_entry(&cin->inst->tiled_frame_list, struct tiled_frame_node, link);
        av_assert0(tfn != NULL);
        list_del(&tfn->link);
        inst->nb_tiled_frames--;
        pthread_mutex_unlock(&inst->frm_mtx);
        frame = tfn->frame;
        /* Both list nodes were allocated by frame_dispatch() and are owned here. */
        free(tfn);
        free(cin);

        if (!frame) {
            /* flush the encoder */
            do_encode(inst, NULL, &inst->pkt, inst->encoder_fp);
        } else {
            av_log(NULL, AV_LOG_INFO, "%d: Fetch frame pts %ld\n", inst->id, frame->pts);
            do_encode(inst, frame, &inst->pkt, inst->encoder_fp);
            av_frame_free(&frame);
        }
    }

    // Release tile codec context condvar.
    pthread_mutex_lock(&ctx->mutex);
    ctx->active_workers--;
    pthread_mutex_unlock(&ctx->mutex);

    // signal repack thread
    repack_worker = p_tile_codec_ctx->repack_worker;
    pthread_mutex_lock(&repack_worker->inst_mtx);
    pthread_cond_signal(&repack_worker->inst_cond);
    pthread_mutex_unlock(&repack_worker->inst_mtx);

    av_log(NULL, AV_LOG_INFO, "tile_thread_context %ld exit\n", worker->tid);
    return (void *) 0;
}

/*
 * Hand one filtered frame to an encode thread.
 *
 * The frame is appended to the instance's frame queue, then a work item
 * naming the instance is appended to the worker's list and the worker is
 * signalled. Before queueing, the caller yields the CPU for as long as the
 * instance already holds more than frame_queue_depth frames, unless the
 * shared finish flag is set. A NULL frame is queued like any other and asks
 * the encode thread to flush. Allocation failures abort through
 * av_assert0().
 */
static void frame_dispatch(struct tile_thread_context *worker,
                           struct codec_instance *inst, AVFrame *frame)
{
    struct tiled_frame_node *frame_node;
    struct codec_instance_node *work_item;

    av_log(NULL, AV_LOG_DEBUG, "### %s line %d %s: \n", __FILE__, __LINE__, __func__);

    frame_node = malloc(sizeof *frame_node);
    av_assert0(frame_node);

    /* Back-pressure: wait for the encoder side to catch up. */
    for (; ;) {
        if (inst->nb_tiled_frames <= inst->frame_queue_depth) {
            break;
        }
        if (worker->codec_ctx->finish) {
            break;
        }
        sched_yield();
    }

    frame_node->frame = frame;

    pthread_mutex_lock(&inst->frm_mtx);
    list_add_tail(&frame_node->link, &inst->tiled_frame_list);
    inst->nb_tiled_frames += 1;
    pthread_mutex_unlock(&inst->frm_mtx);

    work_item = malloc(sizeof *work_item);
    av_assert0(work_item);
    work_item->inst = inst;

    pthread_mutex_lock(&worker->inst_mtx);
    list_add_tail(&work_item->link, &worker->inst_list);
    worker->nb_instances += 1;
    pthread_cond_signal(&worker->inst_cond);
    pthread_mutex_unlock(&worker->inst_mtx);
}

/*
 * Bring up one tile encoder instance.
 *
 * Initialises the instance mutexes, optionally opens the two dump files
 * ("output-<id>.h265" and "rawtotile-<id>.h265"), allocates and opens the
 * encoder described by inst->encoder, and creates the raw-to-tile bitstream
 * filter. Only encoders whose name starts with "h265_ni" are accepted.
 *
 *   inst:              instance to set up; id, encoder, codec_params and
 *                      tile_info must already be filled in
 *   bit_rate, fps:     encoder bit rate and frame rate
 *   pix_fmt:           pixel format of the frames (software format when
 *                      need_filter is set)
 *   sar:               sample aspect ratio
 *   logging:           non-zero to open the dump files
 *   video_width/height: encoder picture size
 *   need_filter:       true when frames arrive as AV_PIX_FMT_NI_QUAD
 *   need_repack:       stored in inst->need_repack
 *   frame_queue_depth: stored in inst->frame_queue_depth
 *
 * Returns 0 on success. On failure returns a non-zero error (a pthread
 * error number for mutex failures, a negative AVERROR otherwise) after
 * releasing everything this call acquired: only mutexes that were actually
 * initialised are destroyed, opened dump files are closed and their
 * pointers reset to NULL, and the encoder context is freed.
 */
static int codec_instance_init(struct codec_instance *inst, int bit_rate, int fps,
                               enum AVPixelFormat pix_fmt, AVRational sar, int logging, int video_width, int video_height, bool need_filter, bool need_repack, int frame_queue_depth)
{
    int ret;
    int saved_errno;
    const AVCodec *encoder;
    AVCodecContext *enc_ctx = NULL;
    char name[256] = { 0 };
    char rawtotile_name[256] = { 0 };

    ret = pthread_mutex_init(&inst->pkt_mtx, NULL);
    if (ret) {
        /* Nothing has been initialised yet, so there is nothing to undo. */
        return ret;
    }

    ret = pthread_mutex_init(&inst->frm_mtx, NULL);
    if (ret) {
        goto fail_pkt_mtx;
    }

    encoder = inst->encoder;

    snprintf(name, sizeof(name), "output-%d.%s", inst->id, "h265");

    snprintf(rawtotile_name, sizeof(rawtotile_name), "rawtotile-%d.%s", inst->id, "h265");

    if (logging) {
        inst->encoder_fp = fopen(name, "wb");
        if (!inst->encoder_fp) {
            /* Capture errno before any call that might overwrite it. */
            saved_errno = errno;
            av_log(NULL, AV_LOG_ERROR, "Failed to open output file: %s.\n", strerror(saved_errno));
            ret = AVERROR(saved_errno);
            goto fail_frm_mtx;
        }

        inst->rawtotile_fp = fopen(rawtotile_name, "wb");
        if (!inst->rawtotile_fp) {
            saved_errno = errno;
            av_log(NULL, AV_LOG_ERROR, "Failed to open output file: %s.\n", strerror(saved_errno));
            ret = AVERROR(saved_errno);
            fclose(inst->encoder_fp);
            inst->encoder_fp = NULL;
            goto fail_frm_mtx;
        }
    }

    enc_ctx = avcodec_alloc_context3(encoder);
    if (!enc_ctx) {
        av_log(NULL, AV_LOG_ERROR, "Could not allocate video encoder context\n");
        ret = AVERROR(ENOMEM);
        goto fail;
    }

    /* resolution must be a multiple of two */
    enc_ctx->width = video_width;
    enc_ctx->height = video_height;
    enc_ctx->bit_rate = bit_rate;
    enc_ctx->time_base = (AVRational){1, fps};
    enc_ctx->framerate = (AVRational){fps, 1};
    enc_ctx->sample_aspect_ratio = sar;
    /* emit one intra frame every ten frames
     * check frame pict_type before passing frame
     * to encoder, if frame->pict_type is AV_PICTURE_TYPE_I
     * then gop_size is ignored and the output of encoder
     * will always be I frame irrespective to gop_size
     */
    enc_ctx->gop_size = 0;
    enc_ctx->max_b_frames = 0;
    if (need_filter)
    {
        enc_ctx->pix_fmt = AV_PIX_FMT_NI_QUAD;
        enc_ctx->sw_pix_fmt = pix_fmt;
    }
    else
    {
        enc_ctx->pix_fmt = pix_fmt;
    }

    if (!strncmp(encoder->name, "h265_ni", strlen("h265_ni"))) {
        /* for example:
         * gopPresetIdx=6:lowDelay=1:intraPeriod=120:RcEnable=1:bitrate=4000000
         */
        if (inst->codec_params) {
            av_log(NULL, AV_LOG_ERROR, "param: %s\n", inst->codec_params);
            av_opt_set(enc_ctx->priv_data, "xcoder-params", inst->codec_params, 0);
        }
    } else {
        av_log(NULL, AV_LOG_ERROR, "codec %s not supported.\n", encoder->name);
        ret = AVERROR(EINVAL);
        goto fail;
    }

    av_log(NULL, AV_LOG_INFO, "enc_ctx framerate num %d den %d\n", enc_ctx->framerate.num, enc_ctx->framerate.den);
    ret = avcodec_open2(enc_ctx, encoder, NULL);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "Could not open codec: %s\n",
               av_err2str(ret));
        goto fail;
    }
    inst->enc_ctx = enc_ctx;
    inst->need_repack = need_repack;
    inst->frame_queue_depth = frame_queue_depth;

    ret = raw_to_tile_bsf_init(inst);
    if (ret) {
        av_log(NULL, AV_LOG_ERROR, "Failed to init raw to tile bsf.\n");
        /* The context is freed below; do not leave a dangling pointer behind. */
        inst->enc_ctx = NULL;
        goto fail;
    }

    return 0;

fail:
    /* Reached only after both dump files (if any) were opened successfully. */
    avcodec_free_context(&enc_ctx);
    if (logging) {
        fclose(inst->rawtotile_fp);
        inst->rawtotile_fp = NULL;
        fclose(inst->encoder_fp);
        inst->encoder_fp = NULL;
    }
fail_frm_mtx:
    pthread_mutex_destroy(&inst->frm_mtx);
fail_pkt_mtx:
    pthread_mutex_destroy(&inst->pkt_mtx);
    return ret;
}

/*
 * Initialise the mutex and condition variable of an encode thread context.
 *
 * Returns 0 on success or the pthread error number on failure. On failure
 * only the objects that were successfully initialised are destroyed;
 * destroying an uninitialised mutex or condition variable is undefined.
 */
static int tile_thread_context_init(struct tile_thread_context *worker)
{
    int ret;

    ret = pthread_mutex_init(&worker->inst_mtx, NULL);
    if (ret) {
        return ret;
    }

    ret = pthread_cond_init(&worker->inst_cond, NULL);
    if (ret) {
        pthread_mutex_destroy(&worker->inst_mtx);
        return ret;
    }

    return 0;
}

/*
 * Release everything owned by a tile instance: the bitstream filter, the
 * encoder context, both mutexes and any open dump files. A NULL inst is
 * ignored.
 */
static void codec_instance_cleanup(struct codec_instance *inst)
{
    if (!inst) {
        return;
    }

    av_bsf_free(&inst->raw_to_tile_bsf_ctx);
    avcodec_free_context(&inst->enc_ctx);

    pthread_mutex_destroy(&inst->pkt_mtx);
    pthread_mutex_destroy(&inst->frm_mtx);

    if (inst->encoder_fp) {
        fclose(inst->encoder_fp);
    }
    if (inst->rawtotile_fp) {
        fclose(inst->rawtotile_fp);
    }
}

static int input_file_open(const char *filename, int width, int height, enum AVPixelFormat pix_fmt)
{
    int i, ret, video_stream_index;
    char video_size[16] = { 0 };
#if LIBAVFORMAT_VERSION_MAJOR >= 59
    const
#endif
    AVInputFormat *file_iformat;
    AVDictionary *iformat_opts = NULL;
    AVFormatContext *ifmt_ctx = NULL;
    AVStream *st;

    file_iformat = av_find_input_format("rawvideo");
    if (!file_iformat) {
        av_log(NULL, AV_LOG_ERROR, "failed to find input format\n");
        exit(1);
    }

    snprintf(video_size, sizeof(video_size), "%dx%d", width, height);
    av_dict_set(&iformat_opts, "video_size", video_size, 0);
    av_dict_set(&iformat_opts, "pixel_format", av_get_pix_fmt_name(pix_fmt), 0);
    av_dict_set(&iformat_opts, "scan_all_fmt", NULL, AV_DICT_MATCH_CASE);

    ret = avformat_open_input(&ifmt_ctx, filename, file_iformat, &iformat_opts);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "failed to open input file\n");
        exit(1);
    }

    ret = avformat_find_stream_info(ifmt_ctx, NULL);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "Cannot find stream information\n");
        return ret;
    }

    for (i = 0; i < ifmt_ctx->nb_streams; i++)
    {
      if (ifmt_ctx->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO)
      {
        video_stream_index = i;
        break;
      }
    }
    st = ifmt_ctx->streams[video_stream_index];

    p_tile_codec_ctx->sar = st->codecpar->sample_aspect_ratio;

    return 0;
}

/*
 * Create the output muxer, add one video stream and write the file header.
 *
 *   output_file:   output path; the literal "null" selects the null muxer,
 *                  otherwise the format is guessed from the file extension
 *   width, height: picture size stored in the stream parameters
 *   fps:           frame rate; the stream time base is set to 1/fps
 *   codec_id:      codec of the video stream; for AV_CODEC_ID_HEVC the
 *                  output format is forced to the raw "hevc" muxer
 *
 * On success the muxer is stored in p_tile_codec_ctx->ofmt_ctx and 0 is
 * returned. On failure a negative AVERROR code is returned and the
 * partially built muxer (including an opened output file) is released.
 */
static int output_file_open(const char *output_file, int width, int height, int fps, enum AVCodecID codec_id)
{
    int ret;
    AVStream *out_stream;
    AVFormatContext *ofmt_ctx = NULL;
#if LIBAVFORMAT_VERSION_MAJOR >= 59
    const
#endif
    AVOutputFormat *fmt;

    if (!strcmp(output_file, "null")) {
        avformat_alloc_output_context2(&ofmt_ctx, NULL, "null", NULL);
    } else {
        /* Note: The file extension string should be in output_file here for
                 avformat_alloc_output_context2() to auto-detect output format */
        avformat_alloc_output_context2(&ofmt_ctx, NULL, NULL, output_file);
    }

    if (!ofmt_ctx) {
        av_log(NULL, AV_LOG_ERROR, "Could not create output context\n");
        return AVERROR_UNKNOWN;
    }

    if (codec_id == AV_CODEC_ID_HEVC)
    {
        fmt = av_guess_format("hevc", NULL, NULL);
        if (!fmt) {
            av_log(NULL, AV_LOG_ERROR, "Could not find output format\n");
            ret = AVERROR_UNKNOWN;
            goto fail;
        }
        ofmt_ctx->oformat = fmt;
    }

    //Add Stream
    out_stream = avformat_new_stream(ofmt_ctx, NULL);
    if (!out_stream) {
        av_log(NULL, AV_LOG_ERROR, "Failed allocating output stream\n");
        ret = AVERROR_UNKNOWN;
        goto fail;
    }

    out_stream->codecpar->codec_type = AVMEDIA_TYPE_VIDEO;
    out_stream->codecpar->codec_id = codec_id;
    out_stream->codecpar->width = width;
    out_stream->codecpar->height = height;
    out_stream->time_base.num = 1;
    out_stream->time_base.den = fps;

    av_dump_format(ofmt_ctx, 0, output_file, 1);

    if (!(ofmt_ctx->oformat->flags & AVFMT_NOFILE)) {
        ret = avio_open(&ofmt_ctx->pb, output_file, AVIO_FLAG_WRITE);
        if (ret < 0) {
            av_log(NULL, AV_LOG_ERROR, "Could not open output file '%s'\n", output_file);
            goto fail;
        }
    }

    /* init muxer, write output file header */
    ret = avformat_write_header(ofmt_ctx, NULL);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "Error occurred when opening output file\n");
        goto fail;
    }

    p_tile_codec_ctx->ofmt_ctx = ofmt_ctx;

    return 0;

fail:
    /* Close the file only if this function opened it, then drop the muxer. */
    if (ofmt_ctx->pb && !(ofmt_ctx->oformat->flags & AVFMT_NOFILE)) {
        avio_closep(&ofmt_ctx->pb);
    }
    avformat_free_context(ofmt_ctx);
    return ret;
}

/*
 * Build and configure the filter graph described by graph_desc.
 *
 * One "buffer" source is created and linked for every open input of the
 * parsed graph, and one "buffersink" for every open output; sinks are
 * assigned indices from nb_inputs - 1 downwards in the order the outputs
 * are listed. The graph and both context arrays are stored in
 * p_tile_codec_ctx.
 *
 *   graph_desc:    filter graph description string
 *   width, height: size of the frames fed to the sources
 *   pix_fmt:       pixel format of the frames fed to the sources
 *   fps:           frame rate; the source time base is 1/fps
 *   sar:           pixel aspect ratio of the frames
 *   nb_inputs:     number of slots allocated for sources and for sinks
 *
 * Returns a non-negative value on success or a negative AVERROR code. A
 * graph with more open inputs than nb_inputs is rejected with
 * AVERROR(EINVAL). Whatever was stored in p_tile_codec_ctx before a failure
 * stays there for the caller's regular cleanup.
 */
static int filter_graph_open(const char *graph_desc, int width, int height,
                             enum AVPixelFormat pix_fmt, int fps, AVRational sar, int nb_inputs)
{
    int i, ret;
    /* Start as NULL so the common exit path can free them unconditionally. */
    AVFilterInOut *inputs = NULL, *outputs = NULL, *cur;

    p_tile_codec_ctx->filter_graph = avfilter_graph_alloc();
    if (p_tile_codec_ctx->filter_graph == NULL) {
        av_log(NULL, AV_LOG_ERROR, "failed to allocate filter graph\n");
        ret = AVERROR(ENOMEM);
        goto out;
    }

    ret = avfilter_graph_parse2(p_tile_codec_ctx->filter_graph, graph_desc, &inputs, &outputs);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "failed to parse graph\n");
        goto out;
    }

    /* Zero-filled so slots that are never linked read as NULL. */
    p_tile_codec_ctx->bufsrc_ctxes = calloc(nb_inputs, sizeof(AVFilterContext *));
    av_assert0(p_tile_codec_ctx->bufsrc_ctxes);

    for (cur = inputs, i = 0; cur; cur = cur->next, i++) {
        char args[512] = { 0 };
        char name[255] = { 0 };

        /* The array only has nb_inputs slots; refuse to write past it. */
        if (i >= nb_inputs) {
            av_log(NULL, AV_LOG_ERROR, "filter graph has more than %d inputs\n", nb_inputs);
            ret = AVERROR(EINVAL);
            goto out;
        }

        snprintf(name, sizeof(name), "in_%d", i);
        snprintf(args, sizeof(args), "video_size=%dx%d:pix_fmt=%d:time_base=%d/%d:pixel_aspect=%d/%d",
                 width, height, pix_fmt, 1, fps, sar.num, sar.den);
        av_log(NULL, AV_LOG_ERROR, "input filter name: %s args: %s\n", name, args);
        ret = avfilter_graph_create_filter(&p_tile_codec_ctx->bufsrc_ctxes[i], avfilter_get_by_name("buffer"),
                                           name, args, NULL, p_tile_codec_ctx->filter_graph);
        if (ret < 0) {
            av_log(NULL, AV_LOG_ERROR, "failed to create input filter: %d\n", i);
            goto out;
        }

        ret = avfilter_link(p_tile_codec_ctx->bufsrc_ctxes[i], 0, cur->filter_ctx, cur->pad_idx);
        if (ret < 0) {
            av_log(NULL, AV_LOG_ERROR, "failed to link input filter: %d\n", i);
            goto out;
        }
    }

    p_tile_codec_ctx->bufsink_ctxes = calloc(nb_inputs, sizeof(AVFilterContext *));
    av_assert0(p_tile_codec_ctx->bufsink_ctxes);

    for (cur = outputs, i = nb_inputs - 1; cur && i >= 0; cur = cur->next, i--) {
        char name[255] = { 0 };

        snprintf(name, sizeof(name), "out_%d", i);
        av_log(NULL, AV_LOG_ERROR, "output filter name: %s\n", name);
        ret = avfilter_graph_create_filter(&p_tile_codec_ctx->bufsink_ctxes[i], avfilter_get_by_name("buffersink"),
                                           name, NULL, NULL, p_tile_codec_ctx->filter_graph);
        if (ret < 0) {
            av_log(NULL, AV_LOG_ERROR, "failed to create output filter: %d\n", i);
            goto out;
        }

        ret = avfilter_link(cur->filter_ctx, cur->pad_idx, p_tile_codec_ctx->bufsink_ctxes[i], 0);
        if (ret < 0) {
            av_log(NULL, AV_LOG_ERROR, "failed to link output filter: %d\n", i);
            goto out;
        }
    }

    ret = avfilter_graph_config(p_tile_codec_ctx->filter_graph, NULL);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "failed to config graph filter\n");
        goto out;
    }

out:
    avfilter_inout_free(&inputs);
    avfilter_inout_free(&outputs);
    return ret;
}

/*
 * Push one frame into the filter graph and dispatch every filtered frame
 * that becomes available to the encode threads.
 *
 *   in_frame:  frame to filter, or NULL to signal end of stream on the
 *              given source; the frame is added with
 *              AV_BUFFERSRC_FLAG_KEEP_REF, so the caller keeps its reference
 *   input_idx: index of the buffer source to feed
 *
 * Each sink i is drained until it reports EAGAIN or EOF; frames from sink i
 * go to instance i on worker (i % total_threads). When in_frame is NULL a
 * NULL frame is additionally queued for every instance, so each encoder is
 * flushed, and filter_finish is set.
 *
 * Returns 0 on success or a negative AVERROR code.
 */
static int filter_frame(AVFrame *in_frame, int input_idx)
{
    int i, ret;
    AVFrame *filt_frame;
    struct codec_instance *instances;
    struct tile_thread_context *workers;

    av_log(NULL, AV_LOG_DEBUG, "### %s line %d %s: \n", __FILE__, __LINE__, __func__);

    instances = p_tile_codec_ctx->instances;
    workers = p_tile_codec_ctx->workers;
    if (in_frame) {
        /* The crop fields are size_t (%zu); %p requires void pointers. */
        av_log(NULL, AV_LOG_INFO, "frame pixel=%dx%d, crop=%zu/%zu/%zu/%zu linesize %d/%d/%d "
               "data=%p/%p/%p\n",
               in_frame->width, in_frame->height, in_frame->crop_top,
               in_frame->crop_bottom, in_frame->crop_left, in_frame->crop_right,
               in_frame->linesize[0], in_frame->linesize[1], in_frame->linesize[2],
               (void *) in_frame->data[0], (void *) in_frame->data[1], (void *) in_frame->data[2]);

        av_log(NULL, AV_LOG_INFO, "ref count %d/%d/%d\n",
               in_frame->buf[0] ? av_buffer_get_ref_count(in_frame->buf[0]) : -1,
               in_frame->buf[1] ? av_buffer_get_ref_count(in_frame->buf[1]) : -1,
               in_frame->buf[2] ? av_buffer_get_ref_count(in_frame->buf[2]) : -1);

        /* Clear the picture type so the encoder chooses frame types itself. */
        in_frame->pict_type = AV_PICTURE_TYPE_NONE;
        av_log(NULL, AV_LOG_INFO, "decode frame %3"PRId64", type=%d\n", in_frame->pts, in_frame->pict_type);
    }

    ret = av_buffersrc_add_frame_flags(p_tile_codec_ctx->bufsrc_ctxes[input_idx], in_frame, AV_BUFFERSRC_FLAG_KEEP_REF);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "Failed to add frame to buffersrc\n");
        return ret;
    }

    for (i = 0; i < p_tile_codec_ctx->nb_inputs; i++) {
        for (; ;) {
            filt_frame = av_frame_alloc();
            av_assert0(filt_frame);

            ret = av_buffersink_get_frame(p_tile_codec_ctx->bufsink_ctxes[i], filt_frame);
            if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
                /* Nothing more from this sink for now. */
                av_frame_free(&filt_frame);
                break;
            } else if (ret < 0) {
                av_log(NULL, AV_LOG_ERROR, "%d: failed to get frame from buffersink\n", i);
                av_frame_free(&filt_frame);
                return ret;
            }

            av_log(NULL, AV_LOG_INFO, "%d: filt frame pixel=%dx%d, crop=%zu/%zu/%zu/%zu, linesize=%d/%d/%d "
                       "data=%p/%p/%p\n",
                   i, filt_frame->width, filt_frame->height, filt_frame->crop_top,
                   filt_frame->crop_bottom, filt_frame->crop_left, filt_frame->crop_right,
                   filt_frame->linesize[0], filt_frame->linesize[1], filt_frame->linesize[2],
                   (void *) filt_frame->data[0], (void *) filt_frame->data[1], (void *) filt_frame->data[2]);

            av_log(NULL, AV_LOG_INFO, "filt ref count %d/%d/%d\n",
                   filt_frame->buf[0] ? av_buffer_get_ref_count(filt_frame->buf[0]) : -1,
                   filt_frame->buf[1] ? av_buffer_get_ref_count(filt_frame->buf[1]) : -1,
                   filt_frame->buf[2] ? av_buffer_get_ref_count(filt_frame->buf[2]) : -1);

            /* Ownership of filt_frame passes to the encode thread. */
            frame_dispatch(&workers[i % p_tile_codec_ctx->total_threads], &instances[i], filt_frame);
        }
    }

    /* flush decoder */
    if (!in_frame) {
        for (i = 0; i < p_tile_codec_ctx->nb_inputs; i++) {
            frame_dispatch(&workers[i % p_tile_codec_ctx->total_threads], &instances[i], NULL);
            av_log(NULL, AV_LOG_INFO, "%d: submit flush encoder\n", i);
        }
        p_tile_codec_ctx->filter_finish = 1;
    }

    return 0;
}

// filter thread API

/*
 * Block the caller until active_filter_worker in the global tile codec
 * context becomes non-zero. The counter is checked under the context mutex
 * and the wait uses the context condition variable.
 */
static void filter_thread_sync(void)
{
    pthread_mutex_lock(&p_tile_codec_ctx->mutex);
    for (;;) {
        if (p_tile_codec_ctx->active_filter_worker)
            break;
        pthread_cond_wait(&p_tile_codec_ctx->cond, &p_tile_codec_ctx->mutex);
    }
    pthread_mutex_unlock(&p_tile_codec_ctx->mutex);
}

/*
 * Filter thread entry point.
 *
 * Announces itself through active_filter_worker, then repeatedly pops one
 * node from the worker's input_frame_list and hands its frame to
 * filter_frame(). The loop ends when ctx->finish is set, or when the list is
 * empty and ctx->input_finish is set; filter_finish is raised on the way out.
 *
 * @param arg the struct filter_thread_context owning this thread
 * @return always NULL
 */
static void *filter_routine(void *arg)
{
    struct filter_thread_context *worker = arg;
    struct tile_codec_context *codec = worker->codec_ctx;
    struct input_frame_node *node;
    AVFrame *queued;
    int idx;
    int err;

    av_log(NULL, AV_LOG_INFO, "filter_thread_context %ld enter.\n", worker->tid);

    // Report this thread as running to whoever waits on the context condvar.
    pthread_mutex_lock(&codec->mutex);
    codec->active_filter_worker++;
    pthread_cond_signal(&codec->cond);
    pthread_mutex_unlock(&codec->mutex);

    while (1) {
        pthread_mutex_lock(&worker->inst_mtx);

        // Sleep until a node is queued or the input side reports completion.
        for (;;) {
            if (!list_empty(&worker->input_frame_list) || codec->input_finish)
                break;
            pthread_cond_wait(&worker->inst_cond, &worker->inst_mtx);
        }

        if (codec->finish || (list_empty(&worker->input_frame_list) && codec->input_finish)) {
            // Nothing left to filter (or a stop was requested): leave the loop.
            codec->filter_finish = 1;
            pthread_mutex_unlock(&worker->inst_mtx);
            break;
        }

        // Detach the oldest node while still holding the queue lock.
        node = list_first_entry(&worker->input_frame_list, struct input_frame_node, link);
        av_assert0(node != NULL);
        list_del(&node->link);
        worker->nb_input_frames--;

        pthread_mutex_unlock(&worker->inst_mtx);

        // Copy the payload out so the node itself can be released right away.
        queued = node->frame;
        idx = node->input_idx;
        free(node);

        if (queued) {
            av_log(NULL, AV_LOG_INFO, "filter frame pts %ld\n", queued->pts);
            err = filter_frame(queued, idx);
            if (err < 0) {
                av_log(NULL, AV_LOG_ERROR, "Failed to send frame to filter!\n");
            }
            av_frame_free(&queued);
        } else {
            /* a NULL frame is the flush request; pass it through unchanged */
            filter_frame(NULL, idx);
        }
    }

    // Withdraw from the active worker count.
    pthread_mutex_lock(&codec->mutex);
    codec->active_filter_worker--;
    pthread_mutex_unlock(&codec->mutex);

    av_log(NULL, AV_LOG_INFO, "filter_thread_context %ld exit\n", worker->tid);
    return NULL;
}

/*
 * Queue one input frame for the filter thread.
 *
 * A node carrying frame and input_idx is appended to the worker's
 * input_frame_list and the worker's condition variable is signalled.
 * Before queuing, the caller yields the CPU for as long as the queue holds
 * more than frame_queue_depth entries and codec_ctx->finish is not set.
 *
 * @param filter_worker worker whose queue receives the frame
 * @param frame         frame to queue; stored as-is in the node
 * @param input_idx     index of the input the frame belongs to
 */
static void input_dispatch(struct filter_thread_context *filter_worker,
                           AVFrame *frame, int input_idx)
{
    struct input_frame_node *node;

    av_log(NULL, AV_LOG_DEBUG, "### %s line %d %s: \n", __FILE__, __LINE__, __func__);

    node = malloc(sizeof(*node));
    av_assert0(node);
    node->frame = frame;
    node->input_idx = input_idx;

    // Back-pressure: wait for the consumer to drain the queue unless a stop
    // has been requested.
    for (;;) {
        if (filter_worker->nb_input_frames <= filter_worker->frame_queue_depth)
            break;
        if (filter_worker->codec_ctx->finish)
            break;
        sched_yield();
    }

    pthread_mutex_lock(&filter_worker->inst_mtx);
    list_add_tail(&node->link, &filter_worker->input_frame_list);
    filter_worker->nb_input_frames++;
    pthread_cond_signal(&filter_worker->inst_cond);
    pthread_mutex_unlock(&filter_worker->inst_mtx);
}

/*
 * Initialise the mutex and condition variable of a filter worker.
 *
 * Only objects that were successfully initialised are destroyed on the
 * failure path; destroying an uninitialised mutex or condition variable is
 * undefined behaviour.
 *
 * @param filter_worker worker whose inst_mtx / inst_cond are initialised
 * @return 0 on success, otherwise the error number returned by the failing
 *         pthread_*_init call
 */
static int filter_thread_context_init(struct filter_thread_context *filter_worker)
{
    int ret;

    ret = pthread_mutex_init(&filter_worker->inst_mtx, NULL);
    if (ret) {
        // Nothing has been set up yet, so there is nothing to tear down.
        return ret;
    }

    ret = pthread_cond_init(&filter_worker->inst_cond, NULL);
    if (ret) {
        // The mutex is the only live object at this point.
        pthread_mutex_destroy(&filter_worker->inst_mtx);
        return ret;
    }

    return 0;
}

/*
 * Push one tile packet into the repack bitstream filter and try to pull a
 * repacked packet out of it.
 *
 * @param repack_pkt receives the filter output when one is available
 * @param part_pkt   packet to send to the filter; when NULL nothing is sent
 *                   and only the receive step is performed
 * @return 0 when a packet was received, otherwise the negative error code
 *         from the send or receive call (AVERROR(EAGAIN) is passed through
 *         without logging)
 */
static int tile_repack_bsf(AVPacket *repack_pkt, AVPacket *part_pkt)
{
    AVBSFContext *bsf;
    int err;

    av_log(NULL, AV_LOG_DEBUG, "### %s line %d %s: \n", __FILE__, __LINE__, __func__);

    bsf = p_tile_codec_ctx->tile_repack_bsf_ctx;

    if (part_pkt != NULL) {
        err = av_bsf_send_packet(bsf, part_pkt);
        if (err < 0) {
            av_log(NULL, AV_LOG_ERROR, "failed to send packet to repack bsf\n");
            return err;
        }
    }

    err = av_bsf_receive_packet(bsf, repack_pkt);
    if (err >= 0) {
        av_log(NULL, AV_LOG_DEBUG, "recv pkt from repack bsf: size %d,dts %ld,pts %ld\n",
               repack_pkt->size, repack_pkt->dts, repack_pkt->pts);
        return 0;
    }

    // EAGAIN is returned silently; every other failure is reported.
    if (err == AVERROR_EOF) {
        av_log(NULL, AV_LOG_INFO, "tile repack eof reach\n");
    } else if (err != AVERROR(EAGAIN)) {
        av_log(NULL, AV_LOG_ERROR, "failed to receive packet from repack bsf: %s\n", av_err2str(err));
    }

    return err;
}

/*
 * Remove and return the first node of an instance's tiled_packet_list.
 *
 * The list is inspected and modified under inst->pkt_mtx, and
 * nb_tiled_packets is decremented when a node is taken.
 *
 * @param inst codec instance whose packet list is read
 * @return the detached node, or NULL when the list is empty
 */
static struct tiled_packet_node *packet_fetch(struct codec_instance *inst)
{
    struct tiled_packet_node *head = NULL;

    av_log(NULL, AV_LOG_DEBUG, "### %s line %d %s: \n", __FILE__, __LINE__, __func__);

    pthread_mutex_lock(&inst->pkt_mtx);
    if (!list_empty(&inst->tiled_packet_list)) {
        head = list_first_entry(&inst->tiled_packet_list, struct tiled_packet_node, link);
        av_assert0(head != NULL);
        list_del(&head->link);
        inst->nb_tiled_packets--;
    }
    pthread_mutex_unlock(&inst->pkt_mtx);

    return head;
}

/**!
 * Fetch at most one queued packet from every tile instance, attach the tile
 * index to it as AV_PKT_DATA_SLICE_ADDR side data and feed it to the repack
 * bitstream filter.
 *
 * @param out_pkts array with one packet per input; out_pkts[i] is handed to
 *                 tile_repack_bsf() as the output packet for tile i
 * @return 0 - a packet was fetched from every input, <0 - at least one input
 *         had no packet queued
 */
static int repack_tile_packets(AVPacket **out_pkts)
{
    int i, send_packet;
    int *slice_addr;
    struct codec_instance *inst;
    struct tiled_packet_node *tpn;

    av_log(NULL, AV_LOG_DEBUG, "### %s line %d %s: \n", __FILE__, __LINE__, __func__);

    send_packet = 0;
    for (i = 0; i < p_tile_codec_ctx->nb_inputs; i++) {
        inst = &p_tile_codec_ctx->instances[i];
        tpn = packet_fetch(inst);
        if (!tpn) {
            continue;
        }

        // The packet takes ownership of the side data and releases it through
        // the av_free() family, so it must not come from plain malloc().
        slice_addr = av_malloc(sizeof(*slice_addr));
        av_assert0(slice_addr);
        *slice_addr = i;

        if (av_packet_add_side_data(&tpn->pkt, AV_PKT_DATA_SLICE_ADDR,
                                    (uint8_t *)slice_addr, sizeof(*slice_addr)) < 0) {
            // On failure the buffer stays with the caller; release it here.
            av_log(NULL, AV_LOG_ERROR, "%d: failed to add slice address side data\n", i);
            av_free(slice_addr);
        }

        // The result is intentionally not checked here: tile_repack_bsf()
        // reports its own errors and the packet is released either way.
        tile_repack_bsf(out_pkts[i], &tpn->pkt);
        av_log(NULL, AV_LOG_INFO, "Repack encoded packet tile=%d size=%d\n", i, out_pkts[i]->size);
        av_packet_unref(&tpn->pkt);
        free(tpn);
        send_packet++;
    }

    // A complete output packet
    return send_packet == p_tile_codec_ctx->nb_inputs ? 0 : -1;
}

/*
 * Query the resource API for all devices and return how many of the given
 * type it reports.
 *
 * @param dev_type device type used to index xcoder_cnt
 * @return the reported count, or -1 when the allocation or the device query
 *         fails
 */
static int get_ni_devices_cnt(ni_device_type_t dev_type)
{
    int count;
    // ni_device_t is ~450KB, so it lives on the heap; calloc hands it back zeroed
    ni_device_t *dev_list = calloc(1, sizeof(*dev_list));

    av_log(NULL, AV_LOG_DEBUG, "### %s line %d %s: \n", __FILE__, __LINE__, __func__);

    if (!dev_list) {
        av_log(NULL, AV_LOG_ERROR, "Error failed to malloc ni_device_t\n");
        return -1;
    }

    if (ni_rsrc_list_all_devices(dev_list) == NI_RETCODE_SUCCESS) {
        count = dev_list->xcoder_cnt[dev_type];
    } else {
        av_log(NULL, AV_LOG_ERROR, "Failed to get available xcoders.\n");
        count = -1;
    }

    free(dev_list);
    return count;
}


// repack thread API
/*
 * Initialise the mutex and condition variable of the repack worker and
 * record whether packets need repacking.
 *
 * Only objects that were successfully initialised are destroyed on the
 * failure path; destroying an uninitialised mutex or condition variable is
 * undefined behaviour. need_repack is stored only on success.
 *
 * @param repack_worker worker whose inst_mtx / inst_cond are initialised
 * @param need_repack   value stored in repack_worker->need_repack
 * @return 0 on success, otherwise the error number returned by the failing
 *         pthread_*_init call
 */
static int repack_thread_context_init(struct repack_thread_context *repack_worker, bool need_repack)
{
    int ret;

    ret = pthread_mutex_init(&repack_worker->inst_mtx, NULL);
    if (ret) {
        // Nothing has been set up yet, so there is nothing to tear down.
        return ret;
    }

    ret = pthread_cond_init(&repack_worker->inst_cond, NULL);
    if (ret) {
        // The mutex is the only live object at this point.
        pthread_mutex_destroy(&repack_worker->inst_mtx);
        return ret;
    }

    repack_worker->need_repack = need_repack;

    return 0;
}

/*
 * Block the caller until active_repack_worker in the global tile codec
 * context becomes non-zero. The counter is checked under the context mutex
 * and the wait uses the context condition variable.
 */
static void repack_thread_sync(void)
{
    pthread_mutex_lock(&p_tile_codec_ctx->mutex);
    for (;;) {
        if (p_tile_codec_ctx->active_repack_worker)
            break;
        pthread_cond_wait(&p_tile_codec_ctx->cond, &p_tile_codec_ctx->mutex);
    }
    pthread_mutex_unlock(&p_tile_codec_ctx->mutex);
}

static void *repack_routine(void *arg)
{
    struct repack_thread_context *repack_worker = (struct repack_thread_context *) arg;
    struct input_frame_node *ifn;
    struct tile_codec_context *ctx = repack_worker->codec_ctx;
    AVFrame *frame;
    int input_idx;
    int ret;
    AVPacket **out_pkts;
    int i;

    av_log(NULL, AV_LOG_INFO, "repack_thread_context %ld enter.\n", repack_worker->tid);

    out_pkts = av_mallocz(ctx->nb_inputs * sizeof(*out_pkts));
    av_assert0(out_pkts);
    for (i = 0; i < ctx->nb_inputs; i++) {
        out_pkts[i] = av_packet_alloc();
    }

    // I am ready
    pthread_mutex_lock(&ctx->mutex);
    ctx->active_repack_worker++;
    pthread_cond_signal(&ctx->cond);
    pthread_mutex_unlock(&ctx->mutex);

    // When there are output tile packets
    for (; ;) {
        pthread_mutex_lock(&repack_worker->inst_mtx);

        while (!tile_packet_available() && ctx->active_workers > 0) {
            pthread_cond_wait(&repack_worker->inst_cond, &repack_worker->inst_mtx);
        }

        pthread_mutex_unlock(&repack_worker->inst_mtx);

        if (ctx->finish || (!tile_packet_available() && ctx->active_workers == 0)) {
            // End of all frame encoding including flushing.
            //pthread_mutex_unlock(&repack_worker->inst_mtx);
            break;
        }

        if (repack_worker->need_repack)
        {
            if (repack_tile_packets(out_pkts) == 0) {
                for (i = 0; i < ctx->nb_inputs; i++) {
                    if(out_pkts[i]->size)
                    {
                        av_write_frame(ctx->ofmt_ctx, out_pkts[i]);
                    }
                    av_packet_unref(out_pkts[i]);
                }
            }
        }
        else
        {
            struct codec_instance *inst = &ctx->instances[0];
            struct tiled_packet_node *tpn;
            tpn = packet_fetch(inst);
            if(tpn->pkt.size)
            {
                av_write_frame(ctx->ofmt_ctx, &tpn->pkt);
            }
            av_packet_unref(&tpn->pkt);
            free(tpn);
        }
    }

    av_write_trailer(ctx->ofmt_ctx);

    for (i = 0; i < ctx->nb_inputs; i++) {
        av_packet_free(&out_pkts[i]);
    }
    av_freep(&out_pkts);

    // Release tile codec context condvar.
    pthread_mutex_lock(&ctx->mutex);
    ctx->active_repack_worker--;
    pthread_mutex_unlock(&ctx->mutex);

    av_log(NULL, AV_LOG_INFO, "repack_thread_context %ld exit\n", repack_worker->tid);
    return (void *) 0;
}


int main(int argc, char **argv)
{
    const char *codec_name = NULL;
    enum AVCodecID codec_id = AV_CODEC_ID_NONE;
    const AVCodec *encoder;
    AVFrame *p_decoded_frame;
    //AVPacket **out_pkts;
    AVRational sar = (AVRational){0, 1};
    int enc_cnt;
    int64_t frame_number;
    int64_t t0, t1, t2, t3;
    double t_read = 0;
    double t_total = 0;
    int done;
    int logging = 0;
    int i, j;
    int loops = 1;
    int ctu_size = 64;
    char *xcoder_params = NULL;
    int xcoder_params_size = 0;
    int video_width = 0;
    int video_height = 0;
    int ctb_width;
    int ctb_height;
    int min_ctb_width;
    int min_ctb_height;
    int bitrate = 4000000;
    int fps = 25;
    int pad_filter_needed = 0;
    char *input_file[4];
    int input_path_len = 0;
    int input_tile_index[4] = {-1, -1, -1, -1};
    int first_tile_index = 4;
    int input_file_num = 0;
    const char *output_file = NULL;
    int devid = 0;
    char *n, *p;
    int ret;
    size_t input_size;
    int nb_procs = 0;
    int nb_tiles = 1;
    bool need_repack = false;
    struct repack_thread_context *repack_worker;
    struct filter_thread_context *filter_worker;
    struct codec_instance *instances;
    struct tile_thread_context *workers;
    int nb_threads = 0;
    int split_col = 1;
    int split_row = 1;
    int frame_queue_depth = 3;
    int verbose = 0;
    char *graph_desc;
    int graph_desc_size;
    int opt;
    int opt_index;
    int ctb_aligned_width = 0;
    int ctb_aligned_height = 0;
    int bit_depth_factor = 1;
    FILE *in_fp[4];
    enum AVPixelFormat pix_fmt = AV_PIX_FMT_YUV420P;
    const char *opt_string = "s:x:i:o:d:c:r:b:d:t:l:p:j:q:fvmygh";
    static struct option long_options[] = {
        {"size",          required_argument, NULL, 's'},
        {"xcoder_params", required_argument, NULL, 'x'},
        {"input_file",    required_argument, NULL, 'i'},
        {"output_file",   required_argument, NULL, 'o'},
        {"codec",         required_argument, NULL, 'c'},
        {"fps",           required_argument, NULL, 'r'},
        {"bitrate",       required_argument, NULL, 'b'},
        {"devid",         required_argument, NULL, 'd'},
        {"tiles",         required_argument, NULL, 't'},
        {"loop",          required_argument, NULL, 'l'},
        {"pix_fmt",       required_argument, NULL, 'p'},
        {"threads",       required_argument, NULL, 'j'},
        {"frame_queue_depth",       required_argument, NULL, 'q'},
        {"pad_filter",         no_argument, NULL, 'f'},
        {"verbose",         no_argument, NULL, 'v'},
        {"mute",          no_argument,       NULL, 'm'},
        {"sync",          no_argument,       NULL, 'y'},
        {"log",           no_argument,       NULL, 'g'},
        {"help",          no_argument,       NULL, 'h'},
        {NULL,            0,                 NULL, 0},
    };

    while ((opt = getopt_long(argc, argv, opt_string, long_options, &opt_index)) != -1) {
        switch (opt) {
            case 's':
                video_width = strtoul(optarg, &n, 10);
                if (*n != 'x') {
                    usage();
                    return -1;
                }
                video_height = strtoul(n + 1, NULL, 10);
                break;
            case 'x':
                //xcoder_params = optarg;
                xcoder_params_size = strlen(optarg) + 128;  // assume 128 bytes are enough for crop parameters
                xcoder_params = malloc(xcoder_params_size);
                if (!xcoder_params)
                {
                    av_log(NULL, AV_LOG_ERROR, "Failed to allocate xcoder_params.\n");
                    ret = AVERROR(ENOMEM);
                    goto end;
                }
                memset(xcoder_params, 0, xcoder_params_size);
                strncpy(xcoder_params, optarg, strlen(optarg));
                break;
            case 'i':
                if (input_file_num >= 4)
                {
                    av_log(NULL, AV_LOG_ERROR, "Cannot open more than 4 inputs.\n");
                    ret = AVERROR(EINVAL);
                    goto end;
                }
                input_tile_index[input_file_num] = strtoul(optarg, &n, 10);
                if (input_tile_index[input_file_num] >= 4)
                {
                    av_log(NULL, AV_LOG_ERROR, "tile index cannot exceed 4.\n");
                    ret = AVERROR(EINVAL);
                    goto end;
                }
                if (input_tile_index[input_file_num] < first_tile_index)
                    first_tile_index = input_tile_index[input_file_num];
                if (*n != ':') {
                    av_log(NULL, AV_LOG_ERROR, "Please use [tile index]:[input yuv file] format for input_file parameter\n");
                    usage();
                    return -1;
                }
                input_path_len = strlen(optarg)-2;
                input_file[input_file_num] = ((char *)optarg)+2;
                input_file_num++;
                break;
            case 'o':
                output_file = optarg;
                break;
            case 'c':
                codec_name = optarg;
                break;
            case 'd':
                devid = atoi(optarg);
                break;
            case 'g':
                logging = 1;
                break;
            case 'l':
                loops = atoi(optarg);
                break;
            case 'p':
                pix_fmt = av_get_pix_fmt(optarg);
                break;
            case 'b':
                bitrate = atoi(optarg);
                break;
            case 'r':
                fps = atoi(optarg);
                break;
            case 't':
                split_col = strtoul(optarg, &n, 10);
                if (n[0] != 'x' || !isdigit(n[1])) {
                    usage();
                    return -1;
                }
                split_row = strtoul(n + 1, NULL, 10);
                break;
            case 'j':
                nb_threads = atoi(optarg);
                break;
            case 'q':
                frame_queue_depth = atoi(optarg);
                break;
            case 'f':
                pad_filter_needed = 1;
                break;
            case 'v':
                verbose = 1;
                break;
            case 'h':
                usage();
                return 0;
            default:
                usage();
                return -1;
        }
    }

    if (!input_file[0] || !codec_name || video_width == 0 || video_height == 0) {
        usage();
        exit(1);
    }

    av_log(NULL, AV_LOG_INFO, "### Start encoding\n");

    enc_cnt = get_ni_devices_cnt(NI_DEVICE_TYPE_ENCODER);
    if (-1 == enc_cnt) {
        av_log(NULL, AV_LOG_ERROR, "Failed to get available encoders.\n");
        exit(1);
    }

    // HEVC parameters check
    if (!strncmp(codec_name, "h265_ni", strlen("h265_ni"))) {
        codec_id = AV_CODEC_ID_HEVC;
        if (!output_file) {
            output_file = "tile_output.h265";
        }

        if (xcoder_params) {
            // Do not use adaptive (default) GOP preset for multi-card encoding on Quadra
            p = strstr(xcoder_params, "gopPresetIdx=");
            if(p) {
                if (p[13] == '-' && p[14] == '1') {
                    av_log(NULL, AV_LOG_ERROR, "Error: gopPresetIdx=-1 is not supported.\n");
                    ret = AVERROR(EINVAL);
                    goto end;
                }
                else if (p[13] == '1') {
                    snprintf(xcoder_params + strlen(xcoder_params), xcoder_params_size - strlen(xcoder_params), ":repeatHeaders=0");
                }
            }
            else
            {
                snprintf(xcoder_params + strlen(xcoder_params), xcoder_params_size - strlen(xcoder_params), ":gopPresetIdx=1:repeatHeaders=0");
            }
        }
        else
        {
            xcoder_params_size = 128; // assume 128 bytes are enough for crop parameters
            xcoder_params = malloc(xcoder_params_size);
            if (!xcoder_params)
            {
                av_log(NULL, AV_LOG_ERROR, "Failed to allocate xcoder_params.\n");
                ret = AVERROR(ENOMEM);
                goto end;
            }
            memset(xcoder_params, 0, xcoder_params_size);
            snprintf(xcoder_params, xcoder_params_size, "gopPresetIdx=1:repeatHeaders=0");
        }

        if (pad_filter_needed)
        {
            ctb_aligned_width = (video_width + (64-1)) / 64 * 64;
            ctb_aligned_height = (video_height + (64-1)) / 64 * 64;
        }
        else
        {
            ctb_aligned_width = video_width / 64 * 64;
            ctb_aligned_height = video_height / 64 * 64;
        }
    }
    else {
        printf("%s codec not supported for tile encode\n", codec_name);
        exit(1);
    }

    //av_log_set_level(AV_LOG_INFO);
    if (!verbose)
      av_log_set_level(AV_LOG_WARNING);
    else
      av_log_set_level(AV_LOG_DEBUG);
    //av_log_set_level(AV_LOG_TRACE);

    //av_log(NULL, AV_LOG_INFO, "### Done\n"); return 0;

    p_tile_codec_ctx = av_mallocz(sizeof(struct tile_codec_context));
    if (p_tile_codec_ctx == NULL) {
        av_log(NULL, AV_LOG_ERROR, "Failed to allocate p_tile_codec_ctx.\n");
        ret = AVERROR(ENOMEM);
        goto end;
    }

    av_log(NULL, AV_LOG_INFO, "Number of row tile %d number of col tile %d \n", split_row, split_col);
    nb_tiles = split_row * split_col;
    if (nb_tiles > enc_cnt * NI_MAX_CONTEXTS_PER_HW_INSTANCE) {
        printf("The number of tiles shall not exceed max instances per encoder "
               "x number of encoders %dx%d\n", NI_MAX_CONTEXTS_PER_HW_INSTANCE,
               enc_cnt);
        exit(1);
    }

    if (nb_tiles < input_file_num) {
        printf("ERROR: number of tiles %d cannot be less than number of inputs %d\n", nb_tiles, input_file_num);
        exit(1);
    }

    // NOTE - currently we do not support 3 tiles
    if (nb_tiles == 3) {
        printf("ERROR: number of tiles cannot be 3\n");
        exit(1);
    }

    // NOTE - currently we only accept up to 2x2 tile partitions
    if (nb_tiles > 2*2) {
        printf("ERROR: number of tiles %d shall not exceed 4\n", nb_tiles);
        exit(1);
    }

    for (i = 0; i < input_file_num; i++)
    {
        if (input_tile_index[i] >= nb_tiles)
        {
            printf("ERROR: tile index %d is >= number of tiles %d\n", input_tile_index[i], nb_tiles);
            exit(1);
        }
    }

    if (nb_tiles > 1) {
        need_repack = true;
    }

    if (nb_tiles < input_file_num) {
        printf("ERROR: number of tiles %d cannot be less than number of inputs %d\n", nb_tiles, input_file_num);
        exit(1);
    }

    if (first_tile_index > 0)
    {
        printf("WARNING: tile at location x=0 y=0 must be populated, forcing tile index %d to index 0\n", first_tile_index);
    }

    // count each element in the input tile index array to prevent repeated tile index, also make sure tile index 0 is populated
    int tile_input_index[4] = {-1, -1, -1, -1};
    char *tile_input_file[4];
    for (i = 0; i < input_file_num; i++) {
         // tile index 0 must be populated
        if (first_tile_index > 0 && input_tile_index[i] == first_tile_index)
            input_tile_index[i] = first_tile_index = 0;

        if (tile_input_index[input_tile_index[i]] != -1)
        {
            printf("ERROR: tile index %d is not allowed to be assigned to more than one input\n", input_tile_index[i]);
            exit(1);
        }
        tile_input_index[input_tile_index[i]] = i;
        tile_input_file[input_tile_index[i]] = input_file[i];
    }

    // re-arrange input order because the spec requires slice segments to be in raster scan tile order
    int tmp_input_num = 0;
    for (i = 0; i < 4; i++)
    {
        if (tile_input_index[i] == -1)
            continue;

        input_tile_index[tmp_input_num] = i;
        input_file[tmp_input_num] = tile_input_file[i];
        tmp_input_num++;
    }

    // restrict frame queue depth to 10 to prevent the process from occupying too much host memory, which may result in an OOM kill
    if (frame_queue_depth > 10) {
        printf("ERROR: number of frame queue depth shall not exceed 10\n");
        exit(1);
    }

    // prevent tile encoding fail due to mem bin buffer allocation exceeds limit
    bit_depth_factor = (pix_fmt == AV_PIX_FMT_YUV420P10LE || pix_fmt == AV_PIX_FMT_P010LE) ? 2 : 1;
    if ((int64_t)input_file_num * ctb_aligned_width * ctb_aligned_height * bit_depth_factor > (2*2) * 7680 * 4320)
    {
        snprintf(xcoder_params + strlen(xcoder_params), xcoder_params_size - strlen(xcoder_params), ":lowDelay=1");
    }

    if (SIG_ERR == signal(SIGINT, sigint_handler)) {
        av_log(NULL, AV_LOG_ERROR, "Failed to set signal.\n");
        ret = errno;
        goto end;
    }

    graph_desc_size = 64 * input_file_num;
    graph_desc = malloc(graph_desc_size);
    if (!graph_desc) {
        av_log(NULL, AV_LOG_ERROR, "Failed to allocate graph_desc.\n");
        ret = AVERROR(ENOMEM);
        goto end;
    }
    memset(graph_desc, 0, graph_desc_size);

    if (pad_filter_needed)
    {
        for (i = 0; i < input_file_num; i++) {
            if (i > 0)
                snprintf(graph_desc + strlen(graph_desc),
                        graph_desc_size - strlen(graph_desc), ";");
            snprintf(graph_desc + strlen(graph_desc),
                     graph_desc_size - strlen(graph_desc), "[%d:v]ni_quadra_hwupload=-1,ni_quadra_pad=%d:%d[in_%d]", i, ctb_aligned_width, ctb_aligned_height, i);
        }
    }

    p_tile_codec_ctx->tile_info = av_mallocz(nb_tiles * sizeof(TileInfo));
    if (p_tile_codec_ctx->tile_info == NULL) {
        av_log(NULL, AV_LOG_ERROR, "Failed to allocate tile info.\n");
        ret = AVERROR(ENOMEM);
        goto end;
    }

    // The tile splitting method is derived from source code of TComPicSym.cpp
    // in HEVC Test Model open project: https://vcgit.hhi.fraunhofer.de/jvet/HM
    // as it usually can be regarded as the best practice of HEVC standard
    // specification. In xInitTiles function it shows how it works:
    //
    // if( m_pps.getTileUniformSpacingFlag() )
    // {
    //   //set width and height for each (uniform) tile
    //   for(Int row=0; row < numRows; row++)
    //   {
    //     for(Int col=0; col < numCols; col++)
    //     {
    //       const Int tileIdx = row * numCols + col;
    //       m_tileParameters[tileIdx].setTileWidthInCtus(  (col+1)*getFrameWidthInCtus( )/numCols - (col*getFrameWidthInCtus( ))/numCols );
    //       m_tileParameters[tileIdx].setTileHeightInCtus( (row+1)*getFrameHeightInCtus()/numRows - (row*getFrameHeightInCtus())/numRows );
    //     }
    //   }
    // }
    //
    // As we can see, the picture is not divided into equally sized tiles because
    // the size of tile should be in units of CTU/LTU. Say the size of CTU/LTU
    // is 64x64 and we have 1280 pixel width and we want to split into 8 columns
    // in tile. Then we will get 8 tiles with width of 2 3 2 3 2 3 2 3 in units
    // of CTU/LTU.
    //
    // Besides it also specifies the minimum size of each tile according to the
    // profile indicator. We can see that with MAIN or MAIN10 (or higher)
    // profile the width of tile should be at least 4 sizes of CTU/LTU. It can
    // guarantee integrity output pictures for tile encoding.
    //
    // // Tile size check
    // Int minWidth  = 1;
    // Int minHeight = 1;
    // const Int profileIdc = m_sps.getPTL()->getGeneralPTL()->getProfileIdc();
    // if (  profileIdc == Profile::MAIN || profileIdc == Profile::MAIN10)
    // {
    //   if (m_pps.getTilesEnabledFlag())
    //   {
    //     minHeight = 64  / m_sps.getMaxCUHeight();
    //     minWidth  = 256 / m_sps.getMaxCUWidth();
    //   }
    // }
    if (pad_filter_needed)
    {
        ctb_width = (video_width + ctu_size - 1) / ctu_size;
        ctb_height = (video_height + ctu_size - 1) / ctu_size;
    }
    else
    {
        ctb_width = video_width / ctu_size;
        ctb_height = video_height / ctu_size;
    }

    min_ctb_width = 256 / ctu_size;
    if (codec_id == AV_CODEC_ID_HEVC)
        min_ctb_height = 64 / ctu_size;
    else
        min_ctb_height = 256 / ctu_size;

    for (i = 0; i < split_row; i++) {
        for (j = 0; j < split_col; j++) {
            int index = i * split_col + j;
            TileInfo *ti = &p_tile_codec_ctx->tile_info[index];
            ti->w = ctb_width;
            if (ti->w < min_ctb_width) {
                av_log(NULL, AV_LOG_ERROR, "%dx%d tiles spec for resolution of "
                       "%dx%d results in width of %d unit(s) of CTU which is "
                       "less than mininum allowed %d units of CTU. Please reduce"
                       " the column number of tiles in -t, --tiles option.\n",
                       split_col, split_row, video_width, video_height, ti->w, min_ctb_width);
                exit(-1);
            }

            ti->h = ctb_height;
            if (ti->h < min_ctb_height) {
                av_log(NULL, AV_LOG_ERROR, "%dx%d tiles spec for resolution of "
                       "%dx%d results in height of %d unit of CTU which is less"
                       " than mininum allowed %d unit of CTU. Please reduce the"
                       " row number of tiles in -t, --tiles option.\n", split_col,
                       split_row, video_width, video_height, ti->h, min_ctb_height);
                exit(-1);
            }

            ti->w *= ctu_size;
            ti->h *= ctu_size;
            ti->x = j == 0 ? 0 : ti[-1].x + ti[-1].w;
            ti->y = i == 0 ? 0 : ti[-split_col].y + ti[-split_col].h;
        }
    }

    av_log(NULL, AV_LOG_ERROR, "filter desc: %s\n", graph_desc);

    /* find the encoder selected by codec_name */
    encoder = avcodec_find_encoder_by_name(codec_name);
    if (!encoder) {
        av_log(NULL, AV_LOG_ERROR, "Codec '%s' not found\n", codec_name);
        ret = AVERROR(ENOMEM);
        goto end;
    }

    av_log(NULL, AV_LOG_INFO, "Codec %s.\n", encoder->name);

    nb_procs = get_nprocs();
    if (nb_threads > 0) {
        if (nb_threads > input_file_num) {
            av_log(NULL, AV_LOG_WARNING, "The number of threads %d cannot "
                   "exceed number of input files %d. ", nb_threads, input_file_num);
            nb_threads = nb_procs < input_file_num ? nb_procs : input_file_num;
            av_log(NULL, AV_LOG_WARNING, "Change the number of threads into "
                   "%d.\n ", nb_threads);
        }
    } else {
        nb_threads = nb_procs < input_file_num ? nb_procs : input_file_num;
    }

    p_tile_codec_ctx->total_threads = nb_threads;
    p_tile_codec_ctx->video_width = ctb_aligned_width * split_col;
    p_tile_codec_ctx->video_height = ctb_aligned_height * split_row;
    p_tile_codec_ctx->split_row = split_row;
    p_tile_codec_ctx->split_col = split_col;
    p_tile_codec_ctx->sar = sar;
    p_tile_codec_ctx->nb_inputs = input_file_num;
    p_tile_codec_ctx->nb_encoders = enc_cnt;

    av_log(NULL, AV_LOG_INFO, "split column: %d.\n", split_col);
    av_log(NULL, AV_LOG_INFO, "split row: %d.\n", split_row);
    av_log(NULL, AV_LOG_INFO, "threads: %d.\n", p_tile_codec_ctx->total_threads);
    av_log(NULL, AV_LOG_INFO, "input file: %s.\n", input_file[0]);
    av_log(NULL, AV_LOG_INFO, "output file: %s.\n", output_file);
    av_log(NULL, AV_LOG_INFO, "codec name: %s.\n", codec_name);
    av_log(NULL, AV_LOG_INFO, "video width: %u.\n", video_width);
    av_log(NULL, AV_LOG_INFO, "video height: %u.\n", video_height);
    av_log(NULL, AV_LOG_INFO, "device id: %d.\n", devid);

    for (i = 0; i < input_file_num; i++)
    {
        av_log(NULL, AV_LOG_INFO, "opening input file[%d]: %s.\n", i, input_file[i]);
        in_fp[i] = fopen(input_file[i], "rb");
        if (!in_fp[i]) {
            av_log(NULL, AV_LOG_ERROR, "failed to open input file\n");
            exit(1);
        }

        ret = input_file_open(input_file[i], video_width, video_height, pix_fmt);
        if (ret < 0) {
            av_log(NULL, AV_LOG_ERROR, "Failed to open input file\n");
            ret = AVERROR(ENOMEM);
            goto end;
        }
    }



    if (pad_filter_needed)
    {
        ret = filter_graph_open(graph_desc, video_width, video_height, pix_fmt, fps, sar, input_file_num);
        if (ret < 0) {
            av_log(NULL, AV_LOG_ERROR, "Failed to open filter graph\n");
            ret = AVERROR(ENOMEM);
            goto end;
        }
    }

    ret = output_file_open(output_file, ctb_aligned_width*split_row, ctb_aligned_height*split_col, fps, codec_id);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "Failed to open output file\n");
        ret = AVERROR(ENOMEM);
        goto end;
    }

    ret = tile_repack_bsf_init(p_tile_codec_ctx->ofmt_ctx->streams[0], input_file_num);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "Failed to initialize repack bsf\n");
        ret = AVERROR(ENOMEM);
        goto end;
    }

    ret = pthread_mutex_init(&p_tile_codec_ctx->mutex, NULL);
    if (ret) {
        av_log(NULL, AV_LOG_ERROR, "Failed to allocate tile codec context mutex.\n");
        ret = AVERROR(ENOMEM);
        goto end;
    }

    ret = pthread_cond_init(&p_tile_codec_ctx->cond, NULL);
    if (ret) {
        av_log(NULL, AV_LOG_ERROR, "Failed to allocate tile codec context condvar.\n");
        ret = AVERROR(ENOMEM);
        goto end;
    }

    // init filter thread
    if (pad_filter_needed)
    {
        p_tile_codec_ctx->input_finish = 0;

        p_tile_codec_ctx->filter_worker = av_mallocz(sizeof(struct filter_thread_context));
        if (p_tile_codec_ctx->filter_worker == NULL) {
            av_log(NULL, AV_LOG_ERROR, "Failed to allocate filter thread.\n");
            ret = AVERROR(ENOMEM);
            goto end;
        }
        filter_worker = p_tile_codec_ctx->filter_worker;

        filter_worker->nb_input_frames = 0;
        filter_worker->frame_queue_depth = frame_queue_depth;
        filter_worker->codec_ctx = p_tile_codec_ctx;
        INIT_LIST_HEAD(&filter_worker->input_frame_list);

        ret = filter_thread_context_init(filter_worker);
        if (ret) {
            av_log(NULL, AV_LOG_ERROR, "Failed init filter thread context %d.\n", i);
            ret = AVUNERROR(ret);
            goto end;
        }

        ret = pthread_create(&filter_worker->tid, NULL, filter_routine, filter_worker);
        if (ret) {
            av_log(NULL, AV_LOG_ERROR, "Failed to create pthread for filter: %s.\n",
                   strerror(errno));
            ret = AVUNERROR(ret);
            goto end;
        }

        // Wait for filter thread running...
        filter_thread_sync();
    }

    // init encoder threads
    p_tile_codec_ctx->filter_finish = 0;

    p_tile_codec_ctx->instances = av_mallocz(input_file_num * sizeof(struct codec_instance));
    if (p_tile_codec_ctx->instances == NULL) {
        av_log(NULL, AV_LOG_ERROR, "Failed to allocate encoder instances.\n");
        ret = AVERROR(ENOMEM);
        goto end;
    }
    instances = p_tile_codec_ctx->instances;

    sar = p_tile_codec_ctx->sar;

    TileInfo *ti;
    int xcoder_params_len = strlen(xcoder_params);
    for (i = 0; i < input_file_num; i++) {
        struct codec_instance *inst = &p_tile_codec_ctx->instances[i];
        inst->id = i;
        inst->encoder = encoder;

        ti = &p_tile_codec_ctx->tile_info[input_tile_index[i]];

        if (!pad_filter_needed)
            snprintf(xcoder_params + xcoder_params_len,
                         xcoder_params_size - xcoder_params_len, ":cropWidth=%d:cropHeight=%d:horOffset=%d:verOffset=%d",
                         ti->w, ti->h, 0, 0);

        inst->codec_params = xcoder_params;
        inst->tile_info = p_tile_codec_ctx->tile_info[input_tile_index[i]];
        INIT_LIST_HEAD(&inst->tiled_frame_list);
        INIT_LIST_HEAD(&inst->tiled_packet_list);

        if (pad_filter_needed)
            ret = codec_instance_init(inst, bitrate, fps, pix_fmt, sar, logging, ctb_aligned_width, ctb_aligned_height, pad_filter_needed, need_repack, frame_queue_depth);
        else
            ret = codec_instance_init(inst, bitrate, fps, pix_fmt, sar, logging, video_width, video_height, pad_filter_needed, need_repack, frame_queue_depth);

        if (ret) {
            av_log(NULL, AV_LOG_ERROR, "Failed init codec instance %d.\n", i);
            ret = AVUNERROR(ret);
            goto end;
        }
    }

    p_tile_codec_ctx->workers = av_mallocz(p_tile_codec_ctx->total_threads * sizeof(struct tile_thread_context));
    if (p_tile_codec_ctx->workers == NULL) {
        av_log(NULL, AV_LOG_ERROR, "Failed to allocate encode threads.\n");
        ret = AVERROR(ENOMEM);
        goto end;
    }
    workers = p_tile_codec_ctx->workers;

    for (i = 0; i < p_tile_codec_ctx->total_threads; i++) {
        struct tile_thread_context *worker = &p_tile_codec_ctx->workers[i];
        worker->nb_instances = 0;
        worker->codec_ctx = p_tile_codec_ctx;
        INIT_LIST_HEAD(&worker->inst_list);

        ret = tile_thread_context_init(worker);
        if (ret) {
            av_log(NULL, AV_LOG_ERROR, "Failed init tile thread context %d.\n", i);
            ret = AVUNERROR(ret);
            goto end;
        }

        ret = pthread_create(&worker->tid, NULL, encode_routine, worker);
        if (ret) {
            av_log(NULL, AV_LOG_ERROR, "Failed to create pthread for %d: %s.\n",
                   i, strerror(errno));
            ret = AVUNERROR(ret);
            goto end;
        }
    }

    // Wait for all instances running...
    encode_thread_sync();

    // init repack thread
    p_tile_codec_ctx->repack_worker = av_mallocz(sizeof(struct repack_thread_context));
    if (p_tile_codec_ctx->repack_worker == NULL) {
        av_log(NULL, AV_LOG_ERROR, "Failed to allocate repack thread context.\n");
        ret = AVERROR(ENOMEM);
        goto end;
    }
    repack_worker = p_tile_codec_ctx->repack_worker;

    repack_worker->codec_ctx = p_tile_codec_ctx;

    ret = repack_thread_context_init(repack_worker, need_repack);
    if (ret) {
        av_log(NULL, AV_LOG_ERROR, "Failed init repack thread context %d.\n", i);
        ret = AVUNERROR(ret);
        goto end;
    }

    ret = pthread_create(&repack_worker->tid, NULL, repack_routine, repack_worker);
    if (ret) {
        av_log(NULL, AV_LOG_ERROR, "Failed to create pthread for repack: %s.\n",
               strerror(errno));
        ret = AVUNERROR(ret);
        goto end;
    }

    // Wait for repack thread running...
    repack_thread_sync();

    done = 0;
    frame_number = 0;

    p_tile_codec_ctx->nb_frames = 0;
    for (i = 0; i < input_file_num; i++)
    {
        fseek(in_fp[i], 0, SEEK_END);
        input_size = ftell(in_fp[i]);
        rewind(in_fp[i]);
        int nb_frames = input_size / ((video_width*bit_depth_factor) * video_height * 3 / 2);
        if (!p_tile_codec_ctx->nb_frames)
            p_tile_codec_ctx->nb_frames = nb_frames;
        else
            p_tile_codec_ctx->nb_frames = (nb_frames < p_tile_codec_ctx->nb_frames) ? nb_frames : p_tile_codec_ctx->nb_frames;
    }

    t0 = av_gettime_relative();

    for (i = 0; i < p_tile_codec_ctx->nb_frames; i++) {
        if (p_tile_codec_ctx->finish) {
            break;
        }

        for (j = 0; j < input_file_num; j++)
        {
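            // allocate a new AVFrame for each input frame and dispatch it to the filter thread,
            // or directly to an encoder worker when no pad filter is needed
            // (the receiving thread is expected to free the AVFrame)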
            p_decoded_frame = av_frame_alloc();
            if (!p_decoded_frame) {
                av_log(NULL, AV_LOG_ERROR, "Could not allocate video frame\n");
                ret = AVERROR(ENOMEM);
                goto end;
            }

            p_decoded_frame->width = video_width;
            p_decoded_frame->height = video_height;
            p_decoded_frame->format = pix_fmt;

            ret = av_frame_get_buffer(p_decoded_frame, 32);
            if (ret < 0) {
                av_log(NULL, AV_LOG_FATAL, "Could not allocate the AVFrame buffers\n");
                ret = AVERROR(ENOMEM);
                goto end;
            }

            if (!av_frame_is_writable(p_decoded_frame))
                av_log(NULL, AV_LOG_FATAL, "p_decoded_frame is not writable %d!\n");

            t2 = av_gettime_relative();
            fread(p_decoded_frame->data[0], p_decoded_frame->linesize[0], p_decoded_frame->height, in_fp[j]);
            fread(p_decoded_frame->data[1], p_decoded_frame->linesize[0] / 2, p_decoded_frame->height / 2, in_fp[j]);
            fread(p_decoded_frame->data[2], p_decoded_frame->linesize[0] / 2, p_decoded_frame->height / 2, in_fp[j]);
            t3 = av_gettime_relative();

            p_decoded_frame->pts = frame_number;
            av_log(NULL, AV_LOG_INFO, "read one pkt: %ld\n", frame_number);

            // dispatch to filter
            if (pad_filter_needed)
                input_dispatch(filter_worker, p_decoded_frame, j);
            else
                frame_dispatch(&workers[j % p_tile_codec_ctx->total_threads], &instances[j], p_decoded_frame);

            // prevent double free
            p_decoded_frame = NULL;
        }

        if (i == p_tile_codec_ctx->nb_frames - 1 && --loops > 0) {
            i = -1;
            for (j = 0; j < input_file_num; j++)
            {
                rewind(in_fp[j]);
            }
        }

        frame_number++;
        t_read += (t3 - t2) / 1000.0;
    }

    // initiate encoder flush
    for (j = 0; j < input_file_num; j++)
    {
        if (pad_filter_needed)
        {
            input_dispatch(filter_worker, NULL, j);
        }
        else
        {
            frame_dispatch(&workers[j % p_tile_codec_ctx->total_threads], &instances[j], NULL);
        }
    }

    if (!pad_filter_needed)
        p_tile_codec_ctx->filter_finish = 1;

    p_tile_codec_ctx->input_finish = 1;

    // Monitor repack thread and SIGINT
    while (!p_tile_codec_ctx->finish) {
        pthread_mutex_lock(&p_tile_codec_ctx->mutex);
        if (p_tile_codec_ctx->active_repack_worker == 0) {
            done = 1;
        }
        pthread_mutex_unlock(&p_tile_codec_ctx->mutex);

        if (done) {
            break;
        }
        else {
            sched_yield();
        }
    }

    t1 = av_gettime_relative();

    t_total = ((t1-t0) / 1000.0);
    double total_framerate = (frame_number * 1000 / (t_total));
    double encoder_framerate = (frame_number * 1000 / (t_total - t_read));

    print_report(t_total, t_read, frame_number);

    av_log(NULL, AV_LOG_ERROR, "t_read %f total_time2 %f frame_number %d total_framerate2 %f framerate2 %f\n",
                t_read, t_total, frame_number, total_framerate, encoder_framerate);

    p_tile_codec_ctx->finish = 1;

    // clean up filter thread
    void *result;
    if (pad_filter_needed)
    {
        pthread_mutex_lock(&filter_worker->inst_mtx);
        pthread_cond_signal(&filter_worker->inst_cond);
        pthread_mutex_unlock(&filter_worker->inst_mtx);

        ret = pthread_join(filter_worker->tid, &result);
        if (ret) {
            av_log(NULL, AV_LOG_ERROR, "Failed to join filter pthread for %ld: %s.\n",
                   filter_worker->tid, strerror(errno));
        }

        if ((long) result != 0) {
            av_log(NULL, AV_LOG_ERROR, "filter pthread %ld result=0x%lx.\n",
                   filter_worker->tid, (long)result);
            ret = (int)((long) result);
        }

        pthread_mutex_destroy(&filter_worker->inst_mtx);
        pthread_cond_destroy(&filter_worker->inst_cond);
    }

    // clean up encoder threads
    for (i = 0; i < p_tile_codec_ctx->total_threads; i++) {
        //void *result;
        struct tile_thread_context *worker = &p_tile_codec_ctx->workers[i];

        av_log(NULL, AV_LOG_INFO, "pthread %ld ready to exit.\n", worker->tid);
        pthread_mutex_lock(&worker->inst_mtx);
        pthread_cond_signal(&worker->inst_cond);
        pthread_mutex_unlock(&worker->inst_mtx);

        ret = pthread_join(worker->tid, &result);
        if (ret) {
            av_log(NULL, AV_LOG_ERROR, "Failed to join pthread for %ld: %s.\n",
                   worker->tid, strerror(errno));
        }

        if ((long) result != 0) {
            av_log(NULL, AV_LOG_ERROR, "pthread %ld result=0x%lx.\n",
                   worker->tid, (long)result);
            ret = (int)((long) result);
        }

        pthread_mutex_destroy(&worker->inst_mtx);
        pthread_cond_destroy(&worker->inst_cond);
    }

    // clean up repack thread
    pthread_mutex_lock(&repack_worker->inst_mtx);
    pthread_cond_signal(&repack_worker->inst_cond);
    pthread_mutex_unlock(&repack_worker->inst_mtx);

    ret = pthread_join(repack_worker->tid, &result);
    if (ret) {
        av_log(NULL, AV_LOG_ERROR, "Failed to join repack pthread for %ld: %s.\n",
               repack_worker->tid, strerror(errno));
    }

    if ((long) result != 0) {
        av_log(NULL, AV_LOG_ERROR, "repack pthread %ld result=0x%lx.\n",
               repack_worker->tid, (long)result);
        ret = (int)((long) result);
    }

    pthread_mutex_destroy(&repack_worker->inst_mtx);
    pthread_cond_destroy(&repack_worker->inst_cond);


    // clean up instances
    for (i = 0; i < input_file_num; i++) {
        codec_instance_cleanup(&p_tile_codec_ctx->instances[i]);
    }

end:
    av_log(NULL, AV_LOG_ERROR, "end.. ret=0x%x.\n", ret);

    avformat_close_input(&p_tile_codec_ctx->ifmt_ctx);
    avformat_close_input(&p_tile_codec_ctx->ofmt_ctx); // avio_close()
    pthread_mutex_destroy(&p_tile_codec_ctx->mutex);
    pthread_cond_destroy(&p_tile_codec_ctx->cond);
    av_frame_free(&p_decoded_frame);
    avfilter_graph_free(&p_tile_codec_ctx->filter_graph);
    av_bsf_free(&p_tile_codec_ctx->tile_repack_bsf_ctx);
    av_freep(&p_tile_codec_ctx->tile_info);
    free(p_tile_codec_ctx->bufsink_ctxes);
    free(p_tile_codec_ctx->instances);
    free(p_tile_codec_ctx->workers);
    if (pad_filter_needed)
        free(p_tile_codec_ctx->filter_worker);
    free(p_tile_codec_ctx);
    free(graph_desc);
    free(xcoder_params);

    av_log(NULL, AV_LOG_INFO, "EXIT.. ret=0x%x.\n", ret);
    if (ret == 0) {
        fprintf(stderr, "%s finished.\n", output_file);
    }

    return ret;
}
