/*
 * Copyright (c) 2023 NetInt
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
#include <inttypes.h>
#include <limits.h>
#include <pthread.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef __linux__
#include <sys/ioctl.h>
#include <sys/sysinfo.h>
#endif
#include <sys/time.h>
#include <unistd.h>

#include <ni_rsrc_api.h>

#include <libavcodec/avcodec.h>
#if LIBAVCODEC_VERSION_MAJOR >= 59
#include <libavcodec/bsf.h>
#endif

#include <libavfilter/buffersink.h>
#include <libavfilter/buffersrc.h>

#include <libavformat/avformat.h>

#include <libavutil/opt.h>
#include <libavutil/imgutils.h>
#include <libavutil/time.h>
#include <libavutil/avassert.h>
#include <libavutil/pixdesc.h>

/*
 * Evaluates to true when the libavfilter headers in use are version 9.12 or
 * newer. Judging by the macro name this is meant to correspond to FFmpeg 6.1
 * and later -- TODO confirm against the FFmpeg release notes.
 * It is used to choose the order in which filter graph outputs are mapped to
 * buffer sinks.
 */
#define IS_FFMPEG_61_AND_ABOVE                                                \
    ((LIBAVFILTER_VERSION_MAJOR > 9) ||                                        \
     (LIBAVFILTER_VERSION_MAJOR == 9 && LIBAVFILTER_VERSION_MINOR >= 12))

/*
 * Given a pointer to a member embedded in a structure, compute the address of
 * the enclosing structure by subtracting the member's byte offset.
 *   ptr    - pointer to the embedded member
 *   type   - type of the enclosing structure
 *   member - name of the member inside 'type'
 */
#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - (size_t)&(((type *)0)->member)))

/* Alias of container_of() for use with struct list_head links. */
#define list_entry(ptr, type, member) \
    container_of(ptr, type, member)

/*
 * Return the structure that owns the first link after list head 'ptr'.
 * No emptiness check is performed: on an empty list (next == head) the result
 * is computed from the head itself and is not a valid entry.
 */
#define list_first_entry(ptr, type, field)  list_entry((ptr)->next, type, field)

/*
 * Verbosity levels in increasing order of severity (DEBUG = 0 ... ERROR = 3).
 * NOTE(review): no user of these constants is visible in this part of the
 * file; the only reference is a commented-out 'debug_level' variable.
 */
enum {
    DEBUG_LEVEL_DEBUG,
    DEBUG_LEVEL_INFO,
    DEBUG_LEVEL_WARN,
    DEBUG_LEVEL_ERROR,
};

// AV1 tile limits. "Section 3" presumably refers to the constants section of
// the AV1 bitstream specification -- TODO confirm.
enum {
    AV1_MAX_TILE_WIDTH = 4096,          // maximum tile width
    AV1_MAX_TILE_AREA  = 4096 * 2304,   // maximum tile area (width * height)
    AV1_MAX_TILE_ROWS  = 64,            // maximum number of tile rows
    AV1_MAX_TILE_COLS  = 64,            // maximum number of tile columns
};

/*
 * Intrusive circular doubly-linked list link. Embedded in the structures
 * that are queued; an empty list is a head whose next and prev point to
 * itself (see INIT_LIST_HEAD() and list_empty()).
 */
struct list_head {
    struct list_head *next, *prev;
};

/*
 * Queue entry holding one filtered tile frame waiting to be encoded.
 * A NULL frame is used as the "flush the encoder" marker.
 */
struct tiled_frame_node {
    AVFrame *frame;             /* frame to encode, or NULL to request a flush */
    struct list_head link;      /* link in codec_instance.tiled_frame_list */
};

/*
 * Queue entry holding one packet produced by the raw-to-tile bitstream filter
 * for a single tile.
 */
struct tiled_packet_node {
    AVPacket pkt;               /* packet contents (moved in by packet_dispatch()) */
    struct list_head link;      /* link in codec_instance.tiled_packet_list */
};

/*
 * Geometry of one tile: width/height and x/y position. The values are used as
 * the encoder resolution (w, h) and are passed to the av1_rawtotile bitstream
 * filter as x, y, x_w and y_h. Units are presumably pixels within the full
 * picture -- TODO confirm.
 */
typedef struct TileInfo {
    int w, h, x, y;
} TileInfo;

/*
 * Per-tile encoding state: one encoder, one raw-to-tile bitstream filter and
 * the frame/packet queues that connect it to the rest of the pipeline.
 */
struct codec_instance {
    int id;                             /* tile index; used in dump file names and to pick the encoder device */
    FILE *encoder_fp;                   /* optional dump of raw encoder output (NULL-checked before use) */
    FILE *rawtotile_fp;                 /* optional dump of the raw-to-tile filter output */
    AVPacket pkt;                       /* scratch packet passed to do_encode() */
    const AVCodec *encoder;             /* encoder implementation to open */
    AVCodecContext *enc_ctx;            /* opened encoder context */
    const char *codec_params;           /* optional "xcoder-params" option string */
    size_t frame_number;                /* number of packets received from the encoder so far */
    TileInfo tile_info;                 /* geometry of this tile */
    AVBSFContext *raw_to_tile_bsf_ctx;  /* av1_rawtotile bitstream filter */

    int nb_tiled_frames;                /* number of entries in tiled_frame_list */
    int nb_tiled_packets;               /* number of entries in tiled_packet_list */
    struct list_head tiled_frame_list;  /* frames waiting to be encoded (tiled_frame_node) */
    struct list_head tiled_packet_list; /* filtered packets waiting to be repacked (tiled_packet_node) */

    pthread_mutex_t pkt_mtx;            /* protects tiled_packet_list / nb_tiled_packets */
    pthread_mutex_t frm_mtx;            /* protects tiled_frame_list / nb_tiled_frames */
};

/*
 * Work item queued on a worker thread: tells the worker which codec instance
 * has a frame (or flush marker) pending in its tiled_frame_list.
 */
struct codec_instance_node {
    struct codec_instance *inst;    /* instance to service */
    struct list_head link;          /* link in tile_thread_context.inst_list */
};

/* State of one encoder worker thread (see encode_routine()). */
struct tile_thread_context {
    pthread_t tid;                          /* thread identifier */
    int nb_instances;                       /* number of work items currently in inst_list */
    struct tile_codec_context *codec_ctx;   /* back pointer to the shared context */
    struct list_head inst_list;             /* pending work items (codec_instance_node) */
    pthread_mutex_t inst_mtx;               /* protects inst_list / nb_instances */
    pthread_cond_t inst_cond;               /* signalled when a work item is queued */
};

/*
 * Global state of the tile transcoding pipeline: input/decoder, filter graph,
 * per-tile codec instances, worker threads and output/repack contexts.
 */
struct tile_codec_context {
    struct codec_instance *instances;       /* array indexed by tile number (nb_tiles entries used) */
    struct tile_thread_context *workers;    /* array of worker threads (indexed modulo total_threads) */
    int active_workers;                     /* workers that have started and not yet exited; guarded by mutex */
    int total_threads;                      /* number of worker threads */
    int finish;                             /* set by the SIGINT handler to request shutdown */
    int filter_finish;                      /* set once the flush markers have been queued */
    int split_row;                          /* passed to av1_rawtotile as "row" */
    int split_col;                          /* passed to av1_rawtotile as "column" */
    int video_width;                        /* decoder picture width */
    int video_height;                       /* decoder picture height */
    int frame_rate;                         /* integer part of the input stream's r_frame_rate */
    int nb_tiles;                           /* number of tiles / codec instances */
    int nb_encoders;                        /* modulus used to pick the "enc" device id per instance */
    int video_stream_index;                 /* index of the selected input video stream */
    AVRational sar;                         /* sample aspect ratio taken from the decoder */
    pthread_mutex_t mutex;                  /* protects active_workers */
    pthread_cond_t cond;                    /* signalled when all workers have started */
    TileInfo *tile_info;                    /* NOTE(review): not referenced in the visible part of the file */
    AVFormatContext *ifmt_ctx;              /* input demuxer */
    AVFormatContext *ofmt_ctx;              /* output muxer */
    AVCodecContext *dec_ctx;                /* input decoder */
    AVFilterGraph *filter_graph;            /* graph that splits decoded frames into tiles */
    AVFilterContext *bufsrc_ctx;            /* buffer source feeding the graph */
    AVFilterContext **bufsink_ctxes;        /* one buffer sink per tile */
    AVBSFContext *tile_repack_bsf_ctx;      /* av1_tile_repack bitstream filter */
};

//static int debug_level = DEBUG_LEVEL_DEBUG;
/* Single global pipeline context shared by all functions and threads in this file. */
static struct tile_codec_context *p_tile_codec_ctx;

/* Turn 'list' into an empty list: both links refer back to the head itself. */
static inline void INIT_LIST_HEAD(struct list_head *list)
{
    list->prev = list;
    list->next = list;
}

/* Return 1 when the list contains no entries (head points to itself), else 0. */
static inline int list_empty(const struct list_head *head)
{
    const struct list_head *first = head->next;

    return first == head;
}

/* Splice 'new' between two links that are known to be adjacent (prev <-> next). */
static inline void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next)
{
    /* Hook the successor side first ... */
    next->prev = new;
    new->next = next;

    /* ... then the predecessor side. */
    new->prev = prev;
    prev->next = new;
}

/* Insert '_new' right after 'head', i.e. at the front of the list. */
static inline void list_add(struct list_head *_new, struct list_head *head)
{
    struct list_head *first = head->next;

    __list_add(_new, head, first);
}

/* Insert '_new' right before 'head', i.e. at the back of the list. */
static inline void list_add_tail(struct list_head *_new, struct list_head *head)
{
    struct list_head *last = head->prev;

    __list_add(_new, last, head);
}

/* Unlink 'entry' by joining its neighbours; the entry's own links are left untouched. */
static inline void __list_del(struct list_head *entry)
{
    struct list_head *after = entry->next;

    after->prev = entry->prev;
    entry->prev->next = after;
}

/* Remove 'entry' from its list and clear its links to NULL. */
static inline void list_del(struct list_head *entry)
{
    __list_del(entry);
    entry->prev = NULL;
    entry->next = NULL;
}

/**
 * Signal handler that requests a shutdown by raising the global finish flag.
 *
 * Only async-signal-safe operations are used: av_log() is not safe to call
 * from a signal handler, so the notification is emitted with write(2) instead.
 * errno is preserved so that interrupted code does not see a modified value.
 *
 * @param signo signal number (unused)
 */
static void sigint_handler(int signo)
{
    static const char msg[] = "sigint_handler().\n";
    int saved_errno = errno;

    (void) signo;

    /* The signal may arrive before the context has been allocated. */
    if (p_tile_codec_ctx) {
        p_tile_codec_ctx->finish = 1;
    }

    if (write(STDERR_FILENO, msg, sizeof(msg) - 1) < 0) {
        /* Nothing useful can be done about a failed write in a handler. */
    }

    errno = saved_errno;
}

/**
 * Print the command line help text to standard output.
 *
 * NOTE(review): the text lists "-i" twice with two different long option
 * names; the option table is not visible here, so the text is kept verbatim.
 */
static void usage(void)
{
    static const char help_text[] =
        "Usage: \n"
        "-h | --help            help info.\n"
        "-x | --xcoder_params   xcoder parameters.\n"
        "-i | --inputfile       input avc/hevc file.\n"
        "-d | --decoder         decoder name.\n"
        "-e | --encoder         encoder name.\n"
        "-i | --input_file      input filename.\n"
        "-o | --output_file     output filename. Use av1 extension for AV1 OBU, ivf for AV1 IVF\n"
        "-t | --tiles           [column]x[row] of tiles.\n"
        "-j | --threads         number of threads to use, default value is number of processors.\n"
        "-b | --bitrate         bitrate.\n";

    fputs(help_text, stdout);
}

/**
 * Print a progress line ("fps=...") on stderr, at most once every 500000
 * time units, and an additional average line on the last report.
 *
 * The frame count used is the minimum frame_number over all tile instances.
 * The divisions by 1000 and the existing "millisecond" remarks imply that the
 * timestamps are in microseconds -- TODO confirm against the callers.
 *
 * NOTE(review): frame_number is read here without any lock while encoder
 * threads increment it; the value is only used for display.
 *
 * @param is_last_report non-zero for the final report (always printed)
 * @param timer_start    timestamp at which transcoding started
 * @param curr_time      current timestamp
 */
static void print_report(int is_last_report, int64_t timer_start, int64_t curr_time)
{
    static size_t last_frames = 0;
    static int64_t last_time = -1;
    char buf[100] = {'\0'};
    double elapsed_ms, fps;
    size_t curr_frames, num;
    int i;

    /* Interval since the previous report; must be taken before last_time is refreshed. */
    elapsed_ms = (curr_time - last_time) / 1000.0;
    if (!is_last_report) {
        if (last_time == -1) {
            last_time = curr_time;
            return;
        }
        if ((curr_time - last_time) < 500000)
            return;
        last_time = curr_time;
    }

    /* Slowest tile determines the number of completed frames. */
    curr_frames = SIZE_MAX;
    for (i = 0; i < p_tile_codec_ctx->nb_tiles; i++) {
        num = p_tile_codec_ctx->instances[i].frame_number;
        if (num < curr_frames) {
            curr_frames = num;
        }
    }
    if (p_tile_codec_ctx->nb_tiles <= 0) {
        /* No tiles: report zero frames rather than the sentinel value. */
        curr_frames = 0;
    }

    fps = elapsed_ms > 1 ? (double) (curr_frames - last_frames) * 1000.0 / elapsed_ms : 0;
    last_frames = curr_frames;

    /* One decimal is shown only for small rates. */
    snprintf(buf, sizeof(buf), "fps=%3.*f ", (fps < 9.95), fps);
    fprintf(stderr, "Tile transcoding %s   %c", buf, is_last_report ? '\n' : '\r');

    if (is_last_report) {
        elapsed_ms = (curr_time - timer_start) / 1000.0;
        /* Guard against a zero or negative duration. */
        fps = elapsed_ms > 0 ? (double) curr_frames * 1000.0 / elapsed_ms : 0;
        fprintf(stderr, "Average fps=%3.f\n", fps);
    }
    fflush(stderr);
}

/**
 * Check whether every tile instance has at least one packet queued.
 *
 * Each instance's packet list is inspected under its own pkt_mtx.
 *
 * @return 1 when all tiles have a packet available, 0 as soon as one is empty
 */
static int tile_packet_available(void)
{
    int idx = 0;

    while (idx < p_tile_codec_ctx->nb_tiles) {
        struct codec_instance *cur = &p_tile_codec_ctx->instances[idx];
        int queue_is_empty;

        pthread_mutex_lock(&cur->pkt_mtx);
        queue_is_empty = list_empty(&cur->tiled_packet_list);
        pthread_mutex_unlock(&cur->pkt_mtx);

        if (queue_is_empty) {
            return 0;
        }
        idx++;
    }

    return 1;
}

/**
 * Append a packet to the tail of an instance's tiled packet queue.
 *
 * The packet contents are moved into a newly allocated queue node, so
 * 'packet' is left reset by av_packet_move_ref(). Allocation failure aborts
 * via av_assert0().
 *
 * @param inst   instance whose queue receives the packet
 * @param packet packet to hand over
 */
static void packet_dispatch(struct codec_instance *inst, AVPacket *packet)
{
    struct tiled_packet_node *node;

    av_log(NULL, AV_LOG_DEBUG, "### %s line %d %s: \n", __FILE__, __LINE__, __func__);

    node = malloc(sizeof *node);
    av_assert0(node);
    av_packet_move_ref(&node->pkt, packet);

    /* Queue manipulation happens under the packet mutex. */
    pthread_mutex_lock(&inst->pkt_mtx);
    list_add_tail(&node->link, &inst->tiled_packet_list);
    inst->nb_tiled_packets += 1;
    pthread_mutex_unlock(&inst->pkt_mtx);
}

/* Number of leading bytes skipped when dumping a filtered packet to file.
 * NOTE(review): presumably a header prepended by av1_rawtotile -- TODO confirm. */
#define LEN_OUTPKT_HEADER 96

/**
 * Feed one encoded packet to the instance's raw-to-tile bitstream filter and
 * queue every packet the filter produces.
 *
 * @param inst    codec instance owning the bitstream filter and packet queue
 * @param in_pkt  packet to filter, or NULL (passed through to av_bsf_send_packet())
 * @param outfile optional dump file for filtered payloads, may be NULL
 * @return the last libav return code: AVERROR(EAGAIN) once the filter has been
 *         drained, AVERROR_EOF at end of stream, or another negative error code
 */
static int raw_to_tile_bsf(struct codec_instance *inst, AVPacket *in_pkt, FILE *outfile)
{
    int ret;
    AVPacket *out_pkt;
    AVBSFContext *bsf_ctx;

    av_log(NULL, AV_LOG_DEBUG, "### %s line %d %s: \n", __FILE__, __LINE__, __func__);

    if (in_pkt) {
        av_log(NULL, AV_LOG_INFO, "%d: send in_pkt: size %d,dts %"PRId64",pts %"PRId64",sd elems %d\n",
               inst->id, in_pkt->size, in_pkt->dts, in_pkt->pts, in_pkt->side_data_elems);
    }

    /* av_bsf_receive_packet() requires a clean packet, so allocate one
     * instead of using uninitialised stack storage. */
    out_pkt = av_packet_alloc();
    if (!out_pkt) {
        av_log(NULL, AV_LOG_ERROR, "failed to allocate bsf output packet\n");
        return AVERROR(ENOMEM);
    }

    bsf_ctx = inst->raw_to_tile_bsf_ctx;
    ret = av_bsf_send_packet(bsf_ctx, in_pkt);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "failed to send packet to bsf\n");
        goto out;
    }

    for (; ;) {
        ret = av_bsf_receive_packet(bsf_ctx, out_pkt);  //av1_rawtotile_filter()
        if (ret == AVERROR(EAGAIN)) {
            break;
        } else if (ret < 0) {
            if (ret == AVERROR_EOF)
                av_log(NULL, AV_LOG_ERROR, "eof reach\n");
            else
                av_log(NULL, AV_LOG_ERROR, "failed to receive packet %p from bsf: %s\n", (void *) in_pkt, av_err2str(ret));
            break;
        }

        av_log(NULL, AV_LOG_DEBUG, "### %s line %d %s: %d: recv out_pkt: size %d,dts %"PRId64",pts %"PRId64",sd elems %d\n",
                __FILE__, __LINE__, __func__, inst->id, out_pkt->size, out_pkt->dts, out_pkt->pts, out_pkt->side_data_elems);

        /* Skip the dump when the packet is not larger than the header; a
         * negative length would otherwise be converted to a huge size_t. */
        if (outfile && out_pkt->size > LEN_OUTPKT_HEADER) {
            fwrite(out_pkt->data + LEN_OUTPKT_HEADER, 1, out_pkt->size - LEN_OUTPKT_HEADER, outfile);
        }

        /* Moves the contents away and leaves out_pkt clean for the next round. */
        packet_dispatch(inst, out_pkt);
    }

out:
    av_packet_free(&out_pkt);
    return ret;
}

/**
 * Send one frame (or a flush request) to the instance's encoder and process
 * every packet that becomes available.
 *
 * Each received packet is optionally dumped to 'outfile', passed through the
 * raw-to-tile bitstream filter and then unreferenced.
 *
 * @param inst    codec instance providing the encoder context
 * @param frame   frame to encode, or NULL to flush the encoder
 * @param pkt     scratch packet used to receive encoder output
 * @param outfile optional dump file for raw encoder output, may be NULL
 * @return AVERROR(EAGAIN) when the encoder needs more input, AVERROR_EOF when
 *         fully flushed, or another negative error code
 */
static int do_encode(struct codec_instance *inst, AVFrame *frame, AVPacket *pkt, FILE *outfile)
{
    int ret;
    AVCodecContext *enc_ctx;

    av_log(NULL, AV_LOG_DEBUG, "### %s line %d %s: \n", __FILE__, __LINE__, __func__);

    /* send the frame to the encoder */
    if (frame) {
        av_log(NULL, AV_LOG_INFO, "%d: Send frame %3"PRId64", type=%d\n", inst->id, frame->pts, frame->pict_type);
    }

    enc_ctx = inst->enc_ctx;
    /* NOTE(review): EAGAIN is retried immediately without draining output
     * first; this spins until the encoder accepts the frame. */
    for (; ;) {
        ret = avcodec_send_frame(enc_ctx, frame);
        if (ret == AVERROR(EAGAIN)) {
            continue;
        } else if (ret < 0) {
            fprintf(stderr, "Error sending a frame for encoding\n");
            return ret;
        } else {
            break;
        }
    }

    while (ret >= 0) {
        ret = avcodec_receive_packet(enc_ctx, pkt);
        if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
            return ret;
        } else if (ret < 0) {
            fprintf(stderr, "Error during encoding\n");
            return ret;
        }

        /* frame_number is a size_t, so it must be printed with %zu. */
        av_log(NULL, AV_LOG_DEBUG, "### %s line %d %s: frame_number %zu pkt->size %d\n", __FILE__, __LINE__, __func__, inst->frame_number, pkt->size);

        inst->frame_number++;
        if (outfile) {
            fwrite(pkt->data, 1, pkt->size, outfile);
        }

        /* The filter reports its own errors; its normal "drained" result is
         * AVERROR(EAGAIN), so the return value is not used here. */
        raw_to_tile_bsf(inst, pkt, inst->rawtotile_fp);

        av_packet_unref(pkt);
    }

    return 0;
}

/**
 * Create and initialise the av1_rawtotile bitstream filter for one instance.
 *
 * The filter is configured with the full picture size, the tile grid and this
 * tile's geometry; its input parameters are taken from the opened encoder
 * context. On success the filter is stored in inst->raw_to_tile_bsf_ctx; on
 * failure the partially built filter is freed.
 *
 * @param inst codec instance with a valid enc_ctx and tile_info
 * @return 0 on success, a negative AVERROR code on failure
 */
static int raw_to_tile_bsf_init(struct codec_instance *inst)
{
    int ret, len, video_width, video_height, split_row, split_col;
    char options[128] = { 0 };
    AVBSFContext *bsf_ctx = NULL;

    av_log(NULL, AV_LOG_INFO, "ready to init bsf\n");

    video_width = p_tile_codec_ctx->video_width;
    video_height = p_tile_codec_ctx->video_height;
    split_row = p_tile_codec_ctx->split_row;
    split_col = p_tile_codec_ctx->split_col;
    len = snprintf(options, sizeof(options), "av1_rawtotile=width=%d:height=%d:column=%d:row=%d:x=%d:y=%d:x_w=%d:y_h=%d",
                   video_width, video_height, split_col, split_row, inst->tile_info.x, inst->tile_info.y, inst->tile_info.w, inst->tile_info.h);
    if (len < 0 || (size_t) len >= sizeof(options)) {
        /* A truncated option string would configure the filter incorrectly. */
        av_log(NULL, AV_LOG_ERROR, "bsf option string too long\n");
        return AVERROR(EINVAL);
    }

    ret = av_bsf_list_parse_str(options, &bsf_ctx);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "failed to allocate bsf\n");
        return ret;
    }

    /* Fill the filter's input parameters directly from the encoder context. */
    ret = avcodec_parameters_from_context(bsf_ctx->par_in, inst->enc_ctx);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "failed to copy encoder parameters to bsf\n");
        av_bsf_free(&bsf_ctx);
        return ret;
    }

    ret = av_bsf_init(bsf_ctx);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "Error initialize bitstream filter = %s\n", av_err2str(ret));
        av_bsf_free(&bsf_ctx);
        return ret;
    }

    inst->raw_to_tile_bsf_ctx = bsf_ctx;
    av_log(NULL, AV_LOG_INFO, "bsf init done\n");

    return 0;
}

/**
 * Create and initialise the av1_tile_repack bitstream filter.
 *
 * The filter's input parameters are copied from the given stream. On success
 * the filter is stored in the global context; on failure it is freed.
 *
 * @param stream   stream whose codec parameters describe the filter input
 * @param tile_num number of tiles, passed as the filter's "tile_num" option
 * @return 0 on success, a negative AVERROR code on failure
 */
static int tile_repack_bsf_init(AVStream *stream, int tile_num)
{
    int ret;
    AVBSFContext *bsf_ctx = NULL;
    char options[128] = { 0 };

    snprintf(options, sizeof(options), "av1_tile_repack=tile_num=%d", tile_num);
    ret = av_bsf_list_parse_str(options, &bsf_ctx);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "failed to parse av1 tile repack bsf\n");
        return ret;
    }

    ret = avcodec_parameters_copy(bsf_ctx->par_in, stream->codecpar);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "failed to copy stream parameters to repack bsf\n");
        av_bsf_free(&bsf_ctx);
        return ret;
    }

    ret = av_bsf_init(bsf_ctx);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "Error initialize bitstream filter\n");
        av_bsf_free(&bsf_ctx);
        return ret;
    }

    p_tile_codec_ctx->tile_repack_bsf_ctx = bsf_ctx;
    av_log(NULL, AV_LOG_INFO, "av1 tile repack bsf init done\n");

    return 0;
}

static void encode_thread_sync(void)
{
    pthread_mutex_lock(&p_tile_codec_ctx->mutex);
    while (p_tile_codec_ctx->active_workers < p_tile_codec_ctx->total_threads) {
        pthread_cond_wait(&p_tile_codec_ctx->cond, &p_tile_codec_ctx->mutex);
    }
    pthread_mutex_unlock(&p_tile_codec_ctx->mutex);
}

/**
 * Worker thread entry point.
 *
 * Announces itself to the shared context, then repeatedly takes a work item
 * from its own inst_list, pops the first frame of the referenced instance and
 * encodes it (a NULL frame flushes the encoder). The loop ends when the
 * global finish flag is set, or when the list is empty and filter_finish is
 * set.
 *
 * @param arg pointer to this worker's struct tile_thread_context
 * @return always (void *) 0
 */
static void *encode_routine(void *arg)
{
    struct tile_thread_context *worker = (struct tile_thread_context *) arg;
    struct tiled_frame_node *tfn;
    struct codec_instance_node *cin;
    struct codec_instance *inst;
    struct tile_codec_context *ctx = worker->codec_ctx;
    AVFrame *frame;

    /* NOTE(review): pthread_t is printed with %ld; this is only valid where
     * pthread_t is an integer type of that width. */
    av_log(NULL, AV_LOG_INFO, "tile_thread_context %ld enter.\n", worker->tid);

    // I am ready
    pthread_mutex_lock(&ctx->mutex);
    ctx->active_workers++;
    if (ctx->active_workers == ctx->total_threads) {
        /* Last worker to start wakes whoever waits in encode_thread_sync(). */
        pthread_cond_signal(&ctx->cond);
    }
    pthread_mutex_unlock(&ctx->mutex);

    for (; ;) {
        pthread_mutex_lock(&worker->inst_mtx);

        /* Sleep until a work item is queued or the filter stage has finished.
         * NOTE(review): ctx->finish is not part of this predicate, so a
         * worker that is already waiting is not released by the flag alone. */
        while (list_empty(&worker->inst_list) && !ctx->filter_finish) {
            pthread_cond_wait(&worker->inst_cond, &worker->inst_mtx);
        }

        if (ctx->finish || (list_empty(&worker->inst_list) && ctx->filter_finish)) {
            // End of all frame encoding including flushing.
            pthread_mutex_unlock(&worker->inst_mtx);
            break;
        }

        /* Take the oldest work item off this worker's list. */
        cin = list_first_entry(&worker->inst_list, struct codec_instance_node, link);
        av_assert0(cin != NULL && cin->inst != NULL);
        list_del(&cin->link);
        inst = cin->inst;
        worker->nb_instances--;

        pthread_mutex_unlock(&worker->inst_mtx);

        /* Pop the first queued frame of that instance. frame_dispatch() queues
         * the frame before the work item, so the list is expected to be
         * non-empty here. */
        pthread_mutex_lock(&inst->frm_mtx);
        tfn = list_first_entry(&cin->inst->tiled_frame_list, struct tiled_frame_node, link);
        av_assert0(tfn != NULL);
        list_del(&tfn->link);
        inst->nb_tiled_frames--;
        pthread_mutex_unlock(&inst->frm_mtx);
        frame = tfn->frame;
        free(tfn);
        free(cin);

        if (!frame) {
            /* flush the encoder */
            do_encode(inst, NULL, &inst->pkt, inst->encoder_fp);
        } else {
            av_log(NULL, AV_LOG_INFO, "%d: Fetch frame pts %ld\n", inst->id, frame->pts);
            do_encode(inst, frame, &inst->pkt, inst->encoder_fp);
            av_frame_free(&frame);
        }
    }

    // Release tile codec context condvar.
    pthread_mutex_lock(&ctx->mutex);
    ctx->active_workers--;
    pthread_mutex_unlock(&ctx->mutex);

    av_log(NULL, AV_LOG_INFO, "tile_thread_context %ld exit\n", worker->tid);
    return (void *) 0;
}

/**
 * Queue a frame on a codec instance and notify the worker that services it.
 *
 * The frame is appended to the instance's frame list first; a work item that
 * references the instance is then appended to the worker's list and the
 * worker's condition variable is signalled. Allocation failures abort via
 * av_assert0().
 *
 * @param worker worker thread that will encode the frame
 * @param inst   codec instance the frame belongs to
 * @param frame  frame to encode, or NULL to request an encoder flush
 */
static void frame_dispatch(struct tile_thread_context *worker,
                           struct codec_instance *inst, AVFrame *frame)
{
    struct tiled_frame_node *frame_node;
    struct codec_instance_node *work_item;

    av_log(NULL, AV_LOG_DEBUG, "### %s line %d %s: \n", __FILE__, __LINE__, __func__);

    /* Step 1: publish the frame on the instance queue. */
    frame_node = malloc(sizeof *frame_node);
    av_assert0(frame_node);
    frame_node->frame = frame;

    pthread_mutex_lock(&inst->frm_mtx);
    list_add_tail(&frame_node->link, &inst->tiled_frame_list);
    inst->nb_tiled_frames += 1;
    pthread_mutex_unlock(&inst->frm_mtx);

    /* Step 2: tell the worker which instance has work pending. */
    work_item = malloc(sizeof *work_item);
    av_assert0(work_item);
    work_item->inst = inst;

    pthread_mutex_lock(&worker->inst_mtx);
    list_add_tail(&work_item->link, &worker->inst_list);
    worker->nb_instances += 1;
    pthread_cond_signal(&worker->inst_cond);
    pthread_mutex_unlock(&worker->inst_mtx);
}

/**
 * Initialise one codec instance: mutexes, optional dump files, the encoder
 * context and the raw-to-tile bitstream filter.
 *
 * Only encoders whose name starts with "av1_ni" are accepted.
 *
 * On failure the mutexes initialised here are destroyed again and an encoder
 * context that has not yet been stored in the instance is freed. Dump files
 * already opened, and an encoder context already stored in inst->enc_ctx,
 * are left in the instance.
 *
 * @param inst     instance to initialise (id, encoder, tile_info and
 *                 codec_params must already be set)
 * @param bit_rate target bitrate for the encoder
 * @param fps      frame rate used for time_base / framerate
 * @param sar      sample aspect ratio
 * @param logging  non-zero to dump encoder and filter output to
 *                 "output-<id>.av1" and "rawtotile-<id>.av1"
 * @return 0 on success, a negative AVERROR code on failure
 */
static int codec_instance_init(struct codec_instance *inst, int bit_rate, int fps,
                             AVRational sar, int logging)
{
    int ret, devid, saved_errno;
    int pkt_mtx_ready = 0, frm_mtx_ready = 0, ctx_stored = 0;
    const AVCodec *encoder;
    AVCodecContext *enc_ctx = NULL;
    char name[256] = { 0 };
    char rawtotile_name[256] = { 0 };
    char str_devid[8] = { 0 };

    /* pthread functions return a positive errno value; convert it so that
     * every failure of this function is a negative AVERROR code. */
    ret = pthread_mutex_init(&inst->pkt_mtx, NULL);
    if (ret) {
        ret = AVERROR(ret);
        goto end;
    }
    pkt_mtx_ready = 1;

    ret = pthread_mutex_init(&inst->frm_mtx, NULL);
    if (ret) {
        ret = AVERROR(ret);
        goto end;
    }
    frm_mtx_ready = 1;

    encoder = inst->encoder;
    if (!encoder) {
        av_log(NULL, AV_LOG_ERROR, "no encoder set for instance %d\n", inst->id);
        ret = AVERROR_ENCODER_NOT_FOUND;
        goto end;
    }

    snprintf(name, sizeof(name), "output-%d.%s", inst->id, "av1");
    snprintf(rawtotile_name, sizeof(rawtotile_name), "rawtotile-%d.%s", inst->id, "av1");

    if (logging) {
        inst->encoder_fp = fopen(name, "wb");
        if (!inst->encoder_fp) {
            /* Capture errno before logging can overwrite it. */
            saved_errno = errno;
            av_log(NULL, AV_LOG_ERROR, "Failed to open output file: %s.\n", strerror(saved_errno));
            ret = AVERROR(saved_errno);
            goto end;
        }

        inst->rawtotile_fp = fopen(rawtotile_name, "wb");
        if (!inst->rawtotile_fp) {
            saved_errno = errno;
            av_log(NULL, AV_LOG_ERROR, "Failed to open output file: %s.\n", strerror(saved_errno));
            ret = AVERROR(saved_errno);
            goto end;
        }
    }

    enc_ctx = avcodec_alloc_context3(encoder);
    if (!enc_ctx) {
        av_log(NULL, AV_LOG_ERROR, "Could not allocate video encoder context\n");
        ret = AVERROR(ENOMEM);
        goto end;
    }

    /* resolution must be a multiple of two */
    enc_ctx->width = inst->tile_info.w;
    enc_ctx->height = inst->tile_info.h;
    enc_ctx->bit_rate = bit_rate;
    enc_ctx->time_base = (AVRational){1, fps};
    enc_ctx->framerate = (AVRational){fps, 1};
    enc_ctx->sample_aspect_ratio = sar;
    /* emit one intra frame every ten frames
     * check frame pict_type before passing frame
     * to encoder, if frame->pict_type is AV_PICTURE_TYPE_I
     * then gop_size is ignored and the output of encoder
     * will always be I frame irrespective to gop_size
     */
    enc_ctx->gop_size = 10;
    enc_ctx->max_b_frames = 1;
    enc_ctx->pix_fmt = p_tile_codec_ctx->dec_ctx->pix_fmt;

    if (!strncmp(encoder->name, "av1_ni", strlen("av1_ni"))) {
        /* for example:
         * gopPresetIdx=6:lowDelay=1:intraPeriod=120:RcEnable=1:bitrate=4000000
         */
        if (inst->codec_params) {
            av_log(NULL, AV_LOG_INFO, "param: %s\n", inst->codec_params);
            av_opt_set(enc_ctx->priv_data, "xcoder-params", inst->codec_params, 0);
        }
        if (p_tile_codec_ctx->nb_encoders <= 0) {
            /* Avoid a modulo by zero when picking the device. */
            av_log(NULL, AV_LOG_ERROR, "invalid number of encoders: %d\n", p_tile_codec_ctx->nb_encoders);
            ret = AVERROR(EINVAL);
            goto end;
        }
        /* Spread the instances over the available encoder devices. */
        devid = inst->id % p_tile_codec_ctx->nb_encoders;
        snprintf(str_devid, sizeof(str_devid), "%d", devid);
        av_opt_set(enc_ctx->priv_data, "enc", str_devid, 0);
    } else {
        av_log(NULL, AV_LOG_ERROR, "codec %s not supported.\n", encoder->name);
        ret = AVERROR(EINVAL);
        goto end;
    }

    ret = avcodec_open2(enc_ctx, encoder, NULL);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "Could not open codec: %s\n",
               av_err2str(ret));
        goto end;
    }
    inst->enc_ctx = enc_ctx;
    ctx_stored = 1;

    ret = raw_to_tile_bsf_init(inst);
    if (ret) {
        /* Keep the real error code instead of overwriting it. */
        av_log(NULL, AV_LOG_ERROR, "Failed to init raw to tile bsf.\n");
        goto end;
    }

    return 0;

end:
    /* Free the encoder context only while it is still exclusively ours. */
    if (!ctx_stored) {
        avcodec_free_context(&enc_ctx);
    }
    /* Destroy only what was successfully initialised. */
    if (frm_mtx_ready) {
        pthread_mutex_destroy(&inst->frm_mtx);
    }
    if (pkt_mtx_ready) {
        pthread_mutex_destroy(&inst->pkt_mtx);
    }
    return ret;
}

/**
 * Initialise the mutex and condition variable of a worker context.
 *
 * If the condition variable cannot be created, the mutex initialised just
 * before is destroyed again; objects that were never initialised are not
 * touched.
 *
 * @param worker worker context to initialise
 * @return 0 on success, otherwise the error number returned by the failing
 *         pthread call
 */
static int tile_thread_context_init(struct tile_thread_context *worker)
{
    int ret;

    ret = pthread_mutex_init(&worker->inst_mtx, NULL);
    if (ret) {
        return ret;
    }

    ret = pthread_cond_init(&worker->inst_cond, NULL);
    if (ret) {
        pthread_mutex_destroy(&worker->inst_mtx);
        return ret;
    }

    return 0;
}

/**
 * Release everything owned by a codec instance: bitstream filter, encoder
 * context, both mutexes and the optional dump files.
 *
 * @param inst instance to clean up; NULL is accepted and ignored
 */
static void codec_instance_cleanup(struct codec_instance *inst)
{
    if (!inst) {
        return;
    }

    av_bsf_free(&inst->raw_to_tile_bsf_ctx);
    avcodec_free_context(&inst->enc_ctx);

    pthread_mutex_destroy(&inst->pkt_mtx);
    pthread_mutex_destroy(&inst->frm_mtx);

    if (inst->encoder_fp) {
        fclose(inst->encoder_fp);
    }
    if (inst->rawtotile_fp) {
        fclose(inst->rawtotile_fp);
    }
}

static int input_file_open(const char *filename, const char *decoder_name)
{
    int i, ret, video_stream_index;
    const AVCodec *dec;
    AVStream *st;
    AVFormatContext *ifmt_ctx = NULL;
    AVCodecContext *dec_ctx;

    ret = avformat_open_input(&ifmt_ctx, filename, NULL, NULL);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "failed to open input file\n");
        exit(1);
    }

    ret = avformat_find_stream_info(ifmt_ctx, NULL);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "Cannot find stream information\n");
        return ret;
    }

    // Find lowest indexed video stream
    for (i = 0; i < ifmt_ctx->nb_streams; i++)
    {
      if (ifmt_ctx->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO)
      {
        video_stream_index = i;
        break;
      }
    }
    st = ifmt_ctx->streams[video_stream_index];

    dec = avcodec_find_decoder_by_name(decoder_name);
    if (!dec) {
        av_log(NULL, AV_LOG_ERROR, "Codec '%s' not found\n", decoder_name);
        return AVERROR(ENOMEM);
    }

    dec_ctx = avcodec_alloc_context3(dec);
    if (!dec_ctx) {
        av_log(NULL, AV_LOG_ERROR, "Failed to allocate the decoder context for video stream.\n");
        return AVERROR(ENOMEM);
    }

    ret = avcodec_parameters_to_context(dec_ctx, st->codecpar);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "Failed to copy decoder parameters to input decoder context for video stream\n");
        return ret;
    }

    ret = avcodec_open2(dec_ctx, dec, NULL);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "Failed to open decoder for video stream\n");
        return ret;
    }

    av_dump_format(ifmt_ctx, 0, filename, 0);

    p_tile_codec_ctx->ifmt_ctx = ifmt_ctx;
    p_tile_codec_ctx->dec_ctx = dec_ctx;
    p_tile_codec_ctx->video_width = dec_ctx->width;
    p_tile_codec_ctx->video_height = dec_ctx->height;
    p_tile_codec_ctx->frame_rate = st->r_frame_rate.num / st->r_frame_rate.den;
    p_tile_codec_ctx->sar = dec_ctx->sample_aspect_ratio;
    p_tile_codec_ctx->video_stream_index = video_stream_index;

    return 0;
}

/**
 * Create the output muxer with a single AV1 video stream and write the file
 * header.
 *
 * The output name "null" selects the "null" muxer. A name whose extension is
 * exactly "av1" is muxed with the format guessed from "raw_av1.yuv", while
 * the context's url is set to the requested file name. Any other name lets
 * libavformat guess the format from the file name. On failure the partially
 * built context is released.
 *
 * @param output_file output file name, or "null"
 * @param width       picture width stored in the stream parameters
 * @param height      picture height stored in the stream parameters
 * @param fps         denominator of the stream time base (1/fps)
 * @return 0 on success, a negative AVERROR code on failure
 */
static int output_file_open(const char *output_file, int width, int height, int fps)
{
    int ret;
    AVStream *out_stream;
    AVFormatContext *ofmt_ctx = NULL;
    const char *ext;

    if (!strcmp(output_file, "null")) {
        avformat_alloc_output_context2(&ofmt_ctx, NULL, "null", NULL);
    } else {
        /* Note: The file extension string should be in output_file here for
                 avformat_alloc_output_context2() to auto-detect output format */
        ext = strrchr(output_file, '.');
        if (ext && !strcmp(ext + 1, "av1")) {
            avformat_alloc_output_context2(&ofmt_ctx, NULL, NULL, "raw_av1.yuv");
            if (ofmt_ctx) {
                /* Replace the placeholder url without leaking the old one. */
                char *url = av_strdup(output_file);
                if (!url) {
                    avformat_free_context(ofmt_ctx);
                    return AVERROR(ENOMEM);
                }
                av_free(ofmt_ctx->url);
                ofmt_ctx->url = url;
            }
        } else {
            avformat_alloc_output_context2(&ofmt_ctx, NULL, NULL, output_file);
        }
    }

    if (!ofmt_ctx) {
        av_log(NULL, AV_LOG_ERROR, "Could not create output context\n");
        return AVERROR_UNKNOWN;
    }

    //Add Stream
    out_stream = avformat_new_stream(ofmt_ctx, NULL);
    if (!out_stream) {
        av_log(NULL, AV_LOG_ERROR, "Failed allocating output stream\n");
        ret = AVERROR_UNKNOWN;
        goto fail;
    }

    out_stream->codecpar->codec_type = AVMEDIA_TYPE_VIDEO;
    out_stream->codecpar->codec_id = AV_CODEC_ID_AV1;
    out_stream->codecpar->width = width;
    out_stream->codecpar->height = height;
    out_stream->time_base.num = 1;
    out_stream->time_base.den = fps;

    av_dump_format(ofmt_ctx, 0, output_file, 1);

    if (!(ofmt_ctx->oformat->flags & AVFMT_NOFILE)) {
        ret = avio_open(&ofmt_ctx->pb, output_file, AVIO_FLAG_WRITE);
        if (ret < 0) {
            av_log(NULL, AV_LOG_ERROR, "Could not open output file '%s'\n", output_file);
            goto fail;
        }
    }

    /* init muxer, write output file header */
    ret = avformat_write_header(ofmt_ctx, NULL);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "Error occurred when opening output file\n");
        goto fail;
    }

    p_tile_codec_ctx->ofmt_ctx = ofmt_ctx;

    return 0;

fail:
    /* Close the I/O context only if this function opened it. */
    if (ofmt_ctx->pb && !(ofmt_ctx->oformat->flags & AVFMT_NOFILE)) {
        avio_closep(&ofmt_ctx->pb);
    }
    avformat_free_context(ofmt_ctx);
    return ret;
}

/**
 * Build the filter graph described by 'graph_desc', attach a "buffer" source
 * to every open input and a "buffersink" to every open output, then configure
 * the graph.
 *
 * The created source is stored in the global bufsrc_ctx (the last one wins if
 * the graph has several inputs) and the sinks in bufsink_ctxes[]. Depending
 * on the libavfilter version the outputs are mapped to sink indices in
 * ascending or descending order.
 *
 * @param graph_desc textual filter graph description
 * @param width      input picture width
 * @param height     input picture height
 * @param fps        frame rate; the source time base is 1/fps
 * @param sar        input sample aspect ratio
 * @param nb_tiles   number of buffer sinks to create
 * @return a non-negative value on success, a negative AVERROR code on failure
 */
static int filter_graph_open(const char *graph_desc, int width, int height,
                             int fps, AVRational sar, int nb_tiles)
{
    int i, step, linked, ret;
    AVFilterInOut *inputs = NULL, *outputs = NULL, *cur;

    if (nb_tiles <= 0) {
        av_log(NULL, AV_LOG_ERROR, "invalid number of tiles: %d\n", nb_tiles);
        return AVERROR(EINVAL);
    }

    p_tile_codec_ctx->filter_graph = avfilter_graph_alloc();
    if (p_tile_codec_ctx->filter_graph == NULL) {
        av_log(NULL, AV_LOG_ERROR, "failed to allocate filter graph\n");
        ret = AVERROR(ENOMEM);
        goto out;
    }

    ret = avfilter_graph_parse2(p_tile_codec_ctx->filter_graph, graph_desc, &inputs, &outputs);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "failed to parse graph\n");
        goto out;
    }

    for (cur = inputs, i = 0; cur; cur = cur->next, i++) {
        char args[512] = { 0 };
        char name[255] = { 0 };

        snprintf(name, sizeof(name), "in_%d", i);
        snprintf(args, sizeof(args), "video_size=%dx%d:pix_fmt=%d:time_base=%d/%d:pixel_aspect=%d/%d",
                 width, height, p_tile_codec_ctx->dec_ctx->pix_fmt, 1, fps, sar.num, sar.den);
        av_log(NULL, AV_LOG_INFO, "input filter args: %s\n", args);
        ret = avfilter_graph_create_filter(&p_tile_codec_ctx->bufsrc_ctx, avfilter_get_by_name("buffer"),
                                           name, args, NULL, p_tile_codec_ctx->filter_graph);
        if (ret < 0) {
            av_log(NULL, AV_LOG_ERROR, "failed to create input filter: %d\n", i);
            goto out;
        }

        ret = avfilter_link(p_tile_codec_ctx->bufsrc_ctx, 0, cur->filter_ctx, cur->pad_idx);
        if (ret < 0) {
            av_log(NULL, AV_LOG_ERROR, "failed to link input filter: %d\n", i);
            goto out;
        }
    }

    /* Zero-initialised so that unassigned entries are NULL, not garbage. */
    p_tile_codec_ctx->bufsink_ctxes = calloc((size_t) nb_tiles, sizeof(*p_tile_codec_ctx->bufsink_ctxes));
    av_assert0(p_tile_codec_ctx->bufsink_ctxes);

    /* Mapping direction of graph outputs to sink indices depends on the
     * libavfilter version. */
#if IS_FFMPEG_61_AND_ABOVE
    i = 0;
    step = 1;
#else
    i = nb_tiles - 1;
    step = -1;
#endif
    linked = 0;
    for (cur = outputs; cur && i >= 0 && i < nb_tiles; cur = cur->next, i += step) {
        char name[255] = { 0 };

        snprintf(name, sizeof(name), "out_%d", i);
        ret = avfilter_graph_create_filter(&p_tile_codec_ctx->bufsink_ctxes[i], avfilter_get_by_name("buffersink"),
                                           name, NULL, NULL, p_tile_codec_ctx->filter_graph);
        if (ret < 0) {
            av_log(NULL, AV_LOG_ERROR, "failed to create output filter: %d\n", i);
            goto out;
        }

        ret = avfilter_link(cur->filter_ctx, cur->pad_idx, p_tile_codec_ctx->bufsink_ctxes[i], 0);
        if (ret < 0) {
            av_log(NULL, AV_LOG_ERROR, "failed to link output filter: %d\n", i);
            goto out;
        }
        linked++;
    }

    if (linked < nb_tiles) {
        /* Every tile needs its own sink; a missing one would be used as a
         * NULL context later on. */
        av_log(NULL, AV_LOG_ERROR, "filter graph provides %d outputs, %d tiles expected\n", linked, nb_tiles);
        ret = AVERROR(EINVAL);
        goto out;
    }

    ret = avfilter_graph_config(p_tile_codec_ctx->filter_graph, NULL);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "failed to config graph filter\n");
        goto out;
    }

out:
    avfilter_inout_free(&inputs);
    avfilter_inout_free(&outputs);
    return ret;
}

/**
 * Push one decoded frame through the filter graph and dispatch every tile
 * frame that comes out to the worker responsible for that tile.
 *
 * Tile i is serviced by worker (i % total_threads). Passing NULL signals end
 * of stream to the graph; after the remaining frames have been dispatched, a
 * flush marker (NULL frame) is queued for every tile, filter_finish is set
 * and all workers are woken so that idle ones can observe the flag.
 *
 * @param in_frame decoded frame, or NULL at end of stream
 * @return 0 on success, a negative AVERROR code on failure
 */
static int filter_frame(AVFrame *in_frame)
{
    int i, ret;
    AVFrame *filt_frame;
    struct codec_instance *instances;
    struct tile_thread_context *workers;

    av_log(NULL, AV_LOG_DEBUG, "### %s line %d %s: \n", __FILE__, __LINE__, __func__);

    instances = p_tile_codec_ctx->instances;
    workers = p_tile_codec_ctx->workers;
    if (in_frame) {
        /* The AVFrame crop fields are size_t, hence %zu. */
        av_log(NULL, AV_LOG_INFO, "frame pixel=%dx%d, crop=%zu/%zu/%zu/%zu linesize %d/%d/%d\n",
               in_frame->width, in_frame->height, in_frame->crop_top,
               in_frame->crop_bottom, in_frame->crop_left, in_frame->crop_right,
               in_frame->linesize[0], in_frame->linesize[1], in_frame->linesize[2]);

        av_log(NULL, AV_LOG_INFO, "ref count %d/%d/%d\n",
               in_frame->buf[0] ? av_buffer_get_ref_count(in_frame->buf[0]) : -1,
               in_frame->buf[1] ? av_buffer_get_ref_count(in_frame->buf[1]) : -1,
               in_frame->buf[2] ? av_buffer_get_ref_count(in_frame->buf[2]) : -1);

        /* Clear the decoder's picture type before the frame is filtered. */
        in_frame->pict_type = AV_PICTURE_TYPE_NONE;
        av_log(NULL, AV_LOG_INFO, "decode frame %3"PRId64", type=%d\n", in_frame->pts, in_frame->pict_type);
    }

    ret = av_buffersrc_add_frame_flags(p_tile_codec_ctx->bufsrc_ctx, in_frame, AV_BUFFERSRC_FLAG_KEEP_REF);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "Failed to add frame to buffersrc\n");
        return ret;
    }

    for (i = 0; i < p_tile_codec_ctx->nb_tiles; i++) {
        /* Drain everything currently available on this tile's sink. */
        for (; ;) {
            filt_frame = av_frame_alloc();
            av_assert0(filt_frame);

            ret = av_buffersink_get_frame(p_tile_codec_ctx->bufsink_ctxes[i], filt_frame);
            if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
                av_frame_free(&filt_frame);
                break;
            } else if (ret < 0) {
                av_log(NULL, AV_LOG_ERROR, "%d: failed to get frame from buffersink\n", i);
                av_frame_free(&filt_frame);
                return ret;
            }

            av_log(NULL, AV_LOG_INFO, "%d: filt frame pixel=%dx%d, crop=%zu/%zu/%zu/%zu, linesize=%d/%d/%d\n",
                   i, filt_frame->width, filt_frame->height, filt_frame->crop_top,
                   filt_frame->crop_bottom, filt_frame->crop_left, filt_frame->crop_right,
                   filt_frame->linesize[0], filt_frame->linesize[1], filt_frame->linesize[2]);

            av_log(NULL, AV_LOG_INFO, "filt ref count %d/%d/%d\n",
                   filt_frame->buf[0] ? av_buffer_get_ref_count(filt_frame->buf[0]) : -1,
                   filt_frame->buf[1] ? av_buffer_get_ref_count(filt_frame->buf[1]) : -1,
                   filt_frame->buf[2] ? av_buffer_get_ref_count(filt_frame->buf[2]) : -1);

            /* Ownership of filt_frame passes to the worker queue. */
            frame_dispatch(&workers[i % p_tile_codec_ctx->total_threads], &instances[i], filt_frame);
        }
    }

    /* flush decoder */
    if (!in_frame) {
        for (i = 0; i < p_tile_codec_ctx->nb_tiles; i++) {
            frame_dispatch(&workers[i % p_tile_codec_ctx->total_threads], &instances[i], NULL);
            av_log(NULL, AV_LOG_INFO, "%d: submit flush encoder\n", i);
        }

        /* Workers test filter_finish under their own inst_mtx. Set the flag
         * while holding every worker's mutex and wake them all, so that a
         * worker sleeping on an empty list cannot miss the change. */
        for (i = 0; i < p_tile_codec_ctx->total_threads; i++) {
            pthread_mutex_lock(&workers[i].inst_mtx);
        }
        p_tile_codec_ctx->filter_finish = 1;
        for (i = 0; i < p_tile_codec_ctx->total_threads; i++) {
            pthread_cond_broadcast(&workers[i].inst_cond);
            pthread_mutex_unlock(&workers[i].inst_mtx);
        }
    }

    return 0;
}

/**
 * Send one packet to the decoder and forward every decoded frame to the
 * filter graph.
 *
 * When the decoder reports end of stream, filter_frame(NULL) is called once
 * to flush the filter graph and the encoders.
 *
 * @param dec_ctx opened decoder context
 * @param frame   scratch frame used to receive decoder output
 * @param pkt     packet to decode (passed through to avcodec_send_packet())
 * @param t0      start timestamp forwarded to print_report()
 * @return AVERROR(EAGAIN) when the decoder needs more input, AVERROR_EOF at
 *         end of stream, or another negative error code
 */
static int do_decode(AVCodecContext *dec_ctx, AVFrame *frame, AVPacket *pkt, int64_t t0)
{
    int64_t t1;
    int ret = avcodec_send_packet(dec_ctx, pkt);
    if (ret < 0 && ret != AVERROR_EOF) {
        av_log(NULL, AV_LOG_ERROR, "error sending a packet for decoding\n");
        return ret;
    }

    av_log(NULL, AV_LOG_DEBUG, "### %s line %d %s: \n", __FILE__, __LINE__, __func__);

    ret = 0;
    // Attempt to retrieve a decoded frame
    while (ret >= 0) {
        t1 = av_gettime_relative();
        ret = avcodec_receive_frame(dec_ctx, frame);
        if (ret == AVERROR_EOF) {
            av_log(NULL, AV_LOG_DEBUG, "\ndecoder EOF reached\n");
            filter_frame(NULL);
            break;
        } else if (ret == AVERROR(EAGAIN)) {
            // Failed to get a frame because decoder output buffer is empty
            break;
        } else if (ret < 0) {
            // Decoding error occurred, possibly bad packet in input
            av_log(NULL, AV_LOG_ERROR, "Error during decoding: %s\n", av_err2str(ret));
            break;
        } else {
            /* pts is an int64_t, so it must be printed with PRId64. */
            av_log(NULL, AV_LOG_INFO, "read one pkt: %"PRId64"\n", frame->pts);
            ret = filter_frame(frame);
            av_frame_unref(frame);
            print_report(0, t0, t1);
            if (ret < 0) {
                av_log(NULL, AV_LOG_ERROR, "Failed to send frame to filter\n");
                break;
            }
        }
    }

    return ret;
}

/**
 * Push an optional tile packet into the repack bitstream filter and try to
 * pull one packet out of it.
 *
 * @param repack_pkt receives the filter output on success
 * @param part_pkt   tile packet to send first, or NULL to only receive
 * @return 0 when repack_pkt was filled, AVERROR(EAGAIN) when the filter has
 *         no output yet, AVERROR_EOF at end of stream, or another negative
 *         code on failure
 */
static int tile_repack_bsf(AVPacket *repack_pkt, AVPacket *part_pkt)
{
    int ret;
    AVBSFContext *bsf_ctx;

    av_log(NULL, AV_LOG_DEBUG, "### %s line %d %s: \n", __FILE__, __LINE__, __func__);

    bsf_ctx = p_tile_codec_ctx->tile_repack_bsf_ctx;
    if (part_pkt) {
        ret = av_bsf_send_packet(bsf_ctx, part_pkt);
        if (ret < 0) {
            av_log(NULL, AV_LOG_ERROR, "failed to send packet to repack bsf\n");
            return ret;
        }
    }

    ret = av_bsf_receive_packet(bsf_ctx, repack_pkt);
    if (ret < 0) {
        // EAGAIN is the normal "no output yet" case and is not logged
        if (ret == AVERROR_EOF)
            av_log(NULL, AV_LOG_INFO, "av1 tile repack eof reach\n");
        else if (ret != AVERROR(EAGAIN))
            av_log(NULL, AV_LOG_ERROR, "failed to receive packet from repack bsf: %s\n", av_err2str(ret));
        return ret;
    }

    // dts/pts are int64_t: cast explicitly so the format matches on every ABI
    av_log(NULL, AV_LOG_DEBUG, "recv pkt from repack bsf: size %d,dts %lld,pts %lld\n",
           repack_pkt->size, (long long)repack_pkt->dts, (long long)repack_pkt->pts);

    return 0;
}

/**
 * Detach the first queued tile packet of an instance.
 *
 * The instance's packet list and counter are accessed with pkt_mtx held.
 *
 * @param inst codec instance whose packet list is consumed
 * @return the removed node (now owned by the caller), or NULL when the list
 *         is empty
 */
static struct tiled_packet_node *packet_fetch(struct codec_instance *inst)
{
    struct tiled_packet_node *node = NULL;

    av_log(NULL, AV_LOG_DEBUG, "### %s line %d %s: \n", __FILE__, __LINE__, __func__);

    pthread_mutex_lock(&inst->pkt_mtx);
    if (!list_empty(&inst->tiled_packet_list)) {
        node = list_first_entry(&inst->tiled_packet_list, struct tiled_packet_node, link);
        av_assert0(node != NULL);
        list_del(&node->link);
        inst->nb_tiled_packets--;
    }
    pthread_mutex_unlock(&inst->pkt_mtx);

    return node;
}

/**
 * Take at most one queued packet from every tile instance, tag it with its
 * tile index as AV_PKT_DATA_SLICE_ADDR side data and run it through the
 * repack bitstream filter.
 *
 * The return value of tile_repack_bsf() is not checked here; the caller
 * inspects out_pkts[i]->size to find the slots that actually hold output.
 *
 * @param out_pkts array of nb_tiles packets; out_pkts[i] receives whatever the
 *                 repack filter outputs after tile i was submitted
 * @return 0 when a packet from every tile was submitted, -1 otherwise
 */
static int repack_tile_packets(AVPacket **out_pkts)
{
    int i, ret, send_packet;
    int *slice_addr;
    struct codec_instance *inst;
    struct tiled_packet_node *tpn;

    av_log(NULL, AV_LOG_DEBUG, "### %s line %d %s: \n", __FILE__, __LINE__, __func__);

    send_packet = 0;
    for (i = 0; i < p_tile_codec_ctx->nb_tiles; i++) {
        inst = &p_tile_codec_ctx->instances[i];
        tpn = packet_fetch(inst);
        if (!tpn) {
            continue;
        }

        // Side data handed to av_packet_add_side_data() must come from the
        // av_malloc() family because the packet frees it with av_free().
        slice_addr = av_malloc(sizeof(*slice_addr));
        if (!slice_addr) {
            av_log(NULL, AV_LOG_ERROR, "Failed to allocate slice address for tile %d\n", i);
            av_packet_unref(&tpn->pkt);
            free(tpn);
            continue;
        }
        *slice_addr = i;

        ret = av_packet_add_side_data(&tpn->pkt, AV_PKT_DATA_SLICE_ADDR,
                                      (uint8_t *)slice_addr, sizeof(*slice_addr));
        if (ret < 0) {
            // Ownership was not transferred on failure, so release it here
            av_log(NULL, AV_LOG_ERROR, "Failed to add slice address to tile %d packet: %s\n",
                   i, av_err2str(ret));
            av_free(slice_addr);
            av_packet_unref(&tpn->pkt);
            free(tpn);
            continue;
        }

        tile_repack_bsf(out_pkts[i], &tpn->pkt);
        av_log(NULL, AV_LOG_INFO, "Repack encoded packet tile=%d size=%d\n", i, out_pkts[i]->size);
        av_packet_unref(&tpn->pkt);
        free(tpn);
        send_packet++;
    }

    // A complete output packet
    return send_packet == p_tile_codec_ctx->nb_tiles ? 0 : -1;
}

/**
 * Query how many devices of the given type the resource API reports.
 *
 * @param dev_type device type used to index the xcoder_cnt table
 * @return the reported device count, or -1 on allocation or query failure
 */
static int get_ni_devices_cnt(ni_device_type_t dev_type)
{
    int count = -1;
    // ni_device_t is large (~450KB), so keep it on the heap, zero-filled
    ni_device_t *dev_list = calloc(1, sizeof(ni_device_t));

    av_log(NULL, AV_LOG_DEBUG, "### %s line %d %s: \n", __FILE__, __LINE__, __func__);

    if (!dev_list) {
        av_log(NULL, AV_LOG_ERROR, "Error failed to malloc ni_device_t\n");
        return -1;
    }

    if (ni_rsrc_list_all_devices(dev_list) == NI_RETCODE_SUCCESS) {
        count = dev_list->xcoder_cnt[dev_type];
    } else {
        av_log(NULL, AV_LOG_ERROR, "Failed to get available xcoders.\n");
    }

    free(dev_list);
    return count;
}

/**
 * Smallest k such that (blksize << k) >= target.
 *
 * @param blksize base size that gets doubled
 * @param target  value to reach
 * @return the number of doublings needed (0 when blksize already >= target)
 */
static int av1_tile_log2(int blksize, int target)
{
    int shift = 0;

    while ((blksize << shift) < target) {
        shift++;
    }

    return shift;
}

static int check_min_row_and_adjust(int frame_width, int frame_height, int col, int row)
{
    int mi_cols, mi_rows, sb_cols, sb_rows;
    int sb_shift, sb_size, max_tile_width_sb, max_tile_height_sb, max_tile_area_sb;
    int min_log2_tile_cols, max_log2_tile_cols, max_log2_tile_rows, min_log2_tiles;
    int widest_tile_sb = 0, size_sb;
    int use_128x128_superblock = 0; //1;  // Assume that Quadra is generating 0
    int max_width, max_height;
    int width_in_sbs_minus_1, height_in_sbs_minus_1, height_in_sbs;
    int new_row = row;

    mi_cols = 2 * ((frame_width  + 7) >> 3);
    mi_rows = 2 * ((frame_height + 7) >> 3);

    av_log(NULL, AV_LOG_DEBUG, "mi_cols %d mi_rows %d\n", mi_cols, mi_rows);

    sb_cols = use_128x128_superblock ? ((mi_cols + 31) >> 5)
                                         : ((mi_cols + 15) >> 4);
    sb_rows = use_128x128_superblock ? ((mi_rows + 31) >> 5)
                                          : ((mi_rows + 15) >> 4);

    av_log(NULL, AV_LOG_DEBUG, "sb_cols %d sb_rows %d\n", sb_cols, sb_rows);

    sb_shift = use_128x128_superblock ? 5 : 4;
    sb_size  = sb_shift + 2;

    max_tile_width_sb = AV1_MAX_TILE_WIDTH >> sb_size;
    max_tile_area_sb  = AV1_MAX_TILE_AREA  >> (2 * sb_size);

    av_log(NULL, AV_LOG_DEBUG, "max_tile_width_sb %d max_tile_area_sb %d\n", max_tile_width_sb, max_tile_area_sb);

    min_log2_tile_cols = av1_tile_log2(max_tile_width_sb, sb_cols);
    max_log2_tile_cols = av1_tile_log2(1, FFMIN(sb_cols, AV1_MAX_TILE_COLS));
    max_log2_tile_rows = av1_tile_log2(1, FFMIN(sb_rows, AV1_MAX_TILE_ROWS));
    min_log2_tiles = FFMAX(min_log2_tile_cols,
                           av1_tile_log2(max_tile_area_sb, sb_rows * sb_cols));

    av_log(NULL, AV_LOG_DEBUG, "min_log2_tile_cols %d max_log2_tile_cols %d max_log2_tile_rows %d min_log2_tiles %d\n", min_log2_tile_cols, max_log2_tile_cols, max_log2_tile_rows, min_log2_tiles);

    width_in_sbs_minus_1 = (frame_width/col + 63) / 64 - 1;
    height_in_sbs_minus_1 = (frame_height/row + 63) / 64 - 1;

    av_log(NULL, AV_LOG_DEBUG, "width_in_sbs_minus_1 %d height_in_sbs_minus_1 %d\n", width_in_sbs_minus_1, height_in_sbs_minus_1);

    size_sb = width_in_sbs_minus_1 + 1;
    widest_tile_sb = FFMAX(size_sb, widest_tile_sb);

    if (min_log2_tiles > 0)
      max_tile_area_sb = (sb_rows * sb_cols) >> (min_log2_tiles + 1);
    else
      max_tile_area_sb = sb_rows * sb_cols;
    max_tile_height_sb = FFMAX(max_tile_area_sb / widest_tile_sb, 1);

    max_width = FFMIN(sb_cols, max_tile_width_sb);
    max_height = FFMIN(sb_rows, max_tile_height_sb);
    av_log(NULL, AV_LOG_DEBUG, "max_width %d max_height %d\n", max_width, max_height);

    height_in_sbs = height_in_sbs_minus_1 + 1;
    while(height_in_sbs > max_height)
    {
        av_log(NULL, AV_LOG_DEBUG, "height_in_sbs %d max_height %d\n", height_in_sbs, max_height);
        new_row++;
        height_in_sbs = (frame_height/new_row + 63) / 64;
        av_log(NULL, AV_LOG_INFO, "Adjusting number of row tile to %d\n", new_row);
    }

    return new_row;
}

/**
 * Program entry point.
 *
 * Parses the command line (input file and encoder name are mandatory), opens
 * the input, derives the tile layout, builds a "split + crop" filter graph
 * description with one output per tile, opens the filter graph, the output
 * file and the repack bitstream filter, creates one codec instance per tile
 * and the worker threads, then:
 *   1. reads and decodes the input until do_decode() reports a terminal
 *      error/EOF or the global finish flag is set;
 *   2. repacks the tile packets and writes them to the output until no
 *      worker is active and no tile packet is left;
 *   3. joins the workers and releases all resources.
 *
 * @return 0 on success, a non-zero error value otherwise
 */
int main(int argc, char **argv)
{
    int64_t t0, t1;
    const char *decoder_name = NULL;
    const char *encoder_name = NULL;
    const AVCodec *encoder;
    AVFrame *p_decoded_frame = NULL;
    AVPacket in_pkt, pkt_bak, **out_pkts = NULL;
    AVRational sar;
    int enc_cnt;
    int done;
    int logging = 0;
    int i, j;
    int ctu_size = 64;
    char *xcoder_params = NULL;
    int video_width = 0;
    int video_height = 0;
    int ctb_width;
    int ctb_height;
    int min_ctb_width;
    int min_ctb_height;
    int bitrate = 4000000;
    int fps = 0;
    const char *input_file = NULL;
    const char *output_file = NULL;
    char *n;
    int ret = 0;
    int nb_procs = 0;
    int nb_tiles = 1;
    int nb_threads = 0;
    int split_col = 1;
    int split_row = 1;
    char *graph_desc = NULL;
    int graph_desc_size;
    int mutex_inited = 0;
    int cond_inited = 0;
    int opt;
    int opt_index;
    // 'r' is listed so the short form of --fps is accepted as well
    const char *opt_string = "x:i:o:d:e:r:b:t:u:v:p:j:ygh";
    static struct option long_options[] = {
        {"xcoder_params", required_argument, NULL, 'x'},
        {"input_file",    required_argument, NULL, 'i'},
        {"output_file",   required_argument, NULL, 'o'},
        {"decoder",       required_argument, NULL, 'd'},
        {"encoder",       required_argument, NULL, 'e'},
        {"fps",           required_argument, NULL, 'r'},
        {"bitrate",       required_argument, NULL, 'b'},
        {"tiles",         required_argument, NULL, 't'},
        {"threads",       required_argument, NULL, 'j'},
        {"sync",          no_argument,       NULL, 'y'},
        {"log",           no_argument,       NULL, 'g'},
        {"help",          no_argument,       NULL, 'h'},
        {NULL,            0,                 NULL, 0},
    };

    while ((opt = getopt_long(argc, argv, opt_string, long_options, &opt_index)) != -1) {
        switch (opt) {
            case 'x':
                xcoder_params = optarg;
                break;
            case 'i':
                input_file = optarg;
                break;
            case 'o':
                output_file = optarg;
                break;
            case 'e':
                encoder_name = optarg;
                break;
            case 'd':
                decoder_name = optarg;
                break;
            case 'g':
                logging = 1;
                break;
            case 'b':
                bitrate = atoi(optarg);
                break;
            case 'r':
                fps = atoi(optarg);
                break;
            case 't':
                // Expected form: <columns>x<rows>
                split_col = strtoul(optarg, &n, 10);
                if (n[0] != 'x' || !isdigit((unsigned char)n[1])) {
                    usage();
                    return -1;
                }
                split_row = strtoul(n + 1, NULL, 10);
                // Both counts are used as divisors later on
                if (split_col < 1 || split_row < 1) {
                    usage();
                    return -1;
                }
                break;
            case 'j':
                nb_threads = atoi(optarg);
                break;
            case 'h':
                usage();
                return 0;
            default:
                usage();
                return -1;
        }
    }

    if (!input_file || !encoder_name) {
        usage();
        exit(1);
    }

    av_log(NULL, AV_LOG_INFO, "### Start encoding\n");

    enc_cnt = get_ni_devices_cnt(NI_DEVICE_TYPE_ENCODER);
    if (-1 == enc_cnt) {
        av_log(NULL, AV_LOG_ERROR, "Failed to get available encoders.\n");
        exit(1);
    }

    if (!output_file) {
        output_file = "tile_output.ivf";
    }

    if (!decoder_name) {
        decoder_name = "h264";
    }

    if (!xcoder_params) {
        // the crop filter does change the input frame linesize.
        // if the input frame resolution is large and the crop size is small, the linesize is much bigger then width.
        // we should not use zeroCopyMode when from big size to small size.
        xcoder_params = "gopPresetIdx=9:zeroCopyMode=-1";
    }

    av_log_set_level(AV_LOG_WARNING);

    p_tile_codec_ctx = av_mallocz(sizeof(struct tile_codec_context));
    if (p_tile_codec_ctx == NULL) {
        av_log(NULL, AV_LOG_ERROR, "Failed to allocate p_tile_codec_ctx.\n");
        ret = AVERROR(ENOMEM);
        goto end;
    }

    ret = input_file_open(input_file, decoder_name);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "Failed to open input file\n");
        ret = AVERROR(ENOMEM);
        goto end;
    }

    if (!fps) {
        fps = p_tile_codec_ctx->frame_rate;
    }

    // Check the minimum number of row tiles supported and adjust
    split_row = check_min_row_and_adjust(p_tile_codec_ctx->video_width, p_tile_codec_ctx->video_height, split_col, split_row);
    av_log(NULL, AV_LOG_INFO, "Number of row tile %d number of col tile %d \n", split_row, split_col);

    nb_tiles = split_row * split_col;
    if (!strncmp(encoder_name, "av1_ni", strlen("av1_ni")) &&
        nb_tiles > enc_cnt * NI_MAX_CONTEXTS_PER_HW_INSTANCE) {
        printf("The number of tiles shall not exceed max instances per encoder "
               "x number of encoders %dx%d\n", NI_MAX_CONTEXTS_PER_HW_INSTANCE,
               enc_cnt);
        exit(1);
    }

    if (SIG_ERR == signal(SIGINT, sigint_handler)) {
        av_log(NULL, AV_LOG_ERROR, "Failed to set signal.\n");
        ret = errno;
        goto end;
    }

    graph_desc_size = 64 * nb_tiles;
    graph_desc = malloc(graph_desc_size);
    if (!graph_desc) {
        av_log(NULL, AV_LOG_ERROR, "Failed to allocate graph_desc.\n");
        ret = AVERROR(ENOMEM);
        goto end;
    }
    memset(graph_desc, 0, graph_desc_size);

    av_log(NULL, AV_LOG_INFO, "### Start encoding\n");

    // "split=N[sin_0]...[sin_N-1]"; the per-tile crop chains are appended below
    snprintf(graph_desc, graph_desc_size, "split=%d", split_col * split_row);
    for (i = 0; i < split_col * split_row; i++) {
        snprintf(graph_desc + strlen(graph_desc),
                 graph_desc_size - strlen(graph_desc), "[sin_%d]", i);
    }

    // Never use more threads than tiles; default to min(#processors, #tiles)
    nb_procs = get_nprocs();
    if (nb_threads > 0) {
        if (nb_threads > nb_tiles) {
            av_log(NULL, AV_LOG_WARNING, "The number of threads %d cannot "
                   "exceed number of tiles %d. ", nb_threads, nb_tiles);
            nb_threads = nb_procs < nb_tiles ? nb_procs : nb_tiles;
            av_log(NULL, AV_LOG_WARNING, "Change the number of threads into "
                   "%d.\n ", nb_threads);
        }
    } else {
        nb_threads = nb_procs < nb_tiles ? nb_procs : nb_tiles;
    }

    p_tile_codec_ctx->total_threads = nb_threads;
    p_tile_codec_ctx->split_row = split_row;
    p_tile_codec_ctx->split_col = split_col;
    p_tile_codec_ctx->sar = (AVRational){0, 1};
    p_tile_codec_ctx->nb_tiles = nb_tiles;
    p_tile_codec_ctx->nb_encoders = enc_cnt;

    av_log(NULL, AV_LOG_INFO, "split column: %d.\n", split_col);
    av_log(NULL, AV_LOG_INFO, "split row: %d.\n", split_row);
    av_log(NULL, AV_LOG_INFO, "threads: %d.\n", p_tile_codec_ctx->total_threads);
    av_log(NULL, AV_LOG_INFO, "input file: %s.\n", input_file);
    av_log(NULL, AV_LOG_INFO, "output file: %s.\n", output_file);
    av_log(NULL, AV_LOG_INFO, "decoder name: %s.\n", decoder_name);
    av_log(NULL, AV_LOG_INFO, "encoder name: %s.\n", encoder_name);

    p_tile_codec_ctx->tile_info = av_mallocz(nb_tiles * sizeof(TileInfo));
    if (p_tile_codec_ctx->tile_info == NULL) {
        av_log(NULL, AV_LOG_ERROR, "Failed to allocate tile info.\n");
        ret = AVERROR(ENOMEM);
        goto end;
    }

    // The tile splitting method is derived from source code of TComPicSym.cpp
    // in HEVC Test Model open project: https://vcgit.hhi.fraunhofer.de/jvet/HM
    // as it usually can be regarded as the best practice of HEVC standard
    // specification. In xInitTiles function it shows how it works:
    //
    // if( m_pps.getTileUniformSpacingFlag() )
    // {
    //   //set width and height for each (uniform) tile
    //   for(Int row=0; row < numRows; row++)
    //   {
    //     for(Int col=0; col < numCols; col++)
    //     {
    //       const Int tileIdx = row * numCols + col;
    //       m_tileParameters[tileIdx].setTileWidthInCtus(  (col+1)*getFrameWidthInCtus( )/numCols - (col*getFrameWidthInCtus( ))/numCols );
    //       m_tileParameters[tileIdx].setTileHeightInCtus( (row+1)*getFrameHeightInCtus()/numRows - (row*getFrameHeightInCtus())/numRows );
    //     }
    //   }
    // }
    //
    // As we can see the tiles are not divided evenly because the size of a
    // tile should be in units of CTU/LTU. Say the size of CTU/LTU is 64x64 and
    // we have 1280 pixel width and we want to split into 8 columns in tile.
    // Then we will get 8 tiles with width of 2 3 2 3 2 3 2 3 in units of
    // CTU/LTU.
    //
    // Besides it also specifies the minimum size of each tile according to the
    // profile indicator. We can see that with MAIN or MAIN10 (or higher)
    // profile the width of tile should be at least 4 sizes of CTU/LTU. It can
    // guarantee integrity output pictures for tile encoding.
    //
    // // Tile size check
    // Int minWidth  = 1;
    // Int minHeight = 1;
    // const Int profileIdc = m_sps.getPTL()->getGeneralPTL()->getProfileIdc();
    // if (  profileIdc == Profile::MAIN || profileIdc == Profile::MAIN10)
    // {
    //   if (m_pps.getTilesEnabledFlag())
    //   {
    //     minHeight = 64  / m_sps.getMaxCUHeight();
    //     minWidth  = 256 / m_sps.getMaxCUWidth();
    //   }
    // }
    sar = p_tile_codec_ctx->sar;
    video_width = p_tile_codec_ctx->video_width;
    video_height = p_tile_codec_ctx->video_height;
    ctb_width = (video_width + ctu_size - 1) / ctu_size;
    ctb_height = (video_height + ctu_size - 1) / ctu_size;
    min_ctb_width = 256 / ctu_size;
    // NOTE(review): the HM excerpt above uses 64 for the minimum height while
    // this uses 256 - confirm that the stricter limit is intended.
    min_ctb_height = 256 / ctu_size;
    for (i = 0; i < split_row; i++) {
        for (j = 0; j < split_col; j++) {
            int index = i * split_col + j;
            TileInfo *ti = &p_tile_codec_ctx->tile_info[index];
            // Tile size in CTUs first, converted to pixels further down
            ti->w = (j + 1) * ctb_width / split_col - j * ctb_width / split_col;
            if (ti->w < min_ctb_width) {
                av_log(NULL, AV_LOG_ERROR, "%dx%d tiles spec for resolution of "
                       "%dx%d results in width of %d unit(s) of CTU which is "
                       "less than mininum allowed %d units of CTU. Please reduce"
                       " the column number of tiles in -t, --tiles option.\n",
                       split_col, split_row, video_width, video_height, ti->w, min_ctb_width);
                exit(-1);
            }

            ti->h = (i + 1) * ctb_height / split_row - i * ctb_height / split_row;
            if (ti->h < min_ctb_height) {
                av_log(NULL, AV_LOG_ERROR, "%dx%d tiles spec for resolution of "
                       "%dx%d results in height of %d unit of CTU which is less"
                       " than mininum allowed %d unit of CTU. Please reduce the"
                       " row number of tiles in -t, --tiles option.\n", split_col,
                       split_row, video_width, video_height, ti->h, min_ctb_height);
                exit(-1);
            }

            // The last column/row absorbs the partial CTU at the frame edge
            if (j == split_col - 1 && video_width % ctu_size) {
                ti->w = (ti->w - 1) * ctu_size + video_width % ctu_size;
            } else {
                ti->w *= ctu_size;
            }
            if (i == split_row - 1 && video_height % ctu_size) {
                ti->h = (ti->h - 1) * ctu_size + video_height % ctu_size;
            } else {
                ti->h *= ctu_size;
            }
            // Offsets accumulate from the left neighbour and the tile above
            ti->x = j == 0 ? 0 : ti[-1].x + ti[-1].w;
            ti->y = i == 0 ? 0 : ti[-split_col].y + ti[-split_col].h;
            snprintf(graph_desc + strlen(graph_desc),
                     graph_desc_size - strlen(graph_desc), ";[sin_%d]crop=%d:%d:%d:%d[sout_%d]",
                     index, ti->w, ti->h, ti->x, ti->y, index);
        }
    }

    av_log(NULL, AV_LOG_INFO, "filter desc: %s\n", graph_desc);

    ret = filter_graph_open(graph_desc, video_width, video_height, fps, sar, nb_tiles);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "Failed to open filter graph\n");
        ret = AVERROR(ENOMEM);
        goto end;
    }

    ret = output_file_open(output_file, video_width, video_height, fps);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "Failed to open output file\n");
        ret = AVERROR(ENOMEM);
        goto end;
    }

    ret = tile_repack_bsf_init(p_tile_codec_ctx->ofmt_ctx->streams[0], nb_tiles);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "Failed to initialize repack bsf\n");
        ret = AVERROR(ENOMEM);
        goto end;
    }

    p_decoded_frame = av_frame_alloc();
    if (!p_decoded_frame) {
        av_log(NULL, AV_LOG_ERROR, "Could not allocate video frame\n");
        ret = AVERROR(ENOMEM);
        goto end;
    }

    ret = pthread_mutex_init(&p_tile_codec_ctx->mutex, NULL);
    if (ret) {
        av_log(NULL, AV_LOG_ERROR, "Failed to allocate tile codec context mutex.\n");
        ret = AVERROR(ENOMEM);
        goto end;
    }
    mutex_inited = 1;

    ret = pthread_cond_init(&p_tile_codec_ctx->cond, NULL);
    if (ret) {
        av_log(NULL, AV_LOG_ERROR, "Failed to allocate tile codec context condvar.\n");
        ret = AVERROR(ENOMEM);
        goto end;
    }
    cond_inited = 1;

    encoder = avcodec_find_encoder_by_name(encoder_name);
    if (!encoder) {
        av_log(NULL, AV_LOG_ERROR, "Codec '%s' not found\n", encoder_name);
        ret = AVERROR(ENOMEM);
        goto end;
    }

    p_tile_codec_ctx->filter_finish = 0;

    p_tile_codec_ctx->instances = av_mallocz(nb_tiles * sizeof(struct codec_instance));
    if (p_tile_codec_ctx->instances == NULL) {
        av_log(NULL, AV_LOG_ERROR, "Failed to allocate encode instances.\n");
        ret = AVERROR(ENOMEM);
        goto end;
    }

    for (i = 0; i < nb_tiles; i++) {
        struct codec_instance *inst = &p_tile_codec_ctx->instances[i];
        inst->id = i;
        inst->encoder = encoder;
        inst->codec_params = xcoder_params;
        inst->tile_info = p_tile_codec_ctx->tile_info[i];
        INIT_LIST_HEAD(&inst->tiled_frame_list);
        INIT_LIST_HEAD(&inst->tiled_packet_list);

        ret = codec_instance_init(inst, bitrate, fps, sar, logging);
        if (ret) {
            av_log(NULL, AV_LOG_ERROR, "Failed init codec instance %d.\n", i);
            ret = AVUNERROR(ret);
            goto end;
        }
    }

    p_tile_codec_ctx->workers = av_mallocz(p_tile_codec_ctx->total_threads * sizeof(struct tile_thread_context));
    if (p_tile_codec_ctx->workers == NULL) {
        av_log(NULL, AV_LOG_ERROR, "Failed to allocate encode threads.\n");
        ret = AVERROR(ENOMEM);
        goto end;
    }

    for (i = 0; i < p_tile_codec_ctx->total_threads; i++) {
        struct tile_thread_context *worker = &p_tile_codec_ctx->workers[i];
        worker->nb_instances = 0;
        worker->codec_ctx = p_tile_codec_ctx;
        INIT_LIST_HEAD(&worker->inst_list);

        ret = tile_thread_context_init(worker);
        if (ret) {
            av_log(NULL, AV_LOG_ERROR, "Failed init tile thread context %d.\n", i);
            ret = AVUNERROR(ret);
            goto end;
        }

        ret = pthread_create(&worker->tid, NULL, encode_routine, worker);
        if (ret) {
            // pthread functions return the error number; errno is not set
            av_log(NULL, AV_LOG_ERROR, "Failed to create pthread for %d: %s.\n",
                   i, strerror(ret));
            ret = AVUNERROR(ret);
            goto end;
        }
    }

    // Wait for all instances running...
    encode_thread_sync();

    out_pkts = av_mallocz(nb_tiles * sizeof(*out_pkts));
    av_assert0(out_pkts);
    for (i = 0; i < nb_tiles; i++) {
        out_pkts[i] = av_packet_alloc();
        av_assert0(out_pkts[i]);
    }

    t0 = av_gettime_relative();
    while (!p_tile_codec_ctx->finish) {
        int have_pkt;

        t1 = av_gettime_relative();
        ret = av_read_frame(p_tile_codec_ctx->ifmt_ctx, &in_pkt);
        have_pkt = ret >= 0;
        if (have_pkt) {
            pkt_bak = in_pkt; // backup AVPacket structure
        }
        // A failed read (normally end of file) drains the decoder with a NULL
        // packet; in_pkt is only inspected when the read succeeded.
        if (!have_pkt || in_pkt.stream_index == p_tile_codec_ctx->video_stream_index) {
            ret = do_decode(p_tile_codec_ctx->dec_ctx, p_decoded_frame, have_pkt ? &in_pkt : NULL, t0);
            if (ret < 0 && ret != AVERROR(EAGAIN)) {
                if (have_pkt) {
                    av_packet_unref(&pkt_bak);
                }
                break;
            }
        }
        if (have_pkt) {
            av_packet_unref(&pkt_bak);
        }
        print_report(0, t0, t1);
    }

    // When there are output tile packets
    done = 0;
    while (!p_tile_codec_ctx->finish) {
        t1 = av_gettime_relative();
        pthread_mutex_lock(&p_tile_codec_ctx->mutex);
        if (p_tile_codec_ctx->active_workers == 0) {
            done = 1;
        }
        pthread_mutex_unlock(&p_tile_codec_ctx->mutex);

        if (tile_packet_available()) {
            if (repack_tile_packets(out_pkts) == 0) {
                for (i = 0; i < p_tile_codec_ctx->nb_tiles; i++) {
                    // Only slots that actually received repacked data are written
                    if(out_pkts[i]->size)
                    {
                        av_write_frame(p_tile_codec_ctx->ofmt_ctx, out_pkts[i]);
                    }
                    av_packet_unref(out_pkts[i]);
                }
            }
            print_report(0, t0, t1);
        } else if (done) {
            break;
        }
    }
    av_write_trailer(p_tile_codec_ctx->ofmt_ctx);

    print_report(1, t0, av_gettime_relative());
    av_log(NULL, AV_LOG_INFO, "Main thread is going to exit.\n");

    p_tile_codec_ctx->finish = 1;
    for (i = 0; i < p_tile_codec_ctx->total_threads; i++) {
        void *result = NULL;
        struct tile_thread_context *worker = &p_tile_codec_ctx->workers[i];
        av_log(NULL, AV_LOG_INFO, "pthread %ld ready to exit.\n", worker->tid);
        // Wake the worker so it can observe the finish flag
        pthread_mutex_lock(&worker->inst_mtx);
        pthread_cond_signal(&worker->inst_cond);
        pthread_mutex_unlock(&worker->inst_mtx);

        ret = pthread_join(worker->tid, &result);
        if (ret) {
            av_log(NULL, AV_LOG_ERROR, "Failed to join pthread for %ld: %s.\n",
                   worker->tid, strerror(ret));
        }

        if ((long) result != 0) {
            av_log(NULL, AV_LOG_ERROR, "pthread %ld result=0x%lx.\n",
                   worker->tid, (long)result);
            ret = (int)((long) result);
        }

        pthread_mutex_destroy(&worker->inst_mtx);
        pthread_cond_destroy(&worker->inst_cond);
    }

    for (i = 0; i < nb_tiles; i++) {
        codec_instance_cleanup(&p_tile_codec_ctx->instances[i]);
    }

end:
    // Every resource below may still be unset when an early failure jumped
    // here, so each release is guarded or NULL-safe.
    if (out_pkts) {
        for (i = 0; i < nb_tiles; i++) {
            av_packet_free(&out_pkts[i]);
        }
        av_freep(&out_pkts);
    }

    av_frame_free(&p_decoded_frame);
    if (p_tile_codec_ctx) {
        avcodec_free_context(&p_tile_codec_ctx->dec_ctx);
        avformat_close_input(&p_tile_codec_ctx->ifmt_ctx);
        avformat_close_input(&p_tile_codec_ctx->ofmt_ctx); // avio_close()
        if (mutex_inited) {
            pthread_mutex_destroy(&p_tile_codec_ctx->mutex);
        }
        if (cond_inited) {
            pthread_cond_destroy(&p_tile_codec_ctx->cond);
        }
        avfilter_graph_free(&p_tile_codec_ctx->filter_graph);
        av_bsf_free(&p_tile_codec_ctx->tile_repack_bsf_ctx);
        av_freep(&p_tile_codec_ctx->tile_info);
        free(p_tile_codec_ctx->bufsink_ctxes);
        // These were obtained from av_mallocz() and must go back through av_free()
        av_freep(&p_tile_codec_ctx->instances);
        av_freep(&p_tile_codec_ctx->workers);
        av_freep(&p_tile_codec_ctx);
    }
    free(graph_desc);

    av_log(NULL, AV_LOG_INFO, "EXIT.. ret=0x%x.\n", ret);
    if (ret == 0) {
        fprintf(stderr, "%s finished.\n", output_file);
    }

    return ret;
}
