/*******************************************************************************
 *
 * Copyright (C) 2022 NETINT Technologies
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 ******************************************************************************/

/*!*****************************************************************************
 *  \file   ni_xstack_application.c
 *
 *  \brief  Application for performing xstack processing with libxcoder API.
 *          Its code provides examples of how to programmatically use the
 *          libxcoder API.
 ******************************************************************************/

#ifdef _WIN32
#include "ni_getopt.h"
#include <io.h>
#elif __linux__ || __APPLE__
#define _POSIX_C_SOURCE 200809L
#include <getopt.h>
#include <signal.h>
#include <unistd.h>
#endif

#include "ni_test_utils.h"
#include <fcntl.h>

// Application modes. recycle_stack_input_hwframe() treats the D2* modes as
// "inputs come from decoder workers" and the U2* modes as "inputs come from
// upload workers".
// NOTE(review): the E / L suffix presumably selects the output sink - confirm.
#define XSTACK_APP_D2E 0
#define XSTACK_APP_D2L 1
#define XSTACK_APP_U2E 2
#define XSTACK_APP_U2L 3

// length of the per-slot bookkeeping arrays in common_t
#define NI_MAX_HW_FRAME 4
// capacity of the frames[] array in ni_buffered_frame_list_t
#define NI_MAX_BUFFERED_FRAME 45
// upper bound applied to scaler_params.nb_inputs in init_stack_worker()
#define MAX_SINGLE_INPUTS 8
#define NI_MAX_XSTACK_NUM 16
// capacity of the upload/decoder worker tables and of common_t.input_eos
#define NI_MAX_XSTACK_INPUTS 50
// capacity of the stack worker table and of the per-stack encoder/output arrays
#define NI_MAX_XSTACK_OUTPUTS 32
#define NI_XSTACK_RECONFIG_FILE_NAME "reconf.xstack"
// resolution limits enforced by check_resolution()
#define MAX_WIDTH 7680
#define MAX_HEIGHT 4800
#define MIN_WIDTH 128
#define MIN_HEIGHT 96

// one output of a stack: its name and the codec format used to encode it
// NOTE(review): output_name is presumably the output file path - confirm
typedef struct _output_info {
    char *output_name;
    ni_codec_format_t encoder_type;
} output_info;

// output-side description of one stack: output resolution and a list of
// outputs (at most NI_MAX_XSTACK_OUTPUTS)
typedef struct _stack_info {
    int outputs_num; // presumably the number of valid output_info[] entries
    int out_width;
    int out_height;
    output_info output_info[NI_MAX_XSTACK_OUTPUTS];
} stack_info_t;

// resolution description of an upload input
// NOTE(review): specific_res presumably flags that width/height were given
// explicitly for this input - confirm against the option parser
typedef struct _upload_info {
    int width;
    int height;
    bool specific_res;
} upload_info;

// placement of one input inside the stacked output; filled in by
// ni_stack_params_set_value(), which rounds every field up to an even value
typedef struct _ni_stack_item {
    int x, y; // position, parsed from the "layout" description
    int w;    // width, parsed from the "size" description
    int h;    // height, parsed from the "size" description
} ni_stack_item_t;

// simplistic ref counted HW frame
typedef struct _ni_ref_frame {
    ni_session_data_io_t data; // frame data; p_data[3] carries the HW surface
    int ref_cnt;               // dropped by ni_unref_frame(); the HW buffer is
                               // recycled when it reaches 0
} ni_ref_frame_t;

// ring buffer of ref counted frames. head == tail means empty and
// (tail + 1) % size == head means full, so at most size - 1 entries are held.
typedef struct _ni_buffered_frame_list {
    ni_ref_frame_t frames[NI_MAX_BUFFERED_FRAME];
    int head; // advanced by frame_list_dequeue()
    int tail; // advanced by frame_list_enqueue()
    int size; // modulus used for index wrap-around; must be non-zero
} ni_buffered_frame_list_t;

// state shared between all worker threads; allocated by alloc_common()
typedef struct _common_t {
    ni_pthread_mutex_t lock; // taken around updates of the ready_* arrays
    ni_pthread_cond_t cond;
    uint32_t stack_frame_num;
    uint32_t decoded_frame_num;
    uint32_t uploaded_frame_num;
    uint32_t max_exited_input_frame_num;
    int shortest;
    int total_dec_threads;
    int total_enc_threads;
    int total_upl_threads;
    int total_xstack_threads;

    int exit_dec_num;
    int exit_enc_num;
    int exit_upl_num;
    int exit_stack_num;
    // per-slot bookkeeping; reset by recycle_stack_input_hwframe()
    int ready_upl_worker_num[NI_MAX_HW_FRAME];
    int ready_dec_worker_num[NI_MAX_HW_FRAME];
    int ready_frames[NI_MAX_HW_FRAME];      // -1 after init and after recycle
    int input_eos[NI_MAX_XSTACK_INPUTS];    // -1 after init
} common_t;

// a buffered frame list together with its synchronisation objects
typedef struct _ni_frame_entry {
    ni_pthread_mutex_t lock; // held while frame_list is inspected or modified
    ni_pthread_cond_t cond;  // signalled when a full frame_list gets a free slot
    ni_buffered_frame_list_t frame_list;
} ni_frame_entry_t;

// decoding task description: one decoder input with its send/receive threads
typedef struct _decoder_worker {
    ni_pthread_t send_tid;
    ni_pthread_t recv_tid;
    int index;
    int pfs; // presumably the input file descriptor - confirm
    ni_codec_format_t codec_type;
    char filename[256];
    char decoder_params[256];
    ni_file_reader_t file_reader;
    void *p_stream_info;
    uint8_t *stream_buf;
    int bit_depth;
    int width;
    int height;
    int fps_num;
    int fps_den;
    ni_session_context_t *p_dec_ctx;
    uint64_t recv_frame_num;

    int should_exit;
    // have to stop now, e.g. participant removal
    int force_exit;
    common_t *common;
    // buffered decoded frames; released by recycle_stack_input_hwframe()
    ni_frame_entry_t *frame_entry;
} decoder_worker;

// hw upload task description: one raw-video input uploaded to the device
typedef struct _upload_worker {
    ni_pthread_t tid;
    int index;
    int pfs; // input file descriptor (used with lseek / read_one_swframe)
    ni_codec_format_t codec_type;
    ni_file_reader_t file_reader;
    char filename[256];
    uint8_t bit_depth;
    uint16_t in_width;
    uint16_t in_height;
    int pixel_format;
    ni_session_context_t *p_upl_ctx;
    void *yuv_buf; // buffer that holds one software frame read from the file
    uint64_t upl_frame_num;
    uint64_t total_frame_num; // see read_swframe_from_file()

    int should_exit;
    // have to stop now, e.g. participant removal
    int force_exit;
    common_t *common;
    // buffered uploaded frames; released by recycle_stack_input_hwframe()
    ni_frame_entry_t *frame_entry;
} upload_worker;

// encoding task description: one encoder attached to a stack worker
typedef struct _encoder_worker {
    int index;
    ni_pthread_t send_tid;
    ni_pthread_t recv_tid;
    ni_pthread_mutex_t lock;
    ni_pthread_cond_t start_cond;
    ni_pthread_cond_t opened_cond;
    void *stack_worker; // owning stack_worker_t; cast back in encoder_receive()
    ni_session_context_t *p_enc_ctx;
    ni_codec_format_t codec_format;
    int pixel_format;
    int input_width;
    int input_height;
    int fps_num;
    int fps_den;
    FILE *out_fp;
    char encoder_params[256];

    bool started;
    int got_sos;
    int should_exit;
    uint32_t enc_frame_num; // bumped by encoder_receive() per recycled packet
    ni_frame_entry_t *frame_entry;
    common_t *common;
} encoder_worker;

// stacking task description: one scaler session that combines the inputs
typedef struct _stack_worker {
    int index;
    int mode; // presumably one of the XSTACK_APP_* modes - confirm
    int head;
    ni_pthread_t tid;
    ni_pthread_mutex_t lock; // held by encoder_receive() around g_hwframe_pool
    uint32_t nb_inputs;
    uint32_t nb_encoders;
    uint16_t out_width;
    uint16_t out_height;
    int pixel_format;
    uint32_t stack_frame_num;
    ni_session_context_t *p_stack_ctx; // allocated by init_stack_worker()
    ni_stack_item_t *stack_items;
    common_t *common;
    int fillcolor;
    int should_exit;
    int fps_num;
    int fps_den;
    FILE *out_fp;
    encoder_worker *encoder_workers[NI_MAX_XSTACK_OUTPUTS];
} stack_worker_t;

// description of one input source: format, resolution, decoder and file
typedef struct _ni_src_desc {
    int src_fmt;
    int src_width;
    int src_height;
    char decoder_name[32]; // presumably a name accepted by get_decoder_type()
    char file_name[256];
    char decoder_params[256];
} ni_src_desc_t;

// ref counted HW frames indexed by frame index; encoder_receive() drops a
// reference using the recycle index reported by the encoder
static ni_hwframe_ref_t g_hwframe_pool[NI_MAX_DR_HWDESC_FRAME_INDEX] = {0};
// worker tables; the first active_* entries are walked by
// recycle_stack_input_hwframe()
upload_worker *upload_workers[NI_MAX_XSTACK_INPUTS] = {0};
decoder_worker *decoder_workers[NI_MAX_XSTACK_INPUTS] = {0};
stack_worker_t *stack_workers[NI_MAX_XSTACK_OUTPUTS] = {0};
uint32_t active_upload_workers = 0;
uint32_t active_decoder_workers = 0;
uint32_t active_encoder_workers = 0;
uint8_t stack_num = 0;
static volatile int need_reconfig = 0;
// pixel format copied into every stack worker by init_stack_worker()
static int pix_fmt = NI_PIX_FMT_YUV420P;
// device id copied into the stack session context (hw_id)
static int devid = 0;
int print_stat = 1;
int global_stop = 0;
int global_state = EXIT_SUCCESS;

/* Reports whether the ring buffer holds no frames, i.e. whether the head and
 * tail indices coincide. */
static inline bool frame_list_is_empty(ni_buffered_frame_list_t *list) {
    const bool nothing_queued = (list->tail == list->head);

    return nothing_queued;
}

/* Reports whether the ring buffer cannot take another frame: advancing the
 * tail by one slot (with wrap-around) would land on the head. */
static inline bool frame_list_is_full(ni_buffered_frame_list_t *list) {
    const int tail_after_push = (list->tail + 1) % (list->size);

    return tail_after_push == list->head;
}

/* Claims the slot at the tail of the ring buffer by advancing the tail index.
 * Returns 0 on success, or -1 (after logging) when the list is already full. */
static inline int frame_list_enqueue(ni_buffered_frame_list_t *list) {
    if (!frame_list_is_full(list)) {
        list->tail = (list->tail + 1) % (list->size);
        return 0;
    }

    ni_log(NI_LOG_ERROR, "ERROR: frame_list is full\n");
    return -1;
}

/* Releases the slot at the head of the ring buffer by advancing the head
 * index. Returns 0 on success, or -1 (after logging) when the list is empty. */
static inline int frame_list_dequeue(ni_buffered_frame_list_t *list) {
    if (!frame_list_is_empty(list)) {
        list->head = (list->head + 1) % (list->size);
        return 0;
    }

    ni_log(NI_LOG_ERROR, "ERROR: frame_list is empty\n");
    return -1;
}

/* Returns the number of frames currently queued, i.e. the distance from head
 * to tail taking wrap-around into account. */
static inline int frame_list_length(ni_buffered_frame_list_t *list) {
    const int unwrapped = list->tail - list->head + list->size;

    return unwrapped % (list->size);
}

/* Drops one reference from a ref counted HW frame.
 *
 * When the last reference goes away, the HW surface stored in p_data[3] is
 * handed back via ni_hwframe_buffer_recycle2(), provided the surface exists
 * and has a non-zero frame index.
 *
 * Returns 1 when the count reached zero (the caller must dequeue the frame),
 * 0 when references remain, and -1 on a NULL argument or when the count was
 * already zero or negative. */
static inline int ni_unref_frame(ni_ref_frame_t *frame_ref) {
    niFrameSurface1_t *hw_surface;

    if (frame_ref == NULL) {
        ni_log(NI_LOG_ERROR, "ERROR: frame_ref is NULL\n");
        return -1;
    }

    ni_log(NI_LOG_DEBUG, "unref, ref_cnt %d\n", frame_ref->ref_cnt);

    if (frame_ref->ref_cnt <= 0) {
        ni_log(NI_LOG_ERROR, "ERROR: frame_ref cnt is 0\n");
        return -1;
    }

    frame_ref->ref_cnt--;
    if (frame_ref->ref_cnt != 0) {
        return 0;
    }

    // last reference is gone: recycle hw buffer
    hw_surface = (niFrameSurface1_t *)(frame_ref->data.data.frame.p_data[3]);
    if (hw_surface != NULL && hw_surface->ui16FrameIdx != 0) {
        ni_hwframe_buffer_recycle2(hw_surface);
    }

    return 1; // need to dequeue
}

// Validate an output resolution: both dimensions must be even and lie within
// [MIN_WIDTH, MAX_WIDTH] x [MIN_HEIGHT, MAX_HEIGHT].
// Returns 0 when the resolution is acceptable, -1 otherwise.
static inline int check_resolution(int width, int height) {
    if ((width % 2) || (height % 2)) {
        return -1;
    }

    if (width >= MIN_WIDTH && width <= MAX_WIDTH && height >= MIN_HEIGHT &&
        height <= MAX_HEIGHT) {
        return 0;
    }

    return -1;
}

/* Maps a decoder name given on the command line to its codec format.
 * Returns the NI_CODEC_FORMAT_* value, or NI_RETCODE_FAILURE (after logging)
 * when the name is not one of the supported decoders. */
static inline int get_decoder_type(const char *decoder_name) {
    const struct {
        const char *name;
        int codec;
    } known_decoders[] = {
        {"h264_ni_quadra_dec", NI_CODEC_FORMAT_H264},
        {"h265_ni_quadra_dec", NI_CODEC_FORMAT_H265},
        {"vp9_ni_quadra_dec", NI_CODEC_FORMAT_VP9},
    };
    const size_t known_cnt = sizeof(known_decoders) / sizeof(known_decoders[0]);

    for (size_t k = 0; k < known_cnt; k++) {
        if (strcmp(known_decoders[k].name, decoder_name) == 0) {
            return known_decoders[k].codec;
        }
    }

    ni_log(NI_LOG_ERROR, "Error: invalid decoder %s specified\n",
           decoder_name);
    return NI_RETCODE_FAILURE;
}

/* Maps an encoder name given on the command line to its codec format.
 * Returns the NI_CODEC_FORMAT_* value, or NI_RETCODE_FAILURE (after logging)
 * for the not-yet-supported av1 encoder and for any unknown name. */
static inline int get_encoder_type(const char *encoder_name) {
    if (strcmp("h264_ni_quadra_enc", encoder_name) == 0) {
        return NI_CODEC_FORMAT_H264;
    }

    if (strcmp("h265_ni_quadra_enc", encoder_name) == 0) {
        return NI_CODEC_FORMAT_H265;
    }

    if (strcmp("av1_ni_quadra_enc", encoder_name) == 0) {
        ni_log(NI_LOG_ERROR, "av1 encoder is not supported yet\n");
        return NI_RETCODE_FAILURE;
    }

    ni_log(NI_LOG_ERROR, "Error: invalid encoder %s specified\n",
           encoder_name);
    return NI_RETCODE_FAILURE;
}

/* Reads one software frame from the uploader's input file into yuv_buf.
 *
 * upl_worker: uploader whose file descriptor, file reader state, resolution
 *             and pixel format are used.
 * eos:        set to 1 when no more data could be read, 0 when a frame was
 *             read; left untouched on error.
 *
 * Returns the number of bytes read (> 0), 0 on end of stream, or
 * NI_RETCODE_FAILURE when the read failed. */
static int read_swframe_from_file(upload_worker *upl_worker, int *eos) {
    int read_size, frame_size;

    frame_size = calc_swframe_size(upl_worker->in_width, upl_worker->in_height,
                                   upl_worker->pixel_format);

    // NOTE(review): this latches upl_frame_num the first time it is non-zero
    // while looping is enabled, which does not obviously yield the frame
    // count of one full pass over the input - confirm against
    // read_one_swframe() and the code that increments upl_frame_num.
    if (upl_worker->file_reader.loop > 1 && upl_worker->total_frame_num == 0) {
        // run once to know total frame count of input
        upl_worker->total_frame_num = upl_worker->upl_frame_num;
    }

    read_size = read_one_swframe(upl_worker->pfs, &(upl_worker->file_reader),
                                 upl_worker->yuv_buf, frame_size);
    if (read_size < 0) {
        // terminate the message with a newline so that it does not run into
        // the next log line
        ni_log(NI_LOG_ERROR, "Error: could not read file!\n");
        return NI_RETCODE_FAILURE;
    }

    if (read_size == 0) {
        *eos = 1;
        ni_log(NI_LOG_DEBUG, "%s: read chunk size 0, eos!\n", __func__);
        return 0;
    }

    *eos = 0;
    return read_size;
}

/* Parses the xstack "size" and "layout" descriptions into stack_items.
 *
 * Both descriptions hold one entry per input separated by '|'. Each entry
 * has two '_'-separated parts (horizontal, then vertical) and each part is a
 * '+'-separated sum of terms:
 *   - size terms are non-negative integers;
 *   - layout terms are non-negative integers, "wN" (width of input N) or
 *     "hN" (height of input N), where N must be another valid input index.
 * Sizes are parsed first so that layout entries can refer to them. Every
 * resulting field is rounded up to an even value with NIALIGN.
 *
 * stack_items: output array with room for at least nb_inputs entries.
 * size_desc / layout_desc: the descriptions. NOTE: despite the const
 *     qualifier both strings are tokenised in place by ni_strtok(), so they
 *     must point to writable memory and are clobbered by this call.
 * nb_inputs: number of entries to parse; nothing is parsed when it is <= 0.
 *
 * Returns NI_RETCODE_SUCCESS, or NI_RETCODE_FAILURE on a NULL argument or a
 * malformed description (stack_items may then be partially updated). */
static int ni_stack_params_set_value(ni_stack_item_t stack_items[],
                                     const char *size_desc, const char *layout_desc, int nb_inputs) {
    char *arg, *p, *saveptr = NULL;
    char *arg2, *p2, *saveptr2 = NULL;
    char *arg3, *p3, *saveptr3 = NULL;
    int inw, inh, size;

    if (!stack_items || !size_desc || !layout_desc) {
        ni_log(NI_LOG_ERROR, "Error: %s() Null pointer parameters passed\n", __func__);
        return NI_RETCODE_FAILURE;
    }

    // Parse size parameter
    p = (char *)size_desc;
    for (int i = 0; i < nb_inputs; i++) {
        ni_stack_item_t *item = &(stack_items[i]);

        if (!(arg = ni_strtok(p, "|", &saveptr))) {
            return NI_RETCODE_FAILURE;
        }

        p = NULL;

        p2 = arg;
        inw = inh = 0;
        // j == 0: width part, j == 1: height part
        for (int j = 0; j < 2; j++) {
            if (!(arg2 = ni_strtok(p2, "_", &saveptr2))) {
                return NI_RETCODE_FAILURE;
            }

            p2 = NULL;
            p3 = arg2;
            while ((arg3 = ni_strtok(p3, "+", &saveptr3))) {
                p3 = NULL;
                if (sscanf(arg3, "%d", &size) != 1 || size < 0) {
                    return NI_RETCODE_FAILURE;
                }

                if (!j) {
                    inw += size;
                } else {
                    inh += size;
                }
            }
        }

        item->w = NIALIGN(inw, 2);
        item->h = NIALIGN(inh, 2);
    }

    // Parse layout parameter after size to support w0/h0 syntax
    p = (char *)layout_desc;
    saveptr = NULL;
    saveptr2 = NULL;
    saveptr3 = NULL;
    for (int i = 0; i < nb_inputs; i++) {
        ni_stack_item_t *item = &(stack_items[i]);

        if (!(arg = ni_strtok(p, "|", &saveptr))) {
            return NI_RETCODE_FAILURE;
        }

        p = NULL;

        p2 = arg;
        inw = inh = 0;

        // j == 0: x offset part, j == 1: y offset part
        for (int j = 0; j < 2; j++) {
            if (!(arg2 = ni_strtok(p2, "_", &saveptr2))) {
                return NI_RETCODE_FAILURE;
            }

            p2 = NULL;
            p3 = arg2;
            while ((arg3 = ni_strtok(p3, "+", &saveptr3))) {
                int term;

                p3 = NULL;
                if (sscanf(arg3, "w%d", &size) == 1) {
                    // width of another input; an input may not refer to itself
                    if (size == i || size < 0 || size >= nb_inputs) {
                        return NI_RETCODE_FAILURE;
                    }
                    term = stack_items[size].w;
                } else if (sscanf(arg3, "h%d", &size) == 1) {
                    // height of another input
                    if (size == i || size < 0 || size >= nb_inputs) {
                        return NI_RETCODE_FAILURE;
                    }
                    term = stack_items[size].h;
                } else if (sscanf(arg3, "%d", &size) == 1) {
                    if (size < 0) {
                        return NI_RETCODE_FAILURE;
                    }
                    term = size;
                } else {
                    return NI_RETCODE_FAILURE;
                }

                // the term counts towards whichever axis is being parsed
                if (!j) {
                    inw += term;
                } else {
                    inh += term;
                }
            }
        }

        item->x = NIALIGN(inw, 2);
        item->y = NIALIGN(inh, 2);
    }

    return NI_RETCODE_SUCCESS;
}

/* Parses an xstack filter parameter string of the form
 * "inputs=N:size=...:layout=..." and fills stack_items from it.
 *
 * filter_params: writable, NUL-terminated parameter string. It is modified in
 *                place: every ':' separator is overwritten with '\0'.
 * stack_items:   output array, passed on to ni_stack_params_set_value();
 *                NOTE(review): it must have room for the parsed "inputs"
 *                count - the capacity is not known here, confirm at callers.
 *
 * Returns the number of inputs on success, or a failure code when a
 * key/value pair cannot be retrieved, a key is unknown, or the size/layout
 * descriptions are malformed. */
int retrieve_filter_params(char filter_params[],
                           ni_stack_item_t stack_items[]) {
    char key[64], value[2048], size[2048], layout[2048];
    char *curr = filter_params, *colon_pos;
    int ret = 0;
    int nb_inputs = 0;

    if (!filter_params) {
        ni_log(NI_LOG_ERROR, "Error: %s() Null pointer parameters passed\n", __func__);
        return NI_RETCODE_FAILURE;
    }

    // key is printed in the error path below even if the loop never ran
    key[0] = '\0';
    size[0] = '\0';
    layout[0] = '\0';

    while (*curr) {
        colon_pos = strchr(curr, ':');

        if (colon_pos) {
            *colon_pos = '\0';
        }

        if (strlen(curr) > sizeof(key) + sizeof(value) - 1 ||
            ni_param_get_key_value(curr, key, value)) {
            fprintf(stderr,
                    "Error: scaler-params key/value not "
                    "retrieved: %s\n",
                    curr);
            // return right away: falling through to the size/layout parsing
            // would overwrite this error with its own result
            return NI_RETCODE_FAILURE;
        }

        if (!strcmp("inputs", key)) {
            nb_inputs = atoi(value);
        } else if (!strcmp("size", key)) {
            snprintf(size, sizeof(size), "%s", value);
        } else if (!strcmp("layout", key)) {
            snprintf(layout, sizeof(layout), "%s", value);
        } else {
            ni_log(NI_LOG_ERROR, "Error: Unrecognized filter parameter name %s\n", key);
            return NI_RETCODE_FAILURE;
        }

        if (colon_pos) {
            curr = colon_pos + 1;
        } else {
            curr += strlen(curr);
        }
    }

    ret = ni_stack_params_set_value(stack_items, size, layout, nb_inputs);
    if (ret) {
        ni_log(NI_LOG_ERROR, "Error: failed to parse xstack parameters %s\n", key);
    }

    return ret ? ret : nb_inputs;
}

/* Reads one software frame from the uploader's input file and uploads it to
 * the device, obtaining a HW frame descriptor in p_hw_data.
 *
 * upl_worker: uploader state (file, buffer, session context, counters).
 * p_sw_data:  scratch software frame passed to upload_send_data_get_desc2().
 * p_hw_data:  receives the HW frame descriptor.
 * eos:        set to 1 when the input file is exhausted (a NULL buffer is
 *             then handed to the upload call), 0 otherwise.
 *
 * When the device reports NI_RETCODE_NVME_SC_WRITE_BUFFER_FULL the frame that
 * was just read is "un-read" (file offset and data_left_size are restored) so
 * that the caller can retry with the same frame.
 *
 * Returns the result of upload_send_data_get_desc2(), or NI_RETCODE_FAILURE
 * when reading the file or restoring the file position fails. */
int hwupload_frame(upload_worker *upl_worker, ni_session_data_io_t *p_sw_data,
                   ni_session_data_io_t *p_hw_data, int *eos) {
    int ret, read_size;
    ni_session_context_t *p_upl_ctx = upl_worker->p_upl_ctx;

    read_size = read_swframe_from_file(upl_worker, eos);
    if (read_size < 0) {
        ni_log(NI_LOG_ERROR, "Error: read yuv file error\n");
        return NI_RETCODE_FAILURE;
    } // else read_size == one swframe size or read_size == 0(eof)

    // we're uploading rawdata for hw scaling
    // so use upload_send_data_get_desc2
    ret = upload_send_data_get_desc2(p_upl_ctx, p_sw_data, p_hw_data,
                                    upl_worker->in_width, upl_worker->in_height,
                                    *eos ? NULL : upl_worker->yuv_buf);
    if (p_upl_ctx->status == NI_RETCODE_NVME_SC_WRITE_BUFFER_FULL) {
        ni_log(NI_LOG_DEBUG, "No space to write to, try to read a packet\n");
        // Only rewind when a frame was actually consumed. On EOF read_size is
        // 0, nothing was read, and seeking to "0 * frame index" would move the
        // file position back to the very beginning of the input.
        if (read_size > 0) {
            uint64_t frame_idx = upl_worker->upl_frame_num;
            uint64_t offset;

            // file was read so reset read pointer and try again
            upl_worker->file_reader.data_left_size += read_size;
            if (upl_worker->total_frame_num &&
                (upl_worker->upl_frame_num > upl_worker->total_frame_num)) {
                // input is being looped: position inside the current pass
                frame_idx %= upl_worker->total_frame_num;
            }
            offset = (uint64_t)read_size * frame_idx;
            if (lseek(upl_worker->pfs, offset, SEEK_SET) < 0) {
                ni_log(NI_LOG_ERROR,
                       "Error: failed to restore input file position\n");
                return NI_RETCODE_FAILURE;
            }
        }
    } else if (ret) {
        ni_log(NI_LOG_ERROR, "Error: upload frame error\n");
    }

    return ret;
}

/*!*****************************************************************************
 *  \brief  Encoder session open: fills in the encoder session context and the
 *          encoder parameters from the arguments, then opens an encoder
 *          session on the device.
 *
 *  \param[in,out] p_enc_ctx     encoder session context to configure and open
 *  \param[in] dst_codec_format  codec format stored in the session context
 *  \param[in] iXcoderGUID       device id stored in the context as hw_id
 *  \param[in,out] p_enc_params  encoder parameters; attached to the context
 *                               and updated (source size, conformance window,
 *                               VUI colour settings, planar format)
 *  \param[in] width, height     input resolution; recorded as the original
 *                               resolution. The source size given to the
 *                               encoder is raised to NI_MIN_WIDTH /
 *                               NI_MIN_HEIGHT or rounded up to even.
 *  \param[in] color_primaries, color_trc, color_space
 *                               colour description copied into the encoder
 *                               config; colorDescPresent is set when any of
 *                               them is not "unspecified"
 *  \param[in] video_full_range_flag  must be >= 0 (expected to be 0 or 1)
 *  \param[in] sar_num, sar_den  sample aspect ratio
 *  \param[in] pix_fmt           input pixel format; formats other than the
 *                               four handled below fall back to 8-bit
 *                               NI_PIX_FMT_YUV420P
 *  \param[in] check_zerocopy    when true, run the zero copy compatibility
 *                               check before opening the session
 *
 *  \return -1 if video_full_range_flag is negative, otherwise the return value
 *          of ni_device_session_open() (NI_RETCODE_SUCCESS on success)
 ******************************************************************************/
int encoder_open_session(ni_session_context_t *p_enc_ctx, int dst_codec_format,
                         int iXcoderGUID, ni_xcoder_params_t *p_enc_params,
                         int width, int height,
                         ni_color_primaries_t color_primaries,
                         ni_color_transfer_characteristic_t color_trc,
                         ni_color_space_t color_space,
                         int video_full_range_flag, int sar_num, int sar_den,
                         ni_pix_fmt_t pix_fmt, bool check_zerocopy) {
    int ret = 0;

    if (video_full_range_flag < 0) {
        ni_log(NI_LOG_ERROR,
               "ERROR %s: The video full range flag is %d should "
               "be indicated excplicitly as 0 or 1!\n",
               __func__, video_full_range_flag);
        return -1;
    }

    p_enc_ctx->p_session_config = p_enc_params;
    p_enc_ctx->session_id = NI_INVALID_SESSION_ID;
    p_enc_ctx->codec_format = dst_codec_format;

    // assign the card GUID in the encoder context and let session open
    // take care of the rest
    p_enc_ctx->device_handle = NI_INVALID_DEVICE_HANDLE;
    p_enc_ctx->blk_io_handle = NI_INVALID_DEVICE_HANDLE;
    p_enc_ctx->hw_id = iXcoderGUID;

    // default: little endian
    p_enc_ctx->src_endian = NI_FRAME_LITTLE_ENDIAN;

    // derive bit depth (and bytes per sample) from the pixel format
    switch (pix_fmt) {
    case NI_PIX_FMT_YUV420P:
    case NI_PIX_FMT_NV12:
        p_enc_ctx->src_bit_depth = 8;
        p_enc_ctx->bit_depth_factor = 1;
        break;
    case NI_PIX_FMT_YUV420P10LE:
    case NI_PIX_FMT_P010LE:
        p_enc_ctx->src_bit_depth = 10;
        p_enc_ctx->bit_depth_factor = 2;
        break;
    default:
        // unknown format: silently fall back to 8-bit planar 4:2:0
        p_enc_ctx->src_bit_depth = 8;
        p_enc_ctx->bit_depth_factor = 1;
        pix_fmt = NI_PIX_FMT_YUV420P;
        break;
    }

    // original resolution this stream started with, this is used by encoder
    // sequence change
    p_enc_ctx->ori_width = width;
    p_enc_ctx->ori_height = height;
    p_enc_ctx->ori_bit_depth_factor = p_enc_ctx->bit_depth_factor;
    p_enc_ctx->ori_pix_fmt = pix_fmt;
    p_enc_ctx->pixel_format = pix_fmt;

    // source width: at least NI_MIN_WIDTH, otherwise rounded up to even; the
    // padding added is accounted for in the right conformance window offset
    int linesize_aligned = width;
    if (linesize_aligned < NI_MIN_WIDTH) {
        p_enc_params->cfg_enc_params.conf_win_right +=
            (NI_MIN_WIDTH - width) / 2 * 2;
        linesize_aligned = NI_MIN_WIDTH;
    } else {
        linesize_aligned = ((width + 1) / 2) * 2;
        p_enc_params->cfg_enc_params.conf_win_right +=
            (linesize_aligned - width) / 2 * 2;
    }
    p_enc_params->source_width = linesize_aligned;

    // source height: same treatment, using the bottom conformance window offset
    int height_aligned = height;
    if (height_aligned < NI_MIN_HEIGHT) {
        p_enc_params->cfg_enc_params.conf_win_bottom +=
            (NI_MIN_HEIGHT - height) / 2 * 2;
        height_aligned = NI_MIN_HEIGHT;
    } else {
        height_aligned = ((height + 1) / 2) * 2;
        p_enc_params->cfg_enc_params.conf_win_bottom +=
            (height_aligned - height) / 2 * 2;
    }
    p_enc_params->source_height = height_aligned;

    // VUI setting including color setting is done by specifying them in the
    // encoder config
    p_enc_params->cfg_enc_params.colorDescPresent = 0;
    if ((color_primaries != NI_COL_PRI_UNSPECIFIED) ||
        (color_space != NI_COL_SPC_UNSPECIFIED) ||
        (color_trc != NI_COL_TRC_UNSPECIFIED)) {
        p_enc_params->cfg_enc_params.colorDescPresent = 1;
    }
    p_enc_params->cfg_enc_params.colorPrimaries = color_primaries;
    p_enc_params->cfg_enc_params.colorTrc = color_trc;
    p_enc_params->cfg_enc_params.colorSpace = color_space;
    p_enc_params->cfg_enc_params.videoFullRange = video_full_range_flag;
    p_enc_params->cfg_enc_params.aspectRatioWidth = sar_num;
    p_enc_params->cfg_enc_params.aspectRatioHeight = sar_den;

    // frame rate comes from the encoder parameters, not from an argument
    p_enc_ctx->framerate.framerate_denom = p_enc_params->fps_denominator;
    p_enc_ctx->framerate.framerate_num = p_enc_params->fps_number;

    // default planar encoder input data
    p_enc_params->cfg_enc_params.planar = get_pixel_planar(pix_fmt);

    p_enc_params->video_full_range_flag = video_full_range_flag;

    if (check_zerocopy) {
        // config linesize for zero copy (if input resolution is zero copy
        // compatible)
        int src_stride[NI_MAX_NUM_DATA_POINTERS];

        // NOTE - FFmpeg / Gstreamer users should use linesize array in frame
        // structure instead of src_stride in the following sample code
        src_stride[0] = width * p_enc_ctx->bit_depth_factor;

        // NOTE - NV12 is not yet supported for zero copy
        // bool isnv12frame = (p_enc_params->cfg_enc_params.planar ==
        // NI_PIXEL_PLANAR_FORMAT_SEMIPLANAR) ? true : false; src_stride[1] =
        // isnv12frame ? src_stride[0] : src_stride[0] / 2; src_stride[2] =
        // isnv12frame ? 0 : src_stride[0] / 2;
        src_stride[1] = src_stride[2] = src_stride[0] / 2;

        // the return value of the check is not examined here
        ni_encoder_frame_zerocopy_check(p_enc_ctx, p_enc_params, width, height,
                                        (const int *)src_stride, true);
    }

    ret = ni_device_session_open(p_enc_ctx, NI_DEVICE_TYPE_ENCODER);
    if (ret != NI_RETCODE_SUCCESS) {
        ni_log(NI_LOG_ERROR, "Error: %s failure!\n", __func__);
    } else {
        ni_log(NI_LOG_INFO, "Encoder device %d session open successful.\n",
               iXcoderGUID);
    }

    // set up ROI QP map for ROI demo modes if enabled
    // NOTE(review): this also runs when the session open above failed -
    // confirm that set_demo_roi_map() is safe on an unopened session.
    if (p_enc_params->cfg_enc_params.roi_enable &&
        (1 == p_enc_params->roi_demo_mode ||
         2 == p_enc_params->roi_demo_mode)) {
        set_demo_roi_map(p_enc_ctx);
    }

    return ret;
}

/* Receives one packet from the encoder and releases the HW frame that the
 * encoder has finished with.
 *
 * enc_worker:        encoder whose session is polled.
 * pkt:               packet buffer handed to encoder_receive_data().
 * number_of_packets: running packet counter, updated by the receive call.
 * pfs:               output file passed through to encoder_receive_data().
 * xcoder_state:      not used by this function.
 *
 * If a new packet arrived, the session uses HW frames and the reported
 * recycle index is within range, the matching g_hwframe_pool entry loses one
 * reference (under the owning stack worker's lock) and is recycled when the
 * count reaches zero.
 *
 * Returns 0 to keep receiving, 1 on receive error or end of stream, and
 * NI_RETCODE_FAILURE when the recycle index refers to an unreferenced frame. */
int encoder_receive(encoder_worker *enc_worker, ni_session_data_io_t *pkt,
                    uint32_t *number_of_packets, FILE *pfs,
                    device_state_t *xcoder_state) {
    ni_session_context_t *enc_ctx = enc_worker->p_enc_ctx;
    uint32_t pkts_before;
    int rx_status;
    int recycle_index;
    int finished = 0;

    pkt->data.packet.end_of_stream = 0;
    pkts_before = *number_of_packets;

    rx_status = encoder_receive_data(enc_ctx, pkt, pfs, number_of_packets);
    recycle_index = pkt->data.packet.recycle_index;

    if (pkts_before < *number_of_packets && enc_ctx->hw_action &&
        recycle_index > 0 &&
        recycle_index < NI_GET_MAX_HWDESC_FRAME_INDEX(enc_ctx->ddr_config)) {
        stack_worker_t *owner = (stack_worker_t *)(enc_worker->stack_worker);
        bool released = false;

        enc_worker->enc_frame_num++;
        // the encoder only reports a valid recycle index when there is
        // something to recycle; the range check above holds for all memory
        // bins
        ni_log(NI_LOG_DEBUG, "encoder %d received 1 pkt, recycle index %d\n",
               enc_worker->index, recycle_index);

        ni_pthread_mutex_lock(&owner->lock);
        if (g_hwframe_pool[recycle_index].ref_cnt <= 0) {
            ni_log(NI_LOG_ERROR, "ERROR: ILLEGAL unref %d\n", recycle_index);
        } else {
            g_hwframe_pool[recycle_index].ref_cnt--;
            if (g_hwframe_pool[recycle_index].ref_cnt == 0) {
                ni_hwframe_buffer_recycle2(
                    &(g_hwframe_pool[recycle_index].surface));
            }
            released = true;
        }
        ni_pthread_mutex_unlock(&owner->lock);

        if (!released) {
            ni_log(NI_LOG_ERROR,
                   "recycle_index %d is not found in waiting to recycles "
                   "frames!\n",
                   recycle_index);
            return NI_RETCODE_FAILURE;
        }
    } else {
        ni_log(NI_LOG_TRACE,
               "enc recv, prev_num_pkt %u "
               "number_of_packets_list %u recycle_index %u\n",
               pkts_before, *number_of_packets, recycle_index);
    }

    // Error or eos
    if (rx_status < 0) {
        ni_log(NI_LOG_ERROR, "encoder recv error, quit!\n");
        finished = 1;
    } else if (rx_status == NI_TEST_RETCODE_EAGAIN) {
        // nothing available yet: back off briefly before the next poll
        ni_usleep(100);
    }

    if (pkt->data.packet.end_of_stream) {
        ni_log(NI_LOG_INFO, "encoder receive thread got eos !\n");
        finished = 1;
    }

    return finished;
}

static common_t *alloc_common(void) {
    common_t *common;
    int ret, i;

    common = malloc(sizeof(common_t));
    if (common == NULL) {
        return NULL;
    }

    memset(common, 0, sizeof(common_t));

    for (i = 0; i < NI_MAX_HW_FRAME; i++) {
        common->ready_frames[i] = -1;
    }

    for (i = 0; i < NI_MAX_XSTACK_INPUTS; i++) {
        common->input_eos[i] = -1;
    }

    ret = ni_pthread_mutex_init(&common->lock);
    if (ret) {
        goto fail_init_lock;
    }

    ret = ni_pthread_cond_init(&common->cond, NULL);
    if (ret) {
        goto fail_init_ready_cond;
    }

    return common;

fail_init_ready_cond:
    ni_pthread_mutex_destroy(&common->lock);
fail_init_lock:
    free(common);
    return NULL;
}

/* Destroys the synchronisation objects of the shared state and frees it.
 * Accepts NULL, in which case nothing is done.
 *
 * The objects were created with the ni_pthread_* wrappers in alloc_common(),
 * so they are torn down with the matching wrappers rather than with the raw
 * pthread calls, which keeps the non-POSIX (_WIN32) build consistent. */
static void free_common(common_t *common) {
    if (!common) {
        return;
    }

    ni_pthread_cond_destroy(&common->cond);
    ni_pthread_mutex_destroy(&common->lock);
    free(common);
}

static void recycle_stack_input_hwframe(common_t *common, int index, int mode) {
    int i;
    int ret;
    bool need_recycle = false;

    if (mode == XSTACK_APP_D2E || mode == XSTACK_APP_D2L) {
        // recycle one hw frame of all decoders
        for (i = 0; i < active_decoder_workers; i++) {
            decoder_worker *dec_worker = decoder_workers[i];
            ni_frame_entry_t *frame_entry = dec_worker->frame_entry;
            ni_buffered_frame_list_t *dec_frame_list =
                &(frame_entry->frame_list);
            ni_pthread_mutex_lock(&frame_entry->lock);
            ni_ref_frame_t *p_ref_frame = &(dec_frame_list->frames[index]);
            ret = ni_unref_frame(p_ref_frame);
            if (ret == 1) {
                // need to dequeue
                if (frame_list_is_full(dec_frame_list)) {
                    frame_list_dequeue(dec_frame_list);
                    // both dec send and receive threads are waiting this cond
                    ni_pthread_cond_broadcast(&frame_entry->cond);
                } else {
                    frame_list_dequeue(dec_frame_list);
                }
                need_recycle = true;
            } else if (ret < 0) {
                ni_log(NI_LOG_ERROR, "ERROR: ILLEGAL unref\n");
                ni_pthread_mutex_unlock(&frame_entry->lock);
                continue;
            }
            ni_pthread_mutex_unlock(&frame_entry->lock);
        }
    } else if (mode == XSTACK_APP_U2E || mode == XSTACK_APP_U2L) {
        // recycle one hw frame of all uploaders
        for (i = 0; i < active_upload_workers; i++) {
            upload_worker *upl_worker = upload_workers[i];
            ni_frame_entry_t *frame_entry = upl_worker->frame_entry;
            ni_buffered_frame_list_t *upl_frame_list =
                &(frame_entry->frame_list);
            ni_pthread_mutex_lock(&frame_entry->lock);
            ni_ref_frame_t *p_ref_frame = &(upl_frame_list->frames[index]);
            ret = ni_unref_frame(p_ref_frame);
            if (ret == 1) {
                // need to dequeue
                if (frame_list_is_full(upl_frame_list)) {
                    frame_list_dequeue(upl_frame_list);
                    ni_pthread_cond_signal(&frame_entry->cond);
                } else {
                    frame_list_dequeue(upl_frame_list);
                }
                need_recycle = true;
            } else if (ret < 0) {
                ni_log(NI_LOG_ERROR, "ERROR: ILLEGAL unref\n");
                ni_pthread_mutex_unlock(&frame_entry->lock);
                continue;
            }
            ni_pthread_mutex_unlock(&frame_entry->lock);
        }
    }

    if (need_recycle) {
        ni_pthread_mutex_lock(&common->lock);
        common->ready_upl_worker_num[index] = 0;
        common->ready_dec_worker_num[index] = 0;
        common->ready_frames[index] = -1;
        ni_pthread_mutex_unlock(&common->lock);
    }
}

/* Prepares a stack worker: allocates its scaler session context, opens a
 * scaler session in stack mode on device `devid`, allocates the output frame
 * pool for out_width x out_height in the global pixel format, configures the
 * number of inputs (capped at MAX_SINGLE_INPUTS) and initialises the
 * worker's mutex.
 *
 * The caller must have set nb_inputs, out_width and out_height beforehand.
 *
 * Returns NI_RETCODE_SUCCESS, or NI_RETCODE_FAILURE on any error. On failure
 * the session (if opened) is closed, the context is freed and
 * stack_worker->p_stack_ctx is reset to NULL so that it cannot be used or
 * freed again by mistake. */
int init_stack_worker(stack_worker_t *stack_worker) {
    ni_scaler_input_params_t stack_params = {0};
    ni_scaler_params_t scaler_params = {0};

    if (!stack_worker) {
        return NI_RETCODE_FAILURE;
    }

    stack_worker->p_stack_ctx = malloc(sizeof(ni_session_context_t));
    if (!stack_worker->p_stack_ctx) {
        ni_log(NI_LOG_ERROR,
               "ERROR: failed to allocate stack session context\n");
        return NI_RETCODE_FAILURE;
    }

    if (ni_device_session_context_init(stack_worker->p_stack_ctx) < 0) {
        ni_log(NI_LOG_ERROR,
               "Error: failed to init stack session context\n");
        goto fail_free_ctx;
    }

    stack_worker->stack_frame_num = 0;
    stack_worker->fillcolor = 1;
    stack_worker->pixel_format = pix_fmt;
    stack_worker->p_stack_ctx->device_handle = NI_INVALID_DEVICE_HANDLE;
    stack_worker->p_stack_ctx->blk_io_handle = NI_INVALID_DEVICE_HANDLE;
    stack_worker->p_stack_ctx->hw_id = devid;
    stack_worker->p_stack_ctx->device_type = NI_DEVICE_TYPE_SCALER;
    stack_worker->p_stack_ctx->scaler_operation = NI_SCALER_OPCODE_STACK;

    if (NI_RETCODE_SUCCESS != ni_device_session_open(stack_worker->p_stack_ctx,
                                                     NI_DEVICE_TYPE_SCALER)) {
        ni_log(NI_LOG_ERROR, "Error: failed to open stack session\n");
        goto fail_free_ctx;
    }

    // init stack filter parameters: input and output use the same format
    stack_params.output_format =
        ni_to_gc620_pix_fmt(stack_worker->pixel_format);
    stack_params.input_format = ni_to_gc620_pix_fmt(stack_worker->pixel_format);
    stack_params.output_width = stack_worker->out_width;
    stack_params.output_height = stack_worker->out_height;
    // allocate frame pool
    if (NI_RETCODE_SUCCESS !=
        ni_scaler_frame_pool_alloc(stack_worker->p_stack_ctx, stack_params)) {
        fprintf(stderr, "Error: init filter hwframe pool\n");
        goto fail_close_session;
    }

    // the scaler is configured with at most MAX_SINGLE_INPUTS inputs
    if (stack_worker->nb_inputs < MAX_SINGLE_INPUTS) {
        scaler_params.nb_inputs = stack_worker->nb_inputs;
    } else {
        scaler_params.nb_inputs = MAX_SINGLE_INPUTS;
    }

    if (NI_RETCODE_SUCCESS !=
        ni_scaler_set_params(stack_worker->p_stack_ctx, &scaler_params)) {
        ni_log(NI_LOG_ERROR, "Error: failed to set params for stack worker\n");
        goto fail_close_session;
    }

    if (ni_pthread_mutex_init(&stack_worker->lock)) {
        ni_log(NI_LOG_ERROR, "Error: failed to init stack worker mutex\n");
        goto fail_close_session;
    }

    return NI_RETCODE_SUCCESS;

fail_close_session:
    ni_device_session_close(stack_worker->p_stack_ctx, 0,
                            NI_DEVICE_TYPE_SCALER);
fail_free_ctx:
    free(stack_worker->p_stack_ctx);
    // do not leave a dangling pointer behind for later cleanup code
    stack_worker->p_stack_ctx = NULL;
    return NI_RETCODE_FAILURE;
}

/*!
 * \brief  Set up an upload worker: allocate the page-aligned YUV staging
 *         buffer, record the input file size, create the session context and
 *         the frame entry (with its mutex/cond) and open the uploader session.
 *
 * On failure everything acquired by this call is released again and the
 * corresponding worker pointers are reset to NULL so they cannot dangle.
 *
 * \param[in,out] upl_worker  worker with in_width, in_height, pixel_format,
 *                            index and pfs already filled in
 *
 * \return 0 on success; otherwise NI_RETCODE_FAILURE,
 *         NI_RETCODE_ERROR_MEM_ALOC or the error code returned by the failing
 *         init/open call
 */
int init_upl_worker(upload_worker *upl_worker) {
    int ret;
    int64_t file_size;
    int frame_size = calc_swframe_size(
        upl_worker->in_width, upl_worker->in_height, upl_worker->pixel_format);

    if (frame_size > MAX_YUV_FRAME_SIZE) {
        ni_log(NI_LOG_ERROR,
               "Unsupported upload frame size, wxh %dx%d, pix_fmt %d\n",
               upl_worker->in_width, upl_worker->in_height,
               upl_worker->pixel_format);
        return NI_RETCODE_FAILURE;
    }
    if (ni_posix_memalign(&(upl_worker->yuv_buf), sysconf(_SC_PAGESIZE),
                          frame_size)) {
        ni_log(NI_LOG_ERROR,
               "Error: failed to allocate YUV data buffer for uploader %d\n",
               upl_worker->index);
        return NI_RETCODE_FAILURE;
    }

    // get total file size of input, then rewind; a failed seek must not be
    // recorded as a (negative) file size
    file_size = lseek(upl_worker->pfs, 0, SEEK_END);
    if (file_size < 0 || lseek(upl_worker->pfs, 0, SEEK_SET) < 0) {
        ni_log(NI_LOG_ERROR,
               "Error: failed to get input file size for uploader %d\n",
               upl_worker->index);
        ret = NI_RETCODE_FAILURE;
        goto fail_buf;
    }
    upl_worker->file_reader.total_file_size = file_size;
    upl_worker->file_reader.data_left_size =
        upl_worker->file_reader.total_file_size;

    upl_worker->p_upl_ctx = malloc(sizeof(ni_session_context_t));
    if (upl_worker->p_upl_ctx == NULL) {
        ni_log(NI_LOG_ERROR, "Failed to allocate session context\n");
        ret = NI_RETCODE_FAILURE;
        goto fail_buf;
    }
    memset(upl_worker->p_upl_ctx, 0, sizeof(ni_session_context_t));
    ret = ni_device_session_context_init(upl_worker->p_upl_ctx);
    if (ret) {
        ni_log(NI_LOG_ERROR, "Failed to init p_upl_ctx\n");
        goto fail_ctx;
    }

    upl_worker->frame_entry = malloc(sizeof(ni_frame_entry_t));
    if (!upl_worker->frame_entry) {
        ni_log(NI_LOG_ERROR, "Failed to allocate frame entry for uploader\n");
        ret = NI_RETCODE_ERROR_MEM_ALOC;
        goto fail_ctx;
    }
    memset(upl_worker->frame_entry, 0, sizeof(ni_frame_entry_t));
    upl_worker->frame_entry->frame_list.size = NI_MAX_HW_FRAME;

    ret = ni_pthread_mutex_init(&upl_worker->frame_entry->lock);
    if (ret) {
        ni_log(NI_LOG_ERROR,
               "Failed to initialize frame_entry mutex for uploader\n");
        goto fail_entry;
    }
    ret = ni_pthread_cond_init(&upl_worker->frame_entry->cond, NULL);
    if (ret) {
        ni_log(NI_LOG_ERROR,
               "Failed to initialize frame_entry cond for uploader\n");
        goto fail_lock;
    }

    ret = uploader_open_session(
        upl_worker->p_upl_ctx, upl_worker->in_width, upl_worker->in_height,
        upl_worker->pixel_format,
        // default pool size is NI_MAX_HW_FRAME (the same as decoder)
        NI_MAX_HW_FRAME, devid);
    if (ret) {
        ni_log(NI_LOG_ERROR, "uploader_open_session failed\n");
        goto fail_cond;
    }

    return ret;

    // cleanup ladder: each label releases one resource and falls through to
    // the ones acquired before it
fail_cond:
    ni_pthread_cond_destroy(&(upl_worker->frame_entry->cond));
fail_lock:
    ni_pthread_mutex_destroy(&(upl_worker->frame_entry->lock));
fail_entry:
    free(upl_worker->frame_entry);
    upl_worker->frame_entry = NULL;
fail_ctx:
    free(upl_worker->p_upl_ctx);
    upl_worker->p_upl_ctx = NULL;
fail_buf:
    ni_aligned_free(upl_worker->yuv_buf);
    upl_worker->yuv_buf = NULL;
    return ret;
}

/*!
 * \brief  Set up a decoder worker: read the input file, probe the stream
 *         header, build the decoder parameters and session context, open the
 *         hardware decoder session and allocate the frame entry and packet
 *         buffer.
 *
 * On failure after the input file has been read, everything acquired by this
 * call is released through a single cleanup ladder. That includes the decoder
 * session and device handles once the session has been opened. The worker
 * pointers that were freed are reset to NULL. The file cache filled by
 * read_input_file() is left untouched on failure, as before.
 *
 * \param[in,out] dec_worker  worker with pfs, filename, codec_type, index and
 *                            decoder_params already filled in
 *
 * \return NI_RETCODE_SUCCESS on success; otherwise NI_RETCODE_FAILURE,
 *         NI_RETCODE_ERROR_MEM_ALOC or the error code of the failing call
 */
int init_dec_worker(decoder_worker *dec_worker) {
    int ret;
    size_t stream_info_size;
    ni_xcoder_params_t *p_dec_api_param = NULL;

    // get total file size of input
    lseek(dec_worker->pfs, 0, SEEK_END);
    dec_worker->file_reader.total_file_size =
        lseek(dec_worker->pfs, 0, SEEK_CUR);
    lseek(dec_worker->pfs, 0, SEEK_SET);
    dec_worker->file_reader.data_left_size =
        dec_worker->file_reader.total_file_size;

    // read file
    ret = read_input_file(&(dec_worker->file_reader), dec_worker->pfs);
    if (ret != NI_RETCODE_SUCCESS) {
        close(dec_worker->pfs);
        printf("Error: failed to read file: %s\n", dec_worker->filename);
        return ret;
    }

    // allocate stream info struct; its size depends on the codec
    if (dec_worker->codec_type == NI_CODEC_FORMAT_H264) {
        stream_info_size = sizeof(ni_h264_sps_t);
    } else if (dec_worker->codec_type == NI_CODEC_FORMAT_H265) {
        stream_info_size = sizeof(ni_h265_sps_t);
    } else { // dec_worker->codec_type == NI_CODEC_FORMAT_VP9
        stream_info_size = sizeof(ni_vp9_header_info_t);
    }
    dec_worker->p_stream_info = malloc(stream_info_size);
    if (!dec_worker->p_stream_info) {
        ni_log(NI_LOG_ERROR,
               "Error: failed to allocate stream info for decoder %d\n",
               dec_worker->index);
        return NI_RETCODE_ERROR_MEM_ALOC;
    }
    memset(dec_worker->p_stream_info, 0, stream_info_size);

    // probe stream info
    if (probe_dec_stream_info(dec_worker->codec_type, dec_worker->p_stream_info,
                              &(dec_worker->file_reader), &(dec_worker->width),
                              &(dec_worker->height), &(dec_worker->bit_depth),
                              &(dec_worker->fps_num), &(dec_worker->fps_den))) {
        printf("Error: failed to probe decoder %d info\n", dec_worker->index);
        ret = NI_RETCODE_FAILURE;
        goto fail_info;
    }

    // init params
    p_dec_api_param = malloc(sizeof(ni_xcoder_params_t));
    if (!p_dec_api_param) {
        ni_log(NI_LOG_ERROR, "Error: failed to allocate p_dec_api_param\n");
        ret = NI_RETCODE_ERROR_MEM_ALOC;
        goto fail_info;
    }

    if (ni_decoder_init_default_params(p_dec_api_param, 25, 1, 200000,
                                       dec_worker->width,
                                       dec_worker->height) < 0) {
        fprintf(stderr, "Error: decoder p_config set up error\n");
        ret = NI_RETCODE_FAILURE;
        goto fail_param;
    }

    dec_worker->p_dec_ctx = malloc(sizeof(ni_session_context_t));
    if (!dec_worker->p_dec_ctx) {
        ni_log(NI_LOG_ERROR,
               "ERROR: failed to allocate p_dec_ctx for decoder %d\n",
               dec_worker->index);
        ret = NI_RETCODE_ERROR_MEM_ALOC;
        goto fail_param;
    }
    memset(dec_worker->p_dec_ctx, 0, sizeof(ni_session_context_t));
    if (ni_device_session_context_init(dec_worker->p_dec_ctx) < 0) {
        fprintf(stderr, "Error: failed to init p_dec_ctx for decoder %d\n",
                dec_worker->index);
        ret = NI_RETCODE_FAILURE;
        goto fail_ctx;
    }

    dec_worker->p_dec_ctx->p_session_config = p_dec_api_param;
    dec_worker->p_dec_ctx->session_id = NI_INVALID_SESSION_ID;
    dec_worker->p_dec_ctx->codec_format = dec_worker->codec_type;
    dec_worker->p_dec_ctx->device_handle =
        dec_worker->p_dec_ctx->blk_io_handle = NI_INVALID_DEVICE_HANDLE;
    dec_worker->p_dec_ctx->hw_id = devid;
    dec_worker->p_dec_ctx->src_bit_depth = dec_worker->bit_depth;
    dec_worker->p_dec_ctx->src_endian = NI_FRAME_LITTLE_ENDIAN;
    dec_worker->p_dec_ctx->bit_depth_factor = 1;
    dec_worker->p_dec_ctx->async_mode = 0;
    if (10 == dec_worker->p_dec_ctx->src_bit_depth) {
        dec_worker->p_dec_ctx->bit_depth_factor = 2;
    }

    // check and set ni_decoder_params from --xcoder-params
    if (ni_retrieve_decoder_params(dec_worker->decoder_params, p_dec_api_param,
                                   dec_worker->p_dec_ctx)) {
        fprintf(stderr, "Error: decoder %d p_config parsing error\n",
                dec_worker->index);
        ret = NI_RETCODE_FAILURE;
        goto fail_ctx;
    }

    if (parse_symbolic_decoder_param(p_dec_api_param)) {
        fprintf(stderr, "Error: decoder %d p_config parsing error\n",
                dec_worker->index);
        ret = NI_RETCODE_FAILURE;
        goto fail_ctx;
    }

    // this application only works on hardware frames
    if (!p_dec_api_param->dec_input_params.hwframes) {
        printf("Error: must set out=hw for this demo\n");
        ret = NI_RETCODE_FAILURE;
        goto fail_ctx;
    }
    dec_worker->p_dec_ctx->hw_action = NI_CODEC_HW_ENABLE;
    if (ni_device_session_open(dec_worker->p_dec_ctx, NI_DEVICE_TYPE_DECODER)) {
        fprintf(stderr, "Error: ni_decoder_session_open() failure!\n");
        ret = NI_RETCODE_FAILURE;
        goto fail_ctx;
    }

    dec_worker->frame_entry = malloc(sizeof(ni_frame_entry_t));
    if (!dec_worker->frame_entry) {
        fprintf(
            stderr,
            "Error: failed to allocate memory for frame_entry for decoder %d\n",
            dec_worker->index);
        ret = NI_RETCODE_ERROR_MEM_ALOC;
        goto fail_session;
    }
    memset(dec_worker->frame_entry, 0, sizeof(ni_frame_entry_t));
    dec_worker->frame_entry->frame_list.size = NI_MAX_HW_FRAME;
    ret = ni_pthread_mutex_init(&dec_worker->frame_entry->lock);
    if (ret) {
        fprintf(stderr, "Error: decoder %d ni_pthread_mutex_init failed: %d\n",
                dec_worker->index, ret);
        goto fail_entry;
    }
    ret = ni_pthread_cond_init(&dec_worker->frame_entry->cond, NULL);
    if (ret) {
        fprintf(stderr, "Error: decoder %d failed to init cond: %d\n",
                dec_worker->index, ret);
        goto fail_lock;
    }

    // alloc packet buffer
    dec_worker->stream_buf = malloc(NI_MAX_TX_SZ);
    if (NULL == dec_worker->stream_buf) {
        ni_log(NI_LOG_ERROR, "Failed to allocate stream buf for decoder %d\n",
               dec_worker->index);
        ret = NI_RETCODE_FAILURE;
        goto fail_cond;
    }
    memset(dec_worker->stream_buf, 0, NI_MAX_TX_SZ);

    return NI_RETCODE_SUCCESS;

    // cleanup ladder: each label releases one resource and falls through to
    // the ones acquired before it
fail_cond:
    ni_pthread_cond_destroy(&dec_worker->frame_entry->cond);
fail_lock:
    ni_pthread_mutex_destroy(&dec_worker->frame_entry->lock);
fail_entry:
    free(dec_worker->frame_entry);
    dec_worker->frame_entry = NULL;
fail_session:
    // the session was opened successfully: close it and its device handles
    // the same way free_decoder_worker() does
    ni_device_session_close(dec_worker->p_dec_ctx, 0, NI_DEVICE_TYPE_DECODER);
    ni_device_close(dec_worker->p_dec_ctx->device_handle);
    ni_device_close(dec_worker->p_dec_ctx->blk_io_handle);
fail_ctx:
    free(dec_worker->p_dec_ctx);
    dec_worker->p_dec_ctx = NULL;
fail_param:
    free(p_dec_api_param);
fail_info:
    free(dec_worker->p_stream_info);
    dec_worker->p_stream_info = NULL;
    return ret;
}

/*!
 * \brief  Allocate and initialise the software-side state of an encoder
 *         worker: session context, worker mutex, start/opened condition
 *         variables and the buffered frame entry with its own mutex/cond.
 *
 * The encoder session itself is not opened here. On failure everything
 * acquired by this call is released in reverse order of acquisition and the
 * freed worker pointers are reset to NULL.
 *
 * \param[in,out] enc_worker  encoder worker to initialise
 *
 * \return NI_RETCODE_SUCCESS on success; NI_RETCODE_ERROR_MEM_ALOC on
 *         allocation failure; NI_RETCODE_FAILURE or the ni_pthread_* error
 *         code otherwise
 */
int init_enc_worker(encoder_worker *enc_worker) {
    int ret = 0;

    enc_worker->p_enc_ctx = malloc(sizeof(ni_session_context_t));
    if (!enc_worker->p_enc_ctx) {
        ni_log(NI_LOG_ERROR, "ERROR: failed to allocate p_enc_ctx\n");
        return NI_RETCODE_ERROR_MEM_ALOC;
    }
    memset(enc_worker->p_enc_ctx, 0, sizeof(ni_session_context_t));
    if (ni_device_session_context_init(enc_worker->p_enc_ctx) < 0) {
        ni_log(NI_LOG_ERROR, "Error: failed to init p_enc_ctx\n");
        ret = NI_RETCODE_FAILURE;
        goto fail_ctx;
    }

    if (ni_pthread_mutex_init(&enc_worker->lock)) {
        ni_log(NI_LOG_ERROR, "ERROR: failed to init encoder mutex\n");
        ret = NI_RETCODE_FAILURE;
        goto fail_ctx;
    }
    if (ni_pthread_cond_init(&enc_worker->start_cond, NULL)) {
        ni_log(NI_LOG_ERROR, "ERROR: failed to init encoder start_cond\n");
        ret = NI_RETCODE_FAILURE;
        goto fail_lock;
    }
    if (ni_pthread_cond_init(&enc_worker->opened_cond, NULL)) {
        ni_log(NI_LOG_ERROR, "ERROR: failed to init encoder opened_cond\n");
        ret = NI_RETCODE_FAILURE;
        goto fail_start_cond;
    }

    enc_worker->frame_entry = malloc(sizeof(ni_frame_entry_t));
    if (!enc_worker->frame_entry) {
        ni_log(NI_LOG_ERROR,
               "Error: failed to allocate memory for frame_entry of encoder\n");
        ret = NI_RETCODE_ERROR_MEM_ALOC;
        goto fail_opened_cond;
    }
    memset(enc_worker->frame_entry, 0, sizeof(ni_frame_entry_t));
    enc_worker->frame_entry->frame_list.size = NI_MAX_BUFFERED_FRAME;

    ret = ni_pthread_mutex_init(&enc_worker->frame_entry->lock);
    if (ret) {
        ni_log(NI_LOG_ERROR, "Error: encoder ni_pthread_mutex_init failed\n");
        goto fail_entry;
    }
    ret = ni_pthread_cond_init(&enc_worker->frame_entry->cond, NULL);
    if (ret) {
        ni_log(NI_LOG_ERROR, "Error: encoder failed to init cond\n");
        goto fail_entry_lock;
    }

    enc_worker->got_sos = 0;
    enc_worker->started = false;
    return NI_RETCODE_SUCCESS;

    // cleanup ladder; the frame_entry mutex is destroyed BEFORE frame_entry
    // is freed so freed memory is never touched
fail_entry_lock:
    ni_pthread_mutex_destroy(&enc_worker->frame_entry->lock);
fail_entry:
    free(enc_worker->frame_entry);
    enc_worker->frame_entry = NULL;
fail_opened_cond:
    ni_pthread_cond_destroy(&enc_worker->opened_cond);
fail_start_cond:
    ni_pthread_cond_destroy(&enc_worker->start_cond);
fail_lock:
    ni_pthread_mutex_destroy(&enc_worker->lock);
fail_ctx:
    free(enc_worker->p_enc_ctx);
    enc_worker->p_enc_ctx = NULL;
    return ret;
}

/*!
 * \brief  Build the encoder parameters and open the encoder session of an
 *         encoder worker in hardware-frame mode.
 *
 * Colour description, sample aspect ratio, full-range flag and frame rate
 * default to values taken from p_ni_frame (when given); user-configured
 * encoder parameters override the full-range flag and aspect ratio.
 *
 * \param[in,out] enc_worker  encoder worker whose p_enc_ctx is initialised
 * \param[in]     p_ni_frame  first frame to encode, used for VUI/fps
 *                            defaults; may be NULL
 * \param[in]     p_surface   hardware surface of that frame; must not be NULL
 *
 * \return NI_RETCODE_SUCCESS on success; NI_RETCODE_ERROR_MEM_ALOC,
 *         NI_RETCODE_FAILURE or the error code of the failing call otherwise
 */
int init_open_encoder(encoder_worker *enc_worker, ni_frame_t *p_ni_frame,
                      niFrameSurface1_t *p_surface) {
    int ret = 0;
    int color_prim = NI_COL_PRI_UNSPECIFIED;
    int color_trc = NI_COL_TRC_UNSPECIFIED;
    int color_space = NI_COL_SPC_UNSPECIFIED;
    int sar_num = 0;
    int sar_den = 0;
    int video_full_range_flag = 0;
    int fps_num = 30;
    int fps_den = 1;
    int bitrate = 200000;

    // init params
    ni_session_context_t *p_enc_ctx = enc_worker->p_enc_ctx;
    p_enc_ctx->codec_format = enc_worker->codec_format;
    ni_xcoder_params_t *p_enc_api_param = malloc(sizeof(*p_enc_api_param));
    if (!p_enc_api_param) {
        printf("Error: failed to allocate p_enc_api_param\n");
        return NI_RETCODE_ERROR_MEM_ALOC;
    }

    if (p_ni_frame) {
        // open the encode session when the first frame arrives and the session
        // is not opened yet, with the source stream and user-configured encode
        // info both considered when constructing VUI in the stream headers
        color_prim = p_ni_frame->color_primaries;
        color_trc = p_ni_frame->color_trc;
        color_space = p_ni_frame->color_space;
        ni_log(NI_LOG_DEBUG,
               "color primaries: %d, color space: %d, color_trc: %d\n",
               color_prim, color_space, color_trc);
        sar_num = p_ni_frame->sar_width ? (p_ni_frame->sar_width) : sar_num;
        sar_den = p_ni_frame->sar_height ? (p_ni_frame->sar_height) : sar_den;
        ni_log(NI_LOG_DEBUG, "sar_num: %d, den: %d\n", sar_num, sar_den);
        video_full_range_flag = p_ni_frame->video_full_range_flag;
        ni_log(NI_LOG_DEBUG, "video full range flag %d\n",
               video_full_range_flag);

        // calculate the source fps and set it as the default target fps, based
        // on the timing_info passed in from the decoded frame
        if (p_ni_frame->vui_num_units_in_tick && p_ni_frame->vui_time_scale) {
            if (NI_CODEC_FORMAT_H264 == p_ni_frame->src_codec) {
                // H.264: halve time_scale when it is even, otherwise double
                // the denominator instead
                if (0 == p_ni_frame->vui_time_scale % 2) {
                    fps_num = (int)(p_ni_frame->vui_time_scale / 2);
                    fps_den = (int)(p_ni_frame->vui_num_units_in_tick);
                } else {
                    fps_num = (int)(p_ni_frame->vui_time_scale);
                    fps_den = (int)(2 * p_ni_frame->vui_num_units_in_tick);
                }
            } else if (NI_CODEC_FORMAT_H265 == p_ni_frame->src_codec) {
                fps_num = p_ni_frame->vui_time_scale;
                fps_den = p_ni_frame->vui_num_units_in_tick;
            }
        } else if (enc_worker->fps_num && enc_worker->fps_den) {
            fps_num = enc_worker->fps_num;
            fps_den = enc_worker->fps_den;
        }
    }

    ret = ni_encoder_init_default_params(
        p_enc_api_param, fps_num, fps_den, bitrate, enc_worker->input_width,
        enc_worker->input_height, enc_worker->codec_format);
    if (ret < 0) {
        ni_log(NI_LOG_ERROR, "ERROR: ni_encoder_init_default_params failed\n");
        goto fail_free_params;
    }

    if (ni_retrieve_xcoder_params(enc_worker->encoder_params, p_enc_api_param,
                                  p_enc_ctx)) {
        ni_log(NI_LOG_ERROR, "ERROR: encoder p_config parsing error\n");
        // ret still holds the non-negative result of the previous call, so an
        // explicit failure code is required here
        ret = NI_RETCODE_FAILURE;
        goto fail_free_params;
    }

    // check video full range flag configuration
    if (p_enc_api_param->video_full_range_flag >= 0) {
        video_full_range_flag = p_enc_api_param->video_full_range_flag;
    }

    // check aspect ratio indicator configuration
    if (p_enc_api_param->sar_denom) {
        sar_num = p_enc_api_param->sar_num;
        sar_den = p_enc_api_param->sar_denom;
    }

    if (p_enc_api_param->low_delay_mode)
        p_enc_ctx->async_mode = 1;
    p_enc_ctx->hw_id = devid;

    if (!p_surface) {
        ni_log(NI_LOG_ERROR, "ERROR: input frame surface is null!\n");
        ret = NI_RETCODE_FAILURE;
        goto fail_free_params;
    }
    p_enc_ctx->hw_action = NI_CODEC_HW_ENABLE;
    p_enc_api_param->hwframes = 1;
    p_enc_ctx->sender_handle =
        (ni_device_handle_t)(int64_t)(p_surface->device_handle);
    p_enc_api_param->rootBufId = p_surface->ui16FrameIdx;

    ni_log(NI_LOG_DEBUG,
           "Open encoder, codec_fmt %d, input_width %d, input_height %d, "
           "pix_fmt %d\n",
           enc_worker->codec_format, enc_worker->input_width,
           enc_worker->input_height, pix_fmt);
    ret = encoder_open_session(p_enc_ctx, enc_worker->codec_format, devid,
                               p_enc_api_param, enc_worker->input_width,
                               enc_worker->input_height, color_prim, color_trc,
                               color_space, video_full_range_flag, sar_num,
                               sar_den, pix_fmt, false);
    if (ret != NI_RETCODE_SUCCESS) {
        ni_log(NI_LOG_ERROR, "Failed to open encoder\n");
    }
    // NOTE(review): p_enc_api_param is not freed once it has been handed to
    // encoder_open_session(); presumably it is attached to
    // p_enc_ctx->p_session_config and released by free_encoder_worker() -
    // confirm against encoder_open_session().
    return ret;

fail_free_params:
    // the parameters were never handed to the session, so they are still
    // owned by this function
    free(p_enc_api_param);
    return ret;
}

/*!
 * \brief  Give consumers a bounded amount of time to return uploader frames.
 *
 * Polls the uploader frame list up to three times; while it is not empty the
 * function logs the number of frames still held and sleeps for the session's
 * keep_alive_timeout (passed unchanged to ni_usleep) before polling again.
 * Despite its name it does not close the session itself.
 *
 * \param[in] upl_worker  upload worker; nothing is done when it or its
 *                        session context is NULL
 */
static void try_close_uploader_session(upload_worker *upl_worker) {
    if (upl_worker == NULL || upl_worker->p_upl_ctx == NULL) {
        return;
    }

    // wait for at most max_polls * keep_alive_timeout before close
    const int max_polls = 3;
    int polls_done = 0;

    while (polls_done < max_polls) {
        int still_busy;

        ni_pthread_mutex_lock(&upl_worker->frame_entry->lock);
        still_busy = !frame_list_is_empty(&upl_worker->frame_entry->frame_list);
        if (still_busy) {
            ni_log(NI_LOG_INFO,
                   "%s: %d uploader frames still in use, postpone close!\n",
                   __func__,
                   frame_list_length(&upl_worker->frame_entry->frame_list));
        }
        ni_pthread_mutex_unlock(&upl_worker->frame_entry->lock);

        if (!still_busy) {
            return;
        }
        ni_usleep(upl_worker->p_upl_ctx->keep_alive_timeout);
        polls_done++;
    }
}

/*!
 * \brief  Stop an upload worker's thread and release everything it owns.
 *
 * Sequence: flag the thread to exit and wake it, join it, wait briefly for
 * outstanding frames, recycle/free the buffered frames, close the upload
 * session and device handles, then free the YUV buffer, the input file
 * descriptor and the worker itself.
 *
 * All accesses to frame_entry are guarded by a NULL check, so a worker whose
 * frame entry was never allocated can be passed in safely.
 *
 * \param[in] upl_worker  worker to destroy; NULL is ignored
 */
static void free_upload_worker(upload_worker *upl_worker) {
    if (!upl_worker)
        return;
    void *result;
    ni_log(NI_LOG_INFO, "clean and free upload worker %d\n", upl_worker->index);

    // signal condition in case upload thread is waiting
    if (upl_worker->frame_entry) {
        ni_pthread_mutex_lock(&upl_worker->frame_entry->lock);
        upl_worker->should_exit = 1;
        ni_pthread_cond_signal(&upl_worker->frame_entry->cond);
        ni_pthread_mutex_unlock(&upl_worker->frame_entry->lock);
    } else {
        upl_worker->should_exit = 1;
    }

    if (ni_pthread_join(upl_worker->tid, &result) == 0) {
        if ((long)result != 0) {
            ni_log(NI_LOG_INFO,
                   "pthread_join upload worker %d thread ret %ld\n",
                   upl_worker->index, (long)result);
            global_state = EXIT_FAILURE;
        }
    }

    if (upl_worker->frame_entry) {
        try_close_uploader_session(upl_worker);
        ni_pthread_mutex_destroy(&(upl_worker->frame_entry->lock));
        ni_pthread_cond_destroy(&(upl_worker->frame_entry->cond));

        ni_buffered_frame_list_t *frame_list =
            &upl_worker->frame_entry->frame_list;
        for (int i = 0; i < frame_list->size; i++) {
            ni_frame_t *frame = &(frame_list->frames[i].data.data.frame);
            int ref_cnt = frame_list->frames[i].ref_cnt;
            // p_data[3] carries the hardware surface descriptor, if any
            if (frame->p_data[3]) {
                niFrameSurface1_t *surf =
                    (niFrameSurface1_t *)(frame->p_data[3]);
                if (surf->ui16FrameIdx) {
                    if (ref_cnt > 0) {
                        ni_log(
                            NI_LOG_ERROR,
                            "ERROR: uploader output frame %d is not recycled "
                            "before close, it may be used by other sessions. "
                            "Leave it to be recycled by hw!\n",
                            surf->ui16FrameIdx);
                    } else {
                        ni_hwframe_buffer_recycle2(surf);
                    }
                }
            }
            ni_frame_buffer_free(frame);
        }
        free(upl_worker->frame_entry);
    }
    if (upl_worker->p_upl_ctx) {
        int session_id = upl_worker->p_upl_ctx->session_id;
        ni_device_session_close(upl_worker->p_upl_ctx, 0, NI_DEVICE_TYPE_UPLOAD);
        ni_log(NI_LOG_DEBUG, "%s: upload session %x closed.\n", __func__,
            session_id);
        ni_device_session_context_clear(upl_worker->p_upl_ctx);
        ni_device_close(upl_worker->p_upl_ctx->device_handle);
        ni_device_close(upl_worker->p_upl_ctx->blk_io_handle);
        free(upl_worker->p_upl_ctx);
    }
    ni_aligned_free(upl_worker->yuv_buf);
    close(upl_worker->pfs);
    free(upl_worker);
}

/*!
 * \brief  Give consumers a bounded amount of time to return decoder frames.
 *
 * Polls the decoder frame list up to three times; while it is not empty the
 * function logs the number of frames still held and sleeps for the session's
 * keep_alive_timeout (passed unchanged to ni_usleep) before polling again.
 * Despite its name it does not close the session itself.
 *
 * \param[in] dec_worker  decoder worker; nothing is done when it or its
 *                        session context is NULL
 */
static void try_close_decoder_session(decoder_worker *dec_worker) {
    if (dec_worker == NULL || dec_worker->p_dec_ctx == NULL) {
        return;
    }

    // wait for at most max_polls * keep_alive_timeout before close
    const int max_polls = 3;
    int polls_done = 0;

    while (polls_done < max_polls) {
        int still_busy;

        ni_pthread_mutex_lock(&dec_worker->frame_entry->lock);
        still_busy = !frame_list_is_empty(&dec_worker->frame_entry->frame_list);
        if (still_busy) {
            ni_log(NI_LOG_INFO,
                   "%s: %d decoder frames still in use, postpone close!\n",
                   __func__,
                   frame_list_length(&dec_worker->frame_entry->frame_list));
        }
        ni_pthread_mutex_unlock(&dec_worker->frame_entry->lock);

        if (!still_busy) {
            return;
        }
        ni_usleep(dec_worker->p_dec_ctx->keep_alive_timeout);
        polls_done++;
    }
}

/*!
 * \brief  Stop a decoder worker's send/recv threads and release everything
 *         the worker owns.
 *
 * The input file cache, packet buffer and file descriptor are released only
 * AFTER both threads have been joined, so a thread that is still running
 * cannot touch freed memory or a closed descriptor. All accesses to
 * frame_entry are guarded by a NULL check.
 *
 * \param[in] dec_worker  worker to destroy; NULL is ignored
 */
static void free_decoder_worker(decoder_worker *dec_worker) {
    if (!dec_worker)
        return;
    void *result;
    ni_log(NI_LOG_INFO, "free and clean decoder worker %d\n",
           dec_worker->index);

    // signal condition in case send/recv threads are waiting
    if (dec_worker->frame_entry) {
        ni_pthread_mutex_lock(&(dec_worker->frame_entry->lock));
        dec_worker->should_exit = 1;
        ni_pthread_cond_broadcast(&(dec_worker->frame_entry->cond));
        ni_pthread_mutex_unlock(&(dec_worker->frame_entry->lock));
    } else {
        dec_worker->should_exit = 1;
    }

    if (ni_pthread_join(dec_worker->send_tid, &result) == 0) {
        if ((long)result != 0) {
            ni_log(NI_LOG_INFO,
                   "pthread_join decoder worker send_thread ret %ld\n",
                   (long)result);
            global_state = EXIT_FAILURE;
        }
    }

    if (ni_pthread_join(dec_worker->recv_tid, &result) == 0) {
        if ((long)result != 0) {
            ni_log(NI_LOG_INFO,
                   "pthread_join decoder worker recv_thread ret %ld\n",
                   (long)result);
            global_state = EXIT_FAILURE;
        }
    }

    // the worker threads are gone: input-side resources can be released now
    free(dec_worker->file_reader.file_cache);
    close(dec_worker->pfs);
    free(dec_worker->stream_buf);
    free(dec_worker->p_stream_info);

    if (dec_worker->frame_entry) {
        try_close_decoder_session(dec_worker);
        ni_pthread_mutex_destroy(&dec_worker->frame_entry->lock);
        ni_pthread_cond_destroy(&dec_worker->frame_entry->cond);

        ni_buffered_frame_list_t *frame_list =
            &dec_worker->frame_entry->frame_list;
        for (int i = 0; i < frame_list->size; i++) {
            ni_frame_t *frame = &(frame_list->frames[i].data.data.frame);
            int ref_cnt = frame_list->frames[i].ref_cnt;
            // p_data[3] carries the hardware surface descriptor, if any
            if (frame->p_data[3]) {
                niFrameSurface1_t *surf =
                    (niFrameSurface1_t *)(frame->p_data[3]);
                if (surf->ui16FrameIdx) {
                    if (ref_cnt > 0) {
                        ni_log(
                            NI_LOG_INFO,
                            "WARNING: decoder output frame %d is not recycled "
                            "before close, it may be used by other sessions. "
                            "Leave it to be reycled by hw!\n",
                            surf->ui16FrameIdx);
                    } else {
                        ni_hwframe_buffer_recycle2(surf);
                    }
                }
            }
            ni_frame_buffer_free(frame);
        }
        free(dec_worker->frame_entry);
    }
    if (dec_worker->p_dec_ctx) {
        int session_id = dec_worker->p_dec_ctx->session_id;
        ni_device_session_close(dec_worker->p_dec_ctx, 0, NI_DEVICE_TYPE_DECODER);
        ni_log(NI_LOG_DEBUG, "%s: decoder session 0x%x closed.\n", __func__,
            session_id);
        free(dec_worker->p_dec_ctx->p_session_config);
        ni_device_session_context_clear(dec_worker->p_dec_ctx);
        ni_device_close(dec_worker->p_dec_ctx->device_handle);
        ni_device_close(dec_worker->p_dec_ctx->blk_io_handle);
        free(dec_worker->p_dec_ctx);
    }
    free(dec_worker);
}

/*!
 * \brief  Stop an encoder worker's send/recv threads and release everything
 *         the worker owns.
 *
 * Sets should_exit, wakes any thread waiting on start_cond, opened_cond or
 * the frame entry condition, joins both threads, then destroys the
 * synchronisation objects, frees buffered frames, closes the encoder session
 * and frees the worker. frame_entry is only touched when it is non-NULL.
 *
 * \param[in] enc_worker  worker to destroy; NULL is ignored
 */
static void free_encoder_worker(encoder_worker *enc_worker) {
    if (!enc_worker)
        return;

    ni_log(NI_LOG_INFO, "cleanup and free enc_worker %d\n", enc_worker->index);
    void *result;
    enc_worker->should_exit = 1;

    // wake up threads that may be blocked on any of the worker's conditions
    ni_pthread_mutex_lock(&(enc_worker->lock));
    ni_pthread_cond_signal(&(enc_worker->start_cond));
    ni_pthread_mutex_unlock(&(enc_worker->lock));

    ni_pthread_mutex_lock(&(enc_worker->lock));
    ni_pthread_cond_signal(&(enc_worker->opened_cond));
    ni_pthread_mutex_unlock(&(enc_worker->lock));

    if (enc_worker->frame_entry) {
        ni_pthread_mutex_lock(&enc_worker->frame_entry->lock);
        ni_pthread_cond_signal(&enc_worker->frame_entry->cond);
        ni_pthread_mutex_unlock(&enc_worker->frame_entry->lock);
    }

    if (ni_pthread_join(enc_worker->send_tid, &result) == 0) {
        if ((long)result != 0) {
            ni_log(NI_LOG_INFO,
                   "pthread_join encoder worker send_thread ret %ld\n",
                   (long)result);
            global_state = EXIT_FAILURE;
        }
    }

    if (ni_pthread_join(enc_worker->recv_tid, &result) == 0) {
        if ((long)result != 0) {
            ni_log(NI_LOG_INFO,
                   "pthread_join encoder worker recv_thread ret %ld\n",
                   (long)result);
            global_state = EXIT_FAILURE;
        }
    }

    ni_pthread_mutex_destroy(&enc_worker->lock);
    ni_pthread_cond_destroy(&enc_worker->start_cond);
    ni_pthread_cond_destroy(&enc_worker->opened_cond);
    if (enc_worker->frame_entry) {
        ni_pthread_mutex_destroy(&enc_worker->frame_entry->lock);
        ni_pthread_cond_destroy(&enc_worker->frame_entry->cond);
        ni_buffered_frame_list_t *frame_list =
            &enc_worker->frame_entry->frame_list;
        for (int i = 0; i < frame_list->size; i++) {
            ni_frame_t *frame = &(frame_list->frames[i].data.data.frame);
            // do not recycle encoder input frame because
            // it might be used by other encoder workers
            ni_frame_buffer_free(frame);
        }
        free(enc_worker->frame_entry);
    }
    if (enc_worker->p_enc_ctx) {
        ni_device_session_close(enc_worker->p_enc_ctx, 0,
                                NI_DEVICE_TYPE_ENCODER);
        ni_device_session_context_clear(enc_worker->p_enc_ctx);
        ni_device_close(enc_worker->p_enc_ctx->device_handle);
        ni_device_close(enc_worker->p_enc_ctx->blk_io_handle);
        free(enc_worker->p_enc_ctx->p_session_config);
        free(enc_worker->p_enc_ctx);
    }
    free(enc_worker);
}

/*!
 * \brief  Stop a stack worker's thread, free its encoder workers, recycle
 *         hardware frames still referenced in g_hwframe_pool, close the
 *         scaler session and free the worker.
 *
 * The output file is closed only after the stack thread has been joined, and
 * the scaler context is checked for NULL before it is used to recycle pool
 * frames.
 *
 * \param[in] stack_worker  worker to destroy; NULL is ignored
 */
static void free_stack_worker(stack_worker_t *stack_worker) {
    if (!stack_worker)
        return;

    ni_log(NI_LOG_INFO, "cleanup and free stack_worker %d\n",
           stack_worker->index);
    void *result;
    stack_worker->should_exit = 1;

    // the pool bound is derived from the first encoder's context, so it has
    // to be read before the encoder workers are freed below
    int max_recycle_index = 0;
    if (stack_worker->nb_encoders && stack_worker->encoder_workers[0] &&
        stack_worker->encoder_workers[0]->p_enc_ctx) {
        max_recycle_index = NI_GET_MAX_HWDESC_FRAME_INDEX(
            stack_worker->encoder_workers[0]->p_enc_ctx->ddr_config);
    }

    // let stack thread finish first because it accesses some memory
    // allocated for encoder workers (e.g. buffered frame list)
    if (ni_pthread_join(stack_worker->tid, &result) == 0) {
        if ((long)result != 0) {
            ni_log(NI_LOG_INFO,
                   "pthread_join stack worker thread ret %ld\n",
                   (long)result);
            global_state = EXIT_FAILURE;
        }
    }

    // the stack thread has exited, so nothing can write to the file anymore
    if (stack_worker->out_fp)
        fclose(stack_worker->out_fp);

    // then free encoder workers
    for (int i = 0; i < stack_worker->nb_encoders; i++) {
        free_encoder_worker(stack_worker->encoder_workers[i]);
    }

    if (stack_worker->p_stack_ctx) {
        // clean up g_hwframe_pool: recycle every surface that is still
        // referenced, using the scaler session's device handle
        for (int i = 0; i < max_recycle_index; i++) {
            if (g_hwframe_pool[i].ref_cnt > 0 &&
                g_hwframe_pool[i].surface.ui16FrameIdx > 0) {
                g_hwframe_pool[i].surface.device_handle =
                    stack_worker->p_stack_ctx->device_handle;
                ni_hwframe_buffer_recycle2(&(g_hwframe_pool[i].surface));
                g_hwframe_pool[i].ref_cnt = 0;
            }
        }

        ni_device_session_close(stack_worker->p_stack_ctx, 0,
                                NI_DEVICE_TYPE_SCALER);
        ni_device_session_context_clear(stack_worker->p_stack_ctx);
        ni_device_close(stack_worker->p_stack_ctx->device_handle);
        ni_device_close(stack_worker->p_stack_ctx->blk_io_handle);
        free(stack_worker->p_stack_ctx);
    }

    ni_pthread_mutex_destroy(&stack_worker->lock);

    free(stack_worker);
}

/*!
 * \brief  Release everything owned by an upload worker without signalling or
 *         joining a thread.
 *
 * Closes the input file, destroys the frame entry's mutex/cond, recycles and
 * frees every buffered frame, closes the upload session and device handles,
 * and frees the YUV buffer and the worker. frame_entry is only touched when
 * it is non-NULL.
 *
 * \param[in] upl_worker  worker to destroy; NULL is ignored
 */
static void uploader_self_cleanup(upload_worker *upl_worker) {
    if (!upl_worker)
        return;

    close(upl_worker->pfs);
    if (upl_worker->frame_entry) {
        ni_pthread_mutex_destroy(&(upl_worker->frame_entry->lock));
        ni_pthread_cond_destroy(&(upl_worker->frame_entry->cond));

        ni_buffered_frame_list_t *frame_list =
            &upl_worker->frame_entry->frame_list;
        for (int i = 0; i < frame_list->size; i++) {
            ni_frame_t *frame = &(frame_list->frames[i].data.data.frame);
            // p_data[3] carries the hardware surface descriptor, if any
            if (frame->p_data[3]) {
                niFrameSurface1_t *surf =
                    (niFrameSurface1_t *)(frame->p_data[3]);
                if (surf->ui16FrameIdx)
                    ni_hwframe_buffer_recycle2(surf);
            }
            ni_frame_buffer_free(frame);
        }
        free(upl_worker->frame_entry);
    }
    if (upl_worker->p_upl_ctx) {
        ni_device_session_close(upl_worker->p_upl_ctx, 0,
                                NI_DEVICE_TYPE_UPLOAD);
        ni_device_session_context_clear(upl_worker->p_upl_ctx);
        ni_device_close(upl_worker->p_upl_ctx->device_handle);
        ni_device_close(upl_worker->p_upl_ctx->blk_io_handle);
        free(upl_worker->p_upl_ctx);
    }
    ni_aligned_free(upl_worker->yuv_buf);
    free(upl_worker);
}

/*
 * Stop the decoder send thread, release every resource owned by a decoder
 * worker, then free the worker.
 *
 * decoder_receive_thread() calls this on its own worker when force_exit is
 * set.
 *
 * dec_worker: worker to tear down; NULL is a no-op. The pointer must not be
 *             used after this returns.
 */
static void decoder_self_cleanup(decoder_worker *dec_worker) {
    if (!dec_worker)
        return;

    ni_frame_entry_t *entry = dec_worker->frame_entry;

    // Ask the send thread to stop and wake it in case it is waiting on the
    // frame-list condition.
    if (entry) {
        ni_pthread_mutex_lock(&(entry->lock));
        dec_worker->should_exit = 1;
        ni_pthread_cond_signal(&(entry->cond));
        ni_pthread_mutex_unlock(&(entry->lock));
    } else {
        dec_worker->should_exit = 1;
    }

    void *result = NULL;
    if (ni_pthread_join(dec_worker->send_tid, &result) == 0) {
        if ((long)result != 0) {
            ni_log(NI_LOG_INFO,
                   "pthread_join decoder worker send_thread ret %ld\n",
                   (long)result);
        }
    }

    // The send thread hands file_reader, stream_buf and p_stream_info to
    // decoder_send_data(), so these may only be released after the join
    // above. The original freed file_cache and stream_buf before joining.
    free(dec_worker->file_reader.file_cache);
    close(dec_worker->pfs);
    free(dec_worker->stream_buf);
    free(dec_worker->p_stream_info);

    if (entry) {
        ni_pthread_mutex_destroy(&(entry->lock));
        ni_pthread_cond_destroy(&(entry->cond));

        ni_buffered_frame_list_t *frame_list = &entry->frame_list;
        for (int i = 0; i < frame_list->size; i++) {
            ni_frame_t *frame = &(frame_list->frames[i].data.data.frame);
            // p_data[3] carries the hw surface descriptor when one is attached
            if (frame->p_data[3]) {
                niFrameSurface1_t *surf =
                    (niFrameSurface1_t *)(frame->p_data[3]);
                // a zero frame index is skipped, as in the original
                if (surf->ui16FrameIdx)
                    ni_hwframe_buffer_recycle2(surf);
            }
            ni_frame_buffer_free(frame);
        }
        free(entry);
        dec_worker->frame_entry = NULL;
    }

    if (dec_worker->p_dec_ctx) {
        ni_device_session_close(dec_worker->p_dec_ctx, 0,
                                NI_DEVICE_TYPE_DECODER);
        free(dec_worker->p_dec_ctx->p_session_config);
        ni_device_session_context_clear(dec_worker->p_dec_ctx);
        ni_device_close(dec_worker->p_dec_ctx->device_handle);
        ni_device_close(dec_worker->p_dec_ctx->blk_io_handle);
        free(dec_worker->p_dec_ctx);
    }

    free(dec_worker);
}

/*
 * Thread entry point for one upload worker (args is an upload_worker *).
 *
 * Each iteration waits until the worker's frame list has room, calls
 * hwupload_frame() to fill the slot at the list tail, enqueues that slot and
 * then updates the shared bookkeeping in upl_worker->common (ref_cnt,
 * ready_upl_worker_num, ready_frames, uploaded_frame_num) under common->lock.
 *
 * The loop ends on should_exit/force_exit, a non-zero hwupload_frame()
 * return, end_of_stream on the output frame, or when
 * common->total_xstack_threads is 0. On the way out the worker records its
 * exit in common unless force_exit is set; with force_exit set it calls
 * uploader_self_cleanup() on itself.
 *
 * Always returns NULL.
 */
void *hwupload_thread(void *args) {
    upload_worker *upl_worker = args;
    ni_session_context_t *p_upl_ctx = upl_worker->p_upl_ctx;
    ni_frame_entry_t *frame_entry = upl_worker->frame_entry;
    ni_buffered_frame_list_t *frame_list = &(frame_entry->frame_list);
    ni_session_data_io_t swin_data = {0};
    ni_session_data_io_t *p_swin_data = &swin_data;
    ni_session_data_io_t *p_out_data;
    niFrameSurface1_t *p_surface;
    // index: slot of the most recently enqueued frame; stays 0 if none was
    int ret, index = 0;
    int eos = 0;
    ni_log(NI_LOG_INFO, "upload %d hwupload thread start\n", upl_worker->index);

    for (;;) {
        // block hwupload thread if need_reconfig
        while (need_reconfig && !upl_worker->should_exit) {
            // wait until reconfig done
            ni_usleep(1000);
        }
        // wait (under the frame list lock) until there is a free slot,
        // bailing out as soon as an exit flag is seen
        ni_pthread_mutex_lock(&frame_entry->lock);
        if (upl_worker->should_exit || upl_worker->force_exit) {
            ni_pthread_mutex_unlock(&frame_entry->lock);
            goto end;
        }
        while (frame_list_is_full(frame_list)) {
            ni_pthread_cond_wait(&(frame_entry->cond), &(frame_entry->lock));
            if (upl_worker->should_exit || upl_worker->force_exit) {
                ni_pthread_mutex_unlock(&frame_entry->lock);
                goto end;
            }
        }
        ni_pthread_mutex_unlock(&frame_entry->lock);

        // the tail slot is filled without holding frame_entry->lock
        // NOTE(review): presumably only this thread advances tail - confirm
        p_out_data = &(frame_list->frames[frame_list->tail].data);
        frame_list->frames[frame_list->tail].ref_cnt = 0;
        ret = hwupload_frame(upl_worker, p_swin_data, p_out_data, &eos);
        p_surface = (niFrameSurface1_t *)(p_out_data->data.frame.p_data[3]);
        if (ret) {
            ni_log(NI_LOG_INFO, "Error: uploader %d upload_thread break!\n",
                   upl_worker->index);
            break;
        } else if (p_upl_ctx->status == NI_RETCODE_NVME_SC_WRITE_BUFFER_FULL) {
            // nothing is enqueued on this pass; retry after a short sleep
            ni_usleep(100);
            continue;
        } else if (p_out_data->data.frame.end_of_stream) {
            ni_log(NI_LOG_DEBUG, "uploader %d reach eos\n", upl_worker->index);
            ni_frame_buffer_free(&(p_swin_data->data.frame));
            break;
        }

        ni_log(NI_LOG_DEBUG, "uploader:%d uploaded frameIdx %u\n",
               upl_worker->index,
               p_surface->ui16FrameIdx);
        // remember the slot before enqueueing; the bookkeeping below is
        // keyed by it
        index = frame_list->tail;
        ni_pthread_mutex_lock(&frame_entry->lock);
        frame_list_enqueue(frame_list);
        ni_pthread_mutex_unlock(&frame_entry->lock);

        // shared bookkeeping: everything below runs under common->lock
        common_t *common = upl_worker->common;
        ni_pthread_mutex_lock(&common->lock);
        if (!common->total_xstack_threads) {
            // xstack threads are all exited, return
            upl_worker->should_exit = 1;
            ni_pthread_mutex_unlock(&common->lock);
            break;
        }
        upl_worker->upl_frame_num++;
        // the slot starts with one reference per xstack thread
        frame_list->frames[index].ref_cnt = common->total_xstack_threads;
        if (!upl_worker->force_exit) {
            if (common->shortest) {
                // shortest mode: slot is ready once ready_upl_worker_num
                // reaches active_upload_workers
                if (common->ready_upl_worker_num[index] < active_upload_workers)
                    common->ready_upl_worker_num[index]++;
                if (common->ready_upl_worker_num[index] ==
                    active_upload_workers) {
                    common->ready_frames[index] = 1;
                    common->uploaded_frame_num++;
                    common->ready_upl_worker_num[index] = 0;
                }
            } else {
                // taken when total_upl_threads has dropped below
                // active_upload_workers and this worker has not yet passed
                // max_exited_input_frame_num
                if (common->total_upl_threads < active_upload_workers &&
                    upl_worker->upl_frame_num <=
                        common->max_exited_input_frame_num) {
                    common->ready_upl_worker_num[index]++;
                    int exited_cnt = 0;
                    for (int i = 0; i < NI_MAX_XSTACK_INPUTS; i++) {
                        // NOTE(review): upload_workers[i] is dereferenced
                        // whenever input_eos[i] == index - confirm that
                        // unused inputs can never hold a matching value
                        if (common->input_eos[i] == index &&
                            (upload_workers[i]->upl_frame_num >=
                             upl_worker->upl_frame_num)) {
                            // the exited upl_worker has increased the
                            // ready_upl_worker_num, subtract it
                            common->ready_upl_worker_num[index]--;
                            exited_cnt++;
                            ni_log(NI_LOG_TRACE,
                                   "upload_workers[%d]->upl_frame_num "
                                   "%d, uploaded_frame_num %d, curr "
                                   "upl_worker %d upl_frame_num %d, "
                                   "ready_upl_worker_num %d\n",
                                   i, upload_workers[i]->upl_frame_num,
                                   common->uploaded_frame_num,
                                   upl_worker->index, upl_worker->upl_frame_num,
                                   common->ready_upl_worker_num[index]);
                        }
                    }
                    if (common->ready_upl_worker_num[index] + exited_cnt ==
                        active_upload_workers) {
                        common->ready_frames[index] = 1;
                        common->uploaded_frame_num++;
                        common->ready_upl_worker_num[index] = 0;
                    } else {
                        // restore the ready_upl_worker_num
                        common->ready_upl_worker_num[index] =
                            common->ready_upl_worker_num[index] + exited_cnt;
                    }
                } else {
                    // otherwise the slot is ready once ready_upl_worker_num
                    // reaches total_upl_threads
                    if (common->ready_upl_worker_num[index] <
                        common->total_upl_threads) {
                        common->ready_upl_worker_num[index]++;
                    } else {
                        ni_log(NI_LOG_ERROR,
                               "ERROR: droped one frame of upl_worker %d, "
                               "upl upl_frame_num %d, index %d\n",
                               upl_worker->index, upl_worker->upl_frame_num,
                               index);
                    }
                    if (common->ready_upl_worker_num[index] ==
                        common->total_upl_threads) {
                        common->ready_frames[index] = 1;
                        common->uploaded_frame_num++;
                        common->ready_upl_worker_num[index] = 0;
                    }
                }
            }
        }
        ni_pthread_mutex_unlock(&common->lock);
    }

// reached both by goto and by every break out of the loop above
end:
    ni_log(NI_LOG_INFO, "uploader %d exit\n", upl_worker->index);
    ni_pthread_mutex_lock(&(upl_worker->common->lock));
    upl_worker->should_exit = 1;
    if (!upl_worker->force_exit) {
        // publish how far this input got and which slot held its last frame
        if (upl_worker->common->max_exited_input_frame_num <
            upl_worker->upl_frame_num) {
            upl_worker->common->max_exited_input_frame_num =
                upl_worker->upl_frame_num;
        }
        upl_worker->common->exit_upl_num++;
        upl_worker->common->total_upl_threads--;
        upl_worker->common->input_eos[upl_worker->index] = index;
        ni_log(NI_LOG_DEBUG, "set input_eos[%d] to %d\n", upl_worker->index,
               index);
        if (!upl_worker->common->shortest) {
            frame_list->frames[index].ref_cnt =
                0xFF; // mark the last frame to not recycle
        }
    }
    ni_pthread_mutex_unlock(&(upl_worker->common->lock));
    // clean up and self destroy if early end, since main thread won't track
    // it any more
    if (upl_worker->force_exit) {
        uploader_self_cleanup(upl_worker);
    }
    return NULL;
}

/*
 * Thread entry point that feeds frames to one encoder (args is an
 * encoder_worker *).
 *
 * If the worker is not started yet, the thread blocks on start_cond, opens
 * the encoder with init_open_encoder() using the frame at the head of the
 * frame list, prepares that frame as the first encoder input and signals
 * opened_cond.
 *
 * It then loops: wait until the frame list is non-empty, pass the current
 * input to encoder_send_data(), and dequeue the head unless the device state
 * reports enc_resend. The loop stops on should_exit, a negative
 * encoder_send_data() return, or once enc_eos_sent is reported.
 *
 * Always returns NULL.
 */
void *encoder_send_thread(void *args) {
    encoder_worker *enc_worker = args;
    ni_session_context_t *p_enc_ctx = enc_worker->p_enc_ctx;
    ni_buffered_frame_list_t *frame_list =
        &(enc_worker->frame_entry->frame_list);
    int ret;
    ni_session_data_io_t *p_in_data = NULL;
    ni_session_data_io_t enc_send_data = {0};
    ni_session_data_io_t *p_enc_send_data = &enc_send_data;
    ni_frame_t *p_in_frame = NULL;
    ni_session_data_io_t *p_first_frame_data = NULL;
    ni_frame_t *p_first_frame = NULL;
    niFrameSurface1_t *p_surface;
    int eos = 0;
    device_state_t xcoder_state = {0};
    device_state_t *p_xcoder_state = &xcoder_state;
    if (!enc_worker->started) {
        ni_log(NI_LOG_DEBUG, "encoder %d wait start cond\n", enc_worker->index);
        // wait for encoder start_cond
        ni_pthread_mutex_lock(&(enc_worker->lock));
        if (enc_worker->should_exit) {
            ni_pthread_mutex_unlock(&(enc_worker->lock));
            return NULL;
        }
        // NOTE(review): single wait with no predicate loop; a spurious
        // wakeup would fall through to the encoder open below - confirm
        // this is acceptable
        ni_pthread_cond_wait(&(enc_worker->start_cond), &(enc_worker->lock));
        if (enc_worker->should_exit) {
            ni_pthread_mutex_unlock(&(enc_worker->lock));
            return NULL;
        }
        ni_pthread_mutex_unlock(&(enc_worker->lock));

        // the frame at the list head supplies the parameters for opening
        p_first_frame_data = &(frame_list->frames[frame_list->head].data);
        p_first_frame = &(p_first_frame_data->data.frame);
        niFrameSurface1_t *p_first_surface =
            (niFrameSurface1_t *)(p_first_frame->p_data[3]);
        ret = init_open_encoder(enc_worker, p_first_frame, p_first_surface);
        // NOTE(review): both early returns below leave started false and
        // never signal opened_cond, which encoder_receive_thread waits on -
        // confirm some other path wakes that thread
        if (ret) {
            ni_log(NI_LOG_ERROR, "Failed to init and open encoder, exit\n");
            return NULL;
        }

        if (!p_enc_ctx->hw_action) {
            ni_log(NI_LOG_ERROR, "ERROR: hwframe only for this demo\n");
            return NULL;
        }
        ni_log(NI_LOG_DEBUG, "encoder %d send_thread start\n",
               enc_worker->index);
        enc_worker->started = true;
        // prep the first enc input frame
        prepare_enc_input_frame(
            p_enc_ctx, p_first_frame_data, p_enc_send_data,
            p_first_frame->pixel_format, p_first_frame->video_width,
            p_first_frame->video_height);
        ni_log(NI_LOG_INFO,
               "First frame pixel_format %d, video_width %d, video_height %d\n",
               p_first_frame->pixel_format, p_first_frame->video_width,
               p_first_frame->video_height);

        // tell the receive thread the encoder is open
        ni_pthread_mutex_lock(&(enc_worker->lock));
        ni_pthread_cond_signal(&(enc_worker->opened_cond));
        ni_pthread_mutex_unlock(&(enc_worker->lock));
    }
    for (;;) {
        // wait (under the frame list lock) until a frame is queued,
        // leaving as soon as should_exit is seen
        ni_pthread_mutex_lock(&(enc_worker->frame_entry->lock));
        if (enc_worker->should_exit) {
            ni_pthread_mutex_unlock(&(enc_worker->frame_entry->lock));
            goto end;
        }
        while (frame_list_is_empty(&(enc_worker->frame_entry->frame_list))) {
            ni_pthread_cond_wait(&(enc_worker->frame_entry->cond),
                                 &(enc_worker->frame_entry->lock));
            if (enc_worker->should_exit) {
                ni_pthread_mutex_unlock(&(enc_worker->frame_entry->lock));
                goto end;
            }
        }
        ni_pthread_mutex_unlock(&(enc_worker->frame_entry->lock));

        // while the context's frame_num is still 0 the prepared first frame
        // is sent; afterwards frames come straight from the list head
        if (p_enc_ctx->frame_num == 0) {
            // send the first frame
            p_in_data = p_enc_send_data;
        } else {
            p_in_data = &(frame_list->frames[frame_list->head].data);
        }

        eos = p_in_data->data.frame.end_of_stream;
        p_in_frame = &p_in_data->data.frame;
        // flag start_of_stream on the first frame handed to the encoder
        if (!enc_worker->got_sos) {
            p_in_frame->start_of_stream = 1;
            enc_worker->got_sos = 1;
        }

        ret = encoder_send_data(p_enc_ctx, p_in_data, enc_worker->input_width,
                                enc_worker->input_height, p_xcoder_state, eos);
        if (ret < 0) // Error
        {
            // pre close cleanup will clear it out
            p_surface = (niFrameSurface1_t *)p_in_frame->p_data[3];
            ni_hwframe_buffer_recycle2(p_surface);
            ni_frame_buffer_free(p_in_frame);
            ni_log(NI_LOG_ERROR, "Error: encoder send_thread break!\n");
            break;
        }

        // when enc_resend is set the head is not dequeued, so the same
        // slot is presented again on the next pass
        if (!p_xcoder_state->enc_resend) {
            p_surface = (niFrameSurface1_t *)p_in_frame->p_data[3];
            ni_log(NI_LOG_DEBUG, "encoder %d sent 1 frame, index %d\n",
                   enc_worker->index, p_surface->ui16FrameIdx);

            ni_pthread_mutex_lock(&(enc_worker->frame_entry->lock));
            // signal the list condition only when the list was full
            // before this dequeue
            if (frame_list_is_full(frame_list)) {
                frame_list_dequeue(frame_list);
                ni_pthread_cond_signal(&(enc_worker->frame_entry->cond));
            } else
                frame_list_dequeue(frame_list);
            ni_pthread_mutex_unlock(&(enc_worker->frame_entry->lock));

            ni_frame_wipe_aux_data(p_in_frame);
        }

        if (p_xcoder_state->enc_eos_sent) // eos
        {
            ni_log(NI_LOG_INFO,
                   "encoder %d send_thread reach eos, should exit\n",
                   enc_worker->index);
            break;
        }
    }

// reached by goto and by every break out of the send loop
end:
    ni_frame_buffer_free(&(p_enc_send_data->data.frame));
    return NULL;
}

/*
 * Thread entry point that pulls encoder output for one encoder worker
 * (args is an encoder_worker *).
 *
 * Waits on opened_cond until the worker is marked started, then keeps
 * calling encoder_receive() until it returns non-zero or should_exit is set.
 * If should_exit is seen before the receive loop begins, the output file is
 * left untouched. On every path common->exit_enc_num is incremented under
 * common->lock before returning.
 *
 * Always returns NULL.
 */
void *encoder_receive_thread(void *args) {
    encoder_worker *worker = args;
    ni_session_data_io_t recv_pkt = {0};
    device_state_t enc_state = {0};
    uint32_t pkt_count = 0;
    int cancelled;

    ni_log(NI_LOG_DEBUG, "===> encoder_receive_thread <===\n");

    // Wait for the encoder to be opened, unless an exit is requested first.
    ni_pthread_mutex_lock(&(worker->lock));
    cancelled = worker->should_exit ? 1 : 0;
    while (!cancelled && !worker->started) {
        ni_pthread_cond_wait(&(worker->opened_cond), &(worker->lock));
        cancelled = worker->should_exit ? 1 : 0;
    }
    ni_pthread_mutex_unlock(&(worker->lock));

    if (!cancelled) {
        ni_log(NI_LOG_INFO, "Encoder %d receive thread started\n",
               worker->index);

        while (!worker->should_exit) {
            int rc = encoder_receive(worker, &recv_pkt, &pkt_count,
                                     worker->out_fp, &enc_state);
            if (rc)
                break;
        }

        ni_log(NI_LOG_INFO, "encoder %d receive thread exit\n",
               worker->index);
        fclose(worker->out_fp);
        ni_packet_buffer_free(&(recv_pkt.data.packet));
    }

    // Account for this thread's exit in the shared state.
    ni_pthread_mutex_lock(&(worker->common->lock));
    worker->common->exit_enc_num++;
    ni_pthread_mutex_unlock(&(worker->common->lock));

    return NULL;
}

void *decoder_send_thread(void *args) {
    decoder_worker *dec_worker = args;
    ni_session_context_t *p_dec_ctx = dec_worker->p_dec_ctx;
    ni_frame_entry_t *frame_entry = dec_worker->frame_entry;
    ni_buffered_frame_list_t *frame_list = &(frame_entry->frame_list);
    int retval;
    ni_log(NI_LOG_DEBUG,
           "decoder %d send_thread start: async_mode %d decoder_low_delay %d\n",
           dec_worker->index, p_dec_ctx->async_mode,
           p_dec_ctx->decoder_low_delay);
    ni_session_data_io_t in_pkt = {0};
    device_state_t xcoder_state = {0};
    device_state_t *p_xcoder_state = &xcoder_state;

    for (;;) {
        ni_pthread_mutex_lock(&frame_entry->lock);
        if (dec_worker->should_exit || dec_worker->force_exit) {
            ni_pthread_mutex_unlock(&frame_entry->lock);
            return NULL;
        }
        while (frame_list_is_full(frame_list)) {
            ni_pthread_cond_wait(&frame_entry->cond, &frame_entry->lock);
            if (dec_worker->should_exit || dec_worker->force_exit) {
                ni_pthread_mutex_unlock(&frame_entry->lock);
                return NULL;
            }
        }
        ni_pthread_mutex_unlock(&frame_entry->lock);

        retval = decoder_send_data(
            dec_worker->p_dec_ctx, &in_pkt, p_xcoder_state,
            &(dec_worker->file_reader), dec_worker->p_stream_info,
            dec_worker->width, dec_worker->height, dec_worker->stream_buf);
        if (retval < 0) // Error
        {
            dec_worker->should_exit = 1;
            fprintf(stderr,
                    "Error: decoder %d send_thread break, force exit!\n",
                    dec_worker->index);
            break;
        } else if (p_xcoder_state->dec_eos_sent) // eos
        {
            ni_log(NI_LOG_DEBUG, "decoder %d send_thread reach eos\n",
                   dec_worker->index);
            break;
        }
    }
    ni_log(NI_LOG_TRACE, "decoder %d send_thread exit\n", dec_worker->index);
    return NULL;
}

/*
 * Thread entry point that collects decoded frames for one decoder worker
 * (args is a decoder_worker *).
 *
 * Each iteration waits until the worker's frame list has room, calls
 * decoder_receive_data() into the slot at the list tail, fills in the
 * surface descriptor from the decoder context, enqueues the slot and then
 * updates the shared bookkeeping in dec_worker->common (ref_cnt,
 * ready_dec_worker_num, ready_frames, decoded_frame_num) under common->lock.
 *
 * The loop ends on should_exit/force_exit, a negative decoder_receive_data()
 * return, end_of_stream on the received frame, or when
 * common->total_xstack_threads is 0. On the way out the worker records its
 * exit in common unless force_exit is set; with force_exit set it calls
 * decoder_self_cleanup() on itself.
 *
 * Always returns NULL.
 */
void *decoder_receive_thread(void *args) {
    decoder_worker *dec_worker = args;
    ni_session_data_io_t *p_out_data = NULL;
    ni_frame_t *p_ni_frame = NULL;
    ni_frame_entry_t *frame_entry = dec_worker->frame_entry;
    ni_buffered_frame_list_t *frame_list = &(frame_entry->frame_list);
    device_state_t xcoder_state = {0};
    device_state_t *p_xcoder_state = &xcoder_state;
    int retval;
    // slot of the most recently enqueued frame; stays 0 if none was
    int index = 0;

    // only the hw-frame path is handled by this thread
    if (!(dec_worker->p_dec_ctx->hw_action == NI_CODEC_HW_ENABLE)) {
        ni_log(NI_LOG_ERROR, "Error: decoder %d is not in hw mode\n",
               dec_worker->index);
        goto end;
    }
    ni_log(NI_LOG_DEBUG, "decoder %d receive_thread start\n",
           dec_worker->index);
    for (;;) {
        // block dec receive thread if need_reconfig
        while (need_reconfig && !dec_worker->should_exit) {
            // wait until reconfig done
            ni_usleep(1000);
        }
        // block dec thread if waiting to free queue is full
        ni_pthread_mutex_lock(&frame_entry->lock);
        if (dec_worker->should_exit || dec_worker->force_exit) {
            ni_pthread_mutex_unlock(&frame_entry->lock);
            goto end;
        }
        while (frame_list_is_full(frame_list)) {
            ni_pthread_cond_wait(&frame_entry->cond, &frame_entry->lock);
            if (dec_worker->should_exit || dec_worker->force_exit) {
                ni_pthread_mutex_unlock(&frame_entry->lock);
                goto end;
            }
        }
        ni_pthread_mutex_unlock(&frame_entry->lock);

        // dec recv thread operates tail, while stack thread operates head
        p_out_data = &(frame_list->frames[frame_list->tail].data);
        frame_list->frames[frame_list->tail].ref_cnt = 0;
        p_ni_frame = &p_out_data->data.frame;
        retval = decoder_receive_data(dec_worker->p_dec_ctx, p_out_data,
                                      p_xcoder_state, dec_worker->width,
                                      dec_worker->height, NULL);
        if (retval < 0) // Error
        {
            ni_frame_buffer_free(p_ni_frame);
            dec_worker->should_exit = 1;
            fprintf(stderr,
                    "Error: decoder_receive_thread %p of decoder %d break!\n",
                    &(dec_worker->recv_tid), dec_worker->index);
            break;
        } else if (p_ni_frame->end_of_stream) // eos
        {
            ni_log(NI_LOG_DEBUG, "decoder %d receive_thread reach eos\n",
                   dec_worker->index);
            break;
        } else if (retval == 2) // eagain
        {
            // sleep to switch into other threads
            ni_usleep(100);
        } else {
            // a frame was received: stamp its surface descriptor with the
            // geometry and format reported by the decoder context
            niFrameSurface1_t *recv_frame =
                ((niFrameSurface1_t *)((uint8_t *)p_ni_frame->p_data[3]));
            ni_log(NI_LOG_DEBUG, "decoder recv:%u, tail:%d\n",
                   recv_frame->ui16FrameIdx, frame_list->tail);
            recv_frame->ui16width = dec_worker->p_dec_ctx->actual_video_width;
            recv_frame->ui16height = dec_worker->p_dec_ctx->active_video_height;
            recv_frame->ui32nodeAddress = 0;
            recv_frame->bit_depth = dec_worker->p_dec_ctx->bit_depth_factor;
            // 1 for the two YUV420P pixel formats, 0 otherwise; overridden
            // below for the tiled 4x4 format
            recv_frame->encoding_type =
                (dec_worker->p_dec_ctx->pixel_format == NI_PIX_FMT_YUV420P) ||
                (dec_worker->p_dec_ctx->pixel_format == NI_PIX_FMT_YUV420P10LE);
            if (dec_worker->p_dec_ctx->pixel_format == NI_PIX_FMT_8_TILED4X4)
                recv_frame->encoding_type = NI_PIXEL_PLANAR_FORMAT_TILED4X4;
            // remember the slot before enqueueing; the bookkeeping below is
            // keyed by it
            index = frame_list->tail;

            ni_pthread_mutex_lock(&frame_entry->lock);
            frame_list_enqueue(frame_list);
            ni_pthread_mutex_unlock(&frame_entry->lock);

            // shared bookkeeping: everything below runs under common->lock
            common_t *common = dec_worker->common;
            ni_pthread_mutex_lock(&common->lock);
            if (!common->total_xstack_threads) {
                // xstack threads are all exited, return
                dec_worker->should_exit = 1;
                ni_pthread_mutex_unlock(&common->lock);
                break;
            }
            // the slot starts with one reference per xstack thread
            frame_list->frames[index].ref_cnt = common->total_xstack_threads;
            dec_worker->recv_frame_num++;
            if (!dec_worker->force_exit) {
                if (common->shortest) {
                    // shortest mode: slot is ready once ready_dec_worker_num
                    // reaches active_decoder_workers
                    if (common->ready_dec_worker_num[index] <
                        active_decoder_workers)
                        common->ready_dec_worker_num[index]++;
                    if (common->ready_dec_worker_num[index] ==
                        active_decoder_workers) {
                        common->ready_frames[index] = 1;
                        common->decoded_frame_num++;
                        common->ready_dec_worker_num[index] = 0;
                    }
                } else {
                    // taken when total_dec_threads has dropped below
                    // active_decoder_workers and this worker has not yet
                    // passed max_exited_input_frame_num
                    if (common->total_dec_threads < active_decoder_workers &&
                        dec_worker->recv_frame_num <=
                            common->max_exited_input_frame_num) {
                        common->ready_dec_worker_num[index]++;
                        int exited_cnt = 0;
                        for (int i = 0; i < NI_MAX_XSTACK_INPUTS; i++) {
                            // NOTE(review): decoder_workers[i] is
                            // dereferenced whenever input_eos[i] == index -
                            // confirm unused inputs can never match
                            if (common->input_eos[i] == index &&
                                (decoder_workers[i]->recv_frame_num >=
                                 dec_worker->recv_frame_num)) {
                                // the exited dec_worker has increased the
                                // ready_dec_worker_num, subtract it
                                common->ready_dec_worker_num[index]--;
                                exited_cnt++;
                                ni_log(NI_LOG_TRACE,
                                       "decoder_workers[%d]->recv_frame_num "
                                       "%d, decoded_frame_num %d, curr "
                                       "dec_worker %d recv_frame_num %d, "
                                       "ready_dec_worker_num %d\n",
                                       i, decoder_workers[i]->recv_frame_num,
                                       common->decoded_frame_num,
                                       dec_worker->index,
                                       dec_worker->recv_frame_num,
                                       common->ready_dec_worker_num[index]);
                            }
                        }
                        if (common->ready_dec_worker_num[index] + exited_cnt ==
                            active_decoder_workers) {
                            common->ready_frames[index] = 1;
                            common->decoded_frame_num++;
                            common->ready_dec_worker_num[index] = 0;
                        } else {
                            // restore the ready_dec_worker_num
                            common->ready_dec_worker_num[index] =
                                common->ready_dec_worker_num[index] +
                                exited_cnt;
                        }
                    } else {
                        // otherwise the slot is ready once
                        // ready_dec_worker_num reaches total_dec_threads
                        if (common->ready_dec_worker_num[index] <
                            common->total_dec_threads) {
                            common->ready_dec_worker_num[index]++;
                        } else {
                            ni_log(NI_LOG_ERROR,
                                   "ERROR: droped one frame of dec_worker %d, "
                                   "dec recv_frame_num %d, index %d\n",
                                   dec_worker->index,
                                   dec_worker->recv_frame_num, index);
                        }
                        if (common->ready_dec_worker_num[index] ==
                            common->total_dec_threads) {
                            common->ready_frames[index] = 1;
                            common->decoded_frame_num++;
                            common->ready_dec_worker_num[index] = 0;
                        }
                    }
                }
            }
            ni_pthread_mutex_unlock(&common->lock);
        }
    }

// reached both by goto and by every break out of the loop above
end:
    ni_log(NI_LOG_INFO, "decoder %d receive_thread exit\n", dec_worker->index);
    ni_pthread_mutex_lock(&(dec_worker->common->lock));
    if (!dec_worker->force_exit) {
        // publish how far this input got and which slot held its last frame
        if (dec_worker->common->max_exited_input_frame_num <
            dec_worker->recv_frame_num) {
            dec_worker->common->max_exited_input_frame_num =
                dec_worker->recv_frame_num;
        }
        dec_worker->common->exit_dec_num++;
        dec_worker->common->total_dec_threads--;
        dec_worker->common->input_eos[dec_worker->index] =
            index; // record the position of the last received frame
        ni_log(NI_LOG_DEBUG, "set input_eos[%d] to %d\n", dec_worker->index,
               index);
        if (!dec_worker->common->shortest) {
            frame_list->frames[index].ref_cnt =
                0xFF; // mark the last frame to not recycle
        }
    }
    ni_pthread_mutex_unlock(&(dec_worker->common->lock));
    // clean up and self destroy if early end, since main thread won't track
    // it any more
    if (dec_worker->force_exit) {
        decoder_self_cleanup(dec_worker);
    }
    return NULL;
}

/*
 * Start the hw upload thread (hwupload_thread) for an upload worker.
 *
 * upl_worker: worker whose thread is to be started. If thread creation
 *             fails the worker is released with free_upload_worker() and
 *             must not be used by the caller afterwards.
 *
 * Returns 0 on success, the pthread error code if the thread attributes
 * could not be set up, or NI_RETCODE_FAILURE if the thread could not be
 * created.
 */
int upl_worker_thread_run(upload_worker *upl_worker) {
    ni_log(NI_LOG_DEBUG, "Uploader %d: input wxh %dx%d, pix_fmt %d\n",
           upl_worker->index, upl_worker->in_width, upl_worker->in_height,
           upl_worker->pixel_format);
    ni_pthread_attr_t attr;
    int ret;
    ret = pthread_attr_init(&attr);
    if (ret) {
        ni_log(NI_LOG_ERROR, "failed to init attr %d\n", ret);
        return ret;
    }
    ret = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
    if (ret) {
        ni_log(NI_LOG_ERROR, "failed to set attr %d\n", ret);
        pthread_attr_destroy(&attr);
        return ret;
    }

    int create_failed = ni_pthread_create(&(upl_worker->tid), &attr,
                                          hwupload_thread, upl_worker);
    // The attribute object is no longer needed once creation was attempted;
    // the original only destroyed it on the failure path.
    pthread_attr_destroy(&attr);
    if (create_failed) {
        ni_log(NI_LOG_ERROR, "Uploader %d: failed to create hw upload thread\n",
               upl_worker->index);
        free_upload_worker(upl_worker);
        return NI_RETCODE_FAILURE;
    }
    return ret;
}

/*
 * Start the send and receive threads (decoder_send_thread,
 * decoder_receive_thread) for a decoder worker.
 *
 * dec_worker: worker whose threads are to be started. If either thread
 *             cannot be created the worker is released with
 *             free_decoder_worker() and must not be used by the caller
 *             afterwards.
 *
 * Returns 0 on success, the pthread error code if the thread attributes
 * could not be set up, or NI_RETCODE_FAILURE if a thread could not be
 * created.
 */
int dec_worker_thread_run(decoder_worker *dec_worker) {
    ni_log(NI_LOG_DEBUG, "Decoder %d: %dx%d HWFrames %d\n", dec_worker->index,
           dec_worker->width, dec_worker->height,
           dec_worker->p_dec_ctx->hw_action);
    ni_pthread_attr_t attr;
    int ret = 0;
    ret = pthread_attr_init(&attr);
    if (ret) {
        ni_log(NI_LOG_ERROR, "failed to init attr %d.\n", ret);
        return ret;
    }
    ret = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
    if (ret) {
        ni_log(NI_LOG_ERROR, "failed to set attr %d.\n", ret);
        pthread_attr_destroy(&attr);
        return ret;
    }

    if (ni_pthread_create(&(dec_worker->send_tid), &attr, decoder_send_thread,
                          dec_worker)) {
        fprintf(stderr, "Error: create decoder send thread failed in decode "
                        "mode\n");
        pthread_attr_destroy(&attr);
        free_decoder_worker(dec_worker);
        return NI_RETCODE_FAILURE;
    }

    int recv_failed = ni_pthread_create(&(dec_worker->recv_tid), &attr,
                                        decoder_receive_thread, dec_worker);
    // Both creation attempts are done, so the attribute object can go; the
    // original only destroyed it on the failure paths.
    pthread_attr_destroy(&attr);
    if (recv_failed) {
        fprintf(stderr, "Error: create decoder receive thread failed in decode "
                        "mode\n");
        // NOTE(review): the send thread is already running at this point -
        // confirm free_decoder_worker() stops it before releasing the worker
        free_decoder_worker(dec_worker);
        return NI_RETCODE_FAILURE;
    }
    return ret;
}

/*
 * Start the send and receive threads (encoder_send_thread,
 * encoder_receive_thread) for an encoder worker.
 *
 * enc_worker: worker whose threads are to be started. If either thread
 *             cannot be created the worker is released with
 *             free_encoder_worker() and must not be used by the caller
 *             afterwards.
 *
 * Returns 0 on success, the pthread error code if the thread attributes
 * could not be set up, or NI_RETCODE_FAILURE if a thread could not be
 * created.
 */
int enc_worker_thread_run(encoder_worker *enc_worker) {
    int ret = 0;
    ni_pthread_attr_t attr;
    ret = pthread_attr_init(&attr);
    if (ret) {
        ni_log(NI_LOG_ERROR, "failed to init attr %d.\n", ret);
        return ret;
    }
    ret = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
    if (ret) {
        ni_log(NI_LOG_ERROR, "failed to set attr %d\n", ret);
        pthread_attr_destroy(&attr);
        return ret;
    }

    if (ni_pthread_create(&(enc_worker->send_tid), &attr, encoder_send_thread,
                          enc_worker)) {
        ni_log(NI_LOG_ERROR, "Error: failed to create encoder send thread\n");
        pthread_attr_destroy(&attr);
        free_encoder_worker(enc_worker);
        return NI_RETCODE_FAILURE;
    }

    int recv_failed = ni_pthread_create(&(enc_worker->recv_tid), &attr,
                                        encoder_receive_thread, enc_worker);
    // Both creation attempts are done, so the attribute object can go; the
    // original only destroyed it on the failure paths.
    pthread_attr_destroy(&attr);
    if (recv_failed) {
        ni_log(NI_LOG_ERROR,
               "Error: failed to create encoder receive thread\n");
        // NOTE(review): the send thread is already running at this point -
        // confirm free_encoder_worker() stops it before releasing the worker
        free_encoder_worker(enc_worker);
        return NI_RETCODE_FAILURE;
    }
    return ret;
}

/*!*****************************************************************************
 *  \brief  Apply participant addition/removal and switch the xstack layout to
 *          the new filter description.
 *
 *  In decode modes (D2E/D2L) sources map to entries of decoder_workers,
 *  otherwise (U2E/U2L) to entries of upload_workers. Removed workers are taken
 *  out of the worker array (remaining entries are shifted down and
 *  re-indexed); workers that already hit EOS are cleaned up here, the others
 *  are flagged with force_exit and woken up. New workers are appended to the
 *  array, initialised and started. Finally every stack worker gets the new
 *  input count and stack items parsed from new_xstack_desc, and the counters
 *  in common_t are updated.
 *
 *  NOTE(review): the in-code notes below suggest this runs with common->lock
 *  held and with the producer threads sleeping - confirm against the caller.
 *
 *  \param[in] src_to_remove      sources to remove, matched by file_name
 *  \param[in] num_src_to_remove  number of valid entries in src_to_remove
 *  \param[in] src_to_add         sources to add
 *  \param[in] num_src_to_add     number of valid entries in src_to_add
 *  \param[in] new_xstack_desc    new xstack filter description
 ******************************************************************************/
void reconfig_xstack_layout(ni_src_desc_t src_to_remove[NI_MAX_XSTACK_INPUTS],
                            int num_src_to_remove,
                            ni_src_desc_t src_to_add[NI_MAX_XSTACK_INPUTS],
                            int num_src_to_add, char *new_xstack_desc) {
    int to_remove_eos[NI_MAX_XSTACK_INPUTS] = {0};
    int i, j, ret, src_idx, total_active, num_to_remove = 0;
    common_t *common = stack_workers[0]->common;

    if (stack_workers[0]->mode == XSTACK_APP_D2E ||
        stack_workers[0]->mode == XSTACK_APP_D2L) {
        decoder_worker *to_remove[NI_MAX_XSTACK_INPUTS] = {0};
        decoder_worker *dec_worker;
        total_active = active_decoder_workers;

        ni_log(NI_LOG_INFO,
               "before reconfig common_t total_dec_threads: %d, "
               "exit_dec_num: %d, ready_dec_worker_num: [%d, %d, %d, %d], "
               "ready_frames: [%d, %d, %d, %d], decoded_frame_num %d, "
               "stack_frame_num %d\n",
               common->total_dec_threads, common->exit_dec_num,
               common->ready_dec_worker_num[0], common->ready_dec_worker_num[1],
               common->ready_dec_worker_num[2], common->ready_dec_worker_num[3],
               common->ready_frames[0], common->ready_frames[1],
               common->ready_frames[2], common->ready_frames[3],
               common->decoded_frame_num, common->stack_frame_num);

        ni_log(NI_LOG_INFO, "%s to_remove %d to_add %d!\n", __func__,
               num_src_to_remove, num_src_to_add);

        // pull every matching worker out of decoder_workers, keeping the
        // array (and the parallel input_eos array) dense
        for (src_idx = 0; src_idx < num_src_to_remove; src_idx++) {
            for (i = 0; i < total_active; i++) {
                if (0 == strcmp(decoder_workers[i]->filename,
                                src_to_remove[src_idx].file_name)) {
                    to_remove[num_to_remove] = decoder_workers[i];
                    to_remove_eos[num_to_remove] = common->input_eos[i];
                    num_to_remove++;
                    total_active--;
                    for (j = i; j < total_active; j++) {
                        decoder_workers[j] = decoder_workers[j + 1];
                        decoder_workers[j]->index = j;
                        common->input_eos[j] = common->input_eos[j + 1];
                    }
                    decoder_workers[total_active] = NULL;
                    common->input_eos[total_active] = -1;
                    break;
                }
            }
        }

        if (num_src_to_remove && total_active == active_decoder_workers) {
            ni_log(NI_LOG_ERROR, "%s no decoder found for removal !\n",
                   __func__);
            num_src_to_remove = 0;
        }

        ni_frame_entry_t *frame_entry;
        ni_buffered_frame_list_t *frame_list;
        for (i = 0; i < num_to_remove; i++) {
            dec_worker = to_remove[i];
            // handle already exited (receive eos early) dec_worker
            if (to_remove_eos[i] != -1) {
                common->exit_dec_num--;
                common->total_dec_threads++;
                // clean and free dec_worker
                decoder_self_cleanup(dec_worker);
            } else {
                frame_entry = dec_worker->frame_entry;
                frame_list = &(frame_entry->frame_list);
                int index = frame_list->tail - 1;
                if (index < 0) {
                    index = frame_list->size - 1;
                }
                // NOTE: assume that when doing reconfig, decoder receive
                // threads are sleeping
                int pending_frame_num =
                    dec_worker->recv_frame_num - common->decoded_frame_num;
                ni_log(
                    NI_LOG_INFO,
                    "%s decoder idx %d force_exit 1, recv_frame_num %d, index "
                    "%d, pending_frame_num %d!\n",
                    __func__, dec_worker->index, dec_worker->recv_frame_num,
                    index, pending_frame_num);
                // take back this worker's contribution to the ready counters,
                // walking backwards from its newest frame; "> 0" keeps a
                // negative difference from turning into a runaway loop
                while (pending_frame_num > 0) {
                    common->ready_dec_worker_num[index]--;
                    index -= 1;
                    if (index < 0) {
                        index = frame_list->size - 1; // circular queue
                    }
                    pending_frame_num--;
                }

                // NOTE: here dec_worker->frame_entry->lock is acquired inside
                // of common->lock context, try to avoid later
                ni_pthread_mutex_lock(&(dec_worker->frame_entry->lock));
                dec_worker->force_exit = 1;
                ni_pthread_cond_broadcast(&(dec_worker->frame_entry->cond));
                ni_pthread_mutex_unlock(&(dec_worker->frame_entry->lock));
            }
        }

        active_decoder_workers = total_active;

        if (num_src_to_add &&
            total_active + num_src_to_add > NI_MAX_XSTACK_INPUTS) {
            ni_log(NI_LOG_ERROR,
                   "%s existing %d + %d to add exceeding max "
                   "allowed: %d, not adding !\n",
                   __func__, total_active, num_src_to_add,
                   NI_MAX_XSTACK_INPUTS);
            num_src_to_add = 0;
        }

        j = total_active; // the first empty slot in the decoder_workers array
        for (i = 0; i < num_src_to_add; i++) {
            decoder_workers[j] = malloc(sizeof(decoder_worker));
            if (!decoder_workers[j]) {
                ni_log(NI_LOG_ERROR, "failed to alloc new decoder worker.\n");
                break;
            }
            memset(decoder_workers[j], 0, sizeof(decoder_worker));
            // j == 0 when every previous input was removed: there is no
            // predecessor to derive the index or the common context from
            decoder_workers[j]->index =
                (j > 0) ? decoder_workers[j - 1]->index + 1 : 0;
            decoder_workers[j]->common =
                (j > 0) ? decoder_workers[0]->common : common;
            decoder_workers[j]->file_reader.loop =
                decoder_workers[0]->file_reader.loop;
            ni_log(NI_LOG_INFO, "new decoder file loop %llu\n",
                   decoder_workers[j]->file_reader.loop);
            decoder_workers[j]->pfs = open(src_to_add[i].file_name, O_RDONLY);
            // open() reports failure with -1; 0 is a valid descriptor
            if (decoder_workers[j]->pfs < 0) {
                ni_log(NI_LOG_ERROR, "Error: cannot open %s\n",
                       src_to_add[i].file_name);
                ni_log(NI_LOG_ERROR, "Error: input file read failure\n");
                free(decoder_workers[j]);
                decoder_workers[j] = NULL;
                break;
            }
            strcpy(decoder_workers[j]->filename, src_to_add[i].file_name);
            decoder_workers[j]->codec_type =
                get_decoder_type(src_to_add[i].decoder_name);
            if (decoder_workers[j]->codec_type < 0) {
                ni_log(NI_LOG_ERROR, "Invalid decoder name %s\n",
                       src_to_add[i].decoder_name);
                // nothing else owns the descriptor yet, so release it here
                close(decoder_workers[j]->pfs);
                free(decoder_workers[j]);
                decoder_workers[j] = NULL;
                break;
            }
            strcpy(decoder_workers[j]->decoder_params,
                   src_to_add[i].decoder_params);

            if (init_dec_worker(decoder_workers[j])) {
                ni_log(NI_LOG_ERROR, "failed to init_dec_worker for new.\n");
                free(decoder_workers[j]);
                decoder_workers[j] = NULL;
                break;
            }

            // start the new worker's frame queue at the slot the stack thread
            // is currently working on
            decoder_workers[j]->frame_entry->frame_list.head =
                decoder_workers[j]->frame_entry->frame_list.tail =
                    common->stack_frame_num %
                    (decoder_workers[j]->frame_entry->frame_list.size);
            decoder_workers[j]->recv_frame_num = common->stack_frame_num;

            if (dec_worker_thread_run(decoder_workers[j])) {
                ni_log(NI_LOG_ERROR,
                       "failed to create send/receive thread for decoder "
                       "worker %d.\n",
                       j);
                free(decoder_workers[j]);
                decoder_workers[j] = NULL;
                break;
            }

            j++;
        }

        active_decoder_workers = j;

        // re-init xstack so filter graph can be re-generated based on the new
        // filter graph description
        if (num_src_to_add || num_src_to_remove) {
            ni_scaler_params_t scaler_params = {0};
            if (active_decoder_workers < MAX_SINGLE_INPUTS)
                scaler_params.nb_inputs = active_decoder_workers;
            else
                scaler_params.nb_inputs = MAX_SINGLE_INPUTS;
            for (i = 0; i < stack_num; i++) {
                stack_workers[i]->nb_inputs = active_decoder_workers;
                ret = ni_scaler_set_params(stack_workers[i]->p_stack_ctx,
                                           &scaler_params);
                if (ret != NI_RETCODE_SUCCESS) {
                    ni_log(NI_LOG_ERROR, "Error: failed to reconfig number "
                           "of inputs for stack worker %d\n", i);
                }
                // Reset all stack parameters before reconfiguring to new
                // parameters
                memset(stack_workers[i]->stack_items, 0,
                       sizeof(ni_stack_item_t) * NI_MAX_XSTACK_INPUTS);
                ret = retrieve_filter_params(new_xstack_desc,
                                             stack_workers[i]->stack_items);
                if (ret < 0) {
                    ni_log(NI_LOG_ERROR,
                           "Error retrieving filter parameters from "
                           "description %s.\n",
                           new_xstack_desc);
                    break;
                } else {
                    if (ret != stack_workers[i]->nb_inputs) {
                        ni_log(NI_LOG_ERROR,
                               "Error: filter parameter %d gives incorrect "
                               "nb_inputs %d\n",
                               i, ret);
                        break;
                    }
                }
            }

            // update common_t
            common->total_dec_threads += (num_src_to_add - num_src_to_remove);
            if (!num_src_to_add) {
                // check if there's ready frame for stack thread after removing
                // dec workers
                for (i = 0; i < NI_MAX_HW_FRAME; i++) {
                    if (common->ready_dec_worker_num[i] ==
                        active_decoder_workers) {
                        common->ready_frames[i] = 1;
                        common->decoded_frame_num++;
                        common->ready_dec_worker_num[i] = 0;
                    }
                }
            }

            ni_log(NI_LOG_INFO,
                   "after reconfig common_t total_dec_threads: %d, "
                   "exit_dec_num: %d, ready_dec_worker_num: [%d, %d, %d, %d], "
                   "ready_frames: [%d, %d, %d, %d], decoded_frame_num %d, "
                   "stack_frame_num %d\n",
                   common->total_dec_threads, common->exit_dec_num,
                   common->ready_dec_worker_num[0],
                   common->ready_dec_worker_num[1],
                   common->ready_dec_worker_num[2],
                   common->ready_dec_worker_num[3], common->ready_frames[0],
                   common->ready_frames[1], common->ready_frames[2],
                   common->ready_frames[3], common->decoded_frame_num,
                   common->stack_frame_num);
        }
    } else {
        // mode = U2E or U2L
        upload_worker *to_remove[NI_MAX_XSTACK_INPUTS] = {0};
        upload_worker *upl_worker = NULL;
        total_active = active_upload_workers;
        ni_log(NI_LOG_INFO,
               "before reconfig common_t total_upl_threads: %d, "
               "exit_upl_num: %d, ready_upl_worker_num: [%d, %d, %d, %d], "
               "ready_frames: [%d, %d, %d, %d], uploaded_frame_num %d, "
               "stack_frame_num %d\n",
               common->total_upl_threads, common->exit_upl_num,
               common->ready_upl_worker_num[0], common->ready_upl_worker_num[1],
               common->ready_upl_worker_num[2], common->ready_upl_worker_num[3],
               common->ready_frames[0], common->ready_frames[1],
               common->ready_frames[2], common->ready_frames[3],
               common->uploaded_frame_num, common->stack_frame_num);

        ni_log(NI_LOG_INFO, "%s to_remove %d to_add %d!\n", __func__,
               num_src_to_remove, num_src_to_add);

        // pull every matching worker out of upload_workers, keeping the
        // array (and the parallel input_eos array) dense
        for (src_idx = 0; src_idx < num_src_to_remove; src_idx++) {
            for (i = 0; i < total_active; i++) {
                if (0 == strcmp(upload_workers[i]->filename,
                                src_to_remove[src_idx].file_name)) {
                    to_remove[num_to_remove] = upload_workers[i];
                    to_remove_eos[num_to_remove] = common->input_eos[i];
                    num_to_remove++;
                    total_active--;
                    for (j = i; j < total_active; j++) {
                        upload_workers[j] = upload_workers[j + 1];
                        upload_workers[j]->index = j;
                        common->input_eos[j] = common->input_eos[j + 1];
                    }
                    upload_workers[total_active] = NULL;
                    common->input_eos[total_active] = -1;
                    break;
                }
            }
        }

        if (num_src_to_remove && total_active == active_upload_workers) {
            ni_log(NI_LOG_ERROR, "%s no uploader found for removal !\n",
                   __func__);
            num_src_to_remove = 0;
        }

        ni_frame_entry_t *frame_entry;
        ni_buffered_frame_list_t *frame_list;
        for (i = 0; i < num_to_remove; i++) {
            upl_worker = to_remove[i];
            // handle already exited (receive eos early) upl_worker
            if (to_remove_eos[i] != -1) {
                common->exit_upl_num--;
                common->total_upl_threads++;
                // clean and free upl_worker
                uploader_self_cleanup(upl_worker);
            } else {
                frame_entry = upl_worker->frame_entry;
                frame_list = &(frame_entry->frame_list);
                int index = frame_list->tail - 1;
                if (index < 0) {
                    index = frame_list->size - 1;
                }
                // NOTE: assume that when doing reconfig, uploader hwupload
                // thread is sleeping
                int pending_frame_num =
                    upl_worker->upl_frame_num - common->uploaded_frame_num;
                ni_log(
                    NI_LOG_INFO,
                    "%s uploader idx %d force_exit 1, recv_frame_num %d, index "
                    "%d, pending_frame_num %d!\n",
                    __func__, upl_worker->index, upl_worker->upl_frame_num,
                    index, pending_frame_num);
                // take back this worker's contribution to the ready counters,
                // walking backwards from its newest frame; "> 0" keeps a
                // negative difference from turning into a runaway loop
                while (pending_frame_num > 0) {
                    common->ready_upl_worker_num[index]--;
                    index -= 1;
                    if (index < 0) {
                        index = frame_list->size - 1; // circular queue
                    }
                    pending_frame_num--;
                }

                // NOTE: here upl_worker->frame_entry->lock is acquired inside
                // of common->lock context, try to avoid later
                ni_pthread_mutex_lock(&(upl_worker->frame_entry->lock));
                upl_worker->force_exit = 1;
                ni_pthread_cond_signal(&(upl_worker->frame_entry->cond));
                ni_pthread_mutex_unlock(&(upl_worker->frame_entry->lock));
            }
        }

        active_upload_workers = total_active;

        if (num_src_to_add &&
            total_active + num_src_to_add > NI_MAX_XSTACK_INPUTS) {
            ni_log(NI_LOG_ERROR,
                   "%s existing %d + %d to add exceeding max "
                   "allowed: %d, not adding !\n",
                   __func__, total_active, num_src_to_add,
                   NI_MAX_XSTACK_INPUTS);
            num_src_to_add = 0;
        }

        j = total_active; // the first empty slot in the upload_workers array
        for (i = 0; i < num_src_to_add; i++) {
            upload_workers[j] = malloc(sizeof(upload_worker));
            if (!upload_workers[j]) {
                ni_log(NI_LOG_ERROR, "failed to alloc new upload worker.\n");
                break;
            }
            memset(upload_workers[j], 0, sizeof(upload_worker));
            // j == 0 when every previous input was removed: there is no
            // predecessor to derive the index or the common context from
            upload_workers[j]->index =
                (j > 0) ? upload_workers[j - 1]->index + 1 : 0;
            upload_workers[j]->common =
                (j > 0) ? upload_workers[0]->common : common;
            upload_workers[j]->file_reader.loop =
                upload_workers[0]->file_reader.loop;
            ni_log(NI_LOG_INFO, "new uploader file loop %llu\n",
                   upload_workers[j]->file_reader.loop);
            upload_workers[j]->pfs = open(src_to_add[i].file_name, O_RDONLY);
            // open() reports failure with -1; 0 is a valid descriptor
            if (upload_workers[j]->pfs < 0) {
                ni_log(NI_LOG_ERROR, "Error: cannot open %s\n",
                       src_to_add[i].file_name);
                ni_log(NI_LOG_ERROR, "Error: input file read failure\n");
                free(upload_workers[j]);
                upload_workers[j] = NULL;
                break;
            }
            strcpy(upload_workers[j]->filename, src_to_add[i].file_name);
            upload_workers[j]->in_width = src_to_add[i].src_width;
            upload_workers[j]->in_height = src_to_add[i].src_height;
            // the pixel format belongs to the worker being added, not to the
            // last removed worker that upl_worker may still point at
            upload_workers[j]->pixel_format = src_to_add[i].src_fmt;

            if (init_upl_worker(upload_workers[j])) {
                ni_log(NI_LOG_ERROR, "failed to init_upl_worker for new.\n");
                free(upload_workers[j]);
                upload_workers[j] = NULL;
                break;
            }

            // start the new worker's frame queue at the slot the stack thread
            // is currently working on
            upload_workers[j]->frame_entry->frame_list.head =
                upload_workers[j]->frame_entry->frame_list.tail =
                    common->stack_frame_num %
                    (upload_workers[j]->frame_entry->frame_list.size);
            upload_workers[j]->upl_frame_num = common->stack_frame_num;

            if (upl_worker_thread_run(upload_workers[j])) {
                ni_log(NI_LOG_ERROR,
                       "failed to create hwupload thread for upload "
                       "worker %d.\n",
                       j);
                free(upload_workers[j]);
                upload_workers[j] = NULL;
                break;
            }

            j++;
        }

        active_upload_workers = j;

        // re-init xstack so filter graph can be re-generated based on the new
        // filter graph description
        if (num_src_to_add || num_src_to_remove) {
            ni_scaler_params_t scaler_params = {0};
            if (active_upload_workers < MAX_SINGLE_INPUTS)
                scaler_params.nb_inputs = active_upload_workers;
            else
                scaler_params.nb_inputs = MAX_SINGLE_INPUTS;
            for (i = 0; i < stack_num; i++) {
                stack_workers[i]->nb_inputs = active_upload_workers;
                ret = ni_scaler_set_params(stack_workers[i]->p_stack_ctx,
                                           &scaler_params);
                if (ret != NI_RETCODE_SUCCESS) {
                    ni_log(NI_LOG_ERROR, "Error: failed to reconfig number "
                           "of inputs for stack worker %d\n", i);
                }
                // Reset all stack parameters before reconfiguring to new
                // parameters, as the decoder path does
                memset(stack_workers[i]->stack_items, 0,
                       sizeof(ni_stack_item_t) * NI_MAX_XSTACK_INPUTS);
                ret = retrieve_filter_params(new_xstack_desc,
                                             stack_workers[i]->stack_items);
                if (ret < 0) {
                    ni_log(NI_LOG_ERROR,
                           "Error retrieving filter parameters from "
                           "description %s.\n",
                           new_xstack_desc);
                    break;
                } else {
                    if (ret != stack_workers[i]->nb_inputs) {
                        ni_log(NI_LOG_ERROR,
                               "Error: filter parameter %d gives incorrect "
                               "nb_inputs %d\n",
                               i, ret);
                        break;
                    }
                }
            }

            // update common_t
            common->total_upl_threads += (num_src_to_add - num_src_to_remove);
            if (!num_src_to_add) {
                // check if there's ready frame for stack thread after removing
                // upl workers
                for (i = 0; i < NI_MAX_HW_FRAME; i++) {
                    if (common->ready_upl_worker_num[i] ==
                        active_upload_workers) {
                        common->ready_frames[i] = 1;
                        common->uploaded_frame_num++;
                        common->ready_upl_worker_num[i] = 0;
                    }
                }
            }

            ni_log(NI_LOG_INFO,
                   "after reconfig common_t total_upl_threads: %d, "
                   "exit_upl_num: %d, ready_upl_worker_num: [%d, %d, %d, %d], "
                   "ready_frames: [%d, %d, %d, %d], uploaded_frame_num %d, "
                   "stack_frame_num %d\n",
                   common->total_upl_threads, common->exit_upl_num,
                   common->ready_upl_worker_num[0],
                   common->ready_upl_worker_num[1],
                   common->ready_upl_worker_num[2],
                   common->ready_upl_worker_num[3], common->ready_frames[0],
                   common->ready_frames[1], common->ready_frames[2],
                   common->ready_frames[3], common->uploaded_frame_num,
                   common->stack_frame_num);
        }
    }
}

static void read_reconfig_file_and_apply_update(void) {
#define skip_blank(cur)                                                        \
    while (*cur && isblank(*cur)) {                                            \
        cur++;                                                                 \
    }

    ni_src_desc_t src_to_remove[NI_MAX_XSTACK_INPUTS] = {0};
    int num_src_to_remove = 0;
    ni_src_desc_t src_to_add[NI_MAX_XSTACK_INPUTS] = {0};
    int num_src_to_add = 0;
    int i;
    char new_xstack_desc[2048] = {0};
    FILE *reconf = NULL;
    char one_line[2048] = {0};
    int parse_filter = 0;
    const char *reconf_file = NI_XSTACK_RECONFIG_FILE_NAME;
    char *curr = NULL;
    char *str_start = NULL;
    char *ch = NULL;

    ni_log(NI_LOG_INFO, "%s().\n", __func__);
    reconf = fopen(reconf_file, "r");
    if (!reconf) {
        ni_log(NI_LOG_ERROR, "ERROR: %s: Cannot open reconfig_file: %s\n",
               __func__, reconf_file);
        return;
    }

    while (fgets(one_line, sizeof(one_line), reconf)) {
        curr = one_line;
        skip_blank(curr)

            if (*curr == '-') {
            // get src input file name to remove: terminated by blank or '\n'
            curr++;
            skip_blank(curr) str_start = curr;
            while (*curr && !isblank(*curr) && *curr != '\n') {
                curr++;
            }
            strncpy(src_to_remove[num_src_to_remove].file_name, str_start,
                    curr - str_start);
            src_to_remove[num_src_to_remove].file_name[curr - str_start] = '\0';
            num_src_to_remove++;
        }
        else if (*curr == '+') {
            // get src input file AND decoder name to add
            curr++;
            skip_blank(curr) str_start = curr;
            while (*curr && !isblank(*curr) && *curr != '\n') {
                curr++;
            }
            if (*curr == '\n') {
                ni_log(NI_LOG_ERROR,
                       "ERROR: %s <%s> missing decoder "
                       "name, line ignored.\n",
                       __func__, one_line);
                fclose(reconf);
                return;
            } else {
                strncpy(src_to_add[num_src_to_add].file_name, str_start,
                        curr - str_start);
                src_to_add[num_src_to_add].file_name[curr - str_start] = '\0';

                skip_blank(curr) str_start = curr;
                while (*curr && !isblank(*curr) && *curr != '\n') {
                    curr++;
                }
                // decoder_name is src_pix_fmt if mode is U2D or U2E
                strncpy(src_to_add[num_src_to_add].decoder_name, str_start,
                        curr - str_start);
                src_to_add[num_src_to_add].decoder_name[curr - str_start] =
                    '\0';

                skip_blank(curr) str_start = curr;
                while (*curr && !isblank(*curr) && *curr != '\n') {
                    curr++;
                }
                // decoder_params is src resolution if mode is U2D or U2E
                strncpy(src_to_add[num_src_to_add].decoder_params, str_start,
                        curr - str_start);
                src_to_add[num_src_to_add].decoder_params[curr - str_start] =
                    '\0';

                if (stack_workers[0]->mode == XSTACK_APP_U2L ||
                    stack_workers[0]->mode == XSTACK_APP_U2E) {
                    src_to_add[num_src_to_add].src_fmt = get_pix_fmt_from_desc(
                        src_to_add[num_src_to_add].decoder_name);
                    if (src_to_add[num_src_to_add].src_fmt < 0) {
                        ni_log(NI_LOG_ERROR, "Unsupported pix fmt!\n");
                        fclose(reconf);
                        return;
                    }
                    src_to_add[num_src_to_add].src_width = strtoul(
                        src_to_add[num_src_to_add].decoder_params, &ch, 10);
                    if (*ch != 'x') {
                        ni_log(NI_LOG_ERROR,
                               "invalid resolution format %s, line ignored\n",
                               src_to_add[num_src_to_add].decoder_params);
                        fclose(reconf);
                        return;
                    }
                    src_to_add[num_src_to_add].src_height =
                        strtoul(ch + 1, NULL, 10);
                    if (check_resolution(
                            src_to_add[num_src_to_add].src_width,
                            src_to_add[num_src_to_add].src_height) < 0) {
                        ni_log(NI_LOG_ERROR,
                               "invalid resolution value %s, line ignored\n",
                               src_to_add[num_src_to_add].decoder_params);
                        fclose(reconf);
                        return;
                    }
                }
                num_src_to_add++;
            }
        }
        else if (*curr == 'f' || parse_filter) {
            // get new xstack filter description until file end, stop at each
            // line terminating at \ or \n
            if (!parse_filter) {
                parse_filter = 1;
                curr++;
            }

            skip_blank(curr) str_start = curr;
            while (*curr && *curr != '\\' && *curr != '\n') {
                curr++;
            }
            strncpy(&new_xstack_desc[strlen(new_xstack_desc)], str_start,
                    curr - str_start);
            new_xstack_desc[strlen(new_xstack_desc) + curr - str_start] = '\0';
        }
        else if (*curr == '\n') {
            ; // just ignore empty line
        }
        else {
            if (!parse_filter) {
                ni_log(NI_LOG_ERROR, "ERROR: %s <%s> not accepted.\n", __func__,
                       one_line);
                fclose(reconf);
                return;
            } else {
                skip_blank(curr) str_start = curr;
                while (*curr && *curr != '\\' && *curr != '\n') {
                    curr++;
                }
                strncpy(&new_xstack_desc[strlen(new_xstack_desc)], str_start,
                        curr - str_start);
                new_xstack_desc[strlen(new_xstack_desc) + curr - str_start] =
                    '\0';
            }
        }
    }

    fclose(reconf);

    ni_log(NI_LOG_INFO,
           "%s num_src_to_remove %d num_src_to_add %d\n"
           "New filter desc: <%s>.\n",
           __func__, num_src_to_remove, num_src_to_add, new_xstack_desc);

    for (i = 0; i < num_src_to_remove; i++) {
        ni_log(NI_LOG_INFO, "%s src to remove %d: <%s>\n", __func__, i,
               src_to_remove[i].file_name);
    }
    for (i = 0; i < num_src_to_add; i++) {
        ni_log(NI_LOG_INFO, "%s src to add %d: <%s> <%s>\n", __func__, i,
               src_to_add[i].file_name, src_to_add[i].decoder_name);
    }

    if (num_src_to_remove || num_src_to_add) {
        reconfig_xstack_layout(src_to_remove, num_src_to_remove, src_to_add,
                               num_src_to_add, new_xstack_desc);
    }
}

/*!*****************************************************************************
 *  \brief  Compose one output frame from the current frame of every input.
 *
 *  Allocates and acquires a hardware output frame on the stack (scaler)
 *  session, fills one in_frame_cfg entry per input with its source surface and
 *  target rectangle (from stack_worker->stack_items), and submits the inputs to
 *  the device in batches of at most MAX_SINGLE_INPUTS.
 *
 *  \param[in]     stack_worker   stack worker doing the composition
 *  \param[in,out] p_dst_data     receives the output hardware frame
 *  \param[in,out] out_frame_cfg  output frame configuration, filled here
 *  \param[out]    in_frame_cfg   per-input frame configurations, indexed by
 *                                input; must hold stack_worker->nb_inputs
 *                                entries
 *  \param[in]     frame_idx      slot in each input's buffered frame list to
 *                                read; inputs that reached EOS use their
 *                                recorded input_eos slot instead unless
 *                                common->shortest is set
 *  \param[in]     mode           XSTACK_APP_* mode, selects decoder or upload
 *                                workers as the input source
 *
 *  \return NI_RETCODE_SUCCESS on success; otherwise an error code, in which
 *          case the frame buffer in p_dst_data has been freed (and the output
 *          hardware frame recycled if it had been acquired).
 ******************************************************************************/
int do_stack(stack_worker_t *stack_worker, ni_session_data_io_t *p_dst_data,
             ni_frame_config_t *out_frame_cfg, ni_frame_config_t in_frame_cfg[],
             int frame_idx, int mode) {
    ni_log(NI_LOG_DEBUG, "stack %d do stack\n", stack_worker->index);
    int ret = 0;
    int i, p;
    ret = ni_frame_buffer_alloc_hwenc(&(p_dst_data->data.frame),
                                      stack_worker->out_width,
                                      stack_worker->out_height, 0);

    if (ret != NI_RETCODE_SUCCESS) {
        ni_log(NI_LOG_ERROR, "Error: ni_frame_buffer_alloc_hwenc failed\n");
        return ret;
    }

    ret = ni_device_session_read_hwdesc(stack_worker->p_stack_ctx, p_dst_data,
                                        NI_DEVICE_TYPE_SCALER);
    if (ret != NI_RETCODE_SUCCESS) {
        ni_log(NI_LOG_ERROR, "Xstack can't aquire output frame %d\n", ret);
        // release the buffer allocated above, as the other error paths do
        ni_frame_buffer_free(&(p_dst_data->data.frame));
        return ret;
    }

    // describe the output surface
    niFrameSurface1_t *frame_surface =
        (niFrameSurface1_t *)(p_dst_data->data.frame.p_data[3]);
    int pix_fmt = stack_worker->pixel_format;
    frame_surface->bit_depth = get_bitdepth_factor_from_pixfmt(pix_fmt);
    frame_surface->encoding_type =
        (pix_fmt == NI_PIX_FMT_YUV420P) || (pix_fmt == NI_PIX_FMT_YUV420P10LE);
    if (pix_fmt == NI_PIX_FMT_8_TILED4X4)
        frame_surface->encoding_type = NI_PIXEL_PLANAR_FORMAT_TILED4X4;
    frame_surface->ui16width = stack_worker->out_width;
    frame_surface->ui16height = stack_worker->out_height;
    frame_surface->ui32nodeAddress = 0;
    ni_log(NI_LOG_DEBUG, "stack out_frame trace ui16FrameIdx = [%d]\n",
           frame_surface->ui16FrameIdx);

    if (stack_worker->fillcolor)
        out_frame_cfg->options = NI_SCALER_FLAG_FCE;

    i = 0;
    int num_cfg_inputs = MAX_SINGLE_INPUTS;
    int input_frame_idx;
    niFrameSurface1_t *input_surf;
    ni_buffered_frame_list_t *frame_list;
    // p counts the inputs still to be submitted; each pass handles one batch
    for (p = stack_worker->nb_inputs; p > 0; p -= MAX_SINGLE_INPUTS) {
        int start = i;
        int end = i + MAX_SINGLE_INPUTS;

        if (end > stack_worker->nb_inputs) {
            // last, partial batch
            num_cfg_inputs = p;
            end = stack_worker->nb_inputs;
        }

        ni_log(NI_LOG_DEBUG, "stack %d in_frame trace ui16FrameIdx = \n",
               stack_worker->index);
        for (; i < end; i++) {
            if (mode == XSTACK_APP_D2L || mode == XSTACK_APP_D2E)
                frame_list = &(decoder_workers[i]->frame_entry->frame_list);
            else
                frame_list = &(upload_workers[i]->frame_entry->frame_list);

            if (!stack_worker->common->shortest) {
                if (stack_worker->common->input_eos[i] != -1) {
                    input_frame_idx = stack_worker->common->input_eos[i];
                } else
                    input_frame_idx = frame_idx;
            } else {
                input_frame_idx = frame_idx;
            }
            input_surf =
                (niFrameSurface1_t *)((frame_list->frames[input_frame_idx])
                                          .data.data.frame.p_data[3]);
            // a missing surface is as unusable as one with frame index 0 and
            // must not reach the dereferences below
            if (!input_surf || !(input_surf->ui16FrameIdx)) {
                ni_log(NI_LOG_ERROR,
                       "ERROR: input_surf of dec %d is invalid, frame_idx %d, "
                       "input_idx %d, frameIdx %u\n",
                       i, frame_idx, input_frame_idx,
                       input_surf ? input_surf->ui16FrameIdx : 0);
                ni_hwframe_buffer_recycle2(frame_surface);
                ni_frame_buffer_free(&(p_dst_data->data.frame));
                return NI_RETCODE_FAILURE;
            }

            if (i == 0) {
                // copy props of the first input frame to the dst frame
                ret = frame_copy(
                    p_dst_data, &((frame_list->frames[input_frame_idx]).data),
                    false);
                if (ret < 0) {
                    ni_log(NI_LOG_ERROR, "Error: frame_copy failed");
                    ni_hwframe_buffer_recycle2(frame_surface);
                    ni_frame_buffer_free(&(p_dst_data->data.frame));
                    return ret;
                }
            }

            // input config
            in_frame_cfg[i].picture_width = NIALIGN(input_surf->ui16width, 2);
            in_frame_cfg[i].picture_height = NIALIGN(input_surf->ui16height, 2);
            in_frame_cfg[i].picture_format =
                ni_to_gc620_pix_fmt(stack_worker->pixel_format);
            in_frame_cfg[i].session_id = input_surf->ui16session_ID;
            in_frame_cfg[i].output_index = input_surf->output_idx;
            in_frame_cfg[i].frame_index = input_surf->ui16FrameIdx;
            in_frame_cfg[i].options = 0;

            // where to place the input into the output
            in_frame_cfg[i].rectangle_x = stack_worker->stack_items[i].x;
            in_frame_cfg[i].rectangle_y = stack_worker->stack_items[i].y;
            in_frame_cfg[i].rectangle_width = stack_worker->stack_items[i].w;
            in_frame_cfg[i].rectangle_height = stack_worker->stack_items[i].h;

            ni_log(NI_LOG_DEBUG, "[%u] ", input_surf->ui16FrameIdx);
            ni_log(NI_LOG_DEBUG,
                   "stack input config rec_x:y %d:%d, rec_w:h %d:%d\n",
                   in_frame_cfg[i].rectangle_x, in_frame_cfg[i].rectangle_y,
                   in_frame_cfg[i].rectangle_width,
                   in_frame_cfg[i].rectangle_height);
        }
        ni_log(NI_LOG_DEBUG, "\n");

        out_frame_cfg->picture_width = NIALIGN(stack_worker->out_width, 2);
        out_frame_cfg->picture_height = NIALIGN(stack_worker->out_height, 2);
        out_frame_cfg->picture_format =
            ni_to_gc620_pix_fmt(stack_worker->pixel_format);
        out_frame_cfg->frame_index = frame_surface->ui16FrameIdx;
        out_frame_cfg->options |= NI_SCALER_FLAG_IO;
        if (out_frame_cfg->options & NI_SCALER_FLAG_FCE) {
            out_frame_cfg->rgba_color = 4278190080; // 0xFF000000
        } else {
            out_frame_cfg->rgba_color = 0;
        }

        ni_log(NI_LOG_DEBUG,
               "out_frame_cfg pic_w:h %d:%d, frame_index:%d, options:%d, "
               "format:%d, out_frame_cfg sessionID %d\n",
               out_frame_cfg->picture_width, out_frame_cfg->picture_height,
               out_frame_cfg->frame_index, out_frame_cfg->options,
               out_frame_cfg->picture_format, out_frame_cfg->session_id);

        // submit this batch: in_frame_cfg[start .. start + num_cfg_inputs)
        ret = ni_device_multi_config_frame(stack_worker->p_stack_ctx,
                                           &in_frame_cfg[start], num_cfg_inputs,
                                           out_frame_cfg);
        if (ret != NI_RETCODE_SUCCESS) {
            ni_log(NI_LOG_ERROR,
                   "Error: ni_device_multi_config_frame failed\n");
            ni_hwframe_buffer_recycle2(frame_surface);
            ni_frame_buffer_free(&(p_dst_data->data.frame));
            return ret;
        }

        // drop the fill-colour flag so later batches do not request it again
        out_frame_cfg->options &= ~NI_SCALER_FLAG_FCE;
    }

    return NI_RETCODE_SUCCESS;
}

/*
 * Hand one stack output frame (or the end-of-stream marker) to every encoder
 * attached to the given stack worker.
 *
 * For each encoder the function waits for a free slot in that encoder's
 * buffered frame list, fills the slot at the list tail and enqueues it,
 * signalling the list condition when the list was empty beforehand.
 *
 * stack_worker  stack worker that owns the encoders
 * p_src_data    stack output; a NULL pointer is ignored
 *
 * Returns 0 (NI_RETCODE_SUCCESS on the regular frame path) when the frame was
 * handled, -1 when the source frame carries no valid hw surface.
 */
int send_frame_to_encoders(stack_worker_t *stack_worker,
                           ni_session_data_io_t *p_src_data) {
    int enc_idx;
    int pool_idx;
    int queue_was_empty;
    encoder_worker *worker;
    ni_buffered_frame_list_t *queue;
    ni_session_data_io_t *p_slot;
    niFrameSurface1_t *p_out_surface;

    if (!p_src_data) {
        return 0;
    }

    // End of stream: forward the eos marker to each encoder, then stop
    if (p_src_data->data.frame.end_of_stream) {
        for (enc_idx = 0; enc_idx < stack_worker->nb_encoders; enc_idx++) {
            ni_log(NI_LOG_DEBUG, "stack %d send eos to encoder %d\n",
                   stack_worker->index, enc_idx);
            worker = stack_worker->encoder_workers[enc_idx];
            queue = &(worker->frame_entry->frame_list);

            // poll until the encoder queue has room (or we are told to quit)
            while (frame_list_is_full(queue) && !stack_worker->should_exit) {
                ni_log(NI_LOG_DEBUG,
                       "Warn: encoder send frame list is full, wait!\n");
                ni_usleep(100);
            }

            p_slot = &(queue->frames[queue->tail].data);

            if (stack_worker->common->stack_frame_num <= 1) {
                // eos arriving as the very first frame is not expected;
                // tell this encoder to exit and move on to the next one
                ni_log(NI_LOG_ERROR,
                       "Error: encoder received eos before start!\n");
                worker->should_exit = 1;
                continue;
            }

            // the encoder session has to be open before
            // prepare_enc_input_frame may be called
            while (!worker->started && !stack_worker->should_exit) {
                ni_usleep(100);
            }
            prepare_enc_input_frame(worker->p_enc_ctx, p_src_data, p_slot,
                                    stack_worker->pixel_format,
                                    stack_worker->out_width,
                                    stack_worker->out_height);

            ni_pthread_mutex_lock(&(worker->frame_entry->lock));
            queue_was_empty = frame_list_is_empty(queue);
            frame_list_enqueue(queue);
            if (queue_was_empty) {
                ni_pthread_cond_signal(&(worker->frame_entry->cond));
            }
            ni_pthread_mutex_unlock(&(worker->frame_entry->lock));
        }
        return 0;
    }

    p_out_surface = (niFrameSurface1_t *)(p_src_data->data.frame.p_data[3]);
    if (!p_out_surface || !p_out_surface->ui16FrameIdx) {
        ni_log(NI_LOG_ERROR, "%s input frame surface invalid!\n", __func__);
        return -1;
    }

    // Remember the surface in the global hw frame pool so it can be recycled
    // later; the reference count starts at the number of encoders.
    pool_idx = p_out_surface->ui16FrameIdx;
    ni_pthread_mutex_lock(&(stack_worker->lock));
    if (g_hwframe_pool[pool_idx].ref_cnt > 0) {
        // slot still referenced: report it and recycle the stale surface
        ni_log(NI_LOG_ERROR, "ERROR: %s hwframe %d already exists!\n", __func__,
               pool_idx);
        ni_hwframe_buffer_recycle2(&(g_hwframe_pool[pool_idx].surface));
    }
    memcpy(&(g_hwframe_pool[pool_idx].surface), p_out_surface,
           sizeof(niFrameSurface1_t));
    g_hwframe_pool[pool_idx].ref_cnt = stack_worker->nb_encoders;
    ni_pthread_mutex_unlock(&(stack_worker->lock));

    for (enc_idx = 0; enc_idx < stack_worker->nb_encoders; enc_idx++) {
        ni_log(NI_LOG_DEBUG, "stack %d send hwframe %u to encoder %d\n",
               stack_worker->index, p_out_surface->ui16FrameIdx, enc_idx);
        worker = stack_worker->encoder_workers[enc_idx];
        queue = &(worker->frame_entry->frame_list);

        // poll until the encoder queue has room (or we are told to quit)
        while (frame_list_is_full(queue) && !stack_worker->should_exit) {
            ni_log(NI_LOG_DEBUG,
                   "Warn: encoder send frame list is full, wait!\n");
            ni_usleep(100);
        }

        p_slot = &(queue->frames[queue->tail].data);

        if (stack_worker->common->stack_frame_num > 1) {
            // the encoder session has to be open before
            // prepare_enc_input_frame may be called
            while (!worker->started && !stack_worker->should_exit) {
                ni_usleep(100);
            }
            prepare_enc_input_frame(worker->p_enc_ctx, p_src_data, p_slot,
                                    stack_worker->pixel_format,
                                    stack_worker->out_width,
                                    stack_worker->out_height);
        } else {
            // first frame: the encoder is not open yet, so store a forced
            // copy of the frame plus its dimensions; the encoder input frame
            // is prepared later, before it is sent
            frame_copy(p_slot, p_src_data, true); // force copy
            p_slot->data.frame.video_width =
                p_src_data->data.frame.video_width;
            p_slot->data.frame.video_height =
                p_src_data->data.frame.video_height;

            // wake the encoder send thread waiting on its start condition
            ni_log(NI_LOG_INFO, "Signal encoder %d to start\n", worker->index);
            ni_pthread_mutex_lock(&(worker->lock));
            ni_pthread_cond_signal(&(worker->start_cond));
            ni_pthread_mutex_unlock(&(worker->lock));
        }

        ni_pthread_mutex_lock(&(worker->frame_entry->lock));
        queue_was_empty = frame_list_is_empty(queue);
        frame_list_enqueue(queue);
        if (queue_was_empty) {
            ni_pthread_cond_signal(&(worker->frame_entry->cond));
        }
        ni_pthread_mutex_unlock(&(worker->frame_entry->lock));
    }

    ni_frame_wipe_aux_data(&(p_src_data->data.frame));
    return NI_RETCODE_SUCCESS;
}

void *stack_process_thread(void *args) {
    int ret;
    stack_worker_t *stack_worker = args;
    ni_frame_config_t out_frame_cfg = {0};
    ni_frame_config_t in_frame_cfg[NI_MAX_XSTACK_INPUTS] = {0};
    ni_session_data_io_t stack_output_data = {0};
    ni_session_data_io_t *p_dst_data;
    ni_frame_t *p_dst_frame;
    niFrameSurface1_t *p_surface;
    bool need_encoder, need_decoder, need_uploader, need_download;
    need_encoder = need_decoder = need_uploader = need_download = false;
    switch (stack_worker->mode) {
    case XSTACK_APP_D2E:
        need_decoder = need_encoder = true;
        break;
    case XSTACK_APP_D2L:
        need_decoder = need_download = true;
        break;
    case XSTACK_APP_U2E:
        need_uploader = need_encoder = true;
        break;
    case XSTACK_APP_U2L:
        need_uploader = need_download = true;
        break;
    }

    bool got_eos = false;
    stack_worker->head = 0;

    ni_log(NI_LOG_INFO, "stack %d process thread started, mode %d\n",
           stack_worker->index, stack_worker->mode);

    for (;;) {
        if (stack_worker->should_exit)
            break;
        if (need_decoder) {
            if ((stack_worker->common->decoded_frame_num >
                 stack_worker->common->stack_frame_num) &&
                (stack_worker->common
                     ->ready_frames[(stack_worker->head % NI_MAX_HW_FRAME)] ==
                 1)) {
                ni_log(
                    NI_LOG_DEBUG,
                    "stack_worker %d head %d, ready frames[head] = %d, "
                    "decoded_frame_num %d, stack_frame_num %d\n",
                    stack_worker->index, stack_worker->head,
                    stack_worker->common
                        ->ready_frames[(stack_worker->head % NI_MAX_HW_FRAME)],
                    stack_worker->common->decoded_frame_num,
                    stack_worker->common->stack_frame_num);

            } else {
                if (need_reconfig) {
                    ni_pthread_mutex_lock(&(stack_worker->common->lock));
                    if (need_reconfig &&
                        (stack_worker->common->stack_frame_num ==
                         stack_worker->common->decoded_frame_num)) {
                        ni_log(NI_LOG_DEBUG, "reconfig, stack index %d\n",
                               stack_worker->head);
                        // reconfig xstack layout
                        read_reconfig_file_and_apply_update();
                        need_reconfig = 0;
                    } // else continue until all pending frames being processed
                    ni_pthread_mutex_unlock(&(stack_worker->common->lock));
                }

                if (stack_worker->common->shortest) {
                    if (stack_worker->common->exit_dec_num > 0) {
                        // find exited dec_workers
                        int min_dec_recv_frame_num = 0;
                        for (int idx = 0; idx < NI_MAX_XSTACK_INPUTS; idx++) {
                            if (stack_worker->common->input_eos[idx] != -1) {
                                if (min_dec_recv_frame_num) {
                                    min_dec_recv_frame_num = NIMIN(
                                        min_dec_recv_frame_num,
                                        decoder_workers[idx]->recv_frame_num);
                                } else {
                                    min_dec_recv_frame_num =
                                        decoder_workers[idx]->recv_frame_num;
                                }
                            }
                        }
                        if (min_dec_recv_frame_num >
                            stack_worker->common->stack_frame_num) {
                            // need to wait remaining decoded frames of the
                            // shortest stream to stack
                            ni_pthread_mutex_unlock(
                                &(stack_worker->common->lock));
                            ni_usleep(200);
                            continue;
                        } else {
                            ni_pthread_mutex_unlock(
                                &(stack_worker->common->lock));
                            // exit all decoding work threads
                            for (int i = 0; i < active_decoder_workers; i++) {
                                ni_pthread_mutex_lock(
                                    &(decoder_workers[i]->frame_entry->lock));
                                decoder_workers[i]->should_exit = 1;
                                ni_pthread_cond_broadcast(
                                    &(decoder_workers[i]->frame_entry->cond));
                                ni_pthread_mutex_unlock(
                                    &(decoder_workers[i]->frame_entry->lock));
                            }
                        }
                    }
                }
                if (stack_worker->common->exit_dec_num ==
                        active_decoder_workers &&
                    (stack_worker->common->stack_frame_num ==
                     stack_worker->common->decoded_frame_num)) {
                    ni_log(NI_LOG_INFO,
                           "stack_worker %d got eos, stack_frame_num %d, "
                           "decoded_frame_num %d\n",
                           stack_worker->index, stack_worker->common->stack_frame_num,
                           stack_worker->common->decoded_frame_num);
                    got_eos = true;
                } else {
                    ni_usleep(200);
                    continue;
                }
            }
        } else { // need uploader
            if ((stack_worker->common->uploaded_frame_num >
                 stack_worker->common->stack_frame_num) &&
                (stack_worker->common
                     ->ready_frames[(stack_worker->head % NI_MAX_HW_FRAME)] ==
                 1)) {
                ni_log(
                    NI_LOG_DEBUG,
                    "stack_worker %d head %d, ready frames[head] = %d, "
                    "uploaded_frame_num %d, stack_frame_num %d\n",
                    stack_worker->index, stack_worker->head,
                    stack_worker->common
                        ->ready_frames[(stack_worker->head % NI_MAX_HW_FRAME)],
                    stack_worker->common->uploaded_frame_num,
                    stack_worker->common->stack_frame_num);
            } else {
                if (need_reconfig) {
                    ni_pthread_mutex_lock(&(stack_worker->common->lock));
                    if (need_reconfig &&
                        (stack_worker->common->stack_frame_num ==
                         stack_worker->common->uploaded_frame_num)) {
                        ni_log(NI_LOG_DEBUG, "reconfig, stack index %d\n",
                               stack_worker->head);
                        // reconfig xstack layout
                        read_reconfig_file_and_apply_update();
                        need_reconfig = 0;
                    } // else continue until all pending frames being processed
                    ni_pthread_mutex_unlock(&(stack_worker->common->lock));
                }

                if (stack_worker->common->shortest) {
                    if (stack_worker->common->exit_upl_num > 0) {
                        // find exited upl_workers
                        int min_upl_frame_num = 0;
                        for (int idx = 0; idx < NI_MAX_XSTACK_INPUTS; idx++) {
                            if (stack_worker->common->input_eos[idx] != -1) {
                                if (min_upl_frame_num) {
                                    min_upl_frame_num = NIMIN(
                                        min_upl_frame_num,
                                        upload_workers[idx]->upl_frame_num);
                                } else {
                                    min_upl_frame_num =
                                        upload_workers[idx]->upl_frame_num;
                                }
                            }
                        }
                        if (min_upl_frame_num > stack_worker->common->stack_frame_num) {
                            // need to wait remaining uploaded frames of the
                            // shortest stream to stack
                            ni_pthread_mutex_unlock(
                                &(stack_worker->common->lock));
                            ni_usleep(200);
                            continue;
                        } else {
                            ni_pthread_mutex_unlock(
                                &(stack_worker->common->lock));
                            // exit all uploading work threads
                            for (int i = 0; i < active_upload_workers; i++) {
                                ni_pthread_mutex_lock(
                                    &(upload_workers[i]->frame_entry->lock));
                                upload_workers[i]->should_exit = 1;
                                ni_pthread_cond_broadcast(
                                    &(upload_workers[i]->frame_entry->cond));
                                ni_pthread_mutex_unlock(
                                    &(upload_workers[i]->frame_entry->lock));
                            }
                        }
                    }
                }

                if (stack_worker->common->exit_upl_num ==
                        active_upload_workers &&
                    (stack_worker->common->stack_frame_num ==
                     stack_worker->common->uploaded_frame_num)) {
                    ni_log(NI_LOG_INFO,
                           "stack_worker %d got eos, stack_frame_num %d, "
                           "uploaded_frame_num %d\n",
                           stack_worker->index, stack_worker->common->stack_frame_num,
                           stack_worker->common->uploaded_frame_num);
                    got_eos = true;
                } else {
                    ni_usleep(200);
                    continue;
                }
            }
        }

        p_dst_data = &(stack_output_data);

        p_dst_frame = &(p_dst_data->data.frame);
        p_dst_frame->video_width = stack_worker->out_width;
        p_dst_frame->video_height = stack_worker->out_height;
        if (!got_eos) {
            ret = do_stack(stack_worker, p_dst_data, &out_frame_cfg,
                           in_frame_cfg, (stack_worker->head % NI_MAX_HW_FRAME),
                           stack_worker->mode);
            if (ret != NI_RETCODE_SUCCESS) {
                ni_log(NI_LOG_ERROR, "Failed to stack inputs frames, exit\n");
                ni_pthread_mutex_lock(&(stack_worker->common->lock));
                stack_worker->common->exit_stack_num++;
                stack_worker->common->total_xstack_threads--;
                ni_pthread_mutex_unlock(&(stack_worker->common->lock));
                // deref/recycle input frames
                recycle_stack_input_hwframe(
                    stack_worker->common,
                    (stack_worker->head % NI_MAX_HW_FRAME), stack_worker->mode);
                // encoders of the stack should exit as well
                for (int i = 0; i < stack_worker->nb_encoders; i++) {
                    free_encoder_worker(stack_worker->encoder_workers[i]);
                }
                // stack worker thread break, exit to clean up resources
                return NULL;
            }
            stack_worker->common->stack_frame_num++;

            if (need_encoder) {
                // send stack output to encoders
                send_frame_to_encoders(stack_worker, p_dst_data);
            }

            // recycle input hw frames from decoder or uploader
            // deref the input frames
            recycle_stack_input_hwframe(stack_worker->common,
                                        (stack_worker->head % NI_MAX_HW_FRAME),
                                        stack_worker->mode);
            if (ret < 0) {
                ni_log(NI_LOG_ERROR,
                       "ERROR when recycle input hw frames, should exit!");
            }
            ret = 0;

            if (need_download) {
                // use hwdownload to download hw frame, recycle hwframe buffer
                ni_session_data_io_t hwdl_session_data = {0};
                ret = hwdl_frame(stack_worker->p_stack_ctx, &hwdl_session_data,
                                 p_dst_frame, stack_worker->pixel_format);

                p_surface = (niFrameSurface1_t *)(p_dst_frame->p_data[3]);
                if (ret <= 0) {
                    ni_log(NI_LOG_ERROR,
                           "stack_worker %d: HW download failed: %d",
                           stack_worker->index, ret);
                    ni_hwframe_buffer_recycle2(p_surface);
                    ni_frame_buffer_free(p_dst_frame);
                    ni_pthread_mutex_lock(&(stack_worker->common->lock));
                    stack_worker->common->exit_stack_num++;
                    stack_worker->common->total_xstack_threads--;
                    ni_pthread_mutex_unlock(&(stack_worker->common->lock));
                    return NULL;
                }
                // write hwdl frame to output_file
                write_rawvideo_data(
                    stack_worker->out_fp, stack_worker->out_width,
                    stack_worker->out_height, stack_worker->pixel_format,
                    &hwdl_session_data.data.frame);
                ni_frame_buffer_free(&hwdl_session_data.data.frame);
                ni_hwframe_buffer_recycle2(p_surface);
            }
        } else {
            p_dst_frame->end_of_stream = 1;
            if (need_encoder) {
                // send eos to encoder
                send_frame_to_encoders(stack_worker, p_dst_data);
            } else {
                ni_frame_buffer_free(p_dst_frame);
            }
            // stack worker is ready, should exit
            ni_log(NI_LOG_INFO, "stack_worker %d ready to exit\n",
                   stack_worker->index);
            ni_pthread_mutex_lock(&(stack_worker->common->lock));
            stack_worker->common->exit_stack_num++;
            stack_worker->common->total_xstack_threads--;
            ni_pthread_mutex_unlock(&(stack_worker->common->lock));
            break;
        }

        stack_worker->head += 1;
    }

    // stack worker is ready, should exit
    ni_frame_buffer_free(p_dst_frame);
    ni_log(NI_LOG_DEBUG, "stack thread exited\n");
    return NULL;
}

/*
 * Start the processing thread (stack_process_thread) for one stack worker.
 *
 * The thread is created joinable and its id is stored in stack_worker->tid.
 * The thread attribute object is only needed for the creation call, so it is
 * destroyed on every path, including success.
 *
 * stack_worker  worker to start; passed to the new thread as its argument
 *
 * Returns 0 on success, the pthread error code when preparing the attribute
 * object fails, or NI_RETCODE_FAILURE when the thread cannot be created.
 */
int xstack_thread_run(stack_worker_t *stack_worker) {
    ni_pthread_attr_t attr;
    int ret = 0;
    int create_ret;

    ret = pthread_attr_init(&attr);
    if (ret) {
        ni_log(NI_LOG_ERROR, "failed to init attr %d.\n", ret);
        return ret;
    }
    ret = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
    if (ret) {
        ni_log(NI_LOG_ERROR, "failed to set attr %d.\n", ret);
        pthread_attr_destroy(&attr);
        return ret;
    }

    create_ret = ni_pthread_create(&(stack_worker->tid), &attr,
                                   stack_process_thread, stack_worker);
    // destroying the attribute object does not affect a thread that was
    // already created with it; previously it was leaked on success
    pthread_attr_destroy(&attr);
    if (create_ret) {
        fprintf(stderr, "Error: create stack process thread failed in decode "
                        "mode\n");
        return NI_RETCODE_FAILURE;
    }

    return ret;
}

/*
 * Print the command line usage text to stdout.
 *
 * The long option names listed here must match the long_options table used
 * for argument parsing in main(); the output resolution option is registered
 * there as "output-resolution".
 */
static void help_usage(void) {
    printf(
        "Application for performing xstack processing with libxcoder API.\n"
        "Usage:\n"
        "-i | --input                    input video file name.\n"
        "-d | --decoder                  decoder name, supported ones: "
        "[h264_ni_quadra_dec, h265_ni_quadra_dec, vp9_ni_quadra_dec]\n"
        "-p | --decoder-params           decoder parameters (applicable to all "
        "decoders).\n"
        "-f | --filter                   ni_quadra_xstack filter description, "
        "should only include nb_inputs, layout and size.\n"
        "-s | --output-resolution        output file resolution, in the format "
        "of width x height, "
        "should be associated with the preceding filter description.\n"
        "-e | --encoder                  encoder name, supported ones: "
        "[h264_ni_quadra_enc, h265_ni_quadra_enc]\n"
        "-x | --encoder-params           encoder parameters.\n"
        "-l | --loop                     number of times to repeat rereading input(s)\n"
        "-o | --output                   output file name, should be "
        "associated with mode and preceding filter description or encoder "
        "name\n"
        "-n | --devid                    device id.\n"
        "-v | --loglevel                 available debug level: warning, info, "
        "debug, trace.\n"
        "-m | --mode                     available mode: d2e, d2l, u2e, u2l. "
        "Type notation: d: Decode, e: Encode, u: Upload, l: Download.\n"
        "-h | --help                     print this help information.\n"
        "-t | --input-format             input format, default yuv420p.\n"
        "-r | --input-resolution         format: {width}x{height}, essential for "
        "u2l and u2e mode, e.g. 1280x720\n"
        "-c | --sync-mode                available sync mode: 1(shortest, "
        "default), "
        "0(not supported yet).\n");
}

// Calculate the number of all encoded frames: the sum of enc_frame_num over
// every encoder of every stack worker.
// Returns 0 when there are no stack workers. The former fallback branch ran
// only for stack_num == 0, multiplied by that zero, and still dereferenced
// stack_workers[0]; the result is unchanged but the access is gone.
static inline unsigned long cur_total_frames(void) {
    int i, j, total_frame = 0;

    if (!stack_num) {
        return 0;
    }

    for (i = 0; i < stack_num; i++) {
        for (j = 0; j < stack_workers[i]->nb_encoders; j++) {
            encoder_worker *enc_worker = stack_workers[i]->encoder_workers[j];
            total_frame += enc_worker->enc_frame_num;
        }
    }
    return total_frame;
}

/*
 * Emit a one-line progress report ("frame=... fps=...").
 *
 * Nothing is printed unless print_stat is set. Intermediate reports are rate
 * limited: the first call only records the time, and later calls are dropped
 * until at least 500000 time units (microseconds, given the 1000000.0 divisor
 * used for seconds) have passed since the last printed report. The final
 * report is always printed and ends with a newline instead of a carriage
 * return.
 *
 * is_last_report  non-zero for the final report
 * timer_start     start time, same unit as cur_time
 * cur_time        current time
 * frame_number    number of frames processed so far
 */
static void print_report(int is_last_report, int64_t timer_start,
                         int64_t cur_time, unsigned long frame_number) {
    static int64_t last_time = -1;
    char line[1024];
    float elapsed_sec;
    float fps;
    char terminator;

    if (!print_stat) {
        return;
    }

    if (!is_last_report) {
        const int is_first = (last_time == -1);
        const int too_soon = !is_first && (cur_time - last_time) < 500000;

        if (!too_soon) {
            last_time = cur_time;
        }
        if (is_first || too_soon) {
            return;
        }
    }

    elapsed_sec = (cur_time - timer_start) / 1000000.0;

    // no meaningful rate during the first second
    fps = 0;
    if (elapsed_sec > 1) {
        fps = frame_number / elapsed_sec;
    }

    terminator = is_last_report ? '\n' : '\r';

    // one decimal place while fps is below 9.95, none otherwise
    snprintf(line, sizeof(line), "frame=%5lu fps=%3.*f ", frame_number,
             (fps < 9.95), fps);

    // write straight to stderr when the log level would hide INFO messages
    if (ni_log_get_level() < NI_LOG_INFO) {
        fprintf(stderr, "%s   %c", line, terminator);
    } else {
        ni_log(NI_LOG_INFO, "%s   %c", line, terminator);
    }
    fflush(stderr);
}

// React to SIGINT: raise the global_stop flag and log the call at INFO level.
// Called from signal_handler(), which runs as an ordinary thread function, so
// this is not an asynchronous signal handler.
static void sigint_handler(void) {
    global_stop = 1;
    ni_log(NI_LOG_INFO, "%s().\n", __func__);
}

/*
 * Thread function that waits synchronously for signals and dispatches them.
 *
 * Polls sigtimedwait() on the given signal set with a 100ms timeout until
 * global_stop becomes non-zero. SIGINT is forwarded to sigint_handler();
 * SIGUSR1 sets need_reconfig. Any other outcome just re-enters the loop.
 *
 * arg  pointer to the sigset_t of signals to wait for
 *
 * Returns the last sigtimedwait() result cast to a pointer; a timeout
 * (EAGAIN) is recorded as 0.
 */
void *signal_handler(void *arg) {
    sigset_t *wait_set = arg;
    siginfo_t sig_info;
    struct timespec poll_interval;
    int rc = 0;
    int caught;

    poll_interval.tv_sec = 0;
    poll_interval.tv_nsec = 100000000; // 100ms timeout

    // loop to wait for signals and handle them
    while (!global_stop) {
        caught = -1;
        rc = sigtimedwait(wait_set, &sig_info, &poll_interval);

        if (rc > 0) {
            ni_log(NI_LOG_ERROR, "Received signal %d ret %d\n",
                   sig_info.si_signo, rc);
            caught = sig_info.si_signo;
        } else if (rc == 0) {
            ni_log(NI_LOG_ERROR, "sigtimedwait ret 0, should not happen ?\n");
        } else if (errno == EAGAIN) {
            // timeout without a pending signal: not an error
            ni_log(NI_LOG_DEBUG, "sigtimedwait EAGAIN !\n");
            rc = 0;
        } else {
            ni_log(NI_LOG_ERROR,
                   "Error sigtimedwait < 0, errno not EAGAIN: %d\n", errno);
        }

        if (caught == SIGINT) {
            sigint_handler();
            ni_log(NI_LOG_INFO, "sigint SIGINT !\n");
        } else if (caught == SIGUSR1) {
            need_reconfig = 1;
            ni_log(NI_LOG_INFO, "signal to reconfig xstack format\n");
            sigaddset(wait_set, SIGUSR1);
        }
        // anything else: nothing to dispatch, poll again
    }

    ni_log(NI_LOG_ERROR, "%s end.\n", __func__);
    return (void *)((long)rc);
}

int main(int argc, char *argv[]) {
    size_t i = 0;
    int mode = -1;
    uint8_t input_file_num = 0;
    uint8_t decoder_num = 0;
    uint8_t encoder_num = 0;
    uint8_t output_num = 0;
    upload_info upl_info[NI_MAX_XSTACK_INPUTS] = {0};

    stack_info_t stack_info[NI_MAX_XSTACK_NUM] = {0};
    bool need_decoder, need_encoder, need_uploader;
    need_decoder = need_encoder = need_uploader = false;

    char mode_description[128];
    char *input_files[NI_MAX_XSTACK_INPUTS]; // input video files, excluding
                                             // image overlay
    char *decoder_names[NI_MAX_XSTACK_INPUTS];
    char *decoder_params = NULL;
    char encConfXcoderParams[256] = {0};
    char filter_desc[NI_MAX_XSTACK_OUTPUTS][2048] = {0};
    char *ch = NULL;
    void *result;
    int width = 0;
    int height = 0;
    bool shortest = true;
    uint64_t loop = 0;
    ni_log_level_t log_level = NI_LOG_INFO;
    // Input arg handling
    int opt;
    int opt_index;
    const char *opt_string = "hi:d:p:o:e:x:s:n:f:m:v:l:r:t:c";
    static struct option long_options[] = {
        {"input", required_argument, NULL, 'i'},
        {"decoder", required_argument, NULL, 'd'},
        {"decoder-params", required_argument, NULL, 'p'},
        {"encoder", required_argument, NULL, 'e'},
        {"encoder-params", required_argument, NULL, 'x'},
        {"output", required_argument, NULL, 'o'},
        {"output-resolution", required_argument, NULL, 's'},
        {"devid", required_argument, NULL, 'n'},
        {"filter", required_argument, NULL, 'f'},
        {"input-resolution", required_argument, NULL, 'r'},
        {"input-format", required_argument, NULL, 't'},
        {"sync-mode", required_argument, NULL, 'c'},
        {"mode", required_argument, NULL, 'm'},
        {"loglevel", required_argument, NULL, 'v'},
        {"loop", required_argument, NULL, 'l'},
        {"help", no_argument, NULL, 'h'},
        {NULL, 0, NULL, 0},
    };

    while ((opt = getopt_long(argc, argv, opt_string, long_options,
                              &opt_index)) != -1) {
        switch (opt) {
        case 'i':
            if (input_file_num >= NI_MAX_XSTACK_INPUTS) {
                ni_log(NI_LOG_ERROR, "Error, exceeding max %d input files\n",
                       NI_MAX_XSTACK_INPUTS);
                help_usage();
                return EXIT_FAILURE;
            }
            input_files[input_file_num] = optarg;
            input_file_num++;
            break;
        case 'd':
            if (decoder_num >= NI_MAX_XSTACK_INPUTS) {
                ni_log(NI_LOG_ERROR, "Error, exceeding max %d decoders\n",
                       NI_MAX_XSTACK_INPUTS);
                help_usage();
                return EXIT_FAILURE;
            }
            decoder_names[decoder_num] = optarg;
            decoder_num++;
            break;
        case 'p':
            decoder_params = optarg;
            break;
        case 'e':
            encoder_num++;
            if (encoder_num > NI_MAX_XSTACK_OUTPUTS) {
                ni_log(NI_LOG_ERROR,
                       "Error, exceeding max %d encoders per stack\n",
                       NI_MAX_XSTACK_OUTPUTS);
                return EXIT_FAILURE;
            }
            if (stack_num < 1) {
                ni_log(NI_LOG_ERROR, "Error: should set encoder name after "
                                     "xstack descriptor!\n");
                break;
            }
            stack_info[stack_num - 1]
                .output_info[encoder_num - 1]
                .encoder_type = get_encoder_type(optarg);
            break;
        case 'o':
            output_num++;
            if (encoder_num > 0 && output_num != encoder_num) {
                ni_log(NI_LOG_ERROR,
                       "Error: should set output filename after encoder name, "
                       "skip current output filename %s\n",
                       optarg);
                return EXIT_FAILURE;
            }
            if (output_num > NI_MAX_XSTACK_OUTPUTS) {
                ni_log(NI_LOG_ERROR, "Error, exceeding max %d output files\n",
                       NI_MAX_XSTACK_OUTPUTS);
                help_usage();
                return EXIT_FAILURE;
            }
            if (stack_num < 1) {
                ni_log(NI_LOG_ERROR,
                       "Error: should set output filename after xstack "
                       "descriptor, skip current output filename %s\n",
                       optarg);
                help_usage();
                return EXIT_FAILURE;
            }
            stack_info[stack_num - 1].output_info[output_num - 1].output_name =
                optarg;
            stack_info[stack_num - 1].outputs_num = output_num;
            break;
        case 's':
            if (stack_num > NI_MAX_XSTACK_OUTPUTS) {
                ni_log(NI_LOG_ERROR, "Error, exceeding max %d output files\n",
                       NI_MAX_XSTACK_OUTPUTS);
                help_usage();
                return EXIT_FAILURE;
            }
            if (stack_num < 1) {
                ni_log(NI_LOG_ERROR,
                       "should to set resolution after stack descriptors, skip "
                       "current resolution %s\n",
                       optarg);
                return EXIT_FAILURE;
            }
            width = strtoul(optarg, &ch, 10);
            if (*ch != 'x') {
                ni_log(NI_LOG_ERROR, "invalid resolution format %s\n", optarg);
                return EXIT_FAILURE;
            }
            height = strtoul(ch + 1, NULL, 10);
            if (check_resolution(width, height) < 0) {
                ni_log(NI_LOG_ERROR, "invalid resolution value %s\n", optarg);
                return EXIT_FAILURE;
            }
            stack_info[stack_num - 1].out_width = width;
            stack_info[stack_num - 1].out_height = height;
            break;
        case 'x':
            strcpy(encConfXcoderParams, optarg);
            break;
        case 'r':
            if (input_file_num > NI_MAX_XSTACK_INPUTS) {
                ni_log(NI_LOG_ERROR,
                       "Error, exceeding max %d resolution files\n",
                       NI_MAX_XSTACK_INPUTS);
                help_usage();
                return EXIT_FAILURE;
            }
            if (input_file_num < 1) {
                ni_log(NI_LOG_INFO,
                       "should to set resolution after input file, skip "
                       "current resolution %s\n",
                       optarg);
                return EXIT_FAILURE;
            }
            if (upl_info[input_file_num - 1].specific_res) {
                ni_log(NI_LOG_INFO,
                       "current input resolution has been set, skip current "
                       "resolution %s\n",
                       optarg);
                return EXIT_FAILURE;
            }
            width = strtoul(optarg, &ch, 10);
            if (*ch != 'x') {
                ni_log(NI_LOG_ERROR, "invalid resolution format %s\n", optarg);
                return EXIT_FAILURE;
            }
            height = strtoul(ch + 1, NULL, 10);
            if (check_resolution(width, height) < 0) {
                ni_log(NI_LOG_ERROR, "invalid resolution value %s\n", optarg);
                return EXIT_FAILURE;
            }
            upl_info[input_file_num - 1].width = width;
            upl_info[input_file_num - 1].height = height;
            upl_info[input_file_num - 1].specific_res = true;
            break;
        case 't':
            // currently inputs of stack must be of the same format
            pix_fmt = get_pix_fmt_from_desc(optarg);
            if (pix_fmt < 0) {
                ni_log(NI_LOG_ERROR, "Unsupported format!\n");
                help_usage();
                return EXIT_FAILURE;
            } else {
                ni_log(NI_LOG_INFO, "input format is %d\n", pix_fmt);
            }
            break;
        case 'c':
            shortest = atoi(optarg);
            if (!shortest) {
                ni_log(NI_LOG_ERROR, "ERROR: now only support shortest=1\n");
                return EXIT_FAILURE;
            }
            break;
        case 'm':
            if (!(strlen(optarg) == 3)) {
                ni_log(NI_LOG_ERROR, "unrecognized args: %s for -m | --mode\n",
                       optarg);
                return EXIT_FAILURE;
            }
            for (i = 0; i < strlen(optarg); i++) {
                optarg[i] = (char)tolower((unsigned char)optarg[i]);
            }

            if (strcmp(optarg, "u2e") != 0 && strcmp(optarg, "u2l") != 0 &&
                strcmp(optarg, "d2e") != 0 && strcmp(optarg, "d2l") != 0) {
                ni_log(NI_LOG_ERROR, "unrecognized args: %s for -m | --mode\n",
                       optarg);
                return EXIT_FAILURE;
            }

            if (optarg[0] == 'd' && optarg[2] == 'e') {
                sprintf(mode_description, "Decode + Stack + Encode");
                mode = 0;
            } else if (optarg[0] == 'd' && optarg[2] == 'l') {
                sprintf(mode_description, "Decode + Stack + HW Download");
                mode = 1;
            } else if (optarg[0] == 'u' && optarg[2] == 'e') {
                sprintf(mode_description, "Upload + Stack + Encode");
                mode = 2;
            } else {
                sprintf(mode_description, "Upload + Stack + HW Download");
                mode = 3;
            }
            break;
        case 'l':
            loop = strtoul(optarg, NULL, 10);
            ni_log(NI_LOG_DEBUG, "loop: %llu\n", loop);
            break;
        case 'f':
            if (stack_num >= NI_MAX_XSTACK_OUTPUTS) {
                ni_log(NI_LOG_ERROR, "Error, exceeding max %d xstack\n",
                       NI_MAX_XSTACK_OUTPUTS);
                return EXIT_FAILURE;
            }
            strcpy(filter_desc[stack_num], optarg);
            stack_num++;
            output_num = 0;
            encoder_num = 0;
            break;
        case 'n':
            devid = atoi(optarg);
            break;
        case 'v':
            log_level = arg_to_ni_log_level(optarg);
            if (log_level != NI_LOG_INVALID) {
                ni_log_set_level(log_level);
            } else {
                help_usage();
                return EXIT_FAILURE;
            }
            break;
        case 'h':
            help_usage();
            return EXIT_SUCCESS;
        default:
            help_usage();
            return EXIT_FAILURE;
        }
    }

    // validate arguments
    if (mode < 0) {
        ni_log(NI_LOG_ERROR, "Error: missing -m mode description.\n");
        return EXIT_FAILURE;
    }
    if (!filter_desc[0]) {
        ni_log(NI_LOG_ERROR, "Error missing -f filter description.\n");
        return EXIT_FAILURE;
    }
    if (input_file_num < 2) {
        ni_log(NI_LOG_ERROR, "Error number of input files less than 2\n");
        return EXIT_FAILURE;
    }
    if (output_num < 1) {
        ni_log(NI_LOG_ERROR, "Error number of output files\n");
        return EXIT_FAILURE;
    }

    if (mode == XSTACK_APP_D2E || mode == XSTACK_APP_U2E) {
        need_encoder = true;
    }
    if (mode == XSTACK_APP_D2E || mode == XSTACK_APP_D2L) {
        if (decoder_num != input_file_num) {
            ni_log(NI_LOG_ERROR,
                   "Error number of input files %d not matching decoders %d.\n",
                   input_file_num, decoder_num);
            return EXIT_FAILURE;
        }
        need_decoder = true;
    }
    if (mode == XSTACK_APP_U2E || mode == XSTACK_APP_U2L)
        need_uploader = true;

    ni_log(NI_LOG_INFO, "%s\n", mode_description);

    int ret, nb_inputs;
    // retrieve filter parameters
    nb_inputs = input_file_num;
    ni_stack_item_t stack_items[NI_MAX_XSTACK_OUTPUTS][NI_MAX_XSTACK_INPUTS] = {
        0};
    for (i = 0; i < stack_num; i++) {
        ret = retrieve_filter_params(filter_desc[i], stack_items[i]);
        if (ret < 0) {
            ni_log(NI_LOG_ERROR,
                   "Error retrieving filter parameters from description %s.\n",
                   filter_desc);
            return EXIT_FAILURE;
        } else {
            if (ret != nb_inputs) {
                ni_log(
                    NI_LOG_ERROR,
                    "Error: filter parameter %d gives incorrect nb_inputs %d\n",
                    i, ret);
                return EXIT_FAILURE;
            }
        }
    }

    common_t *common = alloc_common();
    if (common == NULL) {
        ni_log(NI_LOG_ERROR, "failed to allocate common data.\n");
        global_state = EXIT_FAILURE;
        goto end;
    }
    common->total_dec_threads = input_file_num;
    common->total_upl_threads = input_file_num;
    common->total_xstack_threads = stack_num;
    if (need_encoder) {
        for (i = 0; i < stack_num; i++) {
            common->total_enc_threads += stack_info[i].outputs_num;
        }
    }
    common->shortest = shortest;

    ni_pthread_t sighandler_tid;
    sigset_t sig_set;
    // block SIGINT and SIGUSR1
    sigemptyset(&sig_set);
    sigaddset(&sig_set, SIGINT);
    sigaddset(&sig_set, SIGUSR1);
    ret = pthread_sigmask(SIG_BLOCK, &sig_set, NULL);
    if (ret) {
        ni_log(NI_LOG_ERROR, "sigmask block failed !\n");
        global_state = EXIT_FAILURE;
        goto end;
    }
    // create a thread to handle signals
    ret = ni_pthread_create(&sighandler_tid, NULL, signal_handler,
                            (void *)&sig_set);
    if (ret) {
        ni_log(NI_LOG_ERROR, "create sighandler thread failed !\n");
        global_state = EXIT_FAILURE;
        goto end;
    }

    if (need_decoder) {
        // alloc decoder_workers
        for (i = 0; i < input_file_num; i++) {
            decoder_workers[i] = malloc(sizeof(decoder_worker));
            if (decoder_workers[i] == NULL) {
                ni_log(NI_LOG_ERROR, "Error alloc decoder worker.\n");
                global_state = EXIT_FAILURE;
                goto end;
            }
            memset(decoder_workers[i], 0, sizeof(decoder_worker));
            active_decoder_workers++;
        }

        // run decoder threads
        for (i = 0; i < input_file_num; i++) {
            int ret = 0;
            int input_arg_pfs;
            decoder_worker *dec_worker = decoder_workers[i];
            dec_worker->index = i;
            dec_worker->common = common;

            dec_worker->file_reader.loop = loop;
            input_arg_pfs = open(input_files[i], O_RDONLY);
            if (input_arg_pfs < 0) {
                fprintf(stderr, "Error: cannot open %s\n", input_files[i]);
                fprintf(stderr, "Error: input file read failure\n");
                goto end;
            }
            dec_worker->pfs = input_arg_pfs;
            strcpy(dec_worker->filename, input_files[i]);

            dec_worker->codec_type = get_decoder_type(decoder_names[i]);
            if (dec_worker->codec_type < 0) {
                goto end;
            }

            if (decoder_params) {
                strcpy(dec_worker->decoder_params, decoder_params);
            }

            ret = init_dec_worker(dec_worker);
            if (ret) {
                ni_log(NI_LOG_ERROR, "failed init codec worker %d.\n", i);
                goto end;
            }

            // create send and receive threads for current decoder worker
            ret = dec_worker_thread_run(dec_worker);
            if (ret) {
                ni_log(NI_LOG_ERROR,
                       "failed to create send/receive thread for decoder "
                       "worker %d.\n",
                       i);
                goto end;
            }
        }
    }
    if (need_uploader) {
        // allocate upload worker
        for (i = 0; i < input_file_num; i++) {
            upload_workers[i] = malloc(sizeof(upload_worker));
            if (upload_workers[i] == NULL) {
                ni_log(NI_LOG_ERROR, "failed to allocate upload worker\n");
                global_state = EXIT_FAILURE;
                goto end;
            }
            memset(upload_workers[i], 0, sizeof(upload_worker));
            active_upload_workers++;
        }

        // run upload threads
        for (i = 0; i < input_file_num; i++) {
            ret = 0;
            int input_arg_pfs;
            upload_worker *upl_worker = upload_workers[i];
            upl_worker->index = i;
            upl_worker->common = common;

            upl_worker->file_reader.loop = loop;
            input_arg_pfs = open(input_files[i], O_RDONLY);
            if (input_arg_pfs < 0) {
                fprintf(stderr, "Error: cannot open %s\n", input_files[i]);
                fprintf(stderr, "Error: input file read failure\n");
                goto end;
            }
            upl_worker->pfs = input_arg_pfs;
            strcpy(upl_worker->filename, input_files[i]);
            upl_worker->in_width = upl_info[i].width;
            upl_worker->in_height = upl_info[i].height;
            upl_worker->pixel_format = pix_fmt;

            ret = init_upl_worker(upl_worker);
            if (ret) {
                ni_log(NI_LOG_ERROR, "Error: init_upl_worker failed\n");
                goto end;
            }

            ret = upl_worker_thread_run(upl_worker);
            if (ret) {
                ni_log(NI_LOG_ERROR, "Error: upl_worker_thread_run failed\n");
                goto end;
            }
        }
    }

    // alloc and run stack worker threads
    for (i = 0; i < stack_num; i++) {
        stack_workers[i] = malloc(sizeof(stack_worker_t));
        stack_worker_t *stack_worker = stack_workers[i];
        if (stack_worker == NULL) {
            ni_log(NI_LOG_ERROR, "Error alloc stack worker.\n");
            global_state = EXIT_FAILURE;
            goto end;
        }
        memset(stack_worker, 0, sizeof(stack_worker_t));
        if (!need_encoder) // open file for hw download
            stack_worker->out_fp =
                fopen(stack_info[i].output_info[0].output_name, "wb");
        else
            stack_worker->nb_encoders = stack_info[i].outputs_num;
        stack_worker->index = i;
        stack_worker->mode = mode;
        stack_worker->common = common;
        stack_worker->stack_items = stack_items[i];
        stack_worker->nb_inputs = input_file_num;
        stack_worker->pixel_format = pix_fmt;
        stack_worker->out_width = stack_info[i].out_width;   // output_width;
        stack_worker->out_height = stack_info[i].out_height; // output_height;
        if (need_decoder) {
            stack_worker->fps_num = decoder_workers[0]->fps_num;
            stack_worker->fps_den = decoder_workers[0]->fps_den;
        }
        ret = init_stack_worker(stack_worker);
        if (ret) {
            ni_log(NI_LOG_ERROR, "failed init stack worker %d, w %d h %d.\n",
                   stack_worker->index, stack_worker->out_width,
                   stack_worker->out_height);
            goto end;
        }

        if (need_encoder) {
            // alloc encoder worker
            for (int j = 0; j < stack_worker->nb_encoders; j++) {
                stack_worker->encoder_workers[j] =
                    malloc(sizeof(encoder_worker));
                encoder_worker *enc_worker = stack_worker->encoder_workers[j];
                if (enc_worker == NULL) {
                    ni_log(NI_LOG_ERROR, "Error alloc encoder worker.\n");
                    global_state = EXIT_FAILURE;
                    goto end;
                }
                memset(enc_worker, 0, sizeof(encoder_worker));
                active_encoder_workers++;
                // init encoder worker
                enc_worker->index = i * NI_MAX_XSTACK_OUTPUTS + j;
                enc_worker->stack_worker = stack_worker;
                enc_worker->out_fp =
                    fopen(stack_info[i].output_info[j].output_name, "wb");
                ni_log(NI_LOG_INFO, "Encoder output filename %s\n",
                       stack_info[i].output_info[j].output_name);
                enc_worker->input_width = stack_worker->out_width;
                enc_worker->input_height = stack_worker->out_height;
                enc_worker->fps_num = stack_worker->fps_num;
                enc_worker->fps_den = stack_worker->fps_den;
                enc_worker->codec_format =
                    stack_info[i].output_info[j].encoder_type;
                strcpy(enc_worker->encoder_params, encConfXcoderParams);
                enc_worker->common = common;
                if (init_enc_worker(enc_worker)) {
                    ni_log(NI_LOG_ERROR, "encoder_worker init failed\n");
                    goto end;
                }
            }

            // run encoder send/recv threads
            for (int j = 0; j < stack_worker->nb_encoders; j++) {
                ret = enc_worker_thread_run(stack_worker->encoder_workers[j]);
                if (ret) {
                    ni_log(NI_LOG_ERROR,
                           "failed to create enc_worker threads\n");
                    goto end;
                }
            }
        }

        // create work thread for stacking frames
        ret = xstack_thread_run(stack_worker);
        if (ret) {
            ni_log(NI_LOG_ERROR, "failed to create xstack thread.\n");
            goto end;
        }
    }

    int64_t timer_start = ni_get_utime();
    while (global_stop == 0) {
        if (mode == XSTACK_APP_D2L) {
            if (common->exit_dec_num == active_decoder_workers &&
                common->exit_stack_num == stack_num) {
                global_stop = 1;
                ni_log(NI_LOG_INFO, "main thread is going to exit.\n");
                break;
            }
        } else if (mode == XSTACK_APP_D2E) {
            if (common->exit_dec_num == active_decoder_workers &&
                common->exit_enc_num == active_encoder_workers) {
                global_stop = 1;
                ni_log(NI_LOG_INFO, "main thread is going to exit.\n");
                break;
            }
        } else if (mode == XSTACK_APP_U2E) {
            if (common->exit_upl_num == active_upload_workers &&
                common->exit_enc_num == active_encoder_workers) {
                global_stop = 1;
                ni_log(NI_LOG_INFO, "main thread is going to exit.\n");
                break;
            }
        } else if (mode == XSTACK_APP_U2L) {
            if (common->exit_upl_num == active_upload_workers &&
                common->exit_stack_num == stack_num) {
                global_stop = 1;
                ni_log(NI_LOG_INFO, "main thread is going to exit.\n");
                break;
            }
        }
        sleep(1);

        print_report(0, timer_start, ni_get_utime(), cur_total_frames());
    }
    print_report(1, timer_start, ni_get_utime(), cur_total_frames());

    if (ni_pthread_join(sighandler_tid, &result) == 0) {
        if ((long)result != 0) {
            ni_log(NI_LOG_ERROR, "pthread_join sighandler ret %ld\n",
                   (long)result);
            global_state = EXIT_FAILURE;
        }
    }

end:
    ni_log(NI_LOG_INFO, "come to end!\n");

    // free active encoder workers and stack workers first
    for (i = 0; i < stack_num; i++) {
        free_stack_worker(stack_workers[i]);
        stack_workers[i] = NULL;
    }

    for (i = 0; i < active_upload_workers; i++) {
        free_upload_worker(upload_workers[i]);
        upload_workers[i] = NULL;
    }

    // free active_decoder_workers
    for (i = 0; i < active_decoder_workers; i++) {
        free_decoder_worker(decoder_workers[i]);
        decoder_workers[i] = NULL;
    }

    free_common(common);
    ni_log(NI_LOG_INFO, "EXIT.. state=%d.\n", global_state);
    return global_state;
}
