#include <sys/mman.h>

#include <fcntl.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/dma-buf.h>
#include <sys/ioctl.h>
#include <gbm.h>

#include <string.h>
#include <stdio.h>

#include <atomic>
#include <climits>
#include <iostream>
#include <string>
#include <vector>
#include <unordered_set>
#include <unordered_map>

#include <EGL/egl.h>
#include <EGL/eglext.h>
#include <GLES2/gl2.h>
#include <GLES2/gl2ext.h>

#include <GLES3/gl3.h>
#include <GLES3/gl3ext.h>

#include <ni_p2p_ioctl.h>
#include <ni_device_api.h>
#include <ni_av_codec.h>
#include <ni_util.h>
#include <ni_rsrc_api.h>

#include "ni_block_frame_set.h"

/*!****************************************************************************
 *  \brief  Hand a P2P hardware frame back to the Quadra device
 *
 *  \param [in] p2p_frame - hw frame to recycle
 *
 *  \return  The code reported by ni_hwframe_p2p_buffer_recycle():
 *           NI_RETCODE_SUCCESS on success, an error code otherwise
 *******************************************************************************/
int recycle_frame(ni_frame_t *p2p_frame)
{
    const ni_retcode_t status = ni_hwframe_p2p_buffer_recycle(p2p_frame);

    // Failure is only reported here; the caller decides what to do with it
    if (NI_RETCODE_SUCCESS != status)
    {
        fprintf(stderr, "Recycle failed\n");
    }

    return status;
}

/*!****************************************************************************
 * \brief   Import a dma buf to a Quadra device
 *
 * \param  [in]  p_session  - upload session to the Quadra device
 *         [in]  dma_buf_fd - file descriptor of the dma buf to import
 *         [in]  frame_size - frame size in bytes
 *         [out] dma_addr   - scatter-gather list filled with the DMA lengths
 *                            and addresses reported by the driver; only
 *                            written when the ioctl succeeds
 *
 * \return Returns the ioctl() result: 0 on success, -1 otherwise
 ******************************************************************************/
static int import_dma_buf(
    ni_session_context_t *p_session,
    int dma_buf_fd,
    unsigned long frame_size,
    ni_p2p_sgl_t *dma_addr)
{
    struct netint_iocmd_import_dmabuf uimp;
    int ret, i;

    // Zero the whole request first so that no uninitialised stack bytes
    // (e.g. nents and the unused dma_len/dma_addr slots) reach the kernel.
    memset(&uimp, 0, sizeof(uimp));

    uimp.fd = dma_buf_fd;
    uimp.flags = 0; // import
    uimp.domain = p_session->domain;
    uimp.bus = p_session->bus;
    uimp.dev = p_session->dev;
    uimp.fn = p_session->fn;

    // Pass frame size to kernel driver. Only necessary if the kernel
    // driver has been specially compiled for customer A1. Otherwise,
    // this can be skipped.
    uimp.dma_len[0] = frame_size;

    ret = ioctl(p_session->netint_fd, NETINT_IOCTL_IMPORT_DMABUF, &uimp);

    if (ret == 0)
    {
        // Copy every scatter-gather entry the driver returned.
        // NOTE(review): assumes uimp.nents never exceeds the capacity of
        // the ni_p2p_sgl_t arrays - confirm against the driver headers.
        for (i = 0; i < uimp.nents; i++)
        {
            dma_addr->ui32DMALen[i] = uimp.dma_len[i];
            dma_addr->ui64DMAAddr[i] = uimp.dma_addr[i];
        }
        dma_addr->ui32NumEntries = uimp.nents;
    }

    return ret;
}

/*!****************************************************************************
 * \brief   Unimport a dma buf from a Quadra device
 *
 * \param  [in] p_session  - upload session to the Quadra device
 *         [in] dma_buf_fd - file descriptor of the dma buf to unimport
 *
 * \return Returns the ioctl() result: 0 on success, -1 otherwise
 ******************************************************************************/
static int unimport_dma_buf(
    ni_session_context_t *p_session,
    int dma_buf_fd)
{
    struct netint_iocmd_import_dmabuf uimp;

    // Zero the whole request so no uninitialised stack bytes reach the kernel
    memset(&uimp, 0, sizeof(uimp));

    uimp.fd = dma_buf_fd;
    uimp.flags = 1; // unimport
    uimp.domain = p_session->domain;
    uimp.bus = p_session->bus;
    uimp.dev = p_session->dev;
    uimp.fn = p_session->fn;

    // The same ioctl number is used for import and unimport; flags selects which
    return ioctl(p_session->netint_fd, NETINT_IOCTL_IMPORT_DMABUF, &uimp);
}

/*!****************************************************************************
 *  \brief  Set up a hardware frame for the encoding Quadra device
 *
 *  Clears the per-frame stream flags, allocates the hw-encoder frame buffer
 *  and (on non-Windows builds) acquires the frame from the upload session.
 *  If either step fails the frame buffer is freed before returning.
 *
 *  \param [in] p_upl_ctx           pointer to caller allocated upload
 *                                  session context
 *         [in] input_video_width   video width
 *         [in] input_video_height  video height
 *         [out] p2p_frame          p2p frame
 *
 *  \return  0  on success
 *          -1  on error
 ******************************************************************************/
int enc_prepare_frame(ni_session_context_t *p_upl_ctx, int input_video_width,
                      int input_video_height, ni_frame_t *p2p_frame)
{
    // Start from a neutral frame: no stream markers, no extra data
    p2p_frame->start_of_stream = 0;
    p2p_frame->end_of_stream = 0;
    p2p_frame->force_key_frame = 0;
    p2p_frame->extra_data_len = 0;

    // Allocate a hardware ni_frame structure for the encoder
    if (NI_RETCODE_SUCCESS !=
        ni_frame_buffer_alloc_hwenc(p2p_frame, input_video_width,
                                    input_video_height,
                                    (int)p2p_frame->extra_data_len))
    {
        fprintf(stderr, "Error: could not allocate hw frame buffer!\n");
        ni_frame_buffer_free(p2p_frame);
        return -1;
    }

#ifndef _WIN32
    // Any non-zero result from the acquire call is treated as failure
    if (ni_device_session_acquire_for_read(p_upl_ctx, p2p_frame))
    {
        fprintf(stderr, "Error: failed ni_device_session_acquire()\n");
        ni_frame_buffer_free(p2p_frame);
        return -1;
    }
#endif

    return 0;
}

/*!****************************************************************************
 *  \brief  Send the Quadra encoder a hardware frame which triggers
 *          Quadra to encode the frame
 *
 *  A function-local static remembers whether a frame has been prepared
 *  before, so only the first prepared frame is flagged start-of-stream.
 *
 *  \param  [in] p_enc_ctx              pointer to encoder context
 *          [in] p_in_frame             pointer to hw frame
 *          [in] input_exhausted        flag indicating this is the last frame
 *          [in/out] need_to_resend     non-zero on entry: the frame is sent
 *                                      again without touching its flags;
 *                                      on exit: 1 if the frame must be sent
 *                                      again, 0 if it was accepted
 *          [in/out] enc_eos_sent       1 on entry: nothing is sent;
 *                                      set to 1 once a write succeeded and
 *                                      the context reports ready_to_close
 *
 *  \return  0 on success (including "nothing sent, retry later")
 *          -1 on failure
 ******************************************************************************/
int encoder_encode_frame(ni_session_context_t *p_enc_ctx,
                         ni_frame_t *p_in_frame, int input_exhausted,
                         int &need_to_resend, int &enc_eos_sent)
{
    static int started = 0;
    ni_session_data_io_t in_data;

    ni_log2(p_enc_ctx, NI_LOG_DEBUG, "===> encoder_encode_frame <===\n");

    if (enc_eos_sent == 1)
    {
        ni_log2(p_enc_ctx, NI_LOG_DEBUG, "encoder_encode_frame: ALL data (incl. eos) sent "
                                         "already!\n");
        return 0;
    }

    // A frame that is being re-sent keeps the flags from its first attempt
    if (!need_to_resend)
    {
        // Only the very first frame prepared here is marked start-of-stream
        if (started)
        {
            p_in_frame->start_of_stream = 0;
        }
        else
        {
            started = 1;
            p_in_frame->start_of_stream = 1;
        }

        // If this is the last frame, mark the frame as end-of-stream
        p_in_frame->end_of_stream = input_exhausted ? 1 : 0;
        p_in_frame->force_key_frame = 0;
    }

    in_data.data.frame = *p_in_frame;
    const int sent =
        ni_device_session_write(p_enc_ctx, &in_data, NI_DEVICE_TYPE_ENCODER);

    if (sent < 0)
    {
        fprintf(stderr,
                "Error: failed ni_device_session_write() for encoder\n");
        need_to_resend = 1;
        return -1;
    }

    if (sent == 0 && !p_enc_ctx->ready_to_close)
    {
        // Nothing was accepted and the session is not closing: try again later
        need_to_resend = 1;
        ni_log2(p_enc_ctx, NI_LOG_DEBUG, "NEEDED TO RESEND");
        return 0;
    }

    need_to_resend = 0;

    ni_log2(p_enc_ctx, NI_LOG_DEBUG, "encoder_encode_frame: total sent data size=%u\n",
            p_in_frame->data_len[3]);

    ni_log2(p_enc_ctx, NI_LOG_DEBUG, "encoder_encode_frame: success\n");

    if (p_enc_ctx->ready_to_close)
    {
        enc_eos_sent = 1;
    }

    return 0;
}

/*!****************************************************************************
 *  \brief  Receive output packet data from the Quadra encoder and write the
 *          payload (everything after the metadata) to a file
 *
 *  The first successful read is the stream header; a function-local static
 *  remembers that it has been received, so later calls go straight to
 *  reading packets.
 *
 *  \param  [in] p_enc_ctx              pointer to encoder session context
 *          [in] p_out_data             pointer to output data session
 *          [in] p_file                 pointer to file to write the packet
 *
 *  \return 0 - success got packet (also returned when the encoder session
 *              is not opened yet)
 *          1 - received eos
 *          2 - got nothing, need retry
 *         -1 - failure
 ******************************************************************************/
int encoder_receive_data(ni_session_context_t *p_enc_ctx,
                         ni_session_data_io_t *p_out_data, FILE *p_file)
{
    int packet_size = NI_MAX_TX_SZ;
    int rc = 0;
    int end_flag = 0;
    int rx_size = 0;
    // Number of leading metadata bytes in each packet; skipped when writing
    int meta_size = p_enc_ctx->meta_size;
    ni_packet_t *p_out_pkt = &(p_out_data->data.packet);
    // Persists across calls: set once the stream header has been written out
    static int received_stream_header = 0;

    ni_log2(p_enc_ctx, NI_LOG_DEBUG, "===> encoder_receive_data <===\n");

    // Nothing to read until the session exists; this is not reported as an error
    if (NI_INVALID_SESSION_ID == p_enc_ctx->session_id ||
        NI_INVALID_DEVICE_HANDLE == p_enc_ctx->blk_io_handle)
    {
        ni_log2(p_enc_ctx, NI_LOG_DEBUG, "encode session not opened yet, return\n");
        return 0;
    }

    if (p_file == NULL)
    {
        ni_log2(p_enc_ctx, NI_LOG_ERROR, "Bad file pointer, return\n");
        return -1;
    }

    rc = ni_packet_buffer_alloc(p_out_pkt, packet_size);
    if (rc != NI_RETCODE_SUCCESS)
    {
        fprintf(stderr, "Error: malloc packet failed, ret = %d!\n", rc);
        return -1;
    }

    /*
     * The first data read from the encoder session context
     * is a stream header read.
     */
    if (!received_stream_header)
    {
        /* Read the encoded stream header */
        rc = ni_encoder_session_read_stream_header(p_enc_ctx, p_out_data);

        if (rc > 0)
        {
            /* Write out the stream header */
            /* A failed write is only logged; the header is still marked received */
            if (fwrite((uint8_t *)p_out_pkt->p_data + meta_size,
                       p_out_pkt->data_len - meta_size, 1, p_file) != 1)
            {
                fprintf(stderr, "Error: writing data %u bytes error!\n",
                        p_out_pkt->data_len - meta_size);
                fprintf(stderr, "Error: ferror rc = %d\n", ferror(p_file));
            }
            received_stream_header = 1;
        }
        else if (rc != 0)
        {
            fprintf(stderr, "Error: reading header %d\n", rc);
            return -1;
        }

        /* This shouldn't happen */
        if (p_out_pkt->end_of_stream)
        {
            return 1;
        }
        else if (rc == 0)
        {
            /* No header available yet: ask the caller to retry */
            return 2;
        }
    }

receive_data:
    rc = ni_device_session_read(p_enc_ctx, p_out_data, NI_DEVICE_TYPE_ENCODER);

    end_flag = p_out_pkt->end_of_stream;
    rx_size = rc;

    ni_log2(p_enc_ctx, NI_LOG_DEBUG, "encoder_receive_data: received data size=%d\n", rx_size);

    if (rx_size > meta_size)
    {
        // Payload present: write everything after the metadata.
        // A failed write is logged but does not change the return value.
        if (fwrite((uint8_t *)p_out_pkt->p_data + meta_size,
                   p_out_pkt->data_len - meta_size, 1, p_file) != 1)
        {
            fprintf(stderr, "Error: writing data %u bytes error!\n",
                    p_out_pkt->data_len - meta_size);
            fprintf(stderr, "Error: ferror rc = %d\n", ferror(p_file));
        }
    }
    else if (rx_size != 0)
    {
        // Non-zero but no larger than the metadata (includes negative results)
        fprintf(stderr, "Error: received %d bytes, <= metadata size %d!\n",
                rx_size, meta_size);
        return -1;
    }
    else if (!end_flag &&
             (((ni_xcoder_params_t *)(p_enc_ctx->p_session_config))
                  ->low_delay_mode))
    {
        // In low delay mode an empty read is retried immediately (busy loop)
        ni_log2(p_enc_ctx, NI_LOG_DEBUG, "low delay mode and NO pkt, keep reading...\n");
        goto receive_data;
    }

    if (end_flag)
    {
        printf("Encoder Receiving done\n");
        return 1;
    }
    else if (0 == rx_size)
    {
        return 2;
    }

    ni_log2(p_enc_ctx, NI_LOG_DEBUG, "encoder_receive_data: success\n");

    return 0;
}

/*!****************************************************************************
 *  \brief  Configure an encoder context for hardware-frame input and open
 *          an encoder session to Quadra
 *
 *  \param  [out] p_enc_ctx         pointer to an encoder session context
 *          [in]  dst_codec_format  AVC or HEVC
 *          [in]  iXcoderGUID       id to identify the Quadra device
 *          [in]  p_enc_params      sets the encoder parameters; hwframes and
 *                                  p_first_frame are overwritten here
 *          [in]  width             width of frames to encode
 *          [in]  height            height of frames to encode
 *          [in]  p_frame           first frame handed to the encoder
 *          [in]  pix_fmt           pixel format stored in the context
 *
 *  \return 0 if successful, < 0 otherwise
 ******************************************************************************/
int encoder_open_session(ni_session_context_t *p_enc_ctx, int dst_codec_format,
                         int iXcoderGUID, ni_xcoder_params_t *p_enc_params,
                         int width, int height, ni_frame_t *p_frame, ni_pix_fmt_t pix_fmt)
{
    // Hardware frame encoding: frames arrive as device-side buffers
    p_enc_ctx->hw_action = NI_CODEC_HW_ENABLE;
    p_enc_params->hwframes = 1;

    // The encoder is given the first frame up front
    p_enc_params->p_first_frame = p_frame;

    // Codec selection, AVC vs HEVC
    p_enc_ctx->codec_format = dst_codec_format;

    p_enc_ctx->p_session_config = p_enc_params;
    p_enc_ctx->session_id = NI_INVALID_SESSION_ID;

    // Handles start out invalid; hw_id selects the specific Quadra device
    p_enc_ctx->device_handle = NI_INVALID_DEVICE_HANDLE;
    p_enc_ctx->blk_io_handle = NI_INVALID_DEVICE_HANDLE;
    p_enc_ctx->hw_id = iXcoderGUID;

    p_enc_ctx->pixel_format = pix_fmt;

    ni_encoder_set_input_frame_format(p_enc_ctx, p_enc_params, width, height, 8,
                                      NI_FRAME_LITTLE_ENDIAN, 1);

    // Encoder will operate in P2P mode
    const int rc = ni_device_session_open(p_enc_ctx, NI_DEVICE_TYPE_ENCODER);
    if (rc != NI_RETCODE_SUCCESS)
    {
        fprintf(stderr, "Error: encoder open session failure\n");
        return rc;
    }

    printf("Encoder device %d session open successful\n", iXcoderGUID);
    return rc;
}

/*!****************************************************************************
 *  \brief  Open an upload session to Quadra and create its frame pool
 *
 *  \param  [out]    p_upl_ctx   pointer to an upload context of the open session
 *          [in/out] iXcoderGUID pointer to Quadra card hw id; on success it is
 *                               overwritten with the hw_id stored in the
 *                               context after the session was opened
 *          [in]     width       width of the frames
 *          [in]     height      height of the frames
 *          [in]     p2p         p2p session
 *          [in]     pix_fmt     pixel format of the uploaded frames
 *
 *  \return 0 if successful, < 0 otherwise. If the frame pool cannot be
 *          created the session is closed again before returning.
 ******************************************************************************/
int uploader_open_session(ni_session_context_t *p_upl_ctx, int *iXcoderGUID,
                          int width, int height, int p2p, ni_pix_fmt_t pix_fmt)
{
    int ret = 0;

    p_upl_ctx->session_id = NI_INVALID_SESSION_ID;

    // Handles start out invalid until the session is opened
    p_upl_ctx->device_handle = NI_INVALID_DEVICE_HANDLE;
    p_upl_ctx->blk_io_handle = NI_INVALID_DEVICE_HANDLE;

    // Assign the card id to specify the specific Quadra device
    p_upl_ctx->hw_id = *iXcoderGUID;

    // Set the input frame format of the upload session
    ni_uploader_set_frame_format(p_upl_ctx, width, height, pix_fmt, 1);

    ret = ni_device_session_open(p_upl_ctx, NI_DEVICE_TYPE_UPLOAD);
    if (ret != NI_RETCODE_SUCCESS)
    {
        fprintf(stderr, "Error: uploader_open_session failure!\n");
        return ret;
    }

    // Publish the id held by the context BEFORE printing it, so the message
    // names the device that was actually opened rather than the requested id.
    *iXcoderGUID = p_upl_ctx->hw_id;
    printf("Uploader device %d session opened successfully\n", *iXcoderGUID);

    // Create a P2P frame pool for the uploader session of pool size 1
    ret = ni_device_session_init_framepool(p_upl_ctx, 1, p2p);
    if (ret < 0)
    {
        fprintf(stderr, "Error: Can't create frame pool\n");
        ni_device_session_close(p_upl_ctx, 1, NI_DEVICE_TYPE_UPLOAD);
    }
    else
    {
        printf("Uploader device %d configured successfully\n", *iXcoderGUID);
    }

    return ret;
}

/*
 * Tear down the EGL objects created by this program: the sync object, the
 * context, and finally the display connection.
 *
 * Every step is skipped when its handle is the corresponding EGL_NO_* value
 * (or the extension entry point is null), so this can be called from error
 * paths where only part of the setup succeeded.
 */
static void release_egl_resource(EGLDisplay egl_display, EGLContext egl_context, EGLSyncKHR sync, PFNEGLDESTROYSYNCKHRPROC eglDestroySyncKHR)
{
    // Without a display there is nothing that can be destroyed or terminated
    if (egl_display == EGL_NO_DISPLAY)
    {
        return;
    }

    // eglDestroySyncKHR is an extension pointer and may not have been resolved
    if (eglDestroySyncKHR != nullptr && sync != EGL_NO_SYNC_KHR)
    {
        eglDestroySyncKHR(egl_display, sync);
    }

    if (egl_context != EGL_NO_CONTEXT)
    {
        eglDestroyContext(egl_display, egl_context);
    }

    eglTerminate(egl_display);
}

/*
 * Destroy the GBM device (if any) and close the DRM file descriptor.
 *
 * drm_fd is closed only when it is strictly positive: a negative value
 * (e.g. the -1 left by a failed open()) is not a descriptor, and 0 keeps
 * being treated as "not opened", exactly as before.
 */
static void release_gbm_resource(int drm_fd, gbm_device *gbm)
{
    // The GBM device is destroyed before the fd it was handed is closed
    if (gbm)
    {
        gbm_device_destroy(gbm);
    }
    if (drm_fd > 0)
    {
        close(drm_fd);
    }
}

// just for checking the result
//
// Clears the framebuffer `fbo` with three horizontal colour stripes and then
// waits for the GL work to complete. Every UPDATE_DRAW_NUMBER calls the three
// colours are rotated by one position so the output visibly changes.
//
//   w, h     size of the render target in pixels
//   texture  texture bound to GL_TEXTURE_2D before drawing
//   fbo      framebuffer object that receives the clears
static void draw(const int w, const int h, GLuint texture, GLuint fbo)
{
#define UPDATE_DRAW_NUMBER 30
    // Function-local statics: call counter and the current stripe colours
    static int count = 0;
    static std::vector<std::vector<double>> color{
        {1.0, 0.7, 0.2, 1.0},
        {0.2, 1.0, 0.7, 1.0},
        {0.7, 0.2, 1.0, 1.0},
    }; // RGBA

    ++count;

    if (count % UPDATE_DRAW_NUMBER == 0)
    {
        // Rotate right by one: the last colour moves to the front
        auto back = color[color.size() - 1];
        for (int i = color.size() - 1; i > 0; --i)
        {
            color[i] = color[i - 1];
        }
        color[0] = back;
    }

    glBindTexture(GL_TEXTURE_2D, texture);
    glBindFramebuffer(GL_FRAMEBUFFER, fbo);

#if 1
    glEnable(GL_SCISSOR_TEST);

    for (int i = 0; i < 3; ++i)
    {
        // Stripe i starts at row i*h/3. The last stripe takes whatever
        // height remains (h - i*h/3) so the three stripes cover exactly h
        // rows even when h is not divisible by 3.
        glScissor(0, i * h / 3, w, ((i + 1 == 3) ? h - i * h / 3 : h / 3));
        glViewport(0, 0, w, h);
        glClearColor(color[i][0], color[i][1], color[i][2], color[i][3]);
        glClear(GL_COLOR_BUFFER_BIT);
    }

    glDisable(GL_SCISSOR_TEST);
#else

    glViewport(0, 0, w, h);
    glClearColor(color[0][0], color[0][1], color[0][2], color[0][3]);
    glClear(GL_COLOR_BUFFER_BIT);

#endif

    // Block until the clears have been executed before anyone reads the buffer
    glFinish();

    eglWaitGL();
    eglWaitNative(EGL_CORE_NATIVE_ENGINE);

    return;
}

#if __cplusplus > 201703L
/*
 * Debug helper: dump a w x h RGBA buffer as text.
 *
 * Each byte is printed as a decimal number followed by a space, with a line
 * break before every 16 bytes (4 pixels). Bytes are read through `char`, so
 * on platforms where char is signed, values above 127 are printed negative.
 *
 * Nothing is written when a dimension is not positive or a pointer is null;
 * a message is printed to stderr instead.
 */
[[maybe_unused]] static void write_RGBA(FILE *file, const int w, const int h, void *addr)
{
    if (w <= 0 || h <= 0 || addr == nullptr || file == nullptr)
    {
        fprintf(stderr, "Failed to write_RGBA\n");
        // Bail out: continuing would dereference a null pointer
        return;
    }

    const char *bytes = static_cast<const char *>(addr);

    for (int i = 0; i < w * h * 4; ++i)
    {
        // 4 pixels x 4 channels per output line
        if (i % (4 * 4) == 0)
        {
            fprintf(file, "\n");
        }
        fprintf(file, "%d ", (int)(bytes[i]));
    }
}

[[maybe_unused]] static int try_to_lock_fd(int dma_buf_fd)
{
    int ret = 0;
    struct dma_buf_sync sync_end = {
        .flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_RW,
    };

    ret = ioctl(dma_buf_fd, DMA_BUF_IOCTL_SYNC, &sync_end);

    if (ret < 0)
    {
        perror("Warning: failed to lock fd. Operate: DMA_BUF_IOCTL_SYNC\n");
    }

    return ret;
}

/*
 * End a CPU access window on a dma-buf (DMA_BUF_IOCTL_SYNC with
 * DMA_BUF_SYNC_END | DMA_BUF_SYNC_RW).
 *
 * Returns the ioctl() result: 0 on success, negative on failure (a warning
 * is printed with perror in that case).
 */
[[maybe_unused]] static int unlock_fd(int dma_buf_fd)
{
    int ret = 0;
    struct dma_buf_sync sync_end = {
        .flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_RW,
    };

    ret = ioctl(dma_buf_fd, DMA_BUF_IOCTL_SYNC, &sync_end);

    if (ret < 0)
    {
        // No trailing newline: perror appends ": <errno text>\n" itself
        perror("Warning: failed to unlock fd. Operate: DMA_BUF_IOCTL_SYNC");
    }

    return ret;
}
#endif

static void checkEGLError(const char *msg)
{
    EGLint error = eglGetError();
    if (error != EGL_SUCCESS)
    {
        std::cerr << msg << ": EGL error 0x" << std::hex << error << std::endl;
        exit(1);
    }
}

// Throw std::runtime_error(t) when condition x is true.
// Wrapped in do { } while (0) so the macro behaves as a single statement
// (safe inside an unbraced if/else); invoke it with a trailing semicolon.
#define CHECK(x, t)                      \
    do                                   \
    {                                    \
        if ((x))                         \
        {                                \
            throw std::runtime_error(t); \
        }                                \
    } while (0)

// One GPU-side render target together with the handles needed to draw into
// it and to release it again. The defaults mark every handle as "not set".
struct GPU_picture
{
    gbm_bo *bo = nullptr;                      // GBM buffer object; destroyed with gbm_bo_destroy on release
    EGLImageKHR egl_image = EGL_NO_IMAGE_KHR;  // EGL image handle; destroyed with eglDestroyImageKHR on release
    GLuint texture = 0;                        // GL texture name bound when drawing (0 = none)
    GLuint fbo = 0;                            // GL framebuffer object drawn into (0 = none)
    int poc = -1; // display order
    int fd = -1;                               // dma-buf file descriptor; closed on release (negative = none)
};

/*
 * Release the GPU resources held by every GPU_picture in the buffer.
 *
 * GL/EGL objects (framebuffer, texture, EGL image) are only destroyed when
 * egl_display is a real display; the dma-buf fd and the GBM buffer object
 * are released regardless. Each handle is reset to its "not set" value
 * after release, so calling this twice on the same buffer is harmless.
 *
 * A null gpu_picture_buffer is accepted and ignored.
 */
static void release_GPU_pictures(std::shared_ptr<std::vector<GPU_picture>> gpu_picture_buffer, EGLDisplay egl_display, PFNEGLDESTROYIMAGEKHRPROC eglDestroyImageKHR)
{
    if (!gpu_picture_buffer)
    {
        return;
    }

    for (auto &item : *gpu_picture_buffer)
    {
        if (egl_display != EGL_NO_DISPLAY)
        {
            if (item.fbo != 0)
            {
                glDeleteFramebuffers(1, &item.fbo);
                item.fbo = 0;
            }

            if (item.texture != 0)
            {
                glDeleteTextures(1, &item.texture);
                item.texture = 0;
            }

            // eglDestroyImageKHR is an extension pointer and may be null
            if (eglDestroyImageKHR && item.egl_image != EGL_NO_IMAGE_KHR)
            {
                eglDestroyImageKHR(egl_display, item.egl_image);
                item.egl_image = EGL_NO_IMAGE_KHR;
            }
        }

        if (item.fd >= 0)
        {
            close(item.fd);
            item.fd = -1;
        }

        if (item.bo)
        {
            gbm_bo_destroy(item.bo);
            item.bo = nullptr;
        }
    }
}

// Fixed-capacity ring over a shared vector of GPU_picture.
//
// Slots are handed out at `head` (get_and_advance_head) and given back at
// `tail` (free_tail), i.e. in FIFO order. The capacity is the size of the
// vector at construction time. Misuse (taking from a full ring, reading or
// freeing from an empty one) throws std::runtime_error via CHECK.
class Circular_GPU_picture
{
public:
    Circular_GPU_picture(std::shared_ptr<std::vector<GPU_picture>> vec)
        : GPU_picture_buffer(vec),
          head(0),
          tail(0),
          used_size(0),
          size(GPU_picture_buffer->size())
    {
    }

    // True when no slot is currently handed out
    bool empty() const { return used_size == 0; }

    // True when every slot is currently handed out
    bool full() const { return used_size == size; }

    // Oldest slot still in use; the ring is left unchanged
    GPU_picture &get_tail()
    {
        CHECK(empty(), "get tail from empty queue\n");
        return GPU_picture_buffer->operator[](tail);
    }

    // Give the oldest in-use slot back to the ring
    void free_tail()
    {
        CHECK(empty(), "free tail from empty queue\n");
        --used_size;
        tail = (tail + 1) % size;
    }

    // Hand out the next free slot and mark it as in use
    GPU_picture &get_and_advance_head()
    {
        CHECK(full(), "get and advance head from full queue\n");

        const int slot = head;
        head = (head + 1) % size;
        ++used_size;

        return GPU_picture_buffer->operator[](slot);
    }

private:
    std::shared_ptr<std::vector<GPU_picture>> GPU_picture_buffer;
    GPU_picture invalid_item;
    int head = -1;      // index of the next slot to hand out
    int tail = -1;      // index of the oldest slot still in use
    int used_size = -1; // number of slots currently handed out
    int size = -1;      // capacity of the ring
};


/*

The synchronization between threads is primarily achieved through these three blocking queues. 
The push and get operations on the blocking queues are controlled using mutex and condition variables.

    ni_block_frame_set draw_set;
    ni_block_frame_set p2p_set;
    ni_block_frame_set encode_set;

These three blocking queues are associated with three threads:

    draw_set --> draw_thread --> p2p_set
    p2p_set --> encoder_send_thread --> encode_set
    encode_set --> encoder_receive_thread --> draw_set

The queues store struct objects of the type struct ni_frame_stream_t, which is defined as follows:

struct ni_frame_stream_t
{
    // Currently, one p2p frame is allocated in an upload instance
    ni_session_context_t *p_upl_ctx = nullptr;

    // frame associated with p_upl_ctx
    ni_frame_t *frame = nullptr;

    // flag indicating the end of input, used to control the program's termination
    bool input_end = false;

    // display order
    int poc = -1;

    // information returned by encoder indicating if a resend is needed
    bool need_to_resend = false;

    // DMA address to read from
    ni_p2p_sgl_t *p_dma_address;
};

Objects are transferred between different threads using a producer-consumer model to achieve thread synchronization.

Main workflow: 
Initially, draw_set contains two ni_frame_stream_t objects (frame_stream), with the p_upl_ctx and frame information already filled.
At the same time, there are two buffer_gpu in the graphics card for rendering.

draw_thread: Draws images and fills in information. 
It retrieves a frame_stream resource from draw_set, and then the graphics card renders in one of the buffer_gpu. 
The DMA address in buffer_gpu is written into the p_dma_address field of frame_stream, and then frame_stream is passed to p2p_set. 
When it receives a frame_stream flagged need_to_resend, it does not draw a new picture: it clears the flag and passes the same frame_stream on to p2p_set again.

encoder_send_thread: Performs p2p transmission. 
It retrieves a frame_stream resource from p2p_set and performs p2p transfer using the p_dma_address, p_upl_ctx, and frame filled in frame_stream. 
After the transmission is complete, frame_stream is passed to encode_set.

encoder_receive_thread: Performs encoding and video output operations. 
It retrieves a frame_stream resource from encode_set, and the data has been transferred to frame_stream.frame. 
This frame is then passed to the encoder for encoding. 
After obtaining the encoding result from the encoder, it fills in the need_to_resend information and passes frame_stream back to draw_set.

*/

/*
 * Body of the draw thread: input --> draw --> output.
 *
 * Creates an EGL context that shares with main_egl_context, makes it current
 * on this thread, then loops:
 *   - take a frame_stream from `input`;
 *   - if it is flagged need_to_resend, clear the flag and push it to
 *     `output` unchanged (no new picture is drawn);
 *   - otherwise free the oldest GPU picture (when the frame_stream carries a
 *     poc >= 0, i.e. it has been used before), draw into the next free GPU
 *     picture, attach the DMA address registered for that picture's fd and
 *     push the frame_stream to `output`.
 *
 * `stop` is raised once all_frame_count - 1 frame_streams have come back;
 * the next frame pushed carries input_end = true and the loop ends once all
 * GPU pictures have been returned afterwards. When `force_stop` is set (here
 * or by another thread) a frame_stream with poc = INT_MIN is pushed to
 * `output` and the loop ends.
 *
 * The shared context is released from the thread and destroyed on exit.
 * Note that CHECK failures throw std::runtime_error out of this function.
 */
void draw_thread_function(ni_block_frame_set &input,
                          ni_block_frame_set &output,
                          std::shared_ptr<std::vector<GPU_picture>> gpu_picture_buffer,
                          std::unordered_map<int, ni_p2p_sgl_t> &fd_dma_addr,
                          std::atomic<bool> &stop,
                          std::atomic<bool> &force_stop,
                          EGLContext &main_egl_context,
                          EGLConfig &egl_config,
                          PFNEGLWAITSYNCKHRPROC eglWaitSyncKHR,
                          EGLDisplay &egl_display,
                          EGLSyncKHR &egl_sync,
                          EGLint egl_context_attribs[],
                          const int w,
                          const int h)
{
    Circular_GPU_picture circular_gpu_picture_buffer(gpu_picture_buffer);

    int poc = 0;                    // display order of the next drawn frame
    bool last_frame_sent = false;   // set once the input_end frame was pushed
    int frame_count = 0;            // frame_streams that have come back
    const int all_frame_count = 500;

    EGLContext shared_context = eglCreateContext(egl_display, egl_config, main_egl_context, egl_context_attribs);
    if (shared_context == EGL_NO_CONTEXT)
    {
        fprintf(stderr, "Failed to create shared context\n");
        force_stop = true;
    }

    if (!force_stop)
    {
        if (!(eglMakeCurrent(egl_display, EGL_NO_SURFACE, EGL_NO_SURFACE, shared_context) == EGL_TRUE))
        {
            std::cerr << "Failed to make the EGLContext shared_context current.\n";
            force_stop = true;
        }
    }

    while (true)
    {
        if (force_stop)
        {
            // poc == INT_MIN is the marker pushed downstream on a forced stop
            ni_block_frame_set::ni_frame_stream_t force_stop_frame_stream;
            force_stop_frame_stream.poc = INT_MIN;
            output.push(force_stop_frame_stream);
            break;
        }

        ni_block_frame_set::ni_frame_stream_t frame_stream = input.get_and_pop();

        if (frame_stream.need_to_resend)
        {
            // The picture for this frame must still be the oldest one in use
            const GPU_picture &gpu_picture = circular_gpu_picture_buffer.get_tail();
            CHECK(gpu_picture.fd < 0 || gpu_picture.poc != frame_stream.poc, "Incorrect fd or poc\n");
            frame_stream.need_to_resend = false;
            output.push(frame_stream);
        }
        else
        {
            // poc >= 0 means this frame_stream carried a picture before,
            // so the oldest picture can now be reused
            if (frame_stream.poc >= 0)
            {
                circular_gpu_picture_buffer.free_tail();

                // here we just stop when the encoded frame reaches to all_frame_count
                ++frame_count;
                if (frame_count == all_frame_count - 1)
                {
                    stop = true;
                }
            }

            if (stop && last_frame_sent)
            {
                // Drain: wait until every picture has been returned
                if (circular_gpu_picture_buffer.empty())
                {
                    break;
                }
                else
                {
                    continue;
                }
            }

            GPU_picture &gpu_picture = circular_gpu_picture_buffer.get_and_advance_head();
            gpu_picture.poc = poc;

            draw(w, h, gpu_picture.texture, gpu_picture.fbo);
            eglWaitSyncKHR(egl_display, egl_sync, 0);

            int dma_fd = gpu_picture.fd;
            auto it = fd_dma_addr.find(dma_fd);
            if (it == fd_dma_addr.end())
            {
                std::cerr << "Could not find the dma addr associated with dma fd:" << dma_fd << "\n";
                // The forced-stop marker is pushed at the top of the next iteration
                force_stop = true;
                continue;
            }

            ni_p2p_sgl_t *dma_addr = &(it->second);

            frame_stream.p_dma_address = dma_addr;
            frame_stream.need_to_resend = false;
            frame_stream.input_end = stop;
            frame_stream.poc = poc;

            output.push(frame_stream);

            if (stop)
            {
                last_frame_sent = true;
            }

            ++poc;
        }
    }

    if (shared_context != EGL_NO_CONTEXT)
    {
        // Unbind first: destroying a context that is still current on a
        // thread only marks it for deletion instead of freeing it.
        eglMakeCurrent(egl_display, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT);
        eglDestroyContext(egl_display, shared_context);
    }
}

/*
 * Body of the P2P transfer thread: input --> ni_p2p_recv --> output.
 *
 * For every frame_stream taken from `input`:
 *   - if `force_stop` is set, forward it with poc = INT_MIN and exit;
 *   - if it is flagged input_end, forward it untouched (no transfer) and exit;
 *   - otherwise execute a P2P read from the frame_stream's DMA address into
 *     its frame, using its own upload context, and forward it.
 * A failed transfer sets `force_stop`, forwards the frame_stream with
 * poc = INT_MIN and exits.
 *
 * w and h are not used by this function.
 */
void encoder_send_thread_function(ni_block_frame_set &input,
                                  ni_block_frame_set &output,
                                  std::atomic<bool> &force_stop,
                                  const int w,
                                  const int h)

{
    for (;;)
    {
        ni_block_frame_set::ni_frame_stream_t item = input.get_and_pop();

        if (force_stop)
        {
            item.poc = INT_MIN;
            output.push(item);
            return;
        }

        if (item.input_end)
        {
            output.push(item);
            return;
        }

        // Each upload session owns a single buffer (the p2p frame), so the
        // read has to go through the upload context paired with this frame.
        const int rc = ni_p2p_recv(item.p_upl_ctx, item.p_dma_address, item.frame);

        if (rc != NI_RETCODE_SUCCESS)
        {
            fprintf(stderr, "Error! Error! can't pull frame\n");

            force_stop = true;
            item.poc = INT_MIN;
            output.push(item);
            return;
        }

        output.push(item);
    }
}

/*
 * Body of the encode thread: input --> encoder send/receive --> output.
 *
 * For every frame_stream taken from `input`:
 *   - if `force_stop` is set, forward it with poc = INT_MIN and exit;
 *   - send its frame to the encoder (encoder_encode_frame);
 *   - if the encoder asks for a resend, flag the frame_stream, forward it
 *     and wait for the frame_stream with the same poc to come back;
 *   - otherwise read one packet from the encoder into p_file
 *     (encoder_receive_data) and forward the frame_stream.
 * The loop ends when the received packet is marked end_of_stream. Any send
 * or receive error sets `force_stop`, forwards the frame_stream with
 * poc = INT_MIN and exits.
 *
 * w and h are not used by this function.
 */
void encoder_receive_thread_function(ni_block_frame_set &input,
                                     ni_block_frame_set &output,
                                     ni_session_context_t *p_enc_ctx,
                                     std::atomic<bool> &force_stop,
                                     ni_session_data_io_t &out_packet,
                                     const int w,
                                     const int h,
                                     FILE *p_file)
{
    int need_to_resend = 0;
    int input_exhausted = 0;
    int eos_send = 0;
    int send_fin_flag = 0;
    int receive_fin_flag = 0;

    int resend_poc = 0; // poc of the frame the encoder asked to see again

    int frame_count = 0;

    while (true)
    {
        ni_block_frame_set::ni_frame_stream_t frame_stream;

        if (need_to_resend)
        {
            // Wait specifically for the frame that has to be re-sent
            frame_stream = input.get_and_pop_if_begin_poc(resend_poc);
            need_to_resend = 0;
        }
        else
        {
            frame_stream = input.get_and_pop();
        }

        if (force_stop)
        {
            frame_stream.poc = INT_MIN;
            output.push(frame_stream);
            break;
        }

        ni_frame_t *p_p2p_frame = frame_stream.frame;

        // Sticky: once the input has ended every later send is flagged as last
        if (frame_stream.input_end)
        {
            input_exhausted = 1;
        }

        send_fin_flag = encoder_encode_frame(p_enc_ctx, p_p2p_frame,
                                             input_exhausted, need_to_resend, eos_send);

        // encoder_encode_frame reports failure as -1 (it never returns 2),
        // so any negative value is a fatal send error.
        if (send_fin_flag < 0)
        {
            // error
            force_stop = true;
            frame_stream.poc = INT_MIN;
            output.push(frame_stream);
            break;
        }

        if (need_to_resend)
        {
            frame_stream.need_to_resend = true;
            output.push(frame_stream);
            resend_poc = frame_stream.poc;
            continue;
        }

        // Receive encoded packet data from the encoder
        receive_fin_flag = encoder_receive_data(
            p_enc_ctx, &out_packet, p_file);

        // Negative means failure; 0/1/2 (packet / eos / retry) all continue
        if (receive_fin_flag < 0)
        {
            fprintf(stderr, "ERROR: receive_fin_flag=%d\n", receive_fin_flag);
            // error
            force_stop = true;
            frame_stream.poc = INT_MIN;
            output.push(frame_stream);
            break;
        }

        output.push(frame_stream);

        if (out_packet.data.packet.end_of_stream)
        {
            break;
        }

        ++frame_count;
        std::cout << "Frame: " << frame_count << std::endl;
    }
}

/*!****************************************************************************
 * \brief  Program entry point. Sets up a pool of GBM buffer objects exported
 *         as dma-bufs, wraps them in EGL images / GL framebuffers, opens the
 *         Quadra upload and encoder sessions, imports the dma-bufs into the
 *         Quadra device and then runs the draw / send / receive threads until
 *         they finish.
 *
 * \param  [in] argc - number of command line arguments
 *         [in] argv - command line arguments. Recognised options:
 *                     --drm_device=<path>  DRM device to open (required)
 *                     --output=<path>      output file
 *                                          (default p2p_read_gbm_egl_test.h265)
 *
 * \return Returns 0 when the setup succeeded and the worker threads were run,
 *         -1 when any setup step failed
 ******************************************************************************/
int main(int argc, const char *argv[])
{
    const int w = 1920;
    const int h = 1080;

    const int input_video_width = w;
    const int input_video_height = h;

    const int arg_width = w;
    const int arg_height = h;

    const ni_pix_fmt_t pix_fmt = NI_PIX_FMT_RGBA;

    const int dst_codec_format = NI_CODEC_FORMAT_H265;

    // Size in bytes of one frame at 4 bytes per pixel; handed to import_dma_buf()
    int frame_size_just_for_A1 = w * h * 4;

    const int pool_size = 2;

    FILE *p_file = NULL;

    // Collect the arguments (skipping the program name). The explicit loop
    // stays well-defined even when argc is 0.
    std::unordered_set<std::string> parameters;
    for (int i = 1; i < argc; ++i)
    {
        parameters.insert(argv[i]);
    }

    // Stores the text following `prefix` into `value` for a parameter that
    // starts with `prefix` and has a non-empty remainder. `value` is left
    // untouched when no such parameter exists.
    auto read_option = [&parameters](const std::string &prefix, std::string &value)
    {
        for (const std::string &parameter : parameters)
        {
            if (parameter.size() > prefix.size() &&
                parameter.compare(0, prefix.size(), prefix) == 0)
            {
                value = parameter.substr(prefix.size());
                return;
            }
        }
    };

    std::string drm_driver_name;
    read_option("--drm_device=", drm_driver_name);

    if (drm_driver_name.empty())
    {
        std::cerr << "drm device is not specified. "
                     "Please use the option --drm_device=/absolute/path/to/drm/device to set the device path.\n";
        return -1;
    }

    std::string output_file_name = "p2p_read_gbm_egl_test.h265";
    read_option("--output=", output_file_name);

    if (output_file_name.empty())
    {
        std::cerr << "output file name is empty. "
                     "Please use the option --output=path/to/output to set the output path.\n";
        return -1;
    }

    p_file = fopen(output_file_name.c_str(), "w+");
    if (!p_file)
    {
        // Without an output file there is nowhere to store the bitstream
        std::cerr << "Failed to open output file: " << output_file_name << "\n";
        return -1;
    }

    // open DRM device
    int drm_fd = open(drm_driver_name.c_str(), O_RDWR | O_CLOEXEC);
    if (drm_fd < 0)
    {
        std::cerr << "Failed to open DRM device: " << drm_driver_name << "\n";
        fclose(p_file);
        return -1;
    }

    // create GBM device
    gbm_device *gbm = gbm_create_device(drm_fd);
    if (!gbm)
    {
        std::cerr << "Failed to create GBM device\n";
        close(drm_fd);
        fclose(p_file);
        return -1;
    }

    std::shared_ptr<std::vector<GPU_picture>> gpu_picture_buffer = std::make_shared<std::vector<GPU_picture>>(pool_size);

    int ret = 0;

    // get pool_size pairs of (gpu buffer + dma buf fd)
    for (int i = 0; i < pool_size; ++i)
    {
        // create GBM buffer object
        gbm_bo *bo = gbm_bo_create(gbm, w, h, GBM_FORMAT_ARGB8888,
                                   GBM_BO_USE_RENDERING | GBM_BO_USE_LINEAR);
        if (!bo)
        {
            std::cerr << "Failed to create GBM buffer object\n";
            ret = -1;
            break;
        }
        (*gpu_picture_buffer)[i].bo = bo;

        // get DMA-BUF fd
        int dma_buf_fd = gbm_bo_get_fd(bo);
        if (dma_buf_fd < 0)
        {
            std::cerr << "Failed to get DMA-BUF file descriptor\n";
            ret = -1;
            break;
        }
        (*gpu_picture_buffer)[i].fd = dma_buf_fd;

        std::cout << "DMA-BUF file descriptor: " << dma_buf_fd << std::endl;
    }

    if (ret < 0)
    {
        release_GPU_pictures(gpu_picture_buffer, EGL_NO_DISPLAY, nullptr);
        release_gbm_resource(drm_fd, gbm);
        fclose(p_file);
        return -1;
    }

    EGLDisplay egl_display = eglGetDisplay((EGLNativeDisplayType)gbm);
    if (egl_display == EGL_NO_DISPLAY)
    {
        std::cerr << "Failed to get EGL display\n";
        release_GPU_pictures(gpu_picture_buffer, EGL_NO_DISPLAY, nullptr);
        release_gbm_resource(drm_fd, gbm);
        fclose(p_file);
        return -1;
    }

    if (!eglInitialize(egl_display, nullptr, nullptr))
    {
        std::cerr << "Failed to initialize EGL\n";
        checkEGLError("eglInitialize");
        release_GPU_pictures(gpu_picture_buffer, EGL_NO_DISPLAY, nullptr);
        release_gbm_resource(drm_fd, gbm);
        fclose(p_file);
        return -1;
    }

    // get EGL configure
    EGLint num_configs;
    EGLConfig egl_config;
    EGLint egl_config_attribs[] = {
        EGL_SURFACE_TYPE, EGL_WINDOW_BIT,
        EGL_RED_SIZE, 8,
        EGL_GREEN_SIZE, 8,
        EGL_BLUE_SIZE, 8,
        EGL_ALPHA_SIZE, 8,
        EGL_RENDERABLE_TYPE, EGL_OPENGL_ES3_BIT,
        EGL_NONE};

    if (!eglChooseConfig(egl_display, egl_config_attribs, &egl_config, 1, &num_configs) || num_configs == 0)
    {
        std::cerr << "Failed to choose EGL config\n";
        checkEGLError("eglChooseConfig");
        eglTerminate(egl_display);
        release_GPU_pictures(gpu_picture_buffer, EGL_NO_DISPLAY, nullptr);
        release_gbm_resource(drm_fd, gbm);
        fclose(p_file);
        return -1;
    }

    // create EGL context
    EGLint egl_context_attribs[] = {
        EGL_CONTEXT_CLIENT_VERSION, 3,
        EGL_NONE};
    EGLContext egl_context = eglCreateContext(egl_display, egl_config, EGL_NO_CONTEXT, egl_context_attribs);
    if (egl_context == EGL_NO_CONTEXT)
    {
        std::cerr << "Failed to create EGL context\n";
        checkEGLError("eglCreateContext");
        eglTerminate(egl_display);
        release_GPU_pictures(gpu_picture_buffer, EGL_NO_DISPLAY, nullptr);
        release_gbm_resource(drm_fd, gbm);
        fclose(p_file);
        return -1;
    }

    // Surfaceless: rendering targets the FBOs created below, not an EGL surface
    if (eglMakeCurrent(egl_display, EGL_NO_SURFACE, EGL_NO_SURFACE, egl_context) != EGL_TRUE)
    {
        std::cerr << "Failed to make the EGLContext current.\n";
        eglDestroyContext(egl_display, egl_context);
        eglTerminate(egl_display);
        release_GPU_pictures(gpu_picture_buffer, EGL_NO_DISPLAY, nullptr);
        release_gbm_resource(drm_fd, gbm);
        fclose(p_file);
        return -1;
    }

    PFNEGLCREATEIMAGEKHRPROC eglCreateImageKHR = (PFNEGLCREATEIMAGEKHRPROC)eglGetProcAddress("eglCreateImageKHR");
    PFNGLEGLIMAGETARGETTEXTURE2DOESPROC glEGLImageTargetTexture2DOES = (PFNGLEGLIMAGETARGETTEXTURE2DOESPROC)eglGetProcAddress("glEGLImageTargetTexture2DOES");
    PFNEGLDESTROYIMAGEKHRPROC eglDestroyImageKHR = (PFNEGLDESTROYIMAGEKHRPROC)eglGetProcAddress("eglDestroyImageKHR");

    PFNEGLCREATESYNCKHRPROC eglCreateSyncKHR = (PFNEGLCREATESYNCKHRPROC)eglGetProcAddress("eglCreateSyncKHR");
    PFNEGLDESTROYSYNCKHRPROC eglDestroySyncKHR = (PFNEGLDESTROYSYNCKHRPROC)eglGetProcAddress("eglDestroySyncKHR");
    PFNEGLWAITSYNCKHRPROC eglWaitSyncKHR = (PFNEGLWAITSYNCKHRPROC)eglGetProcAddress("eglWaitSyncKHR");

    if (!eglCreateImageKHR || !glEGLImageTargetTexture2DOES || !eglDestroyImageKHR ||
        !eglCreateSyncKHR || !eglDestroySyncKHR || !eglWaitSyncKHR)
    {
        std::cerr << "Failed to load egl extension functions\n";
        eglDestroyContext(egl_display, egl_context);
        eglTerminate(egl_display);
        release_GPU_pictures(gpu_picture_buffer, EGL_NO_DISPLAY, nullptr);
        release_gbm_resource(drm_fd, gbm);
        fclose(p_file);
        return -1;
    }

    // create the sync for refreshing the GPU memory
    EGLSyncKHR egl_sync = eglCreateSyncKHR(egl_display, EGL_SYNC_FENCE_KHR, NULL);
    if (egl_sync == EGL_NO_SYNC_KHR)
    {
        std::cerr << "Failed to create EGLSyncKHR\n";
        eglDestroyContext(egl_display, egl_context);
        eglTerminate(egl_display);
        release_GPU_pictures(gpu_picture_buffer, EGL_NO_DISPLAY, nullptr);
        release_gbm_resource(drm_fd, gbm);
        fclose(p_file);
        return -1;
    }

    // Wrap every dma-buf in an EGLImage, bind it to a texture and attach that
    // texture to its own framebuffer object.
    for (int i = 0; i < pool_size; ++i)
    {
        int dma_buf_fd = (*gpu_picture_buffer)[i].fd;
        gbm_bo *bo = (*gpu_picture_buffer)[i].bo;

        // create EGLImage
        EGLint image_attrs[] = {
            EGL_WIDTH, w,
            EGL_HEIGHT, h,
            EGL_LINUX_DRM_FOURCC_EXT, GBM_FORMAT_ARGB8888,
            EGL_DMA_BUF_PLANE0_FD_EXT, dma_buf_fd,
            EGL_DMA_BUF_PLANE0_OFFSET_EXT, 0,
            EGL_DMA_BUF_PLANE0_PITCH_EXT, (int)gbm_bo_get_stride(bo),
            EGL_NONE};

        EGLImageKHR egl_image = eglCreateImageKHR(egl_display, EGL_NO_CONTEXT, EGL_LINUX_DMA_BUF_EXT, nullptr, image_attrs);
        if (egl_image == EGL_NO_IMAGE_KHR)
        {
            std::cerr << "Failed to create EGLImage\n";
            checkEGLError("eglCreateImageKHR");
            ret = -1;
            break;
        }
        (*gpu_picture_buffer)[i].egl_image = egl_image;

        // create an OpenGL texture
        GLuint texture;
        glGenTextures(1, &texture);
        glBindTexture(GL_TEXTURE_2D, texture);
        glEGLImageTargetTexture2DOES(GL_TEXTURE_2D, egl_image);
        checkEGLError("glEGLImageTargetTexture2DOES");

        // create and bind a FBO
        GLuint fbo;
        glGenFramebuffers(1, &fbo);
        glBindFramebuffer(GL_FRAMEBUFFER, fbo);
        glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, texture, 0);

        // Record the GL objects before the status check so that the release
        // helper sees them on the failure path as well.
        (*gpu_picture_buffer)[i].texture = texture;
        (*gpu_picture_buffer)[i].fbo = fbo;

        if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE)
        {
            std::cerr << "Framebuffer is not complete\n";
            checkEGLError("glCheckFramebufferStatus");
            ret = -1;
            break;
        }
    }

    if (ret < 0)
    {
        release_GPU_pictures(gpu_picture_buffer, egl_display, eglDestroyImageKHR);
        release_egl_resource(egl_display, egl_context, egl_sync, eglDestroySyncKHR);
        release_gbm_resource(drm_fd, gbm);
        fclose(p_file);
        return -1;
    }

    /* GBM / EGL initialisation done */

    /* INIT QUADRA */

    // Every declaration needed by the cleanup code below lives here, ahead of
    // the first `goto end`, so that no jump crosses an initialisation.
    int exit_code = -1; // flipped to 0 once the whole pipeline has been started

    int iXcoderGUID = 0;
    int saved_guid = iXcoderGUID;

    ni_xcoder_params_t api_param{};
    ni_session_context_t enc_ctx{.device_handle = NI_INVALID_DEVICE_HANDLE, .blk_io_handle = NI_INVALID_DEVICE_HANDLE};

    std::vector<ni_session_context_t> upl_ctx_vec(pool_size, {.device_handle = NI_INVALID_DEVICE_HANDLE, .blk_io_handle = NI_INVALID_DEVICE_HANDLE});
    std::vector<ni_frame_t> p2p_frame_vec(pool_size);

    ni_session_data_io_t out_packet{};

    const int encode_start_frame_pos = 0;

    std::unordered_map<int /*fd*/, ni_p2p_sgl_t> fd_dma_address;

    ni_block_frame_set draw_set;
    ni_block_frame_set p2p_set;
    ni_block_frame_set encode_set;

    std::shared_ptr<std::thread> draw_thread_ptr;
    std::shared_ptr<std::thread> encoder_send_thread_ptr;
    std::shared_ptr<std::thread> encoder_receive_thread_ptr;

    std::atomic<bool> this_stop{false};
    std::atomic<bool> force_stop{false};

    if (ni_device_session_context_init(&enc_ctx) < 0)
    {
        fprintf(stderr, "Error: init encoder context error\n");
        goto end;
    }

    for (int i = 0; i < pool_size; ++i)
    {
        if (ni_device_session_context_init(&upl_ctx_vec[i]) < 0)
        {
            fprintf(stderr, "Error: init uploader context error\n");
            goto end;
        }

        // Open a P2P upload session to the destination Quadra device that will
        // be doing the video encoding
        if (uploader_open_session(&upl_ctx_vec[i], &iXcoderGUID, input_video_width, input_video_height, 0, pix_fmt))
        {
            goto end;
        }

        // All upload sessions must report the GUID of the first one
        if (i == 0)
        {
            saved_guid = iXcoderGUID;
        }
        else if (saved_guid != iXcoderGUID)
        {
            std::cerr << "upload instances are not in the same card\n";
            goto end;
        }

        ret = enc_prepare_frame(&upl_ctx_vec[i], input_video_width, input_video_height,
                                &p2p_frame_vec[i]);

        if (ret < 0)
        {
            goto end;
        }
    }

    // Configure the encoder parameter structure. We'll use some basic
    // defaults: 30 fps, 200000 bps CBR encoding, AVC or HEVC encoding
    if (ni_encoder_init_default_params(&api_param, 30, 1, 200000, arg_width,
                                       arg_height, (ni_codec_format_t)enc_ctx.codec_format) < 0)
    {
        fprintf(stderr, "Error: encoder init default set up error\n");
        goto end;
    }

    // For P2P demo, change some of the encoding parameters from
    // the default. Enable low delay encoding.
    if ((ret = ni_encoder_params_set_value(&api_param, "lowDelay", "1")) !=
        NI_RETCODE_SUCCESS)
    {
        fprintf(stderr, "Error: can't set low delay mode %d\n", ret);
        goto end;
    }

    // Use a GOP preset of 9 which represents a GOP pattern of
    // IPPPPPPP....This will be low latency encoding.
    if ((ret = ni_encoder_params_set_value(&api_param, "gopPresetIdx", "9")) !=
        NI_RETCODE_SUCCESS)
    {
        fprintf(stderr, "Error: can't set gop preset %d\n", ret);
        goto end;
    }

    if (pix_fmt == NI_PIX_FMT_RGBA)
    {
        // Quadra encoder always generates full range YCbCr
        if (ni_encoder_params_set_value(&api_param, "videoFullRangeFlag", "1") !=
            NI_RETCODE_SUCCESS)
        {
            fprintf(stderr, "Error: can't set video full range\n");
            goto end;
        }

        // sRGB has the same color primaries as BT.709/IEC-61966-2-1
        if (ni_encoder_params_set_value(&api_param, "colorPri", "1") !=
            NI_RETCODE_SUCCESS)
        {
            fprintf(stderr, "Error: can't set color primaries\n");
            goto end;
        }

        // Quadra encoder converts to YUV420 using BT.709 matrix
        if (ni_encoder_params_set_value(&api_param, "colorSpc", "1") !=
            NI_RETCODE_SUCCESS)
        {
            fprintf(stderr, "Error: can't set color space\n");
            goto end;
        }

        // sRGB transfer characteristics is IEC-61966-2-1
        if (ni_encoder_params_set_value(&api_param, "colorTrc", "13") !=
            NI_RETCODE_SUCCESS)
        {
            fprintf(stderr, "Error: can't set color transfer characteristics\n");
            goto end;
        }
    }

    // Open the encoder session with given parameters
    ret = encoder_open_session(&enc_ctx, dst_codec_format, iXcoderGUID,
                               &api_param, arg_width, arg_height, &p2p_frame_vec[encode_start_frame_pos], pix_fmt);

    if (ret < 0)
    {
        fprintf(stderr, "Could not open encoder session\n");
        goto end;
    }

    // Import every dma-buf into the Quadra device and remember the returned
    // DMA address list under its fd.
    for (int i = 0; i < pool_size; ++i)
    {
        ni_p2p_sgl_t dma_addr{ {0}, 0, {0} };
        int fd = (*gpu_picture_buffer)[i].fd;

        ret = import_dma_buf(&upl_ctx_vec[i], fd, frame_size_just_for_A1, &dma_addr);
        if (ret != 0)
        {
            fprintf(stderr, "Failed to import dma buff\n");
            goto end;
        }

        fd_dma_address.insert({fd, dma_addr});
    }

    // Seed the draw queue with one entry per pool slot before any thread runs
    for (int i = 0; i < pool_size; ++i)
    {
        ni_block_frame_set::ni_frame_stream_t item{&upl_ctx_vec[i], &p2p_frame_vec[i], false, i - pool_size, false, nullptr};
        draw_set.push_with_no_mutex(item);
    }

    // Pipeline: draw -> (p2p_set) -> send -> (encode_set) -> receive -> (draw_set)
    draw_thread_ptr = std::make_shared<std::thread>(draw_thread_function,
                                                    std::ref(draw_set),
                                                    std::ref(p2p_set),
                                                    gpu_picture_buffer,
                                                    std::ref(fd_dma_address),
                                                    std::ref(this_stop),
                                                    std::ref(force_stop),
                                                    std::ref(egl_context),
                                                    std::ref(egl_config),
                                                    eglWaitSyncKHR,
                                                    std::ref(egl_display),
                                                    std::ref(egl_sync),
                                                    egl_context_attribs,
                                                    w,
                                                    h);

    encoder_send_thread_ptr = std::make_shared<std::thread>(encoder_send_thread_function,
                                                            std::ref(p2p_set),
                                                            std::ref(encode_set),
                                                            std::ref(force_stop),
                                                            w,
                                                            h);

    encoder_receive_thread_ptr = std::make_shared<std::thread>(encoder_receive_thread_function,
                                                               std::ref(encode_set),
                                                               std::ref(draw_set),
                                                               &enc_ctx,
                                                               std::ref(force_stop),
                                                               std::ref(out_packet),
                                                               w,
                                                               h,
                                                               p_file);

    // Setup completed. NOTE(review): failures raised by the worker threads
    // through force_stop are not reflected in the exit code - confirm whether
    // they should be.
    exit_code = 0;

end:
    if (draw_thread_ptr)
    {
        draw_thread_ptr->join();
    }
    if (encoder_send_thread_ptr)
    {
        encoder_send_thread_ptr->join();
    }
    if (encoder_receive_thread_ptr)
    {
        encoder_receive_thread_ptr->join();
    }

    int eos_sent = (force_stop ? 0 : 1);

    for (auto &p2p_frame : p2p_frame_vec)
    {
        if (p2p_frame.p_data[3])
        {
            recycle_frame(&p2p_frame);
        }
    }

    // Unimport only the dma-bufs that were actually imported. find() is used
    // instead of operator[] so that the lookup never inserts a new entry.
    for (int i = 0; i < pool_size; ++i)
    {
        int fd = (*gpu_picture_buffer)[i].fd;
        auto imported = fd_dma_address.find(fd);
        if (fd > 0 && imported != fd_dma_address.end() &&
            imported->second.ui32NumEntries != 0)
        {
            unimport_dma_buf(&upl_ctx_vec[i], fd);
        }
    }

    for (ni_session_context_t &upl_ctx : upl_ctx_vec)
    {
        if (upl_ctx.device_handle != NI_INVALID_DEVICE_HANDLE)
        {
            ni_device_session_close(&upl_ctx, eos_sent, NI_DEVICE_TYPE_UPLOAD);
        }

        ni_device_session_context_clear(&upl_ctx);
    }

    if (enc_ctx.device_handle != NI_INVALID_DEVICE_HANDLE)
    {
        ni_device_session_close(&enc_ctx, eos_sent, NI_DEVICE_TYPE_ENCODER);
    }

    ni_device_session_context_clear(&enc_ctx);

    for (auto &p2p_frame : p2p_frame_vec)
    {
        ni_frame_buffer_free(&p2p_frame);
    }

    ni_packet_buffer_free(&(out_packet.data.packet));

    // Several contexts may hold the same handle value; collecting them in a
    // set makes sure each distinct handle is closed exactly once.
    std::unordered_set<int> handles;
    for (ni_session_context_t &upl_ctx : upl_ctx_vec)
    {
        handles.insert(upl_ctx.device_handle);
        handles.insert(upl_ctx.blk_io_handle);
    }
    handles.insert(enc_ctx.device_handle);
    handles.insert(enc_ctx.blk_io_handle);

    for (int fd : handles)
    {
        // Skip handles that were never opened
        if (fd >= 0)
        {
            close(fd);
        }
    }

    if (p_file)
    {
        fclose(p_file);
    }

    release_GPU_pictures(gpu_picture_buffer, egl_display, eglDestroyImageKHR);
    release_egl_resource(egl_display, egl_context, egl_sync, eglDestroySyncKHR);
    release_gbm_resource(drm_fd, gbm);

    return exit_code;
}
