Analysis of the FFmpeg hardware-accelerated playback framework

Summary

FFmpeg provides a set of API interfaces that users can call to implement its various functions. To design a video hardware-decoding framework on top of FFmpeg, we first need a deep understanding of FFmpeg's own hardware-decoding framework. This paper therefore starts from ffplay: it analyzes the code framework of ffplay, digs into the FFmpeg source code, examines how ffplay invokes the NVIDIA hardware-decoding path, and designs a video hardware-decoding framework on that basis.

This paper only analyzes the NVIDIA hardware-decoding framework as invoked by ffplay. Besides playback, FFmpeg's hardware-acceleration framework is also used for transcoding; the transcoding side is not covered here.

1. General process of multimedia playback

A typical playback pipeline runs: input (file or protocol) -> demultiplexing (container parsing) -> compressed audio/video packets -> decoding -> raw frames -> audio/video synchronization -> rendering (display and audio output). ffplay follows exactly this pipeline.

2. ffplay framework analysis

Figure: ffplay architecture diagram

1. A main thread, whose main loop is responsible for video display and SDL event handling.

2. A demultiplexing thread, started from the main thread, which uses the av_read_frame() interface to separate audio from video; the video packets and audio packets it reads are stored in two cache queues, the video packet queue and the audio packet queue, respectively.

3. Two further threads created inside the demultiplexing thread: the video decoding thread and the audio decoding thread. They take packets out of the video and audio packet queues, decode them, and store the decoded frames in the video frame queue and audio frame queue.

4. The main thread plays video by fetching image data from the video frame queue. By default, ffplay uses audio as the master clock and synchronizes the video clock to the audio clock. (A minimal code sketch of this thread layout is shown below.)

Figure: the overall ffplay framework
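As a minimal sketch of this thread layout (illustrative only: it mirrors how ffplay uses SDL_CreateThread, but PlayerState and the thread bodies are simplified stand-ins for the real ffplay code, with the FFmpeg calls left as comments):

#include <SDL2/SDL.h>

/* Minimal stand-in for ffplay's VideoState (the real struct is shown below). */
typedef struct PlayerState {
    SDL_Thread *read_tid;
    int abort_request;
} PlayerState;

static int video_thread(void *arg) { /* pull packets from videoq, decode into pictq */ return 0; }
static int audio_thread(void *arg) { /* pull packets from audioq, decode into sampq */ return 0; }

/* Demux thread: loops on av_read_frame() and routes packets into the packet
 * queues; like ffplay's read_thread(), it also spawns the decoding threads. */
static int read_thread(void *arg)
{
    PlayerState *ps = arg;
    SDL_CreateThread(video_thread, "video_decoder", ps);
    SDL_CreateThread(audio_thread, "audio_decoder", ps);
    while (!ps->abort_request)
        SDL_Delay(10);   /* stands in for: av_read_frame() -> videoq / audioq */
    return 0;
}

int main(int argc, char *argv[])
{
    PlayerState ps = {0};
    SDL_Init(SDL_INIT_VIDEO | SDL_INIT_AUDIO | SDL_INIT_TIMER);
    ps.read_tid = SDL_CreateThread(read_thread, "read_thread", &ps);

    /* Main loop: display frames from the video frame queue, handle SDL events. */
    for (;;) {
        SDL_Event ev;
        while (SDL_PollEvent(&ev))
            if (ev.type == SDL_QUIT)
                ps.abort_request = 1;
        if (ps.abort_request)
            break;
        SDL_Delay(10);   /* stands in for video_refresh() */
    }
    SDL_WaitThread(ps.read_tid, NULL);
    SDL_Quit();
    return 0;
}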

Main structure of ffplay

1. struct VideoState

The struct VideoState structure is the most important structure in all of ffplay. It stores everything the program needs: input stream information, output frame data, clock information, the pre-decoding packet cache queues, the post-decoding frame cache queues, SDL control and window information, and video parameters such as height and width, audio sampling information, bit rate information, filter information, and video image scaling information.

typedef struct VideoState {
    SDL_Thread *read_tid;           // Demultiplexing (read) thread
    AVInputFormat *iformat;
    int abort_request;
    int force_refresh;
    int paused;
    int last_paused;
    int queue_attachments_req;
    int seek_req;                   // Indicates a pending seek request
    int seek_flags;                 // Seek flags, e.g. AVSEEK_FLAG_BYTE
    int64_t seek_pos;               // Seek target position (current position + increment)
    int64_t seek_rel;               // Position increment of this seek
    int read_pause_return;
    AVFormatContext *ic;
    int realtime;
    Clock audclk;                   // Audio clock
    Clock vidclk;                   // Video clock
    Clock extclk;                   // External clock

    FrameQueue pictq;               // Video frame queue
    FrameQueue subpq;               // Subtitle frame queue
    FrameQueue sampq;               // Audio frame queue

    Decoder auddec;                 // Audio decoder
    Decoder viddec;                 // Video decoder
    Decoder subdec;                 // Subtitle decoder

    int audio_stream;               // Audio stream index

    int av_sync_type;

    double audio_clock;             // Updated for each audio frame, expressed as a pts
    int audio_clock_serial;         // Playback serial; a seek changes this value
    double audio_diff_cum; /* used for AV difference average computation */
    double audio_diff_avg_coef;
    double audio_diff_threshold;
    int audio_diff_avg_count;
    AVStream *audio_st;             // Audio stream
    PacketQueue audioq;             // Audio packet queue
    int audio_hw_buf_size;          // SDL audio buffer size in bytes
    uint8_t *audio_buf;             // Points to one frame of audio data to be played; this data is copied into the SDL audio buffer. Points to audio_buf1 if resampled, otherwise to the data in the frame itself
    uint8_t *audio_buf1;            // Audio resampling output buffer
    unsigned int audio_buf_size; /* in bytes */ // Size of the audio frame to be played (the data audio_buf points to)
    unsigned int audio_buf1_size;   // Actual allocated size of audio_buf1
    int audio_buf_index; /* in bytes */ // Read index into the current audio frame (first byte not yet copied into the SDL audio buffer)
    int audio_write_buf_size;       // Amount of data in the current audio frame not yet copied into the SDL audio buffer; audio_buf_size = audio_buf_index + audio_write_buf_size
    int audio_volume;               // volume
    int muted;                      // Mute state
    struct AudioParams audio_src;   // Parameters of audio frame
#if CONFIG_AVFILTER
    struct AudioParams audio_filter_src;
#endif
    struct AudioParams audio_tgt;   // Audio parameters supported by SDL; resampling converts audio_src -> audio_tgt
    struct SwrContext *swr_ctx;     // Audio resampling context
    int frame_drops_early;          // Count of frames dropped before being queued (early drop)
    int frame_drops_late;           // Count of frames dropped at display time (late drop)

    enum ShowMode {
        SHOW_MODE_NONE = -1, SHOW_MODE_VIDEO = 0, SHOW_MODE_WAVES, SHOW_MODE_RDFT, SHOW_MODE_NB
    } show_mode;
    int16_t sample_array[SAMPLE_ARRAY_SIZE];
    int sample_array_index;
    int last_i_start;
    RDFTContext *rdft;
    int rdft_bits;
    FFTSample *rdft_data;
    int xpos;
    double last_vis_time;
    SDL_Texture *vis_texture;
    SDL_Texture *sub_texture;
    SDL_Texture *vid_texture;

    int subtitle_stream;                // Subtitle stream index
    AVStream *subtitle_st;              // Subtitle stream
    PacketQueue subtitleq;              // Subtitle packet queue

    double frame_timer;                 // Time when the last frame was displayed
    double frame_last_returned_time;
    double frame_last_filter_delay;
    int video_stream;
    AVStream *video_st;                 // Video stream
    PacketQueue videoq;                 // Video packet queue
    double max_frame_duration;      // maximum duration of a frame - above this, we consider the jump a timestamp discontinuity
    struct SwsContext *img_convert_ctx;
    struct SwsContext *sub_convert_ctx;
    int eof;

    char *filename;
    int width, height, xleft, ytop;
    int step;

#if CONFIG_AVFILTER
    int vfilter_idx;
    AVFilterContext *in_video_filter;   // the first filter in the video chain
    AVFilterContext *out_video_filter;  // the last filter in the video chain
    AVFilterContext *in_audio_filter;   // the first filter in the audio chain
    AVFilterContext *out_audio_filter;  // the last filter in the audio chain
    AVFilterGraph *agraph;              // audio filter graph
#endif

    int last_video_stream, last_audio_stream, last_subtitle_stream;

    SDL_cond *continue_read_thread;
} VideoState;

3. FFmpeg decoding framework (API call process)

FFmpeg provides a set of APIs for audio and video processing. The main interfaces of the decoding path are sketched below; they are also the core functions used by ffplay for decoding (leaving aside the threading, queueing and SDL handling). As FFmpeg's demo player, ffplay is a good reference for how to use the FFmpeg API.
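A condensed sketch of that decoding call sequence (error handling and end-of-stream draining omitted; decode_file() is an illustrative wrapper, not an FFmpeg or ffplay function):

#include <libavformat/avformat.h>
#include <libavcodec/avcodec.h>

/* Decode the video stream of one file. Every FFmpeg call shown here is a
 * real API used on ffplay's decoding path; the wrapper is illustrative. */
int decode_file(const char *url)
{
    AVFormatContext *ic = NULL;
    avformat_open_input(&ic, url, NULL, NULL);          /* open container */
    avformat_find_stream_info(ic, NULL);                /* probe streams  */

    int vidx = av_find_best_stream(ic, AVMEDIA_TYPE_VIDEO, -1, -1, NULL, 0);
    const AVCodec *codec = avcodec_find_decoder(ic->streams[vidx]->codecpar->codec_id);
    AVCodecContext *avctx = avcodec_alloc_context3(codec);
    avcodec_parameters_to_context(avctx, ic->streams[vidx]->codecpar);
    avcodec_open2(avctx, codec, NULL);                  /* open decoder   */

    AVPacket *pkt = av_packet_alloc();
    AVFrame *frame = av_frame_alloc();
    while (av_read_frame(ic, pkt) >= 0) {               /* demultiplex    */
        if (pkt->stream_index == vidx) {
            avcodec_send_packet(avctx, pkt);            /* feed decoder   */
            while (avcodec_receive_frame(avctx, frame) >= 0) {
                /* frame now holds one decoded picture */
            }
        }
        av_packet_unref(pkt);
    }
    av_frame_free(&frame);
    av_packet_free(&pkt);
    avcodec_free_context(&avctx);
    avformat_close_input(&ic);
    return 0;
}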

4. FFmpeg h264 NVIDIA hardware decoding playback framework

Up to the avcodec_send_packet() interface, the call flows of software and hardware decoding are essentially the same. They differ only in how the decoder is found: software decoding calls avcodec_find_decoder(codec_id), while hardware decoding calls avcodec_find_decoder_by_name("codec_name").

Inside the decoding function, avcodec_receive_frame(), software or hardware decoding is selected according to whether hardware acceleration is enabled.

For NVIDIA h264 hardware decoding, FFmpeg defines the file cuviddec.c, in which the NVIDIA hardware decoders are defined (the penultimate box in the framework diagram).
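For example, the difference in decoder selection looks roughly like this (a sketch; "h264_cuvid" is the decoder name registered in cuviddec.c, shown further below, and pick_h264_decoder() is an illustrative helper):

#include <libavcodec/avcodec.h>

/* Prefer the NVIDIA CUVID decoder, falling back to the default software
 * decoder if FFmpeg was built without cuvid support. */
static const AVCodec *pick_h264_decoder(int use_hw)
{
    if (use_hw) {
        const AVCodec *hw = avcodec_find_decoder_by_name("h264_cuvid");
        if (hw)
            return hw;
    }
    return avcodec_find_decoder(AV_CODEC_ID_H264);
}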

A brief note on how the NVIDIA hardware decoder ends up being used in the decoding function: after avcodec_find_decoder_by_name() returns the decoder, avcodec_open2() is called to open it, and inside that function the decoder is assigned to avctx->codec:

avctx->codec = codec;

In this way, when the decoder's receive_frame callback is non-NULL, the hardware decoding path is taken:

if (avctx->codec->receive_frame)
    ret = avctx->codec->receive_frame(avctx, frame);
else
    ret = decode_simple_receive_frame(avctx, frame);

In cuviddec.c:

#define DEFINE_CUVID_CODEC(x, X, bsf_name) \
    static const AVClass x##_cuvid_class = { \
        .class_name = #x "_cuvid", \
        .item_name = av_default_item_name, \
        .option = options, \
        .version = LIBAVUTIL_VERSION_INT, \
    }; \
    const AVCodec ff_##x##_cuvid_decoder = { \
        .name           = #x "_cuvid", \
        .long_name      = NULL_IF_CONFIG_SMALL("Nvidia CUVID " #X " decoder"), \
        .type           = AVMEDIA_TYPE_VIDEO, \
        .id             = AV_CODEC_ID_##X, \
        .priv_data_size = sizeof(CuvidContext), \
        .priv_class     = &x##_cuvid_class, \
        .init           = cuvid_decode_init, \
        .close          = cuvid_decode_end, \
        .receive_frame  = cuvid_output_frame, \
        .flush          = cuvid_flush, \
        .bsfs           = bsf_name, \
        .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING | AV_CODEC_CAP_HARDWARE, \
        .caps_internal  = FF_CODEC_CAP_SETS_FRAME_PROPS, \
        .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_CUDA, \
                                                        AV_PIX_FMT_NV12, \
                                                        AV_PIX_FMT_P010, \
                                                        AV_PIX_FMT_P016, \
                                                        AV_PIX_FMT_NONE }, \
        .hw_configs     = cuvid_hw_configs, \
        .wrapper_name   = "cuvid", \
    };

#if CONFIG_HEVC_CUVID_DECODER
DEFINE_CUVID_CODEC(hevc, HEVC, "hevc_mp4toannexb")
#endif
#if CONFIG_H264_CUVID_DECODER
DEFINE_CUVID_CODEC(h264, H264, "h264_mp4toannexb")
#endif
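With these decoders registered, the hardware path can be requested explicitly when playing; for example (assuming an FFmpeg build with cuvid support and an NVIDIA GPU; input.mp4 is a placeholder file name):

ffplay -vcodec h264_cuvid input.mp4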

The NVIDIA-provided library functions (APIs) are called from these four callback functions: cuvid_decode_init, cuvid_decode_end, cuvid_output_frame and cuvid_flush.
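As a hedged sketch of that mapping (paraphrased from cuviddec.c rather than its literal code; ctx, cupkt and the surrounding plumbing are elided):

#include <libavcodec/avcodec.h>

/* init: create the CUVID bitstream parser; the parser's sequence callback
 * later calls cuvidCreateDecoder() once the stream parameters are known. */
static int cuvid_decode_init_sketch(AVCodecContext *avctx)
{
    /* cuvidCreateVideoParser(&ctx->cuparser, &ctx->cuparseinfo); */
    return 0;
}

/* receive_frame: feed one packet to the parser, then map a decoded surface
 * into an AVFrame. */
static int cuvid_output_frame_sketch(AVCodecContext *avctx, AVFrame *frame)
{
    /* cuvidParseVideoData(ctx->cuparser, &cupkt);                          */
    /* cuvidMapVideoFrame(ctx->cudecoder, idx, &devptr, &pitch, &map_info); */
    /* ...copy or wrap the mapped CUDA surface into `frame`...              */
    /* cuvidUnmapVideoFrame(ctx->cudecoder, devptr);                        */
    return 0;
}

/* close / flush: cuvid_decode_end destroys the parser and decoder
 * (cuvidDestroyVideoParser, cuvidDestroyDecoder); cuvid_flush tears them
 * down and recreates them so decoding can restart cleanly after a seek. */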
