用C++调ffmpeg接口实现encoder和decoder

本文的测试代码，实现了用C++的方式，调用ffmpeg接口，完成H264的encoder和decoder。encoder会生成一个out.bin文件，然后decoder再去解这个文件。

配置好VS环境，即可直接运行。我想在此文总结一下如何在VS环境中配置ffmpeg，测试代码如下：

#include <cstdio>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std;


extern "C" {
#include <libavutil/opt.h>
#include <libavutil/imgutils.h>
#include <libavcodec/avcodec.h>
}


#pragma comment(lib, "avcodec.lib")
#pragma comment(lib, "avformat.lib")
#pragma comment(lib, "avutil.lib")
#pragma comment(lib, "opencv_world455d.lib")


// Raw H.264 stream file: appended to by encode() and read back by decode_main().
std::string fname = "out.bin";
// Number of bytes decode_main() reads from the stream file per chunk.
#define INBUF_SIZE 4096


/**
 * Send one frame to the encoder and append every packet it returns to the
 * file named by the global `fname`.
 *
 * @param enc_ctx  encoder context passed to avcodec_send_frame() /
 *                 avcodec_receive_packet()
 * @param frame    frame to encode; the caller passes NULL to flush the encoder
 * @param pkt      reusable packet; unreferenced after each write
 *
 * Any encoder or file error terminates the process with exit(1).
 */
static void encode(AVCodecContext* enc_ctx, AVFrame* frame, AVPacket* pkt) {
    /* send the frame to the encoder */
    int ret = avcodec_send_frame(enc_ctx, frame);
    if (ret < 0) {
        cerr << "error sending a frame to encoder" << endl;
        exit(1);
    }

    while (ret >= 0) {
        /* pull the next packet; EAGAIN / EOF mean there is nothing more to
           collect for this call, which is not an error */
        ret = avcodec_receive_packet(enc_ctx, pkt);
        if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF)
            return;
        else if (ret < 0) {
            cerr << "error during encoding" << endl;
            exit(1);
        }

        /* append the packet payload to the output file */
        ofstream fout{ fname, ios::binary | ios::app };
        if (!fout.is_open()) {
            cerr << "open out.bin in encode() error" << endl;
            exit(1);
        }
        fout.write(reinterpret_cast<const char*>(pkt->data), pkt->size);
        fout.close();
        /* close() flushes; a failed write or flush leaves the stream in a
           failed state, so check it instead of silently dropping data */
        if (!fout) {
            cerr << "write out.bin in encode() error" << endl;
            exit(1);
        }

        /* pts is int64_t, which is not `long long` on every platform:
           cast so the argument always matches %lld */
        printf("packet %3lld (size=%5d)  flags:%d\n",
               static_cast<long long>(pkt->pts), pkt->size, pkt->flags);
        av_packet_unref(pkt);
    }
}


/**
 * Encode 100 synthetic 640x360 YUV420P frames with the "libx264" encoder and
 * store the resulting packets in the file named by the global `fname`
 * (the file is truncated first, then filled by encode()).
 *
 * @return 0 on success. Every failure terminates the process with exit(1).
 */
int encode_main(void) {
    /* truncate the output file, because encode() opens it in append mode */
    ofstream fout{ fname };
    if (!fout.is_open()) {
        cerr << "truncate output file error" << endl;
        exit(1);
    }
    fout.close();

    /* find the encoder */
    const AVCodec* codec = avcodec_find_encoder_by_name("libx264");
    if (!codec) {
        cerr << "codec not found" << endl;
        exit(1);
    }

    /* create encoder context */
    AVCodecContext* enc_context = avcodec_alloc_context3(codec);
    if (enc_context == nullptr) {
        cerr << "could not allocate video codec context" << endl;
        exit(1);
    }

    /* set parameters */
    /* resolution must be a multiple of two */
    enc_context->width = 640;
    enc_context->height = 360;
    /* 30 frames per second */
    enc_context->time_base = { 1, 30 };
    enc_context->framerate = { 30, 1 };

    /* emit one intra frame every 30 frames
     * check frame pict_type before passing frame
     * to encoder, if frame->pict_type is AV_PICTURE_TYPE_I
     * then gop_size is ignored and the output of encoder
     * will always be I frame irrespective to gop_size
     */
    enc_context->gop_size = 30;
    enc_context->pix_fmt = AV_PIX_FMT_YUV420P;
    /* when flags = 0, SPS & PPS will be generated for each IDR;
       AV_CODEC_FLAG_GLOBAL_HEADER is deliberately left unset */
    enc_context->flags = 0;
    if (codec->id == AV_CODEC_ID_H264) {
        enc_context->max_b_frames = 0;
        av_opt_set(enc_context->priv_data, "preset", "veryfast", 0);
    }

    /* open it */
    const int open_ret = avcodec_open2(enc_context, codec, nullptr);
    if (open_ret < 0) {
        /* report the error code so the failure can be diagnosed */
        cerr << "could not open codec: " << open_ret << endl;
        exit(1);
    }

    AVFrame* frame = av_frame_alloc();
    if (frame == nullptr) {
        cerr << "could not allocate video frame" << endl;
        exit(1);
    }

    frame->format = enc_context->pix_fmt;
    frame->width = enc_context->width;
    frame->height = enc_context->height;

    if (av_frame_get_buffer(frame, 0) < 0) {
        cerr << "could not allocate video frame data" << endl;
        exit(1);
    }

    /* linesize[] holds signed ints, so print them with %d */
    printf("frame->linesize[0] = %d\n", frame->linesize[0]);
    printf("frame->linesize[1] = %d\n", frame->linesize[1]);
    printf("frame->linesize[2] = %d\n", frame->linesize[2]);

    AVPacket* pkt = av_packet_alloc();
    if (pkt == nullptr) {
        cerr << "could not allocate packet" << endl;
        exit(1);
    }

    /* encode 100 frames (a little over 3 seconds at 30 fps) */
    for (int i = 0; i < 100; ++i) {
        fflush(stdout);

        /* Make sure the frame data is writable.
           On the first round, the frame is fresh from av_frame_get_buffer()
           and therefore we know it is writable.
           But on the next rounds, encode() will have called
           avcodec_send_frame(), and the codec may have kept a reference to
           the frame in its internal structures, that makes the frame
           unwritable.
           av_frame_make_writable() checks that and allocates a new buffer
           for the frame only if necessary.
         */
        if (av_frame_make_writable(frame) < 0) {
            cerr << "frame is not writable" << endl;
            exit(1);
        }

        /* Y plane: synthetic pattern that shifts with the frame index;
           the value intentionally wraps to 8 bits */
        for (int y = 0; y < enc_context->height; y++) {
            for (int x = 0; x < enc_context->width; x++) {
                frame->data[0][y * frame->linesize[0] + x] =
                    static_cast<uint8_t>(x + y + i * 3);
            }
        }
        /* Cb and Cr planes: half resolution in both directions */
        for (int y = 0; y < enc_context->height / 2; y++) {
            for (int x = 0; x < enc_context->width / 2; x++) {
                frame->data[1][y * frame->linesize[1] + x] =
                    static_cast<uint8_t>(128 + y + i * 2);
                frame->data[2][y * frame->linesize[2] + x] =
                    static_cast<uint8_t>(64 + x + i * 5);
            }
        }

        frame->pts = i; // presentation time stamp
        /* encode the frame */
        encode(enc_context, frame, pkt);
    }

    /* flush the encoder */
    cout << "start flush encoder..." << endl;
    encode(enc_context, NULL, pkt);

    avcodec_free_context(&enc_context);
    av_frame_free(&frame);
    av_packet_free(&pkt);
    return 0;
}


/**
 * Submit one packet to the decoder and print a line for every frame that
 * becomes available.
 *
 * @param dec_ctx  decoder context passed to avcodec_send_packet() /
 *                 avcodec_receive_frame()
 * @param frame    reusable frame that receives each decoded picture
 * @param pkt      packet to decode; the caller passes nullptr to flush
 *
 * A send or decode error terminates the process with exit(1).
 */
static void decode(AVCodecContext* dec_ctx, AVFrame* frame, AVPacket* pkt)
{
    // hand the packet over to the decoder
    const int send_status = avcodec_send_packet(dec_ctx, pkt);
    if (send_status < 0) {
        fprintf(stderr, "Error sending a packet for decoding %d\n", send_status);
        exit(1);
    }

    // collect frames until the decoder reports EAGAIN or end of stream
    for (;;) {
        const int recv_status = avcodec_receive_frame(dec_ctx, frame);
        if (recv_status == AVERROR(EAGAIN) || recv_status == AVERROR_EOF) {
            break;
        }
        if (recv_status < 0) {
            fprintf(stderr, "Error during decoding\n");
            exit(1);
        }

        printf("dec_ctx->frame %3d, key %d, type %d\n",
               dec_ctx->frame_number,
               frame->key_frame,
               frame->pict_type);
        fflush(stdout);
    }
}


/**
 * Read the H.264 stream from the file named by the global `fname` in
 * INBUF_SIZE-byte chunks, split it into packets with av_parser_parse2(),
 * and pass each packet to decode().
 *
 * @return 0 on success. Setup, open and parse failures terminate the
 *         process with exit(1); a stream read error stops reading and
 *         falls through to the decoder flush.
 */
int decode_main()
{
    AVPacket* pkt = av_packet_alloc();
    if (pkt == nullptr)
        exit(1);

    /* find the video decoder */
    const AVCodec* codec = avcodec_find_decoder(AV_CODEC_ID_H264);
    if (!codec) {
        fprintf(stderr, "Codec not found\n");
        exit(1);
    }

    AVCodecParserContext* parser = av_parser_init(codec->id);
    if (!parser) {
        fprintf(stderr, "parser not found\n");
        exit(1);
    }

    AVCodecContext* dec_context = avcodec_alloc_context3(codec);
    if (!dec_context) {
        fprintf(stderr, "Could not allocate video codec context\n");
        exit(1);
    }

    /* open it */
    if (avcodec_open2(dec_context, codec, nullptr) < 0) {
        fprintf(stderr, "Could not open codec\n");
        exit(1);
    }

    AVFrame* frame = av_frame_alloc();
    if (!frame) {
        fprintf(stderr, "Could not allocate video frame\n");
        exit(1);
    }

    ifstream fin{ fname, ios::binary };
    if (!fin.is_open()) {
        /* without the input file there is nothing to decode: fail like every
           other setup error instead of continuing with a dead stream */
        cerr << "open output file error" << endl;
        exit(1);
    }

    /* zero-initialised; reads only ever fill the first INBUF_SIZE bytes, so
       the trailing AV_INPUT_BUFFER_PADDING_SIZE bytes stay zero */
    char inbuf[INBUF_SIZE + AV_INPUT_BUFFER_PADDING_SIZE]{};

    bool eof = false;
    while (!eof) {
        fin.read(inbuf, INBUF_SIZE);
        if (fin.bad()) {
            cout << "read file error" << endl;
            break;
        }
        size_t data_size = static_cast<size_t>(fin.gcount());
        /* a zero-byte read marks end of file; the parser is still called once
           more with no data so it can hand out the last buffered packet */
        eof = (data_size == 0);
        const char* data = inbuf;

        while (data_size > 0 || eof) {
            const int ret = av_parser_parse2(parser, dec_context, &pkt->data, &pkt->size,
                      (const uint8_t*)data, (int)data_size, AV_NOPTS_VALUE, AV_NOPTS_VALUE, 0);
            if (ret < 0) {
                fprintf(stderr, "Error while parsing\n");
                exit(1);
            }
            /* ret is the number of input bytes the parser consumed */
            data += ret;
            data_size -= static_cast<size_t>(ret);

            if (pkt->size)
                decode(dec_context, frame, pkt);

            if (eof)
                break;
        }
    }

    /* flush the decoder */
    cout << "flush decoder..." << endl;
    decode(dec_context, frame, nullptr);

    fin.close();
    av_parser_close(parser);
    avcodec_free_context(&dec_context);
    av_frame_free(&frame);
    av_packet_free(&pkt);
    return 0;
}


/* Program entry: run the encoder to produce the stream file, then decode
   that file. The helpers' return values are not inspected. */
int main(void)
{
    (void)encode_main();   // writes the stream file
    (void)decode_main();   // reads it back and prints per-frame info
    return 0;
}

以上代码，在data_size=0的时候，av_parser_parse2还被调用了一次，是为了得到最后一帧，此时pkt->size大于0，调用decode。

将一个码流拆解成一个个NAL Unit，其实只需要判断\x00\x00\x01这个分隔标志。这样就可以不使用av_parser_parse2接口，而是直接调用decode里面封装的avcodec_send_packet和avcodec_receive_frame。对于网络应用，这样设计是否会更好一些？它更清晰直接，但需要有一个私有的头，来将每个NAL Unit分开。

def get_each_nal(fn):
    """Yield each NAL unit (header and RBSP) found in the byte-stream file `fn`.

    The file is scanned byte by byte for the start code 00 00 01. The bytes
    after a start code are collected until the next 00 00 00 / 00 00 01
    sequence or end of file. Each 00 00 03 sequence inside a unit is reduced
    to 00 00 (emulation-prevention byte removed) before the unit is yielded
    as a `bytes` object.
    """
    start_code = b'\x00\x00\x01'
    # Only the last three bytes matter when hunting for a start code, so keep
    # a 3-byte window instead of accumulating the whole file.
    window = b''
    with open(fn, 'rb') as f:
        while ob := f.read(1):
            window = (window + ob)[-3:]
            if window != start_code:
                continue
            # bytearray: appending is amortised O(1), unlike bytes +=
            cont = bytearray()
            while ob := f.read(1):
                cont += ob
                tail = bytes(cont[-3:])
                if tail == b'\x00\x00\x00' or tail == start_code:
                    # discard emulation_prevention_three_byte 0x03; a plain
                    # literal replace is enough, no regular expression needed
                    yield bytes(cont[:-3]).replace(b'\x00\x00\x03', b'\x00\x00')
                    # one byte back in case the next is 0x01
                    f.seek(f.tell() - 1)
                    window = b'\x00\x00'
                    break
            else:
                # end of file reached inside a unit: this is the last NAL
                # unit, and it must be yielded rather than returned
                yield bytes(cont).replace(b'\x00\x00\x03', b'\x00\x00')

估计是浪费时间了，我想尝试使用正则表达式的方式来匹配NAL UNIT的分隔符，Python实现起来很自然，C++实现起来有些别扭，没找到Python中bytes对象在C++中的对应，感觉只能是char数据。后仔细一想这个思路可能是有问题的。正则表达式里面有很多专用的符号，比如\w\d\s.*?^$等等，使用正则，主要就是使用这些符号来清晰地精确地表达一个字符串，如果不使用这些符号，完全可以不使用正则表达式。匹配NAL UNIT的分隔符号，就是这类不需要使用正则表达式专用符号的场景。

在encode的时候，把每一个packet保存在一个独立的文件中，decode的时候，也是一个个的读文件。这样decode部分的代码，还可以这样写（只摘取变化的部分）：

    // Decode packets that were stored one per file as "<fname>.<index>".
    int index = 0;
    int length = 0;
    char* pcont = nullptr;
    ifstream fin;
    string ffname{};
    while (index < 50) {
        ffname.clear();
        ffname += fname + '.' + to_string(index);
        fin.open(ffname, ios::binary);
        if (!fin.is_open()) {
            cerr << "open out.bin.index error" << endl;
            break;
        }
        // file size = stream position after seeking to the end
        fin.seekg(0, ios::end);
        length = static_cast<int>(fin.tellg());
        fin.seekg(0, ios::beg);
        if (length <= 0) {
            // tellg() failed or the file is empty: nothing usable to decode,
            // and a negative size must never reach new[]
            cerr << "read out.bin.index error" << endl;
            fin.close();
            break;
        }
        pcont = new char[length] {};
        fin.read(pcont, length);
        if (fin.bad()) {
            cerr << "read out.bin.index error" << endl;
            delete[] pcont;   // do not leak the buffer on the error path
            pcont = nullptr;
            fin.close();
            break;
        }
        pkt->data = (uint8_t*)pcont;
        pkt->size = length;
        decode(dec_context, frame, pkt);
        // the packet only borrowed the buffer: clear it before freeing so it
        // never holds a dangling pointer
        pkt->data = nullptr;
        pkt->size = 0;
        delete[] pcont;       // allocated with new[], so release with delete[]
        pcont = nullptr;
        fin.close();
        index += 1;
    }
    cout << "flush decoder..." << endl;
    decode(dec_context, frame, nullptr);

循环中index的上限（50）应等于encode部分代码生成的文件数量。测试decode，打印正常。代码中没有处理new失败的情况。