From 4161163d9c99a1cf4c4ea6f22ab32ab66a593ad4 Mon Sep 17 00:00:00 2001 From: zhupengfei Date: Sat, 1 Feb 2020 12:23:07 +0800 Subject: [PATCH] ffmpeg: Correctly handle sample rates Previously, we just used the native sample rate for encoding. However, some encoders like libmp3lame doesn't support it. Therefore, we now use a supported sample rate (preferring the native one if possible). FFmpeg requires audio data to be sent in a sequence of frames, each containing the same specific number of samples. Previously, we buffered input samples in FFmpegBackend. However, as the source and destination sample rates can now be different, we should buffer resampled data instead. swresample have an internal input buffer, so we now just forward all data to it and 'gradually' receive resampled data, at most one frame_size at a time. When there is not enough resampled data to form a frame, we will record the current offset and request for less data on the next call. Additionally, this commit also fixes a flaw. When an encoder supports variable frame sizes, its frame size is reported to be 0, which breaks our buffering system. Now we treat variable frame size encoders as having a frame size of 160 (the size of a HLE audio frame). --- src/core/dumping/ffmpeg_backend.cpp | 129 ++++++++++++++++------------ src/core/dumping/ffmpeg_backend.h | 19 ++-- 2 files changed, 84 insertions(+), 64 deletions(-) diff --git a/src/core/dumping/ffmpeg_backend.cpp b/src/core/dumping/ffmpeg_backend.cpp index ef7843ffca..3c34c8440d 100644 --- a/src/core/dumping/ffmpeg_backend.cpp +++ b/src/core/dumping/ffmpeg_backend.cpp @@ -211,7 +211,7 @@ bool FFmpegAudioStream::Init(AVFormatContext* format_context) { if (!FFmpegStream::Init(format_context)) return false; - sample_count = 0; + frame_count = 0; // Initialize audio codec const AVCodec* codec = avcodec_find_encoder_by_name(Settings::values.audio_encoder.c_str()); @@ -243,7 +243,20 @@ bool FFmpegAudioStream::Init(AVFormatContext* format_context) { codec_context->sample_fmt = AV_SAMPLE_FMT_S16P; } - codec_context->sample_rate = AudioCore::native_sample_rate; + if (codec->supported_samplerates) { + codec_context->sample_rate = codec->supported_samplerates[0]; + // Prefer native sample rate if supported + const int* ptr = codec->supported_samplerates; + while ((*ptr)) { + if ((*ptr) == AudioCore::native_sample_rate) { + codec_context->sample_rate = AudioCore::native_sample_rate; + break; + } + ptr++; + } + } else { + codec_context->sample_rate = AudioCore::native_sample_rate; + } codec_context->channel_layout = AV_CH_LAYOUT_STEREO; codec_context->channels = 2; @@ -259,6 +272,12 @@ bool FFmpegAudioStream::Init(AVFormatContext* format_context) { LOG_WARNING(Render, "Audio encoder options not found: {}", buf); } + if (codec_context->frame_size) { + frame_size = static_cast(codec_context->frame_size); + } else { // variable frame size support + frame_size = std::tuple_size::value; + } + // Create audio stream stream = avformat_new_stream(format_context, codec); if (!stream || avcodec_parameters_from_context(stream->codecpar, codec_context.get()) < 0) { @@ -291,7 +310,7 @@ bool FFmpegAudioStream::Init(AVFormatContext* format_context) { // Allocate resampled data int error = av_samples_alloc_array_and_samples(&resampled_data, nullptr, codec_context->channels, - codec_context->frame_size, codec_context->sample_fmt, 0); + frame_size, codec_context->sample_fmt, 0); if (error < 0) { LOG_ERROR(Render, "Could not allocate samples storage"); return false; @@ -312,31 +331,62 @@ void FFmpegAudioStream::Free() { av_freep(&resampled_data); } -void FFmpegAudioStream::ProcessFrame(VariableAudioFrame& channel0, VariableAudioFrame& channel1) { +void FFmpegAudioStream::ProcessFrame(const VariableAudioFrame& channel0, + const VariableAudioFrame& channel1) { ASSERT_MSG(channel0.size() == channel1.size(), "Frames of the two channels must have the same number of samples"); - std::array src_data = {reinterpret_cast(channel0.data()), - reinterpret_cast(channel1.data())}; - if (swr_convert(swr_context.get(), resampled_data, channel0.size(), src_data.data(), - channel0.size()) < 0) { + const auto sample_size = av_get_bytes_per_sample(codec_context->sample_fmt); + std::array src_data = {reinterpret_cast(channel0.data()), + reinterpret_cast(channel1.data())}; + std::array dst_data = {resampled_data[0] + sample_size * offset, + resampled_data[1] + sample_size * offset}; + + auto resampled_count = swr_convert(swr_context.get(), dst_data.data(), frame_size - offset, + src_data.data(), channel0.size()); + if (resampled_count < 0) { LOG_ERROR(Render, "Audio frame dropped: Could not resample data"); return; } - // Prepare frame - audio_frame->nb_samples = channel0.size(); - audio_frame->data[0] = resampled_data[0]; - audio_frame->data[1] = resampled_data[1]; - audio_frame->pts = sample_count; - sample_count += channel0.size(); + offset += resampled_count; + if (offset < frame_size) { // Still not enough to form a frame + return; + } - SendFrame(audio_frame.get()); + while (true) { + // Prepare frame + audio_frame->nb_samples = frame_size; + audio_frame->data[0] = resampled_data[0]; + audio_frame->data[1] = resampled_data[1]; + audio_frame->pts = frame_count * frame_size; + frame_count++; + + SendFrame(audio_frame.get()); + + // swr_convert buffers input internally. Try to get more resampled data + resampled_count = swr_convert(swr_context.get(), resampled_data, frame_size, nullptr, 0); + if (resampled_count < 0) { + LOG_ERROR(Render, "Audio frame dropped: Could not resample data"); + return; + } + if (static_cast(resampled_count) < frame_size) { + offset = resampled_count; + break; + } + } } -std::size_t FFmpegAudioStream::GetAudioFrameSize() const { - ASSERT_MSG(codec_context, "Codec context is not initialized yet!"); - return codec_context->frame_size; +void FFmpegAudioStream::Flush() { + // Send the last samples + audio_frame->nb_samples = offset; + audio_frame->data[0] = resampled_data[0]; + audio_frame->data[1] = resampled_data[1]; + audio_frame->pts = frame_count * frame_size; + + SendFrame(audio_frame.get()); + + FFmpegStream::Flush(); } FFmpegMuxer::~FFmpegMuxer() { @@ -402,7 +452,8 @@ void FFmpegMuxer::ProcessVideoFrame(VideoFrame& frame) { video_stream.ProcessFrame(frame); } -void FFmpegMuxer::ProcessAudioFrame(VariableAudioFrame& channel0, VariableAudioFrame& channel1) { +void FFmpegMuxer::ProcessAudioFrame(const VariableAudioFrame& channel0, + const VariableAudioFrame& channel1) { audio_stream.ProcessFrame(channel0, channel1); } @@ -414,10 +465,6 @@ void FFmpegMuxer::FlushAudio() { audio_stream.Flush(); } -std::size_t FFmpegMuxer::GetAudioFrameSize() const { - return audio_stream.GetAudioFrameSize(); -} - void FFmpegMuxer::WriteTrailer() { av_write_trailer(format_context.get()); } @@ -498,24 +545,20 @@ void FFmpegBackend::AddVideoFrame(VideoFrame frame) { } void FFmpegBackend::AddAudioFrame(AudioCore::StereoFrame16 frame) { - std::array, 2> refactored_frame; + std::array refactored_frame; + for (auto& channel : refactored_frame) { + channel.resize(frame.size()); + } for (std::size_t i = 0; i < frame.size(); i++) { refactored_frame[0][i] = frame[i][0]; refactored_frame[1][i] = frame[i][1]; } - for (auto i : {0, 1}) { - audio_buffers[i].insert(audio_buffers[i].end(), refactored_frame[i].begin(), - refactored_frame[i].end()); - } - CheckAudioBuffer(); + ffmpeg.ProcessAudioFrame(refactored_frame[0], refactored_frame[1]); } void FFmpegBackend::AddAudioSample(const std::array& sample) { - for (auto i : {0, 1}) { - audio_buffers[i].push_back(sample[i]); - } - CheckAudioBuffer(); + ffmpeg.ProcessAudioFrame({sample[0]}, {sample[1]}); } void FFmpegBackend::StopDumping() { @@ -525,12 +568,6 @@ void FFmpegBackend::StopDumping() { // Flush the video processing queue AddVideoFrame(VideoFrame()); for (auto i : {0, 1}) { - // Add remaining data to audio queue - if (audio_buffers[i].size() >= 0) { - VariableAudioFrame buffer(audio_buffers[i].begin(), audio_buffers[i].end()); - audio_frame_queues[i].Push(std::move(buffer)); - audio_buffers[i].clear(); - } // Flush the audio processing queue audio_frame_queues[i].Push(VariableAudioFrame()); } @@ -554,18 +591,4 @@ void FFmpegBackend::EndDumping() { processing_ended.Set(); } -void FFmpegBackend::CheckAudioBuffer() { - for (auto i : {0, 1}) { - const std::size_t frame_size = ffmpeg.GetAudioFrameSize(); - // Add audio data to the queue when there is enough to form a frame - while (audio_buffers[i].size() >= frame_size) { - VariableAudioFrame buffer(audio_buffers[i].begin(), - audio_buffers[i].begin() + frame_size); - audio_frame_queues[i].Push(std::move(buffer)); - - audio_buffers[i].erase(audio_buffers[i].begin(), audio_buffers[i].begin() + frame_size); - } - } -} - } // namespace VideoDumper diff --git a/src/core/dumping/ffmpeg_backend.h b/src/core/dumping/ffmpeg_backend.h index f08f31d3d1..f0962189e6 100644 --- a/src/core/dumping/ffmpeg_backend.h +++ b/src/core/dumping/ffmpeg_backend.h @@ -96,6 +96,7 @@ private: /** * A FFmpegStream used for audio data. * Resamples (converts), encodes and writes a frame. + * This also temporarily stores resampled audio data before there are enough to form a frame. */ class FFmpegAudioStream : public FFmpegStream { public: @@ -103,8 +104,8 @@ public: bool Init(AVFormatContext* format_context); void Free(); - void ProcessFrame(VariableAudioFrame& channel0, VariableAudioFrame& channel1); - std::size_t GetAudioFrameSize() const; + void ProcessFrame(const VariableAudioFrame& channel0, const VariableAudioFrame& channel1); + void Flush(); private: struct SwrContextDeleter { @@ -113,12 +114,14 @@ private: } }; - u64 sample_count{}; + u64 frame_size{}; + u64 frame_count{}; std::unique_ptr audio_frame{}; std::unique_ptr swr_context{}; u8** resampled_data{}; + u64 offset{}; // Number of output samples that are currently in resampled_data. }; /** @@ -132,10 +135,9 @@ public: bool Init(const std::string& path, const Layout::FramebufferLayout& layout); void Free(); void ProcessVideoFrame(VideoFrame& frame); - void ProcessAudioFrame(VariableAudioFrame& channel0, VariableAudioFrame& channel1); + void ProcessAudioFrame(const VariableAudioFrame& channel0, const VariableAudioFrame& channel1); void FlushVideo(); void FlushAudio(); - std::size_t GetAudioFrameSize() const; void WriteTrailer(); private: @@ -153,8 +155,7 @@ private: /** * FFmpeg video dumping backend. - * This class implements a double buffer, and an audio queue to keep audio data - * before enough data is received to form a frame. + * This class implements a double buffer. */ class FFmpegBackend : public Backend { public: @@ -169,7 +170,6 @@ public: Layout::FramebufferLayout GetLayout() const override; private: - void CheckAudioBuffer(); void EndDumping(); std::atomic_bool is_dumping = false; ///< Whether the backend is currently dumping @@ -182,9 +182,6 @@ private: Common::Event event1, event2; std::thread video_processing_thread; - /// An audio buffer used to temporarily hold audio data, before the size is big enough - /// to be sent to the encoder as a frame - std::array audio_buffers; std::array, 2> audio_frame_queues; std::thread audio_processing_thread;