Merge pull request #6799 from ameerj/vp9-fixes

nvdec: Fix VP9 reference frame refreshes
master
bunnei 2021-08-06 17:46:46 +07:00 committed by GitHub
commit f183668a87
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 51 additions and 204 deletions

@ -166,8 +166,6 @@ NvResult nvhost_nvdec_common::MapBuffer(const std::vector<u8>& input, std::vecto
LOG_ERROR(Service_NVDRV, "failed to map size={}", object->size); LOG_ERROR(Service_NVDRV, "failed to map size={}", object->size);
} else { } else {
cmd_buffer.map_address = object->dma_map_addr; cmd_buffer.map_address = object->dma_map_addr;
AddBufferMap(object->dma_map_addr, object->size, object->addr,
object->status == nvmap::Object::Status::Allocated);
} }
} }
std::memcpy(output.data(), &params, sizeof(IoctlMapBuffer)); std::memcpy(output.data(), &params, sizeof(IoctlMapBuffer));
@ -178,30 +176,11 @@ NvResult nvhost_nvdec_common::MapBuffer(const std::vector<u8>& input, std::vecto
} }
NvResult nvhost_nvdec_common::UnmapBuffer(const std::vector<u8>& input, std::vector<u8>& output) { NvResult nvhost_nvdec_common::UnmapBuffer(const std::vector<u8>& input, std::vector<u8>& output) {
IoctlMapBuffer params{}; // This is intentionally stubbed.
std::memcpy(&params, input.data(), sizeof(IoctlMapBuffer)); // Skip unmapping buffers here, so as not to break the continuity of the VP9 reference frame
std::vector<MapBufferEntry> cmd_buffer_handles(params.num_entries); // addresses or risk invalidating data before the async GPU thread is done with it
SliceVectors(input, cmd_buffer_handles, params.num_entries, sizeof(IoctlMapBuffer));
auto& gpu = system.GPU();
for (auto& cmd_buffer : cmd_buffer_handles) {
const auto object{nvmap_dev->GetObject(cmd_buffer.map_handle)};
if (!object) {
LOG_ERROR(Service_NVDRV, "invalid cmd_buffer nvmap_handle={:X}", cmd_buffer.map_handle);
std::memcpy(output.data(), &params, output.size());
return NvResult::InvalidState;
}
if (const auto size{RemoveBufferMap(object->dma_map_addr)}; size) {
gpu.MemoryManager().Unmap(object->dma_map_addr, *size);
} else {
// This occurs quite frequently, however does not seem to impact functionality
LOG_DEBUG(Service_NVDRV, "invalid offset=0x{:X} dma=0x{:X}", object->addr,
object->dma_map_addr);
}
object->dma_map_addr = 0;
}
std::memset(output.data(), 0, output.size()); std::memset(output.data(), 0, output.size());
LOG_DEBUG(Service_NVDRV, "(STUBBED) called");
return NvResult::Success; return NvResult::Success;
} }
@ -212,33 +191,4 @@ NvResult nvhost_nvdec_common::SetSubmitTimeout(const std::vector<u8>& input,
return NvResult::Success; return NvResult::Success;
} }
std::optional<nvhost_nvdec_common::BufferMap> nvhost_nvdec_common::FindBufferMap(
GPUVAddr gpu_addr) const {
const auto it = std::find_if(
buffer_mappings.begin(), buffer_mappings.upper_bound(gpu_addr), [&](const auto& entry) {
return (gpu_addr >= entry.second.StartAddr() && gpu_addr < entry.second.EndAddr());
});
ASSERT(it != buffer_mappings.end());
return it->second;
}
void nvhost_nvdec_common::AddBufferMap(GPUVAddr gpu_addr, std::size_t size, VAddr cpu_addr,
bool is_allocated) {
buffer_mappings.insert_or_assign(gpu_addr, BufferMap{gpu_addr, size, cpu_addr, is_allocated});
}
std::optional<std::size_t> nvhost_nvdec_common::RemoveBufferMap(GPUVAddr gpu_addr) {
const auto iter{buffer_mappings.find(gpu_addr)};
if (iter == buffer_mappings.end()) {
return std::nullopt;
}
std::size_t size = 0;
if (iter->second.IsAllocated()) {
size = iter->second.Size();
}
buffer_mappings.erase(iter);
return size;
}
} // namespace Service::Nvidia::Devices } // namespace Service::Nvidia::Devices

@ -23,45 +23,6 @@ public:
~nvhost_nvdec_common() override; ~nvhost_nvdec_common() override;
protected: protected:
class BufferMap final {
public:
constexpr BufferMap() = default;
constexpr BufferMap(GPUVAddr start_addr_, std::size_t size_)
: start_addr{start_addr_}, end_addr{start_addr_ + size_} {}
constexpr BufferMap(GPUVAddr start_addr_, std::size_t size_, VAddr cpu_addr_,
bool is_allocated_)
: start_addr{start_addr_}, end_addr{start_addr_ + size_}, cpu_addr{cpu_addr_},
is_allocated{is_allocated_} {}
constexpr VAddr StartAddr() const {
return start_addr;
}
constexpr VAddr EndAddr() const {
return end_addr;
}
constexpr std::size_t Size() const {
return end_addr - start_addr;
}
constexpr VAddr CpuAddr() const {
return cpu_addr;
}
constexpr bool IsAllocated() const {
return is_allocated;
}
private:
GPUVAddr start_addr{};
GPUVAddr end_addr{};
VAddr cpu_addr{};
bool is_allocated{};
};
struct IoctlSetNvmapFD { struct IoctlSetNvmapFD {
s32_le nvmap_fd{}; s32_le nvmap_fd{};
}; };
@ -154,17 +115,11 @@ protected:
NvResult UnmapBuffer(const std::vector<u8>& input, std::vector<u8>& output); NvResult UnmapBuffer(const std::vector<u8>& input, std::vector<u8>& output);
NvResult SetSubmitTimeout(const std::vector<u8>& input, std::vector<u8>& output); NvResult SetSubmitTimeout(const std::vector<u8>& input, std::vector<u8>& output);
std::optional<BufferMap> FindBufferMap(GPUVAddr gpu_addr) const;
void AddBufferMap(GPUVAddr gpu_addr, std::size_t size, VAddr cpu_addr, bool is_allocated);
std::optional<std::size_t> RemoveBufferMap(GPUVAddr gpu_addr);
s32_le nvmap_fd{}; s32_le nvmap_fd{};
u32_le submit_timeout{}; u32_le submit_timeout{};
std::shared_ptr<nvmap> nvmap_dev; std::shared_ptr<nvmap> nvmap_dev;
SyncpointManager& syncpoint_manager; SyncpointManager& syncpoint_manager;
std::array<u32, MaxSyncPoints> device_syncpoints{}; std::array<u32, MaxSyncPoints> device_syncpoints{};
// This is expected to be ordered, therefore we must use a map, not unordered_map
std::map<GPUVAddr, BufferMap> buffer_mappings;
}; };
}; // namespace Devices }; // namespace Devices
} // namespace Service::Nvidia } // namespace Service::Nvidia

@ -11,6 +11,9 @@
namespace Tegra::Decoder { namespace Tegra::Decoder {
namespace { namespace {
constexpr u32 diff_update_probability = 252;
constexpr u32 frame_sync_code = 0x498342;
// Default compressed header probabilities once frame context resets // Default compressed header probabilities once frame context resets
constexpr Vp9EntropyProbs default_probs{ constexpr Vp9EntropyProbs default_probs{
.y_mode_prob{ .y_mode_prob{
@ -361,8 +364,7 @@ Vp9PictureInfo VP9::GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state)
InsertEntropy(state.vp9_entropy_probs_offset, vp9_info.entropy); InsertEntropy(state.vp9_entropy_probs_offset, vp9_info.entropy);
// surface_luma_offset[0:3] contains the address of the reference frame offsets in the following // surface_luma_offset[0:3] contains the address of the reference frame offsets in the following
// order: last, golden, altref, current. It may be worthwhile to track the updates done here // order: last, golden, altref, current.
// to avoid buffering frame data needed for reference frame updating in the header composition.
std::copy(state.surface_luma_offset.begin(), state.surface_luma_offset.begin() + 4, std::copy(state.surface_luma_offset.begin(), state.surface_luma_offset.begin() + 4,
vp9_info.frame_offsets.begin()); vp9_info.frame_offsets.begin());
@ -384,33 +386,18 @@ Vp9FrameContainer VP9::GetCurrentFrame(const NvdecCommon::NvdecRegisters& state)
gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, current_frame.bit_stream.data(), gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, current_frame.bit_stream.data(),
current_frame.info.bitstream_size); current_frame.info.bitstream_size);
} }
// Buffer two frames, saving the last show frame info if (!next_frame.bit_stream.empty()) {
if (!next_next_frame.bit_stream.empty()) {
Vp9FrameContainer temp{ Vp9FrameContainer temp{
.info = current_frame.info, .info = current_frame.info,
.bit_stream = std::move(current_frame.bit_stream), .bit_stream = std::move(current_frame.bit_stream),
}; };
next_next_frame.info.show_frame = current_frame.info.last_frame_shown; next_frame.info.show_frame = current_frame.info.last_frame_shown;
current_frame.info = next_next_frame.info; current_frame.info = next_frame.info;
current_frame.bit_stream = std::move(next_next_frame.bit_stream); current_frame.bit_stream = std::move(next_frame.bit_stream);
next_next_frame = std::move(temp); next_frame = std::move(temp);
if (!next_frame.bit_stream.empty()) {
Vp9FrameContainer temp2{
.info = current_frame.info,
.bit_stream = std::move(current_frame.bit_stream),
};
next_frame.info.show_frame = current_frame.info.last_frame_shown;
current_frame.info = next_frame.info;
current_frame.bit_stream = std::move(next_frame.bit_stream);
next_frame = std::move(temp2);
} else {
next_frame.info = current_frame.info;
next_frame.bit_stream = std::move(current_frame.bit_stream);
}
} else { } else {
next_next_frame.info = current_frame.info; next_frame.info = current_frame.info;
next_next_frame.bit_stream = std::move(current_frame.bit_stream); next_frame.bit_stream = std::move(current_frame.bit_stream);
} }
return current_frame; return current_frame;
} }
@ -613,86 +600,64 @@ VpxBitStreamWriter VP9::ComposeUncompressedHeader() {
// Reset context // Reset context
prev_frame_probs = default_probs; prev_frame_probs = default_probs;
swap_next_golden = false; swap_ref_indices = false;
loop_filter_ref_deltas.fill(0); loop_filter_ref_deltas.fill(0);
loop_filter_mode_deltas.fill(0); loop_filter_mode_deltas.fill(0);
frame_ctxs.fill(default_probs);
// allow frames offsets to stabilize before checking for golden frames
grace_period = 4;
// On key frames, all frame slots are set to the current frame,
// so the value of the selected slot doesn't really matter.
frame_ctxs.fill({current_frame_number, false, default_probs});
// intra only, meaning the frame can be recreated with no other references // intra only, meaning the frame can be recreated with no other references
current_frame_info.intra_only = true; current_frame_info.intra_only = true;
} else { } else {
if (!current_frame_info.show_frame) { if (!current_frame_info.show_frame) {
uncomp_writer.WriteBit(current_frame_info.intra_only); uncomp_writer.WriteBit(current_frame_info.intra_only);
if (!current_frame_info.last_frame_was_key) {
swap_next_golden = !swap_next_golden;
}
} else { } else {
current_frame_info.intra_only = false; current_frame_info.intra_only = false;
} }
if (!current_frame_info.error_resilient_mode) { if (!current_frame_info.error_resilient_mode) {
uncomp_writer.WriteU(0, 2); // Reset frame context. uncomp_writer.WriteU(0, 2); // Reset frame context.
} }
const auto& curr_offsets = current_frame_info.frame_offsets;
// Last, Golden, Altref frames const auto& next_offsets = next_frame.info.frame_offsets;
std::array<s32, 3> ref_frame_index{0, 1, 2}; const bool ref_frames_different = curr_offsets[1] != curr_offsets[2];
const bool next_references_swap =
// Set when next frame is hidden (next_offsets[1] == curr_offsets[2]) || (next_offsets[2] == curr_offsets[1]);
// altref and golden references are swapped const bool needs_ref_swap = ref_frames_different && next_references_swap;
if (swap_next_golden) { if (needs_ref_swap) {
ref_frame_index = std::array<s32, 3>{0, 2, 1}; swap_ref_indices = !swap_ref_indices;
} }
union {
u32 raw;
BitField<0, 1, u32> refresh_last;
BitField<1, 2, u32> refresh_golden;
BitField<2, 1, u32> refresh_alt;
} refresh_frame_flags;
// update Last Frame refresh_frame_flags.raw = 0;
u64 refresh_frame_flags = 1; for (u32 index = 0; index < 3; ++index) {
// Refresh indices that use the current frame as an index
// golden frame may refresh, determined if the next golden frame offset is changed if (curr_offsets[3] == next_offsets[index]) {
bool golden_refresh = false; refresh_frame_flags.raw |= 1u << index;
if (grace_period <= 0) {
for (s32 index = 1; index < 3; ++index) {
if (current_frame_info.frame_offsets[index] !=
next_frame.info.frame_offsets[index]) {
current_frame_info.refresh_frame[index] = true;
golden_refresh = true;
grace_period = 3;
}
} }
} }
if (swap_ref_indices) {
if (current_frame_info.show_frame && const u32 temp = refresh_frame_flags.refresh_golden;
(!next_frame.info.show_frame || next_frame.info.is_key_frame)) { refresh_frame_flags.refresh_golden.Assign(refresh_frame_flags.refresh_alt.Value());
// Update golden frame refresh_frame_flags.refresh_alt.Assign(temp);
refresh_frame_flags = swap_next_golden ? 2 : 4;
} }
if (!current_frame_info.show_frame) {
// Update altref
refresh_frame_flags = swap_next_golden ? 2 : 4;
} else if (golden_refresh) {
refresh_frame_flags = 3;
}
if (current_frame_info.intra_only) { if (current_frame_info.intra_only) {
uncomp_writer.WriteU(frame_sync_code, 24); uncomp_writer.WriteU(frame_sync_code, 24);
uncomp_writer.WriteU(static_cast<s32>(refresh_frame_flags), 8); uncomp_writer.WriteU(refresh_frame_flags.raw, 8);
uncomp_writer.WriteU(current_frame_info.frame_size.width - 1, 16); uncomp_writer.WriteU(current_frame_info.frame_size.width - 1, 16);
uncomp_writer.WriteU(current_frame_info.frame_size.height - 1, 16); uncomp_writer.WriteU(current_frame_info.frame_size.height - 1, 16);
uncomp_writer.WriteBit(false); // Render and frame size different. uncomp_writer.WriteBit(false); // Render and frame size different.
} else { } else {
uncomp_writer.WriteU(static_cast<s32>(refresh_frame_flags), 8); const bool swap_indices = needs_ref_swap ^ swap_ref_indices;
const auto ref_frame_index = swap_indices ? std::array{0, 2, 1} : std::array{0, 1, 2};
for (s32 index = 1; index < 4; index++) { uncomp_writer.WriteU(refresh_frame_flags.raw, 8);
for (size_t index = 1; index < 4; index++) {
uncomp_writer.WriteU(ref_frame_index[index - 1], 3); uncomp_writer.WriteU(ref_frame_index[index - 1], 3);
uncomp_writer.WriteU(current_frame_info.ref_frame_sign_bias[index], 1); uncomp_writer.WriteU(current_frame_info.ref_frame_sign_bias[index], 1);
} }
uncomp_writer.WriteBit(true); // Frame size with refs. uncomp_writer.WriteBit(true); // Frame size with refs.
uncomp_writer.WriteBit(false); // Render and frame size different. uncomp_writer.WriteBit(false); // Render and frame size different.
uncomp_writer.WriteBit(current_frame_info.allow_high_precision_mv); uncomp_writer.WriteBit(current_frame_info.allow_high_precision_mv);
@ -714,10 +679,9 @@ VpxBitStreamWriter VP9::ComposeUncompressedHeader() {
frame_ctx_idx = 1; frame_ctx_idx = 1;
} }
uncomp_writer.WriteU(frame_ctx_idx, 2); // Frame context index. uncomp_writer.WriteU(frame_ctx_idx, 2); // Frame context index.
prev_frame_probs = prev_frame_probs = frame_ctxs[frame_ctx_idx]; // reference probabilities for compressed header
frame_ctxs[frame_ctx_idx].probs; // reference probabilities for compressed header frame_ctxs[frame_ctx_idx] = current_frame_info.entropy;
frame_ctxs[frame_ctx_idx] = {current_frame_number, false, current_frame_info.entropy};
uncomp_writer.WriteU(current_frame_info.first_level, 6); uncomp_writer.WriteU(current_frame_info.first_level, 6);
uncomp_writer.WriteU(current_frame_info.sharpness_level, 3); uncomp_writer.WriteU(current_frame_info.sharpness_level, 3);
@ -812,7 +776,6 @@ const std::vector<u8>& VP9::ComposeFrameHeader(const NvdecCommon::NvdecRegisters
current_frame_info = curr_frame.info; current_frame_info = curr_frame.info;
bitstream = std::move(curr_frame.bit_stream); bitstream = std::move(curr_frame.bit_stream);
} }
// The uncompressed header routine sets PrevProb parameters needed for the compressed header // The uncompressed header routine sets PrevProb parameters needed for the compressed header
auto uncomp_writer = ComposeUncompressedHeader(); auto uncomp_writer = ComposeUncompressedHeader();
std::vector<u8> compressed_header = ComposeCompressedHeader(); std::vector<u8> compressed_header = ComposeCompressedHeader();
@ -828,13 +791,6 @@ const std::vector<u8>& VP9::ComposeFrameHeader(const NvdecCommon::NvdecRegisters
frame.begin() + uncompressed_header.size()); frame.begin() + uncompressed_header.size());
std::copy(bitstream.begin(), bitstream.end(), std::copy(bitstream.begin(), bitstream.end(),
frame.begin() + uncompressed_header.size() + compressed_header.size()); frame.begin() + uncompressed_header.size() + compressed_header.size());
// keep track of frame number
current_frame_number++;
grace_period--;
// don't display hidden frames
hidden = !current_frame_info.show_frame;
return frame; return frame;
} }

@ -14,7 +14,6 @@
namespace Tegra { namespace Tegra {
class GPU; class GPU;
enum class FrameType { KeyFrame = 0, InterFrame = 1 };
namespace Decoder { namespace Decoder {
/// The VpxRangeEncoder, and VpxBitStreamWriter classes are used to compose the /// The VpxRangeEncoder, and VpxBitStreamWriter classes are used to compose the
@ -124,7 +123,7 @@ public:
/// Returns true if the most recent frame was a hidden frame. /// Returns true if the most recent frame was a hidden frame.
[[nodiscard]] bool WasFrameHidden() const { [[nodiscard]] bool WasFrameHidden() const {
return hidden; return !current_frame_info.show_frame;
} }
private: private:
@ -178,19 +177,12 @@ private:
std::array<s8, 4> loop_filter_ref_deltas{}; std::array<s8, 4> loop_filter_ref_deltas{};
std::array<s8, 2> loop_filter_mode_deltas{}; std::array<s8, 2> loop_filter_mode_deltas{};
bool hidden = false;
s64 current_frame_number = -2; // since we buffer 2 frames
s32 grace_period = 6; // frame offsets need to stabilize
std::array<FrameContexts, 4> frame_ctxs{};
Vp9FrameContainer next_frame{}; Vp9FrameContainer next_frame{};
Vp9FrameContainer next_next_frame{}; std::array<Vp9EntropyProbs, 4> frame_ctxs{};
bool swap_next_golden{}; bool swap_ref_indices{};
Vp9PictureInfo current_frame_info{}; Vp9PictureInfo current_frame_info{};
Vp9EntropyProbs prev_frame_probs{}; Vp9EntropyProbs prev_frame_probs{};
s32 diff_update_probability = 252;
s32 frame_sync_code = 0x498342;
}; };
} // namespace Decoder } // namespace Decoder

@ -296,12 +296,6 @@ struct RefPoolElement {
bool refresh{}; bool refresh{};
}; };
struct FrameContexts {
s64 from;
bool adapted;
Vp9EntropyProbs probs;
};
#define ASSERT_POSITION(field_name, position) \ #define ASSERT_POSITION(field_name, position) \
static_assert(offsetof(Vp9EntropyProbs, field_name) == position, \ static_assert(offsetof(Vp9EntropyProbs, field_name) == position, \
"Field " #field_name " has invalid position") "Field " #field_name " has invalid position")