Merge pull request #9559 from FernandoS27/cached-writes

VideoCore: Implement Cached Writes, use fastmem for reading GPU memory and eliminate old stuffs
2023-01-06 07:31:39 +07:00 · 2023-01-06 07:31:39 +07:00 · 5bcbb8de45
parent 990fe2b3fc b56ad93bbc
commit 5bcbb8de45
15 changed files with 233 additions and 53 deletions
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@ -436,7 +436,7 @@ struct Memory::Impl {
        }

        if (Settings::IsFastmemEnabled()) {
-            const bool is_read_enable = Settings::IsGPULevelHigh() || !cached;
+            const bool is_read_enable = !Settings::IsGPULevelExtreme() || !cached;
            system.DeviceMemory().buffer.Protect(vaddr, size, is_read_enable, !cached);
        }

--- a/src/tests/video_core/buffer_base.cpp
+++ b/src/tests/video_core/buffer_base.cpp
@ -538,7 +538,7 @@ TEST_CASE("BufferBase: Cached write downloads") {
    int num = 0;
    buffer.ForEachDownloadRangeAndClear(c, WORD, [&](u64 offset, u64 size) { ++num; });
    buffer.ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { ++num; });
-    REQUIRE(num == 0);
+    REQUIRE(num == 1);
    REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE));
    REQUIRE(!buffer.IsRegionGpuModified(c + PAGE, PAGE));
    buffer.FlushCachedWrites();
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@ -85,6 +85,7 @@ add_library(video_core STATIC
    gpu.h
    gpu_thread.cpp
    gpu_thread.h
+    invalidation_accumulator.h
    memory_manager.cpp
    memory_manager.h
    precompiled_headers.h
--- a/src/video_core/buffer_cache/buffer_base.h
+++ b/src/video_core/buffer_cache/buffer_base.h
@ -430,7 +430,7 @@ private:
        if (query_begin >= SizeBytes() || size < 0) {
            return;
        }
-        u64* const untracked_words = Array<Type::Untracked>();
+        [[maybe_unused]] u64* const untracked_words = Array<Type::Untracked>();
        u64* const state_words = Array<type>();
        const u64 query_end = query_begin + std::min(static_cast<u64>(size), SizeBytes());
        u64* const words_begin = state_words + query_begin / BYTES_PER_WORD;
@ -483,7 +483,7 @@ private:
                NotifyRasterizer<true>(word_index, current_bits, ~u64{0});
            }
            // Exclude CPU modified pages when visiting GPU pages
-            const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0);
+            const u64 word = current_word;
            u64 page = page_begin;
            page_begin = 0;

@ -531,7 +531,7 @@ private:
    [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept {
        static_assert(type != Type::Untracked);

-        const u64* const untracked_words = Array<Type::Untracked>();
+        [[maybe_unused]] const u64* const untracked_words = Array<Type::Untracked>();
        const u64* const state_words = Array<type>();
        const u64 num_query_words = size / BYTES_PER_WORD + 1;
        const u64 word_begin = offset / BYTES_PER_WORD;
@ -539,8 +539,7 @@ private:
        const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE);
        u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD;
        for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) {
-            const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0;
-            const u64 word = state_words[word_index] & ~off_word;
+            const u64 word = state_words[word_index];
            if (word == 0) {
                continue;
            }
@ -564,7 +563,7 @@ private:
    [[nodiscard]] std::pair<u64, u64> ModifiedRegion(u64 offset, u64 size) const noexcept {
        static_assert(type != Type::Untracked);

-        const u64* const untracked_words = Array<Type::Untracked>();
+        [[maybe_unused]] const u64* const untracked_words = Array<Type::Untracked>();
        const u64* const state_words = Array<type>();
        const u64 num_query_words = size / BYTES_PER_WORD + 1;
        const u64 word_begin = offset / BYTES_PER_WORD;
@ -574,8 +573,7 @@ private:
        u64 begin = std::numeric_limits<u64>::max();
        u64 end = 0;
        for (u64 word_index = word_begin; word_index < word_end; ++word_index) {
-            const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0;
-            const u64 word = state_words[word_index] & ~off_word;
+            const u64 word = state_words[word_index];
            if (word == 0) {
                continue;
            }
--- a/src/video_core/engines/engine_upload.cpp
+++ b/src/video_core/engines/engine_upload.cpp
@ -76,7 +76,7 @@ void State::ProcessData(std::span<const u8> read_buffer) {
                                       regs.dest.height, regs.dest.depth, x_offset, regs.dest.y,
                                       x_elements, regs.line_count, regs.dest.BlockHeight(),
                                       regs.dest.BlockDepth(), regs.line_length_in);
-        memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size);
+        memory_manager.WriteBlockCached(address, tmp_buffer.data(), dst_size);
    }
 }

--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@ -6,6 +6,7 @@
 #include "common/microprofile.h"
 #include "video_core/engines/fermi_2d.h"
 #include "video_core/engines/sw_blitter/blitter.h"
+#include "video_core/memory_manager.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/surface.h"
 #include "video_core/textures/decoders.h"
@ -20,8 +21,8 @@ namespace Tegra::Engines {

 using namespace Texture;

-Fermi2D::Fermi2D(MemoryManager& memory_manager_) {
-    sw_blitter = std::make_unique<Blitter::SoftwareBlitEngine>(memory_manager_);
+Fermi2D::Fermi2D(MemoryManager& memory_manager_) : memory_manager{memory_manager_} {
+    sw_blitter = std::make_unique<Blitter::SoftwareBlitEngine>(memory_manager);
    // Nvidia's OpenGL driver seems to assume these values
    regs.src.depth = 1;
    regs.dst.depth = 1;
@ -104,6 +105,7 @@ void Fermi2D::Blit() {
        config.src_x0 = 0;
    }

+    memory_manager.FlushCaching();
    if (!rasterizer->AccelerateSurfaceCopy(src, regs.dst, config)) {
        sw_blitter->Blit(src, regs.dst, config);
    }
--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@ -305,6 +305,7 @@ public:
 private:
    VideoCore::RasterizerInterface* rasterizer = nullptr;
    std::unique_ptr<Blitter::SoftwareBlitEngine> sw_blitter;
+    MemoryManager& memory_manager;

    /// Performs the copy from the source surface to the destination surface as configured in the
    /// registers.
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@ -485,11 +485,6 @@ void Maxwell3D::StampQueryResult(u64 payload, bool long_query) {
 }

 void Maxwell3D::ProcessQueryGet() {
-    // TODO(Subv): Support the other query units.
-    if (regs.report_semaphore.query.location != Regs::ReportSemaphore::Location::All) {
-        LOG_DEBUG(HW_GPU, "Locations other than ALL are unimplemented");
-    }
-
    switch (regs.report_semaphore.query.operation) {
    case Regs::ReportSemaphore::Operation::Release:
        if (regs.report_semaphore.query.short_query != 0) {
@ -649,7 +644,7 @@ void Maxwell3D::ProcessCBMultiData(const u32* start_base, u32 amount) {

    const GPUVAddr address{buffer_address + regs.const_buffer.offset};
    const size_t copy_size = amount * sizeof(u32);
-    memory_manager.WriteBlock(address, start_base, copy_size);
+    memory_manager.WriteBlockCached(address, start_base, copy_size);

    // Increment the current buffer position.
    regs.const_buffer.offset += static_cast<u32>(copy_size);
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@ -69,7 +69,7 @@ void MaxwellDMA::Launch() {
    if (launch.multi_line_enable) {
        const bool is_src_pitch = launch.src_memory_layout == LaunchDMA::MemoryLayout::PITCH;
        const bool is_dst_pitch = launch.dst_memory_layout == LaunchDMA::MemoryLayout::PITCH;
-
+        memory_manager.FlushCaching();
        if (!is_src_pitch && !is_dst_pitch) {
            // If both the source and the destination are in block layout, assert.
            CopyBlockLinearToBlockLinear();
@ -104,6 +104,7 @@ void MaxwellDMA::Launch() {
                                            reinterpret_cast<u8*>(tmp_buffer.data()),
                                            regs.line_length_in * sizeof(u32));
        } else {
+            memory_manager.FlushCaching();
            const auto convert_linear_2_blocklinear_addr = [](u64 address) {
                return (address & ~0x1f0ULL) | ((address & 0x40) >> 2) | ((address & 0x10) << 1) |
                       ((address & 0x180) >> 1) | ((address & 0x20) << 3);
@ -121,8 +122,8 @@ void MaxwellDMA::Launch() {
                    memory_manager.ReadBlockUnsafe(
                        convert_linear_2_blocklinear_addr(regs.offset_in + offset),
                        tmp_buffer.data(), tmp_buffer.size());
-                    memory_manager.WriteBlock(regs.offset_out + offset, tmp_buffer.data(),
-                                              tmp_buffer.size());
+                    memory_manager.WriteBlockCached(regs.offset_out + offset, tmp_buffer.data(),
+                                                    tmp_buffer.size());
                }
            } else if (is_src_pitch && !is_dst_pitch) {
                UNIMPLEMENTED_IF(regs.line_length_in % 16 != 0);
@ -132,7 +133,7 @@ void MaxwellDMA::Launch() {
                for (u32 offset = 0; offset < regs.line_length_in; offset += 16) {
                    memory_manager.ReadBlockUnsafe(regs.offset_in + offset, tmp_buffer.data(),
                                                   tmp_buffer.size());
-                    memory_manager.WriteBlock(
+                    memory_manager.WriteBlockCached(
                        convert_linear_2_blocklinear_addr(regs.offset_out + offset),
                        tmp_buffer.data(), tmp_buffer.size());
                }
@ -141,8 +142,8 @@ void MaxwellDMA::Launch() {
                    std::vector<u8> tmp_buffer(regs.line_length_in);
                    memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(),
                                                   regs.line_length_in);
-                    memory_manager.WriteBlock(regs.offset_out, tmp_buffer.data(),
-                                              regs.line_length_in);
+                    memory_manager.WriteBlockCached(regs.offset_out, tmp_buffer.data(),
+                                                    regs.line_length_in);
                }
            }
        }
@ -204,7 +205,7 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
                     src_params.origin.y, x_elements, regs.line_count, block_height, block_depth,
                     regs.pitch_out);

-    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+    memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
 }

 void MaxwellDMA::CopyPitchToBlockLinear() {
@ -256,7 +257,7 @@ void MaxwellDMA::CopyPitchToBlockLinear() {
                   dst_params.origin.y, x_elements, regs.line_count, block_height, block_depth,
                   regs.pitch_in);

-    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+    memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
 }

 void MaxwellDMA::FastCopyBlockLinearToPitch() {
@ -287,7 +288,7 @@ void MaxwellDMA::FastCopyBlockLinearToPitch() {
                     regs.src_params.block_size.height, regs.src_params.block_size.depth,
                     regs.pitch_out);

-    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+    memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
 }

 void MaxwellDMA::CopyBlockLinearToBlockLinear() {
@ -347,7 +348,7 @@ void MaxwellDMA::CopyBlockLinearToBlockLinear() {
                   dst.depth, dst_x_offset, dst.origin.y, x_elements, regs.line_count,
                   dst.block_size.height, dst.block_size.depth, pitch);

-    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+    memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
 }

 void MaxwellDMA::ReleaseSemaphore() {
--- a/src/video_core/invalidation_accumulator.h
+++ b/src/video_core/invalidation_accumulator.h
@ -0,0 +1,79 @@
+// SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include <utility>
+#include <vector>
+
+#include "common/common_types.h"
+
+namespace VideoCommon {
+
+/// Accumulates the GPU virtual address ranges touched by cached writes so that they can be
+/// handed to a callback in one batch.
+///
+/// Every range is widened to `atomicity_size` (32-byte) boundaries. A range that begins
+/// exactly where the range being accumulated ends extends it; any other new range closes the
+/// current one (moving it into `buffer`) and starts a fresh one.
+class InvalidationAccumulator {
+public:
+    InvalidationAccumulator() = default;
+    ~InvalidationAccumulator() = default;
+
+    /// Records that `size` bytes starting at `address` were written.
+    void Add(GPUVAddr address, size_t size) {
+        // Fast path: the write lies entirely inside the range currently being accumulated.
+        if (address >= start_address && address + size <= last_collection) [[likely]] {
+            return;
+        }
+        // Widen the write to atomicity boundaries. The size has to be measured from the
+        // aligned-down base: measuring it from the unaligned address would leave the tail of
+        // the write outside the recorded range and would keep last_collection unaligned,
+        // which defeats the contiguity check below.
+        const GPUVAddr aligned_end = (address + size + atomicity_size_mask) & atomicity_mask;
+        address &= atomicity_mask;
+        size = static_cast<size_t>(aligned_end - address);
+
+        // Closes the range being accumulated (if any) and starts a new one at `address`.
+        const auto reset_values = [&]() {
+            if (has_collected) {
+                buffer.emplace_back(start_address, accumulated_size);
+            }
+            start_address = address;
+            accumulated_size = size;
+            last_collection = start_address + size;
+        };
+        if (!has_collected) [[unlikely]] {
+            reset_values();
+            has_collected = true;
+            return;
+        }
+        if (address != last_collection) [[unlikely]] {
+            // Not contiguous with the current range: start a new one.
+            reset_values();
+            return;
+        }
+        // Contiguous: grow the current range in place.
+        accumulated_size += size;
+        last_collection += size;
+    }
+
+    /// Drops every recorded range and returns to the initial, empty state.
+    void Clear() {
+        buffer.clear();
+        start_address = 0;
+        last_collection = 0;
+        accumulated_size = 0;
+        has_collected = false;
+    }
+
+    /// Returns true when at least one range has been recorded since the last Clear().
+    [[nodiscard]] bool AnyAccumulated() const {
+        return has_collected;
+    }
+
+    /// Invokes `func(address, size)` for every closed range, in insertion order, and then for
+    /// the range still being accumulated. Does nothing when nothing has been recorded.
+    /// The accumulator itself is left untouched, so the call can be repeated; use Clear() to
+    /// discard the ranges afterwards.
+    template <typename Func>
+    void Callback(Func&& func) {
+        if (!has_collected) {
+            return;
+        }
+        for (auto& [address, size] : buffer) {
+            func(address, size);
+        }
+        // Pass copies so the callback cannot alias the accumulator's own state.
+        GPUVAddr pending_address = start_address;
+        size_t pending_size = accumulated_size;
+        func(pending_address, pending_size);
+    }
+
+private:
+    static constexpr size_t atomicity_bits = 5;
+    static constexpr GPUVAddr atomicity_size = GPUVAddr{1} << atomicity_bits;
+    static constexpr GPUVAddr atomicity_size_mask = atomicity_size - 1;
+    static constexpr GPUVAddr atomicity_mask = ~atomicity_size_mask;
+    GPUVAddr start_address{};   ///< Aligned base of the range being accumulated.
+    GPUVAddr last_collection{}; ///< One past the end of the range being accumulated.
+    size_t accumulated_size{};  ///< Size in bytes of the range being accumulated.
+    bool has_collected{};       ///< True once Add() has recorded at least one range.
+    std::vector<std::pair<GPUVAddr, size_t>> buffer; ///< Closed ranges, in insertion order.
+};
+
+} // namespace VideoCommon
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@ -6,11 +6,13 @@
 #include "common/alignment.h"
 #include "common/assert.h"
 #include "common/logging/log.h"
+#include "common/settings.h"
 #include "core/core.h"
 #include "core/device_memory.h"
 #include "core/hle/kernel/k_page_table.h"
 #include "core/hle/kernel/k_process.h"
 #include "core/memory.h"
+#include "video_core/invalidation_accumulator.h"
 #include "video_core/memory_manager.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_base.h"
@ -26,7 +28,8 @@ MemoryManager::MemoryManager(Core::System& system_, u64 address_space_bits_, u64
      entries{}, big_entries{}, page_table{address_space_bits, address_space_bits + page_bits - 38,
                                           page_bits != big_page_bits ? page_bits : 0},
      kind_map{PTEKind::INVALID}, unique_identifier{unique_identifier_generator.fetch_add(
-                                      1, std::memory_order_acq_rel)} {
+                                      1, std::memory_order_acq_rel)},
+      accumulator{std::make_unique<VideoCommon::InvalidationAccumulator>()} {
    address_space_size = 1ULL << address_space_bits;
    page_size = 1ULL << page_bits;
    page_mask = page_size - 1ULL;
@ -43,6 +46,11 @@ MemoryManager::MemoryManager(Core::System& system_, u64 address_space_bits_, u64
    big_page_table_cpu.resize(big_page_table_size);
    big_page_continous.resize(big_page_table_size / continous_bits, 0);
    entries.resize(page_table_size / 32, 0);
+    if (!Settings::IsGPULevelExtreme() && Settings::IsFastmemEnabled()) {
+        fastmem_arena = system.DeviceMemory().buffer.VirtualBasePointer();
+    } else {
+        fastmem_arena = nullptr;
+    }
 }

 MemoryManager::~MemoryManager() = default;
@ -185,15 +193,12 @@ void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) {
    if (size == 0) {
        return;
    }
-    const auto submapped_ranges = GetSubmappedRange(gpu_addr, size);
+    GetSubmappedRangeImpl<false>(gpu_addr, size, page_stash);

-    for (const auto& [map_addr, map_size] : submapped_ranges) {
-        // Flush and invalidate through the GPU interface, to be asynchronous if possible.
-        const std::optional<VAddr> cpu_addr = GpuToCpuAddress(map_addr);
-        ASSERT(cpu_addr);
-
-        rasterizer->UnmapMemory(*cpu_addr, map_size);
+    for (const auto& [map_addr, map_size] : page_stash) {
+        rasterizer->UnmapMemory(map_addr, map_size);
    }
+    page_stash.clear();

    BigPageTableOp<EntryType::Free>(gpu_addr, 0, size, PTEKind::INVALID);
    PageTableOp<EntryType::Free>(gpu_addr, 0, size, PTEKind::INVALID);
@ -355,7 +360,7 @@ inline void MemoryManager::MemoryOperation(GPUVAddr gpu_src_addr, std::size_t si
    }
 }

-template <bool is_safe>
+template <bool is_safe, bool use_fastmem>
 void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size,
                                  [[maybe_unused]] VideoCommon::CacheType which) const {
    auto set_to_zero = [&]([[maybe_unused]] std::size_t page_index,
@ -369,8 +374,12 @@ void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std:
        if constexpr (is_safe) {
            rasterizer->FlushRegion(cpu_addr_base, copy_amount, which);
        }
-        u8* physical = memory.GetPointer(cpu_addr_base);
-        std::memcpy(dest_buffer, physical, copy_amount);
+        if constexpr (use_fastmem) {
+            std::memcpy(dest_buffer, &fastmem_arena[cpu_addr_base], copy_amount);
+        } else {
+            u8* physical = memory.GetPointer(cpu_addr_base);
+            std::memcpy(dest_buffer, physical, copy_amount);
+        }
        dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;
    };
    auto mapped_big = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
@ -379,11 +388,15 @@ void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std:
        if constexpr (is_safe) {
            rasterizer->FlushRegion(cpu_addr_base, copy_amount, which);
        }
-        if (!IsBigPageContinous(page_index)) [[unlikely]] {
-            memory.ReadBlockUnsafe(cpu_addr_base, dest_buffer, copy_amount);
+        if constexpr (use_fastmem) {
+            std::memcpy(dest_buffer, &fastmem_arena[cpu_addr_base], copy_amount);
        } else {
-            u8* physical = memory.GetPointer(cpu_addr_base);
-            std::memcpy(dest_buffer, physical, copy_amount);
+            if (!IsBigPageContinous(page_index)) [[unlikely]] {
+                memory.ReadBlockUnsafe(cpu_addr_base, dest_buffer, copy_amount);
+            } else {
+                u8* physical = memory.GetPointer(cpu_addr_base);
+                std::memcpy(dest_buffer, physical, copy_amount);
+            }
        }
        dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;
    };
@ -397,12 +410,20 @@ void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std:

 void MemoryManager::ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size,
                              VideoCommon::CacheType which) const {
-    ReadBlockImpl<true>(gpu_src_addr, dest_buffer, size, which);
+    if (fastmem_arena) [[likely]] {
+        ReadBlockImpl<true, true>(gpu_src_addr, dest_buffer, size, which);
+        return;
+    }
+    ReadBlockImpl<true, false>(gpu_src_addr, dest_buffer, size, which);
 }

 void MemoryManager::ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer,
                                    const std::size_t size) const {
-    ReadBlockImpl<false>(gpu_src_addr, dest_buffer, size, VideoCommon::CacheType::None);
+    if (fastmem_arena) [[likely]] {
+        ReadBlockImpl<false, true>(gpu_src_addr, dest_buffer, size, VideoCommon::CacheType::None);
+        return;
+    }
+    ReadBlockImpl<false, false>(gpu_src_addr, dest_buffer, size, VideoCommon::CacheType::None);
 }

 template <bool is_safe>
@ -454,6 +475,12 @@ void MemoryManager::WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buf
    WriteBlockImpl<false>(gpu_dest_addr, src_buffer, size, VideoCommon::CacheType::None);
 }

+/// Performs the same write as WriteBlockUnsafe and additionally records the destination
+/// range in the invalidation accumulator, where it stays until FlushCaching() consumes it.
+void MemoryManager::WriteBlockCached(GPUVAddr gpu_dest_addr, const void* src_buffer,
+                                     std::size_t size) {
+    WriteBlockUnsafe(gpu_dest_addr, src_buffer, size);
+    accumulator->Add(gpu_dest_addr, size);
+}
+
 void MemoryManager::FlushRegion(GPUVAddr gpu_addr, size_t size,
                                VideoCommon::CacheType which) const {
    auto do_nothing = [&]([[maybe_unused]] std::size_t page_index,
@ -663,7 +690,17 @@ bool MemoryManager::IsFullyMappedRange(GPUVAddr gpu_addr, std::size_t size) cons
 std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
    GPUVAddr gpu_addr, std::size_t size) const {
    std::vector<std::pair<GPUVAddr, std::size_t>> result{};
-    std::optional<std::pair<GPUVAddr, std::size_t>> last_segment{};
+    GetSubmappedRangeImpl<true>(gpu_addr, size, result);
+    return result;
+}
+
+template <bool is_gpu_address>
+void MemoryManager::GetSubmappedRangeImpl(
+    GPUVAddr gpu_addr, std::size_t size,
+    std::vector<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>>&
+        result) const {
+    std::optional<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>>
+        last_segment{};
    std::optional<VAddr> old_page_addr{};
    const auto split = [&last_segment, &result]([[maybe_unused]] std::size_t page_index,
                                                [[maybe_unused]] std::size_t offset,
@ -685,8 +722,12 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
        }
        old_page_addr = {cpu_addr_base + copy_amount};
        if (!last_segment) {
-            const GPUVAddr new_base_addr = (page_index << big_page_bits) + offset;
-            last_segment = {new_base_addr, copy_amount};
+            if constexpr (is_gpu_address) {
+                const GPUVAddr new_base_addr = (page_index << big_page_bits) + offset;
+                last_segment = {new_base_addr, copy_amount};
+            } else {
+                last_segment = {cpu_addr_base, copy_amount};
+            }
        } else {
            last_segment->second += copy_amount;
        }
@ -703,8 +744,12 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
        }
        old_page_addr = {cpu_addr_base + copy_amount};
        if (!last_segment) {
-            const GPUVAddr new_base_addr = (page_index << page_bits) + offset;
-            last_segment = {new_base_addr, copy_amount};
+            if constexpr (is_gpu_address) {
+                const GPUVAddr new_base_addr = (page_index << page_bits) + offset;
+                last_segment = {new_base_addr, copy_amount};
+            } else {
+                last_segment = {cpu_addr_base, copy_amount};
+            }
        } else {
            last_segment->second += copy_amount;
        }
@ -715,7 +760,18 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
    };
    MemoryOperation<true>(gpu_addr, size, extend_size_big, split, do_short_pages);
    split(0, 0, 0);
-    return result;
+}
+
+/// Hands every range recorded by WriteBlockCached to the rasterizer in a single
+/// InnerInvalidation call, then empties both the scratch list and the accumulator.
+/// Does nothing when no range has been accumulated.
+void MemoryManager::FlushCaching() {
+    if (accumulator->AnyAccumulated()) {
+        // Each accumulated GPU range is resolved through GetSubmappedRangeImpl<false> (the
+        // CPU-address variant), which writes its results into page_stash.
+        const auto collect_cpu_ranges = [this](GPUVAddr range_addr, size_t range_size) {
+            GetSubmappedRangeImpl<false>(range_addr, range_size, page_stash);
+        };
+        accumulator->Callback(collect_cpu_ranges);
+        rasterizer->InnerInvalidation(page_stash);
+        page_stash.clear();
+        accumulator->Clear();
+    }
 }

 } // namespace Tegra
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@ -19,6 +19,10 @@ namespace VideoCore {
 class RasterizerInterface;
 }

+namespace VideoCommon {
+class InvalidationAccumulator;
+}
+
 namespace Core {
 class DeviceMemory;
 namespace Memory {
@ -80,6 +84,7 @@ public:
     */
    void ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const;
    void WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);
+    void WriteBlockCached(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);

    /**
     * Checks if a gpu region can be simply read with a pointer.
@ -129,12 +134,14 @@ public:
    size_t GetMemoryLayoutSize(GPUVAddr gpu_addr,
                               size_t max_size = std::numeric_limits<size_t>::max()) const;

+    void FlushCaching();
+
 private:
    template <bool is_big_pages, typename FuncMapped, typename FuncReserved, typename FuncUnmapped>
    inline void MemoryOperation(GPUVAddr gpu_src_addr, std::size_t size, FuncMapped&& func_mapped,
                                FuncReserved&& func_reserved, FuncUnmapped&& func_unmapped) const;

-    template <bool is_safe>
+    template <bool is_safe, bool use_fastmem>
    void ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size,
                       VideoCommon::CacheType which) const;

@ -154,6 +161,12 @@ private:
    inline bool IsBigPageContinous(size_t big_page_index) const;
    inline void SetBigPageContinous(size_t big_page_index, bool value);

+    template <bool is_gpu_address>
+    void GetSubmappedRangeImpl(
+        GPUVAddr gpu_addr, std::size_t size,
+        std::vector<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>>&
+            result) const;
+
    Core::System& system;
    Core::Memory::Memory& memory;
    Core::DeviceMemory& device_memory;
@ -201,10 +214,13 @@ private:
    Common::VirtualBuffer<u32> big_page_table_cpu;

    std::vector<u64> big_page_continous;
+    std::vector<std::pair<VAddr, std::size_t>> page_stash{};
+    u8* fastmem_arena{};

    constexpr static size_t continous_bits = 64;

    const size_t unique_identifier;
+    std::unique_ptr<VideoCommon::InvalidationAccumulator> accumulator;

    static std::atomic<size_t> unique_identifier_generator;
 };
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@ -6,6 +6,7 @@
 #include <functional>
 #include <optional>
 #include <span>
+#include <utility>
 #include "common/common_types.h"
 #include "common/polyfill_thread.h"
 #include "video_core/cache_types.h"
@ -95,6 +96,12 @@ public:
    virtual void InvalidateRegion(VAddr addr, u64 size,
                                  VideoCommon::CacheType which = VideoCommon::CacheType::All) = 0;

+    /// Invalidates a batch of (CPU address, size) ranges. The default implementation simply
+    /// forwards each range to InvalidateRegion with its default cache selection.
+    virtual void InnerInvalidation(std::span<const std::pair<VAddr, std::size_t>> sequences) {
+        for (const auto& sequence : sequences) {
+            InvalidateRegion(sequence.first, sequence.second);
+        }
+    }
+
    /// Notify rasterizer that any caches of the specified region are desync with guest
    virtual void OnCPUWrite(VAddr addr, u64 size) = 0;

--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@ -186,6 +186,7 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) {

    SCOPE_EXIT({ gpu.TickWork(); });
    FlushWork();
+    gpu_memory->FlushCaching();

    query_cache.UpdateCounters();

@ -393,6 +394,7 @@ void RasterizerVulkan::Clear(u32 layer_count) {

 void RasterizerVulkan::DispatchCompute() {
    FlushWork();
+    gpu_memory->FlushCaching();

    ComputePipeline* const pipeline{pipeline_cache.CurrentComputePipeline()};
    if (!pipeline) {
@ -481,6 +483,27 @@ void RasterizerVulkan::InvalidateRegion(VAddr addr, u64 size, VideoCommon::Cache
    }
 }

+/// Batched invalidation: each cache mutex is taken once for the whole list of ranges
+/// instead of once per range.
+void RasterizerVulkan::InnerInvalidation(std::span<const std::pair<VAddr, std::size_t>> sequences) {
+    // Texture cache, under its own mutex.
+    {
+        std::scoped_lock texture_lock{texture_cache.mutex};
+        for (const auto& sequence : sequences) {
+            texture_cache.WriteMemory(sequence.first, sequence.second);
+        }
+    }
+    // Buffer cache, under its own mutex.
+    {
+        std::scoped_lock buffer_lock{buffer_cache.mutex};
+        for (const auto& sequence : sequences) {
+            buffer_cache.WriteMemory(sequence.first, sequence.second);
+        }
+    }
+    // Query and pipeline caches are invalidated per range without taking a lock here.
+    for (const auto& sequence : sequences) {
+        query_cache.InvalidateRegion(sequence.first, sequence.second);
+        pipeline_cache.InvalidateRegion(sequence.first, sequence.second);
+    }
+}
+
 void RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) {
    if (addr == 0 || size == 0) {
        return;
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@ -79,6 +79,7 @@ public:
                         VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
    void InvalidateRegion(VAddr addr, u64 size,
                          VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
+    void InnerInvalidation(std::span<const std::pair<VAddr, std::size_t>> sequences) override;
    void OnCPUWrite(VAddr addr, u64 size) override;
    void InvalidateGPUCache() override;
    void UnmapMemory(VAddr addr, u64 size) override;