Merge pull request #2237 from bunnei/cache-host-addr

gpu: Use host address for caching instead of guest address.
master
bunnei 2019-03-16 00:05:24 +07:00 committed by GitHub
commit 47b622825c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
26 changed files with 393 additions and 293 deletions

@ -10,6 +10,7 @@
#include "core/core.h" #include "core/core.h"
#include "core/hle/service/nvdrv/devices/nvhost_as_gpu.h" #include "core/hle/service/nvdrv/devices/nvhost_as_gpu.h"
#include "core/hle/service/nvdrv/devices/nvmap.h" #include "core/hle/service/nvdrv/devices/nvmap.h"
#include "core/memory.h"
#include "video_core/memory_manager.h" #include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h" #include "video_core/rasterizer_interface.h"
#include "video_core/renderer_base.h" #include "video_core/renderer_base.h"
@ -178,7 +179,7 @@ u32 nvhost_as_gpu::UnmapBuffer(const std::vector<u8>& input, std::vector<u8>& ou
auto& gpu = system_instance.GPU(); auto& gpu = system_instance.GPU();
auto cpu_addr = gpu.MemoryManager().GpuToCpuAddress(params.offset); auto cpu_addr = gpu.MemoryManager().GpuToCpuAddress(params.offset);
ASSERT(cpu_addr); ASSERT(cpu_addr);
gpu.FlushAndInvalidateRegion(*cpu_addr, itr->second.size); gpu.FlushAndInvalidateRegion(ToCacheAddr(Memory::GetPointer(*cpu_addr)), itr->second.size);
params.offset = gpu.MemoryManager().UnmapBuffer(params.offset, itr->second.size); params.offset = gpu.MemoryManager().UnmapBuffer(params.offset, itr->second.size);

@ -67,8 +67,11 @@ static void MapPages(PageTable& page_table, VAddr base, u64 size, u8* memory, Pa
LOG_DEBUG(HW_Memory, "Mapping {} onto {:016X}-{:016X}", fmt::ptr(memory), base * PAGE_SIZE, LOG_DEBUG(HW_Memory, "Mapping {} onto {:016X}-{:016X}", fmt::ptr(memory), base * PAGE_SIZE,
(base + size) * PAGE_SIZE); (base + size) * PAGE_SIZE);
RasterizerFlushVirtualRegion(base << PAGE_BITS, size * PAGE_SIZE, // During boot, current_page_table might not be set yet, in which case we need not flush
FlushMode::FlushAndInvalidate); if (current_page_table) {
RasterizerFlushVirtualRegion(base << PAGE_BITS, size * PAGE_SIZE,
FlushMode::FlushAndInvalidate);
}
VAddr end = base + size; VAddr end = base + size;
ASSERT_MSG(end <= page_table.pointers.size(), "out of range mapping at {:016X}", ASSERT_MSG(end <= page_table.pointers.size(), "out of range mapping at {:016X}",
@ -359,13 +362,13 @@ void RasterizerFlushVirtualRegion(VAddr start, u64 size, FlushMode mode) {
auto& gpu = system_instance.GPU(); auto& gpu = system_instance.GPU();
switch (mode) { switch (mode) {
case FlushMode::Flush: case FlushMode::Flush:
gpu.FlushRegion(overlap_start, overlap_size); gpu.FlushRegion(ToCacheAddr(GetPointer(overlap_start)), overlap_size);
break; break;
case FlushMode::Invalidate: case FlushMode::Invalidate:
gpu.InvalidateRegion(overlap_start, overlap_size); gpu.InvalidateRegion(ToCacheAddr(GetPointer(overlap_start)), overlap_size);
break; break;
case FlushMode::FlushAndInvalidate: case FlushMode::FlushAndInvalidate:
gpu.FlushAndInvalidateRegion(overlap_start, overlap_size); gpu.FlushAndInvalidateRegion(ToCacheAddr(GetPointer(overlap_start)), overlap_size);
break; break;
} }
}; };

@ -9,6 +9,7 @@
#include "video_core/engines/kepler_memory.h" #include "video_core/engines/kepler_memory.h"
#include "video_core/engines/maxwell_3d.h" #include "video_core/engines/maxwell_3d.h"
#include "video_core/rasterizer_interface.h" #include "video_core/rasterizer_interface.h"
#include "video_core/renderer_base.h"
namespace Tegra::Engines { namespace Tegra::Engines {
@ -48,7 +49,8 @@ void KeplerMemory::ProcessData(u32 data) {
// We have to invalidate the destination region to evict any outdated surfaces from the cache. // We have to invalidate the destination region to evict any outdated surfaces from the cache.
// We do this before actually writing the new data because the destination address might contain // We do this before actually writing the new data because the destination address might contain
// a dirty surface that will have to be written back to memory. // a dirty surface that will have to be written back to memory.
Core::System::GetInstance().GPU().InvalidateRegion(*dest_address, sizeof(u32)); system.Renderer().Rasterizer().InvalidateRegion(ToCacheAddr(Memory::GetPointer(*dest_address)),
sizeof(u32));
Memory::Write32(*dest_address, data); Memory::Write32(*dest_address, data);
system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite(); system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();

@ -396,7 +396,10 @@ void Maxwell3D::ProcessCBData(u32 value) {
const auto address = memory_manager.GpuToCpuAddress(buffer_address + regs.const_buffer.cb_pos); const auto address = memory_manager.GpuToCpuAddress(buffer_address + regs.const_buffer.cb_pos);
ASSERT_MSG(address, "Invalid GPU address"); ASSERT_MSG(address, "Invalid GPU address");
Memory::Write32(*address, value); u8* ptr{Memory::GetPointer(*address)};
rasterizer.InvalidateRegion(ToCacheAddr(ptr), sizeof(u32));
std::memcpy(ptr, &value, sizeof(u32));
dirty_flags.OnMemoryWrite(); dirty_flags.OnMemoryWrite();
// Increment the current buffer position. // Increment the current buffer position.

@ -9,6 +9,7 @@
#include "video_core/engines/maxwell_3d.h" #include "video_core/engines/maxwell_3d.h"
#include "video_core/engines/maxwell_dma.h" #include "video_core/engines/maxwell_dma.h"
#include "video_core/rasterizer_interface.h" #include "video_core/rasterizer_interface.h"
#include "video_core/renderer_base.h"
#include "video_core/textures/decoders.h" #include "video_core/textures/decoders.h"
namespace Tegra::Engines { namespace Tegra::Engines {
@ -92,12 +93,14 @@ void MaxwellDMA::HandleCopy() {
const auto FlushAndInvalidate = [&](u32 src_size, u64 dst_size) { const auto FlushAndInvalidate = [&](u32 src_size, u64 dst_size) {
// TODO(Subv): For now, manually flush the regions until we implement GPU-accelerated // TODO(Subv): For now, manually flush the regions until we implement GPU-accelerated
// copying. // copying.
Core::System::GetInstance().GPU().FlushRegion(*source_cpu, src_size); Core::System::GetInstance().Renderer().Rasterizer().FlushRegion(
ToCacheAddr(Memory::GetPointer(*source_cpu)), src_size);
// We have to invalidate the destination region to evict any outdated surfaces from the // We have to invalidate the destination region to evict any outdated surfaces from the
// cache. We do this before actually writing the new data because the destination address // cache. We do this before actually writing the new data because the destination address
// might contain a dirty surface that will have to be written back to memory. // might contain a dirty surface that will have to be written back to memory.
Core::System::GetInstance().GPU().InvalidateRegion(*dest_cpu, dst_size); Core::System::GetInstance().Renderer().Rasterizer().InvalidateRegion(
ToCacheAddr(Memory::GetPointer(*dest_cpu)), dst_size);
}; };
if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) { if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) {

@ -11,6 +11,11 @@
#include "video_core/dma_pusher.h" #include "video_core/dma_pusher.h"
#include "video_core/memory_manager.h" #include "video_core/memory_manager.h"
/// Integer representation of a host pointer, used as the address type of the cache interfaces.
using CacheAddr = std::uintptr_t;

/// Converts a host pointer into the integer value used to identify it as a cache address.
/// @param host_ptr Host pointer to convert.
/// @returns The pointer value reinterpreted as a CacheAddr.
inline CacheAddr ToCacheAddr(const void* host_ptr) {
    const auto as_integer = reinterpret_cast<CacheAddr>(host_ptr);
    return as_integer;
}
namespace Core { namespace Core {
class System; class System;
} }
@ -209,13 +214,13 @@ public:
std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) = 0; std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) = 0;
/// Notify rasterizer that any caches of the specified region should be flushed to Switch memory /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
virtual void FlushRegion(VAddr addr, u64 size) = 0; virtual void FlushRegion(CacheAddr addr, u64 size) = 0;
/// Notify rasterizer that any caches of the specified region should be invalidated /// Notify rasterizer that any caches of the specified region should be invalidated
virtual void InvalidateRegion(VAddr addr, u64 size) = 0; virtual void InvalidateRegion(CacheAddr addr, u64 size) = 0;
/// Notify rasterizer that any caches of the specified region should be flushed and invalidated /// Notify rasterizer that any caches of the specified region should be flushed and invalidated
virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0; virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0;
private: private:
void ProcessBindMethod(const MethodCall& method_call); void ProcessBindMethod(const MethodCall& method_call);

@ -22,15 +22,15 @@ void GPUAsynch::SwapBuffers(
gpu_thread.SwapBuffers(std::move(framebuffer)); gpu_thread.SwapBuffers(std::move(framebuffer));
} }
void GPUAsynch::FlushRegion(VAddr addr, u64 size) { void GPUAsynch::FlushRegion(CacheAddr addr, u64 size) {
gpu_thread.FlushRegion(addr, size); gpu_thread.FlushRegion(addr, size);
} }
void GPUAsynch::InvalidateRegion(VAddr addr, u64 size) { void GPUAsynch::InvalidateRegion(CacheAddr addr, u64 size) {
gpu_thread.InvalidateRegion(addr, size); gpu_thread.InvalidateRegion(addr, size);
} }
void GPUAsynch::FlushAndInvalidateRegion(VAddr addr, u64 size) { void GPUAsynch::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
gpu_thread.FlushAndInvalidateRegion(addr, size); gpu_thread.FlushAndInvalidateRegion(addr, size);
} }

@ -26,9 +26,9 @@ public:
void PushGPUEntries(Tegra::CommandList&& entries) override; void PushGPUEntries(Tegra::CommandList&& entries) override;
void SwapBuffers( void SwapBuffers(
std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) override; std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) override;
void FlushRegion(VAddr addr, u64 size) override; void FlushRegion(CacheAddr addr, u64 size) override;
void InvalidateRegion(VAddr addr, u64 size) override; void InvalidateRegion(CacheAddr addr, u64 size) override;
void FlushAndInvalidateRegion(VAddr addr, u64 size) override; void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
private: private:
GPUThread::ThreadManager gpu_thread; GPUThread::ThreadManager gpu_thread;

@ -22,15 +22,15 @@ void GPUSynch::SwapBuffers(
renderer.SwapBuffers(std::move(framebuffer)); renderer.SwapBuffers(std::move(framebuffer));
} }
void GPUSynch::FlushRegion(VAddr addr, u64 size) { void GPUSynch::FlushRegion(CacheAddr addr, u64 size) {
renderer.Rasterizer().FlushRegion(addr, size); renderer.Rasterizer().FlushRegion(addr, size);
} }
void GPUSynch::InvalidateRegion(VAddr addr, u64 size) { void GPUSynch::InvalidateRegion(CacheAddr addr, u64 size) {
renderer.Rasterizer().InvalidateRegion(addr, size); renderer.Rasterizer().InvalidateRegion(addr, size);
} }
void GPUSynch::FlushAndInvalidateRegion(VAddr addr, u64 size) { void GPUSynch::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
renderer.Rasterizer().FlushAndInvalidateRegion(addr, size); renderer.Rasterizer().FlushAndInvalidateRegion(addr, size);
} }

@ -21,9 +21,9 @@ public:
void PushGPUEntries(Tegra::CommandList&& entries) override; void PushGPUEntries(Tegra::CommandList&& entries) override;
void SwapBuffers( void SwapBuffers(
std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) override; std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) override;
void FlushRegion(VAddr addr, u64 size) override; void FlushRegion(CacheAddr addr, u64 size) override;
void InvalidateRegion(VAddr addr, u64 size) override; void InvalidateRegion(CacheAddr addr, u64 size) override;
void FlushAndInvalidateRegion(VAddr addr, u64 size) override; void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
}; };
} // namespace VideoCommon } // namespace VideoCommon

@ -5,7 +5,6 @@
#include "common/assert.h" #include "common/assert.h"
#include "common/microprofile.h" #include "common/microprofile.h"
#include "core/frontend/scope_acquire_window_context.h" #include "core/frontend/scope_acquire_window_context.h"
#include "core/settings.h"
#include "video_core/dma_pusher.h" #include "video_core/dma_pusher.h"
#include "video_core/gpu.h" #include "video_core/gpu.h"
#include "video_core/gpu_thread.h" #include "video_core/gpu_thread.h"
@ -13,38 +12,13 @@
namespace VideoCommon::GPUThread { namespace VideoCommon::GPUThread {
/// Executes a single GPU thread command
static void ExecuteCommand(CommandData* command, VideoCore::RendererBase& renderer,
Tegra::DmaPusher& dma_pusher) {
if (const auto submit_list = std::get_if<SubmitListCommand>(command)) {
dma_pusher.Push(std::move(submit_list->entries));
dma_pusher.DispatchCalls();
} else if (const auto data = std::get_if<SwapBuffersCommand>(command)) {
renderer.SwapBuffers(data->framebuffer);
} else if (const auto data = std::get_if<FlushRegionCommand>(command)) {
renderer.Rasterizer().FlushRegion(data->addr, data->size);
} else if (const auto data = std::get_if<InvalidateRegionCommand>(command)) {
renderer.Rasterizer().InvalidateRegion(data->addr, data->size);
} else if (const auto data = std::get_if<FlushAndInvalidateRegionCommand>(command)) {
renderer.Rasterizer().FlushAndInvalidateRegion(data->addr, data->size);
} else {
UNREACHABLE();
}
}
/// Runs the GPU thread /// Runs the GPU thread
static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher, static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher,
SynchState& state) { SynchState& state) {
MicroProfileOnThreadCreate("GpuThread"); MicroProfileOnThreadCreate("GpuThread");
auto WaitForWakeup = [&]() {
std::unique_lock<std::mutex> lock{state.signal_mutex};
state.signal_condition.wait(lock, [&] { return !state.is_idle || !state.is_running; });
};
// Wait for first GPU command before acquiring the window context // Wait for first GPU command before acquiring the window context
WaitForWakeup(); state.WaitForCommands();
// If emulation was stopped during disk shader loading, abort before trying to acquire context // If emulation was stopped during disk shader loading, abort before trying to acquire context
if (!state.is_running) { if (!state.is_running) {
@ -53,100 +27,72 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p
Core::Frontend::ScopeAcquireWindowContext acquire_context{renderer.GetRenderWindow()}; Core::Frontend::ScopeAcquireWindowContext acquire_context{renderer.GetRenderWindow()};
CommandDataContainer next;
while (state.is_running) { while (state.is_running) {
if (!state.is_running) { state.WaitForCommands();
return; while (!state.queue.Empty()) {
state.queue.Pop(next);
if (const auto submit_list = std::get_if<SubmitListCommand>(&next.data)) {
dma_pusher.Push(std::move(submit_list->entries));
dma_pusher.DispatchCalls();
} else if (const auto data = std::get_if<SwapBuffersCommand>(&next.data)) {
state.DecrementFramesCounter();
renderer.SwapBuffers(std::move(data->framebuffer));
} else if (const auto data = std::get_if<FlushRegionCommand>(&next.data)) {
renderer.Rasterizer().FlushRegion(data->addr, data->size);
} else if (const auto data = std::get_if<InvalidateRegionCommand>(&next.data)) {
renderer.Rasterizer().InvalidateRegion(data->addr, data->size);
} else if (const auto data = std::get_if<EndProcessingCommand>(&next.data)) {
return;
} else {
UNREACHABLE();
}
} }
{
// Thread has been woken up, so make the previous write queue the next read queue
std::lock_guard<std::mutex> lock{state.signal_mutex};
std::swap(state.push_queue, state.pop_queue);
}
// Execute all of the GPU commands
while (!state.pop_queue->empty()) {
ExecuteCommand(&state.pop_queue->front(), renderer, dma_pusher);
state.pop_queue->pop();
}
state.UpdateIdleState();
// Signal that the GPU thread has finished processing commands
if (state.is_idle) {
state.idle_condition.notify_one();
}
// Wait for CPU thread to send more GPU commands
WaitForWakeup();
} }
} }
ThreadManager::ThreadManager(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher) ThreadManager::ThreadManager(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher)
: renderer{renderer}, dma_pusher{dma_pusher}, thread{RunThread, std::ref(renderer), : renderer{renderer}, dma_pusher{dma_pusher}, thread{RunThread, std::ref(renderer),
std::ref(dma_pusher), std::ref(state)}, std::ref(dma_pusher), std::ref(state)} {}
thread_id{thread.get_id()} {}
ThreadManager::~ThreadManager() { ThreadManager::~ThreadManager() {
{ // Notify GPU thread that a shutdown is pending
// Notify GPU thread that a shutdown is pending PushCommand(EndProcessingCommand());
std::lock_guard<std::mutex> lock{state.signal_mutex};
state.is_running = false;
}
state.signal_condition.notify_one();
thread.join(); thread.join();
} }
void ThreadManager::SubmitList(Tegra::CommandList&& entries) { void ThreadManager::SubmitList(Tegra::CommandList&& entries) {
if (entries.empty()) { PushCommand(SubmitListCommand(std::move(entries)));
return;
}
PushCommand(SubmitListCommand(std::move(entries)), false, false);
} }
void ThreadManager::SwapBuffers( void ThreadManager::SwapBuffers(
std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) { std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) {
PushCommand(SwapBuffersCommand(std::move(framebuffer)), true, false); state.IncrementFramesCounter();
PushCommand(SwapBuffersCommand(std::move(framebuffer)));
state.WaitForFrames();
} }
void ThreadManager::FlushRegion(VAddr addr, u64 size) { void ThreadManager::FlushRegion(CacheAddr addr, u64 size) {
// Block the CPU when using accurate emulation PushCommand(FlushRegionCommand(addr, size));
PushCommand(FlushRegionCommand(addr, size), Settings::values.use_accurate_gpu_emulation, false);
} }
void ThreadManager::InvalidateRegion(VAddr addr, u64 size) { void ThreadManager::InvalidateRegion(CacheAddr addr, u64 size) {
PushCommand(InvalidateRegionCommand(addr, size), true, true); if (state.queue.Empty()) {
// It's quicker to invalidate a single region on the CPU if the queue is already empty
renderer.Rasterizer().InvalidateRegion(addr, size);
} else {
PushCommand(InvalidateRegionCommand(addr, size));
}
} }
void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) { void ThreadManager::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
// Skip flush on asynch mode, as FlushAndInvalidateRegion is not used for anything too important
InvalidateRegion(addr, size); InvalidateRegion(addr, size);
} }
void ThreadManager::PushCommand(CommandData&& command_data, bool wait_for_idle, bool allow_on_cpu) { void ThreadManager::PushCommand(CommandData&& command_data) {
{ state.queue.Push(CommandDataContainer(std::move(command_data)));
std::lock_guard<std::mutex> lock{state.signal_mutex}; state.SignalCommands();
if ((allow_on_cpu && state.is_idle) || IsGpuThread()) {
// Execute the command synchronously on the current thread
ExecuteCommand(&command_data, renderer, dma_pusher);
return;
}
// Push the command to the GPU thread
state.UpdateIdleState();
state.push_queue->emplace(command_data);
}
// Signal the GPU thread that commands are pending
state.signal_condition.notify_one();
if (wait_for_idle) {
// Wait for the GPU to be idle (all commands to be executed)
std::unique_lock<std::mutex> lock{state.idle_mutex};
state.idle_condition.wait(lock, [this] { return static_cast<bool>(state.is_idle); });
}
} }
} // namespace VideoCommon::GPUThread } // namespace VideoCommon::GPUThread

@ -13,6 +13,9 @@
#include <thread> #include <thread>
#include <variant> #include <variant>
#include "common/threadsafe_queue.h"
#include "video_core/gpu.h"
namespace Tegra { namespace Tegra {
struct FramebufferConfig; struct FramebufferConfig;
class DmaPusher; class DmaPusher;
@ -24,6 +27,9 @@ class RendererBase;
namespace VideoCommon::GPUThread { namespace VideoCommon::GPUThread {
/// Command to signal to the GPU thread that processing has ended
struct EndProcessingCommand final {};
/// Command to signal to the GPU thread that a command list is ready for processing /// Command to signal to the GPU thread that a command list is ready for processing
struct SubmitListCommand final { struct SubmitListCommand final {
explicit SubmitListCommand(Tegra::CommandList&& entries) : entries{std::move(entries)} {} explicit SubmitListCommand(Tegra::CommandList&& entries) : entries{std::move(entries)} {}
@ -36,59 +42,110 @@ struct SwapBuffersCommand final {
explicit SwapBuffersCommand(std::optional<const Tegra::FramebufferConfig> framebuffer) explicit SwapBuffersCommand(std::optional<const Tegra::FramebufferConfig> framebuffer)
: framebuffer{std::move(framebuffer)} {} : framebuffer{std::move(framebuffer)} {}
std::optional<const Tegra::FramebufferConfig> framebuffer; std::optional<Tegra::FramebufferConfig> framebuffer;
}; };
/// Command to signal to the GPU thread to flush a region /// Command to signal to the GPU thread to flush a region
struct FlushRegionCommand final { struct FlushRegionCommand final {
explicit constexpr FlushRegionCommand(VAddr addr, u64 size) : addr{addr}, size{size} {} explicit constexpr FlushRegionCommand(CacheAddr addr, u64 size) : addr{addr}, size{size} {}
const VAddr addr; CacheAddr addr;
const u64 size; u64 size;
}; };
/// Command to signal to the GPU thread to invalidate a region /// Command to signal to the GPU thread to invalidate a region
struct InvalidateRegionCommand final { struct InvalidateRegionCommand final {
explicit constexpr InvalidateRegionCommand(VAddr addr, u64 size) : addr{addr}, size{size} {} explicit constexpr InvalidateRegionCommand(CacheAddr addr, u64 size) : addr{addr}, size{size} {}
const VAddr addr; CacheAddr addr;
const u64 size; u64 size;
}; };
/// Command to signal to the GPU thread to flush and invalidate a region /// Command to signal to the GPU thread to flush and invalidate a region
struct FlushAndInvalidateRegionCommand final { struct FlushAndInvalidateRegionCommand final {
explicit constexpr FlushAndInvalidateRegionCommand(VAddr addr, u64 size) explicit constexpr FlushAndInvalidateRegionCommand(CacheAddr addr, u64 size)
: addr{addr}, size{size} {} : addr{addr}, size{size} {}
const VAddr addr; CacheAddr addr;
const u64 size; u64 size;
}; };
using CommandData = std::variant<SubmitListCommand, SwapBuffersCommand, FlushRegionCommand, using CommandData =
InvalidateRegionCommand, FlushAndInvalidateRegionCommand>; std::variant<EndProcessingCommand, SubmitListCommand, SwapBuffersCommand, FlushRegionCommand,
InvalidateRegionCommand, FlushAndInvalidateRegionCommand>;
/// Wrapper holding a single CommandData value.
///
/// The previous hand-written copy-assignment applied std::move to a const reference, which
/// can only ever select the copy path, and declaring it suppressed the implicit move
/// operations. As a result, assigning or constructing from an rvalue container copied the
/// command payload. All copy/move members are now defaulted: lvalues are still copied exactly
/// as before, while rvalues are genuinely moved.
struct CommandDataContainer {
    CommandDataContainer() = default;

    /// Takes ownership of the given command (intentionally implicit, as before).
    CommandDataContainer(CommandData&& data) : data{std::move(data)} {}

    CommandDataContainer(const CommandDataContainer&) = default;
    CommandDataContainer(CommandDataContainer&&) = default;
    CommandDataContainer& operator=(const CommandDataContainer&) = default;
    CommandDataContainer& operator=(CommandDataContainer&&) = default;

    CommandData data;
};
/// Struct used to synchronize the GPU thread /// Struct used to synchronize the GPU thread
struct SynchState final { struct SynchState final {
std::atomic<bool> is_running{true}; std::atomic_bool is_running{true};
std::atomic<bool> is_idle{true}; std::atomic_int queued_frame_count{};
std::condition_variable signal_condition; std::mutex frames_mutex;
std::mutex signal_mutex; std::mutex commands_mutex;
std::condition_variable idle_condition; std::condition_variable commands_condition;
std::mutex idle_mutex; std::condition_variable frames_condition;
// We use two queues for sending commands to the GPU thread, one for writing (push_queue) to and void IncrementFramesCounter() {
// one for reading from (pop_queue). These are swapped whenever the current pop_queue becomes std::lock_guard<std::mutex> lock{frames_mutex};
// empty. This allows for efficient thread-safe access, as it does not require any copies. ++queued_frame_count;
using CommandQueue = std::queue<CommandData>;
std::array<CommandQueue, 2> command_queues;
CommandQueue* push_queue{&command_queues[0]};
CommandQueue* pop_queue{&command_queues[1]};
void UpdateIdleState() {
std::lock_guard<std::mutex> lock{idle_mutex};
is_idle = command_queues[0].empty() && command_queues[1].empty();
} }
/// Decrements queued_frame_count under frames_mutex and, once the count has dropped to zero,
/// notifies one waiter on frames_condition (after the mutex has been released).
void DecrementFramesCounter() {
    bool frames_remaining;
    {
        std::lock_guard<std::mutex> lock{frames_mutex};
        --queued_frame_count;
        frames_remaining = queued_frame_count != 0;
    }

    if (!frames_remaining) {
        frames_condition.notify_one();
    }
}
/// Blocks the calling thread until queued_frame_count is zero.
/// Returns immediately if no frames are queued.
void WaitForFrames() {
    // condition_variable::wait(lock, pred) evaluates the predicate before blocking, so the
    // "nothing queued" fast path needs no separate locked pre-check; frames_mutex is taken
    // once instead of twice.
    std::unique_lock<std::mutex> lock{frames_mutex};
    frames_condition.wait(lock, [this] { return !queued_frame_count; });
}
void SignalCommands() {
{
std::unique_lock<std::mutex> lock{commands_mutex};
if (queue.Empty()) {
return;
}
}
commands_condition.notify_one();
}
/// Blocks the calling thread on commands_condition until the queue is non-empty.
void WaitForCommands() {
    std::unique_lock<std::mutex> lock{commands_mutex};
    // Re-check after every wakeup so spurious wakeups do not return with an empty queue.
    while (queue.Empty()) {
        commands_condition.wait(lock);
    }
}
using CommandQueue = Common::SPSCQueue<CommandDataContainer>;
CommandQueue queue;
}; };
/// Class used to manage the GPU thread /// Class used to manage the GPU thread
@ -105,22 +162,17 @@ public:
std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer); std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer);
/// Notify rasterizer that any caches of the specified region should be flushed to Switch memory /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
void FlushRegion(VAddr addr, u64 size); void FlushRegion(CacheAddr addr, u64 size);
/// Notify rasterizer that any caches of the specified region should be invalidated /// Notify rasterizer that any caches of the specified region should be invalidated
void InvalidateRegion(VAddr addr, u64 size); void InvalidateRegion(CacheAddr addr, u64 size);
/// Notify rasterizer that any caches of the specified region should be flushed and invalidated /// Notify rasterizer that any caches of the specified region should be flushed and invalidated
void FlushAndInvalidateRegion(VAddr addr, u64 size); void FlushAndInvalidateRegion(CacheAddr addr, u64 size);
private: private:
/// Pushes a command to be executed by the GPU thread /// Pushes a command to be executed by the GPU thread
void PushCommand(CommandData&& command_data, bool wait_for_idle, bool allow_on_cpu); void PushCommand(CommandData&& command_data);
/// Returns true if this is called by the GPU thread
bool IsGpuThread() const {
return std::this_thread::get_id() == thread_id;
}
private: private:
SynchState state; SynchState state;

@ -4,6 +4,7 @@
#pragma once #pragma once
#include <mutex>
#include <set> #include <set>
#include <unordered_map> #include <unordered_map>
@ -12,14 +13,26 @@
#include "common/common_types.h" #include "common/common_types.h"
#include "core/settings.h" #include "core/settings.h"
#include "video_core/gpu.h"
#include "video_core/rasterizer_interface.h" #include "video_core/rasterizer_interface.h"
class RasterizerCacheObject { class RasterizerCacheObject {
public: public:
/// Creates a cache object backed by the host memory at host_ptr and records the matching
/// cache address.
/// @param host_ptr Pointer to the host memory backing this cached region.
///
/// The initializers are listed in member declaration order (cache_addr, then host_ptr) so the
/// written order matches the order in which the members are actually initialized. Both
/// initializers read the constructor parameter, not the member of the same name.
explicit RasterizerCacheObject(const u8* host_ptr)
    : cache_addr{ToCacheAddr(host_ptr)}, host_ptr{host_ptr} {}
virtual ~RasterizerCacheObject(); virtual ~RasterizerCacheObject();
/// Returns the cache address stored for this object (set from the host pointer at construction)
CacheAddr GetCacheAddr() const {
return cache_addr;
}
/// Returns the host pointer this object was constructed with
const u8* GetHostPtr() const {
return host_ptr;
}
/// Gets the address of the shader in guest memory, required for cache management /// Gets the address of the shader in guest memory, required for cache management
virtual VAddr GetAddr() const = 0; virtual VAddr GetCpuAddr() const = 0;
/// Gets the size of the shader in guest memory, required for cache management /// Gets the size of the shader in guest memory, required for cache management
virtual std::size_t GetSizeInBytes() const = 0; virtual std::size_t GetSizeInBytes() const = 0;
@ -58,6 +71,8 @@ private:
bool is_registered{}; ///< Whether the object is currently registered with the cache bool is_registered{}; ///< Whether the object is currently registered with the cache
bool is_dirty{}; ///< Whether the object is dirty (out of sync with guest memory) bool is_dirty{}; ///< Whether the object is dirty (out of sync with guest memory)
u64 last_modified_ticks{}; ///< When the object was last modified, used for in-order flushing u64 last_modified_ticks{}; ///< When the object was last modified, used for in-order flushing
CacheAddr cache_addr{}; ///< Cache address memory, unique from emulated virtual address space
const u8* host_ptr{}; ///< Pointer to the memory backing this cached region
}; };
template <class T> template <class T>
@ -68,7 +83,9 @@ public:
explicit RasterizerCache(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {} explicit RasterizerCache(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {}
/// Write any cached resources overlapping the specified region back to memory /// Write any cached resources overlapping the specified region back to memory
void FlushRegion(Tegra::GPUVAddr addr, size_t size) { void FlushRegion(CacheAddr addr, std::size_t size) {
std::lock_guard<std::recursive_mutex> lock{mutex};
const auto& objects{GetSortedObjectsFromRegion(addr, size)}; const auto& objects{GetSortedObjectsFromRegion(addr, size)};
for (auto& object : objects) { for (auto& object : objects) {
FlushObject(object); FlushObject(object);
@ -76,7 +93,9 @@ public:
} }
/// Mark the specified region as being invalidated /// Mark the specified region as being invalidated
void InvalidateRegion(VAddr addr, u64 size) { void InvalidateRegion(CacheAddr addr, u64 size) {
std::lock_guard<std::recursive_mutex> lock{mutex};
const auto& objects{GetSortedObjectsFromRegion(addr, size)}; const auto& objects{GetSortedObjectsFromRegion(addr, size)};
for (auto& object : objects) { for (auto& object : objects) {
if (!object->IsRegistered()) { if (!object->IsRegistered()) {
@ -89,48 +108,60 @@ public:
/// Invalidates everything in the cache /// Invalidates everything in the cache
void InvalidateAll() { void InvalidateAll() {
std::lock_guard<std::recursive_mutex> lock{mutex};
while (interval_cache.begin() != interval_cache.end()) { while (interval_cache.begin() != interval_cache.end()) {
Unregister(*interval_cache.begin()->second.begin()); Unregister(*interval_cache.begin()->second.begin());
} }
} }
protected: protected:
/// Tries to get an object from the cache with the specified address /// Tries to get an object from the cache with the specified cache address
T TryGet(VAddr addr) const { T TryGet(CacheAddr addr) const {
const auto iter = map_cache.find(addr); const auto iter = map_cache.find(addr);
if (iter != map_cache.end()) if (iter != map_cache.end())
return iter->second; return iter->second;
return nullptr; return nullptr;
} }
/// Tries to get an object from the cache using a host pointer as the lookup key.
/// The pointer is converted with ToCacheAddr before the lookup in map_cache.
/// @returns The cached object, or nullptr if nothing is stored under that address.
T TryGet(const void* addr) const {
    const auto found = map_cache.find(ToCacheAddr(addr));
    if (found == map_cache.end()) {
        return nullptr;
    }
    return found->second;
}
/// Register an object into the cache /// Register an object into the cache
void Register(const T& object) { void Register(const T& object) {
std::lock_guard<std::recursive_mutex> lock{mutex};
object->SetIsRegistered(true); object->SetIsRegistered(true);
interval_cache.add({GetInterval(object), ObjectSet{object}}); interval_cache.add({GetInterval(object), ObjectSet{object}});
map_cache.insert({object->GetAddr(), object}); map_cache.insert({object->GetCacheAddr(), object});
rasterizer.UpdatePagesCachedCount(object->GetAddr(), object->GetSizeInBytes(), 1); rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), 1);
} }
/// Unregisters an object from the cache /// Unregisters an object from the cache
void Unregister(const T& object) { void Unregister(const T& object) {
object->SetIsRegistered(false); std::lock_guard<std::recursive_mutex> lock{mutex};
rasterizer.UpdatePagesCachedCount(object->GetAddr(), object->GetSizeInBytes(), -1);
// Only flush if use_accurate_gpu_emulation is enabled, as it incurs a performance hit
if (Settings::values.use_accurate_gpu_emulation) {
FlushObject(object);
}
object->SetIsRegistered(false);
rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), -1);
interval_cache.subtract({GetInterval(object), ObjectSet{object}}); interval_cache.subtract({GetInterval(object), ObjectSet{object}});
map_cache.erase(object->GetAddr()); map_cache.erase(object->GetCacheAddr());
} }
/// Returns a ticks counter used for tracking when cached objects were last modified /// Returns a ticks counter used for tracking when cached objects were last modified
u64 GetModifiedTicks() { u64 GetModifiedTicks() {
std::lock_guard<std::recursive_mutex> lock{mutex};
return ++modified_ticks; return ++modified_ticks;
} }
/// Flushes the specified object, updating appropriate cache state as needed /// Flushes the specified object, updating appropriate cache state as needed
void FlushObject(const T& object) { void FlushObject(const T& object) {
std::lock_guard<std::recursive_mutex> lock{mutex};
if (!object->IsDirty()) { if (!object->IsDirty()) {
return; return;
} }
@ -140,7 +171,7 @@ protected:
private: private:
/// Returns a list of cached objects from the specified memory region, ordered by access time /// Returns a list of cached objects from the specified memory region, ordered by access time
std::vector<T> GetSortedObjectsFromRegion(VAddr addr, u64 size) { std::vector<T> GetSortedObjectsFromRegion(CacheAddr addr, u64 size) {
if (size == 0) { if (size == 0) {
return {}; return {};
} }
@ -164,17 +195,18 @@ private:
} }
using ObjectSet = std::set<T>; using ObjectSet = std::set<T>;
using ObjectCache = std::unordered_map<VAddr, T>; using ObjectCache = std::unordered_map<CacheAddr, T>;
using IntervalCache = boost::icl::interval_map<VAddr, ObjectSet>; using IntervalCache = boost::icl::interval_map<CacheAddr, ObjectSet>;
using ObjectInterval = typename IntervalCache::interval_type; using ObjectInterval = typename IntervalCache::interval_type;
static auto GetInterval(const T& object) { static auto GetInterval(const T& object) {
return ObjectInterval::right_open(object->GetAddr(), return ObjectInterval::right_open(object->GetCacheAddr(),
object->GetAddr() + object->GetSizeInBytes()); object->GetCacheAddr() + object->GetSizeInBytes());
} }
ObjectCache map_cache; ObjectCache map_cache;
IntervalCache interval_cache; ///< Cache of objects IntervalCache interval_cache; ///< Cache of objects
u64 modified_ticks{}; ///< Counter of cache state ticks, used for in-order flushing u64 modified_ticks{}; ///< Counter of cache state ticks, used for in-order flushing
VideoCore::RasterizerInterface& rasterizer; VideoCore::RasterizerInterface& rasterizer;
std::recursive_mutex mutex;
}; };

@ -35,14 +35,14 @@ public:
virtual void FlushAll() = 0; virtual void FlushAll() = 0;
/// Notify rasterizer that any caches of the specified region should be flushed to Switch memory /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
virtual void FlushRegion(VAddr addr, u64 size) = 0; virtual void FlushRegion(CacheAddr addr, u64 size) = 0;
/// Notify rasterizer that any caches of the specified region should be invalidated /// Notify rasterizer that any caches of the specified region should be invalidated
virtual void InvalidateRegion(VAddr addr, u64 size) = 0; virtual void InvalidateRegion(CacheAddr addr, u64 size) = 0;
/// Notify rasterizer that any caches of the specified region should be flushed to Switch memory /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
/// and invalidated /// and invalidated
virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0; virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0;
/// Attempt to use a faster method to perform a surface copy /// Attempt to use a faster method to perform a surface copy
virtual bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, virtual bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
@ -63,7 +63,7 @@ public:
} }
/// Increase/decrease the number of objects in pages touching the specified region /// Increase/decrease the number of objects in pages touching the specified region
virtual void UpdatePagesCachedCount(Tegra::GPUVAddr addr, u64 size, int delta) {} virtual void UpdatePagesCachedCount(VAddr addr, u64 size, int delta) {}
/// Initialize disk cached resources for the game being emulated /// Initialize disk cached resources for the game being emulated
virtual void LoadDiskResources(const std::atomic_bool& stop_loading = false, virtual void LoadDiskResources(const std::atomic_bool& stop_loading = false,

@ -13,6 +13,11 @@
namespace OpenGL { namespace OpenGL {
CachedBufferEntry::CachedBufferEntry(VAddr cpu_addr, std::size_t size, GLintptr offset,
std::size_t alignment, u8* host_ptr)
: cpu_addr{cpu_addr}, size{size}, offset{offset}, alignment{alignment}, RasterizerCacheObject{
host_ptr} {}
OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, std::size_t size) OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, std::size_t size)
: RasterizerCache{rasterizer}, stream_buffer(size, true) {} : RasterizerCache{rasterizer}, stream_buffer(size, true) {}
@ -26,11 +31,12 @@ GLintptr OGLBufferCache::UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size
// TODO: Figure out which size is the best for given games. // TODO: Figure out which size is the best for given games.
cache &= size >= 2048; cache &= size >= 2048;
const auto& host_ptr{Memory::GetPointer(*cpu_addr)};
if (cache) { if (cache) {
auto entry = TryGet(*cpu_addr); auto entry = TryGet(host_ptr);
if (entry) { if (entry) {
if (entry->size >= size && entry->alignment == alignment) { if (entry->GetSize() >= size && entry->GetAlignment() == alignment) {
return entry->offset; return entry->GetOffset();
} }
Unregister(entry); Unregister(entry);
} }
@ -39,17 +45,17 @@ GLintptr OGLBufferCache::UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size
AlignBuffer(alignment); AlignBuffer(alignment);
const GLintptr uploaded_offset = buffer_offset; const GLintptr uploaded_offset = buffer_offset;
Memory::ReadBlock(*cpu_addr, buffer_ptr, size); if (!host_ptr) {
return uploaded_offset;
}
std::memcpy(buffer_ptr, host_ptr, size);
buffer_ptr += size; buffer_ptr += size;
buffer_offset += size; buffer_offset += size;
if (cache) { if (cache) {
auto entry = std::make_shared<CachedBufferEntry>(); auto entry = std::make_shared<CachedBufferEntry>(*cpu_addr, size, uploaded_offset,
entry->offset = uploaded_offset; alignment, host_ptr);
entry->size = size;
entry->alignment = alignment;
entry->addr = *cpu_addr;
Register(entry); Register(entry);
} }

@ -17,22 +17,39 @@ namespace OpenGL {
class RasterizerOpenGL; class RasterizerOpenGL;
struct CachedBufferEntry final : public RasterizerCacheObject { class CachedBufferEntry final : public RasterizerCacheObject {
VAddr GetAddr() const override { public:
return addr; explicit CachedBufferEntry(VAddr cpu_addr, std::size_t size, GLintptr offset,
std::size_t alignment, u8* host_ptr);
VAddr GetCpuAddr() const override {
return cpu_addr;
} }
std::size_t GetSizeInBytes() const override { std::size_t GetSizeInBytes() const override {
return size; return size;
} }
std::size_t GetSize() const {
return size;
}
GLintptr GetOffset() const {
return offset;
}
std::size_t GetAlignment() const {
return alignment;
}
// We do not have to flush this cache as things in it are never modified by us. // We do not have to flush this cache as things in it are never modified by us.
void Flush() override {} void Flush() override {}
VAddr addr; private:
std::size_t size; VAddr cpu_addr{};
GLintptr offset; std::size_t size{};
std::size_t alignment; GLintptr offset{};
std::size_t alignment{};
}; };
class OGLBufferCache final : public RasterizerCache<std::shared_ptr<CachedBufferEntry>> { class OGLBufferCache final : public RasterizerCache<std::shared_ptr<CachedBufferEntry>> {

@ -15,12 +15,13 @@
namespace OpenGL { namespace OpenGL {
CachedGlobalRegion::CachedGlobalRegion(VAddr addr, u32 size) : addr{addr}, size{size} { CachedGlobalRegion::CachedGlobalRegion(VAddr cpu_addr, u32 size, u8* host_ptr)
: cpu_addr{cpu_addr}, size{size}, RasterizerCacheObject{host_ptr} {
buffer.Create(); buffer.Create();
// Bind and unbind the buffer so it gets allocated by the driver // Bind and unbind the buffer so it gets allocated by the driver
glBindBuffer(GL_SHADER_STORAGE_BUFFER, buffer.handle); glBindBuffer(GL_SHADER_STORAGE_BUFFER, buffer.handle);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
LabelGLObject(GL_BUFFER, buffer.handle, addr, "GlobalMemory"); LabelGLObject(GL_BUFFER, buffer.handle, cpu_addr, "GlobalMemory");
} }
void CachedGlobalRegion::Reload(u32 size_) { void CachedGlobalRegion::Reload(u32 size_) {
@ -35,7 +36,7 @@ void CachedGlobalRegion::Reload(u32 size_) {
// TODO(Rodrigo): Get rid of Memory::GetPointer with a staging buffer // TODO(Rodrigo): Get rid of Memory::GetPointer with a staging buffer
glBindBuffer(GL_SHADER_STORAGE_BUFFER, buffer.handle); glBindBuffer(GL_SHADER_STORAGE_BUFFER, buffer.handle);
glBufferData(GL_SHADER_STORAGE_BUFFER, size, Memory::GetPointer(addr), GL_DYNAMIC_DRAW); glBufferData(GL_SHADER_STORAGE_BUFFER, size, GetHostPtr(), GL_DYNAMIC_DRAW);
} }
GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(VAddr addr, u32 size) const { GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(VAddr addr, u32 size) const {
@ -46,11 +47,11 @@ GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(VAddr addr, u32
return search->second; return search->second;
} }
GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(VAddr addr, u32 size) { GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(VAddr addr, u32 size, u8* host_ptr) {
GlobalRegion region{TryGetReservedGlobalRegion(addr, size)}; GlobalRegion region{TryGetReservedGlobalRegion(addr, size)};
if (!region) { if (!region) {
// No reserved surface available, create a new one and reserve it // No reserved surface available, create a new one and reserve it
region = std::make_shared<CachedGlobalRegion>(addr, size); region = std::make_shared<CachedGlobalRegion>(addr, size, host_ptr);
ReserveGlobalRegion(region); ReserveGlobalRegion(region);
} }
region->Reload(size); region->Reload(size);
@ -58,7 +59,7 @@ GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(VAddr addr, u32 si
} }
void GlobalRegionCacheOpenGL::ReserveGlobalRegion(GlobalRegion region) { void GlobalRegionCacheOpenGL::ReserveGlobalRegion(GlobalRegion region) {
reserve.insert_or_assign(region->GetAddr(), std::move(region)); reserve.insert_or_assign(region->GetCpuAddr(), std::move(region));
} }
GlobalRegionCacheOpenGL::GlobalRegionCacheOpenGL(RasterizerOpenGL& rasterizer) GlobalRegionCacheOpenGL::GlobalRegionCacheOpenGL(RasterizerOpenGL& rasterizer)
@ -80,11 +81,12 @@ GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion(
ASSERT(actual_addr); ASSERT(actual_addr);
// Look up global region in the cache based on address // Look up global region in the cache based on address
GlobalRegion region = TryGet(*actual_addr); const auto& host_ptr{Memory::GetPointer(*actual_addr)};
GlobalRegion region{TryGet(host_ptr)};
if (!region) { if (!region) {
// No global region found - create a new one // No global region found - create a new one
region = GetUncachedGlobalRegion(*actual_addr, size); region = GetUncachedGlobalRegion(*actual_addr, size, host_ptr);
Register(region); Register(region);
} }

@ -27,14 +27,12 @@ using GlobalRegion = std::shared_ptr<CachedGlobalRegion>;
class CachedGlobalRegion final : public RasterizerCacheObject { class CachedGlobalRegion final : public RasterizerCacheObject {
public: public:
explicit CachedGlobalRegion(VAddr addr, u32 size); explicit CachedGlobalRegion(VAddr cpu_addr, u32 size, u8* host_ptr);
/// Gets the address of the shader in guest memory, required for cache management VAddr GetCpuAddr() const override {
VAddr GetAddr() const override { return cpu_addr;
return addr;
} }
/// Gets the size of the shader in guest memory, required for cache management
std::size_t GetSizeInBytes() const override { std::size_t GetSizeInBytes() const override {
return size; return size;
} }
@ -53,9 +51,8 @@ public:
} }
private: private:
VAddr addr{}; VAddr cpu_addr{};
u32 size{}; u32 size{};
OGLBuffer buffer; OGLBuffer buffer;
}; };
@ -69,7 +66,7 @@ public:
private: private:
GlobalRegion TryGetReservedGlobalRegion(VAddr addr, u32 size) const; GlobalRegion TryGetReservedGlobalRegion(VAddr addr, u32 size) const;
GlobalRegion GetUncachedGlobalRegion(VAddr addr, u32 size); GlobalRegion GetUncachedGlobalRegion(VAddr addr, u32 size, u8* host_ptr);
void ReserveGlobalRegion(GlobalRegion region); void ReserveGlobalRegion(GlobalRegion region);
std::unordered_map<VAddr, GlobalRegion> reserve; std::unordered_map<VAddr, GlobalRegion> reserve;

@ -449,7 +449,7 @@ static constexpr auto RangeFromInterval(Map& map, const Interval& interval) {
return boost::make_iterator_range(map.equal_range(interval)); return boost::make_iterator_range(map.equal_range(interval));
} }
void RasterizerOpenGL::UpdatePagesCachedCount(Tegra::GPUVAddr addr, u64 size, int delta) { void RasterizerOpenGL::UpdatePagesCachedCount(VAddr addr, u64 size, int delta) {
const u64 page_start{addr >> Memory::PAGE_BITS}; const u64 page_start{addr >> Memory::PAGE_BITS};
const u64 page_end{(addr + size + Memory::PAGE_SIZE - 1) >> Memory::PAGE_BITS}; const u64 page_end{(addr + size + Memory::PAGE_SIZE - 1) >> Memory::PAGE_BITS};
@ -747,12 +747,12 @@ void RasterizerOpenGL::DrawArrays() {
void RasterizerOpenGL::FlushAll() {} void RasterizerOpenGL::FlushAll() {}
void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) { void RasterizerOpenGL::FlushRegion(CacheAddr addr, u64 size) {
MICROPROFILE_SCOPE(OpenGL_CacheManagement); MICROPROFILE_SCOPE(OpenGL_CacheManagement);
res_cache.FlushRegion(addr, size); res_cache.FlushRegion(addr, size);
} }
void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) { void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) {
MICROPROFILE_SCOPE(OpenGL_CacheManagement); MICROPROFILE_SCOPE(OpenGL_CacheManagement);
res_cache.InvalidateRegion(addr, size); res_cache.InvalidateRegion(addr, size);
shader_cache.InvalidateRegion(addr, size); shader_cache.InvalidateRegion(addr, size);
@ -760,7 +760,7 @@ void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) {
buffer_cache.InvalidateRegion(addr, size); buffer_cache.InvalidateRegion(addr, size);
} }
void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) { void RasterizerOpenGL::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
FlushRegion(addr, size); FlushRegion(addr, size);
InvalidateRegion(addr, size); InvalidateRegion(addr, size);
} }
@ -782,7 +782,7 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
MICROPROFILE_SCOPE(OpenGL_CacheManagement); MICROPROFILE_SCOPE(OpenGL_CacheManagement);
const auto& surface{res_cache.TryFindFramebufferSurface(framebuffer_addr)}; const auto& surface{res_cache.TryFindFramebufferSurface(Memory::GetPointer(framebuffer_addr))};
if (!surface) { if (!surface) {
return {}; return {};
} }

@ -57,9 +57,9 @@ public:
void DrawArrays() override; void DrawArrays() override;
void Clear() override; void Clear() override;
void FlushAll() override; void FlushAll() override;
void FlushRegion(VAddr addr, u64 size) override; void FlushRegion(CacheAddr addr, u64 size) override;
void InvalidateRegion(VAddr addr, u64 size) override; void InvalidateRegion(CacheAddr addr, u64 size) override;
void FlushAndInvalidateRegion(VAddr addr, u64 size) override; void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
const Tegra::Engines::Fermi2D::Regs::Surface& dst, const Tegra::Engines::Fermi2D::Regs::Surface& dst,
const Common::Rectangle<u32>& src_rect, const Common::Rectangle<u32>& src_rect,
@ -67,7 +67,7 @@ public:
bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
u32 pixel_stride) override; u32 pixel_stride) override;
bool AccelerateDrawBatch(bool is_indexed) override; bool AccelerateDrawBatch(bool is_indexed) override;
void UpdatePagesCachedCount(Tegra::GPUVAddr addr, u64 size, int delta) override; void UpdatePagesCachedCount(VAddr addr, u64 size, int delta) override;
void LoadDiskResources(const std::atomic_bool& stop_loading, void LoadDiskResources(const std::atomic_bool& stop_loading,
const VideoCore::DiskResourceLoadCallback& callback) override; const VideoCore::DiskResourceLoadCallback& callback) override;

@ -61,6 +61,7 @@ void SurfaceParams::InitCacheParameters(Tegra::GPUVAddr gpu_addr_) {
addr = cpu_addr ? *cpu_addr : 0; addr = cpu_addr ? *cpu_addr : 0;
gpu_addr = gpu_addr_; gpu_addr = gpu_addr_;
host_ptr = Memory::GetPointer(addr);
size_in_bytes = SizeInBytesRaw(); size_in_bytes = SizeInBytesRaw();
if (IsPixelFormatASTC(pixel_format)) { if (IsPixelFormatASTC(pixel_format)) {
@ -563,8 +564,8 @@ void RasterizerCacheOpenGL::CopySurface(const Surface& src_surface, const Surfac
} }
CachedSurface::CachedSurface(const SurfaceParams& params) CachedSurface::CachedSurface(const SurfaceParams& params)
: params(params), gl_target(SurfaceTargetToGL(params.target)), : params{params}, gl_target{SurfaceTargetToGL(params.target)},
cached_size_in_bytes(params.size_in_bytes) { cached_size_in_bytes{params.size_in_bytes}, RasterizerCacheObject{params.host_ptr} {
texture.Create(gl_target); texture.Create(gl_target);
// TODO(Rodrigo): Using params.GetRect() returns a different size than using its Mip*(0) // TODO(Rodrigo): Using params.GetRect() returns a different size than using its Mip*(0)
@ -633,10 +634,9 @@ void CachedSurface::LoadGLBuffer() {
const u32 bpp = params.GetFormatBpp() / 8; const u32 bpp = params.GetFormatBpp() / 8;
const u32 copy_size = params.width * bpp; const u32 copy_size = params.width * bpp;
if (params.pitch == copy_size) { if (params.pitch == copy_size) {
std::memcpy(gl_buffer[0].data(), Memory::GetPointer(params.addr), std::memcpy(gl_buffer[0].data(), params.host_ptr, params.size_in_bytes_gl);
params.size_in_bytes_gl);
} else { } else {
const u8* start = Memory::GetPointer(params.addr); const u8* start{params.host_ptr};
u8* write_to = gl_buffer[0].data(); u8* write_to = gl_buffer[0].data();
for (u32 h = params.height; h > 0; h--) { for (u32 h = params.height; h > 0; h--) {
std::memcpy(write_to, start, copy_size); std::memcpy(write_to, start, copy_size);
@ -680,8 +680,6 @@ void CachedSurface::FlushGLBuffer() {
glPixelStorei(GL_PACK_ROW_LENGTH, 0); glPixelStorei(GL_PACK_ROW_LENGTH, 0);
Tegra::Texture::ConvertFromHostToGuest(gl_buffer[0].data(), params.pixel_format, params.width, Tegra::Texture::ConvertFromHostToGuest(gl_buffer[0].data(), params.pixel_format, params.width,
params.height, params.depth, true, true); params.height, params.depth, true, true);
const u8* const texture_src_data = Memory::GetPointer(params.addr);
ASSERT(texture_src_data);
if (params.is_tiled) { if (params.is_tiled) {
ASSERT_MSG(params.block_width == 1, "Block width is defined as {} on texture type {}", ASSERT_MSG(params.block_width == 1, "Block width is defined as {} on texture type {}",
params.block_width, static_cast<u32>(params.target)); params.block_width, static_cast<u32>(params.target));
@ -691,9 +689,9 @@ void CachedSurface::FlushGLBuffer() {
const u32 bpp = params.GetFormatBpp() / 8; const u32 bpp = params.GetFormatBpp() / 8;
const u32 copy_size = params.width * bpp; const u32 copy_size = params.width * bpp;
if (params.pitch == copy_size) { if (params.pitch == copy_size) {
std::memcpy(Memory::GetPointer(params.addr), gl_buffer[0].data(), GetSizeInBytes()); std::memcpy(params.host_ptr, gl_buffer[0].data(), GetSizeInBytes());
} else { } else {
u8* start = Memory::GetPointer(params.addr); u8* start{params.host_ptr};
const u8* read_to = gl_buffer[0].data(); const u8* read_to = gl_buffer[0].data();
for (u32 h = params.height; h > 0; h--) { for (u32 h = params.height; h > 0; h--) {
std::memcpy(start, read_to, copy_size); std::memcpy(start, read_to, copy_size);
@ -932,7 +930,7 @@ Surface RasterizerCacheOpenGL::GetSurface(const SurfaceParams& params, bool pres
} }
// Look up surface in the cache based on address // Look up surface in the cache based on address
Surface surface{TryGet(params.addr)}; Surface surface{TryGet(params.host_ptr)};
if (surface) { if (surface) {
if (surface->GetSurfaceParams().IsCompatibleSurface(params)) { if (surface->GetSurfaceParams().IsCompatibleSurface(params)) {
// Use the cached surface as-is unless it's not synced with memory // Use the cached surface as-is unless it's not synced with memory
@ -986,7 +984,7 @@ void RasterizerCacheOpenGL::FastLayeredCopySurface(const Surface& src_surface,
for (u32 layer = 0; layer < dst_params.depth; layer++) { for (u32 layer = 0; layer < dst_params.depth; layer++) {
for (u32 mipmap = 0; mipmap < dst_params.max_mip_level; mipmap++) { for (u32 mipmap = 0; mipmap < dst_params.max_mip_level; mipmap++) {
const VAddr sub_address = address + dst_params.GetMipmapLevelOffset(mipmap); const VAddr sub_address = address + dst_params.GetMipmapLevelOffset(mipmap);
const Surface& copy = TryGet(sub_address); const Surface& copy = TryGet(Memory::GetPointer(sub_address));
if (!copy) if (!copy)
continue; continue;
const auto& src_params{copy->GetSurfaceParams()}; const auto& src_params{copy->GetSurfaceParams()};
@ -1163,7 +1161,8 @@ void RasterizerCacheOpenGL::AccurateCopySurface(const Surface& src_surface,
const auto& dst_params{dst_surface->GetSurfaceParams()}; const auto& dst_params{dst_surface->GetSurfaceParams()};
// Flush enough memory for both the source and destination surface // Flush enough memory for both the source and destination surface
FlushRegion(src_params.addr, std::max(src_params.MemorySize(), dst_params.MemorySize())); FlushRegion(ToCacheAddr(src_params.host_ptr),
std::max(src_params.MemorySize(), dst_params.MemorySize()));
LoadSurface(dst_surface); LoadSurface(dst_surface);
} }
@ -1215,8 +1214,8 @@ Surface RasterizerCacheOpenGL::RecreateSurface(const Surface& old_surface,
return new_surface; return new_surface;
} }
Surface RasterizerCacheOpenGL::TryFindFramebufferSurface(VAddr addr) const { Surface RasterizerCacheOpenGL::TryFindFramebufferSurface(const u8* host_ptr) const {
return TryGet(addr); return TryGet(host_ptr);
} }
void RasterizerCacheOpenGL::ReserveSurface(const Surface& surface) { void RasterizerCacheOpenGL::ReserveSurface(const Surface& surface) {
@ -1267,7 +1266,7 @@ static bool LayerFitReinterpretSurface(RasterizerCacheOpenGL& cache, const Surfa
src_params.height == dst_params.MipHeight(*level) && src_params.height == dst_params.MipHeight(*level) &&
src_params.block_height >= dst_params.MipBlockHeight(*level)) { src_params.block_height >= dst_params.MipBlockHeight(*level)) {
const std::optional<u32> slot = const std::optional<u32> slot =
TryFindBestLayer(render_surface->GetAddr(), dst_params, *level); TryFindBestLayer(render_surface->GetCpuAddr(), dst_params, *level);
if (slot.has_value()) { if (slot.has_value()) {
glCopyImageSubData(render_surface->Texture().handle, glCopyImageSubData(render_surface->Texture().handle,
SurfaceTargetToGL(src_params.target), 0, 0, 0, 0, SurfaceTargetToGL(src_params.target), 0, 0, 0, 0,
@ -1283,8 +1282,8 @@ static bool LayerFitReinterpretSurface(RasterizerCacheOpenGL& cache, const Surfa
} }
static bool IsReinterpretInvalid(const Surface render_surface, const Surface blitted_surface) { static bool IsReinterpretInvalid(const Surface render_surface, const Surface blitted_surface) {
const VAddr bound1 = blitted_surface->GetAddr() + blitted_surface->GetMemorySize(); const VAddr bound1 = blitted_surface->GetCpuAddr() + blitted_surface->GetMemorySize();
const VAddr bound2 = render_surface->GetAddr() + render_surface->GetMemorySize(); const VAddr bound2 = render_surface->GetCpuAddr() + render_surface->GetMemorySize();
if (bound2 > bound1) if (bound2 > bound1)
return true; return true;
const auto& dst_params = blitted_surface->GetSurfaceParams(); const auto& dst_params = blitted_surface->GetSurfaceParams();
@ -1327,7 +1326,8 @@ void RasterizerCacheOpenGL::SignalPreDrawCall() {
void RasterizerCacheOpenGL::SignalPostDrawCall() { void RasterizerCacheOpenGL::SignalPostDrawCall() {
for (u32 i = 0; i < Maxwell::NumRenderTargets; i++) { for (u32 i = 0; i < Maxwell::NumRenderTargets; i++) {
if (current_color_buffers[i] != nullptr) { if (current_color_buffers[i] != nullptr) {
Surface intersect = CollideOnReinterpretedSurface(current_color_buffers[i]->GetAddr()); Surface intersect =
CollideOnReinterpretedSurface(current_color_buffers[i]->GetCacheAddr());
if (intersect != nullptr) { if (intersect != nullptr) {
PartialReinterpretSurface(current_color_buffers[i], intersect); PartialReinterpretSurface(current_color_buffers[i], intersect);
texception = true; texception = true;

@ -297,6 +297,7 @@ struct SurfaceParams {
bool srgb_conversion; bool srgb_conversion;
// Parameters used for caching // Parameters used for caching
VAddr addr; VAddr addr;
u8* host_ptr;
Tegra::GPUVAddr gpu_addr; Tegra::GPUVAddr gpu_addr;
std::size_t size_in_bytes; std::size_t size_in_bytes;
std::size_t size_in_bytes_gl; std::size_t size_in_bytes_gl;
@ -345,9 +346,9 @@ class RasterizerOpenGL;
class CachedSurface final : public RasterizerCacheObject { class CachedSurface final : public RasterizerCacheObject {
public: public:
CachedSurface(const SurfaceParams& params); explicit CachedSurface(const SurfaceParams& params);
VAddr GetAddr() const override { VAddr GetCpuAddr() const override {
return params.addr; return params.addr;
} }
@ -449,7 +450,7 @@ public:
Surface GetColorBufferSurface(std::size_t index, bool preserve_contents); Surface GetColorBufferSurface(std::size_t index, bool preserve_contents);
/// Tries to find a framebuffer using the provided CPU address /// Tries to find a framebuffer using the provided CPU address
Surface TryFindFramebufferSurface(VAddr addr) const; Surface TryFindFramebufferSurface(const u8* host_ptr) const;
/// Copies the contents of one surface to another /// Copies the contents of one surface to another
void FermiCopySurface(const Tegra::Engines::Fermi2D::Regs::Surface& src_config, void FermiCopySurface(const Tegra::Engines::Fermi2D::Regs::Surface& src_config,
@ -506,12 +507,12 @@ private:
std::array<Surface, Maxwell::NumRenderTargets> current_color_buffers; std::array<Surface, Maxwell::NumRenderTargets> current_color_buffers;
Surface last_depth_buffer; Surface last_depth_buffer;
using SurfaceIntervalCache = boost::icl::interval_map<VAddr, Surface>; using SurfaceIntervalCache = boost::icl::interval_map<CacheAddr, Surface>;
using SurfaceInterval = typename SurfaceIntervalCache::interval_type; using SurfaceInterval = typename SurfaceIntervalCache::interval_type;
static auto GetReinterpretInterval(const Surface& object) { static auto GetReinterpretInterval(const Surface& object) {
return SurfaceInterval::right_open(object->GetAddr() + 1, return SurfaceInterval::right_open(object->GetCacheAddr() + 1,
object->GetAddr() + object->GetMemorySize() - 1); object->GetCacheAddr() + object->GetMemorySize() - 1);
} }
// Reinterpreted surfaces are very fragile as the game may keep rendering into them. // Reinterpreted surfaces are very fragile as the game may keep rendering into them.
@ -523,7 +524,7 @@ private:
reinterpret_surface->MarkReinterpreted(); reinterpret_surface->MarkReinterpreted();
} }
Surface CollideOnReinterpretedSurface(VAddr addr) const { Surface CollideOnReinterpretedSurface(CacheAddr addr) const {
const SurfaceInterval interval{addr}; const SurfaceInterval interval{addr};
for (auto& pair : for (auto& pair :
boost::make_iterator_range(reinterpreted_surfaces.equal_range(interval))) { boost::make_iterator_range(reinterpreted_surfaces.equal_range(interval))) {

@ -42,9 +42,9 @@ VAddr GetShaderAddress(Maxwell::ShaderProgram program) {
} }
/// Gets the shader program code from memory for the specified address /// Gets the shader program code from memory for the specified address
ProgramCode GetShaderCode(VAddr addr) { ProgramCode GetShaderCode(const u8* host_ptr) {
ProgramCode program_code(VideoCommon::Shader::MAX_PROGRAM_LENGTH); ProgramCode program_code(VideoCommon::Shader::MAX_PROGRAM_LENGTH);
Memory::ReadBlock(addr, program_code.data(), program_code.size() * sizeof(u64)); std::memcpy(program_code.data(), host_ptr, program_code.size() * sizeof(u64));
return program_code; return program_code;
} }
@ -214,12 +214,13 @@ std::set<GLenum> GetSupportedFormats() {
} // namespace } // namespace
CachedShader::CachedShader(VAddr addr, u64 unique_identifier, Maxwell::ShaderProgram program_type, CachedShader::CachedShader(VAddr guest_addr, u64 unique_identifier,
ShaderDiskCacheOpenGL& disk_cache, Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache,
const PrecompiledPrograms& precompiled_programs, const PrecompiledPrograms& precompiled_programs,
ProgramCode&& program_code, ProgramCode&& program_code_b) ProgramCode&& program_code, ProgramCode&& program_code_b, u8* host_ptr)
: addr{addr}, unique_identifier{unique_identifier}, program_type{program_type}, : host_ptr{host_ptr}, guest_addr{guest_addr}, unique_identifier{unique_identifier},
disk_cache{disk_cache}, precompiled_programs{precompiled_programs} { program_type{program_type}, disk_cache{disk_cache},
precompiled_programs{precompiled_programs}, RasterizerCacheObject{host_ptr} {
const std::size_t code_size = CalculateProgramSize(program_code); const std::size_t code_size = CalculateProgramSize(program_code);
const std::size_t code_size_b = const std::size_t code_size_b =
@ -243,12 +244,13 @@ CachedShader::CachedShader(VAddr addr, u64 unique_identifier, Maxwell::ShaderPro
disk_cache.SaveRaw(raw); disk_cache.SaveRaw(raw);
} }
CachedShader::CachedShader(VAddr addr, u64 unique_identifier, Maxwell::ShaderProgram program_type, CachedShader::CachedShader(VAddr guest_addr, u64 unique_identifier,
ShaderDiskCacheOpenGL& disk_cache, Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache,
const PrecompiledPrograms& precompiled_programs, const PrecompiledPrograms& precompiled_programs,
GLShader::ProgramResult result) GLShader::ProgramResult result, u8* host_ptr)
: addr{addr}, unique_identifier{unique_identifier}, program_type{program_type}, : guest_addr{guest_addr}, unique_identifier{unique_identifier}, program_type{program_type},
disk_cache{disk_cache}, precompiled_programs{precompiled_programs} { disk_cache{disk_cache}, precompiled_programs{precompiled_programs}, RasterizerCacheObject{
host_ptr} {
code = std::move(result.first); code = std::move(result.first);
entries = result.second; entries = result.second;
@ -271,7 +273,7 @@ std::tuple<GLuint, BaseBindings> CachedShader::GetProgramHandle(GLenum primitive
disk_cache.SaveUsage(GetUsage(primitive_mode, base_bindings)); disk_cache.SaveUsage(GetUsage(primitive_mode, base_bindings));
} }
LabelGLObject(GL_PROGRAM, program->handle, addr); LabelGLObject(GL_PROGRAM, program->handle, guest_addr);
} }
handle = program->handle; handle = program->handle;
@ -323,7 +325,7 @@ GLuint CachedShader::LazyGeometryProgram(CachedProgram& target_program, BaseBind
disk_cache.SaveUsage(GetUsage(primitive_mode, base_bindings)); disk_cache.SaveUsage(GetUsage(primitive_mode, base_bindings));
} }
LabelGLObject(GL_PROGRAM, target_program->handle, addr, debug_name); LabelGLObject(GL_PROGRAM, target_program->handle, guest_addr, debug_name);
return target_program->handle; return target_program->handle;
}; };
@ -489,14 +491,17 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
const VAddr program_addr{GetShaderAddress(program)}; const VAddr program_addr{GetShaderAddress(program)};
// Look up shader in the cache based on address // Look up shader in the cache based on address
Shader shader{TryGet(program_addr)}; const auto& host_ptr{Memory::GetPointer(program_addr)};
Shader shader{TryGet(host_ptr)};
if (!shader) { if (!shader) {
// No shader found - create a new one // No shader found - create a new one
ProgramCode program_code = GetShaderCode(program_addr); const auto& host_ptr{Memory::GetPointer(program_addr)};
ProgramCode program_code{GetShaderCode(host_ptr)};
ProgramCode program_code_b; ProgramCode program_code_b;
if (program == Maxwell::ShaderProgram::VertexA) { if (program == Maxwell::ShaderProgram::VertexA) {
program_code_b = GetShaderCode(GetShaderAddress(Maxwell::ShaderProgram::VertexB)); program_code_b = GetShaderCode(
Memory::GetPointer(GetShaderAddress(Maxwell::ShaderProgram::VertexB)));
} }
const u64 unique_identifier = GetUniqueIdentifier(program, program_code, program_code_b); const u64 unique_identifier = GetUniqueIdentifier(program, program_code, program_code_b);
@ -504,11 +509,11 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
if (found != precompiled_shaders.end()) { if (found != precompiled_shaders.end()) {
shader = shader =
std::make_shared<CachedShader>(program_addr, unique_identifier, program, disk_cache, std::make_shared<CachedShader>(program_addr, unique_identifier, program, disk_cache,
precompiled_programs, found->second); precompiled_programs, found->second, host_ptr);
} else { } else {
shader = std::make_shared<CachedShader>( shader = std::make_shared<CachedShader>(
program_addr, unique_identifier, program, disk_cache, precompiled_programs, program_addr, unique_identifier, program, disk_cache, precompiled_programs,
std::move(program_code), std::move(program_code_b)); std::move(program_code), std::move(program_code_b), host_ptr);
} }
Register(shader); Register(shader);
} }

@ -39,18 +39,18 @@ using PrecompiledShaders = std::unordered_map<u64, GLShader::ProgramResult>;
class CachedShader final : public RasterizerCacheObject { class CachedShader final : public RasterizerCacheObject {
public: public:
explicit CachedShader(VAddr addr, u64 unique_identifier, Maxwell::ShaderProgram program_type, explicit CachedShader(VAddr guest_addr, u64 unique_identifier,
ShaderDiskCacheOpenGL& disk_cache, Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache,
const PrecompiledPrograms& precompiled_programs, const PrecompiledPrograms& precompiled_programs,
ProgramCode&& program_code, ProgramCode&& program_code_b); ProgramCode&& program_code, ProgramCode&& program_code_b, u8* host_ptr);
explicit CachedShader(VAddr addr, u64 unique_identifier, Maxwell::ShaderProgram program_type, explicit CachedShader(VAddr guest_addr, u64 unique_identifier,
ShaderDiskCacheOpenGL& disk_cache, Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache,
const PrecompiledPrograms& precompiled_programs, const PrecompiledPrograms& precompiled_programs,
GLShader::ProgramResult result); GLShader::ProgramResult result, u8* host_ptr);
VAddr GetAddr() const override { VAddr GetCpuAddr() const override {
return addr; return guest_addr;
} }
std::size_t GetSizeInBytes() const override { std::size_t GetSizeInBytes() const override {
@ -91,7 +91,8 @@ private:
ShaderDiskCacheUsage GetUsage(GLenum primitive_mode, BaseBindings base_bindings) const; ShaderDiskCacheUsage GetUsage(GLenum primitive_mode, BaseBindings base_bindings) const;
VAddr addr{}; u8* host_ptr{};
VAddr guest_addr{};
u64 unique_identifier{}; u64 unique_identifier{};
Maxwell::ShaderProgram program_type{}; Maxwell::ShaderProgram program_type{};
ShaderDiskCacheOpenGL& disk_cache; ShaderDiskCacheOpenGL& disk_cache;

@ -17,6 +17,11 @@
namespace Vulkan { namespace Vulkan {
CachedBufferEntry::CachedBufferEntry(VAddr cpu_addr, std::size_t size, u64 offset,
std::size_t alignment, u8* host_ptr)
: cpu_addr{cpu_addr}, size{size}, offset{offset}, alignment{alignment}, RasterizerCacheObject{
host_ptr} {}
VKBufferCache::VKBufferCache(Tegra::MemoryManager& tegra_memory_manager, VKBufferCache::VKBufferCache(Tegra::MemoryManager& tegra_memory_manager,
VideoCore::RasterizerInterface& rasterizer, const VKDevice& device, VideoCore::RasterizerInterface& rasterizer, const VKDevice& device,
VKMemoryManager& memory_manager, VKScheduler& scheduler, u64 size) VKMemoryManager& memory_manager, VKScheduler& scheduler, u64 size)
@ -37,16 +42,18 @@ VKBufferCache::~VKBufferCache() = default;
u64 VKBufferCache::UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size, u64 alignment, u64 VKBufferCache::UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size, u64 alignment,
bool cache) { bool cache) {
const auto cpu_addr{tegra_memory_manager.GpuToCpuAddress(gpu_addr)}; const auto cpu_addr{tegra_memory_manager.GpuToCpuAddress(gpu_addr)};
ASSERT(cpu_addr); ASSERT_MSG(cpu_addr, "Invalid GPU address");
// Cache management is a big overhead, so only cache entries with a given size. // Cache management is a big overhead, so only cache entries with a given size.
// TODO: Figure out which size is the best for given games. // TODO: Figure out which size is the best for given games.
cache &= size >= 2048; cache &= size >= 2048;
const auto& host_ptr{Memory::GetPointer(*cpu_addr)};
if (cache) { if (cache) {
if (auto entry = TryGet(*cpu_addr); entry) { auto entry = TryGet(host_ptr);
if (entry->size >= size && entry->alignment == alignment) { if (entry) {
return entry->offset; if (entry->GetSize() >= size && entry->GetAlignment() == alignment) {
return entry->GetOffset();
} }
Unregister(entry); Unregister(entry);
} }
@ -55,17 +62,17 @@ u64 VKBufferCache::UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size, u64
AlignBuffer(alignment); AlignBuffer(alignment);
const u64 uploaded_offset = buffer_offset; const u64 uploaded_offset = buffer_offset;
Memory::ReadBlock(*cpu_addr, buffer_ptr, size); if (!host_ptr) {
return uploaded_offset;
}
std::memcpy(buffer_ptr, host_ptr, size);
buffer_ptr += size; buffer_ptr += size;
buffer_offset += size; buffer_offset += size;
if (cache) { if (cache) {
auto entry = std::make_shared<CachedBufferEntry>(); auto entry = std::make_shared<CachedBufferEntry>(*cpu_addr, size, uploaded_offset,
entry->offset = uploaded_offset; alignment, host_ptr);
entry->size = size;
entry->alignment = alignment;
entry->addr = *cpu_addr;
Register(entry); Register(entry);
} }

@ -24,22 +24,39 @@ class VKFence;
class VKMemoryManager; class VKMemoryManager;
class VKStreamBuffer; class VKStreamBuffer;
struct CachedBufferEntry final : public RasterizerCacheObject { class CachedBufferEntry final : public RasterizerCacheObject {
VAddr GetAddr() const override { public:
return addr; explicit CachedBufferEntry(VAddr cpu_addr, std::size_t size, u64 offset, std::size_t alignment,
u8* host_ptr);
VAddr GetCpuAddr() const override {
return cpu_addr;
} }
std::size_t GetSizeInBytes() const override { std::size_t GetSizeInBytes() const override {
return size; return size;
} }
std::size_t GetSize() const {
return size;
}
u64 GetOffset() const {
return offset;
}
std::size_t GetAlignment() const {
return alignment;
}
// We do not have to flush this cache as things in it are never modified by us. // We do not have to flush this cache as things in it are never modified by us.
void Flush() override {} void Flush() override {}
VAddr addr; private:
std::size_t size; VAddr cpu_addr{};
u64 offset; std::size_t size{};
std::size_t alignment; u64 offset{};
std::size_t alignment{};
}; };
class VKBufferCache final : public RasterizerCache<std::shared_ptr<CachedBufferEntry>> { class VKBufferCache final : public RasterizerCache<std::shared_ptr<CachedBufferEntry>> {