|
|
|
@ -243,6 +243,8 @@ private:
|
|
|
|
|
template <bool insert>
|
|
|
|
|
void ChangeRegister(BufferId buffer_id);
|
|
|
|
|
|
|
|
|
|
void TouchBuffer(Buffer& buffer) const noexcept;
|
|
|
|
|
|
|
|
|
|
bool SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size);
|
|
|
|
|
|
|
|
|
|
bool SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size);
|
|
|
|
@ -255,6 +257,10 @@ private:
|
|
|
|
|
|
|
|
|
|
void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, std::span<BufferCopy> copies);
|
|
|
|
|
|
|
|
|
|
void DownloadBufferMemory(Buffer& buffer_id);
|
|
|
|
|
|
|
|
|
|
void DownloadBufferMemory(Buffer& buffer_id, VAddr cpu_addr, u64 size);
|
|
|
|
|
|
|
|
|
|
void DeleteBuffer(BufferId buffer_id);
|
|
|
|
|
|
|
|
|
|
void ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id);
|
|
|
|
@ -319,6 +325,9 @@ private:
|
|
|
|
|
size_t immediate_buffer_capacity = 0;
|
|
|
|
|
std::unique_ptr<u8[]> immediate_buffer_alloc;
|
|
|
|
|
|
|
|
|
|
typename SlotVector<Buffer>::Iterator deletion_iterator;
|
|
|
|
|
u64 frame_tick = 0;
|
|
|
|
|
|
|
|
|
|
std::array<BufferId, ((1ULL << 39) >> PAGE_BITS)> page_table;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
@ -332,6 +341,7 @@ BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_,
|
|
|
|
|
gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_}, runtime{runtime_} {
|
|
|
|
|
// Ensure the first slot is used for the null buffer
|
|
|
|
|
void(slot_buffers.insert(runtime, NullBufferParams{}));
|
|
|
|
|
deletion_iterator = slot_buffers.end();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template <class P>
|
|
|
|
@ -349,7 +359,24 @@ void BufferCache<P>::TickFrame() {
|
|
|
|
|
const bool skip_preferred = hits * 256 < shots * 251;
|
|
|
|
|
uniform_buffer_skip_cache_size = skip_preferred ? DEFAULT_SKIP_CACHE_SIZE : 0;
|
|
|
|
|
|
|
|
|
|
static constexpr u64 ticks_to_destroy = 120;
|
|
|
|
|
int num_iterations = 32;
|
|
|
|
|
for (; num_iterations > 0; --num_iterations) {
|
|
|
|
|
if (deletion_iterator == slot_buffers.end()) {
|
|
|
|
|
deletion_iterator = slot_buffers.begin();
|
|
|
|
|
}
|
|
|
|
|
++deletion_iterator;
|
|
|
|
|
if (deletion_iterator == slot_buffers.end()) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
const auto [buffer_id, buffer] = *deletion_iterator;
|
|
|
|
|
if (buffer->FrameTick() + ticks_to_destroy < frame_tick) {
|
|
|
|
|
DownloadBufferMemory(*buffer);
|
|
|
|
|
DeleteBuffer(buffer_id);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
delayed_destruction_ring.Tick();
|
|
|
|
|
++frame_tick;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template <class P>
|
|
|
|
@ -371,50 +398,8 @@ void BufferCache<P>::CachedWriteMemory(VAddr cpu_addr, u64 size) {
|
|
|
|
|
|
|
|
|
|
template <class P>
|
|
|
|
|
void BufferCache<P>::DownloadMemory(VAddr cpu_addr, u64 size) {
|
|
|
|
|
ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) {
|
|
|
|
|
boost::container::small_vector<BufferCopy, 1> copies;
|
|
|
|
|
u64 total_size_bytes = 0;
|
|
|
|
|
u64 largest_copy = 0;
|
|
|
|
|
buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) {
|
|
|
|
|
copies.push_back(BufferCopy{
|
|
|
|
|
.src_offset = range_offset,
|
|
|
|
|
.dst_offset = total_size_bytes,
|
|
|
|
|
.size = range_size,
|
|
|
|
|
});
|
|
|
|
|
total_size_bytes += range_size;
|
|
|
|
|
largest_copy = std::max(largest_copy, range_size);
|
|
|
|
|
});
|
|
|
|
|
if (total_size_bytes == 0) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
MICROPROFILE_SCOPE(GPU_DownloadMemory);
|
|
|
|
|
|
|
|
|
|
if constexpr (USE_MEMORY_MAPS) {
|
|
|
|
|
auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
|
|
|
|
|
const u8* const mapped_memory = download_staging.mapped_span.data();
|
|
|
|
|
const std::span<BufferCopy> copies_span(copies.data(), copies.data() + copies.size());
|
|
|
|
|
for (BufferCopy& copy : copies) {
|
|
|
|
|
// Modify copies to have the staging offset in mind
|
|
|
|
|
copy.dst_offset += download_staging.offset;
|
|
|
|
|
}
|
|
|
|
|
runtime.CopyBuffer(download_staging.buffer, buffer, copies_span);
|
|
|
|
|
runtime.Finish();
|
|
|
|
|
for (const BufferCopy& copy : copies) {
|
|
|
|
|
const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
|
|
|
|
|
// Undo the modified offset
|
|
|
|
|
const u64 dst_offset = copy.dst_offset - download_staging.offset;
|
|
|
|
|
const u8* copy_mapped_memory = mapped_memory + dst_offset;
|
|
|
|
|
cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size);
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy);
|
|
|
|
|
for (const BufferCopy& copy : copies) {
|
|
|
|
|
buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size));
|
|
|
|
|
const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
|
|
|
|
|
cpu_memory.WriteBlockUnsafe(copy_cpu_addr, immediate_buffer.data(), copy.size);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
});
|
|
|
|
|
ForEachBufferInRange(cpu_addr, size,
|
|
|
|
|
[&](BufferId, Buffer& buffer) { DownloadBufferMemory(buffer); });
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template <class P>
|
|
|
|
@ -640,6 +625,7 @@ bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) {
|
|
|
|
|
template <class P>
|
|
|
|
|
void BufferCache<P>::BindHostIndexBuffer() {
|
|
|
|
|
Buffer& buffer = slot_buffers[index_buffer.buffer_id];
|
|
|
|
|
TouchBuffer(buffer);
|
|
|
|
|
const u32 offset = buffer.Offset(index_buffer.cpu_addr);
|
|
|
|
|
const u32 size = index_buffer.size;
|
|
|
|
|
SynchronizeBuffer(buffer, index_buffer.cpu_addr, size);
|
|
|
|
@ -658,6 +644,7 @@ void BufferCache<P>::BindHostVertexBuffers() {
|
|
|
|
|
for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
|
|
|
|
|
const Binding& binding = vertex_buffers[index];
|
|
|
|
|
Buffer& buffer = slot_buffers[binding.buffer_id];
|
|
|
|
|
TouchBuffer(buffer);
|
|
|
|
|
SynchronizeBuffer(buffer, binding.cpu_addr, binding.size);
|
|
|
|
|
if (!flags[Dirty::VertexBuffer0 + index]) {
|
|
|
|
|
continue;
|
|
|
|
@ -693,6 +680,7 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32
|
|
|
|
|
const VAddr cpu_addr = binding.cpu_addr;
|
|
|
|
|
const u32 size = binding.size;
|
|
|
|
|
Buffer& buffer = slot_buffers[binding.buffer_id];
|
|
|
|
|
TouchBuffer(buffer);
|
|
|
|
|
const bool use_fast_buffer = binding.buffer_id != NULL_BUFFER_ID &&
|
|
|
|
|
size <= uniform_buffer_skip_cache_size &&
|
|
|
|
|
!buffer.IsRegionGpuModified(cpu_addr, size);
|
|
|
|
@ -744,6 +732,7 @@ void BufferCache<P>::BindHostGraphicsStorageBuffers(size_t stage) {
|
|
|
|
|
ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) {
|
|
|
|
|
const Binding& binding = storage_buffers[stage][index];
|
|
|
|
|
Buffer& buffer = slot_buffers[binding.buffer_id];
|
|
|
|
|
TouchBuffer(buffer);
|
|
|
|
|
const u32 size = binding.size;
|
|
|
|
|
SynchronizeBuffer(buffer, binding.cpu_addr, size);
|
|
|
|
|
|
|
|
|
@ -766,6 +755,7 @@ void BufferCache<P>::BindHostTransformFeedbackBuffers() {
|
|
|
|
|
for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) {
|
|
|
|
|
const Binding& binding = transform_feedback_buffers[index];
|
|
|
|
|
Buffer& buffer = slot_buffers[binding.buffer_id];
|
|
|
|
|
TouchBuffer(buffer);
|
|
|
|
|
const u32 size = binding.size;
|
|
|
|
|
SynchronizeBuffer(buffer, binding.cpu_addr, size);
|
|
|
|
|
|
|
|
|
@ -784,6 +774,7 @@ void BufferCache<P>::BindHostComputeUniformBuffers() {
|
|
|
|
|
ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) {
|
|
|
|
|
const Binding& binding = compute_uniform_buffers[index];
|
|
|
|
|
Buffer& buffer = slot_buffers[binding.buffer_id];
|
|
|
|
|
TouchBuffer(buffer);
|
|
|
|
|
const u32 size = binding.size;
|
|
|
|
|
SynchronizeBuffer(buffer, binding.cpu_addr, size);
|
|
|
|
|
|
|
|
|
@ -803,6 +794,7 @@ void BufferCache<P>::BindHostComputeStorageBuffers() {
|
|
|
|
|
ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) {
|
|
|
|
|
const Binding& binding = compute_storage_buffers[index];
|
|
|
|
|
Buffer& buffer = slot_buffers[binding.buffer_id];
|
|
|
|
|
TouchBuffer(buffer);
|
|
|
|
|
const u32 size = binding.size;
|
|
|
|
|
SynchronizeBuffer(buffer, binding.cpu_addr, size);
|
|
|
|
|
|
|
|
|
@ -1101,6 +1093,7 @@ BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) {
|
|
|
|
|
const OverlapResult overlap = ResolveOverlaps(cpu_addr, wanted_size);
|
|
|
|
|
const u32 size = static_cast<u32>(overlap.end - overlap.begin);
|
|
|
|
|
const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, overlap.begin, size);
|
|
|
|
|
TouchBuffer(slot_buffers[new_buffer_id]);
|
|
|
|
|
for (const BufferId overlap_id : overlap.ids) {
|
|
|
|
|
JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap);
|
|
|
|
|
}
|
|
|
|
@ -1135,6 +1128,11 @@ void BufferCache<P>::ChangeRegister(BufferId buffer_id) {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template <class P>
|
|
|
|
|
void BufferCache<P>::TouchBuffer(Buffer& buffer) const noexcept {
|
|
|
|
|
buffer.SetFrameTick(frame_tick);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template <class P>
|
|
|
|
|
bool BufferCache<P>::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) {
|
|
|
|
|
if (buffer.CpuAddr() == 0) {
|
|
|
|
@ -1211,6 +1209,57 @@ void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes,
|
|
|
|
|
runtime.CopyBuffer(buffer, upload_staging.buffer, copies);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template <class P>
|
|
|
|
|
void BufferCache<P>::DownloadBufferMemory(Buffer& buffer) {
|
|
|
|
|
DownloadBufferMemory(buffer, buffer.CpuAddr(), buffer.SizeBytes());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template <class P>
|
|
|
|
|
void BufferCache<P>::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 size) {
|
|
|
|
|
boost::container::small_vector<BufferCopy, 1> copies;
|
|
|
|
|
u64 total_size_bytes = 0;
|
|
|
|
|
u64 largest_copy = 0;
|
|
|
|
|
buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) {
|
|
|
|
|
copies.push_back(BufferCopy{
|
|
|
|
|
.src_offset = range_offset,
|
|
|
|
|
.dst_offset = total_size_bytes,
|
|
|
|
|
.size = range_size,
|
|
|
|
|
});
|
|
|
|
|
total_size_bytes += range_size;
|
|
|
|
|
largest_copy = std::max(largest_copy, range_size);
|
|
|
|
|
});
|
|
|
|
|
if (total_size_bytes == 0) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
MICROPROFILE_SCOPE(GPU_DownloadMemory);
|
|
|
|
|
|
|
|
|
|
if constexpr (USE_MEMORY_MAPS) {
|
|
|
|
|
auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
|
|
|
|
|
const u8* const mapped_memory = download_staging.mapped_span.data();
|
|
|
|
|
const std::span<BufferCopy> copies_span(copies.data(), copies.data() + copies.size());
|
|
|
|
|
for (BufferCopy& copy : copies) {
|
|
|
|
|
// Modify copies to have the staging offset in mind
|
|
|
|
|
copy.dst_offset += download_staging.offset;
|
|
|
|
|
}
|
|
|
|
|
runtime.CopyBuffer(download_staging.buffer, buffer, copies_span);
|
|
|
|
|
runtime.Finish();
|
|
|
|
|
for (const BufferCopy& copy : copies) {
|
|
|
|
|
const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
|
|
|
|
|
// Undo the modified offset
|
|
|
|
|
const u64 dst_offset = copy.dst_offset - download_staging.offset;
|
|
|
|
|
const u8* copy_mapped_memory = mapped_memory + dst_offset;
|
|
|
|
|
cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size);
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy);
|
|
|
|
|
for (const BufferCopy& copy : copies) {
|
|
|
|
|
buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size));
|
|
|
|
|
const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
|
|
|
|
|
cpu_memory.WriteBlockUnsafe(copy_cpu_addr, immediate_buffer.data(), copy.size);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template <class P>
|
|
|
|
|
void BufferCache<P>::DeleteBuffer(BufferId buffer_id) {
|
|
|
|
|
const auto scalar_replace = [buffer_id](Binding& binding) {
|
|
|
|
@ -1236,6 +1285,7 @@ void BufferCache<P>::DeleteBuffer(BufferId buffer_id) {
|
|
|
|
|
|
|
|
|
|
Unregister(buffer_id);
|
|
|
|
|
delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id]));
|
|
|
|
|
slot_buffers.erase(buffer_id);
|
|
|
|
|
|
|
|
|
|
NotifyBufferDeletion();
|
|
|
|
|
}
|
|
|
|
|