@@ -16,6 +17,7 @@
#include <boost/container/small_vector.hpp>
#include "common/common_sizes.h"
#include "common/common_types.h"
#include "common/div_ceil.h"
#include "common/microprofile.h"
@@ -65,6 +66,9 @@ class BufferCache {
    static constexpr BufferId NULL_BUFFER_ID{0};
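    // Rough thresholds for the cache's estimated memory use: above EXPECTED_MEMORY the garbage
    // collector runs each frame, and above CRITICAL_MEMORY it becomes more aggressive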
    static constexpr u64 EXPECTED_MEMORY = Common::Size_512_MB;
    static constexpr u64 CRITICAL_MEMORY = Common::Size_1_GB;
    using Maxwell = Tegra::Engines::Maxwell3D::Regs;
    using Runtime = typename P::Runtime;
@@ -102,6 +106,8 @@ public:
    void TickFrame();
    void RunGarbageCollector();
    void WriteMemory(VAddr cpu_addr, u64 size);
    void CachedWriteMemory(VAddr cpu_addr, u64 size);
@@ -243,6 +249,8 @@ private:
    template <bool insert>
    void ChangeRegister(BufferId buffer_id);
    void TouchBuffer(Buffer& buffer) const noexcept;
    bool SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size);
    bool SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size);
@@ -255,6 +263,10 @@ private:
    void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, std::span<BufferCopy> copies);
    void DownloadBufferMemory(Buffer& buffer);
    void DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 size);
    void DeleteBuffer(BufferId buffer_id);
    void ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id);
@@ -319,6 +331,10 @@ private:
    size_t immediate_buffer_capacity = 0;
    std::unique_ptr<u8[]> immediate_buffer_alloc;
    typename SlotVector<Buffer>::Iterator deletion_iterator;
    u64 frame_tick = 0;
    u64 total_used_memory = 0;
    std::array<BufferId, ((1ULL << 39) >> PAGE_BITS)> page_table;
};
@@ -332,6 +348,28 @@ BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_,
      gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_}, runtime{runtime_} {
    // Ensure the first slot is used for the null buffer
    void(slot_buffers.insert(runtime, NullBufferParams{}));
    deletion_iterator = slot_buffers.end();
}
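
// Reclaims buffers that have not been used for a while. Walks the slot vector round-robin,
// downloading pending GPU writes before deleting each stale buffer. When usage exceeds
// CRITICAL_MEMORY, the pass scans more entries and uses a shorter retention window.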
template <class P>
void BufferCache<P>::RunGarbageCollector() {
    const bool aggressive_gc = total_used_memory >= CRITICAL_MEMORY;
    const u64 ticks_to_destroy = aggressive_gc ? 60 : 120;
    int num_iterations = aggressive_gc ? 64 : 32;
    for (; num_iterations > 0; --num_iterations) {
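        // Resume the scan where the previous pass left off, wrapping around the slot vector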
        if (deletion_iterator == slot_buffers.end()) {
            deletion_iterator = slot_buffers.begin();
        }
        ++deletion_iterator;
        if (deletion_iterator == slot_buffers.end()) {
            break;
        }
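        // Flush and free buffers that have not been touched for ticks_to_destroy frames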
        const auto [buffer_id, buffer] = *deletion_iterator;
        if (buffer->FrameTick() + ticks_to_destroy < frame_tick) {
            DownloadBufferMemory(*buffer);
            DeleteBuffer(buffer_id);
        }
    }
}
template <class P>
@@ -349,6 +387,10 @@ void BufferCache<P>::TickFrame() {
    const bool skip_preferred = hits * 256 < shots * 251;
    uniform_buffer_skip_cache_size = skip_preferred ? DEFAULT_SKIP_CACHE_SIZE : 0;
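    // Only collect garbage when the user enabled it and estimated usage exceeds the budget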
    if (Settings::values.use_caches_gc.GetValue() && total_used_memory >= EXPECTED_MEMORY) {
        RunGarbageCollector();
    }
    ++frame_tick;
    delayed_destruction_ring.Tick();
}
@@ -371,50 +413,8 @@ void BufferCache<P>::CachedWriteMemory(VAddr cpu_addr, u64 size) {
template <class P>
void BufferCache<P>::DownloadMemory(VAddr cpu_addr, u64 size) {
    ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) {
        boost::container::small_vector<BufferCopy, 1> copies;
        u64 total_size_bytes = 0;
        u64 largest_copy = 0;
        buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) {
            copies.push_back(BufferCopy{
                .src_offset = range_offset,
                .dst_offset = total_size_bytes,
                .size = range_size,
            });
            total_size_bytes += range_size;
            largest_copy = std::max(largest_copy, range_size);
        });
        if (total_size_bytes == 0) {
            return;
        }
        MICROPROFILE_SCOPE(GPU_DownloadMemory);
        if constexpr (USE_MEMORY_MAPS) {
            auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
            const u8* const mapped_memory = download_staging.mapped_span.data();
            const std::span<BufferCopy> copies_span(copies.data(), copies.data() + copies.size());
            for (BufferCopy& copy : copies) {
                // Modify copies to have the staging offset in mind
                copy.dst_offset += download_staging.offset;
            }
            runtime.CopyBuffer(download_staging.buffer, buffer, copies_span);
            runtime.Finish();
            for (const BufferCopy& copy : copies) {
                const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
                // Undo the modified offset
                const u64 dst_offset = copy.dst_offset - download_staging.offset;
                const u8* copy_mapped_memory = mapped_memory + dst_offset;
                cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size);
            }
        } else {
            const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy);
            for (const BufferCopy& copy : copies) {
                buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size));
                const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
                cpu_memory.WriteBlockUnsafe(copy_cpu_addr, immediate_buffer.data(), copy.size);
            }
        }
    });
    ForEachBufferInRange(cpu_addr, size,
                         [&](BufferId, Buffer& buffer) { DownloadBufferMemory(buffer); });
}
template <class P>
@@ -640,6 +640,7 @@ bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) {
template <class P>
void BufferCache<P>::BindHostIndexBuffer() {
    Buffer& buffer = slot_buffers[index_buffer.buffer_id];
    TouchBuffer(buffer);
    const u32 offset = buffer.Offset(index_buffer.cpu_addr);
    const u32 size = index_buffer.size;
    SynchronizeBuffer(buffer, index_buffer.cpu_addr, size);
@@ -658,6 +659,7 @@ void BufferCache<P>::BindHostVertexBuffers() {
    for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
        const Binding& binding = vertex_buffers[index];
        Buffer& buffer = slot_buffers[binding.buffer_id];
        TouchBuffer(buffer);
        SynchronizeBuffer(buffer, binding.cpu_addr, binding.size);
        if (!flags[Dirty::VertexBuffer0 + index]) {
            continue;
@@ -693,6 +695,7 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32
    const VAddr cpu_addr = binding.cpu_addr;
    const u32 size = binding.size;
    Buffer& buffer = slot_buffers[binding.buffer_id];
    TouchBuffer(buffer);
    const bool use_fast_buffer = binding.buffer_id != NULL_BUFFER_ID &&
                                 size <= uniform_buffer_skip_cache_size &&
                                 !buffer.IsRegionGpuModified(cpu_addr, size);
@@ -744,6 +747,7 @@ void BufferCache<P>::BindHostGraphicsStorageBuffers(size_t stage) {
    ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) {
        const Binding& binding = storage_buffers[stage][index];
        Buffer& buffer = slot_buffers[binding.buffer_id];
        TouchBuffer(buffer);
        const u32 size = binding.size;
        SynchronizeBuffer(buffer, binding.cpu_addr, size);
@@ -766,6 +770,7 @@ void BufferCache<P>::BindHostTransformFeedbackBuffers() {
    for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) {
        const Binding& binding = transform_feedback_buffers[index];
        Buffer& buffer = slot_buffers[binding.buffer_id];
        TouchBuffer(buffer);
        const u32 size = binding.size;
        SynchronizeBuffer(buffer, binding.cpu_addr, size);
@@ -784,6 +789,7 @@ void BufferCache<P>::BindHostComputeUniformBuffers() {
    ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) {
        const Binding& binding = compute_uniform_buffers[index];
        Buffer& buffer = slot_buffers[binding.buffer_id];
        TouchBuffer(buffer);
        const u32 size = binding.size;
        SynchronizeBuffer(buffer, binding.cpu_addr, size);
@@ -803,6 +809,7 @@ void BufferCache<P>::BindHostComputeStorageBuffers() {
    ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) {
        const Binding& binding = compute_storage_buffers[index];
        Buffer& buffer = slot_buffers[binding.buffer_id];
        TouchBuffer(buffer);
        const u32 size = binding.size;
        SynchronizeBuffer(buffer, binding.cpu_addr, size);
@@ -1101,6 +1108,7 @@ BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) {
    const OverlapResult overlap = ResolveOverlaps(cpu_addr, wanted_size);
    const u32 size = static_cast<u32>(overlap.end - overlap.begin);
    const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, overlap.begin, size);
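    // Mark the new buffer as used this frame so the garbage collector keeps it alive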
    TouchBuffer(slot_buffers[new_buffer_id]);
    for (const BufferId overlap_id : overlap.ids) {
        JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap);
    }
@@ -1122,8 +1130,14 @@ template <class P>
template <bool insert>
void BufferCache<P>::ChangeRegister(BufferId buffer_id) {
    const Buffer& buffer = slot_buffers[buffer_id];
    const auto size = buffer.SizeBytes();
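    // Keep a running estimate of host memory held by registered buffers; sizes are aligned to
    // 1 KiB, presumably to approximate allocation granularity, so insert and erase stay symmetric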
    if (insert) {
        total_used_memory += Common::AlignUp(size, 1024);
    } else {
        total_used_memory -= Common::AlignUp(size, 1024);
    }
    const VAddr cpu_addr_begin = buffer.CpuAddr();
    const VAddr cpu_addr_end = cpu_addr_begin + buffer.SizeBytes();
    const VAddr cpu_addr_end = cpu_addr_begin + size;
    const u64 page_begin = cpu_addr_begin / PAGE_SIZE;
    const u64 page_end = Common::DivCeil(cpu_addr_end, PAGE_SIZE);
    for (u64 page = page_begin; page != page_end; ++page) {
@@ -1135,6 +1149,11 @@ void BufferCache<P>::ChangeRegister(BufferId buffer_id) {
    }
}
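
// Marks a buffer as used in the current frame so the garbage collector skips it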
template <class P>
void BufferCache<P>::TouchBuffer(Buffer& buffer) const noexcept {
    buffer.SetFrameTick(frame_tick);
}
template <class P>
bool BufferCache<P>::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) {
    if (buffer.CpuAddr() == 0) {
@@ -1211,6 +1230,57 @@ void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes,
    runtime.CopyBuffer(buffer, upload_staging.buffer, copies);
}
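
// Writes back every GPU-modified range of the buffer to guest memory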
template <class P>
void BufferCache<P>::DownloadBufferMemory(Buffer& buffer) {
    DownloadBufferMemory(buffer, buffer.CpuAddr(), buffer.SizeBytes());
}
template <class P>
void BufferCache<P>::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 size) {
    boost::container::small_vector<BufferCopy, 1> copies;
    u64 total_size_bytes = 0;
    u64 largest_copy = 0;
    buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) {
        copies.push_back(BufferCopy{
            .src_offset = range_offset,
            .dst_offset = total_size_bytes,
            .size = range_size,
        });
        total_size_bytes += range_size;
        largest_copy = std::max(largest_copy, range_size);
    });
    if (total_size_bytes == 0) {
        return;
    }
    MICROPROFILE_SCOPE(GPU_DownloadMemory);
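    // With memory maps, batch every range into one staging buffer and copy back after a single
    // Finish(); otherwise fall back to per-range immediate downloads through a temporary buffer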
    if constexpr (USE_MEMORY_MAPS) {
        auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
        const u8* const mapped_memory = download_staging.mapped_span.data();
        const std::span<BufferCopy> copies_span(copies.data(), copies.data() + copies.size());
        for (BufferCopy& copy : copies) {
            // Modify copies to have the staging offset in mind
            copy.dst_offset += download_staging.offset;
        }
        runtime.CopyBuffer(download_staging.buffer, buffer, copies_span);
        runtime.Finish();
        for (const BufferCopy& copy : copies) {
            const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
            // Undo the modified offset
            const u64 dst_offset = copy.dst_offset - download_staging.offset;
            const u8* copy_mapped_memory = mapped_memory + dst_offset;
            cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size);
        }
    } else {
        const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy);
        for (const BufferCopy& copy : copies) {
            buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size));
            const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
            cpu_memory.WriteBlockUnsafe(copy_cpu_addr, immediate_buffer.data(), copy.size);
        }
    }
}
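
// Replaces any bindings that point to the deleted buffer, unregisters it, and pushes it onto
// delayed_destruction_ring, presumably so the host allocation outlives in-flight GPU work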
template <class P>
void BufferCache<P>::DeleteBuffer(BufferId buffer_id) {
    const auto scalar_replace = [buffer_id](Binding& binding) {
@@ -1236,6 +1306,7 @@ void BufferCache<P>::DeleteBuffer(BufferId buffer_id) {
    Unregister(buffer_id);
    delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id]));
    slot_buffers.erase(buffer_id);
    NotifyBufferDeletion();
}