{maxwell_3d,buffer_cache}: Implement memory barriers using 3D registers

Drop WriteBarrier from the buffer cache and handle Maxwell3D's
wait_for_idle register (WaitForIdle) instead.

To implement this on OpenGL we just call glMemoryBarrier with the
necessary bits.

Vulkan lacks this synchronization primitive, so we set an event and
immediately wait for it. This is not a pretty solution, but it's what
Vulkan can do without submitting the current command buffer to the queue
(which ends up being more expensive on the CPU).
master
ReinUsesLisp 2020-04-28 02:14:11 +07:00
parent 1517cba8ca
commit fe931ac976
13 changed files with 57 additions and 16 deletions

@ -89,10 +89,6 @@ public:
map->MarkAsWritten(true); map->MarkAsWritten(true);
MarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1); MarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1);
} }
} else {
if (map->IsWritten()) {
WriteBarrier();
}
} }
return {ToHandle(block), static_cast<u64>(block->GetOffset(cpu_addr))}; return {ToHandle(block), static_cast<u64>(block->GetOffset(cpu_addr))};
@ -254,8 +250,6 @@ protected:
virtual BufferType ToHandle(const OwnerBuffer& storage) = 0; virtual BufferType ToHandle(const OwnerBuffer& storage) = 0;
virtual void WriteBarrier() = 0;
virtual OwnerBuffer CreateBlock(VAddr cpu_addr, std::size_t size) = 0; virtual OwnerBuffer CreateBlock(VAddr cpu_addr, std::size_t size) = 0;
virtual void UploadBlockData(const OwnerBuffer& buffer, std::size_t offset, std::size_t size, virtual void UploadBlockData(const OwnerBuffer& buffer, std::size_t offset, std::size_t size,

@ -184,6 +184,10 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
} }
switch (method) { switch (method) {
case MAXWELL3D_REG_INDEX(wait_for_idle): {
rasterizer.WaitForIdle();
break;
}
case MAXWELL3D_REG_INDEX(shadow_ram_control): { case MAXWELL3D_REG_INDEX(shadow_ram_control): {
shadow_state.shadow_ram_control = static_cast<Regs::ShadowRamControl>(method_call.argument); shadow_state.shadow_ram_control = static_cast<Regs::ShadowRamControl>(method_call.argument);
break; break;

@ -709,7 +709,9 @@ public:
union { union {
struct { struct {
INSERT_UNION_PADDING_WORDS(0x45); INSERT_UNION_PADDING_WORDS(0x44);
u32 wait_for_idle;
struct { struct {
u32 upload_address; u32 upload_address;
@ -1535,6 +1537,7 @@ private:
static_assert(offsetof(Maxwell3D::Regs, field_name) == position * 4, \ static_assert(offsetof(Maxwell3D::Regs, field_name) == position * 4, \
"Field " #field_name " has invalid position") "Field " #field_name " has invalid position")
ASSERT_REG_POSITION(wait_for_idle, 0x44);
ASSERT_REG_POSITION(macros, 0x45); ASSERT_REG_POSITION(macros, 0x45);
ASSERT_REG_POSITION(shadow_ram_control, 0x49); ASSERT_REG_POSITION(shadow_ram_control, 0x49);
ASSERT_REG_POSITION(upload, 0x60); ASSERT_REG_POSITION(upload, 0x60);

@ -80,6 +80,9 @@ public:
/// and invalidated /// and invalidated
virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0; virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0;
/// Notify the host renderer to wait for previous primitive and compute operations.
virtual void WaitForIdle() = 0;
/// Notify the rasterizer to send all written commands to the host GPU. /// Notify the rasterizer to send all written commands to the host GPU.
virtual void FlushCommands() = 0; virtual void FlushCommands() = 0;

@ -51,10 +51,6 @@ Buffer OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
return std::make_shared<CachedBufferBlock>(cpu_addr, size); return std::make_shared<CachedBufferBlock>(cpu_addr, size);
} }
void OGLBufferCache::WriteBarrier() {
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
}
GLuint OGLBufferCache::ToHandle(const Buffer& buffer) { GLuint OGLBufferCache::ToHandle(const Buffer& buffer) {
return buffer->GetHandle(); return buffer->GetHandle();
} }

@ -59,8 +59,6 @@ protected:
GLuint ToHandle(const Buffer& buffer) override; GLuint ToHandle(const Buffer& buffer) override;
void WriteBarrier() override;
void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
const u8* data) override; const u8* data) override;

@ -725,6 +725,17 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) {
InvalidateRegion(addr, size); InvalidateRegion(addr, size);
} }
void RasterizerOpenGL::WaitForIdle() {
    // Barrier bits for the non-framebuffer side of the pipeline. The framebuffer side is tied to
    // another flag that is not currently implemented, so it is left out of this mask.
    const auto barrier_bits =
        GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT | GL_ELEMENT_ARRAY_BARRIER_BIT |
        GL_UNIFORM_BARRIER_BIT | GL_TEXTURE_FETCH_BARRIER_BIT |
        GL_SHADER_IMAGE_ACCESS_BARRIER_BIT | GL_COMMAND_BARRIER_BIT |
        GL_PIXEL_BUFFER_BARRIER_BIT | GL_TEXTURE_UPDATE_BARRIER_BIT |
        GL_BUFFER_UPDATE_BARRIER_BIT | GL_TRANSFORM_FEEDBACK_BARRIER_BIT |
        GL_SHADER_STORAGE_BARRIER_BIT | GL_QUERY_BUFFER_BARRIER_BIT;

    // Issue a single memory barrier covering all of the bits above.
    glMemoryBarrier(barrier_bits);
}
void RasterizerOpenGL::FlushCommands() { void RasterizerOpenGL::FlushCommands() {
// Only flush when we have commands queued to OpenGL. // Only flush when we have commands queued to OpenGL.
if (num_queued_commands == 0) { if (num_queued_commands == 0) {

@ -75,6 +75,7 @@ public:
void SignalSyncPoint(u32 value) override; void SignalSyncPoint(u32 value) override;
void ReleaseFences() override; void ReleaseFences() override;
void FlushAndInvalidateRegion(VAddr addr, u64 size) override; void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
void WaitForIdle() override;
void FlushCommands() override; void FlushCommands() override;
void TickFrame() override; void TickFrame() override;
bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,

@ -55,8 +55,6 @@ public:
protected: protected:
VkBuffer ToHandle(const Buffer& buffer) override; VkBuffer ToHandle(const Buffer& buffer) override;
void WriteBarrier() override {}
Buffer CreateBlock(VAddr cpu_addr, std::size_t size) override; Buffer CreateBlock(VAddr cpu_addr, std::size_t size) override;
void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,

@ -302,7 +302,7 @@ RasterizerVulkan::RasterizerVulkan(Core::System& system, Core::Frontend::EmuWind
buffer_cache(*this, system, device, memory_manager, scheduler, staging_pool), buffer_cache(*this, system, device, memory_manager, scheduler, staging_pool),
sampler_cache(device), sampler_cache(device),
fence_manager(system, *this, device, scheduler, texture_cache, buffer_cache, query_cache), fence_manager(system, *this, device, scheduler, texture_cache, buffer_cache, query_cache),
query_cache(system, *this, device, scheduler) { query_cache(system, *this, device, scheduler), wfi_event{device.GetLogical().CreateEvent()} {
scheduler.SetQueryCache(query_cache); scheduler.SetQueryCache(query_cache);
} }
@ -576,6 +576,26 @@ void RasterizerVulkan::FlushAndInvalidateRegion(VAddr addr, u64 size) {
InvalidateRegion(addr, size); InvalidateRegion(addr, size);
} }
void RasterizerVulkan::WaitForIdle() {
// Everything but wait pixel operations. This intentionally includes FRAGMENT_SHADER_BIT because
// fragment shaders can still write storage buffers.
VkPipelineStageFlags flags =
VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_VERTEX_INPUT_BIT |
VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |
VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |
VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT;
if (device.IsExtTransformFeedbackSupported()) {
flags |= VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT;
}
scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([event = *wfi_event, flags](vk::CommandBuffer cmdbuf) {
cmdbuf.SetEvent(event, flags);
cmdbuf.WaitEvents(event, flags, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, {}, {}, {});
});
}
void RasterizerVulkan::FlushCommands() { void RasterizerVulkan::FlushCommands() {
if (draw_counter > 0) { if (draw_counter > 0) {
draw_counter = 0; draw_counter = 0;

@ -127,6 +127,7 @@ public:
void SignalSyncPoint(u32 value) override; void SignalSyncPoint(u32 value) override;
void ReleaseFences() override; void ReleaseFences() override;
void FlushAndInvalidateRegion(VAddr addr, u64 size) override; void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
void WaitForIdle() override;
void FlushCommands() override; void FlushCommands() override;
void TickFrame() override; void TickFrame() override;
bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
@ -276,6 +277,7 @@ private:
vk::Buffer default_buffer; vk::Buffer default_buffer;
VKMemoryCommit default_buffer_commit; VKMemoryCommit default_buffer_commit;
vk::Event wfi_event;
std::array<View, Maxwell::NumRenderTargets> color_attachments; std::array<View, Maxwell::NumRenderTargets> color_attachments;
View zeta_attachment; View zeta_attachment;

@ -87,6 +87,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept {
X(vkCmdSetStencilReference); X(vkCmdSetStencilReference);
X(vkCmdSetStencilWriteMask); X(vkCmdSetStencilWriteMask);
X(vkCmdSetViewport); X(vkCmdSetViewport);
X(vkCmdWaitEvents);
X(vkCreateBuffer); X(vkCreateBuffer);
X(vkCreateBufferView); X(vkCreateBufferView);
X(vkCreateCommandPool); X(vkCreateCommandPool);

@ -205,6 +205,7 @@ struct DeviceDispatch : public InstanceDispatch {
PFN_vkCmdSetStencilReference vkCmdSetStencilReference; PFN_vkCmdSetStencilReference vkCmdSetStencilReference;
PFN_vkCmdSetStencilWriteMask vkCmdSetStencilWriteMask; PFN_vkCmdSetStencilWriteMask vkCmdSetStencilWriteMask;
PFN_vkCmdSetViewport vkCmdSetViewport; PFN_vkCmdSetViewport vkCmdSetViewport;
PFN_vkCmdWaitEvents vkCmdWaitEvents;
PFN_vkCreateBuffer vkCreateBuffer; PFN_vkCreateBuffer vkCreateBuffer;
PFN_vkCreateBufferView vkCreateBufferView; PFN_vkCreateBufferView vkCreateBufferView;
PFN_vkCreateCommandPool vkCreateCommandPool; PFN_vkCreateCommandPool vkCreateCommandPool;
@ -958,6 +959,15 @@ public:
dld->vkCmdSetEvent(handle, event, stage_flags); dld->vkCmdSetEvent(handle, event, stage_flags);
} }
/// Forwards to vkCmdWaitEvents through the device dispatch table, using this object's handle.
/// @param events          Events to wait on; passed as a size/data pair.
/// @param src_stage_mask  Source stage mask, forwarded unchanged.
/// @param dst_stage_mask  Destination stage mask, forwarded unchanged.
/// @param memory_barriers Global memory barriers; passed as a size/data pair.
/// @param buffer_barriers Buffer memory barriers; passed as a size/data pair.
/// @param image_barriers  Image memory barriers; passed as a size/data pair.
void WaitEvents(Span<VkEvent> events, VkPipelineStageFlags src_stage_mask,
VkPipelineStageFlags dst_stage_mask, Span<VkMemoryBarrier> memory_barriers,
Span<VkBufferMemoryBarrier> buffer_barriers,
Span<VkImageMemoryBarrier> image_barriers) const noexcept {
// Each span is expanded into the (count, pointer) pair taken by the C entry point.
dld->vkCmdWaitEvents(handle, events.size(), events.data(), src_stage_mask, dst_stage_mask,
memory_barriers.size(), memory_barriers.data(), buffer_barriers.size(),
buffer_barriers.data(), image_barriers.size(), image_barriers.data());
}
void BindTransformFeedbackBuffersEXT(u32 first, u32 count, const VkBuffer* buffers, void BindTransformFeedbackBuffersEXT(u32 first, u32 count, const VkBuffer* buffers,
const VkDeviceSize* offsets, const VkDeviceSize* offsets,
const VkDeviceSize* sizes) const noexcept { const VkDeviceSize* sizes) const noexcept {