gl_arb_decompiler: Use NV_shader_buffer_{load,store} on assembly shaders

NV_shader_buffer_{load,store} is a 2010 extension that allows GL applications
to use what in Vulkan is known as physical pointers, this is basically C
pointers. On GLASM these is exposed through the LOAD/STORE/ATOM
instructions.

Up until now, assembly shaders were using NV_shader_storage_buffer_object.
These work fine, but have a (probably unintended) limitation that forces
us to have the limit of a single stage for all shader stages. In contrast,
with NV_shader_buffer_{load,store} we can pass GPU addresses to the
shader through local parameters (GLASM equivalent uniform constants, or
push constants on Vulkan). Local parameters have the advantage of being
per stage, allowing us to generate code without worrying about binding
overlaps.
master
ReinUsesLisp 2020-06-25 17:12:33 +07:00
parent 90cbcaa44a
commit a8a2526128
7 changed files with 173 additions and 110 deletions

@ -185,10 +185,6 @@ std::string TextureType(const MetaTexture& meta) {
return type;
}
std::string GlobalMemoryName(const GlobalMemoryBase& base) {
return fmt::format("gmem{}_{}", base.cbuf_index, base.cbuf_offset);
}
class ARBDecompiler final {
public:
explicit ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
@ -199,6 +195,8 @@ public:
}
private:
void DefineGlobalMemory();
void DeclareHeader();
void DeclareVertex();
void DeclareGeometry();
@ -228,6 +226,7 @@ private:
std::pair<std::string, std::size_t> BuildCoords(Operation);
std::string BuildAoffi(Operation);
std::string GlobalMemoryPointer(const GmemNode& gmem);
void Exit();
std::string Assign(Operation);
@ -378,10 +377,8 @@ private:
std::string address;
std::string_view opname;
if (const auto gmem = std::get_if<GmemNode>(&*operation[0])) {
AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()),
Visit(gmem->GetBaseAddress()));
address = fmt::format("{}[{}]", GlobalMemoryName(gmem->GetDescriptor()), temporary);
opname = "ATOMB";
address = GlobalMemoryPointer(*gmem);
opname = "ATOM";
} else if (const auto smem = std::get_if<SmemNode>(&*operation[0])) {
address = fmt::format("shared_mem[{}]", Visit(smem->GetAddress()));
opname = "ATOMS";
@ -456,9 +453,13 @@ private:
shader_source += '\n';
}
std::string AllocTemporary() {
max_temporaries = std::max(max_temporaries, num_temporaries + 1);
return fmt::format("T{}.x", num_temporaries++);
std::string AllocLongVectorTemporary() {
max_long_temporaries = std::max(max_long_temporaries, num_long_temporaries + 1);
return fmt::format("L{}", num_long_temporaries++);
}
std::string AllocLongTemporary() {
return fmt::format("{}.x", AllocLongVectorTemporary());
}
std::string AllocVectorTemporary() {
@ -466,8 +467,13 @@ private:
return fmt::format("T{}", num_temporaries++);
}
std::string AllocTemporary() {
return fmt::format("{}.x", AllocVectorTemporary());
}
void ResetTemporaries() noexcept {
num_temporaries = 0;
num_long_temporaries = 0;
}
const Device& device;
@ -478,6 +484,11 @@ private:
std::size_t num_temporaries = 0;
std::size_t max_temporaries = 0;
std::size_t num_long_temporaries = 0;
std::size_t max_long_temporaries = 0;
std::map<GlobalMemoryBase, u32> global_memory_names;
std::string shader_source;
static constexpr std::string_view ADD_F32 = "ADD.F32";
@ -784,6 +795,8 @@ private:
ARBDecompiler::ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
ShaderType stage, std::string_view identifier)
: device{device}, ir{ir}, registry{registry}, stage{stage} {
DefineGlobalMemory();
AddLine("TEMP RC;");
AddLine("TEMP FSWZA[4];");
AddLine("TEMP FSWZB[4];");
@ -829,12 +842,20 @@ std::string_view HeaderStageName(ShaderType stage) {
}
}
void ARBDecompiler::DefineGlobalMemory() {
u32 binding = 0;
for (const auto& pair : ir.GetGlobalMemory()) {
const GlobalMemoryBase base = pair.first;
global_memory_names.emplace(base, binding);
++binding;
}
}
void ARBDecompiler::DeclareHeader() {
AddLine("!!NV{}5.0", HeaderStageName(stage));
// Enabling this allows us to cheat on some instructions like TXL with SHADOWARRAY2D
AddLine("OPTION NV_internal;");
AddLine("OPTION NV_gpu_program_fp64;");
AddLine("OPTION NV_shader_storage_buffer;");
AddLine("OPTION NV_shader_thread_group;");
if (ir.UsesWarps() && device.HasWarpIntrinsics()) {
AddLine("OPTION NV_shader_thread_shuffle;");
@ -951,11 +972,10 @@ void ARBDecompiler::DeclareLocalMemory() {
}
void ARBDecompiler::DeclareGlobalMemory() {
u32 binding = 0; // device.GetBaseBindings(stage).shader_storage_buffer;
for (const auto& pair : ir.GetGlobalMemory()) {
const auto& base = pair.first;
AddLine("STORAGE {}[] = {{ program.storage[{}] }};", GlobalMemoryName(base), binding);
++binding;
const std::size_t num_entries = ir.GetGlobalMemory().size();
if (num_entries > 0) {
const std::size_t num_vectors = Common::AlignUp(num_entries, 2) / 2;
AddLine("PARAM c[{}] = {{ program.local[0..{}] }};", num_vectors, num_vectors - 1);
}
}
@ -977,6 +997,9 @@ void ARBDecompiler::DeclareTemporaries() {
for (std::size_t i = 0; i < max_temporaries; ++i) {
AddLine("TEMP T{};", i);
}
for (std::size_t i = 0; i < max_long_temporaries; ++i) {
AddLine("LONG TEMP L{};", i);
}
}
void ARBDecompiler::DeclarePredicates() {
@ -1339,10 +1362,7 @@ std::string ARBDecompiler::Visit(const Node& node) {
if (const auto gmem = std::get_if<GmemNode>(&*node)) {
std::string temporary = AllocTemporary();
AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()),
Visit(gmem->GetBaseAddress()));
AddLine("LDB.U32 {}, {}[{}];", temporary, GlobalMemoryName(gmem->GetDescriptor()),
temporary);
AddLine("LOAD.U32 {}, {};", temporary, GlobalMemoryPointer(*gmem));
return temporary;
}
@ -1419,6 +1439,22 @@ std::string ARBDecompiler::BuildAoffi(Operation operation) {
return fmt::format(", offset({})", temporary);
}
std::string ARBDecompiler::GlobalMemoryPointer(const GmemNode& gmem) {
const u32 binding = global_memory_names.at(gmem.GetDescriptor());
const char result_swizzle = binding % 2 == 0 ? 'x' : 'y';
const std::string pointer = AllocLongVectorTemporary();
std::string temporary = AllocTemporary();
const u32 local_index = binding / 2;
AddLine("PK64.U {}, c[{}];", pointer, local_index);
AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem.GetRealAddress()),
Visit(gmem.GetBaseAddress()));
AddLine("CVT.U64.U32 {}.z, {};", pointer, temporary);
AddLine("ADD.U64 {}.x, {}.{}, {}.z;", pointer, pointer, result_swizzle, pointer);
return fmt::format("{}.x", pointer);
}
void ARBDecompiler::Exit() {
if (stage != ShaderType::Fragment) {
AddLine("RET;");
@ -1515,11 +1551,7 @@ std::string ARBDecompiler::Assign(Operation operation) {
ResetTemporaries();
return {};
} else if (const auto gmem = std::get_if<GmemNode>(&*dest)) {
const std::string temporary = AllocTemporary();
AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()),
Visit(gmem->GetBaseAddress()));
AddLine("STB.U32 {}, {}[{}];", Visit(src), GlobalMemoryName(gmem->GetDescriptor()),
temporary);
AddLine("STORE.U32 {}, {};", Visit(src), GlobalMemoryPointer(*gmem));
ResetTemporaries();
return {};
} else {

@ -26,7 +26,7 @@ Buffer::Buffer(const Device& device, VAddr cpu_addr, std::size_t size)
: VideoCommon::BufferBlock{cpu_addr, size} {
gl_buffer.Create();
glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW);
if (device.HasVertexBufferUnifiedMemory()) {
if (device.UseAssemblyShaders() || device.HasVertexBufferUnifiedMemory()) {
glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_WRITE);
glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
}

@ -139,6 +139,18 @@ void oglEnable(GLenum cap, bool state) {
(state ? glEnable : glDisable)(cap);
}
void UpdateBindlessPointers(GLenum target, GLuint64EXT* pointers, std::size_t num_entries) {
if (num_entries == 0) {
return;
}
if (num_entries % 2 == 1) {
pointers[num_entries] = 0;
}
const GLsizei num_vectors = static_cast<GLsizei>((num_entries + 1) / 2);
glProgramLocalParametersI4uivNV(target, 0, num_vectors,
reinterpret_cast<const GLuint*>(pointers));
}
} // Anonymous namespace
RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
@ -324,7 +336,6 @@ GLintptr RasterizerOpenGL::SetupIndexBuffer() {
void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
MICROPROFILE_SCOPE(OpenGL_Shader);
auto& gpu = system.GPU().Maxwell3D();
std::size_t num_ssbos = 0;
u32 clip_distances = 0;
for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
@ -347,29 +358,13 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
}
// Currently this stages are not supported in the OpenGL backend.
// Todo(Blinkhawk): Port tesselation shaders from Vulkan to OpenGL
if (program == Maxwell::ShaderProgram::TesselationControl) {
continue;
} else if (program == Maxwell::ShaderProgram::TesselationEval) {
// TODO(Blinkhawk): Port tesselation shaders from Vulkan to OpenGL
if (program == Maxwell::ShaderProgram::TesselationControl ||
program == Maxwell::ShaderProgram::TesselationEval) {
continue;
}
Shader* shader = shader_cache.GetStageProgram(program, async_shaders);
if (device.UseAssemblyShaders()) {
// Check for ARB limitation. We only have 16 SSBOs per context state. To workaround this
// all stages share the same bindings.
const std::size_t num_stage_ssbos = shader->GetEntries().global_memory_entries.size();
ASSERT_MSG(num_stage_ssbos == 0 || num_ssbos == 0, "SSBOs on more than one stage");
num_ssbos += num_stage_ssbos;
}
// Stage indices are 0 - 5
const std::size_t stage = index == 0 ? 0 : index - 1;
SetupDrawConstBuffers(stage, shader);
SetupDrawGlobalMemory(stage, shader);
SetupDrawTextures(stage, shader);
SetupDrawImages(stage, shader);
Shader* const shader = shader_cache.GetStageProgram(program, async_shaders);
const GLuint program_handle = shader->IsBuilt() ? shader->GetHandle() : 0;
switch (program) {
@ -388,6 +383,13 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
shader_config.enable.Value(), shader_config.offset);
}
// Stage indices are 0 - 5
const std::size_t stage = index == 0 ? 0 : index - 1;
SetupDrawConstBuffers(stage, shader);
SetupDrawGlobalMemory(stage, shader);
SetupDrawTextures(stage, shader);
SetupDrawImages(stage, shader);
// Workaround for Intel drivers.
// When a clip distance is enabled but not set in the shader it crops parts of the screen
// (sometimes it's half the screen, sometimes three quarters). To avoid this, enable the
@ -749,6 +751,8 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
current_cbuf = 0;
auto kernel = shader_cache.GetComputeKernel(code_addr);
program_manager.BindCompute(kernel->GetHandle());
SetupComputeTextures(kernel);
SetupComputeImages(kernel);
@ -763,7 +767,6 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
buffer_cache.Unmap();
const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
program_manager.BindCompute(kernel->GetHandle());
glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z);
++num_queued_commands;
}
@ -1023,40 +1026,66 @@ void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
}
void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader) {
static constexpr std::array TARGET_LUT = {
GL_VERTEX_PROGRAM_NV, GL_TESS_CONTROL_PROGRAM_NV, GL_TESS_EVALUATION_PROGRAM_NV,
GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV,
};
auto& gpu{system.GPU()};
auto& memory_manager{gpu.MemoryManager()};
const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]};
const auto& cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]};
const auto& entries{shader->GetEntries().global_memory_entries};
u32 binding =
device.UseAssemblyShaders() ? 0 : device.GetBaseBindings(stage_index).shader_storage_buffer;
for (const auto& entry : shader->GetEntries().global_memory_entries) {
std::array<GLuint64EXT, 32> pointers;
ASSERT(entries.size() < pointers.size());
const bool assembly_shaders = device.UseAssemblyShaders();
u32 binding = assembly_shaders ? 0 : device.GetBaseBindings(stage_index).shader_storage_buffer;
for (const auto& entry : entries) {
const GPUVAddr addr{cbufs.const_buffers[entry.cbuf_index].address + entry.cbuf_offset};
const GPUVAddr gpu_addr{memory_manager.Read<u64>(addr)};
const u32 size{memory_manager.Read<u32>(addr + 8)};
SetupGlobalMemory(binding++, entry, gpu_addr, size);
SetupGlobalMemory(binding, entry, gpu_addr, size, &pointers[binding]);
++binding;
}
if (assembly_shaders) {
UpdateBindlessPointers(TARGET_LUT[stage_index], pointers.data(), entries.size());
}
}
void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) {
auto& gpu{system.GPU()};
auto& memory_manager{gpu.MemoryManager()};
const auto cbufs{gpu.KeplerCompute().launch_description.const_buffer_config};
const auto& cbufs{gpu.KeplerCompute().launch_description.const_buffer_config};
const auto& entries{kernel->GetEntries().global_memory_entries};
std::array<GLuint64EXT, 32> pointers;
ASSERT(entries.size() < pointers.size());
u32 binding = 0;
for (const auto& entry : kernel->GetEntries().global_memory_entries) {
const auto addr{cbufs[entry.cbuf_index].Address() + entry.cbuf_offset};
const auto gpu_addr{memory_manager.Read<u64>(addr)};
const auto size{memory_manager.Read<u32>(addr + 8)};
SetupGlobalMemory(binding++, entry, gpu_addr, size);
for (const auto& entry : entries) {
const GPUVAddr addr{cbufs[entry.cbuf_index].Address() + entry.cbuf_offset};
const GPUVAddr gpu_addr{memory_manager.Read<u64>(addr)};
const u32 size{memory_manager.Read<u32>(addr + 8)};
SetupGlobalMemory(binding, entry, gpu_addr, size, &pointers[binding]);
++binding;
}
if (device.UseAssemblyShaders()) {
UpdateBindlessPointers(GL_COMPUTE_PROGRAM_NV, pointers.data(), entries.size());
}
}
void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry,
GPUVAddr gpu_addr, std::size_t size) {
const auto alignment{device.GetShaderStorageBufferAlignment()};
GPUVAddr gpu_addr, std::size_t size,
GLuint64EXT* pointer) {
const std::size_t alignment{device.GetShaderStorageBufferAlignment()};
const auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written);
glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset,
static_cast<GLsizeiptr>(size));
if (device.UseAssemblyShaders()) {
*pointer = info.address + info.offset;
} else {
glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset,
static_cast<GLsizeiptr>(size));
}
}
void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, Shader* shader) {

@ -124,9 +124,9 @@ private:
/// Configures the current global memory entries to use for the kernel invocation.
void SetupComputeGlobalMemory(Shader* kernel);
/// Configures a constant buffer.
/// Configures a global memory buffer.
void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr,
std::size_t size);
std::size_t size, GLuint64EXT* pointer);
/// Configures the current textures to use for the draw command.
void SetupDrawTextures(std::size_t stage_index, Shader* shader);

@ -11,8 +11,30 @@
namespace OpenGL {
ProgramManager::ProgramManager(const Device& device) {
use_assembly_programs = device.UseAssemblyShaders();
namespace {
void BindProgram(GLenum stage, GLuint current, GLuint old, bool& enabled) {
if (current == old) {
return;
}
if (current == 0) {
if (enabled) {
enabled = false;
glDisable(stage);
}
return;
}
if (!enabled) {
enabled = true;
glEnable(stage);
}
glBindProgramARB(stage, current);
}
} // Anonymous namespace
ProgramManager::ProgramManager(const Device& device)
: use_assembly_programs{device.UseAssemblyShaders()} {
if (use_assembly_programs) {
glEnable(GL_COMPUTE_PROGRAM_NV);
} else {
@ -33,9 +55,7 @@ void ProgramManager::BindCompute(GLuint program) {
}
void ProgramManager::BindGraphicsPipeline() {
if (use_assembly_programs) {
UpdateAssemblyPrograms();
} else {
if (!use_assembly_programs) {
UpdateSourcePrograms();
}
}
@ -63,32 +83,25 @@ void ProgramManager::RestoreGuestPipeline() {
}
}
void ProgramManager::UpdateAssemblyPrograms() {
const auto update_state = [](GLenum stage, bool& enabled, GLuint current, GLuint old) {
if (current == old) {
return;
}
if (current == 0) {
if (enabled) {
enabled = false;
glDisable(stage);
}
return;
}
if (!enabled) {
enabled = true;
glEnable(stage);
}
glBindProgramARB(stage, current);
};
void ProgramManager::UseVertexShader(GLuint program) {
if (use_assembly_programs) {
BindProgram(GL_VERTEX_PROGRAM_NV, program, current_state.vertex, vertex_enabled);
}
current_state.vertex = program;
}
update_state(GL_VERTEX_PROGRAM_NV, vertex_enabled, current_state.vertex, old_state.vertex);
update_state(GL_GEOMETRY_PROGRAM_NV, geometry_enabled, current_state.geometry,
old_state.geometry);
update_state(GL_FRAGMENT_PROGRAM_NV, fragment_enabled, current_state.fragment,
old_state.fragment);
void ProgramManager::UseGeometryShader(GLuint program) {
if (use_assembly_programs) {
BindProgram(GL_GEOMETRY_PROGRAM_NV, program, current_state.vertex, geometry_enabled);
}
current_state.geometry = program;
}
old_state = current_state;
void ProgramManager::UseFragmentShader(GLuint program) {
if (use_assembly_programs) {
BindProgram(GL_FRAGMENT_PROGRAM_NV, program, current_state.vertex, fragment_enabled);
}
current_state.fragment = program;
}
void ProgramManager::UpdateSourcePrograms() {

@ -45,17 +45,9 @@ public:
/// Rewinds BindHostPipeline state changes.
void RestoreGuestPipeline();
void UseVertexShader(GLuint program) {
current_state.vertex = program;
}
void UseGeometryShader(GLuint program) {
current_state.geometry = program;
}
void UseFragmentShader(GLuint program) {
current_state.fragment = program;
}
void UseVertexShader(GLuint program);
void UseGeometryShader(GLuint program);
void UseFragmentShader(GLuint program);
private:
struct PipelineState {
@ -64,9 +56,6 @@ private:
GLuint fragment = 0;
};
/// Update NV_gpu_program5 programs.
void UpdateAssemblyPrograms();
/// Update GLSL programs.
void UpdateSourcePrograms();

@ -35,7 +35,7 @@ OGLStreamBuffer::OGLStreamBuffer(const Device& device, GLsizeiptr size, bool ver
mapped_ptr = static_cast<u8*>(
glMapNamedBufferRange(gl_buffer.handle, 0, buffer_size, flags | GL_MAP_FLUSH_EXPLICIT_BIT));
if (device.HasVertexBufferUnifiedMemory()) {
if (device.UseAssemblyShaders() || device.HasVertexBufferUnifiedMemory()) {
glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_ONLY);
glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
}