Merge pull request #1904 from bunnei/better-fermi-copy

gl_rasterizer: Implement a more accurate fermi 2D copy.
master
bunnei 2019-02-08 23:32:24 +07:00 committed by GitHub
commit 1d98027a0e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 206 additions and 72 deletions

@ -21,7 +21,9 @@ void Fermi2D::CallMethod(const GPU::MethodCall& method_call) {
regs.reg_array[method_call.method] = method_call.argument;
switch (method_call.method) {
case FERMI2D_REG_INDEX(trigger): {
// Trigger the surface copy on the last register write. This is blit_src_y, but this is 64-bit,
// so trigger on the second 32-bit write.
case FERMI2D_REG_INDEX(blit_src_y) + 1: {
HandleSurfaceCopy();
break;
}
@ -32,57 +34,23 @@ void Fermi2D::HandleSurfaceCopy() {
LOG_WARNING(HW_GPU, "Requested a surface copy with operation {}",
static_cast<u32>(regs.operation));
const GPUVAddr source = regs.src.Address();
const GPUVAddr dest = regs.dst.Address();
// TODO(Subv): Only same-format and same-size copies are allowed for now.
ASSERT(regs.src.format == regs.dst.format);
ASSERT(regs.src.width * regs.src.height == regs.dst.width * regs.dst.height);
// TODO(Subv): Only raw copies are implemented.
ASSERT(regs.operation == Regs::Operation::SrcCopy);
const auto source_cpu = memory_manager.GpuToCpuAddress(source);
const auto dest_cpu = memory_manager.GpuToCpuAddress(dest);
ASSERT_MSG(source_cpu, "Invalid source GPU address");
ASSERT_MSG(dest_cpu, "Invalid destination GPU address");
const u32 src_blit_x1{static_cast<u32>(regs.blit_src_x >> 32)};
const u32 src_blit_y1{static_cast<u32>(regs.blit_src_y >> 32)};
const u32 src_blit_x2{
static_cast<u32>((regs.blit_src_x + (regs.blit_dst_width * regs.blit_du_dx)) >> 32)};
const u32 src_blit_y2{
static_cast<u32>((regs.blit_src_y + (regs.blit_dst_height * regs.blit_dv_dy)) >> 32)};
u32 src_bytes_per_pixel = RenderTargetBytesPerPixel(regs.src.format);
u32 dst_bytes_per_pixel = RenderTargetBytesPerPixel(regs.dst.format);
const MathUtil::Rectangle<u32> src_rect{src_blit_x1, src_blit_y1, src_blit_x2, src_blit_y2};
const MathUtil::Rectangle<u32> dst_rect{regs.blit_dst_x, regs.blit_dst_y,
regs.blit_dst_x + regs.blit_dst_width,
regs.blit_dst_y + regs.blit_dst_height};
if (!rasterizer.AccelerateSurfaceCopy(regs.src, regs.dst)) {
// All copies here update the main memory, so mark all rasterizer states as invalid.
Core::System::GetInstance().GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
rasterizer.FlushRegion(*source_cpu, src_bytes_per_pixel * regs.src.width * regs.src.height);
// We have to invalidate the destination region to evict any outdated surfaces from the
// cache. We do this before actually writing the new data because the destination address
// might contain a dirty surface that will have to be written back to memory.
rasterizer.InvalidateRegion(*dest_cpu,
dst_bytes_per_pixel * regs.dst.width * regs.dst.height);
if (regs.src.linear == regs.dst.linear) {
// If the input layout and the output layout are the same, just perform a raw copy.
ASSERT(regs.src.BlockHeight() == regs.dst.BlockHeight());
Memory::CopyBlock(*dest_cpu, *source_cpu,
src_bytes_per_pixel * regs.dst.width * regs.dst.height);
return;
}
u8* src_buffer = Memory::GetPointer(*source_cpu);
u8* dst_buffer = Memory::GetPointer(*dest_cpu);
if (!regs.src.linear && regs.dst.linear) {
// If the input is tiled and the output is linear, deswizzle the input and copy it over.
Texture::CopySwizzledData(regs.src.width, regs.src.height, regs.src.depth,
src_bytes_per_pixel, dst_bytes_per_pixel, src_buffer,
dst_buffer, true, regs.src.BlockHeight(),
regs.src.BlockDepth(), 0);
} else {
// If the input is linear and the output is tiled, swizzle the input and copy it over.
Texture::CopySwizzledData(regs.src.width, regs.src.height, regs.src.depth,
src_bytes_per_pixel, dst_bytes_per_pixel, dst_buffer,
src_buffer, false, regs.dst.BlockHeight(),
regs.dst.BlockDepth(), 0);
}
if (!rasterizer.AccelerateSurfaceCopy(regs.src, regs.dst, src_rect, dst_rect)) {
UNIMPLEMENTED();
}
}

@ -94,12 +94,22 @@ public:
Operation operation;
INSERT_PADDING_WORDS(0x9);
INSERT_PADDING_WORDS(0x177);
// TODO(Subv): This is only a guess.
u32 trigger;
u32 blit_control;
INSERT_PADDING_WORDS(0x1A3);
INSERT_PADDING_WORDS(0x8);
u32 blit_dst_x;
u32 blit_dst_y;
u32 blit_dst_width;
u32 blit_dst_height;
u64 blit_du_dx;
u64 blit_dv_dy;
u64 blit_src_x;
u64 blit_src_y;
INSERT_PADDING_WORDS(0x21);
};
std::array<u32, NUM_REGS> reg_array;
};
@ -122,7 +132,16 @@ private:
ASSERT_REG_POSITION(dst, 0x80);
ASSERT_REG_POSITION(src, 0x8C);
ASSERT_REG_POSITION(operation, 0xAB);
ASSERT_REG_POSITION(trigger, 0xB5);
ASSERT_REG_POSITION(blit_control, 0x223);
ASSERT_REG_POSITION(blit_dst_x, 0x22c);
ASSERT_REG_POSITION(blit_dst_y, 0x22d);
ASSERT_REG_POSITION(blit_dst_width, 0x22e);
ASSERT_REG_POSITION(blit_dst_height, 0x22f);
ASSERT_REG_POSITION(blit_du_dx, 0x230);
ASSERT_REG_POSITION(blit_dv_dy, 0x232);
ASSERT_REG_POSITION(blit_src_x, 0x234);
ASSERT_REG_POSITION(blit_src_y, 0x236);
#undef ASSERT_REG_POSITION
} // namespace Tegra::Engines

@ -46,7 +46,9 @@ public:
/// Attempt to use a faster method to perform a surface copy
virtual bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
const Tegra::Engines::Fermi2D::Regs::Surface& dst) {
const Tegra::Engines::Fermi2D::Regs::Surface& dst,
const MathUtil::Rectangle<u32>& src_rect,
const MathUtil::Rectangle<u32>& dst_rect) {
return false;
}

@ -778,15 +778,11 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) {
}
bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
const Tegra::Engines::Fermi2D::Regs::Surface& dst) {
const Tegra::Engines::Fermi2D::Regs::Surface& dst,
const MathUtil::Rectangle<u32>& src_rect,
const MathUtil::Rectangle<u32>& dst_rect) {
MICROPROFILE_SCOPE(OpenGL_Blits);
if (Settings::values.use_accurate_gpu_emulation) {
// Skip the accelerated copy and perform a slow but more accurate copy
return false;
}
res_cache.FermiCopySurface(src, dst);
res_cache.FermiCopySurface(src, dst, src_rect, dst_rect);
return true;
}

@ -61,7 +61,9 @@ public:
void InvalidateRegion(VAddr addr, u64 size) override;
void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
const Tegra::Engines::Fermi2D::Regs::Surface& dst) override;
const Tegra::Engines::Fermi2D::Regs::Surface& dst,
const MathUtil::Rectangle<u32>& src_rect,
const MathUtil::Rectangle<u32>& dst_rect) override;
bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
u32 pixel_stride) override;
bool AccelerateDrawBatch(bool is_indexed) override;

@ -437,7 +437,8 @@ void SwizzleFunc(const MortonSwizzleMode& mode, const SurfaceParams& params,
}
}
static void FastCopySurface(const Surface& src_surface, const Surface& dst_surface) {
void RasterizerCacheOpenGL::FastCopySurface(const Surface& src_surface,
const Surface& dst_surface) {
const auto& src_params{src_surface->GetSurfaceParams()};
const auto& dst_params{dst_surface->GetSurfaceParams()};
@ -447,12 +448,15 @@ static void FastCopySurface(const Surface& src_surface, const Surface& dst_surfa
glCopyImageSubData(src_surface->Texture().handle, SurfaceTargetToGL(src_params.target), 0, 0, 0,
0, dst_surface->Texture().handle, SurfaceTargetToGL(dst_params.target), 0, 0,
0, 0, width, height, 1);
dst_surface->MarkAsModified(true, *this);
}
MICROPROFILE_DEFINE(OpenGL_CopySurface, "OpenGL", "CopySurface", MP_RGB(128, 192, 64));
static void CopySurface(const Surface& src_surface, const Surface& dst_surface,
const GLuint copy_pbo_handle, const GLenum src_attachment = 0,
const GLenum dst_attachment = 0, const std::size_t cubemap_face = 0) {
void RasterizerCacheOpenGL::CopySurface(const Surface& src_surface, const Surface& dst_surface,
const GLuint copy_pbo_handle, const GLenum src_attachment,
const GLenum dst_attachment,
const std::size_t cubemap_face) {
MICROPROFILE_SCOPE(OpenGL_CopySurface);
ASSERT_MSG(dst_attachment == 0, "Unimplemented");
@ -532,6 +536,8 @@ static void CopySurface(const Surface& src_surface, const Surface& dst_surface,
}
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
}
dst_surface->MarkAsModified(true, *this);
}
CachedSurface::CachedSurface(const SurfaceParams& params)
@ -1051,26 +1057,161 @@ void RasterizerCacheOpenGL::FastLayeredCopySurface(const Surface& src_surface,
}
address += layer_size;
}
dst_surface->MarkAsModified(true, *this);
}
static bool BlitSurface(const Surface& src_surface, const Surface& dst_surface,
const MathUtil::Rectangle<u32>& src_rect,
const MathUtil::Rectangle<u32>& dst_rect, GLuint read_fb_handle,
GLuint draw_fb_handle, GLenum src_attachment = 0, GLenum dst_attachment = 0,
std::size_t cubemap_face = 0) {
const auto& src_params{src_surface->GetSurfaceParams()};
const auto& dst_params{dst_surface->GetSurfaceParams()};
OpenGLState prev_state{OpenGLState::GetCurState()};
SCOPE_EXIT({ prev_state.Apply(); });
OpenGLState state;
state.draw.read_framebuffer = read_fb_handle;
state.draw.draw_framebuffer = draw_fb_handle;
state.Apply();
u32 buffers{};
if (src_params.type == SurfaceType::ColorTexture) {
switch (src_params.target) {
case SurfaceTarget::Texture2D:
glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + src_attachment,
GL_TEXTURE_2D, src_surface->Texture().handle, 0);
glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D,
0, 0);
break;
case SurfaceTarget::TextureCubemap:
glFramebufferTexture2D(
GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + src_attachment,
static_cast<GLenum>(GL_TEXTURE_CUBE_MAP_POSITIVE_X + cubemap_face),
src_surface->Texture().handle, 0);
glFramebufferTexture2D(
GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT,
static_cast<GLenum>(GL_TEXTURE_CUBE_MAP_POSITIVE_X + cubemap_face), 0, 0);
break;
case SurfaceTarget::Texture2DArray:
glFramebufferTextureLayer(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + src_attachment,
src_surface->Texture().handle, 0, 0);
glFramebufferTextureLayer(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, 0, 0, 0);
break;
case SurfaceTarget::Texture3D:
glFramebufferTexture3D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + src_attachment,
SurfaceTargetToGL(src_params.target),
src_surface->Texture().handle, 0, 0);
glFramebufferTexture3D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT,
SurfaceTargetToGL(src_params.target), 0, 0, 0);
break;
default:
glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + src_attachment,
GL_TEXTURE_2D, src_surface->Texture().handle, 0);
glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D,
0, 0);
break;
}
switch (dst_params.target) {
case SurfaceTarget::Texture2D:
glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + dst_attachment,
GL_TEXTURE_2D, dst_surface->Texture().handle, 0);
glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D,
0, 0);
break;
case SurfaceTarget::TextureCubemap:
glFramebufferTexture2D(
GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + dst_attachment,
static_cast<GLenum>(GL_TEXTURE_CUBE_MAP_POSITIVE_X + cubemap_face),
dst_surface->Texture().handle, 0);
glFramebufferTexture2D(
GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT,
static_cast<GLenum>(GL_TEXTURE_CUBE_MAP_POSITIVE_X + cubemap_face), 0, 0);
break;
case SurfaceTarget::Texture2DArray:
glFramebufferTextureLayer(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + dst_attachment,
dst_surface->Texture().handle, 0, 0);
glFramebufferTextureLayer(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, 0, 0, 0);
break;
case SurfaceTarget::Texture3D:
glFramebufferTexture3D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + dst_attachment,
SurfaceTargetToGL(dst_params.target),
dst_surface->Texture().handle, 0, 0);
glFramebufferTexture3D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT,
SurfaceTargetToGL(dst_params.target), 0, 0, 0);
break;
default:
glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + dst_attachment,
GL_TEXTURE_2D, dst_surface->Texture().handle, 0);
glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D,
0, 0);
break;
}
buffers = GL_COLOR_BUFFER_BIT;
} else if (src_params.type == SurfaceType::Depth) {
glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + src_attachment,
GL_TEXTURE_2D, 0, 0);
glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D,
src_surface->Texture().handle, 0);
glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0);
glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + dst_attachment,
GL_TEXTURE_2D, 0, 0);
glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D,
dst_surface->Texture().handle, 0);
glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0);
buffers = GL_DEPTH_BUFFER_BIT;
} else if (src_params.type == SurfaceType::DepthStencil) {
glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + src_attachment,
GL_TEXTURE_2D, 0, 0);
glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D,
src_surface->Texture().handle, 0);
glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + dst_attachment,
GL_TEXTURE_2D, 0, 0);
glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D,
dst_surface->Texture().handle, 0);
buffers = GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT;
}
glBlitFramebuffer(src_rect.left, src_rect.top, src_rect.right, src_rect.bottom, dst_rect.left,
dst_rect.top, dst_rect.right, dst_rect.bottom, buffers,
buffers == GL_COLOR_BUFFER_BIT ? GL_LINEAR : GL_NEAREST);
return true;
}
void RasterizerCacheOpenGL::FermiCopySurface(
const Tegra::Engines::Fermi2D::Regs::Surface& src_config,
const Tegra::Engines::Fermi2D::Regs::Surface& dst_config) {
const Tegra::Engines::Fermi2D::Regs::Surface& dst_config,
const MathUtil::Rectangle<u32>& src_rect, const MathUtil::Rectangle<u32>& dst_rect) {
const auto& src_params = SurfaceParams::CreateForFermiCopySurface(src_config);
const auto& dst_params = SurfaceParams::CreateForFermiCopySurface(dst_config);
ASSERT(src_params.width == dst_params.width);
ASSERT(src_params.height == dst_params.height);
ASSERT(src_params.pixel_format == dst_params.pixel_format);
ASSERT(src_params.block_height == dst_params.block_height);
ASSERT(src_params.is_tiled == dst_params.is_tiled);
ASSERT(src_params.depth == dst_params.depth);
ASSERT(src_params.depth == 1); // Currently, FastCopySurface only works with 2D surfaces
ASSERT(src_params.target == dst_params.target);
ASSERT(src_params.rt.index == dst_params.rt.index);
FastCopySurface(GetSurface(src_params, true), GetSurface(dst_params, false));
auto src_surface = GetSurface(src_params, true);
auto dst_surface = GetSurface(dst_params, true);
BlitSurface(src_surface, dst_surface, src_rect, dst_rect, read_framebuffer.handle,
draw_framebuffer.handle);
dst_surface->MarkAsModified(true, *this);
}
void RasterizerCacheOpenGL::AccurateCopySurface(const Surface& src_surface,

@ -423,7 +423,9 @@ public:
/// Copies the contents of one surface to another
void FermiCopySurface(const Tegra::Engines::Fermi2D::Regs::Surface& src_config,
const Tegra::Engines::Fermi2D::Regs::Surface& dst_config);
const Tegra::Engines::Fermi2D::Regs::Surface& dst_config,
const MathUtil::Rectangle<u32>& src_rect,
const MathUtil::Rectangle<u32>& dst_rect);
private:
void LoadSurface(const Surface& surface);
@ -444,6 +446,10 @@ private:
/// Performs a slow but accurate surface copy, flushing to RAM and reinterpreting the data
void AccurateCopySurface(const Surface& src_surface, const Surface& dst_surface);
void FastLayeredCopySurface(const Surface& src_surface, const Surface& dst_surface);
void FastCopySurface(const Surface& src_surface, const Surface& dst_surface);
void CopySurface(const Surface& src_surface, const Surface& dst_surface,
const GLuint copy_pbo_handle, const GLenum src_attachment = 0,
const GLenum dst_attachment = 0, const std::size_t cubemap_face = 0);
/// The surface reserve is a "backup" cache, this is where we put unique surfaces that have
/// previously been used. This is to prevent surfaces from being constantly created and