Pica: Add vertex shader implementation.
parent
d443f0a921
commit
c526512619
@ -0,0 +1,270 @@
|
||||
// Copyright 2014 Citra Emulator Project
|
||||
// Licensed under GPLv2
|
||||
// Refer to the license.txt file included.
|
||||
|
||||
#include "pica.h"
|
||||
#include "vertex_shader.h"
|
||||
#include <core/mem_map.h>
|
||||
#include <common/file_util.h>
|
||||
|
||||
namespace Pica {
|
||||
|
||||
namespace VertexShader {
|
||||
|
||||
// File-local storage for the vertex shader's floating point uniform registers.
static struct {
|
||||
// 96 four-component uniforms. The interpreter reads them as "src1" operands with
// register indices 0x20..0x7F, and GetFloatUniform() hands out mutable references.
Math::Vec4<float24> f[96];
|
||||
} shader_uniforms;
|
||||
|
||||
|
||||
// TODO: Not sure where the shader binary and swizzle patterns are supposed to be loaded to!
|
||||
// For now, we just keep these local arrays around.
|
||||
// Shader program code, one 32-bit instruction per element. Written through
// SubmitShaderMemoryChange() and indexed by the interpreter's program counter.
static u32 shader_memory[1024];
|
||||
// Operand descriptors (swizzle patterns), one per element. Written through
// SubmitSwizzleDataChange() and indexed by an instruction's operand_desc_id.
static u32 swizzle_data[1024];
|
||||
|
||||
// Stores one word of vertex shader program code.
//
// addr:  word index into shader_memory (not a byte offset)
// value: the 32-bit instruction word to store
//
// Writes that fall outside of shader_memory are logged and dropped instead of
// corrupting adjacent memory.
void SubmitShaderMemoryChange(u32 addr, u32 value)
{
    const u32 num_words = sizeof(shader_memory) / sizeof(shader_memory[0]);
    if (addr >= num_words) {
        ERROR_LOG(GPU, "Shader memory write out of bounds: addr=0x%08x, value=0x%08x", addr, value);
        return;
    }

    shader_memory[addr] = value;
}
|
||||
|
||||
// Stores one operand descriptor (swizzle pattern) word.
//
// addr:  word index into swizzle_data (not a byte offset)
// value: the 32-bit swizzle pattern to store
//
// Writes that fall outside of swizzle_data are logged and dropped instead of
// corrupting adjacent memory.
void SubmitSwizzleDataChange(u32 addr, u32 value)
{
    const u32 num_words = sizeof(swizzle_data) / sizeof(swizzle_data[0]);
    if (addr >= num_words) {
        ERROR_LOG(GPU, "Swizzle data write out of bounds: addr=0x%08x, value=0x%08x", addr, value);
        return;
    }

    swizzle_data[addr] = value;
}
|
||||
|
||||
// Returns a mutable reference to the floating point uniform register with the given index.
//
// index must be smaller than the number of uniform registers (96). Since a reference
// has to be returned there is no way to report failure, so an out-of-range index is
// only caught by a debug assertion.
Math::Vec4<float24>& GetFloatUniform(u32 index)
{
    _dbg_assert_(GPU, index < sizeof(shader_uniforms.f) / sizeof(shader_uniforms.f[0]));

    return shader_uniforms.f[index];
}
|
||||
|
||||
// Per-invocation state of the vertex shader interpreter. RunShader() fills it in and
// ProcessShaderCode() mutates it while executing instructions.
struct VertexShaderState {
|
||||
// Points at the instruction word in shader_memory that is executed next.
u32* program_counter;
|
||||
|
||||
// For each of the 16 input registers: pointer to its first (x) component. The
// interpreter reads up to four consecutive float24 values starting there.
const float24* input_register_table[16];
|
||||
// One pointer per output float (7 output registers with 4 components each), pointing
// into the OutputVertex that is being built.
float24* output_register_table[7*4];
|
||||
|
||||
// Temporary (scratch) registers, addressed by source indices 0x10..0x1F.
Math::Vec4<float24> temporary_registers[16];
|
||||
// Cleared to false by RunShader(); not read by any instruction implemented so far.
bool status_registers[2];
|
||||
|
||||
enum {
|
||||
// Sentinel stored in call stack slots that do not hold a return address.
INVALID_ADDRESS = 0xFFFFFFFF
|
||||
};
|
||||
// Return addresses of active CALLs, stored as word indices into shader_memory.
u32 call_stack[8]; // TODO: What is the maximal call stack depth?
|
||||
// Points at the most recently pushed slot of call_stack (slot 0 when no call is active).
u32* call_stack_pointer;
|
||||
};
|
||||
|
||||
// Interprets shader instructions starting at state.program_counter and returns once a RET
// is executed while no CALL is active (i.e. the top of the call stack holds
// VertexShaderState::INVALID_ADDRESS).
//
// For every instruction the operands are decoded using the "common" instruction format and
// the swizzle pattern referenced by operand_desc_id, even for flow control instructions
// which do not use them.
static void ProcessShaderCode(VertexShaderState& state) {
    // One past the last usable call stack slot.
    u32* const call_stack_end = state.call_stack + sizeof(state.call_stack) / sizeof(state.call_stack[0]);

    while (true) {
        bool increment_pc = true;
        bool exit_loop = false;
        const Instruction& instr = *(const Instruction*)state.program_counter;

        // src1 is a 7-bit index: input registers, temporary registers, then float uniforms.
        const float24* src1_ = (instr.common.src1 < 0x10) ? state.input_register_table[instr.common.src1]
                             : (instr.common.src1 < 0x20) ? &state.temporary_registers[instr.common.src1-0x10].x
                             : (instr.common.src1 < 0x80) ? &shader_uniforms.f[instr.common.src1-0x20].x
                             : nullptr;
        // src2 is a 5-bit index: input registers, then temporary registers.
        const float24* src2_ = (instr.common.src2 < 0x10) ? state.input_register_table[instr.common.src2]
                             : &state.temporary_registers[instr.common.src2-0x10].x;

        // "dest" addresses individual floats rather than whole registers.
        // TODO: Unsure about the limit values
        // The output table has 7*4 = 0x1C entries, so 0x1C itself is already out of range.
        // Everything below 0x40 that is not an output is unmapped; including 0x3D..0x3F here
        // keeps (dest-0x40) from wrapping around to a huge temporary register index.
        float24* dest = (instr.common.dest < 0x1C) ? state.output_register_table[instr.common.dest]
                      : (instr.common.dest < 0x40) ? nullptr
                      : (instr.common.dest <= 0x7C) ? &state.temporary_registers[(instr.common.dest-0x40)/4][instr.common.dest%4]
                      : nullptr;

        // Writes to an unmapped destination are redirected to a scratch buffer and thereby
        // discarded, rather than dereferencing a null pointer.
        float24 discarded_result[4];
        if (dest == nullptr)
            dest = discarded_result;

        const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.common.operand_desc_id];

        // Source operands with the swizzle applied
        const float24 src1[4] = {
            src1_[(int)swizzle.GetSelectorSrc1(0)],
            src1_[(int)swizzle.GetSelectorSrc1(1)],
            src1_[(int)swizzle.GetSelectorSrc1(2)],
            src1_[(int)swizzle.GetSelectorSrc1(3)],
        };
        const float24 src2[4] = {
            src2_[(int)swizzle.GetSelectorSrc2(0)],
            src2_[(int)swizzle.GetSelectorSrc2(1)],
            src2_[(int)swizzle.GetSelectorSrc2(2)],
            src2_[(int)swizzle.GetSelectorSrc2(3)],
        };

        switch (instr.opcode) {
            case Instruction::OpCode::ADD:
            {
                for (int i = 0; i < 4; ++i) {
                    if (!swizzle.DestComponentEnabled(i))
                        continue;

                    dest[i] = src1[i] + src2[i];
                }

                break;
            }

            case Instruction::OpCode::MUL:
            {
                for (int i = 0; i < 4; ++i) {
                    if (!swizzle.DestComponentEnabled(i))
                        continue;

                    dest[i] = src1[i] * src2[i];
                }

                break;
            }

            case Instruction::OpCode::DP3:
            case Instruction::OpCode::DP4:
            {
                float24 dot = float24::FromFloat32(0.f);
                int num_components = (instr.opcode == Instruction::OpCode::DP3) ? 3 : 4;
                for (int i = 0; i < num_components; ++i)
                    dot = dot + src1[i] * src2[i];

                // The scalar result goes to every enabled destination component. The number
                // of multiplied components only affects the sum, not the write mask.
                for (int i = 0; i < 4; ++i) {
                    if (!swizzle.DestComponentEnabled(i))
                        continue;

                    dest[i] = dot;
                }
                break;
            }

            // Reciprocal
            case Instruction::OpCode::RCP:
            {
                for (int i = 0; i < 4; ++i) {
                    if (!swizzle.DestComponentEnabled(i))
                        continue;

                    // TODO: Be stable against division by zero!
                    // TODO: I think this might be wrong... we should only use one component here
                    dest[i] = float24::FromFloat32(1.0 / src1[i].ToFloat32());
                }

                break;
            }

            // Reciprocal Square Root
            case Instruction::OpCode::RSQ:
            {
                for (int i = 0; i < 4; ++i) {
                    if (!swizzle.DestComponentEnabled(i))
                        continue;

                    // TODO: Be stable against division by zero!
                    // TODO: I think this might be wrong... we should only use one component here
                    dest[i] = float24::FromFloat32(1.0 / sqrt(src1[i].ToFloat32()));
                }

                break;
            }

            case Instruction::OpCode::MOV:
            {
                for (int i = 0; i < 4; ++i) {
                    if (!swizzle.DestComponentEnabled(i))
                        continue;

                    dest[i] = src1[i];
                }
                break;
            }

            case Instruction::OpCode::RET:
                if (*state.call_stack_pointer == VertexShaderState::INVALID_ADDRESS) {
                    // No CALL is active: this RET terminates the shader.
                    exit_loop = true;
                } else {
                    // Jump back to the CALL instruction (the increment below then moves on to
                    // the instruction following it), invalidate the slot that was just
                    // consumed and pop it. The caller's own slot must stay untouched so that
                    // nested calls can still return.
                    state.program_counter = &shader_memory[*state.call_stack_pointer];
                    *state.call_stack_pointer-- = VertexShaderState::INVALID_ADDRESS;
                }

                break;

            case Instruction::OpCode::CALL:
                increment_pc = false;

                _dbg_assert_(GPU, state.call_stack_pointer + 1 < call_stack_end);

                // Also guard builds without assertions: pushing beyond the last slot would
                // overwrite the members following the call stack.
                if (state.call_stack_pointer + 1 >= call_stack_end) {
                    ERROR_LOG(GPU, "Vertex shader call stack overflow, aborting shader execution");
                    exit_loop = true;
                    break;
                }

                // Remember the word index of this CALL instruction as the return address.
                *++state.call_stack_pointer = state.program_counter - shader_memory;
                // TODO: Does this offset refer to the beginning of shader memory?
                state.program_counter = &shader_memory[instr.flow_control.offset_words];
                break;

            case Instruction::OpCode::FLS:
                // TODO: Do whatever needs to be done here?
                break;

            default:
                ERROR_LOG(GPU, "Unhandled instruction: 0x%02x (%s): 0x%08x",
                          (int)instr.opcode.Value(), instr.GetOpCodeName().c_str(), instr.hex);
                break;
        }

        if (increment_pc)
            ++state.program_counter;

        if (exit_loop)
            break;
    }
}
|
||||
|
||||
// Runs the vertex shader program on a single vertex.
//
// input:          attribute data of the vertex
// num_attributes: number of entries of input.attr that are valid; only those are bound to
//                 input registers (according to registers.vs_input_register_map)
//
// Returns the vertex assembled from the shader's output registers according to
// registers.vs_output_attributes. Members the shader did not write are zero-initialized.
OutputVertex RunShader(const InputVertex& input, int num_attributes)
{
    VertexShaderState state;

    // Execution starts at the configured entry point
    state.program_counter = &shader_memory[registers.vs_main_offset];

    // Setup input register table
    const auto& attribute_register_map = registers.vs_input_register_map;

    // Registers without a bound attribute read from this dummy. It needs four initialized
    // components because the interpreter applies a four-component swizzle to every source.
    const float24 zero = float24::FromFloat32(0.f);
    const float24 dummy_register[4] = { zero, zero, zero, zero };
    std::fill(&state.input_register_table[0], &state.input_register_table[16], &dummy_register[0]);

    if (num_attributes > 0) state.input_register_table[attribute_register_map.attribute0_register] = &input.attr[0].x;
    if (num_attributes > 1) state.input_register_table[attribute_register_map.attribute1_register] = &input.attr[1].x;
    if (num_attributes > 2) state.input_register_table[attribute_register_map.attribute2_register] = &input.attr[2].x;
    if (num_attributes > 3) state.input_register_table[attribute_register_map.attribute3_register] = &input.attr[3].x;
    if (num_attributes > 4) state.input_register_table[attribute_register_map.attribute4_register] = &input.attr[4].x;
    if (num_attributes > 5) state.input_register_table[attribute_register_map.attribute5_register] = &input.attr[5].x;
    if (num_attributes > 6) state.input_register_table[attribute_register_map.attribute6_register] = &input.attr[6].x;
    if (num_attributes > 7) state.input_register_table[attribute_register_map.attribute7_register] = &input.attr[7].x;
    if (num_attributes > 8) state.input_register_table[attribute_register_map.attribute8_register] = &input.attr[8].x;
    if (num_attributes > 9) state.input_register_table[attribute_register_map.attribute9_register] = &input.attr[9].x;
    if (num_attributes > 10) state.input_register_table[attribute_register_map.attribute10_register] = &input.attr[10].x;
    if (num_attributes > 11) state.input_register_table[attribute_register_map.attribute11_register] = &input.attr[11].x;
    if (num_attributes > 12) state.input_register_table[attribute_register_map.attribute12_register] = &input.attr[12].x;
    if (num_attributes > 13) state.input_register_table[attribute_register_map.attribute13_register] = &input.attr[13].x;
    if (num_attributes > 14) state.input_register_table[attribute_register_map.attribute14_register] = &input.attr[14].x;
    if (num_attributes > 15) state.input_register_table[attribute_register_map.attribute15_register] = &input.attr[15].x;

    // Setup output register table
    // Value-initialize the result so that members the shader never writes are not
    // indeterminate when they are logged below and handed to the caller.
    OutputVertex ret{};
    for (int i = 0; i < 7; ++i) {
        const auto& output_register_map = registers.vs_output_attributes[i];

        // Each component of an output register is mapped to the float24 at offset
        // "semantic" from the start of OutputVertex.
        u32 semantics[4] = {
            output_register_map.map_x, output_register_map.map_y,
            output_register_map.map_z, output_register_map.map_w
        };

        for (int comp = 0; comp < 4; ++comp)
            state.output_register_table[4*i+comp] = ((float24*)&ret) + semantics[comp];
    }

    state.status_registers[0] = false;
    state.status_registers[1] = false;

    // An all-invalid call stack makes the first RET outside of any CALL end execution
    std::fill(state.call_stack, state.call_stack + sizeof(state.call_stack) / sizeof(state.call_stack[0]),
              VertexShaderState::INVALID_ADDRESS);
    state.call_stack_pointer = &state.call_stack[0];

    ProcessShaderCode(state);

    DEBUG_LOG(GPU, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)",
              ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(),
              ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(),
              ret.tc0.u().ToFloat32(), ret.tc0.v().ToFloat32());

    return ret;
}
|
||||
|
||||
|
||||
} // namespace
|
||||
|
||||
} // namespace
|
@ -0,0 +1,211 @@
|
||||
// Copyright 2014 Citra Emulator Project
|
||||
// Licensed under GPLv2
|
||||
// Refer to the license.txt file included.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <initializer_list>
|
||||
|
||||
#include <common/common_types.h>
|
||||
|
||||
#include "math.h"
|
||||
#include "pica.h"
|
||||
|
||||
namespace Pica {
|
||||
|
||||
namespace VertexShader {
|
||||
|
||||
// Input of a vertex shader invocation.
struct InputVertex {
|
||||
// Up to 16 four-component vertex attributes. RunShader() binds the first
// num_attributes entries to shader input registers.
Math::Vec4<float24> attr[16];
|
||||
};
|
||||
|
||||
struct OutputVertex {
|
||||
OutputVertex() = default;
|
||||
|
||||
// VS output attributes
|
||||
Math::Vec4<float24> pos;
|
||||
Math::Vec4<float24> dummy; // quaternions (not implemented, yet)
|
||||
Math::Vec4<float24> color;
|
||||
Math::Vec2<float24> tc0;
|
||||
float24 tc0_v;
|
||||
|
||||
// Padding for optimal alignment
|
||||
float24 pad[14];
|
||||
|
||||
// Attributes used to store intermediate results
|
||||
|
||||
// position after perspective divide
|
||||
Math::Vec3<float24> screenpos;
|
||||
|
||||
// Linear interpolation
|
||||
// factor: 0=this, 1=vtx
|
||||
void Lerp(float24 factor, const OutputVertex& vtx) {
|
||||
pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor);
|
||||
|
||||
// TODO: Should perform perspective correct interpolation here...
|
||||
tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor);
|
||||
|
||||
screenpos = screenpos * factor + vtx.screenpos * (float24::FromFloat32(1) - factor);
|
||||
|
||||
color = color * factor + vtx.color * (float24::FromFloat32(1) - factor);
|
||||
}
|
||||
|
||||
// Linear interpolation
|
||||
// factor: 0=v0, 1=v1
|
||||
static OutputVertex Lerp(float24 factor, const OutputVertex& v0, const OutputVertex& v1) {
|
||||
OutputVertex ret = v0;
|
||||
ret.Lerp(factor, v1);
|
||||
return ret;
|
||||
}
|
||||
};
|
||||
static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD");
|
||||
|
||||
union Instruction {
|
||||
enum class OpCode : u32 {
|
||||
ADD = 0x0,
|
||||
DP3 = 0x1,
|
||||
DP4 = 0x2,
|
||||
|
||||
MUL = 0x8,
|
||||
|
||||
MAX = 0xC,
|
||||
MIN = 0xD,
|
||||
RCP = 0xE,
|
||||
RSQ = 0xF,
|
||||
|
||||
MOV = 0x13,
|
||||
|
||||
RET = 0x21,
|
||||
FLS = 0x22, // Flush
|
||||
CALL = 0x24,
|
||||
};
|
||||
|
||||
std::string GetOpCodeName() const {
|
||||
std::map<OpCode, std::string> map = {
|
||||
{ OpCode::ADD, "ADD" },
|
||||
{ OpCode::DP3, "DP3" },
|
||||
{ OpCode::DP4, "DP4" },
|
||||
{ OpCode::MUL, "MUL" },
|
||||
{ OpCode::MAX, "MAX" },
|
||||
{ OpCode::MIN, "MIN" },
|
||||
{ OpCode::RCP, "RCP" },
|
||||
{ OpCode::RSQ, "RSQ" },
|
||||
{ OpCode::MOV, "MOV" },
|
||||
{ OpCode::RET, "RET" },
|
||||
{ OpCode::FLS, "FLS" },
|
||||
};
|
||||
auto it = map.find(opcode);
|
||||
if (it == map.end())
|
||||
return "UNK";
|
||||
else
|
||||
return it->second;
|
||||
}
|
||||
|
||||
u32 hex;
|
||||
|
||||
BitField<0x1a, 0x6, OpCode> opcode;
|
||||
|
||||
// General notes:
|
||||
//
|
||||
// When two input registers are used, one of them uses a 5-bit index while the other
|
||||
// one uses a 7-bit index. This is because at most one floating point uniform may be used
|
||||
// as an input.
|
||||
|
||||
|
||||
// Format used e.g. by arithmetic instructions and comparisons
|
||||
// "src1" and "src2" specify register indices (i.e. indices referring to groups of 4 floats),
|
||||
// while "dest" addresses individual floats.
|
||||
union {
|
||||
BitField<0x00, 0x5, u32> operand_desc_id;
|
||||
BitField<0x07, 0x5, u32> src2;
|
||||
BitField<0x0c, 0x7, u32> src1;
|
||||
BitField<0x13, 0x7, u32> dest;
|
||||
} common;
|
||||
|
||||
// Format used for flow control instructions ("if")
|
||||
union {
|
||||
BitField<0x00, 0x8, u32> num_instructions;
|
||||
BitField<0x0a, 0xc, u32> offset_words;
|
||||
} flow_control;
|
||||
};
|
||||
|
||||
union SwizzlePattern {
|
||||
u32 hex;
|
||||
|
||||
enum class Selector : u32 {
|
||||
x = 0,
|
||||
y = 1,
|
||||
z = 2,
|
||||
w = 3
|
||||
};
|
||||
|
||||
Selector GetSelectorSrc1(int comp) const {
|
||||
Selector selectors[] = {
|
||||
src1_selector_0, src1_selector_1, src1_selector_2, src1_selector_3
|
||||
};
|
||||
return selectors[comp];
|
||||
}
|
||||
|
||||
Selector GetSelectorSrc2(int comp) const {
|
||||
Selector selectors[] = {
|
||||
src2_selector_0, src2_selector_1, src2_selector_2, src2_selector_3
|
||||
};
|
||||
return selectors[comp];
|
||||
}
|
||||
|
||||
bool DestComponentEnabled(int i) const {
|
||||
return (dest_mask & (0x8 >> i));
|
||||
}
|
||||
|
||||
std::string SelectorToString(bool src2) const {
|
||||
std::map<Selector, std::string> map = {
|
||||
{ Selector::x, "x" },
|
||||
{ Selector::y, "y" },
|
||||
{ Selector::z, "z" },
|
||||
{ Selector::w, "w" }
|
||||
};
|
||||
std::string ret;
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
ret += map.at(src2 ? GetSelectorSrc2(i) : GetSelectorSrc1(i));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
std::string DestMaskToString() const {
|
||||
std::string ret;
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
if (!DestComponentEnabled(i))
|
||||
ret += "_";
|
||||
else
|
||||
ret += "xyzw"[i];
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Components of "dest" that should be written to: LSB=dest.w, MSB=dest.x
|
||||
BitField< 0, 4, u32> dest_mask;
|
||||
|
||||
BitField< 5, 2, Selector> src1_selector_3;
|
||||
BitField< 7, 2, Selector> src1_selector_2;
|
||||
BitField< 9, 2, Selector> src1_selector_1;
|
||||
BitField<11, 2, Selector> src1_selector_0;
|
||||
|
||||
BitField<14, 2, Selector> src2_selector_3;
|
||||
BitField<16, 2, Selector> src2_selector_2;
|
||||
BitField<18, 2, Selector> src2_selector_1;
|
||||
BitField<20, 2, Selector> src2_selector_0;
|
||||
|
||||
BitField<31, 1, u32> flag; // not sure what this means, maybe it's the sign?
|
||||
};
|
||||
|
||||
// Stores the instruction word "value" at word index "addr" of the shader program memory.
void SubmitShaderMemoryChange(u32 addr, u32 value);
|
||||
// Stores the swizzle pattern word "value" at word index "addr" of the swizzle data memory.
void SubmitSwizzleDataChange(u32 addr, u32 value);
|
||||
|
||||
// Executes the vertex shader for one vertex. The first "num_attributes" entries of
// input.attr are bound to shader input registers; the assembled output vertex is returned.
OutputVertex RunShader(const InputVertex& input, int num_attributes);
|
||||
|
||||
// Returns a mutable reference to floating point uniform register "index" (96 registers exist).
Math::Vec4<float24>& GetFloatUniform(u32 index);
|
||||
|
||||
} // namespace
|
||||
|
||||
} // namespace
|
||||
|
Loading…
Reference in New Issue