Merge pull request #2609 from FernandoS27/new-scan

Implement a New Shader Scanner, Decompile Flow Stack and implement BRX BRA.CC
2019-07-11 17:36:23 +07:00 · 2019-07-11 17:36:23 +07:00 · bb67091c77
parent 79c382fafd f2549739d1
commit bb67091c77
16 changed files with 778 additions and 124 deletions
--- a/CMakeModules/GenerateSCMRev.cmake
+++ b/CMakeModules/GenerateSCMRev.cmake
@ -82,6 +82,8 @@ set(HASH_FILES
    "${VIDEO_CORE}/shader/decode/shift.cpp"
    "${VIDEO_CORE}/shader/decode/video.cpp"
    "${VIDEO_CORE}/shader/decode/xmad.cpp"
    "${VIDEO_CORE}/shader/control_flow.cpp"
    "${VIDEO_CORE}/shader/control_flow.h"
    "${VIDEO_CORE}/shader/decode.cpp"
    "${VIDEO_CORE}/shader/node.h"
    "${VIDEO_CORE}/shader/node_helper.cpp"
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@ -56,6 +56,8 @@ add_custom_command(OUTPUT scm_rev.cpp
      "${VIDEO_CORE}/shader/decode/shift.cpp"
      "${VIDEO_CORE}/shader/decode/video.cpp"
      "${VIDEO_CORE}/shader/decode/xmad.cpp"
      "${VIDEO_CORE}/shader/control_flow.cpp"
      "${VIDEO_CORE}/shader/control_flow.h"
      "${VIDEO_CORE}/shader/decode.cpp"
      "${VIDEO_CORE}/shader/node.h"
      "${VIDEO_CORE}/shader/node_helper.cpp"
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@ -103,6 +103,8 @@ add_library(video_core STATIC
    shader/decode/video.cpp
    shader/decode/xmad.cpp
    shader/decode/other.cpp
    shader/control_flow.cpp
    shader/control_flow.h
    shader/decode.cpp
    shader/node_helper.cpp
    shader/node_helper.h
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@ -1367,6 +1367,20 @@ union Instruction {
        }
    } bra;
    // Fields of the BRX instruction encoding.
    union {
        BitField<20, 24, u64> target;
        BitField<5, 1, u64> constant_buffer;
        /// Returns the sign-extended 24-bit `target` field divided by the instruction size,
        /// plus one.
        s32 GetBranchExtend() const {
            // Sign extend the branch target offset
            u32 mask = 1U << (24 - 1);
            u32 value = static_cast<u32>(target);
            // The branch offset is relative to the next instruction and is stored in bytes, so
            // divide it by the size of an instruction and add 1 to it.
            // NOTE(review): the s32 dividend is divided by sizeof(), whose type is the unsigned
            // std::size_t, so the division is carried out in unsigned arithmetic before the
            // result is converted back to s32 -- confirm this is intended for negative offsets.
            return static_cast<s32>((value ^ mask) - mask) / sizeof(Instruction) + 1;
        }
    } brx;
    union {
        BitField<39, 1, u64> emit; // EmitVertex
        BitField<40, 1, u64> cut;  // EndPrimitive
@ -1464,6 +1478,7 @@ public:
        BFE_IMM,
        BFI_IMM_R,
        BRA,
        BRX,
        PBK,
        LD_A,
        LD_L,
@ -1738,6 +1753,7 @@ private:
            INST("111000101001----", Id::SSY, Type::Flow, "SSY"),
            INST("111000101010----", Id::PBK, Type::Flow, "PBK"),
            INST("111000100100----", Id::BRA, Type::Flow, "BRA"),
            INST("111000100101----", Id::BRX, Type::Flow, "BRX"),
            INST("1111000011111---", Id::SYNC, Type::Flow, "SYNC"),
            INST("111000110100---", Id::BRK, Type::Flow, "BRK"),
            INST("111000110000----", Id::EXIT, Type::Flow, "EXIT"),
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@ -129,9 +129,11 @@ std::size_t CalculateProgramSize(const GLShader::ProgramCode& program) {
 /// Hashes one (or two) program streams
 u64 GetUniqueIdentifier(Maxwell::ShaderProgram program_type, const ProgramCode& code,
-                        const ProgramCode& code_b) {
+                        const ProgramCode& code_b, std::size_t size_a = 0, std::size_t size_b = 0) {
-    u64 unique_identifier =
+    if (size_a == 0) {
-        Common::CityHash64(reinterpret_cast<const char*>(code.data()), CalculateProgramSize(code));
+        size_a = CalculateProgramSize(code);
    }
    u64 unique_identifier = Common::CityHash64(reinterpret_cast<const char*>(code.data()), size_a);
    if (program_type != Maxwell::ShaderProgram::VertexA) {
        return unique_identifier;
    }
@ -140,8 +142,11 @@ u64 GetUniqueIdentifier(Maxwell::ShaderProgram program_type, const ProgramCode&
    std::size_t seed = 0;
    boost::hash_combine(seed, unique_identifier);
-    const u64 identifier_b = Common::CityHash64(reinterpret_cast<const char*>(code_b.data()),
+    if (size_b == 0) {
-                                                CalculateProgramSize(code_b));
+        size_b = CalculateProgramSize(code_b);
    }
    const u64 identifier_b =
        Common::CityHash64(reinterpret_cast<const char*>(code_b.data()), size_b);
    boost::hash_combine(seed, identifier_b);
    return static_cast<u64>(seed);
 }
@ -150,14 +155,17 @@ u64 GetUniqueIdentifier(Maxwell::ShaderProgram program_type, const ProgramCode&
 GLShader::ProgramResult CreateProgram(const Device& device, Maxwell::ShaderProgram program_type,
                                      ProgramCode program_code, ProgramCode program_code_b) {
    GLShader::ShaderSetup setup(program_code);
    setup.program.size_a = CalculateProgramSize(program_code);
    setup.program.size_b = 0;
    if (program_type == Maxwell::ShaderProgram::VertexA) {
        // VertexB is always enabled, so when VertexA is enabled, we have two vertex shaders.
        // Conventional HW does not support this, so we combine VertexA and VertexB into one
        // stage here.
        setup.SetProgramB(program_code_b);
        setup.program.size_b = CalculateProgramSize(program_code_b);
    }
-    setup.program.unique_identifier =
+    setup.program.unique_identifier = GetUniqueIdentifier(
-        GetUniqueIdentifier(program_type, program_code, program_code_b);
+        program_type, program_code, program_code_b, setup.program.size_a, setup.program.size_b);
    switch (program_type) {
    case Maxwell::ShaderProgram::VertexA:
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@ -191,10 +191,12 @@ public:
        // TODO(Subv): Figure out the actual depth of the flow stack, for now it seems
        // unlikely that shaders will use 20 nested SSYs and PBKs.
-        constexpr u32 FLOW_STACK_SIZE = 20;
+        if (!ir.IsFlowStackDisabled()) {
-        for (const auto stack : std::array{MetaStackClass::Ssy, MetaStackClass::Pbk}) {
+            constexpr u32 FLOW_STACK_SIZE = 20;
-            code.AddLine("uint {}[{}];", FlowStackName(stack), FLOW_STACK_SIZE);
+            for (const auto stack : std::array{MetaStackClass::Ssy, MetaStackClass::Pbk}) {
-            code.AddLine("uint {} = 0u;", FlowStackTopName(stack));
+                code.AddLine("uint {}[{}];", FlowStackName(stack), FLOW_STACK_SIZE);
                code.AddLine("uint {} = 0u;", FlowStackTopName(stack));
            }
        }
        code.AddLine("while (true) {{");
@ -1555,6 +1557,14 @@ private:
        return {};
    }
    /// Emits GLSL that assigns operand 0, visited as an unsigned integer, to jmp_to and then
    /// breaks.
    std::string BranchIndirect(Operation operation) {
        const std::string target_address = VisitOperand(operation, 0, Type::Uint);
        code.AddLine("jmp_to = {};", target_address);
        code.AddLine("break;");
        return std::string{};
    }
    std::string PushFlowStack(Operation operation) {
        const auto stack = std::get<MetaStackClass>(operation.GetMeta());
        const auto target = std::get_if<ImmediateNode>(&*operation[0]);
@ -1789,6 +1799,7 @@ private:
        &GLSLDecompiler::ImageStore,
        &GLSLDecompiler::Branch,
        &GLSLDecompiler::BranchIndirect,
        &GLSLDecompiler::PushFlowStack,
        &GLSLDecompiler::PopFlowStack,
        &GLSLDecompiler::Exit,
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@ -29,14 +29,14 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform vs_config {
 };
 )";
-    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
+    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a);
    ProgramResult program =
        Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Vertex, "vertex");
    out += program.first;
    if (setup.IsDualProgram()) {
-        const ShaderIR program_ir_b(setup.program.code_b, PROGRAM_OFFSET);
+        const ShaderIR program_ir_b(setup.program.code_b, PROGRAM_OFFSET, setup.program.size_b);
        ProgramResult program_b =
            Decompile(device, program_ir_b, Maxwell3D::Regs::ShaderStage::Vertex, "vertex_b");
@ -80,7 +80,7 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform gs_config {
 };
 )";
-    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
+    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a);
    ProgramResult program =
        Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Geometry, "geometry");
    out += program.first;
@ -115,7 +115,7 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform fs_config {
 };
 )";
-    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
+    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a);
    ProgramResult program =
        Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Fragment, "fragment");
--- a/src/video_core/renderer_opengl/gl_shader_gen.h
+++ b/src/video_core/renderer_opengl/gl_shader_gen.h
@ -27,6 +27,8 @@ struct ShaderSetup {
        ProgramCode code;
        ProgramCode code_b; // Used for dual vertex shaders
        u64 unique_identifier;
        std::size_t size_a;
        std::size_t size_b;
    } program;
    /// Used in scenarios where we have a dual vertex shaders
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@ -949,6 +949,14 @@ private:
        return {};
    }
    /// Stores operand 0, visited as Type::Uint, into jmp_to and then emits a branch to
    /// continue_label through BranchingOp.
    Id BranchIndirect(Operation operation) {
        const Id target_address = VisitOperand<Type::Uint>(operation, 0);
        Emit(OpStore(jmp_to, target_address));
        BranchingOp([&] { Emit(OpBranch(continue_label)); });
        return {};
    }
    Id PushFlowStack(Operation operation) {
        const auto target = std::get_if<ImmediateNode>(&*operation[0]);
        ASSERT(target);
@ -1334,6 +1342,7 @@ private:
        &SPIRVDecompiler::ImageStore,
        &SPIRVDecompiler::Branch,
        &SPIRVDecompiler::BranchIndirect,
        &SPIRVDecompiler::PushFlowStack,
        &SPIRVDecompiler::PopFlowStack,
        &SPIRVDecompiler::Exit,
--- a/src/video_core/shader/control_flow.cpp
+++ b/src/video_core/shader/control_flow.cpp
@ -0,0 +1,476 @@
 // Copyright 2019 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 #include <list>
 #include <map>
 #include <stack>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "video_core/shader/control_flow.h"
 #include "video_core/shader/shader_ir.h"
 namespace VideoCommon::Shader {
 using Tegra::Shader::Instruction;
 using Tegra::Shader::OpCode;
 // Placeholder branch address given to SYNC and BRK branches while parsing; TryQuery replaces it
 // with the target taken from the top of the SSY or PBK stack.
 constexpr s32 unassigned_branch = -2;
 /// A pending flow-stack query: the address to examine together with the SSY and PBK stacks
 /// carried to that address.
 struct Query {
    u32 address = 0;
    std::stack<u32> ssy_stack;
    std::stack<u32> pbk_stack;
 };
 /// Snapshot of the SSY and PBK stacks stored for a block the first time a query visits it.
 /// Copy and move operations are the implicit ones.
 struct BlockStack {
    BlockStack() = default;
    /// Captures the stacks carried by a query. Explicit so that a Query is never silently
    /// converted into a BlockStack.
    explicit BlockStack(const Query& q) : ssy_stack{q.ssy_stack}, pbk_stack{q.pbk_stack} {}
    std::stack<u32> ssy_stack{};
    std::stack<u32> pbk_stack{};
 };
 /// Describes how control leaves a block.
 struct BlockBranchInfo {
    Condition condition;         // predicate and condition code guarding the branch
    s32 address = exit_branch;   // branch target, or one of the negative sentinel addresses
    bool kill = false;           // set when the block ends in KIL
    bool is_sync = false;        // set when the block ends in SYNC
    bool is_brk = false;         // set when the block ends in BRK
    bool ignore = false;         // set when the block merely runs into the block at `address`
 };
 /// A block of instructions covering the inclusive offset range [start, end].
 struct BlockInfo {
    u32 start = 0;
    u32 end = 0;
    bool visited = false; // set by TryQuery once the block's stacks have been recorded
    BlockBranchInfo branch;
    /// Returns true when `address` lies within [start, end], both bounds included.
    bool IsInside(const u32 address) const {
        return address >= start && address <= end;
    }
 };
 /// Working state shared by the block discovery and flow-stack query passes of ScanFlow.
 struct CFGRebuildState {
    // The initializer list follows the member declaration order below.
    explicit CFGRebuildState(const ProgramCode& program_code, const std::size_t program_size,
                             const u32 start)
        : start(start), program_code(program_code), program_size(program_size) {}
    u32 start{};                                // offset of the first instruction to scan
    std::vector<BlockInfo> block_info;          // every block discovered so far
    std::list<u32> inspect_queries;             // addresses still waiting to be parsed
    std::list<Query> queries;                   // pending flow-stack queries
    std::unordered_map<u32, u32> registered;    // block start address -> index in block_info
    std::unordered_set<u32> labels;             // addresses that are jump targets
    std::map<u32, u32> ssy_labels;              // SSY instruction offset -> its target
    std::map<u32, u32> pbk_labels;              // PBK instruction offset -> its target
    std::unordered_map<u32, BlockStack> stacks; // stacks recorded per visited block address
    const ProgramCode& program_code;
    const std::size_t program_size;
 };
 /// Relation between a queried address and the blocks discovered so far (see TryGetBlock).
 enum class BlockCollision : u32 {
    None,   // no block contains the address
    Found,  // a block starts exactly at the address
    Inside, // the address lies within a block, past its start
 };
 /// Looks for the first block, in creation order, that starts at or contains `address`.
 /// Returns the kind of collision and the index of that block; the index is u32(-1) when no
 /// block matches.
 std::pair<BlockCollision, u32> TryGetBlock(CFGRebuildState& state, u32 address) {
    u32 index = 0;
    for (const BlockInfo& block : state.block_info) {
        // The exact-start test must come first because IsInside also accepts the start address.
        if (block.start == address) {
            return {BlockCollision::Found, index};
        }
        if (block.IsInside(address)) {
            return {BlockCollision::Inside, index};
        }
        ++index;
    }
    return {BlockCollision::None, static_cast<u32>(-1)};
 }
 /// Result payload of ParseCode: how the parsed block ends and the offset of its last
 /// instruction.
 struct ParseInfo {
    BlockBranchInfo branch_info;
    u32 end_address = 0;
 };
 /// Appends a block covering [start, end] and registers its start address under the new index.
 /// The returned reference is invalidated by the next call, since the vector may reallocate.
 BlockInfo& CreateBlockInfo(CFGRebuildState& state, u32 start, u32 end) {
    const u32 new_index = static_cast<u32>(state.block_info.size());
    BlockInfo& block = state.block_info.emplace_back();
    block.start = start;
    block.end = end;
    // insert() keeps an existing entry for `start` untouched.
    state.registered.insert({start, new_index});
    return block;
 }
 /// Maps a predicate index and its negation flag to a Pred value; negated predicates are
 /// offset by 8.
 Pred GetPredicate(u32 index, bool negated) {
    const u32 negation_offset = negated ? 8u : 0u;
    return static_cast<Pred>(index + negation_offset);
 }
 /**
 * Tells whether the instruction at `offset` is a 'sched' instruction.
 * A sched instruction precedes every group of 3 instructions, so it occupies every fourth
 * slot counted from `main_offset`.
 */
 constexpr bool IsSchedInstruction(u32 offset, u32 main_offset) {
    constexpr u32 sched_period = 4;
    return ((offset - main_offset) % sched_period) == 0;
 }
 /// Outcome of ParseCode.
 enum class ParseResult : u32 {
    ControlCaught, // a flow instruction (EXIT, BRA, SYNC, BRK or KIL) ends the block
    BlockEnd,      // the block ran into a registered block or past the end of the program
    AbnormalFlow,  // BRX, or BRA with the constant buffer bit set, was found
 };
 /**
 * Scans forward from `address` until the instruction that ends the block.
 *
 * Returns ControlCaught with the branch description when an EXIT, BRA, SYNC, BRK or KIL that
 * can execute is found, BlockEnd when the scan runs into an already registered block or past
 * the end of the program, and AbnormalFlow for BRX or a BRA with the constant buffer bit set.
 * SSY and PBK do not end the block: their targets are recorded as labels in `state`.
 */
 std::pair<ParseResult, ParseInfo> ParseCode(CFGRebuildState& state, u32 address) {
    u32 offset = address;
    const u32 end_address = static_cast<u32>(state.program_size / sizeof(Instruction));
    ParseInfo parse_info{};
    auto& branch_info = parse_info.branch_info;

    // Records a jump target; targets seen for the first time are queued for inspection.
    const auto insert_label = [&state](u32 label) {
        if (state.labels.emplace(label).second) {
            state.inspect_queries.push_back(label);
        }
    };

    // Decodes the predicate and condition code guarding `instr` into the branch condition.
    // Returns false when the instruction can never execute. The fields written before that
    // decision are left in place, and the condition code is not touched when the predicate
    // alone already rules the instruction out.
    const auto parse_condition = [&branch_info](const Instruction& instr) {
        const auto pred_index = static_cast<u32>(instr.pred.pred_index);
        branch_info.condition.predicate = GetPredicate(pred_index, instr.negate_pred != 0);
        if (branch_info.condition.predicate == Pred::NeverExecute) {
            return false;
        }
        const ConditionCode cc = instr.flow_condition_code;
        branch_info.condition.cc = cc;
        return cc != ConditionCode::F;
    };

    // Fills in the branch that ends the block at the current offset and builds the result.
    const auto control_caught = [&](s32 target, bool kill, bool is_sync, bool is_brk) {
        branch_info.address = target;
        branch_info.kill = kill;
        branch_info.is_sync = is_sync;
        branch_info.is_brk = is_brk;
        branch_info.ignore = false;
        parse_info.end_address = offset;
        return std::pair<ParseResult, ParseInfo>{ParseResult::ControlCaught, parse_info};
    };

    while (true) {
        if (offset >= end_address) {
            // ASSERT_OR_EXECUTE can't be used, as it ignores the break
            ASSERT_MSG(false, "Shader passed the current limit!");
            branch_info.address = exit_branch;
            branch_info.ignore = false;
            break;
        }
        if (state.registered.count(offset) != 0) {
            // Ran into the start of an existing block: fall through into it.
            branch_info.address = offset;
            branch_info.ignore = true;
            break;
        }
        if (IsSchedInstruction(offset, state.start)) {
            offset++;
            continue;
        }
        const Instruction instr = {state.program_code[offset]};
        const auto opcode = OpCode::Decode(instr);
        if (!opcode || opcode->get().GetType() != OpCode::Type::Flow) {
            offset++;
            continue;
        }
        // A case that leaves the switch without returning skips the instruction.
        switch (opcode->get().GetId()) {
        case OpCode::Id::EXIT: {
            if (parse_condition(instr)) {
                return control_caught(exit_branch, false, false, false);
            }
            break;
        }
        case OpCode::Id::BRA: {
            if (instr.bra.constant_buffer != 0) {
                return {ParseResult::AbnormalFlow, parse_info};
            }
            if (parse_condition(instr)) {
                const u32 branch_offset = offset + instr.bra.GetBranchTarget();
                insert_label(branch_offset);
                // A branch to offset 0 is recorded as an exit.
                const s32 target =
                    branch_offset == 0 ? exit_branch : static_cast<s32>(branch_offset);
                return control_caught(target, false, false, false);
            }
            break;
        }
        case OpCode::Id::SYNC: {
            if (parse_condition(instr)) {
                // The real target is resolved later from the SSY stack.
                return control_caught(unassigned_branch, false, true, false);
            }
            break;
        }
        case OpCode::Id::BRK: {
            if (parse_condition(instr)) {
                // The real target is resolved later from the PBK stack.
                return control_caught(unassigned_branch, false, false, true);
            }
            break;
        }
        case OpCode::Id::KIL: {
            if (parse_condition(instr)) {
                return control_caught(exit_branch, true, false, false);
            }
            break;
        }
        case OpCode::Id::SSY: {
            const u32 target = offset + instr.bra.GetBranchTarget();
            insert_label(target);
            state.ssy_labels.emplace(offset, target);
            break;
        }
        case OpCode::Id::PBK: {
            const u32 target = offset + instr.bra.GetBranchTarget();
            insert_label(target);
            state.pbk_labels.emplace(offset, target);
            break;
        }
        case OpCode::Id::BRX: {
            return {ParseResult::AbnormalFlow, parse_info};
        }
        default:
            break;
        }
        offset++;
    }
    branch_info.kill = false;
    branch_info.is_sync = false;
    branch_info.is_brk = false;
    parse_info.end_address = offset - 1;
    return {ParseResult::BlockEnd, parse_info};
 }
 /// Takes the next address from the inspection queue and makes sure a block starts there,
 /// splitting an existing block or parsing a new one as needed.
 /// Returns false when the queue is empty or parsing hits abnormal flow.
 bool TryInspectAddress(CFGRebuildState& state) {
    if (state.inspect_queries.empty()) {
        return false;
    }
    const u32 address = state.inspect_queries.front();
    state.inspect_queries.pop_front();

    const auto [collision, index] = TryGetBlock(state, address);
    if (collision == BlockCollision::Found) {
        // A block already starts here; nothing to do.
        return true;
    }
    if (collision == BlockCollision::Inside) {
        // This case is the tricky one:
        // We need to split the block into 2 separate blocks.
        // Read the old end before CreateBlockInfo, which may reallocate the block vector, and
        // only take the reference to the existing block afterwards.
        const u32 old_end = state.block_info[index].end;
        BlockInfo& tail = CreateBlockInfo(state, address, old_end);
        BlockInfo& head = state.block_info[index];
        head.end = address - 1;
        // The original branch now belongs to the tail; the head falls through into it.
        tail.branch = head.branch;
        BlockBranchInfo fallthrough{};
        fallthrough.address = address;
        fallthrough.ignore = true;
        head.branch = fallthrough;
        return true;
    }

    const auto [parse_result, parse_info] = ParseCode(state, address);
    if (parse_result == ParseResult::AbnormalFlow) {
        // Abnormal flow ends the CFG reconstruction.
        return false;
    }
    BlockInfo& block = CreateBlockInfo(state, address, parse_info.end_address);
    block.branch = parse_info.branch_info;
    if (!parse_info.branch_info.condition.IsUnconditional()) {
        // A conditional branch may not be taken, so the following address is inspected next.
        state.inspect_queries.push_front(parse_info.end_address + 1);
    }
    return true;
 }
 /**
 * Processes the next flow-stack query.
 *
 * For a block that was already visited, checks that the stacks carried by the query match the
 * ones recorded for it. For a new block, records the stacks, pushes the SSY/PBK targets found
 * inside the block, resolves a pending SYNC/BRK target from the matching stack and schedules
 * queries for the block's successors.
 *
 * Returns false when there is no query left, when the stacks do not match, or when a SYNC or
 * BRK is reached with an empty stack; the caller then treats the shader as not decompilable.
 */
 bool TryQuery(CFGRebuildState& state) {
    // Pushes the target of every label whose instruction lies in [block.start, block.end],
    // in ascending instruction order.
    const auto gather_labels = [](std::stack<u32>& cc, std::map<u32, u32>& labels,
                                  BlockInfo& block) {
        const auto gather_end = labels.upper_bound(block.end);
        for (auto it = labels.lower_bound(block.start); it != gather_end; ++it) {
            cc.push(it->second);
        }
    };
    if (state.queries.empty()) {
        return false;
    }
    Query query = state.queries.front();
    state.queries.pop_front();

    // Addresses without a block (such as exit_branch) have nothing to verify. Looking them up
    // with operator[] would insert a bogus entry that aliases block 0.
    const auto registered = state.registered.find(query.address);
    if (registered == state.registered.end()) {
        return true;
    }
    BlockInfo& block = state.block_info[registered->second];

    // If the block is visited, check if the stacks match, else gather the ssy/pbk
    // labels into the current stack and look if the branch at the end of the block
    // consumes a label. Schedule new queries accordingly
    if (block.visited) {
        const BlockStack& stack = state.stacks[query.address];
        return (stack.ssy_stack.empty() || query.ssy_stack == stack.ssy_stack) &&
               (stack.pbk_stack.empty() || query.pbk_stack == stack.pbk_stack);
    }
    block.visited = true;
    state.stacks[query.address] = BlockStack{query};

    gather_labels(query.ssy_stack, state.ssy_labels, block);
    gather_labels(query.pbk_stack, state.pbk_labels, block);
    if (!block.branch.condition.IsUnconditional()) {
        // The branch may not be taken: also follow the fall-through path.
        Query fallthrough = query;
        fallthrough.address = block.end + 1;
        state.queries.push_back(fallthrough);
    }

    Query branch_query = query;
    if (block.branch.is_sync) {
        if (branch_query.ssy_stack.empty()) {
            // SYNC without a matching SSY: top()/pop() would be undefined behaviour.
            return false;
        }
        if (block.branch.address == unassigned_branch) {
            block.branch.address = branch_query.ssy_stack.top();
        }
        branch_query.ssy_stack.pop();
    }
    if (block.branch.is_brk) {
        if (branch_query.pbk_stack.empty()) {
            // BRK without a matching PBK: top()/pop() would be undefined behaviour.
            return false;
        }
        if (block.branch.address == unassigned_branch) {
            block.branch.address = branch_query.pbk_stack.top();
        }
        branch_query.pbk_stack.pop();
    }
    branch_query.address = block.branch.address;
    state.queries.push_back(branch_query);
    return true;
 }
 std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 program_size,
                                              u32 start_address) {
    CFGRebuildState state{program_code, program_size, start_address};
    // Inspect Code and generate blocks
    state.labels.clear();
    state.labels.emplace(start_address);
    state.inspect_queries.push_back(state.start);
    while (!state.inspect_queries.empty()) {
        if (!TryInspectAddress(state)) {
            return {};
        }
    }
    // Decompile Stacks
    Query start_query{};
    start_query.address = state.start;
    state.queries.push_back(start_query);
    bool decompiled = true;
    while (!state.queries.empty()) {
        if (!TryQuery(state)) {
            decompiled = false;
            break;
        }
    }
    // Sort and organize results
    std::sort(state.block_info.begin(), state.block_info.end(),
              [](const BlockInfo& a, const BlockInfo& b) -> bool { return a.start < b.start; });
    ShaderCharacteristics result_out{};
    result_out.decompilable = decompiled;
    result_out.start = start_address;
    result_out.end = start_address;
    for (auto& block : state.block_info) {
        ShaderBlock new_block{};
        new_block.start = block.start;
        new_block.end = block.end;
        new_block.ignore_branch = block.branch.ignore;
        if (!new_block.ignore_branch) {
            new_block.branch.cond = block.branch.condition;
            new_block.branch.kills = block.branch.kill;
            new_block.branch.address = block.branch.address;
        }
        result_out.end = std::max(result_out.end, block.end);
        result_out.blocks.push_back(new_block);
    }
    if (result_out.decompilable) {
        result_out.labels = std::move(state.labels);
        return {result_out};
    }
    // If it's not decompilable, merge the unlabelled blocks together
    auto back = result_out.blocks.begin();
    auto next = std::next(back);
    while (next != result_out.blocks.end()) {
        if (state.labels.count(next->start) == 0 && next->start == back->end + 1) {
            back->end = next->end;
            next = result_out.blocks.erase(next);
            continue;
        }
        back = next;
        next++;
    }
    return {result_out};
 }
 } // namespace VideoCommon::Shader
--- a/src/video_core/shader/control_flow.h
+++ b/src/video_core/shader/control_flow.h
@ -0,0 +1,63 @@
 // Copyright 2019 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 #pragma once
 #include <cstring>
 #include <list>
 #include <optional>
 #include <unordered_set>
 #include "video_core/engines/shader_bytecode.h"
 #include "video_core/shader/shader_ir.h"
 namespace VideoCommon::Shader {
 using Tegra::Shader::ConditionCode;
 using Tegra::Shader::Pred;
 // Branch address used for branches that do not lead to another block, such as the ones
 // produced by EXIT and KIL.
 constexpr s32 exit_branch = -1;
 /// Predicate and condition code guarding a branch.
 struct Condition {
    Pred predicate = Pred::UnusedIndex;
    ConditionCode cc = ConditionCode::T;
    /// True when the predicate is Pred::UnusedIndex and the condition code is
    /// ConditionCode::T, i.e. both fields hold their default values.
    bool IsUnconditional() const {
        return cc == ConditionCode::T && predicate == Pred::UnusedIndex;
    }
    /// Member-wise equality.
    bool operator==(const Condition& other) const {
        return predicate == other.predicate && cc == other.cc;
    }
 };
 /// A block of the scanned shader covering the inclusive offset range [start, end].
 struct ShaderBlock {
    u32 start = 0;
    u32 end = 0;
    bool ignore_branch = false; // when set, ScanFlow leaves `branch` at its defaults
    /// The branch that ends the block.
    struct Branch {
        Condition cond;
        bool kills = false;
        s32 address = 0;
        /// Member-wise equality.
        bool operator==(const Branch& b) const {
            return cond == b.cond && kills == b.kills && address == b.address;
        }
    } branch{};
    /// Member-wise equality.
    bool operator==(const ShaderBlock& sb) const {
        return start == sb.start && end == sb.end && ignore_branch == sb.ignore_branch &&
               branch == sb.branch;
    }
 };
 /// Result of ScanFlow.
 struct ShaderCharacteristics {
    std::list<ShaderBlock> blocks;  // blocks sorted by start address
    bool decompilable = false;      // whether every flow-stack query succeeded
    u32 start = 0;                  // the start address handed to ScanFlow
    u32 end = 0;                    // highest block end found, at least `start`
    std::unordered_set<u32> labels; // jump targets; only filled when decompilable
 };
 /**
 * Scans the control flow of a shader program starting at `start_address` and splits it into
 * blocks.
 * Returns an empty optional when the scan stops on abnormal flow (BRX, or BRA with the constant
 * buffer bit set). Otherwise returns the discovered blocks; `decompilable` tells whether the
 * SSY/PBK stacks could be resolved for all of them.
 */
 std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 program_size,
                                               u32 start_address);
 } // namespace VideoCommon::Shader
--- a/src/video_core/shader/decode.cpp
+++ b/src/video_core/shader/decode.cpp
@ -11,6 +11,7 @@
 #include "common/common_types.h"
 #include "video_core/engines/shader_bytecode.h"
 #include "video_core/engines/shader_header.h"
 #include "video_core/shader/control_flow.h"
 #include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
@ -21,20 +22,6 @@ using Tegra::Shader::OpCode;
 namespace {
 /// Merges the exit methods of two parallel branches: an undetermined side yields the other
 /// side, equal sides yield that value, and anything else is conditional.
 constexpr ExitMethod ParallelExit(ExitMethod a, ExitMethod b) {
    if (a == ExitMethod::Undetermined) {
        return b;
    }
    if (b == ExitMethod::Undetermined || a == b) {
        return a;
    }
    return ExitMethod::Conditional;
 }
 /**
 * Returns whether the instruction at the specified offset is a 'sched' instruction.
 * Sched instructions always appear before a sequence of 3 instructions.
@ -51,87 +38,106 @@ constexpr bool IsSchedInstruction(u32 offset, u32 main_offset) {
 /// Copies the shader header out of program_code and decodes the program into basic_blocks.
 void ShaderIR::Decode() {
    std::memcpy(&header, program_code.data(), sizeof(Tegra::Shader::Header));
-    std::set<u32> labels;
+    disable_flow_stack = false;
-    const ExitMethod exit_method = Scan(main_offset, MAX_PROGRAM_LENGTH, labels);
+    const auto info = ScanFlow(program_code, program_size, main_offset);
-    if (exit_method != ExitMethod::AlwaysEnd) {
+    if (info) {
-        UNREACHABLE_MSG("Program does not always end");
+        const auto& shader_info = *info;
-    }
+        coverage_begin = shader_info.start;
-
+        coverage_end = shader_info.end;
-    if (labels.empty()) {
+        if (shader_info.decompilable) {
-        basic_blocks.insert({main_offset, DecodeRange(main_offset, MAX_PROGRAM_LENGTH)});
+            disable_flow_stack = true;
            // Stores a finished block under its label. A block still tagged with exit_branch
            // (the initial value of current_label below) is dropped.
            const auto insert_block = ([this](NodeBlock& nodes, u32 label) {
                if (label == exit_branch) {
                    return;
                }
                basic_blocks.insert({label, nodes});
            });
            const auto& blocks = shader_info.blocks;
            NodeBlock current_block;
            u32 current_label = exit_branch;
            // Consecutive scanned blocks are accumulated into one NodeBlock until a block whose
            // start offset is a label begins a new one.
            for (auto& block : blocks) {
                if (shader_info.labels.count(block.start) != 0) {
                    insert_block(current_block, current_label);
                    current_block.clear();
                    current_label = block.start;
                }
                if (!block.ignore_branch) {
                    // Decode up to, but not including, block.end and then append the block's
                    // explicit control flow node.
                    DecodeRangeInner(current_block, block.start, block.end);
                    InsertControlFlow(current_block, block);
                } else {
                    // No control flow node is emitted: decode through block.end inclusive.
                    DecodeRangeInner(current_block, block.start, block.end + 1);
                }
            }
            insert_block(current_block, current_label);
            return;
        }
        LOG_WARNING(HW_GPU, "Flow Stack Removing Failed! Falling back to old method");
        // we can't decompile it, fallback to standard method
        for (const auto& block : shader_info.blocks) {
            basic_blocks.insert({block.start, DecodeRange(block.start, block.end + 1)});
        }
        return;
    }
    LOG_WARNING(HW_GPU, "Flow Analysis Failed! Falling back to brute force compiling");
-    labels.insert(main_offset);
+    // Now we need to deal with an undecompilable shader. We need to brute force
-
+    // a shader that captures every position.
-    for (const u32 label : labels) {
+    coverage_begin = main_offset;
-        const auto next_it = labels.lower_bound(label + 1);
+    const u32 shader_end = static_cast<u32>(program_size / sizeof(u64));
-        const u32 next_label = next_it == labels.end() ? MAX_PROGRAM_LENGTH : *next_it;
+    coverage_end = shader_end;
-
+    for (u32 label = main_offset; label < shader_end; label++) {
-        basic_blocks.insert({label, DecodeRange(label, next_label)});
+        basic_blocks.insert({label, DecodeRange(label, label + 1)});
    }
 }
 /// Walks the instructions in [begin, end) and determines how that range exits, collecting
 /// branch targets into `labels`. Results are memoized per (begin, end) pair in
 /// exit_method_map; a range that is revisited while still being analyzed yields the value
 /// currently stored for it.
 ExitMethod ShaderIR::Scan(u32 begin, u32 end, std::set<u32>& labels) {
    const auto [entry, is_new] =
        exit_method_map.emplace(std::make_pair(begin, end), ExitMethod::Undetermined);
    ExitMethod& result = entry->second;
    if (!is_new) {
        return result;
    }

    for (u32 offset = begin; offset != end && offset != MAX_PROGRAM_LENGTH; ++offset) {
        coverage_begin = std::min(coverage_begin, offset);
        coverage_end = std::max(coverage_end, offset + 1);

        const Instruction instr = {program_code[offset]};
        const auto opcode = OpCode::Decode(instr);
        if (!opcode) {
            continue;
        }

        const auto id = opcode->get().GetId();
        if (id == OpCode::Id::EXIT) {
            // An unpredicated EXIT always ends the program here.
            if (instr.pred.pred_index == static_cast<u64>(Tegra::Shader::Pred::UnusedIndex)) {
                result = ExitMethod::AlwaysEnd;
                return result;
            }
            // A predicated EXIT may fall through, so the code after it has to be merged in.
            const ExitMethod fallthrough = Scan(offset + 1, end, labels);
            result = ParallelExit(ExitMethod::AlwaysEnd, fallthrough);
            return result;
        }
        if (id == OpCode::Id::BRA) {
            const u32 target = offset + instr.bra.GetBranchTarget();
            labels.insert(target);
            // Scan the not-taken path before the taken one.
            const ExitMethod not_taken = Scan(offset + 1, end, labels);
            const ExitMethod taken = Scan(target, end, labels);
            result = ParallelExit(not_taken, taken);
            return result;
        }
        if (id == OpCode::Id::SSY || id == OpCode::Id::PBK) {
            // SSY and PBK are encoded like BRA; only their target is recorded and the scan
            // keeps going with the next instruction.
            UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0,
                                 "Constant buffer branching is not supported");
            const u32 target = offset + instr.bra.GetBranchTarget();
            labels.insert(target);
        }
    }

    // The range was walked to its end without hitting an EXIT or a BRA.
    result = ExitMethod::AlwaysReturn;
    return result;
 }
 /// Decodes the instructions from `begin` up to (not including) `end` into a new NodeBlock
 /// and returns it. The added line delegates the actual loop to DecodeRangeInner.
 NodeBlock ShaderIR::DecodeRange(u32 begin, u32 end) {
    NodeBlock basic_block;
-    for (u32 pc = begin; pc < (begin > end ? MAX_PROGRAM_LENGTH : end);) {
+    DecodeRangeInner(basic_block, begin, end);
        pc = DecodeInstr(basic_block, pc);
    }
    return basic_block;
 }
 /// Appends to `bb` the IR of every instruction from `begin` up to (not including) `end`.
 /// When `begin` is greater than `end`, decoding runs up to MAX_PROGRAM_LENGTH instead.
 void ShaderIR::DecodeRangeInner(NodeBlock& bb, u32 begin, u32 end) {
    const u32 limit = begin > end ? MAX_PROGRAM_LENGTH : end;
    u32 pc = begin;
    while (pc < limit) {
        // DecodeInstr hands back the program counter to continue decoding from.
        pc = DecodeInstr(bb, pc);
    }
 }
 /// Appends the control flow node described by `block.branch` to both `bb` and global_code.
 void ShaderIR::InsertControlFlow(NodeBlock& bb, const ShaderBlock& block) {
    const auto& branch = block.branch;

    // A negative address leaves the shader (Discard when the branch kills, Exit otherwise);
    // any other address becomes a direct Branch to that target.
    Node node = branch.address < 0
                    ? Operation(branch.kills ? OperationCode::Discard : OperationCode::Exit)
                    : Operation(OperationCode::Branch, Immediate(branch.address));

    // The condition code wraps the node first, so the predicate check ends up outermost.
    if (branch.cond.cc != ConditionCode::T) {
        node = Conditional(GetConditionCode(branch.cond.cc), {node});
    }
    if (branch.cond.predicate != Pred::UnusedIndex) {
        const u32 raw_pred = static_cast<u32>(branch.cond.predicate);
        // Values above 7 select the negated form of predicate (value - 8).
        const bool negated = raw_pred > 7;
        const u32 pred_index = negated ? raw_pred - 8 : raw_pred;
        node = Conditional(GetPredicate(pred_index, negated), {node});
    }

    bb.push_back(node);
    global_code.push_back(node);
 }
 u32 ShaderIR::DecodeInstr(NodeBlock& bb, u32 pc) {
    // Ignore sched instructions when generating code.
    if (IsSchedInstruction(pc, main_offset)) {
@ -140,15 +146,18 @@ u32 ShaderIR::DecodeInstr(NodeBlock& bb, u32 pc) {
    const Instruction instr = {program_code[pc]};
    const auto opcode = OpCode::Decode(instr);
    const u32 nv_address = ConvertAddressToNvidiaSpace(pc);
    // Decoding failure
    if (!opcode) {
        UNIMPLEMENTED_MSG("Unhandled instruction: {0:x}", instr.value);
        bb.push_back(Comment(fmt::format("{:05x} Unimplemented Shader instruction (0x{:016x})",
                                         nv_address, instr.value)));
        return pc + 1;
    }
-    bb.push_back(
+    bb.push_back(Comment(
-        Comment(fmt::format("{}: {} (0x{:016x})", pc, opcode->get().GetName(), instr.value)));
+        fmt::format("{:05x} {} (0x{:016x})", nv_address, opcode->get().GetName(), instr.value)));
    using Tegra::Shader::Pred;
    UNIMPLEMENTED_IF_MSG(instr.pred.full_pred == Pred::NeverExecute,
--- a/src/video_core/shader/decode/other.cpp
+++ b/src/video_core/shader/decode/other.cpp
@ -91,11 +91,46 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
        break;
    }
    case OpCode::Id::BRA: {
-        UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0,
+        Node branch;
-                             "BRA with constant buffers are not implemented");
+        if (instr.bra.constant_buffer == 0) {
            // Direct branch: the target is known at decode time.
            const u32 target = pc + instr.bra.GetBranchTarget();
            branch = Operation(OperationCode::Branch, Immediate(target));
        } else {
            // Indirect branch through a constant buffer value. The value is shifted right by 3
            // (presumably bytes -> 8-byte instruction units) and is relative to the next
            // instruction, so the base has to be added before branching, as BRX does.
            const u32 target = pc + 1;
            const Node op_a = GetConstBuffer(instr.cbuf36.index, instr.cbuf36.GetOffset());
            const Node convert = SignedOperation(OperationCode::IArithmeticShiftRight, true,
                                                 PRECISE, op_a, Immediate(3));
            const Node operand =
                Operation(OperationCode::IAdd, PRECISE, convert, Immediate(target));
            // Branch to the rebased address; passing `convert` here would drop `target`.
            branch = Operation(OperationCode::BranchIndirect, operand);
        }
-        const u32 target = pc + instr.bra.GetBranchTarget();
+        const Tegra::Shader::ConditionCode cc = instr.flow_condition_code;
-        const Node branch = Operation(OperationCode::Branch, Immediate(target));
+        if (cc != Tegra::Shader::ConditionCode::T) {
            bb.push_back(Conditional(GetConditionCode(cc), {branch}));
        } else {
            bb.push_back(branch);
        }
        break;
    }
    case OpCode::Id::BRX: {
        Node operand;
        if (instr.brx.constant_buffer != 0) {
            const s32 target = pc + 1;
            const Node index = GetRegister(instr.gpr8);
            const Node op_a =
                GetConstBufferIndirect(instr.cbuf36.index, instr.cbuf36.GetOffset() + 0, index);
            const Node convert = SignedOperation(OperationCode::IArithmeticShiftRight, true,
                                                 PRECISE, op_a, Immediate(3));
            operand = Operation(OperationCode::IAdd, PRECISE, convert, Immediate(target));
        } else {
            const s32 target = pc + instr.brx.GetBranchExtend();
            const Node op_a = GetRegister(instr.gpr8);
            const Node convert = SignedOperation(OperationCode::IArithmeticShiftRight, true,
                                                 PRECISE, op_a, Immediate(3));
            operand = Operation(OperationCode::IAdd, PRECISE, convert, Immediate(target));
        }
        const Node branch = Operation(OperationCode::BranchIndirect, operand);
        const Tegra::Shader::ConditionCode cc = instr.flow_condition_code;
        if (cc != Tegra::Shader::ConditionCode::T) {
@ -109,6 +144,10 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
        UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0,
                             "Constant buffer flow is not supported");
        if (disable_flow_stack) {
            break;
        }
        // The SSY opcode tells the GPU where to re-converge divergent execution paths with SYNC.
        const u32 target = pc + instr.bra.GetBranchTarget();
        bb.push_back(
@ -119,6 +158,10 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
        UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0,
                             "Constant buffer PBK is not supported");
        if (disable_flow_stack) {
            break;
        }
        // PBK pushes to a stack the address where BRK will jump to.
        const u32 target = pc + instr.bra.GetBranchTarget();
        bb.push_back(
@ -130,6 +173,10 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
        UNIMPLEMENTED_IF_MSG(cc != Tegra::Shader::ConditionCode::T, "SYNC condition code used: {}",
                             static_cast<u32>(cc));
        if (disable_flow_stack) {
            break;
        }
        // The SYNC opcode jumps to the address previously set by the SSY opcode
        bb.push_back(Operation(OperationCode::PopFlowStack, MetaStackClass::Ssy));
        break;
@ -138,6 +185,9 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
        const Tegra::Shader::ConditionCode cc = instr.flow_condition_code;
        UNIMPLEMENTED_IF_MSG(cc != Tegra::Shader::ConditionCode::T, "BRK condition code used: {}",
                             static_cast<u32>(cc));
        if (disable_flow_stack) {
            break;
        }
        // The BRK opcode jumps to the address previously set by the PBK opcode
        bb.push_back(Operation(OperationCode::PopFlowStack, MetaStackClass::Pbk));
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@ -148,11 +148,12 @@ enum class OperationCode {
    ImageStore, /// (MetaImage, float[N] coords) -> void
-    Branch,        /// (uint branch_target) -> void
+    Branch,         /// (uint branch_target) -> void
-    PushFlowStack, /// (uint branch_target) -> void
+    BranchIndirect, /// (uint branch_target) -> void
-    PopFlowStack,  /// () -> void
+    PushFlowStack,  /// (uint branch_target) -> void
-    Exit,          /// () -> void
+    PopFlowStack,   /// () -> void
-    Discard,       /// () -> void
+    Exit,           /// () -> void
    Discard,        /// () -> void
    EmitVertex,   /// () -> void
    EndPrimitive, /// () -> void
--- a/src/video_core/shader/shader_ir.cpp
+++ b/src/video_core/shader/shader_ir.cpp
@ -22,8 +22,8 @@ using Tegra::Shader::PredCondition;
 using Tegra::Shader::PredOperation;
 using Tegra::Shader::Register;
-ShaderIR::ShaderIR(const ProgramCode& program_code, u32 main_offset)
+ShaderIR::ShaderIR(const ProgramCode& program_code, u32 main_offset, const std::size_t size)
-    : program_code{program_code}, main_offset{main_offset} {
+    : program_code{program_code}, main_offset{main_offset}, program_size{size} {
    // program_code is kept as a reference member, so the caller's buffer must outlive this
    // object. The whole program is decoded eagerly during construction.
    Decode();
 }
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@ -22,18 +22,12 @@
 namespace VideoCommon::Shader {
 struct ShaderBlock;
 using ProgramCode = std::vector<u64>;
 constexpr u32 MAX_PROGRAM_LENGTH = 0x1000;
 /// Describes how the code paths between a given entry point and a given return point exit.
 enum class ExitMethod {
    Undetermined, ///< Internal value. Only occurs while analyzing a JMP loop.
    AlwaysReturn, ///< All code paths reach the return point.
    Conditional,  ///< Code paths reach the return point or an END instruction conditionally.
    AlwaysEnd,    ///< All code paths reach an END instruction.
 };
 class ConstBuffer {
 public:
    explicit ConstBuffer(u32 max_offset, bool is_indirect)
@ -73,7 +67,7 @@ struct GlobalMemoryUsage {
 class ShaderIR final {
 public:
-    explicit ShaderIR(const ProgramCode& program_code, u32 main_offset);
+    explicit ShaderIR(const ProgramCode& program_code, u32 main_offset, std::size_t size);
    ~ShaderIR();
    const std::map<u32, NodeBlock>& GetBasicBlocks() const {
@ -129,12 +123,20 @@ public:
        return header;
    }
    /// Returns whether flow stack operations are disabled. Decode() sets the flag when the
    /// scanned shader is decompilable; the SSY, PBK, SYNC and BRK decoders then emit nothing.
    bool IsFlowStackDisabled() const {
        return disable_flow_stack;
    }
    /// Converts an instruction index into a byte offset measured from main_offset.
    u32 ConvertAddressToNvidiaSpace(const u32 address) const {
        const u32 relative_index = address - main_offset;
        // The product is computed in std::size_t and narrowed back to u32.
        return static_cast<u32>(relative_index * sizeof(Tegra::Shader::Instruction));
    }
 private:
    void Decode();
    ExitMethod Scan(u32 begin, u32 end, std::set<u32>& labels);
    NodeBlock DecodeRange(u32 begin, u32 end);
    void DecodeRangeInner(NodeBlock& bb, u32 begin, u32 end);
    void InsertControlFlow(NodeBlock& bb, const ShaderBlock& block);
    /**
     * Decodes a single instruction from Tegra to IR.
@ -326,10 +328,11 @@ private:
    const ProgramCode& program_code;
    const u32 main_offset;
    const std::size_t program_size;
    bool disable_flow_stack{};
    u32 coverage_begin{};
    u32 coverage_end{};
    std::map<std::pair<u32, u32>, ExitMethod> exit_method_map;
    std::map<u32, NodeBlock> basic_blocks;
    NodeBlock global_code;