Revert "New GS refactor (#8)"

This reverts commit 10272dd541.
Dragios 2016-04-16 01:32:33 +08:00
parent 8be067ae29
commit 1108438607
15 changed files with 321 additions and 726 deletions

View File

@ -44,8 +44,7 @@ QVariant BreakPointModel::data(const QModelIndex& index, int role) const
{ Pica::DebugContext::Event::PicaCommandProcessed, tr("Pica command processed") },
{ Pica::DebugContext::Event::IncomingPrimitiveBatch, tr("Incoming primitive batch") },
{ Pica::DebugContext::Event::FinishedPrimitiveBatch, tr("Finished primitive batch") },
{ Pica::DebugContext::Event::RunVS, tr("Vertex shader invocation") },
{ Pica::DebugContext::Event::RunGS, tr("Geometry shader invocation") },
{ Pica::DebugContext::Event::VertexLoaded, tr("Vertex loaded") },
{ Pica::DebugContext::Event::IncomingDisplayTransfer, tr("Incoming display transfer") },
{ Pica::DebugContext::Event::GSPCommandProcessed, tr("GSP command processed") },
{ Pica::DebugContext::Event::BufferSwapped, tr("Buffers swapped") }

View File

@ -70,7 +70,7 @@ void GraphicsTracingWidget::StartRecording() {
std::array<u32, 4 * 16> default_attributes;
for (unsigned i = 0; i < 16; ++i) {
for (unsigned comp = 0; comp < 3; ++comp) {
default_attributes[4 * i + comp] = nihstro::to_float24(Pica::g_state.vs_default_attributes[i][comp].ToFloat32());
default_attributes[4 * i + comp] = nihstro::to_float24(Pica::g_state.vs.default_attributes[i][comp].ToFloat32());
}
}

View File

@ -365,7 +365,7 @@ GraphicsVertexShaderWidget::GraphicsVertexShaderWidget(std::shared_ptr< Pica::De
input_data[i]->setValidator(new QDoubleValidator(input_data[i]));
}
breakpoint_warning = new QLabel(tr("(data only available at vertex shader invocation breakpoints)"));
breakpoint_warning = new QLabel(tr("(data only available at VertexLoaded breakpoints)"));
// TODO: Add some button for jumping to the shader entry point
@ -454,7 +454,7 @@ GraphicsVertexShaderWidget::GraphicsVertexShaderWidget(std::shared_ptr< Pica::De
void GraphicsVertexShaderWidget::OnBreakPointHit(Pica::DebugContext::Event event, void* data) {
auto input = static_cast<Pica::Shader::InputVertex*>(data);
if (event == Pica::DebugContext::Event::RunVS) {
if (event == Pica::DebugContext::Event::VertexLoaded) {
Reload(true, data);
} else {
// No vertex data is retrievable => invalidate currently stored vertex data
@ -501,7 +501,7 @@ void GraphicsVertexShaderWidget::Reload(bool replace_vertex_data, void* vertex_d
info.labels.insert({ entry_point, "main" });
// Generate debug information
debug_data = Pica::g_state.vs.ProduceDebugInfo(input_vertex, num_attributes, shader_config);
debug_data = Pica::Shader::ProduceDebugInfo(input_vertex, num_attributes, shader_config, shader_setup);
// Reload widget state
for (int attr = 0; attr < num_attributes; ++attr) {

View File

@ -29,6 +29,10 @@ namespace Pica {
namespace CommandProcessor {
static int float_regs_counter = 0;
static u32 uniform_write_buffer[4];
static int default_attr_counter = 0;
static u32 default_attr_write_buffer[3];
@ -122,7 +126,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
// TODO: Verify that this actually modifies the register!
if (setup.index < 15) {
g_state.vs_default_attributes[setup.index] = attribute;
g_state.vs.default_attributes[setup.index] = attribute;
setup.index++;
} else {
// Put each attribute into an immediate input buffer.
@ -137,14 +141,14 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
if (immediate_attribute_id >= regs.vs.num_input_attributes+1) {
immediate_attribute_id = 0;
auto& shader_unit = Shader::GetShaderUnit(false);
g_state.vs.Setup();
Shader::UnitState<false> shader_unit;
Shader::Setup();
if (g_debug_context)
g_debug_context->OnEvent(DebugContext::Event::VertexLoaded, static_cast<void*>(&immediate_input));
// Send to vertex shader
if (g_debug_context)
g_debug_context->OnEvent(DebugContext::Event::RunVS, static_cast<void*>(&immediate_input));
g_state.vs.Run(shader_unit, immediate_input, regs.vs.num_input_attributes+1, regs.vs);
Shader::OutputVertex output_vertex = shader_unit.output_registers.ToVertex(regs.vs);
Shader::OutputVertex output = Shader::Run(shader_unit, immediate_input, regs.vs.num_input_attributes+1);
// Send to renderer
using Pica::Shader::OutputVertex;
@ -152,7 +156,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2);
};
g_state.primitive_assembler.SubmitVertex(output_vertex, AddTriangle);
g_state.primitive_assembler.SubmitVertex(output, AddTriangle);
}
}
}
@ -299,16 +303,13 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
// The size has been tuned for optimal balance between hit-rate and the cost of lookup
const size_t VERTEX_CACHE_SIZE = 32;
std::array<u16, VERTEX_CACHE_SIZE> vertex_cache_ids;
std::array<Shader::OutputRegisters, VERTEX_CACHE_SIZE> vertex_cache;
std::array<Shader::OutputVertex, VERTEX_CACHE_SIZE> vertex_cache;
unsigned int vertex_cache_pos = 0;
vertex_cache_ids.fill(-1);
auto& vs_shader_unit = Shader::GetShaderUnit(false);
g_state.vs.Setup();
auto& gs_unit_state = Shader::GetShaderUnit(true);
g_state.gs.Setup();
Shader::UnitState<false> shader_unit;
Shader::Setup();
for (unsigned int index = 0; index < regs.num_vertices; ++index)
{
@ -320,7 +321,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
ASSERT(vertex != -1);
bool vertex_cache_hit = false;
Shader::OutputRegisters output_registers;
Shader::OutputVertex output;
if (is_indexed) {
if (g_debug_context && Pica::g_debug_context->recorder) {
@ -330,7 +331,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
for (unsigned int i = 0; i < VERTEX_CACHE_SIZE; ++i) {
if (vertex == vertex_cache_ids[i]) {
output_registers = vertex_cache[i];
output = vertex_cache[i];
vertex_cache_hit = true;
break;
}
@ -377,7 +378,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
}
} else if (attribute_config.IsDefaultAttribute(i)) {
// Load the default attribute if we're configured to do so
input.attr[i] = g_state.vs_default_attributes[i];
input.attr[i] = g_state.vs.default_attributes[i];
LOG_TRACE(HW_GPU, "Loaded default attribute %x for vertex %x (index %x): (%f, %f, %f, %f)",
i, vertex, index,
input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(),
@ -389,69 +390,27 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
}
}
// Send to vertex shader
if (g_debug_context)
g_debug_context->OnEvent(DebugContext::Event::RunVS, static_cast<void*>(&input));
g_state.vs.Run(vs_shader_unit, input, attribute_config.GetNumTotalAttributes(), g_state.regs.vs);
output_registers = vs_shader_unit.output_registers;
g_debug_context->OnEvent(DebugContext::Event::VertexLoaded, (void*)&input);
// Send to vertex shader
output = Shader::Run(shader_unit, input, attribute_config.GetNumTotalAttributes());
if (is_indexed) {
vertex_cache[vertex_cache_pos] = output_registers;
vertex_cache[vertex_cache_pos] = output;
vertex_cache_ids[vertex_cache_pos] = vertex;
vertex_cache_pos = (vertex_cache_pos + 1) % VERTEX_CACHE_SIZE;
}
}
// Helper to send triangle to renderer
// Send to renderer
using Pica::Shader::OutputVertex;
auto AddTriangle = [](
const OutputVertex& v0, const OutputVertex& v1, const OutputVertex& v2) {
VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2);
};
if (Shader::UseGS()) {
auto& regs = g_state.regs;
auto& gs_regs = g_state.regs.gs;
auto& gs_buf = g_state.gs_input_buffer;
// Vertex Shader Outputs are converted into Geometry Shader inputs by filling up a buffer
// For example, if we have a geoshader that takes 6 inputs, and the vertex shader outputs 2 attributes
// It would take 3 vertices to fill up the Geometry Shader buffer
unsigned int gs_input_count = gs_regs.num_input_attributes + 1;
unsigned int vs_output_count = regs.vs_outmap_total2 + 1;
ASSERT_MSG(regs.vs_outmap_total1 == regs.vs_outmap_total2, "VS_OUTMAP_TOTAL1 and VS_OUTMAP_TOTAL2 don't match!");
// copy into the geoshader buffer
for (unsigned int i = 0; i < vs_output_count; i++) {
if (gs_buf.index >= gs_input_count) {
// TODO(ds84182): LOG_ERROR()
ASSERT_MSG(false, "Number of GS inputs (%d) is not divisible by number of VS outputs (%d)",
gs_input_count, vs_output_count);
continue;
}
gs_buf.buffer.attr[gs_buf.index++] = output_registers.value[i];
}
if (gs_buf.index >= gs_input_count) {
// b15 will be false when a new primitive starts and then switch to true at some point
//TODO: Test how this works exactly on hardware
g_state.gs.uniforms.b[15] |= (index > 0);
// Process Geometry Shader
if (g_debug_context)
g_debug_context->OnEvent(DebugContext::Event::RunGS, static_cast<void*>(&gs_buf.buffer));
gs_unit_state.emit_triangle_callback = AddTriangle;
g_state.gs.Run(gs_unit_state, gs_buf.buffer, gs_input_count, regs.gs);
gs_unit_state.emit_triangle_callback = nullptr;
gs_buf.index = 0;
}
} else {
Shader::OutputVertex output_vertex = output_registers.ToVertex(regs.vs);
primitive_assembler.SubmitVertex(output_vertex, AddTriangle);
}
primitive_assembler.SubmitVertex(output, AddTriangle);
}
for (auto& range : memory_accesses.ranges) {
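As a rough illustration of the buffering described in the geometry-shader comment above (a GS taking 6 inputs fed by a VS emitting 2 attributes needs 3 vertices to fill the buffer), here is a minimal standalone sketch; the types and names below are simplified stand-ins and are not part of this commit.

// Illustrative sketch only (not part of this commit): how VS outputs fill the
// GS input buffer described above. Types and names are simplified stand-ins.
#include <array>
#include <cstddef>

struct Vec4 { float x, y, z, w; };

struct GSInputBuffer {
    std::array<Vec4, 16> attr; // geometry shader input attributes
    std::size_t index = 0;     // next free slot
};

// Append one vertex worth of VS outputs; returns true once the buffer holds
// gs_input_count attributes and the geometry shader should be invoked
// (the caller then resets index to 0, as the diff above does).
bool AccumulateVertex(GSInputBuffer& buf, const Vec4* vs_outputs,
                      std::size_t vs_output_count, std::size_t gs_input_count) {
    for (std::size_t i = 0; i < vs_output_count; ++i)
        buf.attr[buf.index++] = vs_outputs[i];
    return buf.index >= gs_input_count;
}
// e.g. gs_input_count = 6 and vs_output_count = 2: the third call returns true,
// matching the "3 vertices to fill up the Geometry Shader buffer" example above.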
@ -462,76 +421,10 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
break;
}
case PICA_REG_INDEX(gs.bool_uniforms):
Shader::WriteUniformBoolReg(true, value);
break;
case PICA_REG_INDEX_WORKAROUND(gs.int_uniforms[0], 0x281):
case PICA_REG_INDEX_WORKAROUND(gs.int_uniforms[1], 0x282):
case PICA_REG_INDEX_WORKAROUND(gs.int_uniforms[2], 0x283):
case PICA_REG_INDEX_WORKAROUND(gs.int_uniforms[3], 0x284):
{
unsigned index = (id - PICA_REG_INDEX_WORKAROUND(gs.int_uniforms[0], 0x281));
auto values = regs.gs.int_uniforms[index];
Shader::WriteUniformIntReg(true, index, Math::Vec4<u8>(values.x, values.y, values.z, values.w));
break;
}
case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.setup, 0x290):
Shader::WriteUniformFloatSetupReg(true, value);
break;
case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[0], 0x291):
case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[1], 0x292):
case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[2], 0x293):
case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[3], 0x294):
case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[4], 0x295):
case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[5], 0x296):
case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[6], 0x297):
case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[7], 0x298):
{
Shader::WriteUniformFloatReg(true, value);
break;
}
// Load shader program code
case PICA_REG_INDEX_WORKAROUND(gs.program.offset, 0x29b):
Shader::WriteProgramCodeOffset(true, value);
break;
case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[0], 0x29c):
case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[1], 0x29d):
case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[2], 0x29e):
case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[3], 0x29f):
case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[4], 0x2a0):
case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[5], 0x2a1):
case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[6], 0x2a2):
case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[7], 0x2a3):
{
Shader::WriteProgramCode(true, value);
break;
}
// Load swizzle pattern data
case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.offset, 0x2a5):
Shader::WriteSwizzlePatternsOffset(true, value);
break;
case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[0], 0x2a6):
case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[1], 0x2a7):
case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[2], 0x2a8):
case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[3], 0x2a9):
case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[4], 0x2aa):
case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[5], 0x2ab):
case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[6], 0x2ac):
case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[7], 0x2ad):
{
Shader::WriteSwizzlePatterns(true, value);
break;
}
case PICA_REG_INDEX(vs.bool_uniforms):
Shader::WriteUniformBoolReg(false, value);
for (unsigned i = 0; i < 16; ++i)
g_state.vs.uniforms.b[i] = (regs.vs.bool_uniforms.Value() & (1 << i)) != 0;
break;
case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[0], 0x2b1):
@ -539,16 +432,14 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[2], 0x2b3):
case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[3], 0x2b4):
{
unsigned index = (id - PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[0], 0x2b1));
int index = (id - PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[0], 0x2b1));
auto values = regs.vs.int_uniforms[index];
Shader::WriteUniformIntReg(false, index, Math::Vec4<u8>(values.x, values.y, values.z, values.w));
g_state.vs.uniforms.i[index] = Math::Vec4<u8>(values.x, values.y, values.z, values.w);
LOG_TRACE(HW_GPU, "Set integer uniform %d to %02x %02x %02x %02x",
index, values.x.Value(), values.y.Value(), values.z.Value(), values.w.Value());
break;
}
case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.setup, 0x2c0):
Shader::WriteUniformFloatSetupReg(false, value);
break;
case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[0], 0x2c1):
case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[1], 0x2c2):
case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[2], 0x2c3):
@ -558,15 +449,49 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[6], 0x2c7):
case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[7], 0x2c8):
{
Shader::WriteUniformFloatReg(false, value);
auto& uniform_setup = regs.vs.uniform_setup;
// TODO: Does actual hardware indeed keep an intermediate buffer or does
// it directly write the values?
uniform_write_buffer[float_regs_counter++] = value;
// Uniforms are written in a packed format such that four float24 values are encoded in
// three 32-bit numbers. We write to internal memory once a full such vector is
// written.
if ((float_regs_counter >= 4 && uniform_setup.IsFloat32()) ||
(float_regs_counter >= 3 && !uniform_setup.IsFloat32())) {
float_regs_counter = 0;
auto& uniform = g_state.vs.uniforms.f[uniform_setup.index];
if (uniform_setup.index > 95) {
LOG_ERROR(HW_GPU, "Invalid VS uniform index %d", (int)uniform_setup.index);
break;
}
// NOTE: The destination component order indeed is "backwards"
if (uniform_setup.IsFloat32()) {
for (auto i : {0,1,2,3})
uniform[3 - i] = float24::FromFloat32(*(float*)(&uniform_write_buffer[i]));
} else {
// TODO: Untested
uniform.w = float24::FromRaw(uniform_write_buffer[0] >> 8);
uniform.z = float24::FromRaw(((uniform_write_buffer[0] & 0xFF) << 16) | ((uniform_write_buffer[1] >> 16) & 0xFFFF));
uniform.y = float24::FromRaw(((uniform_write_buffer[1] & 0xFFFF) << 8) | ((uniform_write_buffer[2] >> 24) & 0xFF));
uniform.x = float24::FromRaw(uniform_write_buffer[2] & 0xFFFFFF);
}
LOG_TRACE(HW_GPU, "Set uniform %x to (%f %f %f %f)", (int)uniform_setup.index,
uniform.x.ToFloat32(), uniform.y.ToFloat32(), uniform.z.ToFloat32(),
uniform.w.ToFloat32());
// TODO: Verify that this actually modifies the register!
uniform_setup.index.Assign(uniform_setup.index + 1);
}
break;
}
// Load shader program code
case PICA_REG_INDEX_WORKAROUND(vs.program.offset, 0x2cb):
Shader::WriteProgramCodeOffset(false, value);
break;
case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[0], 0x2cc):
case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[1], 0x2cd):
case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[2], 0x2ce):
@ -576,15 +501,12 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[6], 0x2d2):
case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[7], 0x2d3):
{
Shader::WriteProgramCode(false, value);
g_state.vs.program_code[regs.vs.program.offset] = value;
regs.vs.program.offset++;
break;
}
// Load swizzle pattern data
case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.offset, 0x2d5):
Shader::WriteSwizzlePatternsOffset(false, value);
break;
case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[0], 0x2d6):
case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[1], 0x2d7):
case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[2], 0x2d8):
@ -594,7 +516,8 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[6], 0x2dc):
case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[7], 0x2dd):
{
Shader::WriteSwizzlePatterns(false, value);
g_state.vs.swizzle_data[regs.vs.swizzle_patterns.offset] = value;
regs.vs.swizzle_patterns.offset++;
break;
}
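The float-uniform case above notes that four float24 values arrive packed into three 32-bit words. Below is a minimal standalone sketch of that unpacking, mirroring the FromRaw() arithmetic shown in the diff; the helper name and return type are illustrative only and not part of this commit.

// Illustrative sketch only (not part of this commit): unpacking one float24
// uniform vector from the three 32-bit words buffered by the case above.
#include <array>
#include <cstdint>

// Returns the raw 24-bit encodings in (x, y, z, w) order, matching the
// "backwards" destination order noted in the code above.
std::array<std::uint32_t, 4> UnpackFloat24Vector(const std::uint32_t buf[3]) {
    std::array<std::uint32_t, 4> raw;
    raw[3] = buf[0] >> 8;                                         // w
    raw[2] = ((buf[0] & 0xFF) << 16) | ((buf[1] >> 16) & 0xFFFF); // z
    raw[1] = ((buf[1] & 0xFFFF) << 8) | ((buf[2] >> 24) & 0xFF);  // y
    raw[0] = buf[2] & 0xFFFFFF;                                   // x
    return raw;
}
// 3 * 32 bits = 96 bits = 4 * 24 bits, so a full vector is committed every
// three register writes in float24 mode (or every four in float32 mode).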

View File

@ -30,8 +30,7 @@ public:
PicaCommandProcessed,
IncomingPrimitiveBatch,
FinishedPrimitiveBatch,
RunVS,
RunGS,
VertexLoaded,
IncomingDisplayTransfer,
GSPCommandProcessed,
BufferSwapped,

View File

@ -497,7 +497,7 @@ void Init() {
}
void Shutdown() {
Shader::ShaderSetup::Shutdown();
Shader::Shutdown();
}
template <typename T>

View File

@ -1070,7 +1070,7 @@ struct Regs {
// Number of vertices to render
u32 num_vertices;
BitField<0, 2, u32> using_geometry_shader;
INSERT_PADDING_WORDS(0x1);
// The index of the first vertex to render
u32 vertex_offset;
@ -1118,14 +1118,7 @@ struct Regs {
}
} command_buffer;
INSERT_PADDING_WORDS(0x06);
enum class VSComMode : u32 {
Shared = 0,
Exclusive = 1
};
VSComMode vs_com_mode;
INSERT_PADDING_WORDS(0x07);
enum class GPUMode : u32 {
Drawing = 0,
@ -1134,17 +1127,7 @@ struct Regs {
GPUMode gpu_mode;
INSERT_PADDING_WORDS(0x4);
BitField<0, 4, u32> vs_outmap_total1;
INSERT_PADDING_WORDS(0x6);
BitField<0, 4, u32> vs_outmap_total2;
BitField<0, 4, u32> gsh_misc0;
INSERT_PADDING_WORDS(0xB);
INSERT_PADDING_WORDS(0x18);
enum class TriangleTopology : u32 {
List = 0,
@ -1153,10 +1136,7 @@ struct Regs {
Shader = 3, // Programmable setup unit implemented in a geometry shader
};
union {
BitField<0, 4, u32> vs_outmap_count;
BitField<8, 2, TriangleTopology> triangle_topology;
};
BitField<8, 2, TriangleTopology> triangle_topology;
u32 restart_primitive;
@ -1175,9 +1155,8 @@ struct Regs {
INSERT_PADDING_WORDS(0x4);
union {
BitField<0, 4, u32> num_input_attributes; // Number of input attributes to shader unit - 1
BitField<8, 4, u32> use_subdivision;
BitField<24, 8, u32> use_geometry_shader;
// Number of input attributes to shader unit - 1
BitField<0, 4, u32> num_input_attributes;
};
// Offset to shader program entry point (in words)
@ -1229,8 +1208,6 @@ struct Regs {
}
union {
u32 setup;
// Index of the next uniform to write to
// TODO: ctrulib uses 8 bits for this, however that seems to yield lots of invalid indices
// TODO: Maybe the uppermost index is for the geometry shader? Investigate!
@ -1347,11 +1324,7 @@ ASSERT_REG_POSITION(trigger_draw, 0x22e);
ASSERT_REG_POSITION(trigger_draw_indexed, 0x22f);
ASSERT_REG_POSITION(vs_default_attributes_setup, 0x232);
ASSERT_REG_POSITION(command_buffer, 0x238);
ASSERT_REG_POSITION(vs_com_mode, 0x244);
ASSERT_REG_POSITION(gpu_mode, 0x245);
ASSERT_REG_POSITION(vs_outmap_total1, 0x24A);
ASSERT_REG_POSITION(vs_outmap_total2, 0x251);
ASSERT_REG_POSITION(gsh_misc0, 0x252);
ASSERT_REG_POSITION(triangle_topology, 0x25e);
ASSERT_REG_POSITION(restart_primitive, 0x25f);
ASSERT_REG_POSITION(gs, 0x280);
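The ASSERT_REG_POSITION and PICA_REG_INDEX_WORKAROUND lines above tie register names to fixed word offsets. The sketch below shows the general idea with an example struct and macro that are illustrative only; the _WORKAROUND variant in the diff presumably carries the hard-coded fallback index for compilers where offsetof cannot be used on these unions.

// Illustrative sketch only (not part of this commit): a register index macro in
// the spirit of PICA_REG_INDEX, mapping a Regs field to its 32-bit word offset.
#include <cstddef>
#include <cstdint>

struct ExampleRegs {
    std::uint32_t pad[0x25e];
    std::uint32_t triangle_topology; // lives at word offset 0x25e
};

#define EXAMPLE_REG_INDEX(field) (offsetof(ExampleRegs, field) / sizeof(std::uint32_t))

static_assert(EXAMPLE_REG_INDEX(triangle_topology) == 0x25e,
              "word index mirrors the ASSERT_REG_POSITION value above");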

View File

@ -17,13 +17,9 @@ struct State {
/// Pica registers
Regs regs;
Shader::UnitState<false> shader_units[4];
Shader::ShaderSetup vs;
Shader::ShaderSetup gs;
Math::Vec4<float24> vs_default_attributes[16];
struct {
union LutEntry {
// Used for raw access
@ -60,15 +56,6 @@ struct State {
// This is constructed with a dummy triangle topology
PrimitiveAssembler<Shader::OutputVertex> primitive_assembler;
/// Current geometry shader state
struct GeometryShaderState {
// Buffer used for geometry shader inputs
Shader::InputVertex buffer;
// The current index into the buffer
unsigned int index;
} gs_input_buffer;
};
extern State g_state; ///< Current Pica state

View File

@ -20,6 +20,7 @@ template<typename VertexType>
void PrimitiveAssembler<VertexType>::SubmitVertex(VertexType& vtx, TriangleHandler triangle_handler)
{
switch (topology) {
// TODO: Figure out what's different with TriangleTopology::Shader.
case Regs::TriangleTopology::List:
case Regs::TriangleTopology::Shader:
if (buffer_index < 2) {

View File

@ -27,7 +27,83 @@ namespace Pica {
namespace Shader {
OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) {
#ifdef ARCHITECTURE_x86_64
static std::unordered_map<u64, std::unique_ptr<JitShader>> shader_map;
static const JitShader* jit_shader;
#endif // ARCHITECTURE_x86_64
void Setup() {
#ifdef ARCHITECTURE_x86_64
if (VideoCore::g_shader_jit_enabled) {
u64 cache_key = (Common::ComputeHash64(&g_state.vs.program_code, sizeof(g_state.vs.program_code)) ^
Common::ComputeHash64(&g_state.vs.swizzle_data, sizeof(g_state.vs.swizzle_data)));
auto iter = shader_map.find(cache_key);
if (iter != shader_map.end()) {
jit_shader = iter->second.get();
} else {
auto shader = std::make_unique<JitShader>();
shader->Compile();
jit_shader = shader.get();
shader_map[cache_key] = std::move(shader);
}
}
#endif // ARCHITECTURE_x86_64
}
void Shutdown() {
#ifdef ARCHITECTURE_x86_64
shader_map.clear();
#endif // ARCHITECTURE_x86_64
}
static Common::Profiling::TimingCategory shader_category("Vertex Shader");
MICROPROFILE_DEFINE(GPU_VertexShader, "GPU", "Vertex Shader", MP_RGB(50, 50, 240));
OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attributes) {
auto& config = g_state.regs.vs;
Common::Profiling::ScopeTimer timer(shader_category);
MICROPROFILE_SCOPE(GPU_VertexShader);
state.program_counter = config.main_offset;
state.debug.max_offset = 0;
state.debug.max_opdesc_id = 0;
// Setup input register table
const auto& attribute_register_map = config.input_register_map;
// TODO: Instead of this cumbersome logic, just load the input data directly like
// for (int attr = 0; attr < num_attributes; ++attr) { input_attr[0] = state.registers.input[attribute_register_map.attribute0_register]; }
if (num_attributes > 0) state.registers.input[attribute_register_map.attribute0_register] = input.attr[0];
if (num_attributes > 1) state.registers.input[attribute_register_map.attribute1_register] = input.attr[1];
if (num_attributes > 2) state.registers.input[attribute_register_map.attribute2_register] = input.attr[2];
if (num_attributes > 3) state.registers.input[attribute_register_map.attribute3_register] = input.attr[3];
if (num_attributes > 4) state.registers.input[attribute_register_map.attribute4_register] = input.attr[4];
if (num_attributes > 5) state.registers.input[attribute_register_map.attribute5_register] = input.attr[5];
if (num_attributes > 6) state.registers.input[attribute_register_map.attribute6_register] = input.attr[6];
if (num_attributes > 7) state.registers.input[attribute_register_map.attribute7_register] = input.attr[7];
if (num_attributes > 8) state.registers.input[attribute_register_map.attribute8_register] = input.attr[8];
if (num_attributes > 9) state.registers.input[attribute_register_map.attribute9_register] = input.attr[9];
if (num_attributes > 10) state.registers.input[attribute_register_map.attribute10_register] = input.attr[10];
if (num_attributes > 11) state.registers.input[attribute_register_map.attribute11_register] = input.attr[11];
if (num_attributes > 12) state.registers.input[attribute_register_map.attribute12_register] = input.attr[12];
if (num_attributes > 13) state.registers.input[attribute_register_map.attribute13_register] = input.attr[13];
if (num_attributes > 14) state.registers.input[attribute_register_map.attribute14_register] = input.attr[14];
if (num_attributes > 15) state.registers.input[attribute_register_map.attribute15_register] = input.attr[15];
state.conditional_code[0] = false;
state.conditional_code[1] = false;
#ifdef ARCHITECTURE_x86_64
if (VideoCore::g_shader_jit_enabled)
jit_shader->Run(&state.registers, g_state.regs.vs.main_offset);
else
RunInterpreter(state);
#else
RunInterpreter(state);
#endif // ARCHITECTURE_x86_64
// Setup output data
OutputVertex ret;
// TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to
@ -38,10 +114,10 @@ OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) {
if (index >= g_state.regs.vs_output_total)
break;
if ((config.output_mask & (1 << i)) == 0)
if ((g_state.regs.vs.output_mask & (1 << i)) == 0)
continue;
const auto& output_register_map = g_state.regs.vs_output_attributes[index];
const auto& output_register_map = g_state.regs.vs_output_attributes[index]; // TODO: Don't hardcode VS here
u32 semantics[4] = {
output_register_map.map_x, output_register_map.map_y,
@ -51,7 +127,7 @@ OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) {
for (unsigned comp = 0; comp < 4; ++comp) {
float24* out = ((float24*)&ret) + semantics[comp];
if (semantics[comp] != Regs::VSOutputAttributes::INVALID) {
*out = value[i][comp];
*out = state.registers.output[i][comp];
} else {
// Zero output so that attributes which aren't output won't have denormals in them,
// which would slow us down later.
@ -79,71 +155,10 @@ OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) {
return ret;
}
#ifdef ARCHITECTURE_x86_64
static std::unordered_map<u64, std::shared_ptr<JitShader>> shader_map;
#endif // ARCHITECTURE_x86_64
void ShaderSetup::Setup() {
#ifdef ARCHITECTURE_x86_64
if (VideoCore::g_shader_jit_enabled) {
u64 cache_key = (Common::ComputeHash64(&program_code, sizeof(program_code)) ^
Common::ComputeHash64(&swizzle_data, sizeof(swizzle_data)));
auto iter = shader_map.find(cache_key);
if (iter != shader_map.end()) {
jit_shader = iter->second;
} else {
auto shader = std::make_shared<JitShader>();
shader->Compile(*this);
jit_shader = shader;
shader_map[cache_key] = std::move(shader);
}
} else {
jit_shader.reset();
}
#endif // ARCHITECTURE_x86_64
}
void ShaderSetup::Shutdown() {
#ifdef ARCHITECTURE_x86_64
shader_map.clear();
#endif // ARCHITECTURE_x86_64
}
static Common::Profiling::TimingCategory shader_category("Shader");
MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240));
void ShaderSetup::Run(UnitState<false>& state, const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config) {
Common::Profiling::ScopeTimer timer(shader_category);
MICROPROFILE_SCOPE(GPU_Shader);
state.debug.max_offset = 0;
state.debug.max_opdesc_id = 0;
// Setup input register table
const auto& attribute_register_map = config.input_register_map;
for (unsigned i = 0; i < num_attributes; i++)
state.registers.input[attribute_register_map.GetRegisterForAttribute(i)] = input.attr[i];
state.conditional_code[0] = false;
state.conditional_code[1] = false;
#ifdef ARCHITECTURE_x86_64
if (auto shader = jit_shader.lock())
shader.get()->Run(config, *this, state);
else
RunInterpreter(config, *this, state);
#else
RunInterpreter(config, *this, state);
#endif // ARCHITECTURE_x86_64
}
DebugData<true> ShaderSetup::ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config) {
DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const ShaderSetup& setup) {
UnitState<true> state;
state.program_counter = config.main_offset;
state.debug.max_offset = 0;
state.debug.max_opdesc_id = 0;
@ -152,218 +167,30 @@ DebugData<true> ShaderSetup::ProduceDebugInfo(const InputVertex& input, int num_
float24 dummy_register;
boost::fill(state.registers.input, &dummy_register);
for (unsigned i = 0; i < num_attributes; i++)
state.registers.input[attribute_register_map.GetRegisterForAttribute(i)] = input.attr[i];
if (num_attributes > 0) state.registers.input[attribute_register_map.attribute0_register] = &input.attr[0].x;
if (num_attributes > 1) state.registers.input[attribute_register_map.attribute1_register] = &input.attr[1].x;
if (num_attributes > 2) state.registers.input[attribute_register_map.attribute2_register] = &input.attr[2].x;
if (num_attributes > 3) state.registers.input[attribute_register_map.attribute3_register] = &input.attr[3].x;
if (num_attributes > 4) state.registers.input[attribute_register_map.attribute4_register] = &input.attr[4].x;
if (num_attributes > 5) state.registers.input[attribute_register_map.attribute5_register] = &input.attr[5].x;
if (num_attributes > 6) state.registers.input[attribute_register_map.attribute6_register] = &input.attr[6].x;
if (num_attributes > 7) state.registers.input[attribute_register_map.attribute7_register] = &input.attr[7].x;
if (num_attributes > 8) state.registers.input[attribute_register_map.attribute8_register] = &input.attr[8].x;
if (num_attributes > 9) state.registers.input[attribute_register_map.attribute9_register] = &input.attr[9].x;
if (num_attributes > 10) state.registers.input[attribute_register_map.attribute10_register] = &input.attr[10].x;
if (num_attributes > 11) state.registers.input[attribute_register_map.attribute11_register] = &input.attr[11].x;
if (num_attributes > 12) state.registers.input[attribute_register_map.attribute12_register] = &input.attr[12].x;
if (num_attributes > 13) state.registers.input[attribute_register_map.attribute13_register] = &input.attr[13].x;
if (num_attributes > 14) state.registers.input[attribute_register_map.attribute14_register] = &input.attr[14].x;
if (num_attributes > 15) state.registers.input[attribute_register_map.attribute15_register] = &input.attr[15].x;
state.conditional_code[0] = false;
state.conditional_code[1] = false;
RunInterpreter(config, *this, state);
RunInterpreter(state);
return state.debug;
}
bool SharedGS() {
return g_state.regs.vs_com_mode == Pica::Regs::VSComMode::Shared;
}
bool UseGS() {
// TODO(ds84182): This would be more accurate if it looked at individual shader units for the geoshader bit
// gs_regs.input_buffer_config.use_geometry_shader == 0x08
ASSERT((g_state.regs.using_geometry_shader == 0) || (g_state.regs.using_geometry_shader == 2));
return g_state.regs.using_geometry_shader == 2;
}
UnitState<false>& GetShaderUnit(bool gs) {
// GS are always run on shader unit 3
if (gs) {
return g_state.shader_units[3];
}
// The worst scheduler you'll ever see!
//TODO: How does PICA shader scheduling work?
static unsigned shader_unit_scheduler = 0;
shader_unit_scheduler++;
shader_unit_scheduler %= 3; // TODO: When does it also allow use of unit 3?!
return g_state.shader_units[shader_unit_scheduler];
}
void WriteUniformBoolReg(bool gs, u32 value) {
auto& setup = gs ? g_state.gs : g_state.vs;
ASSERT(setup.uniforms.b.size() == 16);
for (unsigned i = 0; i < 16; ++i)
setup.uniforms.b[i] = (value & (1 << i)) != 0;
// Copy for GS in shared mode
if (!gs && SharedGS()) {
WriteUniformBoolReg(true, value);
}
}
void WriteUniformIntReg(bool gs, unsigned index, const Math::Vec4<u8>& values) {
const char* shader_type = gs ? "GS" : "VS";
auto& setup = gs ? g_state.gs : g_state.vs;
ASSERT(index < setup.uniforms.i.size());
setup.uniforms.i[index] = values;
LOG_TRACE(HW_GPU, "Set %s integer uniform %d to %02x %02x %02x %02x",
shader_type, index, values.x.Value(), values.y.Value(), values.z.Value(), values.w.Value());
// Copy for GS in shared mode
if (!gs && SharedGS()) {
WriteUniformIntReg(true, index, values);
}
}
void WriteUniformFloatSetupReg(bool gs, u32 value) {
auto& config = gs ? g_state.regs.gs : g_state.regs.vs;
config.uniform_setup.setup = value;
// Copy for GS in shared mode
if (!gs && SharedGS()) {
WriteUniformFloatSetupReg(true, value);
}
}
void WriteUniformFloatReg(bool gs, u32 value) {
const char* shader_type = gs ? "GS" : "VS";
auto& config = gs ? g_state.regs.gs : g_state.regs.vs;
auto& setup = gs ? g_state.gs : g_state.vs;
auto& uniform_setup = config.uniform_setup;
auto& uniform_write_buffer = setup.uniform_write_buffer;
auto& float_regs_counter = setup.float_regs_counter;
// TODO: Does actual hardware indeed keep an intermediate buffer or does
// it directly write the values?
uniform_write_buffer[float_regs_counter++] = value;
// Uniforms are written in a packed format such that four float24 values are encoded in
// three 32-bit numbers. We write to internal memory once a full such vector is
// written.
if ((float_regs_counter >= 4 && uniform_setup.IsFloat32()) ||
(float_regs_counter >= 3 && !uniform_setup.IsFloat32())) {
float_regs_counter = 0;
auto& uniform = setup.uniforms.f[uniform_setup.index];
if (uniform_setup.index >= 96) {
LOG_ERROR(HW_GPU, "Invalid %s float uniform index %d", shader_type, (int)uniform_setup.index);
} else {
// NOTE: The destination component order indeed is "backwards"
if (uniform_setup.IsFloat32()) {
for (auto i : {0,1,2,3})
uniform[3 - i] = float24::FromFloat32(*(float*)(&uniform_write_buffer[i]));
} else {
// TODO: Untested
uniform.w = float24::FromRaw(uniform_write_buffer[0] >> 8);
uniform.z = float24::FromRaw(((uniform_write_buffer[0] & 0xFF) << 16) | ((uniform_write_buffer[1] >> 16) & 0xFFFF));
uniform.y = float24::FromRaw(((uniform_write_buffer[1] & 0xFFFF) << 8) | ((uniform_write_buffer[2] >> 24) & 0xFF));
uniform.x = float24::FromRaw(uniform_write_buffer[2] & 0xFFFFFF);
}
LOG_TRACE(HW_GPU, "Set %s float uniform %x to (%f %f %f %f)", shader_type, (int)uniform_setup.index,
uniform.x.ToFloat32(), uniform.y.ToFloat32(), uniform.z.ToFloat32(),
uniform.w.ToFloat32());
// TODO: Verify that this actually modifies the register!
uniform_setup.index.Assign(uniform_setup.index + 1);
}
}
// Copy for GS in shared mode
if (!gs && SharedGS()) {
WriteUniformFloatReg(true, value);
}
}
void WriteProgramCodeOffset(bool gs, u32 value) {
auto& config = gs ? g_state.regs.gs : g_state.regs.vs;
config.program.offset = value;
// Copy for GS in shared mode
if (!gs && SharedGS()) {
WriteProgramCodeOffset(true, value);
}
}
void WriteProgramCode(bool gs, u32 value) {
const char* shader_type = gs ? "GS" : "VS";
auto& config = gs ? g_state.regs.gs : g_state.regs.vs;
auto& setup = gs ? g_state.gs : g_state.vs;
if (config.program.offset >= setup.program_code.size()) {
LOG_ERROR(HW_GPU, "Invalid %s program offset %d", shader_type, (int)config.program.offset);
} else {
setup.program_code[config.program.offset] = value;
config.program.offset++;
}
// Copy for GS in shared mode
if (!gs && SharedGS()) {
WriteProgramCode(true, value);
}
}
void WriteSwizzlePatternsOffset(bool gs, u32 value) {
auto& config = gs ? g_state.regs.gs : g_state.regs.vs;
config.swizzle_patterns.offset = value;
// Copy for GS in shared mode
if (!gs && SharedGS()) {
WriteSwizzlePatternsOffset(true, value);
}
}
void WriteSwizzlePatterns(bool gs, u32 value) {
const char* shader_type = gs ? "GS" : "VS";
auto& config = gs ? g_state.regs.gs : g_state.regs.vs;
auto& setup = gs ? g_state.gs : g_state.vs;
if (config.swizzle_patterns.offset >= setup.swizzle_data.size()) {
LOG_ERROR(HW_GPU, "Invalid %s swizzle pattern offset %d", shader_type, (int)config.swizzle_patterns.offset);
} else {
setup.swizzle_data[config.swizzle_patterns.offset] = value;
config.swizzle_patterns.offset++;
}
// Copy for GS in shared mode
if (!gs && SharedGS()) {
WriteSwizzlePatterns(true, value);
}
}
template<bool Debug>
void HandleEMIT(UnitState<Debug>& state) {
auto &config = g_state.regs.gs;
auto &emit_params = state.emit_params;
auto &emit_buffers = state.emit_buffers;
ASSERT(emit_params.vertex_id < 3);
emit_buffers[emit_params.vertex_id] = state.output_registers;
if (emit_params.primitive_emit) {
ASSERT_MSG(state.emit_triangle_callback, "EMIT invoked but no handler set!");
OutputVertex v0 = emit_buffers[0].ToVertex(config);
OutputVertex v1 = emit_buffers[1].ToVertex(config);
OutputVertex v2 = emit_buffers[2].ToVertex(config);
if (emit_params.winding) {
state.emit_triangle_callback(v2, v1, v0);
} else {
state.emit_triangle_callback(v0, v1, v2);
}
}
}
// Explicit instantiation
template void HandleEMIT(UnitState<false>& state);
template void HandleEMIT(UnitState<true>& state);
} // namespace Shader
} // namespace Pica
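The TODO inside Shader::Run above suggests replacing the unrolled attribute-register if-chain with a loop. A sketch of that loop follows, written against the same local names as that function and assuming the GetRegisterForAttribute() accessor used by the code this commit reverts; it is not part of the post-revert register map, which only exposes attribute0_register through attribute15_register.

// Illustrative sketch only (not part of this commit): the loop the TODO in
// Shader::Run alludes to, using that function's local names.
for (int i = 0; i < num_attributes; ++i) {
    unsigned reg = attribute_register_map.GetRegisterForAttribute(i);
    state.registers.input[reg] = input.attr[i];
}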

View File

@ -4,7 +4,6 @@
#pragma once
#include <memory>
#include <vector>
#include <boost/container/static_vector.hpp>
@ -16,7 +15,6 @@
#include "common/vector_math.h"
#include "video_core/pica.h"
#include "video_core/primitive_assembly.h"
using nihstro::RegisterType;
using nihstro::SourceRegister;
@ -26,11 +24,6 @@ namespace Pica {
namespace Shader {
#ifdef ARCHITECTURE_x86_64
// Forward declare JitShader because shader_jit_x64.h requires ShaderSetup (which uses JitShader) from this file
class JitShader;
#endif // ARCHITECTURE_x86_64
struct InputVertex {
Math::Vec4<float24> attr[16];
};
@ -84,14 +77,22 @@ struct OutputVertex {
static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD");
static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size");
struct OutputRegisters {
OutputRegisters() = default;
/// Vertex shader memory
struct ShaderSetup {
struct {
// The float uniforms are accessed by the shader JIT using SSE instructions, and are
// therefore required to be 16-byte aligned.
alignas(16) Math::Vec4<float24> f[96];
alignas(16) Math::Vec4<float24> value[16];
std::array<bool, 16> b;
std::array<Math::Vec4<u8>, 4> i;
} uniforms;
OutputVertex ToVertex(const Regs::ShaderConfig& config);
Math::Vec4<float24> default_attributes[16];
std::array<u32, 1024> program_code;
std::array<u32, 1024> swizzle_data;
};
static_assert(std::is_pod<OutputRegisters>::value, "Structure is not POD");
// Helper structure used to keep track of data useful for inspection of shader emulation
template<bool full_debugging>
@ -191,9 +192,9 @@ inline void SetField<DebugDataRecord::SRC3>(DebugDataRecord& record, float24* va
record.src3.x = value[0];
record.src3.y = value[1];
record.src3.z = value[2];
record.src3.w = value[3];
}
template<>
inline void SetField<DebugDataRecord::DEST_IN>(DebugDataRecord& record, float24* value) {
record.dest_in.x = value[0];
@ -276,38 +277,43 @@ struct UnitState {
// The registers are accessed by the shader JIT using SSE instructions, and are therefore
// required to be 16-byte aligned.
alignas(16) Math::Vec4<float24> input[16];
alignas(16) Math::Vec4<float24> output[16];
alignas(16) Math::Vec4<float24> temporary[16];
} registers;
static_assert(std::is_pod<Registers>::value, "Structure is not POD");
OutputRegisters emit_buffers[3]; //TODO: 3dbrew suggests this only stores the first 7 output registers
union EmitParameters {
u32 raw;
BitField<22, 1, u32> winding;
BitField<23, 1, u32> primitive_emit;
BitField<24, 2, u32> vertex_id;
} emit_params;
PrimitiveAssembler<OutputVertex>::TriangleHandler emit_triangle_callback;
OutputRegisters output_registers;
u32 program_counter;
bool conditional_code[2];
// Two Address registers and one loop counter
// TODO: How many bits do these actually have?
s32 address_registers[3];
enum {
INVALID_ADDRESS = 0xFFFFFFFF
};
struct CallStackElement {
u32 final_address; // Address upon which we jump to return_address
u32 return_address; // Where to jump when leaving scope
u8 repeat_counter; // How often to repeat until this call stack element is removed
u8 loop_increment; // Which value to add to the loop counter after an iteration
// TODO: Should this be a signed value? Does it even matter?
u32 loop_address; // The address where we'll return to after each loop iteration
};
// TODO: Is there a maximal size for this?
boost::container::static_vector<CallStackElement, 16> call_stack;
DebugData<Debug> debug;
static size_t InputOffset(const SourceRegister& reg) {
switch (reg.GetRegisterType()) {
case RegisterType::Input:
return offsetof(UnitState, registers.input) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
return offsetof(UnitState::Registers, input) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
case RegisterType::Temporary:
return offsetof(UnitState, registers.temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
return offsetof(UnitState::Registers, temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
default:
UNREACHABLE();
@ -318,105 +324,45 @@ struct UnitState {
static size_t OutputOffset(const DestRegister& reg) {
switch (reg.GetRegisterType()) {
case RegisterType::Output:
return offsetof(UnitState, output_registers.value) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
return offsetof(UnitState::Registers, output) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
case RegisterType::Temporary:
return offsetof(UnitState, registers.temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
return offsetof(UnitState::Registers, temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
default:
UNREACHABLE();
return 0;
}
}
static size_t EmitParamsOffset() {
return offsetof(UnitState, emit_params.raw);
}
};
class ShaderSetup {
/**
* Performs any shader unit setup that only needs to happen once per shader (as opposed to once per
* vertex, which would happen within the `Run` function).
*/
void Setup();
public:
/// Performs any cleanup when the emulator is shutdown
void Shutdown();
struct {
// The float uniforms are accessed by the shader JIT using SSE instructions, and are
// therefore required to be 16-byte aligned.
alignas(16) Math::Vec4<float24> f[96];
/**
* Runs the currently setup shader
* @param state Shader unit state, must be setup per shader and per shader unit
* @param input Input vertex into the shader
* @param num_attributes The number of vertex shader attributes
* @return The output vertex, after having been processed by the vertex shader
*/
OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attributes);
std::array<bool, 16> b;
std::array<Math::Vec4<u8>, 4> i;
} uniforms;
static size_t UniformOffset(RegisterType type, unsigned index) {
switch (type) {
case RegisterType::FloatUniform:
return offsetof(ShaderSetup, uniforms.f) + index*sizeof(Math::Vec4<float24>);
case RegisterType::BoolUniform:
return offsetof(ShaderSetup, uniforms.b) + index*sizeof(bool);
case RegisterType::IntUniform:
return offsetof(ShaderSetup, uniforms.i) + index*sizeof(Math::Vec4<u8>);
default:
UNREACHABLE();
return 0;
}
}
int float_regs_counter = 0;
u32 uniform_write_buffer[4];
std::array<u32, 1024> program_code;
std::array<u32, 1024> swizzle_data;
#ifdef ARCHITECTURE_x86_64
std::weak_ptr<const JitShader> jit_shader;
#endif
/**
* Performs any shader setup that only needs to happen once per shader (as opposed to once per
* vertex, which would happen within the `Run` function).
*/
void Setup();
/// Performs any cleanup when the emulator is shutdown
static void Shutdown();
/**
* Runs the currently setup shader
* @param state Shader unit state, must be setup per shader and per shader unit
* @param input Input vertex into the shader
* @param num_attributes The number of vertex shader attributes
* @param config Configuration object for the shader pipeline
*/
void Run(UnitState<false>& state, const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config);
/**
* Produce debug information based on the given shader and input vertex
* @param input Input vertex into the shader
* @param num_attributes The number of vertex shader attributes
* @param config Configuration object for the shader pipeline
* @return Debug information for this shader with regards to the given vertex
*/
DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config);
};
bool SharedGS();
bool UseGS();
UnitState<false>& GetShaderUnit(bool gs);
void WriteUniformBoolReg(bool gs, u32 value);
void WriteUniformIntReg(bool gs, unsigned index, const Math::Vec4<u8>& values);
void WriteUniformFloatSetupReg(bool gs, u32 value);
void WriteUniformFloatReg(bool gs, u32 value);
void WriteProgramCodeOffset(bool gs, u32 value);
void WriteProgramCode(bool gs, u32 value);
void WriteSwizzlePatternsOffset(bool gs, u32 value);
void WriteSwizzlePatterns(bool gs, u32 value);
template<bool Debug>
void HandleEMIT(UnitState<Debug>& state);
/**
* Produce debug information based on the given shader and input vertex
* @param input Input vertex into the shader
* @param num_attributes The number of vertex shader attributes
* @param config Configuration object for the shader pipeline
* @param setup Setup object for the shader pipeline
* @return Debug information for this shader with regards to the given vertex
*/
DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const ShaderSetup& setup);
} // namespace Shader
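The EmitParameters union above packs the SETEMIT operands into a single instruction word. As a sketch (names below are illustrative, not from this commit), the same fields can be decoded by hand from the bit positions shown: bit 22 is winding, bit 23 is primitive_emit, and bits 24-25 are vertex_id.

// Illustrative sketch only (not part of this commit): manual decode of the
// SETEMIT parameter word, equivalent to the BitField layout of EmitParameters.
#include <cstdint>

struct DecodedEmitParams {
    bool winding;
    bool primitive_emit;
    unsigned vertex_id; // 0..2, selects the emit buffer slot
};

DecodedEmitParams DecodeEmitParams(std::uint32_t raw) {
    DecodedEmitParams p;
    p.winding        = ((raw >> 22) & 0x1) != 0;
    p.primitive_emit = ((raw >> 23) & 0x1) != 0;
    p.vertex_id      = (raw >> 24) & 0x3;
    return p;
}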

View File

@ -21,30 +21,11 @@ namespace Pica {
namespace Shader {
enum {
INVALID_ADDRESS = 0xFFFFFFFF
};
struct CallStackElement {
u32 final_address; // Address upon which we jump to return_address
u32 return_address; // Where to jump when leaving scope
u8 repeat_counter; // How often to repeat until this call stack element is removed
u8 loop_increment; // Which value to add to the loop counter after an iteration
// TODO: Should this be a signed value? Does it even matter?
u32 loop_address; // The address where we'll return to after each loop iteration
};
template<bool Debug>
void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& setup, UnitState<Debug>& state) {
// TODO: Is there a maximal size for this?
boost::container::static_vector<CallStackElement, 16> call_stack;
u32 program_counter = config.main_offset;
const auto& uniforms = setup.uniforms;
const auto& swizzle_data = setup.swizzle_data;
const auto& program_code = setup.program_code;
void RunInterpreter(UnitState<Debug>& state) {
const auto& uniforms = g_state.vs.uniforms;
const auto& swizzle_data = g_state.vs.swizzle_data;
const auto& program_code = g_state.vs.program_code;
// Placeholder for invalid inputs
static float24 dummy_vec4_float24[4];
@ -52,16 +33,16 @@ void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& s
unsigned iteration = 0;
bool exit_loop = false;
while (!exit_loop) {
if (!call_stack.empty()) {
auto& top = call_stack.back();
if (program_counter == top.final_address) {
if (!state.call_stack.empty()) {
auto& top = state.call_stack.back();
if (state.program_counter == top.final_address) {
state.address_registers[2] += top.loop_increment;
if (top.repeat_counter-- == 0) {
program_counter = top.return_address;
call_stack.pop_back();
state.program_counter = top.return_address;
state.call_stack.pop_back();
} else {
program_counter = top.loop_address;
state.program_counter = top.loop_address;
}
// TODO: Is "trying again" accurate to hardware?
@ -69,20 +50,20 @@ void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& s
}
}
const Instruction instr = { program_code[program_counter] };
const Instruction instr = { program_code[state.program_counter] };
const SwizzlePattern swizzle = { swizzle_data[instr.common.operand_desc_id] };
static auto call = [&program_counter, &call_stack](UnitState<Debug>& state, u32 offset, u32 num_instructions,
static auto call = [](UnitState<Debug>& state, u32 offset, u32 num_instructions,
u32 return_offset, u8 repeat_count, u8 loop_increment) {
program_counter = offset - 1; // -1 to make sure when incrementing the PC we end up at the correct offset
ASSERT(call_stack.size() < call_stack.capacity());
call_stack.push_back({ offset + num_instructions, return_offset, repeat_count, loop_increment, offset });
state.program_counter = offset - 1; // -1 to make sure when incrementing the PC we end up at the correct offset
ASSERT(state.call_stack.size() < state.call_stack.capacity());
state.call_stack.push_back({ offset + num_instructions, return_offset, repeat_count, loop_increment, offset });
};
Record<DebugDataRecord::CUR_INSTR>(state.debug, iteration, program_counter);
Record<DebugDataRecord::CUR_INSTR>(state.debug, iteration, state.program_counter);
if (iteration > 0)
Record<DebugDataRecord::NEXT_INSTR>(state.debug, iteration - 1, program_counter);
Record<DebugDataRecord::NEXT_INSTR>(state.debug, iteration - 1, state.program_counter);
state.debug.max_offset = std::max<u32>(state.debug.max_offset, 1 + program_counter);
state.debug.max_offset = std::max<u32>(state.debug.max_offset, 1 + state.program_counter);
auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const float24* {
switch (source_reg.GetRegisterType()) {
@ -139,7 +120,7 @@ void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& s
src2[3] = src2[3] * float24::FromFloat32(-1);
}
float24* dest = (instr.common.dest.Value() < 0x10) ? &state.output_registers.value[instr.common.dest.Value().GetIndex()][0]
float24* dest = (instr.common.dest.Value() < 0x10) ? &state.registers.output[instr.common.dest.Value().GetIndex()][0]
: (instr.common.dest.Value() < 0x20) ? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0]
: dummy_vec4_float24;
@ -478,7 +459,7 @@ void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& s
src3[3] = src3[3] * float24::FromFloat32(-1);
}
float24* dest = (instr.mad.dest.Value() < 0x10) ? &state.output_registers.value[instr.mad.dest.Value().GetIndex()][0]
float24* dest = (instr.mad.dest.Value() < 0x10) ? &state.registers.output[instr.mad.dest.Value().GetIndex()][0]
: (instr.mad.dest.Value() < 0x20) ? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0]
: dummy_vec4_float24;
@ -530,7 +511,7 @@ void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& s
case OpCode::Id::JMPC:
Record<DebugDataRecord::COND_CMP_IN>(state.debug, iteration, state.conditional_code);
if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) {
program_counter = instr.flow_control.dest_offset - 1;
state.program_counter = instr.flow_control.dest_offset - 1;
}
break;
@ -538,7 +519,7 @@ void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& s
Record<DebugDataRecord::COND_BOOL_IN>(state.debug, iteration, uniforms.b[instr.flow_control.bool_uniform_id]);
if (uniforms.b[instr.flow_control.bool_uniform_id] == !(instr.flow_control.num_instructions & 1)) {
program_counter = instr.flow_control.dest_offset - 1;
state.program_counter = instr.flow_control.dest_offset - 1;
}
break;
@ -546,7 +527,7 @@ void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& s
call(state,
instr.flow_control.dest_offset,
instr.flow_control.num_instructions,
program_counter + 1, 0, 0);
state.program_counter + 1, 0, 0);
break;
case OpCode::Id::CALLU:
@ -555,7 +536,7 @@ void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& s
call(state,
instr.flow_control.dest_offset,
instr.flow_control.num_instructions,
program_counter + 1, 0, 0);
state.program_counter + 1, 0, 0);
}
break;
@ -565,7 +546,7 @@ void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& s
call(state,
instr.flow_control.dest_offset,
instr.flow_control.num_instructions,
program_counter + 1, 0, 0);
state.program_counter + 1, 0, 0);
}
break;
@ -576,8 +557,8 @@ void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& s
Record<DebugDataRecord::COND_BOOL_IN>(state.debug, iteration, uniforms.b[instr.flow_control.bool_uniform_id]);
if (uniforms.b[instr.flow_control.bool_uniform_id]) {
call(state,
program_counter + 1,
instr.flow_control.dest_offset - program_counter - 1,
state.program_counter + 1,
instr.flow_control.dest_offset - state.program_counter - 1,
instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
} else {
call(state,
@ -595,8 +576,8 @@ void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& s
Record<DebugDataRecord::COND_CMP_IN>(state.debug, iteration, state.conditional_code);
if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) {
call(state,
program_counter + 1,
instr.flow_control.dest_offset - program_counter - 1,
state.program_counter + 1,
instr.flow_control.dest_offset - state.program_counter - 1,
instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
} else {
call(state,
@ -618,24 +599,14 @@ void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& s
Record<DebugDataRecord::LOOP_INT_IN>(state.debug, iteration, loop_param);
call(state,
program_counter + 1,
instr.flow_control.dest_offset - program_counter + 1,
state.program_counter + 1,
instr.flow_control.dest_offset - state.program_counter + 1,
instr.flow_control.dest_offset + 1,
loop_param.x,
loop_param.z);
break;
}
case OpCode::Id::EMIT: {
Shader::HandleEMIT(state);
break;
}
case OpCode::Id::SETEMIT: {
state.emit_params.raw = program_code[program_counter];
break;
}
default:
LOG_ERROR(HW_GPU, "Unhandled instruction: 0x%02x (%s): 0x%08x",
(int)instr.opcode.Value().EffectiveOpCode(), instr.opcode.Value().GetInfo().name, instr.hex);
@ -646,14 +617,14 @@ void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& s
}
}
++program_counter;
++state.program_counter;
++iteration;
}
}
// Explicit instantiation
template void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& setup, UnitState<false>& state);
template void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& setup, UnitState<true>& state);
template void RunInterpreter(UnitState<false>& state);
template void RunInterpreter(UnitState<true>& state);
} // namespace

View File

@ -4,7 +4,6 @@
#pragma once
#include "video_core/pica.h"
#include "video_core/shader/shader.h"
namespace Pica {
@ -12,7 +11,7 @@ namespace Pica {
namespace Shader {
template<bool Debug>
void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& setup, UnitState<Debug>& state);
void RunInterpreter(UnitState<Debug>& state);
} // namespace

View File

@ -65,8 +65,8 @@ const JitFunction instr_table[64] = {
&JitShader::Compile_IF, // ifu
&JitShader::Compile_IF, // ifc
&JitShader::Compile_LOOP, // loop
&JitShader::Compile_EMIT, // emit
&JitShader::Compile_SETEMIT, // setemit
nullptr, // emit
nullptr, // sete
&JitShader::Compile_JMP, // jmpc
&JitShader::Compile_JMP, // jmpu
&JitShader::Compile_CMP, // cmp
@ -94,7 +94,7 @@ const JitFunction instr_table[64] = {
// purposes, as documented below:
/// Pointer to the uniform memory
static const X64Reg SETUP = R9;
static const X64Reg UNIFORMS = R9;
/// The two 32-bit VS address offset registers set by the MOVA instruction
static const X64Reg ADDROFFS_REG_0 = R10;
static const X64Reg ADDROFFS_REG_1 = R11;
@ -109,7 +109,7 @@ static const X64Reg COND0 = R13;
/// Result of the previous CMP instruction for the Y-component comparison
static const X64Reg COND1 = R14;
/// Pointer to the UnitState instance for the current VS unit
static const X64Reg STATE = R15;
static const X64Reg REGISTERS = R15;
/// SIMD scratch register
static const X64Reg SCRATCH = XMM0;
/// Loaded with the first swizzled source register, otherwise can be used as a scratch register
@ -128,7 +128,7 @@ static const X64Reg NEGBIT = XMM15;
// State registers that must not be modified by external functions calls
// Scratch registers, e.g., SRC1 and SCRATCH, have to be saved on the side if needed
static const BitSet32 persistent_regs = {
SETUP, STATE, // Pointers to register blocks
UNIFORMS, REGISTERS, // Pointers to register blocks
ADDROFFS_REG_0, ADDROFFS_REG_1, LOOPCOUNT_REG, COND0, COND1, // Cached registers
ONE+16, NEGBIT+16, // Constants
};
@ -138,6 +138,15 @@ static const u8 NO_SRC_REG_SWIZZLE = 0x1b;
/// Raw constant for the destination register enable mask that indicates all components are enabled
static const u8 NO_DEST_REG_MASK = 0xf;
/**
* Get the vertex shader instruction for a given offset in the current shader program
* @param offset Offset in the current shader program of the instruction
* @return Instruction at the specified offset
*/
static Instruction GetVertexShaderInstruction(size_t offset) {
return { g_state.vs.program_code[offset] };
}
static void LogCritical(const char* msg) {
LOG_CRITICAL(HW_GPU, msg);
}
@ -160,10 +169,10 @@ void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRe
size_t src_offset;
if (src_reg.GetRegisterType() == RegisterType::FloatUniform) {
src_ptr = SETUP;
src_offset = ShaderSetup::UniformOffset(RegisterType::FloatUniform, src_reg.GetIndex());
src_ptr = UNIFORMS;
src_offset = src_reg.GetIndex() * sizeof(float24) * 4;
} else {
src_ptr = STATE;
src_ptr = REGISTERS;
src_offset = UnitState<false>::InputOffset(src_reg);
}
@ -208,7 +217,7 @@ void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRe
MOVAPS(dest, MDisp(src_ptr, src_offset_disp));
}
SwizzlePattern swiz = { setup->swizzle_data[operand_desc_id] };
SwizzlePattern swiz = { g_state.vs.swizzle_data[operand_desc_id] };
// Generate instructions for source register swizzling as needed
u8 sel = swiz.GetRawSelector(src_num);
@ -239,7 +248,7 @@ void JitShader::Compile_DestEnable(Instruction instr,X64Reg src) {
dest = instr.common.dest.Value();
}
SwizzlePattern swiz = { setup->swizzle_data[operand_desc_id] };
SwizzlePattern swiz = { g_state.vs.swizzle_data[operand_desc_id] };
int dest_offset_disp = (int)UnitState<false>::OutputOffset(dest);
ASSERT_MSG(dest_offset_disp == UnitState<false>::OutputOffset(dest), "Destination offset too large for int type");
@ -247,11 +256,11 @@ void JitShader::Compile_DestEnable(Instruction instr,X64Reg src) {
// If all components are enabled, write the result to the destination register
if (swiz.dest_mask == NO_DEST_REG_MASK) {
// Store dest back to memory
MOVAPS(MDisp(STATE, dest_offset_disp), src);
MOVAPS(MDisp(REGISTERS, dest_offset_disp), src);
} else {
// Not all components are enabled, so mask the result when storing to the destination register...
MOVAPS(SCRATCH, MDisp(STATE, dest_offset_disp));
MOVAPS(SCRATCH, MDisp(REGISTERS, dest_offset_disp));
if (Common::GetCPUCaps().sse4_1) {
u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1);
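The mask computed above mirrors the four destination-enable bits: the PICA dest mask keeps x in its highest bit, while the SSE blend immediate expects x in its lowest bit. A worked example of that nibble reversal (a sketch; the helper name is illustrative):
#include <cstdint>
constexpr uint8_t ReverseDestMask(uint8_t m) {
    return static_cast<uint8_t>(((m & 1) << 3) | ((m & 8) >> 3) | ((m & 2) << 1) | ((m & 4) >> 1));
}
static_assert(ReverseDestMask(0x8) == 0x1, "an x-only dest mask selects SSE element 0");
static_assert(ReverseDestMask(0xA) == 0x5, "an x,z dest mask selects SSE elements 0 and 2");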
@ -270,7 +279,7 @@ void JitShader::Compile_DestEnable(Instruction instr,X64Reg src) {
}
// Store dest back to memory
MOVAPS(MDisp(STATE, dest_offset_disp), SCRATCH);
MOVAPS(MDisp(REGISTERS, dest_offset_disp), SCRATCH);
}
}
@ -319,8 +328,8 @@ void JitShader::Compile_EvaluateCondition(Instruction instr) {
}
void JitShader::Compile_UniformCondition(Instruction instr) {
int offset = ShaderSetup::UniformOffset(RegisterType::BoolUniform, instr.flow_control.bool_uniform_id);
CMP(sizeof(bool) * 8, MDisp(SETUP, offset), Imm8(0));
int offset = offsetof(decltype(g_state.vs.uniforms), b) + (instr.flow_control.bool_uniform_id * sizeof(bool));
CMP(sizeof(bool) * 8, MDisp(UNIFORMS, offset), Imm8(0));
}
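Both this offsetof() computation and the one in Compile_LOOP below assume a particular layout for g_state.vs.uniforms. A minimal layout sketch, using the field names b and i from the offsetof() calls; the float bank at offset 0 is implied by the UNIFORMS-relative addressing in Compile_SwizzleSrc, and the element types are placeholders:
#include <cstdint>
struct UniformsSketch {
    alignas(16) float f[96][4]; // float uniforms, 16-byte aligned so MOVAPS can load a full XMM register
    bool b[16];                 // bool uniforms: UNIFORMS + offsetof(UniformsSketch, b) + id * sizeof(bool)
    uint8_t i[4][4];            // int uniforms:  UNIFORMS + offsetof(UniformsSketch, i) + id * 4, bytes ordered x,y,z,w
};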
BitSet32 JitShader::PersistentCallerSavedRegs() {
@ -495,7 +504,7 @@ void JitShader::Compile_MIN(Instruction instr) {
}
void JitShader::Compile_MOVA(Instruction instr) {
SwizzlePattern swiz = { setup->swizzle_data[instr.common.operand_desc_id] };
SwizzlePattern swiz = { g_state.vs.swizzle_data[instr.common.operand_desc_id] };
if (!swiz.DestComponentEnabled(0) && !swiz.DestComponentEnabled(1)) {
return; // NoOp
@ -697,8 +706,8 @@ void JitShader::Compile_LOOP(Instruction instr) {
looping = true;
int offset = ShaderSetup::UniformOffset(RegisterType::IntUniform, instr.flow_control.int_uniform_id);
MOV(32, R(LOOPCOUNT), MDisp(SETUP, offset));
int offset = offsetof(decltype(g_state.vs.uniforms), i) + (instr.flow_control.int_uniform_id * sizeof(Math::Vec4<u8>));
MOV(32, R(LOOPCOUNT), MDisp(UNIFORMS, offset));
MOV(32, R(LOOPCOUNT_REG), R(LOOPCOUNT));
SHR(32, R(LOOPCOUNT_REG), Imm8(8));
AND(32, R(LOOPCOUNT_REG), Imm32(0xff)); // Y-component is the start
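A short worked example of the unpacking above, assuming the packed integer uniform's bytes land in x,y,z,w order on the little-endian x86 host (values are hypothetical):
#include <cstdint>
constexpr uint32_t packed = 0x00030205;           // x=5, y=2, z=3, w=0
constexpr uint32_t start  = (packed >> 8) & 0xff; // == 2: the y component, the loop counter's initial value
constexpr uint32_t count  =  packed & 0xff;       // == 5: the x component, which drives the iteration count
static_assert(start == 2 && count == 5, "SHR 8 / AND 0xff isolates the y byte");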
@ -719,22 +728,6 @@ void JitShader::Compile_LOOP(Instruction instr) {
looping = false;
}
static void Handle_EMIT(void* param1) {
UnitState<false>& state = *static_cast<UnitState<false>*>(param1);
Shader::HandleEMIT(state);
};
void JitShader::Compile_EMIT(Instruction instr) {
ABI_PushRegistersAndAdjustStack(PersistentCallerSavedRegs(), 0);
MOV(PTRBITS, R(ABI_PARAM1), R(STATE));
ABI_CallFunctionR(reinterpret_cast<const void*>(Handle_EMIT), ABI_PARAM1);
ABI_PopRegistersAndAdjustStack(PersistentCallerSavedRegs(), 0);
}
void JitShader::Compile_SETEMIT(Instruction instr) {
MOV(32, MDisp(STATE, UnitState<false>::EmitParamsOffset()), Imm32(*(u32*)&instr.setemit));
}
void JitShader::Compile_JMP(Instruction instr) {
if (instr.opcode.Value() == OpCode::Id::JMPC)
Compile_EvaluateCondition(instr);
@ -775,7 +768,7 @@ void JitShader::Compile_NextInstr() {
ASSERT_MSG(code_ptr[program_counter] == nullptr, "Tried to compile already compiled shader location!");
code_ptr[program_counter] = GetCodePtr();
Instruction instr = GetShaderInstruction(program_counter++);
Instruction instr = GetVertexShaderInstruction(program_counter++);
OpCode::Id opcode = instr.opcode.Value();
auto instr_func = instr_table[static_cast<unsigned>(opcode)];
@ -793,8 +786,8 @@ void JitShader::Compile_NextInstr() {
void JitShader::FindReturnOffsets() {
return_offsets.clear();
for (size_t offset = 0; offset < setup->program_code.size(); ++offset) {
Instruction instr = GetShaderInstruction(offset);
for (size_t offset = 0; offset < g_state.vs.program_code.size(); ++offset) {
Instruction instr = GetVertexShaderInstruction(offset);
switch (instr.opcode.Value()) {
case OpCode::Id::CALL:
@ -809,11 +802,7 @@ void JitShader::FindReturnOffsets() {
std::sort(return_offsets.begin(), return_offsets.end());
}
void JitShader::Compile(const ShaderSetup& setup) {
// Get a pointer to the setup to access program_code and swizzle_data
this->setup = &setup;
void JitShader::Compile() {
// Reset flow control state
program = (CompiledShader*)GetCodePtr();
program_counter = 0;
@ -827,8 +816,8 @@ void JitShader::Compile(const ShaderSetup& setup) {
// The stack pointer is 8 modulo 16 at the entry of a procedure
ABI_PushRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8);
MOV(PTRBITS, R(SETUP), R(ABI_PARAM1));
MOV(PTRBITS, R(STATE), R(ABI_PARAM3));
MOV(PTRBITS, R(REGISTERS), R(ABI_PARAM1));
MOV(PTRBITS, R(UNIFORMS), ImmPtr(&g_state.vs.uniforms));
// Zero address/loop registers
XOR(64, R(ADDROFFS_REG_0), R(ADDROFFS_REG_0));
@ -849,7 +838,7 @@ void JitShader::Compile(const ShaderSetup& setup) {
JMPptr(R(ABI_PARAM2));
// Compile entire program
Compile_Block(static_cast<unsigned>(this->setup->program_code.size()));
Compile_Block(static_cast<unsigned>(g_state.vs.program_code.size()));
// Set the target for any incomplete branches now that the entire shader program has been emitted
for (const auto& branch : fixup_branches) {
@ -866,10 +855,6 @@ void JitShader::Compile(const ShaderSetup& setup) {
ASSERT_MSG(size <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!");
LOG_DEBUG(HW_GPU, "Compiled shader size=%d", size);
// We don't need the setup anymore
this->setup = nullptr;
}
JitShader::JitShader() {

View File

@ -33,11 +33,11 @@ class JitShader : public Gen::XCodeBlock {
public:
JitShader();
void Run(const Pica::Regs::ShaderConfig& config, const ShaderSetup& setup, UnitState<false>& state) const {
program(&setup, code_ptr[config.main_offset], &state);
void Run(void* registers, unsigned offset) const {
program(registers, code_ptr[offset]);
}
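A hedged usage sketch of this reverted interface (illustrative only; the real call site lives in video_core/shader/shader.cpp): the caller hands over the unit's register block and an entry-point offset, while program code, swizzle data and uniforms are read from the global g_state.vs.
// Sketch: assumes 'state' is the current Shader::UnitState<false> and that its
// register block is what the JIT's REGISTERS (R15) pointer indexes into.
JitShader jit;
jit.Compile();                                          // compile g_state.vs.program_code once
jit.Run(&state.registers, g_state.regs.vs.main_offset); // run one vertex through the compiled program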
void Compile(const ShaderSetup& setup);
void Compile();
void Compile_ADD(Instruction instr);
void Compile_DP3(Instruction instr);
@ -62,8 +62,6 @@ public:
void Compile_CALLU(Instruction instr);
void Compile_IF(Instruction instr);
void Compile_LOOP(Instruction instr);
void Compile_EMIT(Instruction instr);
void Compile_SETEMIT(Instruction instr);
void Compile_JMP(Instruction instr);
void Compile_CMP(Instruction instr);
void Compile_MAD(Instruction instr);
@ -98,17 +96,6 @@ private:
*/
void Compile_Assert(bool condition, const char* msg);
/**
* Get the shader instruction for a given offset in the current shader program
* @param offset Offset in the current shader program of the instruction
* @return Instruction at the specified offset
*/
Instruction GetShaderInstruction(size_t offset) {
Instruction instruction;
std::memcpy(&instruction, &setup->program_code[offset], sizeof(Instruction));
return instruction;
}
/**
* Analyzes the entire shader program for `CALL` instructions before emitting any code,
* identifying the locations where a return needs to be inserted.
@ -127,10 +114,8 @@ private:
/// Branches that need to be fixed up once the entire shader program is compiled
std::vector<std::pair<Gen::FixupBranch, unsigned>> fixup_branches;
using CompiledShader = void(const void* setup, const u8* start_addr, void* state);
using CompiledShader = void(void* registers, const u8* start_addr);
CompiledShader* program = nullptr;
const ShaderSetup* setup = nullptr;
};
} // Shader