vertex_shader: Optimized SIMD-based vertex shader interpreter for x86 systems.

2015-07-08 22:55:37 -04:00
parent b7ac4cb54e
commit f468dcd799
10 changed files with 640 additions and 2 deletions
--- a/externals/nihstro
+++ b/externals/nihstro
--- a/src/citra/config.cpp
+++ b/src/citra/config.cpp
@@ -69,6 +69,7 @@ void Config::ReadValues() {
    Settings::values.frame_skip = glfw_config->GetInteger("Core", "frame_skip", 0);

    // Renderer
+    Settings::values.shader_core = glfw_config->GetBoolean("Renderer", "shader_core", 0);
    Settings::values.use_hw_renderer = glfw_config->GetBoolean("Renderer", "use_hw_renderer", false);

    Settings::values.bg_red   = (float)glfw_config->GetReal("Renderer", "bg_red",   1.0);
--- a/src/citra/default_ini.h
+++ b/src/citra/default_ini.h
@@ -42,6 +42,10 @@ gpu_refresh_rate =
 frame_skip =

 [Renderer]
+# Shader core to use for emulation of vertex shaders
+# 0 (default): Optimized interpreter, 1: Interpreter
+shader_core =
+
 # Whether to use software or hardware rendering.
 # 0 (default): Software, 1: Hardware
 use_hw_renderer =
--- a/src/citra_qt/config.cpp
+++ b/src/citra_qt/config.cpp
@@ -53,6 +53,7 @@ void Config::ReadValues() {
    qt_config->endGroup();

    qt_config->beginGroup("Renderer");
+    Settings::values.shader_core = qt_config->value("shader_core", 0).toInt();
    Settings::values.use_hw_renderer = qt_config->value("use_hw_renderer", false).toBool();

    Settings::values.bg_red   = qt_config->value("bg_red",   1.0).toFloat();
@@ -105,6 +106,7 @@ void Config::SaveValues() {
    qt_config->endGroup();

    qt_config->beginGroup("Renderer");
+    qt_config->setValue("shader_core", Settings::values.shader_core);
    qt_config->setValue("use_hw_renderer", Settings::values.use_hw_renderer);

    // Cast to double because Qt's written float values are not human-readable
--- a/src/core/settings.h
+++ b/src/core/settings.h
@@ -8,6 +8,11 @@

 namespace Settings {

+enum ShaderCore {
+    OptimizedInterpreter,
+    Interpreter
+};
+
 struct Values {
    // Controls
    int pad_a_key;
@@ -44,6 +49,7 @@ struct Values {
    int region_value;

    // Renderer
+    int shader_core;
    bool use_hw_renderer;

    float bg_red;
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -14,6 +14,7 @@ set(SRCS
            rasterizer.cpp
            utils.cpp
            vertex_shader.cpp
+            vertex_shader_simd.cpp
            video_core.cpp
            )

@@ -38,6 +39,7 @@ set(HEADERS
            renderer_base.h
            utils.h
            vertex_shader.h
+            vertex_shader_simd.h
            video_core.h
            )

--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -5,6 +5,7 @@
 #include <boost/range/algorithm/fill.hpp>

 #include "common/profiler.h"
+#include "common/cpu_detect.h"

 #include "clipper.h"
 #include "command_processor.h"
@@ -12,6 +13,7 @@
 #include "pica.h"
 #include "primitive_assembly.h"
 #include "vertex_shader.h"
+#include "vertex_shader_simd.h"
 #include "video_core.h"
 #include "core/hle/service/gsp_gpu.h"
 #include "core/hw/gpu.h"
@@ -121,6 +123,11 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
            PrimitiveAssembler<VertexShader::OutputVertex> primitive_assembler(regs.triangle_topology.Value());
            PrimitiveAssembler<DebugUtils::GeometryDumper::Vertex> dumping_primitive_assembler(regs.triangle_topology.Value());

+            VertexShaderSIMD::CoreState state;
+            if (Settings::values.shader_core == Settings::OptimizedInterpreter) {
+                VertexShaderSIMD::InitCore(state);
+            }
+
            for (unsigned int index = 0; index < regs.num_vertices; ++index)
            {
                unsigned int vertex = is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index]) : index;
@@ -190,7 +197,12 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
                                                                   &geometry_dumper, _1, _2, _3));

                // Send to vertex shader
-                VertexShader::OutputVertex output = VertexShader::RunShader(input, attribute_config.GetNumTotalAttributes());
+                VertexShader::OutputVertex output;
+                if (Settings::values.shader_core == Settings::OptimizedInterpreter && Common::cpu_info.bSSE4_1) {
+                    output = VertexShaderSIMD::RunShader(state, input, attribute_config.GetNumTotalAttributes());
+                } else {
+                    output = VertexShader::RunShader(input, attribute_config.GetNumTotalAttributes());
+                }

                if (is_indexed) {
                    // TODO: Add processed vertex to vertex cache!
--- a/src/video_core/pica.cpp
+++ b/src/video_core/pica.cpp
@@ -6,11 +6,16 @@

 #include "pica.h"

+#include "vertex_shader_simd.h"
+
 namespace Pica {

 State g_state;

 void Init() {
+#if _M_SSE >= 0x401
+    VertexShaderSIMD::Init();
+#endif
 }

 void Shutdown() {
--- a/src/video_core/vertex_shader_simd.cpp
+++ b/src/video_core/vertex_shader_simd.cpp
@@ -0,0 +1,528 @@
+// Copyright 2015 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <vector>
+
+#include <nihstro/shader_bytecode.h>
+
+#include "video_core/vertex_shader_simd.h"
+
+using nihstro::OpCode;
+using nihstro::SwizzlePattern;
+
+namespace Pica {
+
+namespace VertexShaderSIMD {
+
+#if _M_SSE >= 0x401
+static __m128i swizzle_dst_mask[16];
+static __m128i swizzle_src_selector[256];
+#endif
+
+void Init() {
+#if _M_SSE >= 0x401
+    for (int i = 0; i < 16; ++i) {
+        swizzle_dst_mask[i].m128i_u32[0] = ((i >> 3) & 1) * 0xffffffff;
+        swizzle_dst_mask[i].m128i_u32[1] = ((i >> 2) & 1) * 0xffffffff;
+        swizzle_dst_mask[i].m128i_u32[2] = ((i >> 1) & 1) * 0xffffffff;
+        swizzle_dst_mask[i].m128i_u32[3] = (i & 1) * 0xffffffff;
+    }
+
+#define SELECT(n) (((((n) << 2) + 3) << 24) | ((((n) << 2) + 2) << 16) | ((((n) << 2) + 1) << 8) | ((n) << 2))
+    for (int i = 0; i < 256; ++i) {
+        swizzle_src_selector[i].m128i_u32[0] = SELECT((i >> 6) & 0x3);
+        swizzle_src_selector[i].m128i_u32[1] = SELECT((i >> 4) & 0x3);
+        swizzle_src_selector[i].m128i_u32[2] = SELECT((i >> 2) & 0x3);
+        swizzle_src_selector[i].m128i_u32[3] = SELECT( i       & 0x3);
+    }
+#undef SELECT
+#endif
+}
+
+#pragma pack(1)
+union Instruction {
+    Instruction& operator =(const Instruction& instr) {
+        hex = instr.hex;
+        return *this;
+    }
+
+    u32 hex;
+
+    BitField<0x1a, 0x6, OpCode::Id> opcode;
+
+    // General notes:
+    //
+    // When two input registers are used, one of them uses a 5-bit index while the other
+    // one uses a 7-bit index. This is because at most one floating point uniform may be used
+    // as an input.
+
+    enum CompareOp : u32 {
+        Equal = 0,
+        NotEqual = 1,
+        LessThan = 2,
+        LessEqual = 3,
+        GreaterThan = 4,
+        GreaterEqual = 5,
+        Unk6 = 6,
+        Unk7 = 7
+    };
+
+    // Format used e.g. by arithmetic instructions and comparisons
+    union Common { // TODO: Remove name
+        BitField<0x00, 0x7, u32> operand_desc_id;
+
+        /**
+        * Source inputs may be reordered for certain instructions.
+        * Use GetSrc1 and GetSrc2 instead to access the input register indices hence.
+        */
+        BitField<0x07, 0x5, u32> src2;
+        BitField<0x0c, 0x7, u32> src1;
+
+        BitField<0x07, 0x7, u32> src2i;
+        BitField<0x0e, 0x5, u32> src1i;
+
+        // Address register value is used for relative addressing of src1
+        BitField<0x13, 0x2, u32> address_register_index;
+
+        union {
+            BitField<0x15, 0x3, CompareOp> y;
+            BitField<0x18, 0x3, CompareOp> x;
+        } compare_op;
+
+        BitField<0x15, 0x5, u32> dest;
+    } common;
+
+    union FlowControlType {  // TODO: Make nameless once MSVC supports it
+        enum Op : u32 {
+            Or = 0,
+            And = 1,
+            JustX = 2,
+            JustY = 3
+        };
+
+        BitField<0x00, 0x8, u32> num_instructions;
+        BitField<0x0a, 0xc, u32> dest_offset;
+
+        BitField<0x16, 0x2, Op> op;
+        BitField<0x16, 0x4, u32> bool_uniform_id;
+        BitField<0x16, 0x2, u32> int_uniform_id; // TODO: Verify that only this many bits are used...
+
+        BitFlag<0x18, u32> refy;
+        BitFlag<0x19, u32> refx;
+    } flow_control;
+
+    union {
+        BitField<0x00, 0x5, u32> operand_desc_id;
+
+        BitField<0x05, 0x5, u32> src3;
+        BitField<0x0a, 0x7, u32> src2;
+        BitField<0x11, 0x7, u32> src1;
+
+        BitField<0x05, 0x7, u32> src3i;
+        BitField<0x0c, 0x5, u32> src2i;
+
+        BitField<0x18, 0x5, u32> dest;
+    } cmp;
+
+    union {
+        BitField<0x00, 0x5, u32> operand_desc_id;
+
+        BitField<0x05, 0x5, u32> src3;
+        BitField<0x0a, 0x7, u32> src2;
+        BitField<0x11, 0x7, u32> src1;
+
+        BitField<0x05, 0x7, u32> src3i;
+        BitField<0x0c, 0x5, u32> src2i;
+
+        BitField<0x18, 0x5, u32> dest;
+    } mad;
+};
+static_assert(sizeof(Instruction) == 0x4, "Incorrect structure size");
+static_assert(std::is_standard_layout<Instruction>::value, "Structure does not have standard layout");
+
+static inline void Call(CoreState& state, u32 offset, u32 num_instructions, u32 return_offset, u8 repeat_count, u8 loop_increment) {
+    state.pc = offset;
+    state.call_stack.push_back({ offset + num_instructions, return_offset, repeat_count, loop_increment, offset });
+};
+
+static inline bool Compare(Instruction::CompareOp op, f32 src1, f32 src2) {
+    switch (op) {
+    case Instruction::CompareOp::Equal:
+        return src1 == src2;
+
+    case Instruction::CompareOp::NotEqual:
+        return src1 != src2;
+
+    case Instruction::CompareOp::LessThan:
+        return src1 < src2;
+
+    case Instruction::CompareOp::LessEqual:
+        return src1 <= src2;
+
+    case Instruction::CompareOp::GreaterThan:
+        return src1 > src2;
+
+    case Instruction::CompareOp::GreaterEqual:
+        return src1 >= src2;
+    }
+
+    return false;
+}
+
+static inline bool EvaluateCondition(const CoreState& state, bool refx, bool refy, Instruction::FlowControlType flow_control) {
+    const bool results[2] = { refx == state.conditional_code[0],
+                              refy == state.conditional_code[1] };
+
+    switch (flow_control.op) {
+    case flow_control.Or:
+        return results[0] || results[1];
+
+    case flow_control.And:
+        return results[0] && results[1];
+
+    case flow_control.JustX:
+        return results[0];
+
+    case flow_control.JustY:
+        return results[1];
+    }
+
+    return false;
+};
+
+void InitCore(CoreState& state) {
+    memcpy(&state.uniform[0].x, g_state.vs.uniforms.f, sizeof(state.uniform));
+}
+
+VertexShader::OutputVertex RunShader(CoreState& state, const VertexShader::InputVertex& input, int num_attributes) {
+    const auto& regs = g_state.regs;
+    auto& vs = g_state.vs;
+    const auto& swizzle_data = vs.swizzle_data;
+    const auto& program_code = vs.program_code;
+    bool exit_loop = false;
+
+    state.pc = regs.vs_main_offset;
+    state.address_offset.raw_i.m128i_i32[0] = 0;
+    state.conditional_code[0] = false;
+    state.conditional_code[1] = false;
+
+    const auto& reg_map = regs.vs_input_register_map;
+
+    for (int i = 0; i < num_attributes; ++i) {
+        Reg& reg = state.input[reg_map.GetRegisterForAttribute(i)];
+        reg.x = input.attr[i].x.ToFloat32();
+        reg.y = input.attr[i].y.ToFloat32();
+        reg.z = input.attr[i].z.ToFloat32();
+        reg.w = input.attr[i].w.ToFloat32();
+    }
+
+#if _M_SSE >= 0x401
+    while (true) {
+        if (!state.call_stack.empty()) {
+            auto& top = state.call_stack.back();
+            if (&program_code[state.pc] - program_code.data() == top.final_address) {
+                state.address_offset.raw_i.m128i_i32[3] += top.loop_increment;
+
+                if (top.repeat_counter-- == 0) {
+                    state.pc = top.return_address;
+                    state.call_stack.pop_back();
+                } else {
+                    state.pc = top.loop_address;
+                }
+
+                // TODO: Is "trying again" accurate to hardware?
+                continue;
+            }
+        }
+
+        const Instruction& instr = *(const Instruction*)&program_code[state.pc];
+
+        #define NEGATE(value, negate) _mm_xor_si128(value, _mm_set1_epi32(negate << 31))
+
+        #define SWIZZLE(reg, selector) _mm_shuffle_epi8(state.InputReg(reg).raw_i, swizzle_src_selector[selector])
+
+        #define FORMAT1(operation_) { \
+            Reg src1, src2, temp; \
+            const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.common.operand_desc_id]; \
+            int offset = state.address_offset.raw_i.m128i_i32[instr.common.address_register_index]; \
+            src1.raw_i = NEGATE(SWIZZLE(instr.common.src1 + offset, swizzle.src1_selector), swizzle.negate_src1); \
+            src2.raw_i = NEGATE(SWIZZLE(instr.common.src2, swizzle.src2_selector), swizzle.negate_src2); \
+            temp.raw_f = operation_; \
+            __m128i dst_mask = swizzle_dst_mask[swizzle.dest_mask]; \
+            auto& dest = state.OutputReg(instr.common.dest); \
+            dest.raw_i = _mm_or_si128(_mm_andnot_si128(dst_mask, dest.raw_i), _mm_and_si128(dst_mask, temp.raw_i)); \
+        }
+
+        #define FORMAT1I(operation_) { \
+            Reg src1, src2, temp; \
+            const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.common.operand_desc_id]; \
+            int offset = state.address_offset.raw_i.m128i_i32[instr.common.address_register_index]; \
+            src1.raw_i = NEGATE(SWIZZLE(instr.common.src1i, swizzle.src1_selector), swizzle.negate_src1); \
+            src2.raw_i = NEGATE(SWIZZLE(instr.common.src2i + offset, swizzle.src2_selector), swizzle.negate_src2); \
+            temp.raw_f = operation_; \
+            __m128i dst_mask = swizzle_dst_mask[swizzle.dest_mask]; \
+            auto& dest = state.OutputReg(instr.common.dest); \
+            dest.raw_i = _mm_or_si128(_mm_andnot_si128(dst_mask, dest.raw_i), _mm_and_si128(dst_mask, temp.raw_i)); \
+        }
+
+        #define FORMAT1U(operation_) { \
+            Reg src1, temp; \
+            const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.common.operand_desc_id]; \
+            int offset = state.address_offset.raw_i.m128i_i32[instr.common.address_register_index]; \
+            src1.raw_i = NEGATE(SWIZZLE(instr.common.src1 + offset, swizzle.src1_selector), swizzle.negate_src1); \
+            temp.raw_f = operation_; \
+            __m128i dst_mask = swizzle_dst_mask[swizzle.dest_mask]; \
+            auto& dest = state.OutputReg(instr.common.dest); \
+            dest.raw_i = _mm_or_si128(_mm_andnot_si128(dst_mask, dest.raw_i), _mm_and_si128(dst_mask, temp.raw_i)); \
+        }
+
+        #define FORMAT5(operation_) { \
+            Reg src1, src2, src3, temp; \
+            const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.mad.operand_desc_id]; \
+            src1.raw_i = NEGATE(SWIZZLE(instr.mad.src1, swizzle.src1_selector), swizzle.negate_src1); \
+            src2.raw_i = NEGATE(SWIZZLE(instr.mad.src2, swizzle.src2_selector), swizzle.negate_src2); \
+            src3.raw_i = NEGATE(SWIZZLE(instr.mad.src3, swizzle.src3_selector), swizzle.negate_src3); \
+            temp.raw_f = operation_; \
+            __m128i dst_mask = swizzle_dst_mask[swizzle.dest_mask]; \
+            auto& dest = state.OutputReg(instr.mad.dest); \
+            dest.raw_i = _mm_or_si128(_mm_andnot_si128(dst_mask, dest.raw_i), _mm_and_si128(dst_mask, temp.raw_i)); \
+        }
+
+        #define FORMAT5I(operation_) { \
+            Reg src1, src2, src3, temp; \
+            const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.mad.operand_desc_id]; \
+            src1.raw_i = NEGATE(SWIZZLE(instr.mad.src1, swizzle.src1_selector), swizzle.negate_src1); \
+            src2.raw_i = NEGATE(SWIZZLE(instr.mad.src2i, swizzle.src2_selector), swizzle.negate_src2); \
+            src3.raw_i = NEGATE(SWIZZLE(instr.mad.src3i, swizzle.src3_selector), swizzle.negate_src3); \
+            temp.raw_f = operation_; \
+            __m128i dst_mask = swizzle_dst_mask[swizzle.dest_mask]; \
+            auto& dest = state.OutputReg(instr.mad.dest); \
+            dest.raw_i = _mm_or_si128(_mm_andnot_si128(dst_mask, dest.raw_i), _mm_and_si128(dst_mask, temp.raw_i)); \
+        }
+
+        switch (instr.opcode) {
+        case OpCode::Id::ADD:
+            FORMAT1(_mm_add_ps(src1.raw_f, src2.raw_f));
+            break;
+
+        case OpCode::Id::DP3:
+            FORMAT1(_mm_dp_ps(src1.raw_f, src2.raw_f, 0x7f));
+            break;
+
+        case OpCode::Id::DP4:
+            FORMAT1(_mm_dp_ps(src1.raw_f, src2.raw_f, 0xff));
+            break;
+
+        case OpCode::Id::MUL:
+            FORMAT1(_mm_mul_ps(src1.raw_f, src2.raw_f));
+            break;
+
+        case OpCode::Id::SLT:
+            FORMAT1(_mm_and_ps(_mm_cmplt_ps(src1.raw_f, src2.raw_f), _mm_set1_ps(1.f)));
+            break;
+
+        case OpCode::Id::SLTI:
+            FORMAT1I(_mm_and_ps(_mm_cmplt_ps(src1.raw_f, src2.raw_f), _mm_set1_ps(1.f)));
+            break;
+
+        case OpCode::Id::FLR:
+            FORMAT1U(_mm_floor_ps(src1.raw_f));
+            break;
+
+        case OpCode::Id::MAX:
+            FORMAT1(_mm_max_ps(src1.raw_f, src2.raw_f));
+            break;
+
+        case OpCode::Id::MIN:
+            FORMAT1(_mm_min_ps(src1.raw_f, src2.raw_f));
+            break;
+
+        case OpCode::Id::RCP:
+            FORMAT1U(_mm_rcp_ps(src1.raw_f));
+            break;
+
+        case OpCode::Id::RSQ:
+            FORMAT1U(_mm_rsqrt_ps(src1.raw_f));
+            break;
+
+        case OpCode::Id::MOVA:
+        {
+            Reg src1, temp;
+            const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.common.operand_desc_id];
+            int offset = state.address_offset.raw_i.m128i_i32[instr.common.address_register_index];
+            src1.raw_i = NEGATE(SWIZZLE(instr.common.src1 + offset, swizzle.src1_selector), swizzle.negate_src1);
+            temp.raw_i.m128i_i32[1] = static_cast<s32>(src1[0]);
+            temp.raw_i.m128i_i32[2] = static_cast<s32>(src1[1]);
+            __m128i dst_mask = swizzle_dst_mask[(swizzle.dest_mask & 0xc) >> 1];
+            state.address_offset.raw_i = _mm_or_si128(_mm_andnot_si128(dst_mask, state.address_offset.raw_i),
+                _mm_and_si128(dst_mask, temp.raw_i));
+            break;
+        }
+
+        case OpCode::Id::MOV:
+            FORMAT1U(src1.raw_f);
+            break;
+
+        case OpCode::Id::CALL:
+            Call(state,
+                 instr.flow_control.dest_offset,
+                 instr.flow_control.num_instructions,
+                 state.pc + 1, 0, 0);
+            continue;
+
+        case OpCode::Id::CALLU:
+            if (vs.uniforms.b[instr.flow_control.bool_uniform_id]) {
+                Call(state,
+                     instr.flow_control.dest_offset,
+                     instr.flow_control.num_instructions,
+                     state.pc + 1, 0, 0);
+                continue;
+            }
+            break;
+
+        case OpCode::Id::CALLC:
+            if (EvaluateCondition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) {
+                Call(state,
+                     instr.flow_control.dest_offset,
+                     instr.flow_control.num_instructions,
+                     state.pc + 1, 0, 0);
+                continue;
+            }
+            break;
+
+        case OpCode::Id::NOP:
+            break;
+
+        case OpCode::Id::END:
+            exit_loop = true;
+            break;
+
+        case OpCode::Id::IFU:
+            if (vs.uniforms.b[instr.flow_control.bool_uniform_id]) {
+                Call(state,
+                     state.pc + 1,
+                     instr.flow_control.dest_offset - state.pc - 1,
+                     instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
+            } else {
+                Call(state,
+                     instr.flow_control.dest_offset,
+                     instr.flow_control.num_instructions,
+                     instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
+            }
+            continue;
+
+        case OpCode::Id::IFC:
+            // TODO: Do we need to consider swizzlers here?
+            if (EvaluateCondition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) {
+                Call(state,
+                     state.pc + 1,
+                     instr.flow_control.dest_offset - state.pc - 1,
+                     instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
+            } else {
+                Call(state,
+                     instr.flow_control.dest_offset,
+                     instr.flow_control.num_instructions,
+                     instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
+            }
+            continue;
+
+        case OpCode::Id::LOOP:
+            state.address_offset.raw_i.m128i_i32[3] = vs.uniforms.i[instr.flow_control.int_uniform_id].y;
+
+            Call(state,
+                state.pc + 1,
+                instr.flow_control.dest_offset - state.pc + 1,
+                instr.flow_control.dest_offset + 1,
+                vs.uniforms.i[instr.flow_control.int_uniform_id].x,
+                vs.uniforms.i[instr.flow_control.int_uniform_id].z);
+
+            continue;
+
+        case OpCode::Id::JMPC:
+            if (EvaluateCondition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) {
+                state.pc = instr.flow_control.dest_offset;
+                continue;
+            }
+            break;
+
+        case OpCode::Id::JMPU:
+            if (vs.uniforms.b[instr.flow_control.bool_uniform_id]) {
+                state.pc = instr.flow_control.dest_offset;
+                continue;
+            }
+            break;
+
+        case OpCode::Id::CMP:
+        case OpCode::Id::CMP + 1:
+        {
+            Reg src1, src2;
+            const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.common.operand_desc_id];
+            int offset = state.address_offset.raw_i.m128i_i32[instr.common.address_register_index];
+            src1.raw_i = NEGATE(SWIZZLE(instr.common.src1 + offset, swizzle.src1_selector), swizzle.negate_src1);
+            src2.raw_i = NEGATE(SWIZZLE(instr.common.src2, swizzle.src2_selector), swizzle.negate_src2);
+            state.conditional_code[0] = Compare(instr.common.compare_op.x, src1.x, src2.x);
+            state.conditional_code[1] = Compare(instr.common.compare_op.y, src1.y, src2.y);
+            break;
+        }
+
+        case OpCode::Id::MADI:
+        case OpCode::Id::MADI + 1:
+        case OpCode::Id::MADI + 2:
+        case OpCode::Id::MADI + 3:
+        case OpCode::Id::MADI + 4:
+        case OpCode::Id::MADI + 5:
+        case OpCode::Id::MADI + 6:
+        case OpCode::Id::MADI + 7:
+            FORMAT5I(_mm_add_ps(_mm_mul_ps(src1.raw_f, src2.raw_f), src3.raw_f));
+            break;
+
+        case OpCode::Id::MAD:
+        case OpCode::Id::MAD + 1:
+        case OpCode::Id::MAD + 2:
+        case OpCode::Id::MAD + 3:
+        case OpCode::Id::MAD + 4:
+        case OpCode::Id::MAD + 5:
+        case OpCode::Id::MAD + 6:
+        case OpCode::Id::MAD + 7:
+            FORMAT5(_mm_add_ps(_mm_mul_ps(src1.raw_f, src2.raw_f), src3.raw_f));
+            break;
+
+        default:
+            LOG_CRITICAL(HW_GPU, "Unhandled opcode: 0x%02x", instr.opcode.Value());
+            UNIMPLEMENTED();
+        }
+
+        state.pc += 1;
+
+        if (exit_loop)
+            break;
+    }
+#else
+    UNREACHABLE();
+#endif
+
+    // Setup output data
+    VertexShader::OutputVertex ret;
+    // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to
+    // figure out what those circumstances are and enable the remaining outputs then.
+    for (int i = 0; i < 7; ++i) {
+        const auto& output_register_map = regs.vs_output_attributes[i];
+
+        u32 semantics[4] = {
+            output_register_map.map_x, output_register_map.map_y,
+            output_register_map.map_z, output_register_map.map_w
+        };
+
+        for (int comp = 0; comp < 4; ++comp) {
+            f32* out = ((f32*)&ret) + semantics[comp];
+            *out = state.output[i][comp];
+        }
+    }
+
+    return ret;
+}
+
+} // namespace VertexShaderSIMD
+
+} // namespace Pica
--- a/src/video_core/vertex_shader_simd.h
+++ b/src/video_core/vertex_shader_simd.h
@@ -0,0 +1,78 @@
+// Copyright 2015 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/common_types.h"
+#include "common/intrinsics.h"
+
+#include "vertex_shader.h"
+
+namespace Pica {
+
+namespace VertexShaderSIMD {
+
+struct Reg {
+    union {
+        struct {
+            f32 x, y, z, w;
+        };
+#if _M_SSE >= 0x401
+        __m128 raw_f;
+        __m128i raw_i;
+#endif
+    };
+
+    f32 operator [] (int i) {
+        return *(&x + i);
+    }
+};
+
+struct CoreState {
+    u32 pc;
+    Reg address_offset;
+    bool conditional_code[2];
+
+    struct CallStackElement {
+        u32 final_address;  // Address upon which we jump to return_address
+        u32 return_address; // Where to jump when leaving scope
+        u8 repeat_counter;  // How often to repeat until this call stack element is removed
+        u8 loop_increment;  // Which value to add to the loop counter after an iteration
+        // TODO: Should this be a signed value? Does it even matter?
+        u32 loop_address;   // The address where we'll return to after each loop iteration
+    };
+
+    // TODO: Is there a maximal size for this?
+    std::vector<CallStackElement> call_stack;
+
+    union {
+        struct {
+            Reg output[0x10];
+            Reg input[0x10];
+            Reg temporary[0x10];
+            Reg uniform[0x60];
+        };
+        Reg regs[0x90];
+    };
+
+    Reg InputReg(int index) const {
+        return input[index];
+    }
+
+    Reg& OutputReg(int index) {
+        return regs[(index & 0xf) | ((index & 0x10) << 1)];
+    }
+
+};
+
+/// Initializes lookup tables used by the SIMD vertex shader core
+void Init();
+
+/// Initializes a SIMD vertex shader core for the current shader in Pica memory
+void InitCore(CoreState& state);
+
+/// Runs a vertex shader core for the current shader in Pica memory
+VertexShader::OutputVertex RunShader(CoreState& state, const VertexShader::InputVertex& input, int num_attributes);
+
+} // namespace VertexShaderSIMD
+
+} // namespace Pica