diff --git a/externals/nihstro b/externals/nihstro
index 81f1804a4..0eb48f448 160000
--- a/externals/nihstro
+++ b/externals/nihstro
@@ -1 +1 @@
-Subproject commit 81f1804a43f625e3a1a20752c0db70a413410380
+Subproject commit 0eb48f4488b1e85d8f97b55574b35cb92cbb0b8b
diff --git a/src/citra/config.cpp b/src/citra/config.cpp
index 1378567c1..6af1d57ae 100644
--- a/src/citra/config.cpp
+++ b/src/citra/config.cpp
@@ -69,6 +69,7 @@ void Config::ReadValues() {
     Settings::values.frame_skip = glfw_config->GetInteger("Core", "frame_skip", 0);
 
     // Renderer
+    Settings::values.shader_core = glfw_config->GetInteger("Renderer", "shader_core", 0);
     Settings::values.use_hw_renderer = glfw_config->GetBoolean("Renderer", "use_hw_renderer", false);
 
     Settings::values.bg_red = (float)glfw_config->GetReal("Renderer", "bg_red", 1.0);
diff --git a/src/citra/default_ini.h b/src/citra/default_ini.h
index fd5a90d56..ab72471cf 100644
--- a/src/citra/default_ini.h
+++ b/src/citra/default_ini.h
@@ -42,6 +42,10 @@ gpu_refresh_rate =
 frame_skip =
 
 [Renderer]
+# Shader core to use for emulation of vertex shaders
+# 0 (default): Optimized interpreter, 1: Interpreter
+shader_core =
+
 # Whether to use software or hardware rendering.
 # 0 (default): Software, 1: Hardware
 use_hw_renderer =
diff --git a/src/citra_qt/config.cpp b/src/citra_qt/config.cpp
index 2a9af1f38..367abef85 100644
--- a/src/citra_qt/config.cpp
+++ b/src/citra_qt/config.cpp
@@ -53,6 +53,7 @@ void Config::ReadValues() {
     qt_config->endGroup();
 
     qt_config->beginGroup("Renderer");
+    Settings::values.shader_core = qt_config->value("shader_core", 0).toInt();
     Settings::values.use_hw_renderer = qt_config->value("use_hw_renderer", false).toBool();
 
     Settings::values.bg_red = qt_config->value("bg_red", 1.0).toFloat();
@@ -105,6 +106,7 @@ void Config::SaveValues() {
     qt_config->endGroup();
 
     qt_config->beginGroup("Renderer");
+    qt_config->setValue("shader_core", Settings::values.shader_core);
     qt_config->setValue("use_hw_renderer", Settings::values.use_hw_renderer);
 
     // Cast to double because Qt's written float values are not human-readable
diff --git a/src/core/settings.h b/src/core/settings.h
index 5a70d157a..286dc4e8d 100644
--- a/src/core/settings.h
+++ b/src/core/settings.h
@@ -8,6 +8,11 @@
 
 namespace Settings {
 
+enum ShaderCore {
+    OptimizedInterpreter,
+    Interpreter
+};
+
 struct Values {
     // Controls
     int pad_a_key;
@@ -44,6 +49,7 @@ struct Values {
     int region_value;
 
     // Renderer
+    int shader_core;
     bool use_hw_renderer;
 
     float bg_red;
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 5c7f4ae18..a8c02d9bc 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -14,6 +14,7 @@ set(SRCS
             rasterizer.cpp
             utils.cpp
             vertex_shader.cpp
+            vertex_shader_simd.cpp
             video_core.cpp
             )
 
@@ -38,6 +39,7 @@ set(HEADERS
             renderer_base.h
             utils.h
             vertex_shader.h
+            vertex_shader_simd.h
             video_core.h
             )
 
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index b46fadd9f..d09f7bfa8 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -5,6 +5,7 @@
 #include
 
 #include "common/profiler.h"
+#include "common/cpu_detect.h"
 
 #include "clipper.h"
 #include "command_processor.h"
@@ -12,6 +13,7 @@
 #include "pica.h"
 #include "primitive_assembly.h"
 #include "vertex_shader.h"
+#include "vertex_shader_simd.h"
 #include "video_core.h"
 #include "core/hle/service/gsp_gpu.h"
 #include "core/hw/gpu.h"
@@ -121,6 +123,14 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
             PrimitiveAssembler primitive_assembler(regs.triangle_topology.Value());
             PrimitiveAssembler dumping_primitive_assembler(regs.triangle_topology.Value());
 
+            // The SIMD core is used only when it is selected in the settings and the host CPU has SSE4.1
+            const bool use_simd_shader_core = Settings::values.shader_core == Settings::OptimizedInterpreter &&
+                                              Common::cpu_info.bSSE4_1;
+            VertexShaderSIMD::CoreState state;
+            if (use_simd_shader_core) {
+                VertexShaderSIMD::InitCore(state);
+            }
+
             for (unsigned int index = 0; index < regs.num_vertices; ++index) {
                 unsigned int vertex = is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index]) : index;
 
@@ -190,7 +200,12 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
                                                          &geometry_dumper, _1, _2, _3));
 
                 // Send to vertex shader
-                VertexShader::OutputVertex output = VertexShader::RunShader(input, attribute_config.GetNumTotalAttributes());
+                VertexShader::OutputVertex output;
+                if (use_simd_shader_core) {
+                    output = VertexShaderSIMD::RunShader(state, input, attribute_config.GetNumTotalAttributes());
+                } else {
+                    output = VertexShader::RunShader(input, attribute_config.GetNumTotalAttributes());
+                }
 
                 if (is_indexed) {
                     // TODO: Add processed vertex to vertex cache!
diff --git a/src/video_core/pica.cpp b/src/video_core/pica.cpp
index 543d9c443..bd19e5561 100644
--- a/src/video_core/pica.cpp
+++ b/src/video_core/pica.cpp
@@ -6,11 +6,16 @@
 
 #include "pica.h"
 
+#include "vertex_shader_simd.h"
+
 namespace Pica {
 
 State g_state;
 
 void Init() {
+#if _M_SSE >= 0x401
+    VertexShaderSIMD::Init();
+#endif
 }
 
 void Shutdown() {
diff --git a/src/video_core/vertex_shader_simd.cpp b/src/video_core/vertex_shader_simd.cpp
new file mode 100644
index 000000000..04672946d
--- /dev/null
+++ b/src/video_core/vertex_shader_simd.cpp
@@ -0,0 +1,560 @@
+// Copyright 2015 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <cstring>
+#include <type_traits>
+
+#include <nihstro/shader_bytecode.h>
+
+#include "video_core/vertex_shader_simd.h"
+
+using nihstro::OpCode;
+using nihstro::SwizzlePattern;
+
+namespace Pica {
+
+namespace VertexShaderSIMD {
+
+#if _M_SSE >= 0x401
+/// Write masks indexed by the 4-bit destination mask (bit 3 = x, ..., bit 0 = w)
+static __m128i swizzle_dst_mask[16];
+/// PSHUFB byte selectors indexed by the 8-bit source swizzle (bits 7-6 = x, ..., bits 1-0 = w)
+static __m128i swizzle_src_selector[256];
+
+/// Returns the four PSHUFB byte indices that copy 32-bit source component n (0-3)
+static inline int ComponentSelector(int n) {
+    const int base = n << 2;
+    return ((base + 3) << 24) | ((base + 2) << 16) | ((base + 1) << 8) | base;
+}
+
+/// Loads source register `index`, applies the 8-bit component swizzle and optionally negates it
+static inline __m128i LoadSource(const CoreState& state, int index, u32 selector, bool negate) {
+    const __m128i swizzled = _mm_shuffle_epi8(state.InputReg(index).raw_i, swizzle_src_selector[selector]);
+    // Flipping the sign bit of every lane negates the four floats
+    return _mm_xor_si128(swizzled, _mm_slli_epi32(_mm_set1_epi32(negate), 31));
+}
+
+/// Copies the components of `value` enabled in the 4-bit `dest_mask` into `dest`
+static inline void WriteMasked(Reg& dest, const Reg& value, u32 dest_mask) {
+    const __m128i mask = swizzle_dst_mask[dest_mask];
+    dest.raw_i = _mm_or_si128(_mm_andnot_si128(mask, dest.raw_i), _mm_and_si128(mask, value.raw_i));
+}
+#endif
+
+void Init() {
+#if _M_SSE >= 0x401
+    // _mm_set_epi32 takes its arguments from the highest lane (w) down to the lowest lane (x)
+    for (int i = 0; i < 16; ++i) {
+        swizzle_dst_mask[i] = _mm_set_epi32(-(i & 1), -((i >> 1) & 1), -((i >> 2) & 1), -((i >> 3) & 1));
+    }
+
+    for (int i = 0; i < 256; ++i) {
+        swizzle_src_selector[i] = _mm_set_epi32(ComponentSelector(i & 0x3),
+                                                ComponentSelector((i >> 2) & 0x3),
+                                                ComponentSelector((i >> 4) & 0x3),
+                                                ComponentSelector((i >> 6) & 0x3));
+    }
+#endif
+}
+
+/// Bit-level layout of one 32-bit shader instruction word as decoded by this core
+#pragma pack(push, 1)
+union Instruction {
+    Instruction& operator =(const Instruction& instr) {
+        hex = instr.hex;
+        return *this;
+    }
+
+    u32 hex;
+
+    BitField<0x1a, 0x6, OpCode::Id> opcode;
+
+    // General notes:
+    //
+    // When two input registers are used, one of them uses a 5-bit index while the other
+    // one uses a 7-bit index. This is because at most one floating point uniform may be used
+    // as an input.
+
+    enum CompareOp : u32 {
+        Equal = 0,
+        NotEqual = 1,
+        LessThan = 2,
+        LessEqual = 3,
+        GreaterThan = 4,
+        GreaterEqual = 5,
+        Unk6 = 6,
+        Unk7 = 7
+    };
+
+    // Format used e.g. by arithmetic instructions and comparisons
+    union Common { // TODO: Remove name
+        BitField<0x00, 0x7, u32> operand_desc_id;
+
+        /**
+         * Source inputs may be reordered for certain instructions.
+         * Use GetSrc1 and GetSrc2 instead to access the input register indices hence.
+         */
+        BitField<0x07, 0x5, u32> src2;
+        BitField<0x0c, 0x7, u32> src1;
+
+        BitField<0x07, 0x7, u32> src2i;
+        BitField<0x0e, 0x5, u32> src1i;
+
+        // Address register value is used for relative addressing of src1
+        BitField<0x13, 0x2, u32> address_register_index;
+
+        union {
+            BitField<0x15, 0x3, CompareOp> y;
+            BitField<0x18, 0x3, CompareOp> x;
+        } compare_op;
+
+        BitField<0x15, 0x5, u32> dest;
+    } common;
+
+    union FlowControlType { // TODO: Make nameless once MSVC supports it
+        enum Op : u32 {
+            Or = 0,
+            And = 1,
+            JustX = 2,
+            JustY = 3
+        };
+
+        BitField<0x00, 0x8, u32> num_instructions;
+        BitField<0x0a, 0xc, u32> dest_offset;
+
+        BitField<0x16, 0x2, Op> op;
+        BitField<0x16, 0x4, u32> bool_uniform_id;
+        BitField<0x16, 0x2, u32> int_uniform_id; // TODO: Verify that only this many bits are used...
+
+        BitFlag<0x18, u32> refy;
+        BitFlag<0x19, u32> refx;
+    } flow_control;
+
+    union {
+        BitField<0x00, 0x5, u32> operand_desc_id;
+
+        BitField<0x05, 0x5, u32> src3;
+        BitField<0x0a, 0x7, u32> src2;
+        BitField<0x11, 0x7, u32> src1;
+
+        BitField<0x05, 0x7, u32> src3i;
+        BitField<0x0c, 0x5, u32> src2i;
+
+        BitField<0x18, 0x5, u32> dest;
+    } cmp;
+
+    union {
+        BitField<0x00, 0x5, u32> operand_desc_id;
+
+        BitField<0x05, 0x5, u32> src3;
+        BitField<0x0a, 0x7, u32> src2;
+        BitField<0x11, 0x7, u32> src1;
+
+        BitField<0x05, 0x7, u32> src3i;
+        BitField<0x0c, 0x5, u32> src2i;
+
+        BitField<0x18, 0x5, u32> dest;
+    } mad;
+};
+#pragma pack(pop)
+static_assert(sizeof(Instruction) == 0x4, "Incorrect structure size");
+static_assert(std::is_standard_layout<Instruction>::value, "Structure does not have standard layout");
+
+/// Pushes a call stack frame covering num_instructions instructions and jumps to `offset`
+static inline void Call(CoreState& state, u32 offset, u32 num_instructions, u32 return_offset, u8 repeat_count, u8 loop_increment) {
+    state.pc = offset;
+    state.call_stack.push_back({ offset + num_instructions, return_offset, repeat_count, loop_increment, offset });
+}
+
+/// Evaluates one CMP comparison; the unknown operations 6 and 7 always yield false
+static inline bool Compare(Instruction::CompareOp op, f32 src1, f32 src2) {
+    switch (op) {
+    case Instruction::CompareOp::Equal:
+        return src1 == src2;
+
+    case Instruction::CompareOp::NotEqual:
+        return src1 != src2;
+
+    case Instruction::CompareOp::LessThan:
+        return src1 < src2;
+
+    case Instruction::CompareOp::LessEqual:
+        return src1 <= src2;
+
+    case Instruction::CompareOp::GreaterThan:
+        return src1 > src2;
+
+    case Instruction::CompareOp::GreaterEqual:
+        return src1 >= src2;
+    }
+
+    return false;
+}
+
+/// Compares both condition codes against refx/refy and combines them as selected by flow_control.op
+static inline bool EvaluateCondition(const CoreState& state, bool refx, bool refy, Instruction::FlowControlType flow_control) {
+    const bool results[2] = { refx == state.conditional_code[0],
+                              refy == state.conditional_code[1] };
+
+    switch (flow_control.op) {
+    case Instruction::FlowControlType::Or:
+        return results[0] || results[1];
+
+    case Instruction::FlowControlType::And:
+        return results[0] && results[1];
+
+    case Instruction::FlowControlType::JustX:
+        return results[0];
+
+    case Instruction::FlowControlType::JustY:
+        return results[1];
+    }
+
+    return false;
+}
+
+void InitCore(CoreState& state) {
+    // Snapshot the float uniforms so that they can be read like any other source register
+    std::memcpy(&state.uniform[0].x, g_state.vs.uniforms.f, sizeof(state.uniform));
+}
+
+VertexShader::OutputVertex RunShader(CoreState& state, const VertexShader::InputVertex& input, int num_attributes) {
+#if _M_SSE >= 0x401
+    const auto& regs = g_state.regs;
+    auto& vs = g_state.vs;
+    const auto& swizzle_data = vs.swizzle_data;
+    const auto& program_code = vs.program_code;
+    bool exit_loop = false;
+
+    // The caller hands in the same CoreState for every vertex, so reset everything that must
+    // not leak into this run. Frames left on the call stack by the previous vertex (END executed
+    // inside an IF/CALL/LOOP block) would otherwise redirect the program counter.
+    state.pc = regs.vs_main_offset;
+    state.call_stack.clear();
+    state.address_offset.raw_i32[0] = 0; // Lane 0 is never written elsewhere: index 0 means "no offset"
+    state.conditional_code[0] = false;
+    state.conditional_code[1] = false;
+
+    const auto& reg_map = regs.vs_input_register_map;
+
+    for (int i = 0; i < num_attributes; ++i) {
+        Reg& reg = state.input[reg_map.GetRegisterForAttribute(i)];
+        reg.x = input.attr[i].x.ToFloat32();
+        reg.y = input.attr[i].y.ToFloat32();
+        reg.z = input.attr[i].z.ToFloat32();
+        reg.w = input.attr[i].w.ToFloat32();
+    }
+
+    // Instruction format helpers. They expand inside the interpreter loop and rely on the local
+    // names `instr`, `state` and `swizzle_data`; `operation_` may refer to src1/src2/src3.
+    #define FORMAT1(operation_) { \
+        Reg src1, src2, temp; \
+        const SwizzlePattern& swizzle = *reinterpret_cast<const SwizzlePattern*>(&swizzle_data[instr.common.operand_desc_id]); \
+        const int offset = state.address_offset.raw_i32[instr.common.address_register_index]; \
+        src1.raw_i = LoadSource(state, instr.common.src1 + offset, swizzle.src1_selector, swizzle.negate_src1); \
+        src2.raw_i = LoadSource(state, instr.common.src2, swizzle.src2_selector, swizzle.negate_src2); \
+        temp.raw_f = operation_; \
+        WriteMasked(state.OutputReg(instr.common.dest), temp, swizzle.dest_mask); \
+    }
+
+    #define FORMAT1I(operation_) { \
+        Reg src1, src2, temp; \
+        const SwizzlePattern& swizzle = *reinterpret_cast<const SwizzlePattern*>(&swizzle_data[instr.common.operand_desc_id]); \
+        const int offset = state.address_offset.raw_i32[instr.common.address_register_index]; \
+        src1.raw_i = LoadSource(state, instr.common.src1i, swizzle.src1_selector, swizzle.negate_src1); \
+        src2.raw_i = LoadSource(state, instr.common.src2i + offset, swizzle.src2_selector, swizzle.negate_src2); \
+        temp.raw_f = operation_; \
+        WriteMasked(state.OutputReg(instr.common.dest), temp, swizzle.dest_mask); \
+    }
+
+    #define FORMAT1U(operation_) { \
+        Reg src1, temp; \
+        const SwizzlePattern& swizzle = *reinterpret_cast<const SwizzlePattern*>(&swizzle_data[instr.common.operand_desc_id]); \
+        const int offset = state.address_offset.raw_i32[instr.common.address_register_index]; \
+        src1.raw_i = LoadSource(state, instr.common.src1 + offset, swizzle.src1_selector, swizzle.negate_src1); \
+        temp.raw_f = operation_; \
+        WriteMasked(state.OutputReg(instr.common.dest), temp, swizzle.dest_mask); \
+    }
+
+    #define FORMAT5(operation_) { \
+        Reg src1, src2, src3, temp; \
+        const SwizzlePattern& swizzle = *reinterpret_cast<const SwizzlePattern*>(&swizzle_data[instr.mad.operand_desc_id]); \
+        src1.raw_i = LoadSource(state, instr.mad.src1, swizzle.src1_selector, swizzle.negate_src1); \
+        src2.raw_i = LoadSource(state, instr.mad.src2, swizzle.src2_selector, swizzle.negate_src2); \
+        src3.raw_i = LoadSource(state, instr.mad.src3, swizzle.src3_selector, swizzle.negate_src3); \
+        temp.raw_f = operation_; \
+        WriteMasked(state.OutputReg(instr.mad.dest), temp, swizzle.dest_mask); \
+    }
+
+    #define FORMAT5I(operation_) { \
+        Reg src1, src2, src3, temp; \
+        const SwizzlePattern& swizzle = *reinterpret_cast<const SwizzlePattern*>(&swizzle_data[instr.mad.operand_desc_id]); \
+        src1.raw_i = LoadSource(state, instr.mad.src1, swizzle.src1_selector, swizzle.negate_src1); \
+        src2.raw_i = LoadSource(state, instr.mad.src2i, swizzle.src2_selector, swizzle.negate_src2); \
+        src3.raw_i = LoadSource(state, instr.mad.src3i, swizzle.src3_selector, swizzle.negate_src3); \
+        temp.raw_f = operation_; \
+        WriteMasked(state.OutputReg(instr.mad.dest), temp, swizzle.dest_mask); \
+    }
+
+    while (true) {
+        if (!state.call_stack.empty()) {
+            auto& top = state.call_stack.back();
+            if (state.pc == top.final_address) {
+                state.address_offset.raw_i32[3] += top.loop_increment;
+
+                if (top.repeat_counter-- == 0) {
+                    state.pc = top.return_address;
+                    state.call_stack.pop_back();
+                } else {
+                    state.pc = top.loop_address;
+                }
+
+                // TODO: Is "trying again" accurate to hardware?
+                continue;
+            }
+        }
+
+        const Instruction& instr = *(const Instruction*)&program_code[state.pc];
+
+        switch (instr.opcode) {
+        case OpCode::Id::ADD:
+            FORMAT1(_mm_add_ps(src1.raw_f, src2.raw_f));
+            break;
+
+        case OpCode::Id::DP3:
+            FORMAT1(_mm_dp_ps(src1.raw_f, src2.raw_f, 0x7f));
+            break;
+
+        case OpCode::Id::DP4:
+            FORMAT1(_mm_dp_ps(src1.raw_f, src2.raw_f, 0xff));
+            break;
+
+        case OpCode::Id::MUL:
+            FORMAT1(_mm_mul_ps(src1.raw_f, src2.raw_f));
+            break;
+
+        case OpCode::Id::SLT:
+            FORMAT1(_mm_and_ps(_mm_cmplt_ps(src1.raw_f, src2.raw_f), _mm_set1_ps(1.f)));
+            break;
+
+        case OpCode::Id::SLTI:
+            FORMAT1I(_mm_and_ps(_mm_cmplt_ps(src1.raw_f, src2.raw_f), _mm_set1_ps(1.f)));
+            break;
+
+        case OpCode::Id::FLR:
+            FORMAT1U(_mm_floor_ps(src1.raw_f));
+            break;
+
+        case OpCode::Id::MAX:
+            FORMAT1(_mm_max_ps(src1.raw_f, src2.raw_f));
+            break;
+
+        case OpCode::Id::MIN:
+            FORMAT1(_mm_min_ps(src1.raw_f, src2.raw_f));
+            break;
+
+        case OpCode::Id::RCP:
+            FORMAT1U(_mm_rcp_ps(src1.raw_f));
+            break;
+
+        case OpCode::Id::RSQ:
+            FORMAT1U(_mm_rsqrt_ps(src1.raw_f));
+            break;
+
+        case OpCode::Id::MOVA:
+        {
+            Reg src1, temp;
+            const SwizzlePattern& swizzle = *reinterpret_cast<const SwizzlePattern*>(&swizzle_data[instr.common.operand_desc_id]);
+            const int offset = state.address_offset.raw_i32[instr.common.address_register_index];
+            src1.raw_i = LoadSource(state, instr.common.src1 + offset, swizzle.src1_selector, swizzle.negate_src1);
+
+            // Address register components x/y are kept in lanes 1/2; clear the lanes that are not set here
+            temp.raw_i = _mm_setzero_si128();
+            temp.raw_i32[1] = static_cast<s32>(src1[0]);
+            temp.raw_i32[2] = static_cast<s32>(src1[1]);
+
+            // Shift the x (bit 3) and y (bit 2) write-enable bits down by one so they select lanes 1 and 2
+            WriteMasked(state.address_offset, temp, (swizzle.dest_mask & 0xc) >> 1);
+            break;
+        }
+
+        case OpCode::Id::MOV:
+            FORMAT1U(src1.raw_f);
+            break;
+
+        case OpCode::Id::CALL:
+            Call(state,
+                 instr.flow_control.dest_offset,
+                 instr.flow_control.num_instructions,
+                 state.pc + 1, 0, 0);
+            continue;
+
+        case OpCode::Id::CALLU:
+            if (vs.uniforms.b[instr.flow_control.bool_uniform_id]) {
+                Call(state,
+                     instr.flow_control.dest_offset,
+                     instr.flow_control.num_instructions,
+                     state.pc + 1, 0, 0);
+                continue;
+            }
+            break;
+
+        case OpCode::Id::CALLC:
+            if (EvaluateCondition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) {
+                Call(state,
+                     instr.flow_control.dest_offset,
+                     instr.flow_control.num_instructions,
+                     state.pc + 1, 0, 0);
+                continue;
+            }
+            break;
+
+        case OpCode::Id::NOP:
+            break;
+
+        case OpCode::Id::END:
+            exit_loop = true;
+            break;
+
+        case OpCode::Id::IFU:
+            if (vs.uniforms.b[instr.flow_control.bool_uniform_id]) {
+                Call(state,
+                     state.pc + 1,
+                     instr.flow_control.dest_offset - state.pc - 1,
+                     instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
+            } else {
+                Call(state,
+                     instr.flow_control.dest_offset,
+                     instr.flow_control.num_instructions,
+                     instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
+            }
+            continue;
+
+        case OpCode::Id::IFC:
+            // TODO: Do we need to consider swizzlers here?
+            if (EvaluateCondition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) {
+                Call(state,
+                     state.pc + 1,
+                     instr.flow_control.dest_offset - state.pc - 1,
+                     instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
+            } else {
+                Call(state,
+                     instr.flow_control.dest_offset,
+                     instr.flow_control.num_instructions,
+                     instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
+            }
+            continue;
+
+        case OpCode::Id::LOOP:
+            state.address_offset.raw_i32[3] = vs.uniforms.i[instr.flow_control.int_uniform_id].y;
+
+            Call(state,
+                 state.pc + 1,
+                 instr.flow_control.dest_offset - state.pc + 1,
+                 instr.flow_control.dest_offset + 1,
+                 vs.uniforms.i[instr.flow_control.int_uniform_id].x,
+                 vs.uniforms.i[instr.flow_control.int_uniform_id].z);
+
+            continue;
+
+        case OpCode::Id::JMPC:
+            if (EvaluateCondition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) {
+                state.pc = instr.flow_control.dest_offset;
+                continue;
+            }
+            break;
+
+        case OpCode::Id::JMPU:
+            if (vs.uniforms.b[instr.flow_control.bool_uniform_id]) {
+                state.pc = instr.flow_control.dest_offset;
+                continue;
+            }
+            break;
+
+        case OpCode::Id::CMP:
+        case OpCode::Id::CMP + 1:
+        {
+            Reg src1, src2;
+            const SwizzlePattern& swizzle = *reinterpret_cast<const SwizzlePattern*>(&swizzle_data[instr.common.operand_desc_id]);
+            const int offset = state.address_offset.raw_i32[instr.common.address_register_index];
+            src1.raw_i = LoadSource(state, instr.common.src1 + offset, swizzle.src1_selector, swizzle.negate_src1);
+            src2.raw_i = LoadSource(state, instr.common.src2, swizzle.src2_selector, swizzle.negate_src2);
+            state.conditional_code[0] = Compare(instr.common.compare_op.x, src1.x, src2.x);
+            state.conditional_code[1] = Compare(instr.common.compare_op.y, src1.y, src2.y);
+            break;
+        }
+
+        case OpCode::Id::MADI:
+        case OpCode::Id::MADI + 1:
+        case OpCode::Id::MADI + 2:
+        case OpCode::Id::MADI + 3:
+        case OpCode::Id::MADI + 4:
+        case OpCode::Id::MADI + 5:
+        case OpCode::Id::MADI + 6:
+        case OpCode::Id::MADI + 7:
+            FORMAT5I(_mm_add_ps(_mm_mul_ps(src1.raw_f, src2.raw_f), src3.raw_f));
+            break;
+
+        case OpCode::Id::MAD:
+        case OpCode::Id::MAD + 1:
+        case OpCode::Id::MAD + 2:
+        case OpCode::Id::MAD + 3:
+        case OpCode::Id::MAD + 4:
+        case OpCode::Id::MAD + 5:
+        case OpCode::Id::MAD + 6:
+        case OpCode::Id::MAD + 7:
+            FORMAT5(_mm_add_ps(_mm_mul_ps(src1.raw_f, src2.raw_f), src3.raw_f));
+            break;
+
+        default:
+            LOG_CRITICAL(HW_GPU, "Unhandled opcode: 0x%02x", instr.opcode.Value());
+            UNIMPLEMENTED();
+        }
+
+        state.pc += 1;
+
+        if (exit_loop)
+            break;
+    }
+
+    #undef FORMAT1
+    #undef FORMAT1I
+    #undef FORMAT1U
+    #undef FORMAT5
+    #undef FORMAT5I
+
+    // Setup output data
+    VertexShader::OutputVertex ret;
+    // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to
+    // figure out what those circumstances are and enable the remaining outputs then.
+    for (int i = 0; i < 7; ++i) {
+        const auto& output_register_map = regs.vs_output_attributes[i];
+
+        const u32 semantics[4] = {
+            output_register_map.map_x, output_register_map.map_y,
+            output_register_map.map_z, output_register_map.map_w
+        };
+
+        for (int comp = 0; comp < 4; ++comp) {
+            // Never write outside of OutputVertex, whatever semantic the register map contains
+            if (semantics[comp] >= sizeof(ret) / sizeof(f32))
+                continue;
+
+            f32* out = ((f32*)&ret) + semantics[comp]; // NOTE(review): assumes OutputVertex is laid out as 32-bit floats - confirm
+            *out = state.output[i][comp];
+        }
+    }
+
+    return ret;
+#else
+    // This build has no SSE4.1 code path (the interpreter above is compiled out), so run the
+    // vertex through the regular interpreter instead of aborting.
+    (void)state;
+    return VertexShader::RunShader(input, num_attributes);
+#endif
+}
+
+} // namespace VertexShaderSIMD
+
+} // namespace Pica
diff --git a/src/video_core/vertex_shader_simd.h b/src/video_core/vertex_shader_simd.h
new file mode 100644
index 000000000..daf5c9a84
--- /dev/null
+++ b/src/video_core/vertex_shader_simd.h
@@ -0,0 +1,90 @@
+// Copyright 2015 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <vector>
+
+#include "common/common_types.h"
+#include "common/intrinsics.h"
+
+#include "vertex_shader.h"
+
+namespace Pica {
+
+namespace VertexShaderSIMD {
+
+/// Four-component shader register, accessible per component or as one whole vector
+struct Reg {
+    union {
+        struct {
+            f32 x, y, z, w;
+        };
+        // Integer view of the four lanes that does not depend on compiler-specific __m128i members
+        s32 raw_i32[4];
+#if _M_SSE >= 0x401
+        __m128 raw_f;
+        __m128i raw_i;
+#endif
+    };
+
+    f32 operator [] (int i) const {
+        return *(&x + i);
+    }
+};
+
+/// Execution state of one shader core, passed to InitCore and RunShader
+struct CoreState {
+    u32 pc;
+    // Lane 0: kept at zero, lanes 1/2: address register x/y (written by MOVA), lane 3: loop counter
+    Reg address_offset;
+    bool conditional_code[2];
+
+    struct CallStackElement {
+        u32 final_address;  // Address upon which we jump to return_address
+        u32 return_address; // Where to jump when leaving scope
+        u8 repeat_counter;  // How often to repeat until this call stack element is removed
+        u8 loop_increment;  // Which value to add to the loop counter after an iteration
+                            // TODO: Should this be a signed value? Does it even matter?
+        u32 loop_address;   // The address where we'll return to after each loop iteration
+    };
+
+    // TODO: Is there a maximal size for this?
+    std::vector<CallStackElement> call_stack;
+
+    union {
+        struct {
+            Reg output[0x10];
+            Reg input[0x10];
+            Reg temporary[0x10];
+            Reg uniform[0x60];
+        };
+        Reg regs[0x90];
+    };
+
+    /// Returns source register `index`: 0x00-0x0f input, 0x10-0x1f temporary, 0x20-0x7f uniform
+    Reg InputReg(int index) const {
+        // Source registers follow the 0x10 output registers in the flat register file
+        return regs[0x10 + index];
+    }
+
+    /// Returns destination register `index`: 0x00-0x0f output, 0x10-0x1f temporary
+    Reg& OutputReg(int index) {
+        return regs[(index & 0xf) | ((index & 0x10) << 1)];
+    }
+
+};
+
+/// Initializes lookup tables used by the SIMD vertex shader core
+void Init();
+
+/// Initializes a SIMD vertex shader core for the current shader in Pica memory
+void InitCore(CoreState& state);
+
+/// Runs a vertex shader core for the current shader in Pica memory
+VertexShader::OutputVertex RunShader(CoreState& state, const VertexShader::InputVertex& input, int num_attributes);
+
+} // namespace VertexShaderSIMD
+
+} // namespace Pica