From 10272dd54108852f535ebe00d55db96402e12d27 Mon Sep 17 00:00:00 2001 From: Dragios Date: Sat, 16 Apr 2016 01:17:59 +0800 Subject: [PATCH] New GS refactor (#8) * Turn ShaderSetup into a class * Cleanup ShaderSetup * Replace logic in shader.cpp with loop * Rename 'VertexLoaded' breakpoint to 'Vertex shader invocation' * Prepare Pica regs for GS * Make shader code less VS-specific * Only check for enabled JIT in Setup() to avoid race conditions * Write shader registers in functions * Write GS registers * Implement EMIT and SETEMIT * Implement 4 shader units and geometry shaders --- .../debugger/graphics_breakpoints.cpp | 3 +- src/citra_qt/debugger/graphics_tracing.cpp | 2 +- .../debugger/graphics_vertex_shader.cpp | 6 +- src/video_core/command_processor.cpp | 227 +++++++---- src/video_core/debug_utils/debug_utils.h | 3 +- src/video_core/pica.cpp | 2 +- src/video_core/pica.h | 39 +- src/video_core/pica_state.h | 13 + src/video_core/primitive_assembly.cpp | 1 - src/video_core/shader/shader.cpp | 371 +++++++++++++----- src/video_core/shader/shader.h | 174 +++++--- src/video_core/shader/shader_interpreter.cpp | 97 +++-- src/video_core/shader/shader_interpreter.h | 3 +- src/video_core/shader/shader_jit_x64.cpp | 83 ++-- src/video_core/shader/shader_jit_x64.h | 23 +- 15 files changed, 726 insertions(+), 321 deletions(-) diff --git a/src/citra_qt/debugger/graphics_breakpoints.cpp b/src/citra_qt/debugger/graphics_breakpoints.cpp index 819ec7707..d6d3c558e 100644 --- a/src/citra_qt/debugger/graphics_breakpoints.cpp +++ b/src/citra_qt/debugger/graphics_breakpoints.cpp @@ -44,7 +44,8 @@ QVariant BreakPointModel::data(const QModelIndex& index, int role) const { Pica::DebugContext::Event::PicaCommandProcessed, tr("Pica command processed") }, { Pica::DebugContext::Event::IncomingPrimitiveBatch, tr("Incoming primitive batch") }, { Pica::DebugContext::Event::FinishedPrimitiveBatch, tr("Finished primitive batch") }, - { Pica::DebugContext::Event::VertexLoaded, tr("Vertex loaded") }, 
+ { Pica::DebugContext::Event::RunVS, tr("Vertex shader invocation") }, + { Pica::DebugContext::Event::RunGS, tr("Geometry shader invocation") }, { Pica::DebugContext::Event::IncomingDisplayTransfer, tr("Incoming display transfer") }, { Pica::DebugContext::Event::GSPCommandProcessed, tr("GSP command processed") }, { Pica::DebugContext::Event::BufferSwapped, tr("Buffers swapped") } diff --git a/src/citra_qt/debugger/graphics_tracing.cpp b/src/citra_qt/debugger/graphics_tracing.cpp index e06498744..e1e02a1da 100644 --- a/src/citra_qt/debugger/graphics_tracing.cpp +++ b/src/citra_qt/debugger/graphics_tracing.cpp @@ -70,7 +70,7 @@ void GraphicsTracingWidget::StartRecording() { std::array default_attributes; for (unsigned i = 0; i < 16; ++i) { for (unsigned comp = 0; comp < 3; ++comp) { - default_attributes[4 * i + comp] = nihstro::to_float24(Pica::g_state.vs.default_attributes[i][comp].ToFloat32()); + default_attributes[4 * i + comp] = nihstro::to_float24(Pica::g_state.vs_default_attributes[i][comp].ToFloat32()); } } diff --git a/src/citra_qt/debugger/graphics_vertex_shader.cpp b/src/citra_qt/debugger/graphics_vertex_shader.cpp index d648d4640..c04e9f92d 100644 --- a/src/citra_qt/debugger/graphics_vertex_shader.cpp +++ b/src/citra_qt/debugger/graphics_vertex_shader.cpp @@ -365,7 +365,7 @@ GraphicsVertexShaderWidget::GraphicsVertexShaderWidget(std::shared_ptr< Pica::De input_data[i]->setValidator(new QDoubleValidator(input_data[i])); } - breakpoint_warning = new QLabel(tr("(data only available at VertexLoaded breakpoints)")); + breakpoint_warning = new QLabel(tr("(data only available at vertex shader invocation breakpoints)")); // TODO: Add some button for jumping to the shader entry point @@ -454,7 +454,7 @@ GraphicsVertexShaderWidget::GraphicsVertexShaderWidget(std::shared_ptr< Pica::De void GraphicsVertexShaderWidget::OnBreakPointHit(Pica::DebugContext::Event event, void* data) { auto input = static_cast(data); - if (event == Pica::DebugContext::Event::VertexLoaded) 
{ + if (event == Pica::DebugContext::Event::RunVS) { Reload(true, data); } else { // No vertex data is retrievable => invalidate currently stored vertex data @@ -501,7 +501,7 @@ void GraphicsVertexShaderWidget::Reload(bool replace_vertex_data, void* vertex_d info.labels.insert({ entry_point, "main" }); // Generate debug information - debug_data = Pica::Shader::ProduceDebugInfo(input_vertex, num_attributes, shader_config, shader_setup); + debug_data = Pica::g_state.vs.ProduceDebugInfo(input_vertex, num_attributes, shader_config); // Reload widget state for (int attr = 0; attr < num_attributes; ++attr) { diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index a889ec0e1..8d9b83780 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -29,10 +29,6 @@ namespace Pica { namespace CommandProcessor { -static int float_regs_counter = 0; - -static u32 uniform_write_buffer[4]; - static int default_attr_counter = 0; static u32 default_attr_write_buffer[3]; @@ -126,7 +122,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { // TODO: Verify that this actually modifies the register! if (setup.index < 15) { - g_state.vs.default_attributes[setup.index] = attribute; + g_state.vs_default_attributes[setup.index] = attribute; setup.index++; } else { // Put each attribute into an immediate input buffer. 
@@ -141,14 +137,14 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { if (immediate_attribute_id >= regs.vs.num_input_attributes+1) { immediate_attribute_id = 0; - Shader::UnitState shader_unit; - Shader::Setup(); - - if (g_debug_context) - g_debug_context->OnEvent(DebugContext::Event::VertexLoaded, static_cast(&immediate_input)); + auto& shader_unit = Shader::GetShaderUnit(false); + g_state.vs.Setup(); // Send to vertex shader - Shader::OutputVertex output = Shader::Run(shader_unit, immediate_input, regs.vs.num_input_attributes+1); + if (g_debug_context) + g_debug_context->OnEvent(DebugContext::Event::RunVS, static_cast(&immediate_input)); + g_state.vs.Run(shader_unit, immediate_input, regs.vs.num_input_attributes+1, regs.vs); + Shader::OutputVertex output_vertex = shader_unit.output_registers.ToVertex(regs.vs); // Send to renderer using Pica::Shader::OutputVertex; @@ -156,7 +152,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2); }; - g_state.primitive_assembler.SubmitVertex(output, AddTriangle); + g_state.primitive_assembler.SubmitVertex(output_vertex, AddTriangle); } } } @@ -303,13 +299,16 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { // The size has been tuned for optimal balance between hit-rate and the cost of lookup const size_t VERTEX_CACHE_SIZE = 32; std::array vertex_cache_ids; - std::array vertex_cache; + std::array vertex_cache; unsigned int vertex_cache_pos = 0; vertex_cache_ids.fill(-1); - Shader::UnitState shader_unit; - Shader::Setup(); + auto& vs_shader_unit = Shader::GetShaderUnit(false); + g_state.vs.Setup(); + + auto& gs_unit_state = Shader::GetShaderUnit(true); + g_state.gs.Setup(); for (unsigned int index = 0; index < regs.num_vertices; ++index) { @@ -321,7 +320,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { ASSERT(vertex != -1); bool vertex_cache_hit = false; - Shader::OutputVertex output; + Shader::OutputRegisters output_registers; if 
(is_indexed) { if (g_debug_context && Pica::g_debug_context->recorder) { @@ -331,7 +330,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { for (unsigned int i = 0; i < VERTEX_CACHE_SIZE; ++i) { if (vertex == vertex_cache_ids[i]) { - output = vertex_cache[i]; + output_registers = vertex_cache[i]; vertex_cache_hit = true; break; } @@ -378,7 +377,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { } } else if (attribute_config.IsDefaultAttribute(i)) { // Load the default attribute if we're configured to do so - input.attr[i] = g_state.vs.default_attributes[i]; + input.attr[i] = g_state.vs_default_attributes[i]; LOG_TRACE(HW_GPU, "Loaded default attribute %x for vertex %x (index %x): (%f, %f, %f, %f)", i, vertex, index, input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(), @@ -390,27 +389,69 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { } } - if (g_debug_context) - g_debug_context->OnEvent(DebugContext::Event::VertexLoaded, (void*)&input); - // Send to vertex shader - output = Shader::Run(shader_unit, input, attribute_config.GetNumTotalAttributes()); + if (g_debug_context) + g_debug_context->OnEvent(DebugContext::Event::RunVS, static_cast(&input)); + g_state.vs.Run(vs_shader_unit, input, attribute_config.GetNumTotalAttributes(), g_state.regs.vs); + output_registers = vs_shader_unit.output_registers; if (is_indexed) { - vertex_cache[vertex_cache_pos] = output; + vertex_cache[vertex_cache_pos] = output_registers; vertex_cache_ids[vertex_cache_pos] = vertex; vertex_cache_pos = (vertex_cache_pos + 1) % VERTEX_CACHE_SIZE; } } - // Send to renderer + // Helper to send triangle to renderer using Pica::Shader::OutputVertex; auto AddTriangle = []( const OutputVertex& v0, const OutputVertex& v1, const OutputVertex& v2) { VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2); }; - primitive_assembler.SubmitVertex(output, AddTriangle); + if (Shader::UseGS()) { + + auto& regs = g_state.regs; + auto& gs_regs = g_state.regs.gs; + auto& 
gs_buf = g_state.gs_input_buffer; + + // Vertex Shader Outputs are converted into Geometry Shader inputs by filling up a buffer + // For example, if we have a geoshader that takes 6 inputs, and the vertex shader outputs 2 attributes + // It would take 3 vertices to fill up the Geometry Shader buffer + unsigned int gs_input_count = gs_regs.num_input_attributes + 1; + unsigned int vs_output_count = regs.vs_outmap_total2 + 1; + ASSERT_MSG(regs.vs_outmap_total1 == regs.vs_outmap_total2, "VS_OUTMAP_TOTAL1 and VS_OUTMAP_TOTAL2 don't match!"); + // copy into the geoshader buffer + for (unsigned int i = 0; i < vs_output_count; i++) { + if (gs_buf.index >= gs_input_count) { + // TODO(ds84182): LOG_ERROR() + ASSERT_MSG(false, "Number of GS inputs (%d) is not divisible by number of VS outputs (%d)", + gs_input_count, vs_output_count); + continue; + } + gs_buf.buffer.attr[gs_buf.index++] = output_registers.value[i]; + } + + if (gs_buf.index >= gs_input_count) { + + // b15 will be false when a new primitive starts and then switch to true at some point + //TODO: Test how this works exactly on hardware + g_state.gs.uniforms.b[15] |= (index > 0); + + // Process Geometry Shader + if (g_debug_context) + g_debug_context->OnEvent(DebugContext::Event::RunGS, static_cast(&gs_buf.buffer)); + gs_unit_state.emit_triangle_callback = AddTriangle; + g_state.gs.Run(gs_unit_state, gs_buf.buffer, gs_input_count, regs.gs); + gs_unit_state.emit_triangle_callback = nullptr; + + gs_buf.index = 0; + } + } else { + Shader::OutputVertex output_vertex = output_registers.ToVertex(regs.vs); + primitive_assembler.SubmitVertex(output_vertex, AddTriangle); + } + } for (auto& range : memory_accesses.ranges) { @@ -421,10 +462,76 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { break; } - case PICA_REG_INDEX(vs.bool_uniforms): - for (unsigned i = 0; i < 16; ++i) - g_state.vs.uniforms.b[i] = (regs.vs.bool_uniforms.Value() & (1 << i)) != 0; + case PICA_REG_INDEX(gs.bool_uniforms): + 
Shader::WriteUniformBoolReg(true, value); + break; + case PICA_REG_INDEX_WORKAROUND(gs.int_uniforms[0], 0x281): + case PICA_REG_INDEX_WORKAROUND(gs.int_uniforms[1], 0x282): + case PICA_REG_INDEX_WORKAROUND(gs.int_uniforms[2], 0x283): + case PICA_REG_INDEX_WORKAROUND(gs.int_uniforms[3], 0x284): + { + unsigned index = (id - PICA_REG_INDEX_WORKAROUND(gs.int_uniforms[0], 0x281)); + auto values = regs.gs.int_uniforms[index]; + Shader::WriteUniformIntReg(true, index, Math::Vec4(values.x, values.y, values.z, values.w)); + break; + } + + case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.setup, 0x290): + Shader::WriteUniformFloatSetupReg(true, value); + break; + + case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[0], 0x291): + case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[1], 0x292): + case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[2], 0x293): + case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[3], 0x294): + case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[4], 0x295): + case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[5], 0x296): + case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[6], 0x297): + case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[7], 0x298): + { + Shader::WriteUniformFloatReg(true, value); + break; + } + + // Load shader program code + case PICA_REG_INDEX_WORKAROUND(gs.program.offset, 0x29b): + Shader::WriteProgramCodeOffset(true, value); + break; + + case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[0], 0x29c): + case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[1], 0x29d): + case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[2], 0x29e): + case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[3], 0x29f): + case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[4], 0x2a0): + case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[5], 0x2a1): + case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[6], 0x2a2): + case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[7], 0x2a3): + { + 
Shader::WriteProgramCode(true, value); + break; + } + + // Load swizzle pattern data + case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.offset, 0x2a5): + Shader::WriteSwizzlePatternsOffset(true, value); + break; + + case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[0], 0x2a6): + case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[1], 0x2a7): + case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[2], 0x2a8): + case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[3], 0x2a9): + case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[4], 0x2aa): + case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[5], 0x2ab): + case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[6], 0x2ac): + case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[7], 0x2ad): + { + Shader::WriteSwizzlePatterns(true, value); + break; + } + + case PICA_REG_INDEX(vs.bool_uniforms): + Shader::WriteUniformBoolReg(false, value); break; case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[0], 0x2b1): @@ -432,14 +539,16 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[2], 0x2b3): case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[3], 0x2b4): { - int index = (id - PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[0], 0x2b1)); + unsigned index = (id - PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[0], 0x2b1)); auto values = regs.vs.int_uniforms[index]; - g_state.vs.uniforms.i[index] = Math::Vec4(values.x, values.y, values.z, values.w); - LOG_TRACE(HW_GPU, "Set integer uniform %d to %02x %02x %02x %02x", - index, values.x.Value(), values.y.Value(), values.z.Value(), values.w.Value()); + Shader::WriteUniformIntReg(false, index, Math::Vec4(values.x, values.y, values.z, values.w)); break; } + case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.setup, 0x2c0): + Shader::WriteUniformFloatSetupReg(false, value); + break; + case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[0], 0x2c1): case 
PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[1], 0x2c2): case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[2], 0x2c3): @@ -449,49 +558,15 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[6], 0x2c7): case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[7], 0x2c8): { - auto& uniform_setup = regs.vs.uniform_setup; - - // TODO: Does actual hardware indeed keep an intermediate buffer or does - // it directly write the values? - uniform_write_buffer[float_regs_counter++] = value; - - // Uniforms are written in a packed format such that four float24 values are encoded in - // three 32-bit numbers. We write to internal memory once a full such vector is - // written. - if ((float_regs_counter >= 4 && uniform_setup.IsFloat32()) || - (float_regs_counter >= 3 && !uniform_setup.IsFloat32())) { - float_regs_counter = 0; - - auto& uniform = g_state.vs.uniforms.f[uniform_setup.index]; - - if (uniform_setup.index > 95) { - LOG_ERROR(HW_GPU, "Invalid VS uniform index %d", (int)uniform_setup.index); - break; - } - - // NOTE: The destination component order indeed is "backwards" - if (uniform_setup.IsFloat32()) { - for (auto i : {0,1,2,3}) - uniform[3 - i] = float24::FromFloat32(*(float*)(&uniform_write_buffer[i])); - } else { - // TODO: Untested - uniform.w = float24::FromRaw(uniform_write_buffer[0] >> 8); - uniform.z = float24::FromRaw(((uniform_write_buffer[0] & 0xFF) << 16) | ((uniform_write_buffer[1] >> 16) & 0xFFFF)); - uniform.y = float24::FromRaw(((uniform_write_buffer[1] & 0xFFFF) << 8) | ((uniform_write_buffer[2] >> 24) & 0xFF)); - uniform.x = float24::FromRaw(uniform_write_buffer[2] & 0xFFFFFF); - } - - LOG_TRACE(HW_GPU, "Set uniform %x to (%f %f %f %f)", (int)uniform_setup.index, - uniform.x.ToFloat32(), uniform.y.ToFloat32(), uniform.z.ToFloat32(), - uniform.w.ToFloat32()); - - // TODO: Verify that this actually modifies the register! 
- uniform_setup.index.Assign(uniform_setup.index + 1); - } + Shader::WriteUniformFloatReg(false, value); break; } // Load shader program code + case PICA_REG_INDEX_WORKAROUND(vs.program.offset, 0x2cb): + Shader::WriteProgramCodeOffset(false, value); + break; + case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[0], 0x2cc): case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[1], 0x2cd): case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[2], 0x2ce): @@ -501,12 +576,15 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[6], 0x2d2): case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[7], 0x2d3): { - g_state.vs.program_code[regs.vs.program.offset] = value; - regs.vs.program.offset++; + Shader::WriteProgramCode(false, value); break; } // Load swizzle pattern data + case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.offset, 0x2d5): + Shader::WriteSwizzlePatternsOffset(false, value); + break; + case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[0], 0x2d6): case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[1], 0x2d7): case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[2], 0x2d8): @@ -516,8 +594,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[6], 0x2dc): case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[7], 0x2dd): { - g_state.vs.swizzle_data[regs.vs.swizzle_patterns.offset] = value; - regs.vs.swizzle_patterns.offset++; + Shader::WriteSwizzlePatterns(false, value); break; } diff --git a/src/video_core/debug_utils/debug_utils.h b/src/video_core/debug_utils/debug_utils.h index 7df941619..e01133d6f 100644 --- a/src/video_core/debug_utils/debug_utils.h +++ b/src/video_core/debug_utils/debug_utils.h @@ -30,7 +30,8 @@ public: PicaCommandProcessed, IncomingPrimitiveBatch, FinishedPrimitiveBatch, - VertexLoaded, + RunVS, + RunGS, IncomingDisplayTransfer, GSPCommandProcessed, BufferSwapped, diff --git 
a/src/video_core/pica.cpp b/src/video_core/pica.cpp index ccbaf071b..710ebedc1 100644 --- a/src/video_core/pica.cpp +++ b/src/video_core/pica.cpp @@ -497,7 +497,7 @@ void Init() { } void Shutdown() { - Shader::Shutdown(); + Shader::ShaderSetup::Shutdown(); } template diff --git a/src/video_core/pica.h b/src/video_core/pica.h index f066c9719..f4d7d720f 100644 --- a/src/video_core/pica.h +++ b/src/video_core/pica.h @@ -1070,7 +1070,7 @@ struct Regs { // Number of vertices to render u32 num_vertices; - INSERT_PADDING_WORDS(0x1); + BitField<0, 2, u32> using_geometry_shader; // The index of the first vertex to render u32 vertex_offset; @@ -1118,7 +1118,14 @@ struct Regs { } } command_buffer; - INSERT_PADDING_WORDS(0x07); + INSERT_PADDING_WORDS(0x06); + + enum class VSComMode : u32 { + Shared = 0, + Exclusive = 1 + }; + + VSComMode vs_com_mode; enum class GPUMode : u32 { Drawing = 0, @@ -1127,7 +1134,17 @@ struct Regs { GPUMode gpu_mode; - INSERT_PADDING_WORDS(0x18); + INSERT_PADDING_WORDS(0x4); + + BitField<0, 4, u32> vs_outmap_total1; + + INSERT_PADDING_WORDS(0x6); + + BitField<0, 4, u32> vs_outmap_total2; + + BitField<0, 4, u32> gsh_misc0; + + INSERT_PADDING_WORDS(0xB); enum class TriangleTopology : u32 { List = 0, @@ -1136,7 +1153,10 @@ struct Regs { Shader = 3, // Programmable setup unit implemented in a geometry shader }; - BitField<8, 2, TriangleTopology> triangle_topology; + union { + BitField<0, 4, u32> vs_outmap_count; + BitField<8, 2, TriangleTopology> triangle_topology; + }; u32 restart_primitive; @@ -1155,8 +1175,9 @@ struct Regs { INSERT_PADDING_WORDS(0x4); union { - // Number of input attributes to shader unit - 1 - BitField<0, 4, u32> num_input_attributes; + BitField<0, 4, u32> num_input_attributes; // Number of input attributes to shader unit - 1 + BitField<8, 4, u32> use_subdivision; + BitField<24, 8, u32> use_geometry_shader; }; // Offset to shader program entry point (in words) @@ -1208,6 +1229,8 @@ struct Regs { } union { + u32 setup; + // Index of 
the next uniform to write to // TODO: ctrulib uses 8 bits for this, however that seems to yield lots of invalid indices // TODO: Maybe the uppermost index is for the geometry shader? Investigate! @@ -1324,7 +1347,11 @@ ASSERT_REG_POSITION(trigger_draw, 0x22e); ASSERT_REG_POSITION(trigger_draw_indexed, 0x22f); ASSERT_REG_POSITION(vs_default_attributes_setup, 0x232); ASSERT_REG_POSITION(command_buffer, 0x238); +ASSERT_REG_POSITION(vs_com_mode, 0x244); ASSERT_REG_POSITION(gpu_mode, 0x245); +ASSERT_REG_POSITION(vs_outmap_total1, 0x24A); +ASSERT_REG_POSITION(vs_outmap_total2, 0x251); +ASSERT_REG_POSITION(gsh_misc0, 0x252); ASSERT_REG_POSITION(triangle_topology, 0x25e); ASSERT_REG_POSITION(restart_primitive, 0x25f); ASSERT_REG_POSITION(gs, 0x280); diff --git a/src/video_core/pica_state.h b/src/video_core/pica_state.h index 323290054..848e6bde1 100644 --- a/src/video_core/pica_state.h +++ b/src/video_core/pica_state.h @@ -17,9 +17,13 @@ struct State { /// Pica registers Regs regs; + Shader::UnitState shader_units[4]; + Shader::ShaderSetup vs; Shader::ShaderSetup gs; + Math::Vec4 vs_default_attributes[16]; + struct { union LutEntry { // Used for raw access @@ -56,6 +60,15 @@ struct State { // This is constructed with a dummy triangle topology PrimitiveAssembler primitive_assembler; + + /// Current geometry shader state + struct GeometryShaderState { + // Buffer used for geometry shader inputs + Shader::InputVertex buffer; + // The current index into the buffer + unsigned int index; + } gs_input_buffer; + }; extern State g_state; ///< Current Pica state diff --git a/src/video_core/primitive_assembly.cpp b/src/video_core/primitive_assembly.cpp index ff3e2b862..a22f153d3 100644 --- a/src/video_core/primitive_assembly.cpp +++ b/src/video_core/primitive_assembly.cpp @@ -20,7 +20,6 @@ template void PrimitiveAssembler::SubmitVertex(VertexType& vtx, TriangleHandler triangle_handler) { switch (topology) { - // TODO: Figure out what's different with TriangleTopology::Shader. 
case Regs::TriangleTopology::List: case Regs::TriangleTopology::Shader: if (buffer_index < 2) { diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp index 75301accd..3008d23d9 100644 --- a/src/video_core/shader/shader.cpp +++ b/src/video_core/shader/shader.cpp @@ -27,83 +27,7 @@ namespace Pica { namespace Shader { -#ifdef ARCHITECTURE_x86_64 -static std::unordered_map> shader_map; -static const JitShader* jit_shader; -#endif // ARCHITECTURE_x86_64 - -void Setup() { -#ifdef ARCHITECTURE_x86_64 - if (VideoCore::g_shader_jit_enabled) { - u64 cache_key = (Common::ComputeHash64(&g_state.vs.program_code, sizeof(g_state.vs.program_code)) ^ - Common::ComputeHash64(&g_state.vs.swizzle_data, sizeof(g_state.vs.swizzle_data))); - - auto iter = shader_map.find(cache_key); - if (iter != shader_map.end()) { - jit_shader = iter->second.get(); - } else { - auto shader = std::make_unique(); - shader->Compile(); - jit_shader = shader.get(); - shader_map[cache_key] = std::move(shader); - } - } -#endif // ARCHITECTURE_x86_64 -} - -void Shutdown() { -#ifdef ARCHITECTURE_x86_64 - shader_map.clear(); -#endif // ARCHITECTURE_x86_64 -} - -static Common::Profiling::TimingCategory shader_category("Vertex Shader"); -MICROPROFILE_DEFINE(GPU_VertexShader, "GPU", "Vertex Shader", MP_RGB(50, 50, 240)); - -OutputVertex Run(UnitState& state, const InputVertex& input, int num_attributes) { - auto& config = g_state.regs.vs; - - Common::Profiling::ScopeTimer timer(shader_category); - MICROPROFILE_SCOPE(GPU_VertexShader); - - state.program_counter = config.main_offset; - state.debug.max_offset = 0; - state.debug.max_opdesc_id = 0; - - // Setup input register table - const auto& attribute_register_map = config.input_register_map; - - // TODO: Instead of this cumbersome logic, just load the input data directly like - // for (int attr = 0; attr < num_attributes; ++attr) { input_attr[0] = state.registers.input[attribute_register_map.attribute0_register]; } - if (num_attributes > 
0) state.registers.input[attribute_register_map.attribute0_register] = input.attr[0]; - if (num_attributes > 1) state.registers.input[attribute_register_map.attribute1_register] = input.attr[1]; - if (num_attributes > 2) state.registers.input[attribute_register_map.attribute2_register] = input.attr[2]; - if (num_attributes > 3) state.registers.input[attribute_register_map.attribute3_register] = input.attr[3]; - if (num_attributes > 4) state.registers.input[attribute_register_map.attribute4_register] = input.attr[4]; - if (num_attributes > 5) state.registers.input[attribute_register_map.attribute5_register] = input.attr[5]; - if (num_attributes > 6) state.registers.input[attribute_register_map.attribute6_register] = input.attr[6]; - if (num_attributes > 7) state.registers.input[attribute_register_map.attribute7_register] = input.attr[7]; - if (num_attributes > 8) state.registers.input[attribute_register_map.attribute8_register] = input.attr[8]; - if (num_attributes > 9) state.registers.input[attribute_register_map.attribute9_register] = input.attr[9]; - if (num_attributes > 10) state.registers.input[attribute_register_map.attribute10_register] = input.attr[10]; - if (num_attributes > 11) state.registers.input[attribute_register_map.attribute11_register] = input.attr[11]; - if (num_attributes > 12) state.registers.input[attribute_register_map.attribute12_register] = input.attr[12]; - if (num_attributes > 13) state.registers.input[attribute_register_map.attribute13_register] = input.attr[13]; - if (num_attributes > 14) state.registers.input[attribute_register_map.attribute14_register] = input.attr[14]; - if (num_attributes > 15) state.registers.input[attribute_register_map.attribute15_register] = input.attr[15]; - - state.conditional_code[0] = false; - state.conditional_code[1] = false; - -#ifdef ARCHITECTURE_x86_64 - if (VideoCore::g_shader_jit_enabled) - jit_shader->Run(&state.registers, g_state.regs.vs.main_offset); - else - RunInterpreter(state); -#else - 
RunInterpreter(state); -#endif // ARCHITECTURE_x86_64 - +OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) { // Setup output data OutputVertex ret; // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to @@ -114,10 +38,10 @@ OutputVertex Run(UnitState& state, const InputVertex& input, int num_attr if (index >= g_state.regs.vs_output_total) break; - if ((g_state.regs.vs.output_mask & (1 << i)) == 0) + if ((config.output_mask & (1 << i)) == 0) continue; - const auto& output_register_map = g_state.regs.vs_output_attributes[index]; // TODO: Don't hardcode VS here + const auto& output_register_map = g_state.regs.vs_output_attributes[index]; u32 semantics[4] = { output_register_map.map_x, output_register_map.map_y, @@ -127,7 +51,7 @@ OutputVertex Run(UnitState& state, const InputVertex& input, int num_attr for (unsigned comp = 0; comp < 4; ++comp) { float24* out = ((float24*)&ret) + semantics[comp]; if (semantics[comp] != Regs::VSOutputAttributes::INVALID) { - *out = state.registers.output[i][comp]; + *out = value[i][comp]; } else { // Zero output so that attributes which aren't output won't have denormals in them, // which would slow us down later. 
@@ -155,10 +79,71 @@ OutputVertex Run(UnitState& state, const InputVertex& input, int num_attr return ret; } -DebugData ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const ShaderSetup& setup) { +#ifdef ARCHITECTURE_x86_64 +static std::unordered_map> shader_map; +#endif // ARCHITECTURE_x86_64 + +void ShaderSetup::Setup() { +#ifdef ARCHITECTURE_x86_64 + if (VideoCore::g_shader_jit_enabled) { + u64 cache_key = (Common::ComputeHash64(&program_code, sizeof(program_code)) ^ + Common::ComputeHash64(&swizzle_data, sizeof(swizzle_data))); + + auto iter = shader_map.find(cache_key); + if (iter != shader_map.end()) { + jit_shader = iter->second; + } else { + auto shader = std::make_shared(); + shader->Compile(*this); + jit_shader = shader; + shader_map[cache_key] = std::move(shader); + } + } else { + jit_shader.reset(); + } +#endif // ARCHITECTURE_x86_64 +} + +void ShaderSetup::Shutdown() { +#ifdef ARCHITECTURE_x86_64 + shader_map.clear(); +#endif // ARCHITECTURE_x86_64 +} + +static Common::Profiling::TimingCategory shader_category("Shader"); +MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240)); + +void ShaderSetup::Run(UnitState& state, const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config) { + + Common::Profiling::ScopeTimer timer(shader_category); + MICROPROFILE_SCOPE(GPU_Shader); + + state.debug.max_offset = 0; + state.debug.max_opdesc_id = 0; + + // Setup input register table + const auto& attribute_register_map = config.input_register_map; + + for (unsigned i = 0; i < num_attributes; i++) + state.registers.input[attribute_register_map.GetRegisterForAttribute(i)] = input.attr[i]; + + state.conditional_code[0] = false; + state.conditional_code[1] = false; + +#ifdef ARCHITECTURE_x86_64 + if (auto shader = jit_shader.lock()) + shader.get()->Run(config, *this, state); + else + RunInterpreter(config, *this, state); +#else + RunInterpreter(config, *this, state); +#endif // 
ARCHITECTURE_x86_64 + +} + +DebugData ShaderSetup::ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config) { UnitState state; - state.program_counter = config.main_offset; state.debug.max_offset = 0; state.debug.max_opdesc_id = 0; @@ -167,30 +152,218 @@ DebugData ProduceDebugInfo(const InputVertex& input, int num_attributes, c float24 dummy_register; boost::fill(state.registers.input, &dummy_register); - if (num_attributes > 0) state.registers.input[attribute_register_map.attribute0_register] = &input.attr[0].x; - if (num_attributes > 1) state.registers.input[attribute_register_map.attribute1_register] = &input.attr[1].x; - if (num_attributes > 2) state.registers.input[attribute_register_map.attribute2_register] = &input.attr[2].x; - if (num_attributes > 3) state.registers.input[attribute_register_map.attribute3_register] = &input.attr[3].x; - if (num_attributes > 4) state.registers.input[attribute_register_map.attribute4_register] = &input.attr[4].x; - if (num_attributes > 5) state.registers.input[attribute_register_map.attribute5_register] = &input.attr[5].x; - if (num_attributes > 6) state.registers.input[attribute_register_map.attribute6_register] = &input.attr[6].x; - if (num_attributes > 7) state.registers.input[attribute_register_map.attribute7_register] = &input.attr[7].x; - if (num_attributes > 8) state.registers.input[attribute_register_map.attribute8_register] = &input.attr[8].x; - if (num_attributes > 9) state.registers.input[attribute_register_map.attribute9_register] = &input.attr[9].x; - if (num_attributes > 10) state.registers.input[attribute_register_map.attribute10_register] = &input.attr[10].x; - if (num_attributes > 11) state.registers.input[attribute_register_map.attribute11_register] = &input.attr[11].x; - if (num_attributes > 12) state.registers.input[attribute_register_map.attribute12_register] = &input.attr[12].x; - if (num_attributes > 13) 
state.registers.input[attribute_register_map.attribute13_register] = &input.attr[13].x; - if (num_attributes > 14) state.registers.input[attribute_register_map.attribute14_register] = &input.attr[14].x; - if (num_attributes > 15) state.registers.input[attribute_register_map.attribute15_register] = &input.attr[15].x; + for (unsigned i = 0; i < num_attributes; i++) + state.registers.input[attribute_register_map.GetRegisterForAttribute(i)] = input.attr[i]; state.conditional_code[0] = false; state.conditional_code[1] = false; - RunInterpreter(state); + RunInterpreter(config, *this, state); return state.debug; } +bool SharedGS() { + return g_state.regs.vs_com_mode == Pica::Regs::VSComMode::Shared; +} + +bool UseGS() { + // TODO(ds84182): This would be more accurate if it looked at individual shader units for the geoshader bit + // gs_regs.input_buffer_config.use_geometry_shader == 0x08 + ASSERT((g_state.regs.using_geometry_shader == 0) || (g_state.regs.using_geometry_shader == 2)); + return g_state.regs.using_geometry_shader == 2; +} + +UnitState& GetShaderUnit(bool gs) { + + // GS are always run on shader unit 3 + if (gs) { + return g_state.shader_units[3]; + } + + // The worst scheduler you'll ever see! + //TODO: How does PICA shader scheduling work? + static unsigned shader_unit_scheduler = 0; + shader_unit_scheduler++; + shader_unit_scheduler %= 3; // TODO: When does it also allow use of unit 3?! + return g_state.shader_units[shader_unit_scheduler]; +} + +void WriteUniformBoolReg(bool gs, u32 value) { + auto& setup = gs ? g_state.gs : g_state.vs; + + ASSERT(setup.uniforms.b.size() == 16); + for (unsigned i = 0; i < 16; ++i) + setup.uniforms.b[i] = (value & (1 << i)) != 0; + + // Copy for GS in shared mode + if (!gs && SharedGS()) { + WriteUniformBoolReg(true, value); + } +} + +void WriteUniformIntReg(bool gs, unsigned index, const Math::Vec4& values) { + const char* shader_type = gs ? "GS" : "VS"; + auto& setup = gs ? 
g_state.gs : g_state.vs; + + ASSERT(index < setup.uniforms.i.size()); + setup.uniforms.i[index] = values; + LOG_TRACE(HW_GPU, "Set %s integer uniform %d to %02x %02x %02x %02x", + shader_type, index, values.x.Value(), values.y.Value(), values.z.Value(), values.w.Value()); + + // Copy for GS in shared mode + if (!gs && SharedGS()) { + WriteUniformIntReg(true, index, values); + } +} + +void WriteUniformFloatSetupReg(bool gs, u32 value) { + auto& config = gs ? g_state.regs.gs : g_state.regs.vs; + + config.uniform_setup.setup = value; + + // Copy for GS in shared mode + if (!gs && SharedGS()) { + WriteUniformFloatSetupReg(true, value); + } +} + +void WriteUniformFloatReg(bool gs, u32 value) { + const char* shader_type = gs ? "GS" : "VS"; + auto& config = gs ? g_state.regs.gs : g_state.regs.vs; + auto& setup = gs ? g_state.gs : g_state.vs; + + auto& uniform_setup = config.uniform_setup; + auto& uniform_write_buffer = setup.uniform_write_buffer; + auto& float_regs_counter = setup.float_regs_counter; + + // TODO: Does actual hardware indeed keep an intermediate buffer or does + // it directly write the values? + uniform_write_buffer[float_regs_counter++] = value; + + // Uniforms are written in a packed format such that four float24 values are encoded in + // three 32-bit numbers. We write to internal memory once a full such vector is + // written. 
+ if ((float_regs_counter >= 4 && uniform_setup.IsFloat32()) || + (float_regs_counter >= 3 && !uniform_setup.IsFloat32())) { + float_regs_counter = 0; + + auto& uniform = setup.uniforms.f[uniform_setup.index]; + + if (uniform_setup.index >= 96) { + LOG_ERROR(HW_GPU, "Invalid %s float uniform index %d", shader_type, (int)uniform_setup.index); + } else { + + // NOTE: The destination component order indeed is "backwards" + if (uniform_setup.IsFloat32()) { + for (auto i : {0,1,2,3}) + uniform[3 - i] = float24::FromFloat32(*(float*)(&uniform_write_buffer[i])); + } else { + // TODO: Untested + uniform.w = float24::FromRaw(uniform_write_buffer[0] >> 8); + uniform.z = float24::FromRaw(((uniform_write_buffer[0] & 0xFF) << 16) | ((uniform_write_buffer[1] >> 16) & 0xFFFF)); + uniform.y = float24::FromRaw(((uniform_write_buffer[1] & 0xFFFF) << 8) | ((uniform_write_buffer[2] >> 24) & 0xFF)); + uniform.x = float24::FromRaw(uniform_write_buffer[2] & 0xFFFFFF); + } + + LOG_TRACE(HW_GPU, "Set %s float uniform %x to (%f %f %f %f)", shader_type, (int)uniform_setup.index, + uniform.x.ToFloat32(), uniform.y.ToFloat32(), uniform.z.ToFloat32(), + uniform.w.ToFloat32()); + + // TODO: Verify that this actually modifies the register! + uniform_setup.index.Assign(uniform_setup.index + 1); + } + + } + + // Copy for GS in shared mode + if (!gs && SharedGS()) { + WriteUniformFloatReg(true, value); + } +} + +void WriteProgramCodeOffset(bool gs, u32 value) { + auto& config = gs ? g_state.regs.gs : g_state.regs.vs; + config.program.offset = value; + + // Copy for GS in shared mode + if (!gs && SharedGS()) { + WriteProgramCodeOffset(true, value); + } +} + +void WriteProgramCode(bool gs, u32 value) { + const char* shader_type = gs ? "GS" : "VS"; + auto& config = gs ? g_state.regs.gs : g_state.regs.vs; + auto& setup = gs ? 
g_state.gs : g_state.vs; + + if (config.program.offset >= setup.program_code.size()) { + LOG_ERROR(HW_GPU, "Invalid %s program offset %d", shader_type, (int)config.program.offset); + } else { + setup.program_code[config.program.offset] = value; + config.program.offset++; + } + + // Copy for GS in shared mode + if (!gs && SharedGS()) { + WriteProgramCode(true, value); + } +} + +void WriteSwizzlePatternsOffset(bool gs, u32 value) { + auto& config = gs ? g_state.regs.gs : g_state.regs.vs; + config.swizzle_patterns.offset = value; + + // Copy for GS in shared mode + if (!gs && SharedGS()) { + WriteSwizzlePatternsOffset(true, value); + } +} + +void WriteSwizzlePatterns(bool gs, u32 value) { + const char* shader_type = gs ? "GS" : "VS"; + auto& config = gs ? g_state.regs.gs : g_state.regs.vs; + auto& setup = gs ? g_state.gs : g_state.vs; + + if (config.swizzle_patterns.offset >= setup.swizzle_data.size()) { + LOG_ERROR(HW_GPU, "Invalid %s swizzle pattern offset %d", shader_type, (int)config.swizzle_patterns.offset); + } else { + setup.swizzle_data[config.swizzle_patterns.offset] = value; + config.swizzle_patterns.offset++; + } + + // Copy for GS in shared mode + if (!gs && SharedGS()) { + WriteSwizzlePatterns(true, value); + } +} + +template +void HandleEMIT(UnitState& state) { + auto &config = g_state.regs.gs; + auto &emit_params = state.emit_params; + auto &emit_buffers = state.emit_buffers; + + ASSERT(emit_params.vertex_id < 3); + + emit_buffers[emit_params.vertex_id] = state.output_registers; + + if (emit_params.primitive_emit) { + ASSERT_MSG(state.emit_triangle_callback, "EMIT invoked but no handler set!"); + OutputVertex v0 = emit_buffers[0].ToVertex(config); + OutputVertex v1 = emit_buffers[1].ToVertex(config); + OutputVertex v2 = emit_buffers[2].ToVertex(config); + if (emit_params.winding) { + state.emit_triangle_callback(v2, v1, v0); + } else { + state.emit_triangle_callback(v0, v1, v2); + } + } +} + +// Explicit instantiation +template void 
HandleEMIT(UnitState& state); +template void HandleEMIT(UnitState& state); + } // namespace Shader } // namespace Pica diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h index 9c5bd97bd..3b1658215 100644 --- a/src/video_core/shader/shader.h +++ b/src/video_core/shader/shader.h @@ -4,6 +4,7 @@ #pragma once +#include #include #include @@ -15,6 +16,7 @@ #include "common/vector_math.h" #include "video_core/pica.h" +#include "video_core/primitive_assembly.h" using nihstro::RegisterType; using nihstro::SourceRegister; @@ -24,6 +26,11 @@ namespace Pica { namespace Shader { +#ifdef ARCHITECTURE_x86_64 +// Forward declare JitShader because shader_jit_x64.h requires ShaderSetup (which uses JitShader) from this file +class JitShader; +#endif // ARCHITECTURE_x86_64 + struct InputVertex { Math::Vec4 attr[16]; }; @@ -77,22 +84,14 @@ struct OutputVertex { static_assert(std::is_pod::value, "Structure is not POD"); static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size"); -/// Vertex shader memory -struct ShaderSetup { - struct { - // The float uniforms are accessed by the shader JIT using SSE instructions, and are - // therefore required to be 16-byte aligned. 
- alignas(16) Math::Vec4 f[96]; +struct OutputRegisters { + OutputRegisters() = default; - std::array b; - std::array, 4> i; - } uniforms; + alignas(16) Math::Vec4 value[16]; - Math::Vec4 default_attributes[16]; - - std::array program_code; - std::array swizzle_data; + OutputVertex ToVertex(const Regs::ShaderConfig& config); }; +static_assert(std::is_pod::value, "Structure is not POD"); // Helper structure used to keep track of data useful for inspection of shader emulation template @@ -192,9 +191,9 @@ inline void SetField(DebugDataRecord& record, float24* va record.src3.x = value[0]; record.src3.y = value[1]; record.src3.z = value[2]; + record.src3.w = value[3]; } - template<> inline void SetField(DebugDataRecord& record, float24* value) { record.dest_in.x = value[0]; @@ -277,43 +276,38 @@ struct UnitState { // The registers are accessed by the shader JIT using SSE instructions, and are therefore // required to be 16-byte aligned. alignas(16) Math::Vec4 input[16]; - alignas(16) Math::Vec4 output[16]; alignas(16) Math::Vec4 temporary[16]; } registers; static_assert(std::is_pod::value, "Structure is not POD"); - u32 program_counter; + OutputRegisters emit_buffers[3]; //TODO: 3dbrew suggests this only stores the first 7 output registers + + union EmitParameters { + u32 raw; + BitField<22, 1, u32> winding; + BitField<23, 1, u32> primitive_emit; + BitField<24, 2, u32> vertex_id; + } emit_params; + + PrimitiveAssembler::TriangleHandler emit_triangle_callback; + + OutputRegisters output_registers; + bool conditional_code[2]; // Two Address registers and one loop counter // TODO: How many bits do these actually have? 
s32 address_registers[3]; - enum { - INVALID_ADDRESS = 0xFFFFFFFF - }; - - struct CallStackElement { - u32 final_address; // Address upon which we jump to return_address - u32 return_address; // Where to jump when leaving scope - u8 repeat_counter; // How often to repeat until this call stack element is removed - u8 loop_increment; // Which value to add to the loop counter after an iteration - // TODO: Should this be a signed value? Does it even matter? - u32 loop_address; // The address where we'll return to after each loop iteration - }; - - // TODO: Is there a maximal size for this? - boost::container::static_vector call_stack; - DebugData debug; static size_t InputOffset(const SourceRegister& reg) { switch (reg.GetRegisterType()) { case RegisterType::Input: - return offsetof(UnitState::Registers, input) + reg.GetIndex()*sizeof(Math::Vec4); + return offsetof(UnitState, registers.input) + reg.GetIndex()*sizeof(Math::Vec4); case RegisterType::Temporary: - return offsetof(UnitState::Registers, temporary) + reg.GetIndex()*sizeof(Math::Vec4); + return offsetof(UnitState, registers.temporary) + reg.GetIndex()*sizeof(Math::Vec4); default: UNREACHABLE(); @@ -324,45 +318,105 @@ struct UnitState { static size_t OutputOffset(const DestRegister& reg) { switch (reg.GetRegisterType()) { case RegisterType::Output: - return offsetof(UnitState::Registers, output) + reg.GetIndex()*sizeof(Math::Vec4); + return offsetof(UnitState, output_registers.value) + reg.GetIndex()*sizeof(Math::Vec4); case RegisterType::Temporary: - return offsetof(UnitState::Registers, temporary) + reg.GetIndex()*sizeof(Math::Vec4); + return offsetof(UnitState, registers.temporary) + reg.GetIndex()*sizeof(Math::Vec4); default: UNREACHABLE(); return 0; } } + + static size_t EmitParamsOffset() { + return offsetof(UnitState, emit_params.raw); + } }; -/** - * Performs any shader unit setup that only needs to happen once per shader (as opposed to once per - * vertex, which would happen within the `Run` function). 
- */ -void Setup(); +class ShaderSetup { -/// Performs any cleanup when the emulator is shutdown -void Shutdown(); +public: -/** - * Runs the currently setup shader - * @param state Shader unit state, must be setup per shader and per shader unit - * @param input Input vertex into the shader - * @param num_attributes The number of vertex shader attributes - * @return The output vertex, after having been processed by the vertex shader - */ -OutputVertex Run(UnitState& state, const InputVertex& input, int num_attributes); + struct { + // The float uniforms are accessed by the shader JIT using SSE instructions, and are + // therefore required to be 16-byte aligned. + alignas(16) Math::Vec4 f[96]; -/** - * Produce debug information based on the given shader and input vertex - * @param input Input vertex into the shader - * @param num_attributes The number of vertex shader attributes - * @param config Configuration object for the shader pipeline - * @param setup Setup object for the shader pipeline - * @return Debug information for this shader with regards to the given vertex - */ -DebugData ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const ShaderSetup& setup); + std::array b; + std::array, 4> i; + } uniforms; + + static size_t UniformOffset(RegisterType type, unsigned index) { + switch (type) { + case RegisterType::FloatUniform: + return offsetof(ShaderSetup, uniforms.f) + index*sizeof(Math::Vec4); + + case RegisterType::BoolUniform: + return offsetof(ShaderSetup, uniforms.b) + index*sizeof(bool); + + case RegisterType::IntUniform: + return offsetof(ShaderSetup, uniforms.i) + index*sizeof(Math::Vec4); + + default: + UNREACHABLE(); + return 0; + } + } + + int float_regs_counter = 0; + u32 uniform_write_buffer[4]; + + std::array program_code; + std::array swizzle_data; + +#ifdef ARCHITECTURE_x86_64 + std::weak_ptr jit_shader; +#endif + + /** + * Performs any shader setup that only needs to happen once per shader (as 
opposed to once per + * vertex, which would happen within the `Run` function). + */ + void Setup(); + + /// Performs any cleanup when the emulator is shutdown + static void Shutdown(); + + /** + * Runs the currently setup shader + * @param state Shader unit state, must be setup per shader and per shader unit + * @param input Input vertex into the shader + * @param num_attributes The number of vertex shader attributes + * @param config Configuration object for the shader pipeline + */ + void Run(UnitState& state, const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config); + + /** + * Produce debug information based on the given shader and input vertex + * @param input Input vertex into the shader + * @param num_attributes The number of vertex shader attributes + * @param config Configuration object for the shader pipeline + * @return Debug information for this shader with regards to the given vertex + */ + DebugData ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config); + +}; + +bool SharedGS(); +bool UseGS(); +UnitState& GetShaderUnit(bool gs); +void WriteUniformBoolReg(bool gs, u32 value); +void WriteUniformIntReg(bool gs, unsigned index, const Math::Vec4& values); +void WriteUniformFloatSetupReg(bool gs, u32 value); +void WriteUniformFloatReg(bool gs, u32 value); +void WriteProgramCodeOffset(bool gs, u32 value); +void WriteProgramCode(bool gs, u32 value); +void WriteSwizzlePatternsOffset(bool gs, u32 value); +void WriteSwizzlePatterns(bool gs, u32 value); + +template +void HandleEMIT(UnitState& state); } // namespace Shader diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp index 9b978583e..30b50ce2f 100644 --- a/src/video_core/shader/shader_interpreter.cpp +++ b/src/video_core/shader/shader_interpreter.cpp @@ -21,11 +21,30 @@ namespace Pica { namespace Shader { +enum { + INVALID_ADDRESS = 0xFFFFFFFF +}; + +struct CallStackElement { + u32 
final_address; // Address upon which we jump to return_address + u32 return_address; // Where to jump when leaving scope + u8 repeat_counter; // How often to repeat until this call stack element is removed + u8 loop_increment; // Which value to add to the loop counter after an iteration + // TODO: Should this be a signed value? Does it even matter? + u32 loop_address; // The address where we'll return to after each loop iteration +}; + template -void RunInterpreter(UnitState& state) { - const auto& uniforms = g_state.vs.uniforms; - const auto& swizzle_data = g_state.vs.swizzle_data; - const auto& program_code = g_state.vs.program_code; +void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& setup, UnitState& state) { + + // TODO: Is there a maximal size for this? + boost::container::static_vector call_stack; + + u32 program_counter = config.main_offset; + + const auto& uniforms = setup.uniforms; + const auto& swizzle_data = setup.swizzle_data; + const auto& program_code = setup.program_code; // Placeholder for invalid inputs static float24 dummy_vec4_float24[4]; @@ -33,16 +52,16 @@ void RunInterpreter(UnitState& state) { unsigned iteration = 0; bool exit_loop = false; while (!exit_loop) { - if (!state.call_stack.empty()) { - auto& top = state.call_stack.back(); - if (state.program_counter == top.final_address) { + if (!call_stack.empty()) { + auto& top = call_stack.back(); + if (program_counter == top.final_address) { state.address_registers[2] += top.loop_increment; if (top.repeat_counter-- == 0) { - state.program_counter = top.return_address; - state.call_stack.pop_back(); + program_counter = top.return_address; + call_stack.pop_back(); } else { - state.program_counter = top.loop_address; + program_counter = top.loop_address; } // TODO: Is "trying again" accurate to hardware? 
@@ -50,20 +69,20 @@ void RunInterpreter(UnitState& state) { } } - const Instruction instr = { program_code[state.program_counter] }; + const Instruction instr = { program_code[program_counter] }; const SwizzlePattern swizzle = { swizzle_data[instr.common.operand_desc_id] }; - static auto call = [](UnitState& state, u32 offset, u32 num_instructions, + static auto call = [&program_counter, &call_stack](UnitState& state, u32 offset, u32 num_instructions, u32 return_offset, u8 repeat_count, u8 loop_increment) { - state.program_counter = offset - 1; // -1 to make sure when incrementing the PC we end up at the correct offset - ASSERT(state.call_stack.size() < state.call_stack.capacity()); - state.call_stack.push_back({ offset + num_instructions, return_offset, repeat_count, loop_increment, offset }); + program_counter = offset - 1; // -1 to make sure when incrementing the PC we end up at the correct offset + ASSERT(call_stack.size() < call_stack.capacity()); + call_stack.push_back({ offset + num_instructions, return_offset, repeat_count, loop_increment, offset }); }; - Record(state.debug, iteration, state.program_counter); + Record(state.debug, iteration, program_counter); if (iteration > 0) - Record(state.debug, iteration - 1, state.program_counter); + Record(state.debug, iteration - 1, program_counter); - state.debug.max_offset = std::max(state.debug.max_offset, 1 + state.program_counter); + state.debug.max_offset = std::max(state.debug.max_offset, 1 + program_counter); auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const float24* { switch (source_reg.GetRegisterType()) { @@ -120,7 +139,7 @@ void RunInterpreter(UnitState& state) { src2[3] = src2[3] * float24::FromFloat32(-1); } - float24* dest = (instr.common.dest.Value() < 0x10) ? &state.registers.output[instr.common.dest.Value().GetIndex()][0] + float24* dest = (instr.common.dest.Value() < 0x10) ? 
&state.output_registers.value[instr.common.dest.Value().GetIndex()][0] : (instr.common.dest.Value() < 0x20) ? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0] : dummy_vec4_float24; @@ -459,7 +478,7 @@ void RunInterpreter(UnitState& state) { src3[3] = src3[3] * float24::FromFloat32(-1); } - float24* dest = (instr.mad.dest.Value() < 0x10) ? &state.registers.output[instr.mad.dest.Value().GetIndex()][0] + float24* dest = (instr.mad.dest.Value() < 0x10) ? &state.output_registers.value[instr.mad.dest.Value().GetIndex()][0] : (instr.mad.dest.Value() < 0x20) ? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0] : dummy_vec4_float24; @@ -511,7 +530,7 @@ void RunInterpreter(UnitState& state) { case OpCode::Id::JMPC: Record(state.debug, iteration, state.conditional_code); if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) { - state.program_counter = instr.flow_control.dest_offset - 1; + program_counter = instr.flow_control.dest_offset - 1; } break; @@ -519,7 +538,7 @@ void RunInterpreter(UnitState& state) { Record(state.debug, iteration, uniforms.b[instr.flow_control.bool_uniform_id]); if (uniforms.b[instr.flow_control.bool_uniform_id] == !(instr.flow_control.num_instructions & 1)) { - state.program_counter = instr.flow_control.dest_offset - 1; + program_counter = instr.flow_control.dest_offset - 1; } break; @@ -527,7 +546,7 @@ void RunInterpreter(UnitState& state) { call(state, instr.flow_control.dest_offset, instr.flow_control.num_instructions, - state.program_counter + 1, 0, 0); + program_counter + 1, 0, 0); break; case OpCode::Id::CALLU: @@ -536,7 +555,7 @@ void RunInterpreter(UnitState& state) { call(state, instr.flow_control.dest_offset, instr.flow_control.num_instructions, - state.program_counter + 1, 0, 0); + program_counter + 1, 0, 0); } break; @@ -546,7 +565,7 @@ void RunInterpreter(UnitState& state) { call(state, instr.flow_control.dest_offset, instr.flow_control.num_instructions, - 
state.program_counter + 1, 0, 0); + program_counter + 1, 0, 0); } break; @@ -557,8 +576,8 @@ void RunInterpreter(UnitState& state) { Record(state.debug, iteration, uniforms.b[instr.flow_control.bool_uniform_id]); if (uniforms.b[instr.flow_control.bool_uniform_id]) { call(state, - state.program_counter + 1, - instr.flow_control.dest_offset - state.program_counter - 1, + program_counter + 1, + instr.flow_control.dest_offset - program_counter - 1, instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0); } else { call(state, @@ -576,8 +595,8 @@ void RunInterpreter(UnitState& state) { Record(state.debug, iteration, state.conditional_code); if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) { call(state, - state.program_counter + 1, - instr.flow_control.dest_offset - state.program_counter - 1, + program_counter + 1, + instr.flow_control.dest_offset - program_counter - 1, instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0); } else { call(state, @@ -599,14 +618,24 @@ void RunInterpreter(UnitState& state) { Record(state.debug, iteration, loop_param); call(state, - state.program_counter + 1, - instr.flow_control.dest_offset - state.program_counter + 1, + program_counter + 1, + instr.flow_control.dest_offset - program_counter + 1, instr.flow_control.dest_offset + 1, loop_param.x, loop_param.z); break; } + case OpCode::Id::EMIT: { + Shader::HandleEMIT(state); + break; + } + + case OpCode::Id::SETEMIT: { + state.emit_params.raw = program_code[program_counter]; + break; + } + default: LOG_ERROR(HW_GPU, "Unhandled instruction: 0x%02x (%s): 0x%08x", (int)instr.opcode.Value().EffectiveOpCode(), instr.opcode.Value().GetInfo().name, instr.hex); @@ -617,14 +646,14 @@ void RunInterpreter(UnitState& state) { } } - ++state.program_counter; + ++program_counter; ++iteration; } } // Explicit instantiation -template void RunInterpreter(UnitState& state); -template void RunInterpreter(UnitState& 
state); +template void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& setup, UnitState& state); +template void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& setup, UnitState& state); } // namespace diff --git a/src/video_core/shader/shader_interpreter.h b/src/video_core/shader/shader_interpreter.h index 294bca50e..5af36f217 100644 --- a/src/video_core/shader/shader_interpreter.h +++ b/src/video_core/shader/shader_interpreter.h @@ -4,6 +4,7 @@ #pragma once +#include "video_core/pica.h" #include "video_core/shader/shader.h" namespace Pica { @@ -11,7 +12,7 @@ namespace Pica { namespace Shader { template -void RunInterpreter(UnitState& state); +void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& setup, UnitState& state); } // namespace diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp index b47d3beda..2bba07e14 100644 --- a/src/video_core/shader/shader_jit_x64.cpp +++ b/src/video_core/shader/shader_jit_x64.cpp @@ -65,8 +65,8 @@ const JitFunction instr_table[64] = { &JitShader::Compile_IF, // ifu &JitShader::Compile_IF, // ifc &JitShader::Compile_LOOP, // loop - nullptr, // emit - nullptr, // sete + &JitShader::Compile_EMIT, // emit + &JitShader::Compile_SETEMIT, // setemit &JitShader::Compile_JMP, // jmpc &JitShader::Compile_JMP, // jmpu &JitShader::Compile_CMP, // cmp @@ -94,7 +94,7 @@ const JitFunction instr_table[64] = { // purposes, as documented below: /// Pointer to the uniform memory -static const X64Reg UNIFORMS = R9; +static const X64Reg SETUP = R9; /// The two 32-bit VS address offset registers set by the MOVA instruction static const X64Reg ADDROFFS_REG_0 = R10; static const X64Reg ADDROFFS_REG_1 = R11; @@ -109,7 +109,7 @@ static const X64Reg COND0 = R13; /// Result of the previous CMP instruction for the Y-component comparison static const X64Reg COND1 = R14; /// Pointer to the UnitState instance for the current VS unit -static 
const X64Reg REGISTERS = R15; +static const X64Reg STATE = R15; /// SIMD scratch register static const X64Reg SCRATCH = XMM0; /// Loaded with the first swizzled source register, otherwise can be used as a scratch register @@ -128,7 +128,7 @@ static const X64Reg NEGBIT = XMM15; // State registers that must not be modified by external functions calls // Scratch registers, e.g., SRC1 and SCRATCH, have to be saved on the side if needed static const BitSet32 persistent_regs = { - UNIFORMS, REGISTERS, // Pointers to register blocks + SETUP, STATE, // Pointers to register blocks ADDROFFS_REG_0, ADDROFFS_REG_1, LOOPCOUNT_REG, COND0, COND1, // Cached registers ONE+16, NEGBIT+16, // Constants }; @@ -138,15 +138,6 @@ static const u8 NO_SRC_REG_SWIZZLE = 0x1b; /// Raw constant for the destination register enable mask that indicates all components are enabled static const u8 NO_DEST_REG_MASK = 0xf; -/** - * Get the vertex shader instruction for a given offset in the current shader program - * @param offset Offset in the current shader program of the instruction - * @return Instruction at the specified offset - */ -static Instruction GetVertexShaderInstruction(size_t offset) { - return { g_state.vs.program_code[offset] }; -} - static void LogCritical(const char* msg) { LOG_CRITICAL(HW_GPU, msg); } @@ -169,10 +160,10 @@ void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRe size_t src_offset; if (src_reg.GetRegisterType() == RegisterType::FloatUniform) { - src_ptr = UNIFORMS; - src_offset = src_reg.GetIndex() * sizeof(float24) * 4; + src_ptr = SETUP; + src_offset = ShaderSetup::UniformOffset(RegisterType::FloatUniform, src_reg.GetIndex()); } else { - src_ptr = REGISTERS; + src_ptr = STATE; src_offset = UnitState::InputOffset(src_reg); } @@ -217,7 +208,7 @@ void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRe MOVAPS(dest, MDisp(src_ptr, src_offset_disp)); } - SwizzlePattern swiz = { g_state.vs.swizzle_data[operand_desc_id] }; + 
SwizzlePattern swiz = { setup->swizzle_data[operand_desc_id] }; // Generate instructions for source register swizzling as needed u8 sel = swiz.GetRawSelector(src_num); @@ -248,7 +239,7 @@ void JitShader::Compile_DestEnable(Instruction instr,X64Reg src) { dest = instr.common.dest.Value(); } - SwizzlePattern swiz = { g_state.vs.swizzle_data[operand_desc_id] }; + SwizzlePattern swiz = { setup->swizzle_data[operand_desc_id] }; int dest_offset_disp = (int)UnitState::OutputOffset(dest); ASSERT_MSG(dest_offset_disp == UnitState::OutputOffset(dest), "Destinaton offset too large for int type"); @@ -256,11 +247,11 @@ void JitShader::Compile_DestEnable(Instruction instr,X64Reg src) { // If all components are enabled, write the result to the destination register if (swiz.dest_mask == NO_DEST_REG_MASK) { // Store dest back to memory - MOVAPS(MDisp(REGISTERS, dest_offset_disp), src); + MOVAPS(MDisp(STATE, dest_offset_disp), src); } else { // Not all components are enabled, so mask the result when storing to the destination register... 
- MOVAPS(SCRATCH, MDisp(REGISTERS, dest_offset_disp)); + MOVAPS(SCRATCH, MDisp(STATE, dest_offset_disp)); if (Common::GetCPUCaps().sse4_1) { u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1); @@ -279,7 +270,7 @@ void JitShader::Compile_DestEnable(Instruction instr,X64Reg src) { } // Store dest back to memory - MOVAPS(MDisp(REGISTERS, dest_offset_disp), SCRATCH); + MOVAPS(MDisp(STATE, dest_offset_disp), SCRATCH); } } @@ -328,8 +319,8 @@ void JitShader::Compile_EvaluateCondition(Instruction instr) { } void JitShader::Compile_UniformCondition(Instruction instr) { - int offset = offsetof(decltype(g_state.vs.uniforms), b) + (instr.flow_control.bool_uniform_id * sizeof(bool)); - CMP(sizeof(bool) * 8, MDisp(UNIFORMS, offset), Imm8(0)); + int offset = ShaderSetup::UniformOffset(RegisterType::BoolUniform, instr.flow_control.bool_uniform_id); + CMP(sizeof(bool) * 8, MDisp(SETUP, offset), Imm8(0)); } BitSet32 JitShader::PersistentCallerSavedRegs() { @@ -504,7 +495,7 @@ void JitShader::Compile_MIN(Instruction instr) { } void JitShader::Compile_MOVA(Instruction instr) { - SwizzlePattern swiz = { g_state.vs.swizzle_data[instr.common.operand_desc_id] }; + SwizzlePattern swiz = { setup->swizzle_data[instr.common.operand_desc_id] }; if (!swiz.DestComponentEnabled(0) && !swiz.DestComponentEnabled(1)) { return; // NoOp @@ -706,8 +697,8 @@ void JitShader::Compile_LOOP(Instruction instr) { looping = true; - int offset = offsetof(decltype(g_state.vs.uniforms), i) + (instr.flow_control.int_uniform_id * sizeof(Math::Vec4)); - MOV(32, R(LOOPCOUNT), MDisp(UNIFORMS, offset)); + int offset = ShaderSetup::UniformOffset(RegisterType::IntUniform, instr.flow_control.int_uniform_id); + MOV(32, R(LOOPCOUNT), MDisp(SETUP, offset)); MOV(32, R(LOOPCOUNT_REG), R(LOOPCOUNT)); SHR(32, R(LOOPCOUNT_REG), Imm8(8)); AND(32, R(LOOPCOUNT_REG), Imm32(0xff)); // Y-component is the start @@ -728,6 +719,22 @@ void 
JitShader::Compile_LOOP(Instruction instr) { looping = false; } +static void Handle_EMIT(void* param1) { + UnitState& state = *static_cast*>(param1); + Shader::HandleEMIT(state); +}; + +void JitShader::Compile_EMIT(Instruction instr) { + ABI_PushRegistersAndAdjustStack(PersistentCallerSavedRegs(), 0); + MOV(PTRBITS, R(ABI_PARAM1), R(STATE)); + ABI_CallFunctionR(reinterpret_cast(Handle_EMIT), ABI_PARAM1); + ABI_PopRegistersAndAdjustStack(PersistentCallerSavedRegs(), 0); +} + +void JitShader::Compile_SETEMIT(Instruction instr) { + MOV(32, MDisp(STATE, UnitState::EmitParamsOffset()), Imm32(*(u32*)&instr.setemit)); +} + void JitShader::Compile_JMP(Instruction instr) { if (instr.opcode.Value() == OpCode::Id::JMPC) Compile_EvaluateCondition(instr); @@ -768,7 +775,7 @@ void JitShader::Compile_NextInstr() { ASSERT_MSG(code_ptr[program_counter] == nullptr, "Tried to compile already compiled shader location!"); code_ptr[program_counter] = GetCodePtr(); - Instruction instr = GetVertexShaderInstruction(program_counter++); + Instruction instr = GetShaderInstruction(program_counter++); OpCode::Id opcode = instr.opcode.Value(); auto instr_func = instr_table[static_cast(opcode)]; @@ -786,8 +793,8 @@ void JitShader::Compile_NextInstr() { void JitShader::FindReturnOffsets() { return_offsets.clear(); - for (size_t offset = 0; offset < g_state.vs.program_code.size(); ++offset) { - Instruction instr = GetVertexShaderInstruction(offset); + for (size_t offset = 0; offset < setup->program_code.size(); ++offset) { + Instruction instr = GetShaderInstruction(offset); switch (instr.opcode.Value()) { case OpCode::Id::CALL: @@ -802,7 +809,11 @@ void JitShader::FindReturnOffsets() { std::sort(return_offsets.begin(), return_offsets.end()); } -void JitShader::Compile() { +void JitShader::Compile(const ShaderSetup& setup) { + + // Get a pointer to the setup to access program_code and swizzle_data + this->setup = &setup; + // Reset flow control state program = (CompiledShader*)GetCodePtr(); 
program_counter = 0; @@ -816,8 +827,8 @@ void JitShader::Compile() { // The stack pointer is 8 modulo 16 at the entry of a procedure ABI_PushRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8); - MOV(PTRBITS, R(REGISTERS), R(ABI_PARAM1)); - MOV(PTRBITS, R(UNIFORMS), ImmPtr(&g_state.vs.uniforms)); + MOV(PTRBITS, R(SETUP), R(ABI_PARAM1)); + MOV(PTRBITS, R(STATE), R(ABI_PARAM3)); // Zero address/loop registers XOR(64, R(ADDROFFS_REG_0), R(ADDROFFS_REG_0)); @@ -838,7 +849,7 @@ void JitShader::Compile() { JMPptr(R(ABI_PARAM2)); // Compile entire program - Compile_Block(static_cast(g_state.vs.program_code.size())); + Compile_Block(static_cast(this->setup->program_code.size())); // Set the target for any incomplete branches now that the entire shader program has been emitted for (const auto& branch : fixup_branches) { @@ -855,6 +866,10 @@ void JitShader::Compile() { ASSERT_MSG(size <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!"); LOG_DEBUG(HW_GPU, "Compiled shader size=%d", size); + + // We don't need the setup anymore + this->setup = nullptr; + } JitShader::JitShader() { diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h index cd6280ade..9842d2ba7 100644 --- a/src/video_core/shader/shader_jit_x64.h +++ b/src/video_core/shader/shader_jit_x64.h @@ -33,11 +33,11 @@ class JitShader : public Gen::XCodeBlock { public: JitShader(); - void Run(void* registers, unsigned offset) const { - program(registers, code_ptr[offset]); + void Run(const Pica::Regs::ShaderConfig& config, const ShaderSetup& setup, UnitState& state) const { + program(&setup, code_ptr[config.main_offset], &state); } - void Compile(); + void Compile(const ShaderSetup& setup); void Compile_ADD(Instruction instr); void Compile_DP3(Instruction instr); @@ -62,6 +62,8 @@ public: void Compile_CALLU(Instruction instr); void Compile_IF(Instruction instr); void Compile_LOOP(Instruction instr); + void Compile_EMIT(Instruction instr); + void 
Compile_SETEMIT(Instruction instr); void Compile_JMP(Instruction instr); void Compile_CMP(Instruction instr); void Compile_MAD(Instruction instr); @@ -96,6 +98,17 @@ private: */ void Compile_Assert(bool condition, const char* msg); + /** + * Get the shader instruction for a given offset in the current shader program + * @param offset Offset in the current shader program of the instruction + * @return Instruction at the specified offset + */ + Instruction GetShaderInstruction(size_t offset) { + Instruction instruction; + std::memcpy(&instruction, &setup->program_code[offset], sizeof(Instruction)); + return instruction; + } + /** * Analyzes the entire shader program for `CALL` instructions before emitting any code, * identifying the locations where a return needs to be inserted. @@ -114,8 +127,10 @@ private: /// Branches that need to be fixed up once the entire shader program is compiled std::vector> fixup_branches; - using CompiledShader = void(void* registers, const u8* start_addr); + using CompiledShader = void(const void* setup, const u8* start_addr, void* state); CompiledShader* program = nullptr; + + const ShaderSetup* setup = nullptr; }; } // Shader