From 11084386073a36d7d766b5b35b7c435c47e522ef Mon Sep 17 00:00:00 2001 From: Dragios Date: Sat, 16 Apr 2016 01:32:33 +0800 Subject: [PATCH] Revert "New GS refactor (#8)" This reverts commit 10272dd54108852f535ebe00d55db96402e12d27. --- .../debugger/graphics_breakpoints.cpp | 3 +- src/citra_qt/debugger/graphics_tracing.cpp | 2 +- .../debugger/graphics_vertex_shader.cpp | 6 +- src/video_core/command_processor.cpp | 227 ++++------- src/video_core/debug_utils/debug_utils.h | 3 +- src/video_core/pica.cpp | 2 +- src/video_core/pica.h | 39 +- src/video_core/pica_state.h | 13 - src/video_core/primitive_assembly.cpp | 1 + src/video_core/shader/shader.cpp | 371 +++++------------- src/video_core/shader/shader.h | 174 +++----- src/video_core/shader/shader_interpreter.cpp | 97 ++--- src/video_core/shader/shader_interpreter.h | 3 +- src/video_core/shader/shader_jit_x64.cpp | 83 ++-- src/video_core/shader/shader_jit_x64.h | 23 +- 15 files changed, 321 insertions(+), 726 deletions(-) diff --git a/src/citra_qt/debugger/graphics_breakpoints.cpp b/src/citra_qt/debugger/graphics_breakpoints.cpp index d6d3c558e..819ec7707 100644 --- a/src/citra_qt/debugger/graphics_breakpoints.cpp +++ b/src/citra_qt/debugger/graphics_breakpoints.cpp @@ -44,8 +44,7 @@ QVariant BreakPointModel::data(const QModelIndex& index, int role) const { Pica::DebugContext::Event::PicaCommandProcessed, tr("Pica command processed") }, { Pica::DebugContext::Event::IncomingPrimitiveBatch, tr("Incoming primitive batch") }, { Pica::DebugContext::Event::FinishedPrimitiveBatch, tr("Finished primitive batch") }, - { Pica::DebugContext::Event::RunVS, tr("Vertex shader invocation") }, - { Pica::DebugContext::Event::RunGS, tr("Geometry shader invocation") }, + { Pica::DebugContext::Event::VertexLoaded, tr("Vertex loaded") }, { Pica::DebugContext::Event::IncomingDisplayTransfer, tr("Incoming display transfer") }, { Pica::DebugContext::Event::GSPCommandProcessed, tr("GSP command processed") }, { Pica::DebugContext::Event::BufferSwapped, tr("Buffers swapped") } diff --git a/src/citra_qt/debugger/graphics_tracing.cpp b/src/citra_qt/debugger/graphics_tracing.cpp index e1e02a1da..e06498744 100644 --- a/src/citra_qt/debugger/graphics_tracing.cpp +++ b/src/citra_qt/debugger/graphics_tracing.cpp @@ -70,7 +70,7 @@ void GraphicsTracingWidget::StartRecording() { std::array default_attributes; for (unsigned i = 0; i < 16; ++i) { for (unsigned comp = 0; comp < 3; ++comp) { - default_attributes[4 * i + comp] = nihstro::to_float24(Pica::g_state.vs_default_attributes[i][comp].ToFloat32()); + default_attributes[4 * i + comp] = nihstro::to_float24(Pica::g_state.vs.default_attributes[i][comp].ToFloat32()); } } diff --git a/src/citra_qt/debugger/graphics_vertex_shader.cpp b/src/citra_qt/debugger/graphics_vertex_shader.cpp index c04e9f92d..d648d4640 100644 --- a/src/citra_qt/debugger/graphics_vertex_shader.cpp +++ b/src/citra_qt/debugger/graphics_vertex_shader.cpp @@ -365,7 +365,7 @@ GraphicsVertexShaderWidget::GraphicsVertexShaderWidget(std::shared_ptr< Pica::De input_data[i]->setValidator(new QDoubleValidator(input_data[i])); } - breakpoint_warning = new QLabel(tr("(data only available at vertex shader invocation breakpoints)")); + breakpoint_warning = new QLabel(tr("(data only available at VertexLoaded breakpoints)")); // TODO: Add some button for jumping to the shader entry point @@ -454,7 +454,7 @@ GraphicsVertexShaderWidget::GraphicsVertexShaderWidget(std::shared_ptr< Pica::De void GraphicsVertexShaderWidget::OnBreakPointHit(Pica::DebugContext::Event event, void* data) { auto input = static_cast(data); - if (event == Pica::DebugContext::Event::RunVS) { + if (event == Pica::DebugContext::Event::VertexLoaded) { Reload(true, data); } else { // No vertex data is retrievable => invalidate currently stored vertex data @@ -501,7 +501,7 @@ void GraphicsVertexShaderWidget::Reload(bool replace_vertex_data, void* vertex_d info.labels.insert({ entry_point, "main" }); // Generate debug information - debug_data = Pica::g_state.vs.ProduceDebugInfo(input_vertex, num_attributes, shader_config); + debug_data = Pica::Shader::ProduceDebugInfo(input_vertex, num_attributes, shader_config, shader_setup); // Reload widget state for (int attr = 0; attr < num_attributes; ++attr) { diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index 8d9b83780..a889ec0e1 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -29,6 +29,10 @@ namespace Pica { namespace CommandProcessor { +static int float_regs_counter = 0; + +static u32 uniform_write_buffer[4]; + static int default_attr_counter = 0; static u32 default_attr_write_buffer[3]; @@ -122,7 +126,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { // TODO: Verify that this actually modifies the register! if (setup.index < 15) { - g_state.vs_default_attributes[setup.index] = attribute; + g_state.vs.default_attributes[setup.index] = attribute; setup.index++; } else { // Put each attribute into an immediate input buffer. @@ -137,14 +141,14 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { if (immediate_attribute_id >= regs.vs.num_input_attributes+1) { immediate_attribute_id = 0; - auto& shader_unit = Shader::GetShaderUnit(false); - g_state.vs.Setup(); + Shader::UnitState shader_unit; + Shader::Setup(); + + if (g_debug_context) + g_debug_context->OnEvent(DebugContext::Event::VertexLoaded, static_cast(&immediate_input)); // Send to vertex shader - if (g_debug_context) - g_debug_context->OnEvent(DebugContext::Event::RunVS, static_cast(&immediate_input)); - g_state.vs.Run(shader_unit, immediate_input, regs.vs.num_input_attributes+1, regs.vs); - Shader::OutputVertex output_vertex = shader_unit.output_registers.ToVertex(regs.vs); + Shader::OutputVertex output = Shader::Run(shader_unit, immediate_input, regs.vs.num_input_attributes+1); // Send to renderer using Pica::Shader::OutputVertex; @@ -152,7 +156,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2); }; - g_state.primitive_assembler.SubmitVertex(output_vertex, AddTriangle); + g_state.primitive_assembler.SubmitVertex(output, AddTriangle); } } } @@ -299,16 +303,13 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { // The size has been tuned for optimal balance between hit-rate and the cost of lookup const size_t VERTEX_CACHE_SIZE = 32; std::array vertex_cache_ids; - std::array vertex_cache; + std::array vertex_cache; unsigned int vertex_cache_pos = 0; vertex_cache_ids.fill(-1); - auto& vs_shader_unit = Shader::GetShaderUnit(false); - g_state.vs.Setup(); - - auto& gs_unit_state = Shader::GetShaderUnit(true); - g_state.gs.Setup(); + Shader::UnitState shader_unit; + Shader::Setup(); for (unsigned int index = 0; index < regs.num_vertices; ++index) { @@ -320,7 +321,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { ASSERT(vertex != -1); bool vertex_cache_hit = false; - Shader::OutputRegisters output_registers; + Shader::OutputVertex output; if (is_indexed) { if (g_debug_context && Pica::g_debug_context->recorder) { @@ -330,7 +331,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { for (unsigned int i = 0; i < VERTEX_CACHE_SIZE; ++i) { if (vertex == vertex_cache_ids[i]) { - output_registers = vertex_cache[i]; + output = vertex_cache[i]; vertex_cache_hit = true; break; } @@ -377,7 +378,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { } } else if (attribute_config.IsDefaultAttribute(i)) { // Load the default attribute if we're configured to do so - input.attr[i] = g_state.vs_default_attributes[i]; + input.attr[i] = g_state.vs.default_attributes[i]; LOG_TRACE(HW_GPU, "Loaded default attribute %x for vertex %x (index %x): (%f, %f, %f, %f)", i, vertex, index, input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(), @@ -389,69 +390,27 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { } } - // Send to vertex shader if (g_debug_context) - g_debug_context->OnEvent(DebugContext::Event::RunVS, static_cast(&input)); - g_state.vs.Run(vs_shader_unit, input, attribute_config.GetNumTotalAttributes(), g_state.regs.vs); - output_registers = vs_shader_unit.output_registers; + g_debug_context->OnEvent(DebugContext::Event::VertexLoaded, (void*)&input); + + // Send to vertex shader + output = Shader::Run(shader_unit, input, attribute_config.GetNumTotalAttributes()); if (is_indexed) { - vertex_cache[vertex_cache_pos] = output_registers; + vertex_cache[vertex_cache_pos] = output; vertex_cache_ids[vertex_cache_pos] = vertex; vertex_cache_pos = (vertex_cache_pos + 1) % VERTEX_CACHE_SIZE; } } - // Helper to send triangle to renderer + // Send to renderer using Pica::Shader::OutputVertex; auto AddTriangle = []( const OutputVertex& v0, const OutputVertex& v1, const OutputVertex& v2) { VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2); }; - if (Shader::UseGS()) { - - auto& regs = g_state.regs; - auto& gs_regs = g_state.regs.gs; - auto& gs_buf = g_state.gs_input_buffer; - - // Vertex Shader Outputs are converted into Geometry Shader inputs by filling up a buffer - // For example, if we have a geoshader that takes 6 inputs, and the vertex shader outputs 2 attributes - // It would take 3 vertices to fill up the Geometry Shader buffer - unsigned int gs_input_count = gs_regs.num_input_attributes + 1; - unsigned int vs_output_count = regs.vs_outmap_total2 + 1; - ASSERT_MSG(regs.vs_outmap_total1 == regs.vs_outmap_total2, "VS_OUTMAP_TOTAL1 and VS_OUTMAP_TOTAL2 don't match!"); - // copy into the geoshader buffer - for (unsigned int i = 0; i < vs_output_count; i++) { - if (gs_buf.index >= gs_input_count) { - // TODO(ds84182): LOG_ERROR() - ASSERT_MSG(false, "Number of GS inputs (%d) is not divisible by number of VS outputs (%d)", - gs_input_count, vs_output_count); - continue; - } - gs_buf.buffer.attr[gs_buf.index++] = output_registers.value[i]; - } - - if (gs_buf.index >= gs_input_count) { - - // b15 will be false when a new primitive starts and then switch to true at some point - //TODO: Test how this works exactly on hardware - g_state.gs.uniforms.b[15] |= (index > 0); - - // Process Geometry Shader - if (g_debug_context) - g_debug_context->OnEvent(DebugContext::Event::RunGS, static_cast(&gs_buf.buffer)); - gs_unit_state.emit_triangle_callback = AddTriangle; - g_state.gs.Run(gs_unit_state, gs_buf.buffer, gs_input_count, regs.gs); - gs_unit_state.emit_triangle_callback = nullptr; - - gs_buf.index = 0; - } - } else { - Shader::OutputVertex output_vertex = output_registers.ToVertex(regs.vs); - primitive_assembler.SubmitVertex(output_vertex, AddTriangle); - } - + primitive_assembler.SubmitVertex(output, AddTriangle); } for (auto& range : memory_accesses.ranges) { @@ -462,76 +421,10 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { break; } - case PICA_REG_INDEX(gs.bool_uniforms): - Shader::WriteUniformBoolReg(true, value); - break; - - case PICA_REG_INDEX_WORKAROUND(gs.int_uniforms[0], 0x281): - case PICA_REG_INDEX_WORKAROUND(gs.int_uniforms[1], 0x282): - case PICA_REG_INDEX_WORKAROUND(gs.int_uniforms[2], 0x283): - case PICA_REG_INDEX_WORKAROUND(gs.int_uniforms[3], 0x284): - { - unsigned index = (id - PICA_REG_INDEX_WORKAROUND(gs.int_uniforms[0], 0x281)); - auto values = regs.gs.int_uniforms[index]; - Shader::WriteUniformIntReg(true, index, Math::Vec4(values.x, values.y, values.z, values.w)); - break; - } - - case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.setup, 0x290): - Shader::WriteUniformFloatSetupReg(true, value); - break; - - case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[0], 0x291): - case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[1], 0x292): - case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[2], 0x293): - case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[3], 0x294): - case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[4], 0x295): - case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[5], 0x296): - case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[6], 0x297): - case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[7], 0x298): - { - Shader::WriteUniformFloatReg(true, value); - break; - } - - // Load shader program code - case PICA_REG_INDEX_WORKAROUND(gs.program.offset, 0x29b): - Shader::WriteProgramCodeOffset(true, value); - break; - - case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[0], 0x29c): - case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[1], 0x29d): - case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[2], 0x29e): - case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[3], 0x29f): - case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[4], 0x2a0): - case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[5], 0x2a1): - case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[6], 0x2a2): - case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[7], 0x2a3): - { - Shader::WriteProgramCode(true, value); - break; - } - - // Load swizzle pattern data - case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.offset, 0x2a5): - Shader::WriteSwizzlePatternsOffset(true, value); - break; - - case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[0], 0x2a6): - case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[1], 0x2a7): - case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[2], 0x2a8): - case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[3], 0x2a9): - case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[4], 0x2aa): - case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[5], 0x2ab): - case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[6], 0x2ac): - case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[7], 0x2ad): - { - Shader::WriteSwizzlePatterns(true, value); - break; - } - case PICA_REG_INDEX(vs.bool_uniforms): - Shader::WriteUniformBoolReg(false, value); + for (unsigned i = 0; i < 16; ++i) + g_state.vs.uniforms.b[i] = (regs.vs.bool_uniforms.Value() & (1 << i)) != 0; + break; case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[0], 0x2b1): @@ -539,16 +432,14 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[2], 0x2b3): case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[3], 0x2b4): { - unsigned index = (id - PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[0], 0x2b1)); + int index = (id - PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[0], 0x2b1)); auto values = regs.vs.int_uniforms[index]; - Shader::WriteUniformIntReg(false, index, Math::Vec4(values.x, values.y, values.z, values.w)); + g_state.vs.uniforms.i[index] = Math::Vec4(values.x, values.y, values.z, values.w); + LOG_TRACE(HW_GPU, "Set integer uniform %d to %02x %02x %02x %02x", + index, values.x.Value(), values.y.Value(), values.z.Value(), values.w.Value()); break; } - case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.setup, 0x2c0): - Shader::WriteUniformFloatSetupReg(false, value); - break; - case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[0], 0x2c1): case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[1], 0x2c2): case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[2], 0x2c3): @@ -558,15 +449,49 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[6], 0x2c7): case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[7], 0x2c8): { - Shader::WriteUniformFloatReg(false, value); + auto& uniform_setup = regs.vs.uniform_setup; + + // TODO: Does actual hardware indeed keep an intermediate buffer or does + // it directly write the values? + uniform_write_buffer[float_regs_counter++] = value; + + // Uniforms are written in a packed format such that four float24 values are encoded in + // three 32-bit numbers. We write to internal memory once a full such vector is + // written. + if ((float_regs_counter >= 4 && uniform_setup.IsFloat32()) || + (float_regs_counter >= 3 && !uniform_setup.IsFloat32())) { + float_regs_counter = 0; + + auto& uniform = g_state.vs.uniforms.f[uniform_setup.index]; + + if (uniform_setup.index > 95) { + LOG_ERROR(HW_GPU, "Invalid VS uniform index %d", (int)uniform_setup.index); + break; + } + + // NOTE: The destination component order indeed is "backwards" + if (uniform_setup.IsFloat32()) { + for (auto i : {0,1,2,3}) + uniform[3 - i] = float24::FromFloat32(*(float*)(&uniform_write_buffer[i])); + } else { + // TODO: Untested + uniform.w = float24::FromRaw(uniform_write_buffer[0] >> 8); + uniform.z = float24::FromRaw(((uniform_write_buffer[0] & 0xFF) << 16) | ((uniform_write_buffer[1] >> 16) & 0xFFFF)); + uniform.y = float24::FromRaw(((uniform_write_buffer[1] & 0xFFFF) << 8) | ((uniform_write_buffer[2] >> 24) & 0xFF)); + uniform.x = float24::FromRaw(uniform_write_buffer[2] & 0xFFFFFF); + } + + LOG_TRACE(HW_GPU, "Set uniform %x to (%f %f %f %f)", (int)uniform_setup.index, + uniform.x.ToFloat32(), uniform.y.ToFloat32(), uniform.z.ToFloat32(), + uniform.w.ToFloat32()); + + // TODO: Verify that this actually modifies the register! + uniform_setup.index.Assign(uniform_setup.index + 1); + } break; } // Load shader program code - case PICA_REG_INDEX_WORKAROUND(vs.program.offset, 0x2cb): - Shader::WriteProgramCodeOffset(false, value); - break; - case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[0], 0x2cc): case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[1], 0x2cd): case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[2], 0x2ce): @@ -576,15 +501,12 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[6], 0x2d2): case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[7], 0x2d3): { - Shader::WriteProgramCode(false, value); + g_state.vs.program_code[regs.vs.program.offset] = value; + regs.vs.program.offset++; break; } // Load swizzle pattern data - case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.offset, 0x2d5): - Shader::WriteSwizzlePatternsOffset(false, value); - break; - case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[0], 0x2d6): case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[1], 0x2d7): case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[2], 0x2d8): @@ -594,7 +516,8 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[6], 0x2dc): case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[7], 0x2dd): { - Shader::WriteSwizzlePatterns(false, value); + g_state.vs.swizzle_data[regs.vs.swizzle_patterns.offset] = value; + regs.vs.swizzle_patterns.offset++; break; } diff --git a/src/video_core/debug_utils/debug_utils.h b/src/video_core/debug_utils/debug_utils.h index e01133d6f..7df941619 100644 --- a/src/video_core/debug_utils/debug_utils.h +++ b/src/video_core/debug_utils/debug_utils.h @@ -30,8 +30,7 @@ public: PicaCommandProcessed, IncomingPrimitiveBatch, FinishedPrimitiveBatch, - RunVS, - RunGS, + VertexLoaded, IncomingDisplayTransfer, GSPCommandProcessed, BufferSwapped, diff --git a/src/video_core/pica.cpp b/src/video_core/pica.cpp index 710ebedc1..ccbaf071b 100644 --- a/src/video_core/pica.cpp +++ b/src/video_core/pica.cpp @@ -497,7 +497,7 @@ void Init() { } void Shutdown() { - Shader::ShaderSetup::Shutdown(); + Shader::Shutdown(); } template diff --git a/src/video_core/pica.h b/src/video_core/pica.h index f4d7d720f..f066c9719 100644 --- a/src/video_core/pica.h +++ b/src/video_core/pica.h @@ -1070,7 +1070,7 @@ struct Regs { // Number of vertices to render u32 num_vertices; - BitField<0, 2, u32> using_geometry_shader; + INSERT_PADDING_WORDS(0x1); // The index of the first vertex to render u32 vertex_offset; @@ -1118,14 +1118,7 @@ struct Regs { } } command_buffer; - INSERT_PADDING_WORDS(0x06); - - enum class VSComMode : u32 { - Shared = 0, - Exclusive = 1 - }; - - VSComMode vs_com_mode; + INSERT_PADDING_WORDS(0x07); enum class GPUMode : u32 { Drawing = 0, @@ -1134,17 +1127,7 @@ struct Regs { GPUMode gpu_mode; - INSERT_PADDING_WORDS(0x4); - - BitField<0, 4, u32> vs_outmap_total1; - - INSERT_PADDING_WORDS(0x6); - - BitField<0, 4, u32> vs_outmap_total2; - - BitField<0, 4, u32> gsh_misc0; - - INSERT_PADDING_WORDS(0xB); + INSERT_PADDING_WORDS(0x18); enum class TriangleTopology : u32 { List = 0, @@ -1153,10 +1136,7 @@ struct Regs { Shader = 3, // Programmable setup unit implemented in a geometry shader }; - union { - BitField<0, 4, u32> vs_outmap_count; - BitField<8, 2, TriangleTopology> triangle_topology; - }; + BitField<8, 2, TriangleTopology> triangle_topology; u32 restart_primitive; @@ -1175,9 +1155,8 @@ struct Regs { INSERT_PADDING_WORDS(0x4); union { - BitField<0, 4, u32> num_input_attributes; // Number of input attributes to shader unit - 1 - BitField<8, 4, u32> use_subdivision; - BitField<24, 8, u32> use_geometry_shader; + // Number of input attributes to shader unit - 1 + BitField<0, 4, u32> num_input_attributes; }; // Offset to shader program entry point (in words) @@ -1229,8 +1208,6 @@ struct Regs { } union { - u32 setup; - // Index of the next uniform to write to // TODO: ctrulib uses 8 bits for this, however that seems to yield lots of invalid indices // TODO: Maybe the uppermost index is for the geometry shader? Investigate! @@ -1347,11 +1324,7 @@ ASSERT_REG_POSITION(trigger_draw, 0x22e); ASSERT_REG_POSITION(trigger_draw_indexed, 0x22f); ASSERT_REG_POSITION(vs_default_attributes_setup, 0x232); ASSERT_REG_POSITION(command_buffer, 0x238); -ASSERT_REG_POSITION(vs_com_mode, 0x244); ASSERT_REG_POSITION(gpu_mode, 0x245); -ASSERT_REG_POSITION(vs_outmap_total1, 0x24A); -ASSERT_REG_POSITION(vs_outmap_total2, 0x251); -ASSERT_REG_POSITION(gsh_misc0, 0x252); ASSERT_REG_POSITION(triangle_topology, 0x25e); ASSERT_REG_POSITION(restart_primitive, 0x25f); ASSERT_REG_POSITION(gs, 0x280); diff --git a/src/video_core/pica_state.h b/src/video_core/pica_state.h index 848e6bde1..323290054 100644 --- a/src/video_core/pica_state.h +++ b/src/video_core/pica_state.h @@ -17,13 +17,9 @@ struct State { /// Pica registers Regs regs; - Shader::UnitState shader_units[4]; - Shader::ShaderSetup vs; Shader::ShaderSetup gs; - Math::Vec4 vs_default_attributes[16]; - struct { union LutEntry { // Used for raw access @@ -60,15 +56,6 @@ struct State { // This is constructed with a dummy triangle topology PrimitiveAssembler primitive_assembler; - - /// Current geometry shader state - struct GeometryShaderState { - // Buffer used for geometry shader inputs - Shader::InputVertex buffer; - // The current index into the buffer - unsigned int index; - } gs_input_buffer; - }; extern State g_state; ///< Current Pica state diff --git a/src/video_core/primitive_assembly.cpp b/src/video_core/primitive_assembly.cpp index a22f153d3..ff3e2b862 100644 --- a/src/video_core/primitive_assembly.cpp +++ b/src/video_core/primitive_assembly.cpp @@ -20,6 +20,7 @@ template void PrimitiveAssembler::SubmitVertex(VertexType& vtx, TriangleHandler triangle_handler) { switch (topology) { + // TODO: Figure out what's different with TriangleTopology::Shader. case Regs::TriangleTopology::List: case Regs::TriangleTopology::Shader: if (buffer_index < 2) { diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp index 3008d23d9..75301accd 100644 --- a/src/video_core/shader/shader.cpp +++ b/src/video_core/shader/shader.cpp @@ -27,7 +27,83 @@ namespace Pica { namespace Shader { -OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) { +#ifdef ARCHITECTURE_x86_64 +static std::unordered_map> shader_map; +static const JitShader* jit_shader; +#endif // ARCHITECTURE_x86_64 + +void Setup() { +#ifdef ARCHITECTURE_x86_64 + if (VideoCore::g_shader_jit_enabled) { + u64 cache_key = (Common::ComputeHash64(&g_state.vs.program_code, sizeof(g_state.vs.program_code)) ^ + Common::ComputeHash64(&g_state.vs.swizzle_data, sizeof(g_state.vs.swizzle_data))); + + auto iter = shader_map.find(cache_key); + if (iter != shader_map.end()) { + jit_shader = iter->second.get(); + } else { + auto shader = std::make_unique(); + shader->Compile(); + jit_shader = shader.get(); + shader_map[cache_key] = std::move(shader); + } + } +#endif // ARCHITECTURE_x86_64 +} + +void Shutdown() { +#ifdef ARCHITECTURE_x86_64 + shader_map.clear(); +#endif // ARCHITECTURE_x86_64 +} + +static Common::Profiling::TimingCategory shader_category("Vertex Shader"); +MICROPROFILE_DEFINE(GPU_VertexShader, "GPU", "Vertex Shader", MP_RGB(50, 50, 240)); + +OutputVertex Run(UnitState& state, const InputVertex& input, int num_attributes) { + auto& config = g_state.regs.vs; + + Common::Profiling::ScopeTimer timer(shader_category); + MICROPROFILE_SCOPE(GPU_VertexShader); + + state.program_counter = config.main_offset; + state.debug.max_offset = 0; + state.debug.max_opdesc_id = 0; + + // Setup input register table + const auto& attribute_register_map = config.input_register_map; + + // TODO: Instead of this cumbersome logic, just load the input data directly like + // for (int attr = 0; attr < num_attributes; ++attr) { input_attr[0] = state.registers.input[attribute_register_map.attribute0_register]; } + if (num_attributes > 0) state.registers.input[attribute_register_map.attribute0_register] = input.attr[0]; + if (num_attributes > 1) state.registers.input[attribute_register_map.attribute1_register] = input.attr[1]; + if (num_attributes > 2) state.registers.input[attribute_register_map.attribute2_register] = input.attr[2]; + if (num_attributes > 3) state.registers.input[attribute_register_map.attribute3_register] = input.attr[3]; + if (num_attributes > 4) state.registers.input[attribute_register_map.attribute4_register] = input.attr[4]; + if (num_attributes > 5) state.registers.input[attribute_register_map.attribute5_register] = input.attr[5]; + if (num_attributes > 6) state.registers.input[attribute_register_map.attribute6_register] = input.attr[6]; + if (num_attributes > 7) state.registers.input[attribute_register_map.attribute7_register] = input.attr[7]; + if (num_attributes > 8) state.registers.input[attribute_register_map.attribute8_register] = input.attr[8]; + if (num_attributes > 9) state.registers.input[attribute_register_map.attribute9_register] = input.attr[9]; + if (num_attributes > 10) state.registers.input[attribute_register_map.attribute10_register] = input.attr[10]; + if (num_attributes > 11) state.registers.input[attribute_register_map.attribute11_register] = input.attr[11]; + if (num_attributes > 12) state.registers.input[attribute_register_map.attribute12_register] = input.attr[12]; + if (num_attributes > 13) state.registers.input[attribute_register_map.attribute13_register] = input.attr[13]; + if (num_attributes > 14) state.registers.input[attribute_register_map.attribute14_register] = input.attr[14]; + if (num_attributes > 15) state.registers.input[attribute_register_map.attribute15_register] = input.attr[15]; + + state.conditional_code[0] = false; + state.conditional_code[1] = false; + +#ifdef ARCHITECTURE_x86_64 + if (VideoCore::g_shader_jit_enabled) + jit_shader->Run(&state.registers, g_state.regs.vs.main_offset); + else + RunInterpreter(state); +#else + RunInterpreter(state); +#endif // ARCHITECTURE_x86_64 + // Setup output data OutputVertex ret; // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to @@ -38,10 +114,10 @@ OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) { if (index >= g_state.regs.vs_output_total) break; - if ((config.output_mask & (1 << i)) == 0) + if ((g_state.regs.vs.output_mask & (1 << i)) == 0) continue; - const auto& output_register_map = g_state.regs.vs_output_attributes[index]; + const auto& output_register_map = g_state.regs.vs_output_attributes[index]; // TODO: Don't hardcode VS here u32 semantics[4] = { output_register_map.map_x, output_register_map.map_y, @@ -51,7 +127,7 @@ OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) { for (unsigned comp = 0; comp < 4; ++comp) { float24* out = ((float24*)&ret) + semantics[comp]; if (semantics[comp] != Regs::VSOutputAttributes::INVALID) { - *out = value[i][comp]; + *out = state.registers.output[i][comp]; } else { // Zero output so that attributes which aren't output won't have denormals in them, // which would slow us down later. @@ -79,71 +155,10 @@ OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) { return ret; } -#ifdef ARCHITECTURE_x86_64 -static std::unordered_map> shader_map; -#endif // ARCHITECTURE_x86_64 - -void ShaderSetup::Setup() { -#ifdef ARCHITECTURE_x86_64 - if (VideoCore::g_shader_jit_enabled) { - u64 cache_key = (Common::ComputeHash64(&program_code, sizeof(program_code)) ^ - Common::ComputeHash64(&swizzle_data, sizeof(swizzle_data))); - - auto iter = shader_map.find(cache_key); - if (iter != shader_map.end()) { - jit_shader = iter->second; - } else { - auto shader = std::make_shared(); - shader->Compile(*this); - jit_shader = shader; - shader_map[cache_key] = std::move(shader); - } - } else { - jit_shader.reset(); - } -#endif // ARCHITECTURE_x86_64 -} - -void ShaderSetup::Shutdown() { -#ifdef ARCHITECTURE_x86_64 - shader_map.clear(); -#endif // ARCHITECTURE_x86_64 -} - -static Common::Profiling::TimingCategory shader_category("Shader"); -MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240)); - -void ShaderSetup::Run(UnitState& state, const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config) { - - Common::Profiling::ScopeTimer timer(shader_category); - MICROPROFILE_SCOPE(GPU_Shader); - - state.debug.max_offset = 0; - state.debug.max_opdesc_id = 0; - - // Setup input register table - const auto& attribute_register_map = config.input_register_map; - - for (unsigned i = 0; i < num_attributes; i++) - state.registers.input[attribute_register_map.GetRegisterForAttribute(i)] = input.attr[i]; - - state.conditional_code[0] = false; - state.conditional_code[1] = false; - -#ifdef ARCHITECTURE_x86_64 - if (auto shader = jit_shader.lock()) - shader.get()->Run(config, *this, state); - else - RunInterpreter(config, *this, state); -#else - RunInterpreter(config, *this, state); -#endif // ARCHITECTURE_x86_64 - -} - -DebugData ShaderSetup::ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config) { +DebugData ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const ShaderSetup& setup) { UnitState state; + state.program_counter = config.main_offset; state.debug.max_offset = 0; state.debug.max_opdesc_id = 0; @@ -152,218 +167,30 @@ DebugData ShaderSetup::ProduceDebugInfo(const InputVertex& input, int num_ float24 dummy_register; boost::fill(state.registers.input, &dummy_register); - for (unsigned i = 0; i < num_attributes; i++) - state.registers.input[attribute_register_map.GetRegisterForAttribute(i)] = input.attr[i]; + if (num_attributes > 0) state.registers.input[attribute_register_map.attribute0_register] = &input.attr[0].x; + if (num_attributes > 1) state.registers.input[attribute_register_map.attribute1_register] = &input.attr[1].x; + if (num_attributes > 2) state.registers.input[attribute_register_map.attribute2_register] = &input.attr[2].x; + if (num_attributes > 3) state.registers.input[attribute_register_map.attribute3_register] = &input.attr[3].x; + if (num_attributes > 4) state.registers.input[attribute_register_map.attribute4_register] = &input.attr[4].x; + if (num_attributes > 5) state.registers.input[attribute_register_map.attribute5_register] = &input.attr[5].x; + if (num_attributes > 6) state.registers.input[attribute_register_map.attribute6_register] = &input.attr[6].x; + if (num_attributes > 7) state.registers.input[attribute_register_map.attribute7_register] = &input.attr[7].x; + if (num_attributes > 8) state.registers.input[attribute_register_map.attribute8_register] = &input.attr[8].x; + if (num_attributes > 9) state.registers.input[attribute_register_map.attribute9_register] = &input.attr[9].x; + if (num_attributes > 10) state.registers.input[attribute_register_map.attribute10_register] = &input.attr[10].x; + if (num_attributes > 11) state.registers.input[attribute_register_map.attribute11_register] = &input.attr[11].x; + if (num_attributes > 12) state.registers.input[attribute_register_map.attribute12_register] = &input.attr[12].x; + if (num_attributes > 13) state.registers.input[attribute_register_map.attribute13_register] = &input.attr[13].x; + if (num_attributes > 14) state.registers.input[attribute_register_map.attribute14_register] = &input.attr[14].x; + if (num_attributes > 15) state.registers.input[attribute_register_map.attribute15_register] = &input.attr[15].x; state.conditional_code[0] = false; state.conditional_code[1] = false; - RunInterpreter(config, *this, state); + RunInterpreter(state); return state.debug; } -bool SharedGS() { - return g_state.regs.vs_com_mode == Pica::Regs::VSComMode::Shared; -} - -bool UseGS() { - // TODO(ds84182): This would be more accurate if it looked at induvidual shader units for the geoshader bit - // gs_regs.input_buffer_config.use_geometry_shader == 0x08 - ASSERT((g_state.regs.using_geometry_shader == 0) || (g_state.regs.using_geometry_shader == 2)); - return g_state.regs.using_geometry_shader == 2; -} - -UnitState& GetShaderUnit(bool gs) { - - // GS are always run on shader unit 3 - if (gs) { - return g_state.shader_units[3]; - } - - // The worst scheduler you'll ever see! - //TODO: How does PICA shader scheduling work? - static unsigned shader_unit_scheduler = 0; - shader_unit_scheduler++; - shader_unit_scheduler %= 3; // TODO: When does it also allow use of unit 3?! - return g_state.shader_units[shader_unit_scheduler]; -} - -void WriteUniformBoolReg(bool gs, u32 value) { - auto& setup = gs ? g_state.gs : g_state.vs; - - ASSERT(setup.uniforms.b.size() == 16); - for (unsigned i = 0; i < 16; ++i) - setup.uniforms.b[i] = (value & (1 << i)) != 0; - - // Copy for GS in shared mode - if (!gs && SharedGS()) { - WriteUniformBoolReg(true, value); - } -} - -void WriteUniformIntReg(bool gs, unsigned index, const Math::Vec4& values) { - const char* shader_type = gs ? "GS" : "VS"; - auto& setup = gs ? g_state.gs : g_state.vs; - - ASSERT(index < setup.uniforms.i.size()); - setup.uniforms.i[index] = values; - LOG_TRACE(HW_GPU, "Set %s integer uniform %d to %02x %02x %02x %02x", - shader_type, index, values.x.Value(), values.y.Value(), values.z.Value(), values.w.Value()); - - // Copy for GS in shared mode - if (!gs && SharedGS()) { - WriteUniformIntReg(true, index, values); - } -} - -void WriteUniformFloatSetupReg(bool gs, u32 value) { - auto& config = gs ? g_state.regs.gs : g_state.regs.vs; - - config.uniform_setup.setup = value; - - // Copy for GS in shared mode - if (!gs && SharedGS()) { - WriteUniformFloatSetupReg(true, value); - } -} - -void WriteUniformFloatReg(bool gs, u32 value) { - const char* shader_type = gs ? "GS" : "VS"; - auto& config = gs ? g_state.regs.gs : g_state.regs.vs; - auto& setup = gs ? g_state.gs : g_state.vs; - - auto& uniform_setup = config.uniform_setup; - auto& uniform_write_buffer = setup.uniform_write_buffer; - auto& float_regs_counter = setup.float_regs_counter; - - // TODO: Does actual hardware indeed keep an intermediate buffer or does - // it directly write the values? - uniform_write_buffer[float_regs_counter++] = value; - - // Uniforms are written in a packed format such that four float24 values are encoded in - // three 32-bit numbers. We write to internal memory once a full such vector is - // written. - if ((float_regs_counter >= 4 && uniform_setup.IsFloat32()) || - (float_regs_counter >= 3 && !uniform_setup.IsFloat32())) { - float_regs_counter = 0; - - auto& uniform = setup.uniforms.f[uniform_setup.index]; - - if (uniform_setup.index >= 96) { - LOG_ERROR(HW_GPU, "Invalid %s float uniform index %d", shader_type, (int)uniform_setup.index); - } else { - - // NOTE: The destination component order indeed is "backwards" - if (uniform_setup.IsFloat32()) { - for (auto i : {0,1,2,3}) - uniform[3 - i] = float24::FromFloat32(*(float*)(&uniform_write_buffer[i])); - } else { - // TODO: Untested - uniform.w = float24::FromRaw(uniform_write_buffer[0] >> 8); - uniform.z = float24::FromRaw(((uniform_write_buffer[0] & 0xFF) << 16) | ((uniform_write_buffer[1] >> 16) & 0xFFFF)); - uniform.y = float24::FromRaw(((uniform_write_buffer[1] & 0xFFFF) << 8) | ((uniform_write_buffer[2] >> 24) & 0xFF)); - uniform.x = float24::FromRaw(uniform_write_buffer[2] & 0xFFFFFF); - } - - LOG_TRACE(HW_GPU, "Set %s float uniform %x to (%f %f %f %f)", shader_type, (int)uniform_setup.index, - uniform.x.ToFloat32(), uniform.y.ToFloat32(), uniform.z.ToFloat32(), - uniform.w.ToFloat32()); - - // TODO: Verify that this actually modifies the register! - uniform_setup.index.Assign(uniform_setup.index + 1); - } - - } - - // Copy for GS in shared mode - if (!gs && SharedGS()) { - WriteUniformFloatReg(true, value); - } -} - -void WriteProgramCodeOffset(bool gs, u32 value) { - auto& config = gs ? g_state.regs.gs : g_state.regs.vs; - config.program.offset = value; - - // Copy for GS in shared mode - if (!gs && SharedGS()) { - WriteProgramCodeOffset(true, value); - } -} - -void WriteProgramCode(bool gs, u32 value) { - const char* shader_type = gs ? "GS" : "VS"; - auto& config = gs ? g_state.regs.gs : g_state.regs.vs; - auto& setup = gs ? g_state.gs : g_state.vs; - - if (config.program.offset >= setup.program_code.size()) { - LOG_ERROR(HW_GPU, "Invalid %s program offset %d", shader_type, (int)config.program.offset); - } else { - setup.program_code[config.program.offset] = value; - config.program.offset++; - } - - // Copy for GS in shared mode - if (!gs && SharedGS()) { - WriteProgramCode(true, value); - } -} - -void WriteSwizzlePatternsOffset(bool gs, u32 value) { - auto& config = gs ? g_state.regs.gs : g_state.regs.vs; - config.swizzle_patterns.offset = value; - - // Copy for GS in shared mode - if (!gs && SharedGS()) { - WriteSwizzlePatternsOffset(true, value); - } -} - -void WriteSwizzlePatterns(bool gs, u32 value) { - const char* shader_type = gs ? "GS" : "VS"; - auto& config = gs ? g_state.regs.gs : g_state.regs.vs; - auto& setup = gs ? g_state.gs : g_state.vs; - - if (config.swizzle_patterns.offset >= setup.swizzle_data.size()) { - LOG_ERROR(HW_GPU, "Invalid %s swizzle pattern offset %d", shader_type, (int)config.swizzle_patterns.offset); - } else { - setup.swizzle_data[config.swizzle_patterns.offset] = value; - config.swizzle_patterns.offset++; - } - - // Copy for GS in shared mode - if (!gs && SharedGS()) { - WriteSwizzlePatterns(true, value); - } -} - -template -void HandleEMIT(UnitState& state) { - auto &config = g_state.regs.gs; - auto &emit_params = state.emit_params; - auto &emit_buffers = state.emit_buffers; - - ASSERT(emit_params.vertex_id < 3); - - emit_buffers[emit_params.vertex_id] = state.output_registers; - - if (emit_params.primitive_emit) { - ASSERT_MSG(state.emit_triangle_callback, "EMIT invoked but no handler set!"); - OutputVertex v0 = emit_buffers[0].ToVertex(config); - OutputVertex v1 = emit_buffers[1].ToVertex(config); - OutputVertex v2 = emit_buffers[2].ToVertex(config); - if (emit_params.winding) { - state.emit_triangle_callback(v2, v1, v0); - } else { - state.emit_triangle_callback(v0, v1, v2); - } - } -} - -// Explicit instantiation -template void HandleEMIT(UnitState& state); -template void HandleEMIT(UnitState& state); - } // namespace Shader } // namespace Pica diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h index 3b1658215..9c5bd97bd 100644 --- a/src/video_core/shader/shader.h +++ b/src/video_core/shader/shader.h @@ -4,7 +4,6 @@ #pragma once -#include #include #include @@ -16,7 +15,6 @@ #include "common/vector_math.h" #include "video_core/pica.h" -#include "video_core/primitive_assembly.h" using nihstro::RegisterType; using nihstro::SourceRegister; @@ -26,11 +24,6 @@ namespace Pica { namespace Shader { -#ifdef ARCHITECTURE_x86_64 -// Forward declare JitShader because shader_jit_x64.h requires ShaderSetup (which uses JitShader) from this file -class JitShader; -#endif // ARCHITECTURE_x86_64 - struct InputVertex { Math::Vec4 attr[16]; }; @@ -84,14 +77,22 @@ struct OutputVertex { static_assert(std::is_pod::value, "Structure is not POD"); static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size"); -struct OutputRegisters { - OutputRegisters() = default; +/// Vertex shader memory +struct ShaderSetup { + struct { + // The float uniforms are accessed by the shader JIT using SSE instructions, and are + // therefore required to be 16-byte aligned. + alignas(16) Math::Vec4 f[96]; - alignas(16) Math::Vec4 value[16]; + std::array b; + std::array, 4> i; + } uniforms; - OutputVertex ToVertex(const Regs::ShaderConfig& config); + Math::Vec4 default_attributes[16]; + + std::array program_code; + std::array swizzle_data; }; -static_assert(std::is_pod::value, "Structure is not POD"); // Helper structure used to keep track of data useful for inspection of shader emulation template @@ -191,9 +192,9 @@ inline void SetField(DebugDataRecord& record, float24* va record.src3.x = value[0]; record.src3.y = value[1]; record.src3.z = value[2]; - record.src3.w = value[3]; } + template<> inline void SetField(DebugDataRecord& record, float24* value) { record.dest_in.x = value[0]; @@ -276,38 +277,43 @@ struct UnitState { // The registers are accessed by the shader JIT using SSE instructions, and are therefore // required to be 16-byte aligned. alignas(16) Math::Vec4 input[16]; + alignas(16) Math::Vec4 output[16]; alignas(16) Math::Vec4 temporary[16]; } registers; static_assert(std::is_pod::value, "Structure is not POD"); - OutputRegisters emit_buffers[3]; //TODO: 3dbrew suggests this only stores the first 7 output registers - - union EmitParameters { - u32 raw; - BitField<22, 1, u32> winding; - BitField<23, 1, u32> primitive_emit; - BitField<24, 2, u32> vertex_id; - } emit_params; - - PrimitiveAssembler::TriangleHandler emit_triangle_callback; - - OutputRegisters output_registers; - + u32 program_counter; bool conditional_code[2]; // Two Address registers and one loop counter // TODO: How many bits do these actually have? s32 address_registers[3]; + enum { + INVALID_ADDRESS = 0xFFFFFFFF + }; + + struct CallStackElement { + u32 final_address; // Address upon which we jump to return_address + u32 return_address; // Where to jump when leaving scope + u8 repeat_counter; // How often to repeat until this call stack element is removed + u8 loop_increment; // Which value to add to the loop counter after an iteration + // TODO: Should this be a signed value? Does it even matter? + u32 loop_address; // The address where we'll return to after each loop iteration + }; + + // TODO: Is there a maximal size for this? + boost::container::static_vector call_stack; + DebugData debug; static size_t InputOffset(const SourceRegister& reg) { switch (reg.GetRegisterType()) { case RegisterType::Input: - return offsetof(UnitState, registers.input) + reg.GetIndex()*sizeof(Math::Vec4); + return offsetof(UnitState::Registers, input) + reg.GetIndex()*sizeof(Math::Vec4); case RegisterType::Temporary: - return offsetof(UnitState, registers.temporary) + reg.GetIndex()*sizeof(Math::Vec4); + return offsetof(UnitState::Registers, temporary) + reg.GetIndex()*sizeof(Math::Vec4); default: UNREACHABLE(); @@ -318,105 +324,45 @@ struct UnitState { static size_t OutputOffset(const DestRegister& reg) { switch (reg.GetRegisterType()) { case RegisterType::Output: - return offsetof(UnitState, output_registers.value) + reg.GetIndex()*sizeof(Math::Vec4); + return offsetof(UnitState::Registers, output) + reg.GetIndex()*sizeof(Math::Vec4); case RegisterType::Temporary: - return offsetof(UnitState, registers.temporary) + reg.GetIndex()*sizeof(Math::Vec4); + return offsetof(UnitState::Registers, temporary) + reg.GetIndex()*sizeof(Math::Vec4); default: UNREACHABLE(); return 0; } } - - static size_t EmitParamsOffset() { - return offsetof(UnitState, emit_params.raw); - } }; -class ShaderSetup { +/** + * Performs any shader unit setup that only needs to happen once per shader (as opposed to once per + * vertex, which would happen within the `Run` function). + */ +void Setup(); -public: +/// Performs any cleanup when the emulator is shutdown +void Shutdown(); - struct { - // The float uniforms are accessed by the shader JIT using SSE instructions, and are - // therefore required to be 16-byte aligned. - alignas(16) Math::Vec4 f[96]; +/** + * Runs the currently setup shader + * @param state Shader unit state, must be setup per shader and per shader unit + * @param input Input vertex into the shader + * @param num_attributes The number of vertex shader attributes + * @return The output vertex, after having been processed by the vertex shader + */ +OutputVertex Run(UnitState& state, const InputVertex& input, int num_attributes); - std::array b; - std::array, 4> i; - } uniforms; - - static size_t UniformOffset(RegisterType type, unsigned index) { - switch (type) { - case RegisterType::FloatUniform: - return offsetof(ShaderSetup, uniforms.f) + index*sizeof(Math::Vec4); - - case RegisterType::BoolUniform: - return offsetof(ShaderSetup, uniforms.b) + index*sizeof(bool); - - case RegisterType::IntUniform: - return offsetof(ShaderSetup, uniforms.i) + index*sizeof(Math::Vec4); - - default: - UNREACHABLE(); - return 0; - } - } - - int float_regs_counter = 0; - u32 uniform_write_buffer[4]; - - std::array program_code; - std::array swizzle_data; - -#ifdef ARCHITECTURE_x86_64 - std::weak_ptr jit_shader; -#endif - - /** - * Performs any shader setup that only needs to happen once per shader (as opposed to once per - * vertex, which would happen within the `Run` function). - */ - void Setup(); - - /// Performs any cleanup when the emulator is shutdown - static void Shutdown(); - - /** - * Runs the currently setup shader - * @param state Shader unit state, must be setup per shader and per shader unit - * @param input Input vertex into the shader - * @param num_attributes The number of vertex shader attributes - * @param config Configuration object for the shader pipeline - */ - void Run(UnitState& state, const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config); - - /** - * Produce debug information based on the given shader and input vertex - * @param input Input vertex into the shader - * @param num_attributes The number of vertex shader attributes - * @param config Configuration object for the shader pipeline - * @return Debug information for this shader with regards to the given vertex - */ - DebugData ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config); - -}; - -bool SharedGS(); -bool UseGS(); -UnitState& GetShaderUnit(bool gs); -void WriteUniformBoolReg(bool gs, u32 value); -void WriteUniformIntReg(bool gs, unsigned index, const Math::Vec4& values); -void WriteUniformFloatSetupReg(bool gs, u32 value); -void WriteUniformFloatReg(bool gs, u32 value); -void WriteProgramCodeOffset(bool gs, u32 value); -void WriteProgramCode(bool gs, u32 value); -void WriteSwizzlePatternsOffset(bool gs, u32 value); -void WriteSwizzlePatterns(bool gs, u32 value); - -template -void HandleEMIT(UnitState& state); +/** + * Produce debug information based on the given shader and input vertex + * @param input Input vertex into the shader + * @param num_attributes The number of vertex shader attributes + * @param config Configuration object for the shader pipeline + * @param setup Setup object for the shader pipeline + * @return Debug information for this shader with regards to the given vertex + */ +DebugData ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const ShaderSetup& setup); } // namespace Shader diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp index 30b50ce2f..9b978583e 100644 --- a/src/video_core/shader/shader_interpreter.cpp +++ b/src/video_core/shader/shader_interpreter.cpp @@ -21,30 +21,11 @@ namespace Pica { namespace Shader { -enum { - INVALID_ADDRESS = 0xFFFFFFFF -}; - -struct CallStackElement { - u32 final_address; // Address upon which we jump to return_address - u32 return_address; // Where to jump when leaving scope - u8 repeat_counter; // How often to repeat until this call stack element is removed - u8 loop_increment; // Which value to add to the loop counter after an iteration - // TODO: Should this be a signed value? Does it even matter? - u32 loop_address; // The address where we'll return to after each loop iteration -}; - template -void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& setup, UnitState& state) { - - // TODO: Is there a maximal size for this? - boost::container::static_vector call_stack; - - u32 program_counter = config.main_offset; - - const auto& uniforms = setup.uniforms; - const auto& swizzle_data = setup.swizzle_data; - const auto& program_code = setup.program_code; +void RunInterpreter(UnitState& state) { + const auto& uniforms = g_state.vs.uniforms; + const auto& swizzle_data = g_state.vs.swizzle_data; + const auto& program_code = g_state.vs.program_code; // Placeholder for invalid inputs static float24 dummy_vec4_float24[4]; @@ -52,16 +33,16 @@ void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& s unsigned iteration = 0; bool exit_loop = false; while (!exit_loop) { - if (!call_stack.empty()) { - auto& top = call_stack.back(); - if (program_counter == top.final_address) { + if (!state.call_stack.empty()) { + auto& top = state.call_stack.back(); + if (state.program_counter == top.final_address) { state.address_registers[2] += top.loop_increment; if (top.repeat_counter-- == 0) { - program_counter = top.return_address; - call_stack.pop_back(); + state.program_counter = top.return_address; + state.call_stack.pop_back(); } else { - program_counter = top.loop_address; + state.program_counter = top.loop_address; } // TODO: Is "trying again" accurate to hardware? @@ -69,20 +50,20 @@ void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& s } } - const Instruction instr = { program_code[program_counter] }; + const Instruction instr = { program_code[state.program_counter] }; const SwizzlePattern swizzle = { swizzle_data[instr.common.operand_desc_id] }; - static auto call = [&program_counter, &call_stack](UnitState& state, u32 offset, u32 num_instructions, + static auto call = [](UnitState& state, u32 offset, u32 num_instructions, u32 return_offset, u8 repeat_count, u8 loop_increment) { - program_counter = offset - 1; // -1 to make sure when incrementing the PC we end up at the correct offset - ASSERT(call_stack.size() < call_stack.capacity()); - call_stack.push_back({ offset + num_instructions, return_offset, repeat_count, loop_increment, offset }); + state.program_counter = offset - 1; // -1 to make sure when incrementing the PC we end up at the correct offset + ASSERT(state.call_stack.size() < state.call_stack.capacity()); + state.call_stack.push_back({ offset + num_instructions, return_offset, repeat_count, loop_increment, offset }); }; - Record(state.debug, iteration, program_counter); + Record(state.debug, iteration, state.program_counter); if (iteration > 0) - Record(state.debug, iteration - 1, program_counter); + Record(state.debug, iteration - 1, state.program_counter); - state.debug.max_offset = std::max(state.debug.max_offset, 1 + program_counter); + state.debug.max_offset = std::max(state.debug.max_offset, 1 + state.program_counter); auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const float24* { switch (source_reg.GetRegisterType()) { @@ -139,7 +120,7 @@ void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& s src2[3] = src2[3] * float24::FromFloat32(-1); } - float24* dest = (instr.common.dest.Value() < 0x10) ? &state.output_registers.value[instr.common.dest.Value().GetIndex()][0] + float24* dest = (instr.common.dest.Value() < 0x10) ? &state.registers.output[instr.common.dest.Value().GetIndex()][0] : (instr.common.dest.Value() < 0x20) ? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0] : dummy_vec4_float24; @@ -478,7 +459,7 @@ void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& s src3[3] = src3[3] * float24::FromFloat32(-1); } - float24* dest = (instr.mad.dest.Value() < 0x10) ? &state.output_registers.value[instr.mad.dest.Value().GetIndex()][0] + float24* dest = (instr.mad.dest.Value() < 0x10) ? &state.registers.output[instr.mad.dest.Value().GetIndex()][0] : (instr.mad.dest.Value() < 0x20) ? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0] : dummy_vec4_float24; @@ -530,7 +511,7 @@ void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& s case OpCode::Id::JMPC: Record(state.debug, iteration, state.conditional_code); if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) { - program_counter = instr.flow_control.dest_offset - 1; + state.program_counter = instr.flow_control.dest_offset - 1; } break; @@ -538,7 +519,7 @@ void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& s Record(state.debug, iteration, uniforms.b[instr.flow_control.bool_uniform_id]); if (uniforms.b[instr.flow_control.bool_uniform_id] == !(instr.flow_control.num_instructions & 1)) { - program_counter = instr.flow_control.dest_offset - 1; + state.program_counter = instr.flow_control.dest_offset - 1; } break; @@ -546,7 +527,7 @@ void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& s call(state, instr.flow_control.dest_offset, instr.flow_control.num_instructions, - program_counter + 1, 0, 0); + state.program_counter + 1, 0, 0); break; case OpCode::Id::CALLU: @@ -555,7 +536,7 @@ void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& s call(state, instr.flow_control.dest_offset, instr.flow_control.num_instructions, - program_counter + 1, 0, 0); + state.program_counter + 1, 0, 0); } break; @@ -565,7 +546,7 @@ void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& s call(state, instr.flow_control.dest_offset, instr.flow_control.num_instructions, - program_counter + 1, 0, 0); + state.program_counter + 1, 0, 0); } break; @@ -576,8 +557,8 @@ void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& s Record(state.debug, iteration, uniforms.b[instr.flow_control.bool_uniform_id]); if (uniforms.b[instr.flow_control.bool_uniform_id]) { call(state, - program_counter + 1, - instr.flow_control.dest_offset - program_counter - 1, + state.program_counter + 1, + instr.flow_control.dest_offset - state.program_counter - 1, instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0); } else { call(state, @@ -595,8 +576,8 @@ void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& s Record(state.debug, iteration, state.conditional_code); if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) { call(state, - program_counter + 1, - instr.flow_control.dest_offset - program_counter - 1, + state.program_counter + 1, + instr.flow_control.dest_offset - state.program_counter - 1, instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0); } else { call(state, @@ -618,24 +599,14 @@ void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& s Record(state.debug, iteration, loop_param); call(state, - program_counter + 1, - instr.flow_control.dest_offset - program_counter + 1, + state.program_counter + 1, + instr.flow_control.dest_offset - state.program_counter + 1, instr.flow_control.dest_offset + 1, loop_param.x, loop_param.z); break; } - case OpCode::Id::EMIT: { - Shader::HandleEMIT(state); - break; - } - - case OpCode::Id::SETEMIT: { - state.emit_params.raw = program_code[program_counter]; - break; - } - default: LOG_ERROR(HW_GPU, "Unhandled instruction: 0x%02x (%s): 0x%08x", (int)instr.opcode.Value().EffectiveOpCode(), instr.opcode.Value().GetInfo().name, instr.hex); @@ -646,14 +617,14 @@ void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& s } } - ++program_counter; + ++state.program_counter; ++iteration; } } // Explicit instantiation -template void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& setup, UnitState& state); -template void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& setup, UnitState& state); +template void RunInterpreter(UnitState& state); +template void RunInterpreter(UnitState& state); } // namespace diff --git a/src/video_core/shader/shader_interpreter.h b/src/video_core/shader/shader_interpreter.h index 5af36f217..294bca50e 100644 --- a/src/video_core/shader/shader_interpreter.h +++ b/src/video_core/shader/shader_interpreter.h @@ -4,7 +4,6 @@ #pragma once -#include "video_core/pica.h" #include "video_core/shader/shader.h" namespace Pica { @@ -12,7 +11,7 @@ namespace Pica { namespace Shader { template -void RunInterpreter(const Pica::Regs::ShaderConfig& config, const ShaderSetup& setup, UnitState& state); +void RunInterpreter(UnitState& state); } // namespace diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp index 2bba07e14..b47d3beda 100644 --- a/src/video_core/shader/shader_jit_x64.cpp +++ b/src/video_core/shader/shader_jit_x64.cpp @@ -65,8 +65,8 @@ const JitFunction instr_table[64] = { &JitShader::Compile_IF, // ifu &JitShader::Compile_IF, // ifc &JitShader::Compile_LOOP, // loop - &JitShader::Compile_EMIT, // emit - &JitShader::Compile_SETEMIT, // setemit + nullptr, // emit + nullptr, // sete &JitShader::Compile_JMP, // jmpc &JitShader::Compile_JMP, // jmpu &JitShader::Compile_CMP, // cmp @@ -94,7 +94,7 @@ const JitFunction instr_table[64] = { // purposes, as documented below: /// Pointer to the uniform memory -static const X64Reg SETUP = R9; +static const X64Reg UNIFORMS = R9; /// The two 32-bit VS address offset registers set by the MOVA instruction static const X64Reg ADDROFFS_REG_0 = R10; static const X64Reg ADDROFFS_REG_1 = R11; @@ -109,7 +109,7 @@ static const X64Reg COND0 = R13; /// Result of the previous CMP instruction for the Y-component comparison static const X64Reg COND1 = R14; /// Pointer to the UnitState instance for the current VS unit -static const X64Reg STATE = R15; +static const X64Reg REGISTERS = R15; /// SIMD scratch register static const X64Reg SCRATCH = XMM0; /// Loaded with the first swizzled source register, otherwise can be used as a scratch register @@ -128,7 +128,7 @@ static const X64Reg NEGBIT = XMM15; // State registers that must not be modified by external functions calls // Scratch registers, e.g., SRC1 and SCRATCH, have to be saved on the side if needed static const BitSet32 persistent_regs = { - SETUP, STATE, // Pointers to register blocks + UNIFORMS, REGISTERS, // Pointers to register blocks ADDROFFS_REG_0, ADDROFFS_REG_1, LOOPCOUNT_REG, COND0, COND1, // Cached registers ONE+16, NEGBIT+16, // Constants }; @@ -138,6 +138,15 @@ static const u8 NO_SRC_REG_SWIZZLE = 0x1b; /// Raw constant for the destination register enable mask that indicates all components are enabled static const u8 NO_DEST_REG_MASK = 0xf; +/** + * Get the vertex shader instruction for a given offset in the current shader program + * @param offset Offset in the current shader program of the instruction + * @return Instruction at the specified offset + */ +static Instruction GetVertexShaderInstruction(size_t offset) { + return { g_state.vs.program_code[offset] }; +} + static void LogCritical(const char* msg) { LOG_CRITICAL(HW_GPU, msg); } @@ -160,10 +169,10 @@ void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRe size_t src_offset; if (src_reg.GetRegisterType() == RegisterType::FloatUniform) { - src_ptr = SETUP; - src_offset = ShaderSetup::UniformOffset(RegisterType::FloatUniform, src_reg.GetIndex()); + src_ptr = UNIFORMS; + src_offset = src_reg.GetIndex() * sizeof(float24) * 4; } else { - src_ptr = STATE; + src_ptr = REGISTERS; src_offset = UnitState::InputOffset(src_reg); } @@ -208,7 +217,7 @@ void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRe MOVAPS(dest, MDisp(src_ptr, src_offset_disp)); } - SwizzlePattern swiz = { setup->swizzle_data[operand_desc_id] }; + SwizzlePattern swiz = { g_state.vs.swizzle_data[operand_desc_id] }; // Generate instructions for source register swizzling as needed u8 sel = swiz.GetRawSelector(src_num); @@ -239,7 +248,7 @@ void JitShader::Compile_DestEnable(Instruction instr,X64Reg src) { dest = instr.common.dest.Value(); } - SwizzlePattern swiz = { setup->swizzle_data[operand_desc_id] }; + SwizzlePattern swiz = { g_state.vs.swizzle_data[operand_desc_id] }; int dest_offset_disp = (int)UnitState::OutputOffset(dest); ASSERT_MSG(dest_offset_disp == UnitState::OutputOffset(dest), "Destinaton offset too large for int type"); @@ -247,11 +256,11 @@ void JitShader::Compile_DestEnable(Instruction instr,X64Reg src) { // If all components are enabled, write the result to the destination register if (swiz.dest_mask == NO_DEST_REG_MASK) { // Store dest back to memory - MOVAPS(MDisp(STATE, dest_offset_disp), src); + MOVAPS(MDisp(REGISTERS, dest_offset_disp), src); } else { // Not all components are enabled, so mask the result when storing to the destination register... - MOVAPS(SCRATCH, MDisp(STATE, dest_offset_disp)); + MOVAPS(SCRATCH, MDisp(REGISTERS, dest_offset_disp)); if (Common::GetCPUCaps().sse4_1) { u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1); @@ -270,7 +279,7 @@ void JitShader::Compile_DestEnable(Instruction instr,X64Reg src) { } // Store dest back to memory - MOVAPS(MDisp(STATE, dest_offset_disp), SCRATCH); + MOVAPS(MDisp(REGISTERS, dest_offset_disp), SCRATCH); } } @@ -319,8 +328,8 @@ void JitShader::Compile_EvaluateCondition(Instruction instr) { } void JitShader::Compile_UniformCondition(Instruction instr) { - int offset = ShaderSetup::UniformOffset(RegisterType::BoolUniform, instr.flow_control.bool_uniform_id); - CMP(sizeof(bool) * 8, MDisp(SETUP, offset), Imm8(0)); + int offset = offsetof(decltype(g_state.vs.uniforms), b) + (instr.flow_control.bool_uniform_id * sizeof(bool)); + CMP(sizeof(bool) * 8, MDisp(UNIFORMS, offset), Imm8(0)); } BitSet32 JitShader::PersistentCallerSavedRegs() { @@ -495,7 +504,7 @@ void JitShader::Compile_MIN(Instruction instr) { } void JitShader::Compile_MOVA(Instruction instr) { - SwizzlePattern swiz = { setup->swizzle_data[instr.common.operand_desc_id] }; + SwizzlePattern swiz = { g_state.vs.swizzle_data[instr.common.operand_desc_id] }; if (!swiz.DestComponentEnabled(0) && !swiz.DestComponentEnabled(1)) { return; // NoOp @@ -697,8 +706,8 @@ void JitShader::Compile_LOOP(Instruction instr) { looping = true; - int offset = ShaderSetup::UniformOffset(RegisterType::IntUniform, instr.flow_control.int_uniform_id); - MOV(32, R(LOOPCOUNT), MDisp(SETUP, offset)); + int offset = offsetof(decltype(g_state.vs.uniforms), i) + (instr.flow_control.int_uniform_id * sizeof(Math::Vec4)); + MOV(32, R(LOOPCOUNT), MDisp(UNIFORMS, offset)); MOV(32, R(LOOPCOUNT_REG), R(LOOPCOUNT)); SHR(32, R(LOOPCOUNT_REG), Imm8(8)); AND(32, R(LOOPCOUNT_REG), Imm32(0xff)); // Y-component is the start @@ -719,22 +728,6 @@ void JitShader::Compile_LOOP(Instruction instr) { looping = false; } -static void Handle_EMIT(void* param1) { - UnitState& state = *static_cast*>(param1); - Shader::HandleEMIT(state); -}; - -void JitShader::Compile_EMIT(Instruction instr) { - ABI_PushRegistersAndAdjustStack(PersistentCallerSavedRegs(), 0); - MOV(PTRBITS, R(ABI_PARAM1), R(STATE)); - ABI_CallFunctionR(reinterpret_cast(Handle_EMIT), ABI_PARAM1); - ABI_PopRegistersAndAdjustStack(PersistentCallerSavedRegs(), 0); -} - -void JitShader::Compile_SETEMIT(Instruction instr) { - MOV(32, MDisp(STATE, UnitState::EmitParamsOffset()), Imm32(*(u32*)&instr.setemit)); -} - void JitShader::Compile_JMP(Instruction instr) { if (instr.opcode.Value() == OpCode::Id::JMPC) Compile_EvaluateCondition(instr); @@ -775,7 +768,7 @@ void JitShader::Compile_NextInstr() { ASSERT_MSG(code_ptr[program_counter] == nullptr, "Tried to compile already compiled shader location!"); code_ptr[program_counter] = GetCodePtr(); - Instruction instr = GetShaderInstruction(program_counter++); + Instruction instr = GetVertexShaderInstruction(program_counter++); OpCode::Id opcode = instr.opcode.Value(); auto instr_func = instr_table[static_cast(opcode)]; @@ -793,8 +786,8 @@ void JitShader::Compile_NextInstr() { void JitShader::FindReturnOffsets() { return_offsets.clear(); - for (size_t offset = 0; offset < setup->program_code.size(); ++offset) { - Instruction instr = GetShaderInstruction(offset); + for (size_t offset = 0; offset < g_state.vs.program_code.size(); ++offset) { + Instruction instr = GetVertexShaderInstruction(offset); switch (instr.opcode.Value()) { case OpCode::Id::CALL: @@ -809,11 +802,7 @@ void JitShader::FindReturnOffsets() { std::sort(return_offsets.begin(), return_offsets.end()); } -void JitShader::Compile(const ShaderSetup& setup) { - - // Get a pointer to the setup to access program_code and swizzle_data - this->setup = &setup; - +void JitShader::Compile() { // Reset flow control state program = (CompiledShader*)GetCodePtr(); program_counter = 0; @@ -827,8 +816,8 @@ void JitShader::Compile(const ShaderSetup& setup) { // The stack pointer is 8 modulo 16 at the entry of a procedure ABI_PushRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8); - MOV(PTRBITS, R(SETUP), R(ABI_PARAM1)); - MOV(PTRBITS, R(STATE), R(ABI_PARAM3)); + MOV(PTRBITS, R(REGISTERS), R(ABI_PARAM1)); + MOV(PTRBITS, R(UNIFORMS), ImmPtr(&g_state.vs.uniforms)); // Zero address/loop registers XOR(64, R(ADDROFFS_REG_0), R(ADDROFFS_REG_0)); @@ -849,7 +838,7 @@ void JitShader::Compile(const ShaderSetup& setup) { JMPptr(R(ABI_PARAM2)); // Compile entire program - Compile_Block(static_cast(this->setup->program_code.size())); + Compile_Block(static_cast(g_state.vs.program_code.size())); // Set the target for any incomplete branches now that the entire shader program has been emitted for (const auto& branch : fixup_branches) { @@ -866,10 +855,6 @@ void JitShader::Compile(const ShaderSetup& setup) { ASSERT_MSG(size <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!"); LOG_DEBUG(HW_GPU, "Compiled shader size=%d", size); - - // We don't need the setup anymore - this->setup = nullptr; - } JitShader::JitShader() { diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h index 9842d2ba7..cd6280ade 100644 --- a/src/video_core/shader/shader_jit_x64.h +++ b/src/video_core/shader/shader_jit_x64.h @@ -33,11 +33,11 @@ class JitShader : public Gen::XCodeBlock { public: JitShader(); - void Run(const Pica::Regs::ShaderConfig& config, const ShaderSetup& setup, UnitState& state) const { - program(&setup, code_ptr[config.main_offset], &state); + void Run(void* registers, unsigned offset) const { + program(registers, code_ptr[offset]); } - void Compile(const ShaderSetup& setup); + void Compile(); void Compile_ADD(Instruction instr); void Compile_DP3(Instruction instr); @@ -62,8 +62,6 @@ public: void Compile_CALLU(Instruction instr); void Compile_IF(Instruction instr); void Compile_LOOP(Instruction instr); - void Compile_EMIT(Instruction instr); - void Compile_SETEMIT(Instruction instr); void Compile_JMP(Instruction instr); void Compile_CMP(Instruction instr); void Compile_MAD(Instruction instr); @@ -98,17 +96,6 @@ private: */ void Compile_Assert(bool condition, const char* msg); - /** - * Get the shader instruction for a given offset in the current shader program - * @param offset Offset in the current shader program of the instruction - * @return Instruction at the specified offset - */ - Instruction GetShaderInstruction(size_t offset) { - Instruction instruction; - std::memcpy(&instruction, &setup->program_code[offset], sizeof(Instruction)); - return instruction; - } - /** * Analyzes the entire shader program for `CALL` instructions before emitting any code, * identifying the locations where a return needs to be inserted. @@ -127,10 +114,8 @@ private: /// Branches that need to be fixed up once the entire shader program is compiled std::vector> fixup_branches; - using CompiledShader = void(const void* setup, const u8* start_addr, void* state); + using CompiledShader = void(void* registers, const u8* start_addr); CompiledShader* program = nullptr; - - const ShaderSetup* setup = nullptr; }; } // Shader