diff --git a/src/citra_qt/debugger/graphics_breakpoints.cpp b/src/citra_qt/debugger/graphics_breakpoints.cpp index fe66918a8..b92a55c0a 100644 --- a/src/citra_qt/debugger/graphics_breakpoints.cpp +++ b/src/citra_qt/debugger/graphics_breakpoints.cpp @@ -45,6 +45,7 @@ QVariant BreakPointModel::data(const QModelIndex& index, int role) const { Pica::DebugContext::Event::IncomingPrimitiveBatch, tr("Incoming primitive batch") }, { Pica::DebugContext::Event::FinishedPrimitiveBatch, tr("Finished primitive batch") }, { Pica::DebugContext::Event::VertexShaderInvocation, tr("Vertex shader invocation") }, + { Pica::DebugContext::Event::GeometryShaderInvocation, tr("Geometry shader invocation") }, { Pica::DebugContext::Event::IncomingDisplayTransfer, tr("Incoming display transfer") }, { Pica::DebugContext::Event::GSPCommandProcessed, tr("GSP command processed") }, { Pica::DebugContext::Event::BufferSwapped, tr("Buffers swapped") } diff --git a/src/citra_qt/debugger/graphics_vertex_shader.cpp b/src/citra_qt/debugger/graphics_vertex_shader.cpp index 391666d35..50be59856 100644 --- a/src/citra_qt/debugger/graphics_vertex_shader.cpp +++ b/src/citra_qt/debugger/graphics_vertex_shader.cpp @@ -501,7 +501,7 @@ void GraphicsVertexShaderWidget::Reload(bool replace_vertex_data, void* vertex_d info.labels.insert({ entry_point, "main" }); // Generate debug information - debug_data = Pica::g_state.vs.ProduceDebugInfo(input_vertex, num_attributes, shader_config, shader_setup); + debug_data = Pica::g_state.vs.ProduceDebugInfo(input_vertex, num_attributes, shader_config); // Reload widget state for (int attr = 0; attr < num_attributes; ++attr) { diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index 309482d56..271bbb09d 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -33,10 +33,6 @@ namespace Pica { namespace CommandProcessor { -static int float_regs_counter = 0; - -static u32 uniform_write_buffer[4]; - 
static int default_attr_counter = 0; static u32 default_attr_write_buffer[3]; @@ -143,13 +139,13 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { if (immediate_attribute_id >= regs.vs.num_input_attributes+1) { immediate_attribute_id = 0; - Shader::UnitState shader_unit; + auto& shader_unit = Shader::GetShaderUnit(false); g_state.vs.Setup(); // Send to vertex shader if (g_debug_context) g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, static_cast(&immediate_input)); - g_state.vs.Run(shader_unit, immediate_input, regs.vs.num_input_attributes+1); + g_state.vs.Run(shader_unit, immediate_input, regs.vs.num_input_attributes+1, regs.vs); Shader::OutputVertex output_vertex = shader_unit.output_registers.ToVertex(regs.vs); // Send to renderer @@ -236,9 +232,12 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { unsigned int vertex_cache_pos = 0; vertex_cache_ids.fill(-1); - Shader::UnitState shader_unit; + auto& vs_shader_unit = Shader::GetShaderUnit(false); g_state.vs.Setup(); + auto& gs_unit_state = Shader::GetShaderUnit(true); + g_state.gs.Setup(); + for (unsigned int index = 0; index < regs.num_vertices; ++index) { // Indexed rendering doesn't use the start offset @@ -274,8 +273,8 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { // Send to vertex shader if (g_debug_context) g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, (void*)&input); - g_state.vs.Run(shader_unit, input, loader.GetNumTotalAttributes()); - output_registers = shader_unit.output_registers; + g_state.vs.Run(vs_shader_unit, input, loader.GetNumTotalAttributes(), regs.vs); + output_registers = vs_shader_unit.output_registers; if (is_indexed) { vertex_cache[vertex_cache_pos] = output_registers; @@ -284,17 +283,56 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { } } - // Retreive vertex from register data - Shader::OutputVertex output_vertex = output_registers.ToVertex(regs.vs); - - // Send to renderer + // Helper to send triangle 
to renderer using Pica::Shader::OutputVertex; auto AddTriangle = []( const OutputVertex& v0, const OutputVertex& v1, const OutputVertex& v2) { VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2); }; - primitive_assembler.SubmitVertex(output_vertex, AddTriangle); + if (Shader::UseGS()) { + + auto& regs = g_state.regs; + auto& gs_regs = g_state.regs.gs; + auto& gs_buf = g_state.gs_input_buffer; + + // Vertex Shader Outputs are converted into Geometry Shader inputs by filling up a buffer + // For example, if we have a geoshader that takes 6 inputs, and the vertex shader outputs 2 attributes + // It would take 3 vertices to fill up the Geometry Shader buffer + unsigned int gs_input_count = gs_regs.num_input_attributes + 1; + unsigned int vs_output_count = regs.vs_outmap_total2 + 1; + ASSERT_MSG(regs.vs_outmap_total1 == regs.vs_outmap_total2, "VS_OUTMAP_TOTAL1 and VS_OUTMAP_TOTAL2 don't match!"); + // copy into the geoshader buffer + for (unsigned int i = 0; i < vs_output_count; i++) { + if (gs_buf.index >= gs_input_count) { + // TODO(ds84182): LOG_ERROR() + ASSERT_MSG(false, "Number of GS inputs (%d) is not divisible by number of VS outputs (%d)", + gs_input_count, vs_output_count); + continue; + } + gs_buf.buffer.attr[gs_buf.index++] = output_registers.value[i]; + } + + if (gs_buf.index >= gs_input_count) { + + // b15 will be false when a new primitive starts and then switch to true at some point + //TODO: Test how this works exactly on hardware + g_state.gs.uniforms.b[15] |= (index > 0); + + // Process Geometry Shader + if (g_debug_context) + g_debug_context->OnEvent(DebugContext::Event::GeometryShaderInvocation, static_cast(&gs_buf.buffer)); + gs_unit_state.emit_triangle_callback = AddTriangle; + g_state.gs.Run(gs_unit_state, gs_buf.buffer, gs_input_count, regs.gs); + gs_unit_state.emit_triangle_callback = nullptr; + + gs_buf.index = 0; + } + } else { + Shader::OutputVertex output_vertex = output_registers.ToVertex(regs.vs); + 
primitive_assembler.SubmitVertex(output_vertex, AddTriangle); + } + } for (auto& range : memory_accesses.ranges) { @@ -311,10 +349,76 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { break; } - case PICA_REG_INDEX(vs.bool_uniforms): - for (unsigned i = 0; i < 16; ++i) - g_state.vs.uniforms.b[i] = (regs.vs.bool_uniforms.Value() & (1 << i)) != 0; + case PICA_REG_INDEX(gs.bool_uniforms): + Shader::WriteUniformBoolReg(true, value); + break; + case PICA_REG_INDEX_WORKAROUND(gs.int_uniforms[0], 0x281): + case PICA_REG_INDEX_WORKAROUND(gs.int_uniforms[1], 0x282): + case PICA_REG_INDEX_WORKAROUND(gs.int_uniforms[2], 0x283): + case PICA_REG_INDEX_WORKAROUND(gs.int_uniforms[3], 0x284): + { + unsigned index = (id - PICA_REG_INDEX_WORKAROUND(gs.int_uniforms[0], 0x281)); + auto values = regs.gs.int_uniforms[index]; + Shader::WriteUniformIntReg(true, index, Math::Vec4(values.x, values.y, values.z, values.w)); + break; + } + + case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.setup, 0x290): + Shader::WriteUniformFloatSetupReg(true, value); + break; + + case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[0], 0x291): + case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[1], 0x292): + case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[2], 0x293): + case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[3], 0x294): + case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[4], 0x295): + case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[5], 0x296): + case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[6], 0x297): + case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[7], 0x298): + { + Shader::WriteUniformFloatReg(true, value); + break; + } + + // Load shader program code + case PICA_REG_INDEX_WORKAROUND(gs.program.offset, 0x29b): + Shader::WriteProgramCodeOffset(true, value); + break; + + case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[0], 0x29c): + case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[1], 0x29d): + case 
PICA_REG_INDEX_WORKAROUND(gs.program.set_word[2], 0x29e): + case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[3], 0x29f): + case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[4], 0x2a0): + case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[5], 0x2a1): + case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[6], 0x2a2): + case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[7], 0x2a3): + { + Shader::WriteProgramCode(true, value); + break; + } + + // Load swizzle pattern data + case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.offset, 0x2a5): + Shader::WriteSwizzlePatternsOffset(true, value); + break; + + case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[0], 0x2a6): + case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[1], 0x2a7): + case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[2], 0x2a8): + case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[3], 0x2a9): + case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[4], 0x2aa): + case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[5], 0x2ab): + case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[6], 0x2ac): + case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[7], 0x2ad): + { + Shader::WriteSwizzlePatterns(true, value); + break; + } + + case PICA_REG_INDEX(vs.bool_uniforms): + Shader::WriteUniformBoolReg(false, value); break; case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[0], 0x2b1): @@ -322,14 +426,16 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[2], 0x2b3): case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[3], 0x2b4): { - int index = (id - PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[0], 0x2b1)); + unsigned index = (id - PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[0], 0x2b1)); auto values = regs.vs.int_uniforms[index]; - g_state.vs.uniforms.i[index] = Math::Vec4(values.x, values.y, values.z, values.w); - LOG_TRACE(HW_GPU, "Set integer uniform %d to %02x %02x %02x %02x", - index, values.x.Value(), 
values.y.Value(), values.z.Value(), values.w.Value()); + Shader::WriteUniformIntReg(false, index, Math::Vec4(values.x, values.y, values.z, values.w)); break; } + case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.setup, 0x2c0): + Shader::WriteUniformFloatSetupReg(false, value); + break; + case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[0], 0x2c1): case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[1], 0x2c2): case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[2], 0x2c3): @@ -339,49 +445,15 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[6], 0x2c7): case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[7], 0x2c8): { - auto& uniform_setup = regs.vs.uniform_setup; - - // TODO: Does actual hardware indeed keep an intermediate buffer or does - // it directly write the values? - uniform_write_buffer[float_regs_counter++] = value; - - // Uniforms are written in a packed format such that four float24 values are encoded in - // three 32-bit numbers. We write to internal memory once a full such vector is - // written. 
- if ((float_regs_counter >= 4 && uniform_setup.IsFloat32()) || - (float_regs_counter >= 3 && !uniform_setup.IsFloat32())) { - float_regs_counter = 0; - - auto& uniform = g_state.vs.uniforms.f[uniform_setup.index]; - - if (uniform_setup.index > 95) { - LOG_ERROR(HW_GPU, "Invalid VS uniform index %d", (int)uniform_setup.index); - break; - } - - // NOTE: The destination component order indeed is "backwards" - if (uniform_setup.IsFloat32()) { - for (auto i : {0,1,2,3}) - uniform[3 - i] = float24::FromFloat32(*(float*)(&uniform_write_buffer[i])); - } else { - // TODO: Untested - uniform.w = float24::FromRaw(uniform_write_buffer[0] >> 8); - uniform.z = float24::FromRaw(((uniform_write_buffer[0] & 0xFF) << 16) | ((uniform_write_buffer[1] >> 16) & 0xFFFF)); - uniform.y = float24::FromRaw(((uniform_write_buffer[1] & 0xFFFF) << 8) | ((uniform_write_buffer[2] >> 24) & 0xFF)); - uniform.x = float24::FromRaw(uniform_write_buffer[2] & 0xFFFFFF); - } - - LOG_TRACE(HW_GPU, "Set uniform %x to (%f %f %f %f)", (int)uniform_setup.index, - uniform.x.ToFloat32(), uniform.y.ToFloat32(), uniform.z.ToFloat32(), - uniform.w.ToFloat32()); - - // TODO: Verify that this actually modifies the register! 
- uniform_setup.index.Assign(uniform_setup.index + 1); - } + Shader::WriteUniformFloatReg(false, value); break; } // Load shader program code + case PICA_REG_INDEX_WORKAROUND(vs.program.offset, 0x2cb): + Shader::WriteProgramCodeOffset(false, value); + break; + case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[0], 0x2cc): case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[1], 0x2cd): case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[2], 0x2ce): @@ -391,12 +463,15 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[6], 0x2d2): case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[7], 0x2d3): { - g_state.vs.program_code[regs.vs.program.offset] = value; - regs.vs.program.offset++; + Shader::WriteProgramCode(false, value); break; } // Load swizzle pattern data + case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.offset, 0x2d5): + Shader::WriteSwizzlePatternsOffset(false, value); + break; + case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[0], 0x2d6): case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[1], 0x2d7): case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[2], 0x2d8): @@ -406,8 +481,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[6], 0x2dc): case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[7], 0x2dd): { - g_state.vs.swizzle_data[regs.vs.swizzle_patterns.offset] = value; - regs.vs.swizzle_patterns.offset++; + Shader::WriteSwizzlePatterns(false, value); break; } diff --git a/src/video_core/debug_utils/debug_utils.h b/src/video_core/debug_utils/debug_utils.h index 92e9734ae..a79143cbb 100644 --- a/src/video_core/debug_utils/debug_utils.h +++ b/src/video_core/debug_utils/debug_utils.h @@ -41,6 +41,7 @@ public: IncomingPrimitiveBatch, FinishedPrimitiveBatch, VertexShaderInvocation, + GeometryShaderInvocation, IncomingDisplayTransfer, GSPCommandProcessed, BufferSwapped, diff --git a/src/video_core/pica.h 
b/src/video_core/pica.h index 09702d46a..85cad3a62 100644 --- a/src/video_core/pica.h +++ b/src/video_core/pica.h @@ -1103,7 +1103,7 @@ struct Regs { // Number of vertices to render u32 num_vertices; - INSERT_PADDING_WORDS(0x1); + BitField<0, 2, u32> using_geometry_shader; // The index of the first vertex to render u32 vertex_offset; @@ -1151,7 +1151,14 @@ struct Regs { } } command_buffer; - INSERT_PADDING_WORDS(0x07); + INSERT_PADDING_WORDS(0x06); + + enum class VSComMode : u32 { + Shared = 0, + Exclusive = 1 + }; + + VSComMode vs_com_mode; enum class GPUMode : u32 { Drawing = 0, @@ -1160,7 +1167,17 @@ struct Regs { GPUMode gpu_mode; - INSERT_PADDING_WORDS(0x18); + INSERT_PADDING_WORDS(0x4); + + BitField<0, 4, u32> vs_outmap_total1; + + INSERT_PADDING_WORDS(0x6); + + BitField<0, 4, u32> vs_outmap_total2; + + BitField<0, 4, u32> gsh_misc0; + + INSERT_PADDING_WORDS(0xB); enum class TriangleTopology : u32 { List = 0, @@ -1169,7 +1186,10 @@ struct Regs { Shader = 3, // Programmable setup unit implemented in a geometry shader }; - BitField<8, 2, TriangleTopology> triangle_topology; + union { + BitField<0, 4, u32> vs_outmap_count; + BitField<8, 2, TriangleTopology> triangle_topology; + }; u32 restart_primitive; @@ -1188,8 +1208,9 @@ struct Regs { INSERT_PADDING_WORDS(0x4); union { - // Number of input attributes to shader unit - 1 - BitField<0, 4, u32> num_input_attributes; + BitField<0, 4, u32> num_input_attributes; // Number of input attributes to shader unit - 1 + BitField<8, 4, u32> use_subdivision; + BitField<24, 8, u32> use_geometry_shader; }; // Offset to shader program entry point (in words) @@ -1241,6 +1262,8 @@ struct Regs { } union { + u32 setup; + // Index of the next uniform to write to // TODO: ctrulib uses 8 bits for this, however that seems to yield lots of invalid indices // TODO: Maybe the uppermost index is for the geometry shader? Investigate! 
@@ -1361,7 +1384,11 @@ ASSERT_REG_POSITION(trigger_draw, 0x22e); ASSERT_REG_POSITION(trigger_draw_indexed, 0x22f); ASSERT_REG_POSITION(vs_default_attributes_setup, 0x232); ASSERT_REG_POSITION(command_buffer, 0x238); +ASSERT_REG_POSITION(vs_com_mode, 0x244); ASSERT_REG_POSITION(gpu_mode, 0x245); +ASSERT_REG_POSITION(vs_outmap_total1, 0x24A); +ASSERT_REG_POSITION(vs_outmap_total2, 0x251); +ASSERT_REG_POSITION(gsh_misc0, 0x252); ASSERT_REG_POSITION(triangle_topology, 0x25e); ASSERT_REG_POSITION(restart_primitive, 0x25f); ASSERT_REG_POSITION(gs, 0x280); diff --git a/src/video_core/pica_state.h b/src/video_core/pica_state.h index 01f4285a8..3043be00b 100644 --- a/src/video_core/pica_state.h +++ b/src/video_core/pica_state.h @@ -22,6 +22,8 @@ struct State { /// Pica registers Regs regs; + Shader::UnitState shader_units[4]; + Shader::ShaderSetup vs; Shader::ShaderSetup gs; @@ -75,6 +77,15 @@ struct State { // This is constructed with a dummy triangle topology PrimitiveAssembler primitive_assembler; + + /// Current geometry shader state + struct GeometryShaderState { + // Buffer used for geometry shader inputs + Shader::InputVertex buffer; + // The current index into the buffer + unsigned int index; + } gs_input_buffer; + }; extern State g_state; ///< Current Pica state diff --git a/src/video_core/primitive_assembly.cpp b/src/video_core/primitive_assembly.cpp index 68ea3c08a..df79b1925 100644 --- a/src/video_core/primitive_assembly.cpp +++ b/src/video_core/primitive_assembly.cpp @@ -19,7 +19,6 @@ template void PrimitiveAssembler::SubmitVertex(VertexType& vtx, TriangleHandler triangle_handler) { switch (topology) { - // TODO: Figure out what's different with TriangleTopology::Shader. 
case Regs::TriangleTopology::List: case Regs::TriangleTopology::Shader: if (buffer_index < 2) { diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp index f565e2c91..8e2651f8e 100644 --- a/src/video_core/shader/shader.cpp +++ b/src/video_core/shader/shader.cpp @@ -83,8 +83,7 @@ OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) { } #ifdef ARCHITECTURE_x86_64 -static std::unordered_map> shader_map; -static const JitShader* jit_shader; +static std::unordered_map> shader_map; #endif // ARCHITECTURE_x86_64 void ClearCache() { @@ -96,27 +95,27 @@ void ClearCache() { void ShaderSetup::Setup() { #ifdef ARCHITECTURE_x86_64 if (VideoCore::g_shader_jit_enabled) { - u64 cache_key = (Common::ComputeHash64(&g_state.vs.program_code, sizeof(g_state.vs.program_code)) ^ - Common::ComputeHash64(&g_state.vs.swizzle_data, sizeof(g_state.vs.swizzle_data))); + u64 cache_key = (Common::ComputeHash64(&program_code, sizeof(program_code)) ^ + Common::ComputeHash64(&swizzle_data, sizeof(swizzle_data))); auto iter = shader_map.find(cache_key); if (iter != shader_map.end()) { - jit_shader = iter->second.get(); + jit_shader = iter->second; } else { - auto shader = std::make_unique(); - shader->Compile(); - jit_shader = shader.get(); + auto shader = std::make_shared(); + shader->Compile(*this); + jit_shader = shader; shader_map[cache_key] = std::move(shader); } + } else { + jit_shader.reset(); } #endif // ARCHITECTURE_x86_64 } MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240)); -void ShaderSetup::Run(UnitState& state, const InputVertex& input, int num_attributes) { - auto& config = g_state.regs.vs; - auto& setup = g_state.vs; +void ShaderSetup::Run(UnitState& state, const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config) { MICROPROFILE_SCOPE(GPU_Shader); @@ -133,17 +132,17 @@ void ShaderSetup::Run(UnitState& state, const InputVertex& input, int num state.conditional_code[1] = false; #ifdef 
ARCHITECTURE_x86_64 - if (VideoCore::g_shader_jit_enabled) - jit_shader->Run(setup, state, config.main_offset); + if (auto shader = jit_shader.lock()) + shader.get()->Run(*this, state, config.main_offset); else - RunInterpreter(setup, state, config.main_offset); + RunInterpreter(*this, state, config.main_offset); #else - RunInterpreter(setup, state, config.main_offset); + RunInterpreter(*this, state, config.main_offset); #endif // ARCHITECTURE_x86_64 } -DebugData ShaderSetup::ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const ShaderSetup& setup) { +DebugData ShaderSetup::ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config) { UnitState state; state.debug.max_offset = 0; @@ -160,10 +159,212 @@ DebugData ShaderSetup::ProduceDebugInfo(const InputVertex& input, int num_ state.conditional_code[0] = false; state.conditional_code[1] = false; - RunInterpreter(setup, state, config.main_offset); + RunInterpreter(*this, state, config.main_offset); return state.debug; } +bool SharedGS() { + return g_state.regs.vs_com_mode == Pica::Regs::VSComMode::Shared; +} + +bool UseGS() { + // TODO(ds84182): This would be more accurate if it looked at individual shader units for the geoshader bit + // gs_regs.input_buffer_config.use_geometry_shader == 0x08 + ASSERT((g_state.regs.using_geometry_shader == 0) || (g_state.regs.using_geometry_shader == 2)); + return g_state.regs.using_geometry_shader == 2; +} + +UnitState& GetShaderUnit(bool gs) { + + // GS are always run on shader unit 3 + if (gs) { + return g_state.shader_units[3]; + } + + // The worst scheduler you'll ever see! + //TODO: How does PICA shader scheduling work? + static unsigned shader_unit_scheduler = 0; + shader_unit_scheduler++; + shader_unit_scheduler %= 3; // TODO: When does it also allow use of unit 3?! + return g_state.shader_units[shader_unit_scheduler]; +} + +void WriteUniformBoolReg(bool gs, u32 value) { + auto& setup = gs ? 
g_state.gs : g_state.vs; + + ASSERT(setup.uniforms.b.size() == 16); + for (unsigned i = 0; i < 16; ++i) + setup.uniforms.b[i] = (value & (1 << i)) != 0; + + // Copy for GS in shared mode + if (!gs && SharedGS()) { + WriteUniformBoolReg(true, value); + } +} + +void WriteUniformIntReg(bool gs, unsigned index, const Math::Vec4& values) { + const char* shader_type = gs ? "GS" : "VS"; + auto& setup = gs ? g_state.gs : g_state.vs; + + ASSERT(index < setup.uniforms.i.size()); + setup.uniforms.i[index] = values; + LOG_TRACE(HW_GPU, "Set %s integer uniform %d to %02x %02x %02x %02x", + shader_type, index, values.x.Value(), values.y.Value(), values.z.Value(), values.w.Value()); + + // Copy for GS in shared mode + if (!gs && SharedGS()) { + WriteUniformIntReg(true, index, values); + } +} + +void WriteUniformFloatSetupReg(bool gs, u32 value) { + auto& config = gs ? g_state.regs.gs : g_state.regs.vs; + + config.uniform_setup.setup = value; + + // Copy for GS in shared mode + if (!gs && SharedGS()) { + WriteUniformFloatSetupReg(true, value); + } +} + +void WriteUniformFloatReg(bool gs, u32 value) { + const char* shader_type = gs ? "GS" : "VS"; + auto& config = gs ? g_state.regs.gs : g_state.regs.vs; + auto& setup = gs ? g_state.gs : g_state.vs; + + auto& uniform_setup = config.uniform_setup; + auto& uniform_write_buffer = setup.uniform_write_buffer; + auto& float_regs_counter = setup.float_regs_counter; + + // TODO: Does actual hardware indeed keep an intermediate buffer or does + // it directly write the values? + uniform_write_buffer[float_regs_counter++] = value; + + // Uniforms are written in a packed format such that four float24 values are encoded in + // three 32-bit numbers. We write to internal memory once a full such vector is + // written. 
+ if ((float_regs_counter >= 4 && uniform_setup.IsFloat32()) || + (float_regs_counter >= 3 && !uniform_setup.IsFloat32())) { + float_regs_counter = 0; + + auto& uniform = setup.uniforms.f[uniform_setup.index]; + + if (uniform_setup.index >= 96) { + LOG_ERROR(HW_GPU, "Invalid %s float uniform index %d", shader_type, (int)uniform_setup.index); + } else { + + // NOTE: The destination component order indeed is "backwards" + if (uniform_setup.IsFloat32()) { + for (auto i : {0,1,2,3}) + uniform[3 - i] = float24::FromFloat32(*(float*)(&uniform_write_buffer[i])); + } else { + // TODO: Untested + uniform.w = float24::FromRaw(uniform_write_buffer[0] >> 8); + uniform.z = float24::FromRaw(((uniform_write_buffer[0] & 0xFF) << 16) | ((uniform_write_buffer[1] >> 16) & 0xFFFF)); + uniform.y = float24::FromRaw(((uniform_write_buffer[1] & 0xFFFF) << 8) | ((uniform_write_buffer[2] >> 24) & 0xFF)); + uniform.x = float24::FromRaw(uniform_write_buffer[2] & 0xFFFFFF); + } + + LOG_TRACE(HW_GPU, "Set %s float uniform %x to (%f %f %f %f)", shader_type, (int)uniform_setup.index, + uniform.x.ToFloat32(), uniform.y.ToFloat32(), uniform.z.ToFloat32(), + uniform.w.ToFloat32()); + + // TODO: Verify that this actually modifies the register! + uniform_setup.index.Assign(uniform_setup.index + 1); + } + + } + + // Copy for GS in shared mode + if (!gs && SharedGS()) { + WriteUniformFloatReg(true, value); + } +} + +void WriteProgramCodeOffset(bool gs, u32 value) { + auto& config = gs ? g_state.regs.gs : g_state.regs.vs; + config.program.offset = value; + + // Copy for GS in shared mode + if (!gs && SharedGS()) { + WriteProgramCodeOffset(true, value); + } +} + +void WriteProgramCode(bool gs, u32 value) { + const char* shader_type = gs ? "GS" : "VS"; + auto& config = gs ? g_state.regs.gs : g_state.regs.vs; + auto& setup = gs ? 
g_state.gs : g_state.vs; + + if (config.program.offset >= setup.program_code.size()) { + LOG_ERROR(HW_GPU, "Invalid %s program offset %d", shader_type, (int)config.program.offset); + } else { + setup.program_code[config.program.offset] = value; + config.program.offset++; + } + + // Copy for GS in shared mode + if (!gs && SharedGS()) { + WriteProgramCode(true, value); + } +} + +void WriteSwizzlePatternsOffset(bool gs, u32 value) { + auto& config = gs ? g_state.regs.gs : g_state.regs.vs; + config.swizzle_patterns.offset = value; + + // Copy for GS in shared mode + if (!gs && SharedGS()) { + WriteSwizzlePatternsOffset(true, value); + } +} + +void WriteSwizzlePatterns(bool gs, u32 value) { + const char* shader_type = gs ? "GS" : "VS"; + auto& config = gs ? g_state.regs.gs : g_state.regs.vs; + auto& setup = gs ? g_state.gs : g_state.vs; + + if (config.swizzle_patterns.offset >= setup.swizzle_data.size()) { + LOG_ERROR(HW_GPU, "Invalid %s swizzle pattern offset %d", shader_type, (int)config.swizzle_patterns.offset); + } else { + setup.swizzle_data[config.swizzle_patterns.offset] = value; + config.swizzle_patterns.offset++; + } + + // Copy for GS in shared mode + if (!gs && SharedGS()) { + WriteSwizzlePatterns(true, value); + } +} + +template +void HandleEMIT(UnitState& state) { + auto &config = g_state.regs.gs; + auto &emit_params = state.emit_params; + auto &emit_buffers = state.emit_buffers; + + ASSERT(emit_params.vertex_id < 3); + + emit_buffers[emit_params.vertex_id] = state.output_registers; + + if (emit_params.primitive_emit) { + ASSERT_MSG(state.emit_triangle_callback, "EMIT invoked but no handler set!"); + OutputVertex v0 = emit_buffers[0].ToVertex(config); + OutputVertex v1 = emit_buffers[1].ToVertex(config); + OutputVertex v2 = emit_buffers[2].ToVertex(config); + if (emit_params.winding) { + state.emit_triangle_callback(v2, v1, v0); + } else { + state.emit_triangle_callback(v0, v1, v2); + } + } +} + +// Explicit instantiation +template void 
HandleEMIT(UnitState& state); +template void HandleEMIT(UnitState& state); + } // namespace Shader } // namespace Pica diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h index fee16df62..3af8d4ebf 100644 --- a/src/video_core/shader/shader.h +++ b/src/video_core/shader/shader.h @@ -21,6 +21,7 @@ #include "video_core/pica.h" #include "video_core/pica_types.h" +#include "video_core/primitive_assembly.h" using nihstro::RegisterType; using nihstro::SourceRegister; @@ -30,6 +31,11 @@ namespace Pica { namespace Shader { +#ifdef ARCHITECTURE_x86_64 +// Forward declare JitShader because shader_jit_x64.h requires ShaderSetup (which uses JitShader) from this file +class JitShader; +#endif // ARCHITECTURE_x86_64 + struct InputVertex { alignas(16) Math::Vec4 attr[16]; }; @@ -191,9 +197,9 @@ inline void SetField(DebugDataRecord& record, float24* va record.src3.x = value[0]; record.src3.y = value[1]; record.src3.z = value[2]; + record.src3.w = value[3]; } - template<> inline void SetField(DebugDataRecord& record, float24* value) { record.dest_in.x = value[0]; @@ -280,6 +286,17 @@ struct UnitState { } registers; static_assert(std::is_pod::value, "Structure is not POD"); + OutputRegisters emit_buffers[3]; //TODO: 3dbrew suggests this only stores the first 7 output registers + + union EmitParameters { + u32 raw; + BitField<22, 1, u32> winding; + BitField<23, 1, u32> primitive_emit; + BitField<24, 2, u32> vertex_id; + } emit_params; + + PrimitiveAssembler::TriangleHandler emit_triangle_callback; + OutputRegisters output_registers; bool conditional_code[2]; @@ -317,6 +334,10 @@ struct UnitState { return 0; } } + + static size_t EmitParamsOffset() { + return offsetof(UnitState, emit_params.raw); + } }; /// Clears the shader cache @@ -350,11 +371,18 @@ struct ShaderSetup { } } + int float_regs_counter = 0; + u32 uniform_write_buffer[4]; + std::array program_code; std::array swizzle_data; +#ifdef ARCHITECTURE_x86_64 + std::weak_ptr jit_shader; +#endif + /** - * 
Performs any shader unit setup that only needs to happen once per shader (as opposed to once per + * Performs any shader setup that only needs to happen once per shader (as opposed to once per * vertex, which would happen within the `Run` function). */ void Setup(); @@ -364,21 +392,36 @@ struct ShaderSetup { * @param state Shader unit state, must be setup per shader and per shader unit * @param input Input vertex into the shader * @param num_attributes The number of vertex shader attributes + * @param config Configuration object for the shader pipeline */ - void Run(UnitState& state, const InputVertex& input, int num_attributes); + void Run(UnitState& state, const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config); /** * Produce debug information based on the given shader and input vertex * @param input Input vertex into the shader * @param num_attributes The number of vertex shader attributes * @param config Configuration object for the shader pipeline - * @param setup Setup object for the shader pipeline * @return Debug information for this shader with regards to the given vertex */ - DebugData ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const ShaderSetup& setup); + DebugData ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config); }; +bool SharedGS(); +bool UseGS(); +UnitState& GetShaderUnit(bool gs); +void WriteUniformBoolReg(bool gs, u32 value); +void WriteUniformIntReg(bool gs, unsigned index, const Math::Vec4& values); +void WriteUniformFloatSetupReg(bool gs, u32 value); +void WriteUniformFloatReg(bool gs, u32 value); +void WriteProgramCodeOffset(bool gs, u32 value); +void WriteProgramCode(bool gs, u32 value); +void WriteSwizzlePatternsOffset(bool gs, u32 value); +void WriteSwizzlePatterns(bool gs, u32 value); + +template +void HandleEMIT(UnitState& state); + } // namespace Shader } // namespace Pica diff --git 
a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp index b1eadc071..96da962b3 100644 --- a/src/video_core/shader/shader_interpreter.cpp +++ b/src/video_core/shader/shader_interpreter.cpp @@ -47,9 +47,9 @@ void RunInterpreter(const ShaderSetup& setup, UnitState& state, unsigned u32 program_counter = offset; - const auto& uniforms = g_state.vs.uniforms; - const auto& swizzle_data = g_state.vs.swizzle_data; - const auto& program_code = g_state.vs.program_code; + const auto& uniforms = setup.uniforms; + const auto& swizzle_data = setup.swizzle_data; + const auto& program_code = setup.program_code; // Placeholder for invalid inputs static float24 dummy_vec4_float24[4]; @@ -631,6 +631,16 @@ void RunInterpreter(const ShaderSetup& setup, UnitState& state, unsigned break; } + case OpCode::Id::EMIT: { + Shader::HandleEMIT(state); + break; + } + + case OpCode::Id::SETEMIT: { + state.emit_params.raw = program_code[program_counter]; + break; + } + default: LOG_ERROR(HW_GPU, "Unhandled instruction: 0x%02x (%s): 0x%08x", (int)instr.opcode.Value().EffectiveOpCode(), instr.opcode.Value().GetInfo().name, instr.hex); diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp index 43e7e6b4c..726422561 100644 --- a/src/video_core/shader/shader_jit_x64.cpp +++ b/src/video_core/shader/shader_jit_x64.cpp @@ -73,8 +73,8 @@ const JitFunction instr_table[64] = { &JitShader::Compile_IF, // ifu &JitShader::Compile_IF, // ifc &JitShader::Compile_LOOP, // loop - nullptr, // emit - nullptr, // sete + &JitShader::Compile_EMIT, // emit + &JitShader::Compile_SETEMIT, // setemit &JitShader::Compile_JMP, // jmpc &JitShader::Compile_JMP, // jmpu &JitShader::Compile_CMP, // cmp @@ -146,15 +146,6 @@ static const u8 NO_SRC_REG_SWIZZLE = 0x1b; /// Raw constant for the destination register enable mask that indicates all components are enabled static const u8 NO_DEST_REG_MASK = 0xf; -/** - * Get the vertex shader instruction 
for a given offset in the current shader program - * @param offset Offset in the current shader program of the instruction - * @return Instruction at the specified offset - */ -static Instruction GetVertexShaderInstruction(size_t offset) { - return { g_state.vs.program_code[offset] }; -} - static void LogCritical(const char* msg) { LOG_CRITICAL(HW_GPU, "%s", msg); } @@ -225,7 +216,7 @@ void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRe MOVAPS(dest, MDisp(src_ptr, src_offset_disp)); } - SwizzlePattern swiz = { g_state.vs.swizzle_data[operand_desc_id] }; + SwizzlePattern swiz = { setup->swizzle_data[operand_desc_id] }; // Generate instructions for source register swizzling as needed u8 sel = swiz.GetRawSelector(src_num); @@ -256,7 +247,7 @@ void JitShader::Compile_DestEnable(Instruction instr,X64Reg src) { dest = instr.common.dest.Value(); } - SwizzlePattern swiz = { g_state.vs.swizzle_data[operand_desc_id] }; + SwizzlePattern swiz = { setup->swizzle_data[operand_desc_id] }; int dest_offset_disp = (int)UnitState::OutputOffset(dest); ASSERT_MSG(dest_offset_disp == UnitState::OutputOffset(dest), "Destinaton offset too large for int type"); @@ -512,7 +503,7 @@ void JitShader::Compile_MIN(Instruction instr) { } void JitShader::Compile_MOVA(Instruction instr) { - SwizzlePattern swiz = { g_state.vs.swizzle_data[instr.common.operand_desc_id] }; + SwizzlePattern swiz = { setup->swizzle_data[instr.common.operand_desc_id] }; if (!swiz.DestComponentEnabled(0) && !swiz.DestComponentEnabled(1)) { return; // NoOp @@ -736,6 +727,22 @@ void JitShader::Compile_LOOP(Instruction instr) { looping = false; } +static void Handle_EMIT(void* param1) { + UnitState& state = *static_cast*>(param1); + Shader::HandleEMIT(state); +}; + +void JitShader::Compile_EMIT(Instruction instr) { + ABI_PushRegistersAndAdjustStack(PersistentCallerSavedRegs(), 0); + MOV(PTRBITS, R(ABI_PARAM1), R(STATE)); + ABI_CallFunctionR(reinterpret_cast(Handle_EMIT), ABI_PARAM1); + 
ABI_PopRegistersAndAdjustStack(PersistentCallerSavedRegs(), 0); +} + +void JitShader::Compile_SETEMIT(Instruction instr) { + MOV(32, MDisp(STATE, UnitState::EmitParamsOffset()), Imm32(*(u32*)&instr.setemit)); +} + void JitShader::Compile_JMP(Instruction instr) { if (instr.opcode.Value() == OpCode::Id::JMPC) Compile_EvaluateCondition(instr); @@ -776,7 +783,7 @@ void JitShader::Compile_NextInstr() { ASSERT_MSG(code_ptr[program_counter] == nullptr, "Tried to compile already compiled shader location!"); code_ptr[program_counter] = GetCodePtr(); - Instruction instr = GetVertexShaderInstruction(program_counter++); + Instruction instr = GetShaderInstruction(program_counter++); OpCode::Id opcode = instr.opcode.Value(); auto instr_func = instr_table[static_cast(opcode)]; @@ -794,8 +801,8 @@ void JitShader::Compile_NextInstr() { void JitShader::FindReturnOffsets() { return_offsets.clear(); - for (size_t offset = 0; offset < g_state.vs.program_code.size(); ++offset) { - Instruction instr = GetVertexShaderInstruction(offset); + for (size_t offset = 0; offset < setup->program_code.size(); ++offset) { + Instruction instr = GetShaderInstruction(offset); switch (instr.opcode.Value()) { case OpCode::Id::CALL: @@ -812,7 +819,11 @@ void JitShader::FindReturnOffsets() { std::sort(return_offsets.begin(), return_offsets.end()); } -void JitShader::Compile() { +void JitShader::Compile(const ShaderSetup& setup) { + + // Get a pointer to the setup to access program_code and swizzle_data + this->setup = &setup; + // Reset flow control state program = (CompiledShader*)GetCodePtr(); program_counter = 0; @@ -848,7 +859,7 @@ void JitShader::Compile() { JMPptr(R(ABI_PARAM3)); // Compile entire program - Compile_Block(static_cast(g_state.vs.program_code.size())); + Compile_Block(static_cast(this->setup->program_code.size())); // Set the target for any incomplete branches now that the entire shader program has been emitted for (const auto& branch : fixup_branches) { @@ -865,6 +876,9 @@ void 
JitShader::Compile() { ASSERT_MSG(size <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!"); LOG_DEBUG(HW_GPU, "Compiled shader size=%lu", size); + + // We don't need the setup anymore + this->setup = nullptr; } JitShader::JitShader() { diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h index 5468459d4..b6d4de07b 100644 --- a/src/video_core/shader/shader_jit_x64.h +++ b/src/video_core/shader/shader_jit_x64.h @@ -40,7 +40,7 @@ public: program(&setup, &state, code_ptr[offset]); } - void Compile(); + void Compile(const ShaderSetup& setup); void Compile_ADD(Instruction instr); void Compile_DP3(Instruction instr); @@ -65,6 +65,8 @@ public: void Compile_CALLU(Instruction instr); void Compile_IF(Instruction instr); void Compile_LOOP(Instruction instr); + void Compile_EMIT(Instruction instr); + void Compile_SETEMIT(Instruction instr); void Compile_JMP(Instruction instr); void Compile_CMP(Instruction instr); void Compile_MAD(Instruction instr); @@ -99,6 +101,17 @@ private: */ void Compile_Assert(bool condition, const char* msg); + /** + * Get the shader instruction for a given offset in the current shader program + * @param offset Offset in the current shader program of the instruction + * @return Instruction at the specified offset + */ + Instruction GetShaderInstruction(size_t offset) { + Instruction instruction; + std::memcpy(&instruction, &setup->program_code[offset], sizeof(Instruction)); + return instruction; + } + /** * Analyzes the entire shader program for `CALL` instructions before emitting any code, * identifying the locations where a return needs to be inserted. @@ -119,6 +132,8 @@ private: using CompiledShader = void(const void* setup, void* state, const u8* start_addr); CompiledShader* program = nullptr; + + const ShaderSetup* setup = nullptr; }; } // Shader