From 16fb89fef0579112929f10f3ea545272ff1a9427 Mon Sep 17 00:00:00 2001 From: B3n30 Date: Fri, 29 Sep 2017 20:50:11 +0200 Subject: [PATCH] Threaded vertex rendering --- src/video_core/command_processor.cpp | 213 +++++++++++++++++++-------- 1 file changed, 151 insertions(+), 62 deletions(-) diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index 3ab4af374..2f1b90d94 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -9,6 +9,7 @@ #include "common/assert.h" #include "common/logging/log.h" #include "common/microprofile.h" +#include "common/thread_pool.h" #include "common/vector_math.h" #include "core/hle/service/gsp_gpu.h" #include "core/hw/gpu.h" @@ -298,6 +299,39 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { const u16* index_address_16 = reinterpret_cast(index_address_8); bool index_u16 = index_info.format != 0; + struct CacheEntry { + Shader::AttributeBuffer output_attr; + Shader::OutputVertex output_vertex; + std::atomic id; + std::atomic_flag writing{ + ATOMIC_FLAG_INIT}; // Set when a thread is writing into this entry + }; + static std::array cache; + + // used as a mean to invalidate data from the previous batch without clearing it + static u32 cache_batch_id = std::numeric_limits::max(); + + ++cache_batch_id; + if (cache_batch_id == 0) { // reset cache if the emu ever runs long enough to overflow id + ++cache_batch_id; + for (auto& entry : cache) + entry.id = 0; + } + + struct VsOutput { + explicit VsOutput() = default; + VsOutput(VsOutput&& other) { + batch_id = 0; + } + + Pica::Shader::OutputVertex vertex; + std::atomic batch_id; + }; + static std::vector vs_output; + while (vs_output.size() < regs.pipeline.num_vertices) { + vs_output.emplace_back(); + } + PrimitiveAssembler& primitive_assembler = g_state.primitive_assembler; if (g_debug_context && g_debug_context->recorder) { @@ -314,20 +348,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { } } - DebugUtils::MemoryAccessTracker memory_accesses; - - // Simple circular-replacement vertex cache - // The size has been tuned for optimal balance between hit-rate and the cost of lookup - const size_t VERTEX_CACHE_SIZE = 32; - std::array vertex_cache_ids; - std::array vertex_cache; - Shader::AttributeBuffer vs_output; - - unsigned int vertex_cache_pos = 0; - vertex_cache_ids.fill(-1); - auto* shader_engine = Shader::GetEngine(); - Shader::UnitState shader_unit; shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset); @@ -336,66 +357,134 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { if (g_state.geometry_pipeline.NeedIndexInput()) ASSERT(is_indexed); - for (unsigned int index = 0; index < regs.pipeline.num_vertices; ++index) { - // Indexed rendering doesn't use the start offset - unsigned int vertex = - is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index]) - : (index + regs.pipeline.vertex_offset); + auto UnitLoop = [&](bool single_thread, u32 index_start, u32 index_end) { + DebugUtils::MemoryAccessTracker memory_accesses; + Shader::UnitState shader_unit; - // -1 is a common special value used for primitive restart. Since it's unknown if - // the PICA supports it, and it would mess up the caching, guard against it here. - ASSERT(vertex != -1); + for (unsigned int index = index_start; index < index_end; ++index) { + // Indexed rendering doesn't use the start offset + unsigned int vertex = + is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index]) + : (index + regs.pipeline.vertex_offset); - bool vertex_cache_hit = false; + // -1 is a common special value used for primitive restart. Since it's unknown if + // the PICA supports it, and it would mess up the caching, guard against it here. + ASSERT(vertex != -1); - if (is_indexed) { - if (g_state.geometry_pipeline.NeedIndexInput()) { - g_state.geometry_pipeline.SubmitIndex(vertex); - continue; - } + bool vertex_cache_hit = false; - if (g_debug_context && Pica::g_debug_context->recorder) { - int size = index_u16 ? 2 : 1; - memory_accesses.AddAccess(base_address + index_info.offset + size * index, - size); - } + Shader::AttributeBuffer output_attr_tmp; + Shader::AttributeBuffer& output_attr = + is_indexed ? cache[vertex].output_attr : output_attr_tmp; - for (unsigned int i = 0; i < VERTEX_CACHE_SIZE; ++i) { - if (vertex == vertex_cache_ids[i]) { - vs_output = vertex_cache[i]; - vertex_cache_hit = true; - break; - } - } - } - - if (!vertex_cache_hit) { - // Initialize data for the current vertex - Shader::AttributeBuffer input; - loader.LoadVertex(base_address, index, vertex, input, memory_accesses); - - // Send to vertex shader - if (g_debug_context) - g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, - (void*)&input); - shader_unit.LoadInput(regs.vs, input); - shader_engine->Run(g_state.vs, shader_unit); - shader_unit.WriteOutput(regs.vs, vs_output); + Pica::Shader::OutputVertex output_vertex_tmp; + Pica::Shader::OutputVertex& output_vertex = + is_indexed ? cache[vertex].output_vertex : output_vertex_tmp; if (is_indexed) { - vertex_cache[vertex_cache_pos] = vs_output; - vertex_cache_ids[vertex_cache_pos] = vertex; - vertex_cache_pos = (vertex_cache_pos + 1) % VERTEX_CACHE_SIZE; + if (single_thread && g_state.geometry_pipeline.NeedIndexInput()) { + g_state.geometry_pipeline.SubmitIndex(vertex); + continue; + } + + if (g_debug_context && Pica::g_debug_context->recorder) { + int size = index_u16 ? 2 : 1; + memory_accesses.AddAccess(base_address + index_info.offset + size * index, + size); + } + + if (single_thread) { + if (cache[vertex].id.load(std::memory_order_relaxed) == cache_batch_id) { + vertex_cache_hit = true; + } + } else if (cache[vertex].id.load(std::memory_order_acquire) == cache_batch_id) { + vertex_cache_hit = true; + } + // Set the "writing" flag and check its previous status + else if (cache[vertex].writing.test_and_set(std::memory_order_acquire)) { + // Another thread is writing into the cache, spin until it's done + while (cache[vertex].writing.test_and_set(std::memory_order_acquire)) + ; + cache[vertex].writing.clear(std::memory_order_release); + vertex_cache_hit = true; + } + } + + if (!vertex_cache_hit) { + // Initialize data for the current vertex + Shader::AttributeBuffer input; + loader.LoadVertex(base_address, index, vertex, input, memory_accesses); + + // Send to vertex shader + if (g_debug_context) + g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, + &input); + shader_unit.LoadInput(regs.vs, input); + shader_engine->Run(g_state.vs, shader_unit); + + shader_unit.WriteOutput(regs.vs, output_attr); + if (!single_thread) + output_vertex = + Shader::OutputVertex::FromAttributeBuffer(regs.rasterizer, output_attr); + + if (is_indexed) { + if (single_thread) { + cache[vertex].id.store(cache_batch_id, std::memory_order_relaxed); + } else { + cache[vertex].id.store(cache_batch_id, std::memory_order_release); + cache[vertex].writing.clear(std::memory_order_release); + } + } + } + + if (single_thread) { + // Send to geometry pipeline + g_state.geometry_pipeline.SubmitVertex(output_attr); + } else { + vs_output[index].vertex = output_vertex; + vs_output[index].batch_id.store(cache_batch_id, std::memory_order_release); } } - // Send to geometry pipeline - g_state.geometry_pipeline.SubmitVertex(vs_output); - } + static std::mutex dbg_mtx; + if (!memory_accesses.ranges.empty()) { + std::lock_guard lock(dbg_mtx); + for (auto& range : memory_accesses.ranges) { + g_debug_context->recorder->MemoryAccessed( + Memory::GetPhysicalPointer(range.first), range.second, range.first); + } + } + }; - for (auto& range : memory_accesses.ranges) { - g_debug_context->recorder->MemoryAccessed(Memory::GetPhysicalPointer(range.first), - range.second, range.first); + constexpr unsigned int VS_UNITS = 3; + const bool use_gs = regs.pipeline.use_gs == PipelineRegs::UseGS::Yes; + + auto& thread_pool = Common::ThreadPool::GetPool(); + unsigned int num_threads = use_gs ? 1 : VS_UNITS; + + if (num_threads == 1) { + UnitLoop(true, 0, regs.pipeline.num_vertices); + } else { + const u32 range = std::max(regs.pipeline.num_vertices / num_threads + 1, 50u); + for (unsigned int thread_id = 0; thread_id < num_threads; ++thread_id) { + const u32 loop_start = range * thread_id; + const u32 loop_end = loop_start + range; + if (loop_end >= regs.pipeline.num_vertices) { + thread_pool.push(UnitLoop, false, loop_start, regs.pipeline.num_vertices); + break; + } + thread_pool.push(UnitLoop, false, loop_start, loop_end); + } + for (unsigned int index = 0; index < regs.pipeline.num_vertices; ++index) { + while (vs_output[index].batch_id.load(std::memory_order_acquire) != cache_batch_id) + ; + using Pica::Shader::OutputVertex; + primitive_assembler.SubmitVertex( + vs_output[index].vertex, + [](const OutputVertex& v0, const OutputVertex& v1, const OutputVertex& v2) { + VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2); + }); + } } VideoCore::g_renderer->Rasterizer()->DrawTriangles();