From 07ff3527a0f014ee8618d67d251cb8230bd93b27 Mon Sep 17 00:00:00 2001
From: Phantom
Date: Wed, 27 Sep 2017 07:23:55 +0200
Subject: [PATCH 1/4] threaded vertex rendering

---
 src/common/CMakeLists.txt            |   1 +
 src/common/thread_pool.h             | 118 +++++++++++++++
 src/video_core/command_processor.cpp | 210 +++++++++++++++++++--------
 3 files changed, 267 insertions(+), 62 deletions(-)
 create mode 100644 src/common/thread_pool.h

diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index 7e83e64b0..08cff3450 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -75,6 +75,7 @@ set(HEADERS
     synchronized_wrapper.h
     telemetry.h
     thread.h
+    thread_pool.h
     thread_queue_list.h
     timer.h
     vector_math.h
diff --git a/src/common/thread_pool.h b/src/common/thread_pool.h
new file mode 100644
index 000000000..aea5abe9b
--- /dev/null
+++ b/src/common/thread_pool.h
@@ -0,0 +1,118 @@
+// Copyright 2016 Citra Emulator Project / PPSSPP Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <atomic>
+#include <condition_variable>
+#include <functional>
+#include <future>
+#include <memory>
+#include <mutex>
+#include <thread>
+#include <vector>
+#include <boost/lockfree/spsc_queue.hpp>
+
+#include "common/assert.h"
+
+namespace Common {
+
+class ThreadPool {
+private:
+    explicit ThreadPool(unsigned int num_threads) :
+        num_threads(num_threads),
+        workers(num_threads) {
+        ASSERT(num_threads);
+    }
+
+public:
+    static ThreadPool& GetPool() {
+        static ThreadPool thread_pool(std::thread::hardware_concurrency());
+        return thread_pool;
+    }
+
+    void set_spinlocking(bool enable) {
+        for (auto& worker : workers) {
+            worker.spinlock_enabled = enable;
+            if (enable) {
+                std::unique_lock<std::mutex> lock(worker.mutex);
+                lock.unlock();
+                worker.cv.notify_one();
+            }
+        }
+    }
+
+    template <typename F, typename... Args>
+    auto push(F&& f, Args&&... args) {
+        auto ret = workers[next_worker].push(std::forward<F>(f), std::forward<Args>(args)...);
+        next_worker = (next_worker + 1) % num_threads;
+        return ret;
+    }
+
+    unsigned int total_threads() {
+        return num_threads;
+    }
+
+private:
+    class Worker {
+    public:
+        Worker() :
+            exit_loop(false),
+            spinlock_enabled(false),
+            thread([this] { loop(); }) {
+        }
+
+        ~Worker() {
+            exit_loop = true;
+            std::unique_lock<std::mutex> lock(mutex);
+            lock.unlock();
+            cv.notify_one();
+            thread.join();
+        }
+
+        void loop() {
+            for (;;) {
+                while (queue.consume_all([](const auto& f) {
+                    f();
+                }));
+                if (spinlock_enabled)
+                    continue;
+
+                std::unique_lock<std::mutex> lock(mutex);
+                if (queue.read_available())
+                    continue;
+                if (exit_loop)
+                    break;
+                cv.wait(lock);
+            }
+        }
+
+        template <typename F, typename... Args>
+        auto push(F&& f, Args&&... args) {
+            auto task = std::make_shared<std::packaged_task<std::result_of_t<F(Args...)>()>>(
+                std::bind(std::forward<F>(f), std::forward<Args>(args)...));
+
+            while (!queue.push([task]() { (*task)(); }))
+                std::this_thread::yield();
+
+            if (!spinlock_enabled.load(std::memory_order_relaxed)) {
+                std::unique_lock<std::mutex> lock(mutex);
+                lock.unlock();
+                cv.notify_one();
+            }
+
+            return task->get_future();
+        }
+
+        bool exit_loop;
+        std::atomic<bool> spinlock_enabled;
+        std::mutex mutex;
+        std::condition_variable cv;
+        boost::lockfree::spsc_queue<std::function<void()>, boost::lockfree::capacity<100>> queue;
+        std::thread thread;
+    };
+
+    const unsigned int num_threads;
+    int next_worker = 0;
+    std::vector<Worker> workers;
+};
+
+} // namespace Common
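Not part of the patch: the pool above is a set of single-producer/single-consumer workers; push() round-robins tasks across them and hands back a std::future. A minimal, self-contained usage sketch, with a hypothetical Accumulate task for illustration only:

    #include <future>
    #include <vector>
    #include "common/thread_pool.h"

    // Hypothetical task used only for this sketch.
    static int Accumulate(int begin, int end) {
        int sum = 0;
        for (int i = begin; i < end; ++i)
            sum += i;
        return sum;
    }

    int main() {
        auto& pool = Common::ThreadPool::GetPool();

        // push() wraps the callable in a std::packaged_task, enqueues it on a
        // worker's SPSC queue, and returns a std::future for the result.
        std::vector<std::future<int>> results;
        for (int chunk = 0; chunk < 8; ++chunk)
            results.emplace_back(pool.push(Accumulate, chunk * 100, (chunk + 1) * 100));

        int total = 0;
        for (auto& r : results)
            total += r.get(); // blocks until the worker has executed the task
        return total == 319600 ? 0 : 1; // sum of 0..799
    }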
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 3ab4af374..9b90268b0 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -9,6 +9,7 @@
 #include "common/assert.h"
 #include "common/logging/log.h"
 #include "common/microprofile.h"
+#include "common/thread_pool.h"
 #include "common/vector_math.h"
 #include "core/hle/service/gsp_gpu.h"
 #include "core/hw/gpu.h"
@@ -298,6 +299,36 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
         const u16* index_address_16 = reinterpret_cast<const u16*>(index_address_8);
         bool index_u16 = index_info.format != 0;
 
+        struct CacheEntry {
+            Shader::AttributeBuffer output_attr;
+            Shader::OutputVertex output_vertex;
+            std::atomic<u32> id;
+            std::atomic_flag writing{ATOMIC_FLAG_INIT}; // Set when a thread is writing into this entry
+        };
+        static std::array<CacheEntry, 0x10000> cache;
+
+        // used as a means to invalidate data from the previous batch without clearing it
+        static u32 cache_batch_id = std::numeric_limits<u32>::max();
+
+        ++cache_batch_id;
+        if (cache_batch_id == 0) { // reset cache if the emu ever runs long enough to overflow id
+            ++cache_batch_id;
+            for (auto& entry : cache)
+                entry.id = 0;
+        }
+
+        struct VsOutput {
+            explicit VsOutput() = default;
+            VsOutput(VsOutput&& other) { batch_id = 0; }
+
+            Pica::Shader::OutputVertex vertex;
+            std::atomic<u32> batch_id;
+        };
+        static std::vector<VsOutput> vs_output;
+        while (vs_output.size() < regs.pipeline.num_vertices) {
+            vs_output.emplace_back();
+        }
+
         PrimitiveAssembler& primitive_assembler = g_state.primitive_assembler;
 
         if (g_debug_context && g_debug_context->recorder) {
@@ -314,20 +345,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
             }
         }
 
-        DebugUtils::MemoryAccessTracker memory_accesses;
-
-        // Simple circular-replacement vertex cache
-        // The size has been tuned for optimal balance between hit-rate and the cost of lookup
-        const size_t VERTEX_CACHE_SIZE = 32;
-        std::array<u32, VERTEX_CACHE_SIZE> vertex_cache_ids;
-        std::array<Shader::AttributeBuffer, VERTEX_CACHE_SIZE> vertex_cache;
-        Shader::AttributeBuffer vs_output;
-
-        unsigned int vertex_cache_pos = 0;
-        vertex_cache_ids.fill(-1);
-
         auto* shader_engine = Shader::GetEngine();
-        Shader::UnitState shader_unit;
 
         shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
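The additions above replace the small circular cache with a large per-index cache that is never cleared between draws; bumping the batch id is what invalidates stale entries. The trick in isolation (illustrative names, not from the patch):

    #include <array>
    #include <cstdint>

    struct Entry {
        int value = 0;
        std::uint32_t batch = 0; // generation tag; entry is valid only if it matches
    };

    static std::array<Entry, 0x10000> cache{};
    static std::uint32_t current_batch = 0;

    void BeginBatch() {
        // O(1) invalidation: bumping the id makes every entry stale at once.
        if (++current_batch == 0) { // on wrap-around, clear for real once
            for (auto& e : cache)
                e.batch = 0;
            ++current_batch;
        }
    }

    bool Lookup(std::uint16_t key, int& out) {
        const Entry& e = cache[key];
        if (e.batch != current_batch)
            return false; // left over from an earlier batch: treat as a miss
        out = e.value;
        return true;
    }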
@@ -336,66 +354,134 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
         if (g_state.geometry_pipeline.NeedIndexInput())
             ASSERT(is_indexed);
 
-        for (unsigned int index = 0; index < regs.pipeline.num_vertices; ++index) {
-            // Indexed rendering doesn't use the start offset
-            unsigned int vertex =
-                is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index])
-                           : (index + regs.pipeline.vertex_offset);
+        auto UnitLoop = [&](bool single_thread,
+                            u32 index_start,
+                            u32 index_end) {
+            DebugUtils::MemoryAccessTracker memory_accesses;
+            Shader::UnitState shader_unit;
 
-            // -1 is a common special value used for primitive restart. Since it's unknown if
-            // the PICA supports it, and it would mess up the caching, guard against it here.
-            ASSERT(vertex != -1);
+            for (unsigned int index = index_start; index < index_end; ++index) {
+                // Indexed rendering doesn't use the start offset
+                unsigned int vertex =
+                    is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index])
+                               : (index + regs.pipeline.vertex_offset);
 
-            bool vertex_cache_hit = false;
+                // -1 is a common special value used for primitive restart. Since it's unknown if
+                // the PICA supports it, and it would mess up the caching, guard against it here.
+                ASSERT(vertex != -1);
 
-            if (is_indexed) {
-                if (g_state.geometry_pipeline.NeedIndexInput()) {
-                    g_state.geometry_pipeline.SubmitIndex(vertex);
-                    continue;
-                }
+                bool vertex_cache_hit = false;
 
-                if (g_debug_context && Pica::g_debug_context->recorder) {
-                    int size = index_u16 ? 2 : 1;
-                    memory_accesses.AddAccess(base_address + index_info.offset + size * index,
-                                              size);
-                }
+                Shader::AttributeBuffer output_attr_tmp;
+                Shader::AttributeBuffer& output_attr =
+                    is_indexed ? cache[vertex].output_attr : output_attr_tmp;
 
-                for (unsigned int i = 0; i < VERTEX_CACHE_SIZE; ++i) {
-                    if (vertex == vertex_cache_ids[i]) {
-                        vs_output = vertex_cache[i];
-                        vertex_cache_hit = true;
-                        break;
-                    }
-                }
-            }
+                Pica::Shader::OutputVertex output_vertex_tmp;
+                Pica::Shader::OutputVertex& output_vertex =
+                    is_indexed ? cache[vertex].output_vertex : output_vertex_tmp;
 
+                if (is_indexed) {
+                    if (single_thread && g_state.geometry_pipeline.NeedIndexInput()) {
+                        g_state.geometry_pipeline.SubmitIndex(vertex);
+                        continue;
+                    }
+
+                    if (g_debug_context && Pica::g_debug_context->recorder) {
+                        int size = index_u16 ? 2 : 1;
+                        memory_accesses.AddAccess(base_address + index_info.offset + size * index,
+                                                  size);
+                    }
+
+                    if (single_thread) {
+                        if (cache[vertex].id.load(std::memory_order_relaxed) == cache_batch_id) {
+                            vertex_cache_hit = true;
+                        }
+                    }
+                    else if (cache[vertex].id.load(std::memory_order_acquire) == cache_batch_id) {
+                        vertex_cache_hit = true;
+                    }
+                    // Set the "writing" flag and check its previous status
+                    else if (cache[vertex].writing.test_and_set(std::memory_order_acquire)) {
+                        // Another thread is writing into the cache, spin until it's done
+                        while (cache[vertex].writing.test_and_set(std::memory_order_acquire));
+                        cache[vertex].writing.clear(std::memory_order_release);
+                        vertex_cache_hit = true;
+                    }
+                }
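The "writing" flag above is a one-bit spinlock: test_and_set returns the previous value, so false means the caller acquired it. The idiom in isolation (illustrative, not from the patch):

    #include <atomic>

    static std::atomic_flag busy = ATOMIC_FLAG_INIT;

    void Enter() {
        while (busy.test_and_set(std::memory_order_acquire))
            ; // spin: another thread holds the flag
    }

    void Leave() {
        busy.clear(std::memory_order_release); // release pairs with the acquire above
    }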
 
-            if (!vertex_cache_hit) {
-                // Initialize data for the current vertex
-                Shader::AttributeBuffer input;
-                loader.LoadVertex(base_address, index, vertex, input, memory_accesses);
-
-                // Send to vertex shader
-                if (g_debug_context)
-                    g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation,
-                                             (void*)&input);
-                shader_unit.LoadInput(regs.vs, input);
-                shader_engine->Run(g_state.vs, shader_unit);
-                shader_unit.WriteOutput(regs.vs, vs_output);
-
-                if (is_indexed) {
-                    vertex_cache[vertex_cache_pos] = vs_output;
-                    vertex_cache_ids[vertex_cache_pos] = vertex;
-                    vertex_cache_pos = (vertex_cache_pos + 1) % VERTEX_CACHE_SIZE;
+                if (!vertex_cache_hit) {
+                    // Initialize data for the current vertex
+                    Shader::AttributeBuffer input;
+                    loader.LoadVertex(base_address, index, vertex, input, memory_accesses);
+
+                    // Send to vertex shader
+                    if (g_debug_context)
+                        g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, &input);
+                    shader_unit.LoadInput(regs.vs, input);
+                    shader_engine->Run(g_state.vs, shader_unit);
+
+                    shader_unit.WriteOutput(regs.vs, output_attr);
+                    if (!single_thread)
+                        output_vertex = Shader::OutputVertex::FromAttributeBuffer(regs.rasterizer, output_attr);
+
+                    if (is_indexed) {
+                        if (single_thread) {
+                            cache[vertex].id.store(cache_batch_id, std::memory_order_relaxed);
+                        }
+                        else {
+                            cache[vertex].id.store(cache_batch_id, std::memory_order_release);
+                            cache[vertex].writing.clear(std::memory_order_release);
+                        }
+                    }
                 }
-            }
 
-            // Send to geometry pipeline
-            g_state.geometry_pipeline.SubmitVertex(vs_output);
-        }
+                if (single_thread) {
+                    // Send to geometry pipeline
+                    g_state.geometry_pipeline.SubmitVertex(output_attr);
+                } else {
+                    vs_output[index].vertex = output_vertex;
+                    vs_output[index].batch_id.store(cache_batch_id, std::memory_order_release);
+                }
+            }
 
-        for (auto& range : memory_accesses.ranges) {
-            g_debug_context->recorder->MemoryAccessed(Memory::GetPhysicalPointer(range.first),
-                                                      range.second, range.first);
+            static std::mutex dbg_mtx;
+            if (!memory_accesses.ranges.empty()) {
+                std::lock_guard<std::mutex> lock(dbg_mtx);
+                for (auto& range : memory_accesses.ranges) {
+                    g_debug_context->recorder->MemoryAccessed(Memory::GetPhysicalPointer(range.first),
                                                              range.second, range.first);
+                }
+            }
+        };
+
+        constexpr unsigned int VS_UNITS = 3;
+        const bool use_gs = regs.pipeline.use_gs == PipelineRegs::UseGS::Yes;
+
+        auto& thread_pool = Common::ThreadPool::GetPool();
+        unsigned int num_threads = use_gs ? 1 : thread_pool.total_threads(); // VS_UNITS
+
+        if (num_threads == 1) {
+            UnitLoop(true, 0, regs.pipeline.num_vertices);
+        }
+        else {
+            const u32 range = std::max(regs.pipeline.num_vertices / num_threads + 1, 50u);
+            for (unsigned int thread_id = 0; thread_id < num_threads; ++thread_id) {
+                const u32 loop_start = range * thread_id;
+                const u32 loop_end = loop_start + range;
+                if (loop_end >= regs.pipeline.num_vertices) {
+                    thread_pool.push(UnitLoop, false, loop_start, regs.pipeline.num_vertices);
+                    break;
+                }
+                thread_pool.push(UnitLoop, false, loop_start, loop_end);
+            }
+            for (unsigned int index = 0; index < regs.pipeline.num_vertices; ++index) {
+                while (vs_output[index].batch_id.load(std::memory_order_acquire) != cache_batch_id);
+                using Pica::Shader::OutputVertex;
+                primitive_assembler.SubmitVertex(vs_output[index].vertex,
+                                                 [](const OutputVertex& v0,
+                                                    const OutputVertex& v1,
+                                                    const OutputVertex& v2) {
+                                                     VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2);
+                                                 });
+            }
         }
 
         VideoCore::g_renderer->Rasterizer()->DrawTriangles();
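Worked example of the range split used by the dispatch above (values illustrative): with num_vertices = 1000 and num_threads = 8, range = max(1000 / 8 + 1, 50) = 126, so workers receive [0, 126), [126, 252), ... and the last range is clamped to [882, 1000); the 50-vertex floor keeps tiny batches from being spread across every worker. A standalone sketch:

    #include <algorithm>
    #include <cstdio>

    int main() {
        const unsigned int num_vertices = 1000;
        const unsigned int num_threads = 8;
        const unsigned int range = std::max(num_vertices / num_threads + 1, 50u);
        for (unsigned int thread_id = 0; thread_id < num_threads; ++thread_id) {
            const unsigned int start = range * thread_id;
            const unsigned int end = std::min(start + range, num_vertices);
            std::printf("thread %u: [%u, %u)\n", thread_id, start, end);
            if (end == num_vertices) // last chunk reached, as in the patch
                break;
        }
    }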
From 502f0c991b501da05f95daa2a3d9993eedf742da Mon Sep 17 00:00:00 2001
From: Phantom
Date: Fri, 29 Sep 2017 22:11:54 +0200
Subject: [PATCH 2/4] optimized threading

---
 src/common/thread_pool.h                 |   2 +-
 src/video_core/command_processor.cpp     | 239 +++++++++++------------
 src/video_core/debug_utils/debug_utils.h |   4 +
 3 files changed, 119 insertions(+), 126 deletions(-)

diff --git a/src/common/thread_pool.h b/src/common/thread_pool.h
index aea5abe9b..21ae33fab 100644
--- a/src/common/thread_pool.h
+++ b/src/common/thread_pool.h
@@ -14,7 +14,7 @@
 
 namespace Common {
 
-class ThreadPool {
+class ThreadPool : NonCopyable {
 private:
     explicit ThreadPool(unsigned int num_threads) :
         num_threads(num_threads),
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 9b90268b0..340536f6a 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -4,6 +4,7 @@
 
 #include <array>
 #include <cstddef>
+#include <future>
 #include <memory>
 #include <utility>
 #include "common/assert.h"
@@ -278,6 +279,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
     case PICA_REG_INDEX(pipeline.trigger_draw):
     case PICA_REG_INDEX(pipeline.trigger_draw_indexed): {
         MICROPROFILE_SCOPE(GPU_Drawing);
+        const bool is_indexed = (id == PICA_REG_INDEX(pipeline.trigger_draw_indexed));
 
 #if PICA_LOG_TEV
         DebugUtils::DumpTevStageConfig(regs.GetTevStages());
@@ -285,49 +287,47 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
         if (g_debug_context)
             g_debug_context->OnEvent(DebugContext::Event::IncomingPrimitiveBatch, nullptr);
 
+        struct CachedVertex {
+            explicit CachedVertex() : batch(0), lock{ATOMIC_FLAG_INIT} {}
+            CachedVertex(const CachedVertex& other) : CachedVertex() {}
+            union {
+                Shader::AttributeBuffer output_attr; // GS used
+                Shader::OutputVertex output_vertex;  // No GS
+            };
+            std::atomic<u32> batch;
+            std::atomic_flag lock;
+        };
+        static std::vector<CachedVertex> vs_output(0x10000);
+
+        if (!is_indexed && vs_output.size() < regs.pipeline.num_vertices)
+            vs_output.resize(regs.pipeline.num_vertices);
+
+        // used as a means to invalidate data from the previous batch without clearing it
+        static u32 batch_id = std::numeric_limits<u32>::max();
+
+        ++batch_id;
+        if (batch_id == 0) { // reset cache when id overflows for safety
+            ++batch_id;
+            for (auto& entry : vs_output)
+                entry.batch = 0;
+        }
+
         // Processes information about internal vertex attributes to figure out how a vertex is
         // loaded.
         // Later, these can be compiled and cached.
         const u32 base_address = regs.pipeline.vertex_attributes.GetPhysicalBaseAddress();
         VertexLoader loader(regs.pipeline);
 
-        // Load vertices
-        bool is_indexed = (id == PICA_REG_INDEX(pipeline.trigger_draw_indexed));
-
         const auto& index_info = regs.pipeline.index_array;
         const u8* index_address_8 = Memory::GetPhysicalPointer(base_address + index_info.offset);
         const u16* index_address_16 = reinterpret_cast<const u16*>(index_address_8);
         bool index_u16 = index_info.format != 0;
 
-        struct CacheEntry {
-            Shader::AttributeBuffer output_attr;
-            Shader::OutputVertex output_vertex;
-            std::atomic<u32> id;
-            std::atomic_flag writing{ATOMIC_FLAG_INIT}; // Set when a thread is writing into this entry
+        auto VertexIndex = [&](unsigned int index) {
+            // Indexed rendering doesn't use the start offset
+            return is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index])
+                              : (index + regs.pipeline.vertex_offset);
         };
-        static std::array<CacheEntry, 0x10000> cache;
-
-        // used as a means to invalidate data from the previous batch without clearing it
-        static u32 cache_batch_id = std::numeric_limits<u32>::max();
-
-        ++cache_batch_id;
-        if (cache_batch_id == 0) { // reset cache if the emu ever runs long enough to overflow id
-            ++cache_batch_id;
-            for (auto& entry : cache)
-                entry.id = 0;
-        }
-
-        struct VsOutput {
-            explicit VsOutput() = default;
-            VsOutput(VsOutput&& other) { batch_id = 0; }
-
-            Pica::Shader::OutputVertex vertex;
-            std::atomic<u32> batch_id;
-        };
-        static std::vector<VsOutput> vs_output;
-        while (vs_output.size() < regs.pipeline.num_vertices) {
-            vs_output.emplace_back();
-        }
 
         PrimitiveAssembler& primitive_assembler = g_state.primitive_assembler;
 
@@ -345,146 +345,135 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
             }
         }
 
+        DebugUtils::MemoryAccessTracker memory_accesses;
+
         auto* shader_engine = Shader::GetEngine();
 
         shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
 
+        const bool use_gs = regs.pipeline.use_gs == PipelineRegs::UseGS::Yes;
         g_state.geometry_pipeline.Reconfigure();
         g_state.geometry_pipeline.Setup(shader_engine);
         if (g_state.geometry_pipeline.NeedIndexInput())
             ASSERT(is_indexed);
 
-        auto UnitLoop = [&](bool single_thread,
-                            u32 index_start,
-                            u32 index_end) {
-            DebugUtils::MemoryAccessTracker memory_accesses;
+        auto VSUnitLoop = [&](u32 thread_id, auto num_threads) {
+            constexpr bool single_thread =
+                std::is_same_v<std::integral_constant<u32, 1>, decltype(num_threads)>;
             Shader::UnitState shader_unit;
 
-            for (unsigned int index = index_start; index < index_end; ++index) {
-                // Indexed rendering doesn't use the start offset
-                unsigned int vertex =
-                    is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index])
-                               : (index + regs.pipeline.vertex_offset);
+            for (unsigned int index = thread_id; index < regs.pipeline.num_vertices;
+                 index += num_threads) {
+                unsigned int vertex = VertexIndex(index);
+                auto& cached_vertex = vs_output[is_indexed ? vertex : index];
 
                 // -1 is a common special value used for primitive restart. Since it's unknown if
                 // the PICA supports it, and it would mess up the caching, guard against it here.
                 ASSERT(vertex != -1);
 
-                bool vertex_cache_hit = false;
-
-                Shader::AttributeBuffer output_attr_tmp;
-                Shader::AttributeBuffer& output_attr =
-                    is_indexed ? cache[vertex].output_attr : output_attr_tmp;
-
-                Pica::Shader::OutputVertex output_vertex_tmp;
-                Pica::Shader::OutputVertex& output_vertex =
-                    is_indexed ? cache[vertex].output_vertex : output_vertex_tmp;
-
                 if (is_indexed) {
-                    if (single_thread && g_state.geometry_pipeline.NeedIndexInput()) {
-                        g_state.geometry_pipeline.SubmitIndex(vertex);
-                        continue;
-                    }
-
                     if (g_debug_context && Pica::g_debug_context->recorder) {
                         int size = index_u16 ? 2 : 1;
                         memory_accesses.AddAccess(base_address + index_info.offset + size * index,
-                                                  size);
+                            size);
                     }
 
-                    if (single_thread) {
-                        if (cache[vertex].id.load(std::memory_order_relaxed) == cache_batch_id) {
-                            vertex_cache_hit = true;
+                    if (!single_thread) {
+                        // Try locking this vertex
+                        if (cached_vertex.lock.test_and_set(std::memory_order_acquire)) {
+                            // Another thread is processing this vertex
+                            continue;
+                        }
+                        // Vertex is not being processed and is from the correct batch
+                        else if (cached_vertex.batch.load(std::memory_order_acquire) == batch_id) {
+                            // Unlock
+                            cached_vertex.lock.clear(std::memory_order_release);
+                            continue;
                        }
                     }
-                    else if (cache[vertex].id.load(std::memory_order_acquire) == cache_batch_id) {
-                        vertex_cache_hit = true;
-                    }
-                    // Set the "writing" flag and check its previous status
-                    else if (cache[vertex].writing.test_and_set(std::memory_order_acquire)) {
-                        // Another thread is writing into the cache, spin until it's done
-                        while (cache[vertex].writing.test_and_set(std::memory_order_acquire));
-                        cache[vertex].writing.clear(std::memory_order_release);
-                        vertex_cache_hit = true;
+                    else if (cached_vertex.batch.load(std::memory_order_relaxed) == batch_id) {
+                        continue;
                     }
                 }
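The locking above is a claim protocol rather than a blocking lock: a worker that fails test_and_set simply skips the vertex (another worker owns it), and one that wins but finds the batch id already current unlocks and skips too. Reduced to its essentials (illustrative names, not from the patch):

    #include <atomic>
    #include <cstdint>

    struct Slot {
        std::atomic<std::uint32_t> batch{0};
        std::atomic_flag lock = ATOMIC_FLAG_INIT;
    };

    // True: the caller now owns the slot and must compute it, then publish the
    // batch id and clear the lock. False: skip it, it is done or in progress.
    bool TryClaim(Slot& slot, std::uint32_t batch_id) {
        if (slot.lock.test_and_set(std::memory_order_acquire))
            return false; // another thread is computing this slot
        if (slot.batch.load(std::memory_order_acquire) == batch_id) {
            slot.lock.clear(std::memory_order_release); // already computed
            return false;
        }
        return true;
    }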
 
-                if (!vertex_cache_hit) {
-                    // Initialize data for the current vertex
-                    Shader::AttributeBuffer input;
-                    loader.LoadVertex(base_address, index, vertex, input, memory_accesses);
+                Shader::AttributeBuffer attribute_buffer;
+                Shader::AttributeBuffer& output_attr = use_gs ? cached_vertex.output_attr : attribute_buffer;
 
-                    // Send to vertex shader
-                    if (g_debug_context)
-                        g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, &input);
-                    shader_unit.LoadInput(regs.vs, input);
-                    shader_engine->Run(g_state.vs, shader_unit);
+                // Initialize data for the current vertex
+                loader.LoadVertex(base_address, index, vertex, attribute_buffer, memory_accesses);
 
-                    shader_unit.WriteOutput(regs.vs, output_attr);
-                    if (!single_thread)
-                        output_vertex = Shader::OutputVertex::FromAttributeBuffer(regs.rasterizer, output_attr);
+                // Send to vertex shader
+                if (g_debug_context)
+                    g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation,
+                                             &attribute_buffer);
+                shader_unit.LoadInput(regs.vs, attribute_buffer);
+                shader_engine->Run(g_state.vs, shader_unit);
 
-                    if (is_indexed) {
-                        if (single_thread) {
-                            cache[vertex].id.store(cache_batch_id, std::memory_order_relaxed);
-                        }
-                        else {
-                            cache[vertex].id.store(cache_batch_id, std::memory_order_release);
-                            cache[vertex].writing.clear(std::memory_order_release);
-                        }
+                shader_unit.WriteOutput(regs.vs, output_attr);
+                if (!use_gs)
+                    cached_vertex.output_vertex =
+                        Shader::OutputVertex::FromAttributeBuffer(regs.rasterizer, output_attr);
+
+                if (!single_thread) {
+                    cached_vertex.batch.store(batch_id, std::memory_order_release);
+                    if (is_indexed) {
+                        cached_vertex.lock.clear(std::memory_order_release);
                     }
                 }
-
-                if (single_thread) {
-                    // Send to geometry pipeline
-                    g_state.geometry_pipeline.SubmitVertex(output_attr);
-                } else {
-                    vs_output[index].vertex = output_vertex;
-                    vs_output[index].batch_id.store(cache_batch_id, std::memory_order_release);
-                }
-            }
-
-            static std::mutex dbg_mtx;
-            if (!memory_accesses.ranges.empty()) {
-                std::lock_guard<std::mutex> lock(dbg_mtx);
-                for (auto& range : memory_accesses.ranges) {
-                    g_debug_context->recorder->MemoryAccessed(Memory::GetPhysicalPointer(range.first),
-                                                              range.second, range.first);
+                else if (is_indexed) {
+                    cached_vertex.batch.store(batch_id, std::memory_order_relaxed);
                 }
             }
         };
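The "auto num_threads" parameter of VSUnitLoop is instantiated twice: once with a plain u32 and once with std::integral_constant<u32, 1>, which makes single_thread a compile-time constant so the synchronization branches drop out of the serial instantiation. A minimal sketch of the trick (illustrative; the patch uses an ordinary if on the constexpr bool rather than if constexpr):

    #include <cstdint>
    #include <cstdio>
    #include <type_traits>

    using u32 = std::uint32_t;

    int main() {
        auto loop = [](u32 thread_id, auto num_threads) {
            // For std::integral_constant<u32, 1> this is true at compile time,
            // so the compiler can eliminate the parallel branch entirely.
            constexpr bool single_thread =
                std::is_same_v<std::integral_constant<u32, 1>, decltype(num_threads)>;
            if constexpr (single_thread)
                std::printf("thread %u: serial instantiation\n", thread_id);
            else
                std::printf("thread %u of %u: parallel instantiation\n", thread_id,
                            static_cast<u32>(num_threads));
        };

        loop(0, std::integral_constant<u32, 1>{}); // serial version
        loop(2, 4u);                               // parallel version
    }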
 
-        constexpr unsigned int VS_UNITS = 3;
-        const bool use_gs = regs.pipeline.use_gs == PipelineRegs::UseGS::Yes;
-
-        auto& thread_pool = Common::ThreadPool::GetPool();
-        unsigned int num_threads = use_gs ? 1 : thread_pool.total_threads(); // VS_UNITS
+        auto& thread_pool = Common::ThreadPool::GetPool();
+        std::vector<std::future<void>> futures;
 
-        if (num_threads == 1) {
-            UnitLoop(true, 0, regs.pipeline.num_vertices);
-        }
-        else {
-            const u32 range = std::max(regs.pipeline.num_vertices / num_threads + 1, 50u);
+        constexpr unsigned int MIN_VERTICES_PER_THREAD = 20;
+        unsigned int num_threads = regs.pipeline.num_vertices / MIN_VERTICES_PER_THREAD +
+                                   (regs.pipeline.num_vertices % MIN_VERTICES_PER_THREAD != 0);
+        num_threads = std::min(num_threads, std::thread::hardware_concurrency() - 1);
+
+        if (num_threads <= 1) {
+            VSUnitLoop(0, std::integral_constant<u32, 1>{});
+        } else {
             for (unsigned int thread_id = 0; thread_id < num_threads; ++thread_id) {
-                const u32 loop_start = range * thread_id;
-                const u32 loop_end = loop_start + range;
-                if (loop_end >= regs.pipeline.num_vertices) {
-                    thread_pool.push(UnitLoop, false, loop_start, regs.pipeline.num_vertices);
-                    break;
-                }
-                thread_pool.push(UnitLoop, false, loop_start, loop_end);
+                futures.emplace_back(thread_pool.push(VSUnitLoop, thread_id, num_threads));
             }
-            for (unsigned int index = 0; index < regs.pipeline.num_vertices; ++index) {
-                while (vs_output[index].batch_id.load(std::memory_order_acquire) != cache_batch_id);
-                using Pica::Shader::OutputVertex;
-                primitive_assembler.SubmitVertex(vs_output[index].vertex,
-                                                 [](const OutputVertex& v0,
-                                                    const OutputVertex& v1,
-                                                    const OutputVertex& v2) {
-                                                     VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2);
-                                                 });
+        }
+
+        for (unsigned int index = 0; index < regs.pipeline.num_vertices; ++index) {
+            unsigned int vertex = VertexIndex(index);
+            auto& cached_vertex = vs_output[is_indexed ? vertex : index];
+
+            if (use_gs && is_indexed && g_state.geometry_pipeline.NeedIndexInput()) {
+                g_state.geometry_pipeline.SubmitIndex(vertex);
+                continue;
+            }
+
+            // Synchronize threads
+            if (num_threads != 1) {
+                while (cached_vertex.batch.load(std::memory_order_acquire) != batch_id) {
+                    std::this_thread::yield();
+                }
+            }
+
+            if (use_gs) {
+                // Send to geometry pipeline
+                g_state.geometry_pipeline.SubmitVertex(cached_vertex.output_attr);
+            } else {
+                primitive_assembler.SubmitVertex(
+                    cached_vertex.output_vertex,
+                    std::bind(&std::decay_t<decltype(*VideoCore::g_renderer->Rasterizer())>::AddTriangle,
+                              VideoCore::g_renderer->Rasterizer(),
+                              std::placeholders::_1, std::placeholders::_2, std::placeholders::_3));
             }
         }
 
+        for (auto& future : futures)
+            future.get();
+
+        for (auto& range : memory_accesses.ranges) {
+            g_debug_context->recorder->MemoryAccessed(Memory::GetPhysicalPointer(range.first),
+                                                      range.second, range.first);
+        }
+
         VideoCore::g_renderer->Rasterizer()->DrawTriangles();
+
         if (g_debug_context) {
             g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, nullptr);
         }
diff --git a/src/video_core/debug_utils/debug_utils.h b/src/video_core/debug_utils/debug_utils.h
index c1f29c527..f266bc9cd 100644
--- a/src/video_core/debug_utils/debug_utils.h
+++ b/src/video_core/debug_utils/debug_utils.h
@@ -235,6 +235,8 @@ class MemoryAccessTracker {
 public:
     /// Record a particular memory access in the list
     void AddAccess(u32 paddr, u32 size) {
+        std::lock_guard<std::mutex> lock(mutex);
+
         // Create new range or extend existing one
         ranges[paddr] = std::max(ranges[paddr], size);
 
@@ -242,6 +244,8 @@ public:
         SimplifyRanges();
     }
 
+    std::mutex mutex;
+
     /// Map of accessed ranges (mapping start address to range size)
     std::map<u32, u32> ranges;
 };
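The handoff this patch introduces between the VS workers and the submitting thread relies on release/acquire pairing on the per-vertex batch id: a worker writes the payload and then publishes it with a release store; the consumer spins with acquire loads (yielding) until it observes the current id. In isolation (illustrative names, not from the patch):

    #include <atomic>
    #include <cstdint>
    #include <thread>

    struct Slot {
        int payload = 0;
        std::atomic<std::uint32_t> batch{0};
    };

    void Produce(Slot& s, int value, std::uint32_t batch_id) {
        s.payload = value;                                  // write the data first
        s.batch.store(batch_id, std::memory_order_release); // then publish it
    }

    int Consume(Slot& s, std::uint32_t batch_id) {
        while (s.batch.load(std::memory_order_acquire) != batch_id)
            std::this_thread::yield(); // not yet published by its worker
        return s.payload; // safe: the acquire load pairs with the release store
    }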
From e258faf0f0760cf89bd9745824e8f0c363b1897a Mon Sep 17 00:00:00 2001
From: Phantom
Date: Sat, 30 Sep 2017 08:08:41 +0200
Subject: [PATCH 3/4] fix dispatching

---
 src/video_core/command_processor.cpp | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 340536f6a..f7e424a34 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -424,16 +424,15 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
         auto& thread_pool = Common::ThreadPool::GetPool();
         std::vector<std::future<void>> futures;
 
-        constexpr unsigned int MIN_VERTICES_PER_THREAD = 20;
-        unsigned int num_threads = regs.pipeline.num_vertices / MIN_VERTICES_PER_THREAD +
-                                   (regs.pipeline.num_vertices % MIN_VERTICES_PER_THREAD != 0);
-        num_threads = std::min(num_threads, std::thread::hardware_concurrency() - 1);
+        constexpr unsigned int MIN_VERTICES_PER_THREAD = 10;
+        unsigned int vs_threads = regs.pipeline.num_vertices / MIN_VERTICES_PER_THREAD;
+        vs_threads = std::min(vs_threads, std::thread::hardware_concurrency() - 1);
 
-        if (num_threads <= 1) {
+        if (!vs_threads) {
             VSUnitLoop(0, std::integral_constant<u32, 1>{});
         } else {
-            for (unsigned int thread_id = 0; thread_id < num_threads; ++thread_id) {
-                futures.emplace_back(thread_pool.push(VSUnitLoop, thread_id, num_threads));
+            for (unsigned int thread_id = 0; thread_id < vs_threads; ++thread_id) {
+                futures.emplace_back(thread_pool.push(VSUnitLoop, thread_id, vs_threads));
             }
         }
 
@@ -447,7 +446,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
         }
 
         // Synchronize threads
-        if (num_threads != 1) {
+        if (vs_threads) {
             while (cached_vertex.batch.load(std::memory_order_acquire) != batch_id) {
                 std::this_thread::yield();
             }
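Worked example of the revised heuristic (assuming an 8-core host, so std::thread::hardware_concurrency() == 8): 9 vertices give 9 / 10 = 0 worker threads and take the inline single-threaded path; 45 vertices give 4 workers; 500 vertices give 50, capped to 8 - 1 = 7 so one core stays free for the submitting thread. As a sketch:

    #include <algorithm>

    unsigned int VsThreads(unsigned int num_vertices, unsigned int hw_threads) {
        constexpr unsigned int MIN_VERTICES_PER_THREAD = 10;
        const unsigned int vs_threads = num_vertices / MIN_VERTICES_PER_THREAD;
        return std::min(vs_threads, hw_threads - 1); // leave a core for the submitter
    }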
From a1112cb7122faa30e5fd17d022f87ffb5e08f637 Mon Sep 17 00:00:00 2001
From: Phantom
Date: Tue, 10 Oct 2017 03:43:26 +0200
Subject: [PATCH 4/4] glBufferSubData VBO

---
 src/video_core/command_processor.cpp     |  9 +++---
 src/video_core/primitive_assembly.cpp    |  2 +-
 src/video_core/primitive_assembly.h      |  2 +-
 .../renderer_opengl/gl_rasterizer.cpp    | 30 ++++++++++++-------
 .../renderer_opengl/gl_rasterizer.h      |  1 +
 5 files changed, 27 insertions(+), 17 deletions(-)

diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index f7e424a34..643b3f187 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -352,10 +352,6 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
         shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
 
         const bool use_gs = regs.pipeline.use_gs == PipelineRegs::UseGS::Yes;
-        g_state.geometry_pipeline.Reconfigure();
-        g_state.geometry_pipeline.Setup(shader_engine);
-        if (g_state.geometry_pipeline.NeedIndexInput())
-            ASSERT(is_indexed);
 
         auto VSUnitLoop = [&](u32 thread_id, auto num_threads) {
             constexpr bool single_thread =
                 std::is_same_v<std::integral_constant<u32, 1>, decltype(num_threads)>;
@@ -436,6 +432,11 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
             }
         }
 
+        g_state.geometry_pipeline.Reconfigure();
+        g_state.geometry_pipeline.Setup(shader_engine);
+        if (g_state.geometry_pipeline.NeedIndexInput())
+            ASSERT(is_indexed);
+
         for (unsigned int index = 0; index < regs.pipeline.num_vertices; ++index) {
             unsigned int vertex = VertexIndex(index);
             auto& cached_vertex = vs_output[is_indexed ? vertex : index];
diff --git a/src/video_core/primitive_assembly.cpp b/src/video_core/primitive_assembly.cpp
index 9c3dd4cab..9ff9d097a 100644
--- a/src/video_core/primitive_assembly.cpp
+++ b/src/video_core/primitive_assembly.cpp
@@ -15,7 +15,7 @@ PrimitiveAssembler<VertexType>::PrimitiveAssembler(PipelineRegs::TriangleTopolog
 
 template <typename VertexType>
 void PrimitiveAssembler<VertexType>::SubmitVertex(const VertexType& vtx,
-                                                  TriangleHandler triangle_handler) {
+                                                  const TriangleHandler& triangle_handler) {
     switch (topology) {
     case PipelineRegs::TriangleTopology::List:
     case PipelineRegs::TriangleTopology::Shader:
diff --git a/src/video_core/primitive_assembly.h b/src/video_core/primitive_assembly.h
index 12de8e3b9..2ecbe6742 100644
--- a/src/video_core/primitive_assembly.h
+++ b/src/video_core/primitive_assembly.h
@@ -27,7 +27,7 @@ struct PrimitiveAssembler {
      * NOTE: We could specify the triangle handler in the constructor, but this way we can
      * keep event and handler code next to each other.
      */
-    void SubmitVertex(const VertexType& vtx, TriangleHandler triangle_handler);
+    void SubmitVertex(const VertexType& vtx, const TriangleHandler& triangle_handler);
 
     /**
     * Invert the vertex order of the next triangle. Called by geometry shader emitter.
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 7e09e4712..9bcd7c103 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -27,7 +27,7 @@ MICROPROFILE_DEFINE(OpenGL_Drawing, "OpenGL", "Drawing", MP_RGB(128, 128, 192));
 MICROPROFILE_DEFINE(OpenGL_Blits, "OpenGL", "Blits", MP_RGB(100, 100, 255));
 MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Mgmt", MP_RGB(100, 255, 100));
 
-RasterizerOpenGL::RasterizerOpenGL() : shader_dirty(true) {
+RasterizerOpenGL::RasterizerOpenGL() : shader_dirty(true), vertex_buffer_size(0) {
     // Clipping plane 0 is always enabled for PICA fixed clip plane z <= 0
     state.clip_distance[0] = true;
 
@@ -236,24 +236,24 @@ void RasterizerOpenGL::DrawTriangles() {
     state.Apply();
 
     glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D,
-                           color_surface != nullptr ? color_surface->texture.handle : 0, 0);
+        color_surface != nullptr ? color_surface->texture.handle : 0, 0);
     if (depth_surface != nullptr) {
         if (regs.framebuffer.framebuffer.depth_format ==
             Pica::FramebufferRegs::DepthFormat::D24S8) {
             // attach both depth and stencil
             glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D,
-                                   depth_surface->texture.handle, 0);
+                depth_surface->texture.handle, 0);
         } else {
             // attach depth
             glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D,
-                                   depth_surface->texture.handle, 0);
+                depth_surface->texture.handle, 0);
             // clear stencil attachment
             glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0,
                                    0);
         }
     } else {
         // clear both depth and stencil attachment
         glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0,
-                               0);
+            0);
     }
 
     // Sync the viewport
@@ -263,6 +263,11 @@ void RasterizerOpenGL::DrawTriangles() {
     GLsizei viewport_height =
         (GLsizei)Pica::float24::FromRaw(regs.rasterizer.viewport_size_y).ToFloat32() * 2;
 
+    const float res_scale_width = color_surface != nullptr ? color_surface->res_scale_width :
+        (depth_surface == nullptr ? 1.0f : depth_surface->res_scale_width);
+    const float res_scale_height = color_surface != nullptr ? color_surface->res_scale_height :
+        (depth_surface == nullptr ? 1.0f : depth_surface->res_scale_height);
+
     glViewport(
         (GLint)(rect.left + regs.rasterizer.viewport_corner.x * color_surface->res_scale_width),
         (GLint)(rect.bottom + regs.rasterizer.viewport_corner.y * color_surface->res_scale_height),
@@ -374,16 +379,21 @@ void RasterizerOpenGL::DrawTriangles() {
     // Sync the uniform data
     if (uniform_block_data.dirty) {
         glBufferData(GL_UNIFORM_BUFFER, sizeof(UniformData), &uniform_block_data.data,
-                     GL_STATIC_DRAW);
+            GL_STATIC_DRAW);
         uniform_block_data.dirty = false;
     }
 
     state.Apply();
 
     // Draw the vertex batch
-    glBufferData(GL_ARRAY_BUFFER, vertex_batch.size() * sizeof(HardwareVertex), vertex_batch.data(),
-                 GL_STREAM_DRAW);
-    glDrawArrays(GL_TRIANGLES, 0, (GLsizei)vertex_batch.size());
+    GLsizeiptr target_size = vertex_batch.size() * sizeof(HardwareVertex);
+    if (vertex_buffer_size < target_size) {
+        vertex_buffer_size = target_size * 2;
+        glBufferData(GL_ARRAY_BUFFER, vertex_buffer_size, nullptr, GL_STREAM_DRAW);
+    }
+    glBufferSubData(GL_ARRAY_BUFFER, 0, target_size, vertex_batch.data());
+    glDrawArrays(GL_TRIANGLES, 0, static_cast<GLsizei>(vertex_batch.size()));
+
+    vertex_batch.clear();
 
     // Mark framebuffer surfaces as dirty
     // TODO: Restrict invalidation area to the viewport
@@ -396,8 +406,6 @@ void RasterizerOpenGL::DrawTriangles() {
         res_cache.FlushRegion(depth_surface->addr, depth_surface->size, depth_surface, true);
     }
 
-    vertex_batch.clear();
-
     // Unbind textures for potential future use as framebuffer attachments
     for (unsigned texture_index = 0; texture_index < pica_textures.size(); ++texture_index) {
         state.texture_units[texture_index].texture_2d = 0;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 46c62961c..ef94d498a 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -283,6 +283,7 @@ private:
     std::array<SamplerInfo, 3> texture_samplers;
     OGLVertexArray vertex_array;
     OGLBuffer vertex_buffer;
+    GLsizeiptr vertex_buffer_size;
     OGLBuffer uniform_buffer;
     OGLFramebuffer framebuffer;
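The vertex-upload change keeps a single VBO and reallocates it only when a batch outgrows it, doubling the capacity so reallocations are amortized; steady-state frames go through glBufferSubData alone instead of a full glBufferData reallocation per draw. The idiom in isolation (a sketch; assumes a current GL context, the project's GL headers, and the VBO already bound to GL_ARRAY_BUFFER):

    // Grow-and-reuse streaming upload, as introduced in the patch.
    static GLsizeiptr buffer_capacity = 0;

    void UploadBatch(const void* data, GLsizeiptr size) {
        if (buffer_capacity < size) {
            buffer_capacity = size * 2; // geometric growth: O(log n) reallocations
            glBufferData(GL_ARRAY_BUFFER, buffer_capacity, nullptr, GL_STREAM_DRAW);
        }
        glBufferSubData(GL_ARRAY_BUFFER, 0, size, data); // no reallocation here
    }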