diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index 7e83e64b0..08cff3450 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -75,6 +75,7 @@ set(HEADERS
     synchronized_wrapper.h
     telemetry.h
     thread.h
+    thread_pool.h
     thread_queue_list.h
     timer.h
     vector_math.h
diff --git a/src/common/thread_pool.h b/src/common/thread_pool.h
new file mode 100644
index 000000000..aea5abe9b
--- /dev/null
+++ b/src/common/thread_pool.h
@@ -0,0 +1,118 @@
+// Copyright 2016 Citra Emulator Project / PPSSPP Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <atomic>
+#include <condition_variable>
+#include <functional>
+#include <future>
+#include <memory>
+#include <mutex>
+#include <thread>
+#include <type_traits>
+#include <vector>
+#include <boost/lockfree/spsc_queue.hpp>
+
+#include "common/assert.h"
+
+namespace Common {
+
+class ThreadPool {
+private:
+    explicit ThreadPool(unsigned int num_threads) :
+        num_threads(num_threads),
+        workers(num_threads) {
+        ASSERT(num_threads);
+    }
+
+public:
+    static ThreadPool& GetPool() {
+        static ThreadPool thread_pool(std::thread::hardware_concurrency());
+        return thread_pool;
+    }
+
+    void set_spinlocking(bool enable) {
+        for (auto& worker : workers) {
+            worker.spinlock_enabled = enable;
+            if (enable) {
+                // Take and drop the lock so a worker blocked in cv.wait() observes
+                // the new mode before being woken
+                std::unique_lock<std::mutex> lock(worker.mutex);
+                lock.unlock();
+                worker.cv.notify_one();
+            }
+        }
+    }
+
+    template <typename F, typename... Args>
+    auto push(F&& f, Args&&... args) {
+        auto ret = workers[next_worker].push(std::forward<F>(f), std::forward<Args>(args)...);
+        next_worker = (next_worker + 1) % num_threads;
+        return ret;
+    }
+
+    unsigned int total_threads() {
+        return num_threads;
+    }
+
+private:
+    class Worker {
+    public:
+        Worker() :
+            exit_loop(false),
+            spinlock_enabled(false),
+            thread([this] { loop(); }) {
+        }
+
+        ~Worker() {
+            exit_loop = true;
+            std::unique_lock<std::mutex> lock(mutex);
+            lock.unlock();
+            cv.notify_one();
+            thread.join();
+        }
+
+        void loop() {
+            for (;;) {
+                while (queue.consume_all([](const auto& f) { f(); }))
+                    ;
+                if (spinlock_enabled)
+                    continue;
+
+                std::unique_lock<std::mutex> lock(mutex);
+                if (queue.read_available())
+                    continue;
+                if (exit_loop)
+                    break;
+                cv.wait(lock);
+            }
+        }
+
+        template <typename F, typename... Args>
+        auto push(F&& f, Args&&... args) {
+            auto task = std::make_shared<std::packaged_task<std::result_of_t<F(Args...)>()>>(
+                std::bind(std::forward<F>(f), std::forward<Args>(args)...));
+
+            while (!queue.push([task]() { (*task)(); }))
+                std::this_thread::yield();
+
+            if (!spinlock_enabled.load(std::memory_order_relaxed)) {
+                std::unique_lock<std::mutex> lock(mutex);
+                lock.unlock();
+                cv.notify_one();
+            }
+
+            return task->get_future();
+        }
+
+        bool exit_loop;
+        std::atomic<bool> spinlock_enabled;
+        std::mutex mutex;
+        std::condition_variable cv;
+        boost::lockfree::spsc_queue<std::function<void()>, boost::lockfree::capacity<100>> queue;
+        std::thread thread;
+    };
+
+    const unsigned int num_threads;
+    int next_worker = 0;
+    std::vector<Worker> workers;
+};
+
+} // namespace Common
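Reviewer note on thread_pool.h: GetPool() returns a lazily constructed singleton;
push() round-robins tasks across per-worker SPSC queues and returns the
std::future of the underlying std::packaged_task. A minimal usage sketch under
the header as reconstructed above (the squaring task and ParallelSquareSum are
invented for illustration, not part of this change):

    #include <future>
    #include <vector>
    #include "common/thread_pool.h"

    int ParallelSquareSum() {
        auto& pool = Common::ThreadPool::GetPool();

        // One task per worker; each future completes once its worker ran the task
        std::vector<std::future<int>> results;
        for (unsigned int i = 0; i < pool.total_threads(); ++i)
            results.emplace_back(pool.push([](int x) { return x * x; }, i));

        int sum = 0;
        for (auto& result : results)
            sum += result.get(); // get() blocks until the task has executed
        return sum;
    }

With set_spinlocking(true), workers busy-poll their queues instead of sleeping on
the condition variable, trading CPU time for lower wake-up latency on short tasks.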
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 3ab4af374..9b90268b0 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -9,6 +9,7 @@
 #include "common/assert.h"
 #include "common/logging/log.h"
 #include "common/microprofile.h"
+#include "common/thread_pool.h"
 #include "common/vector_math.h"
 #include "core/hle/service/gsp_gpu.h"
 #include "core/hw/gpu.h"
@@ -298,6 +299,36 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
         const u16* index_address_16 = reinterpret_cast<const u16*>(index_address_8);
         bool index_u16 = index_info.format != 0;
 
+        struct CacheEntry {
+            Shader::AttributeBuffer output_attr;
+            Shader::OutputVertex output_vertex;
+            std::atomic<u32> id;
+            std::atomic_flag writing = ATOMIC_FLAG_INIT; // Set when a thread is writing into this entry
+        };
+        // Vertex indices are at most 16 bits wide, so the cache can be indexed by vertex id directly
+        static std::array<CacheEntry, 0x10000> cache;
+
+        // Used as a means to invalidate data from the previous batch without clearing it
+        static u32 cache_batch_id = std::numeric_limits<u32>::max();
+
+        ++cache_batch_id;
+        if (cache_batch_id == 0) { // reset cache if the emu ever runs long enough to overflow id
+            ++cache_batch_id;
+            for (auto& entry : cache)
+                entry.id = 0;
+        }
+
+        struct VsOutput {
+            explicit VsOutput() = default;
+            // atomics are not movable, so reset batch_id on move instead
+            VsOutput(VsOutput&& other) { batch_id = 0; }
+
+            Pica::Shader::OutputVertex vertex;
+            std::atomic<u32> batch_id;
+        };
+        static std::vector<VsOutput> vs_output;
+        while (vs_output.size() < regs.pipeline.num_vertices) {
+            vs_output.emplace_back();
+        }
+
         PrimitiveAssembler& primitive_assembler = g_state.primitive_assembler;
 
         if (g_debug_context && g_debug_context->recorder) {
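Reviewer note: cache_batch_id above is a generation counter. Bumping it once per
draw invalidates all 0x10000 entries in O(1), because a hit requires an entry's
stored id to equal the current batch id; only on wrap-around is a real clear
needed. A reduced, self-contained sketch of the same pattern (Entry, BeginBatch
and Lookup are invented names, not part of this diff):

    #include <array>
    #include <atomic>
    #include <cstdint>

    struct Entry {
        int value;                   // cached payload
        std::atomic<uint32_t> id{0}; // generation that last wrote 'value'
    };
    static std::array<Entry, 0x10000> entries;
    static uint32_t batch_id = 0;

    void BeginBatch() {
        if (++batch_id == 0) { // on wrap-around, actually clear the generations
            ++batch_id;
            for (auto& e : entries)
                e.id = 0;
        }
    }

    bool Lookup(uint16_t key, int& out) {
        if (entries[key].id.load(std::memory_order_acquire) != batch_id)
            return false; // written during an older batch: treat as a miss
        out = entries[key].value;
        return true;
    }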
@@ -314,20 +345,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
             }
         }
 
-        DebugUtils::MemoryAccessTracker memory_accesses;
-
-        // Simple circular-replacement vertex cache
-        // The size has been tuned for optimal balance between hit-rate and the cost of lookup
-        const size_t VERTEX_CACHE_SIZE = 32;
-        std::array<u16, VERTEX_CACHE_SIZE> vertex_cache_ids;
-        std::array<Shader::AttributeBuffer, VERTEX_CACHE_SIZE> vertex_cache;
-        Shader::AttributeBuffer vs_output;
-
-        unsigned int vertex_cache_pos = 0;
-        vertex_cache_ids.fill(-1);
-
         auto* shader_engine = Shader::GetEngine();
-        Shader::UnitState shader_unit;
 
         shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
@@ -336,66 +354,134 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
         if (g_state.geometry_pipeline.NeedIndexInput())
             ASSERT(is_indexed);
 
-        for (unsigned int index = 0; index < regs.pipeline.num_vertices; ++index) {
-            // Indexed rendering doesn't use the start offset
-            unsigned int vertex =
-                is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index])
-                           : (index + regs.pipeline.vertex_offset);
+        auto UnitLoop = [&](bool single_thread, u32 index_start, u32 index_end) {
+            DebugUtils::MemoryAccessTracker memory_accesses;
+            Shader::UnitState shader_unit;
 
-            // -1 is a common special value used for primitive restart. Since it's unknown if
-            // the PICA supports it, and it would mess up the caching, guard against it here.
-            ASSERT(vertex != -1);
+            for (unsigned int index = index_start; index < index_end; ++index) {
+                // Indexed rendering doesn't use the start offset
+                unsigned int vertex =
+                    is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index])
+                               : (index + regs.pipeline.vertex_offset);
 
-            bool vertex_cache_hit = false;
+                // -1 is a common special value used for primitive restart. Since it's unknown if
+                // the PICA supports it, and it would mess up the caching, guard against it here.
+                ASSERT(vertex != -1);
 
-            if (is_indexed) {
-                if (g_state.geometry_pipeline.NeedIndexInput()) {
-                    g_state.geometry_pipeline.SubmitIndex(vertex);
-                    continue;
-                }
+                bool vertex_cache_hit = false;
 
-                if (g_debug_context && Pica::g_debug_context->recorder) {
-                    int size = index_u16 ? 2 : 1;
-                    memory_accesses.AddAccess(base_address + index_info.offset + size * index,
-                                              size);
-                }
+                Shader::AttributeBuffer output_attr_tmp;
+                Shader::AttributeBuffer& output_attr =
+                    is_indexed ? cache[vertex].output_attr : output_attr_tmp;
 
-                for (unsigned int i = 0; i < VERTEX_CACHE_SIZE; ++i) {
-                    if (vertex == vertex_cache_ids[i]) {
-                        vs_output = vertex_cache[i];
-                        vertex_cache_hit = true;
-                        break;
-                    }
-                }
-            }
-
-            if (!vertex_cache_hit) {
-                // Initialize data for the current vertex
-                Shader::AttributeBuffer input;
-                loader.LoadVertex(base_address, index, vertex, input, memory_accesses);
-
-                // Send to vertex shader
-                if (g_debug_context)
-                    g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation,
-                                             (void*)&input);
-                shader_unit.LoadInput(regs.vs, input);
-                shader_engine->Run(g_state.vs, shader_unit);
-                shader_unit.WriteOutput(regs.vs, vs_output);
+                Pica::Shader::OutputVertex output_vertex_tmp;
+                Pica::Shader::OutputVertex& output_vertex =
+                    is_indexed ? cache[vertex].output_vertex : output_vertex_tmp;
 
                 if (is_indexed) {
-                    vertex_cache[vertex_cache_pos] = vs_output;
-                    vertex_cache_ids[vertex_cache_pos] = vertex;
-                    vertex_cache_pos = (vertex_cache_pos + 1) % VERTEX_CACHE_SIZE;
+                    if (single_thread && g_state.geometry_pipeline.NeedIndexInput()) {
+                        g_state.geometry_pipeline.SubmitIndex(vertex);
+                        continue;
+                    }
+
+                    if (g_debug_context && Pica::g_debug_context->recorder) {
+                        int size = index_u16 ? 2 : 1;
+                        memory_accesses.AddAccess(base_address + index_info.offset + size * index,
+                                                  size);
+                    }
+
+                    if (single_thread) {
+                        // The single-threaded loop is the only writer, a relaxed load suffices
+                        if (cache[vertex].id.load(std::memory_order_relaxed) == cache_batch_id) {
+                            vertex_cache_hit = true;
+                        }
+                    } else if (cache[vertex].id.load(std::memory_order_acquire) == cache_batch_id) {
+                        vertex_cache_hit = true;
+                    }
+                    // Set the "writing" flag and check its previous status
+                    else if (cache[vertex].writing.test_and_set(std::memory_order_acquire)) {
+                        // Another thread is writing into the cache, spin until it's done
+                        while (cache[vertex].writing.test_and_set(std::memory_order_acquire))
+                            ;
+                        cache[vertex].writing.clear(std::memory_order_release);
+                        vertex_cache_hit = true;
+                    }
+                }
+
+                if (!vertex_cache_hit) {
+                    // Initialize data for the current vertex
+                    Shader::AttributeBuffer input;
+                    loader.LoadVertex(base_address, index, vertex, input, memory_accesses);
+
+                    // Send to vertex shader
+                    if (g_debug_context)
+                        g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, &input);
+                    shader_unit.LoadInput(regs.vs, input);
+                    shader_engine->Run(g_state.vs, shader_unit);
+
+                    shader_unit.WriteOutput(regs.vs, output_attr);
+                    if (!single_thread)
+                        output_vertex =
+                            Shader::OutputVertex::FromAttributeBuffer(regs.rasterizer, output_attr);
+
+                    if (is_indexed) {
+                        if (single_thread) {
+                            cache[vertex].id.store(cache_batch_id, std::memory_order_relaxed);
+                        } else {
+                            // Publish the entry, then release the "writing" flag
+                            cache[vertex].id.store(cache_batch_id, std::memory_order_release);
+                            cache[vertex].writing.clear(std::memory_order_release);
+                        }
+                    }
+                }
+
+                if (single_thread) {
+                    // Send to geometry pipeline
+                    g_state.geometry_pipeline.SubmitVertex(output_attr);
+                } else {
+                    vs_output[index].vertex = output_vertex;
+                    vs_output[index].batch_id.store(cache_batch_id, std::memory_order_release);
+                }
+            }
 
-            // Send to geometry pipeline
-            g_state.geometry_pipeline.SubmitVertex(vs_output);
-        }
+            // Flush recorded memory accesses under a lock; several workers may reach this at once
+            static std::mutex dbg_mtx;
+            if (!memory_accesses.ranges.empty()) {
+                std::lock_guard<std::mutex> lock(dbg_mtx);
+                for (auto& range : memory_accesses.ranges) {
+                    g_debug_context->recorder->MemoryAccessed(
+                        Memory::GetPhysicalPointer(range.first), range.second, range.first);
+                }
+            }
+        };
 
-        for (auto& range : memory_accesses.ranges) {
-            g_debug_context->recorder->MemoryAccessed(Memory::GetPhysicalPointer(range.first),
-                                                      range.second, range.first);
+        constexpr unsigned int VS_UNITS = 3;
+        const bool use_gs = regs.pipeline.use_gs == PipelineRegs::UseGS::Yes;
+
+        auto& thread_pool = Common::ThreadPool::GetPool();
+        unsigned int num_threads = use_gs ? 1 : thread_pool.total_threads(); // VS_UNITS
+
+        if (num_threads == 1) {
+            UnitLoop(true, 0, regs.pipeline.num_vertices);
+        } else {
+            const u32 range = std::max(regs.pipeline.num_vertices / num_threads + 1, 50u);
+            for (unsigned int thread_id = 0; thread_id < num_threads; ++thread_id) {
+                const u32 loop_start = range * thread_id;
+                const u32 loop_end = loop_start + range;
+                if (loop_end >= regs.pipeline.num_vertices) {
+                    thread_pool.push(UnitLoop, false, loop_start, regs.pipeline.num_vertices);
+                    break;
+                }
+                thread_pool.push(UnitLoop, false, loop_start, loop_end);
+            }
+            for (unsigned int index = 0; index < regs.pipeline.num_vertices; ++index) {
+                // Spin until the worker that owns this vertex has published it
+                while (vs_output[index].batch_id.load(std::memory_order_acquire) != cache_batch_id)
+                    ;
+                using Pica::Shader::OutputVertex;
+                primitive_assembler.SubmitVertex(
+                    vs_output[index].vertex,
+                    [](const OutputVertex& v0, const OutputVertex& v1, const OutputVertex& v2) {
+                        VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2);
+                    });
+            }
         }
 
         VideoCore::g_renderer->Rasterizer()->DrawTriangles();
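Reviewer note on the dispatch above: vertices are split into contiguous chunks of
at least 50 (so tiny draws stay on a single worker and skip dispatch overhead),
while the primitive assembler still consumes results strictly in index order,
spinning on each entry's batch_id until its worker publishes it. A standalone
sketch of the same chunking arithmetic (PrintChunks is an invented name; it
assumes the same rounding-up division and end clamping as the loop above):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Each worker gets a contiguous [start, end) slice; the last slice is
    // clamped to num_vertices and slices past the end are never issued.
    void PrintChunks(uint32_t num_vertices, uint32_t num_threads) {
        const uint32_t range = std::max(num_vertices / num_threads + 1, 50u);
        for (uint32_t thread_id = 0; thread_id < num_threads; ++thread_id) {
            const uint32_t start = range * thread_id;
            const uint32_t end = std::min(start + range, num_vertices);
            std::printf("worker %u: [%u, %u)\n", thread_id, start, end);
            if (end == num_vertices)
                break;
        }
    }

    // PrintChunks(1000, 4) -> [0, 251) [251, 502) [502, 753) [753, 1000)
    // PrintChunks(30, 4)   -> [0, 30)   (one chunk: the 50-vertex floor applies)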