Merge pull request #1264 from degasus/optimizations
video_core: Optimize the command processor.
This commit is contained in:
		| @@ -8,6 +8,7 @@ | ||||
| #include "core/core.h" | ||||
| #include "core/hle/service/nvdrv/devices/nvhost_gpu.h" | ||||
| #include "core/memory.h" | ||||
| #include "video_core/command_processor.h" | ||||
| #include "video_core/gpu.h" | ||||
| #include "video_core/memory_manager.h" | ||||
|  | ||||
| @@ -134,17 +135,16 @@ u32 nvhost_gpu::SubmitGPFIFO(const std::vector<u8>& input, std::vector<u8>& outp | ||||
|     LOG_WARNING(Service_NVDRV, "(STUBBED) called, gpfifo={:X}, num_entries={:X}, flags={:X}", | ||||
|                 params.address, params.num_entries, params.flags); | ||||
|  | ||||
|     ASSERT_MSG(input.size() == | ||||
|                    sizeof(IoctlSubmitGpfifo) + params.num_entries * sizeof(IoctlGpfifoEntry), | ||||
|     ASSERT_MSG(input.size() == sizeof(IoctlSubmitGpfifo) + | ||||
|                                    params.num_entries * sizeof(Tegra::CommandListHeader), | ||||
|                "Incorrect input size"); | ||||
|  | ||||
|     std::vector<IoctlGpfifoEntry> entries(params.num_entries); | ||||
|     std::vector<Tegra::CommandListHeader> entries(params.num_entries); | ||||
|     std::memcpy(entries.data(), &input[sizeof(IoctlSubmitGpfifo)], | ||||
|                 params.num_entries * sizeof(IoctlGpfifoEntry)); | ||||
|     for (auto entry : entries) { | ||||
|         Tegra::GPUVAddr va_addr = entry.Address(); | ||||
|         Core::System::GetInstance().GPU().ProcessCommandList(va_addr, entry.sz); | ||||
|     } | ||||
|                 params.num_entries * sizeof(Tegra::CommandListHeader)); | ||||
|  | ||||
|     Core::System::GetInstance().GPU().ProcessCommandLists(entries); | ||||
|  | ||||
|     params.fence_out.id = 0; | ||||
|     params.fence_out.value = 0; | ||||
|     std::memcpy(output.data(), &params, sizeof(IoctlSubmitGpfifo)); | ||||
| @@ -160,14 +160,12 @@ u32 nvhost_gpu::KickoffPB(const std::vector<u8>& input, std::vector<u8>& output) | ||||
|     LOG_WARNING(Service_NVDRV, "(STUBBED) called, gpfifo={:X}, num_entries={:X}, flags={:X}", | ||||
|                 params.address, params.num_entries, params.flags); | ||||
|  | ||||
|     std::vector<IoctlGpfifoEntry> entries(params.num_entries); | ||||
|     std::vector<Tegra::CommandListHeader> entries(params.num_entries); | ||||
|     Memory::ReadBlock(params.address, entries.data(), | ||||
|                       params.num_entries * sizeof(IoctlGpfifoEntry)); | ||||
|                       params.num_entries * sizeof(Tegra::CommandListHeader)); | ||||
|  | ||||
|     Core::System::GetInstance().GPU().ProcessCommandLists(entries); | ||||
|  | ||||
|     for (auto entry : entries) { | ||||
|         Tegra::GPUVAddr va_addr = entry.Address(); | ||||
|         Core::System::GetInstance().GPU().ProcessCommandList(va_addr, entry.sz); | ||||
|     } | ||||
|     params.fence_out.id = 0; | ||||
|     params.fence_out.value = 0; | ||||
|     std::memcpy(output.data(), &params, output.size()); | ||||
|   | ||||
| @@ -10,7 +10,6 @@ | ||||
| #include "common/common_types.h" | ||||
| #include "common/swap.h" | ||||
| #include "core/hle/service/nvdrv/devices/nvdevice.h" | ||||
| #include "video_core/memory_manager.h" | ||||
|  | ||||
| namespace Service::Nvidia::Devices { | ||||
|  | ||||
| @@ -151,22 +150,6 @@ private: | ||||
|     }; | ||||
|     static_assert(sizeof(IoctlAllocObjCtx) == 16, "IoctlAllocObjCtx is incorrect size"); | ||||
|  | ||||
|     struct IoctlGpfifoEntry { | ||||
|         u32_le entry0; // gpu_va_lo | ||||
|         union { | ||||
|             u32_le entry1; // gpu_va_hi | (unk_0x02 << 0x08) | (size << 0x0A) | (unk_0x01 << 0x1F) | ||||
|             BitField<0, 8, u32_le> gpu_va_hi; | ||||
|             BitField<8, 2, u32_le> unk1; | ||||
|             BitField<10, 21, u32_le> sz; | ||||
|             BitField<31, 1, u32_le> unk2; | ||||
|         }; | ||||
|  | ||||
|         Tegra::GPUVAddr Address() const { | ||||
|             return (static_cast<Tegra::GPUVAddr>(gpu_va_hi) << 32) | entry0; | ||||
|         } | ||||
|     }; | ||||
|     static_assert(sizeof(IoctlGpfifoEntry) == 8, "IoctlGpfifoEntry is incorrect size"); | ||||
|  | ||||
|     struct IoctlSubmitGpfifo { | ||||
|         u64_le address;     // pointer to gpfifo entry structs | ||||
|         u32_le num_entries; // number of fence objects being submitted | ||||
|   | ||||
| @@ -28,98 +28,106 @@ enum class BufferMethods { | ||||
|     CountBufferMethods = 0x40, | ||||
| }; | ||||
|  | ||||
| void GPU::WriteReg(u32 method, u32 subchannel, u32 value, u32 remaining_params) { | ||||
|     LOG_TRACE(HW_GPU, | ||||
|               "Processing method {:08X} on subchannel {} value " | ||||
|               "{:08X} remaining params {}", | ||||
|               method, subchannel, value, remaining_params); | ||||
| MICROPROFILE_DEFINE(ProcessCommandLists, "GPU", "Execute command buffer", MP_RGB(128, 128, 192)); | ||||
|  | ||||
|     ASSERT(subchannel < bound_engines.size()); | ||||
| void GPU::ProcessCommandLists(const std::vector<CommandListHeader>& commands) { | ||||
|     MICROPROFILE_SCOPE(ProcessCommandLists); | ||||
|  | ||||
|     if (method == static_cast<u32>(BufferMethods::BindObject)) { | ||||
|         // Bind the current subchannel to the desired engine id. | ||||
|         LOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", subchannel, value); | ||||
|         bound_engines[subchannel] = static_cast<EngineID>(value); | ||||
|         return; | ||||
|     } | ||||
|     auto WriteReg = [this](u32 method, u32 subchannel, u32 value, u32 remaining_params) { | ||||
|         LOG_TRACE(HW_GPU, | ||||
|                   "Processing method {:08X} on subchannel {} value " | ||||
|                   "{:08X} remaining params {}", | ||||
|                   method, subchannel, value, remaining_params); | ||||
|  | ||||
|     if (method < static_cast<u32>(BufferMethods::CountBufferMethods)) { | ||||
|         // TODO(Subv): Research and implement these methods. | ||||
|         LOG_ERROR(HW_GPU, "Special buffer methods other than Bind are not implemented"); | ||||
|         return; | ||||
|     } | ||||
|         ASSERT(subchannel < bound_engines.size()); | ||||
|  | ||||
|     const EngineID engine = bound_engines[subchannel]; | ||||
|  | ||||
|     switch (engine) { | ||||
|     case EngineID::FERMI_TWOD_A: | ||||
|         fermi_2d->WriteReg(method, value); | ||||
|         break; | ||||
|     case EngineID::MAXWELL_B: | ||||
|         maxwell_3d->WriteReg(method, value, remaining_params); | ||||
|         break; | ||||
|     case EngineID::MAXWELL_COMPUTE_B: | ||||
|         maxwell_compute->WriteReg(method, value); | ||||
|         break; | ||||
|     case EngineID::MAXWELL_DMA_COPY_A: | ||||
|         maxwell_dma->WriteReg(method, value); | ||||
|         break; | ||||
|     default: | ||||
|         UNIMPLEMENTED_MSG("Unimplemented engine"); | ||||
|     } | ||||
| } | ||||
|  | ||||
| void GPU::ProcessCommandList(GPUVAddr address, u32 size) { | ||||
|     const boost::optional<VAddr> head_address = memory_manager->GpuToCpuAddress(address); | ||||
|     VAddr current_addr = *head_address; | ||||
|     while (current_addr < *head_address + size * sizeof(CommandHeader)) { | ||||
|         const CommandHeader header = {Memory::Read32(current_addr)}; | ||||
|         current_addr += sizeof(u32); | ||||
|  | ||||
|         switch (header.mode.Value()) { | ||||
|         case SubmissionMode::IncreasingOld: | ||||
|         case SubmissionMode::Increasing: { | ||||
|             // Increase the method value with each argument. | ||||
|             for (unsigned i = 0; i < header.arg_count; ++i) { | ||||
|                 WriteReg(header.method + i, header.subchannel, Memory::Read32(current_addr), | ||||
|                          header.arg_count - i - 1); | ||||
|                 current_addr += sizeof(u32); | ||||
|             } | ||||
|             break; | ||||
|         if (method == static_cast<u32>(BufferMethods::BindObject)) { | ||||
|             // Bind the current subchannel to the desired engine id. | ||||
|             LOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", subchannel, value); | ||||
|             bound_engines[subchannel] = static_cast<EngineID>(value); | ||||
|             return; | ||||
|         } | ||||
|         case SubmissionMode::NonIncreasingOld: | ||||
|         case SubmissionMode::NonIncreasing: { | ||||
|             // Use the same method value for all arguments. | ||||
|             for (unsigned i = 0; i < header.arg_count; ++i) { | ||||
|                 WriteReg(header.method, header.subchannel, Memory::Read32(current_addr), | ||||
|                          header.arg_count - i - 1); | ||||
|                 current_addr += sizeof(u32); | ||||
|             } | ||||
|             break; | ||||
|         } | ||||
|         case SubmissionMode::IncreaseOnce: { | ||||
|             ASSERT(header.arg_count.Value() >= 1); | ||||
|  | ||||
|             // Use the original method for the first argument and then the next method for all other | ||||
|             // arguments. | ||||
|             WriteReg(header.method, header.subchannel, Memory::Read32(current_addr), | ||||
|                      header.arg_count - 1); | ||||
|         if (method < static_cast<u32>(BufferMethods::CountBufferMethods)) { | ||||
|             // TODO(Subv): Research and implement these methods. | ||||
|             LOG_ERROR(HW_GPU, "Special buffer methods other than Bind are not implemented"); | ||||
|             return; | ||||
|         } | ||||
|  | ||||
|         const EngineID engine = bound_engines[subchannel]; | ||||
|  | ||||
|         switch (engine) { | ||||
|         case EngineID::FERMI_TWOD_A: | ||||
|             fermi_2d->WriteReg(method, value); | ||||
|             break; | ||||
|         case EngineID::MAXWELL_B: | ||||
|             maxwell_3d->WriteReg(method, value, remaining_params); | ||||
|             break; | ||||
|         case EngineID::MAXWELL_COMPUTE_B: | ||||
|             maxwell_compute->WriteReg(method, value); | ||||
|             break; | ||||
|         case EngineID::MAXWELL_DMA_COPY_A: | ||||
|             maxwell_dma->WriteReg(method, value); | ||||
|             break; | ||||
|         default: | ||||
|             UNIMPLEMENTED_MSG("Unimplemented engine"); | ||||
|         } | ||||
|     }; | ||||
|  | ||||
|     for (auto entry : commands) { | ||||
|         Tegra::GPUVAddr address = entry.Address(); | ||||
|         u32 size = entry.sz; | ||||
|         const boost::optional<VAddr> head_address = memory_manager->GpuToCpuAddress(address); | ||||
|         VAddr current_addr = *head_address; | ||||
|         while (current_addr < *head_address + size * sizeof(CommandHeader)) { | ||||
|             const CommandHeader header = {Memory::Read32(current_addr)}; | ||||
|             current_addr += sizeof(u32); | ||||
|  | ||||
|             for (unsigned i = 1; i < header.arg_count; ++i) { | ||||
|                 WriteReg(header.method + 1, header.subchannel, Memory::Read32(current_addr), | ||||
|                          header.arg_count - i - 1); | ||||
|                 current_addr += sizeof(u32); | ||||
|             switch (header.mode.Value()) { | ||||
|             case SubmissionMode::IncreasingOld: | ||||
|             case SubmissionMode::Increasing: { | ||||
|                 // Increase the method value with each argument. | ||||
|                 for (unsigned i = 0; i < header.arg_count; ++i) { | ||||
|                     WriteReg(header.method + i, header.subchannel, Memory::Read32(current_addr), | ||||
|                              header.arg_count - i - 1); | ||||
|                     current_addr += sizeof(u32); | ||||
|                 } | ||||
|                 break; | ||||
|             } | ||||
|             case SubmissionMode::NonIncreasingOld: | ||||
|             case SubmissionMode::NonIncreasing: { | ||||
|                 // Use the same method value for all arguments. | ||||
|                 for (unsigned i = 0; i < header.arg_count; ++i) { | ||||
|                     WriteReg(header.method, header.subchannel, Memory::Read32(current_addr), | ||||
|                              header.arg_count - i - 1); | ||||
|                     current_addr += sizeof(u32); | ||||
|                 } | ||||
|                 break; | ||||
|             } | ||||
|             case SubmissionMode::IncreaseOnce: { | ||||
|                 ASSERT(header.arg_count.Value() >= 1); | ||||
|  | ||||
|                 // Use the original method for the first argument and then the next method for all | ||||
|                 // other arguments. | ||||
|                 WriteReg(header.method, header.subchannel, Memory::Read32(current_addr), | ||||
|                          header.arg_count - 1); | ||||
|                 current_addr += sizeof(u32); | ||||
|  | ||||
|                 for (unsigned i = 1; i < header.arg_count; ++i) { | ||||
|                     WriteReg(header.method + 1, header.subchannel, Memory::Read32(current_addr), | ||||
|                              header.arg_count - i - 1); | ||||
|                     current_addr += sizeof(u32); | ||||
|                 } | ||||
|                 break; | ||||
|             } | ||||
|             case SubmissionMode::Inline: { | ||||
|                 // The register value is stored in the bits 16-28 as an immediate | ||||
|                 WriteReg(header.method, header.subchannel, header.inline_data, 0); | ||||
|                 break; | ||||
|             } | ||||
|             default: | ||||
|                 UNIMPLEMENTED(); | ||||
|             } | ||||
|             break; | ||||
|         } | ||||
|         case SubmissionMode::Inline: { | ||||
|             // The register value is stored in the bits 16-28 as an immediate | ||||
|             WriteReg(header.method, header.subchannel, header.inline_data, 0); | ||||
|             break; | ||||
|         } | ||||
|         default: | ||||
|             UNIMPLEMENTED(); | ||||
|         } | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -7,6 +7,7 @@ | ||||
| #include <type_traits> | ||||
| #include "common/bit_field.h" | ||||
| #include "common/common_types.h" | ||||
| #include "video_core/memory_manager.h" | ||||
|  | ||||
| namespace Tegra { | ||||
|  | ||||
| @@ -19,6 +20,22 @@ enum class SubmissionMode : u32 { | ||||
|     IncreaseOnce = 5 | ||||
| }; | ||||
|  | ||||
| struct CommandListHeader { | ||||
|     u32 entry0; // gpu_va_lo | ||||
|     union { | ||||
|         u32 entry1; // gpu_va_hi | (unk_0x02 << 0x08) | (size << 0x0A) | (unk_0x01 << 0x1F) | ||||
|         BitField<0, 8, u32> gpu_va_hi; | ||||
|         BitField<8, 2, u32> unk1; | ||||
|         BitField<10, 21, u32> sz; | ||||
|         BitField<31, 1, u32> unk2; | ||||
|     }; | ||||
|  | ||||
|     GPUVAddr Address() const { | ||||
|         return (static_cast<GPUVAddr>(gpu_va_hi) << 32) | entry0; | ||||
|     } | ||||
| }; | ||||
| static_assert(sizeof(CommandListHeader) == 8, "CommandListHeader is incorrect size"); | ||||
|  | ||||
| union CommandHeader { | ||||
|     u32 hex; | ||||
|  | ||||
|   | ||||
| @@ -135,8 +135,6 @@ void Maxwell3D::WriteReg(u32 method, u32 value, u32 remaining_params) { | ||||
|         break; | ||||
|     } | ||||
|  | ||||
|     rasterizer.NotifyMaxwellRegisterChanged(method); | ||||
|  | ||||
|     if (debug_context) { | ||||
|         debug_context->OnEvent(Tegra::DebugContext::Event::MaxwellCommandProcessed, nullptr); | ||||
|     } | ||||
|   | ||||
| @@ -6,6 +6,7 @@ | ||||
|  | ||||
| #include <array> | ||||
| #include <memory> | ||||
| #include <vector> | ||||
| #include "common/common_types.h" | ||||
| #include "core/hle/service/nvflinger/buffer_queue.h" | ||||
| #include "video_core/memory_manager.h" | ||||
| @@ -67,6 +68,7 @@ u32 RenderTargetBytesPerPixel(RenderTargetFormat format); | ||||
| /// Returns the number of bytes per pixel of each depth format. | ||||
| u32 DepthFormatBytesPerPixel(DepthFormat format); | ||||
|  | ||||
| struct CommandListHeader; | ||||
| class DebugContext; | ||||
|  | ||||
| /** | ||||
| @@ -115,7 +117,7 @@ public: | ||||
|     ~GPU(); | ||||
|  | ||||
|     /// Processes a command list stored at the specified address in GPU memory. | ||||
|     void ProcessCommandList(GPUVAddr address, u32 size); | ||||
|     void ProcessCommandLists(const std::vector<CommandListHeader>& commands); | ||||
|  | ||||
|     /// Returns a reference to the Maxwell3D GPU engine. | ||||
|     Engines::Maxwell3D& Maxwell3D(); | ||||
| @@ -130,9 +132,6 @@ public: | ||||
|     const Tegra::MemoryManager& MemoryManager() const; | ||||
|  | ||||
| private: | ||||
|     /// Writes a single register in the engine bound to the specified subchannel | ||||
|     void WriteReg(u32 method, u32 subchannel, u32 value, u32 remaining_params); | ||||
|  | ||||
|     std::unique_ptr<Tegra::MemoryManager> memory_manager; | ||||
|  | ||||
|     /// Mapping of command subchannels to their bound engine ids. | ||||
|   | ||||
| @@ -20,9 +20,6 @@ public: | ||||
|     /// Clear the current framebuffer | ||||
|     virtual void Clear() = 0; | ||||
|  | ||||
|     /// Notify rasterizer that the specified Maxwell register has been changed | ||||
|     virtual void NotifyMaxwellRegisterChanged(u32 method) = 0; | ||||
|  | ||||
|     /// Notify rasterizer that all caches should be flushed to Switch memory | ||||
|     virtual void FlushAll() = 0; | ||||
|  | ||||
|   | ||||
| @@ -527,8 +527,6 @@ void RasterizerOpenGL::DrawArrays() { | ||||
|     state.Apply(); | ||||
| } | ||||
|  | ||||
| void RasterizerOpenGL::NotifyMaxwellRegisterChanged(u32 method) {} | ||||
|  | ||||
| void RasterizerOpenGL::FlushAll() {} | ||||
|  | ||||
| void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) {} | ||||
|   | ||||
| @@ -45,7 +45,6 @@ public: | ||||
|  | ||||
|     void DrawArrays() override; | ||||
|     void Clear() override; | ||||
|     void NotifyMaxwellRegisterChanged(u32 method) override; | ||||
|     void FlushAll() override; | ||||
|     void FlushRegion(VAddr addr, u64 size) override; | ||||
|     void InvalidateRegion(VAddr addr, u64 size) override; | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 bunnei
					bunnei