Merge pull request #3395 from FernandoS27/queries
GPU: Refactor queries implementation and correct GPU Clock.
This commit is contained in:
		| @@ -9,6 +9,7 @@ | |||||||
| #include "core/core_timing.h" | #include "core/core_timing.h" | ||||||
| #include "video_core/engines/maxwell_3d.h" | #include "video_core/engines/maxwell_3d.h" | ||||||
| #include "video_core/engines/shader_type.h" | #include "video_core/engines/shader_type.h" | ||||||
|  | #include "video_core/gpu.h" | ||||||
| #include "video_core/memory_manager.h" | #include "video_core/memory_manager.h" | ||||||
| #include "video_core/rasterizer_interface.h" | #include "video_core/rasterizer_interface.h" | ||||||
| #include "video_core/textures/texture.h" | #include "video_core/textures/texture.h" | ||||||
| @@ -519,61 +520,63 @@ void Maxwell3D::ProcessFirmwareCall4() { | |||||||
|     regs.reg_array[0xd00] = 1; |     regs.reg_array[0xd00] = 1; | ||||||
| } | } | ||||||
|  |  | ||||||
| void Maxwell3D::ProcessQueryGet() { | void Maxwell3D::StampQueryResult(u64 payload, bool long_query) { | ||||||
|  |     struct LongQueryResult { | ||||||
|  |         u64_le value; | ||||||
|  |         u64_le timestamp; | ||||||
|  |     }; | ||||||
|  |     static_assert(sizeof(LongQueryResult) == 16, "LongQueryResult has wrong size"); | ||||||
|     const GPUVAddr sequence_address{regs.query.QueryAddress()}; |     const GPUVAddr sequence_address{regs.query.QueryAddress()}; | ||||||
|     // Since the sequence address is given as a GPU VAddr, we have to convert it to an application |     if (long_query) { | ||||||
|     // VAddr before writing. |         // Write the 128-bit result structure in long mode. Note: We emulate an infinitely fast | ||||||
|  |         // GPU, this command may actually take a while to complete in real hardware due to GPU | ||||||
|  |         // wait queues. | ||||||
|  |         LongQueryResult query_result{payload, system.GPU().GetTicks()}; | ||||||
|  |         memory_manager.WriteBlock(sequence_address, &query_result, sizeof(query_result)); | ||||||
|  |     } else { | ||||||
|  |         memory_manager.Write<u32>(sequence_address, static_cast<u32>(payload)); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void Maxwell3D::ProcessQueryGet() { | ||||||
|     // TODO(Subv): Support the other query units. |     // TODO(Subv): Support the other query units. | ||||||
|     ASSERT_MSG(regs.query.query_get.unit == Regs::QueryUnit::Crop, |     ASSERT_MSG(regs.query.query_get.unit == Regs::QueryUnit::Crop, | ||||||
|                "Units other than CROP are unimplemented"); |                "Units other than CROP are unimplemented"); | ||||||
|  |  | ||||||
|     u64 result = 0; |     switch (regs.query.query_get.operation) { | ||||||
|  |     case Regs::QueryOperation::Release: { | ||||||
|     // TODO(Subv): Support the other query variables |         const u64 result = regs.query.query_sequence; | ||||||
|  |         StampQueryResult(result, regs.query.query_get.short_query == 0); | ||||||
|  |         break; | ||||||
|  |     } | ||||||
|  |     case Regs::QueryOperation::Acquire: { | ||||||
|  |         // TODO(Blinkhawk): Under this operation, the GPU waits for the CPU | ||||||
|  |         // to write a value that matches the current payload. | ||||||
|  |         UNIMPLEMENTED_MSG("Unimplemented query operation ACQUIRE"); | ||||||
|  |         break; | ||||||
|  |     } | ||||||
|  |     case Regs::QueryOperation::Counter: { | ||||||
|  |         u64 result{}; | ||||||
|         switch (regs.query.query_get.select) { |         switch (regs.query.query_get.select) { | ||||||
|         case Regs::QuerySelect::Zero: |         case Regs::QuerySelect::Zero: | ||||||
|         // This seems to actually write the query sequence to the query address. |             result = 0; | ||||||
|         result = regs.query.query_sequence; |  | ||||||
|             break; |             break; | ||||||
|         default: |         default: | ||||||
|             result = 1; |             result = 1; | ||||||
|             UNIMPLEMENTED_MSG("Unimplemented query select type {}", |             UNIMPLEMENTED_MSG("Unimplemented query select type {}", | ||||||
|                               static_cast<u32>(regs.query.query_get.select.Value())); |                               static_cast<u32>(regs.query.query_get.select.Value())); | ||||||
|         } |         } | ||||||
|  |         StampQueryResult(result, regs.query.query_get.short_query == 0); | ||||||
|     // TODO(Subv): Research and implement how query sync conditions work. |         break; | ||||||
|  |     } | ||||||
|     struct LongQueryResult { |     case Regs::QueryOperation::Trap: { | ||||||
|         u64_le value; |         UNIMPLEMENTED_MSG("Unimplemented query operation TRAP"); | ||||||
|         u64_le timestamp; |         break; | ||||||
|     }; |     } | ||||||
|     static_assert(sizeof(LongQueryResult) == 16, "LongQueryResult has wrong size"); |     default: { | ||||||
|  |         UNIMPLEMENTED_MSG("Unknown query operation"); | ||||||
|     switch (regs.query.query_get.mode) { |  | ||||||
|     case Regs::QueryMode::Write: |  | ||||||
|     case Regs::QueryMode::Write2: { |  | ||||||
|         u32 sequence = regs.query.query_sequence; |  | ||||||
|         if (regs.query.query_get.short_query) { |  | ||||||
|             // Write the current query sequence to the sequence address. |  | ||||||
|             // TODO(Subv): Find out what happens if you use a long query type but mark it as a short |  | ||||||
|             // query. |  | ||||||
|             memory_manager.Write<u32>(sequence_address, sequence); |  | ||||||
|         } else { |  | ||||||
|             // Write the 128-bit result structure in long mode. Note: We emulate an infinitely fast |  | ||||||
|             // GPU, this command may actually take a while to complete in real hardware due to GPU |  | ||||||
|             // wait queues. |  | ||||||
|             LongQueryResult query_result{}; |  | ||||||
|             query_result.value = result; |  | ||||||
|             // TODO(Subv): Generate a real GPU timestamp and write it here instead of CoreTiming |  | ||||||
|             query_result.timestamp = system.CoreTiming().GetTicks(); |  | ||||||
|             memory_manager.WriteBlock(sequence_address, &query_result, sizeof(query_result)); |  | ||||||
|         } |  | ||||||
|         break; |         break; | ||||||
|     } |     } | ||||||
|     default: |  | ||||||
|         UNIMPLEMENTED_MSG("Query mode {} not implemented", |  | ||||||
|                           static_cast<u32>(regs.query.query_get.mode.Value())); |  | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -71,12 +71,11 @@ public: | |||||||
|         static constexpr std::size_t MaxConstBuffers = 18; |         static constexpr std::size_t MaxConstBuffers = 18; | ||||||
|         static constexpr std::size_t MaxConstBufferSize = 0x10000; |         static constexpr std::size_t MaxConstBufferSize = 0x10000; | ||||||
|  |  | ||||||
|         enum class QueryMode : u32 { |         enum class QueryOperation : u32 { | ||||||
|             Write = 0, |             Release = 0, | ||||||
|             Sync = 1, |             Acquire = 1, | ||||||
|             // TODO(Subv): It is currently unknown what the difference between method 2 and method 0 |             Counter = 2, | ||||||
|             // is. |             Trap = 3, | ||||||
|             Write2 = 2, |  | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         enum class QueryUnit : u32 { |         enum class QueryUnit : u32 { | ||||||
| @@ -1081,7 +1080,7 @@ public: | |||||||
|                     u32 query_sequence; |                     u32 query_sequence; | ||||||
|                     union { |                     union { | ||||||
|                         u32 raw; |                         u32 raw; | ||||||
|                         BitField<0, 2, QueryMode> mode; |                         BitField<0, 2, QueryOperation> operation; | ||||||
|                         BitField<4, 1, u32> fence; |                         BitField<4, 1, u32> fence; | ||||||
|                         BitField<12, 4, QueryUnit> unit; |                         BitField<12, 4, QueryUnit> unit; | ||||||
|                         BitField<16, 1, QuerySyncCondition> sync_cond; |                         BitField<16, 1, QuerySyncCondition> sync_cond; | ||||||
| @@ -1413,6 +1412,9 @@ private: | |||||||
|     /// Handles a write to the QUERY_GET register. |     /// Handles a write to the QUERY_GET register. | ||||||
|     void ProcessQueryGet(); |     void ProcessQueryGet(); | ||||||
|  |  | ||||||
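|  |     // Writes the query payload to the query address: a 16-byte {value, timestamp} | ||||||
|  |     // structure when long_query is true, otherwise only the payload truncated to 32 bits. | ||||||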
|  |     void StampQueryResult(u64 payload, bool long_query); | ||||||
|  |  | ||||||
|     // Handles Conditional Rendering |     // Handles Conditional Rendering | ||||||
|     void ProcessQueryCondition(); |     void ProcessQueryCondition(); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -6,6 +6,7 @@ | |||||||
| #include "common/microprofile.h" | #include "common/microprofile.h" | ||||||
| #include "core/core.h" | #include "core/core.h" | ||||||
| #include "core/core_timing.h" | #include "core/core_timing.h" | ||||||
|  | #include "core/core_timing_util.h" | ||||||
| #include "core/memory.h" | #include "core/memory.h" | ||||||
| #include "video_core/engines/fermi_2d.h" | #include "video_core/engines/fermi_2d.h" | ||||||
| #include "video_core/engines/kepler_compute.h" | #include "video_core/engines/kepler_compute.h" | ||||||
| @@ -122,6 +123,19 @@ bool GPU::CancelSyncptInterrupt(const u32 syncpoint_id, const u32 value) { | |||||||
|     return true; |     return true; | ||||||
| } | } | ||||||
|  |  | ||||||
|  | u64 GPU::GetTicks() const { | ||||||
|  |     // These values were reverse engineered by fincs from NVN. | ||||||
|  |     // GPU ticks advance at a ratio of 384/625 per nanosecond (ticks = ns * 384 / 625). | ||||||
|  |     constexpr u64 gpu_ticks_num = 384; | ||||||
|  |     constexpr u64 gpu_ticks_den = 625; | ||||||
|  |  | ||||||
|  |     const u64 cpu_ticks = system.CoreTiming().GetTicks(); | ||||||
|  |     const u64 nanoseconds = Core::Timing::CyclesToNs(cpu_ticks).count(); | ||||||
|  |     const u64 nanoseconds_num = nanoseconds / gpu_ticks_den; | ||||||
|  |     const u64 nanoseconds_rem = nanoseconds % gpu_ticks_den; | ||||||
|  |     return nanoseconds_num * gpu_ticks_num + (nanoseconds_rem * gpu_ticks_num) / gpu_ticks_den; | ||||||
|  | } | ||||||
|  |  | ||||||
| void GPU::FlushCommands() { | void GPU::FlushCommands() { | ||||||
|     renderer.Rasterizer().FlushCommands(); |     renderer.Rasterizer().FlushCommands(); | ||||||
| } | } | ||||||
| @@ -340,7 +354,7 @@ void GPU::ProcessSemaphoreTriggerMethod() { | |||||||
|         block.sequence = regs.semaphore_sequence; |         block.sequence = regs.semaphore_sequence; | ||||||
|         // TODO(Kmather73): Generate a real GPU timestamp and write it here instead of |         // TODO(Kmather73): Generate a real GPU timestamp and write it here instead of | ||||||
|         // CoreTiming |         // CoreTiming | ||||||
|         block.timestamp = system.CoreTiming().GetTicks(); |         block.timestamp = GetTicks(); | ||||||
|         memory_manager->WriteBlock(regs.semaphore_address.SemaphoreAddress(), &block, |         memory_manager->WriteBlock(regs.semaphore_address.SemaphoreAddress(), &block, | ||||||
|                                    sizeof(block)); |                                    sizeof(block)); | ||||||
|     } else { |     } else { | ||||||
|   | |||||||
| @@ -192,6 +192,8 @@ public: | |||||||
|  |  | ||||||
|     bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value); |     bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value); | ||||||
|  |  | ||||||
|  |     u64 GetTicks() const; | ||||||
|  |  | ||||||
|     std::unique_lock<std::mutex> LockSync() { |     std::unique_lock<std::mutex> LockSync() { | ||||||
|         return std::unique_lock{sync_mutex}; |         return std::unique_lock{sync_mutex}; | ||||||
|     } |     } | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 bunnei
					bunnei