MacroHLE: Implement DrawIndexedIndirect & DrawArraysIndirect.
Author: Fernando Sahmkow
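In brief: instead of unrolling the macro into host draws from values read back on the CPU, the DMA pusher now records the guest GPU address of every macro parameter, the HLE macros forward those addresses through DrawManager::IndirectParams, and the Vulkan backend consumes the guest buffer in place with real indirect draw commands. A minimal sketch of the resulting backend flow, condensed from the RasterizerVulkan::DrawIndirect() hunk below (all surrounding scaffolding assumed):

// Sketch only; mirrors the non-count path of the diff below.
const auto& params = maxwell3d->draw_manager->GetIndirectParams();
buffer_cache.SetDrawIndirect(&params); // lets UpdateDrawIndirect() bind the guest ranges
const auto [buffer, offset] = buffer_cache.GetDrawIndirectBuffer();
scheduler.Record([buffer_obj = buffer->Handle(), offset, params](vk::CommandBuffer cmdbuf) {
    cmdbuf.DrawIndexedIndirect(buffer_obj, offset, static_cast<u32>(params.max_draw_counts),
                               static_cast<u32>(params.stride));
});
buffer_cache.SetDrawIndirect(nullptr);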
@@ -171,7 +171,9 @@ public:
                                  bool is_written, bool is_image);

    [[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(GPUVAddr gpu_addr, u32 size,
                                                       bool synchronize, bool mark_as_written);
                                                       bool synchronize = true,
                                                       bool mark_as_written = false,
                                                       bool discard_downloads = false);

    void FlushCachedWrites();

@@ -203,6 +205,14 @@ public:
    /// Return true when a CPU region is modified from the CPU
    [[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size);

    void SetDrawIndirect(const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect_) {
        current_draw_indirect = current_draw_indirect_;
    }

    [[nodiscard]] std::pair<Buffer*, u32> GetDrawIndirectCount();

    [[nodiscard]] std::pair<Buffer*, u32> GetDrawIndirectBuffer();

    std::mutex mutex;
    Runtime& runtime;

@@ -275,6 +285,8 @@ private:

    void BindHostVertexBuffers();

    void BindHostDrawIndirectBuffers();

    void BindHostGraphicsUniformBuffers(size_t stage);

    void BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, bool needs_bind);
@@ -301,6 +313,8 @@ private:

    void UpdateVertexBuffer(u32 index);

    void UpdateDrawIndirect();

    void UpdateUniformBuffers(size_t stage);

    void UpdateStorageBuffers(size_t stage);
@@ -340,6 +354,8 @@ private:

    bool SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size);

    bool SynchronizeBufferNoModified(Buffer& buffer, VAddr cpu_addr, u32 size);

    void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy,
                      std::span<BufferCopy> copies);

@@ -375,6 +391,8 @@ private:
    SlotVector<Buffer> slot_buffers;
    DelayedDestructionRing<Buffer, 8> delayed_destruction_ring;

    const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect{};

    u32 last_index_count = 0;

    Binding index_buffer;
@@ -383,6 +401,8 @@ private:
    std::array<std::array<Binding, NUM_STORAGE_BUFFERS>, NUM_STAGES> storage_buffers;
    std::array<std::array<TextureBufferBinding, NUM_TEXTURE_BUFFERS>, NUM_STAGES> texture_buffers;
    std::array<Binding, NUM_TRANSFORM_FEEDBACK_BUFFERS> transform_feedback_buffers;
    Binding count_buffer_binding;
    Binding indirect_buffer_binding;

    std::array<Binding, NUM_COMPUTE_UNIFORM_BUFFERS> compute_uniform_buffers;
    std::array<Binding, NUM_STORAGE_BUFFERS> compute_storage_buffers;
@@ -422,6 +442,7 @@ private:

    std::vector<BufferId> cached_write_buffer_ids;

    IntervalSet discarded_ranges;
    IntervalSet uncommitted_ranges;
    IntervalSet common_ranges;
    std::deque<IntervalSet> committed_ranges;
@@ -579,13 +600,17 @@ bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am
    }};

    boost::container::small_vector<IntervalType, 4> tmp_intervals;
    const bool is_high_accuracy =
        Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High;
    auto mirror = [&](VAddr base_address, VAddr base_address_end) {
        const u64 size = base_address_end - base_address;
        const VAddr diff = base_address - *cpu_src_address;
        const VAddr new_base_address = *cpu_dest_address + diff;
        const IntervalType add_interval{new_base_address, new_base_address + size};
        uncommitted_ranges.add(add_interval);
        tmp_intervals.push_back(add_interval);
        if (is_high_accuracy) {
            uncommitted_ranges.add(add_interval);
        }
    };
    ForEachWrittenRange(*cpu_src_address, amount, mirror);
    // This subtraction in this order is important for overlapping copies.
@@ -677,6 +702,9 @@ void BufferCache<P>::BindHostGeometryBuffers(bool is_indexed) {
    }
    BindHostVertexBuffers();
    BindHostTransformFeedbackBuffers();
    if (current_draw_indirect) {
        BindHostDrawIndirectBuffers();
    }
}

template <class P>
@@ -796,7 +824,8 @@ void BufferCache<P>::BindComputeTextureBuffer(size_t tbo_index, GPUVAddr gpu_add
template <class P>
std::pair<typename P::Buffer*, u32> BufferCache<P>::ObtainBuffer(GPUVAddr gpu_addr, u32 size,
                                                                 bool synchronize,
                                                                 bool mark_as_written) {
                                                                 bool mark_as_written,
                                                                 bool discard_downloads) {
    const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr);
    if (!cpu_addr) {
        return {&slot_buffers[NULL_BUFFER_ID], 0};
@@ -804,11 +833,17 @@ std::pair<typename P::Buffer*, u32> BufferCache<P>::ObtainBuffer(GPUVAddr gpu_ad
    const BufferId buffer_id = FindBuffer(*cpu_addr, size);
    Buffer& buffer = slot_buffers[buffer_id];
    if (synchronize) {
        SynchronizeBuffer(buffer, *cpu_addr, size);
        SynchronizeBufferNoModified(buffer, *cpu_addr, size);
    }
    if (mark_as_written) {
        MarkWrittenBuffer(buffer_id, *cpu_addr, size);
    }
    if (discard_downloads) {
        const IntervalType interval{*cpu_addr, *cpu_addr + size};
        ClearDownload(interval);
        discarded_ranges.add(interval);
    }
    return {&buffer, buffer.Offset(*cpu_addr)};
}

@@ -827,10 +862,6 @@ bool BufferCache<P>::HasUncommittedFlushes() const noexcept {

template <class P>
void BufferCache<P>::AccumulateFlushes() {
    if (Settings::values.gpu_accuracy.GetValue() != Settings::GPUAccuracy::High) {
        uncommitted_ranges.clear();
        return;
    }
    if (uncommitted_ranges.empty()) {
        return;
    }
@@ -845,12 +876,15 @@ bool BufferCache<P>::ShouldWaitAsyncFlushes() const noexcept {
template <class P>
void BufferCache<P>::CommitAsyncFlushesHigh() {
    AccumulateFlushes();

    for (const auto& interval : discarded_ranges) {
        common_ranges.subtract(interval);
    }

    if (committed_ranges.empty()) {
        return;
    }
    MICROPROFILE_SCOPE(GPU_DownloadMemory);
    const bool is_accuracy_normal =
        Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::Normal;

    auto it = committed_ranges.begin();
    while (it != committed_ranges.end()) {
@@ -875,9 +909,6 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {
            ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) {
                buffer.ForEachDownloadRangeAndClear(
                    cpu_addr, size, [&](u64 range_offset, u64 range_size) {
                        if (is_accuracy_normal) {
                            return;
                        }
                        const VAddr buffer_addr = buffer.CpuAddr();
                        const auto add_download = [&](VAddr start, VAddr end) {
                            const u64 new_offset = start - buffer_addr;
@@ -891,7 +922,7 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {
                                buffer_id,
                            });
                            // Align up to avoid cache conflicts
                            constexpr u64 align = 256ULL;
                            constexpr u64 align = 8ULL;
                            constexpr u64 mask = ~(align - 1ULL);
                            total_size_bytes += (new_size + align - 1) & mask;
                            largest_copy = std::max(largest_copy, new_size);
@@ -942,12 +973,7 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {

template <class P>
void BufferCache<P>::CommitAsyncFlushes() {
    if (Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High) {
        CommitAsyncFlushesHigh();
    } else {
        uncommitted_ranges.clear();
        committed_ranges.clear();
    }
    CommitAsyncFlushesHigh();
}

template <class P>
@@ -1063,6 +1089,19 @@ void BufferCache<P>::BindHostVertexBuffers() {
    }
}

template <class P>
void BufferCache<P>::BindHostDrawIndirectBuffers() {
    const auto bind_buffer = [this](const Binding& binding) {
        Buffer& buffer = slot_buffers[binding.buffer_id];
        TouchBuffer(buffer, binding.buffer_id);
        SynchronizeBuffer(buffer, binding.cpu_addr, binding.size);
    };
    if (current_draw_indirect->include_count) {
        bind_buffer(count_buffer_binding);
    }
    bind_buffer(indirect_buffer_binding);
}

template <class P>
void BufferCache<P>::BindHostGraphicsUniformBuffers(size_t stage) {
    u32 dirty = ~0U;
@@ -1294,6 +1333,9 @@ void BufferCache<P>::DoUpdateGraphicsBuffers(bool is_indexed) {
            UpdateStorageBuffers(stage);
            UpdateTextureBuffers(stage);
        }
        if (current_draw_indirect) {
            UpdateDrawIndirect();
        }
    } while (has_deleted_buffers);
}

@@ -1383,6 +1425,27 @@ void BufferCache<P>::UpdateVertexBuffer(u32 index) {
    };
}

template <class P>
void BufferCache<P>::UpdateDrawIndirect() {
    const auto update = [this](GPUVAddr gpu_addr, size_t size, Binding& binding) {
        const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr);
        if (!cpu_addr) {
            binding = NULL_BINDING;
            return;
        }
        binding = Binding{
            .cpu_addr = *cpu_addr,
            .size = static_cast<u32>(size),
            .buffer_id = FindBuffer(*cpu_addr, static_cast<u32>(size)),
        };
    };
    if (current_draw_indirect->include_count) {
        update(current_draw_indirect->count_start_address, sizeof(u32), count_buffer_binding);
    }
    update(current_draw_indirect->indirect_start_address, current_draw_indirect->buffer_size,
           indirect_buffer_binding);
}

template <class P>
void BufferCache<P>::UpdateUniformBuffers(size_t stage) {
    ForEachEnabledBit(enabled_uniform_buffer_masks[stage], [&](u32 index) {
@@ -1704,6 +1767,51 @@ bool BufferCache<P>::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 s
    return false;
}

template <class P>
bool BufferCache<P>::SynchronizeBufferNoModified(Buffer& buffer, VAddr cpu_addr, u32 size) {
    boost::container::small_vector<BufferCopy, 4> copies;
    u64 total_size_bytes = 0;
    u64 largest_copy = 0;
    IntervalSet found_sets{};
    auto make_copies = [&] {
        for (auto& interval : found_sets) {
            const std::size_t sub_size = interval.upper() - interval.lower();
            const VAddr cpu_addr = interval.lower();
            copies.push_back(BufferCopy{
                .src_offset = total_size_bytes,
                .dst_offset = cpu_addr - buffer.CpuAddr(),
                .size = sub_size,
            });
            total_size_bytes += sub_size;
            largest_copy = std::max(largest_copy, sub_size);
        }
        const std::span<BufferCopy> copies_span(copies.data(), copies.size());
        UploadMemory(buffer, total_size_bytes, largest_copy, copies_span);
    };
    buffer.ForEachUploadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) {
        const VAddr base_adr = buffer.CpuAddr() + range_offset;
        const VAddr end_adr = base_adr + range_size;
        const IntervalType add_interval{base_adr, end_adr};
        found_sets.add(add_interval);
    });
    if (found_sets.empty()) {
        return true;
    }
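    // Note: common_ranges tracks ranges the GPU has written. Carving them out of
    // the upload set below keeps a pending download from being overwritten with
    // stale CPU data; hence "NoModified".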
    const IntervalType search_interval{cpu_addr, cpu_addr + size};
    auto it = common_ranges.lower_bound(search_interval);
    auto it_end = common_ranges.upper_bound(search_interval);
    if (it == common_ranges.end()) {
        make_copies();
        return false;
    }
    while (it != it_end) {
        found_sets.subtract(*it);
        it++;
    }
    make_copies();
    return false;
}

template <class P>
void BufferCache<P>::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy,
                                  std::span<BufferCopy> copies) {
@@ -1963,4 +2071,16 @@ bool BufferCache<P>::HasFastUniformBufferBound(size_t stage, u32 binding_index)
    }
}

template <class P>
std::pair<typename BufferCache<P>::Buffer*, u32> BufferCache<P>::GetDrawIndirectCount() {
    auto& buffer = slot_buffers[count_buffer_binding.buffer_id];
    return std::make_pair(&buffer, buffer.Offset(count_buffer_binding.cpu_addr));
}

template <class P>
std::pair<typename BufferCache<P>::Buffer*, u32> BufferCache<P>::GetDrawIndirectBuffer() {
    auto& buffer = slot_buffers[indirect_buffer_binding.buffer_id];
    return std::make_pair(&buffer, buffer.Offset(indirect_buffer_binding.cpu_addr));
}

} // namespace VideoCommon

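For context, a hedged sketch of how a caller drives the extended ObtainBuffer() signature above; gpu_addr and size are hypothetical:

// Synchronize the range, do not mark it as GPU-written, and drop any queued
// GPU-to-CPU downloads that this use makes stale.
const auto [buffer, offset] = buffer_cache.ObtainBuffer(
    gpu_addr, size, /*synchronize=*/true, /*mark_as_written=*/false,
    /*discard_downloads=*/true);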
@@ -97,6 +97,7 @@ void DmaPusher::ProcessCommands(std::span<const CommandHeader> commands) {
            if (dma_state.non_incrementing) {
                const u32 max_write = static_cast<u32>(
                    std::min<std::size_t>(index + dma_state.method_count, commands.size()) - index);
                dma_state.dma_word_offset = static_cast<u32>(index * sizeof(u32));
                CallMultiMethod(&command_header.argument, max_write);
                dma_state.method_count -= max_write;
                dma_state.is_last_call = true;
@@ -175,7 +176,7 @@ void DmaPusher::CallMultiMethod(const u32* base_start, u32 num_methods) const {
                               dma_state.method_count);
    } else {
        auto subchannel = subchannels[dma_state.subchannel];
        subchannel->current_dma_segment = dma_state.dma_get;
        subchannel->current_dma_segment = dma_state.dma_get + dma_state.dma_word_offset;
        subchannel->CallMultiMethod(dma_state.method, base_start, num_methods,
                                    dma_state.method_count);
    }

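The new dma_word_offset exists so argument addresses stay exact when a non-incrementing method is serviced from the middle of a command span. The address arithmetic, condensed from the two hunks above:

// Word i of the current method's argument list lives at:
//   dma_get + dma_word_offset + i * sizeof(u32)
const GPUVAddr param_addr = dma_state.dma_get + dma_state.dma_word_offset + i * sizeof(u32);
// Maxwell3D::ProcessMacro() records exactly these addresses (see below), which
// is what lets an HLE macro point the backend at the memory behind a parameter.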
@@ -157,6 +157,7 @@ private:
        u32 method_count;      ///< Current method count
        u32 length_pending;    ///< Large NI command length pending
        GPUVAddr dma_get;      ///< Currently read segment
        u32 dma_word_offset;   ///< Current word offset from address
        bool non_incrementing; ///< Current command's NI flag
        bool is_last_call;
    };

@@ -216,7 +216,7 @@ void DrawManager::ProcessDrawIndirect(bool draw_indexed) {
    UpdateTopology();

    if (maxwell3d->ShouldExecute()) {
        maxwell3d->rasterizer->DrawIndirect(draw_indexed);
        maxwell3d->rasterizer->DrawIndirect();
    }
}
} // namespace Tegra::Engines

@@ -33,7 +33,10 @@ public:
    };

    struct IndirectParams {
        GPUVAddr start_address;
        bool is_indexed;
        bool include_count;
        GPUVAddr count_start_address;
        GPUVAddr indirect_start_address;
        size_t buffer_size;
        size_t max_draw_counts;
        size_t stride;

@@ -130,11 +130,15 @@ void Maxwell3D::ProcessMacro(u32 method, const u32* base_start, u32 amount, bool
    }

    macro_params.insert(macro_params.end(), base_start, base_start + amount);
    for (size_t i = 0; i < amount; i++) {
        macro_addresses.push_back(current_dma_segment + i * sizeof(u32));
    }

    // Call the macro when there are no more parameters in the command buffer
    if (is_last_call) {
        CallMacroMethod(executing_macro, macro_params);
        macro_params.clear();
        macro_addresses.clear();
    }
}

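After this hunk, macro_params and macro_addresses run in lockstep: entry i holds the value the macro read and the GPU virtual address it was read from, respectively. That is the invariant the rewritten HLE macros rely on when they substitute "pass the guest address" for "pass the value"; a one-line sketch (parameter 1 being the first indirect word, as HLE_DrawIndexedIndirect below assumes):

// Inside an HLE macro: the address recorded for the first indirect word is the
// guest indirect record itself, so no readback is needed.
params.indirect_start_address = maxwell3d.macro_addresses[1];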
@@ -3066,6 +3066,15 @@ public:

    std::unique_ptr<DrawManager> draw_manager;
    friend class DrawManager;

    std::vector<u8> inline_index_draw_indexes;
    std::vector<GPUVAddr> macro_addresses;

    Core::System& system;
    MemoryManager& memory_manager;

    /// Handles a write to the CLEAR_BUFFERS register.
    void ProcessClearBuffers(u32 layer_count);

private:
    void InitializeRegisterDefaults();
@@ -3126,9 +3135,6 @@ private:
    /// Returns a query's value or an empty object if the value will be deferred through a cache.
    std::optional<u64> GetQueryResult();

    Core::System& system;
    MemoryManager& memory_manager;

    VideoCore::RasterizerInterface* rasterizer = nullptr;

    /// Start offsets of each macro in macro_memory

@@ -9,6 +9,7 @@
#include "video_core/engines/maxwell_3d.h"
#include "video_core/macro/macro.h"
#include "video_core/macro/macro_hle.h"
#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"

namespace Tegra {
@@ -24,15 +25,14 @@ void HLE_771BB18C62444DA0(Engines::Maxwell3D& maxwell3d, const std::vector<u32>&
        parameters[4], parameters[1], parameters[3], parameters[5], instance_count);
}

void HLE_0D61FC9FAAC9FCAD(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) {
void HLE_DrawArraysIndirect(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) {
    const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]);
    maxwell3d.draw_manager->DrawArray(
        static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]),
        parameters[3], parameters[1], parameters[4], instance_count);
}

void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) {
    const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]);
void HLE_DrawIndexedIndirect(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) {
    const u32 element_base = parameters[4];
    const u32 base_instance = parameters[5];
    maxwell3d.regs.vertex_id_base = element_base;
@@ -41,9 +41,18 @@ void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d, const std::vector<u32>&
    maxwell3d.CallMethod(0x8e4, element_base, true);
    maxwell3d.CallMethod(0x8e5, base_instance, true);

    maxwell3d.draw_manager->DrawIndex(
        static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]),
        parameters[3], parameters[1], element_base, base_instance, instance_count);
    auto& params = maxwell3d.draw_manager->GetIndirectParams();
    params.is_indexed = true;
    params.include_count = false;
    params.count_start_address = 0;
    params.indirect_start_address = maxwell3d.macro_addresses[1];
    params.buffer_size = 5 * sizeof(u32);
    params.max_draw_counts = 1;
    params.stride = 0;

    maxwell3d.draw_manager->DrawIndexedIndirect(
        static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]), 0,
        1U << 18);

    maxwell3d.regs.vertex_id_base = 0x0;
    maxwell3d.CallMethod(0x8e3, 0x640, true);
@@ -51,8 +60,9 @@ void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d, const std::vector<u32>&
    maxwell3d.CallMethod(0x8e5, 0x0, true);
}

// Multidraw Indirect
void HLE_3F5E74B9C9A50164(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) {
// Multidraw Indexed Indirect
void HLE_MultiDrawIndexedIndirect(Engines::Maxwell3D& maxwell3d,
                                  const std::vector<u32>& parameters) {
    const u32 start_indirect = parameters[0];
    const u32 end_indirect = parameters[1];
    if (start_indirect >= end_indirect) {
@@ -66,7 +76,6 @@ void HLE_3F5E74B9C9A50164(Engines::Maxwell3D& maxwell3d, const std::vector<u32>&
    // size of each indirect segment
    const u32 indirect_words = 5 + padding;
    const u32 stride = indirect_words * sizeof(u32);
    const GPUVAddr start_address = maxwell3d.current_dma_segment + 4 * sizeof(u32);
    const std::size_t draw_count = end_indirect - start_indirect;
    u32 lowest_first = std::numeric_limits<u32>::max();
    u32 highest_limit = std::numeric_limits<u32>::min();
@@ -80,12 +89,16 @@ void HLE_3F5E74B9C9A50164(Engines::Maxwell3D& maxwell3d, const std::vector<u32>&

    const u32 base_vertex = parameters[8];
    const u32 base_instance = parameters[9];
    maxwell3d.regs.vertex_id_base = base_vertex;
    maxwell3d.CallMethod(0x8e3, 0x640, true);
    maxwell3d.CallMethod(0x8e4, base_vertex, true);
    maxwell3d.CallMethod(0x8e5, base_instance, true);
    auto& params = maxwell3d.draw_manager->GetIndirectParams();
    params.start_address = start_address;
    params.buffer_size = sizeof(u32) + stride * draw_count;
    params.is_indexed = true;
    params.include_count = true;
    params.count_start_address = maxwell3d.macro_addresses[4];
    params.indirect_start_address = maxwell3d.macro_addresses[5];
    params.buffer_size = stride * draw_count;
    params.max_draw_counts = draw_count;
    params.stride = stride;
    maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
@@ -93,7 +106,7 @@ void HLE_3F5E74B9C9A50164(Engines::Maxwell3D& maxwell3d, const std::vector<u32>&
}

// Multi-layer Clear
void HLE_EAD26C3E2109B06B(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) {
void HLE_MultiLayerClear(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) {
    ASSERT(parameters.size() == 1);

    const Engines::Maxwell3D::Regs::ClearSurface clear_params{parameters[0]};
@@ -107,10 +120,10 @@ void HLE_EAD26C3E2109B06B(Engines::Maxwell3D& maxwell3d, const std::vector<u32>&

constexpr std::array<std::pair<u64, HLEFunction>, 5> hle_funcs{{
    {0x771BB18C62444DA0, &HLE_771BB18C62444DA0},
    {0x0D61FC9FAAC9FCAD, &HLE_0D61FC9FAAC9FCAD},
    {0x0217920100488FF7, &HLE_0217920100488FF7},
    {0x3F5E74B9C9A50164, &HLE_3F5E74B9C9A50164},
    {0xEAD26C3E2109B06B, &HLE_EAD26C3E2109B06B},
    {0x0D61FC9FAAC9FCAD, &HLE_DrawArraysIndirect},
    {0x0217920100488FF7, &HLE_DrawIndexedIndirect},
    {0x3F5E74B9C9A50164, &HLE_MultiDrawIndexedIndirect},
    {0xEAD26C3E2109B06B, &HLE_MultiLayerClear},
}};

class HLEMacroImpl final : public CachedMacro {

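HLE_DrawIndexedIndirect above sets buffer_size = 5 * sizeof(u32) because the scheme relies on the guest words already being laid out like the indexed indirect record the host API consumes. For reference, Vulkan's layout, which the five forwarded words map onto:

// VkDrawIndexedIndirectCommand, as defined by Vulkan; consumed in place by
// vkCmdDrawIndexedIndirect instead of being read back word by word.
struct VkDrawIndexedIndirectCommand {
    uint32_t indexCount;
    uint32_t instanceCount;
    uint32_t firstIndex;
    int32_t  vertexOffset;
    uint32_t firstInstance;
};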
@@ -43,7 +43,7 @@ public:
    virtual void Draw(bool is_indexed, u32 instance_count) = 0;

    /// Dispatches an indirect draw invocation
    virtual void DrawIndirect(bool is_indexed) {}
    virtual void DrawIndirect() {}

    /// Clear the current framebuffer
    virtual void Clear(u32 layer_count) = 0;

@@ -56,7 +56,8 @@ vk::Buffer CreateBuffer(const Device& device, u64 size) {
        VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
        VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT |
        VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
        VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;
        VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT |
        VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT;
    if (device.IsExtTransformFeedbackSupported()) {
        flags |= VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT;
    }
@@ -516,6 +517,7 @@ void BufferCacheRuntime::ReserveNullBuffer() {
    if (device.IsExtTransformFeedbackSupported()) {
        create_info.usage |= VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT;
    }
    create_info.usage |= VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT;
    null_buffer = device.GetLogical().CreateBuffer(create_info);
    if (device.HasDebuggingToolAttached()) {
        null_buffer.SetObjectNameEXT("Null buffer");

@@ -225,25 +225,40 @@ void RasterizerVulkan::Draw(bool is_indexed, u32 instance_count) {
    });
}

void RasterizerVulkan::DrawIndirect(bool is_indexed) {
    PrepareDraw(is_indexed, [this, is_indexed] {
        const auto params = maxwell3d->draw_manager->GetIndirectParams();
        const auto [buffer, offset] = buffer_cache.ObtainBuffer(
            params.start_address, static_cast<u32>(params.buffer_size), true, false);
        scheduler.Record([buffer_obj = buffer->Handle(), offset,
                          max_draw_counts = params.max_draw_counts, stride = params.stride,
                          is_indexed](vk::CommandBuffer cmdbuf) {
            if (is_indexed) {
                cmdbuf.DrawIndexedIndirectCount(buffer_obj, offset + 4ULL, buffer_obj, offset,
                                                static_cast<u32>(max_draw_counts),
                                                static_cast<u32>(stride));
void RasterizerVulkan::DrawIndirect() {
    const auto& params = maxwell3d->draw_manager->GetIndirectParams();
    buffer_cache.SetDrawIndirect(&params);
    PrepareDraw(params.is_indexed, [this, &params] {
        const auto [buffer, offset] = buffer_cache.GetDrawIndirectBuffer();
        if (params.include_count) {
            const auto [draw_buffer, offset_base] = buffer_cache.GetDrawIndirectCount();
            scheduler.Record([draw_buffer_obj = draw_buffer->Handle(),
                              buffer_obj = buffer->Handle(), offset_base, offset,
                              params](vk::CommandBuffer cmdbuf) {
                if (params.is_indexed) {
                    cmdbuf.DrawIndexedIndirectCount(
                        buffer_obj, offset, draw_buffer_obj, offset_base,
                        static_cast<u32>(params.max_draw_counts), static_cast<u32>(params.stride));
                } else {
                    cmdbuf.DrawIndirectCount(buffer_obj, offset, draw_buffer_obj, offset_base,
                                             static_cast<u32>(params.max_draw_counts),
                                             static_cast<u32>(params.stride));
                }
            });
            return;
        }
        scheduler.Record([buffer_obj = buffer->Handle(), offset, params](vk::CommandBuffer cmdbuf) {
            if (params.is_indexed) {
                cmdbuf.DrawIndexedIndirect(buffer_obj, offset,
                                           static_cast<u32>(params.max_draw_counts),
                                           static_cast<u32>(params.stride));
            } else {
                cmdbuf.DrawIndirectCount(buffer_obj, offset + 4ULL, buffer_obj, offset,
                                         static_cast<u32>(max_draw_counts),
                                         static_cast<u32>(stride));
                cmdbuf.DrawIndirect(buffer_obj, offset, static_cast<u32>(params.max_draw_counts),
                                    static_cast<u32>(params.stride));
            }
        });
    });
    buffer_cache.SetDrawIndirect(nullptr);
}

void RasterizerVulkan::Clear(u32 layer_count) {
@@ -425,9 +440,6 @@ void RasterizerVulkan::FlushRegion(VAddr addr, u64 size) {

bool RasterizerVulkan::MustFlushRegion(VAddr addr, u64 size) {
    std::scoped_lock lock{texture_cache.mutex, buffer_cache.mutex};
    if (!Settings::IsGPULevelHigh()) {
        return buffer_cache.IsRegionGpuModified(addr, size);
    }
    return texture_cache.IsRegionGpuModified(addr, size) ||
           buffer_cache.IsRegionGpuModified(addr, size);
}

@@ -65,7 +65,7 @@ public:
    ~RasterizerVulkan() override;

    void Draw(bool is_indexed, u32 instance_count) override;
    void DrawIndirect(bool is_indexed) override;
    void DrawIndirect() override;
    void Clear(u32 layer_count) override;
    void DispatchCompute() override;
    void ResetCounter(VideoCore::QueryType type) override;

@@ -351,7 +351,7 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
            .dualSrcBlend = true,
            .logicOp = true,
            .multiDrawIndirect = true,
            .drawIndirectFirstInstance = false,
            .drawIndirectFirstInstance = true,
            .depthClamp = true,
            .depthBiasClamp = true,
            .fillModeNonSolid = true,
@@ -1024,6 +1024,8 @@ void Device::CheckSuitability(bool requires_swapchain) const {
        std::make_pair(features.vertexPipelineStoresAndAtomics, "vertexPipelineStoresAndAtomics"),
        std::make_pair(features.imageCubeArray, "imageCubeArray"),
        std::make_pair(features.independentBlend, "independentBlend"),
        std::make_pair(features.multiDrawIndirect, "multiDrawIndirect"),
        std::make_pair(features.drawIndirectFirstInstance, "drawIndirectFirstInstance"),
        std::make_pair(features.depthClamp, "depthClamp"),
        std::make_pair(features.samplerAnisotropy, "samplerAnisotropy"),
        std::make_pair(features.largePoints, "largePoints"),
@@ -1117,6 +1119,7 @@ std::vector<const char*> Device::LoadExtensions(bool requires_surface) {
        test(khr_spirv_1_4, VK_KHR_SPIRV_1_4_EXTENSION_NAME, true);
        test(khr_push_descriptor, VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME, true);
        test(has_khr_shader_float16_int8, VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME, false);
        test(khr_draw_indirect_count, VK_KHR_DRAW_INDIRECT_COUNT_EXTENSION_NAME, true);
        test(ext_depth_range_unrestricted, VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME, true);
        test(ext_index_type_uint8, VK_EXT_INDEX_TYPE_UINT8_EXTENSION_NAME, true);
        test(has_ext_primitive_topology_list_restart,

@@ -451,6 +451,7 @@ private:
    bool nv_viewport_swizzle{};                 ///< Support for VK_NV_viewport_swizzle.
    bool nv_viewport_array2{};                  ///< Support for VK_NV_viewport_array2.
    bool nv_geometry_shader_passthrough{};      ///< Support for VK_NV_geometry_shader_passthrough.
    bool khr_draw_indirect_count{};             ///< Support for VK_KHR_draw_indirect_count.
    bool khr_uniform_buffer_standard_layout{};  ///< Support for scalar uniform buffer layouts.
    bool khr_spirv_1_4{};                       ///< Support for VK_KHR_spirv_1_4.
    bool khr_workgroup_memory_explicit_layout{}; ///< Support for explicit workgroup layouts.

@@ -94,8 +94,10 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept {
    X(vkCmdDispatch);
    X(vkCmdDraw);
    X(vkCmdDrawIndexed);
    X(vkCmdDrawIndirectCount);
    X(vkCmdDrawIndexedIndirectCount);
    X(vkCmdDrawIndirect);
    X(vkCmdDrawIndexedIndirect);
    X(vkCmdDrawIndirectCountKHR);
    X(vkCmdDrawIndexedIndirectCountKHR);
    X(vkCmdEndQuery);
    X(vkCmdEndRenderPass);
    X(vkCmdEndTransformFeedbackEXT);

@@ -213,8 +213,10 @@ struct DeviceDispatch : InstanceDispatch {
    PFN_vkCmdDispatch vkCmdDispatch{};
    PFN_vkCmdDraw vkCmdDraw{};
    PFN_vkCmdDrawIndexed vkCmdDrawIndexed{};
    PFN_vkCmdDrawIndirectCount vkCmdDrawIndirectCount{};
    PFN_vkCmdDrawIndexedIndirectCount vkCmdDrawIndexedIndirectCount{};
    PFN_vkCmdDrawIndirect vkCmdDrawIndirect{};
    PFN_vkCmdDrawIndexedIndirect vkCmdDrawIndexedIndirect{};
    PFN_vkCmdDrawIndirectCountKHR vkCmdDrawIndirectCountKHR{};
    PFN_vkCmdDrawIndexedIndirectCountKHR vkCmdDrawIndexedIndirectCountKHR{};
    PFN_vkCmdEndDebugUtilsLabelEXT vkCmdEndDebugUtilsLabelEXT{};
    PFN_vkCmdEndQuery vkCmdEndQuery{};
    PFN_vkCmdEndRenderPass vkCmdEndRenderPass{};
@@ -1021,17 +1023,27 @@ public:
                              first_instance);
    }

    void DrawIndirect(VkBuffer src_buffer, VkDeviceSize src_offset, u32 draw_count,
                      u32 stride) const noexcept {
        dld->vkCmdDrawIndirect(handle, src_buffer, src_offset, draw_count, stride);
    }

    void DrawIndexedIndirect(VkBuffer src_buffer, VkDeviceSize src_offset, u32 draw_count,
                             u32 stride) const noexcept {
        dld->vkCmdDrawIndexedIndirect(handle, src_buffer, src_offset, draw_count, stride);
    }

    void DrawIndirectCount(VkBuffer src_buffer, VkDeviceSize src_offset, VkBuffer count_buffer,
                           VkDeviceSize count_offset, u32 draw_count, u32 stride) const noexcept {
        dld->vkCmdDrawIndirectCount(handle, src_buffer, src_offset, count_buffer, count_offset,
                                    draw_count, stride);
        dld->vkCmdDrawIndirectCountKHR(handle, src_buffer, src_offset, count_buffer, count_offset,
                                       draw_count, stride);
    }

    void DrawIndexedIndirectCount(VkBuffer src_buffer, VkDeviceSize src_offset,
                                  VkBuffer count_buffer, VkDeviceSize count_offset, u32 draw_count,
                                  u32 stride) const noexcept {
        dld->vkCmdDrawIndexedIndirectCount(handle, src_buffer, src_offset, count_buffer,
                                           count_offset, draw_count, stride);
        dld->vkCmdDrawIndexedIndirectCountKHR(handle, src_buffer, src_offset, count_buffer,
                                              count_offset, draw_count, stride);
    }

    void ClearAttachments(Span<VkClearAttachment> attachments,

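The wrapper switches to the KHR-suffixed entry points because VK_KHR_draw_indirect_count is what LoadExtensions() now requires; on Vulkan 1.2 devices the core symbols have identical signatures. A sketch (not part of this diff, and assuming the loader exposes the core symbol) of how a fallback to core could look:

// Both names share one PFN type, so the KHR slot can alias the core entry point.
if (dld.vkCmdDrawIndexedIndirectCountKHR == nullptr) {
    dld.vkCmdDrawIndexedIndirectCountKHR =
        reinterpret_cast<PFN_vkCmdDrawIndexedIndirectCountKHR>(
            vkGetDeviceProcAddr(device, "vkCmdDrawIndexedIndirectCount"));
}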