Merge pull request #9786 from FernandoS27/the-gaia-is-a-lie
YFC - Engines: Implement Accelerate DMA Texture.
This commit is contained in:
		| @@ -55,6 +55,19 @@ constexpr u32 NUM_STORAGE_BUFFERS = 16; | ||||
| constexpr u32 NUM_TEXTURE_BUFFERS = 16; | ||||
| constexpr u32 NUM_STAGES = 5; | ||||
|  | ||||
/// Tells BufferCache::ObtainBuffer whether to call SynchronizeBuffer on the requested range.
| enum class ObtainBufferSynchronize : u32 { | ||||
    /// ObtainBuffer skips the SynchronizeBuffer call.
|     NoSynchronize = 0, | ||||
    /// ObtainBuffer calls SynchronizeBuffer(buffer, cpu_addr, size) before returning.
|     FullSynchronize = 1, | ||||
    /// NOTE(review): ObtainBuffer has no case for this value, so it currently behaves like
    /// NoSynchronize; confirm the intended semantics.
|     SynchronizeNoDirty = 2, | ||||
| }; | ||||
|  | ||||
/// Tells BufferCache::ObtainBuffer which bookkeeping step to run on the range after the lookup.
| enum class ObtainBufferOperation : u32 { | ||||
    /// No bookkeeping is performed.
|     DoNothing = 0, | ||||
    /// ObtainBuffer calls MarkWrittenBuffer on the range.
|     MarkAsWritten = 1, | ||||
    /// ObtainBuffer calls ClearDownload on an interval built from the range.
|     DiscardWrite = 2, | ||||
    /// NOTE(review): ObtainBuffer has no case for this value, so it currently behaves like
    /// DoNothing; confirm the intended semantics.
|     MarkQuery = 3, | ||||
| }; | ||||
|  | ||||
| using UniformBufferSizes = std::array<std::array<u32, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES>; | ||||
| using ComputeUniformBufferSizes = std::array<u32, NUM_COMPUTE_UNIFORM_BUFFERS>; | ||||
|  | ||||
| @@ -191,6 +204,10 @@ public: | ||||
|  | ||||
|     bool DMAClear(GPUVAddr src_address, u64 amount, u32 value); | ||||
|  | ||||
|     [[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(GPUVAddr gpu_addr, u32 size, | ||||
|                                                        ObtainBufferSynchronize sync_info, | ||||
|                                                        ObtainBufferOperation post_op); | ||||
|  | ||||
|     /// Return true when a CPU region is modified from the GPU | ||||
|     [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); | ||||
|  | ||||
| @@ -641,6 +658,42 @@ bool BufferCache<P>::DMAClear(GPUVAddr dst_address, u64 amount, u32 value) { | ||||
|     return true; | ||||
| } | ||||
|  | ||||
/// Resolves the cache buffer that backs the range [gpu_addr, gpu_addr + size).
///
/// @param gpu_addr  GPU virtual address of the start of the range.
/// @param size      Length of the range (presumably in bytes; confirm against callers).
/// @param sync_info FullSynchronize calls SynchronizeBuffer on the range; any other value
///                  skips synchronization.
/// @param post_op   MarkAsWritten calls MarkWrittenBuffer on the range, DiscardWrite calls
///                  ClearDownload on the range; any other value does nothing.
/// @return The buffer found by FindBuffer and the offset of the range inside it. When
///         gpu_addr has no CPU mapping, the null buffer with offset 0 is returned and no
///         synchronization or bookkeeping is performed.
| template <class P> | ||||
| std::pair<typename P::Buffer*, u32> BufferCache<P>::ObtainBuffer(GPUVAddr gpu_addr, u32 size, | ||||
|                                                                  ObtainBufferSynchronize sync_info, | ||||
|                                                                  ObtainBufferOperation post_op) { | ||||
|     const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr); | ||||
|     if (!cpu_addr) { | ||||
|         return {&slot_buffers[NULL_BUFFER_ID], 0}; | ||||
|     } | ||||
|     const BufferId buffer_id = FindBuffer(*cpu_addr, size); | ||||
|     Buffer& buffer = slot_buffers[buffer_id]; | ||||
|  | ||||
      // Optional synchronization of the range before handing the buffer out.
|     switch (sync_info) { | ||||
|     case ObtainBufferSynchronize::FullSynchronize: | ||||
|         SynchronizeBuffer(buffer, *cpu_addr, size); | ||||
|         break; | ||||
|     default: | ||||
|         break; | ||||
|     } | ||||
|  | ||||
      // Bookkeeping requested by the caller for the access it is about to perform.
|     switch (post_op) { | ||||
|     case ObtainBufferOperation::MarkAsWritten: | ||||
|         MarkWrittenBuffer(buffer_id, *cpu_addr, size); | ||||
|         break; | ||||
|     case ObtainBufferOperation::DiscardWrite: { | ||||
          // The interval is described by its lower and upper bounds, so the end has to be the
          // absolute address cpu_addr + size. Passing the bare size produced an interval that
          // did not cover the range, and the pending download was never discarded.
|         IntervalType interval{*cpu_addr, *cpu_addr + size}; | ||||
|         ClearDownload(interval); | ||||
|         break; | ||||
|     } | ||||
|     default: | ||||
|         break; | ||||
|     } | ||||
|  | ||||
|     return {&buffer, buffer.Offset(*cpu_addr)}; | ||||
| } | ||||
|  | ||||
| template <class P> | ||||
| void BufferCache<P>::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, | ||||
|                                                u32 size) { | ||||
|   | ||||
| @@ -14,7 +14,13 @@ | ||||
| #include "video_core/textures/decoders.h" | ||||
|  | ||||
| MICROPROFILE_DECLARE(GPU_DMAEngine); | ||||
| MICROPROFILE_DECLARE(GPU_DMAEngineBL); | ||||
| MICROPROFILE_DECLARE(GPU_DMAEngineLB); | ||||
| MICROPROFILE_DECLARE(GPU_DMAEngineBB); | ||||
| MICROPROFILE_DEFINE(GPU_DMAEngine, "GPU", "DMA Engine", MP_RGB(224, 224, 128)); | ||||
| MICROPROFILE_DEFINE(GPU_DMAEngineBL, "GPU", "DMA Engine Block - Linear", MP_RGB(224, 224, 128)); | ||||
| MICROPROFILE_DEFINE(GPU_DMAEngineLB, "GPU", "DMA Engine Linear - Block", MP_RGB(224, 224, 128)); | ||||
| MICROPROFILE_DEFINE(GPU_DMAEngineBB, "GPU", "DMA Engine Block - Block", MP_RGB(224, 224, 128)); | ||||
|  | ||||
| namespace Tegra::Engines { | ||||
|  | ||||
| @@ -72,6 +78,7 @@ void MaxwellDMA::Launch() { | ||||
|         memory_manager.FlushCaching(); | ||||
|         if (!is_src_pitch && !is_dst_pitch) { | ||||
|             // Both the source and the destination are in block-linear layout. | ||||
|             MICROPROFILE_SCOPE(GPU_DMAEngineBB); | ||||
|             CopyBlockLinearToBlockLinear(); | ||||
|             ReleaseSemaphore(); | ||||
|             return; | ||||
| @@ -87,8 +94,10 @@ void MaxwellDMA::Launch() { | ||||
|             } | ||||
|         } else { | ||||
|             if (!is_src_pitch && is_dst_pitch) { | ||||
|                 MICROPROFILE_SCOPE(GPU_DMAEngineBL); | ||||
|                 CopyBlockLinearToPitch(); | ||||
|             } else { | ||||
|                 MICROPROFILE_SCOPE(GPU_DMAEngineLB); | ||||
|                 CopyPitchToBlockLinear(); | ||||
|             } | ||||
|         } | ||||
| @@ -153,21 +162,35 @@ void MaxwellDMA::Launch() { | ||||
| } | ||||
|  | ||||
| void MaxwellDMA::CopyBlockLinearToPitch() { | ||||
|     UNIMPLEMENTED_IF(regs.src_params.block_size.width != 0); | ||||
|     UNIMPLEMENTED_IF(regs.src_params.layer != 0); | ||||
|     UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0); | ||||
|  | ||||
|     const bool is_remapping = regs.launch_dma.remap_enable != 0; | ||||
|     u32 bytes_per_pixel = 1; | ||||
|     DMA::ImageOperand src_operand; | ||||
|     src_operand.bytes_per_pixel = bytes_per_pixel; | ||||
|     src_operand.params = regs.src_params; | ||||
|     src_operand.address = regs.offset_in; | ||||
|  | ||||
|     // Optimized path for micro copies. | ||||
|     const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count; | ||||
|     if (!is_remapping && dst_size < GOB_SIZE && regs.pitch_out <= GOB_SIZE_X && | ||||
|         regs.src_params.height > GOB_SIZE_Y) { | ||||
|         FastCopyBlockLinearToPitch(); | ||||
|     DMA::BufferOperand dst_operand; | ||||
|     dst_operand.pitch = regs.pitch_out; | ||||
|     dst_operand.width = regs.line_length_in; | ||||
|     dst_operand.height = regs.line_count; | ||||
|     dst_operand.address = regs.offset_out; | ||||
|     DMA::ImageCopy copy_info{}; | ||||
|     copy_info.length_x = regs.line_length_in; | ||||
|     copy_info.length_y = regs.line_count; | ||||
|     auto& accelerate = rasterizer->AccessAccelerateDMA(); | ||||
|     if (accelerate.ImageToBuffer(copy_info, src_operand, dst_operand)) { | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     UNIMPLEMENTED_IF(regs.src_params.block_size.width != 0); | ||||
|     UNIMPLEMENTED_IF(regs.src_params.block_size.depth != 0); | ||||
|     UNIMPLEMENTED_IF(regs.src_params.block_size.depth == 0 && regs.src_params.depth != 1); | ||||
|  | ||||
|     // Deswizzle the input and copy it over. | ||||
|     const Parameters& src_params = regs.src_params; | ||||
|     const DMA::Parameters& src_params = regs.src_params; | ||||
|  | ||||
|     const bool is_remapping = regs.launch_dma.remap_enable != 0; | ||||
|  | ||||
|     const u32 num_remap_components = regs.remap_const.num_dst_components_minus_one + 1; | ||||
|     const u32 remap_components_size = regs.remap_const.component_size_minus_one + 1; | ||||
| @@ -187,7 +210,7 @@ void MaxwellDMA::CopyBlockLinearToPitch() { | ||||
|         x_offset >>= bpp_shift; | ||||
|     } | ||||
|  | ||||
|     const u32 bytes_per_pixel = base_bpp << bpp_shift; | ||||
|     bytes_per_pixel = base_bpp << bpp_shift; | ||||
|     const u32 height = src_params.height; | ||||
|     const u32 depth = src_params.depth; | ||||
|     const u32 block_height = src_params.block_size.height; | ||||
| @@ -195,11 +218,12 @@ void MaxwellDMA::CopyBlockLinearToPitch() { | ||||
|     const size_t src_size = | ||||
|         CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth); | ||||
|  | ||||
|     const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count; | ||||
|     read_buffer.resize_destructive(src_size); | ||||
|     write_buffer.resize_destructive(dst_size); | ||||
|  | ||||
|     memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size); | ||||
|     memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size); | ||||
|     memory_manager.ReadBlock(src_operand.address, read_buffer.data(), src_size); | ||||
|     memory_manager.ReadBlockUnsafe(dst_operand.address, write_buffer.data(), dst_size); | ||||
|  | ||||
|     UnswizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, width, height, depth, x_offset, | ||||
|                      src_params.origin.y, x_elements, regs.line_count, block_height, block_depth, | ||||
| @@ -216,6 +240,24 @@ void MaxwellDMA::CopyPitchToBlockLinear() { | ||||
|     const u32 num_remap_components = regs.remap_const.num_dst_components_minus_one + 1; | ||||
|     const u32 remap_components_size = regs.remap_const.component_size_minus_one + 1; | ||||
|  | ||||
|     u32 bytes_per_pixel = 1; | ||||
|     DMA::ImageOperand dst_operand; | ||||
|     dst_operand.bytes_per_pixel = bytes_per_pixel; | ||||
|     dst_operand.params = regs.dst_params; | ||||
|     dst_operand.address = regs.offset_out; | ||||
|     DMA::BufferOperand src_operand; | ||||
|     src_operand.pitch = regs.pitch_in; | ||||
|     src_operand.width = regs.line_length_in; | ||||
|     src_operand.height = regs.line_count; | ||||
|     src_operand.address = regs.offset_in; | ||||
|     DMA::ImageCopy copy_info{}; | ||||
|     copy_info.length_x = regs.line_length_in; | ||||
|     copy_info.length_y = regs.line_count; | ||||
|     auto& accelerate = rasterizer->AccessAccelerateDMA(); | ||||
|     if (accelerate.BufferToImage(copy_info, src_operand, dst_operand)) { | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     const auto& dst_params = regs.dst_params; | ||||
|  | ||||
|     const u32 base_bpp = !is_remapping ? 1U : num_remap_components * remap_components_size; | ||||
| @@ -233,7 +275,7 @@ void MaxwellDMA::CopyPitchToBlockLinear() { | ||||
|         x_offset >>= bpp_shift; | ||||
|     } | ||||
|  | ||||
|     const u32 bytes_per_pixel = base_bpp << bpp_shift; | ||||
|     bytes_per_pixel = base_bpp << bpp_shift; | ||||
|     const u32 height = dst_params.height; | ||||
|     const u32 depth = dst_params.depth; | ||||
|     const u32 block_height = dst_params.block_size.height; | ||||
| @@ -260,45 +302,14 @@ void MaxwellDMA::CopyPitchToBlockLinear() { | ||||
|     memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size); | ||||
| } | ||||
|  | ||||
| void MaxwellDMA::FastCopyBlockLinearToPitch() { | ||||
|     const u32 bytes_per_pixel = 1U; | ||||
|     const size_t src_size = GOB_SIZE; | ||||
|     const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count; | ||||
|     u32 pos_x = regs.src_params.origin.x; | ||||
|     u32 pos_y = regs.src_params.origin.y; | ||||
|     const u64 offset = GetGOBOffset(regs.src_params.width, regs.src_params.height, pos_x, pos_y, | ||||
|                                     regs.src_params.block_size.height, bytes_per_pixel); | ||||
|     const u32 x_in_gob = 64 / bytes_per_pixel; | ||||
|     pos_x = pos_x % x_in_gob; | ||||
|     pos_y = pos_y % 8; | ||||
|  | ||||
|     read_buffer.resize_destructive(src_size); | ||||
|     write_buffer.resize_destructive(dst_size); | ||||
|  | ||||
|     if (Settings::IsGPULevelExtreme()) { | ||||
|         memory_manager.ReadBlock(regs.offset_in + offset, read_buffer.data(), src_size); | ||||
|         memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size); | ||||
|     } else { | ||||
|         memory_manager.ReadBlockUnsafe(regs.offset_in + offset, read_buffer.data(), src_size); | ||||
|         memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size); | ||||
|     } | ||||
|  | ||||
|     UnswizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, regs.src_params.width, | ||||
|                      regs.src_params.height, 1, pos_x, pos_y, regs.line_length_in, regs.line_count, | ||||
|                      regs.src_params.block_size.height, regs.src_params.block_size.depth, | ||||
|                      regs.pitch_out); | ||||
|  | ||||
|     memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size); | ||||
| } | ||||
|  | ||||
| void MaxwellDMA::CopyBlockLinearToBlockLinear() { | ||||
|     UNIMPLEMENTED_IF(regs.src_params.block_size.width != 0); | ||||
|  | ||||
|     const bool is_remapping = regs.launch_dma.remap_enable != 0; | ||||
|  | ||||
|     // Deswizzle the input and copy it over. | ||||
|     const Parameters& src = regs.src_params; | ||||
|     const Parameters& dst = regs.dst_params; | ||||
|     const DMA::Parameters& src = regs.src_params; | ||||
|     const DMA::Parameters& dst = regs.dst_params; | ||||
|  | ||||
|     const u32 num_remap_components = regs.remap_const.num_dst_components_minus_one + 1; | ||||
|     const u32 remap_components_size = regs.remap_const.component_size_minus_one + 1; | ||||
|   | ||||
| @@ -24,6 +24,54 @@ namespace VideoCore { | ||||
| class RasterizerInterface; | ||||
| } | ||||
|  | ||||
// Plain data types describing DMA copy operands. They are shared between the MaxwellDMA
// register block (dst_params / src_params) and the AccelerateDMAInterface image copy hooks.
| namespace Tegra { | ||||
| namespace DMA { | ||||
|  | ||||
  /// Packed 2D origin stored in a single 32-bit word.
  /// NOTE(review): assumes BitField<position, bits, type>, i.e. x in bits 0-15 and y in bits
  /// 16-31 - confirm against the BitField definition.
| union Origin { | ||||
|     BitField<0, 16, u32> x; | ||||
|     BitField<16, 16, u32> y; | ||||
| }; | ||||
| static_assert(sizeof(Origin) == 4); | ||||
|  | ||||
  /// Extent of an image copy; MaxwellDMA fills it from line_length_in (x) and line_count (y).
| struct ImageCopy { | ||||
|     u32 length_x{}; | ||||
|     u32 length_y{}; | ||||
| }; | ||||
|  | ||||
  /// Block dimensions of a block-linear surface, four 4-bit fields packed in one 32-bit word.
  /// NOTE(review): the fields are presumably log2-encoded sizes - confirm.
| union BlockSize { | ||||
|     BitField<0, 4, u32> width; | ||||
|     BitField<4, 4, u32> height; | ||||
|     BitField<8, 4, u32> depth; | ||||
|     BitField<12, 4, u32> gob_height; | ||||
| }; | ||||
| static_assert(sizeof(BlockSize) == 4); | ||||
|  | ||||
  /// Surface description used for both dst_params and src_params in the MaxwellDMA registers.
  /// The size assertion below pins the layout to six 32-bit words; do not reorder the members.
| struct Parameters { | ||||
|     BlockSize block_size; | ||||
|     u32 width; | ||||
|     u32 height; | ||||
|     u32 depth; | ||||
|     u32 layer; | ||||
|     Origin origin; | ||||
| }; | ||||
| static_assert(sizeof(Parameters) == 24); | ||||
|  | ||||
  /// Image side of an accelerated copy: element size, surface parameters and GPU address.
| struct ImageOperand { | ||||
|     u32 bytes_per_pixel; | ||||
|     Parameters params; | ||||
|     GPUVAddr address; | ||||
| }; | ||||
|  | ||||
  /// Pitch-linear buffer side of an accelerated copy. MaxwellDMA fills pitch from
  /// pitch_in / pitch_out, width from line_length_in and height from line_count.
| struct BufferOperand { | ||||
|     u32 pitch; | ||||
|     u32 width; | ||||
|     u32 height; | ||||
|     GPUVAddr address; | ||||
| }; | ||||
|  | ||||
| } // namespace DMA | ||||
| } // namespace Tegra | ||||
|  | ||||
| namespace Tegra::Engines { | ||||
|  | ||||
| class AccelerateDMAInterface { | ||||
| @@ -32,6 +80,12 @@ public: | ||||
|     virtual bool BufferCopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount) = 0; | ||||
|  | ||||
|     virtual bool BufferClear(GPUVAddr src_address, u64 amount, u32 value) = 0; | ||||
|  | ||||
|     virtual bool ImageToBuffer(const DMA::ImageCopy& copy_info, const DMA::ImageOperand& src, | ||||
|                                const DMA::BufferOperand& dst) = 0; | ||||
|  | ||||
|     virtual bool BufferToImage(const DMA::ImageCopy& copy_info, const DMA::BufferOperand& src, | ||||
|                                const DMA::ImageOperand& dst) = 0; | ||||
| }; | ||||
|  | ||||
| /** | ||||
| @@ -51,30 +105,6 @@ public: | ||||
|         } | ||||
|     }; | ||||
|  | ||||
|     union BlockSize { | ||||
|         BitField<0, 4, u32> width; | ||||
|         BitField<4, 4, u32> height; | ||||
|         BitField<8, 4, u32> depth; | ||||
|         BitField<12, 4, u32> gob_height; | ||||
|     }; | ||||
|     static_assert(sizeof(BlockSize) == 4); | ||||
|  | ||||
|     union Origin { | ||||
|         BitField<0, 16, u32> x; | ||||
|         BitField<16, 16, u32> y; | ||||
|     }; | ||||
|     static_assert(sizeof(Origin) == 4); | ||||
|  | ||||
|     struct Parameters { | ||||
|         BlockSize block_size; | ||||
|         u32 width; | ||||
|         u32 height; | ||||
|         u32 depth; | ||||
|         u32 layer; | ||||
|         Origin origin; | ||||
|     }; | ||||
|     static_assert(sizeof(Parameters) == 24); | ||||
|  | ||||
|     struct Semaphore { | ||||
|         PackedGPUVAddr address; | ||||
|         u32 payload; | ||||
| @@ -227,8 +257,6 @@ private: | ||||
|  | ||||
|     void CopyBlockLinearToBlockLinear(); | ||||
|  | ||||
|     void FastCopyBlockLinearToPitch(); | ||||
|  | ||||
|     void ReleaseSemaphore(); | ||||
|  | ||||
|     void ConsumeSinkImpl() override; | ||||
| @@ -261,17 +289,17 @@ private: | ||||
|                 u32 reserved05[0x3f]; | ||||
|                 PackedGPUVAddr offset_in; | ||||
|                 PackedGPUVAddr offset_out; | ||||
|                 u32 pitch_in; | ||||
|                 u32 pitch_out; | ||||
|                 s32 pitch_in; | ||||
|                 s32 pitch_out; | ||||
|                 u32 line_length_in; | ||||
|                 u32 line_count; | ||||
|                 u32 reserved06[0xb6]; | ||||
|                 u32 remap_consta_value; | ||||
|                 u32 remap_constb_value; | ||||
|                 RemapConst remap_const; | ||||
|                 Parameters dst_params; | ||||
|                 DMA::Parameters dst_params; | ||||
|                 u32 reserved07[0x1]; | ||||
|                 Parameters src_params; | ||||
|                 DMA::Parameters src_params; | ||||
|                 u32 reserved08[0x275]; | ||||
|                 u32 pm_trigger_end; | ||||
|                 u32 reserved09[0x3ba]; | ||||
|   | ||||
| @@ -22,6 +22,14 @@ public: | ||||
|     explicit AccelerateDMA(); | ||||
|     bool BufferCopy(GPUVAddr start_address, GPUVAddr end_address, u64 amount) override; | ||||
|     bool BufferClear(GPUVAddr src_address, u64 amount, u32 value) override; | ||||
/// Image-to-buffer copies are not accelerated by this implementation: the arguments are
/// ignored and false is always returned.
|     bool ImageToBuffer(const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::ImageOperand& src, | ||||
|                        const Tegra::DMA::BufferOperand& dst) override { | ||||
|         return false; | ||||
|     } | ||||
/// Buffer-to-image copies are not accelerated by this implementation: the arguments are
/// ignored and false is always returned.
|     bool BufferToImage(const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::BufferOperand& src, | ||||
|                        const Tegra::DMA::ImageOperand& dst) override { | ||||
|         return false; | ||||
|     } | ||||
| }; | ||||
|  | ||||
| class RasterizerNull final : public VideoCore::RasterizerAccelerated, | ||||
|   | ||||
| @@ -56,6 +56,16 @@ public: | ||||
|  | ||||
|     bool BufferClear(GPUVAddr src_address, u64 amount, u32 value) override; | ||||
|  | ||||
/// Image-to-buffer copies are not accelerated by this implementation: the arguments are
/// ignored and false is always returned.
|     bool ImageToBuffer(const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::ImageOperand& src, | ||||
|                        const Tegra::DMA::BufferOperand& dst) override { | ||||
|         return false; | ||||
|     } | ||||
|  | ||||
/// Buffer-to-image copies are not accelerated by this implementation: the arguments are
/// ignored and false is always returned.
|     bool BufferToImage(const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::BufferOperand& src, | ||||
|                        const Tegra::DMA::ImageOperand& dst) override { | ||||
|         return false; | ||||
|     } | ||||
|  | ||||
| private: | ||||
|     BufferCache& buffer_cache; | ||||
| }; | ||||
|   | ||||
| @@ -172,7 +172,7 @@ RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra | ||||
|       buffer_cache(*this, cpu_memory_, buffer_cache_runtime), | ||||
|       pipeline_cache(*this, device, scheduler, descriptor_pool, update_descriptor_queue, | ||||
|                      render_pass_cache, buffer_cache, texture_cache, gpu.ShaderNotify()), | ||||
|       query_cache{*this, device, scheduler}, accelerate_dma{buffer_cache}, | ||||
|       query_cache{*this, device, scheduler}, accelerate_dma(buffer_cache, texture_cache, scheduler), | ||||
|       fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache, device, scheduler), | ||||
|       wfi_event(device.GetLogical().CreateEvent()) { | ||||
|     scheduler.SetQueryCache(query_cache); | ||||
| @@ -756,7 +756,9 @@ void RasterizerVulkan::FlushWork() { | ||||
|     draw_counter = 0; | ||||
| } | ||||
|  | ||||
| AccelerateDMA::AccelerateDMA(BufferCache& buffer_cache_) : buffer_cache{buffer_cache_} {} | ||||
| AccelerateDMA::AccelerateDMA(BufferCache& buffer_cache_, TextureCache& texture_cache_, | ||||
|                              Scheduler& scheduler_) | ||||
|     : buffer_cache{buffer_cache_}, texture_cache{texture_cache_}, scheduler{scheduler_} {} | ||||
|  | ||||
| bool AccelerateDMA::BufferClear(GPUVAddr src_address, u64 amount, u32 value) { | ||||
|     std::scoped_lock lock{buffer_cache.mutex}; | ||||
| @@ -768,6 +770,234 @@ bool AccelerateDMA::BufferCopy(GPUVAddr src_address, GPUVAddr dest_address, u64 | ||||
|     return buffer_cache.DMACopy(src_address, dest_address, amount); | ||||
| } | ||||
|  | ||||
/// Records a GPU-side copy from a cached image into a cached buffer.
///
/// @param copy_info Extent of the copy (length_x is rescaled by the bpp ratio, length_y is used as-is).
/// @param src       Image operand; looked up through texture_cache.ObtainImage.
/// @param dst       Pitch-linear buffer operand; looked up through buffer_cache.ObtainBuffer.
/// @return false when the texture cache has no image for `src`, true once the copy is recorded.
| bool AccelerateDMA::ImageToBuffer(const Tegra::DMA::ImageCopy& copy_info, | ||||
|                                   const Tegra::DMA::ImageOperand& src, | ||||
|                                   const Tegra::DMA::BufferOperand& dst) { | ||||
      // Both caches are touched below, so both mutexes are taken together.
|     std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; | ||||
      // NOTE(review): the second argument is false here and true in BufferToImage; presumably it
      // flags whether the image will be modified - confirm against ObtainImage.
|     auto query_image = texture_cache.ObtainImage(src, false); | ||||
|     if (!query_image) { | ||||
|         return false; | ||||
|     } | ||||
|     auto* image = query_image->first; | ||||
|     auto [level, base] = query_image->second; | ||||
|     const u32 buffer_size = static_cast<u32>(dst.pitch * dst.height); | ||||
      // The destination range is synchronized first and then marked as written by the GPU.
      // NOTE(review): ObtainBuffer hands back the null buffer when dst.address is unmapped;
      // that case is not checked here - confirm it is acceptable.
|     const auto [buffer, offset] = buffer_cache.ObtainBuffer( | ||||
|         dst.address, buffer_size, VideoCommon::ObtainBufferSynchronize::FullSynchronize, | ||||
|         VideoCommon::ObtainBufferOperation::MarkAsWritten); | ||||
|  | ||||
      // A rescaled image is scaled down before the copy is recorded and scaled up again afterwards.
|     const bool is_rescaled = image->IsRescaled(); | ||||
|     if (is_rescaled) { | ||||
|         image->ScaleDown(); | ||||
|     } | ||||
|     VkImageSubresourceLayers subresources{ | ||||
|         .aspectMask = image->AspectMask(), | ||||
|         .mipLevel = level, | ||||
|         .baseArrayLayer = base, | ||||
|         .layerCount = 1, | ||||
|     }; | ||||
      // Horizontal quantities arrive in units of src.bytes_per_pixel; rescale them to units of
      // the image format's bytes-per-block.
|     const u32 bpp = VideoCore::Surface::BytesPerBlock(image->info.format); | ||||
|     const auto convert = [old_bpp = src.bytes_per_pixel, bpp](u32 value) { | ||||
|         return (old_bpp * value) / bpp; | ||||
|     }; | ||||
|     const u32 base_x = convert(src.params.origin.x.Value()); | ||||
|     const u32 base_y = src.params.origin.y.Value(); | ||||
|     const u32 length_x = convert(copy_info.length_x); | ||||
|     const u32 length_y = copy_info.length_y; | ||||
|     VkOffset3D image_offset{ | ||||
|         .x = static_cast<s32>(base_x), | ||||
|         .y = static_cast<s32>(base_y), | ||||
|         .z = 0, | ||||
|     }; | ||||
|     VkExtent3D image_extent{ | ||||
|         .width = length_x, | ||||
|         .height = length_y, | ||||
|         .depth = 1, | ||||
|     }; | ||||
      // Copy of the buffer operand whose pitch is rescaled the same way as the x quantities.
|     auto buff_info(dst); | ||||
|     buff_info.pitch = convert(dst.pitch); | ||||
|     scheduler.RequestOutsideRenderPassOperationContext(); | ||||
      // Everything the command needs is captured by value into the recorded lambda.
|     scheduler.Record([src_image = image->Handle(), dst_buffer = buffer->Handle(), | ||||
|                       buffer_offset = offset, subresources, image_offset, image_extent, | ||||
|                       buff_info](vk::CommandBuffer cmdbuf) { | ||||
|         const std::array buffer_copy_info{ | ||||
|             VkBufferImageCopy{ | ||||
|                 .bufferOffset = buffer_offset, | ||||
|                 .bufferRowLength = buff_info.pitch, | ||||
|                 .bufferImageHeight = buff_info.height, | ||||
|                 .imageSubresource = subresources, | ||||
|                 .imageOffset = image_offset, | ||||
|                 .imageExtent = image_extent, | ||||
|             }, | ||||
|         }; | ||||
|         const VkImageSubresourceRange range{ | ||||
|             .aspectMask = subresources.aspectMask, | ||||
|             .baseMipLevel = subresources.mipLevel, | ||||
|             .levelCount = 1, | ||||
|             .baseArrayLayer = subresources.baseArrayLayer, | ||||
|             .layerCount = 1, | ||||
|         }; | ||||
          // Global barrier issued after the copy: transfer writes (the buffer contents) before
          // any later memory read or write.
|         static constexpr VkMemoryBarrier WRITE_BARRIER{ | ||||
|             .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, | ||||
|             .pNext = nullptr, | ||||
|             .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, | ||||
|             .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, | ||||
|         }; | ||||
          // Before the copy: wait for earlier writes to the image and move it from GENERAL to
          // TRANSFER_SRC_OPTIMAL.
|         const std::array pre_barriers{ | ||||
|             VkImageMemoryBarrier{ | ||||
|                 .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, | ||||
|                 .pNext = nullptr, | ||||
|                 .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | | ||||
|                                  VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | | ||||
|                                  VK_ACCESS_TRANSFER_WRITE_BIT, | ||||
|                 .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, | ||||
|                 .oldLayout = VK_IMAGE_LAYOUT_GENERAL, | ||||
|                 .newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, | ||||
|                 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, | ||||
|                 .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, | ||||
|                 .image = src_image, | ||||
|                 .subresourceRange = range, | ||||
|             }, | ||||
|         }; | ||||
          // After the copy: return the image to GENERAL. The image was only read, so the access
          // masks are left empty.
|         const std::array post_barriers{ | ||||
|             VkImageMemoryBarrier{ | ||||
|                 .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, | ||||
|                 .pNext = nullptr, | ||||
|                 .srcAccessMask = 0, | ||||
|                 .dstAccessMask = 0, | ||||
|                 .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, | ||||
|                 .newLayout = VK_IMAGE_LAYOUT_GENERAL, | ||||
|                 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, | ||||
|                 .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, | ||||
|                 .image = src_image, | ||||
|                 .subresourceRange = range, | ||||
|             }, | ||||
|         }; | ||||
|         cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, | ||||
|                                0, {}, {}, pre_barriers); | ||||
|         cmdbuf.CopyImageToBuffer(src_image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, dst_buffer, | ||||
|                                  buffer_copy_info); | ||||
|         cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, | ||||
|                                0, WRITE_BARRIER, nullptr, post_barriers); | ||||
|     }); | ||||
|     if (is_rescaled) { | ||||
|         image->ScaleUp(true); | ||||
|     } | ||||
|     return true; | ||||
| } | ||||
|  | ||||
/// Records a GPU-side copy from a cached buffer into a cached image.
///
/// @param copy_info Extent of the copy (length_x is rescaled by the bpp ratio, length_y is used as-is).
/// @param src       Pitch-linear buffer operand; looked up through buffer_cache.ObtainBuffer.
/// @param dst       Image operand; looked up through texture_cache.ObtainImage.
/// @return false when the texture cache has no image for `dst`, true once the copy is recorded.
| bool AccelerateDMA::BufferToImage(const Tegra::DMA::ImageCopy& copy_info, | ||||
|                                   const Tegra::DMA::BufferOperand& src, | ||||
|                                   const Tegra::DMA::ImageOperand& dst) { | ||||
      // Both caches are touched below, so both mutexes are taken together.
|     std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; | ||||
|     auto query_image = texture_cache.ObtainImage(dst, true); | ||||
|     if (!query_image) { | ||||
|         return false; | ||||
|     } | ||||
|     auto* image = query_image->first; | ||||
|     auto [level, base] = query_image->second; | ||||
|     const u32 buffer_size = static_cast<u32>(src.pitch * src.height); | ||||
      // The source range is synchronized first; it is only read, so no bookkeeping is requested.
|     const auto [buffer, offset] = buffer_cache.ObtainBuffer( | ||||
|         src.address, buffer_size, VideoCommon::ObtainBufferSynchronize::FullSynchronize, | ||||
|         VideoCommon::ObtainBufferOperation::DoNothing); | ||||
      // A rescaled image is scaled down before the copy is recorded and scaled up again afterwards.
|     const bool is_rescaled = image->IsRescaled(); | ||||
|     if (is_rescaled) { | ||||
|         image->ScaleDown(true); | ||||
|     } | ||||
|     VkImageSubresourceLayers subresources{ | ||||
|         .aspectMask = image->AspectMask(), | ||||
|         .mipLevel = level, | ||||
|         .baseArrayLayer = base, | ||||
|         .layerCount = 1, | ||||
|     }; | ||||
      // Horizontal quantities arrive in units of dst.bytes_per_pixel; rescale them to units of
      // the image format's bytes-per-block.
|     const u32 bpp = VideoCore::Surface::BytesPerBlock(image->info.format); | ||||
|     const auto convert = [old_bpp = dst.bytes_per_pixel, bpp](u32 value) { | ||||
|         return (old_bpp * value) / bpp; | ||||
|     }; | ||||
|     const u32 base_x = convert(dst.params.origin.x.Value()); | ||||
|     const u32 base_y = dst.params.origin.y.Value(); | ||||
|     const u32 length_x = convert(copy_info.length_x); | ||||
|     const u32 length_y = copy_info.length_y; | ||||
|     VkOffset3D image_offset{ | ||||
|         .x = static_cast<s32>(base_x), | ||||
|         .y = static_cast<s32>(base_y), | ||||
|         .z = 0, | ||||
|     }; | ||||
|     VkExtent3D image_extent{ | ||||
|         .width = length_x, | ||||
|         .height = length_y, | ||||
|         .depth = 1, | ||||
|     }; | ||||
      // Copy of the buffer operand whose pitch is rescaled the same way as the x quantities.
|     auto buff_info(src); | ||||
|     buff_info.pitch = convert(src.pitch); | ||||
|     scheduler.RequestOutsideRenderPassOperationContext(); | ||||
      // Everything the command needs is captured by value into the recorded lambda.
|     scheduler.Record([dst_image = image->Handle(), src_buffer = buffer->Handle(), | ||||
|                       buffer_offset = offset, subresources, image_offset, image_extent, | ||||
|                       buff_info](vk::CommandBuffer cmdbuf) { | ||||
|         const std::array buffer_copy_info{ | ||||
|             VkBufferImageCopy{ | ||||
|                 .bufferOffset = buffer_offset, | ||||
|                 .bufferRowLength = buff_info.pitch, | ||||
|                 .bufferImageHeight = buff_info.height, | ||||
|                 .imageSubresource = subresources, | ||||
|                 .imageOffset = image_offset, | ||||
|                 .imageExtent = image_extent, | ||||
|             }, | ||||
|         }; | ||||
|         const VkImageSubresourceRange range{ | ||||
|             .aspectMask = subresources.aspectMask, | ||||
|             .baseMipLevel = subresources.mipLevel, | ||||
|             .levelCount = 1, | ||||
|             .baseArrayLayer = subresources.baseArrayLayer, | ||||
|             .layerCount = 1, | ||||
|         }; | ||||
          // Global barrier issued before the copy: earlier memory writes (the buffer contents)
          // before the transfer reads and writes.
|         static constexpr VkMemoryBarrier READ_BARRIER{ | ||||
|             .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, | ||||
|             .pNext = nullptr, | ||||
|             .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT, | ||||
|             .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, | ||||
|         }; | ||||
          // Before the copy: wait for earlier writes to the image and move it from GENERAL to
          // TRANSFER_DST_OPTIMAL. The image is the copy destination, so the destination access
          // is a transfer write.
|         const std::array pre_barriers{ | ||||
|             VkImageMemoryBarrier{ | ||||
|                 .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, | ||||
|                 .pNext = nullptr, | ||||
|                 .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | | ||||
|                                  VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | | ||||
|                                  VK_ACCESS_TRANSFER_WRITE_BIT, | ||||
|                 .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, | ||||
|                 .oldLayout = VK_IMAGE_LAYOUT_GENERAL, | ||||
|                 .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, | ||||
|                 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, | ||||
|                 .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, | ||||
|                 .image = dst_image, | ||||
|                 .subresourceRange = range, | ||||
|             }, | ||||
|         }; | ||||
          // After the copy: return the image to GENERAL and make the transfer write available to
          // every later access. No global memory barrier accompanies this transition, so the
          // access masks have to be set on the image barrier itself.
|         const std::array post_barriers{ | ||||
|             VkImageMemoryBarrier{ | ||||
|                 .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, | ||||
|                 .pNext = nullptr, | ||||
|                 .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, | ||||
|                 .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, | ||||
|                 .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, | ||||
|                 .newLayout = VK_IMAGE_LAYOUT_GENERAL, | ||||
|                 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, | ||||
|                 .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, | ||||
|                 .image = dst_image, | ||||
|                 .subresourceRange = range, | ||||
|             }, | ||||
|         }; | ||||
|         cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, | ||||
|                                0, READ_BARRIER, {}, pre_barriers); | ||||
          // The layout passed here must be the one the image is in when the copy executes, i.e.
          // the layout established by pre_barriers.
|         cmdbuf.CopyBufferToImage(src_buffer, dst_image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, | ||||
|                                  buffer_copy_info); | ||||
|         cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, | ||||
|                                0, nullptr, nullptr, post_barriers); | ||||
|     }); | ||||
|     if (is_rescaled) { | ||||
|         image->ScaleUp(); | ||||
|     } | ||||
|     return true; | ||||
| } | ||||
|  | ||||
| void RasterizerVulkan::UpdateDynamicStates() { | ||||
|     auto& regs = maxwell3d->regs; | ||||
|     UpdateViewportsState(regs); | ||||
|   | ||||
| @@ -45,14 +45,23 @@ class StateTracker; | ||||
|  | ||||
/// Implementation of Tegra::Engines::AccelerateDMAInterface. It overrides the buffer copy/clear
/// entry points and the two image<->buffer entry points, and stores references to a buffer cache,
/// a texture cache and a scheduler.
| class AccelerateDMA : public Tegra::Engines::AccelerateDMAInterface { | ||||
| public: | ||||
      // NOTE(review): this listing is a diff hunk, so both the one-argument and the three-argument
      // constructor rows appear here - confirm which declaration the final header keeps.
|     explicit AccelerateDMA(BufferCache& buffer_cache); | ||||
|     explicit AccelerateDMA(BufferCache& buffer_cache, TextureCache& texture_cache, | ||||
|                            Scheduler& scheduler); | ||||
|  | ||||
      // Buffer-to-buffer copy request of `amount` units between two GPU virtual addresses.
      // NOTE(review): the meaning of the bool result is not visible here - presumably "handled".
|     bool BufferCopy(GPUVAddr start_address, GPUVAddr end_address, u64 amount) override; | ||||
|  | ||||
      // Clear request: `amount` units starting at the given GPU virtual address, using `value`.
|     bool BufferClear(GPUVAddr src_address, u64 amount, u32 value) override; | ||||
|  | ||||
      // Copy described by `copy_info` from an image operand into a buffer operand.
|     bool ImageToBuffer(const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::ImageOperand& src, | ||||
|                        const Tegra::DMA::BufferOperand& dst) override; | ||||
|  | ||||
      // Copy described by `copy_info` from a buffer operand into an image operand.
|     bool BufferToImage(const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::BufferOperand& src, | ||||
|                        const Tegra::DMA::ImageOperand& dst) override; | ||||
|  | ||||
| private: | ||||
      // Non-owning references to the collaborators used by the overrides above.
|     BufferCache& buffer_cache; | ||||
|     TextureCache& texture_cache; | ||||
|     Scheduler& scheduler; | ||||
| }; | ||||
|  | ||||
| class RasterizerVulkan final : public VideoCore::RasterizerAccelerated, | ||||
|   | ||||
| @@ -864,13 +864,19 @@ void TextureCacheRuntime::ReinterpretImage(Image& dst, Image& src, | ||||
|     const VkImageAspectFlags src_aspect_mask = src.AspectMask(); | ||||
|     const VkImageAspectFlags dst_aspect_mask = dst.AspectMask(); | ||||
|  | ||||
|     std::ranges::transform(copies, vk_in_copies.begin(), [src_aspect_mask](const auto& copy) { | ||||
|         return MakeBufferImageCopy(copy, true, src_aspect_mask); | ||||
|     }); | ||||
|     const auto bpp_in = BytesPerBlock(src.info.format) / DefaultBlockWidth(src.info.format); | ||||
|     const auto bpp_out = BytesPerBlock(dst.info.format) / DefaultBlockWidth(dst.info.format); | ||||
|     std::ranges::transform(copies, vk_in_copies.begin(), | ||||
|                            [src_aspect_mask, bpp_in, bpp_out](const auto& copy) { | ||||
|                                auto copy2 = copy; | ||||
|                                copy2.src_offset.x = (bpp_out * copy.src_offset.x) / bpp_in; | ||||
|                                copy2.extent.width = (bpp_out * copy.extent.width) / bpp_in; | ||||
|                                return MakeBufferImageCopy(copy2, true, src_aspect_mask); | ||||
|                            }); | ||||
|     std::ranges::transform(copies, vk_out_copies.begin(), [dst_aspect_mask](const auto& copy) { | ||||
|         return MakeBufferImageCopy(copy, false, dst_aspect_mask); | ||||
|     }); | ||||
|     const u32 img_bpp = BytesPerBlock(src.info.format); | ||||
|     const u32 img_bpp = BytesPerBlock(dst.info.format); | ||||
|     size_t total_size = 0; | ||||
|     for (const auto& copy : copies) { | ||||
|         total_size += copy.extent.width * copy.extent.height * copy.extent.depth * img_bpp; | ||||
|   | ||||
| @@ -216,10 +216,51 @@ ImageInfo::ImageInfo(const Tegra::Engines::Fermi2D::Surface& config) noexcept { | ||||
|             .height = config.height, | ||||
|             .depth = 1, | ||||
|         }; | ||||
|         rescaleable = block.depth == 0; | ||||
|         rescaleable &= size.height > 256; | ||||
|         rescaleable = block.depth == 0 && size.height > 256; | ||||
|         downscaleable = size.height > 512; | ||||
|     } | ||||
| } | ||||
|  | ||||
/// Picks an unsigned-integer PixelFormat for a given number of bytes per pixel:
/// 1 -> R8_UINT, 2 -> R8G8_UINT, 4 -> A8B8G8R8_UINT, 8 -> R16G16B16A16_UINT,
/// 16 -> R32G32B32A32_UINT.
/// Any other value hits UNIMPLEMENTED() and returns PixelFormat::Invalid.
| static PixelFormat ByteSizeToFormat(u32 bytes_per_pixel) { | ||||
|     switch (bytes_per_pixel) { | ||||
|     case 1: | ||||
|         return PixelFormat::R8_UINT; | ||||
|     case 2: | ||||
|         return PixelFormat::R8G8_UINT; | ||||
|     case 4: | ||||
|         return PixelFormat::A8B8G8R8_UINT; | ||||
|     case 8: | ||||
|         return PixelFormat::R16G16B16A16_UINT; | ||||
|     case 16: | ||||
|         return PixelFormat::R32G32B32A32_UINT; | ||||
|     default: | ||||
          // Sizes outside the table above are reported and mapped to the Invalid format.
|         UNIMPLEMENTED(); | ||||
|         return PixelFormat::Invalid; | ||||
|     } | ||||
| } | ||||
|  | ||||
/// Builds an ImageInfo from a DMA image operand.
/// The format is derived only from the operand's bytes-per-pixel value (via ByteSizeToFormat),
/// and the image is described as a single-sample, single-level, single-layer resource.
| ImageInfo::ImageInfo(const Tegra::DMA::ImageOperand& config) noexcept { | ||||
|     const u32 bytes_per_pixel = config.bytes_per_pixel; | ||||
|     format = ByteSizeToFormat(bytes_per_pixel); | ||||
      // A non-zero block depth selects a 3D image; otherwise the operand is treated as 2D.
|     type = config.params.block_size.depth > 0 ? ImageType::e3D : ImageType::e2D; | ||||
|     num_samples = 1; | ||||
|     block = Extent3D{ | ||||
|         .width = config.params.block_size.width, | ||||
|         .height = config.params.block_size.height, | ||||
|         .depth = config.params.block_size.depth, | ||||
|     }; | ||||
|     size = Extent3D{ | ||||
|         .width = config.params.width, | ||||
|         .height = config.params.height, | ||||
|         .depth = config.params.depth, | ||||
|     }; | ||||
|     tile_width_spacing = 0; | ||||
|     resources.levels = 1; | ||||
|     resources.layers = 1; | ||||
      // Both helpers receive *this, so they run after the fields above have been assigned.
|     layer_stride = CalculateLayerStride(*this); | ||||
|     maybe_unaligned_layer_stride = CalculateLayerSize(*this); | ||||
      // Same height thresholds (256 / 512) as the other rescaleable/downscaleable assignment
      // visible in this file's Fermi2D constructor hunk.
|     rescaleable = block.depth == 0 && size.height > 256; | ||||
|     downscaleable = size.height > 512; | ||||
| } | ||||
|  | ||||
| } // namespace VideoCommon | ||||
|   | ||||
| @@ -5,6 +5,7 @@ | ||||
|  | ||||
| #include "video_core/engines/fermi_2d.h" | ||||
| #include "video_core/engines/maxwell_3d.h" | ||||
| #include "video_core/engines/maxwell_dma.h" | ||||
| #include "video_core/surface.h" | ||||
| #include "video_core/texture_cache/types.h" | ||||
|  | ||||
| @@ -19,6 +20,7 @@ struct ImageInfo { | ||||
|     explicit ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs, size_t index) noexcept; | ||||
|     explicit ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs) noexcept; | ||||
|     explicit ImageInfo(const Tegra::Engines::Fermi2D::Surface& config) noexcept; | ||||
|     explicit ImageInfo(const Tegra::DMA::ImageOperand& config) noexcept; | ||||
|  | ||||
|     PixelFormat format = PixelFormat::Invalid; | ||||
|     ImageType type = ImageType::e1D; | ||||
|   | ||||
| @@ -1358,6 +1358,75 @@ std::optional<typename TextureCache<P>::BlitImages> TextureCache<P>::GetBlitImag | ||||
|     }}; | ||||
| } | ||||
|  | ||||
/// Searches the cache for an existing image that a DMA operand described by `info` at `gpu_addr`
/// can be copied from/to.
/// Returns a default-constructed ImageId when the address cannot be translated or nothing matches.
/// When several images match, the one with the greatest modification_tick is returned.
| template <class P> | ||||
| ImageId TextureCache<P>::FindDMAImage(const ImageInfo& info, GPUVAddr gpu_addr) { | ||||
|     std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr); | ||||
|     if (!cpu_addr) { | ||||
          // Second attempt using the overload that also takes the guest size of the image.
|         cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr, CalculateGuestSizeInBytes(info)); | ||||
|         if (!cpu_addr) { | ||||
|             return ImageId{}; | ||||
|         } | ||||
|     } | ||||
|     ImageId image_id{}; | ||||
|     boost::container::small_vector<ImageId, 1> image_ids; | ||||
      // Visitor for every image in the region: records matching ids and reports whether the
      // visited image matched.
      // NOTE(review): how ForEachImageInRegion reacts to the returned bool is not visible here.
|     const auto lambda = [&](ImageId existing_image_id, ImageBase& existing_image) { | ||||
          // Images flagged as Remapped are never candidates.
|         if (True(existing_image.flags & ImageFlagBits::Remapped)) { | ||||
|             return false; | ||||
|         } | ||||
|         if (info.type == ImageType::Linear || existing_image.info.type == ImageType::Linear) | ||||
|             [[unlikely]] { | ||||
              // Pitch-linear path: requires the same GPU address, type and pitch, a size match
              // (strict when the existing image carries the Strong flag) and view-compatible formats.
|             const bool strict_size = True(existing_image.flags & ImageFlagBits::Strong); | ||||
|             const ImageInfo& existing = existing_image.info; | ||||
|             if (existing_image.gpu_addr == gpu_addr && existing.type == info.type && | ||||
|                 existing.pitch == info.pitch && | ||||
|                 IsPitchLinearSameSize(existing, info, strict_size) && | ||||
|                 IsViewCompatible(existing.format, info.format, false, true)) { | ||||
|                 image_id = existing_image_id; | ||||
|                 image_ids.push_back(existing_image_id); | ||||
|                 return true; | ||||
|             } | ||||
|         } else if (IsSubCopy(info, existing_image, gpu_addr)) { | ||||
              // Block-linear path: accept when the operand fits as a sub-copy of the existing image.
|             image_id = existing_image_id; | ||||
|             image_ids.push_back(existing_image_id); | ||||
|             return true; | ||||
|         } | ||||
|         return false; | ||||
|     }; | ||||
|     ForEachImageInRegion(*cpu_addr, CalculateGuestSizeInBytes(info), lambda); | ||||
      // Zero or one match: image_id is either still default-constructed or the single match.
|     if (image_ids.size() <= 1) [[likely]] { | ||||
|         return image_id; | ||||
|     } | ||||
      // Several matches: order by modification_tick and take the maximum.
|     auto image_ids_compare = [this](ImageId a, ImageId b) { | ||||
|         auto& image_a = slot_images[a]; | ||||
|         auto& image_b = slot_images[b]; | ||||
|         return image_a.modification_tick < image_b.modification_tick; | ||||
|     }; | ||||
|     return *std::ranges::max_element(image_ids, image_ids_compare); | ||||
| } | ||||
|  | ||||
/// Looks up a cached image for a DMA image operand.
/// Returns std::nullopt when no image is found, when the operand address does not resolve to a
/// subresource base of that image, or when the image is not flagged as GpuModified.
/// On success returns the image pointer together with a (level, layer) pair taken from the base.
/// `mark_as_modified` is forwarded to PrepareImage.
| template <class P> | ||||
| std::optional<std::pair<typename TextureCache<P>::Image*, std::pair<u32, u32>>> | ||||
| TextureCache<P>::ObtainImage(const Tegra::DMA::ImageOperand& operand, bool mark_as_modified) { | ||||
|     ImageInfo dst_info(operand); | ||||
|     ImageId dst_id = FindDMAImage(dst_info, operand.address); | ||||
|     if (!dst_id) { | ||||
|         return std::nullopt; | ||||
|     } | ||||
|     auto& image = slot_images[dst_id]; | ||||
|     auto base = image.TryFindBase(operand.address); | ||||
|     if (!base) { | ||||
|         return std::nullopt; | ||||
|     } | ||||
|     if (False(image.flags & ImageFlagBits::GpuModified)) { | ||||
|         // No need to waste time on an image that's synced with guest | ||||
|         return std::nullopt; | ||||
|     } | ||||
|     PrepareImage(dst_id, mark_as_modified, false); | ||||
      // The image is looked up again after PrepareImage rather than reusing `image`.
      // NOTE(review): presumably PrepareImage can invalidate the earlier reference - confirm.
|     auto& new_image = slot_images[dst_id]; | ||||
|     lru_cache.Touch(new_image.lru_index, frame_tick); | ||||
|     return std::make_pair(&new_image, std::make_pair(base->level, base->layer)); | ||||
| } | ||||
|  | ||||
| template <class P> | ||||
| SamplerId TextureCache<P>::FindSampler(const TSCEntry& config) { | ||||
|     if (std::ranges::all_of(config.raw, [](u64 value) { return value == 0; })) { | ||||
|   | ||||
| @@ -209,6 +209,9 @@ public: | ||||
|     /// Pop asynchronous downloads | ||||
|     void PopAsyncFlushes(); | ||||
|  | ||||
/// Find a cached image for a DMA image operand; on success yields the image and a pair of u32
/// values (level and layer in the definition shown in this diff), otherwise std::nullopt
|     [[nodiscard]] std::optional<std::pair<Image*, std::pair<u32, u32>>> ObtainImage( | ||||
|         const Tegra::DMA::ImageOperand& operand, bool mark_as_modified); | ||||
|  | ||||
|     /// Return true when a CPU region is modified from the GPU | ||||
|     [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); | ||||
|  | ||||
| @@ -300,6 +303,8 @@ private: | ||||
|     /// Remove joined images from the cache | ||||
|     [[nodiscard]] ImageId JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr); | ||||
|  | ||||
/// Find an existing image usable for a DMA copy described by info at gpu_addr
|     [[nodiscard]] ImageId FindDMAImage(const ImageInfo& info, GPUVAddr gpu_addr); | ||||
|  | ||||
|     /// Return a blit image pair from the given guest blit parameters | ||||
|     [[nodiscard]] std::optional<BlitImages> GetBlitImages( | ||||
|         const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Surface& src, | ||||
|   | ||||
| @@ -54,6 +54,7 @@ enum class RelaxedOptions : u32 { | ||||
|     Format = 1 << 1, | ||||
|     Samples = 1 << 2, | ||||
|     ForceBrokenViews = 1 << 3, | ||||
|     FormatBpp = 1 << 4, | ||||
| }; | ||||
| DECLARE_ENUM_FLAG_OPERATORS(RelaxedOptions) | ||||
|  | ||||
|   | ||||
| @@ -743,6 +743,44 @@ std::vector<ImageCopy> MakeShrinkImageCopies(const ImageInfo& dst, const ImageIn | ||||
|     return copies; | ||||
| } | ||||
|  | ||||
/// Builds one ImageCopy per mip level of `src`, each covering every layer of that level with zero
/// source and destination offsets.
/// The width and height of every copy are multiplied by `up_scale`, shifted right by `down_shift`
/// and clamped to a minimum of 1.
| std::vector<ImageCopy> MakeReinterpretImageCopies(const ImageInfo& src, u32 up_scale, | ||||
|                                                   u32 down_shift) { | ||||
|     std::vector<ImageCopy> copies; | ||||
|     copies.reserve(src.resources.levels); | ||||
|     const bool is_3d = src.type == ImageType::e3D; | ||||
|     for (s32 level = 0; level < src.resources.levels; ++level) { | ||||
|         ImageCopy& copy = copies.emplace_back(); | ||||
          // Source and destination address the same level, starting at layer 0.
|         copy.src_subresource = SubresourceLayers{ | ||||
|             .base_level = level, | ||||
|             .base_layer = 0, | ||||
|             .num_layers = src.resources.layers, | ||||
|         }; | ||||
|         copy.dst_subresource = SubresourceLayers{ | ||||
|             .base_level = level, | ||||
|             .base_layer = 0, | ||||
|             .num_layers = src.resources.layers, | ||||
|         }; | ||||
|         copy.src_offset = Offset3D{ | ||||
|             .x = 0, | ||||
|             .y = 0, | ||||
|             .z = 0, | ||||
|         }; | ||||
|         copy.dst_offset = Offset3D{ | ||||
|             .x = 0, | ||||
|             .y = 0, | ||||
|             .z = 0, | ||||
|         }; | ||||
          // Extent starts from the mip size of this level, then goes through AdjustSamplesSize.
|         const Extent3D mip_size = AdjustMipSize(src.size, level); | ||||
|         copy.extent = AdjustSamplesSize(mip_size, src.num_samples); | ||||
|         if (is_3d) { | ||||
              // For 3D images the depth is overwritten with the base (level 0) depth of src.
|             copy.extent.depth = src.size.depth; | ||||
|         } | ||||
|         copy.extent.width = std::max<u32>((copy.extent.width * up_scale) >> down_shift, 1); | ||||
|         copy.extent.height = std::max<u32>((copy.extent.height * up_scale) >> down_shift, 1); | ||||
|     } | ||||
|     return copies; | ||||
| } | ||||
|  | ||||
| bool IsValidEntry(const Tegra::MemoryManager& gpu_memory, const TICEntry& config) { | ||||
|     const GPUVAddr address = config.Address(); | ||||
|     if (address == 0) { | ||||
| @@ -999,6 +1037,20 @@ bool IsBlockLinearSizeCompatible(const ImageInfo& lhs, const ImageInfo& rhs, u32 | ||||
|     } | ||||
| } | ||||
|  | ||||
/// Size compatibility check between two non-linear images that tolerates different bytes per
/// block: the comparison is done on the row size in bytes (mip width * BytesPerBlock) and on the
/// mip height, each passed through Common::AlignUpLog2 with the GOB X / Y shift respectively.
/// Depth is not compared. Both images are asserted to be non-Linear.
| bool IsBlockLinearSizeCompatibleBPPRelaxed(const ImageInfo& lhs, const ImageInfo& rhs, | ||||
|                                            u32 lhs_level, u32 rhs_level) noexcept { | ||||
|     ASSERT(lhs.type != ImageType::Linear); | ||||
|     ASSERT(rhs.type != ImageType::Linear); | ||||
|     const auto lhs_bpp = BytesPerBlock(lhs.format); | ||||
|     const auto rhs_bpp = BytesPerBlock(rhs.format); | ||||
|     const Extent3D lhs_size = AdjustMipSize(lhs.size, lhs_level); | ||||
|     const Extent3D rhs_size = AdjustMipSize(rhs.size, rhs_level); | ||||
|     return Common::AlignUpLog2(lhs_size.width * lhs_bpp, GOB_SIZE_X_SHIFT) == | ||||
|                Common::AlignUpLog2(rhs_size.width * rhs_bpp, GOB_SIZE_X_SHIFT) && | ||||
|            Common::AlignUpLog2(lhs_size.height, GOB_SIZE_Y_SHIFT) == | ||||
|                Common::AlignUpLog2(rhs_size.height, GOB_SIZE_Y_SHIFT); | ||||
| } | ||||
|  | ||||
| bool IsPitchLinearSameSize(const ImageInfo& lhs, const ImageInfo& rhs, bool strict_size) noexcept { | ||||
|     ASSERT(lhs.type == ImageType::Linear); | ||||
|     ASSERT(rhs.type == ImageType::Linear); | ||||
| @@ -1073,7 +1125,8 @@ std::optional<SubresourceBase> FindSubresource(const ImageInfo& candidate, const | ||||
|         // Format checking is relaxed, but we still have to check for matching bytes per block. | ||||
|         // This avoids creating a view for blits on UE4 titles where formats with different bytes | ||||
|         // per block are aliased. | ||||
|         if (BytesPerBlock(existing.format) != BytesPerBlock(candidate.format)) { | ||||
|         if (BytesPerBlock(existing.format) != BytesPerBlock(candidate.format) && | ||||
|             False(options & RelaxedOptions::FormatBpp)) { | ||||
|             return std::nullopt; | ||||
|         } | ||||
|     } else { | ||||
| @@ -1088,10 +1141,8 @@ std::optional<SubresourceBase> FindSubresource(const ImageInfo& candidate, const | ||||
|     if (existing.type != candidate.type) { | ||||
|         return std::nullopt; | ||||
|     } | ||||
|     if (False(options & RelaxedOptions::Samples)) { | ||||
|         if (existing.num_samples != candidate.num_samples) { | ||||
|             return std::nullopt; | ||||
|         } | ||||
|     if (False(options & RelaxedOptions::Samples) && existing.num_samples != candidate.num_samples) { | ||||
|         return std::nullopt; | ||||
|     } | ||||
|     if (existing.resources.levels < candidate.resources.levels + base->level) { | ||||
|         return std::nullopt; | ||||
| @@ -1101,14 +1152,16 @@ std::optional<SubresourceBase> FindSubresource(const ImageInfo& candidate, const | ||||
|         if (mip_depth < candidate.size.depth + base->layer) { | ||||
|             return std::nullopt; | ||||
|         } | ||||
|     } else { | ||||
|         if (existing.resources.layers < candidate.resources.layers + base->layer) { | ||||
|             return std::nullopt; | ||||
|         } | ||||
|     } else if (existing.resources.layers < candidate.resources.layers + base->layer) { | ||||
|         return std::nullopt; | ||||
|     } | ||||
|     const bool strict_size = False(options & RelaxedOptions::Size); | ||||
|     if (!IsBlockLinearSizeCompatible(existing, candidate, base->level, 0, strict_size)) { | ||||
|         return std::nullopt; | ||||
|         if (False(options & RelaxedOptions::FormatBpp)) { | ||||
|             return std::nullopt; | ||||
|         } else if (!IsBlockLinearSizeCompatibleBPPRelaxed(existing, candidate, base->level, 0)) { | ||||
|             return std::nullopt; | ||||
|         } | ||||
|     } | ||||
|     // TODO: compare block sizes | ||||
|     return base; | ||||
| @@ -1120,6 +1173,31 @@ bool IsSubresource(const ImageInfo& candidate, const ImageBase& image, GPUVAddr | ||||
|         .has_value(); | ||||
| } | ||||
|  | ||||
/// Returns true when `candidate`, placed at `candidate_addr`, fits inside `image` as a copy region:
/// the address must resolve to a subresource base of `image`, the candidate's levels and
/// layers (or depth, for 3D images) must fit starting at that base, and the sizes must pass
/// IsBlockLinearSizeCompatibleBPPRelaxed. Formats are not compared here.
| bool IsSubCopy(const ImageInfo& candidate, const ImageBase& image, GPUVAddr candidate_addr) { | ||||
|     const std::optional<SubresourceBase> base = image.TryFindBase(candidate_addr); | ||||
|     if (!base) { | ||||
|         return false; | ||||
|     } | ||||
|     const ImageInfo& existing = image.info; | ||||
|     if (existing.resources.levels < candidate.resources.levels + base->level) { | ||||
|         return false; | ||||
|     } | ||||
|     if (existing.type == ImageType::e3D) { | ||||
          // NOTE(review): the depth is shifted LEFT by the level. Mip dimensions normally shrink
          // with the level, and the max(1U, ...) clamp only has an effect for a right shift, so
          // `>>` may have been intended - confirm before relying on this bound.
|         const u32 mip_depth = std::max(1U, existing.size.depth << base->level); | ||||
|         if (mip_depth < candidate.size.depth + base->layer) { | ||||
|             return false; | ||||
|         } | ||||
|     } else { | ||||
|         if (existing.resources.layers < candidate.resources.layers + base->layer) { | ||||
|             return false; | ||||
|         } | ||||
|     } | ||||
|     if (!IsBlockLinearSizeCompatibleBPPRelaxed(existing, candidate, base->level, 0)) { | ||||
|         return false; | ||||
|     } | ||||
|     return true; | ||||
| } | ||||
|  | ||||
| void DeduceBlitImages(ImageInfo& dst_info, ImageInfo& src_info, const ImageBase* dst, | ||||
|                       const ImageBase* src) { | ||||
|     const auto original_dst_format = dst_info.format; | ||||
|   | ||||
| @@ -56,6 +56,10 @@ struct OverlapResult { | ||||
|                                                            SubresourceBase base, u32 up_scale = 1, | ||||
|                                                            u32 down_shift = 0); | ||||
|  | ||||
/// Builds the per-level copy list for reinterpreting `src`; `up_scale` and `down_shift` default to
/// 1 and 0.
| [[nodiscard]] std::vector<ImageCopy> MakeReinterpretImageCopies(const ImageInfo& src, | ||||
|                                                                 u32 up_scale = 1, | ||||
|                                                                 u32 down_shift = 0); | ||||
|  | ||||
| [[nodiscard]] bool IsValidEntry(const Tegra::MemoryManager& gpu_memory, const TICEntry& config); | ||||
|  | ||||
| [[nodiscard]] std::vector<BufferImageCopy> UnswizzleImage(Tegra::MemoryManager& gpu_memory, | ||||
| @@ -88,6 +92,9 @@ void SwizzleImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const Ima | ||||
| [[nodiscard]] bool IsPitchLinearSameSize(const ImageInfo& lhs, const ImageInfo& rhs, | ||||
|                                          bool strict_size) noexcept; | ||||
|  | ||||
/// Size compatibility check between two images at the given mip levels.
/// NOTE(review): judging by the name, differing bytes per pixel are tolerated - see definition.
| [[nodiscard]] bool IsBlockLinearSizeCompatibleBPPRelaxed(const ImageInfo& lhs, const ImageInfo& rhs, | ||||
|                                                          u32 lhs_level, u32 rhs_level) noexcept; | ||||
|  | ||||
| [[nodiscard]] std::optional<OverlapResult> ResolveOverlap(const ImageInfo& new_info, | ||||
|                                                           GPUVAddr gpu_addr, VAddr cpu_addr, | ||||
|                                                           const ImageBase& overlap, | ||||
| @@ -106,6 +113,9 @@ void SwizzleImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const Ima | ||||
|                                  GPUVAddr candidate_addr, RelaxedOptions options, bool broken_views, | ||||
|                                  bool native_bgr); | ||||
|  | ||||
/// Reports whether `candidate` at `candidate_addr` can be treated as a copy region of `image`.
| [[nodiscard]] bool IsSubCopy(const ImageInfo& candidate, const ImageBase& image, | ||||
|                              GPUVAddr candidate_addr); | ||||
|  | ||||
| void DeduceBlitImages(ImageInfo& dst_info, ImageInfo& src_info, const ImageBase* dst, | ||||
|                       const ImageBase* src); | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Fernando S
					Fernando S