Fermi2D: Rework blit engine and add a software blitter.

2022-11-05 22:26:38 +01:00
parent 168c9ee341
commit 957840be91
12 changed files with 1431 additions and 18 deletions
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -28,6 +28,10 @@ add_library(video_core STATIC
    dirty_flags.h
    dma_pusher.cpp
    dma_pusher.h
+    engines/sw_blitter/blitter.cpp
+    engines/sw_blitter/blitter.h
+    engines/sw_blitter/converter.cpp
+    engines/sw_blitter/converter.h
    engines/const_buffer_info.h
    engines/engine_interface.h
    engines/engine_upload.cpp
--- a/src/video_core/control/channel_state.cpp
+++ b/src/video_core/control/channel_state.cpp
@@ -20,7 +20,7 @@ void ChannelState::Init(Core::System& system, GPU& gpu) {
    ASSERT(memory_manager);
    dma_pusher = std::make_unique<Tegra::DmaPusher>(system, gpu, *memory_manager, *this);
    maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, *memory_manager);
-    fermi_2d = std::make_unique<Engines::Fermi2D>();
+    fermi_2d = std::make_unique<Engines::Fermi2D>(*memory_manager);
    kepler_compute = std::make_unique<Engines::KeplerCompute>(system, *memory_manager);
    maxwell_dma = std::make_unique<Engines::MaxwellDMA>(system, *memory_manager);
    kepler_memory = std::make_unique<Engines::KeplerMemory>(system, *memory_manager);
--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -3,17 +3,25 @@

 #include "common/assert.h"
 #include "common/logging/log.h"
+#include "common/microprofile.h"
 #include "video_core/engines/fermi_2d.h"
-#include "video_core/memory_manager.h"
+#include "video_core/engines/sw_blitter/blitter.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/surface.h"
+#include "video_core/textures/decoders.h"
+
+MICROPROFILE_DECLARE(GPU_BlitEngine);
+MICROPROFILE_DEFINE(GPU_BlitEngine, "GPU", "Blit Engine", MP_RGB(224, 224, 128));

 using VideoCore::Surface::BytesPerBlock;
 using VideoCore::Surface::PixelFormatFromRenderTargetFormat;

 namespace Tegra::Engines {

-Fermi2D::Fermi2D() {
+using namespace Texture;
+
+Fermi2D::Fermi2D(MemoryManager& memory_manager_) {
+    sw_blitter = std::make_unique<Blitter::SoftwareBlitEngine>(memory_manager_);
    // Nvidia's OpenGL driver seems to assume these values
    regs.src.depth = 1;
    regs.dst.depth = 1;
@@ -42,6 +50,7 @@ void Fermi2D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, u32
 }

 void Fermi2D::Blit() {
+    MICROPROFILE_SCOPE(GPU_BlitEngine);
    LOG_DEBUG(HW_GPU, "called. source address=0x{:x}, destination address=0x{:x}",
              regs.src.Address(), regs.dst.Address());

@@ -52,9 +61,12 @@ void Fermi2D::Blit() {
    UNIMPLEMENTED_IF_MSG(regs.clip_enable != 0, "Clipped blit enabled");

    const auto& args = regs.pixels_from_memory;
+    constexpr s64 null_derivate = 1ULL << 32;
    Config config{
        .operation = regs.operation,
        .filter = args.sample_mode.filter,
+        .must_accelerate = args.du_dx != null_derivate || args.dv_dy != null_derivate ||
+                           args.sample_mode.filter == Filter::Bilinear,
        .dst_x0 = args.dst_x0,
        .dst_y0 = args.dst_y0,
        .dst_x1 = args.dst_x0 + args.dst_width,
@@ -78,8 +90,9 @@ void Fermi2D::Blit() {
        config.src_x1 -= config.src_x0;
        config.src_x0 = 0;
    }
+
    if (!rasterizer->AccelerateSurfaceCopy(src, regs.dst, config)) {
-        UNIMPLEMENTED();
+        sw_blitter->Blit(src, regs.dst, config);
    }
 }

--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@@ -5,6 +5,7 @@

 #include <array>
 #include <cstddef>
+#include <memory>
 #include "common/bit_field.h"
 #include "common/common_funcs.h"
 #include "common/common_types.h"
@@ -21,6 +22,10 @@ class RasterizerInterface;

 namespace Tegra::Engines {

+namespace Blitter {
+class SoftwareBlitEngine;
+}
+
 /**
 * This Engine is known as G80_2D. Documentation can be found in:
 * https://github.com/envytools/envytools/blob/master/rnndb/graph/g80_2d.xml
@@ -32,7 +37,7 @@ namespace Tegra::Engines {

 class Fermi2D final : public EngineInterface {
 public:
-    explicit Fermi2D();
+    explicit Fermi2D(MemoryManager& memory_manager_);
    ~Fermi2D() override;

    /// Binds a rasterizer to this engine.
@@ -286,6 +291,7 @@ public:
    struct Config {
        Operation operation;
        Filter filter;
+        bool must_accelerate;
        s32 dst_x0;
        s32 dst_y0;
        s32 dst_x1;
@@ -298,6 +304,7 @@ public:

 private:
    VideoCore::RasterizerInterface* rasterizer = nullptr;
+    std::unique_ptr<Blitter::SoftwareBlitEngine> sw_blitter;

    /// Performs the copy from the source surface to the destination surface as configured in the
    /// registers.
--- a/src/video_core/engines/sw_blitter/blitter.cpp
+++ b/src/video_core/engines/sw_blitter/blitter.cpp
@@ -0,0 +1,213 @@
+// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include <vector>
+
+#include "video_core/engines/sw_blitter/blitter.h"
+#include "video_core/engines/sw_blitter/converter.h"
+#include "video_core/memory_manager.h"
+#include "video_core/surface.h"
+#include "video_core/textures/decoders.h"
+
+namespace Tegra {
+class MemoryManager;
+}
+
+using VideoCore::Surface::BytesPerBlock;
+using VideoCore::Surface::PixelFormatFromRenderTargetFormat;
+
+namespace Tegra::Engines::Blitter {
+
+using namespace Texture;
+
+namespace {
+
+/// Scales a tightly packed image using nearest-neighbor sampling.
+///
+/// Source coordinates are stepped in 32.32 fixed point. Each coordinate is truncated to a whole
+/// texel on its own before the linear index is formed, so a fractional row position can never
+/// leak into the column (which would sample the wrong texel and could read past the input).
+///
+/// @param input      Source texels, laid out as src_height rows of src_width texels.
+/// @param output     Destination texels, laid out as dst_height rows of dst_width texels.
+/// @param src_width  Source width in texels.
+/// @param src_height Source height in texels.
+/// @param dst_width  Destination width in texels.
+/// @param dst_height Destination height in texels.
+/// @param bpp        Size of one texel in bytes (used as a byte count for both buffers).
+void NeighrestNeighbor(std::span<u8> input, std::span<u8> output, u32 src_width, u32 src_height,
+                       u32 dst_width, u32 dst_height, size_t bpp) {
+    if (dst_width == 0 || dst_height == 0) {
+        // Nothing to write; also avoids dividing by zero when computing the steps below.
+        return;
+    }
+    // Source step per destination texel, in 32.32 fixed point.
+    const size_t dx_du = std::llround((static_cast<f64>(src_width) / dst_width) * (1ULL << 32));
+    const size_t dy_dv = std::llround((static_cast<f64>(src_height) / dst_height) * (1ULL << 32));
+    size_t src_y = 0;
+    for (u32 y = 0; y < dst_height; y++) {
+        // Drop the fractional part of the row before scaling it by the row width.
+        const size_t src_row = (src_y >> 32) * src_width;
+        const size_t dst_row = static_cast<size_t>(y) * dst_width;
+        size_t src_x = 0;
+        for (u32 x = 0; x < dst_width; x++) {
+            const size_t read_from = (src_row + (src_x >> 32)) * bpp;
+            const size_t write_to = (dst_row + x) * bpp;
+
+            std::memcpy(&output[write_to], &input[read_from], bpp);
+            src_x += dx_du;
+        }
+        src_y += dy_dv;
+    }
+}
+
+/// Nearest-neighbor scaling over the intermediate representation, where every texel occupies
+/// four consecutive f32 values.
+///
+/// Source coordinates are stepped in 32.32 fixed point. Each coordinate is truncated to a whole
+/// texel on its own before the linear index is formed, so a fractional row position can never
+/// leak into the column (which would sample the wrong texel and could read past the input).
+///
+/// @param input      Source texels, src_height rows of src_width texels, 4 f32 each.
+/// @param output     Destination texels, dst_height rows of dst_width texels, 4 f32 each.
+/// @param src_width  Source width in texels.
+/// @param src_height Source height in texels.
+/// @param dst_width  Destination width in texels.
+/// @param dst_height Destination height in texels.
+void NeighrestNeighborFast(std::span<f32> input, std::span<f32> output, u32 src_width,
+                           u32 src_height, u32 dst_width, u32 dst_height) {
+    if (dst_width == 0 || dst_height == 0) {
+        // Nothing to write; also avoids dividing by zero when computing the steps below.
+        return;
+    }
+    // Source step per destination texel, in 32.32 fixed point.
+    const size_t dx_du = std::llround((static_cast<f64>(src_width) / dst_width) * (1ULL << 32));
+    const size_t dy_dv = std::llround((static_cast<f64>(src_height) / dst_height) * (1ULL << 32));
+    size_t src_y = 0;
+    for (u32 y = 0; y < dst_height; y++) {
+        // Drop the fractional part of the row before scaling it by the row width.
+        const size_t src_row = (src_y >> 32) * src_width;
+        const size_t dst_row = static_cast<size_t>(y) * dst_width;
+        size_t src_x = 0;
+        for (u32 x = 0; x < dst_width; x++) {
+            const size_t read_from = (src_row + (src_x >> 32)) * 4;
+            const size_t write_to = (dst_row + x) * 4;
+
+            std::memcpy(&output[write_to], &input[read_from], sizeof(f32) * 4);
+            src_x += dx_du;
+        }
+        src_y += dy_dv;
+    }
+}
+
+/*
+void Bilinear(std::span<f32> input, std::span<f32> output, size_t src_width,
+                       size_t src_height, size_t dst_width, size_t dst_height) {
+    const auto inv_lerp = [](u32 coord, u32 end) { return
+static_cast<f32>(std::min(std::max(static_cast<s32>(coord), 0), end - 1)) / (end); };
+
+
+    for (u32 y = 0; y < dst_height; y++) {
+        const f32 ty_0 = inv_lerp(y, dst_extent_y);
+        const f32 ty_1 = inv_lerp(y + 1, dst_extent_y);
+        for (u32 x = 0; x < dst_width; x++) {
+            const f32 tx_0 = inv_lerp(x, dst_extent_x);
+            const f32 tx_1 = inv_lerp(x + 1, dst_extent_x);
+            const std::array<f32, 4> get_pixel = [&](f32 tx, f32 ty, u32 width, u32 height) {
+                std::array<f32, 4> result{};
+
+                return (std::llround(width * tx) + std::llround(height * ty) * width) * 4;
+            };
+            std::array<f32, 4> result{};
+
+            const size_t read_from = get_pixel(src_width, src_height);
+            const size_t write_to = get_pixel(tx_0, ty_0, dst_width, dst_height);
+
+            std::memcpy(&output[write_to], &input[read_from], bpp);
+        }
+    }
+}
+*/
+
+} // namespace
+
+/// Private state of the software blitter. The buffers are members so that Blit() resizes and
+/// reuses them across calls instead of creating new vectors each time.
+struct SoftwareBlitEngine::BlitEngineImpl {
+    /// Raw bytes of a whole surface as read from / written to guest memory.
+    std::vector<u8> tmp_buffer;
+    /// Source rectangle extracted from the source surface.
+    std::vector<u8> src_buffer;
+    /// Destination rectangle, in the destination format, before it is merged into the surface.
+    std::vector<u8> dst_buffer;
+    /// Source rectangle in the intermediate representation (sized to 4 f32 per pixel).
+    std::vector<f32> intermediate_src;
+    /// Destination rectangle in the intermediate representation (sized to 4 f32 per pixel).
+    std::vector<f32> intermediate_dst;
+    /// Provides the per-format converters used when source and destination formats differ.
+    ConverterFactory converter_factory;
+};
+
+SoftwareBlitEngine::SoftwareBlitEngine(MemoryManager& memory_manager_)
+    : memory_manager{memory_manager_} {
+    impl = std::make_unique<BlitEngineImpl>();
+}
+
+// Defaulted here rather than in the header: BlitEngineImpl is only forward-declared there, and it
+// has to be a complete type at the point where the std::unique_ptr member gets destroyed.
+SoftwareBlitEngine::~SoftwareBlitEngine() = default;
+
+/// Performs a blit on the CPU, going through guest memory.
+///
+/// The source surface is read from guest memory and the source rectangle described by config is
+/// extracted into a tightly packed buffer. When source and destination differ in format or
+/// extent the rectangle is converted/scaled with nearest-neighbor sampling; otherwise it is
+/// passed through untouched. The result is merged into the current contents of the destination
+/// surface, which is then written back to guest memory.
+///
+/// Bilinear filtering is not implemented and is reported through UNIMPLEMENTED_IF.
+///
+/// @param src    Source surface description.
+/// @param dst    Destination surface description.
+/// @param config Source/destination rectangles and the filter mode.
+/// @return Always true.
+bool SoftwareBlitEngine::Blit(Fermi2D::Surface& src, Fermi2D::Surface& dst,
+                              Fermi2D::Config& config) {
+    UNIMPLEMENTED_IF(config.filter == Fermi2D::Filter::Bilinear);
+
+    // Size in bytes of the whole surface as it is stored in guest memory.
+    const auto get_surface_size = [](Fermi2D::Surface& surface, u32 bytes_per_pixel) {
+        if (surface.linear == Fermi2D::MemoryLayout::BlockLinear) {
+            return CalculateSize(true, bytes_per_pixel, surface.width, surface.height,
+                                 surface.depth, surface.block_height, surface.block_depth);
+        }
+        // Widen before multiplying so that large surfaces cannot wrap around in 32 bits.
+        return static_cast<size_t>(surface.pitch) * surface.height;
+    };
+    // Copies a rectangle between a pitch-linear surface and a tightly packed buffer.
+    // unpack == false: surface (input) -> packed rectangle (output).
+    // unpack == true:  packed rectangle (input) -> surface (output).
+    // (x0, y0) is the rectangle origin inside the surface; rows of the packed buffer always
+    // start at zero, so only the surface side is offset by y0.
+    const auto process_pitch_linear = [](bool unpack, std::span<u8> input, std::span<u8> output,
+                                         u32 extent_x, u32 extent_y, u32 pitch, u32 x0, u32 y0,
+                                         size_t bpp) {
+        const size_t base_offset = x0 * bpp;
+        const size_t copy_size = extent_x * bpp;
+        for (size_t y = 0; y < extent_y; y++) {
+            const size_t first_offset = (y + y0) * pitch + base_offset;
+            const size_t second_offset = y * copy_size;
+            u8* write_to = unpack ? &output[first_offset] : &output[second_offset];
+            const u8* read_from = unpack ? &input[second_offset] : &input[first_offset];
+            std::memcpy(write_to, read_from, copy_size);
+        }
+    };
+
+    const u32 src_extent_x = config.src_x1 - config.src_x0;
+    const u32 src_extent_y = config.src_y1 - config.src_y0;
+
+    const u32 dst_extent_x = config.dst_x1 - config.dst_x0;
+    const u32 dst_extent_y = config.dst_y1 - config.dst_y0;
+    const auto src_bytes_per_pixel = BytesPerBlock(PixelFormatFromRenderTargetFormat(src.format));
+    const auto dst_bytes_per_pixel = BytesPerBlock(PixelFormatFromRenderTargetFormat(dst.format));
+
+    // Fetch the whole source surface from guest memory.
+    const size_t src_size = get_surface_size(src, src_bytes_per_pixel);
+    impl->tmp_buffer.resize(src_size);
+    memory_manager.ReadBlock(src.Address(), impl->tmp_buffer.data(), src_size);
+
+    // Byte sizes of the packed rectangles; widened before multiplying to avoid 32-bit wrap.
+    const size_t src_copy_size =
+        static_cast<size_t>(src_extent_x) * src_extent_y * src_bytes_per_pixel;
+
+    const size_t dst_copy_size =
+        static_cast<size_t>(dst_extent_x) * dst_extent_y * dst_bytes_per_pixel;
+
+    impl->src_buffer.resize(src_copy_size);
+
+    // A blit with identical format and extents needs neither conversion nor scaling.
+    const bool no_passthrough =
+        src.format != dst.format || src_extent_x != dst_extent_x || src_extent_y != dst_extent_y;
+
+    // Same format on both sides: scale the raw texels directly.
+    const auto conversion_phase_same_format = [&]() {
+        NeighrestNeighbor(impl->src_buffer, impl->dst_buffer, src_extent_x, src_extent_y,
+                          dst_extent_x, dst_extent_y, dst_bytes_per_pixel);
+    };
+
+    // Different formats: go through the intermediate representation (4 f32 per pixel).
+    const auto conversion_phase_ir = [&]() {
+        auto* input_converter = impl->converter_factory.GetFormatConverter(src.format);
+        impl->intermediate_src.resize((src_copy_size / src_bytes_per_pixel) * 4);
+        impl->intermediate_dst.resize((dst_copy_size / dst_bytes_per_pixel) * 4);
+        input_converter->ConvertTo(impl->src_buffer, impl->intermediate_src);
+
+        NeighrestNeighborFast(impl->intermediate_src, impl->intermediate_dst, src_extent_x,
+                              src_extent_y, dst_extent_x, dst_extent_y);
+
+        auto* output_converter = impl->converter_factory.GetFormatConverter(dst.format);
+        output_converter->ConvertFrom(impl->intermediate_dst, impl->dst_buffer);
+    };
+
+    // Do the actual blit: first extract the source rectangle into a packed buffer.
+
+    impl->dst_buffer.resize(dst_copy_size);
+    if (src.linear == Fermi2D::MemoryLayout::BlockLinear) {
+        UnswizzleSubrect(impl->src_buffer, impl->tmp_buffer, src_bytes_per_pixel, src.width,
+                         src.height, src.depth, config.src_x0, config.src_y0, src_extent_x,
+                         src_extent_y, src.block_height, src.block_depth,
+                         src_extent_x * src_bytes_per_pixel);
+    } else {
+        process_pitch_linear(false, impl->tmp_buffer, impl->src_buffer, src_extent_x, src_extent_y,
+                             src.pitch, config.src_x0, config.src_y0, src_bytes_per_pixel);
+    }
+
+    // Conversion phase
+    if (no_passthrough) {
+        if (src.format != dst.format) {
+            conversion_phase_ir();
+        } else {
+            conversion_phase_same_format();
+        }
+    } else {
+        // Nothing to convert: the extracted rectangle is already the result.
+        impl->dst_buffer.swap(impl->src_buffer);
+    }
+
+    // Read the destination surface so texels outside of the rectangle are preserved.
+    const size_t dst_size = get_surface_size(dst, dst_bytes_per_pixel);
+    impl->tmp_buffer.resize(dst_size);
+    memory_manager.ReadBlock(dst.Address(), impl->tmp_buffer.data(), dst_size);
+
+    if (dst.linear == Fermi2D::MemoryLayout::BlockLinear) {
+        SwizzleSubrect(impl->tmp_buffer, impl->dst_buffer, dst_bytes_per_pixel, dst.width,
+                       dst.height, dst.depth, config.dst_x0, config.dst_y0, dst_extent_x,
+                       dst_extent_y, dst.block_height, dst.block_depth,
+                       dst_extent_x * dst_bytes_per_pixel);
+    } else {
+        process_pitch_linear(true, impl->dst_buffer, impl->tmp_buffer, dst_extent_x, dst_extent_y,
+                             dst.pitch, config.dst_x0, config.dst_y0,
+                             static_cast<size_t>(dst_bytes_per_pixel));
+    }
+    memory_manager.WriteBlock(dst.Address(), impl->tmp_buffer.data(), dst_size);
+    return true;
+}
+
+} // namespace Tegra::Engines::Blitter
--- a/src/video_core/engines/sw_blitter/blitter.h
+++ b/src/video_core/engines/sw_blitter/blitter.h
@@ -0,0 +1,27 @@
+// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#pragma once
+
+#include "video_core/engines/fermi_2d.h"
+
+namespace Tegra {
+class MemoryManager;
+}
+
+namespace Tegra::Engines::Blitter {
+
+/// CPU implementation of the 2D engine's blit operation, working directly on guest memory
+/// through the memory manager it was constructed with.
+class SoftwareBlitEngine {
+public:
+    /// @param memory_manager_ Memory manager used to read and write the blitted surfaces.
+    ///                        Held by reference, so it must outlive this object.
+    explicit SoftwareBlitEngine(MemoryManager& memory_manager_);
+    ~SoftwareBlitEngine();
+
+    /// Copies the source rectangle of src into the destination rectangle of dst.
+    /// @param src         Source surface description.
+    /// @param dst         Destination surface description.
+    /// @param copy_config Rectangles and filter mode of the blit.
+    /// @return True when the blit was performed.
+    bool Blit(Fermi2D::Surface& src, Fermi2D::Surface& dst, Fermi2D::Config& copy_config);
+
+private:
+    MemoryManager& memory_manager;
+    /// Private state, defined in the source file to keep it out of this header.
+    struct BlitEngineImpl;
+    std::unique_ptr<BlitEngineImpl> impl;
+};
+
+} // namespace Tegra::Engines::Blitter
--- a/src/video_core/engines/sw_blitter/converter.cpp
+++ b/src/video_core/engines/sw_blitter/converter.cpp
--- a/src/video_core/engines/sw_blitter/converter.h
+++ b/src/video_core/engines/sw_blitter/converter.h
@@ -0,0 +1,35 @@
+// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include <memory>
+#include <span>
+
+#include "common/common_types.h"
+
+#pragma once
+
+#include "video_core/gpu.h"
+
+namespace Tegra::Engines::Blitter {
+
+/// Interface for converting between the raw bytes of one render target format and the
+/// blitter's f32 intermediate representation.
+class Converter {
+public:
+    /// Virtual so that concrete converters can be destroyed safely through a Converter pointer.
+    virtual ~Converter() = default;
+
+    /// Converts raw texels in this converter's format into the f32 intermediate representation.
+    /// The software blitter sizes the output to four f32 values per input pixel.
+    virtual void ConvertTo(std::span<u8> input, std::span<f32> output) = 0;
+
+    /// Converts texels from the f32 intermediate representation back into raw texels in this
+    /// converter's format.
+    virtual void ConvertFrom(std::span<f32> input, std::span<u8> output) = 0;
+};
+
+/// Hands out Converter instances for render target formats.
+class ConverterFactory {
+public:
+    ConverterFactory();
+    ~ConverterFactory();
+
+    /// Returns the converter to use for the given render target format.
+    /// NOTE(review): the returned pointer is presumably owned by the factory and must not be
+    /// deleted by the caller - confirm against converter.cpp.
+    Converter* GetFormatConverter(RenderTargetFormat format);
+
+private:
+    /// NOTE(review): presumably creates the converter for a format that has not been requested
+    /// before - confirm against converter.cpp.
+    Converter* BuildConverter(RenderTargetFormat format);
+
+    /// Private state, defined in the source file to keep it out of this header.
+    struct ConverterFactoryImpl;
+    std::unique_ptr<ConverterFactoryImpl> impl;
+};
+
+} // namespace Tegra::Engines::Blitter
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -466,8 +466,7 @@ bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surf
                                             const Tegra::Engines::Fermi2D::Config& copy_config) {
    MICROPROFILE_SCOPE(OpenGL_Blits);
    std::scoped_lock lock{texture_cache.mutex};
-    texture_cache.BlitImage(dst, src, copy_config);
-    return true;
+    return texture_cache.BlitImage(dst, src, copy_config);
 }

 Tegra::Engines::AccelerateDMAInterface& RasterizerOpenGL::AccessAccelerateDMA() {
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -542,8 +542,7 @@ bool RasterizerVulkan::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surf
                                             const Tegra::Engines::Fermi2D::Surface& dst,
                                             const Tegra::Engines::Fermi2D::Config& copy_config) {
    std::scoped_lock lock{texture_cache.mutex};
-    texture_cache.BlitImage(dst, src, copy_config);
-    return true;
+    return texture_cache.BlitImage(dst, src, copy_config);
 }

 Tegra::Engines::AccelerateDMAInterface& RasterizerVulkan::AccessAccelerateDMA() {
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -506,10 +506,14 @@ void TextureCache<P>::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t siz
 }

 template <class P>
-void TextureCache<P>::BlitImage(const Tegra::Engines::Fermi2D::Surface& dst,
+bool TextureCache<P>::BlitImage(const Tegra::Engines::Fermi2D::Surface& dst,
                                const Tegra::Engines::Fermi2D::Surface& src,
                                const Tegra::Engines::Fermi2D::Config& copy) {
-    const BlitImages images = GetBlitImages(dst, src, copy);
+    const auto result = GetBlitImages(dst, src, copy);
+    if (!result) {
+        return false;
+    }
+    const BlitImages images = *result;
    const ImageId dst_id = images.dst_id;
    const ImageId src_id = images.src_id;

@@ -596,6 +600,7 @@ void TextureCache<P>::BlitImage(const Tegra::Engines::Fermi2D::Surface& dst,
        runtime.BlitImage(dst_framebuffer, dst_view, src_view, dst_region, src_region, copy.filter,
                          copy.operation);
    }
+    return true;
 }

 template <class P>
@@ -1133,7 +1138,7 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
 }

 template <class P>
-typename TextureCache<P>::BlitImages TextureCache<P>::GetBlitImages(
+std::optional<typename TextureCache<P>::BlitImages> TextureCache<P>::GetBlitImages(
    const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Surface& src,
    const Tegra::Engines::Fermi2D::Config& copy) {

@@ -1154,6 +1159,20 @@ typename TextureCache<P>::BlitImages TextureCache<P>::GetBlitImages(
        has_deleted_images = false;
        src_id = FindImage(src_info, src_addr, try_options);
        dst_id = FindImage(dst_info, dst_addr, try_options);
+        if (!copy.must_accelerate) {
+            do {
+                if (!src_id && !dst_id) {
+                    return std::nullopt;
+                }
+                if (src_id && True(slot_images[src_id].flags & ImageFlagBits::GpuModified)) {
+                    break;
+                }
+                if (dst_id && True(slot_images[dst_id].flags & ImageFlagBits::GpuModified)) {
+                    break;
+                }
+                return std::nullopt;
+            } while (false);
+        }
        const ImageBase* const src_image = src_id ? &slot_images[src_id] : nullptr;
        if (src_image && src_image->info.num_samples > 1) {
            RelaxedOptions find_options{FIND_OPTIONS | RelaxedOptions::ForceBrokenViews};
@@ -1194,12 +1213,12 @@ typename TextureCache<P>::BlitImages TextureCache<P>::GetBlitImages(
            dst_id = FindOrInsertImage(dst_info, dst_addr, RelaxedOptions{});
        } while (has_deleted_images);
    }
-    return BlitImages{
+    return {BlitImages{
        .dst_id = dst_id,
        .src_id = src_id,
        .dst_format = dst_info.format,
        .src_format = src_info.format,
-    };
+    }};
 }

 template <class P>
--- a/src/video_core/texture_cache/texture_cache_base.h
+++ b/src/video_core/texture_cache/texture_cache_base.h
@@ -174,7 +174,7 @@ public:
    void UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t size);

    /// Blit an image with the given parameters
-    void BlitImage(const Tegra::Engines::Fermi2D::Surface& dst,
+    bool BlitImage(const Tegra::Engines::Fermi2D::Surface& dst,
                   const Tegra::Engines::Fermi2D::Surface& src,
                   const Tegra::Engines::Fermi2D::Config& copy);

@@ -285,9 +285,9 @@ private:
    [[nodiscard]] ImageId JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr);

    /// Return a blit image pair from the given guest blit parameters
-    [[nodiscard]] BlitImages GetBlitImages(const Tegra::Engines::Fermi2D::Surface& dst,
-                                           const Tegra::Engines::Fermi2D::Surface& src,
-                                           const Tegra::Engines::Fermi2D::Config& copy);
+    [[nodiscard]] std::optional<BlitImages> GetBlitImages(
+        const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Surface& src,
+        const Tegra::Engines::Fermi2D::Config& copy);

    /// Find or create a sampler from a guest descriptor sampler
    [[nodiscard]] SamplerId FindSampler(const TSCEntry& config);