yuzu/src/video_core/dma_pusher.h

// SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later

#pragma once

#include <array>
#include <cstddef>
#include <queue>
#include <span>
#include <type_traits>
#include <utility>
#include <vector>

#include <boost/container/small_vector.hpp>

#include "common/bit_field.h"
#include "common/common_types.h"
#include "common/scratch_buffer.h"
#include "video_core/engines/engine_interface.h"
#include "video_core/engines/puller.h"

// Forward declaration: this header only refers to Core::System by reference, so the full
// definition is not needed here.
namespace Core {
class System;
}

namespace Tegra {

// Forward declarations: these types are only used by reference in this header
// (DmaPusher constructor parameters and reference members).
namespace Control {
struct ChannelState;
}

class GPU;
class MemoryManager;

/// Values stored in the 3-bit `mode` field of a CommandHeader.
/// NOTE(review): how each mode is processed is not visible in this header; the enumerator names
/// suggest whether the method index advances between arguments - confirm against
/// dma_pusher.cpp.
enum class SubmissionMode : u32 {
    IncreasingOld = 0,
    Increasing = 1,
    NonIncreasingOld = 2,
    NonIncreasing = 3,
    Inline = 4,
    IncreaseOnce = 5
};

// Note that, traditionally, methods are treated as 4-byte addressable locations, and hence
// their numbers are written down multiplied by 4 in the docs. Here they are NOT multiplied by 4,
// so the values you see in the docs may be 4 times the values listed below.
// Register documentation:
// https://github.com/NVIDIA/open-gpu-doc/blob/ab27fc22db5de0d02a4cabe08e555663b62db4d4/classes/host/cla26f.h
//
// Register Description (approx):
// https://github.com/NVIDIA/open-gpu-doc/blob/ab27fc22db5de0d02a4cabe08e555663b62db4d4/manuals/volta/gv100/dev_pbdma.ref.txt
//
// NOTE(review): NonPullerMethods has the same value as DmaPusher::non_puller_methods (0x40);
// presumably it marks the first method index that is not handled by the puller - confirm
// against the puller implementation.
enum class BufferMethods : u32 {
    BindObject = 0x0,
    Illegal = 0x1,
    Nop = 0x2,
    SemaphoreAddressHigh = 0x4,
    SemaphoreAddressLow = 0x5,
    SemaphoreSequencePayload = 0x6,
    SemaphoreOperation = 0x7,
    NonStallInterrupt = 0x8,
    WrcacheFlush = 0x9,
    MemOpA = 0xA,
    MemOpB = 0xB,
    MemOpC = 0xC,
    MemOpD = 0xD,
    RefCnt = 0x14,
    SemaphoreAcquire = 0x1A,
    SemaphoreRelease = 0x1B,
    SyncpointPayload = 0x1C,
    SyncpointOperation = 0x1D,
    WaitForIdle = 0x1E,
    CRCCheck = 0x1F,
    Yield = 0x20,
    NonPullerMethods = 0x40,
};

/// One 64-bit entry of a command list (the element type of CommandList::command_lists).
/// The same storage can be read as the raw word or through the bit-fields below.
/// NOTE(review): assuming BitField<Position, Bits, T> from common/bit_field.h, the named fields
/// cover bits 0-39, 41 and 42-62; bits 40 and 63 have no named field. Field meanings are
/// inferred from their names only.
struct CommandListHeader {
    union {
        u64 raw;                          ///< Whole entry as a single 64-bit word
        BitField<0, 40, GPUVAddr> addr;   ///< GPU virtual address field
        BitField<41, 1, u64> is_non_main; ///< Single-bit flag
        BitField<42, 21, u64> size;       ///< Size field (units not visible here - confirm)
    };
};
// The entry must stay exactly one 64-bit word.
static_assert(sizeof(CommandListHeader) == sizeof(u64), "CommandListHeader is incorrect size");

/// One 32-bit word of the command stream. The same storage can be read as a plain argument
/// value or through the header bit-fields below.
/// NOTE(review): assuming BitField<Position, Bits, T> from common/bit_field.h. `arg_count` and
/// `method_count` are declared with the same position and width, so they alias the same bits.
/// `method_count_` spans bits 0-23 and therefore overlaps `method`, `subchannel` and part of the
/// count field; its intended use is not visible in this header.
union CommandHeader {
    u32 argument;                         ///< Whole word as a raw value
    BitField<0, 13, u32> method;          ///< Method index
    BitField<0, 24, u32> method_count_;   ///< 24-bit view of the low bits (see note above)
    BitField<13, 3, u32> subchannel;      ///< Subchannel index (3 bits, i.e. 0-7)
    BitField<16, 13, u32> arg_count;      ///< Argument count; same bits as method_count
    BitField<16, 13, u32> method_count;   ///< Method count; same bits as arg_count
    BitField<29, 3, SubmissionMode> mode; ///< Submission mode
};
// Layout guarantees: standard layout and exactly one 32-bit word.
static_assert(std::is_standard_layout_v<CommandHeader>, "CommandHeader is not standard layout");
static_assert(sizeof(CommandHeader) == sizeof(u32), "CommandHeader has incorrect size!");

/**
 * Assembles a command header word.
 *
 * The header starts out zero-initialized; only the method, argument count and submission mode
 * fields are assigned, so the subchannel field is never written by this function.
 *
 * @param method    Buffer method stored in the `method` field.
 * @param arg_count Value stored in the `arg_count` field.
 * @param mode      Submission mode stored in the `mode` field.
 * @return The assembled header.
 */
inline CommandHeader BuildCommandHeader(BufferMethods method, u32 arg_count, SubmissionMode mode) {
    // The method field is typed as u32, so convert the scoped enum explicitly.
    const u32 method_index = static_cast<u32>(method);

    CommandHeader header{};
    header.method.Assign(method_index);
    header.arg_count.Assign(arg_count);
    header.mode.Assign(mode);
    return header;
}

/// A unit of work handed to DmaPusher::Push(). It carries two containers: a list of
/// CommandListHeader entries and a list of already-available CommandHeader words.
/// NOTE(review): how the pusher chooses between the two lists is not visible in this header.
struct CommandList final {
    /// Creates a command list with both containers empty.
    CommandList() = default;
    /// Creates a command list whose `command_lists` holds `size` entries;
    /// `prefetch_command_list` stays empty.
    explicit CommandList(std::size_t size) : command_lists(size) {}
    /// Creates a command list that takes ownership (by move) of an existing vector of command
    /// headers; `command_lists` stays empty.
    explicit CommandList(
        boost::container::small_vector<CommandHeader, 512>&& prefetch_command_list_)
        : prefetch_command_list{std::move(prefetch_command_list_)} {}

    // Both vectors use boost small_vector with room for 512 elements in inline storage.
    boost::container::small_vector<CommandListHeader, 512> command_lists;
    boost::container::small_vector<CommandHeader, 512> prefetch_command_list;
};

/**
 * The DmaPusher class implements DMA submission to FIFOs, providing an area of memory that the
 * emulated app fills with commands and tells PFIFO to process. The pushbuffers are then assembled
 * into a "command stream" consisting of 32-bit words that make up "commands".
 * See https://envytools.readthedocs.io/en/latest/hw/fifo/dma-pusher.html#fifo-dma-pusher for
 * details on this implementation.
 *
 * Only Push() and BindSubchannel() are defined in this header; every other member function is
 * defined out of line.
 */
class DmaPusher final {
public:
    /// Defined out of line.
    /// NOTE(review): system_, gpu_ and memory_manager_ presumably bind the reference members
    /// declared below; how channel_state_ is used is not visible in this header.
    explicit DmaPusher(Core::System& system_, GPU& gpu_, MemoryManager& memory_manager_,
                       Control::ChannelState& channel_state_);
    ~DmaPusher();

    /// Appends a command list to the back of the pending queue by moving it in.
    /// Nothing is processed here; this function only enqueues.
    void Push(CommandList&& entries) {
        dma_pushbuffer.push(std::move(entries));
    }

    /// Defined out of line.
    /// NOTE(review): presumably processes the queued command lists - confirm in dma_pusher.cpp.
    void DispatchCalls();

    /// Records `engine` as the engine for slot `subchannel_id`.
    /// No bounds check is performed: `subchannel_id` must be less than max_subchannels (8).
    /// The pointer is stored as-is; this class does not take ownership in this header.
    void BindSubchannel(Engines::EngineInterface* engine, u32 subchannel_id) {
        subchannels[subchannel_id] = engine;
    }

    /// Defined out of line.
    void BindRasterizer(VideoCore::RasterizerInterface* rasterizer);

private:
    /// Same value as BufferMethods::NonPullerMethods.
    static constexpr u32 non_puller_methods = 0x40;
    /// Number of slots in `subchannels`.
    static constexpr u32 max_subchannels = 8;

    // The following helpers are all defined out of line.
    bool Step();
    void ProcessCommands(std::span<const CommandHeader> commands);

    void SetState(const CommandHeader& command_header);

    void CallMethod(u32 argument) const;
    void CallMultiMethod(const u32* base_start, u32 num_methods) const;

    Common::ScratchBuffer<CommandHeader>
        command_headers; ///< Buffer for list of commands fetched at once

    std::queue<CommandList> dma_pushbuffer; ///< Queue of command lists to be processed
    std::size_t dma_pushbuffer_subindex{};  ///< Index within a command list within the pushbuffer

    /// Decoder state for the command currently being processed.
    /// The members have no default initializers; the instance below is value-initialized.
    struct DmaState {
        u32 method;            ///< Current method
        u32 subchannel;        ///< Current subchannel
        u32 method_count;      ///< Current method count
        u32 length_pending;    ///< Large NI command length pending
        GPUVAddr dma_get;      ///< Currently read segment
        u64 dma_word_offset;   ///< Current word offset from address
        bool non_incrementing; ///< Current command's NI flag
        bool is_last_call;     ///< NOTE(review): not documented here; see dma_pusher.cpp
    };

    DmaState dma_state{};
    /// NOTE(review): presumably tracks SubmissionMode::IncreaseOnce handling - confirm in
    /// dma_pusher.cpp.
    bool dma_increment_once{};

    const bool ib_enable{true}; ///< IB mode enabled

    /// Engine bound to each subchannel slot; every slot starts as nullptr until
    /// BindSubchannel() is called for it.
    std::array<Engines::EngineInterface*, max_subchannels> subchannels{};

    GPU& gpu;
    Core::System& system;
    MemoryManager& memory_manager;
    /// Declared mutable so that it can be modified from const member functions.
    mutable Engines::Puller puller;
};

} // namespace Tegra
general: Convert source file copyright comments over to SPDX This formats all copyright comments according to SPDX formatting guidelines. Additionally, this resolves the remaining GPLv2 only licensed files by relicensing them to GPLv2.0-or-later. 2022-04-23 08:59:50 +00:00			`// SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project`
			`// SPDX-License-Identifier: GPL-2.0-or-later`
gpu: Rewrite GPU command list processing with DmaPusher class. - More accurate impl., fixes Undertale (among other games). 2018-11-24 04:20:56 +00:00
			`#pragma once`

VideoCore/GPU: Delegate subchannel engines to the dma pusher. 2020-04-28 02:07:21 +00:00			`#include <array>`
dma_pusher: Rework command_headers usage Uses ScratchBuffer and avoids overwriting the command_headers buffer with the prefetch_command_list 2022-12-07 05:45:06 +00:00			`#include <span>`
dma_pushbuffer: Optimize to avoid loop and copy on Push. 2018-11-28 00:17:33 +00:00			`#include <vector>`
Remove memory allocations in some hot paths 2023-05-23 13:45:54 +00:00			`#include <boost/container/small_vector.hpp>`
gpu: Rewrite GPU command list processing with DmaPusher class. - More accurate impl., fixes Undertale (among other games). 2018-11-24 04:20:56 +00:00			`#include <queue>`

			`#include "common/bit_field.h"`
			`#include "common/common_types.h"`
dma_pusher: Rework command_headers usage Uses ScratchBuffer and avoids overwriting the command_headers buffer with the prefetch_command_list 2022-12-07 05:45:06 +00:00			`#include "common/scratch_buffer.h"`
VideoCore/GPU: Delegate subchannel engines to the dma pusher. 2020-04-28 02:07:21 +00:00			`#include "video_core/engines/engine_interface.h"`
VideoCore: implement channels on gpu caches. 2021-11-05 14:52:31 +00:00			`#include "video_core/engines/puller.h"`
gpu: Rewrite GPU command list processing with DmaPusher class. - More accurate impl., fixes Undertale (among other games). 2018-11-24 04:20:56 +00:00
dma_pusher: Remove reliance on the global system instance With this, the video core is now has no calls to the global system instance at all. 2020-04-19 20:12:06 +00:00			`namespace Core {`
			`class System;`
			`}`

gpu: Rewrite GPU command list processing with DmaPusher class. - More accurate impl., fixes Undertale (among other games). 2018-11-24 04:20:56 +00:00			`namespace Tegra {`

VideoCore: implement channels on gpu caches. 2021-11-05 14:52:31 +00:00			`namespace Control {`
			`struct ChannelState;`
			`}`

General: Fix clang build Allows building on clang to work again 2020-11-05 01:41:16 +00:00			`class GPU;`
VideoCore: implement channels on gpu caches. 2021-11-05 14:52:31 +00:00			`class MemoryManager;`
General: Fix clang build Allows building on clang to work again 2020-11-05 01:41:16 +00:00
gpu: Rewrite GPU command list processing with DmaPusher class. - More accurate impl., fixes Undertale (among other games). 2018-11-24 04:20:56 +00:00			`enum class SubmissionMode : u32 {`
			`IncreasingOld = 0,`
			`Increasing = 1,`
			`NonIncreasingOld = 2,`
			`NonIncreasing = 3,`
			`Inline = 4,`
			`IncreaseOnce = 5`
			`};`

video_core: gpu: Implement WaitFence and IncrementSyncPoint. 2020-10-27 05:11:41 +00:00			`// Note that, traditionally, methods are treated as 4-byte addressable locations, and hence`
			`// their numbers are written down multiplied by 4 in Docs. Here we are not multiply by 4.`
			`// So the values you see in docs might be multiplied by 4.`
VideoCore: Refactor syncing. 2022-01-30 09:31:13 +00:00			`// Register documentation:`
			`// https://github.com/NVIDIA/open-gpu-doc/blob/ab27fc22db5de0d02a4cabe08e555663b62db4d4/classes/host/cla26f.h`
			`//`
			`// Register Description (approx):`
			`// https://github.com/NVIDIA/open-gpu-doc/blob/ab27fc22db5de0d02a4cabe08e555663b62db4d4/manuals/volta/gv100/dev_pbdma.ref.txt`
video_core: gpu: Implement WaitFence and IncrementSyncPoint. 2020-10-27 05:11:41 +00:00			`enum class BufferMethods : u32 {`
			`BindObject = 0x0,`
VideoCore: Refactor syncing. 2022-01-30 09:31:13 +00:00			`Illegal = 0x1,`
video_core: gpu: Implement WaitFence and IncrementSyncPoint. 2020-10-27 05:11:41 +00:00			`Nop = 0x2,`
			`SemaphoreAddressHigh = 0x4,`
			`SemaphoreAddressLow = 0x5,`
VideoCore: Refactor syncing. 2022-01-30 09:31:13 +00:00			`SemaphoreSequencePayload = 0x6,`
			`SemaphoreOperation = 0x7,`
			`NonStallInterrupt = 0x8,`
video_core: gpu: Implement WaitFence and IncrementSyncPoint. 2020-10-27 05:11:41 +00:00			`WrcacheFlush = 0x9,`
VideoCore: Refactor syncing. 2022-01-30 09:31:13 +00:00			`MemOpA = 0xA,`
			`MemOpB = 0xB,`
			`MemOpC = 0xC,`
			`MemOpD = 0xD,`
video_core: gpu: Implement WaitFence and IncrementSyncPoint. 2020-10-27 05:11:41 +00:00			`RefCnt = 0x14,`
			`SemaphoreAcquire = 0x1A,`
			`SemaphoreRelease = 0x1B,`
VideoCore: Refactor syncing. 2022-01-30 09:31:13 +00:00			`SyncpointPayload = 0x1C,`
			`SyncpointOperation = 0x1D,`
			`WaitForIdle = 0x1E,`
			`CRCCheck = 0x1F,`
video_core: gpu: Implement WaitFence and IncrementSyncPoint. 2020-10-27 05:11:41 +00:00			`Yield = 0x20,`
			`NonPullerMethods = 0x40,`
			`};`

gpu: Rewrite GPU command list processing with DmaPusher class. - More accurate impl., fixes Undertale (among other games). 2018-11-24 04:20:56 +00:00			`struct CommandListHeader {`
			`union {`
			`u64 raw;`
			`BitField<0, 40, GPUVAddr> addr;`
			`BitField<41, 1, u64> is_non_main;`
			`BitField<42, 21, u64> size;`
			`};`
			`};`
			`static_assert(sizeof(CommandListHeader) == sizeof(u64), "CommandListHeader is incorrect size");`

			`union CommandHeader {`
			`u32 argument;`
			`BitField<0, 13, u32> method;`
			`BitField<0, 24, u32> method_count_;`
			`BitField<13, 3, u32> subchannel;`
			`BitField<16, 13, u32> arg_count;`
			`BitField<16, 13, u32> method_count;`
			`BitField<29, 3, SubmissionMode> mode;`
			`};`
			`static_assert(std::is_standard_layout_v<CommandHeader>, "CommandHeader is not standard layout");`
			`static_assert(sizeof(CommandHeader) == sizeof(u32), "CommandHeader has incorrect size!");`

General: Fix clang build Allows building on clang to work again 2020-11-05 01:41:16 +00:00			`inline CommandHeader BuildCommandHeader(BufferMethods method, u32 arg_count, SubmissionMode mode) {`
video_core: dma_pusher: Add support for prefetched command lists. 2020-10-30 04:13:04 +00:00			`CommandHeader result{};`
			`result.method.Assign(static_cast<u32>(method));`
			`result.arg_count.Assign(arg_count);`
			`result.mode.Assign(mode);`
			`return result;`
			`}`

			`struct CommandList final {`
			`CommandList() = default;`
			`explicit CommandList(std::size_t size) : command_lists(size) {}`
Remove memory allocations in some hot paths 2023-05-23 13:45:54 +00:00			`explicit CommandList(`
			`boost::container::small_vector<CommandHeader, 512>&& prefetch_command_list_)`
video_core: Resolve more variable shadowing scenarios Resolves variable shadowing scenarios up to the end of the OpenGL code to make it nicer to review. The rest will be resolved in a following commit. 2020-12-04 19:39:12 +00:00			`: prefetch_command_list{std::move(prefetch_command_list_)} {}`
video_core: dma_pusher: Add support for prefetched command lists. 2020-10-30 04:13:04 +00:00
Remove memory allocations in some hot paths 2023-05-23 13:45:54 +00:00			`boost::container::small_vector<CommandListHeader, 512> command_lists;`
			`boost::container::small_vector<CommandHeader, 512> prefetch_command_list;`
video_core: dma_pusher: Add support for prefetched command lists. 2020-10-30 04:13:04 +00:00			`};`
dma_pushbuffer: Optimize to avoid loop and copy on Push. 2018-11-28 00:17:33 +00:00
gpu: Rewrite GPU command list processing with DmaPusher class. - More accurate impl., fixes Undertale (among other games). 2018-11-24 04:20:56 +00:00			`/**`
			`* The DmaPusher class implements DMA submission to FIFOs, providing an area of memory that the`
			`* emulated app fills with commands and tells PFIFO to process. The pushbuffers are then assembled`
			`* into a "command stream" consisting of 32-bit words that make up "commands".`
			`* See https://envytools.readthedocs.io/en/latest/hw/fifo/dma-pusher.html#fifo-dma-pusher for`
			`* details on this implementation.`
			`*/`
video_core: dma_pusher: Add support for prefetched command lists. 2020-10-30 04:13:04 +00:00			`class DmaPusher final {`
gpu: Rewrite GPU command list processing with DmaPusher class. - More accurate impl., fixes Undertale (among other games). 2018-11-24 04:20:56 +00:00			`public:`
VideoCore: implement channels on gpu caches. 2021-11-05 14:52:31 +00:00			`explicit DmaPusher(Core::System& system_, GPU& gpu_, MemoryManager& memory_manager_,`
			`Control::ChannelState& channel_state_);`
gpu: Rewrite GPU command list processing with DmaPusher class. - More accurate impl., fixes Undertale (among other games). 2018-11-24 04:20:56 +00:00			`~DmaPusher();`

dma_pushbuffer: Optimize to avoid loop and copy on Push. 2018-11-28 00:17:33 +00:00			`void Push(CommandList&& entries) {`
			`dma_pushbuffer.push(std::move(entries));`
gpu: Rewrite GPU command list processing with DmaPusher class. - More accurate impl., fixes Undertale (among other games). 2018-11-24 04:20:56 +00:00			`}`

			`void DispatchCalls();`

video_core: Resolve more variable shadowing scenarios Resolves variable shadowing scenarios up to the end of the OpenGL code to make it nicer to review. The rest will be resolved in a following commit. 2020-12-04 19:39:12 +00:00			`void BindSubchannel(Engines::EngineInterface* engine, u32 subchannel_id) {`
VideoCore/GPU: Delegate subchannel engines to the dma pusher. 2020-04-28 02:07:21 +00:00			`subchannels[subchannel_id] = engine;`
			`}`

VideoCore: implement channels on gpu caches. 2021-11-05 14:52:31 +00:00			`void BindRasterizer(VideoCore::RasterizerInterface* rasterizer);`

gpu: Rewrite GPU command list processing with DmaPusher class. - More accurate impl., fixes Undertale (among other games). 2018-11-24 04:20:56 +00:00			`private:`
VideoCore/GPU: Delegate subchannel engines to the dma pusher. 2020-04-28 02:07:21 +00:00			`static constexpr u32 non_puller_methods = 0x40;`
			`static constexpr u32 max_subchannels = 8;`
gpu: Rewrite GPU command list processing with DmaPusher class. - More accurate impl., fixes Undertale (among other games). 2018-11-24 04:20:56 +00:00			`bool Step();`
dma_pusher: Rework command_headers usage Uses ScratchBuffer and avoids overwriting the command_headers buffer with the prefetch_command_list 2022-12-07 05:45:06 +00:00			`void ProcessCommands(std::span<const CommandHeader> commands);`
gpu: Rewrite GPU command list processing with DmaPusher class. - More accurate impl., fixes Undertale (among other games). 2018-11-24 04:20:56 +00:00
			`void SetState(const CommandHeader& command_header);`

			`void CallMethod(u32 argument) const;`
DMAPusher: Propagate multimethod writes into the engines. 2020-04-20 06:16:56 +00:00			`void CallMultiMethod(const u32* base_start, u32 num_methods) const;`
gpu: Rewrite GPU command list processing with DmaPusher class. - More accurate impl., fixes Undertale (among other games). 2018-11-24 04:20:56 +00:00
dma_pusher: Rework command_headers usage Uses ScratchBuffer and avoids overwriting the command_headers buffer with the prefetch_command_list 2022-12-07 05:45:06 +00:00			`Common::ScratchBuffer<CommandHeader>`
			`command_headers; ///< Buffer for list of commands fetched at once`
video_core/dma_pusher: The full list of headers at once. Fetching every u32 from memory leads to a big overhead. So let's fetch all of them as a block if possible. This reduces the Memory::* calls by the dma_pusher by a factor of 10. 2019-02-19 08:44:33 +00:00
dma_pushbuffer: Optimize to avoid loop and copy on Push. 2018-11-28 00:17:33 +00:00			`std::queue<CommandList> dma_pushbuffer; ///< Queue of command lists to be processed`
			`std::size_t dma_pushbuffer_subindex{}; ///< Index within a command list within the pushbuffer`
gpu: Rewrite GPU command list processing with DmaPusher class. - More accurate impl., fixes Undertale (among other games). 2018-11-24 04:20:56 +00:00
			`struct DmaState {`
			`u32 method; ///< Current method`
			`u32 subchannel; ///< Current subchannel`
			`u32 method_count; ///< Current method count`
			`u32 length_pending; ///< Large NI command length pending`
MacroHLE: Add MultidrawIndirect HLE Macro. 2022-02-09 14:00:05 +00:00			`GPUVAddr dma_get; ///< Currently read segment`
MacroHLE: Refactor MacroHLE system. 2022-03-05 07:01:13 +00:00			`u64 dma_word_offset; ///< Current word ofset from address`
video_core/dma_pusher: Silence C4828 warnings This was previously causing: warning C4828: The file contains a character starting at offset 0xa33 that is illegal in the current source character set (codepage 65001). warnings on Windows when compiling yuzu. 2019-01-30 17:36:28 +00:00			`bool non_incrementing; ///< Current command's NI flag`
VideoCore/GPU: Delegate subchannel engines to the dma pusher. 2020-04-28 02:07:21 +00:00			`bool is_last_call;`
gpu: Rewrite GPU command list processing with DmaPusher class. - More accurate impl., fixes Undertale (among other games). 2018-11-24 04:20:56 +00:00			`};`

			`DmaState dma_state{};`
			`bool dma_increment_once{};`

dma_pusher: Rework command_headers usage Uses ScratchBuffer and avoids overwriting the command_headers buffer with the prefetch_command_list 2022-12-07 05:45:06 +00:00			`const bool ib_enable{true}; ///< IB mode enabled`
dma_pusher: Remove reliance on the global system instance With this, the video core is now has no calls to the global system instance at all. 2020-04-19 20:12:06 +00:00
video_core: Resolve more variable shadowing scenarios Resolves variable shadowing scenarios up to the end of the OpenGL code to make it nicer to review. The rest will be resolved in a following commit. 2020-12-04 19:39:12 +00:00			`std::array<Engines::EngineInterface*, max_subchannels> subchannels{};`
VideoCore/GPU: Delegate subchannel engines to the dma pusher. 2020-04-28 02:07:21 +00:00
dma_pusher: Remove reliance on the global system instance With this, the video core is now has no calls to the global system instance at all. 2020-04-19 20:12:06 +00:00			`GPU& gpu;`
			`Core::System& system;`
VideoCore: implement channels on gpu caches. 2021-11-05 14:52:31 +00:00			`MemoryManager& memory_manager;`
			`mutable Engines::Puller puller;`
gpu: Rewrite GPU command list processing with DmaPusher class. - More accurate impl., fixes Undertale (among other games). 2018-11-24 04:20:56 +00:00			`};`

			`} // namespace Tegra`