From 9b82de6b24ea06496846a5fa211a23879b0ff644 Mon Sep 17 00:00:00 2001 From: GPUCode <47210458+GPUCode@users.noreply.github.com> Date: Sat, 24 Jun 2023 01:59:18 +0300 Subject: [PATCH] Refactor software renderer (#6621) --- src/citra/citra.cpp | 28 +- src/citra/emu_window/emu_window_sdl2_sw.cpp | 64 +- src/citra/emu_window/emu_window_sdl2_sw.h | 15 +- src/citra_qt/bootmanager.cpp | 74 +- src/citra_qt/bootmanager.h | 3 +- .../graphics/graphics_vertex_shader.cpp | 4 +- src/citra_qt/main.cpp | 4 +- .../shader/shader_jit_x64_compiler.cpp | 9 +- src/video_core/CMakeLists.txt | 2 - src/video_core/command_processor.cpp | 28 +- src/video_core/geometry_pipeline.cpp | 14 +- src/video_core/pica_types.h | 64 +- src/video_core/rasterizer_accelerated.cpp | 57 +- src/video_core/regs_rasterizer.h | 10 +- src/video_core/renderer_base.h | 10 +- .../renderer_software/rasterizer.cpp | 901 ----------------- src/video_core/renderer_software/rasterizer.h | 44 - .../renderer_software/renderer_software.cpp | 76 +- .../renderer_software/renderer_software.h | 20 +- .../renderer_software/sw_clipper.cpp | 246 ++--- src/video_core/renderer_software/sw_clipper.h | 84 +- .../renderer_software/sw_framebuffer.cpp | 254 +++-- .../renderer_software/sw_framebuffer.h | 57 +- .../renderer_software/sw_lighting.cpp | 175 ++-- .../renderer_software/sw_lighting.h | 14 +- .../renderer_software/sw_proctex.cpp | 60 +- src/video_core/renderer_software/sw_proctex.h | 7 +- .../renderer_software/sw_rasterizer.cpp | 930 +++++++++++++++++- .../renderer_software/sw_rasterizer.h | 62 +- .../renderer_software/sw_texturing.cpp | 109 +- .../renderer_software/sw_texturing.h | 18 +- src/video_core/shader/debug_data.h | 20 +- src/video_core/shader/shader.cpp | 12 +- src/video_core/shader/shader.h | 43 +- src/video_core/shader/shader_interpreter.cpp | 77 +- src/video_core/shader/shader_jit_x64.cpp | 1 + .../shader/shader_jit_x64_compiler.cpp | 2 +- src/video_core/vertex_loader.cpp | 11 +- src/video_core/video_core.cpp | 2 +- 39 files changed, 1815 insertions(+), 1796 deletions(-) delete mode 100644 src/video_core/renderer_software/rasterizer.cpp delete mode 100644 src/video_core/renderer_software/rasterizer.h diff --git a/src/citra/citra.cpp b/src/citra/citra.cpp index 7445c1d07..b8ebb1ff1 100644 --- a/src/citra/citra.cpp +++ b/src/citra/citra.cpp @@ -344,11 +344,14 @@ int main(int argc, char** argv) { return -1; } + auto& system = Core::System::GetInstance(); + auto& movie = Core::Movie::GetInstance(); + if (!movie_record.empty()) { - Core::Movie::GetInstance().PrepareForRecording(); + movie.PrepareForRecording(); } if (!movie_play.empty()) { - Core::Movie::GetInstance().PrepareForPlayback(movie_play); + movie.PrepareForPlayback(movie_play); } // Apply the command line arguments @@ -361,13 +364,13 @@ int main(int argc, char** argv) { EmuWindow_SDL2::InitializeSDL2(); - const auto create_emu_window = [](bool fullscreen, - bool is_secondary) -> std::unique_ptr { + const auto create_emu_window = [&](bool fullscreen, + bool is_secondary) -> std::unique_ptr { switch (Settings::values.graphics_api.GetValue()) { case Settings::GraphicsAPI::OpenGL: return std::make_unique(fullscreen, is_secondary); case Settings::GraphicsAPI::Software: - return std::make_unique(fullscreen, is_secondary); + return std::make_unique(system, fullscreen, is_secondary); } LOG_ERROR(Frontend, "Invalid Graphics API, using OpenGL"); return std::make_unique(fullscreen, is_secondary); @@ -385,7 +388,6 @@ int main(int argc, char** argv) { Common::g_scm_desc); Settings::LogSettings(); - Core::System& system = Core::System::GetInstance(); const Core::System::ResultStatus load_result{ system.Load(*emu_window, filepath, secondary_window.get())}; @@ -437,21 +439,21 @@ int main(int argc, char** argv) { } if (!movie_play.empty()) { - auto metadata = Core::Movie::GetInstance().GetMovieMetadata(movie_play); + auto metadata = movie.GetMovieMetadata(movie_play); LOG_INFO(Movie, "Author: {}", metadata.author); LOG_INFO(Movie, "Rerecord count: {}", metadata.rerecord_count); LOG_INFO(Movie, "Input count: {}", metadata.input_count); - Core::Movie::GetInstance().StartPlayback(movie_play); + movie.StartPlayback(movie_play); } if (!movie_record.empty()) { - Core::Movie::GetInstance().StartRecording(movie_record, movie_record_author); + movie.StartRecording(movie_record, movie_record_author); } if (!dump_video.empty() && DynamicLibrary::FFmpeg::LoadFFmpeg()) { - Layout::FramebufferLayout layout{Layout::FrameLayoutFromResolutionScale( - VideoCore::g_renderer->GetResolutionScaleFactor())}; + const auto layout{ + Layout::FrameLayoutFromResolutionScale(system.Renderer().GetResolutionScaleFactor())}; auto dumper = std::make_shared(); if (dumper->StartDumping(dump_video, layout)) { - Core::System::GetInstance().RegisterVideoDumper(dumper); + system.RegisterVideoDumper(dumper); } } @@ -494,7 +496,7 @@ int main(int argc, char** argv) { main_render_thread.join(); secondary_render_thread.join(); - Core::Movie::GetInstance().Shutdown(); + movie.Shutdown(); auto video_dumper = system.GetVideoDumper(); if (video_dumper && video_dumper->IsDumping()) { diff --git a/src/citra/emu_window/emu_window_sdl2_sw.cpp b/src/citra/emu_window/emu_window_sdl2_sw.cpp index e9d047bb9..71161fdbf 100644 --- a/src/citra/emu_window/emu_window_sdl2_sw.cpp +++ b/src/citra/emu_window/emu_window_sdl2_sw.cpp @@ -9,18 +9,16 @@ #include #include #include "citra/emu_window/emu_window_sdl2_sw.h" -#include "common/color.h" #include "common/scm_rev.h" #include "common/settings.h" +#include "core/core.h" #include "core/frontend/emu_window.h" -#include "core/hw/gpu.h" -#include "core/memory.h" -#include "video_core/video_core.h" +#include "video_core/renderer_software/renderer_software.h" class DummyContext : public Frontend::GraphicsContext {}; -EmuWindow_SDL2_SW::EmuWindow_SDL2_SW(bool fullscreen, bool is_secondary) - : EmuWindow_SDL2{is_secondary} { +EmuWindow_SDL2_SW::EmuWindow_SDL2_SW(Core::System& system_, bool fullscreen, bool is_secondary) + : EmuWindow_SDL2{is_secondary}, system{system_} { std::string window_title = fmt::format("Citra {} | {}-{}", Common::g_build_fullname, Common::g_scm_branch, Common::g_scm_desc); render_window = @@ -67,6 +65,8 @@ void EmuWindow_SDL2_SW::Present() { const auto layout{Layout::DefaultFrameLayout( Core::kScreenTopWidth, Core::kScreenTopHeight + Core::kScreenBottomHeight, false, false)}; + using VideoCore::ScreenId; + while (IsOpen()) { SDL_SetRenderDrawColor(renderer, static_cast(Settings::values.bg_red.GetValue() * 255), @@ -74,62 +74,34 @@ void EmuWindow_SDL2_SW::Present() { static_cast(Settings::values.bg_blue.GetValue() * 255), 0xFF); SDL_RenderClear(renderer); - const auto draw_screen = [&](int fb_id) { - const auto dst_rect = fb_id == 0 ? layout.top_screen : layout.bottom_screen; + const auto draw_screen = [&](ScreenId screen_id) { + const auto dst_rect = + screen_id == ScreenId::TopLeft ? layout.top_screen : layout.bottom_screen; SDL_Rect sdl_rect{static_cast(dst_rect.left), static_cast(dst_rect.top), static_cast(dst_rect.GetWidth()), static_cast(dst_rect.GetHeight())}; - SDL_Surface* screen = LoadFramebuffer(fb_id); + SDL_Surface* screen = LoadFramebuffer(screen_id); SDL_BlitSurface(screen, nullptr, window_surface, &sdl_rect); SDL_FreeSurface(screen); }; - draw_screen(0); - draw_screen(1); + draw_screen(ScreenId::TopLeft); + draw_screen(ScreenId::Bottom); SDL_RenderPresent(renderer); SDL_UpdateWindowSurface(render_window); } } -SDL_Surface* EmuWindow_SDL2_SW::LoadFramebuffer(int fb_id) { - const auto& framebuffer = GPU::g_regs.framebuffer_config[fb_id]; - const PAddr framebuffer_addr = - framebuffer.active_fb == 0 ? framebuffer.address_left1 : framebuffer.address_left2; - - Memory::RasterizerFlushRegion(framebuffer_addr, framebuffer.stride * framebuffer.height); - const u8* framebuffer_data = VideoCore::g_memory->GetPhysicalPointer(framebuffer_addr); - - const int width = framebuffer.height; - const int height = framebuffer.width; - const int bpp = GPU::Regs::BytesPerPixel(framebuffer.color_format); - +SDL_Surface* EmuWindow_SDL2_SW::LoadFramebuffer(VideoCore::ScreenId screen_id) { + const auto& renderer = static_cast(system.Renderer()); + const auto& info = renderer.Screen(screen_id); + const int width = static_cast(info.width); + const int height = static_cast(info.height); SDL_Surface* surface = SDL_CreateRGBSurfaceWithFormat(0, width, height, 0, SDL_PIXELFORMAT_ABGR8888); SDL_LockSurface(surface); - for (int y = 0; y < height; y++) { - for (int x = 0; x < width; x++) { - const u8* pixel = framebuffer_data + (x * height + height - y) * bpp; - const Common::Vec4 color = [&] { - switch (framebuffer.color_format) { - case GPU::Regs::PixelFormat::RGBA8: - return Common::Color::DecodeRGBA8(pixel); - case GPU::Regs::PixelFormat::RGB8: - return Common::Color::DecodeRGB8(pixel); - case GPU::Regs::PixelFormat::RGB565: - return Common::Color::DecodeRGB565(pixel); - case GPU::Regs::PixelFormat::RGB5A1: - return Common::Color::DecodeRGB5A1(pixel); - case GPU::Regs::PixelFormat::RGBA4: - return Common::Color::DecodeRGBA4(pixel); - } - UNREACHABLE(); - }(); - - u8* dst_pixel = reinterpret_cast(surface->pixels) + (y * width + x) * 4; - std::memcpy(dst_pixel, color.AsArray(), sizeof(color)); - } - } + std::memcpy(surface->pixels, info.pixels.data(), info.pixels.size()); SDL_UnlockSurface(surface); return surface; } diff --git a/src/citra/emu_window/emu_window_sdl2_sw.h b/src/citra/emu_window/emu_window_sdl2_sw.h index e1d35f687..22fcd3bd2 100644 --- a/src/citra/emu_window/emu_window_sdl2_sw.h +++ b/src/citra/emu_window/emu_window_sdl2_sw.h @@ -10,9 +10,17 @@ struct SDL_Renderer; struct SDL_Surface; +namespace VideoCore { +enum class ScreenId : u32; +} + +namespace Core { +class System; +} + class EmuWindow_SDL2_SW : public EmuWindow_SDL2 { public: - explicit EmuWindow_SDL2_SW(bool fullscreen, bool is_secondary); + explicit EmuWindow_SDL2_SW(Core::System& system, bool fullscreen, bool is_secondary); ~EmuWindow_SDL2_SW(); void Present() override; @@ -22,7 +30,10 @@ public: private: /// Loads a framebuffer to an SDL surface - SDL_Surface* LoadFramebuffer(int fb_id); + SDL_Surface* LoadFramebuffer(VideoCore::ScreenId screen_id); + + /// The system class. + Core::System& system; /// The SDL software renderer SDL_Renderer* renderer; diff --git a/src/citra_qt/bootmanager.cpp b/src/citra_qt/bootmanager.cpp index 16589f7de..c89869418 100644 --- a/src/citra_qt/bootmanager.cpp +++ b/src/citra_qt/bootmanager.cpp @@ -25,6 +25,7 @@ #include "input_common/motion_emu.h" #include "video_core/custom_textures/custom_tex_manager.h" #include "video_core/renderer_base.h" +#include "video_core/renderer_software/renderer_software.h" #include "video_core/video_core.h" #ifdef HAS_OPENGL @@ -288,7 +289,8 @@ private: #endif struct SoftwareRenderWidget : public RenderWidget { - explicit SoftwareRenderWidget(GRenderWindow* parent) : RenderWidget(parent) {} + explicit SoftwareRenderWidget(GRenderWindow* parent, Core::System& system_) + : RenderWidget(parent), system(system_) {} void Present() override { if (!isVisible()) { @@ -298,61 +300,40 @@ struct SoftwareRenderWidget : public RenderWidget { return; } + using VideoCore::ScreenId; + const auto layout{Layout::DefaultFrameLayout(width(), height(), false, false)}; QPainter painter(this); - const auto draw_screen = [&](int fb_id) { - const auto rect = fb_id == 0 ? layout.top_screen : layout.bottom_screen; - const QImage screen = LoadFramebuffer(fb_id).scaled(rect.GetWidth(), rect.GetHeight()); + const auto draw_screen = [&](ScreenId screen_id) { + const auto rect = + screen_id == ScreenId::TopLeft ? layout.top_screen : layout.bottom_screen; + const QImage screen = + LoadFramebuffer(screen_id).scaled(rect.GetWidth(), rect.GetHeight()); painter.drawImage(rect.left, rect.top, screen); }; painter.fillRect(rect(), qRgb(Settings::values.bg_red.GetValue() * 255, Settings::values.bg_green.GetValue() * 255, Settings::values.bg_blue.GetValue() * 255)); - draw_screen(0); - draw_screen(1); + draw_screen(ScreenId::TopLeft); + draw_screen(ScreenId::Bottom); painter.end(); } - QImage LoadFramebuffer(int fb_id) { - const auto& framebuffer = GPU::g_regs.framebuffer_config[fb_id]; - const PAddr framebuffer_addr = - framebuffer.active_fb == 0 ? framebuffer.address_left1 : framebuffer.address_left2; - - Memory::RasterizerFlushRegion(framebuffer_addr, framebuffer.stride * framebuffer.height); - const u8* framebuffer_data = VideoCore::g_memory->GetPhysicalPointer(framebuffer_addr); - - const int width = framebuffer.height; - const int height = framebuffer.width; - const int bpp = GPU::Regs::BytesPerPixel(framebuffer.color_format); - - QImage image{width, height, QImage::Format_RGBA8888}; - for (int y = 0; y < height; y++) { - for (int x = 0; x < width; x++) { - const u8* pixel = framebuffer_data + (x * height + height - y) * bpp; - const Common::Vec4 color = [&] { - switch (framebuffer.color_format) { - case GPU::Regs::PixelFormat::RGBA8: - return Common::Color::DecodeRGBA8(pixel); - case GPU::Regs::PixelFormat::RGB8: - return Common::Color::DecodeRGB8(pixel); - case GPU::Regs::PixelFormat::RGB565: - return Common::Color::DecodeRGB565(pixel); - case GPU::Regs::PixelFormat::RGB5A1: - return Common::Color::DecodeRGB5A1(pixel); - case GPU::Regs::PixelFormat::RGBA4: - return Common::Color::DecodeRGBA4(pixel); - } - UNREACHABLE(); - }(); - - image.setPixel(x, y, qRgba(color.r(), color.g(), color.b(), color.a())); - } - } + QImage LoadFramebuffer(VideoCore::ScreenId screen_id) { + const auto& renderer = static_cast(system.Renderer()); + const auto& info = renderer.Screen(screen_id); + const int width = static_cast(info.width); + const int height = static_cast(info.height); + QImage image{height, width, QImage::Format_RGBA8888}; + std::memcpy(image.bits(), info.pixels.data(), info.pixels.size()); return image; } + +private: + Core::System& system; }; static Frontend::WindowSystemType GetWindowSystemType() { @@ -401,8 +382,9 @@ static Frontend::EmuWindow::WindowSystemInfo GetWindowSystemInfo(QWindow* window std::unique_ptr GRenderWindow::main_context; -GRenderWindow::GRenderWindow(QWidget* parent_, EmuThread* emu_thread, bool is_secondary_) - : QWidget(parent_), EmuWindow(is_secondary_), emu_thread(emu_thread) { +GRenderWindow::GRenderWindow(QWidget* parent_, EmuThread* emu_thread_, Core::System& system_, + bool is_secondary_) + : QWidget(parent_), EmuWindow(is_secondary_), emu_thread(emu_thread_), system{system_} { setWindowTitle(QStringLiteral("Citra %1 | %2-%3") .arg(QString::fromUtf8(Common::g_build_name), @@ -652,12 +634,12 @@ void GRenderWindow::ReleaseRenderTarget() { void GRenderWindow::CaptureScreenshot(u32 res_scale, const QString& screenshot_path) { if (res_scale == 0) { - res_scale = VideoCore::g_renderer->GetResolutionScaleFactor(); + res_scale = system.Renderer().GetResolutionScaleFactor(); } const auto layout{Layout::FrameLayoutFromResolutionScale(res_scale, is_secondary)}; screenshot_image = QImage(QSize(layout.width, layout.height), QImage::Format_RGB32); - VideoCore::g_renderer->RequestScreenshot( + system.Renderer().RequestScreenshot( screenshot_image.bits(), [this, screenshot_path] { const std::string std_screenshot_path = screenshot_path.toStdString(); @@ -708,7 +690,7 @@ bool GRenderWindow::InitializeOpenGL() { } void GRenderWindow::InitializeSoftware() { - child_widget = new SoftwareRenderWidget(this); + child_widget = new SoftwareRenderWidget(this, system); main_context = std::make_unique(); } diff --git a/src/citra_qt/bootmanager.h b/src/citra_qt/bootmanager.h index f0019886d..59cdfab71 100644 --- a/src/citra_qt/bootmanager.h +++ b/src/citra_qt/bootmanager.h @@ -112,7 +112,7 @@ class GRenderWindow : public QWidget, public Frontend::EmuWindow { Q_OBJECT public: - GRenderWindow(QWidget* parent, EmuThread* emu_thread, bool is_secondary); + GRenderWindow(QWidget* parent, EmuThread* emu_thread, Core::System& system, bool is_secondary); ~GRenderWindow() override; // EmuWindow implementation. @@ -188,6 +188,7 @@ private: QWidget* child_widget = nullptr; EmuThread* emu_thread; + Core::System& system; /// Main context that will be shared with all other contexts that are requested. /// If this is used in a shared context setting, then this should not be used directly, but diff --git a/src/citra_qt/debugger/graphics/graphics_vertex_shader.cpp b/src/citra_qt/debugger/graphics/graphics_vertex_shader.cpp index 859a1448b..e8f0133fc 100644 --- a/src/citra_qt/debugger/graphics/graphics_vertex_shader.cpp +++ b/src/citra_qt/debugger/graphics/graphics_vertex_shader.cpp @@ -550,8 +550,8 @@ void GraphicsVertexShaderWidget::OnResumed() { } void GraphicsVertexShaderWidget::OnInputAttributeChanged(int index) { - float value = input_data[index]->text().toFloat(); - input_vertex.attr[index / 4][index % 4] = Pica::float24::FromFloat32(value); + const f32 value = input_data[index]->text().toFloat(); + input_vertex.attr[index / 4][index % 4] = Pica::f24::FromFloat32(value); // Re-execute shader with updated value Reload(); } diff --git a/src/citra_qt/main.cpp b/src/citra_qt/main.cpp index 12cf28c24..63d3cc45f 100644 --- a/src/citra_qt/main.cpp +++ b/src/citra_qt/main.cpp @@ -297,8 +297,8 @@ void GMainWindow::InitializeWidgets() { #ifdef CITRA_ENABLE_COMPATIBILITY_REPORTING ui->action_Report_Compatibility->setVisible(true); #endif - render_window = new GRenderWindow(this, emu_thread.get(), false); - secondary_window = new GRenderWindow(this, emu_thread.get(), true); + render_window = new GRenderWindow(this, emu_thread.get(), system, false); + secondary_window = new GRenderWindow(this, emu_thread.get(), system, true); render_window->hide(); secondary_window->hide(); secondary_window->setParent(nullptr); diff --git a/src/tests/video_core/shader/shader_jit_x64_compiler.cpp b/src/tests/video_core/shader/shader_jit_x64_compiler.cpp index 79daca422..92f9f2af5 100644 --- a/src/tests/video_core/shader/shader_jit_x64_compiler.cpp +++ b/src/tests/video_core/shader/shader_jit_x64_compiler.cpp @@ -14,7 +14,6 @@ #include "video_core/shader/shader_interpreter.h" #include "video_core/shader/shader_jit_x64_compiler.h" -using float24 = Pica::float24; using JitShader = Pica::Shader::JitShader; using ShaderInterpreter = Pica::Shader::InterpreterEngine; @@ -51,14 +50,14 @@ public: } void RunJit(Pica::Shader::UnitState& shader_unit, float input) { - shader_unit.registers.input[0].x = float24::FromFloat32(input); - shader_unit.registers.temporary[0].x = float24::FromFloat32(0); + shader_unit.registers.input[0].x = Pica::f24::FromFloat32(input); + shader_unit.registers.temporary[0].x = Pica::f24::Zero(); shader_jit.Run(*shader_setup, shader_unit, 0); } void RunInterpreter(Pica::Shader::UnitState& shader_unit, float input) { - shader_unit.registers.input[0].x = float24::FromFloat32(input); - shader_unit.registers.temporary[0].x = float24::FromFloat32(0); + shader_unit.registers.input[0].x = Pica::f24::FromFloat32(input); + shader_unit.registers.temporary[0].x = Pica::f24::Zero(); shader_interpreter.Run(*shader_setup, shader_unit); } diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 025274e03..af9d929d6 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -83,8 +83,6 @@ add_library(video_core STATIC renderer_opengl/post_processing_opengl.h renderer_opengl/renderer_opengl.cpp renderer_opengl/renderer_opengl.h - renderer_software/rasterizer.cpp - renderer_software/rasterizer.h renderer_software/renderer_software.cpp renderer_software/renderer_software.h renderer_software/sw_clipper.cpp diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index e3b136091..c5094e462 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -90,16 +90,16 @@ static void WriteUniformFloatReg(ShaderRegs& config, Shader::ShaderSetup& setup, for (auto i : {0, 1, 2, 3}) { float buffer_value; std::memcpy(&buffer_value, &uniform_write_buffer[i], sizeof(float)); - uniform[3 - i] = float24::FromFloat32(buffer_value); + uniform[3 - i] = f24::FromFloat32(buffer_value); } } else { // TODO: Untested - uniform.w = float24::FromRaw(uniform_write_buffer[0] >> 8); - uniform.z = float24::FromRaw(((uniform_write_buffer[0] & 0xFF) << 16) | - ((uniform_write_buffer[1] >> 16) & 0xFFFF)); - uniform.y = float24::FromRaw(((uniform_write_buffer[1] & 0xFFFF) << 8) | - ((uniform_write_buffer[2] >> 24) & 0xFF)); - uniform.x = float24::FromRaw(uniform_write_buffer[2] & 0xFFFFFF); + uniform.w = f24::FromRaw(uniform_write_buffer[0] >> 8); + uniform.z = f24::FromRaw(((uniform_write_buffer[0] & 0xFF) << 16) | + ((uniform_write_buffer[1] >> 16) & 0xFFFF)); + uniform.y = f24::FromRaw(((uniform_write_buffer[1] & 0xFFFF) << 8) | + ((uniform_write_buffer[2] >> 24) & 0xFF)); + uniform.x = f24::FromRaw(uniform_write_buffer[2] & 0xFFFFFF); } LOG_TRACE(HW_GPU, "Set {} float uniform {:x} to ({} {} {} {})", @@ -182,15 +182,15 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { break; } - Common::Vec4 attribute; + Common::Vec4 attribute; // NOTE: The destination component order indeed is "backwards" - attribute.w = float24::FromRaw(g_state.default_attr_write_buffer[0] >> 8); - attribute.z = float24::FromRaw(((g_state.default_attr_write_buffer[0] & 0xFF) << 16) | - ((g_state.default_attr_write_buffer[1] >> 16) & 0xFFFF)); - attribute.y = float24::FromRaw(((g_state.default_attr_write_buffer[1] & 0xFFFF) << 8) | - ((g_state.default_attr_write_buffer[2] >> 24) & 0xFF)); - attribute.x = float24::FromRaw(g_state.default_attr_write_buffer[2] & 0xFFFFFF); + attribute.w = f24::FromRaw(g_state.default_attr_write_buffer[0] >> 8); + attribute.z = f24::FromRaw(((g_state.default_attr_write_buffer[0] & 0xFF) << 16) | + ((g_state.default_attr_write_buffer[1] >> 16) & 0xFFFF)); + attribute.y = f24::FromRaw(((g_state.default_attr_write_buffer[1] & 0xFFFF) << 8) | + ((g_state.default_attr_write_buffer[2] >> 24) & 0xFF)); + attribute.x = f24::FromRaw(g_state.default_attr_write_buffer[2] & 0xFFFFFF); LOG_TRACE(HW_GPU, "Set default VS attribute {:x} to ({} {} {} {})", (int)setup.index, attribute.x.ToFloat32(), attribute.y.ToFloat32(), attribute.z.ToFloat32(), diff --git a/src/video_core/geometry_pipeline.cpp b/src/video_core/geometry_pipeline.cpp index af64490bd..957d24bfb 100644 --- a/src/video_core/geometry_pipeline.cpp +++ b/src/video_core/geometry_pipeline.cpp @@ -85,8 +85,8 @@ private: const Regs& regs; Shader::GSUnitState& unit; Shader::AttributeBuffer attribute_buffer; - Common::Vec4* buffer_cur; - Common::Vec4* buffer_end; + Common::Vec4* buffer_cur; + Common::Vec4* buffer_end; unsigned int vs_output_num; GeometryPipeline_Point() : regs(g_state.regs), unit(g_state.gs_unit) {} @@ -146,7 +146,7 @@ public: DEBUG_ASSERT(need_index); // The number of vertex input is put to the uniform register - float24 vertex_num = float24::FromFloat32(static_cast(val)); + f24 vertex_num = f24::FromFloat32(static_cast(val)); setup.uniforms.f[0] = Common::MakeVec(vertex_num, vertex_num, vertex_num, vertex_num); // The second uniform register and so on are used for receiving input vertices @@ -183,7 +183,7 @@ private: Shader::ShaderSetup& setup; unsigned int main_vertex_num; unsigned int total_vertex_num; - Common::Vec4* buffer_cur; + Common::Vec4* buffer_cur; unsigned int vs_output_num; GeometryPipeline_VariablePrimitive() : regs(g_state.regs), setup(g_state.gs) {} @@ -257,9 +257,9 @@ public: private: [[maybe_unused]] const Regs& regs; Shader::ShaderSetup& setup; - Common::Vec4* buffer_begin; - Common::Vec4* buffer_cur; - Common::Vec4* buffer_end; + Common::Vec4* buffer_begin; + Common::Vec4* buffer_cur; + Common::Vec4* buffer_end; unsigned int vs_output_num; GeometryPipeline_FixedPrimitive() : regs(g_state.regs), setup(g_state.gs) {} diff --git a/src/video_core/pica_types.h b/src/video_core/pica_types.h index 33012c259..4fd27f9fe 100644 --- a/src/video_core/pica_types.h +++ b/src/video_core/pica_types.h @@ -25,20 +25,20 @@ namespace Pica { template struct Float { public: - static Float FromFloat32(float val) { + static constexpr Float FromFloat32(float val) { Float ret; ret.value = val; return ret; } - static Float FromRaw(u32 hex) { + static constexpr Float FromRaw(u32 hex) { Float res; - const int width = M + E + 1; - const int bias = 128 - (1 << (E - 1)); - int exponent = (hex >> M) & ((1 << E) - 1); - const unsigned mantissa = hex & ((1 << M) - 1); - const unsigned sign = (hex >> (E + M)) << 31; + const s32 width = M + E + 1; + const s32 bias = 128 - (1 << (E - 1)); + s32 exponent = (hex >> M) & ((1 << E) - 1); + const u32 mantissa = hex & ((1 << M) - 1); + const u32 sign = (hex >> (E + M)) << 31; if (hex & ((1 << (width - 1)) - 1)) { if (exponent == (1 << E) - 1) @@ -55,16 +55,20 @@ public: return res; } - static Float Zero() { + static constexpr Float Zero() { return FromFloat32(0.f); } + static constexpr Float One() { + return FromFloat32(1.f); + } + // Not recommended for anything but logging - float ToFloat32() const { + constexpr float ToFloat32() const { return value; } - Float operator*(const Float& flt) const { + constexpr Float operator*(const Float& flt) const { float result = value * flt.ToFloat32(); // PICA gives 0 instead of NaN when multiplying by inf if (std::isnan(result)) @@ -73,70 +77,70 @@ public: return Float::FromFloat32(result); } - Float operator/(const Float& flt) const { + constexpr Float operator/(const Float& flt) const { return Float::FromFloat32(ToFloat32() / flt.ToFloat32()); } - Float operator+(const Float& flt) const { + constexpr Float operator+(const Float& flt) const { return Float::FromFloat32(ToFloat32() + flt.ToFloat32()); } - Float operator-(const Float& flt) const { + constexpr Float operator-(const Float& flt) const { return Float::FromFloat32(ToFloat32() - flt.ToFloat32()); } - Float& operator*=(const Float& flt) { + constexpr Float& operator*=(const Float& flt) { value = operator*(flt).value; return *this; } - Float& operator/=(const Float& flt) { + constexpr Float& operator/=(const Float& flt) { value /= flt.ToFloat32(); return *this; } - Float& operator+=(const Float& flt) { + constexpr Float& operator+=(const Float& flt) { value += flt.ToFloat32(); return *this; } - Float& operator-=(const Float& flt) { + constexpr Float& operator-=(const Float& flt) { value -= flt.ToFloat32(); return *this; } - Float operator-() const { + constexpr Float operator-() const { return Float::FromFloat32(-ToFloat32()); } - bool operator<(const Float& flt) const { + constexpr bool operator<(const Float& flt) const { return ToFloat32() < flt.ToFloat32(); } - bool operator>(const Float& flt) const { + constexpr bool operator>(const Float& flt) const { return ToFloat32() > flt.ToFloat32(); } - bool operator>=(const Float& flt) const { + constexpr bool operator>=(const Float& flt) const { return ToFloat32() >= flt.ToFloat32(); } - bool operator<=(const Float& flt) const { + constexpr bool operator<=(const Float& flt) const { return ToFloat32() <= flt.ToFloat32(); } - bool operator==(const Float& flt) const { + constexpr bool operator==(const Float& flt) const { return ToFloat32() == flt.ToFloat32(); } - bool operator!=(const Float& flt) const { + constexpr bool operator!=(const Float& flt) const { return ToFloat32() != flt.ToFloat32(); } private: - static const unsigned MASK = (1 << (M + E + 1)) - 1; - static const unsigned MANTISSA_MASK = (1 << M) - 1; - static const unsigned EXPONENT_MASK = (1 << E) - 1; + static constexpr u32 MASK = (1 << (M + E + 1)) - 1; + static constexpr u32 MANTISSA_MASK = (1 << M) - 1; + static constexpr u32 EXPONENT_MASK = (1 << E) - 1; // Stored as a regular float, merely for convenience // TODO: Perform proper arithmetic on this! @@ -149,8 +153,8 @@ private: } }; -using float24 = Float<16, 7>; -using float20 = Float<12, 7>; -using float16 = Float<10, 5>; +using f24 = Pica::Float<16, 7>; +using f20 = Pica::Float<12, 7>; +using f16 = Pica::Float<10, 5>; } // namespace Pica diff --git a/src/video_core/rasterizer_accelerated.cpp b/src/video_core/rasterizer_accelerated.cpp index 7ef4e6bd7..2dcae06f8 100644 --- a/src/video_core/rasterizer_accelerated.cpp +++ b/src/video_core/rasterizer_accelerated.cpp @@ -10,6 +10,8 @@ namespace VideoCore { +using Pica::f24; + static Common::Vec4f ColorRGBA8(const u32 color) { const auto rgba = Common::Vec4u{color >> 0 & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF, color >> 24 & 0xFF}; @@ -73,7 +75,7 @@ RasterizerAccelerated::RasterizerAccelerated(Memory::MemorySystem& memory_) * Fortunately however, the 3DS hardware happens to also use this exact same logic to work around * these issues, making this basic implementation actually more accurate to the hardware. */ -static bool AreQuaternionsOpposite(Common::Vec4 qa, Common::Vec4 qb) { +static bool AreQuaternionsOpposite(Common::Vec4 qa, Common::Vec4 qb) { Common::Vec4f a{qa.x.ToFloat32(), qa.y.ToFloat32(), qa.z.ToFloat32(), qa.w.ToFloat32()}; Common::Vec4f b{qb.x.ToFloat32(), qb.y.ToFloat32(), qb.z.ToFloat32(), qb.w.ToFloat32()}; @@ -612,7 +614,7 @@ void RasterizerAccelerated::NotifyPicaRegisterChanged(u32 id) { } void RasterizerAccelerated::SyncDepthScale() { - float depth_scale = Pica::float24::FromRaw(regs.rasterizer.viewport_depth_range).ToFloat32(); + const f32 depth_scale = f24::FromRaw(regs.rasterizer.viewport_depth_range).ToFloat32(); if (depth_scale != uniform_block_data.data.depth_scale) { uniform_block_data.data.depth_scale = depth_scale; @@ -621,8 +623,7 @@ void RasterizerAccelerated::SyncDepthScale() { } void RasterizerAccelerated::SyncDepthOffset() { - float depth_offset = - Pica::float24::FromRaw(regs.rasterizer.viewport_depth_near_plane).ToFloat32(); + const f32 depth_offset = f24::FromRaw(regs.rasterizer.viewport_depth_near_plane).ToFloat32(); if (depth_offset != uniform_block_data.data.depth_offset) { uniform_block_data.data.depth_offset = depth_offset; @@ -646,16 +647,16 @@ void RasterizerAccelerated::SyncFogColor() { void RasterizerAccelerated::SyncProcTexNoise() { const Common::Vec2f proctex_noise_f = { - Pica::float16::FromRaw(regs.texturing.proctex_noise_frequency.u).ToFloat32(), - Pica::float16::FromRaw(regs.texturing.proctex_noise_frequency.v).ToFloat32(), + Pica::f16::FromRaw(regs.texturing.proctex_noise_frequency.u).ToFloat32(), + Pica::f16::FromRaw(regs.texturing.proctex_noise_frequency.v).ToFloat32(), }; const Common::Vec2f proctex_noise_a = { regs.texturing.proctex_noise_u.amplitude / 4095.0f, regs.texturing.proctex_noise_v.amplitude / 4095.0f, }; const Common::Vec2f proctex_noise_p = { - Pica::float16::FromRaw(regs.texturing.proctex_noise_u.phase).ToFloat32(), - Pica::float16::FromRaw(regs.texturing.proctex_noise_v.phase).ToFloat32(), + Pica::f16::FromRaw(regs.texturing.proctex_noise_u.phase).ToFloat32(), + Pica::f16::FromRaw(regs.texturing.proctex_noise_v.phase).ToFloat32(), }; if (proctex_noise_f != uniform_block_data.data.proctex_noise_f || @@ -669,8 +670,8 @@ void RasterizerAccelerated::SyncProcTexNoise() { } void RasterizerAccelerated::SyncProcTexBias() { - const auto proctex_bias = Pica::float16::FromRaw(regs.texturing.proctex.bias_low | - (regs.texturing.proctex_lut.bias_high << 8)) + const auto proctex_bias = Pica::f16::FromRaw(regs.texturing.proctex.bias_low | + (regs.texturing.proctex_lut.bias_high << 8)) .ToFloat32(); if (proctex_bias != uniform_block_data.data.proctex_bias) { uniform_block_data.data.proctex_bias = proctex_bias; @@ -687,7 +688,7 @@ void RasterizerAccelerated::SyncAlphaTest() { } void RasterizerAccelerated::SyncCombinerColor() { - auto combiner_color = ColorRGBA8(regs.texturing.tev_combiner_buffer_color.raw); + const auto combiner_color = ColorRGBA8(regs.texturing.tev_combiner_buffer_color.raw); if (combiner_color != uniform_block_data.data.tev_combiner_buffer_color) { uniform_block_data.data.tev_combiner_buffer_color = combiner_color; uniform_block_data.dirty = true; @@ -695,7 +696,7 @@ void RasterizerAccelerated::SyncCombinerColor() { } void RasterizerAccelerated::SyncTevConstColor( - std::size_t stage_index, const Pica::TexturingRegs::TevStageConfig& tev_stage) { + const size_t stage_index, const Pica::TexturingRegs::TevStageConfig& tev_stage) { const auto const_color = ColorRGBA8(tev_stage.const_color); if (const_color == uniform_block_data.data.const_color[stage_index]) { @@ -707,7 +708,7 @@ void RasterizerAccelerated::SyncTevConstColor( } void RasterizerAccelerated::SyncGlobalAmbient() { - auto color = LightColor(regs.lighting.global_ambient); + const auto color = LightColor(regs.lighting.global_ambient); if (color != uniform_block_data.data.lighting_global_ambient) { uniform_block_data.data.lighting_global_ambient = color; uniform_block_data.dirty = true; @@ -715,7 +716,7 @@ void RasterizerAccelerated::SyncGlobalAmbient() { } void RasterizerAccelerated::SyncLightSpecular0(int light_index) { - auto color = LightColor(regs.lighting.light[light_index].specular_0); + const auto color = LightColor(regs.lighting.light[light_index].specular_0); if (color != uniform_block_data.data.light_src[light_index].specular_0) { uniform_block_data.data.light_src[light_index].specular_0 = color; uniform_block_data.dirty = true; @@ -723,7 +724,7 @@ void RasterizerAccelerated::SyncLightSpecular0(int light_index) { } void RasterizerAccelerated::SyncLightSpecular1(int light_index) { - auto color = LightColor(regs.lighting.light[light_index].specular_1); + const auto color = LightColor(regs.lighting.light[light_index].specular_1); if (color != uniform_block_data.data.light_src[light_index].specular_1) { uniform_block_data.data.light_src[light_index].specular_1 = color; uniform_block_data.dirty = true; @@ -731,7 +732,7 @@ void RasterizerAccelerated::SyncLightSpecular1(int light_index) { } void RasterizerAccelerated::SyncLightDiffuse(int light_index) { - auto color = LightColor(regs.lighting.light[light_index].diffuse); + const auto color = LightColor(regs.lighting.light[light_index].diffuse); if (color != uniform_block_data.data.light_src[light_index].diffuse) { uniform_block_data.data.light_src[light_index].diffuse = color; uniform_block_data.dirty = true; @@ -739,7 +740,7 @@ void RasterizerAccelerated::SyncLightDiffuse(int light_index) { } void RasterizerAccelerated::SyncLightAmbient(int light_index) { - auto color = LightColor(regs.lighting.light[light_index].ambient); + const auto color = LightColor(regs.lighting.light[light_index].ambient); if (color != uniform_block_data.data.light_src[light_index].ambient) { uniform_block_data.data.light_src[light_index].ambient = color; uniform_block_data.dirty = true; @@ -748,9 +749,9 @@ void RasterizerAccelerated::SyncLightAmbient(int light_index) { void RasterizerAccelerated::SyncLightPosition(int light_index) { const Common::Vec3f position = { - Pica::float16::FromRaw(regs.lighting.light[light_index].x).ToFloat32(), - Pica::float16::FromRaw(regs.lighting.light[light_index].y).ToFloat32(), - Pica::float16::FromRaw(regs.lighting.light[light_index].z).ToFloat32(), + Pica::f16::FromRaw(regs.lighting.light[light_index].x).ToFloat32(), + Pica::f16::FromRaw(regs.lighting.light[light_index].y).ToFloat32(), + Pica::f16::FromRaw(regs.lighting.light[light_index].z).ToFloat32(), }; if (position != uniform_block_data.data.light_src[light_index].position) { @@ -771,8 +772,8 @@ void RasterizerAccelerated::SyncLightSpotDirection(int light_index) { } void RasterizerAccelerated::SyncLightDistanceAttenuationBias(int light_index) { - float dist_atten_bias = - Pica::float20::FromRaw(regs.lighting.light[light_index].dist_atten_bias).ToFloat32(); + const f32 dist_atten_bias = + Pica::f20::FromRaw(regs.lighting.light[light_index].dist_atten_bias).ToFloat32(); if (dist_atten_bias != uniform_block_data.data.light_src[light_index].dist_atten_bias) { uniform_block_data.data.light_src[light_index].dist_atten_bias = dist_atten_bias; @@ -781,8 +782,8 @@ void RasterizerAccelerated::SyncLightDistanceAttenuationBias(int light_index) { } void RasterizerAccelerated::SyncLightDistanceAttenuationScale(int light_index) { - float dist_atten_scale = - Pica::float20::FromRaw(regs.lighting.light[light_index].dist_atten_scale).ToFloat32(); + const f32 dist_atten_scale = + Pica::f20::FromRaw(regs.lighting.light[light_index].dist_atten_scale).ToFloat32(); if (dist_atten_scale != uniform_block_data.data.light_src[light_index].dist_atten_scale) { uniform_block_data.data.light_src[light_index].dist_atten_scale = dist_atten_scale; @@ -792,8 +793,8 @@ void RasterizerAccelerated::SyncLightDistanceAttenuationScale(int light_index) { void RasterizerAccelerated::SyncShadowBias() { const auto& shadow = regs.framebuffer.shadow; - float constant = Pica::float16::FromRaw(shadow.constant).ToFloat32(); - float linear = Pica::float16::FromRaw(shadow.linear).ToFloat32(); + const f32 constant = Pica::f16::FromRaw(shadow.constant).ToFloat32(); + const f32 linear = Pica::f16::FromRaw(shadow.linear).ToFloat32(); if (constant != uniform_block_data.data.shadow_bias_constant || linear != uniform_block_data.data.shadow_bias_linear) { @@ -804,7 +805,7 @@ void RasterizerAccelerated::SyncShadowBias() { } void RasterizerAccelerated::SyncShadowTextureBias() { - int bias = regs.texturing.shadow.bias << 1; + const s32 bias = regs.texturing.shadow.bias << 1; if (bias != uniform_block_data.data.shadow_texture_bias) { uniform_block_data.data.shadow_texture_bias = bias; uniform_block_data.dirty = true; @@ -813,7 +814,7 @@ void RasterizerAccelerated::SyncShadowTextureBias() { void RasterizerAccelerated::SyncTextureLodBias(int tex_index) { const auto pica_textures = regs.texturing.GetTextures(); - const float bias = pica_textures[tex_index].config.lod.bias / 256.0f; + const f32 bias = pica_textures[tex_index].config.lod.bias / 256.0f; if (bias != uniform_block_data.data.tex_lod_bias[tex_index]) { uniform_block_data.data.tex_lod_bias[tex_index] = bias; uniform_block_data.dirty = true; diff --git a/src/video_core/regs_rasterizer.h b/src/video_core/regs_rasterizer.h index 245d18b3f..f605d2cd8 100644 --- a/src/video_core/regs_rasterizer.h +++ b/src/video_core/regs_rasterizer.h @@ -37,9 +37,9 @@ struct RasterizerRegs { BitField<0, 1, u32> clip_enable; BitField<0, 24, u32> clip_coef[4]; // float24 - Common::Vec4 GetClipCoef() const { - return {float24::FromRaw(clip_coef[0]), float24::FromRaw(clip_coef[1]), - float24::FromRaw(clip_coef[2]), float24::FromRaw(clip_coef[3])}; + Common::Vec4 GetClipCoef() const { + return {f24::FromRaw(clip_coef[0]), f24::FromRaw(clip_coef[1]), f24::FromRaw(clip_coef[2]), + f24::FromRaw(clip_coef[3])}; } Common::Rectangle GetViewportRect() const { @@ -47,9 +47,9 @@ struct RasterizerRegs { // These registers hold half-width and half-height, so must be multiplied by 2 viewport_corner.x, // left viewport_corner.y + // top - static_cast(float24::FromRaw(viewport_size_y).ToFloat32() * 2), + static_cast(f24::FromRaw(viewport_size_y).ToFloat32() * 2), viewport_corner.x + // right - static_cast(float24::FromRaw(viewport_size_x).ToFloat32() * 2), + static_cast(f24::FromRaw(viewport_size_x).ToFloat32() * 2), viewport_corner.y // bottom }; } diff --git a/src/video_core/renderer_base.h b/src/video_core/renderer_base.h index c03800b39..038d5705f 100644 --- a/src/video_core/renderer_base.h +++ b/src/video_core/renderer_base.h @@ -18,6 +18,12 @@ class System; namespace VideoCore { +enum class ScreenId : u32 { + TopLeft, + TopRight, + Bottom, +}; + struct RendererSettings { // Screenshot std::atomic_bool screenshot_requested{false}; @@ -75,7 +81,7 @@ public: return current_fps; } - int GetCurrentFrame() const { + s32 GetCurrentFrame() const { return current_frame; } @@ -108,7 +114,7 @@ protected: Frontend::EmuWindow& render_window; ///< Reference to the render window handle. Frontend::EmuWindow* secondary_window; ///< Reference to the secondary render window handle. f32 current_fps = 0.0f; ///< Current framerate, should be set by the renderer - int current_frame = 0; ///< Current frame, should be set by the renderer + s32 current_frame = 0; ///< Current frame, should be set by the renderer }; } // namespace VideoCore diff --git a/src/video_core/renderer_software/rasterizer.cpp b/src/video_core/renderer_software/rasterizer.cpp deleted file mode 100644 index 9e9fd40e7..000000000 --- a/src/video_core/renderer_software/rasterizer.cpp +++ /dev/null @@ -1,901 +0,0 @@ -// Copyright 2014 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include -#include -#include -#include -#include "common/assert.h" -#include "common/bit_field.h" -#include "common/color.h" -#include "common/common_types.h" -#include "common/logging/log.h" -#include "common/microprofile.h" -#include "common/quaternion.h" -#include "common/vector_math.h" -#include "core/hw/gpu.h" -#include "core/memory.h" -#include "video_core/debug_utils/debug_utils.h" -#include "video_core/pica_state.h" -#include "video_core/pica_types.h" -#include "video_core/regs_framebuffer.h" -#include "video_core/regs_rasterizer.h" -#include "video_core/regs_texturing.h" -#include "video_core/renderer_software/rasterizer.h" -#include "video_core/renderer_software/sw_framebuffer.h" -#include "video_core/renderer_software/sw_lighting.h" -#include "video_core/renderer_software/sw_proctex.h" -#include "video_core/renderer_software/sw_texturing.h" -#include "video_core/shader/shader.h" -#include "video_core/texture/texture_decode.h" -#include "video_core/utils.h" -#include "video_core/video_core.h" - -namespace Pica::Rasterizer { - -// NOTE: Assuming that rasterizer coordinates are 12.4 fixed-point values -struct Fix12P4 { - Fix12P4() {} - Fix12P4(u16 val) : val(val) {} - - static u16 FracMask() { - return 0xF; - } - static u16 IntMask() { - return (u16)~0xF; - } - - operator u16() const { - return val; - } - - bool operator<(const Fix12P4& oth) const { - return (u16) * this < (u16)oth; - } - -private: - u16 val; -}; - -/** - * Calculate signed area of the triangle spanned by the three argument vertices. - * The sign denotes an orientation. - * - * @todo define orientation concretely. - */ -static int SignedArea(const Common::Vec2& vtx1, const Common::Vec2& vtx2, - const Common::Vec2& vtx3) { - const auto vec1 = Common::MakeVec(vtx2 - vtx1, 0); - const auto vec2 = Common::MakeVec(vtx3 - vtx1, 0); - // TODO: There is a very small chance this will overflow for sizeof(int) == 4 - return Common::Cross(vec1, vec2).z; -}; - -/// Convert a 3D vector for cube map coordinates to 2D texture coordinates along with the face name -static std::tuple ConvertCubeCoord(float24 u, float24 v, - float24 w, - const TexturingRegs& regs) { - const float abs_u = std::abs(u.ToFloat32()); - const float abs_v = std::abs(v.ToFloat32()); - const float abs_w = std::abs(w.ToFloat32()); - float24 x, y, z; - PAddr addr; - if (abs_u > abs_v && abs_u > abs_w) { - if (u > float24::FromFloat32(0)) { - addr = regs.GetCubePhysicalAddress(TexturingRegs::CubeFace::PositiveX); - y = -v; - } else { - addr = regs.GetCubePhysicalAddress(TexturingRegs::CubeFace::NegativeX); - y = v; - } - x = -w; - z = u; - } else if (abs_v > abs_w) { - if (v > float24::FromFloat32(0)) { - addr = regs.GetCubePhysicalAddress(TexturingRegs::CubeFace::PositiveY); - x = u; - } else { - addr = regs.GetCubePhysicalAddress(TexturingRegs::CubeFace::NegativeY); - x = -u; - } - y = w; - z = v; - } else { - if (w > float24::FromFloat32(0)) { - addr = regs.GetCubePhysicalAddress(TexturingRegs::CubeFace::PositiveZ); - y = -v; - } else { - addr = regs.GetCubePhysicalAddress(TexturingRegs::CubeFace::NegativeZ); - y = v; - } - x = u; - z = w; - } - float24 z_abs = float24::FromFloat32(std::abs(z.ToFloat32())); - const float24 half = float24::FromFloat32(0.5f); - return std::make_tuple(x / z * half + half, y / z * half + half, z_abs, addr); -} - -MICROPROFILE_DEFINE(GPU_Rasterization, "GPU", "Rasterization", MP_RGB(50, 50, 240)); - -/** - * Helper function for ProcessTriangle with the "reversed" flag to allow for implementing - * culling via recursion. - */ -static void ProcessTriangleInternal(const Vertex& v0, const Vertex& v1, const Vertex& v2, - bool reversed = false) { - const auto& regs = g_state.regs; - MICROPROFILE_SCOPE(GPU_Rasterization); - - // vertex positions in rasterizer coordinates - static auto FloatToFix = [](float24 flt) { - // TODO: Rounding here is necessary to prevent garbage pixels at - // triangle borders. Is it that the correct solution, though? - return Fix12P4(static_cast(round(flt.ToFloat32() * 16.0f))); - }; - static auto ScreenToRasterizerCoordinates = [](const Common::Vec3& vec) { - return Common::Vec3{FloatToFix(vec.x), FloatToFix(vec.y), FloatToFix(vec.z)}; - }; - - Common::Vec3 vtxpos[3]{ScreenToRasterizerCoordinates(v0.screenpos), - ScreenToRasterizerCoordinates(v1.screenpos), - ScreenToRasterizerCoordinates(v2.screenpos)}; - - if (regs.rasterizer.cull_mode == RasterizerRegs::CullMode::KeepAll) { - // Make sure we always end up with a triangle wound counter-clockwise - if (!reversed && SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0) { - ProcessTriangleInternal(v0, v2, v1, true); - return; - } - } else { - if (!reversed && regs.rasterizer.cull_mode == RasterizerRegs::CullMode::KeepClockWise) { - // Reverse vertex order and use the CCW code path. - ProcessTriangleInternal(v0, v2, v1, true); - return; - } - - // Cull away triangles which are wound clockwise. - if (SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0) - return; - } - - u16 min_x = std::min({vtxpos[0].x, vtxpos[1].x, vtxpos[2].x}); - u16 min_y = std::min({vtxpos[0].y, vtxpos[1].y, vtxpos[2].y}); - u16 max_x = std::max({vtxpos[0].x, vtxpos[1].x, vtxpos[2].x}); - u16 max_y = std::max({vtxpos[0].y, vtxpos[1].y, vtxpos[2].y}); - - // Convert the scissor box coordinates to 12.4 fixed point - u16 scissor_x1 = (u16)(regs.rasterizer.scissor_test.x1 << 4); - u16 scissor_y1 = (u16)(regs.rasterizer.scissor_test.y1 << 4); - // x2,y2 have +1 added to cover the entire sub-pixel area - u16 scissor_x2 = (u16)((regs.rasterizer.scissor_test.x2 + 1) << 4); - u16 scissor_y2 = (u16)((regs.rasterizer.scissor_test.y2 + 1) << 4); - - if (regs.rasterizer.scissor_test.mode == RasterizerRegs::ScissorMode::Include) { - // Calculate the new bounds - min_x = std::max(min_x, scissor_x1); - min_y = std::max(min_y, scissor_y1); - max_x = std::min(max_x, scissor_x2); - max_y = std::min(max_y, scissor_y2); - } - - min_x &= Fix12P4::IntMask(); - min_y &= Fix12P4::IntMask(); - max_x = ((max_x + Fix12P4::FracMask()) & Fix12P4::IntMask()); - max_y = ((max_y + Fix12P4::FracMask()) & Fix12P4::IntMask()); - - // Triangle filling rules: Pixels on the right-sided edge or on flat bottom edges are not - // drawn. Pixels on any other triangle border are drawn. This is implemented with three bias - // values which are added to the barycentric coordinates w0, w1 and w2, respectively. - // NOTE: These are the PSP filling rules. Not sure if the 3DS uses the same ones... - auto IsRightSideOrFlatBottomEdge = [](const Common::Vec2& vtx, - const Common::Vec2& line1, - const Common::Vec2& line2) { - if (line1.y == line2.y) { - // just check if vertex is above us => bottom line parallel to x-axis - return vtx.y < line1.y; - } else { - // check if vertex is on our left => right side - // TODO: Not sure how likely this is to overflow - return (int)vtx.x < (int)line1.x + ((int)line2.x - (int)line1.x) * - ((int)vtx.y - (int)line1.y) / - ((int)line2.y - (int)line1.y); - } - }; - int bias0 = - IsRightSideOrFlatBottomEdge(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) ? -1 : 0; - int bias1 = - IsRightSideOrFlatBottomEdge(vtxpos[1].xy(), vtxpos[2].xy(), vtxpos[0].xy()) ? -1 : 0; - int bias2 = - IsRightSideOrFlatBottomEdge(vtxpos[2].xy(), vtxpos[0].xy(), vtxpos[1].xy()) ? -1 : 0; - - auto w_inverse = Common::MakeVec(v0.pos.w, v1.pos.w, v2.pos.w); - - auto textures = regs.texturing.GetTextures(); - auto tev_stages = regs.texturing.GetTevStages(); - - bool stencil_action_enable = - g_state.regs.framebuffer.output_merger.stencil_test.enable && - g_state.regs.framebuffer.framebuffer.depth_format == FramebufferRegs::DepthFormat::D24S8; - const auto stencil_test = g_state.regs.framebuffer.output_merger.stencil_test; - - // Enter rasterization loop, starting at the center of the topleft bounding box corner. - // TODO: Not sure if looping through x first might be faster - for (u16 y = min_y + 8; y < max_y; y += 0x10) { - for (u16 x = min_x + 8; x < max_x; x += 0x10) { - - // Do not process the pixel if it's inside the scissor box and the scissor mode is set - // to Exclude - if (regs.rasterizer.scissor_test.mode == RasterizerRegs::ScissorMode::Exclude) { - if (x >= scissor_x1 && x < scissor_x2 && y >= scissor_y1 && y < scissor_y2) - continue; - } - - // Calculate the barycentric coordinates w0, w1 and w2 - int w0 = bias0 + SignedArea(vtxpos[1].xy(), vtxpos[2].xy(), {x, y}); - int w1 = bias1 + SignedArea(vtxpos[2].xy(), vtxpos[0].xy(), {x, y}); - int w2 = bias2 + SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), {x, y}); - int wsum = w0 + w1 + w2; - - // If current pixel is not covered by the current primitive - if (w0 < 0 || w1 < 0 || w2 < 0) - continue; - - auto baricentric_coordinates = - Common::MakeVec(float24::FromFloat32(static_cast(w0)), - float24::FromFloat32(static_cast(w1)), - float24::FromFloat32(static_cast(w2))); - float24 interpolated_w_inverse = - float24::FromFloat32(1.0f) / Common::Dot(w_inverse, baricentric_coordinates); - - // interpolated_z = z / w - float interpolated_z_over_w = - (v0.screenpos[2].ToFloat32() * w0 + v1.screenpos[2].ToFloat32() * w1 + - v2.screenpos[2].ToFloat32() * w2) / - wsum; - - // Not fully accurate. About 3 bits in precision are missing. - // Z-Buffer (z / w * scale + offset) - float depth_scale = float24::FromRaw(regs.rasterizer.viewport_depth_range).ToFloat32(); - float depth_offset = - float24::FromRaw(regs.rasterizer.viewport_depth_near_plane).ToFloat32(); - float depth = interpolated_z_over_w * depth_scale + depth_offset; - - // Potentially switch to W-Buffer - if (regs.rasterizer.depthmap_enable == - Pica::RasterizerRegs::DepthBuffering::WBuffering) { - // W-Buffer (z * scale + w * offset = (z / w * scale + offset) * w) - depth *= interpolated_w_inverse.ToFloat32() * wsum; - } - - // Clamp the result - depth = std::clamp(depth, 0.0f, 1.0f); - - // Perspective correct attribute interpolation: - // Attribute values cannot be calculated by simple linear interpolation since - // they are not linear in screen space. For example, when interpolating a - // texture coordinate across two vertices, something simple like - // u = (u0*w0 + u1*w1)/(w0+w1) - // will not work. However, the attribute value divided by the - // clipspace w-coordinate (u/w) and and the inverse w-coordinate (1/w) are linear - // in screenspace. Hence, we can linearly interpolate these two independently and - // calculate the interpolated attribute by dividing the results. - // I.e. - // u_over_w = ((u0/v0.pos.w)*w0 + (u1/v1.pos.w)*w1)/(w0+w1) - // one_over_w = (( 1/v0.pos.w)*w0 + ( 1/v1.pos.w)*w1)/(w0+w1) - // u = u_over_w / one_over_w - // - // The generalization to three vertices is straightforward in baricentric coordinates. - auto GetInterpolatedAttribute = [&](float24 attr0, float24 attr1, float24 attr2) { - auto attr_over_w = Common::MakeVec(attr0, attr1, attr2); - float24 interpolated_attr_over_w = - Common::Dot(attr_over_w, baricentric_coordinates); - return interpolated_attr_over_w * interpolated_w_inverse; - }; - - Common::Vec4 primary_color{ - static_cast(round( - GetInterpolatedAttribute(v0.color.r(), v1.color.r(), v2.color.r()).ToFloat32() * - 255)), - static_cast(round( - GetInterpolatedAttribute(v0.color.g(), v1.color.g(), v2.color.g()).ToFloat32() * - 255)), - static_cast(round( - GetInterpolatedAttribute(v0.color.b(), v1.color.b(), v2.color.b()).ToFloat32() * - 255)), - static_cast(round( - GetInterpolatedAttribute(v0.color.a(), v1.color.a(), v2.color.a()).ToFloat32() * - 255)), - }; - - Common::Vec2 uv[3]; - uv[0].u() = GetInterpolatedAttribute(v0.tc0.u(), v1.tc0.u(), v2.tc0.u()); - uv[0].v() = GetInterpolatedAttribute(v0.tc0.v(), v1.tc0.v(), v2.tc0.v()); - uv[1].u() = GetInterpolatedAttribute(v0.tc1.u(), v1.tc1.u(), v2.tc1.u()); - uv[1].v() = GetInterpolatedAttribute(v0.tc1.v(), v1.tc1.v(), v2.tc1.v()); - uv[2].u() = GetInterpolatedAttribute(v0.tc2.u(), v1.tc2.u(), v2.tc2.u()); - uv[2].v() = GetInterpolatedAttribute(v0.tc2.v(), v1.tc2.v(), v2.tc2.v()); - - Common::Vec4 texture_color[4]{}; - for (int i = 0; i < 3; ++i) { - const auto& texture = textures[i]; - if (!texture.enabled) - continue; - - if (texture.config.address == 0) { - texture_color[i] = {0, 0, 0, 255}; - continue; - } - - int coordinate_i = - (i == 2 && regs.texturing.main_config.texture2_use_coord1) ? 1 : i; - float24 u = uv[coordinate_i].u(); - float24 v = uv[coordinate_i].v(); - - // Only unit 0 respects the texturing type (according to 3DBrew) - // TODO: Refactor so cubemaps and shadowmaps can be handled - PAddr texture_address = texture.config.GetPhysicalAddress(); - float24 shadow_z; - if (i == 0) { - switch (texture.config.type) { - case TexturingRegs::TextureConfig::Texture2D: - break; - case TexturingRegs::TextureConfig::ShadowCube: - case TexturingRegs::TextureConfig::TextureCube: { - auto w = GetInterpolatedAttribute(v0.tc0_w, v1.tc0_w, v2.tc0_w); - std::tie(u, v, shadow_z, texture_address) = - ConvertCubeCoord(u, v, w, regs.texturing); - break; - } - case TexturingRegs::TextureConfig::Projection2D: { - auto tc0_w = GetInterpolatedAttribute(v0.tc0_w, v1.tc0_w, v2.tc0_w); - u /= tc0_w; - v /= tc0_w; - break; - } - case TexturingRegs::TextureConfig::Shadow2D: { - auto tc0_w = GetInterpolatedAttribute(v0.tc0_w, v1.tc0_w, v2.tc0_w); - if (!regs.texturing.shadow.orthographic) { - u /= tc0_w; - v /= tc0_w; - } - - shadow_z = float24::FromFloat32(std::abs(tc0_w.ToFloat32())); - break; - } - case TexturingRegs::TextureConfig::Disabled: - continue; // skip this unit and continue to the next unit - default: - LOG_ERROR(HW_GPU, "Unhandled texture type {:x}", (int)texture.config.type); - UNIMPLEMENTED(); - break; - } - } - - int s = (int)(u * float24::FromFloat32(static_cast(texture.config.width))) - .ToFloat32(); - int t = (int)(v * float24::FromFloat32(static_cast(texture.config.height))) - .ToFloat32(); - - bool use_border_s = false; - bool use_border_t = false; - - if (texture.config.wrap_s == TexturingRegs::TextureConfig::ClampToBorder) { - use_border_s = s < 0 || s >= static_cast(texture.config.width); - } else if (texture.config.wrap_s == TexturingRegs::TextureConfig::ClampToBorder2) { - use_border_s = s >= static_cast(texture.config.width); - } - - if (texture.config.wrap_t == TexturingRegs::TextureConfig::ClampToBorder) { - use_border_t = t < 0 || t >= static_cast(texture.config.height); - } else if (texture.config.wrap_t == TexturingRegs::TextureConfig::ClampToBorder2) { - use_border_t = t >= static_cast(texture.config.height); - } - - if (use_border_s || use_border_t) { - auto border_color = texture.config.border_color; - texture_color[i] = - Common::MakeVec(border_color.r.Value(), border_color.g.Value(), - border_color.b.Value(), border_color.a.Value()) - .Cast(); - } else { - // Textures are laid out from bottom to top, hence we invert the t coordinate. - // NOTE: This may not be the right place for the inversion. - // TODO: Check if this applies to ETC textures, too. - s = GetWrappedTexCoord(texture.config.wrap_s, s, texture.config.width); - t = texture.config.height - 1 - - GetWrappedTexCoord(texture.config.wrap_t, t, texture.config.height); - - const u8* texture_data = - VideoCore::g_memory->GetPhysicalPointer(texture_address); - auto info = - Texture::TextureInfo::FromPicaRegister(texture.config, texture.format); - - // TODO: Apply the min and mag filters to the texture - texture_color[i] = Texture::LookupTexture(texture_data, s, t, info); - } - - if (i == 0 && (texture.config.type == TexturingRegs::TextureConfig::Shadow2D || - texture.config.type == TexturingRegs::TextureConfig::ShadowCube)) { - - s32 z_int = static_cast(std::min(shadow_z.ToFloat32(), 1.0f) * 0xFFFFFF); - z_int -= regs.texturing.shadow.bias << 1; - auto& color = texture_color[i]; - s32 z_ref = (color.w << 16) | (color.z << 8) | color.y; - u8 density; - if (z_ref >= z_int) { - density = color.x; - } else { - density = 0; - } - texture_color[i] = {density, density, density, density}; - } - } - - // sample procedural texture - if (regs.texturing.main_config.texture3_enable) { - const auto& proctex_uv = uv[regs.texturing.main_config.texture3_coordinates]; - texture_color[3] = ProcTex(proctex_uv.u().ToFloat32(), proctex_uv.v().ToFloat32(), - g_state.regs.texturing, g_state.proctex); - } - - // Texture environment - consists of 6 stages of color and alpha combining. - // - // Color combiners take three input color values from some source (e.g. interpolated - // vertex color, texture color, previous stage, etc), perform some very simple - // operations on each of them (e.g. inversion) and then calculate the output color - // with some basic arithmetic. Alpha combiners can be configured separately but work - // analogously. - Common::Vec4 combiner_output; - Common::Vec4 combiner_buffer = {0, 0, 0, 0}; - Common::Vec4 next_combiner_buffer = - Common::MakeVec(regs.texturing.tev_combiner_buffer_color.r.Value(), - regs.texturing.tev_combiner_buffer_color.g.Value(), - regs.texturing.tev_combiner_buffer_color.b.Value(), - regs.texturing.tev_combiner_buffer_color.a.Value()) - .Cast(); - - Common::Vec4 primary_fragment_color = {0, 0, 0, 0}; - Common::Vec4 secondary_fragment_color = {0, 0, 0, 0}; - - if (!g_state.regs.lighting.disable) { - Common::Quaternion normquat = - Common::Quaternion{ - {GetInterpolatedAttribute(v0.quat.x, v1.quat.x, v2.quat.x).ToFloat32(), - GetInterpolatedAttribute(v0.quat.y, v1.quat.y, v2.quat.y).ToFloat32(), - GetInterpolatedAttribute(v0.quat.z, v1.quat.z, v2.quat.z).ToFloat32()}, - GetInterpolatedAttribute(v0.quat.w, v1.quat.w, v2.quat.w).ToFloat32(), - } - .Normalized(); - - Common::Vec3 view{ - GetInterpolatedAttribute(v0.view.x, v1.view.x, v2.view.x).ToFloat32(), - GetInterpolatedAttribute(v0.view.y, v1.view.y, v2.view.y).ToFloat32(), - GetInterpolatedAttribute(v0.view.z, v1.view.z, v2.view.z).ToFloat32(), - }; - std::tie(primary_fragment_color, secondary_fragment_color) = ComputeFragmentsColors( - g_state.regs.lighting, g_state.lighting, normquat, view, texture_color); - } - - for (unsigned tev_stage_index = 0; tev_stage_index < tev_stages.size(); - ++tev_stage_index) { - const auto& tev_stage = tev_stages[tev_stage_index]; - using Source = TexturingRegs::TevStageConfig::Source; - - auto GetSource = [&](Source source) -> Common::Vec4 { - switch (source) { - case Source::PrimaryColor: - return primary_color; - - case Source::PrimaryFragmentColor: - return primary_fragment_color; - - case Source::SecondaryFragmentColor: - return secondary_fragment_color; - - case Source::Texture0: - return texture_color[0]; - - case Source::Texture1: - return texture_color[1]; - - case Source::Texture2: - return texture_color[2]; - - case Source::Texture3: - return texture_color[3]; - - case Source::PreviousBuffer: - return combiner_buffer; - - case Source::Constant: - return Common::MakeVec(tev_stage.const_r.Value(), tev_stage.const_g.Value(), - tev_stage.const_b.Value(), tev_stage.const_a.Value()) - .Cast(); - - case Source::Previous: - return combiner_output; - - default: - LOG_ERROR(HW_GPU, "Unknown color combiner source {}", (int)source); - UNIMPLEMENTED(); - return {0, 0, 0, 0}; - } - }; - - // color combiner - // NOTE: Not sure if the alpha combiner might use the color output of the previous - // stage as input. Hence, we currently don't directly write the result to - // combiner_output.rgb(), but instead store it in a temporary variable until - // alpha combining has been done. - Common::Vec3 color_result[3] = { - GetColorModifier(tev_stage.color_modifier1, GetSource(tev_stage.color_source1)), - GetColorModifier(tev_stage.color_modifier2, GetSource(tev_stage.color_source2)), - GetColorModifier(tev_stage.color_modifier3, GetSource(tev_stage.color_source3)), - }; - auto color_output = ColorCombine(tev_stage.color_op, color_result); - - u8 alpha_output; - if (tev_stage.color_op == TexturingRegs::TevStageConfig::Operation::Dot3_RGBA) { - // result of Dot3_RGBA operation is also placed to the alpha component - alpha_output = color_output.x; - } else { - // alpha combiner - std::array alpha_result = {{ - GetAlphaModifier(tev_stage.alpha_modifier1, - GetSource(tev_stage.alpha_source1)), - GetAlphaModifier(tev_stage.alpha_modifier2, - GetSource(tev_stage.alpha_source2)), - GetAlphaModifier(tev_stage.alpha_modifier3, - GetSource(tev_stage.alpha_source3)), - }}; - alpha_output = AlphaCombine(tev_stage.alpha_op, alpha_result); - } - - combiner_output[0] = - std::min((unsigned)255, color_output.r() * tev_stage.GetColorMultiplier()); - combiner_output[1] = - std::min((unsigned)255, color_output.g() * tev_stage.GetColorMultiplier()); - combiner_output[2] = - std::min((unsigned)255, color_output.b() * tev_stage.GetColorMultiplier()); - combiner_output[3] = - std::min((unsigned)255, alpha_output * tev_stage.GetAlphaMultiplier()); - - combiner_buffer = next_combiner_buffer; - - if (regs.texturing.tev_combiner_buffer_input.TevStageUpdatesCombinerBufferColor( - tev_stage_index)) { - next_combiner_buffer.r() = combiner_output.r(); - next_combiner_buffer.g() = combiner_output.g(); - next_combiner_buffer.b() = combiner_output.b(); - } - - if (regs.texturing.tev_combiner_buffer_input.TevStageUpdatesCombinerBufferAlpha( - tev_stage_index)) { - next_combiner_buffer.a() = combiner_output.a(); - } - } - - const auto& output_merger = regs.framebuffer.output_merger; - - if (output_merger.fragment_operation_mode == - FramebufferRegs::FragmentOperationMode::Shadow) { - u32 depth_int = static_cast(depth * 0xFFFFFF); - // use green color as the shadow intensity - u8 stencil = combiner_output.y; - DrawShadowMapPixel(x >> 4, y >> 4, depth_int, stencil); - // skip the normal output merger pipeline if it is in shadow mode - continue; - } - - // TODO: Does alpha testing happen before or after stencil? - if (output_merger.alpha_test.enable) { - bool pass = false; - - switch (output_merger.alpha_test.func) { - case FramebufferRegs::CompareFunc::Never: - pass = false; - break; - - case FramebufferRegs::CompareFunc::Always: - pass = true; - break; - - case FramebufferRegs::CompareFunc::Equal: - pass = combiner_output.a() == output_merger.alpha_test.ref; - break; - - case FramebufferRegs::CompareFunc::NotEqual: - pass = combiner_output.a() != output_merger.alpha_test.ref; - break; - - case FramebufferRegs::CompareFunc::LessThan: - pass = combiner_output.a() < output_merger.alpha_test.ref; - break; - - case FramebufferRegs::CompareFunc::LessThanOrEqual: - pass = combiner_output.a() <= output_merger.alpha_test.ref; - break; - - case FramebufferRegs::CompareFunc::GreaterThan: - pass = combiner_output.a() > output_merger.alpha_test.ref; - break; - - case FramebufferRegs::CompareFunc::GreaterThanOrEqual: - pass = combiner_output.a() >= output_merger.alpha_test.ref; - break; - } - - if (!pass) - continue; - } - - // Apply fog combiner - // Not fully accurate. We'd have to know what data type is used to - // store the depth etc. Using float for now until we know more - // about Pica datatypes - if (regs.texturing.fog_mode == TexturingRegs::FogMode::Fog) { - const Common::Vec3 fog_color = - Common::MakeVec(regs.texturing.fog_color.r.Value(), - regs.texturing.fog_color.g.Value(), - regs.texturing.fog_color.b.Value()) - .Cast(); - - // Get index into fog LUT - float fog_index; - if (g_state.regs.texturing.fog_flip) { - fog_index = (1.0f - depth) * 128.0f; - } else { - fog_index = depth * 128.0f; - } - - // Generate clamped fog factor from LUT for given fog index - float fog_i = std::clamp(floorf(fog_index), 0.0f, 127.0f); - float fog_f = fog_index - fog_i; - const auto& fog_lut_entry = g_state.fog.lut[static_cast(fog_i)]; - float fog_factor = fog_lut_entry.ToFloat() + fog_lut_entry.DiffToFloat() * fog_f; - fog_factor = std::clamp(fog_factor, 0.0f, 1.0f); - - // Blend the fog - for (unsigned i = 0; i < 3; i++) { - combiner_output[i] = static_cast(fog_factor * combiner_output[i] + - (1.0f - fog_factor) * fog_color[i]); - } - } - - u8 old_stencil = 0; - - auto UpdateStencil = [stencil_test, x, y, - &old_stencil](Pica::FramebufferRegs::StencilAction action) { - u8 new_stencil = - PerformStencilAction(action, old_stencil, stencil_test.reference_value); - if (g_state.regs.framebuffer.framebuffer.allow_depth_stencil_write != 0) - SetStencil(x >> 4, y >> 4, - (new_stencil & stencil_test.write_mask) | - (old_stencil & ~stencil_test.write_mask)); - }; - - if (stencil_action_enable) { - old_stencil = GetStencil(x >> 4, y >> 4); - u8 dest = old_stencil & stencil_test.input_mask; - u8 ref = stencil_test.reference_value & stencil_test.input_mask; - - bool pass = false; - switch (stencil_test.func) { - case FramebufferRegs::CompareFunc::Never: - pass = false; - break; - - case FramebufferRegs::CompareFunc::Always: - pass = true; - break; - - case FramebufferRegs::CompareFunc::Equal: - pass = (ref == dest); - break; - - case FramebufferRegs::CompareFunc::NotEqual: - pass = (ref != dest); - break; - - case FramebufferRegs::CompareFunc::LessThan: - pass = (ref < dest); - break; - - case FramebufferRegs::CompareFunc::LessThanOrEqual: - pass = (ref <= dest); - break; - - case FramebufferRegs::CompareFunc::GreaterThan: - pass = (ref > dest); - break; - - case FramebufferRegs::CompareFunc::GreaterThanOrEqual: - pass = (ref >= dest); - break; - } - - if (!pass) { - UpdateStencil(stencil_test.action_stencil_fail); - continue; - } - } - - // Convert float to integer - unsigned num_bits = - FramebufferRegs::DepthBitsPerPixel(regs.framebuffer.framebuffer.depth_format); - u32 z = (u32)(depth * ((1 << num_bits) - 1)); - - if (output_merger.depth_test_enable) { - u32 ref_z = GetDepth(x >> 4, y >> 4); - - bool pass = false; - - switch (output_merger.depth_test_func) { - case FramebufferRegs::CompareFunc::Never: - pass = false; - break; - - case FramebufferRegs::CompareFunc::Always: - pass = true; - break; - - case FramebufferRegs::CompareFunc::Equal: - pass = z == ref_z; - break; - - case FramebufferRegs::CompareFunc::NotEqual: - pass = z != ref_z; - break; - - case FramebufferRegs::CompareFunc::LessThan: - pass = z < ref_z; - break; - - case FramebufferRegs::CompareFunc::LessThanOrEqual: - pass = z <= ref_z; - break; - - case FramebufferRegs::CompareFunc::GreaterThan: - pass = z > ref_z; - break; - - case FramebufferRegs::CompareFunc::GreaterThanOrEqual: - pass = z >= ref_z; - break; - } - - if (!pass) { - if (stencil_action_enable) - UpdateStencil(stencil_test.action_depth_fail); - continue; - } - } - - if (regs.framebuffer.framebuffer.allow_depth_stencil_write != 0 && - output_merger.depth_write_enable) { - - SetDepth(x >> 4, y >> 4, z); - } - - // The stencil depth_pass action is executed even if depth testing is disabled - if (stencil_action_enable) - UpdateStencil(stencil_test.action_depth_pass); - - auto dest = GetPixel(x >> 4, y >> 4); - Common::Vec4 blend_output = combiner_output; - - if (output_merger.alphablend_enable) { - auto params = output_merger.alpha_blending; - - auto LookupFactor = [&](unsigned channel, - FramebufferRegs::BlendFactor factor) -> u8 { - DEBUG_ASSERT(channel < 4); - - const Common::Vec4 blend_const = - Common::MakeVec(output_merger.blend_const.r.Value(), - output_merger.blend_const.g.Value(), - output_merger.blend_const.b.Value(), - output_merger.blend_const.a.Value()) - .Cast(); - - switch (factor) { - case FramebufferRegs::BlendFactor::Zero: - return 0; - - case FramebufferRegs::BlendFactor::One: - return 255; - - case FramebufferRegs::BlendFactor::SourceColor: - return combiner_output[channel]; - - case FramebufferRegs::BlendFactor::OneMinusSourceColor: - return 255 - combiner_output[channel]; - - case FramebufferRegs::BlendFactor::DestColor: - return dest[channel]; - - case FramebufferRegs::BlendFactor::OneMinusDestColor: - return 255 - dest[channel]; - - case FramebufferRegs::BlendFactor::SourceAlpha: - return combiner_output.a(); - - case FramebufferRegs::BlendFactor::OneMinusSourceAlpha: - return 255 - combiner_output.a(); - - case FramebufferRegs::BlendFactor::DestAlpha: - return dest.a(); - - case FramebufferRegs::BlendFactor::OneMinusDestAlpha: - return 255 - dest.a(); - - case FramebufferRegs::BlendFactor::ConstantColor: - return blend_const[channel]; - - case FramebufferRegs::BlendFactor::OneMinusConstantColor: - return 255 - blend_const[channel]; - - case FramebufferRegs::BlendFactor::ConstantAlpha: - return blend_const.a(); - - case FramebufferRegs::BlendFactor::OneMinusConstantAlpha: - return 255 - blend_const.a(); - - case FramebufferRegs::BlendFactor::SourceAlphaSaturate: - // Returns 1.0 for the alpha channel - if (channel == 3) - return 255; - return std::min(combiner_output.a(), static_cast(255 - dest.a())); - - default: - LOG_CRITICAL(HW_GPU, "Unknown blend factor {:x}", factor); - UNIMPLEMENTED(); - break; - } - - return combiner_output[channel]; - }; - - auto srcfactor = Common::MakeVec(LookupFactor(0, params.factor_source_rgb), - LookupFactor(1, params.factor_source_rgb), - LookupFactor(2, params.factor_source_rgb), - LookupFactor(3, params.factor_source_a)); - - auto dstfactor = Common::MakeVec(LookupFactor(0, params.factor_dest_rgb), - LookupFactor(1, params.factor_dest_rgb), - LookupFactor(2, params.factor_dest_rgb), - LookupFactor(3, params.factor_dest_a)); - - blend_output = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor, - params.blend_equation_rgb); - blend_output.a() = EvaluateBlendEquation(combiner_output, srcfactor, dest, - dstfactor, params.blend_equation_a) - .a(); - } else { - blend_output = - Common::MakeVec(LogicOp(combiner_output.r(), dest.r(), output_merger.logic_op), - LogicOp(combiner_output.g(), dest.g(), output_merger.logic_op), - LogicOp(combiner_output.b(), dest.b(), output_merger.logic_op), - LogicOp(combiner_output.a(), dest.a(), output_merger.logic_op)); - } - - const Common::Vec4 result = { - output_merger.red_enable ? blend_output.r() : dest.r(), - output_merger.green_enable ? blend_output.g() : dest.g(), - output_merger.blue_enable ? blend_output.b() : dest.b(), - output_merger.alpha_enable ? blend_output.a() : dest.a(), - }; - - if (regs.framebuffer.framebuffer.allow_color_write != 0) - DrawPixel(x >> 4, y >> 4, result); - } - } -} - -void ProcessTriangle(const Vertex& v0, const Vertex& v1, const Vertex& v2) { - ProcessTriangleInternal(v0, v1, v2); -} - -} // namespace Pica::Rasterizer diff --git a/src/video_core/renderer_software/rasterizer.h b/src/video_core/renderer_software/rasterizer.h deleted file mode 100644 index 8ed4c8ca2..000000000 --- a/src/video_core/renderer_software/rasterizer.h +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2014 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include "video_core/shader/shader.h" - -namespace Pica::Rasterizer { - -struct Vertex : Shader::OutputVertex { - Vertex(const OutputVertex& v) : OutputVertex(v) {} - - // Attributes used to store intermediate results - // position after perspective divide - Common::Vec3 screenpos; - - // Linear interpolation - // factor: 0=this, 1=vtx - // Note: This function cannot be called after perspective divide - void Lerp(float24 factor, const Vertex& vtx) { - pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor); - quat = quat * factor + vtx.quat * (float24::FromFloat32(1) - factor); - color = color * factor + vtx.color * (float24::FromFloat32(1) - factor); - tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor); - tc1 = tc1 * factor + vtx.tc1 * (float24::FromFloat32(1) - factor); - tc0_w = tc0_w * factor + vtx.tc0_w * (float24::FromFloat32(1) - factor); - view = view * factor + vtx.view * (float24::FromFloat32(1) - factor); - tc2 = tc2 * factor + vtx.tc2 * (float24::FromFloat32(1) - factor); - } - - // Linear interpolation - // factor: 0=v0, 1=v1 - // Note: This function cannot be called after perspective divide - static Vertex Lerp(float24 factor, const Vertex& v0, const Vertex& v1) { - Vertex ret = v0; - ret.Lerp(factor, v1); - return ret; - } -}; - -void ProcessTriangle(const Vertex& v0, const Vertex& v1, const Vertex& v2); - -} // namespace Pica::Rasterizer diff --git a/src/video_core/renderer_software/renderer_software.cpp b/src/video_core/renderer_software/renderer_software.cpp index 9f7bf37d3..b423ae5ce 100644 --- a/src/video_core/renderer_software/renderer_software.cpp +++ b/src/video_core/renderer_software/renderer_software.cpp @@ -2,18 +2,86 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include "common/color.h" +#include "core/core.h" +#include "core/hw/gpu.h" +#include "core/hw/hw.h" +#include "core/hw/lcd.h" #include "video_core/renderer_software/renderer_software.h" -namespace VideoCore { +namespace SwRenderer { RendererSoftware::RendererSoftware(Core::System& system, Frontend::EmuWindow& window) - : VideoCore::RendererBase{system, window, nullptr}, - rasterizer{std::make_unique()} {} + : VideoCore::RendererBase{system, window, nullptr}, memory{system.Memory()}, + rasterizer{std::make_unique(system.Memory())} {} RendererSoftware::~RendererSoftware() = default; void RendererSoftware::SwapBuffers() { + PrepareRenderTarget(); EndFrame(); } -} // namespace VideoCore +void RendererSoftware::PrepareRenderTarget() { + for (int i : {0, 1, 2}) { + const int fb_id = i == 2 ? 1 : 0; + const auto& framebuffer = GPU::g_regs.framebuffer_config[fb_id]; + auto& info = screen_infos[i]; + + u32 lcd_color_addr = + (fb_id == 0) ? LCD_REG_INDEX(color_fill_top) : LCD_REG_INDEX(color_fill_bottom); + lcd_color_addr = HW::VADDR_LCD + 4 * lcd_color_addr; + LCD::Regs::ColorFill color_fill = {0}; + LCD::Read(color_fill.raw, lcd_color_addr); + + if (!color_fill.is_enabled) { + const u32 old_width = std::exchange(info.width, framebuffer.width); + const u32 old_height = std::exchange(info.height, framebuffer.height); + if (framebuffer.width != old_width || framebuffer.height != old_height) [[unlikely]] { + info.pixels.resize(framebuffer.width * framebuffer.height * 4); + } + CopyPixels(i); + } + } +} + +void RendererSoftware::CopyPixels(int i) { + const u32 fb_id = i == 2 ? 1 : 0; + const auto& framebuffer = GPU::g_regs.framebuffer_config[fb_id]; + + const PAddr framebuffer_addr = + framebuffer.active_fb == 0 ? framebuffer.address_left1 : framebuffer.address_left2; + const s32 bpp = GPU::Regs::BytesPerPixel(framebuffer.color_format); + const u8* framebuffer_data = memory.GetPhysicalPointer(framebuffer_addr); + + const s32 stride = framebuffer.stride; + const s32 height = framebuffer.height; + ASSERT(stride * height != 0); + + u32 output_offset = 0; + for (u32 y = 0; y < framebuffer.height; y++) { + for (u32 x = 0; x < framebuffer.width; x++) { + const u8* pixel = framebuffer_data + (y * stride + x) * bpp; + const Common::Vec4 color = [&] { + switch (framebuffer.color_format) { + case GPU::Regs::PixelFormat::RGBA8: + return Common::Color::DecodeRGBA8(pixel); + case GPU::Regs::PixelFormat::RGB8: + return Common::Color::DecodeRGB8(pixel); + case GPU::Regs::PixelFormat::RGB565: + return Common::Color::DecodeRGB565(pixel); + case GPU::Regs::PixelFormat::RGB5A1: + return Common::Color::DecodeRGB5A1(pixel); + case GPU::Regs::PixelFormat::RGBA4: + return Common::Color::DecodeRGBA4(pixel); + } + UNREACHABLE(); + }(); + u8* dest = screen_infos[i].pixels.data() + output_offset; + std::memcpy(dest, color.AsArray(), sizeof(color)); + output_offset += sizeof(color); + } + } +} + +} // namespace SwRenderer diff --git a/src/video_core/renderer_software/renderer_software.h b/src/video_core/renderer_software/renderer_software.h index 443f7c307..1c17e321b 100644 --- a/src/video_core/renderer_software/renderer_software.h +++ b/src/video_core/renderer_software/renderer_software.h @@ -11,7 +11,13 @@ namespace Core { class System; } -namespace VideoCore { +namespace SwRenderer { + +struct ScreenInfo { + u32 width; + u32 height; + std::vector pixels; +}; class RendererSoftware : public VideoCore::RendererBase { public: @@ -22,12 +28,22 @@ public: return rasterizer.get(); } + [[nodiscard]] const ScreenInfo& Screen(VideoCore::ScreenId id) const noexcept { + return screen_infos[static_cast(id)]; + } + void SwapBuffers() override; void TryPresent(int timeout_ms, bool is_secondary) override {} void Sync() override {} private: + void PrepareRenderTarget(); + void CopyPixels(int i); + +private: + Memory::MemorySystem& memory; std::unique_ptr rasterizer; + std::array screen_infos{}; }; -} // namespace VideoCore +} // namespace SwRenderer diff --git a/src/video_core/renderer_software/sw_clipper.cpp b/src/video_core/renderer_software/sw_clipper.cpp index b745353ce..f8d1192d6 100644 --- a/src/video_core/renderer_software/sw_clipper.cpp +++ b/src/video_core/renderer_software/sw_clipper.cpp @@ -1,196 +1,88 @@ -// Copyright 2014 Citra Emulator Project +// Copyright 2023 Citra Emulator Project // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include #include #include -#include -#include "common/bit_field.h" -#include "common/common_types.h" -#include "common/logging/log.h" -#include "common/vector_math.h" -#include "video_core/pica_state.h" -#include "video_core/pica_types.h" -#include "video_core/renderer_software/rasterizer.h" +#include "video_core/regs_texturing.h" #include "video_core/renderer_software/sw_clipper.h" -#include "video_core/shader/shader.h" -using Pica::Rasterizer::Vertex; +namespace SwRenderer { -namespace Pica::Clipper { +using Pica::TexturingRegs; -struct ClippingEdge { -public: - ClippingEdge(Common::Vec4 coeffs, - Common::Vec4 bias = Common::Vec4(float24::FromFloat32(0), - float24::FromFloat32(0), - float24::FromFloat32(0), - float24::FromFloat32(0))) - : coeffs(coeffs), bias(bias) {} - - bool IsInside(const Vertex& vertex) const { - return Common::Dot(vertex.pos + bias, coeffs) >= float24::FromFloat32(0); +void FlipQuaternionIfOpposite(Common::Vec4& a, const Common::Vec4& b) { + if (Common::Dot(a, b) < f24::Zero()) { + a *= f24::FromFloat32(-1.0f); } - - bool IsOutSide(const Vertex& vertex) const { - return !IsInside(vertex); - } - - Vertex GetIntersection(const Vertex& v0, const Vertex& v1) const { - float24 dp = Common::Dot(v0.pos + bias, coeffs); - float24 dp_prev = Common::Dot(v1.pos + bias, coeffs); - float24 factor = dp_prev / (dp_prev - dp); - - return Vertex::Lerp(factor, v0, v1); - } - -private: - [[maybe_unused]] float24 pos; - Common::Vec4 coeffs; - Common::Vec4 bias; }; -static void InitScreenCoordinates(Vertex& vtx) { - struct { - float24 halfsize_x; - float24 offset_x; - float24 halfsize_y; - float24 offset_y; - float24 zscale; - float24 offset_z; - } viewport; +int SignedArea(const Common::Vec2& vtx1, const Common::Vec2& vtx2, + const Common::Vec2& vtx3) { + const auto vec1 = Common::MakeVec(vtx2 - vtx1, 0); + const auto vec2 = Common::MakeVec(vtx3 - vtx1, 0); + // TODO: There is a very small chance this will overflow for sizeof(int) == 4 + return Common::Cross(vec1, vec2).z; +}; - const auto& regs = g_state.regs; - viewport.halfsize_x = float24::FromRaw(regs.rasterizer.viewport_size_x); - viewport.halfsize_y = float24::FromRaw(regs.rasterizer.viewport_size_y); - viewport.offset_x = float24::FromFloat32(static_cast(regs.rasterizer.viewport_corner.x)); - viewport.offset_y = float24::FromFloat32(static_cast(regs.rasterizer.viewport_corner.y)); - - float24 inv_w = float24::FromFloat32(1.f) / vtx.pos.w; - vtx.pos.w = inv_w; - vtx.quat *= inv_w; - vtx.color *= inv_w; - vtx.tc0 *= inv_w; - vtx.tc1 *= inv_w; - vtx.tc0_w *= inv_w; - vtx.view *= inv_w; - vtx.tc2 *= inv_w; - - vtx.screenpos[0] = - (vtx.pos.x * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x; - vtx.screenpos[1] = - (vtx.pos.y * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y; - vtx.screenpos[2] = vtx.pos.z * inv_w; -} - -void ProcessTriangle(const OutputVertex& v0, const OutputVertex& v1, const OutputVertex& v2) { - using boost::container::static_vector; - - // Clipping a planar n-gon against a plane will remove at least 1 vertex and introduces 2 at - // the new edge (or less in degenerate cases). As such, we can say that each clipping plane - // introduces at most 1 new vertex to the polygon. Since we start with a triangle and have a - // fixed 6 clipping planes, the maximum number of vertices of the clipped polygon is 3 + 6 = 9. - static const std::size_t MAX_VERTICES = 9; - static_vector buffer_a = {v0, v1, v2}; - static_vector buffer_b; - - auto FlipQuaternionIfOpposite = [](auto& a, const auto& b) { - if (Common::Dot(a, b) < float24::Zero()) - a = a * float24::FromFloat32(-1.0f); - }; - - // Flip the quaternions if they are opposite to prevent interpolating them over the wrong - // direction. - FlipQuaternionIfOpposite(buffer_a[1].quat, buffer_a[0].quat); - FlipQuaternionIfOpposite(buffer_a[2].quat, buffer_a[0].quat); - - auto* output_list = &buffer_a; - auto* input_list = &buffer_b; - - // NOTE: We clip against a w=epsilon plane to guarantee that the output has a positive w value. - // TODO: Not sure if this is a valid approach. Also should probably instead use the smallest - // epsilon possible within float24 accuracy. - static const float24 EPSILON = float24::FromFloat32(0.00001f); - static const float24 f0 = float24::FromFloat32(0.0); - static const float24 f1 = float24::FromFloat32(1.0); - static const std::array clipping_edges = {{ - {Common::MakeVec(-f1, f0, f0, f1)}, // x = +w - {Common::MakeVec(f1, f0, f0, f1)}, // x = -w - {Common::MakeVec(f0, -f1, f0, f1)}, // y = +w - {Common::MakeVec(f0, f1, f0, f1)}, // y = -w - {Common::MakeVec(f0, f0, -f1, f0)}, // z = 0 - {Common::MakeVec(f0, f0, f1, f1)}, // z = -w - {Common::MakeVec(f0, f0, f0, f1), - Common::Vec4(f0, f0, f0, EPSILON)}, // w = EPSILON - }}; - - // Simple implementation of the Sutherland-Hodgman clipping algorithm. - // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here) - auto Clip = [&](const ClippingEdge& edge) { - std::swap(input_list, output_list); - output_list->clear(); - - const Vertex* reference_vertex = &input_list->back(); - - for (const auto& vertex : *input_list) { - // NOTE: This algorithm changes vertex order in some cases! - if (edge.IsInside(vertex)) { - if (edge.IsOutSide(*reference_vertex)) { - output_list->push_back(edge.GetIntersection(vertex, *reference_vertex)); - } - - output_list->push_back(vertex); - } else if (edge.IsInside(*reference_vertex)) { - output_list->push_back(edge.GetIntersection(vertex, *reference_vertex)); - } - reference_vertex = &vertex; +std::tuple ConvertCubeCoord(f24 u, f24 v, f24 w, + const Pica::TexturingRegs& regs) { + const float abs_u = std::abs(u.ToFloat32()); + const float abs_v = std::abs(v.ToFloat32()); + const float abs_w = std::abs(w.ToFloat32()); + f24 x, y, z; + PAddr addr; + if (abs_u > abs_v && abs_u > abs_w) { + if (u > f24::Zero()) { + addr = regs.GetCubePhysicalAddress(TexturingRegs::CubeFace::PositiveX); + y = -v; + } else { + addr = regs.GetCubePhysicalAddress(TexturingRegs::CubeFace::NegativeX); + y = v; } - }; - - for (auto edge : clipping_edges) { - Clip(edge); - - // Need to have at least a full triangle to continue... - if (output_list->size() < 3) - return; + x = -w; + z = u; + } else if (abs_v > abs_w) { + if (v > f24::Zero()) { + addr = regs.GetCubePhysicalAddress(TexturingRegs::CubeFace::PositiveY); + x = u; + } else { + addr = regs.GetCubePhysicalAddress(TexturingRegs::CubeFace::NegativeY); + x = -u; + } + y = w; + z = v; + } else { + if (w > f24::Zero()) { + addr = regs.GetCubePhysicalAddress(TexturingRegs::CubeFace::PositiveZ); + y = -v; + } else { + addr = regs.GetCubePhysicalAddress(TexturingRegs::CubeFace::NegativeZ); + y = v; + } + x = u; + z = w; } + const f24 z_abs = f24::FromFloat32(std::abs(z.ToFloat32())); + const f24 half = f24::FromFloat32(0.5f); + return std::make_tuple(x / z * half + half, y / z * half + half, z_abs, addr); +} - if (g_state.regs.rasterizer.clip_enable) { - ClippingEdge custom_edge{g_state.regs.rasterizer.GetClipCoef()}; - Clip(custom_edge); - - if (output_list->size() < 3) - return; - } - - InitScreenCoordinates((*output_list)[0]); - InitScreenCoordinates((*output_list)[1]); - - for (std::size_t i = 0; i < output_list->size() - 2; i++) { - Vertex& vtx0 = (*output_list)[0]; - Vertex& vtx1 = (*output_list)[i + 1]; - Vertex& vtx2 = (*output_list)[i + 2]; - - InitScreenCoordinates(vtx2); - - LOG_TRACE( - Render_Software, - "Triangle {}/{} at position ({:.3}, {:.3}, {:.3}, {:.3f}), " - "({:.3}, {:.3}, {:.3}, {:.3}), ({:.3}, {:.3}, {:.3}, {:.3}) and " - "screen position ({:.2}, {:.2}, {:.2}), ({:.2}, {:.2}, {:.2}), ({:.2}, {:.2}, {:.2})", - i + 1, output_list->size() - 2, vtx0.pos.x.ToFloat32(), vtx0.pos.y.ToFloat32(), - vtx0.pos.z.ToFloat32(), vtx0.pos.w.ToFloat32(), vtx1.pos.x.ToFloat32(), - vtx1.pos.y.ToFloat32(), vtx1.pos.z.ToFloat32(), vtx1.pos.w.ToFloat32(), - vtx2.pos.x.ToFloat32(), vtx2.pos.y.ToFloat32(), vtx2.pos.z.ToFloat32(), - vtx2.pos.w.ToFloat32(), vtx0.screenpos.x.ToFloat32(), vtx0.screenpos.y.ToFloat32(), - vtx0.screenpos.z.ToFloat32(), vtx1.screenpos.x.ToFloat32(), - vtx1.screenpos.y.ToFloat32(), vtx1.screenpos.z.ToFloat32(), - vtx2.screenpos.x.ToFloat32(), vtx2.screenpos.y.ToFloat32(), - vtx2.screenpos.z.ToFloat32()); - - Rasterizer::ProcessTriangle(vtx0, vtx1, vtx2); +bool IsRightSideOrFlatBottomEdge(const Common::Vec2& vtx, + const Common::Vec2& line1, + const Common::Vec2& line2) { + if (line1.y == line2.y) { + // Just check if vertex is above us => bottom line parallel to x-axis + return vtx.y < line1.y; + } else { + // Check if vertex is on our left => right side + // TODO: Not sure how likely this is to overflow + const auto svtx = vtx.Cast(); + const auto sline1 = line1.Cast(); + const auto sline2 = line2.Cast(); + return svtx.x < + sline1.x + (sline2.x - sline1.x) * (svtx.y - sline1.y) / (sline2.y - sline1.y); } } -} // namespace Pica::Clipper +} // namespace SwRenderer diff --git a/src/video_core/renderer_software/sw_clipper.h b/src/video_core/renderer_software/sw_clipper.h index c9e14e3d7..737d8d0e9 100644 --- a/src/video_core/renderer_software/sw_clipper.h +++ b/src/video_core/renderer_software/sw_clipper.h @@ -1,19 +1,87 @@ -// Copyright 2014 Citra Emulator Project +// Copyright 2023 Citra Emulator Project // Licensed under GPLv2 or any later version // Refer to the license.txt file included. #pragma once +#include "common/common_types.h" +#include "common/vector_math.h" +#include "video_core/pica_types.h" + namespace Pica { -namespace Shader { -struct OutputVertex; +struct TexturingRegs; } -namespace Clipper { +namespace SwRenderer { -using Shader::OutputVertex; +using Pica::f24; -void ProcessTriangle(const OutputVertex& v0, const OutputVertex& v1, const OutputVertex& v2); +// NOTE: Assuming that rasterizer coordinates are 12.4 fixed-point values +struct Fix12P4 { + Fix12P4() {} + Fix12P4(u16 val) : val(val) {} -} // namespace Clipper -} // namespace Pica + static Fix12P4 FromFloat24(f24 flt) { + // TODO: Rounding here is necessary to prevent garbage pixels at + // triangle borders. Is it that the correct solution, though? + return Fix12P4(static_cast(round(flt.ToFloat32() * 16.0f))); + } + + static u16 FracMask() { + return 0xF; + } + static u16 IntMask() { + return static_cast(~0xF); + } + + operator u16() const { + return val; + } + + bool operator<(const Fix12P4& oth) const { + return (u16) * this < (u16)oth; + } + +private: + u16 val; +}; + +struct Viewport { + f24 halfsize_x; + f24 offset_x; + f24 halfsize_y; + f24 offset_y; + f24 zscale; + f24 offset_z; +}; + +/** + * Flips the quaternions if they are opposite to prevent + * interpolating them over the wrong direction. + */ +void FlipQuaternionIfOpposite(Common::Vec4& a, const Common::Vec4& b); + +/** + * Calculate signed area of the triangle spanned by the three argument vertices. + * The sign denotes an orientation. + **/ +int SignedArea(const Common::Vec2& vtx1, const Common::Vec2& vtx2, + const Common::Vec2& vtx3); + +/** + * Convert a 3D vector for cube map coordinates to 2D texture coordinates along with the face name. + **/ +std::tuple ConvertCubeCoord(f24 u, f24 v, f24 w, + const Pica::TexturingRegs& regs); + +/** + * Triangle filling rules: Pixels on the right-sided edge or on flat bottom edges are not + * drawn. Pixels on any other triangle border are drawn. This is implemented with three bias + * values which are added to the barycentric coordinates w0, w1 and w2, respectively. + * NOTE: These are the PSP filling rules. Not sure if the 3DS uses the same ones... + **/ +bool IsRightSideOrFlatBottomEdge(const Common::Vec2& vtx, + const Common::Vec2& line1, + const Common::Vec2& line2); + +} // namespace SwRenderer diff --git a/src/video_core/renderer_software/sw_framebuffer.cpp b/src/video_core/renderer_software/sw_framebuffer.cpp index fafa6fb79..3f4e47b01 100644 --- a/src/video_core/renderer_software/sw_framebuffer.cpp +++ b/src/video_core/renderer_software/sw_framebuffer.cpp @@ -3,23 +3,46 @@ // Refer to the license.txt file included. #include -#include "common/assert.h" #include "common/color.h" -#include "common/common_types.h" #include "common/logging/log.h" -#include "common/vector_math.h" #include "core/hw/gpu.h" #include "core/memory.h" -#include "video_core/pica_state.h" +#include "video_core/pica_types.h" #include "video_core/regs_framebuffer.h" #include "video_core/renderer_software/sw_framebuffer.h" #include "video_core/utils.h" -#include "video_core/video_core.h" -namespace Pica::Rasterizer { +namespace SwRenderer { -void DrawPixel(int x, int y, const Common::Vec4& color) { - const auto& framebuffer = g_state.regs.framebuffer.framebuffer; +using Pica::f16; +using Pica::FramebufferRegs; + +namespace { + +/// Decode/Encode for shadow map format. It is similar to D24S8 format, +/// but the depth field is in big-endian. +const Common::Vec2 DecodeD24S8Shadow(const u8* bytes) { + return {static_cast((bytes[0] << 16) | (bytes[1] << 8) | bytes[2]), bytes[3]}; +} + +void EncodeD24X8Shadow(u32 depth, u8* bytes) { + bytes[2] = depth & 0xFF; + bytes[1] = (depth >> 8) & 0xFF; + bytes[0] = (depth >> 16) & 0xFF; +} + +void EncodeX24S8Shadow(u8 stencil, u8* bytes) { + bytes[3] = stencil; +} +} // Anonymous namespace + +Framebuffer::Framebuffer(Memory::MemorySystem& memory_, const Pica::FramebufferRegs& regs_) + : memory{memory_}, regs{regs_} {} + +Framebuffer::~Framebuffer() = default; + +void Framebuffer::DrawPixel(int x, int y, const Common::Vec4& color) const { + const auto& framebuffer = regs.framebuffer; const PAddr addr = framebuffer.GetColorBufferPhysicalAddress(); // Similarly to textures, the render framebuffer is laid out from bottom to top, too. @@ -27,33 +50,29 @@ void DrawPixel(int x, int y, const Common::Vec4& color) { y = framebuffer.height - y; const u32 coarse_y = y & ~7; - u32 bytes_per_pixel = + const u32 bytes_per_pixel = GPU::Regs::BytesPerPixel(GPU::Regs::PixelFormat(framebuffer.color_format.Value())); - u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + - coarse_y * framebuffer.width * bytes_per_pixel; - u8* dst_pixel = VideoCore::g_memory->GetPhysicalPointer(addr) + dst_offset; + const u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + + coarse_y * framebuffer.width * bytes_per_pixel; + u8* depth_buffer = memory.GetPhysicalPointer(addr); + u8* dst_pixel = depth_buffer + dst_offset; switch (framebuffer.color_format) { case FramebufferRegs::ColorFormat::RGBA8: Common::Color::EncodeRGBA8(color, dst_pixel); break; - case FramebufferRegs::ColorFormat::RGB8: Common::Color::EncodeRGB8(color, dst_pixel); break; - case FramebufferRegs::ColorFormat::RGB5A1: Common::Color::EncodeRGB5A1(color, dst_pixel); break; - case FramebufferRegs::ColorFormat::RGB565: Common::Color::EncodeRGB565(color, dst_pixel); break; - case FramebufferRegs::ColorFormat::RGBA4: Common::Color::EncodeRGBA4(color, dst_pixel); break; - default: LOG_CRITICAL(Render_Software, "Unknown framebuffer color format {:x}", static_cast(framebuffer.color_format.Value())); @@ -61,35 +80,31 @@ void DrawPixel(int x, int y, const Common::Vec4& color) { } } -const Common::Vec4 GetPixel(int x, int y) { - const auto& framebuffer = g_state.regs.framebuffer.framebuffer; +const Common::Vec4 Framebuffer::GetPixel(int x, int y) const { + const auto& framebuffer = regs.framebuffer; const PAddr addr = framebuffer.GetColorBufferPhysicalAddress(); y = framebuffer.height - y; const u32 coarse_y = y & ~7; - u32 bytes_per_pixel = + const u32 bytes_per_pixel = GPU::Regs::BytesPerPixel(GPU::Regs::PixelFormat(framebuffer.color_format.Value())); - u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + - coarse_y * framebuffer.width * bytes_per_pixel; - u8* src_pixel = VideoCore::g_memory->GetPhysicalPointer(addr) + src_offset; + const u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + + coarse_y * framebuffer.width * bytes_per_pixel; + const u8* color_buffer = memory.GetPhysicalPointer(addr); + const u8* src_pixel = color_buffer + src_offset; switch (framebuffer.color_format) { case FramebufferRegs::ColorFormat::RGBA8: return Common::Color::DecodeRGBA8(src_pixel); - case FramebufferRegs::ColorFormat::RGB8: return Common::Color::DecodeRGB8(src_pixel); - case FramebufferRegs::ColorFormat::RGB5A1: return Common::Color::DecodeRGB5A1(src_pixel); - case FramebufferRegs::ColorFormat::RGB565: return Common::Color::DecodeRGB565(src_pixel); - case FramebufferRegs::ColorFormat::RGBA4: return Common::Color::DecodeRGBA4(src_pixel); - default: LOG_CRITICAL(Render_Software, "Unknown framebuffer color format {:x}", static_cast(framebuffer.color_format.Value())); @@ -99,19 +114,19 @@ const Common::Vec4 GetPixel(int x, int y) { return {0, 0, 0, 0}; } -u32 GetDepth(int x, int y) { - const auto& framebuffer = g_state.regs.framebuffer.framebuffer; +u32 Framebuffer::GetDepth(int x, int y) const { + const auto& framebuffer = regs.framebuffer; const PAddr addr = framebuffer.GetDepthBufferPhysicalAddress(); - u8* depth_buffer = VideoCore::g_memory->GetPhysicalPointer(addr); y = framebuffer.height - y; const u32 coarse_y = y & ~7; - u32 bytes_per_pixel = FramebufferRegs::BytesPerDepthPixel(framebuffer.depth_format); - u32 stride = framebuffer.width * bytes_per_pixel; + const u32 bytes_per_pixel = FramebufferRegs::BytesPerDepthPixel(framebuffer.depth_format); + const u32 stride = framebuffer.width * bytes_per_pixel; - u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; - u8* src_pixel = depth_buffer + src_offset; + const u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; + const u8* depth_buffer = memory.GetPhysicalPointer(addr); + const u8* src_pixel = depth_buffer + src_offset; switch (framebuffer.depth_format) { case FramebufferRegs::DepthFormat::D16: @@ -128,24 +143,23 @@ u32 GetDepth(int x, int y) { } } -u8 GetStencil(int x, int y) { - const auto& framebuffer = g_state.regs.framebuffer.framebuffer; +u8 Framebuffer::GetStencil(int x, int y) const { + const auto& framebuffer = regs.framebuffer; const PAddr addr = framebuffer.GetDepthBufferPhysicalAddress(); - u8* depth_buffer = VideoCore::g_memory->GetPhysicalPointer(addr); y = framebuffer.height - y; const u32 coarse_y = y & ~7; - u32 bytes_per_pixel = Pica::FramebufferRegs::BytesPerDepthPixel(framebuffer.depth_format); - u32 stride = framebuffer.width * bytes_per_pixel; + const u32 bytes_per_pixel = Pica::FramebufferRegs::BytesPerDepthPixel(framebuffer.depth_format); + const u32 stride = framebuffer.width * bytes_per_pixel; - u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; - u8* src_pixel = depth_buffer + src_offset; + const u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; + const u8* depth_buffer = memory.GetPhysicalPointer(addr); + const u8* src_pixel = depth_buffer + src_offset; switch (framebuffer.depth_format) { case FramebufferRegs::DepthFormat::D24S8: return Common::Color::DecodeD24S8(src_pixel).y; - default: LOG_WARNING( HW_GPU, @@ -155,33 +169,30 @@ u8 GetStencil(int x, int y) { } } -void SetDepth(int x, int y, u32 value) { - const auto& framebuffer = g_state.regs.framebuffer.framebuffer; +void Framebuffer::SetDepth(int x, int y, u32 value) const { + const auto& framebuffer = regs.framebuffer; const PAddr addr = framebuffer.GetDepthBufferPhysicalAddress(); - u8* depth_buffer = VideoCore::g_memory->GetPhysicalPointer(addr); y = framebuffer.height - y; const u32 coarse_y = y & ~7; - u32 bytes_per_pixel = FramebufferRegs::BytesPerDepthPixel(framebuffer.depth_format); - u32 stride = framebuffer.width * bytes_per_pixel; + const u32 bytes_per_pixel = FramebufferRegs::BytesPerDepthPixel(framebuffer.depth_format); + const u32 stride = framebuffer.width * bytes_per_pixel; - u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; + const u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; + u8* depth_buffer = memory.GetPhysicalPointer(addr); u8* dst_pixel = depth_buffer + dst_offset; switch (framebuffer.depth_format) { case FramebufferRegs::DepthFormat::D16: Common::Color::EncodeD16(value, dst_pixel); break; - case FramebufferRegs::DepthFormat::D24: Common::Color::EncodeD24(value, dst_pixel); break; - case FramebufferRegs::DepthFormat::D24S8: Common::Color::EncodeD24X8(value, dst_pixel); break; - default: LOG_CRITICAL(HW_GPU, "Unimplemented depth format {}", static_cast(framebuffer.depth_format.Value())); @@ -190,18 +201,18 @@ void SetDepth(int x, int y, u32 value) { } } -void SetStencil(int x, int y, u8 value) { - const auto& framebuffer = g_state.regs.framebuffer.framebuffer; +void Framebuffer::SetStencil(int x, int y, u8 value) const { + const auto& framebuffer = regs.framebuffer; const PAddr addr = framebuffer.GetDepthBufferPhysicalAddress(); - u8* depth_buffer = VideoCore::g_memory->GetPhysicalPointer(addr); y = framebuffer.height - y; const u32 coarse_y = y & ~7; - u32 bytes_per_pixel = Pica::FramebufferRegs::BytesPerDepthPixel(framebuffer.depth_format); - u32 stride = framebuffer.width * bytes_per_pixel; + const u32 bytes_per_pixel = Pica::FramebufferRegs::BytesPerDepthPixel(framebuffer.depth_format); + const u32 stride = framebuffer.width * bytes_per_pixel; - u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; + const u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; + u8* depth_buffer = memory.GetPhysicalPointer(addr); u8* dst_pixel = depth_buffer + dst_offset; switch (framebuffer.depth_format) { @@ -209,11 +220,9 @@ void SetStencil(int x, int y, u8 value) { case Pica::FramebufferRegs::DepthFormat::D24: // Nothing to do break; - case Pica::FramebufferRegs::DepthFormat::D24S8: Common::Color::EncodeX24S8(value, dst_pixel); break; - default: LOG_CRITICAL(HW_GPU, "Unimplemented depth format {}", static_cast(framebuffer.depth_format.Value())); @@ -222,36 +231,65 @@ void SetStencil(int x, int y, u8 value) { } } +void Framebuffer::DrawShadowMapPixel(int x, int y, u32 depth, u8 stencil) const { + const auto& framebuffer = regs.framebuffer; + const auto& shadow = regs.shadow; + const PAddr addr = framebuffer.GetColorBufferPhysicalAddress(); + + y = framebuffer.height - y; + + const u32 coarse_y = y & ~7; + u32 bytes_per_pixel = 4; + u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + + coarse_y * framebuffer.width * bytes_per_pixel; + u8* shadow_buffer = memory.GetPhysicalPointer(addr); + u8* dst_pixel = shadow_buffer + dst_offset; + + const auto ref = DecodeD24S8Shadow(dst_pixel); + const u32 ref_z = ref.x; + const u32 ref_s = ref.y; + + if (depth >= ref_z) { + return; + } + + if (stencil == 0) { + EncodeD24X8Shadow(depth, dst_pixel); + } else { + const f16 constant = f16::FromRaw(shadow.constant); + const f16 linear = f16::FromRaw(shadow.linear); + const f16 x_ = f16::FromFloat32(static_cast(depth) / ref_z); + const f16 stencil_new = f16::FromFloat32(stencil) / (constant + linear * x_); + stencil = static_cast(std::clamp(stencil_new.ToFloat32(), 0.0f, 255.0f)); + + if (stencil < ref_s) { + EncodeX24S8Shadow(stencil, dst_pixel); + } + } +} + u8 PerformStencilAction(FramebufferRegs::StencilAction action, u8 old_stencil, u8 ref) { switch (action) { case FramebufferRegs::StencilAction::Keep: return old_stencil; - case FramebufferRegs::StencilAction::Zero: return 0; - case FramebufferRegs::StencilAction::Replace: return ref; - case FramebufferRegs::StencilAction::Increment: // Saturated increment return std::min(old_stencil, 254) + 1; - case FramebufferRegs::StencilAction::Decrement: // Saturated decrement return std::max(old_stencil, 1) - 1; - case FramebufferRegs::StencilAction::Invert: return ~old_stencil; - case FramebufferRegs::StencilAction::IncrementWrap: return old_stencil + 1; - case FramebufferRegs::StencilAction::DecrementWrap: return old_stencil - 1; - default: - LOG_CRITICAL(HW_GPU, "Unknown stencil action {:x}", (int)action); + LOG_CRITICAL(HW_GPU, "Unknown stencil action {:x}", static_cast(action)); UNIMPLEMENTED(); return 0; } @@ -262,24 +300,21 @@ Common::Vec4 EvaluateBlendEquation(const Common::Vec4& src, const Common::Vec4& dest, const Common::Vec4& destfactor, FramebufferRegs::BlendEquation equation) { - Common::Vec4 result; + Common::Vec4i result; - auto src_result = (src * srcfactor).Cast(); - auto dst_result = (dest * destfactor).Cast(); + const auto src_result = (src * srcfactor).Cast(); + const auto dst_result = (dest * destfactor).Cast(); switch (equation) { case FramebufferRegs::BlendEquation::Add: result = (src_result + dst_result) / 255; break; - case FramebufferRegs::BlendEquation::Subtract: result = (src_result - dst_result) / 255; break; - case FramebufferRegs::BlendEquation::ReverseSubtract: result = (dst_result - src_result) / 255; break; - // TODO: How do these two actually work? OpenGL doesn't include the blend factors in the // min/max computations, but is this what the 3DS actually does? case FramebufferRegs::BlendEquation::Min: @@ -288,14 +323,12 @@ Common::Vec4 EvaluateBlendEquation(const Common::Vec4& src, result.b() = std::min(src.b(), dest.b()); result.a() = std::min(src.a(), dest.a()); break; - case FramebufferRegs::BlendEquation::Max: result.r() = std::max(src.r(), dest.r()); result.g() = std::max(src.g(), dest.g()); result.b() = std::max(src.b(), dest.b()); result.a() = std::max(src.a(), dest.a()); break; - default: LOG_CRITICAL(HW_GPU, "Unknown RGB blend equation 0x{:x}", equation); UNIMPLEMENTED(); @@ -309,103 +342,38 @@ u8 LogicOp(u8 src, u8 dest, FramebufferRegs::LogicOp op) { switch (op) { case FramebufferRegs::LogicOp::Clear: return 0; - case FramebufferRegs::LogicOp::And: return src & dest; - case FramebufferRegs::LogicOp::AndReverse: return src & ~dest; - case FramebufferRegs::LogicOp::Copy: return src; - case FramebufferRegs::LogicOp::Set: return 255; - case FramebufferRegs::LogicOp::CopyInverted: return ~src; - case FramebufferRegs::LogicOp::NoOp: return dest; - case FramebufferRegs::LogicOp::Invert: return ~dest; - case FramebufferRegs::LogicOp::Nand: return ~(src & dest); - case FramebufferRegs::LogicOp::Or: return src | dest; - case FramebufferRegs::LogicOp::Nor: return ~(src | dest); - case FramebufferRegs::LogicOp::Xor: return src ^ dest; - case FramebufferRegs::LogicOp::Equiv: return ~(src ^ dest); - case FramebufferRegs::LogicOp::AndInverted: return ~src & dest; - case FramebufferRegs::LogicOp::OrReverse: return src | ~dest; - case FramebufferRegs::LogicOp::OrInverted: return ~src | dest; } - UNREACHABLE(); }; -// Decode/Encode for shadow map format. It is similar to D24S8 format, but the depth field is in -// big-endian -static const Common::Vec2 DecodeD24S8Shadow(const u8* bytes) { - return {static_cast((bytes[0] << 16) | (bytes[1] << 8) | bytes[2]), bytes[3]}; -} - -static void EncodeD24X8Shadow(u32 depth, u8* bytes) { - bytes[2] = depth & 0xFF; - bytes[1] = (depth >> 8) & 0xFF; - bytes[0] = (depth >> 16) & 0xFF; -} - -static void EncodeX24S8Shadow(u8 stencil, u8* bytes) { - bytes[3] = stencil; -} - -void DrawShadowMapPixel(int x, int y, u32 depth, u8 stencil) { - const auto& framebuffer = g_state.regs.framebuffer.framebuffer; - const auto& shadow = g_state.regs.framebuffer.shadow; - const PAddr addr = framebuffer.GetColorBufferPhysicalAddress(); - - y = framebuffer.height - y; - - const u32 coarse_y = y & ~7; - u32 bytes_per_pixel = 4; - u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + - coarse_y * framebuffer.width * bytes_per_pixel; - u8* dst_pixel = VideoCore::g_memory->GetPhysicalPointer(addr) + dst_offset; - - auto ref = DecodeD24S8Shadow(dst_pixel); - u32 ref_z = ref.x; - u32 ref_s = ref.y; - - if (depth < ref_z) { - if (stencil == 0) { - EncodeD24X8Shadow(depth, dst_pixel); - } else { - float16 constant = float16::FromRaw(shadow.constant); - float16 linear = float16::FromRaw(shadow.linear); - float16 x_ = float16::FromFloat32(static_cast(depth) / ref_z); - float16 stencil_new = float16::FromFloat32(stencil) / (constant + linear * x_); - stencil = static_cast(std::clamp(stencil_new.ToFloat32(), 0.0f, 255.0f)); - - if (stencil < ref_s) - EncodeX24S8Shadow(stencil, dst_pixel); - } - } -} - -} // namespace Pica::Rasterizer +} // namespace SwRenderer diff --git a/src/video_core/renderer_software/sw_framebuffer.h b/src/video_core/renderer_software/sw_framebuffer.h index 51406ef89..db0cb66d5 100644 --- a/src/video_core/renderer_software/sw_framebuffer.h +++ b/src/video_core/renderer_software/sw_framebuffer.h @@ -8,24 +8,55 @@ #include "common/vector_math.h" #include "video_core/regs_framebuffer.h" -namespace Pica::Rasterizer { +namespace Memory { +class MemorySystem; +} -void DrawPixel(int x, int y, const Common::Vec4& color); -const Common::Vec4 GetPixel(int x, int y); -u32 GetDepth(int x, int y); -u8 GetStencil(int x, int y); -void SetDepth(int x, int y, u32 value); -void SetStencil(int x, int y, u8 value); -u8 PerformStencilAction(FramebufferRegs::StencilAction action, u8 old_stencil, u8 ref); +namespace Pica { +struct FramebufferRegs; +} + +namespace SwRenderer { + +class Framebuffer { +public: + explicit Framebuffer(Memory::MemorySystem& memory, const Pica::FramebufferRegs& framebuffer); + ~Framebuffer(); + + /// Draws a pixel at the specified coordinates. + void DrawPixel(int x, int y, const Common::Vec4& color) const; + + /// Returns the current color at the specified coordinates. + [[nodiscard]] const Common::Vec4 GetPixel(int x, int y) const; + + /// Returns the depth value at the specified coordinates. + [[nodiscard]] u32 GetDepth(int x, int y) const; + + /// Returns the stencil value at the specified coordinates. + [[nodiscard]] u8 GetStencil(int x, int y) const; + + /// Stores the provided depth value at the specified coordinates. + void SetDepth(int x, int y, u32 value) const; + + /// Stores the provided stencil value at the specified coordinates. + void SetStencil(int x, int y, u8 value) const; + + /// Draws a pixel to the shadow buffer. + void DrawShadowMapPixel(int x, int y, u32 depth, u8 stencil) const; + +private: + Memory::MemorySystem& memory; + const Pica::FramebufferRegs& regs; +}; + +u8 PerformStencilAction(Pica::FramebufferRegs::StencilAction action, u8 old_stencil, u8 ref); Common::Vec4 EvaluateBlendEquation(const Common::Vec4& src, const Common::Vec4& srcfactor, const Common::Vec4& dest, const Common::Vec4& destfactor, - FramebufferRegs::BlendEquation equation); + Pica::FramebufferRegs::BlendEquation equation); -u8 LogicOp(u8 src, u8 dest, FramebufferRegs::LogicOp op); +u8 LogicOp(u8 src, u8 dest, Pica::FramebufferRegs::LogicOp op); -void DrawShadowMapPixel(int x, int y, u32 depth, u8 stencil); - -} // namespace Pica::Rasterizer +} // namespace SwRenderer diff --git a/src/video_core/renderer_software/sw_lighting.cpp b/src/video_core/renderer_software/sw_lighting.cpp index b5394c65d..5dd8334eb 100644 --- a/src/video_core/renderer_software/sw_lighting.cpp +++ b/src/video_core/renderer_software/sw_lighting.cpp @@ -5,7 +5,10 @@ #include #include "video_core/renderer_software/sw_lighting.h" -namespace Pica { +namespace SwRenderer { + +using Pica::f16; +using Pica::LightingRegs; static float LookupLightingLut(const Pica::State::Lighting& lighting, std::size_t lut_index, u8 index, float delta) { @@ -14,18 +17,18 @@ static float LookupLightingLut(const Pica::State::Lighting& lighting, std::size_ const auto& lut = lighting.luts[lut_index][index]; - float lut_value = lut.ToFloat(); - float lut_diff = lut.DiffToFloat(); + const float lut_value = lut.ToFloat(); + const float lut_diff = lut.DiffToFloat(); return lut_value + lut_diff * delta; } -std::tuple, Common::Vec4> ComputeFragmentsColors( +std::pair, Common::Vec4> ComputeFragmentsColors( const Pica::LightingRegs& lighting, const Pica::State::Lighting& lighting_state, - const Common::Quaternion& normquat, const Common::Vec3& view, - const Common::Vec4 (&texture_color)[4]) { + const Common::Quaternion& normquat, const Common::Vec3f& view, + std::span, 4> texture_color) { - Common::Vec4 shadow; + Common::Vec4f shadow; if (lighting.config0.enable_shadow) { shadow = texture_color[lighting.config0.shadow_selector].Cast() / 255.0f; if (lighting.config0.shadow_invert) { @@ -35,16 +38,16 @@ std::tuple, Common::Vec4> ComputeFragmentsColors( shadow = Common::MakeVec(1.0f, 1.0f, 1.0f, 1.0f); } - Common::Vec3 surface_normal{}; - Common::Vec3 surface_tangent{}; + Common::Vec3f surface_normal{}; + Common::Vec3f surface_tangent{}; if (lighting.config0.bump_mode != LightingRegs::LightingBumpMode::None) { - Common::Vec3 perturbation = + Common::Vec3f perturbation = texture_color[lighting.config0.bump_selector].xyz().Cast() / 127.5f - Common::MakeVec(1.0f, 1.0f, 1.0f); if (lighting.config0.bump_mode == LightingRegs::LightingBumpMode::NormalMap) { if (!lighting.config0.disable_bump_renorm) { - const float z_square = 1 - perturbation.xy().Length2(); + const f32 z_square = 1 - perturbation.xy().Length2(); perturbation.z = std::sqrt(std::max(z_square, 0.0f)); } surface_normal = perturbation; @@ -65,66 +68,64 @@ std::tuple, Common::Vec4> ComputeFragmentsColors( auto normal = Common::QuaternionRotate(normquat, surface_normal); auto tangent = Common::QuaternionRotate(normquat, surface_tangent); - Common::Vec4 diffuse_sum = {0.0f, 0.0f, 0.0f, 1.0f}; - Common::Vec4 specular_sum = {0.0f, 0.0f, 0.0f, 1.0f}; + Common::Vec4f diffuse_sum = {0.0f, 0.0f, 0.0f, 1.0f}; + Common::Vec4f specular_sum = {0.0f, 0.0f, 0.0f, 1.0f}; - for (unsigned light_index = 0; light_index <= lighting.max_light_index; ++light_index) { - unsigned num = lighting.light_enable.GetNum(light_index); + for (u32 light_index = 0; light_index <= lighting.max_light_index; ++light_index) { + u32 num = lighting.light_enable.GetNum(light_index); const auto& light_config = lighting.light[num]; - Common::Vec3 refl_value = {}; - Common::Vec3 position = {float16::FromRaw(light_config.x).ToFloat32(), - float16::FromRaw(light_config.y).ToFloat32(), - float16::FromRaw(light_config.z).ToFloat32()}; - Common::Vec3 light_vector; + const Common::Vec3f position = {f16::FromRaw(light_config.x).ToFloat32(), + f16::FromRaw(light_config.y).ToFloat32(), + f16::FromRaw(light_config.z).ToFloat32()}; + Common::Vec3f refl_value{}; + Common::Vec3f light_vector{}; - if (light_config.config.directional) + if (light_config.config.directional) { light_vector = position; - else + } else { light_vector = position + view; + } - [[maybe_unused]] float length = light_vector.Normalize(); + [[maybe_unused]] const f32 length = light_vector.Normalize(); - Common::Vec3 norm_view = view.Normalized(); - Common::Vec3 half_vector = norm_view + light_vector; + Common::Vec3f norm_view = view.Normalized(); + Common::Vec3f half_vector = norm_view + light_vector; - float dist_atten = 1.0f; + f32 dist_atten = 1.0f; if (!lighting.IsDistAttenDisabled(num)) { - float scale = Pica::float20::FromRaw(light_config.dist_atten_scale).ToFloat32(); - float bias = Pica::float20::FromRaw(light_config.dist_atten_bias).ToFloat32(); - std::size_t lut = + const f32 scale = Pica::f20::FromRaw(light_config.dist_atten_scale).ToFloat32(); + const f32 bias = Pica::f20::FromRaw(light_config.dist_atten_bias).ToFloat32(); + const std::size_t lut = static_cast(LightingRegs::LightingSampler::DistanceAttenuation) + num; - float sample_loc = std::clamp(scale * length + bias, 0.0f, 1.0f); + const f32 sample_loc = std::clamp(scale * length + bias, 0.0f, 1.0f); - u8 lutindex = + const u8 lutindex = static_cast(std::clamp(std::floor(sample_loc * 256.0f), 0.0f, 255.0f)); - float delta = sample_loc * 256 - lutindex; + const f32 delta = sample_loc * 256 - lutindex; + dist_atten = LookupLightingLut(lighting_state, lut, lutindex, delta); } - auto GetLutValue = [&](LightingRegs::LightingLutInput input, bool abs, - LightingRegs::LightingScale scale_enum, - LightingRegs::LightingSampler sampler) { - float result = 0.0f; + auto get_lut_value = [&](LightingRegs::LightingLutInput input, bool abs, + LightingRegs::LightingScale scale_enum, + LightingRegs::LightingSampler sampler) { + f32 result = 0.0f; switch (input) { case LightingRegs::LightingLutInput::NH: result = Common::Dot(normal, half_vector.Normalized()); break; - case LightingRegs::LightingLutInput::VH: result = Common::Dot(norm_view, half_vector.Normalized()); break; - case LightingRegs::LightingLutInput::NV: result = Common::Dot(normal, norm_view); break; - case LightingRegs::LightingLutInput::LN: result = Common::Dot(light_vector, normal); break; - case LightingRegs::LightingLutInput::SP: { Common::Vec3 spot_dir{light_config.spot_x.Value(), light_config.spot_y.Value(), light_config.spot_z.Value()}; @@ -133,8 +134,8 @@ std::tuple, Common::Vec4> ComputeFragmentsColors( } case LightingRegs::LightingLutInput::CP: if (lighting.config0.config == LightingRegs::LightingConfig::Config7) { - const Common::Vec3 norm_half_vector = half_vector.Normalized(); - const Common::Vec3 half_vector_proj = + const Common::Vec3f norm_half_vector = half_vector.Normalized(); + const Common::Vec3f half_vector_proj = norm_half_vector - normal * Common::Dot(normal, norm_half_vector); result = Common::Dot(half_vector_proj, tangent); } else { @@ -148,58 +149,60 @@ std::tuple, Common::Vec4> ComputeFragmentsColors( } u8 index; - float delta; + f32 delta; if (abs) { - if (light_config.config.two_sided_diffuse) + if (light_config.config.two_sided_diffuse) { result = std::abs(result); - else + } else { result = std::max(result, 0.0f); + } - float flr = std::floor(result * 256.0f); + const f32 flr = std::floor(result * 256.0f); index = static_cast(std::clamp(flr, 0.0f, 255.0f)); delta = result * 256 - index; } else { - float flr = std::floor(result * 128.0f); - s8 signed_index = static_cast(std::clamp(flr, -128.0f, 127.0f)); + const f32 flr = std::floor(result * 128.0f); + const s8 signed_index = static_cast(std::clamp(flr, -128.0f, 127.0f)); delta = result * 128.0f - signed_index; index = static_cast(signed_index); } - float scale = lighting.lut_scale.GetScale(scale_enum); + const f32 scale = lighting.lut_scale.GetScale(scale_enum); return scale * LookupLightingLut(lighting_state, static_cast(sampler), index, delta); }; // If enabled, compute spot light attenuation value - float spot_atten = 1.0f; + f32 spot_atten = 1.0f; if (!lighting.IsSpotAttenDisabled(num) && LightingRegs::IsLightingSamplerSupported( lighting.config0.config, LightingRegs::LightingSampler::SpotlightAttenuation)) { auto lut = LightingRegs::SpotlightAttenuationSampler(num); - spot_atten = GetLutValue(lighting.lut_input.sp, lighting.abs_lut_input.disable_sp == 0, - lighting.lut_scale.sp, lut); + spot_atten = + get_lut_value(lighting.lut_input.sp, lighting.abs_lut_input.disable_sp == 0, + lighting.lut_scale.sp, lut); } // Specular 0 component - float d0_lut_value = 1.0f; + f32 d0_lut_value = 1.0f; if (lighting.config1.disable_lut_d0 == 0 && LightingRegs::IsLightingSamplerSupported( lighting.config0.config, LightingRegs::LightingSampler::Distribution0)) { d0_lut_value = - GetLutValue(lighting.lut_input.d0, lighting.abs_lut_input.disable_d0 == 0, - lighting.lut_scale.d0, LightingRegs::LightingSampler::Distribution0); + get_lut_value(lighting.lut_input.d0, lighting.abs_lut_input.disable_d0 == 0, + lighting.lut_scale.d0, LightingRegs::LightingSampler::Distribution0); } - Common::Vec3 specular_0 = d0_lut_value * light_config.specular_0.ToVec3f(); + Common::Vec3f specular_0 = d0_lut_value * light_config.specular_0.ToVec3f(); // If enabled, lookup ReflectRed value, otherwise, 1.0 is used if (lighting.config1.disable_lut_rr == 0 && LightingRegs::IsLightingSamplerSupported(lighting.config0.config, LightingRegs::LightingSampler::ReflectRed)) { refl_value.x = - GetLutValue(lighting.lut_input.rr, lighting.abs_lut_input.disable_rr == 0, - lighting.lut_scale.rr, LightingRegs::LightingSampler::ReflectRed); + get_lut_value(lighting.lut_input.rr, lighting.abs_lut_input.disable_rr == 0, + lighting.lut_scale.rr, LightingRegs::LightingSampler::ReflectRed); } else { refl_value.x = 1.0f; } @@ -209,8 +212,8 @@ std::tuple, Common::Vec4> ComputeFragmentsColors( LightingRegs::IsLightingSamplerSupported(lighting.config0.config, LightingRegs::LightingSampler::ReflectGreen)) { refl_value.y = - GetLutValue(lighting.lut_input.rg, lighting.abs_lut_input.disable_rg == 0, - lighting.lut_scale.rg, LightingRegs::LightingSampler::ReflectGreen); + get_lut_value(lighting.lut_input.rg, lighting.abs_lut_input.disable_rg == 0, + lighting.lut_scale.rg, LightingRegs::LightingSampler::ReflectGreen); } else { refl_value.y = refl_value.x; } @@ -220,24 +223,23 @@ std::tuple, Common::Vec4> ComputeFragmentsColors( LightingRegs::IsLightingSamplerSupported(lighting.config0.config, LightingRegs::LightingSampler::ReflectBlue)) { refl_value.z = - GetLutValue(lighting.lut_input.rb, lighting.abs_lut_input.disable_rb == 0, - lighting.lut_scale.rb, LightingRegs::LightingSampler::ReflectBlue); + get_lut_value(lighting.lut_input.rb, lighting.abs_lut_input.disable_rb == 0, + lighting.lut_scale.rb, LightingRegs::LightingSampler::ReflectBlue); } else { refl_value.z = refl_value.x; } // Specular 1 component - float d1_lut_value = 1.0f; + f32 d1_lut_value = 1.0f; if (lighting.config1.disable_lut_d1 == 0 && LightingRegs::IsLightingSamplerSupported( lighting.config0.config, LightingRegs::LightingSampler::Distribution1)) { d1_lut_value = - GetLutValue(lighting.lut_input.d1, lighting.abs_lut_input.disable_d1 == 0, - lighting.lut_scale.d1, LightingRegs::LightingSampler::Distribution1); + get_lut_value(lighting.lut_input.d1, lighting.abs_lut_input.disable_d1 == 0, + lighting.lut_scale.d1, LightingRegs::LightingSampler::Distribution1); } - Common::Vec3 specular_1 = - d1_lut_value * refl_value * light_config.specular_1.ToVec3f(); + Common::Vec3f specular_1 = d1_lut_value * refl_value * light_config.specular_1.ToVec3f(); // Fresnel // Note: only the last entry in the light slots applies the Fresnel factor @@ -245,9 +247,9 @@ std::tuple, Common::Vec4> ComputeFragmentsColors( LightingRegs::IsLightingSamplerSupported(lighting.config0.config, LightingRegs::LightingSampler::Fresnel)) { - float lut_value = - GetLutValue(lighting.lut_input.fr, lighting.abs_lut_input.disable_fr == 0, - lighting.lut_scale.fr, LightingRegs::LightingSampler::Fresnel); + const f32 lut_value = + get_lut_value(lighting.lut_input.fr, lighting.abs_lut_input.disable_fr == 0, + lighting.lut_scale.fr, LightingRegs::LightingSampler::Fresnel); // Enabled for diffuse lighting alpha component if (lighting.config0.enable_primary_alpha) { @@ -261,18 +263,19 @@ std::tuple, Common::Vec4> ComputeFragmentsColors( } auto dot_product = Common::Dot(light_vector, normal); - if (light_config.config.two_sided_diffuse) + if (light_config.config.two_sided_diffuse) { dot_product = std::abs(dot_product); - else + } else { dot_product = std::max(dot_product, 0.0f); + } - float clamp_highlights = 1.0f; + f32 clamp_highlights = 1.0f; if (lighting.config0.clamp_highlights) { clamp_highlights = dot_product == 0.0f ? 0.0f : 1.0f; } if (light_config.config.geometric_factor_0 || light_config.config.geometric_factor_1) { - float geo_factor = half_vector.Length2(); + f32 geo_factor = half_vector.Length2(); geo_factor = geo_factor == 0.0f ? 0.0f : std::min(dot_product / geo_factor, 1.0f); if (light_config.config.geometric_factor_0) { specular_0 *= geo_factor; @@ -315,17 +318,17 @@ std::tuple, Common::Vec4> ComputeFragmentsColors( diffuse_sum += Common::MakeVec(lighting.global_ambient.ToVec3f(), 0.0f); - auto diffuse = Common::MakeVec(std::clamp(diffuse_sum.x, 0.0f, 1.0f) * 255, - std::clamp(diffuse_sum.y, 0.0f, 1.0f) * 255, - std::clamp(diffuse_sum.z, 0.0f, 1.0f) * 255, - std::clamp(diffuse_sum.w, 0.0f, 1.0f) * 255) - .Cast(); - auto specular = Common::MakeVec(std::clamp(specular_sum.x, 0.0f, 1.0f) * 255, - std::clamp(specular_sum.y, 0.0f, 1.0f) * 255, - std::clamp(specular_sum.z, 0.0f, 1.0f) * 255, - std::clamp(specular_sum.w, 0.0f, 1.0f) * 255) - .Cast(); - return std::make_tuple(diffuse, specular); + const auto diffuse = Common::MakeVec(std::clamp(diffuse_sum.x, 0.0f, 1.0f) * 255, + std::clamp(diffuse_sum.y, 0.0f, 1.0f) * 255, + std::clamp(diffuse_sum.z, 0.0f, 1.0f) * 255, + std::clamp(diffuse_sum.w, 0.0f, 1.0f) * 255) + .Cast(); + const auto specular = Common::MakeVec(std::clamp(specular_sum.x, 0.0f, 1.0f) * 255, + std::clamp(specular_sum.y, 0.0f, 1.0f) * 255, + std::clamp(specular_sum.z, 0.0f, 1.0f) * 255, + std::clamp(specular_sum.w, 0.0f, 1.0f) * 255) + .Cast(); + return std::make_pair(diffuse, specular); } -} // namespace Pica +} // namespace SwRenderer diff --git a/src/video_core/renderer_software/sw_lighting.h b/src/video_core/renderer_software/sw_lighting.h index 49624761a..98a8d3235 100644 --- a/src/video_core/renderer_software/sw_lighting.h +++ b/src/video_core/renderer_software/sw_lighting.h @@ -4,16 +4,18 @@ #pragma once -#include +#include +#include + #include "common/quaternion.h" #include "common/vector_math.h" #include "video_core/pica_state.h" -namespace Pica { +namespace SwRenderer { -std::tuple, Common::Vec4> ComputeFragmentsColors( +std::pair, Common::Vec4> ComputeFragmentsColors( const Pica::LightingRegs& lighting, const Pica::State::Lighting& lighting_state, - const Common::Quaternion& normquat, const Common::Vec3& view, - const Common::Vec4 (&texture_color)[4]); + const Common::Quaternion& normquat, const Common::Vec3f& view, + std::span, 4> texture_color); -} // namespace Pica +} // namespace SwRenderer diff --git a/src/video_core/renderer_software/sw_proctex.cpp b/src/video_core/renderer_software/sw_proctex.cpp index a3a4a7993..855208e57 100644 --- a/src/video_core/renderer_software/sw_proctex.cpp +++ b/src/video_core/renderer_software/sw_proctex.cpp @@ -4,17 +4,18 @@ #include #include -#include "common/math_util.h" #include "video_core/renderer_software/sw_proctex.h" -namespace Pica::Rasterizer { +namespace SwRenderer { -using ProcTexClamp = TexturingRegs::ProcTexClamp; -using ProcTexShift = TexturingRegs::ProcTexShift; -using ProcTexCombiner = TexturingRegs::ProcTexCombiner; -using ProcTexFilter = TexturingRegs::ProcTexFilter; +namespace { +using ProcTexClamp = Pica::TexturingRegs::ProcTexClamp; +using ProcTexShift = Pica::TexturingRegs::ProcTexShift; +using ProcTexCombiner = Pica::TexturingRegs::ProcTexCombiner; +using ProcTexFilter = Pica::TexturingRegs::ProcTexFilter; +using Pica::f16; -static float LookupLUT(const std::array& lut, float coord) { +float LookupLUT(const std::array& lut, float coord) { // For NoiseLUT/ColorMap/AlphaMap, coord=0.0 is lut[0], coord=127.0/128.0 is lut[127] and // coord=1.0 is lut[127]+lut_diff[127]. For other indices, the result is interpolated using // value entries and difference entries. @@ -26,13 +27,13 @@ static float LookupLUT(const std::array& lut, f // These function are used to generate random noise for procedural texture. Their results are // verified against real hardware, but it's not known if the algorithm is the same as hardware. -static unsigned int NoiseRand1D(unsigned int v) { +unsigned int NoiseRand1D(unsigned int v) { static constexpr std::array table{ {0, 4, 10, 8, 4, 9, 7, 12, 5, 15, 13, 14, 11, 15, 2, 11}}; return ((v % 9 + 2) * 3 & 0xF) ^ table[(v / 9) & 0xF]; } -static float NoiseRand2D(unsigned int x, unsigned int y) { +float NoiseRand2D(unsigned int x, unsigned int y) { static constexpr std::array table{ {10, 2, 15, 8, 0, 7, 4, 5, 5, 13, 2, 6, 13, 9, 3, 14}}; unsigned int u2 = NoiseRand1D(x); @@ -45,11 +46,12 @@ static float NoiseRand2D(unsigned int x, unsigned int y) { return -1.0f + v2 * 2.0f / 15.0f; } -static float NoiseCoef(float u, float v, const TexturingRegs& regs, const State::ProcTex& state) { - const float freq_u = float16::FromRaw(regs.proctex_noise_frequency.u).ToFloat32(); - const float freq_v = float16::FromRaw(regs.proctex_noise_frequency.v).ToFloat32(); - const float phase_u = float16::FromRaw(regs.proctex_noise_u.phase).ToFloat32(); - const float phase_v = float16::FromRaw(regs.proctex_noise_v.phase).ToFloat32(); +float NoiseCoef(float u, float v, const Pica::TexturingRegs& regs, + const Pica::State::ProcTex& state) { + const float freq_u = f16::FromRaw(regs.proctex_noise_frequency.u).ToFloat32(); + const float freq_v = f16::FromRaw(regs.proctex_noise_frequency.v).ToFloat32(); + const float phase_u = f16::FromRaw(regs.proctex_noise_u.phase).ToFloat32(); + const float phase_v = f16::FromRaw(regs.proctex_noise_v.phase).ToFloat32(); const float x = 9 * freq_u * std::abs(u + phase_u); const float y = 9 * freq_v * std::abs(v + phase_v); const int x_int = static_cast(x); @@ -66,7 +68,7 @@ static float NoiseCoef(float u, float v, const TexturingRegs& regs, const State: return Common::BilinearInterp(g0, g1, g2, g3, x_noise, y_noise); } -static float GetShiftOffset(float v, ProcTexShift mode, ProcTexClamp clamp_mode) { +float GetShiftOffset(float v, ProcTexShift mode, ProcTexClamp clamp_mode) { const float offset = (clamp_mode == ProcTexClamp::MirroredRepeat) ? 1 : 0.5f; switch (mode) { case ProcTexShift::None: @@ -81,7 +83,7 @@ static float GetShiftOffset(float v, ProcTexShift mode, ProcTexClamp clamp_mode) } }; -static void ClampCoord(float& coord, ProcTexClamp mode) { +void ClampCoord(float& coord, ProcTexClamp mode) { switch (mode) { case ProcTexClamp::ToZero: if (coord > 1.0f) @@ -112,8 +114,8 @@ static void ClampCoord(float& coord, ProcTexClamp mode) { } } -static float CombineAndMap(float u, float v, ProcTexCombiner combiner, - const std::array& map_table) { +float CombineAndMap(float u, float v, ProcTexCombiner combiner, + const std::array& map_table) { float f; switch (combiner) { case ProcTexCombiner::U: @@ -122,28 +124,28 @@ static float CombineAndMap(float u, float v, ProcTexCombiner combiner, case ProcTexCombiner::U2: f = u * u; break; - case TexturingRegs::ProcTexCombiner::V: + case ProcTexCombiner::V: f = v; break; - case TexturingRegs::ProcTexCombiner::V2: + case ProcTexCombiner::V2: f = v * v; break; - case TexturingRegs::ProcTexCombiner::Add: + case ProcTexCombiner::Add: f = (u + v) * 0.5f; break; - case TexturingRegs::ProcTexCombiner::Add2: + case ProcTexCombiner::Add2: f = (u * u + v * v) * 0.5f; break; - case TexturingRegs::ProcTexCombiner::SqrtAdd2: + case ProcTexCombiner::SqrtAdd2: f = std::min(std::sqrt(u * u + v * v), 1.0f); break; - case TexturingRegs::ProcTexCombiner::Min: + case ProcTexCombiner::Min: f = std::min(u, v); break; - case TexturingRegs::ProcTexCombiner::Max: + case ProcTexCombiner::Max: f = std::max(u, v); break; - case TexturingRegs::ProcTexCombiner::RMax: + case ProcTexCombiner::RMax: f = std::min(((u + v) * 0.5f + std::sqrt(u * u + v * v)) * 0.5f, 1.0f); break; default: @@ -153,8 +155,10 @@ static float CombineAndMap(float u, float v, ProcTexCombiner combiner, } return LookupLUT(map_table, f); } +} // Anonymous namespace -Common::Vec4 ProcTex(float u, float v, const TexturingRegs& regs, const State::ProcTex& state) { +Common::Vec4 ProcTex(float u, float v, const Pica::TexturingRegs& regs, + const Pica::State::ProcTex& state) { u = std::abs(u); v = std::abs(v); @@ -218,4 +222,4 @@ Common::Vec4 ProcTex(float u, float v, const TexturingRegs& regs, const Stat } } -} // namespace Pica::Rasterizer +} // namespace SwRenderer diff --git a/src/video_core/renderer_software/sw_proctex.h b/src/video_core/renderer_software/sw_proctex.h index aead372f5..69836397f 100644 --- a/src/video_core/renderer_software/sw_proctex.h +++ b/src/video_core/renderer_software/sw_proctex.h @@ -8,9 +8,10 @@ #include "common/vector_math.h" #include "video_core/pica_state.h" -namespace Pica::Rasterizer { +namespace SwRenderer { /// Generates procedural texture color for the given coordinates -Common::Vec4 ProcTex(float u, float v, const TexturingRegs& regs, const State::ProcTex& state); +Common::Vec4 ProcTex(float u, float v, const Pica::TexturingRegs& regs, + const Pica::State::ProcTex& state); -} // namespace Pica::Rasterizer +} // namespace SwRenderer diff --git a/src/video_core/renderer_software/sw_rasterizer.cpp b/src/video_core/renderer_software/sw_rasterizer.cpp index cba79273c..7509aa29e 100644 --- a/src/video_core/renderer_software/sw_rasterizer.cpp +++ b/src/video_core/renderer_software/sw_rasterizer.cpp @@ -2,15 +2,937 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include "video_core/renderer_software/sw_clipper.h" +#include +#include "common/logging/log.h" +#include "common/microprofile.h" +#include "common/quaternion.h" +#include "common/vector_math.h" +#include "core/memory.h" +#include "video_core/pica_state.h" +#include "video_core/pica_types.h" +#include "video_core/renderer_software/sw_framebuffer.h" +#include "video_core/renderer_software/sw_lighting.h" +#include "video_core/renderer_software/sw_proctex.h" #include "video_core/renderer_software/sw_rasterizer.h" +#include "video_core/renderer_software/sw_texturing.h" +#include "video_core/shader/shader.h" +#include "video_core/texture/texture_decode.h" -namespace VideoCore { +namespace SwRenderer { + +using Pica::f24; +using Pica::FramebufferRegs; +using Pica::RasterizerRegs; +using Pica::TexturingRegs; +using Pica::Texture::LookupTexture; +using Pica::Texture::TextureInfo; + +struct Vertex : Pica::Shader::OutputVertex { + Vertex(const OutputVertex& v) : OutputVertex(v) {} + + /// Attributes used to store intermediate results position after perspective divide. + Common::Vec3 screenpos; + + /** + * Linear interpolation + * factor: 0=this, 1=vtx + * Note: This function cannot be called after perspective divide. + **/ + void Lerp(f24 factor, const Vertex& vtx) { + pos = pos * factor + vtx.pos * (f24::One() - factor); + quat = quat * factor + vtx.quat * (f24::One() - factor); + color = color * factor + vtx.color * (f24::One() - factor); + tc0 = tc0 * factor + vtx.tc0 * (f24::One() - factor); + tc1 = tc1 * factor + vtx.tc1 * (f24::One() - factor); + tc0_w = tc0_w * factor + vtx.tc0_w * (f24::One() - factor); + view = view * factor + vtx.view * (f24::One() - factor); + tc2 = tc2 * factor + vtx.tc2 * (f24::One() - factor); + } + + /** + * Linear interpolation + * factor: 0=v0, 1=v1 + * Note: This function cannot be called after perspective divide. + **/ + static Vertex Lerp(f24 factor, const Vertex& v0, const Vertex& v1) { + Vertex ret = v0; + ret.Lerp(factor, v1); + return ret; + } +}; + +namespace { + +MICROPROFILE_DEFINE(GPU_Rasterization, "GPU", "Rasterization", MP_RGB(50, 50, 240)); + +struct ClippingEdge { +public: + constexpr ClippingEdge(Common::Vec4 coeffs, + Common::Vec4 bias = Common::Vec4(f24::Zero(), f24::Zero(), + f24::Zero(), f24::Zero())) + : pos(f24::Zero()), coeffs(coeffs), bias(bias) {} + + bool IsInside(const Vertex& vertex) const { + return Common::Dot(vertex.pos + bias, coeffs) >= f24::Zero(); + } + + bool IsOutSide(const Vertex& vertex) const { + return !IsInside(vertex); + } + + Vertex GetIntersection(const Vertex& v0, const Vertex& v1) const { + const f24 dp = Common::Dot(v0.pos + bias, coeffs); + const f24 dp_prev = Common::Dot(v1.pos + bias, coeffs); + const f24 factor = dp_prev / (dp_prev - dp); + return Vertex::Lerp(factor, v0, v1); + } + +private: + [[maybe_unused]] f24 pos; + Common::Vec4 coeffs; + Common::Vec4 bias; +}; + +} // Anonymous namespace + +RasterizerSoftware::RasterizerSoftware(Memory::MemorySystem& memory_) + : memory{memory_}, state{Pica::g_state}, regs{state.regs}, fb{memory, regs.framebuffer} {} void RasterizerSoftware::AddTriangle(const Pica::Shader::OutputVertex& v0, const Pica::Shader::OutputVertex& v1, const Pica::Shader::OutputVertex& v2) { - Pica::Clipper::ProcessTriangle(v0, v1, v2); + /** + * Clipping a planar n-gon against a plane will remove at least 1 vertex and introduces 2 at + * the new edge (or less in degenerate cases). As such, we can say that each clipping plane + * introduces at most 1 new vertex to the polygon. Since we start with a triangle and have a + * fixed 6 clipping planes, the maximum number of vertices of the clipped polygon is 3 + 6 = 9. + **/ + static constexpr std::size_t MAX_VERTICES = 9; + + boost::container::static_vector buffer_a = {v0, v1, v2}; + boost::container::static_vector buffer_b; + + FlipQuaternionIfOpposite(buffer_a[1].quat, buffer_a[0].quat); + FlipQuaternionIfOpposite(buffer_a[2].quat, buffer_a[0].quat); + + auto* output_list = &buffer_a; + auto* input_list = &buffer_b; + + // NOTE: We clip against a w=epsilon plane to guarantee that the output has a positive w value. + // TODO: Not sure if this is a valid approach. Also should probably instead use the smallest + // epsilon possible within f24 accuracy. + static constexpr f24 EPSILON = f24::FromFloat32(0.00001f); + static constexpr f24 f0 = f24::Zero(); + static constexpr f24 f1 = f24::One(); + static constexpr std::array clipping_edges = {{ + {Common::MakeVec(-f1, f0, f0, f1)}, // x = +w + {Common::MakeVec(f1, f0, f0, f1)}, // x = -w + {Common::MakeVec(f0, -f1, f0, f1)}, // y = +w + {Common::MakeVec(f0, f1, f0, f1)}, // y = -w + {Common::MakeVec(f0, f0, -f1, f0)}, // z = 0 + {Common::MakeVec(f0, f0, f1, f1)}, // z = -w + {Common::MakeVec(f0, f0, f0, f1), Common::Vec4(f0, f0, f0, EPSILON)}, // w = EPSILON + }}; + + // Simple implementation of the Sutherland-Hodgman clipping algorithm. + // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here) + const auto clip = [&](const ClippingEdge& edge) { + std::swap(input_list, output_list); + output_list->clear(); + + const Vertex* reference_vertex = &input_list->back(); + for (const auto& vertex : *input_list) { + // NOTE: This algorithm changes vertex order in some cases! + if (edge.IsInside(vertex)) { + if (edge.IsOutSide(*reference_vertex)) { + output_list->push_back(edge.GetIntersection(vertex, *reference_vertex)); + } + output_list->push_back(vertex); + } else if (edge.IsInside(*reference_vertex)) { + output_list->push_back(edge.GetIntersection(vertex, *reference_vertex)); + } + reference_vertex = &vertex; + } + }; + + for (const ClippingEdge& edge : clipping_edges) { + clip(edge); + if (output_list->size() < 3) { + return; + } + } + + if (state.regs.rasterizer.clip_enable) { + const ClippingEdge custom_edge{state.regs.rasterizer.GetClipCoef()}; + clip(custom_edge); + if (output_list->size() < 3) { + return; + } + } + + MakeScreenCoords((*output_list)[0]); + MakeScreenCoords((*output_list)[1]); + + for (std::size_t i = 0; i < output_list->size() - 2; i++) { + Vertex& vtx0 = (*output_list)[0]; + Vertex& vtx1 = (*output_list)[i + 1]; + Vertex& vtx2 = (*output_list)[i + 2]; + + MakeScreenCoords(vtx2); + + LOG_TRACE( + Render_Software, + "Triangle {}/{} at position ({:.3}, {:.3}, {:.3}, {:.3f}), " + "({:.3}, {:.3}, {:.3}, {:.3}), ({:.3}, {:.3}, {:.3}, {:.3}) and " + "screen position ({:.2}, {:.2}, {:.2}), ({:.2}, {:.2}, {:.2}), ({:.2}, {:.2}, {:.2})", + i + 1, output_list->size() - 2, vtx0.pos.x.ToFloat32(), vtx0.pos.y.ToFloat32(), + vtx0.pos.z.ToFloat32(), vtx0.pos.w.ToFloat32(), vtx1.pos.x.ToFloat32(), + vtx1.pos.y.ToFloat32(), vtx1.pos.z.ToFloat32(), vtx1.pos.w.ToFloat32(), + vtx2.pos.x.ToFloat32(), vtx2.pos.y.ToFloat32(), vtx2.pos.z.ToFloat32(), + vtx2.pos.w.ToFloat32(), vtx0.screenpos.x.ToFloat32(), vtx0.screenpos.y.ToFloat32(), + vtx0.screenpos.z.ToFloat32(), vtx1.screenpos.x.ToFloat32(), + vtx1.screenpos.y.ToFloat32(), vtx1.screenpos.z.ToFloat32(), + vtx2.screenpos.x.ToFloat32(), vtx2.screenpos.y.ToFloat32(), + vtx2.screenpos.z.ToFloat32()); + + ProcessTriangle(vtx0, vtx1, vtx2); + } } -} // namespace VideoCore +void RasterizerSoftware::MakeScreenCoords(Vertex& vtx) { + Viewport viewport{}; + viewport.halfsize_x = f24::FromRaw(regs.rasterizer.viewport_size_x); + viewport.halfsize_y = f24::FromRaw(regs.rasterizer.viewport_size_y); + viewport.offset_x = f24::FromFloat32(static_cast(regs.rasterizer.viewport_corner.x)); + viewport.offset_y = f24::FromFloat32(static_cast(regs.rasterizer.viewport_corner.y)); + + f24 inv_w = f24::One() / vtx.pos.w; + vtx.pos.w = inv_w; + vtx.quat *= inv_w; + vtx.color *= inv_w; + vtx.tc0 *= inv_w; + vtx.tc1 *= inv_w; + vtx.tc0_w *= inv_w; + vtx.view *= inv_w; + vtx.tc2 *= inv_w; + + vtx.screenpos[0] = (vtx.pos.x * inv_w + f24::One()) * viewport.halfsize_x + viewport.offset_x; + vtx.screenpos[1] = (vtx.pos.y * inv_w + f24::One()) * viewport.halfsize_y + viewport.offset_y; + vtx.screenpos[2] = vtx.pos.z * inv_w; +} + +void RasterizerSoftware::ProcessTriangle(const Vertex& v0, const Vertex& v1, const Vertex& v2, + bool reversed) { + MICROPROFILE_SCOPE(GPU_Rasterization); + + // Vertex positions in rasterizer coordinates + static auto screen_to_rasterizer_coords = [](const Common::Vec3& vec) { + return Common::Vec3{Fix12P4::FromFloat24(vec.x), Fix12P4::FromFloat24(vec.y), + Fix12P4::FromFloat24(vec.z)}; + }; + + const std::array, 3> vtxpos = { + screen_to_rasterizer_coords(v0.screenpos), + screen_to_rasterizer_coords(v1.screenpos), + screen_to_rasterizer_coords(v2.screenpos), + }; + + if (regs.rasterizer.cull_mode == RasterizerRegs::CullMode::KeepAll) { + // Make sure we always end up with a triangle wound counter-clockwise + if (!reversed && SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0) { + ProcessTriangle(v0, v2, v1, true); + return; + } + } else { + if (!reversed && regs.rasterizer.cull_mode == RasterizerRegs::CullMode::KeepClockWise) { + // Reverse vertex order and use the CCW code path. + ProcessTriangle(v0, v2, v1, true); + return; + } + // Cull away triangles which are wound clockwise. + if (SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0) { + return; + } + } + + u16 min_x = std::min({vtxpos[0].x, vtxpos[1].x, vtxpos[2].x}); + u16 min_y = std::min({vtxpos[0].y, vtxpos[1].y, vtxpos[2].y}); + u16 max_x = std::max({vtxpos[0].x, vtxpos[1].x, vtxpos[2].x}); + u16 max_y = std::max({vtxpos[0].y, vtxpos[1].y, vtxpos[2].y}); + + // Convert the scissor box coordinates to 12.4 fixed point + const u16 scissor_x1 = static_cast(regs.rasterizer.scissor_test.x1 << 4); + const u16 scissor_y1 = static_cast(regs.rasterizer.scissor_test.y1 << 4); + // x2,y2 have +1 added to cover the entire sub-pixel area + const u16 scissor_x2 = static_cast((regs.rasterizer.scissor_test.x2 + 1) << 4); + const u16 scissor_y2 = static_cast((regs.rasterizer.scissor_test.y2 + 1) << 4); + + if (regs.rasterizer.scissor_test.mode == RasterizerRegs::ScissorMode::Include) { + // Calculate the new bounds + min_x = std::max(min_x, scissor_x1); + min_y = std::max(min_y, scissor_y1); + max_x = std::min(max_x, scissor_x2); + max_y = std::min(max_y, scissor_y2); + } + + min_x &= Fix12P4::IntMask(); + min_y &= Fix12P4::IntMask(); + max_x = ((max_x + Fix12P4::FracMask()) & Fix12P4::IntMask()); + max_y = ((max_y + Fix12P4::FracMask()) & Fix12P4::IntMask()); + + const int bias0 = + IsRightSideOrFlatBottomEdge(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) ? -1 : 0; + const int bias1 = + IsRightSideOrFlatBottomEdge(vtxpos[1].xy(), vtxpos[2].xy(), vtxpos[0].xy()) ? -1 : 0; + const int bias2 = + IsRightSideOrFlatBottomEdge(vtxpos[2].xy(), vtxpos[0].xy(), vtxpos[1].xy()) ? -1 : 0; + + const auto w_inverse = Common::MakeVec(v0.pos.w, v1.pos.w, v2.pos.w); + + auto textures = regs.texturing.GetTextures(); + const auto tev_stages = regs.texturing.GetTevStages(); + + const bool stencil_action_enable = + regs.framebuffer.output_merger.stencil_test.enable && + regs.framebuffer.framebuffer.depth_format == FramebufferRegs::DepthFormat::D24S8; + const auto stencil_test = regs.framebuffer.output_merger.stencil_test; + + // Enter rasterization loop, starting at the center of the topleft bounding box corner. + // TODO: Not sure if looping through x first might be faster + for (u16 y = min_y + 8; y < max_y; y += 0x10) { + for (u16 x = min_x + 8; x < max_x; x += 0x10) { + // Do not process the pixel if it's inside the scissor box and the scissor mode is set + // to Exclude. + if (regs.rasterizer.scissor_test.mode == RasterizerRegs::ScissorMode::Exclude) { + if (x >= scissor_x1 && x < scissor_x2 && y >= scissor_y1 && y < scissor_y2) { + continue; + } + } + + // Calculate the barycentric coordinates w0, w1 and w2 + const s32 w0 = bias0 + SignedArea(vtxpos[1].xy(), vtxpos[2].xy(), {x, y}); + const s32 w1 = bias1 + SignedArea(vtxpos[2].xy(), vtxpos[0].xy(), {x, y}); + const s32 w2 = bias2 + SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), {x, y}); + const s32 wsum = w0 + w1 + w2; + + // If current pixel is not covered by the current primitive + if (w0 < 0 || w1 < 0 || w2 < 0) { + continue; + } + + const auto baricentric_coordinates = Common::MakeVec( + f24::FromFloat32(static_cast(w0)), f24::FromFloat32(static_cast(w1)), + f24::FromFloat32(static_cast(w2))); + const f24 interpolated_w_inverse = + f24::One() / Common::Dot(w_inverse, baricentric_coordinates); + + // interpolated_z = z / w + const float interpolated_z_over_w = + (v0.screenpos[2].ToFloat32() * w0 + v1.screenpos[2].ToFloat32() * w1 + + v2.screenpos[2].ToFloat32() * w2) / + wsum; + + // Not fully accurate. About 3 bits in precision are missing. + // Z-Buffer (z / w * scale + offset) + const float depth_scale = + f24::FromRaw(regs.rasterizer.viewport_depth_range).ToFloat32(); + const float depth_offset = + f24::FromRaw(regs.rasterizer.viewport_depth_near_plane).ToFloat32(); + float depth = interpolated_z_over_w * depth_scale + depth_offset; + + // Potentially switch to W-Buffer + if (regs.rasterizer.depthmap_enable == + Pica::RasterizerRegs::DepthBuffering::WBuffering) { + // W-Buffer (z * scale + w * offset = (z / w * scale + offset) * w) + depth *= interpolated_w_inverse.ToFloat32() * wsum; + } + + // Clamp the result + depth = std::clamp(depth, 0.0f, 1.0f); + + /** + * Perspective correct attribute interpolation: + * Attribute values cannot be calculated by simple linear interpolation since + * they are not linear in screen space. For example, when interpolating a + * texture coordinate across two vertices, something simple like + * u = (u0*w0 + u1*w1)/(w0+w1) + * will not work. However, the attribute value divided by the + * clipspace w-coordinate (u/w) and and the inverse w-coordinate (1/w) are linear + * in screenspace. Hence, we can linearly interpolate these two independently and + * calculate the interpolated attribute by dividing the results. + * I.e. + * u_over_w = ((u0/v0.pos.w)*w0 + (u1/v1.pos.w)*w1)/(w0+w1) + * one_over_w = (( 1/v0.pos.w)*w0 + ( 1/v1.pos.w)*w1)/(w0+w1) + * u = u_over_w / one_over_w + * + * The generalization to three vertices is straightforward in baricentric coordinates. + **/ + const auto get_interpolated_attribute = [&](f24 attr0, f24 attr1, f24 attr2) { + auto attr_over_w = Common::MakeVec(attr0, attr1, attr2); + f24 interpolated_attr_over_w = Common::Dot(attr_over_w, baricentric_coordinates); + return interpolated_attr_over_w * interpolated_w_inverse; + }; + + const Common::Vec4 primary_color{ + static_cast( + round(get_interpolated_attribute(v0.color.r(), v1.color.r(), v2.color.r()) + .ToFloat32() * + 255)), + static_cast( + round(get_interpolated_attribute(v0.color.g(), v1.color.g(), v2.color.g()) + .ToFloat32() * + 255)), + static_cast( + round(get_interpolated_attribute(v0.color.b(), v1.color.b(), v2.color.b()) + .ToFloat32() * + 255)), + static_cast( + round(get_interpolated_attribute(v0.color.a(), v1.color.a(), v2.color.a()) + .ToFloat32() * + 255)), + }; + + std::array, 3> uv; + uv[0].u() = get_interpolated_attribute(v0.tc0.u(), v1.tc0.u(), v2.tc0.u()); + uv[0].v() = get_interpolated_attribute(v0.tc0.v(), v1.tc0.v(), v2.tc0.v()); + uv[1].u() = get_interpolated_attribute(v0.tc1.u(), v1.tc1.u(), v2.tc1.u()); + uv[1].v() = get_interpolated_attribute(v0.tc1.v(), v1.tc1.v(), v2.tc1.v()); + uv[2].u() = get_interpolated_attribute(v0.tc2.u(), v1.tc2.u(), v2.tc2.u()); + uv[2].v() = get_interpolated_attribute(v0.tc2.v(), v1.tc2.v(), v2.tc2.v()); + + // Sample bound texture units. + const f24 tc0_w = get_interpolated_attribute(v0.tc0_w, v1.tc0_w, v2.tc0_w); + const auto texture_color = TextureColor(uv, textures, tc0_w); + + Common::Vec4 primary_fragment_color{0, 0, 0, 0}; + Common::Vec4 secondary_fragment_color{0, 0, 0, 0}; + if (!regs.lighting.disable) { + const auto normquat = + Common::Quaternion{ + {get_interpolated_attribute(v0.quat.x, v1.quat.x, v2.quat.x).ToFloat32(), + get_interpolated_attribute(v0.quat.y, v1.quat.y, v2.quat.y).ToFloat32(), + get_interpolated_attribute(v0.quat.z, v1.quat.z, v2.quat.z).ToFloat32()}, + get_interpolated_attribute(v0.quat.w, v1.quat.w, v2.quat.w).ToFloat32(), + } + .Normalized(); + + const Common::Vec3f view{ + get_interpolated_attribute(v0.view.x, v1.view.x, v2.view.x).ToFloat32(), + get_interpolated_attribute(v0.view.y, v1.view.y, v2.view.y).ToFloat32(), + get_interpolated_attribute(v0.view.z, v1.view.z, v2.view.z).ToFloat32(), + }; + std::tie(primary_fragment_color, secondary_fragment_color) = ComputeFragmentsColors( + regs.lighting, state.lighting, normquat, view, texture_color); + } + + // Write the TEV stages. + Common::Vec4 combiner_output = + WriteTevConfig(texture_color, tev_stages, primary_color, primary_fragment_color, + secondary_fragment_color); + + const auto& output_merger = regs.framebuffer.output_merger; + if (output_merger.fragment_operation_mode == + FramebufferRegs::FragmentOperationMode::Shadow) { + u32 depth_int = static_cast(depth * 0xFFFFFF); + // Use green color as the shadow intensity + u8 stencil = combiner_output.y; + fb.DrawShadowMapPixel(x >> 4, y >> 4, depth_int, stencil); + // Skip the normal output merger pipeline if it is in shadow mode + continue; + } + + // Does alpha testing happen before or after stencil? + if (!DoAlphaTest(combiner_output.a())) { + continue; + } + WriteFog(combiner_output, depth); + if (!DoDepthStencilTest(x, y, depth, stencil_action_enable)) { + continue; + } + const auto result = PixelColor(x, y, combiner_output); + if (regs.framebuffer.framebuffer.allow_color_write != 0) { + fb.DrawPixel(x >> 4, y >> 4, result); + } + } + } +} + +std::array, 4> RasterizerSoftware::TextureColor( + std::span, 3> uv, + std::span textures, f24 tc0_w) const { + std::array, 4> texture_color{}; + for (u32 i = 0; i < 3; ++i) { + const auto& texture = textures[i]; + if (!texture.enabled) [[unlikely]] { + continue; + } + if (texture.config.address == 0) [[unlikely]] { + texture_color[i] = {0, 0, 0, 255}; + continue; + } + + const s32 coordinate_i = (i == 2 && regs.texturing.main_config.texture2_use_coord1) ? 1 : i; + f24 u = uv[coordinate_i].u(); + f24 v = uv[coordinate_i].v(); + + // Only unit 0 respects the texturing type (according to 3DBrew) + PAddr texture_address = texture.config.GetPhysicalAddress(); + f24 shadow_z; + if (i == 0) { + switch (texture.config.type) { + case TexturingRegs::TextureConfig::Texture2D: + break; + case TexturingRegs::TextureConfig::ShadowCube: + case TexturingRegs::TextureConfig::TextureCube: { + std::tie(u, v, shadow_z, texture_address) = + ConvertCubeCoord(u, v, tc0_w, regs.texturing); + break; + } + case TexturingRegs::TextureConfig::Projection2D: { + u /= tc0_w; + v /= tc0_w; + break; + } + case TexturingRegs::TextureConfig::Shadow2D: { + if (!regs.texturing.shadow.orthographic) { + u /= tc0_w; + v /= tc0_w; + } + shadow_z = f24::FromFloat32(std::abs(tc0_w.ToFloat32())); + break; + } + case TexturingRegs::TextureConfig::Disabled: + continue; // skip this unit and continue to the next unit + default: + LOG_ERROR(HW_GPU, "Unhandled texture type {:x}", (int)texture.config.type); + UNIMPLEMENTED(); + break; + } + } + + const f24 width = f24::FromFloat32(static_cast(texture.config.width)); + const f24 height = f24::FromFloat32(static_cast(texture.config.height)); + s32 s = static_cast((u * width).ToFloat32()); + s32 t = static_cast((v * height).ToFloat32()); + + bool use_border_s = false; + bool use_border_t = false; + + if (texture.config.wrap_s == TexturingRegs::TextureConfig::ClampToBorder) { + use_border_s = s < 0 || s >= static_cast(texture.config.width); + } else if (texture.config.wrap_s == TexturingRegs::TextureConfig::ClampToBorder2) { + use_border_s = s >= static_cast(texture.config.width); + } + + if (texture.config.wrap_t == TexturingRegs::TextureConfig::ClampToBorder) { + use_border_t = t < 0 || t >= static_cast(texture.config.height); + } else if (texture.config.wrap_t == TexturingRegs::TextureConfig::ClampToBorder2) { + use_border_t = t >= static_cast(texture.config.height); + } + + if (use_border_s || use_border_t) { + const auto border_color = texture.config.border_color; + texture_color[i] = Common::MakeVec(border_color.r.Value(), border_color.g.Value(), + border_color.b.Value(), border_color.a.Value()) + .Cast(); + } else { + // Textures are laid out from bottom to top, hence we invert the t coordinate. + // NOTE: This may not be the right place for the inversion. + // TODO: Check if this applies to ETC textures, too. + s = GetWrappedTexCoord(texture.config.wrap_s, s, texture.config.width); + t = texture.config.height - 1 - + GetWrappedTexCoord(texture.config.wrap_t, t, texture.config.height); + + const u8* texture_data = memory.GetPhysicalPointer(texture_address); + const auto info = TextureInfo::FromPicaRegister(texture.config, texture.format); + + // TODO: Apply the min and mag filters to the texture + texture_color[i] = LookupTexture(texture_data, s, t, info); + } + + if (i == 0 && (texture.config.type == TexturingRegs::TextureConfig::Shadow2D || + texture.config.type == TexturingRegs::TextureConfig::ShadowCube)) { + + s32 z_int = static_cast(std::min(shadow_z.ToFloat32(), 1.0f) * 0xFFFFFF); + z_int -= regs.texturing.shadow.bias << 1; + const auto& color = texture_color[i]; + const s32 z_ref = (color.w << 16) | (color.z << 8) | color.y; + u8 density; + if (z_ref >= z_int) { + density = color.x; + } else { + density = 0; + } + texture_color[i] = {density, density, density, density}; + } + } + + // Sample procedural texture + if (regs.texturing.main_config.texture3_enable) { + const auto& proctex_uv = uv[regs.texturing.main_config.texture3_coordinates]; + texture_color[3] = ProcTex(proctex_uv.u().ToFloat32(), proctex_uv.v().ToFloat32(), + regs.texturing, state.proctex); + } + + return texture_color; +} + +Common::Vec4 RasterizerSoftware::PixelColor(u16 x, u16 y, + Common::Vec4& combiner_output) const { + const auto dest = fb.GetPixel(x >> 4, y >> 4); + Common::Vec4 blend_output = combiner_output; + + const auto& output_merger = regs.framebuffer.output_merger; + if (output_merger.alphablend_enable) { + const auto params = output_merger.alpha_blending; + const auto lookup_factor = [&](u32 channel, FramebufferRegs::BlendFactor factor) -> u8 { + DEBUG_ASSERT(channel < 4); + + const Common::Vec4 blend_const = + Common::MakeVec( + output_merger.blend_const.r.Value(), output_merger.blend_const.g.Value(), + output_merger.blend_const.b.Value(), output_merger.blend_const.a.Value()) + .Cast(); + + switch (factor) { + case FramebufferRegs::BlendFactor::Zero: + return 0; + case FramebufferRegs::BlendFactor::One: + return 255; + case FramebufferRegs::BlendFactor::SourceColor: + return combiner_output[channel]; + case FramebufferRegs::BlendFactor::OneMinusSourceColor: + return 255 - combiner_output[channel]; + case FramebufferRegs::BlendFactor::DestColor: + return dest[channel]; + case FramebufferRegs::BlendFactor::OneMinusDestColor: + return 255 - dest[channel]; + case FramebufferRegs::BlendFactor::SourceAlpha: + return combiner_output.a(); + case FramebufferRegs::BlendFactor::OneMinusSourceAlpha: + return 255 - combiner_output.a(); + case FramebufferRegs::BlendFactor::DestAlpha: + return dest.a(); + case FramebufferRegs::BlendFactor::OneMinusDestAlpha: + return 255 - dest.a(); + case FramebufferRegs::BlendFactor::ConstantColor: + return blend_const[channel]; + case FramebufferRegs::BlendFactor::OneMinusConstantColor: + return 255 - blend_const[channel]; + case FramebufferRegs::BlendFactor::ConstantAlpha: + return blend_const.a(); + case FramebufferRegs::BlendFactor::OneMinusConstantAlpha: + return 255 - blend_const.a(); + case FramebufferRegs::BlendFactor::SourceAlphaSaturate: + // Returns 1.0 for the alpha channel + if (channel == 3) { + return 255; + } + return std::min(combiner_output.a(), static_cast(255 - dest.a())); + default: + LOG_CRITICAL(HW_GPU, "Unknown blend factor {:x}", factor); + UNIMPLEMENTED(); + break; + } + return combiner_output[channel]; + }; + + const auto srcfactor = Common::MakeVec( + lookup_factor(0, params.factor_source_rgb), lookup_factor(1, params.factor_source_rgb), + lookup_factor(2, params.factor_source_rgb), lookup_factor(3, params.factor_source_a)); + + const auto dstfactor = Common::MakeVec( + lookup_factor(0, params.factor_dest_rgb), lookup_factor(1, params.factor_dest_rgb), + lookup_factor(2, params.factor_dest_rgb), lookup_factor(3, params.factor_dest_a)); + + blend_output = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor, + params.blend_equation_rgb); + blend_output.a() = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor, + params.blend_equation_a) + .a(); + } else { + blend_output = + Common::MakeVec(LogicOp(combiner_output.r(), dest.r(), output_merger.logic_op), + LogicOp(combiner_output.g(), dest.g(), output_merger.logic_op), + LogicOp(combiner_output.b(), dest.b(), output_merger.logic_op), + LogicOp(combiner_output.a(), dest.a(), output_merger.logic_op)); + } + + const Common::Vec4 result = { + output_merger.red_enable ? blend_output.r() : dest.r(), + output_merger.green_enable ? blend_output.g() : dest.g(), + output_merger.blue_enable ? blend_output.b() : dest.b(), + output_merger.alpha_enable ? blend_output.a() : dest.a(), + }; + + return result; +} + +Common::Vec4 RasterizerSoftware::WriteTevConfig( + std::span, 4> texture_color, + std::span tev_stages, + Common::Vec4 primary_color, Common::Vec4 primary_fragment_color, + Common::Vec4 secondary_fragment_color) const { + /** + * Texture environment - consists of 6 stages of color and alpha combining. + * Color combiners take three input color values from some source (e.g. interpolated + * vertex color, texture color, previous stage, etc), perform some very simple + * operations on each of them (e.g. inversion) and then calculate the output color + * with some basic arithmetic. Alpha combiners can be configured separately but work + * analogously. + **/ + Common::Vec4 combiner_output; + Common::Vec4 combiner_buffer = {0, 0, 0, 0}; + Common::Vec4 next_combiner_buffer = + Common::MakeVec(regs.texturing.tev_combiner_buffer_color.r.Value(), + regs.texturing.tev_combiner_buffer_color.g.Value(), + regs.texturing.tev_combiner_buffer_color.b.Value(), + regs.texturing.tev_combiner_buffer_color.a.Value()) + .Cast(); + + for (u32 tev_stage_index = 0; tev_stage_index < tev_stages.size(); ++tev_stage_index) { + const auto& tev_stage = tev_stages[tev_stage_index]; + using Source = TexturingRegs::TevStageConfig::Source; + + auto get_source = [&](Source source) -> Common::Vec4 { + switch (source) { + case Source::PrimaryColor: + return primary_color; + case Source::PrimaryFragmentColor: + return primary_fragment_color; + case Source::SecondaryFragmentColor: + return secondary_fragment_color; + case Source::Texture0: + return texture_color[0]; + case Source::Texture1: + return texture_color[1]; + case Source::Texture2: + return texture_color[2]; + case Source::Texture3: + return texture_color[3]; + case Source::PreviousBuffer: + return combiner_buffer; + case Source::Constant: + return Common::MakeVec(tev_stage.const_r.Value(), tev_stage.const_g.Value(), + tev_stage.const_b.Value(), tev_stage.const_a.Value()) + .Cast(); + case Source::Previous: + return combiner_output; + default: + LOG_ERROR(HW_GPU, "Unknown color combiner source {}", (int)source); + UNIMPLEMENTED(); + return {0, 0, 0, 0}; + } + }; + + /** + * Color combiner + * NOTE: Not sure if the alpha combiner might use the color output of the previous + * stage as input. Hence, we currently don't directly write the result to + * combiner_output.rgb(), but instead store it in a temporary variable until + * alpha combining has been done. + **/ + const std::array, 3> color_result = { + GetColorModifier(tev_stage.color_modifier1, get_source(tev_stage.color_source1)), + GetColorModifier(tev_stage.color_modifier2, get_source(tev_stage.color_source2)), + GetColorModifier(tev_stage.color_modifier3, get_source(tev_stage.color_source3)), + }; + const Common::Vec3 color_output = ColorCombine(tev_stage.color_op, color_result); + + u8 alpha_output; + if (tev_stage.color_op == TexturingRegs::TevStageConfig::Operation::Dot3_RGBA) { + // result of Dot3_RGBA operation is also placed to the alpha component + alpha_output = color_output.x; + } else { + // alpha combiner + const std::array alpha_result = {{ + GetAlphaModifier(tev_stage.alpha_modifier1, get_source(tev_stage.alpha_source1)), + GetAlphaModifier(tev_stage.alpha_modifier2, get_source(tev_stage.alpha_source2)), + GetAlphaModifier(tev_stage.alpha_modifier3, get_source(tev_stage.alpha_source3)), + }}; + alpha_output = AlphaCombine(tev_stage.alpha_op, alpha_result); + } + + combiner_output[0] = std::min(255U, color_output.r() * tev_stage.GetColorMultiplier()); + combiner_output[1] = std::min(255U, color_output.g() * tev_stage.GetColorMultiplier()); + combiner_output[2] = std::min(255U, color_output.b() * tev_stage.GetColorMultiplier()); + combiner_output[3] = std::min(255U, alpha_output * tev_stage.GetAlphaMultiplier()); + + combiner_buffer = next_combiner_buffer; + + if (regs.texturing.tev_combiner_buffer_input.TevStageUpdatesCombinerBufferColor( + tev_stage_index)) { + next_combiner_buffer.r() = combiner_output.r(); + next_combiner_buffer.g() = combiner_output.g(); + next_combiner_buffer.b() = combiner_output.b(); + } + + if (regs.texturing.tev_combiner_buffer_input.TevStageUpdatesCombinerBufferAlpha( + tev_stage_index)) { + next_combiner_buffer.a() = combiner_output.a(); + } + } + return combiner_output; +} + +void RasterizerSoftware::WriteFog(Common::Vec4& combiner_output, float depth) const { + /** + * Apply fog combiner. Not fully accurate. We'd have to know what data type is used to + * store the depth etc. Using float for now until we know more about Pica datatypes. + **/ + if (regs.texturing.fog_mode == TexturingRegs::FogMode::Fog) { + const Common::Vec3 fog_color = + Common::MakeVec(regs.texturing.fog_color.r.Value(), regs.texturing.fog_color.g.Value(), + regs.texturing.fog_color.b.Value()) + .Cast(); + + float fog_index; + if (regs.texturing.fog_flip) { + fog_index = (1.0f - depth) * 128.0f; + } else { + fog_index = depth * 128.0f; + } + + // Generate clamped fog factor from LUT for given fog index + const f32 fog_i = std::clamp(floorf(fog_index), 0.0f, 127.0f); + const f32 fog_f = fog_index - fog_i; + const auto& fog_lut_entry = state.fog.lut[static_cast(fog_i)]; + f32 fog_factor = fog_lut_entry.ToFloat() + fog_lut_entry.DiffToFloat() * fog_f; + fog_factor = std::clamp(fog_factor, 0.0f, 1.0f); + for (u32 i = 0; i < 3; i++) { + combiner_output[i] = static_cast(fog_factor * combiner_output[i] + + (1.0f - fog_factor) * fog_color[i]); + } + } +} + +bool RasterizerSoftware::DoAlphaTest(u8 alpha) const { + const auto& output_merger = regs.framebuffer.output_merger; + if (!output_merger.alpha_test.enable) { + return true; + } + switch (output_merger.alpha_test.func) { + case FramebufferRegs::CompareFunc::Never: + return false; + case FramebufferRegs::CompareFunc::Always: + return true; + case FramebufferRegs::CompareFunc::Equal: + return alpha == output_merger.alpha_test.ref; + case FramebufferRegs::CompareFunc::NotEqual: + return alpha != output_merger.alpha_test.ref; + case FramebufferRegs::CompareFunc::LessThan: + return alpha < output_merger.alpha_test.ref; + case FramebufferRegs::CompareFunc::LessThanOrEqual: + return alpha <= output_merger.alpha_test.ref; + case FramebufferRegs::CompareFunc::GreaterThan: + return alpha > output_merger.alpha_test.ref; + case FramebufferRegs::CompareFunc::GreaterThanOrEqual: + return alpha >= output_merger.alpha_test.ref; + } +} + +bool RasterizerSoftware::DoDepthStencilTest(u16 x, u16 y, float depth, + bool stencil_action_enable) const { + const auto& framebuffer = regs.framebuffer.framebuffer; + const auto stencil_test = regs.framebuffer.output_merger.stencil_test; + u8 old_stencil = 0; + + const auto update_stencil = [&](Pica::FramebufferRegs::StencilAction action) { + const u8 new_stencil = + PerformStencilAction(action, old_stencil, stencil_test.reference_value); + if (framebuffer.allow_depth_stencil_write != 0) { + const u8 stencil = + (new_stencil & stencil_test.write_mask) | (old_stencil & ~stencil_test.write_mask); + fb.SetStencil(x >> 4, y >> 4, stencil); + } + }; + + if (stencil_action_enable) { + old_stencil = fb.GetStencil(x >> 4, y >> 4); + const u8 dest = old_stencil & stencil_test.input_mask; + const u8 ref = stencil_test.reference_value & stencil_test.input_mask; + bool pass = false; + switch (stencil_test.func) { + case FramebufferRegs::CompareFunc::Never: + pass = false; + break; + case FramebufferRegs::CompareFunc::Always: + pass = true; + break; + case FramebufferRegs::CompareFunc::Equal: + pass = (ref == dest); + break; + case FramebufferRegs::CompareFunc::NotEqual: + pass = (ref != dest); + break; + case FramebufferRegs::CompareFunc::LessThan: + pass = (ref < dest); + break; + case FramebufferRegs::CompareFunc::LessThanOrEqual: + pass = (ref <= dest); + break; + case FramebufferRegs::CompareFunc::GreaterThan: + pass = (ref > dest); + break; + case FramebufferRegs::CompareFunc::GreaterThanOrEqual: + pass = (ref >= dest); + break; + } + if (!pass) { + update_stencil(stencil_test.action_stencil_fail); + return false; + } + } + + const u32 num_bits = FramebufferRegs::DepthBitsPerPixel(framebuffer.depth_format); + const u32 z = static_cast(depth * ((1 << num_bits) - 1)); + + const auto& output_merger = regs.framebuffer.output_merger; + if (output_merger.depth_test_enable) { + const u32 ref_z = fb.GetDepth(x >> 4, y >> 4); + bool pass = false; + switch (output_merger.depth_test_func) { + case FramebufferRegs::CompareFunc::Never: + pass = false; + break; + case FramebufferRegs::CompareFunc::Always: + pass = true; + break; + case FramebufferRegs::CompareFunc::Equal: + pass = z == ref_z; + break; + case FramebufferRegs::CompareFunc::NotEqual: + pass = z != ref_z; + break; + case FramebufferRegs::CompareFunc::LessThan: + pass = z < ref_z; + break; + case FramebufferRegs::CompareFunc::LessThanOrEqual: + pass = z <= ref_z; + break; + case FramebufferRegs::CompareFunc::GreaterThan: + pass = z > ref_z; + break; + case FramebufferRegs::CompareFunc::GreaterThanOrEqual: + pass = z >= ref_z; + break; + } + if (!pass) { + if (stencil_action_enable) { + update_stencil(stencil_test.action_depth_fail); + } + return false; + } + } + if (framebuffer.allow_depth_stencil_write != 0 && output_merger.depth_write_enable) { + fb.SetDepth(x >> 4, y >> 4, z); + } + // The stencil depth_pass action is executed even if depth testing is disabled + if (stencil_action_enable) { + update_stencil(stencil_test.action_depth_pass); + } + + return true; +} + +} // namespace SwRenderer diff --git a/src/video_core/renderer_software/sw_rasterizer.h b/src/video_core/renderer_software/sw_rasterizer.h index 26235df01..71b501850 100644 --- a/src/video_core/renderer_software/sw_rasterizer.h +++ b/src/video_core/renderer_software/sw_rasterizer.h @@ -4,16 +4,30 @@ #pragma once -#include "common/common_types.h" +#include + #include "video_core/rasterizer_interface.h" +#include "video_core/regs_texturing.h" +#include "video_core/renderer_software/sw_clipper.h" +#include "video_core/renderer_software/sw_framebuffer.h" namespace Pica::Shader { struct OutputVertex; -} // namespace Pica::Shader +} -namespace VideoCore { +namespace Pica { +struct State; +struct Regs; +} // namespace Pica + +namespace SwRenderer { + +struct Vertex; + +class RasterizerSoftware : public VideoCore::RasterizerInterface { +public: + explicit RasterizerSoftware(Memory::MemorySystem& memory); -class RasterizerSoftware : public RasterizerInterface { void AddTriangle(const Pica::Shader::OutputVertex& v0, const Pica::Shader::OutputVertex& v1, const Pica::Shader::OutputVertex& v2) override; void DrawTriangles() override {} @@ -23,6 +37,44 @@ class RasterizerSoftware : public RasterizerInterface { void InvalidateRegion(PAddr addr, u32 size) override {} void FlushAndInvalidateRegion(PAddr addr, u32 size) override {} void ClearAll(bool flush) override {} + +private: + /// Computes the screen coordinates of the provided vertex. + void MakeScreenCoords(Vertex& vtx); + + /// Processes the triangle defined by the provided vertices. + void ProcessTriangle(const Vertex& v0, const Vertex& v1, const Vertex& v2, + bool reversed = false); + + /// Returns the texture color of the currently processed pixel. + std::array, 4> TextureColor( + std::span, 3> uv, + std::span textures, f24 tc0_w) const; + + /// Returns the final pixel color with blending or logic ops applied. + Common::Vec4 PixelColor(u16 x, u16 y, Common::Vec4& combiner_output) const; + + /// Emulates the TEV configuration and returns the combiner output. + Common::Vec4 WriteTevConfig( + std::span, 4> texture_color, + std::span tev_stages, + Common::Vec4 primary_color, Common::Vec4 primary_fragment_color, + Common::Vec4 secondary_fragment_color) const; + + /// Blends fog to the combiner output if enabled. + void WriteFog(Common::Vec4& combiner_output, float depth) const; + + /// Performs the alpha test. Returns false if the test failed. + bool DoAlphaTest(u8 alpha) const; + + /// Performs the depth stencil test. Returns false if the test failed. + bool DoDepthStencilTest(u16 x, u16 y, float depth, bool stencil_action_enable) const; + +private: + Memory::MemorySystem& memory; + Pica::State& state; + const Pica::Regs& regs; + Framebuffer fb; }; -} // namespace VideoCore +} // namespace SwRenderer diff --git a/src/video_core/renderer_software/sw_texturing.cpp b/src/video_core/renderer_software/sw_texturing.cpp index 799b7e5c2..a41e733d2 100644 --- a/src/video_core/renderer_software/sw_texturing.cpp +++ b/src/video_core/renderer_software/sw_texturing.cpp @@ -9,41 +9,40 @@ #include "video_core/regs_texturing.h" #include "video_core/renderer_software/sw_texturing.h" -namespace Pica::Rasterizer { +namespace SwRenderer { -using TevStageConfig = TexturingRegs::TevStageConfig; +using TevStageConfig = Pica::TexturingRegs::TevStageConfig; + +int GetWrappedTexCoord(Pica::TexturingRegs::TextureConfig::WrapMode mode, s32 val, u32 size) { + using TextureConfig = Pica::TexturingRegs::TextureConfig; -int GetWrappedTexCoord(TexturingRegs::TextureConfig::WrapMode mode, int val, unsigned size) { switch (mode) { - case TexturingRegs::TextureConfig::ClampToEdge2: + case TextureConfig::ClampToEdge2: // For negative coordinate, ClampToEdge2 behaves the same as Repeat if (val < 0) { - return static_cast(static_cast(val) % size); + return static_cast(static_cast(val) % size); } - // [[fallthrough]] - case TexturingRegs::TextureConfig::ClampToEdge: + [[fallthrough]]; + case TextureConfig::ClampToEdge: val = std::max(val, 0); - val = std::min(val, static_cast(size) - 1); + val = std::min(val, static_cast(size) - 1); return val; - - case TexturingRegs::TextureConfig::ClampToBorder: + case TextureConfig::ClampToBorder: return val; - - case TexturingRegs::TextureConfig::ClampToBorder2: + case TextureConfig::ClampToBorder2: // For ClampToBorder2, the case of positive coordinate beyond the texture size is already // handled outside. Here we only handle the negative coordinate in the same way as Repeat. - case TexturingRegs::TextureConfig::Repeat2: - case TexturingRegs::TextureConfig::Repeat3: - case TexturingRegs::TextureConfig::Repeat: - return static_cast(static_cast(val) % size); - - case TexturingRegs::TextureConfig::MirroredRepeat: { - unsigned int coord = (static_cast(val) % (2 * size)); - if (coord >= size) + case TextureConfig::Repeat2: + case TextureConfig::Repeat3: + case TextureConfig::Repeat: + return static_cast(static_cast(val) % size); + case TextureConfig::MirroredRepeat: { + u32 coord = (static_cast(val) % (2 * size)); + if (coord >= size) { coord = 2 * size - 1 - coord; - return static_cast(coord); + } + return static_cast(coord); } - default: LOG_ERROR(HW_GPU, "Unknown texture coordinate wrapping mode {:x}", (int)mode); UNIMPLEMENTED(); @@ -58,35 +57,25 @@ Common::Vec3 GetColorModifier(TevStageConfig::ColorModifier factor, switch (factor) { case ColorModifier::SourceColor: return values.rgb(); - case ColorModifier::OneMinusSourceColor: return (Common::Vec3(255, 255, 255) - values.rgb()).Cast(); - case ColorModifier::SourceAlpha: return values.aaa(); - case ColorModifier::OneMinusSourceAlpha: return (Common::Vec3(255, 255, 255) - values.aaa()).Cast(); - case ColorModifier::SourceRed: return values.rrr(); - case ColorModifier::OneMinusSourceRed: return (Common::Vec3(255, 255, 255) - values.rrr()).Cast(); - case ColorModifier::SourceGreen: return values.ggg(); - case ColorModifier::OneMinusSourceGreen: return (Common::Vec3(255, 255, 255) - values.ggg()).Cast(); - case ColorModifier::SourceBlue: return values.bbb(); - case ColorModifier::OneMinusSourceBlue: return (Common::Vec3(255, 255, 255) - values.bbb()).Cast(); } - UNREACHABLE(); }; @@ -96,42 +85,33 @@ u8 GetAlphaModifier(TevStageConfig::AlphaModifier factor, const Common::Vec4 switch (factor) { case AlphaModifier::SourceAlpha: return values.a(); - case AlphaModifier::OneMinusSourceAlpha: return 255 - values.a(); - case AlphaModifier::SourceRed: return values.r(); - case AlphaModifier::OneMinusSourceRed: return 255 - values.r(); - case AlphaModifier::SourceGreen: return values.g(); - case AlphaModifier::OneMinusSourceGreen: return 255 - values.g(); - case AlphaModifier::SourceBlue: return values.b(); - case AlphaModifier::OneMinusSourceBlue: return 255 - values.b(); } - UNREACHABLE(); }; -Common::Vec3 ColorCombine(TevStageConfig::Operation op, const Common::Vec3 input[3]) { +Common::Vec3 ColorCombine(TevStageConfig::Operation op, + std::span, 3> input) { using Operation = TevStageConfig::Operation; switch (op) { case Operation::Replace: return input[0]; - case Operation::Modulate: return ((input[0] * input[1]) / 255).Cast(); - case Operation::Add: { auto result = input[0] + input[1]; result.r() = std::min(255, result.r()); @@ -139,46 +119,41 @@ Common::Vec3 ColorCombine(TevStageConfig::Operation op, const Common::Vec3(); } - case Operation::AddSigned: { // TODO(bunnei): Verify that the color conversion from (float) 0.5f to // (byte) 128 is correct - auto result = - input[0].Cast() + input[1].Cast() - Common::MakeVec(128, 128, 128); - result.r() = std::clamp(result.r(), 0, 255); - result.g() = std::clamp(result.g(), 0, 255); - result.b() = std::clamp(result.b(), 0, 255); + Common::Vec3i result = + input[0].Cast() + input[1].Cast() - Common::MakeVec(128, 128, 128); + result.r() = std::clamp(result.r(), 0, 255); + result.g() = std::clamp(result.g(), 0, 255); + result.b() = std::clamp(result.b(), 0, 255); return result.Cast(); } - case Operation::Lerp: return ((input[0] * input[2] + input[1] * (Common::MakeVec(255, 255, 255) - input[2]).Cast()) / 255) .Cast(); - case Operation::Subtract: { - auto result = input[0].Cast() - input[1].Cast(); + auto result = input[0].Cast() - input[1].Cast(); result.r() = std::max(0, result.r()); result.g() = std::max(0, result.g()); result.b() = std::max(0, result.b()); return result.Cast(); } - case Operation::MultiplyThenAdd: { - auto result = (input[0] * input[1] + 255 * input[2].Cast()) / 255; + auto result = (input[0] * input[1] + 255 * input[2].Cast()) / 255; result.r() = std::min(255, result.r()); result.g() = std::min(255, result.g()); result.b() = std::min(255, result.b()); return result.Cast(); } - case Operation::AddThenMultiply: { auto result = input[0] + input[1]; result.r() = std::min(255, result.r()); result.g() = std::min(255, result.g()); result.b() = std::min(255, result.b()); - result = (result * input[2].Cast()) / 255; + result = (result * input[2].Cast()) / 255; return result.Cast(); } case Operation::Dot3_RGB: @@ -187,11 +162,11 @@ Common::Vec3 ColorCombine(TevStageConfig::Operation op, const Common::Vec3(); } default: LOG_ERROR(HW_GPU, "Unknown color combiner operation {}", (int)op); @@ -205,31 +180,23 @@ u8 AlphaCombine(TevStageConfig::Operation op, const std::array& input) { using Operation = TevStageConfig::Operation; case Operation::Replace: return input[0]; - case Operation::Modulate: return input[0] * input[1] / 255; - case Operation::Add: return std::min(255, input[0] + input[1]); - case Operation::AddSigned: { // TODO(bunnei): Verify that the color conversion from (float) 0.5f to (byte) 128 is correct - auto result = static_cast(input[0]) + static_cast(input[1]) - 128; - return static_cast(std::clamp(result, 0, 255)); + auto result = static_cast(input[0]) + static_cast(input[1]) - 128; + return static_cast(std::clamp(result, 0, 255)); } - case Operation::Lerp: return (input[0] * input[2] + input[1] * (255 - input[2])) / 255; - case Operation::Subtract: - return std::max(0, (int)input[0] - (int)input[1]); - + return std::max(0, static_cast(input[0]) - static_cast(input[1])); case Operation::MultiplyThenAdd: return std::min(255, (input[0] * input[1] + 255 * input[2]) / 255); - case Operation::AddThenMultiply: return (std::min(255, (input[0] + input[1])) * input[2]) / 255; - default: LOG_ERROR(HW_GPU, "Unknown alpha combiner operation {}", (int)op); UNIMPLEMENTED(); @@ -237,4 +204,4 @@ u8 AlphaCombine(TevStageConfig::Operation op, const std::array& input) { } }; -} // namespace Pica::Rasterizer +} // namespace SwRenderer diff --git a/src/video_core/renderer_software/sw_texturing.h b/src/video_core/renderer_software/sw_texturing.h index 140a386ab..cf81736aa 100644 --- a/src/video_core/renderer_software/sw_texturing.h +++ b/src/video_core/renderer_software/sw_texturing.h @@ -4,23 +4,25 @@ #pragma once +#include + #include "common/common_types.h" #include "common/vector_math.h" #include "video_core/regs_texturing.h" -namespace Pica::Rasterizer { +namespace SwRenderer { -int GetWrappedTexCoord(TexturingRegs::TextureConfig::WrapMode mode, int val, unsigned size); +int GetWrappedTexCoord(Pica::TexturingRegs::TextureConfig::WrapMode mode, s32 val, u32 size); -Common::Vec3 GetColorModifier(TexturingRegs::TevStageConfig::ColorModifier factor, +Common::Vec3 GetColorModifier(Pica::TexturingRegs::TevStageConfig::ColorModifier factor, const Common::Vec4& values); -u8 GetAlphaModifier(TexturingRegs::TevStageConfig::AlphaModifier factor, +u8 GetAlphaModifier(Pica::TexturingRegs::TevStageConfig::AlphaModifier factor, const Common::Vec4& values); -Common::Vec3 ColorCombine(TexturingRegs::TevStageConfig::Operation op, - const Common::Vec3 input[3]); +Common::Vec3 ColorCombine(Pica::TexturingRegs::TevStageConfig::Operation op, + std::span, 3> input); -u8 AlphaCombine(TexturingRegs::TevStageConfig::Operation op, const std::array& input); +u8 AlphaCombine(Pica::TexturingRegs::TevStageConfig::Operation op, const std::array& input); -} // namespace Pica::Rasterizer +} // namespace SwRenderer diff --git a/src/video_core/shader/debug_data.h b/src/video_core/shader/debug_data.h index 70bb8dc31..a85caeed7 100644 --- a/src/video_core/shader/debug_data.h +++ b/src/video_core/shader/debug_data.h @@ -54,12 +54,12 @@ struct DebugData { LOOP_INT_IN = 0x800, }; - Common::Vec4 src1; - Common::Vec4 src2; - Common::Vec4 src3; + Common::Vec4 src1; + Common::Vec4 src2; + Common::Vec4 src3; - Common::Vec4 dest_in; - Common::Vec4 dest_out; + Common::Vec4 dest_in; + Common::Vec4 dest_out; s32 address_registers[2]; bool conditional_code[2]; @@ -89,7 +89,7 @@ template inline void SetField(DebugDataRecord& record, ValueType value); template <> -inline void SetField(DebugDataRecord& record, float24* value) { +inline void SetField(DebugDataRecord& record, f24* value) { record.src1.x = value[0]; record.src1.y = value[1]; record.src1.z = value[2]; @@ -97,7 +97,7 @@ inline void SetField(DebugDataRecord& record, float24* va } template <> -inline void SetField(DebugDataRecord& record, float24* value) { +inline void SetField(DebugDataRecord& record, f24* value) { record.src2.x = value[0]; record.src2.y = value[1]; record.src2.z = value[2]; @@ -105,7 +105,7 @@ inline void SetField(DebugDataRecord& record, float24* va } template <> -inline void SetField(DebugDataRecord& record, float24* value) { +inline void SetField(DebugDataRecord& record, f24* value) { record.src3.x = value[0]; record.src3.y = value[1]; record.src3.z = value[2]; @@ -113,7 +113,7 @@ inline void SetField(DebugDataRecord& record, float24* va } template <> -inline void SetField(DebugDataRecord& record, float24* value) { +inline void SetField(DebugDataRecord& record, f24* value) { record.dest_in.x = value[0]; record.dest_in.y = value[1]; record.dest_in.z = value[2]; @@ -121,7 +121,7 @@ inline void SetField(DebugDataRecord& record, float24* } template <> -inline void SetField(DebugDataRecord& record, float24* value) { +inline void SetField(DebugDataRecord& record, f24* value) { record.dest_out.x = value[0]; record.dest_out.y = value[1]; record.dest_out.z = value[2]; diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp index b54597205..126efd834 100644 --- a/src/video_core/shader/shader.cpp +++ b/src/video_core/shader/shader.cpp @@ -5,10 +5,10 @@ #include #include #include "common/arch.h" +#include "common/assert.h" #include "common/bit_set.h" #include "common/logging/log.h" #include "common/microprofile.h" -#include "video_core/pica_state.h" #include "video_core/regs_rasterizer.h" #include "video_core/regs_shader.h" #include "video_core/shader/shader.h" @@ -41,11 +41,11 @@ OutputVertex OutputVertex::FromAttributeBuffer(const RasterizerRegs& regs, // Allow us to overflow OutputVertex to avoid branches, since // RasterizerRegs::VSOutputAttributes::INVALID would write to slot 31, which // would be out of bounds otherwise. - std::array vertex_slots_overflow; + std::array vertex_slots_overflow; }; // Assert that OutputVertex has enough space for 24 semantic registers - static_assert(sizeof(std::array) == sizeof(ret), + static_assert(sizeof(std::array) == sizeof(ret), "Struct and array have different sizes."); unsigned int num_attributes = regs.vs_output_total & 7; @@ -61,7 +61,7 @@ OutputVertex OutputVertex::FromAttributeBuffer(const RasterizerRegs& regs, // interpolation for (unsigned i = 0; i < 4; ++i) { float c = std::fabs(ret.color[i].ToFloat32()); - ret.color[i] = float24::FromFloat32(c < 1.0f ? c : 1.0f); + ret.color[i] = f24::FromFloat32(c < 1.0f ? c : 1.0f); } LOG_TRACE(HW_GPU, @@ -86,7 +86,7 @@ void UnitState::LoadInput(const ShaderRegs& config, const AttributeBuffer& input } } -static void CopyRegistersToOutput(std::span, 16> regs, u32 mask, +static void CopyRegistersToOutput(std::span, 16> regs, u32 mask, AttributeBuffer& buffer) { int output_i = 0; for (int reg : Common::BitSet(mask)) { @@ -108,7 +108,7 @@ GSEmitter::~GSEmitter() { delete handlers; } -void GSEmitter::Emit(std::span, 16> output_regs) { +void GSEmitter::Emit(std::span, 16> output_regs) { ASSERT(vertex_id < 3); // TODO: This should be merged with UnitState::WriteOutput somehow CopyRegistersToOutput(output_regs, output_mask, buffer[vertex_id]); diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h index bc5eb1595..65a586e31 100644 --- a/src/video_core/shader/shader.h +++ b/src/video_core/shader/shader.h @@ -12,7 +12,6 @@ #include #include #include -#include "common/assert.h" #include "common/common_funcs.h" #include "common/common_types.h" #include "common/hash.h" @@ -29,7 +28,7 @@ using ProgramCode = std::array; using SwizzleData = std::array; struct AttributeBuffer { - alignas(16) Common::Vec4 attr[16]; + alignas(16) Common::Vec4 attr[16]; private: friend class boost::serialization::access; @@ -46,16 +45,16 @@ using VertexHandler = std::function; using WindingSetter = std::function; struct OutputVertex { - Common::Vec4 pos; - Common::Vec4 quat; - Common::Vec4 color; - Common::Vec2 tc0; - Common::Vec2 tc1; - float24 tc0_w; + Common::Vec4 pos; + Common::Vec4 quat; + Common::Vec4 color; + Common::Vec2 tc0; + Common::Vec2 tc1; + f24 tc0_w; INSERT_PADDING_WORDS(1); - Common::Vec3 view; + Common::Vec3 view; INSERT_PADDING_WORDS(1); - Common::Vec2 tc2; + Common::Vec2 tc2; static void ValidateSemantics(const RasterizerRegs& regs); static OutputVertex FromAttributeBuffer(const RasterizerRegs& regs, @@ -76,8 +75,8 @@ private: friend class boost::serialization::access; }; #define ASSERT_POS(var, pos) \ - static_assert(offsetof(OutputVertex, var) == pos * sizeof(float24), "Semantic at wrong " \ - "offset.") + static_assert(offsetof(OutputVertex, var) == pos * sizeof(f24), "Semantic at wrong " \ + "offset.") ASSERT_POS(pos, RasterizerRegs::VSOutputAttributes::POSITION_X); ASSERT_POS(quat, RasterizerRegs::VSOutputAttributes::QUATERNION_X); ASSERT_POS(color, RasterizerRegs::VSOutputAttributes::COLOR_R); @@ -109,7 +108,7 @@ struct GSEmitter { GSEmitter(); ~GSEmitter(); - void Emit(std::span, 16> output_regs); + void Emit(std::span, 16> output_regs); private: friend class boost::serialization::access; @@ -136,9 +135,9 @@ struct UnitState { struct Registers { // The registers are accessed by the shader JIT using SSE instructions, and are therefore // required to be 16-byte aligned. - alignas(16) std::array, 16> input; - alignas(16) std::array, 16> temporary; - alignas(16) std::array, 16> output; + alignas(16) std::array, 16> input; + alignas(16) std::array, 16> temporary; + alignas(16) std::array, 16> output; private: friend class boost::serialization::access; @@ -160,18 +159,16 @@ struct UnitState { GSEmitter* emitter_ptr; static std::size_t InputOffset(int register_index) { - return offsetof(UnitState, registers.input) + - register_index * sizeof(Common::Vec4); + return offsetof(UnitState, registers.input) + register_index * sizeof(Common::Vec4); } static std::size_t OutputOffset(int register_index) { - return offsetof(UnitState, registers.output) + - register_index * sizeof(Common::Vec4); + return offsetof(UnitState, registers.output) + register_index * sizeof(Common::Vec4); } static std::size_t TemporaryOffset(int register_index) { return offsetof(UnitState, registers.temporary) + - register_index * sizeof(Common::Vec4); + register_index * sizeof(Common::Vec4); } /** @@ -219,13 +216,13 @@ private: struct Uniforms { // The float uniforms are accessed by the shader JIT using SSE instructions, and are // therefore required to be 16-byte aligned. - alignas(16) std::array, 96> f; + alignas(16) std::array, 96> f; std::array b; std::array, 4> i; static std::size_t GetFloatUniformOffset(unsigned index) { - return offsetof(Uniforms, f) + index * sizeof(Common::Vec4); + return offsetof(Uniforms, f) + index * sizeof(Common::Vec4); } static std::size_t GetBoolUniformOffset(unsigned index) { diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp index 18859b2ab..681817686 100644 --- a/src/video_core/shader/shader_interpreter.cpp +++ b/src/video_core/shader/shader_interpreter.cpp @@ -80,7 +80,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData const auto& program_code = setup.program_code; // Placeholder for invalid inputs - static float24 dummy_vec4_float24[4]; + static f24 dummy_vec4_float24[4]; unsigned iteration = 0; bool exit_loop = false; @@ -111,7 +111,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData debug_data.max_offset = std::max(debug_data.max_offset, 1 + program_counter); - auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const float24* { + auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const f24* { switch (source_reg.GetRegisterType()) { case RegisterType::Input: return &state.registers.input[source_reg.GetIndex()].x; @@ -137,15 +137,15 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData ? 0 : state.address_registers[instr.common.address_register_index - 1]; - const float24* src1_ = LookupSourceRegister(instr.common.GetSrc1(is_inverted) + - (is_inverted ? 0 : address_offset)); - const float24* src2_ = LookupSourceRegister(instr.common.GetSrc2(is_inverted) + - (is_inverted ? address_offset : 0)); + const f24* src1_ = LookupSourceRegister(instr.common.GetSrc1(is_inverted) + + (is_inverted ? 0 : address_offset)); + const f24* src2_ = LookupSourceRegister(instr.common.GetSrc2(is_inverted) + + (is_inverted ? address_offset : 0)); const bool negate_src1 = ((bool)swizzle.negate_src1 != false); const bool negate_src2 = ((bool)swizzle.negate_src2 != false); - float24 src1[4] = { + f24 src1[4] = { src1_[(int)swizzle.src1_selector_0.Value()], src1_[(int)swizzle.src1_selector_1.Value()], src1_[(int)swizzle.src1_selector_2.Value()], @@ -157,7 +157,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData src1[2] = -src1[2]; src1[3] = -src1[3]; } - float24 src2[4] = { + f24 src2[4] = { src2_[(int)swizzle.src2_selector_0.Value()], src2_[(int)swizzle.src2_selector_1.Value()], src2_[(int)swizzle.src2_selector_2.Value()], @@ -170,12 +170,11 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData src2[3] = -src2[3]; } - float24* dest = - (instr.common.dest.Value() < 0x10) - ? &state.registers.output[instr.common.dest.Value().GetIndex()][0] - : (instr.common.dest.Value() < 0x20) - ? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0] - : dummy_vec4_float24; + f24* dest = (instr.common.dest.Value() < 0x10) + ? &state.registers.output[instr.common.dest.Value().GetIndex()][0] + : (instr.common.dest.Value() < 0x20) + ? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0] + : dummy_vec4_float24; debug_data.max_opdesc_id = std::max(debug_data.max_opdesc_id, 1 + instr.common.operand_desc_id); @@ -216,7 +215,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData if (!swizzle.DestComponentEnabled(i)) continue; - dest[i] = float24::FromFloat32(std::floor(src1[i].ToFloat32())); + dest[i] = f24::FromFloat32(std::floor(src1[i].ToFloat32())); } Record(debug_data, iteration, dest); break; @@ -263,11 +262,10 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData OpCode::Id opcode = instr.opcode.Value().EffectiveOpCode(); if (opcode == OpCode::Id::DPH || opcode == OpCode::Id::DPHI) - src1[3] = float24::FromFloat32(1.0f); + src1[3] = f24::One(); int num_components = (opcode == OpCode::Id::DP3) ? 3 : 4; - float24 dot = std::inner_product(src1, src1 + num_components, src2, - float24::FromFloat32(0.f)); + f24 dot = std::inner_product(src1, src1 + num_components, src2, f24::Zero()); for (int i = 0; i < 4; ++i) { if (!swizzle.DestComponentEnabled(i)) @@ -283,7 +281,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData case OpCode::Id::RCP: { Record(debug_data, iteration, src1); Record(debug_data, iteration, dest); - float24 rcp_res = float24::FromFloat32(1.0f / src1[0].ToFloat32()); + f24 rcp_res = f24::FromFloat32(1.0f / src1[0].ToFloat32()); for (int i = 0; i < 4; ++i) { if (!swizzle.DestComponentEnabled(i)) continue; @@ -298,7 +296,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData case OpCode::Id::RSQ: { Record(debug_data, iteration, src1); Record(debug_data, iteration, dest); - float24 rsq_res = float24::FromFloat32(1.0f / std::sqrt(src1[0].ToFloat32())); + f24 rsq_res = f24::FromFloat32(1.0f / std::sqrt(src1[0].ToFloat32())); for (int i = 0; i < 4; ++i) { if (!swizzle.DestComponentEnabled(i)) continue; @@ -345,8 +343,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData if (!swizzle.DestComponentEnabled(i)) continue; - dest[i] = (src1[i] >= src2[i]) ? float24::FromFloat32(1.0f) - : float24::FromFloat32(0.0f); + dest[i] = (src1[i] >= src2[i]) ? f24::One() : f24::Zero(); } Record(debug_data, iteration, dest); break; @@ -360,8 +357,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData if (!swizzle.DestComponentEnabled(i)) continue; - dest[i] = (src1[i] < src2[i]) ? float24::FromFloat32(1.0f) - : float24::FromFloat32(0.0f); + dest[i] = (src1[i] < src2[i]) ? f24::One() : f24::Zero(); } Record(debug_data, iteration, dest); break; @@ -413,7 +409,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData Record(debug_data, iteration, dest); // EX2 only takes first component exp2 and writes it to all dest components - float24 ex2_res = float24::FromFloat32(std::exp2(src1[0].ToFloat32())); + f24 ex2_res = f24::FromFloat32(std::exp2(src1[0].ToFloat32())); for (int i = 0; i < 4; ++i) { if (!swizzle.DestComponentEnabled(i)) continue; @@ -430,7 +426,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData Record(debug_data, iteration, dest); // LG2 only takes the first component log2 and writes it to all dest components - float24 lg2_res = float24::FromFloat32(std::log2(src1[0].ToFloat32())); + f24 lg2_res = f24::FromFloat32(std::log2(src1[0].ToFloat32())); for (int i = 0; i < 4; ++i) { if (!swizzle.DestComponentEnabled(i)) continue; @@ -466,17 +462,17 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData ? 0 : state.address_registers[instr.mad.address_register_index - 1]; - const float24* src1_ = LookupSourceRegister(instr.mad.GetSrc1(is_inverted)); - const float24* src2_ = LookupSourceRegister(instr.mad.GetSrc2(is_inverted) + - (!is_inverted * address_offset)); - const float24* src3_ = LookupSourceRegister(instr.mad.GetSrc3(is_inverted) + - (is_inverted * address_offset)); + const f24* src1_ = LookupSourceRegister(instr.mad.GetSrc1(is_inverted)); + const f24* src2_ = LookupSourceRegister(instr.mad.GetSrc2(is_inverted) + + (!is_inverted * address_offset)); + const f24* src3_ = LookupSourceRegister(instr.mad.GetSrc3(is_inverted) + + (is_inverted * address_offset)); const bool negate_src1 = ((bool)mad_swizzle.negate_src1 != false); const bool negate_src2 = ((bool)mad_swizzle.negate_src2 != false); const bool negate_src3 = ((bool)mad_swizzle.negate_src3 != false); - float24 src1[4] = { + f24 src1[4] = { src1_[(int)mad_swizzle.src1_selector_0.Value()], src1_[(int)mad_swizzle.src1_selector_1.Value()], src1_[(int)mad_swizzle.src1_selector_2.Value()], @@ -488,7 +484,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData src1[2] = -src1[2]; src1[3] = -src1[3]; } - float24 src2[4] = { + f24 src2[4] = { src2_[(int)mad_swizzle.src2_selector_0.Value()], src2_[(int)mad_swizzle.src2_selector_1.Value()], src2_[(int)mad_swizzle.src2_selector_2.Value()], @@ -500,7 +496,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData src2[2] = -src2[2]; src2[3] = -src2[3]; } - float24 src3[4] = { + f24 src3[4] = { src3_[(int)mad_swizzle.src3_selector_0.Value()], src3_[(int)mad_swizzle.src3_selector_1.Value()], src3_[(int)mad_swizzle.src3_selector_2.Value()], @@ -513,12 +509,11 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData src3[3] = -src3[3]; } - float24* dest = - (instr.mad.dest.Value() < 0x10) - ? &state.registers.output[instr.mad.dest.Value().GetIndex()][0] - : (instr.mad.dest.Value() < 0x20) - ? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0] - : dummy_vec4_float24; + f24* dest = (instr.mad.dest.Value() < 0x10) + ? &state.registers.output[instr.mad.dest.Value().GetIndex()][0] + : (instr.mad.dest.Value() < 0x20) + ? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0] + : dummy_vec4_float24; Record(debug_data, iteration, src1); Record(debug_data, iteration, src2); @@ -687,7 +682,7 @@ DebugData InterpreterEngine::ProduceDebugInfo(const ShaderSetup& setup, DebugData debug_data; // Setup input register table - state.registers.input.fill(Common::Vec4::AssignToAll(float24::Zero())); + state.registers.input.fill(Common::Vec4::AssignToAll(f24::Zero())); state.LoadInput(config, input); RunInterpreter(setup, state, debug_data, setup.engine_data.entry_point); return debug_data; diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp index 513e128cb..d97244503 100644 --- a/src/video_core/shader/shader_jit_x64.cpp +++ b/src/video_core/shader/shader_jit_x64.cpp @@ -5,6 +5,7 @@ #include "common/arch.h" #if CITRA_ARCH(x86_64) +#include "common/assert.h" #include "common/microprofile.h" #include "video_core/shader/shader.h" #include "video_core/shader/shader_jit_x64.h" diff --git a/src/video_core/shader/shader_jit_x64_compiler.cpp b/src/video_core/shader/shader_jit_x64_compiler.cpp index 0e87ee20b..00cfde0c2 100644 --- a/src/video_core/shader/shader_jit_x64_compiler.cpp +++ b/src/video_core/shader/shader_jit_x64_compiler.cpp @@ -813,7 +813,7 @@ void JitShader::Compile_JMP(Instruction instr) { } } -static void Emit(GSEmitter* emitter, Common::Vec4 (*output)[16]) { +static void Emit(GSEmitter* emitter, Common::Vec4 (*output)[16]) { emitter->Emit(*output); } diff --git a/src/video_core/vertex_loader.cpp b/src/video_core/vertex_loader.cpp index 1e9aa4d8e..f0a2501ff 100644 --- a/src/video_core/vertex_loader.cpp +++ b/src/video_core/vertex_loader.cpp @@ -98,7 +98,7 @@ void VertexLoader::LoadVertex(u32 base_address, int index, int vertex, const s8* srcdata = reinterpret_cast( VideoCore::g_memory->GetPhysicalPointer(source_addr)); for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) { - input.attr[i][comp] = float24::FromFloat32(srcdata[comp]); + input.attr[i][comp] = f24::FromFloat32(srcdata[comp]); } break; } @@ -106,7 +106,7 @@ void VertexLoader::LoadVertex(u32 base_address, int index, int vertex, const u8* srcdata = reinterpret_cast( VideoCore::g_memory->GetPhysicalPointer(source_addr)); for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) { - input.attr[i][comp] = float24::FromFloat32(srcdata[comp]); + input.attr[i][comp] = f24::FromFloat32(srcdata[comp]); } break; } @@ -114,7 +114,7 @@ void VertexLoader::LoadVertex(u32 base_address, int index, int vertex, const s16* srcdata = reinterpret_cast( VideoCore::g_memory->GetPhysicalPointer(source_addr)); for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) { - input.attr[i][comp] = float24::FromFloat32(srcdata[comp]); + input.attr[i][comp] = f24::FromFloat32(srcdata[comp]); } break; } @@ -122,7 +122,7 @@ void VertexLoader::LoadVertex(u32 base_address, int index, int vertex, const float* srcdata = reinterpret_cast( VideoCore::g_memory->GetPhysicalPointer(source_addr)); for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) { - input.attr[i][comp] = float24::FromFloat32(srcdata[comp]); + input.attr[i][comp] = f24::FromFloat32(srcdata[comp]); } break; } @@ -132,8 +132,7 @@ void VertexLoader::LoadVertex(u32 base_address, int index, int vertex, // is *not* carried over from the default attribute settings even if they're // enabled for this attribute. for (unsigned int comp = vertex_attribute_elements[i]; comp < 4; ++comp) { - input.attr[i][comp] = - comp == 3 ? float24::FromFloat32(1.0f) : float24::FromFloat32(0.0f); + input.attr[i][comp] = comp == 3 ? f24::One() : f24::Zero(); } LOG_TRACE(HW_GPU, diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp index b38ca9d55..baf18f2e3 100644 --- a/src/video_core/video_core.cpp +++ b/src/video_core/video_core.cpp @@ -40,7 +40,7 @@ void Init(Frontend::EmuWindow& emu_window, Frontend::EmuWindow* secondary_window switch (graphics_api) { case Settings::GraphicsAPI::Software: - g_renderer = std::make_unique(system, emu_window); + g_renderer = std::make_unique(system, emu_window); break; case Settings::GraphicsAPI::OpenGL: g_renderer = std::make_unique(system, emu_window, secondary_window);