diff --git a/src/core/hle/kernel/vm_manager.cpp b/src/core/hle/kernel/vm_manager.cpp index 7a007c065..2d6af0cbb 100644 --- a/src/core/hle/kernel/vm_manager.cpp +++ b/src/core/hle/kernel/vm_manager.cpp @@ -58,7 +58,6 @@ void VMManager::Reset() { page_table.pointers.fill(nullptr); page_table.attributes.fill(Memory::PageType::Unmapped); - page_table.cached_res_count.fill(0); UpdatePageTableForVMA(initial_vma); } diff --git a/src/core/hle/service/gsp_gpu.cpp b/src/core/hle/service/gsp_gpu.cpp index 88684b82d..be95718e9 100644 --- a/src/core/hle/service/gsp_gpu.cpp +++ b/src/core/hle/service/gsp_gpu.cpp @@ -476,10 +476,11 @@ static void ExecuteCommand(const Command& command, u32 thread_id) { // TODO: Consider attempting rasterizer-accelerated surface blit if that usage is ever // possible/likely Memory::RasterizerFlushVirtualRegion(command.dma_request.source_address, - command.dma_request.size, Memory::FlushMode::Flush); + command.dma_request.size, + Memory::FlushMode::Flush); Memory::RasterizerFlushVirtualRegion(command.dma_request.dest_address, command.dma_request.size, - Memory::FlushMode::FlushAndInvalidate); + Memory::FlushMode::Invalidate); // TODO(Subv): These memory accesses should not go through the application's memory mapping. // They should go through the GSP module's memory mapping. diff --git a/src/core/hw/gpu.cpp b/src/core/hw/gpu.cpp index 83ad9d898..9a6458d8e 100644 --- a/src/core/hw/gpu.cpp +++ b/src/core/hw/gpu.cpp @@ -96,20 +96,11 @@ static void MemoryFill(const Regs::MemoryFillConfig& config) { u8* start = Memory::GetPhysicalPointer(start_addr); u8* end = Memory::GetPhysicalPointer(end_addr); - // TODO: Consider always accelerating and returning vector of - // regions that the accelerated fill did not cover to - // reduce/eliminate the fill that the cpu has to do. - // This would also mean that the flush below is not needed. - // Fill should first flush all surfaces that touch but are - // not completely within the fill range. 
- // Then fill all completely covered surfaces, and return the - // regions that were between surfaces or within the touching - // ones for cpu to manually fill here. if (VideoCore::g_renderer->Rasterizer()->AccelerateFill(config)) return; - Memory::RasterizerFlushAndInvalidateRegion(config.GetStartAddress(), - config.GetEndAddress() - config.GetStartAddress()); + Memory::RasterizerInvalidateRegion(config.GetStartAddress(), + config.GetEndAddress() - config.GetStartAddress()); if (config.fill_24bit) { // fill with 24-bit values @@ -199,7 +190,7 @@ static void DisplayTransfer(const Regs::DisplayTransferConfig& config) { u32 output_size = output_width * output_height * GPU::Regs::BytesPerPixel(config.output_format); Memory::RasterizerFlushRegion(config.GetPhysicalInputAddress(), input_size); - Memory::RasterizerFlushAndInvalidateRegion(config.GetPhysicalOutputAddress(), output_size); + Memory::RasterizerInvalidateRegion(config.GetPhysicalOutputAddress(), output_size); for (u32 y = 0; y < output_height; ++y) { for (u32 x = 0; x < output_width; ++x) { @@ -363,8 +354,12 @@ static void TextureCopy(const Regs::DisplayTransferConfig& config) { size_t contiguous_output_size = config.texture_copy.size / output_width * (output_width + output_gap); - Memory::RasterizerFlushAndInvalidateRegion(config.GetPhysicalOutputAddress(), - static_cast(contiguous_output_size)); + // Only need to flush output if it has a gap + const auto FlushInvalidate_fn = (output_gap != 0) ? 
+ Memory::RasterizerFlushAndInvalidateRegion : + Memory::RasterizerInvalidateRegion; + FlushInvalidate_fn(config.GetPhysicalOutputAddress(), + static_cast(contiguous_output_size)); u32 remaining_input = input_width; u32 remaining_output = output_width; diff --git a/src/core/memory.cpp b/src/core/memory.cpp index 9b394f84b..dde010d4d 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -50,7 +50,6 @@ static void MapPages(PageTable& page_table, u32 base, u32 size, u8* memory, Page page_table.attributes[base] = type; page_table.pointers[base] = memory; - page_table.cached_res_count[base] = 0; base += 1; if (memory != nullptr) @@ -187,7 +186,7 @@ void Write(const VAddr vaddr, const T data) { ASSERT_MSG(false, "Mapped memory page without a pointer @ %08X", vaddr); break; case PageType::RasterizerCachedMemory: { - RasterizerFlushVirtualRegion(vaddr, sizeof(T), FlushMode::FlushAndInvalidate); + RasterizerFlushVirtualRegion(vaddr, sizeof(T), FlushMode::Invalidate); std::memcpy(GetPointerFromVMA(vaddr), &data, sizeof(T)); break; } @@ -195,7 +194,7 @@ void Write(const VAddr vaddr, const T data) { WriteMMIO(GetMMIOHandler(vaddr), vaddr, data); break; case PageType::RasterizerCachedSpecial: { - RasterizerFlushVirtualRegion(vaddr, sizeof(T), FlushMode::FlushAndInvalidate); + RasterizerFlushVirtualRegion(vaddr, sizeof(T), FlushMode::Invalidate); WriteMMIO(GetMMIOHandler(vaddr), vaddr, data); break; } @@ -315,7 +314,7 @@ u8* GetPhysicalPointer(PAddr address) { return target_pointer; } -void RasterizerMarkRegionCached(PAddr start, u32 size, int count_delta) { +void RasterizerMarkRegionCached(PAddr start, u32 size, bool cached) { if (start == 0) { return; } @@ -336,14 +335,10 @@ void RasterizerMarkRegionCached(PAddr start, u32 size, int count_delta) { } VAddr vaddr = *maybe_vaddr; - u8& res_count = current_page_table->cached_res_count[vaddr >> PAGE_BITS]; - ASSERT_MSG(count_delta <= UINT8_MAX - res_count, - "Rasterizer resource cache counter overflow!"); - 
ASSERT_MSG(count_delta >= -res_count, "Rasterizer resource cache counter underflow!"); + PageType& page_type = current_page_table->attributes[vaddr >> PAGE_BITS]; - // Switch page type to cached if now cached - if (res_count == 0) { - PageType& page_type = current_page_table->attributes[vaddr >> PAGE_BITS]; + if (cached) { + // Switch page type to cached switch (page_type) { case PageType::Unmapped: // It is not necessary for a process to have this region mapped into its address @@ -360,12 +355,8 @@ void RasterizerMarkRegionCached(PAddr start, u32 size, int count_delta) { UNREACHABLE(); } } - - res_count += count_delta; - - // Switch page type to uncached if now uncached - if (res_count == 0) { - PageType& page_type = current_page_table->attributes[vaddr >> PAGE_BITS]; + else { + // Switch page type to uncached switch (page_type) { case PageType::Unmapped: // It is not necessary for a process to have this region mapped into its address @@ -400,6 +391,12 @@ void RasterizerFlushRegion(PAddr start, u32 size) { } } +void RasterizerInvalidateRegion(PAddr start, u32 size) { + if (VideoCore::g_renderer != nullptr) { + VideoCore::g_renderer->Rasterizer()->InvalidateRegion(start, size); + } +} + void RasterizerFlushAndInvalidateRegion(PAddr start, u32 size) { // Since pages are unmapped on shutdown after video core is shutdown, the renderer may be // null here @@ -431,6 +428,9 @@ void RasterizerFlushVirtualRegion(VAddr start, u32 size, FlushMode mode) { case FlushMode::Flush: rasterizer->FlushRegion(physical_start, overlap_size); break; + case FlushMode::Invalidate: + rasterizer->InvalidateRegion(physical_start, overlap_size); + break; case FlushMode::FlushAndInvalidate: rasterizer->FlushAndInvalidateRegion(physical_start, overlap_size); break; @@ -556,13 +556,13 @@ void WriteBlock(const VAddr dest_addr, const void* src_buffer, const size_t size break; } case PageType::RasterizerCachedMemory: { - RasterizerFlushVirtualRegion(current_vaddr, copy_amount, 
FlushMode::FlushAndInvalidate); + RasterizerFlushVirtualRegion(current_vaddr, copy_amount, FlushMode::Invalidate); std::memcpy(GetPointerFromVMA(current_vaddr), src_buffer, copy_amount); break; } case PageType::RasterizerCachedSpecial: { DEBUG_ASSERT(GetMMIOHandler(current_vaddr)); - RasterizerFlushVirtualRegion(current_vaddr, copy_amount, FlushMode::FlushAndInvalidate); + RasterizerFlushVirtualRegion(current_vaddr, copy_amount, FlushMode::Invalidate); GetMMIOHandler(current_vaddr)->WriteBlock(current_vaddr, src_buffer, copy_amount); break; } @@ -608,13 +608,13 @@ void ZeroBlock(const VAddr dest_addr, const size_t size) { break; } case PageType::RasterizerCachedMemory: { - RasterizerFlushVirtualRegion(current_vaddr, copy_amount, FlushMode::FlushAndInvalidate); + RasterizerFlushVirtualRegion(current_vaddr, copy_amount, FlushMode::Invalidate); std::memset(GetPointerFromVMA(current_vaddr), 0, copy_amount); break; } case PageType::RasterizerCachedSpecial: { DEBUG_ASSERT(GetMMIOHandler(current_vaddr)); - RasterizerFlushVirtualRegion(current_vaddr, copy_amount, FlushMode::FlushAndInvalidate); + RasterizerFlushVirtualRegion(current_vaddr, copy_amount, FlushMode::Invalidate); GetMMIOHandler(current_vaddr)->WriteBlock(current_vaddr, zeros.data(), copy_amount); break; } diff --git a/src/core/memory.h b/src/core/memory.h index 1865bfea0..052935f74 100644 --- a/src/core/memory.h +++ b/src/core/memory.h @@ -68,12 +68,6 @@ struct PageTable { * the corresponding entry in `pointers` MUST be set to null. */ std::array attributes; - - /** - * Indicates the number of externally cached resources touching a page that should be - * flushed before the memory is accessed - */ - std::array cached_res_count; }; /// Physical memory regions as seen from the ARM11 @@ -232,16 +226,20 @@ boost::optional PhysicalToVirtualAddress(PAddr addr); u8* GetPhysicalPointer(PAddr address); /** - * Adds the supplied value to the rasterizer resource cache counter of each - * page touching the region. 
+ * Mark each page touching the region as cached. */ -void RasterizerMarkRegionCached(PAddr start, u32 size, int count_delta); +void RasterizerMarkRegionCached(PAddr start, u32 size, bool cached); /** - * Flushes any externally cached rasterizer resources touching the given region. - */ +* Flushes any externally cached rasterizer resources touching the given region. +*/ void RasterizerFlushRegion(PAddr start, u32 size); +/** +* Invalidates any externally cached rasterizer resources touching the given region. +*/ +void RasterizerInvalidateRegion(PAddr start, u32 size); + /** * Flushes and invalidates any externally cached rasterizer resources touching the given region. */ @@ -250,6 +248,8 @@ void RasterizerFlushAndInvalidateRegion(PAddr start, u32 size); enum class FlushMode { /// Write back modified surfaces to RAM Flush, + /// Remove region from the cache + Invalidate, /// Write back modified surfaces to RAM, and also remove them from the cache FlushAndInvalidate, }; diff --git a/src/tests/core/arm/arm_test_common.cpp b/src/tests/core/arm/arm_test_common.cpp index cfe0d503a..a43cf892f 100644 --- a/src/tests/core/arm/arm_test_common.cpp +++ b/src/tests/core/arm/arm_test_common.cpp @@ -16,7 +16,6 @@ TestEnvironment::TestEnvironment(bool mutable_memory_) page_table.pointers.fill(nullptr); page_table.attributes.fill(Memory::PageType::Unmapped); - page_table.cached_res_count.fill(0); Memory::MapIoRegion(page_table, 0x00000000, 0x80000000, test_memory); Memory::MapIoRegion(page_table, 0x80000000, 0x80000000, test_memory); diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index 8ef7e74c7..1d4c98189 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -38,6 +38,9 @@ public: /// Notify rasterizer that any caches of the specified region should be flushed to 3DS memory virtual void FlushRegion(PAddr addr, u32 size) = 0; + /// Notify rasterizer that any caches of the specified region should 
be invalidated + virtual void InvalidateRegion(PAddr addr, u32 size) = 0; + /// Notify rasterizer that any caches of the specified region should be flushed to 3DS memory /// and invalidated virtual void FlushAndInvalidateRegion(PAddr addr, u32 size) = 0; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 7e09e4712..b1adc156e 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -8,7 +8,6 @@ #include #include #include "common/assert.h" -#include "common/color.h" #include "common/logging/log.h" #include "common/math_util.h" #include "common/microprofile.h" @@ -23,6 +22,9 @@ #include "video_core/renderer_opengl/pica_to_gl.h" #include "video_core/renderer_opengl/renderer_opengl.h" +using PixelFormat = SurfaceParams::PixelFormat; +using SurfaceType = SurfaceParams::SurfaceType; + MICROPROFILE_DEFINE(OpenGL_Drawing, "OpenGL", "Drawing", MP_RGB(128, 128, 192)); MICROPROFILE_DEFINE(OpenGL_Blits, "OpenGL", "Blits", MP_RGB(100, 100, 255)); MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Mgmt", MP_RGB(100, 255, 100)); @@ -225,12 +227,27 @@ void RasterizerOpenGL::DrawTriangles() { MICROPROFILE_SCOPE(OpenGL_Drawing); const auto& regs = Pica::g_state.regs; + const bool has_stencil = regs.framebuffer.framebuffer.depth_format == Pica::FramebufferRegs::DepthFormat::D24S8; + + const bool write_color_fb = state.color_mask.red_enabled == GL_TRUE || + state.color_mask.green_enabled == GL_TRUE || + state.color_mask.blue_enabled == GL_TRUE || + state.color_mask.alpha_enabled == GL_TRUE; + + const bool write_depth_fb = state.depth.write_mask == GL_TRUE || + (has_stencil && state.stencil.write_mask != 0); + + const bool using_color_fb = regs.framebuffer.framebuffer.GetColorBufferPhysicalAddress() != 0 && + write_color_fb; + const bool using_depth_fb = regs.framebuffer.framebuffer.GetDepthBufferPhysicalAddress() != 0 && + 
(state.depth.test_enabled || write_depth_fb); + // Sync and bind the framebuffer surfaces - CachedSurface* color_surface; - CachedSurface* depth_surface; + Surface color_surface; + Surface depth_surface; MathUtil::Rectangle rect; std::tie(color_surface, depth_surface, rect) = - res_cache.GetFramebufferSurfaces(regs.framebuffer.framebuffer); + res_cache.GetFramebufferSurfaces(using_color_fb, using_depth_fb); state.draw.draw_framebuffer = framebuffer.handle; state.Apply(); @@ -238,8 +255,7 @@ void RasterizerOpenGL::DrawTriangles() { glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, color_surface != nullptr ? color_surface->texture.handle : 0, 0); if (depth_surface != nullptr) { - if (regs.framebuffer.framebuffer.depth_format == - Pica::FramebufferRegs::DepthFormat::D24S8) { + if (has_stencil) { // attach both depth and stencil glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, depth_surface->texture.handle, 0); @@ -258,37 +274,42 @@ void RasterizerOpenGL::DrawTriangles() { // Sync the viewport // These registers hold half-width and half-height, so must be multiplied by 2 - GLsizei viewport_width = - (GLsizei)Pica::float24::FromRaw(regs.rasterizer.viewport_size_x).ToFloat32() * 2; - GLsizei viewport_height = - (GLsizei)Pica::float24::FromRaw(regs.rasterizer.viewport_size_y).ToFloat32() * 2; + const GLsizei viewport_width = + static_cast(Pica::float24::FromRaw(regs.rasterizer.viewport_size_x).ToFloat32() * 2); + const GLsizei viewport_height = + static_cast(Pica::float24::FromRaw(regs.rasterizer.viewport_size_y).ToFloat32() * 2); + + const float res_scale_width = color_surface != nullptr ? color_surface->res_scale_width : + (depth_surface == nullptr ? 1.0f : depth_surface->res_scale_width); + const float res_scale_height = color_surface != nullptr ? color_surface->res_scale_height : + (depth_surface == nullptr ? 
1.0f : depth_surface->res_scale_height); glViewport( - (GLint)(rect.left + regs.rasterizer.viewport_corner.x * color_surface->res_scale_width), - (GLint)(rect.bottom + regs.rasterizer.viewport_corner.y * color_surface->res_scale_height), - (GLsizei)(viewport_width * color_surface->res_scale_width), - (GLsizei)(viewport_height * color_surface->res_scale_height)); + static_cast(rect.left + regs.rasterizer.viewport_corner.x * res_scale_width), + static_cast(rect.bottom + regs.rasterizer.viewport_corner.y * res_scale_height), + static_cast(viewport_width * res_scale_width), + static_cast(viewport_height * res_scale_height)); - if (uniform_block_data.data.framebuffer_scale[0] != color_surface->res_scale_width || - uniform_block_data.data.framebuffer_scale[1] != color_surface->res_scale_height) { + if (uniform_block_data.data.framebuffer_scale[0] != res_scale_width || + uniform_block_data.data.framebuffer_scale[1] != res_scale_height) { - uniform_block_data.data.framebuffer_scale[0] = color_surface->res_scale_width; - uniform_block_data.data.framebuffer_scale[1] = color_surface->res_scale_height; + uniform_block_data.data.framebuffer_scale[0] = res_scale_width; + uniform_block_data.data.framebuffer_scale[1] = res_scale_height; uniform_block_data.dirty = true; } // Scissor checks are window-, not viewport-relative, which means that if the cached texture // sub-rect changes, the scissor bounds also need to be updated. GLint scissor_x1 = static_cast( - rect.left + regs.rasterizer.scissor_test.x1 * color_surface->res_scale_width); + rect.left + regs.rasterizer.scissor_test.x1 * res_scale_width); GLint scissor_y1 = static_cast( - rect.bottom + regs.rasterizer.scissor_test.y1 * color_surface->res_scale_height); + rect.bottom + regs.rasterizer.scissor_test.y1 * res_scale_height); // x2, y2 have +1 added to cover the entire pixel area, otherwise you might get cracks when // scaling or doing multisampling. 
GLint scissor_x2 = static_cast( - rect.left + (regs.rasterizer.scissor_test.x2 + 1) * color_surface->res_scale_width); + rect.left + (regs.rasterizer.scissor_test.x2 + 1) * res_scale_width); GLint scissor_y2 = static_cast( - rect.bottom + (regs.rasterizer.scissor_test.y2 + 1) * color_surface->res_scale_height); + rect.bottom + (regs.rasterizer.scissor_test.y2 + 1) * res_scale_height); if (uniform_block_data.data.scissor_x1 != scissor_x1 || uniform_block_data.data.scissor_x2 != scissor_x2 || @@ -309,7 +330,7 @@ void RasterizerOpenGL::DrawTriangles() { if (texture.enabled) { texture_samplers[texture_index].SyncWithConfig(texture.config); - CachedSurface* surface = res_cache.GetTextureSurface(texture); + Surface surface = res_cache.GetTextureSurface(texture); if (surface != nullptr) { state.texture_units[texture_index].texture_2d = surface->texture.handle; } else { @@ -386,14 +407,27 @@ void RasterizerOpenGL::DrawTriangles() { glDrawArrays(GL_TRIANGLES, 0, (GLsizei)vertex_batch.size()); // Mark framebuffer surfaces as dirty - // TODO: Restrict invalidation area to the viewport - if (color_surface != nullptr) { - color_surface->dirty = true; - res_cache.FlushRegion(color_surface->addr, color_surface->size, color_surface, true); + const u32 viewport_offset = + ((regs.framebuffer.framebuffer.GetHeight() - regs.rasterizer.viewport_corner.y - viewport_height) + * regs.framebuffer.framebuffer.GetWidth()) + + regs.rasterizer.viewport_corner.x; + + const u32 viewport_size = ((viewport_height - 1) * regs.framebuffer.framebuffer.GetWidth()) + + viewport_width; + + if (color_surface != nullptr && write_color_fb) { + res_cache.InvalidateRegion( + regs.framebuffer.framebuffer.GetColorBufferPhysicalAddress() + + (viewport_offset * color_surface->bytes_per_pixel), + viewport_size * color_surface->bytes_per_pixel, + color_surface); } - if (depth_surface != nullptr) { - depth_surface->dirty = true; - res_cache.FlushRegion(depth_surface->addr, depth_surface->size, depth_surface, true); 
+ if (depth_surface != nullptr && write_depth_fb) { + res_cache.InvalidateRegion( + regs.framebuffer.framebuffer.GetDepthBufferPhysicalAddress() + + (viewport_offset * depth_surface->bytes_per_pixel), + viewport_size * depth_surface->bytes_per_pixel, + depth_surface); } vertex_batch.clear(); @@ -891,227 +925,119 @@ void RasterizerOpenGL::FlushAll() { void RasterizerOpenGL::FlushRegion(PAddr addr, u32 size) { MICROPROFILE_SCOPE(OpenGL_CacheManagement); - res_cache.FlushRegion(addr, size, nullptr, false); + res_cache.FlushRegion(addr, size); +} + +void RasterizerOpenGL::InvalidateRegion(PAddr addr, u32 size) { + MICROPROFILE_SCOPE(OpenGL_CacheManagement); + res_cache.InvalidateRegion(addr, size, nullptr); } void RasterizerOpenGL::FlushAndInvalidateRegion(PAddr addr, u32 size) { MICROPROFILE_SCOPE(OpenGL_CacheManagement); - res_cache.FlushRegion(addr, size, nullptr, true); + res_cache.FlushRegion(addr, size); + res_cache.InvalidateRegion(addr, size, nullptr); } bool RasterizerOpenGL::AccelerateDisplayTransfer(const GPU::Regs::DisplayTransferConfig& config) { MICROPROFILE_SCOPE(OpenGL_Blits); - CachedSurface src_params; + SurfaceParams src_params; src_params.addr = config.GetPhysicalInputAddress(); - // It's important to use the correct source input width to properly skip over parts of the input - // image which will be cropped from the output but still affect the stride of the input image. - src_params.width = config.input_width; - // Using the output's height is fine because we don't read or skip over the remaining part of - // the image, and it allows for smaller texture cache lookup rectangles. 
+ src_params.width = config.output_width; + src_params.stride = config.input_width; src_params.height = config.output_height; src_params.is_tiled = !config.input_linear; - src_params.pixel_format = CachedSurface::PixelFormatFromGPUPixelFormat(config.input_format); + src_params.pixel_format = SurfaceParams::PixelFormatFromGPUPixelFormat(config.input_format); + src_params.UpdateParams(); - CachedSurface dst_params; + SurfaceParams dst_params; dst_params.addr = config.GetPhysicalOutputAddress(); - dst_params.width = - config.scaling != config.NoScale ? config.output_width / 2 : config.output_width.Value(); - dst_params.height = - config.scaling == config.ScaleXY ? config.output_height / 2 : config.output_height.Value(); + dst_params.width = config.scaling != config.NoScale ? config.output_width.Value() / 2 : config.output_width.Value(); + dst_params.height = config.scaling == config.ScaleXY ? config.output_height.Value() / 2 : config.output_height.Value(); dst_params.is_tiled = config.input_linear != config.dont_swizzle; - dst_params.pixel_format = CachedSurface::PixelFormatFromGPUPixelFormat(config.output_format); + dst_params.pixel_format = SurfaceParams::PixelFormatFromGPUPixelFormat(config.output_format); + dst_params.UpdateParams(); MathUtil::Rectangle src_rect; - CachedSurface* src_surface = res_cache.GetSurfaceRect(src_params, false, true, src_rect); - - if (src_surface == nullptr) { + Surface src_surface; + std::tie(src_surface, src_rect) = res_cache.GetSurfaceSubRect(src_params, false, true); + if (src_surface == nullptr) return false; - } - - // Adjust the source rectangle to take into account parts of the input lines being cropped - if (config.input_width > config.output_width) { - src_rect.right -= static_cast((config.input_width - config.output_width) * - src_surface->res_scale_width); - } - - // Require destination surface to have same resolution scale as source to preserve scaling - dst_params.res_scale_width = src_surface->res_scale_width; - 
dst_params.res_scale_height = src_surface->res_scale_height; MathUtil::Rectangle dst_rect; - CachedSurface* dst_surface = res_cache.GetSurfaceRect(dst_params, true, false, dst_rect); - - if (dst_surface == nullptr) { + Surface dst_surface; + std::tie(dst_surface, dst_rect) = res_cache.GetSurfaceSubRect(dst_params, false, false); + if (dst_surface == nullptr) return false; - } - // Don't accelerate if the src and dst surfaces are the same - if (src_surface == dst_surface) { + if (config.flip_vertically) + std::swap(src_rect.top, src_rect.bottom); + + if (!res_cache.BlitSurfaces(src_surface, src_rect, dst_surface, dst_rect)) return false; - } - if (config.flip_vertically) { - std::swap(dst_rect.top, dst_rect.bottom); - } - - if (!res_cache.TryBlitSurfaces(src_surface, src_rect, dst_surface, dst_rect)) { - return false; - } - - u32 dst_size = dst_params.width * dst_params.height * - CachedSurface::GetFormatBpp(dst_params.pixel_format) / 8; - dst_surface->dirty = true; - res_cache.FlushRegion(config.GetPhysicalOutputAddress(), dst_size, dst_surface, true); + res_cache.InvalidateRegion(dst_params.addr, dst_params.size, dst_surface); return true; } bool RasterizerOpenGL::AccelerateTextureCopy(const GPU::Regs::DisplayTransferConfig& config) { - // TODO(tfarley): Try to hardware accelerate this - return false; + const u32 input_width = config.texture_copy.input_width * 16; + const u32 input_gap = config.texture_copy.input_gap * 16; + const u32 output_width = config.texture_copy.output_width * 16; + const u32 output_gap = config.texture_copy.output_gap * 16; + + if (config.texture_copy.size == 0) + return true; + + if (input_width != output_width || config.texture_copy.size % input_width != 0) + return false; + + SurfaceParams src_params; + src_params.addr = config.GetPhysicalInputAddress(); + src_params.stride = input_width + input_gap; // stride in bytes + src_params.width = input_width; // width in bytes + src_params.height = config.texture_copy.size / input_width; + 
src_params.size = ((src_params.height - 1) * src_params.stride) + src_params.width; + src_params.end = src_params.addr + src_params.size; + + MathUtil::Rectangle src_rect; + Surface src_surface; + std::tie(src_surface, src_rect) = res_cache.GetTexCopySurface(src_params); + if (src_surface == nullptr) + return false; + + if ((output_gap * 8) % SurfaceParams::GetFormatBpp(src_surface->pixel_format) != 0 || + (src_surface->is_tiled && src_surface->PixelsInBytes(output_gap) % 64 != 0)) + return false; + + SurfaceParams dst_params = *src_surface; + dst_params.addr = config.GetPhysicalOutputAddress(); + dst_params.stride = (output_width + output_gap) * src_surface->stride / src_params.stride; + dst_params.width = output_width * src_surface->stride / src_params.stride; + dst_params.height = src_surface->is_tiled ? src_params.height * 8 : src_params.height; + dst_params.UpdateParams(); + + const bool load_gap = output_gap != 0; // Since we are going to invalidate the gap if there is one, we will have to load it first + MathUtil::Rectangle dst_rect; + Surface dst_surface; + std::tie(dst_surface, dst_rect) = res_cache.GetSurfaceSubRect(dst_params, false, load_gap); + if (dst_surface == nullptr) /* validate the destination lookup; src_surface was already checked above, so re-testing it here let a null dst_surface reach BlitSurfaces/InvalidateRegion */ + return false; + + if (!res_cache.BlitSurfaces(src_surface, src_rect, dst_surface, dst_rect)) + return false; + + res_cache.InvalidateRegion(dst_params.addr, dst_params.size, dst_surface); + return true; } bool RasterizerOpenGL::AccelerateFill(const GPU::Regs::MemoryFillConfig& config) { - MICROPROFILE_SCOPE(OpenGL_Blits); - using PixelFormat = CachedSurface::PixelFormat; - using SurfaceType = CachedSurface::SurfaceType; - - CachedSurface* dst_surface = res_cache.TryGetFillSurface(config); - - if (dst_surface == nullptr) { + Surface dst_surface = res_cache.GetFillSurface(config); + if (dst_surface == nullptr) return false; - } - OpenGLState cur_state = OpenGLState::GetCurState(); - - SurfaceType dst_type = CachedSurface::GetFormatType(dst_surface->pixel_format); - - GLuint old_fb =
cur_state.draw.draw_framebuffer; - cur_state.draw.draw_framebuffer = framebuffer.handle; - // TODO: When scissor test is implemented, need to disable scissor test in cur_state here so - // Clear call isn't affected - cur_state.Apply(); - - if (dst_type == SurfaceType::Color || dst_type == SurfaceType::Texture) { - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, - dst_surface->texture.handle, 0); - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, - 0); - - GLfloat color_values[4] = {0.0f, 0.0f, 0.0f, 0.0f}; - - // TODO: Handle additional pixel format and fill value size combinations to accelerate more - // cases - // For instance, checking if fill value's bytes/bits repeat to allow filling - // I8/A8/I4/A4/... - // Currently only handles formats that are multiples of the fill value size - - if (config.fill_24bit) { - switch (dst_surface->pixel_format) { - case PixelFormat::RGB8: - color_values[0] = config.value_24bit_r / 255.0f; - color_values[1] = config.value_24bit_g / 255.0f; - color_values[2] = config.value_24bit_b / 255.0f; - break; - default: - return false; - } - } else if (config.fill_32bit) { - u32 value = config.value_32bit; - - switch (dst_surface->pixel_format) { - case PixelFormat::RGBA8: - color_values[0] = (value >> 24) / 255.0f; - color_values[1] = ((value >> 16) & 0xFF) / 255.0f; - color_values[2] = ((value >> 8) & 0xFF) / 255.0f; - color_values[3] = (value & 0xFF) / 255.0f; - break; - default: - return false; - } - } else { - u16 value_16bit = config.value_16bit.Value(); - Math::Vec4 color; - - switch (dst_surface->pixel_format) { - case PixelFormat::RGBA8: - color_values[0] = (value_16bit >> 8) / 255.0f; - color_values[1] = (value_16bit & 0xFF) / 255.0f; - color_values[2] = color_values[0]; - color_values[3] = color_values[1]; - break; - case PixelFormat::RGB5A1: - color = Color::DecodeRGB5A1((const u8*)&value_16bit); - color_values[0] = color[0] / 31.0f; - color_values[1] 
= color[1] / 31.0f; - color_values[2] = color[2] / 31.0f; - color_values[3] = color[3]; - break; - case PixelFormat::RGB565: - color = Color::DecodeRGB565((const u8*)&value_16bit); - color_values[0] = color[0] / 31.0f; - color_values[1] = color[1] / 63.0f; - color_values[2] = color[2] / 31.0f; - break; - case PixelFormat::RGBA4: - color = Color::DecodeRGBA4((const u8*)&value_16bit); - color_values[0] = color[0] / 15.0f; - color_values[1] = color[1] / 15.0f; - color_values[2] = color[2] / 15.0f; - color_values[3] = color[3] / 15.0f; - break; - case PixelFormat::IA8: - case PixelFormat::RG8: - color_values[0] = (value_16bit >> 8) / 255.0f; - color_values[1] = (value_16bit & 0xFF) / 255.0f; - break; - default: - return false; - } - } - - cur_state.color_mask.red_enabled = GL_TRUE; - cur_state.color_mask.green_enabled = GL_TRUE; - cur_state.color_mask.blue_enabled = GL_TRUE; - cur_state.color_mask.alpha_enabled = GL_TRUE; - cur_state.Apply(); - glClearBufferfv(GL_COLOR, 0, color_values); - } else if (dst_type == SurfaceType::Depth) { - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0); - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, - dst_surface->texture.handle, 0); - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0); - - GLfloat value_float; - if (dst_surface->pixel_format == CachedSurface::PixelFormat::D16) { - value_float = config.value_32bit / 65535.0f; // 2^16 - 1 - } else if (dst_surface->pixel_format == CachedSurface::PixelFormat::D24) { - value_float = config.value_32bit / 16777215.0f; // 2^24 - 1 - } - - cur_state.depth.write_mask = GL_TRUE; - cur_state.Apply(); - glClearBufferfv(GL_DEPTH, 0, &value_float); - } else if (dst_type == SurfaceType::DepthStencil) { - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0); - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, - 
dst_surface->texture.handle, 0); - - GLfloat value_float = (config.value_32bit & 0xFFFFFF) / 16777215.0f; // 2^24 - 1 - GLint value_int = (config.value_32bit >> 24); - - cur_state.depth.write_mask = GL_TRUE; - cur_state.stencil.write_mask = 0xFF; - cur_state.Apply(); - glClearBufferfi(GL_DEPTH_STENCIL, 0, value_float, value_int); - } - - cur_state.draw.draw_framebuffer = old_fb; - // TODO: Return scissor test to previous value when scissor test is implemented - cur_state.Apply(); - - dst_surface->dirty = true; - res_cache.FlushRegion(dst_surface->addr, dst_surface->size, dst_surface, true); + res_cache.InvalidateRegion(dst_surface->addr, dst_surface->size, dst_surface); return true; } @@ -1123,16 +1049,18 @@ bool RasterizerOpenGL::AccelerateDisplay(const GPU::Regs::FramebufferConfig& con } MICROPROFILE_SCOPE(OpenGL_CacheManagement); - CachedSurface src_params; + SurfaceParams src_params; src_params.addr = framebuffer_addr; - src_params.width = config.width; + src_params.width = std::min(config.width.Value(), pixel_stride); src_params.height = config.height; - src_params.pixel_stride = pixel_stride; + src_params.stride = pixel_stride; src_params.is_tiled = false; - src_params.pixel_format = CachedSurface::PixelFormatFromGPUPixelFormat(config.color_format); + src_params.pixel_format = SurfaceParams::PixelFormatFromGPUPixelFormat(config.color_format); + src_params.UpdateParams(); MathUtil::Rectangle src_rect; - CachedSurface* src_surface = res_cache.GetSurfaceRect(src_params, false, true, src_rect); + Surface src_surface; + std::tie(src_surface, src_rect) = res_cache.GetSurfaceSubRect(src_params, false, true); if (src_surface == nullptr) { return false; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 46c62961c..e83cb48fc 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -43,6 +43,7 @@ public: void NotifyPicaRegisterChanged(u32 id) 
override; void FlushAll() override; void FlushRegion(PAddr addr, u32 size) override; + void InvalidateRegion(PAddr addr, u32 size) override; void FlushAndInvalidateRegion(PAddr addr, u32 size) override; bool AccelerateDisplayTransfer(const GPU::Regs::DisplayTransferConfig& config) override; bool AccelerateTextureCopy(const GPU::Regs::DisplayTransferConfig& config) override; diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp index f37894e7a..87edca9d3 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp @@ -9,11 +9,14 @@ #include #include #include +#include #include #include "common/bit_field.h" +#include "common/color.h" #include "common/logging/log.h" #include "common/math_util.h" #include "common/microprofile.h" +#include "common/scope_exit.h" #include "common/vector_math.h" #include "core/frontend/emu_window.h" #include "core/memory.h" @@ -25,13 +28,18 @@ #include "video_core/utils.h" #include "video_core/video_core.h" +using SurfaceType = SurfaceParams::SurfaceType; +using PixelFormat = SurfaceParams::PixelFormat; + +static std::array transfer_framebuffers; + struct FormatTuple { GLint internal_format; GLenum format; GLenum type; }; -static const std::array fb_format_tuples = {{ +static constexpr std::array fb_format_tuples = {{ {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8}, // RGBA8 {GL_RGB8, GL_BGR, GL_UNSIGNED_BYTE}, // RGB8 {GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_5_5_5_1}, // RGB5A1 @@ -39,86 +47,152 @@ static const std::array fb_format_tuples = {{ {GL_RGBA4, GL_RGBA, GL_UNSIGNED_SHORT_4_4_4_4}, // RGBA4 }}; -static const std::array depth_format_tuples = {{ +static constexpr std::array depth_format_tuples = {{ {GL_DEPTH_COMPONENT16, GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT}, // D16 {}, {GL_DEPTH_COMPONENT24, GL_DEPTH_COMPONENT, GL_UNSIGNED_INT}, // D24 {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, 
// D24S8 }}; -RasterizerCacheOpenGL::RasterizerCacheOpenGL() { - transfer_framebuffers[0].Create(); - transfer_framebuffers[1].Create(); -} +static constexpr FormatTuple tex_tuple = { GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE }; -RasterizerCacheOpenGL::~RasterizerCacheOpenGL() { - FlushAll(); -} - -static void MortonCopyPixels(CachedSurface::PixelFormat pixel_format, u32 width, u32 height, - u32 bytes_per_pixel, u32 gl_bytes_per_pixel, u8* morton_data, - u8* gl_data, bool morton_to_gl) { - using PixelFormat = CachedSurface::PixelFormat; - - u8* data_ptrs[2]; - u32 depth_stencil_shifts[2] = {24, 8}; - - if (morton_to_gl) { - std::swap(depth_stencil_shifts[0], depth_stencil_shifts[1]); +static const FormatTuple& GetFormatTuple(PixelFormat pixel_format) { + const SurfaceType type = SurfaceParams::GetFormatType(pixel_format); + if (type == SurfaceType::Color) { + ASSERT((size_t)pixel_format < fb_format_tuples.size()); + return fb_format_tuples[(unsigned int)pixel_format]; } + else if (type == SurfaceType::Depth || type == SurfaceType::DepthStencil) { + size_t tuple_idx = (size_t)pixel_format - 14; + ASSERT(tuple_idx < depth_format_tuples.size()); + return depth_format_tuples[tuple_idx]; + } + else { + return tex_tuple; + } +} - if (pixel_format == PixelFormat::D24S8) { - for (unsigned y = 0; y < height; ++y) { - for (unsigned x = 0; x < width; ++x) { - const u32 coarse_y = y & ~7; - u32 morton_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + - coarse_y * width * bytes_per_pixel; - u32 gl_pixel_index = (x + (height - 1 - y) * width) * gl_bytes_per_pixel; +template +constexpr auto RangeFromInterval(Map& map, const Interval& interval) { + return boost::make_iterator_range(map.equal_range(interval)); +} - data_ptrs[morton_to_gl] = morton_data + morton_offset; - data_ptrs[!morton_to_gl] = &gl_data[gl_pixel_index]; +enum MortonCopyFlags : int { + MortonToGl = (1 << 0), + CheckRange = (1 << 1), + D24S8Format = (1 << 2), + BytesPerPixelBits = 3, // bits 3-4 + 
GLBytesPerPixelBits = 5, // bits 5-6 + MaxValue = (1 << 7) - 1, +}; +template +static void MortonCopyPixels(u32 width, u32 height, const u8* in_data, u8* out_data, PAddr base, PAddr start, PAddr end) { + constexpr bool check_range = (flags & MortonCopyFlags::CheckRange) ? true : false; + constexpr bool morton_to_gl = (flags & MortonCopyFlags::MortonToGl) ? true : false; + + constexpr bool D24S8format = (flags & MortonCopyFlags::D24S8Format) ? true : false; + + constexpr u32 bytes_per_pixel = u32(((flags) >> MortonCopyFlags::BytesPerPixelBits) & 0x3) + 1; // 2bits, starting with value 1 + constexpr u32 gl_bytes_per_pixel = u32(((flags) >> MortonCopyFlags::GLBytesPerPixelBits) & 0x3) + 1; // 2bits, starting with value 1 + + if (check_range) + ASSERT(start >= base && end <= (base + (width * height * bytes_per_pixel))); + const u32 start_offset = start - base; + const u32 end_offset = end - base; + + for (u32 x = 0; x < width; ++x) { + for (u32 y = 0; y < height; ++y) { + const u32 coarse_x = x & ~7; + const u32 coarse_y = y & ~7; + u32 morton_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * width * bytes_per_pixel; + u32 gl_pixel_index = (x + (height - 1 - y) * width) * gl_bytes_per_pixel; + + if (check_range) { + if (morton_offset >= end_offset && coarse_x == 0 && coarse_y == 0) // Out of range and new tile + return; + if (morton_offset < start_offset || morton_offset >= end_offset) // Out of range + continue; + } + + const size_t copy_bytes = check_range ? std::min(end_offset - morton_offset, bytes_per_pixel) : bytes_per_pixel; + + const u8* const in_ptr = &in_data[morton_to_gl ? morton_offset : gl_pixel_index]; + u8* const out_ptr = &out_data[morton_to_gl ? 
gl_pixel_index : morton_offset]; + + if (D24S8format) { // Swap depth and stencil value ordering since 3DS does not match OpenGL - u32 depth_stencil; - memcpy(&depth_stencil, data_ptrs[1], sizeof(u32)); - depth_stencil = (depth_stencil << depth_stencil_shifts[0]) | - (depth_stencil >> depth_stencil_shifts[1]); - - memcpy(data_ptrs[0], &depth_stencil, sizeof(u32)); + constexpr size_t swap_offset = morton_to_gl ? 3 : 1; + std::array swap_buf; + std::memcpy(&swap_buf[4 - swap_offset], &in_ptr[0], swap_offset); + std::memcpy(&swap_buf[0], &in_ptr[swap_offset], 4 - swap_offset); + std::memcpy(out_ptr, &swap_buf[0], copy_bytes); } - } - } else { - for (unsigned y = 0; y < height; ++y) { - for (unsigned x = 0; x < width; ++x) { - const u32 coarse_y = y & ~7; - u32 morton_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + - coarse_y * width * bytes_per_pixel; - u32 gl_pixel_index = (x + (height - 1 - y) * width) * gl_bytes_per_pixel; - - data_ptrs[morton_to_gl] = morton_data + morton_offset; - data_ptrs[!morton_to_gl] = &gl_data[gl_pixel_index]; - - memcpy(data_ptrs[0], data_ptrs[1], bytes_per_pixel); + else { + std::memcpy(out_ptr, in_ptr, copy_bytes); } } } } -void RasterizerCacheOpenGL::BlitTextures(GLuint src_tex, GLuint dst_tex, - CachedSurface::SurfaceType type, - const MathUtil::Rectangle& src_rect, - const MathUtil::Rectangle& dst_rect) { - using SurfaceType = CachedSurface::SurfaceType; +template +class FunctionTable { +public: + FunctionTable() { + FillArray(); + } + const auto& operator [](size_t pos) const { + return table[pos]; + } +private: + template + void FillArray() { + table[P - 1] = &MortonCopyPixels

; + FillArray

(); + } + template <> + void FillArray<0>() {} + std::array), size> table; +}; +static const FunctionTable MortonCopyFnTable; +// Allocate an uninitialized texture of appropriate size and format for the surface +static void AllocateSurfaceTexture(GLuint texture, const FormatTuple& format_tuple, u32 width, u32 height) { OpenGLState cur_state = OpenGLState::GetCurState(); + // Keep track of previous texture bindings + GLuint old_tex = cur_state.texture_units[0].texture_2d; + cur_state.texture_units[0].texture_2d = texture; + cur_state.Apply(); + glActiveTexture(GL_TEXTURE0); + + glTexImage2D(GL_TEXTURE_2D, 0, format_tuple.internal_format, width, height, 0, + format_tuple.format, format_tuple.type, nullptr); + + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 0); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); + + // Restore previous texture bindings + cur_state.texture_units[0].texture_2d = old_tex; + cur_state.Apply(); +} + +static bool BlitTextures(GLuint src_tex, const MathUtil::Rectangle& src_rect, + GLuint dst_tex, const MathUtil::Rectangle& dst_rect, + SurfaceType type) { + OpenGLState cur_state = OpenGLState::GetCurState(); + + OpenGLState prev_state = cur_state; + SCOPE_EXIT({ prev_state.Apply(); }); + // Make sure textures aren't bound to texture units, since going to bind them to framebuffer // components OpenGLState::ResetTexture(src_tex); OpenGLState::ResetTexture(dst_tex); // Keep track of previous framebuffer bindings - GLuint old_fbs[2] = {cur_state.draw.read_framebuffer, cur_state.draw.draw_framebuffer}; cur_state.draw.read_framebuffer = transfer_framebuffers[0].handle; cur_state.draw.draw_framebuffer = transfer_framebuffers[1].handle; cur_state.Apply(); @@ -163,637 +237,890 @@ void RasterizerCacheOpenGL::BlitTextures(GLuint src_tex, GLuint dst_tex, dst_rect.top, dst_rect.right, 
dst_rect.bottom, buffers, buffers == GL_COLOR_BUFFER_BIT ? GL_LINEAR : GL_NEAREST); - // Restore previous framebuffer bindings - cur_state.draw.read_framebuffer = old_fbs[0]; - cur_state.draw.draw_framebuffer = old_fbs[1]; - cur_state.Apply(); -} - -bool RasterizerCacheOpenGL::TryBlitSurfaces(CachedSurface* src_surface, - const MathUtil::Rectangle& src_rect, - CachedSurface* dst_surface, - const MathUtil::Rectangle& dst_rect) { - - if (!CachedSurface::CheckFormatsBlittable(src_surface->pixel_format, - dst_surface->pixel_format)) { - return false; - } - - BlitTextures(src_surface->texture.handle, dst_surface->texture.handle, - CachedSurface::GetFormatType(src_surface->pixel_format), src_rect, dst_rect); return true; } -static void AllocateSurfaceTexture(GLuint texture, CachedSurface::PixelFormat pixel_format, - u32 width, u32 height) { - // Allocate an uninitialized texture of appropriate size and format for the surface - using SurfaceType = CachedSurface::SurfaceType; - +static bool FillSurface(const Surface& surface, const u8* fill_data) { OpenGLState cur_state = OpenGLState::GetCurState(); - // Keep track of previous texture bindings - GLuint old_tex = cur_state.texture_units[0].texture_2d; - cur_state.texture_units[0].texture_2d = texture; + OpenGLState prev_state = cur_state; + SCOPE_EXIT({ prev_state.Apply(); }); + + OpenGLState::ResetTexture(surface->texture.handle); + + cur_state.draw.draw_framebuffer = transfer_framebuffers[1].handle; cur_state.Apply(); - glActiveTexture(GL_TEXTURE0); - SurfaceType type = CachedSurface::GetFormatType(pixel_format); + if (surface->type == SurfaceType::Color || surface->type == SurfaceType::Texture) { + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, surface->texture.handle, 0); + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0); - FormatTuple tuple; - if (type == SurfaceType::Color) { - ASSERT((size_t)pixel_format < fb_format_tuples.size()); - 
tuple = fb_format_tuples[(unsigned int)pixel_format]; - } else if (type == SurfaceType::Depth || type == SurfaceType::DepthStencil) { - size_t tuple_idx = (size_t)pixel_format - 14; - ASSERT(tuple_idx < depth_format_tuples.size()); - tuple = depth_format_tuples[tuple_idx]; - } else { - tuple = {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}; - } + Pica::Texture::TextureInfo tex_info{}; + tex_info.format = static_cast(surface->pixel_format); + Math::Vec4 color = Pica::Texture::LookupTexture(fill_data, 0, 0, tex_info); - glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, width, height, 0, tuple.format, - tuple.type, nullptr); + std::array color_values = { + color.x / 255.f, + color.y / 255.f, + color.z / 255.f, + color.w / 255.f + }; - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 0); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - - // Restore previous texture bindings - cur_state.texture_units[0].texture_2d = old_tex; - cur_state.Apply(); -} - -MICROPROFILE_DEFINE(OpenGL_SurfaceUpload, "OpenGL", "Surface Upload", MP_RGB(128, 64, 192)); -CachedSurface* RasterizerCacheOpenGL::GetSurface(const CachedSurface& params, bool match_res_scale, - bool load_if_create) { - using PixelFormat = CachedSurface::PixelFormat; - using SurfaceType = CachedSurface::SurfaceType; - - if (params.addr == 0) { - return nullptr; - } - - u32 params_size = - params.width * params.height * CachedSurface::GetFormatBpp(params.pixel_format) / 8; - - // Check for an exact match in existing surfaces - CachedSurface* best_exact_surface = nullptr; - float exact_surface_goodness = -1.f; - - auto surface_interval = - boost::icl::interval::right_open(params.addr, params.addr + params_size); - auto range = surface_cache.equal_range(surface_interval); - for (auto it = range.first; it != range.second; ++it) { - for (auto it2 = it->second.begin(); 
it2 != it->second.end(); ++it2) { - CachedSurface* surface = it2->get(); - - // Check if the request matches the surface exactly - if (params.addr == surface->addr && params.width == surface->width && - params.height == surface->height && params.pixel_format == surface->pixel_format) { - // Make sure optional param-matching criteria are fulfilled - bool tiling_match = (params.is_tiled == surface->is_tiled); - bool res_scale_match = (params.res_scale_width == surface->res_scale_width && - params.res_scale_height == surface->res_scale_height); - if (!match_res_scale || res_scale_match) { - // Prioritize same-tiling and highest resolution surfaces - float match_goodness = - (float)tiling_match + surface->res_scale_width * surface->res_scale_height; - if (match_goodness > exact_surface_goodness || surface->dirty) { - exact_surface_goodness = match_goodness; - best_exact_surface = surface; - } - } - } - } - } - - // Return the best exact surface if found - if (best_exact_surface != nullptr) { - return best_exact_surface; - } - - // No matching surfaces found, so create a new one - u8* texture_src_data = Memory::GetPhysicalPointer(params.addr); - if (texture_src_data == nullptr) { - return nullptr; - } - - MICROPROFILE_SCOPE(OpenGL_SurfaceUpload); - - // Stride only applies to linear images. 
- ASSERT(params.pixel_stride == 0 || !params.is_tiled); - - std::shared_ptr new_surface = std::make_shared(); - - new_surface->addr = params.addr; - new_surface->size = params_size; - - new_surface->texture.Create(); - new_surface->width = params.width; - new_surface->height = params.height; - new_surface->pixel_stride = params.pixel_stride; - new_surface->res_scale_width = params.res_scale_width; - new_surface->res_scale_height = params.res_scale_height; - - new_surface->is_tiled = params.is_tiled; - new_surface->pixel_format = params.pixel_format; - new_surface->dirty = false; - - if (!load_if_create) { - // Don't load any data; just allocate the surface's texture - AllocateSurfaceTexture(new_surface->texture.handle, new_surface->pixel_format, - new_surface->GetScaledWidth(), new_surface->GetScaledHeight()); - } else { - // TODO: Consider attempting subrect match in existing surfaces and direct blit here instead - // of memory upload below if that's a common scenario in some game - - Memory::RasterizerFlushRegion(params.addr, params_size); - - // Load data from memory to the new surface - OpenGLState cur_state = OpenGLState::GetCurState(); - - GLuint old_tex = cur_state.texture_units[0].texture_2d; - cur_state.texture_units[0].texture_2d = new_surface->texture.handle; + cur_state.color_mask.red_enabled = GL_TRUE; + cur_state.color_mask.green_enabled = GL_TRUE; + cur_state.color_mask.blue_enabled = GL_TRUE; + cur_state.color_mask.alpha_enabled = GL_TRUE; cur_state.Apply(); - glActiveTexture(GL_TEXTURE0); + glClearBufferfv(GL_COLOR, 0, &color_values[0]); + } + else if (surface->type == SurfaceType::Depth) { + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0); + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, surface->texture.handle, 0); + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0); - if (!new_surface->is_tiled) { - // TODO: Ensure this will always be a 
color format, not a depth or other format - ASSERT((size_t)new_surface->pixel_format < fb_format_tuples.size()); - const FormatTuple& tuple = fb_format_tuples[(unsigned int)params.pixel_format]; + u32 value_32bit = 0; + GLfloat value_float; - glPixelStorei(GL_UNPACK_ROW_LENGTH, (GLint)new_surface->pixel_stride); - glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height, 0, - tuple.format, tuple.type, texture_src_data); - glPixelStorei(GL_UNPACK_ROW_LENGTH, 0); - } else { - SurfaceType type = CachedSurface::GetFormatType(new_surface->pixel_format); - if (type != SurfaceType::Depth && type != SurfaceType::DepthStencil) { - FormatTuple tuple; - if ((size_t)params.pixel_format < fb_format_tuples.size()) { - tuple = fb_format_tuples[(unsigned int)params.pixel_format]; - } else { - // Texture - tuple = {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}; - } - - std::vector> tex_buffer(params.width * params.height); - - Pica::Texture::TextureInfo tex_info; - tex_info.width = params.width; - tex_info.height = params.height; - tex_info.format = (Pica::TexturingRegs::TextureFormat)params.pixel_format; - tex_info.SetDefaultStride(); - tex_info.physical_address = params.addr; - - for (unsigned y = 0; y < params.height; ++y) { - for (unsigned x = 0; x < params.width; ++x) { - tex_buffer[x + params.width * y] = Pica::Texture::LookupTexture( - texture_src_data, x, params.height - 1 - y, tex_info); - } - } - - glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height, - 0, GL_RGBA, GL_UNSIGNED_BYTE, tex_buffer.data()); - } else { - // Depth/Stencil formats need special treatment since they aren't sampleable using - // LookupTexture and can't use RGBA format - size_t tuple_idx = (size_t)params.pixel_format - 14; - ASSERT(tuple_idx < depth_format_tuples.size()); - const FormatTuple& tuple = depth_format_tuples[tuple_idx]; - - u32 bytes_per_pixel = CachedSurface::GetFormatBpp(params.pixel_format) / 8; - - // OpenGL needs 4 bpp alignment for D24 
since using GL_UNSIGNED_INT as type - bool use_4bpp = (params.pixel_format == PixelFormat::D24); - - u32 gl_bytes_per_pixel = use_4bpp ? 4 : bytes_per_pixel; - - std::vector temp_fb_depth_buffer(params.width * params.height * - gl_bytes_per_pixel); - - u8* temp_fb_depth_buffer_ptr = - use_4bpp ? temp_fb_depth_buffer.data() + 1 : temp_fb_depth_buffer.data(); - - MortonCopyPixels(params.pixel_format, params.width, params.height, bytes_per_pixel, - gl_bytes_per_pixel, texture_src_data, temp_fb_depth_buffer_ptr, - true); - - glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height, - 0, tuple.format, tuple.type, temp_fb_depth_buffer.data()); - } + if (surface->pixel_format == SurfaceParams::PixelFormat::D16) { + std::memcpy(&value_32bit, fill_data, 2); + value_float = value_32bit / 65535.0f; // 2^16 - 1 + } + else if (surface->pixel_format == SurfaceParams::PixelFormat::D24) { + std::memcpy(&value_32bit, fill_data, 3); + value_float = value_32bit / 16777215.0f; // 2^24 - 1 } - // If not 1x scale, blit 1x texture to a new scaled texture and replace texture in surface - if (new_surface->res_scale_width != 1.f || new_surface->res_scale_height != 1.f) { - OGLTexture scaled_texture; - scaled_texture.Create(); - - AllocateSurfaceTexture(scaled_texture.handle, new_surface->pixel_format, - new_surface->GetScaledWidth(), new_surface->GetScaledHeight()); - BlitTextures(new_surface->texture.handle, scaled_texture.handle, - CachedSurface::GetFormatType(new_surface->pixel_format), - MathUtil::Rectangle(0, 0, new_surface->width, new_surface->height), - MathUtil::Rectangle(0, 0, new_surface->GetScaledWidth(), - new_surface->GetScaledHeight())); - - new_surface->texture.Release(); - new_surface->texture.handle = scaled_texture.handle; - scaled_texture.handle = 0; - cur_state.texture_units[0].texture_2d = new_surface->texture.handle; - cur_state.Apply(); - } - - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 0); - glTexParameteri(GL_TEXTURE_2D, 
GL_TEXTURE_MIN_FILTER, GL_LINEAR); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - - cur_state.texture_units[0].texture_2d = old_tex; + cur_state.depth.write_mask = GL_TRUE; cur_state.Apply(); + glClearBufferfv(GL_DEPTH, 0, &value_float); } + else if (surface->type == SurfaceType::DepthStencil) { + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0); + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, surface->texture.handle, 0); - Memory::RasterizerMarkRegionCached(new_surface->addr, new_surface->size, 1); - surface_cache.add(std::make_pair(boost::icl::interval::right_open( - new_surface->addr, new_surface->addr + new_surface->size), - std::set>({new_surface}))); - return new_surface.get(); + u32 value_32bit; + std::memcpy(&value_32bit, fill_data, 4); + + GLfloat value_float = (value_32bit & 0xFFFFFF) / 16777215.0f; // 2^24 - 1 + GLint value_int = (value_32bit >> 24); + + cur_state.depth.write_mask = GL_TRUE; + cur_state.stencil.write_mask = -1; + cur_state.Apply(); + glClearBufferfi(GL_DEPTH_STENCIL, 0, value_float, value_int); + } + return true; } -CachedSurface* RasterizerCacheOpenGL::GetSurfaceRect(const CachedSurface& params, - bool match_res_scale, bool load_if_create, - MathUtil::Rectangle& out_rect) { - if (params.addr == 0) { - return nullptr; - } - - u32 total_pixels = params.width * params.height; - u32 params_size = total_pixels * CachedSurface::GetFormatBpp(params.pixel_format) / 8; - - // Attempt to find encompassing surfaces - CachedSurface* best_subrect_surface = nullptr; - float subrect_surface_goodness = -1.f; - - auto surface_interval = - boost::icl::interval::right_open(params.addr, params.addr + params_size); - auto cache_upper_bound = surface_cache.upper_bound(surface_interval); - for (auto it = surface_cache.lower_bound(surface_interval); it != cache_upper_bound; ++it) { - 
for (auto it2 = it->second.begin(); it2 != it->second.end(); ++it2) { - CachedSurface* surface = it2->get(); - - // Check if the request is contained in the surface - if (params.addr >= surface->addr && - params.addr + params_size - 1 <= surface->addr + surface->size - 1 && - params.pixel_format == surface->pixel_format) { - // Make sure optional param-matching criteria are fulfilled - bool tiling_match = (params.is_tiled == surface->is_tiled); - bool res_scale_match = (params.res_scale_width == surface->res_scale_width && - params.res_scale_height == surface->res_scale_height); - if (!match_res_scale || res_scale_match) { - // Prioritize same-tiling and highest resolution surfaces - float match_goodness = - (float)tiling_match + surface->res_scale_width * surface->res_scale_height; - if (match_goodness > subrect_surface_goodness || surface->dirty) { - subrect_surface_goodness = match_goodness; - best_subrect_surface = surface; - } - } - } - } - } - - // Return the best subrect surface if found - if (best_subrect_surface != nullptr) { - unsigned int bytes_per_pixel = - (CachedSurface::GetFormatBpp(best_subrect_surface->pixel_format) / 8); - - int x0, y0; - - if (!params.is_tiled) { - u32 begin_pixel_index = (params.addr - best_subrect_surface->addr) / bytes_per_pixel; - x0 = begin_pixel_index % best_subrect_surface->width; - y0 = begin_pixel_index / best_subrect_surface->width; - - out_rect = MathUtil::Rectangle(x0, y0, x0 + params.width, y0 + params.height); - } else { - u32 bytes_per_tile = 8 * 8 * bytes_per_pixel; - u32 tiles_per_row = best_subrect_surface->width / 8; - - u32 begin_tile_index = (params.addr - best_subrect_surface->addr) / bytes_per_tile; - x0 = begin_tile_index % tiles_per_row * 8; - y0 = begin_tile_index / tiles_per_row * 8; - - // Tiled surfaces are flipped vertically in the rasterizer vs. 3DS memory. 
- out_rect = - MathUtil::Rectangle(x0, best_subrect_surface->height - y0, x0 + params.width, - best_subrect_surface->height - (y0 + params.height)); - } - - out_rect.left = (int)(out_rect.left * best_subrect_surface->res_scale_width); - out_rect.right = (int)(out_rect.right * best_subrect_surface->res_scale_width); - out_rect.top = (int)(out_rect.top * best_subrect_surface->res_scale_height); - out_rect.bottom = (int)(out_rect.bottom * best_subrect_surface->res_scale_height); - - return best_subrect_surface; - } - - // No subrect found - create and return a new surface - if (!params.is_tiled) { - out_rect = MathUtil::Rectangle(0, 0, (int)(params.width * params.res_scale_width), - (int)(params.height * params.res_scale_height)); - } else { - out_rect = MathUtil::Rectangle(0, (int)(params.height * params.res_scale_height), - (int)(params.width * params.res_scale_width), 0); - } - - return GetSurface(params, match_res_scale, load_if_create); -} - -CachedSurface* RasterizerCacheOpenGL::GetTextureSurface( - const Pica::TexturingRegs::FullTextureConfig& config) { - - Pica::Texture::TextureInfo info = - Pica::Texture::TextureInfo::FromPicaRegister(config.config, config.format); - - CachedSurface params; - params.addr = info.physical_address; - params.width = info.width; - params.height = info.height; - params.is_tiled = true; - params.pixel_format = CachedSurface::PixelFormatFromTextureFormat(info.format); - return GetSurface(params, false, true); -} - -std::tuple> -RasterizerCacheOpenGL::GetFramebufferSurfaces( - const Pica::FramebufferRegs::FramebufferConfig& config) { - - const auto& regs = Pica::g_state.regs; - - // Make sur that framebuffers don't overlap if both color and depth are being used - u32 fb_area = config.GetWidth() * config.GetHeight(); - bool framebuffers_overlap = - config.GetColorBufferPhysicalAddress() != 0 && - config.GetDepthBufferPhysicalAddress() != 0 && - MathUtil::IntervalsIntersect( - config.GetColorBufferPhysicalAddress(), - fb_area * 
GPU::Regs::BytesPerPixel(GPU::Regs::PixelFormat(config.color_format.Value())), - config.GetDepthBufferPhysicalAddress(), - fb_area * Pica::FramebufferRegs::BytesPerDepthPixel(config.depth_format)); - bool using_color_fb = config.GetColorBufferPhysicalAddress() != 0; - bool depth_write_enable = regs.framebuffer.output_merger.depth_write_enable && - regs.framebuffer.framebuffer.allow_depth_stencil_write; - bool using_depth_fb = config.GetDepthBufferPhysicalAddress() != 0 && - (regs.framebuffer.output_merger.depth_test_enable || depth_write_enable || - !framebuffers_overlap); - - if (framebuffers_overlap && using_color_fb && using_depth_fb) { - LOG_CRITICAL(Render_OpenGL, "Color and depth framebuffer memory regions overlap; " - "overlapping framebuffers not supported!"); - using_depth_fb = false; - } - - // get color and depth surfaces - CachedSurface color_params; - CachedSurface depth_params; - color_params.width = depth_params.width = config.GetWidth(); - color_params.height = depth_params.height = config.GetHeight(); - color_params.is_tiled = depth_params.is_tiled = true; - +SurfaceParams::SurfaceParams() { // Set the internal resolution, assume the same scaling factor for top and bottom screens float resolution_scale_factor = Settings::values.resolution_factor; if (resolution_scale_factor == 0.0f) { // Auto - scale resolution to the window size resolution_scale_factor = VideoCore::g_emu_window->GetFramebufferLayout().GetScalingRatio(); } - // Scale the resolution by the specified factor - color_params.res_scale_width = resolution_scale_factor; - depth_params.res_scale_width = resolution_scale_factor; - color_params.res_scale_height = resolution_scale_factor; - depth_params.res_scale_height = resolution_scale_factor; + res_scale_width = resolution_scale_factor; + res_scale_height = resolution_scale_factor; +} - color_params.addr = config.GetColorBufferPhysicalAddress(); - color_params.pixel_format = CachedSurface::PixelFormatFromColorFormat(config.color_format); 
+MathUtil::Rectangle CachedSurface::GetSubRect(const SurfaceParams& sub_surface) const { + const u32 begin_pixel_index = PixelsInBytes(sub_surface.addr - addr); + const int x0 = begin_pixel_index % width; + const int y0 = begin_pixel_index / width; - depth_params.addr = config.GetDepthBufferPhysicalAddress(); - depth_params.pixel_format = CachedSurface::PixelFormatFromDepthFormat(config.depth_format); + if (is_tiled) + return MathUtil::Rectangle(x0, height - y0 - sub_surface.height, x0 + sub_surface.width, height - y0); // Bottom to top - MathUtil::Rectangle color_rect; - CachedSurface* color_surface = - using_color_fb ? GetSurfaceRect(color_params, true, true, color_rect) : nullptr; + return MathUtil::Rectangle(x0, y0, x0 + sub_surface.width, y0 + sub_surface.height); // Top to bottom +} - MathUtil::Rectangle depth_rect; - CachedSurface* depth_surface = - using_depth_fb ? GetSurfaceRect(depth_params, true, true, depth_rect) : nullptr; +MathUtil::Rectangle CachedSurface::GetScaledSubRect(const SurfaceParams& sub_surface) const { + auto rect = GetSubRect(sub_surface); + rect.left = static_cast(rect.left * res_scale_width); + rect.right = static_cast(rect.right * res_scale_width); + rect.top = static_cast(rect.top * res_scale_height); + rect.bottom = static_cast(rect.bottom * res_scale_height); + return rect; +} - // Sanity check to make sure found surfaces aren't the same - if (using_depth_fb && using_color_fb && color_surface == depth_surface) { - LOG_CRITICAL( - Render_OpenGL, - "Color and depth framebuffer surfaces overlap; overlapping surfaces not supported!"); - using_depth_fb = false; - depth_surface = nullptr; +bool CachedSurface::ExactMatch(const SurfaceParams& other_surface) const { + return (other_surface.addr == addr && + other_surface.width == width && + other_surface.height == height && + other_surface.stride == stride && + other_surface.pixel_format == pixel_format && + other_surface.is_tiled == is_tiled); +} + +bool CachedSurface::CanSubRect(const 
SurfaceParams& sub_surface) const { + if (sub_surface.addr < addr || sub_surface.end > end || sub_surface.stride != stride || + sub_surface.pixel_format != pixel_format || sub_surface.is_tiled != is_tiled) + return false; + + auto rect = GetSubRect(sub_surface); + + if (rect.left + sub_surface.width > stride) + return false; + + if (is_tiled) + return ((height - rect.bottom) % 8 == 0 && rect.left % 8 == 0); + + return true; +} + +bool CachedSurface::CanCopy(const SurfaceParams& dest_surface) const { + if (type == SurfaceType::Fill && IsRegionValid(dest_surface.GetInterval()) && + dest_surface.addr >= addr && dest_surface.end <= end) { // dest_surface is within our fill range + if (fill_size != dest_surface.bytes_per_pixel) { + // Check if bits repeat for our fill_size + const u32 dest_bytes_per_pixel = std::max(dest_surface.bytes_per_pixel, 1u); // Take care of 4bpp formats + std::vector fill_test(fill_size * dest_bytes_per_pixel); + + for (u32 i = 0; i < dest_bytes_per_pixel; ++i) + std::memcpy(&fill_test[i * fill_size], &fill_data[0], fill_size); + + for (u32 i = 0; i < fill_size; ++i) + if (std::memcmp(&fill_test[dest_bytes_per_pixel * i], &fill_test[0], dest_bytes_per_pixel) != 0) + return false; + + if (dest_surface.bytes_per_pixel == 0 && (fill_test[0] & 0xF) != (fill_test[0] >> 4)) // 4bpp compare + return false; + } + return true; + } + if (CanSubRect(dest_surface) && dest_surface.width == stride) + return true; + + return false; +} + +static void CopySurface(const Surface& src_surface, const Surface& dest_surface) { + if (src_surface == dest_surface) + return; + + // This is only called when CanCopy is true, no need to run checks here + if (src_surface->type == SurfaceType::Fill) { + // FillSurface needs a 4 bytes buffer + const u32 fill_offset = (dest_surface->addr - src_surface->addr) % src_surface->fill_size; + std::array fill_buffer; + + u32 fill_buff_pos = fill_offset; + for (int i : {0, 1, 2, 3}) + fill_buffer[i] = 
src_surface->fill_data[fill_buff_pos++ % src_surface->fill_size]; + + FillSurface(dest_surface, &fill_buffer[0]); + } + if (src_surface->CanSubRect(*dest_surface)) { + BlitTextures(src_surface->texture.handle, src_surface->GetScaledSubRect(*dest_surface), + dest_surface->texture.handle, dest_surface->GetScaledRect(), + src_surface->type); + } + dest_surface->gl_buffer_dirty = true; +} + +MICROPROFILE_DEFINE(OpenGL_SurfaceLoad, "OpenGL", "Surface Load", MP_RGB(128, 64, 192)); +void CachedSurface::LoadGLBuffer(PAddr load_start, PAddr load_end) { + ASSERT(type != SurfaceType::Fill); + + const u8* const texture_src_data = Memory::GetPhysicalPointer(addr); + if (texture_src_data == nullptr) + return; + + MICROPROFILE_SCOPE(OpenGL_SurfaceLoad); + + ASSERT(load_start >= addr && load_end <= end); + const u32 start_offset = load_start - addr; + + if (!is_tiled) { + ASSERT(type == SurfaceType::Color); + std::memcpy(&gl_buffer[start_offset], texture_src_data + start_offset, load_end - load_start); + } + else { + if (type == SurfaceType::Texture) { + Pica::Texture::TextureInfo tex_info{}; + tex_info.width = width; + tex_info.height = height; + tex_info.format = static_cast(pixel_format); + tex_info.SetDefaultStride(); + tex_info.physical_address = addr; + + for (unsigned y = 0; y < height; ++y) { + for (unsigned x = 0; x < width; ++x) { + auto vec4 = Pica::Texture::LookupTexture(texture_src_data, x, height - 1 - y, tex_info); + const size_t offset = (x + (width * y)) * 4; + std::memcpy(&gl_buffer[offset], vec4.AsArray(), 4); + } + } + } + else { + size_t copyfn_offset = MortonCopyFlags::MortonToGl; + copyfn_offset |= (bytes_per_pixel - 1) << MortonCopyFlags::BytesPerPixelBits; + copyfn_offset |= (gl_bytes_per_pixel - 1) << MortonCopyFlags::GLBytesPerPixelBits; + + if (load_start != addr || load_end != end) + copyfn_offset |= MortonCopyFlags::CheckRange; + if (pixel_format == PixelFormat::D24S8) + copyfn_offset |= MortonCopyFlags::D24S8Format; + + 
MortonCopyFnTable[copyfn_offset](width, height, + texture_src_data, &gl_buffer[gl_buffer_offset], addr, load_start, load_end); + } + } +} + +MICROPROFILE_DEFINE(OpenGL_SurfaceFlush, "OpenGL", "Surface Flush", MP_RGB(128, 192, 64)); +void CachedSurface::FlushGLBuffer(PAddr flush_start, PAddr flush_end) { + u8* const dst_buffer = Memory::GetPhysicalPointer(addr); + if (dst_buffer == nullptr) + return; + + MICROPROFILE_SCOPE(OpenGL_SurfaceFlush); + + ASSERT(flush_start >= addr && flush_end <= end); + const u32 start_offset = flush_start - addr; + const u32 end_offset = flush_end - addr; + + if (type == SurfaceType::Fill) { + const u32 coarse_start_offset = start_offset - (start_offset % fill_size); + const u32 backup_bytes = start_offset % fill_size; + std::array backup_data; + if (backup_bytes) + std::memcpy(&backup_data[0], &dst_buffer[coarse_start_offset], backup_bytes); + + for (u32 offset = coarse_start_offset; offset < end_offset; offset += fill_size) + std::memcpy(&dst_buffer[offset], &fill_data[0], std::min(fill_size, end_offset - offset)); + + if (backup_bytes) + std::memcpy(&dst_buffer[coarse_start_offset], &backup_data[0], backup_bytes); + } + else if (!is_tiled) { + ASSERT(type == SurfaceType::Color); + std::memcpy(dst_buffer + start_offset, &gl_buffer[start_offset], flush_end - flush_start); + } + else { + size_t copyfn_offset = (bytes_per_pixel - 1) << MortonCopyFlags::BytesPerPixelBits; + copyfn_offset |= (gl_bytes_per_pixel - 1) << MortonCopyFlags::GLBytesPerPixelBits; + + if (flush_start != addr || flush_end != end) + copyfn_offset |= MortonCopyFlags::CheckRange; + if (pixel_format == PixelFormat::D24S8) + copyfn_offset |= MortonCopyFlags::D24S8Format; + + MortonCopyFnTable[copyfn_offset](width, height, + &gl_buffer[gl_buffer_offset], dst_buffer, addr, flush_start, flush_end); + } +} + +void CachedSurface::UploadGLTexture() { + if (type == SurfaceType::Fill) + return; + + ASSERT(gl_buffer.size() == width * height * gl_bytes_per_pixel); + + const 
FormatTuple& tuple = GetFormatTuple(pixel_format); + + // Load data from memory to the surface + OpenGLState cur_state = OpenGLState::GetCurState(); + + GLuint old_tex = cur_state.texture_units[0].texture_2d; + cur_state.texture_units[0].texture_2d = texture.handle; + cur_state.Apply(); + + glActiveTexture(GL_TEXTURE0); + glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, width, height, 0, + tuple.format, tuple.type, &gl_buffer[0]); + + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 0); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); + + cur_state.texture_units[0].texture_2d = old_tex; + cur_state.Apply(); + + // If not 1x scale, blit 1x texture to a new scaled texture and replace texture in surface + if (res_scale_width != 1.f || res_scale_height != 1.f) { + OGLTexture scaled_texture; + scaled_texture.Create(); + + AllocateSurfaceTexture(scaled_texture.handle, tuple, GetScaledWidth(), GetScaledHeight()); + BlitTextures(texture.handle, GetRect(), scaled_texture.handle, GetScaledRect(), type); + + std::swap(texture.handle, scaled_texture.handle); } - MathUtil::Rectangle rect; + gl_buffer_dirty = false; +} - if (color_surface != nullptr && depth_surface != nullptr && - (depth_rect.left != color_rect.left || depth_rect.top != color_rect.top)) { - // Can't specify separate color and depth viewport offsets in OpenGL, so re-zero both if - // they don't match - if (color_rect.left != 0 || color_rect.top != 0) { - color_surface = GetSurface(color_params, true, true); - } +void CachedSurface::DownloadGLTexture() { + if (gl_buffer.size() == 0) + gl_buffer.resize(width * height * gl_bytes_per_pixel); - if (depth_rect.left != 0 || depth_rect.top != 0) { - depth_surface = GetSurface(depth_params, true, true); - } + if (!gl_buffer_dirty || type == SurfaceType::Fill) + return; - if (!color_surface->is_tiled) { - 
rect = MathUtil::Rectangle( - 0, 0, (int)(color_params.width * color_params.res_scale_width), - (int)(color_params.height * color_params.res_scale_height)); - } else { - rect = MathUtil::Rectangle( - 0, (int)(color_params.height * color_params.res_scale_height), - (int)(color_params.width * color_params.res_scale_width), 0); + const FormatTuple& tuple = GetFormatTuple(pixel_format); + + OpenGLState cur_state = OpenGLState::GetCurState(); + GLuint old_tex = cur_state.texture_units[0].texture_2d; + + // If not 1x scale, blit scaled texture to a new 1x texture and use that to flush + OGLTexture unscaled_tex; + if (res_scale_width != 1.f || res_scale_height != 1.f) { + unscaled_tex.Create(); + + AllocateSurfaceTexture(unscaled_tex.handle, tuple, width, height); + BlitTextures(texture.handle, GetScaledRect(), unscaled_tex.handle, GetRect(), type); + + cur_state.texture_units[0].texture_2d = unscaled_tex.handle; + } + else { + cur_state.texture_units[0].texture_2d = texture.handle; + } + cur_state.Apply(); + + glActiveTexture(GL_TEXTURE0); + glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, &gl_buffer[0]); + + cur_state.texture_units[0].texture_2d = old_tex; + cur_state.Apply(); + + gl_buffer_dirty = false; +} + +enum MatchType : int { + Exact = (1 << 0), // Surfaces perfectly match + SubRect = (1 << 1), // Surface encompasses params + Invalid = (1 << 2), // Flag that can be applied to other match types, invalid matches require validation before they can be used + Copy = (1 << 3), // Surface we can copy from + TexCopy = (1 << 4), // Surface that will match a display transfer "texture copy" parameters + All = (1 << 5) - 1, + None = 0, +}; + +constexpr MatchType operator | (MatchType lhs, MatchType rhs) { + return static_cast(static_cast(lhs) | static_cast(rhs)); +} + +/// Get the best surface match (and its match type) for the given flags, higher flag value meaning lower priority +template +std::tuple FindMatch(const SurfaceCache& surface_cache, const 
SurfaceParams& params, bool match_res_scale) { + constexpr float MATCH_GOODNESS_RESET = -1.f; + + Surface match_surface = nullptr; + MatchType match_type = MatchType::All; // Starting from lowest possible priority + float best_match_goodness = MATCH_GOODNESS_RESET; + + for (auto& pair : RangeFromInterval(surface_cache, params.GetInterval())) { + for (auto& surface : pair.second) { + const bool res_scale_match = (params.res_scale_width == surface->res_scale_width && params.res_scale_height == surface->res_scale_height); + const float match_goodness = surface->res_scale_width * surface->res_scale_height; + const MatchType invalid_mask = surface->IsRegionValid(params.GetInterval()) ? MatchType::None : MatchType::Invalid; + + if (!(find_flags & MatchType::Invalid) && invalid_mask == MatchType::Invalid) + continue; + + const auto IsMatch_Helper = [&](MatchType check_type, auto match_fn) { + if (!(find_flags & check_type)) + return false; + + check_type = check_type | invalid_mask; + + // Lower flag value means higher priority + if (match_type < check_type) // We already have a better match type + return true; // Return true to skip to the next surface + + if (!match_fn()) + return false; + + if (!match_res_scale || res_scale_match || surface->type == SurfaceType::Fill) { // Found a match + if (match_type > check_type) { + best_match_goodness = MATCH_GOODNESS_RESET; + match_type = check_type; + } + if (match_goodness > best_match_goodness) { + best_match_goodness = match_goodness; + match_surface = surface; + } + } + return false; + }; + if (IsMatch_Helper(MatchType::Exact, [&] { return surface->ExactMatch(params); })) + continue; + if (IsMatch_Helper(MatchType::SubRect, [&] { return surface->CanSubRect(params); })) + continue; + if (IsMatch_Helper(MatchType::Copy, [&] { return surface->CanCopy(params); })) + continue; + if (IsMatch_Helper(MatchType::TexCopy, [&] { + if (surface->pixel_format == PixelFormat::Invalid || + surface->addr > params.addr || surface->end < 
params.end || + ((params.addr - surface->addr) * 8) % SurfaceParams::GetFormatBpp(surface->pixel_format) != 0 || + (params.width * 8) % SurfaceParams::GetFormatBpp(surface->pixel_format) != 0 || + (params.stride * 8) % SurfaceParams::GetFormatBpp(surface->pixel_format) != 0) + return false; + + const u32 begin_pixel_index = surface->PixelsInBytes(params.addr - surface->addr); + const int x0 = begin_pixel_index % surface->width; + const int y0 = begin_pixel_index / surface->width; + + if (!surface->is_tiled) + return (surface->PixelsInBytes(params.stride) == surface->stride && + x0 + surface->PixelsInBytes(params.width) <= surface->stride); + + return (surface->PixelsInBytes(params.addr - surface->addr) % 64 == 0 && + surface->PixelsInBytes(params.width) % 64 == 0 && + surface->PixelsInBytes(params.stride) == surface->stride * 8 && + x0 + surface->PixelsInBytes(params.width / 8) <= surface->stride); + })) + continue; } - } else if (color_surface != nullptr) { - rect = color_rect; - } else if (depth_surface != nullptr) { - rect = depth_rect; - } else { - rect = MathUtil::Rectangle(0, 0, 0, 0); + } + return std::make_tuple(match_surface, (match_surface == nullptr) ? 
MatchType::None : match_type); +} + +RasterizerCacheOpenGL::RasterizerCacheOpenGL() { + transfer_framebuffers[0].Create(); + transfer_framebuffers[1].Create(); +} + +RasterizerCacheOpenGL::~RasterizerCacheOpenGL() { + FlushAll(); + while (!surface_cache.empty()) + UnregisterSurface(*surface_cache.begin()->second.begin()); + transfer_framebuffers[0].Release(); + transfer_framebuffers[1].Release(); +} + +bool RasterizerCacheOpenGL::BlitSurfaces(const Surface& src_surface, + const MathUtil::Rectangle& src_rect, + const Surface& dst_surface, + const MathUtil::Rectangle& dst_rect) { + if (!SurfaceParams::CheckFormatsBlittable(src_surface->pixel_format, + dst_surface->pixel_format)) + return false; + + return BlitTextures(src_surface->texture.handle, src_rect, + dst_surface->texture.handle, dst_rect, + src_surface->type); +} + +Surface RasterizerCacheOpenGL::GetSurface(const SurfaceParams& params, bool match_res_scale, bool load_if_create) { + if (params.addr == 0 || params.height * params.width == 0) { + return nullptr; + } + + // Check for an exact or subrect match in existing surfaces + Surface surface_match; + MatchType match_type; + std::tie(surface_match, match_type) = + FindMatch(surface_cache, params, match_res_scale); + + if (surface_match != nullptr) { + if (load_if_create && (match_type & MatchType::Invalid)) { + ValidateSurface(surface_match, params.addr, params.size); + } + return surface_match; + } + + ASSERT(params.width == params.stride); // Use GetSurfaceSubRect instead + + Surface new_surface = CreateSurface(params); + if (load_if_create) + ValidateSurface(new_surface, params.addr, params.size); + + RegisterSurface(new_surface); + + return new_surface; +} + +SurfaceRect_Tuple RasterizerCacheOpenGL::GetSurfaceSubRect(const SurfaceParams& params, + bool match_res_scale, + bool load_if_create) { + MathUtil::Rectangle out_rect{}; + + if (params.addr == 0 || params.height * params.width == 0) { + return std::make_tuple(nullptr, out_rect); + } + + // Attempt 
to find encompassing surface + Surface subrect_match; + MatchType match_type; + std::tie(subrect_match, match_type) = FindMatch(surface_cache, params, match_res_scale); + + // Return the best subrect surface if found + if (subrect_match != nullptr) { + out_rect = subrect_match->GetScaledSubRect(params); + // Tiled surfaces are flipped vertically in the rasterizer vs. 3DS memory. + if (params.is_tiled) + std::swap(out_rect.top, out_rect.bottom); + + if (load_if_create && (match_type & MatchType::Invalid)) + ValidateSurface(subrect_match, params.addr, params.size); + + return std::make_tuple(subrect_match, out_rect); + } + + // No subrect found - create and return a new surface + SurfaceParams new_params = params; + new_params.width = params.stride; // Can't have gaps in a surface + new_params.UpdateParams(); + + out_rect = new_params.GetScaledRect(); + if (new_params.is_tiled) + std::swap(out_rect.top, out_rect.bottom); + + // If stride was bigger than width we need to adjust our output rect + out_rect.right = static_cast(params.width * new_params.res_scale_width); + + Surface new_surface = CreateSurface(new_params); + if (load_if_create) + ValidateSurface(new_surface, new_params.addr, new_params.size); + + RegisterSurface(new_surface); + + return std::make_tuple(new_surface, out_rect); +} + +Surface RasterizerCacheOpenGL::GetTextureSurface(const Pica::TexturingRegs::FullTextureConfig& config) { + Pica::Texture::TextureInfo info = Pica::Texture::TextureInfo::FromPicaRegister(config.config, config.format); + + SurfaceParams params; + params.addr = info.physical_address; + params.width = info.width; + params.height = info.height; + params.is_tiled = true; + params.pixel_format = SurfaceParams::PixelFormatFromTextureFormat(info.format); + params.UpdateParams(); + return GetSurface(params, false, true); +} + +SurfaceSurfaceRect_Tuple RasterizerCacheOpenGL::GetFramebufferSurfaces(bool using_color_fb, + bool using_depth_fb) { + const auto& regs = Pica::g_state.regs; + 
const auto& config = regs.framebuffer.framebuffer; + + // Make sur that framebuffers don't overlap if both color and depth are being used + u32 fb_area = config.GetWidth() * config.GetHeight(); + bool framebuffers_overlap = config.GetColorBufferPhysicalAddress() != 0 && + config.GetDepthBufferPhysicalAddress() != 0 && + MathUtil::IntervalsIntersect( + config.GetColorBufferPhysicalAddress(), + fb_area * GPU::Regs::BytesPerPixel(GPU::Regs::PixelFormat(config.color_format.Value())), + config.GetDepthBufferPhysicalAddress(), + fb_area * Pica::FramebufferRegs::BytesPerDepthPixel(config.depth_format)); + + if (framebuffers_overlap && using_color_fb && using_depth_fb) { + LOG_CRITICAL(Render_OpenGL, "Color and depth framebuffer memory regions overlap; overlapping framebuffers not supported!"); + using_depth_fb = false; + } + + // get color and depth surfaces + SurfaceParams color_params; + SurfaceParams depth_params; + color_params.is_tiled = depth_params.is_tiled = true; + + color_params.addr = config.GetColorBufferPhysicalAddress(); + color_params.width = depth_params.width = config.GetWidth(); + color_params.height = depth_params.height = config.GetHeight(); + color_params.pixel_format = SurfaceParams::PixelFormatFromColorFormat(config.color_format); + color_params.UpdateParams(); + + MathUtil::Rectangle rect{}; + Surface color_surface = nullptr; + if (using_color_fb) + std::tie(color_surface, rect) = GetSurfaceSubRect(color_params, true, true); + + depth_params.pixel_format = SurfaceParams::PixelFormatFromDepthFormat(config.depth_format); + depth_params.addr = config.GetDepthBufferPhysicalAddress(); + depth_params.UpdateParams(); + + Surface depth_surface = nullptr; + if (using_depth_fb && color_surface != nullptr) { + const PAddr validate_addr = depth_params.addr; + const u32 validate_size = depth_params.size; + + // Can't specify separate color and depth viewport offsets in OpenGL, so make sure depth_surface will have the same offsets + depth_params.addr -= 
color_surface->PixelsInBytes(color_params.addr - color_surface->addr) * depth_params.bytes_per_pixel; + depth_params.height = color_surface->height; + depth_params.UpdateParams(); + + depth_surface = GetSurface(depth_params, true, false); + ValidateSurface(depth_surface, validate_addr, validate_size); + } + else if (using_depth_fb) { + std::tie(depth_surface, rect) = GetSurfaceSubRect(depth_params, true, true); } return std::make_tuple(color_surface, depth_surface, rect); } -CachedSurface* RasterizerCacheOpenGL::TryGetFillSurface(const GPU::Regs::MemoryFillConfig& config) { - auto surface_interval = - boost::icl::interval::right_open(config.GetStartAddress(), config.GetEndAddress()); - auto range = surface_cache.equal_range(surface_interval); - for (auto it = range.first; it != range.second; ++it) { - for (auto it2 = it->second.begin(); it2 != it->second.end(); ++it2) { - int bits_per_value = 0; - if (config.fill_24bit) { - bits_per_value = 24; - } else if (config.fill_32bit) { - bits_per_value = 32; - } else { - bits_per_value = 16; - } +Surface RasterizerCacheOpenGL::GetFillSurface(const GPU::Regs::MemoryFillConfig& config) { + Surface new_surface = std::make_shared(); - CachedSurface* surface = it2->get(); + new_surface->addr = config.GetStartAddress(); + new_surface->end = config.GetEndAddress(); + new_surface->size = new_surface->end - new_surface->addr; + new_surface->type = SurfaceType::Fill; + std::memcpy(&new_surface->fill_data[0], &config.value_32bit, 4); + if (config.fill_32bit) + new_surface->fill_size = 4; + else if (config.fill_24bit) + new_surface->fill_size = 3; + else + new_surface->fill_size = 2; - if (surface->addr == config.GetStartAddress() && - CachedSurface::GetFormatBpp(surface->pixel_format) == bits_per_value && - (surface->width * surface->height * - CachedSurface::GetFormatBpp(surface->pixel_format) / 8) == - (config.GetEndAddress() - config.GetStartAddress())) { - return surface; - } - } - } - - return nullptr; + 
RegisterSurface(new_surface); + return new_surface; } -MICROPROFILE_DEFINE(OpenGL_SurfaceDownload, "OpenGL", "Surface Download", MP_RGB(128, 192, 64)); -void RasterizerCacheOpenGL::FlushSurface(CachedSurface* surface) { - using PixelFormat = CachedSurface::PixelFormat; - using SurfaceType = CachedSurface::SurfaceType; +SurfaceRect_Tuple RasterizerCacheOpenGL::GetTexCopySurface(const SurfaceParams& params) { + MathUtil::Rectangle rect{}; - if (!surface->dirty) { - return; - } + Surface match_surface; + MatchType match_type; + std::tie(match_surface, match_type) = FindMatch(surface_cache, params, false); - MICROPROFILE_SCOPE(OpenGL_SurfaceDownload); + if (match_type & MatchType::Invalid) + ValidateSurface(match_surface, params.addr, params.size); - u8* dst_buffer = Memory::GetPhysicalPointer(surface->addr); - if (dst_buffer == nullptr) { - return; - } + if (match_surface != nullptr) { + SurfaceParams match_subrect = params; + match_subrect.width = match_surface->PixelsInBytes(params.width); + match_subrect.stride = match_surface->PixelsInBytes(params.stride); - OpenGLState cur_state = OpenGLState::GetCurState(); - GLuint old_tex = cur_state.texture_units[0].texture_2d; - - OGLTexture unscaled_tex; - GLuint texture_to_flush = surface->texture.handle; - - // If not 1x scale, blit scaled texture to a new 1x texture and use that to flush - if (surface->res_scale_width != 1.f || surface->res_scale_height != 1.f) { - unscaled_tex.Create(); - - AllocateSurfaceTexture(unscaled_tex.handle, surface->pixel_format, surface->width, - surface->height); - BlitTextures( - surface->texture.handle, unscaled_tex.handle, - CachedSurface::GetFormatType(surface->pixel_format), - MathUtil::Rectangle(0, 0, surface->GetScaledWidth(), surface->GetScaledHeight()), - MathUtil::Rectangle(0, 0, surface->width, surface->height)); - - texture_to_flush = unscaled_tex.handle; - } - - cur_state.texture_units[0].texture_2d = texture_to_flush; - cur_state.Apply(); - glActiveTexture(GL_TEXTURE0); - - if 
(!surface->is_tiled) { - // TODO: Ensure this will always be a color format, not a depth or other format - ASSERT((size_t)surface->pixel_format < fb_format_tuples.size()); - const FormatTuple& tuple = fb_format_tuples[(unsigned int)surface->pixel_format]; - - glPixelStorei(GL_PACK_ROW_LENGTH, (GLint)surface->pixel_stride); - glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, dst_buffer); - glPixelStorei(GL_PACK_ROW_LENGTH, 0); - } else { - SurfaceType type = CachedSurface::GetFormatType(surface->pixel_format); - if (type != SurfaceType::Depth && type != SurfaceType::DepthStencil) { - ASSERT((size_t)surface->pixel_format < fb_format_tuples.size()); - const FormatTuple& tuple = fb_format_tuples[(unsigned int)surface->pixel_format]; - - u32 bytes_per_pixel = CachedSurface::GetFormatBpp(surface->pixel_format) / 8; - - std::vector temp_gl_buffer(surface->width * surface->height * bytes_per_pixel); - - glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, temp_gl_buffer.data()); - - // Directly copy pixels. Internal OpenGL color formats are consistent so no conversion - // is necessary. - MortonCopyPixels(surface->pixel_format, surface->width, surface->height, - bytes_per_pixel, bytes_per_pixel, dst_buffer, temp_gl_buffer.data(), - false); - } else { - // Depth/Stencil formats need special treatment since they aren't sampleable using - // LookupTexture and can't use RGBA format - size_t tuple_idx = (size_t)surface->pixel_format - 14; - ASSERT(tuple_idx < depth_format_tuples.size()); - const FormatTuple& tuple = depth_format_tuples[tuple_idx]; - - u32 bytes_per_pixel = CachedSurface::GetFormatBpp(surface->pixel_format) / 8; - - // OpenGL needs 4 bpp alignment for D24 since using GL_UNSIGNED_INT as type - bool use_4bpp = (surface->pixel_format == PixelFormat::D24); - - u32 gl_bytes_per_pixel = use_4bpp ? 
4 : bytes_per_pixel; - - std::vector temp_gl_buffer(surface->width * surface->height * gl_bytes_per_pixel); - - glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, temp_gl_buffer.data()); - - u8* temp_gl_buffer_ptr = use_4bpp ? temp_gl_buffer.data() + 1 : temp_gl_buffer.data(); - - MortonCopyPixels(surface->pixel_format, surface->width, surface->height, - bytes_per_pixel, gl_bytes_per_pixel, dst_buffer, temp_gl_buffer_ptr, - false); + if (match_surface->is_tiled) { + match_subrect.width /= 8; + match_subrect.stride /= 8; + match_subrect.height *= 8; } + + rect = match_surface->GetScaledSubRect(match_subrect); + if (match_surface->is_tiled) + std::swap(rect.top, rect.bottom); } - surface->dirty = false; - - cur_state.texture_units[0].texture_2d = old_tex; - cur_state.Apply(); + return std::make_tuple(match_surface, rect); } -void RasterizerCacheOpenGL::FlushRegion(PAddr addr, u32 size, const CachedSurface* skip_surface, - bool invalidate) { - if (size == 0) { +void RasterizerCacheOpenGL::ValidateSurface(const Surface& surface, PAddr addr, u32 size) { + if (size == 0) + return; + + bool upload_texture = false; + const auto validate_interval = (surface->type != SurfaceType::Texture) ? 
+ SurfaceInterval::right_open(addr, addr + size) : + surface->GetInterval(); + + if (surface->type == SurfaceType::Fill) { + // Sanity check, fill surfaces will always be valid when used + ASSERT(surface->IsRegionValid(validate_interval)); return; } - // Gather up unique surfaces that touch the region - std::unordered_set> touching_surfaces; + for (;;) { + const auto it = surface->invalid_regions.find(validate_interval); + if (it == surface->invalid_regions.end()) + break; - auto surface_interval = boost::icl::interval::right_open(addr, addr + size); - auto cache_upper_bound = surface_cache.upper_bound(surface_interval); - for (auto it = surface_cache.lower_bound(surface_interval); it != cache_upper_bound; ++it) { - std::copy_if(it->second.begin(), it->second.end(), - std::inserter(touching_surfaces, touching_surfaces.end()), - [skip_surface](std::shared_ptr surface) { - return (surface.get() != skip_surface); - }); - } + const auto interval = *it & validate_interval; + const PAddr interval_start = boost::icl::first(interval); + const PAddr interval_end = boost::icl::last_next(interval); - // Flush and invalidate surfaces - for (auto surface : touching_surfaces) { - FlushSurface(surface.get()); - if (invalidate) { - Memory::RasterizerMarkRegionCached(surface->addr, surface->size, -1); - surface_cache.subtract( - std::make_pair(boost::icl::interval::right_open( - surface->addr, surface->addr + surface->size), - std::set>({surface}))); + // Look for a valid surface to blit + SurfaceParams params = *surface; + const u32 pixel_offset = params.PixelsInBytes(interval_start - params.addr); + if (!params.is_tiled) { + params.addr += (pixel_offset - (pixel_offset % params.width)) * + SurfaceParams::GetFormatBpp(params.pixel_format) / 8; // Start of the row + params.height = (params.PixelsInBytes(interval_end - params.addr - 1) / params.width) + 1; } + else { + params.addr += (pixel_offset - (pixel_offset % (params.width * 8))) * + 
SurfaceParams::GetFormatBpp(params.pixel_format) / 8; // Start of the tiled row + params.height = ((params.PixelsInBytes(interval_end - params.addr - 1) / (params.width * 8)) + 1) * 8; + } + params.UpdateParams(); + + Surface match_surface; + MatchType match_type; + std::tie(match_surface, match_type) = + FindMatch(surface_cache, params, true); + + if (match_type == MatchType::Copy) { + // Need to call CopySurface and possibly create a new one first, which GetSurface will do for us + if (params.GetInterval() == surface->GetInterval()) { + CopySurface(match_surface, surface); + surface->invalid_regions.clear(); + return; + } + Surface tmp_surface = GetSurface(params, true, false); + if (tmp_surface != nullptr) + CopySurface(match_surface, tmp_surface); + match_surface = tmp_surface; + } + + if (match_surface != nullptr) { + const auto src_rect = (match_type == MatchType::SubRect) ? + match_surface->GetScaledSubRect(params) : + match_surface->GetScaledRect(); + const auto dest_rect = surface->GetScaledSubRect(params); + + BlitSurfaces(match_surface, src_rect, surface, dest_rect); + surface->gl_buffer_dirty = true; + + surface->invalid_regions.erase(params.GetInterval()); + continue; + } + + // Load data from 3DS memory + FlushRegion(interval_start, interval_end - interval_start); + surface->DownloadGLTexture(); + surface->LoadGLBuffer(interval_start, interval_end); + upload_texture = true; + + surface->invalid_regions.erase(interval); } + + if (upload_texture) + surface->UploadGLTexture(); +} + +void RasterizerCacheOpenGL::FlushRegion(PAddr addr, u32 size) { + if (size == 0) + return; + + const auto flush_interval = SurfaceInterval::right_open(addr, addr + size); + for (auto& pair : RangeFromInterval(dirty_regions, flush_interval)) { + const auto interval = pair.first & flush_interval; + auto& surface = pair.second; + + // Sanity check, this surface is the last one that marked this region dirty + ASSERT(surface->IsRegionValid(interval)); + 
surface->DownloadGLTexture(); + surface->FlushGLBuffer(boost::icl::first(interval), boost::icl::last_next(interval)); + } + + // Reset dirty regions + dirty_regions.erase(flush_interval); } void RasterizerCacheOpenGL::FlushAll() { - for (auto& surfaces : surface_cache) { - for (auto& surface : surfaces.second) { - FlushSurface(surface.get()); + FlushRegion(0, 0xFFFFFFFF); +} + +void RasterizerCacheOpenGL::InvalidateRegion(PAddr addr, u32 size, const Surface& region_owner) { + if (size == 0) + return; + + const auto invalid_interval = SurfaceInterval::right_open(addr, addr + size); + + if (region_owner != nullptr) { + ASSERT(region_owner->type != SurfaceType::Texture); + ASSERT(addr >= region_owner->addr && addr + size <= region_owner->end); + ASSERT(region_owner->width == region_owner->stride); // Surfaces can't have a gap + region_owner->gl_buffer_dirty = true; + region_owner->invalid_regions.erase(invalid_interval); + } + + SurfaceSet remove_surfaces; + + for (auto& pair : RangeFromInterval(surface_cache, invalid_interval)) { + for (auto& cached_surface : pair.second) { + if (cached_surface == region_owner) + continue; + + // If cpu is invalidating this region we want to remove it + // to (likely) mark the memory pages as uncached + // but before that we have to flush its region that is still valid + if (region_owner == nullptr) { + const auto flush_intervals = SurfaceRegions(cached_surface->GetInterval()) - invalid_interval; + for (const auto& interval : flush_intervals) { + FlushRegion(boost::icl::first(interval), boost::icl::length(interval)); + } + remove_surfaces.emplace(cached_surface); + continue; + } + + const auto interval = cached_surface->GetInterval() & invalid_interval; + + cached_surface->invalid_regions.insert(interval); + + // Remove only "empty" fill surfaces to avoid destroying and recreating OGL textures + if (cached_surface->type == SurfaceType::Fill && + !cached_surface->IsRegionPartiallyValid(cached_surface->GetInterval())) + 
remove_surfaces.emplace(cached_surface); } } + + if (region_owner != nullptr) + dirty_regions.set(std::make_pair(invalid_interval, region_owner)); + else + dirty_regions.erase(invalid_interval); + + for (auto& remove_surface : remove_surfaces) + UnregisterSurface(remove_surface); +} + +Surface RasterizerCacheOpenGL::CreateSurface(const SurfaceParams& params) { + Surface surface = std::make_shared(); + static_cast(*surface) = params; + + surface->texture.Create(); + + // OpenGL needs 4 bpp alignment for D24 since using GL_UNSIGNED_INT as type + surface->gl_bytes_per_pixel = + (surface->pixel_format == PixelFormat::D24 || surface->type == SurfaceType::Texture) ? + 4 : + surface->bytes_per_pixel; + + surface->gl_buffer_offset = (surface->pixel_format == PixelFormat::D24) ? 1 : 0; + + surface->gl_buffer_dirty = false; + surface->invalid_regions.insert(surface->GetInterval()); + AllocateSurfaceTexture(surface->texture.handle, + GetFormatTuple(surface->pixel_format), + surface->GetScaledWidth(), + surface->GetScaledHeight()); + + return surface; +} + +void RasterizerCacheOpenGL::RegisterSurface(const Surface& surface) { + surface_cache.add(std::make_pair(surface->GetInterval(), SurfaceSet({ surface }))); + UpdatePagesCachedCount(surface->addr, surface->size, 1); +} + +void RasterizerCacheOpenGL::UnregisterSurface(const Surface& surface) { + UpdatePagesCachedCount(surface->addr, surface->size, -1); + surface_cache.subtract(std::make_pair(surface->GetInterval(), SurfaceSet({ surface }))); +} + +void RasterizerCacheOpenGL::UpdatePagesCachedCount(PAddr addr, u32 size, int delta) { + const u32 num_pages = ((addr + size - 1) >> Memory::PAGE_BITS) - (addr >> Memory::PAGE_BITS) + 1; + const u32 page_start = addr >> Memory::PAGE_BITS; + const u32 page_end = page_start + num_pages; + + // Interval maps will erase segments if count reaches 0, so if delta is negative we have to subtract after iterating + const auto pages_interval = PageMap::interval_type::right_open(page_start, 
page_end); + if (delta > 0) + cached_pages.add(std::make_pair(pages_interval, delta)); + + for (auto& pair : RangeFromInterval(cached_pages, pages_interval)) { + const auto interval = pair.first & pages_interval; + const int count = pair.second; + + const PAddr interval_start_addr = boost::icl::first(interval) << Memory::PAGE_BITS; + const PAddr interval_end_addr = boost::icl::last_next(interval) << Memory::PAGE_BITS; + const u32 interval_size = interval_end_addr - interval_start_addr; + + if (delta > 0 && count == delta) + Memory::RasterizerMarkRegionCached(interval_start_addr, interval_size, true); + else if (delta < 0 && count == -delta) + Memory::RasterizerMarkRegionCached(interval_start_addr, interval_size, false); + else + ASSERT(count >= 0); + } + + if (delta < 0) + cached_pages.add(std::make_pair(pages_interval, delta)); } diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h index aea20c693..4b84e8ad6 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h +++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h @@ -12,6 +12,7 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-local-typedef" #endif +#include #include #ifdef __GNUC__ #pragma GCC diagnostic pop @@ -20,21 +21,32 @@ #include "common/assert.h" #include "common/common_funcs.h" #include "common/common_types.h" +#include "common/math_util.h" #include "core/hw/gpu.h" #include "video_core/regs_framebuffer.h" #include "video_core/regs_texturing.h" #include "video_core/renderer_opengl/gl_resource_manager.h" -namespace MathUtil { -template -struct Rectangle; -} - struct CachedSurface; +using Surface = std::shared_ptr; +using SurfaceSet = std::set; -using SurfaceCache = boost::icl::interval_map>>; +using SurfaceRegions = boost::icl::interval_set; +using SurfaceMap = boost::icl::interval_map; +using SurfaceCache = boost::icl::interval_map; + +using SurfaceInterval = SurfaceCache::interval_type; 
+static_assert(std::is_same() && + std::is_same(), "incorrect interval types"); + +using SurfaceRect_Tuple = std::tuple>; +using SurfaceSurfaceRect_Tuple = std::tuple>; + +using PageMap = boost::icl::interval_map; + +struct SurfaceParams { + explicit SurfaceParams(); -struct CachedSurface { enum class PixelFormat { // First 5 formats are shared between textures and color buffers RGBA8 = 0, @@ -68,10 +80,11 @@ struct CachedSurface { Texture = 1, Depth = 2, DepthStencil = 3, - Invalid = 4, + Fill = 4, + Invalid = 5 }; - static unsigned int GetFormatBpp(CachedSurface::PixelFormat format) { + static unsigned int GetFormatBpp(SurfaceParams::PixelFormat format) { static const std::array bpp_table = { 32, // RGBA8 24, // RGB8 @@ -162,31 +175,98 @@ struct CachedSurface { return SurfaceType::Invalid; } + /// Update the params "size", "end", "bytes_per_pixel" and "type" from the already set "addr", "width", "height" and "pixel_format" + void UpdateParams() { + size = width * height * GetFormatBpp(pixel_format) / 8; + + if (stride == 0) + stride = width; + else + size += (stride - width) * (height - 1) * GetFormatBpp(pixel_format) / 8; + + end = addr + size; + type = GetFormatType(pixel_format); + bytes_per_pixel = GetFormatBpp(pixel_format) / 8; + } + + SurfaceInterval GetInterval() const { + return SurfaceInterval::right_open(addr, end); + } + u32 GetScaledWidth() const { - return (u32)(width * res_scale_width); + return static_cast(width * res_scale_width); } u32 GetScaledHeight() const { - return (u32)(height * res_scale_height); + return static_cast(height * res_scale_height); } - PAddr addr; - u32 size; + MathUtil::Rectangle GetRect() const { + return MathUtil::Rectangle(0, 0, width, height); + } - PAddr min_valid; - PAddr max_valid; + MathUtil::Rectangle GetScaledRect() const { + return MathUtil::Rectangle(0, 0, GetScaledWidth(), GetScaledHeight()); + } - OGLTexture texture; - u32 width; - u32 height; - /// Stride between lines, in pixels. 
Only valid for images in linear format. - u32 pixel_stride = 0; + u32 PixelsInBytes(u32 size) const { + return size * 8 / GetFormatBpp(pixel_format); + } + + PAddr addr = 0; + PAddr end = 0; + u32 size = 0; + + u32 width = 0; + u32 height = 0; + u32 stride = 0; float res_scale_width = 1.f; float res_scale_height = 1.f; - bool is_tiled; - PixelFormat pixel_format; - bool dirty; + bool is_tiled = false; + u32 bytes_per_pixel = 0; + PixelFormat pixel_format = PixelFormat::Invalid; + SurfaceType type = SurfaceType::Invalid; +}; + +struct CachedSurface : SurfaceParams { + bool ExactMatch(const SurfaceParams& other_surface) const; + bool CanSubRect(const SurfaceParams& sub_surface) const; + bool CanCopy(const SurfaceParams& dest_surface) const; + + MathUtil::Rectangle GetSubRect(const SurfaceParams& sub_surface) const; + MathUtil::Rectangle GetScaledSubRect(const SurfaceParams& sub_surface) const; + + bool IsRegionValid(const SurfaceInterval& interval) const { + return (invalid_regions.find(interval) == invalid_regions.end()); + } + + bool IsRegionPartiallyValid(const SurfaceInterval& interval) const { + const auto it = invalid_regions.find(interval); + if (it == invalid_regions.end()) + return true; + return ((boost::icl::first(*it) > addr) || (boost::icl::last_next(*it) < end)); + } + + SurfaceRegions invalid_regions; + + u32 fill_size = 0; /// Number of bytes to read from fill_data + std::array fill_data; + + OGLTexture texture; + + u32 gl_bytes_per_pixel; + int gl_buffer_offset; + std::vector gl_buffer; + bool gl_buffer_dirty; + + // Read/Write data in 3DS memory to/from gl_buffer + void LoadGLBuffer(PAddr load_start, PAddr load_end); + void FlushGLBuffer(PAddr flush_start, PAddr flush_end); + + // Upload/Download data in gl_buffer in/to this surface's texture + void UploadGLTexture(); + void DownloadGLTexture(); }; class RasterizerCacheOpenGL : NonCopyable { @@ -194,46 +274,56 @@ public: RasterizerCacheOpenGL(); ~RasterizerCacheOpenGL(); - /// Blits one texture to 
another - void BlitTextures(GLuint src_tex, GLuint dst_tex, CachedSurface::SurfaceType type, - const MathUtil::Rectangle& src_rect, - const MathUtil::Rectangle& dst_rect); + /// Blit one surface's texture to another + bool BlitSurfaces(const Surface& src_surface, const MathUtil::Rectangle& src_rect, + const Surface& dst_surface, const MathUtil::Rectangle& dst_rect); - /// Attempt to blit one surface's texture to another - bool TryBlitSurfaces(CachedSurface* src_surface, const MathUtil::Rectangle& src_rect, - CachedSurface* dst_surface, const MathUtil::Rectangle& dst_rect); - - /// Loads a texture from 3DS memory to OpenGL and caches it (if not already cached) - CachedSurface* GetSurface(const CachedSurface& params, bool match_res_scale, - bool load_if_create); + /// Load a texture from 3DS memory to OpenGL and cache it (if not already cached) + Surface GetSurface(const SurfaceParams& params, bool match_res_scale, bool load_if_create); /// Attempt to find a subrect (resolution scaled) of a surface, otherwise loads a texture from /// 3DS memory to OpenGL and caches it (if not already cached) - CachedSurface* GetSurfaceRect(const CachedSurface& params, bool match_res_scale, - bool load_if_create, MathUtil::Rectangle& out_rect); + SurfaceRect_Tuple GetSurfaceSubRect(const SurfaceParams& params, bool match_res_scale, + bool load_if_create); - /// Gets a surface based on the texture configuration - CachedSurface* GetTextureSurface(const Pica::TexturingRegs::FullTextureConfig& config); + /// Get a surface based on the texture configuration + Surface GetTextureSurface(const Pica::TexturingRegs::FullTextureConfig& config); - /// Gets the color and depth surfaces and rect (resolution scaled) based on the framebuffer - /// configuration - std::tuple> GetFramebufferSurfaces( - const Pica::FramebufferRegs::FramebufferConfig& config); + /// Get the color and depth surfaces based on the framebuffer configuration + SurfaceSurfaceRect_Tuple GetFramebufferSurfaces(bool 
using_color_fb, bool using_depth_fb); - /// Attempt to get a surface that exactly matches the fill region and format - CachedSurface* TryGetFillSurface(const GPU::Regs::MemoryFillConfig& config); + /// Get a surface that matches the fill config + Surface GetFillSurface(const GPU::Regs::MemoryFillConfig& config); - /// Write the surface back to memory - void FlushSurface(CachedSurface* surface); + /// Get a surface that matches a "texture copy" display transfer config + SurfaceRect_Tuple GetTexCopySurface(const SurfaceParams& params); - /// Write any cached resources overlapping the region back to memory (if dirty) and optionally - /// invalidate them in the cache - void FlushRegion(PAddr addr, u32 size, const CachedSurface* skip_surface, bool invalidate); + /// Write any cached resources overlapping the region back to memory (if dirty) + void FlushRegion(PAddr addr, u32 size); + + /// Mark region as being invalidated by region_owner (nullptr if 3DS memory) + void InvalidateRegion(PAddr addr, u32 size, const Surface& region_owner); /// Flush all cached resources tracked by this cache manager void FlushAll(); private: + /// Update surface's texture for given region when necessary + void ValidateSurface(const Surface& surface, PAddr addr, u32 size); + + /// Create a new surface + Surface CreateSurface(const SurfaceParams& params); + + /// Register surface into the cache + void RegisterSurface(const Surface& surface); + + /// Remove surface from the cache + void UnregisterSurface(const Surface& surface); + + /// Increase/decrease the number of surface in pages touching the specified region + void UpdatePagesCachedCount(PAddr addr, u32 size, int delta); + SurfaceCache surface_cache; - OGLFramebuffer transfer_framebuffers[2]; + SurfaceMap dirty_regions; + PageMap cached_pages; }; diff --git a/src/video_core/swrasterizer/swrasterizer.h b/src/video_core/swrasterizer/swrasterizer.h index 6d42d7409..6c524f013 100644 --- a/src/video_core/swrasterizer/swrasterizer.h +++ 
b/src/video_core/swrasterizer/swrasterizer.h @@ -22,6 +22,7 @@ class SWRasterizer : public RasterizerInterface { void NotifyPicaRegisterChanged(u32 id) override {} void FlushAll() override {} void FlushRegion(PAddr addr, u32 size) override {} + void InvalidateRegion(PAddr addr, u32 size) override {} void FlushAndInvalidateRegion(PAddr addr, u32 size) override {} }; }