diff --git a/src/common/vector_math.h b/src/common/vector_math.h index 7ca8e15f5..e029718bd 100644 --- a/src/common/vector_math.h +++ b/src/common/vector_math.h @@ -407,6 +407,11 @@ inline float Vec3::Normalize() { return length; } +template <> +inline unsigned int Vec3::ToRGB() const { + return (z << 16) | (y << 8) | x; +} + typedef Vec3 Vec3f; template @@ -611,6 +616,11 @@ public: #undef _DEFINE_SWIZZLER3 }; +template <> +inline unsigned int Vec4::ToRGBA() const { + return (w << 24) | (z << 16) | (y << 8) | x; +} + template Vec4 operator*(const V& f, const Vec4& vec) { return MakeVec(f * vec.x, f * vec.y, f * vec.z, f * vec.w); diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 2522064e7..b33869c22 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -1,14 +1,14 @@ set(SRCS - texture/internal/morton.cpp - texture/internal/etc1.cpp - texture/codec.cpp - texture/internal/codecs.cpp renderer_opengl/gl_rasterizer.cpp renderer_opengl/gl_rasterizer_cache.cpp renderer_opengl/gl_shader_gen.cpp renderer_opengl/gl_shader_util.cpp renderer_opengl/gl_state.cpp renderer_opengl/renderer_opengl.cpp + texture/internal/morton.cpp + texture/internal/etc1.cpp + texture/internal/codecs.cpp + texture/codec.cpp debug_utils/debug_utils.cpp clipper.cpp command_processor.cpp @@ -25,12 +25,6 @@ set(SRCS set(HEADERS debug_utils/debug_utils.h - texture/internal/texture_utils.h - texture/internal/morton.h - texture/internal/etc1.h - texture/codec.h - texture/formats.h - texture/internal/codecs.h renderer_opengl/gl_rasterizer.h renderer_opengl/gl_rasterizer_cache.h renderer_opengl/gl_resource_manager.h @@ -39,6 +33,12 @@ set(HEADERS renderer_opengl/gl_state.h renderer_opengl/pica_to_gl.h renderer_opengl/renderer_opengl.h + texture/internal/texture_utils.h + texture/internal/morton.h + texture/internal/etc1.h + texture/internal/codecs.h + texture/codec.h + texture/formats.h clipper.h command_processor.h gpu_debugger.h diff --git 
a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 5a306a5c8..089d9328c 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -21,6 +21,7 @@ #include "video_core/renderer_opengl/gl_shader_util.h" #include "video_core/renderer_opengl/pica_to_gl.h" #include "video_core/renderer_opengl/renderer_opengl.h" +#include "video_core/texture/formats.h" MICROPROFILE_DEFINE(OpenGL_Drawing, "OpenGL", "Drawing", MP_RGB(128, 128, 192)); MICROPROFILE_DEFINE(OpenGL_Blits, "OpenGL", "Blits", MP_RGB(100, 100, 255)); @@ -716,7 +717,6 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(PAddr addr, u32 size) { bool RasterizerOpenGL::AccelerateDisplayTransfer(const GPU::Regs::DisplayTransferConfig& config) { MICROPROFILE_SCOPE(OpenGL_Blits); - using PixelFormat = CachedSurface::PixelFormat; using SurfaceType = CachedSurface::SurfaceType; CachedSurface src_params; @@ -728,7 +728,7 @@ bool RasterizerOpenGL::AccelerateDisplayTransfer(const GPU::Regs::DisplayTransfe // the image, and it allows for smaller texture cache lookup rectangles. src_params.height = config.output_height; src_params.is_tiled = !config.input_linear; - src_params.pixel_format = CachedSurface::PixelFormatFromGPUPixelFormat(config.input_format); + src_params.pixel_format = Pica::Texture::Format::FromGPUPixelFormat(config.input_format); CachedSurface dst_params; dst_params.addr = config.GetPhysicalOutputAddress(); @@ -737,7 +737,7 @@ bool RasterizerOpenGL::AccelerateDisplayTransfer(const GPU::Regs::DisplayTransfe dst_params.height = config.scaling == config.ScaleXY ? 
config.output_height / 2 : config.output_height.Value(); dst_params.is_tiled = config.input_linear != config.dont_swizzle; - dst_params.pixel_format = CachedSurface::PixelFormatFromGPUPixelFormat(config.output_format); + dst_params.pixel_format = Pica::Texture::Format::FromGPUPixelFormat(config.output_format); MathUtil::Rectangle src_rect; CachedSurface* src_surface = res_cache.GetSurfaceRect(src_params, false, true, src_rect); @@ -776,7 +776,7 @@ bool RasterizerOpenGL::AccelerateDisplayTransfer(const GPU::Regs::DisplayTransfe } u32 dst_size = dst_params.width * dst_params.height * - CachedSurface::GetFormatBpp(dst_params.pixel_format) / 8; + Pica::Texture::Format::GetBpp(dst_params.pixel_format) / 8; dst_surface->dirty = true; res_cache.FlushRegion(config.GetPhysicalOutputAddress(), dst_size, dst_surface, true); return true; @@ -789,7 +789,6 @@ bool RasterizerOpenGL::AccelerateTextureCopy(const GPU::Regs::DisplayTransferCon bool RasterizerOpenGL::AccelerateFill(const GPU::Regs::MemoryFillConfig& config) { MICROPROFILE_SCOPE(OpenGL_Blits); - using PixelFormat = CachedSurface::PixelFormat; using SurfaceType = CachedSurface::SurfaceType; CachedSurface* dst_surface = res_cache.TryGetFillSurface(config); @@ -824,7 +823,7 @@ bool RasterizerOpenGL::AccelerateFill(const GPU::Regs::MemoryFillConfig& config) if (config.fill_24bit) { switch (dst_surface->pixel_format) { - case PixelFormat::RGB8: + case Pica::Texture::Format::Type::RGB8: color_values[0] = config.value_24bit_r / 255.0f; color_values[1] = config.value_24bit_g / 255.0f; color_values[2] = config.value_24bit_b / 255.0f; @@ -836,7 +835,7 @@ bool RasterizerOpenGL::AccelerateFill(const GPU::Regs::MemoryFillConfig& config) u32 value = config.value_32bit; switch (dst_surface->pixel_format) { - case PixelFormat::RGBA8: + case Pica::Texture::Format::Type::RGBA8: color_values[0] = (value >> 24) / 255.0f; color_values[1] = ((value >> 16) & 0xFF) / 255.0f; color_values[2] = ((value >> 8) & 0xFF) / 255.0f; @@ -850,34 +849,34 
@@ bool RasterizerOpenGL::AccelerateFill(const GPU::Regs::MemoryFillConfig& config) Math::Vec4 color; switch (dst_surface->pixel_format) { - case PixelFormat::RGBA8: + case Pica::Texture::Format::Type::RGBA8: color_values[0] = (value_16bit >> 8) / 255.0f; color_values[1] = (value_16bit & 0xFF) / 255.0f; color_values[2] = color_values[0]; color_values[3] = color_values[1]; break; - case PixelFormat::RGB5A1: + case Pica::Texture::Format::Type::RGB5A1: color = Color::DecodeRGB5A1((const u8*)&value_16bit); color_values[0] = color[0] / 31.0f; color_values[1] = color[1] / 31.0f; color_values[2] = color[2] / 31.0f; color_values[3] = color[3]; break; - case PixelFormat::RGB565: + case Pica::Texture::Format::Type::RGB565: color = Color::DecodeRGB565((const u8*)&value_16bit); color_values[0] = color[0] / 31.0f; color_values[1] = color[1] / 63.0f; color_values[2] = color[2] / 31.0f; break; - case PixelFormat::RGBA4: + case Pica::Texture::Format::Type::RGBA4: color = Color::DecodeRGBA4((const u8*)&value_16bit); color_values[0] = color[0] / 15.0f; color_values[1] = color[1] / 15.0f; color_values[2] = color[2] / 15.0f; color_values[3] = color[3] / 15.0f; break; - case PixelFormat::IA8: - case PixelFormat::RG8: + case Pica::Texture::Format::Type::IA8: + case Pica::Texture::Format::Type::RG8: color_values[0] = (value_16bit >> 8) / 255.0f; color_values[1] = (value_16bit & 0xFF) / 255.0f; break; @@ -899,9 +898,9 @@ bool RasterizerOpenGL::AccelerateFill(const GPU::Regs::MemoryFillConfig& config) glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0); GLfloat value_float; - if (dst_surface->pixel_format == CachedSurface::PixelFormat::D16) { + if (dst_surface->pixel_format == Pica::Texture::Format::Type::D16) { value_float = config.value_32bit / 65535.0f; // 2^16 - 1 - } else if (dst_surface->pixel_format == CachedSurface::PixelFormat::D24) { + } else if (dst_surface->pixel_format == Pica::Texture::Format::Type::D24) { value_float = config.value_32bit / 
16777215.0f; // 2^24 - 1 } @@ -945,7 +944,7 @@ bool RasterizerOpenGL::AccelerateDisplay(const GPU::Regs::FramebufferConfig& con src_params.height = config.height; src_params.pixel_stride = pixel_stride; src_params.is_tiled = false; - src_params.pixel_format = CachedSurface::PixelFormatFromGPUPixelFormat(config.color_format); + src_params.pixel_format = Pica::Texture::Format::FromGPUPixelFormat(config.color_format); MathUtil::Rectangle src_rect; CachedSurface* src_surface = res_cache.GetSurfaceRect(src_params, false, true, src_rect); diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp index ef3b06a7b..618a4e1f7 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp @@ -21,6 +21,8 @@ #include "video_core/pica_state.h" #include "video_core/renderer_opengl/gl_rasterizer_cache.h" #include "video_core/renderer_opengl/gl_state.h" +#include "video_core/texture/codec.h" +#include "video_core/texture/formats.h" #include "video_core/utils.h" #include "video_core/video_core.h" @@ -30,21 +32,48 @@ struct FormatTuple { GLenum type; }; -static const std::array fb_format_tuples = {{ - {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8}, // RGBA8 - {GL_RGB8, GL_BGR, GL_UNSIGNED_BYTE}, // RGB8 - {GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_5_5_5_1}, // RGB5A1 - {GL_RGB565, GL_RGB, GL_UNSIGNED_SHORT_5_6_5}, // RGB565 - {GL_RGBA4, GL_RGBA, GL_UNSIGNED_SHORT_4_4_4_4}, // RGBA4 -}}; - -static const std::array depth_format_tuples = {{ +static const std::array format_tuples = {{ + {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8}, // RGBA8 + {GL_RGB8, GL_BGR, GL_UNSIGNED_BYTE}, // RGB8 + {GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_5_5_5_1}, // RGB5A1 + {GL_RGB565, GL_RGB, GL_UNSIGNED_SHORT_5_6_5}, // RGB565 + {GL_RGBA4, GL_RGBA, GL_UNSIGNED_SHORT_4_4_4_4}, // RGBA4 + {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // IA8 + {GL_RG8, GL_RG8, GL_UNSIGNED_BYTE}, // RG8 + 
{GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // I8 + {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // A8 + {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // IA4 + {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // I4 + {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // A4 + {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // ETC1 + {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // ETC1A4 {GL_DEPTH_COMPONENT16, GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT}, // D16 {}, {GL_DEPTH_COMPONENT24, GL_DEPTH_COMPONENT, GL_UNSIGNED_INT}, // D24 {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, // D24S8 }}; +static const std::array native_format = { + true, // RGBA8 + true, // RGB8 + true, // RGB5A1 + true, // RGB565 + true, // RGBA4 + false, // IA8 + true, // RG8 + false, // I8 + false, // A8 + false, // IA4 + false, // I4 + false, // A4 + false, // ETC1 + false, // ETC1A4 + true, // D16 + false, + false, // D24 + false, // D24S8 +}; + RasterizerCacheOpenGL::RasterizerCacheOpenGL() { transfer_framebuffers[0].Create(); transfer_framebuffers[1].Create(); @@ -54,55 +83,6 @@ RasterizerCacheOpenGL::~RasterizerCacheOpenGL() { FlushAll(); } -static void MortonCopyPixels(CachedSurface::PixelFormat pixel_format, u32 width, u32 height, - u32 bytes_per_pixel, u32 gl_bytes_per_pixel, u8* morton_data, - u8* gl_data, bool morton_to_gl) { - using PixelFormat = CachedSurface::PixelFormat; - - u8* data_ptrs[2]; - u32 depth_stencil_shifts[2] = {24, 8}; - - if (morton_to_gl) { - std::swap(depth_stencil_shifts[0], depth_stencil_shifts[1]); - } - - if (pixel_format == PixelFormat::D24S8) { - for (unsigned y = 0; y < height; ++y) { - for (unsigned x = 0; x < width; ++x) { - const u32 coarse_y = y & ~7; - u32 morton_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + - coarse_y * width * bytes_per_pixel; - u32 gl_pixel_index = (x + (height - 1 - y) * width) * gl_bytes_per_pixel; - - data_ptrs[morton_to_gl] = morton_data + morton_offset; - data_ptrs[!morton_to_gl] = &gl_data[gl_pixel_index]; - - // Swap depth and stencil value ordering 
since 3DS does not match OpenGL - u32 depth_stencil; - memcpy(&depth_stencil, data_ptrs[1], sizeof(u32)); - depth_stencil = (depth_stencil << depth_stencil_shifts[0]) | - (depth_stencil >> depth_stencil_shifts[1]); - - memcpy(data_ptrs[0], &depth_stencil, sizeof(u32)); - } - } - } else { - for (unsigned y = 0; y < height; ++y) { - for (unsigned x = 0; x < width; ++x) { - const u32 coarse_y = y & ~7; - u32 morton_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + - coarse_y * width * bytes_per_pixel; - u32 gl_pixel_index = (x + (height - 1 - y) * width) * gl_bytes_per_pixel; - - data_ptrs[morton_to_gl] = morton_data + morton_offset; - data_ptrs[!morton_to_gl] = &gl_data[gl_pixel_index]; - - memcpy(data_ptrs[0], data_ptrs[1], bytes_per_pixel); - } - } - } -} - void RasterizerCacheOpenGL::BlitTextures(GLuint src_tex, GLuint dst_tex, CachedSurface::SurfaceType type, const MathUtil::Rectangle& src_rect, @@ -184,7 +164,7 @@ bool RasterizerCacheOpenGL::TryBlitSurfaces(CachedSurface* src_surface, return true; } -static void AllocateSurfaceTexture(GLuint texture, CachedSurface::PixelFormat pixel_format, +static void AllocateSurfaceTexture(GLuint texture, Pica::Texture::Format::Type pixel_format, u32 width, u32 height) { // Allocate an uninitialized texture of appropriate size and format for the surface using SurfaceType = CachedSurface::SurfaceType; @@ -199,17 +179,8 @@ static void AllocateSurfaceTexture(GLuint texture, CachedSurface::PixelFormat pi SurfaceType type = CachedSurface::GetFormatType(pixel_format); - FormatTuple tuple; - if (type == SurfaceType::Color) { - ASSERT((size_t)pixel_format < fb_format_tuples.size()); - tuple = fb_format_tuples[(unsigned int)pixel_format]; - } else if (type == SurfaceType::Depth || type == SurfaceType::DepthStencil) { - size_t tuple_idx = (size_t)pixel_format - 14; - ASSERT(tuple_idx < depth_format_tuples.size()); - tuple = depth_format_tuples[tuple_idx]; - } else { - tuple = {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}; - } + 
ASSERT((size_t)pixel_format < format_tuples.size()); + FormatTuple tuple = format_tuples[(unsigned int)pixel_format]; glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, width, height, 0, tuple.format, tuple.type, nullptr); @@ -227,7 +198,7 @@ static void AllocateSurfaceTexture(GLuint texture, CachedSurface::PixelFormat pi MICROPROFILE_DEFINE(OpenGL_SurfaceUpload, "OpenGL", "Surface Upload", MP_RGB(128, 64, 192)); CachedSurface* RasterizerCacheOpenGL::GetSurface(const CachedSurface& params, bool match_res_scale, bool load_if_create) { - using PixelFormat = CachedSurface::PixelFormat; + using PixelFormat = Pica::Texture::Format::Type; using SurfaceType = CachedSurface::SurfaceType; if (params.addr == 0) { @@ -235,7 +206,7 @@ CachedSurface* RasterizerCacheOpenGL::GetSurface(const CachedSurface& params, bo } u32 params_size = - params.width * params.height * CachedSurface::GetFormatBpp(params.pixel_format) / 8; + params.width * params.height * Pica::Texture::Format::GetBpp(params.pixel_format) / 8; // Check for an exact match in existing surfaces CachedSurface* best_exact_surface = nullptr; @@ -320,72 +291,36 @@ CachedSurface* RasterizerCacheOpenGL::GetSurface(const CachedSurface& params, bo if (!new_surface->is_tiled) { // TODO: Ensure this will always be a color format, not a depth or other format - ASSERT((size_t)new_surface->pixel_format < fb_format_tuples.size()); - const FormatTuple& tuple = fb_format_tuples[(unsigned int)params.pixel_format]; + // ASSERT((size_t)new_surface->pixel_format < format_tuples.size()); + const FormatTuple& tuple = format_tuples[(unsigned int)params.pixel_format]; glPixelStorei(GL_UNPACK_ROW_LENGTH, (GLint)new_surface->pixel_stride); glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height, 0, tuple.format, tuple.type, texture_src_data); glPixelStorei(GL_UNPACK_ROW_LENGTH, 0); } else { - SurfaceType type = CachedSurface::GetFormatType(new_surface->pixel_format); - if (type != SurfaceType::Depth && type != 
SurfaceType::DepthStencil) { - FormatTuple tuple; - if ((size_t)params.pixel_format < fb_format_tuples.size()) { - tuple = fb_format_tuples[(unsigned int)params.pixel_format]; - } else { - // Texture - tuple = {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}; - } - - std::vector> tex_buffer(params.width * params.height); - - Pica::DebugUtils::TextureInfo tex_info; - tex_info.width = params.width; - tex_info.height = params.height; - tex_info.stride = - params.width * CachedSurface::GetFormatBpp(params.pixel_format) / 8; - tex_info.format = (Pica::Regs::TextureFormat)params.pixel_format; - tex_info.physical_address = params.addr; - - for (unsigned y = 0; y < params.height; ++y) { - for (unsigned x = 0; x < params.width; ++x) { - tex_buffer[x + params.width * y] = Pica::DebugUtils::LookupTexture( - texture_src_data, x, params.height - 1 - y, tex_info); - } - } - - glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height, - 0, GL_RGBA, GL_UNSIGNED_BYTE, tex_buffer.data()); - } else { - // Depth/Stencil formats need special treatment since they aren't sampleable using - // LookupTexture and can't use RGBA format - size_t tuple_idx = (size_t)params.pixel_format - 14; - ASSERT(tuple_idx < depth_format_tuples.size()); - const FormatTuple& tuple = depth_format_tuples[tuple_idx]; - - u32 bytes_per_pixel = CachedSurface::GetFormatBpp(params.pixel_format) / 8; - - // OpenGL needs 4 bpp alignment for D24 since using GL_UNSIGNED_INT as type - bool use_4bpp = (params.pixel_format == PixelFormat::D24); - - u32 gl_bytes_per_pixel = use_4bpp ? 4 : bytes_per_pixel; - - std::vector temp_fb_depth_buffer(params.width * params.height * - gl_bytes_per_pixel); - - u8* temp_fb_depth_buffer_ptr = - use_4bpp ? 
temp_fb_depth_buffer.data() + 1 : temp_fb_depth_buffer.data(); - - MortonCopyPixels(params.pixel_format, params.width, params.height, bytes_per_pixel, - gl_bytes_per_pixel, texture_src_data, temp_fb_depth_buffer_ptr, - true); - - glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height, - 0, tuple.format, tuple.type, temp_fb_depth_buffer.data()); - } + const FormatTuple& tuple = format_tuples[(unsigned int)params.pixel_format]; + std::unique_ptr tmp = Pica::Texture::CodecFactory::build( + // clang-format off + params.pixel_format, texture_src_data, params.width, params.height + // clang-format on + ); + Pica::Texture::Codec* codec = tmp.get(); + codec->configTiling(true, 8); // change 8 for 32 in case the mage is tiled + // on blocks of 32x32 + codec->configRGBATransform(!native_format[(unsigned int)params.pixel_format]); + codec->decode(); + std::unique_ptr decoded_texture = codec->transferInternalBuffer(); + u32 bytes = codec->getInternalBytesPerPixel(); + if (bytes == 3) + bytes = 1; + else if (bytes != 2) + bytes = 4; + glPixelStorei(GL_UNPACK_ALIGNMENT, bytes); + glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height, 0, + tuple.format, tuple.type, decoded_texture.get()); + glPixelStorei(GL_UNPACK_ALIGNMENT, 4); } - // If not 1x scale, blit 1x texture to a new scaled texture and replace texture in surface if (new_surface->res_scale_width != 1.f || new_surface->res_scale_height != 1.f) { OGLTexture scaled_texture; @@ -430,7 +365,7 @@ CachedSurface* RasterizerCacheOpenGL::GetSurfaceRect(const CachedSurface& params } u32 total_pixels = params.width * params.height; - u32 params_size = total_pixels * CachedSurface::GetFormatBpp(params.pixel_format) / 8; + u32 params_size = total_pixels * Pica::Texture::Format::GetBpp(params.pixel_format) / 8; // Attempt to find encompassing surfaces CachedSurface* best_subrect_surface = nullptr; @@ -467,7 +402,7 @@ CachedSurface* RasterizerCacheOpenGL::GetSurfaceRect(const 
CachedSurface& params // Return the best subrect surface if found if (best_subrect_surface != nullptr) { unsigned int bytes_per_pixel = - (CachedSurface::GetFormatBpp(best_subrect_surface->pixel_format) / 8); + (Pica::Texture::Format::GetBpp(best_subrect_surface->pixel_format) / 8); int x0, y0; @@ -521,7 +456,7 @@ CachedSurface* RasterizerCacheOpenGL::GetTextureSurface( params.width = info.width; params.height = info.height; params.is_tiled = true; - params.pixel_format = CachedSurface::PixelFormatFromTextureFormat(info.format); + params.pixel_format = Pica::Texture::Format::FromTextureFormat(info.format); return GetSurface(params, false, true); } @@ -574,10 +509,10 @@ RasterizerCacheOpenGL::GetFramebufferSurfaces(const Pica::Regs::FramebufferConfi } color_params.addr = config.GetColorBufferPhysicalAddress(); - color_params.pixel_format = CachedSurface::PixelFormatFromColorFormat(config.color_format); + color_params.pixel_format = Pica::Texture::Format::FromColorFormat(config.color_format); depth_params.addr = config.GetDepthBufferPhysicalAddress(); - depth_params.pixel_format = CachedSurface::PixelFormatFromDepthFormat(config.depth_format); + depth_params.pixel_format = Pica::Texture::Format::FromDepthFormat(config.depth_format); MathUtil::Rectangle color_rect; CachedSurface* color_surface = @@ -648,9 +583,9 @@ CachedSurface* RasterizerCacheOpenGL::TryGetFillSurface(const GPU::Regs::MemoryF CachedSurface* surface = it2->get(); if (surface->addr == config.GetStartAddress() && - CachedSurface::GetFormatBpp(surface->pixel_format) == bits_per_value && + Pica::Texture::Format::GetBpp(surface->pixel_format) == bits_per_value && (surface->width * surface->height * - CachedSurface::GetFormatBpp(surface->pixel_format) / 8) == + Pica::Texture::Format::GetBpp(surface->pixel_format) / 8) == (config.GetEndAddress() - config.GetStartAddress())) { return surface; } @@ -662,7 +597,6 @@ CachedSurface* RasterizerCacheOpenGL::TryGetFillSurface(const GPU::Regs::MemoryF 
MICROPROFILE_DEFINE(OpenGL_SurfaceDownload, "OpenGL", "Surface Download", MP_RGB(128, 192, 64)); void RasterizerCacheOpenGL::FlushSurface(CachedSurface* surface) { - using PixelFormat = CachedSurface::PixelFormat; using SurfaceType = CachedSurface::SurfaceType; if (!surface->dirty) { @@ -703,53 +637,32 @@ void RasterizerCacheOpenGL::FlushSurface(CachedSurface* surface) { if (!surface->is_tiled) { // TODO: Ensure this will always be a color format, not a depth or other format - ASSERT((size_t)surface->pixel_format < fb_format_tuples.size()); - const FormatTuple& tuple = fb_format_tuples[(unsigned int)surface->pixel_format]; + // ASSERT((size_t)surface->pixel_format < fb_format_tuples.size()); + const FormatTuple& tuple = format_tuples[(unsigned int)surface->pixel_format]; glPixelStorei(GL_PACK_ROW_LENGTH, (GLint)surface->pixel_stride); glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, dst_buffer); glPixelStorei(GL_PACK_ROW_LENGTH, 0); } else { - SurfaceType type = CachedSurface::GetFormatType(surface->pixel_format); - if (type != SurfaceType::Depth && type != SurfaceType::DepthStencil) { - ASSERT((size_t)surface->pixel_format < fb_format_tuples.size()); - const FormatTuple& tuple = fb_format_tuples[(unsigned int)surface->pixel_format]; + const FormatTuple& tuple = format_tuples[(u32)surface->pixel_format]; + u32 bytes_per_pixel = Pica::Texture::Format::GetBpp(surface->pixel_format) / 8; + if (!native_format[(u32)surface->pixel_format]) + bytes_per_pixel = 4; + std::vector temp_gl_buffer(surface->width * surface->height * bytes_per_pixel); + glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, temp_gl_buffer.data()); - u32 bytes_per_pixel = CachedSurface::GetFormatBpp(surface->pixel_format) / 8; - - std::vector temp_gl_buffer(surface->width * surface->height * bytes_per_pixel); - - glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, temp_gl_buffer.data()); - - // Directly copy pixels. 
Internal OpenGL color formats are consistent so no conversion - // is necessary. - MortonCopyPixels(surface->pixel_format, surface->width, surface->height, - bytes_per_pixel, bytes_per_pixel, dst_buffer, temp_gl_buffer.data(), - false); - } else { - // Depth/Stencil formats need special treatment since they aren't sampleable using - // LookupTexture and can't use RGBA format - size_t tuple_idx = (size_t)surface->pixel_format - 14; - ASSERT(tuple_idx < depth_format_tuples.size()); - const FormatTuple& tuple = depth_format_tuples[tuple_idx]; - - u32 bytes_per_pixel = CachedSurface::GetFormatBpp(surface->pixel_format) / 8; - - // OpenGL needs 4 bpp alignment for D24 since using GL_UNSIGNED_INT as type - bool use_4bpp = (surface->pixel_format == PixelFormat::D24); - - u32 gl_bytes_per_pixel = use_4bpp ? 4 : bytes_per_pixel; - - std::vector temp_gl_buffer(surface->width * surface->height * gl_bytes_per_pixel); - - glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, temp_gl_buffer.data()); - - u8* temp_gl_buffer_ptr = use_4bpp ? 
temp_gl_buffer.data() + 1 : temp_gl_buffer.data(); - - MortonCopyPixels(surface->pixel_format, surface->width, surface->height, - bytes_per_pixel, gl_bytes_per_pixel, dst_buffer, temp_gl_buffer_ptr, - false); - } + std::unique_ptr tmp = Pica::Texture::CodecFactory::build( + // clang-format off + surface->pixel_format, temp_gl_buffer.data(), surface->width, surface->height + // clang-format on + ); + Pica::Texture::Codec* codec = tmp.get(); + codec->configTiling(true, 8); // change 8 for 32 in case the mage is tiled + // on blocks of 32x32 + codec->configRGBATransform(!native_format[(u32)surface->pixel_format]); + codec->configPreConvertedRGBA(!native_format[(u32)surface->pixel_format]); + codec->setExternalBuffer(dst_buffer); + codec->encode(); } surface->dirty = false; diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h index b50e8292b..dc17cf6f4 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h +++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h @@ -16,6 +16,7 @@ #include "core/hw/gpu.h" #include "video_core/pica.h" #include "video_core/renderer_opengl/gl_resource_manager.h" +#include "video_core/texture/formats.h" namespace MathUtil { template @@ -27,33 +28,6 @@ struct CachedSurface; using SurfaceCache = boost::icl::interval_map>>; struct CachedSurface { - enum class PixelFormat { - // First 5 formats are shared between textures and color buffers - RGBA8 = 0, - RGB8 = 1, - RGB5A1 = 2, - RGB565 = 3, - RGBA4 = 4, - - // Texture-only formats - IA8 = 5, - RG8 = 6, - I8 = 7, - A8 = 8, - IA4 = 9, - I4 = 10, - A4 = 11, - ETC1 = 12, - ETC1A4 = 13, - - // Depth buffer-only formats - D16 = 14, - // gap - D24 = 16, - D24S8 = 17, - - Invalid = 255, - }; enum class SurfaceType { Color = 0, @@ -63,58 +37,8 @@ struct CachedSurface { Invalid = 4, }; - static unsigned int GetFormatBpp(CachedSurface::PixelFormat format) { - static const std::array bpp_table = { - 32, // RGBA8 - 24, // RGB8 
- 16, // RGB5A1 - 16, // RGB565 - 16, // RGBA4 - 16, // IA8 - 16, // RG8 - 8, // I8 - 8, // A8 - 8, // IA4 - 4, // I4 - 4, // A4 - 4, // ETC1 - 8, // ETC1A4 - 16, // D16 - 0, - 24, // D24 - 32, // D24S8 - }; - - ASSERT((unsigned int)format < ARRAY_SIZE(bpp_table)); - return bpp_table[(unsigned int)format]; - } - - static PixelFormat PixelFormatFromTextureFormat(Pica::Regs::TextureFormat format) { - return ((unsigned int)format < 14) ? (PixelFormat)format : PixelFormat::Invalid; - } - - static PixelFormat PixelFormatFromColorFormat(Pica::Regs::ColorFormat format) { - return ((unsigned int)format < 5) ? (PixelFormat)format : PixelFormat::Invalid; - } - - static PixelFormat PixelFormatFromDepthFormat(Pica::Regs::DepthFormat format) { - return ((unsigned int)format < 4) ? (PixelFormat)((unsigned int)format + 14) - : PixelFormat::Invalid; - } - - static PixelFormat PixelFormatFromGPUPixelFormat(GPU::Regs::PixelFormat format) { - switch (format) { - // RGB565 and RGB5A1 are switched in PixelFormat compared to ColorFormat - case GPU::Regs::PixelFormat::RGB565: - return PixelFormat::RGB565; - case GPU::Regs::PixelFormat::RGB5A1: - return PixelFormat::RGB5A1; - default: - return ((unsigned int)format < 5) ? 
(PixelFormat)format : PixelFormat::Invalid; - } - } - - static bool CheckFormatsBlittable(PixelFormat pixel_format_a, PixelFormat pixel_format_b) { + static bool CheckFormatsBlittable(Pica::Texture::Format::Type pixel_format_a, + Pica::Texture::Format::Type pixel_format_b) { SurfaceType a_type = GetFormatType(pixel_format_a); SurfaceType b_type = GetFormatType(pixel_format_b); @@ -134,7 +58,7 @@ struct CachedSurface { return false; } - static SurfaceType GetFormatType(PixelFormat pixel_format) { + static SurfaceType GetFormatType(Pica::Texture::Format::Type pixel_format) { if ((unsigned int)pixel_format < 5) { return SurfaceType::Color; } @@ -143,11 +67,12 @@ struct CachedSurface { return SurfaceType::Texture; } - if (pixel_format == PixelFormat::D16 || pixel_format == PixelFormat::D24) { + if (pixel_format == Pica::Texture::Format::Type::D16 || + pixel_format == Pica::Texture::Format::Type::D24) { return SurfaceType::Depth; } - if (pixel_format == PixelFormat::D24S8) { + if (pixel_format == Pica::Texture::Format::Type::D24S8) { return SurfaceType::DepthStencil; } @@ -177,7 +102,7 @@ struct CachedSurface { float res_scale_height = 1.f; bool is_tiled; - PixelFormat pixel_format; + Pica::Texture::Format::Type pixel_format; bool dirty; }; diff --git a/src/video_core/texture/codec.cpp b/src/video_core/texture/codec.cpp index 5992dcdac..f63aa29ff 100644 --- a/src/video_core/texture/codec.cpp +++ b/src/video_core/texture/codec.cpp @@ -1,6 +1,10 @@ -#include "codec.h" -#include "internal\codecs.h" -#include "internal\morton.h" +#include "common/color.h" +#include "common/math_util.h" +#include "common/swap.h" +#include "common/vector_math.h" +#include "video_core/texture/codec.h" +#include "video_core/texture/internal/codecs.h" +#include "video_core/texture/internal/morton.h" namespace Pica { namespace Texture { @@ -17,18 +21,6 @@ void Codec::encode() { this->encode_morton_pass(); }; -void Codec::setSize() { - this->start_nibbles_size = format_size; -}; - -inline void 
Codec::setWidth(u32 width) { - this->width = width; -} - -inline void Codec::setHeight(u32 height) { - this->height = height; -} - void Codec::configTiling(bool active, u32 tiling) { this->morton = true; this->morton_pass_tiling = tiling; @@ -63,15 +55,16 @@ bool Codec::invalid() { } void Codec::init(bool decode) { + this->setSize(); + this->expected_nibbles_size = this->start_nibbles_size; if (decode) { if (this->raw_RGBA) this->expected_nibbles_size = 8; } else { - this->start_nibbles_size = this->format_size; - if (this->raw_RGBA) - this->expected_nibbles_size = this->format_size; if (this->preconverted) this->start_nibbles_size = 8; + if (!this->raw_RGBA) + this->expected_nibbles_size = this->start_nibbles_size; } if (!this->external_result_buffer) { size_t buff_size = this->width * this->height * this->expected_nibbles_size / 2; @@ -80,7 +73,7 @@ void Codec::init(bool decode) { } } -inline void Codec::decode_morton_pass() { +void Codec::decode_morton_pass() { if (this->morton_pass_tiling == 8) Decoders::Morton_8x8(this->target_buffer, this->passing_buffer, this->width, this->height, this->start_nibbles_size * 4); @@ -89,7 +82,7 @@ inline void Codec::decode_morton_pass() { this->start_nibbles_size * 4); } -inline void Codec::encode_morton_pass() { +void Codec::encode_morton_pass() { if (this->morton_pass_tiling == 8) Encoders::Morton_8x8(this->target_buffer, this->passing_buffer, this->width, this->height, this->start_nibbles_size * 4); @@ -98,41 +91,41 @@ inline void Codec::encode_morton_pass() { this->start_nibbles_size * 4); } -std::unique_ptr CodecFactory::build(Format format, u8* target, u32 width, u32 height) { +std::unique_ptr CodecFactory::build(Format::Type format, u8* target, u32 width, u32 height) { switch (format) { - case Format::RGBA8: + case Format::Type::RGBA8: return std::make_unique(target, width, height); - case Format::RGB8: + case Format::Type::RGB8: return std::make_unique(target, width, height); - case Format::RGB5A1: + case 
Format::Type::RGB5A1: return std::make_unique(target, width, height); - case Format::RGB565: + case Format::Type::RGB565: return std::make_unique(target, width, height); - case Format::RGBA4: + case Format::Type::RGBA4: return std::make_unique(target, width, height); - case Format::RG8: + case Format::Type::RG8: return std::make_unique(target, width, height); - case Format::IA8: + case Format::Type::IA8: return std::make_unique(target, width, height); - case Format::I8: + case Format::Type::I8: return std::make_unique(target, width, height); - case Format::A8: + case Format::Type::A8: return std::make_unique(target, width, height); - case Format::IA4: + case Format::Type::IA4: return std::make_unique(target, width, height); - case Format::I4: + case Format::Type::I4: return std::make_unique(target, width, height); - case Format::A4: + case Format::Type::A4: return std::make_unique(target, width, height); - case Format::ETC1: + case Format::Type::ETC1: return std::make_unique(target, width, height); - case Format::ETC1A4: + case Format::Type::ETC1A4: return std::make_unique(target, width, height); - case Format::D16: + case Format::Type::D16: return std::make_unique(target, width, height); - case Format::D24: + case Format::Type::D24: return std::make_unique(target, width, height); - case Format::D24S8: + case Format::Type::D24S8: return std::make_unique(target, width, height); default: return nullptr; diff --git a/src/video_core/texture/codec.h b/src/video_core/texture/codec.h index fe873556d..1c0a6b1d9 100644 --- a/src/video_core/texture/codec.h +++ b/src/video_core/texture/codec.h @@ -1,9 +1,10 @@ + +#pragma once + #include #include #include "common/common_types.h" -#include "formats.h" - -#pragma once +#include "video_core/texture/formats.h" namespace Pica { @@ -16,18 +17,23 @@ public: this->target_buffer = target; this->setWidth(width); this->setHeight(height); - this->setSize(); - this->expected_nibbles_size = this->start_nibbles_size; } virtual ~Codec() {} 
virtual void decode(); virtual void encode(); - void setSize(); + inline void setWidth(u32 width) { + this->width = width; + } - void setWidth(u32 width); - void setHeight(u32 height); + inline void setHeight(u32 height) { + this->height = height; + } + + inline u32 getInternalBytesPerPixel() { + return this->expected_nibbles_size / 2; + } // Common Passes void configTiling(bool active, u32 tiling); @@ -54,7 +60,10 @@ protected: u32 start_nibbles_size; u32 expected_nibbles_size; - const u32 format_size = 8; + + virtual void setSize() { + this->start_nibbles_size = 8; + }; u8* target_buffer; // Initial read buffer u8* passing_buffer; // pointer aliasing: Used and modified by passes @@ -65,12 +74,12 @@ protected: typedef Codec super; - inline void decode_morton_pass(); - inline void encode_morton_pass(); + void decode_morton_pass(); + void encode_morton_pass(); }; namespace CodecFactory { -std::unique_ptr build(Pica::Texture::Format format, u8* target, u32 width, u32 height); +std::unique_ptr build(Pica::Texture::Format::Type format, u8* target, u32 width, u32 height); }; } // Texture diff --git a/src/video_core/texture/formats.h b/src/video_core/texture/formats.h index c15d40c1d..ffb24e615 100644 --- a/src/video_core/texture/formats.h +++ b/src/video_core/texture/formats.h @@ -1,36 +1,96 @@ + #pragma once +#include +#include "common/assert.h" +#include "core/hw/gpu.h" +#include "video_core/pica.h" + namespace Pica { namespace Texture { -enum class Format { - // First 5 formats are shared between textures and color buffers - RGBA8 = 0, - RGB8 = 1, - RGB5A1 = 2, - RGB565 = 3, - RGBA4 = 4, +struct Format { - // Texture-only formats - IA8 = 5, - RG8 = 6, - I8 = 7, - A8 = 8, - IA4 = 9, - I4 = 10, - A4 = 11, - ETC1 = 12, - ETC1A4 = 13, + enum class Type { + // First 5 formats are shared between textures and color buffers + RGBA8 = 0, + RGB8 = 1, + RGB5A1 = 2, + RGB565 = 3, + RGBA4 = 4, - // Depth buffer-only formats - D16 = 14, - // gap - D24 = 16, - D24S8 = 17, + // 
Texture-only formats + IA8 = 5, + RG8 = 6, + I8 = 7, + A8 = 8, + IA4 = 9, + I4 = 10, + A4 = 11, + ETC1 = 12, + ETC1A4 = 13, - Invalid = 255, -}; + // Depth buffer-only formats + D16 = 14, + // gap + D24 = 16, + D24S8 = 17, + + Invalid = 255, + }; + + static u32 GetBpp(Type format) { + static const std::array bpp_table = { + 32, // RGBA8 + 24, // RGB8 + 16, // RGB5A1 + 16, // RGB565 + 16, // RGBA4 + 16, // IA8 + 16, // RG8 + 8, // I8 + 8, // A8 + 8, // IA4 + 4, // I4 + 4, // A4 + 4, // ETC1 + 8, // ETC1A4 + 16, // D16 + 0, + 24, // D24 + 32, // D24S8 + }; + + ASSERT((u32)format < ARRAY_SIZE(bpp_table)); + return bpp_table[(u32)format]; + } + + static Type FromTextureFormat(Regs::TextureFormat format) { + return ((unsigned int)format < 14) ? (Type)format : Type::Invalid; + } + + static Type FromColorFormat(Regs::ColorFormat format) { + return ((unsigned int)format < 5) ? (Type)format : Type::Invalid; + } + + static Type FromDepthFormat(Regs::DepthFormat format) { + return ((unsigned int)format < 4) ? (Type)((unsigned int)format + 14) : Type::Invalid; + } + + static Type FromGPUPixelFormat(GPU::Regs::PixelFormat format) { + switch (format) { + // RGB565 and RGB5A1 are switched in PixelFormat compared to ColorFormat + case GPU::Regs::PixelFormat::RGB565: + return Type::RGB565; + case GPU::Regs::PixelFormat::RGB5A1: + return Type::RGB5A1; + default: + return ((unsigned int)format < 5) ? 
(Type)format : Type::Invalid; + } + } + +}; // Format } // Texture diff --git a/src/video_core/texture/internal/codecs.cpp b/src/video_core/texture/internal/codecs.cpp index d647c9ec7..753f33bdd 100644 --- a/src/video_core/texture/internal/codecs.cpp +++ b/src/video_core/texture/internal/codecs.cpp @@ -1,7 +1,19 @@ -#include "codecs.h" -#include "etc1.h" -#include "morton.h" -#include "texture_utils.h" +#include "video_core/texture/internal/codecs.h" +#include "video_core/texture/internal/etc1.h" +#include "video_core/texture/internal/morton.h" +#include "video_core/texture/internal/texture_utils.h" + +/////////////////////////////////////////////////////////////////////////////// +// Optimizations +////////////////////////////////////////////////////////////////////////////// +#ifdef _MSC_VER +#pragma inline_recursion(on) +#elif defined(CLANG_OR_GCC) +#pragma GCC optimize("-fpeel-loops") +#pragma GCC optimize("-fpredictive-commoning") +#pragma GCC optimize("-ftree-loop-distribute-patterns") +#pragma GCC optimize("-ftree-vectorize") +#endif // Decoders #include "decoders.cpp" diff --git a/src/video_core/texture/internal/codecs.h b/src/video_core/texture/internal/codecs.h index 97a5e2869..9fa40908a 100644 --- a/src/video_core/texture/internal/codecs.h +++ b/src/video_core/texture/internal/codecs.h @@ -1,10 +1,11 @@ + +#pragma once + #include #include #include "common/common_types.h" #include "video_core/texture/codec.h" -#pragma once - // each texture format codec class RGBACodec : public Pica::Texture::Codec { public: @@ -13,7 +14,9 @@ public: void encode(); protected: - const u32 format_size = 8; + virtual void setSize() { + this->start_nibbles_size = 8; + }; }; class RGBCodec : public Pica::Texture::Codec { @@ -23,7 +26,9 @@ public: void encode(); protected: - const u32 format_size = 6; + virtual void setSize() { + this->start_nibbles_size = 6; + }; }; class RGB5A1Codec : public Pica::Texture::Codec { @@ -33,7 +38,9 @@ public: void encode(); protected: - const 
u32 format_size = 4; + virtual void setSize() { + this->start_nibbles_size = 4; + }; }; class RGBA4Codec : public Pica::Texture::Codec { @@ -43,7 +50,9 @@ public: void encode(); protected: - const u32 format_size = 4; + virtual void setSize() { + this->start_nibbles_size = 4; + }; }; class RGB565Codec : public Pica::Texture::Codec { @@ -53,7 +62,9 @@ public: void encode(); protected: - const u32 format_size = 4; + virtual void setSize() { + this->start_nibbles_size = 4; + }; }; class RG8Codec : public Pica::Texture::Codec { @@ -63,7 +74,9 @@ public: void encode(); protected: - const u32 format_size = 4; + virtual void setSize() { + this->start_nibbles_size = 4; + }; }; class IA8Codec : public Pica::Texture::Codec { @@ -73,7 +86,9 @@ public: void encode(); protected: - const u32 format_size = 4; + virtual void setSize() { + this->start_nibbles_size = 4; + }; }; class I8Codec : public Pica::Texture::Codec { @@ -83,7 +98,9 @@ public: void encode(); protected: - const u32 format_size = 2; + virtual void setSize() { + this->start_nibbles_size = 2; + }; }; class A8Codec : public Pica::Texture::Codec { @@ -93,7 +110,9 @@ public: void encode(); protected: - const u32 format_size = 2; + virtual void setSize() { + this->start_nibbles_size = 2; + }; }; class IA4Codec : public Pica::Texture::Codec { @@ -103,7 +122,9 @@ public: void encode(); protected: - const u32 format_size = 2; + virtual void setSize() { + this->start_nibbles_size = 2; + }; }; class I4Codec : public Pica::Texture::Codec { @@ -113,7 +134,9 @@ public: void encode(); protected: - const u32 format_size = 1; + virtual void setSize() { + this->start_nibbles_size = 1; + }; }; class A4Codec : public Pica::Texture::Codec { @@ -123,7 +146,9 @@ public: void encode(); protected: - const u32 format_size = 1; + virtual void setSize() { + this->start_nibbles_size = 1; + }; }; class ETC1Codec : public Pica::Texture::Codec { @@ -133,7 +158,9 @@ public: void encode(); protected: - const u32 format_size = 1; + virtual void 
setSize() { + this->start_nibbles_size = 1; + }; }; class ETC1A4Codec : public Pica::Texture::Codec { @@ -143,7 +170,9 @@ public: void encode(); protected: - const u32 format_size = 2; + virtual void setSize() { + this->start_nibbles_size = 2; + }; }; class D16Codec : public Pica::Texture::Codec { @@ -153,7 +182,9 @@ public: void encode(); protected: - const u32 format_size = 4; + virtual void setSize() { + this->start_nibbles_size = 4; + }; }; class D24Codec : public Pica::Texture::Codec { @@ -163,7 +194,9 @@ public: void encode(); protected: - const u32 format_size = 6; + virtual void setSize() { + this->start_nibbles_size = 6; + }; }; class D24S8Codec : public Pica::Texture::Codec { @@ -173,5 +206,7 @@ public: void encode(); protected: - const u32 format_size = 8; + virtual void setSize() { + this->start_nibbles_size = 8; + }; }; diff --git a/src/video_core/texture/internal/decoders.cpp b/src/video_core/texture/internal/decoders.cpp index d0b80d013..28672e8fb 100644 --- a/src/video_core/texture/internal/decoders.cpp +++ b/src/video_core/texture/internal/decoders.cpp @@ -1,9 +1,10 @@ + namespace { template decode_func(const u8*)> inline void rgba_pass(u8* read, u8* write) { - u32 pixel = decode_func(read).ToRGBA(); + auto pixel = decode_func(read).ToRGBA(); std::memcpy(write, &pixel, 4); } @@ -72,34 +73,36 @@ void RG8Codec::decode() { namespace { inline u16 convert_nibbles(u8 nibbles) { - return ((u16)Color::Convert4To8((nibbles & 0xF0) >> 4) << 8) | - (u16)Color::Convert4To8((nibbles & 0x0F)); + u16 split = (nibbles & 0xF0) << 4 | (nibbles & 0x0F); + split |= (split << 4); + return split; } -inline u32 build_luminance(u8 intensity, u8 alpha) { +inline u32 build_luminance(u32 intensity, u32 alpha) { return (alpha << 24) | (intensity << 16) | (intensity << 8) | intensity; } inline void intensity_alpha_pass(u8* read, u8* write) { alignas(4) u8 pixel[2]; std::memcpy(pixel, read, 2); - u32 result = build_luminance(pixel[0], pixel[1]); + u32 result = 
build_luminance(pixel[1], pixel[0]); std::memcpy(write, &result, 4); } inline void intensity_alpha_nibbles_pass(u8* read, u8* write) { - alignas(4) u8 pixel[2]; - std::memcpy(pixel, read, 1); - u16 tmp = convert_nibbles(pixel[0]); - std::memcpy(pixel, &tmp, 2); - u32 result = build_luminance(pixel[0], pixel[1]); + alignas(4) u8 pixel; + std::memcpy(&pixel, read, 1); + u16 tmp = convert_nibbles(pixel); + u8 tmp2[2]; + std::memcpy(tmp2, &tmp, 2); + u32 result = build_luminance(tmp2[1], tmp2[0]); std::memcpy(write, &result, 4); } inline void intensity_pass(u8* read, u8* write) { - alignas(4) u8 pixel[1]; - std::memcpy(pixel, read, 1); - u32 result = build_luminance(pixel[0], 255); + u8 pixel; + std::memcpy(&pixel, read, 1); + u32 result = build_luminance(pixel, 255); std::memcpy(write, &result, 4); } @@ -108,9 +111,9 @@ inline void intensity_nibbles_pass(u8* read, u8* write) { std::memcpy(pixel, read, 1); u16 tmp = convert_nibbles(pixel[0]); std::memcpy(pixel, &tmp, 2); - u32 result = build_luminance(pixel[0], 255); + u32 result = build_luminance(pixel[1], 255); std::memcpy(write, &result, 4); - result = build_luminance(pixel[1], 255); + result = build_luminance(pixel[0], 255); std::memcpy(write + 4, &result, 4); } diff --git a/src/video_core/texture/internal/etc1.cpp b/src/video_core/texture/internal/etc1.cpp index a20dee6d5..5a7edec33 100644 --- a/src/video_core/texture/internal/etc1.cpp +++ b/src/video_core/texture/internal/etc1.cpp @@ -9,14 +9,24 @@ #include "common/math_util.h" #include "common/swap.h" #include "common/vector_math.h" -#include "etc1.h" -#include "texture_utils.h" +#include "video_core/texture/internal/etc1.h" +#include "video_core/texture/internal/texture_utils.h" -constexpr std::array etc1_modifier_table = {{ +namespace { + +#ifdef _DEBUG +#define CONST_FIX static +#else +#define CONST_FIX constexpr +#endif + +CONST_FIX std::array etc1_modifier_table = {{ {2, 8}, {5, 17}, {9, 29}, {13, 42}, {18, 60}, {24, 80}, {33, 106}, {47, 183}, }}; 
-namespace { +constexpr u32 buildRGBA(u32 r, u32 g, u32 b, u32 a) { + return (a << 24) | (b << 16) | (g << 8) | r; +} union ETC1Tile { u64 raw; @@ -62,7 +72,7 @@ union ETC1Tile { BitField<60, 4, u64> r1; } separate; - const Math::Vec3 GetRGB(u32 x, u32 y) const { + const u32 GetRGB(u32 x, u32 y) const { int texel = 4 * x + y; if (flip) @@ -106,7 +116,7 @@ union ETC1Tile { ret.g() = MathUtil::Clamp(ret.g() + modifier, 0, 255); ret.b() = MathUtil::Clamp(ret.b() + modifier, 0, 255); - return ret.Cast(); + return buildRGBA(ret.r(), ret.g(), ret.b(), 0); } }; @@ -121,7 +131,8 @@ inline void etc1_pass(u8* etc1_buffer, u8* linear_buffer, u32 x_blocks) { std::memcpy(&tile.raw, &etc1_buffer[i * 8], 8); for (u32 k = 0; k < 4; k++) { for (u32 j = 0; j < 4; j++) { - u32 rgba = (tile.GetRGB(j, k).ToRGB()) | 0xFF000000; + auto rgb = tile.GetRGB(j, k); + u32 rgba = rgb | 0xFF000000; std::memcpy(&tmp[k * line + j * 4 + index], &rgba, 4); } } @@ -142,7 +153,8 @@ inline void etc1a4_pass(u8* etc1_buffer, u8* linear_buffer, u32 x_blocks) { for (u32 j = 0; j < 4; j++) { u32 alpha = (alpha_tile >> (4 * (j * 4 + k))) & 0x0F; alpha |= (alpha << 4); - u32 rgba = tile.GetRGB(j, k).ToRGB() | (alpha << 24); + auto rgb = tile.GetRGB(j, k); + u32 rgba = rgb | (alpha << 24); std::memcpy(&tmp[k * line + j * 4 + index], &rgba, 4); } } diff --git a/src/video_core/texture/internal/etc1.h b/src/video_core/texture/internal/etc1.h index fa4535da2..492f19729 100644 --- a/src/video_core/texture/internal/etc1.h +++ b/src/video_core/texture/internal/etc1.h @@ -1,7 +1,6 @@ +#pragma once #include "common/common_types.h" -#pragma once - void ETC1(u8* etc1_buffer, u8* matrix_buffer, u32 width, u32 height); void ETC1A4(u8* etc1_buffer, u8* matrix_buffer, u32 width, u32 height); diff --git a/src/video_core/texture/internal/morton.cpp b/src/video_core/texture/internal/morton.cpp index b50f4e34a..9bd74c800 100644 --- a/src/video_core/texture/internal/morton.cpp +++ b/src/video_core/texture/internal/morton.cpp @@ 
-2,8 +2,8 @@ #include #include #include "common/common_types.h" -#include "morton.h" -#include "texture_utils.h" +#include "video_core/texture/internal/morton.h" +#include "video_core/texture/internal/texture_utils.h" /////////////////////////////////////////////////////////////////////////////// // Optimizations @@ -15,6 +15,8 @@ // favor fast code over small code. #pragma optimize("t", on) #pragma intrinsic(memcpy) +#define __hot +#define __no_inline __declspec(noinline) #elif defined(CLANG_OR_GCC) // The next 3 will swizle memory copying to help find the best sse/avx shuffling // in case it's possible. Compilation tests have proven effective use of these @@ -22,12 +24,20 @@ #pragma GCC optimize("-fpredictive-commoning") #pragma GCC optimize("-ftree-loop-distribute-patterns") #pragma GCC optimize("-ftree-vectorize") -// limit inlining -#pragma GCC option("--param max-inline-insns-single=128") - +#pragma GCC option("--param inline-unit-growth=400") +#pragma GCC option("--param large-function-growth=800") // The beauty of these compiler options is that they generate better code than // hand written intrinsics, since inline expanding memeory transfers can be pattern // matched with vector instructions available in the target. 
+#define __no_inline __attribute__((noinline)) +#define __hot __attribute__((hot)) +#if !defined(__forceinline) +#define __forceinline inline __attribute__((always_inline)) +#endif +#else +#define __hot +#define __no_inline +#define __forceinline #endif #pragma region Z_Order @@ -54,11 +64,11 @@ constexpr u32 isBottom(u32 block_index) { } template -inline void swizzle_block(u8*& morton_block, u8* linear_block); +__forceinline static void swizzle_block(u8*& morton_block, u8* linear_block); template -inline void swizzle_block_aux(u8*& morton_block, u8* linear_block) { +__forceinline static void swizzle_block_aux(u8*& morton_block, u8* linear_block) { // move the linear_block pointer to the appropiate block const size_t right = isRight(block_index) * (blocks * nibbles) / 2; const size_t down = isBottom(block_index) * block_size; @@ -67,7 +77,7 @@ inline void swizzle_block_aux(u8*& morton_block, u8* linear_block) { } template -inline void swizzle_block(u8*& morton_block, u8* linear_block) { +__forceinline static void swizzle_block(u8*& morton_block, u8* linear_block) { const size_t new_block_size = block_size / 2; if (blocks <= 2) { // We handle 2*2 blocks on z-order @@ -94,14 +104,14 @@ inline void swizzle_block(u8*& morton_block, u8* linear_block) { } template -void swizzle_pass(u8* morton_block, u8* linear_block) { +__forceinline static void swizzle_pass(u8* morton_block, u8* linear_block) { const size_t block_size = (lines_per_block * lines_per_block * nibbles) / 2; swizzle_block(morton_block, linear_block); } #pragma endregion Z_Order template -void encode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) { +__hot inline static void encode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) { const u32 tile_size = (lines_per_block * lines_per_block * nibbles) / 2; alignas(64) u8 tmp[tile_size]; tiling_pass<&encode, nibbles, lines_per_block>(linear_buffer, tmp, x_blocks); @@ -109,7 +119,7 @@ void encode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) {
} template -void decode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) { +__hot inline static void decode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) { const u32 tile_size = (lines_per_block * lines_per_block * nibbles) / 2; alignas(64) u8 tmp[tile_size]; swizzle_pass<&decode, nibbles, lines_per_block>(morton_buffer, tmp); @@ -117,7 +127,7 @@ void decode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) { } template -void morton_pass(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height) { +__hot static void morton_pass(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height) { const u32 x_blocks = (width / lines_per_block); const u32 y_blocks = (height / lines_per_block); const size_t line_size = (lines_per_block * nibbles) / 2; @@ -135,9 +145,22 @@ void morton_pass(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height) { } } +// keep hot code together +__no_inline __hot static void morton_8x8_32(u8* morton_buffer, u8* matrix_buffer, u32 width, + u32 height, bool decode) { + if (decode) + morton_pass<&decode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height); + else + morton_pass<&encode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height); +} + namespace Decoders { bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) { + if (bpp == 32) { + morton_8x8_32(morton_buffer, matrix_buffer, width, height, true); + return true; + } switch (bpp) { case 4: { morton_pass<&decode_pass<1, 8>, 1, 8>(morton_buffer, matrix_buffer, width, height); @@ -159,11 +182,6 @@ bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 return true; break; } - case 32: { - morton_pass<&decode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height); - return true; - break; - } default: { return false; break; @@ -209,6 +227,10 @@ bool Morton_32x32(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u namespace Encoders { bool Morton_8x8(u8* morton_buffer, 
u8* matrix_buffer, u32 width, u32 height, u32 bpp) { + if (bpp == 32) { + morton_8x8_32(morton_buffer, matrix_buffer, width, height, false); + return true; + } switch (bpp) { case 4: { morton_pass<&encode_pass<1, 8>, 1, 8>(morton_buffer, matrix_buffer, width, height); @@ -230,11 +252,6 @@ bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 return true; break; } - case 32: { - morton_pass<&encode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height); - return true; - break; - } default: { return false; break; diff --git a/src/video_core/texture/internal/morton.h b/src/video_core/texture/internal/morton.h index 36879ecb4..95473744f 100644 --- a/src/video_core/texture/internal/morton.h +++ b/src/video_core/texture/internal/morton.h @@ -1,7 +1,7 @@ -#include "common/common_types.h" - #pragma once +#include "common/common_types.h" + enum class MortonPass { Tile8x8, Tile32x32 }; namespace Decoders { diff --git a/src/video_core/texture/internal/texture_utils.h b/src/video_core/texture/internal/texture_utils.h index ecd7a557b..38d7f96f9 100644 --- a/src/video_core/texture/internal/texture_utils.h +++ b/src/video_core/texture/internal/texture_utils.h @@ -1,3 +1,5 @@ +#pragma once + #include #include #include @@ -5,12 +7,9 @@ #include "common/color.h" #include "common/swap.h" -#pragma once - #if ((defined(__clang__) || defined(__GNUC__)) && !defined(__INTEL_COMPILER)) #define CLANG_OR_GCC #endif - /////////////////////////////////////////////////////////////////////////////// // Optimizations ////////////////////////////////////////////////////////////////////////////// @@ -23,16 +22,6 @@ #pragma GCC optimize("-ftree-vectorize") #endif -// @param read_size is the amount of bytes each pixel takes -inline void decode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) { - std::memcpy(matrix_pointer, morton_pointer, read_size); -} - -// @param read_size is the amount of bytes each pixel takes -inline void encode(u8* morton_pointer, 
u8* matrix_pointer, size_t read_size) { - std::memcpy(morton_pointer, matrix_pointer, read_size); -} - // Pre: width % 8 == 0 && height % 8 == 0 template inline void image_pass_aux_rev(u8* target, u32 width, u32 height) { @@ -80,9 +69,9 @@ inline void image_pass_aux(u8* target, u32 width, u32 height) { template inline void image_pass(u8* target, u32 width, u32 height) { if (read_size > write_size) - image_pass_aux; + image_pass_aux(target, width, height); else - image_pass_aux_rev; + image_pass_aux_rev(target, width, height); } template @@ -96,3 +85,13 @@ void tiling_pass(u8* linear, u8* tiled, u32 x_blocks) { codec(tiled + tiled_index, linear + linear_index, tiled_line_size); } } + +// @param read_size is the amount of bytes each pixel takes +inline void decode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) { + std::memcpy(matrix_pointer, morton_pointer, read_size); +} + +// @param read_size is the amount of bytes each pixel takes +inline void encode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) { + std::memcpy(morton_pointer, matrix_pointer, read_size); +}