Integrated the texture submodule into gl backend

2025-07-27 13:30:24 +00:00 · 2017-01-15 18:15:29 -05:00 · 2017-01-15 18:15:29 -05:00 · 30f0d1dbf4
commit 30f0d1dbf4
parent 1a4c8d510d
16 changed files with 436 additions and 450 deletions
--- a/src/common/vector_math.h
+++ b/src/common/vector_math.h
@ -407,6 +407,11 @@ inline float Vec3<float>::Normalize() {
    return length;
 }

+// Packs the three 8-bit components into one integer: x in bits 0-7, y in bits 8-15 and
+// z in bits 16-23.
+template <>
+inline unsigned int Vec3<unsigned char>::ToRGB() const {
+    const unsigned int low = x;
+    const unsigned int mid = y;
+    const unsigned int high = z;
+    return low | (mid << 8) | (high << 16);
+}
+
 typedef Vec3<float> Vec3f;

 template <typename T>
@ -611,6 +616,11 @@ public:
 #undef _DEFINE_SWIZZLER3
 };

+// Packs the four 8-bit components into one integer: x in bits 0-7, y in bits 8-15,
+// z in bits 16-23 and w in bits 24-31.
+template <>
+inline unsigned int Vec4<unsigned char>::ToRGBA() const {
+    // w is promoted to (signed) int by the shift; widen it to unsigned first so that values
+    // >= 0x80 do not shift into the sign bit.
+    return (static_cast<unsigned int>(w) << 24) | (z << 16) | (y << 8) | x;
+}
+
 template <typename T, typename V>
 Vec4<decltype(V{} * T{})> operator*(const V& f, const Vec4<T>& vec) {
    return MakeVec(f * vec.x, f * vec.y, f * vec.z, f * vec.w);
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@ -1,14 +1,14 @@
 set(SRCS
-            texture/internal/morton.cpp
-            texture/internal/etc1.cpp
-            texture/codec.cpp
-            texture/internal/codecs.cpp
            renderer_opengl/gl_rasterizer.cpp
            renderer_opengl/gl_rasterizer_cache.cpp
            renderer_opengl/gl_shader_gen.cpp
            renderer_opengl/gl_shader_util.cpp
            renderer_opengl/gl_state.cpp
            renderer_opengl/renderer_opengl.cpp
+            texture/internal/morton.cpp
+            texture/internal/etc1.cpp
+            texture/internal/codecs.cpp
+            texture/codec.cpp
            debug_utils/debug_utils.cpp
            clipper.cpp
            command_processor.cpp
@ -25,12 +25,6 @@ set(SRCS

 set(HEADERS
            debug_utils/debug_utils.h
-            texture/internal/texture_utils.h
-            texture/internal/morton.h
-            texture/internal/etc1.h
-            texture/codec.h
-            texture/formats.h
-            texture/internal/codecs.h
            renderer_opengl/gl_rasterizer.h
            renderer_opengl/gl_rasterizer_cache.h
            renderer_opengl/gl_resource_manager.h
@ -39,6 +33,12 @@ set(HEADERS
            renderer_opengl/gl_state.h
            renderer_opengl/pica_to_gl.h
            renderer_opengl/renderer_opengl.h
+            texture/internal/texture_utils.h
+            texture/internal/morton.h
+            texture/internal/etc1.h
+            texture/internal/codecs.h
+            texture/codec.h
+            texture/formats.h
            clipper.h
            command_processor.h
            gpu_debugger.h
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@ -21,6 +21,7 @@
 #include "video_core/renderer_opengl/gl_shader_util.h"
 #include "video_core/renderer_opengl/pica_to_gl.h"
 #include "video_core/renderer_opengl/renderer_opengl.h"
+#include "video_core/texture/formats.h"

 MICROPROFILE_DEFINE(OpenGL_Drawing, "OpenGL", "Drawing", MP_RGB(128, 128, 192));
 MICROPROFILE_DEFINE(OpenGL_Blits, "OpenGL", "Blits", MP_RGB(100, 100, 255));
@ -716,7 +717,6 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(PAddr addr, u32 size) {

 bool RasterizerOpenGL::AccelerateDisplayTransfer(const GPU::Regs::DisplayTransferConfig& config) {
    MICROPROFILE_SCOPE(OpenGL_Blits);
-    using PixelFormat = CachedSurface::PixelFormat;
    using SurfaceType = CachedSurface::SurfaceType;

    CachedSurface src_params;
@ -728,7 +728,7 @@ bool RasterizerOpenGL::AccelerateDisplayTransfer(const GPU::Regs::DisplayTransfe
    // the image, and it allows for smaller texture cache lookup rectangles.
    src_params.height = config.output_height;
    src_params.is_tiled = !config.input_linear;
-    src_params.pixel_format = CachedSurface::PixelFormatFromGPUPixelFormat(config.input_format);
+    src_params.pixel_format = Pica::Texture::Format::FromGPUPixelFormat(config.input_format);

    CachedSurface dst_params;
    dst_params.addr = config.GetPhysicalOutputAddress();
@ -737,7 +737,7 @@ bool RasterizerOpenGL::AccelerateDisplayTransfer(const GPU::Regs::DisplayTransfe
    dst_params.height =
        config.scaling == config.ScaleXY ? config.output_height / 2 : config.output_height.Value();
    dst_params.is_tiled = config.input_linear != config.dont_swizzle;
-    dst_params.pixel_format = CachedSurface::PixelFormatFromGPUPixelFormat(config.output_format);
+    dst_params.pixel_format = Pica::Texture::Format::FromGPUPixelFormat(config.output_format);

    MathUtil::Rectangle<int> src_rect;
    CachedSurface* src_surface = res_cache.GetSurfaceRect(src_params, false, true, src_rect);
@ -776,7 +776,7 @@ bool RasterizerOpenGL::AccelerateDisplayTransfer(const GPU::Regs::DisplayTransfe
    }

    u32 dst_size = dst_params.width * dst_params.height *
-                   CachedSurface::GetFormatBpp(dst_params.pixel_format) / 8;
+                   Pica::Texture::Format::GetBpp(dst_params.pixel_format) / 8;
    dst_surface->dirty = true;
    res_cache.FlushRegion(config.GetPhysicalOutputAddress(), dst_size, dst_surface, true);
    return true;
@ -789,7 +789,6 @@ bool RasterizerOpenGL::AccelerateTextureCopy(const GPU::Regs::DisplayTransferCon

 bool RasterizerOpenGL::AccelerateFill(const GPU::Regs::MemoryFillConfig& config) {
    MICROPROFILE_SCOPE(OpenGL_Blits);
-    using PixelFormat = CachedSurface::PixelFormat;
    using SurfaceType = CachedSurface::SurfaceType;

    CachedSurface* dst_surface = res_cache.TryGetFillSurface(config);
@ -824,7 +823,7 @@ bool RasterizerOpenGL::AccelerateFill(const GPU::Regs::MemoryFillConfig& config)

        if (config.fill_24bit) {
            switch (dst_surface->pixel_format) {
-            case PixelFormat::RGB8:
+            case Pica::Texture::Format::Type::RGB8:
                color_values[0] = config.value_24bit_r / 255.0f;
                color_values[1] = config.value_24bit_g / 255.0f;
                color_values[2] = config.value_24bit_b / 255.0f;
@ -836,7 +835,7 @@ bool RasterizerOpenGL::AccelerateFill(const GPU::Regs::MemoryFillConfig& config)
            u32 value = config.value_32bit;

            switch (dst_surface->pixel_format) {
-            case PixelFormat::RGBA8:
+            case Pica::Texture::Format::Type::RGBA8:
                color_values[0] = (value >> 24) / 255.0f;
                color_values[1] = ((value >> 16) & 0xFF) / 255.0f;
                color_values[2] = ((value >> 8) & 0xFF) / 255.0f;
@ -850,34 +849,34 @@ bool RasterizerOpenGL::AccelerateFill(const GPU::Regs::MemoryFillConfig& config)
            Math::Vec4<u8> color;

            switch (dst_surface->pixel_format) {
-            case PixelFormat::RGBA8:
+            case Pica::Texture::Format::Type::RGBA8:
                color_values[0] = (value_16bit >> 8) / 255.0f;
                color_values[1] = (value_16bit & 0xFF) / 255.0f;
                color_values[2] = color_values[0];
                color_values[3] = color_values[1];
                break;
-            case PixelFormat::RGB5A1:
+            case Pica::Texture::Format::Type::RGB5A1:
                color = Color::DecodeRGB5A1((const u8*)&value_16bit);
                color_values[0] = color[0] / 31.0f;
                color_values[1] = color[1] / 31.0f;
                color_values[2] = color[2] / 31.0f;
                color_values[3] = color[3];
                break;
-            case PixelFormat::RGB565:
+            case Pica::Texture::Format::Type::RGB565:
                color = Color::DecodeRGB565((const u8*)&value_16bit);
                color_values[0] = color[0] / 31.0f;
                color_values[1] = color[1] / 63.0f;
                color_values[2] = color[2] / 31.0f;
                break;
-            case PixelFormat::RGBA4:
+            case Pica::Texture::Format::Type::RGBA4:
                color = Color::DecodeRGBA4((const u8*)&value_16bit);
                color_values[0] = color[0] / 15.0f;
                color_values[1] = color[1] / 15.0f;
                color_values[2] = color[2] / 15.0f;
                color_values[3] = color[3] / 15.0f;
                break;
-            case PixelFormat::IA8:
-            case PixelFormat::RG8:
+            case Pica::Texture::Format::Type::IA8:
+            case Pica::Texture::Format::Type::RG8:
                color_values[0] = (value_16bit >> 8) / 255.0f;
                color_values[1] = (value_16bit & 0xFF) / 255.0f;
                break;
@ -899,9 +898,9 @@ bool RasterizerOpenGL::AccelerateFill(const GPU::Regs::MemoryFillConfig& config)
        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0);

        GLfloat value_float;
-        if (dst_surface->pixel_format == CachedSurface::PixelFormat::D16) {
+        if (dst_surface->pixel_format == Pica::Texture::Format::Type::D16) {
            value_float = config.value_32bit / 65535.0f; // 2^16 - 1
-        } else if (dst_surface->pixel_format == CachedSurface::PixelFormat::D24) {
+        } else if (dst_surface->pixel_format == Pica::Texture::Format::Type::D24) {
            value_float = config.value_32bit / 16777215.0f; // 2^24 - 1
        }

@ -945,7 +944,7 @@ bool RasterizerOpenGL::AccelerateDisplay(const GPU::Regs::FramebufferConfig& con
    src_params.height = config.height;
    src_params.pixel_stride = pixel_stride;
    src_params.is_tiled = false;
-    src_params.pixel_format = CachedSurface::PixelFormatFromGPUPixelFormat(config.color_format);
+    src_params.pixel_format = Pica::Texture::Format::FromGPUPixelFormat(config.color_format);

    MathUtil::Rectangle<int> src_rect;
    CachedSurface* src_surface = res_cache.GetSurfaceRect(src_params, false, true, src_rect);
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@ -21,6 +21,8 @@
 #include "video_core/pica_state.h"
 #include "video_core/renderer_opengl/gl_rasterizer_cache.h"
 #include "video_core/renderer_opengl/gl_state.h"
+#include "video_core/texture/codec.h"
+#include "video_core/texture/formats.h"
 #include "video_core/utils.h"
 #include "video_core/video_core.h"

@ -30,21 +32,48 @@ struct FormatTuple {
    GLenum type;
 };

-static const std::array<FormatTuple, 5> fb_format_tuples = {{
-    {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8},     // RGBA8
-    {GL_RGB8, GL_BGR, GL_UNSIGNED_BYTE},              // RGB8
-    {GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_5_5_5_1}, // RGB5A1
-    {GL_RGB565, GL_RGB, GL_UNSIGNED_SHORT_5_6_5},     // RGB565
-    {GL_RGBA4, GL_RGBA, GL_UNSIGNED_SHORT_4_4_4_4},   // RGBA4
-}};
-
-static const std::array<FormatTuple, 4> depth_format_tuples = {{
+// OpenGL {internal_format, format, type} triple for every surface pixel format. The table is
+// indexed with the pixel format value cast to an unsigned integer, so the entry order must
+// follow the numeric order of the formats named in the per-entry comments.
+static const std::array<FormatTuple, 18> format_tuples = {{
+    {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8},                  // RGBA8
+    {GL_RGB8, GL_BGR, GL_UNSIGNED_BYTE},                           // RGB8
+    {GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_5_5_5_1},              // RGB5A1
+    {GL_RGB565, GL_RGB, GL_UNSIGNED_SHORT_5_6_5},                  // RGB565
+    {GL_RGBA4, GL_RGBA, GL_UNSIGNED_SHORT_4_4_4_4},                // RGBA4
+    {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE},                         // IA8
+    // The transfer format must be the unsized GL_RG; GL_RG8 is only valid as an internal format.
+    {GL_RG8, GL_RG, GL_UNSIGNED_BYTE},                             // RG8
+    {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE},                         // I8
+    {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE},                         // A8
+    {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE},                         // IA4
+    {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE},                         // I4
+    {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE},                         // A4
+    {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE},                         // ETC1
+    {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE},                         // ETC1A4
    {GL_DEPTH_COMPONENT16, GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT}, // D16
    {},
    {GL_DEPTH_COMPONENT24, GL_DEPTH_COMPONENT, GL_UNSIGNED_INT},   // D24
    {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, // D24S8
 }};

+// Per-format flag, indexed like format_tuples (pixel format cast to an unsigned integer).
+// A false entry makes the surface load/flush code ask the texture codec for an RGBA
+// transform and treat the GL-side buffer as 4 bytes per pixel; a true entry skips that.
+static const std::array<bool, 18> native_format = {
+    true,  // RGBA8
+    true,  // RGB8
+    true,  // RGB5A1
+    true,  // RGB565
+    true,  // RGBA4
+    false, // IA8
+    true,  // RG8
+    false, // I8
+    false, // A8
+    false, // IA4
+    false, // I4
+    false, // A4
+    false, // ETC1
+    false, // ETC1A4
+    true,  // D16
+    false, // (gap: no format named for this index)
+    false, // D24
+    false, // D24S8
+};
+
 RasterizerCacheOpenGL::RasterizerCacheOpenGL() {
    transfer_framebuffers[0].Create();
    transfer_framebuffers[1].Create();
@ -54,55 +83,6 @@ RasterizerCacheOpenGL::~RasterizerCacheOpenGL() {
    FlushAll();
 }

-static void MortonCopyPixels(CachedSurface::PixelFormat pixel_format, u32 width, u32 height,
-                             u32 bytes_per_pixel, u32 gl_bytes_per_pixel, u8* morton_data,
-                             u8* gl_data, bool morton_to_gl) {
-    using PixelFormat = CachedSurface::PixelFormat;
-
-    u8* data_ptrs[2];
-    u32 depth_stencil_shifts[2] = {24, 8};
-
-    if (morton_to_gl) {
-        std::swap(depth_stencil_shifts[0], depth_stencil_shifts[1]);
-    }
-
-    if (pixel_format == PixelFormat::D24S8) {
-        for (unsigned y = 0; y < height; ++y) {
-            for (unsigned x = 0; x < width; ++x) {
-                const u32 coarse_y = y & ~7;
-                u32 morton_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) +
-                                    coarse_y * width * bytes_per_pixel;
-                u32 gl_pixel_index = (x + (height - 1 - y) * width) * gl_bytes_per_pixel;
-
-                data_ptrs[morton_to_gl] = morton_data + morton_offset;
-                data_ptrs[!morton_to_gl] = &gl_data[gl_pixel_index];
-
-                // Swap depth and stencil value ordering since 3DS does not match OpenGL
-                u32 depth_stencil;
-                memcpy(&depth_stencil, data_ptrs[1], sizeof(u32));
-                depth_stencil = (depth_stencil << depth_stencil_shifts[0]) |
-                                (depth_stencil >> depth_stencil_shifts[1]);
-
-                memcpy(data_ptrs[0], &depth_stencil, sizeof(u32));
-            }
-        }
-    } else {
-        for (unsigned y = 0; y < height; ++y) {
-            for (unsigned x = 0; x < width; ++x) {
-                const u32 coarse_y = y & ~7;
-                u32 morton_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) +
-                                    coarse_y * width * bytes_per_pixel;
-                u32 gl_pixel_index = (x + (height - 1 - y) * width) * gl_bytes_per_pixel;
-
-                data_ptrs[morton_to_gl] = morton_data + morton_offset;
-                data_ptrs[!morton_to_gl] = &gl_data[gl_pixel_index];
-
-                memcpy(data_ptrs[0], data_ptrs[1], bytes_per_pixel);
-            }
-        }
-    }
-}
-
 void RasterizerCacheOpenGL::BlitTextures(GLuint src_tex, GLuint dst_tex,
                                         CachedSurface::SurfaceType type,
                                         const MathUtil::Rectangle<int>& src_rect,
@ -184,7 +164,7 @@ bool RasterizerCacheOpenGL::TryBlitSurfaces(CachedSurface* src_surface,
    return true;
 }

-static void AllocateSurfaceTexture(GLuint texture, CachedSurface::PixelFormat pixel_format,
+static void AllocateSurfaceTexture(GLuint texture, Pica::Texture::Format::Type pixel_format,
                                   u32 width, u32 height) {
    // Allocate an uninitialized texture of appropriate size and format for the surface
    using SurfaceType = CachedSurface::SurfaceType;
@ -199,17 +179,8 @@ static void AllocateSurfaceTexture(GLuint texture, CachedSurface::PixelFormat pi

    SurfaceType type = CachedSurface::GetFormatType(pixel_format);

-    FormatTuple tuple;
-    if (type == SurfaceType::Color) {
-        ASSERT((size_t)pixel_format < fb_format_tuples.size());
-        tuple = fb_format_tuples[(unsigned int)pixel_format];
-    } else if (type == SurfaceType::Depth || type == SurfaceType::DepthStencil) {
-        size_t tuple_idx = (size_t)pixel_format - 14;
-        ASSERT(tuple_idx < depth_format_tuples.size());
-        tuple = depth_format_tuples[tuple_idx];
-    } else {
-        tuple = {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE};
-    }
+    ASSERT((size_t)pixel_format < format_tuples.size());
+    FormatTuple tuple = format_tuples[(unsigned int)pixel_format];

    glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, width, height, 0, tuple.format,
                 tuple.type, nullptr);
@ -227,7 +198,7 @@ static void AllocateSurfaceTexture(GLuint texture, CachedSurface::PixelFormat pi
 MICROPROFILE_DEFINE(OpenGL_SurfaceUpload, "OpenGL", "Surface Upload", MP_RGB(128, 64, 192));
 CachedSurface* RasterizerCacheOpenGL::GetSurface(const CachedSurface& params, bool match_res_scale,
                                                 bool load_if_create) {
-    using PixelFormat = CachedSurface::PixelFormat;
+    using PixelFormat = Pica::Texture::Format::Type;
    using SurfaceType = CachedSurface::SurfaceType;

    if (params.addr == 0) {
@ -235,7 +206,7 @@ CachedSurface* RasterizerCacheOpenGL::GetSurface(const CachedSurface& params, bo
    }

    u32 params_size =
-        params.width * params.height * CachedSurface::GetFormatBpp(params.pixel_format) / 8;
+        params.width * params.height * Pica::Texture::Format::GetBpp(params.pixel_format) / 8;

    // Check for an exact match in existing surfaces
    CachedSurface* best_exact_surface = nullptr;
@ -320,72 +291,36 @@ CachedSurface* RasterizerCacheOpenGL::GetSurface(const CachedSurface& params, bo

        if (!new_surface->is_tiled) {
            // TODO: Ensure this will always be a color format, not a depth or other format
-            ASSERT((size_t)new_surface->pixel_format < fb_format_tuples.size());
-            const FormatTuple& tuple = fb_format_tuples[(unsigned int)params.pixel_format];
+            // ASSERT((size_t)new_surface->pixel_format < format_tuples.size());
+            const FormatTuple& tuple = format_tuples[(unsigned int)params.pixel_format];

            glPixelStorei(GL_UNPACK_ROW_LENGTH, (GLint)new_surface->pixel_stride);
            glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height, 0,
                         tuple.format, tuple.type, texture_src_data);
            glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
        } else {
-            SurfaceType type = CachedSurface::GetFormatType(new_surface->pixel_format);
-            if (type != SurfaceType::Depth && type != SurfaceType::DepthStencil) {
-                FormatTuple tuple;
-                if ((size_t)params.pixel_format < fb_format_tuples.size()) {
-                    tuple = fb_format_tuples[(unsigned int)params.pixel_format];
-                } else {
-                    // Texture
-                    tuple = {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE};
-                }
-
-                std::vector<Math::Vec4<u8>> tex_buffer(params.width * params.height);
-
-                Pica::DebugUtils::TextureInfo tex_info;
-                tex_info.width = params.width;
-                tex_info.height = params.height;
-                tex_info.stride =
-                    params.width * CachedSurface::GetFormatBpp(params.pixel_format) / 8;
-                tex_info.format = (Pica::Regs::TextureFormat)params.pixel_format;
-                tex_info.physical_address = params.addr;
-
-                for (unsigned y = 0; y < params.height; ++y) {
-                    for (unsigned x = 0; x < params.width; ++x) {
-                        tex_buffer[x + params.width * y] = Pica::DebugUtils::LookupTexture(
-                            texture_src_data, x, params.height - 1 - y, tex_info);
-                    }
-                }
-
-                glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height,
-                             0, GL_RGBA, GL_UNSIGNED_BYTE, tex_buffer.data());
-            } else {
-                // Depth/Stencil formats need special treatment since they aren't sampleable using
-                // LookupTexture and can't use RGBA format
-                size_t tuple_idx = (size_t)params.pixel_format - 14;
-                ASSERT(tuple_idx < depth_format_tuples.size());
-                const FormatTuple& tuple = depth_format_tuples[tuple_idx];
-
-                u32 bytes_per_pixel = CachedSurface::GetFormatBpp(params.pixel_format) / 8;
-
-                // OpenGL needs 4 bpp alignment for D24 since using GL_UNSIGNED_INT as type
-                bool use_4bpp = (params.pixel_format == PixelFormat::D24);
-
-                u32 gl_bytes_per_pixel = use_4bpp ? 4 : bytes_per_pixel;
-
-                std::vector<u8> temp_fb_depth_buffer(params.width * params.height *
-                                                     gl_bytes_per_pixel);
-
-                u8* temp_fb_depth_buffer_ptr =
-                    use_4bpp ? temp_fb_depth_buffer.data() + 1 : temp_fb_depth_buffer.data();
-
-                MortonCopyPixels(params.pixel_format, params.width, params.height, bytes_per_pixel,
-                                 gl_bytes_per_pixel, texture_src_data, temp_fb_depth_buffer_ptr,
-                                 true);
-
-                glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height,
-                             0, tuple.format, tuple.type, temp_fb_depth_buffer.data());
-            }
+            const FormatTuple& tuple = format_tuples[(unsigned int)params.pixel_format];
+            std::unique_ptr<Pica::Texture::Codec> tmp = Pica::Texture::CodecFactory::build(
+                // clang-format off
+                params.pixel_format, texture_src_data, params.width, params.height
+                // clang-format on
+                );
+            Pica::Texture::Codec* codec = tmp.get();
+            codec->configTiling(true, 8); // change 8 to 32 in case the image is tiled
+                                          // in blocks of 32x32
+            codec->configRGBATransform(!native_format[(unsigned int)params.pixel_format]);
+            codec->decode();
+            std::unique_ptr<u8[]> decoded_texture = codec->transferInternalBuffer();
+            u32 bytes = codec->getInternalBytesPerPixel();
+            if (bytes == 3)
+                bytes = 1;
+            else if (bytes != 2)
+                bytes = 4;
+            glPixelStorei(GL_UNPACK_ALIGNMENT, bytes);
+            glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height, 0,
+                         tuple.format, tuple.type, decoded_texture.get());
+            glPixelStorei(GL_UNPACK_ALIGNMENT, 4);
        }
-
        // If not 1x scale, blit 1x texture to a new scaled texture and replace texture in surface
        if (new_surface->res_scale_width != 1.f || new_surface->res_scale_height != 1.f) {
            OGLTexture scaled_texture;
@ -430,7 +365,7 @@ CachedSurface* RasterizerCacheOpenGL::GetSurfaceRect(const CachedSurface& params
    }

    u32 total_pixels = params.width * params.height;
-    u32 params_size = total_pixels * CachedSurface::GetFormatBpp(params.pixel_format) / 8;
+    u32 params_size = total_pixels * Pica::Texture::Format::GetBpp(params.pixel_format) / 8;

    // Attempt to find encompassing surfaces
    CachedSurface* best_subrect_surface = nullptr;
@ -467,7 +402,7 @@ CachedSurface* RasterizerCacheOpenGL::GetSurfaceRect(const CachedSurface& params
    // Return the best subrect surface if found
    if (best_subrect_surface != nullptr) {
        unsigned int bytes_per_pixel =
-            (CachedSurface::GetFormatBpp(best_subrect_surface->pixel_format) / 8);
+            (Pica::Texture::Format::GetBpp(best_subrect_surface->pixel_format) / 8);

        int x0, y0;

@ -521,7 +456,7 @@ CachedSurface* RasterizerCacheOpenGL::GetTextureSurface(
    params.width = info.width;
    params.height = info.height;
    params.is_tiled = true;
-    params.pixel_format = CachedSurface::PixelFormatFromTextureFormat(info.format);
+    params.pixel_format = Pica::Texture::Format::FromTextureFormat(info.format);
    return GetSurface(params, false, true);
 }

@ -574,10 +509,10 @@ RasterizerCacheOpenGL::GetFramebufferSurfaces(const Pica::Regs::FramebufferConfi
    }

    color_params.addr = config.GetColorBufferPhysicalAddress();
-    color_params.pixel_format = CachedSurface::PixelFormatFromColorFormat(config.color_format);
+    color_params.pixel_format = Pica::Texture::Format::FromColorFormat(config.color_format);

    depth_params.addr = config.GetDepthBufferPhysicalAddress();
-    depth_params.pixel_format = CachedSurface::PixelFormatFromDepthFormat(config.depth_format);
+    depth_params.pixel_format = Pica::Texture::Format::FromDepthFormat(config.depth_format);

    MathUtil::Rectangle<int> color_rect;
    CachedSurface* color_surface =
@ -648,9 +583,9 @@ CachedSurface* RasterizerCacheOpenGL::TryGetFillSurface(const GPU::Regs::MemoryF
            CachedSurface* surface = it2->get();

            if (surface->addr == config.GetStartAddress() &&
-                CachedSurface::GetFormatBpp(surface->pixel_format) == bits_per_value &&
+                Pica::Texture::Format::GetBpp(surface->pixel_format) == bits_per_value &&
                (surface->width * surface->height *
-                 CachedSurface::GetFormatBpp(surface->pixel_format) / 8) ==
+                 Pica::Texture::Format::GetBpp(surface->pixel_format) / 8) ==
                    (config.GetEndAddress() - config.GetStartAddress())) {
                return surface;
            }
@ -662,7 +597,6 @@ CachedSurface* RasterizerCacheOpenGL::TryGetFillSurface(const GPU::Regs::MemoryF

 MICROPROFILE_DEFINE(OpenGL_SurfaceDownload, "OpenGL", "Surface Download", MP_RGB(128, 192, 64));
 void RasterizerCacheOpenGL::FlushSurface(CachedSurface* surface) {
-    using PixelFormat = CachedSurface::PixelFormat;
    using SurfaceType = CachedSurface::SurfaceType;

    if (!surface->dirty) {
@ -703,53 +637,32 @@ void RasterizerCacheOpenGL::FlushSurface(CachedSurface* surface) {

    if (!surface->is_tiled) {
        // TODO: Ensure this will always be a color format, not a depth or other format
-        ASSERT((size_t)surface->pixel_format < fb_format_tuples.size());
-        const FormatTuple& tuple = fb_format_tuples[(unsigned int)surface->pixel_format];
+        // ASSERT((size_t)surface->pixel_format < format_tuples.size());
+        const FormatTuple& tuple = format_tuples[(unsigned int)surface->pixel_format];

        glPixelStorei(GL_PACK_ROW_LENGTH, (GLint)surface->pixel_stride);
        glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, dst_buffer);
        glPixelStorei(GL_PACK_ROW_LENGTH, 0);
    } else {
-        SurfaceType type = CachedSurface::GetFormatType(surface->pixel_format);
-        if (type != SurfaceType::Depth && type != SurfaceType::DepthStencil) {
-            ASSERT((size_t)surface->pixel_format < fb_format_tuples.size());
-            const FormatTuple& tuple = fb_format_tuples[(unsigned int)surface->pixel_format];
+        const FormatTuple& tuple = format_tuples[(u32)surface->pixel_format];
+        u32 bytes_per_pixel = Pica::Texture::Format::GetBpp(surface->pixel_format) / 8;
+        if (!native_format[(u32)surface->pixel_format])
+            bytes_per_pixel = 4;
+        std::vector<u8> temp_gl_buffer(surface->width * surface->height * bytes_per_pixel);
+        glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, temp_gl_buffer.data());

-            u32 bytes_per_pixel = CachedSurface::GetFormatBpp(surface->pixel_format) / 8;
-
-            std::vector<u8> temp_gl_buffer(surface->width * surface->height * bytes_per_pixel);
-
-            glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, temp_gl_buffer.data());
-
-            // Directly copy pixels. Internal OpenGL color formats are consistent so no conversion
-            // is necessary.
-            MortonCopyPixels(surface->pixel_format, surface->width, surface->height,
-                             bytes_per_pixel, bytes_per_pixel, dst_buffer, temp_gl_buffer.data(),
-                             false);
-        } else {
-            // Depth/Stencil formats need special treatment since they aren't sampleable using
-            // LookupTexture and can't use RGBA format
-            size_t tuple_idx = (size_t)surface->pixel_format - 14;
-            ASSERT(tuple_idx < depth_format_tuples.size());
-            const FormatTuple& tuple = depth_format_tuples[tuple_idx];
-
-            u32 bytes_per_pixel = CachedSurface::GetFormatBpp(surface->pixel_format) / 8;
-
-            // OpenGL needs 4 bpp alignment for D24 since using GL_UNSIGNED_INT as type
-            bool use_4bpp = (surface->pixel_format == PixelFormat::D24);
-
-            u32 gl_bytes_per_pixel = use_4bpp ? 4 : bytes_per_pixel;
-
-            std::vector<u8> temp_gl_buffer(surface->width * surface->height * gl_bytes_per_pixel);
-
-            glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, temp_gl_buffer.data());
-
-            u8* temp_gl_buffer_ptr = use_4bpp ? temp_gl_buffer.data() + 1 : temp_gl_buffer.data();
-
-            MortonCopyPixels(surface->pixel_format, surface->width, surface->height,
-                             bytes_per_pixel, gl_bytes_per_pixel, dst_buffer, temp_gl_buffer_ptr,
-                             false);
-        }
+        std::unique_ptr<Pica::Texture::Codec> tmp = Pica::Texture::CodecFactory::build(
+            // clang-format off
+            surface->pixel_format, temp_gl_buffer.data(), surface->width, surface->height
+            // clang-format on
+            );
+        Pica::Texture::Codec* codec = tmp.get();
+        codec->configTiling(true, 8); // change 8 to 32 in case the image is tiled
+                                      // in blocks of 32x32
+        codec->configRGBATransform(!native_format[(u32)surface->pixel_format]);
+        codec->configPreConvertedRGBA(!native_format[(u32)surface->pixel_format]);
+        codec->setExternalBuffer(dst_buffer);
+        codec->encode();
    }

    surface->dirty = false;
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@ -16,6 +16,7 @@
 #include "core/hw/gpu.h"
 #include "video_core/pica.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
+#include "video_core/texture/formats.h"

 namespace MathUtil {
 template <class T>
@ -27,33 +28,6 @@ struct CachedSurface;
 using SurfaceCache = boost::icl::interval_map<PAddr, std::set<std::shared_ptr<CachedSurface>>>;

 struct CachedSurface {
-    enum class PixelFormat {
-        // First 5 formats are shared between textures and color buffers
-        RGBA8 = 0,
-        RGB8 = 1,
-        RGB5A1 = 2,
-        RGB565 = 3,
-        RGBA4 = 4,
-
-        // Texture-only formats
-        IA8 = 5,
-        RG8 = 6,
-        I8 = 7,
-        A8 = 8,
-        IA4 = 9,
-        I4 = 10,
-        A4 = 11,
-        ETC1 = 12,
-        ETC1A4 = 13,
-
-        // Depth buffer-only formats
-        D16 = 14,
-        // gap
-        D24 = 16,
-        D24S8 = 17,
-
-        Invalid = 255,
-    };

    enum class SurfaceType {
        Color = 0,
@ -63,58 +37,8 @@ struct CachedSurface {
        Invalid = 4,
    };

-    static unsigned int GetFormatBpp(CachedSurface::PixelFormat format) {
-        static const std::array<unsigned int, 18> bpp_table = {
-            32, // RGBA8
-            24, // RGB8
-            16, // RGB5A1
-            16, // RGB565
-            16, // RGBA4
-            16, // IA8
-            16, // RG8
-            8,  // I8
-            8,  // A8
-            8,  // IA4
-            4,  // I4
-            4,  // A4
-            4,  // ETC1
-            8,  // ETC1A4
-            16, // D16
-            0,
-            24, // D24
-            32, // D24S8
-        };
-
-        ASSERT((unsigned int)format < ARRAY_SIZE(bpp_table));
-        return bpp_table[(unsigned int)format];
-    }
-
-    static PixelFormat PixelFormatFromTextureFormat(Pica::Regs::TextureFormat format) {
-        return ((unsigned int)format < 14) ? (PixelFormat)format : PixelFormat::Invalid;
-    }
-
-    static PixelFormat PixelFormatFromColorFormat(Pica::Regs::ColorFormat format) {
-        return ((unsigned int)format < 5) ? (PixelFormat)format : PixelFormat::Invalid;
-    }
-
-    static PixelFormat PixelFormatFromDepthFormat(Pica::Regs::DepthFormat format) {
-        return ((unsigned int)format < 4) ? (PixelFormat)((unsigned int)format + 14)
-                                          : PixelFormat::Invalid;
-    }
-
-    static PixelFormat PixelFormatFromGPUPixelFormat(GPU::Regs::PixelFormat format) {
-        switch (format) {
-        // RGB565 and RGB5A1 are switched in PixelFormat compared to ColorFormat
-        case GPU::Regs::PixelFormat::RGB565:
-            return PixelFormat::RGB565;
-        case GPU::Regs::PixelFormat::RGB5A1:
-            return PixelFormat::RGB5A1;
-        default:
-            return ((unsigned int)format < 5) ? (PixelFormat)format : PixelFormat::Invalid;
-        }
-    }
-
-    static bool CheckFormatsBlittable(PixelFormat pixel_format_a, PixelFormat pixel_format_b) {
+    static bool CheckFormatsBlittable(Pica::Texture::Format::Type pixel_format_a,
+                                      Pica::Texture::Format::Type pixel_format_b) {
        SurfaceType a_type = GetFormatType(pixel_format_a);
        SurfaceType b_type = GetFormatType(pixel_format_b);

@ -134,7 +58,7 @@ struct CachedSurface {
        return false;
    }

-    static SurfaceType GetFormatType(PixelFormat pixel_format) {
+    static SurfaceType GetFormatType(Pica::Texture::Format::Type pixel_format) {
        if ((unsigned int)pixel_format < 5) {
            return SurfaceType::Color;
        }
@ -143,11 +67,12 @@ struct CachedSurface {
            return SurfaceType::Texture;
        }

-        if (pixel_format == PixelFormat::D16 || pixel_format == PixelFormat::D24) {
+        if (pixel_format == Pica::Texture::Format::Type::D16 ||
+            pixel_format == Pica::Texture::Format::Type::D24) {
            return SurfaceType::Depth;
        }

-        if (pixel_format == PixelFormat::D24S8) {
+        if (pixel_format == Pica::Texture::Format::Type::D24S8) {
            return SurfaceType::DepthStencil;
        }

@ -177,7 +102,7 @@ struct CachedSurface {
    float res_scale_height = 1.f;

    bool is_tiled;
-    PixelFormat pixel_format;
+    Pica::Texture::Format::Type pixel_format;
    bool dirty;
 };

--- a/src/video_core/texture/codec.cpp
+++ b/src/video_core/texture/codec.cpp
@ -1,6 +1,10 @@
-#include "codec.h"
-#include "internal\codecs.h"
-#include "internal\morton.h"
+#include "common/color.h"
+#include "common/math_util.h"
+#include "common/swap.h"
+#include "common/vector_math.h"
+#include "video_core/texture/codec.h"
+#include "video_core/texture/internal/codecs.h"
+#include "video_core/texture/internal/morton.h"

 namespace Pica {
 namespace Texture {
@ -17,18 +21,6 @@ void Codec::encode() {
        this->encode_morton_pass();
 };

-void Codec::setSize() {
-    this->start_nibbles_size = format_size;
-};
-
-inline void Codec::setWidth(u32 width) {
-    this->width = width;
-}
-
-inline void Codec::setHeight(u32 height) {
-    this->height = height;
-}
-
 void Codec::configTiling(bool active, u32 tiling) {
    this->morton = true;
    this->morton_pass_tiling = tiling;
@ -63,15 +55,16 @@ bool Codec::invalid() {
 }

 void Codec::init(bool decode) {
+    this->setSize();
+    this->expected_nibbles_size = this->start_nibbles_size;
    if (decode) {
        if (this->raw_RGBA)
            this->expected_nibbles_size = 8;
    } else {
-        this->start_nibbles_size = this->format_size;
-        if (this->raw_RGBA)
-            this->expected_nibbles_size = this->format_size;
        if (this->preconverted)
            this->start_nibbles_size = 8;
+        if (!this->raw_RGBA)
+            this->expected_nibbles_size = this->start_nibbles_size;
    }
    if (!this->external_result_buffer) {
        size_t buff_size = this->width * this->height * this->expected_nibbles_size / 2;
@ -80,7 +73,7 @@ void Codec::init(bool decode) {
    }
 }

-inline void Codec::decode_morton_pass() {
+void Codec::decode_morton_pass() {
    if (this->morton_pass_tiling == 8)
        Decoders::Morton_8x8(this->target_buffer, this->passing_buffer, this->width, this->height,
                             this->start_nibbles_size * 4);
@ -89,7 +82,7 @@ inline void Codec::decode_morton_pass() {
                               this->start_nibbles_size * 4);
 }

-inline void Codec::encode_morton_pass() {
+void Codec::encode_morton_pass() {
    if (this->morton_pass_tiling == 8)
        Encoders::Morton_8x8(this->target_buffer, this->passing_buffer, this->width, this->height,
                             this->start_nibbles_size * 4);
@ -98,41 +91,41 @@ inline void Codec::encode_morton_pass() {
                               this->start_nibbles_size * 4);
 }

-std::unique_ptr<Codec> CodecFactory::build(Format format, u8* target, u32 width, u32 height) {
+std::unique_ptr<Codec> CodecFactory::build(Format::Type format, u8* target, u32 width, u32 height) {
    switch (format) {
-    case Format::RGBA8:
+    case Format::Type::RGBA8:
        return std::make_unique<RGBACodec>(target, width, height);
-    case Format::RGB8:
+    case Format::Type::RGB8:
        return std::make_unique<RGBCodec>(target, width, height);
-    case Format::RGB5A1:
+    case Format::Type::RGB5A1:
        return std::make_unique<RGB5A1Codec>(target, width, height);
-    case Format::RGB565:
+    case Format::Type::RGB565:
        return std::make_unique<RGB565Codec>(target, width, height);
-    case Format::RGBA4:
+    case Format::Type::RGBA4:
        return std::make_unique<RGBA4Codec>(target, width, height);
-    case Format::RG8:
+    case Format::Type::RG8:
        return std::make_unique<RG8Codec>(target, width, height);
-    case Format::IA8:
+    case Format::Type::IA8:
        return std::make_unique<IA8Codec>(target, width, height);
-    case Format::I8:
+    case Format::Type::I8:
        return std::make_unique<I8Codec>(target, width, height);
-    case Format::A8:
+    case Format::Type::A8:
        return std::make_unique<A8Codec>(target, width, height);
-    case Format::IA4:
+    case Format::Type::IA4:
        return std::make_unique<IA4Codec>(target, width, height);
-    case Format::I4:
+    case Format::Type::I4:
        return std::make_unique<I4Codec>(target, width, height);
-    case Format::A4:
+    case Format::Type::A4:
        return std::make_unique<A4Codec>(target, width, height);
-    case Format::ETC1:
+    case Format::Type::ETC1:
        return std::make_unique<ETC1Codec>(target, width, height);
-    case Format::ETC1A4:
+    case Format::Type::ETC1A4:
        return std::make_unique<ETC1A4Codec>(target, width, height);
-    case Format::D16:
+    case Format::Type::D16:
        return std::make_unique<D16Codec>(target, width, height);
-    case Format::D24:
+    case Format::Type::D24:
        return std::make_unique<D24Codec>(target, width, height);
-    case Format::D24S8:
+    case Format::Type::D24S8:
        return std::make_unique<D24S8Codec>(target, width, height);
    default:
        return nullptr;
--- a/src/video_core/texture/codec.h
+++ b/src/video_core/texture/codec.h
@ -1,9 +1,10 @@
+
+#pragma once
+
 #include <iostream>
 #include <memory>
 #include "common/common_types.h"
-#include "formats.h"
-
-#pragma once
+#include "video_core/texture/formats.h"

 namespace Pica {

@ -16,18 +17,23 @@ public:
        this->target_buffer = target;
        this->setWidth(width);
        this->setHeight(height);
-        this->setSize();
-        this->expected_nibbles_size = this->start_nibbles_size;
    }
    virtual ~Codec() {}

    virtual void decode();
    virtual void encode();

-    void setSize();
+    /// Stores the image width that init() and the Morton passes read later.
+    void setWidth(u32 width) {
+        this->width = width;
+    }

-    void setWidth(u32 width);
-    void setHeight(u32 height);
+    /// Stores the image height that init() and the Morton passes read later.
+    void setHeight(u32 height) {
+        this->height = height;
+    }
+
+    /// Returns expected_nibbles_size converted from nibbles to whole bytes.
+    /// The division truncates, so an odd nibble count rounds down (1 nibble -> 0).
+    u32 getInternalBytesPerPixel() {
+        const u32 nibbles = this->expected_nibbles_size;
+        return nibbles / 2;
+    }

    // Common Passes
    void configTiling(bool active, u32 tiling);
@ -54,7 +60,10 @@ protected:

    u32 start_nibbles_size;
    u32 expected_nibbles_size;
-    const u32 format_size = 8;
+
+    /// Default starting pixel size: 8 nibbles. Format codecs replace this with their own
+    /// value; init() calls it so the most-derived version is the one that runs.
+    virtual void setSize() {
+        start_nibbles_size = 8;
+    }

    u8* target_buffer;                     // Initial read buffer
    u8* passing_buffer;                    // pointer aliasing: Used and modified by passes
@ -65,12 +74,12 @@ protected:

    typedef Codec super;

-    inline void decode_morton_pass();
-    inline void encode_morton_pass();
+    void decode_morton_pass();
+    void encode_morton_pass();
 };

 namespace CodecFactory {
-std::unique_ptr<Codec> build(Pica::Texture::Format format, u8* target, u32 width, u32 height);
+std::unique_ptr<Codec> build(Pica::Texture::Format::Type format, u8* target, u32 width, u32 height);
 };

 } // Texture
--- a/src/video_core/texture/formats.h
+++ b/src/video_core/texture/formats.h
@ -1,36 +1,96 @@
+
 #pragma once

+#include <array>
+#include "common/assert.h"
+#include "core/hw/gpu.h"
+#include "video_core/pica.h"
+
 namespace Pica {

 namespace Texture {

-enum class Format {
-    // First 5 formats are shared between textures and color buffers
-    RGBA8 = 0,
-    RGB8 = 1,
-    RGB5A1 = 2,
-    RGB565 = 3,
-    RGBA4 = 4,
+struct Format {

-    // Texture-only formats
-    IA8 = 5,
-    RG8 = 6,
-    I8 = 7,
-    A8 = 8,
-    IA4 = 9,
-    I4 = 10,
-    A4 = 11,
-    ETC1 = 12,
-    ETC1A4 = 13,
+    enum class Type {
+        // First 5 formats are shared between textures and color buffers
+        RGBA8 = 0,
+        RGB8 = 1,
+        RGB5A1 = 2,
+        RGB565 = 3,
+        RGBA4 = 4,

-    // Depth buffer-only formats
-    D16 = 14,
-    // gap
-    D24 = 16,
-    D24S8 = 17,
+        // Texture-only formats
+        IA8 = 5,
+        RG8 = 6,
+        I8 = 7,
+        A8 = 8,
+        IA4 = 9,
+        I4 = 10,
+        A4 = 11,
+        ETC1 = 12,
+        ETC1A4 = 13,

-    Invalid = 255,
-};
+        // Depth buffer-only formats
+        D16 = 14,
+        // gap: value 15 has no enumerator
+        D24 = 16,
+        D24S8 = 17,
+
+        Invalid = 255,
+    };
+
+    /// Returns the number of bits one pixel of `format` occupies.
+    /// Slot 15 is the unassigned value between D16 and D24 and reports 0.
+    /// ASSERT guards against values outside the table, such as Type::Invalid (255).
+    static u32 GetBpp(Type format) {
+        static const std::array<unsigned int, 18> bpp_table = {
+            32, // RGBA8
+            24, // RGB8
+            16, // RGB5A1
+            16, // RGB565
+            16, // RGBA4
+            16, // IA8
+            16, // RG8
+            8,  // I8
+            8,  // A8
+            8,  // IA4
+            4,  // I4
+            4,  // A4
+            4,  // ETC1
+            8,  // ETC1A4
+            16, // D16
+            0,  // unassigned (gap)
+            24, // D24
+            32, // D24S8
+        };
+
+        // std::array reports its own length, so the bounds check does not need an
+        // array-size macro written for built-in arrays.
+        const auto index = static_cast<std::size_t>(format);
+        ASSERT(index < bpp_table.size());
+        return bpp_table[index];
+    }
+
+    /// Converts a texture format register value to Type.
+    /// Raw values 0..13 carry over unchanged; anything larger becomes Type::Invalid.
+    static Type FromTextureFormat(Regs::TextureFormat format) {
+        const auto raw = static_cast<unsigned int>(format);
+        if (raw >= 14)
+            return Type::Invalid;
+        return static_cast<Type>(raw);
+    }
+
+    /// Converts a color buffer format register value to Type.
+    /// Raw values 0..4 carry over unchanged; anything larger becomes Type::Invalid.
+    static Type FromColorFormat(Regs::ColorFormat format) {
+        const auto raw = static_cast<unsigned int>(format);
+        if (raw >= 5)
+            return Type::Invalid;
+        return static_cast<Type>(raw);
+    }
+
+    /// Converts a depth buffer format register value to Type by adding 14 (the value of D16).
+    /// Raw values of 4 and above become Type::Invalid.
+    static Type FromDepthFormat(Regs::DepthFormat format) {
+        const auto raw = static_cast<unsigned int>(format);
+        if (raw >= 4)
+            return Type::Invalid;
+        // NOTE(review): raw value 1 maps to 15, which has no enumerator in Type (the gap
+        // between D16 and D24) - confirm the register can never hold it.
+        return static_cast<Type>(raw + 14);
+    }
+
+    /// Converts a GPU register pixel format to Type.
+    static Type FromGPUPixelFormat(GPU::Regs::PixelFormat format) {
+        // RGB565 and RGB5A1 are switched in PixelFormat compared to ColorFormat, so these
+        // two are translated by name rather than by numeric value.
+        if (format == GPU::Regs::PixelFormat::RGB565)
+            return Type::RGB565;
+        if (format == GPU::Regs::PixelFormat::RGB5A1)
+            return Type::RGB5A1;
+
+        // Every other value below 5 shares its number with Type.
+        const auto raw = static_cast<unsigned int>(format);
+        return (raw < 5) ? static_cast<Type>(raw) : Type::Invalid;
+    }
+
+}; // Format

 } // Texture

--- a/src/video_core/texture/internal/codecs.cpp
+++ b/src/video_core/texture/internal/codecs.cpp
@ -1,7 +1,19 @@
-#include "codecs.h"
-#include "etc1.h"
-#include "morton.h"
-#include "texture_utils.h"
+#include "video_core/texture/internal/codecs.h"
+#include "video_core/texture/internal/etc1.h"
+#include "video_core/texture/internal/morton.h"
+#include "video_core/texture/internal/texture_utils.h"
+
+///////////////////////////////////////////////////////////////////////////////
+// Optimizations
+//////////////////////////////////////////////////////////////////////////////
+#ifdef _MSC_VER
+#pragma inline_recursion(on)
+#elif defined(CLANG_OR_GCC)
+#pragma GCC optimize("-fpeel-loops")
+#pragma GCC optimize("-fpredictive-commoning")
+#pragma GCC optimize("-ftree-loop-distribute-patterns")
+#pragma GCC optimize("-ftree-vectorize")
+#endif

 // Decoders
 #include "decoders.cpp"
--- a/src/video_core/texture/internal/codecs.h
+++ b/src/video_core/texture/internal/codecs.h
@ -1,10 +1,11 @@
+
+#pragma once
+
 #include <iostream>
 #include <memory>
 #include "common/common_types.h"
 #include "video_core/texture/codec.h"

-#pragma once
-
 // each texture format codec
 class RGBACodec : public Pica::Texture::Codec {
 public:
@ -13,7 +14,9 @@ public:
    void encode();

 protected:
-    const u32 format_size = 8;
+    /// Starting pixel size for this codec: 8 nibbles (32 bits).
+    /// `override` makes the compiler verify this still matches Codec::setSize().
+    void setSize() override {
+        this->start_nibbles_size = 8;
+    }
 };

 class RGBCodec : public Pica::Texture::Codec {
@ -23,7 +26,9 @@ public:
    void encode();

 protected:
-    const u32 format_size = 6;
+    /// Starting pixel size for this codec: 6 nibbles (24 bits).
+    /// `override` makes the compiler verify this still matches Codec::setSize().
+    void setSize() override {
+        this->start_nibbles_size = 6;
+    }
 };

 class RGB5A1Codec : public Pica::Texture::Codec {
@ -33,7 +38,9 @@ public:
    void encode();

 protected:
-    const u32 format_size = 4;
+    /// Starting pixel size for this codec: 4 nibbles (16 bits).
+    /// `override` makes the compiler verify this still matches Codec::setSize().
+    void setSize() override {
+        this->start_nibbles_size = 4;
+    }
 };

 class RGBA4Codec : public Pica::Texture::Codec {
@ -43,7 +50,9 @@ public:
    void encode();

 protected:
-    const u32 format_size = 4;
+    /// Starting pixel size for this codec: 4 nibbles (16 bits).
+    /// `override` makes the compiler verify this still matches Codec::setSize().
+    void setSize() override {
+        this->start_nibbles_size = 4;
+    }
 };

 class RGB565Codec : public Pica::Texture::Codec {
@ -53,7 +62,9 @@ public:
    void encode();

 protected:
-    const u32 format_size = 4;
+    /// Starting pixel size for this codec: 4 nibbles (16 bits).
+    /// `override` makes the compiler verify this still matches Codec::setSize().
+    void setSize() override {
+        this->start_nibbles_size = 4;
+    }
 };

 class RG8Codec : public Pica::Texture::Codec {
@ -63,7 +74,9 @@ public:
    void encode();

 protected:
-    const u32 format_size = 4;
+    /// Starting pixel size for this codec: 4 nibbles (16 bits).
+    /// `override` makes the compiler verify this still matches Codec::setSize().
+    void setSize() override {
+        this->start_nibbles_size = 4;
+    }
 };

 class IA8Codec : public Pica::Texture::Codec {
@ -73,7 +86,9 @@ public:
    void encode();

 protected:
-    const u32 format_size = 4;
+    /// Starting pixel size for this codec: 4 nibbles (16 bits).
+    /// `override` makes the compiler verify this still matches Codec::setSize().
+    void setSize() override {
+        this->start_nibbles_size = 4;
+    }
 };

 class I8Codec : public Pica::Texture::Codec {
@ -83,7 +98,9 @@ public:
    void encode();

 protected:
-    const u32 format_size = 2;
+    /// Starting pixel size for this codec: 2 nibbles (8 bits).
+    /// `override` makes the compiler verify this still matches Codec::setSize().
+    void setSize() override {
+        this->start_nibbles_size = 2;
+    }
 };

 class A8Codec : public Pica::Texture::Codec {
@ -93,7 +110,9 @@ public:
    void encode();

 protected:
-    const u32 format_size = 2;
+    /// Starting pixel size for this codec: 2 nibbles (8 bits).
+    /// `override` makes the compiler verify this still matches Codec::setSize().
+    void setSize() override {
+        this->start_nibbles_size = 2;
+    }
 };

 class IA4Codec : public Pica::Texture::Codec {
@ -103,7 +122,9 @@ public:
    void encode();

 protected:
-    const u32 format_size = 2;
+    /// Starting pixel size for this codec: 2 nibbles (8 bits).
+    /// `override` makes the compiler verify this still matches Codec::setSize().
+    void setSize() override {
+        this->start_nibbles_size = 2;
+    }
 };

 class I4Codec : public Pica::Texture::Codec {
@ -113,7 +134,9 @@ public:
    void encode();

 protected:
-    const u32 format_size = 1;
+    /// Starting pixel size for this codec: 1 nibble (4 bits).
+    /// `override` makes the compiler verify this still matches Codec::setSize().
+    void setSize() override {
+        this->start_nibbles_size = 1;
+    }
 };

 class A4Codec : public Pica::Texture::Codec {
@ -123,7 +146,9 @@ public:
    void encode();

 protected:
-    const u32 format_size = 1;
+    /// Starting pixel size for this codec: 1 nibble (4 bits).
+    /// `override` makes the compiler verify this still matches Codec::setSize().
+    void setSize() override {
+        this->start_nibbles_size = 1;
+    }
 };

 class ETC1Codec : public Pica::Texture::Codec {
@ -133,7 +158,9 @@ public:
    void encode();

 protected:
-    const u32 format_size = 1;
+    /// Starting pixel size for this codec: 1 nibble (4 bits).
+    /// `override` makes the compiler verify this still matches Codec::setSize().
+    void setSize() override {
+        this->start_nibbles_size = 1;
+    }
 };

 class ETC1A4Codec : public Pica::Texture::Codec {
@ -143,7 +170,9 @@ public:
    void encode();

 protected:
-    const u32 format_size = 2;
+    /// Starting pixel size for this codec: 2 nibbles (8 bits).
+    /// `override` makes the compiler verify this still matches Codec::setSize().
+    void setSize() override {
+        this->start_nibbles_size = 2;
+    }
 };

 class D16Codec : public Pica::Texture::Codec {
@ -153,7 +182,9 @@ public:
    void encode();

 protected:
-    const u32 format_size = 4;
+    /// Starting pixel size for this codec: 4 nibbles (16 bits).
+    /// `override` makes the compiler verify this still matches Codec::setSize().
+    void setSize() override {
+        this->start_nibbles_size = 4;
+    }
 };

 class D24Codec : public Pica::Texture::Codec {
@ -163,7 +194,9 @@ public:
    void encode();

 protected:
-    const u32 format_size = 6;
+    /// Starting pixel size for this codec: 6 nibbles (24 bits).
+    /// `override` makes the compiler verify this still matches Codec::setSize().
+    void setSize() override {
+        this->start_nibbles_size = 6;
+    }
 };

 class D24S8Codec : public Pica::Texture::Codec {
@ -173,5 +206,7 @@ public:
    void encode();

 protected:
-    const u32 format_size = 8;
+    /// Starting pixel size for this codec: 8 nibbles (32 bits).
+    /// `override` makes the compiler verify this still matches Codec::setSize().
+    void setSize() override {
+        this->start_nibbles_size = 8;
+    }
 };
--- a/src/video_core/texture/internal/decoders.cpp
+++ b/src/video_core/texture/internal/decoders.cpp
@ -1,9 +1,10 @@

+
 namespace {

 template <const Math::Vec4<u8> decode_func(const u8*)>
 inline void rgba_pass(u8* read, u8* write) {
-    u32 pixel = decode_func(read).ToRGBA();
+    // ToRGBA() packs the decoded vector into one word; four bytes of it are copied to `write`.
+    auto pixel = decode_func(read).ToRGBA();
    std::memcpy(write, &pixel, 4);
 }

@ -72,34 +73,36 @@ void RG8Codec::decode() {
 namespace {

 inline u16 convert_nibbles(u8 nibbles) {
-    return ((u16)Color::Convert4To8((nibbles & 0xF0) >> 4) << 8) |
-           (u16)Color::Convert4To8((nibbles & 0x0F));
+    // Step 1: move each nibble into the low half of its own byte (0xHL -> 0x0H0L).
+    const u32 spread = ((nibbles & 0xF0) << 4) | (nibbles & 0x0F);
+    // Step 2: replicate each nibble into the high half of its byte (0x0H0L -> 0xHHLL).
+    return static_cast<u16>(spread | (spread << 4));
 }

-inline u32 build_luminance(u8 intensity, u8 alpha) {
+// Packs alpha into bits 24-31 and repeats intensity in the three lower bytes.
+// The operands are u32 so the shifts below are evaluated on unsigned 32-bit values.
+inline u32 build_luminance(u32 intensity, u32 alpha) {
    return (alpha << 24) | (intensity << 16) | (intensity << 8) | intensity;
 }

 inline void intensity_alpha_pass(u8* read, u8* write) {
    alignas(4) u8 pixel[2];
    std::memcpy(pixel, read, 2);
-    u32 result = build_luminance(pixel[0], pixel[1]);
+    // The second source byte is passed as intensity and the first as alpha.
+    u32 result = build_luminance(pixel[1], pixel[0]);
    std::memcpy(write, &result, 4);
 }

 inline void intensity_alpha_nibbles_pass(u8* read, u8* write) {
-    alignas(4) u8 pixel[2];
-    std::memcpy(pixel, read, 1);
-    u16 tmp = convert_nibbles(pixel[0]);
-    std::memcpy(pixel, &tmp, 2);
-    u32 result = build_luminance(pixel[0], pixel[1]);
+    // One source byte holds both 4-bit channels.
+    u8 packed;
+    std::memcpy(&packed, read, 1);
+    // Widen each nibble to a full byte, then split the 16-bit value into its two bytes.
+    const u16 widened = convert_nibbles(packed);
+    u8 halves[2];
+    std::memcpy(halves, &widened, 2);
+    // halves[] follows host byte order: on a little-endian host halves[1] is the byte
+    // built from the upper nibble, which is passed as intensity.
+    u32 result = build_luminance(halves[1], halves[0]);
    std::memcpy(write, &result, 4);
 }

 inline void intensity_pass(u8* read, u8* write) {
-    alignas(4) u8 pixel[1];
-    std::memcpy(pixel, read, 1);
-    u32 result = build_luminance(pixel[0], 255);
+    // A single byte is read directly; alpha is fixed at 255.
+    const u8 intensity = *read;
+    u32 result = build_luminance(intensity, 255);
    std::memcpy(write, &result, 4);
 }

@ -108,9 +111,9 @@ inline void intensity_nibbles_pass(u8* read, u8* write) {
    std::memcpy(pixel, read, 1);
    u16 tmp = convert_nibbles(pixel[0]);
    std::memcpy(pixel, &tmp, 2);
-    u32 result = build_luminance(pixel[0], 255);
+    u32 result = build_luminance(pixel[1], 255);
    std::memcpy(write, &result, 4);
-    result = build_luminance(pixel[1], 255);
+    result = build_luminance(pixel[0], 255);
    std::memcpy(write + 4, &result, 4);
 }

--- a/src/video_core/texture/internal/etc1.cpp
+++ b/src/video_core/texture/internal/etc1.cpp
@ -9,14 +9,24 @@
 #include "common/math_util.h"
 #include "common/swap.h"
 #include "common/vector_math.h"
-#include "etc1.h"
-#include "texture_utils.h"
+#include "video_core/texture/internal/etc1.h"
+#include "video_core/texture/internal/texture_utils.h"

-constexpr std::array<u8[2], 8> etc1_modifier_table = {{
+namespace {
+
+// Storage qualifier for the file-local lookup tables below.
+// Builds with _DEBUG defined use a static object instead of constexpr; it is declared
+// const as well so the tables are read-only in both configurations.
+#ifdef _DEBUG
+#define CONST_FIX static const
+#else
+#define CONST_FIX constexpr
+#endif
+
+CONST_FIX std::array<u8[2], 8> etc1_modifier_table = {{
    {2, 8}, {5, 17}, {9, 29}, {13, 42}, {18, 60}, {24, 80}, {33, 106}, {47, 183},
 }};

-namespace {
+// Packs four channels into one word: r unshifted, g << 8, b << 16, a << 24.
+// Nothing is masked, so a channel value above 255 spills into the next channel.
+constexpr u32 buildRGBA(u32 r, u32 g, u32 b, u32 a) {
+    return r | (g << 8) | (b << 16) | (a << 24);
+}

 union ETC1Tile {
    u64 raw;
@ -62,7 +72,7 @@ union ETC1Tile {
        BitField<60, 4, u64> r1;
    } separate;

-    const Math::Vec3<u8> GetRGB(u32 x, u32 y) const {
+    const u32 GetRGB(u32 x, u32 y) const {
        int texel = 4 * x + y;

        if (flip)
@ -106,7 +116,7 @@ union ETC1Tile {
        ret.g() = MathUtil::Clamp(ret.g() + modifier, 0, 255);
        ret.b() = MathUtil::Clamp(ret.b() + modifier, 0, 255);

-        return ret.Cast<u8>();
+        return buildRGBA(ret.r(), ret.g(), ret.b(), 0);
    }
 };

@ -121,7 +131,8 @@ inline void etc1_pass(u8* etc1_buffer, u8* linear_buffer, u32 x_blocks) {
        std::memcpy(&tile.raw, &etc1_buffer[i * 8], 8);
        for (u32 k = 0; k < 4; k++) {
            for (u32 j = 0; j < 4; j++) {
-                u32 rgba = (tile.GetRGB(j, k).ToRGB()) | 0xFF000000;
+                auto rgb = tile.GetRGB(j, k);
+                u32 rgba = rgb | 0xFF000000;
                std::memcpy(&tmp[k * line + j * 4 + index], &rgba, 4);
            }
        }
@ -142,7 +153,8 @@ inline void etc1a4_pass(u8* etc1_buffer, u8* linear_buffer, u32 x_blocks) {
            for (u32 j = 0; j < 4; j++) {
                u32 alpha = (alpha_tile >> (4 * (j * 4 + k))) & 0x0F;
                alpha |= (alpha << 4);
-                u32 rgba = tile.GetRGB(j, k).ToRGB() | (alpha << 24);
+                auto rgb = tile.GetRGB(j, k);
+                u32 rgba = rgb | (alpha << 24);
                std::memcpy(&tmp[k * line + j * 4 + index], &rgba, 4);
            }
        }
--- a/src/video_core/texture/internal/etc1.h
+++ b/src/video_core/texture/internal/etc1.h
@ -1,7 +1,6 @@
+#pragma once

 #include "common/common_types.h"

-#pragma once
-
 void ETC1(u8* etc1_buffer, u8* matrix_buffer, u32 width, u32 height);
 void ETC1A4(u8* etc1_buffer, u8* matrix_buffer, u32 width, u32 height);
--- a/src/video_core/texture/internal/morton.cpp
+++ b/src/video_core/texture/internal/morton.cpp
@ -2,8 +2,8 @@
 #include <memory>
 #include <utility>
 #include "common/common_types.h"
-#include "morton.h"
-#include "texture_utils.h"
+#include "video_core/texture/internal/morton.h"
+#include "video_core/texture/internal/texture_utils.h"

 ///////////////////////////////////////////////////////////////////////////////
 // Optimizations
@ -15,6 +15,8 @@
 // favor fast code over small code.
 #pragma optimize("t", on)
 #pragma intrinsic(memcpy)
+#define __hot
+#define __no_inline __declspec(noinline)
 #elif defined(CLANG_OR_GCC)
 // The next 3 will swizle memory copying to help find the best sse/avx shuffling
 // in case it's possible. Compilation tests have proven effective use of these
@ -22,12 +24,20 @@
 #pragma GCC optimize("-fpredictive-commoning")
 #pragma GCC optimize("-ftree-loop-distribute-patterns")
 #pragma GCC optimize("-ftree-vectorize")
-// limit inlining
-#pragma GCC option("--param max-inline-insns-single=128")
-
+#pragma GCC option("--param inline-unit-growth=400")
+#pragma GCC option("--param large-function-growth=800")
 // The beauty of these compiler options is that they generate better code than
 // hand written intrinsics, since inline expanding memeory transfers can be pattern
 // matched with vector instructions available in the target.
+#define __no_inline __attribute__((noinline))
+#define __hot __attribute__((hot))
+// Fallback for compilers without a __forceinline keyword: map it to the GNU attribute.
+// The attribute keyword is spelled __attribute__ (leading and trailing double underscore).
+#if !defined(__forceinline)
+#define __forceinline __attribute__((always_inline))
+#endif
+#else
+#define __hot
+#define __no_inline
+#define __forceinline
 #endif

 #pragma region Z_Order
@ -54,11 +64,11 @@ constexpr u32 isBottom(u32 block_index) {
 }

 template <void codec(u8*, u8*, size_t), size_t nibbles, u32 blocks, size_t block_size>
-inline void swizzle_block(u8*& morton_block, u8* linear_block);
+__forceinline static void swizzle_block(u8*& morton_block, u8* linear_block);

 template <void codec(u8*, u8*, size_t), size_t nibbles, u32 block_index, u32 blocks,
          size_t block_size>
-inline void swizzle_block_aux(u8*& morton_block, u8* linear_block) {
+__forceinline static void swizzle_block_aux(u8*& morton_block, u8* linear_block) {
    // move the linear_block pointer to the appropiate block
    const size_t right = isRight(block_index) * (blocks * nibbles) / 2;
    const size_t down = isBottom(block_index) * block_size;
@ -67,7 +77,7 @@ inline void swizzle_block_aux(u8*& morton_block, u8* linear_block) {
 }

 template <void codec(u8*, u8*, size_t), size_t nibbles, u32 blocks, size_t block_size>
-inline void swizzle_block(u8*& morton_block, u8* linear_block) {
+__forceinline static void swizzle_block(u8*& morton_block, u8* linear_block) {
    const size_t new_block_size = block_size / 2;
    if (blocks <= 2) {
        // We handle 2*2 blocks on z-order
@ -94,14 +104,14 @@ inline void swizzle_block(u8*& morton_block, u8* linear_block) {
 }

 template <void codec(u8*, u8*, size_t), size_t nibbles, size_t lines_per_block>
-void swizzle_pass(u8* morton_block, u8* linear_block) {
+// Computes the byte size of one lines_per_block x lines_per_block tile (nibbles / 2 bytes
+// per pixel) and hands the tile to swizzle_block. File-local, with inlining requested.
+__forceinline static void swizzle_pass(u8* morton_block, u8* linear_block) {
    const size_t block_size = (lines_per_block * lines_per_block * nibbles) / 2;
    swizzle_block<codec, nibbles, lines_per_block, block_size>(morton_block, linear_block);
 }
 #pragma endregion Z_Order

 template <size_t nibbles, size_t lines_per_block>
-void encode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) {
+__hot inline static void encode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) {
    const u32 tile_size = (lines_per_block * lines_per_block * nibbles) / 2;
    alignas(64) u8 tmp[tile_size];
    tiling_pass<&encode, nibbles, lines_per_block>(linear_buffer, tmp, x_blocks);
@ -109,7 +119,7 @@ void encode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) {
 }

 template <size_t nibbles, size_t lines_per_block>
-void decode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) {
+__hot inline static void decode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) {
    const u32 tile_size = (lines_per_block * lines_per_block * nibbles) / 2;
    alignas(64) u8 tmp[tile_size];
    swizzle_pass<&decode, nibbles, lines_per_block>(morton_buffer, tmp);
@ -117,7 +127,7 @@ void decode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) {
 }

 template <void codec(u8*, u8*, u32), size_t nibbles, size_t lines_per_block>
-void morton_pass(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height) {
+__hot static void morton_pass(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height) {
    const u32 x_blocks = (width / lines_per_block);
    const u32 y_blocks = (height / lines_per_block);
    const size_t line_size = (lines_per_block * nibbles) / 2;
@ -135,9 +145,22 @@ void morton_pass(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height) {
    }
 }

+// Single out-of-line entry point for the 8x8-tile, 8-nibble (32 bpp) Morton pass in both
+// directions, so the decode and encode instantiations are kept together.
+// `decode` selects the direction: true runs decode_pass, false runs encode_pass.
+__no_inline __hot static void morton_8x8_32(u8* morton_buffer, u8* matrix_buffer, u32 width,
+                                            u32 height, bool decode) {
+    if (!decode) {
+        morton_pass<&encode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height);
+        return;
+    }
+    morton_pass<&decode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height);
+}
+
 namespace Decoders {

 bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) {
+    if (bpp == 32) {
+        morton_8x8_32(morton_buffer, matrix_buffer, width, height, true);
+        return true;
+    }
    switch (bpp) {
    case 4: {
        morton_pass<&decode_pass<1, 8>, 1, 8>(morton_buffer, matrix_buffer, width, height);
@ -159,11 +182,6 @@ bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32
        return true;
        break;
    }
-    case 32: {
-        morton_pass<&decode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height);
-        return true;
-        break;
-    }
    default: {
        return false;
        break;
@ -209,6 +227,10 @@ bool Morton_32x32(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u
 namespace Encoders {

 bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) {
+    if (bpp == 32) {
+        morton_8x8_32(morton_buffer, matrix_buffer, width, height, false);
+        return true;
+    }
    switch (bpp) {
    case 4: {
        morton_pass<&encode_pass<1, 8>, 1, 8>(morton_buffer, matrix_buffer, width, height);
@ -230,11 +252,6 @@ bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32
        return true;
        break;
    }
-    case 32: {
-        morton_pass<&encode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height);
-        return true;
-        break;
-    }
    default: {
        return false;
        break;
--- a/src/video_core/texture/internal/morton.h
+++ b/src/video_core/texture/internal/morton.h
@ -1,7 +1,7 @@
-#include "common/common_types.h"
-
 #pragma once

+#include "common/common_types.h"
+
 enum class MortonPass { Tile8x8, Tile32x32 };

 namespace Decoders {
--- a/src/video_core/texture/internal/texture_utils.h
+++ b/src/video_core/texture/internal/texture_utils.h
@ -1,3 +1,5 @@
+#pragma once
+
 #include <array>
 #include <cstring>
 #include <memory>
@ -5,12 +7,9 @@
 #include "common/color.h"
 #include "common/swap.h"

-#pragma once
-
 #if ((defined(__clang__) || defined(__GNUC__)) && !defined(__INTEL_COMPILER))
 #define CLANG_OR_GCC
 #endif
-
 ///////////////////////////////////////////////////////////////////////////////
 // Optimizations
 //////////////////////////////////////////////////////////////////////////////
@ -23,16 +22,6 @@
 #pragma GCC optimize("-ftree-vectorize")
 #endif

-// @param read_size is the amount of bytes each pixel takes
-inline void decode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
-    std::memcpy(matrix_pointer, morton_pointer, read_size);
-}
-
-// @param read_size is the amount of bytes each pixel takes
-inline void encode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
-    std::memcpy(morton_pointer, matrix_pointer, read_size);
-}
-
 // Pre: width % 8 == 0 && height % 8 == 0
 template <void pass(u8*, u8*), u32 read_size, u32 write_size, u32 tuning = 2>
 inline void image_pass_aux_rev(u8* target, u32 width, u32 height) {
@ -80,9 +69,9 @@ inline void image_pass_aux(u8* target, u32 width, u32 height) {
 template <void pass(u8*, u8*), u32 read_size, u32 write_size, u32 tuning = 2>
 inline void image_pass(u8* target, u32 width, u32 height) {
    if (read_size > write_size)
-        image_pass_aux<pass, read_size, write_size, tuning>;
+        // Input pixels are wider than output pixels: run image_pass_aux on the buffer.
+        image_pass_aux<pass, read_size, write_size, tuning>(target, width, height);
    else
-        image_pass_aux_rev<pass, read_size, write_size, tuning>;
+        // NOTE(review): presumably the _rev variant walks the image backwards so that
+        // widening pixels in place does not overwrite unread input - confirm against
+        // image_pass_aux_rev.
+        image_pass_aux_rev<pass, read_size, write_size, tuning>(target, width, height);
 }

 template <void codec(u8*, u8*, size_t), size_t nibbles, size_t lines_per_block>
@ -96,3 +85,13 @@ void tiling_pass(u8* linear, u8* tiled, u32 x_blocks) {
        codec(tiled + tiled_index, linear + linear_index, tiled_line_size);
    }
 }
+
+// Copies read_size bytes from morton_pointer to matrix_pointer.
+// @param read_size number of bytes copied by this call (tiling_pass passes a whole tile
+//        line, not a single pixel)
+inline void decode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
+    std::memcpy(matrix_pointer, morton_pointer, read_size);
+}
+
+// Copies read_size bytes from matrix_pointer to morton_pointer (the reverse of decode).
+// @param read_size number of bytes copied by this call (tiling_pass passes a whole tile
+//        line, not a single pixel)
+inline void encode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
+    std::memcpy(morton_pointer, matrix_pointer, read_size);
+}