From bed6207ac72fcfbe65c6050324c7c50cced9339b Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fernandosahmkow1@hotmail.com>
Date: Mon, 16 Jan 2017 19:18:38 -0500
Subject: [PATCH] Fixes, cleaning and small refactors

---
 src/video_core/CMakeLists.txt                 |   1 +
 .../renderer_opengl/gl_rasterizer_cache.cpp   |  53 +--
 src/video_core/texture/codec.cpp              |  12 +-
 src/video_core/texture/formats.h              |  21 +-
 src/video_core/texture/internal/codecs.h      | 103 +++++-
 src/video_core/texture/internal/decoders.cpp  |  67 ++--
 src/video_core/texture/internal/encoders.cpp  |  23 +-
 src/video_core/texture/internal/etc1.cpp      |  16 +
 src/video_core/texture/internal/morton.cpp    | 305 ++----------------
 src/video_core/texture/internal/morton.h      |   4 +-
 .../texture/internal/morton8x8_optimized.cpp  | 253 +++++++++++++++
 .../texture/internal/texture_utils.h          |  25 +-
 12 files changed, 484 insertions(+), 399 deletions(-)
 create mode 100644 src/video_core/texture/internal/morton8x8_optimized.cpp

diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index b33869c22..061923911 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -1,3 +1,4 @@
+
 set(SRCS
             renderer_opengl/gl_rasterizer.cpp
             renderer_opengl/gl_rasterizer_cache.cpp
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index ac029b48f..5389a8941 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -26,6 +26,9 @@
 #include "video_core/utils.h"
 #include "video_core/video_core.h"
 
+static constexpr u32 TEXTURE_CACHE_SIZE = 1024 * 1024 * 8; // 8 MiB scratch for decode/encode
+alignas(64) static u8 TextureCache[TEXTURE_CACHE_SIZE];
+
 struct FormatTuple {
     GLint internal_format;
     GLenum format;
@@ -39,7 +42,7 @@ static const std::array<FormatTuple, 18> format_tuples = {{
     {GL_RGB565, GL_RGB, GL_UNSIGNED_SHORT_5_6_5},                  // RGB565
     {GL_RGBA4, GL_RGBA, GL_UNSIGNED_SHORT_4_4_4_4},                // RGBA4
     {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE},                         // IA8
-    {GL_RG8, GL_RG8, GL_UNSIGNED_BYTE},                            // RG8
+    {GL_RG8, GL_RG, GL_UNSIGNED_BYTE},                             // RG8
     {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE},                         // I8
     {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE},                         // A8
     {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE},                         // IA4
@@ -70,7 +73,7 @@ static const std::array<bool, 18> native_format = {
     false, // ETC1A4
     true,  // D16
     false,
-    false, // D24
+    true,  // D24
     false, // D24S8
 };
 
@@ -306,26 +309,29 @@ CachedSurface* RasterizerCacheOpenGL::GetSurface(const CachedSurface& params, bo
                 // clang-format on
                 );
             Pica::Texture::Codec* codec = tmp.get();
-            codec->configTiling(true, 8); // change 8 for 32 in case the mage is tiled
+            codec->configTiling(true, 8); // 8x8 tiles only: validate() now rejects tiling
                                           // on blocks of 32x32
             codec->configRGBATransform(!native_format[(unsigned int)params.pixel_format]);
             codec->validate();
             if (!codec->invalid()) {
-                codec->decode();
-                std::unique_ptr<u8[]> decoded_texture = codec->transferInternalBuffer();
-                u32 bytes = codec->getInternalBytesPerPixel();
-                if (bytes == 3)
-                    bytes = 1;
-                else if (bytes != 2)
-                    bytes = 4;
-                glPixelStorei(GL_UNPACK_ALIGNMENT, bytes);
-                glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height,
-                             0, tuple.format, tuple.type, decoded_texture.get());
-                glPixelStorei(GL_UNPACK_ALIGNMENT, 4);
+                u32 estimated_size =
+                    params.width * params.height * codec->getInternalBytesPerPixel();
+                if (estimated_size <= TEXTURE_CACHE_SIZE) {
+                    codec->setExternalBuffer(TextureCache);
+                    codec->decode();
+                    glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width,
+                                 params.height, 0, tuple.format, tuple.type, TextureCache);
+                } else {
+                    codec->decode();
+                    std::unique_ptr<u8[]> decoded_texture = codec->transferInternalBuffer();
+                    glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width,
+                                 params.height, 0, tuple.format, tuple.type, decoded_texture.get());
+                }
             } else {
                 LOG_WARNING(Render_OpenGL,
                             "Invalid texture sent to renderer; width: %d height %d type: %d",
                             params.width, params.height, (unsigned int)params.pixel_format);
+                return nullptr;
             }
         }
         // If not 1x scale, blit 1x texture to a new scaled texture and replace texture in surface
@@ -652,15 +658,22 @@ void RasterizerCacheOpenGL::FlushSurface(CachedSurface* surface) {
         glPixelStorei(GL_PACK_ROW_LENGTH, 0);
     } else {
         const FormatTuple& tuple = format_tuples[(u32)surface->pixel_format];
-        u32 bytes_per_pixel = Pica::Texture::Format::GetBpp(surface->pixel_format) / 8;
+        u32 bits_per_pixel = Pica::Texture::Format::GetBpp(surface->pixel_format);
         if (!native_format[(u32)surface->pixel_format])
-            bytes_per_pixel = 4;
-        std::vector<u8> temp_gl_buffer(surface->width * surface->height * bytes_per_pixel);
-        glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, temp_gl_buffer.data());
-
+            bits_per_pixel = 32;
+        u32 size = surface->width * surface->height * bits_per_pixel / 8;
+        std::vector<u8> temp_gl_buffer;
+        u8* temporal_buffer;
+        if (size <= TEXTURE_CACHE_SIZE)
+            temporal_buffer = TextureCache;
+        else {
+            temp_gl_buffer.resize(size);
+            temporal_buffer = temp_gl_buffer.data();
+        }
+        glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, temporal_buffer);
         std::unique_ptr<Pica::Texture::Codec> tmp = Pica::Texture::CodecFactory::build(
             // clang-format off
-            surface->pixel_format, temp_gl_buffer.data(), surface->width, surface->height
+            surface->pixel_format, temporal_buffer, surface->width, surface->height
             // clang-format on
             );
         Pica::Texture::Codec* codec = tmp.get();
diff --git a/src/video_core/texture/codec.cpp b/src/video_core/texture/codec.cpp
index 42f39c23f..d13a6dd45 100644
--- a/src/video_core/texture/codec.cpp
+++ b/src/video_core/texture/codec.cpp
@@ -67,7 +67,7 @@ void Codec::init(bool decode) {
             this->expected_nibbles_size = this->start_nibbles_size;
     }
     this->validate();
-    if (!this->external_result_buffer || !this->invalid()) {
+    if (!this->external_result_buffer) {
         size_t buff_size = this->width * this->height * this->expected_nibbles_size / 2;
         this->internal_buffer = std::make_unique<u8[]>(buff_size);
         this->passing_buffer = this->internal_buffer.get();
@@ -91,7 +91,7 @@ void Codec::validate() {
         this->invalid_state = true;
         return;
     }
-    if (this->morton && this->morton_pass_tiling != 8 && this->morton_pass_tiling != 32) {
+    if (this->morton && this->morton_pass_tiling != 8) {
         this->invalid_state = true;
         return;
     }
@@ -102,18 +102,12 @@ inline void Codec::decode_morton_pass() {
     if (this->morton_pass_tiling == 8)
         Decoders::Morton_8x8(this->target_buffer, this->passing_buffer, this->width, this->height,
                              this->start_nibbles_size * 4);
-    else if (this->morton_pass_tiling == 32)
-        Decoders::Morton_32x32(this->target_buffer, this->passing_buffer, this->width, this->height,
-                               this->start_nibbles_size * 4);
 }
 
 inline void Codec::encode_morton_pass() {
     if (this->morton_pass_tiling == 8)
-        Encoders::Morton_8x8(this->target_buffer, this->passing_buffer, this->width, this->height,
+        Encoders::Morton_8x8(this->passing_buffer, this->target_buffer, this->width, this->height,
                              this->start_nibbles_size * 4);
-    else if (this->morton_pass_tiling == 32)
-        Encoders::Morton_32x32(this->target_buffer, this->passing_buffer, this->width, this->height,
-                               this->start_nibbles_size * 4);
 }
 
 std::unique_ptr<Codec> CodecFactory::build(Format::Type format, u8* target, u32 width, u32 height) {
diff --git a/src/video_core/texture/formats.h b/src/video_core/texture/formats.h
index ffb24e615..6418e780d 100644
--- a/src/video_core/texture/formats.h
+++ b/src/video_core/texture/formats.h
@@ -40,7 +40,7 @@ struct Format {
         Invalid = 255,
     };
 
-    static u32 GetBpp(Type format) {
+    static const u32 GetBpp(Type format) {
         static const std::array<unsigned int, 18> bpp_table = {
             32, // RGBA8
             24, // RGB8
@@ -66,19 +66,19 @@ struct Format {
         return bpp_table[(u32)format];
     }
 
-    static Type FromTextureFormat(Regs::TextureFormat format) {
+    static constexpr Type FromTextureFormat(Regs::TextureFormat format) {
         return ((unsigned int)format < 14) ? (Type)format : Type::Invalid;
     }
 
-    static Type FromColorFormat(Regs::ColorFormat format) {
+    static constexpr Type FromColorFormat(Regs::ColorFormat format) {
         return ((unsigned int)format < 5) ? (Type)format : Type::Invalid;
     }
 
-    static Type FromDepthFormat(Regs::DepthFormat format) {
+    static constexpr Type FromDepthFormat(Regs::DepthFormat format) {
         return ((unsigned int)format < 4) ? (Type)((unsigned int)format + 14) : Type::Invalid;
     }
 
-    static Type FromGPUPixelFormat(GPU::Regs::PixelFormat format) {
+    static const Type FromGPUPixelFormat(GPU::Regs::PixelFormat format) {
         switch (format) {
         // RGB565 and RGB5A1 are switched in PixelFormat compared to ColorFormat
         case GPU::Regs::PixelFormat::RGB565:
@@ -92,6 +92,17 @@ struct Format {
 
 }; // Format
 
+struct Info {
+    PAddr physical_address;
+    int width;
+    int height;
+    int stride;
+    Pica::Regs::TextureFormat format;
+
+    static Info FromPicaRegister(const Pica::Regs::TextureConfig& config,
+                                 const Pica::Regs::TextureFormat& format);
+};
+
 } // Texture
 
 } // Pica
diff --git a/src/video_core/texture/internal/codecs.h b/src/video_core/texture/internal/codecs.h
index 9fa40908a..5864d09b3 100644
--- a/src/video_core/texture/internal/codecs.h
+++ b/src/video_core/texture/internal/codecs.h
@@ -5,6 +5,7 @@
 #include <memory>
 #include "common/common_types.h"
 #include "video_core/texture/codec.h"
+#include "video_core/texture/formats.h"
 
 // each texture format codec
 class RGBACodec : public Pica::Texture::Codec {
@@ -15,7 +16,11 @@ public:
 
 protected:
     virtual void setSize() {
-        this->start_nibbles_size = 8;
+        // clang-format off
+        this->start_nibbles_size = Pica::Texture::Format::GetBpp(
+            Pica::Texture::Format::Type::RGBA8
+        ) / 4;
+        // clang-format on
     };
 };
 
@@ -27,7 +32,11 @@ public:
 
 protected:
     virtual void setSize() {
-        this->start_nibbles_size = 6;
+        // clang-format off
+        this->start_nibbles_size = Pica::Texture::Format::GetBpp(
+            Pica::Texture::Format::Type::RGB8
+        ) / 4;
+        // clang-format on
     };
 };
 
@@ -39,7 +48,11 @@ public:
 
 protected:
     virtual void setSize() {
-        this->start_nibbles_size = 4;
+        // clang-format off
+        this->start_nibbles_size = Pica::Texture::Format::GetBpp(
+            Pica::Texture::Format::Type::RGB5A1
+        ) / 4;
+        // clang-format on
     };
 };
 
@@ -51,7 +64,11 @@ public:
 
 protected:
     virtual void setSize() {
-        this->start_nibbles_size = 4;
+        // clang-format off
+        this->start_nibbles_size = Pica::Texture::Format::GetBpp(
+            Pica::Texture::Format::Type::RGBA4
+        ) / 4;
+        // clang-format on
     };
 };
 
@@ -63,7 +80,11 @@ public:
 
 protected:
     virtual void setSize() {
-        this->start_nibbles_size = 4;
+        // clang-format off
+        this->start_nibbles_size = Pica::Texture::Format::GetBpp(
+            Pica::Texture::Format::Type::RGB565
+        ) / 4;
+        // clang-format on
     };
 };
 
@@ -75,7 +96,11 @@ public:
 
 protected:
     virtual void setSize() {
-        this->start_nibbles_size = 4;
+        // clang-format off
+        this->start_nibbles_size = Pica::Texture::Format::GetBpp(
+            Pica::Texture::Format::Type::RG8
+        ) / 4;
+        // clang-format on
     };
 };
 
@@ -87,7 +112,11 @@ public:
 
 protected:
     virtual void setSize() {
-        this->start_nibbles_size = 4;
+        // clang-format off
+        this->start_nibbles_size = Pica::Texture::Format::GetBpp(
+            Pica::Texture::Format::Type::IA8
+        ) / 4;
+        // clang-format on
     };
 };
 
@@ -99,7 +128,11 @@ public:
 
 protected:
     virtual void setSize() {
-        this->start_nibbles_size = 2;
+        // clang-format off
+        this->start_nibbles_size = Pica::Texture::Format::GetBpp(
+            Pica::Texture::Format::Type::I8
+        ) / 4;
+        // clang-format on
     };
 };
 
@@ -111,7 +144,11 @@ public:
 
 protected:
     virtual void setSize() {
-        this->start_nibbles_size = 2;
+        // clang-format off
+        this->start_nibbles_size = Pica::Texture::Format::GetBpp(
+            Pica::Texture::Format::Type::A8
+        ) / 4;
+        // clang-format on
     };
 };
 
@@ -123,7 +160,11 @@ public:
 
 protected:
     virtual void setSize() {
-        this->start_nibbles_size = 2;
+        // clang-format off
+        this->start_nibbles_size = Pica::Texture::Format::GetBpp(
+            Pica::Texture::Format::Type::IA4
+        ) / 4;
+        // clang-format on
     };
 };
 
@@ -135,7 +176,11 @@ public:
 
 protected:
     virtual void setSize() {
-        this->start_nibbles_size = 1;
+        // clang-format off
+        this->start_nibbles_size = Pica::Texture::Format::GetBpp(
+            Pica::Texture::Format::Type::I4
+        ) / 4;
+        // clang-format on
     };
 };
 
@@ -147,7 +192,11 @@ public:
 
 protected:
     virtual void setSize() {
-        this->start_nibbles_size = 1;
+        // clang-format off
+        this->start_nibbles_size = Pica::Texture::Format::GetBpp(
+            Pica::Texture::Format::Type::A4
+        ) / 4;
+        // clang-format on
     };
 };
 
@@ -159,7 +208,11 @@ public:
 
 protected:
     virtual void setSize() {
-        this->start_nibbles_size = 1;
+        // clang-format off
+        this->start_nibbles_size = Pica::Texture::Format::GetBpp(
+            Pica::Texture::Format::Type::ETC1
+        ) / 4;
+        // clang-format on
     };
 };
 
@@ -171,7 +224,11 @@ public:
 
 protected:
     virtual void setSize() {
-        this->start_nibbles_size = 2;
+        // clang-format off
+        this->start_nibbles_size = Pica::Texture::Format::GetBpp(
+            Pica::Texture::Format::Type::ETC1A4
+        ) / 4;
+        // clang-format on
     };
 };
 
@@ -183,7 +240,11 @@ public:
 
 protected:
     virtual void setSize() {
-        this->start_nibbles_size = 4;
+        // clang-format off
+        this->start_nibbles_size = Pica::Texture::Format::GetBpp(
+            Pica::Texture::Format::Type::D16
+        ) / 4;
+        // clang-format on
     };
 };
 
@@ -195,7 +256,11 @@ public:
 
 protected:
     virtual void setSize() {
-        this->start_nibbles_size = 6;
+        // clang-format off
+        this->start_nibbles_size = Pica::Texture::Format::GetBpp(
+            Pica::Texture::Format::Type::D24
+        ) / 4;
+        // clang-format on
     };
 };
 
@@ -207,6 +272,10 @@ public:
 
 protected:
     virtual void setSize() {
-        this->start_nibbles_size = 8;
+        // clang-format off
+        this->start_nibbles_size = Pica::Texture::Format::GetBpp(
+            Pica::Texture::Format::Type::D24S8
+        ) / 4;
+        // clang-format on
     };
 };
diff --git a/src/video_core/texture/internal/decoders.cpp b/src/video_core/texture/internal/decoders.cpp
index 28672e8fb..2ee7b936a 100644
--- a/src/video_core/texture/internal/decoders.cpp
+++ b/src/video_core/texture/internal/decoders.cpp
@@ -83,9 +83,9 @@ inline u32 build_luminance(u32 intensity, u32 alpha) {
 }
 
 inline void intensity_alpha_pass(u8* read, u8* write) {
-    alignas(4) u8 pixel[2];
-    std::memcpy(pixel, read, 2);
-    u32 result = build_luminance(pixel[1], pixel[0]);
+    u16 pixel;
+    std::memcpy(&pixel, read, 2);
+    u32 result = build_luminance(pixel >> 8, pixel & 0x00FF);
     std::memcpy(write, &result, 4);
 }
 
@@ -93,9 +93,7 @@ inline void intensity_alpha_nibbles_pass(u8* read, u8* write) {
     alignas(4) u8 pixel;
     std::memcpy(&pixel, read, 1);
     u16 tmp = convert_nibbles(pixel);
-    u8 tmp2[2];
-    std::memcpy(tmp2, &tmp, 2);
-    u32 result = build_luminance(tmp2[1], tmp2[0]);
+    u32 result = build_luminance(tmp >> 8, tmp & 0x00FF);
     std::memcpy(write, &result, 4);
 }
 
@@ -107,31 +105,29 @@ inline void intensity_pass(u8* read, u8* write) {
 }
 
 inline void intensity_nibbles_pass(u8* read, u8* write) {
-    alignas(4) u8 pixel[2];
-    std::memcpy(pixel, read, 1);
-    u16 tmp = convert_nibbles(pixel[0]);
-    std::memcpy(pixel, &tmp, 2);
-    u32 result = build_luminance(pixel[1], 255);
+    u8 pixel;
+    std::memcpy(&pixel, read, 1);
+    u16 tmp = convert_nibbles(pixel);
+    u32 result = build_luminance(tmp & 0x00FF, 255);
     std::memcpy(write, &result, 4);
-    result = build_luminance(pixel[0], 255);
+    result = build_luminance(tmp >> 8, 255);
     std::memcpy(write + 4, &result, 4);
 }
 
 inline void alpha_pass(u8* read, u8* write) {
-    alignas(4) u8 pixel[1];
-    std::memcpy(pixel, read, 1);
-    u32 result = build_luminance(0, pixel[0]);
+    u8 pixel;
+    std::memcpy(&pixel, read, 1);
+    u32 result = build_luminance(0, pixel);
     std::memcpy(write, &result, 4);
 }
 
 inline void alpha_nibbles_pass(u8* read, u8* write) {
-    alignas(4) u8 pixel[2];
-    std::memcpy(pixel, read, 1);
-    u16 tmp = convert_nibbles(pixel[0]);
-    std::memcpy(pixel, &tmp, 2);
-    u32 result = build_luminance(0, pixel[0]);
+    u8 pixel;
+    std::memcpy(&pixel, read, 1);
+    u16 tmp = convert_nibbles(pixel);
+    u32 result = build_luminance(0, tmp & 0x00FF);
     std::memcpy(write, &result, 4);
-    result = build_luminance(0, pixel[1]);
+    result = build_luminance(0, tmp >> 8);
     std::memcpy(write + 4, &result, 4);
 }
 
@@ -207,7 +203,7 @@ void ETC1A4Codec::decode() {
     ETC1A4(this->target_buffer, this->passing_buffer, this->width, this->height);
 }
 
-namespace {
+namespace Decode {
 
 inline void expand_depth16_pass(u8* read, u8* write) {
     alignas(4) u8 pixel[4];
@@ -224,11 +220,18 @@ inline void expand_depth24_pass(u8* read, u8* write) {
     std::memcpy(write, pixel, 4);
 }
 
-inline void fix_stencil_pass(u8* read, u8* write) {
-    u32 pixel;
-    std::memcpy(&pixel, read, 4);
-    pixel = (pixel << 8) | (pixel >> 24);
-    std::memcpy(write, &pixel, 4);
+// Rotates each 32-bit pixel left by 8 bits in place: (pixel << 8) | (pixel >> 24).
+inline void d24s8_pass(u8* target, u32 width, u32 height) {
+    // One flat loop over every pixel: batching in groups of 8 would skip the
+    // trailing (width * height) % 8 pixels and leave them unconverted.
+    const size_t pixels = static_cast<size_t>(width) * height;
+    for (size_t i = 0; i < pixels; i++) {
+        u32 pixel;
+        std::memcpy(&pixel, target, 4); // byte-wise load, no u32* cast of target
+        pixel = (pixel >> 24) | (pixel << 8);
+        std::memcpy(target, &pixel, 4);
+        target += 4;
+    }
+}
 
 } // Anonymous
@@ -236,7 +239,7 @@ inline void fix_stencil_pass(u8* read, u8* write) {
 void D16Codec::decode() {
     super::decode();
     if (this->raw_RGBA)
-        image_pass<&expand_depth16_pass, 4, 8>(
+        image_pass<&Decode::expand_depth16_pass, 4, 8>(
             // clang-format off
             this->passing_buffer, this->width, this->height
             // clang-format on
@@ -246,7 +249,7 @@ void D16Codec::decode() {
 void D24Codec::decode() {
     super::decode();
     if (this->raw_RGBA)
-        image_pass<&expand_depth24_pass, 6, 8>(
+        image_pass<&Decode::expand_depth24_pass, 6, 8>(
             // clang-format off
             this->passing_buffer, this->width, this->height
             // clang-format on
@@ -256,9 +259,5 @@ void D24Codec::decode() {
 void D24S8Codec::decode() {
     super::decode();
     if (this->raw_RGBA)
-        image_pass<&fix_stencil_pass, 8, 8, 8>(
-            // clang-format off
-            this->passing_buffer, this->width, this->height
-            // clang-format on
-            );
+        Decode::d24s8_pass(this->passing_buffer, this->width, this->height);
 }
diff --git a/src/video_core/texture/internal/encoders.cpp b/src/video_core/texture/internal/encoders.cpp
index 0844bb737..fb4616fc2 100644
--- a/src/video_core/texture/internal/encoders.cpp
+++ b/src/video_core/texture/internal/encoders.cpp
@@ -69,11 +69,18 @@ inline void contract_depth24_pass(u8* read, u8* write) {
     std::memcpy(write, pixel, 3);
 }
 
-inline void fix_stencil_pass(u8* read, u8* write) {
-    u32 pixel;
-    std::memcpy(&pixel, read, 4);
-    pixel = (pixel >> 24) | (pixel << 8);
-    std::memcpy(write, &pixel, 4);
+// Rotates each 32-bit pixel right by 8 bits in place: (pixel >> 8) | (pixel << 24).
+inline void d24s8_pass(u8* target, u32 width, u32 height) {
+    // One flat loop over every pixel: batching in groups of 8 would skip the
+    // trailing (width * height) % 8 pixels and leave them unconverted.
+    const size_t pixels = static_cast<size_t>(width) * height;
+    for (size_t i = 0; i < pixels; i++) {
+        u32 pixel;
+        std::memcpy(&pixel, target, 4); // byte-wise load, no u32* cast of target
+        pixel = (pixel >> 8) | (pixel << 24);
+        std::memcpy(target, &pixel, 4);
+        target += 4;
+    }
+}
 
 } // Anonymous
@@ -101,9 +108,5 @@ void D24Codec::encode() {
 void D24S8Codec::encode() {
     super::encode();
     if (this->raw_RGBA)
-        image_pass<&Encode::fix_stencil_pass, 8, 8, 8>(
-            // clang-format off
-            this->passing_buffer, this->width, this->height
-            // clang-format on
-            );
+        Encode::d24s8_pass(this->passing_buffer, this->width, this->height);
 }
diff --git a/src/video_core/texture/internal/etc1.cpp b/src/video_core/texture/internal/etc1.cpp
index 5a7edec33..9003c91a8 100644
--- a/src/video_core/texture/internal/etc1.cpp
+++ b/src/video_core/texture/internal/etc1.cpp
@@ -122,6 +122,22 @@ union ETC1Tile {
 
 } // anonymous namespace
 
+inline void decode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
+    std::memcpy(matrix_pointer, morton_pointer, read_size); // copy morton -> matrix
+}
+
+template <void codec(u8*, u8*, size_t), size_t nibbles, size_t lines_per_block>
+void tiling_pass(u8* linear, u8* tiled, u32 x_blocks) {
+    const size_t tiled_line_size = (lines_per_block * nibbles) / 2; // tile row bytes, presumably
+    const size_t row_length = x_blocks * tiled_line_size; // step between linear rows
+    for (u32 i = 0; i < lines_per_block; i++) {
+        const u32 k = (lines_per_block - 1 - i); // linear row index, reversed vs i
+        const size_t tiled_index = i * tiled_line_size;
+        const size_t linear_index = k * row_length;
+        codec(tiled + tiled_index, linear + linear_index, tiled_line_size); // one row per call
+    }
+}
+
 inline void etc1_pass(u8* etc1_buffer, u8* linear_buffer, u32 x_blocks) {
     const size_t line = 8 * 4;
     alignas(64) u8 tmp[line * 8];
diff --git a/src/video_core/texture/internal/morton.cpp b/src/video_core/texture/internal/morton.cpp
index 9bd74c800..5b84db7a0 100644
--- a/src/video_core/texture/internal/morton.cpp
+++ b/src/video_core/texture/internal/morton.cpp
@@ -1,295 +1,40 @@
-#include <cstring>
-#include <memory>
-#include <utility>
 #include "common/common_types.h"
 #include "video_core/texture/internal/morton.h"
-#include "video_core/texture/internal/texture_utils.h"
 
-///////////////////////////////////////////////////////////////////////////////
-// Optimizations
-//////////////////////////////////////////////////////////////////////////////
-#ifdef _MSC_VER
-#pragma inline_recursion(on)
-// Normaly set to 16 by default, the best balance seems to be on 8 for this module
-#pragma inline_depth(8)
-// favor fast code over small code.
-#pragma optimize("t", on)
-#pragma intrinsic(memcpy)
-#define __hot
-#define __no_inline __declspec(noinline)
-#elif defined(CLANG_OR_GCC)
-// The next 3 will swizle memory copying to help find the best sse/avx shuffling
-// in case it's possible. Compilation tests have proven effective use of these
-// flags on gcc and clang.
-#pragma GCC optimize("-fpredictive-commoning")
-#pragma GCC optimize("-ftree-loop-distribute-patterns")
-#pragma GCC optimize("-ftree-vectorize")
-#pragma GCC option("--param inline-unit-growth=400")
-#pragma GCC option("--param large-function-growth=800")
-// The beauty of these compiler options is that they generate better code than
-// hand written intrinsics, since inline expanding memeory transfers can be pattern
-// matched with vector instructions available in the target.
-#define __no_inline __attribute__((noinline))
-#define __hot __attribute__((hot))
-#if !defined(__forceinline)
-#define __forceinline attribute__((always_inline))
-#endif
-#else
-#define __hot
-#define __no_inline
-#define __forceinline
-#endif
-
-#pragma region Z_Order
-/////////////////////////////////////////////////////////////////////////////
-//          Z-Order:
-//
-//                    0-->1
-//                      /
-//                    2-->3
-//
-// for more information look at: https://en.wikipedia.org/wiki/Z-order_curve
-/////////////////////////////////////////////////////////////////////////////
-#define TOP_LEFT 0
-#define TOP_RIGHT 1
-#define BOTTOM_LEFT 2
-#define BOTTOM_RIGHT 3
-
-constexpr u32 isRight(u32 block_index) {
-    return (block_index % 2);
+static u32 Part1By1(u32 x) {
+    x &= 0x0000ffff;                 // x = ---- ---- ---- ---- fedc ba98 7654 3210
+    x = (x ^ (x << 8)) & 0x00ff00ff; // x = ---- ---- fedc ba98 ---- ---- 7654 3210
+    x = (x ^ (x << 4)) & 0x0f0f0f0f; // x = ---- fedc ---- ba98 ---- 7654 ---- 3210
+    x = (x ^ (x << 2)) & 0x33333333; // x = --fe --dc --ba --98 --76 --54 --32 --10
+    x = (x ^ (x << 1)) & 0x55555555; // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
+    return x;
 }
 
-constexpr u32 isBottom(u32 block_index) {
-    return (block_index / 2);
+static u32 Compact1By1(u32 x) {
+    x &= 0x55555555;                 // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
+    x = (x ^ (x >> 1)) & 0x33333333; // x = --fe --dc --ba --98 --76 --54 --32 --10
+    x = (x ^ (x >> 2)) & 0x0f0f0f0f; // x = ---- fedc ---- ba98 ---- 7654 ---- 3210
+    x = (x ^ (x >> 4)) & 0x00ff00ff; // x = ---- ---- fedc ba98 ---- ---- 7654 3210
+    x = (x ^ (x >> 8)) & 0x0000ffff; // x = ---- ---- ---- ---- fedc ba98 7654 3210
+    return x;
 }
 
-template <void codec(u8*, u8*, size_t), size_t nibbles, u32 blocks, size_t block_size>
-__forceinline static void swizzle_block(u8*& morton_block, u8* linear_block);
-
-template <void codec(u8*, u8*, size_t), size_t nibbles, u32 block_index, u32 blocks,
-          size_t block_size>
-__forceinline static void swizzle_block_aux(u8*& morton_block, u8* linear_block) {
-    // move the linear_block pointer to the appropiate block
-    const size_t right = isRight(block_index) * (blocks * nibbles) / 2;
-    const size_t down = isBottom(block_index) * block_size;
-    u8* new_linear = linear_block + right + down;
-    swizzle_block<codec, nibbles, blocks, block_size>(morton_block, new_linear);
+static u32 EncodeMorton(u32 x, u32 y) {
+    return (Part1By1(y) << 1) | Part1By1(x);
 }
 
-template <void codec(u8*, u8*, size_t), size_t nibbles, u32 blocks, size_t block_size>
-__forceinline static void swizzle_block(u8*& morton_block, u8* linear_block) {
-    const size_t new_block_size = block_size / 2;
-    if (blocks <= 2) {
-        // We handle 2*2 blocks on z-order
-        const size_t read_size = nibbles; // just for clearness. It's the same amount
-        // TOP_LEFT & TOP_RIGHT
-        codec(morton_block, linear_block, read_size);
-        morton_block += read_size;
-        // BOTTOM_LEFT & BOTTOM_RIGHT
-        codec(morton_block, linear_block + new_block_size, read_size);
-        morton_block += read_size;
-    } else {
-        // we divide the block into 4 blocks in z-order corecursively
-        // until we have 2x2 blocks.
-        const u32 subdivide = blocks / 2;
-        swizzle_block_aux<codec, nibbles, TOP_LEFT, subdivide, new_block_size>(morton_block,
-                                                                               linear_block);
-        swizzle_block_aux<codec, nibbles, TOP_RIGHT, subdivide, new_block_size>(morton_block,
-                                                                                linear_block);
-        swizzle_block_aux<codec, nibbles, BOTTOM_LEFT, subdivide, new_block_size>(morton_block,
-                                                                                  linear_block);
-        swizzle_block_aux<codec, nibbles, BOTTOM_RIGHT, subdivide, new_block_size>(morton_block,
-                                                                                   linear_block);
-    }
+static u32 DecodeMortonX(u32 code) {
+    return Compact1By1(code >> 0);
 }
 
-template <void codec(u8*, u8*, size_t), size_t nibbles, size_t lines_per_block>
-__forceinline static void swizzle_pass(u8* morton_block, u8* linear_block) {
-    const size_t block_size = (lines_per_block * lines_per_block * nibbles) / 2;
-    swizzle_block<codec, nibbles, lines_per_block, block_size>(morton_block, linear_block);
-}
-#pragma endregion Z_Order
-
-template <size_t nibbles, size_t lines_per_block>
-__hot inline static void encode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) {
-    const u32 tile_size = (lines_per_block * lines_per_block * nibbles) / 2;
-    alignas(64) u8 tmp[tile_size];
-    tiling_pass<&encode, nibbles, lines_per_block>(linear_buffer, tmp, x_blocks);
-    swizzle_pass<&encode, nibbles, lines_per_block>(morton_buffer, tmp);
+static u32 DecodeMortonY(u32 code) {
+    return Compact1By1(code >> 1);
 }
 
-template <size_t nibbles, size_t lines_per_block>
-__hot inline static void decode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) {
-    const u32 tile_size = (lines_per_block * lines_per_block * nibbles) / 2;
-    alignas(64) u8 tmp[tile_size];
-    swizzle_pass<&decode, nibbles, lines_per_block>(morton_buffer, tmp);
-    tiling_pass<&decode, nibbles, lines_per_block>(linear_buffer, tmp, x_blocks);
+u32 MortonOffset(u32 x, u32 y, u32 width, u32 height, u32 tiling, u32 bpp) {
+    u32 tile = (x + y * height) * width / (tiling * tiling); // NOTE(review): verify formula
+    tile = (tile * bpp) / 8; // presumably bits -> bytes; confirm bpp is bits per pixel
+    return tile + EncodeMorton(x % tiling, y % tiling); // NOTE(review): index not bpp-scaled
 }
 
-template <void codec(u8*, u8*, u32), size_t nibbles, size_t lines_per_block>
-__hot static void morton_pass(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height) {
-    const u32 x_blocks = (width / lines_per_block);
-    const u32 y_blocks = (height / lines_per_block);
-    const size_t line_size = (lines_per_block * nibbles) / 2;
-    const size_t tile_size = lines_per_block * line_size;
-    const size_t stride_size = width * line_size;
-    matrix_buffer = matrix_buffer + ((height * width * nibbles) / 2) - stride_size;
-    for (u32 y = 0; y < y_blocks; y++) {
-        u8* linear_buffer = matrix_buffer;
-        for (u32 x = 0; x != x_blocks; x++) {
-            codec(morton_buffer, linear_buffer, x_blocks);
-            linear_buffer += line_size;
-            morton_buffer += tile_size;
-        }
-        matrix_buffer -= stride_size;
-    }
-}
-
-// keep hot code together
-__no_inline __hot static void morton_8x8_32(u8* morton_buffer, u8* matrix_buffer, u32 width,
-                                            u32 height, bool decode) {
-    if (decode)
-        morton_pass<&decode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height);
-    else
-        morton_pass<&encode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height);
-}
-
-namespace Decoders {
-
-bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) {
-    if (bpp == 32) {
-        morton_8x8_32(morton_buffer, matrix_buffer, width, height, true);
-        return true;
-    }
-    switch (bpp) {
-    case 4: {
-        morton_pass<&decode_pass<1, 8>, 1, 8>(morton_buffer, matrix_buffer, width, height);
-        return true;
-        break;
-    }
-    case 8: {
-        morton_pass<&decode_pass<2, 8>, 2, 8>(morton_buffer, matrix_buffer, width, height);
-        return true;
-        break;
-    }
-    case 16: {
-        morton_pass<&decode_pass<4, 8>, 4, 8>(morton_buffer, matrix_buffer, width, height);
-        return true;
-        break;
-    }
-    case 24: {
-        morton_pass<&decode_pass<6, 8>, 6, 8>(morton_buffer, matrix_buffer, width, height);
-        return true;
-        break;
-    }
-    default: {
-        return false;
-        break;
-    }
-    }
-}
-
-bool Morton_32x32(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) {
-    switch (bpp) {
-    case 4: {
-        morton_pass<&decode_pass<1, 32>, 1, 32>(morton_buffer, matrix_buffer, width, height);
-        return true;
-        break;
-    }
-    case 8: {
-        morton_pass<&decode_pass<2, 32>, 2, 32>(morton_buffer, matrix_buffer, width, height);
-        return true;
-        break;
-    }
-    case 16: {
-        morton_pass<&decode_pass<4, 32>, 4, 32>(morton_buffer, matrix_buffer, width, height);
-        return true;
-        break;
-    }
-    case 24: {
-        morton_pass<&decode_pass<6, 32>, 6, 32>(morton_buffer, matrix_buffer, width, height);
-        return true;
-        break;
-    }
-    case 32: {
-        morton_pass<&decode_pass<8, 32>, 8, 32>(morton_buffer, matrix_buffer, width, height);
-        return true;
-        break;
-    }
-    default: {
-        return false;
-        break;
-    }
-    }
-}
-}
-
-namespace Encoders {
-
-bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) {
-    if (bpp == 32) {
-        morton_8x8_32(morton_buffer, matrix_buffer, width, height, false);
-        return true;
-    }
-    switch (bpp) {
-    case 4: {
-        morton_pass<&encode_pass<1, 8>, 1, 8>(morton_buffer, matrix_buffer, width, height);
-        return true;
-        break;
-    }
-    case 8: {
-        morton_pass<&encode_pass<2, 8>, 2, 8>(morton_buffer, matrix_buffer, width, height);
-        return true;
-        break;
-    }
-    case 16: {
-        morton_pass<&encode_pass<4, 8>, 4, 8>(morton_buffer, matrix_buffer, width, height);
-        return true;
-        break;
-    }
-    case 24: {
-        morton_pass<&encode_pass<6, 8>, 6, 8>(morton_buffer, matrix_buffer, width, height);
-        return true;
-        break;
-    }
-    default: {
-        return false;
-        break;
-    }
-    }
-}
-
-bool Morton_32x32(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) {
-    switch (bpp) {
-    case 4: {
-        morton_pass<&encode_pass<1, 32>, 1, 32>(morton_buffer, matrix_buffer, width, height);
-        return true;
-        break;
-    }
-    case 8: {
-        morton_pass<&encode_pass<2, 32>, 2, 32>(morton_buffer, matrix_buffer, width, height);
-        return true;
-        break;
-    }
-    case 16: {
-        morton_pass<&encode_pass<4, 32>, 4, 32>(morton_buffer, matrix_buffer, width, height);
-        return true;
-        break;
-    }
-    case 24: {
-        morton_pass<&encode_pass<6, 32>, 6, 32>(morton_buffer, matrix_buffer, width, height);
-        return true;
-        break;
-    }
-    case 32: {
-        morton_pass<&encode_pass<8, 32>, 8, 32>(morton_buffer, matrix_buffer, width, height);
-        return true;
-        break;
-    }
-    default: {
-        return false;
-        break;
-    }
-    }
-}
-}
+#include "morton8x8_optimized.cpp"
diff --git a/src/video_core/texture/internal/morton.h b/src/video_core/texture/internal/morton.h
index 95473744f..73fa22eb3 100644
--- a/src/video_core/texture/internal/morton.h
+++ b/src/video_core/texture/internal/morton.h
@@ -2,14 +2,12 @@
 
 #include "common/common_types.h"
 
-enum class MortonPass { Tile8x8, Tile32x32 };
+u32 MortonOffset(u32 x, u32 y, u32 width, u32 height, u32 tiling, u32 bpp);
 
 namespace Decoders {
 bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp);
-bool Morton_32x32(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp);
 }
 
 namespace Encoders {
 bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp);
-bool Morton_32x32(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp);
 }
diff --git a/src/video_core/texture/internal/morton8x8_optimized.cpp b/src/video_core/texture/internal/morton8x8_optimized.cpp
new file mode 100644
index 000000000..d8511be56
--- /dev/null
+++ b/src/video_core/texture/internal/morton8x8_optimized.cpp
@@ -0,0 +1,253 @@
+
+#include <cstring>
+#include <memory>
+#include <utility>
+#include "common/common_types.h"
+
+#if ((defined(__clang__) || defined(__GNUC__)) && !defined(__INTEL_COMPILER))
+#define CLANG_OR_GCC
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// Optimizations
+//////////////////////////////////////////////////////////////////////////////
+#ifdef _MSC_VER
+#pragma inline_recursion(on)
+#pragma intrinsic(memcpy)
+#define __hot
+#define __no_inline __declspec(noinline)
+#elif defined(CLANG_OR_GCC)
+#pragma GCC push_options
+// The next 3 will swizzle memory copying to help find the best sse/avx shuffling
+// in case it's possible. Compilation tests have proven effective use of these
+// flags on gcc and clang.
+#pragma GCC optimize("-fpredictive-commoning")
+#pragma GCC optimize("-ftree-loop-distribute-patterns")
+#pragma GCC optimize("-ftree-vectorize")
+// The beauty of these compiler options is that they generate better code than
+// hand written intrinsics, since inline expanding memory transfers can be pattern
+// matched with vector instructions available in the target.
+#define __no_inline __attribute__((noinline))
+#define __hot __attribute__((hot))
+#if !defined(__forceinline)
+#define __forceinline inline __attribute__((always_inline)) // emulates the MSVC keyword
+#endif
+#else
+#define __hot
+#define __no_inline
+#define __forceinline inline
+#endif
+
+#pragma region Z_Order
+/////////////////////////////////////////////////////////////////////////////
+//          Z-Order:
+//
+//                    0-->1
+//                      /
+//                    2-->3
+//
+// for more information look at: https://en.wikipedia.org/wiki/Z-order_curve
+/////////////////////////////////////////////////////////////////////////////
+#define TOP_LEFT 0
+#define TOP_RIGHT 1
+#define BOTTOM_LEFT 2
+#define BOTTOM_RIGHT 3
+
+// @param read_size is the number of bytes copied from morton_pointer to matrix_pointer
+__forceinline void decode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
+    std::memcpy(matrix_pointer, morton_pointer, read_size);
+}
+
+// @param read_size is the number of bytes copied from matrix_pointer to morton_pointer
+__forceinline void encode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
+    std::memcpy(morton_pointer, matrix_pointer, read_size);
+}
+
+constexpr u32 isRight(u32 block_index) {
+    return (block_index % 2); // 1 for TOP_RIGHT / BOTTOM_RIGHT, 0 for the left quadrants
+}
+
+constexpr u32 isBottom(u32 block_index) {
+    return (block_index / 2); // 1 for BOTTOM_LEFT / BOTTOM_RIGHT, 0 for the top quadrants
+}
+
+template <void codec(u8*, u8*, size_t), size_t nibbles, u32 blocks, size_t block_size>
+__forceinline static void swizzle_block(u8*& morton_block, u8* linear_block);
+
+template <void codec(u8*, u8*, size_t), size_t nibbles, u32 block_index, u32 blocks,
+          size_t block_size>
+__forceinline static void swizzle_block_aux(u8*& morton_block, u8* linear_block) {
+    // move the linear_block pointer to the appropriate block
+    const size_t right = isRight(block_index) * (blocks * nibbles) / 2; // `blocks` pixels wide
+    const size_t down = isBottom(block_index) * block_size; // byte offset to the lower quadrants
+    u8* new_linear = linear_block + right + down;
+    swizzle_block<codec, nibbles, blocks, block_size>(morton_block, new_linear);
+}
+
+template <void codec(u8*, u8*, size_t), size_t nibbles, u32 blocks, size_t block_size>
+__forceinline static void swizzle_block(u8*& morton_block, u8* linear_block) {
+    const size_t new_block_size = block_size / 2;
+    if (blocks <= 2) {
+        // Base case: a 2x2 block is emitted in z-order as two 2-pixel rows
+        const size_t read_size = nibbles; // bytes in a 2-pixel row (2 * nibbles / 2)
+        // TOP_LEFT & TOP_RIGHT
+        codec(morton_block, linear_block, read_size);
+        morton_block += read_size;
+        // BOTTOM_LEFT & BOTTOM_RIGHT
+        codec(morton_block, linear_block + new_block_size, read_size);
+        morton_block += read_size;
+    } else {
+        // Split the block into 4 quadrants, visited in z-order, recursing
+        // until we have 2x2 blocks.
+        const u32 subdivide = blocks / 2;
+        swizzle_block_aux<codec, nibbles, TOP_LEFT, subdivide, new_block_size>(morton_block,
+                                                                               linear_block);
+        swizzle_block_aux<codec, nibbles, TOP_RIGHT, subdivide, new_block_size>(morton_block,
+                                                                                linear_block);
+        swizzle_block_aux<codec, nibbles, BOTTOM_LEFT, subdivide, new_block_size>(morton_block,
+                                                                                  linear_block);
+        swizzle_block_aux<codec, nibbles, BOTTOM_RIGHT, subdivide, new_block_size>(morton_block,
+                                                                                   linear_block);
+    }
+}
+
+template <void codec(u8*, u8*, size_t), size_t nibbles, size_t lines_per_block>
+__forceinline static void swizzle_pass(u8* morton_block, u8* linear_block) {
+    const size_t block_size = (lines_per_block * lines_per_block * nibbles) / 2; // tile bytes
+    swizzle_block<codec, nibbles, lines_per_block, block_size>(morton_block, linear_block);
+}
+#pragma endregion Z_Order
+
+template <void codec(u8*, u8*, size_t), size_t nibbles, size_t lines_per_block>
+__forceinline void tiling_pass(u8* linear, u8* tiled, u32 x_blocks) {
+    const size_t tiled_line_size = (lines_per_block * nibbles) / 2; // bytes per tile row
+    const size_t row_length = x_blocks * tiled_line_size; // bytes per linear image row
+    for (u32 i = 0; i < lines_per_block; i++) {
+        const u32 k = (lines_per_block - 1 - i); // tile row i pairs with linear row k (flipped)
+        const size_t tiled_index = i * tiled_line_size;
+        const size_t linear_index = k * row_length;
+        codec(tiled + tiled_index, linear + linear_index, tiled_line_size);
+    }
+}
+
+template <size_t nibbles, size_t lines_per_block>
+__forceinline static void encode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) {
+    const u32 tile_size = (lines_per_block * lines_per_block * nibbles) / 2;
+    alignas(64) u8 tmp[tile_size]; // scratch copy of one tile, row by row
+    tiling_pass<&encode, nibbles, lines_per_block>(linear_buffer, tmp, x_blocks); // linear -> tmp
+    swizzle_pass<&encode, nibbles, lines_per_block>(morton_buffer, tmp); // tmp -> z-order
+}
+
+template <size_t nibbles, size_t lines_per_block>
+__forceinline static void decode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) {
+    const u32 tile_size = (lines_per_block * lines_per_block * nibbles) / 2;
+    alignas(64) u8 tmp[tile_size]; // scratch copy of one tile, row by row
+    swizzle_pass<&decode, nibbles, lines_per_block>(morton_buffer, tmp); // z-order -> tmp
+    tiling_pass<&decode, nibbles, lines_per_block>(linear_buffer, tmp, x_blocks); // tmp -> linear
+}
+
+template <void codec(u8*, u8*, u32), size_t nibbles, size_t lines_per_block>
+static void morton_pass(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height) {
+    const u32 x_blocks = (width / lines_per_block); // tiles per row (remainder is dropped)
+    const u32 y_blocks = (height / lines_per_block); // tile rows (remainder is dropped)
+    const size_t line_size = (lines_per_block * nibbles) / 2; // bytes in one row of a tile
+    const size_t tile_size = lines_per_block * line_size; // bytes in one whole tile
+    const size_t stride_size = width * line_size; // bytes in a band of lines_per_block rows
+    matrix_buffer = matrix_buffer + ((height * width * nibbles) / 2) - stride_size; // last band
+    for (u32 y = 0; y < y_blocks; y++) {
+        u8* linear_buffer = matrix_buffer;
+        for (u32 x = 0; x != x_blocks; x++) {
+            codec(morton_buffer, linear_buffer, x_blocks);
+            linear_buffer += line_size;
+            morton_buffer += tile_size;
+        }
+        matrix_buffer -= stride_size; // step back one band towards the buffer start
+    }
+}
+
+namespace Decoders {
+
+bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) {
+    switch (bpp) { // template args below are <nibbles per pixel, tile edge in pixels>
+    case 4: {
+        morton_pass<&decode_pass<1, 8>, 1, 8>(morton_buffer, matrix_buffer, width, height);
+        return true;
+        break;
+    }
+    case 8: {
+        morton_pass<&decode_pass<2, 8>, 2, 8>(morton_buffer, matrix_buffer, width, height);
+        return true;
+        break;
+    }
+    case 16: {
+        morton_pass<&decode_pass<4, 8>, 4, 8>(morton_buffer, matrix_buffer, width, height);
+        return true;
+        break;
+    }
+    case 24: {
+        morton_pass<&decode_pass<6, 8>, 6, 8>(morton_buffer, matrix_buffer, width, height);
+        return true;
+        break;
+    }
+    case 32: {
+        morton_pass<&decode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height);
+        return true;
+        break;
+    }
+    default: { // any other bpp is rejected without touching the buffers
+        return false;
+        break;
+    }
+    }
+}
+}
+
+namespace Encoders {
+
+bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) {
+    switch (bpp) { // template args below are <nibbles per pixel, tile edge in pixels>
+    case 4: {
+        morton_pass<&encode_pass<1, 8>, 1, 8>(morton_buffer, matrix_buffer, width, height);
+        return true;
+        break;
+    }
+    case 8: {
+        morton_pass<&encode_pass<2, 8>, 2, 8>(morton_buffer, matrix_buffer, width, height);
+        return true;
+        break;
+    }
+    case 16: {
+        morton_pass<&encode_pass<4, 8>, 4, 8>(morton_buffer, matrix_buffer, width, height);
+        return true;
+        break;
+    }
+    case 24: {
+        morton_pass<&encode_pass<6, 8>, 6, 8>(morton_buffer, matrix_buffer, width, height);
+        return true;
+        break;
+    }
+    case 32: {
+        morton_pass<&encode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height);
+        return true;
+        break;
+    }
+    default: { // any other bpp is rejected without touching the buffers
+        return false;
+        break;
+    }
+    }
+}
+}
+
+#ifdef _MSC_VER
+#undef __hot
+#undef __no_inline
+#elif defined(CLANG_OR_GCC)
+#pragma GCC pop_options
+#undef __no_inline
+#undef __hot
+#else
+#undef __hot
+#undef __no_inline
+#undef __forceinline
+#endif
diff --git a/src/video_core/texture/internal/texture_utils.h b/src/video_core/texture/internal/texture_utils.h
index 38d7f96f9..536a9873d 100644
--- a/src/video_core/texture/internal/texture_utils.h
+++ b/src/video_core/texture/internal/texture_utils.h
@@ -16,6 +16,7 @@
 #ifdef _MSC_VER
 #pragma inline_recursion(on)
 #elif defined(CLANG_OR_GCC)
+#pragma GCC push_options
 #pragma GCC optimize("-fpeel-loops")
 #pragma GCC optimize("-fpredictive-commoning")
 #pragma GCC optimize("-ftree-loop-distribute-patterns")
@@ -74,24 +75,6 @@ inline void image_pass(u8* target, u32 width, u32 height) {
         image_pass_aux_rev<pass, read_size, write_size, tuning>(target, width, height);
 }
 
-template <void codec(u8*, u8*, size_t), size_t nibbles, size_t lines_per_block>
-void tiling_pass(u8* linear, u8* tiled, u32 x_blocks) {
-    const size_t tiled_line_size = (lines_per_block * nibbles) / 2;
-    const size_t row_length = x_blocks * tiled_line_size;
-    for (u32 i = 0; i < lines_per_block; i++) {
-        const u32 k = (lines_per_block - 1 - i);
-        const size_t tiled_index = i * tiled_line_size;
-        const size_t linear_index = k * row_length;
-        codec(tiled + tiled_index, linear + linear_index, tiled_line_size);
-    }
-}
-
-// @param read_size is the amount of bytes each pixel takes
-inline void decode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
-    std::memcpy(matrix_pointer, morton_pointer, read_size);
-}
-
-// @param read_size is the amount of bytes each pixel takes
-inline void encode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
-    std::memcpy(morton_pointer, matrix_pointer, read_size);
-}
+#if defined(CLANG_OR_GCC)
+#pragma GCC pop_options
+#endif