Fixes, cleanup, and small refactors

Fernando Sahmkow 2017-01-16 19:18:38 -05:00
parent 1c6965f106
commit bed6207ac7
12 changed files with 484 additions and 399 deletions

View File

@ -1,3 +1,4 @@
set(SRCS
renderer_opengl/gl_rasterizer.cpp
renderer_opengl/gl_rasterizer_cache.cpp

View File

@ -26,6 +26,9 @@
#include "video_core/utils.h"
#include "video_core/video_core.h"
#define TEXTURE_CACHE_SIZE (1024 * 1024 * 8) // 8MB inner cache for decoding/encoding
alignas(64) static u8 TextureCache[TEXTURE_CACHE_SIZE];
struct FormatTuple {
GLint internal_format;
GLenum format;
@ -39,7 +42,7 @@ static const std::array<FormatTuple, 18> format_tuples = {{
{GL_RGB565, GL_RGB, GL_UNSIGNED_SHORT_5_6_5}, // RGB565
{GL_RGBA4, GL_RGBA, GL_UNSIGNED_SHORT_4_4_4_4}, // RGBA4
{GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // IA8
{GL_RG8, GL_RG8, GL_UNSIGNED_BYTE}, // RG8
{GL_RG8, GL_RG, GL_UNSIGNED_BYTE}, // RG8
{GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // I8
{GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // A8
{GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // IA4
@ -70,7 +73,7 @@ static const std::array<bool, 18> native_format = {
false, // ETC1A4
true, // D16
false,
false, // D24
true, // D24
false, // D24S8
};
@ -306,26 +309,29 @@ CachedSurface* RasterizerCacheOpenGL::GetSurface(const CachedSurface& params, bo
// clang-format on
);
Pica::Texture::Codec* codec = tmp.get();
codec->configTiling(true, 8); // change 8 for 32 in case the mage is tiled
codec->configTiling(true, 8); // change 8 for 32 in case the image is tiled
// on blocks of 32x32
codec->configRGBATransform(!native_format[(unsigned int)params.pixel_format]);
codec->validate();
if (!codec->invalid()) {
codec->decode();
std::unique_ptr<u8[]> decoded_texture = codec->transferInternalBuffer();
u32 bytes = codec->getInternalBytesPerPixel();
if (bytes == 3)
bytes = 1;
else if (bytes != 2)
bytes = 4;
glPixelStorei(GL_UNPACK_ALIGNMENT, bytes);
glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height,
0, tuple.format, tuple.type, decoded_texture.get());
glPixelStorei(GL_UNPACK_ALIGNMENT, 4);
u32 estimated_size =
params.width * params.height * codec->getInternalBytesPerPixel();
if (estimated_size <= TEXTURE_CACHE_SIZE) {
codec->setExternalBuffer(TextureCache);
codec->decode();
glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width,
params.height, 0, tuple.format, tuple.type, TextureCache);
} else {
codec->decode();
std::unique_ptr<u8[]> decoded_texture = codec->transferInternalBuffer();
glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width,
params.height, 0, tuple.format, tuple.type, decoded_texture.get());
}
} else {
LOG_WARNING(Render_OpenGL,
"Invalid texture sent to renderer; width: %d height %d type: %d",
params.width, params.height, (unsigned int)params.pixel_format);
return nullptr;
}
}
// If not 1x scale, blit 1x texture to a new scaled texture and replace texture in surface
@ -652,15 +658,22 @@ void RasterizerCacheOpenGL::FlushSurface(CachedSurface* surface) {
glPixelStorei(GL_PACK_ROW_LENGTH, 0);
} else {
const FormatTuple& tuple = format_tuples[(u32)surface->pixel_format];
u32 bytes_per_pixel = Pica::Texture::Format::GetBpp(surface->pixel_format) / 8;
u32 bits_per_pixel = Pica::Texture::Format::GetBpp(surface->pixel_format);
if (!native_format[(u32)surface->pixel_format])
bytes_per_pixel = 4;
std::vector<u8> temp_gl_buffer(surface->width * surface->height * bytes_per_pixel);
glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, temp_gl_buffer.data());
bits_per_pixel = 32;
u32 size = surface->width * surface->height * bits_per_pixel / 8;
std::vector<u8> temp_gl_buffer;
u8* temporal_buffer;
if (size <= TEXTURE_CACHE_SIZE)
temporal_buffer = TextureCache;
else {
temp_gl_buffer.resize(size);
temporal_buffer = temp_gl_buffer.data();
}
glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, temporal_buffer);
std::unique_ptr<Pica::Texture::Codec> tmp = Pica::Texture::CodecFactory::build(
// clang-format off
surface->pixel_format, temp_gl_buffer.data(), surface->width, surface->height
surface->pixel_format, temporal_buffer, surface->width, surface->height
// clang-format on
);
Pica::Texture::Codec* codec = tmp.get();
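
For reference, the buffer-selection pattern used by both GetSurface and FlushSurface above boils down to the following standalone sketch. TEXTURE_CACHE_SIZE and TextureCache mirror the definitions in this file; pick_buffer is a hypothetical helper, not part of the commit.

#include <cstddef>
#include <cstdint>
#include <memory>

#define TEXTURE_CACHE_SIZE (1024 * 1024 * 8) // 8 MB scratch buffer, as above
alignas(64) static uint8_t TextureCache[TEXTURE_CACHE_SIZE];

// Pick the buffer the codec should decode into (or glGetTexImage read into):
// the static cache when the surface fits, a one-off heap allocation otherwise.
static uint8_t* pick_buffer(size_t estimated_size, std::unique_ptr<uint8_t[]>& fallback) {
    if (estimated_size <= TEXTURE_CACHE_SIZE)
        return TextureCache; // no allocation on the common path
    fallback = std::make_unique<uint8_t[]>(estimated_size); // rare oversized surface
    return fallback.get();
}

The heap fallback only triggers for surfaces whose decoded size exceeds 8 MB, matching the estimated_size check in GetSurface and the size check in FlushSurface.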

View File

@ -67,7 +67,7 @@ void Codec::init(bool decode) {
this->expected_nibbles_size = this->start_nibbles_size;
}
this->validate();
if (!this->external_result_buffer || !this->invalid()) {
if (!this->external_result_buffer) {
size_t buff_size = this->width * this->height * this->expected_nibbles_size / 2;
this->internal_buffer = std::make_unique<u8[]>(buff_size);
this->passing_buffer = this->internal_buffer.get();
@ -91,7 +91,7 @@ void Codec::validate() {
this->invalid_state = true;
return;
}
if (this->morton && this->morton_pass_tiling != 8 && this->morton_pass_tiling != 32) {
if (this->morton && this->morton_pass_tiling != 8) {
this->invalid_state = true;
return;
}
@ -102,18 +102,12 @@ inline void Codec::decode_morton_pass() {
if (this->morton_pass_tiling == 8)
Decoders::Morton_8x8(this->target_buffer, this->passing_buffer, this->width, this->height,
this->start_nibbles_size * 4);
else if (this->morton_pass_tiling == 32)
Decoders::Morton_32x32(this->target_buffer, this->passing_buffer, this->width, this->height,
this->start_nibbles_size * 4);
}
inline void Codec::encode_morton_pass() {
if (this->morton_pass_tiling == 8)
Encoders::Morton_8x8(this->target_buffer, this->passing_buffer, this->width, this->height,
Encoders::Morton_8x8(this->passing_buffer, this->target_buffer, this->width, this->height,
this->start_nibbles_size * 4);
else if (this->morton_pass_tiling == 32)
Encoders::Morton_32x32(this->target_buffer, this->passing_buffer, this->width, this->height,
this->start_nibbles_size * 4);
}
std::unique_ptr<Codec> CodecFactory::build(Format::Type format, u8* target, u32 width, u32 height) {
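
The internal buffer that Codec::init allocates above is sized in nibbles (4-bit units), so dividing by two yields bytes. A worked check of that arithmetic; codec_buffer_bytes is illustrative only.

#include <cassert>
#include <cstddef>

// buff_size = width * height * expected_nibbles_size / 2, as in Codec::init above.
static size_t codec_buffer_bytes(size_t width, size_t height, size_t nibbles_per_pixel) {
    return width * height * nibbles_per_pixel / 2;
}

int main() {
    assert(codec_buffer_bytes(1024, 1024, 8) == 4u * 1024 * 1024); // RGBA8: 8 nibbles = 4 bytes/pixel
    assert(codec_buffer_bytes(256, 256, 4) == 2u * 256 * 256);     // RGB565: 4 nibbles = 2 bytes/pixel
    return 0;
}

The same unit shows up in decode_morton_pass: start_nibbles_size * 4 is simply the bits-per-pixel value handed to Decoders::Morton_8x8 (32 for RGBA8).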

View File

@ -40,7 +40,7 @@ struct Format {
Invalid = 255,
};
static u32 GetBpp(Type format) {
static const u32 GetBpp(Type format) {
static const std::array<unsigned int, 18> bpp_table = {
32, // RGBA8
24, // RGB8
@ -66,19 +66,19 @@ struct Format {
return bpp_table[(u32)format];
}
static Type FromTextureFormat(Regs::TextureFormat format) {
static constexpr Type FromTextureFormat(Regs::TextureFormat format) {
return ((unsigned int)format < 14) ? (Type)format : Type::Invalid;
}
static Type FromColorFormat(Regs::ColorFormat format) {
static constexpr Type FromColorFormat(Regs::ColorFormat format) {
return ((unsigned int)format < 5) ? (Type)format : Type::Invalid;
}
static Type FromDepthFormat(Regs::DepthFormat format) {
static constexpr Type FromDepthFormat(Regs::DepthFormat format) {
return ((unsigned int)format < 4) ? (Type)((unsigned int)format + 14) : Type::Invalid;
}
static Type FromGPUPixelFormat(GPU::Regs::PixelFormat format) {
static const Type FromGPUPixelFormat(GPU::Regs::PixelFormat format) {
switch (format) {
// RGB565 and RGB5A1 are switched in PixelFormat compared to ColorFormat
case GPU::Regs::PixelFormat::RGB565:
@ -92,6 +92,17 @@ struct Format {
}; // Format
struct Info {
PAddr physical_address;
int width;
int height;
int stride;
Pica::Regs::TextureFormat format;
static Info FromPicaRegister(const Pica::Regs::TextureConfig& config,
const Pica::Regs::TextureFormat& format);
};
} // Texture
} // Pica
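
FromTextureFormat and FromColorFormat above pass the register value straight through, while FromDepthFormat shifts it past the 14 texture formats so the depth entries land at the tail of the 18-entry tables in this file. A sketch of that index mapping with plain integers; the exact enum numbering is inferred from the +14 offset and the table layout, not spelled out elsewhere in the diff.

#include <cassert>

// Flat Type layout implied by the helpers above:
//   0..13  -> the 14 Pica texture formats (RGBA8 ... ETC1A4)
//   14..17 -> depth formats (D16, an unused slot, D24, D24S8)
static unsigned from_depth_format(unsigned depth_reg) {
    return depth_reg < 4 ? depth_reg + 14 : 255; // 255 == Type::Invalid
}

int main() {
    assert(from_depth_format(0) == 14);  // D16 lands right after the texture formats
    assert(from_depth_format(3) == 17);  // D24S8 is the last table entry
    assert(from_depth_format(9) == 255); // out-of-range register values map to Invalid
    return 0;
}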

View File

@ -5,6 +5,7 @@
#include <memory>
#include "common/common_types.h"
#include "video_core/texture/codec.h"
#include "video_core/texture/formats.h"
// each texture format codec
class RGBACodec : public Pica::Texture::Codec {
@ -15,7 +16,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 8;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::RGBA8
) / 4;
// clang-format on
};
};
@ -27,7 +32,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 6;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::RGB8
) / 4;
// clang-format on
};
};
@ -39,7 +48,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 4;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::RGB5A1
) / 4;
// clang-format on
};
};
@ -51,7 +64,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 4;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::RGBA4
) / 4;
// clang-format on
};
};
@ -63,7 +80,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 4;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::RGB565
) / 4;
// clang-format on
};
};
@ -75,7 +96,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 4;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::RG8
) / 4;
// clang-format on
};
};
@ -87,7 +112,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 4;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::IA8
) / 4;
// clang-format on
};
};
@ -99,7 +128,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 2;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::I8
) / 4;
// clang-format on
};
};
@ -111,7 +144,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 2;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::A8
) / 4;
// clang-format on
};
};
@ -123,7 +160,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 2;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::IA4
) / 4;
// clang-format on
};
};
@ -135,7 +176,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 1;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::I4
) / 4;
// clang-format on
};
};
@ -147,7 +192,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 1;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::A4
) / 4;
// clang-format on
};
};
@ -159,7 +208,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 1;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::ETC1
) / 4;
// clang-format on
};
};
@ -171,7 +224,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 2;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::ETC1A4
) / 4;
// clang-format on
};
};
@ -183,7 +240,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 4;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::D16
) / 4;
// clang-format on
};
};
@ -195,7 +256,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 6;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::D24
) / 4;
// clang-format on
};
};
@ -207,6 +272,10 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 8;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::D24S8
) / 4;
// clang-format on
};
};
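
The setSize refactor above swaps the old magic nibble counts for GetBpp(format) / 4 (a nibble is four bits). A quick cross-check that the two agree, using the bpp values from the table in formats.h; nibbles_from_bpp is only for illustration.

#include <cassert>

static unsigned nibbles_from_bpp(unsigned bpp) {
    return bpp / 4; // start_nibbles_size = GetBpp(format) / 4
}

int main() {
    assert(nibbles_from_bpp(32) == 8); // RGBA8, D24S8  -> old literal 8
    assert(nibbles_from_bpp(24) == 6); // RGB8, D24     -> old literal 6
    assert(nibbles_from_bpp(16) == 4); // RGB5A1, RGB565, RGBA4, RG8, IA8, D16 -> 4
    assert(nibbles_from_bpp(8) == 2);  // I8, A8, IA4, ETC1A4 -> 2
    assert(nibbles_from_bpp(4) == 1);  // I4, A4, ETC1 -> 1
    return 0;
}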

View File

@ -83,9 +83,9 @@ inline u32 build_luminance(u32 intensity, u32 alpha) {
}
inline void intensity_alpha_pass(u8* read, u8* write) {
alignas(4) u8 pixel[2];
std::memcpy(pixel, read, 2);
u32 result = build_luminance(pixel[1], pixel[0]);
u16 pixel;
std::memcpy(&pixel, read, 2);
u32 result = build_luminance(pixel >> 8, pixel & 0x00FF);
std::memcpy(write, &result, 4);
}
@ -93,9 +93,7 @@ inline void intensity_alpha_nibbles_pass(u8* read, u8* write) {
alignas(4) u8 pixel;
std::memcpy(&pixel, read, 1);
u16 tmp = convert_nibbles(pixel);
u8 tmp2[2];
std::memcpy(tmp2, &tmp, 2);
u32 result = build_luminance(tmp2[1], tmp2[0]);
u32 result = build_luminance(tmp >> 8, tmp & 0x00FF);
std::memcpy(write, &result, 4);
}
@ -107,31 +105,29 @@ inline void intensity_pass(u8* read, u8* write) {
}
inline void intensity_nibbles_pass(u8* read, u8* write) {
alignas(4) u8 pixel[2];
std::memcpy(pixel, read, 1);
u16 tmp = convert_nibbles(pixel[0]);
std::memcpy(pixel, &tmp, 2);
u32 result = build_luminance(pixel[1], 255);
u8 pixel;
std::memcpy(&pixel, read, 1);
u16 tmp = convert_nibbles(pixel);
u32 result = build_luminance(tmp & 0x00FF, 255);
std::memcpy(write, &result, 4);
result = build_luminance(pixel[0], 255);
result = build_luminance(tmp >> 8, 255);
std::memcpy(write + 4, &result, 4);
}
inline void alpha_pass(u8* read, u8* write) {
alignas(4) u8 pixel[1];
std::memcpy(pixel, read, 1);
u32 result = build_luminance(0, pixel[0]);
u8 pixel;
std::memcpy(&pixel, read, 1);
u32 result = build_luminance(0, pixel);
std::memcpy(write, &result, 4);
}
inline void alpha_nibbles_pass(u8* read, u8* write) {
alignas(4) u8 pixel[2];
std::memcpy(pixel, read, 1);
u16 tmp = convert_nibbles(pixel[0]);
std::memcpy(pixel, &tmp, 2);
u32 result = build_luminance(0, pixel[0]);
u8 pixel;
std::memcpy(&pixel, read, 1);
u16 tmp = convert_nibbles(pixel);
u32 result = build_luminance(0, tmp & 0x00FF);
std::memcpy(write, &result, 4);
result = build_luminance(0, pixel[1]);
result = build_luminance(0, tmp >> 8);
std::memcpy(write + 4, &result, 4);
}
@ -207,7 +203,7 @@ void ETC1A4Codec::decode() {
ETC1A4(this->target_buffer, this->passing_buffer, this->width, this->height);
}
namespace {
namespace Decode {
inline void expand_depth16_pass(u8* read, u8* write) {
alignas(4) u8 pixel[4];
@ -224,11 +220,18 @@ inline void expand_depth24_pass(u8* read, u8* write) {
std::memcpy(write, pixel, 4);
}
inline void fix_stencil_pass(u8* read, u8* write) {
u32 pixel;
std::memcpy(&pixel, read, 4);
pixel = (pixel << 8) | (pixel >> 24);
std::memcpy(write, &pixel, 4);
inline void d24s8_pass(u8* target, u32 width, u32 height) {
const size_t sub_iters = 8;
const size_t iters = width * height / sub_iters;
for (u32 i = 0; i < iters; i++) {
for (u32 j = 0; j < sub_iters; j++) {
u32 pixel;
std::memcpy(&pixel, target, 4);
pixel = (pixel >> 24) | (pixel << 8);
std::memcpy(target, &pixel, 4);
target += 4;
}
}
}
} // Anonymous
@ -236,7 +239,7 @@ inline void fix_stencil_pass(u8* read, u8* write) {
void D16Codec::decode() {
super::decode();
if (this->raw_RGBA)
image_pass<&expand_depth16_pass, 4, 8>(
image_pass<&Decode::expand_depth16_pass, 4, 8>(
// clang-format off
this->passing_buffer, this->width, this->height
// clang-format on
@ -246,7 +249,7 @@ void D16Codec::decode() {
void D24Codec::decode() {
super::decode();
if (this->raw_RGBA)
image_pass<&expand_depth24_pass, 6, 8>(
image_pass<&Decode::expand_depth24_pass, 6, 8>(
// clang-format off
this->passing_buffer, this->width, this->height
// clang-format on
@ -256,9 +259,5 @@ void D24Codec::decode() {
void D24S8Codec::decode() {
super::decode();
if (this->raw_RGBA)
image_pass<&fix_stencil_pass, 8, 8, 8>(
// clang-format off
this->passing_buffer, this->width, this->height
// clang-format on
);
Decode::d24s8_pass(this->passing_buffer, this->width, this->height);
}
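
The new Decode::d24s8_pass above replaces the per-pixel fix_stencil_pass with an in-place loop over the whole image; the core of it is an 8-bit left rotate of each 32-bit pixel, moving the top byte down to the bottom (the diff does not say which byte holds stencil and which hold depth, only how they are rearranged). A worked check of the rotation:

#include <cassert>
#include <cstdint>

// Same expression as the loop body of Decode::d24s8_pass above.
static uint32_t rotl8(uint32_t pixel) {
    return (pixel >> 24) | (pixel << 8);
}

int main() {
    assert(rotl8(0xAABBCCDDu) == 0xBBCCDDAAu); // top byte wraps around to the bottom
    assert(rotl8(0x11000000u) == 0x00000011u);
    return 0;
}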

View File

@ -69,11 +69,18 @@ inline void contract_depth24_pass(u8* read, u8* write) {
std::memcpy(write, pixel, 3);
}
inline void fix_stencil_pass(u8* read, u8* write) {
u32 pixel;
std::memcpy(&pixel, read, 4);
pixel = (pixel >> 24) | (pixel << 8);
std::memcpy(write, &pixel, 4);
inline void d24s8_pass(u8* target, u32 width, u32 height) {
const size_t sub_iters = 8;
const size_t iters = width * height / sub_iters;
for (u32 i = 0; i < iters; i++) {
for (u32 j = 0; j < sub_iters; j++) {
u32 pixel;
std::memcpy(&pixel, target, 4);
pixel = (pixel >> 8) | (pixel << 24);
std::memcpy(target, &pixel, 4);
target += 4;
}
}
}
} // Anonymous
@ -101,9 +108,5 @@ void D24Codec::encode() {
void D24S8Codec::encode() {
super::encode();
if (this->raw_RGBA)
image_pass<&Encode::fix_stencil_pass, 8, 8, 8>(
// clang-format off
this->passing_buffer, this->width, this->height
// clang-format on
);
Encode::d24s8_pass(this->passing_buffer, this->width, this->height);
}
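
Encode::d24s8_pass uses the opposite rotation, so decoding and then encoding a D24S8 pixel is a no-op. A minimal round-trip check:

#include <cassert>
#include <cstdint>
#include <initializer_list>

static uint32_t decode_rot(uint32_t p) { return (p >> 24) | (p << 8); } // Decode::d24s8_pass
static uint32_t encode_rot(uint32_t p) { return (p >> 8) | (p << 24); } // Encode::d24s8_pass

int main() {
    for (uint32_t p : {0x00000000u, 0xAABBCCDDu, 0xFFFFFFFFu, 0x01000000u})
        assert(encode_rot(decode_rot(p)) == p); // the two passes are exact inverses
    return 0;
}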

View File

@ -122,6 +122,22 @@ union ETC1Tile {
} // anonymous namespace
inline void decode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
std::memcpy(matrix_pointer, morton_pointer, read_size);
}
template <void codec(u8*, u8*, size_t), size_t nibbles, size_t lines_per_block>
void tiling_pass(u8* linear, u8* tiled, u32 x_blocks) {
const size_t tiled_line_size = (lines_per_block * nibbles) / 2;
const size_t row_length = x_blocks * tiled_line_size;
for (u32 i = 0; i < lines_per_block; i++) {
const u32 k = (lines_per_block - 1 - i);
const size_t tiled_index = i * tiled_line_size;
const size_t linear_index = k * row_length;
codec(tiled + tiled_index, linear + linear_index, tiled_line_size);
}
}
inline void etc1_pass(u8* etc1_buffer, u8* linear_buffer, u32 x_blocks) {
const size_t line = 8 * 4;
alignas(64) u8 tmp[line * 8];
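
tiling_pass above copies the rows of one 8-line tile between the tiled scratch buffer and the linear image, flipping vertically as it goes: tile row i maps to image row lines_per_block - 1 - i of the current block. A standalone run of that loop, specialised to the decode direction (tile to linear) for a 32 bpp tile in a 16-pixel-wide image; the buffers and tag values are made up for the check.

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
    const size_t nibbles = 8, lines_per_block = 8;                  // 32 bpp, 8x8 tile
    const size_t tiled_line_size = (lines_per_block * nibbles) / 2; // 32 bytes per tile row
    const size_t x_blocks = 2;                                      // image is 2 tiles wide
    const size_t row_length = x_blocks * tiled_line_size;           // 64 bytes per image row

    uint8_t tiled[8 * 32];
    uint8_t linear[8 * 64] = {};
    for (size_t i = 0; i < lines_per_block; i++)
        std::memset(tiled + i * tiled_line_size, int(i), tiled_line_size); // tag row i with i

    for (size_t i = 0; i < lines_per_block; i++) {
        const size_t k = lines_per_block - 1 - i; // vertical flip, as in tiling_pass
        std::memcpy(linear + k * row_length, tiled + i * tiled_line_size, tiled_line_size);
    }
    assert(linear[0] == 7);          // tile row 7 became the top row of this block
    assert(linear[row_length] == 6); // tile row 6 became the row below it
    return 0;
}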

View File

@ -1,295 +1,40 @@
#include <cstring>
#include <memory>
#include <utility>
#include "common/common_types.h"
#include "video_core/texture/internal/morton.h"
#include "video_core/texture/internal/texture_utils.h"
///////////////////////////////////////////////////////////////////////////////
// Optimizations
//////////////////////////////////////////////////////////////////////////////
#ifdef _MSC_VER
#pragma inline_recursion(on)
// Normally set to 16 by default; the best balance for this module seems to be 8
#pragma inline_depth(8)
// favor fast code over small code.
#pragma optimize("t", on)
#pragma intrinsic(memcpy)
#define __hot
#define __no_inline __declspec(noinline)
#elif defined(CLANG_OR_GCC)
// The next 3 flags let the compiler turn memory copies into the best sse/avx
// shuffles where possible. Compile tests on gcc and clang have shown these
// flags to be effective.
#pragma GCC optimize("-fpredictive-commoning")
#pragma GCC optimize("-ftree-loop-distribute-patterns")
#pragma GCC optimize("-ftree-vectorize")
#pragma GCC option("--param inline-unit-growth=400")
#pragma GCC option("--param large-function-growth=800")
// The beauty of these compiler options is that they generate better code than
// hand-written intrinsics, since inline-expanded memory transfers can be pattern
// matched with the vector instructions available on the target.
#define __no_inline __attribute__((noinline))
#define __hot __attribute__((hot))
#if !defined(__forceinline)
#define __forceinline inline __attribute__((always_inline))
#endif
#else
#define __hot
#define __no_inline
#define __forceinline
#endif
#pragma region Z_Order
/////////////////////////////////////////////////////////////////////////////
// Z-Order:
//
// 0-->1
// /
// 2-->3
//
// for more information look at: https://en.wikipedia.org/wiki/Z-order_curve
/////////////////////////////////////////////////////////////////////////////
#define TOP_LEFT 0
#define TOP_RIGHT 1
#define BOTTOM_LEFT 2
#define BOTTOM_RIGHT 3
constexpr u32 isRight(u32 block_index) {
return (block_index % 2);
static u32 Part1By1(u32 x) {
x &= 0x0000ffff; // x = ---- ---- ---- ---- fedc ba98 7654 3210
x = (x ^ (x << 8)) & 0x00ff00ff; // x = ---- ---- fedc ba98 ---- ---- 7654 3210
x = (x ^ (x << 4)) & 0x0f0f0f0f; // x = ---- fedc ---- ba98 ---- 7654 ---- 3210
x = (x ^ (x << 2)) & 0x33333333; // x = --fe --dc --ba --98 --76 --54 --32 --10
x = (x ^ (x << 1)) & 0x55555555; // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
return x;
}
constexpr u32 isBottom(u32 block_index) {
return (block_index / 2);
static u32 Compact1By1(u32 x) {
x &= 0x55555555; // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
x = (x ^ (x >> 1)) & 0x33333333; // x = --fe --dc --ba --98 --76 --54 --32 --10
x = (x ^ (x >> 2)) & 0x0f0f0f0f; // x = ---- fedc ---- ba98 ---- 7654 ---- 3210
x = (x ^ (x >> 4)) & 0x00ff00ff; // x = ---- ---- fedc ba98 ---- ---- 7654 3210
x = (x ^ (x >> 8)) & 0x0000ffff; // x = ---- ---- ---- ---- fedc ba98 7654 3210
return x;
}
template <void codec(u8*, u8*, size_t), size_t nibbles, u32 blocks, size_t block_size>
__forceinline static void swizzle_block(u8*& morton_block, u8* linear_block);
template <void codec(u8*, u8*, size_t), size_t nibbles, u32 block_index, u32 blocks,
size_t block_size>
__forceinline static void swizzle_block_aux(u8*& morton_block, u8* linear_block) {
// move the linear_block pointer to the appropriate block
const size_t right = isRight(block_index) * (blocks * nibbles) / 2;
const size_t down = isBottom(block_index) * block_size;
u8* new_linear = linear_block + right + down;
swizzle_block<codec, nibbles, blocks, block_size>(morton_block, new_linear);
static u32 EncodeMorton(u32 x, u32 y) {
return (Part1By1(y) << 1) | Part1By1(x);
}
template <void codec(u8*, u8*, size_t), size_t nibbles, u32 blocks, size_t block_size>
__forceinline static void swizzle_block(u8*& morton_block, u8* linear_block) {
const size_t new_block_size = block_size / 2;
if (blocks <= 2) {
// We handle 2*2 blocks on z-order
const size_t read_size = nibbles; // just for clarity; it's the same amount
// TOP_LEFT & TOP_RIGHT
codec(morton_block, linear_block, read_size);
morton_block += read_size;
// BOTTOM_LEFT & BOTTOM_RIGHT
codec(morton_block, linear_block + new_block_size, read_size);
morton_block += read_size;
} else {
// we divide the block into 4 sub-blocks in z-order, recursing
// until we reach 2x2 blocks.
const u32 subdivide = blocks / 2;
swizzle_block_aux<codec, nibbles, TOP_LEFT, subdivide, new_block_size>(morton_block,
linear_block);
swizzle_block_aux<codec, nibbles, TOP_RIGHT, subdivide, new_block_size>(morton_block,
linear_block);
swizzle_block_aux<codec, nibbles, BOTTOM_LEFT, subdivide, new_block_size>(morton_block,
linear_block);
swizzle_block_aux<codec, nibbles, BOTTOM_RIGHT, subdivide, new_block_size>(morton_block,
linear_block);
}
static u32 DecodeMortonX(u32 code) {
return Compact1By1(code >> 0);
}
template <void codec(u8*, u8*, size_t), size_t nibbles, size_t lines_per_block>
__forceinline static void swizzle_pass(u8* morton_block, u8* linear_block) {
const size_t block_size = (lines_per_block * lines_per_block * nibbles) / 2;
swizzle_block<codec, nibbles, lines_per_block, block_size>(morton_block, linear_block);
}
#pragma endregion Z_Order
template <size_t nibbles, size_t lines_per_block>
__hot inline static void encode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) {
const u32 tile_size = (lines_per_block * lines_per_block * nibbles) / 2;
alignas(64) u8 tmp[tile_size];
tiling_pass<&encode, nibbles, lines_per_block>(linear_buffer, tmp, x_blocks);
swizzle_pass<&encode, nibbles, lines_per_block>(morton_buffer, tmp);
static u32 DecodeMortonY(u32 code) {
return Compact1By1(code >> 1);
}
template <size_t nibbles, size_t lines_per_block>
__hot inline static void decode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) {
const u32 tile_size = (lines_per_block * lines_per_block * nibbles) / 2;
alignas(64) u8 tmp[tile_size];
swizzle_pass<&decode, nibbles, lines_per_block>(morton_buffer, tmp);
tiling_pass<&decode, nibbles, lines_per_block>(linear_buffer, tmp, x_blocks);
u32 MortonOffset(u32 x, u32 y, u32 width, u32 height, u32 tiling, u32 bpp) {
u32 tile = (x + y * height) * width / (tiling * tiling);
tile = (tile * bpp) / 8;
return tile + EncodeMorton(x % tiling, y % tiling);
}
template <void codec(u8*, u8*, u32), size_t nibbles, size_t lines_per_block>
__hot static void morton_pass(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height) {
const u32 x_blocks = (width / lines_per_block);
const u32 y_blocks = (height / lines_per_block);
const size_t line_size = (lines_per_block * nibbles) / 2;
const size_t tile_size = lines_per_block * line_size;
const size_t stride_size = width * line_size;
matrix_buffer = matrix_buffer + ((height * width * nibbles) / 2) - stride_size;
for (u32 y = 0; y < y_blocks; y++) {
u8* linear_buffer = matrix_buffer;
for (u32 x = 0; x != x_blocks; x++) {
codec(morton_buffer, linear_buffer, x_blocks);
linear_buffer += line_size;
morton_buffer += tile_size;
}
matrix_buffer -= stride_size;
}
}
// keep hot code together
__no_inline __hot static void morton_8x8_32(u8* morton_buffer, u8* matrix_buffer, u32 width,
u32 height, bool decode) {
if (decode)
morton_pass<&decode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height);
else
morton_pass<&encode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height);
}
namespace Decoders {
bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) {
if (bpp == 32) {
morton_8x8_32(morton_buffer, matrix_buffer, width, height, true);
return true;
}
switch (bpp) {
case 4: {
morton_pass<&decode_pass<1, 8>, 1, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 8: {
morton_pass<&decode_pass<2, 8>, 2, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 16: {
morton_pass<&decode_pass<4, 8>, 4, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 24: {
morton_pass<&decode_pass<6, 8>, 6, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
default: {
return false;
break;
}
}
}
bool Morton_32x32(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) {
switch (bpp) {
case 4: {
morton_pass<&decode_pass<1, 32>, 1, 32>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 8: {
morton_pass<&decode_pass<2, 32>, 2, 32>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 16: {
morton_pass<&decode_pass<4, 32>, 4, 32>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 24: {
morton_pass<&decode_pass<6, 32>, 6, 32>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 32: {
morton_pass<&decode_pass<8, 32>, 8, 32>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
default: {
return false;
break;
}
}
}
}
namespace Encoders {
bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) {
if (bpp == 32) {
morton_8x8_32(morton_buffer, matrix_buffer, width, height, false);
return true;
}
switch (bpp) {
case 4: {
morton_pass<&encode_pass<1, 8>, 1, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 8: {
morton_pass<&encode_pass<2, 8>, 2, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 16: {
morton_pass<&encode_pass<4, 8>, 4, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 24: {
morton_pass<&encode_pass<6, 8>, 6, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
default: {
return false;
break;
}
}
}
bool Morton_32x32(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) {
switch (bpp) {
case 4: {
morton_pass<&encode_pass<1, 32>, 1, 32>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 8: {
morton_pass<&encode_pass<2, 32>, 2, 32>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 16: {
morton_pass<&encode_pass<4, 32>, 4, 32>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 24: {
morton_pass<&encode_pass<6, 32>, 6, 32>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 32: {
morton_pass<&encode_pass<8, 32>, 8, 32>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
default: {
return false;
break;
}
}
}
}
#include "morton8x8_optimized.cpp"

View File

@ -2,14 +2,12 @@
#include "common/common_types.h"
enum class MortonPass { Tile8x8, Tile32x32 };
u32 MortonOffset(u32 x, u32 y, u32 width, u32 height, u32 tiling, u32 bpp);
namespace Decoders {
bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp);
bool Morton_32x32(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp);
}
namespace Encoders {
bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp);
bool Morton_32x32(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp);
}
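
A minimal caller of the tiled/linear conversion entry points declared above; this assumes it is built inside the tree so the includes resolve, and that it links against morton.cpp from this commit.

#include <cstdio>
#include "common/common_types.h"
#include "video_core/texture/internal/morton.h"

int main() {
    alignas(64) u8 tiled[8 * 8 * 4] = {};  // one 8x8 tile at 32 bpp
    alignas(64) u8 linear[8 * 8 * 4] = {};
    // Only 4, 8, 16, 24 and 32 bpp are handled; anything else returns false.
    if (!Decoders::Morton_8x8(tiled, linear, 8, 8, 32))
        std::printf("unsupported bits-per-pixel\n");
    return 0;
}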

View File

@ -0,0 +1,253 @@
#include <cstring>
#include <memory>
#include <utility>
#include "common/common_types.h"
#if ((defined(__clang__) || defined(__GNUC__)) && !defined(__INTEL_COMPILER))
#define CLANG_OR_GCC
#endif
///////////////////////////////////////////////////////////////////////////////
// Optimizations
//////////////////////////////////////////////////////////////////////////////
#ifdef _MSC_VER
#pragma inline_recursion(on)
#pragma intrinsic(memcpy)
#define __hot
#define __no_inline __declspec(noinline)
#elif defined(CLANG_OR_GCC)
#pragma GCC push_options
// The next 3 flags let the compiler turn memory copies into the best sse/avx
// shuffles where possible. Compile tests on gcc and clang have shown these
// flags to be effective.
#pragma GCC optimize("-fpredictive-commoning")
#pragma GCC optimize("-ftree-loop-distribute-patterns")
#pragma GCC optimize("-ftree-vectorize")
// The beauty of these compiler options is that they generate better code than
// hand-written intrinsics, since inline-expanded memory transfers can be pattern
// matched with the vector instructions available on the target.
#define __no_inline __attribute__((noinline))
#define __hot __attribute__((hot))
#if !defined(__forceinline)
#define __forceinline inline __attribute__((always_inline))
#endif
#else
#define __hot
#define __no_inline
#define __forceinline inline
#endif
#pragma region Z_Order
/////////////////////////////////////////////////////////////////////////////
// Z-Order:
//
// 0-->1
// /
// 2-->3
//
// for more information look at: https://en.wikipedia.org/wiki/Z-order_curve
/////////////////////////////////////////////////////////////////////////////
#define TOP_LEFT 0
#define TOP_RIGHT 1
#define BOTTOM_LEFT 2
#define BOTTOM_RIGHT 3
// @param read_size is the number of bytes each pixel takes
__forceinline void decode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
std::memcpy(matrix_pointer, morton_pointer, read_size);
}
// @param read_size is the number of bytes each pixel takes
__forceinline void encode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
std::memcpy(morton_pointer, matrix_pointer, read_size);
}
constexpr u32 isRight(u32 block_index) {
return (block_index % 2);
}
constexpr u32 isBottom(u32 block_index) {
return (block_index / 2);
}
template <void codec(u8*, u8*, size_t), size_t nibbles, u32 blocks, size_t block_size>
__forceinline static void swizzle_block(u8*& morton_block, u8* linear_block);
template <void codec(u8*, u8*, size_t), size_t nibbles, u32 block_index, u32 blocks,
size_t block_size>
__forceinline static void swizzle_block_aux(u8*& morton_block, u8* linear_block) {
// move the linear_block pointer to the appropriate block
const size_t right = isRight(block_index) * (blocks * nibbles) / 2;
const size_t down = isBottom(block_index) * block_size;
u8* new_linear = linear_block + right + down;
swizzle_block<codec, nibbles, blocks, block_size>(morton_block, new_linear);
}
template <void codec(u8*, u8*, size_t), size_t nibbles, u32 blocks, size_t block_size>
__forceinline static void swizzle_block(u8*& morton_block, u8* linear_block) {
const size_t new_block_size = block_size / 2;
if (blocks <= 2) {
// We handle 2*2 blocks on z-order
const size_t read_size = nibbles; // just for clarity; it's the same amount
// TOP_LEFT & TOP_RIGHT
codec(morton_block, linear_block, read_size);
morton_block += read_size;
// BOTTOM_LEFT & BOTTOM_RIGHT
codec(morton_block, linear_block + new_block_size, read_size);
morton_block += read_size;
} else {
// we divide the block into 4 sub-blocks in z-order, recursing
// until we reach 2x2 blocks.
const u32 subdivide = blocks / 2;
swizzle_block_aux<codec, nibbles, TOP_LEFT, subdivide, new_block_size>(morton_block,
linear_block);
swizzle_block_aux<codec, nibbles, TOP_RIGHT, subdivide, new_block_size>(morton_block,
linear_block);
swizzle_block_aux<codec, nibbles, BOTTOM_LEFT, subdivide, new_block_size>(morton_block,
linear_block);
swizzle_block_aux<codec, nibbles, BOTTOM_RIGHT, subdivide, new_block_size>(morton_block,
linear_block);
}
}
template <void codec(u8*, u8*, size_t), size_t nibbles, size_t lines_per_block>
__forceinline static void swizzle_pass(u8* morton_block, u8* linear_block) {
const size_t block_size = (lines_per_block * lines_per_block * nibbles) / 2;
swizzle_block<codec, nibbles, lines_per_block, block_size>(morton_block, linear_block);
}
#pragma endregion Z_Order
template <void codec(u8*, u8*, size_t), size_t nibbles, size_t lines_per_block>
__forceinline void tiling_pass(u8* linear, u8* tiled, u32 x_blocks) {
const size_t tiled_line_size = (lines_per_block * nibbles) / 2;
const size_t row_length = x_blocks * tiled_line_size;
for (u32 i = 0; i < lines_per_block; i++) {
const u32 k = (lines_per_block - 1 - i);
const size_t tiled_index = i * tiled_line_size;
const size_t linear_index = k * row_length;
codec(tiled + tiled_index, linear + linear_index, tiled_line_size);
}
}
template <size_t nibbles, size_t lines_per_block>
__forceinline static void encode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) {
const u32 tile_size = (lines_per_block * lines_per_block * nibbles) / 2;
alignas(64) u8 tmp[tile_size];
tiling_pass<&encode, nibbles, lines_per_block>(linear_buffer, tmp, x_blocks);
swizzle_pass<&encode, nibbles, lines_per_block>(morton_buffer, tmp);
}
template <size_t nibbles, size_t lines_per_block>
__forceinline static void decode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) {
const u32 tile_size = (lines_per_block * lines_per_block * nibbles) / 2;
alignas(64) u8 tmp[tile_size];
swizzle_pass<&decode, nibbles, lines_per_block>(morton_buffer, tmp);
tiling_pass<&decode, nibbles, lines_per_block>(linear_buffer, tmp, x_blocks);
}
template <void codec(u8*, u8*, u32), size_t nibbles, size_t lines_per_block>
static void morton_pass(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height) {
const u32 x_blocks = (width / lines_per_block);
const u32 y_blocks = (height / lines_per_block);
const size_t line_size = (lines_per_block * nibbles) / 2;
const size_t tile_size = lines_per_block * line_size;
const size_t stride_size = width * line_size;
matrix_buffer = matrix_buffer + ((height * width * nibbles) / 2) - stride_size;
for (u32 y = 0; y < y_blocks; y++) {
u8* linear_buffer = matrix_buffer;
for (u32 x = 0; x != x_blocks; x++) {
codec(morton_buffer, linear_buffer, x_blocks);
linear_buffer += line_size;
morton_buffer += tile_size;
}
matrix_buffer -= stride_size;
}
}
namespace Decoders {
bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) {
switch (bpp) {
case 4: {
morton_pass<&decode_pass<1, 8>, 1, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 8: {
morton_pass<&decode_pass<2, 8>, 2, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 16: {
morton_pass<&decode_pass<4, 8>, 4, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 24: {
morton_pass<&decode_pass<6, 8>, 6, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 32: {
morton_pass<&decode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
default: {
return false;
break;
}
}
}
}
namespace Encoders {
bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) {
switch (bpp) {
case 4: {
morton_pass<&encode_pass<1, 8>, 1, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 8: {
morton_pass<&encode_pass<2, 8>, 2, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 16: {
morton_pass<&encode_pass<4, 8>, 4, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 24: {
morton_pass<&encode_pass<6, 8>, 6, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 32: {
morton_pass<&encode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
default: {
return false;
break;
}
}
}
}
#ifdef _MSC_VER
#undef __hot
#undef __no_inline
#elif defined(CLANG_OR_GCC)
#pragma GCC pop_options
#undef __no_inline
#undef __hot
#else
#undef __hot
#undef __no_inline
#undef __forceinline
#endif
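
morton_pass above consumes the tiled buffer tile after tile while filling the linear image bottom-up: matrix_buffer starts at the last row of 8x8 blocks and moves back by stride_size after each block row. The pointer arithmetic for a small 16x16, 32 bpp surface, checked in isolation:

#include <cassert>
#include <cstddef>

int main() {
    const size_t nibbles = 8, lines_per_block = 8;             // 32 bpp, 8x8 tiles
    const size_t width = 16, height = 16;                      // a 2x2 grid of tiles

    const size_t x_blocks = width / lines_per_block;           // 2
    const size_t y_blocks = height / lines_per_block;          // 2
    const size_t line_size = (lines_per_block * nibbles) / 2;  // 32 bytes per tile row
    const size_t tile_size = lines_per_block * line_size;      // 256 bytes per tile
    const size_t stride_size = width * line_size;              // 512 bytes per block row
    const size_t total_size = (height * width * nibbles) / 2;  // 1024 bytes in total

    assert(total_size - stride_size == 512);     // initial matrix_buffer offset: last block row
    assert(x_blocks * tile_size == stride_size); // one row of tiles consumes one stride
    assert(y_blocks * x_blocks * tile_size == total_size); // every byte is visited exactly once
    return 0;
}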

View File

@ -16,6 +16,7 @@
#ifdef _MSC_VER
#pragma inline_recursion(on)
#elif defined(CLANG_OR_GCC)
#pragma GCC push_options
#pragma GCC optimize("-fpeel-loops")
#pragma GCC optimize("-fpredictive-commoning")
#pragma GCC optimize("-ftree-loop-distribute-patterns")
@ -74,24 +75,6 @@ inline void image_pass(u8* target, u32 width, u32 height) {
image_pass_aux_rev<pass, read_size, write_size, tuning>(target, width, height);
}
template <void codec(u8*, u8*, size_t), size_t nibbles, size_t lines_per_block>
void tiling_pass(u8* linear, u8* tiled, u32 x_blocks) {
const size_t tiled_line_size = (lines_per_block * nibbles) / 2;
const size_t row_length = x_blocks * tiled_line_size;
for (u32 i = 0; i < lines_per_block; i++) {
const u32 k = (lines_per_block - 1 - i);
const size_t tiled_index = i * tiled_line_size;
const size_t linear_index = k * row_length;
codec(tiled + tiled_index, linear + linear_index, tiled_line_size);
}
}
// @param read_size is the number of bytes each pixel takes
inline void decode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
std::memcpy(matrix_pointer, morton_pointer, read_size);
}
// @param read_size is the number of bytes each pixel takes
inline void encode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
std::memcpy(morton_pointer, matrix_pointer, read_size);
}
#if defined(CLANG_OR_GCC)
#pragma GCC pop_options
#endif
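
image_pass itself is not part of this diff (only the image_pass_aux_rev call is visible), but the call sites in the decoders, e.g. image_pass<&Decode::expand_depth16_pass, 4, 8>, suggest an in-place per-pixel pass whose read and write strides are given in nibbles and which walks backwards when the output is wider than the input, so pixels that have not been read yet are never overwritten. A standalone sketch of that idea under those assumptions, using byte strides instead of nibbles to keep it short:

#include <cassert>
#include <cstdint>
#include <cstring>

// Hypothetical in-place expanding pass: last pixel first, so the widened output
// of pixel i never clobbers the still-unread input of pixels 0..i-1.
template <void pass(uint8_t*, uint8_t*), size_t read_bytes, size_t write_bytes>
static void image_pass_sketch(uint8_t* target, uint32_t width, uint32_t height) {
    for (size_t i = size_t(width) * height; i-- > 0;)
        pass(target + i * read_bytes, target + i * write_bytes);
}

// Example pixel pass: widen a 16-bit depth value into a 32-bit slot
// (the real expand_depth16_pass also rescales; this just zero-extends).
static void expand_depth16(uint8_t* read, uint8_t* write) {
    uint16_t d16;
    std::memcpy(&d16, read, 2);
    uint32_t d32 = d16;
    std::memcpy(write, &d32, 4);
}

int main() {
    uint8_t buf[4 * 4] = {};            // room for 4 pixels at the widened size
    uint16_t src[4] = {1, 2, 3, 4};
    std::memcpy(buf, src, sizeof(src)); // packed 16-bit input at the front
    image_pass_sketch<&expand_depth16, 2, 4>(buf, 2, 2);
    uint32_t out;
    std::memcpy(&out, buf + 3 * 4, 4);
    assert(out == 4);                   // last pixel expanded in place
    return 0;
}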