Remade texture decoding/encoding

2016-12-30 01:26:45 -05:00
parent f556d6ee90
commit afb6f88af1
4 changed files with 712 additions and 98 deletions
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -6,6 +6,7 @@ set(SRCS
            renderer_opengl/gl_state.cpp
            renderer_opengl/renderer_opengl.cpp
            debug_utils/debug_utils.cpp
            texture_codecs/codecs.cpp
            clipper.cpp
            command_processor.cpp
            pica.cpp
@@ -21,6 +22,7 @@ set(SRCS
 set(HEADERS
            debug_utils/debug_utils.h
            texture_codecs/codecs.h
            renderer_opengl/gl_rasterizer.h
            renderer_opengl/gl_rasterizer_cache.h
            renderer_opengl/gl_resource_manager.h
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -18,6 +18,7 @@
 #include "core/frontend/emu_window.h"
 #include "core/memory.h"
 #include "video_core/debug_utils/debug_utils.h"
 #include "video_core/texture_codecs/codecs.h"
 #include "video_core/pica_state.h"
 #include "video_core/renderer_opengl/gl_rasterizer_cache.h"
 #include "video_core/renderer_opengl/gl_state.h"
@@ -54,55 +55,6 @@ RasterizerCacheOpenGL::~RasterizerCacheOpenGL() {
    FlushAll();
 }
 /**
  * Copies every pixel between a Morton-ordered (tiled) buffer and a linear buffer whose rows are
  * stored bottom-up (row y of the tiled image maps to row height - 1 - y of the linear one).
  *
  * @param pixel_format Format of the surface; D24S8 texels get their bytes rotated while copied.
  * @param width Surface width in pixels.
  * @param height Surface height in pixels.
  * @param bytes_per_pixel Size of one pixel in the tiled buffer (also the number of bytes copied
  *                        per pixel for non-D24S8 formats).
  * @param gl_bytes_per_pixel Size of one pixel in the linear buffer.
  * @param morton_data Tiled buffer.
  * @param gl_data Linear buffer.
  * @param morton_to_gl true to copy tiled -> linear, false to copy linear -> tiled.
  */
 static void MortonCopyPixels(CachedSurface::PixelFormat pixel_format, u32 width, u32 height,
                              u32 bytes_per_pixel, u32 gl_bytes_per_pixel, u8* morton_data,
                              u8* gl_data, bool morton_to_gl) {
    using PixelFormat = CachedSurface::PixelFormat;

    const bool is_depth_stencil = (pixel_format == PixelFormat::D24S8);
    // Depth and stencil are ordered differently on the 3DS and in OpenGL, so a D24S8 texel is
    // rotated by one byte; the rotation direction depends on the copy direction.
    const u32 shift_left = morton_to_gl ? 8 : 24;
    const u32 shift_right = morton_to_gl ? 24 : 8;

    for (unsigned row = 0; row < height; ++row) {
        // Offset of the 8-row tile band this row belongs to.
        const u32 band_offset = (row & ~7) * width * bytes_per_pixel;
        for (unsigned col = 0; col < width; ++col) {
            const u32 tiled_offset =
                VideoCore::GetMortonOffset(col, row, bytes_per_pixel) + band_offset;
            const u32 linear_offset = (col + (height - 1 - row) * width) * gl_bytes_per_pixel;

            u8* const tiled_px = morton_data + tiled_offset;
            u8* const linear_px = &gl_data[linear_offset];
            u8* const dst = morton_to_gl ? linear_px : tiled_px;
            const u8* const src = morton_to_gl ? tiled_px : linear_px;

            if (is_depth_stencil) {
                u32 texel;
                memcpy(&texel, src, sizeof(u32));
                texel = (texel << shift_left) | (texel >> shift_right);
                memcpy(dst, &texel, sizeof(u32));
            } else {
                memcpy(dst, src, bytes_per_pixel);
            }
        }
    }
 }
 void RasterizerCacheOpenGL::BlitTextures(GLuint src_tex, GLuint dst_tex,
                                         CachedSurface::SurfaceType type,
                                         const MathUtil::Rectangle<int>& src_rect,
@@ -224,6 +176,205 @@ static void AllocateSurfaceTexture(GLuint texture, CachedSurface::PixelFormat pi
    cur_state.Apply();
 }
 // TODO: refactor this function into a factory method, sepparating format decoding
 // from ogl texture loading. Thus the decoder could be used for different backends.
 static void DecodeTexture(const CachedSurface& params, u8* texture_src_data, FormatTuple tuple) {
    CachedSurface::PixelFormat format = params.pixel_format;
    int invalid_conditions = 0;
    invalid_conditions |= (texture_src_data == 0);
    invalid_conditions |= (params.width < 8);
    invalid_conditions |= (params.height < 8);
    if (invalid_conditions) {
        LOG_CRITICAL(Render_OpenGL, "Invalid Texture sent to decoder! ");
        return;
    }
    switch (format) {
        case CachedSurface::PixelFormat::RGBA8: {
            u8* tex_buffer = new u8[params.width * params.height * 4];
            u8* in_buffer = texture_src_data;
            Pica::Decoders::Morton(
                in_buffer, tex_buffer, params.width, params.height, 4
            );
            Pica::Decoders::BigEndian(
                reinterpret_cast<u32*>(tex_buffer), params.width, params.height
            );
            glTexImage2D(
                GL_TEXTURE_2D, 0, tuple.internal_format, params.width,
                params.height, 0, GL_RGBA, GL_UNSIGNED_BYTE, tex_buffer
            );
            // FIXME: swizzle would work perfectly iff
            // flushing surfaces wouldnt mess it up
            // it would be ideal to use swizzling instead of BigEndianDecoding
            // in either case, this could be tracked and fixed on the
            // fragment shader.
            //if (!params.flushed) {
                //GLint swiz[4] = {GL_ALPHA, GL_BLUE, GL_GREEN, GL_RED};
                //glTexParameteriv(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_RGBA, swiz);
            //}
            delete tex_buffer;
            return;
        }
        case CachedSurface::PixelFormat::RGB8: {
            u8* tex_buffer = new u8[params.width * params.height * 3];
            Pica::Decoders::Morton(
                texture_src_data, tex_buffer, params.width, params.height, 3
            );
            glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
            glTexImage2D(
                GL_TEXTURE_2D, 0, tuple.internal_format, params.width,
                params.height, 0, GL_BGR, GL_UNSIGNED_BYTE, tex_buffer
            );
            glPixelStorei(GL_UNPACK_ALIGNMENT, 4);
            delete tex_buffer;
            return;
        }
        case CachedSurface::PixelFormat::RGB5A1: {
            u8* tex_buffer = new u8[params.width * params.height * 2];
            u8* in_buffer = texture_src_data;
            Pica::Decoders::Morton(
                in_buffer, tex_buffer, params.width, params.height, 2
            );
            glPixelStorei(GL_UNPACK_ALIGNMENT, 2);
            glPixelStorei(GL_UNPACK_LSB_FIRST, GL_TRUE);
            glTexImage2D(
                GL_TEXTURE_2D, 0, tuple.internal_format, params.width,
                params.height, 0, GL_RGBA, GL_UNSIGNED_SHORT_5_5_5_1, tex_buffer
            );
            glPixelStorei(GL_UNPACK_ALIGNMENT, 4);
            glPixelStorei(GL_UNPACK_LSB_FIRST, GL_FALSE);
            delete tex_buffer;
            return;
        }
        case CachedSurface::PixelFormat::RGB565: {
            u8* tex_buffer = new u8[params.width * params.height * 2];
            u8* in_buffer = texture_src_data;
            Pica::Decoders::Morton(
                in_buffer, tex_buffer, params.width, params.height, 2
            );
            glPixelStorei(GL_UNPACK_ALIGNMENT, 2);
            glPixelStorei(GL_UNPACK_LSB_FIRST, GL_TRUE);
            glTexImage2D(
                GL_TEXTURE_2D, 0, tuple.internal_format, params.width,
                params.height, 0, GL_RGB, GL_UNSIGNED_SHORT_5_6_5, tex_buffer
            );
            glPixelStorei(GL_UNPACK_ALIGNMENT, 4);
            glPixelStorei(GL_UNPACK_LSB_FIRST, GL_FALSE);
            delete tex_buffer;
            return;
        }
        case CachedSurface::PixelFormat::RGBA4: {
            u8* tex_buffer = new u8[params.width * params.height * 2];
            u8* in_buffer = texture_src_data;
            Pica::Decoders::Morton(
                in_buffer, tex_buffer, params.width, params.height, 2
            );
            glPixelStorei(GL_UNPACK_ALIGNMENT, 2);
            glPixelStorei(GL_UNPACK_LSB_FIRST, GL_TRUE);
            glTexImage2D(
                GL_TEXTURE_2D, 0, tuple.internal_format, params.width,
                params.height, 0, GL_RGBA, GL_UNSIGNED_SHORT_4_4_4_4, tex_buffer
            );
            glPixelStorei(GL_UNPACK_ALIGNMENT, 4);
            glPixelStorei(GL_UNPACK_LSB_FIRST, GL_FALSE);
            delete tex_buffer;
            return;
        }
        case CachedSurface::PixelFormat::RG8: {
            u8* tex_buffer = new u8[params.width * params.height * 2];
            u8* in_buffer = texture_src_data;
            Pica::Decoders::Morton(
                in_buffer, tex_buffer, params.width >> 1, params.height >> 1, 2
            );
            glPixelStorei(GL_UNPACK_ALIGNMENT, 2);
            glPixelStorei(GL_UNPACK_LSB_FIRST, GL_TRUE);
            glTexImage2D(
                GL_TEXTURE_2D, 0, tuple.internal_format, params.width,
                params.height, 0, GL_RG, GL_UNSIGNED_BYTE, tex_buffer
            );
            glPixelStorei(GL_UNPACK_ALIGNMENT, 4);
            glPixelStorei(GL_UNPACK_LSB_FIRST, GL_FALSE);
            delete tex_buffer;
            return;
        }
        case CachedSurface::PixelFormat::D16: {
            u8* tex_buffer = new u8[params.width * params.height * 2];
            u8* in_buffer = texture_src_data;
            Pica::Decoders::Morton(
                in_buffer, tex_buffer, params.width, params.height, 2
            );
            glPixelStorei(GL_UNPACK_ALIGNMENT, 2);
            glTexImage2D(
                GL_TEXTURE_2D, 0, tuple.internal_format, params.width,
                params.height, 0, tuple.format, tuple.type, tex_buffer
            );
            glPixelStorei(GL_UNPACK_ALIGNMENT, 4);
            delete tex_buffer;
            return;
        }
        case CachedSurface::PixelFormat::D24: {
            u8* tex_buffer = new u8[params.width * params.height * 4];
            Pica::Decoders::MortonU24_U32(
                texture_src_data, tex_buffer, params.width, params.height
            );
            glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
            glTexImage2D(
                GL_TEXTURE_2D, 0, tuple.internal_format, params.width,
                params.height, 0, tuple.format, tuple.type, tex_buffer
            );
            glPixelStorei(GL_UNPACK_ALIGNMENT, 4);
            delete tex_buffer;
            return;
        }
        case CachedSurface::PixelFormat::D24S8: {
            u32 size = params.width * params.height;
            u8* tmp_buffer = new u8[size * 4];
            u8* in_buffer = texture_src_data;
            Pica::Decoders::Morton(
                in_buffer, tmp_buffer, params.width, params.height, 4
            );
            u32* tex_buffer = reinterpret_cast<u32*>(tmp_buffer);
            Pica::Decoders::Depth(tex_buffer, params.width, params.height);
            glTexImage2D(
                GL_TEXTURE_2D, 0, tuple.internal_format, params.width,
                params.height, 0, tuple.format, tuple.type, tex_buffer
            );
            // FIXME: swizzle requires to be set up on glstate in order to work
            // correctly.
            // GLint swiz[4] = {GL_GREEN, GL_BLUE, GL_ALPHA, GL_RED};
            // glTexParameteriv(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_RGBA, swiz);
            delete tex_buffer;
            return;
        }
        // TODO: ETC1 and ETCA4 need a decoder
        // Fallback to LookupTexture
        case CachedSurface::PixelFormat::ETC1:
        case CachedSurface::PixelFormat::ETC1A4:
        default: {
            break;
        }
    }
    u32* tex_buffer = new u32[params.width * params.height];
    Pica::DebugUtils::TextureInfo tex_info;
    tex_info.width = params.width;
    tex_info.height = params.height;
    tex_info.stride =
        params.width * CachedSurface::GetFormatBpp(params.pixel_format) / 8;
    tex_info.format = (Pica::Regs::TextureFormat)params.pixel_format;
    tex_info.physical_address = params.addr;
    for (unsigned y = 0; y < params.height; ++y) {
        for (unsigned x = 0; x < params.width; ++x) {
            Math::Vec4<u8> v = Pica::DebugUtils::LookupTexture(
                texture_src_data, x, params.height - 1 - y, tex_info);
            tex_buffer[x + y*params.width] = *reinterpret_cast<u32*>(v.AsArray());
        }
    }
    glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height,
                 0, GL_RGBA, GL_UNSIGNED_BYTE, tex_buffer);
    delete tex_buffer;
    return;
 }
 MICROPROFILE_DEFINE(OpenGL_SurfaceUpload, "OpenGL", "Surface Upload", MP_RGB(128, 64, 192));
 CachedSurface* RasterizerCacheOpenGL::GetSurface(const CachedSurface& params, bool match_res_scale,
                                                 bool load_if_create) {
@@ -337,52 +488,14 @@ CachedSurface* RasterizerCacheOpenGL::GetSurface(const CachedSurface& params, bo
                    // Texture
                    tuple = {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE};
                }
-
+                DecodeTexture(params, texture_src_data, tuple);
                std::vector<Math::Vec4<u8>> tex_buffer(params.width * params.height);
                Pica::DebugUtils::TextureInfo tex_info;
                tex_info.width = params.width;
                tex_info.height = params.height;
                tex_info.stride =
                    params.width * CachedSurface::GetFormatBpp(params.pixel_format) / 8;
                tex_info.format = (Pica::Regs::TextureFormat)params.pixel_format;
                tex_info.physical_address = params.addr;
                for (unsigned y = 0; y < params.height; ++y) {
                    for (unsigned x = 0; x < params.width; ++x) {
                        tex_buffer[x + params.width * y] = Pica::DebugUtils::LookupTexture(
                            texture_src_data, x, params.height - 1 - y, tex_info);
                    }
                }
                glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height,
                             0, GL_RGBA, GL_UNSIGNED_BYTE, tex_buffer.data());
            } else {
                // Depth/Stencil formats need special treatment since they aren't sampleable using
                // LookupTexture and can't use RGBA format
                size_t tuple_idx = (size_t)params.pixel_format - 14;
                ASSERT(tuple_idx < depth_format_tuples.size());
                const FormatTuple& tuple = depth_format_tuples[tuple_idx];
-
+                DecodeTexture(params, texture_src_data, tuple);
                u32 bytes_per_pixel = CachedSurface::GetFormatBpp(params.pixel_format) / 8;
                // OpenGL needs 4 bpp alignment for D24 since using GL_UNSIGNED_INT as type
                bool use_4bpp = (params.pixel_format == PixelFormat::D24);
                u32 gl_bytes_per_pixel = use_4bpp ? 4 : bytes_per_pixel;
                std::vector<u8> temp_fb_depth_buffer(params.width * params.height *
                                                     gl_bytes_per_pixel);
                u8* temp_fb_depth_buffer_ptr =
                    use_4bpp ? temp_fb_depth_buffer.data() + 1 : temp_fb_depth_buffer.data();
                MortonCopyPixels(params.pixel_format, params.width, params.height, bytes_per_pixel,
                                 gl_bytes_per_pixel, texture_src_data, temp_fb_depth_buffer_ptr,
                                 true);
                glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height,
                             0, tuple.format, tuple.type, temp_fb_depth_buffer.data());
            }
        }
@@ -716,9 +829,10 @@ void RasterizerCacheOpenGL::FlushSurface(CachedSurface* surface) {
            // Directly copy pixels. Internal OpenGL color formats are consistent so no conversion
            // is necessary.
-            MortonCopyPixels(surface->pixel_format, surface->width, surface->height,
+            Pica::Encoders::Morton(
-                             bytes_per_pixel, bytes_per_pixel, dst_buffer, temp_gl_buffer.data(),
+                temp_gl_buffer.data(), dst_buffer,
-                             false);
+                surface->width, surface->height, bytes_per_pixel
            );
        } else {
            // Depth/Stencil formats need special treatment since they aren't sampleable using
            // LookupTexture and can't use RGBA format
@@ -730,18 +844,37 @@ void RasterizerCacheOpenGL::FlushSurface(CachedSurface* surface) {
            // OpenGL needs 4 bpp alignment for D24 since using GL_UNSIGNED_INT as type
            bool use_4bpp = (surface->pixel_format == PixelFormat::D24);
            u32 gl_bytes_per_pixel = use_4bpp ? 4 : bytes_per_pixel;
            std::vector<u8> temp_gl_buffer(surface->width * surface->height * gl_bytes_per_pixel);
            glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, temp_gl_buffer.data());
-            u8* temp_gl_buffer_ptr = use_4bpp ? temp_gl_buffer.data() + 1 : temp_gl_buffer.data();
+            switch (surface->pixel_format) {
-
+                case PixelFormat::D24: {
-            MortonCopyPixels(surface->pixel_format, surface->width, surface->height,
+                    Pica::Encoders::Morton(
-                             bytes_per_pixel, gl_bytes_per_pixel, dst_buffer, temp_gl_buffer_ptr,
+                        temp_gl_buffer.data(), dst_buffer,
-                             false);
+                        surface->width, surface->height, 4
                    );
                    break;
                }
                case PixelFormat::D24S8: {
                    Pica::Encoders::Morton(
                        temp_gl_buffer.data(), dst_buffer,
                        surface->width, surface->height, 4
                    );
                    u32* tex_buffer = reinterpret_cast<u32*>(dst_buffer);
                    Pica::Encoders::Depth(tex_buffer, surface->width, surface->height);
                    break;
                }
                default: {
                    Pica::Encoders::Morton(
                        temp_gl_buffer.data(), dst_buffer,
                        surface->width, surface->height, bytes_per_pixel
                    );
                    break;
                }
            }
        }
    }
--- a/src/video_core/texture_codecs/codecs.cpp
+++ b/src/video_core/texture_codecs/codecs.cpp
@@ -0,0 +1,435 @@
 #include "video_core/texture_codecs/codecs.h"
 #include "common/assert.h"
 #include <utility>
    /*
     * Compilers cannot always prove that a loop is safe to vectorize. When the
     * programmer is certain that it is, VECTORIZE_NEXT can be placed directly in
     * front of a `for` loop to tell the compiler to vectorize it unconditionally.
     *
     * __clang__ is tested first because clang also defines __GNUC__ (and clang-cl
     * additionally defines _MSC_VER); testing it later would make the clang branch
     * unreachable. MSVC uses its __pragma keyword, which is usable inside macros.
     */
    #if defined(__clang__)
        #define VECTORIZE_NEXT _Pragma("clang loop vectorize(enable) interleave(enable)")
    #elif defined(_MSC_VER)
        #define VECTORIZE_NEXT __pragma(loop(ivdep))
    #elif defined(__GNUC__)
        #define VECTORIZE_NEXT _Pragma("GCC ivdep")
    #else
        #define VECTORIZE_NEXT
    #endif
    // Note: the copy helpers below are deliberately tiny and branch-free so the
    // compiler can inline them, unroll the surrounding loops and merge the moves
    // into the widest load/store that suits the target.
    /**
     * Copies two consecutive elements from `from` to `out`.
     *
     * A single generic implementation is used for every element type. The former
     * u8/u16/u32 specializations copied both elements at once through a pointer to
     * a wider integer type, which violates the strict aliasing rule; optimizers
     * already combine the two element moves below where that is profitable.
     *
     * @param from Source; at least two elements are read.
     * @param out Destination; at least two elements are written.
     */
    template <class T>
    inline void decode_simple(T* from, T* out) {
        out[0] = from[0];
        out[1] = from[1];
    }
    /**
     * Copies two consecutive elements from `out` to `from`, i.e. in the opposite
     * direction of decode_simple while keeping the same parameter order, so both
     * can be plugged into the same block walker.
     *
     * A single generic implementation is used for every element type. The former
     * u8/u16/u32 specializations copied both elements at once through a pointer to
     * a wider integer type, which violates the strict aliasing rule.
     *
     * @param from Destination; at least two elements are written.
     * @param out Source; at least two elements are read.
     */
    template <class T>
    inline void encode_simple(T* from, T* out) {
        from[0] = out[0];
        from[1] = out[1];
    }
    /**
     * Splits the first source element into its two low nibbles.
     * out[0] receives bits 0-3 and out[1] receives bits 4-7 of from[0].
     */
    template <class T>
    inline void decode_u4(T* from, T* out) {
        out[0] = from[0] & 0x0F;
        out[1] = (from[0] >> 4) & 0x0F;
    }
    /**
     * Copies six consecutive elements (two 3-element texels) from `from` to `out`,
     * in ascending index order.
     */
    template <class T>
    inline void decode_u24(T* from, T* out) {
        for (int i = 0; i < 6; ++i) {
            out[i] = from[i];
        }
    }
    /**
     * Copies six consecutive elements (two 3-element texels) from `out` to `from`,
     * in ascending index order. This is decode_u24 with the copy direction reversed.
     */
    template <class T>
    inline void encode_u24(T* from, T* out) {
        for (int i = 0; i < 6; ++i) {
            from[i] = out[i];
        }
    }
    /**
     * Widens two 3-element texels into two 4-element texels: the three source
     * elements are copied and the fourth destination element is set to 0.
     * Reads from[0..5] and writes out[0..7].
     */
    template <class T>
    inline void decode_depth(T* from, T* out) {
        for (int texel = 0; texel < 2; ++texel) {
            T* const dst = out + texel * 4;
            T* const src = from + texel * 3;
            dst[0] = src[0];
            dst[1] = src[1];
            dst[2] = src[2];
            dst[3] = 0;
        }
    }
    /**
     * Narrows two 4-element texels into two 3-element texels by dropping the
     * fourth element of each. Reads from[0..2] and from[4..6], writes out[0..5].
     *
     * NOTE(review): unlike encode_simple / encode_u24, this helper copies from
     * `from` to `out` - confirm that this direction is what callers expect.
     */
    template <class T>
    inline void encode_depth(T* from, T* out) {
        for (int texel = 0; texel < 2; ++texel) {
            T* const dst = out + texel * 3;
            T* const src = from + texel * 4;
            dst[0] = src[0];
            dst[1] = src[1];
            dst[2] = src[2];
        }
    }
    template <class T, void func(T*,T*), int read_size, int write_size>
    inline void morton_block2x2(T* from, T* &w1, T* &w2) {
        func(from, w1);
        w1 += write_size*2;
        func(from + read_size*2, w2);
        w2 += write_size*2;
    }
    /**
     * Processes one 4x4 tile as four 2x2 tiles. The first two 2x2 tiles go to the
     * row cursor pair `w1`, the last two to the row cursor pair `w2`; consecutive
     * 2x2 tiles are 4 texels apart on the tiled side.
     */
    template <class T, void func(T*,T*), int read_size, int write_size>
    inline void morton_block4x4(T* from, T** w1, T** w2) {
        constexpr int quad = read_size * 4; // elements per 2x2 tile on the tiled side

        morton_block2x2<T,func,read_size, write_size>(from, w1[0], w1[1]);
        morton_block2x2<T,func,read_size, write_size>(from + quad, w1[0], w1[1]);
        morton_block2x2<T,func,read_size, write_size>(from + 2 * quad, w2[0], w2[1]);
        morton_block2x2<T,func,read_size, write_size>(from + 3 * quad, w2[0], w2[1]);
    }
    /**
     * Processes one 8x8 tile as four 4x4 tiles. The first two 4x4 tiles use row
     * cursors 0-3, the last two use row cursors 4-7; consecutive 4x4 tiles are 16
     * texels apart on the tiled side.
     *
     * @param from Start of the 64-texel tile on the tiled side.
     * @param cursors Eight row cursors on the linear side; each ends up advanced
     *                by eight texels.
     */
    template <class T, void func(T*,T*), int read_size, int write_size>
    inline void morton_block8x8( T* from, T** cursors ) {
        constexpr int sub_tile = read_size * 16; // elements per 4x4 tile on the tiled side

        morton_block4x4<T,func,read_size, write_size>(from, &cursors[0], &cursors[2]);
        morton_block4x4<T,func,read_size, write_size>(from + sub_tile, &cursors[0], &cursors[2]);
        morton_block4x4<T,func,read_size, write_size>(from + 2 * sub_tile, &cursors[4], &cursors[6]);
        morton_block4x4<T,func,read_size, write_size>(from + 3 * sub_tile, &cursors[4], &cursors[6]);
    }
    /**
     * Points the eight row cursors at the start of eight consecutive rows, going
     * backwards from `write_p`: cursors[i] = write_p - i * read_size * width.
     *
     * @tparam read_size Elements per texel, used as the multiplier of the row stride.
     * @param cursors Array of eight cursors to initialise.
     * @param write_p Start of the row that cursors[0] should point at.
     * @param width Row length in texels.
     */
    template <class T, int read_size>
    inline void rewind_cursors(T** cursors, T* write_p, u32 width) {
        for (int row = 0; row < 8; ++row) {
            cursors[row] = write_p - read_size*row*width;
        }
    }
    /**
     * Walks an image tile by tile (8x8 Morton-ordered tiles) and applies `func` to
     * every texel pair, pairing the tiled buffer with a linear buffer whose rows
     * are visited from the last row upwards (i.e. the image is flipped vertically).
     *
     * `func` decides the copy direction, so the same walker serves both decoding
     * (tiled -> linear) and encoding (linear -> tiled).
     *
     * All offsets on the linear side are computed with `write_size`. They used to
     * be computed with `read_size`, which addressed the wrong rows whenever the
     * two sizes differ (e.g. 3 -> 4 element conversion).
     *
     * @tparam func Texel pair copier; first argument is the tiled side, second the
     *              linear side.
     * @tparam read_size Elements per texel on the tiled side.
     * @tparam write_size Elements per texel on the linear side.
     * @param in_p Start of the tiled buffer.
     * @param write_p Start of the linear buffer.
     * @param width Image width in texels; only whole 8-texel tiles are processed.
     * @param height Image height in texels; only whole 8-texel tiles are processed.
     */
    template <class T, void func(T*,T*), int read_size, int write_size>
    inline void morton(T* in_p, T* write_p, u32 width, u32 height) {
        const u32 x_blocks = (width/8);
        const u32 y_blocks = (height/8);
        // Nothing to do; also avoids computing a wrapped offset for height == 0.
        if (x_blocks == 0 || y_blocks == 0) {
            return;
        }

        T* block_pointer = in_p;
        T* cursors[8];
        // Elements covered by one band of eight rows on the linear side.
        const u32 step = (8*width)*write_size;
        // Start of the last row of the linear image.
        T* const last_row = write_p + write_size*(width*(height - 1));

        for (u32 y = 0; y != y_blocks; y++) {
            // Computed per band instead of decrementing a running pointer, so no
            // pointer before the start of the buffer is ever formed.
            rewind_cursors<T,write_size>(cursors, last_row - y*step, width);
            VECTORIZE_NEXT for (u32 x = 0; x != x_blocks; x++) {
                morton_block8x8<T,func,read_size, write_size>(block_pointer, cursors);
                block_pointer += 64*read_size;
            }
        }
    }
    // These macros repeat the same statement N times to unroll tight loops by hand.
    // They should only be used for statements that do not branch.
    // Compilers cannot always tell that a loop is worth unrolling; normally they
    // need profiling data to do so.
    // Each macro expands to several statements separated by ';', so an invocation
    // must not be used as the unbraced body of an if/for/while.
    #define LOOP_UNROLL_1(CODE) CODE
    #define LOOP_UNROLL_2(CODE) LOOP_UNROLL_1(CODE); LOOP_UNROLL_1(CODE)
    #define LOOP_UNROLL_4(CODE) LOOP_UNROLL_2(CODE); LOOP_UNROLL_2(CODE)
    #define LOOP_UNROLL_8(CODE) LOOP_UNROLL_4(CODE); LOOP_UNROLL_4(CODE)
    #define LOOP_UNROLL_16(CODE) LOOP_UNROLL_8(CODE); LOOP_UNROLL_8(CODE)
    template <class T, void func(T*&)>
    inline void map_image(T* &out_buffer, u32 width, u32 height) {
        u32 writes = width*height/16; // 16 unfolds
        VECTORIZE_NEXT for(u32 i = 0; i != writes; i++) {
            LOOP_UNROLL_16(func(out_buffer));
        }
        // Now just do the rest
        writes = width*height - (writes*16);
        u32 jump = (writes % 8);
        // This form of loop unfolding works for every set of data at the
        // expense of not marshelling/vectorizing but won't break the pipeline
        switch (jump) {
                do {
                    jump = 8;
                    func(out_buffer);
            case 7:
                    func(out_buffer);
            case 6:
                    func(out_buffer);
            case 5:
                    func(out_buffer);
            case 4:
                    func(out_buffer);
            case 3:
                    func(out_buffer);
            case 2:
                    func(out_buffer);
            case 1:
                    func(out_buffer);
            case 0:
            default:
                    writes -= jump;
                } while (writes != 0);
        }
    }
    // Big Endian Decoding
    /**
     * Reverses the byte order of the element `out_buffer` points at, in place, and
     * advances `out_buffer` to the next element.
     *
     * The bytes of the pointed-to element are swapped (not the bytes of the pointer
     * variable itself), and the element size is taken in bytes via sizeof(T).
     *
     * @param out_buffer Cursor to the element to convert; incremented by one element.
     */
    template <class T>
    inline void big_endian_write(T* &out_buffer) {
        unsigned char* const bytes = reinterpret_cast<unsigned char*>(out_buffer);
        unsigned low = 0;
        unsigned high = static_cast<unsigned>(sizeof(T)) - 1;
        // Swap the outermost bytes and work inwards; a 1-byte element is left untouched.
        while (low < high) {
            std::swap(bytes[low], bytes[high]);
            ++low;
            --high;
        }
        out_buffer++;
    }
    /**
     * Reverses the order of the four bytes at `out_buffer` in place and advances
     * `out_buffer` by four bytes.
     */
    inline void big_u32(u8* &out_buffer) {
        u8* const word = out_buffer;
        std::swap(word[0], word[3]);
        std::swap(word[1], word[2]);
        out_buffer += 4;
    }
    /**
     * Rotates the 32 bit value at `out_buffer` left by 8 bits in place and advances
     * `out_buffer` to the next value.
     */
    static inline void rotateLeft(u32* &out_buffer) {
        const u32 texel = *out_buffer;
        *out_buffer = (texel << 8) | (texel >> 24);
        ++out_buffer;
    }
    /**
     * Rotates the 32 bit value at `out_buffer` right by 8 bits in place and advances
     * `out_buffer` to the next value.
     */
    static inline void rotateRight(u32* &out_buffer) {
        const u32 texel = *out_buffer;
        *out_buffer = (texel << 24) | (texel >> 8);
        ++out_buffer;
    }
    /**
     * Expands a 4 bit value to 8 bits by replicating it into the high nibble
     * (0x0 -> 0x00, 0xF -> 0xFF). The result is truncated to 8 bits.
     */
    constexpr u8 Convert4To8(u8 value) {
        return static_cast<u8>(value | (value << 4));
    }
    /**
     * Expands one input byte holding two 4 bit values into two output bytes: the
     * high nibble is written first, the low nibble second. Advances `in_buffer` by
     * one byte and `out_buffer` by two bytes.
     */
    inline void nimble_write(u8* &in_buffer, u8* &out_buffer) {
        out_buffer[0] = Convert4To8(*in_buffer >> 4);
        out_buffer[1] = Convert4To8(*in_buffer & 0x0F);
        ++in_buffer;
        out_buffer += 2;
    }
 namespace Pica {
 namespace Encoders {
    /**
     * Converts a linear image into Morton (tiled) order.
     *
     * The shared morton<> walker always treats its first argument as the tiled
     * side, so the two buffers are swapped up front; the encode_* functors then
     * copy from the linear side into the tiled side.
     *
     * @return true when bytespp is 1, 2, 3, 4 or 8; false for any other value, in
     *         which case nothing is copied.
     */
    bool Morton(u8* in_buffer, u8* out_buffer, u32 width, u32 height, u32 bytespp) {
        // From here on in_buffer is the tiled destination and out_buffer the linear source.
        std::swap(in_buffer,out_buffer);

        // Sanity checks
        ASSERT(in_buffer != nullptr && out_buffer != nullptr);
        ASSERT(((u64)in_buffer & 3) == 0);
        ASSERT(((u64)out_buffer & 3) == 0);
        ASSERT(width >= 8);
        ASSERT(height >= 8);
        ASSERT((width*height) % 64 == 0);

        if (bytespp == 1) {
            morton<u8,&encode_simple,1,1>(in_buffer, out_buffer, width, height);
            return true;
        }
        if (bytespp == 2) {
            morton<u16,&encode_simple,1,1>(
                reinterpret_cast<u16*>(in_buffer), reinterpret_cast<u16*>(out_buffer),
                width, height);
            return true;
        }
        if (bytespp == 3) {
            // 3 byte texels have no matching integer type; they are walked byte-wise.
            morton<u8,&encode_u24,3,3>(in_buffer, out_buffer, width, height);
            return true;
        }
        if (bytespp == 4) {
            morton<u32,&encode_simple,1,1>(
                reinterpret_cast<u32*>(in_buffer), reinterpret_cast<u32*>(out_buffer),
                width, height);
            return true;
        }
        if (bytespp == 8) {
            morton<u64,&encode_simple,1,1>(
                reinterpret_cast<u64*>(in_buffer), reinterpret_cast<u64*>(out_buffer),
                width, height);
            return true;
        }
        // Unsupported texel size.
        return false;
    }
    /**
     * Runs the Morton walker byte-wise with encode_depth, using 4 bytes per texel
     * for in_buffer and 3 bytes per texel for out_buffer.
     *
     * NOTE(review): unlike Encoders::Morton this does not swap the buffers, so
     * in_buffer is walked as the tiled side and out_buffer as the linear side -
     * confirm this is the intended direction for an encoder.
     */
    void MortonU32_U24(u8* in_buffer, u8* out_buffer, u32 width, u32 height) {
        morton<u8,&encode_depth,4,3>(in_buffer, out_buffer, width, height);
    }
    /**
     * Rotates each of the width * height 32 bit values in out_buffer right by
     * 8 bits, in place (the inverse of Decoders::Depth).
     */
    void Depth(u32* out_buffer, u32 width, u32 height) {
        map_image<u32,&rotateRight>(out_buffer,width,height);
    }
 } // Encoders
 namespace Decoders {
    /**
     * Untiles an image with 3 bytes per texel (in_buffer, Morton order) into a
     * linear image with 4 bytes per texel (out_buffer). decode_depth copies the
     * three source bytes and sets the fourth byte of every output texel to 0.
     */
    void MortonU24_U32(u8* in_buffer, u8* out_buffer, u32 width, u32 height) {
        morton<u8,&decode_depth,3,4>(in_buffer, out_buffer, width, height);
    }
    /**
     * Converts a Morton (tiled) image in in_buffer into a linear image in
     * out_buffer. The walker fills the linear image from its last row upwards.
     *
     * @return true when bytespp is 1, 2, 3, 4 or 8; false for any other value, in
     *         which case nothing is copied.
     */
    bool Morton(u8* in_buffer, u8* out_buffer, u32 width, u32 height, u32 bytespp) {
        // Sanity checks
        ASSERT(in_buffer != nullptr && out_buffer != nullptr);
        ASSERT(((u64)in_buffer & 3) == 0);
        ASSERT(((u64)out_buffer & 3) == 0);
        ASSERT(width >= 8);
        ASSERT(height >= 8);
        ASSERT((width*height) % 64 == 0);

        if (bytespp == 1) {
            morton<u8,&decode_simple,1,1>(in_buffer, out_buffer, width, height);
            return true;
        }
        if (bytespp == 2) {
            morton<u16,&decode_simple,1,1>(
                reinterpret_cast<u16*>(in_buffer), reinterpret_cast<u16*>(out_buffer),
                width, height);
            return true;
        }
        if (bytespp == 3) {
            // 3 byte texels have no matching integer type; they are walked byte-wise.
            morton<u8,&decode_u24,3,3>(in_buffer, out_buffer, width, height);
            return true;
        }
        if (bytespp == 4) {
            morton<u32,&decode_simple,1,1>(
                reinterpret_cast<u32*>(in_buffer), reinterpret_cast<u32*>(out_buffer),
                width, height);
            return true;
        }
        if (bytespp == 8) {
            morton<u64,&decode_simple,1,1>(
                reinterpret_cast<u64*>(in_buffer), reinterpret_cast<u64*>(out_buffer),
                width, height);
            return true;
        }
        // Unsupported texel size.
        return false;
    }
    /**
     * Reverses the byte order of each of the width * height 32 bit values in
     * out_buffer, in place.
     */
    void BigEndian(u32* out_buffer, u32 width, u32 height) {
        // big_u32 works on a byte cursor and advances it by four bytes per call.
        u8* tmp = reinterpret_cast<u8*>(out_buffer);
        map_image<u8,&big_u32>(tmp,width,height);
    }
    /**
     * Rotates each of the width * height 32 bit values in out_buffer left by
     * 8 bits, in place (the inverse of Encoders::Depth).
     */
    void Depth(u32* out_buffer, u32 width, u32 height) {
        map_image<u32,&rotateLeft>(out_buffer,width,height);
    }
    //Nimbles
    void Nimbles(u8* in_buffer, u8* out_buffer, u32 width, u32 height) {
        u32 writes = width*height/32; // 16 unfolds
        for(u32 i = 0; i != writes; i++) {
            LOOP_UNROLL_16(nimble_write(in_buffer, out_buffer));
        }
        // Now just do the rest
        writes = width*height - (writes*32);
        for(u32 i = 0; i != writes; i++) {
            LOOP_UNROLL_1(nimble_write(in_buffer, out_buffer));
        }
    }
 } // Decoders
 } // Pica
--- a/src/video_core/texture_codecs/codecs.h
+++ b/src/video_core/texture_codecs/codecs.h
@@ -0,0 +1,44 @@
 #pragma once
 #include "common/common_types.h"
 namespace Pica {
 namespace Encoders {
    /**
     * Encodes a texture from raw (linear) texel data into z-order/Morton order.
     * @param in_buffer pointer to the texture that needs encoding.
     * @param out_buffer pointer to a buffer where the encoded image will be written.
     * @param width texture's width
     * @param height texture's height
     * @param bytespp bytes per pixel; 1, 2, 3, 4 and 8 are supported
     * @return true if bytespp is supported, false otherwise (nothing is written)
     */
    bool Morton(u8* in_buffer, u8* out_buffer, u32 width, u32 height, u32 bytespp);
    /**
     * Morton walk between a buffer with 4 bytes per texel and a buffer with
     * 3 bytes per texel; the fourth byte of every texel is dropped.
     * @param in_buffer pointer to the image with 4 bytes per texel.
     * @param out_buffer pointer to the image with 3 bytes per texel.
     * @param width texture's width
     * @param height texture's height
     */
    void MortonU32_U24(u8* in_buffer, u8* out_buffer, u32 width, u32 height);
    /**
     * Rotates every 32 bit texel right by 8 bits, in place.
     * @param out_buffer pointer to width * height 32 bit texels.
     * @param width texture's width
     * @param height texture's height
     */
    void Depth(u32* out_buffer, u32 width, u32 height);
 } // Encoders
 namespace Decoders {
    /**
     * Decodes a texture from z-order/Morton order into raw (linear) texel data.
     * @param in_buffer pointer to the texture that needs decoding.
     * @param out_buffer pointer to a buffer where the decoded image will be written.
     * @param width texture's width
     * @param height texture's height
     * @param bytespp bytes per pixel; 1, 2, 3, 4 and 8 are supported
     * @return true if bytespp is supported, false otherwise (nothing is written)
     */
    bool Morton(u8* in_buffer, u8* out_buffer, u32 width, u32 height, u32 bytespp);
    /**
     * Decodes a Morton-ordered texture with 3 bytes per texel into linear texel
     * data with 4 bytes per texel; the added fourth byte is set to 0.
     * @param in_buffer pointer to the texture that needs decoding.
     * @param out_buffer pointer to a buffer where the decoded image will be written.
     * @param width texture's width
     * @param height texture's height
     */
    void MortonU24_U32(u8* in_buffer, u8* out_buffer, u32 width, u32 height);
    /**
     * Reverses the byte order of every 32 bit texel, in place.
     * @param out_buffer pointer to width * height 32 bit texels.
     * @param width texture's width
     * @param height texture's height
     */
    void BigEndian(u32* out_buffer, u32 width, u32 height);
    /**
     * Rotates every 32 bit texel left by 8 bits, in place.
     * @param out_buffer pointer to width * height 32 bit texels.
     * @param width texture's width
     * @param height texture's height
     */
    void Depth(u32* out_buffer, u32 width, u32 height);
    /**
     * Expands packed 4 bit texels (two per input byte) to one byte per texel.
     * @param in_buffer pointer to the packed 4 bit texels.
     * @param out_buffer pointer to a buffer where the expanded texels will be written.
     * @param width texture's width
     * @param height texture's height
     */
    void Nimbles(u8* in_buffer, u8* out_buffer, u32 width, u32 height);
 } // Decoders
 } // Pica