diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 6ca319b59..5e70bee7f 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -6,6 +6,7 @@ set(SRCS renderer_opengl/gl_state.cpp renderer_opengl/renderer_opengl.cpp debug_utils/debug_utils.cpp + texture_codecs/codecs.cpp clipper.cpp command_processor.cpp pica.cpp @@ -21,6 +22,7 @@ set(SRCS set(HEADERS debug_utils/debug_utils.h + texture_codecs/codecs.h renderer_opengl/gl_rasterizer.h renderer_opengl/gl_rasterizer_cache.h renderer_opengl/gl_resource_manager.h diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp index 85aa06cd5..637ee0ac9 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp @@ -21,6 +21,7 @@ #include "video_core/pica_state.h" #include "video_core/renderer_opengl/gl_rasterizer_cache.h" #include "video_core/renderer_opengl/gl_state.h" +#include "video_core/texture_codecs/codecs.h" #include "video_core/utils.h" #include "video_core/video_core.h" @@ -54,55 +55,6 @@ RasterizerCacheOpenGL::~RasterizerCacheOpenGL() { FlushAll(); } -static void MortonCopyPixels(CachedSurface::PixelFormat pixel_format, u32 width, u32 height, - u32 bytes_per_pixel, u32 gl_bytes_per_pixel, u8* morton_data, - u8* gl_data, bool morton_to_gl) { - using PixelFormat = CachedSurface::PixelFormat; - - u8* data_ptrs[2]; - u32 depth_stencil_shifts[2] = {24, 8}; - - if (morton_to_gl) { - std::swap(depth_stencil_shifts[0], depth_stencil_shifts[1]); - } - - if (pixel_format == PixelFormat::D24S8) { - for (unsigned y = 0; y < height; ++y) { - for (unsigned x = 0; x < width; ++x) { - const u32 coarse_y = y & ~7; - u32 morton_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + - coarse_y * width * bytes_per_pixel; - u32 gl_pixel_index = (x + (height - 1 - y) * width) * gl_bytes_per_pixel; - - data_ptrs[morton_to_gl] = morton_data + 
morton_offset; - data_ptrs[!morton_to_gl] = &gl_data[gl_pixel_index]; - - // Swap depth and stencil value ordering since 3DS does not match OpenGL - u32 depth_stencil; - memcpy(&depth_stencil, data_ptrs[1], sizeof(u32)); - depth_stencil = (depth_stencil << depth_stencil_shifts[0]) | - (depth_stencil >> depth_stencil_shifts[1]); - - memcpy(data_ptrs[0], &depth_stencil, sizeof(u32)); - } - } - } else { - for (unsigned y = 0; y < height; ++y) { - for (unsigned x = 0; x < width; ++x) { - const u32 coarse_y = y & ~7; - u32 morton_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + - coarse_y * width * bytes_per_pixel; - u32 gl_pixel_index = (x + (height - 1 - y) * width) * gl_bytes_per_pixel; - - data_ptrs[morton_to_gl] = morton_data + morton_offset; - data_ptrs[!morton_to_gl] = &gl_data[gl_pixel_index]; - - memcpy(data_ptrs[0], data_ptrs[1], bytes_per_pixel); - } - } - } -} - void RasterizerCacheOpenGL::BlitTextures(GLuint src_tex, GLuint dst_tex, CachedSurface::SurfaceType type, const MathUtil::Rectangle& src_rect, @@ -224,6 +176,175 @@ static void AllocateSurfaceTexture(GLuint texture, CachedSurface::PixelFormat pi cur_state.Apply(); } +// TODO: refactor this function into a factory method, separating format decoding +// from OpenGL texture loading, so the decoder can be reused by other backends. 
+// Decodes the Morton-ordered guest texture at texture_src_data and uploads it to the bound +// GL_TEXTURE_2D; formats without a dedicated decoder fall back to per-texel LookupTexture. +static void DecodeTexture(const CachedSurface& params, u8* texture_src_data, FormatTuple tuple) { + CachedSurface::PixelFormat format = params.pixel_format; + int invalid_conditions = 0; + invalid_conditions |= (texture_src_data == nullptr); + invalid_conditions |= (params.width < 8); + invalid_conditions |= (params.height < 8); + if (invalid_conditions != 0) { + LOG_CRITICAL(Render_OpenGL, "Invalid texture sent to the texture decoder!"); + return; + } + switch (format) { + case CachedSurface::PixelFormat::RGBA8: { + std::unique_ptr<u8[]> tmp(new u8[params.width * params.height * 4]); + u8* tex_buffer = tmp.get(); + u8* in_buffer = texture_src_data; + Pica::Decoders::Morton(in_buffer, tex_buffer, params.width, params.height, 4); + Pica::Decoders::BigEndian(tex_buffer, params.width, params.height); + glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height, 0, + GL_RGBA, GL_UNSIGNED_BYTE, tex_buffer); + return; + } + case CachedSurface::PixelFormat::RGB8: { + std::unique_ptr<u8[]> tmp(new u8[params.width * params.height * 3]); + u8* tex_buffer = tmp.get(); + Pica::Decoders::Morton(texture_src_data, tex_buffer, params.width, params.height, 3); + glPixelStorei(GL_UNPACK_ALIGNMENT, 1); + glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height, 0, + GL_BGR, GL_UNSIGNED_BYTE, tex_buffer); + glPixelStorei(GL_UNPACK_ALIGNMENT, 4); + return; + } + case CachedSurface::PixelFormat::RGB5A1: { + std::unique_ptr<u8[]> tmp(new u8[params.width * params.height * 2]); + u8* tex_buffer = tmp.get(); + u8* in_buffer = texture_src_data; + Pica::Decoders::Morton(in_buffer, tex_buffer, params.width, params.height, 2); + glPixelStorei(GL_UNPACK_ALIGNMENT, 2); + glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height, 0, + GL_RGBA, GL_UNSIGNED_SHORT_5_5_5_1, tex_buffer); + glPixelStorei(GL_UNPACK_ALIGNMENT, 4); + return; + } + case CachedSurface::PixelFormat::RGB565: { + std::unique_ptr<u8[]> tmp(new u8[params.width * params.height * 2]); + 
u8* tex_buffer = tmp.get(); + u8* in_buffer = texture_src_data; + Pica::Decoders::Morton(in_buffer, tex_buffer, params.width, params.height, 2); + glPixelStorei(GL_UNPACK_ALIGNMENT, 2); + glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height, 0, + GL_RGB, GL_UNSIGNED_SHORT_5_6_5, tex_buffer); + glPixelStorei(GL_UNPACK_ALIGNMENT, 4); + return; + } + case CachedSurface::PixelFormat::RGBA4: { + std::unique_ptr<u8[]> tmp(new u8[params.width * params.height * 2]); + u8* tex_buffer = tmp.get(); + u8* in_buffer = texture_src_data; + Pica::Decoders::Morton(in_buffer, tex_buffer, params.width, params.height, 2); + glPixelStorei(GL_UNPACK_ALIGNMENT, 2); + glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height, 0, + GL_RGBA, GL_UNSIGNED_SHORT_4_4_4_4, tex_buffer); + glPixelStorei(GL_UNPACK_ALIGNMENT, 4); + return; + } + case CachedSurface::PixelFormat::IA8: { + std::unique_ptr<u8[]> tmp(new u8[params.width * params.height * 4]); + u8* tex_buffer = tmp.get(); + u8* in_buffer = texture_src_data; + Pica::Decoders::Morton(in_buffer, tex_buffer, params.width, params.height, 2); + Pica::Decoders::IA8(tex_buffer, params.width, params.height); + glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height, 0, + GL_RGBA, GL_UNSIGNED_BYTE, tex_buffer); + return; + } + case CachedSurface::PixelFormat::RG8: { + std::unique_ptr<u8[]> tmp(new u8[params.width * params.height * 4]); + u8* tex_buffer = tmp.get(); + u8* in_buffer = texture_src_data; + Pica::Decoders::Morton(in_buffer, tex_buffer, params.width, params.height, 2); + glPixelStorei(GL_UNPACK_ALIGNMENT, 2); + glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height, 0, GL_RG, + GL_UNSIGNED_BYTE, tex_buffer); + glPixelStorei(GL_UNPACK_ALIGNMENT, 4); + return; + } + case CachedSurface::PixelFormat::I8: { + std::unique_ptr<u8[]> tmp(new u8[params.width * params.height * 4]); + u8* tex_buffer = tmp.get(); + u8* in_buffer = texture_src_data; + 
Pica::Decoders::Morton(in_buffer, tex_buffer, params.width, params.height, 1); + Pica::Decoders::I8(tex_buffer, params.width, params.height); + glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height, 0, + GL_RGBA, GL_UNSIGNED_BYTE, tex_buffer); + return; + } + case CachedSurface::PixelFormat::A8: { + std::unique_ptr<u8[]> tmp(new u8[params.width * params.height * 4]); + u8* tex_buffer = tmp.get(); + u8* in_buffer = texture_src_data; + Pica::Decoders::Morton(in_buffer, tex_buffer, params.width, params.height, 1); + Pica::Decoders::A8(tex_buffer, params.width, params.height); + glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height, 0, + GL_RGBA, GL_UNSIGNED_BYTE, tex_buffer); + return; + } + case CachedSurface::PixelFormat::D16: { + std::unique_ptr<u8[]> tmp(new u8[params.width * params.height * 2]); + u8* tex_buffer = tmp.get(); + u8* in_buffer = texture_src_data; + Pica::Decoders::Morton(in_buffer, tex_buffer, params.width, params.height, 2); + glPixelStorei(GL_UNPACK_ALIGNMENT, 2); + glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height, 0, + tuple.format, tuple.type, tex_buffer); + glPixelStorei(GL_UNPACK_ALIGNMENT, 4); + return; + } + case CachedSurface::PixelFormat::D24: { + std::unique_ptr<u8[]> tmp(new u8[params.width * params.height * 3]); + u8* tex_buffer = tmp.get(); + Pica::Decoders::Morton(texture_src_data, tex_buffer, params.width, params.height, 3); + glPixelStorei(GL_UNPACK_ALIGNMENT, 1); + glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height, 0, + tuple.format, tuple.type, tex_buffer); + glPixelStorei(GL_UNPACK_ALIGNMENT, 4); + return; + } + case CachedSurface::PixelFormat::D24S8: { + std::unique_ptr<u8[]> tmp(new u8[params.width * params.height * 4]); + u8* tex_buffer = tmp.get(); + u8* in_buffer = texture_src_data; + Pica::Decoders::Morton(in_buffer, tex_buffer, params.width, params.height, 4); + Pica::Decoders::Depth(tex_buffer, params.width, params.height); + 
glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height, 0, + tuple.format, tuple.type, tex_buffer); + // FIXME: swizzle requires to be set up on glstate in order to work + // correctly. + // GLint swiz[4] = {GL_GREEN, GL_BLUE, GL_ALPHA, GL_RED}; + // glTexParameteriv(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_RGBA, swiz); + return; + } + // TODO: ETC1 and ETC1A4 need a decoder + // Fallback to LookupTexture + case CachedSurface::PixelFormat::ETC1: + case CachedSurface::PixelFormat::ETC1A4: + default: { break; } + } + std::unique_ptr<u32[]> tex_buffer(new u32[params.width * params.height]); + Pica::DebugUtils::TextureInfo tex_info; + tex_info.width = params.width; + tex_info.height = params.height; + tex_info.stride = params.width * CachedSurface::GetFormatBpp(params.pixel_format) / 8; + tex_info.format = (Pica::Regs::TextureFormat)params.pixel_format; + tex_info.physical_address = params.addr; + + for (unsigned y = 0; y < params.height; ++y) { + for (unsigned x = 0; x < params.width; ++x) { + Math::Vec4<u8> v = Pica::DebugUtils::LookupTexture(texture_src_data, x, + params.height - 1 - y, tex_info); + tex_buffer[x + y * params.width] = *reinterpret_cast<u32*>(v.AsArray()); + } + } + glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height, 0, GL_RGBA, + GL_UNSIGNED_BYTE, tex_buffer.get()); +} + MICROPROFILE_DEFINE(OpenGL_SurfaceUpload, "OpenGL", "Surface Upload", MP_RGB(128, 64, 192)); CachedSurface* RasterizerCacheOpenGL::GetSurface(const CachedSurface& params, bool match_res_scale, bool load_if_create) { @@ -337,52 +458,14 @@ CachedSurface* RasterizerCacheOpenGL::GetSurface(const CachedSurface& params, bo // Texture tuple = {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}; } - - std::vector<Math::Vec4<u8>> tex_buffer(params.width * params.height); - - Pica::DebugUtils::TextureInfo tex_info; - tex_info.width = params.width; - tex_info.height = params.height; - tex_info.stride = - params.width * CachedSurface::GetFormatBpp(params.pixel_format) / 8; - 
tex_info.format = (Pica::Regs::TextureFormat)params.pixel_format; - tex_info.physical_address = params.addr; - - for (unsigned y = 0; y < params.height; ++y) { - for (unsigned x = 0; x < params.width; ++x) { - tex_buffer[x + params.width * y] = Pica::DebugUtils::LookupTexture( - texture_src_data, x, params.height - 1 - y, tex_info); - } - } - - glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height, - 0, GL_RGBA, GL_UNSIGNED_BYTE, tex_buffer.data()); + DecodeTexture(params, texture_src_data, tuple); } else { // Depth/Stencil formats need special treatment since they aren't sampleable using // LookupTexture and can't use RGBA format size_t tuple_idx = (size_t)params.pixel_format - 14; ASSERT(tuple_idx < depth_format_tuples.size()); const FormatTuple& tuple = depth_format_tuples[tuple_idx]; - - u32 bytes_per_pixel = CachedSurface::GetFormatBpp(params.pixel_format) / 8; - - // OpenGL needs 4 bpp alignment for D24 since using GL_UNSIGNED_INT as type - bool use_4bpp = (params.pixel_format == PixelFormat::D24); - - u32 gl_bytes_per_pixel = use_4bpp ? 4 : bytes_per_pixel; - - std::vector<u8> temp_fb_depth_buffer(params.width * params.height * - gl_bytes_per_pixel); - - u8* temp_fb_depth_buffer_ptr = - use_4bpp ? temp_fb_depth_buffer.data() + 1 : temp_fb_depth_buffer.data(); - - MortonCopyPixels(params.pixel_format, params.width, params.height, bytes_per_pixel, - gl_bytes_per_pixel, texture_src_data, temp_fb_depth_buffer_ptr, - true); - - glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height, - 0, tuple.format, tuple.type, temp_fb_depth_buffer.data()); + DecodeTexture(params, texture_src_data, tuple); } } @@ -716,9 +799,8 @@ void RasterizerCacheOpenGL::FlushSurface(CachedSurface* surface) { // Directly copy pixels. Internal OpenGL color formats are consistent so no conversion // is necessary. 
- MortonCopyPixels(surface->pixel_format, surface->width, surface->height, - bytes_per_pixel, bytes_per_pixel, dst_buffer, temp_gl_buffer.data(), - false); + Pica::Encoders::Morton(temp_gl_buffer.data(), dst_buffer, surface->width, + surface->height, bytes_per_pixel); } else { // Depth/Stencil formats need special treatment since they aren't sampleable using // LookupTexture and can't use RGBA format @@ -730,18 +812,32 @@ void RasterizerCacheOpenGL::FlushSurface(CachedSurface* surface) { // OpenGL needs 4 bpp alignment for D24 since using GL_UNSIGNED_INT as type bool use_4bpp = (surface->pixel_format == PixelFormat::D24); - u32 gl_bytes_per_pixel = use_4bpp ? 4 : bytes_per_pixel; std::vector<u8> temp_gl_buffer(surface->width * surface->height * gl_bytes_per_pixel); + glPixelStorei(GL_PACK_ALIGNMENT, 1); glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, temp_gl_buffer.data()); + glPixelStorei(GL_PACK_ALIGNMENT, 4); - u8* temp_gl_buffer_ptr = use_4bpp ? temp_gl_buffer.data() + 1 : temp_gl_buffer.data(); - - MortonCopyPixels(surface->pixel_format, surface->width, surface->height, - bytes_per_pixel, gl_bytes_per_pixel, dst_buffer, temp_gl_buffer_ptr, - false); + switch (surface->pixel_format) { + case PixelFormat::D24: { // NOTE(review): buffer is sized 4 B/texel but 3 are read - confirm + Pica::Encoders::Morton(temp_gl_buffer.data(), dst_buffer, surface->width, + surface->height, 3); + break; + } + case PixelFormat::D24S8: { + Pica::Encoders::Morton(temp_gl_buffer.data(), dst_buffer, surface->width, + surface->height, 4); + Pica::Encoders::Depth(dst_buffer, surface->width, surface->height); + break; + } + default: { + Pica::Encoders::Morton(temp_gl_buffer.data(), dst_buffer, surface->width, + surface->height, bytes_per_pixel); + break; + } + } } } diff --git a/src/video_core/texture_codecs/codecs.cpp b/src/video_core/texture_codecs/codecs.cpp new file mode 100644 index 000000000..0444b9e24 --- /dev/null +++ b/src/video_core/texture_codecs/codecs.cpp @@ -0,0 +1,482 @@ +#include <cstdint> +#include <cstring> +#include <utility> +#include "common/assert.h" 
+#include "video_core/texture_codecs/codecs.h" + +/* + * Static compilers can't always detect if vectorization is possible. + * If the programmer is 100% sure it's possible to vectorize a set + * of actions, this hints the compiler that it can vectorize the next loop + * unconditionally. Clang is tested first because it also defines __GNUC__. + */ +#if defined(__clang__) +#define VECTORIZE_NEXT _Pragma("clang loop vectorize(enable) interleave(enable)") +#elif defined(_MSC_VER) +#define VECTORIZE_NEXT __pragma(loop(ivdep)) +#elif defined(__GNUC__) +#define VECTORIZE_NEXT _Pragma("GCC ivdep") +#else +#define VECTORIZE_NEXT +#endif + +// from GNU C Library under GPL 2. +// https://github.com/lattera/glibc/blob/master/bits/byteswap.h +/* Swap bytes in 32 bit value. */ + +#ifdef __GNUC__ +static inline unsigned int byte_swap_32(unsigned int value) { + return __builtin_bswap32(value); +} +#else +#define BSWAP_CONSTANT_32(x) \ + ((((x)&0xff000000u) >> 24) | (((x)&0x00ff0000u) >> 8) | (((x)&0x0000ff00u) << 8) | \ + (((x)&0x000000ffu) << 24)) +static inline unsigned int byte_swap_32(unsigned int value) { + return BSWAP_CONSTANT_32(value); +} +#endif + +// <cstring> only guarantees std::memcpy; pull in just that name for the unqualified calls. +using std::memcpy; + +// Note: The function layout is made on purpose to help the compiler +// unfold the loop and simplify the moves to the best appropriate type in use. 
+// compiling for ivy-bridge-up will unfold the loop further and use AVX2 +template <u32 read_size> +inline void decode_simple(u8* from, u8* out) { + memcpy(out, from, read_size * 2); +} + +template <u32 read_size> +inline void encode_simple(u8* from, u8* out) { + memcpy(from, out, read_size * 2); +} + +inline void decode_depth(u8* from, u8* out) { + out[0] = from[0]; + out[1] = from[1]; + out[2] = from[2]; + out[3] = 0; + out[4] = from[3]; + out[5] = from[4]; + out[6] = from[5]; + out[7] = 0; +} + +inline void encode_depth(u8* from, u8* out) { + out[0] = from[0]; + out[1] = from[1]; + out[2] = from[2]; + out[3] = from[4]; + out[4] = from[5]; + out[5] = from[6]; +} + +// Finally, one 2x2 subtile is copied as two texel pairs: the lower pair goes +// through the first cursor and the upper pair through the second. +// 02 03 -> encode/decode second +// ----- +// 00 01 -> encode/decode first +template <void func(u8*, u8*), u32 read_size, u32 write_size> +inline void morton_block2x2(u8* from, u8*& w1, u8*& w2) { + func(from, w1); + w1 += write_size * 2; + func(from + read_size * 2, w2); + w2 += write_size * 2; +} + +// Again, we subdivide the 4x4 tiles and assign each 2x2 subblock to the +// corresponding cursors. +// +// 10 11 | 14 15 +// 08 09 | 12 13 +// ------------ +// 02 03 | 06 07 +// 00 01 | 04 05 +template <void func(u8*, u8*), u32 read_size, u32 write_size> +inline void morton_block4x4(u8* from, u8** w1, u8** w2) { + u8* tmp_block = from; + morton_block2x2<func, read_size, write_size>(tmp_block, w1[0], w1[1]); + tmp_block += read_size * 4; + morton_block2x2<func, read_size, write_size>(tmp_block, w1[0], w1[1]); + tmp_block += read_size * 4; + morton_block2x2<func, read_size, write_size>(tmp_block, w2[0], w2[1]); + tmp_block += read_size * 4; + morton_block2x2<func, read_size, write_size>(tmp_block, w2[0], w2[1]); +} + +// We subdivide the 8x8 tiles and assign each 4x4 subblock to the +// corresponding cursors. 
+// +// 42 43 46 47 | 58 59 62 63 +// 40 41 44 45 | 56 57 60 61 +// 34 35 38 39 | 50 51 54 55 +// 32 33 36 37 | 48 49 52 53 +// ----------------------- +// 10 11 14 15 | 26 27 30 31 +// 08 09 12 13 | 24 25 28 29 +// 02 03 06 07 | 18 19 22 23 +// 00 01 04 05 | 16 17 20 21 +template <void func(u8*, u8*), u32 read_size, u32 write_size> +inline void morton_block8x8(u8* from, u8** cursors) { + u8* tmp_block = from; + morton_block4x4<func, read_size, write_size>(tmp_block, &cursors[0], &cursors[2]); + tmp_block += read_size * 16; + morton_block4x4<func, read_size, write_size>(tmp_block, &cursors[0], &cursors[2]); + tmp_block += read_size * 16; + morton_block4x4<func, read_size, write_size>(tmp_block, &cursors[4], &cursors[6]); + tmp_block += read_size * 16; + morton_block4x4<func, read_size, write_size>(tmp_block, &cursors[4], &cursors[6]); +} +// Points cursors[k] at write_p - k rows, one row being write_size * width bytes. +template <u32 write_size> +inline void rewind_cursors(u8** cursors, u8* write_p, u32 width) { + cursors[0] = write_p; + cursors[1] = write_p - write_size * width; + cursors[2] = write_p - write_size * 2 * width; + cursors[3] = write_p - write_size * 3 * width; + cursors[4] = write_p - write_size * 4 * width; + cursors[5] = write_p - write_size * 5 * width; + cursors[6] = write_p - write_size * 6 * width; + cursors[7] = write_p - write_size * 7 * width; +} + +// from video_core/utils.h +// Images are split into 8x8 tiles. Each tile is composed of four 4x4 subtiles each +// of which is composed of four 2x2 subtiles each of which is composed of four texels. +// Each structure is embedded into the next-bigger one in a diagonal pattern, e.g. +// texels are laid out in a 2x2 subtile like this: +// 2 3 +// 0 1 +// +// The full 8x8 tile has the texels arranged like this: +// +// 42 43 46 47 58 59 62 63 +// 40 41 44 45 56 57 60 61 +// 34 35 38 39 50 51 54 55 +// 32 33 36 37 48 49 52 53 +// 10 11 14 15 26 27 30 31 +// 08 09 12 13 24 25 28 29 +// 02 03 06 07 18 19 22 23 +// 00 01 04 05 16 17 20 21 +// +// This pattern is what's called Z-order curve, or Morton order. +// +// The algorithm below processes z-ordered images block by block, 
+// reading/writing through 8 cursors, each pointing into one of the 8 rows a tile covers +// in the linear image; rows are emitted from the last image row upwards (vertical flip). +template <void func(u8*, u8*), u32 read_size, u32 write_size> +inline void morton(u8* in_p, u8* write_p, u32 width, u32 height) { + u32 x_blocks = (width / 8); + u32 y_blocks = (height / 8); + u8* block_pointer = in_p; + u8* cursors[8]; + u32 step = (8 * width) * write_size; + write_p += write_size * (width * (height - 1)); + for (u32 y = 0; y != y_blocks; y++) { + rewind_cursors<write_size>(cursors, write_p, width); + VECTORIZE_NEXT for (u32 x = 0; x != x_blocks; x++) { + morton_block8x8<func, read_size, write_size>(block_pointer, cursors); + block_pointer += 64 * read_size; + } + write_p -= step; + } +} + +// These macros are used to unroll/unfold the same action on tight loops; +// they should be used on actions that don't branch the pipeline. +// Static compilers can't detect unrollable loops easily. Normally, +// they require some profiling data to unroll loops. +#define LOOP_UNROLL_1(CODE) CODE +#define LOOP_UNROLL_2(CODE) \ + LOOP_UNROLL_1(CODE); \ + LOOP_UNROLL_1(CODE) +#define LOOP_UNROLL_4(CODE) \ + LOOP_UNROLL_2(CODE); \ + LOOP_UNROLL_2(CODE) +#define LOOP_UNROLL_8(CODE) \ + LOOP_UNROLL_4(CODE); \ + LOOP_UNROLL_4(CODE) +#define LOOP_UNROLL_16(CODE) \ + LOOP_UNROLL_8(CODE); \ + LOOP_UNROLL_8(CODE) + +template <void func(u8*&)> +inline void map_image(u8*& out_buffer, u32 width, u32 height) { + u32 writes = width * height / 16; // 16 unfolds + VECTORIZE_NEXT for (u32 i = 0; i != writes; i++) { + LOOP_UNROLL_16(func(out_buffer)); + } + // Now just do the rest + writes = width * height - (writes * 16); + u32 jump = (writes % 8); + // This form of loop unfolding (Duff's device) works for every remainder at the + // expense of not marshalling/vectorizing but won't break the pipeline + switch (jump) { + do { + jump = 8; + func(out_buffer); + case 7: + func(out_buffer); + case 6: + func(out_buffer); + case 5: + func(out_buffer); + case 4: + func(out_buffer); + case 3: + func(out_buffer); + case 2: + func(out_buffer); + case 1: + func(out_buffer); + case 0: + default: + writes -= 
jump; + } while (writes != 0); + } +} + +template <void func(u8*&, u8*&)> +inline void unfold_image(u8*& read_cursor, u8*& write_cursor, u32 width, u32 height) { + u32 writes = width * height / 16; // 16 unfolds + VECTORIZE_NEXT for (u32 i = 0; i != writes; i++) { + LOOP_UNROLL_16(func(read_cursor, write_cursor)); + } + // Now just do the rest + writes = width * height - (writes * 16); + u32 jump = (writes % 8); + // This form of loop unfolding (Duff's device) works for every remainder at the + // expense of not marshalling/vectorizing but won't break the pipeline + switch (jump) { + do { + jump = 8; + func(read_cursor, write_cursor); + case 7: + func(read_cursor, write_cursor); + case 6: + func(read_cursor, write_cursor); + case 5: + func(read_cursor, write_cursor); + case 4: + func(read_cursor, write_cursor); + case 3: + func(read_cursor, write_cursor); + case 2: + func(read_cursor, write_cursor); + case 1: + func(read_cursor, write_cursor); + case 0: + default: + writes -= jump; + } while (writes != 0); + } +} + +// Big Endian Decoding +inline void big_u32(u8*& out_buffer) { + u32 tmp; + memcpy(&tmp, out_buffer, sizeof(u32)); + tmp = byte_swap_32(tmp); + memcpy(out_buffer, &tmp, sizeof(u32)); + out_buffer += 4; +} + +inline void color_i8(u8*& read_cursor, u8*& write_cursor) { + read_cursor -= 1; + write_cursor -= 4; + u32 tmp = 0; + u8 tmp2; + memcpy(&tmp2, read_cursor, sizeof(u8)); + tmp = tmp2 & 0x000000FF; + tmp = (tmp << 16) | (tmp << 8) | tmp | 0xFF000000; + memcpy(write_cursor, &tmp, sizeof(u32)); +} + +inline void color_a8(u8*& read_cursor, u8*& write_cursor) { + read_cursor -= 1; + write_cursor -= 4; + u32 tmp = 0; + u8 tmp2; + memcpy(&tmp2, read_cursor, sizeof(u8)); + tmp = tmp2 & 0x000000FF; + tmp = tmp << 24; + memcpy(write_cursor, &tmp, sizeof(u32)); +} + +inline void color_ia8(u8*& read_cursor, u8*& write_cursor) { + read_cursor -= 2; + write_cursor -= 4; + u32 tmp = 0; + u16 tmp2; + memcpy(&tmp2, read_cursor, sizeof(u16)); + tmp = tmp2 & 0x0000FF00; + tmp2 = tmp2 & 0x00FF; + tmp = (tmp 
<< 8) | (tmp >> 8) | tmp | (tmp2 << 24); + memcpy(write_cursor, &tmp, sizeof(u32)); +} + +static inline void rotateLeft(u8*& out_buffer) { + u32 tmp; + memcpy(&tmp, out_buffer, sizeof(u32)); + tmp = (tmp >> 8) | (tmp << 24); + memcpy(out_buffer, &tmp, sizeof(u32)); + out_buffer += 4; +} + +static inline void rotateRight(u8*& out_buffer) { + u32 tmp; + memcpy(&tmp, out_buffer, sizeof(u32)); + tmp = (tmp >> 24) | (tmp << 8); + memcpy(out_buffer, &tmp, sizeof(u32)); + out_buffer += 4; +} + +constexpr u8 Convert4To8(u8 value) { + return (value << 4) | value; +} + +inline void nimble_write(u8*& in_buffer, u8*& out_buffer) { + out_buffer[0] = Convert4To8((*in_buffer & 0xF0) >> 4); + out_buffer[1] = Convert4To8(*in_buffer & 0x0F); + in_buffer++; + out_buffer += 2; +} + +namespace Pica { + +namespace Encoders { + +bool Morton(u8* in_buffer, u8* out_buffer, u32 width, u32 height, u32 bytespp) { + // morton() always takes the tiled buffer first, so swap roles before the sanity checks + std::swap(in_buffer, out_buffer); + ASSERT(in_buffer != nullptr && out_buffer != nullptr); + ASSERT(((u64)in_buffer & 3) == 0); + ASSERT(((u64)out_buffer & 3) == 0); + ASSERT(width >= 8); + ASSERT(height >= 8); + ASSERT((width * height) % 64 == 0); + switch (bytespp) { + case 1: { + morton<&encode_simple<1>, 1, 1>(in_buffer, out_buffer, width, height); + return true; + break; + } + case 2: { + morton<&encode_simple<2>, 2, 2>(in_buffer, out_buffer, width, height); + return true; + break; + } + case 3: { + morton<&encode_simple<3>, 3, 3>(in_buffer, out_buffer, width, height); + return true; + break; + } + case 4: { + morton<&encode_simple<4>, 4, 4>(in_buffer, out_buffer, width, height); + return true; + break; + } + default: { + return false; + break; + } + } +} + +void MortonU32_U24(u8* in_buffer, u8* out_buffer, u32 width, u32 height) { + morton<&encode_depth, 4, 3>(in_buffer, out_buffer, width, height); +} + +void Depth(u8* out_buffer, u32 width, u32 height) { + map_image<&rotateLeft>(out_buffer, width, height); +} + +} // Encoders + +namespace Decoders { + 
+void MortonU24_U32(u8* in_buffer, u8* out_buffer, u32 width, u32 height) { + morton<&decode_depth, 3, 4>(in_buffer, out_buffer, width, height); +} + +bool Morton(u8* in_buffer, u8* out_buffer, u32 width, u32 height, u32 bytespp) { + // Sanity checks + ASSERT(in_buffer != nullptr && out_buffer != nullptr); + ASSERT(((u64)in_buffer & 3) == 0); + ASSERT(((u64)out_buffer & 3) == 0); + ASSERT(width >= 8); + ASSERT(height >= 8); + ASSERT((width * height) % 64 == 0); + switch (bytespp) { + case 1: { + morton<&decode_simple<1>, 1, 1>(in_buffer, out_buffer, width, height); + return true; + break; + } + case 2: { + morton<&decode_simple<2>, 2, 2>(in_buffer, out_buffer, width, height); + return true; + break; + } + case 3: { + morton<&decode_simple<3>, 3, 3>(in_buffer, out_buffer, width, height); + return true; + break; + } + case 4: { + morton<&decode_simple<4>, 4, 4>(in_buffer, out_buffer, width, height); + return true; + break; + } + default: { + return false; + break; + } + } +} + +void BigEndian(u8* out_buffer, u32 width, u32 height) { + map_image<&big_u32>(out_buffer, width, height); +} + +void Depth(u8* out_buffer, u32 width, u32 height) { + map_image<&rotateRight>(out_buffer, width, height); +} + +void I8(u8* out_buffer, u32 width, u32 height) { + u8* read_cursor = out_buffer + (width * height); + u8* write_cursor = out_buffer + (width * height * 4); + unfold_image<&color_i8>(read_cursor, write_cursor, width, height); +} + +void A8(u8* out_buffer, u32 width, u32 height) { + u8* read_cursor = out_buffer + (width * height); + u8* write_cursor = out_buffer + (width * height * 4); + unfold_image<&color_a8>(read_cursor, write_cursor, width, height); +} + +void IA8(u8* out_buffer, u32 width, u32 height) { + u8* read_cursor = out_buffer + (width * height * 2); + u8* write_cursor = out_buffer + (width * height * 4); + unfold_image<&color_ia8>(read_cursor, write_cursor, width, height); +} + +// Nibbles: every input byte holds two 4-bit texels, each widened to one output byte + +void Nimbles(u8* in_buffer, u8* out_buffer, u32 width, u32 height) { + 
u32 writes = width * height / 32; // 16 unfolds, 2 texels per call + for (u32 i = 0; i != writes; i++) { + LOOP_UNROLL_16(nimble_write(in_buffer, out_buffer)); + } + // Now just do the rest; halve the texel count because each call emits two texels + writes = (width * height - (writes * 32)) / 2; + for (u32 i = 0; i != writes; i++) { + LOOP_UNROLL_1(nimble_write(in_buffer, out_buffer)); + } +} + +} // Decoders + +} // Pica diff --git a/src/video_core/texture_codecs/codecs.h b/src/video_core/texture_codecs/codecs.h new file mode 100644 index 000000000..015c80236 --- /dev/null +++ b/src/video_core/texture_codecs/codecs.h @@ -0,0 +1,49 @@ + +#pragma once + +#include "common/common_types.h" + +namespace Pica { + +namespace Encoders { +/** + * Encodes textures in raw texel data into z-order/morton-order + * @param in_buffer pointer to the texture that needs encoding. + * @param out_buffer pointer to a buffer where the encoded image will be written. + * @param width texture's width + * @param height texture's height + * @param bytespp bytes per pixel + */ +bool Morton(u8* in_buffer, u8* out_buffer, u32 width, u32 height, u32 bytespp); +void MortonU32_U24(u8* in_buffer, u8* out_buffer, u32 width, u32 height); + +void Depth(u8* out_buffer, u32 width, u32 height); + +} // Encoders + +namespace Decoders { + +/** + * Decodes textures using z-order/morton-order into raw texel data + * @param in_buffer pointer to the texture that needs decoding. + * @param out_buffer pointer to a buffer where the decoded image will be written. 
+ * @param width texture's width + * @param height texture's height + * @param bytespp bytes per pixel; the call returns false unless this is 1, 2, 3 or 4 + */ +bool Morton(u8* in_buffer, u8* out_buffer, u32 width, u32 height, u32 bytespp); +void MortonU24_U32(u8* in_buffer, u8* out_buffer, u32 width, u32 height); + +void BigEndian(u8* out_buffer, u32 width, u32 height); + +void Depth(u8* out_buffer, u32 width, u32 height); + +void I8(u8* out_buffer, u32 width, u32 height); +void A8(u8* out_buffer, u32 width, u32 height); +void IA8(u8* out_buffer, u32 width, u32 height); + +void Nimbles(u8* in_buffer, u8* out_buffer, u32 width, u32 height); + +} // Decoders + +} // Pica