diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index b33869c22..061923911 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -1,3 +1,4 @@ + set(SRCS renderer_opengl/gl_rasterizer.cpp renderer_opengl/gl_rasterizer_cache.cpp diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp index ac029b48f..5389a8941 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp @@ -26,6 +26,9 @@ #include "video_core/utils.h" #include "video_core/video_core.h" +#define TEXTURE_CACHE_SIZE (1024 * 1024 * 8) // 8MB inner cache for decoding/encoding +alignas(64) static u8 TextureCache[TEXTURE_CACHE_SIZE]; + struct FormatTuple { GLint internal_format; GLenum format; @@ -39,7 +42,7 @@ static const std::array format_tuples = {{ {GL_RGB565, GL_RGB, GL_UNSIGNED_SHORT_5_6_5}, // RGB565 {GL_RGBA4, GL_RGBA, GL_UNSIGNED_SHORT_4_4_4_4}, // RGBA4 {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // IA8 - {GL_RG8, GL_RG8, GL_UNSIGNED_BYTE}, // RG8 + {GL_RG8, GL_RG, GL_UNSIGNED_BYTE}, // RG8 {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // I8 {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // A8 {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // IA4 @@ -70,7 +73,7 @@ static const std::array native_format = { false, // ETC1A4 true, // D16 false, - false, // D24 + true, // D24 false, // D24S8 }; @@ -306,26 +309,29 @@ CachedSurface* RasterizerCacheOpenGL::GetSurface(const CachedSurface& params, bo // clang-format on ); Pica::Texture::Codec* codec = tmp.get(); - codec->configTiling(true, 8); // change 8 for 32 in case the mage is tiled + codec->configTiling(true, 8); // change 8 for 32 in case the image is tiled // on blocks of 32x32 codec->configRGBATransform(!native_format[(unsigned int)params.pixel_format]); codec->validate(); if (!codec->invalid()) { - codec->decode(); - std::unique_ptr decoded_texture = codec->transferInternalBuffer(); - 
u32 bytes = codec->getInternalBytesPerPixel(); - if (bytes == 3) - bytes = 1; - else if (bytes != 2) - bytes = 4; - glPixelStorei(GL_UNPACK_ALIGNMENT, bytes); - glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height, - 0, tuple.format, tuple.type, decoded_texture.get()); - glPixelStorei(GL_UNPACK_ALIGNMENT, 4); + u32 estimated_size = + params.width * params.height * codec->getInternalBytesPerPixel(); + if (estimated_size <= TEXTURE_CACHE_SIZE) { + codec->setExternalBuffer(TextureCache); + codec->decode(); + glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, + params.height, 0, tuple.format, tuple.type, TextureCache); + } else { + codec->decode(); + std::unique_ptr decoded_texture = codec->transferInternalBuffer(); + glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, + params.height, 0, tuple.format, tuple.type, decoded_texture.get()); + } } else { LOG_WARNING(Render_OpenGL, "Invalid texture sent to renderer; width: %d height %d type: %d", params.width, params.height, (unsigned int)params.pixel_format); + return nullptr; } } // If not 1x scale, blit 1x texture to a new scaled texture and replace texture in surface @@ -652,15 +658,22 @@ void RasterizerCacheOpenGL::FlushSurface(CachedSurface* surface) { glPixelStorei(GL_PACK_ROW_LENGTH, 0); } else { const FormatTuple& tuple = format_tuples[(u32)surface->pixel_format]; - u32 bytes_per_pixel = Pica::Texture::Format::GetBpp(surface->pixel_format) / 8; + u32 bits_per_pixel = Pica::Texture::Format::GetBpp(surface->pixel_format); if (!native_format[(u32)surface->pixel_format]) - bytes_per_pixel = 4; - std::vector temp_gl_buffer(surface->width * surface->height * bytes_per_pixel); - glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, temp_gl_buffer.data()); - + bits_per_pixel = 32; + u32 size = surface->width * surface->height * bits_per_pixel / 8; + std::vector temp_gl_buffer; + u8* temporal_buffer; + if (size <= TEXTURE_CACHE_SIZE) + temporal_buffer = 
TextureCache; + else { + temp_gl_buffer.resize(size); + temporal_buffer = temp_gl_buffer.data(); + } + glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, temporal_buffer); std::unique_ptr tmp = Pica::Texture::CodecFactory::build( // clang-format off - surface->pixel_format, temp_gl_buffer.data(), surface->width, surface->height + surface->pixel_format, temporal_buffer, surface->width, surface->height // clang-format on ); Pica::Texture::Codec* codec = tmp.get(); diff --git a/src/video_core/texture/codec.cpp b/src/video_core/texture/codec.cpp index 42f39c23f..d13a6dd45 100644 --- a/src/video_core/texture/codec.cpp +++ b/src/video_core/texture/codec.cpp @@ -67,7 +67,7 @@ void Codec::init(bool decode) { this->expected_nibbles_size = this->start_nibbles_size; } this->validate(); - if (!this->external_result_buffer || !this->invalid()) { + if (!this->external_result_buffer) { size_t buff_size = this->width * this->height * this->expected_nibbles_size / 2; this->internal_buffer = std::make_unique(buff_size); this->passing_buffer = this->internal_buffer.get(); @@ -91,7 +91,7 @@ void Codec::validate() { this->invalid_state = true; return; } - if (this->morton && this->morton_pass_tiling != 8 && this->morton_pass_tiling != 32) { + if (this->morton && this->morton_pass_tiling != 8) { this->invalid_state = true; return; } @@ -102,18 +102,12 @@ inline void Codec::decode_morton_pass() { if (this->morton_pass_tiling == 8) Decoders::Morton_8x8(this->target_buffer, this->passing_buffer, this->width, this->height, this->start_nibbles_size * 4); - else if (this->morton_pass_tiling == 32) - Decoders::Morton_32x32(this->target_buffer, this->passing_buffer, this->width, this->height, - this->start_nibbles_size * 4); } inline void Codec::encode_morton_pass() { if (this->morton_pass_tiling == 8) - Encoders::Morton_8x8(this->target_buffer, this->passing_buffer, this->width, this->height, + Encoders::Morton_8x8(this->passing_buffer, this->target_buffer, this->width, this->height, 
this->start_nibbles_size * 4); - else if (this->morton_pass_tiling == 32) - Encoders::Morton_32x32(this->target_buffer, this->passing_buffer, this->width, this->height, - this->start_nibbles_size * 4); } std::unique_ptr CodecFactory::build(Format::Type format, u8* target, u32 width, u32 height) { diff --git a/src/video_core/texture/formats.h b/src/video_core/texture/formats.h index ffb24e615..6418e780d 100644 --- a/src/video_core/texture/formats.h +++ b/src/video_core/texture/formats.h @@ -40,7 +40,7 @@ struct Format { Invalid = 255, }; - static u32 GetBpp(Type format) { + static const u32 GetBpp(Type format) { static const std::array bpp_table = { 32, // RGBA8 24, // RGB8 @@ -66,19 +66,19 @@ struct Format { return bpp_table[(u32)format]; } - static Type FromTextureFormat(Regs::TextureFormat format) { + static constexpr Type FromTextureFormat(Regs::TextureFormat format) { return ((unsigned int)format < 14) ? (Type)format : Type::Invalid; } - static Type FromColorFormat(Regs::ColorFormat format) { + static constexpr Type FromColorFormat(Regs::ColorFormat format) { return ((unsigned int)format < 5) ? (Type)format : Type::Invalid; } - static Type FromDepthFormat(Regs::DepthFormat format) { + static constexpr Type FromDepthFormat(Regs::DepthFormat format) { return ((unsigned int)format < 4) ? 
(Type)((unsigned int)format + 14) : Type::Invalid; } - static Type FromGPUPixelFormat(GPU::Regs::PixelFormat format) { + static const Type FromGPUPixelFormat(GPU::Regs::PixelFormat format) { switch (format) { // RGB565 and RGB5A1 are switched in PixelFormat compared to ColorFormat case GPU::Regs::PixelFormat::RGB565: @@ -92,6 +92,17 @@ struct Format { }; // Format +struct Info { + PAddr physical_address; + int width; + int height; + int stride; + Pica::Regs::TextureFormat format; + + static Info FromPicaRegister(const Pica::Regs::TextureConfig& config, + const Pica::Regs::TextureFormat& format); +}; + } // Texture } // Pica diff --git a/src/video_core/texture/internal/codecs.h b/src/video_core/texture/internal/codecs.h index 9fa40908a..5864d09b3 100644 --- a/src/video_core/texture/internal/codecs.h +++ b/src/video_core/texture/internal/codecs.h @@ -5,6 +5,7 @@ #include #include "common/common_types.h" #include "video_core/texture/codec.h" +#include "video_core/texture/formats.h" // each texture format codec class RGBACodec : public Pica::Texture::Codec { @@ -15,7 +16,11 @@ public: protected: virtual void setSize() { - this->start_nibbles_size = 8; + // clang-format off + this->start_nibbles_size = Pica::Texture::Format::GetBpp( + Pica::Texture::Format::Type::RGBA8 + ) / 4; + // clang-format on }; }; @@ -27,7 +32,11 @@ public: protected: virtual void setSize() { - this->start_nibbles_size = 6; + // clang-format off + this->start_nibbles_size = Pica::Texture::Format::GetBpp( + Pica::Texture::Format::Type::RGB8 + ) / 4; + // clang-format on }; }; @@ -39,7 +48,11 @@ public: protected: virtual void setSize() { - this->start_nibbles_size = 4; + // clang-format off + this->start_nibbles_size = Pica::Texture::Format::GetBpp( + Pica::Texture::Format::Type::RGB5A1 + ) / 4; + // clang-format on }; }; @@ -51,7 +64,11 @@ public: protected: virtual void setSize() { - this->start_nibbles_size = 4; + // clang-format off + this->start_nibbles_size = Pica::Texture::Format::GetBpp( + 
Pica::Texture::Format::Type::RGBA4 + ) / 4; + // clang-format on }; }; @@ -63,7 +80,11 @@ public: protected: virtual void setSize() { - this->start_nibbles_size = 4; + // clang-format off + this->start_nibbles_size = Pica::Texture::Format::GetBpp( + Pica::Texture::Format::Type::RGB565 + ) / 4; + // clang-format on }; }; @@ -75,7 +96,11 @@ public: protected: virtual void setSize() { - this->start_nibbles_size = 4; + // clang-format off + this->start_nibbles_size = Pica::Texture::Format::GetBpp( + Pica::Texture::Format::Type::RG8 + ) / 4; + // clang-format on }; }; @@ -87,7 +112,11 @@ public: protected: virtual void setSize() { - this->start_nibbles_size = 4; + // clang-format off + this->start_nibbles_size = Pica::Texture::Format::GetBpp( + Pica::Texture::Format::Type::IA8 + ) / 4; + // clang-format on }; }; @@ -99,7 +128,11 @@ public: protected: virtual void setSize() { - this->start_nibbles_size = 2; + // clang-format off + this->start_nibbles_size = Pica::Texture::Format::GetBpp( + Pica::Texture::Format::Type::I8 + ) / 4; + // clang-format on }; }; @@ -111,7 +144,11 @@ public: protected: virtual void setSize() { - this->start_nibbles_size = 2; + // clang-format off + this->start_nibbles_size = Pica::Texture::Format::GetBpp( + Pica::Texture::Format::Type::A8 + ) / 4; + // clang-format on }; }; @@ -123,7 +160,11 @@ public: protected: virtual void setSize() { - this->start_nibbles_size = 2; + // clang-format off + this->start_nibbles_size = Pica::Texture::Format::GetBpp( + Pica::Texture::Format::Type::IA4 + ) / 4; + // clang-format on }; }; @@ -135,7 +176,11 @@ public: protected: virtual void setSize() { - this->start_nibbles_size = 1; + // clang-format off + this->start_nibbles_size = Pica::Texture::Format::GetBpp( + Pica::Texture::Format::Type::I4 + ) / 4; + // clang-format on }; }; @@ -147,7 +192,11 @@ public: protected: virtual void setSize() { - this->start_nibbles_size = 1; + // clang-format off + this->start_nibbles_size = Pica::Texture::Format::GetBpp( + 
Pica::Texture::Format::Type::A4 + ) / 4; + // clang-format on }; }; @@ -159,7 +208,11 @@ public: protected: virtual void setSize() { - this->start_nibbles_size = 1; + // clang-format off + this->start_nibbles_size = Pica::Texture::Format::GetBpp( + Pica::Texture::Format::Type::ETC1 + ) / 4; + // clang-format on }; }; @@ -171,7 +224,11 @@ public: protected: virtual void setSize() { - this->start_nibbles_size = 2; + // clang-format off + this->start_nibbles_size = Pica::Texture::Format::GetBpp( + Pica::Texture::Format::Type::ETC1A4 + ) / 4; + // clang-format on }; }; @@ -183,7 +240,11 @@ public: protected: virtual void setSize() { - this->start_nibbles_size = 4; + // clang-format off + this->start_nibbles_size = Pica::Texture::Format::GetBpp( + Pica::Texture::Format::Type::D16 + ) / 4; + // clang-format on }; }; @@ -195,7 +256,11 @@ public: protected: virtual void setSize() { - this->start_nibbles_size = 6; + // clang-format off + this->start_nibbles_size = Pica::Texture::Format::GetBpp( + Pica::Texture::Format::Type::D24 + ) / 4; + // clang-format on }; }; @@ -207,6 +272,10 @@ public: protected: virtual void setSize() { - this->start_nibbles_size = 8; + // clang-format off + this->start_nibbles_size = Pica::Texture::Format::GetBpp( + Pica::Texture::Format::Type::D24S8 + ) / 4; + // clang-format on }; }; diff --git a/src/video_core/texture/internal/decoders.cpp b/src/video_core/texture/internal/decoders.cpp index 28672e8fb..2ee7b936a 100644 --- a/src/video_core/texture/internal/decoders.cpp +++ b/src/video_core/texture/internal/decoders.cpp @@ -83,9 +83,9 @@ inline u32 build_luminance(u32 intensity, u32 alpha) { } inline void intensity_alpha_pass(u8* read, u8* write) { - alignas(4) u8 pixel[2]; - std::memcpy(pixel, read, 2); - u32 result = build_luminance(pixel[1], pixel[0]); + u16 pixel; + std::memcpy(&pixel, read, 2); + u32 result = build_luminance(pixel >> 8, pixel & 0x00FF); std::memcpy(write, &result, 4); } @@ -93,9 +93,7 @@ inline void 
intensity_alpha_nibbles_pass(u8* read, u8* write) { alignas(4) u8 pixel; std::memcpy(&pixel, read, 1); u16 tmp = convert_nibbles(pixel); - u8 tmp2[2]; - std::memcpy(tmp2, &tmp, 2); - u32 result = build_luminance(tmp2[1], tmp2[0]); + u32 result = build_luminance(tmp >> 8, tmp & 0x00FF); std::memcpy(write, &result, 4); } @@ -107,31 +105,29 @@ inline void intensity_pass(u8* read, u8* write) { } inline void intensity_nibbles_pass(u8* read, u8* write) { - alignas(4) u8 pixel[2]; - std::memcpy(pixel, read, 1); - u16 tmp = convert_nibbles(pixel[0]); - std::memcpy(pixel, &tmp, 2); - u32 result = build_luminance(pixel[1], 255); + u8 pixel; + std::memcpy(&pixel, read, 1); + u16 tmp = convert_nibbles(pixel); + u32 result = build_luminance(tmp & 0x00FF, 255); std::memcpy(write, &result, 4); - result = build_luminance(pixel[0], 255); + result = build_luminance(tmp >> 8, 255); std::memcpy(write + 4, &result, 4); } inline void alpha_pass(u8* read, u8* write) { - alignas(4) u8 pixel[1]; - std::memcpy(pixel, read, 1); - u32 result = build_luminance(0, pixel[0]); + u8 pixel; + std::memcpy(&pixel, read, 1); + u32 result = build_luminance(0, pixel); std::memcpy(write, &result, 4); } inline void alpha_nibbles_pass(u8* read, u8* write) { - alignas(4) u8 pixel[2]; - std::memcpy(pixel, read, 1); - u16 tmp = convert_nibbles(pixel[0]); - std::memcpy(pixel, &tmp, 2); - u32 result = build_luminance(0, pixel[0]); + u8 pixel; + std::memcpy(&pixel, read, 1); + u16 tmp = convert_nibbles(pixel); + u32 result = build_luminance(0, tmp & 0x00FF); std::memcpy(write, &result, 4); - result = build_luminance(0, pixel[1]); + result = build_luminance(0, tmp >> 8); std::memcpy(write + 4, &result, 4); } @@ -207,7 +203,7 @@ void ETC1A4Codec::decode() { ETC1A4(this->target_buffer, this->passing_buffer, this->width, this->height); } -namespace { +namespace Decode { inline void expand_depth16_pass(u8* read, u8* write) { alignas(4) u8 pixel[4]; @@ -224,11 +220,18 @@ inline void expand_depth24_pass(u8* read, u8* 
write) { std::memcpy(write, pixel, 4); } -inline void fix_stencil_pass(u8* read, u8* write) { - u32 pixel; - std::memcpy(&pixel, read, 4); - pixel = (pixel << 8) | (pixel >> 24); - std::memcpy(write, &pixel, 4); +inline void d24s8_pass(u8* target, u32 width, u32 height) { + const size_t sub_iters = 8; + const size_t iters = width * height / sub_iters; + for (u32 i = 0; i < iters; i++) { + for (u32 j = 0; j < sub_iters; j++) { + u32 pixel; + std::memcpy(&pixel, target, 4); + pixel = (pixel >> 24) | (pixel << 8); + std::memcpy(target, &pixel, 4); + target += 4; + } + } } } // Anonymous @@ -236,7 +239,7 @@ inline void fix_stencil_pass(u8* read, u8* write) { void D16Codec::decode() { super::decode(); if (this->raw_RGBA) - image_pass<&expand_depth16_pass, 4, 8>( + image_pass<&Decode::expand_depth16_pass, 4, 8>( // clang-format off this->passing_buffer, this->width, this->height // clang-format on @@ -246,7 +249,7 @@ void D16Codec::decode() { void D24Codec::decode() { super::decode(); if (this->raw_RGBA) - image_pass<&expand_depth24_pass, 6, 8>( + image_pass<&Decode::expand_depth24_pass, 6, 8>( // clang-format off this->passing_buffer, this->width, this->height // clang-format on @@ -256,9 +259,5 @@ void D24Codec::decode() { void D24S8Codec::decode() { super::decode(); if (this->raw_RGBA) - image_pass<&fix_stencil_pass, 8, 8, 8>( - // clang-format off - this->passing_buffer, this->width, this->height - // clang-format on - ); + Decode::d24s8_pass(this->passing_buffer, this->width, this->height); } diff --git a/src/video_core/texture/internal/encoders.cpp b/src/video_core/texture/internal/encoders.cpp index 0844bb737..fb4616fc2 100644 --- a/src/video_core/texture/internal/encoders.cpp +++ b/src/video_core/texture/internal/encoders.cpp @@ -69,11 +69,18 @@ inline void contract_depth24_pass(u8* read, u8* write) { std::memcpy(write, pixel, 3); } -inline void fix_stencil_pass(u8* read, u8* write) { - u32 pixel; - std::memcpy(&pixel, read, 4); - pixel = (pixel >> 24) | (pixel 
<< 8); - std::memcpy(write, &pixel, 4); +inline void d24s8_pass(u8* target, u32 width, u32 height) { + const size_t sub_iters = 8; + const size_t iters = width * height / sub_iters; + for (u32 i = 0; i < iters; i++) { + for (u32 j = 0; j < sub_iters; j++) { + u32 pixel; + std::memcpy(&pixel, target, 4); + pixel = (pixel >> 8) | (pixel << 24); + std::memcpy(target, &pixel, 4); + target += 4; + } + } } } // Anonymous @@ -101,9 +108,5 @@ void D24Codec::encode() { void D24S8Codec::encode() { super::encode(); if (this->raw_RGBA) - image_pass<&Encode::fix_stencil_pass, 8, 8, 8>( - // clang-format off - this->passing_buffer, this->width, this->height - // clang-format on - ); + Encode::d24s8_pass(this->passing_buffer, this->width, this->height); } diff --git a/src/video_core/texture/internal/etc1.cpp b/src/video_core/texture/internal/etc1.cpp index 5a7edec33..9003c91a8 100644 --- a/src/video_core/texture/internal/etc1.cpp +++ b/src/video_core/texture/internal/etc1.cpp @@ -122,6 +122,22 @@ union ETC1Tile { } // anonymous namespace +inline void decode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) { + std::memcpy(matrix_pointer, morton_pointer, read_size); +} + +template +void tiling_pass(u8* linear, u8* tiled, u32 x_blocks) { + const size_t tiled_line_size = (lines_per_block * nibbles) / 2; + const size_t row_length = x_blocks * tiled_line_size; + for (u32 i = 0; i < lines_per_block; i++) { + const u32 k = (lines_per_block - 1 - i); + const size_t tiled_index = i * tiled_line_size; + const size_t linear_index = k * row_length; + codec(tiled + tiled_index, linear + linear_index, tiled_line_size); + } +} + inline void etc1_pass(u8* etc1_buffer, u8* linear_buffer, u32 x_blocks) { const size_t line = 8 * 4; alignas(64) u8 tmp[line * 8]; diff --git a/src/video_core/texture/internal/morton.cpp b/src/video_core/texture/internal/morton.cpp index 9bd74c800..5b84db7a0 100644 --- a/src/video_core/texture/internal/morton.cpp +++ b/src/video_core/texture/internal/morton.cpp 
@@ -1,295 +1,40 @@ -#include -#include -#include #include "common/common_types.h" #include "video_core/texture/internal/morton.h" -#include "video_core/texture/internal/texture_utils.h" -/////////////////////////////////////////////////////////////////////////////// -// Optimizations -////////////////////////////////////////////////////////////////////////////// -#ifdef _MSC_VER -#pragma inline_recursion(on) -// Normaly set to 16 by default, the best balance seems to be on 8 for this module -#pragma inline_depth(8) -// favor fast code over small code. -#pragma optimize("t", on) -#pragma intrinsic(memcpy) -#define __hot -#define __no_inline __declspec(noinline) -#elif defined(CLANG_OR_GCC) -// The next 3 will swizle memory copying to help find the best sse/avx shuffling -// in case it's possible. Compilation tests have proven effective use of these -// flags on gcc and clang. -#pragma GCC optimize("-fpredictive-commoning") -#pragma GCC optimize("-ftree-loop-distribute-patterns") -#pragma GCC optimize("-ftree-vectorize") -#pragma GCC option("--param inline-unit-growth=400") -#pragma GCC option("--param large-function-growth=800") -// The beauty of these compiler options is that they generate better code than -// hand written intrinsics, since inline expanding memeory transfers can be pattern -// matched with vector instructions available in the target. 
-#define __no_inline __attribute__((noinline)) -#define __hot __attribute__((hot)) -#if !defined(__forceinline) -#define __forceinline attribute__((always_inline)) -#endif -#else -#define __hot -#define __no_inline -#define __forceinline -#endif - -#pragma region Z_Order -///////////////////////////////////////////////////////////////////////////// -// Z-Order: -// -// 0-->1 -// / -// 2-->3 -// -// for more information look at: https://en.wikipedia.org/wiki/Z-order_curve -///////////////////////////////////////////////////////////////////////////// -#define TOP_LEFT 0 -#define TOP_RIGHT 1 -#define BOTTOM_LEFT 2 -#define BOTTOM_RIGHT 3 - -constexpr u32 isRight(u32 block_index) { - return (block_index % 2); +static u32 Part1By1(u32 x) { + x &= 0x0000ffff; // x = ---- ---- ---- ---- fedc ba98 7654 3210 + x = (x ^ (x << 8)) & 0x00ff00ff; // x = ---- ---- fedc ba98 ---- ---- 7654 3210 + x = (x ^ (x << 4)) & 0x0f0f0f0f; // x = ---- fedc ---- ba98 ---- 7654 ---- 3210 + x = (x ^ (x << 2)) & 0x33333333; // x = --fe --dc --ba --98 --76 --54 --32 --10 + x = (x ^ (x << 1)) & 0x55555555; // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0 + return x; } -constexpr u32 isBottom(u32 block_index) { - return (block_index / 2); +static u32 Compact1By1(u32 x) { + x &= 0x55555555; // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0 + x = (x ^ (x >> 1)) & 0x33333333; // x = --fe --dc --ba --98 --76 --54 --32 --10 + x = (x ^ (x >> 2)) & 0x0f0f0f0f; // x = ---- fedc ---- ba98 ---- 7654 ---- 3210 + x = (x ^ (x >> 4)) & 0x00ff00ff; // x = ---- ---- fedc ba98 ---- ---- 7654 3210 + x = (x ^ (x >> 8)) & 0x0000ffff; // x = ---- ---- ---- ---- fedc ba98 7654 3210 + return x; } -template -__forceinline static void swizzle_block(u8*& morton_block, u8* linear_block); - -template -__forceinline static void swizzle_block_aux(u8*& morton_block, u8* linear_block) { - // move the linear_block pointer to the appropiate block - const size_t right = isRight(block_index) * (blocks * nibbles) / 2; - const size_t down 
= isBottom(block_index) * block_size; - u8* new_linear = linear_block + right + down; - swizzle_block(morton_block, new_linear); +static u32 EncodeMorton(u32 x, u32 y) { + return (Part1By1(y) << 1) | Part1By1(x); } -template -__forceinline static void swizzle_block(u8*& morton_block, u8* linear_block) { - const size_t new_block_size = block_size / 2; - if (blocks <= 2) { - // We handle 2*2 blocks on z-order - const size_t read_size = nibbles; // just for clearness. It's the same amount - // TOP_LEFT & TOP_RIGHT - codec(morton_block, linear_block, read_size); - morton_block += read_size; - // BOTTOM_LEFT & BOTTOM_RIGHT - codec(morton_block, linear_block + new_block_size, read_size); - morton_block += read_size; - } else { - // we divide the block into 4 blocks in z-order corecursively - // until we have 2x2 blocks. - const u32 subdivide = blocks / 2; - swizzle_block_aux(morton_block, - linear_block); - swizzle_block_aux(morton_block, - linear_block); - swizzle_block_aux(morton_block, - linear_block); - swizzle_block_aux(morton_block, - linear_block); - } +static u32 DecodeMortonX(u32 code) { + return Compact1By1(code >> 0); } -template -__forceinline static void swizzle_pass(u8* morton_block, u8* linear_block) { - const size_t block_size = (lines_per_block * lines_per_block * nibbles) / 2; - swizzle_block(morton_block, linear_block); -} -#pragma endregion Z_Order - -template -__hot inline static void encode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) { - const u32 tile_size = (lines_per_block * lines_per_block * nibbles) / 2; - alignas(64) u8 tmp[tile_size]; - tiling_pass<&encode, nibbles, lines_per_block>(linear_buffer, tmp, x_blocks); - swizzle_pass<&encode, nibbles, lines_per_block>(morton_buffer, tmp); +static u32 DecodeMortonY(u32 code) { + return Compact1By1(code >> 1); } -template -__hot inline static void decode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) { - const u32 tile_size = (lines_per_block * lines_per_block * nibbles) / 2; - 
alignas(64) u8 tmp[tile_size]; - swizzle_pass<&decode, nibbles, lines_per_block>(morton_buffer, tmp); - tiling_pass<&decode, nibbles, lines_per_block>(linear_buffer, tmp, x_blocks); +u32 MortonOffset(u32 x, u32 y, u32 width, u32 height, u32 tiling, u32 bpp) { + u32 tile = (x + y * height) * width / (tiling * tiling); + tile = (tile * bpp) / 8; + return tile + EncodeMorton(x % tiling, y % tiling); } -template -__hot static void morton_pass(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height) { - const u32 x_blocks = (width / lines_per_block); - const u32 y_blocks = (height / lines_per_block); - const size_t line_size = (lines_per_block * nibbles) / 2; - const size_t tile_size = lines_per_block * line_size; - const size_t stride_size = width * line_size; - matrix_buffer = matrix_buffer + ((height * width * nibbles) / 2) - stride_size; - for (u32 y = 0; y < y_blocks; y++) { - u8* linear_buffer = matrix_buffer; - for (u32 x = 0; x != x_blocks; x++) { - codec(morton_buffer, linear_buffer, x_blocks); - linear_buffer += line_size; - morton_buffer += tile_size; - } - matrix_buffer -= stride_size; - } -} - -// keep hot code together -__no_inline __hot static void morton_8x8_32(u8* morton_buffer, u8* matrix_buffer, u32 width, - u32 height, bool decode) { - if (decode) - morton_pass<&decode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height); - else - morton_pass<&encode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height); -} - -namespace Decoders { - -bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) { - if (bpp == 32) { - morton_8x8_32(morton_buffer, matrix_buffer, width, height, true); - return true; - } - switch (bpp) { - case 4: { - morton_pass<&decode_pass<1, 8>, 1, 8>(morton_buffer, matrix_buffer, width, height); - return true; - break; - } - case 8: { - morton_pass<&decode_pass<2, 8>, 2, 8>(morton_buffer, matrix_buffer, width, height); - return true; - break; - } - case 16: { - 
morton_pass<&decode_pass<4, 8>, 4, 8>(morton_buffer, matrix_buffer, width, height); - return true; - break; - } - case 24: { - morton_pass<&decode_pass<6, 8>, 6, 8>(morton_buffer, matrix_buffer, width, height); - return true; - break; - } - default: { - return false; - break; - } - } -} - -bool Morton_32x32(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) { - switch (bpp) { - case 4: { - morton_pass<&decode_pass<1, 32>, 1, 32>(morton_buffer, matrix_buffer, width, height); - return true; - break; - } - case 8: { - morton_pass<&decode_pass<2, 32>, 2, 32>(morton_buffer, matrix_buffer, width, height); - return true; - break; - } - case 16: { - morton_pass<&decode_pass<4, 32>, 4, 32>(morton_buffer, matrix_buffer, width, height); - return true; - break; - } - case 24: { - morton_pass<&decode_pass<6, 32>, 6, 32>(morton_buffer, matrix_buffer, width, height); - return true; - break; - } - case 32: { - morton_pass<&decode_pass<8, 32>, 8, 32>(morton_buffer, matrix_buffer, width, height); - return true; - break; - } - default: { - return false; - break; - } - } -} -} - -namespace Encoders { - -bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) { - if (bpp == 32) { - morton_8x8_32(morton_buffer, matrix_buffer, width, height, false); - return true; - } - switch (bpp) { - case 4: { - morton_pass<&encode_pass<1, 8>, 1, 8>(morton_buffer, matrix_buffer, width, height); - return true; - break; - } - case 8: { - morton_pass<&encode_pass<2, 8>, 2, 8>(morton_buffer, matrix_buffer, width, height); - return true; - break; - } - case 16: { - morton_pass<&encode_pass<4, 8>, 4, 8>(morton_buffer, matrix_buffer, width, height); - return true; - break; - } - case 24: { - morton_pass<&encode_pass<6, 8>, 6, 8>(morton_buffer, matrix_buffer, width, height); - return true; - break; - } - default: { - return false; - break; - } - } -} - -bool Morton_32x32(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) { - switch (bpp) { - 
case 4: { - morton_pass<&encode_pass<1, 32>, 1, 32>(morton_buffer, matrix_buffer, width, height); - return true; - break; - } - case 8: { - morton_pass<&encode_pass<2, 32>, 2, 32>(morton_buffer, matrix_buffer, width, height); - return true; - break; - } - case 16: { - morton_pass<&encode_pass<4, 32>, 4, 32>(morton_buffer, matrix_buffer, width, height); - return true; - break; - } - case 24: { - morton_pass<&encode_pass<6, 32>, 6, 32>(morton_buffer, matrix_buffer, width, height); - return true; - break; - } - case 32: { - morton_pass<&encode_pass<8, 32>, 8, 32>(morton_buffer, matrix_buffer, width, height); - return true; - break; - } - default: { - return false; - break; - } - } -} -} +#include "morton8x8_optimized.cpp" diff --git a/src/video_core/texture/internal/morton.h b/src/video_core/texture/internal/morton.h index 95473744f..73fa22eb3 100644 --- a/src/video_core/texture/internal/morton.h +++ b/src/video_core/texture/internal/morton.h @@ -2,14 +2,12 @@ #include "common/common_types.h" -enum class MortonPass { Tile8x8, Tile32x32 }; +u32 MortonOffset(u32 x, u32 y, u32 width, u32 height, u32 tiling, u32 bpp); namespace Decoders { bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp); -bool Morton_32x32(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp); } namespace Encoders { bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp); -bool Morton_32x32(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp); } diff --git a/src/video_core/texture/internal/morton8x8_optimized.cpp b/src/video_core/texture/internal/morton8x8_optimized.cpp new file mode 100644 index 000000000..d8511be56 --- /dev/null +++ b/src/video_core/texture/internal/morton8x8_optimized.cpp @@ -0,0 +1,253 @@ + +#include +#include +#include +#include "common/common_types.h" + +#if ((defined(__clang__) || defined(__GNUC__)) && !defined(__INTEL_COMPILER)) +#define CLANG_OR_GCC +#endif + 
+///////////////////////////////////////////////////////////////////////////////
+// Optimizations
+//////////////////////////////////////////////////////////////////////////////
+#ifdef _MSC_VER
+#pragma inline_recursion(on)
+#pragma intrinsic(memcpy)
+#define __hot
+#define __no_inline __declspec(noinline)
+#elif defined(CLANG_OR_GCC)
+#pragma GCC push_options
+// The next three options let the compiler rearrange the small memory copies below so
+// it can look for the best sse/avx shuffling when that is possible. Compilation tests
+// have shown these flags to be effective on both gcc and clang.
+#pragma GCC optimize("-fpredictive-commoning")
+#pragma GCC optimize("-ftree-loop-distribute-patterns")
+#pragma GCC optimize("-ftree-vectorize")
+// The beauty of these compiler options is that they generate better code than
+// hand-written intrinsics, since inline-expanded memory transfers can be pattern
+// matched with the vector instructions available on the target.
+#define __no_inline __attribute__((noinline))
+#define __hot __attribute__((hot))
+#if !defined(__forceinline)
+#define __forceinline inline __attribute__((always_inline))
+#endif
+#else
+#define __hot
+#define __no_inline
+#define __forceinline inline
+#endif
+
+#pragma region Z_Order
+/////////////////////////////////////////////////////////////////////////////
+// Z-Order:
+//
+// 0-->1
+//   /
+// 2-->3
+//
+// for more information look at: https://en.wikipedia.org/wiki/Z-order_curve
+/////////////////////////////////////////////////////////////////////////////
+#define TOP_LEFT 0
+#define TOP_RIGHT 1
+#define BOTTOM_LEFT 2
+#define BOTTOM_RIGHT 3
+
+// Morton -> matrix copy. @param read_size is the number of bytes copied by this call
+__forceinline void decode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
+    std::memcpy(matrix_pointer, morton_pointer, read_size);
+}
+
+// Matrix -> morton copy. @param read_size is the number of bytes copied by this call
+__forceinline void encode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
+    std::memcpy(morton_pointer, matrix_pointer, read_size);
+}
+
+constexpr u32 isRight(u32 block_index) {
+    return (block_index % 2);
+}
+
+constexpr u32 isBottom(u32 block_index) {
+    return (block_index / 2);
+}
+// Forward declaration: swizzle_block and swizzle_block_aux call each other.
+template <void codec(u8*, u8*, size_t), size_t nibbles, size_t blocks, size_t block_size>
+__forceinline static void swizzle_block(u8*& morton_block, u8* linear_block);
+
+template <void codec(u8*, u8*, size_t), size_t nibbles, size_t block_index, size_t blocks,
+          size_t block_size>
+__forceinline static void swizzle_block_aux(u8*& morton_block, u8* linear_block) {
+    // move the linear_block pointer to the quadrant selected by block_index
+    const size_t right = isRight(block_index) * (blocks * nibbles) / 2;
+    const size_t down = isBottom(block_index) * block_size;
+    u8* new_linear = linear_block + right + down;
+    swizzle_block<codec, nibbles, blocks, block_size>(morton_block, new_linear);
+}
+
+template <void codec(u8*, u8*, size_t), size_t nibbles, size_t blocks, size_t block_size>
+__forceinline static void swizzle_block(u8*& morton_block, u8* linear_block) {
+    const size_t new_block_size = block_size / 2;
+    if (blocks <= 2) {
+        // Base case: a 2x2 block is copied as two runs, one per row, in z-order
+        const size_t read_size = nibbles; // bytes of a 2-pixel run (2 * nibbles / 2)
+        // TOP_LEFT & TOP_RIGHT
+        codec(morton_block, linear_block, read_size);
+        morton_block += read_size;
+        // BOTTOM_LEFT & BOTTOM_RIGHT live new_block_size bytes further in linear_block
+        codec(morton_block, linear_block + new_block_size, read_size);
+        morton_block += read_size;
+    } else {
+        // we divide the block into 4 quadrants, visited in z-order, and recurse
+        // through swizzle_block_aux until we reach 2x2 blocks.
+        const u32 subdivide = blocks / 2;
+        swizzle_block_aux<codec, nibbles, TOP_LEFT, subdivide, new_block_size>(morton_block,
+                                                                               linear_block);
+        swizzle_block_aux<codec, nibbles, TOP_RIGHT, subdivide, new_block_size>(morton_block,
+                                                                                linear_block);
+        swizzle_block_aux<codec, nibbles, BOTTOM_LEFT, subdivide, new_block_size>(morton_block,
+                                                                                  linear_block);
+        swizzle_block_aux<codec, nibbles, BOTTOM_RIGHT, subdivide, new_block_size>(morton_block,
+                                                                                   linear_block);
+    }
+}
+// Swizzles one lines_per_block x lines_per_block tile stored contiguously in linear_block.
+template <void codec(u8*, u8*, size_t), size_t nibbles, size_t lines_per_block>
+__forceinline static void swizzle_pass(u8* morton_block, u8* linear_block) {
+    const size_t block_size = (lines_per_block * lines_per_block * nibbles) / 2;
+    swizzle_block<codec, nibbles, lines_per_block, block_size>(morton_block, linear_block);
+}
+#pragma endregion Z_Order
+// Copies a tile row by row between `tiled` and `linear`; row i of the tile is row L-1-i there.
+template <void codec(u8*, u8*, size_t), size_t nibbles, size_t lines_per_block>
+__forceinline void tiling_pass(u8* linear, u8* tiled, u32 x_blocks) {
+    const size_t tiled_line_size = (lines_per_block * nibbles) / 2;
+    const size_t row_length = x_blocks * tiled_line_size;
+    for (u32 i = 0; i < lines_per_block; i++) {
+        const u32 k = (lines_per_block - 1 - i);
+        const size_t tiled_index = i * tiled_line_size;
+        const size_t linear_index = k * row_length;
+        codec(tiled + tiled_index, linear + linear_index, tiled_line_size);
+    }
+}
+// Encodes one tile: gathers it from linear_buffer into tmp, then swizzles tmp into morton_buffer.
+template <size_t nibbles, size_t lines_per_block>
+__forceinline static void encode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) {
+    const u32 tile_size = (lines_per_block * lines_per_block * nibbles) / 2;
+    alignas(64) u8 tmp[tile_size];
+    tiling_pass<&encode, nibbles, lines_per_block>(linear_buffer, tmp, x_blocks);
+    swizzle_pass<&encode, nibbles, lines_per_block>(morton_buffer, tmp);
+}
+// Decodes one tile: unswizzles morton_buffer into tmp, then scatters tmp into linear_buffer.
+template <size_t nibbles, size_t lines_per_block>
+__forceinline static void decode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) {
+    const u32 tile_size = (lines_per_block * lines_per_block * nibbles) / 2;
+    alignas(64) u8 tmp[tile_size];
+    swizzle_pass<&decode, nibbles, lines_per_block>(morton_buffer, tmp);
+    tiling_pass<&decode, nibbles, lines_per_block>(linear_buffer, tmp, x_blocks);
+}
+// Runs codec on every whole tile: morton_buffer forward, matrix_buffer from its last tile row up.
+template <void codec(u8*, u8*, u32), size_t nibbles, size_t lines_per_block>
+static void morton_pass(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height) {
+    const u32 x_blocks = (width / lines_per_block);
+    const u32 y_blocks = (height / lines_per_block);
+    const size_t line_size = (lines_per_block * nibbles) / 2;
+    const size_t tile_size = lines_per_block * line_size;
+    const size_t stride_size = width * line_size;
+    matrix_buffer = matrix_buffer + ((height * width * nibbles) / 2) - stride_size;
+    for (u32 y = 0; y < y_blocks; y++) {
+        u8* linear_buffer = matrix_buffer;
+        for (u32 x = 0; x != x_blocks; x++) {
+            codec(morton_buffer, linear_buffer, x_blocks);
+            linear_buffer += line_size;
+            morton_buffer += tile_size;
+        }
+        matrix_buffer -= stride_size;
+    }
+}
+
+namespace Decoders {
+// Morton (8x8 tiles) -> linear matrix. Returns false when bpp is not 4, 8, 16, 24 or 32.
+bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) {
+    switch (bpp) {
+    case 4: {
+        morton_pass<&decode_pass<1, 8>, 1, 8>(morton_buffer, matrix_buffer, width, height);
+        return true;
+        break;
+    }
+    case 8: {
+        morton_pass<&decode_pass<2, 8>, 2, 8>(morton_buffer, matrix_buffer, width, height);
+        return true;
+        break;
+    }
+    case 16: {
+        morton_pass<&decode_pass<4, 8>, 4, 8>(morton_buffer, matrix_buffer, width, height);
+        return true;
+        break;
+    }
+    case 24: {
+        morton_pass<&decode_pass<6, 8>, 6, 8>(morton_buffer, matrix_buffer, width, height);
+        return true;
+        break;
+    }
+    case 32: {
+        morton_pass<&decode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height);
+        return true;
+        break;
+    }
+    default: {
+        return false;
+        break;
+    }
+    }
+}
+}
+
+namespace Encoders {
+// Linear matrix -> morton (8x8 tiles). Returns false when bpp is not 4, 8, 16, 24 or 32.
+bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) {
+    switch (bpp) {
+    case 4: {
+        morton_pass<&encode_pass<1, 8>, 1, 8>(morton_buffer, matrix_buffer, width, height);
+        return true;
+        break;
+    }
+    case 8: {
+        morton_pass<&encode_pass<2, 8>, 2, 8>(morton_buffer, matrix_buffer, width, height);
+        return true;
+        break;
+    }
+    case 16: {
+        morton_pass<&encode_pass<4, 8>, 4, 8>(morton_buffer, matrix_buffer, width, height);
+        return true;
+        break;
+    }
+    case 24: {
+        morton_pass<&encode_pass<6, 8>, 6, 8>(morton_buffer, matrix_buffer, width, height);
+        return true;
+        break;
+    }
+    case 32: {
+        morton_pass<&encode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height);
+        return true;
+        break;
+    }
+    default: {
+        return false;
+        break;
+    }
+    }
+}
+}
+
+#ifdef _MSC_VER
+#undef __hot
+#undef __no_inline
+#elif defined(CLANG_OR_GCC)
+#pragma GCC pop_options
+#undef __no_inline
+#undef __hot
+#else
+#undef __hot
+#undef __no_inline
+#undef __forceinline
+#endif
diff --git a/src/video_core/texture/internal/texture_utils.h b/src/video_core/texture/internal/texture_utils.h
index 38d7f96f9..536a9873d 100644
--- a/src/video_core/texture/internal/texture_utils.h
+++ b/src/video_core/texture/internal/texture_utils.h
@@ -16,6 +16,7 @@
 #ifdef _MSC_VER
 #pragma inline_recursion(on)
 #elif defined(CLANG_OR_GCC)
+#pragma GCC push_options
 #pragma GCC optimize("-fpeel-loops")
 #pragma GCC optimize("-fpredictive-commoning")
 #pragma GCC optimize("-ftree-loop-distribute-patterns")
@@ -74,24 +75,6 @@ inline void image_pass(u8* target, u32 width, u32 height) {
     image_pass_aux_rev(target, width, height);
 }
 
-template <void codec(u8*, u8*, size_t), size_t nibbles, size_t lines_per_block>
-void tiling_pass(u8* linear, u8* tiled, u32 x_blocks) {
-    const size_t tiled_line_size = (lines_per_block * nibbles) / 2;
-    const size_t row_length = x_blocks * tiled_line_size;
-    for (u32 i = 0; i < lines_per_block; i++) {
-        const u32 k = (lines_per_block - 1 - i);
-        const size_t tiled_index = i * tiled_line_size;
-        const size_t linear_index = k * row_length;
-        codec(tiled + tiled_index, linear + linear_index, tiled_line_size);
-    }
-}
-
-// @param read_size is the amount of bytes each pixel takes
-inline void decode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
-    std::memcpy(matrix_pointer, morton_pointer, read_size);
-}
-
-// @param read_size is the amount of bytes each pixel takes
-inline void encode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
-    std::memcpy(morton_pointer, matrix_pointer, read_size);
-}
+#if defined(CLANG_OR_GCC)
+#pragma GCC pop_options
+#endif