diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 6ca319b59..2522064e7 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -1,4 +1,8 @@ set(SRCS + texture/internal/morton.cpp + texture/internal/etc1.cpp + texture/codec.cpp + texture/internal/codecs.cpp renderer_opengl/gl_rasterizer.cpp renderer_opengl/gl_rasterizer_cache.cpp renderer_opengl/gl_shader_gen.cpp @@ -21,6 +25,12 @@ set(SRCS set(HEADERS debug_utils/debug_utils.h + texture/internal/texture_utils.h + texture/internal/morton.h + texture/internal/etc1.h + texture/codec.h + texture/formats.h + texture/internal/codecs.h renderer_opengl/gl_rasterizer.h renderer_opengl/gl_rasterizer_cache.h renderer_opengl/gl_resource_manager.h diff --git a/src/video_core/texture/codec.cpp b/src/video_core/texture/codec.cpp new file mode 100644 index 000000000..5992dcdac --- /dev/null +++ b/src/video_core/texture/codec.cpp @@ -0,0 +1,143 @@ +#include "codec.h" +#include "internal\codecs.h" +#include "internal\morton.h" + +namespace Pica { +namespace Texture { + +void Codec::decode() { + this->init(true); + if (this->morton) + this->decode_morton_pass(); +}; + +void Codec::encode() { + this->init(false); + if (this->morton) + this->encode_morton_pass(); +}; + +void Codec::setSize() { + this->start_nibbles_size = format_size; +}; + +inline void Codec::setWidth(u32 width) { + this->width = width; +} + +inline void Codec::setHeight(u32 height) { + this->height = height; +} + +void Codec::configTiling(bool active, u32 tiling) { + this->morton = true; + this->morton_pass_tiling = tiling; + if (tiling != 8 && tiling != 32) { + this->invalid_state = true; + } +} + +void Codec::configRGBATransform(bool active) { + this->raw_RGBA = active; +} + +void Codec::configPreConvertedRGBA(bool active) { + this->preconverted = active; +} + +void Codec::setExternalBuffer(u8* external) { + this->external_result_buffer = true; + this->passing_buffer = external; +} + +std::unique_ptr 
Codec::transferInternalBuffer() { + if (!this->external_result_buffer) { + std::unique_ptr result(std::move(this->internal_buffer)); + return result; + } + return nullptr; +} + +bool Codec::invalid() { + return this->invalid_state; +} + +void Codec::init(bool decode) { + if (decode) { + if (this->raw_RGBA) + this->expected_nibbles_size = 8; + } else { + this->start_nibbles_size = this->format_size; + if (this->raw_RGBA) + this->expected_nibbles_size = this->format_size; + if (this->preconverted) + this->start_nibbles_size = 8; + } + if (!this->external_result_buffer) { + size_t buff_size = this->width * this->height * this->expected_nibbles_size / 2; + this->internal_buffer = std::make_unique(buff_size); + this->passing_buffer = this->internal_buffer.get(); + } +} + +inline void Codec::decode_morton_pass() { + if (this->morton_pass_tiling == 8) + Decoders::Morton_8x8(this->target_buffer, this->passing_buffer, this->width, this->height, + this->start_nibbles_size * 4); + else if (this->morton_pass_tiling == 32) + Decoders::Morton_32x32(this->target_buffer, this->passing_buffer, this->width, this->height, + this->start_nibbles_size * 4); +} + +inline void Codec::encode_morton_pass() { + if (this->morton_pass_tiling == 8) + Encoders::Morton_8x8(this->target_buffer, this->passing_buffer, this->width, this->height, + this->start_nibbles_size * 4); + else if (this->morton_pass_tiling == 32) + Encoders::Morton_32x32(this->target_buffer, this->passing_buffer, this->width, this->height, + this->start_nibbles_size * 4); +} + +std::unique_ptr CodecFactory::build(Format format, u8* target, u32 width, u32 height) { + switch (format) { + case Format::RGBA8: + return std::make_unique(target, width, height); + case Format::RGB8: + return std::make_unique(target, width, height); + case Format::RGB5A1: + return std::make_unique(target, width, height); + case Format::RGB565: + return std::make_unique(target, width, height); + case Format::RGBA4: + return std::make_unique(target, 
width, height); + case Format::RG8: + return std::make_unique(target, width, height); + case Format::IA8: + return std::make_unique(target, width, height); + case Format::I8: + return std::make_unique(target, width, height); + case Format::A8: + return std::make_unique(target, width, height); + case Format::IA4: + return std::make_unique(target, width, height); + case Format::I4: + return std::make_unique(target, width, height); + case Format::A4: + return std::make_unique(target, width, height); + case Format::ETC1: + return std::make_unique(target, width, height); + case Format::ETC1A4: + return std::make_unique(target, width, height); + case Format::D16: + return std::make_unique(target, width, height); + case Format::D24: + return std::make_unique(target, width, height); + case Format::D24S8: + return std::make_unique(target, width, height); + default: + return nullptr; + } +} + +} // Texture +} // Pica diff --git a/src/video_core/texture/codec.h b/src/video_core/texture/codec.h new file mode 100644 index 000000000..fe873556d --- /dev/null +++ b/src/video_core/texture/codec.h @@ -0,0 +1,78 @@ +#include +#include +#include "common/common_types.h" +#include "formats.h" + +#pragma once + +namespace Pica { + +namespace Texture { + +class Codec { + +public: + Codec(u8* target, u32 width, u32 height) { + this->target_buffer = target; + this->setWidth(width); + this->setHeight(height); + this->setSize(); + this->expected_nibbles_size = this->start_nibbles_size; + } + virtual ~Codec() {} + + virtual void decode(); + virtual void encode(); + + void setSize(); + + void setWidth(u32 width); + void setHeight(u32 height); + + // Common Passes + void configTiling(bool active, u32 tiling); + void configRGBATransform(bool active); + void configPreConvertedRGBA(bool active); + + void setExternalBuffer(u8* external); + std::unique_ptr transferInternalBuffer(); + + bool invalid(); + +protected: + u32 width; + u32 height; + + // passes + bool invalid_state = false; + bool morton = 
true; + u32 morton_pass_tiling = 8; + bool raw_RGBA = false; + bool preconverted = false; + bool disable_components = false; + u32 disable_components_mask = 0; + + u32 start_nibbles_size; + u32 expected_nibbles_size; + const u32 format_size = 8; + + u8* target_buffer; // Initial read buffer + u8* passing_buffer; // pointer aliasing: Used and modified by passes + std::unique_ptr internal_buffer; // used if no external buffer is provided + bool external_result_buffer = false; + + void init(bool decode); + + typedef Codec super; + + inline void decode_morton_pass(); + inline void encode_morton_pass(); +}; + +namespace CodecFactory { +std::unique_ptr build(Pica::Texture::Format format, u8* target, u32 width, u32 height); +}; + +} // Texture + +} // Pica diff --git a/src/video_core/texture/formats.h b/src/video_core/texture/formats.h new file mode 100644 index 000000000..c15d40c1d --- /dev/null +++ b/src/video_core/texture/formats.h @@ -0,0 +1,37 @@ +#pragma once + +namespace Pica { + +namespace Texture { + +enum class Format { + // First 5 formats are shared between textures and color buffers + RGBA8 = 0, + RGB8 = 1, + RGB5A1 = 2, + RGB565 = 3, + RGBA4 = 4, + + // Texture-only formats + IA8 = 5, + RG8 = 6, + I8 = 7, + A8 = 8, + IA4 = 9, + I4 = 10, + A4 = 11, + ETC1 = 12, + ETC1A4 = 13, + + // Depth buffer-only formats + D16 = 14, + // gap + D24 = 16, + D24S8 = 17, + + Invalid = 255, +}; + +} // Texture + +} // Pica diff --git a/src/video_core/texture/internal/codecs.cpp b/src/video_core/texture/internal/codecs.cpp new file mode 100644 index 000000000..d647c9ec7 --- /dev/null +++ b/src/video_core/texture/internal/codecs.cpp @@ -0,0 +1,10 @@ +#include "codecs.h" +#include "etc1.h" +#include "morton.h" +#include "texture_utils.h" + +// Decoders +#include "decoders.cpp" + +// Encoders +#include "encoders.cpp" diff --git a/src/video_core/texture/internal/codecs.h b/src/video_core/texture/internal/codecs.h new file mode 100644 index 000000000..97a5e2869 --- /dev/null +++ 
b/src/video_core/texture/internal/codecs.h @@ -0,0 +1,177 @@ +#include +#include +#include "common/common_types.h" +#include "video_core/texture/codec.h" + +#pragma once + +// each texture format codec +class RGBACodec : public Pica::Texture::Codec { +public: + RGBACodec(u8* target, u32 width, u32 height) : Pica::Texture::Codec(target, width, height) {} + void decode(); + void encode(); + +protected: + const u32 format_size = 8; +}; + +class RGBCodec : public Pica::Texture::Codec { +public: + RGBCodec(u8* target, u32 width, u32 height) : Pica::Texture::Codec(target, width, height) {} + void decode(); + void encode(); + +protected: + const u32 format_size = 6; +}; + +class RGB5A1Codec : public Pica::Texture::Codec { +public: + RGB5A1Codec(u8* target, u32 width, u32 height) : Pica::Texture::Codec(target, width, height) {} + void decode(); + void encode(); + +protected: + const u32 format_size = 4; +}; + +class RGBA4Codec : public Pica::Texture::Codec { +public: + RGBA4Codec(u8* target, u32 width, u32 height) : Pica::Texture::Codec(target, width, height) {} + void decode(); + void encode(); + +protected: + const u32 format_size = 4; +}; + +class RGB565Codec : public Pica::Texture::Codec { +public: + RGB565Codec(u8* target, u32 width, u32 height) : Pica::Texture::Codec(target, width, height) {} + void decode(); + void encode(); + +protected: + const u32 format_size = 4; +}; + +class RG8Codec : public Pica::Texture::Codec { +public: + RG8Codec(u8* target, u32 width, u32 height) : Pica::Texture::Codec(target, width, height) {} + void decode(); + void encode(); + +protected: + const u32 format_size = 4; +}; + +class IA8Codec : public Pica::Texture::Codec { +public: + IA8Codec(u8* target, u32 width, u32 height) : Pica::Texture::Codec(target, width, height) {} + void decode(); + void encode(); + +protected: + const u32 format_size = 4; +}; + +class I8Codec : public Pica::Texture::Codec { +public: + I8Codec(u8* target, u32 width, u32 height) : Pica::Texture::Codec(target, 
width, height) {} + void decode(); + void encode(); + +protected: + const u32 format_size = 2; +}; + +class A8Codec : public Pica::Texture::Codec { +public: + A8Codec(u8* target, u32 width, u32 height) : Pica::Texture::Codec(target, width, height) {} + void decode(); + void encode(); + +protected: + const u32 format_size = 2; +}; + +class IA4Codec : public Pica::Texture::Codec { +public: + IA4Codec(u8* target, u32 width, u32 height) : Pica::Texture::Codec(target, width, height) {} + void decode(); + void encode(); + +protected: + const u32 format_size = 2; +}; + +class I4Codec : public Pica::Texture::Codec { +public: + I4Codec(u8* target, u32 width, u32 height) : Pica::Texture::Codec(target, width, height) {} + void decode(); + void encode(); + +protected: + const u32 format_size = 1; +}; + +class A4Codec : public Pica::Texture::Codec { +public: + A4Codec(u8* target, u32 width, u32 height) : Pica::Texture::Codec(target, width, height) {} + void decode(); + void encode(); + +protected: + const u32 format_size = 1; +}; + +class ETC1Codec : public Pica::Texture::Codec { +public: + ETC1Codec(u8* target, u32 width, u32 height) : Pica::Texture::Codec(target, width, height) {} + void decode(); + void encode(); + +protected: + const u32 format_size = 1; +}; + +class ETC1A4Codec : public Pica::Texture::Codec { +public: + ETC1A4Codec(u8* target, u32 width, u32 height) : Pica::Texture::Codec(target, width, height) {} + void decode(); + void encode(); + +protected: + const u32 format_size = 2; +}; + +class D16Codec : public Pica::Texture::Codec { +public: + D16Codec(u8* target, u32 width, u32 height) : Pica::Texture::Codec(target, width, height) {} + void decode(); + void encode(); + +protected: + const u32 format_size = 4; +}; + +class D24Codec : public Pica::Texture::Codec { +public: + D24Codec(u8* target, u32 width, u32 height) : Pica::Texture::Codec(target, width, height) {} + void decode(); + void encode(); + +protected: + const u32 format_size = 6; +}; + +class 
D24S8Codec : public Pica::Texture::Codec { +public: + D24S8Codec(u8* target, u32 width, u32 height) : Pica::Texture::Codec(target, width, height) {} + void decode(); + void encode(); + +protected: + const u32 format_size = 8; +}; diff --git a/src/video_core/texture/internal/decoders.cpp b/src/video_core/texture/internal/decoders.cpp new file mode 100644 index 000000000..d0b80d013 --- /dev/null +++ b/src/video_core/texture/internal/decoders.cpp @@ -0,0 +1,261 @@ + +namespace { + +template decode_func(const u8*)> +inline void rgba_pass(u8* read, u8* write) { + u32 pixel = decode_func(read).ToRGBA(); + std::memcpy(write, &pixel, 4); +} + +} // Anonymous + +void RGBACodec::decode() { + super::decode(); + if (this->raw_RGBA) + image_pass<&rgba_pass<&Color::DecodeRGBA8>, 8, 8, 8>( + // clang-format off + this->passing_buffer, this->width, this->height + // clang-format on + ); +} + +void RGBCodec::decode() { + super::decode(); + if (this->raw_RGBA) + image_pass<&rgba_pass<&Color::DecodeRGB8>, 6, 8>( + // clang-format off + this->passing_buffer, this->width, this->height + // clang-format on + ); +} + +void RGB5A1Codec::decode() { + super::decode(); + if (this->raw_RGBA) + image_pass<&rgba_pass<&Color::DecodeRGB5A1>, 4, 8>( + // clang-format off + this->passing_buffer, this->width, this->height + // clang-format on + ); +} + +void RGB565Codec::decode() { + super::decode(); + if (this->raw_RGBA) + image_pass<&rgba_pass<&Color::DecodeRGB565>, 4, 8>( + // clang-format off + this->passing_buffer, this->width, this->height + // clang-format on + ); +} + +void RGBA4Codec::decode() { + super::decode(); + if (this->raw_RGBA) + image_pass<&rgba_pass<&Color::DecodeRGBA4>, 4, 8>( + // clang-format off + this->passing_buffer, this->width, this->height + // clang-format on + ); +} + +void RG8Codec::decode() { + super::decode(); + if (this->raw_RGBA) + image_pass<&rgba_pass<&Color::DecodeRG8>, 4, 8>( + // clang-format off + this->passing_buffer, this->width, this->height + // 
clang-format on + ); +} + +namespace { + +inline u16 convert_nibbles(u8 nibbles) { + return ((u16)Color::Convert4To8((nibbles & 0xF0) >> 4) << 8) | + (u16)Color::Convert4To8((nibbles & 0x0F)); +} + +inline u32 build_luminance(u8 intensity, u8 alpha) { + return (alpha << 24) | (intensity << 16) | (intensity << 8) | intensity; +} + +inline void intensity_alpha_pass(u8* read, u8* write) { + alignas(4) u8 pixel[2]; + std::memcpy(pixel, read, 2); + u32 result = build_luminance(pixel[0], pixel[1]); + std::memcpy(write, &result, 4); +} + +inline void intensity_alpha_nibbles_pass(u8* read, u8* write) { + alignas(4) u8 pixel[2]; + std::memcpy(pixel, read, 1); + u16 tmp = convert_nibbles(pixel[0]); + std::memcpy(pixel, &tmp, 2); + u32 result = build_luminance(pixel[0], pixel[1]); + std::memcpy(write, &result, 4); +} + +inline void intensity_pass(u8* read, u8* write) { + alignas(4) u8 pixel[1]; + std::memcpy(pixel, read, 1); + u32 result = build_luminance(pixel[0], 255); + std::memcpy(write, &result, 4); +} + +inline void intensity_nibbles_pass(u8* read, u8* write) { + alignas(4) u8 pixel[2]; + std::memcpy(pixel, read, 1); + u16 tmp = convert_nibbles(pixel[0]); + std::memcpy(pixel, &tmp, 2); + u32 result = build_luminance(pixel[0], 255); + std::memcpy(write, &result, 4); + result = build_luminance(pixel[1], 255); + std::memcpy(write + 4, &result, 4); +} + +inline void alpha_pass(u8* read, u8* write) { + alignas(4) u8 pixel[1]; + std::memcpy(pixel, read, 1); + u32 result = build_luminance(0, pixel[0]); + std::memcpy(write, &result, 4); +} + +inline void alpha_nibbles_pass(u8* read, u8* write) { + alignas(4) u8 pixel[2]; + std::memcpy(pixel, read, 1); + u16 tmp = convert_nibbles(pixel[0]); + std::memcpy(pixel, &tmp, 2); + u32 result = build_luminance(0, pixel[0]); + std::memcpy(write, &result, 4); + result = build_luminance(0, pixel[1]); + std::memcpy(write + 4, &result, 4); +} + +} // Anonymous + +void IA8Codec::decode() { + super::decode(); + if (this->raw_RGBA) + 
image_pass<&intensity_alpha_pass, 4, 8>( + // clang-format off + this->passing_buffer, this->width, this->height + // clang-format on + ); +} + +void IA4Codec::decode() { + super::decode(); + if (this->raw_RGBA) + image_pass<&intensity_alpha_nibbles_pass, 2, 8>( + // clang-format off + this->passing_buffer, this->width, this->height + // clang-format on + ); +} + +void I8Codec::decode() { + super::decode(); + if (this->raw_RGBA) + image_pass<&intensity_pass, 2, 8>( + // clang-format off + this->passing_buffer, this->width, this->height + // clang-format on + ); +} + +void I4Codec::decode() { + super::decode(); + if (this->raw_RGBA) + image_pass<&intensity_nibbles_pass, 1, 8>( + // clang-format off + this->passing_buffer, this->width, this->height + // clang-format on + ); +} + +void A8Codec::decode() { + super::decode(); + if (this->raw_RGBA) + image_pass<&alpha_pass, 2, 8>( + // clang-format off + this->passing_buffer, this->width, this->height + // clang-format on + ); +} + +void A4Codec::decode() { + super::decode(); + if (this->raw_RGBA) + image_pass<&alpha_nibbles_pass, 1, 8>( + // clang-format off + this->passing_buffer, this->width, this->height + // clang-format on + ); +} + +void ETC1Codec::decode() { + this->init(true); + ETC1(this->target_buffer, this->passing_buffer, this->width, this->height); +} + +void ETC1A4Codec::decode() { + this->init(true); + ETC1A4(this->target_buffer, this->passing_buffer, this->width, this->height); +} + +namespace { + +inline void expand_depth16_pass(u8* read, u8* write) { + alignas(4) u8 pixel[4]; + std::memcpy(pixel, read, 2); + pixel[2] = 255; + pixel[3] = 255; + std::memcpy(write, pixel, 4); +} + +inline void expand_depth24_pass(u8* read, u8* write) { + alignas(4) u8 pixel[4]; + std::memcpy(pixel, read, 3); + pixel[3] = 255; + std::memcpy(write, pixel, 4); +} + +inline void fix_stencil_pass(u8* read, u8* write) { + u32 pixel; + std::memcpy(&pixel, read, 4); + pixel = (pixel << 8) | (pixel >> 24); + std::memcpy(write, 
&pixel, 4); +} + +} // Anonymous + +void D16Codec::decode() { + super::decode(); + if (this->raw_RGBA) + image_pass<&expand_depth16_pass, 4, 8>( + // clang-format off + this->passing_buffer, this->width, this->height + // clang-format on + ); +} + +void D24Codec::decode() { + super::decode(); + if (this->raw_RGBA) + image_pass<&expand_depth24_pass, 6, 8>( + // clang-format off + this->passing_buffer, this->width, this->height + // clang-format on + ); +} + +void D24S8Codec::decode() { + super::decode(); + if (this->raw_RGBA) + image_pass<&fix_stencil_pass, 8, 8, 8>( + // clang-format off + this->passing_buffer, this->width, this->height + // clang-format on + ); +} diff --git a/src/video_core/texture/internal/encoders.cpp b/src/video_core/texture/internal/encoders.cpp new file mode 100644 index 000000000..0844bb737 --- /dev/null +++ b/src/video_core/texture/internal/encoders.cpp @@ -0,0 +1,109 @@ + +void RGBACodec::encode() { + super::encode(); +} + +void RGBCodec::encode() { + super::encode(); +} + +void RGB5A1Codec::encode() { + super::encode(); +} + +void RGB565Codec::encode() { + super::encode(); +} + +void RGBA4Codec::encode() { + super::encode(); +} + +void RG8Codec::encode() { + super::encode(); +} + +void IA8Codec::encode() { + super::encode(); +} + +void IA4Codec::encode() { + super::encode(); +} + +void I8Codec::encode() { + super::encode(); +} + +void I4Codec::encode() { + super::encode(); +} + +void A8Codec::encode() { + super::encode(); +} + +void A4Codec::encode() { + super::encode(); +} + +void ETC1Codec::encode() { + super::encode(); +} + +void ETC1A4Codec::encode() { + super::encode(); +} + +namespace Encode { + +inline void contract_depth16_pass(u8* read, u8* write) { + alignas(4) u8 pixel[4]; + std::memcpy(pixel, read, 4); + std::memcpy(write, pixel, 2); +} + +inline void contract_depth24_pass(u8* read, u8* write) { + alignas(4) u8 pixel[4]; + std::memcpy(pixel, read, 4); + std::memcpy(write, pixel, 3); +} + +inline void fix_stencil_pass(u8* 
read, u8* write) { + u32 pixel; + std::memcpy(&pixel, read, 4); + pixel = (pixel >> 24) | (pixel << 8); + std::memcpy(write, &pixel, 4); +} + +} // namespace Encode + +void D16Codec::encode() { + super::encode(); + if (this->raw_RGBA) + image_pass<&Encode::contract_depth16_pass, 8, 4, 8>( + // clang-format off + this->passing_buffer, this->width, this->height + // clang-format on + ); +} + +void D24Codec::encode() { + super::encode(); + if (this->raw_RGBA) + image_pass<&Encode::contract_depth24_pass, 8, 6>( + // clang-format off + this->passing_buffer, this->width, this->height + // clang-format on + ); +} + +void D24S8Codec::encode() { + super::encode(); + if (this->raw_RGBA) + image_pass<&Encode::fix_stencil_pass, 8, 8, 8>( + // clang-format off + this->passing_buffer, this->width, this->height + // clang-format on + ); +} diff --git a/src/video_core/texture/internal/etc1.cpp b/src/video_core/texture/internal/etc1.cpp new file mode 100644 index 000000000..a20dee6d5 --- /dev/null +++ b/src/video_core/texture/internal/etc1.cpp @@ -0,0 +1,187 @@ +#include +#include +#include +#include +#include "common/assert.h" +#include "common/bit_field.h" +#include "common/color.h" +#include "common/common_types.h" +#include "common/math_util.h" +#include "common/swap.h" +#include "common/vector_math.h" +#include "etc1.h" +#include "texture_utils.h" + +constexpr std::array etc1_modifier_table = {{ + {2, 8}, {5, 17}, {9, 29}, {13, 42}, {18, 60}, {24, 80}, {33, 106}, {47, 183}, +}}; + +namespace { + +union ETC1Tile { + u64 raw; + + // Each of these two is a collection of 16 bits (one per lookup value) + BitField<0, 16, u64> table_subindexes; + BitField<16, 16, u64> negation_flags; + + unsigned GetTableSubIndex(unsigned index) const { + return (table_subindexes >> index) & 1; + } + + bool GetNegationFlag(unsigned index) const { + return ((negation_flags >> index) & 1) == 1; + } + + BitField<32, 1, u64> flip; + BitField<33, 1, u64> differential_mode; + + BitField<34, 3, u64> table_index_2; 
+ BitField<37, 3, u64> table_index_1; + + union { + // delta value + base value + BitField<40, 3, s64> db; + BitField<43, 5, u64> b; + + BitField<48, 3, s64> dg; + BitField<51, 5, u64> g; + + BitField<56, 3, s64> dr; + BitField<59, 5, u64> r; + } differential; + + union { + BitField<40, 4, u64> b2; + BitField<44, 4, u64> b1; + + BitField<48, 4, u64> g2; + BitField<52, 4, u64> g1; + + BitField<56, 4, u64> r2; + BitField<60, 4, u64> r1; + } separate; + + const Math::Vec3 GetRGB(u32 x, u32 y) const { + int texel = 4 * x + y; + + if (flip) + std::swap(x, y); + + // Lookup base value + Math::Vec3 ret; + if (differential_mode) { + ret.r() = static_cast(differential.r); + ret.g() = static_cast(differential.g); + ret.b() = static_cast(differential.b); + if (x >= 2) { + ret.r() += static_cast(differential.dr); + ret.g() += static_cast(differential.dg); + ret.b() += static_cast(differential.db); + } + ret.r() = Color::Convert5To8(ret.r()); + ret.g() = Color::Convert5To8(ret.g()); + ret.b() = Color::Convert5To8(ret.b()); + } else { + if (x < 2) { + ret.r() = Color::Convert4To8(static_cast(separate.r1)); + ret.g() = Color::Convert4To8(static_cast(separate.g1)); + ret.b() = Color::Convert4To8(static_cast(separate.b1)); + } else { + ret.r() = Color::Convert4To8(static_cast(separate.r2)); + ret.g() = Color::Convert4To8(static_cast(separate.g2)); + ret.b() = Color::Convert4To8(static_cast(separate.b2)); + } + } + + // Add modifier + unsigned table_index = + static_cast((x < 2) ? 
table_index_1.Value() : table_index_2.Value()); + + int modifier = etc1_modifier_table[table_index][GetTableSubIndex(texel)]; + if (GetNegationFlag(texel)) + modifier *= -1; + + ret.r() = MathUtil::Clamp(ret.r() + modifier, 0, 255); + ret.g() = MathUtil::Clamp(ret.g() + modifier, 0, 255); + ret.b() = MathUtil::Clamp(ret.b() + modifier, 0, 255); + + return ret.Cast(); + } +}; + +} // anonymous namespace + +inline void etc1_pass(u8* etc1_buffer, u8* linear_buffer, u32 x_blocks) { + const size_t line = 8 * 4; + alignas(64) u8 tmp[line * 8]; + for (u32 i = 0; i < 4; i++) { + ETC1Tile tile; + const size_t index = (i % 2) * (line / 2) + (i / 2) * line * 4; + std::memcpy(&tile.raw, &etc1_buffer[i * 8], 8); + for (u32 k = 0; k < 4; k++) { + for (u32 j = 0; j < 4; j++) { + u32 rgba = (tile.GetRGB(j, k).ToRGB()) | 0xFF000000; + std::memcpy(&tmp[k * line + j * 4 + index], &rgba, 4); + } + } + } + tiling_pass<&decode, 8, 8>(linear_buffer, tmp, x_blocks); +} + +inline void etc1a4_pass(u8* etc1_buffer, u8* linear_buffer, u32 x_blocks) { + const size_t line = 8 * 4; + alignas(64) u8 tmp[line * 8]; + for (u32 i = 0; i < 4; i++) { + ETC1Tile tile; + u64 alpha_tile; + const size_t index = (i % 2) * (line / 2) + (i / 2) * line * 4; + std::memcpy(&alpha_tile, &etc1_buffer[i * 16], 8); + std::memcpy(&tile.raw, &etc1_buffer[i * 16 + 8], 8); + for (u32 k = 0; k < 4; k++) { + for (u32 j = 0; j < 4; j++) { + u32 alpha = (alpha_tile >> (4 * (j * 4 + k))) & 0x0F; + alpha |= (alpha << 4); + u32 rgba = tile.GetRGB(j, k).ToRGB() | (alpha << 24); + std::memcpy(&tmp[k * line + j * 4 + index], &rgba, 4); + } + } + } + tiling_pass<&decode, 8, 8>(linear_buffer, tmp, x_blocks); +} + +void ETC1A4(u8* etc1_buffer, u8* matrix_buffer, u32 width, u32 height) { + const u32 x_blocks = (width / 8); + const u32 y_blocks = (height / 8); + const size_t line_size = 8 * 4; + const size_t tile_size = 8 * 8; + const size_t stride_size = width * line_size; + matrix_buffer = matrix_buffer + (height * width * 4) - 
stride_size; + for (u32 y = 0; y < y_blocks; y++) { + u8* linear_buffer = matrix_buffer; + for (u32 x = 0; x != x_blocks; x++) { + etc1a4_pass(etc1_buffer, linear_buffer, x_blocks); + linear_buffer += line_size; + etc1_buffer += tile_size; + } + matrix_buffer -= stride_size; + } +} + +void ETC1(u8* etc1_buffer, u8* matrix_buffer, u32 width, u32 height) { + const u32 x_blocks = (width / 8); + const u32 y_blocks = (height / 8); + const size_t line_size = 8 * 4; + const size_t tile_size = 8 * 8 / 2; + const size_t stride_size = width * line_size; + matrix_buffer = matrix_buffer + (height * width * 4) - stride_size; + for (u32 y = 0; y < y_blocks; y++) { + u8* linear_buffer = matrix_buffer; + for (u32 x = 0; x != x_blocks; x++) { + etc1_pass(etc1_buffer, linear_buffer, x_blocks); + linear_buffer += line_size; + etc1_buffer += tile_size; + } + matrix_buffer -= stride_size; + } +} diff --git a/src/video_core/texture/internal/etc1.h b/src/video_core/texture/internal/etc1.h new file mode 100644 index 000000000..fa4535da2 --- /dev/null +++ b/src/video_core/texture/internal/etc1.h @@ -0,0 +1,7 @@ + +#include "common/common_types.h" + +#pragma once + +void ETC1(u8* etc1_buffer, u8* matrix_buffer, u32 width, u32 height); +void ETC1A4(u8* etc1_buffer, u8* matrix_buffer, u32 width, u32 height); diff --git a/src/video_core/texture/internal/morton.cpp b/src/video_core/texture/internal/morton.cpp new file mode 100644 index 000000000..b50f4e34a --- /dev/null +++ b/src/video_core/texture/internal/morton.cpp @@ -0,0 +1,278 @@ +#include +#include +#include +#include "common/common_types.h" +#include "morton.h" +#include "texture_utils.h" + +/////////////////////////////////////////////////////////////////////////////// +// Optimizations +////////////////////////////////////////////////////////////////////////////// +#ifdef _MSC_VER +#pragma inline_recursion(on) +// Normally set to 16 by default, the best balance seems to be on 8 for this module +#pragma inline_depth(8) +// favor fast 
code over small code. +#pragma optimize("t", on) +#pragma intrinsic(memcpy) +#elif defined(CLANG_OR_GCC) +// The next 3 will swizzle memory copying to help find the best sse/avx shuffling +// in case it's possible. Compilation tests have proven effective use of these +// flags on gcc and clang. +#pragma GCC optimize("-fpredictive-commoning") +#pragma GCC optimize("-ftree-loop-distribute-patterns") +#pragma GCC optimize("-ftree-vectorize") +// limit inlining +#pragma GCC option("--param max-inline-insns-single=128") + +// The beauty of these compiler options is that they generate better code than +// hand written intrinsics, since inline expanding memory transfers can be pattern +// matched with vector instructions available in the target. +#endif + +#pragma region Z_Order +///////////////////////////////////////////////////////////////////////////// +// Z-Order: +// +// 0-->1 +// / +// 2-->3 +// +// for more information look at: https://en.wikipedia.org/wiki/Z-order_curve +///////////////////////////////////////////////////////////////////////////// +#define TOP_LEFT 0 +#define TOP_RIGHT 1 +#define BOTTOM_LEFT 2 +#define BOTTOM_RIGHT 3 + +constexpr u32 isRight(u32 block_index) { + return (block_index % 2); +} + +constexpr u32 isBottom(u32 block_index) { + return (block_index / 2); +} + +template +inline void swizzle_block(u8*& morton_block, u8* linear_block); + +template +inline void swizzle_block_aux(u8*& morton_block, u8* linear_block) { + // move the linear_block pointer to the appropriate block + const size_t right = isRight(block_index) * (blocks * nibbles) / 2; + const size_t down = isBottom(block_index) * block_size; + u8* new_linear = linear_block + right + down; + swizzle_block(morton_block, new_linear); +} + +template +inline void swizzle_block(u8*& morton_block, u8* linear_block) { + const size_t new_block_size = block_size / 2; + if (blocks <= 2) { + // We handle 2*2 blocks on z-order + const size_t read_size = nibbles; // just for clearness. 
It's the same amount + // TOP_LEFT & TOP_RIGHT + codec(morton_block, linear_block, read_size); + morton_block += read_size; + // BOTTOM_LEFT & BOTTOM_RIGHT + codec(morton_block, linear_block + new_block_size, read_size); + morton_block += read_size; + } else { + // we divide the block into 4 blocks in z-order corecursively + // until we have 2x2 blocks. + const u32 subdivide = blocks / 2; + swizzle_block_aux(morton_block, + linear_block); + swizzle_block_aux(morton_block, + linear_block); + swizzle_block_aux(morton_block, + linear_block); + swizzle_block_aux(morton_block, + linear_block); + } +} + +/* swizzle_pass: computes block_size, the byte size of one lines_per_block x lines_per_block tile at nibbles half-bytes per pixel, and hands both buffers to swizzle_block. NOTE(review): the parameter list after the template keyword and the explicit template arguments of the swizzle_block call are missing in this copy of the patch - restore them from the original commit before building. */ template +void swizzle_pass(u8* morton_block, u8* linear_block) { + const size_t block_size = (lines_per_block * lines_per_block * nibbles) / 2; + swizzle_block(morton_block, linear_block); +} +#pragma endregion Z_Order + +/* encode_pass: converts a single tile from linear to morton layout. tiling_pass<&encode> first gathers the tile rows out of linear_buffer into the 64-byte aligned scratch tile tmp, then swizzle_pass<&encode> is handed morton_buffer and tmp (presumably writing tmp out in Z-order - swizzle_block is not fully visible here, confirm). x_blocks is forwarded to tiling_pass, which uses it to compute the byte length of one image row. */ template +void encode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) { + const u32 tile_size = (lines_per_block * lines_per_block * nibbles) / 2; + alignas(64) u8 tmp[tile_size]; + tiling_pass<&encode, nibbles, lines_per_block>(linear_buffer, tmp, x_blocks); + swizzle_pass<&encode, nibbles, lines_per_block>(morton_buffer, tmp); +} + +/* decode_pass: mirror image of encode_pass. swizzle_pass<&decode> runs first on morton_buffer and the scratch tile tmp, then tiling_pass<&decode> scatters the rows of tmp into linear_buffer. */ template +void decode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) { + const u32 tile_size = (lines_per_block * lines_per_block * nibbles) / 2; + alignas(64) u8 tmp[tile_size]; + swizzle_pass<&decode, nibbles, lines_per_block>(morton_buffer, tmp); + tiling_pass<&decode, nibbles, lines_per_block>(linear_buffer, tmp, x_blocks); +} + +/* morton_pass: calls codec once for every lines_per_block x lines_per_block tile of a width x height image. morton_buffer advances by one tile (tile_size bytes) per call. matrix_buffer is first repositioned to the start of the last band of lines_per_block image rows and then moved back by one band (stride_size bytes) per outer iteration, so the linear image is visited last band first; within a band linear_buffer steps by line_size, the byte length of one tile row. x_blocks and y_blocks use integer division, so any part of width or height beyond a multiple of lines_per_block is left untouched. */ template +void morton_pass(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height) { + const u32 x_blocks = (width / lines_per_block); + const u32 y_blocks = (height / lines_per_block); + const size_t line_size = (lines_per_block * nibbles) / 2; + const size_t tile_size = lines_per_block * line_size; + const size_t stride_size = width * line_size; + matrix_buffer = matrix_buffer + ((height * width * nibbles) / 2) - stride_size; + for (u32 y = 0; y < y_blocks; y++) { + 
u8* linear_buffer = matrix_buffer; + for (u32 x = 0; x != x_blocks; x++) { + codec(morton_buffer, linear_buffer, x_blocks); + linear_buffer += line_size; + morton_buffer += tile_size; + } + matrix_buffer -= stride_size; + } +} + +/* Decoders: morton to linear entry points. Each function switches on bpp (bits per pixel) and instantiates morton_pass with decode_pass and nibbles = bpp / 4 for its tile edge (8 or 32). It returns true after converting, or false without calling morton_pass when bpp is not 4, 8, 16, 24 or 32. The break after every return is unreachable. */ namespace Decoders { + +bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) { + switch (bpp) { + case 4: { + morton_pass<&decode_pass<1, 8>, 1, 8>(morton_buffer, matrix_buffer, width, height); + return true; + break; + } + case 8: { + morton_pass<&decode_pass<2, 8>, 2, 8>(morton_buffer, matrix_buffer, width, height); + return true; + break; + } + case 16: { + morton_pass<&decode_pass<4, 8>, 4, 8>(morton_buffer, matrix_buffer, width, height); + return true; + break; + } + case 24: { + morton_pass<&decode_pass<6, 8>, 6, 8>(morton_buffer, matrix_buffer, width, height); + return true; + break; + } + case 32: { + morton_pass<&decode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height); + return true; + break; + } + default: { + return false; + break; + } + } +} + +bool Morton_32x32(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) { + switch (bpp) { + case 4: { + morton_pass<&decode_pass<1, 32>, 1, 32>(morton_buffer, matrix_buffer, width, height); + return true; + break; + } + case 8: { + morton_pass<&decode_pass<2, 32>, 2, 32>(morton_buffer, matrix_buffer, width, height); + return true; + break; + } + case 16: { + morton_pass<&decode_pass<4, 32>, 4, 32>(morton_buffer, matrix_buffer, width, height); + return true; + break; + } + case 24: { + morton_pass<&decode_pass<6, 32>, 6, 32>(morton_buffer, matrix_buffer, width, height); + return true; + break; + } + case 32: { + morton_pass<&decode_pass<8, 32>, 8, 32>(morton_buffer, matrix_buffer, width, height); + return true; + break; + } + default: { + return false; + break; + } + } +} +} + +/* Encoders: same bpp dispatch as Decoders above, but instantiating morton_pass with encode_pass (linear to morton). */ namespace Encoders { + +bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) { + switch (bpp) { + case 4: { + 
morton_pass<&encode_pass<1, 8>, 1, 8>(morton_buffer, matrix_buffer, width, height); + return true; + break; + } + case 8: { + morton_pass<&encode_pass<2, 8>, 2, 8>(morton_buffer, matrix_buffer, width, height); + return true; + break; + } + case 16: { + morton_pass<&encode_pass<4, 8>, 4, 8>(morton_buffer, matrix_buffer, width, height); + return true; + break; + } + case 24: { + morton_pass<&encode_pass<6, 8>, 6, 8>(morton_buffer, matrix_buffer, width, height); + return true; + break; + } + case 32: { + morton_pass<&encode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height); + return true; + break; + } + default: { + return false; + break; + } + } +} + +bool Morton_32x32(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) { + switch (bpp) { + case 4: { + morton_pass<&encode_pass<1, 32>, 1, 32>(morton_buffer, matrix_buffer, width, height); + return true; + break; + } + case 8: { + morton_pass<&encode_pass<2, 32>, 2, 32>(morton_buffer, matrix_buffer, width, height); + return true; + break; + } + case 16: { + morton_pass<&encode_pass<4, 32>, 4, 32>(morton_buffer, matrix_buffer, width, height); + return true; + break; + } + case 24: { + morton_pass<&encode_pass<6, 32>, 6, 32>(morton_buffer, matrix_buffer, width, height); + return true; + break; + } + case 32: { + morton_pass<&encode_pass<8, 32>, 8, 32>(morton_buffer, matrix_buffer, width, height); + return true; + break; + } + default: { + return false; + break; + } + } +} +} diff --git a/src/video_core/texture/internal/morton.h b/src/video_core/texture/internal/morton.h new file mode 100644 index 000000000..36879ecb4 --- /dev/null +++ b/src/video_core/texture/internal/morton.h @@ -0,0 +1,15 @@ +#include "common/common_types.h" + +#pragma once + +enum class MortonPass { Tile8x8, Tile32x32 }; + +/* Declarations of the morton to linear entry points for 8x8 and 32x32 tiles. bpp is in bits per pixel; the definitions return false for any bpp other than 4, 8, 16, 24 or 32. */ namespace Decoders { +bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp); +bool Morton_32x32(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp); +} 
+ +/* Declarations of the linear to morton entry points for 8x8 and 32x32 tiles; same parameters and bool result as the Decoders declarations. */ namespace Encoders { +bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp); +bool Morton_32x32(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp); +} diff --git a/src/video_core/texture/internal/texture_utils.h b/src/video_core/texture/internal/texture_utils.h new file mode 100644 index 000000000..ecd7a557b --- /dev/null +++ b/src/video_core/texture/internal/texture_utils.h @@ -0,0 +1,98 @@ +/* NOTE(review): the four bracketed include targets below are missing in this copy of the patch; the code that follows needs at least the header declaring std::memcpy and size_t - restore from the original commit. */ #include +#include +#include +#include +#include "common/color.h" +#include "common/swap.h" + +#pragma once + +#if ((defined(__clang__) || defined(__GNUC__)) && !defined(__INTEL_COMPILER)) +#define CLANG_OR_GCC +#endif + +/////////////////////////////////////////////////////////////////////////////// +// Optimizations +////////////////////////////////////////////////////////////////////////////// +/* NOTE(review): these pragmas live in a header, so they presumably affect every function that follows in any translation unit including it; CLANG_OR_GCC is also defined for clang, which may not implement #pragma GCC optimize - confirm. */ #ifdef _MSC_VER +#pragma inline_recursion(on) +#elif defined(CLANG_OR_GCC) +#pragma GCC optimize("-fpeel-loops") +#pragma GCC optimize("-fpredictive-commoning") +#pragma GCC optimize("-ftree-loop-distribute-patterns") +#pragma GCC optimize("-ftree-vectorize") +#endif + +// @param read_size number of bytes to copy from morton_pointer to matrix_pointer (tiling_pass passes the byte length of one tile row, not of one pixel) +inline void decode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) { + std::memcpy(matrix_pointer, morton_pointer, read_size); +} + +// @param read_size number of bytes to copy from matrix_pointer to morton_pointer (tiling_pass passes the byte length of one tile row, not of one pixel) +inline void encode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) { + std::memcpy(morton_pointer, matrix_pointer, read_size); +} + +/* image_pass_aux_rev: in-place per-element conversion over target, walking from the last element to the first. pass(read, write) is invoked once per step with both pointers inside target; afterwards read and write step back by read_size / 2 and write_size / 2 bytes, or by the full sizes when read_size < 2, in which case the element count is halved. The sizes therefore look like nibble counts - TODO confirm. The loops execute (pixels / tuning) * tuning steps, so tuning is assumed to divide the element count. NOTE(review): the template parameter list is missing in this copy of the patch. */ // Pre: width % 8 == 0 && height % 8 == 0 +template +inline void image_pass_aux_rev(u8* target, u32 width, u32 height) { + const u32 nibbles = (read_size < 2) & 0x01; + const u32 pixels = width * height / (1 + nibbles); + const u32 read_size_amortized = read_size / (2 - nibbles); + const u32 write_size_amortized = write_size / (2 - nibbles); + const u32 sub_iters = tuning; + const u32 iters = pixels / sub_iters; + u8* read = target + (pixels - 1) * 
read_size_amortized; + u8* write = target + (pixels - 1) * write_size_amortized; + for (u32 i = 0; i < iters; i++) { + // Sub_iterations allow the compiler to know a set of inner + // iterations within compile time, thus it can do better optimizations. + for (u32 k = 0; k < sub_iters; k++) { + pass(read, write); + read -= read_size_amortized; + write -= write_size_amortized; + } + } +} + +/* image_pass_aux: forward counterpart of image_pass_aux_rev. read and write both start at target and advance after every pass(read, write) call; the halving flag is derived from write_size < 2 here rather than from read_size. The same assumption applies that tuning divides the element count. */ // Pre: width % 8 == 0 && height % 8 == 0 +template +inline void image_pass_aux(u8* target, u32 width, u32 height) { + const u32 nibbles = (write_size < 2) & 0x01; + const u32 pixels = width * height / (1 + nibbles); + const u32 read_size_amortized = read_size / (2 - nibbles); + const u32 write_size_amortized = write_size / (2 - nibbles); + const u32 sub_iters = tuning; + const u32 iters = pixels / sub_iters; + u8* read = target; + u8* write = target; + for (u32 i = 0; i < iters; i++) { + // Sub_iterations allow the compiler to know a set of inner + // iterations within compile time, thus it can do better optimizations. + for (u32 k = 0; k < sub_iters; k++) { + pass(read, write); + read += read_size_amortized; + write += write_size_amortized; + } + } +} + +/* image_pass: selects the forward pass when elements shrink (read_size > write_size) and the reverse pass otherwise, presumably so that an in-place conversion never overwrites input it has not read yet. NOTE(review): as written, both branches only name image_pass_aux / image_pass_aux_rev without template arguments or a (target, width, height) call, so this function has no effect - compare with the original commit and add the call. */ template +inline void image_pass(u8* target, u32 width, u32 height) { + if (read_size > write_size) + image_pass_aux; + else + image_pass_aux_rev; +} + +/* tiling_pass: moves the lines_per_block rows of one tile between the linear image and a packed tile buffer. Row i of tiled pairs with row (lines_per_block - 1 - i) of the tile inside linear, i.e. the rows are flipped vertically. codec receives (tiled row, linear row, bytes per tile row), so encode fills tiled from linear and decode fills linear from tiled. row_length is the byte length of one full image row of x_blocks tiles. */ template +void tiling_pass(u8* linear, u8* tiled, u32 x_blocks) { + const size_t tiled_line_size = (lines_per_block * nibbles) / 2; + const size_t row_length = x_blocks * tiled_line_size; + for (u32 i = 0; i < lines_per_block; i++) { + const u32 k = (lines_per_block - 1 - i); + const size_t tiled_index = i * tiled_line_size; + const size_t linear_index = k * row_length; + codec(tiled + tiled_index, linear + linear_index, tiled_line_size); + } +}