mirror of
https://github.com/citra-emu/citra.git
synced 2024-11-25 23:40:14 +00:00
Fixes, cleaning and small refactors
This commit is contained in:
parent
1c6965f106
commit
bed6207ac7
@ -1,3 +1,4 @@
|
||||
|
||||
set(SRCS
|
||||
renderer_opengl/gl_rasterizer.cpp
|
||||
renderer_opengl/gl_rasterizer_cache.cpp
|
||||
|
@ -26,6 +26,9 @@
|
||||
#include "video_core/utils.h"
|
||||
#include "video_core/video_core.h"
|
||||
|
||||
#define TEXTURE_CACHE_SIZE (1024 * 1024 * 8) // 8MB inner cache for decoding/encoding
|
||||
alignas(64) static u8 TextureCache[TEXTURE_CACHE_SIZE];
|
||||
|
||||
struct FormatTuple {
|
||||
GLint internal_format;
|
||||
GLenum format;
|
||||
@ -39,7 +42,7 @@ static const std::array<FormatTuple, 18> format_tuples = {{
|
||||
{GL_RGB565, GL_RGB, GL_UNSIGNED_SHORT_5_6_5}, // RGB565
|
||||
{GL_RGBA4, GL_RGBA, GL_UNSIGNED_SHORT_4_4_4_4}, // RGBA4
|
||||
{GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // IA8
|
||||
{GL_RG8, GL_RG8, GL_UNSIGNED_BYTE}, // RG8
|
||||
{GL_RG8, GL_RG, GL_UNSIGNED_BYTE}, // RG8
|
||||
{GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // I8
|
||||
{GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // A8
|
||||
{GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // IA4
|
||||
@ -70,7 +73,7 @@ static const std::array<bool, 18> native_format = {
|
||||
false, // ETC1A4
|
||||
true, // D16
|
||||
false,
|
||||
false, // D24
|
||||
true, // D24
|
||||
false, // D24S8
|
||||
};
|
||||
|
||||
@ -306,26 +309,29 @@ CachedSurface* RasterizerCacheOpenGL::GetSurface(const CachedSurface& params, bo
|
||||
// clang-format on
|
||||
);
|
||||
Pica::Texture::Codec* codec = tmp.get();
|
||||
codec->configTiling(true, 8); // change 8 for 32 in case the mage is tiled
|
||||
codec->configTiling(true, 8); // change 8 for 32 in case the image is tiled
|
||||
// on blocks of 32x32
|
||||
codec->configRGBATransform(!native_format[(unsigned int)params.pixel_format]);
|
||||
codec->validate();
|
||||
if (!codec->invalid()) {
|
||||
codec->decode();
|
||||
std::unique_ptr<u8[]> decoded_texture = codec->transferInternalBuffer();
|
||||
u32 bytes = codec->getInternalBytesPerPixel();
|
||||
if (bytes == 3)
|
||||
bytes = 1;
|
||||
else if (bytes != 2)
|
||||
bytes = 4;
|
||||
glPixelStorei(GL_UNPACK_ALIGNMENT, bytes);
|
||||
glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height,
|
||||
0, tuple.format, tuple.type, decoded_texture.get());
|
||||
glPixelStorei(GL_UNPACK_ALIGNMENT, 4);
|
||||
u32 estimated_size =
|
||||
params.width * params.height * codec->getInternalBytesPerPixel();
|
||||
if (estimated_size <= TEXTURE_CACHE_SIZE) {
|
||||
codec->setExternalBuffer(TextureCache);
|
||||
codec->decode();
|
||||
glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width,
|
||||
params.height, 0, tuple.format, tuple.type, TextureCache);
|
||||
} else {
|
||||
codec->decode();
|
||||
std::unique_ptr<u8[]> decoded_texture = codec->transferInternalBuffer();
|
||||
glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width,
|
||||
params.height, 0, tuple.format, tuple.type, decoded_texture.get());
|
||||
}
|
||||
} else {
|
||||
LOG_WARNING(Render_OpenGL,
|
||||
"Invalid texture sent to renderer; width: %d height %d type: %d",
|
||||
params.width, params.height, (unsigned int)params.pixel_format);
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
// If not 1x scale, blit 1x texture to a new scaled texture and replace texture in surface
|
||||
@ -652,15 +658,22 @@ void RasterizerCacheOpenGL::FlushSurface(CachedSurface* surface) {
|
||||
glPixelStorei(GL_PACK_ROW_LENGTH, 0);
|
||||
} else {
|
||||
const FormatTuple& tuple = format_tuples[(u32)surface->pixel_format];
|
||||
u32 bytes_per_pixel = Pica::Texture::Format::GetBpp(surface->pixel_format) / 8;
|
||||
u32 bits_per_pixel = Pica::Texture::Format::GetBpp(surface->pixel_format);
|
||||
if (!native_format[(u32)surface->pixel_format])
|
||||
bytes_per_pixel = 4;
|
||||
std::vector<u8> temp_gl_buffer(surface->width * surface->height * bytes_per_pixel);
|
||||
glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, temp_gl_buffer.data());
|
||||
|
||||
bits_per_pixel = 32;
|
||||
u32 size = surface->width * surface->height * bits_per_pixel / 8;
|
||||
std::vector<u8> temp_gl_buffer;
|
||||
u8* temporal_buffer;
|
||||
if (size <= TEXTURE_CACHE_SIZE)
|
||||
temporal_buffer = TextureCache;
|
||||
else {
|
||||
temp_gl_buffer.resize(size);
|
||||
temporal_buffer = temp_gl_buffer.data();
|
||||
}
|
||||
glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, temporal_buffer);
|
||||
std::unique_ptr<Pica::Texture::Codec> tmp = Pica::Texture::CodecFactory::build(
|
||||
// clang-format off
|
||||
surface->pixel_format, temp_gl_buffer.data(), surface->width, surface->height
|
||||
surface->pixel_format, temporal_buffer, surface->width, surface->height
|
||||
// clang-format on
|
||||
);
|
||||
Pica::Texture::Codec* codec = tmp.get();
|
||||
|
@ -67,7 +67,7 @@ void Codec::init(bool decode) {
|
||||
this->expected_nibbles_size = this->start_nibbles_size;
|
||||
}
|
||||
this->validate();
|
||||
if (!this->external_result_buffer || !this->invalid()) {
|
||||
if (!this->external_result_buffer) {
|
||||
size_t buff_size = this->width * this->height * this->expected_nibbles_size / 2;
|
||||
this->internal_buffer = std::make_unique<u8[]>(buff_size);
|
||||
this->passing_buffer = this->internal_buffer.get();
|
||||
@ -91,7 +91,7 @@ void Codec::validate() {
|
||||
this->invalid_state = true;
|
||||
return;
|
||||
}
|
||||
if (this->morton && this->morton_pass_tiling != 8 && this->morton_pass_tiling != 32) {
|
||||
if (this->morton && this->morton_pass_tiling != 8) {
|
||||
this->invalid_state = true;
|
||||
return;
|
||||
}
|
||||
@ -102,18 +102,12 @@ inline void Codec::decode_morton_pass() {
|
||||
if (this->morton_pass_tiling == 8)
|
||||
Decoders::Morton_8x8(this->target_buffer, this->passing_buffer, this->width, this->height,
|
||||
this->start_nibbles_size * 4);
|
||||
else if (this->morton_pass_tiling == 32)
|
||||
Decoders::Morton_32x32(this->target_buffer, this->passing_buffer, this->width, this->height,
|
||||
this->start_nibbles_size * 4);
|
||||
}
|
||||
|
||||
inline void Codec::encode_morton_pass() {
|
||||
if (this->morton_pass_tiling == 8)
|
||||
Encoders::Morton_8x8(this->target_buffer, this->passing_buffer, this->width, this->height,
|
||||
Encoders::Morton_8x8(this->passing_buffer, this->target_buffer, this->width, this->height,
|
||||
this->start_nibbles_size * 4);
|
||||
else if (this->morton_pass_tiling == 32)
|
||||
Encoders::Morton_32x32(this->target_buffer, this->passing_buffer, this->width, this->height,
|
||||
this->start_nibbles_size * 4);
|
||||
}
|
||||
|
||||
std::unique_ptr<Codec> CodecFactory::build(Format::Type format, u8* target, u32 width, u32 height) {
|
||||
|
@ -40,7 +40,7 @@ struct Format {
|
||||
Invalid = 255,
|
||||
};
|
||||
|
||||
static u32 GetBpp(Type format) {
|
||||
static const u32 GetBpp(Type format) {
|
||||
static const std::array<unsigned int, 18> bpp_table = {
|
||||
32, // RGBA8
|
||||
24, // RGB8
|
||||
@ -66,19 +66,19 @@ struct Format {
|
||||
return bpp_table[(u32)format];
|
||||
}
|
||||
|
||||
static Type FromTextureFormat(Regs::TextureFormat format) {
|
||||
static constexpr Type FromTextureFormat(Regs::TextureFormat format) {
|
||||
return ((unsigned int)format < 14) ? (Type)format : Type::Invalid;
|
||||
}
|
||||
|
||||
static Type FromColorFormat(Regs::ColorFormat format) {
|
||||
static constexpr Type FromColorFormat(Regs::ColorFormat format) {
|
||||
return ((unsigned int)format < 5) ? (Type)format : Type::Invalid;
|
||||
}
|
||||
|
||||
static Type FromDepthFormat(Regs::DepthFormat format) {
|
||||
static constexpr Type FromDepthFormat(Regs::DepthFormat format) {
|
||||
return ((unsigned int)format < 4) ? (Type)((unsigned int)format + 14) : Type::Invalid;
|
||||
}
|
||||
|
||||
static Type FromGPUPixelFormat(GPU::Regs::PixelFormat format) {
|
||||
static const Type FromGPUPixelFormat(GPU::Regs::PixelFormat format) {
|
||||
switch (format) {
|
||||
// RGB565 and RGB5A1 are switched in PixelFormat compared to ColorFormat
|
||||
case GPU::Regs::PixelFormat::RGB565:
|
||||
@ -92,6 +92,17 @@ struct Format {
|
||||
|
||||
}; // Format
|
||||
|
||||
struct Info {
|
||||
PAddr physical_address;
|
||||
int width;
|
||||
int height;
|
||||
int stride;
|
||||
Pica::Regs::TextureFormat format;
|
||||
|
||||
static Info FromPicaRegister(const Pica::Regs::TextureConfig& config,
|
||||
const Pica::Regs::TextureFormat& format);
|
||||
};
|
||||
|
||||
} // Texture
|
||||
|
||||
} // Pica
|
||||
|
@ -5,6 +5,7 @@
|
||||
#include <memory>
|
||||
#include "common/common_types.h"
|
||||
#include "video_core/texture/codec.h"
|
||||
#include "video_core/texture/formats.h"
|
||||
|
||||
// each texture format codec
|
||||
class RGBACodec : public Pica::Texture::Codec {
|
||||
@ -15,7 +16,11 @@ public:
|
||||
|
||||
protected:
|
||||
virtual void setSize() {
|
||||
this->start_nibbles_size = 8;
|
||||
// clang-format off
|
||||
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
|
||||
Pica::Texture::Format::Type::RGBA8
|
||||
) / 4;
|
||||
// clang-format on
|
||||
};
|
||||
};
|
||||
|
||||
@ -27,7 +32,11 @@ public:
|
||||
|
||||
protected:
|
||||
virtual void setSize() {
|
||||
this->start_nibbles_size = 6;
|
||||
// clang-format off
|
||||
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
|
||||
Pica::Texture::Format::Type::RGB8
|
||||
) / 4;
|
||||
// clang-format on
|
||||
};
|
||||
};
|
||||
|
||||
@ -39,7 +48,11 @@ public:
|
||||
|
||||
protected:
|
||||
virtual void setSize() {
|
||||
this->start_nibbles_size = 4;
|
||||
// clang-format off
|
||||
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
|
||||
Pica::Texture::Format::Type::RGB5A1
|
||||
) / 4;
|
||||
// clang-format on
|
||||
};
|
||||
};
|
||||
|
||||
@ -51,7 +64,11 @@ public:
|
||||
|
||||
protected:
|
||||
virtual void setSize() {
|
||||
this->start_nibbles_size = 4;
|
||||
// clang-format off
|
||||
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
|
||||
Pica::Texture::Format::Type::RGBA4
|
||||
) / 4;
|
||||
// clang-format on
|
||||
};
|
||||
};
|
||||
|
||||
@ -63,7 +80,11 @@ public:
|
||||
|
||||
protected:
|
||||
virtual void setSize() {
|
||||
this->start_nibbles_size = 4;
|
||||
// clang-format off
|
||||
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
|
||||
Pica::Texture::Format::Type::RGB565
|
||||
) / 4;
|
||||
// clang-format on
|
||||
};
|
||||
};
|
||||
|
||||
@ -75,7 +96,11 @@ public:
|
||||
|
||||
protected:
|
||||
virtual void setSize() {
|
||||
this->start_nibbles_size = 4;
|
||||
// clang-format off
|
||||
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
|
||||
Pica::Texture::Format::Type::RG8
|
||||
) / 4;
|
||||
// clang-format on
|
||||
};
|
||||
};
|
||||
|
||||
@ -87,7 +112,11 @@ public:
|
||||
|
||||
protected:
|
||||
virtual void setSize() {
|
||||
this->start_nibbles_size = 4;
|
||||
// clang-format off
|
||||
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
|
||||
Pica::Texture::Format::Type::IA8
|
||||
) / 4;
|
||||
// clang-format on
|
||||
};
|
||||
};
|
||||
|
||||
@ -99,7 +128,11 @@ public:
|
||||
|
||||
protected:
|
||||
virtual void setSize() {
|
||||
this->start_nibbles_size = 2;
|
||||
// clang-format off
|
||||
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
|
||||
Pica::Texture::Format::Type::I8
|
||||
) / 4;
|
||||
// clang-format on
|
||||
};
|
||||
};
|
||||
|
||||
@ -111,7 +144,11 @@ public:
|
||||
|
||||
protected:
|
||||
virtual void setSize() {
|
||||
this->start_nibbles_size = 2;
|
||||
// clang-format off
|
||||
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
|
||||
Pica::Texture::Format::Type::A8
|
||||
) / 4;
|
||||
// clang-format on
|
||||
};
|
||||
};
|
||||
|
||||
@ -123,7 +160,11 @@ public:
|
||||
|
||||
protected:
|
||||
virtual void setSize() {
|
||||
this->start_nibbles_size = 2;
|
||||
// clang-format off
|
||||
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
|
||||
Pica::Texture::Format::Type::IA4
|
||||
) / 4;
|
||||
// clang-format on
|
||||
};
|
||||
};
|
||||
|
||||
@ -135,7 +176,11 @@ public:
|
||||
|
||||
protected:
|
||||
virtual void setSize() {
|
||||
this->start_nibbles_size = 1;
|
||||
// clang-format off
|
||||
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
|
||||
Pica::Texture::Format::Type::I4
|
||||
) / 4;
|
||||
// clang-format on
|
||||
};
|
||||
};
|
||||
|
||||
@ -147,7 +192,11 @@ public:
|
||||
|
||||
protected:
|
||||
virtual void setSize() {
|
||||
this->start_nibbles_size = 1;
|
||||
// clang-format off
|
||||
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
|
||||
Pica::Texture::Format::Type::A4
|
||||
) / 4;
|
||||
// clang-format on
|
||||
};
|
||||
};
|
||||
|
||||
@ -159,7 +208,11 @@ public:
|
||||
|
||||
protected:
|
||||
virtual void setSize() {
|
||||
this->start_nibbles_size = 1;
|
||||
// clang-format off
|
||||
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
|
||||
Pica::Texture::Format::Type::ETC1
|
||||
) / 4;
|
||||
// clang-format on
|
||||
};
|
||||
};
|
||||
|
||||
@ -171,7 +224,11 @@ public:
|
||||
|
||||
protected:
|
||||
virtual void setSize() {
|
||||
this->start_nibbles_size = 2;
|
||||
// clang-format off
|
||||
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
|
||||
Pica::Texture::Format::Type::ETC1A4
|
||||
) / 4;
|
||||
// clang-format on
|
||||
};
|
||||
};
|
||||
|
||||
@ -183,7 +240,11 @@ public:
|
||||
|
||||
protected:
|
||||
virtual void setSize() {
|
||||
this->start_nibbles_size = 4;
|
||||
// clang-format off
|
||||
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
|
||||
Pica::Texture::Format::Type::D16
|
||||
) / 4;
|
||||
// clang-format on
|
||||
};
|
||||
};
|
||||
|
||||
@ -195,7 +256,11 @@ public:
|
||||
|
||||
protected:
|
||||
virtual void setSize() {
|
||||
this->start_nibbles_size = 6;
|
||||
// clang-format off
|
||||
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
|
||||
Pica::Texture::Format::Type::D24
|
||||
) / 4;
|
||||
// clang-format on
|
||||
};
|
||||
};
|
||||
|
||||
@ -207,6 +272,10 @@ public:
|
||||
|
||||
protected:
|
||||
virtual void setSize() {
|
||||
this->start_nibbles_size = 8;
|
||||
// clang-format off
|
||||
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
|
||||
Pica::Texture::Format::Type::D24S8
|
||||
) / 4;
|
||||
// clang-format on
|
||||
};
|
||||
};
|
||||
|
@ -83,9 +83,9 @@ inline u32 build_luminance(u32 intensity, u32 alpha) {
|
||||
}
|
||||
|
||||
inline void intensity_alpha_pass(u8* read, u8* write) {
|
||||
alignas(4) u8 pixel[2];
|
||||
std::memcpy(pixel, read, 2);
|
||||
u32 result = build_luminance(pixel[1], pixel[0]);
|
||||
u16 pixel;
|
||||
std::memcpy(&pixel, read, 2);
|
||||
u32 result = build_luminance(pixel >> 8, pixel & 0x00FF);
|
||||
std::memcpy(write, &result, 4);
|
||||
}
|
||||
|
||||
@ -93,9 +93,7 @@ inline void intensity_alpha_nibbles_pass(u8* read, u8* write) {
|
||||
alignas(4) u8 pixel;
|
||||
std::memcpy(&pixel, read, 1);
|
||||
u16 tmp = convert_nibbles(pixel);
|
||||
u8 tmp2[2];
|
||||
std::memcpy(tmp2, &tmp, 2);
|
||||
u32 result = build_luminance(tmp2[1], tmp2[0]);
|
||||
u32 result = build_luminance(tmp >> 8, tmp & 0x00FF);
|
||||
std::memcpy(write, &result, 4);
|
||||
}
|
||||
|
||||
@ -107,31 +105,29 @@ inline void intensity_pass(u8* read, u8* write) {
|
||||
}
|
||||
|
||||
inline void intensity_nibbles_pass(u8* read, u8* write) {
|
||||
alignas(4) u8 pixel[2];
|
||||
std::memcpy(pixel, read, 1);
|
||||
u16 tmp = convert_nibbles(pixel[0]);
|
||||
std::memcpy(pixel, &tmp, 2);
|
||||
u32 result = build_luminance(pixel[1], 255);
|
||||
u8 pixel;
|
||||
std::memcpy(&pixel, read, 1);
|
||||
u16 tmp = convert_nibbles(pixel);
|
||||
u32 result = build_luminance(tmp & 0x00FF, 255);
|
||||
std::memcpy(write, &result, 4);
|
||||
result = build_luminance(pixel[0], 255);
|
||||
result = build_luminance(tmp >> 8, 255);
|
||||
std::memcpy(write + 4, &result, 4);
|
||||
}
|
||||
|
||||
inline void alpha_pass(u8* read, u8* write) {
|
||||
alignas(4) u8 pixel[1];
|
||||
std::memcpy(pixel, read, 1);
|
||||
u32 result = build_luminance(0, pixel[0]);
|
||||
u8 pixel;
|
||||
std::memcpy(&pixel, read, 1);
|
||||
u32 result = build_luminance(0, pixel);
|
||||
std::memcpy(write, &result, 4);
|
||||
}
|
||||
|
||||
inline void alpha_nibbles_pass(u8* read, u8* write) {
|
||||
alignas(4) u8 pixel[2];
|
||||
std::memcpy(pixel, read, 1);
|
||||
u16 tmp = convert_nibbles(pixel[0]);
|
||||
std::memcpy(pixel, &tmp, 2);
|
||||
u32 result = build_luminance(0, pixel[0]);
|
||||
u8 pixel;
|
||||
std::memcpy(&pixel, read, 1);
|
||||
u16 tmp = convert_nibbles(pixel);
|
||||
u32 result = build_luminance(0, tmp & 0x00FF);
|
||||
std::memcpy(write, &result, 4);
|
||||
result = build_luminance(0, pixel[1]);
|
||||
result = build_luminance(0, tmp >> 8);
|
||||
std::memcpy(write + 4, &result, 4);
|
||||
}
|
||||
|
||||
@ -207,7 +203,7 @@ void ETC1A4Codec::decode() {
|
||||
ETC1A4(this->target_buffer, this->passing_buffer, this->width, this->height);
|
||||
}
|
||||
|
||||
namespace {
|
||||
namespace Decode {
|
||||
|
||||
inline void expand_depth16_pass(u8* read, u8* write) {
|
||||
alignas(4) u8 pixel[4];
|
||||
@ -224,11 +220,18 @@ inline void expand_depth24_pass(u8* read, u8* write) {
|
||||
std::memcpy(write, pixel, 4);
|
||||
}
|
||||
|
||||
inline void fix_stencil_pass(u8* read, u8* write) {
|
||||
u32 pixel;
|
||||
std::memcpy(&pixel, read, 4);
|
||||
pixel = (pixel << 8) | (pixel >> 24);
|
||||
std::memcpy(write, &pixel, 4);
|
||||
inline void d24s8_pass(u8* target, u32 width, u32 height) {
|
||||
const size_t sub_iters = 8;
|
||||
const size_t iters = width * height / sub_iters;
|
||||
for (u32 i = 0; i < iters; i++) {
|
||||
for (u32 j = 0; j < sub_iters; j++) {
|
||||
u32 pixel;
|
||||
std::memcpy(&pixel, target, 4);
|
||||
pixel = (pixel >> 24) | (pixel << 8);
|
||||
std::memcpy(target, &pixel, 4);
|
||||
target += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // Anonymous
|
||||
@ -236,7 +239,7 @@ inline void fix_stencil_pass(u8* read, u8* write) {
|
||||
void D16Codec::decode() {
|
||||
super::decode();
|
||||
if (this->raw_RGBA)
|
||||
image_pass<&expand_depth16_pass, 4, 8>(
|
||||
image_pass<&Decode::expand_depth16_pass, 4, 8>(
|
||||
// clang-format off
|
||||
this->passing_buffer, this->width, this->height
|
||||
// clang-format on
|
||||
@ -246,7 +249,7 @@ void D16Codec::decode() {
|
||||
void D24Codec::decode() {
|
||||
super::decode();
|
||||
if (this->raw_RGBA)
|
||||
image_pass<&expand_depth24_pass, 6, 8>(
|
||||
image_pass<&Decode::expand_depth24_pass, 6, 8>(
|
||||
// clang-format off
|
||||
this->passing_buffer, this->width, this->height
|
||||
// clang-format on
|
||||
@ -256,9 +259,5 @@ void D24Codec::decode() {
|
||||
void D24S8Codec::decode() {
|
||||
super::decode();
|
||||
if (this->raw_RGBA)
|
||||
image_pass<&fix_stencil_pass, 8, 8, 8>(
|
||||
// clang-format off
|
||||
this->passing_buffer, this->width, this->height
|
||||
// clang-format on
|
||||
);
|
||||
Decode::d24s8_pass(this->passing_buffer, this->width, this->height);
|
||||
}
|
||||
|
@ -69,11 +69,18 @@ inline void contract_depth24_pass(u8* read, u8* write) {
|
||||
std::memcpy(write, pixel, 3);
|
||||
}
|
||||
|
||||
inline void fix_stencil_pass(u8* read, u8* write) {
|
||||
u32 pixel;
|
||||
std::memcpy(&pixel, read, 4);
|
||||
pixel = (pixel >> 24) | (pixel << 8);
|
||||
std::memcpy(write, &pixel, 4);
|
||||
inline void d24s8_pass(u8* target, u32 width, u32 height) {
|
||||
const size_t sub_iters = 8;
|
||||
const size_t iters = width * height / sub_iters;
|
||||
for (u32 i = 0; i < iters; i++) {
|
||||
for (u32 j = 0; j < sub_iters; j++) {
|
||||
u32 pixel;
|
||||
std::memcpy(&pixel, target, 4);
|
||||
pixel = (pixel >> 8) | (pixel << 24);
|
||||
std::memcpy(target, &pixel, 4);
|
||||
target += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // Anonymous
|
||||
@ -101,9 +108,5 @@ void D24Codec::encode() {
|
||||
void D24S8Codec::encode() {
|
||||
super::encode();
|
||||
if (this->raw_RGBA)
|
||||
image_pass<&Encode::fix_stencil_pass, 8, 8, 8>(
|
||||
// clang-format off
|
||||
this->passing_buffer, this->width, this->height
|
||||
// clang-format on
|
||||
);
|
||||
Encode::d24s8_pass(this->passing_buffer, this->width, this->height);
|
||||
}
|
||||
|
@ -122,6 +122,22 @@ union ETC1Tile {
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
inline void decode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
|
||||
std::memcpy(matrix_pointer, morton_pointer, read_size);
|
||||
}
|
||||
|
||||
template <void codec(u8*, u8*, size_t), size_t nibbles, size_t lines_per_block>
|
||||
void tiling_pass(u8* linear, u8* tiled, u32 x_blocks) {
|
||||
const size_t tiled_line_size = (lines_per_block * nibbles) / 2;
|
||||
const size_t row_length = x_blocks * tiled_line_size;
|
||||
for (u32 i = 0; i < lines_per_block; i++) {
|
||||
const u32 k = (lines_per_block - 1 - i);
|
||||
const size_t tiled_index = i * tiled_line_size;
|
||||
const size_t linear_index = k * row_length;
|
||||
codec(tiled + tiled_index, linear + linear_index, tiled_line_size);
|
||||
}
|
||||
}
|
||||
|
||||
inline void etc1_pass(u8* etc1_buffer, u8* linear_buffer, u32 x_blocks) {
|
||||
const size_t line = 8 * 4;
|
||||
alignas(64) u8 tmp[line * 8];
|
||||
|
@ -1,295 +1,40 @@
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include "common/common_types.h"
|
||||
#include "video_core/texture/internal/morton.h"
|
||||
#include "video_core/texture/internal/texture_utils.h"
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Optimizations
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
#ifdef _MSC_VER
|
||||
#pragma inline_recursion(on)
|
||||
// Normally set to 16 by default; the best balance seems to be 8 for this module
|
||||
#pragma inline_depth(8)
|
||||
// favor fast code over small code.
|
||||
#pragma optimize("t", on)
|
||||
#pragma intrinsic(memcpy)
|
||||
#define __hot
|
||||
#define __no_inline __declspec(noinline)
|
||||
#elif defined(CLANG_OR_GCC)
|
||||
// The next 3 will swizzle memory copying to help find the best SSE/AVX shuffling
|
||||
// in case it's possible. Compilation tests have proven effective use of these
|
||||
// flags on gcc and clang.
|
||||
#pragma GCC optimize("-fpredictive-commoning")
|
||||
#pragma GCC optimize("-ftree-loop-distribute-patterns")
|
||||
#pragma GCC optimize("-ftree-vectorize")
|
||||
#pragma GCC option("--param inline-unit-growth=400")
|
||||
#pragma GCC option("--param large-function-growth=800")
|
||||
// The beauty of these compiler options is that they generate better code than
|
||||
// hand-written intrinsics, since inline-expanded memory transfers can be pattern
|
||||
// matched with vector instructions available in the target.
|
||||
#define __no_inline __attribute__((noinline))
|
||||
#define __hot __attribute__((hot))
|
||||
// Supply an MSVC-style __forceinline for this compiler branch unless one is
// already defined. The GNU attribute keyword must be spelled `__attribute__`
// (with the leading double underscore); without it the macro expands to an
// undeclared identifier and every `__forceinline` declaration fails to compile.
#if !defined(__forceinline)
#define __forceinline __attribute__((always_inline))
#endif
|
||||
#else
|
||||
#define __hot
|
||||
#define __no_inline
|
||||
#define __forceinline
|
||||
#endif
|
||||
|
||||
#pragma region Z_Order
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
// Z-Order:
|
||||
//
|
||||
// 0-->1
|
||||
// /
|
||||
// 2-->3
|
||||
//
|
||||
// for more information look at: https://en.wikipedia.org/wiki/Z-order_curve
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
#define TOP_LEFT 0
|
||||
#define TOP_RIGHT 1
|
||||
#define BOTTOM_LEFT 2
|
||||
#define BOTTOM_RIGHT 3
|
||||
|
||||
constexpr u32 isRight(u32 block_index) {
|
||||
return (block_index % 2);
|
||||
static u32 Part1By1(u32 x) {
|
||||
x &= 0x0000ffff; // x = ---- ---- ---- ---- fedc ba98 7654 3210
|
||||
x = (x ^ (x << 8)) & 0x00ff00ff; // x = ---- ---- fedc ba98 ---- ---- 7654 3210
|
||||
x = (x ^ (x << 4)) & 0x0f0f0f0f; // x = ---- fedc ---- ba98 ---- 7654 ---- 3210
|
||||
x = (x ^ (x << 2)) & 0x33333333; // x = --fe --dc --ba --98 --76 --54 --32 --10
|
||||
x = (x ^ (x << 1)) & 0x55555555; // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
|
||||
return x;
|
||||
}
|
||||
|
||||
constexpr u32 isBottom(u32 block_index) {
|
||||
return (block_index / 2);
|
||||
static u32 Compact1By1(u32 x) {
|
||||
x &= 0x55555555; // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
|
||||
x = (x ^ (x >> 1)) & 0x33333333; // x = --fe --dc --ba --98 --76 --54 --32 --10
|
||||
x = (x ^ (x >> 2)) & 0x0f0f0f0f; // x = ---- fedc ---- ba98 ---- 7654 ---- 3210
|
||||
x = (x ^ (x >> 4)) & 0x00ff00ff; // x = ---- ---- fedc ba98 ---- ---- 7654 3210
|
||||
x = (x ^ (x >> 8)) & 0x0000ffff; // x = ---- ---- ---- ---- fedc ba98 7654 3210
|
||||
return x;
|
||||
}
|
||||
|
||||
template <void codec(u8*, u8*, size_t), size_t nibbles, u32 blocks, size_t block_size>
|
||||
__forceinline static void swizzle_block(u8*& morton_block, u8* linear_block);
|
||||
|
||||
template <void codec(u8*, u8*, size_t), size_t nibbles, u32 block_index, u32 blocks,
|
||||
size_t block_size>
|
||||
__forceinline static void swizzle_block_aux(u8*& morton_block, u8* linear_block) {
|
||||
// move the linear_block pointer to the appropriate block
|
||||
const size_t right = isRight(block_index) * (blocks * nibbles) / 2;
|
||||
const size_t down = isBottom(block_index) * block_size;
|
||||
u8* new_linear = linear_block + right + down;
|
||||
swizzle_block<codec, nibbles, blocks, block_size>(morton_block, new_linear);
|
||||
static u32 EncodeMorton(u32 x, u32 y) {
|
||||
return (Part1By1(y) << 1) | Part1By1(x);
|
||||
}
|
||||
|
||||
template <void codec(u8*, u8*, size_t), size_t nibbles, u32 blocks, size_t block_size>
|
||||
__forceinline static void swizzle_block(u8*& morton_block, u8* linear_block) {
|
||||
const size_t new_block_size = block_size / 2;
|
||||
if (blocks <= 2) {
|
||||
// We handle 2*2 blocks on z-order
|
||||
const size_t read_size = nibbles; // just for clearness. It's the same amount
|
||||
// TOP_LEFT & TOP_RIGHT
|
||||
codec(morton_block, linear_block, read_size);
|
||||
morton_block += read_size;
|
||||
// BOTTOM_LEFT & BOTTOM_RIGHT
|
||||
codec(morton_block, linear_block + new_block_size, read_size);
|
||||
morton_block += read_size;
|
||||
} else {
|
||||
// we divide the block into 4 blocks in z-order corecursively
|
||||
// until we have 2x2 blocks.
|
||||
const u32 subdivide = blocks / 2;
|
||||
swizzle_block_aux<codec, nibbles, TOP_LEFT, subdivide, new_block_size>(morton_block,
|
||||
linear_block);
|
||||
swizzle_block_aux<codec, nibbles, TOP_RIGHT, subdivide, new_block_size>(morton_block,
|
||||
linear_block);
|
||||
swizzle_block_aux<codec, nibbles, BOTTOM_LEFT, subdivide, new_block_size>(morton_block,
|
||||
linear_block);
|
||||
swizzle_block_aux<codec, nibbles, BOTTOM_RIGHT, subdivide, new_block_size>(morton_block,
|
||||
linear_block);
|
||||
}
|
||||
static u32 DecodeMortonX(u32 code) {
|
||||
return Compact1By1(code >> 0);
|
||||
}
|
||||
|
||||
template <void codec(u8*, u8*, size_t), size_t nibbles, size_t lines_per_block>
|
||||
__forceinline static void swizzle_pass(u8* morton_block, u8* linear_block) {
|
||||
const size_t block_size = (lines_per_block * lines_per_block * nibbles) / 2;
|
||||
swizzle_block<codec, nibbles, lines_per_block, block_size>(morton_block, linear_block);
|
||||
}
|
||||
#pragma endregion Z_Order
|
||||
|
||||
template <size_t nibbles, size_t lines_per_block>
|
||||
__hot inline static void encode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) {
|
||||
const u32 tile_size = (lines_per_block * lines_per_block * nibbles) / 2;
|
||||
alignas(64) u8 tmp[tile_size];
|
||||
tiling_pass<&encode, nibbles, lines_per_block>(linear_buffer, tmp, x_blocks);
|
||||
swizzle_pass<&encode, nibbles, lines_per_block>(morton_buffer, tmp);
|
||||
static u32 DecodeMortonY(u32 code) {
|
||||
return Compact1By1(code >> 1);
|
||||
}
|
||||
|
||||
template <size_t nibbles, size_t lines_per_block>
|
||||
__hot inline static void decode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) {
|
||||
const u32 tile_size = (lines_per_block * lines_per_block * nibbles) / 2;
|
||||
alignas(64) u8 tmp[tile_size];
|
||||
swizzle_pass<&decode, nibbles, lines_per_block>(morton_buffer, tmp);
|
||||
tiling_pass<&decode, nibbles, lines_per_block>(linear_buffer, tmp, x_blocks);
|
||||
u32 MortonOffset(u32 x, u32 y, u32 width, u32 height, u32 tiling, u32 bpp) {
|
||||
u32 tile = (x + y * height) * width / (tiling * tiling);
|
||||
tile = (tile * bpp) / 8;
|
||||
return tile + EncodeMorton(x % tiling, y % tiling);
|
||||
}
|
||||
|
||||
template <void codec(u8*, u8*, u32), size_t nibbles, size_t lines_per_block>
|
||||
__hot static void morton_pass(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height) {
|
||||
const u32 x_blocks = (width / lines_per_block);
|
||||
const u32 y_blocks = (height / lines_per_block);
|
||||
const size_t line_size = (lines_per_block * nibbles) / 2;
|
||||
const size_t tile_size = lines_per_block * line_size;
|
||||
const size_t stride_size = width * line_size;
|
||||
matrix_buffer = matrix_buffer + ((height * width * nibbles) / 2) - stride_size;
|
||||
for (u32 y = 0; y < y_blocks; y++) {
|
||||
u8* linear_buffer = matrix_buffer;
|
||||
for (u32 x = 0; x != x_blocks; x++) {
|
||||
codec(morton_buffer, linear_buffer, x_blocks);
|
||||
linear_buffer += line_size;
|
||||
morton_buffer += tile_size;
|
||||
}
|
||||
matrix_buffer -= stride_size;
|
||||
}
|
||||
}
|
||||
|
||||
// keep hot code together
|
||||
__no_inline __hot static void morton_8x8_32(u8* morton_buffer, u8* matrix_buffer, u32 width,
|
||||
u32 height, bool decode) {
|
||||
if (decode)
|
||||
morton_pass<&decode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height);
|
||||
else
|
||||
morton_pass<&encode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height);
|
||||
}
|
||||
|
||||
namespace Decoders {
|
||||
|
||||
bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) {
|
||||
if (bpp == 32) {
|
||||
morton_8x8_32(morton_buffer, matrix_buffer, width, height, true);
|
||||
return true;
|
||||
}
|
||||
switch (bpp) {
|
||||
case 4: {
|
||||
morton_pass<&decode_pass<1, 8>, 1, 8>(morton_buffer, matrix_buffer, width, height);
|
||||
return true;
|
||||
break;
|
||||
}
|
||||
case 8: {
|
||||
morton_pass<&decode_pass<2, 8>, 2, 8>(morton_buffer, matrix_buffer, width, height);
|
||||
return true;
|
||||
break;
|
||||
}
|
||||
case 16: {
|
||||
morton_pass<&decode_pass<4, 8>, 4, 8>(morton_buffer, matrix_buffer, width, height);
|
||||
return true;
|
||||
break;
|
||||
}
|
||||
case 24: {
|
||||
morton_pass<&decode_pass<6, 8>, 6, 8>(morton_buffer, matrix_buffer, width, height);
|
||||
return true;
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
return false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool Morton_32x32(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) {
|
||||
switch (bpp) {
|
||||
case 4: {
|
||||
morton_pass<&decode_pass<1, 32>, 1, 32>(morton_buffer, matrix_buffer, width, height);
|
||||
return true;
|
||||
break;
|
||||
}
|
||||
case 8: {
|
||||
morton_pass<&decode_pass<2, 32>, 2, 32>(morton_buffer, matrix_buffer, width, height);
|
||||
return true;
|
||||
break;
|
||||
}
|
||||
case 16: {
|
||||
morton_pass<&decode_pass<4, 32>, 4, 32>(morton_buffer, matrix_buffer, width, height);
|
||||
return true;
|
||||
break;
|
||||
}
|
||||
case 24: {
|
||||
morton_pass<&decode_pass<6, 32>, 6, 32>(morton_buffer, matrix_buffer, width, height);
|
||||
return true;
|
||||
break;
|
||||
}
|
||||
case 32: {
|
||||
morton_pass<&decode_pass<8, 32>, 8, 32>(morton_buffer, matrix_buffer, width, height);
|
||||
return true;
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
return false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
namespace Encoders {
|
||||
|
||||
bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) {
|
||||
if (bpp == 32) {
|
||||
morton_8x8_32(morton_buffer, matrix_buffer, width, height, false);
|
||||
return true;
|
||||
}
|
||||
switch (bpp) {
|
||||
case 4: {
|
||||
morton_pass<&encode_pass<1, 8>, 1, 8>(morton_buffer, matrix_buffer, width, height);
|
||||
return true;
|
||||
break;
|
||||
}
|
||||
case 8: {
|
||||
morton_pass<&encode_pass<2, 8>, 2, 8>(morton_buffer, matrix_buffer, width, height);
|
||||
return true;
|
||||
break;
|
||||
}
|
||||
case 16: {
|
||||
morton_pass<&encode_pass<4, 8>, 4, 8>(morton_buffer, matrix_buffer, width, height);
|
||||
return true;
|
||||
break;
|
||||
}
|
||||
case 24: {
|
||||
morton_pass<&encode_pass<6, 8>, 6, 8>(morton_buffer, matrix_buffer, width, height);
|
||||
return true;
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
return false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool Morton_32x32(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) {
|
||||
switch (bpp) {
|
||||
case 4: {
|
||||
morton_pass<&encode_pass<1, 32>, 1, 32>(morton_buffer, matrix_buffer, width, height);
|
||||
return true;
|
||||
break;
|
||||
}
|
||||
case 8: {
|
||||
morton_pass<&encode_pass<2, 32>, 2, 32>(morton_buffer, matrix_buffer, width, height);
|
||||
return true;
|
||||
break;
|
||||
}
|
||||
case 16: {
|
||||
morton_pass<&encode_pass<4, 32>, 4, 32>(morton_buffer, matrix_buffer, width, height);
|
||||
return true;
|
||||
break;
|
||||
}
|
||||
case 24: {
|
||||
morton_pass<&encode_pass<6, 32>, 6, 32>(morton_buffer, matrix_buffer, width, height);
|
||||
return true;
|
||||
break;
|
||||
}
|
||||
case 32: {
|
||||
morton_pass<&encode_pass<8, 32>, 8, 32>(morton_buffer, matrix_buffer, width, height);
|
||||
return true;
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
return false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#include "morton8x8_optimized.cpp"
|
||||
|
@ -2,14 +2,12 @@
|
||||
|
||||
#include "common/common_types.h"
|
||||
|
||||
// Tile layouts for Morton passes; going by the enumerator names these are the 8x8 and
// 32x32 tile sizes (NOTE(review): no user of this enum is visible here - confirm).
enum class MortonPass { Tile8x8, Tile32x32 };
|
||||
u32 MortonOffset(u32 x, u32 y, u32 width, u32 height, u32 tiling, u32 bpp);
|
||||
|
||||
namespace Decoders {
|
||||
bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp);
|
||||
bool Morton_32x32(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp);
|
||||
}
|
||||
|
||||
namespace Encoders {
|
||||
bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp);
|
||||
bool Morton_32x32(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp);
|
||||
}
|
||||
|
253
src/video_core/texture/internal/morton8x8_optimized.cpp
Normal file
253
src/video_core/texture/internal/morton8x8_optimized.cpp
Normal file
@ -0,0 +1,253 @@
|
||||
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include "common/common_types.h"
|
||||
|
||||
#if ((defined(__clang__) || defined(__GNUC__)) && !defined(__INTEL_COMPILER))
|
||||
#define CLANG_OR_GCC
|
||||
#endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Optimizations
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
#ifdef _MSC_VER
|
||||
#pragma inline_recursion(on)
|
||||
#pragma intrinsic(memcpy)
|
||||
#define __hot
|
||||
#define __no_inline __declspec(noinline)
|
||||
#elif defined(CLANG_OR_GCC)
|
||||
#pragma GCC push_options
|
||||
// The next 3 will swizzle memory copying to help find the best sse/avx shuffling
|
||||
// in case it's possible. Compilation tests have proven effective use of these
|
||||
// flags on gcc and clang.
|
||||
#pragma GCC optimize("-fpredictive-commoning")
|
||||
#pragma GCC optimize("-ftree-loop-distribute-patterns")
|
||||
#pragma GCC optimize("-ftree-vectorize")
|
||||
// The beauty of these compiler options is that they generate better code than
|
||||
// hand written intrinsics, since inline expanding memory transfers can be pattern
|
||||
// matched with vector instructions available in the target.
|
||||
#define __no_inline __attribute__((noinline))
|
||||
#define __hot __attribute__((hot))
|
||||
#if !defined(__forceinline)
|
||||
// GCC/Clang fallback for MSVC's __forceinline. The attribute keyword needs its leading
// double underscore, and `inline` is added so always_inline is applied to an inline function.
#define __forceinline inline __attribute__((always_inline))
|
||||
#endif
|
||||
#else
|
||||
#define __hot
|
||||
#define __no_inline
|
||||
#define __forceinline inline
|
||||
#endif
|
||||
|
||||
#pragma region Z_Order
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
// Z-Order:
|
||||
//
|
||||
// 0-->1
|
||||
// /
|
||||
// 2-->3
|
||||
//
|
||||
// for more information look at: https://en.wikipedia.org/wiki/Z-order_curve
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
#define TOP_LEFT 0
|
||||
#define TOP_RIGHT 1
|
||||
#define BOTTOM_LEFT 2
|
||||
#define BOTTOM_RIGHT 3
|
||||
|
||||
// @param read_size is the amount of bytes each pixel takes
|
||||
__forceinline void decode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
|
||||
std::memcpy(matrix_pointer, morton_pointer, read_size);
|
||||
}
|
||||
|
||||
// @param read_size is the amount of bytes each pixel takes
|
||||
__forceinline void encode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
|
||||
std::memcpy(morton_pointer, matrix_pointer, read_size);
|
||||
}
|
||||
|
||||
// Low bit of the quadrant index: 1 for TOP_RIGHT / BOTTOM_RIGHT, 0 for the left quadrants.
constexpr u32 isRight(u32 block_index) {
    return block_index & 1u;
}
|
||||
|
||||
// Quadrant index divided by two: 1 for BOTTOM_LEFT / BOTTOM_RIGHT, 0 for the top quadrants.
constexpr u32 isBottom(u32 block_index) {
    return block_index >> 1;
}
|
||||
|
||||
template <void codec(u8*, u8*, size_t), size_t nibbles, u32 blocks, size_t block_size>
|
||||
__forceinline static void swizzle_block(u8*& morton_block, u8* linear_block);
|
||||
|
||||
template <void codec(u8*, u8*, size_t), size_t nibbles, u32 block_index, u32 blocks,
|
||||
size_t block_size>
|
||||
__forceinline static void swizzle_block_aux(u8*& morton_block, u8* linear_block) {
|
||||
// move the linear_block pointer to the appropiate block
|
||||
const size_t right = isRight(block_index) * (blocks * nibbles) / 2;
|
||||
const size_t down = isBottom(block_index) * block_size;
|
||||
u8* new_linear = linear_block + right + down;
|
||||
swizzle_block<codec, nibbles, blocks, block_size>(morton_block, new_linear);
|
||||
}
|
||||
|
||||
template <void codec(u8*, u8*, size_t), size_t nibbles, u32 blocks, size_t block_size>
|
||||
__forceinline static void swizzle_block(u8*& morton_block, u8* linear_block) {
|
||||
const size_t new_block_size = block_size / 2;
|
||||
if (blocks <= 2) {
|
||||
// We handle 2*2 blocks on z-order
|
||||
const size_t read_size = nibbles; // just for clearness. It's the same amount
|
||||
// TOP_LEFT & TOP_RIGHT
|
||||
codec(morton_block, linear_block, read_size);
|
||||
morton_block += read_size;
|
||||
// BOTTOM_LEFT & BOTTOM_RIGHT
|
||||
codec(morton_block, linear_block + new_block_size, read_size);
|
||||
morton_block += read_size;
|
||||
} else {
|
||||
// we divide the block into 4 blocks in z-order corecursively
|
||||
// until we have 2x2 blocks.
|
||||
const u32 subdivide = blocks / 2;
|
||||
swizzle_block_aux<codec, nibbles, TOP_LEFT, subdivide, new_block_size>(morton_block,
|
||||
linear_block);
|
||||
swizzle_block_aux<codec, nibbles, TOP_RIGHT, subdivide, new_block_size>(morton_block,
|
||||
linear_block);
|
||||
swizzle_block_aux<codec, nibbles, BOTTOM_LEFT, subdivide, new_block_size>(morton_block,
|
||||
linear_block);
|
||||
swizzle_block_aux<codec, nibbles, BOTTOM_RIGHT, subdivide, new_block_size>(morton_block,
|
||||
linear_block);
|
||||
}
|
||||
}
|
||||
|
||||
template <void codec(u8*, u8*, size_t), size_t nibbles, size_t lines_per_block>
|
||||
__forceinline static void swizzle_pass(u8* morton_block, u8* linear_block) {
|
||||
const size_t block_size = (lines_per_block * lines_per_block * nibbles) / 2;
|
||||
swizzle_block<codec, nibbles, lines_per_block, block_size>(morton_block, linear_block);
|
||||
}
|
||||
#pragma endregion Z_Order
|
||||
|
||||
template <void codec(u8*, u8*, size_t), size_t nibbles, size_t lines_per_block>
|
||||
__forceinline void tiling_pass(u8* linear, u8* tiled, u32 x_blocks) {
|
||||
const size_t tiled_line_size = (lines_per_block * nibbles) / 2;
|
||||
const size_t row_length = x_blocks * tiled_line_size;
|
||||
for (u32 i = 0; i < lines_per_block; i++) {
|
||||
const u32 k = (lines_per_block - 1 - i);
|
||||
const size_t tiled_index = i * tiled_line_size;
|
||||
const size_t linear_index = k * row_length;
|
||||
codec(tiled + tiled_index, linear + linear_index, tiled_line_size);
|
||||
}
|
||||
}
|
||||
|
||||
template <size_t nibbles, size_t lines_per_block>
|
||||
__forceinline static void encode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) {
|
||||
const u32 tile_size = (lines_per_block * lines_per_block * nibbles) / 2;
|
||||
alignas(64) u8 tmp[tile_size];
|
||||
tiling_pass<&encode, nibbles, lines_per_block>(linear_buffer, tmp, x_blocks);
|
||||
swizzle_pass<&encode, nibbles, lines_per_block>(morton_buffer, tmp);
|
||||
}
|
||||
|
||||
template <size_t nibbles, size_t lines_per_block>
|
||||
__forceinline static void decode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) {
|
||||
const u32 tile_size = (lines_per_block * lines_per_block * nibbles) / 2;
|
||||
alignas(64) u8 tmp[tile_size];
|
||||
swizzle_pass<&decode, nibbles, lines_per_block>(morton_buffer, tmp);
|
||||
tiling_pass<&decode, nibbles, lines_per_block>(linear_buffer, tmp, x_blocks);
|
||||
}
|
||||
|
||||
template <void codec(u8*, u8*, u32), size_t nibbles, size_t lines_per_block>
|
||||
static void morton_pass(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height) {
|
||||
const u32 x_blocks = (width / lines_per_block);
|
||||
const u32 y_blocks = (height / lines_per_block);
|
||||
const size_t line_size = (lines_per_block * nibbles) / 2;
|
||||
const size_t tile_size = lines_per_block * line_size;
|
||||
const size_t stride_size = width * line_size;
|
||||
matrix_buffer = matrix_buffer + ((height * width * nibbles) / 2) - stride_size;
|
||||
for (u32 y = 0; y < y_blocks; y++) {
|
||||
u8* linear_buffer = matrix_buffer;
|
||||
for (u32 x = 0; x != x_blocks; x++) {
|
||||
codec(morton_buffer, linear_buffer, x_blocks);
|
||||
linear_buffer += line_size;
|
||||
morton_buffer += tile_size;
|
||||
}
|
||||
matrix_buffer -= stride_size;
|
||||
}
|
||||
}
|
||||
|
||||
namespace Decoders {
|
||||
|
||||
bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) {
|
||||
switch (bpp) {
|
||||
case 4: {
|
||||
morton_pass<&decode_pass<1, 8>, 1, 8>(morton_buffer, matrix_buffer, width, height);
|
||||
return true;
|
||||
break;
|
||||
}
|
||||
case 8: {
|
||||
morton_pass<&decode_pass<2, 8>, 2, 8>(morton_buffer, matrix_buffer, width, height);
|
||||
return true;
|
||||
break;
|
||||
}
|
||||
case 16: {
|
||||
morton_pass<&decode_pass<4, 8>, 4, 8>(morton_buffer, matrix_buffer, width, height);
|
||||
return true;
|
||||
break;
|
||||
}
|
||||
case 24: {
|
||||
morton_pass<&decode_pass<6, 8>, 6, 8>(morton_buffer, matrix_buffer, width, height);
|
||||
return true;
|
||||
break;
|
||||
}
|
||||
case 32: {
|
||||
morton_pass<&decode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height);
|
||||
return true;
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
return false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
namespace Encoders {
|
||||
|
||||
bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) {
|
||||
switch (bpp) {
|
||||
case 4: {
|
||||
morton_pass<&encode_pass<1, 8>, 1, 8>(morton_buffer, matrix_buffer, width, height);
|
||||
return true;
|
||||
break;
|
||||
}
|
||||
case 8: {
|
||||
morton_pass<&encode_pass<2, 8>, 2, 8>(morton_buffer, matrix_buffer, width, height);
|
||||
return true;
|
||||
break;
|
||||
}
|
||||
case 16: {
|
||||
morton_pass<&encode_pass<4, 8>, 4, 8>(morton_buffer, matrix_buffer, width, height);
|
||||
return true;
|
||||
break;
|
||||
}
|
||||
case 24: {
|
||||
morton_pass<&encode_pass<6, 8>, 6, 8>(morton_buffer, matrix_buffer, width, height);
|
||||
return true;
|
||||
break;
|
||||
}
|
||||
case 32: {
|
||||
morton_pass<&encode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height);
|
||||
return true;
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
return false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#undef __hot
|
||||
#undef __no_inline
|
||||
#elif defined(CLANG_OR_GCC)
|
||||
#pragma GCC pop_options
|
||||
#undef __no_inline
|
||||
#undef __hot
|
||||
#else
|
||||
#undef __hot
|
||||
#undef __no_inline
|
||||
#undef __forceinline
|
||||
#endif
|
@ -16,6 +16,7 @@
|
||||
#ifdef _MSC_VER
|
||||
#pragma inline_recursion(on)
|
||||
#elif defined(CLANG_OR_GCC)
|
||||
#pragma GCC push_options
|
||||
#pragma GCC optimize("-fpeel-loops")
|
||||
#pragma GCC optimize("-fpredictive-commoning")
|
||||
#pragma GCC optimize("-ftree-loop-distribute-patterns")
|
||||
@ -74,24 +75,6 @@ inline void image_pass(u8* target, u32 width, u32 height) {
|
||||
image_pass_aux_rev<pass, read_size, write_size, tuning>(target, width, height);
|
||||
}
|
||||
|
||||
template <void codec(u8*, u8*, size_t), size_t nibbles, size_t lines_per_block>
|
||||
void tiling_pass(u8* linear, u8* tiled, u32 x_blocks) {
|
||||
const size_t tiled_line_size = (lines_per_block * nibbles) / 2;
|
||||
const size_t row_length = x_blocks * tiled_line_size;
|
||||
for (u32 i = 0; i < lines_per_block; i++) {
|
||||
const u32 k = (lines_per_block - 1 - i);
|
||||
const size_t tiled_index = i * tiled_line_size;
|
||||
const size_t linear_index = k * row_length;
|
||||
codec(tiled + tiled_index, linear + linear_index, tiled_line_size);
|
||||
}
|
||||
}
|
||||
|
||||
// @param read_size is the amount of bytes each pixel takes
|
||||
inline void decode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
|
||||
std::memcpy(matrix_pointer, morton_pointer, read_size);
|
||||
}
|
||||
|
||||
// @param read_size is the amount of bytes each pixel takes
|
||||
inline void encode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
|
||||
std::memcpy(morton_pointer, matrix_pointer, read_size);
|
||||
}
|
||||
#if defined(CLANG_OR_GCC)
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user