Fixes, cleanup, and small refactors

Fernando Sahmkow 2017-01-16 19:18:38 -05:00
parent 1c6965f106
commit bed6207ac7
12 changed files with 484 additions and 399 deletions

View File

@ -1,3 +1,4 @@
set(SRCS
renderer_opengl/gl_rasterizer.cpp
renderer_opengl/gl_rasterizer_cache.cpp

View File

@ -26,6 +26,9 @@
#include "video_core/utils.h"
#include "video_core/video_core.h"
#define TEXTURE_CACHE_SIZE (1024 * 1024 * 8) // 8MB inner cache for decoding/encoding
alignas(64) static u8 TextureCache[TEXTURE_CACHE_SIZE];
struct FormatTuple {
GLint internal_format;
GLenum format;
@ -39,7 +42,7 @@ static const std::array<FormatTuple, 18> format_tuples = {{
{GL_RGB565, GL_RGB, GL_UNSIGNED_SHORT_5_6_5}, // RGB565
{GL_RGBA4, GL_RGBA, GL_UNSIGNED_SHORT_4_4_4_4}, // RGBA4
{GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // IA8
{GL_RG8, GL_RG8, GL_UNSIGNED_BYTE}, // RG8
{GL_RG8, GL_RG, GL_UNSIGNED_BYTE}, // RG8
{GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // I8
{GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // A8
{GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // IA4
@ -70,7 +73,7 @@ static const std::array<bool, 18> native_format = {
false, // ETC1A4
true, // D16
false,
false, // D24
true, // D24
false, // D24S8
};
@ -306,26 +309,29 @@ CachedSurface* RasterizerCacheOpenGL::GetSurface(const CachedSurface& params, bo
// clang-format on
);
Pica::Texture::Codec* codec = tmp.get();
codec->configTiling(true, 8); // change 8 for 32 in case the mage is tiled
codec->configTiling(true, 8); // change 8 for 32 in case the image is tiled
// on blocks of 32x32
codec->configRGBATransform(!native_format[(unsigned int)params.pixel_format]);
codec->validate();
if (!codec->invalid()) {
codec->decode();
std::unique_ptr<u8[]> decoded_texture = codec->transferInternalBuffer();
u32 bytes = codec->getInternalBytesPerPixel();
if (bytes == 3)
bytes = 1;
else if (bytes != 2)
bytes = 4;
glPixelStorei(GL_UNPACK_ALIGNMENT, bytes);
glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height,
0, tuple.format, tuple.type, decoded_texture.get());
glPixelStorei(GL_UNPACK_ALIGNMENT, 4);
u32 estimated_size =
params.width * params.height * codec->getInternalBytesPerPixel();
if (estimated_size <= TEXTURE_CACHE_SIZE) {
codec->setExternalBuffer(TextureCache);
codec->decode();
glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width,
params.height, 0, tuple.format, tuple.type, TextureCache);
} else {
codec->decode();
std::unique_ptr<u8[]> decoded_texture = codec->transferInternalBuffer();
glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width,
params.height, 0, tuple.format, tuple.type, decoded_texture.get());
}
} else {
LOG_WARNING(Render_OpenGL,
"Invalid texture sent to renderer; width: %d height %d type: %d",
params.width, params.height, (unsigned int)params.pixel_format);
return nullptr;
}
}
// If not 1x scale, blit 1x texture to a new scaled texture and replace texture in surface
@ -652,15 +658,22 @@ void RasterizerCacheOpenGL::FlushSurface(CachedSurface* surface) {
glPixelStorei(GL_PACK_ROW_LENGTH, 0);
} else {
const FormatTuple& tuple = format_tuples[(u32)surface->pixel_format];
u32 bytes_per_pixel = Pica::Texture::Format::GetBpp(surface->pixel_format) / 8;
u32 bits_per_pixel = Pica::Texture::Format::GetBpp(surface->pixel_format);
if (!native_format[(u32)surface->pixel_format])
bytes_per_pixel = 4;
std::vector<u8> temp_gl_buffer(surface->width * surface->height * bytes_per_pixel);
glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, temp_gl_buffer.data());
bits_per_pixel = 32;
u32 size = surface->width * surface->height * bits_per_pixel / 8;
std::vector<u8> temp_gl_buffer;
u8* temporal_buffer;
if (size <= TEXTURE_CACHE_SIZE)
temporal_buffer = TextureCache;
else {
temp_gl_buffer.resize(size);
temporal_buffer = temp_gl_buffer.data();
}
glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, temporal_buffer);
std::unique_ptr<Pica::Texture::Codec> tmp = Pica::Texture::CodecFactory::build(
// clang-format off
surface->pixel_format, temp_gl_buffer.data(), surface->width, surface->height
surface->pixel_format, temporal_buffer, surface->width, surface->height
// clang-format on
);
Pica::Texture::Codec* codec = tmp.get();
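
For reference, the buffer-selection pattern used by both GetSurface and FlushSurface above boils down to the following standalone sketch. TEXTURE_CACHE_SIZE and TextureCache mirror the definitions in this file; pick_buffer is a hypothetical helper, not part of the commit.

#include <cstddef>
#include <cstdint>
#include <memory>

#define TEXTURE_CACHE_SIZE (1024 * 1024 * 8) // 8 MB scratch buffer, as above
alignas(64) static uint8_t TextureCache[TEXTURE_CACHE_SIZE];

// Pick the buffer the codec should decode into (or glGetTexImage read into):
// the static cache when the surface fits, a one-off heap allocation otherwise.
static uint8_t* pick_buffer(size_t estimated_size, std::unique_ptr<uint8_t[]>& fallback) {
    if (estimated_size <= TEXTURE_CACHE_SIZE)
        return TextureCache; // no allocation on the common path
    fallback = std::make_unique<uint8_t[]>(estimated_size); // rare oversized surface
    return fallback.get();
}

The heap fallback only triggers for surfaces whose decoded size exceeds 8 MB, matching the estimated_size check in GetSurface and the size check in FlushSurface.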

View File

@ -67,7 +67,7 @@ void Codec::init(bool decode) {
this->expected_nibbles_size = this->start_nibbles_size;
}
this->validate();
if (!this->external_result_buffer || !this->invalid()) {
if (!this->external_result_buffer) {
size_t buff_size = this->width * this->height * this->expected_nibbles_size / 2;
this->internal_buffer = std::make_unique<u8[]>(buff_size);
this->passing_buffer = this->internal_buffer.get();
@ -91,7 +91,7 @@ void Codec::validate() {
this->invalid_state = true;
return;
}
if (this->morton && this->morton_pass_tiling != 8 && this->morton_pass_tiling != 32) {
if (this->morton && this->morton_pass_tiling != 8) {
this->invalid_state = true;
return;
}
@ -102,18 +102,12 @@ inline void Codec::decode_morton_pass() {
if (this->morton_pass_tiling == 8)
Decoders::Morton_8x8(this->target_buffer, this->passing_buffer, this->width, this->height,
this->start_nibbles_size * 4);
else if (this->morton_pass_tiling == 32)
Decoders::Morton_32x32(this->target_buffer, this->passing_buffer, this->width, this->height,
this->start_nibbles_size * 4);
}
inline void Codec::encode_morton_pass() {
if (this->morton_pass_tiling == 8)
Encoders::Morton_8x8(this->target_buffer, this->passing_buffer, this->width, this->height,
Encoders::Morton_8x8(this->passing_buffer, this->target_buffer, this->width, this->height,
this->start_nibbles_size * 4);
else if (this->morton_pass_tiling == 32)
Encoders::Morton_32x32(this->target_buffer, this->passing_buffer, this->width, this->height,
this->start_nibbles_size * 4);
}
std::unique_ptr<Codec> CodecFactory::build(Format::Type format, u8* target, u32 width, u32 height) {
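
The internal buffer that Codec::init allocates above is sized in nibbles (4-bit units), so dividing by two yields bytes. A worked check of that arithmetic; codec_buffer_bytes is illustrative only.

#include <cassert>
#include <cstddef>

// buff_size = width * height * expected_nibbles_size / 2, as in Codec::init above.
static size_t codec_buffer_bytes(size_t width, size_t height, size_t nibbles_per_pixel) {
    return width * height * nibbles_per_pixel / 2;
}

int main() {
    assert(codec_buffer_bytes(1024, 1024, 8) == 4u * 1024 * 1024); // RGBA8: 8 nibbles = 4 bytes/pixel
    assert(codec_buffer_bytes(256, 256, 4) == 2u * 256 * 256);     // RGB565: 4 nibbles = 2 bytes/pixel
    return 0;
}

The same unit shows up in decode_morton_pass: start_nibbles_size * 4 is simply the bits-per-pixel value handed to Decoders::Morton_8x8 (32 for RGBA8).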

View File

@ -40,7 +40,7 @@ struct Format {
Invalid = 255,
};
static u32 GetBpp(Type format) {
static const u32 GetBpp(Type format) {
static const std::array<unsigned int, 18> bpp_table = {
32, // RGBA8
24, // RGB8
@ -66,19 +66,19 @@ struct Format {
return bpp_table[(u32)format];
}
static Type FromTextureFormat(Regs::TextureFormat format) {
static constexpr Type FromTextureFormat(Regs::TextureFormat format) {
return ((unsigned int)format < 14) ? (Type)format : Type::Invalid;
}
static Type FromColorFormat(Regs::ColorFormat format) {
static constexpr Type FromColorFormat(Regs::ColorFormat format) {
return ((unsigned int)format < 5) ? (Type)format : Type::Invalid;
}
static Type FromDepthFormat(Regs::DepthFormat format) {
static constexpr Type FromDepthFormat(Regs::DepthFormat format) {
return ((unsigned int)format < 4) ? (Type)((unsigned int)format + 14) : Type::Invalid;
}
static Type FromGPUPixelFormat(GPU::Regs::PixelFormat format) {
static const Type FromGPUPixelFormat(GPU::Regs::PixelFormat format) {
switch (format) {
// RGB565 and RGB5A1 are switched in PixelFormat compared to ColorFormat
case GPU::Regs::PixelFormat::RGB565:
@ -92,6 +92,17 @@ struct Format {
}; // Format
struct Info {
PAddr physical_address;
int width;
int height;
int stride;
Pica::Regs::TextureFormat format;
static Info FromPicaRegister(const Pica::Regs::TextureConfig& config,
const Pica::Regs::TextureFormat& format);
};
} // Texture
} // Pica
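
FromTextureFormat and FromColorFormat above pass the register value straight through, while FromDepthFormat shifts it past the 14 texture formats so the depth entries land at the tail of the 18-entry tables in this file. A sketch of that index mapping with plain integers; the exact enum numbering is inferred from the +14 offset and the table layout, not spelled out elsewhere in the diff.

#include <cassert>

// Flat Type layout implied by the helpers above:
//   0..13  -> the 14 Pica texture formats (RGBA8 ... ETC1A4)
//   14..17 -> depth formats (D16, an unused slot, D24, D24S8)
static unsigned from_depth_format(unsigned depth_reg) {
    return depth_reg < 4 ? depth_reg + 14 : 255; // 255 == Type::Invalid
}

int main() {
    assert(from_depth_format(0) == 14);  // D16 lands right after the texture formats
    assert(from_depth_format(3) == 17);  // D24S8 is the last table entry
    assert(from_depth_format(9) == 255); // out-of-range register values map to Invalid
    return 0;
}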

View File

@ -5,6 +5,7 @@
#include <memory>
#include "common/common_types.h"
#include "video_core/texture/codec.h"
#include "video_core/texture/formats.h"
// each texture format codec
class RGBACodec : public Pica::Texture::Codec {
@ -15,7 +16,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 8;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::RGBA8
) / 4;
// clang-format on
};
};
@ -27,7 +32,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 6;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::RGB8
) / 4;
// clang-format on
};
};
@ -39,7 +48,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 4;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::RGB5A1
) / 4;
// clang-format on
};
};
@ -51,7 +64,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 4;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::RGBA4
) / 4;
// clang-format on
};
};
@ -63,7 +80,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 4;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::RGB565
) / 4;
// clang-format on
};
};
@ -75,7 +96,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 4;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::RG8
) / 4;
// clang-format on
};
};
@ -87,7 +112,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 4;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::IA8
) / 4;
// clang-format on
};
};
@ -99,7 +128,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 2;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::I8
) / 4;
// clang-format on
};
};
@ -111,7 +144,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 2;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::A8
) / 4;
// clang-format on
};
};
@ -123,7 +160,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 2;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::IA4
) / 4;
// clang-format on
};
};
@ -135,7 +176,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 1;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::I4
) / 4;
// clang-format on
};
};
@ -147,7 +192,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 1;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::A4
) / 4;
// clang-format on
};
};
@ -159,7 +208,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 1;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::ETC1
) / 4;
// clang-format on
};
};
@ -171,7 +224,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 2;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::ETC1A4
) / 4;
// clang-format on
};
};
@ -183,7 +240,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 4;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::D16
) / 4;
// clang-format on
};
};
@ -195,7 +256,11 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 6;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::D24
) / 4;
// clang-format on
};
};
@ -207,6 +272,10 @@ public:
protected:
virtual void setSize() {
this->start_nibbles_size = 8;
// clang-format off
this->start_nibbles_size = Pica::Texture::Format::GetBpp(
Pica::Texture::Format::Type::D24S8
) / 4;
// clang-format on
};
};
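
The setSize refactor above swaps the old magic nibble counts for GetBpp(format) / 4 (a nibble is four bits). A quick cross-check that the two agree, using the bpp values from the table in formats.h; nibbles_from_bpp is only for illustration.

#include <cassert>

static unsigned nibbles_from_bpp(unsigned bpp) {
    return bpp / 4; // start_nibbles_size = GetBpp(format) / 4
}

int main() {
    assert(nibbles_from_bpp(32) == 8); // RGBA8, D24S8  -> old literal 8
    assert(nibbles_from_bpp(24) == 6); // RGB8, D24     -> old literal 6
    assert(nibbles_from_bpp(16) == 4); // RGB5A1, RGB565, RGBA4, RG8, IA8, D16 -> 4
    assert(nibbles_from_bpp(8) == 2);  // I8, A8, IA4, ETC1A4 -> 2
    assert(nibbles_from_bpp(4) == 1);  // I4, A4, ETC1 -> 1
    return 0;
}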

View File

@ -83,9 +83,9 @@ inline u32 build_luminance(u32 intensity, u32 alpha) {
}
inline void intensity_alpha_pass(u8* read, u8* write) {
alignas(4) u8 pixel[2];
std::memcpy(pixel, read, 2);
u32 result = build_luminance(pixel[1], pixel[0]);
u16 pixel;
std::memcpy(&pixel, read, 2);
u32 result = build_luminance(pixel >> 8, pixel & 0x00FF);
std::memcpy(write, &result, 4);
}
@ -93,9 +93,7 @@ inline void intensity_alpha_nibbles_pass(u8* read, u8* write) {
alignas(4) u8 pixel;
std::memcpy(&pixel, read, 1);
u16 tmp = convert_nibbles(pixel);
u8 tmp2[2];
std::memcpy(tmp2, &tmp, 2);
u32 result = build_luminance(tmp2[1], tmp2[0]);
u32 result = build_luminance(tmp >> 8, tmp & 0x00FF);
std::memcpy(write, &result, 4);
}
@ -107,31 +105,29 @@ inline void intensity_pass(u8* read, u8* write) {
}
inline void intensity_nibbles_pass(u8* read, u8* write) {
alignas(4) u8 pixel[2];
std::memcpy(pixel, read, 1);
u16 tmp = convert_nibbles(pixel[0]);
std::memcpy(pixel, &tmp, 2);
u32 result = build_luminance(pixel[1], 255);
u8 pixel;
std::memcpy(&pixel, read, 1);
u16 tmp = convert_nibbles(pixel);
u32 result = build_luminance(tmp & 0x00FF, 255);
std::memcpy(write, &result, 4);
result = build_luminance(pixel[0], 255);
result = build_luminance(tmp >> 8, 255);
std::memcpy(write + 4, &result, 4);
}
inline void alpha_pass(u8* read, u8* write) {
alignas(4) u8 pixel[1];
std::memcpy(pixel, read, 1);
u32 result = build_luminance(0, pixel[0]);
u8 pixel;
std::memcpy(&pixel, read, 1);
u32 result = build_luminance(0, pixel);
std::memcpy(write, &result, 4);
}
inline void alpha_nibbles_pass(u8* read, u8* write) {
alignas(4) u8 pixel[2];
std::memcpy(pixel, read, 1);
u16 tmp = convert_nibbles(pixel[0]);
std::memcpy(pixel, &tmp, 2);
u32 result = build_luminance(0, pixel[0]);
u8 pixel;
std::memcpy(&pixel, read, 1);
u16 tmp = convert_nibbles(pixel);
u32 result = build_luminance(0, tmp & 0x00FF);
std::memcpy(write, &result, 4);
result = build_luminance(0, pixel[1]);
result = build_luminance(0, tmp >> 8);
std::memcpy(write + 4, &result, 4);
}
@ -207,7 +203,7 @@ void ETC1A4Codec::decode() {
ETC1A4(this->target_buffer, this->passing_buffer, this->width, this->height);
}
namespace {
namespace Decode {
inline void expand_depth16_pass(u8* read, u8* write) {
alignas(4) u8 pixel[4];
@ -224,11 +220,18 @@ inline void expand_depth24_pass(u8* read, u8* write) {
std::memcpy(write, pixel, 4);
}
inline void fix_stencil_pass(u8* read, u8* write) {
u32 pixel;
std::memcpy(&pixel, read, 4);
pixel = (pixel << 8) | (pixel >> 24);
std::memcpy(write, &pixel, 4);
inline void d24s8_pass(u8* target, u32 width, u32 height) {
const size_t sub_iters = 8;
const size_t iters = width * height / sub_iters;
for (u32 i = 0; i < iters; i++) {
for (u32 j = 0; j < sub_iters; j++) {
u32 pixel;
std::memcpy(&pixel, target, 4);
pixel = (pixel >> 24) | (pixel << 8);
std::memcpy(target, &pixel, 4);
target += 4;
}
}
}
} // Anonymous
@ -236,7 +239,7 @@ inline void fix_stencil_pass(u8* read, u8* write) {
void D16Codec::decode() {
super::decode();
if (this->raw_RGBA)
image_pass<&expand_depth16_pass, 4, 8>(
image_pass<&Decode::expand_depth16_pass, 4, 8>(
// clang-format off
this->passing_buffer, this->width, this->height
// clang-format on
@ -246,7 +249,7 @@ void D16Codec::decode() {
void D24Codec::decode() {
super::decode();
if (this->raw_RGBA)
image_pass<&expand_depth24_pass, 6, 8>(
image_pass<&Decode::expand_depth24_pass, 6, 8>(
// clang-format off
this->passing_buffer, this->width, this->height
// clang-format on
@ -256,9 +259,5 @@ void D24Codec::decode() {
void D24S8Codec::decode() {
super::decode();
if (this->raw_RGBA)
image_pass<&fix_stencil_pass, 8, 8, 8>(
// clang-format off
this->passing_buffer, this->width, this->height
// clang-format on
);
Decode::d24s8_pass(this->passing_buffer, this->width, this->height);
}
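
The new Decode::d24s8_pass above replaces the per-pixel fix_stencil_pass with an in-place loop over the whole image; the core of it is an 8-bit left rotate of each 32-bit pixel, moving the top byte down to the bottom (the diff does not say which byte holds stencil and which hold depth, only how they are rearranged). A worked check of the rotation:

#include <cassert>
#include <cstdint>

// Same expression as the loop body of Decode::d24s8_pass above.
static uint32_t rotl8(uint32_t pixel) {
    return (pixel >> 24) | (pixel << 8);
}

int main() {
    assert(rotl8(0xAABBCCDDu) == 0xBBCCDDAAu); // top byte wraps around to the bottom
    assert(rotl8(0x11000000u) == 0x00000011u);
    return 0;
}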

View File

@ -69,11 +69,18 @@ inline void contract_depth24_pass(u8* read, u8* write) {
std::memcpy(write, pixel, 3);
}
inline void fix_stencil_pass(u8* read, u8* write) {
u32 pixel;
std::memcpy(&pixel, read, 4);
pixel = (pixel >> 24) | (pixel << 8);
std::memcpy(write, &pixel, 4);
inline void d24s8_pass(u8* target, u32 width, u32 height) {
const size_t sub_iters = 8;
const size_t iters = width * height / sub_iters;
for (u32 i = 0; i < iters; i++) {
for (u32 j = 0; j < sub_iters; j++) {
u32 pixel;
std::memcpy(&pixel, target, 4);
pixel = (pixel >> 8) | (pixel << 24);
std::memcpy(target, &pixel, 4);
target += 4;
}
}
}
} // Anonymous
@ -101,9 +108,5 @@ void D24Codec::encode() {
void D24S8Codec::encode() {
super::encode();
if (this->raw_RGBA)
image_pass<&Encode::fix_stencil_pass, 8, 8, 8>(
// clang-format off
this->passing_buffer, this->width, this->height
// clang-format on
);
Encode::d24s8_pass(this->passing_buffer, this->width, this->height);
}
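
Encode::d24s8_pass uses the opposite rotation, so decoding and then encoding a D24S8 pixel is a no-op. A minimal round-trip check:

#include <cassert>
#include <cstdint>
#include <initializer_list>

static uint32_t decode_rot(uint32_t p) { return (p >> 24) | (p << 8); } // Decode::d24s8_pass
static uint32_t encode_rot(uint32_t p) { return (p >> 8) | (p << 24); } // Encode::d24s8_pass

int main() {
    for (uint32_t p : {0x00000000u, 0xAABBCCDDu, 0xFFFFFFFFu, 0x01000000u})
        assert(encode_rot(decode_rot(p)) == p); // the two passes are exact inverses
    return 0;
}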

View File

@ -122,6 +122,22 @@ union ETC1Tile {
} // anonymous namespace
inline void decode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
std::memcpy(matrix_pointer, morton_pointer, read_size);
}
template <void codec(u8*, u8*, size_t), size_t nibbles, size_t lines_per_block>
void tiling_pass(u8* linear, u8* tiled, u32 x_blocks) {
const size_t tiled_line_size = (lines_per_block * nibbles) / 2;
const size_t row_length = x_blocks * tiled_line_size;
for (u32 i = 0; i < lines_per_block; i++) {
const u32 k = (lines_per_block - 1 - i);
const size_t tiled_index = i * tiled_line_size;
const size_t linear_index = k * row_length;
codec(tiled + tiled_index, linear + linear_index, tiled_line_size);
}
}
inline void etc1_pass(u8* etc1_buffer, u8* linear_buffer, u32 x_blocks) {
const size_t line = 8 * 4;
alignas(64) u8 tmp[line * 8];
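
tiling_pass above copies the rows of one 8-line tile between the tiled scratch buffer and the linear image, flipping vertically as it goes: tile row i maps to image row lines_per_block - 1 - i of the current block. A standalone run of that loop, specialised to the decode direction (tile to linear) for a 32 bpp tile in a 16-pixel-wide image; the buffers and tag values are made up for the check.

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
    const size_t nibbles = 8, lines_per_block = 8;                  // 32 bpp, 8x8 tile
    const size_t tiled_line_size = (lines_per_block * nibbles) / 2; // 32 bytes per tile row
    const size_t x_blocks = 2;                                      // image is 2 tiles wide
    const size_t row_length = x_blocks * tiled_line_size;           // 64 bytes per image row

    uint8_t tiled[8 * 32];
    uint8_t linear[8 * 64] = {};
    for (size_t i = 0; i < lines_per_block; i++)
        std::memset(tiled + i * tiled_line_size, int(i), tiled_line_size); // tag row i with i

    for (size_t i = 0; i < lines_per_block; i++) {
        const size_t k = lines_per_block - 1 - i; // vertical flip, as in tiling_pass
        std::memcpy(linear + k * row_length, tiled + i * tiled_line_size, tiled_line_size);
    }
    assert(linear[0] == 7);          // tile row 7 became the top row of this block
    assert(linear[row_length] == 6); // tile row 6 became the row below it
    return 0;
}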

View File

@ -1,295 +1,40 @@
#include <cstring>
#include <memory>
#include <utility>
#include "common/common_types.h"
#include "video_core/texture/internal/morton.h"
#include "video_core/texture/internal/texture_utils.h"
///////////////////////////////////////////////////////////////////////////////
// Optimizations
//////////////////////////////////////////////////////////////////////////////
#ifdef _MSC_VER
#pragma inline_recursion(on)
// Normally set to 16 by default; the best balance for this module seems to be 8
#pragma inline_depth(8)
// favor fast code over small code.
#pragma optimize("t", on)
#pragma intrinsic(memcpy)
#define __hot
#define __no_inline __declspec(noinline)
#elif defined(CLANG_OR_GCC)
// The next 3 flags let the compiler turn memory copies into the best sse/avx
// shuffles where possible. Compile tests on gcc and clang have shown these
// flags to be effective.
#pragma GCC optimize("-fpredictive-commoning")
#pragma GCC optimize("-ftree-loop-distribute-patterns")
#pragma GCC optimize("-ftree-vectorize")
#pragma GCC option("--param inline-unit-growth=400")
#pragma GCC option("--param large-function-growth=800")
// The beauty of these compiler options is that they generate better code than
// hand-written intrinsics, since inline-expanded memory transfers can be pattern
// matched with the vector instructions available on the target.
#define __no_inline __attribute__((noinline))
#define __hot __attribute__((hot))
#if !defined(__forceinline)
#define __forceinline inline __attribute__((always_inline))
#endif
#else
#define __hot
#define __no_inline
#define __forceinline
#endif
#pragma region Z_Order
/////////////////////////////////////////////////////////////////////////////
// Z-Order:
//
// 0-->1
// /
// 2-->3
//
// for more information look at: https://en.wikipedia.org/wiki/Z-order_curve
/////////////////////////////////////////////////////////////////////////////
#define TOP_LEFT 0
#define TOP_RIGHT 1
#define BOTTOM_LEFT 2
#define BOTTOM_RIGHT 3
constexpr u32 isRight(u32 block_index) {
return (block_index % 2);
static u32 Part1By1(u32 x) {
x &= 0x0000ffff; // x = ---- ---- ---- ---- fedc ba98 7654 3210
x = (x ^ (x << 8)) & 0x00ff00ff; // x = ---- ---- fedc ba98 ---- ---- 7654 3210
x = (x ^ (x << 4)) & 0x0f0f0f0f; // x = ---- fedc ---- ba98 ---- 7654 ---- 3210
x = (x ^ (x << 2)) & 0x33333333; // x = --fe --dc --ba --98 --76 --54 --32 --10
x = (x ^ (x << 1)) & 0x55555555; // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
return x;
}
constexpr u32 isBottom(u32 block_index) {
return (block_index / 2);
static u32 Compact1By1(u32 x) {
x &= 0x55555555; // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
x = (x ^ (x >> 1)) & 0x33333333; // x = --fe --dc --ba --98 --76 --54 --32 --10
x = (x ^ (x >> 2)) & 0x0f0f0f0f; // x = ---- fedc ---- ba98 ---- 7654 ---- 3210
x = (x ^ (x >> 4)) & 0x00ff00ff; // x = ---- ---- fedc ba98 ---- ---- 7654 3210
x = (x ^ (x >> 8)) & 0x0000ffff; // x = ---- ---- ---- ---- fedc ba98 7654 3210
return x;
}
template <void codec(u8*, u8*, size_t), size_t nibbles, u32 blocks, size_t block_size>
__forceinline static void swizzle_block(u8*& morton_block, u8* linear_block);
template <void codec(u8*, u8*, size_t), size_t nibbles, u32 block_index, u32 blocks,
size_t block_size>
__forceinline static void swizzle_block_aux(u8*& morton_block, u8* linear_block) {
// move the linear_block pointer to the appropriate block
const size_t right = isRight(block_index) * (blocks * nibbles) / 2;
const size_t down = isBottom(block_index) * block_size;
u8* new_linear = linear_block + right + down;
swizzle_block<codec, nibbles, blocks, block_size>(morton_block, new_linear);
static u32 EncodeMorton(u32 x, u32 y) {
return (Part1By1(y) << 1) | Part1By1(x);
}
template <void codec(u8*, u8*, size_t), size_t nibbles, u32 blocks, size_t block_size>
__forceinline static void swizzle_block(u8*& morton_block, u8* linear_block) {
const size_t new_block_size = block_size / 2;
if (blocks <= 2) {
// We handle 2*2 blocks on z-order
const size_t read_size = nibbles; // just for clarity; it's the same amount
// TOP_LEFT & TOP_RIGHT
codec(morton_block, linear_block, read_size);
morton_block += read_size;
// BOTTOM_LEFT & BOTTOM_RIGHT
codec(morton_block, linear_block + new_block_size, read_size);
morton_block += read_size;
} else {
// we divide the block into 4 sub-blocks in z-order, recursing
// until we reach 2x2 blocks.
const u32 subdivide = blocks / 2;
swizzle_block_aux<codec, nibbles, TOP_LEFT, subdivide, new_block_size>(morton_block,
linear_block);
swizzle_block_aux<codec, nibbles, TOP_RIGHT, subdivide, new_block_size>(morton_block,
linear_block);
swizzle_block_aux<codec, nibbles, BOTTOM_LEFT, subdivide, new_block_size>(morton_block,
linear_block);
swizzle_block_aux<codec, nibbles, BOTTOM_RIGHT, subdivide, new_block_size>(morton_block,
linear_block);
}
static u32 DecodeMortonX(u32 code) {
return Compact1By1(code >> 0);
}
template <void codec(u8*, u8*, size_t), size_t nibbles, size_t lines_per_block>
__forceinline static void swizzle_pass(u8* morton_block, u8* linear_block) {
const size_t block_size = (lines_per_block * lines_per_block * nibbles) / 2;
swizzle_block<codec, nibbles, lines_per_block, block_size>(morton_block, linear_block);
}
#pragma endregion Z_Order
template <size_t nibbles, size_t lines_per_block>
__hot inline static void encode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) {
const u32 tile_size = (lines_per_block * lines_per_block * nibbles) / 2;
alignas(64) u8 tmp[tile_size];
tiling_pass<&encode, nibbles, lines_per_block>(linear_buffer, tmp, x_blocks);
swizzle_pass<&encode, nibbles, lines_per_block>(morton_buffer, tmp);
static u32 DecodeMortonY(u32 code) {
return Compact1By1(code >> 1);
}
template <size_t nibbles, size_t lines_per_block>
__hot inline static void decode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) {
const u32 tile_size = (lines_per_block * lines_per_block * nibbles) / 2;
alignas(64) u8 tmp[tile_size];
swizzle_pass<&decode, nibbles, lines_per_block>(morton_buffer, tmp);
tiling_pass<&decode, nibbles, lines_per_block>(linear_buffer, tmp, x_blocks);
u32 MortonOffset(u32 x, u32 y, u32 width, u32 height, u32 tiling, u32 bpp) {
u32 tile = (x + y * height) * width / (tiling * tiling);
tile = (tile * bpp) / 8;
return tile + EncodeMorton(x % tiling, y % tiling);
}
template <void codec(u8*, u8*, u32), size_t nibbles, size_t lines_per_block>
__hot static void morton_pass(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height) {
const u32 x_blocks = (width / lines_per_block);
const u32 y_blocks = (height / lines_per_block);
const size_t line_size = (lines_per_block * nibbles) / 2;
const size_t tile_size = lines_per_block * line_size;
const size_t stride_size = width * line_size;
matrix_buffer = matrix_buffer + ((height * width * nibbles) / 2) - stride_size;
for (u32 y = 0; y < y_blocks; y++) {
u8* linear_buffer = matrix_buffer;
for (u32 x = 0; x != x_blocks; x++) {
codec(morton_buffer, linear_buffer, x_blocks);
linear_buffer += line_size;
morton_buffer += tile_size;
}
matrix_buffer -= stride_size;
}
}
// keep hot code together
__no_inline __hot static void morton_8x8_32(u8* morton_buffer, u8* matrix_buffer, u32 width,
u32 height, bool decode) {
if (decode)
morton_pass<&decode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height);
else
morton_pass<&encode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height);
}
namespace Decoders {
bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) {
if (bpp == 32) {
morton_8x8_32(morton_buffer, matrix_buffer, width, height, true);
return true;
}
switch (bpp) {
case 4: {
morton_pass<&decode_pass<1, 8>, 1, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 8: {
morton_pass<&decode_pass<2, 8>, 2, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 16: {
morton_pass<&decode_pass<4, 8>, 4, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 24: {
morton_pass<&decode_pass<6, 8>, 6, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
default: {
return false;
break;
}
}
}
bool Morton_32x32(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) {
switch (bpp) {
case 4: {
morton_pass<&decode_pass<1, 32>, 1, 32>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 8: {
morton_pass<&decode_pass<2, 32>, 2, 32>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 16: {
morton_pass<&decode_pass<4, 32>, 4, 32>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 24: {
morton_pass<&decode_pass<6, 32>, 6, 32>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 32: {
morton_pass<&decode_pass<8, 32>, 8, 32>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
default: {
return false;
break;
}
}
}
}
namespace Encoders {
bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) {
if (bpp == 32) {
morton_8x8_32(morton_buffer, matrix_buffer, width, height, false);
return true;
}
switch (bpp) {
case 4: {
morton_pass<&encode_pass<1, 8>, 1, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 8: {
morton_pass<&encode_pass<2, 8>, 2, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 16: {
morton_pass<&encode_pass<4, 8>, 4, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 24: {
morton_pass<&encode_pass<6, 8>, 6, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
default: {
return false;
break;
}
}
}
bool Morton_32x32(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) {
switch (bpp) {
case 4: {
morton_pass<&encode_pass<1, 32>, 1, 32>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 8: {
morton_pass<&encode_pass<2, 32>, 2, 32>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 16: {
morton_pass<&encode_pass<4, 32>, 4, 32>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 24: {
morton_pass<&encode_pass<6, 32>, 6, 32>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 32: {
morton_pass<&encode_pass<8, 32>, 8, 32>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
default: {
return false;
break;
}
}
}
}
#include "morton8x8_optimized.cpp"

View File

@ -2,14 +2,12 @@
#include "common/common_types.h"
enum class MortonPass { Tile8x8, Tile32x32 };
u32 MortonOffset(u32 x, u32 y, u32 width, u32 height, u32 tiling, u32 bpp);
namespace Decoders {
bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp);
bool Morton_32x32(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp);
}
namespace Encoders {
bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp);
bool Morton_32x32(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp);
}
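
A minimal caller of the tiled/linear conversion entry points declared above; this assumes it is built inside the tree so the includes resolve, and that it links against morton.cpp from this commit.

#include <cstdio>
#include "common/common_types.h"
#include "video_core/texture/internal/morton.h"

int main() {
    alignas(64) u8 tiled[8 * 8 * 4] = {};  // one 8x8 tile at 32 bpp
    alignas(64) u8 linear[8 * 8 * 4] = {};
    // Only 4, 8, 16, 24 and 32 bpp are handled; anything else returns false.
    if (!Decoders::Morton_8x8(tiled, linear, 8, 8, 32))
        std::printf("unsupported bits-per-pixel\n");
    return 0;
}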

View File

@ -0,0 +1,253 @@
#include <cstring>
#include <memory>
#include <utility>
#include "common/common_types.h"
#if ((defined(__clang__) || defined(__GNUC__)) && !defined(__INTEL_COMPILER))
#define CLANG_OR_GCC
#endif
///////////////////////////////////////////////////////////////////////////////
// Optimizations
//////////////////////////////////////////////////////////////////////////////
#ifdef _MSC_VER
#pragma inline_recursion(on)
#pragma intrinsic(memcpy)
#define __hot
#define __no_inline __declspec(noinline)
#elif defined(CLANG_OR_GCC)
#pragma GCC push_options
// The next 3 flags let the compiler turn memory copies into the best sse/avx
// shuffles where possible. Compile tests on gcc and clang have shown these
// flags to be effective.
#pragma GCC optimize("-fpredictive-commoning")
#pragma GCC optimize("-ftree-loop-distribute-patterns")
#pragma GCC optimize("-ftree-vectorize")
// The beauty of these compiler options is that they generate better code than
// hand-written intrinsics, since inline-expanded memory transfers can be pattern
// matched with the vector instructions available on the target.
#define __no_inline __attribute__((noinline))
#define __hot __attribute__((hot))
#if !defined(__forceinline)
#define __forceinline inline __attribute__((always_inline))
#endif
#else
#define __hot
#define __no_inline
#define __forceinline inline
#endif
#pragma region Z_Order
/////////////////////////////////////////////////////////////////////////////
// Z-Order:
//
// 0-->1
// /
// 2-->3
//
// for more information look at: https://en.wikipedia.org/wiki/Z-order_curve
/////////////////////////////////////////////////////////////////////////////
#define TOP_LEFT 0
#define TOP_RIGHT 1
#define BOTTOM_LEFT 2
#define BOTTOM_RIGHT 3
// @param read_size is the number of bytes each pixel takes
__forceinline void decode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
std::memcpy(matrix_pointer, morton_pointer, read_size);
}
// @param read_size is the number of bytes each pixel takes
__forceinline void encode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
std::memcpy(morton_pointer, matrix_pointer, read_size);
}
constexpr u32 isRight(u32 block_index) {
return (block_index % 2);
}
constexpr u32 isBottom(u32 block_index) {
return (block_index / 2);
}
template <void codec(u8*, u8*, size_t), size_t nibbles, u32 blocks, size_t block_size>
__forceinline static void swizzle_block(u8*& morton_block, u8* linear_block);
template <void codec(u8*, u8*, size_t), size_t nibbles, u32 block_index, u32 blocks,
size_t block_size>
__forceinline static void swizzle_block_aux(u8*& morton_block, u8* linear_block) {
// move the linear_block pointer to the appropriate block
const size_t right = isRight(block_index) * (blocks * nibbles) / 2;
const size_t down = isBottom(block_index) * block_size;
u8* new_linear = linear_block + right + down;
swizzle_block<codec, nibbles, blocks, block_size>(morton_block, new_linear);
}
template <void codec(u8*, u8*, size_t), size_t nibbles, u32 blocks, size_t block_size>
__forceinline static void swizzle_block(u8*& morton_block, u8* linear_block) {
const size_t new_block_size = block_size / 2;
if (blocks <= 2) {
// We handle 2*2 blocks on z-order
const size_t read_size = nibbles; // just for clarity; it's the same amount
// TOP_LEFT & TOP_RIGHT
codec(morton_block, linear_block, read_size);
morton_block += read_size;
// BOTTOM_LEFT & BOTTOM_RIGHT
codec(morton_block, linear_block + new_block_size, read_size);
morton_block += read_size;
} else {
// we divide the block into 4 sub-blocks in z-order, recursing
// until we reach 2x2 blocks.
const u32 subdivide = blocks / 2;
swizzle_block_aux<codec, nibbles, TOP_LEFT, subdivide, new_block_size>(morton_block,
linear_block);
swizzle_block_aux<codec, nibbles, TOP_RIGHT, subdivide, new_block_size>(morton_block,
linear_block);
swizzle_block_aux<codec, nibbles, BOTTOM_LEFT, subdivide, new_block_size>(morton_block,
linear_block);
swizzle_block_aux<codec, nibbles, BOTTOM_RIGHT, subdivide, new_block_size>(morton_block,
linear_block);
}
}
template <void codec(u8*, u8*, size_t), size_t nibbles, size_t lines_per_block>
__forceinline static void swizzle_pass(u8* morton_block, u8* linear_block) {
const size_t block_size = (lines_per_block * lines_per_block * nibbles) / 2;
swizzle_block<codec, nibbles, lines_per_block, block_size>(morton_block, linear_block);
}
#pragma endregion Z_Order
template <void codec(u8*, u8*, size_t), size_t nibbles, size_t lines_per_block>
__forceinline void tiling_pass(u8* linear, u8* tiled, u32 x_blocks) {
const size_t tiled_line_size = (lines_per_block * nibbles) / 2;
const size_t row_length = x_blocks * tiled_line_size;
for (u32 i = 0; i < lines_per_block; i++) {
const u32 k = (lines_per_block - 1 - i);
const size_t tiled_index = i * tiled_line_size;
const size_t linear_index = k * row_length;
codec(tiled + tiled_index, linear + linear_index, tiled_line_size);
}
}
template <size_t nibbles, size_t lines_per_block>
__forceinline static void encode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) {
const u32 tile_size = (lines_per_block * lines_per_block * nibbles) / 2;
alignas(64) u8 tmp[tile_size];
tiling_pass<&encode, nibbles, lines_per_block>(linear_buffer, tmp, x_blocks);
swizzle_pass<&encode, nibbles, lines_per_block>(morton_buffer, tmp);
}
template <size_t nibbles, size_t lines_per_block>
__forceinline static void decode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) {
const u32 tile_size = (lines_per_block * lines_per_block * nibbles) / 2;
alignas(64) u8 tmp[tile_size];
swizzle_pass<&decode, nibbles, lines_per_block>(morton_buffer, tmp);
tiling_pass<&decode, nibbles, lines_per_block>(linear_buffer, tmp, x_blocks);
}
template <void codec(u8*, u8*, u32), size_t nibbles, size_t lines_per_block>
static void morton_pass(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height) {
const u32 x_blocks = (width / lines_per_block);
const u32 y_blocks = (height / lines_per_block);
const size_t line_size = (lines_per_block * nibbles) / 2;
const size_t tile_size = lines_per_block * line_size;
const size_t stride_size = width * line_size;
matrix_buffer = matrix_buffer + ((height * width * nibbles) / 2) - stride_size;
for (u32 y = 0; y < y_blocks; y++) {
u8* linear_buffer = matrix_buffer;
for (u32 x = 0; x != x_blocks; x++) {
codec(morton_buffer, linear_buffer, x_blocks);
linear_buffer += line_size;
morton_buffer += tile_size;
}
matrix_buffer -= stride_size;
}
}
namespace Decoders {
bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) {
switch (bpp) {
case 4: {
morton_pass<&decode_pass<1, 8>, 1, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 8: {
morton_pass<&decode_pass<2, 8>, 2, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 16: {
morton_pass<&decode_pass<4, 8>, 4, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 24: {
morton_pass<&decode_pass<6, 8>, 6, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 32: {
morton_pass<&decode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
default: {
return false;
break;
}
}
}
}
namespace Encoders {
bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) {
switch (bpp) {
case 4: {
morton_pass<&encode_pass<1, 8>, 1, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 8: {
morton_pass<&encode_pass<2, 8>, 2, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 16: {
morton_pass<&encode_pass<4, 8>, 4, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 24: {
morton_pass<&encode_pass<6, 8>, 6, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
case 32: {
morton_pass<&encode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
default: {
return false;
break;
}
}
}
}
#ifdef _MSC_VER
#undef __hot
#undef __no_inline
#elif defined(CLANG_OR_GCC)
#pragma GCC pop_options
#undef __no_inline
#undef __hot
#else
#undef __hot
#undef __no_inline
#undef __forceinline
#endif
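
morton_pass above consumes the tiled buffer tile after tile while filling the linear image bottom-up: matrix_buffer starts at the last row of 8x8 blocks and moves back by stride_size after each block row. The pointer arithmetic for a small 16x16, 32 bpp surface, checked in isolation:

#include <cassert>
#include <cstddef>

int main() {
    const size_t nibbles = 8, lines_per_block = 8;             // 32 bpp, 8x8 tiles
    const size_t width = 16, height = 16;                      // a 2x2 grid of tiles

    const size_t x_blocks = width / lines_per_block;           // 2
    const size_t y_blocks = height / lines_per_block;          // 2
    const size_t line_size = (lines_per_block * nibbles) / 2;  // 32 bytes per tile row
    const size_t tile_size = lines_per_block * line_size;      // 256 bytes per tile
    const size_t stride_size = width * line_size;              // 512 bytes per block row
    const size_t total_size = (height * width * nibbles) / 2;  // 1024 bytes in total

    assert(total_size - stride_size == 512);     // initial matrix_buffer offset: last block row
    assert(x_blocks * tile_size == stride_size); // one row of tiles consumes one stride
    assert(y_blocks * x_blocks * tile_size == total_size); // every byte is visited exactly once
    return 0;
}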

View File

@ -16,6 +16,7 @@
#ifdef _MSC_VER
#pragma inline_recursion(on)
#elif defined(CLANG_OR_GCC)
#pragma GCC push_options
#pragma GCC optimize("-fpeel-loops")
#pragma GCC optimize("-fpredictive-commoning")
#pragma GCC optimize("-ftree-loop-distribute-patterns")
@ -74,24 +75,6 @@ inline void image_pass(u8* target, u32 width, u32 height) {
image_pass_aux_rev<pass, read_size, write_size, tuning>(target, width, height);
}
template <void codec(u8*, u8*, size_t), size_t nibbles, size_t lines_per_block>
void tiling_pass(u8* linear, u8* tiled, u32 x_blocks) {
const size_t tiled_line_size = (lines_per_block * nibbles) / 2;
const size_t row_length = x_blocks * tiled_line_size;
for (u32 i = 0; i < lines_per_block; i++) {
const u32 k = (lines_per_block - 1 - i);
const size_t tiled_index = i * tiled_line_size;
const size_t linear_index = k * row_length;
codec(tiled + tiled_index, linear + linear_index, tiled_line_size);
}
}
// @param read_size is the number of bytes each pixel takes
inline void decode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
std::memcpy(matrix_pointer, morton_pointer, read_size);
}
// @param read_size is the number of bytes each pixel takes
inline void encode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
std::memcpy(morton_pointer, matrix_pointer, read_size);
}
#if defined(CLANG_OR_GCC)
#pragma GCC pop_options
#endif
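
image_pass itself is not part of this diff (only the image_pass_aux_rev call is visible), but the call sites in the decoders, e.g. image_pass<&Decode::expand_depth16_pass, 4, 8>, suggest an in-place per-pixel pass whose read and write strides are given in nibbles and which walks backwards when the output is wider than the input, so pixels that have not been read yet are never overwritten. A standalone sketch of that idea under those assumptions, using byte strides instead of nibbles to keep it short:

#include <cassert>
#include <cstdint>
#include <cstring>

// Hypothetical in-place expanding pass: last pixel first, so the widened output
// of pixel i never clobbers the still-unread input of pixels 0..i-1.
template <void pass(uint8_t*, uint8_t*), size_t read_bytes, size_t write_bytes>
static void image_pass_sketch(uint8_t* target, uint32_t width, uint32_t height) {
    for (size_t i = size_t(width) * height; i-- > 0;)
        pass(target + i * read_bytes, target + i * write_bytes);
}

// Example pixel pass: widen a 16-bit depth value into a 32-bit slot
// (the real expand_depth16_pass also rescales; this just zero-extends).
static void expand_depth16(uint8_t* read, uint8_t* write) {
    uint16_t d16;
    std::memcpy(&d16, read, 2);
    uint32_t d32 = d16;
    std::memcpy(write, &d32, 4);
}

int main() {
    uint8_t buf[4 * 4] = {};            // room for 4 pixels at the widened size
    uint16_t src[4] = {1, 2, 3, 4};
    std::memcpy(buf, src, sizeof(src)); // packed 16-bit input at the front
    image_pass_sketch<&expand_depth16, 2, 4>(buf, 2, 2);
    uint32_t out;
    std::memcpy(&out, buf + 3 * 4, 4);
    assert(out == 4);                   // last pixel expanded in place
    return 0;
}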