Integrated the texture submodule into gl backend

This commit is contained in:
Fernando Sahmkow 2017-01-15 18:15:29 -05:00
parent 1a4c8d510d
commit 30f0d1dbf4
16 changed files with 436 additions and 450 deletions

View File

@ -407,6 +407,11 @@ inline float Vec3<float>::Normalize() {
return length;
}
/// Packs the three 8-bit components into a single integer:
/// x occupies bits 0-7, y bits 8-15 and z bits 16-23.
template <>
inline unsigned int Vec3<unsigned char>::ToRGB() const {
    const unsigned int low_byte = x;
    const unsigned int mid_byte = static_cast<unsigned int>(y) << 8;
    const unsigned int high_byte = static_cast<unsigned int>(z) << 16;
    return high_byte | mid_byte | low_byte;
}
typedef Vec3<float> Vec3f;
template <typename T>
@ -611,6 +616,11 @@ public:
#undef _DEFINE_SWIZZLER3
};
/// Packs the four 8-bit components into a single 32-bit value:
/// x occupies bits 0-7, y bits 8-15, z bits 16-23 and w bits 24-31.
template <>
inline unsigned int Vec4<unsigned char>::ToRGBA() const {
    // Widen to unsigned before shifting: an unsigned char promotes to (signed) int, and
    // shifting a value >= 128 left by 24 would overflow into the sign bit.
    return (static_cast<unsigned int>(w) << 24) | (static_cast<unsigned int>(z) << 16) |
           (static_cast<unsigned int>(y) << 8) | static_cast<unsigned int>(x);
}
template <typename T, typename V>
Vec4<decltype(V{} * T{})> operator*(const V& f, const Vec4<T>& vec) {
return MakeVec(f * vec.x, f * vec.y, f * vec.z, f * vec.w);

View File

@ -1,14 +1,14 @@
set(SRCS
texture/internal/morton.cpp
texture/internal/etc1.cpp
texture/codec.cpp
texture/internal/codecs.cpp
renderer_opengl/gl_rasterizer.cpp
renderer_opengl/gl_rasterizer_cache.cpp
renderer_opengl/gl_shader_gen.cpp
renderer_opengl/gl_shader_util.cpp
renderer_opengl/gl_state.cpp
renderer_opengl/renderer_opengl.cpp
texture/internal/morton.cpp
texture/internal/etc1.cpp
texture/internal/codecs.cpp
texture/codec.cpp
debug_utils/debug_utils.cpp
clipper.cpp
command_processor.cpp
@ -25,12 +25,6 @@ set(SRCS
set(HEADERS
debug_utils/debug_utils.h
texture/internal/texture_utils.h
texture/internal/morton.h
texture/internal/etc1.h
texture/codec.h
texture/formats.h
texture/internal/codecs.h
renderer_opengl/gl_rasterizer.h
renderer_opengl/gl_rasterizer_cache.h
renderer_opengl/gl_resource_manager.h
@ -39,6 +33,12 @@ set(HEADERS
renderer_opengl/gl_state.h
renderer_opengl/pica_to_gl.h
renderer_opengl/renderer_opengl.h
texture/internal/texture_utils.h
texture/internal/morton.h
texture/internal/etc1.h
texture/internal/codecs.h
texture/codec.h
texture/formats.h
clipper.h
command_processor.h
gpu_debugger.h

View File

@ -21,6 +21,7 @@
#include "video_core/renderer_opengl/gl_shader_util.h"
#include "video_core/renderer_opengl/pica_to_gl.h"
#include "video_core/renderer_opengl/renderer_opengl.h"
#include "video_core/texture/formats.h"
MICROPROFILE_DEFINE(OpenGL_Drawing, "OpenGL", "Drawing", MP_RGB(128, 128, 192));
MICROPROFILE_DEFINE(OpenGL_Blits, "OpenGL", "Blits", MP_RGB(100, 100, 255));
@ -716,7 +717,6 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(PAddr addr, u32 size) {
bool RasterizerOpenGL::AccelerateDisplayTransfer(const GPU::Regs::DisplayTransferConfig& config) {
MICROPROFILE_SCOPE(OpenGL_Blits);
using PixelFormat = CachedSurface::PixelFormat;
using SurfaceType = CachedSurface::SurfaceType;
CachedSurface src_params;
@ -728,7 +728,7 @@ bool RasterizerOpenGL::AccelerateDisplayTransfer(const GPU::Regs::DisplayTransfe
// the image, and it allows for smaller texture cache lookup rectangles.
src_params.height = config.output_height;
src_params.is_tiled = !config.input_linear;
src_params.pixel_format = CachedSurface::PixelFormatFromGPUPixelFormat(config.input_format);
src_params.pixel_format = Pica::Texture::Format::FromGPUPixelFormat(config.input_format);
CachedSurface dst_params;
dst_params.addr = config.GetPhysicalOutputAddress();
@ -737,7 +737,7 @@ bool RasterizerOpenGL::AccelerateDisplayTransfer(const GPU::Regs::DisplayTransfe
dst_params.height =
config.scaling == config.ScaleXY ? config.output_height / 2 : config.output_height.Value();
dst_params.is_tiled = config.input_linear != config.dont_swizzle;
dst_params.pixel_format = CachedSurface::PixelFormatFromGPUPixelFormat(config.output_format);
dst_params.pixel_format = Pica::Texture::Format::FromGPUPixelFormat(config.output_format);
MathUtil::Rectangle<int> src_rect;
CachedSurface* src_surface = res_cache.GetSurfaceRect(src_params, false, true, src_rect);
@ -776,7 +776,7 @@ bool RasterizerOpenGL::AccelerateDisplayTransfer(const GPU::Regs::DisplayTransfe
}
u32 dst_size = dst_params.width * dst_params.height *
CachedSurface::GetFormatBpp(dst_params.pixel_format) / 8;
Pica::Texture::Format::GetBpp(dst_params.pixel_format) / 8;
dst_surface->dirty = true;
res_cache.FlushRegion(config.GetPhysicalOutputAddress(), dst_size, dst_surface, true);
return true;
@ -789,7 +789,6 @@ bool RasterizerOpenGL::AccelerateTextureCopy(const GPU::Regs::DisplayTransferCon
bool RasterizerOpenGL::AccelerateFill(const GPU::Regs::MemoryFillConfig& config) {
MICROPROFILE_SCOPE(OpenGL_Blits);
using PixelFormat = CachedSurface::PixelFormat;
using SurfaceType = CachedSurface::SurfaceType;
CachedSurface* dst_surface = res_cache.TryGetFillSurface(config);
@ -824,7 +823,7 @@ bool RasterizerOpenGL::AccelerateFill(const GPU::Regs::MemoryFillConfig& config)
if (config.fill_24bit) {
switch (dst_surface->pixel_format) {
case PixelFormat::RGB8:
case Pica::Texture::Format::Type::RGB8:
color_values[0] = config.value_24bit_r / 255.0f;
color_values[1] = config.value_24bit_g / 255.0f;
color_values[2] = config.value_24bit_b / 255.0f;
@ -836,7 +835,7 @@ bool RasterizerOpenGL::AccelerateFill(const GPU::Regs::MemoryFillConfig& config)
u32 value = config.value_32bit;
switch (dst_surface->pixel_format) {
case PixelFormat::RGBA8:
case Pica::Texture::Format::Type::RGBA8:
color_values[0] = (value >> 24) / 255.0f;
color_values[1] = ((value >> 16) & 0xFF) / 255.0f;
color_values[2] = ((value >> 8) & 0xFF) / 255.0f;
@ -850,34 +849,34 @@ bool RasterizerOpenGL::AccelerateFill(const GPU::Regs::MemoryFillConfig& config)
Math::Vec4<u8> color;
switch (dst_surface->pixel_format) {
case PixelFormat::RGBA8:
case Pica::Texture::Format::Type::RGBA8:
color_values[0] = (value_16bit >> 8) / 255.0f;
color_values[1] = (value_16bit & 0xFF) / 255.0f;
color_values[2] = color_values[0];
color_values[3] = color_values[1];
break;
case PixelFormat::RGB5A1:
case Pica::Texture::Format::Type::RGB5A1:
color = Color::DecodeRGB5A1((const u8*)&value_16bit);
color_values[0] = color[0] / 31.0f;
color_values[1] = color[1] / 31.0f;
color_values[2] = color[2] / 31.0f;
color_values[3] = color[3];
break;
case PixelFormat::RGB565:
case Pica::Texture::Format::Type::RGB565:
color = Color::DecodeRGB565((const u8*)&value_16bit);
color_values[0] = color[0] / 31.0f;
color_values[1] = color[1] / 63.0f;
color_values[2] = color[2] / 31.0f;
break;
case PixelFormat::RGBA4:
case Pica::Texture::Format::Type::RGBA4:
color = Color::DecodeRGBA4((const u8*)&value_16bit);
color_values[0] = color[0] / 15.0f;
color_values[1] = color[1] / 15.0f;
color_values[2] = color[2] / 15.0f;
color_values[3] = color[3] / 15.0f;
break;
case PixelFormat::IA8:
case PixelFormat::RG8:
case Pica::Texture::Format::Type::IA8:
case Pica::Texture::Format::Type::RG8:
color_values[0] = (value_16bit >> 8) / 255.0f;
color_values[1] = (value_16bit & 0xFF) / 255.0f;
break;
@ -899,9 +898,9 @@ bool RasterizerOpenGL::AccelerateFill(const GPU::Regs::MemoryFillConfig& config)
glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0);
GLfloat value_float;
if (dst_surface->pixel_format == CachedSurface::PixelFormat::D16) {
if (dst_surface->pixel_format == Pica::Texture::Format::Type::D16) {
value_float = config.value_32bit / 65535.0f; // 2^16 - 1
} else if (dst_surface->pixel_format == CachedSurface::PixelFormat::D24) {
} else if (dst_surface->pixel_format == Pica::Texture::Format::Type::D24) {
value_float = config.value_32bit / 16777215.0f; // 2^24 - 1
}
@ -945,7 +944,7 @@ bool RasterizerOpenGL::AccelerateDisplay(const GPU::Regs::FramebufferConfig& con
src_params.height = config.height;
src_params.pixel_stride = pixel_stride;
src_params.is_tiled = false;
src_params.pixel_format = CachedSurface::PixelFormatFromGPUPixelFormat(config.color_format);
src_params.pixel_format = Pica::Texture::Format::FromGPUPixelFormat(config.color_format);
MathUtil::Rectangle<int> src_rect;
CachedSurface* src_surface = res_cache.GetSurfaceRect(src_params, false, true, src_rect);

View File

@ -21,6 +21,8 @@
#include "video_core/pica_state.h"
#include "video_core/renderer_opengl/gl_rasterizer_cache.h"
#include "video_core/renderer_opengl/gl_state.h"
#include "video_core/texture/codec.h"
#include "video_core/texture/formats.h"
#include "video_core/utils.h"
#include "video_core/video_core.h"
@ -30,21 +32,48 @@ struct FormatTuple {
GLenum type;
};
// OpenGL {internal_format, format, type} tuples for the five color-buffer formats.
// The table is indexed by the numeric pixel-format value, so the entry order must
// match the first five enumerators of the pixel-format enum.
static const std::array<FormatTuple, 5> fb_format_tuples = {{
{GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8}, // RGBA8
{GL_RGB8, GL_BGR, GL_UNSIGNED_BYTE}, // RGB8
{GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_5_5_5_1}, // RGB5A1
{GL_RGB565, GL_RGB, GL_UNSIGNED_SHORT_5_6_5}, // RGB565
{GL_RGBA4, GL_RGBA, GL_UNSIGNED_SHORT_4_4_4_4}, // RGBA4
}};
static const std::array<FormatTuple, 4> depth_format_tuples = {{
static const std::array<FormatTuple, 18> format_tuples = {{
{GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8}, // RGBA8
{GL_RGB8, GL_BGR, GL_UNSIGNED_BYTE}, // RGB8
{GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_5_5_5_1}, // RGB5A1
{GL_RGB565, GL_RGB, GL_UNSIGNED_SHORT_5_6_5}, // RGB565
{GL_RGBA4, GL_RGBA, GL_UNSIGNED_SHORT_4_4_4_4}, // RGBA4
{GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // IA8
{GL_RG8, GL_RG8, GL_UNSIGNED_BYTE}, // RG8
{GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // I8
{GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // A8
{GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // IA4
{GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // I4
{GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // A4
{GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // ETC1
{GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // ETC1A4
{GL_DEPTH_COMPONENT16, GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT}, // D16
{},
{GL_DEPTH_COMPONENT24, GL_DEPTH_COMPONENT, GL_UNSIGNED_INT}, // D24
{GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, // D24S8
}};
// Per-format flag, indexed by the numeric pixel-format value (same order as the
// format tuple table). The negated flag is what the surface cache passes to the
// codec's configRGBATransform(), and FlushSurface reads back 4 bytes per pixel for
// every format whose flag is false.
static const std::array<bool, 18> native_format = {{
    /* RGBA8    */ true,
    /* RGB8     */ true,
    /* RGB5A1   */ true,
    /* RGB565   */ true,
    /* RGBA4    */ true,
    /* IA8      */ false,
    /* RG8      */ true,
    /* I8       */ false,
    /* A8       */ false,
    /* IA4      */ false,
    /* I4       */ false,
    /* A4       */ false,
    /* ETC1     */ false,
    /* ETC1A4   */ false,
    /* D16      */ true,
    /* (unused) */ false,
    /* D24      */ false,
    /* D24S8    */ false,
}};
RasterizerCacheOpenGL::RasterizerCacheOpenGL() {
transfer_framebuffers[0].Create();
transfer_framebuffers[1].Create();
@ -54,55 +83,6 @@ RasterizerCacheOpenGL::~RasterizerCacheOpenGL() {
FlushAll();
}
/**
 * Copies every pixel of a surface between a Morton-ordered buffer and a linear buffer
 * whose rows are vertically flipped relative to the Morton data.
 *
 * @param pixel_format       Surface format; D24S8 pixels get their bytes rotated during the copy.
 * @param width              Surface width in pixels.
 * @param height             Surface height in pixels.
 * @param bytes_per_pixel    Pixel size in the Morton buffer (also the amount copied per pixel
 *                           for every format except D24S8, which always moves 4 bytes).
 * @param gl_bytes_per_pixel Pixel stride in the linear (GL) buffer.
 * @param morton_data        Morton-ordered buffer.
 * @param gl_data            Linear buffer.
 * @param morton_to_gl       true copies morton_data -> gl_data, false copies the other way.
 */
static void MortonCopyPixels(CachedSurface::PixelFormat pixel_format, u32 width, u32 height,
                             u32 bytes_per_pixel, u32 gl_bytes_per_pixel, u8* morton_data,
                             u8* gl_data, bool morton_to_gl) {
    const bool is_depth_stencil = pixel_format == CachedSurface::PixelFormat::D24S8;

    // Depth and stencil are ordered differently on the 3DS and in OpenGL, so the 32-bit
    // value is rotated by one byte; the rotation direction depends on the copy direction.
    const u32 left_shift = morton_to_gl ? 8 : 24;
    const u32 right_shift = morton_to_gl ? 24 : 8;

    for (unsigned row = 0; row < height; ++row) {
        // First row of the 8-row band that contains this row.
        const u32 band_start = row & ~7;

        for (unsigned col = 0; col < width; ++col) {
            const u32 tiled_offset = VideoCore::GetMortonOffset(col, row, bytes_per_pixel) +
                                     band_start * width * bytes_per_pixel;
            const u32 linear_offset = (col + (height - 1 - row) * width) * gl_bytes_per_pixel;

            u8* const tiled_pixel = morton_data + tiled_offset;
            u8* const linear_pixel = &gl_data[linear_offset];

            u8* const dst = morton_to_gl ? linear_pixel : tiled_pixel;
            const u8* const src = morton_to_gl ? tiled_pixel : linear_pixel;

            if (is_depth_stencil) {
                u32 packed;
                memcpy(&packed, src, sizeof(u32));
                packed = (packed << left_shift) | (packed >> right_shift);
                memcpy(dst, &packed, sizeof(u32));
            } else {
                memcpy(dst, src, bytes_per_pixel);
            }
        }
    }
}
void RasterizerCacheOpenGL::BlitTextures(GLuint src_tex, GLuint dst_tex,
CachedSurface::SurfaceType type,
const MathUtil::Rectangle<int>& src_rect,
@ -184,7 +164,7 @@ bool RasterizerCacheOpenGL::TryBlitSurfaces(CachedSurface* src_surface,
return true;
}
static void AllocateSurfaceTexture(GLuint texture, CachedSurface::PixelFormat pixel_format,
static void AllocateSurfaceTexture(GLuint texture, Pica::Texture::Format::Type pixel_format,
u32 width, u32 height) {
// Allocate an uninitialized texture of appropriate size and format for the surface
using SurfaceType = CachedSurface::SurfaceType;
@ -199,17 +179,8 @@ static void AllocateSurfaceTexture(GLuint texture, CachedSurface::PixelFormat pi
SurfaceType type = CachedSurface::GetFormatType(pixel_format);
FormatTuple tuple;
if (type == SurfaceType::Color) {
ASSERT((size_t)pixel_format < fb_format_tuples.size());
tuple = fb_format_tuples[(unsigned int)pixel_format];
} else if (type == SurfaceType::Depth || type == SurfaceType::DepthStencil) {
size_t tuple_idx = (size_t)pixel_format - 14;
ASSERT(tuple_idx < depth_format_tuples.size());
tuple = depth_format_tuples[tuple_idx];
} else {
tuple = {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE};
}
ASSERT((size_t)pixel_format < format_tuples.size());
FormatTuple tuple = format_tuples[(unsigned int)pixel_format];
glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, width, height, 0, tuple.format,
tuple.type, nullptr);
@ -227,7 +198,7 @@ static void AllocateSurfaceTexture(GLuint texture, CachedSurface::PixelFormat pi
MICROPROFILE_DEFINE(OpenGL_SurfaceUpload, "OpenGL", "Surface Upload", MP_RGB(128, 64, 192));
CachedSurface* RasterizerCacheOpenGL::GetSurface(const CachedSurface& params, bool match_res_scale,
bool load_if_create) {
using PixelFormat = CachedSurface::PixelFormat;
using PixelFormat = Pica::Texture::Format::Type;
using SurfaceType = CachedSurface::SurfaceType;
if (params.addr == 0) {
@ -235,7 +206,7 @@ CachedSurface* RasterizerCacheOpenGL::GetSurface(const CachedSurface& params, bo
}
u32 params_size =
params.width * params.height * CachedSurface::GetFormatBpp(params.pixel_format) / 8;
params.width * params.height * Pica::Texture::Format::GetBpp(params.pixel_format) / 8;
// Check for an exact match in existing surfaces
CachedSurface* best_exact_surface = nullptr;
@ -320,72 +291,36 @@ CachedSurface* RasterizerCacheOpenGL::GetSurface(const CachedSurface& params, bo
if (!new_surface->is_tiled) {
// TODO: Ensure this will always be a color format, not a depth or other format
ASSERT((size_t)new_surface->pixel_format < fb_format_tuples.size());
const FormatTuple& tuple = fb_format_tuples[(unsigned int)params.pixel_format];
// ASSERT((size_t)new_surface->pixel_format < format_tuples.size());
const FormatTuple& tuple = format_tuples[(unsigned int)params.pixel_format];
glPixelStorei(GL_UNPACK_ROW_LENGTH, (GLint)new_surface->pixel_stride);
glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height, 0,
tuple.format, tuple.type, texture_src_data);
glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
} else {
SurfaceType type = CachedSurface::GetFormatType(new_surface->pixel_format);
if (type != SurfaceType::Depth && type != SurfaceType::DepthStencil) {
FormatTuple tuple;
if ((size_t)params.pixel_format < fb_format_tuples.size()) {
tuple = fb_format_tuples[(unsigned int)params.pixel_format];
} else {
// Texture
tuple = {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE};
}
std::vector<Math::Vec4<u8>> tex_buffer(params.width * params.height);
Pica::DebugUtils::TextureInfo tex_info;
tex_info.width = params.width;
tex_info.height = params.height;
tex_info.stride =
params.width * CachedSurface::GetFormatBpp(params.pixel_format) / 8;
tex_info.format = (Pica::Regs::TextureFormat)params.pixel_format;
tex_info.physical_address = params.addr;
for (unsigned y = 0; y < params.height; ++y) {
for (unsigned x = 0; x < params.width; ++x) {
tex_buffer[x + params.width * y] = Pica::DebugUtils::LookupTexture(
texture_src_data, x, params.height - 1 - y, tex_info);
}
}
glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height,
0, GL_RGBA, GL_UNSIGNED_BYTE, tex_buffer.data());
} else {
// Depth/Stencil formats need special treatment since they aren't sampleable using
// LookupTexture and can't use RGBA format
size_t tuple_idx = (size_t)params.pixel_format - 14;
ASSERT(tuple_idx < depth_format_tuples.size());
const FormatTuple& tuple = depth_format_tuples[tuple_idx];
u32 bytes_per_pixel = CachedSurface::GetFormatBpp(params.pixel_format) / 8;
// OpenGL needs 4 bpp alignment for D24 since using GL_UNSIGNED_INT as type
bool use_4bpp = (params.pixel_format == PixelFormat::D24);
u32 gl_bytes_per_pixel = use_4bpp ? 4 : bytes_per_pixel;
std::vector<u8> temp_fb_depth_buffer(params.width * params.height *
gl_bytes_per_pixel);
u8* temp_fb_depth_buffer_ptr =
use_4bpp ? temp_fb_depth_buffer.data() + 1 : temp_fb_depth_buffer.data();
MortonCopyPixels(params.pixel_format, params.width, params.height, bytes_per_pixel,
gl_bytes_per_pixel, texture_src_data, temp_fb_depth_buffer_ptr,
true);
glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height,
0, tuple.format, tuple.type, temp_fb_depth_buffer.data());
}
const FormatTuple& tuple = format_tuples[(unsigned int)params.pixel_format];
std::unique_ptr<Pica::Texture::Codec> tmp = Pica::Texture::CodecFactory::build(
// clang-format off
params.pixel_format, texture_src_data, params.width, params.height
// clang-format on
);
Pica::Texture::Codec* codec = tmp.get();
codec->configTiling(true, 8); // use 32 instead of 8 if the image is tiled
// in blocks of 32x32
codec->configRGBATransform(!native_format[(unsigned int)params.pixel_format]);
codec->decode();
std::unique_ptr<u8[]> decoded_texture = codec->transferInternalBuffer();
u32 bytes = codec->getInternalBytesPerPixel();
if (bytes == 3)
bytes = 1;
else if (bytes != 2)
bytes = 4;
glPixelStorei(GL_UNPACK_ALIGNMENT, bytes);
glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height, 0,
tuple.format, tuple.type, decoded_texture.get());
glPixelStorei(GL_UNPACK_ALIGNMENT, 4);
}
// If not 1x scale, blit 1x texture to a new scaled texture and replace texture in surface
if (new_surface->res_scale_width != 1.f || new_surface->res_scale_height != 1.f) {
OGLTexture scaled_texture;
@ -430,7 +365,7 @@ CachedSurface* RasterizerCacheOpenGL::GetSurfaceRect(const CachedSurface& params
}
u32 total_pixels = params.width * params.height;
u32 params_size = total_pixels * CachedSurface::GetFormatBpp(params.pixel_format) / 8;
u32 params_size = total_pixels * Pica::Texture::Format::GetBpp(params.pixel_format) / 8;
// Attempt to find encompassing surfaces
CachedSurface* best_subrect_surface = nullptr;
@ -467,7 +402,7 @@ CachedSurface* RasterizerCacheOpenGL::GetSurfaceRect(const CachedSurface& params
// Return the best subrect surface if found
if (best_subrect_surface != nullptr) {
unsigned int bytes_per_pixel =
(CachedSurface::GetFormatBpp(best_subrect_surface->pixel_format) / 8);
(Pica::Texture::Format::GetBpp(best_subrect_surface->pixel_format) / 8);
int x0, y0;
@ -521,7 +456,7 @@ CachedSurface* RasterizerCacheOpenGL::GetTextureSurface(
params.width = info.width;
params.height = info.height;
params.is_tiled = true;
params.pixel_format = CachedSurface::PixelFormatFromTextureFormat(info.format);
params.pixel_format = Pica::Texture::Format::FromTextureFormat(info.format);
return GetSurface(params, false, true);
}
@ -574,10 +509,10 @@ RasterizerCacheOpenGL::GetFramebufferSurfaces(const Pica::Regs::FramebufferConfi
}
color_params.addr = config.GetColorBufferPhysicalAddress();
color_params.pixel_format = CachedSurface::PixelFormatFromColorFormat(config.color_format);
color_params.pixel_format = Pica::Texture::Format::FromColorFormat(config.color_format);
depth_params.addr = config.GetDepthBufferPhysicalAddress();
depth_params.pixel_format = CachedSurface::PixelFormatFromDepthFormat(config.depth_format);
depth_params.pixel_format = Pica::Texture::Format::FromDepthFormat(config.depth_format);
MathUtil::Rectangle<int> color_rect;
CachedSurface* color_surface =
@ -648,9 +583,9 @@ CachedSurface* RasterizerCacheOpenGL::TryGetFillSurface(const GPU::Regs::MemoryF
CachedSurface* surface = it2->get();
if (surface->addr == config.GetStartAddress() &&
CachedSurface::GetFormatBpp(surface->pixel_format) == bits_per_value &&
Pica::Texture::Format::GetBpp(surface->pixel_format) == bits_per_value &&
(surface->width * surface->height *
CachedSurface::GetFormatBpp(surface->pixel_format) / 8) ==
Pica::Texture::Format::GetBpp(surface->pixel_format) / 8) ==
(config.GetEndAddress() - config.GetStartAddress())) {
return surface;
}
@ -662,7 +597,6 @@ CachedSurface* RasterizerCacheOpenGL::TryGetFillSurface(const GPU::Regs::MemoryF
MICROPROFILE_DEFINE(OpenGL_SurfaceDownload, "OpenGL", "Surface Download", MP_RGB(128, 192, 64));
void RasterizerCacheOpenGL::FlushSurface(CachedSurface* surface) {
using PixelFormat = CachedSurface::PixelFormat;
using SurfaceType = CachedSurface::SurfaceType;
if (!surface->dirty) {
@ -703,53 +637,32 @@ void RasterizerCacheOpenGL::FlushSurface(CachedSurface* surface) {
if (!surface->is_tiled) {
// TODO: Ensure this will always be a color format, not a depth or other format
ASSERT((size_t)surface->pixel_format < fb_format_tuples.size());
const FormatTuple& tuple = fb_format_tuples[(unsigned int)surface->pixel_format];
// ASSERT((size_t)surface->pixel_format < fb_format_tuples.size());
const FormatTuple& tuple = format_tuples[(unsigned int)surface->pixel_format];
glPixelStorei(GL_PACK_ROW_LENGTH, (GLint)surface->pixel_stride);
glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, dst_buffer);
glPixelStorei(GL_PACK_ROW_LENGTH, 0);
} else {
SurfaceType type = CachedSurface::GetFormatType(surface->pixel_format);
if (type != SurfaceType::Depth && type != SurfaceType::DepthStencil) {
ASSERT((size_t)surface->pixel_format < fb_format_tuples.size());
const FormatTuple& tuple = fb_format_tuples[(unsigned int)surface->pixel_format];
const FormatTuple& tuple = format_tuples[(u32)surface->pixel_format];
u32 bytes_per_pixel = Pica::Texture::Format::GetBpp(surface->pixel_format) / 8;
if (!native_format[(u32)surface->pixel_format])
bytes_per_pixel = 4;
std::vector<u8> temp_gl_buffer(surface->width * surface->height * bytes_per_pixel);
glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, temp_gl_buffer.data());
u32 bytes_per_pixel = CachedSurface::GetFormatBpp(surface->pixel_format) / 8;
std::vector<u8> temp_gl_buffer(surface->width * surface->height * bytes_per_pixel);
glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, temp_gl_buffer.data());
// Directly copy pixels. Internal OpenGL color formats are consistent so no conversion
// is necessary.
MortonCopyPixels(surface->pixel_format, surface->width, surface->height,
bytes_per_pixel, bytes_per_pixel, dst_buffer, temp_gl_buffer.data(),
false);
} else {
// Depth/Stencil formats need special treatment since they aren't sampleable using
// LookupTexture and can't use RGBA format
size_t tuple_idx = (size_t)surface->pixel_format - 14;
ASSERT(tuple_idx < depth_format_tuples.size());
const FormatTuple& tuple = depth_format_tuples[tuple_idx];
u32 bytes_per_pixel = CachedSurface::GetFormatBpp(surface->pixel_format) / 8;
// OpenGL needs 4 bpp alignment for D24 since using GL_UNSIGNED_INT as type
bool use_4bpp = (surface->pixel_format == PixelFormat::D24);
u32 gl_bytes_per_pixel = use_4bpp ? 4 : bytes_per_pixel;
std::vector<u8> temp_gl_buffer(surface->width * surface->height * gl_bytes_per_pixel);
glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, temp_gl_buffer.data());
u8* temp_gl_buffer_ptr = use_4bpp ? temp_gl_buffer.data() + 1 : temp_gl_buffer.data();
MortonCopyPixels(surface->pixel_format, surface->width, surface->height,
bytes_per_pixel, gl_bytes_per_pixel, dst_buffer, temp_gl_buffer_ptr,
false);
}
std::unique_ptr<Pica::Texture::Codec> tmp = Pica::Texture::CodecFactory::build(
// clang-format off
surface->pixel_format, temp_gl_buffer.data(), surface->width, surface->height
// clang-format on
);
Pica::Texture::Codec* codec = tmp.get();
codec->configTiling(true, 8); // use 32 instead of 8 if the image is tiled
// in blocks of 32x32
codec->configRGBATransform(!native_format[(u32)surface->pixel_format]);
codec->configPreConvertedRGBA(!native_format[(u32)surface->pixel_format]);
codec->setExternalBuffer(dst_buffer);
codec->encode();
}
surface->dirty = false;

View File

@ -16,6 +16,7 @@
#include "core/hw/gpu.h"
#include "video_core/pica.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/texture/formats.h"
namespace MathUtil {
template <class T>
@ -27,33 +28,6 @@ struct CachedSurface;
using SurfaceCache = boost::icl::interval_map<PAddr, std::set<std::shared_ptr<CachedSurface>>>;
struct CachedSurface {
enum class PixelFormat {
// First 5 formats are shared between textures and color buffers
RGBA8 = 0,
RGB8 = 1,
RGB5A1 = 2,
RGB565 = 3,
RGBA4 = 4,
// Texture-only formats
IA8 = 5,
RG8 = 6,
I8 = 7,
A8 = 8,
IA4 = 9,
I4 = 10,
A4 = 11,
ETC1 = 12,
ETC1A4 = 13,
// Depth buffer-only formats
D16 = 14,
// gap
D24 = 16,
D24S8 = 17,
Invalid = 255,
};
enum class SurfaceType {
Color = 0,
@ -63,58 +37,8 @@ struct CachedSurface {
Invalid = 4,
};
/// Returns the size of one pixel of `format` in bits. The lookup table is indexed by the
/// numeric value of the format; index 15 is an unused gap and reports 0. Asserts that the
/// value lies inside the table.
static unsigned int GetFormatBpp(CachedSurface::PixelFormat format) {
    static const std::array<unsigned int, 18> bpp_table = {{
        /* RGBA8    */ 32,
        /* RGB8     */ 24,
        /* RGB5A1   */ 16,
        /* RGB565   */ 16,
        /* RGBA4    */ 16,
        /* IA8      */ 16,
        /* RG8      */ 16,
        /* I8       */ 8,
        /* A8       */ 8,
        /* IA4      */ 8,
        /* I4       */ 4,
        /* A4       */ 4,
        /* ETC1     */ 4,
        /* ETC1A4   */ 8,
        /* D16      */ 16,
        /* (unused) */ 0,
        /* D24      */ 24,
        /* D24S8    */ 32,
    }};

    const unsigned int index = static_cast<unsigned int>(format);
    ASSERT(index < bpp_table.size());
    return bpp_table[index];
}
/// Converts a texture format to a pixel format. Values 0-13 keep their numeric value;
/// anything else yields PixelFormat::Invalid.
static PixelFormat PixelFormatFromTextureFormat(Pica::Regs::TextureFormat format) {
    const unsigned int raw = static_cast<unsigned int>(format);
    if (raw >= 14) {
        return PixelFormat::Invalid;
    }
    return static_cast<PixelFormat>(raw);
}
/// Converts a color-buffer format to a pixel format. Values 0-4 keep their numeric value;
/// anything else yields PixelFormat::Invalid.
static PixelFormat PixelFormatFromColorFormat(Pica::Regs::ColorFormat format) {
    const unsigned int raw = static_cast<unsigned int>(format);
    if (raw >= 5) {
        return PixelFormat::Invalid;
    }
    return static_cast<PixelFormat>(raw);
}
/// Converts a depth-buffer format to a pixel format. Depth values 0-3 are shifted up by 14
/// so they land on pixel-format values 14-17; anything else yields PixelFormat::Invalid.
static PixelFormat PixelFormatFromDepthFormat(Pica::Regs::DepthFormat format) {
    const unsigned int raw = static_cast<unsigned int>(format);
    if (raw < 4) {
        return static_cast<PixelFormat>(raw + 14);
    }
    return PixelFormat::Invalid;
}
/// Converts a GPU register pixel format to a surface pixel format. RGB565 and RGB5A1 are
/// matched by name (the original note says their order is switched between the enums);
/// every other value below 5 keeps its numeric value, and anything else is Invalid.
static PixelFormat PixelFormatFromGPUPixelFormat(GPU::Regs::PixelFormat format) {
    if (format == GPU::Regs::PixelFormat::RGB565) {
        return PixelFormat::RGB565;
    }
    if (format == GPU::Regs::PixelFormat::RGB5A1) {
        return PixelFormat::RGB5A1;
    }

    const unsigned int raw = static_cast<unsigned int>(format);
    if (raw < 5) {
        return static_cast<PixelFormat>(raw);
    }
    return PixelFormat::Invalid;
}
static bool CheckFormatsBlittable(PixelFormat pixel_format_a, PixelFormat pixel_format_b) {
static bool CheckFormatsBlittable(Pica::Texture::Format::Type pixel_format_a,
Pica::Texture::Format::Type pixel_format_b) {
SurfaceType a_type = GetFormatType(pixel_format_a);
SurfaceType b_type = GetFormatType(pixel_format_b);
@ -134,7 +58,7 @@ struct CachedSurface {
return false;
}
static SurfaceType GetFormatType(PixelFormat pixel_format) {
static SurfaceType GetFormatType(Pica::Texture::Format::Type pixel_format) {
if ((unsigned int)pixel_format < 5) {
return SurfaceType::Color;
}
@ -143,11 +67,12 @@ struct CachedSurface {
return SurfaceType::Texture;
}
if (pixel_format == PixelFormat::D16 || pixel_format == PixelFormat::D24) {
if (pixel_format == Pica::Texture::Format::Type::D16 ||
pixel_format == Pica::Texture::Format::Type::D24) {
return SurfaceType::Depth;
}
if (pixel_format == PixelFormat::D24S8) {
if (pixel_format == Pica::Texture::Format::Type::D24S8) {
return SurfaceType::DepthStencil;
}
@ -177,7 +102,7 @@ struct CachedSurface {
float res_scale_height = 1.f;
bool is_tiled;
PixelFormat pixel_format;
Pica::Texture::Format::Type pixel_format;
bool dirty;
};

View File

@ -1,6 +1,10 @@
#include "codec.h"
#include "internal\codecs.h"
#include "internal\morton.h"
#include "common/color.h"
#include "common/math_util.h"
#include "common/swap.h"
#include "common/vector_math.h"
#include "video_core/texture/codec.h"
#include "video_core/texture/internal/codecs.h"
#include "video_core/texture/internal/morton.h"
namespace Pica {
namespace Texture {
@ -17,18 +21,6 @@ void Codec::encode() {
this->encode_morton_pass();
};
void Codec::setSize() {
this->start_nibbles_size = format_size;
};
inline void Codec::setWidth(u32 width) {
this->width = width;
}
inline void Codec::setHeight(u32 height) {
this->height = height;
}
void Codec::configTiling(bool active, u32 tiling) {
this->morton = true;
this->morton_pass_tiling = tiling;
@ -63,15 +55,16 @@ bool Codec::invalid() {
}
void Codec::init(bool decode) {
this->setSize();
this->expected_nibbles_size = this->start_nibbles_size;
if (decode) {
if (this->raw_RGBA)
this->expected_nibbles_size = 8;
} else {
this->start_nibbles_size = this->format_size;
if (this->raw_RGBA)
this->expected_nibbles_size = this->format_size;
if (this->preconverted)
this->start_nibbles_size = 8;
if (!this->raw_RGBA)
this->expected_nibbles_size = this->start_nibbles_size;
}
if (!this->external_result_buffer) {
size_t buff_size = this->width * this->height * this->expected_nibbles_size / 2;
@ -80,7 +73,7 @@ void Codec::init(bool decode) {
}
}
inline void Codec::decode_morton_pass() {
void Codec::decode_morton_pass() {
if (this->morton_pass_tiling == 8)
Decoders::Morton_8x8(this->target_buffer, this->passing_buffer, this->width, this->height,
this->start_nibbles_size * 4);
@ -89,7 +82,7 @@ inline void Codec::decode_morton_pass() {
this->start_nibbles_size * 4);
}
inline void Codec::encode_morton_pass() {
void Codec::encode_morton_pass() {
if (this->morton_pass_tiling == 8)
Encoders::Morton_8x8(this->target_buffer, this->passing_buffer, this->width, this->height,
this->start_nibbles_size * 4);
@ -98,41 +91,41 @@ inline void Codec::encode_morton_pass() {
this->start_nibbles_size * 4);
}
std::unique_ptr<Codec> CodecFactory::build(Format format, u8* target, u32 width, u32 height) {
std::unique_ptr<Codec> CodecFactory::build(Format::Type format, u8* target, u32 width, u32 height) {
switch (format) {
case Format::RGBA8:
case Format::Type::RGBA8:
return std::make_unique<RGBACodec>(target, width, height);
case Format::RGB8:
case Format::Type::RGB8:
return std::make_unique<RGBCodec>(target, width, height);
case Format::RGB5A1:
case Format::Type::RGB5A1:
return std::make_unique<RGB5A1Codec>(target, width, height);
case Format::RGB565:
case Format::Type::RGB565:
return std::make_unique<RGB565Codec>(target, width, height);
case Format::RGBA4:
case Format::Type::RGBA4:
return std::make_unique<RGBA4Codec>(target, width, height);
case Format::RG8:
case Format::Type::RG8:
return std::make_unique<RG8Codec>(target, width, height);
case Format::IA8:
case Format::Type::IA8:
return std::make_unique<IA8Codec>(target, width, height);
case Format::I8:
case Format::Type::I8:
return std::make_unique<I8Codec>(target, width, height);
case Format::A8:
case Format::Type::A8:
return std::make_unique<A8Codec>(target, width, height);
case Format::IA4:
case Format::Type::IA4:
return std::make_unique<IA4Codec>(target, width, height);
case Format::I4:
case Format::Type::I4:
return std::make_unique<I4Codec>(target, width, height);
case Format::A4:
case Format::Type::A4:
return std::make_unique<A4Codec>(target, width, height);
case Format::ETC1:
case Format::Type::ETC1:
return std::make_unique<ETC1Codec>(target, width, height);
case Format::ETC1A4:
case Format::Type::ETC1A4:
return std::make_unique<ETC1A4Codec>(target, width, height);
case Format::D16:
case Format::Type::D16:
return std::make_unique<D16Codec>(target, width, height);
case Format::D24:
case Format::Type::D24:
return std::make_unique<D24Codec>(target, width, height);
case Format::D24S8:
case Format::Type::D24S8:
return std::make_unique<D24S8Codec>(target, width, height);
default:
return nullptr;

View File

@ -1,9 +1,10 @@
#pragma once
#include <iostream>
#include <memory>
#include "common/common_types.h"
#include "formats.h"
#pragma once
#include "video_core/texture/formats.h"
namespace Pica {
@ -16,18 +17,23 @@ public:
this->target_buffer = target;
this->setWidth(width);
this->setHeight(height);
this->setSize();
this->expected_nibbles_size = this->start_nibbles_size;
}
virtual ~Codec() {}
virtual void decode();
virtual void encode();
void setSize();
inline void setWidth(u32 width) {
this->width = width;
}
void setWidth(u32 width);
void setHeight(u32 height);
inline void setHeight(u32 height) {
this->height = height;
}
inline u32 getInternalBytesPerPixel() {
return this->expected_nibbles_size / 2;
}
// Common Passes
void configTiling(bool active, u32 tiling);
@ -54,7 +60,10 @@ protected:
u32 start_nibbles_size;
u32 expected_nibbles_size;
const u32 format_size = 8;
virtual void setSize() {
this->start_nibbles_size = 8;
};
u8* target_buffer; // Initial read buffer
u8* passing_buffer; // pointer aliasing: Used and modified by passes
@ -65,12 +74,12 @@ protected:
typedef Codec super;
inline void decode_morton_pass();
inline void encode_morton_pass();
void decode_morton_pass();
void encode_morton_pass();
};
namespace CodecFactory {
std::unique_ptr<Codec> build(Pica::Texture::Format format, u8* target, u32 width, u32 height);
std::unique_ptr<Codec> build(Pica::Texture::Format::Type format, u8* target, u32 width, u32 height);
};
} // Texture

View File

@ -1,36 +1,96 @@
#pragma once
#include <array>
#include "common/assert.h"
#include "core/hw/gpu.h"
#include "video_core/pica.h"
namespace Pica {
namespace Texture {
enum class Format {
// First 5 formats are shared between textures and color buffers
RGBA8 = 0,
RGB8 = 1,
RGB5A1 = 2,
RGB565 = 3,
RGBA4 = 4,
struct Format {
// Texture-only formats
IA8 = 5,
RG8 = 6,
I8 = 7,
A8 = 8,
IA4 = 9,
I4 = 10,
A4 = 11,
ETC1 = 12,
ETC1A4 = 13,
enum class Type {
// First 5 formats are shared between textures and color buffers
RGBA8 = 0,
RGB8 = 1,
RGB5A1 = 2,
RGB565 = 3,
RGBA4 = 4,
// Depth buffer-only formats
D16 = 14,
// gap
D24 = 16,
D24S8 = 17,
// Texture-only formats
IA8 = 5,
RG8 = 6,
I8 = 7,
A8 = 8,
IA4 = 9,
I4 = 10,
A4 = 11,
ETC1 = 12,
ETC1A4 = 13,
Invalid = 255,
};
// Depth buffer-only formats
D16 = 14,
// gap
D24 = 16,
D24S8 = 17,
Invalid = 255,
};
/// Looks up how many bits one pixel occupies for the given format.
/// The table is indexed by the numeric value of Type, so it has one slot per
/// value from 0 (RGBA8) up to 17 (D24S8). Values outside the table (for
/// example Type::Invalid = 255) trip the assertion.
static u32 GetBpp(Type format) {
    static const std::array<unsigned int, 18> bpp_table = {
        32, // RGBA8
        24, // RGB8
        16, // RGB5A1
        16, // RGB565
        16, // RGBA4
        16, // IA8
        16, // RG8
        8,  // I8
        8,  // A8
        8,  // IA4
        4,  // I4
        4,  // A4
        4,  // ETC1
        8,  // ETC1A4
        16, // D16
        0,  // value 15 has no named format (gap between D16 and D24)
        24, // D24
        32, // D24S8
    };

    ASSERT((u32)format < ARRAY_SIZE(bpp_table));
    const u32 table_index = static_cast<u32>(format);
    return bpp_table[table_index];
}
/// Converts a texture format register value into a Type.
/// Raw values 0-13 are reinterpreted as the Type with the same numeric value;
/// every other value yields Type::Invalid.
static Type FromTextureFormat(Regs::TextureFormat format) {
    const auto raw_value = static_cast<unsigned int>(format);
    if (raw_value >= 14) {
        return Type::Invalid;
    }
    return static_cast<Type>(format);
}
/// Converts a color buffer format register value into a Type.
/// Raw values 0-4 are reinterpreted as the Type with the same numeric value;
/// every other value yields Type::Invalid.
static Type FromColorFormat(Regs::ColorFormat format) {
    const auto raw_value = static_cast<unsigned int>(format);
    if (raw_value >= 5) {
        return Type::Invalid;
    }
    return static_cast<Type>(format);
}
/// Converts a depth buffer format register value into a Type.
/// Raw values 0-3 are shifted up by 14 so that 0 lands on the numeric value
/// of D16; every other value yields Type::Invalid.
/// NOTE(review): a raw value of 1 maps to 15, which has no named enumerator
/// in Type (the "gap" between D16 and D24) - confirm callers never pass it.
static Type FromDepthFormat(Regs::DepthFormat format) {
    const auto raw_value = static_cast<unsigned int>(format);
    if (raw_value >= 4) {
        return Type::Invalid;
    }
    return static_cast<Type>(raw_value + 14);
}
/// Converts a GPU framebuffer pixel format into a Type.
/// RGB565 and RGB5A1 are mapped by name; any other value below 5 is
/// reinterpreted as the Type with the same numeric value, and everything else
/// yields Type::Invalid.
static Type FromGPUPixelFormat(GPU::Regs::PixelFormat format) {
    // RGB565 and RGB5A1 are switched in PixelFormat compared to ColorFormat,
    // so these two cannot be converted by numeric value.
    if (format == GPU::Regs::PixelFormat::RGB565) {
        return Type::RGB565;
    }
    if (format == GPU::Regs::PixelFormat::RGB5A1) {
        return Type::RGB5A1;
    }

    const auto raw_value = static_cast<unsigned int>(format);
    if (raw_value < 5) {
        return static_cast<Type>(format);
    }
    return Type::Invalid;
}
}; // Format
} // Texture

View File

@ -1,7 +1,19 @@
#include "codecs.h"
#include "etc1.h"
#include "morton.h"
#include "texture_utils.h"
#include "video_core/texture/internal/codecs.h"
#include "video_core/texture/internal/etc1.h"
#include "video_core/texture/internal/morton.h"
#include "video_core/texture/internal/texture_utils.h"
///////////////////////////////////////////////////////////////////////////////
// Optimizations
//////////////////////////////////////////////////////////////////////////////
#ifdef _MSC_VER
#pragma inline_recursion(on)
#elif defined(CLANG_OR_GCC)
#pragma GCC optimize("-fpeel-loops")
#pragma GCC optimize("-fpredictive-commoning")
#pragma GCC optimize("-ftree-loop-distribute-patterns")
#pragma GCC optimize("-ftree-vectorize")
#endif
// Decoders
#include "decoders.cpp"

View File

@ -1,10 +1,11 @@
#pragma once
#include <iostream>
#include <memory>
#include "common/common_types.h"
#include "video_core/texture/codec.h"
#pragma once
// each texture format codec
class RGBACodec : public Pica::Texture::Codec {
public:
@ -13,7 +14,9 @@ public:
void encode();
protected:
const u32 format_size = 8;
virtual void setSize() {
this->start_nibbles_size = 8;
};
};
class RGBCodec : public Pica::Texture::Codec {
@ -23,7 +26,9 @@ public:
void encode();
protected:
const u32 format_size = 6;
virtual void setSize() {
this->start_nibbles_size = 6;
};
};
class RGB5A1Codec : public Pica::Texture::Codec {
@ -33,7 +38,9 @@ public:
void encode();
protected:
const u32 format_size = 4;
virtual void setSize() {
this->start_nibbles_size = 4;
};
};
class RGBA4Codec : public Pica::Texture::Codec {
@ -43,7 +50,9 @@ public:
void encode();
protected:
const u32 format_size = 4;
virtual void setSize() {
this->start_nibbles_size = 4;
};
};
class RGB565Codec : public Pica::Texture::Codec {
@ -53,7 +62,9 @@ public:
void encode();
protected:
const u32 format_size = 4;
virtual void setSize() {
this->start_nibbles_size = 4;
};
};
class RG8Codec : public Pica::Texture::Codec {
@ -63,7 +74,9 @@ public:
void encode();
protected:
const u32 format_size = 4;
virtual void setSize() {
this->start_nibbles_size = 4;
};
};
class IA8Codec : public Pica::Texture::Codec {
@ -73,7 +86,9 @@ public:
void encode();
protected:
const u32 format_size = 4;
virtual void setSize() {
this->start_nibbles_size = 4;
};
};
class I8Codec : public Pica::Texture::Codec {
@ -83,7 +98,9 @@ public:
void encode();
protected:
const u32 format_size = 2;
virtual void setSize() {
this->start_nibbles_size = 2;
};
};
class A8Codec : public Pica::Texture::Codec {
@ -93,7 +110,9 @@ public:
void encode();
protected:
const u32 format_size = 2;
virtual void setSize() {
this->start_nibbles_size = 2;
};
};
class IA4Codec : public Pica::Texture::Codec {
@ -103,7 +122,9 @@ public:
void encode();
protected:
const u32 format_size = 2;
virtual void setSize() {
this->start_nibbles_size = 2;
};
};
class I4Codec : public Pica::Texture::Codec {
@ -113,7 +134,9 @@ public:
void encode();
protected:
const u32 format_size = 1;
virtual void setSize() {
this->start_nibbles_size = 1;
};
};
class A4Codec : public Pica::Texture::Codec {
@ -123,7 +146,9 @@ public:
void encode();
protected:
const u32 format_size = 1;
virtual void setSize() {
this->start_nibbles_size = 1;
};
};
class ETC1Codec : public Pica::Texture::Codec {
@ -133,7 +158,9 @@ public:
void encode();
protected:
const u32 format_size = 1;
virtual void setSize() {
this->start_nibbles_size = 1;
};
};
class ETC1A4Codec : public Pica::Texture::Codec {
@ -143,7 +170,9 @@ public:
void encode();
protected:
const u32 format_size = 2;
virtual void setSize() {
this->start_nibbles_size = 2;
};
};
class D16Codec : public Pica::Texture::Codec {
@ -153,7 +182,9 @@ public:
void encode();
protected:
const u32 format_size = 4;
virtual void setSize() {
this->start_nibbles_size = 4;
};
};
class D24Codec : public Pica::Texture::Codec {
@ -163,7 +194,9 @@ public:
void encode();
protected:
const u32 format_size = 6;
virtual void setSize() {
this->start_nibbles_size = 6;
};
};
class D24S8Codec : public Pica::Texture::Codec {
@ -173,5 +206,7 @@ public:
void encode();
protected:
const u32 format_size = 8;
virtual void setSize() {
this->start_nibbles_size = 8;
};
};

View File

@ -1,9 +1,10 @@
namespace {
template <const Math::Vec4<u8> decode_func(const u8*)>
inline void rgba_pass(u8* read, u8* write) {
u32 pixel = decode_func(read).ToRGBA();
auto pixel = decode_func(read).ToRGBA();
std::memcpy(write, &pixel, 4);
}
@ -72,34 +73,36 @@ void RG8Codec::decode() {
namespace {
inline u16 convert_nibbles(u8 nibbles) {
return ((u16)Color::Convert4To8((nibbles & 0xF0) >> 4) << 8) |
(u16)Color::Convert4To8((nibbles & 0x0F));
u16 split = (nibbles & 0xF0) << 4 | (nibbles & 0x0F);
split |= (split << 4);
return split;
}
inline u32 build_luminance(u8 intensity, u8 alpha) {
inline u32 build_luminance(u32 intensity, u32 alpha) {
return (alpha << 24) | (intensity << 16) | (intensity << 8) | intensity;
}
inline void intensity_alpha_pass(u8* read, u8* write) {
alignas(4) u8 pixel[2];
std::memcpy(pixel, read, 2);
u32 result = build_luminance(pixel[0], pixel[1]);
u32 result = build_luminance(pixel[1], pixel[0]);
std::memcpy(write, &result, 4);
}
inline void intensity_alpha_nibbles_pass(u8* read, u8* write) {
alignas(4) u8 pixel[2];
std::memcpy(pixel, read, 1);
u16 tmp = convert_nibbles(pixel[0]);
std::memcpy(pixel, &tmp, 2);
u32 result = build_luminance(pixel[0], pixel[1]);
alignas(4) u8 pixel;
std::memcpy(&pixel, read, 1);
u16 tmp = convert_nibbles(pixel);
u8 tmp2[2];
std::memcpy(tmp2, &tmp, 2);
u32 result = build_luminance(tmp2[1], tmp2[0]);
std::memcpy(write, &result, 4);
}
inline void intensity_pass(u8* read, u8* write) {
alignas(4) u8 pixel[1];
std::memcpy(pixel, read, 1);
u32 result = build_luminance(pixel[0], 255);
u8 pixel;
std::memcpy(&pixel, read, 1);
u32 result = build_luminance(pixel, 255);
std::memcpy(write, &result, 4);
}
@ -108,9 +111,9 @@ inline void intensity_nibbles_pass(u8* read, u8* write) {
std::memcpy(pixel, read, 1);
u16 tmp = convert_nibbles(pixel[0]);
std::memcpy(pixel, &tmp, 2);
u32 result = build_luminance(pixel[0], 255);
u32 result = build_luminance(pixel[1], 255);
std::memcpy(write, &result, 4);
result = build_luminance(pixel[1], 255);
result = build_luminance(pixel[0], 255);
std::memcpy(write + 4, &result, 4);
}

View File

@ -9,14 +9,24 @@
#include "common/math_util.h"
#include "common/swap.h"
#include "common/vector_math.h"
#include "etc1.h"
#include "texture_utils.h"
#include "video_core/texture/internal/etc1.h"
#include "video_core/texture/internal/texture_utils.h"
constexpr std::array<u8[2], 8> etc1_modifier_table = {{
namespace {
#ifdef _DEBUG
#define CONST_FIX static
#else
#define CONST_FIX constexpr
#endif
CONST_FIX std::array<u8[2], 8> etc1_modifier_table = {{
{2, 8}, {5, 17}, {9, 29}, {13, 42}, {18, 60}, {24, 80}, {33, 106}, {47, 183},
}};
namespace {
// Packs four channel values into a single 32-bit word: r occupies the lowest
// byte, followed by g, b and finally a in the highest byte. The inputs are not
// masked, so a channel value above 255 spills into the next channel's bits.
constexpr u32 buildRGBA(u32 r, u32 g, u32 b, u32 a) {
    return r | (g << 8) | (b << 16) | (a << 24);
}
union ETC1Tile {
u64 raw;
@ -62,7 +72,7 @@ union ETC1Tile {
BitField<60, 4, u64> r1;
} separate;
const Math::Vec3<u8> GetRGB(u32 x, u32 y) const {
const u32 GetRGB(u32 x, u32 y) const {
int texel = 4 * x + y;
if (flip)
@ -106,7 +116,7 @@ union ETC1Tile {
ret.g() = MathUtil::Clamp(ret.g() + modifier, 0, 255);
ret.b() = MathUtil::Clamp(ret.b() + modifier, 0, 255);
return ret.Cast<u8>();
return buildRGBA(ret.r(), ret.g(), ret.b(), 0);
}
};
@ -121,7 +131,8 @@ inline void etc1_pass(u8* etc1_buffer, u8* linear_buffer, u32 x_blocks) {
std::memcpy(&tile.raw, &etc1_buffer[i * 8], 8);
for (u32 k = 0; k < 4; k++) {
for (u32 j = 0; j < 4; j++) {
u32 rgba = (tile.GetRGB(j, k).ToRGB()) | 0xFF000000;
auto rgb = tile.GetRGB(j, k);
u32 rgba = rgb | 0xFF000000;
std::memcpy(&tmp[k * line + j * 4 + index], &rgba, 4);
}
}
@ -142,7 +153,8 @@ inline void etc1a4_pass(u8* etc1_buffer, u8* linear_buffer, u32 x_blocks) {
for (u32 j = 0; j < 4; j++) {
u32 alpha = (alpha_tile >> (4 * (j * 4 + k))) & 0x0F;
alpha |= (alpha << 4);
u32 rgba = tile.GetRGB(j, k).ToRGB() | (alpha << 24);
auto rgb = tile.GetRGB(j, k);
u32 rgba = rgb | (alpha << 24);
std::memcpy(&tmp[k * line + j * 4 + index], &rgba, 4);
}
}

View File

@ -1,7 +1,6 @@
#pragma once
#include "common/common_types.h"
#pragma once
void ETC1(u8* etc1_buffer, u8* matrix_buffer, u32 width, u32 height);
void ETC1A4(u8* etc1_buffer, u8* matrix_buffer, u32 width, u32 height);

View File

@ -2,8 +2,8 @@
#include <memory>
#include <utility>
#include "common/common_types.h"
#include "morton.h"
#include "texture_utils.h"
#include "video_core/texture/internal/morton.h"
#include "video_core/texture/internal/texture_utils.h"
///////////////////////////////////////////////////////////////////////////////
// Optimizations
@ -15,6 +15,8 @@
// favor fast code over small code.
#pragma optimize("t", on)
#pragma intrinsic(memcpy)
#define __hot
#define __no_inline __declspec(noinline)
#elif defined(CLANG_OR_GCC)
// The next 3 will swizzle memory copying to help find the best sse/avx shuffling
// in case it's possible. Compilation tests have proven effective use of these
@ -22,12 +24,20 @@
#pragma GCC optimize("-fpredictive-commoning")
#pragma GCC optimize("-ftree-loop-distribute-patterns")
#pragma GCC optimize("-ftree-vectorize")
// limit inlining
#pragma GCC option("--param max-inline-insns-single=128")
#pragma GCC option("--param inline-unit-growth=400")
#pragma GCC option("--param large-function-growth=800")
// The beauty of these compiler options is that they generate better code than
// hand written intrinsics, since inline expanding memory transfers can be pattern
// matched with vector instructions available in the target.
#define __no_inline __attribute__((noinline))
#define __hot __attribute__((hot))
// Provide an equivalent of MSVC's __forceinline for GCC/Clang, unless an
// earlier header already defined one. The attribute keyword must be spelled
// __attribute__ (with the leading underscores), and `inline` is included so
// the macro carries the same meaning as the MSVC keyword.
#if !defined(__forceinline)
#define __forceinline inline __attribute__((always_inline))
#endif
#else
#define __hot
#define __no_inline
#define __forceinline
#endif
#pragma region Z_Order
@ -54,11 +64,11 @@ constexpr u32 isBottom(u32 block_index) {
}
template <void codec(u8*, u8*, size_t), size_t nibbles, u32 blocks, size_t block_size>
inline void swizzle_block(u8*& morton_block, u8* linear_block);
__forceinline static void swizzle_block(u8*& morton_block, u8* linear_block);
template <void codec(u8*, u8*, size_t), size_t nibbles, u32 block_index, u32 blocks,
size_t block_size>
inline void swizzle_block_aux(u8*& morton_block, u8* linear_block) {
__forceinline static void swizzle_block_aux(u8*& morton_block, u8* linear_block) {
// move the linear_block pointer to the appropriate block
const size_t right = isRight(block_index) * (blocks * nibbles) / 2;
const size_t down = isBottom(block_index) * block_size;
@ -67,7 +77,7 @@ inline void swizzle_block_aux(u8*& morton_block, u8* linear_block) {
}
template <void codec(u8*, u8*, size_t), size_t nibbles, u32 blocks, size_t block_size>
inline void swizzle_block(u8*& morton_block, u8* linear_block) {
__forceinline static void swizzle_block(u8*& morton_block, u8* linear_block) {
const size_t new_block_size = block_size / 2;
if (blocks <= 2) {
// We handle 2*2 blocks on z-order
@ -94,14 +104,14 @@ inline void swizzle_block(u8*& morton_block, u8* linear_block) {
}
template <void codec(u8*, u8*, size_t), size_t nibbles, size_t lines_per_block>
void swizzle_pass(u8* morton_block, u8* linear_block) {
__forceinline static void swizzle_pass(u8* morton_block, u8* linear_block) {
const size_t block_size = (lines_per_block * lines_per_block * nibbles) / 2;
swizzle_block<codec, nibbles, lines_per_block, block_size>(morton_block, linear_block);
}
#pragma endregion Z_Order
template <size_t nibbles, size_t lines_per_block>
void encode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) {
__hot inline static void encode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) {
const u32 tile_size = (lines_per_block * lines_per_block * nibbles) / 2;
alignas(64) u8 tmp[tile_size];
tiling_pass<&encode, nibbles, lines_per_block>(linear_buffer, tmp, x_blocks);
@ -109,7 +119,7 @@ void encode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) {
}
template <size_t nibbles, size_t lines_per_block>
void decode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) {
__hot inline static void decode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) {
const u32 tile_size = (lines_per_block * lines_per_block * nibbles) / 2;
alignas(64) u8 tmp[tile_size];
swizzle_pass<&decode, nibbles, lines_per_block>(morton_buffer, tmp);
@ -117,7 +127,7 @@ void decode_pass(u8* morton_buffer, u8* linear_buffer, u32 x_blocks) {
}
template <void codec(u8*, u8*, u32), size_t nibbles, size_t lines_per_block>
void morton_pass(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height) {
__hot static void morton_pass(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height) {
const u32 x_blocks = (width / lines_per_block);
const u32 y_blocks = (height / lines_per_block);
const size_t line_size = (lines_per_block * nibbles) / 2;
@ -135,9 +145,22 @@ void morton_pass(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height) {
}
}
// Single out-of-line entry point for the 8x8-tile pass with 8 nibbles per
// pixel, shared by the decoder and the encoder so the hot code stays together.
// `decode` selects the direction: true runs decode_pass, false runs encode_pass.
__no_inline __hot static void morton_8x8_32(u8* morton_buffer, u8* matrix_buffer, u32 width,
                                            u32 height, bool decode) {
    if (!decode) {
        morton_pass<&encode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height);
        return;
    }
    morton_pass<&decode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height);
}
namespace Decoders {
bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) {
if (bpp == 32) {
morton_8x8_32(morton_buffer, matrix_buffer, width, height, true);
return true;
}
switch (bpp) {
case 4: {
morton_pass<&decode_pass<1, 8>, 1, 8>(morton_buffer, matrix_buffer, width, height);
@ -159,11 +182,6 @@ bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32
return true;
break;
}
case 32: {
morton_pass<&decode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
default: {
return false;
break;
@ -209,6 +227,10 @@ bool Morton_32x32(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u
namespace Encoders {
bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32 bpp) {
if (bpp == 32) {
morton_8x8_32(morton_buffer, matrix_buffer, width, height, false);
return true;
}
switch (bpp) {
case 4: {
morton_pass<&encode_pass<1, 8>, 1, 8>(morton_buffer, matrix_buffer, width, height);
@ -230,11 +252,6 @@ bool Morton_8x8(u8* morton_buffer, u8* matrix_buffer, u32 width, u32 height, u32
return true;
break;
}
case 32: {
morton_pass<&encode_pass<8, 8>, 8, 8>(morton_buffer, matrix_buffer, width, height);
return true;
break;
}
default: {
return false;
break;

View File

@ -1,7 +1,7 @@
#include "common/common_types.h"
#pragma once
#include "common/common_types.h"
enum class MortonPass { Tile8x8, Tile32x32 };
namespace Decoders {

View File

@ -1,3 +1,5 @@
#pragma once
#include <array>
#include <cstring>
#include <memory>
@ -5,12 +7,9 @@
#include "common/color.h"
#include "common/swap.h"
#pragma once
#if ((defined(__clang__) || defined(__GNUC__)) && !defined(__INTEL_COMPILER))
#define CLANG_OR_GCC
#endif
///////////////////////////////////////////////////////////////////////////////
// Optimizations
//////////////////////////////////////////////////////////////////////////////
@ -23,16 +22,6 @@
#pragma GCC optimize("-ftree-vectorize")
#endif
// Copies read_size bytes from morton_pointer to matrix_pointer.
// @param read_size number of bytes to copy
inline void decode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
    const u8* const source = morton_pointer;
    u8* const destination = matrix_pointer;
    std::memcpy(destination, source, read_size);
}
// Copies read_size bytes from matrix_pointer to morton_pointer.
// @param read_size number of bytes to copy
inline void encode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
    const u8* const source = matrix_pointer;
    u8* const destination = morton_pointer;
    std::memcpy(destination, source, read_size);
}
// Pre: width % 8 == 0 && height % 8 == 0
template <void pass(u8*, u8*), u32 read_size, u32 write_size, u32 tuning = 2>
inline void image_pass_aux_rev(u8* target, u32 width, u32 height) {
@ -80,9 +69,9 @@ inline void image_pass_aux(u8* target, u32 width, u32 height) {
template <void pass(u8*, u8*), u32 read_size, u32 write_size, u32 tuning = 2>
inline void image_pass(u8* target, u32 width, u32 height) {
if (read_size > write_size)
image_pass_aux<pass, read_size, write_size, tuning>;
image_pass_aux<pass, read_size, write_size, tuning>(target, width, height);
else
image_pass_aux_rev<pass, read_size, write_size, tuning>;
image_pass_aux_rev<pass, read_size, write_size, tuning>(target, width, height);
}
template <void codec(u8*, u8*, size_t), size_t nibbles, size_t lines_per_block>
@ -96,3 +85,13 @@ void tiling_pass(u8* linear, u8* tiled, u32 x_blocks) {
codec(tiled + tiled_index, linear + linear_index, tiled_line_size);
}
}
// Copies read_size bytes from morton_pointer to matrix_pointer.
// @param read_size number of bytes to copy
inline void decode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
    const u8* const source = morton_pointer;
    u8* const destination = matrix_pointer;
    std::memcpy(destination, source, read_size);
}
// Copies read_size bytes from matrix_pointer to morton_pointer.
// @param read_size number of bytes to copy
inline void encode(u8* morton_pointer, u8* matrix_pointer, size_t read_size) {
    const u8* const source = matrix_pointer;
    u8* const destination = morton_pointer;
    std::memcpy(destination, source, read_size);
}