astc_decoder: Compute offset swizzles in-shader
Removes the dependency on the swizzle table buffer and on the bytes_per_block_log2 uniform, which is constant (4, i.e. 16-byte blocks) for all ASTC texture sizes.
This commit is contained in:
		| @@ -10,8 +10,7 @@ | ||||
| #define END_PUSH_CONSTANTS }; | ||||
| #define UNIFORM(n) | ||||
| #define BINDING_INPUT_BUFFER 0 | ||||
| #define BINDING_SWIZZLE_BUFFER 1 | ||||
| #define BINDING_OUTPUT_IMAGE 2 | ||||
| #define BINDING_OUTPUT_IMAGE 1 | ||||
|  | ||||
| #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv | ||||
|  | ||||
| @@ -19,7 +18,6 @@ | ||||
| #define END_PUSH_CONSTANTS | ||||
| #define UNIFORM(n) layout(location = n) uniform | ||||
| #define BINDING_INPUT_BUFFER 0 | ||||
| #define BINDING_SWIZZLE_BUFFER 1 | ||||
| #define BINDING_OUTPUT_IMAGE 0 | ||||
|  | ||||
| #endif | ||||
| @@ -28,13 +26,11 @@ layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in; | ||||
|  | ||||
| BEGIN_PUSH_CONSTANTS | ||||
| UNIFORM(1) uvec2 block_dims; | ||||
|  | ||||
| UNIFORM(2) uint bytes_per_block_log2; | ||||
| UNIFORM(3) uint layer_stride; | ||||
| UNIFORM(4) uint block_size; | ||||
| UNIFORM(5) uint x_shift; | ||||
| UNIFORM(6) uint block_height; | ||||
| UNIFORM(7) uint block_height_mask; | ||||
| UNIFORM(2) uint layer_stride; | ||||
| UNIFORM(3) uint block_size; | ||||
| UNIFORM(4) uint x_shift; | ||||
| UNIFORM(5) uint block_height; | ||||
| UNIFORM(6) uint block_height_mask; | ||||
| END_PUSH_CONSTANTS | ||||
|  | ||||
| struct EncodingData { | ||||
| @@ -53,35 +49,17 @@ struct TexelWeightParams { | ||||
|     bool void_extent_hdr; | ||||
| }; | ||||
|  | ||||
| // Swizzle data | ||||
| layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable { | ||||
|     uint swizzle_table[]; | ||||
| }; | ||||
|  | ||||
| layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU32 { | ||||
|     uvec4 astc_data[]; | ||||
| }; | ||||
|  | ||||
| layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly image2DArray dest_image; | ||||
|  | ||||
| const uint GOB_SIZE_X = 64; | ||||
| const uint GOB_SIZE_Y = 8; | ||||
| const uint GOB_SIZE_Z = 1; | ||||
| const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z; | ||||
|  | ||||
| const uint GOB_SIZE_X_SHIFT = 6; | ||||
| const uint GOB_SIZE_Y_SHIFT = 3; | ||||
| const uint GOB_SIZE_Z_SHIFT = 0; | ||||
| const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT; | ||||
| const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT; | ||||
|  | ||||
| const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1); | ||||
|  | ||||
| const int BLOCK_SIZE_IN_BYTES = 16; | ||||
|  | ||||
| const int BLOCK_INFO_ERROR = 0; | ||||
| const int BLOCK_INFO_VOID_EXTENT_HDR = 1; | ||||
| const int BLOCK_INFO_VOID_EXTENT_LDR = 2; | ||||
| const int BLOCK_INFO_NORMAL = 3; | ||||
| const uint BYTES_PER_BLOCK_LOG2 = 4; | ||||
|  | ||||
| const int JUST_BITS = 0; | ||||
| const int QUINT = 1; | ||||
| @@ -168,8 +146,10 @@ int texel_vector_index = 0; | ||||
| uint unquantized_texel_weights[2][144]; | ||||
|  | ||||
| uint SwizzleOffset(uvec2 pos) { | ||||
|     pos = pos & SWIZZLE_MASK; | ||||
|     return swizzle_table[pos.y * 64 + pos.x]; | ||||
|     uint x = pos.x; | ||||
|     uint y = pos.y; | ||||
|     return ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 + ((x % 32) / 16) * 32 + | ||||
|                           (y % 2) * 16 + (x % 16); | ||||
| } | ||||
|  | ||||
| // Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)] | ||||
| @@ -1253,7 +1233,7 @@ void DecompressBlock(ivec3 coord) { | ||||
|  | ||||
| void main() { | ||||
|     uvec3 pos = gl_GlobalInvocationID; | ||||
|     pos.x <<= bytes_per_block_log2; | ||||
|     pos.x <<= BYTES_PER_BLOCK_LOG2; | ||||
|  | ||||
|     // Read as soon as possible due to its latency | ||||
|     const uint swizzle = SwizzleOffset(pos.xy); | ||||
|   | ||||
| @@ -68,7 +68,6 @@ UtilShaders::~UtilShaders() = default; | ||||
| void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, | ||||
|                              std::span<const VideoCommon::SwizzleParameters> swizzles) { | ||||
|     static constexpr GLuint BINDING_INPUT_BUFFER = 0; | ||||
|     static constexpr GLuint BINDING_SWIZZLE_BUFFER = 1; | ||||
|     static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; | ||||
|  | ||||
|     const Extent2D tile_size{ | ||||
| @@ -76,10 +75,9 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, | ||||
|         .height = VideoCore::Surface::DefaultBlockHeight(image.info.format), | ||||
|     }; | ||||
|     program_manager.BindComputeProgram(astc_decoder_program.handle); | ||||
|     glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle); | ||||
|  | ||||
|     glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes); | ||||
|     glUniform2ui(1, tile_size.width, tile_size.height); | ||||
|  | ||||
|     // Ensure buffer data is valid before dispatching | ||||
|     glFlush(); | ||||
|     for (const SwizzleParameters& swizzle : swizzles) { | ||||
| @@ -90,13 +88,13 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, | ||||
|         const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info); | ||||
|         ASSERT(params.origin == (std::array<u32, 3>{0, 0, 0})); | ||||
|         ASSERT(params.destination == (std::array<s32, 3>{0, 0, 0})); | ||||
|         ASSERT(params.bytes_per_block_log2 == 4); | ||||
|  | ||||
|         glUniform1ui(2, params.bytes_per_block_log2); | ||||
|         glUniform1ui(3, params.layer_stride); | ||||
|         glUniform1ui(4, params.block_size); | ||||
|         glUniform1ui(5, params.x_shift); | ||||
|         glUniform1ui(6, params.block_height); | ||||
|         glUniform1ui(7, params.block_height_mask); | ||||
|         glUniform1ui(2, params.layer_stride); | ||||
|         glUniform1ui(3, params.block_size); | ||||
|         glUniform1ui(4, params.x_shift); | ||||
|         glUniform1ui(5, params.block_height); | ||||
|         glUniform1ui(6, params.block_height_mask); | ||||
|  | ||||
|         // ASTC texture data | ||||
|         glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset, | ||||
|   | ||||
| @@ -34,9 +34,8 @@ using Tegra::Texture::SWIZZLE_TABLE; | ||||
| namespace { | ||||
|  | ||||
| constexpr u32 ASTC_BINDING_INPUT_BUFFER = 0; | ||||
| constexpr u32 ASTC_BINDING_SWIZZLE_BUFFER = 1; | ||||
| constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 2; | ||||
| constexpr size_t ASTC_NUM_BINDINGS = 3; | ||||
| constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 1; | ||||
| constexpr size_t ASTC_NUM_BINDINGS = 2; | ||||
|  | ||||
| template <size_t size> | ||||
| inline constexpr VkPushConstantRange COMPUTE_PUSH_CONSTANT_RANGE{ | ||||
| @@ -80,13 +79,6 @@ constexpr std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> ASTC_DESCR | ||||
|         .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, | ||||
|         .pImmutableSamplers = nullptr, | ||||
|     }, | ||||
|     { | ||||
|         .binding = ASTC_BINDING_SWIZZLE_BUFFER, | ||||
|         .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | ||||
|         .descriptorCount = 1, | ||||
|         .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, | ||||
|         .pImmutableSamplers = nullptr, | ||||
|     }, | ||||
|     { | ||||
|         .binding = ASTC_BINDING_OUTPUT_IMAGE, | ||||
|         .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, | ||||
| @@ -98,12 +90,12 @@ constexpr std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> ASTC_DESCR | ||||
|  | ||||
| constexpr DescriptorBankInfo ASTC_BANK_INFO{ | ||||
|     .uniform_buffers = 0, | ||||
|     .storage_buffers = 2, | ||||
|     .storage_buffers = 1, | ||||
|     .texture_buffers = 0, | ||||
|     .image_buffers = 0, | ||||
|     .textures = 0, | ||||
|     .images = 1, | ||||
|     .score = 3, | ||||
|     .score = 2, | ||||
| }; | ||||
|  | ||||
| constexpr VkDescriptorUpdateTemplateEntryKHR INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLATE{ | ||||
| @@ -125,14 +117,6 @@ constexpr std::array<VkDescriptorUpdateTemplateEntryKHR, ASTC_NUM_BINDINGS> | ||||
|             .offset = ASTC_BINDING_INPUT_BUFFER * sizeof(DescriptorUpdateEntry), | ||||
|             .stride = sizeof(DescriptorUpdateEntry), | ||||
|         }, | ||||
|         { | ||||
|             .dstBinding = ASTC_BINDING_SWIZZLE_BUFFER, | ||||
|             .dstArrayElement = 0, | ||||
|             .descriptorCount = 1, | ||||
|             .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | ||||
|             .offset = ASTC_BINDING_SWIZZLE_BUFFER * sizeof(DescriptorUpdateEntry), | ||||
|             .stride = sizeof(DescriptorUpdateEntry), | ||||
|         }, | ||||
|         { | ||||
|             .dstBinding = ASTC_BINDING_OUTPUT_IMAGE, | ||||
|             .dstArrayElement = 0, | ||||
| @@ -145,7 +129,6 @@ constexpr std::array<VkDescriptorUpdateTemplateEntryKHR, ASTC_NUM_BINDINGS> | ||||
|  | ||||
| struct AstcPushConstants { | ||||
|     std::array<u32, 2> blocks_dims; | ||||
|     u32 bytes_per_block_log2; | ||||
|     u32 layer_stride; | ||||
|     u32 block_size; | ||||
|     u32 x_shift; | ||||
| @@ -336,42 +319,6 @@ ASTCDecoderPass::ASTCDecoderPass(const Device& device_, VKScheduler& scheduler_, | ||||
|  | ||||
| ASTCDecoderPass::~ASTCDecoderPass() = default; | ||||
|  | ||||
| void ASTCDecoderPass::MakeDataBuffer() { | ||||
|     constexpr size_t TOTAL_BUFFER_SIZE = sizeof(SWIZZLE_TABLE); | ||||
|     data_buffer = device.GetLogical().CreateBuffer(VkBufferCreateInfo{ | ||||
|         .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, | ||||
|         .pNext = nullptr, | ||||
|         .flags = 0, | ||||
|         .size = TOTAL_BUFFER_SIZE, | ||||
|         .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, | ||||
|         .sharingMode = VK_SHARING_MODE_EXCLUSIVE, | ||||
|         .queueFamilyIndexCount = 0, | ||||
|         .pQueueFamilyIndices = nullptr, | ||||
|     }); | ||||
|     data_buffer_commit = memory_allocator.Commit(data_buffer, MemoryUsage::Upload); | ||||
|  | ||||
|     const auto staging_ref = staging_buffer_pool.Request(TOTAL_BUFFER_SIZE, MemoryUsage::Upload); | ||||
|     std::memcpy(staging_ref.mapped_span.data(), &SWIZZLE_TABLE, sizeof(SWIZZLE_TABLE)); | ||||
|  | ||||
|     scheduler.Record([src = staging_ref.buffer, offset = staging_ref.offset, dst = *data_buffer, | ||||
|                       TOTAL_BUFFER_SIZE](vk::CommandBuffer cmdbuf) { | ||||
|         static constexpr VkMemoryBarrier write_barrier{ | ||||
|             .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, | ||||
|             .pNext = nullptr, | ||||
|             .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, | ||||
|             .dstAccessMask = VK_ACCESS_SHADER_READ_BIT, | ||||
|         }; | ||||
|         const VkBufferCopy copy{ | ||||
|             .srcOffset = offset, | ||||
|             .dstOffset = 0, | ||||
|             .size = TOTAL_BUFFER_SIZE, | ||||
|         }; | ||||
|         cmdbuf.CopyBuffer(src, dst, copy); | ||||
|         cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, | ||||
|                                0, write_barrier); | ||||
|     }); | ||||
| } | ||||
|  | ||||
| void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, | ||||
|                                std::span<const VideoCommon::SwizzleParameters> swizzles) { | ||||
|     using namespace VideoCommon::Accelerated; | ||||
| @@ -380,9 +327,6 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, | ||||
|         VideoCore::Surface::DefaultBlockHeight(image.info.format), | ||||
|     }; | ||||
|     scheduler.RequestOutsideRenderPassOperationContext(); | ||||
|     if (!data_buffer) { | ||||
|         MakeDataBuffer(); | ||||
|     } | ||||
|     const VkPipeline vk_pipeline = *pipeline; | ||||
|     const VkImageAspectFlags aspect_mask = image.AspectMask(); | ||||
|     const VkImage vk_image = image.Handle(); | ||||
| @@ -421,7 +365,6 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, | ||||
|         update_descriptor_queue.Acquire(); | ||||
|         update_descriptor_queue.AddBuffer(map.buffer, input_offset, | ||||
|                                           image.guest_size_bytes - swizzle.buffer_offset); | ||||
|         update_descriptor_queue.AddBuffer(*data_buffer, 0, sizeof(SWIZZLE_TABLE)); | ||||
|         update_descriptor_queue.AddImage(image.StorageImageView(swizzle.level)); | ||||
|         const void* const descriptor_data{update_descriptor_queue.UpdateData()}; | ||||
|  | ||||
| @@ -429,11 +372,11 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, | ||||
|         const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info); | ||||
|         ASSERT(params.origin == (std::array<u32, 3>{0, 0, 0})); | ||||
|         ASSERT(params.destination == (std::array<s32, 3>{0, 0, 0})); | ||||
|         ASSERT(params.bytes_per_block_log2 == 4); | ||||
|         scheduler.Record([this, num_dispatches_x, num_dispatches_y, num_dispatches_z, block_dims, | ||||
|                           params, descriptor_data](vk::CommandBuffer cmdbuf) { | ||||
|             const AstcPushConstants uniforms{ | ||||
|                 .blocks_dims = block_dims, | ||||
|                 .bytes_per_block_log2 = params.bytes_per_block_log2, | ||||
|                 .layer_stride = params.layer_stride, | ||||
|                 .block_size = params.block_size, | ||||
|                 .x_shift = params.x_shift, | ||||
|   | ||||
| @@ -96,15 +96,10 @@ public: | ||||
|                   std::span<const VideoCommon::SwizzleParameters> swizzles); | ||||
|  | ||||
| private: | ||||
|     void MakeDataBuffer(); | ||||
|  | ||||
|     VKScheduler& scheduler; | ||||
|     StagingBufferPool& staging_buffer_pool; | ||||
|     VKUpdateDescriptorQueue& update_descriptor_queue; | ||||
|     MemoryAllocator& memory_allocator; | ||||
|  | ||||
|     vk::Buffer data_buffer; | ||||
|     MemoryCommit data_buffer_commit; | ||||
| }; | ||||
|  | ||||
| } // namespace Vulkan | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 ameerj
					ameerj