citra: Migrate to tracy profiler from Microprofile

GPUCode
2024-02-05 00:16:59 +02:00
parent 8e2415f455
commit f6efa049dd
123 changed files with 44521 additions and 11365 deletions


@@ -0,0 +1,43 @@
#include "../common/TracyAlloc.hpp"
#ifdef TRACY_USE_RPMALLOC
#include <atomic>
#include "../common/TracyForceInline.hpp"
#include "../common/TracyYield.hpp"
namespace tracy
{
extern thread_local bool RpThreadInitDone;
extern std::atomic<int> RpInitDone;
extern std::atomic<int> RpInitLock;
tracy_no_inline static void InitRpmallocPlumbing()
{
const auto done = RpInitDone.load( std::memory_order_acquire );
if( !done )
{
int expected = 0;
while( !RpInitLock.compare_exchange_weak( expected, 1, std::memory_order_release, std::memory_order_relaxed ) ) { expected = 0; YieldThread(); }
const auto done = RpInitDone.load( std::memory_order_acquire );
if( !done )
{
rpmalloc_initialize();
RpInitDone.store( 1, std::memory_order_release );
}
RpInitLock.store( 0, std::memory_order_release );
}
rpmalloc_thread_initialize();
RpThreadInitDone = true;
}
TRACY_API void InitRpmalloc()
{
if( !RpThreadInitDone ) InitRpmallocPlumbing();
}
}
#endif
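
The plumbing above is double-checked initialization guarded by an atomic spin lock: the acquire load of RpInitDone keeps the fast path lock-free, and the re-check while holding RpInitLock guarantees rpmalloc_initialize() runs exactly once. A minimal standalone sketch of the same pattern (hypothetical names, no rpmalloc dependency):

#include <atomic>

static std::atomic<int> initDone { 0 };
static std::atomic<int> initLock { 0 };

void EnsureInitOnce( void (*doInit)() )
{
    // Fast path: pairs with the release store performed after doInit().
    if( initDone.load( std::memory_order_acquire ) ) return;
    // Spin until we own the lock.
    int expected = 0;
    while( !initLock.compare_exchange_weak( expected, 1, std::memory_order_acquire ) ) expected = 0;
    // Re-check under the lock: another thread may have initialized meanwhile.
    if( !initDone.load( std::memory_order_acquire ) )
    {
        doInit();
        initDone.store( 1, std::memory_order_release );
    }
    initLock.store( 0, std::memory_order_release );
}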


@@ -0,0 +1,419 @@
namespace tracy
{
#if defined __linux__ && defined __ARM_ARCH
static const char* DecodeArmImplementer( uint32_t v )
{
static char buf[16];
switch( v )
{
case 0x41: return "ARM";
case 0x42: return "Broadcom";
case 0x43: return "Cavium";
case 0x44: return "DEC";
case 0x46: return "Fujitsu";
case 0x48: return "HiSilicon";
case 0x49: return "Infineon";
case 0x4d: return "Motorola";
case 0x4e: return "Nvidia";
case 0x50: return "Applied Micro";
case 0x51: return "Qualcomm";
case 0x53: return "Samsung";
case 0x54: return "Texas Instruments";
case 0x56: return "Marvell";
case 0x61: return "Apple";
case 0x66: return "Faraday";
case 0x68: return "HXT";
case 0x69: return "Intel";
case 0xc0: return "Ampere Computing";
default: break;
}
sprintf( buf, "0x%x", v );
return buf;
}
static const char* DecodeArmPart( uint32_t impl, uint32_t part )
{
static char buf[16];
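// Returned part names carry a leading space unless they join directly onto
// the implementer name ("ARM" + "920" -> "ARM920", but "ARM" + " Cortex-A5").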
switch( impl )
{
case 0x41: // ARM
switch( part )
{
case 0x810: return "810";
case 0x920: return "920";
case 0x922: return "922";
case 0x926: return "926";
case 0x940: return "940";
case 0x946: return "946";
case 0x966: return "966";
case 0xa20: return "1020";
case 0xa22: return "1022";
case 0xa26: return "1026";
case 0xb02: return "11 MPCore";
case 0xb36: return "1136";
case 0xb56: return "1156";
case 0xb76: return "1176";
case 0xc05: return " Cortex-A5";
case 0xc07: return " Cortex-A7";
case 0xc08: return " Cortex-A8";
case 0xc09: return " Cortex-A9";
case 0xc0c: return " Cortex-A12";
case 0xc0d: return " Rockchip RK3288";
case 0xc0e: return " Cortex-A17";
case 0xc0f: return " Cortex-A15";
case 0xc14: return " Cortex-R4";
case 0xc15: return " Cortex-R5";
case 0xc17: return " Cortex-R7";
case 0xc18: return " Cortex-R8";
case 0xc20: return " Cortex-M0";
case 0xc21: return " Cortex-M1";
case 0xc23: return " Cortex-M3";
case 0xc24: return " Cortex-M4";
case 0xc27: return " Cortex-M7";
case 0xc60: return " Cortex-M0+";
case 0xd00: return " AArch64 simulator";
case 0xd01: return " Cortex-A32";
case 0xd02: return " Cortex-A34";
case 0xd03: return " Cortex-A53";
case 0xd04: return " Cortex-A35";
case 0xd05: return " Cortex-A55";
case 0xd06: return " Cortex-A65";
case 0xd07: return " Cortex-A57";
case 0xd08: return " Cortex-A72";
case 0xd09: return " Cortex-A73";
case 0xd0a: return " Cortex-A75";
case 0xd0b: return " Cortex-A76";
case 0xd0c: return " Neoverse N1";
case 0xd0d: return " Cortex-A77";
case 0xd0e: return " Cortex-A76AE";
case 0xd0f: return " AEMv8";
case 0xd13: return " Cortex-R52";
case 0xd20: return " Cortex-M23";
case 0xd21: return " Cortex-M33";
case 0xd22: return " Cortex-M55";
case 0xd40: return " Neoverse V1";
case 0xd41: return " Cortex-A78";
case 0xd42: return " Cortex-A78AE";
case 0xd43: return " Cortex-A65AE";
case 0xd44: return " Cortex-X1";
case 0xd47: return " Cortex-A710";
case 0xd48: return " Cortex-X2";
case 0xd49: return " Neoverse N2";
case 0xd4a: return " Neoverse E1";
case 0xd4b: return " Cortex-A78C";
case 0xd4c: return " Cortex-X1C";
default: break;
}
break;
case 0x42: // Broadcom
switch( part )
{
case 0xf: return " Brahma B15";
case 0x100: return " Brahma B53";
case 0x516: return " ThunderX2";
default: break;
}
break;
case 0x43: // Cavium
switch( part )
{
case 0xa0: return " ThunderX";
case 0xa1: return " ThunderX 88XX";
case 0xa2: return " ThunderX 81XX";
case 0xa3: return " ThunderX 83XX";
case 0xaf: return " ThunderX2 99xx";
case 0xb0: return " OcteonTX2";
case 0xb1: return " OcteonTX2 T98";
case 0xb2: return " OcteonTX2 T96";
case 0xb3: return " OcteonTX2 F95";
case 0xb4: return " OcteonTX2 F95N";
case 0xb5: return " OcteonTX2 F95MM";
case 0xb6: return " OcteonTX2 F95O";
case 0xb8: return " ThunderX3 T110";
default: break;
}
break;
case 0x44: // DEC
switch( part )
{
case 0xa10: return " SA110";
case 0xa11: return " SA1100";
default: break;
}
break;
case 0x46: // Fujitsu
switch( part )
{
case 0x1: return " A64FX";
default: break;
}
break;
case 0x48: // HiSilicon
switch( part )
{
case 0xd01: return " TSV100";
case 0xd40: return " Kirin 980";
default: break;
}
break;
case 0x4e: // Nvidia
switch( part )
{
case 0x0: return " Denver";
case 0x3: return " Denver 2";
case 0x4: return " Carmel";
default: break;
}
break;
case 0x50: // Applied Micro
switch( part )
{
case 0x0: return " X-Gene";
default: break;
}
break;
case 0x51: // Qualcomm
switch( part )
{
case 0xf: return " Scorpion";
case 0x2d: return " Scorpion";
case 0x4d: return " Krait";
case 0x6f: return " Krait";
case 0x200: return " Kryo";
case 0x201: return " Kryo Silver (Snapdragon 821)";
case 0x205: return " Kryo Gold";
case 0x211: return " Kryo Silver (Snapdragon 820)";
case 0x800: return " Kryo 260 / 280 Gold";
case 0x801: return " Kryo 260 / 280 Silver";
case 0x802: return " Kryo 385 Gold";
case 0x803: return " Kryo 385 Silver";
case 0x804: return " Kryo 485 Gold";
case 0x805: return " Kryo 4xx/5xx Silver";
case 0xc00: return " Falkor";
case 0xc01: return " Saphira";
default: break;
}
break;
case 0x53: // Samsung
switch( part )
{
case 0x1: return " Exynos M1/M2";
case 0x2: return " Exynos M3";
case 0x3: return " Exynos M4";
case 0x4: return " Exynos M5";
default: break;
}
break;
case 0x54: // Texas Instruments
switch( part )
{
case 0x925: return " TI925";
default: break;
}
break;
case 0x56: // Marvell
switch( part )
{
case 0x131: return " Feroceon 88FR131";
case 0x581: return " PJ4 / PJ4B";
case 0x584: return " PJ4B-MP / PJ4C";
default: break;
}
break;
case 0x61: // Apple
switch( part )
{
case 0x1: return " Cyclone";
case 0x2: return " Typhoon";
case 0x3: return " Typhoon/Capri";
case 0x4: return " Twister";
case 0x5: return " Twister/Elba/Malta";
case 0x6: return " Hurricane";
case 0x7: return " Hurricane/Myst";
case 0x22: return " M1 Icestorm";
case 0x23: return " M1 Firestorm";
case 0x24: return " M1 Icestorm Pro";
case 0x25: return " M1 Firestorm Pro";
case 0x28: return " M1 Icestorm Max";
case 0x29: return " M1 Firestorm Max";
default: break;
}
break;
case 0x66: // Faraday
switch( part )
{
case 0x526: return " FA526";
case 0x626: return " FA626";
default: break;
}
break;
case 0x68: // HXT
switch( part )
{
case 0x0: return " Phecda";
default: break;
}
break;
case 0xc0: // Ampere Computing
switch( part )
{
case 0xac3: return " Ampere1";
default: break;
}
break;
default: break;
}
sprintf( buf, " 0x%x", part );
return buf;
}
#elif defined __APPLE__ && TARGET_OS_IPHONE == 1
static const char* DecodeIosDevice( const char* id )
{
static const char* DeviceTable[] = {
"i386", "32-bit simulator",
"x86_64", "64-bit simulator",
"iPhone1,1", "iPhone",
"iPhone1,2", "iPhone 3G",
"iPhone2,1", "iPhone 3GS",
"iPhone3,1", "iPhone 4 (GSM)",
"iPhone3,2", "iPhone 4 (GSM)",
"iPhone3,3", "iPhone 4 (CDMA)",
"iPhone4,1", "iPhone 4S",
"iPhone5,1", "iPhone 5 (A1428)",
"iPhone5,2", "iPhone 5 (A1429)",
"iPhone5,3", "iPhone 5c (A1456/A1532)",
"iPhone5,4", "iPhone 5c (A1507/A1516/1526/A1529)",
"iPhone6,1", "iPhone 5s (A1433/A1533)",
"iPhone6,2", "iPhone 5s (A1457/A1518/A1528/A1530)",
"iPhone7,1", "iPhone 6 Plus",
"iPhone7,2", "iPhone 6",
"iPhone8,1", "iPhone 6S",
"iPhone8,2", "iPhone 6S Plus",
"iPhone8,4", "iPhone SE",
"iPhone9,1", "iPhone 7 (CDMA)",
"iPhone9,2", "iPhone 7 Plus (CDMA)",
"iPhone9,3", "iPhone 7 (GSM)",
"iPhone9,4", "iPhone 7 Plus (GSM)",
"iPhone10,1", "iPhone 8 (CDMA)",
"iPhone10,2", "iPhone 8 Plus (CDMA)",
"iPhone10,3", "iPhone X (CDMA)",
"iPhone10,4", "iPhone 8 (GSM)",
"iPhone10,5", "iPhone 8 Plus (GSM)",
"iPhone10,6", "iPhone X (GSM)",
"iPhone11,2", "iPhone XS",
"iPhone11,4", "iPhone XS Max",
"iPhone11,6", "iPhone XS Max China",
"iPhone11,8", "iPhone XR",
"iPhone12,1", "iPhone 11",
"iPhone12,3", "iPhone 11 Pro",
"iPhone12,5", "iPhone 11 Pro Max",
"iPhone12,8", "iPhone SE 2nd Gen",
"iPhone13,1", "iPhone 12 Mini",
"iPhone13,2", "iPhone 12",
"iPhone13,3", "iPhone 12 Pro",
"iPhone13,4", "iPhone 12 Pro Max",
"iPhone14,2", "iPhone 13 Pro",
"iPhone14,3", "iPhone 13 Pro Max",
"iPhone14,4", "iPhone 13 Mini",
"iPhone14,5", "iPhone 13",
"iPhone14,6", "iPhone SE 3rd Gen",
"iPhone14,7", "iPhone 14",
"iPhone14,8", "iPhone 14 Plus",
"iPhone15,2", "iPhone 14 Pro",
"iPhone15,3", "iPhone 14 Pro Max",
"iPhone15,4", "iPhone 15",
"iPhone15,5", "iPhone 15 Plus",
"iPhone16,1", "iPhone 15 Pro",
"iPhone16,2", "iPhone 15 Pro Max",
"iPad1,1", "iPad (A1219/A1337)",
"iPad2,1", "iPad 2 (A1395)",
"iPad2,2", "iPad 2 (A1396)",
"iPad2,3", "iPad 2 (A1397)",
"iPad2,4", "iPad 2 (A1395)",
"iPad2,5", "iPad Mini (A1432)",
"iPad2,6", "iPad Mini (A1454)",
"iPad2,7", "iPad Mini (A1455)",
"iPad3,1", "iPad 3 (A1416)",
"iPad3,2", "iPad 3 (A1403)",
"iPad3,3", "iPad 3 (A1430)",
"iPad3,4", "iPad 4 (A1458)",
"iPad3,5", "iPad 4 (A1459)",
"iPad3,6", "iPad 4 (A1460)",
"iPad4,1", "iPad Air (A1474)",
"iPad4,2", "iPad Air (A1475)",
"iPad4,3", "iPad Air (A1476)",
"iPad4,4", "iPad Mini 2 (A1489)",
"iPad4,5", "iPad Mini 2 (A1490)",
"iPad4,6", "iPad Mini 2 (A1491)",
"iPad4,7", "iPad Mini 3 (A1599)",
"iPad4,8", "iPad Mini 3 (A1600)",
"iPad4,9", "iPad Mini 3 (A1601)",
"iPad5,1", "iPad Mini 4 (A1538)",
"iPad5,2", "iPad Mini 4 (A1550)",
"iPad5,3", "iPad Air 2 (A1566)",
"iPad5,4", "iPad Air 2 (A1567)",
"iPad6,3", "iPad Pro 9.7\" (A1673)",
"iPad6,4", "iPad Pro 9.7\" (A1674)",
"iPad6,5", "iPad Pro 9.7\" (A1675)",
"iPad6,7", "iPad Pro 12.9\" (A1584)",
"iPad6,8", "iPad Pro 12.9\" (A1652)",
"iPad6,11", "iPad 5th gen (A1822)",
"iPad6,12", "iPad 5th gen (A1823)",
"iPad7,1", "iPad Pro 12.9\" 2nd gen (A1670)",
"iPad7,2", "iPad Pro 12.9\" 2nd gen (A1671/A1821)",
"iPad7,3", "iPad Pro 10.5\" (A1701)",
"iPad7,4", "iPad Pro 10.5\" (A1709)",
"iPad7,5", "iPad 6th gen (A1893)",
"iPad7,6", "iPad 6th gen (A1954)",
"iPad7,11", "iPad 7th gen 10.2\" (Wifi)",
"iPad7,12", "iPad 7th gen 10.2\" (Wifi+Cellular)",
"iPad8,1", "iPad Pro 11\" (A1980)",
"iPad8,2", "iPad Pro 11\" (A1980)",
"iPad8,3", "iPad Pro 11\" (A1934/A1979/A2013)",
"iPad8,4", "iPad Pro 11\" (A1934/A1979/A2013)",
"iPad8,5", "iPad Pro 12.9\" 3rd gen (A1876)",
"iPad8,6", "iPad Pro 12.9\" 3rd gen (A1876)",
"iPad8,7", "iPad Pro 12.9\" 3rd gen (A1895/A1983/A2014)",
"iPad8,8", "iPad Pro 12.9\" 3rd gen (A1895/A1983/A2014)",
"iPad8,9", "iPad Pro 11\" 2nd gen (Wifi)",
"iPad8,10", "iPad Pro 11\" 2nd gen (Wifi+Cellular)",
"iPad8,11", "iPad Pro 12.9\" 4th gen (Wifi)",
"iPad8,12", "iPad Pro 12.9\" 4th gen (Wifi+Cellular)",
"iPad11,1", "iPad Mini 5th gen (A2133)",
"iPad11,2", "iPad Mini 5th gen (A2124/A2125/A2126)",
"iPad11,3", "iPad Air 3rd gen (A2152)",
"iPad11,4", "iPad Air 3rd gen (A2123/A2153/A2154)",
"iPad11,6", "iPad 8th gen (WiFi)",
"iPad11,7", "iPad 8th gen (WiFi+Cellular)",
"iPad12,1", "iPad 9th Gen (WiFi)",
"iPad12,2", "iPad 9th Gen (WiFi+Cellular)",
"iPad13,1", "iPad Air 4th gen (WiFi)",
"iPad13,2", "iPad Air 4th gen (WiFi+Cellular)",
"iPad13,4", "iPad Pro 11\" 3rd gen",
"iPad13,5", "iPad Pro 11\" 3rd gen",
"iPad13,6", "iPad Pro 11\" 3rd gen",
"iPad13,7", "iPad Pro 11\" 3rd gen",
"iPad13,8", "iPad Pro 12.9\" 5th gen",
"iPad13,9", "iPad Pro 12.9\" 5th gen",
"iPad13,10", "iPad Pro 12.9\" 5th gen",
"iPad13,11", "iPad Pro 12.9\" 5th gen",
"iPad13,16", "iPad Air 5th Gen (WiFi)",
"iPad13,17", "iPad Air 5th Gen (WiFi+Cellular)",
"iPad13,18", "iPad 10th Gen",
"iPad13,19", "iPad 10th Gen",
"iPad14,1", "iPad mini 6th Gen (WiFi)",
"iPad14,2", "iPad mini 6th Gen (WiFi+Cellular)",
"iPad14,3", "iPad Pro 11\" 4th Gen",
"iPad14,4", "iPad Pro 11\" 4th Gen",
"iPad14,5", "iPad Pro 12.9\" 6th Gen",
"iPad14,6", "iPad Pro 12.9\" 6th Gen",
"iPod1,1", "iPod Touch",
"iPod2,1", "iPod Touch 2nd gen",
"iPod3,1", "iPod Touch 3rd gen",
"iPod4,1", "iPod Touch 4th gen",
"iPod5,1", "iPod Touch 5th gen",
"iPod7,1", "iPod Touch 6th gen",
"iPod9,1", "iPod Touch 7th gen",
nullptr
};
auto ptr = DeviceTable;
while( *ptr )
{
if( strcmp( ptr[0], id ) == 0 ) return ptr[1];
ptr += 2;
}
return id;
}
#endif
}
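
A hypothetical caller of the two Linux/ARM helpers above, turning the "CPU implementer" and "CPU part" fields that /proc/cpuinfo reports into a readable name (DecodeArmPart returns its string with a leading space where appropriate, so plain concatenation works):

#include <stdint.h>
#include <stdio.h>

// Placed inside namespace tracy, where the static helpers are visible:
void PrintArmCpuName( uint32_t implementer, uint32_t part )
{
    // e.g. implementer 0x41, part 0xd08 -> "ARM Cortex-A72"
    printf( "%s%s\n", DecodeArmImplementer( implementer ), DecodeArmPart( implementer, part ) );
}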

File diff suppressed because it is too large


@@ -0,0 +1,35 @@
#ifndef __TRACYCALLSTACK_H__
#define __TRACYCALLSTACK_H__
#ifndef TRACY_NO_CALLSTACK
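// Each TRACY_HAS_CALLSTACK value selects a backtrace backend (see the
// dispatch in TracyCallstack.hpp): 1 = Windows RtlWalkFrameChain,
// 2 and 5 = _Unwind_Backtrace, 3/4/6 = execinfo backtrace() (glibc, Apple,
// BSD), optionally replaced by libunwind via TRACY_LIBUNWIND_BACKTRACE.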
# if !defined _WIN32
# include <sys/param.h>
# endif
# if defined _WIN32
# include "../common/TracyUwp.hpp"
# ifndef TRACY_UWP
# define TRACY_HAS_CALLSTACK 1
# endif
# elif defined __ANDROID__
# if !defined __arm__ || __ANDROID_API__ >= 21
# define TRACY_HAS_CALLSTACK 2
# else
# define TRACY_HAS_CALLSTACK 5
# endif
# elif defined __linux
# if defined _GNU_SOURCE && defined __GLIBC__
# define TRACY_HAS_CALLSTACK 3
# else
# define TRACY_HAS_CALLSTACK 2
# endif
# elif defined __APPLE__
# define TRACY_HAS_CALLSTACK 4
# elif defined BSD
# define TRACY_HAS_CALLSTACK 6
# endif
#endif
#endif


@@ -0,0 +1,153 @@
#ifndef __TRACYCALLSTACK_HPP__
#define __TRACYCALLSTACK_HPP__
#include "../common/TracyApi.h"
#include "../common/TracyForceInline.hpp"
#include "TracyCallstack.h"
#if TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 5
# include <unwind.h>
#elif TRACY_HAS_CALLSTACK >= 3
# ifdef TRACY_LIBUNWIND_BACKTRACE
// libunwind is, in general, significantly faster than execinfo based backtraces
# define UNW_LOCAL_ONLY
# include <libunwind.h>
# else
# include <execinfo.h>
# endif
#endif
#ifndef TRACY_HAS_CALLSTACK
namespace tracy
{
static tracy_force_inline void* Callstack( int depth ) { return nullptr; }
}
#else
#ifdef TRACY_DEBUGINFOD
# include <elfutils/debuginfod.h>
#endif
#include <assert.h>
#include <stdint.h>
#include "../common/TracyAlloc.hpp"
namespace tracy
{
struct CallstackSymbolData
{
const char* file;
uint32_t line;
bool needFree;
uint64_t symAddr;
};
struct CallstackEntry
{
const char* name;
const char* file;
uint32_t line;
uint32_t symLen;
uint64_t symAddr;
};
struct CallstackEntryData
{
const CallstackEntry* data;
uint8_t size;
const char* imageName;
};
CallstackSymbolData DecodeSymbolAddress( uint64_t ptr );
const char* DecodeCallstackPtrFast( uint64_t ptr );
CallstackEntryData DecodeCallstackPtr( uint64_t ptr );
void InitCallstack();
void InitCallstackCritical();
void EndCallstack();
const char* GetKernelModulePath( uint64_t addr );
#ifdef TRACY_DEBUGINFOD
const uint8_t* GetBuildIdForImage( const char* image, size_t& size );
debuginfod_client* GetDebuginfodClient();
#endif
#if TRACY_HAS_CALLSTACK == 1
extern "C"
{
typedef unsigned long (__stdcall *___tracy_t_RtlWalkFrameChain)( void**, unsigned long, unsigned long );
TRACY_API extern ___tracy_t_RtlWalkFrameChain ___tracy_RtlWalkFrameChain;
}
static tracy_force_inline void* Callstack( int depth )
{
assert( depth >= 1 && depth < 63 );
auto trace = (uintptr_t*)tracy_malloc( ( 1 + depth ) * sizeof( uintptr_t ) );
const auto num = ___tracy_RtlWalkFrameChain( (void**)( trace + 1 ), depth, 0 );
*trace = num;
return trace;
}
#elif TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 5
struct BacktraceState
{
void** current;
void** end;
};
static _Unwind_Reason_Code tracy_unwind_callback( struct _Unwind_Context* ctx, void* arg )
{
auto state = (BacktraceState*)arg;
uintptr_t pc = _Unwind_GetIP( ctx );
if( pc )
{
if( state->current == state->end ) return _URC_END_OF_STACK;
*state->current++ = (void*)pc;
}
return _URC_NO_REASON;
}
static tracy_force_inline void* Callstack( int depth )
{
assert( depth >= 1 && depth < 63 );
auto trace = (uintptr_t*)tracy_malloc( ( 1 + depth ) * sizeof( uintptr_t ) );
BacktraceState state = { (void**)(trace+1), (void**)(trace+1+depth) };
_Unwind_Backtrace( tracy_unwind_callback, &state );
*trace = (uintptr_t*)state.current - trace - 1;
return trace;
}
#elif TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 4 || TRACY_HAS_CALLSTACK == 6
static tracy_force_inline void* Callstack( int depth )
{
assert( depth >= 1 );
auto trace = (uintptr_t*)tracy_malloc( ( 1 + (size_t)depth ) * sizeof( uintptr_t ) );
#ifdef TRACY_LIBUNWIND_BACKTRACE
size_t num = unw_backtrace( (void**)(trace+1), depth );
#else
const auto num = (size_t)backtrace( (void**)(trace+1), depth );
#endif
*trace = num;
return trace;
}
#endif
}
#endif
#endif
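
A sketch of consuming the buffer that Callstack() returns: slot 0 holds the frame count and slots 1..count hold the raw program counters; the memory comes from tracy_malloc, so it is released with tracy_free. (When no backend is compiled in, the stub returns nullptr.)

#include <stdio.h>

void DumpCallstack()
{
    auto trace = (uintptr_t*)tracy::Callstack( 16 ); // capture up to 16 frames
    if( !trace ) return;                             // stub backend returns nullptr
    const auto count = trace[0];
    for( uintptr_t i = 0; i < count; i++ )
    {
        printf( "frame %d: %p\n", (int)i, (void*)trace[1 + i] );
    }
    tracy::tracy_free( trace );
}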


@@ -0,0 +1,12 @@
#ifndef __TRACYCPUID_HPP__
#define __TRACYCPUID_HPP__
// Prior to GCC 11 the cpuid.h header did not have any include guards and thus
// including it more than once would cause a compiler error due to symbol
// redefinitions. In order to support older GCC versions, we have to wrap this
// include between custom include guards to prevent this issue.
// See also https://github.com/wolfpld/tracy/issues/452
#include <cpuid.h>
#endif


@@ -0,0 +1,11 @@
#ifndef __TRACYPRINT_HPP__
#define __TRACYPRINT_HPP__
#ifdef TRACY_VERBOSE
# include <stdio.h>
# define TracyDebug(...) fprintf( stderr, __VA_ARGS__ );
#else
# define TracyDebug(...)
#endif
#endif
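
TracyDebug expands to an fprintf to stderr only when TRACY_VERBOSE is defined and to nothing otherwise, so diagnostic calls cost nothing in normal builds. A hypothetical call site (count and imageName being whatever is in scope):

TracyDebug( "resolved %i symbols in %s\n", count, imageName );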


@@ -0,0 +1,644 @@
#include "TracyDxt1.hpp"
#include "../common/TracyForceInline.hpp"
#include <assert.h>
#include <stdint.h>
#include <string.h>
#ifdef __ARM_NEON
# include <arm_neon.h>
#endif
#if defined __AVX__ && !defined __SSE4_1__
# define __SSE4_1__
#endif
#if defined __SSE4_1__ || defined __AVX2__
# ifdef _MSC_VER
# include <intrin.h>
# else
# include <x86intrin.h>
# ifndef _mm256_cvtsi256_si32
# define _mm256_cvtsi256_si32( v ) ( _mm_cvtsi128_si32( _mm256_castsi256_si128( v ) ) )
# endif
# endif
#endif
namespace tracy
{
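// to565 packs 8-bit RGB into RGB565: top 5 bits of red, 6 of green, 5 of blue.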
static inline uint16_t to565( uint8_t r, uint8_t g, uint8_t b )
{
return ( ( r & 0xF8 ) << 8 ) | ( ( g & 0xFC ) << 3 ) | ( b >> 3 );
}
static inline uint16_t to565( uint32_t c )
{
return
( ( c & 0xF80000 ) >> 19 ) |
( ( c & 0x00FC00 ) >> 5 ) |
( ( c & 0x0000F8 ) << 8 );
}
static const uint16_t DivTable[255*3+1] = {
0xffff, 0xffff, 0xffff, 0xffff, 0xcccc, 0xaaaa, 0x9249, 0x8000, 0x71c7, 0x6666, 0x5d17, 0x5555, 0x4ec4, 0x4924, 0x4444, 0x4000,
0x3c3c, 0x38e3, 0x35e5, 0x3333, 0x30c3, 0x2e8b, 0x2c85, 0x2aaa, 0x28f5, 0x2762, 0x25ed, 0x2492, 0x234f, 0x2222, 0x2108, 0x2000,
0x1f07, 0x1e1e, 0x1d41, 0x1c71, 0x1bac, 0x1af2, 0x1a41, 0x1999, 0x18f9, 0x1861, 0x17d0, 0x1745, 0x16c1, 0x1642, 0x15c9, 0x1555,
0x14e5, 0x147a, 0x1414, 0x13b1, 0x1352, 0x12f6, 0x129e, 0x1249, 0x11f7, 0x11a7, 0x115b, 0x1111, 0x10c9, 0x1084, 0x1041, 0x1000,
0x0fc0, 0x0f83, 0x0f48, 0x0f0f, 0x0ed7, 0x0ea0, 0x0e6c, 0x0e38, 0x0e07, 0x0dd6, 0x0da7, 0x0d79, 0x0d4c, 0x0d20, 0x0cf6, 0x0ccc,
0x0ca4, 0x0c7c, 0x0c56, 0x0c30, 0x0c0c, 0x0be8, 0x0bc5, 0x0ba2, 0x0b81, 0x0b60, 0x0b40, 0x0b21, 0x0b02, 0x0ae4, 0x0ac7, 0x0aaa,
0x0a8e, 0x0a72, 0x0a57, 0x0a3d, 0x0a23, 0x0a0a, 0x09f1, 0x09d8, 0x09c0, 0x09a9, 0x0991, 0x097b, 0x0964, 0x094f, 0x0939, 0x0924,
0x090f, 0x08fb, 0x08e7, 0x08d3, 0x08c0, 0x08ad, 0x089a, 0x0888, 0x0876, 0x0864, 0x0853, 0x0842, 0x0831, 0x0820, 0x0810, 0x0800,
0x07f0, 0x07e0, 0x07d1, 0x07c1, 0x07b3, 0x07a4, 0x0795, 0x0787, 0x0779, 0x076b, 0x075d, 0x0750, 0x0743, 0x0736, 0x0729, 0x071c,
0x070f, 0x0703, 0x06f7, 0x06eb, 0x06df, 0x06d3, 0x06c8, 0x06bc, 0x06b1, 0x06a6, 0x069b, 0x0690, 0x0685, 0x067b, 0x0670, 0x0666,
0x065c, 0x0652, 0x0648, 0x063e, 0x0634, 0x062b, 0x0621, 0x0618, 0x060f, 0x0606, 0x05fd, 0x05f4, 0x05eb, 0x05e2, 0x05d9, 0x05d1,
0x05c9, 0x05c0, 0x05b8, 0x05b0, 0x05a8, 0x05a0, 0x0598, 0x0590, 0x0588, 0x0581, 0x0579, 0x0572, 0x056b, 0x0563, 0x055c, 0x0555,
0x054e, 0x0547, 0x0540, 0x0539, 0x0532, 0x052b, 0x0525, 0x051e, 0x0518, 0x0511, 0x050b, 0x0505, 0x04fe, 0x04f8, 0x04f2, 0x04ec,
0x04e6, 0x04e0, 0x04da, 0x04d4, 0x04ce, 0x04c8, 0x04c3, 0x04bd, 0x04b8, 0x04b2, 0x04ad, 0x04a7, 0x04a2, 0x049c, 0x0497, 0x0492,
0x048d, 0x0487, 0x0482, 0x047d, 0x0478, 0x0473, 0x046e, 0x0469, 0x0465, 0x0460, 0x045b, 0x0456, 0x0452, 0x044d, 0x0448, 0x0444,
0x043f, 0x043b, 0x0436, 0x0432, 0x042d, 0x0429, 0x0425, 0x0421, 0x041c, 0x0418, 0x0414, 0x0410, 0x040c, 0x0408, 0x0404, 0x0400,
0x03fc, 0x03f8, 0x03f4, 0x03f0, 0x03ec, 0x03e8, 0x03e4, 0x03e0, 0x03dd, 0x03d9, 0x03d5, 0x03d2, 0x03ce, 0x03ca, 0x03c7, 0x03c3,
0x03c0, 0x03bc, 0x03b9, 0x03b5, 0x03b2, 0x03ae, 0x03ab, 0x03a8, 0x03a4, 0x03a1, 0x039e, 0x039b, 0x0397, 0x0394, 0x0391, 0x038e,
0x038b, 0x0387, 0x0384, 0x0381, 0x037e, 0x037b, 0x0378, 0x0375, 0x0372, 0x036f, 0x036c, 0x0369, 0x0366, 0x0364, 0x0361, 0x035e,
0x035b, 0x0358, 0x0355, 0x0353, 0x0350, 0x034d, 0x034a, 0x0348, 0x0345, 0x0342, 0x0340, 0x033d, 0x033a, 0x0338, 0x0335, 0x0333,
0x0330, 0x032e, 0x032b, 0x0329, 0x0326, 0x0324, 0x0321, 0x031f, 0x031c, 0x031a, 0x0317, 0x0315, 0x0313, 0x0310, 0x030e, 0x030c,
0x0309, 0x0307, 0x0305, 0x0303, 0x0300, 0x02fe, 0x02fc, 0x02fa, 0x02f7, 0x02f5, 0x02f3, 0x02f1, 0x02ef, 0x02ec, 0x02ea, 0x02e8,
0x02e6, 0x02e4, 0x02e2, 0x02e0, 0x02de, 0x02dc, 0x02da, 0x02d8, 0x02d6, 0x02d4, 0x02d2, 0x02d0, 0x02ce, 0x02cc, 0x02ca, 0x02c8,
0x02c6, 0x02c4, 0x02c2, 0x02c0, 0x02be, 0x02bc, 0x02bb, 0x02b9, 0x02b7, 0x02b5, 0x02b3, 0x02b1, 0x02b0, 0x02ae, 0x02ac, 0x02aa,
0x02a8, 0x02a7, 0x02a5, 0x02a3, 0x02a1, 0x02a0, 0x029e, 0x029c, 0x029b, 0x0299, 0x0297, 0x0295, 0x0294, 0x0292, 0x0291, 0x028f,
0x028d, 0x028c, 0x028a, 0x0288, 0x0287, 0x0285, 0x0284, 0x0282, 0x0280, 0x027f, 0x027d, 0x027c, 0x027a, 0x0279, 0x0277, 0x0276,
0x0274, 0x0273, 0x0271, 0x0270, 0x026e, 0x026d, 0x026b, 0x026a, 0x0268, 0x0267, 0x0265, 0x0264, 0x0263, 0x0261, 0x0260, 0x025e,
0x025d, 0x025c, 0x025a, 0x0259, 0x0257, 0x0256, 0x0255, 0x0253, 0x0252, 0x0251, 0x024f, 0x024e, 0x024d, 0x024b, 0x024a, 0x0249,
0x0247, 0x0246, 0x0245, 0x0243, 0x0242, 0x0241, 0x0240, 0x023e, 0x023d, 0x023c, 0x023b, 0x0239, 0x0238, 0x0237, 0x0236, 0x0234,
0x0233, 0x0232, 0x0231, 0x0230, 0x022e, 0x022d, 0x022c, 0x022b, 0x022a, 0x0229, 0x0227, 0x0226, 0x0225, 0x0224, 0x0223, 0x0222,
0x0220, 0x021f, 0x021e, 0x021d, 0x021c, 0x021b, 0x021a, 0x0219, 0x0218, 0x0216, 0x0215, 0x0214, 0x0213, 0x0212, 0x0211, 0x0210,
0x020f, 0x020e, 0x020d, 0x020c, 0x020b, 0x020a, 0x0209, 0x0208, 0x0207, 0x0206, 0x0205, 0x0204, 0x0203, 0x0202, 0x0201, 0x0200,
0x01ff, 0x01fe, 0x01fd, 0x01fc, 0x01fb, 0x01fa, 0x01f9, 0x01f8, 0x01f7, 0x01f6, 0x01f5, 0x01f4, 0x01f3, 0x01f2, 0x01f1, 0x01f0,
0x01ef, 0x01ee, 0x01ed, 0x01ec, 0x01eb, 0x01ea, 0x01e9, 0x01e9, 0x01e8, 0x01e7, 0x01e6, 0x01e5, 0x01e4, 0x01e3, 0x01e2, 0x01e1,
0x01e0, 0x01e0, 0x01df, 0x01de, 0x01dd, 0x01dc, 0x01db, 0x01da, 0x01da, 0x01d9, 0x01d8, 0x01d7, 0x01d6, 0x01d5, 0x01d4, 0x01d4,
0x01d3, 0x01d2, 0x01d1, 0x01d0, 0x01cf, 0x01cf, 0x01ce, 0x01cd, 0x01cc, 0x01cb, 0x01cb, 0x01ca, 0x01c9, 0x01c8, 0x01c7, 0x01c7,
0x01c6, 0x01c5, 0x01c4, 0x01c3, 0x01c3, 0x01c2, 0x01c1, 0x01c0, 0x01c0, 0x01bf, 0x01be, 0x01bd, 0x01bd, 0x01bc, 0x01bb, 0x01ba,
0x01ba, 0x01b9, 0x01b8, 0x01b7, 0x01b7, 0x01b6, 0x01b5, 0x01b4, 0x01b4, 0x01b3, 0x01b2, 0x01b2, 0x01b1, 0x01b0, 0x01af, 0x01af,
0x01ae, 0x01ad, 0x01ad, 0x01ac, 0x01ab, 0x01aa, 0x01aa, 0x01a9, 0x01a8, 0x01a8, 0x01a7, 0x01a6, 0x01a6, 0x01a5, 0x01a4, 0x01a4,
0x01a3, 0x01a2, 0x01a2, 0x01a1, 0x01a0, 0x01a0, 0x019f, 0x019e, 0x019e, 0x019d, 0x019c, 0x019c, 0x019b, 0x019a, 0x019a, 0x0199,
0x0198, 0x0198, 0x0197, 0x0197, 0x0196, 0x0195, 0x0195, 0x0194, 0x0193, 0x0193, 0x0192, 0x0192, 0x0191, 0x0190, 0x0190, 0x018f,
0x018f, 0x018e, 0x018d, 0x018d, 0x018c, 0x018b, 0x018b, 0x018a, 0x018a, 0x0189, 0x0189, 0x0188, 0x0187, 0x0187, 0x0186, 0x0186,
0x0185, 0x0184, 0x0184, 0x0183, 0x0183, 0x0182, 0x0182, 0x0181, 0x0180, 0x0180, 0x017f, 0x017f, 0x017e, 0x017e, 0x017d, 0x017d,
0x017c, 0x017b, 0x017b, 0x017a, 0x017a, 0x0179, 0x0179, 0x0178, 0x0178, 0x0177, 0x0177, 0x0176, 0x0175, 0x0175, 0x0174, 0x0174,
0x0173, 0x0173, 0x0172, 0x0172, 0x0171, 0x0171, 0x0170, 0x0170, 0x016f, 0x016f, 0x016e, 0x016e, 0x016d, 0x016d, 0x016c, 0x016c,
0x016b, 0x016b, 0x016a, 0x016a, 0x0169, 0x0169, 0x0168, 0x0168, 0x0167, 0x0167, 0x0166, 0x0166, 0x0165, 0x0165, 0x0164, 0x0164,
0x0163, 0x0163, 0x0162, 0x0162, 0x0161, 0x0161, 0x0160, 0x0160, 0x015f, 0x015f, 0x015e, 0x015e, 0x015d, 0x015d, 0x015d, 0x015c,
0x015c, 0x015b, 0x015b, 0x015a, 0x015a, 0x0159, 0x0159, 0x0158, 0x0158, 0x0158, 0x0157, 0x0157, 0x0156, 0x0156
};
#if defined __ARM_NEON && defined __aarch64__
static const uint16_t DivTableNEON[255*3+1] = {
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x1c71, 0x1af2, 0x1999, 0x1861, 0x1745, 0x1642, 0x1555, 0x147a, 0x13b1, 0x12f6, 0x1249, 0x11a7, 0x1111, 0x1084, 0x1000,
0x0f83, 0x0f0f, 0x0ea0, 0x0e38, 0x0dd6, 0x0d79, 0x0d20, 0x0ccc, 0x0c7c, 0x0c30, 0x0be8, 0x0ba2, 0x0b60, 0x0b21, 0x0ae4, 0x0aaa,
0x0a72, 0x0a3d, 0x0a0a, 0x09d8, 0x09a9, 0x097b, 0x094f, 0x0924, 0x08fb, 0x08d3, 0x08ad, 0x0888, 0x0864, 0x0842, 0x0820, 0x0800,
0x07e0, 0x07c1, 0x07a4, 0x0787, 0x076b, 0x0750, 0x0736, 0x071c, 0x0703, 0x06eb, 0x06d3, 0x06bc, 0x06a6, 0x0690, 0x067b, 0x0666,
0x0652, 0x063e, 0x062b, 0x0618, 0x0606, 0x05f4, 0x05e2, 0x05d1, 0x05c0, 0x05b0, 0x05a0, 0x0590, 0x0581, 0x0572, 0x0563, 0x0555,
0x0547, 0x0539, 0x052b, 0x051e, 0x0511, 0x0505, 0x04f8, 0x04ec, 0x04e0, 0x04d4, 0x04c8, 0x04bd, 0x04b2, 0x04a7, 0x049c, 0x0492,
0x0487, 0x047d, 0x0473, 0x0469, 0x0460, 0x0456, 0x044d, 0x0444, 0x043b, 0x0432, 0x0429, 0x0421, 0x0418, 0x0410, 0x0408, 0x0400,
0x03f8, 0x03f0, 0x03e8, 0x03e0, 0x03d9, 0x03d2, 0x03ca, 0x03c3, 0x03bc, 0x03b5, 0x03ae, 0x03a8, 0x03a1, 0x039b, 0x0394, 0x038e,
0x0387, 0x0381, 0x037b, 0x0375, 0x036f, 0x0369, 0x0364, 0x035e, 0x0358, 0x0353, 0x034d, 0x0348, 0x0342, 0x033d, 0x0338, 0x0333,
0x032e, 0x0329, 0x0324, 0x031f, 0x031a, 0x0315, 0x0310, 0x030c, 0x0307, 0x0303, 0x02fe, 0x02fa, 0x02f5, 0x02f1, 0x02ec, 0x02e8,
0x02e4, 0x02e0, 0x02dc, 0x02d8, 0x02d4, 0x02d0, 0x02cc, 0x02c8, 0x02c4, 0x02c0, 0x02bc, 0x02b9, 0x02b5, 0x02b1, 0x02ae, 0x02aa,
0x02a7, 0x02a3, 0x02a0, 0x029c, 0x0299, 0x0295, 0x0292, 0x028f, 0x028c, 0x0288, 0x0285, 0x0282, 0x027f, 0x027c, 0x0279, 0x0276,
0x0273, 0x0270, 0x026d, 0x026a, 0x0267, 0x0264, 0x0261, 0x025e, 0x025c, 0x0259, 0x0256, 0x0253, 0x0251, 0x024e, 0x024b, 0x0249,
0x0246, 0x0243, 0x0241, 0x023e, 0x023c, 0x0239, 0x0237, 0x0234, 0x0232, 0x0230, 0x022d, 0x022b, 0x0229, 0x0226, 0x0224, 0x0222,
0x021f, 0x021d, 0x021b, 0x0219, 0x0216, 0x0214, 0x0212, 0x0210, 0x020e, 0x020c, 0x020a, 0x0208, 0x0206, 0x0204, 0x0202, 0x0200,
0x01fe, 0x01fc, 0x01fa, 0x01f8, 0x01f6, 0x01f4, 0x01f2, 0x01f0, 0x01ee, 0x01ec, 0x01ea, 0x01e9, 0x01e7, 0x01e5, 0x01e3, 0x01e1,
0x01e0, 0x01de, 0x01dc, 0x01da, 0x01d9, 0x01d7, 0x01d5, 0x01d4, 0x01d2, 0x01d0, 0x01cf, 0x01cd, 0x01cb, 0x01ca, 0x01c8, 0x01c7,
0x01c5, 0x01c3, 0x01c2, 0x01c0, 0x01bf, 0x01bd, 0x01bc, 0x01ba, 0x01b9, 0x01b7, 0x01b6, 0x01b4, 0x01b3, 0x01b2, 0x01b0, 0x01af,
0x01ad, 0x01ac, 0x01aa, 0x01a9, 0x01a8, 0x01a6, 0x01a5, 0x01a4, 0x01a2, 0x01a1, 0x01a0, 0x019e, 0x019d, 0x019c, 0x019a, 0x0199,
0x0198, 0x0197, 0x0195, 0x0194, 0x0193, 0x0192, 0x0190, 0x018f, 0x018e, 0x018d, 0x018b, 0x018a, 0x0189, 0x0188, 0x0187, 0x0186,
0x0184, 0x0183, 0x0182, 0x0181, 0x0180, 0x017f, 0x017e, 0x017d, 0x017b, 0x017a, 0x0179, 0x0178, 0x0177, 0x0176, 0x0175, 0x0174,
0x0173, 0x0172, 0x0171, 0x0170, 0x016f, 0x016e, 0x016d, 0x016c, 0x016b, 0x016a, 0x0169, 0x0168, 0x0167, 0x0166, 0x0165, 0x0164,
0x0163, 0x0162, 0x0161, 0x0160, 0x015f, 0x015e, 0x015d, 0x015c, 0x015b, 0x015a, 0x0159, 0x0158, 0x0158, 0x0157, 0x0156, 0x0155,
0x0154, 0x0153, 0x0152, 0x0151, 0x0150, 0x0150, 0x014f, 0x014e, 0x014d, 0x014c, 0x014b, 0x014a, 0x014a, 0x0149, 0x0148, 0x0147,
0x0146, 0x0146, 0x0145, 0x0144, 0x0143, 0x0142, 0x0142, 0x0141, 0x0140, 0x013f, 0x013e, 0x013e, 0x013d, 0x013c, 0x013b, 0x013b,
0x013a, 0x0139, 0x0138, 0x0138, 0x0137, 0x0136, 0x0135, 0x0135, 0x0134, 0x0133, 0x0132, 0x0132, 0x0131, 0x0130, 0x0130, 0x012f,
0x012e, 0x012e, 0x012d, 0x012c, 0x012b, 0x012b, 0x012a, 0x0129, 0x0129, 0x0128, 0x0127, 0x0127, 0x0126, 0x0125, 0x0125, 0x0124,
0x0123, 0x0123, 0x0122, 0x0121, 0x0121, 0x0120, 0x0120, 0x011f, 0x011e, 0x011e, 0x011d, 0x011c, 0x011c, 0x011b, 0x011b, 0x011a,
0x0119, 0x0119, 0x0118, 0x0118, 0x0117, 0x0116, 0x0116, 0x0115, 0x0115, 0x0114, 0x0113, 0x0113, 0x0112, 0x0112, 0x0111, 0x0111,
0x0110, 0x010f, 0x010f, 0x010e, 0x010e, 0x010d, 0x010d, 0x010c, 0x010c, 0x010b, 0x010a, 0x010a, 0x0109, 0x0109, 0x0108, 0x0108,
0x0107, 0x0107, 0x0106, 0x0106, 0x0105, 0x0105, 0x0104, 0x0104, 0x0103, 0x0103, 0x0102, 0x0102, 0x0101, 0x0101, 0x0100, 0x0100,
0x00ff, 0x00ff, 0x00fe, 0x00fe, 0x00fd, 0x00fd, 0x00fc, 0x00fc, 0x00fb, 0x00fb, 0x00fa, 0x00fa, 0x00f9, 0x00f9, 0x00f8, 0x00f8,
0x00f7, 0x00f7, 0x00f6, 0x00f6, 0x00f5, 0x00f5, 0x00f4, 0x00f4, 0x00f4, 0x00f3, 0x00f3, 0x00f2, 0x00f2, 0x00f1, 0x00f1, 0x00f0,
0x00f0, 0x00f0, 0x00ef, 0x00ef, 0x00ee, 0x00ee, 0x00ed, 0x00ed, 0x00ed, 0x00ec, 0x00ec, 0x00eb, 0x00eb, 0x00ea, 0x00ea, 0x00ea,
0x00e9, 0x00e9, 0x00e8, 0x00e8, 0x00e7, 0x00e7, 0x00e7, 0x00e6, 0x00e6, 0x00e5, 0x00e5, 0x00e5, 0x00e4, 0x00e4, 0x00e3, 0x00e3,
0x00e3, 0x00e2, 0x00e2, 0x00e1, 0x00e1, 0x00e1, 0x00e0, 0x00e0, 0x00e0, 0x00df, 0x00df, 0x00de, 0x00de, 0x00de, 0x00dd, 0x00dd,
0x00dd, 0x00dc, 0x00dc, 0x00db, 0x00db, 0x00db, 0x00da, 0x00da, 0x00da, 0x00d9, 0x00d9, 0x00d9, 0x00d8, 0x00d8, 0x00d7, 0x00d7,
0x00d7, 0x00d6, 0x00d6, 0x00d6, 0x00d5, 0x00d5, 0x00d5, 0x00d4, 0x00d4, 0x00d4, 0x00d3, 0x00d3, 0x00d3, 0x00d2, 0x00d2, 0x00d2,
0x00d1, 0x00d1, 0x00d1, 0x00d0, 0x00d0, 0x00d0, 0x00cf, 0x00cf, 0x00cf, 0x00ce, 0x00ce, 0x00ce, 0x00cd, 0x00cd, 0x00cd, 0x00cc,
0x00cc, 0x00cc, 0x00cb, 0x00cb, 0x00cb, 0x00ca, 0x00ca, 0x00ca, 0x00c9, 0x00c9, 0x00c9, 0x00c9, 0x00c8, 0x00c8, 0x00c8, 0x00c7,
0x00c7, 0x00c7, 0x00c6, 0x00c6, 0x00c6, 0x00c5, 0x00c5, 0x00c5, 0x00c5, 0x00c4, 0x00c4, 0x00c4, 0x00c3, 0x00c3, 0x00c3, 0x00c3,
0x00c2, 0x00c2, 0x00c2, 0x00c1, 0x00c1, 0x00c1, 0x00c1, 0x00c0, 0x00c0, 0x00c0, 0x00bf, 0x00bf, 0x00bf, 0x00bf, 0x00be, 0x00be,
0x00be, 0x00bd, 0x00bd, 0x00bd, 0x00bd, 0x00bc, 0x00bc, 0x00bc, 0x00bc, 0x00bb, 0x00bb, 0x00bb, 0x00ba, 0x00ba, 0x00ba, 0x00ba,
0x00b9, 0x00b9, 0x00b9, 0x00b9, 0x00b8, 0x00b8, 0x00b8, 0x00b8, 0x00b7, 0x00b7, 0x00b7, 0x00b7, 0x00b6, 0x00b6, 0x00b6, 0x00b6,
0x00b5, 0x00b5, 0x00b5, 0x00b5, 0x00b4, 0x00b4, 0x00b4, 0x00b4, 0x00b3, 0x00b3, 0x00b3, 0x00b3, 0x00b2, 0x00b2, 0x00b2, 0x00b2,
0x00b1, 0x00b1, 0x00b1, 0x00b1, 0x00b0, 0x00b0, 0x00b0, 0x00b0, 0x00af, 0x00af, 0x00af, 0x00af, 0x00ae, 0x00ae, 0x00ae, 0x00ae,
0x00ae, 0x00ad, 0x00ad, 0x00ad, 0x00ad, 0x00ac, 0x00ac, 0x00ac, 0x00ac, 0x00ac, 0x00ab, 0x00ab, 0x00ab, 0x00ab,
};
#endif
static tracy_force_inline uint64_t ProcessRGB( const uint8_t* src )
{
#ifdef __SSE4_1__
__m128i px0 = _mm_loadu_si128(((__m128i*)src) + 0);
__m128i px1 = _mm_loadu_si128(((__m128i*)src) + 1);
__m128i px2 = _mm_loadu_si128(((__m128i*)src) + 2);
__m128i px3 = _mm_loadu_si128(((__m128i*)src) + 3);
__m128i smask = _mm_set1_epi32( 0xF8FCF8 );
__m128i sd0 = _mm_and_si128( px0, smask );
__m128i sd1 = _mm_and_si128( px1, smask );
__m128i sd2 = _mm_and_si128( px2, smask );
__m128i sd3 = _mm_and_si128( px3, smask );
__m128i sc = _mm_shuffle_epi32(sd0, _MM_SHUFFLE(0, 0, 0, 0));
__m128i sc0 = _mm_cmpeq_epi8(sd0, sc);
__m128i sc1 = _mm_cmpeq_epi8(sd1, sc);
__m128i sc2 = _mm_cmpeq_epi8(sd2, sc);
__m128i sc3 = _mm_cmpeq_epi8(sd3, sc);
__m128i sm0 = _mm_and_si128(sc0, sc1);
__m128i sm1 = _mm_and_si128(sc2, sc3);
__m128i sm = _mm_and_si128(sm0, sm1);
if( _mm_testc_si128(sm, _mm_set1_epi32(-1)) )
{
return uint64_t( to565( src[0], src[1], src[2] ) ) << 16;
}
__m128i amask = _mm_set1_epi32( 0xFFFFFF );
px0 = _mm_and_si128( px0, amask );
px1 = _mm_and_si128( px1, amask );
px2 = _mm_and_si128( px2, amask );
px3 = _mm_and_si128( px3, amask );
__m128i min0 = _mm_min_epu8( px0, px1 );
__m128i min1 = _mm_min_epu8( px2, px3 );
__m128i min2 = _mm_min_epu8( min0, min1 );
__m128i max0 = _mm_max_epu8( px0, px1 );
__m128i max1 = _mm_max_epu8( px2, px3 );
__m128i max2 = _mm_max_epu8( max0, max1 );
__m128i min3 = _mm_shuffle_epi32( min2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
__m128i max3 = _mm_shuffle_epi32( max2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
__m128i min4 = _mm_min_epu8( min2, min3 );
__m128i max4 = _mm_max_epu8( max2, max3 );
__m128i min5 = _mm_shuffle_epi32( min4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
__m128i max5 = _mm_shuffle_epi32( max4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
__m128i rmin = _mm_min_epu8( min4, min5 );
__m128i rmax = _mm_max_epu8( max4, max5 );
__m128i range1 = _mm_subs_epu8( rmax, rmin );
__m128i range2 = _mm_sad_epu8( rmax, rmin );
uint32_t vrange = _mm_cvtsi128_si32( range2 ) >> 1;
__m128i range = _mm_set1_epi16( DivTable[vrange] );
__m128i inset1 = _mm_srli_epi16( range1, 4 );
__m128i inset = _mm_and_si128( inset1, _mm_set1_epi8( 0xF ) );
__m128i min = _mm_adds_epu8( rmin, inset );
__m128i max = _mm_subs_epu8( rmax, inset );
__m128i c0 = _mm_subs_epu8( px0, rmin );
__m128i c1 = _mm_subs_epu8( px1, rmin );
__m128i c2 = _mm_subs_epu8( px2, rmin );
__m128i c3 = _mm_subs_epu8( px3, rmin );
__m128i is0 = _mm_maddubs_epi16( c0, _mm_set1_epi8( 1 ) );
__m128i is1 = _mm_maddubs_epi16( c1, _mm_set1_epi8( 1 ) );
__m128i is2 = _mm_maddubs_epi16( c2, _mm_set1_epi8( 1 ) );
__m128i is3 = _mm_maddubs_epi16( c3, _mm_set1_epi8( 1 ) );
__m128i s0 = _mm_hadd_epi16( is0, is1 );
__m128i s1 = _mm_hadd_epi16( is2, is3 );
__m128i m0 = _mm_mulhi_epu16( s0, range );
__m128i m1 = _mm_mulhi_epu16( s1, range );
__m128i p0 = _mm_packus_epi16( m0, m1 );
__m128i p1 = _mm_or_si128( _mm_srai_epi32( p0, 6 ), _mm_srai_epi32( p0, 12 ) );
__m128i p2 = _mm_or_si128( _mm_srai_epi32( p0, 18 ), p0 );
__m128i p3 = _mm_or_si128( p1, p2 );
__m128i p = _mm_shuffle_epi8( p3, _mm_set1_epi32( 0x0C080400 ) );
uint32_t vmin = _mm_cvtsi128_si32( min );
uint32_t vmax = _mm_cvtsi128_si32( max );
uint32_t vp = _mm_cvtsi128_si32( p );
return uint64_t( ( uint64_t( to565( vmin ) ) << 16 ) | to565( vmax ) | ( uint64_t( vp ) << 32 ) );
#elif defined __ARM_NEON
# ifdef __aarch64__
uint8x16x4_t px = vld4q_u8( src );
uint8x16_t lr = px.val[0];
uint8x16_t lg = px.val[1];
uint8x16_t lb = px.val[2];
uint8_t rmaxr = vmaxvq_u8( lr );
uint8_t rmaxg = vmaxvq_u8( lg );
uint8_t rmaxb = vmaxvq_u8( lb );
uint8_t rminr = vminvq_u8( lr );
uint8_t rming = vminvq_u8( lg );
uint8_t rminb = vminvq_u8( lb );
int rr = rmaxr - rminr;
int rg = rmaxg - rming;
int rb = rmaxb - rminb;
int vrange1 = rr + rg + rb;
uint16_t vrange2 = DivTableNEON[vrange1];
uint8_t insetr = rr >> 4;
uint8_t insetg = rg >> 4;
uint8_t insetb = rb >> 4;
uint8_t minr = rminr + insetr;
uint8_t ming = rming + insetg;
uint8_t minb = rminb + insetb;
uint8_t maxr = rmaxr - insetr;
uint8_t maxg = rmaxg - insetg;
uint8_t maxb = rmaxb - insetb;
uint8x16_t cr = vsubq_u8( lr, vdupq_n_u8( rminr ) );
uint8x16_t cg = vsubq_u8( lg, vdupq_n_u8( rming ) );
uint8x16_t cb = vsubq_u8( lb, vdupq_n_u8( rminb ) );
uint16x8_t is0l = vaddl_u8( vget_low_u8( cr ), vget_low_u8( cg ) );
uint16x8_t is0h = vaddl_u8( vget_high_u8( cr ), vget_high_u8( cg ) );
uint16x8_t is1l = vaddw_u8( is0l, vget_low_u8( cb ) );
uint16x8_t is1h = vaddw_u8( is0h, vget_high_u8( cb ) );
int16x8_t range = vdupq_n_s16( vrange2 );
uint16x8_t m0 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( is1l ), range ) );
uint16x8_t m1 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( is1h ), range ) );
uint8x8_t p00 = vmovn_u16( m0 );
uint8x8_t p01 = vmovn_u16( m1 );
uint8x16_t p0 = vcombine_u8( p00, p01 );
uint32x4_t p1 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 6 ), vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 12 ) );
uint32x4_t p2 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 18 ), vreinterpretq_u32_u8( p0 ) );
uint32x4_t p3 = vaddq_u32( p1, p2 );
uint16x4x2_t p4 = vuzp_u16( vget_low_u16( vreinterpretq_u16_u32( p3 ) ), vget_high_u16( vreinterpretq_u16_u32( p3 ) ) );
uint8x8x2_t p = vuzp_u8( vreinterpret_u8_u16( p4.val[0] ), vreinterpret_u8_u16( p4.val[0] ) );
uint32_t vp;
vst1_lane_u32( &vp, vreinterpret_u32_u8( p.val[0] ), 0 );
return uint64_t( ( uint64_t( to565( minr, ming, minb ) ) << 16 ) | to565( maxr, maxg, maxb ) | ( uint64_t( vp ) << 32 ) );
# else
uint32x4_t px0 = vld1q_u32( (uint32_t*)src );
uint32x4_t px1 = vld1q_u32( (uint32_t*)src + 4 );
uint32x4_t px2 = vld1q_u32( (uint32_t*)src + 8 );
uint32x4_t px3 = vld1q_u32( (uint32_t*)src + 12 );
uint32x4_t smask = vdupq_n_u32( 0xF8FCF8 );
uint32x4_t sd0 = vandq_u32( smask, px0 );
uint32x4_t sd1 = vandq_u32( smask, px1 );
uint32x4_t sd2 = vandq_u32( smask, px2 );
uint32x4_t sd3 = vandq_u32( smask, px3 );
uint32x4_t sc = vdupq_n_u32( sd0[0] );
uint32x4_t sc0 = vceqq_u32( sd0, sc );
uint32x4_t sc1 = vceqq_u32( sd1, sc );
uint32x4_t sc2 = vceqq_u32( sd2, sc );
uint32x4_t sc3 = vceqq_u32( sd3, sc );
uint32x4_t sm0 = vandq_u32( sc0, sc1 );
uint32x4_t sm1 = vandq_u32( sc2, sc3 );
int64x2_t sm = vreinterpretq_s64_u32( vandq_u32( sm0, sm1 ) );
if( sm[0] == -1 && sm[1] == -1 )
{
return uint64_t( to565( src[0], src[1], src[2] ) ) << 16;
}
uint32x4_t mask = vdupq_n_u32( 0xFFFFFF );
uint8x16_t l0 = vreinterpretq_u8_u32( vandq_u32( mask, px0 ) );
uint8x16_t l1 = vreinterpretq_u8_u32( vandq_u32( mask, px1 ) );
uint8x16_t l2 = vreinterpretq_u8_u32( vandq_u32( mask, px2 ) );
uint8x16_t l3 = vreinterpretq_u8_u32( vandq_u32( mask, px3 ) );
uint8x16_t min0 = vminq_u8( l0, l1 );
uint8x16_t min1 = vminq_u8( l2, l3 );
uint8x16_t min2 = vminq_u8( min0, min1 );
uint8x16_t max0 = vmaxq_u8( l0, l1 );
uint8x16_t max1 = vmaxq_u8( l2, l3 );
uint8x16_t max2 = vmaxq_u8( max0, max1 );
uint8x16_t min3 = vreinterpretq_u8_u32( vrev64q_u32( vreinterpretq_u32_u8( min2 ) ) );
uint8x16_t max3 = vreinterpretq_u8_u32( vrev64q_u32( vreinterpretq_u32_u8( max2 ) ) );
uint8x16_t min4 = vminq_u8( min2, min3 );
uint8x16_t max4 = vmaxq_u8( max2, max3 );
uint8x16_t min5 = vcombine_u8( vget_high_u8( min4 ), vget_low_u8( min4 ) );
uint8x16_t max5 = vcombine_u8( vget_high_u8( max4 ), vget_low_u8( max4 ) );
uint8x16_t rmin = vminq_u8( min4, min5 );
uint8x16_t rmax = vmaxq_u8( max4, max5 );
uint8x16_t range1 = vsubq_u8( rmax, rmin );
uint8x8_t range2 = vget_low_u8( range1 );
uint8x8x2_t range3 = vzip_u8( range2, vdup_n_u8( 0 ) );
uint16x4_t range4 = vreinterpret_u16_u8( range3.val[0] );
uint16_t vrange1;
uint16x4_t range5 = vpadd_u16( range4, range4 );
uint16x4_t range6 = vpadd_u16( range5, range5 );
vst1_lane_u16( &vrange1, range6, 0 );
uint32_t vrange2 = ( 2 << 16 ) / uint32_t( vrange1 + 1 );
uint16x8_t range = vdupq_n_u16( vrange2 );
uint8x16_t inset = vshrq_n_u8( range1, 4 );
uint8x16_t min = vaddq_u8( rmin, inset );
uint8x16_t max = vsubq_u8( rmax, inset );
uint8x16_t c0 = vsubq_u8( l0, rmin );
uint8x16_t c1 = vsubq_u8( l1, rmin );
uint8x16_t c2 = vsubq_u8( l2, rmin );
uint8x16_t c3 = vsubq_u8( l3, rmin );
uint16x8_t is0 = vpaddlq_u8( c0 );
uint16x8_t is1 = vpaddlq_u8( c1 );
uint16x8_t is2 = vpaddlq_u8( c2 );
uint16x8_t is3 = vpaddlq_u8( c3 );
uint16x4_t is4 = vpadd_u16( vget_low_u16( is0 ), vget_high_u16( is0 ) );
uint16x4_t is5 = vpadd_u16( vget_low_u16( is1 ), vget_high_u16( is1 ) );
uint16x4_t is6 = vpadd_u16( vget_low_u16( is2 ), vget_high_u16( is2 ) );
uint16x4_t is7 = vpadd_u16( vget_low_u16( is3 ), vget_high_u16( is3 ) );
uint16x8_t s0 = vcombine_u16( is4, is5 );
uint16x8_t s1 = vcombine_u16( is6, is7 );
uint16x8_t m0 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( s0 ), vreinterpretq_s16_u16( range ) ) );
uint16x8_t m1 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( s1 ), vreinterpretq_s16_u16( range ) ) );
uint8x8_t p00 = vmovn_u16( m0 );
uint8x8_t p01 = vmovn_u16( m1 );
uint8x16_t p0 = vcombine_u8( p00, p01 );
uint32x4_t p1 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 6 ), vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 12 ) );
uint32x4_t p2 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 18 ), vreinterpretq_u32_u8( p0 ) );
uint32x4_t p3 = vaddq_u32( p1, p2 );
uint16x4x2_t p4 = vuzp_u16( vget_low_u16( vreinterpretq_u16_u32( p3 ) ), vget_high_u16( vreinterpretq_u16_u32( p3 ) ) );
uint8x8x2_t p = vuzp_u8( vreinterpret_u8_u16( p4.val[0] ), vreinterpret_u8_u16( p4.val[0] ) );
uint32_t vmin, vmax, vp;
vst1q_lane_u32( &vmin, vreinterpretq_u32_u8( min ), 0 );
vst1q_lane_u32( &vmax, vreinterpretq_u32_u8( max ), 0 );
vst1_lane_u32( &vp, vreinterpret_u32_u8( p.val[0] ), 0 );
return uint64_t( ( uint64_t( to565( vmin ) ) << 16 ) | to565( vmax ) | ( uint64_t( vp ) << 32 ) );
# endif
#else
uint32_t ref;
memcpy( &ref, src, 4 );
uint32_t refMask = ref & 0xF8FCF8;
auto stmp = src + 4;
for( int i=1; i<16; i++ )
{
uint32_t px;
memcpy( &px, stmp, 4 );
if( ( px & 0xF8FCF8 ) != refMask ) break;
stmp += 4;
}
if( stmp == src + 64 )
{
return uint64_t( to565( ref ) ) << 16;
}
uint8_t min[3] = { src[0], src[1], src[2] };
uint8_t max[3] = { src[0], src[1], src[2] };
auto tmp = src + 4;
for( int i=1; i<16; i++ )
{
for( int j=0; j<3; j++ )
{
if( tmp[j] < min[j] ) min[j] = tmp[j];
else if( tmp[j] > max[j] ) max[j] = tmp[j];
}
tmp += 4;
}
const uint32_t range = DivTable[max[0] - min[0] + max[1] - min[1] + max[2] - min[2]];
const uint32_t rmin = min[0] + min[1] + min[2];
for( int i=0; i<3; i++ )
{
const uint8_t inset = ( max[i] - min[i] ) >> 4;
min[i] += inset;
max[i] -= inset;
}
uint32_t data = 0;
for( int i=0; i<16; i++ )
{
const uint32_t c = src[0] + src[1] + src[2] - rmin;
const uint8_t idx = ( c * range ) >> 16;
data |= idx << (i*2);
src += 4;
}
return uint64_t( ( uint64_t( to565( min[0], min[1], min[2] ) ) << 16 ) | to565( max[0], max[1], max[2] ) | ( uint64_t( data ) << 32 ) );
#endif
}
#ifdef __AVX2__
static tracy_force_inline void ProcessRGB_AVX( const uint8_t* src, char*& dst )
{
__m256i px0 = _mm256_loadu_si256(((__m256i*)src) + 0);
__m256i px1 = _mm256_loadu_si256(((__m256i*)src) + 1);
__m256i px2 = _mm256_loadu_si256(((__m256i*)src) + 2);
__m256i px3 = _mm256_loadu_si256(((__m256i*)src) + 3);
__m256i smask = _mm256_set1_epi32( 0xF8FCF8 );
__m256i sd0 = _mm256_and_si256( px0, smask );
__m256i sd1 = _mm256_and_si256( px1, smask );
__m256i sd2 = _mm256_and_si256( px2, smask );
__m256i sd3 = _mm256_and_si256( px3, smask );
__m256i sc = _mm256_shuffle_epi32(sd0, _MM_SHUFFLE(0, 0, 0, 0));
__m256i sc0 = _mm256_cmpeq_epi8( sd0, sc );
__m256i sc1 = _mm256_cmpeq_epi8( sd1, sc );
__m256i sc2 = _mm256_cmpeq_epi8( sd2, sc );
__m256i sc3 = _mm256_cmpeq_epi8( sd3, sc );
__m256i sm0 = _mm256_and_si256( sc0, sc1 );
__m256i sm1 = _mm256_and_si256( sc2, sc3 );
__m256i sm = _mm256_and_si256( sm0, sm1 );
const int64_t solid0 = 1 - _mm_testc_si128( _mm256_castsi256_si128( sm ), _mm_set1_epi32( -1 ) );
const int64_t solid1 = 1 - _mm_testc_si128( _mm256_extracti128_si256( sm, 1 ), _mm_set1_epi32( -1 ) );
if( solid0 + solid1 == 0 )
{
const auto c0 = uint64_t( to565( src[0], src[1], src[2] ) ) << 16;
const auto c1 = uint64_t( to565( src[16], src[17], src[18] ) ) << 16;
memcpy( dst, &c0, 8 );
memcpy( dst+8, &c1, 8 );
dst += 16;
return;
}
__m256i amask = _mm256_set1_epi32( 0xFFFFFF );
px0 = _mm256_and_si256( px0, amask );
px1 = _mm256_and_si256( px1, amask );
px2 = _mm256_and_si256( px2, amask );
px3 = _mm256_and_si256( px3, amask );
__m256i min0 = _mm256_min_epu8( px0, px1 );
__m256i min1 = _mm256_min_epu8( px2, px3 );
__m256i min2 = _mm256_min_epu8( min0, min1 );
__m256i max0 = _mm256_max_epu8( px0, px1 );
__m256i max1 = _mm256_max_epu8( px2, px3 );
__m256i max2 = _mm256_max_epu8( max0, max1 );
__m256i min3 = _mm256_shuffle_epi32( min2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
__m256i max3 = _mm256_shuffle_epi32( max2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
__m256i min4 = _mm256_min_epu8( min2, min3 );
__m256i max4 = _mm256_max_epu8( max2, max3 );
__m256i min5 = _mm256_shuffle_epi32( min4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
__m256i max5 = _mm256_shuffle_epi32( max4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
__m256i rmin = _mm256_min_epu8( min4, min5 );
__m256i rmax = _mm256_max_epu8( max4, max5 );
__m256i range1 = _mm256_subs_epu8( rmax, rmin );
__m256i range2 = _mm256_sad_epu8( rmax, rmin );
uint16_t vrange0 = DivTable[_mm256_cvtsi256_si32( range2 ) >> 1];
uint16_t vrange1 = DivTable[_mm256_extract_epi16( range2, 8 ) >> 1];
__m256i range00 = _mm256_set1_epi16( vrange0 );
__m256i range = _mm256_inserti128_si256( range00, _mm_set1_epi16( vrange1 ), 1 );
__m256i inset1 = _mm256_srli_epi16( range1, 4 );
__m256i inset = _mm256_and_si256( inset1, _mm256_set1_epi8( 0xF ) );
__m256i min = _mm256_adds_epu8( rmin, inset );
__m256i max = _mm256_subs_epu8( rmax, inset );
__m256i c0 = _mm256_subs_epu8( px0, rmin );
__m256i c1 = _mm256_subs_epu8( px1, rmin );
__m256i c2 = _mm256_subs_epu8( px2, rmin );
__m256i c3 = _mm256_subs_epu8( px3, rmin );
__m256i is0 = _mm256_maddubs_epi16( c0, _mm256_set1_epi8( 1 ) );
__m256i is1 = _mm256_maddubs_epi16( c1, _mm256_set1_epi8( 1 ) );
__m256i is2 = _mm256_maddubs_epi16( c2, _mm256_set1_epi8( 1 ) );
__m256i is3 = _mm256_maddubs_epi16( c3, _mm256_set1_epi8( 1 ) );
__m256i s0 = _mm256_hadd_epi16( is0, is1 );
__m256i s1 = _mm256_hadd_epi16( is2, is3 );
__m256i m0 = _mm256_mulhi_epu16( s0, range );
__m256i m1 = _mm256_mulhi_epu16( s1, range );
__m256i p0 = _mm256_packus_epi16( m0, m1 );
__m256i p1 = _mm256_or_si256( _mm256_srai_epi32( p0, 6 ), _mm256_srai_epi32( p0, 12 ) );
__m256i p2 = _mm256_or_si256( _mm256_srai_epi32( p0, 18 ), p0 );
__m256i p3 = _mm256_or_si256( p1, p2 );
__m256i p = _mm256_shuffle_epi8( p3, _mm256_set1_epi32( 0x0C080400 ) );
__m256i mm0 = _mm256_unpacklo_epi8( _mm256_setzero_si256(), min );
__m256i mm1 = _mm256_unpacklo_epi8( _mm256_setzero_si256(), max );
__m256i mm2 = _mm256_unpacklo_epi64( mm1, mm0 );
__m256i mmr = _mm256_slli_epi64( _mm256_srli_epi64( mm2, 11 ), 11 );
__m256i mmg = _mm256_slli_epi64( _mm256_srli_epi64( mm2, 26 ), 5 );
__m256i mmb = _mm256_srli_epi64( _mm256_slli_epi64( mm2, 16 ), 59 );
__m256i mm3 = _mm256_or_si256( mmr, mmg );
__m256i mm4 = _mm256_or_si256( mm3, mmb );
__m256i mm5 = _mm256_shuffle_epi8( mm4, _mm256_set1_epi32( 0x09080100 ) );
__m256i d0 = _mm256_unpacklo_epi32( mm5, p );
__m256i d1 = _mm256_permute4x64_epi64( d0, _MM_SHUFFLE( 3, 2, 2, 0 ) );
__m128i d2 = _mm256_castsi256_si128( d1 );
__m128i mask = _mm_set_epi64x( 0xFFFF0000 | -solid1, 0xFFFF0000 | -solid0 );
__m128i d3 = _mm_and_si128( d2, mask );
_mm_storeu_si128( (__m128i*)dst, d3 );
dst += 16;
}
#endif
void CompressImageDxt1( const char* src, char* dst, int w, int h )
{
assert( (w % 4) == 0 && (h % 4) == 0 );
#ifdef __AVX2__
if( w%8 == 0 )
{
uint32_t buf[8*4];
int i = 0;
auto blocks = w * h / 32;
do
{
auto tmp = (char*)buf;
memcpy( tmp, src, 8*4 );
memcpy( tmp + 8*4, src + w * 4, 8*4 );
memcpy( tmp + 16*4, src + w * 8, 8*4 );
memcpy( tmp + 24*4, src + w * 12, 8*4 );
src += 8*4;
if( ++i == w/8 )
{
src += w * 3 * 4;
i = 0;
}
ProcessRGB_AVX( (uint8_t*)buf, dst );
}
while( --blocks );
}
else
#endif
{
uint32_t buf[4*4];
int i = 0;
auto ptr = dst;
auto blocks = w * h / 16;
do
{
auto tmp = (char*)buf;
memcpy( tmp, src, 4*4 );
memcpy( tmp + 4*4, src + w * 4, 4*4 );
memcpy( tmp + 8*4, src + w * 8, 4*4 );
memcpy( tmp + 12*4, src + w * 12, 4*4 );
src += 4*4;
if( ++i == w/4 )
{
src += w * 3 * 4;
i = 0;
}
const auto c = ProcessRGB( (uint8_t*)buf );
memcpy( ptr, &c, sizeof( uint64_t ) );
ptr += sizeof( uint64_t );
}
while( --blocks );
}
}
}
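
Each 4x4 pixel block compresses to one 64-bit value. A sketch of splitting such a block back into its fields, mirroring the packing in ProcessRGB above (the endpoint/selector semantics are the encoder's own convention):

#include <stdint.h>

struct Dxt1Block
{
    uint16_t endpoint0; // bits 0-15:  to565 of the inset max color
    uint16_t endpoint1; // bits 16-31: to565 of the inset min color
    uint32_t selectors; // bits 32-63: sixteen 2-bit per-pixel indices
};

static Dxt1Block UnpackBlock( uint64_t block )
{
    return { uint16_t( block ), uint16_t( block >> 16 ), uint32_t( block >> 32 ) };
}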


@@ -0,0 +1,11 @@
#ifndef __TRACYDXT1_HPP__
#define __TRACYDXT1_HPP__
namespace tracy
{
void CompressImageDxt1( const char* src, char* dst, int w, int h );
}
#endif


@@ -0,0 +1,118 @@
#ifndef __TRACYFASTVECTOR_HPP__
#define __TRACYFASTVECTOR_HPP__
#include <assert.h>
#include <stddef.h>
#include <string.h>
#include "../common/TracyAlloc.hpp"
#include "../common/TracyForceInline.hpp"
namespace tracy
{
template<typename T>
class FastVector
{
public:
using iterator = T*;
using const_iterator = const T*;
FastVector( size_t capacity )
: m_ptr( (T*)tracy_malloc( sizeof( T ) * capacity ) )
, m_write( m_ptr )
, m_end( m_ptr + capacity )
{
assert( capacity != 0 );
}
FastVector( const FastVector& ) = delete;
FastVector( FastVector&& ) = delete;
~FastVector()
{
tracy_free( m_ptr );
}
FastVector& operator=( const FastVector& ) = delete;
FastVector& operator=( FastVector&& ) = delete;
bool empty() const { return m_ptr == m_write; }
size_t size() const { return m_write - m_ptr; }
T* data() { return m_ptr; }
const T* data() const { return m_ptr; }
T* begin() { return m_ptr; }
const T* begin() const { return m_ptr; }
T* end() { return m_write; }
const T* end() const { return m_write; }
T& front() { assert( !empty() ); return m_ptr[0]; }
const T& front() const { assert( !empty() ); return m_ptr[0]; }
T& back() { assert( !empty() ); return m_write[-1]; }
const T& back() const { assert( !empty() ); return m_write[-1]; }
T& operator[]( size_t idx ) { return m_ptr[idx]; }
const T& operator[]( size_t idx ) const { return m_ptr[idx]; }
T* push_next()
{
if( m_write == m_end ) AllocMore();
return m_write++;
}
T* prepare_next()
{
if( m_write == m_end ) AllocMore();
return m_write;
}
void commit_next()
{
m_write++;
}
void clear()
{
m_write = m_ptr;
}
void swap( FastVector& vec )
{
const auto ptr1 = m_ptr;
const auto ptr2 = vec.m_ptr;
const auto write1 = m_write;
const auto write2 = vec.m_write;
const auto end1 = m_end;
const auto end2 = vec.m_end;
m_ptr = ptr2;
vec.m_ptr = ptr1;
m_write = write2;
vec.m_write = write1;
m_end = end2;
vec.m_end = end1;
}
private:
tracy_no_inline void AllocMore()
{
const auto cap = size_t( m_end - m_ptr ) * 2;
const auto size = size_t( m_write - m_ptr );
T* ptr = (T*)tracy_malloc( sizeof( T ) * cap );
memcpy( ptr, m_ptr, size * sizeof( T ) );
tracy_free_fast( m_ptr );
m_ptr = ptr;
m_write = m_ptr + size;
m_end = m_ptr + cap;
}
T* m_ptr;
T* m_write;
T* m_end;
};
}
#endif
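
A usage sketch: FastVector never runs element constructors or destructors (storage is raw tracy_malloc memory), so it is intended for trivially-copyable payloads. push_next() hands out the next slot to fill in place, while prepare_next()/commit_next() split that into reserve and publish steps:

tracy::FastVector<int> vec( 64 ); // initial capacity, must be non-zero

*vec.push_next() = 42;            // grows (capacity doubles) when full

int* slot = vec.prepare_next();   // reserved, not yet counted by size()
*slot = 7;
vec.commit_next();                // published; size() is now 2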


@@ -0,0 +1,546 @@
#ifndef __TRACYLOCK_HPP__
#define __TRACYLOCK_HPP__
#include <atomic>
#include <limits>
#include "../common/TracySystem.hpp"
#include "../common/TracyAlign.hpp"
#include "TracyProfiler.hpp"
namespace tracy
{
class LockableCtx
{
public:
tracy_force_inline LockableCtx( const SourceLocationData* srcloc )
: m_id( GetLockCounter().fetch_add( 1, std::memory_order_relaxed ) )
#ifdef TRACY_ON_DEMAND
, m_lockCount( 0 )
, m_active( false )
#endif
{
assert( m_id != (std::numeric_limits<uint32_t>::max)() );
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::LockAnnounce );
MemWrite( &item->lockAnnounce.id, m_id );
MemWrite( &item->lockAnnounce.time, Profiler::GetTime() );
MemWrite( &item->lockAnnounce.lckloc, (uint64_t)srcloc );
MemWrite( &item->lockAnnounce.type, LockType::Lockable );
#ifdef TRACY_ON_DEMAND
GetProfiler().DeferItem( *item );
#endif
Profiler::QueueSerialFinish();
}
LockableCtx( const LockableCtx& ) = delete;
LockableCtx& operator=( const LockableCtx& ) = delete;
tracy_force_inline ~LockableCtx()
{
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::LockTerminate );
MemWrite( &item->lockTerminate.id, m_id );
MemWrite( &item->lockTerminate.time, Profiler::GetTime() );
#ifdef TRACY_ON_DEMAND
GetProfiler().DeferItem( *item );
#endif
Profiler::QueueSerialFinish();
}
tracy_force_inline bool BeforeLock()
{
#ifdef TRACY_ON_DEMAND
bool queue = false;
const auto locks = m_lockCount.fetch_add( 1, std::memory_order_relaxed );
const auto active = m_active.load( std::memory_order_relaxed );
if( locks == 0 || active )
{
const bool connected = GetProfiler().IsConnected();
if( active != connected ) m_active.store( connected, std::memory_order_relaxed );
if( connected ) queue = true;
}
if( !queue ) return false;
#endif
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::LockWait );
MemWrite( &item->lockWait.thread, GetThreadHandle() );
MemWrite( &item->lockWait.id, m_id );
MemWrite( &item->lockWait.time, Profiler::GetTime() );
Profiler::QueueSerialFinish();
return true;
}
tracy_force_inline void AfterLock()
{
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::LockObtain );
MemWrite( &item->lockObtain.thread, GetThreadHandle() );
MemWrite( &item->lockObtain.id, m_id );
MemWrite( &item->lockObtain.time, Profiler::GetTime() );
Profiler::QueueSerialFinish();
}
tracy_force_inline void AfterUnlock()
{
#ifdef TRACY_ON_DEMAND
m_lockCount.fetch_sub( 1, std::memory_order_relaxed );
if( !m_active.load( std::memory_order_relaxed ) ) return;
if( !GetProfiler().IsConnected() )
{
m_active.store( false, std::memory_order_relaxed );
return;
}
#endif
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::LockRelease );
MemWrite( &item->lockRelease.id, m_id );
MemWrite( &item->lockRelease.time, Profiler::GetTime() );
Profiler::QueueSerialFinish();
}
tracy_force_inline void AfterTryLock( bool acquired )
{
#ifdef TRACY_ON_DEMAND
if( !acquired ) return;
bool queue = false;
const auto locks = m_lockCount.fetch_add( 1, std::memory_order_relaxed );
const auto active = m_active.load( std::memory_order_relaxed );
if( locks == 0 || active )
{
const bool connected = GetProfiler().IsConnected();
if( active != connected ) m_active.store( connected, std::memory_order_relaxed );
if( connected ) queue = true;
}
if( !queue ) return;
#endif
if( acquired )
{
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::LockObtain );
MemWrite( &item->lockObtain.thread, GetThreadHandle() );
MemWrite( &item->lockObtain.id, m_id );
MemWrite( &item->lockObtain.time, Profiler::GetTime() );
Profiler::QueueSerialFinish();
}
}
tracy_force_inline void Mark( const SourceLocationData* srcloc )
{
#ifdef TRACY_ON_DEMAND
const auto active = m_active.load( std::memory_order_relaxed );
if( !active ) return;
const auto connected = GetProfiler().IsConnected();
if( !connected )
{
if( active ) m_active.store( false, std::memory_order_relaxed );
return;
}
#endif
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::LockMark );
MemWrite( &item->lockMark.thread, GetThreadHandle() );
MemWrite( &item->lockMark.id, m_id );
MemWrite( &item->lockMark.srcloc, (uint64_t)srcloc );
Profiler::QueueSerialFinish();
}
tracy_force_inline void CustomName( const char* name, size_t size )
{
assert( size < (std::numeric_limits<uint16_t>::max)() );
auto ptr = (char*)tracy_malloc( size );
memcpy( ptr, name, size );
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::LockName );
MemWrite( &item->lockNameFat.id, m_id );
MemWrite( &item->lockNameFat.name, (uint64_t)ptr );
MemWrite( &item->lockNameFat.size, (uint16_t)size );
#ifdef TRACY_ON_DEMAND
GetProfiler().DeferItem( *item );
#endif
Profiler::QueueSerialFinish();
}
private:
uint32_t m_id;
#ifdef TRACY_ON_DEMAND
std::atomic<uint32_t> m_lockCount;
std::atomic<bool> m_active;
#endif
};
template<class T>
class Lockable
{
public:
tracy_force_inline Lockable( const SourceLocationData* srcloc )
: m_ctx( srcloc )
{
}
Lockable( const Lockable& ) = delete;
Lockable& operator=( const Lockable& ) = delete;
tracy_force_inline void lock()
{
const auto runAfter = m_ctx.BeforeLock();
m_lockable.lock();
if( runAfter ) m_ctx.AfterLock();
}
tracy_force_inline void unlock()
{
m_lockable.unlock();
m_ctx.AfterUnlock();
}
tracy_force_inline bool try_lock()
{
const auto acquired = m_lockable.try_lock();
m_ctx.AfterTryLock( acquired );
return acquired;
}
tracy_force_inline void Mark( const SourceLocationData* srcloc )
{
m_ctx.Mark( srcloc );
}
tracy_force_inline void CustomName( const char* name, size_t size )
{
m_ctx.CustomName( name, size );
}
private:
T m_lockable;
LockableCtx m_ctx;
};
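// Usage sketch (hypothetical; in real code the TracyLockable macro generates
// the source-location plumbing): wrapping a std::mutex reports wait/obtain/
// release events to the profiler around the real lock operations.
//
//   static const tracy::SourceLocationData srcloc { nullptr, "g_mutex", __FILE__, __LINE__, 0 };
//   tracy::Lockable<std::mutex> g_mutex( &srcloc );
//
//   g_mutex.lock();     // emits LockWait, then LockObtain
//   g_mutex.unlock();   // emits LockRelease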
class SharedLockableCtx
{
public:
tracy_force_inline SharedLockableCtx( const SourceLocationData* srcloc )
: m_id( GetLockCounter().fetch_add( 1, std::memory_order_relaxed ) )
#ifdef TRACY_ON_DEMAND
, m_lockCount( 0 )
, m_active( false )
#endif
{
assert( m_id != (std::numeric_limits<uint32_t>::max)() );
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::LockAnnounce );
MemWrite( &item->lockAnnounce.id, m_id );
MemWrite( &item->lockAnnounce.time, Profiler::GetTime() );
MemWrite( &item->lockAnnounce.lckloc, (uint64_t)srcloc );
MemWrite( &item->lockAnnounce.type, LockType::SharedLockable );
#ifdef TRACY_ON_DEMAND
GetProfiler().DeferItem( *item );
#endif
Profiler::QueueSerialFinish();
}
SharedLockableCtx( const SharedLockableCtx& ) = delete;
SharedLockableCtx& operator=( const SharedLockableCtx& ) = delete;
tracy_force_inline ~SharedLockableCtx()
{
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::LockTerminate );
MemWrite( &item->lockTerminate.id, m_id );
MemWrite( &item->lockTerminate.time, Profiler::GetTime() );
#ifdef TRACY_ON_DEMAND
GetProfiler().DeferItem( *item );
#endif
Profiler::QueueSerialFinish();
}
tracy_force_inline bool BeforeLock()
{
#ifdef TRACY_ON_DEMAND
bool queue = false;
const auto locks = m_lockCount.fetch_add( 1, std::memory_order_relaxed );
const auto active = m_active.load( std::memory_order_relaxed );
if( locks == 0 || active )
{
const bool connected = GetProfiler().IsConnected();
if( active != connected ) m_active.store( connected, std::memory_order_relaxed );
if( connected ) queue = true;
}
if( !queue ) return false;
#endif
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::LockWait );
MemWrite( &item->lockWait.thread, GetThreadHandle() );
MemWrite( &item->lockWait.id, m_id );
MemWrite( &item->lockWait.time, Profiler::GetTime() );
Profiler::QueueSerialFinish();
return true;
}
tracy_force_inline void AfterLock()
{
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::LockObtain );
MemWrite( &item->lockObtain.thread, GetThreadHandle() );
MemWrite( &item->lockObtain.id, m_id );
MemWrite( &item->lockObtain.time, Profiler::GetTime() );
Profiler::QueueSerialFinish();
}
tracy_force_inline void AfterUnlock()
{
#ifdef TRACY_ON_DEMAND
m_lockCount.fetch_sub( 1, std::memory_order_relaxed );
if( !m_active.load( std::memory_order_relaxed ) ) return;
if( !GetProfiler().IsConnected() )
{
m_active.store( false, std::memory_order_relaxed );
return;
}
#endif
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::LockRelease );
MemWrite( &item->lockRelease.id, m_id );
MemWrite( &item->lockRelease.time, Profiler::GetTime() );
Profiler::QueueSerialFinish();
}
tracy_force_inline void AfterTryLock( bool acquired )
{
#ifdef TRACY_ON_DEMAND
if( !acquired ) return;
bool queue = false;
const auto locks = m_lockCount.fetch_add( 1, std::memory_order_relaxed );
const auto active = m_active.load( std::memory_order_relaxed );
if( locks == 0 || active )
{
const bool connected = GetProfiler().IsConnected();
if( active != connected ) m_active.store( connected, std::memory_order_relaxed );
if( connected ) queue = true;
}
if( !queue ) return;
#endif
if( acquired )
{
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::LockObtain );
MemWrite( &item->lockObtain.thread, GetThreadHandle() );
MemWrite( &item->lockObtain.id, m_id );
MemWrite( &item->lockObtain.time, Profiler::GetTime() );
Profiler::QueueSerialFinish();
}
}
tracy_force_inline bool BeforeLockShared()
{
#ifdef TRACY_ON_DEMAND
bool queue = false;
const auto locks = m_lockCount.fetch_add( 1, std::memory_order_relaxed );
const auto active = m_active.load( std::memory_order_relaxed );
if( locks == 0 || active )
{
const bool connected = GetProfiler().IsConnected();
if( active != connected ) m_active.store( connected, std::memory_order_relaxed );
if( connected ) queue = true;
}
if( !queue ) return false;
#endif
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::LockSharedWait );
MemWrite( &item->lockWait.thread, GetThreadHandle() );
MemWrite( &item->lockWait.id, m_id );
MemWrite( &item->lockWait.time, Profiler::GetTime() );
Profiler::QueueSerialFinish();
return true;
}
tracy_force_inline void AfterLockShared()
{
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::LockSharedObtain );
MemWrite( &item->lockObtain.thread, GetThreadHandle() );
MemWrite( &item->lockObtain.id, m_id );
MemWrite( &item->lockObtain.time, Profiler::GetTime() );
Profiler::QueueSerialFinish();
}
tracy_force_inline void AfterUnlockShared()
{
#ifdef TRACY_ON_DEMAND
m_lockCount.fetch_sub( 1, std::memory_order_relaxed );
if( !m_active.load( std::memory_order_relaxed ) ) return;
if( !GetProfiler().IsConnected() )
{
m_active.store( false, std::memory_order_relaxed );
return;
}
#endif
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::LockSharedRelease );
MemWrite( &item->lockReleaseShared.thread, GetThreadHandle() );
MemWrite( &item->lockReleaseShared.id, m_id );
MemWrite( &item->lockReleaseShared.time, Profiler::GetTime() );
Profiler::QueueSerialFinish();
}
tracy_force_inline void AfterTryLockShared( bool acquired )
{
#ifdef TRACY_ON_DEMAND
if( !acquired ) return;
bool queue = false;
const auto locks = m_lockCount.fetch_add( 1, std::memory_order_relaxed );
const auto active = m_active.load( std::memory_order_relaxed );
if( locks == 0 || active )
{
const bool connected = GetProfiler().IsConnected();
if( active != connected ) m_active.store( connected, std::memory_order_relaxed );
if( connected ) queue = true;
}
if( !queue ) return;
#endif
if( acquired )
{
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::LockSharedObtain );
MemWrite( &item->lockObtain.thread, GetThreadHandle() );
MemWrite( &item->lockObtain.id, m_id );
MemWrite( &item->lockObtain.time, Profiler::GetTime() );
Profiler::QueueSerialFinish();
}
}
tracy_force_inline void Mark( const SourceLocationData* srcloc )
{
#ifdef TRACY_ON_DEMAND
const auto active = m_active.load( std::memory_order_relaxed );
if( !active ) return;
const auto connected = GetProfiler().IsConnected();
if( !connected )
{
if( active ) m_active.store( false, std::memory_order_relaxed );
return;
}
#endif
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::LockMark );
MemWrite( &item->lockMark.thread, GetThreadHandle() );
MemWrite( &item->lockMark.id, m_id );
MemWrite( &item->lockMark.srcloc, (uint64_t)srcloc );
Profiler::QueueSerialFinish();
}
tracy_force_inline void CustomName( const char* name, size_t size )
{
assert( size < (std::numeric_limits<uint16_t>::max)() );
auto ptr = (char*)tracy_malloc( size );
memcpy( ptr, name, size );
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::LockName );
MemWrite( &item->lockNameFat.id, m_id );
MemWrite( &item->lockNameFat.name, (uint64_t)ptr );
MemWrite( &item->lockNameFat.size, (uint16_t)size );
#ifdef TRACY_ON_DEMAND
GetProfiler().DeferItem( *item );
#endif
Profiler::QueueSerialFinish();
}
private:
uint32_t m_id;
#ifdef TRACY_ON_DEMAND
std::atomic<uint32_t> m_lockCount;
std::atomic<bool> m_active;
#endif
};
template<class T>
class SharedLockable
{
public:
tracy_force_inline SharedLockable( const SourceLocationData* srcloc )
: m_ctx( srcloc )
{
}
SharedLockable( const SharedLockable& ) = delete;
SharedLockable& operator=( const SharedLockable& ) = delete;
tracy_force_inline void lock()
{
const auto runAfter = m_ctx.BeforeLock();
m_lockable.lock();
if( runAfter ) m_ctx.AfterLock();
}
tracy_force_inline void unlock()
{
m_lockable.unlock();
m_ctx.AfterUnlock();
}
tracy_force_inline bool try_lock()
{
const auto acquired = m_lockable.try_lock();
m_ctx.AfterTryLock( acquired );
return acquired;
}
tracy_force_inline void lock_shared()
{
const auto runAfter = m_ctx.BeforeLockShared();
m_lockable.lock_shared();
if( runAfter ) m_ctx.AfterLockShared();
}
tracy_force_inline void unlock_shared()
{
m_lockable.unlock_shared();
m_ctx.AfterUnlockShared();
}
tracy_force_inline bool try_lock_shared()
{
const auto acquired = m_lockable.try_lock_shared();
m_ctx.AfterTryLockShared( acquired );
return acquired;
}
tracy_force_inline void Mark( const SourceLocationData* srcloc )
{
m_ctx.Mark( srcloc );
}
tracy_force_inline void CustomName( const char* name, size_t size )
{
m_ctx.CustomName( name, size );
}
private:
T m_lockable;
SharedLockableCtx m_ctx;
};
}
#endif
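
A minimal usage sketch of the wrappers above, assuming the field layout of tracy::SourceLocationData ( name, function, file, line, color ); real client code would typically go through the TracyLockable / TracySharedLockable convenience macros, which generate an equivalent source-location literal:

#include <shared_mutex>

static const tracy::SourceLocationData dataLockSrcLoc { "g_dataLock", "(global)", __FILE__, 0, 0 };
static tracy::SharedLockable<std::shared_mutex> g_dataLock( &dataLockSrcLoc );

void Reader()
{
    g_dataLock.lock_shared();   // emits LockSharedWait, then LockSharedObtain
    // ... read shared state ...
    g_dataLock.unlock_shared(); // emits LockSharedRelease
}

void Writer()
{
    g_dataLock.lock();          // emits LockWait, then LockObtain
    // ... mutate shared state ...
    g_dataLock.unlock();        // emits LockRelease
}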

View File

@@ -0,0 +1,26 @@
#ifdef TRACY_ENABLE
# ifdef __linux__
# include "TracyDebug.hpp"
# ifdef TRACY_VERBOSE
# include <dlfcn.h>
# include <link.h>
# endif
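// Tracy overrides dlclose as a no-op so shared objects stay mapped for the
// lifetime of the process; this keeps symbol and callstack resolution working
// for addresses inside objects the application would otherwise have unloaded.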
extern "C" int dlclose( void* hnd )
{
#ifdef TRACY_VERBOSE
struct link_map* lm;
if( dlinfo( hnd, RTLD_DI_LINKMAP, &lm ) == 0 )
{
TracyDebug( "Overriding dlclose for %s\n", lm->l_name );
}
else
{
TracyDebug( "Overriding dlclose for unknown object (%s)\n", dlerror() );
}
#endif
return 0;
}
# endif
#endif

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,141 @@
#include <atomic>
#include <assert.h>
#include <errno.h>
#include <linux/perf_event.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include "TracyDebug.hpp"
namespace tracy
{
class RingBuffer
{
public:
RingBuffer( unsigned int size, int fd, int id, int cpu = -1 )
: m_size( size )
, m_id( id )
, m_cpu( cpu )
, m_fd( fd )
{
const auto pageSize = uint32_t( getpagesize() );
assert( size >= pageSize );
assert( __builtin_popcount( size ) == 1 );
m_mapSize = size + pageSize;
auto mapAddr = mmap( nullptr, m_mapSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0 );
if( mapAddr == MAP_FAILED )
{
TracyDebug( "mmap failed: errno %i (%s)\n", errno, strerror( errno ) );
m_fd = 0;
m_metadata = nullptr;
close( fd );
return;
}
m_metadata = (perf_event_mmap_page*)mapAddr;
assert( m_metadata->data_offset == pageSize );
m_buffer = ((char*)mapAddr) + pageSize;
m_tail = m_metadata->data_tail;
}
~RingBuffer()
{
if( m_metadata ) munmap( m_metadata, m_mapSize );
if( m_fd ) close( m_fd );
}
RingBuffer( const RingBuffer& ) = delete;
RingBuffer& operator=( const RingBuffer& ) = delete;
RingBuffer( RingBuffer&& other )
{
// Take ownership of the source buffer's mapping and fd, then clear them in
// the source object so its destructor does not unmap or close them.
memcpy( (char*)this, (char*)&other, sizeof( RingBuffer ) );
other.m_metadata = nullptr;
other.m_fd = 0;
}
RingBuffer& operator=( RingBuffer&& other )
{
memcpy( (char*)this, (char*)&other, sizeof( RingBuffer ) );
other.m_metadata = nullptr;
other.m_fd = 0;
return *this;
}
bool IsValid() const { return m_metadata != nullptr; }
int GetId() const { return m_id; }
int GetCpu() const { return m_cpu; }
void Enable()
{
ioctl( m_fd, PERF_EVENT_IOC_ENABLE, 0 );
}
void Read( void* dst, uint64_t offset, uint64_t cnt )
{
const auto size = m_size;
auto src = ( m_tail + offset ) % size;
if( src + cnt <= size )
{
memcpy( dst, m_buffer + src, cnt );
}
else
{
const auto s0 = size - src;
const auto buf = m_buffer;
memcpy( dst, buf + src, s0 );
memcpy( (char*)dst + s0, buf, cnt - s0 );
}
}
void Advance( uint64_t cnt )
{
m_tail += cnt;
StoreTail();
}
bool CheckTscCaps() const
{
return m_metadata->cap_user_time_zero;
}
int64_t ConvertTimeToTsc( int64_t timestamp ) const
{
if( !m_metadata->cap_user_time_zero ) return 0;
const auto time = timestamp - m_metadata->time_zero;
const auto quot = time / m_metadata->time_mult;
const auto rem = time % m_metadata->time_mult;
return ( quot << m_metadata->time_shift ) + ( rem << m_metadata->time_shift ) / m_metadata->time_mult;
}
uint64_t LoadHead() const
{
return std::atomic_load_explicit( (const volatile std::atomic<uint64_t>*)&m_metadata->data_head, std::memory_order_acquire );
}
uint64_t GetTail() const
{
return m_tail;
}
private:
void StoreTail()
{
std::atomic_store_explicit( (volatile std::atomic<uint64_t>*)&m_metadata->data_tail, m_tail, std::memory_order_release );
}
unsigned int m_size;
uint64_t m_tail;
char* m_buffer;
int m_id;
int m_cpu;
perf_event_mmap_page* m_metadata;
size_t m_mapSize;
int m_fd;
};
}
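
A consumer-side sketch of the head/tail protocol implemented above, assuming rb wraps a valid perf_event descriptor; each record starts with a perf_event_header whose size field covers the header plus its payload:

static void Drain( tracy::RingBuffer& rb )
{
    const auto head = rb.LoadHead();
    while( rb.GetTail() != head )
    {
        perf_event_header hdr;
        rb.Read( &hdr, 0, sizeof( hdr ) );
        // The payload could be copied out with rb.Read( dst, sizeof( hdr ), hdr.size - sizeof( hdr ) ).
        rb.Advance( hdr.size );    // publish the new tail with release ordering
    }
}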

View File

@@ -0,0 +1,175 @@
#ifndef __TRACYSCOPED_HPP__
#define __TRACYSCOPED_HPP__
#include <limits>
#include <stdint.h>
#include <string.h>
#include "../common/TracySystem.hpp"
#include "../common/TracyAlign.hpp"
#include "../common/TracyAlloc.hpp"
#include "TracyProfiler.hpp"
namespace tracy
{
class ScopedZone
{
public:
ScopedZone( const ScopedZone& ) = delete;
ScopedZone( ScopedZone&& ) = delete;
ScopedZone& operator=( const ScopedZone& ) = delete;
ScopedZone& operator=( ScopedZone&& ) = delete;
tracy_force_inline ScopedZone( const SourceLocationData* srcloc, bool is_active = true )
#ifdef TRACY_ON_DEMAND
: m_active( is_active && GetProfiler().IsConnected() )
#else
: m_active( is_active )
#endif
{
if( !m_active ) return;
#ifdef TRACY_ON_DEMAND
m_connectionId = GetProfiler().ConnectionId();
#endif
TracyQueuePrepare( QueueType::ZoneBegin );
MemWrite( &item->zoneBegin.time, Profiler::GetTime() );
MemWrite( &item->zoneBegin.srcloc, (uint64_t)srcloc );
TracyQueueCommit( zoneBeginThread );
}
tracy_force_inline ScopedZone( const SourceLocationData* srcloc, int depth, bool is_active = true )
#ifdef TRACY_ON_DEMAND
: m_active( is_active && GetProfiler().IsConnected() )
#else
: m_active( is_active )
#endif
{
if( !m_active ) return;
#ifdef TRACY_ON_DEMAND
m_connectionId = GetProfiler().ConnectionId();
#endif
GetProfiler().SendCallstack( depth );
TracyQueuePrepare( QueueType::ZoneBeginCallstack );
MemWrite( &item->zoneBegin.time, Profiler::GetTime() );
MemWrite( &item->zoneBegin.srcloc, (uint64_t)srcloc );
TracyQueueCommit( zoneBeginThread );
}
tracy_force_inline ScopedZone( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, bool is_active = true )
#ifdef TRACY_ON_DEMAND
: m_active( is_active && GetProfiler().IsConnected() )
#else
: m_active( is_active )
#endif
{
if( !m_active ) return;
#ifdef TRACY_ON_DEMAND
m_connectionId = GetProfiler().ConnectionId();
#endif
TracyQueuePrepare( QueueType::ZoneBeginAllocSrcLoc );
const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz );
MemWrite( &item->zoneBegin.time, Profiler::GetTime() );
MemWrite( &item->zoneBegin.srcloc, srcloc );
TracyQueueCommit( zoneBeginThread );
}
tracy_force_inline ScopedZone( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, int depth, bool is_active = true )
#ifdef TRACY_ON_DEMAND
: m_active( is_active && GetProfiler().IsConnected() )
#else
: m_active( is_active )
#endif
{
if( !m_active ) return;
#ifdef TRACY_ON_DEMAND
m_connectionId = GetProfiler().ConnectionId();
#endif
GetProfiler().SendCallstack( depth );
TracyQueuePrepare( QueueType::ZoneBeginAllocSrcLocCallstack );
const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz );
MemWrite( &item->zoneBegin.time, Profiler::GetTime() );
MemWrite( &item->zoneBegin.srcloc, srcloc );
TracyQueueCommit( zoneBeginThread );
}
tracy_force_inline ~ScopedZone()
{
if( !m_active ) return;
#ifdef TRACY_ON_DEMAND
if( GetProfiler().ConnectionId() != m_connectionId ) return;
#endif
TracyQueuePrepare( QueueType::ZoneEnd );
MemWrite( &item->zoneEnd.time, Profiler::GetTime() );
TracyQueueCommit( zoneEndThread );
}
tracy_force_inline void Text( const char* txt, size_t size )
{
assert( size < (std::numeric_limits<uint16_t>::max)() );
if( !m_active ) return;
#ifdef TRACY_ON_DEMAND
if( GetProfiler().ConnectionId() != m_connectionId ) return;
#endif
auto ptr = (char*)tracy_malloc( size );
memcpy( ptr, txt, size );
TracyQueuePrepare( QueueType::ZoneText );
MemWrite( &item->zoneTextFat.text, (uint64_t)ptr );
MemWrite( &item->zoneTextFat.size, (uint16_t)size );
TracyQueueCommit( zoneTextFatThread );
}
tracy_force_inline void Name( const char* txt, size_t size )
{
assert( size < (std::numeric_limits<uint16_t>::max)() );
if( !m_active ) return;
#ifdef TRACY_ON_DEMAND
if( GetProfiler().ConnectionId() != m_connectionId ) return;
#endif
auto ptr = (char*)tracy_malloc( size );
memcpy( ptr, txt, size );
TracyQueuePrepare( QueueType::ZoneName );
MemWrite( &item->zoneTextFat.text, (uint64_t)ptr );
MemWrite( &item->zoneTextFat.size, (uint16_t)size );
TracyQueueCommit( zoneTextFatThread );
}
tracy_force_inline void Color( uint32_t color )
{
if( !m_active ) return;
#ifdef TRACY_ON_DEMAND
if( GetProfiler().ConnectionId() != m_connectionId ) return;
#endif
TracyQueuePrepare( QueueType::ZoneColor );
MemWrite( &item->zoneColor.b, uint8_t( ( color ) & 0xFF ) );
MemWrite( &item->zoneColor.g, uint8_t( ( color >> 8 ) & 0xFF ) );
MemWrite( &item->zoneColor.r, uint8_t( ( color >> 16 ) & 0xFF ) );
TracyQueueCommit( zoneColorThread );
}
tracy_force_inline void Value( uint64_t value )
{
if( !m_active ) return;
#ifdef TRACY_ON_DEMAND
if( GetProfiler().ConnectionId() != m_connectionId ) return;
#endif
TracyQueuePrepare( QueueType::ZoneValue );
MemWrite( &item->zoneValue.value, value );
TracyQueueCommit( zoneValueThread );
}
tracy_force_inline bool IsActive() const { return m_active; }
private:
const bool m_active;
#ifdef TRACY_ON_DEMAND
uint64_t m_connectionId = 0;
#endif
};
}
#endif
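
A sketch of roughly what the ZoneScoped-style client macros expand to, again assuming the tracy::SourceLocationData layout; the zone opens in the constructor and closes in the destructor:

static void Update()
{
    static const tracy::SourceLocationData srcloc { nullptr, "Update", __FILE__, (uint32_t)__LINE__, 0 };
    tracy::ScopedZone zone( &srcloc );
    zone.Text( "frame", 5 );    // attach auxiliary text to this zone
    // ... work measured by the zone ...
}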

View File

@@ -0,0 +1,41 @@
#ifndef __TRACYSTRINGHELPERS_HPP__
#define __TRACYSTRINGHELPERS_HPP__
#include <assert.h>
#include <string.h>
#include "../common/TracyAlloc.hpp"
#include "../common/TracyForceInline.hpp"
namespace tracy
{
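// Note: the returned buffers are allocated with tracy_malloc / tracy_malloc_fast
// and must be released by the caller with the matching tracy_free variant.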
static tracy_force_inline char* CopyString( const char* src, size_t sz )
{
auto dst = (char*)tracy_malloc( sz + 1 );
memcpy( dst, src, sz );
dst[sz] = '\0';
return dst;
}
static tracy_force_inline char* CopyString( const char* src )
{
return CopyString( src, strlen( src ) );
}
static tracy_force_inline char* CopyStringFast( const char* src, size_t sz )
{
auto dst = (char*)tracy_malloc_fast( sz + 1 );
memcpy( dst, src, sz );
dst[sz] = '\0';
return dst;
}
static tracy_force_inline char* CopyStringFast( const char* src )
{
return CopyStringFast( src, strlen( src ) );
}
}
#endif

View File

@@ -0,0 +1,164 @@
#include "TracySysPower.hpp"
#ifdef TRACY_HAS_SYSPOWER
#include <sys/types.h>
#include <dirent.h>
#include <chrono>
#include <inttypes.h>
#include <stdio.h>
#include <string.h>
#include "TracyDebug.hpp"
#include "TracyProfiler.hpp"
#include "../common/TracyAlloc.hpp"
namespace tracy
{
SysPower::SysPower()
: m_domains( 4 )
, m_lastTime( 0 )
{
ScanDirectory( "/sys/devices/virtual/powercap/intel-rapl", -1 );
}
SysPower::~SysPower()
{
for( auto& v : m_domains )
{
fclose( v.handle );
// Do not release v.name, as it may be still needed
}
}
void SysPower::Tick()
{
auto t = std::chrono::high_resolution_clock::now().time_since_epoch().count();
if( t - m_lastTime > 10000000 ) // 10 ms
{
m_lastTime = t;
for( auto& v : m_domains )
{
char tmp[32];
if( fread( tmp, 1, 32, v.handle ) > 0 )
{
rewind( v.handle );
auto p = (uint64_t)atoll( tmp );
uint64_t delta;
if( p >= v.value )
{
delta = p - v.value;
}
else
{
delta = v.overflow - v.value + p;
}
v.value = p;
TracyLfqPrepare( QueueType::SysPowerReport );
MemWrite( &item->sysPower.time, Profiler::GetTime() );
MemWrite( &item->sysPower.delta, delta );
MemWrite( &item->sysPower.name, (uint64_t)v.name );
TracyLfqCommit;
}
}
}
}
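// ScanDirectory() walks the RAPL sysfs tree, e.g.
// /sys/devices/virtual/powercap/intel-rapl/intel-rapl:0 for a package, with
// nested subdomains such as intel-rapl:0:0. A power domain is registered only
// once its name, energy_uj and max_energy_range_uj entries have all been read.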
void SysPower::ScanDirectory( const char* path, int parent )
{
DIR* dir = opendir( path );
if( !dir ) return;
struct dirent* ent;
uint64_t maxRange = 0;
char* name = nullptr;
FILE* handle = nullptr;
while( ( ent = readdir( dir ) ) )
{
if( ent->d_type == DT_REG )
{
if( strcmp( ent->d_name, "max_energy_range_uj" ) == 0 )
{
char tmp[PATH_MAX];
snprintf( tmp, PATH_MAX, "%s/max_energy_range_uj", path );
FILE* f = fopen( tmp, "r" );
if( f )
{
void(fscanf( f, "%" PRIu64, &maxRange ));
fclose( f );
}
}
else if( strcmp( ent->d_name, "name" ) == 0 )
{
char tmp[PATH_MAX];
snprintf( tmp, PATH_MAX, "%s/name", path );
FILE* f = fopen( tmp, "r" );
if( f )
{
char ntmp[128];
if( fgets( ntmp, 128, f ) )
{
// Last character is newline, skip it
const auto sz = strlen( ntmp ) - 1;
if( parent < 0 )
{
name = (char*)tracy_malloc( sz + 1 );
memcpy( name, ntmp, sz );
name[sz] = '\0';
}
else
{
const auto p = m_domains[parent];
const auto psz = strlen( p.name );
name = (char*)tracy_malloc( psz + sz + 2 );
memcpy( name, p.name, psz );
name[psz] = ':';
memcpy( name+psz+1, ntmp, sz );
name[psz+sz+1] = '\0';
}
}
fclose( f );
}
}
else if( strcmp( ent->d_name, "energy_uj" ) == 0 )
{
char tmp[PATH_MAX];
snprintf( tmp, PATH_MAX, "%s/energy_uj", path );
handle = fopen( tmp, "r" );
}
}
if( name && handle && maxRange > 0 ) break;
}
if( name && handle && maxRange > 0 )
{
parent = (int)m_domains.size();
Domain* domain = m_domains.push_next();
domain->value = 0;
domain->overflow = maxRange;
domain->handle = handle;
domain->name = name;
TracyDebug( "Power domain id %i, %s found at %s\n", parent, name, path );
}
else
{
if( name ) tracy_free( name );
if( handle ) fclose( handle );
}
rewinddir( dir );
while( ( ent = readdir( dir ) ) )
{
if( ent->d_type == DT_DIR && strncmp( ent->d_name, "intel-rapl:", 11 ) == 0 )
{
char tmp[PATH_MAX];
snprintf( tmp, PATH_MAX, "%s/%s", path, ent->d_name );
ScanDirectory( tmp, parent );
}
}
closedir( dir );
}
}
#endif

View File

@@ -0,0 +1,44 @@
#ifndef __TRACYSYSPOWER_HPP__
#define __TRACYSYSPOWER_HPP__
#if defined __linux__
# define TRACY_HAS_SYSPOWER
#endif
#ifdef TRACY_HAS_SYSPOWER
#include <stdint.h>
#include <stdio.h>
#include "TracyFastVector.hpp"
namespace tracy
{
class SysPower
{
struct Domain
{
uint64_t value;
uint64_t overflow;
FILE* handle;
const char* name;
};
public:
SysPower();
~SysPower();
void Tick();
private:
void ScanDirectory( const char* path, int parent );
FastVector<Domain> m_domains;
uint64_t m_lastTime;
};
}
#endif
#endif

View File

@@ -0,0 +1,108 @@
#include "TracySysTime.hpp"
#ifdef TRACY_HAS_SYSTIME
# if defined _WIN32
# include <windows.h>
# elif defined __linux__
# include <stdio.h>
# include <inttypes.h>
# elif defined __APPLE__
# include <mach/mach_host.h>
# include <mach/host_info.h>
# elif defined BSD
# include <sys/types.h>
# include <sys/sysctl.h>
# endif
namespace tracy
{
# if defined _WIN32
static inline uint64_t ConvertTime( const FILETIME& t )
{
return ( uint64_t( t.dwHighDateTime ) << 32 ) | uint64_t( t.dwLowDateTime );
}
void SysTime::ReadTimes()
{
FILETIME idleTime;
FILETIME kernelTime;
FILETIME userTime;
GetSystemTimes( &idleTime, &kernelTime, &userTime );
idle = ConvertTime( idleTime );
const auto kernel = ConvertTime( kernelTime );
const auto user = ConvertTime( userTime );
used = kernel + user;
}
# elif defined __linux__
void SysTime::ReadTimes()
{
uint64_t user, nice, system;
FILE* f = fopen( "/proc/stat", "r" );
if( f )
{
int read = fscanf( f, "cpu %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64, &user, &nice, &system, &idle );
fclose( f );
if( read == 4 )
{
used = user + nice + system;
}
}
}
# elif defined __APPLE__
void SysTime::ReadTimes()
{
host_cpu_load_info_data_t info;
mach_msg_type_number_t cnt = HOST_CPU_LOAD_INFO_COUNT;
host_statistics( mach_host_self(), HOST_CPU_LOAD_INFO, reinterpret_cast<host_info_t>( &info ), &cnt );
used = info.cpu_ticks[CPU_STATE_USER] + info.cpu_ticks[CPU_STATE_NICE] + info.cpu_ticks[CPU_STATE_SYSTEM];
idle = info.cpu_ticks[CPU_STATE_IDLE];
}
# elif defined BSD
void SysTime::ReadTimes()
{
u_long data[5];
size_t sz = sizeof( data );
sysctlbyname( "kern.cp_time", &data, &sz, nullptr, 0 );
used = data[0] + data[1] + data[2] + data[3];
idle = data[4];
}
#endif
SysTime::SysTime()
{
ReadTimes();
}
float SysTime::Get()
{
const auto oldUsed = used;
const auto oldIdle = idle;
ReadTimes();
const auto diffIdle = idle - oldIdle;
const auto diffUsed = used - oldUsed;
#if defined _WIN32
return diffUsed == 0 ? -1 : ( diffUsed - diffIdle ) * 100.f / diffUsed;
#elif defined __linux__ || defined __APPLE__ || defined BSD
const auto total = diffUsed + diffIdle;
return total == 0 ? -1 : diffUsed * 100.f / total;
#endif
}
}
#endif
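
A polling sketch: Get() reports the percentage of CPU time spent non-idle since the previous call (or -1 when no time has elapsed), so it is intended to be sampled at a fixed interval:

static tracy::SysTime s_sysTime;

float SampleCpuUsage()
{
    return s_sysTime.Get();    // e.g. 37.5f for 37.5% system-wide utilization
}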

View File

@@ -0,0 +1,36 @@
#ifndef __TRACYSYSTIME_HPP__
#define __TRACYSYSTIME_HPP__
#if defined _WIN32 || defined __linux__ || defined __APPLE__
# define TRACY_HAS_SYSTIME
#else
# include <sys/param.h>
#endif
#ifdef BSD
# define TRACY_HAS_SYSTIME
#endif
#ifdef TRACY_HAS_SYSTIME
#include <stdint.h>
namespace tracy
{
class SysTime
{
public:
SysTime();
float Get();
void ReadTimes();
private:
uint64_t idle, used;
};
}
#endif
#endif

File diff suppressed because it is too large

View File

@@ -0,0 +1,28 @@
#ifndef __TRACYSYSTRACE_HPP__
#define __TRACYSYSTRACE_HPP__
#if !defined TRACY_NO_SYSTEM_TRACING && ( defined _WIN32 || defined __linux__ )
# include "../common/TracyUwp.hpp"
# ifndef TRACY_UWP
# define TRACY_HAS_SYSTEM_TRACING
# endif
#endif
#ifdef TRACY_HAS_SYSTEM_TRACING
#include <stdint.h>
namespace tracy
{
bool SysTraceStart( int64_t& samplingPeriod );
void SysTraceStop();
void SysTraceWorker( void* ptr );
void SysTraceGetExternalName( uint64_t thread, const char*& threadName, const char*& name );
}
#endif
#endif

View File

@@ -0,0 +1,90 @@
#ifndef __TRACYTHREAD_HPP__
#define __TRACYTHREAD_HPP__
#if defined _WIN32
# include <windows.h>
#else
# include <pthread.h>
#endif
#ifdef TRACY_MANUAL_LIFETIME
# include "tracy_rpmalloc.hpp"
#endif
namespace tracy
{
#ifdef TRACY_MANUAL_LIFETIME
extern thread_local bool RpThreadInitDone;
#endif
class ThreadExitHandler
{
public:
~ThreadExitHandler()
{
#ifdef TRACY_MANUAL_LIFETIME
rpmalloc_thread_finalize( 1 );
RpThreadInitDone = false;
#endif
}
};
#if defined _WIN32
class Thread
{
public:
Thread( void(*func)( void* ptr ), void* ptr )
: m_func( func )
, m_ptr( ptr )
, m_hnd( CreateThread( nullptr, 0, Launch, this, 0, nullptr ) )
{}
~Thread()
{
WaitForSingleObject( m_hnd, INFINITE );
CloseHandle( m_hnd );
}
HANDLE Handle() const { return m_hnd; }
private:
static DWORD WINAPI Launch( void* ptr ) { ((Thread*)ptr)->m_func( ((Thread*)ptr)->m_ptr ); return 0; }
void(*m_func)( void* ptr );
void* m_ptr;
HANDLE m_hnd;
};
#else
class Thread
{
public:
Thread( void(*func)( void* ptr ), void* ptr )
: m_func( func )
, m_ptr( ptr )
{
pthread_create( &m_thread, nullptr, Launch, this );
}
~Thread()
{
pthread_join( m_thread, nullptr );
}
pthread_t Handle() const { return m_thread; }
private:
static void* Launch( void* ptr ) { ((Thread*)ptr)->m_func( ((Thread*)ptr)->m_ptr ); return nullptr; }
void(*m_func)( void* ptr );
void* m_ptr;
pthread_t m_thread;
};
#endif
}
#endif
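
A usage sketch for the wrapper above: the destructor joins the thread, and a ThreadExitHandler placed inside the thread body finalizes per-thread rpmalloc state on return (only meaningful with TRACY_MANUAL_LIFETIME):

static void Worker( void* )
{
    tracy::ThreadExitHandler exitHandler;    // tears down rpmalloc thread state on scope exit
    // ... worker loop ...
}

static tracy::Thread s_worker( Worker, nullptr );    // joined in ~Thread()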

View File

@@ -0,0 +1,148 @@
/*
Copyright (c) 2020 Erik Rigtorp <erik@rigtorp.se>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#pragma once
#include <atomic>
#include <cassert>
#include <cstddef>
#include <stdexcept>
#include <type_traits> // std::enable_if, std::is_*_constructible
#include "../common/TracyAlloc.hpp"
#if defined (_MSC_VER)
#pragma warning(push)
#pragma warning(disable:4324)
#endif
namespace tracy {
template <typename T> class SPSCQueue {
public:
explicit SPSCQueue(const size_t capacity)
: capacity_(capacity) {
capacity_++; // Needs one slack element
slots_ = (T*)tracy_malloc(sizeof(T) * (capacity_ + 2 * kPadding));
static_assert(alignof(SPSCQueue<T>) == kCacheLineSize, "");
static_assert(sizeof(SPSCQueue<T>) >= 3 * kCacheLineSize, "");
assert(reinterpret_cast<char *>(&readIdx_) -
reinterpret_cast<char *>(&writeIdx_) >=
static_cast<std::ptrdiff_t>(kCacheLineSize));
}
~SPSCQueue() {
while (front()) {
pop();
}
tracy_free(slots_);
}
// non-copyable and non-movable
SPSCQueue(const SPSCQueue &) = delete;
SPSCQueue &operator=(const SPSCQueue &) = delete;
template <typename... Args>
void emplace(Args &&...args) noexcept(
std::is_nothrow_constructible<T, Args &&...>::value) {
static_assert(std::is_constructible<T, Args &&...>::value,
"T must be constructible with Args&&...");
auto const writeIdx = writeIdx_.load(std::memory_order_relaxed);
auto nextWriteIdx = writeIdx + 1;
if (nextWriteIdx == capacity_) {
nextWriteIdx = 0;
}
while (nextWriteIdx == readIdxCache_) {
readIdxCache_ = readIdx_.load(std::memory_order_acquire);
}
new (&slots_[writeIdx + kPadding]) T(std::forward<Args>(args)...);
writeIdx_.store(nextWriteIdx, std::memory_order_release);
}
T *front() noexcept {
auto const readIdx = readIdx_.load(std::memory_order_relaxed);
if (readIdx == writeIdxCache_) {
writeIdxCache_ = writeIdx_.load(std::memory_order_acquire);
if (writeIdxCache_ == readIdx) {
return nullptr;
}
}
return &slots_[readIdx + kPadding];
}
void pop() noexcept {
static_assert(std::is_nothrow_destructible<T>::value,
"T must be nothrow destructible");
auto const readIdx = readIdx_.load(std::memory_order_relaxed);
assert(writeIdx_.load(std::memory_order_acquire) != readIdx);
slots_[readIdx + kPadding].~T();
auto nextReadIdx = readIdx + 1;
if (nextReadIdx == capacity_) {
nextReadIdx = 0;
}
readIdx_.store(nextReadIdx, std::memory_order_release);
}
size_t size() const noexcept {
std::ptrdiff_t diff = writeIdx_.load(std::memory_order_acquire) -
readIdx_.load(std::memory_order_acquire);
if (diff < 0) {
diff += capacity_;
}
return static_cast<size_t>(diff);
}
bool empty() const noexcept {
return writeIdx_.load(std::memory_order_acquire) ==
readIdx_.load(std::memory_order_acquire);
}
size_t capacity() const noexcept { return capacity_ - 1; }
private:
static constexpr size_t kCacheLineSize = 64;
// Padding to avoid false sharing between slots_ and adjacent allocations
static constexpr size_t kPadding = (kCacheLineSize - 1) / sizeof(T) + 1;
private:
size_t capacity_;
T *slots_;
// Align to cache line size in order to avoid false sharing
// readIdxCache_ and writeIdxCache_ is used to reduce the amount of cache
// coherency traffic
alignas(kCacheLineSize) std::atomic<size_t> writeIdx_ = {0};
alignas(kCacheLineSize) size_t readIdxCache_ = 0;
alignas(kCacheLineSize) std::atomic<size_t> readIdx_ = {0};
alignas(kCacheLineSize) size_t writeIdxCache_ = 0;
// Padding to avoid adjacent allocations to share cache line with
// writeIdxCache_
char padding_[kCacheLineSize - sizeof(SPSCQueue<T>::writeIdxCache_)];
};
} // namespace tracy
#if defined (_MSC_VER)
#pragma warning(pop)
#endif
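
A producer/consumer sketch for the queue above; exactly one thread may call emplace() and exactly one other thread may call front()/pop():

static tracy::SPSCQueue<int> s_queue( 128 );

void Producer()
{
    s_queue.emplace( 42 );    // spins only while the queue is full
}

void Consumer()
{
    while( int* item = s_queue.front() )    // nullptr when the queue is empty
    {
        Handle( *item );    // Handle() is a hypothetical consumer callback
        s_queue.pop();
    }
}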

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,363 @@
/* rpmalloc.h - Memory allocator - Public Domain - 2016 Mattias Jansson
*
* This library provides a cross-platform lock free thread caching malloc implementation in C11.
* The latest source code is always available at
*
* https://github.com/mjansson/rpmalloc
*
* This library is put in the public domain; you can redistribute it and/or modify it without any restrictions.
*
*/
#pragma once
#include <stddef.h>
#include "../common/TracyApi.h"
namespace tracy
{
#if defined(__clang__) || defined(__GNUC__)
# define RPMALLOC_EXPORT __attribute__((visibility("default")))
# define RPMALLOC_ALLOCATOR
# if (defined(__clang_major__) && (__clang_major__ < 4)) || (defined(__GNUC__) && defined(ENABLE_PRELOAD) && ENABLE_PRELOAD)
# define RPMALLOC_ATTRIB_MALLOC
# define RPMALLOC_ATTRIB_ALLOC_SIZE(size)
# define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size)
# else
# define RPMALLOC_ATTRIB_MALLOC __attribute__((__malloc__))
# define RPMALLOC_ATTRIB_ALLOC_SIZE(size) __attribute__((alloc_size(size)))
# define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) __attribute__((alloc_size(count, size)))
# endif
# define RPMALLOC_CDECL
#elif defined(_MSC_VER)
# define RPMALLOC_EXPORT
# define RPMALLOC_ALLOCATOR __declspec(allocator) __declspec(restrict)
# define RPMALLOC_ATTRIB_MALLOC
# define RPMALLOC_ATTRIB_ALLOC_SIZE(size)
# define RPMALLOC_ATTRIB_ALLOC_SIZE2(count,size)
# define RPMALLOC_CDECL __cdecl
#else
# define RPMALLOC_EXPORT
# define RPMALLOC_ALLOCATOR
# define RPMALLOC_ATTRIB_MALLOC
# define RPMALLOC_ATTRIB_ALLOC_SIZE(size)
# define RPMALLOC_ATTRIB_ALLOC_SIZE2(count,size)
# define RPMALLOC_CDECL
#endif
//! Define RPMALLOC_CONFIGURABLE to enable configuring sizes. Will introduce
// a very small overhead due to some size calculations not being compile time constants
#ifndef RPMALLOC_CONFIGURABLE
#define RPMALLOC_CONFIGURABLE 0
#endif
//! Define RPMALLOC_FIRST_CLASS_HEAPS to enable heap based API (rpmalloc_heap_* functions).
// Will introduce a very small overhead to track fully allocated spans in heaps
#ifndef RPMALLOC_FIRST_CLASS_HEAPS
#define RPMALLOC_FIRST_CLASS_HEAPS 0
#endif
//! Flag to rpaligned_realloc to not preserve content in reallocation
#define RPMALLOC_NO_PRESERVE 1
//! Flag to rpaligned_realloc to fail and return null pointer if grow cannot be done in-place,
// in which case the original pointer is still valid (just like a call to realloc which fails to allocate
// a new block).
#define RPMALLOC_GROW_OR_FAIL 2
typedef struct rpmalloc_global_statistics_t {
//! Current amount of virtual memory mapped, all of which might not have been committed (only if ENABLE_STATISTICS=1)
size_t mapped;
//! Peak amount of virtual memory mapped, all of which might not have been committed (only if ENABLE_STATISTICS=1)
size_t mapped_peak;
//! Current amount of memory in global caches for small and medium sizes (<32KiB)
size_t cached;
//! Current amount of memory allocated in huge allocations, i.e. larger than LARGE_SIZE_LIMIT which is 2MiB by default (only if ENABLE_STATISTICS=1)
size_t huge_alloc;
//! Peak amount of memory allocated in huge allocations, i.e. larger than LARGE_SIZE_LIMIT which is 2MiB by default (only if ENABLE_STATISTICS=1)
size_t huge_alloc_peak;
//! Total amount of memory mapped since initialization (only if ENABLE_STATISTICS=1)
size_t mapped_total;
//! Total amount of memory unmapped since initialization (only if ENABLE_STATISTICS=1)
size_t unmapped_total;
} rpmalloc_global_statistics_t;
typedef struct rpmalloc_thread_statistics_t {
//! Current number of bytes available in thread size class caches for small and medium sizes (<32KiB)
size_t sizecache;
//! Current number of bytes available in thread span caches for small and medium sizes (<32KiB)
size_t spancache;
//! Total number of bytes transitioned from thread cache to global cache (only if ENABLE_STATISTICS=1)
size_t thread_to_global;
//! Total number of bytes transitioned from global cache to thread cache (only if ENABLE_STATISTICS=1)
size_t global_to_thread;
//! Per span count statistics (only if ENABLE_STATISTICS=1)
struct {
//! Currently used number of spans
size_t current;
//! High water mark of spans used
size_t peak;
//! Number of spans transitioned to global cache
size_t to_global;
//! Number of spans transitioned from global cache
size_t from_global;
//! Number of spans transitioned to thread cache
size_t to_cache;
//! Number of spans transitioned from thread cache
size_t from_cache;
//! Number of spans transitioned to reserved state
size_t to_reserved;
//! Number of spans transitioned from reserved state
size_t from_reserved;
//! Number of raw memory map calls (not hitting the reserve spans but resulting in actual OS mmap calls)
size_t map_calls;
} span_use[64];
//! Per size class statistics (only if ENABLE_STATISTICS=1)
struct {
//! Current number of allocations
size_t alloc_current;
//! Peak number of allocations
size_t alloc_peak;
//! Total number of allocations
size_t alloc_total;
//! Total number of frees
size_t free_total;
//! Number of spans transitioned to cache
size_t spans_to_cache;
//! Number of spans transitioned from cache
size_t spans_from_cache;
//! Number of spans transitioned from reserved state
size_t spans_from_reserved;
//! Number of raw memory map calls (not hitting the reserve spans but resulting in actual OS mmap calls)
size_t map_calls;
} size_use[128];
} rpmalloc_thread_statistics_t;
typedef struct rpmalloc_config_t {
//! Map memory pages for the given number of bytes. The returned address MUST be
// aligned to the rpmalloc span size, which will always be a power of two.
// Optionally the function can store an alignment offset in the offset variable
// in case it performs alignment and the returned pointer is offset from the
// actual start of the memory region due to this alignment. The alignment offset
// will be passed to the memory unmap function. The alignment offset MUST NOT be
// larger than 65535 (storable in a uint16_t), if it is you must use natural
// alignment to shift it into 16 bits. If you set a memory_map function, you
// must also set a memory_unmap function or else the default implementation will
// be used for both. This function must be thread safe, it can be called by
// multiple threads simultaneously.
void* (*memory_map)(size_t size, size_t* offset);
//! Unmap the memory pages starting at address and spanning the given number of bytes.
// If release is set to non-zero, the unmap is for an entire span range as returned by
// a previous call to memory_map and that the entire range should be released. The
// release argument holds the size of the entire span range. If release is set to 0,
// the unmap is a partial decommit of a subset of the mapped memory range.
// If you set a memory_unmap function, you must also set a memory_map function or
// else the default implementation will be used for both. This function must be thread
// safe, it can be called by multiple threads simultaneously.
void (*memory_unmap)(void* address, size_t size, size_t offset, size_t release);
//! Called when an assert fails, if asserts are enabled. Will use the standard assert()
// if this is not set.
void (*error_callback)(const char* message);
//! Called when a call to map memory pages fails (out of memory). If this callback is
// not set or returns zero the library will return a null pointer in the allocation
// call. If this callback returns non-zero the map call will be retried. The argument
// passed is the number of bytes that was requested in the map call. Only used if
// the default system memory map function is used (memory_map callback is not set).
int (*map_fail_callback)(size_t size);
//! Size of memory pages. The page size MUST be a power of two. All memory mapping
// requests to memory_map will be made with size set to a multiple of the page size.
// Used if RPMALLOC_CONFIGURABLE is defined to 1, otherwise system page size is used.
size_t page_size;
//! Size of a span of memory blocks. MUST be a power of two, and in [4096,262144]
// range (unless 0 - set to 0 to use the default span size). Used if RPMALLOC_CONFIGURABLE
// is defined to 1.
size_t span_size;
//! Number of spans to map at each request to map new virtual memory blocks. This can
// be used to minimize the system call overhead at the cost of virtual memory address
// space. The extra mapped pages will not be written until actually used, so physical
// committed memory should not be affected in the default implementation. Will be
// aligned to a multiple of spans that match memory page size in case of huge pages.
size_t span_map_count;
//! Enable use of large/huge pages. If this flag is set to non-zero and page size is
// zero, the allocator will try to enable huge pages and auto detect the configuration.
// If this is set to non-zero and page_size is also non-zero, the allocator will
// assume huge pages have been configured and enabled prior to initializing the
// allocator.
// For Windows, see https://docs.microsoft.com/en-us/windows/desktop/memory/large-page-support
// For Linux, see https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt
int enable_huge_pages;
//! Names for the regular and huge page mappings, on systems that support
// naming anonymous memory regions, used to distinguish the mappings.
const char *page_name;
const char *huge_page_name;
} rpmalloc_config_t;
//! Initialize allocator with default configuration
TRACY_API int
rpmalloc_initialize(void);
//! Initialize allocator with given configuration
RPMALLOC_EXPORT int
rpmalloc_initialize_config(const rpmalloc_config_t* config);
//! Get allocator configuration
RPMALLOC_EXPORT const rpmalloc_config_t*
rpmalloc_config(void);
//! Finalize allocator
TRACY_API void
rpmalloc_finalize(void);
//! Initialize allocator for calling thread
TRACY_API void
rpmalloc_thread_initialize(void);
//! Finalize allocator for calling thread
TRACY_API void
rpmalloc_thread_finalize(int release_caches);
//! Perform deferred deallocations pending for the calling thread heap
RPMALLOC_EXPORT void
rpmalloc_thread_collect(void);
//! Query if allocator is initialized for calling thread
RPMALLOC_EXPORT int
rpmalloc_is_thread_initialized(void);
//! Get per-thread statistics
RPMALLOC_EXPORT void
rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats);
//! Get global statistics
RPMALLOC_EXPORT void
rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats);
//! Dump all statistics in human readable format to file (should be a FILE*)
RPMALLOC_EXPORT void
rpmalloc_dump_statistics(void* file);
//! Allocate a memory block of at least the given size
TRACY_API RPMALLOC_ALLOCATOR void*
rpmalloc(size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(1);
//! Free the given memory block
TRACY_API void
rpfree(void* ptr);
//! Allocate a memory block of at least the given size and zero initialize it
RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
rpcalloc(size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE2(1, 2);
//! Reallocate the given block to at least the given size
TRACY_API RPMALLOC_ALLOCATOR void*
rprealloc(void* ptr, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(2);
//! Reallocate the given block to at least the given size and alignment,
// with optional control flags (see RPMALLOC_NO_PRESERVE).
// Alignment must be a power of two and a multiple of sizeof(void*),
// and should ideally be less than memory page size. A caveat of rpmalloc
// internals is that this must also be strictly less than the span size (default 64KiB)
RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, unsigned int flags) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(3);
//! Allocate a memory block of at least the given size and alignment.
// Alignment must be a power of two and a multiple of sizeof(void*),
// and should ideally be less than memory page size. A caveat of rpmalloc
// internals is that this must also be strictly less than the span size (default 64KiB)
RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
rpaligned_alloc(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(2);
//! Allocate a memory block of at least the given size and alignment, and zero initialize it.
// Alignment must be a power of two and a multiple of sizeof(void*),
// and should ideally be less than memory page size. A caveat of rpmalloc
// internals is that this must also be strictly less than the span size (default 64KiB)
RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
rpaligned_calloc(size_t alignment, size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3);
//! Allocate a memory block of at least the given size and alignment.
// Alignment must be a power of two and a multiple of sizeof(void*),
// and should ideally be less than memory page size. A caveat of rpmalloc
// internals is that this must also be strictly less than the span size (default 64KiB)
RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
rpmemalign(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(2);
//! Allocate a memory block of at least the given size and alignment.
// Alignment must be a power of two and a multiple of sizeof(void*),
// and should ideally be less than memory page size. A caveat of rpmalloc
// internals is that this must also be strictly less than the span size (default 64KiB)
RPMALLOC_EXPORT int
rpposix_memalign(void** memptr, size_t alignment, size_t size);
//! Query the usable size of the given memory block (from given pointer to the end of block)
RPMALLOC_EXPORT size_t
rpmalloc_usable_size(void* ptr);
#if RPMALLOC_FIRST_CLASS_HEAPS
//! Heap type
typedef struct heap_t rpmalloc_heap_t;
//! Acquire a new heap. Will reuse existing released heaps or allocate memory for a new heap
// if none available. Heap API is implemented with the strict assumption that only one single
// thread will call heap functions for a given heap at any given time, no functions are thread safe.
RPMALLOC_EXPORT rpmalloc_heap_t*
rpmalloc_heap_acquire(void);
//! Release a heap (does NOT free the memory allocated by the heap, use rpmalloc_heap_free_all before destroying the heap).
// Releasing a heap will enable it to be reused by other threads. Safe to pass a null pointer.
RPMALLOC_EXPORT void
rpmalloc_heap_release(rpmalloc_heap_t* heap);
//! Allocate a memory block of at least the given size using the given heap.
RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
rpmalloc_heap_alloc(rpmalloc_heap_t* heap, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(2);
//! Allocate a memory block of at least the given size using the given heap. The returned
// block will have the requested alignment. Alignment must be a power of two and a multiple of sizeof(void*),
// and should ideally be less than memory page size. A caveat of rpmalloc
// internals is that this must also be strictly less than the span size (default 64KiB).
RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
rpmalloc_heap_aligned_alloc(rpmalloc_heap_t* heap, size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(3);
//! Allocate a memory block of at least the given size using the given heap and zero initialize it.
RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
rpmalloc_heap_calloc(rpmalloc_heap_t* heap, size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3);
//! Allocate a memory block of at least the given size using the given heap and zero initialize it. The returned
// block will have the requested alignment. Alignment must either be zero, or a power of two and a multiple of sizeof(void*),
// and should ideally be less than memory page size. A caveat of rpmalloc
// internals is that this must also be strictly less than the span size (default 64KiB).
RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
rpmalloc_heap_aligned_calloc(rpmalloc_heap_t* heap, size_t alignment, size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3);
//! Reallocate the given block to at least the given size. The memory block MUST be allocated
// by the same heap given to this function.
RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
rpmalloc_heap_realloc(rpmalloc_heap_t* heap, void* ptr, size_t size, unsigned int flags) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(3);
//! Reallocate the given block to at least the given size. The memory block MUST be allocated
// by the same heap given to this function. The returned block will have the requested alignment.
// Alignment must be either zero, or a power of two and a multiple of sizeof(void*), and should ideally be
// less than memory page size. A caveat of rpmalloc internals is that this must also be strictly less than
// the span size (default 64KiB).
RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
rpmalloc_heap_aligned_realloc(rpmalloc_heap_t* heap, void* ptr, size_t alignment, size_t size, unsigned int flags) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(4);
//! Free the given memory block from the given heap. The memory block MUST be allocated
// by the same heap given to this function.
RPMALLOC_EXPORT void
rpmalloc_heap_free(rpmalloc_heap_t* heap, void* ptr);
//! Free all memory allocated by the heap
RPMALLOC_EXPORT void
rpmalloc_heap_free_all(rpmalloc_heap_t* heap);
//! Set the given heap as the current heap for the calling thread. A heap MUST only be current heap
// for a single thread, a heap can never be shared between multiple threads. The previous
// current heap for the calling thread is released to be reused by other threads.
RPMALLOC_EXPORT void
rpmalloc_heap_thread_set_current(rpmalloc_heap_t* heap);
#endif
}
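
A lifecycle sketch using only the entry points declared above: one-time global initialization, per-thread initialization, allocation, then teardown in reverse order:

void RpmallocExample()
{
    tracy::rpmalloc_initialize();
    tracy::rpmalloc_thread_initialize();

    void* p = tracy::rpmalloc( 256 );
    p = tracy::rprealloc( p, 512 );
    tracy::rpfree( p );

    tracy::rpmalloc_thread_finalize( 1 );    // 1 = release the thread caches
    tracy::rpmalloc_finalize();
}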