mirror of
https://github.com/citra-emu/citra.git
synced 2024-11-26 01:50:14 +00:00
Implemented functional vectorization utility
This commit is contained in:
parent
0b897c45d2
commit
ef5dd19f1d
@ -56,6 +56,7 @@ set(HEADERS
|
|||||||
thread_queue_list.h
|
thread_queue_list.h
|
||||||
timer.h
|
timer.h
|
||||||
vector_math.h
|
vector_math.h
|
||||||
|
vectorize.h
|
||||||
)
|
)
|
||||||
|
|
||||||
if(ARCHITECTURE_x86_64)
|
if(ARCHITECTURE_x86_64)
|
||||||
|
195
src/common/vectorize.h
Normal file
195
src/common/vectorize.h
Normal file
@ -0,0 +1,195 @@
|
|||||||
|
// Copyright 2016 Citra Emulator Project / PPSSPP Project
|
||||||
|
// Licensed under GPLv2 or any later version
|
||||||
|
// Refer to the license.txt file included.
|
||||||
|
#include "common/common_types.h"
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
/*
|
||||||
|
* Vectorize lib
|
||||||
|
* This small utility will allow you to simplify repititive actions while
|
||||||
|
* transforming/generating/consuming data in an 'ordered set' or array.
|
||||||
|
* it achieves this under better performance when the next conditions are met:
|
||||||
|
* - your morphism(transforming function) does not branch(no-ifs, no-switches, no-calls)
|
||||||
|
* - very simple actions.
|
||||||
|
* - morphisms are not monadic or dependant on state. (they are in theory pure functions).
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
#define VECTORIZE_NEXT __pragma("loop( ivdep )")
|
||||||
|
#elif __GNUC__
|
||||||
|
#define VECTORIZE_NEXT _Pragma("GCC ivdep")
|
||||||
|
#elif __clang__
|
||||||
|
#define VECTORIZE_NEXT _Pragma("clang loop vectorize(enable) interleave(enable)")
|
||||||
|
#else
|
||||||
|
#define VECTORIZE_NEXT
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Static compilers can't always detect if vectorization is possible,
|
||||||
|
* if the programmer is 100% sure it's possible to vectorize a set
|
||||||
|
* of actions, it can hint the compiler that it can vectorize a loop
|
||||||
|
* unconditionaly.
|
||||||
|
*/
|
||||||
|
namespace Common {
|
||||||
|
|
||||||
|
// These macros are used to unroll/unfold the same action on tight loops
|
||||||
|
// should be used on actions that don't branch the pipeline.
|
||||||
|
// Static compilers can't detect unrollable loops easily. Normaly,
|
||||||
|
// they require some profiling data to unroll loops.
|
||||||
|
#define LOOP_UNROLL_1(CODE) CODE
|
||||||
|
#define LOOP_UNROLL_2(CODE) \
|
||||||
|
LOOP_UNROLL_1(CODE); \
|
||||||
|
LOOP_UNROLL_1(CODE)
|
||||||
|
#define LOOP_UNROLL_4(CODE) \
|
||||||
|
LOOP_UNROLL_2(CODE); \
|
||||||
|
LOOP_UNROLL_2(CODE)
|
||||||
|
#define LOOP_UNROLL_8(CODE) \
|
||||||
|
LOOP_UNROLL_4(CODE); \
|
||||||
|
LOOP_UNROLL_4(CODE)
|
||||||
|
|
||||||
|
// use this before a loop to tell the compiler, it's unconditionaly vectorizeable
|
||||||
|
|
||||||
|
// Endofunctor: https://www.quora.com/What-is-an-endofunctor
|
||||||
|
// map transforms each value in the set applying a function continuesly
|
||||||
|
// example:
|
||||||
|
//
|
||||||
|
// in C++
|
||||||
|
// inline void array_cosine(f32*& my_set) {
|
||||||
|
// *myset = cos(my_set[0]);
|
||||||
|
// my_set += 1;
|
||||||
|
// }
|
||||||
|
// .....
|
||||||
|
// map<f32,&array_cosine>(my_array);
|
||||||
|
//
|
||||||
|
template <class T, void morphism(T*&)>
|
||||||
|
void map(T*& set, u32 set_size) {
|
||||||
|
u32 steps = set_size / 8; // 16 unfolds
|
||||||
|
VECTORIZE_NEXT for (u32 i = 0; i != steps; i++) {
|
||||||
|
LOOP_UNROLL_8(morphism(set));
|
||||||
|
}
|
||||||
|
// Now just do the rest
|
||||||
|
steps = set_size - (steps * 8);
|
||||||
|
u32 jump = (steps % 8);
|
||||||
|
// This form of loop unfolding works for every set of data at the
|
||||||
|
// expense of not marshelling/vectorizing but won't break the pipeline
|
||||||
|
switch (jump) {
|
||||||
|
do {
|
||||||
|
jump = 8;
|
||||||
|
morphism(set);
|
||||||
|
case 7:
|
||||||
|
morphism(set);
|
||||||
|
case 6:
|
||||||
|
morphism(set);
|
||||||
|
case 5:
|
||||||
|
morphism(set);
|
||||||
|
case 4:
|
||||||
|
morphism(set);
|
||||||
|
case 3:
|
||||||
|
morphism(set);
|
||||||
|
case 2:
|
||||||
|
morphism(set);
|
||||||
|
case 1:
|
||||||
|
morphism(set);
|
||||||
|
case 0:
|
||||||
|
default:
|
||||||
|
steps -= jump;
|
||||||
|
} while (steps != 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Anamorphism: https://en.wikipedia.org/wiki/Anamorphism
|
||||||
|
// unfold takes a finite ordered set (an array for instance)
|
||||||
|
// and generates a subset of values from the corresponding
|
||||||
|
// seed value in your set.
|
||||||
|
template <class T, void morphism(T*&, T*&)>
|
||||||
|
void unfold(T*& set, T*& generator, u32 set_size) {
|
||||||
|
u32 steps = set_size / 8; // 16 unfolds
|
||||||
|
VECTORIZE_NEXT for (u32 i = 0; i != steps; i++) {
|
||||||
|
LOOP_UNROLL_8(morphism(set, generator));
|
||||||
|
}
|
||||||
|
// Now just do the rest
|
||||||
|
steps = set_size - (steps * 8);
|
||||||
|
u32 jump = (steps % 8);
|
||||||
|
// This form of loop unfolding works for every set of data at the
|
||||||
|
// expense of not marshelling/vectorizing but won't break the pipeline
|
||||||
|
switch (jump) {
|
||||||
|
do {
|
||||||
|
jump = 8;
|
||||||
|
morphism(set, generator);
|
||||||
|
case 7:
|
||||||
|
morphism(set, generator);
|
||||||
|
case 6:
|
||||||
|
morphism(set, generator);
|
||||||
|
case 5:
|
||||||
|
morphism(set, generator);
|
||||||
|
case 4:
|
||||||
|
morphism(set, generator);
|
||||||
|
case 3:
|
||||||
|
morphism(set, generator);
|
||||||
|
case 2:
|
||||||
|
morphism(set, generator);
|
||||||
|
case 1:
|
||||||
|
morphism(set, generator);
|
||||||
|
case 0:
|
||||||
|
default:
|
||||||
|
steps -= jump;
|
||||||
|
} while (steps != 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Catamorphism: https://wiki.haskell.org/Catamorphisms
|
||||||
|
// fold takes a finite ordered set (an array for instance)
|
||||||
|
// and collapses all values to a consumer.
|
||||||
|
// pseudocode example:
|
||||||
|
// fold (*) [1,2,3] 1 = (3*(2*(1*1))) = 6
|
||||||
|
//
|
||||||
|
// in C++
|
||||||
|
// inline void array_product(u32*& my_set, u32& my_consumer) {
|
||||||
|
// my_consumer *= *my_set;
|
||||||
|
// my_set += 1;
|
||||||
|
// }
|
||||||
|
// .....
|
||||||
|
// u32 result = 1;
|
||||||
|
// fold<u32,&array_product>(my_array, result);
|
||||||
|
//
|
||||||
|
template <class T, void morphism(T*&, T&)>
|
||||||
|
void fold(T*& set, T& consumer, u32 set_size) {
|
||||||
|
u32 steps = set_size / 8; // 16 unfolds
|
||||||
|
VECTORIZE_NEXT for (u32 i = 0; i != steps; i++) {
|
||||||
|
LOOP_UNROLL_8(morphism(set, consumer));
|
||||||
|
}
|
||||||
|
// Now just do the rest
|
||||||
|
steps = set_size - (steps * 8);
|
||||||
|
u32 jump = (steps % 8);
|
||||||
|
// This form of loop unfolding works for every set of data at the
|
||||||
|
// expense of not marshelling/vectorizing but won't break the pipeline
|
||||||
|
switch (jump) {
|
||||||
|
do {
|
||||||
|
jump = 8;
|
||||||
|
morphism(set, consumer);
|
||||||
|
case 7:
|
||||||
|
morphism(set, consumer);
|
||||||
|
case 6:
|
||||||
|
morphism(set, consumer);
|
||||||
|
case 5:
|
||||||
|
morphism(set, consumer);
|
||||||
|
case 4:
|
||||||
|
morphism(set, consumer);
|
||||||
|
case 3:
|
||||||
|
morphism(set, consumer);
|
||||||
|
case 2:
|
||||||
|
morphism(set, consumer);
|
||||||
|
case 1:
|
||||||
|
morphism(set, consumer);
|
||||||
|
case 0:
|
||||||
|
default:
|
||||||
|
steps -= jump;
|
||||||
|
} while (steps != 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef LOOP_UNROLL_8
|
||||||
|
#undef LOOP_UNROLL_4
|
||||||
|
#undef LOOP_UNROLL_2
|
||||||
|
#undef LOOP_UNROLL_1
|
||||||
|
|
||||||
|
} // Common
|
Loading…
Reference in New Issue
Block a user