shader_jit: Add optimizations up to x86-64-v4 (#6668)

2023-07-11 09:21:37 -07:00
parent 6da4853360
commit a94af8ea62
1 changed files with 157 additions and 63 deletions
--- a/src/video_core/shader/shader_jit_x64_compiler.cpp
+++ b/src/video_core/shader/shader_jit_x64_compiler.cpp
@@ -338,15 +338,39 @@ void JitShader::Compile_SanitizedMul(Xmm src1, Xmm src2, Xmm scratch) {
    // where neither source was, this NaN was generated by a 0 * inf multiplication, and so the
    // result should be transformed to 0 to match PICA fp rules.
    if (host_caps.has(Cpu::tAVX512F | Cpu::tAVX512VL | Cpu::tAVX512DQ)) {
        vmulps(scratch, src1, src2);
        // Mask of any NaN values found in the result
        const Xbyak::Opmask zero_mask = k1;
        vcmpunordps(zero_mask, scratch, scratch);
        // Mask of any non-NaN inputs producing NaN results
        vcmpordps(zero_mask | zero_mask, src1, src2);
        knotb(zero_mask, zero_mask);
        vmovaps(src1 | zero_mask | T_z, scratch);
        return;
    }
    // Set scratch to mask of (src1 != NaN and src2 != NaN)
    if (host_caps.has(Cpu::tAVX)) {
        vcmpordps(scratch, src1, src2);
    } else {
        movaps(scratch, src1);
        cmpordps(scratch, src2);
    }
    mulps(src1, src2);
    // Set src2 to mask of (result == NaN)
    if (host_caps.has(Cpu::tAVX)) {
        vcmpunordps(src2, src2, src1);
    } else {
        movaps(src2, src1);
        cmpunordps(src2, src2);
    }
    // Clear components where scratch != src2 (i.e. if result is NaN where neither source was NaN)
    xorps(scratch, src2);
@@ -406,6 +430,11 @@ void JitShader::Compile_DP3(Instruction instr) {
    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
    if (host_caps.has(Cpu::tAVX)) {
        vshufps(SRC3, SRC1, SRC1, _MM_SHUFFLE(2, 2, 2, 2));
        vshufps(SRC2, SRC1, SRC1, _MM_SHUFFLE(1, 1, 1, 1));
        vshufps(SRC1, SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0));
    } else {
        movaps(SRC2, SRC1);
        shufps(SRC2, SRC2, _MM_SHUFFLE(1, 1, 1, 1));
@@ -413,6 +442,8 @@ void JitShader::Compile_DP3(Instruction instr) {
        shufps(SRC3, SRC3, _MM_SHUFFLE(2, 2, 2, 2));
        shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0));
    }
    addps(SRC1, SRC2);
    addps(SRC1, SRC3);
@@ -589,9 +620,15 @@ void JitShader::Compile_MOV(Instruction instr) {
 void JitShader::Compile_RCP(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    if (host_caps.has(Cpu::tAVX512F | Cpu::tAVX512VL)) {
        // Accurate to 14 bits of precisions rather than 12 bits of rcpss
        vrcp14ss(SRC1, SRC1, SRC1);
    } else {
        // TODO(bunnei): RCPSS is a pretty rough approximation, this might cause problems if Pica
        // performs this operation more accurately. This should be checked on hardware.
        rcpss(SRC1, SRC1);
    }
    shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYWZ -> XXXX
    Compile_DestEnable(instr, SRC1);
@@ -600,9 +637,15 @@ void JitShader::Compile_RCP(Instruction instr) {
 void JitShader::Compile_RSQ(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    if (host_caps.has(Cpu::tAVX512F | Cpu::tAVX512VL)) {
        // Accurate to 14 bits of precisions rather than 12 bits of rsqrtss
        vrsqrt14ss(SRC1, SRC1, SRC1);
    } else {
        // TODO(bunnei): RSQRTSS is a pretty rough approximation, this might cause problems if Pica
        // performs this operation more accurately. This should be checked on hardware.
        rsqrtss(SRC1, SRC1);
    }
    shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYWZ -> XXXX
    Compile_DestEnable(instr, SRC1);
@@ -1050,22 +1093,36 @@ Xbyak::Label JitShader::CompilePrelude_Log2() {
    jp(input_is_nan);
    jae(input_out_of_range);
-    // Split input
+    // Split input: SRC1=MANT[1,2) SCRATCH2=Exponent
    if (host_caps.has(Cpu::tAVX512F | Cpu::tAVX512VL)) {
        vgetexpss(SCRATCH2, SRC1, SRC1);
        vgetmantss(SRC1, SRC1, SRC1, 0x0'0);
    } else {
        movd(eax, SRC1);
        mov(edx, eax);
        and_(eax, 0x7f800000);
        and_(edx, 0x007fffff);
    movss(SCRATCH, xword[rip + c0]); // Preload c0.
        or_(edx, 0x3f800000);
        movd(SRC1, edx);
        // SRC1 now contains the mantissa of the input.
    mulss(SCRATCH, SRC1);
        shr(eax, 23);
        sub(eax, 0x7f);
        cvtsi2ss(SCRATCH2, eax);
        // SCRATCH2 now contains the exponent of the input.
    }
    movss(SCRATCH, xword[rip + c0]);
    // Complete computation of polynomial
    if (host_caps.has(Cpu::tFMA)) {
        vfmadd213ss(SCRATCH, SRC1, xword[rip + c1]);
        vfmadd213ss(SCRATCH, SRC1, xword[rip + c2]);
        vfmadd213ss(SCRATCH, SRC1, xword[rip + c3]);
        vfmadd213ss(SCRATCH, SRC1, xword[rip + c4]);
        subss(SRC1, ONE);
        vfmadd231ss(SCRATCH2, SCRATCH, SRC1);
    } else {
        mulss(SCRATCH, SRC1);
        addss(SCRATCH, xword[rip + c1]);
        mulss(SCRATCH, SRC1);
        addss(SCRATCH, xword[rip + c2]);
@@ -1076,6 +1133,7 @@ Xbyak::Label JitShader::CompilePrelude_Log2() {
        addss(SCRATCH, xword[rip + c4]);
        mulss(SCRATCH, SRC1);
        addss(SCRATCH2, SCRATCH);
    }
    // Duplicate result across vector
    xorps(SRC1, SRC1); // break dependency chain
@@ -1122,26 +1180,60 @@ Xbyak::Label JitShader::CompilePrelude_Exp2() {
    // Handle edge cases
    ucomiss(SRC1, SRC1);
    jp(ret_label);
    // Decompose input:
    // SCRATCH=2^round(input)
    // SRC1=input-round(input) [-0.5, 0.5)
    if (host_caps.has(Cpu::tAVX512F | Cpu::tAVX512VL)) {
        // input - 0.5
        vsubss(SCRATCH, SRC1, xword[rip + half]);
        // trunc(input - 0.5)
        vrndscaless(SCRATCH2, SCRATCH, SCRATCH, _MM_FROUND_TRUNC);
        // SCRATCH = 1 * 2^(trunc(input - 0.5))
        vscalefss(SCRATCH, ONE, SCRATCH2);
        // SRC1 = input-trunc(input - 0.5)
        vsubss(SRC1, SRC1, SCRATCH2);
    } else {
        // Clamp to maximum range since we shift the value directly into the exponent.
        minss(SRC1, xword[rip + input_max]);
        maxss(SRC1, xword[rip + input_min]);
-    // Decompose input
+        if (host_caps.has(Cpu::tAVX)) {
            vsubss(SCRATCH, SRC1, xword[rip + half]);
        } else {
            movss(SCRATCH, SRC1);
    movss(SCRATCH2, xword[rip + c0]); // Preload c0.
            subss(SCRATCH, xword[rip + half]);
        }
        if (host_caps.has(Cpu::tSSE41)) {
            roundss(SCRATCH, SCRATCH, _MM_FROUND_TRUNC);
            cvtss2si(eax, SCRATCH);
        } else {
            cvtss2si(eax, SCRATCH);
            cvtsi2ss(SCRATCH, eax);
        }
        // SCRATCH now contains input rounded to the nearest integer.
        add(eax, 0x7f);
        subss(SRC1, SCRATCH);
        // SRC1 contains input - round(input), which is in [-0.5, 0.5).
    mulss(SCRATCH2, SRC1);
        shl(eax, 23);
        movd(SCRATCH, eax);
        // SCRATCH contains 2^(round(input)).
    }
    // Complete computation of polynomial.
    movss(SCRATCH2, xword[rip + c0]);
    if (host_caps.has(Cpu::tFMA)) {
        vfmadd213ss(SCRATCH2, SRC1, xword[rip + c1]);
        vfmadd213ss(SCRATCH2, SRC1, xword[rip + c2]);
        vfmadd213ss(SCRATCH2, SRC1, xword[rip + c3]);
        vfmadd213ss(SRC1, SCRATCH2, xword[rip + c4]);
    } else {
        mulss(SCRATCH2, SRC1);
        addss(SCRATCH2, xword[rip + c1]);
        mulss(SCRATCH2, SRC1);
        addss(SCRATCH2, xword[rip + c2]);
@@ -1149,6 +1241,8 @@ Xbyak::Label JitShader::CompilePrelude_Exp2() {
        addss(SCRATCH2, xword[rip + c3]);
        mulss(SRC1, SCRATCH2);
        addss(SRC1, xword[rip + c4]);
    }
    mulss(SRC1, SCRATCH);
    // Duplicate result across vector