From b25a9c60a983c817e8efb73eff98ddfadadfa43c Mon Sep 17 00:00:00 2001 From: ChibiDenDen Date: Fri, 17 Apr 2015 02:05:15 +0300 Subject: [PATCH] Some Optimizations --- .../arm/dyncom/arm_dyncom_interpreter.cpp | 477 ++++++++++-------- src/core/arm/dyncom/arm_dyncom_run.h | 11 +- src/video_core/rasterizer.cpp | 113 ++--- src/video_core/vertex_shader.cpp | 38 +- 4 files changed, 345 insertions(+), 294 deletions(-) diff --git a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp index fde11e4ff..06109af03 100644 --- a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp +++ b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp @@ -247,10 +247,11 @@ static void LnSWoUB(ImmediateOffset)(ARMul_State* cpu, unsigned int inst, unsign unsigned int Rn = BITS(inst, 16, 19); unsigned int addr; - if (U_BIT) - addr = CHECK_READ_REG15_WA(cpu, Rn) + OFFSET_12; - else - addr = CHECK_READ_REG15_WA(cpu, Rn) - OFFSET_12; + //if (U_BIT) + // addr = CHECK_READ_REG15_WA(cpu, Rn) + OFFSET_12; + //else + // addr = CHECK_READ_REG15_WA(cpu, Rn) - OFFSET_12; + addr = CHECK_READ_REG15_WA(cpu, Rn) + (OFFSET_12 ^ (U_BIT - 1)) + (U_BIT ^ 1); virt_addr = addr; } @@ -1126,6 +1127,22 @@ int CondPassed(ARMul_State* cpu, unsigned int cond) { #define CFLAG cpu->CFlag #define VFLAG cpu->VFlag + //uint8_t bits = + // (ZFLAG << 0) | // EQ, NE + // (CFLAG << 1) | // CS, CC + // (NFLAG << 2) | // MI, PL + // (VFLAG << 3) | // VS, VC + // ((CFLAG & ~ZFLAG) << 4) | // HI, LS + // ((NFLAG ^ VFLAG ^ 1) << 5) | // GE, LT + // (((NFLAG ^ VFLAG ^ 1) & ~ZFLAG) << 6) | // GT, LE + // (1 << 7); + // return ((bits >> (cond >> 1)) & 1) ^ (cond & 1) ^ ((cond + 1) >> 4); + + uint8_t bits[8] = { ZFLAG, CFLAG, NFLAG, VFLAG, CFLAG & ~ZFLAG, NFLAG ^ VFLAG ^ 1, ~ZFLAG, 1 }; + bits[6] &= bits[5]; + return bits[cond >> 1] ^ (cond & 1) ^ ((cond + 1) >> 4); + + /* int temp = 0; switch (cond) { @@ -1178,7 +1195,9 @@ int CondPassed(ARMul_State* cpu, unsigned int cond) { temp = 1; break; } + return temp; + */ } enum DECODE_STATUS { @@ -3533,15 +3552,42 @@ const transop_fp_t arm_instruction_trans[] = { INTERPRETER_TRANSLATE(blx_1_thumb) }; -typedef std::unordered_map bb_map; -static bb_map CreamCache; +//typedef std::unordered_map bb_map; +//static bb_map CreamCache; +static int *CreamCache = nullptr, CreamCacheSize = 0; static void insert_bb(unsigned int addr, int start) { - CreamCache[addr] = start; + //CreamCache[addr] = start; + if (addr >= CreamCacheSize) + { + unsigned int v = addr; + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v++; + + auto NewPtr = new int[v]; + if (CreamCache) + { + memcpy(NewPtr, CreamCache, CreamCacheSize * 4); + delete [] CreamCache; + } + memset(NewPtr + CreamCacheSize, 0xFF, (v - CreamCacheSize) * 4); + CreamCache = NewPtr; + CreamCacheSize = v; + } + CreamCache[addr] = start; } static int find_bb(unsigned int addr, int& start) { - int ret = -1; + if (addr >= CreamCacheSize) return -1; + start = CreamCache[addr]; + if (start == -1) return -1; + return 0; + /*int ret = -1; bb_map::const_iterator it = CreamCache.find(addr); if (it != CreamCache.end()) { start = static_cast(it->second); @@ -3549,7 +3595,7 @@ static int find_bb(unsigned int addr, int& start) { } else { ret = -1; } - return ret; + return ret;*/ } enum { @@ -3716,6 +3762,9 @@ unsigned InterpreterMainLoop(ARMul_State* state) { #define INC_PC(l) ptr += sizeof(arm_inst) + l + unsigned int num_instrs = 0; + ARMul_State* cpu = state; + arm_inst* inst_base; // GCC and Clang have a C++ extension to support a lookup table of labels. Otherwise, fallback to a // clunky switch statement. #if defined __GNUC__ || defined __clang__ @@ -3724,208 +3773,211 @@ unsigned InterpreterMainLoop(ARMul_State* state) { num_instrs++; \ goto *InstLabel[inst_base->idx] #else -#define GOTO_NEXT_INST \ - if (num_instrs >= cpu->NumInstrsToExecute) goto END; \ - num_instrs++; \ - switch(inst_base->idx) { \ - case 0: goto VMLA_INST; \ - case 1: goto VMLS_INST; \ - case 2: goto VNMLA_INST; \ - case 3: goto VNMLA_INST; \ - case 4: goto VNMLS_INST; \ - case 5: goto VNMUL_INST; \ - case 6: goto VMUL_INST; \ - case 7: goto VADD_INST; \ - case 8: goto VSUB_INST; \ - case 9: goto VDIV_INST; \ - case 10: goto VMOVI_INST; \ - case 11: goto VMOVR_INST; \ - case 12: goto VABS_INST; \ - case 13: goto VNEG_INST; \ - case 14: goto VSQRT_INST; \ - case 15: goto VCMP_INST; \ - case 16: goto VCMP2_INST; \ - case 17: goto VCVTBDS_INST; \ - case 18: goto VCVTBFF_INST; \ - case 19: goto VCVTBFI_INST; \ - case 20: goto VMOVBRS_INST; \ - case 21: goto VMSR_INST; \ - case 22: goto VMOVBRC_INST; \ - case 23: goto VMRS_INST; \ - case 24: goto VMOVBCR_INST; \ - case 25: goto VMOVBRRSS_INST; \ - case 26: goto VMOVBRRD_INST; \ - case 27: goto VSTR_INST; \ - case 28: goto VPUSH_INST; \ - case 29: goto VSTM_INST; \ - case 30: goto VPOP_INST; \ - case 31: goto VLDR_INST; \ - case 32: goto VLDM_INST ; \ - case 33: goto SRS_INST; \ - case 34: goto RFE_INST; \ - case 35: goto BKPT_INST; \ - case 36: goto BLX_INST; \ - case 37: goto CPS_INST; \ - case 38: goto PLD_INST; \ - case 39: goto SETEND_INST; \ - case 40: goto CLREX_INST; \ - case 41: goto REV16_INST; \ - case 42: goto USAD8_INST; \ - case 43: goto SXTB_INST; \ - case 44: goto UXTB_INST; \ - case 45: goto SXTH_INST; \ - case 46: goto SXTB16_INST; \ - case 47: goto UXTH_INST; \ - case 48: goto UXTB16_INST; \ - case 49: goto CPY_INST; \ - case 50: goto UXTAB_INST; \ - case 51: goto SSUB8_INST; \ - case 52: goto SHSUB8_INST; \ - case 53: goto SSUBADDX_INST; \ - case 54: goto STREX_INST; \ - case 55: goto STREXB_INST; \ - case 56: goto SWP_INST; \ - case 57: goto SWPB_INST; \ - case 58: goto SSUB16_INST; \ - case 59: goto SSAT16_INST; \ - case 60: goto SHSUBADDX_INST; \ - case 61: goto QSUBADDX_INST; \ - case 62: goto SHADDSUBX_INST; \ - case 63: goto SHADD8_INST; \ - case 64: goto SHADD16_INST; \ - case 65: goto SEL_INST; \ - case 66: goto SADDSUBX_INST; \ - case 67: goto SADD8_INST; \ - case 68: goto SADD16_INST; \ - case 69: goto SHSUB16_INST; \ - case 70: goto UMAAL_INST; \ - case 71: goto UXTAB16_INST; \ - case 72: goto USUBADDX_INST; \ - case 73: goto USUB8_INST; \ - case 74: goto USUB16_INST; \ - case 75: goto USAT16_INST; \ - case 76: goto USADA8_INST; \ - case 77: goto UQSUBADDX_INST; \ - case 78: goto UQSUB8_INST; \ - case 79: goto UQSUB16_INST; \ - case 80: goto UQADDSUBX_INST; \ - case 81: goto UQADD8_INST; \ - case 82: goto UQADD16_INST; \ - case 83: goto SXTAB_INST; \ - case 84: goto UHSUBADDX_INST; \ - case 85: goto UHSUB8_INST; \ - case 86: goto UHSUB16_INST; \ - case 87: goto UHADDSUBX_INST; \ - case 88: goto UHADD8_INST; \ - case 89: goto UHADD16_INST; \ - case 90: goto UADDSUBX_INST; \ - case 91: goto UADD8_INST; \ - case 92: goto UADD16_INST; \ - case 93: goto SXTAH_INST; \ - case 94: goto SXTAB16_INST; \ - case 95: goto QADD8_INST; \ - case 96: goto BXJ_INST; \ - case 97: goto CLZ_INST; \ - case 98: goto UXTAH_INST; \ - case 99: goto BX_INST; \ - case 100: goto REV_INST; \ - case 101: goto BLX_INST; \ - case 102: goto REVSH_INST; \ - case 103: goto QADD_INST; \ - case 104: goto QADD16_INST; \ - case 105: goto QADDSUBX_INST; \ - case 106: goto LDREX_INST; \ - case 107: goto QDADD_INST; \ - case 108: goto QDSUB_INST; \ - case 109: goto QSUB_INST; \ - case 110: goto LDREXB_INST; \ - case 111: goto QSUB8_INST; \ - case 112: goto QSUB16_INST; \ - case 113: goto SMUAD_INST; \ - case 114: goto SMMUL_INST; \ - case 115: goto SMUSD_INST; \ - case 116: goto SMLSD_INST; \ - case 117: goto SMLSLD_INST; \ - case 118: goto SMMLA_INST; \ - case 119: goto SMMLS_INST; \ - case 120: goto SMLALD_INST; \ - case 121: goto SMLAD_INST; \ - case 122: goto SMLAW_INST; \ - case 123: goto SMULW_INST; \ - case 124: goto PKHTB_INST; \ - case 125: goto PKHBT_INST; \ - case 126: goto SMUL_INST; \ - case 127: goto SMLALXY_INST; \ - case 128: goto SMLA_INST; \ - case 129: goto MCRR_INST; \ - case 130: goto MRRC_INST; \ - case 131: goto CMP_INST; \ - case 132: goto TST_INST; \ - case 133: goto TEQ_INST; \ - case 134: goto CMN_INST; \ - case 135: goto SMULL_INST; \ - case 136: goto UMULL_INST; \ - case 137: goto UMLAL_INST; \ - case 138: goto SMLAL_INST; \ - case 139: goto MUL_INST; \ - case 140: goto MLA_INST; \ - case 141: goto SSAT_INST; \ - case 142: goto USAT_INST; \ - case 143: goto MRS_INST; \ - case 144: goto MSR_INST; \ - case 145: goto AND_INST; \ - case 146: goto BIC_INST; \ - case 147: goto LDM_INST; \ - case 148: goto EOR_INST; \ - case 149: goto ADD_INST; \ - case 150: goto RSB_INST; \ - case 151: goto RSC_INST; \ - case 152: goto SBC_INST; \ - case 153: goto ADC_INST; \ - case 154: goto SUB_INST; \ - case 155: goto ORR_INST; \ - case 156: goto MVN_INST; \ - case 157: goto MOV_INST; \ - case 158: goto STM_INST; \ - case 159: goto LDM_INST; \ - case 160: goto LDRSH_INST; \ - case 161: goto STM_INST; \ - case 162: goto LDM_INST; \ - case 163: goto LDRSB_INST; \ - case 164: goto STRD_INST; \ - case 165: goto LDRH_INST; \ - case 166: goto STRH_INST; \ - case 167: goto LDRD_INST; \ - case 168: goto STRT_INST; \ - case 169: goto STRBT_INST; \ - case 170: goto LDRBT_INST; \ - case 171: goto LDRT_INST; \ - case 172: goto MRC_INST; \ - case 173: goto MCR_INST; \ - case 174: goto MSR_INST; \ - case 175: goto LDRB_INST; \ - case 176: goto STRB_INST; \ - case 177: goto LDR_INST; \ - case 178: goto LDRCOND_INST ; \ - case 179: goto STR_INST; \ - case 180: goto CDP_INST; \ - case 181: goto STC_INST; \ - case 182: goto LDC_INST; \ - case 183: goto SWI_INST; \ - case 184: goto BBL_INST; \ - case 185: goto LDREXD_INST; \ - case 186: goto STREXD_INST; \ - case 187: goto LDREXH_INST; \ - case 188: goto STREXH_INST; \ - case 189: goto B_2_THUMB ; \ - case 190: goto B_COND_THUMB ; \ - case 191: goto BL_1_THUMB ; \ - case 192: goto BL_2_THUMB ; \ - case 193: goto BLX_1_THUMB ; \ - case 194: goto DISPATCH; \ - case 195: goto INIT_INST_LENGTH; \ - case 196: goto END; \ - } + goto next_inst_end; +#define GOTO_NEXT_INST goto next_inst; +next_inst: + if (num_instrs >= cpu->NumInstrsToExecute) goto END; + num_instrs++; + switch(inst_base->idx) { + case 0: goto VMLA_INST; + case 1: goto VMLS_INST; + case 2: goto VNMLA_INST; + case 3: goto VNMLA_INST; + case 4: goto VNMLS_INST; + case 5: goto VNMUL_INST; + case 6: goto VMUL_INST; + case 7: goto VADD_INST; + case 8: goto VSUB_INST; + case 9: goto VDIV_INST; + case 10: goto VMOVI_INST; + case 11: goto VMOVR_INST; + case 12: goto VABS_INST; + case 13: goto VNEG_INST; + case 14: goto VSQRT_INST; + case 15: goto VCMP_INST; + case 16: goto VCMP2_INST; + case 17: goto VCVTBDS_INST; + case 18: goto VCVTBFF_INST; + case 19: goto VCVTBFI_INST; + case 20: goto VMOVBRS_INST; + case 21: goto VMSR_INST; + case 22: goto VMOVBRC_INST; + case 23: goto VMRS_INST; + case 24: goto VMOVBCR_INST; + case 25: goto VMOVBRRSS_INST; + case 26: goto VMOVBRRD_INST; + case 27: goto VSTR_INST; + case 28: goto VPUSH_INST; + case 29: goto VSTM_INST; + case 30: goto VPOP_INST; + case 31: goto VLDR_INST; + case 32: goto VLDM_INST ; + case 33: goto SRS_INST; + case 34: goto RFE_INST; + case 35: goto BKPT_INST; + case 36: goto BLX_INST; + case 37: goto CPS_INST; + case 38: goto PLD_INST; + case 39: goto SETEND_INST; + case 40: goto CLREX_INST; + case 41: goto REV16_INST; + case 42: goto USAD8_INST; + case 43: goto SXTB_INST; + case 44: goto UXTB_INST; + case 45: goto SXTH_INST; + case 46: goto SXTB16_INST; + case 47: goto UXTH_INST; + case 48: goto UXTB16_INST; + case 49: goto CPY_INST; + case 50: goto UXTAB_INST; + case 51: goto SSUB8_INST; + case 52: goto SHSUB8_INST; + case 53: goto SSUBADDX_INST; + case 54: goto STREX_INST; + case 55: goto STREXB_INST; + case 56: goto SWP_INST; + case 57: goto SWPB_INST; + case 58: goto SSUB16_INST; + case 59: goto SSAT16_INST; + case 60: goto SHSUBADDX_INST; + case 61: goto QSUBADDX_INST; + case 62: goto SHADDSUBX_INST; + case 63: goto SHADD8_INST; + case 64: goto SHADD16_INST; + case 65: goto SEL_INST; + case 66: goto SADDSUBX_INST; + case 67: goto SADD8_INST; + case 68: goto SADD16_INST; + case 69: goto SHSUB16_INST; + case 70: goto UMAAL_INST; + case 71: goto UXTAB16_INST; + case 72: goto USUBADDX_INST; + case 73: goto USUB8_INST; + case 74: goto USUB16_INST; + case 75: goto USAT16_INST; + case 76: goto USADA8_INST; + case 77: goto UQSUBADDX_INST; + case 78: goto UQSUB8_INST; + case 79: goto UQSUB16_INST; + case 80: goto UQADDSUBX_INST; + case 81: goto UQADD8_INST; + case 82: goto UQADD16_INST; + case 83: goto SXTAB_INST; + case 84: goto UHSUBADDX_INST; + case 85: goto UHSUB8_INST; + case 86: goto UHSUB16_INST; + case 87: goto UHADDSUBX_INST; + case 88: goto UHADD8_INST; + case 89: goto UHADD16_INST; + case 90: goto UADDSUBX_INST; + case 91: goto UADD8_INST; + case 92: goto UADD16_INST; + case 93: goto SXTAH_INST; + case 94: goto SXTAB16_INST; + case 95: goto QADD8_INST; + case 96: goto BXJ_INST; + case 97: goto CLZ_INST; + case 98: goto UXTAH_INST; + case 99: goto BX_INST; + case 100: goto REV_INST; + case 101: goto BLX_INST; + case 102: goto REVSH_INST; + case 103: goto QADD_INST; + case 104: goto QADD16_INST; + case 105: goto QADDSUBX_INST; + case 106: goto LDREX_INST; + case 107: goto QDADD_INST; + case 108: goto QDSUB_INST; + case 109: goto QSUB_INST; + case 110: goto LDREXB_INST; + case 111: goto QSUB8_INST; + case 112: goto QSUB16_INST; + case 113: goto SMUAD_INST; + case 114: goto SMMUL_INST; + case 115: goto SMUSD_INST; + case 116: goto SMLSD_INST; + case 117: goto SMLSLD_INST; + case 118: goto SMMLA_INST; + case 119: goto SMMLS_INST; + case 120: goto SMLALD_INST; + case 121: goto SMLAD_INST; + case 122: goto SMLAW_INST; + case 123: goto SMULW_INST; + case 124: goto PKHTB_INST; + case 125: goto PKHBT_INST; + case 126: goto SMUL_INST; + case 127: goto SMLALXY_INST; + case 128: goto SMLA_INST; + case 129: goto MCRR_INST; + case 130: goto MRRC_INST; + case 131: goto CMP_INST; + case 132: goto TST_INST; + case 133: goto TEQ_INST; + case 134: goto CMN_INST; + case 135: goto SMULL_INST; + case 136: goto UMULL_INST; + case 137: goto UMLAL_INST; + case 138: goto SMLAL_INST; + case 139: goto MUL_INST; + case 140: goto MLA_INST; + case 141: goto SSAT_INST; + case 142: goto USAT_INST; + case 143: goto MRS_INST; + case 144: goto MSR_INST; + case 145: goto AND_INST; + case 146: goto BIC_INST; + case 147: goto LDM_INST; + case 148: goto EOR_INST; + case 149: goto ADD_INST; + case 150: goto RSB_INST; + case 151: goto RSC_INST; + case 152: goto SBC_INST; + case 153: goto ADC_INST; + case 154: goto SUB_INST; + case 155: goto ORR_INST; + case 156: goto MVN_INST; + case 157: goto MOV_INST; + case 158: goto STM_INST; + case 159: goto LDM_INST; + case 160: goto LDRSH_INST; + case 161: goto STM_INST; + case 162: goto LDM_INST; + case 163: goto LDRSB_INST; + case 164: goto STRD_INST; + case 165: goto LDRH_INST; + case 166: goto STRH_INST; + case 167: goto LDRD_INST; + case 168: goto STRT_INST; + case 169: goto STRBT_INST; + case 170: goto LDRBT_INST; + case 171: goto LDRT_INST; + case 172: goto MRC_INST; + case 173: goto MCR_INST; + case 174: goto MSR_INST; + case 175: goto LDRB_INST; + case 176: goto STRB_INST; + case 177: goto LDR_INST; + case 178: goto LDRCOND_INST ; + case 179: goto STR_INST; + case 180: goto CDP_INST; + case 181: goto STC_INST; + case 182: goto LDC_INST; + case 183: goto SWI_INST; + case 184: goto BBL_INST; + case 185: goto LDREXD_INST; + case 186: goto STREXD_INST; + case 187: goto LDREXH_INST; + case 188: goto STREXH_INST; + case 189: goto B_2_THUMB ; + case 190: goto B_COND_THUMB ; + case 191: goto BL_1_THUMB ; + case 192: goto BL_2_THUMB ; + case 193: goto BLX_1_THUMB ; + case 194: goto DISPATCH; + case 195: goto INIT_INST_LENGTH; + case 196: goto END; + } +next_inst_end: #endif #define UPDATE_NFLAG(dst) (cpu->NFlag = BIT(dst, 31) ? 1 : 0) @@ -3948,7 +4000,6 @@ unsigned InterpreterMainLoop(ARMul_State* state) { #define PC (cpu->Reg[15]) #define CHECK_EXT_INT if (!cpu->NirqSig && !(cpu->Cpsr & 0x80)) goto END; - ARMul_State* cpu = state; // GCC and Clang have a C++ extension to support a lookup table of labels. Otherwise, fallback // to a clunky switch statement. @@ -3978,10 +4029,8 @@ unsigned InterpreterMainLoop(ARMul_State* state) { &&INIT_INST_LENGTH,&&END }; #endif - arm_inst* inst_base; unsigned int addr; unsigned int phys_addr; - unsigned int num_instrs = 0; int ptr; diff --git a/src/core/arm/dyncom/arm_dyncom_run.h b/src/core/arm/dyncom/arm_dyncom_run.h index 85774c565..18c178846 100644 --- a/src/core/arm/dyncom/arm_dyncom_run.h +++ b/src/core/arm/dyncom/arm_dyncom_run.h @@ -26,7 +26,8 @@ void switch_mode(ARMul_State* core, uint32_t mode); // two bytes in size. Thus we don't need to worry about ThumbEE // or Thumb-2 where instructions can be 4 bytes in length. static inline u32 GET_INST_SIZE(ARMul_State* core) { - return core->TFlag? 2 : 4; + //return core->TFlag? 2 : 4; + return 4 - (core->TFlag << 1); } /** @@ -40,7 +41,9 @@ static inline u32 GET_INST_SIZE(ARMul_State* core) { * If the PC is not being read, then the value stored in the register is returned. */ static inline u32 CHECK_READ_REG15_WA(ARMul_State* core, int Rn) { - return (Rn == 15) ? ((core->Reg[15] & ~0x3) + GET_INST_SIZE(core) * 2) : core->Reg[Rn]; + //return (Rn == 15) ? ((core->Reg[15] & ~0x3) + GET_INST_SIZE(core) * 2) : core->Reg[Rn]; + auto Rn15 = (Rn + 1) >> 4; + return (core->Reg[Rn] & ~(Rn15 | (Rn15 << 1))) + ((GET_INST_SIZE(core) << 1) & ~(Rn15 - 1)); } /** @@ -53,5 +56,7 @@ static inline u32 CHECK_READ_REG15_WA(ARMul_State* core, int Rn) { * If the PC is not being read, then the values stored in the register is returned. */ static inline u32 CHECK_READ_REG15(ARMul_State* core, int Rn) { - return (Rn == 15) ? ((core->Reg[15] & ~0x1) + GET_INST_SIZE(core) * 2) : core->Reg[Rn]; + //return (Rn == 15) ? ((core->Reg[15] & ~0x1) + GET_INST_SIZE(core) * 2) : core->Reg[Rn]; + auto Rn15 = (Rn + 1) >> 4; + return (core->Reg[Rn] & ~(Rn15)) + ((GET_INST_SIZE(core) << 1) & ~(Rn15 - 1)); } diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp index dd46f0ec3..f5cdc9832 100644 --- a/src/video_core/rasterizer.cpp +++ b/src/video_core/rasterizer.cpp @@ -177,15 +177,31 @@ private: * * @todo define orientation concretely. */ -static int SignedArea (const Math::Vec2& vtx1, - const Math::Vec2& vtx2, - const Math::Vec2& vtx3) { +static int SignedArea (Math::Vec2 vtx1, + Math::Vec2 vtx2, + Math::Vec2 vtx3) { const auto vec1 = Math::MakeVec(vtx2 - vtx1, 0); const auto vec2 = Math::MakeVec(vtx3 - vtx1, 0); // TODO: There is a very small chance this will overflow for sizeof(int) == 4 return Math::Cross(vec1, vec2).z; }; +static u8 GetAlphaModifierMatrixA[] = { 0, 0, 0, 0, 0, 0, 1, -1, 0, 0, 0, 0, 0, 0 }; +static u8 GetAlphaModifierMatrixB[] = { 0, 0, 0, 0, 0, 0, 0, 255, 0, 0, 0, 0, 0, 0 }; + +inline u8 GetAlphaModifier(Regs::TevStageConfig::AlphaModifier factor, Math::Vec4 values) +{ + using AlphaModifier = Regs::TevStageConfig::AlphaModifier; + + auto fi = (int) factor; + + return + values.a() * GetAlphaModifierMatrixA[fi + 6] + GetAlphaModifierMatrixB[fi + 6] + + values.r() * GetAlphaModifierMatrixA[fi + 4] + GetAlphaModifierMatrixB[fi + 4] + + values.g() * GetAlphaModifierMatrixA[fi + 2] + GetAlphaModifierMatrixB[fi + 2] + + values.b() * GetAlphaModifierMatrixA[fi + 0] + GetAlphaModifierMatrixB[fi + 0]; +} + /** * Helper function for ProcessTriangle with the "reversed" flag to allow for implementing * culling via recursion. @@ -375,41 +391,48 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0, // operations on each of them (e.g. inversion) and then calculate the output color // with some basic arithmetic. Alpha combiners can be configured separately but work // analogously. + using Source = Regs::TevStageConfig::Source; Math::Vec4 combiner_output; + Math::Vec4 constant, empty = {}; + + Math::Vec4 *results[0x11]; + results[(int) Source::PrimaryColor] = results[(int) Source::PrimaryFragmentColor] = &primary_color; + results[(int) Source::Texture0] = texture_color + 0; + results[(int) Source::Texture1] = texture_color + 1; + results[(int) Source::Texture2] = texture_color + 2; + results[(int) 0x7] = ∅ + results[(int) 0x8] = ∅ + results[(int) 0x9] = ∅ + results[(int) 0xa] = ∅ + results[(int) 0xb] = ∅ + results[(int) 0xc] = ∅ + results[(int) 0xd] = ∅ + results[(int) Source::Constant] = &constant; + results[(int) Source::Previous] = &combiner_output; + results[(int) 0x11] = ∅ + + + int tempsI = 0; for (const auto& tev_stage : tev_stages) { - using Source = Regs::TevStageConfig::Source; using ColorModifier = Regs::TevStageConfig::ColorModifier; using AlphaModifier = Regs::TevStageConfig::AlphaModifier; using Operation = Regs::TevStageConfig::Operation; - auto GetSource = [&](Source source) -> Math::Vec4 { - switch (source) { - // TODO: What's the difference between these two? - case Source::PrimaryColor: - case Source::PrimaryFragmentColor: - return primary_color; + struct + { + Math::Vec4 **results, &constant; + const Pica::Regs::TevStageConfig &tev_stage; - case Source::Texture0: - return texture_color[0]; - - case Source::Texture1: - return texture_color[1]; - - case Source::Texture2: - return texture_color[2]; - - case Source::Constant: - return {tev_stage.const_r, tev_stage.const_g, tev_stage.const_b, tev_stage.const_a}; - - case Source::Previous: - return combiner_output; - - default: - LOG_ERROR(HW_GPU, "Unknown color combiner source %d\n", (int)source); - UNIMPLEMENTED(); - return {}; - } - }; + Math::Vec4 &operator()(Source source) + { + constant = { tev_stage.const_r, tev_stage.const_g, tev_stage.const_b, tev_stage.const_a }; + return *(results[std::min((int) source, 0x11)]); + } + } + GetSource + { + results, constant, tev_stage + }; static auto GetColorModifier = [](ColorModifier factor, const Math::Vec4& values) -> Math::Vec3 { switch (factor) { @@ -445,34 +468,6 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0, } }; - static auto GetAlphaModifier = [](AlphaModifier factor, const Math::Vec4& values) -> u8 { - switch (factor) { - case AlphaModifier::SourceAlpha: - return values.a(); - - case AlphaModifier::OneMinusSourceAlpha: - return 255 - values.a(); - - case AlphaModifier::SourceRed: - return values.r(); - - case AlphaModifier::OneMinusSourceRed: - return 255 - values.r(); - - case AlphaModifier::SourceGreen: - return values.g(); - - case AlphaModifier::OneMinusSourceGreen: - return 255 - values.g(); - - case AlphaModifier::SourceBlue: - return values.b(); - - case AlphaModifier::OneMinusSourceBlue: - return 255 - values.b(); - } - }; - static auto ColorCombine = [](Operation op, const Math::Vec3 input[3]) -> Math::Vec3 { switch (op) { case Operation::Replace: diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/vertex_shader.cpp index e8d865172..616a287f3 100644 --- a/src/video_core/vertex_shader.cpp +++ b/src/video_core/vertex_shader.cpp @@ -155,10 +155,11 @@ static void ProcessShaderCode(VertexShaderState& state) { } }; - switch (instr.opcode.Value().GetInfo().type) { + auto info = instr.opcode.Value().GetInfo(); + switch (info.type) { case OpCode::Type::Arithmetic: { - bool is_inverted = 0 != (instr.opcode.Value().GetInfo().subtype & OpCode::Info::SrcInversed); + bool is_inverted = 0 != (info.subtype & OpCode::Info::SrcInversed); // TODO: We don't really support this properly: For instance, the address register // offset needs to be applied to SRC2 instead, etc. // For now, we just abort in this situation. @@ -568,22 +569,23 @@ OutputVertex RunShader(const InputVertex& input, int num_attributes) { const auto& attribute_register_map = registers.vs_input_register_map; float24 dummy_register; boost::fill(state.input_register_table, &dummy_register); - if(num_attributes > 0) state.input_register_table[attribute_register_map.attribute0_register] = &input.attr[0].x; - if(num_attributes > 1) state.input_register_table[attribute_register_map.attribute1_register] = &input.attr[1].x; - if(num_attributes > 2) state.input_register_table[attribute_register_map.attribute2_register] = &input.attr[2].x; - if(num_attributes > 3) state.input_register_table[attribute_register_map.attribute3_register] = &input.attr[3].x; - if(num_attributes > 4) state.input_register_table[attribute_register_map.attribute4_register] = &input.attr[4].x; - if(num_attributes > 5) state.input_register_table[attribute_register_map.attribute5_register] = &input.attr[5].x; - if(num_attributes > 6) state.input_register_table[attribute_register_map.attribute6_register] = &input.attr[6].x; - if(num_attributes > 7) state.input_register_table[attribute_register_map.attribute7_register] = &input.attr[7].x; - if(num_attributes > 8) state.input_register_table[attribute_register_map.attribute8_register] = &input.attr[8].x; - if(num_attributes > 9) state.input_register_table[attribute_register_map.attribute9_register] = &input.attr[9].x; - if(num_attributes > 10) state.input_register_table[attribute_register_map.attribute10_register] = &input.attr[10].x; - if(num_attributes > 11) state.input_register_table[attribute_register_map.attribute11_register] = &input.attr[11].x; - if(num_attributes > 12) state.input_register_table[attribute_register_map.attribute12_register] = &input.attr[12].x; - if(num_attributes > 13) state.input_register_table[attribute_register_map.attribute13_register] = &input.attr[13].x; - if(num_attributes > 14) state.input_register_table[attribute_register_map.attribute14_register] = &input.attr[14].x; - if(num_attributes > 15) state.input_register_table[attribute_register_map.attribute15_register] = &input.attr[15].x; + if(num_attributes > 0) {state.input_register_table[attribute_register_map.attribute0_register] = &input.attr[0].x; + if(num_attributes > 1) {state.input_register_table[attribute_register_map.attribute1_register] = &input.attr[1].x; + if(num_attributes > 2) {state.input_register_table[attribute_register_map.attribute2_register] = &input.attr[2].x; + if(num_attributes > 3) {state.input_register_table[attribute_register_map.attribute3_register] = &input.attr[3].x; + if(num_attributes > 4) {state.input_register_table[attribute_register_map.attribute4_register] = &input.attr[4].x; + if(num_attributes > 5) {state.input_register_table[attribute_register_map.attribute5_register] = &input.attr[5].x; + if(num_attributes > 6) {state.input_register_table[attribute_register_map.attribute6_register] = &input.attr[6].x; + if(num_attributes > 7) {state.input_register_table[attribute_register_map.attribute7_register] = &input.attr[7].x; + if(num_attributes > 8) {state.input_register_table[attribute_register_map.attribute8_register] = &input.attr[8].x; + if(num_attributes > 9) {state.input_register_table[attribute_register_map.attribute9_register] = &input.attr[9].x; + if(num_attributes > 10){ state.input_register_table[attribute_register_map.attribute10_register] = &input.attr[10].x; + if(num_attributes > 11){ state.input_register_table[attribute_register_map.attribute11_register] = &input.attr[11].x; + if(num_attributes > 12){ state.input_register_table[attribute_register_map.attribute12_register] = &input.attr[12].x; + if(num_attributes > 13){ state.input_register_table[attribute_register_map.attribute13_register] = &input.attr[13].x; + if(num_attributes > 14){ state.input_register_table[attribute_register_map.attribute14_register] = &input.attr[14].x; + if(num_attributes > 15){ state.input_register_table[attribute_register_map.attribute15_register] = &input.attr[15].x; + }}}}}}}}}}}}}}}} state.conditional_code[0] = false; state.conditional_code[1] = false;