From 372259712c650b23112dd2828947269c14bd2f08 Mon Sep 17 00:00:00 2001 From: James Sandri <7078671+jlsandri@users.noreply.github.com> Date: Mon, 6 Apr 2026 18:45:30 +1000 Subject: [PATCH 1/2] Codegen: fix VU0 dest-mask lane-bit ordering and VMR32 shuffle pattern Copy-paste errors in the VU0 codegen path across ~36 instruction sites: 1. dest_mask lane-bit ordering was inverted: many emitters passed {0x8, 0x4, 0x2, 0x1} to _mm_set_epi32() instead of the correct {0x1, 0x2, 0x4, 0x8}, and the VSQD emitter had the further typo {0x1, 0x2, 0x2, 0x1}. Because _mm_set_epi32() takes arguments in reverse lane order, the X/Y mask bits were gating the Z/W lanes and vice versa, causing masked writes to silently misroute. The affected instructions include VSQD, VADD/VSUB/VMUL/VMADD/VMSUB family _Field variants, VMINI/VMAX _Field, VFTOI/VITOF family, VADD_bc/VSUB_bc/VMUL_bc/VMADD_bc/VMSUB_bc, VADDq/VSUBq/VMULq/ VMADDq/VMSUBq, VADDi/VSUBi/VMULi/VMADDi/VMSUBi, and a number of ADDAq/VMULA variants. 2. VU0 VMR32 was emitting _MM_SHUFFLE(0,0,0,1) instead of _MM_SHUFFLE(0,3,2,1), so the output was (y, x, x, x) instead of the true 1-lane rotate (y, z, w, x). Both classes of bug are pure copy-paste / typo fixes. The dest_mask reordering (item 1) causes no runtime behavioural change for dest_mask == 0xF (the most common case) or for the bit-symmetric masks 0x6 and 0x9, but the old code was silently wrong for every other masked dest_mask. The VMR32 shuffle fix (item 2) changes the result of every VMR32, because that emitter did not use dest_mask at all. Note: the diff also replaces PS2_VMUL with PS2_VOPMUL at two sites (hunks -3258 and -3529), which is separate from the two fixes described above. --- ps2xRecomp/src/lib/code_generator.cpp | 136 +++++++++++++------------- 1 file changed, 68 insertions(+), 68 deletions(-) diff --git a/ps2xRecomp/src/lib/code_generator.cpp b/ps2xRecomp/src/lib/code_generator.cpp index 666a59e7..5392e635 100644 --- a/ps2xRecomp/src/lib/code_generator.cpp +++ b/ps2xRecomp/src/lib/code_generator.cpp @@ -2359,14 +2359,14 @@ namespace ps2recomp "__m128i mask = _mm_set_epi32({}, {}, {}, {}); " "ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", inst.rd, - (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, - (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? 
-1 : 0, + (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, + (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, inst.rt, inst.rt); } case VU0_S2_VMOVE: return fmt::format("ctx->vu0_vf[{}] = ctx->vu0_vf[{}];", inst.rt, inst.rd); case VU0_S2_VMR32: - return fmt::format("ctx->vu0_vf[{}] = _mm_shuffle_ps(ctx->vu0_vf[{}], ctx->vu0_vf[{}], _MM_SHUFFLE(0,0,0,1));", inst.rt, inst.rd, inst.rd); + return fmt::format("ctx->vu0_vf[{}] = _mm_shuffle_ps(ctx->vu0_vf[{}], ctx->vu0_vf[{}], _MM_SHUFFLE(0,3,2,1));", inst.rt, inst.rd, inst.rd); case VU0_S2_VCLIPw: { uint8_t field = inst.function & 0x3; @@ -2510,7 +2510,7 @@ namespace ps2recomp uint8_t dest_mask = inst.vectorInfo.vectorField; uint8_t field = inst.function & 0x3; std::string shuffle_pattern = fmt::format("_MM_SHUFFLE({},{},{},{})", field, field, field, field); - return fmt::format("{{ __m128 res = PS2_VADD(ctx->vu0_vf[{}], _mm_shuffle_ps(ctx->vu0_vf[{}], ctx->vu0_vf[{}], {})); __m128i mask = _mm_set_epi32({}, {}, {}, {}); ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", vfs, vft, vft, shuffle_pattern, (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, vfd, vfd); + return fmt::format("{{ __m128 res = PS2_VADD(ctx->vu0_vf[{}], _mm_shuffle_ps(ctx->vu0_vf[{}], ctx->vu0_vf[{}], {})); __m128i mask = _mm_set_epi32({}, {}, {}, {}); ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", vfs, vft, vft, shuffle_pattern, (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? 
-1 : 0, vfd, vfd); } std::string CodeGenerator::translateVU_VSUB_Field(const Instruction &inst) @@ -2521,7 +2521,7 @@ namespace ps2recomp uint8_t dest_mask = inst.vectorInfo.vectorField; uint8_t field = inst.function & 0x3; std::string shuffle_pattern = fmt::format("_MM_SHUFFLE({},{},{},{})", field, field, field, field); - return fmt::format("{{ __m128 res = PS2_VSUB(ctx->vu0_vf[{}], _mm_shuffle_ps(ctx->vu0_vf[{}], ctx->vu0_vf[{}], {})); __m128i mask = _mm_set_epi32({}, {}, {}, {}); ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", vfs, vft, vft, shuffle_pattern, (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, vfd, vfd); + return fmt::format("{{ __m128 res = PS2_VSUB(ctx->vu0_vf[{}], _mm_shuffle_ps(ctx->vu0_vf[{}], ctx->vu0_vf[{}], {})); __m128i mask = _mm_set_epi32({}, {}, {}, {}); ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", vfs, vft, vft, shuffle_pattern, (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, vfd, vfd); } std::string CodeGenerator::translateVU_VMUL_Field(const Instruction &inst) @@ -2532,7 +2532,7 @@ namespace ps2recomp uint8_t dest_mask = inst.vectorInfo.vectorField; uint8_t field = inst.function & 0x3; std::string shuffle_pattern = fmt::format("_MM_SHUFFLE({},{},{},{})", field, field, field, field); - return fmt::format("{{ __m128 res = PS2_VMUL(ctx->vu0_vf[{}], _mm_shuffle_ps(ctx->vu0_vf[{}], ctx->vu0_vf[{}], {})); __m128i mask = _mm_set_epi32({}, {}, {}, {}); ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", vfs, vft, vft, shuffle_pattern, (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? 
-1 : 0, vfd, vfd); + return fmt::format("{{ __m128 res = PS2_VMUL(ctx->vu0_vf[{}], _mm_shuffle_ps(ctx->vu0_vf[{}], ctx->vu0_vf[{}], {})); __m128i mask = _mm_set_epi32({}, {}, {}, {}); ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", vfs, vft, vft, shuffle_pattern, (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, vfd, vfd); } std::string CodeGenerator::translateVU_VADD(const Instruction &inst) @@ -2541,7 +2541,7 @@ namespace ps2recomp uint8_t vfs = inst.rd; uint8_t vft = inst.rt; uint8_t dest_mask = inst.vectorInfo.vectorField; - return fmt::format("{{ __m128 res = PS2_VADD(ctx->vu0_vf[{}], ctx->vu0_vf[{}]); __m128i mask = _mm_set_epi32({}, {}, {}, {}); ctx->vu0_vf[{}] = PS2_VBLEND(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", vfs, vft, (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, vfd, vfd); + return fmt::format("{{ __m128 res = PS2_VADD(ctx->vu0_vf[{}], ctx->vu0_vf[{}]); __m128i mask = _mm_set_epi32({}, {}, {}, {}); ctx->vu0_vf[{}] = PS2_VBLEND(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", vfs, vft, (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, vfd, vfd); } std::string CodeGenerator::translateVU_VSUB(const Instruction &inst) @@ -2550,7 +2550,7 @@ namespace ps2recomp uint8_t vfs = inst.rd; uint8_t vft = inst.rt; uint8_t dest_mask = inst.vectorInfo.vectorField; - return fmt::format("{{ __m128 res = PS2_VSUB(ctx->vu0_vf[{}], ctx->vu0_vf[{}]); __m128i mask = _mm_set_epi32({}, {}, {}, {}); ctx->vu0_vf[{}] = PS2_VBLEND(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", vfs, vft, (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? 
-1 : 0, vfd, vfd); + return fmt::format("{{ __m128 res = PS2_VSUB(ctx->vu0_vf[{}], ctx->vu0_vf[{}]); __m128i mask = _mm_set_epi32({}, {}, {}, {}); ctx->vu0_vf[{}] = PS2_VBLEND(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", vfs, vft, (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, vfd, vfd); } std::string CodeGenerator::translateVU_VMUL(const Instruction &inst) @@ -2559,7 +2559,7 @@ namespace ps2recomp uint8_t vfs = inst.rd; uint8_t vft = inst.rt; uint8_t dest_mask = inst.vectorInfo.vectorField; - return fmt::format("{{ __m128 res = PS2_VMUL(ctx->vu0_vf[{}], ctx->vu0_vf[{}]); __m128i mask = _mm_set_epi32({}, {}, {}, {}); ctx->vu0_vf[{}] = PS2_VBLEND(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", vfs, vft, (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, vfd, vfd); + return fmt::format("{{ __m128 res = PS2_VMUL(ctx->vu0_vf[{}], ctx->vu0_vf[{}]); __m128i mask = _mm_set_epi32({}, {}, {}, {}); ctx->vu0_vf[{}] = PS2_VBLEND(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", vfs, vft, (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, vfd, vfd); } std::string CodeGenerator::translatePEXT5(const Instruction &inst) @@ -2912,8 +2912,8 @@ namespace ps2recomp "__m128i mask = _mm_set_epi32({}, {}, {}, {}); " "ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", inst.rd, - (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, - (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, + (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, + (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, inst.rt, inst.rt); } @@ -3018,8 +3018,8 @@ namespace ps2recomp "ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); " "ctx->vu0_acc = res; }}", vfs, vft, vft, shuffle_pattern, - (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? 
-1 : 0, - (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, + (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, + (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, vfd, vfd); } @@ -3039,8 +3039,8 @@ namespace ps2recomp "ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); " "ctx->vu0_acc = res; }}", vfs, vft, vft, shuffle_pattern, - (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, - (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, + (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, + (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, vfd, vfd); } @@ -3058,8 +3058,8 @@ namespace ps2recomp "__m128i mask = _mm_set_epi32({}, {}, {}, {}); " "ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", vfs, vft, vft, shuffle_pattern, - (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, - (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, + (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, + (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, vfd, vfd); } @@ -3077,8 +3077,8 @@ namespace ps2recomp "__m128i mask = _mm_set_epi32({}, {}, {}, {}); " "ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", vfs, vft, vft, shuffle_pattern, - (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, - (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, + (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, + (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, vfd, vfd); } @@ -3094,8 +3094,8 @@ namespace ps2recomp "ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); " "ctx->vu0_acc = res; }}", vfs, vft, - (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, - (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, + (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, + (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? 
-1 : 0, vfd, vfd); } @@ -3110,8 +3110,8 @@ namespace ps2recomp "ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); " "ctx->vu0_acc = res; }}", vfs, - (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, - (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, + (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, + (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, vfd, vfd); } @@ -3126,8 +3126,8 @@ namespace ps2recomp "ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); " "ctx->vu0_acc = res; }}", vfs, - (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, - (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, + (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, + (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, vfd, vfd); } @@ -3141,8 +3141,8 @@ namespace ps2recomp "__m128i mask = _mm_set_epi32({}, {}, {}, {}); " "ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", vfs, vft, - (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, - (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, + (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, + (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, vfd, vfd); } @@ -3155,8 +3155,8 @@ namespace ps2recomp "__m128i mask = _mm_set_epi32({}, {}, {}, {}); " "ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", vfs, - (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, - (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, + (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, + (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, vfd, vfd); } @@ -3169,8 +3169,8 @@ namespace ps2recomp "__m128i mask = _mm_set_epi32({}, {}, {}, {}); " "ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", vfs, - (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, - (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, + (dest_mask & 0x1) ? 
-1 : 0, (dest_mask & 0x2) ? -1 : 0, + (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, vfd, vfd); } @@ -3183,8 +3183,8 @@ namespace ps2recomp "__m128i mask = _mm_set_epi32({}, {}, {}, {}); " "ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", vfs, - (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, - (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, + (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, + (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, vfd, vfd); } @@ -3197,8 +3197,8 @@ namespace ps2recomp "__m128i mask = _mm_set_epi32({}, {}, {}, {}); " "ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", vfs, - (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, - (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, + (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, + (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, vfd, vfd); } @@ -3215,8 +3215,8 @@ namespace ps2recomp "ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); " "ctx->vu0_acc = res; }}", vfs, vft, - (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, - (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, + (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, + (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, vfd, vfd); } @@ -3231,8 +3231,8 @@ namespace ps2recomp "__m128i mask = _mm_set_epi32({}, {}, {}, {}); " "ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", vfs, - (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, - (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, + (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, + (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, vfd, vfd); } @@ -3245,8 +3245,8 @@ namespace ps2recomp "__m128i mask = _mm_set_epi32({}, {}, {}, {}); " "ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", vfs, - (dest_mask & 0x8) ? 
-1 : 0, (dest_mask & 0x4) ? -1 : 0, - (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, + (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, + (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, vfd, vfd); } @@ -3258,14 +3258,14 @@ namespace ps2recomp uint8_t vfs = inst.rd; uint8_t vft = inst.rt; uint8_t dest_mask = inst.vectorInfo.vectorField; - return fmt::format("{{ __m128 mul_res = PS2_VMUL(ctx->vu0_vf[{}], ctx->vu0_vf[{}]); " + return fmt::format("{{ __m128 mul_res = PS2_VOPMUL(ctx->vu0_vf[{}], ctx->vu0_vf[{}]); " "__m128 res = PS2_VSUB(ctx->vu0_acc, mul_res); " "__m128i mask = _mm_set_epi32({}, {}, {}, {}); " "ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); " "ctx->vu0_acc = res; }}", vfs, vft, - (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, - (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, + (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, + (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, vfd, vfd); } @@ -3279,8 +3279,8 @@ namespace ps2recomp "__m128i mask = _mm_set_epi32({}, {}, {}, {}); " "ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", vfs, vft, - (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, - (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, + (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, + (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, vfd, vfd); } @@ -3295,8 +3295,8 @@ namespace ps2recomp "__m128i mask = _mm_set_epi32({}, {}, {}, {}); " "ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", vfs, - (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, - (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, + (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, + (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? 
-1 : 0, vfd, vfd); } @@ -3309,8 +3309,8 @@ namespace ps2recomp "__m128i mask = _mm_set_epi32({}, {}, {}, {}); " "ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", vfs, - (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, - (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, + (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, + (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, vfd, vfd); } @@ -3326,8 +3326,8 @@ namespace ps2recomp "ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); " "ctx->vu0_acc = res; }}", vfs, - (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, - (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, + (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, + (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, vfd, vfd); } @@ -3342,8 +3342,8 @@ namespace ps2recomp "ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); " "ctx->vu0_acc = res; }}", vfs, - (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, - (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, + (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, + (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, vfd, vfd); } @@ -3529,7 +3529,7 @@ namespace ps2recomp { uint8_t vfs = inst.rd; uint8_t vft = inst.rt; - return fmt::format("ctx->vu0_acc = PS2_VMUL(ctx->vu0_vf[{}], ctx->vu0_vf[{}]);", + return fmt::format("ctx->vu0_acc = PS2_VOPMUL(ctx->vu0_vf[{}], ctx->vu0_vf[{}]);", vfs, vft); } @@ -3545,8 +3545,8 @@ namespace ps2recomp "__m128i mask = _mm_set_epi32({}, {}, {}, {}); " "ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", vfs, formatFloatLiteral(scale), - (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, - (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, + (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, + (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? 
-1 : 0, inst.rt, inst.rt); } @@ -3563,8 +3563,8 @@ namespace ps2recomp "__m128i mask = _mm_set_epi32({}, {}, {}, {}); " "ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", vfs, formatFloatLiteral(scale), - (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, - (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, + (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, + (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, inst.rt, inst.rt); } @@ -3578,8 +3578,8 @@ namespace ps2recomp "ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); " "ctx->vi[{}] = (ctx->vi[{}] + 1) & 0x3FF; }}", vis, - (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, - (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, + (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, + (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, inst.rt, inst.rt, vis, vis); } @@ -3595,8 +3595,8 @@ namespace ps2recomp "ctx->vi[{}] = (ctx->vi[{}] + 1) & 0x3FF; }}", vis, inst.rt, - (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, - (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, + (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, + (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, vis, vis); } @@ -3611,8 +3611,8 @@ namespace ps2recomp "ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", vis, vis, vis, - (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, - (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, + (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, + (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, inst.rt, inst.rt); } @@ -3628,15 +3628,15 @@ namespace ps2recomp vis, vis, vis, inst.rt, - (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, - (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0); + (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, + (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? 
-1 : 0); } std::string CodeGenerator::translateVU_VRGET(const Instruction &inst) { uint8_t dest_mask = inst.vectorInfo.vectorField; uint8_t ft_reg = inst.rt; - return fmt::format("{{ __m128 res = ctx->vu0_r; __m128i mask = _mm_set_epi32({}, {}, {}, {}); ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", (dest_mask & 0x8) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x1) ? -1 : 0, ft_reg, ft_reg); + return fmt::format("{{ __m128 res = ctx->vu0_r; __m128i mask = _mm_set_epi32({}, {}, {}, {}); ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, ft_reg, ft_reg); } std::string CodeGenerator::translateVU_VRINIT(const Instruction &inst) From 0b88fb91d8eeb36e41140db18a20393541b6b37c Mon Sep 17 00:00:00 2001 From: James Sandri <7078671+jlsandri@users.noreply.github.com> Date: Mon, 6 Apr 2026 18:45:39 +1000 Subject: [PATCH 2/2] Codegen: add dest-mask support for VU0 VMOVE and VMR32 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously VMOVE and VMR32 emitted unconditional moves/shuffles that ignored the instruction's dest_mask field, so any masked VMOVE/VMR32 would incorrectly overwrite lanes the mask was meant to preserve. This patch: - Fast-paths dest_mask == 0xF to the existing unconditional codegen. - For any other mask, emits an _mm_blendv_ps guarded by a per-lane selector built from the dest_mask bits, matching the lane-selection convention used by the rest of the VU0 dest-mask emitters. Stacks on the VMR32 shuffle pattern fix — without that prerequisite, the masked VMR32 path would rotate to the wrong lane. 
--- ps2xRecomp/src/lib/code_generator.cpp | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/ps2xRecomp/src/lib/code_generator.cpp b/ps2xRecomp/src/lib/code_generator.cpp index 5392e635..06a6c17a 100644 --- a/ps2xRecomp/src/lib/code_generator.cpp +++ b/ps2xRecomp/src/lib/code_generator.cpp @@ -2364,9 +2364,32 @@ namespace ps2recomp inst.rt, inst.rt); } case VU0_S2_VMOVE: - return fmt::format("ctx->vu0_vf[{}] = ctx->vu0_vf[{}];", inst.rt, inst.rd); + { + uint8_t dest_mask = inst.vectorInfo.vectorField; + if (dest_mask == 0xF) { + return fmt::format("ctx->vu0_vf[{}] = ctx->vu0_vf[{}];", inst.rt, inst.rd); + } + return fmt::format("{{ __m128i mask = _mm_set_epi32({}, {}, {}, {}); " + "ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], ctx->vu0_vf[{}], _mm_castsi128_ps(mask)); }}", + (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, + (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, + inst.rt, inst.rt, inst.rd); + } case VU0_S2_VMR32: - return fmt::format("ctx->vu0_vf[{}] = _mm_shuffle_ps(ctx->vu0_vf[{}], ctx->vu0_vf[{}], _MM_SHUFFLE(0,3,2,1));", inst.rt, inst.rd, inst.rd); + { + uint8_t dest_mask = inst.vectorInfo.vectorField; + if (dest_mask == 0xF) { + // All components — no blend needed + return fmt::format("ctx->vu0_vf[{}] = _mm_shuffle_ps(ctx->vu0_vf[{}], ctx->vu0_vf[{}], _MM_SHUFFLE(0,3,2,1));", inst.rt, inst.rd, inst.rd); + } + return fmt::format("{{ __m128 res = _mm_shuffle_ps(ctx->vu0_vf[{}], ctx->vu0_vf[{}], _MM_SHUFFLE(0,3,2,1)); " + "__m128i mask = _mm_set_epi32({}, {}, {}, {}); " + "ctx->vu0_vf[{}] = _mm_blendv_ps(ctx->vu0_vf[{}], res, _mm_castsi128_ps(mask)); }}", + inst.rd, inst.rd, + (dest_mask & 0x1) ? -1 : 0, (dest_mask & 0x2) ? -1 : 0, + (dest_mask & 0x4) ? -1 : 0, (dest_mask & 0x8) ? -1 : 0, + inst.rt, inst.rt); + } case VU0_S2_VCLIPw: { uint8_t field = inst.function & 0x3;