From ef43776c7bf4a176978171c9406b24f7d266c499 Mon Sep 17 00:00:00 2001 From: GPUCode <47210458+GPUCode@users.noreply.github.com> Date: Wed, 18 Oct 2023 19:41:36 +0300 Subject: [PATCH] shader: Fix address register offset behavior in x64 Jit (#6942) * shader: Fix address register offset behavior in x64 Jit * shader: Remove redundant jump * tests: Add address register tests * shader: Remove additional pre-multiplications by 16 * tests: Add catch-stringifier for vec4f * tests: Format --- .../shader/shader_jit_x64_compiler.cpp | 59 ++++++++++++++++++ .../shader/shader_jit_x64_compiler.cpp | 62 +++++++++++-------- 2 files changed, 94 insertions(+), 27 deletions(-) diff --git a/src/tests/video_core/shader/shader_jit_x64_compiler.cpp b/src/tests/video_core/shader/shader_jit_x64_compiler.cpp index cb44958fd7..cb76decf31 100644 --- a/src/tests/video_core/shader/shader_jit_x64_compiler.cpp +++ b/src/tests/video_core/shader/shader_jit_x64_compiler.cpp @@ -28,6 +28,15 @@ static constexpr Common::Vec4f vec4_nan = Common::Vec4f::AssignToAll(NAN); static constexpr Common::Vec4f vec4_one = Common::Vec4f::AssignToAll(1.0f); static constexpr Common::Vec4f vec4_zero = Common::Vec4f::AssignToAll(0.0f); +namespace Catch { +template <> +struct StringMaker { + static std::string convert(Common::Vec4f value) { + return fmt::format("({}, {}, {}, {})", value.r(), value.g(), value.b(), value.a()); + } +}; +} // namespace Catch + static std::unique_ptr CompileShaderSetup( std::initializer_list code) { const auto shbin = nihstro::InlineAsm::CompileToRawBinary(code); @@ -385,6 +394,56 @@ TEST_CASE("RSQ", "[video_core][shader][shader_jit]") { REQUIRE(shader.Run({0.0625f}).x == Catch::Approx(4.0f).margin(0.004f)); } +TEST_CASE("Address Register Offset", "[video_core][shader][shader_jit]") { + const auto sh_input = SourceRegister::MakeInput(0); + const auto sh_c40 = SourceRegister::MakeFloat(40); + const auto sh_output = DestRegister::MakeOutput(0); + + auto shader = ShaderTest({ + // mova a0.x, sh_input.x + {OpCode::Id::MOVA, DestRegister{}, "x", sh_input, "x", SourceRegister{}, "", + nihstro::InlineAsm::RelativeAddress::A1}, + // mov sh_output.xyzw, c40[a0.x].xyzw + {OpCode::Id::MOV, sh_output, "xyzw", sh_c40, "xyzw", SourceRegister{}, "", + nihstro::InlineAsm::RelativeAddress::A1}, + {OpCode::Id::END}, + }); + + // Prepare shader uniforms + const bool inverted = true; + std::array f_uniforms; + for (u32 i = 0; i < 0x80; i++) { + if (i >= 0x00 && i < 0x60) { + const u32 base = inverted ? (0x60 - i) : i; + const auto color = (base * 2.f) / 255.0f; + const auto color_f24 = Pica::f24::FromFloat32(color); + shader.shader_setup->uniforms.f[i] = {color_f24, color_f24, color_f24, + Pica::f24::One()}; + f_uniforms[i] = {color, color, color, 1.f}; + } else if (i >= 0x60 && i < 0x70) { + const u8 color = static_cast((i - 0x60) * 0x10); + shader.shader_setup->uniforms.i[i - 0x60] = {color, color, color, 255}; + } else if (i >= 0x70 && i < 0x80) { + shader.shader_setup->uniforms.b[i - 0x70] = i >= 0x78; + } + } + + REQUIRE(shader.Run(0.f) == f_uniforms[40]); + REQUIRE(shader.Run(13.f) == f_uniforms[53]); + REQUIRE(shader.Run(50.f) == f_uniforms[90]); + REQUIRE(shader.Run(60.f) == vec4_one); + REQUIRE(shader.Run(74.f) == vec4_one); + REQUIRE(shader.Run(87.f) == vec4_one); + REQUIRE(shader.Run(88.f) == f_uniforms[0]); + REQUIRE(shader.Run(128.f) == f_uniforms[40]); + REQUIRE(shader.Run(-40.f) == f_uniforms[0]); + REQUIRE(shader.Run(-42.f) == vec4_one); + REQUIRE(shader.Run(-70.f) == vec4_one); + REQUIRE(shader.Run(-73.f) == f_uniforms[95]); + REQUIRE(shader.Run(-127.f) == f_uniforms[41]); + REQUIRE(shader.Run(-129.f) == f_uniforms[40]); +} + // TODO: Requires fix from https://github.com/neobrain/nihstro/issues/68 // TEST_CASE("MAD", "[video_core][shader][shader_jit]") { // const auto sh_input1 = SourceRegister::MakeInput(0); diff --git a/src/video_core/shader/shader_jit_x64_compiler.cpp b/src/video_core/shader/shader_jit_x64_compiler.cpp index 6a30d4e23e..9336061c2f 100644 --- a/src/video_core/shader/shader_jit_x64_compiler.cpp +++ b/src/video_core/shader/shader_jit_x64_compiler.cpp @@ -232,21 +232,45 @@ void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRe address_register_index = instr.common.address_register_index; } - if (src_num == offset_src && address_register_index != 0) { + if (src_reg.GetRegisterType() == RegisterType::FloatUniform && src_num == offset_src && + address_register_index != 0) { + Xbyak::Reg64 address_reg; switch (address_register_index) { - case 1: // address offset 1 - movaps(dest, xword[src_ptr + ADDROFFS_REG_0 + src_offset_disp]); + case 1: + address_reg = ADDROFFS_REG_0; break; - case 2: // address offset 2 - movaps(dest, xword[src_ptr + ADDROFFS_REG_1 + src_offset_disp]); + case 2: + address_reg = ADDROFFS_REG_1; break; - case 3: // address offset 3 - movaps(dest, xword[src_ptr + LOOPCOUNT_REG.cvt64() + src_offset_disp]); + case 3: + address_reg = LOOPCOUNT_REG.cvt64(); break; default: UNREACHABLE(); break; } + // s32 offset = address_reg >= -128 && address_reg <= 127 ? address_reg : 0; + // u32 index = (src_reg.GetIndex() + offset) & 0x7f; + + // First we add 128 to address_reg so the first comparison is turned to + // address_reg >= 0 && address_reg < 256 which can be performed with + // a single unsigned comparison (cmovb) + lea(eax, ptr[address_reg + 128]); + mov(ebx, src_reg.GetIndex()); + mov(ecx, address_reg.cvt32()); + add(ecx, ebx); + cmp(eax, 256); + cmovb(ebx, ecx); + and_(ebx, 0x7f); + + // index > 95 ? vec4(1.0) : uniforms.f[index]; + movaps(dest, ONE); + cmp(ebx, 95); + Label load_end; + jg(load_end); + shl(rbx, 4); + movaps(dest, xword[src_ptr + rbx]); + L(load_end); } else { // Load the source movaps(dest, xword[src_ptr + src_offset_disp]); @@ -590,24 +614,14 @@ void JitShader::Compile_MOVA(Instruction instr) { // Move and sign-extend high 32 bits shr(rax, 32); movsxd(ADDROFFS_REG_1, eax); - - // Multiply by 16 to be used as an offset later - shl(ADDROFFS_REG_0, 4); - shl(ADDROFFS_REG_1, 4); } else { if (swiz.DestComponentEnabled(0)) { // Move and sign-extend low 32 bits movsxd(ADDROFFS_REG_0, eax); - - // Multiply by 16 to be used as an offset later - shl(ADDROFFS_REG_0, 4); } else if (swiz.DestComponentEnabled(1)) { // Move and sign-extend high 32 bits shr(rax, 32); movsxd(ADDROFFS_REG_1, eax); - - // Multiply by 16 to be used as an offset later - shl(ADDROFFS_REG_1, 4); } } } @@ -659,9 +673,6 @@ void JitShader::Compile_END(Instruction instr) { mov(byte[STATE + offsetof(UnitState, conditional_code[1])], COND1.cvt8()); // Save address/loop registers - sar(ADDROFFS_REG_0, 4); - sar(ADDROFFS_REG_1, 4); - sar(LOOPCOUNT_REG, 4); mov(dword[STATE + offsetof(UnitState, address_registers[0])], ADDROFFS_REG_0.cvt32()); mov(dword[STATE + offsetof(UnitState, address_registers[1])], ADDROFFS_REG_1.cvt32()); mov(dword[STATE + offsetof(UnitState, address_registers[2])], LOOPCOUNT_REG); @@ -813,11 +824,11 @@ void JitShader::Compile_LOOP(Instruction instr) { std::size_t offset = Uniforms::GetIntUniformOffset(instr.flow_control.int_uniform_id); mov(LOOPCOUNT, dword[UNIFORMS + offset]); mov(LOOPCOUNT_REG, LOOPCOUNT); - shr(LOOPCOUNT_REG, 4); - and_(LOOPCOUNT_REG, 0xFF0); // Y-component is the start + shr(LOOPCOUNT_REG, 8); + and_(LOOPCOUNT_REG, 0xFF); // Y-component is the start mov(LOOPINC, LOOPCOUNT); - shr(LOOPINC, 12); - and_(LOOPINC, 0xFF0); // Z-component is the incrementer + shr(LOOPINC, 16); + and_(LOOPINC, 0xFF); // Z-component is the incrementer movzx(LOOPCOUNT, LOOPCOUNT.cvt8()); // X-component is iteration count add(LOOPCOUNT, 1); // Iteration count is X-component + 1 @@ -993,9 +1004,6 @@ void JitShader::Compile(const std::array* program_ movsxd(ADDROFFS_REG_0, dword[STATE + offsetof(UnitState, address_registers[0])]); movsxd(ADDROFFS_REG_1, dword[STATE + offsetof(UnitState, address_registers[1])]); mov(LOOPCOUNT_REG, dword[STATE + offsetof(UnitState, address_registers[2])]); - shl(ADDROFFS_REG_0, 4); - shl(ADDROFFS_REG_1, 4); - shl(LOOPCOUNT_REG, 4); // Load conditional code mov(COND0, byte[STATE + offsetof(UnitState, conditional_code[0])]);