From 235a251d3cec20251105f59fe9670f08cb8bb1a8 Mon Sep 17 00:00:00 2001 From: MerryMage Date: Sat, 25 Nov 2017 03:17:30 +0000 Subject: [PATCH 1/2] tests: Add tests for x64 shader jit Tests LG2 and EX2 instructions --- src/tests/CMakeLists.txt | 10 +- .../shader/shader_jit_x64_compiler.cpp | 91 +++++++++++++++++++ src/video_core/CMakeLists.txt | 2 +- 3 files changed, 100 insertions(+), 3 deletions(-) create mode 100644 src/tests/video_core/shader/shader_jit_x64_compiler.cpp diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index 1aac0daa23..488cbc554f 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -13,11 +13,17 @@ set(HEADERS core/arm/arm_test_common.h ) +if (ARCHITECTURE_x86_64) + set(SRCS ${SRCS} + video_core/shader/shader_jit_x64_compiler.cpp + ) +endif() + create_directory_groups(${SRCS} ${HEADERS}) add_executable(tests ${SRCS} ${HEADERS}) -target_link_libraries(tests PRIVATE common core) +target_link_libraries(tests PRIVATE common core video_core) target_link_libraries(tests PRIVATE glad) # To support linker work-around -target_link_libraries(tests PRIVATE ${PLATFORM_LIBRARIES} catch-single-include Threads::Threads) +target_link_libraries(tests PRIVATE ${PLATFORM_LIBRARIES} catch-single-include nihstro-headers Threads::Threads) add_test(NAME tests COMMAND tests) diff --git a/src/tests/video_core/shader/shader_jit_x64_compiler.cpp b/src/tests/video_core/shader/shader_jit_x64_compiler.cpp new file mode 100644 index 0000000000..7008670f51 --- /dev/null +++ b/src/tests/video_core/shader/shader_jit_x64_compiler.cpp @@ -0,0 +1,91 @@ +// Copyright 2017 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include +#include +#include +#include +#include +#include "video_core/shader/shader_jit_x64_compiler.h" + +using float24 = Pica::float24; +using JitShader = Pica::Shader::JitShader; + +using DestRegister = nihstro::DestRegister; +using OpCode = nihstro::OpCode; +using SourceRegister = nihstro::SourceRegister; + +static std::unique_ptr CompileShader(std::initializer_list code) { + const auto shbin = nihstro::InlineAsm::CompileToRawBinary(code); + + std::array program_code{}; + std::array swizzle_data{}; + + std::transform(shbin.program.begin(), shbin.program.end(), program_code.begin(), + [](const auto& x) { return x.hex; }); + std::transform(shbin.swizzle_table.begin(), shbin.swizzle_table.end(), swizzle_data.begin(), + [](const auto& x) { return x.hex; }); + + auto shader = std::make_unique(); + shader->Compile(&program_code, &swizzle_data); + + return shader; +} + +class ShaderTest { +public: + explicit ShaderTest(std::initializer_list code) + : shader(CompileShader(code)) {} + + float Run(float input) { + Pica::Shader::ShaderSetup shader_setup; + Pica::Shader::UnitState shader_unit; + + shader_unit.registers.input[0].x = float24::FromFloat32(input); + shader->Run(shader_setup, shader_unit, 0); + return shader_unit.registers.output[0].x.ToFloat32(); + } + +public: + std::unique_ptr shader; +}; + +TEST_CASE("LG2", "[video_core][shader][shader_jit]") { + const auto sh_input = SourceRegister::MakeInput(0); + const auto sh_output = DestRegister::MakeOutput(0); + + auto shader = ShaderTest({ + // clang-format off + {OpCode::Id::LG2, sh_output, sh_input}, + {OpCode::Id::END}, + // clang-format on + }); + + REQUIRE(std::isnan(shader.Run(NAN))); + REQUIRE(std::isnan(shader.Run(-1.f))); + REQUIRE(std::isinf(shader.Run(0.f))); + REQUIRE(shader.Run(4.f) == Approx(2.f)); + REQUIRE(shader.Run(64.f) == Approx(6.f)); + REQUIRE(shader.Run(1.e24f) == Approx(79.7262742773f)); +} + +TEST_CASE("EX2", "[video_core][shader][shader_jit]") { + const auto sh_input = 
SourceRegister::MakeInput(0); + const auto sh_output = DestRegister::MakeOutput(0); + + auto shader = ShaderTest({ + // clang-format off + {OpCode::Id::EX2, sh_output, sh_input}, + {OpCode::Id::END}, + // clang-format on + }); + + REQUIRE(std::isnan(shader.Run(NAN))); + REQUIRE(shader.Run(-800.f) == Approx(0.f)); + REQUIRE(shader.Run(0.f) == Approx(1.f)); + REQUIRE(shader.Run(2.f) == Approx(4.f)); + REQUIRE(shader.Run(6.f) == Approx(64.f)); + REQUIRE(shader.Run(79.7262742773f) == Approx(1.e24f)); + REQUIRE(std::isinf(shader.Run(800.f))); +} diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 82f47d8a90..d492e5188a 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -87,7 +87,7 @@ target_link_libraries(video_core PUBLIC common core) target_link_libraries(video_core PRIVATE glad nihstro-headers) if (ARCHITECTURE_x86_64) - target_link_libraries(video_core PRIVATE xbyak) + target_link_libraries(video_core PUBLIC xbyak) endif() if (PNG_FOUND) From c1aef260afefa45e45857954484721997303f8ad Mon Sep 17 00:00:00 2001 From: MerryMage Date: Sat, 25 Nov 2017 03:18:55 +0000 Subject: [PATCH 2/2] shader_jit_x64_compiler: Remove ABI overhead of LG2 and EX2 This involves reimplementing log2f and exp2f. 
--- .../shader/shader_jit_x64_compiler.cpp | 192 ++++++++++++++++-- .../shader/shader_jit_x64_compiler.h | 10 + 2 files changed, 185 insertions(+), 17 deletions(-) diff --git a/src/video_core/shader/shader_jit_x64_compiler.cpp b/src/video_core/shader/shader_jit_x64_compiler.cpp index 5a856dcaa6..c8afdd543c 100644 --- a/src/video_core/shader/shader_jit_x64_compiler.cpp +++ b/src/video_core/shader/shader_jit_x64_compiler.cpp @@ -432,27 +432,13 @@ void JitShader::Compile_DPH(Instruction instr) { void JitShader::Compile_EX2(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - movss(xmm0, SRC1); // ABI_PARAM1 - - ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); - CallFarFunction(*this, exp2f); - ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); - - shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN - movaps(SRC1, xmm0); + call(exp2_subroutine); Compile_DestEnable(instr, SRC1); } void JitShader::Compile_LG2(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - movss(xmm0, SRC1); // ABI_PARAM1 - - ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); - CallFarFunction(*this, log2f); - ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); - - shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN - movaps(SRC1, xmm0); + call(log2_subroutine); Compile_DestEnable(instr, SRC1); } @@ -935,7 +921,179 @@ void JitShader::Compile(const std::array* program_ LOG_DEBUG(HW_GPU, "Compiled shader size=%lu", getSize()); } -JitShader::JitShader() : Xbyak::CodeGenerator(MAX_SHADER_SIZE) {} +JitShader::JitShader() : Xbyak::CodeGenerator(MAX_SHADER_SIZE) { + CompilePrelude(); +} + +void JitShader::CompilePrelude() { + log2_subroutine = CompilePrelude_Log2(); + exp2_subroutine = CompilePrelude_Exp2(); +} + +Xbyak::Label JitShader::CompilePrelude_Log2() { + Xbyak::Label subroutine; + + // SSE does not have a log instruction, thus we must approximate. 
+ // We perform this approximation by first performing a range reduction into the range [1.0, 2.0). + // A minimax polynomial which was fit for the function log2(x) / (x - 1) is then evaluated. + // We multiply the result by (x - 1) then restore the result into the appropriate range. + + // Coefficients for the minimax polynomial. + // f(x) computes approximately log2(x) / (x - 1). + // f(x) = c4 + x * (c3 + x * (c2 + x * (c1 + x * c0))). + align(64); + const void* c0 = getCurr(); + dd(0x3d74552f); + const void* c1 = getCurr(); + dd(0xbeee7397); + const void* c2 = getCurr(); + dd(0x3fbd96dd); + const void* c3 = getCurr(); + dd(0xc02153f6); + const void* c4 = getCurr(); + dd(0x4038d96c); + + align(16); + const void* negative_infinity_vector = getCurr(); + dd(0xff800000); + dd(0xff800000); + dd(0xff800000); + dd(0xff800000); + const void* default_qnan_vector = getCurr(); + dd(0x7fc00000); + dd(0x7fc00000); + dd(0x7fc00000); + dd(0x7fc00000); + + Xbyak::Label input_is_nan, input_is_zero, input_out_of_range; + + align(16); + L(input_out_of_range); + je(input_is_zero); + movaps(SRC1, xword[rip + default_qnan_vector]); + ret(); + L(input_is_zero); + movaps(SRC1, xword[rip + negative_infinity_vector]); + ret(); + + align(16); + L(subroutine); + + // Here we handle edge cases: input in {NaN, 0, -Inf, Negative}. + xorps(SCRATCH, SCRATCH); + ucomiss(SCRATCH, SRC1); + jp(input_is_nan); + jae(input_out_of_range); + + // Split input + movd(eax, SRC1); + mov(edx, eax); + and_(eax, 0x7f800000); + and_(edx, 0x007fffff); + movss(SCRATCH, xword[rip + c0]); // Preload c0. + or_(edx, 0x3f800000); + movd(SRC1, edx); + // SRC1 now contains the mantissa of the input. + mulss(SCRATCH, SRC1); + shr(eax, 23); + sub(eax, 0x7f); + cvtsi2ss(SCRATCH2, eax); + // SCRATCH2 now contains the exponent of the input. 
+ + // Complete computation of polynomial + addss(SCRATCH, xword[rip + c1]); + mulss(SCRATCH, SRC1); + addss(SCRATCH, xword[rip + c2]); + mulss(SCRATCH, SRC1); + addss(SCRATCH, xword[rip + c3]); + mulss(SCRATCH, SRC1); + subss(SRC1, ONE); + addss(SCRATCH, xword[rip + c4]); + mulss(SCRATCH, SRC1); + addss(SCRATCH2, SCRATCH); + + // Duplicate result across vector + xorps(SRC1, SRC1); // break dependency chain + movss(SRC1, SCRATCH2); + L(input_is_nan); + shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); + + ret(); + + return subroutine; +} + +Xbyak::Label JitShader::CompilePrelude_Exp2() { + Xbyak::Label subroutine; + + // SSE does not have an exp instruction, thus we must approximate. + // We perform this approximation by first performing a range reduction into the range [0, 1]. + // A minimax polynomial which was fit for the function exp2(x) is then evaluated. + // We then restore the result into the appropriate range. + + align(64); + const void* input_max = getCurr(); + dd(0x43010000); + const void* input_min = getCurr(); + dd(0xc2fdffff); + const void* c0 = getCurr(); + dd(0x3c5dbe69); + const void* half = getCurr(); + dd(0x3f000000); + const void* c1 = getCurr(); + dd(0x3d5509f9); + const void* c2 = getCurr(); + dd(0x3e773cc5); + const void* c3 = getCurr(); + dd(0x3f3168b3); + const void* c4 = getCurr(); + dd(0x3f800016); + + Xbyak::Label ret_label; + + align(16); + L(subroutine); + + // Handle edge cases + ucomiss(SRC1, SRC1); + jp(ret_label); + // Clamp to maximum range since we shift the value directly into the exponent. + minss(SRC1, xword[rip + input_max]); + maxss(SRC1, xword[rip + input_min]); + + // Decompose input + movss(SCRATCH, SRC1); + movss(SCRATCH2, xword[rip + c0]); // Preload c0. + subss(SCRATCH, xword[rip + half]); + cvtss2si(eax, SCRATCH); + cvtsi2ss(SCRATCH, eax); + // SCRATCH now contains (input - 0.5) rounded to an integer (assuming default round-to-nearest). + add(eax, 0x7f); + subss(SRC1, SCRATCH); + // SRC1 contains input - SCRATCH, the remainder of the input, which is in [0, 1]. 
+ mulss(SCRATCH2, SRC1); + shl(eax, 23); + movd(SCRATCH, eax); + // SCRATCH contains 2^(round(input)). + + // Complete computation of polynomial. + addss(SCRATCH2, xword[rip + c1]); + mulss(SCRATCH2, SRC1); + addss(SCRATCH2, xword[rip + c2]); + mulss(SCRATCH2, SRC1); + addss(SCRATCH2, xword[rip + c3]); + mulss(SRC1, SCRATCH2); + addss(SRC1, xword[rip + c4]); + mulss(SRC1, SCRATCH); + + // Duplicate result across vector + L(ret_label); + shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); + + ret(); + + return subroutine; +} } // namespace Shader diff --git a/src/video_core/shader/shader_jit_x64_compiler.h b/src/video_core/shader/shader_jit_x64_compiler.h index 4aee56b1d5..4e4123374e 100644 --- a/src/video_core/shader/shader_jit_x64_compiler.h +++ b/src/video_core/shader/shader_jit_x64_compiler.h @@ -106,6 +106,13 @@ private: */ void FindReturnOffsets(); + /** + * Emits data and code for utility functions. + */ + void CompilePrelude(); + Xbyak::Label CompilePrelude_Log2(); + Xbyak::Label CompilePrelude_Exp2(); + const std::array* program_code = nullptr; const std::array* swizzle_data = nullptr; @@ -120,6 +127,9 @@ private: using CompiledShader = void(const void* setup, void* state, const u8* start_addr); CompiledShader* program = nullptr; + + Xbyak::Label log2_subroutine; + Xbyak::Label exp2_subroutine; }; } // Shader