From 41929371dcc267a1f367dbfb458d22b9a3cce9e8 Mon Sep 17 00:00:00 2001 From: Dwayne Slater Date: Tue, 2 Jan 2018 18:32:33 -0500 Subject: [PATCH] Optimize AttributeBuffer to OutputVertex conversion (#3283) Optimize AttributeBuffer to OutputVertex conversion First I unrolled the inner loop, then I pushed semantics validation outside of the hotloop. I also added overflow slots to avoid conditional branches. Super Mario 3D Land's intro runs at almost full speed when compiled with Clang, and theres a noticible speed increase in MSVC. GCC hasn't been tested but I'm confident in its ability to optimize this code. --- src/video_core/command_processor.cpp | 3 ++ src/video_core/regs_rasterizer.h | 2 ++ src/video_core/shader/shader.cpp | 46 +++++++++++++++++----------- src/video_core/shader/shader.h | 1 + 4 files changed, 34 insertions(+), 18 deletions(-) diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index c34a277dbe..3ccd1297e1 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -221,6 +221,8 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { MICROPROFILE_SCOPE(GPU_Drawing); immediate_attribute_id = 0; + Shader::OutputVertex::ValidateSemantics(regs.rasterizer); + auto* shader_engine = Shader::GetEngine(); shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset); @@ -289,6 +291,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { // Later, these can be compiled and cached. const u32 base_address = regs.pipeline.vertex_attributes.GetPhysicalBaseAddress(); VertexLoader loader(regs.pipeline); + Shader::OutputVertex::ValidateSemantics(regs.rasterizer); // Load vertices bool is_indexed = (id == PICA_REG_INDEX(pipeline.trigger_draw_indexed)); diff --git a/src/video_core/regs_rasterizer.h b/src/video_core/regs_rasterizer.h index 4fef00d763..9f68494ce6 100644 --- a/src/video_core/regs_rasterizer.h +++ b/src/video_core/regs_rasterizer.h @@ -87,6 +87,8 @@ struct RasterizerRegs { BitField<8, 5, Semantic> map_y; BitField<16, 5, Semantic> map_z; BitField<24, 5, Semantic> map_w; + + u32 raw; } vs_output_attributes[7]; INSERT_PADDING_WORDS(0xe); diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp index 2d0ffe8216..194c9df119 100644 --- a/src/video_core/shader/shader.cpp +++ b/src/video_core/shader/shader.cpp @@ -2,6 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include #include #include #include "common/bit_set.h" @@ -21,32 +22,41 @@ namespace Pica { namespace Shader { +void OutputVertex::ValidateSemantics(const RasterizerRegs& regs) { + unsigned int num_attributes = regs.vs_output_total; + ASSERT(num_attributes <= 7); + for (size_t attrib = 0; attrib < num_attributes; ++attrib) { + u32 output_register_map = regs.vs_output_attributes[attrib].raw; + for (size_t comp = 0; comp < 4; ++comp) { + u32 semantic = (output_register_map >> (8 * comp)) & 0x1F; + ASSERT_MSG(semantic < 24 || semantic == RasterizerRegs::VSOutputAttributes::INVALID, + "Invalid/unknown semantic id: %" PRIu32, semantic); + } + } +} + OutputVertex OutputVertex::FromAttributeBuffer(const RasterizerRegs& regs, const AttributeBuffer& input) { // Setup output data union { OutputVertex ret{}; - std::array vertex_slots; + // Allow us to overflow OutputVertex to avoid branches, since + // RasterizerRegs::VSOutputAttributes::INVALID would write to slot 31, which + // would be out of bounds otherwise. + std::array vertex_slots_overflow; }; - static_assert(sizeof(vertex_slots) == sizeof(ret), "Struct and array have different sizes."); - unsigned int num_attributes = regs.vs_output_total; - ASSERT(num_attributes <= 7); - for (unsigned int i = 0; i < num_attributes; ++i) { - const auto& output_register_map = regs.vs_output_attributes[i]; + // Assert that OutputVertex has enough space for 24 semantic registers + static_assert(sizeof(std::array) == sizeof(ret), + "Struct and array have different sizes."); - RasterizerRegs::VSOutputAttributes::Semantic semantics[4] = { - output_register_map.map_x, output_register_map.map_y, output_register_map.map_z, - output_register_map.map_w}; - - for (unsigned comp = 0; comp < 4; ++comp) { - RasterizerRegs::VSOutputAttributes::Semantic semantic = semantics[comp]; - if (semantic < vertex_slots.size()) { - vertex_slots[semantic] = input.attr[i][comp]; - } else if (semantic != RasterizerRegs::VSOutputAttributes::INVALID) { - LOG_ERROR(HW_GPU, "Invalid/unknown semantic id: %u", (unsigned int)semantic); - } - } + unsigned int num_attributes = regs.vs_output_total & 7; + for (size_t attrib = 0; attrib < num_attributes; ++attrib) { + const auto output_register_map = regs.vs_output_attributes[attrib]; + vertex_slots_overflow[output_register_map.map_x] = input.attr[attrib][0]; + vertex_slots_overflow[output_register_map.map_y] = input.attr[attrib][1]; + vertex_slots_overflow[output_register_map.map_z] = input.attr[attrib][2]; + vertex_slots_overflow[output_register_map.map_w] = input.attr[attrib][3]; } // The hardware takes the absolute and saturates vertex colors like this, *before* doing diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h index 8740a16189..21ac010649 100644 --- a/src/video_core/shader/shader.h +++ b/src/video_core/shader/shader.h @@ -50,6 +50,7 @@ struct OutputVertex { INSERT_PADDING_WORDS(1); Math::Vec2 tc2; + static void ValidateSemantics(const RasterizerRegs& regs); static OutputVertex FromAttributeBuffer(const RasterizerRegs& regs, const AttributeBuffer& output); };