From c2ff7a95c3ffee1c964735a0a4bd0e34cf76cab8 Mon Sep 17 00:00:00 2001 From: Shawn Xu Date: Fri, 21 Mar 2025 11:24:11 -0700 Subject: [PATCH] Cleanup fused updates Passed Non-regression STC: LLR: 2.95 (-2.94,2.94) <-1.75,0.25> Total: 70656 W: 18257 L: 18077 D: 34322 Ptnml(0-2): 217, 7912, 18879, 8114, 206 https://tests.stockfishchess.org/tests/view/67e23ae78888403457d876d4 closes https://github.com/official-stockfish/Stockfish/pull/5941 No functional change --- src/nnue/nnue_accumulator.cpp | 280 +++++++++++----------------- src/nnue/nnue_common.h | 2 +- src/nnue/nnue_feature_transformer.h | 54 ++++++ src/types.h | 9 + 4 files changed, 169 insertions(+), 176 deletions(-) diff --git a/src/nnue/nnue_accumulator.cpp b/src/nnue/nnue_accumulator.cpp index d693cc03..efa8df90 100644 --- a/src/nnue/nnue_accumulator.cpp +++ b/src/nnue/nnue_accumulator.cpp @@ -21,8 +21,10 @@ #include #include #include +#include #include "../bitboard.h" +#include "../misc.h" #include "../position.h" #include "../types.h" #include "network.h" @@ -185,7 +187,7 @@ void AccumulatorStack::backward_update_incremental( const Square ksq = pos.square(Perspective); for (std::size_t next = m_current_idx - 2; next >= end; next--) - update_accumulator_incremental( + update_accumulator_incremental( featureTransformer, ksq, m_accumulators[next], m_accumulators[next + 1]); assert((m_accumulators[end].*accPtr).computed[Perspective]); @@ -208,6 +210,67 @@ AccumulatorStack::evaluate, bool> = true> +void fused_row_reduce(const ElementType* in, ElementType* out, const Ts* const... rows) { + constexpr IndexType size = Width * sizeof(ElementType) / sizeof(typename VectorWrapper::type); + + auto* vecIn = reinterpret_cast(in); + auto* vecOut = reinterpret_cast(out); + + for (IndexType i = 0; i < size; ++i) + vecOut[i] = fused( + vecIn[i], reinterpret_cast(rows)[i]...); +} + +template AccumulatorState::*accPtr> +struct AccumulatorUpdateContext { + const FeatureTransformer& featureTransformer; + const AccumulatorState& from; + AccumulatorState& to; + + AccumulatorUpdateContext(const FeatureTransformer& ft, + const AccumulatorState& accF, + AccumulatorState& accT) noexcept : + featureTransformer{ft}, + from{accF}, + to{accT} {} + + template, bool> = true> + void apply(const Ts... indices) { + auto to_weight_vector = [&](const IndexType index) { + return &featureTransformer.weights[index * Dimensions]; + }; + + auto to_psqt_weight_vector = [&](const IndexType index) { + return &featureTransformer.psqtWeights[index * PSQTBuckets]; + }; + + fused_row_reduce((from.*accPtr).accumulation[Perspective], + (to.*accPtr).accumulation[Perspective], + to_weight_vector(indices)...); + + fused_row_reduce( + (from.*accPtr).psqtAccumulation[Perspective], (to.*accPtr).psqtAccumulation[Perspective], + to_psqt_weight_vector(indices)...); + } +}; + +template AccumulatorState::*accPtr> +auto make_accumulator_update_context( + const FeatureTransformer& featureTransformer, + const AccumulatorState& accumulatorFrom, + AccumulatorState& accumulatorTo) noexcept { + return AccumulatorUpdateContext{ + featureTransformer, accumulatorFrom, accumulatorTo}; +} + template(ksq, computed.dirtyPiece, added, removed); - if (removed.size() == 0 && added.size() == 0) + assert(added.size() == 1 || added.size() == 2); + assert(removed.size() == 1 || removed.size() == 2); + + if (Forward) + assert(added.size() <= removed.size()); + else + assert(removed.size() <= added.size()); + + // Workaround compiler warning for uninitialized variables, replicated on + // profile builds on windows with gcc 14.2.0. + // TODO remove once unneeded + sf_assume(added.size() == 1 || added.size() == 2); + sf_assume(removed.size() == 1 || removed.size() == 2); + + auto updateContext = + make_accumulator_update_context(featureTransformer, computed, target_state); + + if ((Forward && removed.size() == 1) || (Backward && added.size() == 1)) { - std::memcpy((target_state.*accPtr).accumulation[Perspective], - (computed.*accPtr).accumulation[Perspective], - TransformedFeatureDimensions * sizeof(BiasType)); - std::memcpy((target_state.*accPtr).psqtAccumulation[Perspective], - (computed.*accPtr).psqtAccumulation[Perspective], - PSQTBuckets * sizeof(PSQTWeightType)); + assert(added.size() == 1 && removed.size() == 1); + updateContext.template apply(added[0], removed[0]); + } + else if (Forward && added.size() == 1) + { + assert(removed.size() == 2); + updateContext.template apply(added[0], removed[0], removed[1]); + } + else if (Backward && removed.size() == 1) + { + assert(added.size() == 2); + updateContext.template apply(added[0], added[1], removed[0]); } else { - assert(added.size() == 1 || added.size() == 2); - assert(removed.size() == 1 || removed.size() == 2); - - if (Forward) - assert(added.size() <= removed.size()); - else - assert(removed.size() <= added.size()); - - // Workaround compiler warning for uninitialized variables, replicated on - // profile builds on windows with gcc 14.2.0. - // TODO remove once unneeded - sf_assume(added.size() == 1 || added.size() == 2); - sf_assume(removed.size() == 1 || removed.size() == 2); - -#ifdef VECTOR - auto* accIn = - reinterpret_cast(&(computed.*accPtr).accumulation[Perspective][0]); - auto* accOut = - reinterpret_cast(&(target_state.*accPtr).accumulation[Perspective][0]); - - const IndexType offsetA0 = TransformedFeatureDimensions * added[0]; - auto* columnA0 = reinterpret_cast(&featureTransformer.weights[offsetA0]); - const IndexType offsetR0 = TransformedFeatureDimensions * removed[0]; - auto* columnR0 = reinterpret_cast(&featureTransformer.weights[offsetR0]); - - if ((Forward && removed.size() == 1) || (Backwards && added.size() == 1)) - { - assert(added.size() == 1 && removed.size() == 1); - for (IndexType i = 0; - i < TransformedFeatureDimensions * sizeof(WeightType) / sizeof(vec_t); ++i) - accOut[i] = vec_add_16(vec_sub_16(accIn[i], columnR0[i]), columnA0[i]); - } - else if (Forward && added.size() == 1) - { - assert(removed.size() == 2); - const IndexType offsetR1 = TransformedFeatureDimensions * removed[1]; - auto* columnR1 = reinterpret_cast(&featureTransformer.weights[offsetR1]); - - for (IndexType i = 0; - i < TransformedFeatureDimensions * sizeof(WeightType) / sizeof(vec_t); ++i) - accOut[i] = vec_sub_16(vec_add_16(accIn[i], columnA0[i]), - vec_add_16(columnR0[i], columnR1[i])); - } - else if (Backwards && removed.size() == 1) - { - assert(added.size() == 2); - const IndexType offsetA1 = TransformedFeatureDimensions * added[1]; - auto* columnA1 = reinterpret_cast(&featureTransformer.weights[offsetA1]); - - for (IndexType i = 0; - i < TransformedFeatureDimensions * sizeof(WeightType) / sizeof(vec_t); ++i) - accOut[i] = vec_add_16(vec_add_16(accIn[i], columnA0[i]), - vec_sub_16(columnA1[i], columnR0[i])); - } - else - { - assert(added.size() == 2 && removed.size() == 2); - const IndexType offsetA1 = TransformedFeatureDimensions * added[1]; - auto* columnA1 = reinterpret_cast(&featureTransformer.weights[offsetA1]); - const IndexType offsetR1 = TransformedFeatureDimensions * removed[1]; - auto* columnR1 = reinterpret_cast(&featureTransformer.weights[offsetR1]); - - for (IndexType i = 0; - i < TransformedFeatureDimensions * sizeof(WeightType) / sizeof(vec_t); ++i) - accOut[i] = vec_add_16(accIn[i], vec_sub_16(vec_add_16(columnA0[i], columnA1[i]), - vec_add_16(columnR0[i], columnR1[i]))); - } - - auto* accPsqtIn = - reinterpret_cast(&(computed.*accPtr).psqtAccumulation[Perspective][0]); - auto* accPsqtOut = - reinterpret_cast(&(target_state.*accPtr).psqtAccumulation[Perspective][0]); - - const IndexType offsetPsqtA0 = PSQTBuckets * added[0]; - auto* columnPsqtA0 = - reinterpret_cast(&featureTransformer.psqtWeights[offsetPsqtA0]); - const IndexType offsetPsqtR0 = PSQTBuckets * removed[0]; - auto* columnPsqtR0 = - reinterpret_cast(&featureTransformer.psqtWeights[offsetPsqtR0]); - - if ((Forward && removed.size() == 1) - || (Backwards && added.size() == 1)) // added.size() == removed.size() == 1 - { - for (std::size_t i = 0; i < PSQTBuckets * sizeof(PSQTWeightType) / sizeof(psqt_vec_t); - ++i) - accPsqtOut[i] = - vec_add_psqt_32(vec_sub_psqt_32(accPsqtIn[i], columnPsqtR0[i]), columnPsqtA0[i]); - } - else if (Forward && added.size() == 1) - { - const IndexType offsetPsqtR1 = PSQTBuckets * removed[1]; - auto* columnPsqtR1 = - reinterpret_cast(&featureTransformer.psqtWeights[offsetPsqtR1]); - - for (std::size_t i = 0; i < PSQTBuckets * sizeof(PSQTWeightType) / sizeof(psqt_vec_t); - ++i) - accPsqtOut[i] = vec_sub_psqt_32(vec_add_psqt_32(accPsqtIn[i], columnPsqtA0[i]), - vec_add_psqt_32(columnPsqtR0[i], columnPsqtR1[i])); - } - else if (Backwards && removed.size() == 1) - { - const IndexType offsetPsqtA1 = PSQTBuckets * added[1]; - auto* columnPsqtA1 = - reinterpret_cast(&featureTransformer.psqtWeights[offsetPsqtA1]); - - for (std::size_t i = 0; i < PSQTBuckets * sizeof(PSQTWeightType) / sizeof(psqt_vec_t); - ++i) - accPsqtOut[i] = vec_add_psqt_32(vec_add_psqt_32(accPsqtIn[i], columnPsqtA0[i]), - vec_sub_psqt_32(columnPsqtA1[i], columnPsqtR0[i])); - } - else - { - const IndexType offsetPsqtA1 = PSQTBuckets * added[1]; - auto* columnPsqtA1 = - reinterpret_cast(&featureTransformer.psqtWeights[offsetPsqtA1]); - const IndexType offsetPsqtR1 = PSQTBuckets * removed[1]; - auto* columnPsqtR1 = - reinterpret_cast(&featureTransformer.psqtWeights[offsetPsqtR1]); - - for (std::size_t i = 0; i < PSQTBuckets * sizeof(PSQTWeightType) / sizeof(psqt_vec_t); - ++i) - accPsqtOut[i] = vec_add_psqt_32( - accPsqtIn[i], vec_sub_psqt_32(vec_add_psqt_32(columnPsqtA0[i], columnPsqtA1[i]), - vec_add_psqt_32(columnPsqtR0[i], columnPsqtR1[i]))); - } -#else - std::memcpy((target_state.*accPtr).accumulation[Perspective], - (computed.*accPtr).accumulation[Perspective], - TransformedFeatureDimensions * sizeof(BiasType)); - std::memcpy((target_state.*accPtr).psqtAccumulation[Perspective], - (computed.*accPtr).psqtAccumulation[Perspective], - PSQTBuckets * sizeof(PSQTWeightType)); - - // Difference calculation for the deactivated features - for (const auto index : removed) - { - const IndexType offset = TransformedFeatureDimensions * index; - for (IndexType i = 0; i < TransformedFeatureDimensions; ++i) - (target_state.*accPtr).accumulation[Perspective][i] -= - featureTransformer.weights[offset + i]; - - for (std::size_t i = 0; i < PSQTBuckets; ++i) - (target_state.*accPtr).psqtAccumulation[Perspective][i] -= - featureTransformer.psqtWeights[index * PSQTBuckets + i]; - } - - // Difference calculation for the activated features - for (const auto index : added) - { - const IndexType offset = TransformedFeatureDimensions * index; - for (IndexType i = 0; i < TransformedFeatureDimensions; ++i) - (target_state.*accPtr).accumulation[Perspective][i] += - featureTransformer.weights[offset + i]; - - for (std::size_t i = 0; i < PSQTBuckets; ++i) - (target_state.*accPtr).psqtAccumulation[Perspective][i] += - featureTransformer.psqtWeights[index * PSQTBuckets + i]; - } -#endif + assert(added.size() == 2 && removed.size() == 2); + updateContext.template apply(added[0], added[1], removed[0], + removed[1]); } (target_state.*accPtr).computed[Perspective] = true; @@ -477,7 +407,7 @@ void update_accumulator_refresh_cache( auto* columnA = reinterpret_cast(&featureTransformer.weights[offsetA]); for (IndexType k = 0; k < Tiling::NumRegs; ++k) - acc[k] = vec_add_16(acc[k], vec_sub_16(columnA[k], columnR[k])); + acc[k] = fused(acc[k], columnA[k], columnR[k]); } if (combineLast3) { @@ -496,8 +426,8 @@ void update_accumulator_refresh_cache( reinterpret_cast(&featureTransformer.weights[offsetR2]); for (IndexType k = 0; k < Tiling::NumRegs; ++k) - acc[k] = vec_sub_16(vec_add_16(acc[k], columnA[k]), - vec_add_16(columnR[k], columnR2[k])); + acc[k] = fused(acc[k], columnA[k], columnR[k], + columnR2[k]); } else { @@ -507,8 +437,8 @@ void update_accumulator_refresh_cache( reinterpret_cast(&featureTransformer.weights[offsetA2]); for (IndexType k = 0; k < Tiling::NumRegs; ++k) - acc[k] = vec_add_16(vec_sub_16(acc[k], columnR[k]), - vec_add_16(columnA[k], columnA2[k])); + acc[k] = fused(acc[k], columnA[k], columnA2[k], + columnR[k]); } } else diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h index b217c358..e6e3017d 100644 --- a/src/nnue/nnue_common.h +++ b/src/nnue/nnue_common.h @@ -281,7 +281,7 @@ inline void write_leb_128(std::ostream& stream, const IntType* values, std::size enum IncUpdateDirection { FORWARD, - BACKWARDS + BACKWARD }; } // namespace Stockfish::Eval::NNUE diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 20e85be3..9dee29c1 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -143,6 +143,60 @@ using psqt_vec_t = int32x4_t; #endif +struct Vec16Wrapper { +#ifdef VECTOR + using type = vec_t; + static type add(const type& lhs, const type& rhs) { return vec_add_16(lhs, rhs); } + static type sub(const type& lhs, const type& rhs) { return vec_sub_16(lhs, rhs); } +#else + using type = BiasType; + static type add(const type& lhs, const type& rhs) { return lhs + rhs; } + static type sub(const type& lhs, const type& rhs) { return lhs - rhs; } +#endif +}; + +struct Vec32Wrapper { +#ifdef VECTOR + using type = psqt_vec_t; + static type add(const type& lhs, const type& rhs) { return vec_add_psqt_32(lhs, rhs); } + static type sub(const type& lhs, const type& rhs) { return vec_sub_psqt_32(lhs, rhs); } +#else + using type = PSQTWeightType; + static type add(const type& lhs, const type& rhs) { return lhs + rhs; } + static type sub(const type& lhs, const type& rhs) { return lhs - rhs; } +#endif +}; + +enum UpdateOperation { + Add, + Sub +}; + +template = true> +typename VecWrapper::type fused(const typename VecWrapper::type& in) { + return in; +} + +template, bool> = true, + std::enable_if_t = true> +typename VecWrapper::type +fused(const typename VecWrapper::type& in, const T& operand, const Ts&... operands) { + switch (update_op) + { + case Add : + return fused(VecWrapper::add(in, operand), operands...); + case Sub : + return fused(VecWrapper::sub(in, operand), operands...); + } +} + // Returns the inverse of a permutation template constexpr std::array diff --git a/src/types.h b/src/types.h index 6465dfd6..d6af929e 100644 --- a/src/types.h +++ b/src/types.h @@ -38,6 +38,7 @@ #include #include + #include #if defined(_MSC_VER) // Disable some silly and noisy warnings from MSVC compiler @@ -429,6 +430,14 @@ class Move { std::uint16_t data; }; +template +struct is_all_same { + static constexpr bool value = (std::is_same_v && ...); +}; + +template +constexpr auto is_all_same_v = is_all_same::value; + } // namespace Stockfish #endif // #ifndef TYPES_H_INCLUDED