From 62ecdfe82cb33f5d0d3394c07bac2c7be97ff84b Mon Sep 17 00:00:00 2001 From: Shawn Xu Date: Sun, 12 Jan 2025 12:53:08 -0800 Subject: [PATCH] simplify accumulator updates After #5759 accumulator updates are strictly on a per-move basis. Therefore, the generic code for updating multiple moves at once is no longer needed. Passed Non-regression STC: LLR: 3.00 (-2.94,2.94) <-1.75,0.25> Total: 81696 W: 21204 L: 21039 D: 39453 Ptnml(0-2): 210, 8431, 23416, 8566, 225 https://tests.stockfishchess.org/tests/view/67823a24a31c4c13e83518a8 closes https://github.com/official-stockfish/Stockfish/pull/5760 no functional change --- src/nnue/nnue_feature_transformer.h | 193 ++++++++++++---------------- 1 file changed, 81 insertions(+), 112 deletions(-) diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 14fdecd7..7a37cda8 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -472,25 +472,20 @@ class FeatureTransformer { return st; } - // Computes the accumulator of the next position. + // Given a computed accumulator, computes the accumulator of the next position. template void update_accumulator_incremental(const Position& pos, StateInfo* computed) const { assert((computed->*accPtr).computed[Perspective]); assert(computed->next != nullptr); -#ifdef VECTOR - // Gcc-10.2 unnecessarily spills AVX2 registers if this array - // is defined in the VECTOR code below, once in each branch. - vec_t acc[Tiling::NumRegs]; - psqt_vec_t psqt[Tiling::NumPsqtRegs]; -#endif - const Square ksq = pos.square(Perspective); // The size must be enough to contain the largest possible update. // That might depend on the feature set and generally relies on the // feature set's update cost calculation to be correct and never allow // updates with more added/removed features than MaxActiveDimensions. + // In this case, the maximum size of both feature addition and removal + // is 2, since we are incrementally updating one move at a time. FeatureSet::IndexList removed, added; FeatureSet::append_changed_indices(ksq, computed->next->dirtyPiece, removed, added); @@ -498,51 +493,76 @@ class FeatureTransformer { StateInfo* next = computed->next; assert(!(next->*accPtr).computed[Perspective]); -#ifdef VECTOR - if ((removed.size() == 1 || removed.size() == 2) && added.size() == 1) + if (removed.size() == 0 && added.size() == 0) { + std::memcpy((next->*accPtr).accumulation[Perspective], + (computed->*accPtr).accumulation[Perspective], + HalfDimensions * sizeof(BiasType)); + std::memcpy((next->*accPtr).psqtAccumulation[Perspective], + (computed->*accPtr).psqtAccumulation[Perspective], + PSQTBuckets * sizeof(PSQTWeightType)); + } + else + { + assert(added.size() == 1 || added.size() == 2); + assert(removed.size() == 1 || removed.size() == 2); + assert(added.size() <= removed.size()); + +#ifdef VECTOR auto* accIn = reinterpret_cast(&(computed->*accPtr).accumulation[Perspective][0]); auto* accOut = reinterpret_cast(&(next->*accPtr).accumulation[Perspective][0]); + const IndexType offsetA0 = HalfDimensions * added[0]; + auto* columnA0 = reinterpret_cast(&weights[offsetA0]); const IndexType offsetR0 = HalfDimensions * removed[0]; auto* columnR0 = reinterpret_cast(&weights[offsetR0]); - const IndexType offsetA = HalfDimensions * added[0]; - auto* columnA = reinterpret_cast(&weights[offsetA]); if (removed.size() == 1) { for (IndexType i = 0; i < HalfDimensions * sizeof(WeightType) / sizeof(vec_t); ++i) - accOut[i] = vec_add_16(vec_sub_16(accIn[i], columnR0[i]), columnA[i]); + accOut[i] = vec_add_16(vec_sub_16(accIn[i], columnR0[i]), columnA0[i]); } - else + else if (added.size() == 1) { const IndexType offsetR1 = HalfDimensions * removed[1]; auto* columnR1 = reinterpret_cast(&weights[offsetR1]); for (IndexType i = 0; i < HalfDimensions * sizeof(WeightType) / sizeof(vec_t); ++i) - accOut[i] = vec_sub_16(vec_add_16(accIn[i], columnA[i]), + accOut[i] = vec_sub_16(vec_add_16(accIn[i], columnA0[i]), vec_add_16(columnR0[i], columnR1[i])); } + else + { + const IndexType offsetA1 = HalfDimensions * added[1]; + auto* columnA1 = reinterpret_cast(&weights[offsetA1]); + const IndexType offsetR1 = HalfDimensions * removed[1]; + auto* columnR1 = reinterpret_cast(&weights[offsetR1]); + + for (IndexType i = 0; i < HalfDimensions * sizeof(WeightType) / sizeof(vec_t); ++i) + accOut[i] = + vec_add_16(accIn[i], vec_sub_16(vec_add_16(columnA0[i], columnA1[i]), + vec_add_16(columnR0[i], columnR1[i]))); + } auto* accPsqtIn = reinterpret_cast( &(computed->*accPtr).psqtAccumulation[Perspective][0]); auto* accPsqtOut = reinterpret_cast(&(next->*accPtr).psqtAccumulation[Perspective][0]); + const IndexType offsetPsqtA0 = PSQTBuckets * added[0]; + auto* columnPsqtA0 = reinterpret_cast(&psqtWeights[offsetPsqtA0]); const IndexType offsetPsqtR0 = PSQTBuckets * removed[0]; auto* columnPsqtR0 = reinterpret_cast(&psqtWeights[offsetPsqtR0]); - const IndexType offsetPsqtA = PSQTBuckets * added[0]; - auto* columnPsqtA = reinterpret_cast(&psqtWeights[offsetPsqtA]); if (removed.size() == 1) { for (std::size_t i = 0; i < PSQTBuckets * sizeof(PSQTWeightType) / sizeof(psqt_vec_t); ++i) accPsqtOut[i] = vec_add_psqt_32(vec_sub_psqt_32(accPsqtIn[i], columnPsqtR0[i]), - columnPsqtA[i]); + columnPsqtA0[i]); } - else + else if (added.size() == 1) { const IndexType offsetPsqtR1 = PSQTBuckets * removed[1]; auto* columnPsqtR1 = @@ -551,110 +571,58 @@ class FeatureTransformer { for (std::size_t i = 0; i < PSQTBuckets * sizeof(PSQTWeightType) / sizeof(psqt_vec_t); ++i) accPsqtOut[i] = - vec_sub_psqt_32(vec_add_psqt_32(accPsqtIn[i], columnPsqtA[i]), + vec_sub_psqt_32(vec_add_psqt_32(accPsqtIn[i], columnPsqtA0[i]), vec_add_psqt_32(columnPsqtR0[i], columnPsqtR1[i])); } - } - else - { - for (IndexType i = 0; i < HalfDimensions / Tiling::TileHeight; ++i) + else { - // Load accumulator - auto* accTileIn = reinterpret_cast( - &(computed->*accPtr).accumulation[Perspective][i * Tiling::TileHeight]); - for (IndexType j = 0; j < Tiling::NumRegs; ++j) - acc[j] = vec_load(&accTileIn[j]); + const IndexType offsetPsqtA1 = PSQTBuckets * added[1]; + auto* columnPsqtA1 = + reinterpret_cast(&psqtWeights[offsetPsqtA1]); + const IndexType offsetPsqtR1 = PSQTBuckets * removed[1]; + auto* columnPsqtR1 = + reinterpret_cast(&psqtWeights[offsetPsqtR1]); - // Difference calculation for the deactivated features - for (const auto index : removed) - { - const IndexType offset = HalfDimensions * index + i * Tiling::TileHeight; - auto* column = reinterpret_cast(&weights[offset]); - for (IndexType j = 0; j < Tiling::NumRegs; ++j) - acc[j] = vec_sub_16(acc[j], column[j]); - } - - // Difference calculation for the activated features - for (const auto index : added) - { - const IndexType offset = HalfDimensions * index + i * Tiling::TileHeight; - auto* column = reinterpret_cast(&weights[offset]); - for (IndexType j = 0; j < Tiling::NumRegs; ++j) - acc[j] = vec_add_16(acc[j], column[j]); - } - - // Store accumulator - auto* accTileOut = reinterpret_cast( - &(next->*accPtr).accumulation[Perspective][i * Tiling::TileHeight]); - for (IndexType j = 0; j < Tiling::NumRegs; ++j) - vec_store(&accTileOut[j], acc[j]); + for (std::size_t i = 0; + i < PSQTBuckets * sizeof(PSQTWeightType) / sizeof(psqt_vec_t); ++i) + accPsqtOut[i] = vec_add_psqt_32( + accPsqtIn[i], + vec_sub_psqt_32(vec_add_psqt_32(columnPsqtA0[i], columnPsqtA1[i]), + vec_add_psqt_32(columnPsqtR0[i], columnPsqtR1[i]))); } - - for (IndexType i = 0; i < PSQTBuckets / Tiling::PsqtTileHeight; ++i) - { - // Load accumulator - auto* accTilePsqtIn = reinterpret_cast( - &(computed->*accPtr).psqtAccumulation[Perspective][i * Tiling::PsqtTileHeight]); - for (std::size_t j = 0; j < Tiling::NumPsqtRegs; ++j) - psqt[j] = vec_load_psqt(&accTilePsqtIn[j]); - - // Difference calculation for the deactivated features - for (const auto index : removed) - { - const IndexType offset = PSQTBuckets * index + i * Tiling::PsqtTileHeight; - auto* columnPsqt = reinterpret_cast(&psqtWeights[offset]); - for (std::size_t j = 0; j < Tiling::NumPsqtRegs; ++j) - psqt[j] = vec_sub_psqt_32(psqt[j], columnPsqt[j]); - } - - // Difference calculation for the activated features - for (const auto index : added) - { - const IndexType offset = PSQTBuckets * index + i * Tiling::PsqtTileHeight; - auto* columnPsqt = reinterpret_cast(&psqtWeights[offset]); - for (std::size_t j = 0; j < Tiling::NumPsqtRegs; ++j) - psqt[j] = vec_add_psqt_32(psqt[j], columnPsqt[j]); - } - - // Store accumulator - auto* accTilePsqtOut = reinterpret_cast( - &(next->*accPtr).psqtAccumulation[Perspective][i * Tiling::PsqtTileHeight]); - for (std::size_t j = 0; j < Tiling::NumPsqtRegs; ++j) - vec_store_psqt(&accTilePsqtOut[j], psqt[j]); - } - } #else - std::memcpy((next->*accPtr).accumulation[Perspective], - (computed->*accPtr).accumulation[Perspective], - HalfDimensions * sizeof(BiasType)); - std::memcpy((next->*accPtr).psqtAccumulation[Perspective], - (computed->*accPtr).psqtAccumulation[Perspective], - PSQTBuckets * sizeof(PSQTWeightType)); + std::memcpy((next->*accPtr).accumulation[Perspective], + (computed->*accPtr).accumulation[Perspective], + HalfDimensions * sizeof(BiasType)); + std::memcpy((next->*accPtr).psqtAccumulation[Perspective], + (computed->*accPtr).psqtAccumulation[Perspective], + PSQTBuckets * sizeof(PSQTWeightType)); - // Difference calculation for the deactivated features - for (const auto index : removed) - { - const IndexType offset = HalfDimensions * index; - for (IndexType i = 0; i < HalfDimensions; ++i) - (next->*accPtr).accumulation[Perspective][i] -= weights[offset + i]; + // Difference calculation for the deactivated features + for (const auto index : removed) + { + const IndexType offset = HalfDimensions * index; + for (IndexType i = 0; i < HalfDimensions; ++i) + (next->*accPtr).accumulation[Perspective][i] -= weights[offset + i]; - for (std::size_t i = 0; i < PSQTBuckets; ++i) - (next->*accPtr).psqtAccumulation[Perspective][i] -= - psqtWeights[index * PSQTBuckets + i]; - } + for (std::size_t i = 0; i < PSQTBuckets; ++i) + (next->*accPtr).psqtAccumulation[Perspective][i] -= + psqtWeights[index * PSQTBuckets + i]; + } - // Difference calculation for the activated features - for (const auto index : added) - { - const IndexType offset = HalfDimensions * index; - for (IndexType i = 0; i < HalfDimensions; ++i) - (next->*accPtr).accumulation[Perspective][i] += weights[offset + i]; + // Difference calculation for the activated features + for (const auto index : added) + { + const IndexType offset = HalfDimensions * index; + for (IndexType i = 0; i < HalfDimensions; ++i) + (next->*accPtr).accumulation[Perspective][i] += weights[offset + i]; - for (std::size_t i = 0; i < PSQTBuckets; ++i) - (next->*accPtr).psqtAccumulation[Perspective][i] += - psqtWeights[index * PSQTBuckets + i]; - } + for (std::size_t i = 0; i < PSQTBuckets; ++i) + (next->*accPtr).psqtAccumulation[Perspective][i] += + psqtWeights[index * PSQTBuckets + i]; + } #endif + } (next->*accPtr).computed[Perspective] = true; @@ -662,6 +630,7 @@ class FeatureTransformer { update_accumulator_incremental(pos, next); } + template void update_accumulator_refresh_cache(const Position& pos, AccumulatorCaches::Cache* cache) const {