From e608eab8dd9f7bd68f192d56d742f621674b8fa8 Mon Sep 17 00:00:00 2001 From: mstembera Date: Sun, 12 May 2024 04:45:01 -0700 Subject: [PATCH] Optimize update_accumulator_refresh_cache() STC https://tests.stockfishchess.org/tests/view/664105df26ac5f9b286d30e6 LLR: 2.94 (-2.94,2.94) <0.00,2.00> Total: 178528 W: 46235 L: 45750 D: 86543 Ptnml(0-2): 505, 17792, 52142, 18363, 462 Combo of two yellow speedups https://tests.stockfishchess.org/tests/view/6640abf9d163897c63214f5c LLR: -2.93 (-2.94,2.94) <0.00,2.00> Total: 355744 W: 91714 L: 91470 D: 172560 Ptnml(0-2): 913, 36233, 103384, 36381, 961 https://tests.stockfishchess.org/tests/view/6628ce073fe04ce4cefc739c LLR: -2.93 (-2.94,2.94) <0.00,2.00> Total: 627040 W: 162001 L: 161339 D: 303700 Ptnml(0-2): 2268, 72379, 163532, 73105, 2236 closes https://github.com/official-stockfish/Stockfish/pull/5239 No functional change --- src/nnue/nnue_feature_transformer.h | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 2b11adef..bcd14e6f 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -664,7 +664,11 @@ class FeatureTransformer { for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j) { - auto entryTile = reinterpret_cast(&entry.accumulation[j * TileHeight]); + auto accTile = + reinterpret_cast(&accumulator.accumulation[Perspective][j * TileHeight]); + auto entryTile = + reinterpret_cast(&entry.accumulation[j * TileHeight]); + for (IndexType k = 0; k < NumRegs; ++k) acc[k] = entryTile[k]; @@ -679,7 +683,7 @@ class FeatureTransformer { auto columnA = reinterpret_cast(&weights[offsetA]); for (unsigned k = 0; k < NumRegs; ++k) - acc[k] = vec_add_16(vec_sub_16(acc[k], columnR[k]), columnA[k]); + acc[k] = vec_add_16(acc[k], vec_sub_16(columnA[k], columnR[k])); } for (; i < int(removed.size()); ++i) { @@ -702,12 +706,17 @@ class FeatureTransformer { for (IndexType k = 0; k < NumRegs; k++) vec_store(&entryTile[k], acc[k]); + for (IndexType k = 0; k < NumRegs; k++) + vec_store(&accTile[k], acc[k]); } for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j) { - auto entryTilePsqt = - reinterpret_cast(&entry.psqtAccumulation[j * PsqtTileHeight]); + auto accTilePsqt = reinterpret_cast( + &accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]); + auto entryTilePsqt = reinterpret_cast( + &entry.psqtAccumulation[j * PsqtTileHeight]); + for (std::size_t k = 0; k < NumPsqtRegs; ++k) psqt[k] = entryTilePsqt[k]; @@ -732,6 +741,8 @@ class FeatureTransformer { for (std::size_t k = 0; k < NumPsqtRegs; ++k) vec_store_psqt(&entryTilePsqt[k], psqt[k]); + for (std::size_t k = 0; k < NumPsqtRegs; ++k) + vec_store_psqt(&accTilePsqt[k], psqt[k]); } #else @@ -755,8 +766,6 @@ class FeatureTransformer { entry.psqtAccumulation[k] += psqtWeights[index * PSQTBuckets + k]; } -#endif - // The accumulator of the refresh entry has been updated. // Now copy its content to the actual accumulator we were refreshing @@ -765,6 +774,7 @@ class FeatureTransformer { std::memcpy(accumulator.psqtAccumulation[Perspective], entry.psqtAccumulation, sizeof(int32_t) * PSQTBuckets); +#endif for (Color c : {WHITE, BLACK}) entry.byColorBB[c] = pos.pieces(c);