From 9382f854b3a67c5a970ad3342a3c12454974eccd Mon Sep 17 00:00:00 2001 From: Joost VandeVondele Date: Wed, 30 Sep 2020 21:22:36 +0200 Subject: [PATCH 01/27] Schedule threads fairly under valgrind fixes a rare case that can cause CI to fail when running multithreaded under valgrind. closes https://github.com/official-stockfish/Stockfish/pull/3165 No functional change. --- tests/instrumented.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/instrumented.sh b/tests/instrumented.sh index 03ded74a..03e9c9de 100755 --- a/tests/instrumented.sh +++ b/tests/instrumented.sh @@ -20,7 +20,7 @@ case $1 in --valgrind-thread) echo "valgrind-thread testing started" prefix='' - exeprefix='valgrind --error-exitcode=42' + exeprefix='valgrind --fair-sched=try --error-exitcode=42' postfix='1>/dev/null' threads="2" ;; From 17fb3a8ce0ccd2532f667fe685c4189d0bfe3b5b Mon Sep 17 00:00:00 2001 From: Unai Corzo Date: Fri, 2 Oct 2020 22:00:55 +0200 Subject: [PATCH 02/27] Simplify away futility pruning for captures Remove futility pruning for captures. STC https://tests.stockfishchess.org/tests/view/5f749bfed930428c36d34c56 LLR: 2.94 (-2.94,2.94) {-1.25,0.25} Total: 38064 W: 4011 L: 3929 D: 30124 Ptnml(0-2): 192, 3004, 12567, 3068, 201 LTC https://tests.stockfishchess.org/tests/view/5f74d99bf18675b1ce2f7412 LLR: 2.94 (-2.94,2.94) {-0.75,0.25} Total: 184984 W: 8567 L: 8610 D: 167807 Ptnml(0-2): 146, 7593, 77058, 7548, 147 closes https://github.com/official-stockfish/Stockfish/pull/3166 bench: 3890648 --- src/search.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/search.cpp b/src/search.cpp index c7343ce8..eaa79fb9 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -1049,15 +1049,6 @@ moves_loop: // When in check, search starts from here && captureHistory[movedPiece][to_sq(move)][type_of(pos.piece_on(to_sq(move)))] < 0) continue; - // Futility pruning for captures - if ( !givesCheck - && lmrDepth < 6 - && !(PvNode && abs(bestValue) < 2) - && !ss->inCheck - && ss->staticEval + 169 + 244 * lmrDepth - + PieceValue[MG][type_of(pos.piece_on(to_sq(move)))] <= alpha) - continue; - // See based pruning if (!pos.see_ge(move, Value(-221) * depth)) // (~25 Elo) continue; From 767b4f4fbe5ab2e63aceabd9005f4e1eb7cbcb51 Mon Sep 17 00:00:00 2001 From: FauziAkram Date: Fri, 2 Oct 2020 15:32:19 +0300 Subject: [PATCH 03/27] Pawn Tuning Tuning of pawns, for classical evaluation: Passed STC: https://tests.stockfishchess.org/tests/view/5f771f0e52560f5fc78559ec LLR: 2.96 (-2.94,2.94) {-0.25,1.25} Total: 252696 W: 50321 L: 49692 D: 152683 Ptnml(0-2): 4614, 29845, 57049, 29978, 4862 Passed LTC: https://tests.stockfishchess.org/tests/view/5f77cfef090dcf9aaa16d38b LLR: 2.94 (-2.94,2.94) {0.25,1.25} Total: 48184 W: 6556 L: 6193 D: 35435 Ptnml(0-2): 335, 4516, 14100, 4733, 408 closes https://github.com/official-stockfish/Stockfish/pull/3169 bench: 4016121 --- src/pawns.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/pawns.cpp b/src/pawns.cpp index af0f6618..a5102db8 100644 --- a/src/pawns.cpp +++ b/src/pawns.cpp @@ -30,21 +30,21 @@ namespace { #define S(mg, eg) make_score(mg, eg) // Pawn penalties - constexpr Score Backward = S( 8, 27); - constexpr Score Doubled = S(11, 55); - constexpr Score Isolated = S( 5, 17); - constexpr Score WeakLever = S( 2, 54); - constexpr Score WeakUnopposed = S(15, 25); + constexpr Score Backward = S( 8, 25); + constexpr Score Doubled = S(10, 55); + constexpr Score Isolated = S( 3, 15); + constexpr Score WeakLever = S( 3, 55); + 
constexpr Score WeakUnopposed = S(13, 25); // Bonus for blocked pawns at 5th or 6th rank - constexpr Score BlockedPawn[2] = { S(-13, -4), S(-4, 3) }; + constexpr Score BlockedPawn[2] = { S(-13, -4), S(-5, 2) }; constexpr Score BlockedStorm[RANK_NB] = { S(0, 0), S(0, 0), S(76, 78), S(-10, 15), S(-7, 10), S(-4, 6), S(-1, 2) }; // Connected pawn bonus - constexpr int Connected[RANK_NB] = { 0, 7, 8, 11, 24, 45, 85 }; + constexpr int Connected[RANK_NB] = { 0, 5, 7, 11, 24, 48, 86 }; // Strength of pawn shelter for our king by [distance from edge][rank]. // RANK_1 = 0 is used for files where we have no pawn, or pawn is behind our king. @@ -147,7 +147,7 @@ namespace { if (support | phalanx) { int v = Connected[r] * (2 + bool(phalanx) - bool(opposed)) - + 21 * popcount(support); + + 22 * popcount(support); score += make_score(v, v * (r - 2) / 4); } From ba73f8ce0d545a0f627b5bc8ba274ae9c85918f3 Mon Sep 17 00:00:00 2001 From: Joost VandeVondele Date: Wed, 14 Oct 2020 10:23:30 +0200 Subject: [PATCH 04/27] Update default net to nn-04cf2b4ed1da.nnue Further tune the net parameters, now the last but one layer (32x32). To limit the number of parameters optimized, the network layer was decomposed using SVD, and the singular values were treated as parameters and tuned. Tuning branch: https://github.com/vondele/Stockfish/tree/svdTune Tuner: https://github.com/vondele/nevergrad4sf passed STC: https://tests.stockfishchess.org/tests/view/5f83e82f8ea73fb8ddf83e4e LLR: 2.94 (-2.94,2.94) {-0.25,1.25} Total: 8488 W: 944 L: 795 D: 6749 Ptnml(0-2): 39, 609, 2811, 734, 51 passed LTC: https://tests.stockfishchess.org/tests/view/5f83f4118ea73fb8ddf83e66 LLR: 2.94 (-2.94,2.94) {0.25,1.25} Total: 169016 W: 8043 L: 7589 D: 153384 Ptnml(0-2): 133, 6623, 70538, 7085, 129 closes https://github.com/official-stockfish/Stockfish/pull/3181 Bench: 3945198 --- src/evaluate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/evaluate.h b/src/evaluate.h index 4b57a050..6a17f284 100644 --- a/src/evaluate.h +++ b/src/evaluate.h @@ -36,7 +36,7 @@ namespace Eval { // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue // for the build process (profile-build and fishtest) to work. Do not change the // name of the macro, as it is used in the Makefile. - #define EvalFileDefaultName "nn-baeb9ef2d183.nnue" + #define EvalFileDefaultName "nn-04cf2b4ed1da.nnue" namespace NNUE { From 4a5cc1365f48f7fff08d3184cadac7a0a75dda6d Mon Sep 17 00:00:00 2001 From: FauziAkram Date: Tue, 6 Oct 2020 22:43:48 +0300 Subject: [PATCH 05/27] RookOnQueenFile Removal Removing Rook On Queen File looks beneficial, and it might even bring some ELO. I will try to reintroduce it with a different method later on. 
Passed STC: https://tests.stockfishchess.org/tests/view/5f7cea204389873867eb10cb LLR: 2.94 (-2.94,2.94) {-1.25,0.25} Total: 18624 W: 3800 L: 3568 D: 11256 Ptnml(0-2): 308, 2131, 4257, 2253, 363 Passed LTC: https://tests.stockfishchess.org/tests/view/5f7d76a4e936c6892bf50598 LLR: 2.95 (-2.94,2.94) {-0.75,0.25} Total: 117864 W: 15515 L: 15340 D: 87009 Ptnml(0-2): 926, 11127, 34671, 11262, 946 closes https://github.com/official-stockfish/Stockfish/pull/3176 Bench: 3756191 --- src/evaluate.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/evaluate.cpp b/src/evaluate.cpp index 25e3bdc1..c68577a3 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -265,7 +265,6 @@ namespace { constexpr Score ReachableOutpost = S( 31, 22); constexpr Score RestrictedPiece = S( 7, 7); constexpr Score RookOnKingRing = S( 16, 0); - constexpr Score RookOnQueenFile = S( 6, 11); constexpr Score SliderOnQueen = S( 60, 18); constexpr Score ThreatByKing = S( 24, 89); constexpr Score ThreatByPawnPush = S( 48, 39); @@ -481,10 +480,6 @@ namespace { if (Pt == ROOK) { - // Bonus for rook on the same file as a queen - if (file_bb(s) & pos.pieces(QUEEN)) - score += RookOnQueenFile; - // Bonus for rook on an open or semi-open file if (pos.is_on_semiopen_file(Us, s)) score += RookOnFile[pos.is_on_semiopen_file(Them, s)]; From 288a604411fa72b06b30f16194cd03592b28f6f2 Mon Sep 17 00:00:00 2001 From: Unai Corzo Date: Mon, 12 Oct 2020 09:03:49 +0200 Subject: [PATCH 06/27] Scale factor tweak Add !pawnsOnBothFlanks heuristic to scale factor. STC https://tests.stockfishchess.org/tests/view/5f8080575b3847b5d41f9134 LLR: 2.94 (-2.94,2.94) {-0.25,1.25} Total: 250960 W: 49779 L: 49168 D: 152013 Ptnml(0-2): 4224, 28822, 58802, 29383, 4249 LTC https://tests.stockfishchess.org/tests/view/5f832f498ea73fb8ddf83ddb LLR: 2.95 (-2.94,2.94) {0.25,1.25} Total: 88584 W: 11827 L: 11388 D: 65369 Ptnml(0-2): 585, 8079, 26578, 8412, 638 closes https://github.com/official-stockfish/Stockfish/pull/3179 bench: 3834252 --- src/evaluate.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/evaluate.cpp b/src/evaluate.cpp index c68577a3..425ba6f8 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -905,7 +905,9 @@ namespace { sf = 37 + 3 * (pos.count(WHITE) == 1 ? 
pos.count(BLACK) + pos.count(BLACK) : pos.count(WHITE) + pos.count(WHITE)); else - sf = std::min(sf, 36 + 7 * pos.count(strongSide)); + sf = std::min(sf, 36 + 7 * pos.count(strongSide)) - 4 * !pawnsOnBothFlanks; + + sf -= 4 * !pawnsOnBothFlanks; } // Interpolate between the middlegame and (scaled by 'sf') endgame score From 281d520cc2bb0123efd230fce45119b57f0bae0d Mon Sep 17 00:00:00 2001 From: mstembera Date: Sun, 18 Oct 2020 04:23:28 -0700 Subject: [PATCH 07/27] Update default net to nn-eba324f53044.nnue The new net is based on the previous net 04cf2b4ed1da but with the biases for the 1st hidden layer tuned with SPSA; see the SPSA session on fishtest here: https://tests.stockfishchess.org/tests/view/5f875213dcdad978fe8c5211 Thanks to @vondele for writing out the net; see the discussion in this thread: https://github.com/mstembera/Stockfish/commit/432da86721647dff1d9426a7cdcfd2dbada8155e Passed STC: LLR: 2.94 (-2.94,2.94) {-0.25,1.25} Total: 15000 W: 1640 L: 1483 D: 11877 Ptnml(0-2): 50, 1183, 4908, 1278, 81 https://tests.stockfishchess.org/tests/view/5f8955e20fea1a44ec4f0a5d Passed LTC: LLR: 2.96 (-2.94,2.94) {0.25,1.25} Total: 81272 W: 3948 L: 3682 D: 73642 Ptnml(0-2): 64, 3194, 33856, 3456, 66 https://tests.stockfishchess.org/tests/view/5f89e8efeae8a6e60644d6e7 closes https://github.com/official-stockfish/Stockfish/pull/3187 Bench: 3762411 --- src/evaluate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/evaluate.h b/src/evaluate.h index 6a17f284..6a8603ad 100644 --- a/src/evaluate.h +++ b/src/evaluate.h @@ -36,7 +36,7 @@ namespace Eval { // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue // for the build process (profile-build and fishtest) to work. Do not change the // name of the macro, as it is used in the Makefile. - #define EvalFileDefaultName "nn-04cf2b4ed1da.nnue" + #define EvalFileDefaultName "nn-eba324f53044.nnue" namespace NNUE { From 560c776397483feaaa0deb5b666f46ff3f5b655f Mon Sep 17 00:00:00 2001 From: Vizvezdenec Date: Sat, 17 Oct 2020 13:40:10 +0200 Subject: [PATCH 08/27] Do more reductions for late quiet moves in case of consecutive fail highs. The idea of this patch is the following: when we have had consecutive fail highs and we reach a late enough move at the root node, the probability that one of the remaining quiet moves can produce an even bigger value than the moves that produced the previous cutoffs (moves that should be high in the move ordering, yet now fail to produce a beta cutoff because we have reached a high move count) should be quite small, so we can reduce such moves more. passed STC LLR: 2.94 (-2.94,2.94) {-0.25,1.25} Total: 53392 W: 5681 L: 5474 D: 42237 Ptnml(0-2): 214, 4104, 17894, 4229, 255 https://tests.stockfishchess.org/tests/view/5f88501adcdad978fe8c527e passed LTC LLR: 2.94 (-2.94,2.94) {0.25,1.25} Total: 59136 W: 2773 L: 2564 D: 53799 Ptnml(0-2): 30, 2117, 25078, 2300, 43 https://tests.stockfishchess.org/tests/view/5f884dbfdcdad978fe8c527a closes https://github.com/official-stockfish/Stockfish/pull/3184 Bench: 4066972 --- src/search.cpp | 5 ++++- src/thread.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/search.cpp b/src/search.cpp index eaa79fb9..ab58ca64 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -417,7 +417,7 @@ void Thread::search() { // Start with a small aspiration window and, in the case of a fail // high/low, re-search with a bigger window until we don't fail // high/low anymore.
- int failedHighCnt = 0; + failedHighCnt = 0; while (true) { Depth adjustedDepth = std::max(1, rootDepth - failedHighCnt - searchAgainCounter); @@ -1177,6 +1177,9 @@ moves_loop: // When in check, search starts from here if (ttCapture) r++; + // Increase reduction at root if failing high + r += rootNode ? thisThread->failedHighCnt * thisThread->failedHighCnt * moveCount / 512 : 0; + // Increase reduction for cut nodes (~10 Elo) if (cutNode) r += 2; diff --git a/src/thread.h b/src/thread.h index 34b99015..6a73423b 100644 --- a/src/thread.h +++ b/src/thread.h @@ -73,6 +73,7 @@ public: CapturePieceToHistory captureHistory; ContinuationHistory continuationHistory[2][2]; Score contempt; + int failedHighCnt; }; From f5dfad5d72e164b57b787c0224046d641b3ade84 Mon Sep 17 00:00:00 2001 From: xoto10 Date: Wed, 21 Oct 2020 14:52:13 +0100 Subject: [PATCH 09/27] Reduce big time spikes by reducing PV re-searches. Save time by reducing PV re-searches above original depth. Instead use 5% extra time on every move. STC 10+0.1 th 1 : LLR: 2.93 (-2.94,2.94) {-0.25,1.25} Total: 90688 W: 9702 L: 9436 D: 71550 Ptnml(0-2): 408, 7252, 29792, 7450, 442 https://tests.stockfishchess.org/tests/view/5f8df807bacb75a4f9a47223 LTC 60+0.6 th 1 : LLR: 2.97 (-2.94,2.94) {0.25,1.25} Total: 97856 W: 4602 L: 4303 D: 88951 Ptnml(0-2): 53, 3757, 41057, 3960, 101 https://tests.stockfishchess.org/tests/view/5f8ec4872c92c7fe3a8c602d closes https://github.com/official-stockfish/Stockfish/pull/3192 Bench 3943959 --- src/search.cpp | 4 +++- src/timeman.cpp | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/search.cpp b/src/search.cpp index ab58ca64..65ed9b73 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -565,6 +565,7 @@ namespace { constexpr bool PvNode = NT == PV; const bool rootNode = PvNode && ss->ply == 0; + const Depth maxNextDepth = rootNode ? depth : depth + 1; // Check if we have an upcoming move which draws by repetition, or // if the opponent had an alternative move earlier to this position. @@ -1259,7 +1260,8 @@ moves_loop: // When in check, search starts from here (ss+1)->pv = pv; (ss+1)->pv[0] = MOVE_NONE; - value = -search(pos, ss+1, -beta, -alpha, newDepth, false); + value = -search(pos, ss+1, -beta, -alpha, + std::min(maxNextDepth, newDepth), false); } // Step 18. Undo move diff --git a/src/timeman.cpp b/src/timeman.cpp index 6d9c95ef..da08f12d 100644 --- a/src/timeman.cpp +++ b/src/timeman.cpp @@ -75,7 +75,7 @@ void TimeManagement::init(Search::LimitsType& limits, Color us, int ply) { // game time for the current move, so also cap to 20% of available game time. if (limits.movestogo == 0) { - optScale = std::min(0.008 + std::pow(ply + 3.0, 0.5) / 250.0, + optScale = std::min(0.0084 + std::pow(ply + 3.0, 0.5) * 0.0042, 0.2 * limits.time[us] / double(timeLeft)); maxScale = std::min(7.0, 4.0 + ply / 12.0); } From 258af8ae44fc15407996e0a21a80ee8b9cfa12cb Mon Sep 17 00:00:00 2001 From: Joost VandeVondele Date: Sun, 18 Oct 2020 15:01:19 +0200 Subject: [PATCH 10/27] Add net as dependency of config. Cleaner output and error message if the server is down and the net is not available.
closes https://github.com/official-stockfish/Stockfish/pull/3188 No functional change --- src/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Makefile b/src/Makefile index 54868b39..87203547 100644 --- a/src/Makefile +++ b/src/Makefile @@ -711,7 +711,7 @@ endif config-sanity icc-profile-use icc-profile-make gcc-profile-use gcc-profile-make \ clang-profile-use clang-profile-make -build: config-sanity net +build: net config-sanity $(MAKE) ARCH=$(ARCH) COMP=$(COMP) all profile-build: net config-sanity objclean profileclean @@ -784,7 +784,7 @@ default: all: $(EXE) .depend -config-sanity: +config-sanity: net @echo "" @echo "Config:" @echo "debug: '$(debug)'" From 2046d5da30b2cd505b69bddb40062b0d37b43bc7 Mon Sep 17 00:00:00 2001 From: syzygy1 <3028851+syzygy1@users.noreply.github.com> Date: Tue, 20 Oct 2020 21:06:06 +0200 Subject: [PATCH 11/27] More incremental accumulator updates This patch was inspired by c065abd which updates the accumulator, if possible, based on the accumulator of two plies back if the accumulator of the preceding ply is not available. With this patch we look back even further in the position history in an attempt to reduce the number of complete recomputations. When we find a usable accumulator for the position N plies back, we also update the accumulator of the position N-1 plies back because that accumulator is most likely to be helpful later when evaluating positions in sibling branches. By not updating all intermediate accumulators immediately, we avoid doing too much work that is not certain to be useful. Overall, roughly 2-3% speedup. This patch makes the code more specific to the net architecture, changing input features of the net will require additional changes to the incremental update code as discussed in the PR #3193 and #3191. Passed STC: https://tests.stockfishchess.org/tests/view/5f9056712c92c7fe3a8c60d0 LLR: 2.94 (-2.94,2.94) {-0.25,1.25} Total: 10040 W: 1116 L: 968 D: 7956 Ptnml(0-2): 42, 722, 3365, 828, 63 closes https://github.com/official-stockfish/Stockfish/pull/3193 No functional change. 
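In rough pseudocode, the look-back described above amounts to the following condensed sketch of the UpdateAccumulator() loop from the diff below (names follow the patch; this is an illustration of the idea, not the complete code):

    // Walk back through the StateInfo chain while this side's accumulator
    // is still empty. Stop when this side's king has moved (a full refresh
    // is then mandatory) or when the number of features to add/subtract
    // would cost more than recomputing the accumulator from scratch.
    StateInfo *st = pos.state(), *next = nullptr;
    int gain = popcount(pos.pieces()) - 2;   // rough cost of a full refresh
    while (st->accumulator.state[c] == EMPTY)
    {
        auto& dp = st->dirtyPiece;
        if (   dp.piece[0] == make_piece(c, KING)
            || (gain -= dp.dirty_num + 1) < 0)
            break;
        next = st;
        st = st->previous;
    }
    // If st ended up COMPUTED, update "next" and pos.state() incrementally;
    // otherwise refresh pos.state()'s accumulator from the active features.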
--- src/nnue/features/feature_set.h | 108 ----------- src/nnue/nnue_accumulator.h | 5 +- src/nnue/nnue_feature_transformer.h | 288 ++++++++++++++-------------- src/position.cpp | 17 +- 4 files changed, 157 insertions(+), 261 deletions(-) diff --git a/src/nnue/features/feature_set.h b/src/nnue/features/feature_set.h index 26198114..975824b6 100644 --- a/src/nnue/features/feature_set.h +++ b/src/nnue/features/feature_set.h @@ -43,90 +43,6 @@ namespace Eval::NNUE::Features { template class FeatureSetBase { - public: - // Get a list of indices for active features - template - static void AppendActiveIndices( - const Position& pos, TriggerEvent trigger, IndexListType active[2]) { - - for (Color perspective : { WHITE, BLACK }) { - Derived::CollectActiveIndices( - pos, trigger, perspective, &active[perspective]); - } - } - - // Get a list of indices for recently changed features - template - static void AppendChangedIndices( - const PositionType& pos, TriggerEvent trigger, - IndexListType removed[2], IndexListType added[2], bool reset[2]) { - - auto collect_for_one = [&](const DirtyPiece& dp) { - for (Color perspective : { WHITE, BLACK }) { - switch (trigger) { - case TriggerEvent::kFriendKingMoved: - reset[perspective] = dp.piece[0] == make_piece(perspective, KING); - break; - default: - assert(false); - break; - } - if (reset[perspective]) { - Derived::CollectActiveIndices( - pos, trigger, perspective, &added[perspective]); - } else { - Derived::CollectChangedIndices( - pos, dp, trigger, perspective, - &removed[perspective], &added[perspective]); - } - } - }; - - auto collect_for_two = [&](const DirtyPiece& dp1, const DirtyPiece& dp2) { - for (Color perspective : { WHITE, BLACK }) { - switch (trigger) { - case TriggerEvent::kFriendKingMoved: - reset[perspective] = dp1.piece[0] == make_piece(perspective, KING) - || dp2.piece[0] == make_piece(perspective, KING); - break; - default: - assert(false); - break; - } - if (reset[perspective]) { - Derived::CollectActiveIndices( - pos, trigger, perspective, &added[perspective]); - } else { - Derived::CollectChangedIndices( - pos, dp1, trigger, perspective, - &removed[perspective], &added[perspective]); - Derived::CollectChangedIndices( - pos, dp2, trigger, perspective, - &removed[perspective], &added[perspective]); - } - } - }; - - if (pos.state()->previous->accumulator.computed_accumulation) { - const auto& prev_dp = pos.state()->dirtyPiece; - if (prev_dp.dirty_num == 0) return; - collect_for_one(prev_dp); - } else { - const auto& prev_dp = pos.state()->previous->dirtyPiece; - if (prev_dp.dirty_num == 0) { - const auto& prev2_dp = pos.state()->dirtyPiece; - if (prev2_dp.dirty_num == 0) return; - collect_for_one(prev2_dp); - } else { - const auto& prev2_dp = pos.state()->dirtyPiece; - if (prev2_dp.dirty_num == 0) { - collect_for_one(prev_dp); - } else { - collect_for_two(prev_dp, prev2_dp); - } - } - } - } }; // Class template that represents the feature set @@ -146,30 +62,6 @@ namespace Eval::NNUE::Features { CompileTimeList; static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues; - private: - // Get a list of indices for active features - static void CollectActiveIndices( - const Position& pos, const TriggerEvent trigger, const Color perspective, - IndexList* const active) { - if (FeatureType::kRefreshTrigger == trigger) { - FeatureType::AppendActiveIndices(pos, perspective, active); - } - } - - // Get a list of indices for recently changed features - static void CollectChangedIndices( - const Position& pos, const DirtyPiece& dp, const 
TriggerEvent trigger, const Color perspective, - IndexList* const removed, IndexList* const added) { - - if (FeatureType::kRefreshTrigger == trigger) { - FeatureType::AppendChangedIndices(pos, dp, perspective, removed, added); - } - } - - // Make the base class and the class template that recursively uses itself a friend - friend class FeatureSetBase; - template - friend class FeatureSet; }; } // namespace Eval::NNUE::Features diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h index 26370710..a357d835 100644 --- a/src/nnue/nnue_accumulator.h +++ b/src/nnue/nnue_accumulator.h @@ -25,11 +25,14 @@ namespace Eval::NNUE { + // The accumulator of a StateInfo without parent is set to the INIT state + enum AccumulatorState { EMPTY, COMPUTED, INIT }; + // Class that holds the result of affine transformation of input features struct alignas(kCacheLineSize) Accumulator { std::int16_t accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions]; - bool computed_accumulation; + AccumulatorState state[2]; }; } // namespace Eval::NNUE diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 2f86d20a..f145c848 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -32,7 +32,7 @@ namespace Eval::NNUE { // If vector instructions are enabled, we update and refresh the // accumulator tile by tile such that each tile fits in the CPU's // vector registers. - #define TILING + #define VECTOR #ifdef USE_AVX512 typedef __m512i vec_t; @@ -75,7 +75,7 @@ namespace Eval::NNUE { static constexpr IndexType kNumRegs = 16; #else - #undef TILING + #undef VECTOR #endif @@ -86,7 +86,7 @@ namespace Eval::NNUE { // Number of output dimensions for one side static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions; - #ifdef TILING + #ifdef VECTOR static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2; static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions"); #endif @@ -119,32 +119,11 @@ namespace Eval::NNUE { return !stream.fail(); } - // Proceed with the difference calculation if possible - bool UpdateAccumulatorIfPossible(const Position& pos) const { - - const auto now = pos.state(); - if (now->accumulator.computed_accumulation) - return true; - - const auto prev = now->previous; - if (prev) { - if (prev->accumulator.computed_accumulation) { - UpdateAccumulator(pos); - return true; - } else if (prev->previous && prev->previous->accumulator.computed_accumulation) { - UpdateAccumulator(pos); - return true; - } - } - - return false; - } - // Convert input features void Transform(const Position& pos, OutputType* output) const { - if (!UpdateAccumulatorIfPossible(pos)) - RefreshAccumulator(pos); + UpdateAccumulator(pos, WHITE); + UpdateAccumulator(pos, BLACK); const auto& accumulation = pos.state()->accumulator.accumulation; @@ -240,27 +219,142 @@ namespace Eval::NNUE { } private: - // Calculate cumulative value without using difference calculation - void RefreshAccumulator(const Position& pos) const { + void UpdateAccumulator(const Position& pos, const Color c) const { - auto& accumulator = pos.state()->accumulator; - IndexType i = 0; - Features::IndexList active_indices[2]; - RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i], - active_indices); - for (Color perspective : { WHITE, BLACK }) { - #ifdef TILING - for (unsigned j = 0; j < kHalfDimensions / kTileHeight; ++j) { + #ifdef VECTOR + // Gcc-10.2 unnecessarily spills AVX2 registers if this array + 
// is defined in the VECTOR code below, once in each branch + vec_t acc[kNumRegs]; + #endif + + // Look for a usable accumulator of an earlier position. We keep track + // of the estimated gain in terms of features to be added/subtracted. + StateInfo *st = pos.state(), *next = nullptr; + int gain = popcount(pos.pieces()) - 2; + while (st->accumulator.state[c] == EMPTY) + { + auto& dp = st->dirtyPiece; + // The first condition tests whether an incremental update is + // possible at all: if this side's king has moved, it is not possible. + static_assert(std::is_same_v>, + "Current code assumes that only kFriendlyKingMoved refresh trigger is being used."); + if ( dp.piece[0] == make_piece(c, KING) + || (gain -= dp.dirty_num + 1) < 0) + break; + next = st; + st = st->previous; + } + + if (st->accumulator.state[c] == COMPUTED) + { + if (next == nullptr) + return; + + // Update incrementally in two steps. First, we update the "next" + // accumulator. Then, we update the current accumulator (pos.state()). + + // Gather all features to be updated. This code assumes HalfKP features + // only and doesn't support refresh triggers. + static_assert(std::is_same_v>, + RawFeatures>); + Features::IndexList removed[2], added[2]; + Features::HalfKP::AppendChangedIndices(pos, + next->dirtyPiece, c, &removed[0], &added[0]); + for (StateInfo *st2 = pos.state(); st2 != next; st2 = st2->previous) + Features::HalfKP::AppendChangedIndices(pos, + st2->dirtyPiece, c, &removed[1], &added[1]); + + // Mark the accumulators as computed. + next->accumulator.state[c] = COMPUTED; + pos.state()->accumulator.state[c] = COMPUTED; + + // Now update the accumulators listed in info[], where the last element is a sentinel. + StateInfo *info[3] = + { next, next == pos.state() ? nullptr : pos.state(), nullptr }; + #ifdef VECTOR + for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) + { + // Load accumulator + auto accTile = reinterpret_cast( + &st->accumulator.accumulation[c][0][j * kTileHeight]); + for (IndexType k = 0; k < kNumRegs; ++k) + acc[k] = vec_load(&accTile[k]); + + for (IndexType i = 0; info[i]; ++i) + { + // Difference calculation for the deactivated features + for (const auto index : removed[i]) + { + const IndexType offset = kHalfDimensions * index + j * kTileHeight; + auto column = reinterpret_cast(&weights_[offset]); + for (IndexType k = 0; k < kNumRegs; ++k) + acc[k] = vec_sub_16(acc[k], column[k]); + } + + // Difference calculation for the activated features + for (const auto index : added[i]) + { + const IndexType offset = kHalfDimensions * index + j * kTileHeight; + auto column = reinterpret_cast(&weights_[offset]); + for (IndexType k = 0; k < kNumRegs; ++k) + acc[k] = vec_add_16(acc[k], column[k]); + } + + // Store accumulator + accTile = reinterpret_cast( + &info[i]->accumulator.accumulation[c][0][j * kTileHeight]); + for (IndexType k = 0; k < kNumRegs; ++k) + vec_store(&accTile[k], acc[k]); + } + } + + #else + for (IndexType i = 0; info[i]; ++i) + { + std::memcpy(info[i]->accumulator.accumulation[c][0], + st->accumulator.accumulation[c][0], + kHalfDimensions * sizeof(BiasType)); + st = info[i]; + + // Difference calculation for the deactivated features + for (const auto index : removed[i]) + { + const IndexType offset = kHalfDimensions * index; + + for (IndexType j = 0; j < kHalfDimensions; ++j) + st->accumulator.accumulation[c][0][j] -= weights_[offset + j]; + } + + // Difference calculation for the activated features + for (const auto index : added[i]) + { + const IndexType offset = 
kHalfDimensions * index; + + for (IndexType j = 0; j < kHalfDimensions; ++j) + st->accumulator.accumulation[c][0][j] += weights_[offset + j]; + } + } + #endif + } + else + { + // Refresh the accumulator + auto& accumulator = pos.state()->accumulator; + accumulator.state[c] = COMPUTED; + Features::IndexList active; + Features::HalfKP::AppendActiveIndices(pos, c, &active); + + #ifdef VECTOR + for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) + { auto biasesTile = reinterpret_cast( &biases_[j * kTileHeight]); - auto accTile = reinterpret_cast( - &accumulator.accumulation[perspective][i][j * kTileHeight]); - vec_t acc[kNumRegs]; - - for (unsigned k = 0; k < kNumRegs; ++k) + for (IndexType k = 0; k < kNumRegs; ++k) acc[k] = biasesTile[k]; - for (const auto index : active_indices[perspective]) { + for (const auto index : active) + { const IndexType offset = kHalfDimensions * index + j * kTileHeight; auto column = reinterpret_cast(&weights_[offset]); @@ -268,18 +362,22 @@ namespace Eval::NNUE { acc[k] = vec_add_16(acc[k], column[k]); } + auto accTile = reinterpret_cast( + &accumulator.accumulation[c][0][j * kTileHeight]); for (unsigned k = 0; k < kNumRegs; k++) vec_store(&accTile[k], acc[k]); } + #else - std::memcpy(accumulator.accumulation[perspective][i], biases_, + std::memcpy(accumulator.accumulation[c][0], biases_, kHalfDimensions * sizeof(BiasType)); - for (const auto index : active_indices[perspective]) { + for (const auto index : active) + { const IndexType offset = kHalfDimensions * index; for (IndexType j = 0; j < kHalfDimensions; ++j) - accumulator.accumulation[perspective][i][j] += weights_[offset + j]; + accumulator.accumulation[c][0][j] += weights_[offset + j]; } #endif } @@ -287,106 +385,6 @@ namespace Eval::NNUE { #if defined(USE_MMX) _mm_empty(); #endif - - accumulator.computed_accumulation = true; - } - - // Calculate cumulative value using difference calculation - void UpdateAccumulator(const Position& pos) const { - - Accumulator* prev_accumulator; - assert(pos.state()->previous); - if (pos.state()->previous->accumulator.computed_accumulation) { - prev_accumulator = &pos.state()->previous->accumulator; - } - else { - assert(pos.state()->previous->previous); - assert(pos.state()->previous->previous->accumulator.computed_accumulation); - prev_accumulator = &pos.state()->previous->previous->accumulator; - } - - auto& accumulator = pos.state()->accumulator; - IndexType i = 0; - Features::IndexList removed_indices[2], added_indices[2]; - bool reset[2] = { false, false }; - RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i], - removed_indices, added_indices, reset); - - #ifdef TILING - for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) { - for (Color perspective : { WHITE, BLACK }) { - auto accTile = reinterpret_cast( - &accumulator.accumulation[perspective][i][j * kTileHeight]); - vec_t acc[kNumRegs]; - - if (reset[perspective]) { - auto biasesTile = reinterpret_cast( - &biases_[j * kTileHeight]); - for (unsigned k = 0; k < kNumRegs; ++k) - acc[k] = biasesTile[k]; - } else { - auto prevAccTile = reinterpret_cast( - &prev_accumulator->accumulation[perspective][i][j * kTileHeight]); - for (IndexType k = 0; k < kNumRegs; ++k) - acc[k] = vec_load(&prevAccTile[k]); - - // Difference calculation for the deactivated features - for (const auto index : removed_indices[perspective]) { - const IndexType offset = kHalfDimensions * index + j * kTileHeight; - auto column = reinterpret_cast(&weights_[offset]); - - for (IndexType k = 0; k < kNumRegs; ++k) - acc[k] = 
vec_sub_16(acc[k], column[k]); - } - } - { // Difference calculation for the activated features - for (const auto index : added_indices[perspective]) { - const IndexType offset = kHalfDimensions * index + j * kTileHeight; - auto column = reinterpret_cast(&weights_[offset]); - - for (IndexType k = 0; k < kNumRegs; ++k) - acc[k] = vec_add_16(acc[k], column[k]); - } - } - - for (IndexType k = 0; k < kNumRegs; ++k) - vec_store(&accTile[k], acc[k]); - } - } - #if defined(USE_MMX) - _mm_empty(); - #endif - - #else - for (Color perspective : { WHITE, BLACK }) { - - if (reset[perspective]) { - std::memcpy(accumulator.accumulation[perspective][i], biases_, - kHalfDimensions * sizeof(BiasType)); - } else { - std::memcpy(accumulator.accumulation[perspective][i], - prev_accumulator->accumulation[perspective][i], - kHalfDimensions * sizeof(BiasType)); - // Difference calculation for the deactivated features - for (const auto index : removed_indices[perspective]) { - const IndexType offset = kHalfDimensions * index; - - for (IndexType j = 0; j < kHalfDimensions; ++j) - accumulator.accumulation[perspective][i][j] -= weights_[offset + j]; - } - } - { // Difference calculation for the activated features - for (const auto index : added_indices[perspective]) { - const IndexType offset = kHalfDimensions * index; - - for (IndexType j = 0; j < kHalfDimensions; ++j) - accumulator.accumulation[perspective][i][j] += weights_[offset + j]; - } - } - } - #endif - - accumulator.computed_accumulation = true; } using BiasType = std::int16_t; diff --git a/src/position.cpp b/src/position.cpp index e6a760d2..b707293d 100644 --- a/src/position.cpp +++ b/src/position.cpp @@ -279,6 +279,8 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th chess960 = isChess960; thisThread = th; set_state(st); + st->accumulator.state[WHITE] = Eval::NNUE::INIT; + st->accumulator.state[BLACK] = Eval::NNUE::INIT; assert(pos_is_ok()); @@ -703,7 +705,8 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) { ++st->pliesFromNull; // Used by NNUE - st->accumulator.computed_accumulation = false; + st->accumulator.state[WHITE] = Eval::NNUE::EMPTY; + st->accumulator.state[BLACK] = Eval::NNUE::EMPTY; auto& dp = st->dirtyPiece; dp.dirty_num = 1; @@ -996,16 +999,16 @@ void Position::do_null_move(StateInfo& newSt) { assert(!checkers()); assert(&newSt != st); - if (Eval::useNNUE) - { - std::memcpy(&newSt, st, sizeof(StateInfo)); - } - else - std::memcpy(&newSt, st, offsetof(StateInfo, accumulator)); + std::memcpy(&newSt, st, offsetof(StateInfo, accumulator)); newSt.previous = st; st = &newSt; + st->dirtyPiece.dirty_num = 0; + st->dirtyPiece.piece[0] = NO_PIECE; // Avoid checks in UpdateAccumulator() + st->accumulator.state[WHITE] = Eval::NNUE::EMPTY; + st->accumulator.state[BLACK] = Eval::NNUE::EMPTY; + if (st->epSquare != SQ_NONE) { st->key ^= Zobrist::enpassant[file_of(st->epSquare)]; From bde3505758417c6cd77f2e09edac5bbd5f58b570 Mon Sep 17 00:00:00 2001 From: FauziAkram Date: Sat, 24 Oct 2020 02:01:04 +0300 Subject: [PATCH 12/27] Bishop Pawns based on Files Passed STC: https://tests.stockfishchess.org/tests/view/5f8cc8145a4eacb45305da3c LLR: 2.94 (-2.94,2.94) {-0.25,1.25} Total: 132544 W: 27795 L: 27328 D: 77421 Ptnml(0-2): 2756, 15558, 29272, 15835, 2851 Passed LTC: https://tests.stockfishchess.org/tests/view/5f8df614bacb75a4f9a4721e LLR: 2.94 (-2.94,2.94) {0.25,1.25} Total: 169608 W: 23257 L: 22622 D: 123729 Ptnml(0-2): 1408, 16316, 48758, 16877, 1445 closes 
https://github.com/official-stockfish/Stockfish/pull/3194 Bench: 4067106 --- src/evaluate.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/evaluate.cpp b/src/evaluate.cpp index 425ba6f8..030d1017 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -222,6 +222,12 @@ namespace { S(112,178), S(114,185), S(114,187), S(119,221) } }; + // BishopPawns[distance from edge] contains a file-dependent penalty for pawns on + // squares of the same color as our bishop. + constexpr Score BishopPawns[int(FILE_NB) / 2] = { + S(3, 8), S(3, 9), S(1, 8), S(3, 7) + }; + // KingProtector[knight/bishop] contains penalty for each distance unit to own king constexpr Score KingProtector[] = { S(8, 9), S(6, 9) }; @@ -252,7 +258,6 @@ namespace { // Assorted bonuses and penalties constexpr Score BadOutpost = S( -7, 36); constexpr Score BishopOnKingRing = S( 24, 0); - constexpr Score BishopPawns = S( 3, 7); constexpr Score BishopXRayPawns = S( 4, 5); constexpr Score CorneredBishop = S( 50, 50); constexpr Score FlankAttacks = S( 8, 0); @@ -453,7 +458,7 @@ namespace { // when the bishop is outside the pawn chain. Bitboard blocked = pos.pieces(Us, PAWN) & shift(pos.pieces()); - score -= BishopPawns * pos.pawns_on_same_color_squares(Us, s) + score -= BishopPawns[edge_distance(file_of(s))] * pos.pawns_on_same_color_squares(Us, s) * (!(attackedBy[Us][PAWN] & s) + popcount(blocked & CenterFiles)); // Penalty for all enemy pawns x-rayed @@ -906,7 +911,7 @@ namespace { : pos.count(WHITE) + pos.count(WHITE)); else sf = std::min(sf, 36 + 7 * pos.count(strongSide)) - 4 * !pawnsOnBothFlanks; - + sf -= 4 * !pawnsOnBothFlanks; } From 6328135264d3b13a2cef3f0c835a27192cae0f40 Mon Sep 17 00:00:00 2001 From: SFisGOD Date: Wed, 28 Oct 2020 04:24:55 +0800 Subject: [PATCH 13/27] Update default net to nn-2eb2e0707c2b.nnue Optimization of the net weights of the 32 x 32 layer (1024 parameters) and net biases of the 512 x 32 layer (32 parameters) using SPSA. Tuning of 32 x 32 Layer (800,000 games, 5 seconds time control): https://tests.stockfishchess.org/tests/view/5f942040d3978d7e86f1aa05 Tuning of 512 x 32 Layer (80,000 games, 20 seconds time control): https://tests.stockfishchess.org/tests/view/5f8f926d2c92c7fe3a8c608b STC: LLR: 2.96 (-2.94,2.94) {-0.25,1.25} Total: 17336 W: 1918 L: 1754 D: 13664 Ptnml(0-2): 79, 1344, 5672, 1480, 93 https://tests.stockfishchess.org/tests/view/5f9882346a2c112b60691b34 LTC: LLR: 2.94 (-2.94,2.94) {0.25,1.25} Total: 37304 W: 1822 L: 1651 D: 33831 Ptnml(0-2): 27, 1461, 15501, 1640, 23 https://tests.stockfishchess.org/tests/view/5f98a4b36a2c112b60691b40 closes https://github.com/official-stockfish/Stockfish/pull/3201 Bench: 3403528 --- src/evaluate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/evaluate.h b/src/evaluate.h index 6a8603ad..6e5db6a3 100644 --- a/src/evaluate.h +++ b/src/evaluate.h @@ -36,7 +36,7 @@ namespace Eval { // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue // for the build process (profile-build and fishtest) to work. Do not change the // name of the macro, as it is used in the Makefile. 
- #define EvalFileDefaultName "nn-eba324f53044.nnue" + #define EvalFileDefaultName "nn-2eb2e0707c2b.nnue" namespace NNUE { From 0f6c08c73f516873b312cb8fce0d824a2167b075 Mon Sep 17 00:00:00 2001 From: syzygy1 <3028851+syzygy1@users.noreply.github.com> Date: Tue, 27 Oct 2020 19:22:41 +0100 Subject: [PATCH 14/27] Do not skip non-recapture ttMove when in check The qsearch() MovePicker incorrectly skips a non-recapture ttMove when in check (if depth <= DEPTH_QS_RECAPTURES). This is clearly not intended and can cause qsearch() to return a mate score when there is no mate. Introduced in cad300c and 6596f0e, as observed by joergoster in #3171 and #3198. This PR fixes the bug by not skipping the non-recapture ttMove when in check. Passed non-regression STC: https://tests.stockfishchess.org/tests/view/5f9867ea6a2c112b60691b10 LLR: 2.98 (-2.94,2.94) {-1.25,0.25} Total: 27112 W: 2943 L: 2842 D: 21327 Ptnml(0-2): 127, 2170, 8878, 2237, 144 Passed non-regression LTC: https://tests.stockfishchess.org/tests/view/5f9967326a2c112b60691bb0 LLR: 2.99 (-2.94,2.94) {-0.75,0.25} Total: 18392 W: 807 L: 738 D: 16847 Ptnml(0-2): 9, 655, 7802, 718, 12 closes https://github.com/official-stockfish/Stockfish/pull/3199 closes https://github.com/official-stockfish/Stockfish/pull/3198 Bench: 3870606 --- src/movepick.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/movepick.cpp b/src/movepick.cpp index 153d323e..f5e02385 100644 --- a/src/movepick.cpp +++ b/src/movepick.cpp @@ -73,8 +73,9 @@ MovePicker::MovePicker(const Position& p, Move ttm, Depth d, const ButterflyHist assert(d <= 0); stage = (pos.checkers() ? EVASION_TT : QSEARCH_TT) + - !(ttm && (depth > DEPTH_QS_RECAPTURES || to_sq(ttm) == recaptureSquare) - && pos.pseudo_legal(ttm)); + !( ttm + && (pos.checkers() || depth > DEPTH_QS_RECAPTURES || to_sq(ttm) == recaptureSquare) + && pos.pseudo_legal(ttm)); } /// MovePicker constructor for ProbCut: we generate captures with SEE greater From dfc7f88650bf8bda4a381d36e209209cf63a9bcc Mon Sep 17 00:00:00 2001 From: mstembera Date: Fri, 30 Oct 2020 13:45:40 -0700 Subject: [PATCH 15/27] Update default net to nn-cb26f10b1fd9.nnue Result of https://tests.stockfishchess.org/tests/view/5f9a06796a2c112b60691c0f tuning. STC LLR: 2.94 (-2.94,2.94) {-0.25,1.25} Total: 53712 W: 5776 L: 5561 D: 42375 Ptnml(0-2): 253, 4282, 17604, 4431, 286 https://tests.stockfishchess.org/tests/view/5f9c7bbc6a2c112b60691d4d LTC LLR: 2.97 (-2.94,2.94) {0.25,1.25} Total: 80184 W: 4007 L: 3739 D: 72438 Ptnml(0-2): 58, 3302, 33130, 3518, 84 https://tests.stockfishchess.org/tests/view/5f9d01f06a2c112b60691d87 closes https://github.com/official-stockfish/Stockfish/pull/3209 bench: 3517795 --- src/evaluate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/evaluate.h b/src/evaluate.h index 6e5db6a3..6bec27db 100644 --- a/src/evaluate.h +++ b/src/evaluate.h @@ -36,7 +36,7 @@ namespace Eval { // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue // for the build process (profile-build and fishtest) to work. Do not change the // name of the macro, as it is used in the Makefile. - #define EvalFileDefaultName "nn-2eb2e0707c2b.nnue" + #define EvalFileDefaultName "nn-cb26f10b1fd9.nnue" namespace NNUE { From 75e06a1c89ebac9c9ec4247bc82ec728a2bffe1e Mon Sep 17 00:00:00 2001 From: Tomasz Sobczyk Date: Thu, 29 Oct 2020 00:14:53 +0100 Subject: [PATCH 16/27] Optimize affine transform for SSSE3 and higher targets. A non-functional speedup. 
Unroll the loops going over the output dimensions in the affine transform layers by a factor of 4 and perform 4 horizontal additions at a time. Instead of doing naive horizontal additions on each vector separately use hadd and shuffling between vectors to reduce the number of instructions by using all lanes for all stages of the horizontal adds. passed STC of the initial version: LLR: 2.95 (-2.94,2.94) {-0.25,1.25} Total: 17808 W: 1914 L: 1756 D: 14138 Ptnml(0-2): 76, 1330, 5948, 1460, 90 https://tests.stockfishchess.org/tests/view/5f9d516f6a2c112b60691da3 passed STC of the final version after cleanup: LLR: 2.95 (-2.94,2.94) {-0.25,1.25} Total: 16296 W: 1750 L: 1595 D: 12951 Ptnml(0-2): 72, 1192, 5479, 1319, 86 https://tests.stockfishchess.org/tests/view/5f9df5776a2c112b60691de3 closes https://github.com/official-stockfish/Stockfish/pull/3203 No functional change --- src/nnue/layers/affine_transform.h | 478 +++++++++++++++++++++++------ 1 file changed, 384 insertions(+), 94 deletions(-) diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index 94d0b5a9..f0292e45 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -74,113 +74,400 @@ namespace Eval::NNUE::Layers { const TransformedFeatureType* transformed_features, char* buffer) const { const auto input = previous_layer_.Propagate( transformed_features, buffer + kSelfBufferSize); + +#if defined (USE_AVX512) + + [[maybe_unused]] const __m512i kOnes512 = _mm512_set1_epi16(1); + + [[maybe_unused]] auto m512_hadd = [](__m512i sum, int bias) -> int { + return _mm512_reduce_add_epi32(sum) + bias; + }; + + [[maybe_unused]] auto m512_haddx4 = [](__m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3, __m128i bias) -> __m128i { + __m512i sum01a = _mm512_unpacklo_epi32(sum0, sum1); + __m512i sum01b = _mm512_unpackhi_epi32(sum0, sum1); + + __m512i sum23a = _mm512_unpacklo_epi32(sum2, sum3); + __m512i sum23b = _mm512_unpackhi_epi32(sum2, sum3); + + __m512i sum01 = _mm512_add_epi32(sum01a, sum01b); + __m512i sum23 = _mm512_add_epi32(sum23a, sum23b); + + __m512i sum0123a = _mm512_unpacklo_epi64(sum01, sum23); + __m512i sum0123b = _mm512_unpackhi_epi64(sum01, sum23); + + __m512i sum = _mm512_add_epi32(sum0123a, sum0123b); + + __m256i sum256lo = _mm512_castsi512_si256(sum); + __m256i sum256hi = _mm512_extracti64x4_epi64(sum, 1); + + sum256lo = _mm256_add_epi32(sum256lo, sum256hi); + + __m128i sum128lo = _mm256_castsi256_si128(sum256lo); + __m128i sum128hi = _mm256_extracti128_si256(sum256lo, 1); + + return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias); + }; + + [[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) { +#if defined (USE_VNNI) + acc = _mm512_dpbusd_epi32(acc, a, b); +#else + __m512i product0 = _mm512_maddubs_epi16(a, b); + product0 = _mm512_madd_epi16(product0, kOnes512); + acc = _mm512_add_epi32(acc, product0); +#endif + }; + +#endif +#if defined (USE_AVX2) + + [[maybe_unused]] const __m256i kOnes256 = _mm256_set1_epi16(1); + + [[maybe_unused]] auto m256_hadd = [](__m256i sum, int bias) -> int { + __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1)); + sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC)); + sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB)); + return _mm_cvtsi128_si32(sum128) + bias; + }; + + [[maybe_unused]] auto m256_haddx4 = [](__m256i sum0, __m256i sum1, __m256i sum2, __m256i sum3, __m128i bias) -> __m128i { + sum0 = 
_mm256_hadd_epi32(sum0, sum1); + sum2 = _mm256_hadd_epi32(sum2, sum3); + + sum0 = _mm256_hadd_epi32(sum0, sum2); + + __m128i sum128lo = _mm256_castsi256_si128(sum0); + __m128i sum128hi = _mm256_extracti128_si256(sum0, 1); + + return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias); + }; + + [[maybe_unused]] auto m256_add_dpbusd_epi32 = [=](__m256i& acc, __m256i a, __m256i b) { +#if defined (USE_VNNI) + acc = _mm256_dpbusd_epi32(acc, a, b); +#else + __m256i product0 = _mm256_maddubs_epi16(a, b); + product0 = _mm256_madd_epi16(product0, kOnes256); + acc = _mm256_add_epi32(acc, product0); +#endif + }; + +#endif + +#if defined (USE_SSSE3) + + [[maybe_unused]] const __m128i kOnes128 = _mm_set1_epi16(1); + + [[maybe_unused]] auto m128_hadd = [](__m128i sum, int bias) -> int { + sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC + sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB + return _mm_cvtsi128_si32(sum) + bias; + }; + + [[maybe_unused]] auto m128_haddx4 = [](__m128i sum0, __m128i sum1, __m128i sum2, __m128i sum3, __m128i bias) -> __m128i { + sum0 = _mm_hadd_epi32(sum0, sum1); + sum2 = _mm_hadd_epi32(sum2, sum3); + + sum0 = _mm_hadd_epi32(sum0, sum2); + + return _mm_add_epi32(sum0, bias); + }; + + [[maybe_unused]] auto m128_add_dpbusd_epi32 = [=](__m128i& acc, __m128i a, __m128i b) { + __m128i product0 = _mm_maddubs_epi16(a, b); + product0 = _mm_madd_epi16(product0, kOnes128); + acc = _mm_add_epi32(acc, product0); + }; + +#endif + +#if defined (USE_AVX512) + + constexpr IndexType kNumChunks512 = kPaddedInputDimensions / (kSimdWidth * 2); + constexpr IndexType kNumChunks256 = kPaddedInputDimensions / kSimdWidth; + const auto output = reinterpret_cast(buffer); - #if defined(USE_AVX512) - constexpr IndexType kNumChunks = kPaddedInputDimensions / (kSimdWidth * 2); - const auto input_vector = reinterpret_cast(input); - #if !defined(USE_VNNI) - const __m512i kOnes = _mm512_set1_epi16(1); - #endif + // Since to saturate a zmm register it takes 64 bytes we + // cannot use AVX512 for the smaller affine transforms. + // Instead we fallback to a AVX2 implementation if the + // kInputDimensions isn't a multiple of 64. + // Note that this means that for example for + // kInputDimensions of 96 we fallback to AVX2 even though + // the first 64 elements could be processed with AVX512. + // This is caused by mixing the __m256 and __m512 variables + // required to better handle that case and it would + // require handling more cases statically not to lose performance. + // This should be revisited if such input dimensions are to be considered. + [[maybe_unused]] const auto input_vector512 = reinterpret_cast(input); + [[maybe_unused]] const auto input_vector256 = reinterpret_cast(input); + + // kOutputDimensions is either 1 or a multiple of kSimdWidth + // because then it is also an input dimension. 
+ if constexpr (kOutputDimensions % 4 == 0) + { + for (IndexType i = 0; i < kOutputDimensions; i += 4) + { + const IndexType offset0 = (i + 0) * kPaddedInputDimensions; + const IndexType offset1 = (i + 1) * kPaddedInputDimensions; + const IndexType offset2 = (i + 2) * kPaddedInputDimensions; + const IndexType offset3 = (i + 3) * kPaddedInputDimensions; + + const __m128i bias = *reinterpret_cast(&biases_[i]); + __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]); + + if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0) + { + __m512i sum0 = _mm512_setzero_si512(); + __m512i sum1 = _mm512_setzero_si512(); + __m512i sum2 = _mm512_setzero_si512(); + __m512i sum3 = _mm512_setzero_si512(); + + const auto row0 = reinterpret_cast(&weights_[offset0]); + const auto row1 = reinterpret_cast(&weights_[offset1]); + const auto row2 = reinterpret_cast(&weights_[offset2]); + const auto row3 = reinterpret_cast(&weights_[offset3]); + + for (IndexType j = 0; j < kNumChunks512; ++j) + { + const __m512i in = input_vector512[j]; + + m512_add_dpbusd_epi32(sum0, in, row0[j]); + m512_add_dpbusd_epi32(sum1, in, row1[j]); + m512_add_dpbusd_epi32(sum2, in, row2[j]); + m512_add_dpbusd_epi32(sum3, in, row3[j]); + } + + *outptr = m512_haddx4(sum0, sum1, sum2, sum3, bias); + } + else + { + __m256i sum0 = _mm256_setzero_si256(); + __m256i sum1 = _mm256_setzero_si256(); + __m256i sum2 = _mm256_setzero_si256(); + __m256i sum3 = _mm256_setzero_si256(); + + const auto row0 = reinterpret_cast(&weights_[offset0]); + const auto row1 = reinterpret_cast(&weights_[offset1]); + const auto row2 = reinterpret_cast(&weights_[offset2]); + const auto row3 = reinterpret_cast(&weights_[offset3]); + + for (IndexType j = 0; j < kNumChunks256; ++j) + { + const __m256i in = input_vector256[j]; + + m256_add_dpbusd_epi32(sum0, in, row0[j]); + m256_add_dpbusd_epi32(sum1, in, row1[j]); + m256_add_dpbusd_epi32(sum2, in, row2[j]); + m256_add_dpbusd_epi32(sum3, in, row3[j]); + } + + *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias); + } + } + } + else if constexpr (kOutputDimensions == 1) + { + if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0) + { + __m512i sum0 = _mm512_setzero_si512(); + + const auto row0 = reinterpret_cast(&weights_[0]); + + for (IndexType j = 0; j < kNumChunks512; ++j) + { + const __m512i in = input_vector512[j]; + + m512_add_dpbusd_epi32(sum0, in, row0[j]); + } + + output[0] = m512_hadd(sum0, biases_[0]); + } + else + { + __m256i sum0 = _mm256_setzero_si256(); + + const auto row0 = reinterpret_cast(&weights_[0]); + + for (IndexType j = 0; j < kNumChunks256; ++j) + { + const __m256i in = input_vector256[j]; + + m256_add_dpbusd_epi32(sum0, in, row0[j]); + } + + output[0] = m256_hadd(sum0, biases_[0]); + } + } + else + { + // This case can never happen because kOutputDimensions + // is always 1 or a multiple of kSimdWidth. + assert(false); + } + +#elif defined (USE_AVX2) - #elif defined(USE_AVX2) constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; + + const auto output = reinterpret_cast(buffer); const auto input_vector = reinterpret_cast(input); - #if !defined(USE_VNNI) - const __m256i kOnes = _mm256_set1_epi16(1); - #endif - #elif defined(USE_SSE2) + // kOutputDimensions is either 1 or a multiple of kSimdWidth + // because then it is also an input dimension. 
+ if constexpr (kOutputDimensions % 4 == 0) + { + for (IndexType i = 0; i < kOutputDimensions; i += 4) + { + const IndexType offset0 = (i + 0) * kPaddedInputDimensions; + const IndexType offset1 = (i + 1) * kPaddedInputDimensions; + const IndexType offset2 = (i + 2) * kPaddedInputDimensions; + const IndexType offset3 = (i + 3) * kPaddedInputDimensions; + + const __m128i bias = *reinterpret_cast(&biases_[i]); + __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]); + + __m256i sum0 = _mm256_setzero_si256(); + __m256i sum1 = _mm256_setzero_si256(); + __m256i sum2 = _mm256_setzero_si256(); + __m256i sum3 = _mm256_setzero_si256(); + + const auto row0 = reinterpret_cast(&weights_[offset0]); + const auto row1 = reinterpret_cast(&weights_[offset1]); + const auto row2 = reinterpret_cast(&weights_[offset2]); + const auto row3 = reinterpret_cast(&weights_[offset3]); + + for (IndexType j = 0; j < kNumChunks; ++j) + { + const __m256i in = input_vector[j]; + + m256_add_dpbusd_epi32(sum0, in, row0[j]); + m256_add_dpbusd_epi32(sum1, in, row1[j]); + m256_add_dpbusd_epi32(sum2, in, row2[j]); + m256_add_dpbusd_epi32(sum3, in, row3[j]); + } + + *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias); + } + } + else if constexpr (kOutputDimensions == 1) + { + __m256i sum0 = _mm256_setzero_si256(); + + const auto row0 = reinterpret_cast(&weights_[0]); + + for (IndexType j = 0; j < kNumChunks; ++j) + { + const __m256i in = input_vector[j]; + + m256_add_dpbusd_epi32(sum0, in, row0[j]); + } + + output[0] = m256_hadd(sum0, biases_[0]); + } + else + { + // This case can never happen because kOutputDimensions + // is always 1 or a multiple of kSimdWidth. + assert(false); + } + +#elif defined (USE_SSSE3) + constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; - #ifndef USE_SSSE3 - const __m128i kZeros = _mm_setzero_si128(); - #else - const __m128i kOnes = _mm_set1_epi16(1); - #endif + + auto output = reinterpret_cast(buffer); const auto input_vector = reinterpret_cast(input); - #elif defined(USE_MMX) + // kOutputDimensions is either 1 or a multiple of kSimdWidth + // because then it is also an input dimension. 
+ if constexpr (kOutputDimensions % 4 == 0) + { + for (IndexType i = 0; i < kOutputDimensions; i += 4) + { + const IndexType offset0 = (i + 0) * kPaddedInputDimensions; + const IndexType offset1 = (i + 1) * kPaddedInputDimensions; + const IndexType offset2 = (i + 2) * kPaddedInputDimensions; + const IndexType offset3 = (i + 3) * kPaddedInputDimensions; + + const __m128i bias = *reinterpret_cast(&biases_[i]); + __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]); + + __m128i sum0 = _mm_setzero_si128(); + __m128i sum1 = _mm_setzero_si128(); + __m128i sum2 = _mm_setzero_si128(); + __m128i sum3 = _mm_setzero_si128(); + + const auto row0 = reinterpret_cast(&weights_[offset0]); + const auto row1 = reinterpret_cast(&weights_[offset1]); + const auto row2 = reinterpret_cast(&weights_[offset2]); + const auto row3 = reinterpret_cast(&weights_[offset3]); + + for (int j = 0; j < (int)kNumChunks; j += 1) + { + const __m128i in = input_vector[j]; + + m128_add_dpbusd_epi32(sum0, in, row0[j]); + m128_add_dpbusd_epi32(sum1, in, row1[j]); + m128_add_dpbusd_epi32(sum2, in, row2[j]); + m128_add_dpbusd_epi32(sum3, in, row3[j]); + } + + *outptr = m128_haddx4(sum0, sum1, sum2, sum3, bias); + } + } + else if constexpr (kOutputDimensions == 1) + { + __m128i sum0 = _mm_setzero_si128(); + + const auto row0 = reinterpret_cast(&weights_[0]); + + for (int j = 0; j < (int)kNumChunks; j += 1) + { + const __m128i in = input_vector[j]; + + m128_add_dpbusd_epi32(sum0, in, row0[j]); + } + + output[0] = m128_hadd(sum0, biases_[0]); + } + else + { + // This case can never happen because kOutputDimensions + // is always 1 or a multiple of kSimdWidth. + assert(false); + } + +#else + +// Use old implementation for the other architectures. + + auto output = reinterpret_cast(buffer); + +#if defined(USE_SSE2) + constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; +#ifndef USE_SSSE3 + const __m128i kZeros = _mm_setzero_si128(); +#else + const __m128i kOnes = _mm_set1_epi16(1); +#endif + const auto input_vector = reinterpret_cast(input); + +#elif defined(USE_MMX) constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; const __m64 kZeros = _mm_setzero_si64(); const auto input_vector = reinterpret_cast(input); - #elif defined(USE_NEON) +#elif defined(USE_NEON) constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; const auto input_vector = reinterpret_cast(input); - #endif +#endif for (IndexType i = 0; i < kOutputDimensions; ++i) { const IndexType offset = i * kPaddedInputDimensions; - #if defined(USE_AVX512) - __m512i sum = _mm512_setzero_si512(); - const auto row = reinterpret_cast(&weights_[offset]); - for (IndexType j = 0; j < kNumChunks; ++j) { - #if defined(USE_VNNI) - sum = _mm512_dpbusd_epi32(sum, _mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j])); - #else - __m512i product = _mm512_maddubs_epi16(_mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j])); - product = _mm512_madd_epi16(product, kOnes); - sum = _mm512_add_epi32(sum, product); - #endif - } - - // Note: Changing kMaxSimdWidth from 32 to 64 breaks loading existing networks. - // As a result kPaddedInputDimensions may not be an even multiple of 64(512bit) - // and we have to do one more 256bit chunk. 
- if (kPaddedInputDimensions != kNumChunks * kSimdWidth * 2) - { - const auto iv256 = reinterpret_cast(&input_vector[kNumChunks]); - const auto row256 = reinterpret_cast(&row[kNumChunks]); - #if defined(USE_VNNI) - __m256i product256 = _mm256_dpbusd_epi32( - _mm512_castsi512_si256(sum), _mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0])); - sum = _mm512_inserti32x8(sum, product256, 0); - #else - __m256i product256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0])); - sum = _mm512_add_epi32(sum, _mm512_cvtepi16_epi32(product256)); - #endif - } - output[i] = _mm512_reduce_add_epi32(sum) + biases_[i]; - - #elif defined(USE_AVX2) - __m256i sum = _mm256_setzero_si256(); - const auto row = reinterpret_cast(&weights_[offset]); - for (IndexType j = 0; j < kNumChunks; ++j) { - #if defined(USE_VNNI) - sum = _mm256_dpbusd_epi32(sum, _mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j])); - #else - __m256i product = _mm256_maddubs_epi16(_mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j])); - product = _mm256_madd_epi16(product, kOnes); - sum = _mm256_add_epi32(sum, product); - #endif - } - __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1)); - sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC)); - sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB)); - output[i] = _mm_cvtsi128_si32(sum128) + biases_[i]; - - #elif defined(USE_SSSE3) - __m128i sum = _mm_setzero_si128(); - const auto row = reinterpret_cast(&weights_[offset]); - for (int j = 0; j < (int)kNumChunks - 1; j += 2) { - __m128i product0 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j])); - product0 = _mm_madd_epi16(product0, kOnes); - sum = _mm_add_epi32(sum, product0); - __m128i product1 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j+1]), _mm_load_si128(&row[j+1])); - product1 = _mm_madd_epi16(product1, kOnes); - sum = _mm_add_epi32(sum, product1); - } - if (kNumChunks & 0x1) { - __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[kNumChunks-1]), _mm_load_si128(&row[kNumChunks-1])); - product = _mm_madd_epi16(product, kOnes); - sum = _mm_add_epi32(sum, product); - } - sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC - sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB - output[i] = _mm_cvtsi128_si32(sum) + biases_[i]; - - #elif defined(USE_SSE2) +#if defined(USE_SSE2) __m128i sum_lo = _mm_cvtsi32_si128(biases_[i]); __m128i sum_hi = kZeros; const auto row = reinterpret_cast(&weights_[offset]); @@ -204,7 +491,7 @@ namespace Eval::NNUE::Layers { sum = _mm_add_epi32(sum, sum_second_32); output[i] = _mm_cvtsi128_si32(sum); - #elif defined(USE_MMX) +#elif defined(USE_MMX) __m64 sum_lo = _mm_cvtsi32_si64(biases_[i]); __m64 sum_hi = kZeros; const auto row = reinterpret_cast(&weights_[offset]); @@ -225,7 +512,7 @@ namespace Eval::NNUE::Layers { sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum)); output[i] = _mm_cvtsi64_si32(sum); - #elif defined(USE_NEON) +#elif defined(USE_NEON) int32x4_t sum = {biases_[i]}; const auto row = reinterpret_cast(&weights_[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { @@ -235,18 +522,21 @@ namespace Eval::NNUE::Layers { } output[i] = sum[0] + sum[1] + sum[2] + sum[3]; - #else +#else OutputType sum = biases_[i]; for (IndexType j = 0; j < kInputDimensions; ++j) { sum += weights_[offset + j] * input[j]; } output[i] = sum; - #endif +#endif } - #if defined(USE_MMX) +#if 
defined(USE_MMX) _mm_empty(); - #endif +#endif + +#endif + return output; } From 931070b65ac0332469a24765a60eb27e038f73bc Mon Sep 17 00:00:00 2001 From: FauziAkram Date: Thu, 29 Oct 2020 17:33:18 +0300 Subject: [PATCH 17/27] Elo Worth in King Danger Adding the EloWorth for each term in King Danger. Should be useful for simplifications, tuning patches, and new ideas. closes https://github.com/official-stockfish/Stockfish/pull/3204 non-functional change --- src/evaluate.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/evaluate.cpp b/src/evaluate.cpp index 030d1017..4ade46fa 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -582,18 +582,18 @@ namespace { int kingFlankAttack = popcount(b1) + popcount(b2); int kingFlankDefense = popcount(b3); - kingDanger += kingAttackersCount[Them] * kingAttackersWeight[Them] - + 185 * popcount(kingRing[Us] & weak) - + 148 * popcount(unsafeChecks) - + 98 * popcount(pos.blockers_for_king(Us)) - + 69 * kingAttacksCount[Them] - + 3 * kingFlankAttack * kingFlankAttack / 8 - + mg_value(mobility[Them] - mobility[Us]) - - 873 * !pos.count(Them) - - 100 * bool(attackedBy[Us][KNIGHT] & attackedBy[Us][KING]) - - 6 * mg_value(score) / 8 - - 4 * kingFlankDefense - + 37; + kingDanger += kingAttackersCount[Them] * kingAttackersWeight[Them] // (~10 Elo) + + 185 * popcount(kingRing[Us] & weak) // (~15 Elo) + + 148 * popcount(unsafeChecks) // (~4 Elo) + + 98 * popcount(pos.blockers_for_king(Us)) // (~2 Elo) + + 69 * kingAttacksCount[Them] // (~0.5 Elo) + + 3 * kingFlankAttack * kingFlankAttack / 8 // (~0.5 Elo) + + mg_value(mobility[Them] - mobility[Us]) // (~0.5 Elo) + - 873 * !pos.count(Them) // (~24 Elo) + - 100 * bool(attackedBy[Us][KNIGHT] & attackedBy[Us][KING]) // (~5 Elo) + - 6 * mg_value(score) / 8 // (~8 Elo) + - 4 * kingFlankDefense // (~5 Elo) + + 37; // (~0.5 Elo) // Transform the kingDanger units into a Score, and subtract it from the evaluation if (kingDanger > 100) From a260c9a8a24a2630a900efc3821000c3481b0c5d Mon Sep 17 00:00:00 2001 From: "J. Oster" Date: Sun, 1 Nov 2020 18:33:17 +0100 Subject: [PATCH 18/27] Fix incorrect pruning in qsearch Only do countermove based pruning in qsearch if we already have a move with a better score than a TB loss. This patch fixes a bug (started as 843a961) that incorrectly prunes moves if in check, and adds an assert to make sure no wrong mate scores are given in the future. It replaces a no-op moveCount check with a check for bestValue. Initially discussed in #3171 and later in #3199, #3198 and #3210. This PR effectively closes #3171 It also likely fixes #3196 where this causes user visible incorrect TB scores, which probably result from these incorrect mate scores. Passed STC and LTC non-regression tests. 
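For illustration only (this sketch is not part of the patch; the actual diff appears further below): moveCount is incremented a few lines earlier in qsearch() for every move examined, so the old "&& moveCount" condition was always true and never blocked the pruning. Gating on bestValue instead means the pruning can only fire once some earlier move has already scored above a tablebase loss. In the sketch, "historyIsBad" is hypothetical shorthand for the two contHist threshold tests; the other names are the existing qsearch() identifiers.

    // Sketch of the guard change, not compilable on its own.
    moveCount++;                     // incremented earlier for every move tried

    // old guard: moveCount is already >= 1 at this point, so it never
    // prevented pruning, even when all remaining moves were check evasions
    // if (!captureOrPromotion && moveCount && historyIsBad)
    //     continue;

    // new guard: prune only after some move has scored above a TB loss, so
    // the last evasions of a check can never be pruned into a false mate score
    if (!captureOrPromotion && bestValue > VALUE_TB_LOSS_IN_MAX_PLY && historyIsBad)
        continue;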
https://tests.stockfishchess.org/tests/view/5f9ef8dabca9bf35bae7f648 LLR: 2.93 (-2.94,2.94) {-1.25,0.25} Total: 21672 W: 2339 L: 2230 D: 17103 Ptnml(0-2): 126, 1689, 7083, 1826, 112 https://tests.stockfishchess.org/tests/view/5f9f0caebca9bf35bae7f666 LLR: 2.97 (-2.94,2.94) {-0.75,0.25} Total: 33152 W: 1551 L: 1485 D: 30116 Ptnml(0-2): 27, 1308, 13832, 1390, 19 closes https://github.com/official-stockfish/Stockfish/pull/3214 Bench: 3625915 --- src/search.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/search.cpp b/src/search.cpp index 65ed9b73..743449fa 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -1565,7 +1565,7 @@ moves_loop: // When in check, search starts from here // CounterMove based pruning if ( !captureOrPromotion - && moveCount + && bestValue > VALUE_TB_LOSS_IN_MAX_PLY && (*contHist[0])[pos.moved_piece(move)][to_sq(move)] < CounterMovePruneThreshold && (*contHist[1])[pos.moved_piece(move)][to_sq(move)] < CounterMovePruneThreshold) continue; @@ -1600,7 +1600,11 @@ moves_loop: // When in check, search starts from here // All legal moves have been searched. A special case: if we're in check // and no legal moves were found, it is checkmate. if (ss->inCheck && bestValue == -VALUE_INFINITE) + { + assert(!MoveList(pos).size()); + return mated_in(ss->ply); // Plies to mate from the root + } tte->save(posKey, value_to_tt(bestValue, ss->ply), pvHit, bestValue >= beta ? BOUND_LOWER : From 3f6451eff7c62e8d4a33c5b11f055a81b3da8387 Mon Sep 17 00:00:00 2001 From: Tomasz Sobczyk Date: Tue, 3 Nov 2020 11:23:35 +0100 Subject: [PATCH 19/27] Manually align arrays on the stack as a workaround to issues with overaligned alignas() on stack variables in gcc < 9.3 on windows. closes https://github.com/official-stockfish/Stockfish/pull/3217 fixes #3216 No functional change --- src/misc.h | 12 ++++++++++++ src/nnue/evaluate_nnue.cpp | 25 ++++++++++++++++++++++--- src/nnue/layers/clipped_relu.h | 10 +++++----- src/nnue/nnue_common.h | 23 ----------------------- src/nnue/nnue_feature_transformer.h | 14 +++++++------- src/position.cpp | 4 ++++ src/search.cpp | 8 ++++++++ src/types.h | 6 ++++++ 8 files changed, 64 insertions(+), 38 deletions(-) diff --git a/src/misc.h b/src/misc.h index bc48f303..682ef816 100644 --- a/src/misc.h +++ b/src/misc.h @@ -24,6 +24,7 @@ #include #include #include +#include #include "types.h" @@ -63,6 +64,17 @@ std::ostream& operator<<(std::ostream&, SyncCout); #define sync_cout std::cout << IO_LOCK #define sync_endl std::endl << IO_UNLOCK +// `ptr` must point to an array of size at least +// `sizeof(T) * N + alignment` bytes, where `N` is the +// number of elements in the array. +template +T* align_ptr_up(T* ptr) +{ + static_assert(alignof(T) < Alignment); + + const uintptr_t ptrint = reinterpret_cast(reinterpret_cast(ptr)); + return reinterpret_cast(reinterpret_cast((ptrint + (Alignment - 1)) / Alignment * Alignment)); +} /// xorshift64star Pseudo-Random Number Generator /// This class is based on original code written and dedicated diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp index b5dcd992..b0ed7d2f 100644 --- a/src/nnue/evaluate_nnue.cpp +++ b/src/nnue/evaluate_nnue.cpp @@ -25,6 +25,7 @@ #include "../position.h" #include "../misc.h" #include "../uci.h" +#include "../types.h" #include "evaluate_nnue.h" @@ -126,10 +127,28 @@ namespace Eval::NNUE { // Evaluation function. Perform differential calculation. 
Value evaluate(const Position& pos) { - alignas(kCacheLineSize) TransformedFeatureType - transformed_features[FeatureTransformer::kBufferSize]; + // We manually align the arrays on the stack because with gcc < 9.3 + // overaligning stack variables with alignas() doesn't work correctly. + + constexpr uint64_t alignment = kCacheLineSize; + +#if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN) + TransformedFeatureType transformed_features_unaligned[ + FeatureTransformer::kBufferSize + alignment / sizeof(TransformedFeatureType)]; + char buffer_unaligned[Network::kBufferSize + alignment]; + + auto* transformed_features = align_ptr_up(&transformed_features_unaligned[0]); + auto* buffer = align_ptr_up(&buffer_unaligned[0]); +#else + alignas(alignment) + TransformedFeatureType transformed_features[FeatureTransformer::kBufferSize]; + alignas(alignment) char buffer[Network::kBufferSize]; +#endif + + ASSERT_ALIGNED(transformed_features, alignment); + ASSERT_ALIGNED(buffer, alignment); + feature_transformer->Transform(pos, transformed_features); - alignas(kCacheLineSize) char buffer[Network::kBufferSize]; const auto output = network->Propagate(transformed_features, buffer); return static_cast(output[0] / FV_SCALE); diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h index 44d8a7de..7f6d67bf 100644 --- a/src/nnue/layers/clipped_relu.h +++ b/src/nnue/layers/clipped_relu.h @@ -74,12 +74,12 @@ namespace Eval::NNUE::Layers { const auto out = reinterpret_cast<__m256i*>(output); for (IndexType i = 0; i < kNumChunks; ++i) { const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32( - _mm256_loadA_si256(&in[i * 4 + 0]), - _mm256_loadA_si256(&in[i * 4 + 1])), kWeightScaleBits); + _mm256_load_si256(&in[i * 4 + 0]), + _mm256_load_si256(&in[i * 4 + 1])), kWeightScaleBits); const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32( - _mm256_loadA_si256(&in[i * 4 + 2]), - _mm256_loadA_si256(&in[i * 4 + 3])), kWeightScaleBits); - _mm256_storeA_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8( + _mm256_load_si256(&in[i * 4 + 2]), + _mm256_load_si256(&in[i * 4 + 3])), kWeightScaleBits); + _mm256_store_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8( _mm256_packs_epi16(words0, words1), kZero), kOffsets)); } constexpr IndexType kStart = kNumChunks * kSimdWidth; diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h index 8afea186..a9664262 100644 --- a/src/nnue/nnue_common.h +++ b/src/nnue/nnue_common.h @@ -43,29 +43,6 @@ #include #endif -// HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Otherwise a binary -// compiled with older g++ crashes because the output memory is not aligned -// even though alignas is specified. 
-#if defined(USE_AVX2) -#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32) && !defined(__clang__) -#define _mm256_loadA_si256 _mm256_loadu_si256 -#define _mm256_storeA_si256 _mm256_storeu_si256 -#else -#define _mm256_loadA_si256 _mm256_load_si256 -#define _mm256_storeA_si256 _mm256_store_si256 -#endif -#endif - -#if defined(USE_AVX512) -#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32) && !defined(__clang__) -#define _mm512_loadA_si512 _mm512_loadu_si512 -#define _mm512_storeA_si512 _mm512_storeu_si512 -#else -#define _mm512_loadA_si512 _mm512_load_si512 -#define _mm512_storeA_si512 _mm512_store_si512 -#endif -#endif - namespace Eval::NNUE { // Version of the evaluation file diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index f145c848..c3f012e4 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -36,16 +36,16 @@ namespace Eval::NNUE { #ifdef USE_AVX512 typedef __m512i vec_t; - #define vec_load(a) _mm512_loadA_si512(a) - #define vec_store(a,b) _mm512_storeA_si512(a,b) + #define vec_load(a) _mm512_load_si512(a) + #define vec_store(a,b) _mm512_store_si512(a,b) #define vec_add_16(a,b) _mm512_add_epi16(a,b) #define vec_sub_16(a,b) _mm512_sub_epi16(a,b) static constexpr IndexType kNumRegs = 8; // only 8 are needed #elif USE_AVX2 typedef __m256i vec_t; - #define vec_load(a) _mm256_loadA_si256(a) - #define vec_store(a,b) _mm256_storeA_si256(a,b) + #define vec_load(a) _mm256_load_si256(a) + #define vec_store(a,b) _mm256_store_si256(a,b) #define vec_add_16(a,b) _mm256_add_epi16(a,b) #define vec_sub_16(a,b) _mm256_sub_epi16(a,b) static constexpr IndexType kNumRegs = 16; @@ -157,11 +157,11 @@ namespace Eval::NNUE { #if defined(USE_AVX2) auto out = reinterpret_cast<__m256i*>(&output[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { - __m256i sum0 = _mm256_loadA_si256( + __m256i sum0 = _mm256_load_si256( &reinterpret_cast(accumulation[perspectives[p]][0])[j * 2 + 0]); - __m256i sum1 = _mm256_loadA_si256( + __m256i sum1 = _mm256_load_si256( &reinterpret_cast(accumulation[perspectives[p]][0])[j * 2 + 1]); - _mm256_storeA_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8( + _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8( _mm256_packs_epi16(sum0, sum1), kZero), kControl)); } diff --git a/src/position.cpp b/src/position.cpp index b707293d..5ce7da22 100644 --- a/src/position.cpp +++ b/src/position.cpp @@ -77,6 +77,8 @@ std::ostream& operator<<(std::ostream& os, const Position& pos) { && !pos.can_castle(ANY_CASTLING)) { StateInfo st; + ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize); + Position p; p.set(pos.fen(), pos.is_chess960(), &st, pos.this_thread()); Tablebases::ProbeState s1, s2; @@ -1318,6 +1320,8 @@ bool Position::pos_is_ok() const { assert(0 && "pos_is_ok: Bitboards"); StateInfo si = *st; + ASSERT_ALIGNED(&si, Eval::NNUE::kCacheLineSize); + set_state(&si); if (std::memcmp(&si, st, sizeof(StateInfo))) assert(0 && "pos_is_ok: State"); diff --git a/src/search.cpp b/src/search.cpp index 743449fa..12c32194 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -164,6 +164,8 @@ namespace { uint64_t perft(Position& pos, Depth depth) { StateInfo st; + ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize); + uint64_t cnt, nodes = 0; const bool leaf = (depth == 2); @@ -590,6 +592,8 @@ namespace { Move pv[MAX_PLY+1], capturesSearched[32], quietsSearched[64]; StateInfo st; + ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize); + TTEntry* tte; Key posKey; Move ttMove, move, excludedMove, 
bestMove; @@ -1403,6 +1407,8 @@ moves_loop: // When in check, search starts from here Move pv[MAX_PLY+1]; StateInfo st; + ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize); + TTEntry* tte; Key posKey; Move ttMove, move, bestMove; @@ -1898,6 +1904,8 @@ string UCI::pv(const Position& pos, Depth depth, Value alpha, Value beta) { bool RootMove::extract_ponder_from_tt(Position& pos) { StateInfo st; + ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize); + bool ttHit; assert(pv.size() == 1); diff --git a/src/types.h b/src/types.h index 5873c698..bf692f7e 100644 --- a/src/types.h +++ b/src/types.h @@ -57,6 +57,12 @@ /// _WIN32 Building on Windows (any) /// _WIN64 Building on Windows 64 bit +#if defined(__GNUC__ ) && (__GNUC__ < 9 || (__GNUC__ == 9 && __GNUC_MINOR__ <= 2)) && defined(_WIN32) && !defined(__clang__) +#define ALIGNAS_ON_STACK_VARIABLES_BROKEN +#endif + +#define ASSERT_ALIGNED(ptr, alignment) assert(reinterpret_cast(ptr) % alignment == 0) + #if defined(_WIN64) && defined(_MSC_VER) // No Makefile used # include // Microsoft header for _BitScanForward64() # define IS_64BIT From 04a320666efce725ef66d1a84aaef493a880153d Mon Sep 17 00:00:00 2001 From: Joost VandeVondele Date: Fri, 23 Oct 2020 07:39:35 +0200 Subject: [PATCH 20/27] Change handling the special case of a single legal move. Using no searching time in case of a single legal move is not beneficial from a strength point of view, and this special case can be easily removed: STC: LLR: 2.93 (-2.94,2.94) {-1.25,0.25} Total: 22472 W: 2458 L: 2357 D: 17657 Ptnml(0-2): 106, 1733, 7453, 1842, 102 https://tests.stockfishchess.org/tests/view/5f926cbc81eda81bd78cb6df LTC: LLR: 2.94 (-2.94,2.94) {-0.75,0.25} Total: 37880 W: 1736 L: 1682 D: 34462 Ptnml(0-2): 22, 1392, 16057, 1448, 21 https://tests.stockfishchess.org/tests/view/5f92a26081eda81bd78cb6fe The advantage of using the normal time management for a single legal move is that scores reported for that move are reasonable, not searching leads to artifacts during games (see e.g. https://tcec-chess.com/#div=sf&game=96&season=19) The disadvantage of using normal time management of a single legal move is that thinking times can be unnaturally long, making it 'painful to watch' in online tournaments. This patch uses normal time management, but caps the used time to 500ms. This should lead to reasonable scores, and be hardly perceptible. closes https://github.com/official-stockfish/Stockfish/pull/3195 closes https://github.com/official-stockfish/Stockfish/pull/3183 variant of a patch suggested by SFisGOD No functional change. --- src/search.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/search.cpp b/src/search.cpp index 12c32194..6e37fba1 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -521,10 +521,14 @@ void Thread::search() { } double bestMoveInstability = 1 + 2 * totBestMoveChanges / Threads.size(); - double totalTime = rootMoves.size() == 1 ? 0 : - Time.optimum() * fallingEval * reduction * bestMoveInstability; + double totalTime = Time.optimum() * fallingEval * reduction * bestMoveInstability; - // Stop the search if we have exceeded the totalTime, at least 1ms search + // Cap used time in case of a single legal move for a better viewer experience in tournaments + // yielding correct scores and sufficiently fast moves. 
+ if (rootMoves.size() == 1) + totalTime = std::min(500.0, totalTime); + + // Stop the search if we have exceeded the totalTime if (Time.elapsed() > totalTime) { // If we are allowed to ponder do not stop the search now but From 7fc47eeb6f6b5f3c5ff697e974093ff14413e42c Mon Sep 17 00:00:00 2001 From: FauziAkram Date: Thu, 5 Nov 2020 01:54:53 +0200 Subject: [PATCH 21/27] Introducing King On File this new concept calculates bonuses/penalties for the king when the king is in a semiopen or open file. Passed STC: LLR: 2.95 (-2.94,2.94) {-0.25,1.25} Total: 44904 W: 9365 L: 9028 D: 26511 Ptnml(0-2): 857, 5309, 9841, 5530, 915 https://tests.stockfishchess.org/tests/view/5fa343625d72639a7acef72b Passed LTC: LLR: 2.94 (-2.94,2.94) {0.25,1.25} Total: 60552 W: 8449 L: 8051 D: 44052 Ptnml(0-2): 466, 5772, 17481, 6012, 545 https://tests.stockfishchess.org/tests/view/5fa40e365d72639a7acef79e closes https://github.com/official-stockfish/Stockfish/pull/3219 Bench: 3689484 --- src/pawns.cpp | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/pawns.cpp b/src/pawns.cpp index a5102db8..fde70ba5 100644 --- a/src/pawns.cpp +++ b/src/pawns.cpp @@ -49,10 +49,10 @@ namespace { // Strength of pawn shelter for our king by [distance from edge][rank]. // RANK_1 = 0 is used for files where we have no pawn, or pawn is behind our king. constexpr Value ShelterStrength[int(FILE_NB) / 2][RANK_NB] = { - { V( -6), V( 81), V( 93), V( 58), V( 39), V( 18), V( 25) }, - { V(-43), V( 61), V( 35), V(-49), V(-29), V(-11), V( -63) }, - { V(-10), V( 75), V( 23), V( -2), V( 32), V( 3), V( -45) }, - { V(-39), V(-13), V(-29), V(-52), V(-48), V(-67), V(-166) } + { V( -5), V( 82), V( 92), V( 54), V( 36), V( 22), V( 28) }, + { V(-44), V( 63), V( 33), V(-50), V(-30), V(-12), V( -62) }, + { V(-11), V( 77), V( 22), V( -6), V( 31), V( 8), V( -45) }, + { V(-39), V(-12), V(-29), V(-50), V(-43), V(-68), V(-164) } }; // Danger of enemy pawns moving toward our king by [distance from edge][rank]. @@ -60,12 +60,17 @@ namespace { // is behind our king. Note that UnblockedStorm[0][1-2] accommodate opponent pawn // on edge, likely blocked by our king. constexpr Value UnblockedStorm[int(FILE_NB) / 2][RANK_NB] = { - { V( 85), V(-289), V(-166), V(97), V(50), V( 45), V( 50) }, - { V( 46), V( -25), V( 122), V(45), V(37), V(-10), V( 20) }, - { V( -6), V( 51), V( 168), V(34), V(-2), V(-22), V(-14) }, - { V(-15), V( -11), V( 101), V( 4), V(11), V(-15), V(-29) } + { V( 87), V(-288), V(-168), V( 96), V( 47), V( 44), V( 46) }, + { V( 42), V( -25), V( 120), V( 45), V( 34), V( -9), V( 24) }, + { V( -8), V( 51), V( 167), V( 35), V( -4), V(-16), V(-12) }, + { V(-17), V( -13), V( 100), V( 4), V( 9), V(-16), V(-31) } }; + // KingOnFile[semi-open Us][semi-open Them] contains bonuses/penalties + // for king when the king is on a semi-open or open file. + constexpr Score KingOnFile[2][2] = {{ S(-19,12), S(-6, 7) }, + { S( 0, 2), S( 6,-5) }}; + #undef S #undef V @@ -237,6 +242,9 @@ Score Entry::evaluate_shelter(const Position& pos, Square ksq) const { bonus -= make_score(UnblockedStorm[d][theirRank], 0); } + // King On File + bonus -= KingOnFile[pos.is_on_semiopen_file(Us, ksq)][pos.is_on_semiopen_file(Them, ksq)]; + return bonus; } From ba35c88ab84b959d41a67b3d8fcb40adc6537ec8 Mon Sep 17 00:00:00 2001 From: Tomasz Sobczyk Date: Tue, 3 Nov 2020 22:49:10 +0100 Subject: [PATCH 22/27] AVX-512 for smaller affine and feature transforms. 
For the feature transformer the code is analogical to AVX2 since there was room for easy adaptation of wider simd registers. For the smaller affine transforms that have 32 byte stride we keep 2 columns in one zmm register. We also unroll more aggressively so that in the end we have to do 16 parallel horizontal additions on ymm slices each consisting of 4 32-bit integers. The slices are embedded in 8 zmm registers. These changes provide about 1.5% speedup for AVX-512 builds. Closes https://github.com/official-stockfish/Stockfish/pull/3218 No functional change. --- src/nnue/layers/affine_transform.h | 129 +++++++++++++++++++++++++++- src/nnue/nnue_feature_transformer.h | 27 ++++-- 2 files changed, 148 insertions(+), 8 deletions(-) diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index f0292e45..47c9c488 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -83,7 +83,21 @@ namespace Eval::NNUE::Layers { return _mm512_reduce_add_epi32(sum) + bias; }; - [[maybe_unused]] auto m512_haddx4 = [](__m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3, __m128i bias) -> __m128i { + // This function takes + // sum0 = [xmm0a, xmm0b, xmm0c, xmm0d] + // sum1 = [xmm1a, xmm1b, xmm1c, xmm1d] + // sum2 = [xmm2a, xmm2b, xmm2c, xmm2d] + // sum3 = [xmm3a, xmm3b, xmm3c, xmm3d] + // and returns + // ret = [ + // reduce_add_epi32(xmm0a), reduce_add_epi32(xmm1a), reduce_add_epi32(xmm2a), reduce_add_epi32(xmm3a), + // reduce_add_epi32(xmm0b), reduce_add_epi32(xmm1b), reduce_add_epi32(xmm2b), reduce_add_epi32(xmm3b), + // reduce_add_epi32(xmm0c), reduce_add_epi32(xmm1c), reduce_add_epi32(xmm2c), reduce_add_epi32(xmm3c), + // reduce_add_epi32(xmm0d), reduce_add_epi32(xmm1d), reduce_add_epi32(xmm2d), reduce_add_epi32(xmm3d) + // ] + [[maybe_unused]] auto m512_hadd128x16_interleave = []( + __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3) -> __m512i { + __m512i sum01a = _mm512_unpacklo_epi32(sum0, sum1); __m512i sum01b = _mm512_unpackhi_epi32(sum0, sum1); @@ -96,7 +110,13 @@ namespace Eval::NNUE::Layers { __m512i sum0123a = _mm512_unpacklo_epi64(sum01, sum23); __m512i sum0123b = _mm512_unpackhi_epi64(sum01, sum23); - __m512i sum = _mm512_add_epi32(sum0123a, sum0123b); + return _mm512_add_epi32(sum0123a, sum0123b); + }; + + [[maybe_unused]] auto m512_haddx4 = [m512_hadd128x16_interleave]( + __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3, __m128i bias) -> __m128i { + + __m512i sum = m512_hadd128x16_interleave(sum0, sum1, sum2, sum3); __m256i sum256lo = _mm512_castsi512_si256(sum); __m256i sum256hi = _mm512_extracti64x4_epi64(sum, 1); @@ -109,6 +129,58 @@ namespace Eval::NNUE::Layers { return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias); }; + [[maybe_unused]] auto m512_haddx8 = [m512_hadd128x16_interleave]( + __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3, + __m512i sum4, __m512i sum5, __m512i sum6, __m512i sum7, __m256i bias) -> __m256i { + + __m512i suma = m512_hadd128x16_interleave(sum0, sum1, sum2, sum3); + __m512i sumb = m512_hadd128x16_interleave(sum4, sum5, sum6, sum7); + + __m512i indices0 = _mm512_setr_epi64(0, 1, 8, 9, 4, 5, 12, 13); + __m512i indices1 = _mm512_setr_epi64(2, 3, 10, 11, 6, 7, 14, 15); + __m512i x = _mm512_add_epi32( + _mm512_permutex2var_epi64(suma, indices0, sumb), + _mm512_permutex2var_epi64(suma, indices1, sumb)); + + __m256i sum256lo = _mm512_castsi512_si256(x); + __m256i sum256hi = _mm512_extracti64x4_epi64(x, 1); + + return _mm256_add_epi32(_mm256_add_epi32(sum256lo, sum256hi), bias); + 
}; + + [[maybe_unused]] auto m512_hadd256x8 =[m512_hadd128x16_interleave]( + __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3, __m256i bias) -> __m256i { + + __m512i sum = m512_hadd128x16_interleave(sum0, sum1, sum2, sum3); + + __m512i indices = _mm512_setr_epi32( + 0, 4, 8, 12, 2, 6, 10, 14, + 1, 5, 9, 13, 3, 7, 11, 15); + sum = _mm512_permutexvar_epi32(indices, sum); + + __m256i sum256lo = _mm512_castsi512_si256(sum); + __m256i sum256hi = _mm512_extracti64x4_epi64(sum, 1); + + return _mm256_add_epi32(_mm256_hadd_epi32(sum256lo, sum256hi), bias); + }; + + [[maybe_unused]] auto m512_hadd256x16 = [m512_hadd128x16_interleave]( + __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3, + __m512i sum4, __m512i sum5, __m512i sum6, __m512i sum7, __m512i bias) -> __m512i { + + __m512i suma = m512_hadd128x16_interleave(sum0, sum1, sum2, sum3); + __m512i sumb = m512_hadd128x16_interleave(sum4, sum5, sum6, sum7); + + __m512i indices0 = _mm512_setr_epi64(0, 1, 8, 9, 4, 5, 12, 13); + __m512i indices1 = _mm512_setr_epi64(2, 3, 10, 11, 6, 7, 14, 15); + __m512i x = _mm512_add_epi32( + _mm512_permutex2var_epi64(suma, indices0, sumb), + _mm512_permutex2var_epi64(suma, indices1, sumb)); + + __m512i indices = _mm512_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15); + return _mm512_add_epi32(_mm512_permutexvar_epi32(indices, x), bias); + }; + [[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) { #if defined (USE_VNNI) acc = _mm512_dpbusd_epi32(acc, a, b); @@ -205,7 +277,58 @@ namespace Eval::NNUE::Layers { // kOutputDimensions is either 1 or a multiple of kSimdWidth // because then it is also an input dimension. - if constexpr (kOutputDimensions % 4 == 0) + if constexpr (kOutputDimensions % 16 == 0 && kNumChunks256 == 1) + { + for (IndexType i = 0; i < kOutputDimensions; i += 16) + { + const IndexType offset01a = (i + 0) * kPaddedInputDimensions; + const IndexType offset23a = (i + 2) * kPaddedInputDimensions; + const IndexType offset45a = (i + 4) * kPaddedInputDimensions; + const IndexType offset67a = (i + 6) * kPaddedInputDimensions; + const IndexType offset01b = (i + 8) * kPaddedInputDimensions; + const IndexType offset23b = (i + 10) * kPaddedInputDimensions; + const IndexType offset45b = (i + 12) * kPaddedInputDimensions; + const IndexType offset67b = (i + 14) * kPaddedInputDimensions; + + const __m512i bias = *reinterpret_cast(&biases_[i]); + __m512i* outptr = reinterpret_cast<__m512i*>(&output[i]); + + __m512i sum01a = _mm512_setzero_si512(); + __m512i sum23a = _mm512_setzero_si512(); + __m512i sum45a = _mm512_setzero_si512(); + __m512i sum67a = _mm512_setzero_si512(); + __m512i sum01b = _mm512_setzero_si512(); + __m512i sum23b = _mm512_setzero_si512(); + __m512i sum45b = _mm512_setzero_si512(); + __m512i sum67b = _mm512_setzero_si512(); + + const auto row01a = *reinterpret_cast(&weights_[offset01a]); + const auto row23a = *reinterpret_cast(&weights_[offset23a]); + const auto row45a = *reinterpret_cast(&weights_[offset45a]); + const auto row67a = *reinterpret_cast(&weights_[offset67a]); + const auto row01b = *reinterpret_cast(&weights_[offset01b]); + const auto row23b = *reinterpret_cast(&weights_[offset23b]); + const auto row45b = *reinterpret_cast(&weights_[offset45b]); + const auto row67b = *reinterpret_cast(&weights_[offset67b]); + + const __m256i in256 = input_vector256[0]; + const __m512i in = _mm512_inserti64x4(_mm512_castsi256_si512(in256), in256, 1); + + m512_add_dpbusd_epi32(sum01a, in, row01a); + m512_add_dpbusd_epi32(sum23a, in, row23a); 
+ m512_add_dpbusd_epi32(sum45a, in, row45a); + m512_add_dpbusd_epi32(sum67a, in, row67a); + m512_add_dpbusd_epi32(sum01b, in, row01b); + m512_add_dpbusd_epi32(sum23b, in, row23b); + m512_add_dpbusd_epi32(sum45b, in, row45b); + m512_add_dpbusd_epi32(sum67b, in, row67b); + + *outptr = m512_hadd256x16( + sum01a, sum23a, sum45a, sum67a, + sum01b, sum23b, sum45b, sum67b, bias); + } + } + else if constexpr (kOutputDimensions % 4 == 0) { for (IndexType i = 0; i < kOutputDimensions; i += 4) { diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index c3f012e4..f49777b5 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -127,7 +127,13 @@ namespace Eval::NNUE { const auto& accumulation = pos.state()->accumulator.accumulation; - #if defined(USE_AVX2) + #if defined(USE_AVX512) + constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth * 2); + static_assert(kHalfDimensions % (kSimdWidth * 2) == 0); + const __m512i kControl = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); + const __m512i kZero = _mm512_setzero_si512(); + + #elif defined(USE_AVX2) constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth; constexpr int kControl = 0b11011000; const __m256i kZero = _mm256_setzero_si256(); @@ -154,13 +160,24 @@ namespace Eval::NNUE { for (IndexType p = 0; p < 2; ++p) { const IndexType offset = kHalfDimensions * p; - #if defined(USE_AVX2) + #if defined(USE_AVX512) + auto out = reinterpret_cast<__m512i*>(&output[offset]); + for (IndexType j = 0; j < kNumChunks; ++j) { + __m512i sum0 = _mm512_load_si512( + &reinterpret_cast(accumulation[perspectives[p]][0])[j * 2 + 0]); + __m512i sum1 = _mm512_load_si512( + &reinterpret_cast(accumulation[perspectives[p]][0])[j * 2 + 1]); + _mm512_store_si512(&out[j], _mm512_permutexvar_epi64(kControl, + _mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), kZero))); + } + + #elif defined(USE_AVX2) auto out = reinterpret_cast<__m256i*>(&output[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { __m256i sum0 = _mm256_load_si256( &reinterpret_cast(accumulation[perspectives[p]][0])[j * 2 + 0]); __m256i sum1 = _mm256_load_si256( - &reinterpret_cast(accumulation[perspectives[p]][0])[j * 2 + 1]); + &reinterpret_cast(accumulation[perspectives[p]][0])[j * 2 + 1]); _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8( _mm256_packs_epi16(sum0, sum1), kZero), kControl)); } @@ -177,9 +194,9 @@ namespace Eval::NNUE { _mm_store_si128(&out[j], #ifdef USE_SSE41 - _mm_max_epi8(packedbytes, kZero) + _mm_max_epi8(packedbytes, kZero) #else - _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s) + _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s) #endif ); From 32edb1d009e09a9442cb7393920e072ffd08005d Mon Sep 17 00:00:00 2001 From: SFisGOD Date: Sat, 7 Nov 2020 08:50:02 +0800 Subject: [PATCH 23/27] Update default net to nn-c3ca321c51c9.nnue Optimization of the net biases of the 32 x 32 layer and the output layer. 
Tuning of 32 x 32 layer (200k games, 5 seconds TC) https://tests.stockfishchess.org/tests/view/5f9aaf266a2c112b60691c68 STC: LLR: 2.95 (-2.94,2.94) {-0.25,1.25} Total: 41848 W: 4665 L: 4461 D: 32722 Ptnml(0-2): 239, 3308, 13659, 3446, 272 https://tests.stockfishchess.org/tests/view/5fa5ef5a936c54e11ec9954f LTC: LLR: 2.94 (-2.94,2.94) {0.25,1.25} Total: 88008 W: 4045 L: 3768 D: 80195 Ptnml(0-2): 69, 3339, 36908, 3622, 66 https://tests.stockfishchess.org/tests/view/5fa62a78936c54e11ec99577 closes https://github.com/official-stockfish/Stockfish/pull/3220 Bench: 3649288 --- src/evaluate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/evaluate.h b/src/evaluate.h index 6bec27db..06c66f71 100644 --- a/src/evaluate.h +++ b/src/evaluate.h @@ -36,7 +36,7 @@ namespace Eval { // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue // for the build process (profile-build and fishtest) to work. Do not change the // name of the macro, as it is used in the Makefile. - #define EvalFileDefaultName "nn-cb26f10b1fd9.nnue" + #define EvalFileDefaultName "nn-c3ca321c51c9.nnue" namespace NNUE { From 392b529c3f52103ad47ad096b86103c17758cb4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ste=CC=81phane=20Nicolet?= Date: Fri, 6 Nov 2020 19:20:27 +0100 Subject: [PATCH 24/27] Qsearch pruning: follow-up This is a follow-up of the recent qsearch pruning patch in https://github.com/official-stockfish/Stockfish/commit/a260c9a8a24a2630a900efc3821000c3481b0c5d We now use the same guard condition (testing that we already have a defense with a score better score than a TB loss) for all pruning heuristics in qsearch(). This allows some pruning when in check, but in a controlled way to ensure that no wrong mate scores appear. Tested with Elo-gaining bounds: STC: LLR: 2.97 (-2.94,2.94) {-0.25,1.25} Total: 22632 W: 2433 L: 2264 D: 17935 Ptnml(0-2): 98, 1744, 7487, 1865, 122 https://tests.stockfishchess.org/tests/view/5fa59405936c54e11ec99515 LTC: LLR: 2.94 (-2.94,2.94) {0.25,1.25} Total: 105432 W: 4965 L: 4648 D: 95819 Ptnml(0-2): 85, 4110, 44011, 4423, 87 https://tests.stockfishchess.org/tests/view/5fa5b609936c54e11ec9952a closes https://github.com/official-stockfish/Stockfish/pull/3221 Bench: 3578092 --- src/search.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/search.cpp b/src/search.cpp index 6e37fba1..b5b93bf0 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -1525,7 +1525,7 @@ moves_loop: // When in check, search starts from here moveCount++; // Futility pruning - if ( !ss->inCheck + if ( bestValue > VALUE_TB_LOSS_IN_MAX_PLY && !givesCheck && futilityBase > -VALUE_KNOWN_WIN && !pos.advanced_pawn_push(move)) @@ -1552,7 +1552,7 @@ moves_loop: // When in check, search starts from here } // Do not search moves with negative SEE values - if ( !ss->inCheck + if ( bestValue > VALUE_TB_LOSS_IN_MAX_PLY && !(givesCheck && pos.is_discovery_check_on_king(~pos.side_to_move(), move)) && !pos.see_ge(move)) continue; From b5781150ea8557e2030f8bc8b4eadede0ecec6bd Mon Sep 17 00:00:00 2001 From: lonfom169 <50217346+lonfom169@users.noreply.github.com> Date: Sun, 8 Nov 2020 23:43:32 -0300 Subject: [PATCH 25/27] Increase reduction based on the number of best move changes. Thanks to Vizvezdenec for the PvNode idea and also to vondele the !PvNode idea. 
Passed STC: LLR: 2.94 (-2.94,2.94) {-0.25,1.25} Total: 19120 W: 1998 L: 1839 D: 15283 Ptnml(0-2): 76, 1445, 6375, 1572, 92 https://tests.stockfishchess.org/tests/view/5fa8af3e67cbf42301d6a6c9 Passed LTC: LLR: 2.94 (-2.94,2.94) {0.25,1.25} Total: 75584 W: 3454 L: 3205 D: 68925 Ptnml(0-2): 54, 2832, 31771, 3081, 54 closes https://github.com/official-stockfish/Stockfish/pull/3224 Bench: 3595418 --- AUTHORS | 1 + src/search.cpp | 3 +++ 2 files changed, 4 insertions(+) diff --git a/AUTHORS b/AUTHORS index 198dfa5a..f0356090 100644 --- a/AUTHORS +++ b/AUTHORS @@ -19,6 +19,7 @@ Alain Savard (Rocky640) Alayan Feh (Alayan-stk-2) Alexander Kure Alexander Pagel (Lolligerhans) +Alfredo Menezes (lonfom169) Ali AlZhrani (Cooffe) Andrew Grant (AndyGrant) Andrey Neporada (nepal) diff --git a/src/search.cpp b/src/search.cpp index b5b93bf0..56b56733 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -1169,6 +1169,9 @@ moves_loop: // When in check, search starts from here if (ss->ttPv) r -= 2; + if (!PvNode && depth > 10 && thisThread->bestMoveChanges <= 2) + r++; + if (moveCountPruning && !formerPv) r++; From 285bf7041ad214156188823eb9118e6af7f4b2e4 Mon Sep 17 00:00:00 2001 From: SFisGOD Date: Tue, 10 Nov 2020 18:28:43 +0100 Subject: [PATCH 26/27] Increase reduction at root when the best move does not change frequently STC: LLR: 2.94 (-2.94,2.94) {-0.25,1.25} Total: 51320 W: 5159 L: 4956 D: 41205 Ptnml(0-2): 215, 3897, 17242, 4082, 224 https://tests.stockfishchess.org/tests/view/5faa072367cbf42301d6a767 LTC: LLR: 2.98 (-2.94,2.94) {0.25,1.25} Total: 15952 W: 762 L: 642 D: 14548 Ptnml(0-2): 8, 561, 6725, 667, 15 https://tests.stockfishchess.org/tests/view/5faa4c3567cbf42301d6a794 closes https://github.com/official-stockfish/Stockfish/pull/3225 Bench: 3954692 --- AUTHORS | 2 +- src/search.cpp | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/AUTHORS b/AUTHORS index f0356090..f30be4de 100644 --- a/AUTHORS +++ b/AUTHORS @@ -86,7 +86,7 @@ Jekaa Jerry Donald Watson (jerrydonaldwatson) jjoshua2 Jonathan Calovski (Mysseno) -Jonathan Dumale (SFisGOD) +Jonathan Buladas Dumale (SFisGOD) Joost VandeVondele (vondele) Jörg Oster (joergoster) Joseph Ellis (jhellis3) diff --git a/src/search.cpp b/src/search.cpp index 56b56733..66ef5043 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -1161,7 +1161,7 @@ moves_loop: // When in check, search starts from here if (thisThread->ttHitAverage > 509 * TtHitAverageResolution * TtHitAverageWindow / 1024) r--; - // Reduction if other threads are searching this position + // Increase reduction if other threads are searching this position if (th.marked()) r++; @@ -1169,7 +1169,8 @@ moves_loop: // When in check, search starts from here if (ss->ttPv) r -= 2; - if (!PvNode && depth > 10 && thisThread->bestMoveChanges <= 2) + // Increase reduction at root and non-PV nodes when the best move does not change frequently + if ((rootNode || !PvNode) && depth > 10 && thisThread->bestMoveChanges <= 2) r++; if (moveCountPruning && !formerPv) From f9595828eb7e5e970b0be3ee5f84ddd726845523 Mon Sep 17 00:00:00 2001 From: FauziAkram Date: Wed, 11 Nov 2020 20:56:29 +0200 Subject: [PATCH 27/27] Rook Mobility Tweak Passed STC: LLR: 2.94 (-2.94,2.94) {-0.25,1.25} Total: 171152 W: 34715 L: 34202 D: 102235 Ptnml(0-2): 3278, 20155, 38228, 20606, 3309 https://tests.stockfishchess.org/tests/view/5fa861f467cbf42301d6a68e Passed LTC: LLR: 2.94 (-2.94,2.94) {0.25,1.25} Total: 149616 W: 20471 L: 19882 D: 109263 Ptnml(0-2): 1172, 14434, 43102, 14833, 1267 
https://tests.stockfishchess.org/tests/view/5fa9c8ff67cbf42301d6a74f closes https://github.com/official-stockfish/Stockfish/pull/3226 Bench: 3597730 --- src/evaluate.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/evaluate.cpp b/src/evaluate.cpp index 4ade46fa..34ebe6c3 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -212,9 +212,9 @@ namespace { { S(-47,-59), S(-20,-25), S( 14, -8), S( 29, 12), S( 39, 21), S( 53, 40), // Bishop S( 53, 56), S( 60, 58), S( 62, 65), S( 69, 72), S( 78, 78), S( 83, 87), S( 91, 88), S( 96, 98) }, - { S(-61,-82), S(-20,-17), S( 2, 23) ,S( 3, 40), S( 4, 72), S( 11,100), // Rook - S( 22,104), S( 31,120), S( 39,134), S(40 ,138), S( 41,158), S( 47,163), - S( 59,168), S( 60,169), S( 64,173) }, + { S(-60,-82), S(-24,-15), S( 0, 17) ,S( 3, 43), S( 4, 72), S( 14,100), // Rook + S( 20,102), S( 30,122), S( 41,133), S(41 ,139), S( 41,153), S( 45,160), + S( 57,165), S( 58,170), S( 67,175) }, { S(-29,-49), S(-16,-29), S( -8, -8), S( -8, 17), S( 18, 39), S( 25, 54), // Queen S( 23, 59), S( 37, 73), S( 41, 76), S( 54, 95), S( 65, 95) ,S( 68,101), S( 69,124), S( 70,128), S( 70,132), S( 70,133) ,S( 71,136), S( 72,140),