From 9382f854b3a67c5a970ad3342a3c12454974eccd Mon Sep 17 00:00:00 2001 From: Joost VandeVondele Date: Wed, 30 Sep 2020 21:22:36 +0200 Subject: [PATCH 01/27] Schedule threads fairly under valgrind fixes a rare case that can cause CI to fail when running multithreaded under valgrind. closes https://github.com/official-stockfish/Stockfish/pull/3165 No functional change. --- tests/instrumented.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/instrumented.sh b/tests/instrumented.sh index 03ded74a..03e9c9de 100755 --- a/tests/instrumented.sh +++ b/tests/instrumented.sh @@ -20,7 +20,7 @@ case $1 in --valgrind-thread) echo "valgrind-thread testing started" prefix='' - exeprefix='valgrind --error-exitcode=42' + exeprefix='valgrind --fair-sched=try --error-exitcode=42' postfix='1>/dev/null' threads="2" ;; From 17fb3a8ce0ccd2532f667fe685c4189d0bfe3b5b Mon Sep 17 00:00:00 2001 From: Unai Corzo Date: Fri, 2 Oct 2020 22:00:55 +0200 Subject: [PATCH 02/27] Simplify away futility pruning for captures Remove futility pruning for captures. STC https://tests.stockfishchess.org/tests/view/5f749bfed930428c36d34c56 LLR: 2.94 (-2.94,2.94) {-1.25,0.25} Total: 38064 W: 4011 L: 3929 D: 30124 Ptnml(0-2): 192, 3004, 12567, 3068, 201 LTC https://tests.stockfishchess.org/tests/view/5f74d99bf18675b1ce2f7412 LLR: 2.94 (-2.94,2.94) {-0.75,0.25} Total: 184984 W: 8567 L: 8610 D: 167807 Ptnml(0-2): 146, 7593, 77058, 7548, 147 closes https://github.com/official-stockfish/Stockfish/pull/3166 bench: 3890648 --- src/search.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/search.cpp b/src/search.cpp index c7343ce8..eaa79fb9 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -1049,15 +1049,6 @@ moves_loop: // When in check, search starts from here && captureHistory[movedPiece][to_sq(move)][type_of(pos.piece_on(to_sq(move)))] < 0) continue; - // Futility pruning for captures - if ( !givesCheck - && lmrDepth < 6 - && !(PvNode && abs(bestValue) < 2) - && !ss->inCheck - && ss->staticEval + 169 + 244 * lmrDepth - + PieceValue[MG][type_of(pos.piece_on(to_sq(move)))] <= alpha) - continue; - // See based pruning if (!pos.see_ge(move, Value(-221) * depth)) // (~25 Elo) continue; From 767b4f4fbe5ab2e63aceabd9005f4e1eb7cbcb51 Mon Sep 17 00:00:00 2001 From: FauziAkram Date: Fri, 2 Oct 2020 15:32:19 +0300 Subject: [PATCH 03/27] Pawn Tuning Tuning of pawns, for classical evaluation: Passed STC: https://tests.stockfishchess.org/tests/view/5f771f0e52560f5fc78559ec LLR: 2.96 (-2.94,2.94) {-0.25,1.25} Total: 252696 W: 50321 L: 49692 D: 152683 Ptnml(0-2): 4614, 29845, 57049, 29978, 4862 Passed LTC: https://tests.stockfishchess.org/tests/view/5f77cfef090dcf9aaa16d38b LLR: 2.94 (-2.94,2.94) {0.25,1.25} Total: 48184 W: 6556 L: 6193 D: 35435 Ptnml(0-2): 335, 4516, 14100, 4733, 408 closes https://github.com/official-stockfish/Stockfish/pull/3169 bench: 4016121 --- src/pawns.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/pawns.cpp b/src/pawns.cpp index af0f6618..a5102db8 100644 --- a/src/pawns.cpp +++ b/src/pawns.cpp @@ -30,21 +30,21 @@ namespace { #define S(mg, eg) make_score(mg, eg) // Pawn penalties - constexpr Score Backward = S( 8, 27); - constexpr Score Doubled = S(11, 55); - constexpr Score Isolated = S( 5, 17); - constexpr Score WeakLever = S( 2, 54); - constexpr Score WeakUnopposed = S(15, 25); + constexpr Score Backward = S( 8, 25); + constexpr Score Doubled = S(10, 55); + constexpr Score Isolated = S( 3, 15); + constexpr Score WeakLever = S( 3, 55); + 
constexpr Score WeakUnopposed = S(13, 25); // Bonus for blocked pawns at 5th or 6th rank - constexpr Score BlockedPawn[2] = { S(-13, -4), S(-4, 3) }; + constexpr Score BlockedPawn[2] = { S(-13, -4), S(-5, 2) }; constexpr Score BlockedStorm[RANK_NB] = { S(0, 0), S(0, 0), S(76, 78), S(-10, 15), S(-7, 10), S(-4, 6), S(-1, 2) }; // Connected pawn bonus - constexpr int Connected[RANK_NB] = { 0, 7, 8, 11, 24, 45, 85 }; + constexpr int Connected[RANK_NB] = { 0, 5, 7, 11, 24, 48, 86 }; // Strength of pawn shelter for our king by [distance from edge][rank]. // RANK_1 = 0 is used for files where we have no pawn, or pawn is behind our king. @@ -147,7 +147,7 @@ namespace { if (support | phalanx) { int v = Connected[r] * (2 + bool(phalanx) - bool(opposed)) - + 21 * popcount(support); + + 22 * popcount(support); score += make_score(v, v * (r - 2) / 4); } From ba73f8ce0d545a0f627b5bc8ba274ae9c85918f3 Mon Sep 17 00:00:00 2001 From: Joost VandeVondele Date: Wed, 14 Oct 2020 10:23:30 +0200 Subject: [PATCH 04/27] Update default net to nn-04cf2b4ed1da.nnue Further tune the net parameters, now the last but one layer (32x32). To limit the number of parameters optimized, the network layer was decomposed using SVD, and the singular values were treated as parameters and tuned. Tuning branch: https://github.com/vondele/Stockfish/tree/svdTune Tuner: https://github.com/vondele/nevergrad4sf passed STC: https://tests.stockfishchess.org/tests/view/5f83e82f8ea73fb8ddf83e4e LLR: 2.94 (-2.94,2.94) {-0.25,1.25} Total: 8488 W: 944 L: 795 D: 6749 Ptnml(0-2): 39, 609, 2811, 734, 51 passed LTC: https://tests.stockfishchess.org/tests/view/5f83f4118ea73fb8ddf83e66 LLR: 2.94 (-2.94,2.94) {0.25,1.25} Total: 169016 W: 8043 L: 7589 D: 153384 Ptnml(0-2): 133, 6623, 70538, 7085, 129 closes https://github.com/official-stockfish/Stockfish/pull/3181 Bench: 3945198 --- src/evaluate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/evaluate.h b/src/evaluate.h index 4b57a050..6a17f284 100644 --- a/src/evaluate.h +++ b/src/evaluate.h @@ -36,7 +36,7 @@ namespace Eval { // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue // for the build process (profile-build and fishtest) to work. Do not change the // name of the macro, as it is used in the Makefile. - #define EvalFileDefaultName "nn-baeb9ef2d183.nnue" + #define EvalFileDefaultName "nn-04cf2b4ed1da.nnue" namespace NNUE { From 4a5cc1365f48f7fff08d3184cadac7a0a75dda6d Mon Sep 17 00:00:00 2001 From: FauziAkram Date: Tue, 6 Oct 2020 22:43:48 +0300 Subject: [PATCH 05/27] RookOnQueenFile Removal Removing Rook On Queen File looks beneficial, and it might even bring some ELO. I will try to reintroduce it with a different method later on. 
Passed STC: https://tests.stockfishchess.org/tests/view/5f7cea204389873867eb10cb LLR: 2.94 (-2.94,2.94) {-1.25,0.25} Total: 18624 W: 3800 L: 3568 D: 11256 Ptnml(0-2): 308, 2131, 4257, 2253, 363 Passed LTC: https://tests.stockfishchess.org/tests/view/5f7d76a4e936c6892bf50598 LLR: 2.95 (-2.94,2.94) {-0.75,0.25} Total: 117864 W: 15515 L: 15340 D: 87009 Ptnml(0-2): 926, 11127, 34671, 11262, 946 closes https://github.com/official-stockfish/Stockfish/pull/3176 Bench: 3756191 --- src/evaluate.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/evaluate.cpp b/src/evaluate.cpp index 25e3bdc1..c68577a3 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -265,7 +265,6 @@ namespace { constexpr Score ReachableOutpost = S( 31, 22); constexpr Score RestrictedPiece = S( 7, 7); constexpr Score RookOnKingRing = S( 16, 0); - constexpr Score RookOnQueenFile = S( 6, 11); constexpr Score SliderOnQueen = S( 60, 18); constexpr Score ThreatByKing = S( 24, 89); constexpr Score ThreatByPawnPush = S( 48, 39); @@ -481,10 +480,6 @@ namespace { if (Pt == ROOK) { - // Bonus for rook on the same file as a queen - if (file_bb(s) & pos.pieces(QUEEN)) - score += RookOnQueenFile; - // Bonus for rook on an open or semi-open file if (pos.is_on_semiopen_file(Us, s)) score += RookOnFile[pos.is_on_semiopen_file(Them, s)]; From 288a604411fa72b06b30f16194cd03592b28f6f2 Mon Sep 17 00:00:00 2001 From: Unai Corzo Date: Mon, 12 Oct 2020 09:03:49 +0200 Subject: [PATCH 06/27] Scale factor tweak Add !pawnsOnBothFlanks heuristic to scale factor. STC https://tests.stockfishchess.org/tests/view/5f8080575b3847b5d41f9134 LLR: 2.94 (-2.94,2.94) {-0.25,1.25} Total: 250960 W: 49779 L: 49168 D: 152013 Ptnml(0-2): 4224, 28822, 58802, 29383, 4249 LTC https://tests.stockfishchess.org/tests/view/5f832f498ea73fb8ddf83ddb LLR: 2.95 (-2.94,2.94) {0.25,1.25} Total: 88584 W: 11827 L: 11388 D: 65369 Ptnml(0-2): 585, 8079, 26578, 8412, 638 closes https://github.com/official-stockfish/Stockfish/pull/3179 bench: 3834252 --- src/evaluate.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/evaluate.cpp b/src/evaluate.cpp index c68577a3..425ba6f8 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -905,7 +905,9 @@ namespace { sf = 37 + 3 * (pos.count(WHITE) == 1 ? 
pos.count(BLACK) + pos.count(BLACK) : pos.count(WHITE) + pos.count(WHITE)); else - sf = std::min(sf, 36 + 7 * pos.count(strongSide)); + sf = std::min(sf, 36 + 7 * pos.count(strongSide)) - 4 * !pawnsOnBothFlanks; + + sf -= 4 * !pawnsOnBothFlanks; } // Interpolate between the middlegame and (scaled by 'sf') endgame score From 281d520cc2bb0123efd230fce45119b57f0bae0d Mon Sep 17 00:00:00 2001 From: mstembera Date: Sun, 18 Oct 2020 04:23:28 -0700 Subject: [PATCH 07/27] Update default net to nn-eba324f53044.nnue The new net is based on the previous net 04cf2b4ed1da but with the biases for the 1st hidden layer tuned with SPSA; see the SPSA session on fishtest here: https://tests.stockfishchess.org/tests/view/5f875213dcdad978fe8c5211 Thanks to @vondele for writing out the net; see the discussion in this thread: https://github.com/mstembera/Stockfish/commit/432da86721647dff1d9426a7cdcfd2dbada8155e Passed STC: LLR: 2.94 (-2.94,2.94) {-0.25,1.25} Total: 15000 W: 1640 L: 1483 D: 11877 Ptnml(0-2): 50, 1183, 4908, 1278, 81 https://tests.stockfishchess.org/tests/view/5f8955e20fea1a44ec4f0a5d Passed LTC: LLR: 2.96 (-2.94,2.94) {0.25,1.25} Total: 81272 W: 3948 L: 3682 D: 73642 Ptnml(0-2): 64, 3194, 33856, 3456, 66 https://tests.stockfishchess.org/tests/view/5f89e8efeae8a6e60644d6e7 closes https://github.com/official-stockfish/Stockfish/pull/3187 Bench: 3762411 --- src/evaluate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/evaluate.h b/src/evaluate.h index 6a17f284..6a8603ad 100644 --- a/src/evaluate.h +++ b/src/evaluate.h @@ -36,7 +36,7 @@ namespace Eval { // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue // for the build process (profile-build and fishtest) to work. Do not change the // name of the macro, as it is used in the Makefile. - #define EvalFileDefaultName "nn-04cf2b4ed1da.nnue" + #define EvalFileDefaultName "nn-eba324f53044.nnue" namespace NNUE { From 560c776397483feaaa0deb5b666f46ff3f5b655f Mon Sep 17 00:00:00 2001 From: Vizvezdenec Date: Sat, 17 Oct 2020 13:40:10 +0200 Subject: [PATCH 08/27] Do more reductions for late quiet moves in case of consecutive fail highs. The idea of this patch is the following: when we have had consecutive fail highs and we reach a late enough move at the root node, the probability that one of the remaining quiet moves can produce an even bigger value than the moves that produced the previous cutoffs (moves that should be high in the move ordering, yet now fail to produce a beta cutoff because we have reached a high move count) should be quite small, so we can reduce such moves more. passed STC LLR: 2.94 (-2.94,2.94) {-0.25,1.25} Total: 53392 W: 5681 L: 5474 D: 42237 Ptnml(0-2): 214, 4104, 17894, 4229, 255 https://tests.stockfishchess.org/tests/view/5f88501adcdad978fe8c527e passed LTC LLR: 2.94 (-2.94,2.94) {0.25,1.25} Total: 59136 W: 2773 L: 2564 D: 53799 Ptnml(0-2): 30, 2117, 25078, 2300, 43 https://tests.stockfishchess.org/tests/view/5f884dbfdcdad978fe8c527a closes https://github.com/official-stockfish/Stockfish/pull/3184 Bench: 4066972 --- src/search.cpp | 5 ++++- src/thread.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/search.cpp b/src/search.cpp index eaa79fb9..ab58ca64 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -417,7 +417,7 @@ void Thread::search() { // Start with a small aspiration window and, in the case of a fail // high/low, re-search with a bigger window until we don't fail // high/low anymore.
- int failedHighCnt = 0; + failedHighCnt = 0; while (true) { Depth adjustedDepth = std::max(1, rootDepth - failedHighCnt - searchAgainCounter); @@ -1177,6 +1177,9 @@ moves_loop: // When in check, search starts from here if (ttCapture) r++; + // Increase reduction at root if failing high + r += rootNode ? thisThread->failedHighCnt * thisThread->failedHighCnt * moveCount / 512 : 0; + // Increase reduction for cut nodes (~10 Elo) if (cutNode) r += 2; diff --git a/src/thread.h b/src/thread.h index 34b99015..6a73423b 100644 --- a/src/thread.h +++ b/src/thread.h @@ -73,6 +73,7 @@ public: CapturePieceToHistory captureHistory; ContinuationHistory continuationHistory[2][2]; Score contempt; + int failedHighCnt; }; From f5dfad5d72e164b57b787c0224046d641b3ade84 Mon Sep 17 00:00:00 2001 From: xoto10 Date: Wed, 21 Oct 2020 14:52:13 +0100 Subject: [PATCH 09/27] Reduce big time spikes by reducing PV re-searches. Save time by reducing PV re-searches above original depth. Instead use 5% extra time on every move. STC 10+0.1 th 1 : LLR: 2.93 (-2.94,2.94) {-0.25,1.25} Total: 90688 W: 9702 L: 9436 D: 71550 Ptnml(0-2): 408, 7252, 29792, 7450, 442 https://tests.stockfishchess.org/tests/view/5f8df807bacb75a4f9a47223 LTC 60+0.6 th 1 : LLR: 2.97 (-2.94,2.94) {0.25,1.25} Total: 97856 W: 4602 L: 4303 D: 88951 Ptnml(0-2): 53, 3757, 41057, 3960, 101 https://tests.stockfishchess.org/tests/view/5f8ec4872c92c7fe3a8c602d closes https://github.com/official-stockfish/Stockfish/pull/3192 Bench 3943959 --- src/search.cpp | 4 +++- src/timeman.cpp | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/search.cpp b/src/search.cpp index ab58ca64..65ed9b73 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -565,6 +565,7 @@ namespace { constexpr bool PvNode = NT == PV; const bool rootNode = PvNode && ss->ply == 0; + const Depth maxNextDepth = rootNode ? depth : depth + 1; // Check if we have an upcoming move which draws by repetition, or // if the opponent had an alternative move earlier to this position. @@ -1259,7 +1260,8 @@ moves_loop: // When in check, search starts from here (ss+1)->pv = pv; (ss+1)->pv[0] = MOVE_NONE; - value = -search(pos, ss+1, -beta, -alpha, newDepth, false); + value = -search(pos, ss+1, -beta, -alpha, + std::min(maxNextDepth, newDepth), false); } // Step 18. Undo move diff --git a/src/timeman.cpp b/src/timeman.cpp index 6d9c95ef..da08f12d 100644 --- a/src/timeman.cpp +++ b/src/timeman.cpp @@ -75,7 +75,7 @@ void TimeManagement::init(Search::LimitsType& limits, Color us, int ply) { // game time for the current move, so also cap to 20% of available game time. if (limits.movestogo == 0) { - optScale = std::min(0.008 + std::pow(ply + 3.0, 0.5) / 250.0, + optScale = std::min(0.0084 + std::pow(ply + 3.0, 0.5) * 0.0042, 0.2 * limits.time[us] / double(timeLeft)); maxScale = std::min(7.0, 4.0 + ply / 12.0); } From 258af8ae44fc15407996e0a21a80ee8b9cfa12cb Mon Sep 17 00:00:00 2001 From: Joost VandeVondele Date: Sun, 18 Oct 2020 15:01:19 +0200 Subject: [PATCH 10/27] Add net as dependency of config. Cleaner output and error message if the server is down and the net is not available.
closes https://github.com/official-stockfish/Stockfish/pull/3188 No functional change --- src/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Makefile b/src/Makefile index 54868b39..87203547 100644 --- a/src/Makefile +++ b/src/Makefile @@ -711,7 +711,7 @@ endif config-sanity icc-profile-use icc-profile-make gcc-profile-use gcc-profile-make \ clang-profile-use clang-profile-make -build: config-sanity net +build: net config-sanity $(MAKE) ARCH=$(ARCH) COMP=$(COMP) all profile-build: net config-sanity objclean profileclean @@ -784,7 +784,7 @@ default: all: $(EXE) .depend -config-sanity: +config-sanity: net @echo "" @echo "Config:" @echo "debug: '$(debug)'" From 2046d5da30b2cd505b69bddb40062b0d37b43bc7 Mon Sep 17 00:00:00 2001 From: syzygy1 <3028851+syzygy1@users.noreply.github.com> Date: Tue, 20 Oct 2020 21:06:06 +0200 Subject: [PATCH 11/27] More incremental accumulator updates This patch was inspired by c065abd which updates the accumulator, if possible, based on the accumulator of two plies back if the accumulator of the preceding ply is not available. With this patch we look back even further in the position history in an attempt to reduce the number of complete recomputations. When we find a usable accumulator for the position N plies back, we also update the accumulator of the position N-1 plies back because that accumulator is most likely to be helpful later when evaluating positions in sibling branches. By not updating all intermediate accumulators immediately, we avoid doing too much work that is not certain to be useful. Overall, roughly 2-3% speedup. This patch makes the code more specific to the net architecture, changing input features of the net will require additional changes to the incremental update code as discussed in the PR #3193 and #3191. Passed STC: https://tests.stockfishchess.org/tests/view/5f9056712c92c7fe3a8c60d0 LLR: 2.94 (-2.94,2.94) {-0.25,1.25} Total: 10040 W: 1116 L: 968 D: 7956 Ptnml(0-2): 42, 722, 3365, 828, 63 closes https://github.com/official-stockfish/Stockfish/pull/3193 No functional change. 
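In rough pseudocode, the look-back described above amounts to the following condensed sketch of the UpdateAccumulator() loop from the diff below (names follow the patch; this is an illustration of the idea, not the complete code):

    // Walk back through the StateInfo chain while this side's accumulator
    // is still empty. Stop when this side's king has moved (a full refresh
    // is then mandatory) or when the number of features to add/subtract
    // would cost more than recomputing the accumulator from scratch.
    StateInfo *st = pos.state(), *next = nullptr;
    int gain = popcount(pos.pieces()) - 2;   // rough cost of a full refresh
    while (st->accumulator.state[c] == EMPTY)
    {
        auto& dp = st->dirtyPiece;
        if (   dp.piece[0] == make_piece(c, KING)
            || (gain -= dp.dirty_num + 1) < 0)
            break;
        next = st;
        st = st->previous;
    }
    // If st ended up COMPUTED, update "next" and pos.state() incrementally;
    // otherwise refresh pos.state()'s accumulator from the active features.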
--- src/nnue/features/feature_set.h | 108 ----------- src/nnue/nnue_accumulator.h | 5 +- src/nnue/nnue_feature_transformer.h | 288 ++++++++++++++-------------- src/position.cpp | 17 +- 4 files changed, 157 insertions(+), 261 deletions(-) diff --git a/src/nnue/features/feature_set.h b/src/nnue/features/feature_set.h index 26198114..975824b6 100644 --- a/src/nnue/features/feature_set.h +++ b/src/nnue/features/feature_set.h @@ -43,90 +43,6 @@ namespace Eval::NNUE::Features { template class FeatureSetBase { - public: - // Get a list of indices for active features - template - static void AppendActiveIndices( - const Position& pos, TriggerEvent trigger, IndexListType active[2]) { - - for (Color perspective : { WHITE, BLACK }) { - Derived::CollectActiveIndices( - pos, trigger, perspective, &active[perspective]); - } - } - - // Get a list of indices for recently changed features - template - static void AppendChangedIndices( - const PositionType& pos, TriggerEvent trigger, - IndexListType removed[2], IndexListType added[2], bool reset[2]) { - - auto collect_for_one = [&](const DirtyPiece& dp) { - for (Color perspective : { WHITE, BLACK }) { - switch (trigger) { - case TriggerEvent::kFriendKingMoved: - reset[perspective] = dp.piece[0] == make_piece(perspective, KING); - break; - default: - assert(false); - break; - } - if (reset[perspective]) { - Derived::CollectActiveIndices( - pos, trigger, perspective, &added[perspective]); - } else { - Derived::CollectChangedIndices( - pos, dp, trigger, perspective, - &removed[perspective], &added[perspective]); - } - } - }; - - auto collect_for_two = [&](const DirtyPiece& dp1, const DirtyPiece& dp2) { - for (Color perspective : { WHITE, BLACK }) { - switch (trigger) { - case TriggerEvent::kFriendKingMoved: - reset[perspective] = dp1.piece[0] == make_piece(perspective, KING) - || dp2.piece[0] == make_piece(perspective, KING); - break; - default: - assert(false); - break; - } - if (reset[perspective]) { - Derived::CollectActiveIndices( - pos, trigger, perspective, &added[perspective]); - } else { - Derived::CollectChangedIndices( - pos, dp1, trigger, perspective, - &removed[perspective], &added[perspective]); - Derived::CollectChangedIndices( - pos, dp2, trigger, perspective, - &removed[perspective], &added[perspective]); - } - } - }; - - if (pos.state()->previous->accumulator.computed_accumulation) { - const auto& prev_dp = pos.state()->dirtyPiece; - if (prev_dp.dirty_num == 0) return; - collect_for_one(prev_dp); - } else { - const auto& prev_dp = pos.state()->previous->dirtyPiece; - if (prev_dp.dirty_num == 0) { - const auto& prev2_dp = pos.state()->dirtyPiece; - if (prev2_dp.dirty_num == 0) return; - collect_for_one(prev2_dp); - } else { - const auto& prev2_dp = pos.state()->dirtyPiece; - if (prev2_dp.dirty_num == 0) { - collect_for_one(prev_dp); - } else { - collect_for_two(prev_dp, prev2_dp); - } - } - } - } }; // Class template that represents the feature set @@ -146,30 +62,6 @@ namespace Eval::NNUE::Features { CompileTimeList; static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues; - private: - // Get a list of indices for active features - static void CollectActiveIndices( - const Position& pos, const TriggerEvent trigger, const Color perspective, - IndexList* const active) { - if (FeatureType::kRefreshTrigger == trigger) { - FeatureType::AppendActiveIndices(pos, perspective, active); - } - } - - // Get a list of indices for recently changed features - static void CollectChangedIndices( - const Position& pos, const DirtyPiece& dp, const 
TriggerEvent trigger, const Color perspective, - IndexList* const removed, IndexList* const added) { - - if (FeatureType::kRefreshTrigger == trigger) { - FeatureType::AppendChangedIndices(pos, dp, perspective, removed, added); - } - } - - // Make the base class and the class template that recursively uses itself a friend - friend class FeatureSetBase; - template - friend class FeatureSet; }; } // namespace Eval::NNUE::Features diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h index 26370710..a357d835 100644 --- a/src/nnue/nnue_accumulator.h +++ b/src/nnue/nnue_accumulator.h @@ -25,11 +25,14 @@ namespace Eval::NNUE { + // The accumulator of a StateInfo without parent is set to the INIT state + enum AccumulatorState { EMPTY, COMPUTED, INIT }; + // Class that holds the result of affine transformation of input features struct alignas(kCacheLineSize) Accumulator { std::int16_t accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions]; - bool computed_accumulation; + AccumulatorState state[2]; }; } // namespace Eval::NNUE diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 2f86d20a..f145c848 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -32,7 +32,7 @@ namespace Eval::NNUE { // If vector instructions are enabled, we update and refresh the // accumulator tile by tile such that each tile fits in the CPU's // vector registers. - #define TILING + #define VECTOR #ifdef USE_AVX512 typedef __m512i vec_t; @@ -75,7 +75,7 @@ namespace Eval::NNUE { static constexpr IndexType kNumRegs = 16; #else - #undef TILING + #undef VECTOR #endif @@ -86,7 +86,7 @@ namespace Eval::NNUE { // Number of output dimensions for one side static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions; - #ifdef TILING + #ifdef VECTOR static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2; static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions"); #endif @@ -119,32 +119,11 @@ namespace Eval::NNUE { return !stream.fail(); } - // Proceed with the difference calculation if possible - bool UpdateAccumulatorIfPossible(const Position& pos) const { - - const auto now = pos.state(); - if (now->accumulator.computed_accumulation) - return true; - - const auto prev = now->previous; - if (prev) { - if (prev->accumulator.computed_accumulation) { - UpdateAccumulator(pos); - return true; - } else if (prev->previous && prev->previous->accumulator.computed_accumulation) { - UpdateAccumulator(pos); - return true; - } - } - - return false; - } - // Convert input features void Transform(const Position& pos, OutputType* output) const { - if (!UpdateAccumulatorIfPossible(pos)) - RefreshAccumulator(pos); + UpdateAccumulator(pos, WHITE); + UpdateAccumulator(pos, BLACK); const auto& accumulation = pos.state()->accumulator.accumulation; @@ -240,27 +219,142 @@ namespace Eval::NNUE { } private: - // Calculate cumulative value without using difference calculation - void RefreshAccumulator(const Position& pos) const { + void UpdateAccumulator(const Position& pos, const Color c) const { - auto& accumulator = pos.state()->accumulator; - IndexType i = 0; - Features::IndexList active_indices[2]; - RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i], - active_indices); - for (Color perspective : { WHITE, BLACK }) { - #ifdef TILING - for (unsigned j = 0; j < kHalfDimensions / kTileHeight; ++j) { + #ifdef VECTOR + // Gcc-10.2 unnecessarily spills AVX2 registers if this array + 
// is defined in the VECTOR code below, once in each branch + vec_t acc[kNumRegs]; + #endif + + // Look for a usable accumulator of an earlier position. We keep track + // of the estimated gain in terms of features to be added/subtracted. + StateInfo *st = pos.state(), *next = nullptr; + int gain = popcount(pos.pieces()) - 2; + while (st->accumulator.state[c] == EMPTY) + { + auto& dp = st->dirtyPiece; + // The first condition tests whether an incremental update is + // possible at all: if this side's king has moved, it is not possible. + static_assert(std::is_same_v>, + "Current code assumes that only kFriendlyKingMoved refresh trigger is being used."); + if ( dp.piece[0] == make_piece(c, KING) + || (gain -= dp.dirty_num + 1) < 0) + break; + next = st; + st = st->previous; + } + + if (st->accumulator.state[c] == COMPUTED) + { + if (next == nullptr) + return; + + // Update incrementally in two steps. First, we update the "next" + // accumulator. Then, we update the current accumulator (pos.state()). + + // Gather all features to be updated. This code assumes HalfKP features + // only and doesn't support refresh triggers. + static_assert(std::is_same_v>, + RawFeatures>); + Features::IndexList removed[2], added[2]; + Features::HalfKP::AppendChangedIndices(pos, + next->dirtyPiece, c, &removed[0], &added[0]); + for (StateInfo *st2 = pos.state(); st2 != next; st2 = st2->previous) + Features::HalfKP::AppendChangedIndices(pos, + st2->dirtyPiece, c, &removed[1], &added[1]); + + // Mark the accumulators as computed. + next->accumulator.state[c] = COMPUTED; + pos.state()->accumulator.state[c] = COMPUTED; + + // Now update the accumulators listed in info[], where the last element is a sentinel. + StateInfo *info[3] = + { next, next == pos.state() ? nullptr : pos.state(), nullptr }; + #ifdef VECTOR + for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) + { + // Load accumulator + auto accTile = reinterpret_cast( + &st->accumulator.accumulation[c][0][j * kTileHeight]); + for (IndexType k = 0; k < kNumRegs; ++k) + acc[k] = vec_load(&accTile[k]); + + for (IndexType i = 0; info[i]; ++i) + { + // Difference calculation for the deactivated features + for (const auto index : removed[i]) + { + const IndexType offset = kHalfDimensions * index + j * kTileHeight; + auto column = reinterpret_cast(&weights_[offset]); + for (IndexType k = 0; k < kNumRegs; ++k) + acc[k] = vec_sub_16(acc[k], column[k]); + } + + // Difference calculation for the activated features + for (const auto index : added[i]) + { + const IndexType offset = kHalfDimensions * index + j * kTileHeight; + auto column = reinterpret_cast(&weights_[offset]); + for (IndexType k = 0; k < kNumRegs; ++k) + acc[k] = vec_add_16(acc[k], column[k]); + } + + // Store accumulator + accTile = reinterpret_cast( + &info[i]->accumulator.accumulation[c][0][j * kTileHeight]); + for (IndexType k = 0; k < kNumRegs; ++k) + vec_store(&accTile[k], acc[k]); + } + } + + #else + for (IndexType i = 0; info[i]; ++i) + { + std::memcpy(info[i]->accumulator.accumulation[c][0], + st->accumulator.accumulation[c][0], + kHalfDimensions * sizeof(BiasType)); + st = info[i]; + + // Difference calculation for the deactivated features + for (const auto index : removed[i]) + { + const IndexType offset = kHalfDimensions * index; + + for (IndexType j = 0; j < kHalfDimensions; ++j) + st->accumulator.accumulation[c][0][j] -= weights_[offset + j]; + } + + // Difference calculation for the activated features + for (const auto index : added[i]) + { + const IndexType offset = 
kHalfDimensions * index; + + for (IndexType j = 0; j < kHalfDimensions; ++j) + st->accumulator.accumulation[c][0][j] += weights_[offset + j]; + } + } + #endif + } + else + { + // Refresh the accumulator + auto& accumulator = pos.state()->accumulator; + accumulator.state[c] = COMPUTED; + Features::IndexList active; + Features::HalfKP::AppendActiveIndices(pos, c, &active); + + #ifdef VECTOR + for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) + { auto biasesTile = reinterpret_cast( &biases_[j * kTileHeight]); - auto accTile = reinterpret_cast( - &accumulator.accumulation[perspective][i][j * kTileHeight]); - vec_t acc[kNumRegs]; - - for (unsigned k = 0; k < kNumRegs; ++k) + for (IndexType k = 0; k < kNumRegs; ++k) acc[k] = biasesTile[k]; - for (const auto index : active_indices[perspective]) { + for (const auto index : active) + { const IndexType offset = kHalfDimensions * index + j * kTileHeight; auto column = reinterpret_cast(&weights_[offset]); @@ -268,18 +362,22 @@ namespace Eval::NNUE { acc[k] = vec_add_16(acc[k], column[k]); } + auto accTile = reinterpret_cast( + &accumulator.accumulation[c][0][j * kTileHeight]); for (unsigned k = 0; k < kNumRegs; k++) vec_store(&accTile[k], acc[k]); } + #else - std::memcpy(accumulator.accumulation[perspective][i], biases_, + std::memcpy(accumulator.accumulation[c][0], biases_, kHalfDimensions * sizeof(BiasType)); - for (const auto index : active_indices[perspective]) { + for (const auto index : active) + { const IndexType offset = kHalfDimensions * index; for (IndexType j = 0; j < kHalfDimensions; ++j) - accumulator.accumulation[perspective][i][j] += weights_[offset + j]; + accumulator.accumulation[c][0][j] += weights_[offset + j]; } #endif } @@ -287,106 +385,6 @@ namespace Eval::NNUE { #if defined(USE_MMX) _mm_empty(); #endif - - accumulator.computed_accumulation = true; - } - - // Calculate cumulative value using difference calculation - void UpdateAccumulator(const Position& pos) const { - - Accumulator* prev_accumulator; - assert(pos.state()->previous); - if (pos.state()->previous->accumulator.computed_accumulation) { - prev_accumulator = &pos.state()->previous->accumulator; - } - else { - assert(pos.state()->previous->previous); - assert(pos.state()->previous->previous->accumulator.computed_accumulation); - prev_accumulator = &pos.state()->previous->previous->accumulator; - } - - auto& accumulator = pos.state()->accumulator; - IndexType i = 0; - Features::IndexList removed_indices[2], added_indices[2]; - bool reset[2] = { false, false }; - RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i], - removed_indices, added_indices, reset); - - #ifdef TILING - for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) { - for (Color perspective : { WHITE, BLACK }) { - auto accTile = reinterpret_cast( - &accumulator.accumulation[perspective][i][j * kTileHeight]); - vec_t acc[kNumRegs]; - - if (reset[perspective]) { - auto biasesTile = reinterpret_cast( - &biases_[j * kTileHeight]); - for (unsigned k = 0; k < kNumRegs; ++k) - acc[k] = biasesTile[k]; - } else { - auto prevAccTile = reinterpret_cast( - &prev_accumulator->accumulation[perspective][i][j * kTileHeight]); - for (IndexType k = 0; k < kNumRegs; ++k) - acc[k] = vec_load(&prevAccTile[k]); - - // Difference calculation for the deactivated features - for (const auto index : removed_indices[perspective]) { - const IndexType offset = kHalfDimensions * index + j * kTileHeight; - auto column = reinterpret_cast(&weights_[offset]); - - for (IndexType k = 0; k < kNumRegs; ++k) - acc[k] = 
vec_sub_16(acc[k], column[k]); - } - } - { // Difference calculation for the activated features - for (const auto index : added_indices[perspective]) { - const IndexType offset = kHalfDimensions * index + j * kTileHeight; - auto column = reinterpret_cast(&weights_[offset]); - - for (IndexType k = 0; k < kNumRegs; ++k) - acc[k] = vec_add_16(acc[k], column[k]); - } - } - - for (IndexType k = 0; k < kNumRegs; ++k) - vec_store(&accTile[k], acc[k]); - } - } - #if defined(USE_MMX) - _mm_empty(); - #endif - - #else - for (Color perspective : { WHITE, BLACK }) { - - if (reset[perspective]) { - std::memcpy(accumulator.accumulation[perspective][i], biases_, - kHalfDimensions * sizeof(BiasType)); - } else { - std::memcpy(accumulator.accumulation[perspective][i], - prev_accumulator->accumulation[perspective][i], - kHalfDimensions * sizeof(BiasType)); - // Difference calculation for the deactivated features - for (const auto index : removed_indices[perspective]) { - const IndexType offset = kHalfDimensions * index; - - for (IndexType j = 0; j < kHalfDimensions; ++j) - accumulator.accumulation[perspective][i][j] -= weights_[offset + j]; - } - } - { // Difference calculation for the activated features - for (const auto index : added_indices[perspective]) { - const IndexType offset = kHalfDimensions * index; - - for (IndexType j = 0; j < kHalfDimensions; ++j) - accumulator.accumulation[perspective][i][j] += weights_[offset + j]; - } - } - } - #endif - - accumulator.computed_accumulation = true; } using BiasType = std::int16_t; diff --git a/src/position.cpp b/src/position.cpp index e6a760d2..b707293d 100644 --- a/src/position.cpp +++ b/src/position.cpp @@ -279,6 +279,8 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th chess960 = isChess960; thisThread = th; set_state(st); + st->accumulator.state[WHITE] = Eval::NNUE::INIT; + st->accumulator.state[BLACK] = Eval::NNUE::INIT; assert(pos_is_ok()); @@ -703,7 +705,8 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) { ++st->pliesFromNull; // Used by NNUE - st->accumulator.computed_accumulation = false; + st->accumulator.state[WHITE] = Eval::NNUE::EMPTY; + st->accumulator.state[BLACK] = Eval::NNUE::EMPTY; auto& dp = st->dirtyPiece; dp.dirty_num = 1; @@ -996,16 +999,16 @@ void Position::do_null_move(StateInfo& newSt) { assert(!checkers()); assert(&newSt != st); - if (Eval::useNNUE) - { - std::memcpy(&newSt, st, sizeof(StateInfo)); - } - else - std::memcpy(&newSt, st, offsetof(StateInfo, accumulator)); + std::memcpy(&newSt, st, offsetof(StateInfo, accumulator)); newSt.previous = st; st = &newSt; + st->dirtyPiece.dirty_num = 0; + st->dirtyPiece.piece[0] = NO_PIECE; // Avoid checks in UpdateAccumulator() + st->accumulator.state[WHITE] = Eval::NNUE::EMPTY; + st->accumulator.state[BLACK] = Eval::NNUE::EMPTY; + if (st->epSquare != SQ_NONE) { st->key ^= Zobrist::enpassant[file_of(st->epSquare)]; From bde3505758417c6cd77f2e09edac5bbd5f58b570 Mon Sep 17 00:00:00 2001 From: FauziAkram Date: Sat, 24 Oct 2020 02:01:04 +0300 Subject: [PATCH 12/27] Bishop Pawns based on Files Passed STC: https://tests.stockfishchess.org/tests/view/5f8cc8145a4eacb45305da3c LLR: 2.94 (-2.94,2.94) {-0.25,1.25} Total: 132544 W: 27795 L: 27328 D: 77421 Ptnml(0-2): 2756, 15558, 29272, 15835, 2851 Passed LTC: https://tests.stockfishchess.org/tests/view/5f8df614bacb75a4f9a4721e LLR: 2.94 (-2.94,2.94) {0.25,1.25} Total: 169608 W: 23257 L: 22622 D: 123729 Ptnml(0-2): 1408, 16316, 48758, 16877, 1445 closes 
https://github.com/official-stockfish/Stockfish/pull/3194 Bench: 4067106 --- src/evaluate.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/evaluate.cpp b/src/evaluate.cpp index 425ba6f8..030d1017 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -222,6 +222,12 @@ namespace { S(112,178), S(114,185), S(114,187), S(119,221) } }; + // BishopPawns[distance from edge] contains a file-dependent penalty for pawns on + // squares of the same color as our bishop. + constexpr Score BishopPawns[int(FILE_NB) / 2] = { + S(3, 8), S(3, 9), S(1, 8), S(3, 7) + }; + // KingProtector[knight/bishop] contains penalty for each distance unit to own king constexpr Score KingProtector[] = { S(8, 9), S(6, 9) }; @@ -252,7 +258,6 @@ namespace { // Assorted bonuses and penalties constexpr Score BadOutpost = S( -7, 36); constexpr Score BishopOnKingRing = S( 24, 0); - constexpr Score BishopPawns = S( 3, 7); constexpr Score BishopXRayPawns = S( 4, 5); constexpr Score CorneredBishop = S( 50, 50); constexpr Score FlankAttacks = S( 8, 0); @@ -453,7 +458,7 @@ namespace { // when the bishop is outside the pawn chain. Bitboard blocked = pos.pieces(Us, PAWN) & shift(pos.pieces()); - score -= BishopPawns * pos.pawns_on_same_color_squares(Us, s) + score -= BishopPawns[edge_distance(file_of(s))] * pos.pawns_on_same_color_squares(Us, s) * (!(attackedBy[Us][PAWN] & s) + popcount(blocked & CenterFiles)); // Penalty for all enemy pawns x-rayed @@ -906,7 +911,7 @@ namespace { : pos.count(WHITE) + pos.count(WHITE)); else sf = std::min(sf, 36 + 7 * pos.count(strongSide)) - 4 * !pawnsOnBothFlanks; - + sf -= 4 * !pawnsOnBothFlanks; } From 6328135264d3b13a2cef3f0c835a27192cae0f40 Mon Sep 17 00:00:00 2001 From: SFisGOD Date: Wed, 28 Oct 2020 04:24:55 +0800 Subject: [PATCH 13/27] Update default net to nn-2eb2e0707c2b.nnue Optimization of the net weights of the 32 x 32 layer (1024 parameters) and net biases of the 512 x 32 layer (32 parameters) using SPSA. Tuning of 32 x 32 Layer (800,000 games, 5 seconds time control): https://tests.stockfishchess.org/tests/view/5f942040d3978d7e86f1aa05 Tuning of 512 x 32 Layer (80,000 games, 20 seconds time control): https://tests.stockfishchess.org/tests/view/5f8f926d2c92c7fe3a8c608b STC: LLR: 2.96 (-2.94,2.94) {-0.25,1.25} Total: 17336 W: 1918 L: 1754 D: 13664 Ptnml(0-2): 79, 1344, 5672, 1480, 93 https://tests.stockfishchess.org/tests/view/5f9882346a2c112b60691b34 LTC: LLR: 2.94 (-2.94,2.94) {0.25,1.25} Total: 37304 W: 1822 L: 1651 D: 33831 Ptnml(0-2): 27, 1461, 15501, 1640, 23 https://tests.stockfishchess.org/tests/view/5f98a4b36a2c112b60691b40 closes https://github.com/official-stockfish/Stockfish/pull/3201 Bench: 3403528 --- src/evaluate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/evaluate.h b/src/evaluate.h index 6a8603ad..6e5db6a3 100644 --- a/src/evaluate.h +++ b/src/evaluate.h @@ -36,7 +36,7 @@ namespace Eval { // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue // for the build process (profile-build and fishtest) to work. Do not change the // name of the macro, as it is used in the Makefile. 
- #define EvalFileDefaultName "nn-eba324f53044.nnue" + #define EvalFileDefaultName "nn-2eb2e0707c2b.nnue" namespace NNUE { From 0f6c08c73f516873b312cb8fce0d824a2167b075 Mon Sep 17 00:00:00 2001 From: syzygy1 <3028851+syzygy1@users.noreply.github.com> Date: Tue, 27 Oct 2020 19:22:41 +0100 Subject: [PATCH 14/27] Do not skip non-recapture ttMove when in check The qsearch() MovePicker incorrectly skips a non-recapture ttMove when in check (if depth <= DEPTH_QS_RECAPTURES). This is clearly not intended and can cause qsearch() to return a mate score when there is no mate. Introduced in cad300c and 6596f0e, as observed by joergoster in #3171 and #3198. This PR fixes the bug by not skipping the non-recapture ttMove when in check. Passed non-regression STC: https://tests.stockfishchess.org/tests/view/5f9867ea6a2c112b60691b10 LLR: 2.98 (-2.94,2.94) {-1.25,0.25} Total: 27112 W: 2943 L: 2842 D: 21327 Ptnml(0-2): 127, 2170, 8878, 2237, 144 Passed non-regression LTC: https://tests.stockfishchess.org/tests/view/5f9967326a2c112b60691bb0 LLR: 2.99 (-2.94,2.94) {-0.75,0.25} Total: 18392 W: 807 L: 738 D: 16847 Ptnml(0-2): 9, 655, 7802, 718, 12 closes https://github.com/official-stockfish/Stockfish/pull/3199 closes https://github.com/official-stockfish/Stockfish/pull/3198 Bench: 3870606 --- src/movepick.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/movepick.cpp b/src/movepick.cpp index 153d323e..f5e02385 100644 --- a/src/movepick.cpp +++ b/src/movepick.cpp @@ -73,8 +73,9 @@ MovePicker::MovePicker(const Position& p, Move ttm, Depth d, const ButterflyHist assert(d <= 0); stage = (pos.checkers() ? EVASION_TT : QSEARCH_TT) + - !(ttm && (depth > DEPTH_QS_RECAPTURES || to_sq(ttm) == recaptureSquare) - && pos.pseudo_legal(ttm)); + !( ttm + && (pos.checkers() || depth > DEPTH_QS_RECAPTURES || to_sq(ttm) == recaptureSquare) + && pos.pseudo_legal(ttm)); } /// MovePicker constructor for ProbCut: we generate captures with SEE greater From dfc7f88650bf8bda4a381d36e209209cf63a9bcc Mon Sep 17 00:00:00 2001 From: mstembera Date: Fri, 30 Oct 2020 13:45:40 -0700 Subject: [PATCH 15/27] Update default net to nn-cb26f10b1fd9.nnue Result of https://tests.stockfishchess.org/tests/view/5f9a06796a2c112b60691c0f tuning. STC LLR: 2.94 (-2.94,2.94) {-0.25,1.25} Total: 53712 W: 5776 L: 5561 D: 42375 Ptnml(0-2): 253, 4282, 17604, 4431, 286 https://tests.stockfishchess.org/tests/view/5f9c7bbc6a2c112b60691d4d LTC LLR: 2.97 (-2.94,2.94) {0.25,1.25} Total: 80184 W: 4007 L: 3739 D: 72438 Ptnml(0-2): 58, 3302, 33130, 3518, 84 https://tests.stockfishchess.org/tests/view/5f9d01f06a2c112b60691d87 closes https://github.com/official-stockfish/Stockfish/pull/3209 bench: 3517795 --- src/evaluate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/evaluate.h b/src/evaluate.h index 6e5db6a3..6bec27db 100644 --- a/src/evaluate.h +++ b/src/evaluate.h @@ -36,7 +36,7 @@ namespace Eval { // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue // for the build process (profile-build and fishtest) to work. Do not change the // name of the macro, as it is used in the Makefile. - #define EvalFileDefaultName "nn-2eb2e0707c2b.nnue" + #define EvalFileDefaultName "nn-cb26f10b1fd9.nnue" namespace NNUE { From 75e06a1c89ebac9c9ec4247bc82ec728a2bffe1e Mon Sep 17 00:00:00 2001 From: Tomasz Sobczyk Date: Thu, 29 Oct 2020 00:14:53 +0100 Subject: [PATCH 16/27] Optimize affine transform for SSSE3 and higher targets. A non-functional speedup. 
Unroll the loops going over the output dimensions in the affine transform layers by a factor of 4 and perform 4 horizontal additions at a time. Instead of doing naive horizontal additions on each vector separately use hadd and shuffling between vectors to reduce the number of instructions by using all lanes for all stages of the horizontal adds. passed STC of the initial version: LLR: 2.95 (-2.94,2.94) {-0.25,1.25} Total: 17808 W: 1914 L: 1756 D: 14138 Ptnml(0-2): 76, 1330, 5948, 1460, 90 https://tests.stockfishchess.org/tests/view/5f9d516f6a2c112b60691da3 passed STC of the final version after cleanup: LLR: 2.95 (-2.94,2.94) {-0.25,1.25} Total: 16296 W: 1750 L: 1595 D: 12951 Ptnml(0-2): 72, 1192, 5479, 1319, 86 https://tests.stockfishchess.org/tests/view/5f9df5776a2c112b60691de3 closes https://github.com/official-stockfish/Stockfish/pull/3203 No functional change --- src/nnue/layers/affine_transform.h | 478 +++++++++++++++++++++++------ 1 file changed, 384 insertions(+), 94 deletions(-) diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index 94d0b5a9..f0292e45 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -74,113 +74,400 @@ namespace Eval::NNUE::Layers { const TransformedFeatureType* transformed_features, char* buffer) const { const auto input = previous_layer_.Propagate( transformed_features, buffer + kSelfBufferSize); + +#if defined (USE_AVX512) + + [[maybe_unused]] const __m512i kOnes512 = _mm512_set1_epi16(1); + + [[maybe_unused]] auto m512_hadd = [](__m512i sum, int bias) -> int { + return _mm512_reduce_add_epi32(sum) + bias; + }; + + [[maybe_unused]] auto m512_haddx4 = [](__m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3, __m128i bias) -> __m128i { + __m512i sum01a = _mm512_unpacklo_epi32(sum0, sum1); + __m512i sum01b = _mm512_unpackhi_epi32(sum0, sum1); + + __m512i sum23a = _mm512_unpacklo_epi32(sum2, sum3); + __m512i sum23b = _mm512_unpackhi_epi32(sum2, sum3); + + __m512i sum01 = _mm512_add_epi32(sum01a, sum01b); + __m512i sum23 = _mm512_add_epi32(sum23a, sum23b); + + __m512i sum0123a = _mm512_unpacklo_epi64(sum01, sum23); + __m512i sum0123b = _mm512_unpackhi_epi64(sum01, sum23); + + __m512i sum = _mm512_add_epi32(sum0123a, sum0123b); + + __m256i sum256lo = _mm512_castsi512_si256(sum); + __m256i sum256hi = _mm512_extracti64x4_epi64(sum, 1); + + sum256lo = _mm256_add_epi32(sum256lo, sum256hi); + + __m128i sum128lo = _mm256_castsi256_si128(sum256lo); + __m128i sum128hi = _mm256_extracti128_si256(sum256lo, 1); + + return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias); + }; + + [[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) { +#if defined (USE_VNNI) + acc = _mm512_dpbusd_epi32(acc, a, b); +#else + __m512i product0 = _mm512_maddubs_epi16(a, b); + product0 = _mm512_madd_epi16(product0, kOnes512); + acc = _mm512_add_epi32(acc, product0); +#endif + }; + +#endif +#if defined (USE_AVX2) + + [[maybe_unused]] const __m256i kOnes256 = _mm256_set1_epi16(1); + + [[maybe_unused]] auto m256_hadd = [](__m256i sum, int bias) -> int { + __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1)); + sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC)); + sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB)); + return _mm_cvtsi128_si32(sum128) + bias; + }; + + [[maybe_unused]] auto m256_haddx4 = [](__m256i sum0, __m256i sum1, __m256i sum2, __m256i sum3, __m128i bias) -> __m128i { + sum0 = 
_mm256_hadd_epi32(sum0, sum1); + sum2 = _mm256_hadd_epi32(sum2, sum3); + + sum0 = _mm256_hadd_epi32(sum0, sum2); + + __m128i sum128lo = _mm256_castsi256_si128(sum0); + __m128i sum128hi = _mm256_extracti128_si256(sum0, 1); + + return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias); + }; + + [[maybe_unused]] auto m256_add_dpbusd_epi32 = [=](__m256i& acc, __m256i a, __m256i b) { +#if defined (USE_VNNI) + acc = _mm256_dpbusd_epi32(acc, a, b); +#else + __m256i product0 = _mm256_maddubs_epi16(a, b); + product0 = _mm256_madd_epi16(product0, kOnes256); + acc = _mm256_add_epi32(acc, product0); +#endif + }; + +#endif + +#if defined (USE_SSSE3) + + [[maybe_unused]] const __m128i kOnes128 = _mm_set1_epi16(1); + + [[maybe_unused]] auto m128_hadd = [](__m128i sum, int bias) -> int { + sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC + sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB + return _mm_cvtsi128_si32(sum) + bias; + }; + + [[maybe_unused]] auto m128_haddx4 = [](__m128i sum0, __m128i sum1, __m128i sum2, __m128i sum3, __m128i bias) -> __m128i { + sum0 = _mm_hadd_epi32(sum0, sum1); + sum2 = _mm_hadd_epi32(sum2, sum3); + + sum0 = _mm_hadd_epi32(sum0, sum2); + + return _mm_add_epi32(sum0, bias); + }; + + [[maybe_unused]] auto m128_add_dpbusd_epi32 = [=](__m128i& acc, __m128i a, __m128i b) { + __m128i product0 = _mm_maddubs_epi16(a, b); + product0 = _mm_madd_epi16(product0, kOnes128); + acc = _mm_add_epi32(acc, product0); + }; + +#endif + +#if defined (USE_AVX512) + + constexpr IndexType kNumChunks512 = kPaddedInputDimensions / (kSimdWidth * 2); + constexpr IndexType kNumChunks256 = kPaddedInputDimensions / kSimdWidth; + const auto output = reinterpret_cast(buffer); - #if defined(USE_AVX512) - constexpr IndexType kNumChunks = kPaddedInputDimensions / (kSimdWidth * 2); - const auto input_vector = reinterpret_cast(input); - #if !defined(USE_VNNI) - const __m512i kOnes = _mm512_set1_epi16(1); - #endif + // Since to saturate a zmm register it takes 64 bytes we + // cannot use AVX512 for the smaller affine transforms. + // Instead we fallback to a AVX2 implementation if the + // kInputDimensions isn't a multiple of 64. + // Note that this means that for example for + // kInputDimensions of 96 we fallback to AVX2 even though + // the first 64 elements could be processed with AVX512. + // This is caused by mixing the __m256 and __m512 variables + // required to better handle that case and it would + // require handling more cases statically not to lose performance. + // This should be revisited if such input dimensions are to be considered. + [[maybe_unused]] const auto input_vector512 = reinterpret_cast(input); + [[maybe_unused]] const auto input_vector256 = reinterpret_cast(input); + + // kOutputDimensions is either 1 or a multiple of kSimdWidth + // because then it is also an input dimension. 
+ if constexpr (kOutputDimensions % 4 == 0) + { + for (IndexType i = 0; i < kOutputDimensions; i += 4) + { + const IndexType offset0 = (i + 0) * kPaddedInputDimensions; + const IndexType offset1 = (i + 1) * kPaddedInputDimensions; + const IndexType offset2 = (i + 2) * kPaddedInputDimensions; + const IndexType offset3 = (i + 3) * kPaddedInputDimensions; + + const __m128i bias = *reinterpret_cast(&biases_[i]); + __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]); + + if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0) + { + __m512i sum0 = _mm512_setzero_si512(); + __m512i sum1 = _mm512_setzero_si512(); + __m512i sum2 = _mm512_setzero_si512(); + __m512i sum3 = _mm512_setzero_si512(); + + const auto row0 = reinterpret_cast(&weights_[offset0]); + const auto row1 = reinterpret_cast(&weights_[offset1]); + const auto row2 = reinterpret_cast(&weights_[offset2]); + const auto row3 = reinterpret_cast(&weights_[offset3]); + + for (IndexType j = 0; j < kNumChunks512; ++j) + { + const __m512i in = input_vector512[j]; + + m512_add_dpbusd_epi32(sum0, in, row0[j]); + m512_add_dpbusd_epi32(sum1, in, row1[j]); + m512_add_dpbusd_epi32(sum2, in, row2[j]); + m512_add_dpbusd_epi32(sum3, in, row3[j]); + } + + *outptr = m512_haddx4(sum0, sum1, sum2, sum3, bias); + } + else + { + __m256i sum0 = _mm256_setzero_si256(); + __m256i sum1 = _mm256_setzero_si256(); + __m256i sum2 = _mm256_setzero_si256(); + __m256i sum3 = _mm256_setzero_si256(); + + const auto row0 = reinterpret_cast(&weights_[offset0]); + const auto row1 = reinterpret_cast(&weights_[offset1]); + const auto row2 = reinterpret_cast(&weights_[offset2]); + const auto row3 = reinterpret_cast(&weights_[offset3]); + + for (IndexType j = 0; j < kNumChunks256; ++j) + { + const __m256i in = input_vector256[j]; + + m256_add_dpbusd_epi32(sum0, in, row0[j]); + m256_add_dpbusd_epi32(sum1, in, row1[j]); + m256_add_dpbusd_epi32(sum2, in, row2[j]); + m256_add_dpbusd_epi32(sum3, in, row3[j]); + } + + *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias); + } + } + } + else if constexpr (kOutputDimensions == 1) + { + if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0) + { + __m512i sum0 = _mm512_setzero_si512(); + + const auto row0 = reinterpret_cast(&weights_[0]); + + for (IndexType j = 0; j < kNumChunks512; ++j) + { + const __m512i in = input_vector512[j]; + + m512_add_dpbusd_epi32(sum0, in, row0[j]); + } + + output[0] = m512_hadd(sum0, biases_[0]); + } + else + { + __m256i sum0 = _mm256_setzero_si256(); + + const auto row0 = reinterpret_cast(&weights_[0]); + + for (IndexType j = 0; j < kNumChunks256; ++j) + { + const __m256i in = input_vector256[j]; + + m256_add_dpbusd_epi32(sum0, in, row0[j]); + } + + output[0] = m256_hadd(sum0, biases_[0]); + } + } + else + { + // This case can never happen because kOutputDimensions + // is always 1 or a multiple of kSimdWidth. + assert(false); + } + +#elif defined (USE_AVX2) - #elif defined(USE_AVX2) constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; + + const auto output = reinterpret_cast(buffer); const auto input_vector = reinterpret_cast(input); - #if !defined(USE_VNNI) - const __m256i kOnes = _mm256_set1_epi16(1); - #endif - #elif defined(USE_SSE2) + // kOutputDimensions is either 1 or a multiple of kSimdWidth + // because then it is also an input dimension. 
+ if constexpr (kOutputDimensions % 4 == 0) + { + for (IndexType i = 0; i < kOutputDimensions; i += 4) + { + const IndexType offset0 = (i + 0) * kPaddedInputDimensions; + const IndexType offset1 = (i + 1) * kPaddedInputDimensions; + const IndexType offset2 = (i + 2) * kPaddedInputDimensions; + const IndexType offset3 = (i + 3) * kPaddedInputDimensions; + + const __m128i bias = *reinterpret_cast(&biases_[i]); + __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]); + + __m256i sum0 = _mm256_setzero_si256(); + __m256i sum1 = _mm256_setzero_si256(); + __m256i sum2 = _mm256_setzero_si256(); + __m256i sum3 = _mm256_setzero_si256(); + + const auto row0 = reinterpret_cast(&weights_[offset0]); + const auto row1 = reinterpret_cast(&weights_[offset1]); + const auto row2 = reinterpret_cast(&weights_[offset2]); + const auto row3 = reinterpret_cast(&weights_[offset3]); + + for (IndexType j = 0; j < kNumChunks; ++j) + { + const __m256i in = input_vector[j]; + + m256_add_dpbusd_epi32(sum0, in, row0[j]); + m256_add_dpbusd_epi32(sum1, in, row1[j]); + m256_add_dpbusd_epi32(sum2, in, row2[j]); + m256_add_dpbusd_epi32(sum3, in, row3[j]); + } + + *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias); + } + } + else if constexpr (kOutputDimensions == 1) + { + __m256i sum0 = _mm256_setzero_si256(); + + const auto row0 = reinterpret_cast(&weights_[0]); + + for (IndexType j = 0; j < kNumChunks; ++j) + { + const __m256i in = input_vector[j]; + + m256_add_dpbusd_epi32(sum0, in, row0[j]); + } + + output[0] = m256_hadd(sum0, biases_[0]); + } + else + { + // This case can never happen because kOutputDimensions + // is always 1 or a multiple of kSimdWidth. + assert(false); + } + +#elif defined (USE_SSSE3) + constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; - #ifndef USE_SSSE3 - const __m128i kZeros = _mm_setzero_si128(); - #else - const __m128i kOnes = _mm_set1_epi16(1); - #endif + + auto output = reinterpret_cast(buffer); const auto input_vector = reinterpret_cast(input); - #elif defined(USE_MMX) + // kOutputDimensions is either 1 or a multiple of kSimdWidth + // because then it is also an input dimension. 
+ if constexpr (kOutputDimensions % 4 == 0) + { + for (IndexType i = 0; i < kOutputDimensions; i += 4) + { + const IndexType offset0 = (i + 0) * kPaddedInputDimensions; + const IndexType offset1 = (i + 1) * kPaddedInputDimensions; + const IndexType offset2 = (i + 2) * kPaddedInputDimensions; + const IndexType offset3 = (i + 3) * kPaddedInputDimensions; + + const __m128i bias = *reinterpret_cast(&biases_[i]); + __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]); + + __m128i sum0 = _mm_setzero_si128(); + __m128i sum1 = _mm_setzero_si128(); + __m128i sum2 = _mm_setzero_si128(); + __m128i sum3 = _mm_setzero_si128(); + + const auto row0 = reinterpret_cast(&weights_[offset0]); + const auto row1 = reinterpret_cast(&weights_[offset1]); + const auto row2 = reinterpret_cast(&weights_[offset2]); + const auto row3 = reinterpret_cast(&weights_[offset3]); + + for (int j = 0; j < (int)kNumChunks; j += 1) + { + const __m128i in = input_vector[j]; + + m128_add_dpbusd_epi32(sum0, in, row0[j]); + m128_add_dpbusd_epi32(sum1, in, row1[j]); + m128_add_dpbusd_epi32(sum2, in, row2[j]); + m128_add_dpbusd_epi32(sum3, in, row3[j]); + } + + *outptr = m128_haddx4(sum0, sum1, sum2, sum3, bias); + } + } + else if constexpr (kOutputDimensions == 1) + { + __m128i sum0 = _mm_setzero_si128(); + + const auto row0 = reinterpret_cast(&weights_[0]); + + for (int j = 0; j < (int)kNumChunks; j += 1) + { + const __m128i in = input_vector[j]; + + m128_add_dpbusd_epi32(sum0, in, row0[j]); + } + + output[0] = m128_hadd(sum0, biases_[0]); + } + else + { + // This case can never happen because kOutputDimensions + // is always 1 or a multiple of kSimdWidth. + assert(false); + } + +#else + +// Use old implementation for the other architectures. + + auto output = reinterpret_cast(buffer); + +#if defined(USE_SSE2) + constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; +#ifndef USE_SSSE3 + const __m128i kZeros = _mm_setzero_si128(); +#else + const __m128i kOnes = _mm_set1_epi16(1); +#endif + const auto input_vector = reinterpret_cast(input); + +#elif defined(USE_MMX) constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; const __m64 kZeros = _mm_setzero_si64(); const auto input_vector = reinterpret_cast(input); - #elif defined(USE_NEON) +#elif defined(USE_NEON) constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; const auto input_vector = reinterpret_cast(input); - #endif +#endif for (IndexType i = 0; i < kOutputDimensions; ++i) { const IndexType offset = i * kPaddedInputDimensions; - #if defined(USE_AVX512) - __m512i sum = _mm512_setzero_si512(); - const auto row = reinterpret_cast(&weights_[offset]); - for (IndexType j = 0; j < kNumChunks; ++j) { - #if defined(USE_VNNI) - sum = _mm512_dpbusd_epi32(sum, _mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j])); - #else - __m512i product = _mm512_maddubs_epi16(_mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j])); - product = _mm512_madd_epi16(product, kOnes); - sum = _mm512_add_epi32(sum, product); - #endif - } - - // Note: Changing kMaxSimdWidth from 32 to 64 breaks loading existing networks. - // As a result kPaddedInputDimensions may not be an even multiple of 64(512bit) - // and we have to do one more 256bit chunk. 
- if (kPaddedInputDimensions != kNumChunks * kSimdWidth * 2) - { - const auto iv256 = reinterpret_cast(&input_vector[kNumChunks]); - const auto row256 = reinterpret_cast(&row[kNumChunks]); - #if defined(USE_VNNI) - __m256i product256 = _mm256_dpbusd_epi32( - _mm512_castsi512_si256(sum), _mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0])); - sum = _mm512_inserti32x8(sum, product256, 0); - #else - __m256i product256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0])); - sum = _mm512_add_epi32(sum, _mm512_cvtepi16_epi32(product256)); - #endif - } - output[i] = _mm512_reduce_add_epi32(sum) + biases_[i]; - - #elif defined(USE_AVX2) - __m256i sum = _mm256_setzero_si256(); - const auto row = reinterpret_cast(&weights_[offset]); - for (IndexType j = 0; j < kNumChunks; ++j) { - #if defined(USE_VNNI) - sum = _mm256_dpbusd_epi32(sum, _mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j])); - #else - __m256i product = _mm256_maddubs_epi16(_mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j])); - product = _mm256_madd_epi16(product, kOnes); - sum = _mm256_add_epi32(sum, product); - #endif - } - __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1)); - sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC)); - sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB)); - output[i] = _mm_cvtsi128_si32(sum128) + biases_[i]; - - #elif defined(USE_SSSE3) - __m128i sum = _mm_setzero_si128(); - const auto row = reinterpret_cast(&weights_[offset]); - for (int j = 0; j < (int)kNumChunks - 1; j += 2) { - __m128i product0 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j])); - product0 = _mm_madd_epi16(product0, kOnes); - sum = _mm_add_epi32(sum, product0); - __m128i product1 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j+1]), _mm_load_si128(&row[j+1])); - product1 = _mm_madd_epi16(product1, kOnes); - sum = _mm_add_epi32(sum, product1); - } - if (kNumChunks & 0x1) { - __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[kNumChunks-1]), _mm_load_si128(&row[kNumChunks-1])); - product = _mm_madd_epi16(product, kOnes); - sum = _mm_add_epi32(sum, product); - } - sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC - sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB - output[i] = _mm_cvtsi128_si32(sum) + biases_[i]; - - #elif defined(USE_SSE2) +#if defined(USE_SSE2) __m128i sum_lo = _mm_cvtsi32_si128(biases_[i]); __m128i sum_hi = kZeros; const auto row = reinterpret_cast(&weights_[offset]); @@ -204,7 +491,7 @@ namespace Eval::NNUE::Layers { sum = _mm_add_epi32(sum, sum_second_32); output[i] = _mm_cvtsi128_si32(sum); - #elif defined(USE_MMX) +#elif defined(USE_MMX) __m64 sum_lo = _mm_cvtsi32_si64(biases_[i]); __m64 sum_hi = kZeros; const auto row = reinterpret_cast(&weights_[offset]); @@ -225,7 +512,7 @@ namespace Eval::NNUE::Layers { sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum)); output[i] = _mm_cvtsi64_si32(sum); - #elif defined(USE_NEON) +#elif defined(USE_NEON) int32x4_t sum = {biases_[i]}; const auto row = reinterpret_cast(&weights_[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { @@ -235,18 +522,21 @@ namespace Eval::NNUE::Layers { } output[i] = sum[0] + sum[1] + sum[2] + sum[3]; - #else +#else OutputType sum = biases_[i]; for (IndexType j = 0; j < kInputDimensions; ++j) { sum += weights_[offset + j] * input[j]; } output[i] = sum; - #endif +#endif } - #if defined(USE_MMX) +#if 
defined(USE_MMX) _mm_empty(); - #endif +#endif + +#endif + return output; } From 931070b65ac0332469a24765a60eb27e038f73bc Mon Sep 17 00:00:00 2001 From: FauziAkram Date: Thu, 29 Oct 2020 17:33:18 +0300 Subject: [PATCH 17/27] Elo Worth in King Danger Adding the EloWorth for each term in King Danger. Should be useful for simplifications, tuning patches, and new ideas. closes https://github.com/official-stockfish/Stockfish/pull/3204 non-functional change --- src/evaluate.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/evaluate.cpp b/src/evaluate.cpp index 030d1017..4ade46fa 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -582,18 +582,18 @@ namespace { int kingFlankAttack = popcount(b1) + popcount(b2); int kingFlankDefense = popcount(b3); - kingDanger += kingAttackersCount[Them] * kingAttackersWeight[Them] - + 185 * popcount(kingRing[Us] & weak) - + 148 * popcount(unsafeChecks) - + 98 * popcount(pos.blockers_for_king(Us)) - + 69 * kingAttacksCount[Them] - + 3 * kingFlankAttack * kingFlankAttack / 8 - + mg_value(mobility[Them] - mobility[Us]) - - 873 * !pos.count(Them) - - 100 * bool(attackedBy[Us][KNIGHT] & attackedBy[Us][KING]) - - 6 * mg_value(score) / 8 - - 4 * kingFlankDefense - + 37; + kingDanger += kingAttackersCount[Them] * kingAttackersWeight[Them] // (~10 Elo) + + 185 * popcount(kingRing[Us] & weak) // (~15 Elo) + + 148 * popcount(unsafeChecks) // (~4 Elo) + + 98 * popcount(pos.blockers_for_king(Us)) // (~2 Elo) + + 69 * kingAttacksCount[Them] // (~0.5 Elo) + + 3 * kingFlankAttack * kingFlankAttack / 8 // (~0.5 Elo) + + mg_value(mobility[Them] - mobility[Us]) // (~0.5 Elo) + - 873 * !pos.count(Them) // (~24 Elo) + - 100 * bool(attackedBy[Us][KNIGHT] & attackedBy[Us][KING]) // (~5 Elo) + - 6 * mg_value(score) / 8 // (~8 Elo) + - 4 * kingFlankDefense // (~5 Elo) + + 37; // (~0.5 Elo) // Transform the kingDanger units into a Score, and subtract it from the evaluation if (kingDanger > 100) From a260c9a8a24a2630a900efc3821000c3481b0c5d Mon Sep 17 00:00:00 2001 From: "J. Oster" Date: Sun, 1 Nov 2020 18:33:17 +0100 Subject: [PATCH 18/27] Fix incorrect pruning in qsearch Only do countermove based pruning in qsearch if we already have a move with a better score than a TB loss. This patch fixes a bug (started as 843a961) that incorrectly prunes moves if in check, and adds an assert to make sure no wrong mate scores are given in the future. It replaces a no-op moveCount check with a check for bestValue. Initially discussed in #3171 and later in #3199, #3198 and #3210. This PR effectively closes #3171 It also likely fixes #3196 where this causes user visible incorrect TB scores, which probably result from these incorrect mate scores. Passed STC and LTC non-regression tests. 
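For illustration only (this sketch is not part of the patch; the actual diff appears further below): moveCount is incremented a few lines earlier in qsearch() for every move examined, so the old "&& moveCount" condition was always true and never blocked the pruning. Gating on bestValue instead means the pruning can only fire once some earlier move has already scored above a tablebase loss. In the sketch, "historyIsBad" is hypothetical shorthand for the two contHist threshold tests; the other names are the existing qsearch() identifiers.

    // Sketch of the guard change, not compilable on its own.
    moveCount++;                     // incremented earlier for every move tried

    // old guard: moveCount is already >= 1 at this point, so it never
    // prevented pruning, even when all remaining moves were check evasions
    // if (!captureOrPromotion && moveCount && historyIsBad)
    //     continue;

    // new guard: prune only after some move has scored above a TB loss, so
    // the last evasions of a check can never be pruned into a false mate score
    if (!captureOrPromotion && bestValue > VALUE_TB_LOSS_IN_MAX_PLY && historyIsBad)
        continue;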
https://tests.stockfishchess.org/tests/view/5f9ef8dabca9bf35bae7f648 LLR: 2.93 (-2.94,2.94) {-1.25,0.25} Total: 21672 W: 2339 L: 2230 D: 17103 Ptnml(0-2): 126, 1689, 7083, 1826, 112 https://tests.stockfishchess.org/tests/view/5f9f0caebca9bf35bae7f666 LLR: 2.97 (-2.94,2.94) {-0.75,0.25} Total: 33152 W: 1551 L: 1485 D: 30116 Ptnml(0-2): 27, 1308, 13832, 1390, 19 closes https://github.com/official-stockfish/Stockfish/pull/3214 Bench: 3625915 --- src/search.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/search.cpp b/src/search.cpp index 65ed9b73..743449fa 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -1565,7 +1565,7 @@ moves_loop: // When in check, search starts from here // CounterMove based pruning if ( !captureOrPromotion - && moveCount + && bestValue > VALUE_TB_LOSS_IN_MAX_PLY && (*contHist[0])[pos.moved_piece(move)][to_sq(move)] < CounterMovePruneThreshold && (*contHist[1])[pos.moved_piece(move)][to_sq(move)] < CounterMovePruneThreshold) continue; @@ -1600,7 +1600,11 @@ moves_loop: // When in check, search starts from here // All legal moves have been searched. A special case: if we're in check // and no legal moves were found, it is checkmate. if (ss->inCheck && bestValue == -VALUE_INFINITE) + { + assert(!MoveList(pos).size()); + return mated_in(ss->ply); // Plies to mate from the root + } tte->save(posKey, value_to_tt(bestValue, ss->ply), pvHit, bestValue >= beta ? BOUND_LOWER : From 3f6451eff7c62e8d4a33c5b11f055a81b3da8387 Mon Sep 17 00:00:00 2001 From: Tomasz Sobczyk Date: Tue, 3 Nov 2020 11:23:35 +0100 Subject: [PATCH 19/27] Manually align arrays on the stack as a workaround to issues with overaligned alignas() on stack variables in gcc < 9.3 on windows. closes https://github.com/official-stockfish/Stockfish/pull/3217 fixes #3216 No functional change --- src/misc.h | 12 ++++++++++++ src/nnue/evaluate_nnue.cpp | 25 ++++++++++++++++++++++--- src/nnue/layers/clipped_relu.h | 10 +++++----- src/nnue/nnue_common.h | 23 ----------------------- src/nnue/nnue_feature_transformer.h | 14 +++++++------- src/position.cpp | 4 ++++ src/search.cpp | 8 ++++++++ src/types.h | 6 ++++++ 8 files changed, 64 insertions(+), 38 deletions(-) diff --git a/src/misc.h b/src/misc.h index bc48f303..682ef816 100644 --- a/src/misc.h +++ b/src/misc.h @@ -24,6 +24,7 @@ #include #include #include +#include #include "types.h" @@ -63,6 +64,17 @@ std::ostream& operator<<(std::ostream&, SyncCout); #define sync_cout std::cout << IO_LOCK #define sync_endl std::endl << IO_UNLOCK +// `ptr` must point to an array of size at least +// `sizeof(T) * N + alignment` bytes, where `N` is the +// number of elements in the array. +template +T* align_ptr_up(T* ptr) +{ + static_assert(alignof(T) < Alignment); + + const uintptr_t ptrint = reinterpret_cast(reinterpret_cast(ptr)); + return reinterpret_cast(reinterpret_cast((ptrint + (Alignment - 1)) / Alignment * Alignment)); +} /// xorshift64star Pseudo-Random Number Generator /// This class is based on original code written and dedicated diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp index b5dcd992..b0ed7d2f 100644 --- a/src/nnue/evaluate_nnue.cpp +++ b/src/nnue/evaluate_nnue.cpp @@ -25,6 +25,7 @@ #include "../position.h" #include "../misc.h" #include "../uci.h" +#include "../types.h" #include "evaluate_nnue.h" @@ -126,10 +127,28 @@ namespace Eval::NNUE { // Evaluation function. Perform differential calculation. 
Value evaluate(const Position& pos) { - alignas(kCacheLineSize) TransformedFeatureType - transformed_features[FeatureTransformer::kBufferSize]; + // We manually align the arrays on the stack because with gcc < 9.3 + // overaligning stack variables with alignas() doesn't work correctly. + + constexpr uint64_t alignment = kCacheLineSize; + +#if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN) + TransformedFeatureType transformed_features_unaligned[ + FeatureTransformer::kBufferSize + alignment / sizeof(TransformedFeatureType)]; + char buffer_unaligned[Network::kBufferSize + alignment]; + + auto* transformed_features = align_ptr_up(&transformed_features_unaligned[0]); + auto* buffer = align_ptr_up(&buffer_unaligned[0]); +#else + alignas(alignment) + TransformedFeatureType transformed_features[FeatureTransformer::kBufferSize]; + alignas(alignment) char buffer[Network::kBufferSize]; +#endif + + ASSERT_ALIGNED(transformed_features, alignment); + ASSERT_ALIGNED(buffer, alignment); + feature_transformer->Transform(pos, transformed_features); - alignas(kCacheLineSize) char buffer[Network::kBufferSize]; const auto output = network->Propagate(transformed_features, buffer); return static_cast(output[0] / FV_SCALE); diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h index 44d8a7de..7f6d67bf 100644 --- a/src/nnue/layers/clipped_relu.h +++ b/src/nnue/layers/clipped_relu.h @@ -74,12 +74,12 @@ namespace Eval::NNUE::Layers { const auto out = reinterpret_cast<__m256i*>(output); for (IndexType i = 0; i < kNumChunks; ++i) { const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32( - _mm256_loadA_si256(&in[i * 4 + 0]), - _mm256_loadA_si256(&in[i * 4 + 1])), kWeightScaleBits); + _mm256_load_si256(&in[i * 4 + 0]), + _mm256_load_si256(&in[i * 4 + 1])), kWeightScaleBits); const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32( - _mm256_loadA_si256(&in[i * 4 + 2]), - _mm256_loadA_si256(&in[i * 4 + 3])), kWeightScaleBits); - _mm256_storeA_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8( + _mm256_load_si256(&in[i * 4 + 2]), + _mm256_load_si256(&in[i * 4 + 3])), kWeightScaleBits); + _mm256_store_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8( _mm256_packs_epi16(words0, words1), kZero), kOffsets)); } constexpr IndexType kStart = kNumChunks * kSimdWidth; diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h index 8afea186..a9664262 100644 --- a/src/nnue/nnue_common.h +++ b/src/nnue/nnue_common.h @@ -43,29 +43,6 @@ #include #endif -// HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Otherwise a binary -// compiled with older g++ crashes because the output memory is not aligned -// even though alignas is specified. 
-#if defined(USE_AVX2) -#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32) && !defined(__clang__) -#define _mm256_loadA_si256 _mm256_loadu_si256 -#define _mm256_storeA_si256 _mm256_storeu_si256 -#else -#define _mm256_loadA_si256 _mm256_load_si256 -#define _mm256_storeA_si256 _mm256_store_si256 -#endif -#endif - -#if defined(USE_AVX512) -#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32) && !defined(__clang__) -#define _mm512_loadA_si512 _mm512_loadu_si512 -#define _mm512_storeA_si512 _mm512_storeu_si512 -#else -#define _mm512_loadA_si512 _mm512_load_si512 -#define _mm512_storeA_si512 _mm512_store_si512 -#endif -#endif - namespace Eval::NNUE { // Version of the evaluation file diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index f145c848..c3f012e4 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -36,16 +36,16 @@ namespace Eval::NNUE { #ifdef USE_AVX512 typedef __m512i vec_t; - #define vec_load(a) _mm512_loadA_si512(a) - #define vec_store(a,b) _mm512_storeA_si512(a,b) + #define vec_load(a) _mm512_load_si512(a) + #define vec_store(a,b) _mm512_store_si512(a,b) #define vec_add_16(a,b) _mm512_add_epi16(a,b) #define vec_sub_16(a,b) _mm512_sub_epi16(a,b) static constexpr IndexType kNumRegs = 8; // only 8 are needed #elif USE_AVX2 typedef __m256i vec_t; - #define vec_load(a) _mm256_loadA_si256(a) - #define vec_store(a,b) _mm256_storeA_si256(a,b) + #define vec_load(a) _mm256_load_si256(a) + #define vec_store(a,b) _mm256_store_si256(a,b) #define vec_add_16(a,b) _mm256_add_epi16(a,b) #define vec_sub_16(a,b) _mm256_sub_epi16(a,b) static constexpr IndexType kNumRegs = 16; @@ -157,11 +157,11 @@ namespace Eval::NNUE { #if defined(USE_AVX2) auto out = reinterpret_cast<__m256i*>(&output[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { - __m256i sum0 = _mm256_loadA_si256( + __m256i sum0 = _mm256_load_si256( &reinterpret_cast(accumulation[perspectives[p]][0])[j * 2 + 0]); - __m256i sum1 = _mm256_loadA_si256( + __m256i sum1 = _mm256_load_si256( &reinterpret_cast(accumulation[perspectives[p]][0])[j * 2 + 1]); - _mm256_storeA_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8( + _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8( _mm256_packs_epi16(sum0, sum1), kZero), kControl)); } diff --git a/src/position.cpp b/src/position.cpp index b707293d..5ce7da22 100644 --- a/src/position.cpp +++ b/src/position.cpp @@ -77,6 +77,8 @@ std::ostream& operator<<(std::ostream& os, const Position& pos) { && !pos.can_castle(ANY_CASTLING)) { StateInfo st; + ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize); + Position p; p.set(pos.fen(), pos.is_chess960(), &st, pos.this_thread()); Tablebases::ProbeState s1, s2; @@ -1318,6 +1320,8 @@ bool Position::pos_is_ok() const { assert(0 && "pos_is_ok: Bitboards"); StateInfo si = *st; + ASSERT_ALIGNED(&si, Eval::NNUE::kCacheLineSize); + set_state(&si); if (std::memcmp(&si, st, sizeof(StateInfo))) assert(0 && "pos_is_ok: State"); diff --git a/src/search.cpp b/src/search.cpp index 743449fa..12c32194 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -164,6 +164,8 @@ namespace { uint64_t perft(Position& pos, Depth depth) { StateInfo st; + ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize); + uint64_t cnt, nodes = 0; const bool leaf = (depth == 2); @@ -590,6 +592,8 @@ namespace { Move pv[MAX_PLY+1], capturesSearched[32], quietsSearched[64]; StateInfo st; + ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize); + TTEntry* tte; Key posKey; Move ttMove, move, excludedMove, 
bestMove; @@ -1403,6 +1407,8 @@ moves_loop: // When in check, search starts from here Move pv[MAX_PLY+1]; StateInfo st; + ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize); + TTEntry* tte; Key posKey; Move ttMove, move, bestMove; @@ -1898,6 +1904,8 @@ string UCI::pv(const Position& pos, Depth depth, Value alpha, Value beta) { bool RootMove::extract_ponder_from_tt(Position& pos) { StateInfo st; + ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize); + bool ttHit; assert(pv.size() == 1); diff --git a/src/types.h b/src/types.h index 5873c698..bf692f7e 100644 --- a/src/types.h +++ b/src/types.h @@ -57,6 +57,12 @@ /// _WIN32 Building on Windows (any) /// _WIN64 Building on Windows 64 bit +#if defined(__GNUC__ ) && (__GNUC__ < 9 || (__GNUC__ == 9 && __GNUC_MINOR__ <= 2)) && defined(_WIN32) && !defined(__clang__) +#define ALIGNAS_ON_STACK_VARIABLES_BROKEN +#endif + +#define ASSERT_ALIGNED(ptr, alignment) assert(reinterpret_cast(ptr) % alignment == 0) + #if defined(_WIN64) && defined(_MSC_VER) // No Makefile used # include // Microsoft header for _BitScanForward64() # define IS_64BIT From 04a320666efce725ef66d1a84aaef493a880153d Mon Sep 17 00:00:00 2001 From: Joost VandeVondele Date: Fri, 23 Oct 2020 07:39:35 +0200 Subject: [PATCH 20/27] Change handling the special case of a single legal move. Using no searching time in case of a single legal move is not beneficial from a strength point of view, and this special case can be easily removed: STC: LLR: 2.93 (-2.94,2.94) {-1.25,0.25} Total: 22472 W: 2458 L: 2357 D: 17657 Ptnml(0-2): 106, 1733, 7453, 1842, 102 https://tests.stockfishchess.org/tests/view/5f926cbc81eda81bd78cb6df LTC: LLR: 2.94 (-2.94,2.94) {-0.75,0.25} Total: 37880 W: 1736 L: 1682 D: 34462 Ptnml(0-2): 22, 1392, 16057, 1448, 21 https://tests.stockfishchess.org/tests/view/5f92a26081eda81bd78cb6fe The advantage of using the normal time management for a single legal move is that scores reported for that move are reasonable, not searching leads to artifacts during games (see e.g. https://tcec-chess.com/#div=sf&game=96&season=19) The disadvantage of using normal time management of a single legal move is that thinking times can be unnaturally long, making it 'painful to watch' in online tournaments. This patch uses normal time management, but caps the used time to 500ms. This should lead to reasonable scores, and be hardly perceptible. closes https://github.com/official-stockfish/Stockfish/pull/3195 closes https://github.com/official-stockfish/Stockfish/pull/3183 variant of a patch suggested by SFisGOD No functional change. --- src/search.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/search.cpp b/src/search.cpp index 12c32194..6e37fba1 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -521,10 +521,14 @@ void Thread::search() { } double bestMoveInstability = 1 + 2 * totBestMoveChanges / Threads.size(); - double totalTime = rootMoves.size() == 1 ? 0 : - Time.optimum() * fallingEval * reduction * bestMoveInstability; + double totalTime = Time.optimum() * fallingEval * reduction * bestMoveInstability; - // Stop the search if we have exceeded the totalTime, at least 1ms search + // Cap used time in case of a single legal move for a better viewer experience in tournaments + // yielding correct scores and sufficiently fast moves. 
+ if (rootMoves.size() == 1) + totalTime = std::min(500.0, totalTime); + + // Stop the search if we have exceeded the totalTime if (Time.elapsed() > totalTime) { // If we are allowed to ponder do not stop the search now but From 7fc47eeb6f6b5f3c5ff697e974093ff14413e42c Mon Sep 17 00:00:00 2001 From: FauziAkram Date: Thu, 5 Nov 2020 01:54:53 +0200 Subject: [PATCH 21/27] Introducing King On File this new concept calculates bonuses/penalties for the king when the king is in a semiopen or open file. Passed STC: LLR: 2.95 (-2.94,2.94) {-0.25,1.25} Total: 44904 W: 9365 L: 9028 D: 26511 Ptnml(0-2): 857, 5309, 9841, 5530, 915 https://tests.stockfishchess.org/tests/view/5fa343625d72639a7acef72b Passed LTC: LLR: 2.94 (-2.94,2.94) {0.25,1.25} Total: 60552 W: 8449 L: 8051 D: 44052 Ptnml(0-2): 466, 5772, 17481, 6012, 545 https://tests.stockfishchess.org/tests/view/5fa40e365d72639a7acef79e closes https://github.com/official-stockfish/Stockfish/pull/3219 Bench: 3689484 --- src/pawns.cpp | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/pawns.cpp b/src/pawns.cpp index a5102db8..fde70ba5 100644 --- a/src/pawns.cpp +++ b/src/pawns.cpp @@ -49,10 +49,10 @@ namespace { // Strength of pawn shelter for our king by [distance from edge][rank]. // RANK_1 = 0 is used for files where we have no pawn, or pawn is behind our king. constexpr Value ShelterStrength[int(FILE_NB) / 2][RANK_NB] = { - { V( -6), V( 81), V( 93), V( 58), V( 39), V( 18), V( 25) }, - { V(-43), V( 61), V( 35), V(-49), V(-29), V(-11), V( -63) }, - { V(-10), V( 75), V( 23), V( -2), V( 32), V( 3), V( -45) }, - { V(-39), V(-13), V(-29), V(-52), V(-48), V(-67), V(-166) } + { V( -5), V( 82), V( 92), V( 54), V( 36), V( 22), V( 28) }, + { V(-44), V( 63), V( 33), V(-50), V(-30), V(-12), V( -62) }, + { V(-11), V( 77), V( 22), V( -6), V( 31), V( 8), V( -45) }, + { V(-39), V(-12), V(-29), V(-50), V(-43), V(-68), V(-164) } }; // Danger of enemy pawns moving toward our king by [distance from edge][rank]. @@ -60,12 +60,17 @@ namespace { // is behind our king. Note that UnblockedStorm[0][1-2] accommodate opponent pawn // on edge, likely blocked by our king. constexpr Value UnblockedStorm[int(FILE_NB) / 2][RANK_NB] = { - { V( 85), V(-289), V(-166), V(97), V(50), V( 45), V( 50) }, - { V( 46), V( -25), V( 122), V(45), V(37), V(-10), V( 20) }, - { V( -6), V( 51), V( 168), V(34), V(-2), V(-22), V(-14) }, - { V(-15), V( -11), V( 101), V( 4), V(11), V(-15), V(-29) } + { V( 87), V(-288), V(-168), V( 96), V( 47), V( 44), V( 46) }, + { V( 42), V( -25), V( 120), V( 45), V( 34), V( -9), V( 24) }, + { V( -8), V( 51), V( 167), V( 35), V( -4), V(-16), V(-12) }, + { V(-17), V( -13), V( 100), V( 4), V( 9), V(-16), V(-31) } }; + // KingOnFile[semi-open Us][semi-open Them] contains bonuses/penalties + // for king when the king is on a semi-open or open file. + constexpr Score KingOnFile[2][2] = {{ S(-19,12), S(-6, 7) }, + { S( 0, 2), S( 6,-5) }}; + #undef S #undef V @@ -237,6 +242,9 @@ Score Entry::evaluate_shelter(const Position& pos, Square ksq) const { bonus -= make_score(UnblockedStorm[d][theirRank], 0); } + // King On File + bonus -= KingOnFile[pos.is_on_semiopen_file(Us, ksq)][pos.is_on_semiopen_file(Them, ksq)]; + return bonus; } From ba35c88ab84b959d41a67b3d8fcb40adc6537ec8 Mon Sep 17 00:00:00 2001 From: Tomasz Sobczyk Date: Tue, 3 Nov 2020 22:49:10 +0100 Subject: [PATCH 22/27] AVX-512 for smaller affine and feature transforms. 
For the feature transformer the code is analogical to AVX2 since there was room for easy adaptation of wider simd registers. For the smaller affine transforms that have 32 byte stride we keep 2 columns in one zmm register. We also unroll more aggressively so that in the end we have to do 16 parallel horizontal additions on ymm slices each consisting of 4 32-bit integers. The slices are embedded in 8 zmm registers. These changes provide about 1.5% speedup for AVX-512 builds. Closes https://github.com/official-stockfish/Stockfish/pull/3218 No functional change. --- src/nnue/layers/affine_transform.h | 129 +++++++++++++++++++++++++++- src/nnue/nnue_feature_transformer.h | 27 ++++-- 2 files changed, 148 insertions(+), 8 deletions(-) diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index f0292e45..47c9c488 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -83,7 +83,21 @@ namespace Eval::NNUE::Layers { return _mm512_reduce_add_epi32(sum) + bias; }; - [[maybe_unused]] auto m512_haddx4 = [](__m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3, __m128i bias) -> __m128i { + // This function takes + // sum0 = [xmm0a, xmm0b, xmm0c, xmm0d] + // sum1 = [xmm1a, xmm1b, xmm1c, xmm1d] + // sum2 = [xmm2a, xmm2b, xmm2c, xmm2d] + // sum3 = [xmm3a, xmm3b, xmm3c, xmm3d] + // and returns + // ret = [ + // reduce_add_epi32(xmm0a), reduce_add_epi32(xmm1a), reduce_add_epi32(xmm2a), reduce_add_epi32(xmm3a), + // reduce_add_epi32(xmm0b), reduce_add_epi32(xmm1b), reduce_add_epi32(xmm2b), reduce_add_epi32(xmm3b), + // reduce_add_epi32(xmm0c), reduce_add_epi32(xmm1c), reduce_add_epi32(xmm2c), reduce_add_epi32(xmm3c), + // reduce_add_epi32(xmm0d), reduce_add_epi32(xmm1d), reduce_add_epi32(xmm2d), reduce_add_epi32(xmm3d) + // ] + [[maybe_unused]] auto m512_hadd128x16_interleave = []( + __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3) -> __m512i { + __m512i sum01a = _mm512_unpacklo_epi32(sum0, sum1); __m512i sum01b = _mm512_unpackhi_epi32(sum0, sum1); @@ -96,7 +110,13 @@ namespace Eval::NNUE::Layers { __m512i sum0123a = _mm512_unpacklo_epi64(sum01, sum23); __m512i sum0123b = _mm512_unpackhi_epi64(sum01, sum23); - __m512i sum = _mm512_add_epi32(sum0123a, sum0123b); + return _mm512_add_epi32(sum0123a, sum0123b); + }; + + [[maybe_unused]] auto m512_haddx4 = [m512_hadd128x16_interleave]( + __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3, __m128i bias) -> __m128i { + + __m512i sum = m512_hadd128x16_interleave(sum0, sum1, sum2, sum3); __m256i sum256lo = _mm512_castsi512_si256(sum); __m256i sum256hi = _mm512_extracti64x4_epi64(sum, 1); @@ -109,6 +129,58 @@ namespace Eval::NNUE::Layers { return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias); }; + [[maybe_unused]] auto m512_haddx8 = [m512_hadd128x16_interleave]( + __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3, + __m512i sum4, __m512i sum5, __m512i sum6, __m512i sum7, __m256i bias) -> __m256i { + + __m512i suma = m512_hadd128x16_interleave(sum0, sum1, sum2, sum3); + __m512i sumb = m512_hadd128x16_interleave(sum4, sum5, sum6, sum7); + + __m512i indices0 = _mm512_setr_epi64(0, 1, 8, 9, 4, 5, 12, 13); + __m512i indices1 = _mm512_setr_epi64(2, 3, 10, 11, 6, 7, 14, 15); + __m512i x = _mm512_add_epi32( + _mm512_permutex2var_epi64(suma, indices0, sumb), + _mm512_permutex2var_epi64(suma, indices1, sumb)); + + __m256i sum256lo = _mm512_castsi512_si256(x); + __m256i sum256hi = _mm512_extracti64x4_epi64(x, 1); + + return _mm256_add_epi32(_mm256_add_epi32(sum256lo, sum256hi), bias); + 
}; + + [[maybe_unused]] auto m512_hadd256x8 =[m512_hadd128x16_interleave]( + __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3, __m256i bias) -> __m256i { + + __m512i sum = m512_hadd128x16_interleave(sum0, sum1, sum2, sum3); + + __m512i indices = _mm512_setr_epi32( + 0, 4, 8, 12, 2, 6, 10, 14, + 1, 5, 9, 13, 3, 7, 11, 15); + sum = _mm512_permutexvar_epi32(indices, sum); + + __m256i sum256lo = _mm512_castsi512_si256(sum); + __m256i sum256hi = _mm512_extracti64x4_epi64(sum, 1); + + return _mm256_add_epi32(_mm256_hadd_epi32(sum256lo, sum256hi), bias); + }; + + [[maybe_unused]] auto m512_hadd256x16 = [m512_hadd128x16_interleave]( + __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3, + __m512i sum4, __m512i sum5, __m512i sum6, __m512i sum7, __m512i bias) -> __m512i { + + __m512i suma = m512_hadd128x16_interleave(sum0, sum1, sum2, sum3); + __m512i sumb = m512_hadd128x16_interleave(sum4, sum5, sum6, sum7); + + __m512i indices0 = _mm512_setr_epi64(0, 1, 8, 9, 4, 5, 12, 13); + __m512i indices1 = _mm512_setr_epi64(2, 3, 10, 11, 6, 7, 14, 15); + __m512i x = _mm512_add_epi32( + _mm512_permutex2var_epi64(suma, indices0, sumb), + _mm512_permutex2var_epi64(suma, indices1, sumb)); + + __m512i indices = _mm512_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15); + return _mm512_add_epi32(_mm512_permutexvar_epi32(indices, x), bias); + }; + [[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) { #if defined (USE_VNNI) acc = _mm512_dpbusd_epi32(acc, a, b); @@ -205,7 +277,58 @@ namespace Eval::NNUE::Layers { // kOutputDimensions is either 1 or a multiple of kSimdWidth // because then it is also an input dimension. - if constexpr (kOutputDimensions % 4 == 0) + if constexpr (kOutputDimensions % 16 == 0 && kNumChunks256 == 1) + { + for (IndexType i = 0; i < kOutputDimensions; i += 16) + { + const IndexType offset01a = (i + 0) * kPaddedInputDimensions; + const IndexType offset23a = (i + 2) * kPaddedInputDimensions; + const IndexType offset45a = (i + 4) * kPaddedInputDimensions; + const IndexType offset67a = (i + 6) * kPaddedInputDimensions; + const IndexType offset01b = (i + 8) * kPaddedInputDimensions; + const IndexType offset23b = (i + 10) * kPaddedInputDimensions; + const IndexType offset45b = (i + 12) * kPaddedInputDimensions; + const IndexType offset67b = (i + 14) * kPaddedInputDimensions; + + const __m512i bias = *reinterpret_cast(&biases_[i]); + __m512i* outptr = reinterpret_cast<__m512i*>(&output[i]); + + __m512i sum01a = _mm512_setzero_si512(); + __m512i sum23a = _mm512_setzero_si512(); + __m512i sum45a = _mm512_setzero_si512(); + __m512i sum67a = _mm512_setzero_si512(); + __m512i sum01b = _mm512_setzero_si512(); + __m512i sum23b = _mm512_setzero_si512(); + __m512i sum45b = _mm512_setzero_si512(); + __m512i sum67b = _mm512_setzero_si512(); + + const auto row01a = *reinterpret_cast(&weights_[offset01a]); + const auto row23a = *reinterpret_cast(&weights_[offset23a]); + const auto row45a = *reinterpret_cast(&weights_[offset45a]); + const auto row67a = *reinterpret_cast(&weights_[offset67a]); + const auto row01b = *reinterpret_cast(&weights_[offset01b]); + const auto row23b = *reinterpret_cast(&weights_[offset23b]); + const auto row45b = *reinterpret_cast(&weights_[offset45b]); + const auto row67b = *reinterpret_cast(&weights_[offset67b]); + + const __m256i in256 = input_vector256[0]; + const __m512i in = _mm512_inserti64x4(_mm512_castsi256_si512(in256), in256, 1); + + m512_add_dpbusd_epi32(sum01a, in, row01a); + m512_add_dpbusd_epi32(sum23a, in, row23a); 
+ m512_add_dpbusd_epi32(sum45a, in, row45a); + m512_add_dpbusd_epi32(sum67a, in, row67a); + m512_add_dpbusd_epi32(sum01b, in, row01b); + m512_add_dpbusd_epi32(sum23b, in, row23b); + m512_add_dpbusd_epi32(sum45b, in, row45b); + m512_add_dpbusd_epi32(sum67b, in, row67b); + + *outptr = m512_hadd256x16( + sum01a, sum23a, sum45a, sum67a, + sum01b, sum23b, sum45b, sum67b, bias); + } + } + else if constexpr (kOutputDimensions % 4 == 0) { for (IndexType i = 0; i < kOutputDimensions; i += 4) { diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index c3f012e4..f49777b5 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -127,7 +127,13 @@ namespace Eval::NNUE { const auto& accumulation = pos.state()->accumulator.accumulation; - #if defined(USE_AVX2) + #if defined(USE_AVX512) + constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth * 2); + static_assert(kHalfDimensions % (kSimdWidth * 2) == 0); + const __m512i kControl = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); + const __m512i kZero = _mm512_setzero_si512(); + + #elif defined(USE_AVX2) constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth; constexpr int kControl = 0b11011000; const __m256i kZero = _mm256_setzero_si256(); @@ -154,13 +160,24 @@ namespace Eval::NNUE { for (IndexType p = 0; p < 2; ++p) { const IndexType offset = kHalfDimensions * p; - #if defined(USE_AVX2) + #if defined(USE_AVX512) + auto out = reinterpret_cast<__m512i*>(&output[offset]); + for (IndexType j = 0; j < kNumChunks; ++j) { + __m512i sum0 = _mm512_load_si512( + &reinterpret_cast(accumulation[perspectives[p]][0])[j * 2 + 0]); + __m512i sum1 = _mm512_load_si512( + &reinterpret_cast(accumulation[perspectives[p]][0])[j * 2 + 1]); + _mm512_store_si512(&out[j], _mm512_permutexvar_epi64(kControl, + _mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), kZero))); + } + + #elif defined(USE_AVX2) auto out = reinterpret_cast<__m256i*>(&output[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { __m256i sum0 = _mm256_load_si256( &reinterpret_cast(accumulation[perspectives[p]][0])[j * 2 + 0]); __m256i sum1 = _mm256_load_si256( - &reinterpret_cast(accumulation[perspectives[p]][0])[j * 2 + 1]); + &reinterpret_cast(accumulation[perspectives[p]][0])[j * 2 + 1]); _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8( _mm256_packs_epi16(sum0, sum1), kZero), kControl)); } @@ -177,9 +194,9 @@ namespace Eval::NNUE { _mm_store_si128(&out[j], #ifdef USE_SSE41 - _mm_max_epi8(packedbytes, kZero) + _mm_max_epi8(packedbytes, kZero) #else - _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s) + _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s) #endif ); From 32edb1d009e09a9442cb7393920e072ffd08005d Mon Sep 17 00:00:00 2001 From: SFisGOD Date: Sat, 7 Nov 2020 08:50:02 +0800 Subject: [PATCH 23/27] Update default net to nn-c3ca321c51c9.nnue Optimization of the net biases of the 32 x 32 layer and the output layer. 
Tuning of 32 x 32 layer (200k games, 5 seconds TC) https://tests.stockfishchess.org/tests/view/5f9aaf266a2c112b60691c68 STC: LLR: 2.95 (-2.94,2.94) {-0.25,1.25} Total: 41848 W: 4665 L: 4461 D: 32722 Ptnml(0-2): 239, 3308, 13659, 3446, 272 https://tests.stockfishchess.org/tests/view/5fa5ef5a936c54e11ec9954f LTC: LLR: 2.94 (-2.94,2.94) {0.25,1.25} Total: 88008 W: 4045 L: 3768 D: 80195 Ptnml(0-2): 69, 3339, 36908, 3622, 66 https://tests.stockfishchess.org/tests/view/5fa62a78936c54e11ec99577 closes https://github.com/official-stockfish/Stockfish/pull/3220 Bench: 3649288 --- src/evaluate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/evaluate.h b/src/evaluate.h index 6bec27db..06c66f71 100644 --- a/src/evaluate.h +++ b/src/evaluate.h @@ -36,7 +36,7 @@ namespace Eval { // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue // for the build process (profile-build and fishtest) to work. Do not change the // name of the macro, as it is used in the Makefile. - #define EvalFileDefaultName "nn-cb26f10b1fd9.nnue" + #define EvalFileDefaultName "nn-c3ca321c51c9.nnue" namespace NNUE { From 392b529c3f52103ad47ad096b86103c17758cb4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ste=CC=81phane=20Nicolet?= Date: Fri, 6 Nov 2020 19:20:27 +0100 Subject: [PATCH 24/27] Qsearch pruning: follow-up This is a follow-up of the recent qsearch pruning patch in https://github.com/official-stockfish/Stockfish/commit/a260c9a8a24a2630a900efc3821000c3481b0c5d We now use the same guard condition (testing that we already have a defense with a score better score than a TB loss) for all pruning heuristics in qsearch(). This allows some pruning when in check, but in a controlled way to ensure that no wrong mate scores appear. Tested with Elo-gaining bounds: STC: LLR: 2.97 (-2.94,2.94) {-0.25,1.25} Total: 22632 W: 2433 L: 2264 D: 17935 Ptnml(0-2): 98, 1744, 7487, 1865, 122 https://tests.stockfishchess.org/tests/view/5fa59405936c54e11ec99515 LTC: LLR: 2.94 (-2.94,2.94) {0.25,1.25} Total: 105432 W: 4965 L: 4648 D: 95819 Ptnml(0-2): 85, 4110, 44011, 4423, 87 https://tests.stockfishchess.org/tests/view/5fa5b609936c54e11ec9952a closes https://github.com/official-stockfish/Stockfish/pull/3221 Bench: 3578092 --- src/search.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/search.cpp b/src/search.cpp index 6e37fba1..b5b93bf0 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -1525,7 +1525,7 @@ moves_loop: // When in check, search starts from here moveCount++; // Futility pruning - if ( !ss->inCheck + if ( bestValue > VALUE_TB_LOSS_IN_MAX_PLY && !givesCheck && futilityBase > -VALUE_KNOWN_WIN && !pos.advanced_pawn_push(move)) @@ -1552,7 +1552,7 @@ moves_loop: // When in check, search starts from here } // Do not search moves with negative SEE values - if ( !ss->inCheck + if ( bestValue > VALUE_TB_LOSS_IN_MAX_PLY && !(givesCheck && pos.is_discovery_check_on_king(~pos.side_to_move(), move)) && !pos.see_ge(move)) continue; From b5781150ea8557e2030f8bc8b4eadede0ecec6bd Mon Sep 17 00:00:00 2001 From: lonfom169 <50217346+lonfom169@users.noreply.github.com> Date: Sun, 8 Nov 2020 23:43:32 -0300 Subject: [PATCH 25/27] Increase reduction based on the number of best move changes. Thanks to Vizvezdenec for the PvNode idea and also to vondele the !PvNode idea. 
Passed STC: LLR: 2.94 (-2.94,2.94) {-0.25,1.25} Total: 19120 W: 1998 L: 1839 D: 15283 Ptnml(0-2): 76, 1445, 6375, 1572, 92 https://tests.stockfishchess.org/tests/view/5fa8af3e67cbf42301d6a6c9 Passed LTC: LLR: 2.94 (-2.94,2.94) {0.25,1.25} Total: 75584 W: 3454 L: 3205 D: 68925 Ptnml(0-2): 54, 2832, 31771, 3081, 54 closes https://github.com/official-stockfish/Stockfish/pull/3224 Bench: 3595418 --- AUTHORS | 1 + src/search.cpp | 3 +++ 2 files changed, 4 insertions(+) diff --git a/AUTHORS b/AUTHORS index 198dfa5a..f0356090 100644 --- a/AUTHORS +++ b/AUTHORS @@ -19,6 +19,7 @@ Alain Savard (Rocky640) Alayan Feh (Alayan-stk-2) Alexander Kure Alexander Pagel (Lolligerhans) +Alfredo Menezes (lonfom169) Ali AlZhrani (Cooffe) Andrew Grant (AndyGrant) Andrey Neporada (nepal) diff --git a/src/search.cpp b/src/search.cpp index b5b93bf0..56b56733 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -1169,6 +1169,9 @@ moves_loop: // When in check, search starts from here if (ss->ttPv) r -= 2; + if (!PvNode && depth > 10 && thisThread->bestMoveChanges <= 2) + r++; + if (moveCountPruning && !formerPv) r++; From 285bf7041ad214156188823eb9118e6af7f4b2e4 Mon Sep 17 00:00:00 2001 From: SFisGOD Date: Tue, 10 Nov 2020 18:28:43 +0100 Subject: [PATCH 26/27] Increase reduction at root when the best move does not change frequently STC: LLR: 2.94 (-2.94,2.94) {-0.25,1.25} Total: 51320 W: 5159 L: 4956 D: 41205 Ptnml(0-2): 215, 3897, 17242, 4082, 224 https://tests.stockfishchess.org/tests/view/5faa072367cbf42301d6a767 LTC: LLR: 2.98 (-2.94,2.94) {0.25,1.25} Total: 15952 W: 762 L: 642 D: 14548 Ptnml(0-2): 8, 561, 6725, 667, 15 https://tests.stockfishchess.org/tests/view/5faa4c3567cbf42301d6a794 closes https://github.com/official-stockfish/Stockfish/pull/3225 Bench: 3954692 --- AUTHORS | 2 +- src/search.cpp | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/AUTHORS b/AUTHORS index f0356090..f30be4de 100644 --- a/AUTHORS +++ b/AUTHORS @@ -86,7 +86,7 @@ Jekaa Jerry Donald Watson (jerrydonaldwatson) jjoshua2 Jonathan Calovski (Mysseno) -Jonathan Dumale (SFisGOD) +Jonathan Buladas Dumale (SFisGOD) Joost VandeVondele (vondele) Jörg Oster (joergoster) Joseph Ellis (jhellis3) diff --git a/src/search.cpp b/src/search.cpp index 56b56733..66ef5043 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -1161,7 +1161,7 @@ moves_loop: // When in check, search starts from here if (thisThread->ttHitAverage > 509 * TtHitAverageResolution * TtHitAverageWindow / 1024) r--; - // Reduction if other threads are searching this position + // Increase reduction if other threads are searching this position if (th.marked()) r++; @@ -1169,7 +1169,8 @@ moves_loop: // When in check, search starts from here if (ss->ttPv) r -= 2; - if (!PvNode && depth > 10 && thisThread->bestMoveChanges <= 2) + // Increase reduction at root and non-PV nodes when the best move does not change frequently + if ((rootNode || !PvNode) && depth > 10 && thisThread->bestMoveChanges <= 2) r++; if (moveCountPruning && !formerPv) From f9595828eb7e5e970b0be3ee5f84ddd726845523 Mon Sep 17 00:00:00 2001 From: FauziAkram Date: Wed, 11 Nov 2020 20:56:29 +0200 Subject: [PATCH 27/27] Rook Mobility Tweak Passed STC: LLR: 2.94 (-2.94,2.94) {-0.25,1.25} Total: 171152 W: 34715 L: 34202 D: 102235 Ptnml(0-2): 3278, 20155, 38228, 20606, 3309 https://tests.stockfishchess.org/tests/view/5fa861f467cbf42301d6a68e Passed LTC: LLR: 2.94 (-2.94,2.94) {0.25,1.25} Total: 149616 W: 20471 L: 19882 D: 109263 Ptnml(0-2): 1172, 14434, 43102, 14833, 1267 
https://tests.stockfishchess.org/tests/view/5fa9c8ff67cbf42301d6a74f closes https://github.com/official-stockfish/Stockfish/pull/3226 Bench: 3597730 --- src/evaluate.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/evaluate.cpp b/src/evaluate.cpp index 4ade46fa..34ebe6c3 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -212,9 +212,9 @@ namespace { { S(-47,-59), S(-20,-25), S( 14, -8), S( 29, 12), S( 39, 21), S( 53, 40), // Bishop S( 53, 56), S( 60, 58), S( 62, 65), S( 69, 72), S( 78, 78), S( 83, 87), S( 91, 88), S( 96, 98) }, - { S(-61,-82), S(-20,-17), S( 2, 23) ,S( 3, 40), S( 4, 72), S( 11,100), // Rook - S( 22,104), S( 31,120), S( 39,134), S(40 ,138), S( 41,158), S( 47,163), - S( 59,168), S( 60,169), S( 64,173) }, + { S(-60,-82), S(-24,-15), S( 0, 17) ,S( 3, 43), S( 4, 72), S( 14,100), // Rook + S( 20,102), S( 30,122), S( 41,133), S(41 ,139), S( 41,153), S( 45,160), + S( 57,165), S( 58,170), S( 67,175) }, { S(-29,-49), S(-16,-29), S( -8, -8), S( -8, 17), S( 18, 39), S( 25, 54), // Queen S( 23, 59), S( 37, 73), S( 41, 76), S( 54, 95), S( 65, 95) ,S( 68,101), S( 69,124), S( 70,128), S( 70,132), S( 70,133) ,S( 71,136), S( 72,140),