From 651ec3b31ee68db50f38ccd8fcdedbd6673cd9ed Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Mon, 10 Aug 2020 07:18:15 +0200
Subject: [PATCH 01/58] Revert "Avoid special casing for MinGW"

This reverts commit a6e89293df5af35931b61d86b6de3872a981c100.

The offending setup has been found as gcc/mingw 7.3 (on Ubuntu 18.04).

fixes https://github.com/official-stockfish/Stockfish/issues/2963

closes https://github.com/official-stockfish/Stockfish/issues/2968

No functional change.
---
 src/nnue/layers/affine_transform.h  | 29 ++++++++++++++---
 src/nnue/layers/clipped_relu.h      | 49 +++++++++++++++++++++++++----
 src/nnue/nnue_feature_transformer.h | 34 ++++++++++++++++++--
 3 files changed, 98 insertions(+), 14 deletions(-)
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index ecc3008a..b585bc87 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -104,8 +104,13 @@ namespace Eval::NNUE::Layers {
         __m512i sum = _mm512_setzero_si512();
         const auto row = reinterpret_cast<const __m512i*>(&weights_[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
-            __m512i product = _mm512_maddubs_epi16(
-              _mm512_load_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
+
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+            __m512i product = _mm512_maddubs_epi16(_mm512_loadu_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
+  #else
+            __m512i product = _mm512_maddubs_epi16(_mm512_load_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
+  #endif
+
             product = _mm512_madd_epi16(product, kOnes);
             sum = _mm512_add_epi32(sum, product);
         }
@@ -120,8 +125,12 @@ namespace Eval::NNUE::Layers {
             const auto row_256 = reinterpret_cast<const __m256i*>(&weights_[offset]);
             int j = kNumChunks * 2;
 
-            __m256i sum256 = _mm256_maddubs_epi16(
-              _mm256_load_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
+  #if defined(__MINGW32__) || defined(__MINGW64__)  // See HACK comment below in AVX2.
+            __m256i sum256 = _mm256_maddubs_epi16(_mm256_loadu_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
+  #else
+            __m256i sum256 = _mm256_maddubs_epi16(_mm256_load_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
+  #endif
+
             sum256 = _mm256_madd_epi16(sum256, _mm256_set1_epi16(1));
             sum256 = _mm256_hadd_epi32(sum256, sum256);
             sum256 = _mm256_hadd_epi32(sum256, sum256);
@@ -135,7 +144,17 @@ namespace Eval::NNUE::Layers {
         const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
           __m256i product = _mm256_maddubs_epi16(
-            _mm256_load_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
+
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+            // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary
+            //       compiled with g++ in MSYS2 crashes here because the output memory is not aligned
+            //       even though alignas is specified.
+            _mm256_loadu_si256
+  #else
+            _mm256_load_si256
+  #endif
+
+            (&input_vector[j]), _mm256_load_si256(&row[j]));
           product = _mm256_madd_epi16(product, kOnes);
           sum = _mm256_add_epi32(sum, product);
         }
diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h
index 7e5fcf4a..7ade598f 100644
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -74,13 +74,50 @@ namespace Eval::NNUE::Layers {
       const auto out = reinterpret_cast<__m256i*>(output);
       for (IndexType i = 0; i < kNumChunks; ++i) {
         const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
-          _mm256_load_si256(&in[i * 4 + 0]),
-          _mm256_load_si256(&in[i * 4 + 1])), kWeightScaleBits);
+
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+          // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary
+          //       compiled with g++ in MSYS2 crashes here because the output memory is not aligned
+          //       even though alignas is specified.
+          _mm256_loadu_si256
+  #else
+          _mm256_load_si256
+  #endif
+
+          (&in[i * 4 + 0]),
+
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+          _mm256_loadu_si256
+  #else
+          _mm256_load_si256
+  #endif
+
+          (&in[i * 4 + 1])), kWeightScaleBits);
         const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
-          _mm256_load_si256(&in[i * 4 + 2]),
-          _mm256_load_si256(&in[i * 4 + 3])), kWeightScaleBits);
-        _mm256_store_si256(
-            &out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
+
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+          _mm256_loadu_si256
+  #else
+          _mm256_load_si256
+  #endif
+
+          (&in[i * 4 + 2]),
+
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+          _mm256_loadu_si256
+  #else
+          _mm256_load_si256
+  #endif
+
+          (&in[i * 4 + 3])), kWeightScaleBits);
+
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+        _mm256_storeu_si256
+  #else
+        _mm256_store_si256
+  #endif
+
+          (&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
             _mm256_packs_epi16(words0, words1), kZero), kOffsets));
       }
       constexpr IndexType kStart = kNumChunks * kSimdWidth;
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index f899d761..1cfebbe4 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -110,12 +110,36 @@ namespace Eval::NNUE {
         auto out = reinterpret_cast<__m256i*>(&output[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
           __m256i sum0 =
-            _mm256_load_si256(&reinterpret_cast<const __m256i*>(
+
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+            // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary
+            //       compiled with g++ in MSYS2 crashes here because the output memory is not aligned
+            //       even though alignas is specified.
+            _mm256_loadu_si256
+  #else
+            _mm256_load_si256
+  #endif
+
+            (&reinterpret_cast<const __m256i*>(
               accumulation[perspectives[p]][0])[j * 2 + 0]);
           __m256i sum1 =
-            _mm256_load_si256(&reinterpret_cast<const __m256i*>(
+
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+            _mm256_loadu_si256
+  #else
+            _mm256_load_si256
+  #endif
+
+            (&reinterpret_cast<const __m256i*>(
               accumulation[perspectives[p]][0])[j * 2 + 1]);
-          _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
+
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+          _mm256_storeu_si256
+  #else
+          _mm256_store_si256
+  #endif
+
+          (&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
               _mm256_packs_epi16(sum0, sum1), kZero), kControl));
         }
 
@@ -178,7 +202,11 @@ namespace Eval::NNUE {
           auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
           constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
           for (IndexType j = 0; j < kNumChunks; ++j) {
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+            _mm256_storeu_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadu_si256(&accumulation[j]), column[j]));
+  #else
             accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]);
+  #endif
           }
 
   #elif defined(USE_SSE2)

From bcdf41dadc8a5f8a23116236a0f449a08b46dc6b Mon Sep 17 00:00:00 2001
From: Sergio Vieri <sergio.vieri.hp@gmail.com>
Date: Mon, 10 Aug 2020 08:47:52 +0800
Subject: [PATCH 02/58] Update default net to nn-112bb1c8cdb5.nnue

First trained net using search eval instead of pv leaf static eval.

Net created at: 20200810-0744

passed STC: https://tests.stockfishchess.org/tests/view/5f30995d90816720665373f8
LLR: 2.93 (-2.94,2.94) {-0.50,1.50}
Total: 15416 W: 2071 L: 1920 D: 11425
Ptnml(0-2): 123, 1376, 4563, 1519, 127

passed LTC: https://tests.stockfishchess.org/tests/view/5f30a104908167206653742b
LLR: 2.93 (-2.94,2.94) {0.25,1.75}
Total: 29792 W: 2003 L: 1834 D: 25955
Ptnml(0-2): 50, 1541, 11550, 1700, 55

closes https://github.com/official-stockfish/Stockfish/pull/2966

Bench: 4084753
---
 src/ucioption.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index faeb78ae..b0689d6d 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -79,7 +79,7 @@ void init(OptionsMap& o) {
   o["Syzygy50MoveRule"]      << Option(true);
   o["SyzygyProbeLimit"]      << Option(7, 0, 7);
   o["Use NNUE"]              << Option(false, on_use_NNUE);
-  o["EvalFile"]              << Option("nn-9931db908a9b.nnue", on_eval_file);
+  o["EvalFile"]              << Option("nn-112bb1c8cdb5.nnue", on_eval_file);
 }
 
 

From a54f9011c3bf3581fe7daffef6be2d586e6662c1 Mon Sep 17 00:00:00 2001
From: jjoshua2 <jjoshua2@gmail.com>
Date: Sun, 9 Aug 2020 16:16:04 -0400
Subject: [PATCH 03/58] simplying hybrid condition

STC https://tests.stockfishchess.org/tests/view/5f3059d1908167206653736b:
LLR: 2.94 (-2.94,2.94) {-1.50,0.50}
Total: 12520 W: 766 L: 727 D: 11027
Ptnml(0-2): 13, 624, 4949, 659, 15

LTC: https://tests.stockfishchess.org/tests/view/5f30863a90816720665373d1
LLR: 2.94 (-2.94,2.94) {-1.50,0.50}
Total: 12520 W: 766 L: 727 D: 11027
Ptnml(0-2): 13, 624, 4949, 659, 15

closes: https://github.com/official-stockfish/Stockfish/pull/2965

Bench: 4084753
---
 src/evaluate.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index ce35c630..caab2979 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -114,7 +114,7 @@ namespace {
   constexpr Value LazyThreshold1 =  Value(1400);
   constexpr Value LazyThreshold2 =  Value(1300);
   constexpr Value SpaceThreshold = Value(12222);
-  constexpr Value NNUEThreshold  =   Value(460);
+  constexpr Value NNUEThreshold  =   Value(575);
 
   // KingAttackWeights[PieceType] contains king attack weights by piece type
   constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 81, 52, 44, 10 };
@@ -945,7 +945,7 @@ Value Eval::evaluate(const Position& pos) {
   {
       Value v = eg_value(pos.psq_score());
       // Take NNUE eval only on balanced positions
-      if (abs(v) < NNUEThreshold + 20 * pos.count<PAWN>())
+      if (abs(v) < NNUEThreshold)
          return NNUE::evaluate(pos) + Tempo;
   }
   return Evaluation<NO_TRACE>(pos).value();

From 875183b310a8249922c2155e82cb4cecfae2097e Mon Sep 17 00:00:00 2001
From: mstembera <MissingEmail@email>
Date: Sun, 9 Aug 2020 23:50:59 -0700
Subject: [PATCH 04/58] Workaround using unaligned loads for gcc < 9

despite usage of alignas, the generated (avx2/avx512) code with older compilers needs to use
unaligned loads with older gcc (e.g. confirmed crash with gcc 7.3/mingw on abrok).

Better performance thus requires gcc >= 9 on hardware supporting avx2/avx512

closes https://github.com/official-stockfish/Stockfish/pull/2969

No functional change
---
 src/nnue/layers/affine_transform.h  | 32 +++----------------
 src/nnue/layers/clipped_relu.h      | 48 +++--------------------------
 src/nnue/nnue_common.h              | 21 +++++++++++++
 src/nnue/nnue_feature_transformer.h | 42 ++++---------------------
 4 files changed, 36 insertions(+), 107 deletions(-)

diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index b585bc87..20ec2f12 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -104,13 +104,7 @@ namespace Eval::NNUE::Layers {
         __m512i sum = _mm512_setzero_si512();
         const auto row = reinterpret_cast<const __m512i*>(&weights_[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-            __m512i product = _mm512_maddubs_epi16(_mm512_loadu_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
-  #else
-            __m512i product = _mm512_maddubs_epi16(_mm512_load_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
-  #endif
-
+            __m512i product = _mm512_maddubs_epi16(_mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
             product = _mm512_madd_epi16(product, kOnes);
             sum = _mm512_add_epi32(sum, product);
         }
@@ -124,13 +118,7 @@ namespace Eval::NNUE::Layers {
             const auto iv_256  = reinterpret_cast<const __m256i*>(input);
             const auto row_256 = reinterpret_cast<const __m256i*>(&weights_[offset]);
             int j = kNumChunks * 2;
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)  // See HACK comment below in AVX2.
-            __m256i sum256 = _mm256_maddubs_epi16(_mm256_loadu_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
-  #else
-            __m256i sum256 = _mm256_maddubs_epi16(_mm256_load_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
-  #endif
-
+            __m256i sum256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
             sum256 = _mm256_madd_epi16(sum256, _mm256_set1_epi16(1));
             sum256 = _mm256_hadd_epi32(sum256, sum256);
             sum256 = _mm256_hadd_epi32(sum256, sum256);
@@ -143,18 +131,7 @@ namespace Eval::NNUE::Layers {
         __m256i sum = _mm256_setzero_si256();
         const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m256i product = _mm256_maddubs_epi16(
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-            // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary
-            //       compiled with g++ in MSYS2 crashes here because the output memory is not aligned
-            //       even though alignas is specified.
-            _mm256_loadu_si256
-  #else
-            _mm256_load_si256
-  #endif
-
-            (&input_vector[j]), _mm256_load_si256(&row[j]));
+          __m256i product = _mm256_maddubs_epi16(_mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
           product = _mm256_madd_epi16(product, kOnes);
           sum = _mm256_add_epi32(sum, product);
         }
@@ -168,8 +145,7 @@ namespace Eval::NNUE::Layers {
         __m128i sum = _mm_cvtsi32_si128(biases_[i]);
         const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m128i product = _mm_maddubs_epi16(
-              _mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j]));
+          __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j]));
           product = _mm_madd_epi16(product, kOnes);
           sum = _mm_add_epi32(sum, product);
         }
diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h
index 7ade598f..13196ec2 100644
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -74,50 +74,12 @@ namespace Eval::NNUE::Layers {
       const auto out = reinterpret_cast<__m256i*>(output);
       for (IndexType i = 0; i < kNumChunks; ++i) {
         const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-          // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary
-          //       compiled with g++ in MSYS2 crashes here because the output memory is not aligned
-          //       even though alignas is specified.
-          _mm256_loadu_si256
-  #else
-          _mm256_load_si256
-  #endif
-
-          (&in[i * 4 + 0]),
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-          _mm256_loadu_si256
-  #else
-          _mm256_load_si256
-  #endif
-
-          (&in[i * 4 + 1])), kWeightScaleBits);
+            _mm256_loadA_si256(&in[i * 4 + 0]),
+            _mm256_loadA_si256(&in[i * 4 + 1])), kWeightScaleBits);
         const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-          _mm256_loadu_si256
-  #else
-          _mm256_load_si256
-  #endif
-
-          (&in[i * 4 + 2]),
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-          _mm256_loadu_si256
-  #else
-          _mm256_load_si256
-  #endif
-
-          (&in[i * 4 + 3])), kWeightScaleBits);
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-        _mm256_storeu_si256
-  #else
-        _mm256_store_si256
-  #endif
-
-          (&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
+            _mm256_loadA_si256(&in[i * 4 + 2]),
+            _mm256_loadA_si256(&in[i * 4 + 3])), kWeightScaleBits);
+        _mm256_storeA_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
             _mm256_packs_epi16(words0, words1), kZero), kOffsets));
       }
       constexpr IndexType kStart = kNumChunks * kSimdWidth;
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index 972ef3e5..e7ce84f7 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -37,6 +37,27 @@
 #include <arm_neon.h>
 #endif
 
+// HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Otherwise a binary
+//       compiled with older g++ crashes because the output memory is not aligned
+//       even though alignas is specified.
+#if defined(USE_AVX2)
+#if defined(__GNUC__ ) && (__GNUC__ < 9)
+#define _mm256_loadA_si256  _mm256_loadu_si256
+#define _mm256_storeA_si256 _mm256_storeu_si256
+#else
+#define _mm256_loadA_si256  _mm256_load_si256
+#define _mm256_storeA_si256 _mm256_store_si256
+#endif
+#endif
+
+#if defined(USE_AVX512)
+#if defined(__GNUC__ ) && (__GNUC__ < 9)
+#define _mm512_loadA_si512  _mm512_loadu_si512
+#else
+#define _mm512_loadA_si512  _mm512_load_si512
+#endif
+#endif
+
 namespace Eval::NNUE {
 
   // Version of the evaluation file
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 1cfebbe4..cbcc26f3 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -109,37 +109,11 @@ namespace Eval::NNUE {
   #if defined(USE_AVX2)
         auto out = reinterpret_cast<__m256i*>(&output[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m256i sum0 =
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-            // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary
-            //       compiled with g++ in MSYS2 crashes here because the output memory is not aligned
-            //       even though alignas is specified.
-            _mm256_loadu_si256
-  #else
-            _mm256_load_si256
-  #endif
-
-            (&reinterpret_cast<const __m256i*>(
-              accumulation[perspectives[p]][0])[j * 2 + 0]);
-          __m256i sum1 =
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-            _mm256_loadu_si256
-  #else
-            _mm256_load_si256
-  #endif
-
-            (&reinterpret_cast<const __m256i*>(
-              accumulation[perspectives[p]][0])[j * 2 + 1]);
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-          _mm256_storeu_si256
-  #else
-          _mm256_store_si256
-  #endif
-
-          (&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
+          __m256i sum0 = _mm256_loadA_si256(
+              &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
+          __m256i sum1 = _mm256_loadA_si256(
+            &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
+          _mm256_storeA_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
               _mm256_packs_epi16(sum0, sum1), kZero), kControl));
         }
 
@@ -202,11 +176,7 @@ namespace Eval::NNUE {
           auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
           constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
           for (IndexType j = 0; j < kNumChunks; ++j) {
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-            _mm256_storeu_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadu_si256(&accumulation[j]), column[j]));
-  #else
-            accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]);
-  #endif
+            _mm256_storeA_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadA_si256(&accumulation[j]), column[j]));
           }
 
   #elif defined(USE_SSE2)

From ad2ad4c65706c18a5383506d361f1f23fc6a26ab Mon Sep 17 00:00:00 2001
From: SFisGOD <jonathandumale@gmail.com>
Date: Mon, 10 Aug 2020 15:39:22 +0800
Subject: [PATCH 05/58] Modify castling extension

Extend castling only if there are few friendly pieces on the castling side.

Inspired by silversolver1's (Rahul Dsilva) test
https://tests.stockfishchess.org/tests/view/5f0fef560640035f9d2978cf

STC:
LLR: 2.94 (-2.94,2.94) {-0.50,1.50}
Total: 7096 W: 947 L: 818 D: 5331
Ptnml(0-2): 32, 604, 2181, 665, 66
https://tests.stockfishchess.org/tests/view/5f309f729081672066537426

LTC:
LLR: 2.96 (-2.94,2.94) {0.25,1.75}
Total: 4712 W: 300 L: 215 D: 4197
Ptnml(0-2): 2, 190, 1895, 259, 10
https://tests.stockfishchess.org/tests/view/5f30a2039081672066537430

closes https://github.com/official-stockfish/Stockfish/pull/2970

Bench: 4094850
---
 src/search.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/search.cpp b/src/search.cpp
index 0a2519b6..3d2bb422 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1131,7 +1131,8 @@ moves_loop: // When in check, search starts from here
           extension = 1;
 
       // Castling extension
-      if (type_of(move) == CASTLING)
+      if (   type_of(move) == CASTLING
+          && popcount(pos.pieces(us) & ~pos.pieces(PAWN) & (to_sq(move) & KingSide ? KingSide : QueenSide)) <= 3)
           extension = 1;
 
       // Late irreversible move extension

From cb0504028e8830dbc71be53cbd701d78c3d562a1 Mon Sep 17 00:00:00 2001
From: sf-x <sf-x@users.noreply.github.com>
Date: Sun, 9 Aug 2020 18:01:18 +0300
Subject: [PATCH 06/58] Makefile rework/cleanup

Makefile targets x86-64-sse42, x86-sse3 are removed; x86-64-sse41
is renamed to x86-64-sse41-popcnt (it did enable popcnt).

Makefile variables sse3, sse42, their associated compilation flags
and code in misc.cpp are removed.

closes https://github.com/official-stockfish/Stockfish/pull/2922

No functional change
---
 src/Makefile | 58 +++-------------------------------------------------
 src/misc.cpp |  6 ------
 2 files changed, 3 insertions(+), 61 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 571172b2..a48e7dcb 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -68,10 +68,8 @@ endif
 # prefetch = yes/no   --- -DUSE_PREFETCH   --- Use prefetch asm-instruction
 # popcnt = yes/no     --- -DUSE_POPCNT     --- Use popcnt asm-instruction
 # sse = yes/no        --- -msse            --- Use Intel Streaming SIMD Extensions
-# sse3 = yes/no       --- -msse3           --- Use Intel Streaming SIMD Extensions 3
 # ssse3 = yes/no      --- -mssse3          --- Use Intel Supplemental Streaming SIMD Extensions 3
 # sse41 = yes/no      --- -msse4.1         --- Use Intel Streaming SIMD Extensions 4.1
-# sse42 = yes/no      --- -msse4.2         --- Use Intel Streaming SIMD Extensions 4.2
 # avx2 = yes/no       --- -mavx2           --- Use Intel Advanced Vector Extensions 2
 # pext = yes/no       --- -DUSE_PEXT       --- Use pext x86_64 asm-instruction
 # avx512 = yes/no     --- -mavx512bw       --- Use Intel Advanced Vector Extensions 512
@@ -89,10 +87,8 @@ bits = 64
 prefetch = no
 popcnt = no
 sse = no
-sse3 = no
 ssse3 = no
 sse41 = no
-sse42 = no
 avx2 = no
 pext = no
 avx512 = no
@@ -127,18 +123,10 @@ ifeq ($(ARCH),x86-64)
 	sse = yes
 endif
 
-ifeq ($(ARCH),x86-64-sse3)
-	arch = x86_64
-	prefetch = yes
-	sse = yes
-	sse3 = yes
-endif
-
 ifeq ($(ARCH),x86-64-sse3-popcnt)
 	arch = x86_64
 	prefetch = yes
 	sse = yes
-	sse3 = yes
 	popcnt = yes
 endif
 
@@ -146,39 +134,25 @@ ifeq ($(ARCH),x86-64-ssse3)
 	arch = x86_64
 	prefetch = yes
 	sse = yes
-	sse3 = yes
 	ssse3 = yes
 endif
 
-ifeq ($(ARCH),x86-64-sse41)
-	arch = x86_64
-	prefetch = yes
-	popcnt = yes
-	sse = yes
-	sse3 = yes
-	ssse3 = yes
-	sse41 = yes
-endif
-
 ifeq ($(ARCH),x86-64-modern)
 	arch = x86_64
 	prefetch = yes
 	popcnt = yes
 	sse = yes
-	sse3 = yes
 	ssse3 = yes
 	sse41 = yes
 endif
 
-ifeq ($(ARCH),x86-64-sse42)
+ifeq ($(ARCH),x86-64-sse41-popcnt)
 	arch = x86_64
 	prefetch = yes
 	popcnt = yes
 	sse = yes
-	sse3 = yes
 	ssse3 = yes
 	sse41 = yes
-	sse42 = yes
 endif
 
 ifeq ($(ARCH),x86-64-avx2)
@@ -186,10 +160,8 @@ ifeq ($(ARCH),x86-64-avx2)
 	prefetch = yes
 	popcnt = yes
 	sse = yes
-	sse3 = yes
 	ssse3 = yes
 	sse41 = yes
-	sse42 = yes
 	avx2 = yes
 endif
 
@@ -198,10 +170,8 @@ ifeq ($(ARCH),x86-64-bmi2)
 	prefetch = yes
 	popcnt = yes
 	sse = yes
-	sse3 = yes
 	ssse3 = yes
 	sse41 = yes
-	sse42 = yes
 	avx2 = yes
 	pext = yes
 endif
@@ -211,10 +181,8 @@ ifeq ($(ARCH),x86-64-avx512)
 	prefetch = yes
 	popcnt = yes
 	sse = yes
-	sse3 = yes
 	ssse3 = yes
 	sse41 = yes
-	sse42 = yes
 	avx2 = yes
 	pext = yes
 	avx512 = yes
@@ -450,13 +418,6 @@ ifeq ($(avx512),yes)
 	endif
 endif
 
-ifeq ($(sse42),yes)
-	CXXFLAGS += -DUSE_SSE42
-	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
-		CXXFLAGS += -msse4.2
-	endif
-endif
-
 ifeq ($(sse41),yes)
 	CXXFLAGS += -DUSE_SSE41
 	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
@@ -471,13 +432,6 @@ ifeq ($(ssse3),yes)
 	endif
 endif
 
-ifeq ($(sse3),yes)
-	CXXFLAGS += -DUSE_SSE3
-	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
-		CXXFLAGS += -msse3
-	endif
-endif
-
 ifeq ($(neon),yes)
 	CXXFLAGS += -DUSE_NEON
 endif
@@ -557,12 +511,10 @@ help:
 	@echo "x86-64-avx512           > x86 64-bit with avx512 support"
 	@echo "x86-64-bmi2             > x86 64-bit with bmi2 support"
 	@echo "x86-64-avx2             > x86 64-bit with avx2 support"
-	@echo "x86-64-sse42            > x86 64-bit with sse42 support"
-	@echo "x86-64-modern           > x86 64-bit with sse41 support (x86-64-sse41)"
-	@echo "x86-64-sse41            > x86 64-bit with sse41 support"
+	@echo "x86-64-sse41-popcnt     > x86 64-bit with sse41 and popcnt support"
+	@echo "x86-64-modern           > the same as previous (x86-64-sse41-popcnt)"
 	@echo "x86-64-ssse3            > x86 64-bit with ssse3 support"
 	@echo "x86-64-sse3-popcnt      > x86 64-bit with sse3 and popcnt support"
-	@echo "x86-64-sse3             > x86 64-bit with sse3 support"
 	@echo "x86-64                  > x86 64-bit generic"
 	@echo "x86-32                  > x86 32-bit (also enables SSE)"
 	@echo "x86-32-old              > x86 32-bit fall back for old hardware"
@@ -669,10 +621,8 @@ config-sanity:
 	@echo "prefetch: '$(prefetch)'"
 	@echo "popcnt: '$(popcnt)'"
 	@echo "sse: '$(sse)'"
-	@echo "sse3: '$(sse3)'"
 	@echo "ssse3: '$(ssse3)'"
 	@echo "sse41: '$(sse41)'"
-	@echo "sse42: '$(sse42)'"
 	@echo "avx2: '$(avx2)'"
 	@echo "pext: '$(pext)'"
 	@echo "avx512: '$(avx512)'"
@@ -695,10 +645,8 @@ config-sanity:
 	@test "$(prefetch)" = "yes" || test "$(prefetch)" = "no"
 	@test "$(popcnt)" = "yes" || test "$(popcnt)" = "no"
 	@test "$(sse)" = "yes" || test "$(sse)" = "no"
-	@test "$(sse3)" = "yes" || test "$(sse3)" = "no"
 	@test "$(ssse3)" = "yes" || test "$(ssse3)" = "no"
 	@test "$(sse41)" = "yes" || test "$(sse41)" = "no"
-	@test "$(sse42)" = "yes" || test "$(sse42)" = "no"
 	@test "$(avx2)" = "yes" || test "$(avx2)" = "no"
 	@test "$(pext)" = "yes" || test "$(pext)" = "no"
 	@test "$(avx512)" = "yes" || test "$(avx512)" = "no"
diff --git a/src/misc.cpp b/src/misc.cpp
index bdd7bccb..5061ae13 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -220,17 +220,11 @@ const std::string compiler_info() {
   #if defined(USE_AVX2)
     compiler += " AVX2";
   #endif
-  #if defined(USE_SSE42)
-    compiler += " SSE42";
-  #endif
   #if defined(USE_SSE41)
     compiler += " SSE41";
   #endif
   #if defined(USE_SSSE3)
     compiler += " SSSE3";
-  #endif
-  #if defined(USE_SSE3)
-    compiler += " SSE3";
   #endif
     compiler += (HasPext ? " BMI2" : "");
     compiler += (HasPopCnt ? " POPCNT" : "");

From f948cd008d3a289ebbadc463271f84888e8069ba Mon Sep 17 00:00:00 2001
From: mstembera <MissingEmail@email>
Date: Sun, 9 Aug 2020 16:23:33 -0700
Subject: [PATCH 07/58] Cleanup and optimize SSE/AVX code

AVX512 +4% faster
AVX2 +1% faster
SSSE3 +5% faster

passed non-regression STC:
STC https://tests.stockfishchess.org/tests/view/5f31249f90816720665374f6
LLR: 2.96 (-2.94,2.94) {-1.50,0.50}
Total: 17576 W: 2344 L: 2245 D: 12987
Ptnml(0-2): 127, 1570, 5292, 1675, 124

closes https://github.com/official-stockfish/Stockfish/pull/2962

No functional change
---
 src/nnue/layers/affine_transform.h  | 46 +++++++++++++++--------------
 src/nnue/nnue_accumulator.h         |  2 +-
 src/nnue/nnue_common.h              |  6 ++--
 src/nnue/nnue_feature_transformer.h | 21 +++++++------
 4 files changed, 41 insertions(+), 34 deletions(-)

diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index 20ec2f12..89cfaad7 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -108,24 +108,19 @@ namespace Eval::NNUE::Layers {
             product = _mm512_madd_epi16(product, kOnes);
             sum = _mm512_add_epi32(sum, product);
         }
-        output[i] = _mm512_reduce_add_epi32(sum) + biases_[i];
 
         // Note: Changing kMaxSimdWidth from 32 to 64 breaks loading existing networks.
         // As a result kPaddedInputDimensions may not be an even multiple of 64(512bit)
         // and we have to do one more 256bit chunk.
         if (kPaddedInputDimensions != kNumChunks * kSimdWidth * 2)
         {
-            const auto iv_256  = reinterpret_cast<const __m256i*>(input);
-            const auto row_256 = reinterpret_cast<const __m256i*>(&weights_[offset]);
-            int j = kNumChunks * 2;
-            __m256i sum256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
-            sum256 = _mm256_madd_epi16(sum256, _mm256_set1_epi16(1));
-            sum256 = _mm256_hadd_epi32(sum256, sum256);
-            sum256 = _mm256_hadd_epi32(sum256, sum256);
-            const __m128i lo = _mm256_extracti128_si256(sum256, 0);
-            const __m128i hi = _mm256_extracti128_si256(sum256, 1);
-            output[i] += _mm_cvtsi128_si32(lo) + _mm_cvtsi128_si32(hi);
+            const auto iv256  = reinterpret_cast<const __m256i*>(&input_vector[kNumChunks]);
+            const auto row256 = reinterpret_cast<const __m256i*>(&row[kNumChunks]);
+            __m256i product256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
+            product256 = _mm256_madd_epi16(product256, _mm256_set1_epi16(1));
+            sum = _mm512_add_epi32(sum, _mm512_zextsi256_si512(product256));
         }
+        output[i] = _mm512_reduce_add_epi32(sum) + biases_[i];
 
   #elif defined(USE_AVX2)
         __m256i sum = _mm256_setzero_si256();
@@ -135,23 +130,30 @@ namespace Eval::NNUE::Layers {
           product = _mm256_madd_epi16(product, kOnes);
           sum = _mm256_add_epi32(sum, product);
         }
-        sum = _mm256_hadd_epi32(sum, sum);
-        sum = _mm256_hadd_epi32(sum, sum);
-        const __m128i lo = _mm256_extracti128_si256(sum, 0);
-        const __m128i hi = _mm256_extracti128_si256(sum, 1);
-        output[i] = _mm_cvtsi128_si32(lo) + _mm_cvtsi128_si32(hi) + biases_[i];
+        __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
+        sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));
+        sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB));
+        output[i] = _mm_cvtsi128_si32(sum128) + biases_[i];
 
   #elif defined(USE_SSSE3)
-        __m128i sum = _mm_cvtsi32_si128(biases_[i]);
+        __m128i sum = _mm_setzero_si128();
         const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j]));
+        for (int j = 0; j < (int)kNumChunks - 1; j += 2) {
+          __m128i product0 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j]));
+          product0 = _mm_madd_epi16(product0, kOnes);
+          sum = _mm_add_epi32(sum, product0);
+          __m128i product1 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j+1]), _mm_load_si128(&row[j+1]));
+          product1 = _mm_madd_epi16(product1, kOnes);
+          sum = _mm_add_epi32(sum, product1);
+        }
+        if (kNumChunks & 0x1) {
+          __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[kNumChunks-1]), _mm_load_si128(&row[kNumChunks-1]));
           product = _mm_madd_epi16(product, kOnes);
           sum = _mm_add_epi32(sum, product);
         }
-        sum = _mm_hadd_epi32(sum, sum);
-        sum = _mm_hadd_epi32(sum, sum);
-        output[i] = _mm_cvtsi128_si32(sum);
+        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC
+        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
+        output[i] = _mm_cvtsi128_si32(sum) + biases_[i];
 
   #elif defined(USE_NEON)
         int32x4_t sum = {biases_[i]};
diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h
index 2a354a3c..69dfaad2 100644
--- a/src/nnue/nnue_accumulator.h
+++ b/src/nnue/nnue_accumulator.h
@@ -26,7 +26,7 @@
 namespace Eval::NNUE {
 
   // Class that holds the result of affine transformation of input features
-  struct alignas(32) Accumulator {
+  struct alignas(kCacheLineSize) Accumulator {
     std::int16_t
         accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];
     Value score;
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index e7ce84f7..ff33cc79 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -52,9 +52,11 @@
 
 #if defined(USE_AVX512)
 #if defined(__GNUC__ ) && (__GNUC__ < 9)
-#define _mm512_loadA_si512  _mm512_loadu_si512
+#define _mm512_loadA_si512   _mm512_loadu_si512
+#define _mm512_storeA_si512  _mm512_storeu_si512
 #else
-#define _mm512_loadA_si512  _mm512_load_si512
+#define _mm512_loadA_si512   _mm512_load_si512
+#define _mm512_storeA_si512  _mm512_store_si512
 #endif
 #endif
 
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index cbcc26f3..3818e444 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -169,38 +169,41 @@ namespace Eval::NNUE {
                    kHalfDimensions * sizeof(BiasType));
         for (const auto index : active_indices[perspective]) {
           const IndexType offset = kHalfDimensions * index;
+  #if defined(USE_AVX512)
+          auto accumulation = reinterpret_cast<__m512i*>(
+              &accumulator.accumulation[perspective][i][0]);
+          auto column = reinterpret_cast<const __m512i*>(&weights_[offset]);
+          constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+          for (IndexType j = 0; j < kNumChunks; ++j)
+            _mm512_storeA_si512(&accumulation[j], _mm512_add_epi16(_mm512_loadA_si512(&accumulation[j]), column[j]));
 
-  #if defined(USE_AVX2)
+  #elif defined(USE_AVX2)
           auto accumulation = reinterpret_cast<__m256i*>(
               &accumulator.accumulation[perspective][i][0]);
           auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
           constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j) {
+          for (IndexType j = 0; j < kNumChunks; ++j)
             _mm256_storeA_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadA_si256(&accumulation[j]), column[j]));
-          }
 
   #elif defined(USE_SSE2)
           auto accumulation = reinterpret_cast<__m128i*>(
               &accumulator.accumulation[perspective][i][0]);
           auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
           constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j) {
+          for (IndexType j = 0; j < kNumChunks; ++j)
             accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
-          }
 
   #elif defined(USE_NEON)
           auto accumulation = reinterpret_cast<int16x8_t*>(
               &accumulator.accumulation[perspective][i][0]);
           auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
           constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j) {
+          for (IndexType j = 0; j < kNumChunks; ++j)
             accumulation[j] = vaddq_s16(accumulation[j], column[j]);
-          }
 
   #else
-          for (IndexType j = 0; j < kHalfDimensions; ++j) {
+          for (IndexType j = 0; j < kHalfDimensions; ++j)
             accumulator.accumulation[perspective][i][j] += weights_[offset + j];
-          }
   #endif
 
         }

From 21df37d7fd4dcc9b4a9c319382cc43685c0259c8 Mon Sep 17 00:00:00 2001
From: Fanael Linithien <fanael4@gmail.com>
Date: Sun, 9 Aug 2020 16:20:45 +0200
Subject: [PATCH 08/58] Provide vectorized NNUE code for SSE2 and MMX targets

This patch allows old x86 CPUs, from AMD K8 (which the x86-64 baseline
targets) all the way down to the Pentium MMX, to benefit from NNUE with
comparable performance hit versus hand-written eval as on more modern
processors.

NPS of the bench with NNUE enabled on a Pentium III 1.13 GHz (using the
MMX code):
  master: 38951
  this patch: 80586

NPS of the bench with NNUE enabled using baseline x86-64 arch, which is
how linux distros are likely to package stockfish, on a modern CPU
(using the SSE2 code):
  master: 882584
  this patch: 1203945

closes https://github.com/official-stockfish/Stockfish/pull/2956

No functional change.
---
 AUTHORS                             |  1 +
 src/Makefile                        | 13 ++++++-
 src/misc.cpp                        |  3 ++
 src/nnue/layers/affine_transform.h  | 59 ++++++++++++++++++++++++++++-
 src/nnue/layers/clipped_relu.h      | 20 +++++++++-
 src/nnue/nnue_common.h              |  6 +++
 src/nnue/nnue_feature_transformer.h | 54 +++++++++++++++++++++++++-
 7 files changed, 150 insertions(+), 6 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index 21ef3e50..41b89705 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -53,6 +53,7 @@ Ernesto Gatti
 Linmiao Xu (linrock)
 Fabian Beuke (madnight)
 Fabian Fichter (ianfab)
+Fanael Linithien (Fanael)
 fanon
 Fauzi Akram Dabat (FauziAkram)
 Felix Wittmann
diff --git a/src/Makefile b/src/Makefile
index a48e7dcb..3d84f482 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -86,6 +86,7 @@ sanitize = no
 bits = 64
 prefetch = no
 popcnt = no
+mmx = no
 sse = no
 ssse3 = no
 sse41 = no
@@ -110,6 +111,7 @@ ifeq ($(ARCH),x86-32)
 	arch = i386
 	bits = 32
 	prefetch = yes
+	mmx = yes
 	sse = yes
 endif
 
@@ -250,7 +252,7 @@ ifeq ($(COMP),gcc)
 	ifneq ($(KERNEL),Darwin)
 	   LDFLAGS += -Wl,--no-as-needed
 	endif
-	
+
 	gccversion = $(shell $(CXX) --version)
 	gccisclang = $(findstring clang,$(gccversion))
 endif
@@ -432,6 +434,13 @@ ifeq ($(ssse3),yes)
 	endif
 endif
 
+ifeq ($(mmx),yes)
+	CXXFLAGS += -DUSE_MMX
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
+		CXXFLAGS += -mmmx
+	endif
+endif
+
 ifeq ($(neon),yes)
 	CXXFLAGS += -DUSE_NEON
 endif
@@ -516,7 +525,7 @@ help:
 	@echo "x86-64-ssse3            > x86 64-bit with ssse3 support"
 	@echo "x86-64-sse3-popcnt      > x86 64-bit with sse3 and popcnt support"
 	@echo "x86-64                  > x86 64-bit generic"
-	@echo "x86-32                  > x86 32-bit (also enables SSE)"
+	@echo "x86-32                  > x86 32-bit (also enables MMX and SSE)"
 	@echo "x86-32-old              > x86 32-bit fall back for old hardware"
 	@echo "ppc-64                  > PPC 64-bit"
 	@echo "ppc-32                  > PPC 32-bit"
diff --git a/src/misc.cpp b/src/misc.cpp
index 5061ae13..401a6505 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -228,6 +228,9 @@ const std::string compiler_info() {
   #endif
     compiler += (HasPext ? " BMI2" : "");
     compiler += (HasPopCnt ? " POPCNT" : "");
+  #if defined(USE_MMX)
+    compiler += " MMX";
+  #endif
   #if !defined(NDEBUG)
     compiler += " DEBUG";
   #endif
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index 89cfaad7..985ee71a 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -87,11 +87,20 @@ namespace Eval::NNUE::Layers {
       const __m256i kOnes = _mm256_set1_epi16(1);
       const auto input_vector = reinterpret_cast<const __m256i*>(input);
 
-  #elif defined(USE_SSSE3)
+  #elif defined(USE_SSE2)
       constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+  #ifndef USE_SSSE3
+      const __m128i kZeros = _mm_setzero_si128();
+  #else
       const __m128i kOnes = _mm_set1_epi16(1);
+  #endif
       const auto input_vector = reinterpret_cast<const __m128i*>(input);
 
+  #elif defined(USE_MMX)
+      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+      const __m64 kZeros = _mm_setzero_si64();
+      const auto input_vector = reinterpret_cast<const __m64*>(input);
+
   #elif defined(USE_NEON)
       constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
       const auto input_vector = reinterpret_cast<const int8x8_t*>(input);
@@ -155,6 +164,51 @@ namespace Eval::NNUE::Layers {
         sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
         output[i] = _mm_cvtsi128_si32(sum) + biases_[i];
 
+  #elif defined(USE_SSE2)
+        __m128i sum_lo = _mm_cvtsi32_si128(biases_[i]);
+        __m128i sum_hi = kZeros;
+        const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m128i row_j = _mm_load_si128(&row[j]);
+          __m128i input_j = _mm_load_si128(&input_vector[j]);
+          __m128i row_signs = _mm_cmpgt_epi8(kZeros, row_j);
+          __m128i extended_row_lo = _mm_unpacklo_epi8(row_j, row_signs);
+          __m128i extended_row_hi = _mm_unpackhi_epi8(row_j, row_signs);
+          __m128i extended_input_lo = _mm_unpacklo_epi8(input_j, kZeros);
+          __m128i extended_input_hi = _mm_unpackhi_epi8(input_j, kZeros);
+          __m128i product_lo = _mm_madd_epi16(extended_row_lo, extended_input_lo);
+          __m128i product_hi = _mm_madd_epi16(extended_row_hi, extended_input_hi);
+          sum_lo = _mm_add_epi32(sum_lo, product_lo);
+          sum_hi = _mm_add_epi32(sum_hi, product_hi);
+        }
+        __m128i sum = _mm_add_epi32(sum_lo, sum_hi);
+        __m128i sum_high_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2));
+        sum = _mm_add_epi32(sum, sum_high_64);
+        __m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2));
+        sum = _mm_add_epi32(sum, sum_second_32);
+        output[i] = _mm_cvtsi128_si32(sum);
+
+  #elif defined(USE_MMX)
+        __m64 sum_lo = _mm_cvtsi32_si64(biases_[i]);
+        __m64 sum_hi = kZeros;
+        const auto row = reinterpret_cast<const __m64*>(&weights_[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m64 row_j = row[j];
+          __m64 input_j = input_vector[j];
+          __m64 row_signs = _mm_cmpgt_pi8(kZeros, row_j);
+          __m64 extended_row_lo = _mm_unpacklo_pi8(row_j, row_signs);
+          __m64 extended_row_hi = _mm_unpackhi_pi8(row_j, row_signs);
+          __m64 extended_input_lo = _mm_unpacklo_pi8(input_j, kZeros);
+          __m64 extended_input_hi = _mm_unpackhi_pi8(input_j, kZeros);
+          __m64 product_lo = _mm_madd_pi16(extended_row_lo, extended_input_lo);
+          __m64 product_hi = _mm_madd_pi16(extended_row_hi, extended_input_hi);
+          sum_lo = _mm_add_pi32(sum_lo, product_lo);
+          sum_hi = _mm_add_pi32(sum_hi, product_hi);
+        }
+        __m64 sum = _mm_add_pi32(sum_lo, sum_hi);
+        sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum));
+        output[i] = _mm_cvtsi64_si32(sum);
+
   #elif defined(USE_NEON)
         int32x4_t sum = {biases_[i]};
         const auto row = reinterpret_cast<const int8x8_t*>(&weights_[offset]);
@@ -174,6 +228,9 @@ namespace Eval::NNUE::Layers {
   #endif
 
       }
+  #if defined(USE_MMX)
+      _mm_empty();
+  #endif
       return output;
     }
 
diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h
index 13196ec2..44d8a7de 100644
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -84,7 +84,7 @@ namespace Eval::NNUE::Layers {
       }
       constexpr IndexType kStart = kNumChunks * kSimdWidth;
 
-  #elif defined(USE_SSSE3)
+  #elif defined(USE_SSE2)
       constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
 
   #ifdef USE_SSE41
@@ -115,6 +115,24 @@ namespace Eval::NNUE::Layers {
       }
       constexpr IndexType kStart = kNumChunks * kSimdWidth;
 
+  #elif defined(USE_MMX)
+      constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
+      const __m64 k0x80s = _mm_set1_pi8(-128);
+      const auto in = reinterpret_cast<const __m64*>(input);
+      const auto out = reinterpret_cast<__m64*>(output);
+      for (IndexType i = 0; i < kNumChunks; ++i) {
+        const __m64 words0 = _mm_srai_pi16(
+            _mm_packs_pi32(in[i * 4 + 0], in[i * 4 + 1]),
+            kWeightScaleBits);
+        const __m64 words1 = _mm_srai_pi16(
+            _mm_packs_pi32(in[i * 4 + 2], in[i * 4 + 3]),
+            kWeightScaleBits);
+        const __m64 packedbytes = _mm_packs_pi16(words0, words1);
+        out[i] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
+      }
+      _mm_empty();
+      constexpr IndexType kStart = kNumChunks * kSimdWidth;
+
   #elif defined(USE_NEON)
       constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2);
       const int8x8_t kZero = {0};
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index ff33cc79..cb1251c5 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -33,6 +33,9 @@
 #elif defined(USE_SSE2)
 #include <emmintrin.h>
 
+#elif defined(USE_MMX)
+#include <mmintrin.h>
+
 #elif defined(USE_NEON)
 #include <arm_neon.h>
 #endif
@@ -79,6 +82,9 @@ namespace Eval::NNUE {
   #elif defined(USE_SSE2)
   constexpr std::size_t kSimdWidth = 16;
 
+  #elif defined(USE_MMX)
+  constexpr std::size_t kSimdWidth = 8;
+
   #elif defined(USE_NEON)
   constexpr std::size_t kSimdWidth = 16;
   #endif
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 3818e444..40f2603d 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -88,7 +88,7 @@ namespace Eval::NNUE {
       constexpr int kControl = 0b11011000;
       const __m256i kZero = _mm256_setzero_si256();
 
-  #elif defined(USE_SSSE3)
+  #elif defined(USE_SSE2)
       constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
 
   #ifdef USE_SSE41
@@ -97,6 +97,10 @@ namespace Eval::NNUE {
       const __m128i k0x80s = _mm_set1_epi8(-128);
   #endif
 
+  #elif defined(USE_MMX)
+      constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+      const __m64 k0x80s = _mm_set1_pi8(-128);
+
   #elif defined(USE_NEON)
       constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
       const int8x8_t kZero = {0};
@@ -117,7 +121,7 @@ namespace Eval::NNUE {
               _mm256_packs_epi16(sum0, sum1), kZero), kControl));
         }
 
-  #elif defined(USE_SSSE3)
+  #elif defined(USE_SSE2)
         auto out = reinterpret_cast<__m128i*>(&output[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
           __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
@@ -137,6 +141,17 @@ namespace Eval::NNUE {
           );
         }
 
+  #elif defined(USE_MMX)
+        auto out = reinterpret_cast<__m64*>(&output[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m64 sum0 = *(&reinterpret_cast<const __m64*>(
+              accumulation[perspectives[p]][0])[j * 2 + 0]);
+          __m64 sum1 = *(&reinterpret_cast<const __m64*>(
+              accumulation[perspectives[p]][0])[j * 2 + 1]);
+          const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
+          out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
+        }
+
   #elif defined(USE_NEON)
         const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
@@ -154,6 +169,9 @@ namespace Eval::NNUE {
   #endif
 
       }
+  #if defined(USE_MMX)
+      _mm_empty();
+  #endif
     }
 
    private:
@@ -193,6 +211,15 @@ namespace Eval::NNUE {
           for (IndexType j = 0; j < kNumChunks; ++j)
             accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
 
+  #elif defined(USE_MMX)
+          auto accumulation = reinterpret_cast<__m64*>(
+              &accumulator.accumulation[perspective][i][0]);
+          auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
+          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+          for (IndexType j = 0; j < kNumChunks; ++j) {
+            accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
+          }
+
   #elif defined(USE_NEON)
           auto accumulation = reinterpret_cast<int16x8_t*>(
               &accumulator.accumulation[perspective][i][0]);
@@ -208,6 +235,9 @@ namespace Eval::NNUE {
 
         }
       }
+  #if defined(USE_MMX)
+      _mm_empty();
+  #endif
 
       accumulator.computed_accumulation = true;
       accumulator.computed_score = false;
@@ -234,6 +264,11 @@ namespace Eval::NNUE {
         auto accumulation = reinterpret_cast<__m128i*>(
             &accumulator.accumulation[perspective][i][0]);
 
+  #elif defined(USE_MMX)
+        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+        auto accumulation = reinterpret_cast<__m64*>(
+            &accumulator.accumulation[perspective][i][0]);
+
   #elif defined(USE_NEON)
         constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
         auto accumulation = reinterpret_cast<int16x8_t*>(
@@ -263,6 +298,12 @@ namespace Eval::NNUE {
               accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]);
             }
 
+  #elif defined(USE_MMX)
+            auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {
+              accumulation[j] = _mm_sub_pi16(accumulation[j], column[j]);
+            }
+
   #elif defined(USE_NEON)
             auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
             for (IndexType j = 0; j < kNumChunks; ++j) {
@@ -294,6 +335,12 @@ namespace Eval::NNUE {
               accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
             }
 
+  #elif defined(USE_MMX)
+            auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {
+              accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
+            }
+
   #elif defined(USE_NEON)
             auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
             for (IndexType j = 0; j < kNumChunks; ++j) {
@@ -310,6 +357,9 @@ namespace Eval::NNUE {
           }
         }
       }
+  #if defined(USE_MMX)
+      _mm_empty();
+  #endif
 
       accumulator.computed_accumulation = true;
       accumulator.computed_score = false;

From 220ef1d27d9cd006a30b07ab726999c8181d10f0 Mon Sep 17 00:00:00 2001
From: Unai Corzo <corzounai@gmail.com>
Date: Mon, 10 Aug 2020 15:38:44 +0200
Subject: [PATCH 09/58] Assorted search parameter tune

STC https://tests.stockfishchess.org/tests/view/5f31219090816720665374ec
LLR: 2.96 (-2.94,2.94) {-0.50,1.50}
Total: 3376 W: 487 L: 359 D: 2530
Ptnml(0-2): 17, 253, 1042, 337, 39

LTC https://tests.stockfishchess.org/tests/view/5f3127f79081672066537502
LLR: 2.93 (-2.94,2.94) {0.25,1.75}
Total: 8360 W: 581 L: 475 D: 7304
Ptnml(0-2): 11, 407, 3238, 513, 11

closes https://github.com/official-stockfish/Stockfish/pull/2971

bench: 4733874
---
 src/search.cpp | 60 +++++++++++++++++++++++++-------------------------
 1 file changed, 30 insertions(+), 30 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index 3d2bb422..676427f7 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -63,9 +63,9 @@ namespace {
   constexpr uint64_t TtHitAverageResolution = 1024;
 
   // Razor and futility margins
-  constexpr int RazorMargin = 527;
+  constexpr int RazorMargin = 510;
   Value futility_margin(Depth d, bool improving) {
-    return Value(227 * (d - improving));
+    return Value(223 * (d - improving));
   }
 
   // Reductions lookup table, initialized at startup
@@ -73,7 +73,7 @@ namespace {
 
   Depth reduction(bool i, Depth d, int mn) {
     int r = Reductions[d] * Reductions[mn];
-    return (r + 570) / 1024 + (!i && r > 1018);
+    return (r + 509) / 1024 + (!i && r > 894);
   }
 
   constexpr int futility_move_count(bool improving, Depth depth) {
@@ -82,7 +82,7 @@ namespace {
 
   // History and stats update bonus, based on depth
   int stat_bonus(Depth d) {
-    return d > 15 ? 27 : 17 * d * d + 133 * d - 134;
+    return d > 13 ? 29 : 17 * d * d + 134 * d - 134;
   }
 
   // Add a small random component to draw evaluations to avoid 3fold-blindness
@@ -192,7 +192,7 @@ namespace {
 void Search::init() {
 
   for (int i = 1; i < MAX_MOVES; ++i)
-      Reductions[i] = int((24.8 + std::log(Threads.size())) * std::log(i));
+      Reductions[i] = int((22.0 + std::log(Threads.size())) * std::log(i));
 }
 
 
@@ -403,12 +403,12 @@ void Thread::search() {
           if (rootDepth >= 4)
           {
               Value prev = rootMoves[pvIdx].previousScore;
-              delta = Value(19);
+              delta = Value(17);
               alpha = std::max(prev - delta,-VALUE_INFINITE);
               beta  = std::min(prev + delta, VALUE_INFINITE);
 
               // Adjust contempt based on root move's previousScore (dynamic contempt)
-              int dct = ct + (110 - ct / 2) * prev / (abs(prev) + 140);
+              int dct = ct + (105 - ct / 2) * prev / (abs(prev) + 149);
 
               contempt = (us == WHITE ?  make_score(dct, dct / 2)
                                       : -make_score(dct, dct / 2));
@@ -506,13 +506,13 @@ void Thread::search() {
           && !Threads.stop
           && !mainThread->stopOnPonderhit)
       {
-          double fallingEval = (296 + 6 * (mainThread->bestPreviousScore - bestValue)
-                                    + 6 * (mainThread->iterValue[iterIdx] - bestValue)) / 725.0;
+          double fallingEval = (318 + 6 * (mainThread->bestPreviousScore - bestValue)
+                                    + 6 * (mainThread->iterValue[iterIdx] - bestValue)) / 825.0;
           fallingEval = Utility::clamp(fallingEval, 0.5, 1.5);
 
           // If the bestMove is stable over several iterations, reduce time accordingly
-          timeReduction = lastBestMoveDepth + 10 < completedDepth ? 1.92 : 0.95;
-          double reduction = (1.47 + mainThread->previousTimeReduction) / (2.22 * timeReduction);
+          timeReduction = lastBestMoveDepth + 9 < completedDepth ? 1.92 : 0.95;
+          double reduction = (1.47 + mainThread->previousTimeReduction) / (2.32 * timeReduction);
 
           // Use part of the gained time from a previous stable move for the current move
           for (Thread* th : Threads)
@@ -537,7 +537,7 @@ void Thread::search() {
           }
           else if (   Threads.increaseDepth
                    && !mainThread->ponder
-                   && Time.elapsed() > totalTime * 0.56)
+                   && Time.elapsed() > totalTime * 0.58)
                    Threads.increaseDepth = false;
           else
                    Threads.increaseDepth = true;
@@ -824,10 +824,10 @@ namespace {
     // Step 9. Null move search with verification search (~40 Elo)
     if (   !PvNode
         && (ss-1)->currentMove != MOVE_NULL
-        && (ss-1)->statScore < 23824
+        && (ss-1)->statScore < 22977
         &&  eval >= beta
         &&  eval >= ss->staticEval
-        &&  ss->staticEval >= beta - 28 * depth - 28 * improving + 94 * ttPv + 200
+        &&  ss->staticEval >= beta - 30 * depth - 28 * improving + 84 * ttPv + 182
         && !excludedMove
         &&  pos.non_pawn_material(us)
         && (ss->ply >= thisThread->nmpMinPly || us != thisThread->nmpColor))
@@ -835,7 +835,7 @@ namespace {
         assert(eval - beta >= 0);
 
         // Null move dynamic reduction based on depth and value
-        Depth R = (737 + 77 * depth) / 246 + std::min(int(eval - beta) / 192, 3);
+        Depth R = (817 + 71 * depth) / 213 + std::min(int(eval - beta) / 192, 3);
 
         ss->currentMove = MOVE_NULL;
         ss->continuationHistory = &thisThread->continuationHistory[0][0][NO_PIECE][0];
@@ -1028,17 +1028,17 @@ moves_loop: // When in check, search starts from here
                   continue;
 
               // Futility pruning: parent node (~5 Elo)
-              if (   lmrDepth < 8
+              if (   lmrDepth < 7
                   && !ss->inCheck
-                  && ss->staticEval + 284 + 188 * lmrDepth <= alpha
+                  && ss->staticEval + 283 + 170 * lmrDepth <= alpha
                   &&  (*contHist[0])[movedPiece][to_sq(move)]
                     + (*contHist[1])[movedPiece][to_sq(move)]
                     + (*contHist[3])[movedPiece][to_sq(move)]
-                    + (*contHist[5])[movedPiece][to_sq(move)] / 2 < 28388)
+                    + (*contHist[5])[movedPiece][to_sq(move)] / 2 < 27376)
                   continue;
 
               // Prune moves with negative SEE (~20 Elo)
-              if (!pos.see_ge(move, Value(-(29 - std::min(lmrDepth, 17)) * lmrDepth * lmrDepth)))
+              if (!pos.see_ge(move, Value(-(29 - std::min(lmrDepth, 18)) * lmrDepth * lmrDepth)))
                   continue;
           }
           else
@@ -1055,12 +1055,12 @@ moves_loop: // When in check, search starts from here
                   && !(PvNode && abs(bestValue) < 2)
                   && PieceValue[MG][type_of(movedPiece)] >= PieceValue[MG][type_of(pos.piece_on(to_sq(move)))]
                   && !ss->inCheck
-                  && ss->staticEval + 178 + 261 * lmrDepth
+                  && ss->staticEval + 169 + 244 * lmrDepth
                      + PieceValue[MG][type_of(pos.piece_on(to_sq(move)))] <= alpha)
                   continue;
 
               // See based pruning
-              if (!pos.see_ge(move, Value(-202) * depth)) // (~25 Elo)
+              if (!pos.see_ge(move, Value(-221) * depth)) // (~25 Elo)
                   continue;
           }
       }
@@ -1166,7 +1166,7 @@ moves_loop: // When in check, search starts from here
               || moveCountPruning
               || ss->staticEval + PieceValue[EG][pos.captured_piece()] <= alpha
               || cutNode
-              || thisThread->ttHitAverage < 415 * TtHitAverageResolution * TtHitAverageWindow / 1024))
+              || thisThread->ttHitAverage < 427 * TtHitAverageResolution * TtHitAverageWindow / 1024))
       {
           Depth r = reduction(improving, depth, moveCount);
 
@@ -1178,7 +1178,7 @@ moves_loop: // When in check, search starts from here
               r--;
 
           // Decrease reduction if the ttHit running average is large
-          if (thisThread->ttHitAverage > 473 * TtHitAverageResolution * TtHitAverageWindow / 1024)
+          if (thisThread->ttHitAverage > 509 * TtHitAverageResolution * TtHitAverageWindow / 1024)
               r--;
 
           // Reduction if other threads are searching this position
@@ -1221,17 +1221,17 @@ moves_loop: // When in check, search starts from here
                              + (*contHist[0])[movedPiece][to_sq(move)]
                              + (*contHist[1])[movedPiece][to_sq(move)]
                              + (*contHist[3])[movedPiece][to_sq(move)]
-                             - 4826;
+                             - 5287;
 
               // Decrease/increase reduction by comparing opponent's stat score (~10 Elo)
-              if (ss->statScore >= -100 && (ss-1)->statScore < -112)
+              if (ss->statScore >= -106 && (ss-1)->statScore < -104)
                   r--;
 
-              else if ((ss-1)->statScore >= -125 && ss->statScore < -138)
+              else if ((ss-1)->statScore >= -119 && ss->statScore < -140)
                   r++;
 
               // Decrease/increase reduction for moves with a good/bad history (~30 Elo)
-              r -= ss->statScore / 14615;
+              r -= ss->statScore / 14884;
           }
           else
           {
@@ -1241,7 +1241,7 @@ moves_loop: // When in check, search starts from here
 
             // Unless giving check, this capture is likely bad
             if (   !givesCheck
-                && ss->staticEval + PieceValue[EG][pos.captured_piece()] + 211 * depth <= alpha)
+                && ss->staticEval + PieceValue[EG][pos.captured_piece()] + 213 * depth <= alpha)
                 r++;
           }
 
@@ -1503,7 +1503,7 @@ moves_loop: // When in check, search starts from here
         if (PvNode && bestValue > alpha)
             alpha = bestValue;
 
-        futilityBase = bestValue + 141;
+        futilityBase = bestValue + 145;
     }
 
     const PieceToHistory* contHist[] = { (ss-1)->continuationHistory, (ss-2)->continuationHistory,
@@ -1754,7 +1754,7 @@ moves_loop: // When in check, search starts from here
     }
 
     if (depth > 11 && ss->ply < MAX_LPH)
-        thisThread->lowPlyHistory[ss->ply][from_to(move)] << stat_bonus(depth - 6);
+        thisThread->lowPlyHistory[ss->ply][from_to(move)] << stat_bonus(depth - 7);
   }
 
   // When playing with strength handicap, choose best move among a set of RootMoves

From a72cec1ff854a77a92452c2afe2001e05f06e6d4 Mon Sep 17 00:00:00 2001
From: Vizvezdenec <Vizvezdenec@gmail.com>
Date: Sat, 18 Jul 2020 16:30:00 +0300
Subject: [PATCH 10/58] Add comments to probCut code

and rename a variable

closes https://github.com/official-stockfish/Stockfish/pull/2819

No functional change
---
 src/search.cpp | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index 676427f7..ef47fd22 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -596,7 +596,7 @@ namespace {
     Key posKey;
     Move ttMove, move, excludedMove, bestMove;
     Depth extension, newDepth;
-    Value bestValue, value, ttValue, eval, maxValue, probcutBeta;
+    Value bestValue, value, ttValue, eval, maxValue, probCutBeta;
     bool ttHit, ttPv, formerPv, givesCheck, improving, didLMR, priorCapture;
     bool captureOrPromotion, doFullDepthSearch, moveCountPruning,
          ttCapture, singularQuietLMR;
@@ -871,7 +871,7 @@ namespace {
         }
     }
 
-    probcutBeta = beta + 176 - 49 * improving;
+    probCutBeta = beta + 176 - 49 * improving;
 
     // Step 10. ProbCut (~10 Elo)
     // If we have a good enough capture and a reduced search returns a value
@@ -879,21 +879,27 @@ namespace {
     if (   !PvNode
         &&  depth > 4
         &&  abs(beta) < VALUE_TB_WIN_IN_MAX_PLY
-        && !(   ttHit
-             && tte->depth() >= depth - 3
+        // if value from transposition table is lower than probCutBeta, don't attempt probCut
+        // there and in further interactions with transposition table cutoff depth is set to depth - 3
+        // because probCut search has depth set to depth - 4 but we also do a move before it
+        // so effective depth is equal to depth - 3
+        && !(   ttHit 
+             && tte->depth() >= depth - 3 
              && ttValue != VALUE_NONE
-             && ttValue < probcutBeta))
+             && ttValue < probCutBeta))
     {
+        // if ttMove is a capture and value from transposition table is good enough produce probCut
+        // cutoff without digging into actual probCut search
         if (   ttHit
             && tte->depth() >= depth - 3
             && ttValue != VALUE_NONE
-            && ttValue >= probcutBeta
+            && ttValue >= probCutBeta
             && ttMove
             && pos.capture_or_promotion(ttMove))
-            return probcutBeta;
+            return probCutBeta;
 
-        assert(probcutBeta < VALUE_INFINITE);
-        MovePicker mp(pos, ttMove, probcutBeta - ss->staticEval, &captureHistory);
+        assert(probCutBeta < VALUE_INFINITE);
+        MovePicker mp(pos, ttMove, probCutBeta - ss->staticEval, &captureHistory);
         int probCutCount = 0;
 
         while (   (move = mp.next_move()) != MOVE_NONE
@@ -915,16 +921,17 @@ namespace {
                 pos.do_move(move, st);
 
                 // Perform a preliminary qsearch to verify that the move holds
-                value = -qsearch<NonPV>(pos, ss+1, -probcutBeta, -probcutBeta+1);
+                value = -qsearch<NonPV>(pos, ss+1, -probCutBeta, -probCutBeta+1);
 
                 // If the qsearch held, perform the regular search
-                if (value >= probcutBeta)
-                    value = -search<NonPV>(pos, ss+1, -probcutBeta, -probcutBeta+1, depth - 4, !cutNode);
+                if (value >= probCutBeta)
+                    value = -search<NonPV>(pos, ss+1, -probCutBeta, -probCutBeta+1, depth - 4, !cutNode);
 
                 pos.undo_move(move);
 
-                if (value >= probcutBeta)
+                if (value >= probCutBeta)
                 {
+                    // if transposition table doesn't have equal or more deep info write probCut data into it
                     if ( !(ttHit
                        && tte->depth() >= depth - 3
                        && ttValue != VALUE_NONE))

From 4ab8b0b738fe4ae58588efb421fd7b1643b2ef66 Mon Sep 17 00:00:00 2001
From: Guy Vreuls <guyvreuls@gmail.com>
Date: Tue, 11 Aug 2020 04:38:38 +0200
Subject: [PATCH 11/58] Fix parallel LTO issues on Windows

This adds -save-temps to the linker flags when parallel LTO is used on
MinGW/MSYS.

fixes #2977

closes https://github.com/official-stockfish/Stockfish/pull/2978

No functional change.
---
 src/Makefile | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 3d84f482..fd2618a4 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -472,6 +472,11 @@ ifeq ($(debug), no)
 	ifeq ($(gccisclang),)
 		CXXFLAGS += -flto
 		LDFLAGS += $(CXXFLAGS) -flto=jobserver
+		ifneq ($(findstring MINGW,$(KERNEL)),)
+			LDFLAGS += -save-temps
+		else ifneq ($(findstring MSYS,$(KERNEL)),)
+			LDFLAGS += -save-temps
+		endif
 	else
 		CXXFLAGS += -flto=thin
 		LDFLAGS += $(CXXFLAGS)
@@ -605,7 +610,7 @@ objclean:
 # clean auxiliary profiling files
 profileclean:
 	@rm -rf profdir
-	@rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./nnue/*.gcda ./nnue/features/*.gcda
+	@rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./nnue/*.gcda ./nnue/features/*.gcda *.s
 	@rm -f stockfish.profdata *.profraw
 
 default:

From 399cddf444666cf1671c5281f7a8e78887b4f400 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Mon, 10 Aug 2020 16:14:17 +0200
Subject: [PATCH 12/58] More aligned_alloc changes to support Android

Move to posix_memalign for those platforms, in particular android,
that do not fully support c++17 std::aligned_alloc() (and are not windows)

see https://github.com/official-stockfish/Stockfish/issues/2860

closes https://github.com/official-stockfish/Stockfish/pull/2973

No functional change
---
 src/misc.cpp | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/misc.cpp b/src/misc.cpp
index 401a6505..fc3746cf 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -51,6 +51,11 @@ typedef bool(*fun3_t)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY);
 #include <sys/mman.h>
 #endif
 
+#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32))
+#define POSIXALIGNEDALLOC
+#include <stdlib.h>
+#endif
+
 #include "misc.h"
 #include "thread.h"
 
@@ -318,8 +323,11 @@ void prefetch(void* addr) {
 ///
 
 void* std_aligned_alloc(size_t alignment, size_t size) {
-#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32))
-  return aligned_alloc(alignment, size);
+#if defined(POSIXALIGNEDALLOC)
+  void *pointer;
+  if(posix_memalign(&pointer, alignment, size) == 0)
+      return pointer;
+  return nullptr;
 #elif (defined(_WIN32) || (defined(__APPLE__) && !defined(_LIBCPP_HAS_C11_FEATURES)))
   return _mm_malloc(size, alignment);
 #else
@@ -328,7 +336,7 @@ void* std_aligned_alloc(size_t alignment, size_t size) {
 }
 
 void std_aligned_free(void* ptr) {
-#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32))
+#if defined(POSIXALIGNEDALLOC)
   free(ptr);
 #elif (defined(_WIN32) || (defined(__APPLE__) && !defined(_LIBCPP_HAS_C11_FEATURES)))
   _mm_free(ptr);

From f46c73040c16a078b884825c203feee6b0a8850b Mon Sep 17 00:00:00 2001
From: mstembera <MissingEmail@email>
Date: Mon, 10 Aug 2020 12:52:46 -0700
Subject: [PATCH 13/58] Fix AVX512 build with older compilers

avoids an intrinsic that is missing in gcc < 10.

For this target, might trigger another gcc bug on windows that
requires up-to-date gcc 8, 9, or 10, or usage of clang.

Fixes https://github.com/official-stockfish/Stockfish/issues/2975

closes https://github.com/official-stockfish/Stockfish/pull/2976

No functional change
---
 src/Makefile                       | 2 +-
 src/nnue/layers/affine_transform.h | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index fd2618a4..e34fbf61 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -416,7 +416,7 @@ endif
 ifeq ($(avx512),yes)
 	CXXFLAGS += -DUSE_AVX512
 	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
-		CXXFLAGS += -mavx512bw
+		CXXFLAGS += -mavx512f -mavx512bw
 	endif
 endif
 
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index 985ee71a..8d2acd18 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -126,8 +126,7 @@ namespace Eval::NNUE::Layers {
             const auto iv256  = reinterpret_cast<const __m256i*>(&input_vector[kNumChunks]);
             const auto row256 = reinterpret_cast<const __m256i*>(&row[kNumChunks]);
             __m256i product256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
-            product256 = _mm256_madd_epi16(product256, _mm256_set1_epi16(1));
-            sum = _mm512_add_epi32(sum, _mm512_zextsi256_si512(product256));
+            sum = _mm512_add_epi32(sum, _mm512_cvtepi16_epi32(product256));
         }
         output[i] = _mm512_reduce_add_epi32(sum) + biases_[i];
 

From ea6220f3813e5b76b444a02905eaf2c556bdb368 Mon Sep 17 00:00:00 2001
From: Guy Vreuls <guyvreuls@gmail.com>
Date: Fri, 7 Aug 2020 17:07:46 +0200
Subject: [PATCH 14/58] This commit enables a mixed bench, to improve CI and
 allow for PGO (profile-build) of the NNUE part of the code.

Joint work gvreuls / vondele

* Download the default NNUE net in AppVeyor
* Download net in travis CI `make net`
* Adjust tests to cover more archs, speedup instrumented testing
* Introduce 'mixed' bench as default, with further options:

classical, NNUE, mixed.

mixed (default) and NNUE require the default net to be present,
which can be obtained with

```
make net
```

Further examples (first is equivalent to `./stockfish bench`):

```
./stockfish bench 16 1 13 default depth mixed
./stockfish bench 16 1 13 default depth classical
./stockfish bench 16 1 13 default depth NNUE
```

The net is now downloaded automatically if needed for `profile-build`
(usual `build` works fine without net present)

PGO gives a nice speedup on fishtest:

passed STC:
LLR: 2.93 (-2.94,2.94) {-0.50,1.50}
Total: 3360 W: 469 L: 343 D: 2548
Ptnml(0-2): 20, 246, 1030, 356, 28
https://tests.stockfishchess.org/tests/view/5f31b5499081672066537569

passed LTC:
LLR: 2.97 (-2.94,2.94) {0.25,1.75}
Total: 8824 W: 609 L: 502 D: 7713
Ptnml(0-2): 8, 430, 3438, 519, 17
https://tests.stockfishchess.org/tests/view/5f31c87b908167206653757c

closes https://github.com/official-stockfish/Stockfish/pull/2931

fixes https://github.com/official-stockfish/Stockfish/issues/2907

requires fishtest updates before commit

Bench: 4290577
---
 .travis.yml           | 27 +++++++++++++++++++++------
 appveyor.yml          | 14 ++++++++++++++
 src/Makefile          |  2 +-
 src/benchmark.cpp     | 13 +++++++++++--
 tests/instrumented.sh |  8 ++++----
 5 files changed, 51 insertions(+), 13 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index d563a1e1..0dd38047 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -43,6 +43,9 @@ before_script:
   - cd src
 
 script:
+  # Download net
+  - make net
+
   # Obtain bench reference from git log
   - git log HEAD | grep "\b[Bb]ench[ :]\+[0-9]\{7\}" | head -n 1 | sed "s/[^0-9]*\([0-9]*\).*/\1/g" > git_sig
   - export benchref=$(cat git_sig)
@@ -55,14 +58,26 @@ script:
   #
   # Verify bench number against various builds
   - export CXXFLAGS="-Werror -D_GLIBCXX_DEBUG"
-  - make clean && make -j2 ARCH=x86-64 optimize=no debug=yes build && ../tests/signature.sh $benchref
+  - make clean && make -j2 ARCH=x86-64-modern optimize=no debug=yes build && ../tests/signature.sh $benchref
+  - export CXXFLAGS="-Werror"
+  - make clean && make -j2 ARCH=x86-64-modern build && ../tests/signature.sh $benchref
+  - make clean && make -j2 ARCH=x86-64-ssse3 build && ../tests/signature.sh $benchref
+  - make clean && make -j2 ARCH=x86-64-sse3-popcnt build && ../tests/signature.sh $benchref
+  - make clean && make -j2 ARCH=x86-64 build && ../tests/signature.sh $benchref
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 optimize=no debug=yes build && ../tests/signature.sh $benchref; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 build && ../tests/signature.sh $benchref; fi
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-old build && ../tests/signature.sh $benchref; fi
+  - if [[ "$TRAVIS_OS_NAME" == "linux" && "$COMP" == "gcc" ]]; then make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref; fi
+
+  # compile only for some more advanced architectures (might not run in travis)
+  - make clean && make -j2 ARCH=x86-64-avx2 build
+  - make clean && make -j2 ARCH=x86-64-bmi2 build
+  # needs gcc 10 to compile
+  - if [[ "$COMPILER" != "g++-8" ]]; then make clean && make -j2 ARCH=x86-64-avx512 build; fi
 
   #
   # Check perft and reproducible search
-  - export CXXFLAGS="-Werror"
-  - make clean && make -j2 ARCH=x86-64 build
+  - make clean && make -j2 ARCH=x86-64-modern build
   - ../tests/perft.sh
   - ../tests/reprosearch.sh
 
@@ -70,11 +85,11 @@ script:
   # Valgrind
   #
   - export CXXFLAGS="-O1 -fno-inline"
-  - if [ -x "$(command -v valgrind )" ]; then make clean && make -j2 ARCH=x86-64 debug=yes optimize=no build > /dev/null && ../tests/instrumented.sh --valgrind; fi
+  - if [ -x "$(command -v valgrind )" ]; then make clean && make -j2 ARCH=x86-64-modern debug=yes optimize=no build > /dev/null && ../tests/instrumented.sh --valgrind; fi
   - if [ -x "$(command -v valgrind )" ]; then ../tests/instrumented.sh --valgrind-thread; fi
 
   #
   # Sanitizer
   #
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64 sanitize=undefined optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-undefined; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64 sanitize=thread    optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-thread; fi
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-undefined; fi
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64-modern sanitize=thread    optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-thread; fi
diff --git a/appveyor.yml b/appveyor.yml
index d356ba2f..a3732a23 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -61,6 +61,20 @@ before_build:
 
 build_script:
   - cmake --build . --config %CONFIGURATION% -- /verbosity:minimal
+  - ps: |
+      # Download default NNUE net from fishtest
+      $nnuenet = Get-Content -Path src\ucioption.cpp | Select-String -CaseSensitive -Pattern "Option" | Select-String -CaseSensitive -Pattern "nn-[a-z0-9]{12}.nnue"
+      $dummy = $nnuenet -match "(?<nnuenet>nn-[a-z0-9]{12}.nnue)"
+      $nnuenet = $Matches.nnuenet
+      Write-Host "Default net:" $nnuenet
+      $nnuedownloadurl = "https://tests.stockfishchess.org/api/nn/$nnuenet"
+      $nnuefilepath = "src\${env:CONFIGURATION}\$nnuenet"
+      if (Test-Path -Path $nnuefilepath) {
+            Write-Host "Already available."
+      } else {
+            Write-Host "Downloading $nnuedownloadurl to $nnuefilepath"
+            Invoke-WebRequest -Uri $nnuedownloadurl -OutFile $nnuefilepath
+      }
 
 before_test:
   - cd src/%CONFIGURATION%
diff --git a/src/Makefile b/src/Makefile
index e34fbf61..c00b60b5 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -569,7 +569,7 @@ help:
 build: config-sanity
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) all
 
-profile-build: config-sanity objclean profileclean
+profile-build: config-sanity objclean profileclean net
 	@echo ""
 	@echo "Step 1/4. Building instrumented executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make)
diff --git a/src/benchmark.cpp b/src/benchmark.cpp
index 6041d642..806e9840 100644
--- a/src/benchmark.cpp
+++ b/src/benchmark.cpp
@@ -95,8 +95,9 @@ const vector<string> Defaults = {
 /// setup_bench() builds a list of UCI commands to be run by bench. There
 /// are five parameters: TT size in MB, number of search threads that
 /// should be used, the limit value spent for each position, a file name
-/// where to look for positions in FEN format and the type of the limit:
-/// depth, perft, nodes and movetime (in millisecs).
+/// where to look for positions in FEN format, the type of the limit:
+/// depth, perft, nodes and movetime (in millisecs), and evaluation type
+/// mixed (default), classical, NNUE.
 ///
 /// bench -> search default positions up to depth 13
 /// bench 64 1 15 -> search default positions up to depth 15 (TT = 64MB)
@@ -115,6 +116,7 @@ vector<string> setup_bench(const Position& current, istream& is) {
   string limit     = (is >> token) ? token : "13";
   string fenFile   = (is >> token) ? token : "default";
   string limitType = (is >> token) ? token : "depth";
+  string evalType  = (is >> token) ? token : "mixed";
 
   go = limitType == "eval" ? "eval" : "go " + limitType + " " + limit;
 
@@ -146,13 +148,20 @@ vector<string> setup_bench(const Position& current, istream& is) {
   list.emplace_back("setoption name Hash value " + ttSize);
   list.emplace_back("ucinewgame");
 
+  size_t posCounter = 0;
+
   for (const string& fen : fens)
       if (fen.find("setoption") != string::npos)
           list.emplace_back(fen);
       else
       {
+          if (evalType == "classical" || (evalType == "mixed" && posCounter % 2 == 0))
+              list.emplace_back("setoption name Use NNUE value false");
+          else if (evalType == "NNUE" || (evalType == "mixed" && posCounter % 2 != 0))
+              list.emplace_back("setoption name Use NNUE value true");
           list.emplace_back("position fen " + fen);
           list.emplace_back(go);
+          ++posCounter;
       }
 
   return list;
diff --git a/tests/instrumented.sh b/tests/instrumented.sh
index ae6d5c4b..03ded74a 100755
--- a/tests/instrumented.sh
+++ b/tests/instrumented.sh
@@ -70,7 +70,7 @@ for args in "eval" \
             "go depth 10" \
             "go movetime 1000" \
             "go wtime 8000 btime 8000 winc 500 binc 500" \
-            "bench 128 $threads 10 default depth"
+            "bench 128 $threads 8 default depth"
 do
 
    echo "$prefix $exeprefix ./stockfish $args $postfix"
@@ -80,7 +80,7 @@ done
 
 # more general testing, following an uci protocol exchange
 cat << EOF > game.exp
- set timeout 10
+ set timeout 240
  spawn $exeprefix ./stockfish
 
  send "uci\n"
@@ -98,7 +98,7 @@ cat << EOF > game.exp
  expect "bestmove"
 
  send "position fen 5rk1/1K4p1/8/8/3B4/8/8/8 b - - 0 1\n"
- send "go depth 30\n"
+ send "go depth 20\n"
  expect "bestmove"
 
  send "quit\n"
@@ -121,7 +121,7 @@ cat << EOF > syzygy.exp
  send "uci\n"
  send "setoption name SyzygyPath value ../tests/syzygy/\n"
  expect "info string Found 35 tablebases" {} timeout {exit 1}
- send "bench 128 1 10 default depth\n"
+ send "bench 128 1 8 default depth\n"
  send "quit\n"
  expect eof
 

From ee060464129f8d3af184efa013177a4ef387a394 Mon Sep 17 00:00:00 2001
From: SFisGOD <jonathandumale@gmail.com>
Date: Mon, 10 Aug 2020 21:13:56 +0800
Subject: [PATCH 15/58] Tweak castling extension

Change condition from three friendly pieces to two. This now means that we only extend castling on the king side if there are no other friendly pieces aside from king and rook. For the queen side, we only extend if there is only a rook and another friendly piece or if there is only a single rook and no other friendly piece but this is very rare.

STC:
LLR: 3.20 (-2.94,2.94) {-0.50,1.50}
Total: 31144 W: 4086 L: 3903 D: 23155
Ptnml(0-2): 227, 2843, 9278, 2968, 256
https://tests.stockfishchess.org/tests/view/5f31487f9081672066537516

LTC:
LLR: 2.93 (-2.94,2.94) {0.25,1.75}
Total: 57816 W: 3786 L: 3538 D: 50492
Ptnml(0-2): 92, 2991, 22488, 3251, 86
https://tests.stockfishchess.org/tests/view/5f3167c3908167206653753d

closes https://github.com/official-stockfish/Stockfish/pull/2980

Bench: 4244812
---
 src/search.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/search.cpp b/src/search.cpp
index ef47fd22..c5b4332f 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1139,7 +1139,7 @@ moves_loop: // When in check, search starts from here
 
       // Castling extension
       if (   type_of(move) == CASTLING
-          && popcount(pos.pieces(us) & ~pos.pieces(PAWN) & (to_sq(move) & KingSide ? KingSide : QueenSide)) <= 3)
+          && popcount(pos.pieces(us) & ~pos.pieces(PAWN) & (to_sq(move) & KingSide ? KingSide : QueenSide)) <= 2)
           extension = 1;
 
       // Late irreversible move extension

From 992f549ae7f4f73b025429c44bdbbc65de917f6c Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Tue, 11 Aug 2020 21:11:17 +0200
Subject: [PATCH 16/58] Restrict avx2 hack to windows target

this workaround is possibly rather a windows & gcc specific problem. See e.g.
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412#c25

on Linux with gcc 8 this patch brings roughly a 8% speedup.
However, probably needs some testing in the wild.

includes a workaround for an old msys make (3.81) installation (fixes #2984)

No functional change
---
 src/Makefile           | 2 +-
 src/nnue/nnue_common.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index c00b60b5..e82b066b 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -569,7 +569,7 @@ help:
 build: config-sanity
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) all
 
-profile-build: config-sanity objclean profileclean net
+profile-build: net config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 1/4. Building instrumented executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make)
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index cb1251c5..eab7d258 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -44,7 +44,7 @@
 //       compiled with older g++ crashes because the output memory is not aligned
 //       even though alignas is specified.
 #if defined(USE_AVX2)
-#if defined(__GNUC__ ) && (__GNUC__ < 9)
+#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32)
 #define _mm256_loadA_si256  _mm256_loadu_si256
 #define _mm256_storeA_si256 _mm256_storeu_si256
 #else
@@ -54,7 +54,7 @@
 #endif
 
 #if defined(USE_AVX512)
-#if defined(__GNUC__ ) && (__GNUC__ < 9)
+#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32)
 #define _mm512_loadA_si512   _mm512_loadu_si512
 #define _mm512_storeA_si512  _mm512_storeu_si512
 #else

From 6bc0256292cf51d390fee0cb78963da884dc2677 Mon Sep 17 00:00:00 2001
From: Daylen Yang <services@daylenyang.com>
Date: Tue, 11 Aug 2020 12:02:48 -0700
Subject: [PATCH 17/58] Use posix_memalign for Apple Silicon instead of
 _mm_malloc

fails to build on that target, because of missing Intel Intrinsics.
macOS has posix_memalign() since ~2014 so we can simplify the code and just use that for all Apple platforms.

closes https://github.com/official-stockfish/Stockfish/pull/2985

No functional change.
---
 src/misc.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/misc.cpp b/src/misc.cpp
index fc3746cf..aeb3c912 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -51,7 +51,7 @@ typedef bool(*fun3_t)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY);
 #include <sys/mman.h>
 #endif
 
-#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32))
+#if defined(__APPLE__) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32))
 #define POSIXALIGNEDALLOC
 #include <stdlib.h>
 #endif
@@ -328,7 +328,7 @@ void* std_aligned_alloc(size_t alignment, size_t size) {
   if(posix_memalign(&pointer, alignment, size) == 0)
       return pointer;
   return nullptr;
-#elif (defined(_WIN32) || (defined(__APPLE__) && !defined(_LIBCPP_HAS_C11_FEATURES)))
+#elif defined(_WIN32)
   return _mm_malloc(size, alignment);
 #else
   return std::aligned_alloc(alignment, size);
@@ -338,7 +338,7 @@ void* std_aligned_alloc(size_t alignment, size_t size) {
 void std_aligned_free(void* ptr) {
 #if defined(POSIXALIGNEDALLOC)
   free(ptr);
-#elif (defined(_WIN32) || (defined(__APPLE__) && !defined(_LIBCPP_HAS_C11_FEATURES)))
+#elif defined(_WIN32)
   _mm_free(ptr);
 #else
   free(ptr);

From dd63b98fb06e050aa961fbad6fd1f9316f2b17df Mon Sep 17 00:00:00 2001
From: mstembera <MissingEmail@email>
Date: Tue, 11 Aug 2020 12:59:39 -0700
Subject: [PATCH 18/58] Add support for VNNI

Adds support for Vector Neural Network Instructions (avx512), as available on Intel Cascade Lake

The _mm512_dpbusd_epi32() intrinsic (vpdpbusd instruction) is taylor made for NNUE.

on a cascade lake CPU (AWS C5.24x.large, gcc 10) NNUE eval is at roughly 78% nps of classical
(single core test)

bench 1024 1 24 default depth:
target 	classical 	NNUE 	ratio
vnni 	2207232 	1725987 	78.20
avx512 	2216789 	1671734 	75.41
avx2 	2194006 	1611263 	73.44
modern 	2185001 	1352469 	61.90

closes https://github.com/official-stockfish/Stockfish/pull/2987

No functional change
---
 src/Makefile                       | 25 +++++++++++++++++++++++++
 src/misc.cpp                       |  3 +++
 src/nnue/layers/affine_transform.h | 14 +++++++++++++-
 3 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index e82b066b..0804cdd5 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -73,6 +73,7 @@ endif
 # avx2 = yes/no       --- -mavx2           --- Use Intel Advanced Vector Extensions 2
 # pext = yes/no       --- -DUSE_PEXT       --- Use pext x86_64 asm-instruction
 # avx512 = yes/no     --- -mavx512bw       --- Use Intel Advanced Vector Extensions 512
+# vnni = yes/no       --- -mavx512vnni     --- Use Intel Vector Neural Network Instructions 512
 # neon = yes/no       --- -DUSE_NEON       --- Use ARM SIMD architecture
 #
 # Note that Makefile is space sensitive, so when adding new architectures
@@ -93,6 +94,7 @@ sse41 = no
 avx2 = no
 pext = no
 avx512 = no
+vnni = no
 neon = no
 ARCH = x86-64-modern
 
@@ -190,6 +192,19 @@ ifeq ($(ARCH),x86-64-avx512)
 	avx512 = yes
 endif
 
+ifeq ($(ARCH),x86-64-vnni)
+	arch = x86_64
+	prefetch = yes
+	popcnt = yes
+	sse = yes
+	ssse3 = yes
+	sse41 = yes
+	avx2 = yes
+	pext = yes
+	avx512 = yes
+	vnni = yes
+endif
+
 ifeq ($(ARCH),armv7)
 	arch = armv7
 	prefetch = yes
@@ -420,6 +435,13 @@ ifeq ($(avx512),yes)
 	endif
 endif
 
+ifeq ($(vnni),yes)
+	CXXFLAGS += -DUSE_VNNI
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
+		CXXFLAGS += -mavx512vnni -mavx512dq -mavx512vl
+	endif
+endif
+
 ifeq ($(sse41),yes)
 	CXXFLAGS += -DUSE_SSE41
 	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
@@ -522,6 +544,7 @@ help:
 	@echo ""
 	@echo "Supported archs:"
 	@echo ""
+	@echo "x86-64-vnni             > x86 64-bit with vnni support"
 	@echo "x86-64-avx512           > x86 64-bit with avx512 support"
 	@echo "x86-64-bmi2             > x86 64-bit with bmi2 support"
 	@echo "x86-64-avx2             > x86 64-bit with avx2 support"
@@ -640,6 +663,7 @@ config-sanity:
 	@echo "avx2: '$(avx2)'"
 	@echo "pext: '$(pext)'"
 	@echo "avx512: '$(avx512)'"
+	@echo "vnni: '$(vnni)'"
 	@echo "neon: '$(neon)'"
 	@echo ""
 	@echo "Flags:"
@@ -664,6 +688,7 @@ config-sanity:
 	@test "$(avx2)" = "yes" || test "$(avx2)" = "no"
 	@test "$(pext)" = "yes" || test "$(pext)" = "no"
 	@test "$(avx512)" = "yes" || test "$(avx512)" = "no"
+	@test "$(vnni)" = "yes" || test "$(vnni)" = "no"
 	@test "$(neon)" = "yes" || test "$(neon)" = "no"
 	@test "$(comp)" = "gcc" || test "$(comp)" = "icc" || test "$(comp)" = "mingw" || test "$(comp)" = "clang"
 
diff --git a/src/misc.cpp b/src/misc.cpp
index aeb3c912..ab52d30b 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -219,6 +219,9 @@ const std::string compiler_info() {
 
   compiler += "\nCompilation settings include: ";
   compiler += (Is64Bit ? " 64bit" : " 32bit");
+  #if defined(USE_VNNI)
+    compiler += " VNNI";
+  #endif
   #if defined(USE_AVX512)
     compiler += " AVX512";
   #endif
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index 8d2acd18..322e3240 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -79,8 +79,10 @@ namespace Eval::NNUE::Layers {
 
   #if defined(USE_AVX512)
       constexpr IndexType kNumChunks = kPaddedInputDimensions / (kSimdWidth * 2);
-      const __m512i kOnes = _mm512_set1_epi16(1);
       const auto input_vector = reinterpret_cast<const __m512i*>(input);
+  #if !defined(USE_VNNI)
+      const __m512i kOnes = _mm512_set1_epi16(1);
+  #endif
 
   #elif defined(USE_AVX2)
       constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
@@ -113,9 +115,13 @@ namespace Eval::NNUE::Layers {
         __m512i sum = _mm512_setzero_si512();
         const auto row = reinterpret_cast<const __m512i*>(&weights_[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
+  #if defined(USE_VNNI)
+            sum = _mm512_dpbusd_epi32(sum, _mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
+  #else
             __m512i product = _mm512_maddubs_epi16(_mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
             product = _mm512_madd_epi16(product, kOnes);
             sum = _mm512_add_epi32(sum, product);
+  #endif
         }
 
         // Note: Changing kMaxSimdWidth from 32 to 64 breaks loading existing networks.
@@ -125,8 +131,14 @@ namespace Eval::NNUE::Layers {
         {
             const auto iv256  = reinterpret_cast<const __m256i*>(&input_vector[kNumChunks]);
             const auto row256 = reinterpret_cast<const __m256i*>(&row[kNumChunks]);
+  #if defined(USE_VNNI)
+            __m256i product256 = _mm256_dpbusd_epi32(
+                _mm512_castsi512_si256(sum), _mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
+            sum = _mm512_inserti32x8(sum, product256, 0);
+  #else
             __m256i product256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
             sum = _mm512_add_epi32(sum, _mm512_cvtepi16_epi32(product256));
+  #endif
         }
         output[i] = _mm512_reduce_add_epi32(sum) + biases_[i];
 

From 69cfe28f315b559cb1a07c0806266aa2850b5d4b Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Wed, 12 Aug 2020 17:21:12 +0200
Subject: [PATCH 19/58] Output the SSE2 flag in compiler_info

was missing in the list of outputs, slightly reorder flags.
explicitly add -msse2 if USE_SSE2 (is implicit already, -msse -m64).

closes https://github.com/official-stockfish/Stockfish/pull/2990

No functional change.
---
 src/Makefile | 2 +-
 src/misc.cpp | 8 ++++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 0804cdd5..027cc3e3 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -468,7 +468,7 @@ ifeq ($(neon),yes)
 endif
 
 ifeq ($(arch),x86_64)
-	CXXFLAGS += -DUSE_SSE2
+	CXXFLAGS += -msse2 -DUSE_SSE2
 endif
 
 ### 3.7 pext
diff --git a/src/misc.cpp b/src/misc.cpp
index ab52d30b..1cee4726 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -225,6 +225,7 @@ const std::string compiler_info() {
   #if defined(USE_AVX512)
     compiler += " AVX512";
   #endif
+  compiler += (HasPext ? " BMI2" : "");
   #if defined(USE_AVX2)
     compiler += " AVX2";
   #endif
@@ -234,11 +235,14 @@ const std::string compiler_info() {
   #if defined(USE_SSSE3)
     compiler += " SSSE3";
   #endif
-    compiler += (HasPext ? " BMI2" : "");
-    compiler += (HasPopCnt ? " POPCNT" : "");
+  #if defined(USE_SSE2)
+    compiler += " SSE2";
+  #endif
+  compiler += (HasPopCnt ? " POPCNT" : "");
   #if defined(USE_MMX)
     compiler += " MMX";
   #endif
+
   #if !defined(NDEBUG)
     compiler += " DEBUG";
   #endif

From 67e48418afd58dd69708dcd67dea6161f61ef76f Mon Sep 17 00:00:00 2001
From: Sergio Vieri <sergio.vieri.hp@gmail.com>
Date: Wed, 12 Aug 2020 23:21:21 +0800
Subject: [PATCH 20/58] Update default net to nn-82215d0fd0df.nnue

Net created at: 20200812-2257

passed STC: https://tests.stockfishchess.org/tests/view/5f340ca99e5f2effc089da17
LLR: 2.96 (-2.94,2.94) {-0.50,1.50}
Total: 5744 W: 756 L: 627 D: 4361
Ptnml(0-2): 28, 485, 1731, 586, 42

passed LTC: https://tests.stockfishchess.org/tests/view/5f341eba9e5f2effc089da23
LLR: 2.94 (-2.94,2.94) {0.25,1.75}
Total: 17136 W: 1041 L: 917 D: 15178
Ptnml(0-2): 13, 813, 6807, 907, 28

closes https://github.com/official-stockfish/Stockfish/pull/2992

Bench: 3935117
---
 src/ucioption.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index b0689d6d..0a35d01b 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -79,7 +79,7 @@ void init(OptionsMap& o) {
   o["Syzygy50MoveRule"]      << Option(true);
   o["SyzygyProbeLimit"]      << Option(7, 0, 7);
   o["Use NNUE"]              << Option(false, on_use_NNUE);
-  o["EvalFile"]              << Option("nn-112bb1c8cdb5.nnue", on_eval_file);
+  o["EvalFile"]              << Option("nn-82215d0fd0df.nnue", on_eval_file);
 }
 
 

From e8ea215a13e009b78a148fda831392eb3224107e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ste=CC=81phane=20Nicolet?= <cassio@free.fr>
Date: Thu, 13 Aug 2020 13:40:06 +0200
Subject: [PATCH 21/58] Clean-up Makefile help

Do not show the details of the default architecture for a simple "make help"
invocation, as the details are most likely to confuse beginners. Instead we
make it clear which architecture is the default and put an example at the end
of the Makefile as an incentative to use "make help ARCH=blah" to discover
the flags used by the different architectures.

```
    make help
    make help ARCH=x86-64-ssse3
```

Also clean-up and modernize a bit the Makefile examples while at it.

closes https://github.com/official-stockfish/Stockfish/pull/2996

No functional change
---
 src/Makefile | 41 +++++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 027cc3e3..a9fb7b81 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -81,6 +81,11 @@ endif
 # at the end of the line for flag values.
 
 ### 2.1. General and architecture defaults
+
+ifeq ($(ARCH),)
+    empty_arch = yes
+endif
+
 optimize = yes
 debug = no
 sanitize = no
@@ -99,6 +104,7 @@ neon = no
 ARCH = x86-64-modern
 
 ### 2.2 Architecture specific
+
 ifeq ($(ARCH),general-32)
 	arch = any
 	bits = 32
@@ -141,16 +147,7 @@ ifeq ($(ARCH),x86-64-ssse3)
 	ssse3 = yes
 endif
 
-ifeq ($(ARCH),x86-64-modern)
-	arch = x86_64
-	prefetch = yes
-	popcnt = yes
-	sse = yes
-	ssse3 = yes
-	sse41 = yes
-endif
-
-ifeq ($(ARCH),x86-64-sse41-popcnt)
+ifeq ($(ARCH),$(filter $(ARCH),x86-64-sse41-popcnt x86-64-modern))
 	arch = x86_64
 	prefetch = yes
 	popcnt = yes
@@ -535,12 +532,13 @@ help:
 	@echo ""
 	@echo "Supported targets:"
 	@echo ""
+	@echo "help                    > Display architecture details"
 	@echo "build                   > Standard build"
-	@echo "profile-build           > Standard build with PGO"
+	@echo "net                     > Download the default nnue net"
+	@echo "profile-build           > Faster build (with profile-guided optimization)"
 	@echo "strip                   > Strip executable"
 	@echo "install                 > Install executable"
 	@echo "clean                   > Clean up"
-	@echo "net                     > Download the default nnue net"
 	@echo ""
 	@echo "Supported archs:"
 	@echo ""
@@ -549,7 +547,7 @@ help:
 	@echo "x86-64-bmi2             > x86 64-bit with bmi2 support"
 	@echo "x86-64-avx2             > x86 64-bit with avx2 support"
 	@echo "x86-64-sse41-popcnt     > x86 64-bit with sse41 and popcnt support"
-	@echo "x86-64-modern           > the same as previous (x86-64-sse41-popcnt)"
+	@echo "x86-64-modern           > common modern CPU, currently x86-64-sse41-popcnt"
 	@echo "x86-64-ssse3            > x86 64-bit with ssse3 support"
 	@echo "x86-64-sse3-popcnt      > x86 64-bit with sse3 and popcnt support"
 	@echo "x86-64                  > x86 64-bit generic"
@@ -572,17 +570,20 @@ help:
 	@echo ""
 	@echo "Simple examples. If you don't know what to do, you likely want to run: "
 	@echo ""
-	@echo "make -j build ARCH=x86-64    (This is for 64-bit systems)"
-	@echo "make -j build ARCH=x86-32    (This is for 32-bit systems)"
+	@echo "make -j build ARCH=x86-64  (A portable, slow compile for 64-bit systems)"
+	@echo "make -j build ARCH=x86-32  (A portable, slow compile for 32-bit systems)"
 	@echo ""
-	@echo "Advanced examples, for experienced users: "
+	@echo "Advanced examples, for experienced users looking for performance: "
 	@echo ""
-	@echo "make -j build ARCH=x86-64-modern COMP=clang"
-	@echo "make -j profile-build ARCH=x86-64-bmi2 COMP=gcc COMPCXX=g++-4.8"
-	@echo ""
-	@echo "The selected architecture $(ARCH) enables the following configuration: "
+	@echo "make    help  ARCH=x86-64-bmi2"
+	@echo "make -j profile-build ARCH=x86-64-bmi2 COMP=gcc COMPCXX=g++-9.0"
+	@echo "make -j build ARCH=x86-64-ssse3 COMP=clang"
 	@echo ""
+ifneq ($(empty_arch), yes)
+	@echo "-------------------------------\n"
+	@echo "The selected architecture $(ARCH) will enable the following configuration: "
 	@$(MAKE) ARCH=$(ARCH) COMP=$(COMP) config-sanity
+endif
 
 
 .PHONY: help build profile-build strip install clean net objclean profileclean \

From ce009ea1aaecc577bbdf208cef8e61dd1827a18e Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Thu, 13 Aug 2020 22:54:13 +0200
Subject: [PATCH 22/58] Verify SHA of downloaded net file

check SHA of the available and downloaded file.

Document the format requirement on the default net.

Also allow curl to make possibly insecure connections, as needed for old curl.

fixes https://github.com/official-stockfish/Stockfish/issues/2998

closes https://github.com/official-stockfish/Stockfish/pull/3000

No functional change.
---
 src/Makefile      | 4 +++-
 src/ucioption.cpp | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index a9fb7b81..38f607cb 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -624,8 +624,10 @@ net:
 	$(eval nnuenet := $(shell grep EvalFile ucioption.cpp | grep Option | sed 's/.*\(nn-[a-z0-9]\{12\}.nnue\).*/\1/'))
 	@echo "Default net: $(nnuenet)"
 	$(eval nnuedownloadurl := https://tests.stockfishchess.org/api/nn/$(nnuenet))
-	$(eval curl_or_wget := $(shell if hash curl 2>/dev/null; then echo "curl -sL"; elif hash wget 2>/dev/null; then echo "wget -qO-"; fi))
+	$(eval curl_or_wget := $(shell if hash curl 2>/dev/null; then echo "curl -skL"; elif hash wget 2>/dev/null; then echo "wget -qO-"; fi))
 	@if test -f "$(nnuenet)"; then echo "Already available."; else echo "Downloading $(nnuedownloadurl)"; $(curl_or_wget) $(nnuedownloadurl) > $(nnuenet); fi
+	$(eval shasum_command := $(shell if hash shasum 2>/dev/null; then echo "shasum -a 256 "; elif hash sha256sum 2>/dev/null; then echo "sha256sum "; fi))
+	@if [ "$(nnuenet)" != "nn-"`$(shasum_command) $(nnuenet) | cut -c1-12`".nnue" ]; then echo "Failed download or $(nnuenet) corrupted, please delete!"; exit 1; fi
 
 # clean binaries and objects
 objclean:
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 0a35d01b..2b66a475 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -79,6 +79,8 @@ void init(OptionsMap& o) {
   o["Syzygy50MoveRule"]      << Option(true);
   o["SyzygyProbeLimit"]      << Option(7, 0, 7);
   o["Use NNUE"]              << Option(false, on_use_NNUE);
+  // The default must follow the format nn-[SHA256 first 12 digits].nnue
+  // for the build process (profile-build and fishtest) to work.
   o["EvalFile"]              << Option("nn-82215d0fd0df.nnue", on_eval_file);
 }
 

From e5f450cf0bfe5a34dd4ea51a5592a71be4514601 Mon Sep 17 00:00:00 2001
From: Miguel Lahoz <miguel_lahoz@protonmail.com>
Date: Mon, 10 Aug 2020 22:57:11 +0800
Subject: [PATCH 23/58] Also dampen NNUE eval with 50 move rule

Move the existing dampening function last so that NNUE evaluations are
also handled as we approach the 50 move rule.

STC:
LLR: 2.95 (-2.94,2.94) {-0.50,1.50}
Total: 4792 W: 695 L: 561 D: 3536
Ptnml(0-2): 19, 420, 1422, 478, 57
https://tests.stockfishchess.org/tests/view/5f3164179081672066537534

LTC:
LLR: 8.62 (-2.94,2.94) {0.25,1.75}
Total: 286744 W: 18494 L: 17430 D: 250820
Ptnml(0-2): 418, 14886, 111745, 15860, 463
https://tests.stockfishchess.org/tests/view/5f316b039081672066537541

closes https://github.com/official-stockfish/Stockfish/pull/3004

Bench: 4001800
---
 src/evaluate.cpp | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index caab2979..00fd2005 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -927,9 +927,6 @@ make_v:
     // Side to move point of view
     v = (pos.side_to_move() == WHITE ? v : -v) + Tempo;
 
-    // Damp down the evaluation linearly when shuffling
-    v = v * (100 - pos.rule50_count()) / 100;
-
     return v;
   }
 
@@ -941,14 +938,15 @@ make_v:
 
 Value Eval::evaluate(const Position& pos) {
 
-  if (Eval::useNNUE)
-  {
-      Value v = eg_value(pos.psq_score());
-      // Take NNUE eval only on balanced positions
-      if (abs(v) < NNUEThreshold)
-         return NNUE::evaluate(pos) + Tempo;
-  }
-  return Evaluation<NO_TRACE>(pos).value();
+  bool classical = !Eval::useNNUE
+                ||  abs(eg_value(pos.psq_score())) >= NNUEThreshold;
+  Value v = classical ? Evaluation<NO_TRACE>(pos).value()
+                      : NNUE::evaluate(pos) + Tempo;
+
+  // Damp down the evaluation linearly when shuffling
+  v = v * (100 - pos.rule50_count()) / 100;
+
+  return v;
 }
 
 /// trace() is like evaluate(), but instead of returning a value, it returns

From 6eb186c97e9d808970d0b1369bcd7aca60612e26 Mon Sep 17 00:00:00 2001
From: mstembera <MissingEmail@email>
Date: Fri, 14 Aug 2020 04:49:33 -0700
Subject: [PATCH 24/58] Try to match relative magnitude of NNUE eval to
 classical

The idea is that since we are mixing NNUE and classical evals matching their magnitudes closer allows for better comparisons.

STC https://tests.stockfishchess.org/tests/view/5f35a65411a9b1a1dbf18e2b
LLR: 2.94 (-2.94,2.94) {-0.50,1.50}
Total: 9840 W: 1150 L: 1027 D: 7663
Ptnml(0-2): 49, 772, 3175, 855, 69

LTC https://tests.stockfishchess.org/tests/view/5f35bcbe11a9b1a1dbf18e47
LLR: 2.93 (-2.94,2.94) {0.25,1.75}
Total: 44424 W: 2492 L: 2294 D: 39638
Ptnml(0-2): 42, 2015, 17915, 2183, 57

also corrects the location to clamp the evaluation (non-function on bench).

closes https://github.com/official-stockfish/Stockfish/pull/3003

bench: 3905447
---
 src/evaluate.cpp           | 5 ++++-
 src/nnue/evaluate_nnue.cpp | 5 +----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 00fd2005..a453fa0f 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -941,11 +941,14 @@ Value Eval::evaluate(const Position& pos) {
   bool classical = !Eval::useNNUE
                 ||  abs(eg_value(pos.psq_score())) >= NNUEThreshold;
   Value v = classical ? Evaluation<NO_TRACE>(pos).value()
-                      : NNUE::evaluate(pos) + Tempo;
+                      : NNUE::evaluate(pos) * 5 / 4 + Tempo;
 
   // Damp down the evaluation linearly when shuffling
   v = v * (100 - pos.rule50_count()) / 100;
 
+  // Guarantee evalution outside of TB range
+  v = Utility::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1);
+
   return v;
 }
 
diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index af0894b2..a6ece8e2 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -159,10 +159,7 @@ namespace Eval::NNUE {
 
   // Evaluation function. Perform differential calculation.
   Value evaluate(const Position& pos) {
-    Value v = ComputeScore(pos, false);
-    v = Utility::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1);
-
-    return v;
+    return ComputeScore(pos, false);
   }
 
   // Evaluation function. Perform full calculation.

From cd0b8b4cf28208fffef931322749205a0ddc6066 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Fri, 14 Aug 2020 22:18:12 +0200
Subject: [PATCH 25/58] Use NNUE more for fortresses

Increases the use of NNUE evaluation in positions without captures/pawn moves,
by increasing the NNUEThreshold threshold with rule50_count.

This patch will force Stockfish to use NNUE eval more and more in materially
unbalanced positions, when it seems that the classical eval is struggling to
win and only manages to shuffle. This will ask the (slower) NNUE eval to
double-check the potential fortress branches of the search tree, but only
when necessary.

passed STC:
https://tests.stockfishchess.org/tests/view/5f36f1bf11a9b1a1dbf192d8
LLR: 2.93 (-2.94,2.94) {-0.50,1.50}
Total: 51824 W: 5836 L: 5653 D: 40335
Ptnml(0-2): 264, 4356, 16512, 4493, 287

passed LTC:
https://tests.stockfishchess.org/tests/view/5f37836111a9b1a1dbf1936d
LLR: 2.93 (-2.94,2.94) {0.25,1.75}
Total: 29768 W: 1747 L: 1590 D: 26431
Ptnml(0-2): 33, 1347, 11977, 1484, 43

closes https://github.com/official-stockfish/Stockfish/pull/3011

Bench: 4173967
---
 src/evaluate.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index a453fa0f..3a620a78 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -939,7 +939,7 @@ make_v:
 Value Eval::evaluate(const Position& pos) {
 
   bool classical = !Eval::useNNUE
-                ||  abs(eg_value(pos.psq_score())) >= NNUEThreshold;
+                ||  abs(eg_value(pos.psq_score())) >= NNUEThreshold * (16 + pos.rule50_count()) / 16;
   Value v = classical ? Evaluation<NO_TRACE>(pos).value()
                       : NNUE::evaluate(pos) * 5 / 4 + Tempo;
 

From 8cf43c6317665295eece747ed1589ee33a435d2c Mon Sep 17 00:00:00 2001
From: Daylen Yang <services@daylenyang.com>
Date: Fri, 14 Aug 2020 19:53:46 -0700
Subject: [PATCH 26/58] Display NEON in compiler string

if NEON intrinsics are being used and USE_NEON is defined.

closes https://github.com/official-stockfish/Stockfish/pull/3008

No functional change
---
 src/misc.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/misc.cpp b/src/misc.cpp
index 1cee4726..459ea100 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -242,6 +242,9 @@ const std::string compiler_info() {
   #if defined(USE_MMX)
     compiler += " MMX";
   #endif
+  #if defined(USE_NEON)
+    compiler += " NEON";
+  #endif
 
   #if !defined(NDEBUG)
     compiler += " DEBUG";

From 72dc7a5c54554a8c7c4bf68aa7de2d4de05f3294 Mon Sep 17 00:00:00 2001
From: syzygy1 <3028851+syzygy1@users.noreply.github.com>
Date: Sat, 15 Aug 2020 16:50:39 +0200
Subject: [PATCH 27/58] Assume network file is in little-endian byte order

This patch fixes the byte order when reading 16- and 32-bit values from the network file on a big-endian machine.

Bytes are ordered in read_le() using unsigned arithmetic, which doesn't need tricks to determine the endianness of the machine. Unfortunately the compiler doesn't seem to be able to optimise the ordering operation, but reading in the weights is not a time-critical operation and the extra time it takes should not be noticeable.

Big endian systems are still untested with NNUE.

fixes #3007

closes https://github.com/official-stockfish/Stockfish/pull/3009

No functional change.
---
 src/nnue/evaluate_nnue.cpp          |  8 ++++----
 src/nnue/layers/affine_transform.h  |  9 ++++-----
 src/nnue/nnue_common.h              | 19 +++++++++++++++++++
 src/nnue/nnue_feature_transformer.h |  8 ++++----
 4 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index a6ece8e2..3aa85943 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -77,7 +77,7 @@ namespace Eval::NNUE {
   bool ReadParameters(std::istream& stream, const AlignedPtr<T>& pointer) {
 
     std::uint32_t header;
-    stream.read(reinterpret_cast<char*>(&header), sizeof(header));
+    header = read_le<std::uint32_t>(stream);
     if (!stream || header != T::GetHashValue()) return false;
     return pointer->ReadParameters(stream);
   }
@@ -96,9 +96,9 @@ namespace Eval::NNUE {
     std::uint32_t* hash_value, std::string* architecture) {
 
     std::uint32_t version, size;
-    stream.read(reinterpret_cast<char*>(&version), sizeof(version));
-    stream.read(reinterpret_cast<char*>(hash_value), sizeof(*hash_value));
-    stream.read(reinterpret_cast<char*>(&size), sizeof(size));
+    version = read_le<std::uint32_t>(stream);
+    *hash_value = read_le<std::uint32_t>(stream);
+    size = read_le<std::uint32_t>(stream);
     if (!stream || version != kVersion) return false;
     architecture->resize(size);
     stream.read(&(*architecture)[0], size);
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index 322e3240..bac258e8 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -62,11 +62,10 @@ namespace Eval::NNUE::Layers {
    // Read network parameters
     bool ReadParameters(std::istream& stream) {
       if (!previous_layer_.ReadParameters(stream)) return false;
-      stream.read(reinterpret_cast<char*>(biases_),
-                  kOutputDimensions * sizeof(BiasType));
-      stream.read(reinterpret_cast<char*>(weights_),
-                  kOutputDimensions * kPaddedInputDimensions *
-                  sizeof(WeightType));
+      for (std::size_t i = 0; i < kOutputDimensions; ++i)
+        biases_[i] = read_le<BiasType>(stream);
+      for (std::size_t i = 0; i < kOutputDimensions * kPaddedInputDimensions; ++i)
+        weights_[i] = read_le<WeightType>(stream);
       return !stream.fail();
     }
 
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index eab7d258..61f18aee 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -21,6 +21,9 @@
 #ifndef NNUE_COMMON_H_INCLUDED
 #define NNUE_COMMON_H_INCLUDED
 
+#include <cstring>
+#include <iostream>
+
 #if defined(USE_AVX2)
 #include <immintrin.h>
 
@@ -101,6 +104,22 @@ namespace Eval::NNUE {
     return (n + base - 1) / base * base;
   }
 
+  // Read a signed or unsigned integer from  a stream in little-endian order
+  template <typename IntType>
+  inline IntType read_le(std::istream& stream) {
+    // Read the relevant bytes from the stream in little-endian order
+    std::uint8_t u[sizeof(IntType)];
+    stream.read(reinterpret_cast<char*>(u), sizeof(IntType));
+    // Use unsigned arithmetic to convert to machine order
+    typename std::make_unsigned<IntType>::type v = 0;
+    for (std::size_t i = 0; i < sizeof(IntType); ++i)
+      v = (v << 8) | u[sizeof(IntType) - i - 1];
+    // Copy the machine-ordered bytes into a potentially signed value
+    IntType w;
+    std::memcpy(&w, &v, sizeof(IntType));
+    return w;
+  }
+
 }  // namespace Eval::NNUE
 
 #endif // #ifndef NNUE_COMMON_H_INCLUDED
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 40f2603d..4db9be9f 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -55,10 +55,10 @@ namespace Eval::NNUE {
 
     // Read network parameters
     bool ReadParameters(std::istream& stream) {
-      stream.read(reinterpret_cast<char*>(biases_),
-                  kHalfDimensions * sizeof(BiasType));
-      stream.read(reinterpret_cast<char*>(weights_),
-                  kHalfDimensions * kInputDimensions * sizeof(WeightType));
+      for (std::size_t i = 0; i < kHalfDimensions; ++i)
+        biases_[i] = read_le<BiasType>(stream);
+      for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i)
+        weights_[i] = read_le<WeightType>(stream);
       return !stream.fail();
     }
 

From 65572de4a79ab017c19d85eacee865afe7bfc7c1 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Sun, 16 Aug 2020 13:21:07 +0200
Subject: [PATCH 28/58] Add further targets to travis testing

general-32, general-64 and help

closes https://github.com/official-stockfish/Stockfish/pull/3014

No functional change
---
 .travis.yml | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 0dd38047..45f1bd3d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -51,11 +51,12 @@ script:
   - export benchref=$(cat git_sig)
   - echo "Reference bench:" $benchref
 
-  #
   # Compiler version string
   - $COMPILER -v
 
-  #
+  # test help target
+  - make help
+
   # Verify bench number against various builds
   - export CXXFLAGS="-Werror -D_GLIBCXX_DEBUG"
   - make clean && make -j2 ARCH=x86-64-modern optimize=no debug=yes build && ../tests/signature.sh $benchref
@@ -64,8 +65,10 @@ script:
   - make clean && make -j2 ARCH=x86-64-ssse3 build && ../tests/signature.sh $benchref
   - make clean && make -j2 ARCH=x86-64-sse3-popcnt build && ../tests/signature.sh $benchref
   - make clean && make -j2 ARCH=x86-64 build && ../tests/signature.sh $benchref
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-64 build && ../tests/signature.sh $benchref; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 optimize=no debug=yes build && ../tests/signature.sh $benchref; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 build && ../tests/signature.sh $benchref; fi
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-32 build && ../tests/signature.sh $benchref; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-old build && ../tests/signature.sh $benchref; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" && "$COMP" == "gcc" ]]; then make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref; fi
 

From 81d716f5ccff3f0898ae985b9ef69f79d014bdc5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ste=CC=81phane=20Nicolet?= <cassio@free.fr>
Date: Sun, 16 Aug 2020 21:46:54 +0200
Subject: [PATCH 29/58] Reformat code in little-endian patch

Reformat code and rename the function to "read_little_endian()" in the recent
commit by Ronald de Man for support of big endian systems.

closes https://github.com/official-stockfish/Stockfish/pull/3016

No functional change
-----

Recommended net: https://tests.stockfishchess.org/api/nn/nn-82215d0fd0df.nnue
---
 src/nnue/evaluate_nnue.cpp          | 14 +++++++-------
 src/nnue/layers/affine_transform.h  |  4 ++--
 src/nnue/nnue_common.h              | 30 +++++++++++++++--------------
 src/nnue/nnue_feature_transformer.h |  4 ++--
 4 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index 3aa85943..dfbb1ac2 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -77,7 +77,7 @@ namespace Eval::NNUE {
   bool ReadParameters(std::istream& stream, const AlignedPtr<T>& pointer) {
 
     std::uint32_t header;
-    header = read_le<std::uint32_t>(stream);
+    header = read_little_endian<std::uint32_t>(stream);
     if (!stream || header != T::GetHashValue()) return false;
     return pointer->ReadParameters(stream);
   }
@@ -92,13 +92,13 @@ namespace Eval::NNUE {
   }
 
   // Read network header
-  bool ReadHeader(std::istream& stream,
-    std::uint32_t* hash_value, std::string* architecture) {
-
+  bool ReadHeader(std::istream& stream, std::uint32_t* hash_value, std::string* architecture)
+  {
     std::uint32_t version, size;
-    version = read_le<std::uint32_t>(stream);
-    *hash_value = read_le<std::uint32_t>(stream);
-    size = read_le<std::uint32_t>(stream);
+
+    version     = read_little_endian<std::uint32_t>(stream);
+    *hash_value = read_little_endian<std::uint32_t>(stream);
+    size        = read_little_endian<std::uint32_t>(stream);
     if (!stream || version != kVersion) return false;
     architecture->resize(size);
     stream.read(&(*architecture)[0], size);
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index bac258e8..7ac5a1c0 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -63,9 +63,9 @@ namespace Eval::NNUE::Layers {
     bool ReadParameters(std::istream& stream) {
       if (!previous_layer_.ReadParameters(stream)) return false;
       for (std::size_t i = 0; i < kOutputDimensions; ++i)
-        biases_[i] = read_le<BiasType>(stream);
+        biases_[i] = read_little_endian<BiasType>(stream);
       for (std::size_t i = 0; i < kOutputDimensions * kPaddedInputDimensions; ++i)
-        weights_[i] = read_le<WeightType>(stream);
+        weights_[i] = read_little_endian<WeightType>(stream);
       return !stream.fail();
     }
 
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index 61f18aee..4c93e3d1 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -101,23 +101,25 @@ namespace Eval::NNUE {
   // Round n up to be a multiple of base
   template <typename IntType>
   constexpr IntType CeilToMultiple(IntType n, IntType base) {
-    return (n + base - 1) / base * base;
+      return (n + base - 1) / base * base;
   }
 
-  // Read a signed or unsigned integer from  a stream in little-endian order
+  // read_little_endian() is our utility to read an integer (signed or unsigned, any size)
+  // from a stream in little-endian order. We swap the byte order after the read if
+  // necessary to return a result with the byte ordering of the compiling machine.
   template <typename IntType>
-  inline IntType read_le(std::istream& stream) {
-    // Read the relevant bytes from the stream in little-endian order
-    std::uint8_t u[sizeof(IntType)];
-    stream.read(reinterpret_cast<char*>(u), sizeof(IntType));
-    // Use unsigned arithmetic to convert to machine order
-    typename std::make_unsigned<IntType>::type v = 0;
-    for (std::size_t i = 0; i < sizeof(IntType); ++i)
-      v = (v << 8) | u[sizeof(IntType) - i - 1];
-    // Copy the machine-ordered bytes into a potentially signed value
-    IntType w;
-    std::memcpy(&w, &v, sizeof(IntType));
-    return w;
+  inline IntType read_little_endian(std::istream& stream) {
+
+      IntType result;
+      std::uint8_t u[sizeof(IntType)];
+      typename std::make_unsigned<IntType>::type v = 0;
+
+      stream.read(reinterpret_cast<char*>(u), sizeof(IntType));
+      for (std::size_t i = 0; i < sizeof(IntType); ++i)
+          v = (v << 8) | u[sizeof(IntType) - i - 1];
+
+      std::memcpy(&result, &v, sizeof(IntType));
+      return result;
   }
 
 }  // namespace Eval::NNUE
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 4db9be9f..43707610 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -56,9 +56,9 @@ namespace Eval::NNUE {
     // Read network parameters
     bool ReadParameters(std::istream& stream) {
       for (std::size_t i = 0; i < kHalfDimensions; ++i)
-        biases_[i] = read_le<BiasType>(stream);
+        biases_[i] = read_little_endian<BiasType>(stream);
       for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i)
-        weights_[i] = read_le<WeightType>(stream);
+        weights_[i] = read_little_endian<WeightType>(stream);
       return !stream.fail();
     }
 

From 0e17a89e4dee73bd46e496cf6bed467432f116e6 Mon Sep 17 00:00:00 2001
From: Unai Corzo <corzounai@gmail.com>
Date: Mon, 17 Aug 2020 09:22:15 +0200
Subject: [PATCH 30/58] Simplify away the passed pawn extension

STC https://tests.stockfishchess.org/tests/view/5f3955f0e98b6c64b3df41d7
LLR: 2.96 (-2.94,2.94) {-1.50,0.50}
Total: 31992 W: 3611 L: 3548 D: 24833
Ptnml(0-2): 174, 2658, 10273, 2713, 178

LTC https://tests.stockfishchess.org/tests/view/5f399e41e98b6c64b3df4210
LLR: 3.01 (-2.94,2.94) {-1.50,0.50}
Total: 29568 W: 1488 L: 1480 D: 26600
Ptnml(0-2): 40, 1272, 12142, 1300, 30

closes https://github.com/official-stockfish/Stockfish/pull/3017

bench: 3844671

-----

Recommended net: https://tests.stockfishchess.org/api/nn/nn-82215d0fd0df.nnue
---
 src/search.cpp | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index c5b4332f..83fb722f 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1126,12 +1126,6 @@ moves_loop: // When in check, search starts from here
                && (pos.is_discovery_check_on_king(~us, move) || pos.see_ge(move)))
           extension = 1;
 
-      // Passed pawn extension
-      else if (   move == ss->killers[0]
-               && pos.advanced_pawn_push(move)
-               && pos.pawn_passed(us, to_sq(move)))
-          extension = 1;
-
       // Last captures extension
       else if (   PieceValue[EG][pos.captured_piece()] > PawnValueEg
                && pos.non_pawn_material() <= 2 * RookValueMg)

From 65b976439f8867e81682c0b66da6796ad3176177 Mon Sep 17 00:00:00 2001
From: notruck <56622488+notruck@users.noreply.github.com>
Date: Sun, 16 Aug 2020 08:59:13 -0700
Subject: [PATCH 31/58] Support building for Android using NDK

The easiest way to use the NDK in conjunction with this Makefile (tested on linux-x86_64):

1. Download the latest NDK (r21d) from Google from https://developer.android.com/ndk/downloads
2. Place and unzip the NDK in $HOME/ndk folder
3. Export the path variable e.g., `export PATH=$PATH:$HOME/ndk/android-ndk-r21d/toolchains/llvm/prebuilt/linux-x86_64/bin`
4. cd to your Stockfish/src dir
5. Issue `make -j ARCH=armv8 COMP=ndk build`  (use `ARCH=armv7` or `ARCH=armv7-neon` for older CPUs)
6. Optionally `make -j ARCH=armv8 COMP=ndk strip`
7. That's all. Enjoy!

Improves support from Raspberry Pi (incomplete?) and compiling on arm in general

closes https://github.com/official-stockfish/Stockfish/pull/3015

fixes https://github.com/official-stockfish/Stockfish/issues/2860

fixes https://github.com/official-stockfish/Stockfish/issues/2641

Support is still fragile as we're missing CI on these targets. Nevertheless tested with:

```bash
  # build crosses from ubuntu 20.04 on x86 to various arch/OS combos
  # tested with suitable packages installed
  # (build-essentials, mingw-w64, g++-arm-linux-gnueabihf, NDK (r21d) from google)

  # cross to Android
  export PATH=$HOME/ndk/android-ndk-r21d/toolchains/llvm/prebuilt/linux-x86_64/bin:$PATH
  make clean && make -j build ARCH=armv7         COMP=ndk  && make -j build ARCH=armv7 COMP=ndk strip
  make clean && make -j build ARCH=armv7-neon    COMP=ndk  && make -j build ARCH=armv7-neon COMP=ndk strip
  make clean && make -j build ARCH=armv8         COMP=ndk  && make -j build ARCH=armv8 COMP=ndk strip

  # cross to Raspberry Pi
  make clean && make -j build ARCH=armv7         COMP=gcc COMPILER=arm-linux-gnueabihf-g++
  make clean && make -j build ARCH=armv7-neon    COMP=gcc COMPILER=arm-linux-gnueabihf-g++

  # cross to Windows
  make clean && make -j build ARCH=x86-64-modern COMP=mingw
```

No functional change
---
 AUTHORS      |  1 +
 src/Makefile | 65 ++++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 56 insertions(+), 10 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index 41b89705..d8f4d30e 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -127,6 +127,7 @@ Niklas Fiekas (niklasf)
 Nikolay Kostov (NikolayIT)
 Nguyen Pham (nguyenpham)
 Norman Schmidt (FireFather)
+notruck
 Ondrej Mosnáček (WOnder93)
 Oskar Werkelin Ahlin
 Pablo Vazquez
diff --git a/src/Makefile b/src/Makefile
index 38f607cb..0f458aa1 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -102,6 +102,7 @@ avx512 = no
 vnni = no
 neon = no
 ARCH = x86-64-modern
+STRIP = strip
 
 ### 2.2 Architecture specific
 
@@ -208,6 +209,14 @@ ifeq ($(ARCH),armv7)
 	bits = 32
 endif
 
+ifeq ($(ARCH),armv7-neon)
+	arch = armv7
+	prefetch = yes
+	popcnt = yes
+	neon = yes
+	bits = 32
+endif
+
 ifeq ($(ARCH),armv8)
 	arch = armv8-a
 	prefetch = yes
@@ -251,7 +260,7 @@ ifeq ($(COMP),gcc)
 	CXX=g++
 	CXXFLAGS += -pedantic -Wextra -Wshadow
 
-	ifeq ($(ARCH),$(filter $(ARCH),armv7 armv8))
+	ifeq ($(arch),$(filter $(arch),armv7 armv8-a))
 		ifeq ($(OS),Android)
 			CXXFLAGS += -m$(bits)
 			LDFLAGS += -m$(bits)
@@ -261,6 +270,10 @@ ifeq ($(COMP),gcc)
 		LDFLAGS += -m$(bits)
 	endif
 
+	ifeq ($(arch),$(filter $(arch),armv7))
+		LDFLAGS += -latomic
+	endif
+
 	ifneq ($(KERNEL),Darwin)
 	   LDFLAGS += -Wl,--no-as-needed
 	endif
@@ -311,7 +324,7 @@ ifeq ($(COMP),clang)
 	endif
 	endif
 
-	ifeq ($(ARCH),$(filter $(ARCH),armv7 armv8))
+	ifeq ($(arch),$(filter $(arch),armv7 armv8))
 		ifeq ($(OS),Android)
 			CXXFLAGS += -m$(bits)
 			LDFLAGS += -m$(bits)
@@ -340,6 +353,25 @@ ifeq ($(KERNEL),Darwin)
 	LDFLAGS += -arch $(arch) -mmacosx-version-min=10.14
 endif
 
+# To cross-compile for Android, NDK version r21 or later is recommended.
+# In earlier NDK versions, you'll need to pass -fno-addrsig if using GNU binutils.
+# Currently we don't know how to make PGO builds with the NDK yet.
+ifeq ($(COMP),ndk)
+	CXXFLAGS += -stdlib=libc++ -fPIE
+	ifeq ($(arch),armv7)
+		comp=armv7a-linux-androideabi16-clang
+		CXX=armv7a-linux-androideabi16-clang++
+		CXXFLAGS += -mthumb -march=armv7-a -mfloat-abi=softfp -mfpu=neon
+		STRIP=arm-linux-androideabi-strip
+	endif
+	ifeq ($(arch),armv8-a)
+		comp=aarch64-linux-android21-clang
+		CXX=aarch64-linux-android21-clang++
+		STRIP=aarch64-linux-android-strip
+	endif
+	LDFLAGS += -static-libstdc++ -pie -lm -latomic
+endif
+
 ### Travis CI script uses COMPILER to overwrite CXX
 ifdef COMPILER
 	COMPCXX=$(COMPILER)
@@ -356,7 +388,9 @@ ifneq ($(comp),mingw)
 	ifneq ($(OS),Android)
 		# Haiku has pthreads in its libroot, so only link it in on other platforms
 		ifneq ($(KERNEL),Haiku)
-			LDFLAGS += -lpthread
+			ifneq ($(COMP),ndk)
+				LDFLAGS += -lpthread
+			endif
 		endif
 	endif
 endif
@@ -401,7 +435,6 @@ endif
 ifeq ($(prefetch),yes)
 	ifeq ($(sse),yes)
 		CXXFLAGS += -msse
-		DEPENDFLAGS += -msse
 	endif
 else
 	CXXFLAGS += -DNO_PREFETCH
@@ -409,7 +442,7 @@ endif
 
 ### 3.6 popcnt
 ifeq ($(popcnt),yes)
-	ifeq ($(arch),$(filter $(arch),ppc64 armv8-a arm64))
+	ifeq ($(arch),$(filter $(arch),ppc64 armv7 armv8-a arm64))
 		CXXFLAGS += -DUSE_POPCNT
 	else ifeq ($(comp),icc)
 		CXXFLAGS += -msse3 -DUSE_POPCNT
@@ -418,6 +451,7 @@ ifeq ($(popcnt),yes)
 	endif
 endif
 
+
 ifeq ($(avx2),yes)
 	CXXFLAGS += -DUSE_AVX2
 	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
@@ -462,6 +496,11 @@ endif
 
 ifeq ($(neon),yes)
 	CXXFLAGS += -DUSE_NEON
+	ifeq ($(KERNEL),Linux)
+	ifneq ($(COMP),ndk)
+		CXXFLAGS += -mfpu=neon
+	endif
+	endif
 endif
 
 ifeq ($(arch),x86_64)
@@ -481,7 +520,10 @@ endif
 ### needs access to the optimization flags.
 ifeq ($(optimize),yes)
 ifeq ($(debug), no)
-	ifeq ($(comp),clang)
+	ifeq ($(COMP),ndk)
+		CXXFLAGS += -flto=thin
+		LDFLAGS += $(CXXFLAGS)
+	else ifeq ($(comp),clang)
 		CXXFLAGS += -flto=thin
 		LDFLAGS += $(CXXFLAGS)
 
@@ -502,7 +544,7 @@ ifeq ($(debug), no)
 	endif
 
 # To use LTO and static linking on windows, the tool chain requires a recent gcc:
-# gcc version 10.1 in msys2 or TDM-GCC version 9.2 are know to work, older might not.
+# gcc version 10.1 in msys2 or TDM-GCC version 9.2 are known to work, older might not.
 # So, only enable it for a cross from Linux by default.
 	else ifeq ($(comp),mingw)
 	ifeq ($(KERNEL),Linux)
@@ -556,7 +598,8 @@ help:
 	@echo "ppc-64                  > PPC 64-bit"
 	@echo "ppc-32                  > PPC 32-bit"
 	@echo "armv7                   > ARMv7 32-bit"
-	@echo "armv8                   > ARMv8 64-bit"
+	@echo "armv7-neon"             > ARMv7 32-bit with popcnt and neon"
+	@echo "armv8                   > ARMv8 64-bit with popcnt and neon"
 	@echo "apple-silicon           > Apple silicon ARM64"
 	@echo "general-64              > unspecified 64-bit"
 	@echo "general-32              > unspecified 32-bit"
@@ -567,6 +610,7 @@ help:
 	@echo "mingw                   > Gnu compiler with MinGW under Windows"
 	@echo "clang                   > LLVM Clang compiler"
 	@echo "icc                     > Intel compiler"
+	@echo "ndk                     > Google NDK to cross-compile for Android"
 	@echo ""
 	@echo "Simple examples. If you don't know what to do, you likely want to run: "
 	@echo ""
@@ -609,7 +653,7 @@ profile-build: net config-sanity objclean profileclean
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) profileclean
 
 strip:
-	strip $(EXE)
+	$(STRIP) $(EXE)
 
 install:
 	-mkdir -p -m 755 $(BINDIR)
@@ -693,7 +737,8 @@ config-sanity:
 	@test "$(avx512)" = "yes" || test "$(avx512)" = "no"
 	@test "$(vnni)" = "yes" || test "$(vnni)" = "no"
 	@test "$(neon)" = "yes" || test "$(neon)" = "no"
-	@test "$(comp)" = "gcc" || test "$(comp)" = "icc" || test "$(comp)" = "mingw" || test "$(comp)" = "clang"
+	@test "$(comp)" = "gcc" || test "$(comp)" = "icc" || test "$(comp)" = "mingw" || test "$(comp)" = "clang" \
+	|| test "$(comp)" = "armv7a-linux-androideabi16-clang"  || test "$(comp)" = "aarch64-linux-android21-clang"
 
 $(EXE): $(OBJS)
 	+$(CXX) -o $@ $(OBJS) $(LDFLAGS)

From 1c0b7bdf4f77b8160cebe8af96b28230e870a136 Mon Sep 17 00:00:00 2001
From: VoyagerOne <excelgeek@gmail.com>
Date: Mon, 17 Aug 2020 08:58:03 -0400
Subject: [PATCH 32/58] Remove history bonus from Eval

STC:
LLR: 2.92 (-2.94,2.94) {-1.50,0.50}
Total: 26776 W: 2787 L: 2725 D: 21264
https://tests.stockfishchess.org/tests/view/5f39d6beb38d442594aabd9b

LTC:
LLR: 2.93 (-2.94,2.94) {-1.50,0.50}
Total: 12968 W: 635 L: 608 D: 11725
https://tests.stockfishchess.org/tests/view/5f39decfb38d442594aabda7

closes https://github.com/official-stockfish/Stockfish/pull/3019

Bench:  4335100
---
 src/search.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index 83fb722f..7c839dfc 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -794,11 +794,7 @@ namespace {
     else
     {
         if ((ss-1)->currentMove != MOVE_NULL)
-        {
-            int bonus = -(ss-1)->statScore / 512;
-
-            ss->staticEval = eval = evaluate(pos) + bonus;
-        }
+            ss->staticEval = eval = evaluate(pos);
         else
             ss->staticEval = eval = -(ss-1)->staticEval + 2 * Tempo;
 

From 581b92e4a70b99fa5a22f7a1a38f2c8d2099769f Mon Sep 17 00:00:00 2001
From: Unai Corzo <corzounai@gmail.com>
Date: Mon, 17 Aug 2020 18:22:32 +0200
Subject: [PATCH 33/58] Remove last captures extension

STC https://tests.stockfishchess.org/tests/view/5f395657e98b6c64b3df41dd
LLR: 2.95 (-2.94,2.94) {-1.50,0.50}
Total: 144664 W: 15426 L: 15537 D: 113701
Ptnml(0-2): 612, 11341, 48537, 11230, 612

LTC https://tests.stockfishchess.org/tests/view/5f3a2ec7b38d442594aabdd7
LLR: 2.96 (-2.94,2.94) {-1.50,0.50}
Total: 22728 W: 1161 L: 1146 D: 20421
Ptnml(0-2): 21, 960, 9388, 973, 22

closes https://github.com/official-stockfish/Stockfish/pull/3020

bench: 3832662
---
 src/search.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index 7c839dfc..1d5bc5f7 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1122,11 +1122,6 @@ moves_loop: // When in check, search starts from here
                && (pos.is_discovery_check_on_king(~us, move) || pos.see_ge(move)))
           extension = 1;
 
-      // Last captures extension
-      else if (   PieceValue[EG][pos.captured_piece()] > PawnValueEg
-               && pos.non_pawn_material() <= 2 * RookValueMg)
-          extension = 1;
-
       // Castling extension
       if (   type_of(move) == CASTLING
           && popcount(pos.pieces(us) & ~pos.pieces(PAWN) & (to_sq(move) & KingSide ? KingSide : QueenSide)) <= 2)

From 1bcc981a5a70e3065b4ff588644f270136fd7e3c Mon Sep 17 00:00:00 2001
From: mstembera <MissingEmail@email>
Date: Sun, 16 Aug 2020 15:23:50 -0700
Subject: [PATCH 34/58] Fallback to NNUE

If the classical eval ends up much smaller than estimated fall back to NNUE.
Also use multiply instead of divide for the threshold comparison for smoother transitions without rounding.

STC https://tests.stockfishchess.org/tests/view/5f3a5011b38d442594aabdfe
LLR: 2.96 (-2.94,2.94) {-0.50,1.50}
Total: 57352 W: 6325 L: 6135 D: 44892
Ptnml(0-2): 277, 4748, 18482, 4846, 323

LTC https://tests.stockfishchess.org/tests/view/5f3aee9db38d442594aabe82
LLR: 2.95 (-2.94,2.94) {0.25,1.75}
Total: 16232 W: 897 L: 781 D: 14554
Ptnml(0-2): 19, 679, 6616, 771, 31

closes https://github.com/official-stockfish/Stockfish/pull/3023

bench: 4026216

-----

Recommended net: https://tests.stockfishchess.org/api/nn/nn-82215d0fd0df.nnue
---
 src/evaluate.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 3a620a78..1bd89353 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -114,7 +114,8 @@ namespace {
   constexpr Value LazyThreshold1 =  Value(1400);
   constexpr Value LazyThreshold2 =  Value(1300);
   constexpr Value SpaceThreshold = Value(12222);
-  constexpr Value NNUEThreshold  =   Value(575);
+  constexpr Value NNUEThreshold1 =   Value(550);
+  constexpr Value NNUEThreshold2 =   Value(150);
 
   // KingAttackWeights[PieceType] contains king attack weights by piece type
   constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 81, 52, 44, 10 };
@@ -939,10 +940,13 @@ make_v:
 Value Eval::evaluate(const Position& pos) {
 
   bool classical = !Eval::useNNUE
-                ||  abs(eg_value(pos.psq_score())) >= NNUEThreshold * (16 + pos.rule50_count()) / 16;
+                ||  abs(eg_value(pos.psq_score())) * 16 > NNUEThreshold1 * (16 + pos.rule50_count());
   Value v = classical ? Evaluation<NO_TRACE>(pos).value()
                       : NNUE::evaluate(pos) * 5 / 4 + Tempo;
 
+  if (classical && Eval::useNNUE && abs(v) * 16 < NNUEThreshold2 * (16 + pos.rule50_count()))
+      v = NNUE::evaluate(pos) * 5 / 4 + Tempo;
+
   // Damp down the evaluation linearly when shuffling
   v = v * (100 - pos.rule50_count()) / 100;
 

From fbae5614eb1e82bccd37fbcfb0d2ca388b7a9a7d Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Tue, 18 Aug 2020 08:49:06 +0200
Subject: [PATCH 35/58] Fix Makefile typo

remove stray quote, shown with `make help`

No functional change
---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 0f458aa1..1f8ba455 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -598,7 +598,7 @@ help:
 	@echo "ppc-64                  > PPC 64-bit"
 	@echo "ppc-32                  > PPC 32-bit"
 	@echo "armv7                   > ARMv7 32-bit"
-	@echo "armv7-neon"             > ARMv7 32-bit with popcnt and neon"
+	@echo "armv7-neon              > ARMv7 32-bit with popcnt and neon"
 	@echo "armv8                   > ARMv8 64-bit with popcnt and neon"
 	@echo "apple-silicon           > Apple silicon ARM64"
 	@echo "general-64              > unspecified 64-bit"

From 384d6844841e9f2da8f5a913c7620440f9e05ab5 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Tue, 18 Aug 2020 18:06:28 +0200
Subject: [PATCH 36/58] Better error message on missing curl/wget

provide clean error/warning message for missing curl/wget, sha256sum/shasum

fixes https://github.com/official-stockfish/Stockfish/issues/3025

closes https://github.com/official-stockfish/Stockfish/pull/3026

No functional change
---
 src/Makefile | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 1f8ba455..a3feb68e 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -669,9 +669,24 @@ net:
 	@echo "Default net: $(nnuenet)"
 	$(eval nnuedownloadurl := https://tests.stockfishchess.org/api/nn/$(nnuenet))
 	$(eval curl_or_wget := $(shell if hash curl 2>/dev/null; then echo "curl -skL"; elif hash wget 2>/dev/null; then echo "wget -qO-"; fi))
-	@if test -f "$(nnuenet)"; then echo "Already available."; else echo "Downloading $(nnuedownloadurl)"; $(curl_or_wget) $(nnuedownloadurl) > $(nnuenet); fi
+	@if test -f "$(nnuenet)"; then \
+            echo "Already available."; \
+         else \
+            if [ "x$(curl_or_wget)" = "x" ]; then \
+               echo "Automatic download failed: neither curl nor wget is installed. Install one of these tools or download the net manually"; exit 1; \
+            else \
+               echo "Downloading $(nnuedownloadurl)"; $(curl_or_wget) $(nnuedownloadurl) > $(nnuenet);\
+            fi; \
+        fi;
 	$(eval shasum_command := $(shell if hash shasum 2>/dev/null; then echo "shasum -a 256 "; elif hash sha256sum 2>/dev/null; then echo "sha256sum "; fi))
-	@if [ "$(nnuenet)" != "nn-"`$(shasum_command) $(nnuenet) | cut -c1-12`".nnue" ]; then echo "Failed download or $(nnuenet) corrupted, please delete!"; exit 1; fi
+	@if [ "x$(shasum_command)" != "x" ]; then \
+	    if [ "$(nnuenet)" != "nn-"`$(shasum_command) $(nnuenet) | cut -c1-12`".nnue" ]; then \
+                echo "Failed download or $(nnuenet) corrupted, please delete!"; exit 1; \
+            fi \
+         else \
+            echo "shasum / sha256sum not found, skipping net validation"; \
+        fi
+
 
 # clean binaries and objects
 objclean:

From 42e8789f0b3935b7ea389b3aa929e05e0a016872 Mon Sep 17 00:00:00 2001
From: syzygy1 <3028851+syzygy1@users.noreply.github.com>
Date: Tue, 18 Aug 2020 01:56:12 +0200
Subject: [PATCH 37/58] Expanded support for x86-32 architectures.

add new ARCH targets

x86-32-sse41-popcnt     > x86 32-bit with sse41 and popcnt support
x86-32-sse2             > x86 32-bit with sse2 support
x86-32                  > x86 32-bit generic (with mmx and sse support)

retire x86-32-old (use general-32)

closes https://github.com/official-stockfish/Stockfish/pull/3022

No functional change.
---
 .travis.yml  |   3 +-
 src/Makefile | 145 +++++++++++++++++++++++++++++++--------------------
 2 files changed, 91 insertions(+), 57 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 45f1bd3d..12596f1e 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -67,9 +67,10 @@ script:
   - make clean && make -j2 ARCH=x86-64 build && ../tests/signature.sh $benchref
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-64 build && ../tests/signature.sh $benchref; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 optimize=no debug=yes build && ../tests/signature.sh $benchref; fi
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-sse41-popcnt build && ../tests/signature.sh $benchref; fi
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-sse2 build && ../tests/signature.sh $benchref; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 build && ../tests/signature.sh $benchref; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-32 build && ../tests/signature.sh $benchref; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-old build && ../tests/signature.sh $benchref; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" && "$COMP" == "gcc" ]]; then make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref; fi
 
   # compile only for some more advanced architectures (might not run in travis)
diff --git a/src/Makefile b/src/Makefile
index a3feb68e..79c7333a 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -67,11 +67,13 @@ endif
 # bits = 64/32        --- -DIS_64BIT       --- 64-/32-bit operating system
 # prefetch = yes/no   --- -DUSE_PREFETCH   --- Use prefetch asm-instruction
 # popcnt = yes/no     --- -DUSE_POPCNT     --- Use popcnt asm-instruction
+# pext = yes/no       --- -DUSE_PEXT       --- Use pext x86_64 asm-instruction
 # sse = yes/no        --- -msse            --- Use Intel Streaming SIMD Extensions
+# mmx = yes/no        --- -mmmx            --- Use Intel MMX instructions
+# sse2 = yes/no       --- -msse2           --- Use Intel Streaming SIMD Extensions 2
 # ssse3 = yes/no      --- -mssse3          --- Use Intel Supplemental Streaming SIMD Extensions 3
 # sse41 = yes/no      --- -msse4.1         --- Use Intel Streaming SIMD Extensions 4.1
 # avx2 = yes/no       --- -mavx2           --- Use Intel Advanced Vector Extensions 2
-# pext = yes/no       --- -DUSE_PEXT       --- Use pext x86_64 asm-instruction
 # avx512 = yes/no     --- -mavx512bw       --- Use Intel Advanced Vector Extensions 512
 # vnni = yes/no       --- -mavx512vnni     --- Use Intel Vector Neural Network Instructions 512
 # neon = yes/no       --- -DUSE_NEON       --- Use ARM SIMD architecture
@@ -92,12 +94,13 @@ sanitize = no
 bits = 64
 prefetch = no
 popcnt = no
-mmx = no
+pext = no
 sse = no
+mmx = no
+sse2 = no
 ssse3 = no
 sse41 = no
 avx2 = no
-pext = no
 avx512 = no
 vnni = no
 neon = no
@@ -106,83 +109,82 @@ STRIP = strip
 
 ### 2.2 Architecture specific
 
-ifeq ($(ARCH),general-32)
-	arch = any
-	bits = 32
-endif
+ifeq ($(findstring x86,$(ARCH)),x86)
 
-ifeq ($(ARCH),x86-32-old)
+# x86-32/64
+
+ifeq ($(findstring x86-32,$(ARCH)),x86-32)
 	arch = i386
 	bits = 32
-endif
-
-ifeq ($(ARCH),x86-32)
-	arch = i386
-	bits = 32
-	prefetch = yes
+	sse = yes
 	mmx = yes
-	sse = yes
-endif
-
-ifeq ($(ARCH),general-64)
-	arch = any
-endif
-
-ifeq ($(ARCH),x86-64)
+else
 	arch = x86_64
-	prefetch = yes
+	sse = yes
+	sse2 = yes
+endif
+
+ifeq ($(findstring -sse,$(ARCH)),-sse)
 	sse = yes
 endif
 
-ifeq ($(ARCH),x86-64-sse3-popcnt)
-	arch = x86_64
-	prefetch = yes
-	sse = yes
+ifeq ($(findstring -popcnt,$(ARCH)),-popcnt)
 	popcnt = yes
 endif
 
-ifeq ($(ARCH),x86-64-ssse3)
-	arch = x86_64
-	prefetch = yes
+ifeq ($(findstring -mmx,$(ARCH)),-mmx)
+	mmx = yes
+endif
+
+ifeq ($(findstring -sse2,$(ARCH)),-sse2)
 	sse = yes
+	sse2 = yes
+endif
+
+ifeq ($(findstring -ssse3,$(ARCH)),-ssse3)
+	sse = yes
+	sse2 = yes
 	ssse3 = yes
 endif
 
-ifeq ($(ARCH),$(filter $(ARCH),x86-64-sse41-popcnt x86-64-modern))
-	arch = x86_64
-	prefetch = yes
-	popcnt = yes
+ifeq ($(findstring -sse41,$(ARCH)),-sse41)
 	sse = yes
+	sse2 = yes
 	ssse3 = yes
 	sse41 = yes
 endif
 
-ifeq ($(ARCH),x86-64-avx2)
-	arch = x86_64
-	prefetch = yes
+ifeq ($(findstring -modern,$(ARCH)),-modern)
 	popcnt = yes
 	sse = yes
+	sse2 = yes
+	ssse3 = yes
+	sse41 = yes
+endif
+
+ifeq ($(findstring -avx2,$(ARCH)),-avx2)
+	popcnt = yes
+	sse = yes
+	sse2 = yes
 	ssse3 = yes
 	sse41 = yes
 	avx2 = yes
 endif
 
-ifeq ($(ARCH),x86-64-bmi2)
-	arch = x86_64
-	prefetch = yes
+ifeq ($(findstring -bmi2,$(ARCH)),-bmi2)
 	popcnt = yes
 	sse = yes
+	sse2 = yes
 	ssse3 = yes
 	sse41 = yes
 	avx2 = yes
 	pext = yes
 endif
 
-ifeq ($(ARCH),x86-64-avx512)
-	arch = x86_64
-	prefetch = yes
+ifeq ($(findstring -avx512,$(ARCH)),-avx512)
 	popcnt = yes
 	sse = yes
+	sse2 = yes
 	ssse3 = yes
 	sse41 = yes
 	avx2 = yes
@@ -190,11 +192,10 @@ ifeq ($(ARCH),x86-64-avx512)
 	avx512 = yes
 endif
 
-ifeq ($(ARCH),x86-64-vnni)
-	arch = x86_64
-	prefetch = yes
+ifeq ($(findstring -vnni,$(ARCH)),-vnni)
 	popcnt = yes
 	sse = yes
+	sse2 = yes
 	ssse3 = yes
 	sse41 = yes
 	avx2 = yes
@@ -203,6 +204,28 @@ ifeq ($(ARCH),x86-64-vnni)
 	vnni = yes
 endif
 
+ifeq ($(sse),yes)
+	prefetch = yes
+endif
+
+# 64-bit pext is not available on x86-32
+ifeq ($(bits),32)
+	pext = no
+endif
+
+else
+
+# all other architectures
+
+ifeq ($(ARCH),general-32)
+	arch = any
+	bits = 32
+endif
+
+ifeq ($(ARCH),general-64)
+	arch = any
+endif
+
 ifeq ($(ARCH),armv7)
 	arch = armv7
 	prefetch = yes
@@ -242,6 +265,8 @@ ifeq ($(ARCH),ppc-64)
 	prefetch = yes
 endif
 
+endif
+
 ### ==========================================================================
 ### Section 3. Low-level Configuration
 ### ==========================================================================
@@ -487,6 +512,13 @@ ifeq ($(ssse3),yes)
 	endif
 endif
 
+ifeq ($(sse2),yes)
+	CXXFLAGS += -DUSE_SSE2
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
+		CXXFLAGS += -msse2
+	endif
+endif
+
 ifeq ($(mmx),yes)
 	CXXFLAGS += -DUSE_MMX
 	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
@@ -503,10 +535,6 @@ ifeq ($(neon),yes)
 	endif
 endif
 
-ifeq ($(arch),x86_64)
-	CXXFLAGS += -msse2 -DUSE_SSE2
-endif
-
 ### 3.7 pext
 ifeq ($(pext),yes)
 	CXXFLAGS += -DUSE_PEXT
@@ -592,9 +620,10 @@ help:
 	@echo "x86-64-modern           > common modern CPU, currently x86-64-sse41-popcnt"
 	@echo "x86-64-ssse3            > x86 64-bit with ssse3 support"
 	@echo "x86-64-sse3-popcnt      > x86 64-bit with sse3 and popcnt support"
-	@echo "x86-64                  > x86 64-bit generic"
-	@echo "x86-32                  > x86 32-bit (also enables MMX and SSE)"
-	@echo "x86-32-old              > x86 32-bit fall back for old hardware"
+	@echo "x86-64                  > x86 64-bit generic (with sse2 support)"
+	@echo "x86-32-sse41-popcnt     > x86 32-bit with sse41 and popcnt support"
+	@echo "x86-32-sse2             > x86 32-bit with sse2 support"
+	@echo "x86-32                  > x86 32-bit generic (with mmx and sse support)"
 	@echo "ppc-64                  > PPC 64-bit"
 	@echo "ppc-32                  > PPC 32-bit"
 	@echo "armv7                   > ARMv7 32-bit"
@@ -624,7 +653,7 @@ help:
 	@echo "make -j build ARCH=x86-64-ssse3 COMP=clang"
 	@echo ""
 ifneq ($(empty_arch), yes)
-	@echo "-------------------------------\n"
+	@echo "-------------------------------"
 	@echo "The selected architecture $(ARCH) will enable the following configuration: "
 	@$(MAKE) ARCH=$(ARCH) COMP=$(COMP) config-sanity
 endif
@@ -719,11 +748,13 @@ config-sanity:
 	@echo "os: '$(OS)'"
 	@echo "prefetch: '$(prefetch)'"
 	@echo "popcnt: '$(popcnt)'"
+	@echo "pext: '$(pext)'"
 	@echo "sse: '$(sse)'"
+	@echo "mmx: '$(mmx)'"
+	@echo "sse2: '$(sse2)'"
 	@echo "ssse3: '$(ssse3)'"
 	@echo "sse41: '$(sse41)'"
 	@echo "avx2: '$(avx2)'"
-	@echo "pext: '$(pext)'"
 	@echo "avx512: '$(avx512)'"
 	@echo "vnni: '$(vnni)'"
 	@echo "neon: '$(neon)'"
@@ -744,11 +775,13 @@ config-sanity:
 	@test "$(bits)" = "32" || test "$(bits)" = "64"
 	@test "$(prefetch)" = "yes" || test "$(prefetch)" = "no"
 	@test "$(popcnt)" = "yes" || test "$(popcnt)" = "no"
+	@test "$(pext)" = "yes" || test "$(pext)" = "no"
 	@test "$(sse)" = "yes" || test "$(sse)" = "no"
+	@test "$(mmx)" = "yes" || test "$(mmx)" = "no"
+	@test "$(sse2)" = "yes" || test "$(sse2)" = "no"
 	@test "$(ssse3)" = "yes" || test "$(ssse3)" = "no"
 	@test "$(sse41)" = "yes" || test "$(sse41)" = "no"
 	@test "$(avx2)" = "yes" || test "$(avx2)" = "no"
-	@test "$(pext)" = "yes" || test "$(pext)" = "no"
 	@test "$(avx512)" = "yes" || test "$(avx512)" = "no"
 	@test "$(vnni)" = "yes" || test "$(vnni)" = "no"
 	@test "$(neon)" = "yes" || test "$(neon)" = "no"

From 2deb08a52946379d4cebb1082e5d740d1d027122 Mon Sep 17 00:00:00 2001
From: SFisGOD <jonathandumale@gmail.com>
Date: Tue, 18 Aug 2020 18:54:28 +0800
Subject: [PATCH 38/58] Reintroduce last captures extension

STC:
LLR: 2.93 (-2.94,2.94) {-0.50,1.50}
Total: 34840 W: 3834 L: 3682 D: 27324
Ptnml(0-2): 153, 2767, 11455, 2865, 180
https://tests.stockfishchess.org/tests/view/5f3bb380b38d442594aabefc

LTC:
LLR: 2.95 (-2.94,2.94) {0.25,1.75}
Total: 15832 W: 890 L: 776 D: 14166
Ptnml(0-2): 17, 669, 6429, 785, 16
https://tests.stockfishchess.org/tests/view/5f3c46a0a95672ddd56c632a

closes https://github.com/official-stockfish/Stockfish/pull/3028

see also https://github.com/official-stockfish/Stockfish/pull/3020

Bench: 4348811
---
 src/search.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/search.cpp b/src/search.cpp
index 1d5bc5f7..7c839dfc 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1122,6 +1122,11 @@ moves_loop: // When in check, search starts from here
                && (pos.is_discovery_check_on_king(~us, move) || pos.see_ge(move)))
           extension = 1;
 
+      // Last captures extension
+      else if (   PieceValue[EG][pos.captured_piece()] > PawnValueEg
+               && pos.non_pawn_material() <= 2 * RookValueMg)
+          extension = 1;
+
       // Castling extension
       if (   type_of(move) == CASTLING
           && popcount(pos.pieces(us) & ~pos.pieces(PAWN) & (to_sq(move) & KingSide ? KingSide : QueenSide)) <= 2)

From a1ad8604a11459a94189f857e368d0fbb72da25d Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Wed, 19 Aug 2020 19:21:41 +0200
Subject: [PATCH 39/58] Send error message as an UCI info string

some GUIs do not show the error message when the engine terminates in the no-net case, as it is send to cerr.
Instead send it as an info string, which the GUI will more likely display.

closes https://github.com/official-stockfish/Stockfish/pull/3031

No functional change.
---
 src/evaluate.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 1bd89353..c84d894f 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -53,10 +53,11 @@ namespace Eval {
         UCI::OptionsMap defaults;
         UCI::init(defaults);
 
-        std::cerr << "NNUE evaluation used, but the network file " << eval_file << " was not loaded successfully. "
-                  << "These network evaluation parameters must be available, and compatible with this version of the code. "
-                  << "The UCI option EvalFile might need to specify the full path, including the directory/folder name, to the file. "
-                  << "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/"+std::string(defaults["EvalFile"]) << std::endl;
+        sync_cout << "info string ERROR: NNUE evaluation used, but the network file " << eval_file << " was not loaded successfully." << sync_endl;
+        sync_cout << "info string ERROR: The UCI option EvalFile might need to specify the full path, including the directory/folder name, to the file." << sync_endl;
+        sync_cout << "info string ERROR: The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/"+std::string(defaults["EvalFile"]) << sync_endl;
+        sync_cout << "info string ERROR: If the UCI option Use NNUE is set to true, network evaluation parameters compatible with the program must be available." << sync_endl;
+        sync_cout << "info string ERROR: The engine will be terminated now." << sync_endl;
         std::exit(EXIT_FAILURE);
     }
 

From daac86691de55fe388b9b727794c7d27f2b90d5c Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Thu, 20 Aug 2020 14:24:49 +0200
Subject: [PATCH 40/58] Set Use NNUE by default to true

Since the initial stages of the merge, progress has been made so that
this seems the best option now:

* NNUE is clearly stronger on most relevant hardware and time controls
* All of our CI and testing infrastructure has been adjusted
* The default net is easy to get (further ideas #3030)

fixes https://github.com/official-stockfish/Stockfish/issues/2861

closes https://github.com/official-stockfish/Stockfish/pull/3033

No functional change.
---
 src/ucioption.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 2b66a475..ec83c7c8 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -78,7 +78,7 @@ void init(OptionsMap& o) {
   o["SyzygyProbeDepth"]      << Option(1, 1, 100);
   o["Syzygy50MoveRule"]      << Option(true);
   o["SyzygyProbeLimit"]      << Option(7, 0, 7);
-  o["Use NNUE"]              << Option(false, on_use_NNUE);
+  o["Use NNUE"]              << Option(true, on_use_NNUE);
   // The default must follow the format nn-[SHA256 first 12 digits].nnue
   // for the build process (profile-build and fishtest) to work.
   o["EvalFile"]              << Option("nn-82215d0fd0df.nnue", on_eval_file);

From 8b45b1c4907b4b2186441e02edd3b0c37f8b3269 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Fri, 21 Aug 2020 07:42:19 +0200
Subject: [PATCH 41/58] Deal with very old linux kernels

MADV_HUGEPAGE might not be available, for kernels before 2.6.38 (released 2011). Just skip the madvise.

closes https://github.com/official-stockfish/Stockfish/pull/3039

No functional change
---
 src/misc.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/misc.cpp b/src/misc.cpp
index 459ea100..56a3dcad 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -367,7 +367,9 @@ void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
   size_t size = ((allocSize + alignment - 1) / alignment) * alignment; // multiple of alignment
   if (posix_memalign(&mem, alignment, size))
      mem = nullptr;
+#if defined(MADV_HUGEPAGE)
   madvise(mem, allocSize, MADV_HUGEPAGE);
+#endif
   return mem;
 }
 

From 15abcaedc1e32d4de913a2f7dea12578912371b7 Mon Sep 17 00:00:00 2001
From: gsobala <gsobala@gmail.com>
Date: Fri, 21 Aug 2020 11:28:53 +0100
Subject: [PATCH 42/58] Update Makefile for macOS

Changes to deal with compilation (particularly profile-build) on macOS.
(1) The default toolchain has gcc masquerading as clang,
    the previous Makefile was not picking up the required changes
    to the different profiling tools.
(2) The previous Makefile test for gccisclang occurred before
    a potential overwrite of CXX by COMPCXX
(3) llvm-profdata no longer runs as a command on macOS and
    instead is invoked by ``xcrun llvm-profdata``
(4) Needs to support use of true gcc using e.g.
    ``make build ... COMPCXX=g++-10``
(5) enable profile-build in travis for macOS

closes https://github.com/official-stockfish/Stockfish/pull/3043

No functional change
---
 .travis.yml  |  3 ++-
 AUTHORS      |  1 +
 src/Makefile | 16 ++++++++++++----
 3 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 12596f1e..a029c4fc 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -71,7 +71,8 @@ script:
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-sse2 build && ../tests/signature.sh $benchref; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 build && ../tests/signature.sh $benchref; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-32 build && ../tests/signature.sh $benchref; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" && "$COMP" == "gcc" ]]; then make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref; fi
+  # workaround: exclude a custom version of llvm+clang, which doesn't find llvm-profdata on ubuntu
+  - if [[ "$TRAVIS_OS_NAME" != "linux" || "$COMP" == "gcc" ]]; then make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref; fi
 
   # compile only for some more advanced architectures (might not run in travis)
   - make clean && make -j2 ARCH=x86-64-avx2 build
diff --git a/AUTHORS b/AUTHORS
index d8f4d30e..c96f870a 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -59,6 +59,7 @@ Fauzi Akram Dabat (FauziAkram)
 Felix Wittmann
 gamander
 Gary Heckman (gheckman)
+George Sobala (gsobala)
 gguliash
 Gian-Carlo Pascutto (gcp)
 Gontran Lemaire (gonlem)
diff --git a/src/Makefile b/src/Makefile
index 79c7333a..b969ba04 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -302,9 +302,6 @@ ifeq ($(COMP),gcc)
 	ifneq ($(KERNEL),Darwin)
 	   LDFLAGS += -Wl,--no-as-needed
 	endif
-
-	gccversion = $(shell $(CXX) --version)
-	gccisclang = $(findstring clang,$(gccversion))
 endif
 
 ifeq ($(COMP),mingw)
@@ -376,6 +373,7 @@ endif
 ifeq ($(KERNEL),Darwin)
 	CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.14
 	LDFLAGS += -arch $(arch) -mmacosx-version-min=10.14
+	XCRUN = xcrun
 endif
 
 # To cross-compile for Android, NDK version r21 or later is recommended.
@@ -407,6 +405,16 @@ ifdef COMPCXX
 	CXX=$(COMPCXX)
 endif
 
+### Sometimes gcc is really clang
+ifeq ($(COMP),gcc)
+	gccversion = $(shell $(CXX) --version)
+	gccisclang = $(findstring clang,$(gccversion))
+	ifneq ($(gccisclang),)
+		profile_make = clang-profile-make
+		profile_use = clang-profile-use
+	endif
+endif
+
 ### On mingw use Windows threads, otherwise POSIX
 ifneq ($(comp),mingw)
 	# On Android Bionic's C library comes with its own pthread implementation bundled in
@@ -798,7 +806,7 @@ clang-profile-make:
 	all
 
 clang-profile-use:
-	llvm-profdata merge -output=stockfish.profdata *.profraw
+	$(XCRUN) llvm-profdata merge -output=stockfish.profdata *.profraw
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
 	EXTRACXXFLAGS='-fprofile-instr-use=stockfish.profdata' \
 	EXTRALDFLAGS='-fprofile-use ' \

From e64b957274b94e89ad1a6e3ec4571c9082246a0a Mon Sep 17 00:00:00 2001
From: Unai Corzo <corzounai@gmail.com>
Date: Fri, 21 Aug 2020 09:24:25 +0200
Subject: [PATCH 43/58] Simplify away internal iterative deepening

Remove the iterative deepening step.
Instead, employ a depth reduction if the position is not in TT and on the PV.

STC https://tests.stockfishchess.org/tests/view/5f3ce6eaa95672ddd56c637e
LLR: 2.97 (-2.94,2.94) {-0.50,1.50}
Total: 41096 W: 4421 L: 4257 D: 32418
Ptnml(0-2): 207, 3259, 13460, 3407, 215

LTC (old) https://tests.stockfishchess.org/tests/view/5f3d7d4fa95672ddd56c640b
LLR: 2.92 (-2.94,2.94) {-1.50,0.50}
Total: 26032 W: 1320 L: 1309 D: 23403
Ptnml(0-2): 22, 1152, 10654, 1169, 19

LTC (new) https://tests.stockfishchess.org/tests/view/5f3e31e0a95672ddd56c6464
LLR: 2.95 (-2.94,2.94) {-0.75,0.25}
Total: 34160 W: 1844 L: 1766 D: 30550
Ptnml(0-2): 33, 1533, 13876, 1599, 39

bench: 3849173
---
 src/search.cpp | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index 7c839dfc..ba13680c 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -939,15 +939,11 @@ namespace {
             }
     }
 
-    // Step 11. Internal iterative deepening (~1 Elo)
-    if (depth >= 7 && !ttMove)
-    {
-        search<NT>(pos, ss, alpha, beta, depth - 7, cutNode);
-
-        tte = TT.probe(posKey, ttHit);
-        ttValue = ttHit ? value_from_tt(tte->value(), ss->ply, pos.rule50_count()) : VALUE_NONE;
-        ttMove = ttHit ? tte->move() : MOVE_NONE;
-    }
+    // Step 11. If the position is not in TT, decrease depth by 2
+    if (   PvNode
+        && depth >= 6
+        && !ttMove)
+        depth -= 2;
 
 moves_loop: // When in check, search starts from here
 

From cbcb05ca092160137c166f84e7e9da3d6bb4e2d3 Mon Sep 17 00:00:00 2001
From: MJZ1977 <37274752+MJZ1977@users.noreply.github.com>
Date: Fri, 21 Aug 2020 10:57:34 +0200
Subject: [PATCH 44/58] Display classic and NNUE evaluation in trace mode

show both the classical and NNUE evaluation,
as well as the Final evaluation.

closes https://github.com/official-stockfish/Stockfish/pull/3042

No functional change.
---
 src/evaluate.cpp | 65 ++++++++++++++++++++++++++----------------------
 1 file changed, 35 insertions(+), 30 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index c84d894f..c66938d6 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -972,42 +972,47 @@ std::string Eval::trace(const Position& pos) {
 
   Value v;
 
-  if (Eval::useNNUE)
-  {
-      v = NNUE::evaluate(pos);
-  }
-  else
-  {
-      std::memset(scores, 0, sizeof(scores));
+  std::memset(scores, 0, sizeof(scores));
 
-      pos.this_thread()->contempt = SCORE_ZERO; // Reset any dynamic contempt
+  pos.this_thread()->contempt = SCORE_ZERO; // Reset any dynamic contempt
 
-      v = Evaluation<TRACE>(pos).value();
+  v = Evaluation<TRACE>(pos).value();
 
-      ss << std::showpoint << std::noshowpos << std::fixed << std::setprecision(2)
-         << "     Term    |    White    |    Black    |    Total   \n"
-         << "             |   MG    EG  |   MG    EG  |   MG    EG \n"
-         << " ------------+-------------+-------------+------------\n"
-         << "    Material | " << Term(MATERIAL)
-         << "   Imbalance | " << Term(IMBALANCE)
-         << "       Pawns | " << Term(PAWN)
-         << "     Knights | " << Term(KNIGHT)
-         << "     Bishops | " << Term(BISHOP)
-         << "       Rooks | " << Term(ROOK)
-         << "      Queens | " << Term(QUEEN)
-         << "    Mobility | " << Term(MOBILITY)
-         << " King safety | " << Term(KING)
-         << "     Threats | " << Term(THREAT)
-         << "      Passed | " << Term(PASSED)
-         << "       Space | " << Term(SPACE)
-         << "    Winnable | " << Term(WINNABLE)
-         << " ------------+-------------+-------------+------------\n"
-         << "       Total | " << Term(TOTAL);
-  }
+  ss << std::showpoint << std::noshowpos << std::fixed << std::setprecision(2)
+     << "     Term    |    White    |    Black    |    Total   \n"
+     << "             |   MG    EG  |   MG    EG  |   MG    EG \n"
+     << " ------------+-------------+-------------+------------\n"
+     << "    Material | " << Term(MATERIAL)
+     << "   Imbalance | " << Term(IMBALANCE)
+     << "       Pawns | " << Term(PAWN)
+     << "     Knights | " << Term(KNIGHT)
+     << "     Bishops | " << Term(BISHOP)
+     << "       Rooks | " << Term(ROOK)
+     << "      Queens | " << Term(QUEEN)
+     << "    Mobility | " << Term(MOBILITY)
+     << " King safety | " << Term(KING)
+     << "     Threats | " << Term(THREAT)
+     << "      Passed | " << Term(PASSED)
+     << "       Space | " << Term(SPACE)
+     << "    Winnable | " << Term(WINNABLE)
+     << " ------------+-------------+-------------+------------\n"
+     << "       Total | " << Term(TOTAL);
 
   v = pos.side_to_move() == WHITE ? v : -v;
 
-  ss << "\nFinal evaluation: " << to_cp(v) << " (white side)\n";
+  ss << "\nClassical evaluation: " << to_cp(v) << " (white side)\n";
+
+  if (Eval::useNNUE)
+  {
+      v = NNUE::evaluate(pos);
+      v = pos.side_to_move() == WHITE ? v : -v;
+      ss << "\nNNUE evaluation:      " << to_cp(v) << " (white side)\n";
+  }
+
+  v = evaluate(pos);
+  v = pos.side_to_move() == WHITE ? v : -v;
+  ss << "\nFinal evaluation:     " << to_cp(v) << " (white side)\n";
+
 
   return ss.str();
 }

From 34f67c57223d73ad40d583ccc033c75eb0df2453 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Fri, 21 Aug 2020 22:10:55 +0200
Subject: [PATCH 45/58] Explicitly rely on pthreads if possible

allows us to set the needed stacksize on thread creation.

Useful for environments with too small a default stack size (e.g. Alpine Linux with musl).

Passed STC, no regression:

LLR: 2.96 (-2.94,2.94) {-1.25,0.25}
Total: 17816 W: 1344 L: 1275 D: 15197
Ptnml(0-2): 30, 1057, 6682, 1092, 47
https://tests.stockfishchess.org/tests/view/5f402b5587a5c3c63d8f534d

closes https://github.com/official-stockfish/Stockfish/pull/3047

fixes https://github.com/official-stockfish/Stockfish/issues/3041

No functional change.
---
 src/Makefile           | 1 +
 src/thread_win32_osx.h | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index b969ba04..74ef87b9 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -417,6 +417,7 @@ endif
 
 ### On mingw use Windows threads, otherwise POSIX
 ifneq ($(comp),mingw)
+	CXXFLAGS += -DUSE_PTHREADS
 	# On Android Bionic's C library comes with its own pthread implementation bundled in
 	ifneq ($(OS),Android)
 		# Haiku has pthreads in its libroot, so only link it in on other platforms
diff --git a/src/thread_win32_osx.h b/src/thread_win32_osx.h
index c4b55a48..75ef5d9a 100644
--- a/src/thread_win32_osx.h
+++ b/src/thread_win32_osx.h
@@ -27,7 +27,7 @@
 /// The implementation calls pthread_create() with the stack size parameter
 /// equal to the linux 8MB default, on platforms that support it.
 
-#if defined(__APPLE__) || defined(__MINGW32__) || defined(__MINGW64__)
+#if defined(__APPLE__) || defined(__MINGW32__) || defined(__MINGW64__) || defined(USE_PTHREADS)
 
 #include <pthread.h>
 

From 3542033342f15625f808013b69aa8c2d274a2f91 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20Nicolet?= <Stephane.Nicolet@u-paris2.fr>
Date: Sat, 22 Aug 2020 11:37:53 +0200
Subject: [PATCH 46/58] Instructions to build on older Macintosh

In recent Macs, it is possible to use the Clang compiler provided by Apple
to compile Stockfish out of the box, and this is the method used by default
in our Makefile (the Makefile sets the macosx-version-min=10.14 flag to select
the right libc++ library for the Clang compiler with recent c++17 support).

But it is quite possible to compile and run Stockfish on older Macs! Below
we describe a method to install a recent GNU compiler on these Macs, to get
the c++17 support. We have tested the following procedure to install gcc10 on
machines running Mac OS 10.7, Mac OS 10.9 and Mac OS 10.13:

1) install XCode for your machine.

2) install Apple command-line developer tools for XCode, by typing the following
   command in a Terminal:

```
      sudo xcode-select --install
```

3) go to the Stockfish "src" directory, then try a default build and run Stockfish:

```
      make clean
      make build
      make net
      ./stockfish
```

4) if step 3 worked, congrats! You have a compiler recent enough on your Mac
to compile Stockfish. If not, continue with step 5 to install GNU gcc10 :-)

5) install the MacPorts package manager (https://www.macports.org/install.php),
for instance using the fast method in the "macOS Package (.pkg) Installer"
section of the page.

6) use the "port" command to install the gcc10 package of MacPorts by typing the
following command:

```
    sudo port install gcc10
```

With this step, MacPorts will install the gcc10 compiler under the name "g++-mp-10"
in the /opt/local/bin directory:

```
   which g++-mp-10

   /opt/local/bin/g++-mp-10       <--- answer
```

7) You can now go back to the "src" directory of Stockfish, and try to build
Stockfish by pointing at the right compiler:

```
   make clean
   make build COMP=gcc COMPCXX=/opt/local/bin/g++-mp-10
   make net
   ./stockfish
```

8) Enjoy Stockfish on Macintosh!

See this pull request for further discussion:
https://github.com/official-stockfish/Stockfish/pull/3049

No functional change
---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 74ef87b9..b0274504 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -370,7 +370,7 @@ else
 endif
 endif
 
-ifeq ($(KERNEL),Darwin)
+ifeq ($(KERNEL),Darwin) 
 	CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.14
 	LDFLAGS += -arch $(arch) -mmacosx-version-min=10.14
 	XCRUN = xcrun

From 5f1843c9cb55afcd3fb1da9e9dc4b0092f25d9f0 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Sat, 11 Jul 2020 16:59:33 +0200
Subject: [PATCH 47/58] Small trivial cleanups

closes https://github.com/official-stockfish/Stockfish/pull/2801

No functional change
---
 README.md        | 53 +++++++++++++++++++++++++-----------------------
 src/Makefile     |  2 +-
 src/bitboard.cpp | 12 ++++++++++-
 src/bitboard.h   | 10 ---------
 src/evaluate.cpp | 17 ++++++++--------
 src/material.cpp |  2 +-
 src/misc.cpp     | 17 ++++++++--------
 src/misc.h       |  8 --------
 src/movegen.cpp  |  2 +-
 src/movepick.cpp | 14 ++++++-------
 src/movepick.h   |  8 ++++----
 src/pawns.cpp    |  2 +-
 src/position.cpp |  4 ++--
 src/position.h   |  5 +++++
 src/search.cpp   | 15 +++++++-------
 src/timeman.cpp  | 18 ++++++++--------
 src/uci.cpp      |  2 +-
 17 files changed, 96 insertions(+), 95 deletions(-)

diff --git a/README.md b/README.md
index 7b6ddf4c..2cc88bf4 100644
--- a/README.md
+++ b/README.md
@@ -4,17 +4,17 @@
 [![Build Status](https://ci.appveyor.com/api/projects/status/github/official-stockfish/Stockfish?branch=master&svg=true)](https://ci.appveyor.com/project/mcostalba/stockfish/branch/master)
 
 [Stockfish](https://stockfishchess.org) is a free, powerful UCI chess engine
-derived from Glaurung 2.1. It features two evaluation functions, the classical
-evaluation based on handcrafted terms, and the NNUE evaluation based on
-efficiently updateable neural networks. The classical evaluation runs efficiently
-on most 64bit CPU architectures, while the NNUE evaluation benefits strongly from the
-vector intrinsics available on modern CPUs (avx2 or similar).
+derived from Glaurung 2.1. Stockfish is not a complete chess program and requires a
+UCI-compatible graphical user interface (GUI) (e.g. XBoard with PolyGlot, Scid,
+Cute Chess, eboard, Arena, Sigma Chess, Shredder, Chess Partner or Fritz) in order
+to be used comfortably. Read the documentation for your GUI of choice for information
+about how to use Stockfish with it.
 
-Stockfish is not a complete chess program and requires a
-UCI-compatible GUI (e.g. XBoard with PolyGlot, Scid, Cute Chess, eboard, Arena,
-Sigma Chess, Shredder, Chess Partner or Fritz) in order to be used comfortably.
-Read the documentation for your GUI of choice for information about how to use
-Stockfish with it.
+The Stockfish engine features two evaluation functions for chess, the classical
+evaluation based on handcrafted terms, and the NNUE evaluation based on efficiently
+updateable neural networks. The classical evaluation runs efficiently on most 64bit
+CPU architectures, while the NNUE evaluation benefits strongly from the vector
+intrinsics available on modern CPUs (avx2 or similar).
 
 
 ## Files
@@ -28,10 +28,13 @@ This distribution of Stockfish consists of the following files:
   * src, a subdirectory containing the full source code, including a Makefile
     that can be used to compile Stockfish on Unix-like systems.
 
-To use the NNUE evaluation an additional data file with neural network parameters
-needs to be downloaded. The filename for the default set can be found as the default
-value of the `EvalFile` UCI option, with the format
-`nn-[SHA256 first 12 digits].nnue` (e.g. nn-c157e0a5755b.nnue). This file can be downloaded from
+  * a file with the .nnue extension, storing the neural network for the NNUE 
+    evaluation.
+
+Note: to use the NNUE evaluation, the additional data file with neural network parameters
+needs to be downloaded. The filename for the default net can be found as the default
+value of the `EvalFile` UCI option, with the format `nn-[SHA256 first 12 digits].nnue`
+(for instance, `nn-c157e0a5755b.nnue`). This file can be downloaded from
 ```
 https://tests.stockfishchess.org/api/nn/[filename]
 ```
@@ -64,14 +67,6 @@ Currently, Stockfish has the following UCI options:
     The name of the file of the NNUE evaluation parameters. Depending on the GUI the
     filename should include the full path to the folder/directory that contains the file.
 
-  * #### Contempt
-    A positive value for contempt favors middle game positions and avoids draws,
-    effective for the classical evaluation only.
-
-  * #### Analysis Contempt
-    By default, contempt is set to prefer the side to move. Set this option to "White"
-    or "Black" to analyse with contempt for that side, or "Off" to disable contempt.
-
   * #### UCI_AnalyseMode
     An option handled by your GUI.
 
@@ -120,6 +115,14 @@ Currently, Stockfish has the following UCI options:
     Limit Syzygy tablebase probing to positions with at most this many pieces left
     (including kings and pawns).
 
+  * #### Contempt
+    A positive value for contempt favors middle game positions and avoids draws,
+    effective for the classical evaluation only.
+
+  * #### Analysis Contempt
+    By default, contempt is set to prefer the side to move. Set this option to "White"
+    or "Black" to analyse with contempt for that side, or "Off" to disable contempt.
+
   * #### Move Overhead
     Assume a time delay of x ms due to network and GUI overheads. This is useful to
     avoid losses on time in those cases.
@@ -138,7 +141,7 @@ Currently, Stockfish has the following UCI options:
   * #### Debug Log File
     Write all communication to and from the engine into a text file.
 
-## Classical and NNUE evaluation
+## A note on classical and NNUE evaluation
 
 Both approaches assign a value to a position that is used in alpha-beta (PVS) search
 to find the best move. The classical evaluation computes this value as a function
@@ -226,6 +229,7 @@ targets with corresponding descriptions.
     cd src
     make help
     make build ARCH=x86-64-modern
+    make net
 ```
 
 When not using the Makefile to compile (for instance with Microsoft MSVC) you
@@ -237,8 +241,7 @@ compiler you used to create your executable. These informations can
 be found by typing the following commands in a console:
 
 ```
-    ./stockfish
-    compiler
+    ./stockfish compiler
 ```
 
 ## Understanding the code base and participating in the project
diff --git a/src/Makefile b/src/Makefile
index b0274504..74ef87b9 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -370,7 +370,7 @@ else
 endif
 endif
 
-ifeq ($(KERNEL),Darwin) 
+ifeq ($(KERNEL),Darwin)
 	CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.14
 	LDFLAGS += -arch $(arch) -mmacosx-version-min=10.14
 	XCRUN = xcrun
diff --git a/src/bitboard.cpp b/src/bitboard.cpp
index f531010c..80206b58 100644
--- a/src/bitboard.cpp
+++ b/src/bitboard.cpp
@@ -39,6 +39,16 @@ namespace {
   Bitboard BishopTable[0x1480]; // To store bishop attacks
 
   void init_magics(PieceType pt, Bitboard table[], Magic magics[]);
+
+}
+
+
+/// safe_destination() returns the bitboard of target square for the given step
+/// from the given square. If the step is off the board, returns empty bitboard.
+
+inline Bitboard safe_destination(Square s, int step) {
+    Square to = Square(s + step);
+    return is_ok(to) && distance(s, to) <= 2 ? square_bb(to) : Bitboard(0);
 }
 
 
@@ -110,7 +120,7 @@ namespace {
     Direction   RookDirections[4] = {NORTH, SOUTH, EAST, WEST};
     Direction BishopDirections[4] = {NORTH_EAST, SOUTH_EAST, SOUTH_WEST, NORTH_WEST};
 
-    for(Direction d : (pt == ROOK ? RookDirections : BishopDirections))
+    for (Direction d : (pt == ROOK ? RookDirections : BishopDirections))
     {
         Square s = sq;
         while(safe_destination(s, d) && !(occupied & s))
diff --git a/src/bitboard.h b/src/bitboard.h
index a899d879..29d8f66d 100644
--- a/src/bitboard.h
+++ b/src/bitboard.h
@@ -279,16 +279,6 @@ inline int edge_distance(File f) { return std::min(f, File(FILE_H - f)); }
 inline int edge_distance(Rank r) { return std::min(r, Rank(RANK_8 - r)); }
 
 
-/// safe_destination() returns the bitboard of target square for the given step
-/// from the given square. If the step is off the board, returns empty bitboard.
-
-inline Bitboard safe_destination(Square s, int step)
-{
-    Square to = Square(s + step);
-    return is_ok(to) && distance(s, to) <= 2 ? square_bb(to) : Bitboard(0);
-}
-
-
 /// attacks_bb(Square) returns the pseudo attacks of the give piece type
 /// assuming an empty board.
 
diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index c66938d6..ce92db9a 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -288,8 +288,8 @@ namespace {
     attackedBy2[Us] = dblAttackByPawn | (attackedBy[Us][KING] & attackedBy[Us][PAWN]);
 
     // Init our king safety tables
-    Square s = make_square(Utility::clamp(file_of(ksq), FILE_B, FILE_G),
-                           Utility::clamp(rank_of(ksq), RANK_2, RANK_7));
+    Square s = make_square(std::clamp(file_of(ksq), FILE_B, FILE_G),
+                           std::clamp(rank_of(ksq), RANK_2, RANK_7));
     kingRing[Us] = attacks_bb<KING>(s) | s;
 
     kingAttackersCount[Them] = popcount(kingRing[Us] & pe->pawn_attacks(Them));
@@ -686,8 +686,8 @@ namespace {
             Square blockSq = s + Up;
 
             // Adjust bonus based on the king's proximity
-            bonus += make_score(0, (  (king_proximity(Them, blockSq) * 19) / 4
-                                     - king_proximity(Us,   blockSq) *  2) * w);
+            bonus += make_score(0, (  king_proximity(Them, blockSq) * 19 / 4
+                                    - king_proximity(Us,   blockSq) *  2) * w);
 
             // If blockSq is not the queening square then consider also a second push
             if (r != RANK_7)
@@ -731,7 +731,7 @@ namespace {
 
 
   // Evaluation::space() computes a space evaluation for a given side, aiming to improve game
-  // play in the opening. It is based on the number of safe squares on the 4 central files
+  // play in the opening. It is based on the number of safe squares on the four central files
   // on ranks 2 to 4. Completely safe squares behind a friendly pawn are counted twice.
   // Finally, the space bonus is multiplied by a weight which decreases according to occupancy.
 
@@ -804,7 +804,7 @@ namespace {
     // Now apply the bonus: note that we find the attacking side by extracting the
     // sign of the midgame or endgame values, and that we carefully cap the bonus
     // so that the midgame and endgame scores do not change sign after the bonus.
-    int u = ((mg > 0) - (mg < 0)) * Utility::clamp(complexity + 50, -abs(mg), 0);
+    int u = ((mg > 0) - (mg < 0)) * std::clamp(complexity + 50, -abs(mg), 0);
     int v = ((eg > 0) - (eg < 0)) * std::max(complexity, -abs(eg));
 
     mg += u;
@@ -951,8 +951,8 @@ Value Eval::evaluate(const Position& pos) {
   // Damp down the evaluation linearly when shuffling
   v = v * (100 - pos.rule50_count()) / 100;
 
-  // Guarantee evalution outside of TB range
-  v = Utility::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1);
+  // Guarantee evaluation does not hit the tablebase range
+  v = std::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1);
 
   return v;
 }
@@ -1013,6 +1013,5 @@ std::string Eval::trace(const Position& pos) {
   v = pos.side_to_move() == WHITE ? v : -v;
   ss << "\nFinal evaluation:     " << to_cp(v) << " (white side)\n";
 
-
   return ss.str();
 }
diff --git a/src/material.cpp b/src/material.cpp
index 0ef9926f..870a5e11 100644
--- a/src/material.cpp
+++ b/src/material.cpp
@@ -130,7 +130,7 @@ Entry* probe(const Position& pos) {
 
   Value npm_w = pos.non_pawn_material(WHITE);
   Value npm_b = pos.non_pawn_material(BLACK);
-  Value npm   = Utility::clamp(npm_w + npm_b, EndgameLimit, MidgameLimit);
+  Value npm   = std::clamp(npm_w + npm_b, EndgameLimit, MidgameLimit);
 
   // Map total non-pawn material into [PHASE_ENDGAME, PHASE_MIDGAME]
   e->gamePhase = Phase(((npm - EndgameLimit) * PHASE_MIDGAME) / (MidgameLimit - EndgameLimit));
diff --git a/src/misc.cpp b/src/misc.cpp
index 56a3dcad..80c436ac 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -328,16 +328,16 @@ void prefetch(void* addr) {
 
 #endif
 
-/// Wrappers for systems where the c++17 implementation doesn't guarantee the availability of aligned_alloc.
-/// Memory allocated with std_aligned_alloc must be freed with std_aligned_free.
-///
+
+/// std_aligned_alloc() is our wrapper for systems where the c++17 implementation
+/// does not guarantee the availability of aligned_alloc(). Memory allocated with
+/// std_aligned_alloc() must be freed with std_aligned_free().
 
 void* std_aligned_alloc(size_t alignment, size_t size) {
+
 #if defined(POSIXALIGNEDALLOC)
-  void *pointer;
-  if(posix_memalign(&pointer, alignment, size) == 0)
-      return pointer;
-  return nullptr;
+  void *mem;
+  return posix_memalign(&mem, alignment, size) ? nullptr : mem;
 #elif defined(_WIN32)
   return _mm_malloc(size, alignment);
 #else
@@ -346,6 +346,7 @@ void* std_aligned_alloc(size_t alignment, size_t size) {
 }
 
 void std_aligned_free(void* ptr) {
+
 #if defined(POSIXALIGNEDALLOC)
   free(ptr);
 #elif defined(_WIN32)
@@ -355,7 +356,7 @@ void std_aligned_free(void* ptr) {
 #endif
 }
 
-/// aligned_ttmem_alloc() will return suitably aligned memory, and if possible use large pages.
+/// aligned_ttmem_alloc() will return suitably aligned memory, if possible using large pages.
 /// The returned pointer is the aligned one, while the mem argument is the one that needs
 /// to be passed to free. With c++17 some of this functionality could be simplified.
 
diff --git a/src/misc.h b/src/misc.h
index eb4e05c0..8ad17b50 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -65,14 +65,6 @@ std::ostream& operator<<(std::ostream&, SyncCout);
 #define sync_cout std::cout << IO_LOCK
 #define sync_endl std::endl << IO_UNLOCK
 
-namespace Utility {
-
-/// Clamp a value between lo and hi. Available in c++17.
-template<class T> constexpr const T& clamp(const T& v, const T& lo, const T& hi) {
-  return v < lo ? lo : v > hi ? hi : v;
-}
-
-}
 
 /// xorshift64star Pseudo-Random Number Generator
 /// This class is based on original code written and dedicated
diff --git a/src/movegen.cpp b/src/movegen.cpp
index d74df4c3..3340f65c 100644
--- a/src/movegen.cpp
+++ b/src/movegen.cpp
@@ -248,7 +248,7 @@ namespace {
             *moveList++ = make_move(ksq, pop_lsb(&b));
 
         if ((Type != CAPTURES) && pos.can_castle(Us & ANY_CASTLING))
-            for(CastlingRights cr : { Us & KING_SIDE, Us & QUEEN_SIDE } )
+            for (CastlingRights cr : { Us & KING_SIDE, Us & QUEEN_SIDE } )
                 if (!pos.castling_impeded(cr) && pos.can_castle(cr))
                     *moveList++ = make<CASTLING>(ksq, pos.castling_rook_square(cr));
     }
diff --git a/src/movepick.cpp b/src/movepick.cpp
index 96a44449..153d323e 100644
--- a/src/movepick.cpp
+++ b/src/movepick.cpp
@@ -182,7 +182,7 @@ top:
           --endMoves;
 
       ++stage;
-      /* fallthrough */
+      [[fallthrough]];
 
   case REFUTATION:
       if (select<Next>([&](){ return    *cur != MOVE_NONE
@@ -190,7 +190,7 @@ top:
                                     &&  pos.pseudo_legal(*cur); }))
           return *(cur - 1);
       ++stage;
-      /* fallthrough */
+      [[fallthrough]];
 
   case QUIET_INIT:
       if (!skipQuiets)
@@ -203,7 +203,7 @@ top:
       }
 
       ++stage;
-      /* fallthrough */
+      [[fallthrough]];
 
   case QUIET:
       if (   !skipQuiets
@@ -217,7 +217,7 @@ top:
       endMoves = endBadCaptures;
 
       ++stage;
-      /* fallthrough */
+      [[fallthrough]];
 
   case BAD_CAPTURE:
       return select<Next>([](){ return true; });
@@ -228,7 +228,7 @@ top:
 
       score<EVASIONS>();
       ++stage;
-      /* fallthrough */
+      [[fallthrough]];
 
   case EVASION:
       return select<Best>([](){ return true; });
@@ -246,14 +246,14 @@ top:
           return MOVE_NONE;
 
       ++stage;
-      /* fallthrough */
+      [[fallthrough]];
 
   case QCHECK_INIT:
       cur = moves;
       endMoves = generate<QUIET_CHECKS>(pos, cur);
 
       ++stage;
-      /* fallthrough */
+      [[fallthrough]];
 
   case QCHECK:
       return select<Next>([](){ return true; });
diff --git a/src/movepick.h b/src/movepick.h
index f080935a..97ea5bec 100644
--- a/src/movepick.h
+++ b/src/movepick.h
@@ -86,14 +86,14 @@ enum StatsType { NoCaptures, Captures };
 /// the move's from and to squares, see www.chessprogramming.org/Butterfly_Boards
 typedef Stats<int16_t, 10692, COLOR_NB, int(SQUARE_NB) * int(SQUARE_NB)> ButterflyHistory;
 
-/// At higher depths LowPlyHistory records successful quiet moves near the root and quiet
-/// moves which are/were in the PV (ttPv)
-/// It is cleared with each new search and filled during iterative deepening
+/// At higher depths LowPlyHistory records successful quiet moves near the root
+/// and quiet moves which are/were in the PV (ttPv). It is cleared with each new
+/// search and filled during iterative deepening.
 constexpr int MAX_LPH = 4;
 typedef Stats<int16_t, 10692, MAX_LPH, int(SQUARE_NB) * int(SQUARE_NB)> LowPlyHistory;
 
 /// CounterMoveHistory stores counter moves indexed by [piece][to] of the previous
-/// move, see www.chessprogramming.org/Countermove_Heuristic
+/// move, see www.chessprogramming.org/Countermove_Heuristic 
 typedef Stats<Move, NOT_USED, PIECE_NB, SQUARE_NB> CounterMoveHistory;
 
 /// CapturePieceToHistory is addressed by a move's [piece][to][captured piece type]
diff --git a/src/pawns.cpp b/src/pawns.cpp
index 868d0c8e..af0f6618 100644
--- a/src/pawns.cpp
+++ b/src/pawns.cpp
@@ -219,7 +219,7 @@ Score Entry::evaluate_shelter(const Position& pos, Square ksq) const {
 
   Score bonus = make_score(5, 5);
 
-  File center = Utility::clamp(file_of(ksq), FILE_B, FILE_G);
+  File center = std::clamp(file_of(ksq), FILE_B, FILE_G);
   for (File f = File(center - 1); f <= File(center + 1); ++f)
   {
       b = ourPawns & file_bb(f);
diff --git a/src/position.cpp b/src/position.cpp
index 46e5d78b..02898547 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -1145,8 +1145,8 @@ bool Position::see_ge(Move m, Value threshold) const {
 
       // Don't allow pinned pieces to attack (except the king) as long as
       // there are pinners on their original square.
-      if (st->pinners[~stm] & occupied)
-          stmAttackers &= ~st->blockersForKing[stm];
+      if (pinners(~stm) & occupied)
+          stmAttackers &= ~blockers_for_king(stm);
 
       if (!stmAttackers)
           break;
diff --git a/src/position.h b/src/position.h
index a77050eb..5ce17277 100644
--- a/src/position.h
+++ b/src/position.h
@@ -113,6 +113,7 @@ public:
   Bitboard checkers() const;
   Bitboard blockers_for_king(Color c) const;
   Bitboard check_squares(PieceType pt) const;
+  Bitboard pinners(Color c) const;
   bool is_discovery_check_on_king(Color c, Move m) const;
 
   // Attacks to/from a given square
@@ -309,6 +310,10 @@ inline Bitboard Position::blockers_for_king(Color c) const {
   return st->blockersForKing[c];
 }
 
+inline Bitboard Position::pinners(Color c) const {
+  return st->pinners[c];
+}
+
 inline Bitboard Position::check_squares(PieceType pt) const {
   return st->checkSquares[pt];
 }
diff --git a/src/search.cpp b/src/search.cpp
index ba13680c..82d8bb9d 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -335,7 +335,7 @@ void Thread::search() {
   // for match (TC 60+0.6) results spanning a wide range of k values.
   PRNG rng(now());
   double floatLevel = Options["UCI_LimitStrength"] ?
-                      Utility::clamp(std::pow((Options["UCI_Elo"] - 1346.6) / 143.4, 1 / 0.806), 0.0, 20.0) :
+                      std::clamp(std::pow((Options["UCI_Elo"] - 1346.6) / 143.4, 1 / 0.806), 0.0, 20.0) :
                         double(Options["Skill Level"]);
   int intLevel = int(floatLevel) +
                  ((floatLevel - int(floatLevel)) * 1024 > rng.rand<unsigned>() % 1024  ? 1 : 0);
@@ -508,7 +508,7 @@ void Thread::search() {
       {
           double fallingEval = (318 + 6 * (mainThread->bestPreviousScore - bestValue)
                                     + 6 * (mainThread->iterValue[iterIdx] - bestValue)) / 825.0;
-          fallingEval = Utility::clamp(fallingEval, 0.5, 1.5);
+          fallingEval = std::clamp(fallingEval, 0.5, 1.5);
 
           // If the bestMove is stable over several iterations, reduce time accordingly
           timeReduction = lastBestMoveDepth + 9 < completedDepth ? 1.92 : 0.95;
@@ -807,8 +807,9 @@ namespace {
         &&  eval <= alpha - RazorMargin)
         return qsearch<NT>(pos, ss, alpha, beta);
 
-    improving =  (ss-2)->staticEval == VALUE_NONE ? (ss->staticEval > (ss-4)->staticEval
-              || (ss-4)->staticEval == VALUE_NONE) : ss->staticEval > (ss-2)->staticEval;
+    improving =  (ss-2)->staticEval == VALUE_NONE
+               ? ss->staticEval > (ss-4)->staticEval || (ss-4)->staticEval == VALUE_NONE
+               : ss->staticEval > (ss-2)->staticEval;
 
     // Step 8. Futility pruning: child node (~50 Elo)
     if (   !PvNode
@@ -879,8 +880,8 @@ namespace {
         // there and in further interactions with transposition table cutoff depth is set to depth - 3
         // because probCut search has depth set to depth - 4 but we also do a move before it
         // so effective depth is equal to depth - 3
-        && !(   ttHit 
-             && tte->depth() >= depth - 3 
+        && !(   ttHit
+             && tte->depth() >= depth - 3
              && ttValue != VALUE_NONE
              && ttValue < probCutBeta))
     {
@@ -1238,7 +1239,7 @@ moves_loop: // When in check, search starts from here
                 r++;
           }
 
-          Depth d = Utility::clamp(newDepth - r, 1, newDepth);
+          Depth d = std::clamp(newDepth - r, 1, newDepth);
 
           value = -search<NonPV>(pos, ss+1, -(alpha+1), -alpha, d, true);
 
diff --git a/src/timeman.cpp b/src/timeman.cpp
index df4ba9b2..6d9c95ef 100644
--- a/src/timeman.cpp
+++ b/src/timeman.cpp
@@ -38,9 +38,9 @@ void TimeManagement::init(Search::LimitsType& limits, Color us, int ply) {
   TimePoint slowMover       = TimePoint(Options["Slow Mover"]);
   TimePoint npmsec          = TimePoint(Options["nodestime"]);
 
-  // opt_scale is a percentage of available time to use for the current move.
-  // max_scale is a multiplier applied to optimumTime.
-  double opt_scale, max_scale;
+  // optScale is a percentage of available time to use for the current move.
+  // maxScale is a multiplier applied to optimumTime.
+  double optScale, maxScale;
 
   // If we have to play in 'nodes as time' mode, then convert from time
   // to nodes, and use resulting values in time management formulas.
@@ -75,22 +75,22 @@ void TimeManagement::init(Search::LimitsType& limits, Color us, int ply) {
   // game time for the current move, so also cap to 20% of available game time.
   if (limits.movestogo == 0)
   {
-      opt_scale = std::min(0.008 + std::pow(ply + 3.0, 0.5) / 250.0,
+      optScale = std::min(0.008 + std::pow(ply + 3.0, 0.5) / 250.0,
                            0.2 * limits.time[us] / double(timeLeft));
-      max_scale = std::min(7.0, 4.0 + ply / 12.0);
+      maxScale = std::min(7.0, 4.0 + ply / 12.0);
   }
 
   // x moves in y seconds (+ z increment)
   else
   {
-      opt_scale = std::min((0.8 + ply / 128.0) / mtg,
+      optScale = std::min((0.8 + ply / 128.0) / mtg,
                             0.8 * limits.time[us] / double(timeLeft));
-      max_scale = std::min(6.3, 1.5 + 0.11 * mtg);
+      maxScale = std::min(6.3, 1.5 + 0.11 * mtg);
   }
 
   // Never use more than 80% of the available time for this move
-  optimumTime = TimePoint(opt_scale * timeLeft);
-  maximumTime = TimePoint(std::min(0.8 * limits.time[us] - moveOverhead, max_scale * optimumTime));
+  optimumTime = TimePoint(optScale * timeLeft);
+  maximumTime = TimePoint(std::min(0.8 * limits.time[us] - moveOverhead, maxScale * optimumTime));
 
   if (Options["Ponder"])
       optimumTime += optimumTime / 4;
diff --git a/src/uci.cpp b/src/uci.cpp
index d6486320..bc0ee0a0 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -211,7 +211,7 @@ namespace {
      double b = (((bs[0] * m + bs[1]) * m + bs[2]) * m) + bs[3];
 
      // Transform eval to centipawns with limited range
-     double x = Utility::clamp(double(100 * v) / PawnValueEg, -1000.0, 1000.0);
+     double x = std::clamp(double(100 * v) / PawnValueEg, -1000.0, 1000.0);
 
      // Return win rate in per mille (rounded to nearest)
      return int(0.5 + 1000 / (1 + std::exp((a - x) / b)));

From cc9d503ddea998890112efd08fae3705f2727e37 Mon Sep 17 00:00:00 2001
From: syzygy1 <3028851+syzygy1@users.noreply.github.com>
Date: Sat, 22 Aug 2020 13:36:34 +0200
Subject: [PATCH 48/58] Skip the alignment bug workaround for Clang

Clang-10.0.0 poses as gcc-4.2:

$ clang++ -E -dM - </dev/null | grep GNUC

This means that Clang is using the workaround for the alignment bug of gcc-8
even though it does not have the bug (as far as I know).

This patch should speed up AVX2 and AVX512 compiles on Windows (when using Clang),
because it disables (for Clang) the gcc workaround we had introduced in this commit:
https://github.com/official-stockfish/Stockfish/commit/875183b310a8249922c2155e82cb4cecfae2097e

closes https://github.com/official-stockfish/Stockfish/pull/3050

No functional change.
---
 src/nnue/nnue_common.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index 4c93e3d1..a9d8e5af 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -47,7 +47,7 @@
 //       compiled with older g++ crashes because the output memory is not aligned
 //       even though alignas is specified.
 #if defined(USE_AVX2)
-#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32)
+#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32) && !defined(__clang__)
 #define _mm256_loadA_si256  _mm256_loadu_si256
 #define _mm256_storeA_si256 _mm256_storeu_si256
 #else
@@ -57,7 +57,7 @@
 #endif
 
 #if defined(USE_AVX512)
-#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32)
+#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32) && !defined(__clang__)
 #define _mm512_loadA_si512   _mm512_loadu_si512
 #define _mm512_storeA_si512  _mm512_storeu_si512
 #else

From d5f86b63597fdb25b8c670e42ca468c99ca95a74 Mon Sep 17 00:00:00 2001
From: Vizvezdenec <Vizvezdenec@gmail.com>
Date: Sun, 23 Aug 2020 14:22:32 +0300
Subject: [PATCH 49/58] Introduce movecount pruning for qsearch()

If in quiescence search, we assume that me can prune late moves when:

a) the move ordering count of the move is : moveCount > abs(depth) + 2
b) we are not in check
c) the late move does not give check
d) the late move is not an advanced pawn push

Modification of an original idea by @VoyagerOne.

STC
https://tests.stockfishchess.org/tests/view/5f40581787a5c3c63d8f535f
LLR: 2.96 (-2.94,2.94) {-0.25,1.25}
Total: 132848 W: 14999 L: 14661 D: 103188
Ptnml(0-2): 684, 11242, 42309, 11430, 759

LTC
https://tests.stockfishchess.org/tests/view/5f4226da87a5c3c63d8f5412
LLR: 2.98 (-2.94,2.94) {0.25,1.25}
Total: 12008 W: 678 L: 551 D: 10779
Ptnml(0-2): 8, 485, 4899, 596, 16

closes https://github.com/official-stockfish/Stockfish/pull/3053

Bench: 3749974
---
 src/movepick.h | 2 +-
 src/search.cpp | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/movepick.h b/src/movepick.h
index 97ea5bec..4c0ad551 100644
--- a/src/movepick.h
+++ b/src/movepick.h
@@ -93,7 +93,7 @@ constexpr int MAX_LPH = 4;
 typedef Stats<int16_t, 10692, MAX_LPH, int(SQUARE_NB) * int(SQUARE_NB)> LowPlyHistory;
 
 /// CounterMoveHistory stores counter moves indexed by [piece][to] of the previous
-/// move, see www.chessprogramming.org/Countermove_Heuristic 
+/// move, see www.chessprogramming.org/Countermove_Heuristic
 typedef Stats<Move, NOT_USED, PIECE_NB, SQUARE_NB> CounterMoveHistory;
 
 /// CapturePieceToHistory is addressed by a move's [piece][to][captured piece type]
diff --git a/src/search.cpp b/src/search.cpp
index 82d8bb9d..266e2db3 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1531,6 +1531,10 @@ moves_loop: // When in check, search starts from here
       {
           assert(type_of(move) != ENPASSANT); // Due to !pos.advanced_pawn_push
 
+          // moveCount pruning
+          if (moveCount > abs(depth) + 2)
+              continue;
+
           futilityValue = futilityBase + PieceValue[EG][pos.piece_on(to_sq(move))];
 
           if (futilityValue <= alpha)
@@ -1547,7 +1551,7 @@ moves_loop: // When in check, search starts from here
       }
 
       // Do not search moves with negative SEE values
-      if (  !ss->inCheck && !pos.see_ge(move))
+      if (!ss->inCheck && !pos.see_ge(move))
           continue;
 
       // Speculative prefetch as early as possible

From e453f09f06f41680ef96f594f593f8de33e62b8f Mon Sep 17 00:00:00 2001
From: George Sobala <gsobala@gmail.com>
Date: Mon, 24 Aug 2020 06:37:42 +0100
Subject: [PATCH 50/58] armv8 AArch64 does not require -mfpu=neon

-mpfu is not required on AArch64 / armv8 architecture on Linux and throws an error if present.
This PR has been tested on gcc and clang on Gentoo-64 and Raspian-64 on a Raspberry Pi 4,
as well as with a cross from Ubuntu
(`make clean && make -j build ARCH=armv8         COMP=gcc COMPILER=aarch64-linux-gnu-g++`)

fixes https://github.com/official-stockfish/Stockfish/issues/3056
closes https://github.com/official-stockfish/Stockfish/pull/3059

No functional change
---
 src/Makefile | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 74ef87b9..3e1b7c35 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -241,7 +241,7 @@ ifeq ($(ARCH),armv7-neon)
 endif
 
 ifeq ($(ARCH),armv8)
-	arch = armv8-a
+	arch = armv8
 	prefetch = yes
 	popcnt = yes
 	neon = yes
@@ -285,7 +285,7 @@ ifeq ($(COMP),gcc)
 	CXX=g++
 	CXXFLAGS += -pedantic -Wextra -Wshadow
 
-	ifeq ($(arch),$(filter $(arch),armv7 armv8-a))
+	ifeq ($(arch),$(filter $(arch),armv7 armv8))
 		ifeq ($(OS),Android)
 			CXXFLAGS += -m$(bits)
 			LDFLAGS += -m$(bits)
@@ -387,7 +387,7 @@ ifeq ($(COMP),ndk)
 		CXXFLAGS += -mthumb -march=armv7-a -mfloat-abi=softfp -mfpu=neon
 		STRIP=arm-linux-androideabi-strip
 	endif
-	ifeq ($(arch),armv8-a)
+	ifeq ($(arch),armv8)
 		comp=aarch64-linux-android21-clang
 		CXX=aarch64-linux-android21-clang++
 		STRIP=aarch64-linux-android-strip
@@ -476,7 +476,7 @@ endif
 
 ### 3.6 popcnt
 ifeq ($(popcnt),yes)
-	ifeq ($(arch),$(filter $(arch),ppc64 armv7 armv8-a arm64))
+	ifeq ($(arch),$(filter $(arch),ppc64 armv7 armv8 arm64))
 		CXXFLAGS += -DUSE_POPCNT
 	else ifeq ($(comp),icc)
 		CXXFLAGS += -msse3 -DUSE_POPCNT
@@ -539,9 +539,11 @@ ifeq ($(neon),yes)
 	CXXFLAGS += -DUSE_NEON
 	ifeq ($(KERNEL),Linux)
 	ifneq ($(COMP),ndk)
+	ifneq ($(arch),armv8)
 		CXXFLAGS += -mfpu=neon
 	endif
 	endif
+	endif
 endif
 
 ### 3.7 pext
@@ -780,7 +782,7 @@ config-sanity:
 	@test "$(optimize)" = "yes" || test "$(optimize)" = "no"
 	@test "$(arch)" = "any" || test "$(arch)" = "x86_64" || test "$(arch)" = "i386" || \
 	 test "$(arch)" = "ppc64" || test "$(arch)" = "ppc" || \
-	 test "$(arch)" = "armv7" || test "$(arch)" = "armv8-a" || test "$(arch)" = "arm64"
+	 test "$(arch)" = "armv7" || test "$(arch)" = "armv8" || test "$(arch)" = "arm64"
 	@test "$(bits)" = "32" || test "$(bits)" = "64"
 	@test "$(prefetch)" = "yes" || test "$(prefetch)" = "no"
 	@test "$(popcnt)" = "yes" || test "$(popcnt)" = "no"

From 701b2427bd84d112376ce858b66befc5b66c4bb2 Mon Sep 17 00:00:00 2001
From: mstembera <MissingEmail@email>
Date: Thu, 20 Aug 2020 16:59:27 -0700
Subject: [PATCH 51/58] Support VNNI on 256bit vectors

due to downclocking on current chips (tested up to cascade lake)
supporting avx512 and vnni512, it is better to use avx2 or vnni256
in multithreaded (in particular hyperthreaded) engine use.
In single threaded use, the picture is different.

gcc compilation for vnni256 requires a toolchain for gcc >= 9.

closes https://github.com/official-stockfish/Stockfish/pull/3038

No functional change
---
 .travis.yml                        |  6 +++--
 src/Makefile                       | 39 ++++++++++++++++++++++++------
 src/nnue/layers/affine_transform.h |  8 +++++-
 3 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index a029c4fc..c1e6d6df 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -77,8 +77,10 @@ script:
   # compile only for some more advanced architectures (might not run in travis)
   - make clean && make -j2 ARCH=x86-64-avx2 build
   - make clean && make -j2 ARCH=x86-64-bmi2 build
-  # needs gcc 10 to compile
-  - if [[ "$COMPILER" != "g++-8" ]]; then make clean && make -j2 ARCH=x86-64-avx512 build; fi
+  - make clean && make -j2 ARCH=x86-64-avx512 build
+  - make clean && make -j2 ARCH=x86-64-vnni512 build
+  # requires gcc 9 or higher
+  - if [[ "$COMPILER" != "g++-8" ]]; make clean && make -j2 ARCH=x86-64-vnni256 build; fi
 
   #
   # Check perft and reproducible search
diff --git a/src/Makefile b/src/Makefile
index 3e1b7c35..228ea851 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -75,7 +75,8 @@ endif
 # sse41 = yes/no      --- -msse4.1         --- Use Intel Streaming SIMD Extensions 4.1
 # avx2 = yes/no       --- -mavx2           --- Use Intel Advanced Vector Extensions 2
 # avx512 = yes/no     --- -mavx512bw       --- Use Intel Advanced Vector Extensions 512
-# vnni = yes/no       --- -mavx512vnni     --- Use Intel Vector Neural Network Instructions 512
+# vnni256 = yes/no    --- -mavx512vnni     --- Use Intel Vector Neural Network Instructions 256
+# vnni512 = yes/no    --- -mavx512vnni     --- Use Intel Vector Neural Network Instructions 512
 # neon = yes/no       --- -DUSE_NEON       --- Use ARM SIMD architecture
 #
 # Note that Makefile is space sensitive, so when adding new architectures
@@ -102,7 +103,8 @@ ssse3 = no
 sse41 = no
 avx2 = no
 avx512 = no
-vnni = no
+vnni256 = no
+vnni512 = no
 neon = no
 ARCH = x86-64-modern
 STRIP = strip
@@ -192,7 +194,18 @@ ifeq ($(findstring -avx512,$(ARCH)),-avx512)
 	avx512 = yes
 endif
 
-ifeq ($(findstring -vnni,$(ARCH)),-vnni)
+ifeq ($(findstring -vnni256,$(ARCH)),-vnni256)
+	popcnt = yes
+	sse = yes
+	sse2 = yes
+	ssse3 = yes
+	sse41 = yes
+	avx2 = yes
+	pext = yes
+	vnni256 = yes
+endif
+
+ifeq ($(findstring -vnni512,$(ARCH)),-vnni512)
 	popcnt = yes
 	sse = yes
 	sse2 = yes
@@ -201,7 +214,7 @@ ifeq ($(findstring -vnni,$(ARCH)),-vnni)
 	avx2 = yes
 	pext = yes
 	avx512 = yes
-	vnni = yes
+	vnni512 = yes
 endif
 
 ifeq ($(sse),yes)
@@ -500,7 +513,14 @@ ifeq ($(avx512),yes)
 	endif
 endif
 
-ifeq ($(vnni),yes)
+ifeq ($(vnni256),yes)
+	CXXFLAGS += -DUSE_VNNI
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
+		CXXFLAGS += -mavx512vnni -mavx512dq -mavx512vl -mprefer-vector-width=256
+	endif
+endif
+
+ifeq ($(vnni512),yes)
 	CXXFLAGS += -DUSE_VNNI
 	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
 		CXXFLAGS += -mavx512vnni -mavx512dq -mavx512vl
@@ -623,7 +643,8 @@ help:
 	@echo ""
 	@echo "Supported archs:"
 	@echo ""
-	@echo "x86-64-vnni             > x86 64-bit with vnni support"
+	@echo "x86-64-vnni512          > x86 64-bit with vnni support 512bit wide"
+	@echo "x86-64-vnni256          > x86 64-bit with vnni support 256bit wide"
 	@echo "x86-64-avx512           > x86 64-bit with avx512 support"
 	@echo "x86-64-bmi2             > x86 64-bit with bmi2 support"
 	@echo "x86-64-avx2             > x86 64-bit with avx2 support"
@@ -767,7 +788,8 @@ config-sanity:
 	@echo "sse41: '$(sse41)'"
 	@echo "avx2: '$(avx2)'"
 	@echo "avx512: '$(avx512)'"
-	@echo "vnni: '$(vnni)'"
+	@echo "vnni256: '$(vnni256)'"
+	@echo "vnni512: '$(vnni512)'"
 	@echo "neon: '$(neon)'"
 	@echo ""
 	@echo "Flags:"
@@ -794,7 +816,8 @@ config-sanity:
 	@test "$(sse41)" = "yes" || test "$(sse41)" = "no"
 	@test "$(avx2)" = "yes" || test "$(avx2)" = "no"
 	@test "$(avx512)" = "yes" || test "$(avx512)" = "no"
-	@test "$(vnni)" = "yes" || test "$(vnni)" = "no"
+	@test "$(vnni256)" = "yes" || test "$(vnni256)" = "no"
+	@test "$(vnni512)" = "yes" || test "$(vnni512)" = "no"
 	@test "$(neon)" = "yes" || test "$(neon)" = "no"
 	@test "$(comp)" = "gcc" || test "$(comp)" = "icc" || test "$(comp)" = "mingw" || test "$(comp)" = "clang" \
 	|| test "$(comp)" = "armv7a-linux-androideabi16-clang"  || test "$(comp)" = "aarch64-linux-android21-clang"
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index 7ac5a1c0..94d0b5a9 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -85,8 +85,10 @@ namespace Eval::NNUE::Layers {
 
   #elif defined(USE_AVX2)
       constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
-      const __m256i kOnes = _mm256_set1_epi16(1);
       const auto input_vector = reinterpret_cast<const __m256i*>(input);
+  #if !defined(USE_VNNI)
+      const __m256i kOnes = _mm256_set1_epi16(1);
+  #endif
 
   #elif defined(USE_SSE2)
       constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
@@ -145,9 +147,13 @@ namespace Eval::NNUE::Layers {
         __m256i sum = _mm256_setzero_si256();
         const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
+  #if defined(USE_VNNI)
+          sum = _mm256_dpbusd_epi32(sum, _mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
+  #else
           __m256i product = _mm256_maddubs_epi16(_mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
           product = _mm256_madd_epi16(product, kOnes);
           sum = _mm256_add_epi32(sum, product);
+  #endif
         }
         __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
         sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));

From f7b3f0e8426bbf7414d139ed9d1cfa7a98d7314d Mon Sep 17 00:00:00 2001
From: Sami Kiminki <skiminki@users.noreply.github.com>
Date: Fri, 21 Aug 2020 12:12:39 +0300
Subject: [PATCH 52/58] Allow TT entries with key16==0 to be fetched

Fix the issue where a TT entry with key16==0 would always be reported
as a miss. Instead, we'll use depth8 to detect whether the TT entry is
occupied. In order to do that, we'll change DEPTH_OFFSET to -7
(depth8==0) to distinguish between an unoccupied entry and the
otherwise lowest possible depth, i.e., DEPTH_NONE (depth8==1).

To prevent a performance regression, we'll reorder the TT entry fields
by the access order of TranspositionTable::probe(). Memory in general
works fastest when accessed in sequential order. We'll also match the
store order in TTEntry::save() with the entry field order, and
re-order the 'if-or' expressions in TTEntry::save() from the cheapest
to the most expensive.

Finally, as we now have a proper TT entry occupancy test, we'll fix a
minor corner case with hashfull reporting. To reproduce:
- Use a big hash
- Either:
  a. Start 31 very quick searches (this wraparounds generation to 0); or
  b. Force generation of the first search to 0.
- go depth infinite

Before the fix, hashfull would incorrectly report nearly full hash
immediately after the search start, since
TranspositionTable::hashfull() used to consider only the entry
generation and not whether the entry was actually occupied.

STC:
LLR: 2.95 (-2.94,2.94) {-0.25,1.25}
Total: 36848 W: 4091 L: 3898 D: 28859
Ptnml(0-2): 158, 2996, 11972, 3091, 207
https://tests.stockfishchess.org/tests/view/5f3f98d5dc02a01a0c2881f7

LTC:
LLR: 2.95 (-2.94,2.94) {0.25,1.25}
Total: 32280 W: 1828 L: 1653 D: 28799
Ptnml(0-2): 34, 1428, 13051, 1583, 44
https://tests.stockfishchess.org/tests/view/5f3fe77a87a5c3c63d8f5332

closes https://github.com/official-stockfish/Stockfish/pull/3048

Bench: 3760677
---
 src/tt.cpp  | 21 +++++++++++----------
 src/tt.h    | 12 ++++++------
 src/types.h |  3 ++-
 3 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/src/tt.cpp b/src/tt.cpp
index d494c27d..60a3a5f1 100644
--- a/src/tt.cpp
+++ b/src/tt.cpp
@@ -37,18 +37,19 @@ void TTEntry::save(Key k, Value v, bool pv, Bound b, Depth d, Move m, Value ev)
   if (m || (uint16_t)k != key16)
       move16 = (uint16_t)m;
 
-  // Overwrite less valuable entries
-  if ((uint16_t)k != key16
-      || d - DEPTH_OFFSET > depth8 - 4
-      || b == BOUND_EXACT)
+  // Overwrite less valuable entries (cheapest checks first)
+  if (b == BOUND_EXACT
+      || (uint16_t)k != key16
+      || d - DEPTH_OFFSET > depth8 - 4)
   {
-      assert(d >= DEPTH_OFFSET);
+      assert(d > DEPTH_OFFSET);
+      assert(d < 256 + DEPTH_OFFSET);
 
       key16     = (uint16_t)k;
+      depth8    = (uint8_t)(d - DEPTH_OFFSET);
+      genBound8 = (uint8_t)(TT.generation8 | uint8_t(pv) << 2 | b);
       value16   = (int16_t)v;
       eval16    = (int16_t)ev;
-      genBound8 = (uint8_t)(TT.generation8 | uint8_t(pv) << 2 | b);
-      depth8    = (uint8_t)(d - DEPTH_OFFSET);
   }
 }
 
@@ -119,11 +120,11 @@ TTEntry* TranspositionTable::probe(const Key key, bool& found) const {
   const uint16_t key16 = (uint16_t)key;  // Use the low 16 bits as key inside the cluster
 
   for (int i = 0; i < ClusterSize; ++i)
-      if (!tte[i].key16 || tte[i].key16 == key16)
+      if (tte[i].key16 == key16 || !tte[i].depth8)
       {
           tte[i].genBound8 = uint8_t(generation8 | (tte[i].genBound8 & 0x7)); // Refresh
 
-          return found = (bool)tte[i].key16, &tte[i];
+          return found = (bool)tte[i].depth8, &tte[i];
       }
 
   // Find an entry to be replaced according to the replacement strategy
@@ -149,7 +150,7 @@ int TranspositionTable::hashfull() const {
   int cnt = 0;
   for (int i = 0; i < 1000; ++i)
       for (int j = 0; j < ClusterSize; ++j)
-          cnt += (table[i].entry[j].genBound8 & 0xF8) == generation8;
+          cnt += table[i].entry[j].depth8 && (table[i].entry[j].genBound8 & 0xF8) == generation8;
 
   return cnt / ClusterSize;
 }
diff --git a/src/tt.h b/src/tt.h
index c177ca52..fdfd6769 100644
--- a/src/tt.h
+++ b/src/tt.h
@@ -25,13 +25,13 @@
 /// TTEntry struct is the 10 bytes transposition table entry, defined as below:
 ///
 /// key        16 bit
-/// move       16 bit
-/// value      16 bit
-/// eval value 16 bit
+/// depth       8 bit
 /// generation  5 bit
 /// pv node     1 bit
 /// bound type  2 bit
-/// depth       8 bit
+/// move       16 bit
+/// value      16 bit
+/// eval value 16 bit
 
 struct TTEntry {
 
@@ -47,11 +47,11 @@ private:
   friend class TranspositionTable;
 
   uint16_t key16;
+  uint8_t  depth8;
+  uint8_t  genBound8;
   uint16_t move16;
   int16_t  value16;
   int16_t  eval16;
-  uint8_t  genBound8;
-  uint8_t  depth8;
 };
 
 
diff --git a/src/types.h b/src/types.h
index 73da41e2..1cd711b6 100644
--- a/src/types.h
+++ b/src/types.h
@@ -232,7 +232,8 @@ enum : int {
   DEPTH_QS_RECAPTURES = -5,
 
   DEPTH_NONE   = -6,
-  DEPTH_OFFSET = DEPTH_NONE
+
+  DEPTH_OFFSET = -7 // value used only for TT entry occupancy check
 };
 
 enum Square : int {

From 843a961a8c10d5949e04718b829e3b3d5adeedb4 Mon Sep 17 00:00:00 2001
From: Vizvezdenec <Vizvezdenec@gmail.com>
Date: Mon, 24 Aug 2020 08:04:16 +0300
Subject: [PATCH 53/58] Introduce countermove based pruning for qsearch

This patch continues work of previous patch in introducing pruning heuristics in qsearch by analogy to main search, now with countermove based pruning.
Idea is that if move is late enough and is quite check (we do generate them in qsearch) and has bad enough countermove history - prune it.

passed STC
https://tests.stockfishchess.org/tests/view/5f41220287a5c3c63d8f53c5
LLR: 2.93 (-2.94,2.94) {-0.25,1.25}
Total: 35944 W: 4127 L: 3929 D: 27888
Ptnml(0-2): 196, 2970, 11459, 3134, 213

passed LTC
https://tests.stockfishchess.org/tests/view/5f41862f87a5c3c63d8f53e8
LLR: 2.95 (-2.94,2.94) {0.25,1.25}
Total: 138448 W: 7655 L: 7252 D: 123541
Ptnml(0-2): 145, 6247, 56043, 6638, 151

closes https://github.com/official-stockfish/Stockfish/pull/3058

Bench: 3610676
---
 src/search.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/search.cpp b/src/search.cpp
index 266e2db3..2ca64a01 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1570,6 +1570,12 @@ moves_loop: // When in check, search starts from here
                                                                 [pos.moved_piece(move)]
                                                                 [to_sq(move)];
 
+      if (  !captureOrPromotion
+          && moveCount >= abs(depth) + 1
+          && (*contHist[0])[pos.moved_piece(move)][to_sq(move)] < CounterMovePruneThreshold
+          && (*contHist[1])[pos.moved_piece(move)][to_sq(move)] < CounterMovePruneThreshold)
+          continue;
+
       // Make and search the move
       pos.do_move(move, st, givesCheck);
       value = -qsearch<NT>(pos, ss+1, -beta, -alpha, depth - 1);

From 530fccbf272ffe424ae53a464b91db148cc968ae Mon Sep 17 00:00:00 2001
From: mstembera <MissingEmail@email>
Date: Mon, 24 Aug 2020 03:38:01 -0700
Subject: [PATCH 54/58] Allow for VNNI256 compilation with g++-8

explicitly pass needed -mavx512f -mavx512bw flags

closes https://github.com/official-stockfish/Stockfish/pull/3061

No functional change
---
 .travis.yml  | 3 +--
 src/Makefile | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index c1e6d6df..092c7f53 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -79,8 +79,7 @@ script:
   - make clean && make -j2 ARCH=x86-64-bmi2 build
   - make clean && make -j2 ARCH=x86-64-avx512 build
   - make clean && make -j2 ARCH=x86-64-vnni512 build
-  # requires gcc 9 or higher
-  - if [[ "$COMPILER" != "g++-8" ]]; make clean && make -j2 ARCH=x86-64-vnni256 build; fi
+  - make clean && make -j2 ARCH=x86-64-vnni256 build
 
   #
   # Check perft and reproducible search
diff --git a/src/Makefile b/src/Makefile
index 228ea851..2e85a144 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -516,7 +516,7 @@ endif
 ifeq ($(vnni256),yes)
 	CXXFLAGS += -DUSE_VNNI
 	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
-		CXXFLAGS += -mavx512vnni -mavx512dq -mavx512vl -mprefer-vector-width=256
+		CXXFLAGS += -mavx512f -mavx512bw -mavx512vnni -mavx512dq -mavx512vl -mprefer-vector-width=256
 	endif
 endif
 

From b0b4ca17db49ed03057b5fa4ee4a12dab0e9c9e6 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Mon, 24 Aug 2020 21:32:04 +0200
Subject: [PATCH 55/58] Check ARCH=.... variable

to prevent user errors or generating untested code,
check explicitly that the ARCH variable is equivalent to a supported architecture
as listed in `make help`.

To nevertheless compile for an untested target the user can override the internal
variable, passing the undocumented `SUPPORTED_ARCH=true` to make.

closes https://github.com/official-stockfish/Stockfish/pull/3062

No functional change.
---
 src/Makefile | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 2e85a144..703aa230 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -85,8 +85,15 @@ endif
 
 ### 2.1. General and architecture defaults
 
-ifeq ($(ARCH),)
-    empty_arch = yes
+# explicitly check for the list of supported architectures (as listed with make help),
+# the user can override with `make ARCH=x86-32-vnni256 SUPPORTED_ARCH=true`
+ifeq ($(ARCH),$(filter $(ARCH),x86-64-vnni512 x86-64-vnni256 x86-64-avx512 x86-64-bmi2 x86-64-avx2 \
+                               x86-64-sse41-popcnt x86-64-modern x86-64-ssse3 x86-64-sse3-popcnt \
+                               x86-64 x86-32-sse41-popcnt x86-32-sse2 x86-32 ppc-64 ppc-32 \
+                               armv7 armv7-neon armv8 apple-silicon general-64 general-32))
+   SUPPORTED_ARCH=true
+else
+   SUPPORTED_ARCH=false
 endif
 
 optimize = yes
@@ -625,6 +632,7 @@ endif
 ### Section 4. Public Targets
 ### ==========================================================================
 
+
 help:
 	@echo ""
 	@echo "To compile stockfish, type: "
@@ -684,10 +692,12 @@ help:
 	@echo "make -j profile-build ARCH=x86-64-bmi2 COMP=gcc COMPCXX=g++-9.0"
 	@echo "make -j build ARCH=x86-64-ssse3 COMP=clang"
 	@echo ""
-ifneq ($(empty_arch), yes)
 	@echo "-------------------------------"
+ifeq ($(SUPPORTED_ARCH), true)
 	@echo "The selected architecture $(ARCH) will enable the following configuration: "
 	@$(MAKE) ARCH=$(ARCH) COMP=$(COMP) config-sanity
+else
+	@echo "Specify a supported architecture with the ARCH option for more details"
 endif
 
 
@@ -802,6 +812,7 @@ config-sanity:
 	@test "$(debug)" = "yes" || test "$(debug)" = "no"
 	@test "$(sanitize)" = "undefined" || test "$(sanitize)" = "thread" || test "$(sanitize)" = "address" || test "$(sanitize)" = "no"
 	@test "$(optimize)" = "yes" || test "$(optimize)" = "no"
+	@test "$(SUPPORTED_ARCH)" = "true"
 	@test "$(arch)" = "any" || test "$(arch)" = "x86_64" || test "$(arch)" = "i386" || \
 	 test "$(arch)" = "ppc64" || test "$(arch)" = "ppc" || \
 	 test "$(arch)" = "armv7" || test "$(arch)" = "armv8" || test "$(arch)" = "arm64"

From 9b4967071e2fb116673820127522bc43d01d2257 Mon Sep 17 00:00:00 2001
From: syzygy1 <3028851+syzygy1@users.noreply.github.com>
Date: Mon, 24 Aug 2020 02:29:38 +0200
Subject: [PATCH 56/58] Remove EvalList

This patch removes the EvalList structure from the Position object and generally simplifies the interface between do_move() and the NNUE code.

The NNUE evaluation function first calculates the "accumulator". The accumulator consists of two halves: one for white's perspective, one for black's perspective.

If the "friendly king" has moved or the accumulator for the parent position is not available, the accumulator for this half has to be calculated from scratch. To do this, the NNUE node needs to know the positions and types of all non-king pieces and the position of the friendly king. This information can easily be obtained from the Position object.

If the "friendly king" has not moved, its half of the accumulator can be calculated by incrementally updating the accumulator for the previous position. For this, the NNUE code needs to know which pieces have been added to which squares and which pieces have been removed from which squares. In principle this information can be derived from the Position object and StateInfo struct (in the same way as undo_move() does this). However, it is probably a bit faster to prepare this information in do_move(), so I have kept the DirtyPiece struct. Since the DirtyPiece struct now stores the squares rather than "PieceSquare" indices, there are now at most three "dirty pieces" (previously two). A promotion move that captures a piece removes the capturing pawn and the captured piece from the board (to SQ_NONE) and moves the promoted piece to the promotion square (from SQ_NONE).

An STC test has confirmed a small speedup:

https://tests.stockfishchess.org/tests/view/5f43f06b5089a564a10d850a
LLR: 2.94 (-2.94,2.94) {-0.25,1.25}
Total: 87704 W: 9763 L: 9500 D: 68441
Ptnml(0-2): 426, 6950, 28845, 7197, 434

closes https://github.com/official-stockfish/Stockfish/pull/3068

No functional change
---
 src/nnue/evaluate_nnue.cpp      |  43 ++++++------
 src/nnue/features/feature_set.h |   3 +-
 src/nnue/features/half_kp.cpp   |  58 ++++++----------
 src/nnue/features/half_kp.h     |  10 +--
 src/nnue/nnue_common.h          |  21 ++++++
 src/position.cpp                |  95 +++++++-------------------
 src/position.h                  |  23 -------
 src/types.h                     | 116 +++-----------------------------
 8 files changed, 96 insertions(+), 273 deletions(-)

diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index dfbb1ac2..e6619089 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -29,30 +29,29 @@
 
 #include "evaluate_nnue.h"
 
-ExtPieceSquare kpp_board_index[PIECE_NB] = {
- // convention: W - us, B - them
- // viewed from other side, W and B are reversed
-    { PS_NONE,     PS_NONE     },
-    { PS_W_PAWN,   PS_B_PAWN   },
-    { PS_W_KNIGHT, PS_B_KNIGHT },
-    { PS_W_BISHOP, PS_B_BISHOP },
-    { PS_W_ROOK,   PS_B_ROOK   },
-    { PS_W_QUEEN,  PS_B_QUEEN  },
-    { PS_W_KING,   PS_B_KING   },
-    { PS_NONE,     PS_NONE     },
-    { PS_NONE,     PS_NONE     },
-    { PS_B_PAWN,   PS_W_PAWN   },
-    { PS_B_KNIGHT, PS_W_KNIGHT },
-    { PS_B_BISHOP, PS_W_BISHOP },
-    { PS_B_ROOK,   PS_W_ROOK   },
-    { PS_B_QUEEN,  PS_W_QUEEN  },
-    { PS_B_KING,   PS_W_KING   },
-    { PS_NONE,     PS_NONE     }
-};
-
-
 namespace Eval::NNUE {
 
+  uint32_t kpp_board_index[PIECE_NB][COLOR_NB] = {
+   // convention: W - us, B - them
+   // viewed from other side, W and B are reversed
+      { PS_NONE,     PS_NONE     },
+      { PS_W_PAWN,   PS_B_PAWN   },
+      { PS_W_KNIGHT, PS_B_KNIGHT },
+      { PS_W_BISHOP, PS_B_BISHOP },
+      { PS_W_ROOK,   PS_B_ROOK   },
+      { PS_W_QUEEN,  PS_B_QUEEN  },
+      { PS_W_KING,   PS_B_KING   },
+      { PS_NONE,     PS_NONE     },
+      { PS_NONE,     PS_NONE     },
+      { PS_B_PAWN,   PS_W_PAWN   },
+      { PS_B_KNIGHT, PS_W_KNIGHT },
+      { PS_B_BISHOP, PS_W_BISHOP },
+      { PS_B_ROOK,   PS_W_ROOK   },
+      { PS_B_QUEEN,  PS_W_QUEEN  },
+      { PS_B_KING,   PS_W_KING   },
+      { PS_NONE,     PS_NONE     }
+  };
+
   // Input feature converter
   AlignedPtr<FeatureTransformer> feature_transformer;
 
diff --git a/src/nnue/features/feature_set.h b/src/nnue/features/feature_set.h
index 79ca83ae..558a6b22 100644
--- a/src/nnue/features/feature_set.h
+++ b/src/nnue/features/feature_set.h
@@ -68,8 +68,7 @@ namespace Eval::NNUE::Features {
         reset[perspective] = false;
         switch (trigger) {
           case TriggerEvent::kFriendKingMoved:
-            reset[perspective] =
-                dp.pieceId[0] == PIECE_ID_KING + perspective;
+            reset[perspective] = dp.piece[0] == make_piece(perspective, KING);
             break;
           default:
             assert(false);
diff --git a/src/nnue/features/half_kp.cpp b/src/nnue/features/half_kp.cpp
index 628add6e..88e384a3 100644
--- a/src/nnue/features/half_kp.cpp
+++ b/src/nnue/features/half_kp.cpp
@@ -23,25 +23,17 @@
 
 namespace Eval::NNUE::Features {
 
-  // Find the index of the feature quantity from the king position and PieceSquare
-  template <Side AssociatedKing>
-  inline IndexType HalfKP<AssociatedKing>::MakeIndex(Square sq_k, PieceSquare p) {
-    return static_cast<IndexType>(PS_END) * static_cast<IndexType>(sq_k) + p;
+  // Orient a square according to perspective (rotates by 180 for black)
+  inline Square orient(Color perspective, Square s) {
+    return Square(int(s) ^ (bool(perspective) * 63));
   }
 
-  // Get pieces information
+  // Find the index of the feature quantity from the king position and PieceSquare
   template <Side AssociatedKing>
-  inline void HalfKP<AssociatedKing>::GetPieces(
-      const Position& pos, Color perspective,
-      PieceSquare** pieces, Square* sq_target_k) {
+  inline IndexType HalfKP<AssociatedKing>::MakeIndex(
+      Color perspective, Square s, Piece pc, Square ksq) {
 
-    *pieces = (perspective == BLACK) ?
-        pos.eval_list()->piece_list_fb() :
-        pos.eval_list()->piece_list_fw();
-    const PieceId target = (AssociatedKing == Side::kFriend) ?
-        static_cast<PieceId>(PIECE_ID_KING + perspective) :
-        static_cast<PieceId>(PIECE_ID_KING + ~perspective);
-    *sq_target_k = static_cast<Square>(((*pieces)[target] - PS_W_KING) % SQUARE_NB);
+    return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective] + PS_END * ksq);
   }
 
   // Get a list of indices for active features
@@ -49,16 +41,11 @@ namespace Eval::NNUE::Features {
   void HalfKP<AssociatedKing>::AppendActiveIndices(
       const Position& pos, Color perspective, IndexList* active) {
 
-    // Do nothing if array size is small to avoid compiler warning
-    if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
-
-    PieceSquare* pieces;
-    Square sq_target_k;
-    GetPieces(pos, perspective, &pieces, &sq_target_k);
-    for (PieceId i = PIECE_ID_ZERO; i < PIECE_ID_KING; ++i) {
-      if (pieces[i] != PS_NONE) {
-        active->push_back(MakeIndex(sq_target_k, pieces[i]));
-      }
+    Square ksq = orient(perspective, pos.square<KING>(perspective));
+    Bitboard bb = pos.pieces() & ~pos.pieces(KING);
+    while (bb) {
+      Square s = pop_lsb(&bb);
+      active->push_back(MakeIndex(perspective, s, pos.piece_on(s), ksq));
     }
   }
 
@@ -68,22 +55,15 @@ namespace Eval::NNUE::Features {
       const Position& pos, Color perspective,
       IndexList* removed, IndexList* added) {
 
-    PieceSquare* pieces;
-    Square sq_target_k;
-    GetPieces(pos, perspective, &pieces, &sq_target_k);
+    Square ksq = orient(perspective, pos.square<KING>(perspective));
     const auto& dp = pos.state()->dirtyPiece;
     for (int i = 0; i < dp.dirty_num; ++i) {
-      if (dp.pieceId[i] >= PIECE_ID_KING) continue;
-      const auto old_p = static_cast<PieceSquare>(
-          dp.old_piece[i].from[perspective]);
-      if (old_p != PS_NONE) {
-        removed->push_back(MakeIndex(sq_target_k, old_p));
-      }
-      const auto new_p = static_cast<PieceSquare>(
-          dp.new_piece[i].from[perspective]);
-      if (new_p != PS_NONE) {
-        added->push_back(MakeIndex(sq_target_k, new_p));
-      }
+      Piece pc = dp.piece[i];
+      if (type_of(pc) == KING) continue;
+      if (dp.from[i] != SQ_NONE)
+        removed->push_back(MakeIndex(perspective, dp.from[i], pc, ksq));
+      if (dp.to[i] != SQ_NONE)
+        added->push_back(MakeIndex(perspective, dp.to[i], pc, ksq));
     }
   }
 
diff --git a/src/nnue/features/half_kp.h b/src/nnue/features/half_kp.h
index 99842eea..ee6a8df3 100644
--- a/src/nnue/features/half_kp.h
+++ b/src/nnue/features/half_kp.h
@@ -41,7 +41,7 @@ namespace Eval::NNUE::Features {
     static constexpr IndexType kDimensions =
         static_cast<IndexType>(SQUARE_NB) * static_cast<IndexType>(PS_END);
     // Maximum number of simultaneously active features
-    static constexpr IndexType kMaxActiveDimensions = PIECE_ID_KING;
+    static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
     // Trigger for full calculation instead of difference calculation
     static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kFriendKingMoved;
 
@@ -53,13 +53,9 @@ namespace Eval::NNUE::Features {
     static void AppendChangedIndices(const Position& pos, Color perspective,
                                      IndexList* removed, IndexList* added);
 
-    // Index of a feature for a given king position and another piece on some square
-    static IndexType MakeIndex(Square sq_k, PieceSquare p);
-
    private:
-    // Get pieces information
-    static void GetPieces(const Position& pos, Color perspective,
-                          PieceSquare** pieces, Square* sq_target_k);
+    // Index of a feature for a given king position and another piece on some square
+    static IndexType MakeIndex(Color perspective, Square s, Piece pc, Square sq_k);
   };
 
 }  // namespace Eval::NNUE::Features
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index a9d8e5af..7bc905dc 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -94,6 +94,27 @@ namespace Eval::NNUE {
 
   constexpr std::size_t kMaxSimdWidth = 32;
 
+  // unique number for each piece type on each square
+  enum {
+    PS_NONE     =  0,
+    PS_W_PAWN   =  1,
+    PS_B_PAWN   =  1 * SQUARE_NB + 1,
+    PS_W_KNIGHT =  2 * SQUARE_NB + 1,
+    PS_B_KNIGHT =  3 * SQUARE_NB + 1,
+    PS_W_BISHOP =  4 * SQUARE_NB + 1,
+    PS_B_BISHOP =  5 * SQUARE_NB + 1,
+    PS_W_ROOK   =  6 * SQUARE_NB + 1,
+    PS_B_ROOK   =  7 * SQUARE_NB + 1,
+    PS_W_QUEEN  =  8 * SQUARE_NB + 1,
+    PS_B_QUEEN  =  9 * SQUARE_NB + 1,
+    PS_W_KING   = 10 * SQUARE_NB + 1,
+    PS_END      = PS_W_KING, // pieces without kings (pawns included)
+    PS_B_KING   = 11 * SQUARE_NB + 1,
+    PS_END2     = 12 * SQUARE_NB + 1
+  };
+
+  extern uint32_t kpp_board_index[PIECE_NB][COLOR_NB];
+
   // Type of input feature after conversion
   using TransformedFeatureType = std::uint8_t;
   using IndexType = std::uint32_t;
diff --git a/src/position.cpp b/src/position.cpp
index 02898547..fe89b753 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -198,9 +198,6 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th
   std::fill_n(&pieceList[0][0], sizeof(pieceList) / sizeof(Square), SQ_NONE);
   st = si;
 
-  // Each piece on board gets a unique ID used to track the piece later
-  PieceId piece_id, next_piece_id = PIECE_ID_ZERO;
-
   ss >> std::noskipws;
 
   // 1. Piece placement
@@ -212,21 +209,8 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th
       else if (token == '/')
           sq += 2 * SOUTH;
 
-      else if ((idx = PieceToChar.find(token)) != string::npos)
-      {
-          auto pc = Piece(idx);
-          put_piece(pc, sq);
-
-          if (Eval::useNNUE)
-          {
-              // Kings get a fixed ID, other pieces get ID in order of placement
-              piece_id =
-                (idx == W_KING) ? PIECE_ID_WKING :
-                (idx == B_KING) ? PIECE_ID_BKING :
-                next_piece_id++;
-              evalList.put_piece(piece_id, sq, pc);
-          }
-
+      else if ((idx = PieceToChar.find(token)) != string::npos) {
+          put_piece(Piece(idx), sq);
           ++sq;
       }
   }
@@ -721,8 +705,6 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
   // Used by NNUE
   st->accumulator.computed_accumulation = false;
   st->accumulator.computed_score = false;
-  PieceId dp0 = PIECE_ID_NONE;
-  PieceId dp1 = PIECE_ID_NONE;
   auto& dp = st->dirtyPiece;
   dp.dirty_num = 1;
 
@@ -775,12 +757,10 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
 
       if (Eval::useNNUE)
       {
-          dp.dirty_num = 2; // 2 pieces moved
-          dp1 = piece_id_on(capsq);
-          dp.pieceId[1] = dp1;
-          dp.old_piece[1] = evalList.piece_with_id(dp1);
-          evalList.put_piece(dp1, capsq, NO_PIECE);
-          dp.new_piece[1] = evalList.piece_with_id(dp1);
+          dp.dirty_num = 2;  // 1 piece moved, 1 piece captured
+          dp.piece[1] = captured;
+          dp.from[1] = capsq;
+          dp.to[1] = SQ_NONE;
       }
 
       // Update board and piece lists
@@ -821,11 +801,9 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
   {
       if (Eval::useNNUE)
       {
-          dp0 = piece_id_on(from);
-          dp.pieceId[0] = dp0;
-          dp.old_piece[0] = evalList.piece_with_id(dp0);
-          evalList.put_piece(dp0, to, pc);
-          dp.new_piece[0] = evalList.piece_with_id(dp0);
+          dp.piece[0] = pc;
+          dp.from[0] = from;
+          dp.to[0] = to;
       }
 
       move_piece(from, to);
@@ -854,9 +832,12 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
 
           if (Eval::useNNUE)
           {
-              dp0 = piece_id_on(to);
-              evalList.put_piece(dp0, to, promotion);
-              dp.new_piece[0] = evalList.piece_with_id(dp0);
+              // Promoting pawn to SQ_NONE, promoted piece from SQ_NONE
+              dp.to[0] = SQ_NONE;
+              dp.piece[dp.dirty_num] = promotion;
+              dp.from[dp.dirty_num] = SQ_NONE;
+              dp.to[dp.dirty_num] = to;
+              dp.dirty_num++;
           }
 
           // Update hash keys
@@ -950,12 +931,6 @@ void Position::undo_move(Move m) {
   {
       move_piece(to, from); // Put the piece back at the source square
 
-      if (Eval::useNNUE)
-      {
-          PieceId dp0 = st->dirtyPiece.pieceId[0];
-          evalList.put_piece(dp0, from, pc);
-      }
-
       if (st->capturedPiece)
       {
           Square capsq = to;
@@ -972,14 +947,6 @@ void Position::undo_move(Move m) {
           }
 
           put_piece(st->capturedPiece, capsq); // Restore the captured piece
-
-          if (Eval::useNNUE)
-          {
-              PieceId dp1 = st->dirtyPiece.pieceId[1];
-              assert(evalList.piece_with_id(dp1).from[WHITE] == PS_NONE);
-              assert(evalList.piece_with_id(dp1).from[BLACK] == PS_NONE);
-              evalList.put_piece(dp1, capsq, st->capturedPiece);
-          }
       }
   }
 
@@ -1001,32 +968,16 @@ void Position::do_castling(Color us, Square from, Square& to, Square& rfrom, Squ
   rto = relative_square(us, kingSide ? SQ_F1 : SQ_D1);
   to = relative_square(us, kingSide ? SQ_G1 : SQ_C1);
 
-  if (Eval::useNNUE)
+  if (Do && Eval::useNNUE)
   {
-      PieceId dp0, dp1;
       auto& dp = st->dirtyPiece;
-      dp.dirty_num = 2; // 2 pieces moved
-
-      if (Do)
-      {
-          dp0 = piece_id_on(from);
-          dp1 = piece_id_on(rfrom);
-          dp.pieceId[0] = dp0;
-          dp.old_piece[0] = evalList.piece_with_id(dp0);
-          evalList.put_piece(dp0, to, make_piece(us, KING));
-          dp.new_piece[0] = evalList.piece_with_id(dp0);
-          dp.pieceId[1] = dp1;
-          dp.old_piece[1] = evalList.piece_with_id(dp1);
-          evalList.put_piece(dp1, rto, make_piece(us, ROOK));
-          dp.new_piece[1] = evalList.piece_with_id(dp1);
-      }
-      else
-      {
-          dp0 = piece_id_on(to);
-          dp1 = piece_id_on(rto);
-          evalList.put_piece(dp0, from, make_piece(us, KING));
-          evalList.put_piece(dp1, rfrom, make_piece(us, ROOK));
-      }
+      dp.piece[0] = make_piece(us, KING);
+      dp.from[0] = from;
+      dp.to[0] = to;
+      dp.piece[1] = make_piece(us, ROOK);
+      dp.from[1] = rfrom;
+      dp.to[1] = rto;
+      dp.dirty_num = 2;
   }
 
   // Remove both pieces first since squares could overlap in Chess960
diff --git a/src/position.h b/src/position.h
index 5ce17277..d6f5c9fd 100644
--- a/src/position.h
+++ b/src/position.h
@@ -171,7 +171,6 @@ public:
 
   // Used by NNUE
   StateInfo* state() const;
-  const EvalList* eval_list() const;
 
 private:
   // Initialization helpers (used while setting up a position)
@@ -186,9 +185,6 @@ private:
   template<bool Do>
   void do_castling(Color us, Square from, Square& to, Square& rfrom, Square& rto);
 
-  // ID of a piece on a given square
-  PieceId piece_id_on(Square sq) const;
-
   // Data members
   Piece board[SQUARE_NB];
   Bitboard byTypeBB[PIECE_TYPE_NB];
@@ -205,9 +201,6 @@ private:
   Thread* thisThread;
   StateInfo* st;
   bool chess960;
-
-  // List of pieces used in NNUE evaluation function
-  EvalList evalList;
 };
 
 namespace PSQT {
@@ -451,20 +444,4 @@ inline StateInfo* Position::state() const {
   return st;
 }
 
-inline const EvalList* Position::eval_list() const {
-
-  return &evalList;
-}
-
-inline PieceId Position::piece_id_on(Square sq) const
-{
-
-  assert(piece_on(sq) != NO_PIECE);
-
-  PieceId pid = evalList.piece_id_list[sq];
-  assert(is_ok(pid));
-
-  return pid;
-}
-
 #endif // #ifndef POSITION_H_INCLUDED
diff --git a/src/types.h b/src/types.h
index 1cd711b6..5873c698 100644
--- a/src/types.h
+++ b/src/types.h
@@ -201,22 +201,6 @@ enum Piece {
   PIECE_NB = 16
 };
 
-// An ID used to track the pieces. Max. 32 pieces on board.
-enum PieceId {
-  PIECE_ID_ZERO   = 0,
-  PIECE_ID_KING   = 30,
-  PIECE_ID_WKING  = 30,
-  PIECE_ID_BKING  = 31,
-  PIECE_ID_NONE   = 32
-};
-
-inline PieceId operator++(PieceId& d, int) {
-
-  PieceId x = d;
-  d = PieceId(int(d) + 1);
-  return x;
-}
-
 constexpr Value PieceValue[PHASE_NB][PIECE_NB] = {
   { VALUE_ZERO, PawnValueMg, KnightValueMg, BishopValueMg, RookValueMg, QueenValueMg, VALUE_ZERO, VALUE_ZERO,
     VALUE_ZERO, PawnValueMg, KnightValueMg, BishopValueMg, RookValueMg, QueenValueMg, VALUE_ZERO, VALUE_ZERO },
@@ -271,93 +255,20 @@ enum Rank : int {
   RANK_1, RANK_2, RANK_3, RANK_4, RANK_5, RANK_6, RANK_7, RANK_8, RANK_NB
 };
 
-// unique number for each piece type on each square
-enum PieceSquare : uint32_t {
-  PS_NONE     =  0,
-  PS_W_PAWN   =  1,
-  PS_B_PAWN   =  1 * SQUARE_NB + 1,
-  PS_W_KNIGHT =  2 * SQUARE_NB + 1,
-  PS_B_KNIGHT =  3 * SQUARE_NB + 1,
-  PS_W_BISHOP =  4 * SQUARE_NB + 1,
-  PS_B_BISHOP =  5 * SQUARE_NB + 1,
-  PS_W_ROOK   =  6 * SQUARE_NB + 1,
-  PS_B_ROOK   =  7 * SQUARE_NB + 1,
-  PS_W_QUEEN  =  8 * SQUARE_NB + 1,
-  PS_B_QUEEN  =  9 * SQUARE_NB + 1,
-  PS_W_KING   = 10 * SQUARE_NB + 1,
-  PS_END      = PS_W_KING, // pieces without kings (pawns included)
-  PS_B_KING   = 11 * SQUARE_NB + 1,
-  PS_END2     = 12 * SQUARE_NB + 1
-};
-
-struct ExtPieceSquare {
-  PieceSquare from[COLOR_NB];
-};
-
-// Array for finding the PieceSquare corresponding to the piece on the board
-extern ExtPieceSquare kpp_board_index[PIECE_NB];
-
-constexpr bool is_ok(PieceId pid);
-constexpr Square rotate180(Square sq);
-
-// Structure holding which tracked piece (PieceId) is where (PieceSquare)
-class EvalList {
-
-public:
-  // Max. number of pieces without kings is 30 but must be a multiple of 4 in AVX2
-  static const int MAX_LENGTH = 32;
-
-  // Array that holds the piece id for the pieces on the board
-  PieceId piece_id_list[SQUARE_NB];
-
-  // List of pieces, separate from White and Black POV
-  PieceSquare* piece_list_fw() const { return const_cast<PieceSquare*>(pieceListFw); }
-  PieceSquare* piece_list_fb() const { return const_cast<PieceSquare*>(pieceListFb); }
-
-  // Place the piece pc with piece_id on the square sq on the board
-  void put_piece(PieceId piece_id, Square sq, Piece pc)
-  {
-      assert(is_ok(piece_id));
-      if (pc != NO_PIECE)
-      {
-          pieceListFw[piece_id] = PieceSquare(kpp_board_index[pc].from[WHITE] + sq);
-          pieceListFb[piece_id] = PieceSquare(kpp_board_index[pc].from[BLACK] + rotate180(sq));
-          piece_id_list[sq] = piece_id;
-      }
-      else
-      {
-          pieceListFw[piece_id] = PS_NONE;
-          pieceListFb[piece_id] = PS_NONE;
-          piece_id_list[sq] = piece_id;
-      }
-  }
-
-  // Convert the specified piece_id piece to ExtPieceSquare type and return it
-  ExtPieceSquare piece_with_id(PieceId piece_id) const
-  {
-      ExtPieceSquare eps;
-      eps.from[WHITE] = pieceListFw[piece_id];
-      eps.from[BLACK] = pieceListFb[piece_id];
-      return eps;
-  }
-
-private:
-  PieceSquare pieceListFw[MAX_LENGTH];
-  PieceSquare pieceListFb[MAX_LENGTH];
-};
-
-// For differential evaluation of pieces that changed since last turn
+// Keep track of what a move changes on the board (used by NNUE)
 struct DirtyPiece {
 
   // Number of changed pieces
   int dirty_num;
 
-  // The ids of changed pieces, max. 2 pieces can change in one move
-  PieceId pieceId[2];
+  // Max 3 pieces can change in one move. A promotion with capture moves
+  // both the pawn and the captured piece to SQ_NONE and the piece promoted
+  // to from SQ_NONE to the capture square.
+  Piece piece[3];
 
-  // What changed from the piece with that piece number
-  ExtPieceSquare old_piece[2];
-  ExtPieceSquare new_piece[2];
+  // From and to squares, which may be SQ_NONE
+  Square from[3];
+  Square to[3];
 };
 
 /// Score enum stores a middlegame and an endgame value in a single integer (enum).
@@ -407,8 +318,6 @@ ENABLE_FULL_OPERATORS_ON(Value)
 ENABLE_FULL_OPERATORS_ON(Direction)
 
 ENABLE_INCR_OPERATORS_ON(Piece)
-ENABLE_INCR_OPERATORS_ON(PieceSquare)
-ENABLE_INCR_OPERATORS_ON(PieceId)
 ENABLE_INCR_OPERATORS_ON(PieceType)
 ENABLE_INCR_OPERATORS_ON(Square)
 ENABLE_INCR_OPERATORS_ON(File)
@@ -497,10 +406,6 @@ inline Color color_of(Piece pc) {
   return Color(pc >> 3);
 }
 
-constexpr bool is_ok(PieceId pid) {
-  return pid < PIECE_ID_NONE;
-}
-
 constexpr bool is_ok(Square s) {
   return s >= SQ_A1 && s <= SQ_H8;
 }
@@ -537,11 +442,6 @@ constexpr Square to_sq(Move m) {
   return Square(m & 0x3F);
 }
 
-// Return relative square when turning the board 180 degrees
-constexpr Square rotate180(Square sq) {
-  return (Square)(sq ^ 0x3F);
-}
-
 constexpr int from_to(Move m) {
  return m & 0xFFF;
 }

From 95b8f3f8005598fc3ad07a7a8f6440d828cabc29 Mon Sep 17 00:00:00 2001
From: VoyagerOne <excelgeek@gmail.com>
Date: Sun, 23 Aug 2020 12:04:50 -0400
Subject: [PATCH 57/58] Remove Reduce Depth

Remove Reduce Depth at PV nodes.

STC:
LLR: 2.94 (-2.94,2.94) {-1.25,0.25}
Total: 56760 W: 6299 L: 6236 D: 44225
Ptnml(0-2): 286, 4843, 18076, 4872, 303
https://tests.stockfishchess.org/tests/view/5f41356087a5c3c63d8f53c9

LTC:
LLR: 2.95 (-2.94,2.94) {-0.75,0.25}
Total: 17496 W: 954 L: 865 D: 15677
Ptnml(0-2): 13, 768, 7098, 855, 14
https://tests.stockfishchess.org/tests/view/5f41bb7687a5c3c63d8f53f9

closes https://github.com/official-stockfish/Stockfish/pull/3055

Bench: 3555051
---
 src/search.cpp | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index 2ca64a01..d6b611a3 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -940,12 +940,6 @@ namespace {
             }
     }
 
-    // Step 11. If the position is not in TT, decrease depth by 2
-    if (   PvNode
-        && depth >= 6
-        && !ttMove)
-        depth -= 2;
-
 moves_loop: // When in check, search starts from here
 
     const PieceToHistory* contHist[] = { (ss-1)->continuationHistory, (ss-2)->continuationHistory,
@@ -969,7 +963,7 @@ moves_loop: // When in check, search starts from here
     // Mark this node as being searched
     ThreadHolding th(thisThread, posKey, ss->ply);
 
-    // Step 12. Loop through all pseudo-legal moves until no moves remain
+    // Step 11. Loop through all pseudo-legal moves until no moves remain
     // or a beta cutoff occurs.
     while ((move = mp.next_move(moveCountPruning)) != MOVE_NONE)
     {
@@ -1007,7 +1001,7 @@ moves_loop: // When in check, search starts from here
       // Calculate new depth for this move
       newDepth = depth - 1;
 
-      // Step 13. Pruning at shallow depth (~200 Elo)
+      // Step 12. Pruning at shallow depth (~200 Elo)
       if (  !rootNode
           && pos.non_pawn_material(us)
           && bestValue > VALUE_TB_LOSS_IN_MAX_PLY)
@@ -1065,7 +1059,7 @@ moves_loop: // When in check, search starts from here
           }
       }
 
-      // Step 14. Extensions (~75 Elo)
+      // Step 13. Extensions (~75 Elo)
 
       // Singular extension search (~70 Elo). If all moves but one fail low on a
       // search of (alpha-s, beta-s), and just one fails high on (alpha, beta),
@@ -1148,10 +1142,10 @@ moves_loop: // When in check, search starts from here
                                                                 [movedPiece]
                                                                 [to_sq(move)];
 
-      // Step 15. Make the move
+      // Step 14. Make the move
       pos.do_move(move, st, givesCheck);
 
-      // Step 16. Reduced depth search (LMR, ~200 Elo). If the move fails high it will be
+      // Step 15. Reduced depth search (LMR, ~200 Elo). If the move fails high it will be
       // re-searched at full depth.
       if (    depth >= 3
           &&  moveCount > 1 + 2 * rootNode + 2 * (PvNode && abs(bestValue) < 2)
@@ -1254,7 +1248,7 @@ moves_loop: // When in check, search starts from here
           didLMR = false;
       }
 
-      // Step 17. Full depth search when LMR is skipped or fails high
+      // Step 16. Full depth search when LMR is skipped or fails high
       if (doFullDepthSearch)
       {
           value = -search<NonPV>(pos, ss+1, -(alpha+1), -alpha, newDepth, !cutNode);
@@ -1282,12 +1276,12 @@ moves_loop: // When in check, search starts from here
           value = -search<PV>(pos, ss+1, -beta, -alpha, newDepth, false);
       }
 
-      // Step 18. Undo move
+      // Step 17. Undo move
       pos.undo_move(move);
 
       assert(value > -VALUE_INFINITE && value < VALUE_INFINITE);
 
-      // Step 19. Check for a new best move
+      // Step 18. Check for a new best move
       // Finished searching the move. If a stop occurred, the return value of
       // the search cannot be trusted, and we return immediately without
       // updating best move, PV and TT.
@@ -1364,7 +1358,7 @@ moves_loop: // When in check, search starts from here
         return VALUE_DRAW;
     */
 
-    // Step 20. Check for mate and stalemate
+    // Step 19. Check for mate and stalemate
     // All legal moves have been searched and if there are no legal moves, it
     // must be a mate or a stalemate. If we are in a singular extension search then
     // return a fail low score.

From 242a7d9fead561488ca176a4687deef8859918f2 Mon Sep 17 00:00:00 2001
From: VoyagerOne <excelgeek@gmail.com>
Date: Tue, 25 Aug 2020 09:10:47 -0400
Subject: [PATCH 58/58] Simplify MCP in QS

Simplify moveCount pruning in QS by removing depth dependency.

STC
LLR: 2.94 (-2.94,2.94) {-1.25,0.25}
Total: 42960 W: 4741 L: 4661 D: 33558
Ptnml(0-2): 218, 3574, 13804, 3678, 206
https://tests.stockfishchess.org/tests/view/5f42e3f75089a564a10d8493

LTC
LLR: 2.94 (-2.94,2.94) {-0.75,0.25}
Total: 66672 W: 3563 L: 3508 D: 59601
Ptnml(0-2): 71, 3064, 26996, 3149, 56
https://tests.stockfishchess.org/tests/view/5f4353285089a564a10d84d0

closes https://github.com/official-stockfish/Stockfish/pull/3067

Bench: 4074430
---
 src/search.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/search.cpp b/src/search.cpp
index d6b611a3..cae8a684 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1526,7 +1526,7 @@ moves_loop: // When in check, search starts from here
           assert(type_of(move) != ENPASSANT); // Due to !pos.advanced_pawn_push
 
           // moveCount pruning
-          if (moveCount > abs(depth) + 2)
+          if (moveCount > 2)
               continue;
 
           futilityValue = futilityBase + PieceValue[EG][pos.piece_on(to_sq(move))];