From 651ec3b31ee68db50f38ccd8fcdedbd6673cd9ed Mon Sep 17 00:00:00 2001 From: Joost VandeVondele Date: Mon, 10 Aug 2020 07:18:15 +0200 Subject: [PATCH 01/58] Revert "Avoid special casing for MinGW" This reverts commit a6e89293df5af35931b61d86b6de3872a981c100. The offending setup has been found as gcc/mingw 7.3 (on Ubuntu 18.04). fixes https://github.com/official-stockfish/Stockfish/issues/2963 closes https://github.com/official-stockfish/Stockfish/issues/2968 No functional change. --- src/nnue/layers/affine_transform.h | 29 ++++++++++++++--- src/nnue/layers/clipped_relu.h | 49 +++++++++++++++++++++++++---- src/nnue/nnue_feature_transformer.h | 34 ++++++++++++++++++-- 3 files changed, 98 insertions(+), 14 deletions(-) diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index ecc3008a..b585bc87 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -104,8 +104,13 @@ namespace Eval::NNUE::Layers { __m512i sum = _mm512_setzero_si512(); const auto row = reinterpret_cast(&weights_[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { - __m512i product = _mm512_maddubs_epi16( - _mm512_load_si512(&input_vector[j]), _mm512_load_si512(&row[j])); + + #if defined(__MINGW32__) || defined(__MINGW64__) + __m512i product = _mm512_maddubs_epi16(_mm512_loadu_si512(&input_vector[j]), _mm512_load_si512(&row[j])); + #else + __m512i product = _mm512_maddubs_epi16(_mm512_load_si512(&input_vector[j]), _mm512_load_si512(&row[j])); + #endif + product = _mm512_madd_epi16(product, kOnes); sum = _mm512_add_epi32(sum, product); } @@ -120,8 +125,12 @@ namespace Eval::NNUE::Layers { const auto row_256 = reinterpret_cast(&weights_[offset]); int j = kNumChunks * 2; - __m256i sum256 = _mm256_maddubs_epi16( - _mm256_load_si256(&iv_256[j]), _mm256_load_si256(&row_256[j])); + #if defined(__MINGW32__) || defined(__MINGW64__) // See HACK comment below in AVX2. + __m256i sum256 = _mm256_maddubs_epi16(_mm256_loadu_si256(&iv_256[j]), _mm256_load_si256(&row_256[j])); + #else + __m256i sum256 = _mm256_maddubs_epi16(_mm256_load_si256(&iv_256[j]), _mm256_load_si256(&row_256[j])); + #endif + sum256 = _mm256_madd_epi16(sum256, _mm256_set1_epi16(1)); sum256 = _mm256_hadd_epi32(sum256, sum256); sum256 = _mm256_hadd_epi32(sum256, sum256); @@ -135,7 +144,17 @@ namespace Eval::NNUE::Layers { const auto row = reinterpret_cast(&weights_[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { __m256i product = _mm256_maddubs_epi16( - _mm256_load_si256(&input_vector[j]), _mm256_load_si256(&row[j])); + + #if defined(__MINGW32__) || defined(__MINGW64__) + // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary + // compiled with g++ in MSYS2 crashes here because the output memory is not aligned + // even though alignas is specified. + _mm256_loadu_si256 + #else + _mm256_load_si256 + #endif + + (&input_vector[j]), _mm256_load_si256(&row[j])); product = _mm256_madd_epi16(product, kOnes); sum = _mm256_add_epi32(sum, product); } diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h index 7e5fcf4a..7ade598f 100644 --- a/src/nnue/layers/clipped_relu.h +++ b/src/nnue/layers/clipped_relu.h @@ -74,13 +74,50 @@ namespace Eval::NNUE::Layers { const auto out = reinterpret_cast<__m256i*>(output); for (IndexType i = 0; i < kNumChunks; ++i) { const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32( - _mm256_load_si256(&in[i * 4 + 0]), - _mm256_load_si256(&in[i * 4 + 1])), kWeightScaleBits); + + #if defined(__MINGW32__) || defined(__MINGW64__) + // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary + // compiled with g++ in MSYS2 crashes here because the output memory is not aligned + // even though alignas is specified. + _mm256_loadu_si256 + #else + _mm256_load_si256 + #endif + + (&in[i * 4 + 0]), + + #if defined(__MINGW32__) || defined(__MINGW64__) + _mm256_loadu_si256 + #else + _mm256_load_si256 + #endif + + (&in[i * 4 + 1])), kWeightScaleBits); const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32( - _mm256_load_si256(&in[i * 4 + 2]), - _mm256_load_si256(&in[i * 4 + 3])), kWeightScaleBits); - _mm256_store_si256( - &out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8( + + #if defined(__MINGW32__) || defined(__MINGW64__) + _mm256_loadu_si256 + #else + _mm256_load_si256 + #endif + + (&in[i * 4 + 2]), + + #if defined(__MINGW32__) || defined(__MINGW64__) + _mm256_loadu_si256 + #else + _mm256_load_si256 + #endif + + (&in[i * 4 + 3])), kWeightScaleBits); + + #if defined(__MINGW32__) || defined(__MINGW64__) + _mm256_storeu_si256 + #else + _mm256_store_si256 + #endif + + (&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8( _mm256_packs_epi16(words0, words1), kZero), kOffsets)); } constexpr IndexType kStart = kNumChunks * kSimdWidth; diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index f899d761..1cfebbe4 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -110,12 +110,36 @@ namespace Eval::NNUE { auto out = reinterpret_cast<__m256i*>(&output[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { __m256i sum0 = - _mm256_load_si256(&reinterpret_cast( + + #if defined(__MINGW32__) || defined(__MINGW64__) + // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary + // compiled with g++ in MSYS2 crashes here because the output memory is not aligned + // even though alignas is specified. + _mm256_loadu_si256 + #else + _mm256_load_si256 + #endif + + (&reinterpret_cast( accumulation[perspectives[p]][0])[j * 2 + 0]); __m256i sum1 = - _mm256_load_si256(&reinterpret_cast( + + #if defined(__MINGW32__) || defined(__MINGW64__) + _mm256_loadu_si256 + #else + _mm256_load_si256 + #endif + + (&reinterpret_cast( accumulation[perspectives[p]][0])[j * 2 + 1]); - _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8( + + #if defined(__MINGW32__) || defined(__MINGW64__) + _mm256_storeu_si256 + #else + _mm256_store_si256 + #endif + + (&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8( _mm256_packs_epi16(sum0, sum1), kZero), kControl)); } @@ -178,7 +202,11 @@ namespace Eval::NNUE { auto column = reinterpret_cast(&weights_[offset]); constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); for (IndexType j = 0; j < kNumChunks; ++j) { + #if defined(__MINGW32__) || defined(__MINGW64__) + _mm256_storeu_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadu_si256(&accumulation[j]), column[j])); + #else accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]); + #endif } #elif defined(USE_SSE2) From bcdf41dadc8a5f8a23116236a0f449a08b46dc6b Mon Sep 17 00:00:00 2001 From: Sergio Vieri Date: Mon, 10 Aug 2020 08:47:52 +0800 Subject: [PATCH 02/58] Update default net to nn-112bb1c8cdb5.nnue First trained net using search eval instead of pv leaf static eval. Net created at: 20200810-0744 passed STC: https://tests.stockfishchess.org/tests/view/5f30995d90816720665373f8 LLR: 2.93 (-2.94,2.94) {-0.50,1.50} Total: 15416 W: 2071 L: 1920 D: 11425 Ptnml(0-2): 123, 1376, 4563, 1519, 127 passed LTC: https://tests.stockfishchess.org/tests/view/5f30a104908167206653742b LLR: 2.93 (-2.94,2.94) {0.25,1.75} Total: 29792 W: 2003 L: 1834 D: 25955 Ptnml(0-2): 50, 1541, 11550, 1700, 55 closes https://github.com/official-stockfish/Stockfish/pull/2966 Bench: 4084753 --- src/ucioption.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ucioption.cpp b/src/ucioption.cpp index faeb78ae..b0689d6d 100644 --- a/src/ucioption.cpp +++ b/src/ucioption.cpp @@ -79,7 +79,7 @@ void init(OptionsMap& o) { o["Syzygy50MoveRule"] << Option(true); o["SyzygyProbeLimit"] << Option(7, 0, 7); o["Use NNUE"] << Option(false, on_use_NNUE); - o["EvalFile"] << Option("nn-9931db908a9b.nnue", on_eval_file); + o["EvalFile"] << Option("nn-112bb1c8cdb5.nnue", on_eval_file); } From a54f9011c3bf3581fe7daffef6be2d586e6662c1 Mon Sep 17 00:00:00 2001 From: jjoshua2 Date: Sun, 9 Aug 2020 16:16:04 -0400 Subject: [PATCH 03/58] simplying hybrid condition STC https://tests.stockfishchess.org/tests/view/5f3059d1908167206653736b: LLR: 2.94 (-2.94,2.94) {-1.50,0.50} Total: 12520 W: 766 L: 727 D: 11027 Ptnml(0-2): 13, 624, 4949, 659, 15 LTC: https://tests.stockfishchess.org/tests/view/5f30863a90816720665373d1 LLR: 2.94 (-2.94,2.94) {-1.50,0.50} Total: 12520 W: 766 L: 727 D: 11027 Ptnml(0-2): 13, 624, 4949, 659, 15 closes: https://github.com/official-stockfish/Stockfish/pull/2965 Bench: 4084753 --- src/evaluate.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/evaluate.cpp b/src/evaluate.cpp index ce35c630..caab2979 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -114,7 +114,7 @@ namespace { constexpr Value LazyThreshold1 = Value(1400); constexpr Value LazyThreshold2 = Value(1300); constexpr Value SpaceThreshold = Value(12222); - constexpr Value NNUEThreshold = Value(460); + constexpr Value NNUEThreshold = Value(575); // KingAttackWeights[PieceType] contains king attack weights by piece type constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 81, 52, 44, 10 }; @@ -945,7 +945,7 @@ Value Eval::evaluate(const Position& pos) { { Value v = eg_value(pos.psq_score()); // Take NNUE eval only on balanced positions - if (abs(v) < NNUEThreshold + 20 * pos.count()) + if (abs(v) < NNUEThreshold) return NNUE::evaluate(pos) + Tempo; } return Evaluation(pos).value(); From 875183b310a8249922c2155e82cb4cecfae2097e Mon Sep 17 00:00:00 2001 From: mstembera Date: Sun, 9 Aug 2020 23:50:59 -0700 Subject: [PATCH 04/58] Workaround using unaligned loads for gcc < 9 despite usage of alignas, the generated (avx2/avx512) code with older compilers needs to use unaligned loads with older gcc (e.g. confirmed crash with gcc 7.3/mingw on abrok). Better performance thus requires gcc >= 9 on hardware supporting avx2/avx512 closes https://github.com/official-stockfish/Stockfish/pull/2969 No functional change --- src/nnue/layers/affine_transform.h | 32 +++---------------- src/nnue/layers/clipped_relu.h | 48 +++-------------------------- src/nnue/nnue_common.h | 21 +++++++++++++ src/nnue/nnue_feature_transformer.h | 42 ++++--------------------- 4 files changed, 36 insertions(+), 107 deletions(-) diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index b585bc87..20ec2f12 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -104,13 +104,7 @@ namespace Eval::NNUE::Layers { __m512i sum = _mm512_setzero_si512(); const auto row = reinterpret_cast(&weights_[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { - - #if defined(__MINGW32__) || defined(__MINGW64__) - __m512i product = _mm512_maddubs_epi16(_mm512_loadu_si512(&input_vector[j]), _mm512_load_si512(&row[j])); - #else - __m512i product = _mm512_maddubs_epi16(_mm512_load_si512(&input_vector[j]), _mm512_load_si512(&row[j])); - #endif - + __m512i product = _mm512_maddubs_epi16(_mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j])); product = _mm512_madd_epi16(product, kOnes); sum = _mm512_add_epi32(sum, product); } @@ -124,13 +118,7 @@ namespace Eval::NNUE::Layers { const auto iv_256 = reinterpret_cast(input); const auto row_256 = reinterpret_cast(&weights_[offset]); int j = kNumChunks * 2; - - #if defined(__MINGW32__) || defined(__MINGW64__) // See HACK comment below in AVX2. - __m256i sum256 = _mm256_maddubs_epi16(_mm256_loadu_si256(&iv_256[j]), _mm256_load_si256(&row_256[j])); - #else - __m256i sum256 = _mm256_maddubs_epi16(_mm256_load_si256(&iv_256[j]), _mm256_load_si256(&row_256[j])); - #endif - + __m256i sum256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv_256[j]), _mm256_load_si256(&row_256[j])); sum256 = _mm256_madd_epi16(sum256, _mm256_set1_epi16(1)); sum256 = _mm256_hadd_epi32(sum256, sum256); sum256 = _mm256_hadd_epi32(sum256, sum256); @@ -143,18 +131,7 @@ namespace Eval::NNUE::Layers { __m256i sum = _mm256_setzero_si256(); const auto row = reinterpret_cast(&weights_[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { - __m256i product = _mm256_maddubs_epi16( - - #if defined(__MINGW32__) || defined(__MINGW64__) - // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary - // compiled with g++ in MSYS2 crashes here because the output memory is not aligned - // even though alignas is specified. - _mm256_loadu_si256 - #else - _mm256_load_si256 - #endif - - (&input_vector[j]), _mm256_load_si256(&row[j])); + __m256i product = _mm256_maddubs_epi16(_mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j])); product = _mm256_madd_epi16(product, kOnes); sum = _mm256_add_epi32(sum, product); } @@ -168,8 +145,7 @@ namespace Eval::NNUE::Layers { __m128i sum = _mm_cvtsi32_si128(biases_[i]); const auto row = reinterpret_cast(&weights_[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { - __m128i product = _mm_maddubs_epi16( - _mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j])); + __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j])); product = _mm_madd_epi16(product, kOnes); sum = _mm_add_epi32(sum, product); } diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h index 7ade598f..13196ec2 100644 --- a/src/nnue/layers/clipped_relu.h +++ b/src/nnue/layers/clipped_relu.h @@ -74,50 +74,12 @@ namespace Eval::NNUE::Layers { const auto out = reinterpret_cast<__m256i*>(output); for (IndexType i = 0; i < kNumChunks; ++i) { const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32( - - #if defined(__MINGW32__) || defined(__MINGW64__) - // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary - // compiled with g++ in MSYS2 crashes here because the output memory is not aligned - // even though alignas is specified. - _mm256_loadu_si256 - #else - _mm256_load_si256 - #endif - - (&in[i * 4 + 0]), - - #if defined(__MINGW32__) || defined(__MINGW64__) - _mm256_loadu_si256 - #else - _mm256_load_si256 - #endif - - (&in[i * 4 + 1])), kWeightScaleBits); + _mm256_loadA_si256(&in[i * 4 + 0]), + _mm256_loadA_si256(&in[i * 4 + 1])), kWeightScaleBits); const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32( - - #if defined(__MINGW32__) || defined(__MINGW64__) - _mm256_loadu_si256 - #else - _mm256_load_si256 - #endif - - (&in[i * 4 + 2]), - - #if defined(__MINGW32__) || defined(__MINGW64__) - _mm256_loadu_si256 - #else - _mm256_load_si256 - #endif - - (&in[i * 4 + 3])), kWeightScaleBits); - - #if defined(__MINGW32__) || defined(__MINGW64__) - _mm256_storeu_si256 - #else - _mm256_store_si256 - #endif - - (&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8( + _mm256_loadA_si256(&in[i * 4 + 2]), + _mm256_loadA_si256(&in[i * 4 + 3])), kWeightScaleBits); + _mm256_storeA_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8( _mm256_packs_epi16(words0, words1), kZero), kOffsets)); } constexpr IndexType kStart = kNumChunks * kSimdWidth; diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h index 972ef3e5..e7ce84f7 100644 --- a/src/nnue/nnue_common.h +++ b/src/nnue/nnue_common.h @@ -37,6 +37,27 @@ #include #endif +// HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Otherwise a binary +// compiled with older g++ crashes because the output memory is not aligned +// even though alignas is specified. +#if defined(USE_AVX2) +#if defined(__GNUC__ ) && (__GNUC__ < 9) +#define _mm256_loadA_si256 _mm256_loadu_si256 +#define _mm256_storeA_si256 _mm256_storeu_si256 +#else +#define _mm256_loadA_si256 _mm256_load_si256 +#define _mm256_storeA_si256 _mm256_store_si256 +#endif +#endif + +#if defined(USE_AVX512) +#if defined(__GNUC__ ) && (__GNUC__ < 9) +#define _mm512_loadA_si512 _mm512_loadu_si512 +#else +#define _mm512_loadA_si512 _mm512_load_si512 +#endif +#endif + namespace Eval::NNUE { // Version of the evaluation file diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 1cfebbe4..cbcc26f3 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -109,37 +109,11 @@ namespace Eval::NNUE { #if defined(USE_AVX2) auto out = reinterpret_cast<__m256i*>(&output[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { - __m256i sum0 = - - #if defined(__MINGW32__) || defined(__MINGW64__) - // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary - // compiled with g++ in MSYS2 crashes here because the output memory is not aligned - // even though alignas is specified. - _mm256_loadu_si256 - #else - _mm256_load_si256 - #endif - - (&reinterpret_cast( - accumulation[perspectives[p]][0])[j * 2 + 0]); - __m256i sum1 = - - #if defined(__MINGW32__) || defined(__MINGW64__) - _mm256_loadu_si256 - #else - _mm256_load_si256 - #endif - - (&reinterpret_cast( - accumulation[perspectives[p]][0])[j * 2 + 1]); - - #if defined(__MINGW32__) || defined(__MINGW64__) - _mm256_storeu_si256 - #else - _mm256_store_si256 - #endif - - (&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8( + __m256i sum0 = _mm256_loadA_si256( + &reinterpret_cast(accumulation[perspectives[p]][0])[j * 2 + 0]); + __m256i sum1 = _mm256_loadA_si256( + &reinterpret_cast(accumulation[perspectives[p]][0])[j * 2 + 1]); + _mm256_storeA_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8( _mm256_packs_epi16(sum0, sum1), kZero), kControl)); } @@ -202,11 +176,7 @@ namespace Eval::NNUE { auto column = reinterpret_cast(&weights_[offset]); constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); for (IndexType j = 0; j < kNumChunks; ++j) { - #if defined(__MINGW32__) || defined(__MINGW64__) - _mm256_storeu_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadu_si256(&accumulation[j]), column[j])); - #else - accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]); - #endif + _mm256_storeA_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadA_si256(&accumulation[j]), column[j])); } #elif defined(USE_SSE2) From ad2ad4c65706c18a5383506d361f1f23fc6a26ab Mon Sep 17 00:00:00 2001 From: SFisGOD Date: Mon, 10 Aug 2020 15:39:22 +0800 Subject: [PATCH 05/58] Modify castling extension Extend castling only if there are few friendly pieces on the castling side. Inspired by silversolver1's (Rahul Dsilva) test https://tests.stockfishchess.org/tests/view/5f0fef560640035f9d2978cf STC: LLR: 2.94 (-2.94,2.94) {-0.50,1.50} Total: 7096 W: 947 L: 818 D: 5331 Ptnml(0-2): 32, 604, 2181, 665, 66 https://tests.stockfishchess.org/tests/view/5f309f729081672066537426 LTC: LLR: 2.96 (-2.94,2.94) {0.25,1.75} Total: 4712 W: 300 L: 215 D: 4197 Ptnml(0-2): 2, 190, 1895, 259, 10 https://tests.stockfishchess.org/tests/view/5f30a2039081672066537430 closes https://github.com/official-stockfish/Stockfish/pull/2970 Bench: 4094850 --- src/search.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/search.cpp b/src/search.cpp index 0a2519b6..3d2bb422 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -1131,7 +1131,8 @@ moves_loop: // When in check, search starts from here extension = 1; // Castling extension - if (type_of(move) == CASTLING) + if ( type_of(move) == CASTLING + && popcount(pos.pieces(us) & ~pos.pieces(PAWN) & (to_sq(move) & KingSide ? KingSide : QueenSide)) <= 3) extension = 1; // Late irreversible move extension From cb0504028e8830dbc71be53cbd701d78c3d562a1 Mon Sep 17 00:00:00 2001 From: sf-x Date: Sun, 9 Aug 2020 18:01:18 +0300 Subject: [PATCH 06/58] Makefile rework/cleanup Makefile targets x86-64-sse42, x86-sse3 are removed; x86-64-sse41 is renamed to x86-64-sse41-popcnt (it did enable popcnt). Makefile variables sse3, sse42, their associated compilation flags and code in misc.cpp are removed. closes https://github.com/official-stockfish/Stockfish/pull/2922 No functional change --- src/Makefile | 58 +++------------------------------------------------- src/misc.cpp | 6 ------ 2 files changed, 3 insertions(+), 61 deletions(-) diff --git a/src/Makefile b/src/Makefile index 571172b2..a48e7dcb 100644 --- a/src/Makefile +++ b/src/Makefile @@ -68,10 +68,8 @@ endif # prefetch = yes/no --- -DUSE_PREFETCH --- Use prefetch asm-instruction # popcnt = yes/no --- -DUSE_POPCNT --- Use popcnt asm-instruction # sse = yes/no --- -msse --- Use Intel Streaming SIMD Extensions -# sse3 = yes/no --- -msse3 --- Use Intel Streaming SIMD Extensions 3 # ssse3 = yes/no --- -mssse3 --- Use Intel Supplemental Streaming SIMD Extensions 3 # sse41 = yes/no --- -msse4.1 --- Use Intel Streaming SIMD Extensions 4.1 -# sse42 = yes/no --- -msse4.2 --- Use Intel Streaming SIMD Extensions 4.2 # avx2 = yes/no --- -mavx2 --- Use Intel Advanced Vector Extensions 2 # pext = yes/no --- -DUSE_PEXT --- Use pext x86_64 asm-instruction # avx512 = yes/no --- -mavx512bw --- Use Intel Advanced Vector Extensions 512 @@ -89,10 +87,8 @@ bits = 64 prefetch = no popcnt = no sse = no -sse3 = no ssse3 = no sse41 = no -sse42 = no avx2 = no pext = no avx512 = no @@ -127,18 +123,10 @@ ifeq ($(ARCH),x86-64) sse = yes endif -ifeq ($(ARCH),x86-64-sse3) - arch = x86_64 - prefetch = yes - sse = yes - sse3 = yes -endif - ifeq ($(ARCH),x86-64-sse3-popcnt) arch = x86_64 prefetch = yes sse = yes - sse3 = yes popcnt = yes endif @@ -146,39 +134,25 @@ ifeq ($(ARCH),x86-64-ssse3) arch = x86_64 prefetch = yes sse = yes - sse3 = yes ssse3 = yes endif -ifeq ($(ARCH),x86-64-sse41) - arch = x86_64 - prefetch = yes - popcnt = yes - sse = yes - sse3 = yes - ssse3 = yes - sse41 = yes -endif - ifeq ($(ARCH),x86-64-modern) arch = x86_64 prefetch = yes popcnt = yes sse = yes - sse3 = yes ssse3 = yes sse41 = yes endif -ifeq ($(ARCH),x86-64-sse42) +ifeq ($(ARCH),x86-64-sse41-popcnt) arch = x86_64 prefetch = yes popcnt = yes sse = yes - sse3 = yes ssse3 = yes sse41 = yes - sse42 = yes endif ifeq ($(ARCH),x86-64-avx2) @@ -186,10 +160,8 @@ ifeq ($(ARCH),x86-64-avx2) prefetch = yes popcnt = yes sse = yes - sse3 = yes ssse3 = yes sse41 = yes - sse42 = yes avx2 = yes endif @@ -198,10 +170,8 @@ ifeq ($(ARCH),x86-64-bmi2) prefetch = yes popcnt = yes sse = yes - sse3 = yes ssse3 = yes sse41 = yes - sse42 = yes avx2 = yes pext = yes endif @@ -211,10 +181,8 @@ ifeq ($(ARCH),x86-64-avx512) prefetch = yes popcnt = yes sse = yes - sse3 = yes ssse3 = yes sse41 = yes - sse42 = yes avx2 = yes pext = yes avx512 = yes @@ -450,13 +418,6 @@ ifeq ($(avx512),yes) endif endif -ifeq ($(sse42),yes) - CXXFLAGS += -DUSE_SSE42 - ifeq ($(comp),$(filter $(comp),gcc clang mingw)) - CXXFLAGS += -msse4.2 - endif -endif - ifeq ($(sse41),yes) CXXFLAGS += -DUSE_SSE41 ifeq ($(comp),$(filter $(comp),gcc clang mingw)) @@ -471,13 +432,6 @@ ifeq ($(ssse3),yes) endif endif -ifeq ($(sse3),yes) - CXXFLAGS += -DUSE_SSE3 - ifeq ($(comp),$(filter $(comp),gcc clang mingw)) - CXXFLAGS += -msse3 - endif -endif - ifeq ($(neon),yes) CXXFLAGS += -DUSE_NEON endif @@ -557,12 +511,10 @@ help: @echo "x86-64-avx512 > x86 64-bit with avx512 support" @echo "x86-64-bmi2 > x86 64-bit with bmi2 support" @echo "x86-64-avx2 > x86 64-bit with avx2 support" - @echo "x86-64-sse42 > x86 64-bit with sse42 support" - @echo "x86-64-modern > x86 64-bit with sse41 support (x86-64-sse41)" - @echo "x86-64-sse41 > x86 64-bit with sse41 support" + @echo "x86-64-sse41-popcnt > x86 64-bit with sse41 and popcnt support" + @echo "x86-64-modern > the same as previous (x86-64-sse41-popcnt)" @echo "x86-64-ssse3 > x86 64-bit with ssse3 support" @echo "x86-64-sse3-popcnt > x86 64-bit with sse3 and popcnt support" - @echo "x86-64-sse3 > x86 64-bit with sse3 support" @echo "x86-64 > x86 64-bit generic" @echo "x86-32 > x86 32-bit (also enables SSE)" @echo "x86-32-old > x86 32-bit fall back for old hardware" @@ -669,10 +621,8 @@ config-sanity: @echo "prefetch: '$(prefetch)'" @echo "popcnt: '$(popcnt)'" @echo "sse: '$(sse)'" - @echo "sse3: '$(sse3)'" @echo "ssse3: '$(ssse3)'" @echo "sse41: '$(sse41)'" - @echo "sse42: '$(sse42)'" @echo "avx2: '$(avx2)'" @echo "pext: '$(pext)'" @echo "avx512: '$(avx512)'" @@ -695,10 +645,8 @@ config-sanity: @test "$(prefetch)" = "yes" || test "$(prefetch)" = "no" @test "$(popcnt)" = "yes" || test "$(popcnt)" = "no" @test "$(sse)" = "yes" || test "$(sse)" = "no" - @test "$(sse3)" = "yes" || test "$(sse3)" = "no" @test "$(ssse3)" = "yes" || test "$(ssse3)" = "no" @test "$(sse41)" = "yes" || test "$(sse41)" = "no" - @test "$(sse42)" = "yes" || test "$(sse42)" = "no" @test "$(avx2)" = "yes" || test "$(avx2)" = "no" @test "$(pext)" = "yes" || test "$(pext)" = "no" @test "$(avx512)" = "yes" || test "$(avx512)" = "no" diff --git a/src/misc.cpp b/src/misc.cpp index bdd7bccb..5061ae13 100644 --- a/src/misc.cpp +++ b/src/misc.cpp @@ -220,17 +220,11 @@ const std::string compiler_info() { #if defined(USE_AVX2) compiler += " AVX2"; #endif - #if defined(USE_SSE42) - compiler += " SSE42"; - #endif #if defined(USE_SSE41) compiler += " SSE41"; #endif #if defined(USE_SSSE3) compiler += " SSSE3"; - #endif - #if defined(USE_SSE3) - compiler += " SSE3"; #endif compiler += (HasPext ? " BMI2" : ""); compiler += (HasPopCnt ? " POPCNT" : ""); From f948cd008d3a289ebbadc463271f84888e8069ba Mon Sep 17 00:00:00 2001 From: mstembera Date: Sun, 9 Aug 2020 16:23:33 -0700 Subject: [PATCH 07/58] Cleanup and optimize SSE/AVX code AVX512 +4% faster AVX2 +1% faster SSSE3 +5% faster passed non-regression STC: STC https://tests.stockfishchess.org/tests/view/5f31249f90816720665374f6 LLR: 2.96 (-2.94,2.94) {-1.50,0.50} Total: 17576 W: 2344 L: 2245 D: 12987 Ptnml(0-2): 127, 1570, 5292, 1675, 124 closes https://github.com/official-stockfish/Stockfish/pull/2962 No functional change --- src/nnue/layers/affine_transform.h | 46 +++++++++++++++-------------- src/nnue/nnue_accumulator.h | 2 +- src/nnue/nnue_common.h | 6 ++-- src/nnue/nnue_feature_transformer.h | 21 +++++++------ 4 files changed, 41 insertions(+), 34 deletions(-) diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index 20ec2f12..89cfaad7 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -108,24 +108,19 @@ namespace Eval::NNUE::Layers { product = _mm512_madd_epi16(product, kOnes); sum = _mm512_add_epi32(sum, product); } - output[i] = _mm512_reduce_add_epi32(sum) + biases_[i]; // Note: Changing kMaxSimdWidth from 32 to 64 breaks loading existing networks. // As a result kPaddedInputDimensions may not be an even multiple of 64(512bit) // and we have to do one more 256bit chunk. if (kPaddedInputDimensions != kNumChunks * kSimdWidth * 2) { - const auto iv_256 = reinterpret_cast(input); - const auto row_256 = reinterpret_cast(&weights_[offset]); - int j = kNumChunks * 2; - __m256i sum256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv_256[j]), _mm256_load_si256(&row_256[j])); - sum256 = _mm256_madd_epi16(sum256, _mm256_set1_epi16(1)); - sum256 = _mm256_hadd_epi32(sum256, sum256); - sum256 = _mm256_hadd_epi32(sum256, sum256); - const __m128i lo = _mm256_extracti128_si256(sum256, 0); - const __m128i hi = _mm256_extracti128_si256(sum256, 1); - output[i] += _mm_cvtsi128_si32(lo) + _mm_cvtsi128_si32(hi); + const auto iv256 = reinterpret_cast(&input_vector[kNumChunks]); + const auto row256 = reinterpret_cast(&row[kNumChunks]); + __m256i product256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0])); + product256 = _mm256_madd_epi16(product256, _mm256_set1_epi16(1)); + sum = _mm512_add_epi32(sum, _mm512_zextsi256_si512(product256)); } + output[i] = _mm512_reduce_add_epi32(sum) + biases_[i]; #elif defined(USE_AVX2) __m256i sum = _mm256_setzero_si256(); @@ -135,23 +130,30 @@ namespace Eval::NNUE::Layers { product = _mm256_madd_epi16(product, kOnes); sum = _mm256_add_epi32(sum, product); } - sum = _mm256_hadd_epi32(sum, sum); - sum = _mm256_hadd_epi32(sum, sum); - const __m128i lo = _mm256_extracti128_si256(sum, 0); - const __m128i hi = _mm256_extracti128_si256(sum, 1); - output[i] = _mm_cvtsi128_si32(lo) + _mm_cvtsi128_si32(hi) + biases_[i]; + __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1)); + sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC)); + sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB)); + output[i] = _mm_cvtsi128_si32(sum128) + biases_[i]; #elif defined(USE_SSSE3) - __m128i sum = _mm_cvtsi32_si128(biases_[i]); + __m128i sum = _mm_setzero_si128(); const auto row = reinterpret_cast(&weights_[offset]); - for (IndexType j = 0; j < kNumChunks; ++j) { - __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j])); + for (int j = 0; j < (int)kNumChunks - 1; j += 2) { + __m128i product0 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j])); + product0 = _mm_madd_epi16(product0, kOnes); + sum = _mm_add_epi32(sum, product0); + __m128i product1 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j+1]), _mm_load_si128(&row[j+1])); + product1 = _mm_madd_epi16(product1, kOnes); + sum = _mm_add_epi32(sum, product1); + } + if (kNumChunks & 0x1) { + __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[kNumChunks-1]), _mm_load_si128(&row[kNumChunks-1])); product = _mm_madd_epi16(product, kOnes); sum = _mm_add_epi32(sum, product); } - sum = _mm_hadd_epi32(sum, sum); - sum = _mm_hadd_epi32(sum, sum); - output[i] = _mm_cvtsi128_si32(sum); + sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC + sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB + output[i] = _mm_cvtsi128_si32(sum) + biases_[i]; #elif defined(USE_NEON) int32x4_t sum = {biases_[i]}; diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h index 2a354a3c..69dfaad2 100644 --- a/src/nnue/nnue_accumulator.h +++ b/src/nnue/nnue_accumulator.h @@ -26,7 +26,7 @@ namespace Eval::NNUE { // Class that holds the result of affine transformation of input features - struct alignas(32) Accumulator { + struct alignas(kCacheLineSize) Accumulator { std::int16_t accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions]; Value score; diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h index e7ce84f7..ff33cc79 100644 --- a/src/nnue/nnue_common.h +++ b/src/nnue/nnue_common.h @@ -52,9 +52,11 @@ #if defined(USE_AVX512) #if defined(__GNUC__ ) && (__GNUC__ < 9) -#define _mm512_loadA_si512 _mm512_loadu_si512 +#define _mm512_loadA_si512 _mm512_loadu_si512 +#define _mm512_storeA_si512 _mm512_storeu_si512 #else -#define _mm512_loadA_si512 _mm512_load_si512 +#define _mm512_loadA_si512 _mm512_load_si512 +#define _mm512_storeA_si512 _mm512_store_si512 #endif #endif diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index cbcc26f3..3818e444 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -169,38 +169,41 @@ namespace Eval::NNUE { kHalfDimensions * sizeof(BiasType)); for (const auto index : active_indices[perspective]) { const IndexType offset = kHalfDimensions * index; + #if defined(USE_AVX512) + auto accumulation = reinterpret_cast<__m512i*>( + &accumulator.accumulation[perspective][i][0]); + auto column = reinterpret_cast(&weights_[offset]); + constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth; + for (IndexType j = 0; j < kNumChunks; ++j) + _mm512_storeA_si512(&accumulation[j], _mm512_add_epi16(_mm512_loadA_si512(&accumulation[j]), column[j])); - #if defined(USE_AVX2) + #elif defined(USE_AVX2) auto accumulation = reinterpret_cast<__m256i*>( &accumulator.accumulation[perspective][i][0]); auto column = reinterpret_cast(&weights_[offset]); constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); - for (IndexType j = 0; j < kNumChunks; ++j) { + for (IndexType j = 0; j < kNumChunks; ++j) _mm256_storeA_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadA_si256(&accumulation[j]), column[j])); - } #elif defined(USE_SSE2) auto accumulation = reinterpret_cast<__m128i*>( &accumulator.accumulation[perspective][i][0]); auto column = reinterpret_cast(&weights_[offset]); constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); - for (IndexType j = 0; j < kNumChunks; ++j) { + for (IndexType j = 0; j < kNumChunks; ++j) accumulation[j] = _mm_add_epi16(accumulation[j], column[j]); - } #elif defined(USE_NEON) auto accumulation = reinterpret_cast( &accumulator.accumulation[perspective][i][0]); auto column = reinterpret_cast(&weights_[offset]); constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); - for (IndexType j = 0; j < kNumChunks; ++j) { + for (IndexType j = 0; j < kNumChunks; ++j) accumulation[j] = vaddq_s16(accumulation[j], column[j]); - } #else - for (IndexType j = 0; j < kHalfDimensions; ++j) { + for (IndexType j = 0; j < kHalfDimensions; ++j) accumulator.accumulation[perspective][i][j] += weights_[offset + j]; - } #endif } From 21df37d7fd4dcc9b4a9c319382cc43685c0259c8 Mon Sep 17 00:00:00 2001 From: Fanael Linithien Date: Sun, 9 Aug 2020 16:20:45 +0200 Subject: [PATCH 08/58] Provide vectorized NNUE code for SSE2 and MMX targets This patch allows old x86 CPUs, from AMD K8 (which the x86-64 baseline targets) all the way down to the Pentium MMX, to benefit from NNUE with comparable performance hit versus hand-written eval as on more modern processors. NPS of the bench with NNUE enabled on a Pentium III 1.13 GHz (using the MMX code): master: 38951 this patch: 80586 NPS of the bench with NNUE enabled using baseline x86-64 arch, which is how linux distros are likely to package stockfish, on a modern CPU (using the SSE2 code): master: 882584 this patch: 1203945 closes https://github.com/official-stockfish/Stockfish/pull/2956 No functional change. --- AUTHORS | 1 + src/Makefile | 13 ++++++- src/misc.cpp | 3 ++ src/nnue/layers/affine_transform.h | 59 ++++++++++++++++++++++++++++- src/nnue/layers/clipped_relu.h | 20 +++++++++- src/nnue/nnue_common.h | 6 +++ src/nnue/nnue_feature_transformer.h | 54 +++++++++++++++++++++++++- 7 files changed, 150 insertions(+), 6 deletions(-) diff --git a/AUTHORS b/AUTHORS index 21ef3e50..41b89705 100644 --- a/AUTHORS +++ b/AUTHORS @@ -53,6 +53,7 @@ Ernesto Gatti Linmiao Xu (linrock) Fabian Beuke (madnight) Fabian Fichter (ianfab) +Fanael Linithien (Fanael) fanon Fauzi Akram Dabat (FauziAkram) Felix Wittmann diff --git a/src/Makefile b/src/Makefile index a48e7dcb..3d84f482 100644 --- a/src/Makefile +++ b/src/Makefile @@ -86,6 +86,7 @@ sanitize = no bits = 64 prefetch = no popcnt = no +mmx = no sse = no ssse3 = no sse41 = no @@ -110,6 +111,7 @@ ifeq ($(ARCH),x86-32) arch = i386 bits = 32 prefetch = yes + mmx = yes sse = yes endif @@ -250,7 +252,7 @@ ifeq ($(COMP),gcc) ifneq ($(KERNEL),Darwin) LDFLAGS += -Wl,--no-as-needed endif - + gccversion = $(shell $(CXX) --version) gccisclang = $(findstring clang,$(gccversion)) endif @@ -432,6 +434,13 @@ ifeq ($(ssse3),yes) endif endif +ifeq ($(mmx),yes) + CXXFLAGS += -DUSE_MMX + ifeq ($(comp),$(filter $(comp),gcc clang mingw)) + CXXFLAGS += -mmmx + endif +endif + ifeq ($(neon),yes) CXXFLAGS += -DUSE_NEON endif @@ -516,7 +525,7 @@ help: @echo "x86-64-ssse3 > x86 64-bit with ssse3 support" @echo "x86-64-sse3-popcnt > x86 64-bit with sse3 and popcnt support" @echo "x86-64 > x86 64-bit generic" - @echo "x86-32 > x86 32-bit (also enables SSE)" + @echo "x86-32 > x86 32-bit (also enables MMX and SSE)" @echo "x86-32-old > x86 32-bit fall back for old hardware" @echo "ppc-64 > PPC 64-bit" @echo "ppc-32 > PPC 32-bit" diff --git a/src/misc.cpp b/src/misc.cpp index 5061ae13..401a6505 100644 --- a/src/misc.cpp +++ b/src/misc.cpp @@ -228,6 +228,9 @@ const std::string compiler_info() { #endif compiler += (HasPext ? " BMI2" : ""); compiler += (HasPopCnt ? " POPCNT" : ""); + #if defined(USE_MMX) + compiler += " MMX"; + #endif #if !defined(NDEBUG) compiler += " DEBUG"; #endif diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index 89cfaad7..985ee71a 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -87,11 +87,20 @@ namespace Eval::NNUE::Layers { const __m256i kOnes = _mm256_set1_epi16(1); const auto input_vector = reinterpret_cast(input); - #elif defined(USE_SSSE3) + #elif defined(USE_SSE2) constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; + #ifndef USE_SSSE3 + const __m128i kZeros = _mm_setzero_si128(); + #else const __m128i kOnes = _mm_set1_epi16(1); + #endif const auto input_vector = reinterpret_cast(input); + #elif defined(USE_MMX) + constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; + const __m64 kZeros = _mm_setzero_si64(); + const auto input_vector = reinterpret_cast(input); + #elif defined(USE_NEON) constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; const auto input_vector = reinterpret_cast(input); @@ -155,6 +164,51 @@ namespace Eval::NNUE::Layers { sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB output[i] = _mm_cvtsi128_si32(sum) + biases_[i]; + #elif defined(USE_SSE2) + __m128i sum_lo = _mm_cvtsi32_si128(biases_[i]); + __m128i sum_hi = kZeros; + const auto row = reinterpret_cast(&weights_[offset]); + for (IndexType j = 0; j < kNumChunks; ++j) { + __m128i row_j = _mm_load_si128(&row[j]); + __m128i input_j = _mm_load_si128(&input_vector[j]); + __m128i row_signs = _mm_cmpgt_epi8(kZeros, row_j); + __m128i extended_row_lo = _mm_unpacklo_epi8(row_j, row_signs); + __m128i extended_row_hi = _mm_unpackhi_epi8(row_j, row_signs); + __m128i extended_input_lo = _mm_unpacklo_epi8(input_j, kZeros); + __m128i extended_input_hi = _mm_unpackhi_epi8(input_j, kZeros); + __m128i product_lo = _mm_madd_epi16(extended_row_lo, extended_input_lo); + __m128i product_hi = _mm_madd_epi16(extended_row_hi, extended_input_hi); + sum_lo = _mm_add_epi32(sum_lo, product_lo); + sum_hi = _mm_add_epi32(sum_hi, product_hi); + } + __m128i sum = _mm_add_epi32(sum_lo, sum_hi); + __m128i sum_high_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2)); + sum = _mm_add_epi32(sum, sum_high_64); + __m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2)); + sum = _mm_add_epi32(sum, sum_second_32); + output[i] = _mm_cvtsi128_si32(sum); + + #elif defined(USE_MMX) + __m64 sum_lo = _mm_cvtsi32_si64(biases_[i]); + __m64 sum_hi = kZeros; + const auto row = reinterpret_cast(&weights_[offset]); + for (IndexType j = 0; j < kNumChunks; ++j) { + __m64 row_j = row[j]; + __m64 input_j = input_vector[j]; + __m64 row_signs = _mm_cmpgt_pi8(kZeros, row_j); + __m64 extended_row_lo = _mm_unpacklo_pi8(row_j, row_signs); + __m64 extended_row_hi = _mm_unpackhi_pi8(row_j, row_signs); + __m64 extended_input_lo = _mm_unpacklo_pi8(input_j, kZeros); + __m64 extended_input_hi = _mm_unpackhi_pi8(input_j, kZeros); + __m64 product_lo = _mm_madd_pi16(extended_row_lo, extended_input_lo); + __m64 product_hi = _mm_madd_pi16(extended_row_hi, extended_input_hi); + sum_lo = _mm_add_pi32(sum_lo, product_lo); + sum_hi = _mm_add_pi32(sum_hi, product_hi); + } + __m64 sum = _mm_add_pi32(sum_lo, sum_hi); + sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum)); + output[i] = _mm_cvtsi64_si32(sum); + #elif defined(USE_NEON) int32x4_t sum = {biases_[i]}; const auto row = reinterpret_cast(&weights_[offset]); @@ -174,6 +228,9 @@ namespace Eval::NNUE::Layers { #endif } + #if defined(USE_MMX) + _mm_empty(); + #endif return output; } diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h index 13196ec2..44d8a7de 100644 --- a/src/nnue/layers/clipped_relu.h +++ b/src/nnue/layers/clipped_relu.h @@ -84,7 +84,7 @@ namespace Eval::NNUE::Layers { } constexpr IndexType kStart = kNumChunks * kSimdWidth; - #elif defined(USE_SSSE3) + #elif defined(USE_SSE2) constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth; #ifdef USE_SSE41 @@ -115,6 +115,24 @@ namespace Eval::NNUE::Layers { } constexpr IndexType kStart = kNumChunks * kSimdWidth; + #elif defined(USE_MMX) + constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth; + const __m64 k0x80s = _mm_set1_pi8(-128); + const auto in = reinterpret_cast(input); + const auto out = reinterpret_cast<__m64*>(output); + for (IndexType i = 0; i < kNumChunks; ++i) { + const __m64 words0 = _mm_srai_pi16( + _mm_packs_pi32(in[i * 4 + 0], in[i * 4 + 1]), + kWeightScaleBits); + const __m64 words1 = _mm_srai_pi16( + _mm_packs_pi32(in[i * 4 + 2], in[i * 4 + 3]), + kWeightScaleBits); + const __m64 packedbytes = _mm_packs_pi16(words0, words1); + out[i] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s); + } + _mm_empty(); + constexpr IndexType kStart = kNumChunks * kSimdWidth; + #elif defined(USE_NEON) constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2); const int8x8_t kZero = {0}; diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h index ff33cc79..cb1251c5 100644 --- a/src/nnue/nnue_common.h +++ b/src/nnue/nnue_common.h @@ -33,6 +33,9 @@ #elif defined(USE_SSE2) #include +#elif defined(USE_MMX) +#include + #elif defined(USE_NEON) #include #endif @@ -79,6 +82,9 @@ namespace Eval::NNUE { #elif defined(USE_SSE2) constexpr std::size_t kSimdWidth = 16; + #elif defined(USE_MMX) + constexpr std::size_t kSimdWidth = 8; + #elif defined(USE_NEON) constexpr std::size_t kSimdWidth = 16; #endif diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 3818e444..40f2603d 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -88,7 +88,7 @@ namespace Eval::NNUE { constexpr int kControl = 0b11011000; const __m256i kZero = _mm256_setzero_si256(); - #elif defined(USE_SSSE3) + #elif defined(USE_SSE2) constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth; #ifdef USE_SSE41 @@ -97,6 +97,10 @@ namespace Eval::NNUE { const __m128i k0x80s = _mm_set1_epi8(-128); #endif + #elif defined(USE_MMX) + constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth; + const __m64 k0x80s = _mm_set1_pi8(-128); + #elif defined(USE_NEON) constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); const int8x8_t kZero = {0}; @@ -117,7 +121,7 @@ namespace Eval::NNUE { _mm256_packs_epi16(sum0, sum1), kZero), kControl)); } - #elif defined(USE_SSSE3) + #elif defined(USE_SSE2) auto out = reinterpret_cast<__m128i*>(&output[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { __m128i sum0 = _mm_load_si128(&reinterpret_cast( @@ -137,6 +141,17 @@ namespace Eval::NNUE { ); } + #elif defined(USE_MMX) + auto out = reinterpret_cast<__m64*>(&output[offset]); + for (IndexType j = 0; j < kNumChunks; ++j) { + __m64 sum0 = *(&reinterpret_cast( + accumulation[perspectives[p]][0])[j * 2 + 0]); + __m64 sum1 = *(&reinterpret_cast( + accumulation[perspectives[p]][0])[j * 2 + 1]); + const __m64 packedbytes = _mm_packs_pi16(sum0, sum1); + out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s); + } + #elif defined(USE_NEON) const auto out = reinterpret_cast(&output[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { @@ -154,6 +169,9 @@ namespace Eval::NNUE { #endif } + #if defined(USE_MMX) + _mm_empty(); + #endif } private: @@ -193,6 +211,15 @@ namespace Eval::NNUE { for (IndexType j = 0; j < kNumChunks; ++j) accumulation[j] = _mm_add_epi16(accumulation[j], column[j]); + #elif defined(USE_MMX) + auto accumulation = reinterpret_cast<__m64*>( + &accumulator.accumulation[perspective][i][0]); + auto column = reinterpret_cast(&weights_[offset]); + constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); + for (IndexType j = 0; j < kNumChunks; ++j) { + accumulation[j] = _mm_add_pi16(accumulation[j], column[j]); + } + #elif defined(USE_NEON) auto accumulation = reinterpret_cast( &accumulator.accumulation[perspective][i][0]); @@ -208,6 +235,9 @@ namespace Eval::NNUE { } } + #if defined(USE_MMX) + _mm_empty(); + #endif accumulator.computed_accumulation = true; accumulator.computed_score = false; @@ -234,6 +264,11 @@ namespace Eval::NNUE { auto accumulation = reinterpret_cast<__m128i*>( &accumulator.accumulation[perspective][i][0]); + #elif defined(USE_MMX) + constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); + auto accumulation = reinterpret_cast<__m64*>( + &accumulator.accumulation[perspective][i][0]); + #elif defined(USE_NEON) constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); auto accumulation = reinterpret_cast( @@ -263,6 +298,12 @@ namespace Eval::NNUE { accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]); } + #elif defined(USE_MMX) + auto column = reinterpret_cast(&weights_[offset]); + for (IndexType j = 0; j < kNumChunks; ++j) { + accumulation[j] = _mm_sub_pi16(accumulation[j], column[j]); + } + #elif defined(USE_NEON) auto column = reinterpret_cast(&weights_[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { @@ -294,6 +335,12 @@ namespace Eval::NNUE { accumulation[j] = _mm_add_epi16(accumulation[j], column[j]); } + #elif defined(USE_MMX) + auto column = reinterpret_cast(&weights_[offset]); + for (IndexType j = 0; j < kNumChunks; ++j) { + accumulation[j] = _mm_add_pi16(accumulation[j], column[j]); + } + #elif defined(USE_NEON) auto column = reinterpret_cast(&weights_[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { @@ -310,6 +357,9 @@ namespace Eval::NNUE { } } } + #if defined(USE_MMX) + _mm_empty(); + #endif accumulator.computed_accumulation = true; accumulator.computed_score = false; From 220ef1d27d9cd006a30b07ab726999c8181d10f0 Mon Sep 17 00:00:00 2001 From: Unai Corzo Date: Mon, 10 Aug 2020 15:38:44 +0200 Subject: [PATCH 09/58] Assorted search parameter tune STC https://tests.stockfishchess.org/tests/view/5f31219090816720665374ec LLR: 2.96 (-2.94,2.94) {-0.50,1.50} Total: 3376 W: 487 L: 359 D: 2530 Ptnml(0-2): 17, 253, 1042, 337, 39 LTC https://tests.stockfishchess.org/tests/view/5f3127f79081672066537502 LLR: 2.93 (-2.94,2.94) {0.25,1.75} Total: 8360 W: 581 L: 475 D: 7304 Ptnml(0-2): 11, 407, 3238, 513, 11 closes https://github.com/official-stockfish/Stockfish/pull/2971 bench: 4733874 --- src/search.cpp | 60 +++++++++++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/src/search.cpp b/src/search.cpp index 3d2bb422..676427f7 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -63,9 +63,9 @@ namespace { constexpr uint64_t TtHitAverageResolution = 1024; // Razor and futility margins - constexpr int RazorMargin = 527; + constexpr int RazorMargin = 510; Value futility_margin(Depth d, bool improving) { - return Value(227 * (d - improving)); + return Value(223 * (d - improving)); } // Reductions lookup table, initialized at startup @@ -73,7 +73,7 @@ namespace { Depth reduction(bool i, Depth d, int mn) { int r = Reductions[d] * Reductions[mn]; - return (r + 570) / 1024 + (!i && r > 1018); + return (r + 509) / 1024 + (!i && r > 894); } constexpr int futility_move_count(bool improving, Depth depth) { @@ -82,7 +82,7 @@ namespace { // History and stats update bonus, based on depth int stat_bonus(Depth d) { - return d > 15 ? 27 : 17 * d * d + 133 * d - 134; + return d > 13 ? 29 : 17 * d * d + 134 * d - 134; } // Add a small random component to draw evaluations to avoid 3fold-blindness @@ -192,7 +192,7 @@ namespace { void Search::init() { for (int i = 1; i < MAX_MOVES; ++i) - Reductions[i] = int((24.8 + std::log(Threads.size())) * std::log(i)); + Reductions[i] = int((22.0 + std::log(Threads.size())) * std::log(i)); } @@ -403,12 +403,12 @@ void Thread::search() { if (rootDepth >= 4) { Value prev = rootMoves[pvIdx].previousScore; - delta = Value(19); + delta = Value(17); alpha = std::max(prev - delta,-VALUE_INFINITE); beta = std::min(prev + delta, VALUE_INFINITE); // Adjust contempt based on root move's previousScore (dynamic contempt) - int dct = ct + (110 - ct / 2) * prev / (abs(prev) + 140); + int dct = ct + (105 - ct / 2) * prev / (abs(prev) + 149); contempt = (us == WHITE ? make_score(dct, dct / 2) : -make_score(dct, dct / 2)); @@ -506,13 +506,13 @@ void Thread::search() { && !Threads.stop && !mainThread->stopOnPonderhit) { - double fallingEval = (296 + 6 * (mainThread->bestPreviousScore - bestValue) - + 6 * (mainThread->iterValue[iterIdx] - bestValue)) / 725.0; + double fallingEval = (318 + 6 * (mainThread->bestPreviousScore - bestValue) + + 6 * (mainThread->iterValue[iterIdx] - bestValue)) / 825.0; fallingEval = Utility::clamp(fallingEval, 0.5, 1.5); // If the bestMove is stable over several iterations, reduce time accordingly - timeReduction = lastBestMoveDepth + 10 < completedDepth ? 1.92 : 0.95; - double reduction = (1.47 + mainThread->previousTimeReduction) / (2.22 * timeReduction); + timeReduction = lastBestMoveDepth + 9 < completedDepth ? 1.92 : 0.95; + double reduction = (1.47 + mainThread->previousTimeReduction) / (2.32 * timeReduction); // Use part of the gained time from a previous stable move for the current move for (Thread* th : Threads) @@ -537,7 +537,7 @@ void Thread::search() { } else if ( Threads.increaseDepth && !mainThread->ponder - && Time.elapsed() > totalTime * 0.56) + && Time.elapsed() > totalTime * 0.58) Threads.increaseDepth = false; else Threads.increaseDepth = true; @@ -824,10 +824,10 @@ namespace { // Step 9. Null move search with verification search (~40 Elo) if ( !PvNode && (ss-1)->currentMove != MOVE_NULL - && (ss-1)->statScore < 23824 + && (ss-1)->statScore < 22977 && eval >= beta && eval >= ss->staticEval - && ss->staticEval >= beta - 28 * depth - 28 * improving + 94 * ttPv + 200 + && ss->staticEval >= beta - 30 * depth - 28 * improving + 84 * ttPv + 182 && !excludedMove && pos.non_pawn_material(us) && (ss->ply >= thisThread->nmpMinPly || us != thisThread->nmpColor)) @@ -835,7 +835,7 @@ namespace { assert(eval - beta >= 0); // Null move dynamic reduction based on depth and value - Depth R = (737 + 77 * depth) / 246 + std::min(int(eval - beta) / 192, 3); + Depth R = (817 + 71 * depth) / 213 + std::min(int(eval - beta) / 192, 3); ss->currentMove = MOVE_NULL; ss->continuationHistory = &thisThread->continuationHistory[0][0][NO_PIECE][0]; @@ -1028,17 +1028,17 @@ moves_loop: // When in check, search starts from here continue; // Futility pruning: parent node (~5 Elo) - if ( lmrDepth < 8 + if ( lmrDepth < 7 && !ss->inCheck - && ss->staticEval + 284 + 188 * lmrDepth <= alpha + && ss->staticEval + 283 + 170 * lmrDepth <= alpha && (*contHist[0])[movedPiece][to_sq(move)] + (*contHist[1])[movedPiece][to_sq(move)] + (*contHist[3])[movedPiece][to_sq(move)] - + (*contHist[5])[movedPiece][to_sq(move)] / 2 < 28388) + + (*contHist[5])[movedPiece][to_sq(move)] / 2 < 27376) continue; // Prune moves with negative SEE (~20 Elo) - if (!pos.see_ge(move, Value(-(29 - std::min(lmrDepth, 17)) * lmrDepth * lmrDepth))) + if (!pos.see_ge(move, Value(-(29 - std::min(lmrDepth, 18)) * lmrDepth * lmrDepth))) continue; } else @@ -1055,12 +1055,12 @@ moves_loop: // When in check, search starts from here && !(PvNode && abs(bestValue) < 2) && PieceValue[MG][type_of(movedPiece)] >= PieceValue[MG][type_of(pos.piece_on(to_sq(move)))] && !ss->inCheck - && ss->staticEval + 178 + 261 * lmrDepth + && ss->staticEval + 169 + 244 * lmrDepth + PieceValue[MG][type_of(pos.piece_on(to_sq(move)))] <= alpha) continue; // See based pruning - if (!pos.see_ge(move, Value(-202) * depth)) // (~25 Elo) + if (!pos.see_ge(move, Value(-221) * depth)) // (~25 Elo) continue; } } @@ -1166,7 +1166,7 @@ moves_loop: // When in check, search starts from here || moveCountPruning || ss->staticEval + PieceValue[EG][pos.captured_piece()] <= alpha || cutNode - || thisThread->ttHitAverage < 415 * TtHitAverageResolution * TtHitAverageWindow / 1024)) + || thisThread->ttHitAverage < 427 * TtHitAverageResolution * TtHitAverageWindow / 1024)) { Depth r = reduction(improving, depth, moveCount); @@ -1178,7 +1178,7 @@ moves_loop: // When in check, search starts from here r--; // Decrease reduction if the ttHit running average is large - if (thisThread->ttHitAverage > 473 * TtHitAverageResolution * TtHitAverageWindow / 1024) + if (thisThread->ttHitAverage > 509 * TtHitAverageResolution * TtHitAverageWindow / 1024) r--; // Reduction if other threads are searching this position @@ -1221,17 +1221,17 @@ moves_loop: // When in check, search starts from here + (*contHist[0])[movedPiece][to_sq(move)] + (*contHist[1])[movedPiece][to_sq(move)] + (*contHist[3])[movedPiece][to_sq(move)] - - 4826; + - 5287; // Decrease/increase reduction by comparing opponent's stat score (~10 Elo) - if (ss->statScore >= -100 && (ss-1)->statScore < -112) + if (ss->statScore >= -106 && (ss-1)->statScore < -104) r--; - else if ((ss-1)->statScore >= -125 && ss->statScore < -138) + else if ((ss-1)->statScore >= -119 && ss->statScore < -140) r++; // Decrease/increase reduction for moves with a good/bad history (~30 Elo) - r -= ss->statScore / 14615; + r -= ss->statScore / 14884; } else { @@ -1241,7 +1241,7 @@ moves_loop: // When in check, search starts from here // Unless giving check, this capture is likely bad if ( !givesCheck - && ss->staticEval + PieceValue[EG][pos.captured_piece()] + 211 * depth <= alpha) + && ss->staticEval + PieceValue[EG][pos.captured_piece()] + 213 * depth <= alpha) r++; } @@ -1503,7 +1503,7 @@ moves_loop: // When in check, search starts from here if (PvNode && bestValue > alpha) alpha = bestValue; - futilityBase = bestValue + 141; + futilityBase = bestValue + 145; } const PieceToHistory* contHist[] = { (ss-1)->continuationHistory, (ss-2)->continuationHistory, @@ -1754,7 +1754,7 @@ moves_loop: // When in check, search starts from here } if (depth > 11 && ss->ply < MAX_LPH) - thisThread->lowPlyHistory[ss->ply][from_to(move)] << stat_bonus(depth - 6); + thisThread->lowPlyHistory[ss->ply][from_to(move)] << stat_bonus(depth - 7); } // When playing with strength handicap, choose best move among a set of RootMoves From a72cec1ff854a77a92452c2afe2001e05f06e6d4 Mon Sep 17 00:00:00 2001 From: Vizvezdenec Date: Sat, 18 Jul 2020 16:30:00 +0300 Subject: [PATCH 10/58] Add comments to probCut code and rename a variable closes https://github.com/official-stockfish/Stockfish/pull/2819 No functional change --- src/search.cpp | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/src/search.cpp b/src/search.cpp index 676427f7..ef47fd22 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -596,7 +596,7 @@ namespace { Key posKey; Move ttMove, move, excludedMove, bestMove; Depth extension, newDepth; - Value bestValue, value, ttValue, eval, maxValue, probcutBeta; + Value bestValue, value, ttValue, eval, maxValue, probCutBeta; bool ttHit, ttPv, formerPv, givesCheck, improving, didLMR, priorCapture; bool captureOrPromotion, doFullDepthSearch, moveCountPruning, ttCapture, singularQuietLMR; @@ -871,7 +871,7 @@ namespace { } } - probcutBeta = beta + 176 - 49 * improving; + probCutBeta = beta + 176 - 49 * improving; // Step 10. ProbCut (~10 Elo) // If we have a good enough capture and a reduced search returns a value @@ -879,21 +879,27 @@ namespace { if ( !PvNode && depth > 4 && abs(beta) < VALUE_TB_WIN_IN_MAX_PLY - && !( ttHit - && tte->depth() >= depth - 3 + // if value from transposition table is lower than probCutBeta, don't attempt probCut + // there and in further interactions with transposition table cutoff depth is set to depth - 3 + // because probCut search has depth set to depth - 4 but we also do a move before it + // so effective depth is equal to depth - 3 + && !( ttHit + && tte->depth() >= depth - 3 && ttValue != VALUE_NONE - && ttValue < probcutBeta)) + && ttValue < probCutBeta)) { + // if ttMove is a capture and value from transposition table is good enough produce probCut + // cutoff without digging into actual probCut search if ( ttHit && tte->depth() >= depth - 3 && ttValue != VALUE_NONE - && ttValue >= probcutBeta + && ttValue >= probCutBeta && ttMove && pos.capture_or_promotion(ttMove)) - return probcutBeta; + return probCutBeta; - assert(probcutBeta < VALUE_INFINITE); - MovePicker mp(pos, ttMove, probcutBeta - ss->staticEval, &captureHistory); + assert(probCutBeta < VALUE_INFINITE); + MovePicker mp(pos, ttMove, probCutBeta - ss->staticEval, &captureHistory); int probCutCount = 0; while ( (move = mp.next_move()) != MOVE_NONE @@ -915,16 +921,17 @@ namespace { pos.do_move(move, st); // Perform a preliminary qsearch to verify that the move holds - value = -qsearch(pos, ss+1, -probcutBeta, -probcutBeta+1); + value = -qsearch(pos, ss+1, -probCutBeta, -probCutBeta+1); // If the qsearch held, perform the regular search - if (value >= probcutBeta) - value = -search(pos, ss+1, -probcutBeta, -probcutBeta+1, depth - 4, !cutNode); + if (value >= probCutBeta) + value = -search(pos, ss+1, -probCutBeta, -probCutBeta+1, depth - 4, !cutNode); pos.undo_move(move); - if (value >= probcutBeta) + if (value >= probCutBeta) { + // if transposition table doesn't have equal or more deep info write probCut data into it if ( !(ttHit && tte->depth() >= depth - 3 && ttValue != VALUE_NONE)) From 4ab8b0b738fe4ae58588efb421fd7b1643b2ef66 Mon Sep 17 00:00:00 2001 From: Guy Vreuls Date: Tue, 11 Aug 2020 04:38:38 +0200 Subject: [PATCH 11/58] Fix parallel LTO issues on Windows This adds -save-temps to the linker flags when parallel LTO is used on MinGW/MSYS. fixes #2977 closes https://github.com/official-stockfish/Stockfish/pull/2978 No functional change. --- src/Makefile | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index 3d84f482..fd2618a4 100644 --- a/src/Makefile +++ b/src/Makefile @@ -472,6 +472,11 @@ ifeq ($(debug), no) ifeq ($(gccisclang),) CXXFLAGS += -flto LDFLAGS += $(CXXFLAGS) -flto=jobserver + ifneq ($(findstring MINGW,$(KERNEL)),) + LDFLAGS += -save-temps + else ifneq ($(findstring MSYS,$(KERNEL)),) + LDFLAGS += -save-temps + endif else CXXFLAGS += -flto=thin LDFLAGS += $(CXXFLAGS) @@ -605,7 +610,7 @@ objclean: # clean auxiliary profiling files profileclean: @rm -rf profdir - @rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./nnue/*.gcda ./nnue/features/*.gcda + @rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./nnue/*.gcda ./nnue/features/*.gcda *.s @rm -f stockfish.profdata *.profraw default: From 399cddf444666cf1671c5281f7a8e78887b4f400 Mon Sep 17 00:00:00 2001 From: Joost VandeVondele Date: Mon, 10 Aug 2020 16:14:17 +0200 Subject: [PATCH 12/58] More aligned_alloc changes to support Android Move to posix_memalign for those platforms, in particular android, that do not fully support c++17 std::aligned_alloc() (and are not windows) see https://github.com/official-stockfish/Stockfish/issues/2860 closes https://github.com/official-stockfish/Stockfish/pull/2973 No functional change --- src/misc.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/misc.cpp b/src/misc.cpp index 401a6505..fc3746cf 100644 --- a/src/misc.cpp +++ b/src/misc.cpp @@ -51,6 +51,11 @@ typedef bool(*fun3_t)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY); #include #endif +#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32)) +#define POSIXALIGNEDALLOC +#include +#endif + #include "misc.h" #include "thread.h" @@ -318,8 +323,11 @@ void prefetch(void* addr) { /// void* std_aligned_alloc(size_t alignment, size_t size) { -#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32)) - return aligned_alloc(alignment, size); +#if defined(POSIXALIGNEDALLOC) + void *pointer; + if(posix_memalign(&pointer, alignment, size) == 0) + return pointer; + return nullptr; #elif (defined(_WIN32) || (defined(__APPLE__) && !defined(_LIBCPP_HAS_C11_FEATURES))) return _mm_malloc(size, alignment); #else @@ -328,7 +336,7 @@ void* std_aligned_alloc(size_t alignment, size_t size) { } void std_aligned_free(void* ptr) { -#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32)) +#if defined(POSIXALIGNEDALLOC) free(ptr); #elif (defined(_WIN32) || (defined(__APPLE__) && !defined(_LIBCPP_HAS_C11_FEATURES))) _mm_free(ptr); From f46c73040c16a078b884825c203feee6b0a8850b Mon Sep 17 00:00:00 2001 From: mstembera Date: Mon, 10 Aug 2020 12:52:46 -0700 Subject: [PATCH 13/58] Fix AVX512 build with older compilers avoids an intrinsic that is missing in gcc < 10. For this target, might trigger another gcc bug on windows that requires up-to-date gcc 8, 9, or 10, or usage of clang. Fixes https://github.com/official-stockfish/Stockfish/issues/2975 closes https://github.com/official-stockfish/Stockfish/pull/2976 No functional change --- src/Makefile | 2 +- src/nnue/layers/affine_transform.h | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/Makefile b/src/Makefile index fd2618a4..e34fbf61 100644 --- a/src/Makefile +++ b/src/Makefile @@ -416,7 +416,7 @@ endif ifeq ($(avx512),yes) CXXFLAGS += -DUSE_AVX512 ifeq ($(comp),$(filter $(comp),gcc clang mingw)) - CXXFLAGS += -mavx512bw + CXXFLAGS += -mavx512f -mavx512bw endif endif diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index 985ee71a..8d2acd18 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -126,8 +126,7 @@ namespace Eval::NNUE::Layers { const auto iv256 = reinterpret_cast(&input_vector[kNumChunks]); const auto row256 = reinterpret_cast(&row[kNumChunks]); __m256i product256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0])); - product256 = _mm256_madd_epi16(product256, _mm256_set1_epi16(1)); - sum = _mm512_add_epi32(sum, _mm512_zextsi256_si512(product256)); + sum = _mm512_add_epi32(sum, _mm512_cvtepi16_epi32(product256)); } output[i] = _mm512_reduce_add_epi32(sum) + biases_[i]; From ea6220f3813e5b76b444a02905eaf2c556bdb368 Mon Sep 17 00:00:00 2001 From: Guy Vreuls Date: Fri, 7 Aug 2020 17:07:46 +0200 Subject: [PATCH 14/58] This commit enables a mixed bench, to improve CI and allow for PGO (profile-build) of the NNUE part of the code. Joint work gvreuls / vondele * Download the default NNUE net in AppVeyor * Download net in travis CI `make net` * Adjust tests to cover more archs, speedup instrumented testing * Introduce 'mixed' bench as default, with further options: classical, NNUE, mixed. mixed (default) and NNUE require the default net to be present, which can be obtained with ``` make net ``` Further examples (first is equivalent to `./stockfish bench`): ``` ./stockfish bench 16 1 13 default depth mixed ./stockfish bench 16 1 13 default depth classical ./stockfish bench 16 1 13 default depth NNUE ``` The net is now downloaded automatically if needed for `profile-build` (usual `build` works fine without net present) PGO gives a nice speedup on fishtest: passed STC: LLR: 2.93 (-2.94,2.94) {-0.50,1.50} Total: 3360 W: 469 L: 343 D: 2548 Ptnml(0-2): 20, 246, 1030, 356, 28 https://tests.stockfishchess.org/tests/view/5f31b5499081672066537569 passed LTC: LLR: 2.97 (-2.94,2.94) {0.25,1.75} Total: 8824 W: 609 L: 502 D: 7713 Ptnml(0-2): 8, 430, 3438, 519, 17 https://tests.stockfishchess.org/tests/view/5f31c87b908167206653757c closes https://github.com/official-stockfish/Stockfish/pull/2931 fixes https://github.com/official-stockfish/Stockfish/issues/2907 requires fishtest updates before commit Bench: 4290577 --- .travis.yml | 27 +++++++++++++++++++++------ appveyor.yml | 14 ++++++++++++++ src/Makefile | 2 +- src/benchmark.cpp | 13 +++++++++++-- tests/instrumented.sh | 8 ++++---- 5 files changed, 51 insertions(+), 13 deletions(-) diff --git a/.travis.yml b/.travis.yml index d563a1e1..0dd38047 100644 --- a/.travis.yml +++ b/.travis.yml @@ -43,6 +43,9 @@ before_script: - cd src script: + # Download net + - make net + # Obtain bench reference from git log - git log HEAD | grep "\b[Bb]ench[ :]\+[0-9]\{7\}" | head -n 1 | sed "s/[^0-9]*\([0-9]*\).*/\1/g" > git_sig - export benchref=$(cat git_sig) @@ -55,14 +58,26 @@ script: # # Verify bench number against various builds - export CXXFLAGS="-Werror -D_GLIBCXX_DEBUG" - - make clean && make -j2 ARCH=x86-64 optimize=no debug=yes build && ../tests/signature.sh $benchref + - make clean && make -j2 ARCH=x86-64-modern optimize=no debug=yes build && ../tests/signature.sh $benchref + - export CXXFLAGS="-Werror" + - make clean && make -j2 ARCH=x86-64-modern build && ../tests/signature.sh $benchref + - make clean && make -j2 ARCH=x86-64-ssse3 build && ../tests/signature.sh $benchref + - make clean && make -j2 ARCH=x86-64-sse3-popcnt build && ../tests/signature.sh $benchref + - make clean && make -j2 ARCH=x86-64 build && ../tests/signature.sh $benchref - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 optimize=no debug=yes build && ../tests/signature.sh $benchref; fi - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 build && ../tests/signature.sh $benchref; fi + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-old build && ../tests/signature.sh $benchref; fi + - if [[ "$TRAVIS_OS_NAME" == "linux" && "$COMP" == "gcc" ]]; then make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref; fi + + # compile only for some more advanced architectures (might not run in travis) + - make clean && make -j2 ARCH=x86-64-avx2 build + - make clean && make -j2 ARCH=x86-64-bmi2 build + # needs gcc 10 to compile + - if [[ "$COMPILER" != "g++-8" ]]; then make clean && make -j2 ARCH=x86-64-avx512 build; fi # # Check perft and reproducible search - - export CXXFLAGS="-Werror" - - make clean && make -j2 ARCH=x86-64 build + - make clean && make -j2 ARCH=x86-64-modern build - ../tests/perft.sh - ../tests/reprosearch.sh @@ -70,11 +85,11 @@ script: # Valgrind # - export CXXFLAGS="-O1 -fno-inline" - - if [ -x "$(command -v valgrind )" ]; then make clean && make -j2 ARCH=x86-64 debug=yes optimize=no build > /dev/null && ../tests/instrumented.sh --valgrind; fi + - if [ -x "$(command -v valgrind )" ]; then make clean && make -j2 ARCH=x86-64-modern debug=yes optimize=no build > /dev/null && ../tests/instrumented.sh --valgrind; fi - if [ -x "$(command -v valgrind )" ]; then ../tests/instrumented.sh --valgrind-thread; fi # # Sanitizer # - - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64 sanitize=undefined optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-undefined; fi - - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64 sanitize=thread optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-thread; fi + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-undefined; fi + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64-modern sanitize=thread optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-thread; fi diff --git a/appveyor.yml b/appveyor.yml index d356ba2f..a3732a23 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -61,6 +61,20 @@ before_build: build_script: - cmake --build . --config %CONFIGURATION% -- /verbosity:minimal + - ps: | + # Download default NNUE net from fishtest + $nnuenet = Get-Content -Path src\ucioption.cpp | Select-String -CaseSensitive -Pattern "Option" | Select-String -CaseSensitive -Pattern "nn-[a-z0-9]{12}.nnue" + $dummy = $nnuenet -match "(?nn-[a-z0-9]{12}.nnue)" + $nnuenet = $Matches.nnuenet + Write-Host "Default net:" $nnuenet + $nnuedownloadurl = "https://tests.stockfishchess.org/api/nn/$nnuenet" + $nnuefilepath = "src\${env:CONFIGURATION}\$nnuenet" + if (Test-Path -Path $nnuefilepath) { + Write-Host "Already available." + } else { + Write-Host "Downloading $nnuedownloadurl to $nnuefilepath" + Invoke-WebRequest -Uri $nnuedownloadurl -OutFile $nnuefilepath + } before_test: - cd src/%CONFIGURATION% diff --git a/src/Makefile b/src/Makefile index e34fbf61..c00b60b5 100644 --- a/src/Makefile +++ b/src/Makefile @@ -569,7 +569,7 @@ help: build: config-sanity $(MAKE) ARCH=$(ARCH) COMP=$(COMP) all -profile-build: config-sanity objclean profileclean +profile-build: config-sanity objclean profileclean net @echo "" @echo "Step 1/4. Building instrumented executable ..." $(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) diff --git a/src/benchmark.cpp b/src/benchmark.cpp index 6041d642..806e9840 100644 --- a/src/benchmark.cpp +++ b/src/benchmark.cpp @@ -95,8 +95,9 @@ const vector Defaults = { /// setup_bench() builds a list of UCI commands to be run by bench. There /// are five parameters: TT size in MB, number of search threads that /// should be used, the limit value spent for each position, a file name -/// where to look for positions in FEN format and the type of the limit: -/// depth, perft, nodes and movetime (in millisecs). +/// where to look for positions in FEN format, the type of the limit: +/// depth, perft, nodes and movetime (in millisecs), and evaluation type +/// mixed (default), classical, NNUE. /// /// bench -> search default positions up to depth 13 /// bench 64 1 15 -> search default positions up to depth 15 (TT = 64MB) @@ -115,6 +116,7 @@ vector setup_bench(const Position& current, istream& is) { string limit = (is >> token) ? token : "13"; string fenFile = (is >> token) ? token : "default"; string limitType = (is >> token) ? token : "depth"; + string evalType = (is >> token) ? token : "mixed"; go = limitType == "eval" ? "eval" : "go " + limitType + " " + limit; @@ -146,13 +148,20 @@ vector setup_bench(const Position& current, istream& is) { list.emplace_back("setoption name Hash value " + ttSize); list.emplace_back("ucinewgame"); + size_t posCounter = 0; + for (const string& fen : fens) if (fen.find("setoption") != string::npos) list.emplace_back(fen); else { + if (evalType == "classical" || (evalType == "mixed" && posCounter % 2 == 0)) + list.emplace_back("setoption name Use NNUE value false"); + else if (evalType == "NNUE" || (evalType == "mixed" && posCounter % 2 != 0)) + list.emplace_back("setoption name Use NNUE value true"); list.emplace_back("position fen " + fen); list.emplace_back(go); + ++posCounter; } return list; diff --git a/tests/instrumented.sh b/tests/instrumented.sh index ae6d5c4b..03ded74a 100755 --- a/tests/instrumented.sh +++ b/tests/instrumented.sh @@ -70,7 +70,7 @@ for args in "eval" \ "go depth 10" \ "go movetime 1000" \ "go wtime 8000 btime 8000 winc 500 binc 500" \ - "bench 128 $threads 10 default depth" + "bench 128 $threads 8 default depth" do echo "$prefix $exeprefix ./stockfish $args $postfix" @@ -80,7 +80,7 @@ done # more general testing, following an uci protocol exchange cat << EOF > game.exp - set timeout 10 + set timeout 240 spawn $exeprefix ./stockfish send "uci\n" @@ -98,7 +98,7 @@ cat << EOF > game.exp expect "bestmove" send "position fen 5rk1/1K4p1/8/8/3B4/8/8/8 b - - 0 1\n" - send "go depth 30\n" + send "go depth 20\n" expect "bestmove" send "quit\n" @@ -121,7 +121,7 @@ cat << EOF > syzygy.exp send "uci\n" send "setoption name SyzygyPath value ../tests/syzygy/\n" expect "info string Found 35 tablebases" {} timeout {exit 1} - send "bench 128 1 10 default depth\n" + send "bench 128 1 8 default depth\n" send "quit\n" expect eof From ee060464129f8d3af184efa013177a4ef387a394 Mon Sep 17 00:00:00 2001 From: SFisGOD Date: Mon, 10 Aug 2020 21:13:56 +0800 Subject: [PATCH 15/58] Tweak castling extension Change condition from three friendly pieces to two. This now means that we only extend castling on the king side if there are no other friendly pieces aside from king and rook. For the queen side, we only extend if there is only a rook and another friendly piece or if there is only a single rook and no other friendly piece but this is very rare. STC: LLR: 3.20 (-2.94,2.94) {-0.50,1.50} Total: 31144 W: 4086 L: 3903 D: 23155 Ptnml(0-2): 227, 2843, 9278, 2968, 256 https://tests.stockfishchess.org/tests/view/5f31487f9081672066537516 LTC: LLR: 2.93 (-2.94,2.94) {0.25,1.75} Total: 57816 W: 3786 L: 3538 D: 50492 Ptnml(0-2): 92, 2991, 22488, 3251, 86 https://tests.stockfishchess.org/tests/view/5f3167c3908167206653753d closes https://github.com/official-stockfish/Stockfish/pull/2980 Bench: 4244812 --- src/search.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search.cpp b/src/search.cpp index ef47fd22..c5b4332f 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -1139,7 +1139,7 @@ moves_loop: // When in check, search starts from here // Castling extension if ( type_of(move) == CASTLING - && popcount(pos.pieces(us) & ~pos.pieces(PAWN) & (to_sq(move) & KingSide ? KingSide : QueenSide)) <= 3) + && popcount(pos.pieces(us) & ~pos.pieces(PAWN) & (to_sq(move) & KingSide ? KingSide : QueenSide)) <= 2) extension = 1; // Late irreversible move extension From 992f549ae7f4f73b025429c44bdbbc65de917f6c Mon Sep 17 00:00:00 2001 From: Joost VandeVondele Date: Tue, 11 Aug 2020 21:11:17 +0200 Subject: [PATCH 16/58] Restrict avx2 hack to windows target this workaround is possibly rather a windows & gcc specific problem. See e.g. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412#c25 on Linux with gcc 8 this patch brings roughly a 8% speedup. However, probably needs some testing in the wild. includes a workaround for an old msys make (3.81) installation (fixes #2984) No functional change --- src/Makefile | 2 +- src/nnue/nnue_common.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Makefile b/src/Makefile index c00b60b5..e82b066b 100644 --- a/src/Makefile +++ b/src/Makefile @@ -569,7 +569,7 @@ help: build: config-sanity $(MAKE) ARCH=$(ARCH) COMP=$(COMP) all -profile-build: config-sanity objclean profileclean net +profile-build: net config-sanity objclean profileclean @echo "" @echo "Step 1/4. Building instrumented executable ..." $(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h index cb1251c5..eab7d258 100644 --- a/src/nnue/nnue_common.h +++ b/src/nnue/nnue_common.h @@ -44,7 +44,7 @@ // compiled with older g++ crashes because the output memory is not aligned // even though alignas is specified. #if defined(USE_AVX2) -#if defined(__GNUC__ ) && (__GNUC__ < 9) +#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32) #define _mm256_loadA_si256 _mm256_loadu_si256 #define _mm256_storeA_si256 _mm256_storeu_si256 #else @@ -54,7 +54,7 @@ #endif #if defined(USE_AVX512) -#if defined(__GNUC__ ) && (__GNUC__ < 9) +#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32) #define _mm512_loadA_si512 _mm512_loadu_si512 #define _mm512_storeA_si512 _mm512_storeu_si512 #else From 6bc0256292cf51d390fee0cb78963da884dc2677 Mon Sep 17 00:00:00 2001 From: Daylen Yang Date: Tue, 11 Aug 2020 12:02:48 -0700 Subject: [PATCH 17/58] Use posix_memalign for Apple Silicon instead of _mm_malloc fails to build on that target, because of missing Intel Intrinsics. macOS has posix_memalign() since ~2014 so we can simplify the code and just use that for all Apple platforms. closes https://github.com/official-stockfish/Stockfish/pull/2985 No functional change. --- src/misc.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/misc.cpp b/src/misc.cpp index fc3746cf..aeb3c912 100644 --- a/src/misc.cpp +++ b/src/misc.cpp @@ -51,7 +51,7 @@ typedef bool(*fun3_t)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY); #include #endif -#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32)) +#if defined(__APPLE__) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32)) #define POSIXALIGNEDALLOC #include #endif @@ -328,7 +328,7 @@ void* std_aligned_alloc(size_t alignment, size_t size) { if(posix_memalign(&pointer, alignment, size) == 0) return pointer; return nullptr; -#elif (defined(_WIN32) || (defined(__APPLE__) && !defined(_LIBCPP_HAS_C11_FEATURES))) +#elif defined(_WIN32) return _mm_malloc(size, alignment); #else return std::aligned_alloc(alignment, size); @@ -338,7 +338,7 @@ void* std_aligned_alloc(size_t alignment, size_t size) { void std_aligned_free(void* ptr) { #if defined(POSIXALIGNEDALLOC) free(ptr); -#elif (defined(_WIN32) || (defined(__APPLE__) && !defined(_LIBCPP_HAS_C11_FEATURES))) +#elif defined(_WIN32) _mm_free(ptr); #else free(ptr); From dd63b98fb06e050aa961fbad6fd1f9316f2b17df Mon Sep 17 00:00:00 2001 From: mstembera Date: Tue, 11 Aug 2020 12:59:39 -0700 Subject: [PATCH 18/58] Add support for VNNI Adds support for Vector Neural Network Instructions (avx512), as available on Intel Cascade Lake The _mm512_dpbusd_epi32() intrinsic (vpdpbusd instruction) is taylor made for NNUE. on a cascade lake CPU (AWS C5.24x.large, gcc 10) NNUE eval is at roughly 78% nps of classical (single core test) bench 1024 1 24 default depth: target classical NNUE ratio vnni 2207232 1725987 78.20 avx512 2216789 1671734 75.41 avx2 2194006 1611263 73.44 modern 2185001 1352469 61.90 closes https://github.com/official-stockfish/Stockfish/pull/2987 No functional change --- src/Makefile | 25 +++++++++++++++++++++++++ src/misc.cpp | 3 +++ src/nnue/layers/affine_transform.h | 14 +++++++++++++- 3 files changed, 41 insertions(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index e82b066b..0804cdd5 100644 --- a/src/Makefile +++ b/src/Makefile @@ -73,6 +73,7 @@ endif # avx2 = yes/no --- -mavx2 --- Use Intel Advanced Vector Extensions 2 # pext = yes/no --- -DUSE_PEXT --- Use pext x86_64 asm-instruction # avx512 = yes/no --- -mavx512bw --- Use Intel Advanced Vector Extensions 512 +# vnni = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 512 # neon = yes/no --- -DUSE_NEON --- Use ARM SIMD architecture # # Note that Makefile is space sensitive, so when adding new architectures @@ -93,6 +94,7 @@ sse41 = no avx2 = no pext = no avx512 = no +vnni = no neon = no ARCH = x86-64-modern @@ -190,6 +192,19 @@ ifeq ($(ARCH),x86-64-avx512) avx512 = yes endif +ifeq ($(ARCH),x86-64-vnni) + arch = x86_64 + prefetch = yes + popcnt = yes + sse = yes + ssse3 = yes + sse41 = yes + avx2 = yes + pext = yes + avx512 = yes + vnni = yes +endif + ifeq ($(ARCH),armv7) arch = armv7 prefetch = yes @@ -420,6 +435,13 @@ ifeq ($(avx512),yes) endif endif +ifeq ($(vnni),yes) + CXXFLAGS += -DUSE_VNNI + ifeq ($(comp),$(filter $(comp),gcc clang mingw)) + CXXFLAGS += -mavx512vnni -mavx512dq -mavx512vl + endif +endif + ifeq ($(sse41),yes) CXXFLAGS += -DUSE_SSE41 ifeq ($(comp),$(filter $(comp),gcc clang mingw)) @@ -522,6 +544,7 @@ help: @echo "" @echo "Supported archs:" @echo "" + @echo "x86-64-vnni > x86 64-bit with vnni support" @echo "x86-64-avx512 > x86 64-bit with avx512 support" @echo "x86-64-bmi2 > x86 64-bit with bmi2 support" @echo "x86-64-avx2 > x86 64-bit with avx2 support" @@ -640,6 +663,7 @@ config-sanity: @echo "avx2: '$(avx2)'" @echo "pext: '$(pext)'" @echo "avx512: '$(avx512)'" + @echo "vnni: '$(vnni)'" @echo "neon: '$(neon)'" @echo "" @echo "Flags:" @@ -664,6 +688,7 @@ config-sanity: @test "$(avx2)" = "yes" || test "$(avx2)" = "no" @test "$(pext)" = "yes" || test "$(pext)" = "no" @test "$(avx512)" = "yes" || test "$(avx512)" = "no" + @test "$(vnni)" = "yes" || test "$(vnni)" = "no" @test "$(neon)" = "yes" || test "$(neon)" = "no" @test "$(comp)" = "gcc" || test "$(comp)" = "icc" || test "$(comp)" = "mingw" || test "$(comp)" = "clang" diff --git a/src/misc.cpp b/src/misc.cpp index aeb3c912..ab52d30b 100644 --- a/src/misc.cpp +++ b/src/misc.cpp @@ -219,6 +219,9 @@ const std::string compiler_info() { compiler += "\nCompilation settings include: "; compiler += (Is64Bit ? " 64bit" : " 32bit"); + #if defined(USE_VNNI) + compiler += " VNNI"; + #endif #if defined(USE_AVX512) compiler += " AVX512"; #endif diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index 8d2acd18..322e3240 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -79,8 +79,10 @@ namespace Eval::NNUE::Layers { #if defined(USE_AVX512) constexpr IndexType kNumChunks = kPaddedInputDimensions / (kSimdWidth * 2); - const __m512i kOnes = _mm512_set1_epi16(1); const auto input_vector = reinterpret_cast(input); + #if !defined(USE_VNNI) + const __m512i kOnes = _mm512_set1_epi16(1); + #endif #elif defined(USE_AVX2) constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; @@ -113,9 +115,13 @@ namespace Eval::NNUE::Layers { __m512i sum = _mm512_setzero_si512(); const auto row = reinterpret_cast(&weights_[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { + #if defined(USE_VNNI) + sum = _mm512_dpbusd_epi32(sum, _mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j])); + #else __m512i product = _mm512_maddubs_epi16(_mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j])); product = _mm512_madd_epi16(product, kOnes); sum = _mm512_add_epi32(sum, product); + #endif } // Note: Changing kMaxSimdWidth from 32 to 64 breaks loading existing networks. @@ -125,8 +131,14 @@ namespace Eval::NNUE::Layers { { const auto iv256 = reinterpret_cast(&input_vector[kNumChunks]); const auto row256 = reinterpret_cast(&row[kNumChunks]); + #if defined(USE_VNNI) + __m256i product256 = _mm256_dpbusd_epi32( + _mm512_castsi512_si256(sum), _mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0])); + sum = _mm512_inserti32x8(sum, product256, 0); + #else __m256i product256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0])); sum = _mm512_add_epi32(sum, _mm512_cvtepi16_epi32(product256)); + #endif } output[i] = _mm512_reduce_add_epi32(sum) + biases_[i]; From 69cfe28f315b559cb1a07c0806266aa2850b5d4b Mon Sep 17 00:00:00 2001 From: Joost VandeVondele Date: Wed, 12 Aug 2020 17:21:12 +0200 Subject: [PATCH 19/58] Output the SSE2 flag in compiler_info was missing in the list of outputs, slightly reorder flags. explicitly add -msse2 if USE_SSE2 (is implicit already, -msse -m64). closes https://github.com/official-stockfish/Stockfish/pull/2990 No functional change. --- src/Makefile | 2 +- src/misc.cpp | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/Makefile b/src/Makefile index 0804cdd5..027cc3e3 100644 --- a/src/Makefile +++ b/src/Makefile @@ -468,7 +468,7 @@ ifeq ($(neon),yes) endif ifeq ($(arch),x86_64) - CXXFLAGS += -DUSE_SSE2 + CXXFLAGS += -msse2 -DUSE_SSE2 endif ### 3.7 pext diff --git a/src/misc.cpp b/src/misc.cpp index ab52d30b..1cee4726 100644 --- a/src/misc.cpp +++ b/src/misc.cpp @@ -225,6 +225,7 @@ const std::string compiler_info() { #if defined(USE_AVX512) compiler += " AVX512"; #endif + compiler += (HasPext ? " BMI2" : ""); #if defined(USE_AVX2) compiler += " AVX2"; #endif @@ -234,11 +235,14 @@ const std::string compiler_info() { #if defined(USE_SSSE3) compiler += " SSSE3"; #endif - compiler += (HasPext ? " BMI2" : ""); - compiler += (HasPopCnt ? " POPCNT" : ""); + #if defined(USE_SSE2) + compiler += " SSE2"; + #endif + compiler += (HasPopCnt ? " POPCNT" : ""); #if defined(USE_MMX) compiler += " MMX"; #endif + #if !defined(NDEBUG) compiler += " DEBUG"; #endif From 67e48418afd58dd69708dcd67dea6161f61ef76f Mon Sep 17 00:00:00 2001 From: Sergio Vieri Date: Wed, 12 Aug 2020 23:21:21 +0800 Subject: [PATCH 20/58] Update default net to nn-82215d0fd0df.nnue Net created at: 20200812-2257 passed STC: https://tests.stockfishchess.org/tests/view/5f340ca99e5f2effc089da17 LLR: 2.96 (-2.94,2.94) {-0.50,1.50} Total: 5744 W: 756 L: 627 D: 4361 Ptnml(0-2): 28, 485, 1731, 586, 42 passed LTC: https://tests.stockfishchess.org/tests/view/5f341eba9e5f2effc089da23 LLR: 2.94 (-2.94,2.94) {0.25,1.75} Total: 17136 W: 1041 L: 917 D: 15178 Ptnml(0-2): 13, 813, 6807, 907, 28 closes https://github.com/official-stockfish/Stockfish/pull/2992 Bench: 3935117 --- src/ucioption.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ucioption.cpp b/src/ucioption.cpp index b0689d6d..0a35d01b 100644 --- a/src/ucioption.cpp +++ b/src/ucioption.cpp @@ -79,7 +79,7 @@ void init(OptionsMap& o) { o["Syzygy50MoveRule"] << Option(true); o["SyzygyProbeLimit"] << Option(7, 0, 7); o["Use NNUE"] << Option(false, on_use_NNUE); - o["EvalFile"] << Option("nn-112bb1c8cdb5.nnue", on_eval_file); + o["EvalFile"] << Option("nn-82215d0fd0df.nnue", on_eval_file); } From e8ea215a13e009b78a148fda831392eb3224107e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ste=CC=81phane=20Nicolet?= Date: Thu, 13 Aug 2020 13:40:06 +0200 Subject: [PATCH 21/58] Clean-up Makefile help Do not show the details of the default architecture for a simple "make help" invocation, as the details are most likely to confuse beginners. Instead we make it clear which architecture is the default and put an example at the end of the Makefile as an incentative to use "make help ARCH=blah" to discover the flags used by the different architectures. ``` make help make help ARCH=x86-64-ssse3 ``` Also clean-up and modernize a bit the Makefile examples while at it. closes https://github.com/official-stockfish/Stockfish/pull/2996 No functional change --- src/Makefile | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/src/Makefile b/src/Makefile index 027cc3e3..a9fb7b81 100644 --- a/src/Makefile +++ b/src/Makefile @@ -81,6 +81,11 @@ endif # at the end of the line for flag values. ### 2.1. General and architecture defaults + +ifeq ($(ARCH),) + empty_arch = yes +endif + optimize = yes debug = no sanitize = no @@ -99,6 +104,7 @@ neon = no ARCH = x86-64-modern ### 2.2 Architecture specific + ifeq ($(ARCH),general-32) arch = any bits = 32 @@ -141,16 +147,7 @@ ifeq ($(ARCH),x86-64-ssse3) ssse3 = yes endif -ifeq ($(ARCH),x86-64-modern) - arch = x86_64 - prefetch = yes - popcnt = yes - sse = yes - ssse3 = yes - sse41 = yes -endif - -ifeq ($(ARCH),x86-64-sse41-popcnt) +ifeq ($(ARCH),$(filter $(ARCH),x86-64-sse41-popcnt x86-64-modern)) arch = x86_64 prefetch = yes popcnt = yes @@ -535,12 +532,13 @@ help: @echo "" @echo "Supported targets:" @echo "" + @echo "help > Display architecture details" @echo "build > Standard build" - @echo "profile-build > Standard build with PGO" + @echo "net > Download the default nnue net" + @echo "profile-build > Faster build (with profile-guided optimization)" @echo "strip > Strip executable" @echo "install > Install executable" @echo "clean > Clean up" - @echo "net > Download the default nnue net" @echo "" @echo "Supported archs:" @echo "" @@ -549,7 +547,7 @@ help: @echo "x86-64-bmi2 > x86 64-bit with bmi2 support" @echo "x86-64-avx2 > x86 64-bit with avx2 support" @echo "x86-64-sse41-popcnt > x86 64-bit with sse41 and popcnt support" - @echo "x86-64-modern > the same as previous (x86-64-sse41-popcnt)" + @echo "x86-64-modern > common modern CPU, currently x86-64-sse41-popcnt" @echo "x86-64-ssse3 > x86 64-bit with ssse3 support" @echo "x86-64-sse3-popcnt > x86 64-bit with sse3 and popcnt support" @echo "x86-64 > x86 64-bit generic" @@ -572,17 +570,20 @@ help: @echo "" @echo "Simple examples. If you don't know what to do, you likely want to run: " @echo "" - @echo "make -j build ARCH=x86-64 (This is for 64-bit systems)" - @echo "make -j build ARCH=x86-32 (This is for 32-bit systems)" + @echo "make -j build ARCH=x86-64 (A portable, slow compile for 64-bit systems)" + @echo "make -j build ARCH=x86-32 (A portable, slow compile for 32-bit systems)" @echo "" - @echo "Advanced examples, for experienced users: " + @echo "Advanced examples, for experienced users looking for performance: " @echo "" - @echo "make -j build ARCH=x86-64-modern COMP=clang" - @echo "make -j profile-build ARCH=x86-64-bmi2 COMP=gcc COMPCXX=g++-4.8" - @echo "" - @echo "The selected architecture $(ARCH) enables the following configuration: " + @echo "make help ARCH=x86-64-bmi2" + @echo "make -j profile-build ARCH=x86-64-bmi2 COMP=gcc COMPCXX=g++-9.0" + @echo "make -j build ARCH=x86-64-ssse3 COMP=clang" @echo "" +ifneq ($(empty_arch), yes) + @echo "-------------------------------\n" + @echo "The selected architecture $(ARCH) will enable the following configuration: " @$(MAKE) ARCH=$(ARCH) COMP=$(COMP) config-sanity +endif .PHONY: help build profile-build strip install clean net objclean profileclean \ From ce009ea1aaecc577bbdf208cef8e61dd1827a18e Mon Sep 17 00:00:00 2001 From: Joost VandeVondele Date: Thu, 13 Aug 2020 22:54:13 +0200 Subject: [PATCH 22/58] Verify SHA of downloaded net file check SHA of the available and downloaded file. Document the format requirement on the default net. Also allow curl to make possibly insecure connections, as needed for old curl. fixes https://github.com/official-stockfish/Stockfish/issues/2998 closes https://github.com/official-stockfish/Stockfish/pull/3000 No functional change. --- src/Makefile | 4 +++- src/ucioption.cpp | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index a9fb7b81..38f607cb 100644 --- a/src/Makefile +++ b/src/Makefile @@ -624,8 +624,10 @@ net: $(eval nnuenet := $(shell grep EvalFile ucioption.cpp | grep Option | sed 's/.*\(nn-[a-z0-9]\{12\}.nnue\).*/\1/')) @echo "Default net: $(nnuenet)" $(eval nnuedownloadurl := https://tests.stockfishchess.org/api/nn/$(nnuenet)) - $(eval curl_or_wget := $(shell if hash curl 2>/dev/null; then echo "curl -sL"; elif hash wget 2>/dev/null; then echo "wget -qO-"; fi)) + $(eval curl_or_wget := $(shell if hash curl 2>/dev/null; then echo "curl -skL"; elif hash wget 2>/dev/null; then echo "wget -qO-"; fi)) @if test -f "$(nnuenet)"; then echo "Already available."; else echo "Downloading $(nnuedownloadurl)"; $(curl_or_wget) $(nnuedownloadurl) > $(nnuenet); fi + $(eval shasum_command := $(shell if hash shasum 2>/dev/null; then echo "shasum -a 256 "; elif hash sha256sum 2>/dev/null; then echo "sha256sum "; fi)) + @if [ "$(nnuenet)" != "nn-"`$(shasum_command) $(nnuenet) | cut -c1-12`".nnue" ]; then echo "Failed download or $(nnuenet) corrupted, please delete!"; exit 1; fi # clean binaries and objects objclean: diff --git a/src/ucioption.cpp b/src/ucioption.cpp index 0a35d01b..2b66a475 100644 --- a/src/ucioption.cpp +++ b/src/ucioption.cpp @@ -79,6 +79,8 @@ void init(OptionsMap& o) { o["Syzygy50MoveRule"] << Option(true); o["SyzygyProbeLimit"] << Option(7, 0, 7); o["Use NNUE"] << Option(false, on_use_NNUE); + // The default must follow the format nn-[SHA256 first 12 digits].nnue + // for the build process (profile-build and fishtest) to work. o["EvalFile"] << Option("nn-82215d0fd0df.nnue", on_eval_file); } From e5f450cf0bfe5a34dd4ea51a5592a71be4514601 Mon Sep 17 00:00:00 2001 From: Miguel Lahoz Date: Mon, 10 Aug 2020 22:57:11 +0800 Subject: [PATCH 23/58] Also dampen NNUE eval with 50 move rule Move the existing dampening function last so that NNUE evaluations are also handled as we approach the 50 move rule. STC: LLR: 2.95 (-2.94,2.94) {-0.50,1.50} Total: 4792 W: 695 L: 561 D: 3536 Ptnml(0-2): 19, 420, 1422, 478, 57 https://tests.stockfishchess.org/tests/view/5f3164179081672066537534 LTC: LLR: 8.62 (-2.94,2.94) {0.25,1.75} Total: 286744 W: 18494 L: 17430 D: 250820 Ptnml(0-2): 418, 14886, 111745, 15860, 463 https://tests.stockfishchess.org/tests/view/5f316b039081672066537541 closes https://github.com/official-stockfish/Stockfish/pull/3004 Bench: 4001800 --- src/evaluate.cpp | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/evaluate.cpp b/src/evaluate.cpp index caab2979..00fd2005 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -927,9 +927,6 @@ make_v: // Side to move point of view v = (pos.side_to_move() == WHITE ? v : -v) + Tempo; - // Damp down the evaluation linearly when shuffling - v = v * (100 - pos.rule50_count()) / 100; - return v; } @@ -941,14 +938,15 @@ make_v: Value Eval::evaluate(const Position& pos) { - if (Eval::useNNUE) - { - Value v = eg_value(pos.psq_score()); - // Take NNUE eval only on balanced positions - if (abs(v) < NNUEThreshold) - return NNUE::evaluate(pos) + Tempo; - } - return Evaluation(pos).value(); + bool classical = !Eval::useNNUE + || abs(eg_value(pos.psq_score())) >= NNUEThreshold; + Value v = classical ? Evaluation(pos).value() + : NNUE::evaluate(pos) + Tempo; + + // Damp down the evaluation linearly when shuffling + v = v * (100 - pos.rule50_count()) / 100; + + return v; } /// trace() is like evaluate(), but instead of returning a value, it returns From 6eb186c97e9d808970d0b1369bcd7aca60612e26 Mon Sep 17 00:00:00 2001 From: mstembera Date: Fri, 14 Aug 2020 04:49:33 -0700 Subject: [PATCH 24/58] Try to match relative magnitude of NNUE eval to classical The idea is that since we are mixing NNUE and classical evals matching their magnitudes closer allows for better comparisons. STC https://tests.stockfishchess.org/tests/view/5f35a65411a9b1a1dbf18e2b LLR: 2.94 (-2.94,2.94) {-0.50,1.50} Total: 9840 W: 1150 L: 1027 D: 7663 Ptnml(0-2): 49, 772, 3175, 855, 69 LTC https://tests.stockfishchess.org/tests/view/5f35bcbe11a9b1a1dbf18e47 LLR: 2.93 (-2.94,2.94) {0.25,1.75} Total: 44424 W: 2492 L: 2294 D: 39638 Ptnml(0-2): 42, 2015, 17915, 2183, 57 also corrects the location to clamp the evaluation (non-function on bench). closes https://github.com/official-stockfish/Stockfish/pull/3003 bench: 3905447 --- src/evaluate.cpp | 5 ++++- src/nnue/evaluate_nnue.cpp | 5 +---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/evaluate.cpp b/src/evaluate.cpp index 00fd2005..a453fa0f 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -941,11 +941,14 @@ Value Eval::evaluate(const Position& pos) { bool classical = !Eval::useNNUE || abs(eg_value(pos.psq_score())) >= NNUEThreshold; Value v = classical ? Evaluation(pos).value() - : NNUE::evaluate(pos) + Tempo; + : NNUE::evaluate(pos) * 5 / 4 + Tempo; // Damp down the evaluation linearly when shuffling v = v * (100 - pos.rule50_count()) / 100; + // Guarantee evalution outside of TB range + v = Utility::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1); + return v; } diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp index af0894b2..a6ece8e2 100644 --- a/src/nnue/evaluate_nnue.cpp +++ b/src/nnue/evaluate_nnue.cpp @@ -159,10 +159,7 @@ namespace Eval::NNUE { // Evaluation function. Perform differential calculation. Value evaluate(const Position& pos) { - Value v = ComputeScore(pos, false); - v = Utility::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1); - - return v; + return ComputeScore(pos, false); } // Evaluation function. Perform full calculation. From cd0b8b4cf28208fffef931322749205a0ddc6066 Mon Sep 17 00:00:00 2001 From: Joost VandeVondele Date: Fri, 14 Aug 2020 22:18:12 +0200 Subject: [PATCH 25/58] Use NNUE more for fortresses Increases the use of NNUE evaluation in positions without captures/pawn moves, by increasing the NNUEThreshold threshold with rule50_count. This patch will force Stockfish to use NNUE eval more and more in materially unbalanced positions, when it seems that the classical eval is struggling to win and only manages to shuffle. This will ask the (slower) NNUE eval to double-check the potential fortress branches of the search tree, but only when necessary. passed STC: https://tests.stockfishchess.org/tests/view/5f36f1bf11a9b1a1dbf192d8 LLR: 2.93 (-2.94,2.94) {-0.50,1.50} Total: 51824 W: 5836 L: 5653 D: 40335 Ptnml(0-2): 264, 4356, 16512, 4493, 287 passed LTC: https://tests.stockfishchess.org/tests/view/5f37836111a9b1a1dbf1936d LLR: 2.93 (-2.94,2.94) {0.25,1.75} Total: 29768 W: 1747 L: 1590 D: 26431 Ptnml(0-2): 33, 1347, 11977, 1484, 43 closes https://github.com/official-stockfish/Stockfish/pull/3011 Bench: 4173967 --- src/evaluate.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/evaluate.cpp b/src/evaluate.cpp index a453fa0f..3a620a78 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -939,7 +939,7 @@ make_v: Value Eval::evaluate(const Position& pos) { bool classical = !Eval::useNNUE - || abs(eg_value(pos.psq_score())) >= NNUEThreshold; + || abs(eg_value(pos.psq_score())) >= NNUEThreshold * (16 + pos.rule50_count()) / 16; Value v = classical ? Evaluation(pos).value() : NNUE::evaluate(pos) * 5 / 4 + Tempo; From 8cf43c6317665295eece747ed1589ee33a435d2c Mon Sep 17 00:00:00 2001 From: Daylen Yang Date: Fri, 14 Aug 2020 19:53:46 -0700 Subject: [PATCH 26/58] Display NEON in compiler string if NEON intrinsics are being used and USE_NEON is defined. closes https://github.com/official-stockfish/Stockfish/pull/3008 No functional change --- src/misc.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/misc.cpp b/src/misc.cpp index 1cee4726..459ea100 100644 --- a/src/misc.cpp +++ b/src/misc.cpp @@ -242,6 +242,9 @@ const std::string compiler_info() { #if defined(USE_MMX) compiler += " MMX"; #endif + #if defined(USE_NEON) + compiler += " NEON"; + #endif #if !defined(NDEBUG) compiler += " DEBUG"; From 72dc7a5c54554a8c7c4bf68aa7de2d4de05f3294 Mon Sep 17 00:00:00 2001 From: syzygy1 <3028851+syzygy1@users.noreply.github.com> Date: Sat, 15 Aug 2020 16:50:39 +0200 Subject: [PATCH 27/58] Assume network file is in little-endian byte order This patch fixes the byte order when reading 16- and 32-bit values from the network file on a big-endian machine. Bytes are ordered in read_le() using unsigned arithmetic, which doesn't need tricks to determine the endianness of the machine. Unfortunately the compiler doesn't seem to be able to optimise the ordering operation, but reading in the weights is not a time-critical operation and the extra time it takes should not be noticeable. Big endian systems are still untested with NNUE. fixes #3007 closes https://github.com/official-stockfish/Stockfish/pull/3009 No functional change. --- src/nnue/evaluate_nnue.cpp | 8 ++++---- src/nnue/layers/affine_transform.h | 9 ++++----- src/nnue/nnue_common.h | 19 +++++++++++++++++++ src/nnue/nnue_feature_transformer.h | 8 ++++---- 4 files changed, 31 insertions(+), 13 deletions(-) diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp index a6ece8e2..3aa85943 100644 --- a/src/nnue/evaluate_nnue.cpp +++ b/src/nnue/evaluate_nnue.cpp @@ -77,7 +77,7 @@ namespace Eval::NNUE { bool ReadParameters(std::istream& stream, const AlignedPtr& pointer) { std::uint32_t header; - stream.read(reinterpret_cast(&header), sizeof(header)); + header = read_le(stream); if (!stream || header != T::GetHashValue()) return false; return pointer->ReadParameters(stream); } @@ -96,9 +96,9 @@ namespace Eval::NNUE { std::uint32_t* hash_value, std::string* architecture) { std::uint32_t version, size; - stream.read(reinterpret_cast(&version), sizeof(version)); - stream.read(reinterpret_cast(hash_value), sizeof(*hash_value)); - stream.read(reinterpret_cast(&size), sizeof(size)); + version = read_le(stream); + *hash_value = read_le(stream); + size = read_le(stream); if (!stream || version != kVersion) return false; architecture->resize(size); stream.read(&(*architecture)[0], size); diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index 322e3240..bac258e8 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -62,11 +62,10 @@ namespace Eval::NNUE::Layers { // Read network parameters bool ReadParameters(std::istream& stream) { if (!previous_layer_.ReadParameters(stream)) return false; - stream.read(reinterpret_cast(biases_), - kOutputDimensions * sizeof(BiasType)); - stream.read(reinterpret_cast(weights_), - kOutputDimensions * kPaddedInputDimensions * - sizeof(WeightType)); + for (std::size_t i = 0; i < kOutputDimensions; ++i) + biases_[i] = read_le(stream); + for (std::size_t i = 0; i < kOutputDimensions * kPaddedInputDimensions; ++i) + weights_[i] = read_le(stream); return !stream.fail(); } diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h index eab7d258..61f18aee 100644 --- a/src/nnue/nnue_common.h +++ b/src/nnue/nnue_common.h @@ -21,6 +21,9 @@ #ifndef NNUE_COMMON_H_INCLUDED #define NNUE_COMMON_H_INCLUDED +#include +#include + #if defined(USE_AVX2) #include @@ -101,6 +104,22 @@ namespace Eval::NNUE { return (n + base - 1) / base * base; } + // Read a signed or unsigned integer from a stream in little-endian order + template + inline IntType read_le(std::istream& stream) { + // Read the relevant bytes from the stream in little-endian order + std::uint8_t u[sizeof(IntType)]; + stream.read(reinterpret_cast(u), sizeof(IntType)); + // Use unsigned arithmetic to convert to machine order + typename std::make_unsigned::type v = 0; + for (std::size_t i = 0; i < sizeof(IntType); ++i) + v = (v << 8) | u[sizeof(IntType) - i - 1]; + // Copy the machine-ordered bytes into a potentially signed value + IntType w; + std::memcpy(&w, &v, sizeof(IntType)); + return w; + } + } // namespace Eval::NNUE #endif // #ifndef NNUE_COMMON_H_INCLUDED diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 40f2603d..4db9be9f 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -55,10 +55,10 @@ namespace Eval::NNUE { // Read network parameters bool ReadParameters(std::istream& stream) { - stream.read(reinterpret_cast(biases_), - kHalfDimensions * sizeof(BiasType)); - stream.read(reinterpret_cast(weights_), - kHalfDimensions * kInputDimensions * sizeof(WeightType)); + for (std::size_t i = 0; i < kHalfDimensions; ++i) + biases_[i] = read_le(stream); + for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i) + weights_[i] = read_le(stream); return !stream.fail(); } From 65572de4a79ab017c19d85eacee865afe7bfc7c1 Mon Sep 17 00:00:00 2001 From: Joost VandeVondele Date: Sun, 16 Aug 2020 13:21:07 +0200 Subject: [PATCH 28/58] Add further targets to travis testing general-32, general-64 and help closes https://github.com/official-stockfish/Stockfish/pull/3014 No functional change --- .travis.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 0dd38047..45f1bd3d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -51,11 +51,12 @@ script: - export benchref=$(cat git_sig) - echo "Reference bench:" $benchref - # # Compiler version string - $COMPILER -v - # + # test help target + - make help + # Verify bench number against various builds - export CXXFLAGS="-Werror -D_GLIBCXX_DEBUG" - make clean && make -j2 ARCH=x86-64-modern optimize=no debug=yes build && ../tests/signature.sh $benchref @@ -64,8 +65,10 @@ script: - make clean && make -j2 ARCH=x86-64-ssse3 build && ../tests/signature.sh $benchref - make clean && make -j2 ARCH=x86-64-sse3-popcnt build && ../tests/signature.sh $benchref - make clean && make -j2 ARCH=x86-64 build && ../tests/signature.sh $benchref + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-64 build && ../tests/signature.sh $benchref; fi - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 optimize=no debug=yes build && ../tests/signature.sh $benchref; fi - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 build && ../tests/signature.sh $benchref; fi + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-32 build && ../tests/signature.sh $benchref; fi - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-old build && ../tests/signature.sh $benchref; fi - if [[ "$TRAVIS_OS_NAME" == "linux" && "$COMP" == "gcc" ]]; then make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref; fi From 81d716f5ccff3f0898ae985b9ef69f79d014bdc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ste=CC=81phane=20Nicolet?= Date: Sun, 16 Aug 2020 21:46:54 +0200 Subject: [PATCH 29/58] Reformat code in little-endian patch Reformat code and rename the function to "read_little_endian()" in the recent commit by Ronald de Man for support of big endian systems. closes https://github.com/official-stockfish/Stockfish/pull/3016 No functional change ----- Recommended net: https://tests.stockfishchess.org/api/nn/nn-82215d0fd0df.nnue --- src/nnue/evaluate_nnue.cpp | 14 +++++++------- src/nnue/layers/affine_transform.h | 4 ++-- src/nnue/nnue_common.h | 30 +++++++++++++++-------------- src/nnue/nnue_feature_transformer.h | 4 ++-- 4 files changed, 27 insertions(+), 25 deletions(-) diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp index 3aa85943..dfbb1ac2 100644 --- a/src/nnue/evaluate_nnue.cpp +++ b/src/nnue/evaluate_nnue.cpp @@ -77,7 +77,7 @@ namespace Eval::NNUE { bool ReadParameters(std::istream& stream, const AlignedPtr& pointer) { std::uint32_t header; - header = read_le(stream); + header = read_little_endian(stream); if (!stream || header != T::GetHashValue()) return false; return pointer->ReadParameters(stream); } @@ -92,13 +92,13 @@ namespace Eval::NNUE { } // Read network header - bool ReadHeader(std::istream& stream, - std::uint32_t* hash_value, std::string* architecture) { - + bool ReadHeader(std::istream& stream, std::uint32_t* hash_value, std::string* architecture) + { std::uint32_t version, size; - version = read_le(stream); - *hash_value = read_le(stream); - size = read_le(stream); + + version = read_little_endian(stream); + *hash_value = read_little_endian(stream); + size = read_little_endian(stream); if (!stream || version != kVersion) return false; architecture->resize(size); stream.read(&(*architecture)[0], size); diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index bac258e8..7ac5a1c0 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -63,9 +63,9 @@ namespace Eval::NNUE::Layers { bool ReadParameters(std::istream& stream) { if (!previous_layer_.ReadParameters(stream)) return false; for (std::size_t i = 0; i < kOutputDimensions; ++i) - biases_[i] = read_le(stream); + biases_[i] = read_little_endian(stream); for (std::size_t i = 0; i < kOutputDimensions * kPaddedInputDimensions; ++i) - weights_[i] = read_le(stream); + weights_[i] = read_little_endian(stream); return !stream.fail(); } diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h index 61f18aee..4c93e3d1 100644 --- a/src/nnue/nnue_common.h +++ b/src/nnue/nnue_common.h @@ -101,23 +101,25 @@ namespace Eval::NNUE { // Round n up to be a multiple of base template constexpr IntType CeilToMultiple(IntType n, IntType base) { - return (n + base - 1) / base * base; + return (n + base - 1) / base * base; } - // Read a signed or unsigned integer from a stream in little-endian order + // read_little_endian() is our utility to read an integer (signed or unsigned, any size) + // from a stream in little-endian order. We swap the byte order after the read if + // necessary to return a result with the byte ordering of the compiling machine. template - inline IntType read_le(std::istream& stream) { - // Read the relevant bytes from the stream in little-endian order - std::uint8_t u[sizeof(IntType)]; - stream.read(reinterpret_cast(u), sizeof(IntType)); - // Use unsigned arithmetic to convert to machine order - typename std::make_unsigned::type v = 0; - for (std::size_t i = 0; i < sizeof(IntType); ++i) - v = (v << 8) | u[sizeof(IntType) - i - 1]; - // Copy the machine-ordered bytes into a potentially signed value - IntType w; - std::memcpy(&w, &v, sizeof(IntType)); - return w; + inline IntType read_little_endian(std::istream& stream) { + + IntType result; + std::uint8_t u[sizeof(IntType)]; + typename std::make_unsigned::type v = 0; + + stream.read(reinterpret_cast(u), sizeof(IntType)); + for (std::size_t i = 0; i < sizeof(IntType); ++i) + v = (v << 8) | u[sizeof(IntType) - i - 1]; + + std::memcpy(&result, &v, sizeof(IntType)); + return result; } } // namespace Eval::NNUE diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 4db9be9f..43707610 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -56,9 +56,9 @@ namespace Eval::NNUE { // Read network parameters bool ReadParameters(std::istream& stream) { for (std::size_t i = 0; i < kHalfDimensions; ++i) - biases_[i] = read_le(stream); + biases_[i] = read_little_endian(stream); for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i) - weights_[i] = read_le(stream); + weights_[i] = read_little_endian(stream); return !stream.fail(); } From 0e17a89e4dee73bd46e496cf6bed467432f116e6 Mon Sep 17 00:00:00 2001 From: Unai Corzo Date: Mon, 17 Aug 2020 09:22:15 +0200 Subject: [PATCH 30/58] Simplify away the passed pawn extension STC https://tests.stockfishchess.org/tests/view/5f3955f0e98b6c64b3df41d7 LLR: 2.96 (-2.94,2.94) {-1.50,0.50} Total: 31992 W: 3611 L: 3548 D: 24833 Ptnml(0-2): 174, 2658, 10273, 2713, 178 LTC https://tests.stockfishchess.org/tests/view/5f399e41e98b6c64b3df4210 LLR: 3.01 (-2.94,2.94) {-1.50,0.50} Total: 29568 W: 1488 L: 1480 D: 26600 Ptnml(0-2): 40, 1272, 12142, 1300, 30 closes https://github.com/official-stockfish/Stockfish/pull/3017 bench: 3844671 ----- Recommended net: https://tests.stockfishchess.org/api/nn/nn-82215d0fd0df.nnue --- src/search.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/search.cpp b/src/search.cpp index c5b4332f..83fb722f 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -1126,12 +1126,6 @@ moves_loop: // When in check, search starts from here && (pos.is_discovery_check_on_king(~us, move) || pos.see_ge(move))) extension = 1; - // Passed pawn extension - else if ( move == ss->killers[0] - && pos.advanced_pawn_push(move) - && pos.pawn_passed(us, to_sq(move))) - extension = 1; - // Last captures extension else if ( PieceValue[EG][pos.captured_piece()] > PawnValueEg && pos.non_pawn_material() <= 2 * RookValueMg) From 65b976439f8867e81682c0b66da6796ad3176177 Mon Sep 17 00:00:00 2001 From: notruck <56622488+notruck@users.noreply.github.com> Date: Sun, 16 Aug 2020 08:59:13 -0700 Subject: [PATCH 31/58] Support building for Android using NDK The easiest way to use the NDK in conjunction with this Makefile (tested on linux-x86_64): 1. Download the latest NDK (r21d) from Google from https://developer.android.com/ndk/downloads 2. Place and unzip the NDK in $HOME/ndk folder 3. Export the path variable e.g., `export PATH=$PATH:$HOME/ndk/android-ndk-r21d/toolchains/llvm/prebuilt/linux-x86_64/bin` 4. cd to your Stockfish/src dir 5. Issue `make -j ARCH=armv8 COMP=ndk build` (use `ARCH=armv7` or `ARCH=armv7-neon` for older CPUs) 6. Optionally `make -j ARCH=armv8 COMP=ndk strip` 7. That's all. Enjoy! Improves support from Raspberry Pi (incomplete?) and compiling on arm in general closes https://github.com/official-stockfish/Stockfish/pull/3015 fixes https://github.com/official-stockfish/Stockfish/issues/2860 fixes https://github.com/official-stockfish/Stockfish/issues/2641 Support is still fragile as we're missing CI on these targets. Nevertheless tested with: ```bash # build crosses from ubuntu 20.04 on x86 to various arch/OS combos # tested with suitable packages installed # (build-essentials, mingw-w64, g++-arm-linux-gnueabihf, NDK (r21d) from google) # cross to Android export PATH=$HOME/ndk/android-ndk-r21d/toolchains/llvm/prebuilt/linux-x86_64/bin:$PATH make clean && make -j build ARCH=armv7 COMP=ndk && make -j build ARCH=armv7 COMP=ndk strip make clean && make -j build ARCH=armv7-neon COMP=ndk && make -j build ARCH=armv7-neon COMP=ndk strip make clean && make -j build ARCH=armv8 COMP=ndk && make -j build ARCH=armv8 COMP=ndk strip # cross to Raspberry Pi make clean && make -j build ARCH=armv7 COMP=gcc COMPILER=arm-linux-gnueabihf-g++ make clean && make -j build ARCH=armv7-neon COMP=gcc COMPILER=arm-linux-gnueabihf-g++ # cross to Windows make clean && make -j build ARCH=x86-64-modern COMP=mingw ``` No functional change --- AUTHORS | 1 + src/Makefile | 65 ++++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 56 insertions(+), 10 deletions(-) diff --git a/AUTHORS b/AUTHORS index 41b89705..d8f4d30e 100644 --- a/AUTHORS +++ b/AUTHORS @@ -127,6 +127,7 @@ Niklas Fiekas (niklasf) Nikolay Kostov (NikolayIT) Nguyen Pham (nguyenpham) Norman Schmidt (FireFather) +notruck Ondrej Mosnáček (WOnder93) Oskar Werkelin Ahlin Pablo Vazquez diff --git a/src/Makefile b/src/Makefile index 38f607cb..0f458aa1 100644 --- a/src/Makefile +++ b/src/Makefile @@ -102,6 +102,7 @@ avx512 = no vnni = no neon = no ARCH = x86-64-modern +STRIP = strip ### 2.2 Architecture specific @@ -208,6 +209,14 @@ ifeq ($(ARCH),armv7) bits = 32 endif +ifeq ($(ARCH),armv7-neon) + arch = armv7 + prefetch = yes + popcnt = yes + neon = yes + bits = 32 +endif + ifeq ($(ARCH),armv8) arch = armv8-a prefetch = yes @@ -251,7 +260,7 @@ ifeq ($(COMP),gcc) CXX=g++ CXXFLAGS += -pedantic -Wextra -Wshadow - ifeq ($(ARCH),$(filter $(ARCH),armv7 armv8)) + ifeq ($(arch),$(filter $(arch),armv7 armv8-a)) ifeq ($(OS),Android) CXXFLAGS += -m$(bits) LDFLAGS += -m$(bits) @@ -261,6 +270,10 @@ ifeq ($(COMP),gcc) LDFLAGS += -m$(bits) endif + ifeq ($(arch),$(filter $(arch),armv7)) + LDFLAGS += -latomic + endif + ifneq ($(KERNEL),Darwin) LDFLAGS += -Wl,--no-as-needed endif @@ -311,7 +324,7 @@ ifeq ($(COMP),clang) endif endif - ifeq ($(ARCH),$(filter $(ARCH),armv7 armv8)) + ifeq ($(arch),$(filter $(arch),armv7 armv8)) ifeq ($(OS),Android) CXXFLAGS += -m$(bits) LDFLAGS += -m$(bits) @@ -340,6 +353,25 @@ ifeq ($(KERNEL),Darwin) LDFLAGS += -arch $(arch) -mmacosx-version-min=10.14 endif +# To cross-compile for Android, NDK version r21 or later is recommended. +# In earlier NDK versions, you'll need to pass -fno-addrsig if using GNU binutils. +# Currently we don't know how to make PGO builds with the NDK yet. +ifeq ($(COMP),ndk) + CXXFLAGS += -stdlib=libc++ -fPIE + ifeq ($(arch),armv7) + comp=armv7a-linux-androideabi16-clang + CXX=armv7a-linux-androideabi16-clang++ + CXXFLAGS += -mthumb -march=armv7-a -mfloat-abi=softfp -mfpu=neon + STRIP=arm-linux-androideabi-strip + endif + ifeq ($(arch),armv8-a) + comp=aarch64-linux-android21-clang + CXX=aarch64-linux-android21-clang++ + STRIP=aarch64-linux-android-strip + endif + LDFLAGS += -static-libstdc++ -pie -lm -latomic +endif + ### Travis CI script uses COMPILER to overwrite CXX ifdef COMPILER COMPCXX=$(COMPILER) @@ -356,7 +388,9 @@ ifneq ($(comp),mingw) ifneq ($(OS),Android) # Haiku has pthreads in its libroot, so only link it in on other platforms ifneq ($(KERNEL),Haiku) - LDFLAGS += -lpthread + ifneq ($(COMP),ndk) + LDFLAGS += -lpthread + endif endif endif endif @@ -401,7 +435,6 @@ endif ifeq ($(prefetch),yes) ifeq ($(sse),yes) CXXFLAGS += -msse - DEPENDFLAGS += -msse endif else CXXFLAGS += -DNO_PREFETCH @@ -409,7 +442,7 @@ endif ### 3.6 popcnt ifeq ($(popcnt),yes) - ifeq ($(arch),$(filter $(arch),ppc64 armv8-a arm64)) + ifeq ($(arch),$(filter $(arch),ppc64 armv7 armv8-a arm64)) CXXFLAGS += -DUSE_POPCNT else ifeq ($(comp),icc) CXXFLAGS += -msse3 -DUSE_POPCNT @@ -418,6 +451,7 @@ ifeq ($(popcnt),yes) endif endif + ifeq ($(avx2),yes) CXXFLAGS += -DUSE_AVX2 ifeq ($(comp),$(filter $(comp),gcc clang mingw)) @@ -462,6 +496,11 @@ endif ifeq ($(neon),yes) CXXFLAGS += -DUSE_NEON + ifeq ($(KERNEL),Linux) + ifneq ($(COMP),ndk) + CXXFLAGS += -mfpu=neon + endif + endif endif ifeq ($(arch),x86_64) @@ -481,7 +520,10 @@ endif ### needs access to the optimization flags. ifeq ($(optimize),yes) ifeq ($(debug), no) - ifeq ($(comp),clang) + ifeq ($(COMP),ndk) + CXXFLAGS += -flto=thin + LDFLAGS += $(CXXFLAGS) + else ifeq ($(comp),clang) CXXFLAGS += -flto=thin LDFLAGS += $(CXXFLAGS) @@ -502,7 +544,7 @@ ifeq ($(debug), no) endif # To use LTO and static linking on windows, the tool chain requires a recent gcc: -# gcc version 10.1 in msys2 or TDM-GCC version 9.2 are know to work, older might not. +# gcc version 10.1 in msys2 or TDM-GCC version 9.2 are known to work, older might not. # So, only enable it for a cross from Linux by default. else ifeq ($(comp),mingw) ifeq ($(KERNEL),Linux) @@ -556,7 +598,8 @@ help: @echo "ppc-64 > PPC 64-bit" @echo "ppc-32 > PPC 32-bit" @echo "armv7 > ARMv7 32-bit" - @echo "armv8 > ARMv8 64-bit" + @echo "armv7-neon" > ARMv7 32-bit with popcnt and neon" + @echo "armv8 > ARMv8 64-bit with popcnt and neon" @echo "apple-silicon > Apple silicon ARM64" @echo "general-64 > unspecified 64-bit" @echo "general-32 > unspecified 32-bit" @@ -567,6 +610,7 @@ help: @echo "mingw > Gnu compiler with MinGW under Windows" @echo "clang > LLVM Clang compiler" @echo "icc > Intel compiler" + @echo "ndk > Google NDK to cross-compile for Android" @echo "" @echo "Simple examples. If you don't know what to do, you likely want to run: " @echo "" @@ -609,7 +653,7 @@ profile-build: net config-sanity objclean profileclean $(MAKE) ARCH=$(ARCH) COMP=$(COMP) profileclean strip: - strip $(EXE) + $(STRIP) $(EXE) install: -mkdir -p -m 755 $(BINDIR) @@ -693,7 +737,8 @@ config-sanity: @test "$(avx512)" = "yes" || test "$(avx512)" = "no" @test "$(vnni)" = "yes" || test "$(vnni)" = "no" @test "$(neon)" = "yes" || test "$(neon)" = "no" - @test "$(comp)" = "gcc" || test "$(comp)" = "icc" || test "$(comp)" = "mingw" || test "$(comp)" = "clang" + @test "$(comp)" = "gcc" || test "$(comp)" = "icc" || test "$(comp)" = "mingw" || test "$(comp)" = "clang" \ + || test "$(comp)" = "armv7a-linux-androideabi16-clang" || test "$(comp)" = "aarch64-linux-android21-clang" $(EXE): $(OBJS) +$(CXX) -o $@ $(OBJS) $(LDFLAGS) From 1c0b7bdf4f77b8160cebe8af96b28230e870a136 Mon Sep 17 00:00:00 2001 From: VoyagerOne Date: Mon, 17 Aug 2020 08:58:03 -0400 Subject: [PATCH 32/58] Remove history bonus from Eval STC: LLR: 2.92 (-2.94,2.94) {-1.50,0.50} Total: 26776 W: 2787 L: 2725 D: 21264 https://tests.stockfishchess.org/tests/view/5f39d6beb38d442594aabd9b LTC: LLR: 2.93 (-2.94,2.94) {-1.50,0.50} Total: 12968 W: 635 L: 608 D: 11725 https://tests.stockfishchess.org/tests/view/5f39decfb38d442594aabda7 closes https://github.com/official-stockfish/Stockfish/pull/3019 Bench: 4335100 --- src/search.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/search.cpp b/src/search.cpp index 83fb722f..7c839dfc 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -794,11 +794,7 @@ namespace { else { if ((ss-1)->currentMove != MOVE_NULL) - { - int bonus = -(ss-1)->statScore / 512; - - ss->staticEval = eval = evaluate(pos) + bonus; - } + ss->staticEval = eval = evaluate(pos); else ss->staticEval = eval = -(ss-1)->staticEval + 2 * Tempo; From 581b92e4a70b99fa5a22f7a1a38f2c8d2099769f Mon Sep 17 00:00:00 2001 From: Unai Corzo Date: Mon, 17 Aug 2020 18:22:32 +0200 Subject: [PATCH 33/58] Remove last captures extension STC https://tests.stockfishchess.org/tests/view/5f395657e98b6c64b3df41dd LLR: 2.95 (-2.94,2.94) {-1.50,0.50} Total: 144664 W: 15426 L: 15537 D: 113701 Ptnml(0-2): 612, 11341, 48537, 11230, 612 LTC https://tests.stockfishchess.org/tests/view/5f3a2ec7b38d442594aabdd7 LLR: 2.96 (-2.94,2.94) {-1.50,0.50} Total: 22728 W: 1161 L: 1146 D: 20421 Ptnml(0-2): 21, 960, 9388, 973, 22 closes https://github.com/official-stockfish/Stockfish/pull/3020 bench: 3832662 --- src/search.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/search.cpp b/src/search.cpp index 7c839dfc..1d5bc5f7 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -1122,11 +1122,6 @@ moves_loop: // When in check, search starts from here && (pos.is_discovery_check_on_king(~us, move) || pos.see_ge(move))) extension = 1; - // Last captures extension - else if ( PieceValue[EG][pos.captured_piece()] > PawnValueEg - && pos.non_pawn_material() <= 2 * RookValueMg) - extension = 1; - // Castling extension if ( type_of(move) == CASTLING && popcount(pos.pieces(us) & ~pos.pieces(PAWN) & (to_sq(move) & KingSide ? KingSide : QueenSide)) <= 2) From 1bcc981a5a70e3065b4ff588644f270136fd7e3c Mon Sep 17 00:00:00 2001 From: mstembera Date: Sun, 16 Aug 2020 15:23:50 -0700 Subject: [PATCH 34/58] Fallback to NNUE If the classical eval ends up much smaller than estimated fall back to NNUE. Also use multiply instead of divide for the threshold comparison for smoother transitions without rounding. STC https://tests.stockfishchess.org/tests/view/5f3a5011b38d442594aabdfe LLR: 2.96 (-2.94,2.94) {-0.50,1.50} Total: 57352 W: 6325 L: 6135 D: 44892 Ptnml(0-2): 277, 4748, 18482, 4846, 323 LTC https://tests.stockfishchess.org/tests/view/5f3aee9db38d442594aabe82 LLR: 2.95 (-2.94,2.94) {0.25,1.75} Total: 16232 W: 897 L: 781 D: 14554 Ptnml(0-2): 19, 679, 6616, 771, 31 closes https://github.com/official-stockfish/Stockfish/pull/3023 bench: 4026216 ----- Recommended net: https://tests.stockfishchess.org/api/nn/nn-82215d0fd0df.nnue --- src/evaluate.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/evaluate.cpp b/src/evaluate.cpp index 3a620a78..1bd89353 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -114,7 +114,8 @@ namespace { constexpr Value LazyThreshold1 = Value(1400); constexpr Value LazyThreshold2 = Value(1300); constexpr Value SpaceThreshold = Value(12222); - constexpr Value NNUEThreshold = Value(575); + constexpr Value NNUEThreshold1 = Value(550); + constexpr Value NNUEThreshold2 = Value(150); // KingAttackWeights[PieceType] contains king attack weights by piece type constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 81, 52, 44, 10 }; @@ -939,10 +940,13 @@ make_v: Value Eval::evaluate(const Position& pos) { bool classical = !Eval::useNNUE - || abs(eg_value(pos.psq_score())) >= NNUEThreshold * (16 + pos.rule50_count()) / 16; + || abs(eg_value(pos.psq_score())) * 16 > NNUEThreshold1 * (16 + pos.rule50_count()); Value v = classical ? Evaluation(pos).value() : NNUE::evaluate(pos) * 5 / 4 + Tempo; + if (classical && Eval::useNNUE && abs(v) * 16 < NNUEThreshold2 * (16 + pos.rule50_count())) + v = NNUE::evaluate(pos) * 5 / 4 + Tempo; + // Damp down the evaluation linearly when shuffling v = v * (100 - pos.rule50_count()) / 100; From fbae5614eb1e82bccd37fbcfb0d2ca388b7a9a7d Mon Sep 17 00:00:00 2001 From: Joost VandeVondele Date: Tue, 18 Aug 2020 08:49:06 +0200 Subject: [PATCH 35/58] Fix Makefile typo remove stray quote, shown with `make help` No functional change --- src/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index 0f458aa1..1f8ba455 100644 --- a/src/Makefile +++ b/src/Makefile @@ -598,7 +598,7 @@ help: @echo "ppc-64 > PPC 64-bit" @echo "ppc-32 > PPC 32-bit" @echo "armv7 > ARMv7 32-bit" - @echo "armv7-neon" > ARMv7 32-bit with popcnt and neon" + @echo "armv7-neon > ARMv7 32-bit with popcnt and neon" @echo "armv8 > ARMv8 64-bit with popcnt and neon" @echo "apple-silicon > Apple silicon ARM64" @echo "general-64 > unspecified 64-bit" From 384d6844841e9f2da8f5a913c7620440f9e05ab5 Mon Sep 17 00:00:00 2001 From: Joost VandeVondele Date: Tue, 18 Aug 2020 18:06:28 +0200 Subject: [PATCH 36/58] Better error message on missing curl/wget provide clean error/warning message for missing curl/wget, sha256sum/shasum fixes https://github.com/official-stockfish/Stockfish/issues/3025 closes https://github.com/official-stockfish/Stockfish/pull/3026 No functional change --- src/Makefile | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/Makefile b/src/Makefile index 1f8ba455..a3feb68e 100644 --- a/src/Makefile +++ b/src/Makefile @@ -669,9 +669,24 @@ net: @echo "Default net: $(nnuenet)" $(eval nnuedownloadurl := https://tests.stockfishchess.org/api/nn/$(nnuenet)) $(eval curl_or_wget := $(shell if hash curl 2>/dev/null; then echo "curl -skL"; elif hash wget 2>/dev/null; then echo "wget -qO-"; fi)) - @if test -f "$(nnuenet)"; then echo "Already available."; else echo "Downloading $(nnuedownloadurl)"; $(curl_or_wget) $(nnuedownloadurl) > $(nnuenet); fi + @if test -f "$(nnuenet)"; then \ + echo "Already available."; \ + else \ + if [ "x$(curl_or_wget)" = "x" ]; then \ + echo "Automatic download failed: neither curl nor wget is installed. Install one of these tools or download the net manually"; exit 1; \ + else \ + echo "Downloading $(nnuedownloadurl)"; $(curl_or_wget) $(nnuedownloadurl) > $(nnuenet);\ + fi; \ + fi; $(eval shasum_command := $(shell if hash shasum 2>/dev/null; then echo "shasum -a 256 "; elif hash sha256sum 2>/dev/null; then echo "sha256sum "; fi)) - @if [ "$(nnuenet)" != "nn-"`$(shasum_command) $(nnuenet) | cut -c1-12`".nnue" ]; then echo "Failed download or $(nnuenet) corrupted, please delete!"; exit 1; fi + @if [ "x$(shasum_command)" != "x" ]; then \ + if [ "$(nnuenet)" != "nn-"`$(shasum_command) $(nnuenet) | cut -c1-12`".nnue" ]; then \ + echo "Failed download or $(nnuenet) corrupted, please delete!"; exit 1; \ + fi \ + else \ + echo "shasum / sha256sum not found, skipping net validation"; \ + fi + # clean binaries and objects objclean: From 42e8789f0b3935b7ea389b3aa929e05e0a016872 Mon Sep 17 00:00:00 2001 From: syzygy1 <3028851+syzygy1@users.noreply.github.com> Date: Tue, 18 Aug 2020 01:56:12 +0200 Subject: [PATCH 37/58] Expanded support for x86-32 architectures. add new ARCH targets x86-32-sse41-popcnt > x86 32-bit with sse41 and popcnt support x86-32-sse2 > x86 32-bit with sse2 support x86-32 > x86 32-bit generic (with mmx and sse support) retire x86-32-old (use general-32) closes https://github.com/official-stockfish/Stockfish/pull/3022 No functional change. --- .travis.yml | 3 +- src/Makefile | 145 +++++++++++++++++++++++++++++++-------------------- 2 files changed, 91 insertions(+), 57 deletions(-) diff --git a/.travis.yml b/.travis.yml index 45f1bd3d..12596f1e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -67,9 +67,10 @@ script: - make clean && make -j2 ARCH=x86-64 build && ../tests/signature.sh $benchref - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-64 build && ../tests/signature.sh $benchref; fi - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 optimize=no debug=yes build && ../tests/signature.sh $benchref; fi + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-sse41-popcnt build && ../tests/signature.sh $benchref; fi + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-sse2 build && ../tests/signature.sh $benchref; fi - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 build && ../tests/signature.sh $benchref; fi - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-32 build && ../tests/signature.sh $benchref; fi - - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-old build && ../tests/signature.sh $benchref; fi - if [[ "$TRAVIS_OS_NAME" == "linux" && "$COMP" == "gcc" ]]; then make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref; fi # compile only for some more advanced architectures (might not run in travis) diff --git a/src/Makefile b/src/Makefile index a3feb68e..79c7333a 100644 --- a/src/Makefile +++ b/src/Makefile @@ -67,11 +67,13 @@ endif # bits = 64/32 --- -DIS_64BIT --- 64-/32-bit operating system # prefetch = yes/no --- -DUSE_PREFETCH --- Use prefetch asm-instruction # popcnt = yes/no --- -DUSE_POPCNT --- Use popcnt asm-instruction +# pext = yes/no --- -DUSE_PEXT --- Use pext x86_64 asm-instruction # sse = yes/no --- -msse --- Use Intel Streaming SIMD Extensions +# mmx = yes/no --- -mmmx --- Use Intel MMX instructions +# sse2 = yes/no --- -msse2 --- Use Intel Streaming SIMD Extensions 2 # ssse3 = yes/no --- -mssse3 --- Use Intel Supplemental Streaming SIMD Extensions 3 # sse41 = yes/no --- -msse4.1 --- Use Intel Streaming SIMD Extensions 4.1 # avx2 = yes/no --- -mavx2 --- Use Intel Advanced Vector Extensions 2 -# pext = yes/no --- -DUSE_PEXT --- Use pext x86_64 asm-instruction # avx512 = yes/no --- -mavx512bw --- Use Intel Advanced Vector Extensions 512 # vnni = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 512 # neon = yes/no --- -DUSE_NEON --- Use ARM SIMD architecture @@ -92,12 +94,13 @@ sanitize = no bits = 64 prefetch = no popcnt = no -mmx = no +pext = no sse = no +mmx = no +sse2 = no ssse3 = no sse41 = no avx2 = no -pext = no avx512 = no vnni = no neon = no @@ -106,83 +109,82 @@ STRIP = strip ### 2.2 Architecture specific -ifeq ($(ARCH),general-32) - arch = any - bits = 32 -endif +ifeq ($(findstring x86,$(ARCH)),x86) -ifeq ($(ARCH),x86-32-old) +# x86-32/64 + +ifeq ($(findstring x86-32,$(ARCH)),x86-32) arch = i386 bits = 32 -endif - -ifeq ($(ARCH),x86-32) - arch = i386 - bits = 32 - prefetch = yes + sse = yes mmx = yes - sse = yes -endif - -ifeq ($(ARCH),general-64) - arch = any -endif - -ifeq ($(ARCH),x86-64) +else arch = x86_64 - prefetch = yes + sse = yes + sse2 = yes +endif + +ifeq ($(findstring -sse,$(ARCH)),-sse) sse = yes endif -ifeq ($(ARCH),x86-64-sse3-popcnt) - arch = x86_64 - prefetch = yes - sse = yes +ifeq ($(findstring -popcnt,$(ARCH)),-popcnt) popcnt = yes endif -ifeq ($(ARCH),x86-64-ssse3) - arch = x86_64 - prefetch = yes +ifeq ($(findstring -mmx,$(ARCH)),-mmx) + mmx = yes +endif + +ifeq ($(findstring -sse2,$(ARCH)),-sse2) sse = yes + sse2 = yes +endif + +ifeq ($(findstring -ssse3,$(ARCH)),-ssse3) + sse = yes + sse2 = yes ssse3 = yes endif -ifeq ($(ARCH),$(filter $(ARCH),x86-64-sse41-popcnt x86-64-modern)) - arch = x86_64 - prefetch = yes - popcnt = yes +ifeq ($(findstring -sse41,$(ARCH)),-sse41) sse = yes + sse2 = yes ssse3 = yes sse41 = yes endif -ifeq ($(ARCH),x86-64-avx2) - arch = x86_64 - prefetch = yes +ifeq ($(findstring -modern,$(ARCH)),-modern) popcnt = yes sse = yes + sse2 = yes + ssse3 = yes + sse41 = yes +endif + +ifeq ($(findstring -avx2,$(ARCH)),-avx2) + popcnt = yes + sse = yes + sse2 = yes ssse3 = yes sse41 = yes avx2 = yes endif -ifeq ($(ARCH),x86-64-bmi2) - arch = x86_64 - prefetch = yes +ifeq ($(findstring -bmi2,$(ARCH)),-bmi2) popcnt = yes sse = yes + sse2 = yes ssse3 = yes sse41 = yes avx2 = yes pext = yes endif -ifeq ($(ARCH),x86-64-avx512) - arch = x86_64 - prefetch = yes +ifeq ($(findstring -avx512,$(ARCH)),-avx512) popcnt = yes sse = yes + sse2 = yes ssse3 = yes sse41 = yes avx2 = yes @@ -190,11 +192,10 @@ ifeq ($(ARCH),x86-64-avx512) avx512 = yes endif -ifeq ($(ARCH),x86-64-vnni) - arch = x86_64 - prefetch = yes +ifeq ($(findstring -vnni,$(ARCH)),-vnni) popcnt = yes sse = yes + sse2 = yes ssse3 = yes sse41 = yes avx2 = yes @@ -203,6 +204,28 @@ ifeq ($(ARCH),x86-64-vnni) vnni = yes endif +ifeq ($(sse),yes) + prefetch = yes +endif + +# 64-bit pext is not available on x86-32 +ifeq ($(bits),32) + pext = no +endif + +else + +# all other architectures + +ifeq ($(ARCH),general-32) + arch = any + bits = 32 +endif + +ifeq ($(ARCH),general-64) + arch = any +endif + ifeq ($(ARCH),armv7) arch = armv7 prefetch = yes @@ -242,6 +265,8 @@ ifeq ($(ARCH),ppc-64) prefetch = yes endif +endif + ### ========================================================================== ### Section 3. Low-level Configuration ### ========================================================================== @@ -487,6 +512,13 @@ ifeq ($(ssse3),yes) endif endif +ifeq ($(sse2),yes) + CXXFLAGS += -DUSE_SSE2 + ifeq ($(comp),$(filter $(comp),gcc clang mingw)) + CXXFLAGS += -msse2 + endif +endif + ifeq ($(mmx),yes) CXXFLAGS += -DUSE_MMX ifeq ($(comp),$(filter $(comp),gcc clang mingw)) @@ -503,10 +535,6 @@ ifeq ($(neon),yes) endif endif -ifeq ($(arch),x86_64) - CXXFLAGS += -msse2 -DUSE_SSE2 -endif - ### 3.7 pext ifeq ($(pext),yes) CXXFLAGS += -DUSE_PEXT @@ -592,9 +620,10 @@ help: @echo "x86-64-modern > common modern CPU, currently x86-64-sse41-popcnt" @echo "x86-64-ssse3 > x86 64-bit with ssse3 support" @echo "x86-64-sse3-popcnt > x86 64-bit with sse3 and popcnt support" - @echo "x86-64 > x86 64-bit generic" - @echo "x86-32 > x86 32-bit (also enables MMX and SSE)" - @echo "x86-32-old > x86 32-bit fall back for old hardware" + @echo "x86-64 > x86 64-bit generic (with sse2 support)" + @echo "x86-32-sse41-popcnt > x86 32-bit with sse41 and popcnt support" + @echo "x86-32-sse2 > x86 32-bit with sse2 support" + @echo "x86-32 > x86 32-bit generic (with mmx and sse support)" @echo "ppc-64 > PPC 64-bit" @echo "ppc-32 > PPC 32-bit" @echo "armv7 > ARMv7 32-bit" @@ -624,7 +653,7 @@ help: @echo "make -j build ARCH=x86-64-ssse3 COMP=clang" @echo "" ifneq ($(empty_arch), yes) - @echo "-------------------------------\n" + @echo "-------------------------------" @echo "The selected architecture $(ARCH) will enable the following configuration: " @$(MAKE) ARCH=$(ARCH) COMP=$(COMP) config-sanity endif @@ -719,11 +748,13 @@ config-sanity: @echo "os: '$(OS)'" @echo "prefetch: '$(prefetch)'" @echo "popcnt: '$(popcnt)'" + @echo "pext: '$(pext)'" @echo "sse: '$(sse)'" + @echo "mmx: '$(mmx)'" + @echo "sse2: '$(sse2)'" @echo "ssse3: '$(ssse3)'" @echo "sse41: '$(sse41)'" @echo "avx2: '$(avx2)'" - @echo "pext: '$(pext)'" @echo "avx512: '$(avx512)'" @echo "vnni: '$(vnni)'" @echo "neon: '$(neon)'" @@ -744,11 +775,13 @@ config-sanity: @test "$(bits)" = "32" || test "$(bits)" = "64" @test "$(prefetch)" = "yes" || test "$(prefetch)" = "no" @test "$(popcnt)" = "yes" || test "$(popcnt)" = "no" + @test "$(pext)" = "yes" || test "$(pext)" = "no" @test "$(sse)" = "yes" || test "$(sse)" = "no" + @test "$(mmx)" = "yes" || test "$(mmx)" = "no" + @test "$(sse2)" = "yes" || test "$(sse2)" = "no" @test "$(ssse3)" = "yes" || test "$(ssse3)" = "no" @test "$(sse41)" = "yes" || test "$(sse41)" = "no" @test "$(avx2)" = "yes" || test "$(avx2)" = "no" - @test "$(pext)" = "yes" || test "$(pext)" = "no" @test "$(avx512)" = "yes" || test "$(avx512)" = "no" @test "$(vnni)" = "yes" || test "$(vnni)" = "no" @test "$(neon)" = "yes" || test "$(neon)" = "no" From 2deb08a52946379d4cebb1082e5d740d1d027122 Mon Sep 17 00:00:00 2001 From: SFisGOD Date: Tue, 18 Aug 2020 18:54:28 +0800 Subject: [PATCH 38/58] Reintroduce last captures extension STC: LLR: 2.93 (-2.94,2.94) {-0.50,1.50} Total: 34840 W: 3834 L: 3682 D: 27324 Ptnml(0-2): 153, 2767, 11455, 2865, 180 https://tests.stockfishchess.org/tests/view/5f3bb380b38d442594aabefc LTC: LLR: 2.95 (-2.94,2.94) {0.25,1.75} Total: 15832 W: 890 L: 776 D: 14166 Ptnml(0-2): 17, 669, 6429, 785, 16 https://tests.stockfishchess.org/tests/view/5f3c46a0a95672ddd56c632a closes https://github.com/official-stockfish/Stockfish/pull/3028 see also https://github.com/official-stockfish/Stockfish/pull/3020 Bench: 4348811 --- src/search.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/search.cpp b/src/search.cpp index 1d5bc5f7..7c839dfc 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -1122,6 +1122,11 @@ moves_loop: // When in check, search starts from here && (pos.is_discovery_check_on_king(~us, move) || pos.see_ge(move))) extension = 1; + // Last captures extension + else if ( PieceValue[EG][pos.captured_piece()] > PawnValueEg + && pos.non_pawn_material() <= 2 * RookValueMg) + extension = 1; + // Castling extension if ( type_of(move) == CASTLING && popcount(pos.pieces(us) & ~pos.pieces(PAWN) & (to_sq(move) & KingSide ? KingSide : QueenSide)) <= 2) From a1ad8604a11459a94189f857e368d0fbb72da25d Mon Sep 17 00:00:00 2001 From: Joost VandeVondele Date: Wed, 19 Aug 2020 19:21:41 +0200 Subject: [PATCH 39/58] Send error message as an UCI info string some GUIs do not show the error message when the engine terminates in the no-net case, as it is send to cerr. Instead send it as an info string, which the GUI will more likely display. closes https://github.com/official-stockfish/Stockfish/pull/3031 No functional change. --- src/evaluate.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/evaluate.cpp b/src/evaluate.cpp index 1bd89353..c84d894f 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -53,10 +53,11 @@ namespace Eval { UCI::OptionsMap defaults; UCI::init(defaults); - std::cerr << "NNUE evaluation used, but the network file " << eval_file << " was not loaded successfully. " - << "These network evaluation parameters must be available, and compatible with this version of the code. " - << "The UCI option EvalFile might need to specify the full path, including the directory/folder name, to the file. " - << "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/"+std::string(defaults["EvalFile"]) << std::endl; + sync_cout << "info string ERROR: NNUE evaluation used, but the network file " << eval_file << " was not loaded successfully." << sync_endl; + sync_cout << "info string ERROR: The UCI option EvalFile might need to specify the full path, including the directory/folder name, to the file." << sync_endl; + sync_cout << "info string ERROR: The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/"+std::string(defaults["EvalFile"]) << sync_endl; + sync_cout << "info string ERROR: If the UCI option Use NNUE is set to true, network evaluation parameters compatible with the program must be available." << sync_endl; + sync_cout << "info string ERROR: The engine will be terminated now." << sync_endl; std::exit(EXIT_FAILURE); } From daac86691de55fe388b9b727794c7d27f2b90d5c Mon Sep 17 00:00:00 2001 From: Joost VandeVondele Date: Thu, 20 Aug 2020 14:24:49 +0200 Subject: [PATCH 40/58] Set Use NNUE by default to true Since the initial stages of the merge, progress has been made so that this seems the best option now: * NNUE is clearly stronger on most relevant hardware and time controls * All of our CI and testing infrastructure has been adjusted * The default net is easy to get (further ideas #3030) fixes https://github.com/official-stockfish/Stockfish/issues/2861 closes https://github.com/official-stockfish/Stockfish/pull/3033 No functional change. --- src/ucioption.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ucioption.cpp b/src/ucioption.cpp index 2b66a475..ec83c7c8 100644 --- a/src/ucioption.cpp +++ b/src/ucioption.cpp @@ -78,7 +78,7 @@ void init(OptionsMap& o) { o["SyzygyProbeDepth"] << Option(1, 1, 100); o["Syzygy50MoveRule"] << Option(true); o["SyzygyProbeLimit"] << Option(7, 0, 7); - o["Use NNUE"] << Option(false, on_use_NNUE); + o["Use NNUE"] << Option(true, on_use_NNUE); // The default must follow the format nn-[SHA256 first 12 digits].nnue // for the build process (profile-build and fishtest) to work. o["EvalFile"] << Option("nn-82215d0fd0df.nnue", on_eval_file); From 8b45b1c4907b4b2186441e02edd3b0c37f8b3269 Mon Sep 17 00:00:00 2001 From: Joost VandeVondele Date: Fri, 21 Aug 2020 07:42:19 +0200 Subject: [PATCH 41/58] Deal with very old linux kernels MADV_HUGEPAGE might not be available, for kernels before 2.6.38 (released 2011). Just skip the madvise. closes https://github.com/official-stockfish/Stockfish/pull/3039 No functional change --- src/misc.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/misc.cpp b/src/misc.cpp index 459ea100..56a3dcad 100644 --- a/src/misc.cpp +++ b/src/misc.cpp @@ -367,7 +367,9 @@ void* aligned_ttmem_alloc(size_t allocSize, void*& mem) { size_t size = ((allocSize + alignment - 1) / alignment) * alignment; // multiple of alignment if (posix_memalign(&mem, alignment, size)) mem = nullptr; +#if defined(MADV_HUGEPAGE) madvise(mem, allocSize, MADV_HUGEPAGE); +#endif return mem; } From 15abcaedc1e32d4de913a2f7dea12578912371b7 Mon Sep 17 00:00:00 2001 From: gsobala Date: Fri, 21 Aug 2020 11:28:53 +0100 Subject: [PATCH 42/58] Update Makefile for macOS Changes to deal with compilation (particularly profile-build) on macOS. (1) The default toolchain has gcc masquerading as clang, the previous Makefile was not picking up the required changes to the different profiling tools. (2) The previous Makefile test for gccisclang occurred before a potential overwrite of CXX by COMPCXX (3) llvm-profdata no longer runs as a command on macOS and instead is invoked by ``xcrun llvm-profdata`` (4) Needs to support use of true gcc using e.g. ``make build ... COMPCXX=g++-10`` (5) enable profile-build in travis for macOS closes https://github.com/official-stockfish/Stockfish/pull/3043 No functional change --- .travis.yml | 3 ++- AUTHORS | 1 + src/Makefile | 16 ++++++++++++---- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index 12596f1e..a029c4fc 100644 --- a/.travis.yml +++ b/.travis.yml @@ -71,7 +71,8 @@ script: - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-sse2 build && ../tests/signature.sh $benchref; fi - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 build && ../tests/signature.sh $benchref; fi - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-32 build && ../tests/signature.sh $benchref; fi - - if [[ "$TRAVIS_OS_NAME" == "linux" && "$COMP" == "gcc" ]]; then make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref; fi + # workaround: exclude a custom version of llvm+clang, which doesn't find llvm-profdata on ubuntu + - if [[ "$TRAVIS_OS_NAME" != "linux" || "$COMP" == "gcc" ]]; then make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref; fi # compile only for some more advanced architectures (might not run in travis) - make clean && make -j2 ARCH=x86-64-avx2 build diff --git a/AUTHORS b/AUTHORS index d8f4d30e..c96f870a 100644 --- a/AUTHORS +++ b/AUTHORS @@ -59,6 +59,7 @@ Fauzi Akram Dabat (FauziAkram) Felix Wittmann gamander Gary Heckman (gheckman) +George Sobala (gsobala) gguliash Gian-Carlo Pascutto (gcp) Gontran Lemaire (gonlem) diff --git a/src/Makefile b/src/Makefile index 79c7333a..b969ba04 100644 --- a/src/Makefile +++ b/src/Makefile @@ -302,9 +302,6 @@ ifeq ($(COMP),gcc) ifneq ($(KERNEL),Darwin) LDFLAGS += -Wl,--no-as-needed endif - - gccversion = $(shell $(CXX) --version) - gccisclang = $(findstring clang,$(gccversion)) endif ifeq ($(COMP),mingw) @@ -376,6 +373,7 @@ endif ifeq ($(KERNEL),Darwin) CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.14 LDFLAGS += -arch $(arch) -mmacosx-version-min=10.14 + XCRUN = xcrun endif # To cross-compile for Android, NDK version r21 or later is recommended. @@ -407,6 +405,16 @@ ifdef COMPCXX CXX=$(COMPCXX) endif +### Sometimes gcc is really clang +ifeq ($(COMP),gcc) + gccversion = $(shell $(CXX) --version) + gccisclang = $(findstring clang,$(gccversion)) + ifneq ($(gccisclang),) + profile_make = clang-profile-make + profile_use = clang-profile-use + endif +endif + ### On mingw use Windows threads, otherwise POSIX ifneq ($(comp),mingw) # On Android Bionic's C library comes with its own pthread implementation bundled in @@ -798,7 +806,7 @@ clang-profile-make: all clang-profile-use: - llvm-profdata merge -output=stockfish.profdata *.profraw + $(XCRUN) llvm-profdata merge -output=stockfish.profdata *.profraw $(MAKE) ARCH=$(ARCH) COMP=$(COMP) \ EXTRACXXFLAGS='-fprofile-instr-use=stockfish.profdata' \ EXTRALDFLAGS='-fprofile-use ' \ From e64b957274b94e89ad1a6e3ec4571c9082246a0a Mon Sep 17 00:00:00 2001 From: Unai Corzo Date: Fri, 21 Aug 2020 09:24:25 +0200 Subject: [PATCH 43/58] Simplify away internal iterative deepening Remove the iterative deepening step. Instead, employ a depth reduction if the position is not in TT and on the PV. STC https://tests.stockfishchess.org/tests/view/5f3ce6eaa95672ddd56c637e LLR: 2.97 (-2.94,2.94) {-0.50,1.50} Total: 41096 W: 4421 L: 4257 D: 32418 Ptnml(0-2): 207, 3259, 13460, 3407, 215 LTC (old) https://tests.stockfishchess.org/tests/view/5f3d7d4fa95672ddd56c640b LLR: 2.92 (-2.94,2.94) {-1.50,0.50} Total: 26032 W: 1320 L: 1309 D: 23403 Ptnml(0-2): 22, 1152, 10654, 1169, 19 LTC (new) https://tests.stockfishchess.org/tests/view/5f3e31e0a95672ddd56c6464 LLR: 2.95 (-2.94,2.94) {-0.75,0.25} Total: 34160 W: 1844 L: 1766 D: 30550 Ptnml(0-2): 33, 1533, 13876, 1599, 39 bench: 3849173 --- src/search.cpp | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/search.cpp b/src/search.cpp index 7c839dfc..ba13680c 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -939,15 +939,11 @@ namespace { } } - // Step 11. Internal iterative deepening (~1 Elo) - if (depth >= 7 && !ttMove) - { - search(pos, ss, alpha, beta, depth - 7, cutNode); - - tte = TT.probe(posKey, ttHit); - ttValue = ttHit ? value_from_tt(tte->value(), ss->ply, pos.rule50_count()) : VALUE_NONE; - ttMove = ttHit ? tte->move() : MOVE_NONE; - } + // Step 11. If the position is not in TT, decrease depth by 2 + if ( PvNode + && depth >= 6 + && !ttMove) + depth -= 2; moves_loop: // When in check, search starts from here From cbcb05ca092160137c166f84e7e9da3d6bb4e2d3 Mon Sep 17 00:00:00 2001 From: MJZ1977 <37274752+MJZ1977@users.noreply.github.com> Date: Fri, 21 Aug 2020 10:57:34 +0200 Subject: [PATCH 44/58] Display classic and NNUE evaluation in trace mode show both the classical and NNUE evaluation, as well as the Final evaluation. closes https://github.com/official-stockfish/Stockfish/pull/3042 No functional change. --- src/evaluate.cpp | 65 ++++++++++++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 30 deletions(-) diff --git a/src/evaluate.cpp b/src/evaluate.cpp index c84d894f..c66938d6 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -972,42 +972,47 @@ std::string Eval::trace(const Position& pos) { Value v; - if (Eval::useNNUE) - { - v = NNUE::evaluate(pos); - } - else - { - std::memset(scores, 0, sizeof(scores)); + std::memset(scores, 0, sizeof(scores)); - pos.this_thread()->contempt = SCORE_ZERO; // Reset any dynamic contempt + pos.this_thread()->contempt = SCORE_ZERO; // Reset any dynamic contempt - v = Evaluation(pos).value(); + v = Evaluation(pos).value(); - ss << std::showpoint << std::noshowpos << std::fixed << std::setprecision(2) - << " Term | White | Black | Total \n" - << " | MG EG | MG EG | MG EG \n" - << " ------------+-------------+-------------+------------\n" - << " Material | " << Term(MATERIAL) - << " Imbalance | " << Term(IMBALANCE) - << " Pawns | " << Term(PAWN) - << " Knights | " << Term(KNIGHT) - << " Bishops | " << Term(BISHOP) - << " Rooks | " << Term(ROOK) - << " Queens | " << Term(QUEEN) - << " Mobility | " << Term(MOBILITY) - << " King safety | " << Term(KING) - << " Threats | " << Term(THREAT) - << " Passed | " << Term(PASSED) - << " Space | " << Term(SPACE) - << " Winnable | " << Term(WINNABLE) - << " ------------+-------------+-------------+------------\n" - << " Total | " << Term(TOTAL); - } + ss << std::showpoint << std::noshowpos << std::fixed << std::setprecision(2) + << " Term | White | Black | Total \n" + << " | MG EG | MG EG | MG EG \n" + << " ------------+-------------+-------------+------------\n" + << " Material | " << Term(MATERIAL) + << " Imbalance | " << Term(IMBALANCE) + << " Pawns | " << Term(PAWN) + << " Knights | " << Term(KNIGHT) + << " Bishops | " << Term(BISHOP) + << " Rooks | " << Term(ROOK) + << " Queens | " << Term(QUEEN) + << " Mobility | " << Term(MOBILITY) + << " King safety | " << Term(KING) + << " Threats | " << Term(THREAT) + << " Passed | " << Term(PASSED) + << " Space | " << Term(SPACE) + << " Winnable | " << Term(WINNABLE) + << " ------------+-------------+-------------+------------\n" + << " Total | " << Term(TOTAL); v = pos.side_to_move() == WHITE ? v : -v; - ss << "\nFinal evaluation: " << to_cp(v) << " (white side)\n"; + ss << "\nClassical evaluation: " << to_cp(v) << " (white side)\n"; + + if (Eval::useNNUE) + { + v = NNUE::evaluate(pos); + v = pos.side_to_move() == WHITE ? v : -v; + ss << "\nNNUE evaluation: " << to_cp(v) << " (white side)\n"; + } + + v = evaluate(pos); + v = pos.side_to_move() == WHITE ? v : -v; + ss << "\nFinal evaluation: " << to_cp(v) << " (white side)\n"; + return ss.str(); } From 34f67c57223d73ad40d583ccc033c75eb0df2453 Mon Sep 17 00:00:00 2001 From: Joost VandeVondele Date: Fri, 21 Aug 2020 22:10:55 +0200 Subject: [PATCH 45/58] Explicitly rely on pthreads if possible allows us to set the needed stacksize on thread creation. Useful for environments with too small a default stack size (e.g. Alpine Linux with musl). Passed STC, no regression: LLR: 2.96 (-2.94,2.94) {-1.25,0.25} Total: 17816 W: 1344 L: 1275 D: 15197 Ptnml(0-2): 30, 1057, 6682, 1092, 47 https://tests.stockfishchess.org/tests/view/5f402b5587a5c3c63d8f534d closes https://github.com/official-stockfish/Stockfish/pull/3047 fixes https://github.com/official-stockfish/Stockfish/issues/3041 No functional change. --- src/Makefile | 1 + src/thread_win32_osx.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index b969ba04..74ef87b9 100644 --- a/src/Makefile +++ b/src/Makefile @@ -417,6 +417,7 @@ endif ### On mingw use Windows threads, otherwise POSIX ifneq ($(comp),mingw) + CXXFLAGS += -DUSE_PTHREADS # On Android Bionic's C library comes with its own pthread implementation bundled in ifneq ($(OS),Android) # Haiku has pthreads in its libroot, so only link it in on other platforms diff --git a/src/thread_win32_osx.h b/src/thread_win32_osx.h index c4b55a48..75ef5d9a 100644 --- a/src/thread_win32_osx.h +++ b/src/thread_win32_osx.h @@ -27,7 +27,7 @@ /// The implementation calls pthread_create() with the stack size parameter /// equal to the linux 8MB default, on platforms that support it. -#if defined(__APPLE__) || defined(__MINGW32__) || defined(__MINGW64__) +#if defined(__APPLE__) || defined(__MINGW32__) || defined(__MINGW64__) || defined(USE_PTHREADS) #include From 3542033342f15625f808013b69aa8c2d274a2f91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Nicolet?= Date: Sat, 22 Aug 2020 11:37:53 +0200 Subject: [PATCH 46/58] Instructions to build on older Macintosh In recent Macs, it is possible to use the Clang compiler provided by Apple to compile Stockfish out of the box, and this is the method used by default in our Makefile (the Makefile sets the macosx-version-min=10.14 flag to select the right libc++ library for the Clang compiler with recent c++17 support). But it is quite possible to compile and run Stockfish on older Macs! Below we describe a method to install a recent GNU compiler on these Macs, to get the c++17 support. We have tested the following procedure to install gcc10 on machines running Mac OS 10.7, Mac OS 10.9 and Mac OS 10.13: 1) install XCode for your machine. 2) install Apple command-line developer tools for XCode, by typing the following command in a Terminal: ``` sudo xcode-select --install ``` 3) go to the Stockfish "src" directory, then try a default build and run Stockfish: ``` make clean make build make net ./stockfish ``` 4) if step 3 worked, congrats! You have a compiler recent enough on your Mac to compile Stockfish. If not, continue with step 5 to install GNU gcc10 :-) 5) install the MacPorts package manager (https://www.macports.org/install.php), for instance using the fast method in the "macOS Package (.pkg) Installer" section of the page. 6) use the "port" command to install the gcc10 package of MacPorts by typing the following command: ``` sudo port install gcc10 ``` With this step, MacPorts will install the gcc10 compiler under the name "g++-mp-10" in the /opt/local/bin directory: ``` which g++-mp-10 /opt/local/bin/g++-mp-10 <--- answer ``` 7) You can now go back to the "src" directory of Stockfish, and try to build Stockfish by pointing at the right compiler: ``` make clean make build COMP=gcc COMPCXX=/opt/local/bin/g++-mp-10 make net ./stockfish ``` 8) Enjoy Stockfish on Macintosh! See this pull request for further discussion: https://github.com/official-stockfish/Stockfish/pull/3049 No functional change --- src/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index 74ef87b9..b0274504 100644 --- a/src/Makefile +++ b/src/Makefile @@ -370,7 +370,7 @@ else endif endif -ifeq ($(KERNEL),Darwin) +ifeq ($(KERNEL),Darwin) CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.14 LDFLAGS += -arch $(arch) -mmacosx-version-min=10.14 XCRUN = xcrun From 5f1843c9cb55afcd3fb1da9e9dc4b0092f25d9f0 Mon Sep 17 00:00:00 2001 From: Joost VandeVondele Date: Sat, 11 Jul 2020 16:59:33 +0200 Subject: [PATCH 47/58] Small trivial cleanups closes https://github.com/official-stockfish/Stockfish/pull/2801 No functional change --- README.md | 53 +++++++++++++++++++++++++----------------------- src/Makefile | 2 +- src/bitboard.cpp | 12 ++++++++++- src/bitboard.h | 10 --------- src/evaluate.cpp | 17 ++++++++-------- src/material.cpp | 2 +- src/misc.cpp | 17 ++++++++-------- src/misc.h | 8 -------- src/movegen.cpp | 2 +- src/movepick.cpp | 14 ++++++------- src/movepick.h | 8 ++++---- src/pawns.cpp | 2 +- src/position.cpp | 4 ++-- src/position.h | 5 +++++ src/search.cpp | 15 +++++++------- src/timeman.cpp | 18 ++++++++-------- src/uci.cpp | 2 +- 17 files changed, 96 insertions(+), 95 deletions(-) diff --git a/README.md b/README.md index 7b6ddf4c..2cc88bf4 100644 --- a/README.md +++ b/README.md @@ -4,17 +4,17 @@ [![Build Status](https://ci.appveyor.com/api/projects/status/github/official-stockfish/Stockfish?branch=master&svg=true)](https://ci.appveyor.com/project/mcostalba/stockfish/branch/master) [Stockfish](https://stockfishchess.org) is a free, powerful UCI chess engine -derived from Glaurung 2.1. It features two evaluation functions, the classical -evaluation based on handcrafted terms, and the NNUE evaluation based on -efficiently updateable neural networks. The classical evaluation runs efficiently -on most 64bit CPU architectures, while the NNUE evaluation benefits strongly from the -vector intrinsics available on modern CPUs (avx2 or similar). +derived from Glaurung 2.1. Stockfish is not a complete chess program and requires a +UCI-compatible graphical user interface (GUI) (e.g. XBoard with PolyGlot, Scid, +Cute Chess, eboard, Arena, Sigma Chess, Shredder, Chess Partner or Fritz) in order +to be used comfortably. Read the documentation for your GUI of choice for information +about how to use Stockfish with it. -Stockfish is not a complete chess program and requires a -UCI-compatible GUI (e.g. XBoard with PolyGlot, Scid, Cute Chess, eboard, Arena, -Sigma Chess, Shredder, Chess Partner or Fritz) in order to be used comfortably. -Read the documentation for your GUI of choice for information about how to use -Stockfish with it. +The Stockfish engine features two evaluation functions for chess, the classical +evaluation based on handcrafted terms, and the NNUE evaluation based on efficiently +updateable neural networks. The classical evaluation runs efficiently on most 64bit +CPU architectures, while the NNUE evaluation benefits strongly from the vector +intrinsics available on modern CPUs (avx2 or similar). ## Files @@ -28,10 +28,13 @@ This distribution of Stockfish consists of the following files: * src, a subdirectory containing the full source code, including a Makefile that can be used to compile Stockfish on Unix-like systems. -To use the NNUE evaluation an additional data file with neural network parameters -needs to be downloaded. The filename for the default set can be found as the default -value of the `EvalFile` UCI option, with the format -`nn-[SHA256 first 12 digits].nnue` (e.g. nn-c157e0a5755b.nnue). This file can be downloaded from + * a file with the .nnue extension, storing the neural network for the NNUE + evaluation. + +Note: to use the NNUE evaluation, the additional data file with neural network parameters +needs to be downloaded. The filename for the default net can be found as the default +value of the `EvalFile` UCI option, with the format `nn-[SHA256 first 12 digits].nnue` +(for instance, `nn-c157e0a5755b.nnue`). This file can be downloaded from ``` https://tests.stockfishchess.org/api/nn/[filename] ``` @@ -64,14 +67,6 @@ Currently, Stockfish has the following UCI options: The name of the file of the NNUE evaluation parameters. Depending on the GUI the filename should include the full path to the folder/directory that contains the file. - * #### Contempt - A positive value for contempt favors middle game positions and avoids draws, - effective for the classical evaluation only. - - * #### Analysis Contempt - By default, contempt is set to prefer the side to move. Set this option to "White" - or "Black" to analyse with contempt for that side, or "Off" to disable contempt. - * #### UCI_AnalyseMode An option handled by your GUI. @@ -120,6 +115,14 @@ Currently, Stockfish has the following UCI options: Limit Syzygy tablebase probing to positions with at most this many pieces left (including kings and pawns). + * #### Contempt + A positive value for contempt favors middle game positions and avoids draws, + effective for the classical evaluation only. + + * #### Analysis Contempt + By default, contempt is set to prefer the side to move. Set this option to "White" + or "Black" to analyse with contempt for that side, or "Off" to disable contempt. + * #### Move Overhead Assume a time delay of x ms due to network and GUI overheads. This is useful to avoid losses on time in those cases. @@ -138,7 +141,7 @@ Currently, Stockfish has the following UCI options: * #### Debug Log File Write all communication to and from the engine into a text file. -## Classical and NNUE evaluation +## A note on classical and NNUE evaluation Both approaches assign a value to a position that is used in alpha-beta (PVS) search to find the best move. The classical evaluation computes this value as a function @@ -226,6 +229,7 @@ targets with corresponding descriptions. cd src make help make build ARCH=x86-64-modern + make net ``` When not using the Makefile to compile (for instance with Microsoft MSVC) you @@ -237,8 +241,7 @@ compiler you used to create your executable. These informations can be found by typing the following commands in a console: ``` - ./stockfish - compiler + ./stockfish compiler ``` ## Understanding the code base and participating in the project diff --git a/src/Makefile b/src/Makefile index b0274504..74ef87b9 100644 --- a/src/Makefile +++ b/src/Makefile @@ -370,7 +370,7 @@ else endif endif -ifeq ($(KERNEL),Darwin) +ifeq ($(KERNEL),Darwin) CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.14 LDFLAGS += -arch $(arch) -mmacosx-version-min=10.14 XCRUN = xcrun diff --git a/src/bitboard.cpp b/src/bitboard.cpp index f531010c..80206b58 100644 --- a/src/bitboard.cpp +++ b/src/bitboard.cpp @@ -39,6 +39,16 @@ namespace { Bitboard BishopTable[0x1480]; // To store bishop attacks void init_magics(PieceType pt, Bitboard table[], Magic magics[]); + +} + + +/// safe_destination() returns the bitboard of target square for the given step +/// from the given square. If the step is off the board, returns empty bitboard. + +inline Bitboard safe_destination(Square s, int step) { + Square to = Square(s + step); + return is_ok(to) && distance(s, to) <= 2 ? square_bb(to) : Bitboard(0); } @@ -110,7 +120,7 @@ namespace { Direction RookDirections[4] = {NORTH, SOUTH, EAST, WEST}; Direction BishopDirections[4] = {NORTH_EAST, SOUTH_EAST, SOUTH_WEST, NORTH_WEST}; - for(Direction d : (pt == ROOK ? RookDirections : BishopDirections)) + for (Direction d : (pt == ROOK ? RookDirections : BishopDirections)) { Square s = sq; while(safe_destination(s, d) && !(occupied & s)) diff --git a/src/bitboard.h b/src/bitboard.h index a899d879..29d8f66d 100644 --- a/src/bitboard.h +++ b/src/bitboard.h @@ -279,16 +279,6 @@ inline int edge_distance(File f) { return std::min(f, File(FILE_H - f)); } inline int edge_distance(Rank r) { return std::min(r, Rank(RANK_8 - r)); } -/// safe_destination() returns the bitboard of target square for the given step -/// from the given square. If the step is off the board, returns empty bitboard. - -inline Bitboard safe_destination(Square s, int step) -{ - Square to = Square(s + step); - return is_ok(to) && distance(s, to) <= 2 ? square_bb(to) : Bitboard(0); -} - - /// attacks_bb(Square) returns the pseudo attacks of the give piece type /// assuming an empty board. diff --git a/src/evaluate.cpp b/src/evaluate.cpp index c66938d6..ce92db9a 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -288,8 +288,8 @@ namespace { attackedBy2[Us] = dblAttackByPawn | (attackedBy[Us][KING] & attackedBy[Us][PAWN]); // Init our king safety tables - Square s = make_square(Utility::clamp(file_of(ksq), FILE_B, FILE_G), - Utility::clamp(rank_of(ksq), RANK_2, RANK_7)); + Square s = make_square(std::clamp(file_of(ksq), FILE_B, FILE_G), + std::clamp(rank_of(ksq), RANK_2, RANK_7)); kingRing[Us] = attacks_bb(s) | s; kingAttackersCount[Them] = popcount(kingRing[Us] & pe->pawn_attacks(Them)); @@ -686,8 +686,8 @@ namespace { Square blockSq = s + Up; // Adjust bonus based on the king's proximity - bonus += make_score(0, ( (king_proximity(Them, blockSq) * 19) / 4 - - king_proximity(Us, blockSq) * 2) * w); + bonus += make_score(0, ( king_proximity(Them, blockSq) * 19 / 4 + - king_proximity(Us, blockSq) * 2) * w); // If blockSq is not the queening square then consider also a second push if (r != RANK_7) @@ -731,7 +731,7 @@ namespace { // Evaluation::space() computes a space evaluation for a given side, aiming to improve game - // play in the opening. It is based on the number of safe squares on the 4 central files + // play in the opening. It is based on the number of safe squares on the four central files // on ranks 2 to 4. Completely safe squares behind a friendly pawn are counted twice. // Finally, the space bonus is multiplied by a weight which decreases according to occupancy. @@ -804,7 +804,7 @@ namespace { // Now apply the bonus: note that we find the attacking side by extracting the // sign of the midgame or endgame values, and that we carefully cap the bonus // so that the midgame and endgame scores do not change sign after the bonus. - int u = ((mg > 0) - (mg < 0)) * Utility::clamp(complexity + 50, -abs(mg), 0); + int u = ((mg > 0) - (mg < 0)) * std::clamp(complexity + 50, -abs(mg), 0); int v = ((eg > 0) - (eg < 0)) * std::max(complexity, -abs(eg)); mg += u; @@ -951,8 +951,8 @@ Value Eval::evaluate(const Position& pos) { // Damp down the evaluation linearly when shuffling v = v * (100 - pos.rule50_count()) / 100; - // Guarantee evalution outside of TB range - v = Utility::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1); + // Guarantee evaluation does not hit the tablebase range + v = std::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1); return v; } @@ -1013,6 +1013,5 @@ std::string Eval::trace(const Position& pos) { v = pos.side_to_move() == WHITE ? v : -v; ss << "\nFinal evaluation: " << to_cp(v) << " (white side)\n"; - return ss.str(); } diff --git a/src/material.cpp b/src/material.cpp index 0ef9926f..870a5e11 100644 --- a/src/material.cpp +++ b/src/material.cpp @@ -130,7 +130,7 @@ Entry* probe(const Position& pos) { Value npm_w = pos.non_pawn_material(WHITE); Value npm_b = pos.non_pawn_material(BLACK); - Value npm = Utility::clamp(npm_w + npm_b, EndgameLimit, MidgameLimit); + Value npm = std::clamp(npm_w + npm_b, EndgameLimit, MidgameLimit); // Map total non-pawn material into [PHASE_ENDGAME, PHASE_MIDGAME] e->gamePhase = Phase(((npm - EndgameLimit) * PHASE_MIDGAME) / (MidgameLimit - EndgameLimit)); diff --git a/src/misc.cpp b/src/misc.cpp index 56a3dcad..80c436ac 100644 --- a/src/misc.cpp +++ b/src/misc.cpp @@ -328,16 +328,16 @@ void prefetch(void* addr) { #endif -/// Wrappers for systems where the c++17 implementation doesn't guarantee the availability of aligned_alloc. -/// Memory allocated with std_aligned_alloc must be freed with std_aligned_free. -/// + +/// std_aligned_alloc() is our wrapper for systems where the c++17 implementation +/// does not guarantee the availability of aligned_alloc(). Memory allocated with +/// std_aligned_alloc() must be freed with std_aligned_free(). void* std_aligned_alloc(size_t alignment, size_t size) { + #if defined(POSIXALIGNEDALLOC) - void *pointer; - if(posix_memalign(&pointer, alignment, size) == 0) - return pointer; - return nullptr; + void *mem; + return posix_memalign(&mem, alignment, size) ? nullptr : mem; #elif defined(_WIN32) return _mm_malloc(size, alignment); #else @@ -346,6 +346,7 @@ void* std_aligned_alloc(size_t alignment, size_t size) { } void std_aligned_free(void* ptr) { + #if defined(POSIXALIGNEDALLOC) free(ptr); #elif defined(_WIN32) @@ -355,7 +356,7 @@ void std_aligned_free(void* ptr) { #endif } -/// aligned_ttmem_alloc() will return suitably aligned memory, and if possible use large pages. +/// aligned_ttmem_alloc() will return suitably aligned memory, if possible using large pages. /// The returned pointer is the aligned one, while the mem argument is the one that needs /// to be passed to free. With c++17 some of this functionality could be simplified. diff --git a/src/misc.h b/src/misc.h index eb4e05c0..8ad17b50 100644 --- a/src/misc.h +++ b/src/misc.h @@ -65,14 +65,6 @@ std::ostream& operator<<(std::ostream&, SyncCout); #define sync_cout std::cout << IO_LOCK #define sync_endl std::endl << IO_UNLOCK -namespace Utility { - -/// Clamp a value between lo and hi. Available in c++17. -template constexpr const T& clamp(const T& v, const T& lo, const T& hi) { - return v < lo ? lo : v > hi ? hi : v; -} - -} /// xorshift64star Pseudo-Random Number Generator /// This class is based on original code written and dedicated diff --git a/src/movegen.cpp b/src/movegen.cpp index d74df4c3..3340f65c 100644 --- a/src/movegen.cpp +++ b/src/movegen.cpp @@ -248,7 +248,7 @@ namespace { *moveList++ = make_move(ksq, pop_lsb(&b)); if ((Type != CAPTURES) && pos.can_castle(Us & ANY_CASTLING)) - for(CastlingRights cr : { Us & KING_SIDE, Us & QUEEN_SIDE } ) + for (CastlingRights cr : { Us & KING_SIDE, Us & QUEEN_SIDE } ) if (!pos.castling_impeded(cr) && pos.can_castle(cr)) *moveList++ = make(ksq, pos.castling_rook_square(cr)); } diff --git a/src/movepick.cpp b/src/movepick.cpp index 96a44449..153d323e 100644 --- a/src/movepick.cpp +++ b/src/movepick.cpp @@ -182,7 +182,7 @@ top: --endMoves; ++stage; - /* fallthrough */ + [[fallthrough]]; case REFUTATION: if (select([&](){ return *cur != MOVE_NONE @@ -190,7 +190,7 @@ top: && pos.pseudo_legal(*cur); })) return *(cur - 1); ++stage; - /* fallthrough */ + [[fallthrough]]; case QUIET_INIT: if (!skipQuiets) @@ -203,7 +203,7 @@ top: } ++stage; - /* fallthrough */ + [[fallthrough]]; case QUIET: if ( !skipQuiets @@ -217,7 +217,7 @@ top: endMoves = endBadCaptures; ++stage; - /* fallthrough */ + [[fallthrough]]; case BAD_CAPTURE: return select([](){ return true; }); @@ -228,7 +228,7 @@ top: score(); ++stage; - /* fallthrough */ + [[fallthrough]]; case EVASION: return select([](){ return true; }); @@ -246,14 +246,14 @@ top: return MOVE_NONE; ++stage; - /* fallthrough */ + [[fallthrough]]; case QCHECK_INIT: cur = moves; endMoves = generate(pos, cur); ++stage; - /* fallthrough */ + [[fallthrough]]; case QCHECK: return select([](){ return true; }); diff --git a/src/movepick.h b/src/movepick.h index f080935a..97ea5bec 100644 --- a/src/movepick.h +++ b/src/movepick.h @@ -86,14 +86,14 @@ enum StatsType { NoCaptures, Captures }; /// the move's from and to squares, see www.chessprogramming.org/Butterfly_Boards typedef Stats ButterflyHistory; -/// At higher depths LowPlyHistory records successful quiet moves near the root and quiet -/// moves which are/were in the PV (ttPv) -/// It is cleared with each new search and filled during iterative deepening +/// At higher depths LowPlyHistory records successful quiet moves near the root +/// and quiet moves which are/were in the PV (ttPv). It is cleared with each new +/// search and filled during iterative deepening. constexpr int MAX_LPH = 4; typedef Stats LowPlyHistory; /// CounterMoveHistory stores counter moves indexed by [piece][to] of the previous -/// move, see www.chessprogramming.org/Countermove_Heuristic +/// move, see www.chessprogramming.org/Countermove_Heuristic typedef Stats CounterMoveHistory; /// CapturePieceToHistory is addressed by a move's [piece][to][captured piece type] diff --git a/src/pawns.cpp b/src/pawns.cpp index 868d0c8e..af0f6618 100644 --- a/src/pawns.cpp +++ b/src/pawns.cpp @@ -219,7 +219,7 @@ Score Entry::evaluate_shelter(const Position& pos, Square ksq) const { Score bonus = make_score(5, 5); - File center = Utility::clamp(file_of(ksq), FILE_B, FILE_G); + File center = std::clamp(file_of(ksq), FILE_B, FILE_G); for (File f = File(center - 1); f <= File(center + 1); ++f) { b = ourPawns & file_bb(f); diff --git a/src/position.cpp b/src/position.cpp index 46e5d78b..02898547 100644 --- a/src/position.cpp +++ b/src/position.cpp @@ -1145,8 +1145,8 @@ bool Position::see_ge(Move m, Value threshold) const { // Don't allow pinned pieces to attack (except the king) as long as // there are pinners on their original square. - if (st->pinners[~stm] & occupied) - stmAttackers &= ~st->blockersForKing[stm]; + if (pinners(~stm) & occupied) + stmAttackers &= ~blockers_for_king(stm); if (!stmAttackers) break; diff --git a/src/position.h b/src/position.h index a77050eb..5ce17277 100644 --- a/src/position.h +++ b/src/position.h @@ -113,6 +113,7 @@ public: Bitboard checkers() const; Bitboard blockers_for_king(Color c) const; Bitboard check_squares(PieceType pt) const; + Bitboard pinners(Color c) const; bool is_discovery_check_on_king(Color c, Move m) const; // Attacks to/from a given square @@ -309,6 +310,10 @@ inline Bitboard Position::blockers_for_king(Color c) const { return st->blockersForKing[c]; } +inline Bitboard Position::pinners(Color c) const { + return st->pinners[c]; +} + inline Bitboard Position::check_squares(PieceType pt) const { return st->checkSquares[pt]; } diff --git a/src/search.cpp b/src/search.cpp index ba13680c..82d8bb9d 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -335,7 +335,7 @@ void Thread::search() { // for match (TC 60+0.6) results spanning a wide range of k values. PRNG rng(now()); double floatLevel = Options["UCI_LimitStrength"] ? - Utility::clamp(std::pow((Options["UCI_Elo"] - 1346.6) / 143.4, 1 / 0.806), 0.0, 20.0) : + std::clamp(std::pow((Options["UCI_Elo"] - 1346.6) / 143.4, 1 / 0.806), 0.0, 20.0) : double(Options["Skill Level"]); int intLevel = int(floatLevel) + ((floatLevel - int(floatLevel)) * 1024 > rng.rand() % 1024 ? 1 : 0); @@ -508,7 +508,7 @@ void Thread::search() { { double fallingEval = (318 + 6 * (mainThread->bestPreviousScore - bestValue) + 6 * (mainThread->iterValue[iterIdx] - bestValue)) / 825.0; - fallingEval = Utility::clamp(fallingEval, 0.5, 1.5); + fallingEval = std::clamp(fallingEval, 0.5, 1.5); // If the bestMove is stable over several iterations, reduce time accordingly timeReduction = lastBestMoveDepth + 9 < completedDepth ? 1.92 : 0.95; @@ -807,8 +807,9 @@ namespace { && eval <= alpha - RazorMargin) return qsearch(pos, ss, alpha, beta); - improving = (ss-2)->staticEval == VALUE_NONE ? (ss->staticEval > (ss-4)->staticEval - || (ss-4)->staticEval == VALUE_NONE) : ss->staticEval > (ss-2)->staticEval; + improving = (ss-2)->staticEval == VALUE_NONE + ? ss->staticEval > (ss-4)->staticEval || (ss-4)->staticEval == VALUE_NONE + : ss->staticEval > (ss-2)->staticEval; // Step 8. Futility pruning: child node (~50 Elo) if ( !PvNode @@ -879,8 +880,8 @@ namespace { // there and in further interactions with transposition table cutoff depth is set to depth - 3 // because probCut search has depth set to depth - 4 but we also do a move before it // so effective depth is equal to depth - 3 - && !( ttHit - && tte->depth() >= depth - 3 + && !( ttHit + && tte->depth() >= depth - 3 && ttValue != VALUE_NONE && ttValue < probCutBeta)) { @@ -1238,7 +1239,7 @@ moves_loop: // When in check, search starts from here r++; } - Depth d = Utility::clamp(newDepth - r, 1, newDepth); + Depth d = std::clamp(newDepth - r, 1, newDepth); value = -search(pos, ss+1, -(alpha+1), -alpha, d, true); diff --git a/src/timeman.cpp b/src/timeman.cpp index df4ba9b2..6d9c95ef 100644 --- a/src/timeman.cpp +++ b/src/timeman.cpp @@ -38,9 +38,9 @@ void TimeManagement::init(Search::LimitsType& limits, Color us, int ply) { TimePoint slowMover = TimePoint(Options["Slow Mover"]); TimePoint npmsec = TimePoint(Options["nodestime"]); - // opt_scale is a percentage of available time to use for the current move. - // max_scale is a multiplier applied to optimumTime. - double opt_scale, max_scale; + // optScale is a percentage of available time to use for the current move. + // maxScale is a multiplier applied to optimumTime. + double optScale, maxScale; // If we have to play in 'nodes as time' mode, then convert from time // to nodes, and use resulting values in time management formulas. @@ -75,22 +75,22 @@ void TimeManagement::init(Search::LimitsType& limits, Color us, int ply) { // game time for the current move, so also cap to 20% of available game time. if (limits.movestogo == 0) { - opt_scale = std::min(0.008 + std::pow(ply + 3.0, 0.5) / 250.0, + optScale = std::min(0.008 + std::pow(ply + 3.0, 0.5) / 250.0, 0.2 * limits.time[us] / double(timeLeft)); - max_scale = std::min(7.0, 4.0 + ply / 12.0); + maxScale = std::min(7.0, 4.0 + ply / 12.0); } // x moves in y seconds (+ z increment) else { - opt_scale = std::min((0.8 + ply / 128.0) / mtg, + optScale = std::min((0.8 + ply / 128.0) / mtg, 0.8 * limits.time[us] / double(timeLeft)); - max_scale = std::min(6.3, 1.5 + 0.11 * mtg); + maxScale = std::min(6.3, 1.5 + 0.11 * mtg); } // Never use more than 80% of the available time for this move - optimumTime = TimePoint(opt_scale * timeLeft); - maximumTime = TimePoint(std::min(0.8 * limits.time[us] - moveOverhead, max_scale * optimumTime)); + optimumTime = TimePoint(optScale * timeLeft); + maximumTime = TimePoint(std::min(0.8 * limits.time[us] - moveOverhead, maxScale * optimumTime)); if (Options["Ponder"]) optimumTime += optimumTime / 4; diff --git a/src/uci.cpp b/src/uci.cpp index d6486320..bc0ee0a0 100644 --- a/src/uci.cpp +++ b/src/uci.cpp @@ -211,7 +211,7 @@ namespace { double b = (((bs[0] * m + bs[1]) * m + bs[2]) * m) + bs[3]; // Transform eval to centipawns with limited range - double x = Utility::clamp(double(100 * v) / PawnValueEg, -1000.0, 1000.0); + double x = std::clamp(double(100 * v) / PawnValueEg, -1000.0, 1000.0); // Return win rate in per mille (rounded to nearest) return int(0.5 + 1000 / (1 + std::exp((a - x) / b))); From cc9d503ddea998890112efd08fae3705f2727e37 Mon Sep 17 00:00:00 2001 From: syzygy1 <3028851+syzygy1@users.noreply.github.com> Date: Sat, 22 Aug 2020 13:36:34 +0200 Subject: [PATCH 48/58] Skip the alignment bug workaround for Clang Clang-10.0.0 poses as gcc-4.2: $ clang++ -E -dM - Date: Sun, 23 Aug 2020 14:22:32 +0300 Subject: [PATCH 49/58] Introduce movecount pruning for qsearch() If in quiescence search, we assume that me can prune late moves when: a) the move ordering count of the move is : moveCount > abs(depth) + 2 b) we are not in check c) the late move does not give check d) the late move is not an advanced pawn push Modification of an original idea by @VoyagerOne. STC https://tests.stockfishchess.org/tests/view/5f40581787a5c3c63d8f535f LLR: 2.96 (-2.94,2.94) {-0.25,1.25} Total: 132848 W: 14999 L: 14661 D: 103188 Ptnml(0-2): 684, 11242, 42309, 11430, 759 LTC https://tests.stockfishchess.org/tests/view/5f4226da87a5c3c63d8f5412 LLR: 2.98 (-2.94,2.94) {0.25,1.25} Total: 12008 W: 678 L: 551 D: 10779 Ptnml(0-2): 8, 485, 4899, 596, 16 closes https://github.com/official-stockfish/Stockfish/pull/3053 Bench: 3749974 --- src/movepick.h | 2 +- src/search.cpp | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/movepick.h b/src/movepick.h index 97ea5bec..4c0ad551 100644 --- a/src/movepick.h +++ b/src/movepick.h @@ -93,7 +93,7 @@ constexpr int MAX_LPH = 4; typedef Stats LowPlyHistory; /// CounterMoveHistory stores counter moves indexed by [piece][to] of the previous -/// move, see www.chessprogramming.org/Countermove_Heuristic +/// move, see www.chessprogramming.org/Countermove_Heuristic typedef Stats CounterMoveHistory; /// CapturePieceToHistory is addressed by a move's [piece][to][captured piece type] diff --git a/src/search.cpp b/src/search.cpp index 82d8bb9d..266e2db3 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -1531,6 +1531,10 @@ moves_loop: // When in check, search starts from here { assert(type_of(move) != ENPASSANT); // Due to !pos.advanced_pawn_push + // moveCount pruning + if (moveCount > abs(depth) + 2) + continue; + futilityValue = futilityBase + PieceValue[EG][pos.piece_on(to_sq(move))]; if (futilityValue <= alpha) @@ -1547,7 +1551,7 @@ moves_loop: // When in check, search starts from here } // Do not search moves with negative SEE values - if ( !ss->inCheck && !pos.see_ge(move)) + if (!ss->inCheck && !pos.see_ge(move)) continue; // Speculative prefetch as early as possible From e453f09f06f41680ef96f594f593f8de33e62b8f Mon Sep 17 00:00:00 2001 From: George Sobala Date: Mon, 24 Aug 2020 06:37:42 +0100 Subject: [PATCH 50/58] armv8 AArch64 does not require -mfpu=neon -mpfu is not required on AArch64 / armv8 architecture on Linux and throws an error if present. This PR has been tested on gcc and clang on Gentoo-64 and Raspian-64 on a Raspberry Pi 4, as well as with a cross from Ubuntu (`make clean && make -j build ARCH=armv8 COMP=gcc COMPILER=aarch64-linux-gnu-g++`) fixes https://github.com/official-stockfish/Stockfish/issues/3056 closes https://github.com/official-stockfish/Stockfish/pull/3059 No functional change --- src/Makefile | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/Makefile b/src/Makefile index 74ef87b9..3e1b7c35 100644 --- a/src/Makefile +++ b/src/Makefile @@ -241,7 +241,7 @@ ifeq ($(ARCH),armv7-neon) endif ifeq ($(ARCH),armv8) - arch = armv8-a + arch = armv8 prefetch = yes popcnt = yes neon = yes @@ -285,7 +285,7 @@ ifeq ($(COMP),gcc) CXX=g++ CXXFLAGS += -pedantic -Wextra -Wshadow - ifeq ($(arch),$(filter $(arch),armv7 armv8-a)) + ifeq ($(arch),$(filter $(arch),armv7 armv8)) ifeq ($(OS),Android) CXXFLAGS += -m$(bits) LDFLAGS += -m$(bits) @@ -387,7 +387,7 @@ ifeq ($(COMP),ndk) CXXFLAGS += -mthumb -march=armv7-a -mfloat-abi=softfp -mfpu=neon STRIP=arm-linux-androideabi-strip endif - ifeq ($(arch),armv8-a) + ifeq ($(arch),armv8) comp=aarch64-linux-android21-clang CXX=aarch64-linux-android21-clang++ STRIP=aarch64-linux-android-strip @@ -476,7 +476,7 @@ endif ### 3.6 popcnt ifeq ($(popcnt),yes) - ifeq ($(arch),$(filter $(arch),ppc64 armv7 armv8-a arm64)) + ifeq ($(arch),$(filter $(arch),ppc64 armv7 armv8 arm64)) CXXFLAGS += -DUSE_POPCNT else ifeq ($(comp),icc) CXXFLAGS += -msse3 -DUSE_POPCNT @@ -539,9 +539,11 @@ ifeq ($(neon),yes) CXXFLAGS += -DUSE_NEON ifeq ($(KERNEL),Linux) ifneq ($(COMP),ndk) + ifneq ($(arch),armv8) CXXFLAGS += -mfpu=neon endif endif + endif endif ### 3.7 pext @@ -780,7 +782,7 @@ config-sanity: @test "$(optimize)" = "yes" || test "$(optimize)" = "no" @test "$(arch)" = "any" || test "$(arch)" = "x86_64" || test "$(arch)" = "i386" || \ test "$(arch)" = "ppc64" || test "$(arch)" = "ppc" || \ - test "$(arch)" = "armv7" || test "$(arch)" = "armv8-a" || test "$(arch)" = "arm64" + test "$(arch)" = "armv7" || test "$(arch)" = "armv8" || test "$(arch)" = "arm64" @test "$(bits)" = "32" || test "$(bits)" = "64" @test "$(prefetch)" = "yes" || test "$(prefetch)" = "no" @test "$(popcnt)" = "yes" || test "$(popcnt)" = "no" From 701b2427bd84d112376ce858b66befc5b66c4bb2 Mon Sep 17 00:00:00 2001 From: mstembera Date: Thu, 20 Aug 2020 16:59:27 -0700 Subject: [PATCH 51/58] Support VNNI on 256bit vectors due to downclocking on current chips (tested up to cascade lake) supporting avx512 and vnni512, it is better to use avx2 or vnni256 in multithreaded (in particular hyperthreaded) engine use. In single threaded use, the picture is different. gcc compilation for vnni256 requires a toolchain for gcc >= 9. closes https://github.com/official-stockfish/Stockfish/pull/3038 No functional change --- .travis.yml | 6 +++-- src/Makefile | 39 ++++++++++++++++++++++++------ src/nnue/layers/affine_transform.h | 8 +++++- 3 files changed, 42 insertions(+), 11 deletions(-) diff --git a/.travis.yml b/.travis.yml index a029c4fc..c1e6d6df 100644 --- a/.travis.yml +++ b/.travis.yml @@ -77,8 +77,10 @@ script: # compile only for some more advanced architectures (might not run in travis) - make clean && make -j2 ARCH=x86-64-avx2 build - make clean && make -j2 ARCH=x86-64-bmi2 build - # needs gcc 10 to compile - - if [[ "$COMPILER" != "g++-8" ]]; then make clean && make -j2 ARCH=x86-64-avx512 build; fi + - make clean && make -j2 ARCH=x86-64-avx512 build + - make clean && make -j2 ARCH=x86-64-vnni512 build + # requires gcc 9 or higher + - if [[ "$COMPILER" != "g++-8" ]]; make clean && make -j2 ARCH=x86-64-vnni256 build; fi # # Check perft and reproducible search diff --git a/src/Makefile b/src/Makefile index 3e1b7c35..228ea851 100644 --- a/src/Makefile +++ b/src/Makefile @@ -75,7 +75,8 @@ endif # sse41 = yes/no --- -msse4.1 --- Use Intel Streaming SIMD Extensions 4.1 # avx2 = yes/no --- -mavx2 --- Use Intel Advanced Vector Extensions 2 # avx512 = yes/no --- -mavx512bw --- Use Intel Advanced Vector Extensions 512 -# vnni = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 512 +# vnni256 = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 256 +# vnni512 = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 512 # neon = yes/no --- -DUSE_NEON --- Use ARM SIMD architecture # # Note that Makefile is space sensitive, so when adding new architectures @@ -102,7 +103,8 @@ ssse3 = no sse41 = no avx2 = no avx512 = no -vnni = no +vnni256 = no +vnni512 = no neon = no ARCH = x86-64-modern STRIP = strip @@ -192,7 +194,18 @@ ifeq ($(findstring -avx512,$(ARCH)),-avx512) avx512 = yes endif -ifeq ($(findstring -vnni,$(ARCH)),-vnni) +ifeq ($(findstring -vnni256,$(ARCH)),-vnni256) + popcnt = yes + sse = yes + sse2 = yes + ssse3 = yes + sse41 = yes + avx2 = yes + pext = yes + vnni256 = yes +endif + +ifeq ($(findstring -vnni512,$(ARCH)),-vnni512) popcnt = yes sse = yes sse2 = yes @@ -201,7 +214,7 @@ ifeq ($(findstring -vnni,$(ARCH)),-vnni) avx2 = yes pext = yes avx512 = yes - vnni = yes + vnni512 = yes endif ifeq ($(sse),yes) @@ -500,7 +513,14 @@ ifeq ($(avx512),yes) endif endif -ifeq ($(vnni),yes) +ifeq ($(vnni256),yes) + CXXFLAGS += -DUSE_VNNI + ifeq ($(comp),$(filter $(comp),gcc clang mingw)) + CXXFLAGS += -mavx512vnni -mavx512dq -mavx512vl -mprefer-vector-width=256 + endif +endif + +ifeq ($(vnni512),yes) CXXFLAGS += -DUSE_VNNI ifeq ($(comp),$(filter $(comp),gcc clang mingw)) CXXFLAGS += -mavx512vnni -mavx512dq -mavx512vl @@ -623,7 +643,8 @@ help: @echo "" @echo "Supported archs:" @echo "" - @echo "x86-64-vnni > x86 64-bit with vnni support" + @echo "x86-64-vnni512 > x86 64-bit with vnni support 512bit wide" + @echo "x86-64-vnni256 > x86 64-bit with vnni support 256bit wide" @echo "x86-64-avx512 > x86 64-bit with avx512 support" @echo "x86-64-bmi2 > x86 64-bit with bmi2 support" @echo "x86-64-avx2 > x86 64-bit with avx2 support" @@ -767,7 +788,8 @@ config-sanity: @echo "sse41: '$(sse41)'" @echo "avx2: '$(avx2)'" @echo "avx512: '$(avx512)'" - @echo "vnni: '$(vnni)'" + @echo "vnni256: '$(vnni256)'" + @echo "vnni512: '$(vnni512)'" @echo "neon: '$(neon)'" @echo "" @echo "Flags:" @@ -794,7 +816,8 @@ config-sanity: @test "$(sse41)" = "yes" || test "$(sse41)" = "no" @test "$(avx2)" = "yes" || test "$(avx2)" = "no" @test "$(avx512)" = "yes" || test "$(avx512)" = "no" - @test "$(vnni)" = "yes" || test "$(vnni)" = "no" + @test "$(vnni256)" = "yes" || test "$(vnni256)" = "no" + @test "$(vnni512)" = "yes" || test "$(vnni512)" = "no" @test "$(neon)" = "yes" || test "$(neon)" = "no" @test "$(comp)" = "gcc" || test "$(comp)" = "icc" || test "$(comp)" = "mingw" || test "$(comp)" = "clang" \ || test "$(comp)" = "armv7a-linux-androideabi16-clang" || test "$(comp)" = "aarch64-linux-android21-clang" diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index 7ac5a1c0..94d0b5a9 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -85,8 +85,10 @@ namespace Eval::NNUE::Layers { #elif defined(USE_AVX2) constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; - const __m256i kOnes = _mm256_set1_epi16(1); const auto input_vector = reinterpret_cast(input); + #if !defined(USE_VNNI) + const __m256i kOnes = _mm256_set1_epi16(1); + #endif #elif defined(USE_SSE2) constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; @@ -145,9 +147,13 @@ namespace Eval::NNUE::Layers { __m256i sum = _mm256_setzero_si256(); const auto row = reinterpret_cast(&weights_[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { + #if defined(USE_VNNI) + sum = _mm256_dpbusd_epi32(sum, _mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j])); + #else __m256i product = _mm256_maddubs_epi16(_mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j])); product = _mm256_madd_epi16(product, kOnes); sum = _mm256_add_epi32(sum, product); + #endif } __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1)); sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC)); From f7b3f0e8426bbf7414d139ed9d1cfa7a98d7314d Mon Sep 17 00:00:00 2001 From: Sami Kiminki Date: Fri, 21 Aug 2020 12:12:39 +0300 Subject: [PATCH 52/58] Allow TT entries with key16==0 to be fetched Fix the issue where a TT entry with key16==0 would always be reported as a miss. Instead, we'll use depth8 to detect whether the TT entry is occupied. In order to do that, we'll change DEPTH_OFFSET to -7 (depth8==0) to distinguish between an unoccupied entry and the otherwise lowest possible depth, i.e., DEPTH_NONE (depth8==1). To prevent a performance regression, we'll reorder the TT entry fields by the access order of TranspositionTable::probe(). Memory in general works fastest when accessed in sequential order. We'll also match the store order in TTEntry::save() with the entry field order, and re-order the 'if-or' expressions in TTEntry::save() from the cheapest to the most expensive. Finally, as we now have a proper TT entry occupancy test, we'll fix a minor corner case with hashfull reporting. To reproduce: - Use a big hash - Either: a. Start 31 very quick searches (this wraparounds generation to 0); or b. Force generation of the first search to 0. - go depth infinite Before the fix, hashfull would incorrectly report nearly full hash immediately after the search start, since TranspositionTable::hashfull() used to consider only the entry generation and not whether the entry was actually occupied. STC: LLR: 2.95 (-2.94,2.94) {-0.25,1.25} Total: 36848 W: 4091 L: 3898 D: 28859 Ptnml(0-2): 158, 2996, 11972, 3091, 207 https://tests.stockfishchess.org/tests/view/5f3f98d5dc02a01a0c2881f7 LTC: LLR: 2.95 (-2.94,2.94) {0.25,1.25} Total: 32280 W: 1828 L: 1653 D: 28799 Ptnml(0-2): 34, 1428, 13051, 1583, 44 https://tests.stockfishchess.org/tests/view/5f3fe77a87a5c3c63d8f5332 closes https://github.com/official-stockfish/Stockfish/pull/3048 Bench: 3760677 --- src/tt.cpp | 21 +++++++++++---------- src/tt.h | 12 ++++++------ src/types.h | 3 ++- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/src/tt.cpp b/src/tt.cpp index d494c27d..60a3a5f1 100644 --- a/src/tt.cpp +++ b/src/tt.cpp @@ -37,18 +37,19 @@ void TTEntry::save(Key k, Value v, bool pv, Bound b, Depth d, Move m, Value ev) if (m || (uint16_t)k != key16) move16 = (uint16_t)m; - // Overwrite less valuable entries - if ((uint16_t)k != key16 - || d - DEPTH_OFFSET > depth8 - 4 - || b == BOUND_EXACT) + // Overwrite less valuable entries (cheapest checks first) + if (b == BOUND_EXACT + || (uint16_t)k != key16 + || d - DEPTH_OFFSET > depth8 - 4) { - assert(d >= DEPTH_OFFSET); + assert(d > DEPTH_OFFSET); + assert(d < 256 + DEPTH_OFFSET); key16 = (uint16_t)k; + depth8 = (uint8_t)(d - DEPTH_OFFSET); + genBound8 = (uint8_t)(TT.generation8 | uint8_t(pv) << 2 | b); value16 = (int16_t)v; eval16 = (int16_t)ev; - genBound8 = (uint8_t)(TT.generation8 | uint8_t(pv) << 2 | b); - depth8 = (uint8_t)(d - DEPTH_OFFSET); } } @@ -119,11 +120,11 @@ TTEntry* TranspositionTable::probe(const Key key, bool& found) const { const uint16_t key16 = (uint16_t)key; // Use the low 16 bits as key inside the cluster for (int i = 0; i < ClusterSize; ++i) - if (!tte[i].key16 || tte[i].key16 == key16) + if (tte[i].key16 == key16 || !tte[i].depth8) { tte[i].genBound8 = uint8_t(generation8 | (tte[i].genBound8 & 0x7)); // Refresh - return found = (bool)tte[i].key16, &tte[i]; + return found = (bool)tte[i].depth8, &tte[i]; } // Find an entry to be replaced according to the replacement strategy @@ -149,7 +150,7 @@ int TranspositionTable::hashfull() const { int cnt = 0; for (int i = 0; i < 1000; ++i) for (int j = 0; j < ClusterSize; ++j) - cnt += (table[i].entry[j].genBound8 & 0xF8) == generation8; + cnt += table[i].entry[j].depth8 && (table[i].entry[j].genBound8 & 0xF8) == generation8; return cnt / ClusterSize; } diff --git a/src/tt.h b/src/tt.h index c177ca52..fdfd6769 100644 --- a/src/tt.h +++ b/src/tt.h @@ -25,13 +25,13 @@ /// TTEntry struct is the 10 bytes transposition table entry, defined as below: /// /// key 16 bit -/// move 16 bit -/// value 16 bit -/// eval value 16 bit +/// depth 8 bit /// generation 5 bit /// pv node 1 bit /// bound type 2 bit -/// depth 8 bit +/// move 16 bit +/// value 16 bit +/// eval value 16 bit struct TTEntry { @@ -47,11 +47,11 @@ private: friend class TranspositionTable; uint16_t key16; + uint8_t depth8; + uint8_t genBound8; uint16_t move16; int16_t value16; int16_t eval16; - uint8_t genBound8; - uint8_t depth8; }; diff --git a/src/types.h b/src/types.h index 73da41e2..1cd711b6 100644 --- a/src/types.h +++ b/src/types.h @@ -232,7 +232,8 @@ enum : int { DEPTH_QS_RECAPTURES = -5, DEPTH_NONE = -6, - DEPTH_OFFSET = DEPTH_NONE + + DEPTH_OFFSET = -7 // value used only for TT entry occupancy check }; enum Square : int { From 843a961a8c10d5949e04718b829e3b3d5adeedb4 Mon Sep 17 00:00:00 2001 From: Vizvezdenec Date: Mon, 24 Aug 2020 08:04:16 +0300 Subject: [PATCH 53/58] Introduce countermove based pruning for qsearch This patch continues work of previous patch in introducing pruning heuristics in qsearch by analogy to main search, now with countermove based pruning. Idea is that if move is late enough and is quite check (we do generate them in qsearch) and has bad enough countermove history - prune it. passed STC https://tests.stockfishchess.org/tests/view/5f41220287a5c3c63d8f53c5 LLR: 2.93 (-2.94,2.94) {-0.25,1.25} Total: 35944 W: 4127 L: 3929 D: 27888 Ptnml(0-2): 196, 2970, 11459, 3134, 213 passed LTC https://tests.stockfishchess.org/tests/view/5f41862f87a5c3c63d8f53e8 LLR: 2.95 (-2.94,2.94) {0.25,1.25} Total: 138448 W: 7655 L: 7252 D: 123541 Ptnml(0-2): 145, 6247, 56043, 6638, 151 closes https://github.com/official-stockfish/Stockfish/pull/3058 Bench: 3610676 --- src/search.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/search.cpp b/src/search.cpp index 266e2db3..2ca64a01 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -1570,6 +1570,12 @@ moves_loop: // When in check, search starts from here [pos.moved_piece(move)] [to_sq(move)]; + if ( !captureOrPromotion + && moveCount >= abs(depth) + 1 + && (*contHist[0])[pos.moved_piece(move)][to_sq(move)] < CounterMovePruneThreshold + && (*contHist[1])[pos.moved_piece(move)][to_sq(move)] < CounterMovePruneThreshold) + continue; + // Make and search the move pos.do_move(move, st, givesCheck); value = -qsearch(pos, ss+1, -beta, -alpha, depth - 1); From 530fccbf272ffe424ae53a464b91db148cc968ae Mon Sep 17 00:00:00 2001 From: mstembera Date: Mon, 24 Aug 2020 03:38:01 -0700 Subject: [PATCH 54/58] Allow for VNNI256 compilation with g++-8 explicitly pass needed -mavx512f -mavx512bw flags closes https://github.com/official-stockfish/Stockfish/pull/3061 No functional change --- .travis.yml | 3 +-- src/Makefile | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index c1e6d6df..092c7f53 100644 --- a/.travis.yml +++ b/.travis.yml @@ -79,8 +79,7 @@ script: - make clean && make -j2 ARCH=x86-64-bmi2 build - make clean && make -j2 ARCH=x86-64-avx512 build - make clean && make -j2 ARCH=x86-64-vnni512 build - # requires gcc 9 or higher - - if [[ "$COMPILER" != "g++-8" ]]; make clean && make -j2 ARCH=x86-64-vnni256 build; fi + - make clean && make -j2 ARCH=x86-64-vnni256 build # # Check perft and reproducible search diff --git a/src/Makefile b/src/Makefile index 228ea851..2e85a144 100644 --- a/src/Makefile +++ b/src/Makefile @@ -516,7 +516,7 @@ endif ifeq ($(vnni256),yes) CXXFLAGS += -DUSE_VNNI ifeq ($(comp),$(filter $(comp),gcc clang mingw)) - CXXFLAGS += -mavx512vnni -mavx512dq -mavx512vl -mprefer-vector-width=256 + CXXFLAGS += -mavx512f -mavx512bw -mavx512vnni -mavx512dq -mavx512vl -mprefer-vector-width=256 endif endif From b0b4ca17db49ed03057b5fa4ee4a12dab0e9c9e6 Mon Sep 17 00:00:00 2001 From: Joost VandeVondele Date: Mon, 24 Aug 2020 21:32:04 +0200 Subject: [PATCH 55/58] Check ARCH=.... variable to prevent user errors or generating untested code, check explicitly that the ARCH variable is equivalent to a supported architecture as listed in `make help`. To nevertheless compile for an untested target the user can override the internal variable, passing the undocumented `SUPPORTED_ARCH=true` to make. closes https://github.com/official-stockfish/Stockfish/pull/3062 No functional change. --- src/Makefile | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/Makefile b/src/Makefile index 2e85a144..703aa230 100644 --- a/src/Makefile +++ b/src/Makefile @@ -85,8 +85,15 @@ endif ### 2.1. General and architecture defaults -ifeq ($(ARCH),) - empty_arch = yes +# explicitly check for the list of supported architectures (as listed with make help), +# the user can override with `make ARCH=x86-32-vnni256 SUPPORTED_ARCH=true` +ifeq ($(ARCH),$(filter $(ARCH),x86-64-vnni512 x86-64-vnni256 x86-64-avx512 x86-64-bmi2 x86-64-avx2 \ + x86-64-sse41-popcnt x86-64-modern x86-64-ssse3 x86-64-sse3-popcnt \ + x86-64 x86-32-sse41-popcnt x86-32-sse2 x86-32 ppc-64 ppc-32 \ + armv7 armv7-neon armv8 apple-silicon general-64 general-32)) + SUPPORTED_ARCH=true +else + SUPPORTED_ARCH=false endif optimize = yes @@ -625,6 +632,7 @@ endif ### Section 4. Public Targets ### ========================================================================== + help: @echo "" @echo "To compile stockfish, type: " @@ -684,10 +692,12 @@ help: @echo "make -j profile-build ARCH=x86-64-bmi2 COMP=gcc COMPCXX=g++-9.0" @echo "make -j build ARCH=x86-64-ssse3 COMP=clang" @echo "" -ifneq ($(empty_arch), yes) @echo "-------------------------------" +ifeq ($(SUPPORTED_ARCH), true) @echo "The selected architecture $(ARCH) will enable the following configuration: " @$(MAKE) ARCH=$(ARCH) COMP=$(COMP) config-sanity +else + @echo "Specify a supported architecture with the ARCH option for more details" endif @@ -802,6 +812,7 @@ config-sanity: @test "$(debug)" = "yes" || test "$(debug)" = "no" @test "$(sanitize)" = "undefined" || test "$(sanitize)" = "thread" || test "$(sanitize)" = "address" || test "$(sanitize)" = "no" @test "$(optimize)" = "yes" || test "$(optimize)" = "no" + @test "$(SUPPORTED_ARCH)" = "true" @test "$(arch)" = "any" || test "$(arch)" = "x86_64" || test "$(arch)" = "i386" || \ test "$(arch)" = "ppc64" || test "$(arch)" = "ppc" || \ test "$(arch)" = "armv7" || test "$(arch)" = "armv8" || test "$(arch)" = "arm64" From 9b4967071e2fb116673820127522bc43d01d2257 Mon Sep 17 00:00:00 2001 From: syzygy1 <3028851+syzygy1@users.noreply.github.com> Date: Mon, 24 Aug 2020 02:29:38 +0200 Subject: [PATCH 56/58] Remove EvalList This patch removes the EvalList structure from the Position object and generally simplifies the interface between do_move() and the NNUE code. The NNUE evaluation function first calculates the "accumulator". The accumulator consists of two halves: one for white's perspective, one for black's perspective. If the "friendly king" has moved or the accumulator for the parent position is not available, the accumulator for this half has to be calculated from scratch. To do this, the NNUE node needs to know the positions and types of all non-king pieces and the position of the friendly king. This information can easily be obtained from the Position object. If the "friendly king" has not moved, its half of the accumulator can be calculated by incrementally updating the accumulator for the previous position. For this, the NNUE code needs to know which pieces have been added to which squares and which pieces have been removed from which squares. In principle this information can be derived from the Position object and StateInfo struct (in the same way as undo_move() does this). However, it is probably a bit faster to prepare this information in do_move(), so I have kept the DirtyPiece struct. Since the DirtyPiece struct now stores the squares rather than "PieceSquare" indices, there are now at most three "dirty pieces" (previously two). A promotion move that captures a piece removes the capturing pawn and the captured piece from the board (to SQ_NONE) and moves the promoted piece to the promotion square (from SQ_NONE). An STC test has confirmed a small speedup: https://tests.stockfishchess.org/tests/view/5f43f06b5089a564a10d850a LLR: 2.94 (-2.94,2.94) {-0.25,1.25} Total: 87704 W: 9763 L: 9500 D: 68441 Ptnml(0-2): 426, 6950, 28845, 7197, 434 closes https://github.com/official-stockfish/Stockfish/pull/3068 No functional change --- src/nnue/evaluate_nnue.cpp | 43 ++++++------ src/nnue/features/feature_set.h | 3 +- src/nnue/features/half_kp.cpp | 58 ++++++---------- src/nnue/features/half_kp.h | 10 +-- src/nnue/nnue_common.h | 21 ++++++ src/position.cpp | 95 +++++++------------------- src/position.h | 23 ------- src/types.h | 116 +++----------------------------- 8 files changed, 96 insertions(+), 273 deletions(-) diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp index dfbb1ac2..e6619089 100644 --- a/src/nnue/evaluate_nnue.cpp +++ b/src/nnue/evaluate_nnue.cpp @@ -29,30 +29,29 @@ #include "evaluate_nnue.h" -ExtPieceSquare kpp_board_index[PIECE_NB] = { - // convention: W - us, B - them - // viewed from other side, W and B are reversed - { PS_NONE, PS_NONE }, - { PS_W_PAWN, PS_B_PAWN }, - { PS_W_KNIGHT, PS_B_KNIGHT }, - { PS_W_BISHOP, PS_B_BISHOP }, - { PS_W_ROOK, PS_B_ROOK }, - { PS_W_QUEEN, PS_B_QUEEN }, - { PS_W_KING, PS_B_KING }, - { PS_NONE, PS_NONE }, - { PS_NONE, PS_NONE }, - { PS_B_PAWN, PS_W_PAWN }, - { PS_B_KNIGHT, PS_W_KNIGHT }, - { PS_B_BISHOP, PS_W_BISHOP }, - { PS_B_ROOK, PS_W_ROOK }, - { PS_B_QUEEN, PS_W_QUEEN }, - { PS_B_KING, PS_W_KING }, - { PS_NONE, PS_NONE } -}; - - namespace Eval::NNUE { + uint32_t kpp_board_index[PIECE_NB][COLOR_NB] = { + // convention: W - us, B - them + // viewed from other side, W and B are reversed + { PS_NONE, PS_NONE }, + { PS_W_PAWN, PS_B_PAWN }, + { PS_W_KNIGHT, PS_B_KNIGHT }, + { PS_W_BISHOP, PS_B_BISHOP }, + { PS_W_ROOK, PS_B_ROOK }, + { PS_W_QUEEN, PS_B_QUEEN }, + { PS_W_KING, PS_B_KING }, + { PS_NONE, PS_NONE }, + { PS_NONE, PS_NONE }, + { PS_B_PAWN, PS_W_PAWN }, + { PS_B_KNIGHT, PS_W_KNIGHT }, + { PS_B_BISHOP, PS_W_BISHOP }, + { PS_B_ROOK, PS_W_ROOK }, + { PS_B_QUEEN, PS_W_QUEEN }, + { PS_B_KING, PS_W_KING }, + { PS_NONE, PS_NONE } + }; + // Input feature converter AlignedPtr feature_transformer; diff --git a/src/nnue/features/feature_set.h b/src/nnue/features/feature_set.h index 79ca83ae..558a6b22 100644 --- a/src/nnue/features/feature_set.h +++ b/src/nnue/features/feature_set.h @@ -68,8 +68,7 @@ namespace Eval::NNUE::Features { reset[perspective] = false; switch (trigger) { case TriggerEvent::kFriendKingMoved: - reset[perspective] = - dp.pieceId[0] == PIECE_ID_KING + perspective; + reset[perspective] = dp.piece[0] == make_piece(perspective, KING); break; default: assert(false); diff --git a/src/nnue/features/half_kp.cpp b/src/nnue/features/half_kp.cpp index 628add6e..88e384a3 100644 --- a/src/nnue/features/half_kp.cpp +++ b/src/nnue/features/half_kp.cpp @@ -23,25 +23,17 @@ namespace Eval::NNUE::Features { - // Find the index of the feature quantity from the king position and PieceSquare - template - inline IndexType HalfKP::MakeIndex(Square sq_k, PieceSquare p) { - return static_cast(PS_END) * static_cast(sq_k) + p; + // Orient a square according to perspective (rotates by 180 for black) + inline Square orient(Color perspective, Square s) { + return Square(int(s) ^ (bool(perspective) * 63)); } - // Get pieces information + // Find the index of the feature quantity from the king position and PieceSquare template - inline void HalfKP::GetPieces( - const Position& pos, Color perspective, - PieceSquare** pieces, Square* sq_target_k) { + inline IndexType HalfKP::MakeIndex( + Color perspective, Square s, Piece pc, Square ksq) { - *pieces = (perspective == BLACK) ? - pos.eval_list()->piece_list_fb() : - pos.eval_list()->piece_list_fw(); - const PieceId target = (AssociatedKing == Side::kFriend) ? - static_cast(PIECE_ID_KING + perspective) : - static_cast(PIECE_ID_KING + ~perspective); - *sq_target_k = static_cast(((*pieces)[target] - PS_W_KING) % SQUARE_NB); + return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective] + PS_END * ksq); } // Get a list of indices for active features @@ -49,16 +41,11 @@ namespace Eval::NNUE::Features { void HalfKP::AppendActiveIndices( const Position& pos, Color perspective, IndexList* active) { - // Do nothing if array size is small to avoid compiler warning - if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return; - - PieceSquare* pieces; - Square sq_target_k; - GetPieces(pos, perspective, &pieces, &sq_target_k); - for (PieceId i = PIECE_ID_ZERO; i < PIECE_ID_KING; ++i) { - if (pieces[i] != PS_NONE) { - active->push_back(MakeIndex(sq_target_k, pieces[i])); - } + Square ksq = orient(perspective, pos.square(perspective)); + Bitboard bb = pos.pieces() & ~pos.pieces(KING); + while (bb) { + Square s = pop_lsb(&bb); + active->push_back(MakeIndex(perspective, s, pos.piece_on(s), ksq)); } } @@ -68,22 +55,15 @@ namespace Eval::NNUE::Features { const Position& pos, Color perspective, IndexList* removed, IndexList* added) { - PieceSquare* pieces; - Square sq_target_k; - GetPieces(pos, perspective, &pieces, &sq_target_k); + Square ksq = orient(perspective, pos.square(perspective)); const auto& dp = pos.state()->dirtyPiece; for (int i = 0; i < dp.dirty_num; ++i) { - if (dp.pieceId[i] >= PIECE_ID_KING) continue; - const auto old_p = static_cast( - dp.old_piece[i].from[perspective]); - if (old_p != PS_NONE) { - removed->push_back(MakeIndex(sq_target_k, old_p)); - } - const auto new_p = static_cast( - dp.new_piece[i].from[perspective]); - if (new_p != PS_NONE) { - added->push_back(MakeIndex(sq_target_k, new_p)); - } + Piece pc = dp.piece[i]; + if (type_of(pc) == KING) continue; + if (dp.from[i] != SQ_NONE) + removed->push_back(MakeIndex(perspective, dp.from[i], pc, ksq)); + if (dp.to[i] != SQ_NONE) + added->push_back(MakeIndex(perspective, dp.to[i], pc, ksq)); } } diff --git a/src/nnue/features/half_kp.h b/src/nnue/features/half_kp.h index 99842eea..ee6a8df3 100644 --- a/src/nnue/features/half_kp.h +++ b/src/nnue/features/half_kp.h @@ -41,7 +41,7 @@ namespace Eval::NNUE::Features { static constexpr IndexType kDimensions = static_cast(SQUARE_NB) * static_cast(PS_END); // Maximum number of simultaneously active features - static constexpr IndexType kMaxActiveDimensions = PIECE_ID_KING; + static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count // Trigger for full calculation instead of difference calculation static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kFriendKingMoved; @@ -53,13 +53,9 @@ namespace Eval::NNUE::Features { static void AppendChangedIndices(const Position& pos, Color perspective, IndexList* removed, IndexList* added); - // Index of a feature for a given king position and another piece on some square - static IndexType MakeIndex(Square sq_k, PieceSquare p); - private: - // Get pieces information - static void GetPieces(const Position& pos, Color perspective, - PieceSquare** pieces, Square* sq_target_k); + // Index of a feature for a given king position and another piece on some square + static IndexType MakeIndex(Color perspective, Square s, Piece pc, Square sq_k); }; } // namespace Eval::NNUE::Features diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h index a9d8e5af..7bc905dc 100644 --- a/src/nnue/nnue_common.h +++ b/src/nnue/nnue_common.h @@ -94,6 +94,27 @@ namespace Eval::NNUE { constexpr std::size_t kMaxSimdWidth = 32; + // unique number for each piece type on each square + enum { + PS_NONE = 0, + PS_W_PAWN = 1, + PS_B_PAWN = 1 * SQUARE_NB + 1, + PS_W_KNIGHT = 2 * SQUARE_NB + 1, + PS_B_KNIGHT = 3 * SQUARE_NB + 1, + PS_W_BISHOP = 4 * SQUARE_NB + 1, + PS_B_BISHOP = 5 * SQUARE_NB + 1, + PS_W_ROOK = 6 * SQUARE_NB + 1, + PS_B_ROOK = 7 * SQUARE_NB + 1, + PS_W_QUEEN = 8 * SQUARE_NB + 1, + PS_B_QUEEN = 9 * SQUARE_NB + 1, + PS_W_KING = 10 * SQUARE_NB + 1, + PS_END = PS_W_KING, // pieces without kings (pawns included) + PS_B_KING = 11 * SQUARE_NB + 1, + PS_END2 = 12 * SQUARE_NB + 1 + }; + + extern uint32_t kpp_board_index[PIECE_NB][COLOR_NB]; + // Type of input feature after conversion using TransformedFeatureType = std::uint8_t; using IndexType = std::uint32_t; diff --git a/src/position.cpp b/src/position.cpp index 02898547..fe89b753 100644 --- a/src/position.cpp +++ b/src/position.cpp @@ -198,9 +198,6 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th std::fill_n(&pieceList[0][0], sizeof(pieceList) / sizeof(Square), SQ_NONE); st = si; - // Each piece on board gets a unique ID used to track the piece later - PieceId piece_id, next_piece_id = PIECE_ID_ZERO; - ss >> std::noskipws; // 1. Piece placement @@ -212,21 +209,8 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th else if (token == '/') sq += 2 * SOUTH; - else if ((idx = PieceToChar.find(token)) != string::npos) - { - auto pc = Piece(idx); - put_piece(pc, sq); - - if (Eval::useNNUE) - { - // Kings get a fixed ID, other pieces get ID in order of placement - piece_id = - (idx == W_KING) ? PIECE_ID_WKING : - (idx == B_KING) ? PIECE_ID_BKING : - next_piece_id++; - evalList.put_piece(piece_id, sq, pc); - } - + else if ((idx = PieceToChar.find(token)) != string::npos) { + put_piece(Piece(idx), sq); ++sq; } } @@ -721,8 +705,6 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) { // Used by NNUE st->accumulator.computed_accumulation = false; st->accumulator.computed_score = false; - PieceId dp0 = PIECE_ID_NONE; - PieceId dp1 = PIECE_ID_NONE; auto& dp = st->dirtyPiece; dp.dirty_num = 1; @@ -775,12 +757,10 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) { if (Eval::useNNUE) { - dp.dirty_num = 2; // 2 pieces moved - dp1 = piece_id_on(capsq); - dp.pieceId[1] = dp1; - dp.old_piece[1] = evalList.piece_with_id(dp1); - evalList.put_piece(dp1, capsq, NO_PIECE); - dp.new_piece[1] = evalList.piece_with_id(dp1); + dp.dirty_num = 2; // 1 piece moved, 1 piece captured + dp.piece[1] = captured; + dp.from[1] = capsq; + dp.to[1] = SQ_NONE; } // Update board and piece lists @@ -821,11 +801,9 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) { { if (Eval::useNNUE) { - dp0 = piece_id_on(from); - dp.pieceId[0] = dp0; - dp.old_piece[0] = evalList.piece_with_id(dp0); - evalList.put_piece(dp0, to, pc); - dp.new_piece[0] = evalList.piece_with_id(dp0); + dp.piece[0] = pc; + dp.from[0] = from; + dp.to[0] = to; } move_piece(from, to); @@ -854,9 +832,12 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) { if (Eval::useNNUE) { - dp0 = piece_id_on(to); - evalList.put_piece(dp0, to, promotion); - dp.new_piece[0] = evalList.piece_with_id(dp0); + // Promoting pawn to SQ_NONE, promoted piece from SQ_NONE + dp.to[0] = SQ_NONE; + dp.piece[dp.dirty_num] = promotion; + dp.from[dp.dirty_num] = SQ_NONE; + dp.to[dp.dirty_num] = to; + dp.dirty_num++; } // Update hash keys @@ -950,12 +931,6 @@ void Position::undo_move(Move m) { { move_piece(to, from); // Put the piece back at the source square - if (Eval::useNNUE) - { - PieceId dp0 = st->dirtyPiece.pieceId[0]; - evalList.put_piece(dp0, from, pc); - } - if (st->capturedPiece) { Square capsq = to; @@ -972,14 +947,6 @@ void Position::undo_move(Move m) { } put_piece(st->capturedPiece, capsq); // Restore the captured piece - - if (Eval::useNNUE) - { - PieceId dp1 = st->dirtyPiece.pieceId[1]; - assert(evalList.piece_with_id(dp1).from[WHITE] == PS_NONE); - assert(evalList.piece_with_id(dp1).from[BLACK] == PS_NONE); - evalList.put_piece(dp1, capsq, st->capturedPiece); - } } } @@ -1001,32 +968,16 @@ void Position::do_castling(Color us, Square from, Square& to, Square& rfrom, Squ rto = relative_square(us, kingSide ? SQ_F1 : SQ_D1); to = relative_square(us, kingSide ? SQ_G1 : SQ_C1); - if (Eval::useNNUE) + if (Do && Eval::useNNUE) { - PieceId dp0, dp1; auto& dp = st->dirtyPiece; - dp.dirty_num = 2; // 2 pieces moved - - if (Do) - { - dp0 = piece_id_on(from); - dp1 = piece_id_on(rfrom); - dp.pieceId[0] = dp0; - dp.old_piece[0] = evalList.piece_with_id(dp0); - evalList.put_piece(dp0, to, make_piece(us, KING)); - dp.new_piece[0] = evalList.piece_with_id(dp0); - dp.pieceId[1] = dp1; - dp.old_piece[1] = evalList.piece_with_id(dp1); - evalList.put_piece(dp1, rto, make_piece(us, ROOK)); - dp.new_piece[1] = evalList.piece_with_id(dp1); - } - else - { - dp0 = piece_id_on(to); - dp1 = piece_id_on(rto); - evalList.put_piece(dp0, from, make_piece(us, KING)); - evalList.put_piece(dp1, rfrom, make_piece(us, ROOK)); - } + dp.piece[0] = make_piece(us, KING); + dp.from[0] = from; + dp.to[0] = to; + dp.piece[1] = make_piece(us, ROOK); + dp.from[1] = rfrom; + dp.to[1] = rto; + dp.dirty_num = 2; } // Remove both pieces first since squares could overlap in Chess960 diff --git a/src/position.h b/src/position.h index 5ce17277..d6f5c9fd 100644 --- a/src/position.h +++ b/src/position.h @@ -171,7 +171,6 @@ public: // Used by NNUE StateInfo* state() const; - const EvalList* eval_list() const; private: // Initialization helpers (used while setting up a position) @@ -186,9 +185,6 @@ private: template void do_castling(Color us, Square from, Square& to, Square& rfrom, Square& rto); - // ID of a piece on a given square - PieceId piece_id_on(Square sq) const; - // Data members Piece board[SQUARE_NB]; Bitboard byTypeBB[PIECE_TYPE_NB]; @@ -205,9 +201,6 @@ private: Thread* thisThread; StateInfo* st; bool chess960; - - // List of pieces used in NNUE evaluation function - EvalList evalList; }; namespace PSQT { @@ -451,20 +444,4 @@ inline StateInfo* Position::state() const { return st; } -inline const EvalList* Position::eval_list() const { - - return &evalList; -} - -inline PieceId Position::piece_id_on(Square sq) const -{ - - assert(piece_on(sq) != NO_PIECE); - - PieceId pid = evalList.piece_id_list[sq]; - assert(is_ok(pid)); - - return pid; -} - #endif // #ifndef POSITION_H_INCLUDED diff --git a/src/types.h b/src/types.h index 1cd711b6..5873c698 100644 --- a/src/types.h +++ b/src/types.h @@ -201,22 +201,6 @@ enum Piece { PIECE_NB = 16 }; -// An ID used to track the pieces. Max. 32 pieces on board. -enum PieceId { - PIECE_ID_ZERO = 0, - PIECE_ID_KING = 30, - PIECE_ID_WKING = 30, - PIECE_ID_BKING = 31, - PIECE_ID_NONE = 32 -}; - -inline PieceId operator++(PieceId& d, int) { - - PieceId x = d; - d = PieceId(int(d) + 1); - return x; -} - constexpr Value PieceValue[PHASE_NB][PIECE_NB] = { { VALUE_ZERO, PawnValueMg, KnightValueMg, BishopValueMg, RookValueMg, QueenValueMg, VALUE_ZERO, VALUE_ZERO, VALUE_ZERO, PawnValueMg, KnightValueMg, BishopValueMg, RookValueMg, QueenValueMg, VALUE_ZERO, VALUE_ZERO }, @@ -271,93 +255,20 @@ enum Rank : int { RANK_1, RANK_2, RANK_3, RANK_4, RANK_5, RANK_6, RANK_7, RANK_8, RANK_NB }; -// unique number for each piece type on each square -enum PieceSquare : uint32_t { - PS_NONE = 0, - PS_W_PAWN = 1, - PS_B_PAWN = 1 * SQUARE_NB + 1, - PS_W_KNIGHT = 2 * SQUARE_NB + 1, - PS_B_KNIGHT = 3 * SQUARE_NB + 1, - PS_W_BISHOP = 4 * SQUARE_NB + 1, - PS_B_BISHOP = 5 * SQUARE_NB + 1, - PS_W_ROOK = 6 * SQUARE_NB + 1, - PS_B_ROOK = 7 * SQUARE_NB + 1, - PS_W_QUEEN = 8 * SQUARE_NB + 1, - PS_B_QUEEN = 9 * SQUARE_NB + 1, - PS_W_KING = 10 * SQUARE_NB + 1, - PS_END = PS_W_KING, // pieces without kings (pawns included) - PS_B_KING = 11 * SQUARE_NB + 1, - PS_END2 = 12 * SQUARE_NB + 1 -}; - -struct ExtPieceSquare { - PieceSquare from[COLOR_NB]; -}; - -// Array for finding the PieceSquare corresponding to the piece on the board -extern ExtPieceSquare kpp_board_index[PIECE_NB]; - -constexpr bool is_ok(PieceId pid); -constexpr Square rotate180(Square sq); - -// Structure holding which tracked piece (PieceId) is where (PieceSquare) -class EvalList { - -public: - // Max. number of pieces without kings is 30 but must be a multiple of 4 in AVX2 - static const int MAX_LENGTH = 32; - - // Array that holds the piece id for the pieces on the board - PieceId piece_id_list[SQUARE_NB]; - - // List of pieces, separate from White and Black POV - PieceSquare* piece_list_fw() const { return const_cast(pieceListFw); } - PieceSquare* piece_list_fb() const { return const_cast(pieceListFb); } - - // Place the piece pc with piece_id on the square sq on the board - void put_piece(PieceId piece_id, Square sq, Piece pc) - { - assert(is_ok(piece_id)); - if (pc != NO_PIECE) - { - pieceListFw[piece_id] = PieceSquare(kpp_board_index[pc].from[WHITE] + sq); - pieceListFb[piece_id] = PieceSquare(kpp_board_index[pc].from[BLACK] + rotate180(sq)); - piece_id_list[sq] = piece_id; - } - else - { - pieceListFw[piece_id] = PS_NONE; - pieceListFb[piece_id] = PS_NONE; - piece_id_list[sq] = piece_id; - } - } - - // Convert the specified piece_id piece to ExtPieceSquare type and return it - ExtPieceSquare piece_with_id(PieceId piece_id) const - { - ExtPieceSquare eps; - eps.from[WHITE] = pieceListFw[piece_id]; - eps.from[BLACK] = pieceListFb[piece_id]; - return eps; - } - -private: - PieceSquare pieceListFw[MAX_LENGTH]; - PieceSquare pieceListFb[MAX_LENGTH]; -}; - -// For differential evaluation of pieces that changed since last turn +// Keep track of what a move changes on the board (used by NNUE) struct DirtyPiece { // Number of changed pieces int dirty_num; - // The ids of changed pieces, max. 2 pieces can change in one move - PieceId pieceId[2]; + // Max 3 pieces can change in one move. A promotion with capture moves + // both the pawn and the captured piece to SQ_NONE and the piece promoted + // to from SQ_NONE to the capture square. + Piece piece[3]; - // What changed from the piece with that piece number - ExtPieceSquare old_piece[2]; - ExtPieceSquare new_piece[2]; + // From and to squares, which may be SQ_NONE + Square from[3]; + Square to[3]; }; /// Score enum stores a middlegame and an endgame value in a single integer (enum). @@ -407,8 +318,6 @@ ENABLE_FULL_OPERATORS_ON(Value) ENABLE_FULL_OPERATORS_ON(Direction) ENABLE_INCR_OPERATORS_ON(Piece) -ENABLE_INCR_OPERATORS_ON(PieceSquare) -ENABLE_INCR_OPERATORS_ON(PieceId) ENABLE_INCR_OPERATORS_ON(PieceType) ENABLE_INCR_OPERATORS_ON(Square) ENABLE_INCR_OPERATORS_ON(File) @@ -497,10 +406,6 @@ inline Color color_of(Piece pc) { return Color(pc >> 3); } -constexpr bool is_ok(PieceId pid) { - return pid < PIECE_ID_NONE; -} - constexpr bool is_ok(Square s) { return s >= SQ_A1 && s <= SQ_H8; } @@ -537,11 +442,6 @@ constexpr Square to_sq(Move m) { return Square(m & 0x3F); } -// Return relative square when turning the board 180 degrees -constexpr Square rotate180(Square sq) { - return (Square)(sq ^ 0x3F); -} - constexpr int from_to(Move m) { return m & 0xFFF; } From 95b8f3f8005598fc3ad07a7a8f6440d828cabc29 Mon Sep 17 00:00:00 2001 From: VoyagerOne Date: Sun, 23 Aug 2020 12:04:50 -0400 Subject: [PATCH 57/58] Remove Reduce Depth Remove Reduce Depth at PV nodes. STC: LLR: 2.94 (-2.94,2.94) {-1.25,0.25} Total: 56760 W: 6299 L: 6236 D: 44225 Ptnml(0-2): 286, 4843, 18076, 4872, 303 https://tests.stockfishchess.org/tests/view/5f41356087a5c3c63d8f53c9 LTC: LLR: 2.95 (-2.94,2.94) {-0.75,0.25} Total: 17496 W: 954 L: 865 D: 15677 Ptnml(0-2): 13, 768, 7098, 855, 14 https://tests.stockfishchess.org/tests/view/5f41bb7687a5c3c63d8f53f9 closes https://github.com/official-stockfish/Stockfish/pull/3055 Bench: 3555051 --- src/search.cpp | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/src/search.cpp b/src/search.cpp index 2ca64a01..d6b611a3 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -940,12 +940,6 @@ namespace { } } - // Step 11. If the position is not in TT, decrease depth by 2 - if ( PvNode - && depth >= 6 - && !ttMove) - depth -= 2; - moves_loop: // When in check, search starts from here const PieceToHistory* contHist[] = { (ss-1)->continuationHistory, (ss-2)->continuationHistory, @@ -969,7 +963,7 @@ moves_loop: // When in check, search starts from here // Mark this node as being searched ThreadHolding th(thisThread, posKey, ss->ply); - // Step 12. Loop through all pseudo-legal moves until no moves remain + // Step 11. Loop through all pseudo-legal moves until no moves remain // or a beta cutoff occurs. while ((move = mp.next_move(moveCountPruning)) != MOVE_NONE) { @@ -1007,7 +1001,7 @@ moves_loop: // When in check, search starts from here // Calculate new depth for this move newDepth = depth - 1; - // Step 13. Pruning at shallow depth (~200 Elo) + // Step 12. Pruning at shallow depth (~200 Elo) if ( !rootNode && pos.non_pawn_material(us) && bestValue > VALUE_TB_LOSS_IN_MAX_PLY) @@ -1065,7 +1059,7 @@ moves_loop: // When in check, search starts from here } } - // Step 14. Extensions (~75 Elo) + // Step 13. Extensions (~75 Elo) // Singular extension search (~70 Elo). If all moves but one fail low on a // search of (alpha-s, beta-s), and just one fails high on (alpha, beta), @@ -1148,10 +1142,10 @@ moves_loop: // When in check, search starts from here [movedPiece] [to_sq(move)]; - // Step 15. Make the move + // Step 14. Make the move pos.do_move(move, st, givesCheck); - // Step 16. Reduced depth search (LMR, ~200 Elo). If the move fails high it will be + // Step 15. Reduced depth search (LMR, ~200 Elo). If the move fails high it will be // re-searched at full depth. if ( depth >= 3 && moveCount > 1 + 2 * rootNode + 2 * (PvNode && abs(bestValue) < 2) @@ -1254,7 +1248,7 @@ moves_loop: // When in check, search starts from here didLMR = false; } - // Step 17. Full depth search when LMR is skipped or fails high + // Step 16. Full depth search when LMR is skipped or fails high if (doFullDepthSearch) { value = -search(pos, ss+1, -(alpha+1), -alpha, newDepth, !cutNode); @@ -1282,12 +1276,12 @@ moves_loop: // When in check, search starts from here value = -search(pos, ss+1, -beta, -alpha, newDepth, false); } - // Step 18. Undo move + // Step 17. Undo move pos.undo_move(move); assert(value > -VALUE_INFINITE && value < VALUE_INFINITE); - // Step 19. Check for a new best move + // Step 18. Check for a new best move // Finished searching the move. If a stop occurred, the return value of // the search cannot be trusted, and we return immediately without // updating best move, PV and TT. @@ -1364,7 +1358,7 @@ moves_loop: // When in check, search starts from here return VALUE_DRAW; */ - // Step 20. Check for mate and stalemate + // Step 19. Check for mate and stalemate // All legal moves have been searched and if there are no legal moves, it // must be a mate or a stalemate. If we are in a singular extension search then // return a fail low score. From 242a7d9fead561488ca176a4687deef8859918f2 Mon Sep 17 00:00:00 2001 From: VoyagerOne Date: Tue, 25 Aug 2020 09:10:47 -0400 Subject: [PATCH 58/58] Simplify MCP in QS Simplify moveCount pruning in QS by removing depth dependency. STC LLR: 2.94 (-2.94,2.94) {-1.25,0.25} Total: 42960 W: 4741 L: 4661 D: 33558 Ptnml(0-2): 218, 3574, 13804, 3678, 206 https://tests.stockfishchess.org/tests/view/5f42e3f75089a564a10d8493 LTC LLR: 2.94 (-2.94,2.94) {-0.75,0.25} Total: 66672 W: 3563 L: 3508 D: 59601 Ptnml(0-2): 71, 3064, 26996, 3149, 56 https://tests.stockfishchess.org/tests/view/5f4353285089a564a10d84d0 closes https://github.com/official-stockfish/Stockfish/pull/3067 Bench: 4074430 --- src/search.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search.cpp b/src/search.cpp index d6b611a3..cae8a684 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -1526,7 +1526,7 @@ moves_loop: // When in check, search starts from here assert(type_of(move) != ENPASSANT); // Due to !pos.advanced_pawn_push // moveCount pruning - if (moveCount > abs(depth) + 2) + if (moveCount > 2) continue; futilityValue = futilityBase + PieceValue[EG][pos.piece_on(to_sq(move))];