From d7a26899a973536ab9d3ce4771d8276d1a4dc55c Mon Sep 17 00:00:00 2001 From: Daniel Dugovic Date: Sat, 8 Aug 2020 15:39:29 -0500 Subject: [PATCH 1/6] Use fallback implementation for C++ aligned_alloc fixes https://github.com/official-stockfish/Stockfish/issues/2921 closes https://github.com/official-stockfish/Stockfish/pull/2927 No functional change --- src/Makefile | 4 ++-- src/misc.cpp | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Makefile b/src/Makefile index cab7a7e5..b7585a17 100644 --- a/src/Makefile +++ b/src/Makefile @@ -354,8 +354,8 @@ endif endif ifeq ($(KERNEL),Darwin) - CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.15 - LDFLAGS += -arch $(arch) -mmacosx-version-min=10.15 + CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.13 + LDFLAGS += -arch $(arch) -mmacosx-version-min=10.13 endif ### Travis CI script uses COMPILER to overwrite CXX diff --git a/src/misc.cpp b/src/misc.cpp index 3d7c75e5..05f79b45 100644 --- a/src/misc.cpp +++ b/src/misc.cpp @@ -321,9 +321,9 @@ void prefetch(void* addr) { /// void* std_aligned_alloc(size_t alignment, size_t size) { -#if defined(__APPLE__) +#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) ) return aligned_alloc(alignment, size); -#elif defined(_WIN32) +#elif (defined(_WIN32) || (defined(__APPLE__) && !defined(_LIBCPP_HAS_C11_FEATURES))) return _mm_malloc(size, alignment); #else return std::aligned_alloc(alignment, size); @@ -331,9 +331,9 @@ void* std_aligned_alloc(size_t alignment, size_t size) { } void std_aligned_free(void* ptr) { -#if defined(__APPLE__) +#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) ) free(ptr); -#elif defined(_WIN32) +#elif (defined(_WIN32) || (defined(__APPLE__) && !defined(_LIBCPP_HAS_C11_FEATURES))) _mm_free(ptr); #else free(ptr); From 320fa1b2f082a7db67363e468e7e241d7cedcc64 Mon Sep 17 00:00:00 2001 From: Joost VandeVondele Date: Sun, 9 Aug 2020 11:05:07 +0200 Subject: [PATCH 2/6] Improve error message on missing net. small rewording, but also print the download url for the default net. closes https://github.com/official-stockfish/Stockfish/pull/2954 No functional change --- src/evaluate.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/evaluate.cpp b/src/evaluate.cpp index 1ae6cb3a..a642357e 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -50,9 +50,13 @@ namespace Eval { std::string eval_file = std::string(Options["EvalFile"]); if (useNNUE && eval_file_loaded != eval_file) { - std::cerr << "Use of NNUE evaluation, but the file " << eval_file << " was not loaded successfully. " - << "These network evaluation parameters must be available, compatible with this version of the code. " - << "The UCI option EvalFile might need to specify the full path, including the directory/folder name, to the file." << std::endl; + UCI::OptionsMap defaults; + UCI::init(defaults); + + std::cerr << "NNUE evaluation used, but the network file " << eval_file << " was not loaded successfully. " + << "These network evaluation parameters must be available, and compatible with this version of the code. " + << "The UCI option EvalFile might need to specify the full path, including the directory/folder name, to the file. " + << "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/"+std::string(defaults["EvalFile"]) << std::endl; std::exit(EXIT_FAILURE); } From cd1bb27dd452f336d434a45131bfbe43f8a8c5b3 Mon Sep 17 00:00:00 2001 From: Joost VandeVondele Date: Sun, 9 Aug 2020 19:08:47 +0200 Subject: [PATCH 3/6] Fix aligned_alloc on MinGW introduced with d7a26899a973536ab9d3ce4771d8276d1a4dc55c closes https://github.com/official-stockfish/Stockfish/pull/2959 No functional change. --- src/misc.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/misc.cpp b/src/misc.cpp index 05f79b45..bdd7bccb 100644 --- a/src/misc.cpp +++ b/src/misc.cpp @@ -321,7 +321,7 @@ void prefetch(void* addr) { /// void* std_aligned_alloc(size_t alignment, size_t size) { -#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) ) +#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32)) return aligned_alloc(alignment, size); #elif (defined(_WIN32) || (defined(__APPLE__) && !defined(_LIBCPP_HAS_C11_FEATURES))) return _mm_malloc(size, alignment); @@ -331,7 +331,7 @@ void* std_aligned_alloc(size_t alignment, size_t size) { } void std_aligned_free(void* ptr) { -#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) ) +#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32)) free(ptr); #elif (defined(_WIN32) || (defined(__APPLE__) && !defined(_LIBCPP_HAS_C11_FEATURES))) _mm_free(ptr); From 2bfde5542919c2ed624b5b62883616e325ccb942 Mon Sep 17 00:00:00 2001 From: Vizvezdenec Date: Sun, 9 Aug 2020 21:39:46 +0300 Subject: [PATCH 4/6] Adjust NNUE usage based on number of pawns in position The idea of this patch is that positions are usually more complex and hard to evaluate even if there are more pawns. This patch adjusts NNUE threshold usage depending on number of pawns in position, if pawn count is <3 we use the classical evaluation more often, for pawn count = 3 patch the is non-functional, with pawn count > 3 NNUE evaluation is used more often. passed STC https://tests.stockfishchess.org/tests/view/5f2f02d09081672066536b1f LLR: 2.96 (-2.94,2.94) {-0.50,1.50} Total: 36520 W: 5011 L: 4823 D: 26686 Ptnml(0-2): 299, 3482, 10548, 3594, 337 passed LTC https://tests.stockfishchess.org/tests/view/5f2f4c329081672066536b5c LLR: 2.98 (-2.94,2.94) {0.25,1.75} Total: 39272 W: 2630 L: 2433 D: 34209 Ptnml(0-2): 53, 2066, 15218, 2229, 70 closes https://github.com/official-stockfish/Stockfish/pull/2960 bench 4084753 --- src/evaluate.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/evaluate.cpp b/src/evaluate.cpp index a642357e..ce35c630 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -114,7 +114,7 @@ namespace { constexpr Value LazyThreshold1 = Value(1400); constexpr Value LazyThreshold2 = Value(1300); constexpr Value SpaceThreshold = Value(12222); - constexpr Value NNUEThreshold = Value(520); + constexpr Value NNUEThreshold = Value(460); // KingAttackWeights[PieceType] contains king attack weights by piece type constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 81, 52, 44, 10 }; @@ -945,7 +945,7 @@ Value Eval::evaluate(const Position& pos) { { Value v = eg_value(pos.psq_score()); // Take NNUE eval only on balanced positions - if (abs(v) < NNUEThreshold) + if (abs(v) < NNUEThreshold + 20 * pos.count()) return NNUE::evaluate(pos) + Tempo; } return Evaluation(pos).value(); From a6e89293df5af35931b61d86b6de3872a981c100 Mon Sep 17 00:00:00 2001 From: Dariusz Orzechowski Date: Sun, 9 Aug 2020 14:32:24 -0700 Subject: [PATCH 5/6] Avoid special casing for MinGW after some testing, no version of MinGW/gcc has been found where this code is still necessary. Probably older code (pre-c++17?) closes https://github.com/official-stockfish/Stockfish/pull/2891 No functional change --- src/nnue/layers/affine_transform.h | 29 +++-------------- src/nnue/layers/clipped_relu.h | 49 ++++------------------------- src/nnue/nnue_feature_transformer.h | 34 ++------------------ 3 files changed, 14 insertions(+), 98 deletions(-) diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index b585bc87..ecc3008a 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -104,13 +104,8 @@ namespace Eval::NNUE::Layers { __m512i sum = _mm512_setzero_si512(); const auto row = reinterpret_cast(&weights_[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { - - #if defined(__MINGW32__) || defined(__MINGW64__) - __m512i product = _mm512_maddubs_epi16(_mm512_loadu_si512(&input_vector[j]), _mm512_load_si512(&row[j])); - #else - __m512i product = _mm512_maddubs_epi16(_mm512_load_si512(&input_vector[j]), _mm512_load_si512(&row[j])); - #endif - + __m512i product = _mm512_maddubs_epi16( + _mm512_load_si512(&input_vector[j]), _mm512_load_si512(&row[j])); product = _mm512_madd_epi16(product, kOnes); sum = _mm512_add_epi32(sum, product); } @@ -125,12 +120,8 @@ namespace Eval::NNUE::Layers { const auto row_256 = reinterpret_cast(&weights_[offset]); int j = kNumChunks * 2; - #if defined(__MINGW32__) || defined(__MINGW64__) // See HACK comment below in AVX2. - __m256i sum256 = _mm256_maddubs_epi16(_mm256_loadu_si256(&iv_256[j]), _mm256_load_si256(&row_256[j])); - #else - __m256i sum256 = _mm256_maddubs_epi16(_mm256_load_si256(&iv_256[j]), _mm256_load_si256(&row_256[j])); - #endif - + __m256i sum256 = _mm256_maddubs_epi16( + _mm256_load_si256(&iv_256[j]), _mm256_load_si256(&row_256[j])); sum256 = _mm256_madd_epi16(sum256, _mm256_set1_epi16(1)); sum256 = _mm256_hadd_epi32(sum256, sum256); sum256 = _mm256_hadd_epi32(sum256, sum256); @@ -144,17 +135,7 @@ namespace Eval::NNUE::Layers { const auto row = reinterpret_cast(&weights_[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { __m256i product = _mm256_maddubs_epi16( - - #if defined(__MINGW32__) || defined(__MINGW64__) - // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary - // compiled with g++ in MSYS2 crashes here because the output memory is not aligned - // even though alignas is specified. - _mm256_loadu_si256 - #else - _mm256_load_si256 - #endif - - (&input_vector[j]), _mm256_load_si256(&row[j])); + _mm256_load_si256(&input_vector[j]), _mm256_load_si256(&row[j])); product = _mm256_madd_epi16(product, kOnes); sum = _mm256_add_epi32(sum, product); } diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h index 7ade598f..7e5fcf4a 100644 --- a/src/nnue/layers/clipped_relu.h +++ b/src/nnue/layers/clipped_relu.h @@ -74,50 +74,13 @@ namespace Eval::NNUE::Layers { const auto out = reinterpret_cast<__m256i*>(output); for (IndexType i = 0; i < kNumChunks; ++i) { const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32( - - #if defined(__MINGW32__) || defined(__MINGW64__) - // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary - // compiled with g++ in MSYS2 crashes here because the output memory is not aligned - // even though alignas is specified. - _mm256_loadu_si256 - #else - _mm256_load_si256 - #endif - - (&in[i * 4 + 0]), - - #if defined(__MINGW32__) || defined(__MINGW64__) - _mm256_loadu_si256 - #else - _mm256_load_si256 - #endif - - (&in[i * 4 + 1])), kWeightScaleBits); + _mm256_load_si256(&in[i * 4 + 0]), + _mm256_load_si256(&in[i * 4 + 1])), kWeightScaleBits); const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32( - - #if defined(__MINGW32__) || defined(__MINGW64__) - _mm256_loadu_si256 - #else - _mm256_load_si256 - #endif - - (&in[i * 4 + 2]), - - #if defined(__MINGW32__) || defined(__MINGW64__) - _mm256_loadu_si256 - #else - _mm256_load_si256 - #endif - - (&in[i * 4 + 3])), kWeightScaleBits); - - #if defined(__MINGW32__) || defined(__MINGW64__) - _mm256_storeu_si256 - #else - _mm256_store_si256 - #endif - - (&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8( + _mm256_load_si256(&in[i * 4 + 2]), + _mm256_load_si256(&in[i * 4 + 3])), kWeightScaleBits); + _mm256_store_si256( + &out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8( _mm256_packs_epi16(words0, words1), kZero), kOffsets)); } constexpr IndexType kStart = kNumChunks * kSimdWidth; diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 1cfebbe4..f899d761 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -110,36 +110,12 @@ namespace Eval::NNUE { auto out = reinterpret_cast<__m256i*>(&output[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { __m256i sum0 = - - #if defined(__MINGW32__) || defined(__MINGW64__) - // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary - // compiled with g++ in MSYS2 crashes here because the output memory is not aligned - // even though alignas is specified. - _mm256_loadu_si256 - #else - _mm256_load_si256 - #endif - - (&reinterpret_cast( + _mm256_load_si256(&reinterpret_cast( accumulation[perspectives[p]][0])[j * 2 + 0]); __m256i sum1 = - - #if defined(__MINGW32__) || defined(__MINGW64__) - _mm256_loadu_si256 - #else - _mm256_load_si256 - #endif - - (&reinterpret_cast( + _mm256_load_si256(&reinterpret_cast( accumulation[perspectives[p]][0])[j * 2 + 1]); - - #if defined(__MINGW32__) || defined(__MINGW64__) - _mm256_storeu_si256 - #else - _mm256_store_si256 - #endif - - (&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8( + _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8( _mm256_packs_epi16(sum0, sum1), kZero), kControl)); } @@ -202,11 +178,7 @@ namespace Eval::NNUE { auto column = reinterpret_cast(&weights_[offset]); constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); for (IndexType j = 0; j < kNumChunks; ++j) { - #if defined(__MINGW32__) || defined(__MINGW64__) - _mm256_storeu_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadu_si256(&accumulation[j]), column[j])); - #else accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]); - #endif } #elif defined(USE_SSE2) From 27b593a94477a821f80a041320683f805114d4a3 Mon Sep 17 00:00:00 2001 From: Joost VandeVondele Date: Sun, 9 Aug 2020 18:11:38 +0200 Subject: [PATCH 6/6] Fix a data race for NNUE the stateInfo at the rootPos is no longer read-only, as the NNUE accumulator is part of it. Threads can thus not share this object and need their own copy. tested for no regression https://tests.stockfishchess.org/tests/view/5f3022239081672066536bce LLR: 2.96 (-2.94,2.94) {-1.50,0.50} Total: 52800 W: 6843 L: 6802 D: 39155 Ptnml(0-2): 336, 4646, 16399, 4679, 340 closes https://github.com/official-stockfish/Stockfish/pull/2957 fixes https://github.com/official-stockfish/Stockfish/issues/2933 No functional change --- src/Makefile | 4 ++-- src/thread.cpp | 13 +++++-------- src/thread.h | 1 + 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/src/Makefile b/src/Makefile index b7585a17..571172b2 100644 --- a/src/Makefile +++ b/src/Makefile @@ -354,8 +354,8 @@ endif endif ifeq ($(KERNEL),Darwin) - CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.13 - LDFLAGS += -arch $(arch) -mmacosx-version-min=10.13 + CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.14 + LDFLAGS += -arch $(arch) -mmacosx-version-min=10.14 endif ### Travis CI script uses COMPILER to overwrite CXX diff --git a/src/thread.cpp b/src/thread.cpp index 44aea14e..1aa66a81 100644 --- a/src/thread.cpp +++ b/src/thread.cpp @@ -204,21 +204,18 @@ void ThreadPool::start_thinking(Position& pos, StateListPtr& states, // We use Position::set() to set root position across threads. But there are // some StateInfo fields (previous, pliesFromNull, capturedPiece) that cannot - // be deduced from a fen string, so set() clears them and to not lose the info - // we need to backup and later restore setupStates->back(). Note that setupStates - // is shared by threads but is accessed in read-only mode. - StateInfo tmp = setupStates->back(); - + // be deduced from a fen string, so set() clears them and they are set from + // setupStates->back() later. The rootState is per thread, earlier states are shared + // since they are read-only. for (Thread* th : *this) { th->nodes = th->tbHits = th->nmpMinPly = th->bestMoveChanges = 0; th->rootDepth = th->completedDepth = 0; th->rootMoves = rootMoves; - th->rootPos.set(pos.fen(), pos.is_chess960(), &setupStates->back(), th); + th->rootPos.set(pos.fen(), pos.is_chess960(), &th->rootState, th); + th->rootState = setupStates->back(); } - setupStates->back() = tmp; - main()->start_searching(); } diff --git a/src/thread.h b/src/thread.h index 46da1e34..042bc2e9 100644 --- a/src/thread.h +++ b/src/thread.h @@ -65,6 +65,7 @@ public: std::atomic nodes, tbHits, bestMoveChanges; Position rootPos; + StateInfo rootState; Search::RootMoves rootMoves; Depth rootDepth, completedDepth; CounterMoveHistory counterMoves;