diff --git a/src/Makefile b/src/Makefile
index 59fb90c5..e871f267 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -368,8 +368,8 @@ endif
 endif

 ifeq ($(KERNEL),Darwin)
-	CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.15
-	LDFLAGS += -arch $(arch) -mmacosx-version-min=10.15
+	CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.14
+	LDFLAGS += -arch $(arch) -mmacosx-version-min=10.14
 endif

 ### Travis CI script uses COMPILER to overwrite CXX
diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 61824b3a..9b3b58c3 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -51,9 +51,13 @@ namespace Eval {
     std::string eval_file = std::string(Options["EvalFile"]);
     if (useNNUE && eval_file_loaded != eval_file)
     {
-        std::cerr << "Use of NNUE evaluation, but the file " << eval_file << " was not loaded successfully. "
-                  << "These network evaluation parameters must be available, compatible with this version of the code. "
-                  << "The UCI option EvalFile might need to specify the full path, including the directory/folder name, to the file." << std::endl;
+        UCI::OptionsMap defaults;
+        UCI::init(defaults);
+
+        std::cerr << "NNUE evaluation used, but the network file " << eval_file << " was not loaded successfully. "
+                  << "These network evaluation parameters must be available, and compatible with this version of the code. "
+                  << "The UCI option EvalFile might need to specify the full path, including the directory/folder name, to the file. "
+                  << "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/"+std::string(defaults["EvalFile"]) << std::endl;
         std::exit(EXIT_FAILURE);
     }

@@ -111,7 +115,7 @@ namespace {
   constexpr Value LazyThreshold1 = Value(1400);
   constexpr Value LazyThreshold2 = Value(1300);
   constexpr Value SpaceThreshold = Value(12222);
-  constexpr Value NNUEThreshold  = Value(520);
+  constexpr Value NNUEThreshold  = Value(460);

   // KingAttackWeights[PieceType] contains king attack weights by piece type
   constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 81, 52, 44, 10 };
@@ -942,7 +946,7 @@ Value Eval::evaluate(const Position& pos) {
   {
      Value v = eg_value(pos.psq_score());
      // Take NNUE eval only on balanced positions
-     if (abs(v) < NNUEThreshold)
+     if (abs(v) < NNUEThreshold + 20 * pos.count<PAWN>())
         return NNUE::evaluate(pos) + Tempo;
   }
   return Evaluation(pos).value();
diff --git a/src/misc.cpp b/src/misc.cpp
index 0cb98e17..725450c2 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -321,9 +321,9 @@ void prefetch(void* addr) {
 ///

 void* std_aligned_alloc(size_t alignment, size_t size) {
-#if defined(__APPLE__)
+#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32))
   return aligned_alloc(alignment, size);
-#elif defined(_WIN32)
+#elif (defined(_WIN32) || (defined(__APPLE__) && !defined(_LIBCPP_HAS_C11_FEATURES)))
   return _mm_malloc(size, alignment);
 #else
   return std::aligned_alloc(alignment, size);
@@ -331,9 +331,9 @@ void* std_aligned_alloc(size_t alignment, size_t size) {
 }

 void std_aligned_free(void* ptr) {
-#if defined(__APPLE__)
+#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32))
   free(ptr);
-#elif defined(_WIN32)
+#elif (defined(_WIN32) || (defined(__APPLE__) && !defined(_LIBCPP_HAS_C11_FEATURES)))
   _mm_free(ptr);
 #else
   free(ptr);
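Note on the Makefile and misc.cpp hunks above: lowering the deployment target to macOS 10.14 means libc++ may not expose C11 aligned_alloc() (guarded by _LIBCPP_HAS_C11_FEATURES), so allocation falls back to _mm_malloc(). The invariant the patch preserves is that each aligned allocator is paired with its own deallocator. A minimal sketch of that pairing, with hypothetical names (my_aligned_alloc/my_aligned_free are illustrative, not part of the patch):

    #include <cstdlib>
    #include <xmmintrin.h>

    // Sketch: one aligned allocator per platform, always released
    // through the matching deallocator.
    void* my_aligned_alloc(std::size_t alignment, std::size_t size) {
    #if defined(_WIN32)
        return _mm_malloc(size, alignment);           // pair with _mm_free()
    #else
        // Note: std::aligned_alloc requires size to be a multiple of alignment.
        return std::aligned_alloc(alignment, size);   // pair with std::free()
    #endif
    }

    void my_aligned_free(void* ptr) {
    #if defined(_WIN32)
        _mm_free(ptr);
    #else
        std::free(ptr);
    #endif
    }

Mixing families (e.g. releasing _mm_malloc() memory with free()) is undefined behaviour, which is why std_aligned_free() mirrors the #if cascade of std_aligned_alloc() exactly.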
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index 057de8e1..7336be52 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -123,13 +123,8 @@ namespace Eval::NNUE::Layers {
         __m512i sum = _mm512_setzero_si512();
         const auto row = reinterpret_cast<const __m512i*>(&weights_[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
-
-            #if defined(__MINGW32__) || defined(__MINGW64__)
-            __m512i product = _mm512_maddubs_epi16(_mm512_loadu_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
-            #else
-            __m512i product = _mm512_maddubs_epi16(_mm512_load_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
-            #endif
-
+            __m512i product = _mm512_maddubs_epi16(
+                _mm512_load_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
             product = _mm512_madd_epi16(product, kOnes);
             sum = _mm512_add_epi32(sum, product);
         }
@@ -144,12 +139,8 @@ namespace Eval::NNUE::Layers {

         const auto row_256 = reinterpret_cast<const __m256i*>(&weights_[offset]);
         int j = kNumChunks * 2;
-        #if defined(__MINGW32__) || defined(__MINGW64__) // See HACK comment below in AVX2.
-        __m256i sum256 = _mm256_maddubs_epi16(_mm256_loadu_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
-        #else
-        __m256i sum256 = _mm256_maddubs_epi16(_mm256_load_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
-        #endif
-
+        __m256i sum256 = _mm256_maddubs_epi16(
+            _mm256_load_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
         sum256 = _mm256_madd_epi16(sum256, _mm256_set1_epi16(1));
         sum256 = _mm256_hadd_epi32(sum256, sum256);
         sum256 = _mm256_hadd_epi32(sum256, sum256);
@@ -163,17 +154,7 @@ namespace Eval::NNUE::Layers {
         const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
           __m256i product = _mm256_maddubs_epi16(
-
-              #if defined(__MINGW32__) || defined(__MINGW64__)
-              // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary
-              // compiled with g++ in MSYS2 crashes here because the output memory is not aligned
-              // even though alignas is specified.
-              _mm256_loadu_si256
-              #else
-              _mm256_load_si256
-              #endif
-
-              (&input_vector[j]), _mm256_load_si256(&row[j]));
+              _mm256_load_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
           product = _mm256_madd_epi16(product, kOnes);
           sum = _mm256_add_epi32(sum, product);
         }
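The branches removed above existed because _mm256_load_si256() requires its operand to be 32-byte aligned (it typically faults otherwise), and g++ under MSYS2 did not reliably honour alignas on these buffers. Once the buffers come from std_aligned_alloc(), the aligned form is safe on every platform. A self-contained illustration of the aligned/unaligned distinction (not code from the patch), assuming AVX2:

    #include <immintrin.h>
    #include <cstdint>

    alignas(32) static std::int8_t buf[32];  // compiler-guaranteed 32-byte alignment

    __m256i load_demo() {
        // Aligned load: valid only because buf is 32-byte aligned.
        __m256i a = _mm256_load_si256(reinterpret_cast<const __m256i*>(buf));
        // Unaligned load: tolerates any address, historically a bit slower.
        __m256i b = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(buf));
        return _mm256_add_epi8(a, b);
    }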
diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h
index 822e60b0..9b5a5f5f 100644
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -86,50 +86,13 @@ namespace Eval::NNUE::Layers {
       const auto out = reinterpret_cast<__m256i*>(output);
       for (IndexType i = 0; i < kNumChunks; ++i) {
         const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
-
-            #if defined(__MINGW32__) || defined(__MINGW64__)
-            // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary
-            // compiled with g++ in MSYS2 crashes here because the output memory is not aligned
-            // even though alignas is specified.
-            _mm256_loadu_si256
-            #else
-            _mm256_load_si256
-            #endif
-
-            (&in[i * 4 + 0]),
-
-            #if defined(__MINGW32__) || defined(__MINGW64__)
-            _mm256_loadu_si256
-            #else
-            _mm256_load_si256
-            #endif
-
-            (&in[i * 4 + 1])), kWeightScaleBits);
+            _mm256_load_si256(&in[i * 4 + 0]),
+            _mm256_load_si256(&in[i * 4 + 1])), kWeightScaleBits);
         const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
-
-            #if defined(__MINGW32__) || defined(__MINGW64__)
-            _mm256_loadu_si256
-            #else
-            _mm256_load_si256
-            #endif
-
-            (&in[i * 4 + 2]),
-
-            #if defined(__MINGW32__) || defined(__MINGW64__)
-            _mm256_loadu_si256
-            #else
-            _mm256_load_si256
-            #endif
-
-            (&in[i * 4 + 3])), kWeightScaleBits);
-
-        #if defined(__MINGW32__) || defined(__MINGW64__)
-        _mm256_storeu_si256
-        #else
-        _mm256_store_si256
-        #endif
-
-        (&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
+            _mm256_load_si256(&in[i * 4 + 2]),
+            _mm256_load_si256(&in[i * 4 + 3])), kWeightScaleBits);
+        _mm256_store_si256(
+            &out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
             _mm256_packs_epi16(words0, words1), kZero), kOffsets));
       }
       constexpr IndexType kStart = kNumChunks * kSimdWidth;
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index a9fcf434..29e6db6e 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -126,36 +126,12 @@ namespace Eval::NNUE {
       auto out = reinterpret_cast<__m256i*>(&output[offset]);
       for (IndexType j = 0; j < kNumChunks; ++j) {
         __m256i sum0 =
-
-        #if defined(__MINGW32__) || defined(__MINGW64__)
-            // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary
-            // compiled with g++ in MSYS2 crashes here because the output memory is not aligned
-            // even though alignas is specified.
-            _mm256_loadu_si256
-        #else
-            _mm256_load_si256
-        #endif
-
-        (&reinterpret_cast<const __m256i*>(
+            _mm256_load_si256(&reinterpret_cast<const __m256i*>(
             accumulation[perspectives[p]][0])[j * 2 + 0]);
         __m256i sum1 =
-
-        #if defined(__MINGW32__) || defined(__MINGW64__)
-            _mm256_loadu_si256
-        #else
-            _mm256_load_si256
-        #endif
-
-        (&reinterpret_cast<const __m256i*>(
+            _mm256_load_si256(&reinterpret_cast<const __m256i*>(
             accumulation[perspectives[p]][0])[j * 2 + 1]);
-
-        #if defined(__MINGW32__) || defined(__MINGW64__)
-        _mm256_storeu_si256
-        #else
-        _mm256_store_si256
-        #endif
-
-        (&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
+        _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
             _mm256_packs_epi16(sum0, sum1), kZero), kControl));
       }
@@ -218,11 +194,7 @@ namespace Eval::NNUE {
       auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
       constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
       for (IndexType j = 0; j < kNumChunks; ++j) {
-        #if defined(__MINGW32__) || defined(__MINGW64__)
-        _mm256_storeu_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadu_si256(&accumulation[j]), column[j]));
-        #else
         accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]);
-        #endif
       }

 #elif defined(USE_SSE2)
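For readers following the SIMD in these hunks: _mm256_maddubs_epi16() multiplies unsigned 8-bit activations by signed 8-bit weights and adds adjacent products into 16-bit lanes, and a follow-up _mm256_madd_epi16() against a vector of ones widens those into 32-bit partial sums. A standalone sketch of one such chunk (illustrative, not the Stockfish code), assuming AVX2:

    #include <immintrin.h>
    #include <cstdint>

    // One AVX2 chunk of an int8 dot product: 32 activations times
    // 32 weights, reduced to eight 32-bit partial sums.
    __m256i dot_chunk(const std::uint8_t* input, const std::int8_t* weights) {
        const __m256i ones = _mm256_set1_epi16(1);
        __m256i in = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(input));
        __m256i w  = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(weights));
        __m256i p  = _mm256_maddubs_epi16(in, w);  // u8*s8 -> paired s16 sums
        return _mm256_madd_epi16(p, ones);         // s16 pairs -> s32 sums
    }

The pack-and-permute sequence in clipped_relu.h is the inverse step: it narrows 32-bit sums back to clamped int8 outputs for the next layer.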
diff --git a/src/thread.cpp b/src/thread.cpp
index 44aea14e..1aa66a81 100644
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -204,21 +204,18 @@ void ThreadPool::start_thinking(Position& pos, StateListPtr& states,

   // We use Position::set() to set root position across threads. But there are
   // some StateInfo fields (previous, pliesFromNull, capturedPiece) that cannot
-  // be deduced from a fen string, so set() clears them and to not lose the info
-  // we need to backup and later restore setupStates->back(). Note that setupStates
-  // is shared by threads but is accessed in read-only mode.
-  StateInfo tmp = setupStates->back();
-
+  // be deduced from a fen string, so set() clears them and they are set from
+  // setupStates->back() later. The rootState is per thread, earlier states are shared
+  // since they are read-only.
   for (Thread* th : *this)
   {
       th->nodes = th->tbHits = th->nmpMinPly = th->bestMoveChanges = 0;
       th->rootDepth = th->completedDepth = 0;
       th->rootMoves = rootMoves;
-      th->rootPos.set(pos.fen(), pos.is_chess960(), &setupStates->back(), th);
+      th->rootPos.set(pos.fen(), pos.is_chess960(), &th->rootState, th);
+      th->rootState = setupStates->back();
   }

-  setupStates->back() = tmp;
-
   main()->start_searching();
 }
diff --git a/src/thread.h b/src/thread.h
index 46da1e34..042bc2e9 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -65,6 +65,7 @@ public:
   std::atomic<uint64_t> nodes, tbHits, bestMoveChanges;

   Position rootPos;
+  StateInfo rootState;
   Search::RootMoves rootMoves;
   Depth rootDepth, completedDepth;
   CounterMoveHistory counterMoves;
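The thread.cpp/thread.h change replaces a backup-and-restore of the shared setupStates->back() with a per-thread rootState copy, so each thread's root StateInfo lives in its own Thread object and no thread ever writes to shared state. A minimal sketch of the same pattern with generic names (State and run_workers are hypothetical, not the Stockfish API):

    #include <thread>
    #include <vector>

    struct State { int pliesFromNull = 0; };

    // Each worker receives a private copy of the template state and may
    // mutate it freely; the shared original is only ever read.
    void run_workers(const State& shared, int n) {
        std::vector<std::thread> pool;
        for (int i = 0; i < n; ++i)
            pool.emplace_back([copy = shared]() mutable {
                ++copy.pliesFromNull;   // private mutation, no data race
            });
        for (auto& t : pool)
            t.join();
    }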