From 027626db1e449597ba2211a0819f251beda37b88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ste=CC=81phane=20Nicolet?= Date: Thu, 12 Nov 2020 14:05:28 +0100 Subject: [PATCH 1/4] Small cleanups 13 No functional change --- AUTHORS | 2 +- src/evaluate.cpp | 10 +++++----- src/misc.cpp | 3 +-- src/nnue/nnue_feature_transformer.h | 2 +- src/pawns.cpp | 4 ++-- src/search.cpp | 2 +- src/types.h | 4 ++-- 7 files changed, 13 insertions(+), 14 deletions(-) diff --git a/AUTHORS b/AUTHORS index f30be4de..71b718b8 100644 --- a/AUTHORS +++ b/AUTHORS @@ -44,6 +44,7 @@ Daniel Dugovic (ddugovic) Dariusz Orzechowski (dorzechowski) David Zar Daylen Yang (daylen) +Deshawn Mohan-Smith (GoldenRare) DiscanX Dominik Schlösser (domschl) double-beep @@ -64,7 +65,6 @@ Gary Heckman (gheckman) George Sobala (gsobala) gguliash Gian-Carlo Pascutto (gcp) -Deshawn Mohan-Smith (GoldenRare) Gontran Lemaire (gonlem) Goodkov Vasiliy Aleksandrovich (goodkov) Gregor Cramer diff --git a/src/evaluate.cpp b/src/evaluate.cpp index 34ebe6c3..1a8cf662 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -1025,7 +1025,7 @@ Value Eval::evaluate(const Position& pos) { { // Scale and shift NNUE for compatibility with search and classical evaluation auto adjusted_NNUE = [&](){ - int mat = pos.non_pawn_material() + PieceValue[MG][PAWN] * pos.count(); + int mat = pos.non_pawn_material() + PawnValueMg * pos.count(); return NNUE::evaluate(pos) * (720 + mat / 32) / 1024 + Tempo; }; @@ -1041,10 +1041,10 @@ Value Eval::evaluate(const Position& pos) { // For the case of opposite colored bishops, switch to NNUE eval with // small probability if the classical eval is less than the threshold. if ( largePsq - && (abs(v) * 16 < NNUEThreshold2 * r50 - || ( pos.opposite_bishops() - && abs(v) * 16 < (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50 - && !(pos.this_thread()->nodes & 0xB)))) + && ( abs(v) * 16 < NNUEThreshold2 * r50 + || ( pos.opposite_bishops() + && abs(v) * 16 < (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50 + && !(pos.this_thread()->nodes & 0xB)))) v = adjusted_NNUE(); } diff --git a/src/misc.cpp b/src/misc.cpp index a16a6e90..f2bce6b0 100644 --- a/src/misc.cpp +++ b/src/misc.cpp @@ -583,11 +583,10 @@ namespace CommandLine { string argv0; // path+name of the executable binary, as given by argv[0] string binaryDirectory; // path of the executable directory string workingDirectory; // path of the working directory -string pathSeparator; // Separator for our current OS void init(int argc, char* argv[]) { (void)argc; - string separator; + string pathSeparator; // extract the path+name of the executable binary argv0 = argv[0]; diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index f49777b5..85bc2bc8 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -247,7 +247,7 @@ namespace Eval::NNUE { // Look for a usable accumulator of an earlier position. We keep track // of the estimated gain in terms of features to be added/subtracted. 
StateInfo *st = pos.state(), *next = nullptr; - int gain = popcount(pos.pieces()) - 2; + int gain = pos.count() - 2; while (st->accumulator.state[c] == EMPTY) { auto& dp = st->dirtyPiece; diff --git a/src/pawns.cpp b/src/pawns.cpp index fde70ba5..68aaf331 100644 --- a/src/pawns.cpp +++ b/src/pawns.cpp @@ -176,8 +176,8 @@ namespace { score -= Doubled * doubled + WeakLever * more_than_one(lever); - if (blocked && r > RANK_4) - score += BlockedPawn[r-4]; + if (blocked && r >= RANK_5) + score += BlockedPawn[r - RANK_5]; } return score; diff --git a/src/search.cpp b/src/search.cpp index 66ef5043..78a1f7b6 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -1058,7 +1058,7 @@ moves_loop: // When in check, search starts from here && captureHistory[movedPiece][to_sq(move)][type_of(pos.piece_on(to_sq(move)))] < 0) continue; - // See based pruning + // SEE based pruning if (!pos.see_ge(move, Value(-221) * depth)) // (~25 Elo) continue; } diff --git a/src/types.h b/src/types.h index bf692f7e..8506b06e 100644 --- a/src/types.h +++ b/src/types.h @@ -202,8 +202,8 @@ enum PieceType { enum Piece { NO_PIECE, - W_PAWN = 1, W_KNIGHT, W_BISHOP, W_ROOK, W_QUEEN, W_KING, - B_PAWN = 9, B_KNIGHT, B_BISHOP, B_ROOK, B_QUEEN, B_KING, + W_PAWN = PAWN, W_KNIGHT, W_BISHOP, W_ROOK, W_QUEEN, W_KING, + B_PAWN = PAWN + 8, B_KNIGHT, B_BISHOP, B_ROOK, B_QUEEN, B_KING, PIECE_NB = 16 }; From 9fb6383ed804d0bc86d52b07def14352f44eb5b4 Mon Sep 17 00:00:00 2001 From: Unai Corzo Date: Tue, 24 Nov 2020 17:06:30 +0100 Subject: [PATCH 2/4] Assorted search and eval parameter tune Search and eval parameter tune. STC https://tests.stockfishchess.org/tests/view/5fba850a67cbf42301d6b07d LLR: 2.94 (-2.94,2.94) {-0.25,1.25} Total: 24312 W: 2388 L: 2228 D: 19696 Ptnml(0-2): 85, 1800, 8241, 1930, 100 LTC https://tests.stockfishchess.org/tests/view/5fbad5ea67cbf42301d6b0fa LLR: 2.95 (-2.94,2.94) {0.25,1.25} Total: 88376 W: 3619 L: 3351 D: 81406 Ptnml(0-2): 56, 2977, 37849, 3255, 51 closes https://github.com/official-stockfish/Stockfish/pull/3232 bench: 3600361 --- src/evaluate.cpp | 10 +++++----- src/search.cpp | 34 +++++++++++++++++----------------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/src/evaluate.cpp b/src/evaluate.cpp index 1a8cf662..3d887119 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -187,11 +187,11 @@ using namespace Trace; namespace { // Threshold for lazy and space evaluation - constexpr Value LazyThreshold1 = Value(1400); - constexpr Value LazyThreshold2 = Value(1300); - constexpr Value SpaceThreshold = Value(12222); - constexpr Value NNUEThreshold1 = Value(550); - constexpr Value NNUEThreshold2 = Value(150); + constexpr Value LazyThreshold1 = Value(1565); + constexpr Value LazyThreshold2 = Value(1102); + constexpr Value SpaceThreshold = Value(11551); + constexpr Value NNUEThreshold1 = Value(682); + constexpr Value NNUEThreshold2 = Value(176); // KingAttackWeights[PieceType] contains king attack weights by piece type constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 81, 52, 44, 10 }; diff --git a/src/search.cpp b/src/search.cpp index 78a1f7b6..7c797bef 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -65,7 +65,7 @@ namespace { // Razor and futility margins constexpr int RazorMargin = 510; Value futility_margin(Depth d, bool improving) { - return Value(223 * (d - improving)); + return Value(234 * (d - improving)); } // Reductions lookup table, initialized at startup @@ -73,7 +73,7 @@ namespace { Depth reduction(bool i, Depth d, int mn) { int r = Reductions[d] * Reductions[mn]; - return (r 
+ 509) / 1024 + (!i && r > 894); + return (r + 503) / 1024 + (!i && r > 915); } constexpr int futility_move_count(bool improving, Depth depth) { @@ -194,7 +194,7 @@ namespace { void Search::init() { for (int i = 1; i < MAX_MOVES; ++i) - Reductions[i] = int((22.0 + 2 * std::log(Threads.size())) * std::log(i + 0.25 * std::log(i))); + Reductions[i] = int((21.3 + 2 * std::log(Threads.size())) * std::log(i + 0.25 * std::log(i))); } @@ -410,7 +410,7 @@ void Thread::search() { beta = std::min(prev + delta, VALUE_INFINITE); // Adjust contempt based on root move's previousScore (dynamic contempt) - int dct = ct + (105 - ct / 2) * prev / (abs(prev) + 149); + int dct = ct + (113 - ct / 2) * prev / (abs(prev) + 147); contempt = (us == WHITE ? make_score(dct, dct / 2) : -make_score(dct, dct / 2)); @@ -830,7 +830,7 @@ namespace { && (ss-1)->statScore < 22977 && eval >= beta && eval >= ss->staticEval - && ss->staticEval >= beta - 30 * depth - 28 * improving + 84 * ss->ttPv + 182 + && ss->staticEval >= beta - 30 * depth - 28 * improving + 84 * ss->ttPv + 168 && !excludedMove && pos.non_pawn_material(us) && (ss->ply >= thisThread->nmpMinPly || us != thisThread->nmpColor)) @@ -838,7 +838,7 @@ namespace { assert(eval - beta >= 0); // Null move dynamic reduction based on depth and value - Depth R = (982 + 85 * depth) / 256 + std::min(int(eval - beta) / 192, 3); + Depth R = (1015 + 85 * depth) / 256 + std::min(int(eval - beta) / 191, 3); ss->currentMove = MOVE_NULL; ss->continuationHistory = &thisThread->continuationHistory[0][0][NO_PIECE][0]; @@ -855,7 +855,7 @@ namespace { if (nullValue >= VALUE_TB_WIN_IN_MAX_PLY) nullValue = beta; - if (thisThread->nmpMinPly || (abs(beta) < VALUE_KNOWN_WIN && depth < 13)) + if (thisThread->nmpMinPly || (abs(beta) < VALUE_KNOWN_WIN && depth < 14)) return nullValue; assert(!thisThread->nmpMinPly); // Recursive verification is not allowed @@ -874,7 +874,7 @@ namespace { } } - probCutBeta = beta + 176 - 49 * improving; + probCutBeta = beta + 183 - 49 * improving; // Step 10. 
ProbCut (~10 Elo) // If we have a good enough capture and a reduced search returns a value @@ -1039,7 +1039,7 @@ moves_loop: // When in check, search starts from here // Futility pruning: parent node (~5 Elo) if ( lmrDepth < 7 && !ss->inCheck - && ss->staticEval + 283 + 170 * lmrDepth <= alpha + && ss->staticEval + 266 + 170 * lmrDepth <= alpha && (*contHist[0])[movedPiece][to_sq(move)] + (*contHist[1])[movedPiece][to_sq(move)] + (*contHist[3])[movedPiece][to_sq(move)] @@ -1047,7 +1047,7 @@ moves_loop: // When in check, search starts from here continue; // Prune moves with negative SEE (~20 Elo) - if (!pos.see_ge(move, Value(-(29 - std::min(lmrDepth, 18)) * lmrDepth * lmrDepth))) + if (!pos.see_ge(move, Value(-(30 - std::min(lmrDepth, 18)) * lmrDepth * lmrDepth))) continue; } else @@ -1059,7 +1059,7 @@ moves_loop: // When in check, search starts from here continue; // SEE based pruning - if (!pos.see_ge(move, Value(-221) * depth)) // (~25 Elo) + if (!pos.see_ge(move, Value(-213) * depth)) // (~25 Elo) continue; } } @@ -1153,12 +1153,12 @@ moves_loop: // When in check, search starts from here || moveCountPruning || ss->staticEval + PieceValue[EG][pos.captured_piece()] <= alpha || cutNode - || thisThread->ttHitAverage < 427 * TtHitAverageResolution * TtHitAverageWindow / 1024)) + || thisThread->ttHitAverage < 432 * TtHitAverageResolution * TtHitAverageWindow / 1024)) { Depth r = reduction(improving, depth, moveCount); // Decrease reduction if the ttHit running average is large - if (thisThread->ttHitAverage > 509 * TtHitAverageResolution * TtHitAverageWindow / 1024) + if (thisThread->ttHitAverage > 537 * TtHitAverageResolution * TtHitAverageWindow / 1024) r--; // Increase reduction if other threads are searching this position @@ -1211,10 +1211,10 @@ moves_loop: // When in check, search starts from here - 5287; // Decrease/increase reduction by comparing opponent's stat score (~10 Elo) - if (ss->statScore >= -106 && (ss-1)->statScore < -104) + if (ss->statScore >= -105 && (ss-1)->statScore < -103) r--; - else if ((ss-1)->statScore >= -119 && ss->statScore < -140) + else if ((ss-1)->statScore >= -122 && ss->statScore < -129) r++; // Decrease/increase reduction for moves with a good/bad history (~30 Elo) @@ -1228,7 +1228,7 @@ moves_loop: // When in check, search starts from here // Unless giving check, this capture is likely bad if ( !givesCheck - && ss->staticEval + PieceValue[EG][pos.captured_piece()] + 213 * depth <= alpha) + && ss->staticEval + PieceValue[EG][pos.captured_piece()] + 210 * depth <= alpha) r++; } @@ -1502,7 +1502,7 @@ moves_loop: // When in check, search starts from here if (PvNode && bestValue > alpha) alpha = bestValue; - futilityBase = bestValue + 145; + futilityBase = bestValue + 155; } const PieceToHistory* contHist[] = { (ss-1)->continuationHistory, (ss-2)->continuationHistory, From 7615e3485e75c2f1715d372f7bb1f546738a5c76 Mon Sep 17 00:00:00 2001 From: MaximMolchanov Date: Sat, 14 Nov 2020 02:55:29 +0200 Subject: [PATCH 3/4] Calculate sum from first elements in affine transform for AVX512/AVX2/SSSE3 The idea is to initialize sum with the first element instead of zero. Reduce one add_epi32 and one set_zero SIMD instructions for each output dimension. 
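In scalar terms the transformation is simply peeling the first loop iteration into the accumulator's initializer; the one-line pseudocode that follows expresses the same rewrite. A minimal C++ sketch of the idea (illustrative only, added for clarity; `sum_peeled` is an invented name, not code from this patch):

    #include <cstdint>
    #include <cstddef>

    // Illustrative scalar analogue (not part of the patch): seeding the sum
    // with the first element removes one add-to-zero per output row, which is
    // what the SIMD code does by initializing each accumulator with
    // m512_dpbusd_epi32(input_vector512[0], row[0]) and starting the loop at 1.
    int32_t sum_peeled(const int32_t* a, std::size_t n) {
        // assumes n >= 1, which always holds for the padded NNUE dimensions
        int32_t sum = a[0];                  // was: sum = 0
        for (std::size_t i = 1; i < n; ++i)  // was: i = 0
            sum += a[i];
        return sum;
    }

In the AVX512/AVX2/SSSE3 paths this shows up as initializing sum0..sum3 with m*_dpbusd_epi32(input[0], row[0]) and starting the chunk loop at kStart = 1; the VNNI path keeps kStart = 0, since _mm*_dpbusd_epi32 already fuses the multiply-add into its accumulator operand and there is no separate add to save.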
sum = 0; for i = 1 to n sum += a[i] -> sum = a[1]; for i = 2 to n sum += a[i] STC: LLR: 2.95 (-2.94,2.94) {-0.25,1.25} Total: 69048 W: 7024 L: 6799 D: 55225 Ptnml(0-2): 260, 5175, 23458, 5342, 289 https://tests.stockfishchess.org/tests/view/5faf2cf467cbf42301d6aa06 closes https://github.com/official-stockfish/Stockfish/pull/3227 No functional change. --- AUTHORS | 1 + src/nnue/layers/affine_transform.h | 211 ++++++++++++++++++++--------- 2 files changed, 148 insertions(+), 64 deletions(-) diff --git a/AUTHORS b/AUTHORS index 71b718b8..b31a36e9 100644 --- a/AUTHORS +++ b/AUTHORS @@ -112,6 +112,7 @@ Mark Tenzer (31m059) marotear Matthew Lai (matthewlai) Matthew Sullivan (Matt14916) +Maxim Molchanov (Maxim) Michael An (man) Michael Byrne (MichaelB7) Michael Chaly (Vizvezdenec) diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index 47c9c488..caf315b2 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -181,13 +181,13 @@ namespace Eval::NNUE::Layers { return _mm512_add_epi32(_mm512_permutexvar_epi32(indices, x), bias); }; - [[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) { #if defined (USE_VNNI) + [[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) { acc = _mm512_dpbusd_epi32(acc, a, b); #else + [[maybe_unused]] auto m512_dpbusd_epi32 = [=](__m512i a, __m512i b) -> __m512i { __m512i product0 = _mm512_maddubs_epi16(a, b); - product0 = _mm512_madd_epi16(product0, kOnes512); - acc = _mm512_add_epi32(acc, product0); + return _mm512_madd_epi16(product0, kOnes512); #endif }; @@ -214,14 +214,13 @@ namespace Eval::NNUE::Layers { return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias); }; - - [[maybe_unused]] auto m256_add_dpbusd_epi32 = [=](__m256i& acc, __m256i a, __m256i b) { #if defined (USE_VNNI) + [[maybe_unused]] auto m256_add_dpbusd_epi32 = [=](__m256i& acc, __m256i a, __m256i b) { acc = _mm256_dpbusd_epi32(acc, a, b); #else + [[maybe_unused]] auto m256_dpbusd_epi32 = [=](__m256i a, __m256i b) -> __m256i { __m256i product0 = _mm256_maddubs_epi16(a, b); - product0 = _mm256_madd_epi16(product0, kOnes256); - acc = _mm256_add_epi32(acc, product0); + return _mm256_madd_epi16(product0, kOnes256); #endif }; @@ -246,10 +245,9 @@ namespace Eval::NNUE::Layers { return _mm_add_epi32(sum0, bias); }; - [[maybe_unused]] auto m128_add_dpbusd_epi32 = [=](__m128i& acc, __m128i a, __m128i b) { + [[maybe_unused]] auto m128_dpbusd_epi32 = [=](__m128i a, __m128i b) -> __m128i { __m128i product0 = _mm_maddubs_epi16(a, b); - product0 = _mm_madd_epi16(product0, kOnes128); - acc = _mm_add_epi32(acc, product0); + return _mm_madd_epi16(product0, kOnes128); }; #endif @@ -293,15 +291,6 @@ namespace Eval::NNUE::Layers { const __m512i bias = *reinterpret_cast(&biases_[i]); __m512i* outptr = reinterpret_cast<__m512i*>(&output[i]); - __m512i sum01a = _mm512_setzero_si512(); - __m512i sum23a = _mm512_setzero_si512(); - __m512i sum45a = _mm512_setzero_si512(); - __m512i sum67a = _mm512_setzero_si512(); - __m512i sum01b = _mm512_setzero_si512(); - __m512i sum23b = _mm512_setzero_si512(); - __m512i sum45b = _mm512_setzero_si512(); - __m512i sum67b = _mm512_setzero_si512(); - const auto row01a = *reinterpret_cast(&weights_[offset01a]); const auto row23a = *reinterpret_cast(&weights_[offset23a]); const auto row45a = *reinterpret_cast(&weights_[offset45a]); @@ -314,6 +303,16 @@ namespace Eval::NNUE::Layers { const __m256i in256 = input_vector256[0]; const __m512i in = 
_mm512_inserti64x4(_mm512_castsi256_si512(in256), in256, 1); +#if defined (USE_VNNI) + __m512i sum01a = _mm512_setzero_si512(); + __m512i sum23a = _mm512_setzero_si512(); + __m512i sum45a = _mm512_setzero_si512(); + __m512i sum67a = _mm512_setzero_si512(); + __m512i sum01b = _mm512_setzero_si512(); + __m512i sum23b = _mm512_setzero_si512(); + __m512i sum45b = _mm512_setzero_si512(); + __m512i sum67b = _mm512_setzero_si512(); + m512_add_dpbusd_epi32(sum01a, in, row01a); m512_add_dpbusd_epi32(sum23a, in, row23a); m512_add_dpbusd_epi32(sum45a, in, row45a); @@ -322,6 +321,16 @@ namespace Eval::NNUE::Layers { m512_add_dpbusd_epi32(sum23b, in, row23b); m512_add_dpbusd_epi32(sum45b, in, row45b); m512_add_dpbusd_epi32(sum67b, in, row67b); +#else + __m512i sum01a = m512_dpbusd_epi32(in, row01a); + __m512i sum23a = m512_dpbusd_epi32(in, row23a); + __m512i sum45a = m512_dpbusd_epi32(in, row45a); + __m512i sum67a = m512_dpbusd_epi32(in, row67a); + __m512i sum01b = m512_dpbusd_epi32(in, row01b); + __m512i sum23b = m512_dpbusd_epi32(in, row23b); + __m512i sum45b = m512_dpbusd_epi32(in, row45b); + __m512i sum67b = m512_dpbusd_epi32(in, row67b); +#endif *outptr = m512_hadd256x16( sum01a, sum23a, sum45a, sum67a, @@ -342,48 +351,80 @@ namespace Eval::NNUE::Layers { if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0) { - __m512i sum0 = _mm512_setzero_si512(); - __m512i sum1 = _mm512_setzero_si512(); - __m512i sum2 = _mm512_setzero_si512(); - __m512i sum3 = _mm512_setzero_si512(); - const auto row0 = reinterpret_cast(&weights_[offset0]); const auto row1 = reinterpret_cast(&weights_[offset1]); const auto row2 = reinterpret_cast(&weights_[offset2]); const auto row3 = reinterpret_cast(&weights_[offset3]); - for (IndexType j = 0; j < kNumChunks512; ++j) +#if defined (USE_VNNI) + __m512i sum0 = _mm512_setzero_si512(); + __m512i sum1 = _mm512_setzero_si512(); + __m512i sum2 = _mm512_setzero_si512(); + __m512i sum3 = _mm512_setzero_si512(); + const IndexType kStart = 0; +#else + __m512i sum0 = m512_dpbusd_epi32(input_vector512[0], row0[0]); + __m512i sum1 = m512_dpbusd_epi32(input_vector512[0], row1[0]); + __m512i sum2 = m512_dpbusd_epi32(input_vector512[0], row2[0]); + __m512i sum3 = m512_dpbusd_epi32(input_vector512[0], row3[0]); + const IndexType kStart = 1; +#endif + + for (IndexType j = kStart; j < kNumChunks512; ++j) { const __m512i in = input_vector512[j]; +#if defined (USE_VNNI) m512_add_dpbusd_epi32(sum0, in, row0[j]); m512_add_dpbusd_epi32(sum1, in, row1[j]); m512_add_dpbusd_epi32(sum2, in, row2[j]); m512_add_dpbusd_epi32(sum3, in, row3[j]); +#else + sum0 = _mm512_add_epi32(sum0, m512_dpbusd_epi32(in, row0[j])); + sum1 = _mm512_add_epi32(sum1, m512_dpbusd_epi32(in, row1[j])); + sum2 = _mm512_add_epi32(sum2, m512_dpbusd_epi32(in, row2[j])); + sum3 = _mm512_add_epi32(sum3, m512_dpbusd_epi32(in, row3[j])); +#endif } *outptr = m512_haddx4(sum0, sum1, sum2, sum3, bias); } else { - __m256i sum0 = _mm256_setzero_si256(); - __m256i sum1 = _mm256_setzero_si256(); - __m256i sum2 = _mm256_setzero_si256(); - __m256i sum3 = _mm256_setzero_si256(); - const auto row0 = reinterpret_cast(&weights_[offset0]); const auto row1 = reinterpret_cast(&weights_[offset1]); const auto row2 = reinterpret_cast(&weights_[offset2]); const auto row3 = reinterpret_cast(&weights_[offset3]); - for (IndexType j = 0; j < kNumChunks256; ++j) +#if defined (USE_VNNI) + __m256i sum0 = _mm256_setzero_si256(); + __m256i sum1 = _mm256_setzero_si256(); + __m256i sum2 = _mm256_setzero_si256(); + __m256i sum3 = _mm256_setzero_si256(); + const 
IndexType kStart = 0; +#else + __m256i sum0 = m256_dpbusd_epi32(input_vector256[0], row0[0]); + __m256i sum1 = m256_dpbusd_epi32(input_vector256[0], row1[0]); + __m256i sum2 = m256_dpbusd_epi32(input_vector256[0], row2[0]); + __m256i sum3 = m256_dpbusd_epi32(input_vector256[0], row3[0]); + const IndexType kStart = 1; +#endif + + for (IndexType j = kStart; j < kNumChunks256; ++j) { const __m256i in = input_vector256[j]; +#if defined (USE_VNNI) m256_add_dpbusd_epi32(sum0, in, row0[j]); m256_add_dpbusd_epi32(sum1, in, row1[j]); m256_add_dpbusd_epi32(sum2, in, row2[j]); m256_add_dpbusd_epi32(sum3, in, row3[j]); +#else + sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j])); + sum1 = _mm256_add_epi32(sum1, m256_dpbusd_epi32(in, row1[j])); + sum2 = _mm256_add_epi32(sum2, m256_dpbusd_epi32(in, row2[j])); + sum3 = _mm256_add_epi32(sum3, m256_dpbusd_epi32(in, row3[j])); +#endif } *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias); @@ -394,30 +435,50 @@ namespace Eval::NNUE::Layers { { if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0) { - __m512i sum0 = _mm512_setzero_si512(); - const auto row0 = reinterpret_cast(&weights_[0]); - for (IndexType j = 0; j < kNumChunks512; ++j) +#if defined (USE_VNNI) + __m512i sum0 = _mm512_setzero_si512(); + const IndexType kStart = 0; +#else + __m512i sum0 = m512_dpbusd_epi32(input_vector512[0], row0[0]); + const IndexType kStart = 1; +#endif + + for (IndexType j = kStart; j < kNumChunks512; ++j) { const __m512i in = input_vector512[j]; +#if defined (USE_VNNI) m512_add_dpbusd_epi32(sum0, in, row0[j]); +#else + sum0 = _mm512_add_epi32(sum0, m512_dpbusd_epi32(in, row0[j])); +#endif } output[0] = m512_hadd(sum0, biases_[0]); } else { - __m256i sum0 = _mm256_setzero_si256(); - const auto row0 = reinterpret_cast(&weights_[0]); - for (IndexType j = 0; j < kNumChunks256; ++j) +#if defined (USE_VNNI) + __m256i sum0 = _mm256_setzero_si256(); + const IndexType kStart = 0; +#else + __m256i sum0 = m256_dpbusd_epi32(input_vector256[0], row0[0]); + const IndexType kStart = 1; +#endif + + for (IndexType j = kStart; j < kNumChunks256; ++j) { const __m256i in = input_vector256[j]; +#if defined (USE_VNNI) m256_add_dpbusd_epi32(sum0, in, row0[j]); +#else + sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j])); +#endif } output[0] = m256_hadd(sum0, biases_[0]); @@ -451,24 +512,40 @@ namespace Eval::NNUE::Layers { const __m128i bias = *reinterpret_cast(&biases_[i]); __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]); - __m256i sum0 = _mm256_setzero_si256(); - __m256i sum1 = _mm256_setzero_si256(); - __m256i sum2 = _mm256_setzero_si256(); - __m256i sum3 = _mm256_setzero_si256(); - const auto row0 = reinterpret_cast(&weights_[offset0]); const auto row1 = reinterpret_cast(&weights_[offset1]); const auto row2 = reinterpret_cast(&weights_[offset2]); const auto row3 = reinterpret_cast(&weights_[offset3]); - for (IndexType j = 0; j < kNumChunks; ++j) +#if defined (USE_VNNI) + __m256i sum0 = _mm256_setzero_si256(); + __m256i sum1 = _mm256_setzero_si256(); + __m256i sum2 = _mm256_setzero_si256(); + __m256i sum3 = _mm256_setzero_si256(); + const IndexType kStart = 0; +#else + __m256i sum0 = m256_dpbusd_epi32(input_vector[0], row0[0]); + __m256i sum1 = m256_dpbusd_epi32(input_vector[0], row1[0]); + __m256i sum2 = m256_dpbusd_epi32(input_vector[0], row2[0]); + __m256i sum3 = m256_dpbusd_epi32(input_vector[0], row3[0]); + const IndexType kStart = 1; +#endif + + for (IndexType j = kStart; j < kNumChunks; ++j) { const __m256i in = input_vector[j]; +#if defined 
(USE_VNNI) m256_add_dpbusd_epi32(sum0, in, row0[j]); m256_add_dpbusd_epi32(sum1, in, row1[j]); m256_add_dpbusd_epi32(sum2, in, row2[j]); m256_add_dpbusd_epi32(sum3, in, row3[j]); +#else + sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j])); + sum1 = _mm256_add_epi32(sum1, m256_dpbusd_epi32(in, row1[j])); + sum2 = _mm256_add_epi32(sum2, m256_dpbusd_epi32(in, row2[j])); + sum3 = _mm256_add_epi32(sum3, m256_dpbusd_epi32(in, row3[j])); +#endif } *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias); @@ -476,15 +553,25 @@ namespace Eval::NNUE::Layers { } else if constexpr (kOutputDimensions == 1) { - __m256i sum0 = _mm256_setzero_si256(); - const auto row0 = reinterpret_cast(&weights_[0]); - for (IndexType j = 0; j < kNumChunks; ++j) +#if defined (USE_VNNI) + __m256i sum0 = _mm256_setzero_si256(); + const IndexType kStart = 0; +#else + __m256i sum0 = m256_dpbusd_epi32(input_vector[0], row0[0]); + const IndexType kStart = 1; +#endif + + for (IndexType j = kStart; j < kNumChunks; ++j) { const __m256i in = input_vector[j]; - m256_add_dpbusd_epi32(sum0, in, row0[j]); +#if defined (USE_VNNI) + m256_add_dpbusd_epi32(sum0, in, row0[j]); +#else + sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j])); +#endif } output[0] = m256_hadd(sum0, biases_[0]); @@ -517,24 +604,24 @@ namespace Eval::NNUE::Layers { const __m128i bias = *reinterpret_cast(&biases_[i]); __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]); - __m128i sum0 = _mm_setzero_si128(); - __m128i sum1 = _mm_setzero_si128(); - __m128i sum2 = _mm_setzero_si128(); - __m128i sum3 = _mm_setzero_si128(); - const auto row0 = reinterpret_cast(&weights_[offset0]); const auto row1 = reinterpret_cast(&weights_[offset1]); const auto row2 = reinterpret_cast(&weights_[offset2]); const auto row3 = reinterpret_cast(&weights_[offset3]); - for (int j = 0; j < (int)kNumChunks; j += 1) + __m128i sum0 = m128_dpbusd_epi32(input_vector[0], row0[0]); + __m128i sum1 = m128_dpbusd_epi32(input_vector[0], row1[0]); + __m128i sum2 = m128_dpbusd_epi32(input_vector[0], row2[0]); + __m128i sum3 = m128_dpbusd_epi32(input_vector[0], row3[0]); + + for (int j = 1; j < (int)kNumChunks; ++j) { const __m128i in = input_vector[j]; - m128_add_dpbusd_epi32(sum0, in, row0[j]); - m128_add_dpbusd_epi32(sum1, in, row1[j]); - m128_add_dpbusd_epi32(sum2, in, row2[j]); - m128_add_dpbusd_epi32(sum3, in, row3[j]); + sum0 = _mm_add_epi32(sum0, m128_dpbusd_epi32(in, row0[j])); + sum1 = _mm_add_epi32(sum1, m128_dpbusd_epi32(in, row1[j])); + sum2 = _mm_add_epi32(sum2, m128_dpbusd_epi32(in, row2[j])); + sum3 = _mm_add_epi32(sum3, m128_dpbusd_epi32(in, row3[j])); } *outptr = m128_haddx4(sum0, sum1, sum2, sum3, bias); @@ -542,16 +629,12 @@ namespace Eval::NNUE::Layers { } else if constexpr (kOutputDimensions == 1) { - __m128i sum0 = _mm_setzero_si128(); - const auto row0 = reinterpret_cast(&weights_[0]); - for (int j = 0; j < (int)kNumChunks; j += 1) - { - const __m128i in = input_vector[j]; + __m128i sum0 = m128_dpbusd_epi32(input_vector[0], row0[0]); - m128_add_dpbusd_epi32(sum0, in, row0[j]); - } + for (int j = 1; j < (int)kNumChunks; ++j) + sum0 = _mm_add_epi32(sum0, m128_dpbusd_epi32(input_vector[j], row0[j])); output[0] = m128_hadd(sum0, biases_[0]); } From 190dd26b9f1bc6442acf7b2ae4750eb4ab8b90bd Mon Sep 17 00:00:00 2001 From: Vizvezdenec Date: Thu, 26 Nov 2020 06:38:09 +0100 Subject: [PATCH 4/4] use classical for certain endgames. 
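(Not part of the original message: the gate this patch adds, restated as a standalone snippet for clarity. The helper name is invented; the real change is the strongClassical flag in the hunk below.)

    #include "types.h"  // assumes Stockfish's Value and RookValueMg

    // Use the classical evaluation when almost no material is left:
    // less than two rooks' worth of non-pawn material and at most one pawn.
    bool use_classical_for_endgame(Value nonPawnMaterial, int pawnCount) {
        return nonPawnMaterial < 2 * RookValueMg && pawnCount < 2;
    }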
STC https://tests.stockfishchess.org/tests/view/5fbc64c067cbf42301d6b1d6
LLR: 2.97 (-2.94,2.94) {-0.25,1.25}
Total: 53360 W: 5223 L: 5024 D: 43113
Ptnml(0-2): 184, 3877, 18390, 4014, 215

LTC https://tests.stockfishchess.org/tests/view/5fbc97f267cbf42301d6b1ee
LLR: 2.96 (-2.94,2.94) {0.25,1.25}
Total: 126472 W: 5111 L: 4766 D: 116595
Ptnml(0-2): 50, 4032, 54749, 4333, 72

closes https://github.com/official-stockfish/Stockfish/pull/3240

bench: 3820648
---
 src/evaluate.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 3d887119..90d11a00 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -1035,12 +1035,14 @@ Value Eval::evaluate(const Position& pos) {
       bool largePsq = psq * 16 > (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50;
       bool classical = largePsq || (psq > PawnValueMg / 4 && !(pos.this_thread()->nodes & 0xB));

-      v = classical ? Evaluation<NO_TRACE>(pos).value() : adjusted_NNUE();
+      bool strongClassical = pos.non_pawn_material() < 2 * RookValueMg && pos.count<PAWN>() < 2;
+
+      v = classical || strongClassical ? Evaluation<NO_TRACE>(pos).value() : adjusted_NNUE();

       // If the classical eval is small and imbalance large, use NNUE nevertheless.
       // For the case of opposite colored bishops, switch to NNUE eval with
       // small probability if the classical eval is less than the threshold.
-      if (  largePsq
+      if (  largePsq && !strongClassical
          && (  abs(v) * 16 < NNUEThreshold2 * r50
             || (   pos.opposite_bishops()
                 && abs(v) * 16 < (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50