diff --git a/AUTHORS b/AUTHORS
index c00ab657..198dfa5a 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -63,6 +63,7 @@ Gary Heckman (gheckman)
 George Sobala (gsobala)
 gguliash
 Gian-Carlo Pascutto (gcp)
+Deshawn Mohan-Smith (GoldenRare)
 Gontran Lemaire (gonlem)
 Goodkov Vasiliy Aleksandrovich (goodkov)
 Gregor Cramer
diff --git a/src/Makefile b/src/Makefile
index 639cf6f9..815a197b 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -410,19 +410,6 @@ ifeq ($(COMP),clang)
 endif
 endif

-ifeq ($(comp),icc)
-	profile_make = icc-profile-make
-	profile_use = icc-profile-use
-else
-ifeq ($(comp),clang)
-	profile_make = clang-profile-make
-	profile_use = clang-profile-use
-else
-	profile_make = gcc-profile-make
-	profile_use = gcc-profile-use
-endif
-endif
-
 ifeq ($(KERNEL),Darwin)
 	CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.14
 	LDFLAGS += -arch $(arch) -mmacosx-version-min=10.14
@@ -434,20 +421,30 @@ endif
 # Currently we don't know how to make PGO builds with the NDK yet.
 ifeq ($(COMP),ndk)
 	CXXFLAGS += -stdlib=libc++ -fPIE
+	comp=clang
 	ifeq ($(arch),armv7)
-		comp=armv7a-linux-androideabi16-clang
 		CXX=armv7a-linux-androideabi16-clang++
 		CXXFLAGS += -mthumb -march=armv7-a -mfloat-abi=softfp -mfpu=neon
 		STRIP=arm-linux-androideabi-strip
 	endif
 	ifeq ($(arch),armv8)
-		comp=aarch64-linux-android21-clang
 		CXX=aarch64-linux-android21-clang++
 		STRIP=aarch64-linux-android-strip
 	endif
 	LDFLAGS += -static-libstdc++ -pie -lm -latomic
 endif

+ifeq ($(comp),icc)
+	profile_make = icc-profile-make
+	profile_use = icc-profile-use
+else ifeq ($(comp),clang)
+	profile_make = clang-profile-make
+	profile_use = clang-profile-use
+else
+	profile_make = gcc-profile-make
+	profile_use = gcc-profile-use
+endif
+
 ### Travis CI script uses COMPILER to overwrite CXX
 ifdef COMPILER
 	COMPCXX=$(COMPILER)
@@ -619,10 +616,7 @@ endif
 ### needs access to the optimization flags.
 ifeq ($(optimize),yes)
 ifeq ($(debug), no)
-	ifeq ($(COMP),ndk)
-		CXXFLAGS += -flto=thin
-		LDFLAGS += $(CXXFLAGS)
-	else ifeq ($(comp),clang)
+	ifeq ($(comp),clang)
 		CXXFLAGS += -flto=thin
 		ifneq ($(findstring MINGW,$(KERNEL)),)
 			CXXFLAGS += -fuse-ld=lld
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 3461ebf8..b4c65bdf 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -29,6 +29,56 @@ namespace Eval::NNUE {

+  // If vector instructions are enabled, we update and refresh the
+  // accumulator tile by tile such that each tile fits in the CPU's
+  // vector registers.
+  #define TILING
+
+  #ifdef USE_AVX512
+  typedef __m512i vec_t;
+  #define vec_load(a) _mm512_loadA_si512(a)
+  #define vec_store(a,b) _mm512_storeA_si512(a,b)
+  #define vec_add_16(a,b) _mm512_add_epi16(a,b)
+  #define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
+  static constexpr IndexType kNumRegs = 8; // only 8 are needed
+
+  #elif USE_AVX2
+  typedef __m256i vec_t;
+  #define vec_load(a) _mm256_loadA_si256(a)
+  #define vec_store(a,b) _mm256_storeA_si256(a,b)
+  #define vec_add_16(a,b) _mm256_add_epi16(a,b)
+  #define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
+  static constexpr IndexType kNumRegs = 16;
+
+  #elif USE_SSE2
+  typedef __m128i vec_t;
+  #define vec_load(a) (*(a))
+  #define vec_store(a,b) *(a)=(b)
+  #define vec_add_16(a,b) _mm_add_epi16(a,b)
+  #define vec_sub_16(a,b) _mm_sub_epi16(a,b)
+  static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8;
+
+  #elif USE_MMX
+  typedef __m64 vec_t;
+  #define vec_load(a) (*(a))
+  #define vec_store(a,b) *(a)=(b)
+  #define vec_add_16(a,b) _mm_add_pi16(a,b)
+  #define vec_sub_16(a,b) _mm_sub_pi16(a,b)
+  static constexpr IndexType kNumRegs = 8;
+
+  #elif USE_NEON
+  typedef int16x8_t vec_t;
+  #define vec_load(a) (*(a))
+  #define vec_store(a,b) *(a)=(b)
+  #define vec_add_16(a,b) vaddq_s16(a,b)
+  #define vec_sub_16(a,b) vsubq_s16(a,b)
+  static constexpr IndexType kNumRegs = 16;
+
+  #else
+  #undef TILING
+
+  #endif
+
   // Input feature converter
   class FeatureTransformer {

@@ -36,6 +86,11 @@ namespace Eval::NNUE {
     // Number of output dimensions for one side
     static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;

+  #ifdef TILING
+    static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2;
+    static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions");
+  #endif
+
    public:
     // Output type
     using OutputType = TransformedFeatureType;
@@ -205,57 +260,41 @@ namespace Eval::NNUE {
         RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i], active_indices);
         for (Color perspective : { WHITE, BLACK }) {
+  #ifdef TILING
+          for (unsigned j = 0; j < kHalfDimensions / kTileHeight; ++j) {
+            auto biasesTile = reinterpret_cast<const vec_t*>(
+                &biases_[j * kTileHeight]);
+            auto accTile = reinterpret_cast<vec_t*>(
+                &accumulator.accumulation[perspective][i][j * kTileHeight]);
+            vec_t acc[kNumRegs];
+
+            for (unsigned k = 0; k < kNumRegs; ++k)
+              acc[k] = biasesTile[k];
+
+            for (const auto index : active_indices[perspective]) {
+              const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+              auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+
+              for (unsigned k = 0; k < kNumRegs; ++k)
+                acc[k] = vec_add_16(acc[k], column[k]);
+            }
+
+            for (unsigned k = 0; k < kNumRegs; k++)
+              vec_store(&accTile[k], acc[k]);
+          }
+  #else
           std::memcpy(accumulator.accumulation[perspective][i], biases_,
-                     kHalfDimensions * sizeof(BiasType));
+                      kHalfDimensions * sizeof(BiasType));
+
           for (const auto index : active_indices[perspective]) {
             const IndexType offset = kHalfDimensions * index;
-  #if defined(USE_AVX512)
-            auto accumulation = reinterpret_cast<__m512i*>(
-                &accumulator.accumulation[perspective][i][0]);
-            auto column = reinterpret_cast<const __m512i*>(&weights_[offset]);
-            constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
-            for (IndexType j = 0; j < kNumChunks; ++j)
-              _mm512_storeA_si512(&accumulation[j], _mm512_add_epi16(_mm512_loadA_si512(&accumulation[j]), column[j]));
-  #elif defined(USE_AVX2)
-            auto accumulation = reinterpret_cast<__m256i*>(
-                &accumulator.accumulation[perspective][i][0]);
-            auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
-            constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-            for (IndexType j = 0; j < kNumChunks; ++j)
-              _mm256_storeA_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadA_si256(&accumulation[j]), column[j]));
-
-  #elif defined(USE_SSE2)
-            auto accumulation = reinterpret_cast<__m128i*>(
-                &accumulator.accumulation[perspective][i][0]);
-            auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
-            constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-            for (IndexType j = 0; j < kNumChunks; ++j)
-              accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
-
-  #elif defined(USE_MMX)
-            auto accumulation = reinterpret_cast<__m64*>(
-                &accumulator.accumulation[perspective][i][0]);
-            auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
-            constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-            for (IndexType j = 0; j < kNumChunks; ++j)
-              accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
-
-  #elif defined(USE_NEON)
-            auto accumulation = reinterpret_cast<int16x8_t*>(
-                &accumulator.accumulation[perspective][i][0]);
-            auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
-            constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-            for (IndexType j = 0; j < kNumChunks; ++j)
-              accumulation[j] = vaddq_s16(accumulation[j], column[j]);
-
-  #else
             for (IndexType j = 0; j < kHalfDimensions; ++j)
               accumulator.accumulation[perspective][i][j] += weights_[offset + j];
-  #endif
-
           }
+  #endif
         }
+
   #if defined(USE_MMX)
       _mm_empty();
   #endif
@@ -273,29 +312,55 @@
         bool reset[2];
         RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i], removed_indices, added_indices, reset);
-        for (Color perspective : { WHITE, BLACK }) {
-  #if defined(USE_AVX2)
-          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          auto accumulation = reinterpret_cast<__m256i*>(
-              &accumulator.accumulation[perspective][i][0]);
+  #ifdef TILING
+        for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
+          for (Color perspective : { WHITE, BLACK }) {
+            auto accTile = reinterpret_cast<vec_t*>(
+                &accumulator.accumulation[perspective][i][j * kTileHeight]);
+            vec_t acc[kNumRegs];
-  #elif defined(USE_SSE2)
-          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          auto accumulation = reinterpret_cast<__m128i*>(
-              &accumulator.accumulation[perspective][i][0]);
+            if (reset[perspective]) {
+              auto biasesTile = reinterpret_cast<const vec_t*>(
+                  &biases_[j * kTileHeight]);
+              for (unsigned k = 0; k < kNumRegs; ++k)
+                acc[k] = biasesTile[k];
+            } else {
+              auto prevAccTile = reinterpret_cast<const vec_t*>(
+                  &prev_accumulator.accumulation[perspective][i][j * kTileHeight]);
+              for (IndexType k = 0; k < kNumRegs; ++k)
+                acc[k] = vec_load(&prevAccTile[k]);
-  #elif defined(USE_MMX)
-          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          auto accumulation = reinterpret_cast<__m64*>(
-              &accumulator.accumulation[perspective][i][0]);
+              // Difference calculation for the deactivated features
+              for (const auto index : removed_indices[perspective]) {
+                const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+                auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
-  #elif defined(USE_NEON)
-          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          auto accumulation = reinterpret_cast<int16x8_t*>(
-              &accumulator.accumulation[perspective][i][0]);
+                for (IndexType k = 0; k < kNumRegs; ++k)
+                  acc[k] = vec_sub_16(acc[k], column[k]);
+              }
+            }
+            { // Difference calculation for the activated features
+              for (const auto index : added_indices[perspective]) {
+                const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+                auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+
+                for (IndexType k = 0; k < kNumRegs; ++k)
+                  acc[k] = vec_add_16(acc[k], column[k]);
+              }
+            }
+
+            for (IndexType k = 0; k < kNumRegs; ++k)
+              vec_store(&accTile[k], acc[k]);
+          }
+        }
+  #if defined(USE_MMX)
+        _mm_empty();
+  #endif
+
+  #else
+        for (Color perspective : { WHITE, BLACK }) {
+
           if (reset[perspective]) {
             std::memcpy(accumulator.accumulation[perspective][i], biases_,
                         kHalfDimensions * sizeof(BiasType));
@@ -307,67 +372,19 @@
           for (const auto index : removed_indices[perspective]) {
             const IndexType offset = kHalfDimensions * index;
-  #if defined(USE_AVX2)
-            auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j)
-              accumulation[j] = _mm256_sub_epi16(accumulation[j], column[j]);
-
-  #elif defined(USE_SSE2)
-            auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j)
-              accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]);
-
-  #elif defined(USE_MMX)
-            auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j)
-              accumulation[j] = _mm_sub_pi16(accumulation[j], column[j]);
-
-  #elif defined(USE_NEON)
-            auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j)
-              accumulation[j] = vsubq_s16(accumulation[j], column[j]);
-
-  #else
             for (IndexType j = 0; j < kHalfDimensions; ++j)
               accumulator.accumulation[perspective][i][j] -= weights_[offset + j];
-  #endif
-
           }
         }
         { // Difference calculation for the activated features
           for (const auto index : added_indices[perspective]) {
             const IndexType offset = kHalfDimensions * index;
-  #if defined(USE_AVX2)
-            auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j)
-              accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]);
-
-  #elif defined(USE_SSE2)
-            auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j)
-              accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
-
-  #elif defined(USE_MMX)
-            auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j)
-              accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
-
-  #elif defined(USE_NEON)
-            auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j)
-              accumulation[j] = vaddq_s16(accumulation[j], column[j]);
-
-  #else
             for (IndexType j = 0; j < kHalfDimensions; ++j)
               accumulator.accumulation[perspective][i][j] += weights_[offset + j];
-  #endif
-
           }
         }
       }
-  #if defined(USE_MMX)
-      _mm_empty();
   #endif

       accumulator.computed_accumulation = true;
diff --git a/src/position.h b/src/position.h
index e3f758e0..a5b0d445 100644
--- a/src/position.h
+++ b/src/position.h
@@ -194,6 +194,7 @@ public:
   // Returns the position of the ball on the c side.
   Square king_square(Color c) const { return pieceList[make_piece(c, KING)][0]; }
 #endif // EVAL_LEARN

+  bool RootInTB;
 private:
   // Initialization helpers (used while setting up a position)
diff --git a/src/search.cpp b/src/search.cpp
index 5c3dce01..69c25cfc 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -43,7 +43,6 @@ namespace Search {

 namespace Tablebases {

   int Cardinality;
-  bool RootInTB;
   bool UseRule50;
   Depth ProbeDepth;
 }
@@ -520,7 +519,7 @@ void Thread::search() {
          {
              totBestMoveChanges += th->bestMoveChanges;
              th->bestMoveChanges = 0;
          }
-          double bestMoveInstability = 1 + totBestMoveChanges / Threads.size();
+          double bestMoveInstability = 1 + 2 * totBestMoveChanges / Threads.size();

          double totalTime = rootMoves.size() == 1 ? 0 :
                             Time.optimum() * fallingEval * reduction * bestMoveInstability;
@@ -654,9 +653,7 @@ namespace {
    // starts with statScore = 0. Later grandchildren start with the last calculated
    // statScore of the previous grandchild. This influences the reduction rules in
    // LMR which are based on the statScore of parent position.
-    if (rootNode)
-        (ss+4)->statScore = 0;
-    else
+    if (!rootNode)
        (ss+2)->statScore = 0;

    // Step 4. Transposition table lookup. We don't want the score of a partial
@@ -1062,7 +1059,6 @@ moves_loop: // When in check, search starts from here
              if (   !givesCheck
                  && lmrDepth < 6
                  && !(PvNode && abs(bestValue) < 2)
-                  && PieceValue[MG][type_of(movedPiece)] >= PieceValue[MG][type_of(pos.piece_on(to_sq(move)))]
                  && !ss->inCheck
                  && ss->staticEval + 169 + 244 * lmrDepth
                     + PieceValue[MG][type_of(pos.piece_on(to_sq(move)))] <= alpha)
@@ -1133,11 +1129,6 @@ moves_loop: // When in check, search starts from here
                  && pos.non_pawn_material() <= 2 * RookValueMg)
                  extension = 1;

-              // Castling extension
-              if (   type_of(move) == CASTLING
-                  && popcount(pos.pieces(us) & ~pos.pieces(PAWN) & (to_sq(move) & KingSide ? KingSide : QueenSide)) <= 2)
-                  extension = 1;
-
              // Late irreversible move extension
              if (   move == ttMove
                  && pos.rule50_count() > 80
@@ -1853,7 +1844,7 @@ string UCI::pv(const Position& pos, Depth depth, Value alpha, Value beta) {
   size_t pvIdx = pos.this_thread()->pvIdx;
   size_t multiPV = std::min((size_t)Options["MultiPV"], rootMoves.size());
   uint64_t nodesSearched = Threads.nodes_searched();
-  uint64_t tbHits = Threads.tb_hits() + (TB::RootInTB ? rootMoves.size() : 0);
+  uint64_t tbHits = Threads.tb_hits() + (pos.RootInTB ? rootMoves.size() : 0);

   for (size_t i = 0; i < multiPV; ++i)
   {
@@ -1868,7 +1859,7 @@ string UCI::pv(const Position& pos, Depth depth, Value alpha, Value beta) {
       if (v == -VALUE_INFINITE)
           v = VALUE_ZERO;

-      bool tb = TB::RootInTB && abs(v) < VALUE_MATE_IN_MAX_PLY;
+      bool tb = pos.RootInTB && abs(v) < VALUE_MATE_IN_MAX_PLY;
       v = tb ? rootMoves[i].tbScore : v;

       if (ss.rdbuf()->in_avail()) // Not at first line
@@ -1935,7 +1926,7 @@ bool RootMove::extract_ponder_from_tt(Position& pos) {

 void Tablebases::rank_root_moves(Position& pos, Search::RootMoves& rootMoves) {

-    RootInTB = false;
+    pos.RootInTB = false;
     UseRule50 = bool(Options["Syzygy50MoveRule"]);
     ProbeDepth = int(Options["SyzygyProbeDepth"]);
     Cardinality = int(Options["SyzygyProbeLimit"]);
@@ -1952,17 +1943,17 @@ void Tablebases::rank_root_moves(Position& pos, Search::RootMoves& rootMoves) {
     if (Cardinality >= popcount(pos.pieces()) && !pos.can_castle(ANY_CASTLING))
     {
         // Rank moves using DTZ tables
-        RootInTB = root_probe(pos, rootMoves);
+        pos.RootInTB = root_probe(pos, rootMoves);

-        if (!RootInTB)
+        if (!pos.RootInTB)
         {
             // DTZ tables are missing; try to rank moves using WDL tables
             dtz_available = false;
-            RootInTB = root_probe_wdl(pos, rootMoves);
+            pos.RootInTB = root_probe_wdl(pos, rootMoves);
         }
     }

-    if (RootInTB)
+    if (pos.RootInTB)
     {
         // Sort moves according to TB rank
         std::stable_sort(rootMoves.begin(), rootMoves.end(),
diff --git a/src/tt.cpp b/src/tt.cpp
index 16d12993..283e4a7a 100644
--- a/src/tt.cpp
+++ b/src/tt.cpp
@@ -32,7 +32,27 @@ TranspositionTable TT; // Our global transposition table
 /// overwriting an old position. Update is not atomic and can be racy.

 void TTEntry::save(Key k, Value v, bool pv, Bound b, Depth d, Move m, Value ev) {

+  if (Options["Training"])
+      return;
+  // Preserve any existing move for the same position
+  if (m || (uint16_t)k != key16)
+      move16 = (uint16_t)m;
+
+  // Overwrite less valuable entries (cheapest checks first)
+  if (   b == BOUND_EXACT
+      || (uint16_t)k != key16
+      || d - DEPTH_OFFSET > depth8 - 4)
+  {
+      assert(d > DEPTH_OFFSET);
+      assert(d < 256 + DEPTH_OFFSET);
+
+      key16     = (uint16_t)k;
+      depth8    = (uint8_t)(d - DEPTH_OFFSET);
+      genBound8 = (uint8_t)(TT.generation8 | uint8_t(pv) << 2 | b);
+      value16   = (int16_t)v;
+      eval16    = (int16_t)ev;
+  }
 }

@@ -97,7 +117,32 @@ void TranspositionTable::clear() {
 /// TTEntry t2 if its replace value is greater than that of t2.

 TTEntry* TranspositionTable::probe(const Key key, bool& found) const {

-  return found = false, first_entry(0);
+  if (Options["Training"])
+      return found = false, first_entry(0);
+
+  TTEntry* const tte = first_entry(key);
+  const uint16_t key16 = (uint16_t)key;  // Use the low 16 bits as key inside the cluster
+
+  for (int i = 0; i < ClusterSize; ++i)
+      if (tte[i].key16 == key16 || !tte[i].depth8)
+      {
+          tte[i].genBound8 = uint8_t(generation8 | (tte[i].genBound8 & 0x7)); // Refresh
+
+          return found = (bool)tte[i].depth8, &tte[i];
+      }
+
+  // Find an entry to be replaced according to the replacement strategy
+  TTEntry* replace = tte;
+  for (int i = 1; i < ClusterSize; ++i)
+      // Due to our packed storage format for generation and its cyclic
+      // nature we add 263 (256 is the modulus plus 7 to keep the unrelated
+      // lowest three bits from affecting the result) to calculate the entry
+      // age correctly even after generation8 overflows into the next cycle.
+      if (  replace->depth8 - ((263 + generation8 - replace->genBound8) & 0xF8)
+          > tte[i].depth8 - ((263 + generation8 - tte[i].genBound8) & 0xF8))
+          replace = &tte[i];
+
+  return found = false, replace;
 }
diff --git a/src/uci.cpp b/src/uci.cpp
index bdecaca1..d51d1610 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -200,7 +200,7 @@ namespace UCI {

       if (token == "go" || token == "eval")
       {
-          cerr << "\nPosition: " << cnt++ << '/' << num << endl;
+          cerr << "\nPosition: " << cnt++ << '/' << num << " (" << pos.fen() << ")" << endl;
           if (token == "go")
           {
              go(pos, is, states);