diff --git a/AUTHORS b/AUTHORS index c12b98a0..69d682f1 100644 --- a/AUTHORS +++ b/AUTHORS @@ -27,6 +27,7 @@ Andy Duplain Antoine Champion (antoinechampion) Aram Tumanian (atumanian) Arjun Temurnikar +Artem Solopiy (EntityFX) Auguste Pop Balint Pfliegel Ben Koshy (BKSpurgeon) diff --git a/README.md b/README.md index 690eaf3b..b2ce535e 100644 --- a/README.md +++ b/README.md @@ -33,9 +33,14 @@ This distribution of Stockfish consists of the following files: * a file with the .nnue extension, storing the neural network for the NNUE evaluation. Binary distributions will have this file embedded. -## UCI options +## The UCI protocol and available options -Currently, Stockfish has the following UCI options: +The Universal Chess Interface (UCI) is a standard protocol used to communicate with a chess engine, +and is the recommended way to do so for typical graphical user interfaces (GUI) or chess tools. + +Stockfish implements most commands as described in [the UCI protocol](https://www.shredderchess.com/download/div/uci.zip) + +For users, the following UCI options, which can typically be set via a GUI, are available in Stockfish: * #### Threads The number of CPU threads used for searching a position. For best performance, set @@ -136,6 +141,33 @@ Currently, Stockfish has the following UCI options: * #### Debug Log File Write all communication to and from the engine into a text file. +For developers the following non-standard commands might be of interest, mainly useful for debugging: + + * #### bench ttSize threads limit fenFile limitType evalType + Performs a standard benchmark using various options. The signature or standard node + count is obtained using all defaults. `bench` is currently `bench 16 1 13 default depth mixed`. + + * #### compiler + Give information about the compiler and environment used for building a binary. + + * #### d + Display the current position, with ascii art and fen. + + * #### eval + Return the evaluation of the current position. 
+ + * #### export_net [filename] + Exports the currently loaded network to a file. + If the currently loaded network is the embedded network and the filename + is not specified then the network is saved to the file matching the name + of the embedded network, as defined in evaluate.h. + If the currently loaded network is not the embedded network (some net set + through the UCI setoption) then the filename parameter is required and the + network is saved into that file. + + * #### flip + Flips the side to move. + ### Generating Training Data To generate training data from the classic eval, use the generate_training_data command with the setting "Use NNUE" set to "false". The given example is generation in its simplest form. There are more commands. diff --git a/src/Makefile b/src/Makefile index cc0f7391..45ae7e5f 100644 --- a/src/Makefile +++ b/src/Makefile @@ -107,6 +107,7 @@ ifeq ($(ARCH), $(filter $(ARCH), \ x86-64-vnni512 x86-64-vnni256 x86-64-avx512 x86-64-bmi2 x86-64-avx2 \ x86-64-sse41-popcnt x86-64-modern x86-64-ssse3 x86-64-sse3-popcnt \ x86-64 x86-32-sse41-popcnt x86-32-sse2 x86-32 ppc-64 ppc-32 \ + e2k \ armv7 armv7-neon armv8 apple-silicon general-64 general-32)) SUPPORTED_ARCH=true else @@ -301,6 +302,17 @@ ifeq ($(ARCH),ppc-64) prefetch = yes endif +ifeq ($(findstring e2k,$(ARCH)),e2k) + arch = e2k + mmx = yes + bits = 64 + sse = yes + sse2 = yes + ssse3 = yes + sse41 = yes + popcnt = yes +endif + endif ### ========================================================================== @@ -524,7 +536,6 @@ ifeq ($(popcnt),yes) endif endif - ifeq ($(avx2),yes) CXXFLAGS += -DUSE_AVX2 ifeq ($(comp),$(filter $(comp),gcc clang mingw)) @@ -692,6 +703,7 @@ help: @echo "armv7 > ARMv7 32-bit" @echo "armv7-neon > ARMv7 32-bit with popcnt and neon" @echo "armv8 > ARMv8 64-bit with popcnt and neon" + @echo "e2k > Elbrus 2000" @echo "apple-silicon > Apple silicon ARM64" @echo "general-64 > unspecified 64-bit" @echo "general-32 > unspecified 32-bit" @@ -841,6 +853,7 @@ 
config-sanity: net @test "$(SUPPORTED_ARCH)" = "true" @test "$(arch)" = "any" || test "$(arch)" = "x86_64" || test "$(arch)" = "i386" || \ test "$(arch)" = "ppc64" || test "$(arch)" = "ppc" || \ + test "$(arch)" = "e2k" || \ test "$(arch)" = "armv7" || test "$(arch)" = "armv8" || test "$(arch)" = "arm64" @test "$(bits)" = "32" || test "$(bits)" = "64" @test "$(prefetch)" = "yes" || test "$(prefetch)" = "no" diff --git a/src/evaluate.cpp b/src/evaluate.cpp index 3d9a48b7..b7e0bb82 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -35,6 +35,7 @@ #include "misc.h" #include "pawns.h" #include "thread.h" +#include "timeman.h" #include "uci.h" #include "incbin/incbin.h" @@ -126,8 +127,28 @@ namespace Eval { } } - /// NNUE::verify() verifies that the last net used was loaded successfully - void verify() { + void NNUE::export_net(const std::optional& filename) { + std::string actualFilename; + if (filename.has_value()) { + actualFilename = filename.value(); + } else { + if (eval_file_loaded != EvalFileDefaultName) { + sync_cout << "Failed to export a net. A non-embedded net can only be saved if the filename is specified." << sync_endl; + return; + } + actualFilename = EvalFileDefaultName; + } + + ofstream stream(actualFilename, std::ios_base::binary); + if (save_eval(stream)) { + sync_cout << "Network saved successfully to " << actualFilename << "." << sync_endl; + } else { + sync_cout << "Failed to export a net." 
<< sync_endl; + } + } + + /// NNUE::verify() verifies that the last net used was loaded successfully + void NNUE::verify() { string eval_file = string(Options["EvalFile"]); @@ -1120,7 +1141,7 @@ Value Eval::evaluate(const Position& pos) { + material / 32 - 4 * pos.rule50_count(); - Value nnue = NNUE::evaluate(pos) * scale / 1024 + Tempo; + Value nnue = NNUE::evaluate(pos) * scale / 1024 + Time.tempoNNUE; if (pos.is_chess960()) nnue += fix_FRC(pos); diff --git a/src/evaluate.h b/src/evaluate.h index af3453b4..f237cd22 100644 --- a/src/evaluate.h +++ b/src/evaluate.h @@ -20,6 +20,7 @@ #define EVALUATE_H_INCLUDED #include +#include #include "types.h" @@ -50,7 +51,9 @@ namespace Eval { Value evaluate(const Position& pos); bool load_eval(std::string name, std::istream& stream); + bool save_eval(std::ostream& stream); void init(); + void export_net(const std::optional& filename); void verify(); } diff --git a/src/misc.cpp b/src/misc.cpp index b58695ec..7a5559ce 100644 --- a/src/misc.cpp +++ b/src/misc.cpp @@ -51,7 +51,7 @@ typedef bool(*fun3_t)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY); #include #endif -#if defined(__APPLE__) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32)) +#if defined(__APPLE__) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32)) || defined(__e2k__) #define POSIXALIGNEDALLOC #include #endif @@ -194,6 +194,18 @@ std::string compiler_info() { compiler += "(version "; compiler += stringify(_MSC_FULL_VER) "." 
stringify(_MSC_BUILD); compiler += ")"; + #elif defined(__e2k__) && defined(__LCC__) + #define dot_ver2(n) \ + compiler += (char)'.'; \ + compiler += (char)('0' + (n) / 10); \ + compiler += (char)('0' + (n) % 10); + + compiler += "MCST LCC "; + compiler += "(version "; + compiler += std::to_string(__LCC__ / 100); + dot_ver2(__LCC__ % 100) + dot_ver2(__LCC_MINOR__) + compiler += ")"; #elif __GNUC__ compiler += "g++ (GNUC) "; compiler += make_version_string(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__); diff --git a/src/misc.h b/src/misc.h index b7d3c78a..66e869d7 100644 --- a/src/misc.h +++ b/src/misc.h @@ -86,6 +86,49 @@ T* align_ptr_up(T* ptr) return reinterpret_cast(reinterpret_cast((ptrint + (Alignment - 1)) / Alignment * Alignment)); } +template +class ValueListInserter { +public: + ValueListInserter(T* v, std::size_t& s) : + values(v), + size(&s) + { + } + + void push_back(const T& value) { values[(*size)++] = value; } +private: + T* values; + std::size_t* size; +}; + +template +class ValueList { + +public: + std::size_t size() const { return size_; } + void resize(std::size_t newSize) { size_ = newSize; } + void push_back(const T& value) { values_[size_++] = value; } + T& operator[](std::size_t index) { return values_[index]; } + T* begin() { return values_; } + T* end() { return values_ + size_; } + const T& operator[](std::size_t index) const { return values_[index]; } + const T* begin() const { return values_; } + const T* end() const { return values_ + size_; } + operator ValueListInserter() { return ValueListInserter(values_, size_); } + + void swap(ValueList& other) { + const std::size_t maxSize = std::max(size_, other.size_); + for (std::size_t i = 0; i < maxSize; ++i) { + std::swap(values_[i], other.values_[i]); + } + std::swap(size_, other.size_); + } + +private: + T values_[MaxSize]; + std::size_t size_ = 0; +}; + // This logger allows printing many parts in a region atomically // but doesn't block the threads trying to append to other regions. 
// Instead if some region tries to pring while other region holds diff --git a/src/movegen.cpp b/src/movegen.cpp index 50496136..be168450 100644 --- a/src/movegen.cpp +++ b/src/movegen.cpp @@ -58,19 +58,16 @@ namespace { constexpr Direction UpLeft = (Us == WHITE ? NORTH_WEST : SOUTH_EAST); const Square ksq = pos.square(Them); - Bitboard emptySquares; + const Bitboard emptySquares = Type == QUIETS || Type == QUIET_CHECKS ? target : ~pos.pieces(); + const Bitboard enemies = Type == EVASIONS ? pos.checkers() + : Type == CAPTURES ? target : pos.pieces(Them); Bitboard pawnsOn7 = pos.pieces(Us, PAWN) & TRank7BB; Bitboard pawnsNotOn7 = pos.pieces(Us, PAWN) & ~TRank7BB; - Bitboard enemies = (Type == EVASIONS ? pos.checkers(): - Type == CAPTURES ? target : pos.pieces(Them)); - // Single and double pawn pushes, no promotions if (Type != CAPTURES) { - emptySquares = (Type == QUIETS || Type == QUIET_CHECKS ? target : ~pos.pieces()); - Bitboard b1 = shift(pawnsNotOn7) & emptySquares; Bitboard b2 = shift(b1 & TRank3BB) & emptySquares; @@ -82,22 +79,12 @@ namespace { if (Type == QUIET_CHECKS) { - b1 &= pawn_attacks_bb(Them, ksq); - b2 &= pawn_attacks_bb(Them, ksq); - - // Add pawn pushes which give discovered check. This is possible only - // if the pawn is not on the same file as the enemy king, because we - // don't generate captures. Note that a possible discovered check - // promotion has been already generated amongst the captures. - Bitboard dcCandidateQuiets = pos.blockers_for_king(Them) & pawnsNotOn7; - if (dcCandidateQuiets) - { - Bitboard dc1 = shift(dcCandidateQuiets) & emptySquares & ~file_bb(ksq); - Bitboard dc2 = shift(dc1 & TRank3BB) & emptySquares; - - b1 |= dc1; - b2 |= dc2; - } + // To make a quiet check, you either make a direct check by pushing a pawn + // or push a blocker pawn that is not on the same file as the enemy king. + // Discovered check promotion has been already generated amongst the captures. 
+ Bitboard dcCandidatePawns = pos.blockers_for_king(Them) & ~file_bb(ksq); + b1 &= pawn_attacks_bb(Them, ksq) | shift< Up>(dcCandidatePawns); + b2 &= pawn_attacks_bb(Them, ksq) | shift(dcCandidatePawns); } while (b1) @@ -116,16 +103,13 @@ namespace { // Promotions and underpromotions if (pawnsOn7) { - if (Type == CAPTURES) - emptySquares = ~pos.pieces(); - - if (Type == EVASIONS) - emptySquares &= target; - Bitboard b1 = shift(pawnsOn7) & enemies; Bitboard b2 = shift(pawnsOn7) & enemies; Bitboard b3 = shift(pawnsOn7) & emptySquares; + if (Type == EVASIONS) + b3 &= target; + while (b1) moveList = make_promotions(moveList, pop_lsb(b1), ksq); @@ -175,19 +159,20 @@ namespace { } - template - ExtMove* generate_moves(const Position& pos, ExtMove* moveList, Bitboard piecesToMove, Bitboard target) { + template + ExtMove* generate_moves(const Position& pos, ExtMove* moveList, Bitboard target) { static_assert(Pt != KING && Pt != PAWN, "Unsupported piece type in generate_moves()"); - Bitboard bb = piecesToMove & pos.pieces(Pt); + Bitboard bb = pos.pieces(Us, Pt); while (bb) { Square from = pop_lsb(bb); - Bitboard b = attacks_bb(from, pos.pieces()) & target; - if constexpr (Checks) + + // To check, you either move freely a blocker or make a direct check. 
+ if (Checks && (Pt == QUEEN || !(pos.blockers_for_king(~Us) & from))) b &= pos.check_squares(Pt); while (b) @@ -204,42 +189,34 @@ namespace { static_assert(Type != LEGAL, "Unsupported type in generate_all()"); constexpr bool Checks = Type == QUIET_CHECKS; // Reduce template instantiations - Bitboard target, piecesToMove = pos.pieces(Us); + const Square ksq = pos.square(Us); + Bitboard target; - if(Type == QUIET_CHECKS) - piecesToMove &= ~pos.blockers_for_king(~Us); + if (Type == EVASIONS && more_than_one(pos.checkers())) + goto kingMoves; // Double check, only a king move can save the day - switch (Type) - { - case CAPTURES: - target = pos.pieces(~Us); - break; - case QUIETS: - case QUIET_CHECKS: - target = ~pos.pieces(); - break; - case EVASIONS: - target = between_bb(pos.square(Us), lsb(pos.checkers())); - break; - case NON_EVASIONS: - target = ~pos.pieces(Us); - break; - } + target = Type == EVASIONS ? between_bb(ksq, lsb(pos.checkers())) + : Type == NON_EVASIONS ? ~pos.pieces( Us) + : Type == CAPTURES ? pos.pieces(~Us) + : ~pos.pieces( ); // QUIETS || QUIET_CHECKS moveList = generate_pawn_moves(pos, moveList, target); - moveList = generate_moves(pos, moveList, piecesToMove, target); - moveList = generate_moves(pos, moveList, piecesToMove, target); - moveList = generate_moves< ROOK, Checks>(pos, moveList, piecesToMove, target); - moveList = generate_moves< QUEEN, Checks>(pos, moveList, piecesToMove, target); + moveList = generate_moves(pos, moveList, target); + moveList = generate_moves(pos, moveList, target); + moveList = generate_moves(pos, moveList, target); + moveList = generate_moves(pos, moveList, target); - if (Type != QUIET_CHECKS && Type != EVASIONS) +kingMoves: + if (!Checks || pos.blockers_for_king(~Us) & ksq) { - Square ksq = pos.square(Us); - Bitboard b = attacks_bb(ksq) & target; + Bitboard b = attacks_bb(ksq) & (Type == EVASIONS ? 
~pos.pieces(Us) : target); + if (Checks) + b &= ~attacks_bb(pos.square(~Us)); + while (b) *moveList++ = make_move(ksq, pop_lsb(b)); - if ((Type != CAPTURES) && pos.can_castle(Us & ANY_CASTLING)) + if ((Type == QUIETS || Type == NON_EVASIONS) && pos.can_castle(Us & ANY_CASTLING)) for (CastlingRights cr : { Us & KING_SIDE, Us & QUEEN_SIDE } ) if (!pos.castling_impeded(cr) && pos.can_castle(cr)) *moveList++ = make(ksq, pos.castling_rook_square(cr)); @@ -253,6 +230,8 @@ namespace { /// Generates all pseudo-legal captures plus queen and checking knight promotions /// Generates all pseudo-legal non-captures and underpromotions (except checking knight) +/// Generates all pseudo-legal check evasions when the side to move is in check +/// Generates all pseudo-legal non-captures giving check, except castling /// Generates all pseudo-legal captures and non-captures /// /// Returns a pointer to the end of the move list. @@ -260,8 +239,8 @@ namespace { template ExtMove* generate(const Position& pos, ExtMove* moveList) { - static_assert(Type == CAPTURES || Type == QUIETS || Type == NON_EVASIONS, "Unsupported type in generate()"); - assert(!pos.checkers()); + static_assert(Type != LEGAL, "Unsupported type in generate()"); + assert((Type == EVASIONS) == (bool)pos.checkers()); Color us = pos.side_to_move(); @@ -272,62 +251,11 @@ ExtMove* generate(const Position& pos, ExtMove* moveList) { // Explicit template instantiations template ExtMove* generate(const Position&, ExtMove*); template ExtMove* generate(const Position&, ExtMove*); +template ExtMove* generate(const Position&, ExtMove*); +template ExtMove* generate(const Position&, ExtMove*); template ExtMove* generate(const Position&, ExtMove*); -/// generate generates all pseudo-legal non-captures giving check, -/// except castling. Returns a pointer to the end of the move list. 
-template<> -ExtMove* generate(const Position& pos, ExtMove* moveList) { - - assert(!pos.checkers()); - - Color us = pos.side_to_move(); - Bitboard dc = pos.blockers_for_king(~us) & pos.pieces(us) & ~pos.pieces(PAWN); - - while (dc) - { - Square from = pop_lsb(dc); - PieceType pt = type_of(pos.piece_on(from)); - - Bitboard b = attacks_bb(pt, from, pos.pieces()) & ~pos.pieces(); - - if (pt == KING) - b &= ~attacks_bb(pos.square(~us)); - - while (b) - *moveList++ = make_move(from, pop_lsb(b)); - } - - return us == WHITE ? generate_all(pos, moveList) - : generate_all(pos, moveList); -} - - -/// generate generates all pseudo-legal check evasions when the side -/// to move is in check. Returns a pointer to the end of the move list. -template<> -ExtMove* generate(const Position& pos, ExtMove* moveList) { - - assert(pos.checkers()); - - Color us = pos.side_to_move(); - Square ksq = pos.square(us); - - // Generate evasions for king - Bitboard b = attacks_bb(ksq) & ~pos.pieces(us); - while (b) - *moveList++ = make_move(ksq, pop_lsb(b)); - - if (more_than_one(pos.checkers())) - return moveList; // Double check, only a king move can save the day - - // Generate blocking interpositions or captures of the checking piece - return us == WHITE ? 
generate_all(pos, moveList) - : generate_all(pos, moveList); -} - - /// generate generates all the legal moves in the given position template<> diff --git a/src/nnue/architectures/halfkp_256x2-32-32.h b/src/nnue/architectures/halfkp_256x2-32-32.h deleted file mode 100644 index a6768204..00000000 --- a/src/nnue/architectures/halfkp_256x2-32-32.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - Stockfish, a UCI chess playing engine derived from Glaurung 2.1 - Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file) - - Stockfish is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - Stockfish is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -// Definition of input features and network structure used in NNUE evaluation function - -#ifndef NNUE_HALFKP_256X2_32_32_H_INCLUDED -#define NNUE_HALFKP_256X2_32_32_H_INCLUDED - -#include "../features/feature_set.h" -#include "../features/half_kp.h" - -#include "../layers/input_slice.h" -#include "../layers/affine_transform.h" -#include "../layers/clipped_relu.h" - -namespace Stockfish::Eval::NNUE { - -// Input features used in evaluation function -using RawFeatures = Features::FeatureSet< - Features::HalfKP>; - -// Number of input feature dimensions after conversion -constexpr IndexType kTransformedFeatureDimensions = 256; - -namespace Layers { - -// Define network structure -using InputLayer = InputSlice; -using HiddenLayer1 = ClippedReLU>; -using HiddenLayer2 = ClippedReLU>; -using OutputLayer = AffineTransform; - -} // namespace Layers - -using Network = Layers::OutputLayer; - -} // namespace Stockfish::Eval::NNUE - -#endif // #ifndef NNUE_HALFKP_256X2_32_32_H_INCLUDED diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp index 5416f13e..e0d4b911 100644 --- a/src/nnue/evaluate_nnue.cpp +++ b/src/nnue/evaluate_nnue.cpp @@ -32,26 +32,27 @@ namespace Stockfish::Eval::NNUE { // Input feature converter - LargePagePtr feature_transformer; + LargePagePtr featureTransformer; // Evaluation function AlignedPtr network; // Evaluation function file name std::string fileName; + std::string netDescription; namespace Detail { // Initialize the evaluation function parameters template - void Initialize(AlignedPtr& pointer) { + void initialize(AlignedPtr& pointer) { pointer.reset(reinterpret_cast(std_aligned_alloc(alignof(T), sizeof(T)))); std::memset(pointer.get(), 0, sizeof(T)); } template - void Initialize(LargePagePtr& pointer) { + void initialize(LargePagePtr& pointer) { static_assert(alignof(T) <= 4096, "aligned_large_pages_alloc() may fail for such a big alignment requirement of T"); 
pointer.reset(reinterpret_cast(aligned_large_pages_alloc(sizeof(T)))); @@ -60,85 +61,120 @@ namespace Stockfish::Eval::NNUE { // Read evaluation function parameters template - bool ReadParameters(std::istream& stream, T& reference) { + bool read_parameters(std::istream& stream, T& reference) { std::uint32_t header; header = read_little_endian(stream); - if (!stream || header != T::GetHashValue()) return false; - return reference.ReadParameters(stream); + if (!stream || header != T::get_hash_value()) return false; + return reference.read_parameters(stream); + } + + // Write evaluation function parameters + template + bool write_parameters(std::ostream& stream, const T& reference) { + + write_little_endian(stream, T::get_hash_value()); + return reference.write_parameters(stream); } } // namespace Detail // Initialize the evaluation function parameters - void Initialize() { + void initialize() { - Detail::Initialize(feature_transformer); - Detail::Initialize(network); + Detail::initialize(featureTransformer); + Detail::initialize(network); } // Read network header - bool ReadHeader(std::istream& stream, std::uint32_t* hash_value, std::string* architecture) + bool read_header(std::istream& stream, std::uint32_t* hashValue, std::string* desc) { std::uint32_t version, size; version = read_little_endian(stream); - *hash_value = read_little_endian(stream); + *hashValue = read_little_endian(stream); size = read_little_endian(stream); - if (!stream || version != kVersion) return false; - architecture->resize(size); - stream.read(&(*architecture)[0], size); + if (!stream || version != Version) return false; + desc->resize(size); + stream.read(&(*desc)[0], size); + return !stream.fail(); + } + + // Write network header + bool write_header(std::ostream& stream, std::uint32_t hashValue, const std::string& desc) + { + write_little_endian(stream, Version); + write_little_endian(stream, hashValue); + write_little_endian(stream, desc.size()); + stream.write(&desc[0], desc.size()); 
return !stream.fail(); } // Read network parameters - bool ReadParameters(std::istream& stream) { + bool read_parameters(std::istream& stream) { - std::uint32_t hash_value; - std::string architecture; - if (!ReadHeader(stream, &hash_value, &architecture)) return false; - if (hash_value != kHashValue) return false; - if (!Detail::ReadParameters(stream, *feature_transformer)) return false; - if (!Detail::ReadParameters(stream, *network)) return false; + std::uint32_t hashValue; + if (!read_header(stream, &hashValue, &netDescription)) return false; + if (hashValue != HashValue) return false; + if (!Detail::read_parameters(stream, *featureTransformer)) return false; + if (!Detail::read_parameters(stream, *network)) return false; return stream && stream.peek() == std::ios::traits_type::eof(); } + // Write network parameters + bool write_parameters(std::ostream& stream) { + + if (!write_header(stream, HashValue, netDescription)) return false; + if (!Detail::write_parameters(stream, *featureTransformer)) return false; + if (!Detail::write_parameters(stream, *network)) return false; + return (bool)stream; + } + // Evaluation function. Perform differential calculation. Value evaluate(const Position& pos) { // We manually align the arrays on the stack because with gcc < 9.3 // overaligning stack variables with alignas() doesn't work correctly. 
- constexpr uint64_t alignment = kCacheLineSize; + constexpr uint64_t alignment = CacheLineSize; #if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN) - TransformedFeatureType transformed_features_unaligned[ - FeatureTransformer::kBufferSize + alignment / sizeof(TransformedFeatureType)]; - char buffer_unaligned[Network::kBufferSize + alignment]; + TransformedFeatureType transformedFeaturesUnaligned[ + FeatureTransformer::BufferSize + alignment / sizeof(TransformedFeatureType)]; + char bufferUnaligned[Network::BufferSize + alignment]; - auto* transformed_features = align_ptr_up(&transformed_features_unaligned[0]); - auto* buffer = align_ptr_up(&buffer_unaligned[0]); + auto* transformedFeatures = align_ptr_up(&transformedFeaturesUnaligned[0]); + auto* buffer = align_ptr_up(&bufferUnaligned[0]); #else alignas(alignment) - TransformedFeatureType transformed_features[FeatureTransformer::kBufferSize]; - alignas(alignment) char buffer[Network::kBufferSize]; + TransformedFeatureType transformedFeatures[FeatureTransformer::BufferSize]; + alignas(alignment) char buffer[Network::BufferSize]; #endif - ASSERT_ALIGNED(transformed_features, alignment); + ASSERT_ALIGNED(transformedFeatures, alignment); ASSERT_ALIGNED(buffer, alignment); - feature_transformer->Transform(pos, transformed_features); - const auto output = network->Propagate(transformed_features, buffer); + featureTransformer->transform(pos, transformedFeatures); + const auto output = network->propagate(transformedFeatures, buffer); - return static_cast(output[0] / FV_SCALE); + return static_cast(output[0] / OutputScale); } // Load eval, from a file stream or a memory stream bool load_eval(std::string name, std::istream& stream) { - Initialize(); + initialize(); fileName = name; - return ReadParameters(stream); + return read_parameters(stream); + } + + // Save eval, to a file stream or a memory stream + bool save_eval(std::ostream& stream) { + + if (fileName.empty()) + return false; + + return write_parameters(stream); } } // 
namespace Stockfish::Eval::NNUE diff --git a/src/nnue/evaluate_nnue.h b/src/nnue/evaluate_nnue.h index 24aa6cc0..c7fa4a96 100644 --- a/src/nnue/evaluate_nnue.h +++ b/src/nnue/evaluate_nnue.h @@ -28,8 +28,8 @@ namespace Stockfish::Eval::NNUE { // Hash value of evaluation function structure - constexpr std::uint32_t kHashValue = - FeatureTransformer::GetHashValue() ^ Network::GetHashValue(); + constexpr std::uint32_t HashValue = + FeatureTransformer::get_hash_value() ^ Network::get_hash_value(); // Deleter for automating release of memory area template diff --git a/src/nnue/features/feature_set.h b/src/nnue/features/feature_set.h deleted file mode 100644 index a3fea9c0..00000000 --- a/src/nnue/features/feature_set.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - Stockfish, a UCI chess playing engine derived from Glaurung 2.1 - Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file) - - Stockfish is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - Stockfish is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -// A class template that represents the input feature set of the NNUE evaluation function - -#ifndef NNUE_FEATURE_SET_H_INCLUDED -#define NNUE_FEATURE_SET_H_INCLUDED - -#include "features_common.h" -#include - -namespace Stockfish::Eval::NNUE::Features { - - // Class template that represents a list of values - template - struct CompileTimeList; - - template - struct CompileTimeList { - static constexpr bool Contains(T value) { - return value == First || CompileTimeList::Contains(value); - } - static constexpr std::array - kValues = {{First, Remaining...}}; - }; - - // Base class of feature set - template - class FeatureSetBase { - - }; - - // Class template that represents the feature set - template - class FeatureSet : public FeatureSetBase> { - - public: - // Hash value embedded in the evaluation file - static constexpr std::uint32_t kHashValue = FeatureType::kHashValue; - // Number of feature dimensions - static constexpr IndexType kDimensions = FeatureType::kDimensions; - // Maximum number of simultaneously active features - static constexpr IndexType kMaxActiveDimensions = - FeatureType::kMaxActiveDimensions; - // Trigger for full calculation instead of difference calculation - using SortedTriggerSet = - CompileTimeList; - static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues; - - }; - -} // namespace Stockfish::Eval::NNUE::Features - -#endif // #ifndef NNUE_FEATURE_SET_H_INCLUDED diff --git a/src/nnue/features/features_common.h b/src/nnue/features/features_common.h deleted file mode 100644 index 118ec953..00000000 --- a/src/nnue/features/features_common.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - Stockfish, a UCI chess playing engine derived from Glaurung 2.1 - Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file) - - Stockfish is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your 
option) any later version. - - Stockfish is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -//Common header of input features of NNUE evaluation function - -#ifndef NNUE_FEATURES_COMMON_H_INCLUDED -#define NNUE_FEATURES_COMMON_H_INCLUDED - -#include "../../evaluate.h" -#include "../nnue_common.h" - -namespace Stockfish::Eval::NNUE::Features { - - class IndexList; - - template - class FeatureSet; - - // Trigger to perform full calculations instead of difference only - enum class TriggerEvent { - kFriendKingMoved // calculate full evaluation when own king moves - }; - - enum class Side { - kFriend // side to move - }; - -} // namespace Stockfish::Eval::NNUE::Features - -#endif // #ifndef NNUE_FEATURES_COMMON_H_INCLUDED diff --git a/src/nnue/features/half_kp.cpp b/src/nnue/features/half_kp.cpp index 8e6907ae..aa1decee 100644 --- a/src/nnue/features/half_kp.cpp +++ b/src/nnue/features/half_kp.cpp @@ -19,69 +19,68 @@ //Definition of input features HalfKP of NNUE evaluation function #include "half_kp.h" -#include "index_list.h" + +#include "../../position.h" namespace Stockfish::Eval::NNUE::Features { // Orient a square according to perspective (rotates by 180 for black) - inline Square orient(Color perspective, Square s) { + inline Square HalfKP::orient(Color perspective, Square s) { return Square(int(s) ^ (bool(perspective) * 63)); } // Index of a feature for a given king position and another piece on some square - inline IndexType make_index(Color perspective, Square s, Piece pc, Square ksq) { - return IndexType(orient(perspective, s) + kpp_board_index[perspective][pc] + PS_END * ksq); + inline IndexType HalfKP::make_index(Color perspective, Square s, Piece pc, Square ksq) { + 
return IndexType(orient(perspective, s) + PieceSquareIndex[perspective][pc] + PS_NB * ksq); } // Get a list of indices for active features - template - void HalfKP::AppendActiveIndices( - const Position& pos, Color perspective, IndexList* active) { - + void HalfKP::append_active_indices( + const Position& pos, + Color perspective, + ValueListInserter active + ) { Square ksq = orient(perspective, pos.square(perspective)); Bitboard bb = pos.pieces() & ~pos.pieces(KING); while (bb) { Square s = pop_lsb(bb); - active->push_back(make_index(perspective, s, pos.piece_on(s), ksq)); + active.push_back(make_index(perspective, s, pos.piece_on(s), ksq)); } } - // AppendChangedIndices() : get a list of indices for recently changed features + // append_changed_indices() : get a list of indices for recently changed features - // IMPORTANT: The `pos` in this function is pretty much useless as it - // is not always the position the features are updated to. The feature - // transformer code right now can update multiple accumulators per move, - // but since Stockfish only keeps the full state of the current leaf - // search position it is not possible to always pass here the position for - // which the accumulator is being updated. Therefore the only thing that - // can be reliably extracted from `pos` is the king square for the king - // of the `perspective` color (note: not even the other king's square will - // match reality in all cases, this is also the reason why `dp` is passed - // as a parameter and not extracted from pos.state()). This is of particular - // problem for future nets with other feature sets, where updating the active - // feature might require more information from the intermediate positions. In - // this case the only easy solution is to remove the multiple updates from - // the feature transformer update code and only update the accumulator for - // the current leaf position (the position after the move). 
- - template - void HalfKP::AppendChangedIndices( - const Position& pos, const DirtyPiece& dp, Color perspective, - IndexList* removed, IndexList* added) { - - Square ksq = orient(perspective, pos.square(perspective)); + void HalfKP::append_changed_indices( + Square ksq, + StateInfo* st, + Color perspective, + ValueListInserter removed, + ValueListInserter added + ) { + const auto& dp = st->dirtyPiece; + Square oriented_ksq = orient(perspective, ksq); for (int i = 0; i < dp.dirty_num; ++i) { Piece pc = dp.piece[i]; if (type_of(pc) == KING) continue; if (dp.from[i] != SQ_NONE) - removed->push_back(make_index(perspective, dp.from[i], pc, ksq)); + removed.push_back(make_index(perspective, dp.from[i], pc, oriented_ksq)); if (dp.to[i] != SQ_NONE) - added->push_back(make_index(perspective, dp.to[i], pc, ksq)); + added.push_back(make_index(perspective, dp.to[i], pc, oriented_ksq)); } } - template class HalfKP; + int HalfKP::update_cost(StateInfo* st) { + return st->dirtyPiece.dirty_num; + } + + int HalfKP::refresh_cost(const Position& pos) { + return pos.count() - 2; + } + + bool HalfKP::requires_refresh(StateInfo* st, Color perspective) { + return st->dirtyPiece.piece[0] == make_piece(perspective, KING); + } } // namespace Stockfish::Eval::NNUE::Features diff --git a/src/nnue/features/half_kp.h b/src/nnue/features/half_kp.h index 2461acb7..a09c221b 100644 --- a/src/nnue/features/half_kp.h +++ b/src/nnue/features/half_kp.h @@ -21,37 +21,88 @@ #ifndef NNUE_FEATURES_HALF_KP_H_INCLUDED #define NNUE_FEATURES_HALF_KP_H_INCLUDED +#include "../nnue_common.h" + #include "../../evaluate.h" -#include "features_common.h" +#include "../../misc.h" + +namespace Stockfish { + struct StateInfo; +} namespace Stockfish::Eval::NNUE::Features { // Feature HalfKP: Combination of the position of own king // and the position of pieces other than kings - template class HalfKP { + // unique number for each piece type on each square + enum { + PS_NONE = 0, + PS_W_PAWN = 1, + PS_B_PAWN = 1 * 
SQUARE_NB + 1, + PS_W_KNIGHT = 2 * SQUARE_NB + 1, + PS_B_KNIGHT = 3 * SQUARE_NB + 1, + PS_W_BISHOP = 4 * SQUARE_NB + 1, + PS_B_BISHOP = 5 * SQUARE_NB + 1, + PS_W_ROOK = 6 * SQUARE_NB + 1, + PS_B_ROOK = 7 * SQUARE_NB + 1, + PS_W_QUEEN = 8 * SQUARE_NB + 1, + PS_B_QUEEN = 9 * SQUARE_NB + 1, + PS_NB = 10 * SQUARE_NB + 1 + }; + + static constexpr IndexType PieceSquareIndex[COLOR_NB][PIECE_NB] = { + // convention: W - us, B - them + // viewed from other side, W and B are reversed + { PS_NONE, PS_W_PAWN, PS_W_KNIGHT, PS_W_BISHOP, PS_W_ROOK, PS_W_QUEEN, PS_NONE, PS_NONE, + PS_NONE, PS_B_PAWN, PS_B_KNIGHT, PS_B_BISHOP, PS_B_ROOK, PS_B_QUEEN, PS_NONE, PS_NONE }, + { PS_NONE, PS_B_PAWN, PS_B_KNIGHT, PS_B_BISHOP, PS_B_ROOK, PS_B_QUEEN, PS_NONE, PS_NONE, + PS_NONE, PS_W_PAWN, PS_W_KNIGHT, PS_W_BISHOP, PS_W_ROOK, PS_W_QUEEN, PS_NONE, PS_NONE } + }; + + // Orient a square according to perspective (rotates by 180 for black) + static Square orient(Color perspective, Square s); + + // Index of a feature for a given king position and another piece on some square + static IndexType make_index(Color perspective, Square s, Piece pc, Square ksq); + public: // Feature name - static constexpr const char* kName = "HalfKP(Friend)"; + static constexpr const char* Name = "HalfKP(Friend)"; + // Hash value embedded in the evaluation file - static constexpr std::uint32_t kHashValue = - 0x5D69D5B9u ^ (AssociatedKing == Side::kFriend); + static constexpr std::uint32_t HashValue = 0x5D69D5B8u; + // Number of feature dimensions - static constexpr IndexType kDimensions = - static_cast(SQUARE_NB) * static_cast(PS_END); - // Maximum number of simultaneously active features - static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count - // Trigger for full calculation instead of difference calculation - static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kFriendKingMoved; + static constexpr IndexType Dimensions = + static_cast(SQUARE_NB) * static_cast(PS_NB); + + // Maximum 
number of simultaneously active features. 30 because kins are not included. + static constexpr IndexType MaxActiveDimensions = 30; // Get a list of indices for active features - static void AppendActiveIndices(const Position& pos, Color perspective, - IndexList* active); + static void append_active_indices( + const Position& pos, + Color perspective, + ValueListInserter active); // Get a list of indices for recently changed features - static void AppendChangedIndices(const Position& pos, const DirtyPiece& dp, Color perspective, - IndexList* removed, IndexList* added); + static void append_changed_indices( + Square ksq, + StateInfo* st, + Color perspective, + ValueListInserter removed, + ValueListInserter added); + + // Returns the cost of updating one perspective, the most costly one. + // Assumes no refresh needed. + static int update_cost(StateInfo* st); + static int refresh_cost(const Position& pos); + + // Returns whether the change stored in this StateInfo means that + // a full accumulator refresh is required. + static bool requires_refresh(StateInfo* st, Color perspective); }; } // namespace Stockfish::Eval::NNUE::Features diff --git a/src/nnue/features/index_list.h b/src/nnue/features/index_list.h deleted file mode 100644 index 9f03993b..00000000 --- a/src/nnue/features/index_list.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - Stockfish, a UCI chess playing engine derived from Glaurung 2.1 - Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file) - - Stockfish is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - Stockfish is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -// Definition of index list of input features - -#ifndef NNUE_FEATURES_INDEX_LIST_H_INCLUDED -#define NNUE_FEATURES_INDEX_LIST_H_INCLUDED - -#include "../../position.h" -#include "../nnue_architecture.h" - -namespace Stockfish::Eval::NNUE::Features { - - // Class template used for feature index list - template - class ValueList { - - public: - std::size_t size() const { return size_; } - void resize(std::size_t size) { size_ = size; } - void push_back(const T& value) { values_[size_++] = value; } - T& operator[](std::size_t index) { return values_[index]; } - T* begin() { return values_; } - T* end() { return values_ + size_; } - const T& operator[](std::size_t index) const { return values_[index]; } - const T* begin() const { return values_; } - const T* end() const { return values_ + size_; } - - void swap(ValueList& other) { - const std::size_t max_size = std::max(size_, other.size_); - for (std::size_t i = 0; i < max_size; ++i) { - std::swap(values_[i], other.values_[i]); - } - std::swap(size_, other.size_); - } - - private: - T values_[MaxSize]; - std::size_t size_ = 0; - }; - - //Type of feature index list - class IndexList - : public ValueList { - }; - -} // namespace Stockfish::Eval::NNUE::Features - -#endif // NNUE_FEATURES_INDEX_LIST_H_INCLUDED diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index 1faa180d..fc192691 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -27,7 +27,7 @@ namespace Stockfish::Eval::NNUE::Layers { // Affine transformation layer - template + template class AffineTransform { public: // Input/output type @@ -36,64 +36,68 @@ namespace Stockfish::Eval::NNUE::Layers { static_assert(std::is_same::value, ""); // Number of input/output dimensions - static constexpr IndexType kInputDimensions = - PreviousLayer::kOutputDimensions; - static constexpr 
IndexType kOutputDimensions = OutputDimensions; - static constexpr IndexType kPaddedInputDimensions = - CeilToMultiple(kInputDimensions, kMaxSimdWidth); + static constexpr IndexType InputDimensions = + PreviousLayer::OutputDimensions; + static constexpr IndexType OutputDimensions = OutDims; + static constexpr IndexType PaddedInputDimensions = + ceil_to_multiple(InputDimensions, MaxSimdWidth); #if defined (USE_AVX512) - static constexpr const IndexType kOutputSimdWidth = kSimdWidth / 2; + static constexpr const IndexType OutputSimdWidth = SimdWidth / 2; #elif defined (USE_SSSE3) - static constexpr const IndexType kOutputSimdWidth = kSimdWidth / 4; + static constexpr const IndexType OutputSimdWidth = SimdWidth / 4; #endif // Size of forward propagation buffer used in this layer - static constexpr std::size_t kSelfBufferSize = - CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize); + static constexpr std::size_t SelfBufferSize = + ceil_to_multiple(OutputDimensions * sizeof(OutputType), CacheLineSize); // Size of the forward propagation buffer used from the input layer to this layer - static constexpr std::size_t kBufferSize = - PreviousLayer::kBufferSize + kSelfBufferSize; + static constexpr std::size_t BufferSize = + PreviousLayer::BufferSize + SelfBufferSize; // Hash value embedded in the evaluation file - static constexpr std::uint32_t GetHashValue() { - std::uint32_t hash_value = 0xCC03DAE4u; - hash_value += kOutputDimensions; - hash_value ^= PreviousLayer::GetHashValue() >> 1; - hash_value ^= PreviousLayer::GetHashValue() << 31; - return hash_value; + static constexpr std::uint32_t get_hash_value() { + std::uint32_t hashValue = 0xCC03DAE4u; + hashValue += OutputDimensions; + hashValue ^= PreviousLayer::get_hash_value() >> 1; + hashValue ^= PreviousLayer::get_hash_value() << 31; + return hashValue; } - // Read network parameters - bool ReadParameters(std::istream& stream) { - if (!previous_layer_.ReadParameters(stream)) return false; - for 
(std::size_t i = 0; i < kOutputDimensions; ++i) - biases_[i] = read_little_endian(stream); - for (std::size_t i = 0; i < kOutputDimensions * kPaddedInputDimensions; ++i) + // Read network parameters + bool read_parameters(std::istream& stream) { + if (!previousLayer.read_parameters(stream)) return false; + for (std::size_t i = 0; i < OutputDimensions; ++i) + biases[i] = read_little_endian(stream); #if !defined (USE_SSSE3) - weights_[i] = read_little_endian(stream); + for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i) + weights[i] = read_little_endian(stream); #else - weights_[ - (i / 4) % (kPaddedInputDimensions / 4) * kOutputDimensions * 4 + - i / kPaddedInputDimensions * 4 + - i % 4 - ] = read_little_endian(stream); + std::unique_ptr indexMap = std::make_unique(OutputDimensions * PaddedInputDimensions); + for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i) { + const uint32_t scrambledIdx = + (i / 4) % (PaddedInputDimensions / 4) * OutputDimensions * 4 + + i / PaddedInputDimensions * 4 + + i % 4; + weights[scrambledIdx] = read_little_endian(stream); + indexMap[scrambledIdx] = i; + } // Determine if eights of weight and input products can be summed using 16bits // without saturation. We assume worst case combinations of 0 and 127 for all inputs. 
- if (kOutputDimensions > 1 && !stream.fail()) + if (OutputDimensions > 1 && !stream.fail()) { canSaturate16.count = 0; #if !defined(USE_VNNI) - for (IndexType i = 0; i < kPaddedInputDimensions; i += 16) - for (IndexType j = 0; j < kOutputDimensions; ++j) + for (IndexType i = 0; i < PaddedInputDimensions; i += 16) + for (IndexType j = 0; j < OutputDimensions; ++j) for (int x = 0; x < 2; ++x) { - WeightType* w = &weights_[i * kOutputDimensions + j * 4 + x * 2]; + WeightType* w = &weights[i * OutputDimensions + j * 4 + x * 2]; int sum[2] = {0, 0}; for (int k = 0; k < 8; ++k) { - IndexType idx = k / 2 * kOutputDimensions * 4 + k % 2; + IndexType idx = k / 2 * OutputDimensions * 4 + k % 2; sum[w[idx] < 0] += w[idx]; } for (int sign : { -1, 1 }) @@ -102,14 +106,15 @@ namespace Stockfish::Eval::NNUE::Layers { int maxK = 0, maxW = 0; for (int k = 0; k < 8; ++k) { - IndexType idx = k / 2 * kOutputDimensions * 4 + k % 2; + IndexType idx = k / 2 * OutputDimensions * 4 + k % 2; if (maxW < sign * w[idx]) maxK = k, maxW = sign * w[idx]; } - IndexType idx = maxK / 2 * kOutputDimensions * 4 + maxK % 2; + IndexType idx = maxK / 2 * OutputDimensions * 4 + maxK % 2; sum[sign == -1] -= w[idx]; - canSaturate16.add(j, i + maxK / 2 * 4 + maxK % 2 + x * 2, w[idx]); + const uint32_t scrambledIdx = idx + i * OutputDimensions + j * 4 + x * 2; + canSaturate16.add(j, i + maxK / 2 * 4 + maxK % 2 + x * 2, w[idx], indexMap[scrambledIdx]); w[idx] = 0; } } @@ -125,15 +130,43 @@ namespace Stockfish::Eval::NNUE::Layers { return !stream.fail(); } + // Write network parameters + bool write_parameters(std::ostream& stream) const { + if (!previousLayer.write_parameters(stream)) return false; + for (std::size_t i = 0; i < OutputDimensions; ++i) + write_little_endian(stream, biases[i]); +#if !defined (USE_SSSE3) + for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i) + write_little_endian(stream, weights[i]); +#else + std::unique_ptr unscrambledWeights = 
std::make_unique(OutputDimensions * PaddedInputDimensions); + for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i) { + unscrambledWeights[i] = + weights[ + (i / 4) % (PaddedInputDimensions / 4) * OutputDimensions * 4 + + i / PaddedInputDimensions * 4 + + i % 4 + ]; + } + for (int i = 0; i < canSaturate16.count; ++i) + unscrambledWeights[canSaturate16.ids[i].wIdx] = canSaturate16.ids[i].w; + + for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i) + write_little_endian(stream, unscrambledWeights[i]); +#endif + + return !stream.fail(); + } + // Forward propagation - const OutputType* Propagate( - const TransformedFeatureType* transformed_features, char* buffer) const { - const auto input = previous_layer_.Propagate( - transformed_features, buffer + kSelfBufferSize); + const OutputType* propagate( + const TransformedFeatureType* transformedFeatures, char* buffer) const { + const auto input = previousLayer.propagate( + transformedFeatures, buffer + SelfBufferSize); #if defined (USE_AVX512) - [[maybe_unused]] const __m512i kOnes512 = _mm512_set1_epi16(1); + [[maybe_unused]] const __m512i Ones512 = _mm512_set1_epi16(1); [[maybe_unused]] auto m512_hadd = [](__m512i sum, int bias) -> int { return _mm512_reduce_add_epi32(sum) + bias; @@ -144,7 +177,7 @@ namespace Stockfish::Eval::NNUE::Layers { acc = _mm512_dpbusd_epi32(acc, a, b); #else __m512i product0 = _mm512_maddubs_epi16(a, b); - product0 = _mm512_madd_epi16(product0, kOnes512); + product0 = _mm512_madd_epi16(product0, Ones512); acc = _mm512_add_epi32(acc, product0); #endif }; @@ -164,7 +197,7 @@ namespace Stockfish::Eval::NNUE::Layers { product0 = _mm512_add_epi16(product0, product1); product2 = _mm512_add_epi16(product2, product3); product0 = _mm512_add_epi16(product0, product2); - product0 = _mm512_madd_epi16(product0, kOnes512); + product0 = _mm512_madd_epi16(product0, Ones512); acc = _mm512_add_epi32(acc, product0); #endif }; @@ -172,7 +205,7 @@ namespace 
Stockfish::Eval::NNUE::Layers { #endif #if defined (USE_AVX2) - [[maybe_unused]] const __m256i kOnes256 = _mm256_set1_epi16(1); + [[maybe_unused]] const __m256i Ones256 = _mm256_set1_epi16(1); [[maybe_unused]] auto m256_hadd = [](__m256i sum, int bias) -> int { __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1)); @@ -186,7 +219,7 @@ namespace Stockfish::Eval::NNUE::Layers { acc = _mm256_dpbusd_epi32(acc, a, b); #else __m256i product0 = _mm256_maddubs_epi16(a, b); - product0 = _mm256_madd_epi16(product0, kOnes256); + product0 = _mm256_madd_epi16(product0, Ones256); acc = _mm256_add_epi32(acc, product0); #endif }; @@ -206,7 +239,7 @@ namespace Stockfish::Eval::NNUE::Layers { product0 = _mm256_add_epi16(product0, product1); product2 = _mm256_add_epi16(product2, product3); product0 = _mm256_add_epi16(product0, product2); - product0 = _mm256_madd_epi16(product0, kOnes256); + product0 = _mm256_madd_epi16(product0, Ones256); acc = _mm256_add_epi32(acc, product0); #endif }; @@ -214,7 +247,7 @@ namespace Stockfish::Eval::NNUE::Layers { #endif #if defined (USE_SSSE3) - [[maybe_unused]] const __m128i kOnes128 = _mm_set1_epi16(1); + [[maybe_unused]] const __m128i Ones128 = _mm_set1_epi16(1); [[maybe_unused]] auto m128_hadd = [](__m128i sum, int bias) -> int { sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC @@ -224,7 +257,7 @@ namespace Stockfish::Eval::NNUE::Layers { [[maybe_unused]] auto m128_add_dpbusd_epi32 = [=](__m128i& acc, __m128i a, __m128i b) { __m128i product0 = _mm_maddubs_epi16(a, b); - product0 = _mm_madd_epi16(product0, kOnes128); + product0 = _mm_madd_epi16(product0, Ones128); acc = _mm_add_epi32(acc, product0); }; @@ -237,7 +270,7 @@ namespace Stockfish::Eval::NNUE::Layers { product0 = _mm_add_epi16(product0, product1); product2 = _mm_add_epi16(product2, product3); product0 = _mm_add_epi16(product0, product2); - product0 = _mm_madd_epi16(product0, kOnes128); + product0 = _mm_madd_epi16(product0, 
Ones128); acc = _mm_add_epi32(acc, product0); }; @@ -269,71 +302,71 @@ namespace Stockfish::Eval::NNUE::Layers { #if defined (USE_SSSE3) const auto output = reinterpret_cast(buffer); - const auto input_vector = reinterpret_cast(input); + const auto inputVector = reinterpret_cast(input); - static_assert(kOutputDimensions % kOutputSimdWidth == 0 || kOutputDimensions == 1); + static_assert(OutputDimensions % OutputSimdWidth == 0 || OutputDimensions == 1); - // kOutputDimensions is either 1 or a multiple of kSimdWidth + // OutputDimensions is either 1 or a multiple of SimdWidth // because then it is also an input dimension. - if constexpr (kOutputDimensions % kOutputSimdWidth == 0) + if constexpr (OutputDimensions % OutputSimdWidth == 0) { - constexpr IndexType kNumChunks = kPaddedInputDimensions / 4; + constexpr IndexType NumChunks = PaddedInputDimensions / 4; const auto input32 = reinterpret_cast(input); vec_t* outptr = reinterpret_cast(output); - std::memcpy(output, biases_, kOutputDimensions * sizeof(OutputType)); + std::memcpy(output, biases, OutputDimensions * sizeof(OutputType)); - for (int i = 0; i < (int)kNumChunks - 3; i += 4) + for (int i = 0; i < (int)NumChunks - 3; i += 4) { const vec_t in0 = vec_set_32(input32[i + 0]); const vec_t in1 = vec_set_32(input32[i + 1]); const vec_t in2 = vec_set_32(input32[i + 2]); const vec_t in3 = vec_set_32(input32[i + 3]); - const auto col0 = reinterpret_cast(&weights_[(i + 0) * kOutputDimensions * 4]); - const auto col1 = reinterpret_cast(&weights_[(i + 1) * kOutputDimensions * 4]); - const auto col2 = reinterpret_cast(&weights_[(i + 2) * kOutputDimensions * 4]); - const auto col3 = reinterpret_cast(&weights_[(i + 3) * kOutputDimensions * 4]); - for (int j = 0; j * kOutputSimdWidth < kOutputDimensions; ++j) + const auto col0 = reinterpret_cast(&weights[(i + 0) * OutputDimensions * 4]); + const auto col1 = reinterpret_cast(&weights[(i + 1) * OutputDimensions * 4]); + const auto col2 = reinterpret_cast(&weights[(i + 2) * 
OutputDimensions * 4]); + const auto col3 = reinterpret_cast(&weights[(i + 3) * OutputDimensions * 4]); + for (int j = 0; j * OutputSimdWidth < OutputDimensions; ++j) vec_add_dpbusd_32x4(outptr[j], in0, col0[j], in1, col1[j], in2, col2[j], in3, col3[j]); } for (int i = 0; i < canSaturate16.count; ++i) output[canSaturate16.ids[i].out] += input[canSaturate16.ids[i].in] * canSaturate16.ids[i].w; } - else if constexpr (kOutputDimensions == 1) + else if constexpr (OutputDimensions == 1) { #if defined (USE_AVX512) - if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) != 0) + if constexpr (PaddedInputDimensions % (SimdWidth * 2) != 0) { - constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; - const auto input_vector256 = reinterpret_cast(input); + constexpr IndexType NumChunks = PaddedInputDimensions / SimdWidth; + const auto inputVector256 = reinterpret_cast(input); __m256i sum0 = _mm256_setzero_si256(); - const auto row0 = reinterpret_cast(&weights_[0]); + const auto row0 = reinterpret_cast(&weights[0]); - for (int j = 0; j < (int)kNumChunks; ++j) + for (int j = 0; j < (int)NumChunks; ++j) { - const __m256i in = input_vector256[j]; + const __m256i in = inputVector256[j]; m256_add_dpbusd_epi32(sum0, in, row0[j]); } - output[0] = m256_hadd(sum0, biases_[0]); + output[0] = m256_hadd(sum0, biases[0]); } else #endif { #if defined (USE_AVX512) - constexpr IndexType kNumChunks = kPaddedInputDimensions / (kSimdWidth * 2); + constexpr IndexType NumChunks = PaddedInputDimensions / (SimdWidth * 2); #else - constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; + constexpr IndexType NumChunks = PaddedInputDimensions / SimdWidth; #endif vec_t sum0 = vec_setzero(); - const auto row0 = reinterpret_cast(&weights_[0]); + const auto row0 = reinterpret_cast(&weights[0]); - for (int j = 0; j < (int)kNumChunks; ++j) + for (int j = 0; j < (int)NumChunks; ++j) { - const vec_t in = input_vector[j]; + const vec_t in = inputVector[j]; vec_add_dpbusd_32(sum0, 
in, row0[j]); } - output[0] = vec_hadd(sum0, biases_[0]); + output[0] = vec_hadd(sum0, biases[0]); } } @@ -344,80 +377,80 @@ namespace Stockfish::Eval::NNUE::Layers { auto output = reinterpret_cast(buffer); #if defined(USE_SSE2) - constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; - const __m128i kZeros = _mm_setzero_si128(); - const auto input_vector = reinterpret_cast(input); + constexpr IndexType NumChunks = PaddedInputDimensions / SimdWidth; + const __m128i Zeros = _mm_setzero_si128(); + const auto inputVector = reinterpret_cast(input); #elif defined(USE_MMX) - constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; - const __m64 kZeros = _mm_setzero_si64(); - const auto input_vector = reinterpret_cast(input); + constexpr IndexType NumChunks = PaddedInputDimensions / SimdWidth; + const __m64 Zeros = _mm_setzero_si64(); + const auto inputVector = reinterpret_cast(input); #elif defined(USE_NEON) - constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; - const auto input_vector = reinterpret_cast(input); + constexpr IndexType NumChunks = PaddedInputDimensions / SimdWidth; + const auto inputVector = reinterpret_cast(input); #endif - for (IndexType i = 0; i < kOutputDimensions; ++i) { - const IndexType offset = i * kPaddedInputDimensions; + for (IndexType i = 0; i < OutputDimensions; ++i) { + const IndexType offset = i * PaddedInputDimensions; #if defined(USE_SSE2) - __m128i sum_lo = _mm_cvtsi32_si128(biases_[i]); - __m128i sum_hi = kZeros; - const auto row = reinterpret_cast(&weights_[offset]); - for (IndexType j = 0; j < kNumChunks; ++j) { + __m128i sumLo = _mm_cvtsi32_si128(biases[i]); + __m128i sumHi = Zeros; + const auto row = reinterpret_cast(&weights[offset]); + for (IndexType j = 0; j < NumChunks; ++j) { __m128i row_j = _mm_load_si128(&row[j]); - __m128i input_j = _mm_load_si128(&input_vector[j]); - __m128i extended_row_lo = _mm_srai_epi16(_mm_unpacklo_epi8(row_j, row_j), 8); - __m128i extended_row_hi = 
_mm_srai_epi16(_mm_unpackhi_epi8(row_j, row_j), 8); - __m128i extended_input_lo = _mm_unpacklo_epi8(input_j, kZeros); - __m128i extended_input_hi = _mm_unpackhi_epi8(input_j, kZeros); - __m128i product_lo = _mm_madd_epi16(extended_row_lo, extended_input_lo); - __m128i product_hi = _mm_madd_epi16(extended_row_hi, extended_input_hi); - sum_lo = _mm_add_epi32(sum_lo, product_lo); - sum_hi = _mm_add_epi32(sum_hi, product_hi); + __m128i input_j = _mm_load_si128(&inputVector[j]); + __m128i extendedRowLo = _mm_srai_epi16(_mm_unpacklo_epi8(row_j, row_j), 8); + __m128i extendedRowHi = _mm_srai_epi16(_mm_unpackhi_epi8(row_j, row_j), 8); + __m128i extendedInputLo = _mm_unpacklo_epi8(input_j, Zeros); + __m128i extendedInputHi = _mm_unpackhi_epi8(input_j, Zeros); + __m128i productLo = _mm_madd_epi16(extendedRowLo, extendedInputLo); + __m128i productHi = _mm_madd_epi16(extendedRowHi, extendedInputHi); + sumLo = _mm_add_epi32(sumLo, productLo); + sumHi = _mm_add_epi32(sumHi, productHi); } - __m128i sum = _mm_add_epi32(sum_lo, sum_hi); - __m128i sum_high_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2)); - sum = _mm_add_epi32(sum, sum_high_64); + __m128i sum = _mm_add_epi32(sumLo, sumHi); + __m128i sumHigh_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2)); + sum = _mm_add_epi32(sum, sumHigh_64); __m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2)); sum = _mm_add_epi32(sum, sum_second_32); output[i] = _mm_cvtsi128_si32(sum); #elif defined(USE_MMX) - __m64 sum_lo = _mm_cvtsi32_si64(biases_[i]); - __m64 sum_hi = kZeros; - const auto row = reinterpret_cast(&weights_[offset]); - for (IndexType j = 0; j < kNumChunks; ++j) { + __m64 sumLo = _mm_cvtsi32_si64(biases[i]); + __m64 sumHi = Zeros; + const auto row = reinterpret_cast(&weights[offset]); + for (IndexType j = 0; j < NumChunks; ++j) { __m64 row_j = row[j]; - __m64 input_j = input_vector[j]; - __m64 extended_row_lo = _mm_srai_pi16(_mm_unpacklo_pi8(row_j, row_j), 8); - __m64 extended_row_hi = 
_mm_srai_pi16(_mm_unpackhi_pi8(row_j, row_j), 8); - __m64 extended_input_lo = _mm_unpacklo_pi8(input_j, kZeros); - __m64 extended_input_hi = _mm_unpackhi_pi8(input_j, kZeros); - __m64 product_lo = _mm_madd_pi16(extended_row_lo, extended_input_lo); - __m64 product_hi = _mm_madd_pi16(extended_row_hi, extended_input_hi); - sum_lo = _mm_add_pi32(sum_lo, product_lo); - sum_hi = _mm_add_pi32(sum_hi, product_hi); + __m64 input_j = inputVector[j]; + __m64 extendedRowLo = _mm_srai_pi16(_mm_unpacklo_pi8(row_j, row_j), 8); + __m64 extendedRowHi = _mm_srai_pi16(_mm_unpackhi_pi8(row_j, row_j), 8); + __m64 extendedInputLo = _mm_unpacklo_pi8(input_j, Zeros); + __m64 extendedInputHi = _mm_unpackhi_pi8(input_j, Zeros); + __m64 productLo = _mm_madd_pi16(extendedRowLo, extendedInputLo); + __m64 productHi = _mm_madd_pi16(extendedRowHi, extendedInputHi); + sumLo = _mm_add_pi32(sumLo, productLo); + sumHi = _mm_add_pi32(sumHi, productHi); } - __m64 sum = _mm_add_pi32(sum_lo, sum_hi); + __m64 sum = _mm_add_pi32(sumLo, sumHi); sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum)); output[i] = _mm_cvtsi64_si32(sum); #elif defined(USE_NEON) - int32x4_t sum = {biases_[i]}; - const auto row = reinterpret_cast(&weights_[offset]); - for (IndexType j = 0; j < kNumChunks; ++j) { - int16x8_t product = vmull_s8(input_vector[j * 2], row[j * 2]); - product = vmlal_s8(product, input_vector[j * 2 + 1], row[j * 2 + 1]); + int32x4_t sum = {biases[i]}; + const auto row = reinterpret_cast(&weights[offset]); + for (IndexType j = 0; j < NumChunks; ++j) { + int16x8_t product = vmull_s8(inputVector[j * 2], row[j * 2]); + product = vmlal_s8(product, inputVector[j * 2 + 1], row[j * 2 + 1]); sum = vpadalq_s16(sum, product); } output[i] = sum[0] + sum[1] + sum[2] + sum[3]; #else - OutputType sum = biases_[i]; - for (IndexType j = 0; j < kInputDimensions; ++j) { - sum += weights_[offset + j] * input[j]; + OutputType sum = biases[i]; + for (IndexType j = 0; j < InputDimensions; ++j) { + sum += weights[offset + j] * 
input[j]; } output[i] = sum; #endif @@ -436,20 +469,22 @@ namespace Stockfish::Eval::NNUE::Layers { using BiasType = OutputType; using WeightType = std::int8_t; - PreviousLayer previous_layer_; + PreviousLayer previousLayer; - alignas(kCacheLineSize) BiasType biases_[kOutputDimensions]; - alignas(kCacheLineSize) WeightType weights_[kOutputDimensions * kPaddedInputDimensions]; + alignas(CacheLineSize) BiasType biases[OutputDimensions]; + alignas(CacheLineSize) WeightType weights[OutputDimensions * PaddedInputDimensions]; #if defined (USE_SSSE3) struct CanSaturate { int count; struct Entry { + uint32_t wIdx; uint16_t out; uint16_t in; int8_t w; - } ids[kPaddedInputDimensions * kOutputDimensions * 3 / 4]; + } ids[PaddedInputDimensions * OutputDimensions * 3 / 4]; - void add(int i, int j, int8_t w) { + void add(int i, int j, int8_t w, uint32_t wIdx) { + ids[count].wIdx = wIdx; ids[count].out = i; ids[count].in = j; ids[count].w = w; diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h index a10e3e48..f1ac2dfe 100644 --- a/src/nnue/layers/clipped_relu.h +++ b/src/nnue/layers/clipped_relu.h @@ -35,130 +35,135 @@ namespace Stockfish::Eval::NNUE::Layers { static_assert(std::is_same::value, ""); // Number of input/output dimensions - static constexpr IndexType kInputDimensions = - PreviousLayer::kOutputDimensions; - static constexpr IndexType kOutputDimensions = kInputDimensions; + static constexpr IndexType InputDimensions = + PreviousLayer::OutputDimensions; + static constexpr IndexType OutputDimensions = InputDimensions; // Size of forward propagation buffer used in this layer - static constexpr std::size_t kSelfBufferSize = - CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize); + static constexpr std::size_t SelfBufferSize = + ceil_to_multiple(OutputDimensions * sizeof(OutputType), CacheLineSize); // Size of the forward propagation buffer used from the input layer to this layer - static constexpr std::size_t kBufferSize = - 
PreviousLayer::kBufferSize + kSelfBufferSize; + static constexpr std::size_t BufferSize = + PreviousLayer::BufferSize + SelfBufferSize; // Hash value embedded in the evaluation file - static constexpr std::uint32_t GetHashValue() { - std::uint32_t hash_value = 0x538D24C7u; - hash_value += PreviousLayer::GetHashValue(); - return hash_value; + static constexpr std::uint32_t get_hash_value() { + std::uint32_t hashValue = 0x538D24C7u; + hashValue += PreviousLayer::get_hash_value(); + return hashValue; } // Read network parameters - bool ReadParameters(std::istream& stream) { - return previous_layer_.ReadParameters(stream); + bool read_parameters(std::istream& stream) { + return previousLayer.read_parameters(stream); + } + + // Write network parameters + bool write_parameters(std::ostream& stream) const { + return previousLayer.write_parameters(stream); } // Forward propagation - const OutputType* Propagate( - const TransformedFeatureType* transformed_features, char* buffer) const { - const auto input = previous_layer_.Propagate( - transformed_features, buffer + kSelfBufferSize); + const OutputType* propagate( + const TransformedFeatureType* transformedFeatures, char* buffer) const { + const auto input = previousLayer.propagate( + transformedFeatures, buffer + SelfBufferSize); const auto output = reinterpret_cast(buffer); #if defined(USE_AVX2) - constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth; - const __m256i kZero = _mm256_setzero_si256(); - const __m256i kOffsets = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + constexpr IndexType NumChunks = InputDimensions / SimdWidth; + const __m256i Zero = _mm256_setzero_si256(); + const __m256i Offsets = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); const auto in = reinterpret_cast(input); const auto out = reinterpret_cast<__m256i*>(output); - for (IndexType i = 0; i < kNumChunks; ++i) { + for (IndexType i = 0; i < NumChunks; ++i) { const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32( _mm256_load_si256(&in[i * 4 + 
0]), - _mm256_load_si256(&in[i * 4 + 1])), kWeightScaleBits); + _mm256_load_si256(&in[i * 4 + 1])), WeightScaleBits); const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32( _mm256_load_si256(&in[i * 4 + 2]), - _mm256_load_si256(&in[i * 4 + 3])), kWeightScaleBits); + _mm256_load_si256(&in[i * 4 + 3])), WeightScaleBits); _mm256_store_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8( - _mm256_packs_epi16(words0, words1), kZero), kOffsets)); + _mm256_packs_epi16(words0, words1), Zero), Offsets)); } - constexpr IndexType kStart = kNumChunks * kSimdWidth; + constexpr IndexType Start = NumChunks * SimdWidth; #elif defined(USE_SSE2) - constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth; + constexpr IndexType NumChunks = InputDimensions / SimdWidth; #ifdef USE_SSE41 - const __m128i kZero = _mm_setzero_si128(); + const __m128i Zero = _mm_setzero_si128(); #else const __m128i k0x80s = _mm_set1_epi8(-128); #endif const auto in = reinterpret_cast(input); const auto out = reinterpret_cast<__m128i*>(output); - for (IndexType i = 0; i < kNumChunks; ++i) { + for (IndexType i = 0; i < NumChunks; ++i) { const __m128i words0 = _mm_srai_epi16(_mm_packs_epi32( _mm_load_si128(&in[i * 4 + 0]), - _mm_load_si128(&in[i * 4 + 1])), kWeightScaleBits); + _mm_load_si128(&in[i * 4 + 1])), WeightScaleBits); const __m128i words1 = _mm_srai_epi16(_mm_packs_epi32( _mm_load_si128(&in[i * 4 + 2]), - _mm_load_si128(&in[i * 4 + 3])), kWeightScaleBits); + _mm_load_si128(&in[i * 4 + 3])), WeightScaleBits); const __m128i packedbytes = _mm_packs_epi16(words0, words1); _mm_store_si128(&out[i], #ifdef USE_SSE41 - _mm_max_epi8(packedbytes, kZero) + _mm_max_epi8(packedbytes, Zero) #else _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s) #endif ); } - constexpr IndexType kStart = kNumChunks * kSimdWidth; + constexpr IndexType Start = NumChunks * SimdWidth; #elif defined(USE_MMX) - constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth; + constexpr IndexType NumChunks = 
InputDimensions / SimdWidth; const __m64 k0x80s = _mm_set1_pi8(-128); const auto in = reinterpret_cast(input); const auto out = reinterpret_cast<__m64*>(output); - for (IndexType i = 0; i < kNumChunks; ++i) { + for (IndexType i = 0; i < NumChunks; ++i) { const __m64 words0 = _mm_srai_pi16( _mm_packs_pi32(in[i * 4 + 0], in[i * 4 + 1]), - kWeightScaleBits); + WeightScaleBits); const __m64 words1 = _mm_srai_pi16( _mm_packs_pi32(in[i * 4 + 2], in[i * 4 + 3]), - kWeightScaleBits); + WeightScaleBits); const __m64 packedbytes = _mm_packs_pi16(words0, words1); out[i] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s); } _mm_empty(); - constexpr IndexType kStart = kNumChunks * kSimdWidth; + constexpr IndexType Start = NumChunks * SimdWidth; #elif defined(USE_NEON) - constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2); - const int8x8_t kZero = {0}; + constexpr IndexType NumChunks = InputDimensions / (SimdWidth / 2); + const int8x8_t Zero = {0}; const auto in = reinterpret_cast(input); const auto out = reinterpret_cast(output); - for (IndexType i = 0; i < kNumChunks; ++i) { + for (IndexType i = 0; i < NumChunks; ++i) { int16x8_t shifted; const auto pack = reinterpret_cast(&shifted); - pack[0] = vqshrn_n_s32(in[i * 2 + 0], kWeightScaleBits); - pack[1] = vqshrn_n_s32(in[i * 2 + 1], kWeightScaleBits); - out[i] = vmax_s8(vqmovn_s16(shifted), kZero); + pack[0] = vqshrn_n_s32(in[i * 2 + 0], WeightScaleBits); + pack[1] = vqshrn_n_s32(in[i * 2 + 1], WeightScaleBits); + out[i] = vmax_s8(vqmovn_s16(shifted), Zero); } - constexpr IndexType kStart = kNumChunks * (kSimdWidth / 2); + constexpr IndexType Start = NumChunks * (SimdWidth / 2); #else - constexpr IndexType kStart = 0; + constexpr IndexType Start = 0; #endif - for (IndexType i = kStart; i < kInputDimensions; ++i) { + for (IndexType i = Start; i < InputDimensions; ++i) { output[i] = static_cast( - std::max(0, std::min(127, input[i] >> kWeightScaleBits))); + std::max(0, std::min(127, input[i] >> 
WeightScaleBits))); } return output; } private: - PreviousLayer previous_layer_; + PreviousLayer previousLayer; }; } // namespace Stockfish::Eval::NNUE::Layers diff --git a/src/nnue/layers/input_slice.h b/src/nnue/layers/input_slice.h index 9a7ce92e..56d25af8 100644 --- a/src/nnue/layers/input_slice.h +++ b/src/nnue/layers/input_slice.h @@ -26,37 +26,42 @@ namespace Stockfish::Eval::NNUE::Layers { // Input layer -template +template class InputSlice { public: // Need to maintain alignment - static_assert(Offset % kMaxSimdWidth == 0, ""); + static_assert(Offset % MaxSimdWidth == 0, ""); // Output type using OutputType = TransformedFeatureType; // Output dimensionality - static constexpr IndexType kOutputDimensions = OutputDimensions; + static constexpr IndexType OutputDimensions = OutDims; // Size of forward propagation buffer used from the input layer to this layer - static constexpr std::size_t kBufferSize = 0; + static constexpr std::size_t BufferSize = 0; // Hash value embedded in the evaluation file - static constexpr std::uint32_t GetHashValue() { - std::uint32_t hash_value = 0xEC42E90Du; - hash_value ^= kOutputDimensions ^ (Offset << 10); - return hash_value; + static constexpr std::uint32_t get_hash_value() { + std::uint32_t hashValue = 0xEC42E90Du; + hashValue ^= OutputDimensions ^ (Offset << 10); + return hashValue; } // Read network parameters - bool ReadParameters(std::istream& /*stream*/) { + bool read_parameters(std::istream& /*stream*/) { + return true; + } + + // Read network parameters + bool write_parameters(std::ostream& /*stream*/) const { return true; } // Forward propagation - const OutputType* Propagate( - const TransformedFeatureType* transformed_features, + const OutputType* propagate( + const TransformedFeatureType* transformedFeatures, char* /*buffer*/) const { - return transformed_features + Offset; + return transformedFeatures + Offset; } private: diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h index 
55fafa13..72a151f8 100644 --- a/src/nnue/nnue_accumulator.h +++ b/src/nnue/nnue_accumulator.h @@ -29,9 +29,9 @@ namespace Stockfish::Eval::NNUE { enum AccumulatorState { EMPTY, COMPUTED, INIT }; // Class that holds the result of affine transformation of input features - struct alignas(kCacheLineSize) Accumulator { + struct alignas(CacheLineSize) Accumulator { std::int16_t - accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions]; + accumulation[2][TransformedFeatureDimensions]; AccumulatorState state[2]; }; diff --git a/src/nnue/nnue_architecture.h b/src/nnue/nnue_architecture.h index 1680368e..55a01fbe 100644 --- a/src/nnue/nnue_architecture.h +++ b/src/nnue/nnue_architecture.h @@ -21,17 +21,37 @@ #ifndef NNUE_ARCHITECTURE_H_INCLUDED #define NNUE_ARCHITECTURE_H_INCLUDED -// Defines the network structure -#include "architectures/halfkp_256x2-32-32.h" +#include "nnue_common.h" + +#include "features/half_kp.h" + +#include "layers/input_slice.h" +#include "layers/affine_transform.h" +#include "layers/clipped_relu.h" namespace Stockfish::Eval::NNUE { - static_assert(kTransformedFeatureDimensions % kMaxSimdWidth == 0, ""); - static_assert(Network::kOutputDimensions == 1, ""); - static_assert(std::is_same::value, ""); + // Input features used in evaluation function + using FeatureSet = Features::HalfKP; - // Trigger for full calculation instead of difference calculation - constexpr auto kRefreshTriggers = RawFeatures::kRefreshTriggers; + // Number of input feature dimensions after conversion + constexpr IndexType TransformedFeatureDimensions = 256; + + namespace Layers { + + // Define network structure + using InputLayer = InputSlice; + using HiddenLayer1 = ClippedReLU>; + using HiddenLayer2 = ClippedReLU>; + using OutputLayer = AffineTransform; + + } // namespace Layers + + using Network = Layers::OutputLayer; + + static_assert(TransformedFeatureDimensions % MaxSimdWidth == 0, ""); + static_assert(Network::OutputDimensions == 1, ""); + 
static_assert(std::is_same::value, ""); } // namespace Stockfish::Eval::NNUE diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h index bfd0738e..9fc7b0e9 100644 --- a/src/nnue/nnue_common.h +++ b/src/nnue/nnue_common.h @@ -48,58 +48,30 @@ namespace Stockfish::Eval::NNUE { // Version of the evaluation file - constexpr std::uint32_t kVersion = 0x7AF32F16u; + constexpr std::uint32_t Version = 0x7AF32F16u; // Constant used in evaluation value calculation - constexpr int FV_SCALE = 16; - constexpr int kWeightScaleBits = 6; + constexpr int OutputScale = 16; + constexpr int WeightScaleBits = 6; // Size of cache line (in bytes) - constexpr std::size_t kCacheLineSize = 64; + constexpr std::size_t CacheLineSize = 64; // SIMD width (in bytes) #if defined(USE_AVX2) - constexpr std::size_t kSimdWidth = 32; + constexpr std::size_t SimdWidth = 32; #elif defined(USE_SSE2) - constexpr std::size_t kSimdWidth = 16; + constexpr std::size_t SimdWidth = 16; #elif defined(USE_MMX) - constexpr std::size_t kSimdWidth = 8; + constexpr std::size_t SimdWidth = 8; #elif defined(USE_NEON) - constexpr std::size_t kSimdWidth = 16; + constexpr std::size_t SimdWidth = 16; #endif - constexpr std::size_t kMaxSimdWidth = 32; - - // unique number for each piece type on each square - enum { - PS_NONE = 0, - PS_W_PAWN = 1, - PS_B_PAWN = 1 * SQUARE_NB + 1, - PS_W_KNIGHT = 2 * SQUARE_NB + 1, - PS_B_KNIGHT = 3 * SQUARE_NB + 1, - PS_W_BISHOP = 4 * SQUARE_NB + 1, - PS_B_BISHOP = 5 * SQUARE_NB + 1, - PS_W_ROOK = 6 * SQUARE_NB + 1, - PS_B_ROOK = 7 * SQUARE_NB + 1, - PS_W_QUEEN = 8 * SQUARE_NB + 1, - PS_B_QUEEN = 9 * SQUARE_NB + 1, - PS_W_KING = 10 * SQUARE_NB + 1, - PS_END = PS_W_KING, // pieces without kings (pawns included) - PS_B_KING = 11 * SQUARE_NB + 1, - PS_END2 = 12 * SQUARE_NB + 1 - }; - - constexpr uint32_t kpp_board_index[COLOR_NB][PIECE_NB] = { - // convention: W - us, B - them - // viewed from other side, W and B are reversed - { PS_NONE, PS_W_PAWN, PS_W_KNIGHT, PS_W_BISHOP, PS_W_ROOK, 
PS_W_QUEEN, PS_W_KING, PS_NONE, - PS_NONE, PS_B_PAWN, PS_B_KNIGHT, PS_B_BISHOP, PS_B_ROOK, PS_B_QUEEN, PS_B_KING, PS_NONE }, - { PS_NONE, PS_B_PAWN, PS_B_KNIGHT, PS_B_BISHOP, PS_B_ROOK, PS_B_QUEEN, PS_B_KING, PS_NONE, - PS_NONE, PS_W_PAWN, PS_W_KNIGHT, PS_W_BISHOP, PS_W_ROOK, PS_W_QUEEN, PS_W_KING, PS_NONE } - }; + constexpr std::size_t MaxSimdWidth = 32; // Type of input feature after conversion using TransformedFeatureType = std::uint8_t; @@ -107,7 +79,7 @@ namespace Stockfish::Eval::NNUE { // Round n up to be a multiple of base template - constexpr IntType CeilToMultiple(IntType n, IntType base) { + constexpr IntType ceil_to_multiple(IntType n, IntType base) { return (n + base - 1) / base * base; } @@ -129,6 +101,24 @@ namespace Stockfish::Eval::NNUE { return result; } + template + inline void write_little_endian(std::ostream& stream, IntType value) { + + std::uint8_t u[sizeof(IntType)]; + typename std::make_unsigned::type v = value; + + std::size_t i = 0; + // if constexpr to silence the warning about shift by 8 + if constexpr (sizeof(IntType) > 1) { + for (; i + 1 < sizeof(IntType); ++i) { + u[i] = v; + v >>= 8; + } + } + u[i] = v; + + stream.write(reinterpret_cast(u), sizeof(IntType)); + } } // namespace Stockfish::Eval::NNUE #endif // #ifndef NNUE_COMMON_H_INCLUDED diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 1e0b0e6d..a4a8e98f 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -23,7 +23,8 @@ #include "nnue_common.h" #include "nnue_architecture.h" -#include "features/index_list.h" + +#include "../misc.h" #include // std::memset() @@ -40,7 +41,7 @@ namespace Stockfish::Eval::NNUE { #define vec_store(a,b) _mm512_store_si512(a,b) #define vec_add_16(a,b) _mm512_add_epi16(a,b) #define vec_sub_16(a,b) _mm512_sub_epi16(a,b) - static constexpr IndexType kNumRegs = 8; // only 8 are needed + static constexpr IndexType NumRegs = 8; // only 8 are needed #elif USE_AVX2 typedef 
__m256i vec_t; @@ -48,7 +49,7 @@ namespace Stockfish::Eval::NNUE { #define vec_store(a,b) _mm256_store_si256(a,b) #define vec_add_16(a,b) _mm256_add_epi16(a,b) #define vec_sub_16(a,b) _mm256_sub_epi16(a,b) - static constexpr IndexType kNumRegs = 16; + static constexpr IndexType NumRegs = 16; #elif USE_SSE2 typedef __m128i vec_t; @@ -56,7 +57,7 @@ namespace Stockfish::Eval::NNUE { #define vec_store(a,b) *(a)=(b) #define vec_add_16(a,b) _mm_add_epi16(a,b) #define vec_sub_16(a,b) _mm_sub_epi16(a,b) - static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8; + static constexpr IndexType NumRegs = Is64Bit ? 16 : 8; #elif USE_MMX typedef __m64 vec_t; @@ -64,7 +65,7 @@ namespace Stockfish::Eval::NNUE { #define vec_store(a,b) *(a)=(b) #define vec_add_16(a,b) _mm_add_pi16(a,b) #define vec_sub_16(a,b) _mm_sub_pi16(a,b) - static constexpr IndexType kNumRegs = 8; + static constexpr IndexType NumRegs = 8; #elif USE_NEON typedef int16x8_t vec_t; @@ -72,7 +73,7 @@ namespace Stockfish::Eval::NNUE { #define vec_store(a,b) *(a)=(b) #define vec_add_16(a,b) vaddq_s16(a,b) #define vec_sub_16(a,b) vsubq_s16(a,b) - static constexpr IndexType kNumRegs = 16; + static constexpr IndexType NumRegs = 16; #else #undef VECTOR @@ -84,11 +85,11 @@ namespace Stockfish::Eval::NNUE { private: // Number of output dimensions for one side - static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions; + static constexpr IndexType HalfDimensions = TransformedFeatureDimensions; #ifdef VECTOR - static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2; - static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions"); + static constexpr IndexType TileHeight = NumRegs * sizeof(vec_t) / 2; + static_assert(HalfDimensions % TileHeight == 0, "TileHeight must divide HalfDimensions"); #endif public: @@ -96,105 +97,111 @@ namespace Stockfish::Eval::NNUE { using OutputType = TransformedFeatureType; // Number of input/output dimensions - static constexpr IndexType 
kInputDimensions = RawFeatures::kDimensions; - static constexpr IndexType kOutputDimensions = kHalfDimensions * 2; + static constexpr IndexType InputDimensions = FeatureSet::Dimensions; + static constexpr IndexType OutputDimensions = HalfDimensions * 2; // Size of forward propagation buffer - static constexpr std::size_t kBufferSize = - kOutputDimensions * sizeof(OutputType); + static constexpr std::size_t BufferSize = + OutputDimensions * sizeof(OutputType); // Hash value embedded in the evaluation file - static constexpr std::uint32_t GetHashValue() { - - return RawFeatures::kHashValue ^ kOutputDimensions; + static constexpr std::uint32_t get_hash_value() { + return FeatureSet::HashValue ^ OutputDimensions; } // Read network parameters - bool ReadParameters(std::istream& stream) { + bool read_parameters(std::istream& stream) { + for (std::size_t i = 0; i < HalfDimensions; ++i) + biases[i] = read_little_endian(stream); + for (std::size_t i = 0; i < HalfDimensions * InputDimensions; ++i) + weights[i] = read_little_endian(stream); + return !stream.fail(); + } - for (std::size_t i = 0; i < kHalfDimensions; ++i) - biases_[i] = read_little_endian(stream); - for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i) - weights_[i] = read_little_endian(stream); + // Write network parameters + bool write_parameters(std::ostream& stream) const { + for (std::size_t i = 0; i < HalfDimensions; ++i) + write_little_endian(stream, biases[i]); + for (std::size_t i = 0; i < HalfDimensions * InputDimensions; ++i) + write_little_endian(stream, weights[i]); return !stream.fail(); } // Convert input features - void Transform(const Position& pos, OutputType* output) const { - - UpdateAccumulator(pos, WHITE); - UpdateAccumulator(pos, BLACK); + void transform(const Position& pos, OutputType* output) const { + update_accumulator(pos, WHITE); + update_accumulator(pos, BLACK); const auto& accumulation = pos.state()->accumulator.accumulation; #if defined(USE_AVX512) - constexpr 
IndexType kNumChunks = kHalfDimensions / (kSimdWidth * 2); - static_assert(kHalfDimensions % (kSimdWidth * 2) == 0); - const __m512i kControl = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); - const __m512i kZero = _mm512_setzero_si512(); + constexpr IndexType NumChunks = HalfDimensions / (SimdWidth * 2); + static_assert(HalfDimensions % (SimdWidth * 2) == 0); + const __m512i Control = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); + const __m512i Zero = _mm512_setzero_si512(); #elif defined(USE_AVX2) - constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth; - constexpr int kControl = 0b11011000; - const __m256i kZero = _mm256_setzero_si256(); + constexpr IndexType NumChunks = HalfDimensions / SimdWidth; + constexpr int Control = 0b11011000; + const __m256i Zero = _mm256_setzero_si256(); #elif defined(USE_SSE2) - constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth; + constexpr IndexType NumChunks = HalfDimensions / SimdWidth; #ifdef USE_SSE41 - const __m128i kZero = _mm_setzero_si128(); + const __m128i Zero = _mm_setzero_si128(); #else const __m128i k0x80s = _mm_set1_epi8(-128); #endif #elif defined(USE_MMX) - constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth; + constexpr IndexType NumChunks = HalfDimensions / SimdWidth; const __m64 k0x80s = _mm_set1_pi8(-128); #elif defined(USE_NEON) - constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); - const int8x8_t kZero = {0}; + constexpr IndexType NumChunks = HalfDimensions / (SimdWidth / 2); + const int8x8_t Zero = {0}; #endif const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()}; for (IndexType p = 0; p < 2; ++p) { - const IndexType offset = kHalfDimensions * p; + const IndexType offset = HalfDimensions * p; #if defined(USE_AVX512) auto out = reinterpret_cast<__m512i*>(&output[offset]); - for (IndexType j = 0; j < kNumChunks; ++j) { + for (IndexType j = 0; j < NumChunks; ++j) { __m512i sum0 = _mm512_load_si512( - &reinterpret_cast(accumulation[perspectives[p]][0])[j 
* 2 + 0]); + &reinterpret_cast(accumulation[perspectives[p]])[j * 2 + 0]); __m512i sum1 = _mm512_load_si512( - &reinterpret_cast(accumulation[perspectives[p]][0])[j * 2 + 1]); - _mm512_store_si512(&out[j], _mm512_permutexvar_epi64(kControl, - _mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), kZero))); + &reinterpret_cast(accumulation[perspectives[p]])[j * 2 + 1]); + _mm512_store_si512(&out[j], _mm512_permutexvar_epi64(Control, + _mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), Zero))); } #elif defined(USE_AVX2) auto out = reinterpret_cast<__m256i*>(&output[offset]); - for (IndexType j = 0; j < kNumChunks; ++j) { + for (IndexType j = 0; j < NumChunks; ++j) { __m256i sum0 = _mm256_load_si256( - &reinterpret_cast(accumulation[perspectives[p]][0])[j * 2 + 0]); + &reinterpret_cast(accumulation[perspectives[p]])[j * 2 + 0]); __m256i sum1 = _mm256_load_si256( - &reinterpret_cast(accumulation[perspectives[p]][0])[j * 2 + 1]); + &reinterpret_cast(accumulation[perspectives[p]])[j * 2 + 1]); _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8( - _mm256_packs_epi16(sum0, sum1), kZero), kControl)); + _mm256_packs_epi16(sum0, sum1), Zero), Control)); } #elif defined(USE_SSE2) auto out = reinterpret_cast<__m128i*>(&output[offset]); - for (IndexType j = 0; j < kNumChunks; ++j) { + for (IndexType j = 0; j < NumChunks; ++j) { __m128i sum0 = _mm_load_si128(&reinterpret_cast( - accumulation[perspectives[p]][0])[j * 2 + 0]); + accumulation[perspectives[p]])[j * 2 + 0]); __m128i sum1 = _mm_load_si128(&reinterpret_cast( - accumulation[perspectives[p]][0])[j * 2 + 1]); + accumulation[perspectives[p]])[j * 2 + 1]); const __m128i packedbytes = _mm_packs_epi16(sum0, sum1); _mm_store_si128(&out[j], #ifdef USE_SSE41 - _mm_max_epi8(packedbytes, kZero) + _mm_max_epi8(packedbytes, Zero) #else _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s) #endif @@ -204,26 +211,26 @@ namespace Stockfish::Eval::NNUE { #elif defined(USE_MMX) auto out = 
reinterpret_cast<__m64*>(&output[offset]); - for (IndexType j = 0; j < kNumChunks; ++j) { + for (IndexType j = 0; j < NumChunks; ++j) { __m64 sum0 = *(&reinterpret_cast( - accumulation[perspectives[p]][0])[j * 2 + 0]); + accumulation[perspectives[p]])[j * 2 + 0]); __m64 sum1 = *(&reinterpret_cast( - accumulation[perspectives[p]][0])[j * 2 + 1]); + accumulation[perspectives[p]])[j * 2 + 1]); const __m64 packedbytes = _mm_packs_pi16(sum0, sum1); out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s); } #elif defined(USE_NEON) const auto out = reinterpret_cast(&output[offset]); - for (IndexType j = 0; j < kNumChunks; ++j) { + for (IndexType j = 0; j < NumChunks; ++j) { int16x8_t sum = reinterpret_cast( - accumulation[perspectives[p]][0])[j]; - out[j] = vmax_s8(vqmovn_s16(sum), kZero); + accumulation[perspectives[p]])[j]; + out[j] = vmax_s8(vqmovn_s16(sum), Zero); } #else - for (IndexType j = 0; j < kHalfDimensions; ++j) { - BiasType sum = accumulation[static_cast(perspectives[p])][0][j]; + for (IndexType j = 0; j < HalfDimensions; ++j) { + BiasType sum = accumulation[static_cast(perspectives[p])][j]; output[offset + j] = static_cast( std::max(0, std::min(127, sum))); } @@ -236,34 +243,36 @@ namespace Stockfish::Eval::NNUE { } private: - void UpdateAccumulator(const Position& pos, const Color c) const { + void update_accumulator(const Position& pos, const Color perspective) const { + + // The size must be enough to contain the largest possible update. + // That might depend on the feature set and generally relies on the + // feature set's update cost calculation to be correct and never + // allow updates with more added/removed features than MaxActiveDimensions. + using IndexList = ValueList; #ifdef VECTOR // Gcc-10.2 unnecessarily spills AVX2 registers if this array // is defined in the VECTOR code below, once in each branch - vec_t acc[kNumRegs]; + vec_t acc[NumRegs]; #endif // Look for a usable accumulator of an earlier position. 
We keep track // of the estimated gain in terms of features to be added/subtracted. StateInfo *st = pos.state(), *next = nullptr; - int gain = pos.count() - 2; - while (st->accumulator.state[c] == EMPTY) + int gain = FeatureSet::refresh_cost(pos); + while (st->accumulator.state[perspective] == EMPTY) { - auto& dp = st->dirtyPiece; - // The first condition tests whether an incremental update is - // possible at all: if this side's king has moved, it is not possible. - static_assert(std::is_same_v>, - "Current code assumes that only kFriendlyKingMoved refresh trigger is being used."); - if ( dp.piece[0] == make_piece(c, KING) - || (gain -= dp.dirty_num + 1) < 0) + // This governs when a full feature refresh is needed and how many + // updates are better than just one full refresh. + if ( FeatureSet::requires_refresh(st, perspective) + || (gain -= FeatureSet::update_cost(st) + 1) < 0) break; next = st; st = st->previous; } - if (st->accumulator.state[c] == COMPUTED) + if (st->accumulator.state[perspective] == COMPUTED) { if (next == nullptr) return; @@ -271,85 +280,83 @@ namespace Stockfish::Eval::NNUE { // Update incrementally in two steps. First, we update the "next" // accumulator. Then, we update the current accumulator (pos.state()). - // Gather all features to be updated. This code assumes HalfKP features - // only and doesn't support refresh triggers. - static_assert(std::is_same_v>, - RawFeatures>); - Features::IndexList removed[2], added[2]; - Features::HalfKP::AppendChangedIndices(pos, - next->dirtyPiece, c, &removed[0], &added[0]); + // Gather all features to be updated. 
+ const Square ksq = pos.square(perspective); + IndexList removed[2], added[2]; + FeatureSet::append_changed_indices( + ksq, next, perspective, removed[0], added[0]); for (StateInfo *st2 = pos.state(); st2 != next; st2 = st2->previous) - Features::HalfKP::AppendChangedIndices(pos, - st2->dirtyPiece, c, &removed[1], &added[1]); + FeatureSet::append_changed_indices( + ksq, st2, perspective, removed[1], added[1]); // Mark the accumulators as computed. - next->accumulator.state[c] = COMPUTED; - pos.state()->accumulator.state[c] = COMPUTED; + next->accumulator.state[perspective] = COMPUTED; + pos.state()->accumulator.state[perspective] = COMPUTED; - // Now update the accumulators listed in info[], where the last element is a sentinel. - StateInfo *info[3] = + // Now update the accumulators listed in states_to_update[], where the last element is a sentinel. + StateInfo *states_to_update[3] = { next, next == pos.state() ? nullptr : pos.state(), nullptr }; #ifdef VECTOR - for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) + for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j) { // Load accumulator auto accTile = reinterpret_cast( - &st->accumulator.accumulation[c][0][j * kTileHeight]); - for (IndexType k = 0; k < kNumRegs; ++k) + &st->accumulator.accumulation[perspective][j * TileHeight]); + for (IndexType k = 0; k < NumRegs; ++k) acc[k] = vec_load(&accTile[k]); - for (IndexType i = 0; info[i]; ++i) + for (IndexType i = 0; states_to_update[i]; ++i) { // Difference calculation for the deactivated features for (const auto index : removed[i]) { - const IndexType offset = kHalfDimensions * index + j * kTileHeight; - auto column = reinterpret_cast(&weights_[offset]); - for (IndexType k = 0; k < kNumRegs; ++k) + const IndexType offset = HalfDimensions * index + j * TileHeight; + auto column = reinterpret_cast(&weights[offset]); + for (IndexType k = 0; k < NumRegs; ++k) acc[k] = vec_sub_16(acc[k], column[k]); } // Difference calculation for the activated 
features for (const auto index : added[i]) { - const IndexType offset = kHalfDimensions * index + j * kTileHeight; - auto column = reinterpret_cast(&weights_[offset]); - for (IndexType k = 0; k < kNumRegs; ++k) + const IndexType offset = HalfDimensions * index + j * TileHeight; + auto column = reinterpret_cast(&weights[offset]); + for (IndexType k = 0; k < NumRegs; ++k) acc[k] = vec_add_16(acc[k], column[k]); } // Store accumulator accTile = reinterpret_cast( - &info[i]->accumulator.accumulation[c][0][j * kTileHeight]); - for (IndexType k = 0; k < kNumRegs; ++k) + &states_to_update[i]->accumulator.accumulation[perspective][j * TileHeight]); + for (IndexType k = 0; k < NumRegs; ++k) vec_store(&accTile[k], acc[k]); } } #else - for (IndexType i = 0; info[i]; ++i) + for (IndexType i = 0; states_to_update[i]; ++i) { - std::memcpy(info[i]->accumulator.accumulation[c][0], - st->accumulator.accumulation[c][0], - kHalfDimensions * sizeof(BiasType)); - st = info[i]; + std::memcpy(states_to_update[i]->accumulator.accumulation[perspective], + st->accumulator.accumulation[perspective], + HalfDimensions * sizeof(BiasType)); + st = states_to_update[i]; // Difference calculation for the deactivated features for (const auto index : removed[i]) { - const IndexType offset = kHalfDimensions * index; + const IndexType offset = HalfDimensions * index; - for (IndexType j = 0; j < kHalfDimensions; ++j) - st->accumulator.accumulation[c][0][j] -= weights_[offset + j]; + for (IndexType j = 0; j < HalfDimensions; ++j) + st->accumulator.accumulation[perspective][j] -= weights[offset + j]; } // Difference calculation for the activated features for (const auto index : added[i]) { - const IndexType offset = kHalfDimensions * index; + const IndexType offset = HalfDimensions * index; - for (IndexType j = 0; j < kHalfDimensions; ++j) - st->accumulator.accumulation[c][0][j] += weights_[offset + j]; + for (IndexType j = 0; j < HalfDimensions; ++j) + st->accumulator.accumulation[perspective][j] += 
weights[offset + j]; } } #endif @@ -358,43 +365,43 @@ namespace Stockfish::Eval::NNUE { { // Refresh the accumulator auto& accumulator = pos.state()->accumulator; - accumulator.state[c] = COMPUTED; - Features::IndexList active; - Features::HalfKP::AppendActiveIndices(pos, c, &active); + accumulator.state[perspective] = COMPUTED; + IndexList active; + FeatureSet::append_active_indices(pos, perspective, active); #ifdef VECTOR - for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) + for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j) { auto biasesTile = reinterpret_cast( - &biases_[j * kTileHeight]); - for (IndexType k = 0; k < kNumRegs; ++k) + &biases[j * TileHeight]); + for (IndexType k = 0; k < NumRegs; ++k) acc[k] = biasesTile[k]; for (const auto index : active) { - const IndexType offset = kHalfDimensions * index + j * kTileHeight; - auto column = reinterpret_cast(&weights_[offset]); + const IndexType offset = HalfDimensions * index + j * TileHeight; + auto column = reinterpret_cast(&weights[offset]); - for (unsigned k = 0; k < kNumRegs; ++k) + for (unsigned k = 0; k < NumRegs; ++k) acc[k] = vec_add_16(acc[k], column[k]); } auto accTile = reinterpret_cast( - &accumulator.accumulation[c][0][j * kTileHeight]); - for (unsigned k = 0; k < kNumRegs; k++) + &accumulator.accumulation[perspective][j * TileHeight]); + for (unsigned k = 0; k < NumRegs; k++) vec_store(&accTile[k], acc[k]); } #else - std::memcpy(accumulator.accumulation[c][0], biases_, - kHalfDimensions * sizeof(BiasType)); + std::memcpy(accumulator.accumulation[perspective], biases, + HalfDimensions * sizeof(BiasType)); for (const auto index : active) { - const IndexType offset = kHalfDimensions * index; + const IndexType offset = HalfDimensions * index; - for (IndexType j = 0; j < kHalfDimensions; ++j) - accumulator.accumulation[c][0][j] += weights_[offset + j]; + for (IndexType j = 0; j < HalfDimensions; ++j) + accumulator.accumulation[perspective][j] += weights[offset + j]; } #endif } @@ 
-407,9 +414,8 @@ namespace Stockfish::Eval::NNUE { using BiasType = std::int16_t; using WeightType = std::int16_t; - alignas(kCacheLineSize) BiasType biases_[kHalfDimensions]; - alignas(kCacheLineSize) - WeightType weights_[kHalfDimensions * kInputDimensions]; + alignas(CacheLineSize) BiasType biases[HalfDimensions]; + alignas(CacheLineSize) WeightType weights[HalfDimensions * InputDimensions]; }; } // namespace Stockfish::Eval::NNUE diff --git a/src/position.cpp b/src/position.cpp index 8ef516b6..b555864d 100644 --- a/src/position.cpp +++ b/src/position.cpp @@ -84,7 +84,7 @@ std::ostream& operator<<(std::ostream& os, const Position& pos) { && !pos.can_castle(ANY_CASTLING)) { StateInfo st; - ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize); + ASSERT_ALIGNED(&st, Eval::NNUE::CacheLineSize); Position p; p.set(pos.fen(), pos.is_chess960(), &st, pos.this_thread()); @@ -1337,7 +1337,7 @@ bool Position::pos_is_ok() const { assert(0 && "pos_is_ok: Bitboards"); StateInfo si = *st; - ASSERT_ALIGNED(&si, Eval::NNUE::kCacheLineSize); + ASSERT_ALIGNED(&si, Eval::NNUE::CacheLineSize); set_state(&si); if (std::memcmp(&si, st, sizeof(StateInfo))) diff --git a/src/position.h b/src/position.h index e816c541..7abd030e 100644 --- a/src/position.h +++ b/src/position.h @@ -118,7 +118,6 @@ public: Bitboard blockers_for_king(Color c) const; Bitboard check_squares(PieceType pt) const; Bitboard pinners(Color c) const; - bool is_discovered_check_on_king(Color c, Move m) const; // Attacks to/from a given square Bitboard attackers_to(Square s) const; @@ -328,10 +327,6 @@ inline Bitboard Position::check_squares(PieceType pt) const { return st->checkSquares[pt]; } -inline bool Position::is_discovered_check_on_king(Color c, Move m) const { - return st->blockersForKing[c] & from_sq(m); -} - inline bool Position::pawn_passed(Color c, Square s) const { return !(pieces(~c, PAWN) & passed_pawn_span(c, s)); } diff --git a/src/search.cpp b/src/search.cpp index a179c1d1..4d8ea035 100644 --- 
a/src/search.cpp +++ b/src/search.cpp @@ -96,49 +96,6 @@ namespace { Move best = MOVE_NONE; }; - // Breadcrumbs are used to mark nodes as being searched by a given thread - struct Breadcrumb { - std::atomic thread; - std::atomic key; - }; - std::array breadcrumbs; - - // ThreadHolding structure keeps track of which thread left breadcrumbs at the given - // node for potential reductions. A free node will be marked upon entering the moves - // loop by the constructor, and unmarked upon leaving that loop by the destructor. - struct ThreadHolding { - explicit ThreadHolding(Thread* thisThread, Key posKey, int ply) { - location = ply < 8 ? &breadcrumbs[posKey & (breadcrumbs.size() - 1)] : nullptr; - otherThread = false; - owning = false; - if (location) - { - // See if another already marked this location, if not, mark it ourselves - Thread* tmp = (*location).thread.load(std::memory_order_relaxed); - if (tmp == nullptr) - { - (*location).thread.store(thisThread, std::memory_order_relaxed); - (*location).key.store(posKey, std::memory_order_relaxed); - owning = true; - } - else if ( tmp != thisThread - && (*location).key.load(std::memory_order_relaxed) == posKey) - otherThread = true; - } - } - - ~ThreadHolding() { - if (owning) // Free the marked location - (*location).thread.store(nullptr, std::memory_order_relaxed); - } - - bool marked() { return otherThread; } - - private: - Breadcrumb* location; - bool otherThread, owning; - }; - template Value search(Position& pos, Stack* ss, Value alpha, Value beta, Depth depth, bool cutNode); @@ -159,7 +116,7 @@ namespace { uint64_t perft(Position& pos, Depth depth) { StateInfo st; - ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize); + ASSERT_ALIGNED(&st, Eval::NNUE::CacheLineSize); uint64_t cnt, nodes = 0; const bool leaf = (depth == 2); @@ -594,7 +551,7 @@ namespace { Move pv[MAX_PLY+1], capturesSearched[32], quietsSearched[64]; StateInfo st; - ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize); + ASSERT_ALIGNED(&st, 
Eval::NNUE::CacheLineSize); TTEntry* tte; Key posKey; @@ -1003,8 +960,12 @@ moves_loop: // When in check, search starts from here value = bestValue; singularQuietLMR = moveCountPruning = false; - // Mark this node as being searched - ThreadHolding th(thisThread, posKey, ss->ply); + // Indicate PvNodes that will probably fail low if the node was searched + // at a depth equal or greater than the current depth, and the result of this search was a fail low. + bool likelyFailLow = PvNode + && ttMove + && (tte->bound() & BOUND_UPPER) + && tte->depth() >= depth; // Step 12. Loop through all pseudo-legal moves until no moves remain // or a beta cutoff occurs. @@ -1043,14 +1004,6 @@ moves_loop: // When in check, search starts from here movedPiece = pos.moved_piece(move); givesCheck = pos.gives_check(move); - // Indicate PvNodes that will probably fail low if node was searched with non-PV search - // at depth equal or greater to current depth and result of this search was far below alpha - bool likelyFailLow = PvNode - && ttMove - && (tte->bound() & BOUND_UPPER) - && ttValue < alpha + 200 + 100 * depth - && tte->depth() >= depth; - // Calculate new depth for this move newDepth = depth - 1; @@ -1130,6 +1083,8 @@ moves_loop: // When in check, search starts from here { extension = 1; singularQuietLMR = !ttCapture; + if (!PvNode && value < singularBeta - 140) + extension = 2; } // Multi-cut pruning @@ -1153,11 +1108,6 @@ moves_loop: // When in check, search starts from here } } - // Check extension (~2 Elo) - else if ( givesCheck - && (pos.is_discovered_check_on_king(~us, move) || pos.see_ge(move))) - extension = 1; - // Add extension to new depth newDepth += extension; @@ -1185,7 +1135,8 @@ moves_loop: // When in check, search starts from here || ss->staticEval + PieceValue[EG][pos.captured_piece()] <= alpha || cutNode || (!PvNode && !formerPv && captureHistory[movedPiece][to_sq(move)][type_of(pos.captured_piece())] < 3678) - || thisThread->ttHitAverage < 432 * 
TtHitAverageResolution * TtHitAverageWindow / 1024)) + || thisThread->ttHitAverage < 432 * TtHitAverageResolution * TtHitAverageWindow / 1024) + && (!PvNode || ss->ply > 1 || thisThread->id() % 4 != 3)) { Depth r = reduction(improving, depth, moveCount); @@ -1193,10 +1144,6 @@ moves_loop: // When in check, search starts from here if (thisThread->ttHitAverage > 537 * TtHitAverageResolution * TtHitAverageWindow / 1024) r--; - // Increase reduction if other threads are searching this position - if (th.marked()) - r++; - // Decrease reduction if position is or has been on the PV // and node is not likely to fail low. (~10 Elo) if ( ss->ttPv @@ -1209,29 +1156,17 @@ moves_loop: // When in check, search starts from here && thisThread->bestMoveChanges <= 2) r++; - // More reductions for late moves if position was not in previous PV - if ( moveCountPruning - && !formerPv) - r++; - - // Decrease reduction if opponent's move count is high (~5 Elo) + // Decrease reduction if opponent's move count is high (~1 Elo) if ((ss-1)->moveCount > 13) r--; - // Decrease reduction if ttMove has been singularly extended (~3 Elo) + // Decrease reduction if ttMove has been singularly extended (~1 Elo) if (singularQuietLMR) r--; - if (captureOrPromotion) + if (!captureOrPromotion) { - // Increase reduction for non-checking captures likely to be bad - if ( !givesCheck - && ss->staticEval + PieceValue[EG][pos.captured_piece()] + 210 * depth <= alpha) - r++; - } - else - { - // Increase reduction if ttMove is a capture (~5 Elo) + // Increase reduction if ttMove is a capture (~3 Elo) if (ttCapture) r++; @@ -1242,13 +1177,6 @@ moves_loop: // When in check, search starts from here if (cutNode) r += 2; - // Decrease reduction for moves that escape a capture. 
Filter out - // castling moves, because they are coded as "king captures rook" and - // hence break reverse_move() (~2 Elo) - else if ( type_of(move) == NORMAL - && !pos.see_ge(reverse_move(move))) - r -= 2 + ss->ttPv - (type_of(movedPiece) == PAWN); - ss->statScore = thisThread->mainHistory[us][from_to(move)] + (*contHist[0])[movedPiece][to_sq(move)] + (*contHist[1])[movedPiece][to_sq(move)] @@ -1458,7 +1386,7 @@ moves_loop: // When in check, search starts from here Move pv[MAX_PLY+1]; StateInfo st; - ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize); + ASSERT_ALIGNED(&st, Eval::NNUE::CacheLineSize); TTEntry* tte; Key posKey; @@ -1964,7 +1892,7 @@ string UCI::pv(const Position& pos, Depth depth, Value alpha, Value beta) { bool RootMove::extract_ponder_from_tt(Position& pos) { StateInfo st; - ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize); + ASSERT_ALIGNED(&st, Eval::NNUE::CacheLineSize); bool ttHit; diff --git a/src/thread.h b/src/thread.h index 06fc18f4..767bf4ac 100644 --- a/src/thread.h +++ b/src/thread.h @@ -74,6 +74,7 @@ public: void idle_loop(); void start_searching(); void wait_for_search_finished(); + int id() const { return idx; } void wait_for_worker_finished(); size_t thread_idx() const { return idx; } diff --git a/src/timeman.cpp b/src/timeman.cpp index f742d1e4..3236b6e9 100644 --- a/src/timeman.cpp +++ b/src/timeman.cpp @@ -94,6 +94,14 @@ void TimeManagement::init(Search::LimitsType& limits, Color us, int ply) { optimumTime = TimePoint(optScale * timeLeft); maximumTime = TimePoint(std::min(0.8 * limits.time[us] - moveOverhead, maxScale * optimumTime)); + if (Stockfish::Search::Limits.use_time_management()) + { + int strength = std::log( std::max(1, int(optimumTime * Threads.size() / 10))) * 60; + tempoNNUE = std::clamp( (strength + 264) / 24, 18, 30); + } + else + tempoNNUE = 28; // default for no time given + if (Options["Ponder"]) optimumTime += optimumTime / 4; } diff --git a/src/timeman.h b/src/timeman.h index b1878d65..4ac0b4be 100644 --- 
a/src/timeman.h +++ b/src/timeman.h @@ -37,6 +37,7 @@ public: TimePoint(Threads.nodes_searched()) : now() - startTime; } int64_t availableNodes; // When in 'nodes as time' mode + int tempoNNUE; private: TimePoint startTime; diff --git a/src/uci.cpp b/src/uci.cpp index 6f5b28a9..b1b39bc4 100644 --- a/src/uci.cpp +++ b/src/uci.cpp @@ -319,7 +319,14 @@ void UCI::loop(int argc, char* argv[]) { else if (token == "d") sync_cout << pos << sync_endl; else if (token == "eval") trace_eval(pos); else if (token == "compiler") sync_cout << compiler_info() << sync_endl; - + else if (token == "export_net") { + std::optional<std::string> filename; + std::string f; + if (is >> skipws >> f) { + filename = f; + } + Eval::NNUE::export_net(filename); + } else if (token == "generate_training_data") Tools::generate_training_data(is); else if (token == "generate_training_data_nonpv") Tools::generate_training_data_nonpv(is); else if (token == "convert") Tools::convert(is);