mirror of
https://github.com/HChaZZY/Stockfish.git
synced 2025-12-06 10:53:50 +08:00
Compare commits
55 Commits
5337edfdb6
...
cluster
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b4ac3d6b96 | ||
|
|
6c19bec86e | ||
|
|
8c4ac26c8e | ||
|
|
a2b24e0030 | ||
|
|
66b4e7f080 | ||
|
|
04a0be956d | ||
|
|
f327096cfb | ||
|
|
e592dcb8e3 | ||
|
|
80eae02603 | ||
|
|
0ad7de3182 | ||
|
|
e0b4dc24c8 | ||
|
|
b79ec2e0b2 | ||
|
|
43c887d367 | ||
|
|
3a187b863b | ||
|
|
5fcd0e6f2a | ||
|
|
19129473f2 | ||
|
|
b706b91bb1 | ||
|
|
8ec5faa46e | ||
|
|
8a9d269855 | ||
|
|
0b3c13107a | ||
|
|
669074672c | ||
|
|
0fd0e4e849 | ||
|
|
85327828c9 | ||
|
|
4cdb6386d8 | ||
|
|
982880bd70 | ||
|
|
bf17a410ec | ||
|
|
5e7777e9d0 | ||
|
|
10a920d7d7 | ||
|
|
21819b7bf8 | ||
|
|
8c4338ae49 | ||
|
|
8a3f8e21ae | ||
|
|
267ca781cd | ||
|
|
ac43bef5c5 | ||
|
|
7a32d26d5f | ||
|
|
fb5c1f5bf5 | ||
|
|
87f0fa55a0 | ||
|
|
2f882309d5 | ||
|
|
86953b9392 | ||
|
|
ba1c639836 | ||
|
|
e526c5aa52 | ||
|
|
9cd2c817db | ||
|
|
54a0a228f6 | ||
|
|
1cd2c7861a | ||
|
|
7af3f4da7a | ||
|
|
271181bb31 | ||
|
|
66b2c6b9f1 | ||
|
|
2559c20c6e | ||
|
|
2659c407c4 | ||
|
|
3730ae1efb | ||
|
|
0d6cdc0c6d | ||
|
|
80afeb0d3b | ||
|
|
2405b38165 | ||
|
|
8a95d269eb | ||
|
|
3b7b632aa5 | ||
|
|
29c166a072 |
27
README.md
27
README.md
@@ -59,6 +59,33 @@ This distribution of Stockfish consists of the following files:
|
||||
* a file with the .nnue extension, storing the neural network for the NNUE
|
||||
evaluation. Binary distributions will have this file embedded.
|
||||
|
||||
## Stockfish on distributed memory systems
|
||||
|
||||
The cluster branch allows for running Stockfish on a cluster of servers (nodes)
|
||||
that are connected with a high-speed and low-latency network, using the message
|
||||
passing interface (MPI). In this case, one MPI process should be run per node,
|
||||
and UCI options can be used to set the number of threads/hash per node as usual.
|
||||
Typically, the engine will be invoked as
|
||||
```
|
||||
mpirun -np N /path/to/stockfish
|
||||
```
|
||||
where ```N``` stands for the number of MPI processes used (alternatives to ```mpirun```
|
||||
include ```mpiexec``` and ```srun```). Use 1 MPI rank per node, and employ threading
|
||||
according to the cores per node. To build the cluster
|
||||
branch, it is sufficient to specify ```COMPCXX=mpicxx``` (or e.g. CC depending on the name
|
||||
of the compiler providing MPI support) on the make command line, and do a clean build:
|
||||
```
|
||||
make -j ARCH=x86-64-modern clean build COMPCXX=mpicxx mpi=yes
|
||||
```
|
||||
Make sure that the MPI installation is configured to support ```MPI_THREAD_MULTIPLE```,
|
||||
this might require adding system specific compiler options to the Makefile. Stockfish employs
|
||||
non-blocking (asynchronous) communication, and benefits from an MPI
|
||||
implementation that efficiently supports this. Some MPI implementations might benefit
|
||||
from leaving 1 core/thread free for these asynchronous communications, and might require
|
||||
setting additional environment variables. ```mpirun``` should forward stdin/stdout
|
||||
to ```rank 0``` only (e.g. ```srun --input=0 --output=0```).
|
||||
Refer to your MPI documentation for more info.
|
||||
|
||||
## Contributing
|
||||
|
||||
__See [Contributing Guide](CONTRIBUTING.md).__
|
||||
|
||||
16
src/Makefile
16
src/Makefile
@@ -53,7 +53,7 @@ PGOBENCH = $(WINE_PATH) ./$(EXE) bench
|
||||
|
||||
### Source and object files
|
||||
SRCS = benchmark.cpp bitboard.cpp evaluate.cpp main.cpp \
|
||||
misc.cpp movegen.cpp movepick.cpp position.cpp \
|
||||
misc.cpp movegen.cpp movepick.cpp position.cpp cluster.cpp \
|
||||
search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \
|
||||
nnue/nnue_misc.cpp nnue/features/half_ka_v2_hm.cpp nnue/network.cpp
|
||||
|
||||
@@ -63,7 +63,7 @@ HEADERS = benchmark.h bitboard.h evaluate.h misc.h movegen.h movepick.h \
|
||||
nnue/layers/sqr_clipped_relu.h nnue/nnue_accumulator.h nnue/nnue_architecture.h \
|
||||
nnue/nnue_common.h nnue/nnue_feature_transformer.h position.h \
|
||||
search.h syzygy/tbprobe.h thread.h thread_win32_osx.h timeman.h \
|
||||
tt.h tune.h types.h uci.h ucioption.h perft.h nnue/network.h
|
||||
tt.h tune.h types.h uci.h ucioption.h perft.h nnue/network.h cluster.h
|
||||
|
||||
OBJS = $(notdir $(SRCS:.cpp=.o))
|
||||
|
||||
@@ -100,6 +100,7 @@ VPATH = syzygy:nnue:nnue/features
|
||||
# vnni512 = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 512
|
||||
# neon = yes/no --- -DUSE_NEON --- Use ARM SIMD architecture
|
||||
# dotprod = yes/no --- -DUSE_NEON_DOTPROD --- Use ARM advanced SIMD Int8 dot product instructions
|
||||
# mpi = yes/no --- -DUSE_MPI --- Use Message Passing Interface
|
||||
#
|
||||
# Note that Makefile is space sensitive, so when adding new architectures
|
||||
# or modifying existing flags, you have to make sure there are no extra spaces
|
||||
@@ -149,6 +150,7 @@ avx512 = no
|
||||
vnni256 = no
|
||||
vnni512 = no
|
||||
neon = no
|
||||
mpi = no
|
||||
dotprod = no
|
||||
arm_version = 0
|
||||
STRIP = strip
|
||||
@@ -791,6 +793,15 @@ ifeq ($(OS), Android)
|
||||
LDFLAGS += -fPIE -pie
|
||||
endif
|
||||
|
||||
### 3.10 MPI
|
||||
ifneq (,$(findstring mpi, $(CXX)))
|
||||
mpi = yes
|
||||
endif
|
||||
ifeq ($(mpi),yes)
|
||||
CXXFLAGS += -DUSE_MPI -Wno-cast-qual -fexceptions
|
||||
DEPENDFLAGS += -DUSE_MPI
|
||||
endif
|
||||
|
||||
### ==========================================================================
|
||||
### Section 4. Public Targets
|
||||
### ==========================================================================
|
||||
@@ -1013,6 +1024,7 @@ config-sanity: net
|
||||
@echo "vnni256: '$(vnni256)'"
|
||||
@echo "vnni512: '$(vnni512)'"
|
||||
@echo "neon: '$(neon)'"
|
||||
@echo "mpi: '$(mpi)'"
|
||||
@echo "dotprod: '$(dotprod)'"
|
||||
@echo "arm_version: '$(arm_version)'"
|
||||
@echo "target_windows: '$(target_windows)'"
|
||||
|
||||
480
src/cluster.cpp
Normal file
480
src/cluster.cpp
Normal file
@@ -0,0 +1,480 @@
|
||||
/*
|
||||
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
|
||||
Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
|
||||
|
||||
Stockfish is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
Stockfish is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifdef USE_MPI
|
||||
|
||||
#include <array>
|
||||
#include <cstddef>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <istream>
|
||||
#include <map>
|
||||
#include <mpi.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "cluster.h"
|
||||
#include "thread.h"
|
||||
#include "timeman.h"
|
||||
#include "tt.h"
|
||||
#include "search.h"
|
||||
|
||||
namespace Stockfish {
|
||||
namespace Cluster {
|
||||
|
||||
// Total number of ranks and rank within the communicator
|
||||
static int world_rank = MPI_PROC_NULL;
|
||||
static int world_size = 0;
|
||||
|
||||
// Signals between ranks exchange basic info using a dedicated communicator
|
||||
static MPI_Comm signalsComm = MPI_COMM_NULL;
|
||||
static MPI_Request reqSignals = MPI_REQUEST_NULL;
|
||||
static uint64_t signalsCallCounter = 0;
|
||||
|
||||
// Signals are the number of nodes searched, stop, table base hits, transposition table saves
|
||||
enum Signals : int {
|
||||
SIG_NODES = 0,
|
||||
SIG_STOP = 1,
|
||||
SIG_TB = 2,
|
||||
SIG_TTS = 3,
|
||||
SIG_NB = 4
|
||||
};
|
||||
static uint64_t signalsSend[SIG_NB] = {};
|
||||
static uint64_t signalsRecv[SIG_NB] = {};
|
||||
static uint64_t nodesSearchedOthers = 0;
|
||||
static uint64_t tbHitsOthers = 0;
|
||||
static uint64_t TTsavesOthers = 0;
|
||||
static uint64_t stopSignalsPosted = 0;
|
||||
|
||||
// The UCI threads of each rank exchange input using a dedicated communicator
|
||||
static MPI_Comm InputComm = MPI_COMM_NULL;
|
||||
|
||||
// bestMove requires MoveInfo communicators and data types
|
||||
static MPI_Comm MoveComm = MPI_COMM_NULL;
|
||||
static MPI_Datatype MIDatatype = MPI_DATATYPE_NULL;
|
||||
|
||||
// TT entries are communicated with a dedicated communicator.
|
||||
// The receive buffer is used to gather information from all ranks.
|
||||
// The TTCacheCounter tracks the number of local elements that are ready to be sent.
|
||||
static MPI_Comm TTComm = MPI_COMM_NULL;
|
||||
static std::array<std::vector<KeyedTTEntry>, 2> TTSendRecvBuffs;
|
||||
static std::array<MPI_Request, 2> reqsTTSendRecv = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
|
||||
static uint64_t sendRecvPosted = 0;
|
||||
static std::atomic<uint64_t> TTCacheCounter = {};
|
||||
|
||||
/// Initialize MPI and associated data types. Note that the MPI library must be configured
|
||||
/// to support MPI_THREAD_MULTIPLE, since multiple threads access MPI simultaneously.
|
||||
void init() {
|
||||
|
||||
int thread_support;
|
||||
MPI_Init_thread(nullptr, nullptr, MPI_THREAD_MULTIPLE, &thread_support);
|
||||
if (thread_support < MPI_THREAD_MULTIPLE)
|
||||
{
|
||||
std::cerr << "Stockfish requires support for MPI_THREAD_MULTIPLE." << std::endl;
|
||||
std::exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
|
||||
|
||||
const std::array<MPI_Aint, 5> MIdisps = {offsetof(MoveInfo, move), offsetof(MoveInfo, ponder),
|
||||
offsetof(MoveInfo, depth), offsetof(MoveInfo, score),
|
||||
offsetof(MoveInfo, rank)};
|
||||
MPI_Type_create_hindexed_block(5, 1, MIdisps.data(), MPI_INT, &MIDatatype);
|
||||
MPI_Type_commit(&MIDatatype);
|
||||
|
||||
MPI_Comm_dup(MPI_COMM_WORLD, &InputComm);
|
||||
MPI_Comm_dup(MPI_COMM_WORLD, &TTComm);
|
||||
MPI_Comm_dup(MPI_COMM_WORLD, &MoveComm);
|
||||
MPI_Comm_dup(MPI_COMM_WORLD, &signalsComm);
|
||||
}
|
||||
|
||||
/// Finalize MPI and free the associated data types.
|
||||
void finalize() {
|
||||
|
||||
MPI_Type_free(&MIDatatype);
|
||||
|
||||
MPI_Comm_free(&InputComm);
|
||||
MPI_Comm_free(&TTComm);
|
||||
MPI_Comm_free(&MoveComm);
|
||||
MPI_Comm_free(&signalsComm);
|
||||
|
||||
MPI_Finalize();
|
||||
}
|
||||
|
||||
/// Return the total number of ranks
|
||||
int size() { return world_size; }
|
||||
|
||||
/// Return the rank (index) of the process
|
||||
int rank() { return world_rank; }
|
||||
|
||||
/// The receive buffer depends on the number of MPI ranks and threads, resize as needed
|
||||
void ttSendRecvBuff_resize(size_t nThreads) {
|
||||
|
||||
for (int i : {0, 1})
|
||||
{
|
||||
TTSendRecvBuffs[i].resize(TTCacheSize * world_size * nThreads);
|
||||
std::fill(TTSendRecvBuffs[i].begin(), TTSendRecvBuffs[i].end(), KeyedTTEntry());
|
||||
}
|
||||
}
|
||||
|
||||
/// As input is only received by the root (rank 0) of the cluster, this input must be relayed
|
||||
/// to the UCI threads of all ranks, in order to setup the position, etc. We do this with a
|
||||
/// dedicated getline implementation, where the root broadcasts to all other ranks the received
|
||||
/// information.
|
||||
bool getline(std::istream& input, std::string& str) {
|
||||
|
||||
int size;
|
||||
std::vector<char> vec;
|
||||
int state;
|
||||
|
||||
if (is_root())
|
||||
{
|
||||
state = static_cast<bool>(std::getline(input, str));
|
||||
vec.assign(str.begin(), str.end());
|
||||
size = vec.size();
|
||||
}
|
||||
|
||||
// Some MPI implementations use busy-wait polling, while we need yielding as otherwise
|
||||
// the UCI thread on the non-root ranks would be consuming resources.
|
||||
static MPI_Request reqInput = MPI_REQUEST_NULL;
|
||||
MPI_Ibcast(&size, 1, MPI_INT, 0, InputComm, &reqInput);
|
||||
if (is_root())
|
||||
MPI_Wait(&reqInput, MPI_STATUS_IGNORE);
|
||||
else
|
||||
{
|
||||
while (true)
|
||||
{
|
||||
int flag;
|
||||
MPI_Test(&reqInput, &flag, MPI_STATUS_IGNORE);
|
||||
if (flag)
|
||||
break;
|
||||
else
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(10));
|
||||
}
|
||||
}
|
||||
|
||||
// Broadcast received string
|
||||
if (!is_root())
|
||||
vec.resize(size);
|
||||
MPI_Bcast(vec.data(), size, MPI_CHAR, 0, InputComm);
|
||||
if (!is_root())
|
||||
str.assign(vec.begin(), vec.end());
|
||||
MPI_Bcast(&state, 1, MPI_INT, 0, InputComm);
|
||||
|
||||
return state;
|
||||
}
|
||||
|
||||
/// Sending part of the signal communication loop
|
||||
namespace {
|
||||
// Snapshot the local counters and the stop flag into signalsSend, start a
// non-blocking sum-reduction of them over all ranks into signalsRecv, and
// count the call.
void signals_send(const ThreadPool& threads) {

    uint64_t* const out = signalsSend;

    out[SIG_NODES] = threads.nodes_searched();
    out[SIG_TB]    = threads.tb_hits();
    out[SIG_TTS]   = threads.TT_saves();
    out[SIG_STOP]  = threads.stop;

    MPI_Iallreduce(out, signalsRecv, SIG_NB, MPI_UINT64_T, MPI_SUM, signalsComm, &reqSignals);

    ++signalsCallCounter;
}
|
||||
|
||||
|
||||
/// Processing part of the signal communication loop.
|
||||
/// For some counters (e.g. nodes) we only keep their sum on the other nodes
|
||||
/// allowing to add local counters at any time for more fine grained process,
|
||||
/// which is useful to indicate progress during early iterations, and to have
|
||||
/// node counts that exactly match the non-MPI code in the single rank case.
|
||||
/// This call also propagates the stop signal between ranks.
|
||||
void signals_process(ThreadPool& threads) {
|
||||
|
||||
nodesSearchedOthers = signalsRecv[SIG_NODES] - signalsSend[SIG_NODES];
|
||||
tbHitsOthers = signalsRecv[SIG_TB] - signalsSend[SIG_TB];
|
||||
TTsavesOthers = signalsRecv[SIG_TTS] - signalsSend[SIG_TTS];
|
||||
stopSignalsPosted = signalsRecv[SIG_STOP];
|
||||
if (signalsRecv[SIG_STOP] > 0)
|
||||
threads.stop = true;
|
||||
}
|
||||
|
||||
// Start the next round of the TT exchange: post a non-blocking receive from the
// previous rank into one of the two buffers and a non-blocking send of the
// other buffer to the next rank (ranks are taken modulo size()). The role of
// the two buffers alternates with the parity of sendRecvPosted.
void sendrecv_post() {

    ++sendRecvPosted;

    auto& recvBuff = TTSendRecvBuffs[sendRecvPosted % 2];
    auto& sendBuff = TTSendRecvBuffs[(sendRecvPosted + 1) % 2];

    const int prevRank = (rank() + size() - 1) % size();
    const int nextRank = (rank() + 1) % size();

    MPI_Irecv(recvBuff.data(), recvBuff.size() * sizeof(KeyedTTEntry), MPI_BYTE, prevRank, 42,
              TTComm, &reqsTTSendRecv[0]);
    MPI_Isend(sendBuff.data(), sendBuff.size() * sizeof(KeyedTTEntry), MPI_BYTE, nextRank, 42,
              TTComm, &reqsTTSendRecv[1]);
}
|
||||
}
|
||||
|
||||
/// During search, most message passing is asynchronous, but at the end of
|
||||
/// search it makes sense to bring them to a common, finalized state.
|
||||
void signals_sync(ThreadPool& threads) {
|
||||
|
||||
while (stopSignalsPosted < uint64_t(size()))
|
||||
signals_poll(threads);
|
||||
|
||||
// Finalize outstanding messages of the signal loops.
|
||||
// We might have issued one call less than needed on some ranks.
|
||||
uint64_t globalCounter;
|
||||
MPI_Allreduce(&signalsCallCounter, &globalCounter, 1, MPI_UINT64_T, MPI_MAX, MoveComm);
|
||||
if (signalsCallCounter < globalCounter)
|
||||
{
|
||||
MPI_Wait(&reqSignals, MPI_STATUS_IGNORE);
|
||||
signals_send(threads);
|
||||
}
|
||||
assert(signalsCallCounter == globalCounter);
|
||||
MPI_Wait(&reqSignals, MPI_STATUS_IGNORE);
|
||||
signals_process(threads);
|
||||
|
||||
// Finalize outstanding messages in the sendRecv loop
|
||||
MPI_Allreduce(&sendRecvPosted, &globalCounter, 1, MPI_UINT64_T, MPI_MAX, MoveComm);
|
||||
while (sendRecvPosted < globalCounter)
|
||||
{
|
||||
MPI_Waitall(reqsTTSendRecv.size(), reqsTTSendRecv.data(), MPI_STATUSES_IGNORE);
|
||||
sendrecv_post();
|
||||
}
|
||||
assert(sendRecvPosted == globalCounter);
|
||||
MPI_Waitall(reqsTTSendRecv.size(), reqsTTSendRecv.data(), MPI_STATUSES_IGNORE);
|
||||
}
|
||||
|
||||
/// Initialize signal counters to zero.
|
||||
void signals_init() {
|
||||
|
||||
stopSignalsPosted = tbHitsOthers = TTsavesOthers = nodesSearchedOthers = 0;
|
||||
|
||||
signalsSend[SIG_NODES] = signalsRecv[SIG_NODES] = 0;
|
||||
signalsSend[SIG_TB] = signalsRecv[SIG_TB] = 0;
|
||||
signalsSend[SIG_TTS] = signalsRecv[SIG_TTS] = 0;
|
||||
signalsSend[SIG_STOP] = signalsRecv[SIG_STOP] = 0;
|
||||
}
|
||||
|
||||
/// Poll the signal loop, and start next round as needed.
|
||||
void signals_poll(ThreadPool& threads) {
|
||||
|
||||
int flag;
|
||||
MPI_Test(&reqSignals, &flag, MPI_STATUS_IGNORE);
|
||||
if (flag)
|
||||
{
|
||||
signals_process(threads);
|
||||
signals_send(threads);
|
||||
}
|
||||
}
|
||||
|
||||
/// Provide basic info related the cluster performance, in particular, the number of signals send,
|
||||
/// signals per sounds (sps), the number of gathers, the number of positions gathered (per node and per second, gpps)
|
||||
/// The number of TT saves and TT saves per second. If gpps equals approximately TTSavesps the gather loop has enough bandwidth.
|
||||
void cluster_info(const ThreadPool& threads, Depth depth, TimePoint elapsed) {
|
||||
|
||||
// TimePoint elapsed = Time.elapsed() + 1;
|
||||
uint64_t TTSaves = TT_saves(threads);
|
||||
|
||||
sync_cout << "info depth " << depth << " cluster "
|
||||
<< " signals " << signalsCallCounter << " sps " << signalsCallCounter * 1000 / elapsed
|
||||
<< " sendRecvs " << sendRecvPosted << " srpps "
|
||||
<< TTSendRecvBuffs[0].size() * sendRecvPosted * 1000 / elapsed << " TTSaves "
|
||||
<< TTSaves << " TTSavesps " << TTSaves * 1000 / elapsed << sync_endl;
|
||||
}
|
||||
|
||||
/// When a TT entry is saved, additional steps are taken if the entry is of sufficient depth.
|
||||
/// If sufficient entries has been collected, a communication is initiated.
|
||||
/// If a communication has been completed, the received results are saved to the TT.
|
||||
void save(TranspositionTable& TT,
|
||||
ThreadPool& threads,
|
||||
Search::Worker* thread,
|
||||
TTEntry* tte,
|
||||
Key k,
|
||||
Value v,
|
||||
bool PvHit,
|
||||
Bound b,
|
||||
Depth d,
|
||||
Move m,
|
||||
Value ev,
|
||||
uint8_t generation8) {
|
||||
|
||||
// Standard save to the TT
|
||||
tte->save(k, v, PvHit, b, d, m, ev, generation8);
|
||||
|
||||
// If the entry is of sufficient depth to be worth communicating, take action.
|
||||
if (d > 3)
|
||||
{
|
||||
// count the TTsaves to information: this should be relatively similar
|
||||
// to the number of entries we can send/recv.
|
||||
thread->TTsaves.fetch_add(1, std::memory_order_relaxed);
|
||||
|
||||
// Add to thread's send buffer, the locking here avoids races when the master thread
|
||||
// prepares the send buffer.
|
||||
{
|
||||
std::lock_guard<std::mutex> lk(thread->ttCache.mutex);
|
||||
thread->ttCache.buffer.replace(KeyedTTEntry(k, *tte));
|
||||
++TTCacheCounter;
|
||||
}
|
||||
|
||||
size_t recvBuffPerRankSize = threads.size() * TTCacheSize;
|
||||
|
||||
// Communicate on main search thread, as soon the threads combined have collected
|
||||
// sufficient data to fill the send buffers.
|
||||
if (thread == threads.main_thread()->worker.get() && TTCacheCounter > recvBuffPerRankSize)
|
||||
{
|
||||
// Test communication status
|
||||
int flag;
|
||||
MPI_Testall(reqsTTSendRecv.size(), reqsTTSendRecv.data(), &flag, MPI_STATUSES_IGNORE);
|
||||
|
||||
// Current communication is complete
|
||||
if (flag)
|
||||
{
|
||||
// Save all received entries to TT, and store our TTCaches, ready for the next round of communication
|
||||
for (size_t irank = 0; irank < size_t(size()); ++irank)
|
||||
{
|
||||
if (irank
|
||||
== size_t(
|
||||
rank())) // this is our part, fill the part of the buffer for sending
|
||||
{
|
||||
// Copy from the thread caches to the right spot in the buffer
|
||||
size_t i = irank * recvBuffPerRankSize;
|
||||
for (auto&& th : threads)
|
||||
{
|
||||
std::lock_guard<std::mutex> lk(th->worker->ttCache.mutex);
|
||||
|
||||
for (auto&& e : th->worker->ttCache.buffer)
|
||||
TTSendRecvBuffs[sendRecvPosted % 2][i++] = e;
|
||||
|
||||
// Reset thread's send buffer
|
||||
th->worker->ttCache.buffer = {};
|
||||
}
|
||||
|
||||
TTCacheCounter = 0;
|
||||
}
|
||||
else // process data received from the corresponding rank.
|
||||
for (size_t i = irank * recvBuffPerRankSize;
|
||||
i < (irank + 1) * recvBuffPerRankSize; ++i)
|
||||
{
|
||||
auto&& e = TTSendRecvBuffs[sendRecvPosted % 2][i];
|
||||
bool found;
|
||||
TTEntry* replace_tte;
|
||||
replace_tte = TT.probe(e.first, found);
|
||||
replace_tte->save(e.first, e.second.value(), e.second.is_pv(),
|
||||
e.second.bound(), e.second.depth(), e.second.move(),
|
||||
e.second.eval(), TT.generation());
|
||||
}
|
||||
}
|
||||
|
||||
// Start next communication
|
||||
sendrecv_post();
|
||||
|
||||
// Force check of time on the next occasion, the above actions might have taken some time.
|
||||
thread->main_manager()->callsCnt = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Picks the bestMove across ranks, and send the associated info and PV to the root of the cluster.
|
||||
/// Note that this bestMove and PV must be output by the root, the guarantee proper ordering of output.
|
||||
/// TODO update to the scheme in master.. can this use aggregation of votes?
|
||||
void pick_moves(MoveInfo& mi, std::string& PVLine) {
|
||||
|
||||
MoveInfo* pMoveInfo = NULL;
|
||||
if (is_root())
|
||||
{
|
||||
pMoveInfo = (MoveInfo*) malloc(sizeof(MoveInfo) * size());
|
||||
}
|
||||
MPI_Gather(&mi, 1, MIDatatype, pMoveInfo, 1, MIDatatype, 0, MoveComm);
|
||||
|
||||
if (is_root())
|
||||
{
|
||||
std::map<int, int> votes;
|
||||
int minScore = pMoveInfo[0].score;
|
||||
for (int i = 0; i < size(); ++i)
|
||||
{
|
||||
minScore = std::min(minScore, pMoveInfo[i].score);
|
||||
votes[pMoveInfo[i].move] = 0;
|
||||
}
|
||||
for (int i = 0; i < size(); ++i)
|
||||
{
|
||||
votes[pMoveInfo[i].move] += pMoveInfo[i].score - minScore + pMoveInfo[i].depth;
|
||||
}
|
||||
int bestVote = votes[pMoveInfo[0].move];
|
||||
for (int i = 0; i < size(); ++i)
|
||||
{
|
||||
if (votes[pMoveInfo[i].move] > bestVote)
|
||||
{
|
||||
bestVote = votes[pMoveInfo[i].move];
|
||||
mi = pMoveInfo[i];
|
||||
}
|
||||
}
|
||||
free(pMoveInfo);
|
||||
}
|
||||
|
||||
// Send around the final result
|
||||
MPI_Bcast(&mi, 1, MIDatatype, 0, MoveComm);
|
||||
|
||||
// Send PV line to root as needed
|
||||
if (mi.rank != 0 && mi.rank == rank())
|
||||
{
|
||||
int size;
|
||||
std::vector<char> vec;
|
||||
vec.assign(PVLine.begin(), PVLine.end());
|
||||
size = vec.size();
|
||||
MPI_Send(&size, 1, MPI_INT, 0, 42, MoveComm);
|
||||
MPI_Send(vec.data(), size, MPI_CHAR, 0, 42, MoveComm);
|
||||
}
|
||||
if (mi.rank != 0 && is_root())
|
||||
{
|
||||
int size;
|
||||
std::vector<char> vec;
|
||||
MPI_Recv(&size, 1, MPI_INT, mi.rank, 42, MoveComm, MPI_STATUS_IGNORE);
|
||||
vec.resize(size);
|
||||
MPI_Recv(vec.data(), size, MPI_CHAR, mi.rank, 42, MoveComm, MPI_STATUS_IGNORE);
|
||||
PVLine.assign(vec.begin(), vec.end());
|
||||
}
|
||||
}
|
||||
|
||||
/// Return nodes searched (lazily updated cluster wide in the signal loop)
|
||||
uint64_t nodes_searched(const ThreadPool& threads) {
|
||||
return nodesSearchedOthers + threads.nodes_searched();
|
||||
}
|
||||
|
||||
/// Return table base hits (lazily updated cluster wide in the signal loop)
|
||||
uint64_t tb_hits(const ThreadPool& threads) { return tbHitsOthers + threads.tb_hits(); }
|
||||
|
||||
/// Return the number of saves to the TT buffers, (lazily updated cluster wide in the signal loop)
|
||||
uint64_t TT_saves(const ThreadPool& threads) { return TTsavesOthers + threads.TT_saves(); }
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#include "cluster.h"
|
||||
#include "thread.h"
|
||||
|
||||
namespace Stockfish {
|
||||
namespace Cluster {
|
||||
|
||||
// Without MPI there are no other ranks: forward to the local thread pool.
uint64_t nodes_searched(const ThreadPool& threads) {
    return threads.nodes_searched();
}
|
||||
|
||||
// Without MPI there are no other ranks: forward to the local thread pool.
uint64_t tb_hits(const ThreadPool& threads) {
    return threads.tb_hits();
}
|
||||
|
||||
// Without MPI there are no other ranks: forward to the local thread pool.
uint64_t TT_saves(const ThreadPool& threads) {
    return threads.TT_saves();
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif // USE_MPI
|
||||
157
src/cluster.h
Normal file
157
src/cluster.h
Normal file
@@ -0,0 +1,157 @@
|
||||
/*
|
||||
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
|
||||
Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
|
||||
|
||||
Stockfish is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
Stockfish is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef CLUSTER_H_INCLUDED
|
||||
#define CLUSTER_H_INCLUDED
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <istream>
|
||||
#include <string>
|
||||
|
||||
#include "tt.h"
|
||||
|
||||
namespace Stockfish {
|
||||
class Thread;
|
||||
class ThreadPool;
|
||||
|
||||
namespace Search {
|
||||
class Worker;
|
||||
}
|
||||
|
||||
/// The Cluster namespace contains functionality required to run on distributed
|
||||
/// memory architectures using MPI as the message passing interface. On a high level,
|
||||
/// a 'lazy SMP'-like scheme is implemented where TT saves of sufficient depth are
|
||||
/// collected on each rank and distributed to, and used by, all other ranks,
|
||||
/// which search essentially independently. The root (MPI rank 0) of the cluster
|
||||
/// is responsible for all I/O and time management, communicating this info to
|
||||
/// the other ranks as needed. UCI options such as Threads and Hash specify these
|
||||
/// quantities per MPI rank. It is recommended to have one rank (MPI process) per node.
|
||||
/// For the non-MPI case, wrappers that will be compiler-optimized away are provided.
|
||||
|
||||
namespace Cluster {
|
||||
|
||||
/// What each rank reports about its search result so that a cluster-wide best
/// move can be chosen. All members are plain ints: in MPI builds the struct is
/// transmitted as five MPI_INT values located at the member offsets, so the
/// member types must stay 'int'.
struct MoveInfo {
    int move;    // move proposed by the reporting rank; votes are keyed on it
    int ponder;  // ponder move going with 'move'
    int depth;   // added to the vote weight of 'move'
    int score;   // vote weight is relative to the lowest score reported
    int rank;    // rank this info comes from; it supplies the PV line if chosen
};
|
||||
|
||||
#ifdef USE_MPI
|
||||
|
||||
// store the TTEntry with its full key, so it can be saved on the receiver side
|
||||
using KeyedTTEntry = std::pair<Key, TTEntry>;
|
||||
constexpr std::size_t TTCacheSize = 16;
|
||||
|
||||
// Threads locally cache their high-depth TT entries till a batch can be send by MPI
|
||||
template<std::size_t N>
|
||||
class TTCache: public std::array<KeyedTTEntry, N> {
|
||||
|
||||
struct Compare {
|
||||
inline bool operator()(const KeyedTTEntry& lhs, const KeyedTTEntry& rhs) {
|
||||
return lhs.second.depth() > rhs.second.depth();
|
||||
}
|
||||
};
|
||||
Compare compare;
|
||||
|
||||
public:
|
||||
// Keep a heap of entries replacing low depth with high depth entries
|
||||
bool replace(const KeyedTTEntry& value) {
|
||||
|
||||
if (compare(value, this->front()))
|
||||
{
|
||||
std::pop_heap(this->begin(), this->end(), compare);
|
||||
this->back() = value;
|
||||
std::push_heap(this->begin(), this->end(), compare);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
void init();
|
||||
void finalize();
|
||||
bool getline(std::istream& input, std::string& str);
|
||||
int size();
|
||||
int rank();
|
||||
inline bool is_root() { return rank() == 0; }
|
||||
void save(TranspositionTable&,
|
||||
ThreadPool&,
|
||||
Search::Worker* thread,
|
||||
TTEntry* tte,
|
||||
Key k,
|
||||
Value v,
|
||||
bool PvHit,
|
||||
Bound b,
|
||||
Depth d,
|
||||
Move m,
|
||||
Value ev,
|
||||
uint8_t generation8);
|
||||
void pick_moves(MoveInfo& mi, std::string& PVLine);
|
||||
void ttSendRecvBuff_resize(size_t nThreads);
|
||||
uint64_t nodes_searched(const ThreadPool&);
|
||||
uint64_t tb_hits(const ThreadPool&);
|
||||
uint64_t TT_saves(const ThreadPool&);
|
||||
void cluster_info(const ThreadPool&, Depth depth, TimePoint elapsed);
|
||||
void signals_init();
|
||||
void signals_poll(ThreadPool& threads);
|
||||
void signals_sync(ThreadPool& threads);
|
||||
|
||||
#else
|
||||
|
||||
inline void init() {}
|
||||
inline void finalize() {}
|
||||
// Non-MPI build: read one line from 'input' into 'str' and report whether the
// read succeeded.
inline bool getline(std::istream& input, std::string& str) {
    std::getline(input, str);
    return !input.fail();
}
|
||||
constexpr int size() { return 1; }
|
||||
constexpr int rank() { return 0; }
|
||||
constexpr bool is_root() { return true; }
|
||||
inline void save(TranspositionTable&,
|
||||
ThreadPool&,
|
||||
Search::Worker*,
|
||||
TTEntry* tte,
|
||||
Key k,
|
||||
Value v,
|
||||
bool PvHit,
|
||||
Bound b,
|
||||
Depth d,
|
||||
Move m,
|
||||
Value ev,
|
||||
uint8_t generation8) {
|
||||
tte->save(k, v, PvHit, b, d, m, ev, generation8);
|
||||
}
|
||||
inline void pick_moves(MoveInfo&, std::string&) {}
|
||||
inline void ttSendRecvBuff_resize(size_t) {}
|
||||
uint64_t nodes_searched(const ThreadPool&);
|
||||
uint64_t tb_hits(const ThreadPool&);
|
||||
uint64_t TT_saves(const ThreadPool&);
|
||||
inline void cluster_info(const ThreadPool&, Depth, TimePoint) {}
|
||||
inline void signals_init() {}
|
||||
inline void signals_poll(ThreadPool& threads) {}
|
||||
inline void signals_sync(ThreadPool& threads) {}
|
||||
|
||||
#endif /* USE_MPI */
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif // #ifndef CLUSTER_H_INCLUDED
|
||||
@@ -29,7 +29,9 @@ using namespace Stockfish;
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
|
||||
std::cout << engine_info() << std::endl;
|
||||
Cluster::init();
|
||||
if (Cluster::is_root())
|
||||
std::cout << engine_info() << std::endl;
|
||||
|
||||
Bitboards::init();
|
||||
Position::init();
|
||||
@@ -40,5 +42,7 @@ int main(int argc, char* argv[]) {
|
||||
|
||||
uci.loop();
|
||||
|
||||
Cluster::finalize();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -27,6 +27,7 @@
|
||||
#include <type_traits>
|
||||
#include <vector>
|
||||
|
||||
#include "../cluster.h"
|
||||
#include "../evaluate.h"
|
||||
#include "../incbin/incbin.h"
|
||||
#include "../misc.h"
|
||||
@@ -250,7 +251,8 @@ void Network<Arch, Transformer>::verify(std::string evalfilePath) const {
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
sync_cout << "info string NNUE evaluation using " << evalfilePath << sync_endl;
|
||||
if (Cluster::is_root())
|
||||
sync_cout << "info string NNUE evaluation using " << evalfilePath << sync_endl;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -21,6 +21,7 @@
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#include "cluster.h"
|
||||
#include "movegen.h"
|
||||
#include "position.h"
|
||||
#include "types.h"
|
||||
@@ -50,7 +51,7 @@ uint64_t perft(Position& pos, Depth depth) {
|
||||
nodes += cnt;
|
||||
pos.undo_move(m);
|
||||
}
|
||||
if (Root)
|
||||
if (Root && Cluster::is_root())
|
||||
sync_cout << UCI::move(m, pos.is_chess960()) << ": " << cnt << sync_endl;
|
||||
}
|
||||
return nodes;
|
||||
@@ -62,7 +63,8 @@ inline void perft(const std::string& fen, Depth depth, bool isChess960) {
|
||||
p.set(fen, isChess960, &states->back());
|
||||
|
||||
uint64_t nodes = perft<true>(p, depth);
|
||||
sync_cout << "\nNodes searched: " << nodes << "\n" << sync_endl;
|
||||
if (Cluster::is_root())
|
||||
sync_cout << "\nNodes searched: " << nodes << "\n" << sync_endl;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
132
src/search.cpp
132
src/search.cpp
@@ -30,6 +30,7 @@
|
||||
#include <sstream>
|
||||
#include <utility>
|
||||
|
||||
#include "cluster.h"
|
||||
#include "evaluate.h"
|
||||
#include "misc.h"
|
||||
#include "movegen.h"
|
||||
@@ -157,9 +158,10 @@ void Search::Worker::start_searching() {
|
||||
if (rootMoves.empty())
|
||||
{
|
||||
rootMoves.emplace_back(Move::none());
|
||||
sync_cout << "info depth 0 score "
|
||||
<< UCI::to_score(rootPos.checkers() ? -VALUE_MATE : VALUE_DRAW, rootPos)
|
||||
<< sync_endl;
|
||||
if (Cluster::is_root())
|
||||
sync_cout << "info depth 0 score "
|
||||
<< UCI::to_score(rootPos.checkers() ? -VALUE_MATE : VALUE_DRAW, rootPos)
|
||||
<< sync_endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -173,12 +175,17 @@ void Search::Worker::start_searching() {
|
||||
// GUI sends a "stop" or "ponderhit" command. We therefore simply wait here
|
||||
// until the GUI sends one of those commands.
|
||||
while (!threads.stop && (main_manager()->ponder || limits.infinite))
|
||||
{} // Busy wait for a stop or a ponder reset
|
||||
{
|
||||
Cluster::signals_poll(threads);
|
||||
} // Busy wait for a stop or a ponder reset
|
||||
|
||||
// Stop the threads if not already stopped (also raise the stop if
|
||||
// "ponderhit" just reset threads.ponder).
|
||||
threads.stop = true;
|
||||
|
||||
// Signal and synchronize all other ranks
|
||||
Cluster::signals_sync(threads);
|
||||
|
||||
// Wait until all threads have finished
|
||||
threads.wait_for_search_finished();
|
||||
|
||||
@@ -186,7 +193,7 @@ void Search::Worker::start_searching() {
|
||||
// the available ones before exiting.
|
||||
if (limits.npmsec)
|
||||
main_manager()->tm.advance_nodes_time(limits.inc[rootPos.side_to_move()]
|
||||
- threads.nodes_searched());
|
||||
- Cluster::nodes_searched(threads));
|
||||
|
||||
Worker* bestThread = this;
|
||||
Skill skill =
|
||||
@@ -196,21 +203,40 @@ void Search::Worker::start_searching() {
|
||||
&& rootMoves[0].pv[0] != Move::none())
|
||||
bestThread = threads.get_best_thread()->worker.get();
|
||||
|
||||
// Prepare PVLine and ponder move
|
||||
std::string PVLine = main_manager()->pv(*bestThread, threads, tt, bestThread->completedDepth);
|
||||
|
||||
main_manager()->bestPreviousScore = bestThread->rootMoves[0].score;
|
||||
main_manager()->bestPreviousAverageScore = bestThread->rootMoves[0].averageScore;
|
||||
|
||||
// Send again PV info if we have a new best thread
|
||||
if (bestThread != this)
|
||||
sync_cout << main_manager()->pv(*bestThread, threads, tt, bestThread->completedDepth)
|
||||
<< sync_endl;
|
||||
|
||||
sync_cout << "bestmove " << UCI::move(bestThread->rootMoves[0].pv[0], rootPos.is_chess960());
|
||||
|
||||
Move bestMove = bestThread->rootMoves[0].pv[0];
|
||||
Move ponderMove = Move::none();
|
||||
if (bestThread->rootMoves[0].pv.size() > 1
|
||||
|| bestThread->rootMoves[0].extract_ponder_from_tt(tt, rootPos))
|
||||
std::cout << " ponder " << UCI::move(bestThread->rootMoves[0].pv[1], rootPos.is_chess960());
|
||||
ponderMove = bestThread->rootMoves[0].pv[1];
|
||||
|
||||
std::cout << sync_endl;
|
||||
// Exchange info as needed
|
||||
Cluster::MoveInfo mi{bestMove.raw(), ponderMove.raw(), bestThread->completedDepth,
|
||||
bestThread->rootMoves[0].score, Cluster::rank()};
|
||||
Cluster::pick_moves(mi, PVLine);
|
||||
|
||||
main_manager()->bestPreviousScore = static_cast<Value>(mi.score);
|
||||
|
||||
if (Cluster::is_root())
|
||||
{
|
||||
// Send again PV info if we have a new best thread/rank
|
||||
if (bestThread != this || mi.rank != 0)
|
||||
sync_cout << PVLine << sync_endl;
|
||||
|
||||
bestMove = static_cast<Move>(mi.move);
|
||||
ponderMove = static_cast<Move>(mi.ponder);
|
||||
|
||||
if (ponderMove != Move::none())
|
||||
sync_cout << "bestmove " << UCI::move(bestMove, rootPos.is_chess960()) << " ponder "
|
||||
<< UCI::move(ponderMove, rootPos.is_chess960()) << sync_endl;
|
||||
else
|
||||
sync_cout << "bestmove " << UCI::move(bestMove, rootPos.is_chess960()) << sync_endl;
|
||||
}
|
||||
}
|
||||
|
||||
// Main iterative deepening loop. It calls search()
|
||||
@@ -272,7 +298,7 @@ void Search::Worker::iterative_deepening() {
|
||||
|
||||
// Iterative deepening loop until requested to stop or the target depth is reached
|
||||
while (++rootDepth < MAX_PLY && !threads.stop
|
||||
&& !(limits.depth && mainThread && rootDepth > limits.depth))
|
||||
&& !(limits.depth && mainThread && Cluster::is_root() && rootDepth > limits.depth))
|
||||
{
|
||||
// Age out PV variability metric
|
||||
if (mainThread)
|
||||
@@ -341,9 +367,14 @@ void Search::Worker::iterative_deepening() {
|
||||
|
||||
// When failing high/low give some update (without cluttering
|
||||
// the UI) before a re-search.
|
||||
if (mainThread && multiPV == 1 && (bestValue <= alpha || bestValue >= beta)
|
||||
&& mainThread->tm.elapsed(threads.nodes_searched()) > 3000)
|
||||
if (Cluster::is_root() && mainThread && multiPV == 1
|
||||
&& (bestValue <= alpha || bestValue >= beta)
|
||||
&& mainThread->tm.elapsed(Cluster::nodes_searched(threads)) > 3000)
|
||||
{
|
||||
sync_cout << main_manager()->pv(*this, threads, tt, rootDepth) << sync_endl;
|
||||
Cluster::cluster_info(threads, rootDepth,
|
||||
mainThread->tm.elapsed(Cluster::nodes_searched(threads)));
|
||||
}
|
||||
|
||||
// In case of failing low/high increase aspiration window and
|
||||
// re-search, otherwise exit the loop.
|
||||
@@ -372,15 +403,19 @@ void Search::Worker::iterative_deepening() {
|
||||
// Sort the PV lines searched so far and update the GUI
|
||||
std::stable_sort(rootMoves.begin() + pvFirst, rootMoves.begin() + pvIdx + 1);
|
||||
|
||||
if (mainThread
|
||||
if (Cluster::is_root() && mainThread
|
||||
&& (threads.stop || pvIdx + 1 == multiPV
|
||||
|| mainThread->tm.elapsed(threads.nodes_searched()) > 3000)
|
||||
|| mainThread->tm.elapsed(Cluster::nodes_searched(threads)) > 3000)
|
||||
// A thread that aborted search can have mated-in/TB-loss PV and score
|
||||
// that cannot be trusted, i.e. it can be delayed or refuted if we would have
|
||||
// had time to fully search other root-moves. Thus we suppress this output and
|
||||
// below pick a proven score/PV for this thread (from the previous iteration).
|
||||
&& !(threads.abortedSearch && rootMoves[0].uciScore <= VALUE_TB_LOSS_IN_MAX_PLY))
|
||||
{
|
||||
sync_cout << main_manager()->pv(*this, threads, tt, rootDepth) << sync_endl;
|
||||
Cluster::cluster_info(threads, rootDepth,
|
||||
mainThread->tm.elapsed(Cluster::nodes_searched(threads)) + 1);
|
||||
}
|
||||
}
|
||||
|
||||
if (!threads.stop)
|
||||
@@ -451,12 +486,12 @@ void Search::Worker::iterative_deepening() {
|
||||
totalTime = std::min(500.0, totalTime);
|
||||
|
||||
if (completedDepth >= 10 && nodesEffort >= 97
|
||||
&& mainThread->tm.elapsed(threads.nodes_searched()) > totalTime * 0.739
|
||||
&& mainThread->tm.elapsed(Cluster::nodes_searched(threads)) > totalTime * 0.739
|
||||
&& !mainThread->ponder)
|
||||
threads.stop = true;
|
||||
|
||||
// Stop the search if we have exceeded the totalTime
|
||||
if (mainThread->tm.elapsed(threads.nodes_searched()) > totalTime)
|
||||
if (mainThread->tm.elapsed(Cluster::nodes_searched(threads)) > totalTime)
|
||||
{
|
||||
// If we are allowed to ponder do not stop the search now but
|
||||
// keep pondering until the GUI sends "ponderhit" or "stop".
|
||||
@@ -468,7 +503,7 @@ void Search::Worker::iterative_deepening() {
|
||||
else
|
||||
threads.increaseDepth =
|
||||
mainThread->ponder
|
||||
|| mainThread->tm.elapsed(threads.nodes_searched()) <= totalTime * 0.506;
|
||||
|| mainThread->tm.elapsed(Cluster::nodes_searched(threads)) <= totalTime * 0.506;
|
||||
}
|
||||
|
||||
mainThread->iterValue[iterIdx] = bestValue;
|
||||
@@ -671,9 +706,9 @@ Value Search::Worker::search(
|
||||
|
||||
if (b == BOUND_EXACT || (b == BOUND_LOWER ? value >= beta : value <= alpha))
|
||||
{
|
||||
tte->save(posKey, value_to_tt(value, ss->ply), ss->ttPv, b,
|
||||
std::min(MAX_PLY - 1, depth + 6), Move::none(), VALUE_NONE,
|
||||
tt.generation());
|
||||
Cluster::save(tt, threads, thisThread, tte, posKey, value_to_tt(value, ss->ply),
|
||||
ss->ttPv, b, std::min(MAX_PLY - 1, depth + 6), Move::none(),
|
||||
VALUE_NONE, tt.generation());
|
||||
|
||||
return value;
|
||||
}
|
||||
@@ -726,8 +761,8 @@ Value Search::Worker::search(
|
||||
ss->staticEval = eval = to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos);
|
||||
|
||||
// Static evaluation is saved as it was before adjustment by correction history
|
||||
tte->save(posKey, VALUE_NONE, ss->ttPv, BOUND_NONE, DEPTH_NONE, Move::none(),
|
||||
unadjustedStaticEval, tt.generation());
|
||||
Cluster::save(tt, threads, thisThread, tte, posKey, VALUE_NONE, ss->ttPv, BOUND_NONE,
|
||||
DEPTH_NONE, Move::none(), unadjustedStaticEval, tt.generation());
|
||||
}
|
||||
|
||||
// Use static evaluation difference to improve quiet move ordering (~9 Elo)
|
||||
@@ -872,8 +907,9 @@ Value Search::Worker::search(
|
||||
if (value >= probCutBeta)
|
||||
{
|
||||
// Save ProbCut data into transposition table
|
||||
tte->save(posKey, value_to_tt(value, ss->ply), ss->ttPv, BOUND_LOWER, depth - 3,
|
||||
move, unadjustedStaticEval, tt.generation());
|
||||
Cluster::save(tt, threads, thisThread, tte, posKey, value_to_tt(value, ss->ply),
|
||||
ss->ttPv, BOUND_LOWER, depth - 3, move, unadjustedStaticEval,
|
||||
tt.generation());
|
||||
return std::abs(value) < VALUE_TB_WIN_IN_MAX_PLY ? value - (probCutBeta - beta)
|
||||
: value;
|
||||
}
|
||||
@@ -930,8 +966,8 @@ moves_loop: // When in check, search starts here
|
||||
|
||||
ss->moveCount = ++moveCount;
|
||||
|
||||
if (rootNode && is_mainthread()
|
||||
&& main_manager()->tm.elapsed(threads.nodes_searched()) > 3000)
|
||||
if (rootNode && Cluster::is_root() && is_mainthread()
|
||||
&& main_manager()->tm.elapsed(Cluster::nodes_searched(threads)) > 3000)
|
||||
sync_cout << "info depth " << depth << " currmove "
|
||||
<< UCI::move(move, pos.is_chess960()) << " currmovenumber "
|
||||
<< moveCount + thisThread->pvIdx << sync_endl;
|
||||
@@ -1341,11 +1377,12 @@ moves_loop: // When in check, search starts here
|
||||
// Write gathered information in transposition table
|
||||
// Static evaluation is saved as it was before correction history
|
||||
if (!excludedMove && !(rootNode && thisThread->pvIdx))
|
||||
tte->save(posKey, value_to_tt(bestValue, ss->ply), ss->ttPv,
|
||||
bestValue >= beta ? BOUND_LOWER
|
||||
: PvNode && bestMove ? BOUND_EXACT
|
||||
: BOUND_UPPER,
|
||||
depth, bestMove, unadjustedStaticEval, tt.generation());
|
||||
Cluster::save(tt, threads, thisThread, tte, posKey, value_to_tt(bestValue, ss->ply),
|
||||
ss->ttPv,
|
||||
bestValue >= beta ? BOUND_LOWER
|
||||
: PvNode && bestMove ? BOUND_EXACT
|
||||
: BOUND_UPPER,
|
||||
depth, bestMove, unadjustedStaticEval, tt.generation());
|
||||
|
||||
// Adjust correction history
|
||||
if (!ss->inCheck && (!bestMove || !pos.capture(bestMove))
|
||||
@@ -1472,8 +1509,9 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta,
|
||||
if (bestValue >= beta)
|
||||
{
|
||||
if (!ss->ttHit)
|
||||
tte->save(posKey, value_to_tt(bestValue, ss->ply), false, BOUND_LOWER, DEPTH_NONE,
|
||||
Move::none(), unadjustedStaticEval, tt.generation());
|
||||
Cluster::save(tt, threads, thisThread, tte, posKey, value_to_tt(bestValue, ss->ply),
|
||||
false, BOUND_LOWER, DEPTH_NONE, Move::none(), unadjustedStaticEval,
|
||||
tt.generation());
|
||||
|
||||
return bestValue;
|
||||
}
|
||||
@@ -1618,9 +1656,9 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta,
|
||||
|
||||
// Save gathered info in transposition table
|
||||
// Static evaluation is saved as it was before adjustment by correction history
|
||||
tte->save(posKey, value_to_tt(bestValue, ss->ply), pvHit,
|
||||
bestValue >= beta ? BOUND_LOWER : BOUND_UPPER, ttDepth, bestMove,
|
||||
unadjustedStaticEval, tt.generation());
|
||||
Cluster::save(tt, threads, thisThread, tte, posKey, value_to_tt(bestValue, ss->ply), pvHit,
|
||||
bestValue >= beta ? BOUND_LOWER : BOUND_UPPER, ttDepth, bestMove,
|
||||
unadjustedStaticEval, tt.generation());
|
||||
|
||||
assert(bestValue > -VALUE_INFINITE && bestValue < VALUE_INFINITE);
|
||||
|
||||
@@ -1846,7 +1884,7 @@ void SearchManager::check_time(Search::Worker& worker) {
|
||||
|
||||
static TimePoint lastInfoTime = now();
|
||||
|
||||
TimePoint elapsed = tm.elapsed(worker.threads.nodes_searched());
|
||||
TimePoint elapsed = tm.elapsed(Cluster::nodes_searched(worker.threads));
|
||||
TimePoint tick = worker.limits.startTime + elapsed;
|
||||
|
||||
if (tick - lastInfoTime >= 1000)
|
||||
@@ -1855,6 +1893,9 @@ void SearchManager::check_time(Search::Worker& worker) {
|
||||
dbg_print();
|
||||
}
|
||||
|
||||
// poll on MPI signals
|
||||
Cluster::signals_poll(worker.threads);
|
||||
|
||||
// We should not stop pondering until told so by the GUI
|
||||
if (ponder)
|
||||
return;
|
||||
@@ -1865,7 +1906,8 @@ void SearchManager::check_time(Search::Worker& worker) {
|
||||
worker.completedDepth >= 1
|
||||
&& ((worker.limits.use_time_management() && (elapsed > tm.maximum() || stopOnPonderhit))
|
||||
|| (worker.limits.movetime && elapsed >= worker.limits.movetime)
|
||||
|| (worker.limits.nodes && worker.threads.nodes_searched() >= worker.limits.nodes)))
|
||||
|| (worker.limits.nodes
|
||||
&& Cluster::nodes_searched(worker.threads) >= worker.limits.nodes)))
|
||||
worker.threads.stop = worker.threads.abortedSearch = true;
|
||||
}
|
||||
|
||||
@@ -1875,13 +1917,13 @@ std::string SearchManager::pv(const Search::Worker& worker,
|
||||
Depth depth) const {
|
||||
std::stringstream ss;
|
||||
|
||||
const auto nodes = threads.nodes_searched();
|
||||
const auto nodes = Cluster::nodes_searched(threads);
|
||||
const auto& rootMoves = worker.rootMoves;
|
||||
const auto& pos = worker.rootPos;
|
||||
size_t pvIdx = worker.pvIdx;
|
||||
TimePoint time = tm.elapsed(nodes) + 1;
|
||||
size_t multiPV = std::min(size_t(worker.options["MultiPV"]), rootMoves.size());
|
||||
uint64_t tbHits = threads.tb_hits() + (worker.tbConfig.rootInTB ? rootMoves.size() : 0);
|
||||
uint64_t tbHits = Cluster::tb_hits(threads) + (worker.tbConfig.rootInTB ? rootMoves.size() : 0);
|
||||
|
||||
for (size_t i = 0; i < multiPV; ++i)
|
||||
{
|
||||
|
||||
26
src/search.h
26
src/search.h
@@ -27,7 +27,9 @@
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <mutex>
|
||||
|
||||
#include "cluster.h"
|
||||
#include "misc.h"
|
||||
#include "movepick.h"
|
||||
#include "position.h"
|
||||
@@ -117,7 +119,7 @@ struct LimitsType {
|
||||
ponderMode = false;
|
||||
}
|
||||
|
||||
bool use_time_management() const { return time[WHITE] || time[BLACK]; }
|
||||
// True only when Cluster::is_root() holds and time[WHITE] or time[BLACK] is
// non-zero; on every other rank this always returns false, so code gated on
// use_time_management() is skipped there.
bool use_time_management() const { return Cluster::is_root() && (time[WHITE] || time[BLACK]); }
|
||||
|
||||
std::vector<Move> searchmoves;
|
||||
TimePoint time[COLOR_NB], inc[COLOR_NB], npmsec, movetime, startTime;
|
||||
@@ -211,6 +213,28 @@ class Worker {
|
||||
PawnHistory pawnHistory;
|
||||
CorrectionHistory correctionHistory;
|
||||
|
||||
// Only present when USE_MPI is defined: a per-Worker Cluster::TTCache of
// Cluster::TTCacheSize entries, bundled with a mutex.
// NOTE(review): presumably the mutex guards `buffer` and the buffer stages
// TT entries for exchange between ranks - confirm against cluster.cpp.
#ifdef USE_MPI
|
||||
struct {
|
||||
std::mutex mutex;
|
||||
Cluster::TTCache<Cluster::TTCacheSize> buffer = {};
|
||||
} ttCache;
|
||||
#endif
|
||||
|
||||
// Per-worker counter; ThreadPool::start_thinking() resets it to 0 and
// ThreadPool::TT_saves() aggregates it across workers.
// NOTE(review): presumably counts transposition-table saves made by this
// worker - confirm where it is incremented (not visible here).
std::atomic<uint64_t> TTsaves;
|
||||
|
||||
friend void Cluster::save(TranspositionTable&,
|
||||
ThreadPool&,
|
||||
Search::Worker*,
|
||||
TTEntry* tte,
|
||||
Key k,
|
||||
Value v,
|
||||
bool PvHit,
|
||||
Bound b,
|
||||
Depth d,
|
||||
Move m,
|
||||
Value ev,
|
||||
uint8_t generation8);
|
||||
|
||||
private:
|
||||
void iterative_deepening();
|
||||
|
||||
|
||||
@@ -37,6 +37,7 @@
|
||||
#include <vector>
|
||||
|
||||
#include "../bitboard.h"
|
||||
#include "../cluster.h"
|
||||
#include "../misc.h"
|
||||
#include "../movegen.h"
|
||||
#include "../position.h"
|
||||
@@ -1466,7 +1467,8 @@ void Tablebases::init(const std::string& paths) {
|
||||
}
|
||||
}
|
||||
|
||||
sync_cout << "info string Found " << TBTables.size() << " tablebases" << sync_endl;
|
||||
if (Cluster::is_root())
|
||||
sync_cout << "info string Found " << TBTables.size() << " tablebases" << sync_endl;
|
||||
}
|
||||
|
||||
// Probe the WDL table for a particular position.
|
||||
|
||||
@@ -25,6 +25,7 @@
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
|
||||
#include "cluster.h"
|
||||
#include "misc.h"
|
||||
#include "movegen.h"
|
||||
#include "search.h"
|
||||
@@ -115,6 +116,7 @@ Search::SearchManager* ThreadPool::main_manager() {
|
||||
|
||||
uint64_t ThreadPool::nodes_searched() const { return accumulate(&Search::Worker::nodes); }
|
||||
uint64_t ThreadPool::tb_hits() const { return accumulate(&Search::Worker::tbHits); }
|
||||
// Aggregates the per-worker Search::Worker::TTsaves counter through
// accumulate(), the same helper nodes_searched() and tb_hits() use.
uint64_t ThreadPool::TT_saves() const { return accumulate(&Search::Worker::TTsaves); }
|
||||
|
||||
// Creates/destroys threads to match the requested number.
|
||||
// Created and launched threads will immediately go to sleep in idle_loop.
|
||||
@@ -147,6 +149,9 @@ void ThreadPool::set(Search::SharedState sharedState) {
|
||||
|
||||
// Reallocate the hash with the new threadpool size
|
||||
sharedState.tt.resize(sharedState.options["Hash"], requested);
|
||||
|
||||
// Adjust cluster buffers
|
||||
Cluster::ttSendRecvBuff_resize(requested);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -205,6 +210,7 @@ void ThreadPool::start_thinking(const OptionsMap& options,
|
||||
th->worker->limits = limits;
|
||||
th->worker->nodes = th->worker->tbHits = th->worker->nmpMinPly =
|
||||
th->worker->bestMoveChanges = 0;
|
||||
th->worker->TTsaves = 0;
|
||||
th->worker->rootDepth = th->worker->completedDepth = 0;
|
||||
th->worker->rootMoves = rootMoves;
|
||||
th->worker->rootPos.set(pos.fen(), pos.is_chess960(), &th->worker->rootState);
|
||||
@@ -212,6 +218,8 @@ void ThreadPool::start_thinking(const OptionsMap& options,
|
||||
th->worker->tbConfig = tbConfig;
|
||||
}
|
||||
|
||||
Cluster::signals_init();
|
||||
|
||||
main_thread()->start_searching();
|
||||
}
|
||||
|
||||
|
||||
@@ -27,6 +27,7 @@
|
||||
#include <mutex>
|
||||
#include <vector>
|
||||
|
||||
#include "movepick.h"
|
||||
#include "position.h"
|
||||
#include "search.h"
|
||||
#include "thread_win32_osx.h"
|
||||
@@ -88,6 +89,7 @@ class ThreadPool {
|
||||
Thread* main_thread() const { return threads.front(); }
|
||||
uint64_t nodes_searched() const;
|
||||
uint64_t tb_hits() const;
|
||||
uint64_t TT_saves() const;
|
||||
Thread* get_best_thread() const;
|
||||
void start_searching();
|
||||
void wait_for_search_finished() const;
|
||||
|
||||
@@ -22,8 +22,8 @@
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
|
||||
#include "cluster.h"
|
||||
#include "misc.h"
|
||||
#include "types.h"
|
||||
|
||||
namespace Stockfish {
|
||||
|
||||
|
||||
29
src/tt.h
29
src/tt.h
@@ -27,16 +27,21 @@
|
||||
|
||||
namespace Stockfish {
|
||||
|
||||
// TTEntry struct is the 10 bytes transposition table entry, defined as below:
|
||||
//
|
||||
// key 16 bit
|
||||
// depth 8 bit
|
||||
// generation 5 bit
|
||||
// pv node 1 bit
|
||||
// bound type 2 bit
|
||||
// move 16 bit
|
||||
// value 16 bit
|
||||
// eval value 16 bit
|
||||
// Forward declaration of Cluster::init(), which TTEntry and
// TranspositionTable name as a friend further down in this header.
namespace Cluster {
|
||||
void init();
|
||||
}
|
||||
|
||||
/// TTEntry struct is the 10 bytes transposition table entry, defined as below:
|
||||
///
|
||||
/// key 16 bit
|
||||
/// depth 8 bit
|
||||
/// generation 5 bit
|
||||
/// pv node 1 bit
|
||||
/// bound type 2 bit
|
||||
/// move 16 bit
|
||||
/// value 16 bit
|
||||
/// eval value 16 bit
|
||||
|
||||
struct TTEntry {
|
||||
|
||||
Move move() const { return Move(move16); }
|
||||
@@ -51,6 +56,8 @@ struct TTEntry {
|
||||
|
||||
private:
|
||||
friend class TranspositionTable;
|
||||
friend void Cluster::init();
|
||||
|
||||
|
||||
uint16_t key16;
|
||||
uint8_t depth8;
|
||||
@@ -68,6 +75,8 @@ struct TTEntry {
|
||||
// prefetched when possible.
|
||||
class TranspositionTable {
|
||||
|
||||
friend void Cluster::init();
|
||||
|
||||
static constexpr int ClusterSize = 3;
|
||||
|
||||
struct Cluster {
|
||||
|
||||
38
src/uci.cpp
38
src/uci.cpp
@@ -32,6 +32,7 @@
|
||||
#include <vector>
|
||||
|
||||
#include "benchmark.h"
|
||||
#include "cluster.h"
|
||||
#include "evaluate.h"
|
||||
#include "movegen.h"
|
||||
#include "nnue/network.h"
|
||||
@@ -112,7 +113,8 @@ void UCI::loop() {
|
||||
do
|
||||
{
|
||||
if (cli.argc == 1
|
||||
&& !getline(std::cin, cmd)) // Wait for an input or an end-of-file (EOF) indication
|
||||
&& !Cluster::getline(std::cin,
|
||||
cmd)) // Wait for an input or an end-of-file (EOF) indication
|
||||
cmd = "quit";
|
||||
|
||||
std::istringstream is(cmd);
|
||||
@@ -130,7 +132,7 @@ void UCI::loop() {
|
||||
else if (token == "ponderhit")
|
||||
threads.main_manager()->ponder = false; // Switch to the normal search
|
||||
|
||||
else if (token == "uci")
|
||||
else if (token == "uci" && Cluster::is_root())
|
||||
sync_cout << "id name " << engine_info(true) << "\n"
|
||||
<< options << "\nuciok" << sync_endl;
|
||||
|
||||
@@ -142,7 +144,7 @@ void UCI::loop() {
|
||||
position(pos, is, states);
|
||||
else if (token == "ucinewgame")
|
||||
search_clear();
|
||||
else if (token == "isready")
|
||||
else if (token == "isready" && Cluster::is_root())
|
||||
sync_cout << "readyok" << sync_endl;
|
||||
|
||||
// Add custom non-UCI commands, mainly for debugging purposes.
|
||||
@@ -151,13 +153,13 @@ void UCI::loop() {
|
||||
pos.flip();
|
||||
else if (token == "bench")
|
||||
bench(pos, is, states);
|
||||
else if (token == "d")
|
||||
else if (token == "d" && Cluster::is_root())
|
||||
sync_cout << pos << sync_endl;
|
||||
else if (token == "eval")
|
||||
else if (token == "eval" && Cluster::is_root())
|
||||
trace_eval(pos);
|
||||
else if (token == "compiler")
|
||||
else if (token == "compiler" && Cluster::is_root())
|
||||
sync_cout << compiler_info() << sync_endl;
|
||||
else if (token == "export_net")
|
||||
else if (token == "export_net" && Cluster::is_root())
|
||||
{
|
||||
std::pair<std::optional<std::string>, std::string> files[2];
|
||||
|
||||
@@ -170,7 +172,9 @@ void UCI::loop() {
|
||||
networks.big.save(files[0].first);
|
||||
networks.small.save(files[1].first);
|
||||
}
|
||||
else if (token == "--help" || token == "help" || token == "--license" || token == "license")
|
||||
else if ((token == "--help" || token == "help" || token == "--license"
|
||||
|| token == "license")
|
||||
&& Cluster::is_root())
|
||||
sync_cout
|
||||
<< "\nStockfish is a powerful chess engine for playing and analyzing."
|
||||
"\nIt is released as free software licensed under the GNU GPLv3 License."
|
||||
@@ -179,7 +183,7 @@ void UCI::loop() {
|
||||
"\nFor any further information, visit https://github.com/official-stockfish/Stockfish#readme"
|
||||
"\nor read the corresponding README.md and Copying.txt files distributed along with this program.\n"
|
||||
<< sync_endl;
|
||||
else if (!token.empty() && token[0] != '#')
|
||||
else if (!token.empty() && token[0] != '#' && Cluster::is_root())
|
||||
sync_cout << "Unknown command: '" << cmd << "'. Type help for more information."
|
||||
<< sync_endl;
|
||||
|
||||
@@ -259,15 +263,16 @@ void UCI::bench(Position& pos, std::istream& args, StateListPtr& states) {
|
||||
|
||||
if (token == "go" || token == "eval")
|
||||
{
|
||||
std::cerr << "\nPosition: " << cnt++ << '/' << num << " (" << pos.fen() << ")"
|
||||
<< std::endl;
|
||||
if (Cluster::is_root())
|
||||
std::cerr << "\nPosition: " << cnt++ << '/' << num << " (" << pos.fen() << ")"
|
||||
<< std::endl;
|
||||
if (token == "go")
|
||||
{
|
||||
go(pos, is, states);
|
||||
threads.main_thread()->wait_for_search_finished();
|
||||
nodes += threads.nodes_searched();
|
||||
nodes += Cluster::nodes_searched(threads);
|
||||
}
|
||||
else
|
||||
else if (Cluster::is_root())
|
||||
trace_eval(pos);
|
||||
}
|
||||
else if (token == "setoption")
|
||||
@@ -285,9 +290,10 @@ void UCI::bench(Position& pos, std::istream& args, StateListPtr& states) {
|
||||
|
||||
dbg_print();
|
||||
|
||||
std::cerr << "\n==========================="
|
||||
<< "\nTotal time (ms) : " << elapsed << "\nNodes searched : " << nodes
|
||||
<< "\nNodes/second : " << 1000 * nodes / elapsed << std::endl;
|
||||
if (Cluster::is_root())
|
||||
std::cerr << "\n==========================="
|
||||
<< "\nTotal time (ms) : " << elapsed << "\nNodes searched : " << nodes
|
||||
<< "\nNodes/second : " << 1000 * nodes / elapsed << std::endl;
|
||||
}
|
||||
|
||||
void UCI::trace_eval(Position& pos) {
|
||||
|
||||
@@ -25,6 +25,7 @@
|
||||
#include <sstream>
|
||||
#include <utility>
|
||||
|
||||
#include "cluster.h"
|
||||
#include "misc.h"
|
||||
|
||||
namespace Stockfish {
|
||||
@@ -51,7 +52,7 @@ void OptionsMap::setoption(std::istringstream& is) {
|
||||
|
||||
if (options_map.count(name))
|
||||
options_map[name] = value;
|
||||
else
|
||||
else if (Cluster::is_root())
|
||||
sync_cout << "No such option: " << name << sync_endl;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user