Compare commits


55 Commits

Author SHA1 Message Date
Joost VandeVondele
b4ac3d6b96 Merge pull request #5161 from Disservin/cluster
Merge SF master in the cluster branch
2024-04-10 22:18:16 +02:00
Disservin
6c19bec86e Merge branch 'master' into cluster 2024-04-10 18:46:26 +02:00
Joost VandeVondele
8c4ac26c8e Merge pull request #4661 from vondele/clusterMergeMaster17
Merge SF 16 in the cluster branch
2023-07-05 12:11:22 +02:00
Joost VandeVondele
a2b24e0030 Merge branch 'sf16branch' into clusterMergeMaster17 2023-07-03 20:35:36 +02:00
Joost VandeVondele
66b4e7f080 Merge pull request #4349 from vondele/clusterMergeMaster16
Cluster: merge master up to SF15.1
2023-01-21 16:57:10 +01:00
Joost VandeVondele
04a0be956d Merge branch 'master' into clusterMergeMaster16 2022-12-04 16:43:04 +01:00
Joost VandeVondele
f327096cfb Merge branch 'master' into clusterMergeMaster15
fix small merge conflicts, lightly tested:
Score of cluster vs master: 1 - 0 - 59  [0.508] 60
Elo difference: 5.8 +/- 11.2, LOS: 84.1 %, DrawRatio: 98.3 %
2022-10-30 16:12:47 +01:00
Joost VandeVondele
e592dcb8e3 Merge branch 'master' into clusterMergeMaster14
closes https://github.com/official-stockfish/Stockfish/pull/3980
2022-04-16 08:31:47 +02:00
Joost VandeVondele
80eae02603 Merge pull request #3789 from vondele/fixNodeCount
[cluster] fix node count in bench command
2021-11-13 17:17:17 +01:00
Joost VandeVondele
0ad7de3182 [cluster] fix node count in bench command
Bugfix: use the cluster-wide node count in bench for reporting.

No function change
2021-11-13 17:14:27 +01:00
Joost VandeVondele
e0b4dc24c8 Merge pull request #3771 from vondele/clusterMergeMaster13
Cluster: merge SF 14.1
2021-10-31 21:38:23 +01:00
Joost VandeVondele
b79ec2e0b2 Merge branch 'master' into clusterMergeMaster13
brings the cluster branch to SF 14.1

Fixes minor conflicts

local testing cluster 4x4T vs master 4T, 10+0.1s, noob_3moves:

Score of cluster vs master: 6 - 0 - 94  [0.530] 100
Elo difference: 20.9 +/- 16.2, LOS: 99.3 %, DrawRatio: 94.0 %

No functional change
2021-10-31 16:17:24 +01:00
Stéphane Nicolet
43c887d367 Merge branch 'vondele-clusterMergeMaster12' into cluster 2021-03-17 11:22:02 +01:00
Joost VandeVondele
3a187b863b Merge branch 'master' into clusterMergeMaster12
fixes minor merge conflicts

looks good:

Score of cluster vs master: 15 - 1 - 134  [0.547] 150
Elo difference: 32.5 +/- 17.5, LOS: 100.0 %, DrawRatio: 89.3 %

for 4 threads against 4x4 threads at 10+0.1s
2021-03-16 20:49:19 +01:00
Joost VandeVondele
5fcd0e6f2a Merge branch 'master' into clusterMergeMaster11
fixes minor conflicts.
2020-09-17 19:17:37 +02:00
Joost VandeVondele
19129473f2 Merge NNUE (master) in the cluster branch.
fixes minor merge conflicts, and a first quick testing looks OK:

4mpix4th vs 4th at 30+0.3s:

Score of cluster vs master: 3 - 0 - 37  [0.537] 40
Elo difference: 26.1 +/- 28.5, LOS: 95.8 %, DrawRatio: 92.5 %

No functional change.
2020-08-19 23:30:37 +02:00
Joost VandeVondele
b706b91bb1 Update cluster branch to latest master
Fixes a few merge conflicts.

Verified equal bench for 1 rank, and expected performance master vs cluster with 2 ranks.

Score of cluster vs master: 196 - 54 - 400  [0.609] 650
Elo difference: 77.1 +/- 16.3, LOS: 100.0 %, DrawRatio: 61.5 %

No functional change.
2020-06-28 13:04:21 +02:00
Joost VandeVondele
8ec5faa46e Merge branch 'master' into clusterMergeMaster8 2020-01-18 08:26:43 +01:00
Joost VandeVondele
8a9d269855 Merge remote-tracking branch 'upstream/master' into clusterMergeMaster7 2019-10-20 09:20:34 +02:00
Joost VandeVondele
0b3c13107a Merge branch 'master' into clusterMergeMaster6 2019-07-11 15:26:46 +02:00
Joost VandeVondele
669074672c Some doc changes. No functional change. 2019-07-05 14:47:33 +02:00
Joost VandeVondele
0fd0e4e849 Merge branch 'master' into clusterMergeMaster6 2019-07-01 16:36:58 +02:00
Joost VandeVondele
85327828c9 Merge branch 'master' into clusterMergeMaster5 2019-05-01 08:23:22 +02:00
Joost VandeVondele
4cdb6386d8 Merge branch 'master' into clusterMergeMaster5 2019-03-21 07:13:00 +01:00
Joost VandeVondele
982880bd70 Merge remote-tracking branch 'upstream/master' into clusterMergeMaster4 2019-02-17 13:18:08 +01:00
Joost VandeVondele
bf17a410ec [Cluster] Use a sendrecv ring instead of allgather
Using point to point instead of a collective improves performance, and might be more flexible for future improvements.
Also corrects the condition for the number of elements required to fill the send buffer.

The actual Elo gain depends a bit on the setup used for testing.

8mpi x 32t yields 141 - 102 - 957 ~ 11 Elo
8mpi x 1t yields 70 +- 9 Elo.
2019-01-24 10:39:24 +01:00
Joost VandeVondele
5e7777e9d0 [Cluster] adds missing line
one-liner fixes a merge error, resulting in a garbage output line. No influence on play.
2019-01-17 08:06:25 +01:00
Joost VandeVondele
10a920d7d7 [cluster] Improve user documentation
- add cluster info line
- provides basic info on positions received/stored in a cluster run,
  useful to judge performance.
- document most cluster functionality in the readme.md

No functional change
2019-01-14 09:11:33 +01:00
Joost VandeVondele
21819b7bf8 Merge branch 'master' into clusterMergeMaster3 2019-01-09 21:52:30 +01:00
Joost VandeVondele
8c4338ae49 [Cluster] Param tweak.
Small tweak of parameters, yielding some Elo.

The cluster branch can now be considered to be in good shape. In local testing, it runs stable for >30k games. Performance benefits from an MPI implementation that is able to make asynchronous progress. The code should be run with 1 MPI rank per node, and threaded on the node.

Performance against master has now been measured. Master has been given 1 node with 32 cores/threads in standard SMP, the cluster branch has been given N=2..20 of those nodes, running the corresponding number of MPI processes, each with 32 threads. Time control has been 10s+0.1s, Hash 8MB/core, the book 8moves_v3.pgn, the number of games 400.

```
Score of cluster-2mpix32t vs master-32t: 96 - 27 - 277  [0.586] 400
Elo difference: 60.54 +/- 18.49

Score of cluster-3mpix32t vs master-32t: 101 - 18 - 281  [0.604] 400
Elo difference: 73.16 +/- 17.94

Score of cluster-4mpix32t vs master-32t: 126 - 18 - 256  [0.635] 400
Elo difference: 96.19 +/- 19.68

Score of cluster-5mpix32t vs master-32t: 110 - 5 - 285  [0.631] 400
Elo difference: 93.39 +/- 17.09

Score of cluster-6mpix32t vs master-32t: 117 - 9 - 274  [0.635] 400
Elo difference: 96.19 +/- 18.06

Score of cluster-7mpix32t vs master-32t: 142 - 10 - 248  [0.665] 400
Elo difference: 119.11 +/- 19.89

Score of cluster-8mpix32t vs master-32t: 125 - 14 - 261  [0.639] 400
Elo difference: 99.01 +/- 19.18

Score of cluster-9mpix32t vs master-32t: 137 - 7 - 256  [0.662] 400
Elo difference: 117.16 +/- 19.20

Score of cluster-10mpix32t vs master-32t: 145 - 8 - 247  [0.671] 400
Elo difference: 124.01 +/- 19.86

Score of cluster-16mpix32t vs master-32t: 153 - 6 - 241  [0.684] 400
Elo difference: 133.95 +/- 20.17

Score of cluster-20mpix32t vs master-32t: 134 - 8 - 258  [0.657] 400
Elo difference: 113.29 +/- 19.11
```

As the cluster parallelism is essentially lazyMPI, the nodes per second has been verified to scale perfectly to large node counts. Unfortunately, that is not necessarily indicative of playing strength. In the following 2min search from startPos, we reach about 4.8Gnps (128 nodes).

```
info depth 38 seldepth 51 multipv 1 score cp 53 nodes 576165794092 nps 4801341606 hashfull 1000 tbhits 0 time 120001 pv e2e4 c7c5 g1f3 d7d6 f1b5 c8d7 b5d7 d8d7 c2c4 b8c6 b1c3 g8f6 d2d4 d7g4 d4d5 c6d4 f3d4 g4d1 e1d1 c5d4 c3b5 a8c8 b2b3 a7a6 b5d4 f6e4 d1e2 g7g6 c1e3 f8g7 a1c1 e4c5 f2f3 f7f5 h1d1 e8g8 d4c2 c5d7 a2a4 a6a5 e3d4 f5f4 d4f2 f8f7 h2h3 d7c5
```
2019-01-06 15:38:31 +01:00
Joost VandeVondele
8a3f8e21ae [Cluster] Move IO to the root.
Fixes one TODO, by moving the IO related to bestmove to the root, even if this move is found by a different rank.

This is needed to make sure IO from different ranks is ordered properly. If this is not done it is possible that e.g. a bestmove arrives before all info lines have been received, leading to output that confuses tools and humans alike (see e.g. https://github.com/cutechess/cutechess/issues/472)
2019-01-04 14:56:04 +01:00
Joost VandeVondele
267ca781cd Always wait before posting the next call in _sync. 2019-01-02 11:16:24 +01:00
Joost VandeVondele
ac43bef5c5 [Cluster] Improve message passing part.
This rewrites in part the message-passing code, using an in-place gather, and collecting, rather than merging, the data of all threads.

neutral with a single thread per rank:
Score of new-2mpi-1t vs old-2mpi-1t: 789 - 787 - 2615  [0.500] 4191
Elo difference: 0.17 +/- 6.44

likely progress with multiple threads per rank:
Score of new-2mpi-36t vs old-2mpi-36t: 76 - 53 - 471  [0.519] 600
Elo difference: 13.32 +/- 12.85
2019-01-02 11:16:24 +01:00
Joost VandeVondele
7a32d26d5f [cluster] keep track of TB hits cluster-wide. 2018-12-29 15:34:57 +01:00
Joost VandeVondele
fb5c1f5bf5 Fix comment 2018-12-29 15:34:57 +01:00
Joost VandeVondele
87f0fa55a0 [cluster] keep track of node counts cluster-wide.
This generalizes exchange of signals between the ranks using a non-blocking all-reduce. It is now used for the stop signal and the node count, but should be easily generalizable (TB hits, and ponder still missing). It avoids having long-lived outstanding non-blocking collectives (removes an early posted Ibarrier). A bit too short a test, but not worse than before:

Score of new-r4-1t vs old-r4-1t: 459 - 401 - 1505  [0.512] 2365
Elo difference: 8.52 +/- 8.43
2018-12-29 15:34:57 +01:00
Joost VandeVondele
2f882309d5 fixup 2018-12-29 15:34:57 +01:00
Joost VandeVondele
86953b9392 [cluster] Fix non-mpi compile
Fix compilation of the cluster branch in the non-MPI case.

Add a TODO as a reminder for the new voting scheme.

No functional changes
2018-12-29 15:34:56 +01:00
Joost VandeVondele
ba1c639836 [cluster] fill sendbuffer better
use a counter to track available elements.

Some elo gain, on 4 ranks:

Score of old-r4-1t vs new-r4-1t: 422 - 508 - 1694  [0.484] 2624
Elo difference: -11.39 +/- 7.90
2018-12-29 15:34:56 +01:00
Joost VandeVondele
e526c5aa52 [cluster] Make bench compatible
Fix one TODO.

Takes care of output from bench.
Sum nodes over ranks.
2018-12-29 15:34:56 +01:00
Joost VandeVondele
9cd2c817db Add one more TODO 2018-12-29 15:34:56 +01:00
Joost VandeVondele
54a0a228f6 [cluster] Some formatting cleanup
Standardize whitespace a bit.
Also adds two TODOs for follow up work.

No functional change.
2018-12-29 15:34:56 +01:00
Joost VandeVondele
1cd2c7861a [cluster] avoid creating MPI data type.
There is no need to create an MPI data type for the send buffer; this is simpler and faster.

No functional change
2018-12-29 15:34:56 +01:00
Joost VandeVondele
7af3f4da7a [cluster] Avoid TT saving our own TT entries.
Avoid saving to the TT the part of the receive buffer that actually originates from the same rank.

Now, on 1 mpi rank, we have the same bench as the non-mpi code on 1 thread.
2018-12-29 15:34:56 +01:00
Joost VandeVondele
271181bb31 [cluster] Add depth condition to cluster TT saves.
Since the logic for saving moves in the send buffer and the associated rehashing is expensive, only do it for TT stores of sufficient depth.

quite some gain in local testing with 4 ranks against the previous version.
Elo difference: 288.84 +/- 21.98

This starts to make the branch useful, but for on-node runs a difference to the standard threading remains.
2018-12-29 15:34:56 +01:00
noobpwnftw
66b2c6b9f1 Implement best move voting system for cluster
This implements the cluster version of d96c1c32a2
2018-12-29 15:34:56 +01:00
Joost VandeVondele
2559c20c6e [cluster] Fix oversight in TT key reuse
In the original code, the position key stored in the TT is used to probe & store TT entries after message passing. Since we only store part of the key bits in the TT, this leads to incorrect rehashing. This patch fixes this by also storing the full key in the send buffers, and using it for hashing after message arrival.

Short testing with 4 ranks (old vs new) shows this is effective:
Score of mpiold vs mpinew: 84 - 275 - 265  [0.347] 624
Elo difference: -109.87 +/- 20.88
2018-12-29 15:34:55 +01:00
Joost VandeVondele
2659c407c4 Fix segfault.
the wrong data type was passed to an MPI call, leading to occasional segfaults. This patch fixes this.

No functional change.
2018-12-29 15:34:55 +01:00
noobpwnftw
3730ae1efb Small simplifications and code cleanup
Non-functional simplifications.
2018-12-29 15:34:55 +01:00
noobpwnftw
0d6cdc0c6d Implement yielding loop while waiting for input
Some MPI implementations use busy-wait polling, which turns MPI_Bcast into a busy-wait loop; work around this with our own yielding loop.
2018-12-29 15:34:55 +01:00
noobpwnftw
80afeb0d3b Fix consistency between PV and bestmove output
In case that a non-root mainThread on a node is the new best thread in the cluster, it should always output its PV.
2018-12-29 15:34:55 +01:00
noobpwnftw
2405b38165 Fix search result aggregation
This reverts my earlier change that only the root node gets to output the best move, after fixing a problem with MPI_Allreduce and our custom operator (BestMoveOp). This operator is not commutative, and we must ensure that its output is consistent among all nodes.
2018-12-29 15:34:55 +01:00
noobpwnftw
8a95d269eb Implement proper stop signalling from root node
Previous behavior was to wait on all nodes to finish their search on their own TM and aggregate to root node via a blocking MPI_Allreduce call. This seems to be problematic.

In this commit a proper non-blocking signalling barrier was implemented to use TM from root node to control the cluster search, and disable TM on all non-root nodes.

Also includes some cosmetic fix to the nodes/NPS display.
2018-12-29 15:34:55 +01:00
noobpwnftw
3b7b632aa5 Fix a bug of outputting multiple lines of bestmove 2018-12-29 15:34:55 +01:00
Omri Mor
29c166a072 MPI/Cluster implementation for Stockfish
Based on Peter Österlund's "Lazy Cluster" algorithm,
but with some simplifications.
To compile, point COMPCXX to the MPI C++ compiler wrapper (mpicxx).
2018-12-29 15:34:55 +01:00
16 changed files with 859 additions and 81 deletions


@@ -59,6 +59,33 @@ This distribution of Stockfish consists of the following files:
* a file with the .nnue extension, storing the neural network for the NNUE
evaluation. Binary distributions will have this file embedded.
## Stockfish on distributed memory systems
The cluster branch allows for running Stockfish on a cluster of servers (nodes)
that are connected with a high-speed and low-latency network, using the message
passing interface (MPI). In this case, one MPI process should be run per node,
and UCI options can be used to set the number of threads/hash per node as usual.
Typically, the engine will be invoked as
```
mpirun -np N /path/to/stockfish
```
where ```N``` stands for the number of MPI processes used (alternatives to ```mpirun```
include ```mpiexec``` and ```srun```). Use 1 MPI rank per node, and employ threading
according to the cores per node. To build the cluster
branch, it is sufficient to specify ```COMPCXX=mpicxx``` (or e.g. CC depending on the name
of the compiler providing MPI support) on the make command line, and do a clean build:
```
make -j ARCH=x86-64-modern clean build COMPCXX=mpicxx mpi=yes
```
Make sure that the MPI installation is configured to support ```MPI_THREAD_MULTIPLE```;
this might require adding system-specific compiler options to the Makefile. Stockfish employs
non-blocking (asynchronous) communication, and benefits from an MPI
implementation that efficiently supports this. Some MPI implementations might benefit
from leaving 1 core/thread free for these asynchronous communications, and might require
setting additional environment variables. ```mpirun``` should forward stdin/stdout
to ```rank 0``` only (e.g. ```srun --input=0 --output=0```).
Refer to your MPI documentation for more info.
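As a concrete launch example under SLURM (node and thread counts are hypothetical; option names are standard SLURM, but verify them against your site's documentation):

```shell
# 4 nodes, 1 MPI rank per node, 32 threads per rank;
# forward stdin/stdout to rank 0 only
srun -N 4 --ntasks-per-node=1 --cpus-per-task=32 \
    --input=0 --output=0 /path/to/stockfish
# then set "setoption name Threads value 32" via UCI as usual
```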
## Contributing
__See [Contributing Guide](CONTRIBUTING.md).__


@@ -53,7 +53,7 @@ PGOBENCH = $(WINE_PATH) ./$(EXE) bench
### Source and object files
SRCS = benchmark.cpp bitboard.cpp evaluate.cpp main.cpp \
misc.cpp movegen.cpp movepick.cpp position.cpp \
misc.cpp movegen.cpp movepick.cpp position.cpp cluster.cpp \
search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \
nnue/nnue_misc.cpp nnue/features/half_ka_v2_hm.cpp nnue/network.cpp
@@ -63,7 +63,7 @@ HEADERS = benchmark.h bitboard.h evaluate.h misc.h movegen.h movepick.h \
nnue/layers/sqr_clipped_relu.h nnue/nnue_accumulator.h nnue/nnue_architecture.h \
nnue/nnue_common.h nnue/nnue_feature_transformer.h position.h \
search.h syzygy/tbprobe.h thread.h thread_win32_osx.h timeman.h \
tt.h tune.h types.h uci.h ucioption.h perft.h nnue/network.h
tt.h tune.h types.h uci.h ucioption.h perft.h nnue/network.h cluster.h
OBJS = $(notdir $(SRCS:.cpp=.o))
@@ -100,6 +100,7 @@ VPATH = syzygy:nnue:nnue/features
# vnni512 = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 512
# neon = yes/no --- -DUSE_NEON --- Use ARM SIMD architecture
# dotprod = yes/no --- -DUSE_NEON_DOTPROD --- Use ARM advanced SIMD Int8 dot product instructions
# mpi = yes/no --- -DUSE_MPI --- Use Message Passing Interface
#
# Note that Makefile is space sensitive, so when adding new architectures
# or modifying existing flags, you have to make sure there are no extra spaces
@@ -149,6 +150,7 @@ avx512 = no
vnni256 = no
vnni512 = no
neon = no
mpi = no
dotprod = no
arm_version = 0
STRIP = strip
@@ -791,6 +793,15 @@ ifeq ($(OS), Android)
LDFLAGS += -fPIE -pie
endif
### 3.10 MPI
ifneq (,$(findstring mpi, $(CXX)))
mpi = yes
endif
ifeq ($(mpi),yes)
CXXFLAGS += -DUSE_MPI -Wno-cast-qual -fexceptions
DEPENDFLAGS += -DUSE_MPI
endif
### ==========================================================================
### Section 4. Public Targets
### ==========================================================================
@@ -1013,6 +1024,7 @@ config-sanity: net
@echo "vnni256: '$(vnni256)'"
@echo "vnni512: '$(vnni512)'"
@echo "neon: '$(neon)'"
@echo "mpi: '$(mpi)'"
@echo "dotprod: '$(dotprod)'"
@echo "arm_version: '$(arm_version)'"
@echo "target_windows: '$(target_windows)'"

src/cluster.cpp (new file, 480 lines)

@@ -0,0 +1,480 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Stockfish is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifdef USE_MPI
#include <array>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <istream>
#include <map>
#include <mpi.h>
#include <string>
#include <vector>
#include "cluster.h"
#include "thread.h"
#include "timeman.h"
#include "tt.h"
#include "search.h"
namespace Stockfish {
namespace Cluster {
// Total number of ranks and rank within the communicator
static int world_rank = MPI_PROC_NULL;
static int world_size = 0;
// Signals between ranks exchange basic info using a dedicated communicator
static MPI_Comm signalsComm = MPI_COMM_NULL;
static MPI_Request reqSignals = MPI_REQUEST_NULL;
static uint64_t signalsCallCounter = 0;
// Signals are the number of nodes searched, stop, table base hits, transposition table saves
enum Signals : int {
SIG_NODES = 0,
SIG_STOP = 1,
SIG_TB = 2,
SIG_TTS = 3,
SIG_NB = 4
};
static uint64_t signalsSend[SIG_NB] = {};
static uint64_t signalsRecv[SIG_NB] = {};
static uint64_t nodesSearchedOthers = 0;
static uint64_t tbHitsOthers = 0;
static uint64_t TTsavesOthers = 0;
static uint64_t stopSignalsPosted = 0;
// The UCI threads of each rank exchange input using a dedicated communicator
static MPI_Comm InputComm = MPI_COMM_NULL;
// bestMove requires MoveInfo communicators and data types
static MPI_Comm MoveComm = MPI_COMM_NULL;
static MPI_Datatype MIDatatype = MPI_DATATYPE_NULL;
// TT entries are communicated with a dedicated communicator.
// The receive buffer is used to gather information from all ranks.
// The TTCacheCounter tracks the number of local elements that are ready to be sent.
static MPI_Comm TTComm = MPI_COMM_NULL;
static std::array<std::vector<KeyedTTEntry>, 2> TTSendRecvBuffs;
static std::array<MPI_Request, 2> reqsTTSendRecv = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
static uint64_t sendRecvPosted = 0;
static std::atomic<uint64_t> TTCacheCounter = {};
/// Initialize MPI and associated data types. Note that the MPI library must be configured
/// to support MPI_THREAD_MULTIPLE, since multiple threads access MPI simultaneously.
void init() {
int thread_support;
MPI_Init_thread(nullptr, nullptr, MPI_THREAD_MULTIPLE, &thread_support);
if (thread_support < MPI_THREAD_MULTIPLE)
{
std::cerr << "Stockfish requires support for MPI_THREAD_MULTIPLE." << std::endl;
std::exit(EXIT_FAILURE);
}
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
const std::array<MPI_Aint, 5> MIdisps = {offsetof(MoveInfo, move), offsetof(MoveInfo, ponder),
offsetof(MoveInfo, depth), offsetof(MoveInfo, score),
offsetof(MoveInfo, rank)};
MPI_Type_create_hindexed_block(5, 1, MIdisps.data(), MPI_INT, &MIDatatype);
MPI_Type_commit(&MIDatatype);
MPI_Comm_dup(MPI_COMM_WORLD, &InputComm);
MPI_Comm_dup(MPI_COMM_WORLD, &TTComm);
MPI_Comm_dup(MPI_COMM_WORLD, &MoveComm);
MPI_Comm_dup(MPI_COMM_WORLD, &signalsComm);
}
/// Finalize MPI and free the associated data types.
void finalize() {
MPI_Type_free(&MIDatatype);
MPI_Comm_free(&InputComm);
MPI_Comm_free(&TTComm);
MPI_Comm_free(&MoveComm);
MPI_Comm_free(&signalsComm);
MPI_Finalize();
}
/// Return the total number of ranks
int size() { return world_size; }
/// Return the rank (index) of the process
int rank() { return world_rank; }
/// The receive buffer depends on the number of MPI ranks and threads, resize as needed
void ttSendRecvBuff_resize(size_t nThreads) {
for (int i : {0, 1})
{
TTSendRecvBuffs[i].resize(TTCacheSize * world_size * nThreads);
std::fill(TTSendRecvBuffs[i].begin(), TTSendRecvBuffs[i].end(), KeyedTTEntry());
}
}
/// As input is only received by the root (rank 0) of the cluster, this input must be relayed
/// to the UCI threads of all ranks, in order to set up the position, etc. We do this with a
/// dedicated getline implementation, where the root broadcasts to all other ranks the received
/// information.
bool getline(std::istream& input, std::string& str) {
int size;
std::vector<char> vec;
int state;
if (is_root())
{
state = static_cast<bool>(std::getline(input, str));
vec.assign(str.begin(), str.end());
size = vec.size();
}
// Some MPI implementations use busy-wait polling, while we need yielding as otherwise
// the UCI thread on the non-root ranks would be consuming resources.
static MPI_Request reqInput = MPI_REQUEST_NULL;
MPI_Ibcast(&size, 1, MPI_INT, 0, InputComm, &reqInput);
if (is_root())
MPI_Wait(&reqInput, MPI_STATUS_IGNORE);
else
{
while (true)
{
int flag;
MPI_Test(&reqInput, &flag, MPI_STATUS_IGNORE);
if (flag)
break;
else
std::this_thread::sleep_for(std::chrono::milliseconds(10));
}
}
// Broadcast received string
if (!is_root())
vec.resize(size);
MPI_Bcast(vec.data(), size, MPI_CHAR, 0, InputComm);
if (!is_root())
str.assign(vec.begin(), vec.end());
MPI_Bcast(&state, 1, MPI_INT, 0, InputComm);
return state;
}
/// Sending part of the signal communication loop
namespace {
void signals_send(const ThreadPool& threads) {
signalsSend[SIG_NODES] = threads.nodes_searched();
signalsSend[SIG_TB] = threads.tb_hits();
signalsSend[SIG_TTS] = threads.TT_saves();
signalsSend[SIG_STOP] = threads.stop;
MPI_Iallreduce(signalsSend, signalsRecv, SIG_NB, MPI_UINT64_T, MPI_SUM, signalsComm,
&reqSignals);
++signalsCallCounter;
}
/// Processing part of the signal communication loop.
/// For some counters (e.g. nodes) we only keep their sum over the other ranks,
/// allowing local counters to be added at any time for more fine-grained progress,
/// which is useful to indicate progress during early iterations, and to have
/// node counts that exactly match the non-MPI code in the single-rank case.
/// This call also propagates the stop signal between ranks.
void signals_process(ThreadPool& threads) {
nodesSearchedOthers = signalsRecv[SIG_NODES] - signalsSend[SIG_NODES];
tbHitsOthers = signalsRecv[SIG_TB] - signalsSend[SIG_TB];
TTsavesOthers = signalsRecv[SIG_TTS] - signalsSend[SIG_TTS];
stopSignalsPosted = signalsRecv[SIG_STOP];
if (signalsRecv[SIG_STOP] > 0)
threads.stop = true;
}
void sendrecv_post() {
++sendRecvPosted;
MPI_Irecv(TTSendRecvBuffs[sendRecvPosted % 2].data(),
TTSendRecvBuffs[sendRecvPosted % 2].size() * sizeof(KeyedTTEntry), MPI_BYTE,
(rank() + size() - 1) % size(), 42, TTComm, &reqsTTSendRecv[0]);
MPI_Isend(TTSendRecvBuffs[(sendRecvPosted + 1) % 2].data(),
TTSendRecvBuffs[(sendRecvPosted + 1) % 2].size() * sizeof(KeyedTTEntry), MPI_BYTE,
(rank() + 1) % size(), 42, TTComm, &reqsTTSendRecv[1]);
}
}
/// During search, most message passing is asynchronous, but at the end of
/// search it makes sense to bring them to a common, finalized state.
void signals_sync(ThreadPool& threads) {
while (stopSignalsPosted < uint64_t(size()))
signals_poll(threads);
// Finalize outstanding messages of the signal loops.
// We might have issued one call less than needed on some ranks.
uint64_t globalCounter;
MPI_Allreduce(&signalsCallCounter, &globalCounter, 1, MPI_UINT64_T, MPI_MAX, MoveComm);
if (signalsCallCounter < globalCounter)
{
MPI_Wait(&reqSignals, MPI_STATUS_IGNORE);
signals_send(threads);
}
assert(signalsCallCounter == globalCounter);
MPI_Wait(&reqSignals, MPI_STATUS_IGNORE);
signals_process(threads);
// Finalize outstanding messages in the sendRecv loop
MPI_Allreduce(&sendRecvPosted, &globalCounter, 1, MPI_UINT64_T, MPI_MAX, MoveComm);
while (sendRecvPosted < globalCounter)
{
MPI_Waitall(reqsTTSendRecv.size(), reqsTTSendRecv.data(), MPI_STATUSES_IGNORE);
sendrecv_post();
}
assert(sendRecvPosted == globalCounter);
MPI_Waitall(reqsTTSendRecv.size(), reqsTTSendRecv.data(), MPI_STATUSES_IGNORE);
}
/// Initialize signal counters to zero.
void signals_init() {
stopSignalsPosted = tbHitsOthers = TTsavesOthers = nodesSearchedOthers = 0;
signalsSend[SIG_NODES] = signalsRecv[SIG_NODES] = 0;
signalsSend[SIG_TB] = signalsRecv[SIG_TB] = 0;
signalsSend[SIG_TTS] = signalsRecv[SIG_TTS] = 0;
signalsSend[SIG_STOP] = signalsRecv[SIG_STOP] = 0;
}
/// Poll the signal loop, and start next round as needed.
void signals_poll(ThreadPool& threads) {
int flag;
MPI_Test(&reqSignals, &flag, MPI_STATUS_IGNORE);
if (flag)
{
signals_process(threads);
signals_send(threads);
}
}
/// Provide basic info related to cluster performance: in particular, the number of signals sent,
/// signals per second (sps), the number of gathers, the number of positions gathered (per node and per second, gpps),
/// and the number of TT saves and TT saves per second. If gpps approximately equals TTSavesps, the gather loop has enough bandwidth.
void cluster_info(const ThreadPool& threads, Depth depth, TimePoint elapsed) {
// TimePoint elapsed = Time.elapsed() + 1;
uint64_t TTSaves = TT_saves(threads);
sync_cout << "info depth " << depth << " cluster "
<< " signals " << signalsCallCounter << " sps " << signalsCallCounter * 1000 / elapsed
<< " sendRecvs " << sendRecvPosted << " srpps "
<< TTSendRecvBuffs[0].size() * sendRecvPosted * 1000 / elapsed << " TTSaves "
<< TTSaves << " TTSavesps " << TTSaves * 1000 / elapsed << sync_endl;
}
/// When a TT entry is saved, additional steps are taken if the entry is of sufficient depth.
/// If sufficient entries have been collected, a communication is initiated.
/// If a communication has been completed, the received results are saved to the TT.
void save(TranspositionTable& TT,
ThreadPool& threads,
Search::Worker* thread,
TTEntry* tte,
Key k,
Value v,
bool PvHit,
Bound b,
Depth d,
Move m,
Value ev,
uint8_t generation8) {
// Standard save to the TT
tte->save(k, v, PvHit, b, d, m, ev, generation8);
// If the entry is of sufficient depth to be worth communicating, take action.
if (d > 3)
{
// Count the TT saves for information: this number should be relatively similar
// to the number of entries we can send/recv.
thread->TTsaves.fetch_add(1, std::memory_order_relaxed);
// Add to thread's send buffer, the locking here avoids races when the master thread
// prepares the send buffer.
{
std::lock_guard<std::mutex> lk(thread->ttCache.mutex);
thread->ttCache.buffer.replace(KeyedTTEntry(k, *tte));
++TTCacheCounter;
}
size_t recvBuffPerRankSize = threads.size() * TTCacheSize;
// Communicate on the main search thread, as soon as the threads combined have collected
// sufficient data to fill the send buffers.
if (thread == threads.main_thread()->worker.get() && TTCacheCounter > recvBuffPerRankSize)
{
// Test communication status
int flag;
MPI_Testall(reqsTTSendRecv.size(), reqsTTSendRecv.data(), &flag, MPI_STATUSES_IGNORE);
// Current communication is complete
if (flag)
{
// Save all received entries to TT, and store our TTCaches, ready for the next round of communication
for (size_t irank = 0; irank < size_t(size()); ++irank)
{
if (irank
== size_t(
rank())) // this is our part, fill the part of the buffer for sending
{
// Copy from the thread caches to the right spot in the buffer
size_t i = irank * recvBuffPerRankSize;
for (auto&& th : threads)
{
std::lock_guard<std::mutex> lk(th->worker->ttCache.mutex);
for (auto&& e : th->worker->ttCache.buffer)
TTSendRecvBuffs[sendRecvPosted % 2][i++] = e;
// Reset thread's send buffer
th->worker->ttCache.buffer = {};
}
TTCacheCounter = 0;
}
else // process data received from the corresponding rank.
for (size_t i = irank * recvBuffPerRankSize;
i < (irank + 1) * recvBuffPerRankSize; ++i)
{
auto&& e = TTSendRecvBuffs[sendRecvPosted % 2][i];
bool found;
TTEntry* replace_tte;
replace_tte = TT.probe(e.first, found);
replace_tte->save(e.first, e.second.value(), e.second.is_pv(),
e.second.bound(), e.second.depth(), e.second.move(),
e.second.eval(), TT.generation());
}
}
// Start next communication
sendrecv_post();
// Force check of time on the next occasion, the above actions might have taken some time.
thread->main_manager()->callsCnt = 0;
}
}
}
}
/// Picks the bestMove across ranks, and sends the associated info and PV to the root of the cluster.
/// Note that this bestMove and PV must be output by the root, to guarantee proper ordering of output.
/// TODO update to the scheme in master.. can this use aggregation of votes?
void pick_moves(MoveInfo& mi, std::string& PVLine) {
MoveInfo* pMoveInfo = NULL;
if (is_root())
{
pMoveInfo = (MoveInfo*) malloc(sizeof(MoveInfo) * size());
}
MPI_Gather(&mi, 1, MIDatatype, pMoveInfo, 1, MIDatatype, 0, MoveComm);
if (is_root())
{
std::map<int, int> votes;
int minScore = pMoveInfo[0].score;
for (int i = 0; i < size(); ++i)
{
minScore = std::min(minScore, pMoveInfo[i].score);
votes[pMoveInfo[i].move] = 0;
}
for (int i = 0; i < size(); ++i)
{
votes[pMoveInfo[i].move] += pMoveInfo[i].score - minScore + pMoveInfo[i].depth;
}
int bestVote = votes[pMoveInfo[0].move];
for (int i = 0; i < size(); ++i)
{
if (votes[pMoveInfo[i].move] > bestVote)
{
bestVote = votes[pMoveInfo[i].move];
mi = pMoveInfo[i];
}
}
free(pMoveInfo);
}
// Send around the final result
MPI_Bcast(&mi, 1, MIDatatype, 0, MoveComm);
// Send PV line to root as needed
if (mi.rank != 0 && mi.rank == rank())
{
int size;
std::vector<char> vec;
vec.assign(PVLine.begin(), PVLine.end());
size = vec.size();
MPI_Send(&size, 1, MPI_INT, 0, 42, MoveComm);
MPI_Send(vec.data(), size, MPI_CHAR, 0, 42, MoveComm);
}
if (mi.rank != 0 && is_root())
{
int size;
std::vector<char> vec;
MPI_Recv(&size, 1, MPI_INT, mi.rank, 42, MoveComm, MPI_STATUS_IGNORE);
vec.resize(size);
MPI_Recv(vec.data(), size, MPI_CHAR, mi.rank, 42, MoveComm, MPI_STATUS_IGNORE);
PVLine.assign(vec.begin(), vec.end());
}
}
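The root-side voting above can be sketched as a standalone function. This is a minimal sketch, not the engine's code: `MoveInfo` here is a local stand-in mirroring `Cluster::MoveInfo`, and the `MPI_Gather`/`MPI_Bcast` plumbing is omitted. Each rank's move receives a vote weighted by `(score - minScore + depth)`, and the entry whose move accumulates the largest total wins.

```cpp
#include <algorithm>
#include <map>
#include <vector>

// Local stand-in mirroring Cluster::MoveInfo (illustrative only).
struct MoveInfo {
    int move, ponder, depth, score, rank;
};

// Sketch of the root-side voting in pick_moves(): accumulate a weighted
// vote per move, then return the gathered entry whose move won.
MoveInfo vote_best(const std::vector<MoveInfo>& infos) {
    int minScore = infos[0].score;
    for (const MoveInfo& mi : infos)
        minScore = std::min(minScore, mi.score);

    std::map<int, int> votes;
    for (const MoveInfo& mi : infos)
        votes[mi.move] += mi.score - minScore + mi.depth;

    MoveInfo best     = infos[0];
    int      bestVote = votes[infos[0].move];
    for (const MoveInfo& mi : infos)
        if (votes[mi.move] > bestVote)
        {
            bestVote = votes[mi.move];
            best     = mi;
        }
    return best;
}
```

Note that, as in the real code, a move proposed by several ranks pools its votes, so agreement between ranks can outweigh a single rank's deeper search.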
/// Return nodes searched (lazily updated cluster-wide in the signal loop)
uint64_t nodes_searched(const ThreadPool& threads) {
return nodesSearchedOthers + threads.nodes_searched();
}
/// Return tablebase hits (lazily updated cluster-wide in the signal loop)
uint64_t tb_hits(const ThreadPool& threads) { return tbHitsOthers + threads.tb_hits(); }
/// Return the number of saves to the TT buffers (lazily updated cluster-wide in the signal loop)
uint64_t TT_saves(const ThreadPool& threads) { return TTsavesOthers + threads.TT_saves(); }
}
}
#else
#include "cluster.h"
#include "thread.h"
namespace Stockfish {
namespace Cluster {
uint64_t nodes_searched(const ThreadPool& threads) { return threads.nodes_searched(); }
uint64_t tb_hits(const ThreadPool& threads) { return threads.tb_hits(); }
uint64_t TT_saves(const ThreadPool& threads) { return threads.TT_saves(); }
}
}
#endif // USE_MPI

157
src/cluster.h Normal file

@@ -0,0 +1,157 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Stockfish is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef CLUSTER_H_INCLUDED
#define CLUSTER_H_INCLUDED
#include <algorithm>
#include <array>
#include <istream>
#include <string>
#include "tt.h"
namespace Stockfish {
class Thread;
class ThreadPool;
namespace Search {
class Worker;
}
/// The Cluster namespace contains functionality required to run on distributed
/// memory architectures using MPI as the message passing interface. On a high level,
/// a 'lazy SMP'-like scheme is implemented where TT saves of sufficient depth are
/// collected on each rank and distributed to, and used by, all other ranks,
/// which search essentially independently. The root (MPI rank 0) of the cluster
/// is responsible for all I/O and time management, communicating this info to
/// the other ranks as needed. UCI options such as Threads and Hash specify these
/// quantities per MPI rank. It is recommended to have one rank (MPI process) per node.
/// For the non-MPI case, wrappers that will be compiler-optimized away are provided.
namespace Cluster {
/// Basic info to find the cluster-wide bestMove
struct MoveInfo {
int move;
int ponder;
int depth;
int score;
int rank;
};
#ifdef USE_MPI
// Store the TTEntry together with its full key, so it can be saved on the receiver side
using KeyedTTEntry = std::pair<Key, TTEntry>;
constexpr std::size_t TTCacheSize = 16;
// Threads locally cache their high-depth TT entries until a batch can be sent via MPI
template<std::size_t N>
class TTCache: public std::array<KeyedTTEntry, N> {
struct Compare {
inline bool operator()(const KeyedTTEntry& lhs, const KeyedTTEntry& rhs) {
return lhs.second.depth() > rhs.second.depth();
}
};
Compare compare;
public:
// Keep a heap of entries, replacing low-depth entries with high-depth ones
bool replace(const KeyedTTEntry& value) {
if (compare(value, this->front()))
{
std::pop_heap(this->begin(), this->end(), compare);
this->back() = value;
std::push_heap(this->begin(), this->end(), compare);
return true;
}
return false;
}
};
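The replacement policy above can be sketched with a toy entry type. This is an illustrative sketch: `Entry` and `DepthCache` are hypothetical names, and the real cache stores full `KeyedTTEntry` pairs. The comparator orders greater depth first, so with `std::push_heap`/`std::pop_heap` the heap front is always the shallowest cached entry, and a new entry is admitted only if it is deeper than that front.

```cpp
#include <algorithm>
#include <array>
#include <cstddef>

// Toy stand-in for KeyedTTEntry: only depth drives the replacement policy.
struct Entry {
    unsigned long long key;
    int                depth;
};

// Fixed-size cache keeping the N deepest entries seen so far.
template<std::size_t N>
class DepthCache: public std::array<Entry, N> {
    struct Compare {
        bool operator()(const Entry& a, const Entry& b) const { return a.depth > b.depth; }
    };
    Compare compare;

   public:
    // Equal elements trivially satisfy the heap property, so no make_heap needed.
    DepthCache() { this->fill(Entry{0, 0}); }

    bool replace(const Entry& e) {
        if (compare(e, this->front()))  // deeper than the shallowest cached entry?
        {
            std::pop_heap(this->begin(), this->end(), compare);
            this->back() = e;
            std::push_heap(this->begin(), this->end(), compare);
            return true;
        }
        return false;
    }
};
```

Because each `replace()` is O(log N) on a fixed-size array, threads can filter TT saves cheaply before the batch is shipped over MPI.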
void init();
void finalize();
bool getline(std::istream& input, std::string& str);
int size();
int rank();
inline bool is_root() { return rank() == 0; }
void save(TranspositionTable&,
ThreadPool&,
Search::Worker* thread,
TTEntry* tte,
Key k,
Value v,
bool PvHit,
Bound b,
Depth d,
Move m,
Value ev,
uint8_t generation8);
void pick_moves(MoveInfo& mi, std::string& PVLine);
void ttSendRecvBuff_resize(size_t nThreads);
uint64_t nodes_searched(const ThreadPool&);
uint64_t tb_hits(const ThreadPool&);
uint64_t TT_saves(const ThreadPool&);
void cluster_info(const ThreadPool&, Depth depth, TimePoint elapsed);
void signals_init();
void signals_poll(ThreadPool& threads);
void signals_sync(ThreadPool& threads);
#else
inline void init() {}
inline void finalize() {}
inline bool getline(std::istream& input, std::string& str) {
return static_cast<bool>(std::getline(input, str));
}
constexpr int size() { return 1; }
constexpr int rank() { return 0; }
constexpr bool is_root() { return true; }
inline void save(TranspositionTable&,
ThreadPool&,
Search::Worker*,
TTEntry* tte,
Key k,
Value v,
bool PvHit,
Bound b,
Depth d,
Move m,
Value ev,
uint8_t generation8) {
tte->save(k, v, PvHit, b, d, m, ev, generation8);
}
inline void pick_moves(MoveInfo&, std::string&) {}
inline void ttSendRecvBuff_resize(size_t) {}
uint64_t nodes_searched(const ThreadPool&);
uint64_t tb_hits(const ThreadPool&);
uint64_t TT_saves(const ThreadPool&);
inline void cluster_info(const ThreadPool&, Depth, TimePoint) {}
inline void signals_init() {}
inline void signals_poll(ThreadPool&) {}
inline void signals_sync(ThreadPool&) {}
#endif /* USE_MPI */
}
}
#endif // #ifndef CLUSTER_H_INCLUDED
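The non-MPI branch above works because `size()`, `rank()` and `is_root()` collapse to `constexpr` constants, so guards at call sites are optimized away and rank-0-only paths run unconditionally in a single-process build. A minimal sketch of the pattern (the `engine_greeting` helper is hypothetical, for illustration only):

```cpp
#include <string>

// Single-process fallback: the same interface as the MPI build,
// but everything is a compile-time constant.
namespace Cluster {
constexpr int  size() { return 1; }
constexpr int  rank() { return 0; }
constexpr bool is_root() { return rank() == 0; }
}

// A call site looks identical with and without MPI: output is simply
// gated on is_root(), which the compiler folds to `true` here.
std::string engine_greeting() {
    return Cluster::is_root() ? "id name Example Engine" : "";
}
```

This is why the diff below can sprinkle `if (Cluster::is_root())` through `main()`, the UCI loop and the search without any `#ifdef` at those call sites.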


@@ -29,7 +29,9 @@ using namespace Stockfish;
int main(int argc, char* argv[]) {
std::cout << engine_info() << std::endl;
Cluster::init();
if (Cluster::is_root())
std::cout << engine_info() << std::endl;
Bitboards::init();
Position::init();
@@ -40,5 +42,7 @@ int main(int argc, char* argv[]) {
uci.loop();
Cluster::finalize();
return 0;
}


@@ -27,6 +27,7 @@
#include <type_traits>
#include <vector>
#include "../cluster.h"
#include "../evaluate.h"
#include "../incbin/incbin.h"
#include "../misc.h"
@@ -250,7 +251,8 @@ void Network<Arch, Transformer>::verify(std::string evalfilePath) const {
exit(EXIT_FAILURE);
}
sync_cout << "info string NNUE evaluation using " << evalfilePath << sync_endl;
if (Cluster::is_root())
sync_cout << "info string NNUE evaluation using " << evalfilePath << sync_endl;
}


@@ -21,6 +21,7 @@
#include <cstdint>
#include "cluster.h"
#include "movegen.h"
#include "position.h"
#include "types.h"
@@ -50,7 +51,7 @@ uint64_t perft(Position& pos, Depth depth) {
nodes += cnt;
pos.undo_move(m);
}
if (Root)
if (Root && Cluster::is_root())
sync_cout << UCI::move(m, pos.is_chess960()) << ": " << cnt << sync_endl;
}
return nodes;
@@ -62,7 +63,8 @@ inline void perft(const std::string& fen, Depth depth, bool isChess960) {
p.set(fen, isChess960, &states->back());
uint64_t nodes = perft<true>(p, depth);
sync_cout << "\nNodes searched: " << nodes << "\n" << sync_endl;
if (Cluster::is_root())
sync_cout << "\nNodes searched: " << nodes << "\n" << sync_endl;
}
}


@@ -30,6 +30,7 @@
#include <sstream>
#include <utility>
#include "cluster.h"
#include "evaluate.h"
#include "misc.h"
#include "movegen.h"
@@ -157,9 +158,10 @@ void Search::Worker::start_searching() {
if (rootMoves.empty())
{
rootMoves.emplace_back(Move::none());
sync_cout << "info depth 0 score "
<< UCI::to_score(rootPos.checkers() ? -VALUE_MATE : VALUE_DRAW, rootPos)
<< sync_endl;
if (Cluster::is_root())
sync_cout << "info depth 0 score "
<< UCI::to_score(rootPos.checkers() ? -VALUE_MATE : VALUE_DRAW, rootPos)
<< sync_endl;
}
else
{
@@ -173,12 +175,17 @@ void Search::Worker::start_searching() {
// GUI sends a "stop" or "ponderhit" command. We therefore simply wait here
// until the GUI sends one of those commands.
while (!threads.stop && (main_manager()->ponder || limits.infinite))
{} // Busy wait for a stop or a ponder reset
{
Cluster::signals_poll(threads);
} // Busy wait for a stop or a ponder reset
// Stop the threads if not already stopped (also raise the stop if
// "ponderhit" just reset threads.ponder).
threads.stop = true;
// Signal and synchronize all other ranks
Cluster::signals_sync(threads);
// Wait until all threads have finished
threads.wait_for_search_finished();
@@ -186,7 +193,7 @@ void Search::Worker::start_searching() {
// the available ones before exiting.
if (limits.npmsec)
main_manager()->tm.advance_nodes_time(limits.inc[rootPos.side_to_move()]
- threads.nodes_searched());
- Cluster::nodes_searched(threads));
Worker* bestThread = this;
Skill skill =
@@ -196,21 +203,40 @@ void Search::Worker::start_searching() {
&& rootMoves[0].pv[0] != Move::none())
bestThread = threads.get_best_thread()->worker.get();
// Prepare PVLine and ponder move
std::string PVLine = main_manager()->pv(*bestThread, threads, tt, bestThread->completedDepth);
main_manager()->bestPreviousScore = bestThread->rootMoves[0].score;
main_manager()->bestPreviousAverageScore = bestThread->rootMoves[0].averageScore;
// Send again PV info if we have a new best thread
if (bestThread != this)
sync_cout << main_manager()->pv(*bestThread, threads, tt, bestThread->completedDepth)
<< sync_endl;
sync_cout << "bestmove " << UCI::move(bestThread->rootMoves[0].pv[0], rootPos.is_chess960());
Move bestMove = bestThread->rootMoves[0].pv[0];
Move ponderMove = Move::none();
if (bestThread->rootMoves[0].pv.size() > 1
|| bestThread->rootMoves[0].extract_ponder_from_tt(tt, rootPos))
std::cout << " ponder " << UCI::move(bestThread->rootMoves[0].pv[1], rootPos.is_chess960());
ponderMove = bestThread->rootMoves[0].pv[1];
std::cout << sync_endl;
// Exchange info as needed
Cluster::MoveInfo mi{bestMove.raw(), ponderMove.raw(), bestThread->completedDepth,
bestThread->rootMoves[0].score, Cluster::rank()};
Cluster::pick_moves(mi, PVLine);
main_manager()->bestPreviousScore = static_cast<Value>(mi.score);
if (Cluster::is_root())
{
// Send again PV info if we have a new best thread/rank
if (bestThread != this || mi.rank != 0)
sync_cout << PVLine << sync_endl;
bestMove = static_cast<Move>(mi.move);
ponderMove = static_cast<Move>(mi.ponder);
if (ponderMove != Move::none())
sync_cout << "bestmove " << UCI::move(bestMove, rootPos.is_chess960()) << " ponder "
<< UCI::move(ponderMove, rootPos.is_chess960()) << sync_endl;
else
sync_cout << "bestmove " << UCI::move(bestMove, rootPos.is_chess960()) << sync_endl;
}
}
// Main iterative deepening loop. It calls search()
@@ -272,7 +298,7 @@ void Search::Worker::iterative_deepening() {
// Iterative deepening loop until requested to stop or the target depth is reached
while (++rootDepth < MAX_PLY && !threads.stop
&& !(limits.depth && mainThread && rootDepth > limits.depth))
&& !(limits.depth && mainThread && Cluster::is_root() && rootDepth > limits.depth))
{
// Age out PV variability metric
if (mainThread)
@@ -341,9 +367,14 @@ void Search::Worker::iterative_deepening() {
// When failing high/low give some update (without cluttering
// the UI) before a re-search.
if (mainThread && multiPV == 1 && (bestValue <= alpha || bestValue >= beta)
&& mainThread->tm.elapsed(threads.nodes_searched()) > 3000)
if (Cluster::is_root() && mainThread && multiPV == 1
&& (bestValue <= alpha || bestValue >= beta)
&& mainThread->tm.elapsed(Cluster::nodes_searched(threads)) > 3000)
{
sync_cout << main_manager()->pv(*this, threads, tt, rootDepth) << sync_endl;
Cluster::cluster_info(threads, rootDepth,
mainThread->tm.elapsed(Cluster::nodes_searched(threads)));
}
// In case of failing low/high increase aspiration window and
// re-search, otherwise exit the loop.
@@ -372,15 +403,19 @@ void Search::Worker::iterative_deepening() {
// Sort the PV lines searched so far and update the GUI
std::stable_sort(rootMoves.begin() + pvFirst, rootMoves.begin() + pvIdx + 1);
if (mainThread
if (Cluster::is_root() && mainThread
&& (threads.stop || pvIdx + 1 == multiPV
|| mainThread->tm.elapsed(threads.nodes_searched()) > 3000)
|| mainThread->tm.elapsed(Cluster::nodes_searched(threads)) > 3000)
// A thread that aborted search can have mated-in/TB-loss PV and score
// that cannot be trusted, i.e. it can be delayed or refuted if we would have
// had time to fully search other root-moves. Thus we suppress this output and
// below pick a proven score/PV for this thread (from the previous iteration).
&& !(threads.abortedSearch && rootMoves[0].uciScore <= VALUE_TB_LOSS_IN_MAX_PLY))
{
sync_cout << main_manager()->pv(*this, threads, tt, rootDepth) << sync_endl;
Cluster::cluster_info(threads, rootDepth,
mainThread->tm.elapsed(Cluster::nodes_searched(threads)) + 1);
}
}
if (!threads.stop)
@@ -451,12 +486,12 @@ void Search::Worker::iterative_deepening() {
totalTime = std::min(500.0, totalTime);
if (completedDepth >= 10 && nodesEffort >= 97
&& mainThread->tm.elapsed(threads.nodes_searched()) > totalTime * 0.739
&& mainThread->tm.elapsed(Cluster::nodes_searched(threads)) > totalTime * 0.739
&& !mainThread->ponder)
threads.stop = true;
// Stop the search if we have exceeded the totalTime
if (mainThread->tm.elapsed(threads.nodes_searched()) > totalTime)
if (mainThread->tm.elapsed(Cluster::nodes_searched(threads)) > totalTime)
{
// If we are allowed to ponder do not stop the search now but
// keep pondering until the GUI sends "ponderhit" or "stop".
@@ -468,7 +503,7 @@ void Search::Worker::iterative_deepening() {
else
threads.increaseDepth =
mainThread->ponder
|| mainThread->tm.elapsed(threads.nodes_searched()) <= totalTime * 0.506;
|| mainThread->tm.elapsed(Cluster::nodes_searched(threads)) <= totalTime * 0.506;
}
mainThread->iterValue[iterIdx] = bestValue;
@@ -671,9 +706,9 @@ Value Search::Worker::search(
if (b == BOUND_EXACT || (b == BOUND_LOWER ? value >= beta : value <= alpha))
{
tte->save(posKey, value_to_tt(value, ss->ply), ss->ttPv, b,
std::min(MAX_PLY - 1, depth + 6), Move::none(), VALUE_NONE,
tt.generation());
Cluster::save(tt, threads, thisThread, tte, posKey, value_to_tt(value, ss->ply),
ss->ttPv, b, std::min(MAX_PLY - 1, depth + 6), Move::none(),
VALUE_NONE, tt.generation());
return value;
}
@@ -726,8 +761,8 @@ Value Search::Worker::search(
ss->staticEval = eval = to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos);
// Static evaluation is saved as it was before adjustment by correction history
tte->save(posKey, VALUE_NONE, ss->ttPv, BOUND_NONE, DEPTH_NONE, Move::none(),
unadjustedStaticEval, tt.generation());
Cluster::save(tt, threads, thisThread, tte, posKey, VALUE_NONE, ss->ttPv, BOUND_NONE,
DEPTH_NONE, Move::none(), unadjustedStaticEval, tt.generation());
}
// Use static evaluation difference to improve quiet move ordering (~9 Elo)
@@ -872,8 +907,9 @@ Value Search::Worker::search(
if (value >= probCutBeta)
{
// Save ProbCut data into transposition table
tte->save(posKey, value_to_tt(value, ss->ply), ss->ttPv, BOUND_LOWER, depth - 3,
move, unadjustedStaticEval, tt.generation());
Cluster::save(tt, threads, thisThread, tte, posKey, value_to_tt(value, ss->ply),
ss->ttPv, BOUND_LOWER, depth - 3, move, unadjustedStaticEval,
tt.generation());
return std::abs(value) < VALUE_TB_WIN_IN_MAX_PLY ? value - (probCutBeta - beta)
: value;
}
@@ -930,8 +966,8 @@ moves_loop: // When in check, search starts here
ss->moveCount = ++moveCount;
if (rootNode && is_mainthread()
&& main_manager()->tm.elapsed(threads.nodes_searched()) > 3000)
if (rootNode && Cluster::is_root() && is_mainthread()
&& main_manager()->tm.elapsed(Cluster::nodes_searched(threads)) > 3000)
sync_cout << "info depth " << depth << " currmove "
<< UCI::move(move, pos.is_chess960()) << " currmovenumber "
<< moveCount + thisThread->pvIdx << sync_endl;
@@ -1341,11 +1377,12 @@ moves_loop: // When in check, search starts here
// Write gathered information in transposition table
// Static evaluation is saved as it was before correction history
if (!excludedMove && !(rootNode && thisThread->pvIdx))
tte->save(posKey, value_to_tt(bestValue, ss->ply), ss->ttPv,
bestValue >= beta ? BOUND_LOWER
: PvNode && bestMove ? BOUND_EXACT
: BOUND_UPPER,
depth, bestMove, unadjustedStaticEval, tt.generation());
Cluster::save(tt, threads, thisThread, tte, posKey, value_to_tt(bestValue, ss->ply),
ss->ttPv,
bestValue >= beta ? BOUND_LOWER
: PvNode && bestMove ? BOUND_EXACT
: BOUND_UPPER,
depth, bestMove, unadjustedStaticEval, tt.generation());
// Adjust correction history
if (!ss->inCheck && (!bestMove || !pos.capture(bestMove))
@@ -1472,8 +1509,9 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta,
if (bestValue >= beta)
{
if (!ss->ttHit)
tte->save(posKey, value_to_tt(bestValue, ss->ply), false, BOUND_LOWER, DEPTH_NONE,
Move::none(), unadjustedStaticEval, tt.generation());
Cluster::save(tt, threads, thisThread, tte, posKey, value_to_tt(bestValue, ss->ply),
false, BOUND_LOWER, DEPTH_NONE, Move::none(), unadjustedStaticEval,
tt.generation());
return bestValue;
}
@@ -1618,9 +1656,9 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta,
// Save gathered info in transposition table
// Static evaluation is saved as it was before adjustment by correction history
tte->save(posKey, value_to_tt(bestValue, ss->ply), pvHit,
bestValue >= beta ? BOUND_LOWER : BOUND_UPPER, ttDepth, bestMove,
unadjustedStaticEval, tt.generation());
Cluster::save(tt, threads, thisThread, tte, posKey, value_to_tt(bestValue, ss->ply), pvHit,
bestValue >= beta ? BOUND_LOWER : BOUND_UPPER, ttDepth, bestMove,
unadjustedStaticEval, tt.generation());
assert(bestValue > -VALUE_INFINITE && bestValue < VALUE_INFINITE);
@@ -1846,7 +1884,7 @@ void SearchManager::check_time(Search::Worker& worker) {
static TimePoint lastInfoTime = now();
TimePoint elapsed = tm.elapsed(worker.threads.nodes_searched());
TimePoint elapsed = tm.elapsed(Cluster::nodes_searched(worker.threads));
TimePoint tick = worker.limits.startTime + elapsed;
if (tick - lastInfoTime >= 1000)
@@ -1855,6 +1893,9 @@ void SearchManager::check_time(Search::Worker& worker) {
dbg_print();
}
// Poll on MPI signals
Cluster::signals_poll(worker.threads);
// We should not stop pondering until told so by the GUI
if (ponder)
return;
@@ -1865,7 +1906,8 @@ void SearchManager::check_time(Search::Worker& worker) {
worker.completedDepth >= 1
&& ((worker.limits.use_time_management() && (elapsed > tm.maximum() || stopOnPonderhit))
|| (worker.limits.movetime && elapsed >= worker.limits.movetime)
|| (worker.limits.nodes && worker.threads.nodes_searched() >= worker.limits.nodes)))
|| (worker.limits.nodes
&& Cluster::nodes_searched(worker.threads) >= worker.limits.nodes)))
worker.threads.stop = worker.threads.abortedSearch = true;
}
@@ -1875,13 +1917,13 @@ std::string SearchManager::pv(const Search::Worker& worker,
Depth depth) const {
std::stringstream ss;
const auto nodes = threads.nodes_searched();
const auto nodes = Cluster::nodes_searched(threads);
const auto& rootMoves = worker.rootMoves;
const auto& pos = worker.rootPos;
size_t pvIdx = worker.pvIdx;
TimePoint time = tm.elapsed(nodes) + 1;
size_t multiPV = std::min(size_t(worker.options["MultiPV"]), rootMoves.size());
uint64_t tbHits = threads.tb_hits() + (worker.tbConfig.rootInTB ? rootMoves.size() : 0);
uint64_t tbHits = Cluster::tb_hits(threads) + (worker.tbConfig.rootInTB ? rootMoves.size() : 0);
for (size_t i = 0; i < multiPV; ++i)
{


@@ -27,7 +27,9 @@
#include <memory>
#include <string>
#include <vector>
#include <mutex>
#include "cluster.h"
#include "misc.h"
#include "movepick.h"
#include "position.h"
@@ -117,7 +119,7 @@ struct LimitsType {
ponderMode = false;
}
bool use_time_management() const { return time[WHITE] || time[BLACK]; }
bool use_time_management() const { return Cluster::is_root() && (time[WHITE] || time[BLACK]); }
std::vector<Move> searchmoves;
TimePoint time[COLOR_NB], inc[COLOR_NB], npmsec, movetime, startTime;
@@ -211,6 +213,28 @@ class Worker {
PawnHistory pawnHistory;
CorrectionHistory correctionHistory;
#ifdef USE_MPI
struct {
std::mutex mutex;
Cluster::TTCache<Cluster::TTCacheSize> buffer = {};
} ttCache;
#endif
std::atomic<uint64_t> TTsaves;
friend void Cluster::save(TranspositionTable&,
ThreadPool&,
Search::Worker*,
TTEntry* tte,
Key k,
Value v,
bool PvHit,
Bound b,
Depth d,
Move m,
Value ev,
uint8_t generation8);
private:
void iterative_deepening();


@@ -37,6 +37,7 @@
#include <vector>
#include "../bitboard.h"
#include "../cluster.h"
#include "../misc.h"
#include "../movegen.h"
#include "../position.h"
@@ -1466,7 +1467,8 @@ void Tablebases::init(const std::string& paths) {
}
}
sync_cout << "info string Found " << TBTables.size() << " tablebases" << sync_endl;
if (Cluster::is_root())
sync_cout << "info string Found " << TBTables.size() << " tablebases" << sync_endl;
}
// Probe the WDL table for a particular position.


@@ -25,6 +25,7 @@
#include <unordered_map>
#include <utility>
#include "cluster.h"
#include "misc.h"
#include "movegen.h"
#include "search.h"
@@ -115,6 +116,7 @@ Search::SearchManager* ThreadPool::main_manager() {
uint64_t ThreadPool::nodes_searched() const { return accumulate(&Search::Worker::nodes); }
uint64_t ThreadPool::tb_hits() const { return accumulate(&Search::Worker::tbHits); }
uint64_t ThreadPool::TT_saves() const { return accumulate(&Search::Worker::TTsaves); }
// Creates/destroys threads to match the requested number.
// Created and launched threads will immediately go to sleep in idle_loop.
@@ -147,6 +149,9 @@ void ThreadPool::set(Search::SharedState sharedState) {
// Reallocate the hash with the new threadpool size
sharedState.tt.resize(sharedState.options["Hash"], requested);
// Adjust cluster buffers
Cluster::ttSendRecvBuff_resize(requested);
}
}
@@ -205,6 +210,7 @@ void ThreadPool::start_thinking(const OptionsMap& options,
th->worker->limits = limits;
th->worker->nodes = th->worker->tbHits = th->worker->nmpMinPly =
th->worker->bestMoveChanges = 0;
th->worker->TTsaves = 0;
th->worker->rootDepth = th->worker->completedDepth = 0;
th->worker->rootMoves = rootMoves;
th->worker->rootPos.set(pos.fen(), pos.is_chess960(), &th->worker->rootState);
@@ -212,6 +218,8 @@ void ThreadPool::start_thinking(const OptionsMap& options,
th->worker->tbConfig = tbConfig;
}
Cluster::signals_init();
main_thread()->start_searching();
}


@@ -27,6 +27,7 @@
#include <mutex>
#include <vector>
#include "movepick.h"
#include "position.h"
#include "search.h"
#include "thread_win32_osx.h"
@@ -88,6 +89,7 @@ class ThreadPool {
Thread* main_thread() const { return threads.front(); }
uint64_t nodes_searched() const;
uint64_t tb_hits() const;
uint64_t TT_saves() const;
Thread* get_best_thread() const;
void start_searching();
void wait_for_search_finished() const;


@@ -22,8 +22,8 @@
#include <cstddef>
#include <cstdint>
#include "cluster.h"
#include "misc.h"
#include "types.h"
namespace Stockfish {


@@ -27,16 +27,21 @@
namespace Stockfish {
// TTEntry struct is the 10 bytes transposition table entry, defined as below:
//
// key 16 bit
// depth 8 bit
// generation 5 bit
// pv node 1 bit
// bound type 2 bit
// move 16 bit
// value 16 bit
// eval value 16 bit
namespace Cluster {
void init();
}
/// TTEntry struct is the 10 bytes transposition table entry, defined as below:
///
/// key 16 bit
/// depth 8 bit
/// generation 5 bit
/// pv node 1 bit
/// bound type 2 bit
/// move 16 bit
/// value 16 bit
/// eval value 16 bit
struct TTEntry {
Move move() const { return Move(move16); }
@@ -51,6 +56,8 @@ struct TTEntry {
private:
friend class TranspositionTable;
friend void Cluster::init();
uint16_t key16;
uint8_t depth8;
@@ -68,6 +75,8 @@ struct TTEntry {
// prefetched when possible.
class TranspositionTable {
friend void Cluster::init();
static constexpr int ClusterSize = 3;
struct Cluster {


@@ -32,6 +32,7 @@
#include <vector>
#include "benchmark.h"
#include "cluster.h"
#include "evaluate.h"
#include "movegen.h"
#include "nnue/network.h"
@@ -112,7 +113,8 @@ void UCI::loop() {
do
{
if (cli.argc == 1
&& !getline(std::cin, cmd)) // Wait for an input or an end-of-file (EOF) indication
&& !Cluster::getline(std::cin,
cmd)) // Wait for an input or an end-of-file (EOF) indication
cmd = "quit";
std::istringstream is(cmd);
@@ -130,7 +132,7 @@ void UCI::loop() {
else if (token == "ponderhit")
threads.main_manager()->ponder = false; // Switch to the normal search
else if (token == "uci")
else if (token == "uci" && Cluster::is_root())
sync_cout << "id name " << engine_info(true) << "\n"
<< options << "\nuciok" << sync_endl;
@@ -142,7 +144,7 @@ void UCI::loop() {
position(pos, is, states);
else if (token == "ucinewgame")
search_clear();
else if (token == "isready")
else if (token == "isready" && Cluster::is_root())
sync_cout << "readyok" << sync_endl;
// Add custom non-UCI commands, mainly for debugging purposes.
@@ -151,13 +153,13 @@ void UCI::loop() {
pos.flip();
else if (token == "bench")
bench(pos, is, states);
else if (token == "d")
else if (token == "d" && Cluster::is_root())
sync_cout << pos << sync_endl;
else if (token == "eval")
else if (token == "eval" && Cluster::is_root())
trace_eval(pos);
else if (token == "compiler")
else if (token == "compiler" && Cluster::is_root())
sync_cout << compiler_info() << sync_endl;
else if (token == "export_net")
else if (token == "export_net" && Cluster::is_root())
{
std::pair<std::optional<std::string>, std::string> files[2];
@@ -170,7 +172,9 @@ void UCI::loop() {
networks.big.save(files[0].first);
networks.small.save(files[1].first);
}
else if (token == "--help" || token == "help" || token == "--license" || token == "license")
else if ((token == "--help" || token == "help" || token == "--license"
|| token == "license")
&& Cluster::is_root())
sync_cout
<< "\nStockfish is a powerful chess engine for playing and analyzing."
"\nIt is released as free software licensed under the GNU GPLv3 License."
@@ -179,7 +183,7 @@ void UCI::loop() {
"\nFor any further information, visit https://github.com/official-stockfish/Stockfish#readme"
"\nor read the corresponding README.md and Copying.txt files distributed along with this program.\n"
<< sync_endl;
else if (!token.empty() && token[0] != '#')
else if (!token.empty() && token[0] != '#' && Cluster::is_root())
sync_cout << "Unknown command: '" << cmd << "'. Type help for more information."
<< sync_endl;
@@ -259,15 +263,16 @@ void UCI::bench(Position& pos, std::istream& args, StateListPtr& states) {
if (token == "go" || token == "eval")
{
std::cerr << "\nPosition: " << cnt++ << '/' << num << " (" << pos.fen() << ")"
<< std::endl;
if (Cluster::is_root())
std::cerr << "\nPosition: " << cnt++ << '/' << num << " (" << pos.fen() << ")"
<< std::endl;
if (token == "go")
{
go(pos, is, states);
threads.main_thread()->wait_for_search_finished();
nodes += threads.nodes_searched();
nodes += Cluster::nodes_searched(threads);
}
else
else if (Cluster::is_root())
trace_eval(pos);
}
else if (token == "setoption")
@@ -285,9 +290,10 @@ void UCI::bench(Position& pos, std::istream& args, StateListPtr& states) {
dbg_print();
std::cerr << "\n==========================="
<< "\nTotal time (ms) : " << elapsed << "\nNodes searched : " << nodes
<< "\nNodes/second : " << 1000 * nodes / elapsed << std::endl;
if (Cluster::is_root())
std::cerr << "\n==========================="
<< "\nTotal time (ms) : " << elapsed << "\nNodes searched : " << nodes
<< "\nNodes/second : " << 1000 * nodes / elapsed << std::endl;
}
void UCI::trace_eval(Position& pos) {


@@ -25,6 +25,7 @@
#include <sstream>
#include <utility>
#include "cluster.h"
#include "misc.h"
namespace Stockfish {
@@ -51,7 +52,7 @@ void OptionsMap::setoption(std::istringstream& is) {
if (options_map.count(name))
options_map[name] = value;
else
else if (Cluster::is_root())
sync_cout << "No such option: " << name << sync_endl;
}