Lazy SMP

Start all threads searching on root position and use only the shared TT table as synching scheme. It seems this scheme scales better than YBWC for high number of threads. Verified for nor regression at STC 3 threads LLR: -2.95 (-2.94,2.94) [-3.00,1.00] Total: 40232 W: 6908 L: 7130 D: 26194 Verified for nor regression at LTC 3 threads LLR: 2.95 (-2.94,2.94) [-3.00,1.00] Total: 28186 W: 3908 L: 3798 D: 20480 Verified for nor regression at STC 7 threads LLR: 2.95 (-2.94,2.94) [-3.00,1.00] Total: 3607 W: 674 L: 526 D: 2407 Verified for nor regression at LTC 7 threads LLR: 2.95 (-2.94,2.94) [-3.00,1.00] Total: 4235 W: 671 L: 528 D: 3036 Tested with fixed games at LTC with 20 threads ELO: 44.75 +-7.6 (95%) LOS: 100.0% Total: 2069 W: 407 L: 142 D: 1520 Tested with fixed games at XLTC (120secs) with 20 threads ELO: 28.01 +-6.7 (95%) LOS: 100.0% Total: 2275 W: 349 L: 166 D: 1760 Original patch of mbootsector, with additional work from Ivan Ivec (log formula), Joerg Oster (id loop simplification) and Marco Costalba (assorted formatting and rework). Bench: 8116244
2025-12-20 00:56:39 +08:00 · 2015-10-06 08:15:17 +02:00
parent 7ea5659c5f
commit ecc5ff6693
10 changed files with 365 additions and 750 deletions
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -66,15 +66,24 @@ void ThreadBase::notify_one() {
 }


-// ThreadBase::wait_for() set the thread to sleep until 'condition' turns true
+// ThreadBase::wait() set the thread to sleep until 'condition' turns true

-void ThreadBase::wait_for(volatile const bool& condition) {
+void ThreadBase::wait(volatile const bool& condition) {

  std::unique_lock<Mutex> lk(mutex);
  sleepCondition.wait(lk, [&]{ return condition; });
 }


+// ThreadBase::wait_while() set the thread to sleep until 'condition' turns false
+
+void ThreadBase::wait_while(volatile const bool& condition) {
+
+  std::unique_lock<Mutex> lk(mutex);
+  sleepCondition.wait(lk, [&]{ return !condition; });
+}
+
+
 // Thread c'tor makes some init but does not launch any execution thread that
 // will be started only when c'tor returns.

@@ -82,143 +91,10 @@ Thread::Thread() /* : splitPoints() */ { // Initialization of non POD broken in

  searching = false;
  maxPly = 0;
-  splitPointsSize = 0;
-  activeSplitPoint = nullptr;
-  activePosition = nullptr;
  idx = Threads.size(); // Starts from 0
 }


-// Thread::cutoff_occurred() checks whether a beta cutoff has occurred in the
-// current active split point, or in some ancestor of the split point.
-
-bool Thread::cutoff_occurred() const {
-
-  for (SplitPoint* sp = activeSplitPoint; sp; sp = sp->parentSplitPoint)
-      if (sp->cutoff)
-          return true;
-
-  return false;
-}
-
-
-// Thread::can_join() checks whether the thread is available to join the split
-// point 'sp'. An obvious requirement is that thread must be idle. With more than
-// two threads, this is not sufficient: If the thread is the master of some split
-// point, it is only available as a slave for the split points below his active
-// one (the "helpful master" concept in YBWC terminology).
-
-bool Thread::can_join(const SplitPoint* sp) const {
-
-  if (searching)
-      return false;
-
-  // Make a local copy to be sure it doesn't become zero under our feet while
-  // testing next condition and so leading to an out of bounds access.
-  const size_t size = splitPointsSize;
-
-  // No split points means that the thread is available as a slave for any
-  // other thread otherwise apply the "helpful master" concept if possible.
-  return !size || splitPoints[size - 1].slavesMask.test(sp->master->idx);
-}
-
-
-// Thread::split() does the actual work of distributing the work at a node between
-// several available threads. If it does not succeed in splitting the node
-// (because no idle threads are available), the function immediately returns.
-// If splitting is possible, a SplitPoint object is initialized with all the
-// data that must be copied to the helper threads and then helper threads are
-// informed that they have been assigned work. This will cause them to instantly
-// leave their idle loops and call search(). When all threads have returned from
-// search() then split() returns.
-
-void Thread::split(Position& pos, Stack* ss, Value alpha, Value beta, Value* bestValue,
-                   Move* bestMove, Depth depth, int moveCount,
-                   MovePicker* movePicker, int nodeType, bool cutNode) {
-
-  assert(searching);
-  assert(-VALUE_INFINITE < *bestValue && *bestValue <= alpha && alpha < beta && beta <= VALUE_INFINITE);
-  assert(depth >= Threads.minimumSplitDepth);
-  assert(splitPointsSize < MAX_SPLITPOINTS_PER_THREAD);
-
-  // Pick and init the next available split point
-  SplitPoint& sp = splitPoints[splitPointsSize];
-
-  sp.spinlock.acquire(); // No contention here until we don't increment splitPointsSize
-
-  sp.master = this;
-  sp.parentSplitPoint = activeSplitPoint;
-  sp.slavesMask = 0, sp.slavesMask.set(idx);
-  sp.depth = depth;
-  sp.bestValue = *bestValue;
-  sp.bestMove = *bestMove;
-  sp.alpha = alpha;
-  sp.beta = beta;
-  sp.nodeType = nodeType;
-  sp.cutNode = cutNode;
-  sp.movePicker = movePicker;
-  sp.moveCount = moveCount;
-  sp.pos = &pos;
-  sp.nodes = 0;
-  sp.cutoff = false;
-  sp.ss = ss;
-  sp.allSlavesSearching = true; // Must be set under lock protection
-
-  ++splitPointsSize;
-  activeSplitPoint = &sp;
-  activePosition = nullptr;
-
-  // Try to allocate available threads
-  Thread* slave;
-
-  while (    sp.slavesMask.count() < MAX_SLAVES_PER_SPLITPOINT
-         && (slave = Threads.available_slave(&sp)) != nullptr)
-  {
-     slave->spinlock.acquire();
-
-      if (slave->can_join(activeSplitPoint))
-      {
-          activeSplitPoint->slavesMask.set(slave->idx);
-          slave->activeSplitPoint = activeSplitPoint;
-          slave->searching = true;
-      }
-
-      slave->spinlock.release();
-  }
-
-  // Everything is set up. The master thread enters the idle loop, from which
-  // it will instantly launch a search, because its 'searching' flag is set.
-  // The thread will return from the idle loop when all slaves have finished
-  // their work at this split point.
-  sp.spinlock.release();
-
-  Thread::idle_loop(); // Force a call to base class idle_loop()
-
-  // In the helpful master concept, a master can help only a sub-tree of its
-  // split point and because everything is finished here, it's not possible
-  // for the master to be booked.
-  assert(!searching);
-  assert(!activePosition);
-
-  // We have returned from the idle loop, which means that all threads are
-  // finished. Note that decreasing splitPointsSize must be done under lock
-  // protection to avoid a race with Thread::can_join().
-  spinlock.acquire();
-
-  searching = true;
-  --splitPointsSize;
-  activeSplitPoint = sp.parentSplitPoint;
-  activePosition = &pos;
-
-  spinlock.release();
-
-  // Split point data cannot be changed now, so no need to lock protect
-  pos.set_nodes_searched(pos.nodes_searched() + sp.nodes);
-  *bestMove = sp.bestMove;
-  *bestValue = sp.bestValue;
-}
-
-
 // TimerThread::idle_loop() is where the timer thread waits Resolution milliseconds
 // and then calls check_time(). When not searching, thread sleeps until it's woken up.

@@ -233,12 +109,31 @@ void TimerThread::idle_loop() {

      lk.unlock();

-      if (run)
+      if (!exit && run)
          check_time();
  }
 }


+// Thread::idle_loop() is where the thread is parked when it has no work to do
+
+void Thread::idle_loop() {
+
+  while (!exit)
+  {
+      std::unique_lock<Mutex> lk(mutex);
+
+      while (!searching && !exit)
+          sleepCondition.wait(lk);
+
+      lk.unlock();
+
+      if (!exit && searching)
+          search();
+  }
+}
+
+
 // MainThread::idle_loop() is where the main thread is parked waiting to be started
 // when there is a new search. The main thread will launch all the slave threads.

@@ -259,20 +154,12 @@ void MainThread::idle_loop() {
      lk.unlock();

      if (!exit)
-      {
-          searching = true;
-
-          Search::think();
-
-          assert(searching);
-
-          searching = false;
-      }
+          think();
  }
 }


-// MainThread::join() waits for main thread to finish the search
+// MainThread::join() waits for main thread to finish thinking

 void MainThread::join() {

@@ -317,7 +204,6 @@ void ThreadPool::exit() {

 void ThreadPool::read_uci_options() {

-  minimumSplitDepth = Options["Min Split Depth"] * ONE_PLY;
  size_t requested  = Options["Threads"];

  assert(requested > 0);
@@ -333,16 +219,14 @@ void ThreadPool::read_uci_options() {
 }


-// ThreadPool::available_slave() tries to find an idle thread which is available
-// to join SplitPoint 'sp'.
+// ThreadPool::nodes_searched() returns the number of nodes searched

-Thread* ThreadPool::available_slave(const SplitPoint* sp) const {
+int64_t ThreadPool::nodes_searched() {

-  for (Thread* th : *this)
-      if (th->can_join(sp))
-          return th;
-
-  return nullptr;
+  int64_t nodes = 0;
+  for (Thread *th : *this)
+      nodes += th->rootPos.nodes_searched();
+  return nodes;
 }


@@ -356,8 +240,8 @@ void ThreadPool::start_thinking(const Position& pos, const LimitsType& limits,
  Signals.stopOnPonderhit = Signals.firstRootMove = false;
  Signals.stop = Signals.failedLowAtRoot = false;

-  RootMoves.clear();
-  RootPos = pos;
+  main()->rootMoves.clear();
+  main()->rootPos = pos;
  Limits = limits;
  if (states.get()) // If we don't set a new position, preserve current state
  {
@@ -368,7 +252,7 @@ void ThreadPool::start_thinking(const Position& pos, const LimitsType& limits,
  for (const auto& m : MoveList<LEGAL>(pos))
      if (   limits.searchmoves.empty()
          || std::count(limits.searchmoves.begin(), limits.searchmoves.end(), m))
-          RootMoves.push_back(RootMove(m));
+          main()->rootMoves.push_back(RootMove(m));

  main()->thinking = true;
  main()->notify_one(); // Wake up main thread: 'thinking' must be already set