Remove custom mutex implementation

As part of the investigation of the hang caused by an incorrect implementation
of condition_variable in libwinpthread, it was realized that our custom Mutex
implementation is no longer needed. Prior to lazySMP this custom implementation
resulted in a 30% speedup, but now no speed difference can be measured as no
mutex is used on the hot path in lazySMP.

https://github.com/official-stockfish/Stockfish/issues/2291
https://github.com/official-stockfish/Stockfish/issues/2309#issuecomment-533733393  https://github.com/official-stockfish/Stockfish/issues/2309#issuecomment-533737515

The interest of this patch is that it removes platform-specific code, which is
always less tested.

No functional change.
This commit is contained in:
Joost VandeVondele
2019-09-16 07:51:25 +02:00
committed by Stéphane Nicolet
parent 8726beba59
commit d703d2b5e7
5 changed files with 16 additions and 61 deletions

View File

@@ -168,7 +168,7 @@ void dbg_print() {
std::ostream& operator<<(std::ostream& os, SyncCout sc) { std::ostream& operator<<(std::ostream& os, SyncCout sc) {
static Mutex m; static std::mutex m;
if (sc == IO_LOCK) if (sc == IO_LOCK)
m.lock(); m.lock();

View File

@@ -27,12 +27,12 @@
#include <list> #include <list>
#include <sstream> #include <sstream>
#include <type_traits> #include <type_traits>
#include <mutex>
#include "../bitboard.h" #include "../bitboard.h"
#include "../movegen.h" #include "../movegen.h"
#include "../position.h" #include "../position.h"
#include "../search.h" #include "../search.h"
#include "../thread_win32_osx.h"
#include "../types.h" #include "../types.h"
#include "../uci.h" #include "../uci.h"
@@ -45,7 +45,9 @@
#include <sys/stat.h> #include <sys/stat.h>
#else #else
#define WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN
#define NOMINMAX #ifndef NOMINMAX
# define NOMINMAX // Disable macros min() and max()
#endif
#include <windows.h> #include <windows.h>
#endif #endif
@@ -1124,14 +1126,14 @@ void set(T& e, uint8_t* data) {
template<TBType Type> template<TBType Type>
void* mapped(TBTable<Type>& e, const Position& pos) { void* mapped(TBTable<Type>& e, const Position& pos) {
static Mutex mutex; static std::mutex mutex;
// Use 'acquire' to avoid a thread reading 'ready' == true while // Use 'acquire' to avoid a thread reading 'ready' == true while
// another is still working. (compiler reordering may cause this). // another is still working. (compiler reordering may cause this).
if (e.ready.load(std::memory_order_acquire)) if (e.ready.load(std::memory_order_acquire))
return e.baseAddress; // Could be nullptr if file does not exist return e.baseAddress; // Could be nullptr if file does not exist
std::unique_lock<Mutex> lk(mutex); std::unique_lock<std::mutex> lk(mutex);
if (e.ready.load(std::memory_order_relaxed)) // Recheck under lock if (e.ready.load(std::memory_order_relaxed)) // Recheck under lock
return e.baseAddress; return e.baseAddress;

View File

@@ -81,7 +81,7 @@ void Thread::clear() {
void Thread::start_searching() { void Thread::start_searching() {
std::lock_guard<Mutex> lk(mutex); std::lock_guard<std::mutex> lk(mutex);
searching = true; searching = true;
cv.notify_one(); // Wake up the thread in idle_loop() cv.notify_one(); // Wake up the thread in idle_loop()
} }
@@ -92,7 +92,7 @@ void Thread::start_searching() {
void Thread::wait_for_search_finished() { void Thread::wait_for_search_finished() {
std::unique_lock<Mutex> lk(mutex); std::unique_lock<std::mutex> lk(mutex);
cv.wait(lk, [&]{ return !searching; }); cv.wait(lk, [&]{ return !searching; });
} }
@@ -112,7 +112,7 @@ void Thread::idle_loop() {
while (true) while (true)
{ {
std::unique_lock<Mutex> lk(mutex); std::unique_lock<std::mutex> lk(mutex);
searching = false; searching = false;
cv.notify_one(); // Wake up anyone waiting for search finished cv.notify_one(); // Wake up anyone waiting for search finished
cv.wait(lk, [&]{ return searching; }); cv.wait(lk, [&]{ return searching; });

View File

@@ -42,8 +42,8 @@
class Thread { class Thread {
Mutex mutex; std::mutex mutex;
ConditionVariable cv; std::condition_variable cv;
size_t idx; size_t idx;
bool exit = false, searching = true; // Set before starting std::thread bool exit = false, searching = true; // Set before starting std::thread
NativeThread stdThread; NativeThread stdThread;

View File

@@ -21,60 +21,13 @@
#ifndef THREAD_WIN32_OSX_H_INCLUDED #ifndef THREAD_WIN32_OSX_H_INCLUDED
#define THREAD_WIN32_OSX_H_INCLUDED #define THREAD_WIN32_OSX_H_INCLUDED
/// STL thread library used by mingw and gcc when cross compiling for Windows
/// relies on libwinpthread. Currently libwinpthread implements mutexes directly
/// on top of Windows semaphores. Semaphores, being kernel objects, require kernel
/// mode transition in order to lock or unlock, which is very slow compared to
/// interlocked operations (about 30% slower on bench test). To work around this
/// issue, we define our wrappers to the low level Win32 calls. We use critical
/// sections to support Windows XP and older versions. Unfortunately, cond_wait()
/// is racy between unlock() and WaitForSingleObject() but they have the same
/// speed performance as the SRW locks.
#include <condition_variable>
#include <mutex>
#include <thread> #include <thread>
#if defined(_WIN32) && !defined(_MSC_VER)
#ifndef NOMINMAX
# define NOMINMAX // Disable macros min() and max()
#endif
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#undef WIN32_LEAN_AND_MEAN
#undef NOMINMAX
/// Mutex and ConditionVariable struct are wrappers of the low level locking
/// machinery and are modeled after the corresponding C++11 classes.
struct Mutex {
Mutex() { InitializeCriticalSection(&cs); }
~Mutex() { DeleteCriticalSection(&cs); }
void lock() { EnterCriticalSection(&cs); }
void unlock() { LeaveCriticalSection(&cs); }
private:
CRITICAL_SECTION cs;
};
typedef std::condition_variable_any ConditionVariable;
#else // Default case: use STL classes
typedef std::mutex Mutex;
typedef std::condition_variable ConditionVariable;
#endif
/// On OSX threads other than the main thread are created with a reduced stack /// On OSX threads other than the main thread are created with a reduced stack
/// size of 512KB by default, this is dangerously low for deep searches, so /// size of 512KB by default, this is too low for deep searches, which require
/// adjust it to TH_STACK_SIZE. The implementation calls pthread_create() with /// somewhat more than 1MB stack, so adjust it to TH_STACK_SIZE.
/// proper stack size parameter. /// The implementation calls pthread_create() with the stack size parameter
/// equal to the linux 8MB default, on platforms that support it.
/// On toolchains where pthread is always available, ensure minimum stack size
/// instead of relying on linker defaults which may be platform-specific.
#if defined(__APPLE__) || defined(__MINGW32__) || defined(__MINGW64__) #if defined(__APPLE__) || defined(__MINGW32__) || defined(__MINGW64__)