Add support for Windows large pages

For users that set the needed privilege "Lock Pages in Memory",
large pages will be automatically enabled (see Readme.md).

This expert setting might improve speed by 5% - 30%, depending
on the hardware, the number of threads and the hash size. The gain is
larger for large hashes, large numbers of threads and NUMA systems. If the
operating system cannot allocate large pages (easier after a reboot), the
default allocation is used automatically. The engine log provides details.

closes https://github.com/official-stockfish/Stockfish/pull/2656

fixes https://github.com/official-stockfish/Stockfish/issues/2619

No functional change
This commit is contained in:
Sami Kiminki
2020-05-04 20:49:27 +03:00
committed by Joost VandeVondele
parent 86ee4eb84d
commit d4763424d2
5 changed files with 120 additions and 2 deletions

View File

@@ -49,6 +49,7 @@ int main(int argc, char* argv[]) {
UCI::loop(argc, argv);
TT.resize(0);
Threads.set(0);
return 0;
}

View File

@@ -309,6 +309,69 @@ void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
return mem;
}
#elif defined(_WIN64)
static void* aligned_ttmem_alloc_large_pages(size_t allocSize) {
HANDLE hProcessToken { };
LUID luid { };
void* mem = nullptr;
const size_t largePageSize = GetLargePageMinimum();
if (!largePageSize)
return nullptr;
// We need SeLockMemoryPrivilege, so try to enable it for the process
if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hProcessToken))
return nullptr;
if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &luid))
{
TOKEN_PRIVILEGES tp { };
TOKEN_PRIVILEGES prevTp { };
DWORD prevTpLen = 0;
tp.PrivilegeCount = 1;
tp.Privileges[0].Luid = luid;
tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
// Try to enable SeLockMemoryPrivilege. Note that even if AdjustTokenPrivileges() succeeds,
// we still need to query GetLastError() to ensure that the privileges were actually obtained...
if (AdjustTokenPrivileges(
hProcessToken, FALSE, &tp, sizeof(TOKEN_PRIVILEGES), &prevTp, &prevTpLen) &&
GetLastError() == ERROR_SUCCESS)
{
// round up size to full pages and allocate
allocSize = (allocSize + largePageSize - 1) & ~size_t(largePageSize - 1);
mem = VirtualAlloc(
NULL, allocSize, MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES, PAGE_READWRITE);
// privilege no longer needed, restore previous state
AdjustTokenPrivileges(hProcessToken, FALSE, &prevTp, 0, NULL, NULL);
}
}
CloseHandle(hProcessToken);
return mem;
}
/// aligned_ttmem_alloc (Windows) first attempts a large-page allocation and,
/// if that yields nothing, falls back to a regular VirtualAlloc of allocSize
/// bytes. The resulting pointer is stored in mem and also returned; it is
/// nullptr if both attempts fail. One "info string" line reports whether
/// large pages were used.
void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {

  mem = aligned_ttmem_alloc_large_pages(allocSize);

  const bool largePagesUsed = (mem != nullptr);

  sync_cout << (largePagesUsed
                    ? "info string Hash table allocation: Windows large pages used."
                    : "info string Hash table allocation: Windows large pages not used.")
            << sync_endl;

  // Regular, page aligned, allocation when large pages were not available
  if (!largePagesUsed)
      mem = VirtualAlloc(NULL, allocSize, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);

  return mem;
}
#else
void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
@@ -322,6 +385,28 @@ void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
#endif
/// aligned_ttmem_free will free the previously allocated ttmem
#if defined(_WIN64)
void aligned_ttmem_free(void* mem) {
if (!VirtualFree(mem, 0, MEM_RELEASE))
{
DWORD err = GetLastError();
std::cerr << "Failed to free transposition table. Error code: 0x" <<
std::hex << err << std::dec << std::endl;
exit(EXIT_FAILURE);
}
}
#else
/// Non-Windows variant of aligned_ttmem_free: hands the pointer straight to
/// free(), so a null pointer is accepted and ignored.
void aligned_ttmem_free(void* mem) {

  free(mem);
}
#endif
namespace WinProcGroup {

View File

@@ -34,6 +34,7 @@ const std::string compiler_info();
void prefetch(void* addr);
void start_logger(const std::string& fname);
// Allocates `size` bytes for the transposition table. The returned pointer is
// the one used as the table; `mem` receives the pointer that must later be
// handed to aligned_ttmem_free().
void* aligned_ttmem_alloc(size_t size, void*& mem);
// Releases memory previously obtained through aligned_ttmem_alloc(); pass the
// value that was stored in its `mem` out-parameter.
void aligned_ttmem_free(void* mem);
void dbg_hit_on(bool b);
void dbg_hit_on(bool c, bool b);

View File

@@ -63,7 +63,14 @@ void TranspositionTable::resize(size_t mbSize) {
Threads.main()->wait_for_search_finished();
free(mem);
if (mem)
aligned_ttmem_free(mem);
if (!mbSize)
{
mem = nullptr;
return;
}
clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster);
table = static_cast<Cluster*>(aligned_ttmem_alloc(clusterCount * sizeof(Cluster), mem));