Mirror of https://github.com/HChaZZY/Stockfish.git (synced 2025-12-26 03:56:50 +08:00)
Revert and fix earlier Windows NUMA patch
Revert 9048ac00db due to a core-spread problem, and fix compatibility with newer versions of Windows by a different method.
This code assumes that if one NUMA node has more than one processor group, the groups are created equal (each group is assigned the same number of cores), and that the total number of available cores contained in such groups equals the number of available cores within one NUMA node, because of how the best_node() function works.
closes https://github.com/official-stockfish/Stockfish/pull/3798
fixes https://github.com/official-stockfish/Stockfish/pull/3787
No functional change.
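To illustrate the even-spread assumption with concrete numbers (an editorial sketch, not part of the commit): suppose a node reports two equally sized processor groups; thread indices then alternate between them, as the patch does with idx % returnedElements.

// Sketch of the even-spread assumption (hypothetical values: one NUMA
// node split into 2 equal processor groups). bindThisThread() picks a
// group with idx % returnedElements, so threads alternate between them.
#include <cstddef>
#include <iostream>

int main() {
    const std::size_t returnedElements = 2; // assumed group count per node
    for (std::size_t idx = 0; idx < 6; ++idx)
        std::cout << "thread " << idx << " -> group "
                  << idx % returnedElements << '\n';
    // Prints: 0->0, 1->1, 2->0, 3->1, 4->0, 5->1
}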
commit 7218ec4df9
parent a943b1d28d
committed by Joost VandeVondele
src/misc.cpp (54 lines changed)
@@ -36,6 +36,7 @@ typedef bool(*fun1_t)(LOGICAL_PROCESSOR_RELATIONSHIP,
                       PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, PDWORD);
 typedef bool(*fun2_t)(USHORT, PGROUP_AFFINITY);
 typedef bool(*fun3_t)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY);
+typedef bool(*fun4_t)(USHORT, PGROUP_AFFINITY, USHORT, PUSHORT);
 }
 #endif

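These typedefs exist so the Windows-specific entry points can be resolved at runtime with GetProcAddress() instead of being linked at build time, keeping the binary loadable on older Windows versions that lack the newer symbols. A minimal sketch of that pattern (the helper name load_numa_mask2 is mine, not from the commit):

// Sketch of the runtime-resolution pattern these typedefs support.
// GetProcAddress() returns nullptr on Windows versions that lack the
// symbol, and the intermediate cast through void(*)() keeps GCC's
// -Wcast-function-type warning quiet when converting from FARPROC.
#include <windows.h>

typedef bool (*fun4_t)(USHORT, PGROUP_AFFINITY, USHORT, PUSHORT);

fun4_t load_numa_mask2() {
    HMODULE k32 = GetModuleHandle("Kernel32.dll");
    return k32 ? (fun4_t)(void (*)())GetProcAddress(k32, "GetNumaNodeProcessorMask2")
               : nullptr;
}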
@@ -495,14 +496,14 @@ void bindThisThread(size_t) {}

 #else

-/// best_group() retrieves logical processor information using Windows specific
-/// API and returns the best group id for the thread with index idx. Original
+/// best_node() retrieves logical processor information using Windows specific
+/// API and returns the best node id for the thread with index idx. Original
 /// code from Texel by Peter Österlund.

-int best_group(size_t idx) {
+int best_node(size_t idx) {

   int threads = 0;
-  int groups = 0;
+  int nodes = 0;
   int cores = 0;
   DWORD returnLength = 0;
   DWORD byteOffset = 0;

@@ -530,8 +531,8 @@ int best_group(size_t idx) {

   while (byteOffset < returnLength)
   {
-      if (ptr->Relationship == RelationGroup)
-          groups += ptr->Group.MaximumGroupCount;
+      if (ptr->Relationship == RelationNumaNode)
+          nodes++;

       else if (ptr->Relationship == RelationProcessorCore)
       {

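For context, the loop above walks a packed buffer of variable-length SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX records, advancing by each record's own Size field. A standalone sketch of that iteration (it calls GetLogicalProcessorInformationEx() directly, whereas misc.cpp resolves it at runtime as fun1; the helper name is mine):

// Sketch: count NUMA nodes by walking the variable-length records that
// GetLogicalProcessorInformationEx() packs into a single buffer.
#include <windows.h>
#include <cstdlib>

int count_numa_nodes() {
    DWORD len = 0;
    // First call fails with ERROR_INSUFFICIENT_BUFFER and reports the size.
    GetLogicalProcessorInformationEx(RelationAll, nullptr, &len);
    auto* buf = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)malloc(len);
    if (!buf || !GetLogicalProcessorInformationEx(RelationAll, buf, &len)) {
        free(buf);
        return -1;
    }
    int nodes = 0;
    for (DWORD offset = 0; offset < len; ) {
        auto* ptr = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)((char*)buf + offset);
        if (ptr->Relationship == RelationNumaNode)
            nodes++;
        offset += ptr->Size; // records are variable-length
    }
    free(buf);
    return nodes;
}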
@@ -546,23 +547,23 @@ int best_group(size_t idx) {

   free(buffer);

-  std::vector<int> core_groups;
+  std::vector<int> groups;

-  // Run as many threads as possible on the same group until core limit is
-  // reached, then move on filling the next group.
-  for (int n = 0; n < groups; n++)
-      for (int i = 0; i < cores / groups; i++)
-          core_groups.push_back(n);
+  // Run as many threads as possible on the same node until core limit is
+  // reached, then move on filling the next node.
+  for (int n = 0; n < nodes; n++)
+      for (int i = 0; i < cores / nodes; i++)
+          groups.push_back(n);

   // In case a core has more than one logical processor (we assume 2) and we
   // have still threads to allocate, then spread them evenly across available
-  // groups.
+  // nodes.
   for (int t = 0; t < threads - cores; t++)
-      core_groups.push_back(t % groups);
+      groups.push_back(t % nodes);

   // If we still have more threads than the total number of logical processors
   // then return -1 and let the OS to decide what to do.
-  return idx < core_groups.size() ? core_groups[idx] : -1;
+  return idx < groups.size() ? groups[idx] : -1;
 }

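To make the assignment concrete, here is a self-contained sketch of the same fill-then-spread logic as best_node(), with illustrative parameters (2 nodes, 16 cores, 32 threads are assumed values, not taken from the commit):

// Sketch of best_node()'s scheme: fill each node up to its share of
// physical cores first, then spread the remaining hyperthreads
// round-robin across nodes. With 2 nodes, 16 cores and 32 threads,
// indices 0-7 map to node 0, 8-15 to node 1, and 16+ alternate.
#include <cstddef>
#include <vector>

int node_for(std::size_t idx, int nodes, int cores, int threads) {
    std::vector<int> map;
    for (int n = 0; n < nodes; n++)
        for (int i = 0; i < cores / nodes; i++)
            map.push_back(n);
    for (int t = 0; t < threads - cores; t++)
        map.push_back(t % nodes);
    return idx < map.size() ? map[idx] : -1; // -1: let the OS decide
}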
@@ -571,22 +572,35 @@ int best_group(size_t idx) {
 void bindThisThread(size_t idx) {

   // Use only local variables to be thread-safe
-  int group = best_group(idx);
+  int node = best_node(idx);

-  if (group == -1)
+  if (node == -1)
       return;

   // Early exit if the needed API are not available at runtime
   HMODULE k32 = GetModuleHandle("Kernel32.dll");
   auto fun2 = (fun2_t)(void(*)())GetProcAddress(k32, "GetNumaNodeProcessorMaskEx");
   auto fun3 = (fun3_t)(void(*)())GetProcAddress(k32, "SetThreadGroupAffinity");
+  auto fun4 = (fun4_t)(void(*)())GetProcAddress(k32, "GetNumaNodeProcessorMask2");

   if (!fun2 || !fun3)
       return;

-  GROUP_AFFINITY affinity;
-  if (fun2(group, &affinity))
-      fun3(GetCurrentThread(), &affinity, nullptr);
+  if (!fun4) {
+      GROUP_AFFINITY affinity;
+      if (fun2(node, &affinity))
+          fun3(GetCurrentThread(), &affinity, nullptr);
+  } else {
+      // If a numa node has more than one processor group, we assume they are
+      // sized equal and we spread threads evenly across the groups.
+      USHORT elements, returnedElements;
+      elements = GetMaximumProcessorGroupCount();
+      GROUP_AFFINITY *affinity = (GROUP_AFFINITY*)malloc(
+          elements * sizeof(GROUP_AFFINITY));
+      if (fun4(node, affinity, elements, &returnedElements))
+          fun3(GetCurrentThread(), &affinity[idx % returnedElements], nullptr);
+      free(affinity);
+  }
 }

 #endif
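For context on how this entry point is used, a hypothetical caller sketch follows (it compiles against the declaration mirrored from misc.h and links against misc.cpp; the launcher itself is illustrative, not part of this commit):

// Hypothetical caller sketch: each worker pins itself by thread index.
// bindThisThread() returns early when the node is -1 or the APIs are
// unavailable, so calling it unconditionally is safe.
#include <cstddef>
#include <thread>
#include <vector>

namespace WinProcGroup { void bindThisThread(std::size_t idx); } // as in misc.h

void worker(std::size_t idx) {
    WinProcGroup::bindThisThread(idx); // bind before doing any heavy work
    // ... per-thread search loop would run here ...
}

int main() {
    std::vector<std::thread> pool;
    for (std::size_t i = 0; i < 4; ++i)
        pool.emplace_back(worker, i);
    for (auto& t : pool)
        t.join();
}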