Use 128 bit multiply for TT index

Remove super cluster stuff from TT and just use a 128 bit multiply.

STC https://tests.stockfishchess.org/tests/view/5ee719b3aae8aec816ab7548
LLR: 2.94 (-2.94,2.94) {-1.50,0.50}
Total: 12736 W: 2502 L: 2333 D: 7901
Ptnml(0-2): 191, 1452, 2944, 1559, 222

LTC https://tests.stockfishchess.org/tests/view/5ee732d1aae8aec816ab7556
LLR: 2.93 (-2.94,2.94) {-1.50,0.50}
Total: 27584 W: 3431 L: 3350 D: 20803
Ptnml(0-2): 173, 2500, 8400, 2511, 208

Scheme back to being derived from https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/

Also the default optimized version of the index calculation now uses fewer instructions.
https://godbolt.org/z/Tktxbv
Might benefit from mulx (requires -mbmi2)

closes https://github.com/official-stockfish/Stockfish/pull/2744

bench: 4320954
This commit is contained in:
mstembera
2020-06-14 23:35:07 -07:00
committed by Joost VandeVondele
parent 995ee4b311
commit 1ea488d34c
5 changed files with 22 additions and 22 deletions

View File

@@ -36,17 +36,17 @@ TranspositionTable TT; // Our global transposition table
void TTEntry::save(Key k, Value v, bool pv, Bound b, Depth d, Move m, Value ev) {
// Preserve any existing move for the same position
if (m || (k >> 48) != key16)
if (m || (uint16_t)k != key16)
move16 = (uint16_t)m;
// Overwrite less valuable entries
if ( (k >> 48) != key16
if ((uint16_t)k != key16
|| d - DEPTH_OFFSET > depth8 - 4
|| b == BOUND_EXACT)
{
assert(d >= DEPTH_OFFSET);
key16 = (uint16_t)(k >> 48);
key16 = (uint16_t)k;
value16 = (int16_t)v;
eval16 = (int16_t)ev;
genBound8 = (uint8_t)(TT.generation8 | uint8_t(pv) << 2 | b);
@@ -65,10 +65,8 @@ void TranspositionTable::resize(size_t mbSize) {
aligned_ttmem_free(mem);
superClusterCount = mbSize * 1024 * 1024 / (sizeof(Cluster) * ClustersPerSuperCluster);
table = static_cast<Cluster*>(
aligned_ttmem_alloc(superClusterCount * ClustersPerSuperCluster * sizeof(Cluster), mem));
clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster);
table = static_cast<Cluster*>(aligned_ttmem_alloc(clusterCount * sizeof(Cluster), mem));
if (!mem)
{
std::cerr << "Failed to allocate " << mbSize
@@ -91,8 +89,6 @@ void TranspositionTable::clear() {
{
threads.emplace_back([this, idx]() {
const size_t clusterCount = superClusterCount * ClustersPerSuperCluster;
// Thread binding gives faster search on systems with a first-touch policy
if (Options["Threads"] > 8)
WinProcGroup::bindThisThread(idx);
@@ -121,7 +117,7 @@ void TranspositionTable::clear() {
TTEntry* TranspositionTable::probe(const Key key, bool& found) const {
TTEntry* const tte = first_entry(key);
const uint16_t key16 = key >> 48; // Use the high 16 bits as key inside the cluster
const uint16_t key16 = (uint16_t)key; // Use the low 16 bits as key inside the cluster
for (int i = 0; i < ClusterSize; ++i)
if (!tte[i].key16 || tte[i].key16 == key16)