Mirror of https://github.com/HChaZZY/Stockfish.git, synced 2025-12-24 19:16:49 +08:00
Move the observed feature collection to the threaded part now that it can be done safely.
@@ -400,7 +400,7 @@ inline uint64_t mul_hi64(uint64_t a, uint64_t b) {
 // This bitset can be accessed concurrently, provided
 // the concurrent accesses are performed on distinct
 // instances of the underlying type. That means the concurrent
 // accesses need to be spaced by at least
 // bits_per_bucket bits.
 // But at least best_concurrent_access_stride bits
 // is recommended to prevent false sharing.
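The spacing rule above follows directly from integer division into buckets. The snippet below is a small illustration of it, assuming bits_per_bucket is 64 (one uint64_t per bucket), which the hunk itself does not state:

    #include <cstdint>

    constexpr std::uint64_t bits_per_bucket = 64;  // assumed bucket width

    // Indices at least bits_per_bucket apart can never share a bucket, so
    // unsynchronized writes to them touch distinct uint64_t objects.
    static_assert(63 / bits_per_bucket == 0 && 64 / bits_per_bucket == 1,
                  "indices spaced by bits_per_bucket land in different buckets");
    static_assert(0 / bits_per_bucket == 63 / bits_per_bucket,
                  "indices closer than bits_per_bucket may share a bucket and may race");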
@@ -418,6 +418,11 @@ public:
     constexpr static uint64_t num_buckets = (num_bits + bits_per_bucket - 1) / bits_per_bucket;
     constexpr static uint64_t best_concurrent_access_stride = 8 * cache_line_size;

+    LargeBitset()
+    {
+        std::fill(std::begin(bits), std::end(bits), 0);
+    }
+
     void set(uint64_t idx)
     {
         const uint64_t bucket = idx / bits_per_bucket;
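The hunk stops just inside set(). Below is a minimal, self-contained sketch of how such a bitset plausibly continues; the class name, the uint64_t bucket type, the 64-byte cache line and the bit-offset arithmetic are assumptions, only the constants mirror the diff:

    #include <algorithm>
    #include <cstdint>
    #include <iterator>

    template <std::uint64_t num_bits>
    class LargeBitsetSketch {
    public:
        constexpr static std::uint64_t cache_line_size = 64;
        constexpr static std::uint64_t bits_per_bucket = 64;
        constexpr static std::uint64_t num_buckets = (num_bits + bits_per_bucket - 1) / bits_per_bucket;
        constexpr static std::uint64_t best_concurrent_access_stride = 8 * cache_line_size;

        LargeBitsetSketch()
        {
            std::fill(std::begin(bits), std::end(bits), 0);
        }

        void set(std::uint64_t idx)
        {
            const std::uint64_t bucket = idx / bits_per_bucket;
            const std::uint64_t bit    = idx % bits_per_bucket;
            // Distinct buckets are distinct uint64_t objects, so two threads
            // that only ever touch different buckets never race on this store.
            bits[bucket] |= std::uint64_t(1) << bit;
        }

        bool test(std::uint64_t idx) const
        {
            return (bits[idx / bits_per_bucket] >> (idx % bits_per_bucket)) & 1;
        }

    private:
        std::uint64_t bits[num_buckets];
    };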
@@ -203,7 +203,7 @@ namespace Eval::NNUE {
             min_pre_activation_ = m128_hmin_ps(_mm_min_ps(min_pre_activation0, min_pre_activation1));
             max_pre_activation_ = m128_hmax_ps(_mm_max_ps(max_pre_activation0, max_pre_activation1));

             for (IndexType b = 0; b < batch.size(); ++b)
             {
                 const IndexType batch_offset = kOutputDimensions * b;
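m128_hmin_ps and m128_hmax_ps themselves are not shown in this diff. A common way to write such a horizontal reduction over an __m128, sketched here under the assumption that the helper simply folds the four lanes down to one, is:

    #include <xmmintrin.h>

    // Fold the high pair of lanes onto the low pair, then fold the remaining
    // two lanes; the scalar result ends up in lane 0.
    inline float horizontal_min_ps(__m128 v) {
        __m128 hi = _mm_movehl_ps(v, v);                            // lanes {2, 3, 2, 3}
        __m128 m  = _mm_min_ps(v, hi);                              // {min(0,2), min(1,3), ...}
        __m128 sh = _mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1));  // broadcast lane 1
        return _mm_cvtss_f32(_mm_min_ss(m, sh));                    // min of all four lanes
    }

The max counterpart would be identical with _mm_max_ps / _mm_max_ss in place of the min intrinsics.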
@@ -283,7 +283,7 @@ namespace Eval::NNUE {
                 learning_rate * learning_rate_scale_;

 #if defined (USE_SSE2)

             {
                 static_assert(kHalfDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time");
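The static_assert documents the vector-width assumption: under USE_SSE2 the update walks the weights in blocks of 16 floats, i.e. four __m128 registers per step. The loop below is only an illustrative sketch of that access pattern; the weight/gradient arrays and the function name are hypothetical, not the trainer's actual kernel:

    #include <emmintrin.h>

    // w and g are hypothetical weight/gradient arrays whose length n is a
    // multiple of 16, as the static_assert above guarantees for kHalfDimensions.
    // Each outer step consumes 16 floats as four __m128 operations.
    inline void scaled_add_16(float* w, const float* g, float scale, unsigned n) {
        const __m128 s = _mm_set1_ps(scale);
        for (unsigned i = 0; i < n; i += 16)
            for (unsigned j = 0; j < 16; j += 4) {
                const __m128 wv = _mm_loadu_ps(w + i + j);
                const __m128 gv = _mm_loadu_ps(g + i + j);
                _mm_storeu_ps(w + i + j, _mm_add_ps(wv, _mm_mul_ps(s, gv)));
            }
    }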
@@ -408,10 +408,26 @@ namespace Eval::NNUE {
                 for (IndexType c = 0; c < 2; ++c) {
                     const IndexType output_offset = batch_offset + kHalfDimensions * c;
                     for (const auto& feature : (*batch_)[b].training_features[c]) {
-                        if (feature.get_index() % num_threads != thread_index)
+                        const IndexType feature_index = feature.get_index();
+
+                        // We assign each thread a contiguous range of bits at least
+                        // one cache line in size to prevent false sharing.
+                        // For HalfKP this is enough to saturate about 80 threads.
+                        const IndexType thread_bucket =
+                            (feature_index / BitsetType::best_concurrent_access_stride)
+                            % num_threads;
+
+                        if (thread_bucket != thread_index)
                             continue;
+
+                        // This operation can be performed safely because
+                        // each thread accesses a different memory location
+                        // (even a different cache line).
+                        observed_features.set(feature_index);

                         const IndexType weights_offset =
-                            kHalfDimensions * feature.get_index();
+                            kHalfDimensions * feature_index;

                         const auto scale = static_cast<LearnFloatType>(
                             effective_learning_rate / feature.get_count());
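The owner computation above can be checked in isolation. The numbers below (a 512-bit stride matching 8 * 64 and 4 threads) are assumptions chosen only to show how consecutive stride-sized blocks of feature indices rotate across threads, which keeps each thread's writes into observed_features at least a cache line apart from every other thread's writes:

    #include <cstdint>

    constexpr std::uint64_t stride      = 512;  // stands in for BitsetType::best_concurrent_access_stride
    constexpr std::uint64_t num_threads = 4;

    constexpr std::uint64_t owner(std::uint64_t feature_index) {
        return (feature_index / stride) % num_threads;
    }

    static_assert(owner(0) == 0 && owner(511) == 0, "a whole block belongs to one thread");
    static_assert(owner(512) == 1, "the next block goes to the next thread");
    static_assert(owner(4 * 512) == 0, "ownership wraps after num_threads blocks");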
@@ -438,14 +454,6 @@ namespace Eval::NNUE {
                     }
                 );

-        for (IndexType b = 0; b < batch_->size(); ++b) {
-            for (IndexType c = 0; c < 2; ++c) {
-                for (const auto& feature : (*batch_)[b].training_features[c]) {
-                    observed_features.set(feature.get_index());
-                }
-            }
-        }
-
         thread_pool.wait_for_workers_finished();
     }
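The serial collection loop can be deleted because each worker now records the features it owns before the synchronization point. The sketch below shows that shape with std::thread standing in for the project's thread_pool; only wait_for_workers_finished is named in the diff, everything else here is an assumption:

    #include <functional>
    #include <thread>
    #include <vector>

    // The join loop plays the role of thread_pool.wait_for_workers_finished().
    // After it returns, every worker's writes to observed_features are visible,
    // so no follow-up serial pass over the batch is needed.
    void run_workers(unsigned num_threads, const std::function<void(unsigned)>& body) {
        std::vector<std::thread> workers;
        for (unsigned t = 0; t < num_threads; ++t)
            workers.emplace_back(body, t);
        for (auto& w : workers)
            w.join();
    }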
@@ -628,7 +636,8 @@ namespace Eval::NNUE {
         std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;

         // Features that appeared in the training data
-        std::bitset<kInputDimensions> observed_features;
+        using BitsetType = LargeBitset<kInputDimensions>;
+        BitsetType observed_features;

         // hyper parameter
         LearnFloatType momentum_;
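The member type changes because std::bitset::set is a read-modify-write on a shared word, so two threads setting even different bits may race, whereas LargeBitset lets each thread write only to buckets it owns (and, with the stride used above, only to its own cache lines). The snippet below merely illustrates the word sharing; it does not reproduce the race:

    #include <bitset>

    int main() {
        std::bitset<1024> observed;
        // Bits 0 and 1 live in the same underlying word, so calling these two
        // set()s from different threads without synchronization would be a
        // data race even though the bit positions differ.
        observed.set(0);
        observed.set(1);
        return observed.count() == 2 ? 0 : 1;
    }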