diff --git a/src/misc.h b/src/misc.h
index e564311f..020fa9b5 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -400,7 +400,7 @@ inline uint64_t mul_hi64(uint64_t a, uint64_t b) {
 // This bitset can be accessed concurrently, provided
 // the concurrent accesses are performed on distinct
 // instances of the underlying type. That means the concurrent
-// accesses need to be spaced by at least 
+// accesses need to be spaced by at least
 // bits_per_bucket bits.
 // But a spacing of at least best_concurrent_access_stride bits
 // is recommended to prevent false sharing.
@@ -418,6 +418,11 @@ public:
     constexpr static uint64_t num_buckets = (num_bits + bits_per_bucket - 1) / bits_per_bucket;
     constexpr static uint64_t best_concurrent_access_stride = 8 * cache_line_size;
 
+    LargeBitset()
+    {
+        std::fill(std::begin(bits), std::end(bits), 0);
+    }
+
     void set(uint64_t idx)
     {
         const uint64_t bucket = idx / bits_per_bucket;
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 3062e432..419cdf5e 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -203,7 +203,7 @@ namespace Eval::NNUE {
             min_pre_activation_ = m128_hmin_ps(_mm_min_ps(min_pre_activation0, min_pre_activation1));
             max_pre_activation_ = m128_hmax_ps(_mm_max_ps(max_pre_activation0, max_pre_activation1));
 
-            for (IndexType b = 0; b < batch.size(); ++b) 
+            for (IndexType b = 0; b < batch.size(); ++b)
             {
                 const IndexType batch_offset = kOutputDimensions * b;
 
@@ -283,7 +283,7 @@ namespace Eval::NNUE {
                 learning_rate * learning_rate_scale_;
 
 #if defined (USE_SSE2)
-            
+
             {
                 static_assert(kHalfDimensions % 16 == 0,
                     "This implementation assumes that it can process 16 floats at a time");
@@ -408,10 +408,26 @@ namespace Eval::NNUE {
                 for (IndexType c = 0; c < 2; ++c) {
                     const IndexType output_offset = batch_offset + kHalfDimensions * c;
                     for (const auto& feature : (*batch_)[b].training_features[c]) {
-                        if (feature.get_index() % num_threads != thread_index)
+                        const IndexType feature_index = feature.get_index();
+
+                        // We assign each thread bucket a contiguous range of bits
+                        // of at least cache line size to prevent false sharing.
+                        // For HalfKP this is enough to saturate about 80 threads.
+                        const IndexType thread_bucket =
+                            (feature_index / BitsetType::best_concurrent_access_stride)
+                            % num_threads;
+
+                        if (thread_bucket != thread_index)
                             continue;
+
+                        // This operation can be performed safely because
+                        // each thread accesses a different memory location
+                        // (even a different cache line)
+                        observed_features.set(feature_index);
+
                         const IndexType weights_offset =
-                            kHalfDimensions * feature.get_index();
+                            kHalfDimensions * feature_index;
+
                         const auto scale = static_cast<LearnFloatType>(
                             effective_learning_rate / feature.get_count());
 
@@ -438,14 +454,6 @@ namespace Eval::NNUE {
                 }
             );
 
-            for (IndexType b = 0; b < batch_->size(); ++b) {
-                for (IndexType c = 0; c < 2; ++c) {
-                    for (const auto& feature : (*batch_)[b].training_features[c]) {
-                        observed_features.set(feature.get_index());
-                    }
-                }
-            }
-
             thread_pool.wait_for_workers_finished();
         }
 
@@ -628,7 +636,8 @@ namespace Eval::NNUE {
         std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
 
         // Features that appeared in the training data
-        std::bitset<kInputDimensions> observed_features;
+        using BitsetType = LargeBitset<kInputDimensions>;
+        BitsetType observed_features;
 
         // hyper parameter
         LearnFloatType momentum_;
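
For context, here is a minimal self-contained sketch of the LargeBitset contract the misc.h hunk relies on. It is illustrative only, not the actual misc.h implementation: the BucketType choice, cache_line_size = 64, and the test() helper are assumptions. The point it demonstrates is that set() is a plain read-modify-write, so concurrent calls are safe only while no two threads touch the same bucket, and striding by best_concurrent_access_stride additionally keeps threads on distinct cache lines.

// Hypothetical sketch of the LargeBitset contract; not the real src/misc.h.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <thread>
#include <vector>

constexpr std::size_t cache_line_size = 64; // assumed; typical on x86

template <uint64_t num_bits>
struct LargeBitset {
    using BucketType = uint64_t; // assumed underlying type

    constexpr static uint64_t bits_per_bucket = 8 * sizeof(BucketType);
    constexpr static uint64_t num_buckets =
        (num_bits + bits_per_bucket - 1) / bits_per_bucket;
    // Accesses at least this many bits apart never share a cache line.
    constexpr static uint64_t best_concurrent_access_stride = 8 * cache_line_size;

    LargeBitset() { std::fill(std::begin(bits), std::end(bits), 0); }

    // Plain read-modify-write: concurrent calls are safe only when no two
    // threads hit the same bucket, i.e. indices are >= bits_per_bucket apart.
    void set(uint64_t idx) {
        bits[idx / bits_per_bucket] |= BucketType(1) << (idx % bits_per_bucket);
    }

    bool test(uint64_t idx) const {
        return (bits[idx / bits_per_bucket] >> (idx % bits_per_bucket)) & 1;
    }

    BucketType bits[num_buckets];
};

int main() {
    constexpr uint64_t kBits = 1 << 16;
    LargeBitset<kBits> bitset;
    const unsigned num_threads = 4;

    // Same ownership rule as the trainer hunk: each thread only sets bits
    // inside the stride-sized chunks it owns, so every bucket (and every
    // cache line) is written by exactly one thread and no atomics are needed.
    std::vector<std::thread> workers;
    for (unsigned t = 0; t < num_threads; ++t)
        workers.emplace_back([&bitset, t, num_threads] {
            for (uint64_t idx = 0; idx < kBits; ++idx)
                if ((idx / LargeBitset<kBits>::best_concurrent_access_stride)
                        % num_threads == t)
                    bitset.set(idx);
        });
    for (auto& w : workers)
        w.join();
}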
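
The ownership rule also explains both trainer changes at once: the thread that applies a feature's gradient is now the only thread allowed to mark that feature in observed_features, which is why the serial marking loop after the parallel section could be deleted. The sketch below checks the invariant; the HalfKP dimension 64 * 641 = 41024 and the 512-bit stride are assumptions, and 41024 / 512 ≈ 80 is where the "saturate about 80 threads" figure in the added comment appears to come from.

// Sketch verifying the no-race invariant behind the trainer hunk, assuming
// bits_per_bucket = 64 and best_concurrent_access_stride = 8 * 64 = 512.
#include <cassert>
#include <cstdint>

int main() {
    constexpr uint64_t bits_per_bucket  = 64;       // 8 * sizeof(uint64_t)
    constexpr uint64_t stride           = 8 * 64;   // best_concurrent_access_stride
    constexpr uint64_t num_threads      = 8;
    constexpr uint64_t kInputDimensions = 64 * 641; // HalfKP feature count (assumed)

    // ~80 stride-sized chunks exist, so extra threads beyond that get no
    // work, matching the "saturate about 80 threads" comment.
    static_assert(kInputDimensions / stride == 80, "about 80 chunks for HalfKP");

    for (uint64_t i = 0; i + 1 < kInputDimensions; ++i) {
        // Because the stride is a whole number of buckets, two indices in the
        // same bucket (same uint64_t, same cache line) always map to the same
        // thread bucket, so the non-atomic set() in the hunk cannot race.
        if (i / bits_per_bucket == (i + 1) / bits_per_bucket)
            assert((i / stride) % num_threads == ((i + 1) / stride) % num_threads);
    }

    // Every index has exactly one owning thread, so the per-thread
    // observed_features.set() calls together cover all observed features,
    // making the removed post-hoc marking loop redundant.
    return 0;
}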