diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 93262b42..66461cc5 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -704,7 +704,7 @@ namespace Learner
             // should be no real issues happening since
             // the read/write phases are isolated.
             atomic_thread_fence(memory_order_seq_cst);
-            Eval::NNUE::update_parameters(epoch, params.verbose, params.learning_rate, calc_grad);
+            Eval::NNUE::update_parameters(Threads, epoch, params.verbose, params.learning_rate, calc_grad);
             atomic_thread_fence(memory_order_seq_cst);

             if (++save_count * params.mini_batch_size >= params.eval_save_interval)
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 4de939c5..6294865d 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -18,6 +18,7 @@
 #include "uci.h"
 #include "misc.h"
 #include "thread_win32_osx.h"
+#include "thread.h"

 // Code for learning NNUE evaluation function
 namespace Eval::NNUE {
@@ -180,6 +181,7 @@

     // update the evaluation function parameters
     void update_parameters(
+        ThreadPool& thread_pool,
         uint64_t epoch,
         bool verbose,
         double learning_rate,
@@ -202,7 +204,7 @@
             std::vector<Example> batch(examples.end() - batch_size, examples.end());
             examples.resize(examples.size() - batch_size);

-            const auto network_output = trainer->propagate(batch);
+            const auto network_output = trainer->propagate(thread_pool, batch);

             std::vector<LearnFloatType> gradients(batch.size());
             for (std::size_t b = 0; b < batch.size(); ++b) {
@@ -226,7 +228,7 @@
                 }
             }

-            trainer->backpropagate(gradients.data(), learning_rate);
+            trainer->backpropagate(thread_pool, gradients.data(), learning_rate);

             collect_stats = false;
         }
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index d350691b..8633f713 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -5,6 +5,8 @@

 #include "misc.h"

+struct ThreadPool;
+
 // Interface used for learning NNUE evaluation function
 namespace Eval::NNUE {
@@ -32,6 +34,7 @@

     // update the evaluation function parameters
     void update_parameters(
+        ThreadPool& thread_pool,
         uint64_t epoch,
         bool verbose,
         double learning_rate,
diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index 449a0a11..5d2f29c9 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -7,6 +7,8 @@

 #include "nnue/layers/affine_transform.h"

+#include "thread.h"
+
 #include <random>

 // Specialization of NNUE evaluation function learning class template for AffineTransform
 namespace Eval::NNUE {
@@ -88,14 +90,14 @@
         }
         // forward propagation
-        const LearnFloatType* propagate(const std::vector<Example>& batch) {
+        const LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector<Example>& batch) {
             if (output_.size() < kOutputDimensions * batch.size()) {
                 output_.resize(kOutputDimensions * batch.size());
                 gradients_.resize(kInputDimensions * batch.size());
             }

             batch_size_ = static_cast<IndexType>(batch.size());
-            batch_input_ = previous_layer_trainer_->propagate(batch);
+            batch_input_ = previous_layer_trainer_->propagate(thread_pool, batch);

 #if defined(USE_BLAS)
             for (IndexType b = 0; b < batch_size_; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
@@ -127,7 +129,8 @@
         }

         // backpropagation
-        void backpropagate(const LearnFloatType* gradients,
+        void backpropagate(ThreadPool& thread_pool,
+                           const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {

             const LearnFloatType local_learning_rate =
@@ -211,7 +214,7 @@
             }
             num_weights_diffs_ += kOutputDimensions * kInputDimensions;

-            previous_layer_trainer_->backpropagate(gradients_.data(), learning_rate);
+            previous_layer_trainer_->backpropagate(thread_pool, gradients_.data(), learning_rate);
         }

     private:
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index 5f2ff065..8e29e4a1 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -7,6 +7,8 @@

 #include "nnue/layers/clipped_relu.h"

+#include "thread.h"
+
 // Specialization of NNUE evaluation function learning class template for ClippedReLU
 namespace Eval::NNUE {
@@ -41,13 +43,13 @@
         }

         // forward propagation
-        const LearnFloatType* propagate(const std::vector<Example>& batch) {
+        const LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector<Example>& batch) {
             if (output_.size() < kOutputDimensions * batch.size()) {
                 output_.resize(kOutputDimensions * batch.size());
                 gradients_.resize(kInputDimensions * batch.size());
             }

-            const auto input = previous_layer_trainer_->propagate(batch);
+            const auto input = previous_layer_trainer_->propagate(thread_pool, batch);
             batch_size_ = static_cast<IndexType>(batch.size());
             for (IndexType b = 0; b < batch_size_; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
@@ -63,7 +65,8 @@
         }

         // backpropagation
-        void backpropagate(const LearnFloatType* gradients,
+        void backpropagate(ThreadPool& thread_pool,
+                           const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {

             for (IndexType b = 0; b < batch_size_; ++b) {
@@ -77,7 +80,7 @@
             }
             num_total_ += batch_size_ * kOutputDimensions;

-            previous_layer_trainer_->backpropagate(gradients_.data(), learning_rate);
+            previous_layer_trainer_->backpropagate(thread_pool, gradients_.data(), learning_rate);
         }

     private:
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 9f0648d2..a778f956 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -9,6 +9,8 @@

 #include "nnue/nnue_feature_transformer.h"

+#include "thread.h"
+
 #include
 #include
 #include
@@ -90,12 +92,14 @@
         }

         // forward propagation
-        const LearnFloatType* propagate(const std::vector<Example>& batch) {
+        const LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector<Example>& batch) {
             if (output_.size() < kOutputDimensions * batch.size()) {
                 output_.resize(kOutputDimensions * batch.size());
                 gradients_.resize(kOutputDimensions * batch.size());
             }

+            (void)thread_pool;
+
             batch_ = &batch;
             // affine transform
 #pragma omp parallel for
@@ -143,9 +147,12 @@
         }

         // backpropagation
-        void backpropagate(const LearnFloatType* gradients,
+        void backpropagate(ThreadPool& thread_pool,
+                           const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {

+            (void)thread_pool;
+
             const LearnFloatType local_learning_rate =
                 learning_rate * learning_rate_scale_;
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index 9b8e5e13..4bb38104 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -7,6 +7,8 @@

 #include "nnue/layers/input_slice.h"

+#include "thread.h"
+
 // Specialization of NNUE evaluation function learning class template for InputSlice
 namespace Eval::NNUE {
@@ -60,7 +62,7 @@ namespace Eval::NNUE {
         }

         // forward propagation
-        const LearnFloatType* propagate(const std::vector<Example>& batch) {
+        const LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector<Example>& batch) {
             if (gradients_.size() < kInputDimensions * batch.size()) {
                 gradients_.resize(kInputDimensions * batch.size());
             }
@@ -69,7 +71,7 @@ namespace Eval::NNUE {
             if (num_calls_ == 0) {
                 current_operation_ = Operation::kPropagate;
-                output_ = feature_transformer_trainer_->propagate(batch);
+                output_ = feature_transformer_trainer_->propagate(thread_pool, batch);
             }

             assert(current_operation_ == Operation::kPropagate);

@@ -83,11 +85,12 @@ namespace Eval::NNUE {
         }

         // backpropagation
-        void backpropagate(const LearnFloatType* gradients,
+        void backpropagate(ThreadPool& thread_pool,
+                           const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {

             if (num_referrers_ == 1) {
-                feature_transformer_trainer_->backpropagate(gradients, learning_rate);
+                feature_transformer_trainer_->backpropagate(thread_pool, gradients, learning_rate);
                 return;
             }

@@ -112,7 +115,7 @@
             if (++num_calls_ == num_referrers_) {
                 feature_transformer_trainer_->backpropagate(
-                    gradients_.data(), learning_rate);
+                    thread_pool, gradients_.data(), learning_rate);
                 num_calls_ = 0;
                 current_operation_ = Operation::kNone;
             }

@@ -193,7 +196,7 @@
         }

         // forward propagation
-        const LearnFloatType* propagate(const std::vector<Example>& batch) {
+        const LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector<Example>& batch) {
             if (output_.size() < kOutputDimensions * batch.size()) {
                 output_.resize(kOutputDimensions * batch.size());
                 gradients_.resize(kInputDimensions * batch.size());
@@ -201,7 +204,7 @@

             batch_size_ = static_cast<IndexType>(batch.size());

-            const auto input = shared_input_trainer_->propagate(batch);
+            const auto input = shared_input_trainer_->propagate(thread_pool, batch);
             for (IndexType b = 0; b < batch_size_; ++b) {
                 const IndexType input_offset = kInputDimensions * b;
                 const IndexType output_offset = kOutputDimensions * b;
@@ -219,7 +222,8 @@
         }

         // backpropagation
-        void backpropagate(const LearnFloatType* gradients,
+        void backpropagate(ThreadPool& thread_pool,
+                           const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {

             for (IndexType b = 0; b < batch_size_; ++b) {
@@ -233,7 +237,7 @@
                     }
                 }
             }
-            shared_input_trainer_->backpropagate(gradients_.data(), learning_rate);
+            shared_input_trainer_->backpropagate(thread_pool, gradients_.data(), learning_rate);
         }

     private:
diff --git a/src/nnue/trainer/trainer_sum.h b/src/nnue/trainer/trainer_sum.h
index b35420d6..6defb95f 100644
--- a/src/nnue/trainer/trainer_sum.h
+++ b/src/nnue/trainer/trainer_sum.h
@@ -7,6 +7,8 @@

 #include "nnue/layers/sum.h"

+#include "thread.h"
+
 // Specialization of NNUE evaluation function learning class template for Sum
 namespace Eval::NNUE {
@@ -45,10 +47,10 @@
         }

         // forward propagation
-        /*const*/ LearnFloatType* propagate(const std::vector<Example>& batch) {
+        /*const*/ LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector<Example>& batch) {
             batch_size_ = static_cast<IndexType>(batch.size());
-            auto output = Tail::propagate(batch);
-            const auto head_output = previous_layer_trainer_->propagate(batch);
+            auto output = Tail::propagate(thread_pool, batch);
+            const auto head_output = previous_layer_trainer_->propagate(thread_pool, batch);

 #if defined(USE_BLAS)
             cblas_saxpy(kOutputDimensions * batch_size_, 1.0,
@@ -66,11 +68,12 @@
         }

         // backpropagation
-        void backpropagate(const LearnFloatType* gradients,
+        void backpropagate(ThreadPool& thread_pool,
+                           const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {

-            Tail::backpropagate(gradients, learning_rate);
-            previous_layer_trainer_->backpropagate(gradients, learning_rate);
+            Tail::backpropagate(thread_pool, gradients, learning_rate);
+            previous_layer_trainer_->backpropagate(thread_pool, gradients, learning_rate);
         }

     private:
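
Note: with this patch the ThreadPool reference is plumbing only. The feature-transformer trainer still discards it ((void)thread_pool) and keeps its #pragma omp loops, while every propagate()/backpropagate() signature now carries the pool, so a later switch away from OpenMP stays local to each trainer body. The sketch below is a minimal illustration of that follow-up step, not part of the patch: it assumes only that ThreadPool::size() reports the configured thread count (ThreadPool is a std::vector<Thread*> in Stockfish); parallel_for_batch is a hypothetical helper built on plain std::thread, not an existing Stockfish API.

// Minimal sketch (not part of the patch): split a trainer's per-sample work
// across the thread count taken from the ThreadPool it now receives.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <thread>
#include <vector>

using IndexType = std::uint32_t;   // matches the trainers' index type

// Hypothetical helper: run fn(b) for every sample index in [0, batch_size),
// partitioned into contiguous chunks over num_threads std::thread workers.
template <typename Func>
void parallel_for_batch(IndexType batch_size, std::size_t num_threads, Func fn) {
    num_threads = std::max<std::size_t>(num_threads, 1);
    const IndexType chunk =
        static_cast<IndexType>((batch_size + num_threads - 1) / num_threads);

    std::vector<std::thread> workers;
    for (std::size_t t = 0; t < num_threads; ++t) {
        const IndexType begin = static_cast<IndexType>(t) * chunk;
        const IndexType end   = std::min(batch_size, begin + chunk);
        if (begin >= end)
            break;
        workers.emplace_back([=] {
            for (IndexType b = begin; b < end; ++b)
                fn(b);   // each worker handles a disjoint slice of the batch
        });
    }
    for (auto& w : workers)
        w.join();
}

// Usage inside a propagate()-style member, assuming thread_pool.size()
// reports the configured number of threads:
//
//   parallel_for_batch(batch_size_, thread_pool.size(), [&](IndexType b) {
//       const IndexType batch_offset = kOutputDimensions * b;
//       // fill output_[batch_offset + i] for this sample only
//   });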