From a3c78691a23fd743e2a815b65594609683b87d9c Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk
Date: Sun, 22 Nov 2020 20:44:26 +0100
Subject: [PATCH] Prepare input slice trainer.

---
 src/nnue/trainer/trainer_input_slice.h | 181 ++++++++++++++++---------
 1 file changed, 115 insertions(+), 66 deletions(-)

diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index a93a3ea0..54f03d42 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -34,15 +34,15 @@ namespace Eval::NNUE {
 
         // Set options such as hyperparameters
         void send_message(Message* message) {
-            if (num_calls_ == 0) {
+            if (num_calls_[0] == 0) {
                 current_operation_ = Operation::kSendMessage;
                 feature_transformer_trainer_->send_message(message);
             }
 
             assert(current_operation_ == Operation::kSendMessage);
 
-            if (++num_calls_ == num_referrers_) {
-                num_calls_ = 0;
+            if (++num_calls_[0] == num_referrers_) {
+                num_calls_[0] = 0;
                 current_operation_ = Operation::kNone;
             }
         }
@@ -50,55 +50,79 @@ namespace Eval::NNUE {
         // Initialize the parameters with random numbers
         template <typename RNG>
         void initialize(RNG& rng) {
-            if (num_calls_ == 0) {
+            if (num_calls_[0] == 0) {
                 current_operation_ = Operation::kInitialize;
                 feature_transformer_trainer_->initialize(rng);
             }
 
             assert(current_operation_ == Operation::kInitialize);
 
-            if (++num_calls_ == num_referrers_) {
-                num_calls_ = 0;
+            if (++num_calls_[0] == num_referrers_) {
+                num_calls_[0] = 0;
                 current_operation_ = Operation::kNone;
             }
         }
 
-        // forward propagation
-        const LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector<Example>& batch) {
-            if (gradients_.size() < kInputDimensions * batch.size()) {
-                gradients_.resize(kInputDimensions * batch.size());
+        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch) {
+            if (gradients_.size() < kInputDimensions * combined_batch.size()) {
+                gradients_.resize(kInputDimensions * combined_batch.size());
             }
 
-            batch_size_ = static_cast<IndexType>(batch.size());
-
-            if (num_calls_ == 0) {
-                current_operation_ = Operation::kPropagate;
-                output_ = feature_transformer_trainer_->propagate(thread_pool, batch);
+            if (num_calls_.size() < thread_pool.size())
+            {
+                num_calls_.resize(thread_pool.size(), 0);
             }
 
-            assert(current_operation_ == Operation::kPropagate);
+            batch_size_ = static_cast<IndexType>(combined_batch.size());
 
-            if (++num_calls_ == num_referrers_) {
-                num_calls_ = 0;
+            if (num_calls_[0] == 0) {
+                current_operation_ = Operation::kStepStart;
+                output_ = feature_transformer_trainer_->step_start(thread_pool, combined_batch);
+            }
+
+            assert(current_operation_ == Operation::kStepStart);
+
+            if (++num_calls_[0] == num_referrers_) {
+                num_calls_[0] = 0;
                 current_operation_ = Operation::kNone;
             }
 
             return output_;
         }
 
+        // forward propagation
+        void propagate(Thread& th, uint64_t offset, uint64_t count) {
+            const auto thread_id = th.thread_idx();
+
+            if (num_calls_[thread_id] == 0) {
+                current_operation_ = Operation::kPropagate;
+                feature_transformer_trainer_->propagate(th, offset, count);
+            }
+
+            assert(current_operation_ == Operation::kPropagate);
+
+            if (++num_calls_[thread_id] == num_referrers_) {
+                num_calls_[thread_id] = 0;
+                current_operation_ = Operation::kNone;
+            }
+        }
+
         // backpropagation
-        void backpropagate(ThreadPool& thread_pool,
+        void backpropagate(Thread& th,
                            const LearnFloatType* gradients,
-                           LearnFloatType learning_rate) {
+                           uint64_t offset,
+                           uint64_t count) {
+
+            const auto thread_id = th.thread_idx();
 
             if (num_referrers_ == 1) {
-                feature_transformer_trainer_->backpropagate(thread_pool, gradients, learning_rate);
+                feature_transformer_trainer_->backpropagate(th, gradients, offset, count);
                 return;
             }
 
-            if (num_calls_ == 0) {
+            if (num_calls_[thread_id] == 0) {
                 current_operation_ = Operation::kBackPropagate;
-                for (IndexType b = 0; b < batch_size_; ++b) {
+                for (IndexType b = offset; b < offset + count; ++b) {
                     const IndexType batch_offset = kInputDimensions * b;
                     for (IndexType i = 0; i < kInputDimensions; ++i) {
                         gradients_[batch_offset + i] = static_cast<LearnFloatType>(0.0);
@@ -108,17 +132,31 @@ namespace Eval::NNUE {
 
             assert(current_operation_ == Operation::kBackPropagate);
 
-            for (IndexType b = 0; b < batch_size_; ++b) {
+            for (IndexType b = offset; b < offset + count; ++b) {
                 const IndexType batch_offset = kInputDimensions * b;
                 for (IndexType i = 0; i < kInputDimensions; ++i) {
                     gradients_[batch_offset + i] += gradients[batch_offset + i];
                 }
            }
 
-            if (++num_calls_ == num_referrers_) {
+            if (++num_calls_[thread_id] == num_referrers_) {
                 feature_transformer_trainer_->backpropagate(
-                    thread_pool, gradients_.data(), learning_rate);
-                num_calls_ = 0;
+                    th, gradients_.data(), offset, count);
+                num_calls_[thread_id] = 0;
+                current_operation_ = Operation::kNone;
+            }
+        }
+
+        void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) {
+            if (num_calls_[0] == 0) {
+                current_operation_ = Operation::kStepEnd;
+                feature_transformer_trainer_->step_end(thread_pool, learning_rate);
+            }
+
+            assert(current_operation_ == Operation::kStepEnd);
+
+            if (++num_calls_[0] == num_referrers_) {
+                num_calls_[0] = 0;
                 current_operation_ = Operation::kNone;
             }
         }
@@ -128,7 +166,7 @@ namespace Eval::NNUE {
         SharedInputTrainer(FeatureTransformer* ft) :
             batch_size_(0),
             num_referrers_(0),
-            num_calls_(0),
+            num_calls_(1, 0),
             current_operation_(Operation::kNone),
             feature_transformer_trainer_(Trainer<FeatureTransformer>::create(
                 ft)),
@@ -144,8 +182,10 @@ namespace Eval::NNUE {
             kNone,
             kSendMessage,
             kInitialize,
+            kStepStart,
             kPropagate,
             kBackPropagate,
+            kStepEnd,
         };
 
         // number of samples in mini-batch
@@ -155,7 +195,7 @@ namespace Eval::NNUE {
         std::uint32_t num_referrers_;
 
         // Number of times the current process has been called
-        std::uint32_t num_calls_;
+        std::vector<std::uint32_t> num_calls_;
 
        // current processing type
         Operation current_operation_;
@@ -197,74 +237,81 @@ namespace Eval::NNUE {
             shared_input_trainer_->initialize(rng);
         }
 
-        // forward propagation
-        const LearnFloatType* propagate(ThreadPool& thread_pool,const std::vector<Example>& batch) {
-            if (output_.size() < kOutputDimensions * batch.size()) {
-                output_.resize(kOutputDimensions * batch.size());
-                gradients_.resize(kInputDimensions * batch.size());
+        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch) {
+            if (output_.size() < kOutputDimensions * combined_batch.size()) {
+                output_.resize(kOutputDimensions * combined_batch.size());
+                gradients_.resize(kInputDimensions * combined_batch.size());
             }
 
-            batch_size_ = static_cast<IndexType>(batch.size());
+            batch_size_ = static_cast<IndexType>(combined_batch.size());
 
-            const auto input = shared_input_trainer_->propagate(thread_pool, batch);
-            for (IndexType b = 0; b < batch_size_; ++b) {
+            input_ = shared_input_trainer_->step_start(thread_pool, combined_batch);
+
+            return output_.data();
+        }
+
+        // forward propagation
+        void propagate(Thread& th, uint64_t offset, uint64_t count) {
+
+            shared_input_trainer_->propagate(th, offset, count);
+
+            for (IndexType b = offset; b < offset + count; ++b) {
                 const IndexType input_offset = kInputDimensions * b;
                 const IndexType output_offset =
                     kOutputDimensions * b;
 
 #if defined(USE_BLAS)
                 cblas_scopy(
-                    kOutputDimensions, &input[input_offset + Offset], 1,
+                    kOutputDimensions, &input_[input_offset + Offset], 1,
                     &output_[output_offset], 1
                 );
 #else
                 Blas::scopy(
-                    thread_pool,
-                    kOutputDimensions, &input[input_offset + Offset], 1,
+                    kOutputDimensions, &input_[input_offset + Offset], 1,
                     &output_[output_offset], 1
                 );
 #endif
             }
-
-            return output_.data();
         }
 
         // backpropagation
-        void backpropagate(ThreadPool& thread_pool,
+        void backpropagate(Thread& th,
                            const LearnFloatType* gradients,
-                           LearnFloatType learning_rate) {
+                           uint64_t offset,
+                           uint64_t count) {
 
-            thread_pool.for_each_index_with_workers(
-                0, batch_size_,
-                [&](Thread&, int b) {
-                    const IndexType input_offset = kInputDimensions * b;
-                    const IndexType output_offset = kOutputDimensions * b;
+            for (IndexType b = offset; b < offset + count; ++b)
+            {
+                const IndexType input_offset = kInputDimensions * b;
+                const IndexType output_offset = kOutputDimensions * b;
 
-                    IndexType i = 0;
-                    for (; i < Offset; ++i) {
-                        gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
-                    }
-
-                    for (; i < Offset + kOutputDimensions; ++i) {
-                        gradients_[input_offset + i] = gradients[output_offset + i - Offset];
-                    }
-
-                    for (; i < kInputDimensions; ++i)
-                    {
-                        gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
-                    }
+                IndexType i = 0;
+                for (; i < Offset; ++i) {
+                    gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
                 }
-            );
-            thread_pool.wait_for_workers_finished();
 
-            shared_input_trainer_->backpropagate(thread_pool, gradients_.data(), learning_rate);
+                for (; i < Offset + kOutputDimensions; ++i) {
+                    gradients_[input_offset + i] = gradients[output_offset + i - Offset];
+                }
+
+                for (; i < kInputDimensions; ++i)
+                {
+                    gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
+                }
+            }
+
+            shared_input_trainer_->backpropagate(th, gradients_.data(), offset, count);
+        }
+
+        void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) {
+            shared_input_trainer_->step_end(thread_pool, learning_rate);
         }
 
     private:
         // constructor
-        Trainer(FeatureTransformer* ft):
+        Trainer(FeatureTransformer* ft) :
            batch_size_(0),
            shared_input_trainer_(SharedInputTrainer::create(ft)) {
         }
@@ -278,6 +325,8 @@ namespace Eval::NNUE {
         // number of samples in mini-batch
         IndexType batch_size_;
 
+        const LearnFloatType* input_;
+
         // Trainer of shared input layer
         const std::shared_ptr<SharedInputTrainer> shared_input_trainer_;
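
The diff above splits the old propagate()/backpropagate() pair into a four-phase training step: step_start() runs once per mini-batch to size buffers, propagate() and backpropagate() run on each worker thread over its own [offset, offset + count) slice, and step_end() runs once to apply the learning-rate update (which is why learning_rate moves out of backpropagate() and into step_end()). Below is a minimal sketch of that calling convention, assuming plain std::thread in place of the repo's Thread/ThreadPool types; ToyTrainer, Example, and every other name in it are hypothetical stand-ins for illustration, not the actual trainer classes from this patch.

// toy_step_demo.cpp -- hypothetical illustration of the phased interface, not part of the patch.
#include <cstdint>
#include <numeric>
#include <thread>
#include <vector>

using LearnFloatType = float;

// Hypothetical stand-in for the trainer's Example type.
struct Example { float feature = 0.0f; };

// Mirrors the phase order the patch introduces: step_start() once per
// mini-batch, propagate()/backpropagate() per worker thread over a disjoint
// [offset, offset + count) slice, step_end() once to apply the update.
class ToyTrainer {
public:
    const LearnFloatType* step_start(const std::vector<Example>& combined_batch) {
        batch_size_ = combined_batch.size();
        output_.assign(batch_size_, 0.0f);
        grad_buf_.assign(batch_size_, 0.0f);
        return output_.data();
    }

    void propagate(std::uint64_t offset, std::uint64_t count) {
        for (std::uint64_t b = offset; b < offset + count; ++b)
            output_[b] = weight_;  // placeholder forward pass for sample b
    }

    void backpropagate(const LearnFloatType* gradients,
                       std::uint64_t offset, std::uint64_t count) {
        // Each worker writes only its own slice, so no locking is required.
        for (std::uint64_t b = offset; b < offset + count; ++b)
            grad_buf_[b] = gradients[b];
    }

    void step_end(LearnFloatType learning_rate) {
        // Single-threaded: fold the accumulated per-sample gradients into the weight.
        const LearnFloatType g =
            std::accumulate(grad_buf_.begin(), grad_buf_.end(), 0.0f)
            / static_cast<LearnFloatType>(batch_size_);
        weight_ -= learning_rate * g;
    }

private:
    std::uint64_t batch_size_ = 0;
    LearnFloatType weight_ = 1.0f;
    std::vector<LearnFloatType> output_;
    std::vector<LearnFloatType> grad_buf_;
};

int main() {
    ToyTrainer trainer;
    std::vector<Example> batch(256);
    std::vector<LearnFloatType> gradients(batch.size(), 1.0f);

    trainer.step_start(batch);

    // Partition the mini-batch across workers, one disjoint slice per thread.
    const std::uint64_t num_workers = 4;
    const std::uint64_t chunk = batch.size() / num_workers;
    std::vector<std::thread> workers;
    for (std::uint64_t t = 0; t < num_workers; ++t) {
        const std::uint64_t offset = t * chunk;
        workers.emplace_back([&trainer, &gradients, offset, chunk] {
            trainer.propagate(offset, chunk);
            trainer.backpropagate(gradients.data(), offset, chunk);
        });
    }
    for (auto& w : workers)
        w.join();

    trainer.step_end(0.01f);  // weight update happens once, outside the workers
}

Keeping the per-thread backpropagate() writes confined to disjoint slots is what lets that phase run without locks, while the single once-per-step step_end() is the only place the parameters change; the per-thread num_calls_ bookkeeping in SharedInputTrainer serves the same purpose when several layers share one input slice.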