From a3c78691a23fd743e2a815b65594609683b87d9c Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk
Date: Sun, 22 Nov 2020 20:44:26 +0100
Subject: [PATCH] Prepare input slice trainer.

---
 src/nnue/trainer/trainer_input_slice.h | 181 ++++++++++++++++---------
 1 file changed, 115 insertions(+), 66 deletions(-)

diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index a93a3ea0..54f03d42 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -34,15 +34,15 @@ namespace Eval::NNUE {
 
         // Set options such as hyperparameters
         void send_message(Message* message) {
-            if (num_calls_ == 0) {
+            if (num_calls_[0] == 0) {
                 current_operation_ = Operation::kSendMessage;
                 feature_transformer_trainer_->send_message(message);
             }
 
             assert(current_operation_ == Operation::kSendMessage);
 
-            if (++num_calls_ == num_referrers_) {
-                num_calls_ = 0;
+            if (++num_calls_[0] == num_referrers_) {
+                num_calls_[0] = 0;
                 current_operation_ = Operation::kNone;
             }
         }
@@ -50,55 +50,79 @@ namespace Eval::NNUE {
         // Initialize the parameters with random numbers
         template <typename RNG>
         void initialize(RNG& rng) {
-            if (num_calls_ == 0) {
+            if (num_calls_[0] == 0) {
                 current_operation_ = Operation::kInitialize;
                 feature_transformer_trainer_->initialize(rng);
             }
 
             assert(current_operation_ == Operation::kInitialize);
 
-            if (++num_calls_ == num_referrers_) {
-                num_calls_ = 0;
+            if (++num_calls_[0] == num_referrers_) {
+                num_calls_[0] = 0;
                 current_operation_ = Operation::kNone;
             }
         }
 
-        // forward propagation
-        const LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector<Example>& batch) {
-            if (gradients_.size() < kInputDimensions * batch.size()) {
-                gradients_.resize(kInputDimensions * batch.size());
+        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch) {
+            if (gradients_.size() < kInputDimensions * combined_batch.size()) {
+                gradients_.resize(kInputDimensions * combined_batch.size());
             }
 
-            batch_size_ = static_cast<IndexType>(batch.size());
-
-            if (num_calls_ == 0) {
-                current_operation_ = Operation::kPropagate;
-                output_ = feature_transformer_trainer_->propagate(thread_pool, batch);
+            if (num_calls_.size() < thread_pool.size())
+            {
+                num_calls_.resize(thread_pool.size(), 0);
             }
 
-            assert(current_operation_ == Operation::kPropagate);
+            batch_size_ = static_cast<IndexType>(combined_batch.size());
 
-            if (++num_calls_ == num_referrers_) {
-                num_calls_ = 0;
+            if (num_calls_[0] == 0) {
+                current_operation_ = Operation::kStepStart;
+                output_ = feature_transformer_trainer_->step_start(thread_pool, combined_batch);
+            }
+
+            assert(current_operation_ == Operation::kStepStart);
+
+            if (++num_calls_[0] == num_referrers_) {
+                num_calls_[0] = 0;
                 current_operation_ = Operation::kNone;
             }
 
             return output_;
         }
 
+        // forward propagation
+        void propagate(Thread& th, uint64_t offset, uint64_t count) {
+            const auto thread_id = th.thread_idx();
+
+            if (num_calls_[thread_id] == 0) {
+                current_operation_ = Operation::kPropagate;
+                feature_transformer_trainer_->propagate(th, offset, count);
+            }
+
+            assert(current_operation_ == Operation::kPropagate);
+
+            if (++num_calls_[thread_id] == num_referrers_) {
+                num_calls_[thread_id] = 0;
+                current_operation_ = Operation::kNone;
+            }
+        }
+
         // backpropagation
-        void backpropagate(ThreadPool& thread_pool,
+        void backpropagate(Thread& th,
                            const LearnFloatType* gradients,
-                           LearnFloatType learning_rate) {
+                           uint64_t offset,
+                           uint64_t count) {
+
+            const auto thread_id = th.thread_idx();
 
             if (num_referrers_ == 1) {
-                feature_transformer_trainer_->backpropagate(thread_pool, gradients, learning_rate);
+                feature_transformer_trainer_->backpropagate(th, gradients, offset, count);
                 return;
             }
 
-            if (num_calls_ == 0) {
+            if (num_calls_[thread_id] == 0) {
                 current_operation_ = Operation::kBackPropagate;
-                for (IndexType b = 0; b < batch_size_; ++b) {
+                for (IndexType b = offset; b < offset + count; ++b) {
                     const IndexType batch_offset = kInputDimensions * b;
                     for (IndexType i = 0; i < kInputDimensions; ++i) {
                         gradients_[batch_offset + i] = static_cast<LearnFloatType>(0.0);
@@ -108,17 +132,31 @@ namespace Eval::NNUE {
 
             assert(current_operation_ == Operation::kBackPropagate);
 
-            for (IndexType b = 0; b < batch_size_; ++b) {
+            for (IndexType b = offset; b < offset + count; ++b) {
                 const IndexType batch_offset = kInputDimensions * b;
                 for (IndexType i = 0; i < kInputDimensions; ++i) {
                     gradients_[batch_offset + i] += gradients[batch_offset + i];
                 }
            }
 
-            if (++num_calls_ == num_referrers_) {
+            if (++num_calls_[thread_id] == num_referrers_) {
                 feature_transformer_trainer_->backpropagate(
-                    thread_pool, gradients_.data(), learning_rate);
-                num_calls_ = 0;
+                    th, gradients_.data(), offset, count);
+                num_calls_[thread_id] = 0;
+                current_operation_ = Operation::kNone;
+            }
+        }
+
+        void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) {
+            if (num_calls_[0] == 0) {
+                current_operation_ = Operation::kStepEnd;
+                feature_transformer_trainer_->step_end(thread_pool, learning_rate);
+            }
+
+            assert(current_operation_ == Operation::kStepEnd);
+
+            if (++num_calls_[0] == num_referrers_) {
+                num_calls_[0] = 0;
                 current_operation_ = Operation::kNone;
             }
         }
@@ -128,7 +166,7 @@ namespace Eval::NNUE {
         SharedInputTrainer(FeatureTransformer* ft) :
             batch_size_(0),
             num_referrers_(0),
-            num_calls_(0),
+            num_calls_(1, 0),
             current_operation_(Operation::kNone),
             feature_transformer_trainer_(Trainer<FeatureTransformer>::create(
                 ft)),
@@ -144,8 +182,10 @@ namespace Eval::NNUE {
             kNone,
             kSendMessage,
             kInitialize,
+            kStepStart,
             kPropagate,
             kBackPropagate,
+            kStepEnd,
         };
 
         // number of samples in mini-batch
@@ -155,7 +195,7 @@ namespace Eval::NNUE {
         std::uint32_t num_referrers_;
 
         // Number of times the current process has been called
-        std::uint32_t num_calls_;
+        std::vector<std::uint32_t> num_calls_;
 
        // current processing type
         Operation current_operation_;
@@ -197,74 +237,81 @@ namespace Eval::NNUE {
             shared_input_trainer_->initialize(rng);
         }
 
-        // forward propagation
-        const LearnFloatType* propagate(ThreadPool& thread_pool,const std::vector<Example>& batch) {
-            if (output_.size() < kOutputDimensions * batch.size()) {
-                output_.resize(kOutputDimensions * batch.size());
-                gradients_.resize(kInputDimensions * batch.size());
+        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch) {
+            if (output_.size() < kOutputDimensions * combined_batch.size()) {
+                output_.resize(kOutputDimensions * combined_batch.size());
+                gradients_.resize(kInputDimensions * combined_batch.size());
             }
 
-            batch_size_ = static_cast<IndexType>(batch.size());
+            batch_size_ = static_cast<IndexType>(combined_batch.size());
 
-            const auto input = shared_input_trainer_->propagate(thread_pool, batch);
-            for (IndexType b = 0; b < batch_size_; ++b) {
+            input_ = shared_input_trainer_->step_start(thread_pool, combined_batch);
+
+            return output_.data();
+        }
+
+        // forward propagation
+        void propagate(Thread& th, uint64_t offset, uint64_t count) {
+
+            shared_input_trainer_->propagate(th, offset, count);
+
+            for (IndexType b = offset; b < offset + count; ++b) {
                 const IndexType input_offset = kInputDimensions * b;
                 const IndexType output_offset =
                     kOutputDimensions * b;
 
 #if defined(USE_BLAS)
                 cblas_scopy(
-                    kOutputDimensions, &input[input_offset + Offset], 1,
+                    kOutputDimensions, &input_[input_offset + Offset], 1,
                     &output_[output_offset], 1
                 );
 #else
                 Blas::scopy(
-                    thread_pool,
-                    kOutputDimensions, &input[input_offset + Offset], 1,
+                    kOutputDimensions, &input_[input_offset + Offset], 1,
                     &output_[output_offset], 1
                 );
 #endif
             }
-
-            return output_.data();
         }
 
         // backpropagation
-        void backpropagate(ThreadPool& thread_pool,
+        void backpropagate(Thread& th,
                            const LearnFloatType* gradients,
-                           LearnFloatType learning_rate) {
+                           uint64_t offset,
+                           uint64_t count) {
 
-            thread_pool.for_each_index_with_workers(
-                0, batch_size_,
-                [&](Thread&, int b) {
-                    const IndexType input_offset = kInputDimensions * b;
-                    const IndexType output_offset = kOutputDimensions * b;
+            for (IndexType b = offset; b < offset + count; ++b)
+            {
+                const IndexType input_offset = kInputDimensions * b;
+                const IndexType output_offset = kOutputDimensions * b;
 
-                    IndexType i = 0;
-                    for (; i < Offset; ++i) {
-                        gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
-                    }
-
-                    for (; i < Offset + kOutputDimensions; ++i) {
-                        gradients_[input_offset + i] = gradients[output_offset + i - Offset];
-                    }
-
-                    for (; i < kInputDimensions; ++i)
-                    {
-                        gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
-                    }
+                IndexType i = 0;
+                for (; i < Offset; ++i) {
+                    gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
                 }
-            );
-            thread_pool.wait_for_workers_finished();
 
-            shared_input_trainer_->backpropagate(thread_pool, gradients_.data(), learning_rate);
+                for (; i < Offset + kOutputDimensions; ++i) {
+                    gradients_[input_offset + i] = gradients[output_offset + i - Offset];
+                }
+
+                for (; i < kInputDimensions; ++i)
+                {
+                    gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
+                }
+            }
+
+            shared_input_trainer_->backpropagate(th, gradients_.data(), offset, count);
+        }
+
+        void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) {
+            shared_input_trainer_->step_end(thread_pool, learning_rate);
         }
 
     private:
         // constructor
-        Trainer(FeatureTransformer* ft):
+        Trainer(FeatureTransformer* ft) :
            batch_size_(0),
            shared_input_trainer_(SharedInputTrainer::create(ft)) {
         }
@@ -278,6 +325,8 @@ namespace Eval::NNUE {
         // number of samples in mini-batch
         IndexType batch_size_;
 
+        const LearnFloatType* input_;
+
         // Trainer of shared input layer
         const std::shared_ptr<SharedInputTrainer> shared_input_trainer_;
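
The diff above splits the old propagate()/backpropagate() pair into a four-phase training step: step_start() runs once per mini-batch to size buffers, propagate() and backpropagate() run on each worker thread over its own [offset, offset + count) slice, and step_end() runs once to apply the learning-rate update (which is why learning_rate moves out of backpropagate() and into step_end()). Below is a minimal sketch of that calling convention, assuming plain std::thread in place of the repo's Thread/ThreadPool types; ToyTrainer, Example, and every other name in it are hypothetical stand-ins for illustration, not the actual trainer classes from this patch.

// toy_step_demo.cpp -- hypothetical illustration of the phased interface, not part of the patch.
#include <cstdint>
#include <numeric>
#include <thread>
#include <vector>

using LearnFloatType = float;

// Hypothetical stand-in for the trainer's Example type.
struct Example { float feature = 0.0f; };

// Mirrors the phase order the patch introduces: step_start() once per
// mini-batch, propagate()/backpropagate() per worker thread over a disjoint
// [offset, offset + count) slice, step_end() once to apply the update.
class ToyTrainer {
public:
    const LearnFloatType* step_start(const std::vector<Example>& combined_batch) {
        batch_size_ = combined_batch.size();
        output_.assign(batch_size_, 0.0f);
        grad_buf_.assign(batch_size_, 0.0f);
        return output_.data();
    }

    void propagate(std::uint64_t offset, std::uint64_t count) {
        for (std::uint64_t b = offset; b < offset + count; ++b)
            output_[b] = weight_;  // placeholder forward pass for sample b
    }

    void backpropagate(const LearnFloatType* gradients,
                       std::uint64_t offset, std::uint64_t count) {
        // Each worker writes only its own slice, so no locking is required.
        for (std::uint64_t b = offset; b < offset + count; ++b)
            grad_buf_[b] = gradients[b];
    }

    void step_end(LearnFloatType learning_rate) {
        // Single-threaded: fold the accumulated per-sample gradients into the weight.
        const LearnFloatType g =
            std::accumulate(grad_buf_.begin(), grad_buf_.end(), 0.0f)
            / static_cast<LearnFloatType>(batch_size_);
        weight_ -= learning_rate * g;
    }

private:
    std::uint64_t batch_size_ = 0;
    LearnFloatType weight_ = 1.0f;
    std::vector<LearnFloatType> output_;
    std::vector<LearnFloatType> grad_buf_;
};

int main() {
    ToyTrainer trainer;
    std::vector<Example> batch(256);
    std::vector<LearnFloatType> gradients(batch.size(), 1.0f);

    trainer.step_start(batch);

    // Partition the mini-batch across workers, one disjoint slice per thread.
    const std::uint64_t num_workers = 4;
    const std::uint64_t chunk = batch.size() / num_workers;
    std::vector<std::thread> workers;
    for (std::uint64_t t = 0; t < num_workers; ++t) {
        const std::uint64_t offset = t * chunk;
        workers.emplace_back([&trainer, &gradients, offset, chunk] {
            trainer.propagate(offset, chunk);
            trainer.backpropagate(gradients.data(), offset, chunk);
        });
    }
    for (auto& w : workers)
        w.join();

    trainer.step_end(0.01f);  // weight update happens once, outside the workers
}

Keeping the per-thread backpropagate() writes confined to disjoint slots is what lets that phase run without locks, while the single once-per-step step_end() is the only place the parameters change; the per-thread num_calls_ bookkeeping in SharedInputTrainer serves the same purpose when several layers share one input slice.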