Don't unnecessarily copy the batch part.

Authored by Tomasz Sobczyk on 2020-11-25 22:59:34 +01:00, committed by nodchip
parent e954b14196
commit 0bee8fef64
5 changed files with 61 additions and 43 deletions
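
Previously the caller built a temporary std::vector<Example> holding the last batch_size examples and passed it to step_start() by const reference; with this commit step_start() takes an iterator pair into the example pool, so each layer trainer reads the batch in place and no copy is made. A minimal, self-contained sketch of the pattern, using hypothetical free functions and a stripped-down Example rather than the actual trainer classes:

// Minimal sketch of the new calling convention (hypothetical names, not the real classes).
#include <cstddef>
#include <vector>

struct Example { /* training-sample fields omitted */ };

// New shape of the API: the trainer reads the examples through an iterator range
// instead of a copied std::vector<Example>.
void step_start(std::vector<Example>::const_iterator batch_begin,
                std::vector<Example>::const_iterator batch_end)
{
    const auto size = batch_end - batch_begin;   // number of examples in this batch
    // ... size per-batch buffers from `size` and read *(batch_begin + b) directly ...
    (void)size;
}

void train_one_batch(std::vector<Example>& examples, std::size_t batch_size)
{
    // A view of the last batch_size examples; nothing is copied.
    const auto batch_begin = examples.end() - batch_size;
    const auto batch_end   = examples.end();
    step_start(batch_begin, batch_end);
    // Shrink the pool only after the step, so the iterators stay valid throughout.
    examples.resize(examples.size() - batch_size);
}

The same iterator-pair signature is threaded through every layer trainer, which is why all five files change in lockstep.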

File 1 of 5

@@ -215,27 +215,28 @@ namespace Eval::NNUE {
     std::vector<double> gradient_norm_local(thread_pool.size(), 0.0);
     while (examples.size() >= batch_size) {
-        std::vector<Example> batch(examples.end() - batch_size, examples.end());
-        examples.resize(examples.size() - batch_size);
-        const auto network_output = trainer->step_start(thread_pool, batch);
-        std::vector<LearnFloatType> gradients(batch.size());
+        auto batch_begin = examples.end() - batch_size;
+        auto batch_end = examples.end();
+        auto size = batch_end - batch_begin;
+        const auto network_output = trainer->step_start(thread_pool, batch_begin, batch_end);
+        std::vector<LearnFloatType> gradients(size);
         thread_pool.for_each_index_chunk_with_workers(
-            std::size_t(0), batch.size(),
+            std::size_t(0), size,
             [&](Thread& th, std::size_t offset, std::size_t count) {
                 const auto thread_id = th.thread_idx();
                 trainer->propagate(th, offset, count);
                 for (std::size_t b = offset; b < offset + count; ++b) {
+                    const auto& e = *(batch_begin + b);
                     const auto shallow = static_cast<Value>(round<std::int32_t>(
-                        batch[b].sign * network_output[b] * kPonanzaConstant));
-                    const auto discrete = batch[b].sign * batch[b].discrete_nn_eval;
-                    const auto& psv = batch[b].psv;
+                        e.sign * network_output[b] * kPonanzaConstant));
+                    const auto discrete = e.sign * e.discrete_nn_eval;
+                    const auto& psv = e.psv;
                     const double gradient =
-                        batch[b].sign * calc_grad(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
-                    gradients[b] = static_cast<LearnFloatType>(gradient * batch[b].weight);
+                        e.sign * calc_grad(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
+                    gradients[b] = static_cast<LearnFloatType>(gradient * e.weight);
                     // The discrete eval will only be valid before first backpropagation,
@@ -256,6 +257,8 @@ namespace Eval::NNUE {
         trainer->step_end(thread_pool, learning_rate);
+        examples.resize(examples.size() - size);
         collect_stats = false;
     }
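
One subtlety worth noting: examples.resize(...) moves from the top of the loop to after trainer->step_end(...). The reasoning is assumed here rather than stated in the commit, but shrinking the vector first would invalidate exactly the iterators that now describe the batch, since the batch is the tail being removed. A small sketch of the pitfall:

#include <cstddef>
#include <vector>

void wrong_order(std::vector<int>& examples, std::size_t batch_size)
{
    auto batch_begin = examples.end() - batch_size;
    examples.resize(examples.size() - batch_size);   // invalidates batch_begin
    // *batch_begin;                                 // would be undefined behaviour
}

void right_order(std::vector<int>& examples, std::size_t batch_size)
{
    auto batch_begin = examples.end() - batch_size;
    auto batch_end = examples.end();
    // ... run the whole training step over [batch_begin, batch_end) ...
    (void)batch_begin; (void)batch_end;
    examples.resize(examples.size() - batch_size);   // safe: the range is no longer used
}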

File 2 of 5

@@ -91,11 +91,13 @@ namespace Eval::NNUE {
         quantize_parameters();
     }
-    const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch)
+    const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
     {
-        if (output_.size() < kOutputDimensions * combined_batch.size()) {
-            output_.resize(kOutputDimensions * combined_batch.size());
-            gradients_.resize(kInputDimensions * combined_batch.size());
+        const auto size = batch_end - batch_begin;
+        if (output_.size() < kOutputDimensions * size) {
+            output_.resize(kOutputDimensions * size);
+            gradients_.resize(kInputDimensions * size);
         }
         if (thread_states_.size() < thread_pool.size())
@@ -103,8 +105,8 @@ namespace Eval::NNUE {
             thread_states_.resize(thread_pool.size());
         }
-        combined_batch_size_ = static_cast<IndexType>(combined_batch.size());
-        combined_batch_input_ = previous_layer_trainer_->step_start(thread_pool, combined_batch);
+        combined_batch_size_ = size;
+        combined_batch_input_ = previous_layer_trainer_->step_start(thread_pool, batch_begin, batch_end);
         auto& main_thread_state = thread_states_[0];

File 3 of 5

@@ -42,11 +42,13 @@ namespace Eval::NNUE {
         previous_layer_trainer_->initialize(rng);
     }
-    const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch)
+    const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
     {
-        if (output_.size() < kOutputDimensions * combined_batch.size()) {
-            output_.resize(kOutputDimensions * combined_batch.size());
-            gradients_.resize(kInputDimensions * combined_batch.size());
+        const auto size = batch_end - batch_begin;
+        if (output_.size() < kOutputDimensions * size) {
+            output_.resize(kOutputDimensions * size);
+            gradients_.resize(kInputDimensions * size);
         }
         if (thread_states_.size() < thread_pool.size())
@@ -54,9 +56,9 @@ namespace Eval::NNUE {
             thread_states_.resize(thread_pool.size());
         }
-        input_ = previous_layer_trainer_->step_start(thread_pool, combined_batch);
+        input_ = previous_layer_trainer_->step_start(thread_pool, batch_begin, batch_end);
-        batch_size_ = static_cast<IndexType>(combined_batch.size());
+        batch_size_ = size;
         return output_.data();
     }

File 4 of 5

@@ -89,11 +89,13 @@ namespace Eval::NNUE {
         quantize_parameters();
     }
-    const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch)
+    const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
     {
-        if (output_.size() < kOutputDimensions * combined_batch.size()) {
-            output_.resize(kOutputDimensions * combined_batch.size());
-            gradients_.resize(kOutputDimensions * combined_batch.size());
+        const auto size = batch_end - batch_begin;
+        if (output_.size() < kOutputDimensions * size) {
+            output_.resize(kOutputDimensions * size);
+            gradients_.resize(kOutputDimensions * size);
         }
         if (thread_stat_states_.size() < thread_pool.size())
@@ -106,7 +108,8 @@ namespace Eval::NNUE {
             thread_bias_states_.resize(thread_pool.size());
         }
-        batch_ = &combined_batch;
+        batch_ = &*batch_begin;
+        batch_size_ = size;
         auto& main_thread_bias_state = thread_bias_states_[0];
@@ -161,7 +164,7 @@ namespace Eval::NNUE {
                 Blas::scopy(
                     kHalfDimensions, biases_, 1, &output_[output_offset], 1
                 );
-                for (const auto& feature : (*batch_)[b].training_features[c]) {
+                for (const auto& feature : batch_[b].training_features[c]) {
                     const IndexType weights_offset = kHalfDimensions * feature.get_index();
                     Blas::saxpy(
                         kHalfDimensions, (float)feature.get_count(),
@@ -458,12 +461,12 @@ namespace Eval::NNUE {
             [&, num_threads = thread_pool.size()](Thread& th) {
                 const auto thread_index = th.thread_idx();
-                for (IndexType b = 0; b < batch_->size(); ++b) {
+                for (IndexType b = 0; b < batch_size_; ++b) {
                     const IndexType batch_offset = kOutputDimensions * b;
                     for (IndexType c = 0; c < 2; ++c) {
                         const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                        for (const auto& feature : (*batch_)[b].training_features[c]) {
+                        for (const auto& feature : batch_[b].training_features[c]) {
                             const IndexType feature_index = feature.get_index();
                             const IndexType weights_offset =
                                 kHalfDimensions * feature_index;
@@ -519,6 +522,7 @@ namespace Eval::NNUE {
     // constructor
     Trainer(LayerType* target_layer) :
         batch_(nullptr),
+        batch_size_(0),
         target_layer_(target_layer),
         biases_(),
         weights_(),
@@ -669,7 +673,8 @@ namespace Eval::NNUE {
     static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
     // mini batch
-    const std::vector<Example>* batch_;
+    const Example* batch_;
+    IndexType batch_size_;
     // layer to learn
     LayerType* const target_layer_;
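
Because the feature transformer trainer used to hold a const std::vector<Example>* member, it now keeps a raw pointer to the first example plus an explicit count. A minimal sketch of that representation (the class here is a hypothetical stand-in; only the member names follow the diff):

#include <cstdint>
#include <vector>

struct Example { int dummy; };     // placeholder for the real Example
using IndexType = std::uint32_t;   // assumption: stands in for the trainer's IndexType

class FeatureTrainerView {         // hypothetical stand-in for the real Trainer class
public:
    void step_start(std::vector<Example>::const_iterator batch_begin,
                    std::vector<Example>::const_iterator batch_end)
    {
        batch_ = &*batch_begin;    // pointer to the first element of the contiguous range
        batch_size_ = static_cast<IndexType>(batch_end - batch_begin);
    }

    void propagate() const
    {
        // Elements are now indexed as batch_[b] instead of (*batch_)[b].
        for (IndexType b = 0; b < batch_size_; ++b) {
            const Example& e = batch_[b];
            (void)e;
        }
    }

private:
    const Example* batch_ = nullptr;
    IndexType batch_size_ = 0;
};

With C++20, std::span<const Example> would express the same view more directly, but the pointer-plus-count form avoids assuming C++20 support.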

File 5 of 5

@@ -63,9 +63,12 @@ namespace Eval::NNUE {
         }
     }
-    const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch) {
-        if (gradients_.size() < kInputDimensions * combined_batch.size()) {
-            gradients_.resize(kInputDimensions * combined_batch.size());
+    const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
+    {
+        const auto size = batch_end - batch_begin;
+        if (gradients_.size() < kInputDimensions * size) {
+            gradients_.resize(kInputDimensions * size);
         }
         if (num_calls_.size() < thread_pool.size())
@@ -73,11 +76,11 @@ namespace Eval::NNUE {
             num_calls_.resize(thread_pool.size(), 0);
         }
-        batch_size_ = static_cast<IndexType>(combined_batch.size());
+        batch_size_ = size;
         if (num_calls_[0] == 0) {
             current_operation_ = Operation::kStepStart;
-            output_ = feature_transformer_trainer_->step_start(thread_pool, combined_batch);
+            output_ = feature_transformer_trainer_->step_start(thread_pool, batch_begin, batch_end);
         }
         assert(current_operation_ == Operation::kStepStart);
@@ -237,15 +240,18 @@ namespace Eval::NNUE {
         shared_input_trainer_->initialize(rng);
     }
-    const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch) {
-        if (output_.size() < kOutputDimensions * combined_batch.size()) {
-            output_.resize(kOutputDimensions * combined_batch.size());
-            gradients_.resize(kInputDimensions * combined_batch.size());
+    const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
+    {
+        const auto size = batch_end - batch_begin;
+        if (output_.size() < kOutputDimensions * size) {
+            output_.resize(kOutputDimensions * size);
+            gradients_.resize(kInputDimensions * size);
         }
-        batch_size_ = static_cast<IndexType>(combined_batch.size());
+        batch_size_ = size;
-        input_ = shared_input_trainer_->step_start(thread_pool, combined_batch);
+        input_ = shared_input_trainer_->step_start(thread_pool, batch_begin, batch_end);
         return output_.data();
     }