diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 2f0a2122..24ad2732 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -215,27 +215,28 @@ namespace Eval::NNUE {
         std::vector<double> gradient_norm_local(thread_pool.size(), 0.0);
 
         while (examples.size() >= batch_size) {
-            std::vector<Example> batch(examples.end() - batch_size, examples.end());
-            examples.resize(examples.size() - batch_size);
-
-            const auto network_output = trainer->step_start(thread_pool, batch);
-            std::vector<LearnFloatType> gradients(batch.size());
+            auto batch_begin = examples.end() - batch_size;
+            auto batch_end = examples.end();
+            auto size = batch_end - batch_begin;
+            const auto network_output = trainer->step_start(thread_pool, batch_begin, batch_end);
+            std::vector<LearnFloatType> gradients(size);
 
             thread_pool.for_each_index_chunk_with_workers(
-                std::size_t(0), batch.size(),
+                std::size_t(0), size,
                 [&](Thread& th, std::size_t offset, std::size_t count) {
                     const auto thread_id = th.thread_idx();
 
                     trainer->propagate(th, offset, count);
 
                     for (std::size_t b = offset; b < offset + count; ++b) {
+                        const auto& e = *(batch_begin + b);
                         const auto shallow = static_cast<Value>(round(
-                            batch[b].sign * network_output[b] * kPonanzaConstant));
-                        const auto discrete = batch[b].sign * batch[b].discrete_nn_eval;
-                        const auto& psv = batch[b].psv;
+                            e.sign * network_output[b] * kPonanzaConstant));
+                        const auto discrete = e.sign * e.discrete_nn_eval;
+                        const auto& psv = e.psv;
 
                         const double gradient =
-                            batch[b].sign * calc_grad(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
-                        gradients[b] = static_cast<LearnFloatType>(gradient * batch[b].weight);
+                            e.sign * calc_grad(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
+                        gradients[b] = static_cast<LearnFloatType>(gradient * e.weight);
 
                         // The discrete eval will only be valid before first backpropagation,
@@ -256,6 +257,8 @@ namespace Eval::NNUE {
 
             trainer->step_end(thread_pool, learning_rate);
 
+            examples.resize(examples.size() - size);
+
             collect_stats = false;
         }
 
diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index f66f1a65..b6d70aa4 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -91,11 +91,13 @@ namespace Eval::NNUE {
             quantize_parameters();
         }
 
-        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch)
+        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
         {
-            if (output_.size() < kOutputDimensions * combined_batch.size()) {
-                output_.resize(kOutputDimensions * combined_batch.size());
-                gradients_.resize(kInputDimensions * combined_batch.size());
+            const auto size = batch_end - batch_begin;
+
+            if (output_.size() < kOutputDimensions * size) {
+                output_.resize(kOutputDimensions * size);
+                gradients_.resize(kInputDimensions * size);
             }
 
             if (thread_states_.size() < thread_pool.size())
@@ -103,8 +105,8 @@ namespace Eval::NNUE {
                 thread_states_.resize(thread_pool.size());
             }
 
-            combined_batch_size_ = static_cast<IndexType>(combined_batch.size());
-            combined_batch_input_ = previous_layer_trainer_->step_start(thread_pool, combined_batch);
+            combined_batch_size_ = size;
+            combined_batch_input_ = previous_layer_trainer_->step_start(thread_pool, batch_begin, batch_end);
 
             auto& main_thread_state = thread_states_[0];
 
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index e4bcecaf..eae35df6 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -42,11 +42,13 @@ namespace Eval::NNUE {
             previous_layer_trainer_->initialize(rng);
         }
 
-        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch)
+        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
         {
-            if (output_.size() < kOutputDimensions * combined_batch.size()) {
-                output_.resize(kOutputDimensions * combined_batch.size());
-                gradients_.resize(kInputDimensions * combined_batch.size());
+            const auto size = batch_end - batch_begin;
+
+            if (output_.size() < kOutputDimensions * size) {
+                output_.resize(kOutputDimensions * size);
+                gradients_.resize(kInputDimensions * size);
             }
 
             if (thread_states_.size() < thread_pool.size())
@@ -54,9 +56,9 @@ namespace Eval::NNUE {
                 thread_states_.resize(thread_pool.size());
             }
 
-            input_ = previous_layer_trainer_->step_start(thread_pool, combined_batch);
+            input_ = previous_layer_trainer_->step_start(thread_pool, batch_begin, batch_end);
 
-            batch_size_ = static_cast<IndexType>(combined_batch.size());
+            batch_size_ = size;
 
             return output_.data();
         }
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index fa0859ed..65766b05 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -89,11 +89,13 @@ namespace Eval::NNUE {
             quantize_parameters();
         }
 
-        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch)
+        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
         {
-            if (output_.size() < kOutputDimensions * combined_batch.size()) {
-                output_.resize(kOutputDimensions * combined_batch.size());
-                gradients_.resize(kOutputDimensions * combined_batch.size());
+            const auto size = batch_end - batch_begin;
+
+            if (output_.size() < kOutputDimensions * size) {
+                output_.resize(kOutputDimensions * size);
+                gradients_.resize(kOutputDimensions * size);
             }
 
             if (thread_stat_states_.size() < thread_pool.size())
@@ -106,7 +108,8 @@ namespace Eval::NNUE {
                 thread_bias_states_.resize(thread_pool.size());
             }
 
-            batch_ = &combined_batch;
+            batch_ = &*batch_begin;
+            batch_size_ = size;
 
             auto& main_thread_bias_state = thread_bias_states_[0];
 
@@ -161,7 +164,7 @@ namespace Eval::NNUE {
                     Blas::scopy(
                         kHalfDimensions, biases_, 1, &output_[output_offset], 1
                     );
-                    for (const auto& feature : (*batch_)[b].training_features[c]) {
+                    for (const auto& feature : batch_[b].training_features[c]) {
                         const IndexType weights_offset = kHalfDimensions * feature.get_index();
                         Blas::saxpy(
                             kHalfDimensions, (float)feature.get_count(),
@@ -458,12 +461,12 @@ namespace Eval::NNUE {
                 [&, num_threads = thread_pool.size()](Thread& th) {
                     const auto thread_index = th.thread_idx();
 
-                    for (IndexType b = 0; b < batch_->size(); ++b) {
+                    for (IndexType b = 0; b < batch_size_; ++b) {
                         const IndexType batch_offset = kOutputDimensions * b;
                         for (IndexType c = 0; c < 2; ++c) {
                             const IndexType output_offset = batch_offset + kHalfDimensions * c;
 
-                            for (const auto& feature : (*batch_)[b].training_features[c]) {
+                            for (const auto& feature : batch_[b].training_features[c]) {
                                 const IndexType feature_index = feature.get_index();
                                 const IndexType weights_offset = kHalfDimensions * feature_index;
 
@@ -519,6 +522,7 @@ namespace Eval::NNUE {
         // constructor
         Trainer(LayerType* target_layer) :
             batch_(nullptr),
+            batch_size_(0),
             target_layer_(target_layer),
             biases_(),
             weights_(),
@@ -669,7 +673,8 @@ namespace Eval::NNUE {
         static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
 
         // mini batch
-        const std::vector<Example>* batch_;
+        const Example* batch_;
+        IndexType batch_size_;
 
        // layer to learn
        LayerType* const target_layer_;
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index 54f03d42..ad681d57 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -63,9 +63,12 @@ namespace Eval::NNUE {
             }
         }
 
-        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch) {
-            if (gradients_.size() < kInputDimensions * combined_batch.size()) {
-                gradients_.resize(kInputDimensions * combined_batch.size());
+        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
+        {
+            const auto size = batch_end - batch_begin;
+
+            if (gradients_.size() < kInputDimensions * size) {
+                gradients_.resize(kInputDimensions * size);
             }
 
             if (num_calls_.size() < thread_pool.size())
@@ -73,11 +76,11 @@ namespace Eval::NNUE {
                 num_calls_.resize(thread_pool.size(), 0);
             }
 
-            batch_size_ = static_cast<IndexType>(combined_batch.size());
+            batch_size_ = size;
 
             if (num_calls_[0] == 0) {
                 current_operation_ = Operation::kStepStart;
-                output_ = feature_transformer_trainer_->step_start(thread_pool, combined_batch);
+                output_ = feature_transformer_trainer_->step_start(thread_pool, batch_begin, batch_end);
             }
 
             assert(current_operation_ == Operation::kStepStart);
@@ -237,15 +240,18 @@ namespace Eval::NNUE {
             shared_input_trainer_->initialize(rng);
         }
 
-        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch) {
-            if (output_.size() < kOutputDimensions * combined_batch.size()) {
-                output_.resize(kOutputDimensions * combined_batch.size());
-                gradients_.resize(kInputDimensions * combined_batch.size());
+        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
+        {
+            const auto size = batch_end - batch_begin;
+
+            if (output_.size() < kOutputDimensions * size) {
+                output_.resize(kOutputDimensions * size);
+                gradients_.resize(kInputDimensions * size);
             }
 
-            batch_size_ = static_cast<IndexType>(combined_batch.size());
+            batch_size_ = size;
 
-            input_ = shared_input_trainer_->step_start(thread_pool, combined_batch);
+            input_ = shared_input_trainer_->step_start(thread_pool, batch_begin, batch_end);
 
             return output_.data();
         }