diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 2f0a2122..24ad2732 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -215,27 +215,28 @@ namespace Eval::NNUE {
         std::vector<double> gradient_norm_local(thread_pool.size(), 0.0);
 
         while (examples.size() >= batch_size) {
-            std::vector<Example> batch(examples.end() - batch_size, examples.end());
-            examples.resize(examples.size() - batch_size);
-
-            const auto network_output = trainer->step_start(thread_pool, batch);
-            std::vector<LearnFloatType> gradients(batch.size());
+            auto batch_begin = examples.end() - batch_size;
+            auto batch_end = examples.end();
+            auto size = batch_end - batch_begin;
+            const auto network_output = trainer->step_start(thread_pool, batch_begin, batch_end);
+            std::vector<LearnFloatType> gradients(size);
 
             thread_pool.for_each_index_chunk_with_workers(
-                std::size_t(0), batch.size(),
+                std::size_t(0), size,
                 [&](Thread& th, std::size_t offset, std::size_t count) {
                     const auto thread_id = th.thread_idx();
 
                     trainer->propagate(th, offset, count);
 
                     for (std::size_t b = offset; b < offset + count; ++b) {
+                        const auto& e = *(batch_begin + b);
                         const auto shallow = static_cast<Value>(round(
-                            batch[b].sign * network_output[b] * kPonanzaConstant));
-                        const auto discrete = batch[b].sign * batch[b].discrete_nn_eval;
-                        const auto& psv = batch[b].psv;
+                            e.sign * network_output[b] * kPonanzaConstant));
+                        const auto discrete = e.sign * e.discrete_nn_eval;
+                        const auto& psv = e.psv;
 
                         const double gradient =
-                            batch[b].sign * calc_grad(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
-                        gradients[b] = static_cast<LearnFloatType>(gradient * batch[b].weight);
+                            e.sign * calc_grad(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
+                        gradients[b] = static_cast<LearnFloatType>(gradient * e.weight);
 
                         // The discrete eval will only be valid before first backpropagation,
@@ -256,6 +257,8 @@ namespace Eval::NNUE {
 
             trainer->step_end(thread_pool, learning_rate);
 
+            examples.resize(examples.size() - size);
+
             collect_stats = false;
         }
 
diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index f66f1a65..b6d70aa4 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -91,11 +91,13 @@ namespace Eval::NNUE {
             quantize_parameters();
         }
 
-        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch)
+        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
         {
-            if (output_.size() < kOutputDimensions * combined_batch.size()) {
-                output_.resize(kOutputDimensions * combined_batch.size());
-                gradients_.resize(kInputDimensions * combined_batch.size());
+            const auto size = batch_end - batch_begin;
+
+            if (output_.size() < kOutputDimensions * size) {
+                output_.resize(kOutputDimensions * size);
+                gradients_.resize(kInputDimensions * size);
             }
 
             if (thread_states_.size() < thread_pool.size())
@@ -103,8 +105,8 @@ namespace Eval::NNUE {
                 thread_states_.resize(thread_pool.size());
             }
 
-            combined_batch_size_ = static_cast<IndexType>(combined_batch.size());
-            combined_batch_input_ = previous_layer_trainer_->step_start(thread_pool, combined_batch);
+            combined_batch_size_ = size;
+            combined_batch_input_ = previous_layer_trainer_->step_start(thread_pool, batch_begin, batch_end);
 
             auto& main_thread_state = thread_states_[0];
 
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index e4bcecaf..eae35df6 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -42,11 +42,13 @@ namespace Eval::NNUE {
             previous_layer_trainer_->initialize(rng);
         }
 
-        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch)
+        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
         {
-            if (output_.size() < kOutputDimensions * combined_batch.size()) {
-                output_.resize(kOutputDimensions * combined_batch.size());
-                gradients_.resize(kInputDimensions * combined_batch.size());
+            const auto size = batch_end - batch_begin;
+
+            if (output_.size() < kOutputDimensions * size) {
+                output_.resize(kOutputDimensions * size);
+                gradients_.resize(kInputDimensions * size);
             }
 
             if (thread_states_.size() < thread_pool.size())
@@ -54,9 +56,9 @@ namespace Eval::NNUE {
                 thread_states_.resize(thread_pool.size());
             }
 
-            input_ = previous_layer_trainer_->step_start(thread_pool, combined_batch);
+            input_ = previous_layer_trainer_->step_start(thread_pool, batch_begin, batch_end);
 
-            batch_size_ = static_cast<IndexType>(combined_batch.size());
+            batch_size_ = size;
 
             return output_.data();
         }
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index fa0859ed..65766b05 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -89,11 +89,13 @@ namespace Eval::NNUE {
             quantize_parameters();
         }
 
-        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch)
+        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
         {
-            if (output_.size() < kOutputDimensions * combined_batch.size()) {
-                output_.resize(kOutputDimensions * combined_batch.size());
-                gradients_.resize(kOutputDimensions * combined_batch.size());
+            const auto size = batch_end - batch_begin;
+
+            if (output_.size() < kOutputDimensions * size) {
+                output_.resize(kOutputDimensions * size);
+                gradients_.resize(kOutputDimensions * size);
             }
 
             if (thread_stat_states_.size() < thread_pool.size())
@@ -106,7 +108,8 @@ namespace Eval::NNUE {
                 thread_bias_states_.resize(thread_pool.size());
             }
 
-            batch_ = &combined_batch;
+            batch_ = &*batch_begin;
+            batch_size_ = size;
 
             auto& main_thread_bias_state = thread_bias_states_[0];
 
@@ -161,7 +164,7 @@ namespace Eval::NNUE {
                     Blas::scopy(
                         kHalfDimensions, biases_, 1, &output_[output_offset], 1
                     );
-                    for (const auto& feature : (*batch_)[b].training_features[c]) {
+                    for (const auto& feature : batch_[b].training_features[c]) {
                         const IndexType weights_offset = kHalfDimensions * feature.get_index();
                         Blas::saxpy(
                             kHalfDimensions, (float)feature.get_count(),
@@ -458,12 +461,12 @@ namespace Eval::NNUE {
                 [&, num_threads = thread_pool.size()](Thread& th) {
                     const auto thread_index = th.thread_idx();
 
-                    for (IndexType b = 0; b < batch_->size(); ++b) {
+                    for (IndexType b = 0; b < batch_size_; ++b) {
                         const IndexType batch_offset = kOutputDimensions * b;
                         for (IndexType c = 0; c < 2; ++c) {
                             const IndexType output_offset = batch_offset + kHalfDimensions * c;
 
-                            for (const auto& feature : (*batch_)[b].training_features[c]) {
+                            for (const auto& feature : batch_[b].training_features[c]) {
                                 const IndexType feature_index = feature.get_index();
                                 const IndexType weights_offset = kHalfDimensions * feature_index;
 
@@ -519,6 +522,7 @@ namespace Eval::NNUE {
         // constructor
         Trainer(LayerType* target_layer) :
             batch_(nullptr),
+            batch_size_(0),
             target_layer_(target_layer),
             biases_(),
             weights_(),
@@ -669,7 +673,8 @@ namespace Eval::NNUE {
         static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
 
         // mini batch
-        const std::vector<Example>* batch_;
+        const Example* batch_;
+        IndexType batch_size_;
 
        // layer to learn
        LayerType* const target_layer_;
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index 54f03d42..ad681d57 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -63,9 +63,12 @@ namespace Eval::NNUE {
             }
         }
 
-        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch) {
-            if (gradients_.size() < kInputDimensions * combined_batch.size()) {
-                gradients_.resize(kInputDimensions * combined_batch.size());
+        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
+        {
+            const auto size = batch_end - batch_begin;
+
+            if (gradients_.size() < kInputDimensions * size) {
+                gradients_.resize(kInputDimensions * size);
             }
 
             if (num_calls_.size() < thread_pool.size())
@@ -73,11 +76,11 @@ namespace Eval::NNUE {
                 num_calls_.resize(thread_pool.size(), 0);
             }
 
-            batch_size_ = static_cast<IndexType>(combined_batch.size());
+            batch_size_ = size;
 
             if (num_calls_[0] == 0) {
                 current_operation_ = Operation::kStepStart;
-                output_ = feature_transformer_trainer_->step_start(thread_pool, combined_batch);
+                output_ = feature_transformer_trainer_->step_start(thread_pool, batch_begin, batch_end);
             }
 
             assert(current_operation_ == Operation::kStepStart);
@@ -237,15 +240,18 @@ namespace Eval::NNUE {
             shared_input_trainer_->initialize(rng);
         }
 
-        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch) {
-            if (output_.size() < kOutputDimensions * combined_batch.size()) {
-                output_.resize(kOutputDimensions * combined_batch.size());
-                gradients_.resize(kInputDimensions * combined_batch.size());
+        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
+        {
+            const auto size = batch_end - batch_begin;
+
+            if (output_.size() < kOutputDimensions * size) {
+                output_.resize(kOutputDimensions * size);
+                gradients_.resize(kInputDimensions * size);
             }
 
-            batch_size_ = static_cast<IndexType>(combined_batch.size());
+            batch_size_ = size;
 
-            input_ = shared_input_trainer_->step_start(thread_pool, combined_batch);
+            input_ = shared_input_trainer_->step_start(thread_pool, batch_begin, batch_end);
 
             return output_.data();
         }