Don't unnecessarily copy the batch part.

Authored by Tomasz Sobczyk on 2020-11-25 22:59:34 +01:00, committed by nodchip
parent e954b14196
commit 0bee8fef64
5 changed files with 61 additions and 43 deletions
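
Previously the caller built a temporary std::vector<Example> holding the last batch_size examples and passed it to step_start() by const reference; with this commit step_start() takes an iterator pair into the example pool, so each layer trainer reads the batch in place and no copy is made. A minimal, self-contained sketch of the pattern, using hypothetical free functions and a stripped-down Example rather than the actual trainer classes:

// Minimal sketch of the new calling convention (hypothetical names, not the real classes).
#include <cstddef>
#include <vector>

struct Example { /* training-sample fields omitted */ };

// New shape of the API: the trainer reads the examples through an iterator range
// instead of a copied std::vector<Example>.
void step_start(std::vector<Example>::const_iterator batch_begin,
                std::vector<Example>::const_iterator batch_end)
{
    const auto size = batch_end - batch_begin;   // number of examples in this batch
    // ... size per-batch buffers from `size` and read *(batch_begin + b) directly ...
    (void)size;
}

void train_one_batch(std::vector<Example>& examples, std::size_t batch_size)
{
    // A view of the last batch_size examples; nothing is copied.
    const auto batch_begin = examples.end() - batch_size;
    const auto batch_end   = examples.end();
    step_start(batch_begin, batch_end);
    // Shrink the pool only after the step, so the iterators stay valid throughout.
    examples.resize(examples.size() - batch_size);
}

The same iterator-pair signature is threaded through every layer trainer, which is why all five files change in lockstep.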

File 1 of 5

@@ -215,27 +215,28 @@ namespace Eval::NNUE {
     std::vector<double> gradient_norm_local(thread_pool.size(), 0.0);
     while (examples.size() >= batch_size) {
-        std::vector<Example> batch(examples.end() - batch_size, examples.end());
-        examples.resize(examples.size() - batch_size);
-        const auto network_output = trainer->step_start(thread_pool, batch);
-        std::vector<LearnFloatType> gradients(batch.size());
+        auto batch_begin = examples.end() - batch_size;
+        auto batch_end = examples.end();
+        auto size = batch_end - batch_begin;
+        const auto network_output = trainer->step_start(thread_pool, batch_begin, batch_end);
+        std::vector<LearnFloatType> gradients(size);
         thread_pool.for_each_index_chunk_with_workers(
-            std::size_t(0), batch.size(),
+            std::size_t(0), size,
             [&](Thread& th, std::size_t offset, std::size_t count) {
                 const auto thread_id = th.thread_idx();
                 trainer->propagate(th, offset, count);
                 for (std::size_t b = offset; b < offset + count; ++b) {
+                    const auto& e = *(batch_begin + b);
                     const auto shallow = static_cast<Value>(round<std::int32_t>(
-                        batch[b].sign * network_output[b] * kPonanzaConstant));
-                    const auto discrete = batch[b].sign * batch[b].discrete_nn_eval;
-                    const auto& psv = batch[b].psv;
+                        e.sign * network_output[b] * kPonanzaConstant));
+                    const auto discrete = e.sign * e.discrete_nn_eval;
+                    const auto& psv = e.psv;
                     const double gradient =
-                        batch[b].sign * calc_grad(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
-                    gradients[b] = static_cast<LearnFloatType>(gradient * batch[b].weight);
+                        e.sign * calc_grad(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
+                    gradients[b] = static_cast<LearnFloatType>(gradient * e.weight);
                     // The discrete eval will only be valid before first backpropagation,
@@ -256,6 +257,8 @@ namespace Eval::NNUE {
         trainer->step_end(thread_pool, learning_rate);
+        examples.resize(examples.size() - size);
         collect_stats = false;
     }
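
One subtlety worth noting: examples.resize(...) moves from the top of the loop to after trainer->step_end(...). The reasoning is assumed here rather than stated in the commit, but shrinking the vector first would invalidate exactly the iterators that now describe the batch, since the batch is the tail being removed. A small sketch of the pitfall:

#include <cstddef>
#include <vector>

void wrong_order(std::vector<int>& examples, std::size_t batch_size)
{
    auto batch_begin = examples.end() - batch_size;
    examples.resize(examples.size() - batch_size);   // invalidates batch_begin
    // *batch_begin;                                 // would be undefined behaviour
}

void right_order(std::vector<int>& examples, std::size_t batch_size)
{
    auto batch_begin = examples.end() - batch_size;
    auto batch_end = examples.end();
    // ... run the whole training step over [batch_begin, batch_end) ...
    (void)batch_begin; (void)batch_end;
    examples.resize(examples.size() - batch_size);   // safe: the range is no longer used
}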

File 2 of 5

@@ -91,11 +91,13 @@ namespace Eval::NNUE {
         quantize_parameters();
     }
-    const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch)
+    const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
     {
-        if (output_.size() < kOutputDimensions * combined_batch.size()) {
-            output_.resize(kOutputDimensions * combined_batch.size());
-            gradients_.resize(kInputDimensions * combined_batch.size());
+        const auto size = batch_end - batch_begin;
+        if (output_.size() < kOutputDimensions * size) {
+            output_.resize(kOutputDimensions * size);
+            gradients_.resize(kInputDimensions * size);
         }
         if (thread_states_.size() < thread_pool.size())
@@ -103,8 +105,8 @@ namespace Eval::NNUE {
             thread_states_.resize(thread_pool.size());
         }
-        combined_batch_size_ = static_cast<IndexType>(combined_batch.size());
-        combined_batch_input_ = previous_layer_trainer_->step_start(thread_pool, combined_batch);
+        combined_batch_size_ = size;
+        combined_batch_input_ = previous_layer_trainer_->step_start(thread_pool, batch_begin, batch_end);
         auto& main_thread_state = thread_states_[0];

File 3 of 5

@@ -42,11 +42,13 @@ namespace Eval::NNUE {
         previous_layer_trainer_->initialize(rng);
     }
-    const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch)
+    const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
     {
-        if (output_.size() < kOutputDimensions * combined_batch.size()) {
-            output_.resize(kOutputDimensions * combined_batch.size());
-            gradients_.resize(kInputDimensions * combined_batch.size());
+        const auto size = batch_end - batch_begin;
+        if (output_.size() < kOutputDimensions * size) {
+            output_.resize(kOutputDimensions * size);
+            gradients_.resize(kInputDimensions * size);
         }
         if (thread_states_.size() < thread_pool.size())
@@ -54,9 +56,9 @@ namespace Eval::NNUE {
             thread_states_.resize(thread_pool.size());
         }
-        input_ = previous_layer_trainer_->step_start(thread_pool, combined_batch);
+        input_ = previous_layer_trainer_->step_start(thread_pool, batch_begin, batch_end);
-        batch_size_ = static_cast<IndexType>(combined_batch.size());
+        batch_size_ = size;
         return output_.data();
     }

File 4 of 5

@@ -89,11 +89,13 @@ namespace Eval::NNUE {
         quantize_parameters();
     }
-    const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch)
+    const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
     {
-        if (output_.size() < kOutputDimensions * combined_batch.size()) {
-            output_.resize(kOutputDimensions * combined_batch.size());
-            gradients_.resize(kOutputDimensions * combined_batch.size());
+        const auto size = batch_end - batch_begin;
+        if (output_.size() < kOutputDimensions * size) {
+            output_.resize(kOutputDimensions * size);
+            gradients_.resize(kOutputDimensions * size);
         }
         if (thread_stat_states_.size() < thread_pool.size())
@@ -106,7 +108,8 @@ namespace Eval::NNUE {
             thread_bias_states_.resize(thread_pool.size());
         }
-        batch_ = &combined_batch;
+        batch_ = &*batch_begin;
+        batch_size_ = size;
         auto& main_thread_bias_state = thread_bias_states_[0];
@@ -161,7 +164,7 @@ namespace Eval::NNUE {
                 Blas::scopy(
                     kHalfDimensions, biases_, 1, &output_[output_offset], 1
                 );
-                for (const auto& feature : (*batch_)[b].training_features[c]) {
+                for (const auto& feature : batch_[b].training_features[c]) {
                     const IndexType weights_offset = kHalfDimensions * feature.get_index();
                     Blas::saxpy(
                         kHalfDimensions, (float)feature.get_count(),
@@ -458,12 +461,12 @@ namespace Eval::NNUE {
             [&, num_threads = thread_pool.size()](Thread& th) {
                 const auto thread_index = th.thread_idx();
-                for (IndexType b = 0; b < batch_->size(); ++b) {
+                for (IndexType b = 0; b < batch_size_; ++b) {
                     const IndexType batch_offset = kOutputDimensions * b;
                     for (IndexType c = 0; c < 2; ++c) {
                         const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                        for (const auto& feature : (*batch_)[b].training_features[c]) {
+                        for (const auto& feature : batch_[b].training_features[c]) {
                             const IndexType feature_index = feature.get_index();
                             const IndexType weights_offset =
                                 kHalfDimensions * feature_index;
@@ -519,6 +522,7 @@ namespace Eval::NNUE {
     // constructor
     Trainer(LayerType* target_layer) :
         batch_(nullptr),
+        batch_size_(0),
         target_layer_(target_layer),
         biases_(),
         weights_(),
@@ -669,7 +673,8 @@ namespace Eval::NNUE {
     static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
     // mini batch
-    const std::vector<Example>* batch_;
+    const Example* batch_;
+    IndexType batch_size_;
     // layer to learn
     LayerType* const target_layer_;
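
Because the feature transformer trainer used to hold a const std::vector<Example>* member, it now keeps a raw pointer to the first example plus an explicit count. A minimal sketch of that representation (the class here is a hypothetical stand-in; only the member names follow the diff):

#include <cstdint>
#include <vector>

struct Example { int dummy; };     // placeholder for the real Example
using IndexType = std::uint32_t;   // assumption: stands in for the trainer's IndexType

class FeatureTrainerView {         // hypothetical stand-in for the real Trainer class
public:
    void step_start(std::vector<Example>::const_iterator batch_begin,
                    std::vector<Example>::const_iterator batch_end)
    {
        batch_ = &*batch_begin;    // pointer to the first element of the contiguous range
        batch_size_ = static_cast<IndexType>(batch_end - batch_begin);
    }

    void propagate() const
    {
        // Elements are now indexed as batch_[b] instead of (*batch_)[b].
        for (IndexType b = 0; b < batch_size_; ++b) {
            const Example& e = batch_[b];
            (void)e;
        }
    }

private:
    const Example* batch_ = nullptr;
    IndexType batch_size_ = 0;
};

With C++20, std::span<const Example> would express the same view more directly, but the pointer-plus-count form avoids assuming C++20 support.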

File 5 of 5

@@ -63,9 +63,12 @@ namespace Eval::NNUE {
         }
     }
-    const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch) {
-        if (gradients_.size() < kInputDimensions * combined_batch.size()) {
-            gradients_.resize(kInputDimensions * combined_batch.size());
+    const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
+    {
+        const auto size = batch_end - batch_begin;
+        if (gradients_.size() < kInputDimensions * size) {
+            gradients_.resize(kInputDimensions * size);
         }
         if (num_calls_.size() < thread_pool.size())
@@ -73,11 +76,11 @@ namespace Eval::NNUE {
             num_calls_.resize(thread_pool.size(), 0);
         }
-        batch_size_ = static_cast<IndexType>(combined_batch.size());
+        batch_size_ = size;
         if (num_calls_[0] == 0) {
             current_operation_ = Operation::kStepStart;
-            output_ = feature_transformer_trainer_->step_start(thread_pool, combined_batch);
+            output_ = feature_transformer_trainer_->step_start(thread_pool, batch_begin, batch_end);
         }
         assert(current_operation_ == Operation::kStepStart);
@@ -237,15 +240,18 @@ namespace Eval::NNUE {
         shared_input_trainer_->initialize(rng);
     }
-    const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch) {
-        if (output_.size() < kOutputDimensions * combined_batch.size()) {
-            output_.resize(kOutputDimensions * combined_batch.size());
-            gradients_.resize(kInputDimensions * combined_batch.size());
+    const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
+    {
+        const auto size = batch_end - batch_begin;
+        if (output_.size() < kOutputDimensions * size) {
+            output_.resize(kOutputDimensions * size);
+            gradients_.resize(kInputDimensions * size);
         }
-        batch_size_ = static_cast<IndexType>(combined_batch.size());
+        batch_size_ = size;
-        input_ = shared_input_trainer_->step_start(thread_pool, combined_batch);
+        input_ = shared_input_trainer_->step_start(thread_pool, batch_begin, batch_end);
         return output_.data();
     }