mirror of
https://github.com/HChaZZY/Stockfish.git
synced 2025-12-26 12:06:22 +08:00
Don't unnecessarily copy the batch part.
This commit is contained in:
@@ -215,27 +215,28 @@ namespace Eval::NNUE {
|
||||
std::vector<double> gradient_norm_local(thread_pool.size(), 0.0);
|
||||
|
||||
while (examples.size() >= batch_size) {
|
||||
std::vector<Example> batch(examples.end() - batch_size, examples.end());
|
||||
examples.resize(examples.size() - batch_size);
|
||||
|
||||
const auto network_output = trainer->step_start(thread_pool, batch);
|
||||
std::vector<LearnFloatType> gradients(batch.size());
|
||||
auto batch_begin = examples.end() - batch_size;
|
||||
auto batch_end = examples.end();
|
||||
auto size = batch_end - batch_begin;
|
||||
const auto network_output = trainer->step_start(thread_pool, batch_begin, batch_end);
|
||||
std::vector<LearnFloatType> gradients(size);
|
||||
|
||||
thread_pool.for_each_index_chunk_with_workers(
|
||||
std::size_t(0), batch.size(),
|
||||
std::size_t(0), size,
|
||||
[&](Thread& th, std::size_t offset, std::size_t count) {
|
||||
const auto thread_id = th.thread_idx();
|
||||
|
||||
trainer->propagate(th, offset, count);
|
||||
|
||||
for (std::size_t b = offset; b < offset + count; ++b) {
|
||||
const auto& e = *(batch_begin + b);
|
||||
const auto shallow = static_cast<Value>(round<std::int32_t>(
|
||||
batch[b].sign * network_output[b] * kPonanzaConstant));
|
||||
const auto discrete = batch[b].sign * batch[b].discrete_nn_eval;
|
||||
const auto& psv = batch[b].psv;
|
||||
e.sign * network_output[b] * kPonanzaConstant));
|
||||
const auto discrete = e.sign * e.discrete_nn_eval;
|
||||
const auto& psv = e.psv;
|
||||
const double gradient =
|
||||
batch[b].sign * calc_grad(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
|
||||
gradients[b] = static_cast<LearnFloatType>(gradient * batch[b].weight);
|
||||
e.sign * calc_grad(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
|
||||
gradients[b] = static_cast<LearnFloatType>(gradient * e.weight);
|
||||
|
||||
|
||||
// The discrete eval will only be valid before first backpropagation,
|
||||
@@ -256,6 +257,8 @@ namespace Eval::NNUE {
|
||||
|
||||
trainer->step_end(thread_pool, learning_rate);
|
||||
|
||||
examples.resize(examples.size() - size);
|
||||
|
||||
collect_stats = false;
|
||||
}
|
||||
|
||||
|
||||
@@ -91,11 +91,13 @@ namespace Eval::NNUE {
|
||||
quantize_parameters();
|
||||
}
|
||||
|
||||
const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch)
|
||||
const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
|
||||
{
|
||||
if (output_.size() < kOutputDimensions * combined_batch.size()) {
|
||||
output_.resize(kOutputDimensions * combined_batch.size());
|
||||
gradients_.resize(kInputDimensions * combined_batch.size());
|
||||
const auto size = batch_end - batch_begin;
|
||||
|
||||
if (output_.size() < kOutputDimensions * size) {
|
||||
output_.resize(kOutputDimensions * size);
|
||||
gradients_.resize(kInputDimensions * size);
|
||||
}
|
||||
|
||||
if (thread_states_.size() < thread_pool.size())
|
||||
@@ -103,8 +105,8 @@ namespace Eval::NNUE {
|
||||
thread_states_.resize(thread_pool.size());
|
||||
}
|
||||
|
||||
combined_batch_size_ = static_cast<IndexType>(combined_batch.size());
|
||||
combined_batch_input_ = previous_layer_trainer_->step_start(thread_pool, combined_batch);
|
||||
combined_batch_size_ = size;
|
||||
combined_batch_input_ = previous_layer_trainer_->step_start(thread_pool, batch_begin, batch_end);
|
||||
|
||||
auto& main_thread_state = thread_states_[0];
|
||||
|
||||
|
||||
@@ -42,11 +42,13 @@ namespace Eval::NNUE {
|
||||
previous_layer_trainer_->initialize(rng);
|
||||
}
|
||||
|
||||
const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch)
|
||||
const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
|
||||
{
|
||||
if (output_.size() < kOutputDimensions * combined_batch.size()) {
|
||||
output_.resize(kOutputDimensions * combined_batch.size());
|
||||
gradients_.resize(kInputDimensions * combined_batch.size());
|
||||
const auto size = batch_end - batch_begin;
|
||||
|
||||
if (output_.size() < kOutputDimensions * size) {
|
||||
output_.resize(kOutputDimensions * size);
|
||||
gradients_.resize(kInputDimensions * size);
|
||||
}
|
||||
|
||||
if (thread_states_.size() < thread_pool.size())
|
||||
@@ -54,9 +56,9 @@ namespace Eval::NNUE {
|
||||
thread_states_.resize(thread_pool.size());
|
||||
}
|
||||
|
||||
input_ = previous_layer_trainer_->step_start(thread_pool, combined_batch);
|
||||
input_ = previous_layer_trainer_->step_start(thread_pool, batch_begin, batch_end);
|
||||
|
||||
batch_size_ = static_cast<IndexType>(combined_batch.size());
|
||||
batch_size_ = size;
|
||||
|
||||
return output_.data();
|
||||
}
|
||||
|
||||
@@ -89,11 +89,13 @@ namespace Eval::NNUE {
|
||||
quantize_parameters();
|
||||
}
|
||||
|
||||
const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch)
|
||||
const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
|
||||
{
|
||||
if (output_.size() < kOutputDimensions * combined_batch.size()) {
|
||||
output_.resize(kOutputDimensions * combined_batch.size());
|
||||
gradients_.resize(kOutputDimensions * combined_batch.size());
|
||||
const auto size = batch_end - batch_begin;
|
||||
|
||||
if (output_.size() < kOutputDimensions * size) {
|
||||
output_.resize(kOutputDimensions * size);
|
||||
gradients_.resize(kOutputDimensions * size);
|
||||
}
|
||||
|
||||
if (thread_stat_states_.size() < thread_pool.size())
|
||||
@@ -106,7 +108,8 @@ namespace Eval::NNUE {
|
||||
thread_bias_states_.resize(thread_pool.size());
|
||||
}
|
||||
|
||||
batch_ = &combined_batch;
|
||||
batch_ = &*batch_begin;
|
||||
batch_size_ = size;
|
||||
|
||||
auto& main_thread_bias_state = thread_bias_states_[0];
|
||||
|
||||
@@ -161,7 +164,7 @@ namespace Eval::NNUE {
|
||||
Blas::scopy(
|
||||
kHalfDimensions, biases_, 1, &output_[output_offset], 1
|
||||
);
|
||||
for (const auto& feature : (*batch_)[b].training_features[c]) {
|
||||
for (const auto& feature : batch_[b].training_features[c]) {
|
||||
const IndexType weights_offset = kHalfDimensions * feature.get_index();
|
||||
Blas::saxpy(
|
||||
kHalfDimensions, (float)feature.get_count(),
|
||||
@@ -458,12 +461,12 @@ namespace Eval::NNUE {
|
||||
[&, num_threads = thread_pool.size()](Thread& th) {
|
||||
const auto thread_index = th.thread_idx();
|
||||
|
||||
for (IndexType b = 0; b < batch_->size(); ++b) {
|
||||
for (IndexType b = 0; b < batch_size_; ++b) {
|
||||
const IndexType batch_offset = kOutputDimensions * b;
|
||||
|
||||
for (IndexType c = 0; c < 2; ++c) {
|
||||
const IndexType output_offset = batch_offset + kHalfDimensions * c;
|
||||
for (const auto& feature : (*batch_)[b].training_features[c]) {
|
||||
for (const auto& feature : batch_[b].training_features[c]) {
|
||||
const IndexType feature_index = feature.get_index();
|
||||
const IndexType weights_offset =
|
||||
kHalfDimensions * feature_index;
|
||||
@@ -519,6 +522,7 @@ namespace Eval::NNUE {
|
||||
// constructor
|
||||
Trainer(LayerType* target_layer) :
|
||||
batch_(nullptr),
|
||||
batch_size_(0),
|
||||
target_layer_(target_layer),
|
||||
biases_(),
|
||||
weights_(),
|
||||
@@ -669,7 +673,8 @@ namespace Eval::NNUE {
|
||||
static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
|
||||
|
||||
// mini batch
|
||||
const std::vector<Example>* batch_;
|
||||
const Example* batch_;
|
||||
IndexType batch_size_;
|
||||
|
||||
// layer to learn
|
||||
LayerType* const target_layer_;
|
||||
|
||||
@@ -63,9 +63,12 @@ namespace Eval::NNUE {
|
||||
}
|
||||
}
|
||||
|
||||
const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch) {
|
||||
if (gradients_.size() < kInputDimensions * combined_batch.size()) {
|
||||
gradients_.resize(kInputDimensions * combined_batch.size());
|
||||
const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
|
||||
{
|
||||
const auto size = batch_end - batch_begin;
|
||||
|
||||
if (gradients_.size() < kInputDimensions * size) {
|
||||
gradients_.resize(kInputDimensions * size);
|
||||
}
|
||||
|
||||
if (num_calls_.size() < thread_pool.size())
|
||||
@@ -73,11 +76,11 @@ namespace Eval::NNUE {
|
||||
num_calls_.resize(thread_pool.size(), 0);
|
||||
}
|
||||
|
||||
batch_size_ = static_cast<IndexType>(combined_batch.size());
|
||||
batch_size_ = size;
|
||||
|
||||
if (num_calls_[0] == 0) {
|
||||
current_operation_ = Operation::kStepStart;
|
||||
output_ = feature_transformer_trainer_->step_start(thread_pool, combined_batch);
|
||||
output_ = feature_transformer_trainer_->step_start(thread_pool, batch_begin, batch_end);
|
||||
}
|
||||
|
||||
assert(current_operation_ == Operation::kStepStart);
|
||||
@@ -237,15 +240,18 @@ namespace Eval::NNUE {
|
||||
shared_input_trainer_->initialize(rng);
|
||||
}
|
||||
|
||||
const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch) {
|
||||
if (output_.size() < kOutputDimensions * combined_batch.size()) {
|
||||
output_.resize(kOutputDimensions * combined_batch.size());
|
||||
gradients_.resize(kInputDimensions * combined_batch.size());
|
||||
const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
|
||||
{
|
||||
const auto size = batch_end - batch_begin;
|
||||
|
||||
if (output_.size() < kOutputDimensions * size) {
|
||||
output_.resize(kOutputDimensions * size);
|
||||
gradients_.resize(kInputDimensions * size);
|
||||
}
|
||||
|
||||
batch_size_ = static_cast<IndexType>(combined_batch.size());
|
||||
batch_size_ = size;
|
||||
|
||||
input_ = shared_input_trainer_->step_start(thread_pool, combined_batch);
|
||||
input_ = shared_input_trainer_->step_start(thread_pool, batch_begin, batch_end);
|
||||
|
||||
return output_.data();
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user