diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 4104fef5..644ac9a4 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -54,6 +54,12 @@ namespace Eval::NNUE {
         const std::string& seed,
         SynchronizedRegionLogger::Region& out) {
 
+#if defined (OPENBLAS_VERSION)
+        openblas_set_num_threads(1);
+#elif defined (INTEL_MKL_VERSION)
+        mkl_set_num_threads(1);
+#endif
+
         out << "INFO (initialize_training): Initializing NN training for "
             << get_architecture_string() << std::endl;
 
@@ -199,39 +205,62 @@ namespace Eval::NNUE {
 
         bool collect_stats = verbose;
 
+        std::vector<double> abs_eval_diff_sum_local(thread_pool.size(), 0.0);
+        std::vector<double> abs_discrete_eval_sum_local(thread_pool.size(), 0.0);
+        std::vector<double> gradient_norm_local(thread_pool.size(), 0.0);
+
         while (examples.size() >= batch_size) {
             std::vector<Example> batch(examples.end() - batch_size, examples.end());
             examples.resize(examples.size() - batch_size);
 
-            const auto network_output = trainer->propagate(thread_pool, batch);
-
+            const auto network_output = trainer->step_start(thread_pool, batch);
             std::vector<LearnFloatType> gradients(batch.size());
-            for (std::size_t b = 0; b < batch.size(); ++b) {
-                const auto shallow = static_cast<Value>(round(
-                    batch[b].sign * network_output[b] * kPonanzaConstant));
-                const auto discrete = batch[b].sign * batch[b].discrete_nn_eval;
-                const auto& psv = batch[b].psv;
-                const double gradient =
-                    batch[b].sign * calc_grad(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
-                gradients[b] = static_cast<LearnFloatType>(gradient * batch[b].weight);
+
+            thread_pool.for_each_index_chunk_with_workers(
+                std::size_t(0), batch.size(),
+                [&](Thread& th, std::size_t offset, std::size_t count) {
+                    const auto thread_id = th.thread_idx();
+
+                    trainer->propagate(th, offset, count);
+
+                    for (std::size_t b = offset; b < offset + count; ++b) {
+                        const auto shallow = static_cast<Value>(round(
+                            batch[b].sign * network_output[b] * kPonanzaConstant));
+                        const auto discrete = batch[b].sign * batch[b].discrete_nn_eval;
+                        const auto& psv = batch[b].psv;
+                        const double gradient =
+                            batch[b].sign * calc_grad(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
+                        gradients[b] = static_cast<LearnFloatType>(gradient * batch[b].weight);
 
-                // The discrete eval will only be valid before first backpropagation,
-                // that is only for the first batch.
-                // Similarily we want only gradients from one batch.
-                if (collect_stats)
-                {
-                    abs_eval_diff_sum += std::abs(discrete - shallow);
-                    abs_discrete_eval_sum += std::abs(discrete);
-                    gradient_norm += std::abs(gradient);
+                        // The discrete eval is only valid before the first backpropagation,
+                        // that is, only for the first batch.
+                        // Similarly, we want gradients from only one batch.
+                        if (collect_stats)
+                        {
+                            abs_eval_diff_sum_local[thread_id] += std::abs(discrete - shallow);
+                            abs_discrete_eval_sum_local[thread_id] += std::abs(discrete);
+                            gradient_norm_local[thread_id] += std::abs(gradient);
+                        }
+                    }
+
+                    trainer->backpropagate(th, gradients.data(), offset, count);
                 }
-            }
+            );
+            thread_pool.wait_for_workers_finished();
 
-            trainer->backpropagate(thread_pool, gradients.data(), learning_rate);
+            trainer->step_end(thread_pool, learning_rate);
 
             collect_stats = false;
         }
 
+        if (verbose)
+        {
+            abs_eval_diff_sum = std::accumulate(abs_eval_diff_sum_local.begin(), abs_eval_diff_sum_local.end(), 0.0);
+            abs_discrete_eval_sum = std::accumulate(abs_discrete_eval_sum_local.begin(), abs_discrete_eval_sum_local.end(), 0.0);
+            gradient_norm = std::accumulate(gradient_norm_local.begin(), gradient_norm_local.end(), 0.0);
+        }
+
         if (verbose) {
             const double avg_abs_eval_diff = abs_eval_diff_sum / batch_size;
             const double avg_abs_discrete_eval = abs_discrete_eval_sum / batch_size;
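Note on the first hunk: openblas_set_num_threads() and mkl_set_num_threads() are the stock OpenBLAS and MKL entry points for capping library-internal threading. Pinning them to 1 keeps the BLAS backend from spawning its own workers underneath the learner's ThreadPool, which would oversubscribe cores. The guard macros OPENBLAS_VERSION and INTEL_MKL_VERSION are only defined once the corresponding vendor header (cblas.h or mkl.h) has been included earlier in the translation unit, which this patch assumes has already happened. A minimal standalone illustration of the OpenBLAS case (file name and build line are examples only):

// blas_threads_demo.cpp -- build with: g++ blas_threads_demo.cpp -lopenblas
#include <cblas.h>   // OpenBLAS header; defines OPENBLAS_VERSION and
                     // declares openblas_set_num_threads()

int main() {
#if defined (OPENBLAS_VERSION)
    // Cap BLAS-internal threading so the library cannot oversubscribe
    // cores that the application's own worker pool already keeps busy.
    openblas_set_num_threads(1);
#endif
    return 0;
}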
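Note on the second hunk: for_each_index_chunk_with_workers() is not defined in this diff. The loop is only race-free if it hands each worker a disjoint, contiguous sub-range [offset, offset + count) of the batch, so that every thread writes its own slice of gradients[] and its own *_local[thread_id] stats slot, with wait_for_workers_finished() acting as the barrier before step_end(). A self-contained sketch of that pattern using std::thread (all names here are illustrative, not the patch's ThreadPool API):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <numeric>
#include <thread>
#include <vector>

int main() {
    const std::size_t n = 10000;                       // stands in for batch.size()
    std::vector<double> gradients(n);                  // written in disjoint slices

    const std::size_t num_workers =
        std::max<std::size_t>(1, std::thread::hardware_concurrency());
    std::vector<double> norm_local(num_workers, 0.0);  // one stats slot per worker
    const std::size_t chunk = (n + num_workers - 1) / num_workers;

    std::vector<std::thread> workers;
    for (std::size_t t = 0; t < num_workers; ++t) {
        const std::size_t offset = t * chunk;
        if (offset >= n)
            break;
        const std::size_t count = std::min(chunk, n - offset);
        workers.emplace_back([&, t, offset, count] {
            // Each worker touches only [offset, offset + count): no locks needed.
            for (std::size_t b = offset; b < offset + count; ++b) {
                gradients[b] = 0.5;                    // placeholder "gradient"
                norm_local[t] += std::abs(gradients[b]);
            }
        });
    }
    for (auto& w : workers)
        w.join();                                      // like wait_for_workers_finished()

    // One single-threaded reduction at the end, mirroring the patch's
    // std::accumulate calls over the *_local vectors.
    const double gradient_norm =
        std::accumulate(norm_local.begin(), norm_local.end(), 0.0);
    return gradient_norm > 0.0 ? 0 : 1;
}

The per-thread accumulators avoid contended atomic adds in the hot loop at the cost of one cheap std::accumulate per batch, and since the slots are summed in a fixed order the reported stats stay deterministic for a given thread count and chunking.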