diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 4900ff79..450a80c6 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -52,7 +52,6 @@
 #include
 #include
 #include
-#include
 
 #if defined (_OPENMP)
 #include
@@ -98,65 +97,6 @@ namespace Learner
     // Using stockfish's WDL with win rate model instead of sigmoid
     static bool use_wdl = false;
 
-    struct Loss
-    {
-        double value() const
-        {
-            return m_loss.value;
-        }
-
-        double grad() const
-        {
-            return m_loss.grad;
-        }
-
-        uint64_t count() const
-        {
-            return m_count;
-        }
-
-        Loss& operator += (const ValueWithGrad& rhs)
-        {
-            std::unique_lock lock(m_mutex);
-
-            m_loss += rhs.abs();
-            m_count += 1;
-
-            return *this;
-        }
-
-        Loss& operator += (const Loss& rhs)
-        {
-            std::unique_lock lock(m_mutex);
-
-            m_loss += rhs.m_loss.abs();
-            m_count += rhs.m_count;
-
-            return *this;
-        }
-
-        void reset()
-        {
-            std::unique_lock lock(m_mutex);
-
-            m_loss = ValueWithGrad{ 0.0, 0.0 };
-            m_count = 0;
-        }
-
-        template
-        void print(const std::string& prefix, StreamT& s) const
-        {
-            s << " - " << prefix << "_loss = " << m_loss.value / (double)m_count << endl;
-            s << " - " << prefix << "_grad_norm = " << m_loss.grad / (double)m_count << endl;
-        }
-
-    private:
-        ValueWithGrad m_loss{ 0.0, 0.0 };
-        uint64_t m_count{0};
-        std::mutex m_mutex;
-
-    };
-
     static void append_files_from_dir(
         std::vector& filenames,
         const std::string& base_dir,
@@ -714,7 +654,6 @@
         const auto thread_id = th.thread_idx();
         auto& pos = th.rootPos;
 
-        Loss local_loss_sum{};
         std::vector> state(MAX_PLY);
 
         while(!stop_flag)
@@ -761,17 +700,8 @@
             auto pos_add_grad = [&]() {
 
                 // Evaluation value of deep search
-                const auto deep_value = (Value)ps.score;
-
                 const Value shallow_value = Eval::evaluate(pos);
 
-                const auto loss = get_loss(
-                    deep_value,
-                    (rootColor == pos.side_to_move()) ? shallow_value : -shallow_value,
-                    ps);
-
-                local_loss_sum += loss;
-
                 Eval::NNUE::add_example(pos, rootColor, shallow_value, ps, 1.0);
             };
 
@@ -809,8 +739,6 @@
                 // Since we have reached the end phase of PV, add the slope here.
                 pos_add_grad();
             }
-
-        learn_loss_sum += local_loss_sum;
     }
 
     void LearnerThink::update_weights(const PSVector& psv, uint64_t epoch)
@@ -819,7 +747,8 @@
         // should be no real issues happening since
        // the read/write phases are isolated.
         atomic_thread_fence(memory_order_seq_cst);
-        Eval::NNUE::update_parameters(Threads, epoch, params.verbose, params.learning_rate, params.max_grad, get_loss);
+        learn_loss_sum += Eval::NNUE::update_parameters(
+            Threads, epoch, params.verbose, params.learning_rate, params.max_grad, get_loss);
         atomic_thread_fence(memory_order_seq_cst);
 
         if (++save_count * params.mini_batch_size >= params.eval_save_interval)
@@ -899,11 +828,11 @@
         if (psv.size() && test_loss_sum.count() > 0)
         {
-            test_loss_sum.print("test", out);
+            test_loss_sum.print("val", out);
 
             if (learn_loss_sum.count() > 0)
             {
-                learn_loss_sum.print("learn", out);
+                learn_loss_sum.print("train", out);
             }
 
             out << " - norm = " << sum_norm << endl;
diff --git a/src/learn/learn.h b/src/learn/learn.h
index 4e8d8a02..552096b2 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -40,6 +40,8 @@ using LearnFloatType = float;
 
 #include
 #include
+#include
+#include
 
 namespace Learner
 {
@@ -69,6 +71,72 @@ namespace Learner
     void learn(std::istringstream& is);
 
     using CalcLossFunc = ValueWithGrad(Value, Value, int, int);
+
+    struct Loss
+    {
+        double value() const
+        {
+            return m_loss.value;
+        }
+
+        double grad() const
+        {
+            return m_loss.grad;
+        }
+
+        uint64_t count() const
+        {
+            return m_count;
+        }
+
+        Loss() = default;
+
+        Loss(const Loss& other) :
+            m_loss(other.m_loss),
+            m_count(other.m_count)
+        {
+        }
+
+        Loss& operator += (const ValueWithGrad& rhs)
+        {
+            std::unique_lock lock(m_mutex);
+
+            m_loss += rhs.abs();
+            m_count += 1;
+
+            return *this;
+        }
+
+        Loss& operator += (const Loss& rhs)
+        {
+            std::unique_lock lock(m_mutex);
+
+            m_loss += rhs.m_loss.abs();
+            m_count += rhs.m_count;
+
+            return *this;
+        }
+
+        void reset()
+        {
+            std::unique_lock lock(m_mutex);
+
+            m_loss = ValueWithGrad{ 0.0, 0.0 };
+            m_count = 0;
+        }
+
+        template
+        void print(const std::string& prefix, StreamT& s) const
+        {
+            s << " - " << prefix << "_loss = " << m_loss.value / (double)m_count << std::endl;
+            s << " - " << prefix << "_grad_norm = " << m_loss.grad / (double)m_count << std::endl;
+        }
+
+    private:
+        ValueWithGrad m_loss{ 0.0, 0.0 };
+        uint64_t m_count{0};
+        std::mutex m_mutex;
+    };
 }
 
 #endif // ifndef _LEARN_H_
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 8c28e4f4..3061a4f4 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -190,7 +190,7 @@ namespace Eval::NNUE {
     }
 
     // update the evaluation function parameters
-    void update_parameters(
+    Learner::Loss update_parameters(
         ThreadPool& thread_pool,
         uint64_t epoch,
         bool verbose,
@@ -212,9 +212,12 @@
         bool collect_stats = verbose;
 
+        Learner::Loss loss_sum{};
+
         std::vector abs_eval_diff_sum_local(thread_pool.size(), 0.0);
         std::vector abs_discrete_eval_sum_local(thread_pool.size(), 0.0);
         std::vector gradient_norm_local(thread_pool.size(), 0.0);
+        std::vector loss_sum_local(thread_pool.size());
 
         auto prev_batch_begin = examples.end();
         while ((long)(prev_batch_begin - examples.begin()) >= (long)batch_size)
         {
@@ -237,11 +240,11 @@
                     e.sign * network_output[b] * kPonanzaConstant));
                 const auto discrete = e.sign * e.discrete_nn_eval;
                 const auto& psv = e.psv;
-                const auto loss = calc_loss(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
-                const double gradient = std::clamp(
+                auto loss = calc_loss(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
+                loss.grad = std::clamp(
                     loss.grad * e.sign * kPonanzaConstant * e.weight,
                     -max_grad, max_grad);
-                gradients[b] = static_cast(gradient);
-
+                gradients[b] = static_cast(loss.grad);
+                loss_sum_local[thread_id] += loss;
 
                 // The discrete eval will only be valid before first backpropagation,
                 // that is only for the first batch.
@@ -250,7 +253,7 @@
                 {
                     abs_eval_diff_sum_local[thread_id] += std::abs(discrete - shallow);
                     abs_discrete_eval_sum_local[thread_id] += std::abs(discrete);
-                    gradient_norm_local[thread_id] += std::abs(gradient);
+                    gradient_norm_local[thread_id] += std::abs(loss.grad);
                 }
             }
 
@@ -277,9 +280,7 @@
             abs_eval_diff_sum = std::accumulate(abs_eval_diff_sum_local.begin(), abs_eval_diff_sum_local.end(), 0.0);
             abs_discrete_eval_sum = std::accumulate(abs_discrete_eval_sum_local.begin(), abs_discrete_eval_sum_local.end(), 0.0);
             gradient_norm = std::accumulate(gradient_norm_local.begin(), gradient_norm_local.end(), 0.0);
-        }
-
         if (verbose)
         {
             const double avg_abs_eval_diff = abs_eval_diff_sum / batch_size;
             const double avg_abs_discrete_eval = abs_discrete_eval_sum / batch_size;
@@ -300,6 +301,13 @@
         }
 
         send_messages({{"quantize_parameters"}});
+
+        for(auto& loss : loss_sum_local)
+        {
+            loss_sum += loss;
+        }
+
+        return loss_sum;
     }
 
     // Check if there are any problems with learning
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index 5beca0a7..3d9f5b31 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -33,7 +33,7 @@ namespace Eval::NNUE {
         double weight);
 
     // update the evaluation function parameters
-    void update_parameters(
+    Learner::Loss update_parameters(
         ThreadPool& thread_pool,
         uint64_t epoch,
         bool verbose,