diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index f6d374ef..21e54f18 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -48,6 +48,10 @@ namespace Eval::NNUE {
             if (receive_message("quantize_parameters", message)) {
                 quantize_parameters();
             }
+
+            if (receive_message("check_health", message)) {
+                check_health();
+            }
         }
 
         // Initialize the parameters with random numbers
@@ -145,16 +149,11 @@
                             &gradients[batch_offset], 1, biases_diff_, 1);
             }
-            cblas_saxpy(kOutputDimensions, -local_learning_rate,
-                        biases_diff_, 1, biases_, 1);
-
             cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
                         kOutputDimensions, kInputDimensions, batch_size_, 1.0,
                         gradients, kOutputDimensions,
                         batch_input_, kInputDimensions,
                         momentum_, weights_diff_, kInputDimensions);
-            cblas_saxpy(kOutputDimensions * kInputDimensions, -local_learning_rate,
-                        weights_diff_, 1, weights_, 1);
 
 #else
 
             // backpropagate
@@ -196,16 +195,22 @@
                     }
                 }
             }
+#endif
 
             for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                biases_[i] -= local_learning_rate * biases_diff_[i];
+                const double d = local_learning_rate * biases_diff_[i];
+                biases_[i] -= d;
+                abs_biases_diff_sum_ += std::abs(d);
             }
+            num_biases_diffs_ += kOutputDimensions;
 
             for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-                weights_[i] -= local_learning_rate * weights_diff_[i];
+                const double d = local_learning_rate * weights_diff_[i];
+                weights_[i] -= d;
+                abs_weights_diff_sum_ += std::abs(d);
             }
+            num_weights_diffs_ += kOutputDimensions * kInputDimensions;
 
-#endif
             previous_layer_trainer_->backpropagate(gradients_.data(), learning_rate);
         }
 
@@ -227,6 +232,30 @@
             dequantize_parameters();
         }
 
+        void reset_stats() {
+            abs_biases_diff_sum_ = 0.0;
+            abs_weights_diff_sum_ = 0.0;
+            num_biases_diffs_ = 0;
+            num_weights_diffs_ = 0;
+        }
+
+        void check_health() {
+
+            auto out = sync_region_cout.new_region();
+
+            out << "INFO (check_health):"
+                << " layer " << LayerType::kLayerIndex
+                << " - " << LayerType::get_name()
+                << std::endl;
+
+            out << " - avg_abs_bias_diff = " << abs_biases_diff_sum_ / num_biases_diffs_ << std::endl;
+            out << " - avg_abs_weight_diff = " << abs_weights_diff_sum_ / num_weights_diffs_ << std::endl;
+
+            out.unlock();
+
+            reset_stats();
+        }
+
         // Weight saturation and parameterization
         void quantize_parameters() {
             for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
@@ -270,6 +299,8 @@
                       static_cast<LearnFloatType>(0.0));
             std::fill(std::begin(weights_diff_), std::end(weights_diff_),
                       static_cast<LearnFloatType>(0.0));
+
+            reset_stats();
         }
 
         // number of input/output dimensions
@@ -296,6 +327,11 @@
         // number of samples in mini-batch
         IndexType batch_size_;
 
+        double abs_biases_diff_sum_;
+        double abs_weights_diff_sum_;
+        uint64_t num_biases_diffs_;
+        uint64_t num_weights_diffs_;
+
         // Input mini batch
         const LearnFloatType* batch_input_;
 
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index f9bbd833..57e9bac4 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -70,10 +70,12 @@
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType i = 0; i < kOutputDimensions; ++i) {
                     const IndexType index = batch_offset + i;
-                    gradients_[index] = gradients[index] *
-                        (output_[index] > kZero) * (output_[index] < kOne);
+                    const bool clipped = (output_[index] <= kZero) | (output_[index] >= kOne);
+                    gradients_[index] = gradients[index] * !clipped;
+                    num_clipped_ += clipped;
                 }
             }
+            num_total_ += batch_size_ * kOutputDimensions;
 
             previous_layer_trainer_->backpropagate(gradients_.data(), learning_rate);
         }
@@ -86,10 +88,17 @@
                 &target_layer->previous_layer_, ft)),
             target_layer_(target_layer) {
 
+            reset_stats();
+        }
+
+        void reset_stats() {
             std::fill(std::begin(min_activations_), std::end(min_activations_),
                       std::numeric_limits<LearnFloatType>::max());
             std::fill(std::begin(max_activations_), std::end(max_activations_),
                       std::numeric_limits<LearnFloatType>::lowest());
+
+            num_clipped_ = 0;
+            num_total_ = 0;
         }
 
         // Check if there are any problems with learning
@@ -111,12 +120,12 @@
                 << " , smallest max activation = " << smallest_max_activation
                 << std::endl;
 
+            out << " - clipped " << static_cast<double>(num_clipped_) / num_total_ * 100.0 << "% of outputs"
+                << std::endl;
+
             out.unlock();
 
-            std::fill(std::begin(min_activations_), std::end(min_activations_),
-                      std::numeric_limits<LearnFloatType>::max());
-            std::fill(std::begin(max_activations_), std::end(max_activations_),
-                      std::numeric_limits<LearnFloatType>::lowest());
+            reset_stats();
         }
 
         // number of input/output dimensions
@@ -130,6 +139,9 @@
         // number of samples in mini-batch
         IndexType batch_size_;
 
+        IndexType num_clipped_;
+        IndexType num_total_;
+
         // Trainer of the previous layer
         const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
 
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index ffde6eba..869ceb85 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -153,10 +153,12 @@
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType i = 0; i < kOutputDimensions; ++i) {
                     const IndexType index = batch_offset + i;
-                    gradients_[index] = gradients[index] *
-                        ((output_[index] > kZero) * (output_[index] < kOne));
+                    const bool clipped = (output_[index] <= kZero) | (output_[index] >= kOne);
+                    gradients_[index] = gradients[index] * !clipped;
+                    num_clipped_ += clipped;
                 }
             }
+            num_total_ += batch_->size() * kOutputDimensions;
 
             // Since the weight matrix updates only the columns corresponding to the features that appeared in the input,
             // Correct the learning rate and adjust the scale without using momentum
@@ -261,14 +263,6 @@
             momentum_(0.2),
             learning_rate_scale_(1.0) {
 
-            min_pre_activation_ = std::numeric_limits<LearnFloatType>::max();
-            max_pre_activation_ = std::numeric_limits<LearnFloatType>::lowest();
-
-            std::fill(std::begin(min_activations_), std::end(min_activations_),
-                      std::numeric_limits<LearnFloatType>::max());
-            std::fill(std::begin(max_activations_), std::end(max_activations_),
-                      std::numeric_limits<LearnFloatType>::lowest());
-
             dequantize_parameters();
         }
 
@@ -299,6 +293,19 @@
             }
         }
 
+        void reset_stats() {
+            min_pre_activation_ = std::numeric_limits<LearnFloatType>::max();
+            max_pre_activation_ = std::numeric_limits<LearnFloatType>::lowest();
+
+            std::fill(std::begin(min_activations_), std::end(min_activations_),
+                      std::numeric_limits<LearnFloatType>::max());
+            std::fill(std::begin(max_activations_), std::end(max_activations_),
+                      std::numeric_limits<LearnFloatType>::lowest());
+
+            num_clipped_ = 0;
+            num_total_ = 0;
+        }
+
         // read parameterized integer
         void dequantize_parameters() {
             for (IndexType i = 0; i < kHalfDimensions; ++i) {
@@ -314,6 +321,8 @@
             }
             std::fill(std::begin(biases_diff_), std::end(biases_diff_),
                       +kZero);
+
+            reset_stats();
         }
 
         // Set the weight corresponding to the feature that does not appear in the learning data to 0
@@ -361,12 +370,12 @@
                 << " , smallest max activation = " << smallest_max_activation
                 << std::endl;
 
+            out << " - clipped " << static_cast<double>(num_clipped_) / num_total_ * 100.0 << "% of outputs"
+                << std::endl;
+
             out.unlock();
 
-            std::fill(std::begin(min_activations_), std::end(min_activations_),
-                      std::numeric_limits<LearnFloatType>::max());
-            std::fill(std::begin(max_activations_), std::end(max_activations_),
-                      std::numeric_limits<LearnFloatType>::lowest());
+            reset_stats();
         }
 
         // number of input/output dimensions
@@ -391,6 +400,9 @@
         // layer to learn
         LayerType* const target_layer_;
 
+        IndexType num_clipped_;
+        IndexType num_total_;
+
         // parameter
         alignas(kCacheLineSize) LearnFloatType biases_[kHalfDimensions];
         alignas(kCacheLineSize)