Additional output from layers during training.
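
In short, each trainer layer now accumulates two kinds of statistics during backpropagation: the summed absolute magnitude of the parameter updates it applies (biases and weights separately), and how many outputs of the clamped layers were clipped and therefore received zero gradient. A new "check_health" message makes each layer print the averages and reset its counters. The standalone sketch below (hypothetical names, not code from this commit) distills that accumulate/report/reset pattern:

#include <cmath>
#include <cstdint>
#include <iostream>

// Illustrative only: the running-tally pattern the diff adds to each layer.
// The real trainers keep per-layer members (abs_biases_diff_sum_,
// num_biases_diffs_, ...) and print them from check_health().
struct UpdateStats {
    double abs_diff_sum = 0.0;    // sum of |applied parameter update|
    std::uint64_t num_diffs = 0;  // number of parameters updated so far

    void accumulate(double applied_delta) {
        abs_diff_sum += std::abs(applied_delta);
        ++num_diffs;
    }

    // Report the average update magnitude since the last report, then reset,
    // so every report covers only the most recent training interval.
    void report_and_reset(const char* label) {
        std::cout << " - avg_abs_" << label << "_diff = "
                  << (num_diffs ? abs_diff_sum / num_diffs : 0.0) << std::endl;
        abs_diff_sum = 0.0;
        num_diffs = 0;
    }
};
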
@@ -48,6 +48,10 @@ namespace Eval::NNUE {
      if (receive_message("quantize_parameters", message)) {
        quantize_parameters();
      }
+
+     if (receive_message("check_health", message)) {
+       check_health();
+     }
    }

    // Initialize the parameters with random numbers
@@ -145,16 +149,11 @@ namespace Eval::NNUE {
                    &gradients[batch_offset], 1, biases_diff_, 1);
      }

-     cblas_saxpy(kOutputDimensions, -local_learning_rate,
-                 biases_diff_, 1, biases_, 1);
-
      cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
                  kOutputDimensions, kInputDimensions, batch_size_, 1.0,
                  gradients, kOutputDimensions,
                  batch_input_, kInputDimensions,
                  momentum_, weights_diff_, kInputDimensions);
-     cblas_saxpy(kOutputDimensions * kInputDimensions, -local_learning_rate,
-                 weights_diff_, 1, weights_, 1);

#else
      // backpropagate
@@ -196,16 +195,22 @@ namespace Eval::NNUE {
          }
        }
      }
#endif

      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-       biases_[i] -= local_learning_rate * biases_diff_[i];
+       const double d = local_learning_rate * biases_diff_[i];
+       biases_[i] -= d;
+       abs_biases_diff_sum_ += std::abs(d);
      }
+     num_biases_diffs_ += kOutputDimensions;

      for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-       weights_[i] -= local_learning_rate * weights_diff_[i];
+       const double d = local_learning_rate * weights_diff_[i];
+       weights_[i] -= d;
+       abs_weights_diff_sum_ += std::abs(d);
      }
+     num_weights_diffs_ += kOutputDimensions * kInputDimensions;

#endif
      previous_layer_trainer_->backpropagate(gradients_.data(), learning_rate);
    }
@@ -227,6 +232,30 @@ namespace Eval::NNUE {
      dequantize_parameters();
    }

+   void reset_stats() {
+     abs_biases_diff_sum_ = 0.0;
+     abs_weights_diff_sum_ = 0.0;
+     num_biases_diffs_ = 0;
+     num_weights_diffs_ = 0;
+   }
+
+   void check_health() {
+
+     auto out = sync_region_cout.new_region();
+
+     out << "INFO (check_health):"
+         << " layer " << LayerType::kLayerIndex
+         << " - " << LayerType::get_name()
+         << std::endl;
+
+     out << " - avg_abs_bias_diff = " << abs_biases_diff_sum_ / num_biases_diffs_ << std::endl;
+     out << " - avg_abs_weight_diff = " << abs_weights_diff_sum_ / num_weights_diffs_ << std::endl;
+
+     out.unlock();
+
+     reset_stats();
+   }
+
    // Weight saturation and parameterization
    void quantize_parameters() {
      for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
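
For orientation, the check_health() member added above prints one short block per layer when the trainer receives a "check_health" message; with a placeholder layer index, layer name, and values (all illustrative, not taken from this commit) the output looks roughly like:

    INFO (check_health): layer 5 - <layer name>
     - avg_abs_bias_diff = 1.3e-05
     - avg_abs_weight_diff = 2.7e-06
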
@@ -270,6 +299,8 @@ namespace Eval::NNUE {
                static_cast<LearnFloatType>(0.0));
      std::fill(std::begin(weights_diff_), std::end(weights_diff_),
                static_cast<LearnFloatType>(0.0));
+
+     reset_stats();
    }

    // number of input/output dimensions
@@ -296,6 +327,11 @@ namespace Eval::NNUE {
    // number of samples in mini-batch
    IndexType batch_size_;

+   double abs_biases_diff_sum_;
+   double abs_weights_diff_sum_;
+   uint64_t num_biases_diffs_;
+   uint64_t num_weights_diffs_;
+
    // Input mini batch
    const LearnFloatType* batch_input_;

@@ -70,10 +70,12 @@ namespace Eval::NNUE {
        const IndexType batch_offset = kOutputDimensions * b;
        for (IndexType i = 0; i < kOutputDimensions; ++i) {
          const IndexType index = batch_offset + i;
-         gradients_[index] = gradients[index] *
-             (output_[index] > kZero) * (output_[index] < kOne);
+         const bool clipped = (output_[index] <= kZero) | (output_[index] >= kOne);
+         gradients_[index] = gradients[index] * !clipped;
+         num_clipped_ += clipped;
        }
      }
+     num_total_ += batch_size_ * kOutputDimensions;

      previous_layer_trainer_->backpropagate(gradients_.data(), learning_rate);
    }
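
The hunk above replaces the product-of-comparisons gradient mask with an explicit clipped flag, so the trainer can also count how many outputs sit at the clamp boundaries; those outputs contribute zero gradient, and a persistently high clipped fraction signals saturated activations. A minimal standalone sketch of the same counting predicate (hypothetical helper, assuming the [kZero, kOne] clamp used in the diff):

#include <cstdint>
#include <vector>

// Illustrative only: count outputs pinned at the clamp boundaries of a
// clipped activation; their incoming gradient is zeroed during backpropagation.
std::uint64_t count_clipped(const std::vector<float>& outputs,
                            float zero = 0.0f, float one = 1.0f) {
    std::uint64_t clipped = 0;
    for (float v : outputs)
        clipped += (v <= zero) | (v >= one);  // same predicate as the diff
    return clipped;
}

// check_health() then reports 100.0 * num_clipped_ / num_total_ as a percentage.
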
@@ -86,10 +88,17 @@ namespace Eval::NNUE {
            &target_layer->previous_layer_, ft)),
      target_layer_(target_layer) {

+     reset_stats();
+   }
+
+   void reset_stats() {
      std::fill(std::begin(min_activations_), std::end(min_activations_),
                std::numeric_limits<LearnFloatType>::max());
      std::fill(std::begin(max_activations_), std::end(max_activations_),
                std::numeric_limits<LearnFloatType>::lowest());
+
+     num_clipped_ = 0;
+     num_total_ = 0;
    }

    // Check if there are any problems with learning
@@ -111,12 +120,12 @@ namespace Eval::NNUE {
          << " , smallest max activation = " << smallest_max_activation
          << std::endl;

+     out << " - clipped " << static_cast<double>(num_clipped_) / num_total_ * 100.0 << "% of outputs"
+         << std::endl;
+
      out.unlock();

-     std::fill(std::begin(min_activations_), std::end(min_activations_),
-               std::numeric_limits<LearnFloatType>::max());
-     std::fill(std::begin(max_activations_), std::end(max_activations_),
-               std::numeric_limits<LearnFloatType>::lowest());
+     reset_stats();
    }

    // number of input/output dimensions
@@ -130,6 +139,9 @@ namespace Eval::NNUE {
    // number of samples in mini-batch
    IndexType batch_size_;

+   IndexType num_clipped_;
+   IndexType num_total_;
+
    // Trainer of the previous layer
    const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;

@@ -153,10 +153,12 @@ namespace Eval::NNUE {
        const IndexType batch_offset = kOutputDimensions * b;
        for (IndexType i = 0; i < kOutputDimensions; ++i) {
          const IndexType index = batch_offset + i;
-         gradients_[index] = gradients[index] *
-             ((output_[index] > kZero) * (output_[index] < kOne));
+         const bool clipped = (output_[index] <= kZero) | (output_[index] >= kOne);
+         gradients_[index] = gradients[index] * !clipped;
+         num_clipped_ += clipped;
        }
      }
+     num_total_ += batch_->size() * kOutputDimensions;

      // Since the weight matrix updates only the columns corresponding to the features that appeared in the input,
      // Correct the learning rate and adjust the scale without using momentum
@@ -261,14 +263,6 @@ namespace Eval::NNUE {
          momentum_(0.2),
          learning_rate_scale_(1.0) {

-     min_pre_activation_ = std::numeric_limits<LearnFloatType>::max();
-     max_pre_activation_ = std::numeric_limits<LearnFloatType>::lowest();
-
-     std::fill(std::begin(min_activations_), std::end(min_activations_),
-               std::numeric_limits<LearnFloatType>::max());
-     std::fill(std::begin(max_activations_), std::end(max_activations_),
-               std::numeric_limits<LearnFloatType>::lowest());
-
      dequantize_parameters();
    }

@@ -299,6 +293,19 @@ namespace Eval::NNUE {
      }
    }

+   void reset_stats() {
+     min_pre_activation_ = std::numeric_limits<LearnFloatType>::max();
+     max_pre_activation_ = std::numeric_limits<LearnFloatType>::lowest();
+
+     std::fill(std::begin(min_activations_), std::end(min_activations_),
+               std::numeric_limits<LearnFloatType>::max());
+     std::fill(std::begin(max_activations_), std::end(max_activations_),
+               std::numeric_limits<LearnFloatType>::lowest());
+
+     num_clipped_ = 0;
+     num_total_ = 0;
+   }
+
    // read parameterized integer
    void dequantize_parameters() {
      for (IndexType i = 0; i < kHalfDimensions; ++i) {
@@ -314,6 +321,8 @@ namespace Eval::NNUE {
      }

      std::fill(std::begin(biases_diff_), std::end(biases_diff_), +kZero);
+
+     reset_stats();
    }

    // Set the weight corresponding to the feature that does not appear in the learning data to 0
@@ -361,12 +370,12 @@ namespace Eval::NNUE {
          << " , smallest max activation = " << smallest_max_activation
          << std::endl;

+     out << " - clipped " << static_cast<double>(num_clipped_) / num_total_ * 100.0 << "% of outputs"
+         << std::endl;
+
      out.unlock();

-     std::fill(std::begin(min_activations_), std::end(min_activations_),
-               std::numeric_limits<LearnFloatType>::max());
-     std::fill(std::begin(max_activations_), std::end(max_activations_),
-               std::numeric_limits<LearnFloatType>::lowest());
+     reset_stats();
    }

    // number of input/output dimensions
@@ -391,6 +400,9 @@ namespace Eval::NNUE {
    // layer to learn
    LayerType* const target_layer_;

+   IndexType num_clipped_;
+   IndexType num_total_;
+
    // parameter
    alignas(kCacheLineSize) LearnFloatType biases_[kHalfDimensions];
    alignas(kCacheLineSize)