From c286f9cd7d875aa4bf61ae12998e68f2e8fcb1a4 Mon Sep 17 00:00:00 2001 From: Tomasz Sobczyk Date: Wed, 14 Oct 2020 21:26:03 +0200 Subject: [PATCH] Cleanup trainer. --- src/nnue/trainer/trainer.h | 205 +++--- src/nnue/trainer/trainer_affine_transform.h | 578 +++++++-------- src/nnue/trainer/trainer_clipped_relu.h | 228 +++--- .../trainer/trainer_feature_transformer.h | 661 +++++++++--------- src/nnue/trainer/trainer_input_slice.h | 432 ++++++------ src/nnue/trainer/trainer_sum.h | 312 +++++---- 6 files changed, 1263 insertions(+), 1153 deletions(-) diff --git a/src/nnue/trainer/trainer.h b/src/nnue/trainer/trainer.h index 659863ad..7d9b66ee 100644 --- a/src/nnue/trainer/trainer.h +++ b/src/nnue/trainer/trainer.h @@ -1,121 +1,134 @@ -// Common header of class template for learning NNUE evaluation function - -#ifndef _NNUE_TRAINER_H_ +#ifndef _NNUE_TRAINER_H_ #define _NNUE_TRAINER_H_ -#include "../nnue_common.h" -#include "../features/index_list.h" +#include "nnue/nnue_common.h" +#include "nnue/features/index_list.h" #include + #if defined(USE_BLAS) static_assert(std::is_same::value, ""); #include #endif -namespace Eval { +// Common header of class template for learning NNUE evaluation function +namespace Eval::NNUE { -namespace NNUE { + // Ponanza constant used in the relation between evaluation value and winning percentage + constexpr double kPonanzaConstant = 600.0; -// Ponanza constant used in the relation between evaluation value and winning percentage -constexpr double kPonanzaConstant = 600.0; + // Class that represents one index of learning feature + class TrainingFeature { + using StorageType = std::uint32_t; + static_assert(std::is_unsigned::value, ""); -// Class that represents one index of learning feature -class TrainingFeature { - using StorageType = std::uint32_t; - static_assert(std::is_unsigned::value, ""); + public: + static constexpr std::uint32_t kIndexBits = 24; - public: - static constexpr std::uint32_t kIndexBits = 24; - static_assert(kIndexBits < std::numeric_limits::digits, ""); - static constexpr std::uint32_t kCountBits = - std::numeric_limits::digits - kIndexBits; + static_assert(kIndexBits < std::numeric_limits::digits, ""); - explicit TrainingFeature(IndexType index) : - index_and_count_((index << kCountBits) | 1) { - assert(index < (1 << kIndexBits)); - } - TrainingFeature& operator+=(const TrainingFeature& other) { - assert(other.GetIndex() == GetIndex()); - assert(other.GetCount() + GetCount() < (1 << kCountBits)); - index_and_count_ += other.GetCount(); - return *this; - } - IndexType GetIndex() const { - return static_cast(index_and_count_ >> kCountBits); - } - void ShiftIndex(IndexType offset) { - assert(GetIndex() + offset < (1 << kIndexBits)); - index_and_count_ += offset << kCountBits; - } - IndexType GetCount() const { - return static_cast(index_and_count_ & ((1 << kCountBits) - 1)); - } - bool operator<(const TrainingFeature& other) const { - return index_and_count_ < other.index_and_count_; - } + static constexpr std::uint32_t kCountBits = + std::numeric_limits::digits - kIndexBits; - private: - StorageType index_and_count_; -}; + explicit TrainingFeature(IndexType index) : + index_and_count_((index << kCountBits) | 1) { -// Structure that represents one sample of training data -struct Example { - std::vector training_features[2]; - Learner::PackedSfenValue psv; - int sign; - double weight; -}; + assert(index < (1 << kIndexBits)); + } -// Message used for setting hyperparameters -struct Message { - Message(const std::string& message_name, const std::string& message_value = ""): - name(message_name), value(message_value), num_peekers(0), num_receivers(0) {} - const std::string name; - const std::string value; - std::uint32_t num_peekers; - std::uint32_t num_receivers; -}; + TrainingFeature& operator+=(const TrainingFeature& other) { + assert(other.GetIndex() == GetIndex()); + assert(other.GetCount() + GetCount() < (1 << kCountBits)); + index_and_count_ += other.GetCount(); + return *this; + } -// determine whether to accept the message -bool ReceiveMessage(const std::string& name, Message* message) { - const auto subscript = "[" + std::to_string(message->num_peekers) + "]"; - if (message->name.substr(0, name.size() + 1) == name + "[") { - ++message->num_peekers; - } - if (message->name == name || message->name == name + subscript) { - ++message->num_receivers; - return true; - } - return false; -} + IndexType GetIndex() const { + return static_cast(index_and_count_ >> kCountBits); + } -// split the string -std::vector Split(const std::string& input, char delimiter) { - std::istringstream stream(input); - std::string field; - std::vector fields; - while (std::getline(stream, field, delimiter)) { - fields.push_back(field); - } - return fields; -} + void ShiftIndex(IndexType offset) { + assert(GetIndex() + offset < (1 << kIndexBits)); + index_and_count_ += offset << kCountBits; + } -// round a floating point number to an integer -template -IntType Round(double value) { - return static_cast(std::floor(value + 0.5)); -} + IndexType GetCount() const { + return static_cast(index_and_count_ & ((1 << kCountBits) - 1)); + } -// make_shared with alignment -template -std::shared_ptr MakeAlignedSharedPtr(ArgumentTypes&&... arguments) { - const auto ptr = new(std_aligned_alloc(alignof(T), sizeof(T))) - T(std::forward(arguments)...); - return std::shared_ptr(ptr, AlignedDeleter()); -} + bool operator<(const TrainingFeature& other) const { + return index_and_count_ < other.index_and_count_; + } -} // namespace NNUE + private: + StorageType index_and_count_; + }; -} // namespace Eval + // Structure that represents one sample of training data + struct Example { + std::vector training_features[2]; + Learner::PackedSfenValue psv; + int sign; + double weight; + }; + + // Message used for setting hyperparameters + struct Message { + Message(const std::string& message_name, const std::string& message_value = "") : + name(message_name), value(message_value), num_peekers(0), num_receivers(0) + { + } + + const std::string name; + const std::string value; + std::uint32_t num_peekers; + std::uint32_t num_receivers; + }; + + // determine whether to accept the message + bool ReceiveMessage(const std::string& name, Message* message) { + const auto subscript = "[" + std::to_string(message->num_peekers) + "]"; + + if (message->name.substr(0, name.size() + 1) == name + "[") { + ++message->num_peekers; + } + + if (message->name == name || message->name == name + subscript) { + ++message->num_receivers; + return true; + } + + return false; + } + + // split the string + std::vector Split(const std::string& input, char delimiter) { + std::istringstream stream(input); + std::string field; + std::vector fields; + + while (std::getline(stream, field, delimiter)) { + fields.push_back(field); + } + + return fields; + } + + // round a floating point number to an integer + template + IntType Round(double value) { + return static_cast(std::floor(value + 0.5)); + } + + // make_shared with alignment + template + std::shared_ptr MakeAlignedSharedPtr(ArgumentTypes&&... arguments) { + const auto ptr = new(std_aligned_alloc(alignof(T), sizeof(T))) + T(std::forward(arguments)...); + + return std::shared_ptr(ptr, AlignedDeleter()); + } + +} // namespace Eval::NNUE #endif diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h index 415b7dc8..dd70b8fb 100644 --- a/src/nnue/trainer/trainer_affine_transform.h +++ b/src/nnue/trainer/trainer_affine_transform.h @@ -1,297 +1,329 @@ -// Specialization of NNUE evaluation function learning class template for AffineTransform - -#ifndef _NNUE_TRAINER_AFFINE_TRANSFORM_H_ +#ifndef _NNUE_TRAINER_AFFINE_TRANSFORM_H_ #define _NNUE_TRAINER_AFFINE_TRANSFORM_H_ -#include "../../learn/learn.h" -#include "../layers/affine_transform.h" #include "trainer.h" +#include "learn/learn.h" + +#include "nnue/layers/affine_transform.h" + #include -namespace Eval { +// Specialization of NNUE evaluation function learning class template for AffineTransform +namespace Eval::NNUE { -namespace NNUE { + // Learning: Affine transformation layer + template + class Trainer> { + private: + // Type of layer to learn + using LayerType = Layers::AffineTransform; -// Learning: Affine transformation layer -template -class Trainer> { - private: - // Type of layer to learn - using LayerType = Layers::AffineTransform; + public: + // factory function + static std::shared_ptr Create( + LayerType* target_layer, FeatureTransformer* ft) { - public: - // factory function - static std::shared_ptr Create( - LayerType* target_layer, FeatureTransformer* ft) { - return std::shared_ptr( - new Trainer(target_layer, ft)); - } - - // Set options such as hyperparameters - void SendMessage(Message* message) { - previous_layer_trainer_->SendMessage(message); - if (ReceiveMessage("momentum", message)) { - momentum_ = static_cast(std::stod(message->value)); - } - if (ReceiveMessage("learning_rate_scale", message)) { - learning_rate_scale_ = - static_cast(std::stod(message->value)); - } - if (ReceiveMessage("reset", message)) { - DequantizeParameters(); - } - if (ReceiveMessage("quantize_parameters", message)) { - QuantizeParameters(); - } - } - - // Initialize the parameters with random numbers - template - void Initialize(RNG& rng) { - previous_layer_trainer_->Initialize(rng); - if (kIsOutputLayer) { - // Initialize output layer with 0 - std::fill(std::begin(biases_), std::end(biases_), - static_cast(0.0)); - std::fill(std::begin(weights_), std::end(weights_), - static_cast(0.0)); - } else { - // Assuming that the input distribution is unit-mean 0.5, equal variance, - // Initialize the output distribution so that each unit has a mean of 0.5 and the same variance as the input - const double kSigma = 1.0 / std::sqrt(kInputDimensions); - auto distribution = std::normal_distribution(0.0, kSigma); - for (IndexType i = 0; i < kOutputDimensions; ++i) { - double sum = 0.0; - for (IndexType j = 0; j < kInputDimensions; ++j) { - const auto weight = static_cast(distribution(rng)); - weights_[kInputDimensions * i + j] = weight; - sum += weight; + return std::shared_ptr( + new Trainer(target_layer, ft)); } - biases_[i] = static_cast(0.5 - 0.5 * sum); - } - } - QuantizeParameters(); - } - // forward propagation - const LearnFloatType* Propagate(const std::vector& batch) { - if (output_.size() < kOutputDimensions * batch.size()) { - output_.resize(kOutputDimensions * batch.size()); - gradients_.resize(kInputDimensions * batch.size()); - } - batch_size_ = static_cast(batch.size()); - batch_input_ = previous_layer_trainer_->Propagate(batch); + // Set options such as hyperparameters + void SendMessage(Message* message) { + previous_layer_trainer_->SendMessage(message); + + if (ReceiveMessage("momentum", message)) { + momentum_ = static_cast(std::stod(message->value)); + } + + if (ReceiveMessage("learning_rate_scale", message)) { + learning_rate_scale_ = + static_cast(std::stod(message->value)); + } + + if (ReceiveMessage("reset", message)) { + DequantizeParameters(); + } + + if (ReceiveMessage("quantize_parameters", message)) { + QuantizeParameters(); + } + } + + // Initialize the parameters with random numbers + template + void Initialize(RNG& rng) { + previous_layer_trainer_->Initialize(rng); + + if (kIsOutputLayer) { + // Initialize output layer with 0 + std::fill(std::begin(biases_), std::end(biases_), + static_cast(0.0)); + std::fill(std::begin(weights_), std::end(weights_), + static_cast(0.0)); + } + else { + // Assuming that the input distribution is unit-mean 0.5, equal variance, + // Initialize the output distribution so that each unit has a mean of 0.5 and the same variance as the input + const double kSigma = 1.0 / std::sqrt(kInputDimensions); + auto distribution = std::normal_distribution(0.0, kSigma); + + for (IndexType i = 0; i < kOutputDimensions; ++i) { + double sum = 0.0; + for (IndexType j = 0; j < kInputDimensions; ++j) { + const auto weight = static_cast(distribution(rng)); + weights_[kInputDimensions * i + j] = weight; + sum += weight; + } + + biases_[i] = static_cast(0.5 - 0.5 * sum); + } + } + + QuantizeParameters(); + } + + // forward propagation + const LearnFloatType* Propagate(const std::vector& batch) { + if (output_.size() < kOutputDimensions * batch.size()) { + output_.resize(kOutputDimensions * batch.size()); + gradients_.resize(kInputDimensions * batch.size()); + } + + batch_size_ = static_cast(batch.size()); + batch_input_ = previous_layer_trainer_->Propagate(batch); #if defined(USE_BLAS) - for (IndexType b = 0; b < batch_size_; ++b) { - const IndexType batch_offset = kOutputDimensions * b; - cblas_scopy(kOutputDimensions, biases_, 1, &output_[batch_offset], 1); - } - cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, - kOutputDimensions, batch_size_, kInputDimensions, 1.0, - weights_, kInputDimensions, - batch_input_, kInputDimensions, - 1.0, &output_[0], kOutputDimensions); -#else - for (IndexType b = 0; b < batch_size_; ++b) { - const IndexType input_batch_offset = kInputDimensions * b; - const IndexType output_batch_offset = kOutputDimensions * b; - for (IndexType i = 0; i < kOutputDimensions; ++i) { - double sum = biases_[i]; - for (IndexType j = 0; j < kInputDimensions; ++j) { - const IndexType index = kInputDimensions * i + j; - sum += weights_[index] * batch_input_[input_batch_offset + j]; - } - output_[output_batch_offset + i] = static_cast(sum); - } - } -#endif - return output_.data(); - } + for (IndexType b = 0; b < batch_size_; ++b) { + const IndexType batch_offset = kOutputDimensions * b; + cblas_scopy(kOutputDimensions, biases_, 1, &output_[batch_offset], 1); + } + + cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, + kOutputDimensions, batch_size_, kInputDimensions, 1.0, + weights_, kInputDimensions, + batch_input_, kInputDimensions, + 1.0, &output_[0], kOutputDimensions); +#else + for (IndexType b = 0; b < batch_size_; ++b) { + const IndexType input_batch_offset = kInputDimensions * b; + const IndexType output_batch_offset = kOutputDimensions * b; + for (IndexType i = 0; i < kOutputDimensions; ++i) { + double sum = biases_[i]; + for (IndexType j = 0; j < kInputDimensions; ++j) { + const IndexType index = kInputDimensions * i + j; + sum += weights_[index] * batch_input_[input_batch_offset + j]; + } + + output_[output_batch_offset + i] = static_cast(sum); + } + } + +#endif + return output_.data(); + } + + // backpropagation + void Backpropagate(const LearnFloatType* gradients, + LearnFloatType learning_rate) { + + const LearnFloatType local_learning_rate = + learning_rate * learning_rate_scale_; - // backpropagation - void Backpropagate(const LearnFloatType* gradients, - LearnFloatType learning_rate) { - const LearnFloatType local_learning_rate = - learning_rate * learning_rate_scale_; #if defined(USE_BLAS) - // backpropagate - cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, - kInputDimensions, batch_size_, kOutputDimensions, 1.0, - weights_, kInputDimensions, - gradients, kOutputDimensions, - 0.0, &gradients_[0], kInputDimensions); - // update - cblas_sscal(kOutputDimensions, momentum_, biases_diff_, 1); - for (IndexType b = 0; b < batch_size_; ++b) { - const IndexType batch_offset = kOutputDimensions * b; - cblas_saxpy(kOutputDimensions, 1.0, - &gradients[batch_offset], 1, biases_diff_, 1); - } - cblas_saxpy(kOutputDimensions, -local_learning_rate, - biases_diff_, 1, biases_, 1); - cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans, - kOutputDimensions, kInputDimensions, batch_size_, 1.0, - gradients, kOutputDimensions, - batch_input_, kInputDimensions, - momentum_, weights_diff_, kInputDimensions); - cblas_saxpy(kOutputDimensions * kInputDimensions, -local_learning_rate, - weights_diff_, 1, weights_, 1); + // backpropagate + cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, + kInputDimensions, batch_size_, kOutputDimensions, 1.0, + weights_, kInputDimensions, + gradients, kOutputDimensions, + 0.0, &gradients_[0], kInputDimensions); + + // update + cblas_sscal(kOutputDimensions, momentum_, biases_diff_, 1); + for (IndexType b = 0; b < batch_size_; ++b) { + const IndexType batch_offset = kOutputDimensions * b; + cblas_saxpy(kOutputDimensions, 1.0, + &gradients[batch_offset], 1, biases_diff_, 1); + } + + cblas_saxpy(kOutputDimensions, -local_learning_rate, + biases_diff_, 1, biases_, 1); + + cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans, + kOutputDimensions, kInputDimensions, batch_size_, 1.0, + gradients, kOutputDimensions, + batch_input_, kInputDimensions, + momentum_, weights_diff_, kInputDimensions); + cblas_saxpy(kOutputDimensions * kInputDimensions, -local_learning_rate, + weights_diff_, 1, weights_, 1); + #else - // backpropagate - for (IndexType b = 0; b < batch_size_; ++b) { - const IndexType input_batch_offset = kInputDimensions * b; - const IndexType output_batch_offset = kOutputDimensions * b; - for (IndexType j = 0; j < kInputDimensions; ++j) { - double sum = 0.0; - for (IndexType i = 0; i < kOutputDimensions; ++i) { - const IndexType index = kInputDimensions * i + j; - sum += weights_[index] * gradients[output_batch_offset + i]; - } - gradients_[input_batch_offset + j] = static_cast(sum); - } - } - // update - for (IndexType i = 0; i < kOutputDimensions; ++i) { - biases_diff_[i] *= momentum_; - } - for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) { - weights_diff_[i] *= momentum_; - } - for (IndexType b = 0; b < batch_size_; ++b) { - const IndexType input_batch_offset = kInputDimensions * b; - const IndexType output_batch_offset = kOutputDimensions * b; - for (IndexType i = 0; i < kOutputDimensions; ++i) { - biases_diff_[i] += gradients[output_batch_offset + i]; - } - for (IndexType i = 0; i < kOutputDimensions; ++i) { - for (IndexType j = 0; j < kInputDimensions; ++j) { - const IndexType index = kInputDimensions * i + j; - weights_diff_[index] += gradients[output_batch_offset + i] * - batch_input_[input_batch_offset + j]; - } - } - } - for (IndexType i = 0; i < kOutputDimensions; ++i) { - biases_[i] -= local_learning_rate * biases_diff_[i]; - } - for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) { - weights_[i] -= local_learning_rate * weights_diff_[i]; - } -#endif - previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate); - } + // backpropagate + for (IndexType b = 0; b < batch_size_; ++b) { + const IndexType input_batch_offset = kInputDimensions * b; + const IndexType output_batch_offset = kOutputDimensions * b; + for (IndexType j = 0; j < kInputDimensions; ++j) { + double sum = 0.0; + for (IndexType i = 0; i < kOutputDimensions; ++i) { + const IndexType index = kInputDimensions * i + j; + sum += weights_[index] * gradients[output_batch_offset + i]; + } + gradients_[input_batch_offset + j] = static_cast(sum); + } + } - private: - // constructor - Trainer(LayerType* target_layer, FeatureTransformer* ft) : - batch_size_(0), - batch_input_(nullptr), - previous_layer_trainer_(Trainer::Create( - &target_layer->previous_layer_, ft)), - target_layer_(target_layer), - biases_(), - weights_(), - biases_diff_(), - weights_diff_(), - momentum_(0.2), - learning_rate_scale_(1.0) { - DequantizeParameters(); - } + // update + for (IndexType i = 0; i < kOutputDimensions; ++i) { + biases_diff_[i] *= momentum_; + } - // Weight saturation and parameterization - void QuantizeParameters() { - for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) { - weights_[i] = std::max(-kMaxWeightMagnitude, - std::min(+kMaxWeightMagnitude, weights_[i])); - } - for (IndexType i = 0; i < kOutputDimensions; ++i) { - target_layer_->biases_[i] = - Round(biases_[i] * kBiasScale); - } - for (IndexType i = 0; i < kOutputDimensions; ++i) { - const auto offset = kInputDimensions * i; - const auto padded_offset = LayerType::kPaddedInputDimensions * i; - for (IndexType j = 0; j < kInputDimensions; ++j) { - target_layer_->weights_[padded_offset + j] = - Round( - weights_[offset + j] * kWeightScale); - } - } - } + for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) { + weights_diff_[i] *= momentum_; + } - // read parameterized integer - void DequantizeParameters() { - for (IndexType i = 0; i < kOutputDimensions; ++i) { - biases_[i] = static_cast( - target_layer_->biases_[i] / kBiasScale); - } - for (IndexType i = 0; i < kOutputDimensions; ++i) { - const auto offset = kInputDimensions * i; - const auto padded_offset = LayerType::kPaddedInputDimensions * i; - for (IndexType j = 0; j < kInputDimensions; ++j) { - weights_[offset + j] = static_cast( - target_layer_->weights_[padded_offset + j] / kWeightScale); - } - } - std::fill(std::begin(biases_diff_), std::end(biases_diff_), - static_cast(0.0)); - std::fill(std::begin(weights_diff_), std::end(weights_diff_), - static_cast(0.0)); - } + for (IndexType b = 0; b < batch_size_; ++b) { + const IndexType input_batch_offset = kInputDimensions * b; + const IndexType output_batch_offset = kOutputDimensions * b; - // number of input/output dimensions - static constexpr IndexType kInputDimensions = LayerType::kInputDimensions; - static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions; + for (IndexType i = 0; i < kOutputDimensions; ++i) { + biases_diff_[i] += gradients[output_batch_offset + i]; + } - // If the output dimensionality is 1, the output layer - static constexpr bool kIsOutputLayer = kOutputDimensions == 1; + for (IndexType i = 0; i < kOutputDimensions; ++i) { + for (IndexType j = 0; j < kInputDimensions; ++j) { + const IndexType index = kInputDimensions * i + j; + weights_diff_[index] += gradients[output_batch_offset + i] * + batch_input_[input_batch_offset + j]; + } + } + } - // Coefficient used for parameterization - static constexpr LearnFloatType kActivationScale = - std::numeric_limits::max(); - static constexpr LearnFloatType kBiasScale = kIsOutputLayer ? - (kPonanzaConstant * FV_SCALE) : - ((1 << kWeightScaleBits) * kActivationScale); - static constexpr LearnFloatType kWeightScale = kBiasScale / kActivationScale; + for (IndexType i = 0; i < kOutputDimensions; ++i) { + biases_[i] -= local_learning_rate * biases_diff_[i]; + } - // Upper limit of absolute value of weight used to prevent overflow when parameterizing integers - static constexpr LearnFloatType kMaxWeightMagnitude = - std::numeric_limits::max() / kWeightScale; - - // number of samples in mini-batch - IndexType batch_size_; - - // Input mini batch - const LearnFloatType* batch_input_; - - // Trainer of the previous layer - const std::shared_ptr> previous_layer_trainer_; - - // layer to learn - LayerType* const target_layer_; - - // parameter - LearnFloatType biases_[kOutputDimensions]; - LearnFloatType weights_[kOutputDimensions * kInputDimensions]; - - // Buffer used for updating parameters - LearnFloatType biases_diff_[kOutputDimensions]; - LearnFloatType weights_diff_[kOutputDimensions * kInputDimensions]; - - // Forward propagation buffer - std::vector output_; - - // buffer for back propagation - std::vector gradients_; - - // hyper parameter - LearnFloatType momentum_; - LearnFloatType learning_rate_scale_; -}; - -} // namespace NNUE - -} // namespace Eval + for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) { + weights_[i] -= local_learning_rate * weights_diff_[i]; + } + +#endif + previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate); + } + + private: + // constructor + Trainer(LayerType* target_layer, FeatureTransformer* ft) : + batch_size_(0), + batch_input_(nullptr), + previous_layer_trainer_(Trainer::Create( + &target_layer->previous_layer_, ft)), + target_layer_(target_layer), + biases_(), + weights_(), + biases_diff_(), + weights_diff_(), + momentum_(0.2), + learning_rate_scale_(1.0) { + + DequantizeParameters(); + } + + // Weight saturation and parameterization + void QuantizeParameters() { + for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) { + weights_[i] = std::max(-kMaxWeightMagnitude, + std::min(+kMaxWeightMagnitude, weights_[i])); + } + + for (IndexType i = 0; i < kOutputDimensions; ++i) { + target_layer_->biases_[i] = + Round(biases_[i] * kBiasScale); + } + + for (IndexType i = 0; i < kOutputDimensions; ++i) { + const auto offset = kInputDimensions * i; + const auto padded_offset = LayerType::kPaddedInputDimensions * i; + for (IndexType j = 0; j < kInputDimensions; ++j) { + target_layer_->weights_[padded_offset + j] = + Round( + weights_[offset + j] * kWeightScale); + } + } + } + + // read parameterized integer + void DequantizeParameters() { + for (IndexType i = 0; i < kOutputDimensions; ++i) { + biases_[i] = static_cast( + target_layer_->biases_[i] / kBiasScale); + } + + for (IndexType i = 0; i < kOutputDimensions; ++i) { + const auto offset = kInputDimensions * i; + const auto padded_offset = LayerType::kPaddedInputDimensions * i; + for (IndexType j = 0; j < kInputDimensions; ++j) { + weights_[offset + j] = static_cast( + target_layer_->weights_[padded_offset + j] / kWeightScale); + } + } + + std::fill(std::begin(biases_diff_), std::end(biases_diff_), + static_cast(0.0)); + std::fill(std::begin(weights_diff_), std::end(weights_diff_), + static_cast(0.0)); + } + + // number of input/output dimensions + static constexpr IndexType kInputDimensions = LayerType::kInputDimensions; + static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions; + + // If the output dimensionality is 1, the output layer + static constexpr bool kIsOutputLayer = kOutputDimensions == 1; + + // Coefficient used for parameterization + static constexpr LearnFloatType kActivationScale = + std::numeric_limits::max(); + + static constexpr LearnFloatType kBiasScale = kIsOutputLayer ? + (kPonanzaConstant * FV_SCALE) : + ((1 << kWeightScaleBits) * kActivationScale); + + static constexpr LearnFloatType kWeightScale = kBiasScale / kActivationScale; + + // Upper limit of absolute value of weight used to prevent overflow when parameterizing integers + static constexpr LearnFloatType kMaxWeightMagnitude = + std::numeric_limits::max() / kWeightScale; + + // number of samples in mini-batch + IndexType batch_size_; + + // Input mini batch + const LearnFloatType* batch_input_; + + // Trainer of the previous layer + const std::shared_ptr> previous_layer_trainer_; + + // layer to learn + LayerType* const target_layer_; + + // parameter + LearnFloatType biases_[kOutputDimensions]; + LearnFloatType weights_[kOutputDimensions * kInputDimensions]; + + // Buffer used for updating parameters + LearnFloatType biases_diff_[kOutputDimensions]; + LearnFloatType weights_diff_[kOutputDimensions * kInputDimensions]; + + // Forward propagation buffer + std::vector output_; + + // buffer for back propagation + std::vector gradients_; + + // hyper parameter + LearnFloatType momentum_; + LearnFloatType learning_rate_scale_; + }; + +} // namespace Eval::NNUE #endif diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h index cf7a2447..902c2747 100644 --- a/src/nnue/trainer/trainer_clipped_relu.h +++ b/src/nnue/trainer/trainer_clipped_relu.h @@ -1,138 +1,142 @@ -// Specialization of NNUE evaluation function learning class template for ClippedReLU - -#ifndef _NNUE_TRAINER_CLIPPED_RELU_H_ +#ifndef _NNUE_TRAINER_CLIPPED_RELU_H_ #define _NNUE_TRAINER_CLIPPED_RELU_H_ -#include "../../learn/learn.h" -#include "../layers/clipped_relu.h" #include "trainer.h" -namespace Eval { +#include "learn/learn.h" -namespace NNUE { +#include "nnue/layers/clipped_relu.h" -// Learning: Affine transformation layer -template -class Trainer> { - private: - // Type of layer to learn - using LayerType = Layers::ClippedReLU; +// Specialization of NNUE evaluation function learning class template for ClippedReLU +namespace Eval::NNUE { - public: - // factory function - static std::shared_ptr Create( - LayerType* target_layer, FeatureTransformer* ft) { - return std::shared_ptr( - new Trainer(target_layer, ft)); - } + // Learning: Affine transformation layer + template + class Trainer> { + private: + // Type of layer to learn + using LayerType = Layers::ClippedReLU; - // Set options such as hyperparameters - void SendMessage(Message* message) { - previous_layer_trainer_->SendMessage(message); - if (ReceiveMessage("check_health", message)) { - CheckHealth(); - } - } + public: + // factory function + static std::shared_ptr Create( + LayerType* target_layer, FeatureTransformer* ft) { - // Initialize the parameters with random numbers - template - void Initialize(RNG& rng) { - previous_layer_trainer_->Initialize(rng); - } + return std::shared_ptr( + new Trainer(target_layer, ft)); + } - // forward propagation - const LearnFloatType* Propagate(const std::vector& batch) { - if (output_.size() < kOutputDimensions * batch.size()) { - output_.resize(kOutputDimensions * batch.size()); - gradients_.resize(kInputDimensions * batch.size()); - } - const auto input = previous_layer_trainer_->Propagate(batch); - batch_size_ = static_cast(batch.size()); - for (IndexType b = 0; b < batch_size_; ++b) { - const IndexType batch_offset = kOutputDimensions * b; - for (IndexType i = 0; i < kOutputDimensions; ++i) { - const IndexType index = batch_offset + i; - output_[index] = std::max(+kZero, std::min(+kOne, input[index])); - min_activations_[i] = std::min(min_activations_[i], output_[index]); - max_activations_[i] = std::max(max_activations_[i], output_[index]); - } - } - return output_.data(); - } + // Set options such as hyperparameters + void SendMessage(Message* message) { + previous_layer_trainer_->SendMessage(message); + if (ReceiveMessage("check_health", message)) { + CheckHealth(); + } + } - // backpropagation - void Backpropagate(const LearnFloatType* gradients, - LearnFloatType learning_rate) { - for (IndexType b = 0; b < batch_size_; ++b) { - const IndexType batch_offset = kOutputDimensions * b; - for (IndexType i = 0; i < kOutputDimensions; ++i) { - const IndexType index = batch_offset + i; - gradients_[index] = gradients[index] * - (output_[index] > kZero) * (output_[index] < kOne); - } - } - previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate); - } + // Initialize the parameters with random numbers + template + void Initialize(RNG& rng) { + previous_layer_trainer_->Initialize(rng); + } - private: - // constructor - Trainer(LayerType* target_layer, FeatureTransformer* ft) : - batch_size_(0), - previous_layer_trainer_(Trainer::Create( - &target_layer->previous_layer_, ft)), - target_layer_(target_layer) { - std::fill(std::begin(min_activations_), std::end(min_activations_), - std::numeric_limits::max()); - std::fill(std::begin(max_activations_), std::end(max_activations_), - std::numeric_limits::lowest()); - } + // forward propagation + const LearnFloatType* Propagate(const std::vector& batch) { + if (output_.size() < kOutputDimensions * batch.size()) { + output_.resize(kOutputDimensions * batch.size()); + gradients_.resize(kInputDimensions * batch.size()); + } - // Check if there are any problems with learning - void CheckHealth() { - const auto largest_min_activation = *std::max_element( - std::begin(min_activations_), std::end(min_activations_)); - const auto smallest_max_activation = *std::min_element( - std::begin(max_activations_), std::end(max_activations_)); - std::cout << "INFO: largest min activation = " << largest_min_activation - << ", smallest max activation = " << smallest_max_activation - << std::endl; + const auto input = previous_layer_trainer_->Propagate(batch); + batch_size_ = static_cast(batch.size()); + for (IndexType b = 0; b < batch_size_; ++b) { + const IndexType batch_offset = kOutputDimensions * b; + for (IndexType i = 0; i < kOutputDimensions; ++i) { + const IndexType index = batch_offset + i; + output_[index] = std::max(+kZero, std::min(+kOne, input[index])); + min_activations_[i] = std::min(min_activations_[i], output_[index]); + max_activations_[i] = std::max(max_activations_[i], output_[index]); + } + } - std::fill(std::begin(min_activations_), std::end(min_activations_), - std::numeric_limits::max()); - std::fill(std::begin(max_activations_), std::end(max_activations_), - std::numeric_limits::lowest()); - } + return output_.data(); + } - // number of input/output dimensions - static constexpr IndexType kInputDimensions = LayerType::kOutputDimensions; - static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions; + // backpropagation + void Backpropagate(const LearnFloatType* gradients, + LearnFloatType learning_rate) { - // LearnFloatType constant - static constexpr LearnFloatType kZero = static_cast(0.0); - static constexpr LearnFloatType kOne = static_cast(1.0); + for (IndexType b = 0; b < batch_size_; ++b) { + const IndexType batch_offset = kOutputDimensions * b; + for (IndexType i = 0; i < kOutputDimensions; ++i) { + const IndexType index = batch_offset + i; + gradients_[index] = gradients[index] * + (output_[index] > kZero) * (output_[index] < kOne); + } + } - // number of samples in mini-batch - IndexType batch_size_; + previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate); + } - // Trainer of the previous layer - const std::shared_ptr> previous_layer_trainer_; + private: + // constructor + Trainer(LayerType* target_layer, FeatureTransformer* ft) : + batch_size_(0), + previous_layer_trainer_(Trainer::Create( + &target_layer->previous_layer_, ft)), + target_layer_(target_layer) { - // layer to learn - LayerType* const target_layer_; + std::fill(std::begin(min_activations_), std::end(min_activations_), + std::numeric_limits::max()); + std::fill(std::begin(max_activations_), std::end(max_activations_), + std::numeric_limits::lowest()); + } - // Forward propagation buffer - std::vector output_; + // Check if there are any problems with learning + void CheckHealth() { + const auto largest_min_activation = *std::max_element( + std::begin(min_activations_), std::end(min_activations_)); + const auto smallest_max_activation = *std::min_element( + std::begin(max_activations_), std::end(max_activations_)); - // buffer for back propagation - std::vector gradients_; + std::cout << "INFO: largest min activation = " << largest_min_activation + << ", smallest max activation = " << smallest_max_activation + << std::endl; - // Health check statistics - LearnFloatType min_activations_[kOutputDimensions]; - LearnFloatType max_activations_[kOutputDimensions]; -}; + std::fill(std::begin(min_activations_), std::end(min_activations_), + std::numeric_limits::max()); + std::fill(std::begin(max_activations_), std::end(max_activations_), + std::numeric_limits::lowest()); + } -} // namespace NNUE + // number of input/output dimensions + static constexpr IndexType kInputDimensions = LayerType::kOutputDimensions; + static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions; -} // namespace Eval + // LearnFloatType constant + static constexpr LearnFloatType kZero = static_cast(0.0); + static constexpr LearnFloatType kOne = static_cast(1.0); + + // number of samples in mini-batch + IndexType batch_size_; + + // Trainer of the previous layer + const std::shared_ptr> previous_layer_trainer_; + + // layer to learn + LayerType* const target_layer_; + + // Forward propagation buffer + std::vector output_; + + // buffer for back propagation + std::vector gradients_; + + // Health check statistics + LearnFloatType min_activations_[kOutputDimensions]; + LearnFloatType max_activations_[kOutputDimensions]; + }; + +} // namespace Eval::NNUE #endif diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h index 225c91fc..f403e413 100644 --- a/src/nnue/trainer/trainer_feature_transformer.h +++ b/src/nnue/trainer/trainer_feature_transformer.h @@ -1,13 +1,14 @@ -// Specialization for feature transformer of learning class template of NNUE evaluation function - -#ifndef _NNUE_TRAINER_FEATURE_TRANSFORMER_H_ +#ifndef _NNUE_TRAINER_FEATURE_TRANSFORMER_H_ #define _NNUE_TRAINER_FEATURE_TRANSFORMER_H_ -#include "../../learn/learn.h" -#include "../nnue_feature_transformer.h" #include "trainer.h" + #include "features/factorizer_feature_set.h" +#include "learn/learn.h" + +#include "nnue/nnue_feature_transformer.h" + #include #include #include @@ -18,356 +19,392 @@ #include #endif -namespace Eval { +// Specialization for feature transformer of learning class template of NNUE evaluation function +namespace Eval::NNUE { -namespace NNUE { + // Learning: Input feature converter + template <> + class Trainer { + private: + // Type of layer to learn + using LayerType = FeatureTransformer; -// Learning: Input feature converter -template <> -class Trainer { - private: - // Type of layer to learn - using LayerType = FeatureTransformer; + public: + template + friend struct AlignedDeleter; - public: - template - friend struct AlignedDeleter; - template - friend std::shared_ptr MakeAlignedSharedPtr(ArgumentTypes&&... arguments); + template + friend std::shared_ptr MakeAlignedSharedPtr(ArgumentTypes&&... arguments); - // factory function - static std::shared_ptr Create(LayerType* target_layer) { - return MakeAlignedSharedPtr(target_layer); - } + // factory function + static std::shared_ptr Create(LayerType* target_layer) { + return MakeAlignedSharedPtr(target_layer); + } - // Set options such as hyperparameters - void SendMessage(Message* message) { - if (ReceiveMessage("momentum", message)) { - momentum_ = static_cast(std::stod(message->value)); - } - if (ReceiveMessage("learning_rate_scale", message)) { - learning_rate_scale_ = - static_cast(std::stod(message->value)); - } - if (ReceiveMessage("reset", message)) { - DequantizeParameters(); - } - if (ReceiveMessage("quantize_parameters", message)) { - QuantizeParameters(); - } - if (ReceiveMessage("clear_unobserved_feature_weights", message)) { - ClearUnobservedFeatureWeights(); - } - if (ReceiveMessage("check_health", message)) { - CheckHealth(); - } - } + // Set options such as hyperparameters + void SendMessage(Message* message) { + if (ReceiveMessage("momentum", message)) { + momentum_ = static_cast(std::stod(message->value)); + } - // Initialize the parameters with random numbers - template - void Initialize(RNG& rng) { - std::fill(std::begin(weights_), std::end(weights_), +kZero); - const double kSigma = 0.1 / std::sqrt(RawFeatures::kMaxActiveDimensions); - auto distribution = std::normal_distribution(0.0, kSigma); - for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) { - const auto weight = static_cast(distribution(rng)); - weights_[i] = weight; - } - for (IndexType i = 0; i < kHalfDimensions; ++i) { - biases_[i] = static_cast(0.5); - } - QuantizeParameters(); - } + if (ReceiveMessage("learning_rate_scale", message)) { + learning_rate_scale_ = + static_cast(std::stod(message->value)); + } - // forward propagation - const LearnFloatType* Propagate(const std::vector& batch) { - if (output_.size() < kOutputDimensions * batch.size()) { - output_.resize(kOutputDimensions * batch.size()); - gradients_.resize(kOutputDimensions * batch.size()); - } - batch_ = &batch; - // affine transform + if (ReceiveMessage("reset", message)) { + DequantizeParameters(); + } + + if (ReceiveMessage("quantize_parameters", message)) { + QuantizeParameters(); + } + + if (ReceiveMessage("clear_unobserved_feature_weights", message)) { + ClearUnobservedFeatureWeights(); + } + + if (ReceiveMessage("check_health", message)) { + CheckHealth(); + } + } + + // Initialize the parameters with random numbers + template + void Initialize(RNG& rng) { + std::fill(std::begin(weights_), std::end(weights_), +kZero); + + const double kSigma = 0.1 / std::sqrt(RawFeatures::kMaxActiveDimensions); + auto distribution = std::normal_distribution(0.0, kSigma); + + for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) { + const auto weight = static_cast(distribution(rng)); + weights_[i] = weight; + } + + for (IndexType i = 0; i < kHalfDimensions; ++i) { + biases_[i] = static_cast(0.5); + } + + QuantizeParameters(); + } + + // forward propagation + const LearnFloatType* Propagate(const std::vector& batch) { + if (output_.size() < kOutputDimensions * batch.size()) { + output_.resize(kOutputDimensions * batch.size()); + gradients_.resize(kOutputDimensions * batch.size()); + } + + batch_ = &batch; + // affine transform #pragma omp parallel for - for (IndexType b = 0; b < batch.size(); ++b) { - const IndexType batch_offset = kOutputDimensions * b; - for (IndexType c = 0; c < 2; ++c) { - const IndexType output_offset = batch_offset + kHalfDimensions * c; + for (IndexType b = 0; b < batch.size(); ++b) { + const IndexType batch_offset = kOutputDimensions * b; + for (IndexType c = 0; c < 2; ++c) { + const IndexType output_offset = batch_offset + kHalfDimensions * c; #if defined(USE_BLAS) - cblas_scopy(kHalfDimensions, biases_, 1, &output_[output_offset], 1); - for (const auto& feature : batch[b].training_features[c]) { - const IndexType weights_offset = kHalfDimensions * feature.GetIndex(); - cblas_saxpy(kHalfDimensions, (float)feature.GetCount(), - &weights_[weights_offset], 1, &output_[output_offset], 1); - } + cblas_scopy(kHalfDimensions, biases_, 1, &output_[output_offset], 1); + for (const auto& feature : batch[b].training_features[c]) { + const IndexType weights_offset = kHalfDimensions * feature.GetIndex(); + cblas_saxpy(kHalfDimensions, (float)feature.GetCount(), + &weights_[weights_offset], 1, &output_[output_offset], 1); + } #else - for (IndexType i = 0; i < kHalfDimensions; ++i) { - output_[output_offset + i] = biases_[i]; - } - for (const auto& feature : batch[b].training_features[c]) { - const IndexType weights_offset = kHalfDimensions * feature.GetIndex(); - for (IndexType i = 0; i < kHalfDimensions; ++i) { - output_[output_offset + i] += - feature.GetCount() * weights_[weights_offset + i]; - } - } + for (IndexType i = 0; i < kHalfDimensions; ++i) { + output_[output_offset + i] = biases_[i]; + } + for (const auto& feature : batch[b].training_features[c]) { + const IndexType weights_offset = kHalfDimensions * feature.GetIndex(); + for (IndexType i = 0; i < kHalfDimensions; ++i) { + output_[output_offset + i] += + feature.GetCount() * weights_[weights_offset + i]; + } + } #endif - } - } - // clipped ReLU - for (IndexType b = 0; b < batch.size(); ++b) { - const IndexType batch_offset = kOutputDimensions * b; - for (IndexType i = 0; i < kOutputDimensions; ++i) { - const IndexType index = batch_offset + i; - min_pre_activation_ = std::min(min_pre_activation_, output_[index]); - max_pre_activation_ = std::max(max_pre_activation_, output_[index]); - output_[index] = std::max(+kZero, std::min(+kOne, output_[index])); - const IndexType t = i % kHalfDimensions; - min_activations_[t] = std::min(min_activations_[t], output_[index]); - max_activations_[t] = std::max(max_activations_[t], output_[index]); - } - } - return output_.data(); - } + } + } - // backpropagation - void Backpropagate(const LearnFloatType* gradients, - LearnFloatType learning_rate) { - const LearnFloatType local_learning_rate = - learning_rate * learning_rate_scale_; - for (IndexType b = 0; b < batch_->size(); ++b) { - const IndexType batch_offset = kOutputDimensions * b; - for (IndexType i = 0; i < kOutputDimensions; ++i) { - const IndexType index = batch_offset + i; - gradients_[index] = gradients[index] * - ((output_[index] > kZero) * (output_[index] < kOne)); - } - } - // Since the weight matrix updates only the columns corresponding to the features that appeared in the input, - // Correct the learning rate and adjust the scale without using momentum - const LearnFloatType effective_learning_rate = - static_cast(local_learning_rate / (1.0 - momentum_)); + // clipped ReLU + for (IndexType b = 0; b < batch.size(); ++b) { + const IndexType batch_offset = kOutputDimensions * b; + for (IndexType i = 0; i < kOutputDimensions; ++i) { + const IndexType index = batch_offset + i; + min_pre_activation_ = std::min(min_pre_activation_, output_[index]); + max_pre_activation_ = std::max(max_pre_activation_, output_[index]); + output_[index] = std::max(+kZero, std::min(+kOne, output_[index])); + const IndexType t = i % kHalfDimensions; + min_activations_[t] = std::min(min_activations_[t], output_[index]); + max_activations_[t] = std::max(max_activations_[t], output_[index]); + } + } + + return output_.data(); + } + + // backpropagation + void Backpropagate(const LearnFloatType* gradients, + LearnFloatType learning_rate) { + + const LearnFloatType local_learning_rate = + learning_rate * learning_rate_scale_; + + for (IndexType b = 0; b < batch_->size(); ++b) { + const IndexType batch_offset = kOutputDimensions * b; + for (IndexType i = 0; i < kOutputDimensions; ++i) { + const IndexType index = batch_offset + i; + gradients_[index] = gradients[index] * + ((output_[index] > kZero) * (output_[index] < kOne)); + } + } + + // Since the weight matrix updates only the columns corresponding to the features that appeared in the input, + // Correct the learning rate and adjust the scale without using momentum + const LearnFloatType effective_learning_rate = + static_cast(local_learning_rate / (1.0 - momentum_)); #if defined(USE_BLAS) - cblas_sscal(kHalfDimensions, momentum_, biases_diff_, 1); - for (IndexType b = 0; b < batch_->size(); ++b) { - const IndexType batch_offset = kOutputDimensions * b; - for (IndexType c = 0; c < 2; ++c) { - const IndexType output_offset = batch_offset + kHalfDimensions * c; - cblas_saxpy(kHalfDimensions, 1.0, - &gradients_[output_offset], 1, biases_diff_, 1); - } - } - cblas_saxpy(kHalfDimensions, -local_learning_rate, - biases_diff_, 1, biases_, 1); + cblas_sscal(kHalfDimensions, momentum_, biases_diff_, 1); + for (IndexType b = 0; b < batch_->size(); ++b) { + const IndexType batch_offset = kOutputDimensions * b; + for (IndexType c = 0; c < 2; ++c) { + const IndexType output_offset = batch_offset + kHalfDimensions * c; + cblas_saxpy(kHalfDimensions, 1.0, + &gradients_[output_offset], 1, biases_diff_, 1); + } + } + + cblas_saxpy(kHalfDimensions, -local_learning_rate, + biases_diff_, 1, biases_, 1); + #pragma omp parallel - { + { #if defined(_OPENMP) - const IndexType num_threads = omp_get_num_threads(); - const IndexType thread_index = omp_get_thread_num(); + const IndexType num_threads = omp_get_num_threads(); + const IndexType thread_index = omp_get_thread_num(); #endif - for (IndexType b = 0; b < batch_->size(); ++b) { - const IndexType batch_offset = kOutputDimensions * b; - for (IndexType c = 0; c < 2; ++c) { - const IndexType output_offset = batch_offset + kHalfDimensions * c; - for (const auto& feature : (*batch_)[b].training_features[c]) { + for (IndexType b = 0; b < batch_->size(); ++b) { + const IndexType batch_offset = kOutputDimensions * b; + for (IndexType c = 0; c < 2; ++c) { + const IndexType output_offset = batch_offset + kHalfDimensions * c; + for (const auto& feature : (*batch_)[b].training_features[c]) { #if defined(_OPENMP) - if (feature.GetIndex() % num_threads != thread_index) continue; + if (feature.GetIndex() % num_threads != thread_index) + continue; #endif - const IndexType weights_offset = - kHalfDimensions * feature.GetIndex(); - const auto scale = static_cast( - effective_learning_rate / feature.GetCount()); - cblas_saxpy(kHalfDimensions, -scale, - &gradients_[output_offset], 1, - &weights_[weights_offset], 1); - } - } - } - } + const IndexType weights_offset = + kHalfDimensions * feature.GetIndex(); + const auto scale = static_cast( + effective_learning_rate / feature.GetCount()); + + cblas_saxpy(kHalfDimensions, -scale, + &gradients_[output_offset], 1, + &weights_[weights_offset], 1); + } + } + } + } + #else - for (IndexType i = 0; i < kHalfDimensions; ++i) { - biases_diff_[i] *= momentum_; - } - for (IndexType b = 0; b < batch_->size(); ++b) { - const IndexType batch_offset = kOutputDimensions * b; - for (IndexType c = 0; c < 2; ++c) { - const IndexType output_offset = batch_offset + kHalfDimensions * c; - for (IndexType i = 0; i < kHalfDimensions; ++i) { - biases_diff_[i] += gradients_[output_offset + i]; - } - } - } - for (IndexType i = 0; i < kHalfDimensions; ++i) { - biases_[i] -= local_learning_rate * biases_diff_[i]; - } - for (IndexType b = 0; b < batch_->size(); ++b) { - const IndexType batch_offset = kOutputDimensions * b; - for (IndexType c = 0; c < 2; ++c) { - const IndexType output_offset = batch_offset + kHalfDimensions * c; - for (const auto& feature : (*batch_)[b].training_features[c]) { - const IndexType weights_offset = kHalfDimensions * feature.GetIndex(); - const auto scale = static_cast( - effective_learning_rate / feature.GetCount()); - for (IndexType i = 0; i < kHalfDimensions; ++i) { - weights_[weights_offset + i] -= - scale * gradients_[output_offset + i]; - } - } - } - } + for (IndexType i = 0; i < kHalfDimensions; ++i) { + biases_diff_[i] *= momentum_; + } + + for (IndexType b = 0; b < batch_->size(); ++b) { + const IndexType batch_offset = kOutputDimensions * b; + for (IndexType c = 0; c < 2; ++c) { + const IndexType output_offset = batch_offset + kHalfDimensions * c; + for (IndexType i = 0; i < kHalfDimensions; ++i) { + biases_diff_[i] += gradients_[output_offset + i]; + } + } + } + + for (IndexType i = 0; i < kHalfDimensions; ++i) { + biases_[i] -= local_learning_rate * biases_diff_[i]; + } + + for (IndexType b = 0; b < batch_->size(); ++b) { + const IndexType batch_offset = kOutputDimensions * b; + for (IndexType c = 0; c < 2; ++c) { + const IndexType output_offset = batch_offset + kHalfDimensions * c; + for (const auto& feature : (*batch_)[b].training_features[c]) { + const IndexType weights_offset = kHalfDimensions * feature.GetIndex(); + const auto scale = static_cast( + effective_learning_rate / feature.GetCount()); + + for (IndexType i = 0; i < kHalfDimensions; ++i) { + weights_[weights_offset + i] -= + scale * gradients_[output_offset + i]; + } + } + } + } + #endif - for (IndexType b = 0; b < batch_->size(); ++b) { - for (IndexType c = 0; c < 2; ++c) { - for (const auto& feature : (*batch_)[b].training_features[c]) { - observed_features.set(feature.GetIndex()); + for (IndexType b = 0; b < batch_->size(); ++b) { + for (IndexType c = 0; c < 2; ++c) { + for (const auto& feature : (*batch_)[b].training_features[c]) { + observed_features.set(feature.GetIndex()); + } + } + } } - } - } - } - private: - // constructor - Trainer(LayerType* target_layer) : - batch_(nullptr), - target_layer_(target_layer), - biases_(), - weights_(), - biases_diff_(), - momentum_(0.2), - learning_rate_scale_(1.0) { - min_pre_activation_ = std::numeric_limits::max(); - max_pre_activation_ = std::numeric_limits::lowest(); - std::fill(std::begin(min_activations_), std::end(min_activations_), - std::numeric_limits::max()); - std::fill(std::begin(max_activations_), std::end(max_activations_), - std::numeric_limits::lowest()); - DequantizeParameters(); - } + private: + // constructor + Trainer(LayerType* target_layer) : + batch_(nullptr), + target_layer_(target_layer), + biases_(), + weights_(), + biases_diff_(), + momentum_(0.2), + learning_rate_scale_(1.0) { + + min_pre_activation_ = std::numeric_limits::max(); + max_pre_activation_ = std::numeric_limits::lowest(); + + std::fill(std::begin(min_activations_), std::end(min_activations_), + std::numeric_limits::max()); + std::fill(std::begin(max_activations_), std::end(max_activations_), + std::numeric_limits::lowest()); + + DequantizeParameters(); + } + + // Weight saturation and parameterization + void QuantizeParameters() { + for (IndexType i = 0; i < kHalfDimensions; ++i) { + target_layer_->biases_[i] = + Round(biases_[i] * kBiasScale); + } + + std::vector training_features; - // Weight saturation and parameterization - void QuantizeParameters() { - for (IndexType i = 0; i < kHalfDimensions; ++i) { - target_layer_->biases_[i] = - Round(biases_[i] * kBiasScale); - } - std::vector training_features; #pragma omp parallel for private(training_features) - for (IndexType j = 0; j < RawFeatures::kDimensions; ++j) { - training_features.clear(); - Features::Factorizer::AppendTrainingFeatures( - j, &training_features); - for (IndexType i = 0; i < kHalfDimensions; ++i) { - double sum = 0.0; - for (const auto& feature : training_features) { - sum += weights_[kHalfDimensions * feature.GetIndex() + i]; + for (IndexType j = 0; j < RawFeatures::kDimensions; ++j) { + training_features.clear(); + Features::Factorizer::AppendTrainingFeatures( + j, &training_features); + + for (IndexType i = 0; i < kHalfDimensions; ++i) { + double sum = 0.0; + for (const auto& feature : training_features) { + sum += weights_[kHalfDimensions * feature.GetIndex() + i]; + } + + target_layer_->weights_[kHalfDimensions * j + i] = + Round(sum * kWeightScale); + } + } } - target_layer_->weights_[kHalfDimensions * j + i] = - Round(sum * kWeightScale); - } - } - } - // read parameterized integer - void DequantizeParameters() { - for (IndexType i = 0; i < kHalfDimensions; ++i) { - biases_[i] = static_cast( - target_layer_->biases_[i] / kBiasScale); - } - std::fill(std::begin(weights_), std::end(weights_), +kZero); - for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) { - weights_[i] = static_cast( - target_layer_->weights_[i] / kWeightScale); - } - std::fill(std::begin(biases_diff_), std::end(biases_diff_), +kZero); - } + // read parameterized integer + void DequantizeParameters() { + for (IndexType i = 0; i < kHalfDimensions; ++i) { + biases_[i] = static_cast( + target_layer_->biases_[i] / kBiasScale); + } - // Set the weight corresponding to the feature that does not appear in the learning data to 0 - void ClearUnobservedFeatureWeights() { - for (IndexType i = 0; i < kInputDimensions; ++i) { - if (!observed_features.test(i)) { - std::fill(std::begin(weights_) + kHalfDimensions * i, - std::begin(weights_) + kHalfDimensions * (i + 1), +kZero); - } - } - QuantizeParameters(); - } + std::fill(std::begin(weights_), std::end(weights_), +kZero); - // Check if there are any problems with learning - void CheckHealth() { - std::cout << "INFO: observed " << observed_features.count() - << " (out of " << kInputDimensions << ") features" << std::endl; + for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) { + weights_[i] = static_cast( + target_layer_->weights_[i] / kWeightScale); + } - constexpr LearnFloatType kPreActivationLimit = - std::numeric_limits::max() / - kWeightScale; - std::cout << "INFO: (min, max) of pre-activations = " - << min_pre_activation_ << ", " - << max_pre_activation_ << " (limit = " - << kPreActivationLimit << ")" << std::endl; + std::fill(std::begin(biases_diff_), std::end(biases_diff_), +kZero); + } - const auto largest_min_activation = *std::max_element( - std::begin(min_activations_), std::end(min_activations_)); - const auto smallest_max_activation = *std::min_element( - std::begin(max_activations_), std::end(max_activations_)); - std::cout << "INFO: largest min activation = " << largest_min_activation - << ", smallest max activation = " << smallest_max_activation - << std::endl; + // Set the weight corresponding to the feature that does not appear in the learning data to 0 + void ClearUnobservedFeatureWeights() { + for (IndexType i = 0; i < kInputDimensions; ++i) { + if (!observed_features.test(i)) { + std::fill(std::begin(weights_) + kHalfDimensions * i, + std::begin(weights_) + kHalfDimensions * (i + 1), +kZero); + } + } - std::fill(std::begin(min_activations_), std::end(min_activations_), - std::numeric_limits::max()); - std::fill(std::begin(max_activations_), std::end(max_activations_), - std::numeric_limits::lowest()); - } + QuantizeParameters(); + } - // number of input/output dimensions - static constexpr IndexType kInputDimensions = - Features::Factorizer::GetDimensions(); - static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions; - static constexpr IndexType kHalfDimensions = LayerType::kHalfDimensions; + // Check if there are any problems with learning + void CheckHealth() { + std::cout << "INFO: observed " << observed_features.count() + << " (out of " << kInputDimensions << ") features" << std::endl; - // Coefficient used for parameterization - static constexpr LearnFloatType kActivationScale = - std::numeric_limits::max(); - static constexpr LearnFloatType kBiasScale = kActivationScale; - static constexpr LearnFloatType kWeightScale = kActivationScale; + constexpr LearnFloatType kPreActivationLimit = + std::numeric_limits::max() / + kWeightScale; - // LearnFloatType constant - static constexpr LearnFloatType kZero = static_cast(0.0); - static constexpr LearnFloatType kOne = static_cast(1.0); + std::cout << "INFO: (min, max) of pre-activations = " + << min_pre_activation_ << ", " + << max_pre_activation_ << " (limit = " + << kPreActivationLimit << ")" << std::endl; - // mini batch - const std::vector* batch_; + const auto largest_min_activation = *std::max_element( + std::begin(min_activations_), std::end(min_activations_)); + const auto smallest_max_activation = *std::min_element( + std::begin(max_activations_), std::end(max_activations_)); - // layer to learn - LayerType* const target_layer_; + std::cout << "INFO: largest min activation = " << largest_min_activation + << ", smallest max activation = " << smallest_max_activation + << std::endl; - // parameter - alignas(kCacheLineSize) LearnFloatType biases_[kHalfDimensions]; - alignas(kCacheLineSize) - LearnFloatType weights_[kHalfDimensions * kInputDimensions]; + std::fill(std::begin(min_activations_), std::end(min_activations_), + std::numeric_limits::max()); + std::fill(std::begin(max_activations_), std::end(max_activations_), + std::numeric_limits::lowest()); + } - // Buffer used for updating parameters - LearnFloatType biases_diff_[kHalfDimensions]; - std::vector gradients_; + // number of input/output dimensions + static constexpr IndexType kInputDimensions = + Features::Factorizer::GetDimensions(); + static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions; + static constexpr IndexType kHalfDimensions = LayerType::kHalfDimensions; - // Forward propagation buffer - std::vector output_; + // Coefficient used for parameterization + static constexpr LearnFloatType kActivationScale = + std::numeric_limits::max(); + static constexpr LearnFloatType kBiasScale = kActivationScale; + static constexpr LearnFloatType kWeightScale = kActivationScale; - // Features that appeared in the training data - std::bitset observed_features; + // LearnFloatType constant + static constexpr LearnFloatType kZero = static_cast(0.0); + static constexpr LearnFloatType kOne = static_cast(1.0); - // hyper parameter - LearnFloatType momentum_; - LearnFloatType learning_rate_scale_; + // mini batch + const std::vector* batch_; - // Health check statistics - LearnFloatType min_pre_activation_; - LearnFloatType max_pre_activation_; - LearnFloatType min_activations_[kHalfDimensions]; - LearnFloatType max_activations_[kHalfDimensions]; -}; + // layer to learn + LayerType* const target_layer_; -} // namespace NNUE + // parameter + alignas(kCacheLineSize) LearnFloatType biases_[kHalfDimensions]; + alignas(kCacheLineSize) + LearnFloatType weights_[kHalfDimensions * kInputDimensions]; -} // namespace Eval + // Buffer used for updating parameters + LearnFloatType biases_diff_[kHalfDimensions]; + std::vector gradients_; + + // Forward propagation buffer + std::vector output_; + + // Features that appeared in the training data + std::bitset observed_features; + + // hyper parameter + LearnFloatType momentum_; + LearnFloatType learning_rate_scale_; + + // Health check statistics + LearnFloatType min_pre_activation_; + LearnFloatType max_pre_activation_; + LearnFloatType min_activations_[kHalfDimensions]; + LearnFloatType max_activations_[kHalfDimensions]; + }; + +} // namespace Eval::NNUE #endif diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h index e2cd0c25..45dcbacc 100644 --- a/src/nnue/trainer/trainer_input_slice.h +++ b/src/nnue/trainer/trainer_input_slice.h @@ -1,247 +1,267 @@ -// Specialization of NNUE evaluation function learning class template for InputSlice - -#ifndef _NNUE_TRAINER_INPUT_SLICE_H_ +#ifndef _NNUE_TRAINER_INPUT_SLICE_H_ #define _NNUE_TRAINER_INPUT_SLICE_H_ -#include "../../learn/learn.h" -#include "../layers/input_slice.h" #include "trainer.h" -namespace Eval { +#include "learn/learn.h" -namespace NNUE { +#include "nnue/layers/input_slice.h" -// Learning: Input layer -class SharedInputTrainer { - public: - // factory function - static std::shared_ptr Create( - FeatureTransformer* ft) { - static std::shared_ptr instance; - if (!instance) { - instance.reset(new SharedInputTrainer(ft)); - } - ++instance->num_referrers_; - return instance; - } +// Specialization of NNUE evaluation function learning class template for InputSlice +namespace Eval::NNUE { - // Set options such as hyperparameters - void SendMessage(Message* message) { - if (num_calls_ == 0) { - current_operation_ = Operation::kSendMessage; - feature_transformer_trainer_->SendMessage(message); - } - assert(current_operation_ == Operation::kSendMessage); - if (++num_calls_ == num_referrers_) { - num_calls_ = 0; - current_operation_ = Operation::kNone; - } - } + // Learning: Input layer + class SharedInputTrainer { + public: + // factory function + static std::shared_ptr Create( + FeatureTransformer* ft) { - // Initialize the parameters with random numbers - template - void Initialize(RNG& rng) { - if (num_calls_ == 0) { - current_operation_ = Operation::kInitialize; - feature_transformer_trainer_->Initialize(rng); - } - assert(current_operation_ == Operation::kInitialize); - if (++num_calls_ == num_referrers_) { - num_calls_ = 0; - current_operation_ = Operation::kNone; - } - } + static std::shared_ptr instance; - // forward propagation - const LearnFloatType* Propagate(const std::vector& batch) { - if (gradients_.size() < kInputDimensions * batch.size()) { - gradients_.resize(kInputDimensions * batch.size()); - } - batch_size_ = static_cast(batch.size()); - if (num_calls_ == 0) { - current_operation_ = Operation::kPropagate; - output_ = feature_transformer_trainer_->Propagate(batch); - } - assert(current_operation_ == Operation::kPropagate); - if (++num_calls_ == num_referrers_) { - num_calls_ = 0; - current_operation_ = Operation::kNone; - } - return output_; - } + if (!instance) { + instance.reset(new SharedInputTrainer(ft)); + } - // backpropagation - void Backpropagate(const LearnFloatType* gradients, - LearnFloatType learning_rate) { - if (num_referrers_ == 1) { - feature_transformer_trainer_->Backpropagate(gradients, learning_rate); - return; - } - if (num_calls_ == 0) { - current_operation_ = Operation::kBackPropagate; - for (IndexType b = 0; b < batch_size_; ++b) { - const IndexType batch_offset = kInputDimensions * b; - for (IndexType i = 0; i < kInputDimensions; ++i) { - gradients_[batch_offset + i] = static_cast(0.0); + ++instance->num_referrers_; + + return instance; } - } - } - assert(current_operation_ == Operation::kBackPropagate); - for (IndexType b = 0; b < batch_size_; ++b) { - const IndexType batch_offset = kInputDimensions * b; - for (IndexType i = 0; i < kInputDimensions; ++i) { - gradients_[batch_offset + i] += gradients[batch_offset + i]; - } - } - if (++num_calls_ == num_referrers_) { - feature_transformer_trainer_->Backpropagate( - gradients_.data(), learning_rate); - num_calls_ = 0; - current_operation_ = Operation::kNone; - } - } - private: - // constructor - SharedInputTrainer(FeatureTransformer* ft) : - batch_size_(0), - num_referrers_(0), - num_calls_(0), - current_operation_(Operation::kNone), - feature_transformer_trainer_(Trainer::Create( - ft)), - output_(nullptr) { - } + // Set options such as hyperparameters + void SendMessage(Message* message) { + if (num_calls_ == 0) { + current_operation_ = Operation::kSendMessage; + feature_transformer_trainer_->SendMessage(message); + } - // number of input/output dimensions - static constexpr IndexType kInputDimensions = - FeatureTransformer::kOutputDimensions; + assert(current_operation_ == Operation::kSendMessage); - // type of processing - enum class Operation { - kNone, - kSendMessage, - kInitialize, - kPropagate, - kBackPropagate, - }; + if (++num_calls_ == num_referrers_) { + num_calls_ = 0; + current_operation_ = Operation::kNone; + } + } - // number of samples in mini-batch - IndexType batch_size_; + // Initialize the parameters with random numbers + template + void Initialize(RNG& rng) { + if (num_calls_ == 0) { + current_operation_ = Operation::kInitialize; + feature_transformer_trainer_->Initialize(rng); + } - // number of layers sharing this layer as input - std::uint32_t num_referrers_; + assert(current_operation_ == Operation::kInitialize); - // Number of times the current process has been called - std::uint32_t num_calls_; + if (++num_calls_ == num_referrers_) { + num_calls_ = 0; + current_operation_ = Operation::kNone; + } + } - // current processing type - Operation current_operation_; + // forward propagation + const LearnFloatType* Propagate(const std::vector& batch) { + if (gradients_.size() < kInputDimensions * batch.size()) { + gradients_.resize(kInputDimensions * batch.size()); + } - // Trainer of input feature converter - const std::shared_ptr> - feature_transformer_trainer_; + batch_size_ = static_cast(batch.size()); - // pointer to output shared for forward propagation - const LearnFloatType* output_; + if (num_calls_ == 0) { + current_operation_ = Operation::kPropagate; + output_ = feature_transformer_trainer_->Propagate(batch); + } - // buffer for back propagation - std::vector gradients_; -}; + assert(current_operation_ == Operation::kPropagate); -// Learning: Input layer -template -class Trainer> { - private: - // Type of layer to learn - using LayerType = Layers::InputSlice; + if (++num_calls_ == num_referrers_) { + num_calls_ = 0; + current_operation_ = Operation::kNone; + } - public: - // factory function - static std::shared_ptr Create( - LayerType* /*target_layer*/, FeatureTransformer* ft) { - return std::shared_ptr(new Trainer(ft)); - } + return output_; + } - // Set options such as hyperparameters - void SendMessage(Message* message) { - shared_input_trainer_->SendMessage(message); - } + // backpropagation + void Backpropagate(const LearnFloatType* gradients, + LearnFloatType learning_rate) { - // Initialize the parameters with random numbers - template - void Initialize(RNG& rng) { - shared_input_trainer_->Initialize(rng); - } + if (num_referrers_ == 1) { + feature_transformer_trainer_->Backpropagate(gradients, learning_rate); + return; + } - // forward propagation - const LearnFloatType* Propagate(const std::vector& batch) { - if (output_.size() < kOutputDimensions * batch.size()) { - output_.resize(kOutputDimensions * batch.size()); - gradients_.resize(kInputDimensions * batch.size()); - } - batch_size_ = static_cast(batch.size()); - const auto input = shared_input_trainer_->Propagate(batch); - for (IndexType b = 0; b < batch_size_; ++b) { - const IndexType input_offset = kInputDimensions * b; - const IndexType output_offset = kOutputDimensions * b; + if (num_calls_ == 0) { + current_operation_ = Operation::kBackPropagate; + for (IndexType b = 0; b < batch_size_; ++b) { + const IndexType batch_offset = kInputDimensions * b; + for (IndexType i = 0; i < kInputDimensions; ++i) { + gradients_[batch_offset + i] = static_cast(0.0); + } + } + } + + assert(current_operation_ == Operation::kBackPropagate); + + for (IndexType b = 0; b < batch_size_; ++b) { + const IndexType batch_offset = kInputDimensions * b; + for (IndexType i = 0; i < kInputDimensions; ++i) { + gradients_[batch_offset + i] += gradients[batch_offset + i]; + } + } + + if (++num_calls_ == num_referrers_) { + feature_transformer_trainer_->Backpropagate( + gradients_.data(), learning_rate); + num_calls_ = 0; + current_operation_ = Operation::kNone; + } + } + + private: + // constructor + SharedInputTrainer(FeatureTransformer* ft) : + batch_size_(0), + num_referrers_(0), + num_calls_(0), + current_operation_(Operation::kNone), + feature_transformer_trainer_(Trainer::Create( + ft)), + output_(nullptr) { + } + + // number of input/output dimensions + static constexpr IndexType kInputDimensions = + FeatureTransformer::kOutputDimensions; + + // type of processing + enum class Operation { + kNone, + kSendMessage, + kInitialize, + kPropagate, + kBackPropagate, + }; + + // number of samples in mini-batch + IndexType batch_size_; + + // number of layers sharing this layer as input + std::uint32_t num_referrers_; + + // Number of times the current process has been called + std::uint32_t num_calls_; + + // current processing type + Operation current_operation_; + + // Trainer of input feature converter + const std::shared_ptr> + feature_transformer_trainer_; + + // pointer to output shared for forward propagation + const LearnFloatType* output_; + + // buffer for back propagation + std::vector gradients_; + }; + + // Learning: Input layer + template + class Trainer> { + private: + // Type of layer to learn + using LayerType = Layers::InputSlice; + + public: + // factory function + static std::shared_ptr Create( + LayerType* /*target_layer*/, FeatureTransformer* ft) { + + return std::shared_ptr(new Trainer(ft)); + } + + // Set options such as hyperparameters + void SendMessage(Message* message) { + shared_input_trainer_->SendMessage(message); + } + + // Initialize the parameters with random numbers + template + void Initialize(RNG& rng) { + shared_input_trainer_->Initialize(rng); + } + + // forward propagation + const LearnFloatType* Propagate(const std::vector& batch) { + if (output_.size() < kOutputDimensions * batch.size()) { + output_.resize(kOutputDimensions * batch.size()); + gradients_.resize(kInputDimensions * batch.size()); + } + + batch_size_ = static_cast(batch.size()); + + const auto input = shared_input_trainer_->Propagate(batch); + for (IndexType b = 0; b < batch_size_; ++b) { + const IndexType input_offset = kInputDimensions * b; + const IndexType output_offset = kOutputDimensions * b; #if defined(USE_BLAS) - cblas_scopy(kOutputDimensions, &input[input_offset + Offset], 1, - &output_[output_offset], 1); + cblas_scopy(kOutputDimensions, &input[input_offset + Offset], 1, + &output_[output_offset], 1); #else - for (IndexType i = 0; i < kOutputDimensions; ++i) { - output_[output_offset + i] = input[input_offset + Offset + i]; - } + for (IndexType i = 0; i < kOutputDimensions; ++i) { + output_[output_offset + i] = input[input_offset + Offset + i]; + } #endif - } - return output_.data(); - } + } - // backpropagation - void Backpropagate(const LearnFloatType* gradients, - LearnFloatType learning_rate) { - for (IndexType b = 0; b < batch_size_; ++b) { - const IndexType input_offset = kInputDimensions * b; - const IndexType output_offset = kOutputDimensions * b; - for (IndexType i = 0; i < kInputDimensions; ++i) { - if ((int)i < (int)Offset || i >= Offset + kOutputDimensions) { - gradients_[input_offset + i] = static_cast(0.0); - } else { - gradients_[input_offset + i] = gradients[output_offset + i - Offset]; + return output_.data(); } - } - } - shared_input_trainer_->Backpropagate(gradients_.data(), learning_rate); - } - private: - // constructor - Trainer(FeatureTransformer* ft): - batch_size_(0), - shared_input_trainer_(SharedInputTrainer::Create(ft)) { - } + // backpropagation + void Backpropagate(const LearnFloatType* gradients, + LearnFloatType learning_rate) { - // number of input/output dimensions - static constexpr IndexType kInputDimensions = - FeatureTransformer::kOutputDimensions; - static constexpr IndexType kOutputDimensions = OutputDimensions; - static_assert(Offset + kOutputDimensions <= kInputDimensions, ""); + for (IndexType b = 0; b < batch_size_; ++b) { + const IndexType input_offset = kInputDimensions * b; + const IndexType output_offset = kOutputDimensions * b; + for (IndexType i = 0; i < kInputDimensions; ++i) { + if ((int)i < (int)Offset || i >= Offset + kOutputDimensions) { + gradients_[input_offset + i] = static_cast(0.0); + } else { + gradients_[input_offset + i] = gradients[output_offset + i - Offset]; + } + } + } + shared_input_trainer_->Backpropagate(gradients_.data(), learning_rate); + } - // number of samples in mini-batch - IndexType batch_size_; + private: + // constructor + Trainer(FeatureTransformer* ft): + batch_size_(0), + shared_input_trainer_(SharedInputTrainer::Create(ft)) { + } - // Trainer of shared input layer - const std::shared_ptr shared_input_trainer_; + // number of input/output dimensions + static constexpr IndexType kInputDimensions = + FeatureTransformer::kOutputDimensions; + static constexpr IndexType kOutputDimensions = OutputDimensions; + static_assert(Offset + kOutputDimensions <= kInputDimensions, ""); - // Forward propagation buffer - std::vector output_; + // number of samples in mini-batch + IndexType batch_size_; - // buffer for back propagation - std::vector gradients_; -}; + // Trainer of shared input layer + const std::shared_ptr shared_input_trainer_; -} // namespace NNUE + // Forward propagation buffer + std::vector output_; -} // namespace Eval + // buffer for back propagation + std::vector gradients_; + }; + +} // namespace Eval::NNUE #endif diff --git a/src/nnue/trainer/trainer_sum.h b/src/nnue/trainer/trainer_sum.h index 65a0b681..9904704b 100644 --- a/src/nnue/trainer/trainer_sum.h +++ b/src/nnue/trainer/trainer_sum.h @@ -1,186 +1,190 @@ -// Specialization of NNUE evaluation function learning class template for Sum - -#ifndef _NNUE_TRAINER_SUM_H_ +#ifndef _NNUE_TRAINER_SUM_H_ #define _NNUE_TRAINER_SUM_H_ #include "../../learn/learn.h" #include "../layers/sum.h" #include "trainer.h" -namespace Eval { +// Specialization of NNUE evaluation function learning class template for Sum +namespace Eval::NNUE { -namespace NNUE { + // Learning: A layer that sums the outputs of multiple layers + template + class Trainer> : + Trainer> { + private: + // Type of layer to learn + using LayerType = Layers::Sum; + using Tail = Trainer>; -// Learning: A layer that sums the outputs of multiple layers -template -class Trainer> : - Trainer> { - private: - // Type of layer to learn - using LayerType = Layers::Sum; - using Tail = Trainer>; + public: + // factory function + static std::shared_ptr Create( + LayerType* target_layer, FeatureTransformer* ft) { - public: - // factory function - static std::shared_ptr Create( - LayerType* target_layer, FeatureTransformer* ft) { - return std::shared_ptr( - new Trainer(target_layer, ft)); - } + return std::shared_ptr( + new Trainer(target_layer, ft)); + } - // Set options such as hyperparameters - void SendMessage(Message* message) { - // The results of other member functions do not depend on the processing order, so - // Tail is processed first for the purpose of simplifying the implementation, but - // SendMessage processes Head first to make it easier to understand subscript correspondence - previous_layer_trainer_->SendMessage(message); - Tail::SendMessage(message); - } + // Set options such as hyperparameters + void SendMessage(Message* message) { + // The results of other member functions do not depend on the processing order, so + // Tail is processed first for the purpose of simplifying the implementation, but + // SendMessage processes Head first to make it easier to understand subscript correspondence + previous_layer_trainer_->SendMessage(message); + Tail::SendMessage(message); + } - // Initialize the parameters with random numbers - template - void Initialize(RNG& rng) { - Tail::Initialize(rng); - previous_layer_trainer_->Initialize(rng); - } + // Initialize the parameters with random numbers + template + void Initialize(RNG& rng) { + Tail::Initialize(rng); + previous_layer_trainer_->Initialize(rng); + } + + // forward propagation + /*const*/ LearnFloatType* Propagate(const std::vector& batch) { + batch_size_ = static_cast(batch.size()); + auto output = Tail::Propagate(batch); + const auto head_output = previous_layer_trainer_->Propagate(batch); - // forward propagation - /*const*/ LearnFloatType* Propagate(const std::vector& batch) { - batch_size_ = static_cast(batch.size()); - auto output = Tail::Propagate(batch); - const auto head_output = previous_layer_trainer_->Propagate(batch); #if defined(USE_BLAS) - cblas_saxpy(kOutputDimensions * batch_size_, 1.0, - head_output, 1, output, 1); + cblas_saxpy(kOutputDimensions * batch_size_, 1.0, + head_output, 1, output, 1); #else - for (IndexType b = 0; b < batch_size_; ++b) { - const IndexType batch_offset = kOutputDimensions * b; - for (IndexType i = 0; i < kOutputDimensions; ++i) { - output[batch_offset + i] += head_output[batch_offset + i]; - } - } + for (IndexType b = 0; b < batch_size_; ++b) { + const IndexType batch_offset = kOutputDimensions * b; + for (IndexType i = 0; i < kOutputDimensions; ++i) { + output[batch_offset + i] += head_output[batch_offset + i]; + } + } + #endif - return output; - } + return output; + } - // backpropagation - void Backpropagate(const LearnFloatType* gradients, - LearnFloatType learning_rate) { - Tail::Backpropagate(gradients, learning_rate); - previous_layer_trainer_->Backpropagate(gradients, learning_rate); - } + // backpropagation + void Backpropagate(const LearnFloatType* gradients, + LearnFloatType learning_rate) { - private: - // constructor - Trainer(LayerType* target_layer, FeatureTransformer* ft): - Tail(target_layer, ft), - batch_size_(0), - previous_layer_trainer_(Trainer::Create( - &target_layer->previous_layer_, ft)), - target_layer_(target_layer) { - } + Tail::Backpropagate(gradients, learning_rate); + previous_layer_trainer_->Backpropagate(gradients, learning_rate); + } - // number of input/output dimensions - static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions; + private: + // constructor + Trainer(LayerType* target_layer, FeatureTransformer* ft): + Tail(target_layer, ft), + batch_size_(0), + previous_layer_trainer_(Trainer::Create( + &target_layer->previous_layer_, ft)), + target_layer_(target_layer) { + } - // make subclass friend - template - friend class Trainer; + // number of input/output dimensions + static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions; - // number of samples in mini-batch - IndexType batch_size_; + // make subclass friend + template + friend class Trainer; - // Trainer of the previous layer - const std::shared_ptr> previous_layer_trainer_; + // number of samples in mini-batch + IndexType batch_size_; - // layer to learn - LayerType* const target_layer_; -}; + // Trainer of the previous layer + const std::shared_ptr> previous_layer_trainer_; + + // layer to learn + LayerType* const target_layer_; + }; -// Learning: Layer that takes the sum of the outputs of multiple layers (when there is one template argument) -template -class Trainer> { - private: - // Type of layer to learn - using LayerType = Layers::Sum; + // Learning: Layer that takes the sum of the outputs of multiple layers (when there is one template argument) + template + class Trainer> { + private: + // Type of layer to learn + using LayerType = Layers::Sum; - public: - // factory function - static std::shared_ptr Create( - LayerType* target_layer, FeatureTransformer* ft) { - return std::shared_ptr( - new Trainer(target_layer, ft)); - } + public: + // factory function + static std::shared_ptr Create( + LayerType* target_layer, FeatureTransformer* ft) { - // Set options such as hyperparameters - void SendMessage(Message* message) { - previous_layer_trainer_->SendMessage(message); - } + return std::shared_ptr( + new Trainer(target_layer, ft)); + } - // Initialize the parameters with random numbers - template - void Initialize(RNG& rng) { - previous_layer_trainer_->Initialize(rng); - } + // Set options such as hyperparameters + void SendMessage(Message* message) { + previous_layer_trainer_->SendMessage(message); + } + + // Initialize the parameters with random numbers + template + void Initialize(RNG& rng) { + previous_layer_trainer_->Initialize(rng); + } + + // forward propagation + /*const*/ LearnFloatType* Propagate(const std::vector& batch) { + if (output_.size() < kOutputDimensions * batch.size()) { + output_.resize(kOutputDimensions * batch.size()); + } + + batch_size_ = static_cast(batch.size()); + const auto output = previous_layer_trainer_->Propagate(batch); - // forward propagation - /*const*/ LearnFloatType* Propagate(const std::vector& batch) { - if (output_.size() < kOutputDimensions * batch.size()) { - output_.resize(kOutputDimensions * batch.size()); - } - batch_size_ = static_cast(batch.size()); - const auto output = previous_layer_trainer_->Propagate(batch); #if defined(USE_BLAS) - cblas_scopy(kOutputDimensions * batch_size_, output, 1, &output_[0], 1); + cblas_scopy(kOutputDimensions * batch_size_, output, 1, &output_[0], 1); #else - for (IndexType b = 0; b < batch_size_; ++b) { - const IndexType batch_offset = kOutputDimensions * b; - for (IndexType i = 0; i < kOutputDimensions; ++i) { - output_[batch_offset + i] = output[batch_offset + i]; - } - } -#endif - return output_.data(); - } - - // backpropagation - void Backpropagate(const LearnFloatType* gradients, - LearnFloatType learning_rate) { - previous_layer_trainer_->Backpropagate(gradients, learning_rate); - } - - private: - // constructor - Trainer(LayerType* target_layer, FeatureTransformer* ft) : - batch_size_(0), - previous_layer_trainer_(Trainer::Create( - &target_layer->previous_layer_, ft)), - target_layer_(target_layer) { - } - - // number of input/output dimensions - static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions; - - // make subclass friend - template - friend class Trainer; - - // number of samples in mini-batch - IndexType batch_size_; - - // Trainer of the previous layer - const std::shared_ptr> previous_layer_trainer_; - - // layer to learn - LayerType* const target_layer_; - - // Forward propagation buffer - std::vector output_; -}; - -} // namespace NNUE - -} // namespace Eval + for (IndexType b = 0; b < batch_size_; ++b) { + const IndexType batch_offset = kOutputDimensions * b; + for (IndexType i = 0; i < kOutputDimensions; ++i) { + output_[batch_offset + i] = output[batch_offset + i]; + } + } + +#endif + return output_.data(); + } + + // backpropagation + void Backpropagate(const LearnFloatType* gradients, + LearnFloatType learning_rate) { + + previous_layer_trainer_->Backpropagate(gradients, learning_rate); + } + + private: + // constructor + Trainer(LayerType* target_layer, FeatureTransformer* ft) : + batch_size_(0), + previous_layer_trainer_(Trainer::Create( + &target_layer->previous_layer_, ft)), + target_layer_(target_layer) { + } + + // number of input/output dimensions + static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions; + + // make subclass friend + template + friend class Trainer; + + // number of samples in mini-batch + IndexType batch_size_; + + // Trainer of the previous layer + const std::shared_ptr> previous_layer_trainer_; + + // layer to learn + LayerType* const target_layer_; + + // Forward propagation buffer + std::vector output_; + }; + +} // namespace Eval::NNUE #endif