diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index 5d2f29c9..610805ca 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -3,6 +3,8 @@
 #include "trainer.h"

+#include "extra/stockfish_blas.h"
+
 #include "learn/learn.h"

 #include "nnue/layers/affine_transform.h"

@@ -98,32 +100,46 @@ namespace Eval::NNUE {
             batch_size_ = static_cast<IndexType>(batch.size());
             batch_input_ = previous_layer_trainer_->propagate(thread_pool, batch);
+
 #if defined(USE_BLAS)
+
             for (IndexType b = 0; b < batch_size_; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
-                cblas_scopy(kOutputDimensions, biases_, 1, &output_[batch_offset], 1);
+                cblas_scopy(
+                    kOutputDimensions, biases_, 1, &output_[batch_offset], 1
+                );
             }

-            cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
-                kOutputDimensions, batch_size_, kInputDimensions, 1.0,
-                weights_, kInputDimensions,
-                batch_input_, kInputDimensions,
-                1.0, &output_[0], kOutputDimensions);
+            cblas_sgemm(
+                CblasColMajor, CblasTrans, CblasNoTrans,
+                kOutputDimensions, batch_size_, kInputDimensions,
+                1.0,
+                weights_, kInputDimensions,
+                batch_input_, kInputDimensions,
+                1.0,
+                &output_[0], kOutputDimensions
+            );

 #else

-            for (IndexType b = 0; b < batch_size_; ++b) {
-                const IndexType input_batch_offset = kInputDimensions * b;
-                const IndexType output_batch_offset = kOutputDimensions * b;
-                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                    double sum = biases_[i];
-                    for (IndexType j = 0; j < kInputDimensions; ++j) {
-                        const IndexType index = kInputDimensions * i + j;
-                        sum += weights_[index] * batch_input_[input_batch_offset + j];
-                    }
-                    output_[output_batch_offset + i] = static_cast<LearnFloatType>(sum);
-                }
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                Blas::scopy(
+                    thread_pool,
+                    kOutputDimensions, biases_, 1, &output_[batch_offset], 1
+                );
             }
+            Blas::sgemm(
+                thread_pool,
+                Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans,
+                kOutputDimensions, batch_size_, kInputDimensions,
+                1.0,
+                weights_, kInputDimensions,
+                batch_input_, kInputDimensions,
+                1.0,
+                &output_[0], kOutputDimensions
+            );
+
 #endif

             return output_.data();
         }
@@ -137,67 +153,77 @@ namespace Eval::NNUE {
                 learning_rate * learning_rate_scale_;

 #if defined(USE_BLAS)
-            // backpropagate
-            cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
-                kInputDimensions, batch_size_, kOutputDimensions, 1.0,
-                weights_, kInputDimensions,
-                gradients, kOutputDimensions,
-                0.0, &gradients_[0], kInputDimensions);
+
+            cblas_sgemm(
+                CblasColMajor, CblasNoTrans, CblasNoTrans,
+                kInputDimensions, batch_size_, kOutputDimensions,
+                1.0,
+                weights_, kInputDimensions,
+                gradients, kOutputDimensions,
+                0.0,
+                &gradients_[0], kInputDimensions
+            );

             // update
-            cblas_sscal(kOutputDimensions, momentum_, biases_diff_, 1);
+            cblas_sscal(
+                kOutputDimensions, momentum_, biases_diff_, 1
+            );
+
             for (IndexType b = 0; b < batch_size_; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
-                cblas_saxpy(kOutputDimensions, 1.0,
-                    &gradients[batch_offset], 1, biases_diff_, 1);
+                cblas_saxpy(
+                    kOutputDimensions, 1.0,
+                    &gradients[batch_offset], 1, biases_diff_, 1
+                );
+            }
+
+            cblas_sgemm(
+                CblasRowMajor, CblasTrans, CblasNoTrans,
+                kOutputDimensions, kInputDimensions, batch_size_,
+                1.0,
+                gradients, kOutputDimensions,
+                batch_input_, kInputDimensions,
+                momentum_,
+                weights_diff_, kInputDimensions
+            );
+
+#else
+
+            // backpropagate
+            Blas::sgemm(
+                thread_pool,
+                Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::NoTrans, Blas::MatrixTranspose::NoTrans,
+                kInputDimensions, batch_size_, kOutputDimensions,
+                1.0,
+                weights_, kInputDimensions,
+                gradients, kOutputDimensions,
+                0.0,
+                &gradients_[0], kInputDimensions
+            );
+
+            Blas::sscal(
+                thread_pool,
+                kOutputDimensions, momentum_, biases_diff_, 1
+            );
+
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                Blas::saxpy(thread_pool, kOutputDimensions, 1.0, &gradients[batch_offset], 1, biases_diff_, 1);
             }
-            cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
-                kOutputDimensions, kInputDimensions, batch_size_, 1.0,
-                gradients, kOutputDimensions,
-                batch_input_, kInputDimensions,
-                momentum_, weights_diff_, kInputDimensions);
+            Blas::sgemm(
+                thread_pool,
+                Blas::MatrixLayout::RowMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans,
+                kOutputDimensions, kInputDimensions, batch_size_,
+                1.0,
+                gradients, kOutputDimensions,
+                batch_input_, kInputDimensions,
+                momentum_,
+                weights_diff_, kInputDimensions
+            );

-#else
-            // backpropagate
-            for (IndexType b = 0; b < batch_size_; ++b) {
-                const IndexType input_batch_offset = kInputDimensions * b;
-                const IndexType output_batch_offset = kOutputDimensions * b;
-                for (IndexType j = 0; j < kInputDimensions; ++j) {
-                    double sum = 0.0;
-                    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                        const IndexType index = kInputDimensions * i + j;
-                        sum += weights_[index] * gradients[output_batch_offset + i];
-                    }
-                    gradients_[input_batch_offset + j] = static_cast<LearnFloatType>(sum);
-                }
-            }
-
-            // update
-            for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                biases_diff_[i] *= momentum_;
-            }
-
-            for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-                weights_diff_[i] *= momentum_;
-            }
-
-            for (IndexType b = 0; b < batch_size_; ++b) {
-                const IndexType input_batch_offset = kInputDimensions * b;
-                const IndexType output_batch_offset = kOutputDimensions * b;
-
-                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                    biases_diff_[i] += gradients[output_batch_offset + i];
-                }
-
-                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                    for (IndexType j = 0; j < kInputDimensions; ++j) {
-                        const IndexType index = kInputDimensions * i + j;
-                        weights_diff_[index] += gradients[output_batch_offset + i] *
-                            batch_input_[input_batch_offset + j];
-                    }
-                }
-            }
 #endif

             for (IndexType i = 0; i < kOutputDimensions; ++i) {
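Note: extra/stockfish_blas.h is new and is not itself part of this diff. Judging from the call sites above, it exposes a CBLAS-like interface that additionally takes the ThreadPool as its first argument so the non-BLAS fallback can be parallelized; with beta = 1.0, the propagate sgemm accumulates on top of the biases written by scopy, so each output column ends up as W * x_b + biases. A sketch of the assumed declarations (hypothetical reconstruction from the call sites, not the actual header):

    class ThreadPool;  // Stockfish's thread pool

    namespace Blas {

        enum struct MatrixLayout { RowMajor, ColMajor };
        enum struct MatrixTranspose { NoTrans, Trans };

        // y <- x (strided copy)
        void scopy(ThreadPool& thread_pool, int n,
                   const float* x, int incx, float* y, int incy);

        // x <- alpha * x
        void sscal(ThreadPool& thread_pool, int n, float alpha, float* x, int incx);

        // y <- alpha * x + y
        void saxpy(ThreadPool& thread_pool, int n, float alpha,
                   const float* x, int incx, float* y, int incy);

        // C <- alpha * op(A) * op(B) + beta * C, with CBLAS layout semantics
        void sgemm(ThreadPool& thread_pool,
                   MatrixLayout layout,
                   MatrixTranspose trans_a, MatrixTranspose trans_b,
                   int m, int n, int k,
                   float alpha, const float* a, int lda,
                   const float* b, int ldb,
                   float beta, float* c, int ldc);

    }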
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index a778f956..8be584e8 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -3,6 +3,8 @@
 #include "trainer.h"

+#include "extra/stockfish_blas.h"
+
 #include "features/factorizer_feature_set.h"

 #include "learn/learn.h"

@@ -107,24 +109,36 @@ namespace Eval::NNUE {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType c = 0; c < 2; ++c) {
                     const IndexType output_offset = batch_offset + kHalfDimensions * c;
+
 #if defined(USE_BLAS)
-                    cblas_scopy(kHalfDimensions, biases_, 1, &output_[output_offset], 1);
+
+                    cblas_scopy(
+                        kHalfDimensions, biases_, 1, &output_[output_offset], 1
+                    );
+
                     for (const auto& feature : batch[b].training_features[c]) {
                         const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                        cblas_saxpy(kHalfDimensions, (float)feature.get_count(),
-                            &weights_[weights_offset], 1, &output_[output_offset], 1);
+                        cblas_saxpy(
+                            kHalfDimensions, (float)feature.get_count(),
+                            &weights_[weights_offset], 1, &output_[output_offset], 1
+                        );
                     }
+
 #else
-                    for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                        output_[output_offset + i] = biases_[i];
-                    }
+
+                    Blas::scopy(
+                        thread_pool,
+                        kHalfDimensions, biases_, 1, &output_[output_offset], 1
+                    );

                     for (const auto& feature : batch[b].training_features[c]) {
                         const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                        for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                            output_[output_offset + i] +=
-                                feature.get_count() * weights_[weights_offset + i];
-                        }
+                        Blas::saxpy(
+                            thread_pool,
+                            kHalfDimensions, (float)feature.get_count(),
+                            &weights_[weights_offset], 1, &output_[output_offset], 1
+                        );
                     }
+
 #endif
                 }
             }
@@ -171,19 +185,27 @@ namespace Eval::NNUE {
             // Correct the learning rate and adjust the scale without using momentum
             const LearnFloatType effective_learning_rate =
                 static_cast<LearnFloatType>(local_learning_rate / (1.0 - momentum_));
+
 #if defined(USE_BLAS)
-            cblas_sscal(kHalfDimensions, momentum_, biases_diff_, 1);
+
+            cblas_sscal(
+                kHalfDimensions, momentum_, biases_diff_, 1
+            );

             for (IndexType b = 0; b < batch_->size(); ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType c = 0; c < 2; ++c) {
                     const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                    cblas_saxpy(kHalfDimensions, 1.0,
-                        &gradients_[output_offset], 1, biases_diff_, 1);
+                    cblas_saxpy(
+                        kHalfDimensions, 1.0,
+                        &gradients_[output_offset], 1, biases_diff_, 1
+                    );
                 }
             }

-            cblas_saxpy(kHalfDimensions, -local_learning_rate,
-                biases_diff_, 1, biases_, 1);
+            cblas_saxpy(
+                kHalfDimensions, -local_learning_rate,
+                biases_diff_, 1, biases_, 1
+            );

 #pragma omp parallel
             {
@@ -205,45 +227,67 @@ namespace Eval::NNUE {
                         const auto scale = static_cast<LearnFloatType>(
                             effective_learning_rate / feature.get_count());
-                        cblas_saxpy(kHalfDimensions, -scale,
-                            &gradients_[output_offset], 1,
-                            &weights_[weights_offset], 1);
+                        cblas_saxpy(
+                            kHalfDimensions, -scale,
+                            &gradients_[output_offset], 1,
+                            &weights_[weights_offset], 1
+                        );
                     }
                 }
             }
         }

 #else
-            for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                biases_diff_[i] *= momentum_;
-            }
+
+            Blas::sscal(
+                thread_pool,
+                kHalfDimensions, momentum_, biases_diff_, 1
+            );

             for (IndexType b = 0; b < batch_->size(); ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType c = 0; c < 2; ++c) {
                     const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                    for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                        biases_diff_[i] += gradients_[output_offset + i];
-                    }
+                    Blas::saxpy(
+                        thread_pool,
+                        kHalfDimensions, 1.0,
+                        &gradients_[output_offset], 1, biases_diff_, 1
+                    );
                 }
             }

-            for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                biases_[i] -= local_learning_rate * biases_diff_[i];
-            }
+            Blas::saxpy(
+                thread_pool,
+                kHalfDimensions, -local_learning_rate,
+                biases_diff_, 1, biases_, 1
+            );

-            for (IndexType b = 0; b < batch_->size(); ++b) {
-                const IndexType batch_offset = kOutputDimensions * b;
-                for (IndexType c = 0; c < 2; ++c) {
-                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                    for (const auto& feature : (*batch_)[b].training_features[c]) {
-                        const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                        const auto scale = static_cast<LearnFloatType>(
-                            effective_learning_rate / feature.get_count());
+#pragma omp parallel
+            {
+#if defined(_OPENMP)
+                const IndexType num_threads = omp_get_num_threads();
+                const IndexType thread_index = omp_get_thread_num();
+#endif
+                for (IndexType b = 0; b < batch_->size(); ++b) {
+                    const IndexType batch_offset = kOutputDimensions * b;
+                    for (IndexType c = 0; c < 2; ++c) {
+                        const IndexType output_offset =
+                            batch_offset + kHalfDimensions * c;
+                        for (const auto& feature : (*batch_)[b].training_features[c]) {
+#if defined(_OPENMP)
+                            if (feature.get_index() % num_threads != thread_index)
+                                continue;
+#endif
+                            const IndexType weights_offset =
+                                kHalfDimensions * feature.get_index();
+                            const auto scale = static_cast<LearnFloatType>(
+                                effective_learning_rate / feature.get_count());

-                        for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                            weights_[weights_offset + i] -=
-                                scale * gradients_[output_offset + i];
+                            Blas::saxpy(
+                                thread_pool,
+                                kHalfDimensions, -scale,
+                                &gradients_[output_offset], 1,
+                                &weights_[weights_offset], 1
+                            );
                         }
                     }
                 }
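The rewritten fallback weight update above appears to mirror the OpenMP sharding scheme already used in the USE_BLAS branch: each thread only applies updates whose feature.get_index() % num_threads equals its own thread_index. Since every feature index owns a disjoint kHalfDimensions-wide slice of weights_, no two threads ever write the same element and no locking is needed. A minimal standalone sketch of the idea (names hypothetical, not trainer code):

    #include <cstddef>
    #include <utility>
    #include <vector>
    #if defined(_OPENMP)
    #include <omp.h>
    #endif

    // Apply (row, delta) updates to a row-major matrix. Rows are sharded by
    // row % num_threads, so each row is written by exactly one thread and
    // no synchronization is required.
    void sharded_update(std::vector<float>& weights, std::size_t row_width,
                        const std::vector<std::pair<std::size_t, float>>& updates) {
    #pragma omp parallel
        {
    #if defined(_OPENMP)
            const std::size_t num_threads = omp_get_num_threads();
            const std::size_t thread_index = omp_get_thread_num();
    #else
            const std::size_t num_threads = 1;
            const std::size_t thread_index = 0;
    #endif
            for (const auto& update : updates) {
                if (update.first % num_threads != thread_index)
                    continue;  // this row belongs to another thread
                for (std::size_t i = 0; i < row_width; ++i)
                    weights[update.first * row_width + i] += update.second;
            }
        }
    }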
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index 4bb38104..03e9fec0 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -3,6 +3,8 @@
 #include "trainer.h"

+#include "extra/stockfish_blas.h"
+
 #include "learn/learn.h"

 #include "nnue/layers/input_slice.h"

@@ -208,13 +210,21 @@ namespace Eval::NNUE {
             for (IndexType b = 0; b < batch_size_; ++b) {
                 const IndexType input_offset = kInputDimensions * b;
                 const IndexType output_offset = kOutputDimensions * b;
+
 #if defined(USE_BLAS)
-                cblas_scopy(kOutputDimensions, &input[input_offset + Offset], 1,
-                    &output_[output_offset], 1);
+
+                cblas_scopy(
+                    kOutputDimensions, &input[input_offset + Offset], 1,
+                    &output_[output_offset], 1
+                );
 #else
-                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                    output_[output_offset + i] = input[input_offset + Offset + i];
-                }
+
+                Blas::scopy(
+                    thread_pool,
+                    kOutputDimensions, &input[input_offset + Offset], 1,
+                    &output_[output_offset], 1
+                );
+
 #endif
             }
diff --git a/src/nnue/trainer/trainer_sum.h b/src/nnue/trainer/trainer_sum.h
index 6defb95f..88ff302c 100644
--- a/src/nnue/trainer/trainer_sum.h
+++ b/src/nnue/trainer/trainer_sum.h
@@ -3,6 +3,8 @@
 #include "trainer.h"

+#include "extra/stockfish_blas.h"
+
 #include "learn/learn.h"

 #include "nnue/layers/sum.h"

@@ -53,15 +55,19 @@ namespace Eval::NNUE {
             const auto head_output = previous_layer_trainer_->propagate(thread_pool, batch);

 #if defined(USE_BLAS)
-            cblas_saxpy(kOutputDimensions * batch_size_, 1.0,
-                head_output, 1, output, 1);
+
+            cblas_saxpy(
+                kOutputDimensions * batch_size_, 1.0,
+                head_output, 1, output, 1
+            );
+
 #else
-            for (IndexType b = 0; b < batch_size_; ++b) {
-                const IndexType batch_offset = kOutputDimensions * b;
-                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                    output[batch_offset + i] += head_output[batch_offset + i];
-                }
-            }
+
+            Blas::saxpy(
+                thread_pool,
+                kOutputDimensions * batch_size_, 1.0,
+                head_output, 1, output, 1
+            );
 #endif

             return output;
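A note on the last two files: in trainer_sum.h the whole batch folds into a single saxpy of length kOutputDimensions * batch_size_ because output and head_output are contiguous, whereas trainer_input_slice.h still issues one scopy per batch entry, since each slice starts at input_offset + Offset and the source rows are therefore not contiguous. The removed loops spell out the semantics the Blas calls must reproduce:

    // trainer_input_slice.h: per-entry strided copy (one scopy per b)
    for (IndexType b = 0; b < batch_size_; ++b)
        for (IndexType i = 0; i < kOutputDimensions; ++i)
            output_[kOutputDimensions * b + i] = input[kInputDimensions * b + Offset + i];

    // trainer_sum.h: one contiguous accumulation (a single saxpy)
    for (IndexType i = 0; i < kOutputDimensions * batch_size_; ++i)
        output[i] += head_output[i];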