Replace non-blas parts of trainers with our own blas-like routines.
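For orientation, the call sites in the diff below imply roughly the following interface for the new routines in extra/stockfish_blas.h. This is a single-threaded sketch reconstructed from the diff, not the actual header: the real routines presumably use the thread_pool argument to split work across threads, and the exact signatures may differ.

    #include <cstddef>

    namespace Blas {

        enum struct MatrixLayout { ColMajor, RowMajor };
        enum struct MatrixTranspose { NoTrans, Trans };

        struct ThreadPool; // stand-in for the trainer's thread pool type

        // y[i*incy] = x[i*incx]
        inline void scopy(ThreadPool&, int n, const float* x, int incx, float* y, int incy) {
            for (int i = 0; i < n; ++i)
                y[i * incy] = x[i * incx];
        }

        // y[i*incy] += alpha * x[i*incx]
        inline void saxpy(ThreadPool&, int n, float alpha, const float* x, int incx, float* y, int incy) {
            for (int i = 0; i < n; ++i)
                y[i * incy] += alpha * x[i * incx];
        }

        // x[i*incx] *= alpha
        inline void sscal(ThreadPool&, int n, float alpha, float* x, int incx) {
            for (int i = 0; i < n; ++i)
                x[i * incx] *= alpha;
        }

        // C = alpha * op(A) * op(B) + beta * C, following the CBLAS parameter convention.
        inline void sgemm(ThreadPool&, MatrixLayout layout,
                          MatrixTranspose transa, MatrixTranspose transb,
                          int m, int n, int k, float alpha,
                          const float* a, int lda,
                          const float* b, int ldb,
                          float beta, float* c, int ldc) {
            // Element (r, col) of a matrix with leading dimension ld.
            auto idx = [layout](int ld, int r, int col) {
                return layout == MatrixLayout::ColMajor ? r + col * ld : r * ld + col;
            };
            for (int i = 0; i < m; ++i)
                for (int j = 0; j < n; ++j) {
                    float sum = 0.0f;
                    for (int p = 0; p < k; ++p) {
                        const float av = transa == MatrixTranspose::NoTrans
                                       ? a[idx(lda, i, p)] : a[idx(lda, p, i)];
                        const float bv = transb == MatrixTranspose::NoTrans
                                       ? b[idx(ldb, p, j)] : b[idx(ldb, j, p)];
                        sum += av * bv;
                    }
                    c[idx(ldc, i, j)] = alpha * sum + beta * c[idx(ldc, i, j)];
                }
        }
    }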
@@ -3,6 +3,8 @@
 
 #include "trainer.h"
 
+#include "extra/stockfish_blas.h"
+
 #include "learn/learn.h"
 
 #include "nnue/layers/affine_transform.h"
@@ -98,32 +100,46 @@ namespace Eval::NNUE {
 
         batch_size_ = static_cast<IndexType>(batch.size());
         batch_input_ = previous_layer_trainer_->propagate(thread_pool, batch);
 
 #if defined(USE_BLAS)
         for (IndexType b = 0; b < batch_size_; ++b) {
             const IndexType batch_offset = kOutputDimensions * b;
-            cblas_scopy(kOutputDimensions, biases_, 1, &output_[batch_offset], 1);
+            cblas_scopy(
+                kOutputDimensions, biases_, 1, &output_[batch_offset], 1
+            );
         }
 
-        cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
-                    kOutputDimensions, batch_size_, kInputDimensions, 1.0,
-                    weights_, kInputDimensions,
-                    batch_input_, kInputDimensions,
-                    1.0, &output_[0], kOutputDimensions);
+        cblas_sgemm(
+            CblasColMajor, CblasTrans, CblasNoTrans,
+            kOutputDimensions, batch_size_, kInputDimensions,
+            1.0,
+            weights_, kInputDimensions,
+            batch_input_, kInputDimensions,
+            1.0,
+            &output_[0], kOutputDimensions
+        );
 #else
-        for (IndexType b = 0; b < batch_size_; ++b) {
-            const IndexType input_batch_offset = kInputDimensions * b;
-            const IndexType output_batch_offset = kOutputDimensions * b;
-            for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                double sum = biases_[i];
-                for (IndexType j = 0; j < kInputDimensions; ++j) {
-                    const IndexType index = kInputDimensions * i + j;
-                    sum += weights_[index] * batch_input_[input_batch_offset + j];
-                }
-                output_[output_batch_offset + i] = static_cast<LearnFloatType>(sum);
-            }
-        }
+        for (IndexType b = 0; b < batch_size_; ++b) {
+            const IndexType batch_offset = kOutputDimensions * b;
+            Blas::scopy(
+                thread_pool,
+                kOutputDimensions, biases_, 1, &output_[batch_offset], 1
+            );
+        }
+
+        Blas::sgemm(
+            thread_pool,
+            Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans,
+            kOutputDimensions, batch_size_, kInputDimensions,
+            1.0,
+            weights_, kInputDimensions,
+            batch_input_, kInputDimensions,
+            1.0,
+            &output_[0], kOutputDimensions
+        );
 #endif
         return output_.data();
     }
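Why these sgemm parameters give the affine forward pass: with ColMajor layout each batch element is one column, lda = ldb = kInputDimensions, the Trans flag reads weights_ (stored row-per-output-neuron) as the transposed operand, and beta = 1.0 accumulates on top of the biases copied in just above. A reference version with the same semantics, written for this note as a hypothetical helper, not part of the patch:

    #include <cstddef>

    // output_[kOut * b + o] = biases_[o] + sum_i weights_[kIn * o + i] * input[kIn * b + i],
    // which is exactly the naive loop the non-BLAS branch used before this change.
    void naive_affine_forward(std::size_t kOut, std::size_t kIn, std::size_t batch,
                              const float* weights, const float* biases,
                              const float* input, float* output) {
        for (std::size_t b = 0; b < batch; ++b)
            for (std::size_t o = 0; o < kOut; ++o) {
                float sum = biases[o];
                for (std::size_t i = 0; i < kIn; ++i)
                    sum += weights[kIn * o + i] * input[kIn * b + i];
                output[kOut * b + o] = sum;
            }
    }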
@@ -137,67 +153,77 @@ namespace Eval::NNUE {
             learning_rate * learning_rate_scale_;
 
 #if defined(USE_BLAS)
         // backpropagate
-        cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
-                    kInputDimensions, batch_size_, kOutputDimensions, 1.0,
-                    weights_, kInputDimensions,
-                    gradients, kOutputDimensions,
-                    0.0, &gradients_[0], kInputDimensions);
+        cblas_sgemm(
+            CblasColMajor, CblasNoTrans, CblasNoTrans,
+            kInputDimensions, batch_size_, kOutputDimensions,
+            1.0,
+            weights_, kInputDimensions,
+            gradients, kOutputDimensions,
+            0.0,
+            &gradients_[0], kInputDimensions
+        );
 
         // update
-        cblas_sscal(kOutputDimensions, momentum_, biases_diff_, 1);
+        cblas_sscal(
+            kOutputDimensions, momentum_, biases_diff_, 1
+        );
 
         for (IndexType b = 0; b < batch_size_; ++b) {
             const IndexType batch_offset = kOutputDimensions * b;
-            cblas_saxpy(kOutputDimensions, 1.0,
-                        &gradients[batch_offset], 1, biases_diff_, 1);
+            cblas_saxpy(
+                kOutputDimensions, 1.0,
+                &gradients[batch_offset], 1, biases_diff_, 1
+            );
         }
 
-        cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
-                    kOutputDimensions, kInputDimensions, batch_size_, 1.0,
-                    gradients, kOutputDimensions,
-                    batch_input_, kInputDimensions,
-                    momentum_, weights_diff_, kInputDimensions);
+        cblas_sgemm(
+            CblasRowMajor, CblasTrans, CblasNoTrans,
+            kOutputDimensions, kInputDimensions, batch_size_,
+            1.0,
+            gradients, kOutputDimensions,
+            batch_input_, kInputDimensions,
+            momentum_,
+            weights_diff_, kInputDimensions
+        );
 #else
         // backpropagate
-        for (IndexType b = 0; b < batch_size_; ++b) {
-            const IndexType input_batch_offset = kInputDimensions * b;
-            const IndexType output_batch_offset = kOutputDimensions * b;
-            for (IndexType j = 0; j < kInputDimensions; ++j) {
-                double sum = 0.0;
-                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                    const IndexType index = kInputDimensions * i + j;
-                    sum += weights_[index] * gradients[output_batch_offset + i];
-                }
-                gradients_[input_batch_offset + j] = static_cast<LearnFloatType>(sum);
-            }
-        }
+        Blas::sgemm(
+            thread_pool,
+            Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::NoTrans, Blas::MatrixTranspose::NoTrans,
+            kInputDimensions, batch_size_, kOutputDimensions,
+            1.0,
+            weights_, kInputDimensions,
+            gradients, kOutputDimensions,
+            0.0,
+            &gradients_[0], kInputDimensions
+        );
 
         // update
-        for (IndexType i = 0; i < kOutputDimensions; ++i) {
-            biases_diff_[i] *= momentum_;
-        }
-
-        for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-            weights_diff_[i] *= momentum_;
-        }
-
-        for (IndexType b = 0; b < batch_size_; ++b) {
-            const IndexType input_batch_offset = kInputDimensions * b;
-            const IndexType output_batch_offset = kOutputDimensions * b;
-
-            for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                biases_diff_[i] += gradients[output_batch_offset + i];
-            }
-
-            for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                for (IndexType j = 0; j < kInputDimensions; ++j) {
-                    const IndexType index = kInputDimensions * i + j;
-                    weights_diff_[index] += gradients[output_batch_offset + i] *
-                                            batch_input_[input_batch_offset + j];
-                }
-            }
-        }
+        Blas::sscal(
+            thread_pool,
+            kOutputDimensions, momentum_, biases_diff_, 1
+        );
+
+        for (IndexType b = 0; b < batch_size_; ++b) {
+            const IndexType batch_offset = kOutputDimensions * b;
+            Blas::saxpy(thread_pool, kOutputDimensions, 1.0,
+                        &gradients[batch_offset], 1, biases_diff_, 1);
+        }
+
+        Blas::sgemm(
+            thread_pool,
+            Blas::MatrixLayout::RowMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans,
+            kOutputDimensions, kInputDimensions, batch_size_,
+            1.0,
+            gradients, kOutputDimensions,
+            batch_input_, kInputDimensions,
+            momentum_,
+            weights_diff_, kInputDimensions
+        );
 #endif
 
         for (IndexType i = 0; i < kOutputDimensions; ++i) {
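Note how the second sgemm folds the momentum step into the GEMM itself: beta = momentum_ computes weights_diff_ = momentum_ * weights_diff_ + gradients * batch_input_^T in one call, which is why the new non-BLAS branch needs no explicit weights_diff_[i] *= momentum_ loop. In reference form, as a hypothetical helper with assumed names:

    #include <cstddef>

    // weights_diff = momentum * weights_diff + gradients * input^T as one fused GEMM
    // (the RowMajor/Trans call above: gradients is batch x kOut, input is batch x kIn).
    void naive_weight_grad_update(std::size_t kOut, std::size_t kIn, std::size_t batch,
                                  float momentum, const float* gradients,
                                  const float* input, float* weights_diff) {
        for (std::size_t o = 0; o < kOut; ++o)
            for (std::size_t i = 0; i < kIn; ++i) {
                float sum = 0.0f;
                for (std::size_t b = 0; b < batch; ++b)
                    sum += gradients[kOut * b + o] * input[kIn * b + i];
                weights_diff[kIn * o + i] = momentum * weights_diff[kIn * o + i] + sum;
            }
    }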
@@ -3,6 +3,8 @@
 
 #include "trainer.h"
 
+#include "extra/stockfish_blas.h"
+
 #include "features/factorizer_feature_set.h"
 
 #include "learn/learn.h"
@@ -107,24 +109,36 @@ namespace Eval::NNUE {
             const IndexType batch_offset = kOutputDimensions * b;
             for (IndexType c = 0; c < 2; ++c) {
                 const IndexType output_offset = batch_offset + kHalfDimensions * c;
 
 #if defined(USE_BLAS)
-                cblas_scopy(kHalfDimensions, biases_, 1, &output_[output_offset], 1);
+                cblas_scopy(
+                    kHalfDimensions, biases_, 1, &output_[output_offset], 1
+                );
                 for (const auto& feature : batch[b].training_features[c]) {
                     const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                    cblas_saxpy(kHalfDimensions, (float)feature.get_count(),
-                                &weights_[weights_offset], 1, &output_[output_offset], 1);
+                    cblas_saxpy(
+                        kHalfDimensions, (float)feature.get_count(),
+                        &weights_[weights_offset], 1, &output_[output_offset], 1
+                    );
                 }
 #else
-                for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                    output_[output_offset + i] = biases_[i];
-                }
+                Blas::scopy(
+                    thread_pool,
+                    kHalfDimensions, biases_, 1, &output_[output_offset], 1
+                );
                 for (const auto& feature : batch[b].training_features[c]) {
                     const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                    for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                        output_[output_offset + i] +=
-                            feature.get_count() * weights_[weights_offset + i];
-                    }
+                    Blas::saxpy(
+                        thread_pool,
+                        kHalfDimensions, (float)feature.get_count(),
+                        &weights_[weights_offset], 1, &output_[output_offset], 1
+                    );
                 }
 #endif
             }
         }
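In the feature transformer the input is sparse, so the dense copy-then-accumulate pattern applies per active feature: scopy seeds the accumulator with the biases, then one saxpy per active feature adds that feature's weight column scaled by its count. A compact reference of the pattern, using hypothetical types rather than the trainer's real ones:

    #include <cstddef>
    #include <vector>

    struct ActiveFeature { std::size_t index; float count; };

    // output = biases + sum over active features of count * weights[column(feature)]
    void accumulate_features(std::size_t kHalf, const float* biases, const float* weights,
                             const std::vector<ActiveFeature>& features, float* output) {
        for (std::size_t i = 0; i < kHalf; ++i)   // scopy: seed with biases
            output[i] = biases[i];
        for (const auto& f : features) {          // one saxpy per active feature
            const float* column = &weights[kHalf * f.index];
            for (std::size_t i = 0; i < kHalf; ++i)
                output[i] += f.count * column[i];
        }
    }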
@@ -171,19 +185,27 @@ namespace Eval::NNUE {
         // Correct the learning rate and adjust the scale without using momentum
         const LearnFloatType effective_learning_rate =
             static_cast<LearnFloatType>(local_learning_rate / (1.0 - momentum_));
 
 #if defined(USE_BLAS)
-        cblas_sscal(kHalfDimensions, momentum_, biases_diff_, 1);
+        cblas_sscal(
+            kHalfDimensions, momentum_, biases_diff_, 1
+        );
         for (IndexType b = 0; b < batch_->size(); ++b) {
             const IndexType batch_offset = kOutputDimensions * b;
             for (IndexType c = 0; c < 2; ++c) {
                 const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                cblas_saxpy(kHalfDimensions, 1.0,
-                            &gradients_[output_offset], 1, biases_diff_, 1);
+                cblas_saxpy(
+                    kHalfDimensions, 1.0,
+                    &gradients_[output_offset], 1, biases_diff_, 1
+                );
             }
         }
 
-        cblas_saxpy(kHalfDimensions, -local_learning_rate,
-                    biases_diff_, 1, biases_, 1);
+        cblas_saxpy(
+            kHalfDimensions, -local_learning_rate,
+            biases_diff_, 1, biases_, 1
+        );
 
 #pragma omp parallel
         {
@@ -205,45 +227,67 @@ namespace Eval::NNUE {
                     const auto scale = static_cast<LearnFloatType>(
                         effective_learning_rate / feature.get_count());
 
-                    cblas_saxpy(kHalfDimensions, -scale,
-                                &gradients_[output_offset], 1,
-                                &weights_[weights_offset], 1);
+                    cblas_saxpy(
+                        kHalfDimensions, -scale,
+                        &gradients_[output_offset], 1,
+                        &weights_[weights_offset], 1
+                    );
                 }
             }
         }
     }
 
 #else
-        for (IndexType i = 0; i < kHalfDimensions; ++i) {
-            biases_diff_[i] *= momentum_;
-        }
+        Blas::sscal(
+            thread_pool,
+            kHalfDimensions, momentum_, biases_diff_, 1
+        );
 
         for (IndexType b = 0; b < batch_->size(); ++b) {
             const IndexType batch_offset = kOutputDimensions * b;
             for (IndexType c = 0; c < 2; ++c) {
                 const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                    biases_diff_[i] += gradients_[output_offset + i];
-                }
+                Blas::saxpy(
+                    thread_pool,
+                    kHalfDimensions, 1.0,
+                    &gradients_[output_offset], 1, biases_diff_, 1
+                );
             }
         }
 
-        for (IndexType i = 0; i < kHalfDimensions; ++i) {
-            biases_[i] -= local_learning_rate * biases_diff_[i];
-        }
+        Blas::saxpy(
+            thread_pool,
+            kHalfDimensions, -local_learning_rate,
+            biases_diff_, 1, biases_, 1
+        );
 
-        for (IndexType b = 0; b < batch_->size(); ++b) {
-            const IndexType batch_offset = kOutputDimensions * b;
-            for (IndexType c = 0; c < 2; ++c) {
-                const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                for (const auto& feature : (*batch_)[b].training_features[c]) {
-                    const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                    const auto scale = static_cast<LearnFloatType>(
-                        effective_learning_rate / feature.get_count());
-                    for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                        weights_[weights_offset + i] -=
-                            scale * gradients_[output_offset + i];
-                    }
-                }
-            }
-        }
+#pragma omp parallel
+        {
+#if defined(_OPENMP)
+            const IndexType num_threads = omp_get_num_threads();
+            const IndexType thread_index = omp_get_thread_num();
+#endif
+            for (IndexType b = 0; b < batch_->size(); ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType c = 0; c < 2; ++c) {
+                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
+                    for (const auto& feature : (*batch_)[b].training_features[c]) {
+#if defined(_OPENMP)
+                        if (feature.get_index() % num_threads != thread_index)
+                            continue;
+#endif
+                        const IndexType weights_offset =
+                            kHalfDimensions * feature.get_index();
+                        const auto scale = static_cast<LearnFloatType>(
+                            effective_learning_rate / feature.get_count());
+
+                        Blas::saxpy(
+                            thread_pool,
+                            kHalfDimensions, -scale,
+                            &gradients_[output_offset], 1,
+                            &weights_[weights_offset], 1
+                        );
+                    }
+                }
+            }
+        }
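The per-feature weight update keeps the OpenMP sharding idiom already used in the USE_BLAS branch: every thread walks the whole batch but only applies features whose index it owns (feature.get_index() % num_threads == thread_index), so two threads never write to the same weight column and no locking is needed. A sketch of the idiom under OpenMP, with hypothetical types and the per-feature scale derived as in the trainer:

    #include <omp.h>
    #include <cstddef>
    #include <vector>

    struct ActiveFeature { std::size_t index; float count; };

    // Each thread updates only the weight columns whose feature index it owns,
    // so concurrent threads never touch the same column.
    void update_sharded(std::size_t kHalf, float effective_lr, const float* gradient,
                        const std::vector<ActiveFeature>& features, float* weights) {
        #pragma omp parallel
        {
            const std::size_t num_threads = (std::size_t)omp_get_num_threads();
            const std::size_t thread_index = (std::size_t)omp_get_thread_num();
            for (const auto& f : features) {
                if (f.index % num_threads != thread_index)
                    continue; // a different thread owns this weight column
                const float scale = effective_lr / f.count;
                float* column = &weights[kHalf * f.index];
                for (std::size_t i = 0; i < kHalf; ++i)
                    column[i] -= scale * gradient[i];
            }
        }
    }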
@@ -3,6 +3,8 @@
 
 #include "trainer.h"
 
+#include "extra/stockfish_blas.h"
+
 #include "learn/learn.h"
 
 #include "nnue/layers/input_slice.h"
@@ -208,13 +210,21 @@ namespace Eval::NNUE {
         for (IndexType b = 0; b < batch_size_; ++b) {
             const IndexType input_offset = kInputDimensions * b;
             const IndexType output_offset = kOutputDimensions * b;
 
 #if defined(USE_BLAS)
-            cblas_scopy(kOutputDimensions, &input[input_offset + Offset], 1,
-                        &output_[output_offset], 1);
+            cblas_scopy(
+                kOutputDimensions, &input[input_offset + Offset], 1,
+                &output_[output_offset], 1
+            );
 #else
-            for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                output_[output_offset + i] = input[input_offset + Offset + i];
-            }
+            Blas::scopy(
+                thread_pool,
+                kOutputDimensions, &input[input_offset + Offset], 1,
+                &output_[output_offset], 1
+            );
 #endif
         }
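The input-slice trainer just copies a kOutputDimensions-wide window starting at the compile-time Offset out of each batch row, so with unit strides the scopy call is a plain window copy. Reference semantics, as a hypothetical helper:

    #include <cstddef>

    // Copy the [offset, offset + kOut) window of each input row to the output.
    void slice_batch(std::size_t kIn, std::size_t kOut, std::size_t offset,
                     std::size_t batch, const float* input, float* output) {
        for (std::size_t b = 0; b < batch; ++b)
            for (std::size_t i = 0; i < kOut; ++i)
                output[kOut * b + i] = input[kIn * b + offset + i];
    }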
@@ -3,6 +3,8 @@
 
 #include "trainer.h"
 
+#include "extra/stockfish_blas.h"
+
 #include "learn/learn.h"
 
 #include "nnue/layers/sum.h"
@@ -53,15 +55,19 @@ namespace Eval::NNUE {
         const auto head_output = previous_layer_trainer_->propagate(thread_pool, batch);
 
 #if defined(USE_BLAS)
-        cblas_saxpy(kOutputDimensions * batch_size_, 1.0,
-                    head_output, 1, output, 1);
+        cblas_saxpy(
+            kOutputDimensions * batch_size_, 1.0,
+            head_output, 1, output, 1
+        );
 #else
-        for (IndexType b = 0; b < batch_size_; ++b) {
-            const IndexType batch_offset = kOutputDimensions * b;
-            for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                output[batch_offset + i] += head_output[batch_offset + i];
-            }
-        }
+        Blas::saxpy(
+            thread_pool,
+            kOutputDimensions * batch_size_, 1.0,
+            head_output, 1, output, 1
+        );
 #endif
         return output;
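Because output and head_output are contiguous batch_size_ x kOutputDimensions buffers, the per-batch-element loop collapses into a single saxpy of length kOutputDimensions * batch_size_ with alpha = 1.0, the same elementwise sum in one call:

    #include <cstddef>

    // output += head_output over the whole flattened batch, i.e. one saxpy of
    // length kOut * batch instead of a loop of per-element additions.
    void sum_layers(std::size_t kOut, std::size_t batch,
                    const float* head_output, float* output) {
        const std::size_t n = kOut * batch;
        for (std::size_t i = 0; i < n; ++i)
            output[i] += head_output[i];
    }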