Replace non-BLAS parts of trainers with our own BLAS-like routines.

Tomasz Sobczyk
2020-10-28 14:52:27 +01:00
committed by nodchip
parent c56a4a36eb
commit a56d8124d8
4 changed files with 207 additions and 121 deletions
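
extra/stockfish_blas.h itself is not part of this diff. Judging from the call sites below, the new routines mirror the CBLAS kernels (scopy, saxpy, sscal, sgemm) but take the trainer's thread pool as a leading argument so large ranges can be split across threads. A minimal serial sketch of the three vector kernels, with the pool type left generic because the real signatures are not shown here:

    #include <cstddef>

    namespace Blas {

        // y := x (strided copy). The real routines can split [0, n) across
        // the pool's threads; this sketch runs serially and assumes
        // positive strides.
        template <typename ThreadPool>
        void scopy(ThreadPool&, std::size_t n,
                   const float* x, std::size_t incx, float* y, std::size_t incy) {
            for (std::size_t i = 0; i < n; ++i)
                y[i * incy] = x[i * incx];
        }

        // y := a*x + y
        template <typename ThreadPool>
        void saxpy(ThreadPool&, std::size_t n, float a,
                   const float* x, std::size_t incx, float* y, std::size_t incy) {
            for (std::size_t i = 0; i < n; ++i)
                y[i * incy] += a * x[i * incx];
        }

        // x := a*x
        template <typename ThreadPool>
        void sscal(ThreadPool&, std::size_t n, float a, float* x, std::size_t incx) {
            for (std::size_t i = 0; i < n; ++i)
                x[i * incx] *= a;
        }

    } // namespace Blas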

File 1 of 4

@@ -3,6 +3,8 @@
#include "trainer.h"
#include "extra/stockfish_blas.h"
#include "learn/learn.h"
#include "nnue/layers/affine_transform.h"
@@ -98,32 +100,46 @@ namespace Eval::NNUE {
     batch_size_ = static_cast<IndexType>(batch.size());
     batch_input_ = previous_layer_trainer_->propagate(thread_pool, batch);
 #if defined(USE_BLAS)
     for (IndexType b = 0; b < batch_size_; ++b) {
         const IndexType batch_offset = kOutputDimensions * b;
-        cblas_scopy(kOutputDimensions, biases_, 1, &output_[batch_offset], 1);
+        cblas_scopy(
+            kOutputDimensions, biases_, 1, &output_[batch_offset], 1
+        );
     }
-    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
-                kOutputDimensions, batch_size_, kInputDimensions, 1.0,
-                weights_, kInputDimensions,
-                batch_input_, kInputDimensions,
-                1.0, &output_[0], kOutputDimensions);
+    cblas_sgemm(
+        CblasColMajor, CblasTrans, CblasNoTrans,
+        kOutputDimensions, batch_size_, kInputDimensions,
+        1.0,
+        weights_, kInputDimensions,
+        batch_input_, kInputDimensions,
+        1.0,
+        &output_[0], kOutputDimensions
+    );
 #else
-    for (IndexType b = 0; b < batch_size_; ++b) {
-        const IndexType input_batch_offset = kInputDimensions * b;
-        const IndexType output_batch_offset = kOutputDimensions * b;
-        for (IndexType i = 0; i < kOutputDimensions; ++i) {
-            double sum = biases_[i];
-            for (IndexType j = 0; j < kInputDimensions; ++j) {
-                const IndexType index = kInputDimensions * i + j;
-                sum += weights_[index] * batch_input_[input_batch_offset + j];
-            }
-            output_[output_batch_offset + i] = static_cast<LearnFloatType>(sum);
-        }
+    for (IndexType b = 0; b < batch_size_; ++b) {
+        const IndexType batch_offset = kOutputDimensions * b;
+        Blas::scopy(
+            thread_pool,
+            kOutputDimensions, biases_, 1, &output_[batch_offset], 1
+        );
+    }
+    Blas::sgemm(
+        thread_pool,
+        Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans,
+        kOutputDimensions, batch_size_, kInputDimensions,
+        1.0,
+        weights_, kInputDimensions,
+        batch_input_, kInputDimensions,
+        1.0,
+        &output_[0], kOutputDimensions
+    );
 #endif
     return output_.data();
 }
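
Both the cblas_sgemm call and its Blas::sgemm counterpart compute C = alpha*op(A)*op(B) + beta*C with M = kOutputDimensions, N = batch_size_, K = kInputDimensions. The Trans flag on A makes the column-major routine read the row-major weights_ array as (outputs x inputs), and beta = 1.0 keeps the biases that scopy just wrote into output_. Spelled out as plain loops (a reference for this specific call shape, not the library implementation):

    // C[m + n*ldc] = beta*C[m + n*ldc] + alpha * sum_k A[k + m*lda] * B[k + n*ldb]
    // A = weights_ (lda = K), B = batch_input_ (ldb = K), C = output_ (ldc = M)
    void sgemm_colmajor_At_B(int M, int N, int K, float alpha,
                             const float* A, int lda, const float* B, int ldb,
                             float beta, float* C, int ldc) {
        for (int n = 0; n < N; ++n)
            for (int m = 0; m < M; ++m) {
                float sum = 0.0f;
                for (int k = 0; k < K; ++k)
                    sum += A[k + m * lda] * B[k + n * ldb];
                C[m + n * ldc] = beta * C[m + n * ldc] + alpha * sum;
            }
    }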
@@ -137,67 +153,77 @@ namespace Eval::NNUE {
         learning_rate * learning_rate_scale_;
 #if defined(USE_BLAS)
     // backpropagate
-    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
-                kInputDimensions, batch_size_, kOutputDimensions, 1.0,
-                weights_, kInputDimensions,
-                gradients, kOutputDimensions,
-                0.0, &gradients_[0], kInputDimensions);
+    cblas_sgemm(
+        CblasColMajor, CblasNoTrans, CblasNoTrans,
+        kInputDimensions, batch_size_, kOutputDimensions,
+        1.0,
+        weights_, kInputDimensions,
+        gradients, kOutputDimensions,
+        0.0,
+        &gradients_[0], kInputDimensions
+    );
     // update
-    cblas_sscal(kOutputDimensions, momentum_, biases_diff_, 1);
+    cblas_sscal(
+        kOutputDimensions, momentum_, biases_diff_, 1
+    );
     for (IndexType b = 0; b < batch_size_; ++b) {
         const IndexType batch_offset = kOutputDimensions * b;
-        cblas_saxpy(kOutputDimensions, 1.0,
+        cblas_saxpy(
+            kOutputDimensions, 1.0,
+            &gradients[batch_offset], 1, biases_diff_, 1
+        );
     }
+    cblas_sgemm(
+        CblasRowMajor, CblasTrans, CblasNoTrans,
+        kOutputDimensions, kInputDimensions, batch_size_,
+        1.0,
+        gradients, kOutputDimensions,
+        batch_input_, kInputDimensions,
+        momentum_,
+        weights_diff_, kInputDimensions
+    );
+#else
+    // backpropagate
+    Blas::sgemm(
+        thread_pool,
+        Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::NoTrans, Blas::MatrixTranspose::NoTrans,
+        kInputDimensions, batch_size_, kOutputDimensions,
+        1.0,
+        weights_, kInputDimensions,
+        gradients, kOutputDimensions,
+        0.0,
+        &gradients_[0], kInputDimensions
+    );
+    Blas::sscal(
+        thread_pool,
+        kOutputDimensions, momentum_, biases_diff_, 1
+    );
+    for (IndexType b = 0; b < batch_size_; ++b) {
+        const IndexType batch_offset = kOutputDimensions * b;
+        Blas::saxpy(thread_pool, kOutputDimensions, 1.0,
+            &gradients[batch_offset], 1, biases_diff_, 1);
+    }
-    cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
-                kOutputDimensions, kInputDimensions, batch_size_, 1.0,
-                gradients, kOutputDimensions,
-                batch_input_, kInputDimensions,
-                momentum_, weights_diff_, kInputDimensions);
+    Blas::sgemm(
+        thread_pool,
+        Blas::MatrixLayout::RowMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans,
+        kOutputDimensions, kInputDimensions, batch_size_,
+        1.0,
+        gradients, kOutputDimensions,
+        batch_input_, kInputDimensions,
+        momentum_,
+        weights_diff_, kInputDimensions
+    );
-#else
-    // backpropagate
-    for (IndexType b = 0; b < batch_size_; ++b) {
-        const IndexType input_batch_offset = kInputDimensions * b;
-        const IndexType output_batch_offset = kOutputDimensions * b;
-        for (IndexType j = 0; j < kInputDimensions; ++j) {
-            double sum = 0.0;
-            for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                const IndexType index = kInputDimensions * i + j;
-                sum += weights_[index] * gradients[output_batch_offset + i];
-            }
-            gradients_[input_batch_offset + j] = static_cast<LearnFloatType>(sum);
-        }
-    }
-    // update
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        biases_diff_[i] *= momentum_;
-    }
-    for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-        weights_diff_[i] *= momentum_;
-    }
-    for (IndexType b = 0; b < batch_size_; ++b) {
-        const IndexType input_batch_offset = kInputDimensions * b;
-        const IndexType output_batch_offset = kOutputDimensions * b;
-        for (IndexType i = 0; i < kOutputDimensions; ++i) {
-            biases_diff_[i] += gradients[output_batch_offset + i];
-        }
-        for (IndexType i = 0; i < kOutputDimensions; ++i) {
-            for (IndexType j = 0; j < kInputDimensions; ++j) {
-                const IndexType index = kInputDimensions * i + j;
-                weights_diff_[index] += gradients[output_batch_offset + i] *
-                    batch_input_[input_batch_offset + j];
-            }
-        }
-    }
 #endif
     for (IndexType i = 0; i < kOutputDimensions; ++i) {
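
In the backward pass the first GEMM forms the gradient with respect to the layer input (beta = 0.0 overwrites gradients_), and the second accumulates the weight gradient while passing momentum_ as beta, which folds the former scale-then-add of weights_diff_ into one call. As plain loops (reference only; it matches the scalar fallback removed above):

    // k_in = kInputDimensions, k_out = kOutputDimensions
    void backprop_reference(const float* weights, const float* gradients,
                            const float* batch_input,
                            float* gradients_out, float* weights_diff,
                            float momentum, int k_in, int k_out, int batch) {
        // 1) gradient w.r.t. the layer input: grad_in = W^T * grad_out per position
        for (int b = 0; b < batch; ++b)
            for (int j = 0; j < k_in; ++j) {
                float sum = 0.0f;
                for (int i = 0; i < k_out; ++i)
                    sum += weights[i * k_in + j] * gradients[b * k_out + i];
                gradients_out[b * k_in + j] = sum;
            }
        // 2) weight gradient with momentum folded in via beta:
        //    weights_diff = momentum * weights_diff + grad_out * input^T
        for (int i = 0; i < k_out; ++i)
            for (int j = 0; j < k_in; ++j) {
                float sum = 0.0f;
                for (int b = 0; b < batch; ++b)
                    sum += gradients[b * k_out + i] * batch_input[b * k_in + j];
                weights_diff[i * k_in + j] = momentum * weights_diff[i * k_in + j] + sum;
            }
    }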

File 2 of 4

@@ -3,6 +3,8 @@
#include "trainer.h"
#include "extra/stockfish_blas.h"
#include "features/factorizer_feature_set.h"
#include "learn/learn.h"
@@ -107,24 +109,36 @@ namespace Eval::NNUE {
         const IndexType batch_offset = kOutputDimensions * b;
         for (IndexType c = 0; c < 2; ++c) {
             const IndexType output_offset = batch_offset + kHalfDimensions * c;
 #if defined(USE_BLAS)
-            cblas_scopy(kHalfDimensions, biases_, 1, &output_[output_offset], 1);
+            cblas_scopy(
+                kHalfDimensions, biases_, 1, &output_[output_offset], 1
+            );
             for (const auto& feature : batch[b].training_features[c]) {
                 const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                cblas_saxpy(kHalfDimensions, (float)feature.get_count(),
-                            &weights_[weights_offset], 1, &output_[output_offset], 1);
+                cblas_saxpy(
+                    kHalfDimensions, (float)feature.get_count(),
+                    &weights_[weights_offset], 1, &output_[output_offset], 1
+                );
             }
 #else
-            for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                output_[output_offset + i] = biases_[i];
-            }
+            Blas::scopy(
+                thread_pool,
+                kHalfDimensions, biases_, 1, &output_[output_offset], 1
+            );
             for (const auto& feature : batch[b].training_features[c]) {
                 const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                    output_[output_offset + i] +=
-                        feature.get_count() * weights_[weights_offset + i];
-                }
+                Blas::saxpy(
+                    thread_pool,
+                    kHalfDimensions, (float)feature.get_count(),
+                    &weights_[weights_offset], 1, &output_[output_offset], 1
+                );
             }
 #endif
         }
     }
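
The feature-transformer input is sparse, so propagation avoids a dense GEMM entirely: one scopy seeds each half with the biases, then one saxpy per active feature adds that feature's weight row scaled by its count. The same step in isolation (TrainingFeature here is a stand-in for the real type, which exposes get_index() and get_count()):

    #include <cstddef>
    #include <vector>

    struct TrainingFeature {
        std::size_t index;
        unsigned count;
        std::size_t get_index() const { return index; }
        unsigned get_count() const { return count; }
    };

    // One (position, perspective) accumulation: copy the biases, then add
    // count * weight-row for every active feature.
    void accumulate(float* output, const float* biases, const float* weights,
                    std::size_t half_dimensions,
                    const std::vector<TrainingFeature>& features) {
        for (std::size_t i = 0; i < half_dimensions; ++i)
            output[i] = biases[i];                                   // scopy
        for (const auto& f : features)                               // saxpy per feature
            for (std::size_t i = 0; i < half_dimensions; ++i)
                output[i] += static_cast<float>(f.get_count())
                           * weights[half_dimensions * f.get_index() + i];
    }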
@@ -171,19 +185,27 @@ namespace Eval::NNUE {
     // Correct the learning rate and adjust the scale without using momentum
     const LearnFloatType effective_learning_rate =
         static_cast<LearnFloatType>(local_learning_rate / (1.0 - momentum_));
 #if defined(USE_BLAS)
-    cblas_sscal(kHalfDimensions, momentum_, biases_diff_, 1);
+    cblas_sscal(
+        kHalfDimensions, momentum_, biases_diff_, 1
+    );
     for (IndexType b = 0; b < batch_->size(); ++b) {
         const IndexType batch_offset = kOutputDimensions * b;
         for (IndexType c = 0; c < 2; ++c) {
             const IndexType output_offset = batch_offset + kHalfDimensions * c;
-            cblas_saxpy(kHalfDimensions, 1.0,
-                        &gradients_[output_offset], 1, biases_diff_, 1);
+            cblas_saxpy(
+                kHalfDimensions, 1.0,
+                &gradients_[output_offset], 1, biases_diff_, 1
+            );
         }
     }
-    cblas_saxpy(kHalfDimensions, -local_learning_rate,
-                biases_diff_, 1, biases_, 1);
+    cblas_saxpy(
+        kHalfDimensions, -local_learning_rate,
+        biases_diff_, 1, biases_, 1
+    );
 #pragma omp parallel
     {
@@ -205,45 +227,67 @@ namespace Eval::NNUE {
                     const auto scale = static_cast<LearnFloatType>(
                         effective_learning_rate / feature.get_count());
-                    cblas_saxpy(kHalfDimensions, -scale,
-                                &gradients_[output_offset], 1,
-                                &weights_[weights_offset], 1);
+                    cblas_saxpy(
+                        kHalfDimensions, -scale,
+                        &gradients_[output_offset], 1,
+                        &weights_[weights_offset], 1
+                    );
                 }
             }
         }
     }
 #else
-    for (IndexType i = 0; i < kHalfDimensions; ++i) {
-        biases_diff_[i] *= momentum_;
-    }
+    Blas::sscal(
+        thread_pool,
+        kHalfDimensions, momentum_, biases_diff_, 1
+    );
     for (IndexType b = 0; b < batch_->size(); ++b) {
         const IndexType batch_offset = kOutputDimensions * b;
         for (IndexType c = 0; c < 2; ++c) {
             const IndexType output_offset = batch_offset + kHalfDimensions * c;
-            for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                biases_diff_[i] += gradients_[output_offset + i];
-            }
+            Blas::saxpy(
+                thread_pool,
+                kHalfDimensions, 1.0,
+                &gradients_[output_offset], 1, biases_diff_, 1
+            );
         }
     }
-    for (IndexType i = 0; i < kHalfDimensions; ++i) {
-        biases_[i] -= local_learning_rate * biases_diff_[i];
-    }
+    Blas::saxpy(
+        thread_pool,
+        kHalfDimensions, -local_learning_rate,
+        biases_diff_, 1, biases_, 1
+    );
-    for (IndexType b = 0; b < batch_->size(); ++b) {
-        const IndexType batch_offset = kOutputDimensions * b;
-        for (IndexType c = 0; c < 2; ++c) {
-            const IndexType output_offset = batch_offset + kHalfDimensions * c;
-            for (const auto& feature : (*batch_)[b].training_features[c]) {
-                const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                const auto scale = static_cast<LearnFloatType>(
-                    effective_learning_rate / feature.get_count());
+#pragma omp parallel
+    {
+#if defined(_OPENMP)
+        const IndexType num_threads = omp_get_num_threads();
+        const IndexType thread_index = omp_get_thread_num();
+#endif
+        for (IndexType b = 0; b < batch_->size(); ++b) {
+            const IndexType batch_offset = kOutputDimensions * b;
+            for (IndexType c = 0; c < 2; ++c) {
+                const IndexType output_offset = batch_offset + kHalfDimensions * c;
+                for (const auto& feature : (*batch_)[b].training_features[c]) {
+#if defined(_OPENMP)
+                    if (feature.get_index() % num_threads != thread_index)
+                        continue;
+#endif
+                    const IndexType weights_offset =
+                        kHalfDimensions * feature.get_index();
+                    const auto scale = static_cast<LearnFloatType>(
+                        effective_learning_rate / feature.get_count());
-                for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                    weights_[weights_offset + i] -=
-                        scale * gradients_[output_offset + i];
+                    Blas::saxpy(
+                        thread_pool,
+                        kHalfDimensions, -scale,
+                        &gradients_[output_offset], 1,
+                        &weights_[weights_offset], 1
+                    );
                 }
             }
         }
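
Both branches keep the same OpenMP scheme for the weight update: every thread walks the full feature list but only applies features whose index hashes to it (feature.get_index() % num_threads). Features with equal indices share a weight row, so each row gets exactly one writer and the saxpy updates need no locks. The idiom in isolation (illustrative, compile with OpenMP enabled, e.g. -fopenmp):

    #include <omp.h>

    // Rows are statically partitioned by row index modulo the thread count,
    // so no row is ever written by two threads concurrently.
    void update_rows(float* rows, int num_rows, int row_len, const float* delta) {
        #pragma omp parallel
        {
            const int num_threads = omp_get_num_threads();
            const int thread_index = omp_get_thread_num();
            for (int r = 0; r < num_rows; ++r) {
                if (r % num_threads != thread_index)
                    continue;                    // another thread owns this row
                for (int i = 0; i < row_len; ++i)
                    rows[r * row_len + i] -= delta[i];
            }
        }
    }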

File 3 of 4

@@ -3,6 +3,8 @@
#include "trainer.h"
#include "extra/stockfish_blas.h"
#include "learn/learn.h"
#include "nnue/layers/input_slice.h"
@@ -208,13 +210,21 @@ namespace Eval::NNUE {
     for (IndexType b = 0; b < batch_size_; ++b) {
         const IndexType input_offset = kInputDimensions * b;
         const IndexType output_offset = kOutputDimensions * b;
 #if defined(USE_BLAS)
-        cblas_scopy(kOutputDimensions, &input[input_offset + Offset], 1,
-                    &output_[output_offset], 1);
+        cblas_scopy(
+            kOutputDimensions, &input[input_offset + Offset], 1,
+            &output_[output_offset], 1
+        );
 #else
-        for (IndexType i = 0; i < kOutputDimensions; ++i) {
-            output_[output_offset + i] = input[input_offset + Offset + i];
-        }
+        Blas::scopy(
+            thread_pool,
+            kOutputDimensions, &input[input_offset + Offset], 1,
+            &output_[output_offset], 1
+        );
 #endif
     }

File 4 of 4

@@ -3,6 +3,8 @@
#include "trainer.h"
#include "extra/stockfish_blas.h"
#include "learn/learn.h"
#include "nnue/layers/sum.h"
@@ -53,15 +55,19 @@ namespace Eval::NNUE {
     const auto head_output = previous_layer_trainer_->propagate(thread_pool, batch);
 #if defined(USE_BLAS)
-    cblas_saxpy(kOutputDimensions * batch_size_, 1.0,
-                head_output, 1, output, 1);
+    cblas_saxpy(
+        kOutputDimensions * batch_size_, 1.0,
+        head_output, 1, output, 1
+    );
 #else
-    for (IndexType b = 0; b < batch_size_; ++b) {
-        const IndexType batch_offset = kOutputDimensions * b;
-        for (IndexType i = 0; i < kOutputDimensions; ++i) {
-            output[batch_offset + i] += head_output[batch_offset + i];
-        }
-    }
+    Blas::saxpy(
+        thread_pool,
+        kOutputDimensions * batch_size_, 1.0,
+        head_output, 1, output, 1
+    );
 #endif
     return output;
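
Because output stores the batch contiguously (kOutputDimensions floats per position), the per-position accumulation collapses into a single axpy with a = 1.0 over kOutputDimensions * batch_size_ elements; elementwise it is just (illustrative helper, not from the source):

    // total = kOutputDimensions * batch_size_
    void add_in_place(float* output, const float* head_output, int total) {
        for (int t = 0; t < total; ++t)
            output[t] += head_output[t];
    }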