Replace omp in trainer_feature_transformer

Tomasz Sobczyk
2020-10-27 11:57:45 +01:00
committed by nodchip
parent db1b33d4ac
commit e8907bcfc4


@@ -19,10 +19,6 @@
 #include <random>
 #include <set>
 
-#if defined(_OPENMP)
-#include <omp.h>
-#endif
-
 // Specialization for feature transformer of learning class template of NNUE evaluation function
 
 namespace Eval::NNUE {
@@ -104,44 +100,45 @@ namespace Eval::NNUE {
     batch_ = &batch;
 
     // affine transform
-#pragma omp parallel for
-    for (IndexType b = 0; b < batch.size(); ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType c = 0; c < 2; ++c) {
-        const IndexType output_offset = batch_offset + kHalfDimensions * c;
+    thread_pool.for_each_index_with_workers(
+        0, batch.size(),
+        [&](Thread&, int b) {
+          const IndexType batch_offset = kOutputDimensions * b;
+          for (IndexType c = 0; c < 2; ++c) {
+            const IndexType output_offset = batch_offset + kHalfDimensions * c;
 #if defined(USE_BLAS)
-        cblas_scopy(
-            kHalfDimensions, biases_, 1, &output_[output_offset], 1
-        );
-        for (const auto& feature : batch[b].training_features[c]) {
-          const IndexType weights_offset = kHalfDimensions * feature.get_index();
-          cblas_saxpy(
-              kHalfDimensions, (float)feature.get_count(),
-              &weights_[weights_offset], 1, &output_[output_offset], 1
-          );
-        }
+            cblas_scopy(
+                kHalfDimensions, biases_, 1, &output_[output_offset], 1
+            );
+            for (const auto& feature : batch[b].training_features[c]) {
+              const IndexType weights_offset = kHalfDimensions * feature.get_index();
+              cblas_saxpy(
+                  kHalfDimensions, (float)feature.get_count(),
+                  &weights_[weights_offset], 1, &output_[output_offset], 1
+              );
+            }
 #else
-        Blas::scopy(
-            thread_pool,
-            kHalfDimensions, biases_, 1, &output_[output_offset], 1
-        );
-        for (const auto& feature : batch[b].training_features[c]) {
-          const IndexType weights_offset = kHalfDimensions * feature.get_index();
-          Blas::saxpy(
-              thread_pool,
-              kHalfDimensions, (float)feature.get_count(),
-              &weights_[weights_offset], 1, &output_[output_offset], 1
-          );
-        }
+            Blas::scopy(
+                kHalfDimensions, biases_, 1, &output_[output_offset], 1
+            );
+            for (const auto& feature : batch[b].training_features[c]) {
+              const IndexType weights_offset = kHalfDimensions * feature.get_index();
+              Blas::saxpy(
+                  kHalfDimensions, (float)feature.get_count(),
+                  &weights_[weights_offset], 1, &output_[output_offset], 1
+              );
+            }
 #endif
-      }
-    }
+          }
+        }
+    );
+    thread_pool.wait_for_workers_finished();
 
 #if defined (USE_SSE2)
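
Note on the new API: `for_each_index_with_workers` hands each index in [0, batch.size()) to one of the trainer's worker threads, and `wait_for_workers_finished` joins them, replacing what `#pragma omp parallel for` did implicitly. The ThreadPool itself is not part of this diff, so the following is only a minimal sketch of the pattern built on `std::thread`; the `Thread` struct, the blocking join inside the helper, and the round-robin index split are assumptions, not this repository's implementation.

    #include <functional>
    #include <thread>
    #include <vector>

    struct Thread { int idx; };  // stand-in for the trainer's Thread type

    // Minimal model of the for_each_index_with_workers pattern: a fixed set
    // of workers, each taking every num_workers-th index (round-robin split).
    // The real ThreadPool in this repository may schedule differently.
    void for_each_index_with_workers(int num_workers, int begin, int end,
                                     const std::function<void(Thread&, int)>& body) {
        std::vector<std::thread> workers;
        for (int w = 0; w < num_workers; ++w)
            workers.emplace_back([&, w] {
                Thread th{w};
                for (int i = begin + w; i < end; i += num_workers)
                    body(th, i);
            });
        for (auto& t : workers)
            t.join();  // plays the role of wait_for_workers_finished()
    }
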
@@ -358,6 +355,7 @@
     cblas_sscal(
         kHalfDimensions, momentum_, biases_diff_, 1
     );
+
     for (IndexType b = 0; b < batch_->size(); ++b) {
       const IndexType batch_offset = kOutputDimensions * b;
       for (IndexType c = 0; c < 2; ++c) {
@@ -374,36 +372,6 @@
         biases_diff_, 1, biases_, 1
     );
-
-#pragma omp parallel
-    {
-#if defined(_OPENMP)
-      const IndexType num_threads = omp_get_num_threads();
-      const IndexType thread_index = omp_get_thread_num();
-#endif
-      for (IndexType b = 0; b < batch_->size(); ++b) {
-        const IndexType batch_offset = kOutputDimensions * b;
-        for (IndexType c = 0; c < 2; ++c) {
-          const IndexType output_offset = batch_offset + kHalfDimensions * c;
-          for (const auto& feature : (*batch_)[b].training_features[c]) {
-#if defined(_OPENMP)
-            if (feature.get_index() % num_threads != thread_index)
-              continue;
-#endif
-            const IndexType weights_offset =
-                kHalfDimensions * feature.get_index();
-            const auto scale = static_cast<LearnFloatType>(
-                effective_learning_rate / feature.get_count());
-            cblas_saxpy(
-                kHalfDimensions, -scale,
-                &gradients_[output_offset], 1,
-                &weights_[weights_offset], 1
-            );
-          }
-        }
-      }
-    }
 #else
     Blas::sscal(
@@ -429,38 +397,47 @@
         biases_diff_, 1, biases_, 1
     );
-
-#pragma omp parallel
-    {
-#if defined(_OPENMP)
-      const IndexType num_threads = omp_get_num_threads();
-      const IndexType thread_index = omp_get_thread_num();
-#endif
-      for (IndexType b = 0; b < batch_->size(); ++b) {
-        const IndexType batch_offset = kOutputDimensions * b;
-        for (IndexType c = 0; c < 2; ++c) {
-          const IndexType output_offset = batch_offset + kHalfDimensions * c;
-          for (const auto& feature : (*batch_)[b].training_features[c]) {
-#if defined(_OPENMP)
-            if (feature.get_index() % num_threads != thread_index)
-              continue;
-#endif
-            const IndexType weights_offset =
-                kHalfDimensions * feature.get_index();
-            const auto scale = static_cast<LearnFloatType>(
-                effective_learning_rate / feature.get_count());
-            Blas::saxpy(
-                thread_pool,
-                kHalfDimensions, -scale,
-                &gradients_[output_offset], 1,
-                &weights_[weights_offset], 1
-            );
-          }
-        }
-      }
-    }
 #endif
+
+    thread_pool.execute_with_workers(
+        [&, num_threads = thread_pool.size()](Thread& th) {
+          const auto thread_index = th.thread_idx();
+          for (IndexType b = 0; b < batch_->size(); ++b) {
+            const IndexType batch_offset = kOutputDimensions * b;
+            for (IndexType c = 0; c < 2; ++c) {
+              const IndexType output_offset = batch_offset + kHalfDimensions * c;
+              for (const auto& feature : (*batch_)[b].training_features[c]) {
+                if (feature.get_index() % num_threads != thread_index)
+                  continue;
+                const IndexType weights_offset =
+                    kHalfDimensions * feature.get_index();
+                const auto scale = static_cast<LearnFloatType>(
+                    effective_learning_rate / feature.get_count());
+#if defined (USE_BLAS)
+                cblas_saxpy(
+                    kHalfDimensions, -scale,
+                    &gradients_[output_offset], 1,
+                    &weights_[weights_offset], 1
+                );
+#else
+                Blas::saxpy(
+                    kHalfDimensions, -scale,
+                    &gradients_[output_offset], 1,
+                    &weights_[weights_offset], 1
+                );
+#endif
+              }
+            }
+          }
+        }
+    );
+
     for (IndexType b = 0; b < batch_->size(); ++b) {
       for (IndexType c = 0; c < 2; ++c) {
         for (const auto& feature : (*batch_)[b].training_features[c]) {
@@ -468,6 +445,8 @@ namespace Eval::NNUE {
         }
       }
     }
+
+    thread_pool.wait_for_workers_finished();
   }
 
 private:
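
The `feature.get_index() % num_threads != thread_index` filter preserves the old OpenMP trick for lock-free updates: the weight row at offset `kHalfDimensions * feature.get_index()` is written only by the worker whose index matches the feature index modulo the worker count, so no two threads ever update the same row and no locking is needed. A minimal sketch of that ownership rule, with illustrative names and a simplified flat layout rather than the repository's code:

    // Every row belongs to exactly one worker, so concurrent calls with
    // distinct thread_index values never touch the same memory.
    void update_owned_rows(float* weights, const float* gradient,
                           int num_rows, int row_len, float scale,
                           int num_threads, int thread_index) {
        for (int row = 0; row < num_rows; ++row) {
            if (row % num_threads != thread_index)
                continue;  // another worker owns this row; skip it
            for (int i = 0; i < row_len; ++i)
                weights[row * row_len + i] -= scale * gradient[i];
        }
    }
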
@@ -493,22 +472,25 @@
     std::vector<TrainingFeature> training_features;
-#pragma omp parallel for private(training_features)
-    for (IndexType j = 0; j < RawFeatures::kDimensions; ++j) {
-      training_features.clear();
-      Features::Factorizer<RawFeatures>::append_training_features(
-          j, &training_features);
-      for (IndexType i = 0; i < kHalfDimensions; ++i) {
-        double sum = 0.0;
-        for (const auto& feature : training_features) {
-          sum += weights_[kHalfDimensions * feature.get_index() + i];
-        }
-        target_layer_->weights_[kHalfDimensions * j + i] =
-            round<typename LayerType::WeightType>(sum * kWeightScale);
-      }
-    }
+    Threads.for_each_index_with_workers(
+        0, RawFeatures::kDimensions,
+        [this, training_features](Thread&, int j) mutable {
+          training_features.clear();
+          Features::Factorizer<RawFeatures>::append_training_features(
+              j, &training_features);
+          for (IndexType i = 0; i < kHalfDimensions; ++i) {
+            double sum = 0.0;
+            for (const auto& feature : training_features) {
+              sum += weights_[kHalfDimensions * feature.get_index() + i];
+            }
+            target_layer_->weights_[kHalfDimensions * j + i] =
+                round<typename LayerType::WeightType>(sum * kWeightScale);
+          }
+        }
+    );
+    Threads.wait_for_workers_finished();
   }
 
   void reset_stats() {
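
In this last hunk, capturing `training_features` by value in a `mutable` lambda stands in for OpenMP's `private(training_features)` clause: the callable owns its own scratch vector instead of sharing the enclosing one. Whether each worker thread receives a distinct copy depends on how the pool distributes the callable, so this is a sketch under that assumption, with illustrative names:

    #include <vector>

    // A mutable lambda that captures a scratch vector by value owns a private
    // copy, much like OpenMP's private() gave each thread its own instance.
    auto make_task(std::vector<int> scratch) {
        return [scratch](int j) mutable {
            scratch.clear();       // clears this task's own copy only
            scratch.push_back(j);  // fill and use the private buffer
        };
    }

    // Usage: each task object built here carries an independent buffer.
    // auto t1 = make_task({}); auto t2 = make_task({});
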