mirror of
https://github.com/HChaZZY/Stockfish.git
synced 2025-12-24 19:16:49 +08:00
Replace omp in trainer_feature_transformer
This commit is contained in:
@@ -19,10 +19,6 @@
|
||||
#include <random>
|
||||
#include <set>
|
||||
|
||||
#if defined(_OPENMP)
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
// Specialization for feature transformer of learning class template of NNUE evaluation function
|
||||
namespace Eval::NNUE {
|
||||
|
||||
@@ -104,44 +100,45 @@ namespace Eval::NNUE {
|
||||
|
||||
batch_ = &batch;
|
||||
// affine transform
|
||||
#pragma omp parallel for
|
||||
for (IndexType b = 0; b < batch.size(); ++b) {
|
||||
const IndexType batch_offset = kOutputDimensions * b;
|
||||
for (IndexType c = 0; c < 2; ++c) {
|
||||
const IndexType output_offset = batch_offset + kHalfDimensions * c;
|
||||
thread_pool.for_each_index_with_workers(
|
||||
0, batch.size(),
|
||||
[&](Thread&, int b) {
|
||||
const IndexType batch_offset = kOutputDimensions * b;
|
||||
for (IndexType c = 0; c < 2; ++c) {
|
||||
const IndexType output_offset = batch_offset + kHalfDimensions * c;
|
||||
|
||||
#if defined(USE_BLAS)
|
||||
|
||||
cblas_scopy(
|
||||
kHalfDimensions, biases_, 1, &output_[output_offset], 1
|
||||
);
|
||||
|
||||
for (const auto& feature : batch[b].training_features[c]) {
|
||||
const IndexType weights_offset = kHalfDimensions * feature.get_index();
|
||||
cblas_saxpy(
|
||||
kHalfDimensions, (float)feature.get_count(),
|
||||
&weights_[weights_offset], 1, &output_[output_offset], 1
|
||||
cblas_scopy(
|
||||
kHalfDimensions, biases_, 1, &output_[output_offset], 1
|
||||
);
|
||||
}
|
||||
|
||||
for (const auto& feature : batch[b].training_features[c]) {
|
||||
const IndexType weights_offset = kHalfDimensions * feature.get_index();
|
||||
cblas_saxpy(
|
||||
kHalfDimensions, (float)feature.get_count(),
|
||||
&weights_[weights_offset], 1, &output_[output_offset], 1
|
||||
);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
Blas::scopy(
|
||||
thread_pool,
|
||||
kHalfDimensions, biases_, 1, &output_[output_offset], 1
|
||||
);
|
||||
for (const auto& feature : batch[b].training_features[c]) {
|
||||
const IndexType weights_offset = kHalfDimensions * feature.get_index();
|
||||
Blas::saxpy(
|
||||
thread_pool,
|
||||
kHalfDimensions, (float)feature.get_count(),
|
||||
&weights_[weights_offset], 1, &output_[output_offset], 1
|
||||
Blas::scopy(
|
||||
kHalfDimensions, biases_, 1, &output_[output_offset], 1
|
||||
);
|
||||
}
|
||||
for (const auto& feature : batch[b].training_features[c]) {
|
||||
const IndexType weights_offset = kHalfDimensions * feature.get_index();
|
||||
Blas::saxpy(
|
||||
kHalfDimensions, (float)feature.get_count(),
|
||||
&weights_[weights_offset], 1, &output_[output_offset], 1
|
||||
);
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
);
|
||||
thread_pool.wait_for_workers_finished();
|
||||
|
||||
#if defined (USE_SSE2)
|
||||
|
||||
@@ -358,6 +355,7 @@ namespace Eval::NNUE {
|
||||
cblas_sscal(
|
||||
kHalfDimensions, momentum_, biases_diff_, 1
|
||||
);
|
||||
|
||||
for (IndexType b = 0; b < batch_->size(); ++b) {
|
||||
const IndexType batch_offset = kOutputDimensions * b;
|
||||
for (IndexType c = 0; c < 2; ++c) {
|
||||
@@ -374,36 +372,6 @@ namespace Eval::NNUE {
|
||||
biases_diff_, 1, biases_, 1
|
||||
);
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
#if defined(_OPENMP)
|
||||
const IndexType num_threads = omp_get_num_threads();
|
||||
const IndexType thread_index = omp_get_thread_num();
|
||||
#endif
|
||||
for (IndexType b = 0; b < batch_->size(); ++b) {
|
||||
const IndexType batch_offset = kOutputDimensions * b;
|
||||
for (IndexType c = 0; c < 2; ++c) {
|
||||
const IndexType output_offset = batch_offset + kHalfDimensions * c;
|
||||
for (const auto& feature : (*batch_)[b].training_features[c]) {
|
||||
#if defined(_OPENMP)
|
||||
if (feature.get_index() % num_threads != thread_index)
|
||||
continue;
|
||||
#endif
|
||||
const IndexType weights_offset =
|
||||
kHalfDimensions * feature.get_index();
|
||||
const auto scale = static_cast<LearnFloatType>(
|
||||
effective_learning_rate / feature.get_count());
|
||||
|
||||
cblas_saxpy(
|
||||
kHalfDimensions, -scale,
|
||||
&gradients_[output_offset], 1,
|
||||
&weights_[weights_offset], 1
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
Blas::sscal(
|
||||
@@ -429,38 +397,47 @@ namespace Eval::NNUE {
|
||||
biases_diff_, 1, biases_, 1
|
||||
);
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
#if defined(_OPENMP)
|
||||
const IndexType num_threads = omp_get_num_threads();
|
||||
const IndexType thread_index = omp_get_thread_num();
|
||||
#endif
|
||||
for (IndexType b = 0; b < batch_->size(); ++b) {
|
||||
const IndexType batch_offset = kOutputDimensions * b;
|
||||
for (IndexType c = 0; c < 2; ++c) {
|
||||
const IndexType output_offset = batch_offset + kHalfDimensions * c;
|
||||
for (const auto& feature : (*batch_)[b].training_features[c]) {
|
||||
#if defined(_OPENMP)
|
||||
if (feature.get_index() % num_threads != thread_index)
|
||||
continue;
|
||||
#endif
|
||||
const IndexType weights_offset =
|
||||
kHalfDimensions * feature.get_index();
|
||||
const auto scale = static_cast<LearnFloatType>(
|
||||
effective_learning_rate / feature.get_count());
|
||||
|
||||
Blas::saxpy(
|
||||
thread_pool,
|
||||
kHalfDimensions, -scale,
|
||||
&gradients_[output_offset], 1,
|
||||
&weights_[weights_offset], 1
|
||||
);
|
||||
thread_pool.execute_with_workers(
|
||||
[&, num_threads = thread_pool.size()](Thread& th) {
|
||||
const auto thread_index = th.thread_idx();
|
||||
|
||||
for (IndexType b = 0; b < batch_->size(); ++b) {
|
||||
const IndexType batch_offset = kOutputDimensions * b;
|
||||
for (IndexType c = 0; c < 2; ++c) {
|
||||
const IndexType output_offset = batch_offset + kHalfDimensions * c;
|
||||
for (const auto& feature : (*batch_)[b].training_features[c]) {
|
||||
if (feature.get_index() % num_threads != thread_index)
|
||||
continue;
|
||||
const IndexType weights_offset =
|
||||
kHalfDimensions * feature.get_index();
|
||||
const auto scale = static_cast<LearnFloatType>(
|
||||
effective_learning_rate / feature.get_count());
|
||||
|
||||
#if defined (USE_BLAS)
|
||||
|
||||
cblas_saxpy(
|
||||
kHalfDimensions, -scale,
|
||||
&gradients_[output_offset], 1,
|
||||
&weights_[weights_offset], 1
|
||||
);
|
||||
|
||||
#else
|
||||
|
||||
Blas::saxpy(
|
||||
kHalfDimensions, -scale,
|
||||
&gradients_[output_offset], 1,
|
||||
&weights_[weights_offset], 1
|
||||
);
|
||||
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
#endif
|
||||
for (IndexType b = 0; b < batch_->size(); ++b) {
|
||||
for (IndexType c = 0; c < 2; ++c) {
|
||||
for (const auto& feature : (*batch_)[b].training_features[c]) {
|
||||
@@ -468,6 +445,8 @@ namespace Eval::NNUE {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
thread_pool.wait_for_workers_finished();
|
||||
}
|
||||
|
||||
private:
|
||||
@@ -493,22 +472,25 @@ namespace Eval::NNUE {
|
||||
|
||||
std::vector<TrainingFeature> training_features;
|
||||
|
||||
#pragma omp parallel for private(training_features)
|
||||
for (IndexType j = 0; j < RawFeatures::kDimensions; ++j) {
|
||||
training_features.clear();
|
||||
Features::Factorizer<RawFeatures>::append_training_features(
|
||||
j, &training_features);
|
||||
Threads.for_each_index_with_workers(
|
||||
0, RawFeatures::kDimensions,
|
||||
[this, training_features](Thread&, int j) mutable {
|
||||
training_features.clear();
|
||||
Features::Factorizer<RawFeatures>::append_training_features(
|
||||
j, &training_features);
|
||||
|
||||
for (IndexType i = 0; i < kHalfDimensions; ++i) {
|
||||
double sum = 0.0;
|
||||
for (const auto& feature : training_features) {
|
||||
sum += weights_[kHalfDimensions * feature.get_index() + i];
|
||||
for (IndexType i = 0; i < kHalfDimensions; ++i) {
|
||||
double sum = 0.0;
|
||||
for (const auto& feature : training_features) {
|
||||
sum += weights_[kHalfDimensions * feature.get_index() + i];
|
||||
}
|
||||
|
||||
target_layer_->weights_[kHalfDimensions * j + i] =
|
||||
round<typename LayerType::WeightType>(sum * kWeightScale);
|
||||
}
|
||||
|
||||
target_layer_->weights_[kHalfDimensions * j + i] =
|
||||
round<typename LayerType::WeightType>(sum * kWeightScale);
|
||||
}
|
||||
}
|
||||
);
|
||||
Threads.wait_for_workers_finished();
|
||||
}
|
||||
|
||||
void reset_stats() {
|
||||
|
||||
Reference in New Issue
Block a user