From e8907bcfc456cd9c5966dbc238018b0bc961eece Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk
Date: Tue, 27 Oct 2020 11:57:45 +0100
Subject: [PATCH] Replace omp in trainer_feature_transformer

---
 .../trainer/trainer_feature_transformer.h | 182 ++++++++----------
 1 file changed, 82 insertions(+), 100 deletions(-)

diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 77edfbde..3062e432 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -19,10 +19,6 @@
 #include
 #include

-#if defined(_OPENMP)
-#include <omp.h>
-#endif
-
 // Specialization for feature transformer of learning class template of NNUE evaluation function
 namespace Eval::NNUE {

@@ -104,44 +100,45 @@ namespace Eval::NNUE {
             batch_ = &batch;

             // affine transform
-#pragma omp parallel for
-            for (IndexType b = 0; b < batch.size(); ++b) {
-                const IndexType batch_offset = kOutputDimensions * b;
-                for (IndexType c = 0; c < 2; ++c) {
-                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
+            thread_pool.for_each_index_with_workers(
+                0, batch.size(),
+                [&](Thread&, int b) {
+                    const IndexType batch_offset = kOutputDimensions * b;
+                    for (IndexType c = 0; c < 2; ++c) {
+                        const IndexType output_offset = batch_offset + kHalfDimensions * c;

 #if defined(USE_BLAS)

-                    cblas_scopy(
-                        kHalfDimensions, biases_, 1, &output_[output_offset], 1
-                    );
-
-                    for (const auto& feature : batch[b].training_features[c]) {
-                        const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                        cblas_saxpy(
-                            kHalfDimensions, (float)feature.get_count(),
-                            &weights_[weights_offset], 1, &output_[output_offset], 1
+                        cblas_scopy(
+                            kHalfDimensions, biases_, 1, &output_[output_offset], 1
                         );
-                    }
+
+                        for (const auto& feature : batch[b].training_features[c]) {
+                            const IndexType weights_offset = kHalfDimensions * feature.get_index();
+                            cblas_saxpy(
+                                kHalfDimensions, (float)feature.get_count(),
+                                &weights_[weights_offset], 1, &output_[output_offset], 1
+                            );
+                        }

 #else

-                    Blas::scopy(
-                        thread_pool,
-                        kHalfDimensions, biases_, 1, &output_[output_offset], 1
-                    );
-                    for (const auto& feature : batch[b].training_features[c]) {
-                        const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                        Blas::saxpy(
-                            thread_pool,
-                            kHalfDimensions, (float)feature.get_count(),
-                            &weights_[weights_offset], 1, &output_[output_offset], 1
+                        Blas::scopy(
+                            kHalfDimensions, biases_, 1, &output_[output_offset], 1
                         );
-                    }
+                        for (const auto& feature : batch[b].training_features[c]) {
+                            const IndexType weights_offset = kHalfDimensions * feature.get_index();
+                            Blas::saxpy(
+                                kHalfDimensions, (float)feature.get_count(),
+                                &weights_[weights_offset], 1, &output_[output_offset], 1
+                            );
+                        }

 #endif
+                    }
                 }
-            }
+            );
+
+            thread_pool.wait_for_workers_finished();

 #if defined (USE_SSE2)
@@ -358,6 +355,7 @@ namespace Eval::NNUE {
             cblas_sscal(
                 kHalfDimensions, momentum_, biases_diff_, 1
             );
+
             for (IndexType b = 0; b < batch_->size(); ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType c = 0; c < 2; ++c) {
@@ -374,36 +372,6 @@ namespace Eval::NNUE {
                 biases_diff_, 1, biases_, 1
             );

-#pragma omp parallel
-            {
-#if defined(_OPENMP)
-                const IndexType num_threads = omp_get_num_threads();
-                const IndexType thread_index = omp_get_thread_num();
-#endif
-                for (IndexType b = 0; b < batch_->size(); ++b) {
-                    const IndexType batch_offset = kOutputDimensions * b;
-                    for (IndexType c = 0; c < 2; ++c) {
-                        const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                        for (const auto& feature : (*batch_)[b].training_features[c]) {
-#if defined(_OPENMP)
-                            if (feature.get_index() % num_threads != thread_index)
-                                continue;
-#endif
-                            const IndexType weights_offset =
-                                kHalfDimensions * feature.get_index();
-                            const auto scale = static_cast<LearnFloatType>(
-                                effective_learning_rate / feature.get_count());
-
-                            cblas_saxpy(
-                                kHalfDimensions, -scale,
-                                &gradients_[output_offset], 1,
-                                &weights_[weights_offset], 1
-                            );
-                        }
-                    }
-                }
-            }
-
 #else

             Blas::sscal(
@@ -429,38 +397,47 @@ namespace Eval::NNUE {
                 biases_diff_, 1, biases_, 1
             );

-#pragma omp parallel
-            {
-#if defined(_OPENMP)
-                const IndexType num_threads = omp_get_num_threads();
-                const IndexType thread_index = omp_get_thread_num();
 #endif

-                for (IndexType b = 0; b < batch_->size(); ++b) {
-                    const IndexType batch_offset = kOutputDimensions * b;
-                    for (IndexType c = 0; c < 2; ++c) {
-                        const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                        for (const auto& feature : (*batch_)[b].training_features[c]) {
-#if defined(_OPENMP)
-                            if (feature.get_index() % num_threads != thread_index)
-                                continue;
-#endif
-                            const IndexType weights_offset =
-                                kHalfDimensions * feature.get_index();
-                            const auto scale = static_cast<LearnFloatType>(
-                                effective_learning_rate / feature.get_count());
-                            Blas::saxpy(
-                                thread_pool,
-                                kHalfDimensions, -scale,
-                                &gradients_[output_offset], 1,
-                                &weights_[weights_offset], 1
-                            );
+            thread_pool.execute_with_workers(
+                [&, num_threads = thread_pool.size()](Thread& th) {
+                    const auto thread_index = th.thread_idx();
+
+                    for (IndexType b = 0; b < batch_->size(); ++b) {
+                        const IndexType batch_offset = kOutputDimensions * b;
+                        for (IndexType c = 0; c < 2; ++c) {
+                            const IndexType output_offset = batch_offset + kHalfDimensions * c;
+                            for (const auto& feature : (*batch_)[b].training_features[c]) {
+                                if (feature.get_index() % num_threads != thread_index)
+                                    continue;
+                                const IndexType weights_offset =
+                                    kHalfDimensions * feature.get_index();
+                                const auto scale = static_cast<LearnFloatType>(
+                                    effective_learning_rate / feature.get_count());
+
+#if defined (USE_BLAS)
+
+                                cblas_saxpy(
+                                    kHalfDimensions, -scale,
+                                    &gradients_[output_offset], 1,
+                                    &weights_[weights_offset], 1
+                                );
+
+#else
+
+                                Blas::saxpy(
+                                    kHalfDimensions, -scale,
+                                    &gradients_[output_offset], 1,
+                                    &weights_[weights_offset], 1
+                                );
+
+#endif
+                            }
                         }
                     }
                 }
-            }
+            );

-#endif
             for (IndexType b = 0; b < batch_->size(); ++b) {
                 for (IndexType c = 0; c < 2; ++c) {
                     for (const auto& feature : (*batch_)[b].training_features[c]) {
@@ -468,6 +445,8 @@ namespace Eval::NNUE {
                     }
                 }
             }
+
+            thread_pool.wait_for_workers_finished();
         }

     private:
@@ -493,22 +472,25 @@ namespace Eval::NNUE {

             std::vector<TrainingFeature> training_features;

-#pragma omp parallel for private(training_features)
-            for (IndexType j = 0; j < RawFeatures::kDimensions; ++j) {
-                training_features.clear();
-                Features::Factorizer<RawFeatures>::append_training_features(
-                    j, &training_features);
+            Threads.for_each_index_with_workers(
+                0, RawFeatures::kDimensions,
+                [this, training_features](Thread&, int j) mutable {
+                    training_features.clear();
+                    Features::Factorizer<RawFeatures>::append_training_features(
+                        j, &training_features);

-                for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                    double sum = 0.0;
-                    for (const auto& feature : training_features) {
-                        sum += weights_[kHalfDimensions * feature.get_index() + i];
+                    for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                        double sum = 0.0;
+                        for (const auto& feature : training_features) {
+                            sum += weights_[kHalfDimensions * feature.get_index() + i];
+                        }
+
+                        target_layer_->weights_[kHalfDimensions * j + i] =
+                            round(sum * kWeightScale);
                     }
-
-                    target_layer_->weights_[kHalfDimensions * j + i] =
-                        round(sum * kWeightScale);
                 }
-            }
+            );
+            Threads.wait_for_workers_finished();
         }

         void reset_stats() {
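
For readers unfamiliar with the work-splitting scheme this patch switches to, here is a minimal standalone sketch of the same idea. It deliberately uses plain std::thread instead of the project's ThreadPool/Thread classes, and every name in it (kHalfDimensions value, active_features, gradient, scale) is a made-up toy stand-in, not the trainer's real data: each worker scans all active features but only applies the updates whose feature index maps to it, mirroring the `feature.get_index() % num_threads != thread_index` filter above, so two workers never write the same weight row and no locking is needed.

    // Standalone sketch of the index-modulo partitioning used in the patch.
    // Assumptions: plain std::thread instead of the project's ThreadPool,
    // toy dimensions, and a hypothetical dense gradient row.
    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <thread>
    #include <vector>

    int main() {
        constexpr std::size_t kHalfDimensions = 8;   // toy size, not the real net
        constexpr std::size_t kNumFeatures    = 32;
        std::vector<float> weights(kHalfDimensions * kNumFeatures, 0.0f);

        // Indices play the role of feature.get_index() in the patch.
        const std::vector<std::size_t> active_features = {3, 7, 7, 12, 25};
        const std::vector<float> gradient(kHalfDimensions, 1.0f);
        const float scale = 0.01f;

        const unsigned num_threads = std::max(1u, std::thread::hardware_concurrency());
        std::vector<std::thread> workers;
        for (unsigned thread_index = 0; thread_index < num_threads; ++thread_index) {
            workers.emplace_back([&, thread_index] {
                for (std::size_t feature_index : active_features) {
                    // Same filter as in the patch: each weight row has exactly
                    // one owner thread, so concurrent writes never overlap.
                    if (feature_index % num_threads != thread_index)
                        continue;
                    float* row = &weights[kHalfDimensions * feature_index];
                    for (std::size_t i = 0; i < kHalfDimensions; ++i)
                        row[i] -= scale * gradient[i];   // the saxpy step
                }
            });
        }
        for (auto& w : workers)
            w.join();   // stands in for thread_pool.wait_for_workers_finished()

        std::cout << "weights[3 * kHalfDimensions] = " << weights[3 * kHalfDimensions] << '\n';
    }

Note the duplicated index 7 in the toy feature list: because ownership is decided by the index itself, both occurrences land on the same worker, which is exactly why the scheme stays race-free without any synchronization on the weight rows.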