From e8907bcfc456cd9c5966dbc238018b0bc961eece Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk
Date: Tue, 27 Oct 2020 11:57:45 +0100
Subject: [PATCH] Replace omp in trainer_feature_transformer

---
 .../trainer/trainer_feature_transformer.h | 182 ++++++++----------
 1 file changed, 82 insertions(+), 100 deletions(-)

diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 77edfbde..3062e432 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -19,10 +19,6 @@
 #include
 #include

-#if defined(_OPENMP)
-#include <omp.h>
-#endif
-
 // Specialization for feature transformer of learning class template of NNUE evaluation function
 namespace Eval::NNUE {

@@ -104,44 +100,45 @@ namespace Eval::NNUE {
             batch_ = &batch;

             // affine transform
-#pragma omp parallel for
-            for (IndexType b = 0; b < batch.size(); ++b) {
-                const IndexType batch_offset = kOutputDimensions * b;
-                for (IndexType c = 0; c < 2; ++c) {
-                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
+            thread_pool.for_each_index_with_workers(
+                0, batch.size(),
+                [&](Thread&, int b) {
+                    const IndexType batch_offset = kOutputDimensions * b;
+                    for (IndexType c = 0; c < 2; ++c) {
+                        const IndexType output_offset = batch_offset + kHalfDimensions * c;

 #if defined(USE_BLAS)

-                    cblas_scopy(
-                        kHalfDimensions, biases_, 1, &output_[output_offset], 1
-                    );
-
-                    for (const auto& feature : batch[b].training_features[c]) {
-                        const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                        cblas_saxpy(
-                            kHalfDimensions, (float)feature.get_count(),
-                            &weights_[weights_offset], 1, &output_[output_offset], 1
+                        cblas_scopy(
+                            kHalfDimensions, biases_, 1, &output_[output_offset], 1
                         );
-                    }
+
+                        for (const auto& feature : batch[b].training_features[c]) {
+                            const IndexType weights_offset = kHalfDimensions * feature.get_index();
+                            cblas_saxpy(
+                                kHalfDimensions, (float)feature.get_count(),
+                                &weights_[weights_offset], 1, &output_[output_offset], 1
+                            );
+                        }

 #else

-                    Blas::scopy(
-                        thread_pool,
-                        kHalfDimensions, biases_, 1, &output_[output_offset], 1
-                    );
-                    for (const auto& feature : batch[b].training_features[c]) {
-                        const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                        Blas::saxpy(
-                            thread_pool,
-                            kHalfDimensions, (float)feature.get_count(),
-                            &weights_[weights_offset], 1, &output_[output_offset], 1
+                        Blas::scopy(
+                            kHalfDimensions, biases_, 1, &output_[output_offset], 1
                         );
-                    }
+                        for (const auto& feature : batch[b].training_features[c]) {
+                            const IndexType weights_offset = kHalfDimensions * feature.get_index();
+                            Blas::saxpy(
+                                kHalfDimensions, (float)feature.get_count(),
+                                &weights_[weights_offset], 1, &output_[output_offset], 1
+                            );
+                        }

 #endif
+                    }
                 }
-            }
+            );
+
+            thread_pool.wait_for_workers_finished();

 #if defined (USE_SSE2)
@@ -358,6 +355,7 @@ namespace Eval::NNUE {
             cblas_sscal(
                 kHalfDimensions, momentum_, biases_diff_, 1
             );
+
             for (IndexType b = 0; b < batch_->size(); ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType c = 0; c < 2; ++c) {
@@ -374,36 +372,6 @@ namespace Eval::NNUE {
                 biases_diff_, 1, biases_, 1
             );

-#pragma omp parallel
-            {
-#if defined(_OPENMP)
-                const IndexType num_threads = omp_get_num_threads();
-                const IndexType thread_index = omp_get_thread_num();
-#endif
-                for (IndexType b = 0; b < batch_->size(); ++b) {
-                    const IndexType batch_offset = kOutputDimensions * b;
-                    for (IndexType c = 0; c < 2; ++c) {
-                        const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                        for (const auto& feature : (*batch_)[b].training_features[c]) {
-#if defined(_OPENMP)
-                            if (feature.get_index() % num_threads != thread_index)
-                                continue;
-#endif
-                            const IndexType weights_offset =
-                                kHalfDimensions * feature.get_index();
-                            const auto scale = static_cast<LearnFloatType>(
-                                effective_learning_rate / feature.get_count());
-
-                            cblas_saxpy(
-                                kHalfDimensions, -scale,
-                                &gradients_[output_offset], 1,
-                                &weights_[weights_offset], 1
-                            );
-                        }
-                    }
-                }
-            }
-
 #else

             Blas::sscal(
@@ -429,38 +397,47 @@ namespace Eval::NNUE {
                 biases_diff_, 1, biases_, 1
             );

-#pragma omp parallel
-            {
-#if defined(_OPENMP)
-                const IndexType num_threads = omp_get_num_threads();
-                const IndexType thread_index = omp_get_thread_num();
 #endif

-                for (IndexType b = 0; b < batch_->size(); ++b) {
-                    const IndexType batch_offset = kOutputDimensions * b;
-                    for (IndexType c = 0; c < 2; ++c) {
-                        const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                        for (const auto& feature : (*batch_)[b].training_features[c]) {
-#if defined(_OPENMP)
-                            if (feature.get_index() % num_threads != thread_index)
-                                continue;
-#endif
-                            const IndexType weights_offset =
-                                kHalfDimensions * feature.get_index();
-                            const auto scale = static_cast<LearnFloatType>(
-                                effective_learning_rate / feature.get_count());
-                            Blas::saxpy(
-                                thread_pool,
-                                kHalfDimensions, -scale,
-                                &gradients_[output_offset], 1,
-                                &weights_[weights_offset], 1
-                            );
+            thread_pool.execute_with_workers(
+                [&, num_threads = thread_pool.size()](Thread& th) {
+                    const auto thread_index = th.thread_idx();
+
+                    for (IndexType b = 0; b < batch_->size(); ++b) {
+                        const IndexType batch_offset = kOutputDimensions * b;
+                        for (IndexType c = 0; c < 2; ++c) {
+                            const IndexType output_offset = batch_offset + kHalfDimensions * c;
+                            for (const auto& feature : (*batch_)[b].training_features[c]) {
+                                if (feature.get_index() % num_threads != thread_index)
+                                    continue;
+                                const IndexType weights_offset =
+                                    kHalfDimensions * feature.get_index();
+                                const auto scale = static_cast<LearnFloatType>(
+                                    effective_learning_rate / feature.get_count());
+
+#if defined (USE_BLAS)
+
+                                cblas_saxpy(
+                                    kHalfDimensions, -scale,
+                                    &gradients_[output_offset], 1,
+                                    &weights_[weights_offset], 1
+                                );
+
+#else
+
+                                Blas::saxpy(
+                                    kHalfDimensions, -scale,
+                                    &gradients_[output_offset], 1,
+                                    &weights_[weights_offset], 1
+                                );
+
+#endif
+                            }
                         }
                     }
                 }
-            }
+            );

-#endif
             for (IndexType b = 0; b < batch_->size(); ++b) {
                 for (IndexType c = 0; c < 2; ++c) {
                     for (const auto& feature : (*batch_)[b].training_features[c]) {
@@ -468,6 +445,8 @@ namespace Eval::NNUE {
                     }
                 }
             }
+
+            thread_pool.wait_for_workers_finished();
         }

     private:
@@ -493,22 +472,25 @@ namespace Eval::NNUE {

             std::vector<TrainingFeature> training_features;

-#pragma omp parallel for private(training_features)
-            for (IndexType j = 0; j < RawFeatures::kDimensions; ++j) {
-                training_features.clear();
-                Features::Factorizer<RawFeatures>::append_training_features(
-                    j, &training_features);
+            Threads.for_each_index_with_workers(
+                0, RawFeatures::kDimensions,
+                [this, training_features](Thread&, int j) mutable {
+                    training_features.clear();
+                    Features::Factorizer<RawFeatures>::append_training_features(
+                        j, &training_features);

-                for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                    double sum = 0.0;
-                    for (const auto& feature : training_features) {
-                        sum += weights_[kHalfDimensions * feature.get_index() + i];
+                    for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                        double sum = 0.0;
+                        for (const auto& feature : training_features) {
+                            sum += weights_[kHalfDimensions * feature.get_index() + i];
+                        }
+
+                        target_layer_->weights_[kHalfDimensions * j + i] =
+                            round(sum * kWeightScale);
                     }
-
-                    target_layer_->weights_[kHalfDimensions * j + i] =
-                        round(sum * kWeightScale);
                 }
-            }
+            );
+            Threads.wait_for_workers_finished();
         }

         void reset_stats() {
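
For readers unfamiliar with the work-splitting scheme this patch switches to, here is a minimal standalone sketch of the same idea. It deliberately uses plain std::thread instead of the project's ThreadPool/Thread classes, and every name in it (kHalfDimensions value, active_features, gradient, scale) is a made-up toy stand-in, not the trainer's real data: each worker scans all active features but only applies the updates whose feature index maps to it, mirroring the `feature.get_index() % num_threads != thread_index` filter above, so two workers never write the same weight row and no locking is needed.

    // Standalone sketch of the index-modulo partitioning used in the patch.
    // Assumptions: plain std::thread instead of the project's ThreadPool,
    // toy dimensions, and a hypothetical dense gradient row.
    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <thread>
    #include <vector>

    int main() {
        constexpr std::size_t kHalfDimensions = 8;   // toy size, not the real net
        constexpr std::size_t kNumFeatures    = 32;
        std::vector<float> weights(kHalfDimensions * kNumFeatures, 0.0f);

        // Indices play the role of feature.get_index() in the patch.
        const std::vector<std::size_t> active_features = {3, 7, 7, 12, 25};
        const std::vector<float> gradient(kHalfDimensions, 1.0f);
        const float scale = 0.01f;

        const unsigned num_threads = std::max(1u, std::thread::hardware_concurrency());
        std::vector<std::thread> workers;
        for (unsigned thread_index = 0; thread_index < num_threads; ++thread_index) {
            workers.emplace_back([&, thread_index] {
                for (std::size_t feature_index : active_features) {
                    // Same filter as in the patch: each weight row has exactly
                    // one owner thread, so concurrent writes never overlap.
                    if (feature_index % num_threads != thread_index)
                        continue;
                    float* row = &weights[kHalfDimensions * feature_index];
                    for (std::size_t i = 0; i < kHalfDimensions; ++i)
                        row[i] -= scale * gradient[i];   // the saxpy step
                }
            });
        }
        for (auto& w : workers)
            w.join();   // stands in for thread_pool.wait_for_workers_finished()

        std::cout << "weights[3 * kHalfDimensions] = " << weights[3 * kHalfDimensions] << '\n';
    }

Note the duplicated index 7 in the toy feature list: because ownership is decided by the index itself, both occurrences land on the same worker, which is exactly why the scheme stays race-free without any synchronization on the weight rows.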