Replace omp in trainer_feature_transformer

Tomasz Sobczyk
2020-10-27 11:57:45 +01:00
committed by nodchip
parent db1b33d4ac
commit e8907bcfc4


@@ -19,10 +19,6 @@
 #include <random>
 #include <set>
 
-#if defined(_OPENMP)
-#include <omp.h>
-#endif
-
 // Specialization for feature transformer of learning class template of NNUE evaluation function
 
 namespace Eval::NNUE {
@@ -104,44 +100,45 @@ namespace Eval::NNUE {
     batch_ = &batch;
 
     // affine transform
-#pragma omp parallel for
-    for (IndexType b = 0; b < batch.size(); ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType c = 0; c < 2; ++c) {
-        const IndexType output_offset = batch_offset + kHalfDimensions * c;
+    thread_pool.for_each_index_with_workers(
+        0, batch.size(),
+        [&](Thread&, int b) {
+          const IndexType batch_offset = kOutputDimensions * b;
+          for (IndexType c = 0; c < 2; ++c) {
+            const IndexType output_offset = batch_offset + kHalfDimensions * c;
 #if defined(USE_BLAS)
-        cblas_scopy(
-            kHalfDimensions, biases_, 1, &output_[output_offset], 1
-        );
-        for (const auto& feature : batch[b].training_features[c]) {
-          const IndexType weights_offset = kHalfDimensions * feature.get_index();
-          cblas_saxpy(
-              kHalfDimensions, (float)feature.get_count(),
-              &weights_[weights_offset], 1, &output_[output_offset], 1
-          );
-        }
+            cblas_scopy(
+                kHalfDimensions, biases_, 1, &output_[output_offset], 1
+            );
+            for (const auto& feature : batch[b].training_features[c]) {
+              const IndexType weights_offset = kHalfDimensions * feature.get_index();
+              cblas_saxpy(
+                  kHalfDimensions, (float)feature.get_count(),
+                  &weights_[weights_offset], 1, &output_[output_offset], 1
+              );
+            }
 #else
-        Blas::scopy(
-            thread_pool,
-            kHalfDimensions, biases_, 1, &output_[output_offset], 1
-        );
-        for (const auto& feature : batch[b].training_features[c]) {
-          const IndexType weights_offset = kHalfDimensions * feature.get_index();
-          Blas::saxpy(
-              thread_pool,
-              kHalfDimensions, (float)feature.get_count(),
-              &weights_[weights_offset], 1, &output_[output_offset], 1
-          );
-        }
+            Blas::scopy(
+                kHalfDimensions, biases_, 1, &output_[output_offset], 1
+            );
+            for (const auto& feature : batch[b].training_features[c]) {
+              const IndexType weights_offset = kHalfDimensions * feature.get_index();
+              Blas::saxpy(
+                  kHalfDimensions, (float)feature.get_count(),
+                  &weights_[weights_offset], 1, &output_[output_offset], 1
+              );
+            }
 #endif
-      }
-    }
+          }
+        }
+    );
+    thread_pool.wait_for_workers_finished();
 
 #if defined (USE_SSE2)
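
Note on the new API: `for_each_index_with_workers` hands each index in [0, batch.size()) to one of the trainer's worker threads, and `wait_for_workers_finished` joins them, replacing what `#pragma omp parallel for` did implicitly. The ThreadPool itself is not part of this diff, so the following is only a minimal sketch of the pattern built on `std::thread`; the `Thread` struct, the blocking join inside the helper, and the round-robin index split are assumptions, not this repository's implementation.

    #include <functional>
    #include <thread>
    #include <vector>

    struct Thread { int idx; };  // stand-in for the trainer's Thread type

    // Minimal model of the for_each_index_with_workers pattern: a fixed set
    // of workers, each taking every num_workers-th index (round-robin split).
    // The real ThreadPool in this repository may schedule differently.
    void for_each_index_with_workers(int num_workers, int begin, int end,
                                     const std::function<void(Thread&, int)>& body) {
        std::vector<std::thread> workers;
        for (int w = 0; w < num_workers; ++w)
            workers.emplace_back([&, w] {
                Thread th{w};
                for (int i = begin + w; i < end; i += num_workers)
                    body(th, i);
            });
        for (auto& t : workers)
            t.join();  // plays the role of wait_for_workers_finished()
    }
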
@@ -358,6 +355,7 @@
     cblas_sscal(
         kHalfDimensions, momentum_, biases_diff_, 1
     );
+
     for (IndexType b = 0; b < batch_->size(); ++b) {
       const IndexType batch_offset = kOutputDimensions * b;
       for (IndexType c = 0; c < 2; ++c) {
@@ -374,36 +372,6 @@
         biases_diff_, 1, biases_, 1
     );
-
-#pragma omp parallel
-    {
-#if defined(_OPENMP)
-      const IndexType num_threads = omp_get_num_threads();
-      const IndexType thread_index = omp_get_thread_num();
-#endif
-      for (IndexType b = 0; b < batch_->size(); ++b) {
-        const IndexType batch_offset = kOutputDimensions * b;
-        for (IndexType c = 0; c < 2; ++c) {
-          const IndexType output_offset = batch_offset + kHalfDimensions * c;
-          for (const auto& feature : (*batch_)[b].training_features[c]) {
-#if defined(_OPENMP)
-            if (feature.get_index() % num_threads != thread_index)
-              continue;
-#endif
-            const IndexType weights_offset =
-                kHalfDimensions * feature.get_index();
-            const auto scale = static_cast<LearnFloatType>(
-                effective_learning_rate / feature.get_count());
-            cblas_saxpy(
-                kHalfDimensions, -scale,
-                &gradients_[output_offset], 1,
-                &weights_[weights_offset], 1
-            );
-          }
-        }
-      }
-    }
 #else
     Blas::sscal(
@@ -429,38 +397,47 @@
         biases_diff_, 1, biases_, 1
     );
-
-#pragma omp parallel
-    {
-#if defined(_OPENMP)
-      const IndexType num_threads = omp_get_num_threads();
-      const IndexType thread_index = omp_get_thread_num();
-#endif
-      for (IndexType b = 0; b < batch_->size(); ++b) {
-        const IndexType batch_offset = kOutputDimensions * b;
-        for (IndexType c = 0; c < 2; ++c) {
-          const IndexType output_offset = batch_offset + kHalfDimensions * c;
-          for (const auto& feature : (*batch_)[b].training_features[c]) {
-#if defined(_OPENMP)
-            if (feature.get_index() % num_threads != thread_index)
-              continue;
-#endif
-            const IndexType weights_offset =
-                kHalfDimensions * feature.get_index();
-            const auto scale = static_cast<LearnFloatType>(
-                effective_learning_rate / feature.get_count());
-            Blas::saxpy(
-                thread_pool,
-                kHalfDimensions, -scale,
-                &gradients_[output_offset], 1,
-                &weights_[weights_offset], 1
-            );
-          }
-        }
-      }
-    }
 #endif
+
+    thread_pool.execute_with_workers(
+        [&, num_threads = thread_pool.size()](Thread& th) {
+          const auto thread_index = th.thread_idx();
+          for (IndexType b = 0; b < batch_->size(); ++b) {
+            const IndexType batch_offset = kOutputDimensions * b;
+            for (IndexType c = 0; c < 2; ++c) {
+              const IndexType output_offset = batch_offset + kHalfDimensions * c;
+              for (const auto& feature : (*batch_)[b].training_features[c]) {
+                if (feature.get_index() % num_threads != thread_index)
+                  continue;
+                const IndexType weights_offset =
+                    kHalfDimensions * feature.get_index();
+                const auto scale = static_cast<LearnFloatType>(
+                    effective_learning_rate / feature.get_count());
+#if defined (USE_BLAS)
+                cblas_saxpy(
+                    kHalfDimensions, -scale,
+                    &gradients_[output_offset], 1,
+                    &weights_[weights_offset], 1
+                );
+#else
+                Blas::saxpy(
+                    kHalfDimensions, -scale,
+                    &gradients_[output_offset], 1,
+                    &weights_[weights_offset], 1
+                );
+#endif
+              }
+            }
+          }
+        }
+    );
+
     for (IndexType b = 0; b < batch_->size(); ++b) {
       for (IndexType c = 0; c < 2; ++c) {
         for (const auto& feature : (*batch_)[b].training_features[c]) {
@@ -468,6 +445,8 @@ namespace Eval::NNUE {
         }
       }
     }
+
+    thread_pool.wait_for_workers_finished();
   }
 
 private:
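
The `feature.get_index() % num_threads != thread_index` filter preserves the old OpenMP trick for lock-free updates: the weight row at offset `kHalfDimensions * feature.get_index()` is written only by the worker whose index matches the feature index modulo the worker count, so no two threads ever update the same row and no locking is needed. A minimal sketch of that ownership rule, with illustrative names and a simplified flat layout rather than the repository's code:

    // Every row belongs to exactly one worker, so concurrent calls with
    // distinct thread_index values never touch the same memory.
    void update_owned_rows(float* weights, const float* gradient,
                           int num_rows, int row_len, float scale,
                           int num_threads, int thread_index) {
        for (int row = 0; row < num_rows; ++row) {
            if (row % num_threads != thread_index)
                continue;  // another worker owns this row; skip it
            for (int i = 0; i < row_len; ++i)
                weights[row * row_len + i] -= scale * gradient[i];
        }
    }
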
@@ -493,22 +472,25 @@
     std::vector<TrainingFeature> training_features;
-#pragma omp parallel for private(training_features)
-    for (IndexType j = 0; j < RawFeatures::kDimensions; ++j) {
-      training_features.clear();
-      Features::Factorizer<RawFeatures>::append_training_features(
-          j, &training_features);
-      for (IndexType i = 0; i < kHalfDimensions; ++i) {
-        double sum = 0.0;
-        for (const auto& feature : training_features) {
-          sum += weights_[kHalfDimensions * feature.get_index() + i];
-        }
-        target_layer_->weights_[kHalfDimensions * j + i] =
-            round<typename LayerType::WeightType>(sum * kWeightScale);
-      }
-    }
+    Threads.for_each_index_with_workers(
+        0, RawFeatures::kDimensions,
+        [this, training_features](Thread&, int j) mutable {
+          training_features.clear();
+          Features::Factorizer<RawFeatures>::append_training_features(
+              j, &training_features);
+          for (IndexType i = 0; i < kHalfDimensions; ++i) {
+            double sum = 0.0;
+            for (const auto& feature : training_features) {
+              sum += weights_[kHalfDimensions * feature.get_index() + i];
+            }
+            target_layer_->weights_[kHalfDimensions * j + i] =
+                round<typename LayerType::WeightType>(sum * kWeightScale);
+          }
+        }
+    );
+    Threads.wait_for_workers_finished();
   }
 
   void reset_stats() {
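
In this last hunk, capturing `training_features` by value in a `mutable` lambda stands in for OpenMP's `private(training_features)` clause: the callable owns its own scratch vector instead of sharing the enclosing one. Whether each worker thread receives a distinct copy depends on how the pool distributes the callable, so this is a sketch under that assumption, with illustrative names:

    #include <vector>

    // A mutable lambda that captures a scratch vector by value owns a private
    // copy, much like OpenMP's private() gave each thread its own instance.
    auto make_task(std::vector<int> scratch) {
        return [scratch](int j) mutable {
            scratch.clear();       // clears this task's own copy only
            scratch.push_back(j);  // fill and use the private buffer
        };
    }

    // Usage: each task object built here carries an independent buffer.
    // auto t1 = make_task({}); auto t2 = make_task({});
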