diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index 5d2f29c9..610805ca 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -3,6 +3,8 @@
 #include "trainer.h"

+#include "extra/stockfish_blas.h"
+
 #include "learn/learn.h"

 #include "nnue/layers/affine_transform.h"

@@ -98,32 +100,46 @@ namespace Eval::NNUE {
             batch_size_ = static_cast<IndexType>(batch.size());
             batch_input_ = previous_layer_trainer_->propagate(thread_pool, batch);
+
 #if defined(USE_BLAS)
+
             for (IndexType b = 0; b < batch_size_; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
-                cblas_scopy(kOutputDimensions, biases_, 1, &output_[batch_offset], 1);
+                cblas_scopy(
+                    kOutputDimensions, biases_, 1, &output_[batch_offset], 1
+                );
             }

-            cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
-                kOutputDimensions, batch_size_, kInputDimensions, 1.0,
-                weights_, kInputDimensions,
-                batch_input_, kInputDimensions,
-                1.0, &output_[0], kOutputDimensions);
+            cblas_sgemm(
+                CblasColMajor, CblasTrans, CblasNoTrans,
+                kOutputDimensions, batch_size_, kInputDimensions,
+                1.0,
+                weights_, kInputDimensions,
+                batch_input_, kInputDimensions,
+                1.0,
+                &output_[0], kOutputDimensions
+            );

 #else

-            for (IndexType b = 0; b < batch_size_; ++b) {
-                const IndexType input_batch_offset = kInputDimensions * b;
-                const IndexType output_batch_offset = kOutputDimensions * b;
-                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                    double sum = biases_[i];
-                    for (IndexType j = 0; j < kInputDimensions; ++j) {
-                        const IndexType index = kInputDimensions * i + j;
-                        sum += weights_[index] * batch_input_[input_batch_offset + j];
-                    }
-                    output_[output_batch_offset + i] = static_cast<LearnFloatType>(sum);
-                }
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                Blas::scopy(
+                    thread_pool,
+                    kOutputDimensions, biases_, 1, &output_[batch_offset], 1
+                );
             }
+            Blas::sgemm(
+                thread_pool,
+                Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans,
+                kOutputDimensions, batch_size_, kInputDimensions,
+                1.0,
+                weights_, kInputDimensions,
+                batch_input_, kInputDimensions,
+                1.0,
+                &output_[0], kOutputDimensions
+            );
+
 #endif

             return output_.data();
         }
@@ -137,67 +153,77 @@ namespace Eval::NNUE {
                 learning_rate * learning_rate_scale_;

 #if defined(USE_BLAS)
-            // backpropagate
-            cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
-                kInputDimensions, batch_size_, kOutputDimensions, 1.0,
-                weights_, kInputDimensions,
-                gradients, kOutputDimensions,
-                0.0, &gradients_[0], kInputDimensions);
+
+            cblas_sgemm(
+                CblasColMajor, CblasNoTrans, CblasNoTrans,
+                kInputDimensions, batch_size_, kOutputDimensions,
+                1.0,
+                weights_, kInputDimensions,
+                gradients, kOutputDimensions,
+                0.0,
+                &gradients_[0], kInputDimensions
+            );

             // update
-            cblas_sscal(kOutputDimensions, momentum_, biases_diff_, 1);
+            cblas_sscal(
+                kOutputDimensions, momentum_, biases_diff_, 1
+            );
+
             for (IndexType b = 0; b < batch_size_; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
-                cblas_saxpy(kOutputDimensions, 1.0,
-                    &gradients[batch_offset], 1, biases_diff_, 1);
+                cblas_saxpy(
+                    kOutputDimensions, 1.0,
+                    &gradients[batch_offset], 1, biases_diff_, 1
+                );
+            }
+
+            cblas_sgemm(
+                CblasRowMajor, CblasTrans, CblasNoTrans,
+                kOutputDimensions, kInputDimensions, batch_size_,
+                1.0,
+                gradients, kOutputDimensions,
+                batch_input_, kInputDimensions,
+                momentum_,
+                weights_diff_, kInputDimensions
+            );
+
+#else
+
+            // backpropagate
+            Blas::sgemm(
+                thread_pool,
+                Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::NoTrans, Blas::MatrixTranspose::NoTrans,
+                kInputDimensions, batch_size_, kOutputDimensions,
+                1.0,
+                weights_, kInputDimensions,
+                gradients, kOutputDimensions,
+                0.0,
+                &gradients_[0], kInputDimensions
+            );
+
+            Blas::sscal(
+                thread_pool,
+                kOutputDimensions, momentum_, biases_diff_, 1
+            );
+
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                Blas::saxpy(thread_pool, kOutputDimensions, 1.0, &gradients[batch_offset], 1, biases_diff_, 1);
             }
-            cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
-                kOutputDimensions, kInputDimensions, batch_size_, 1.0,
-                gradients, kOutputDimensions,
-                batch_input_, kInputDimensions,
-                momentum_, weights_diff_, kInputDimensions);
+            Blas::sgemm(
+                thread_pool,
+                Blas::MatrixLayout::RowMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans,
+                kOutputDimensions, kInputDimensions, batch_size_,
+                1.0,
+                gradients, kOutputDimensions,
+                batch_input_, kInputDimensions,
+                momentum_,
+                weights_diff_, kInputDimensions
+            );

-#else
-            // backpropagate
-            for (IndexType b = 0; b < batch_size_; ++b) {
-                const IndexType input_batch_offset = kInputDimensions * b;
-                const IndexType output_batch_offset = kOutputDimensions * b;
-                for (IndexType j = 0; j < kInputDimensions; ++j) {
-                    double sum = 0.0;
-                    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                        const IndexType index = kInputDimensions * i + j;
-                        sum += weights_[index] * gradients[output_batch_offset + i];
-                    }
-                    gradients_[input_batch_offset + j] = static_cast<LearnFloatType>(sum);
-                }
-            }
-
-            // update
-            for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                biases_diff_[i] *= momentum_;
-            }
-
-            for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-                weights_diff_[i] *= momentum_;
-            }
-
-            for (IndexType b = 0; b < batch_size_; ++b) {
-                const IndexType input_batch_offset = kInputDimensions * b;
-                const IndexType output_batch_offset = kOutputDimensions * b;
-
-                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                    biases_diff_[i] += gradients[output_batch_offset + i];
-                }
-
-                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                    for (IndexType j = 0; j < kInputDimensions; ++j) {
-                        const IndexType index = kInputDimensions * i + j;
-                        weights_diff_[index] += gradients[output_batch_offset + i] *
-                            batch_input_[input_batch_offset + j];
-                    }
-                }
-            }
 #endif

             for (IndexType i = 0; i < kOutputDimensions; ++i) {
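Note: extra/stockfish_blas.h is new and is not itself part of this diff. Judging from the call sites above, it exposes a CBLAS-like interface that additionally takes the ThreadPool as its first argument so the non-BLAS fallback can be parallelized; with beta = 1.0, the propagate sgemm accumulates on top of the biases written by scopy, so each output column ends up as W * x_b + biases. A sketch of the assumed declarations (hypothetical reconstruction from the call sites, not the actual header):

    class ThreadPool;  // Stockfish's thread pool

    namespace Blas {

        enum struct MatrixLayout { RowMajor, ColMajor };
        enum struct MatrixTranspose { NoTrans, Trans };

        // y <- x (strided copy)
        void scopy(ThreadPool& thread_pool, int n,
                   const float* x, int incx, float* y, int incy);

        // x <- alpha * x
        void sscal(ThreadPool& thread_pool, int n, float alpha, float* x, int incx);

        // y <- alpha * x + y
        void saxpy(ThreadPool& thread_pool, int n, float alpha,
                   const float* x, int incx, float* y, int incy);

        // C <- alpha * op(A) * op(B) + beta * C, with CBLAS layout semantics
        void sgemm(ThreadPool& thread_pool,
                   MatrixLayout layout,
                   MatrixTranspose trans_a, MatrixTranspose trans_b,
                   int m, int n, int k,
                   float alpha, const float* a, int lda,
                   const float* b, int ldb,
                   float beta, float* c, int ldc);

    }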
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index a778f956..8be584e8 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -3,6 +3,8 @@
 #include "trainer.h"

+#include "extra/stockfish_blas.h"
+
 #include "features/factorizer_feature_set.h"

 #include "learn/learn.h"

@@ -107,24 +109,36 @@ namespace Eval::NNUE {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType c = 0; c < 2; ++c) {
                     const IndexType output_offset = batch_offset + kHalfDimensions * c;
+
 #if defined(USE_BLAS)
-                    cblas_scopy(kHalfDimensions, biases_, 1, &output_[output_offset], 1);
+
+                    cblas_scopy(
+                        kHalfDimensions, biases_, 1, &output_[output_offset], 1
+                    );
+
                     for (const auto& feature : batch[b].training_features[c]) {
                         const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                        cblas_saxpy(kHalfDimensions, (float)feature.get_count(),
-                            &weights_[weights_offset], 1, &output_[output_offset], 1);
+                        cblas_saxpy(
+                            kHalfDimensions, (float)feature.get_count(),
+                            &weights_[weights_offset], 1, &output_[output_offset], 1
+                        );
                     }
+
 #else
-                    for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                        output_[output_offset + i] = biases_[i];
-                    }
+
+                    Blas::scopy(
+                        thread_pool,
+                        kHalfDimensions, biases_, 1, &output_[output_offset], 1
+                    );

                     for (const auto& feature : batch[b].training_features[c]) {
                         const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                        for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                            output_[output_offset + i] +=
-                                feature.get_count() * weights_[weights_offset + i];
-                        }
+                        Blas::saxpy(
+                            thread_pool,
+                            kHalfDimensions, (float)feature.get_count(),
+                            &weights_[weights_offset], 1, &output_[output_offset], 1
+                        );
                     }
+
 #endif
                 }
             }
@@ -171,19 +185,27 @@ namespace Eval::NNUE {
             // Correct the learning rate and adjust the scale without using momentum
             const LearnFloatType effective_learning_rate =
                 static_cast<LearnFloatType>(local_learning_rate / (1.0 - momentum_));
+
 #if defined(USE_BLAS)
-            cblas_sscal(kHalfDimensions, momentum_, biases_diff_, 1);
+
+            cblas_sscal(
+                kHalfDimensions, momentum_, biases_diff_, 1
+            );

             for (IndexType b = 0; b < batch_->size(); ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType c = 0; c < 2; ++c) {
                     const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                    cblas_saxpy(kHalfDimensions, 1.0,
-                        &gradients_[output_offset], 1, biases_diff_, 1);
+                    cblas_saxpy(
+                        kHalfDimensions, 1.0,
+                        &gradients_[output_offset], 1, biases_diff_, 1
+                    );
                 }
             }

-            cblas_saxpy(kHalfDimensions, -local_learning_rate,
-                biases_diff_, 1, biases_, 1);
+            cblas_saxpy(
+                kHalfDimensions, -local_learning_rate,
+                biases_diff_, 1, biases_, 1
+            );

 #pragma omp parallel
             {
@@ -205,45 +227,67 @@ namespace Eval::NNUE {
                         const auto scale = static_cast<LearnFloatType>(
                             effective_learning_rate / feature.get_count());
-                        cblas_saxpy(kHalfDimensions, -scale,
-                            &gradients_[output_offset], 1,
-                            &weights_[weights_offset], 1);
+                        cblas_saxpy(
+                            kHalfDimensions, -scale,
+                            &gradients_[output_offset], 1,
+                            &weights_[weights_offset], 1
+                        );
                     }
                 }
             }
         }

 #else
-            for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                biases_diff_[i] *= momentum_;
-            }
+
+            Blas::sscal(
+                thread_pool,
+                kHalfDimensions, momentum_, biases_diff_, 1
+            );

             for (IndexType b = 0; b < batch_->size(); ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType c = 0; c < 2; ++c) {
                     const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                    for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                        biases_diff_[i] += gradients_[output_offset + i];
-                    }
+                    Blas::saxpy(
+                        thread_pool,
+                        kHalfDimensions, 1.0,
+                        &gradients_[output_offset], 1, biases_diff_, 1
+                    );
                 }
             }

-            for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                biases_[i] -= local_learning_rate * biases_diff_[i];
-            }
+            Blas::saxpy(
+                thread_pool,
+                kHalfDimensions, -local_learning_rate,
+                biases_diff_, 1, biases_, 1
+            );

-            for (IndexType b = 0; b < batch_->size(); ++b) {
-                const IndexType batch_offset = kOutputDimensions * b;
-                for (IndexType c = 0; c < 2; ++c) {
-                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                    for (const auto& feature : (*batch_)[b].training_features[c]) {
-                        const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                        const auto scale = static_cast<LearnFloatType>(
-                            effective_learning_rate / feature.get_count());
+#pragma omp parallel
+            {
+#if defined(_OPENMP)
+                const IndexType num_threads = omp_get_num_threads();
+                const IndexType thread_index = omp_get_thread_num();
+#endif
+                for (IndexType b = 0; b < batch_->size(); ++b) {
+                    const IndexType batch_offset = kOutputDimensions * b;
+                    for (IndexType c = 0; c < 2; ++c) {
+                        const IndexType output_offset =
+                            batch_offset + kHalfDimensions * c;
+                        for (const auto& feature : (*batch_)[b].training_features[c]) {
+#if defined(_OPENMP)
+                            if (feature.get_index() % num_threads != thread_index)
+                                continue;
+#endif
+                            const IndexType weights_offset =
+                                kHalfDimensions * feature.get_index();
+                            const auto scale = static_cast<LearnFloatType>(
+                                effective_learning_rate / feature.get_count());

-                        for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                            weights_[weights_offset + i] -=
-                                scale * gradients_[output_offset + i];
+                            Blas::saxpy(
+                                thread_pool,
+                                kHalfDimensions, -scale,
+                                &gradients_[output_offset], 1,
+                                &weights_[weights_offset], 1
+                            );
                         }
                     }
                 }
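The rewritten fallback weight update above appears to mirror the OpenMP sharding scheme already used in the USE_BLAS branch: each thread only applies updates whose feature.get_index() % num_threads equals its own thread_index. Since every feature index owns a disjoint kHalfDimensions-wide slice of weights_, no two threads ever write the same element and no locking is needed. A minimal standalone sketch of the idea (names hypothetical, not trainer code):

    #include <cstddef>
    #include <utility>
    #include <vector>
    #if defined(_OPENMP)
    #include <omp.h>
    #endif

    // Apply (row, delta) updates to a row-major matrix. Rows are sharded by
    // row % num_threads, so each row is written by exactly one thread and
    // no synchronization is required.
    void sharded_update(std::vector<float>& weights, std::size_t row_width,
                        const std::vector<std::pair<std::size_t, float>>& updates) {
    #pragma omp parallel
        {
    #if defined(_OPENMP)
            const std::size_t num_threads = omp_get_num_threads();
            const std::size_t thread_index = omp_get_thread_num();
    #else
            const std::size_t num_threads = 1;
            const std::size_t thread_index = 0;
    #endif
            for (const auto& update : updates) {
                if (update.first % num_threads != thread_index)
                    continue;  // this row belongs to another thread
                for (std::size_t i = 0; i < row_width; ++i)
                    weights[update.first * row_width + i] += update.second;
            }
        }
    }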
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index 4bb38104..03e9fec0 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -3,6 +3,8 @@
 #include "trainer.h"

+#include "extra/stockfish_blas.h"
+
 #include "learn/learn.h"

 #include "nnue/layers/input_slice.h"

@@ -208,13 +210,21 @@ namespace Eval::NNUE {
             for (IndexType b = 0; b < batch_size_; ++b) {
                 const IndexType input_offset = kInputDimensions * b;
                 const IndexType output_offset = kOutputDimensions * b;
+
 #if defined(USE_BLAS)
-                cblas_scopy(kOutputDimensions, &input[input_offset + Offset], 1,
-                    &output_[output_offset], 1);
+
+                cblas_scopy(
+                    kOutputDimensions, &input[input_offset + Offset], 1,
+                    &output_[output_offset], 1
+                );
 #else
-                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                    output_[output_offset + i] = input[input_offset + Offset + i];
-                }
+
+                Blas::scopy(
+                    thread_pool,
+                    kOutputDimensions, &input[input_offset + Offset], 1,
+                    &output_[output_offset], 1
+                );
+
 #endif
             }
diff --git a/src/nnue/trainer/trainer_sum.h b/src/nnue/trainer/trainer_sum.h
index 6defb95f..88ff302c 100644
--- a/src/nnue/trainer/trainer_sum.h
+++ b/src/nnue/trainer/trainer_sum.h
@@ -3,6 +3,8 @@
 #include "trainer.h"

+#include "extra/stockfish_blas.h"
+
 #include "learn/learn.h"

 #include "nnue/layers/sum.h"

@@ -53,15 +55,19 @@ namespace Eval::NNUE {
             const auto head_output = previous_layer_trainer_->propagate(thread_pool, batch);

 #if defined(USE_BLAS)
-            cblas_saxpy(kOutputDimensions * batch_size_, 1.0,
-                head_output, 1, output, 1);
+
+            cblas_saxpy(
+                kOutputDimensions * batch_size_, 1.0,
+                head_output, 1, output, 1
+            );
+
 #else
-            for (IndexType b = 0; b < batch_size_; ++b) {
-                const IndexType batch_offset = kOutputDimensions * b;
-                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                    output[batch_offset + i] += head_output[batch_offset + i];
-                }
-            }
+
+            Blas::saxpy(
+                thread_pool,
+                kOutputDimensions * batch_size_, 1.0,
+                head_output, 1, output, 1
+            );
 #endif

             return output;
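A note on the last two files: in trainer_sum.h the whole batch folds into a single saxpy of length kOutputDimensions * batch_size_ because output and head_output are contiguous, whereas trainer_input_slice.h still issues one scopy per batch entry, since each slice starts at input_offset + Offset and the source rows are therefore not contiguous. The removed loops spell out the semantics the Blas calls must reproduce:

    // trainer_input_slice.h: per-entry strided copy (one scopy per b)
    for (IndexType b = 0; b < batch_size_; ++b)
        for (IndexType i = 0; i < kOutputDimensions; ++i)
            output_[kOutputDimensions * b + i] = input[kInputDimensions * b + Offset + i];

    // trainer_sum.h: one contiguous accumulation (a single saxpy)
    for (IndexType i = 0; i < kOutputDimensions * batch_size_; ++i)
        output[i] += head_output[i];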