From 3101ae7973b94f6eea176bb302813210eb3feeb3 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk
Date: Sun, 18 Apr 2021 19:04:14 +0200
Subject: [PATCH] remove learn

---
 src/Makefile | 4 -
 src/extra/stockfish_blas.cpp | 1291 ---------------
 src/extra/stockfish_blas.h | 140 --
 src/learn/autograd.h | 667 --------
 src/learn/gensfen.cpp | 5 +-
 src/learn/gensfen_nonpv.cpp | 1 -
 src/learn/half_float.h | 133 --
 src/learn/learn.cpp | 1474 -----------------
 src/learn/learn.h | 148 --
 src/nnue/evaluate_nnue_learner.cpp | 341 ----
 src/nnue/evaluate_nnue_learner.h | 52 -
 src/nnue/nnue_test_command.cpp | 215 ---
 src/nnue/nnue_test_command.h | 12 -
 src/nnue/trainer/features/all_factorizers.h | 10 -
 src/nnue/trainer/features/factorizer.h | 117 --
 .../trainer/features/factorizer_feature_set.h | 121 --
 .../trainer/features/factorizer_half_ka.h | 93 --
 .../trainer/features/factorizer_half_kp.h | 104 --
 src/nnue/trainer/trainer.h | 122 --
 src/nnue/trainer/trainer_affine_transform.h | 476 ------
 src/nnue/trainer/trainer_clipped_relu.h | 354 ----
 .../trainer/trainer_feature_transformer.h | 783 ---------
 src/nnue/trainer/trainer_input_slice.h | 383 -----
 src/nnue/trainer/trainer_sum.h | 201 ---
 src/uci.cpp | 25 -
 25 files changed, 2 insertions(+), 7270 deletions(-)
 delete mode 100644 src/extra/stockfish_blas.cpp
 delete mode 100644 src/extra/stockfish_blas.h
 delete mode 100644 src/learn/autograd.h
 delete mode 100644 src/learn/half_float.h
 delete mode 100644 src/learn/learn.cpp
 delete mode 100644 src/learn/learn.h
 delete mode 100644 src/nnue/evaluate_nnue_learner.cpp
 delete mode 100644 src/nnue/evaluate_nnue_learner.h
 delete mode 100644 src/nnue/nnue_test_command.cpp
 delete mode 100644 src/nnue/nnue_test_command.h
 delete mode 100644 src/nnue/trainer/features/all_factorizers.h
 delete mode 100644 src/nnue/trainer/features/factorizer.h
 delete mode 100644 src/nnue/trainer/features/factorizer_feature_set.h
 delete mode 100644 src/nnue/trainer/features/factorizer_half_ka.h
 delete mode 100644 src/nnue/trainer/features/factorizer_half_kp.h
 delete mode 100644 src/nnue/trainer/trainer.h
 delete mode 100644 src/nnue/trainer/trainer_affine_transform.h
 delete mode 100644 src/nnue/trainer/trainer_clipped_relu.h
 delete mode 100644 src/nnue/trainer/trainer_feature_transformer.h
 delete mode 100644 src/nnue/trainer/trainer_input_slice.h
 delete mode 100644 src/nnue/trainer/trainer_sum.h

diff --git a/src/Makefile b/src/Makefile
index a4ced5f0..19927ce5 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -47,9 +47,7 @@ PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 1000 sfen_format bin output_file_name
 SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \
	material.cpp misc.cpp movegen.cpp movepick.cpp pawns.cpp position.cpp psqt.cpp \
	search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \
-	extra/stockfish_blas.cpp \
	nnue/evaluate_nnue.cpp \
-	nnue/evaluate_nnue_learner.cpp \
	nnue/features/half_kp.cpp \
	nnue/features/half_ka.cpp \
	nnue/features/half_relative_kp.cpp \
@@ -59,9 +57,7 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
	nnue/features/a.cpp \
	nnue/features/castling_right.cpp \
	nnue/features/enpassant.cpp \
-	nnue/nnue_test_command.cpp \
	learn/sfen_packer.cpp \
-	learn/learn.cpp \
	learn/gensfen.cpp \
	learn/gensfen_nonpv.cpp \
	learn/opening_book.cpp \
diff --git a/src/extra/stockfish_blas.cpp b/src/extra/stockfish_blas.cpp
deleted file mode 100644
index 70b258bc..00000000
--- a/src/extra/stockfish_blas.cpp
+++ /dev/null
@@ -1,1291 +0,0 @@
-#include "stockfish_blas.h" - -#include "thread.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(USE_SSE2) -#include -#endif - -#if defined (USE_SSE3) -#include -#endif - -#if defined(USE_BLAS) -#include -#endif - -namespace Blas { - void scopy( - const int N, - const float * SF_BLAS_RESTRICT X, - float * SF_BLAS_RESTRICT Y - ) - { - std::memcpy(Y, X, sizeof(float) * N); - } - - void scopy( - const int N, - const float * SF_BLAS_RESTRICT X, const int incX, - float * SF_BLAS_RESTRICT Y, const int incY - ) - { - if (incX == 1 && incY == 1) - { - scopy(N, X, Y); - } - else - { - for(int i = 0; i < N; ++i) - { - *Y = *X; - X += incX; - Y += incY; - } - } - } - - void scopy( - ThreadPool&, - const int N, - const float * SF_BLAS_RESTRICT X, - float * SF_BLAS_RESTRICT Y - ) - { - scopy(N, X, Y); - } - - void scopy( - ThreadPool&, - const int N, - const float * SF_BLAS_RESTRICT X, const int incX, - float * SF_BLAS_RESTRICT Y, const int incY - ) - { - scopy(N, X, incX, Y, incY); - } - - void sscal( - const int N, - const float alpha, - float * SF_BLAS_RESTRICT X - ) - { -#if defined (USE_SSE2) - - const __m128 alpha4 = _mm_set1_ps(alpha); - - int i = 0; - for(; i < N - 31; i += 32) - { - __m128 x0 = _mm_loadu_ps(X + i + 0); - __m128 x1 = _mm_loadu_ps(X + i + 4); - __m128 x2 = _mm_loadu_ps(X + i + 8); - __m128 x3 = _mm_loadu_ps(X + i + 12); - __m128 x4 = _mm_loadu_ps(X + i + 16); - __m128 x5 = _mm_loadu_ps(X + i + 20); - __m128 x6 = _mm_loadu_ps(X + i + 24); - __m128 x7 = _mm_loadu_ps(X + i + 28); - - x0 = _mm_mul_ps(x0, alpha4); - x1 = _mm_mul_ps(x1, alpha4); - x2 = _mm_mul_ps(x2, alpha4); - x3 = _mm_mul_ps(x3, alpha4); - x4 = _mm_mul_ps(x4, alpha4); - x5 = _mm_mul_ps(x5, alpha4); - x6 = _mm_mul_ps(x6, alpha4); - x7 = _mm_mul_ps(x7, alpha4); - - _mm_storeu_ps(X + i + 0, x0); - _mm_storeu_ps(X + i + 4, x1); - _mm_storeu_ps(X + i + 8, x2); - _mm_storeu_ps(X + i + 12, x3); - _mm_storeu_ps(X + i + 16, x4); - _mm_storeu_ps(X + i + 20, x5); - _mm_storeu_ps(X + i + 24, x6); - _mm_storeu_ps(X + i + 28, x7); - } - - for(; i < N; ++i) - { - X[i] *= alpha; - } - -#else - - for(int i = 0; i < N; ++i) - { - X[i] *= alpha; - } - -#endif - } - - void sscal( - const int N, - const float alpha, - float * SF_BLAS_RESTRICT X, const int incX - ) - { - if (incX == 1) - { - sscal(N, alpha, X); - } - else - { - for(int i = 0; i < N; ++i) - { - *X *= alpha; - X += incX; - } - } - } - - void sscal( - ThreadPool&, - const int N, - const float alpha, - float * SF_BLAS_RESTRICT X - ) - { - sscal(N, alpha, X); - } - - void sscal( - ThreadPool&, - const int N, - const float alpha, - float *X, const int incX - ) - { - sscal(N, alpha, X, incX); - } - - void saxpy( - const int N, - const float alpha, - const float * SF_BLAS_RESTRICT X, - float * SF_BLAS_RESTRICT Y - ) - { - if (alpha == 1.0f) - { - for (int i = 0; i < N; ++i) - { - Y[i] += X[i]; - } - } - else - { - for (int i = 0; i < N; ++i) - { - Y[i] += X[i] * alpha; - } - } - - } - - void saxpy( - const int N, - const float alpha, - const float * SF_BLAS_RESTRICT X, const int incX, - float * SF_BLAS_RESTRICT Y, const int incY - ) - { - if (incX == 1 && incY == 1) - { - saxpy(N, alpha, X, Y); - } - else - { - for(int i = 0; i < N; ++i) - { - *Y += *X * alpha; - Y += incY; - X += incX; - } - } - } - - void saxpy( - ThreadPool&, - const int N, - const float alpha, - const float * SF_BLAS_RESTRICT X, - float * SF_BLAS_RESTRICT Y - ) - { - saxpy(N, alpha, X, Y); - } - - void saxpy( - ThreadPool&, - const int N, - const float 
alpha, - const float * SF_BLAS_RESTRICT X, const int incX, - float * SF_BLAS_RESTRICT Y, const int incY - ) - { - saxpy(N, alpha, X, incX, Y, incY); - } - -#if defined (USE_SSE3) - inline __m128 m128_hadd_ps(__m128 a, __m128 b, __m128 c, __m128 d) - { - const __m128 t0 = _mm_hadd_ps(a, b); - const __m128 t1 = _mm_hadd_ps(c, d); - return _mm_hadd_ps(t0, t1); - } -#endif - -#if defined (USE_SSE2) - - inline void transpose4x4_sse2( - const float* SF_BLAS_RESTRICT A, const int lda, - float* SF_BLAS_RESTRICT B, const int ldb - ) - { - __m128 row1 = _mm_loadu_ps(&A[0 * lda]); - __m128 row2 = _mm_loadu_ps(&A[1 * lda]); - __m128 row3 = _mm_loadu_ps(&A[2 * lda]); - __m128 row4 = _mm_loadu_ps(&A[3 * lda]); - - _MM_TRANSPOSE4_PS(row1, row2, row3, row4); - - _mm_storeu_ps(&B[0 * ldb], row1); - _mm_storeu_ps(&B[1 * ldb], row2); - _mm_storeu_ps(&B[2 * ldb], row3); - _mm_storeu_ps(&B[3 * ldb], row4); - } - - void transpose_sse2( - const int N, const int M, - const float* SF_BLAS_RESTRICT A, const int lda, - float* SF_BLAS_RESTRICT B, const int ldb - ) - { - static constexpr int block_size = 16; - - for (int n = 0; n < N; n += block_size) - { - for (int m = 0; m < M; m += block_size) - { - const int max_n2 = n + block_size < N ? n + block_size : N; - const int max_m2 = m + block_size < M ? m + block_size : M; - - int n2 = n; - for (; n2 < max_n2 - 3; n2 += 4) - { - int m2 = m; - for (; m2 < max_m2 - 3; m2 += 4) - { - transpose4x4_sse2( - &A[n2 * lda + m2], lda, - &B[m2 * ldb + n2], ldb - ); - } - - for (; m2 < max_m2; ++m2) - { - B[m2 * ldb + n2 + 0] = A[(n2 + 0) * lda + m2]; - B[m2 * ldb + n2 + 1] = A[(n2 + 1) * lda + m2]; - B[m2 * ldb + n2 + 2] = A[(n2 + 2) * lda + m2]; - B[m2 * ldb + n2 + 3] = A[(n2 + 3) * lda + m2]; - } - } - - for (; n2 < max_n2; ++n2) - { - for (int m2 = m; m2 < max_m2; ++m2) - { - B[m2 * ldb + n2] = A[n2 * lda + m2]; - } - } - } - } - } -#endif - - void transpose( - const int N, const int M, - const float * SF_BLAS_RESTRICT A, const int lda, - float* SF_BLAS_RESTRICT B, const int ldb - ) - { -#if defined (USE_SSE2) - - transpose_sse2( - N, M, - A, lda, - B, ldb - ); - -#else - - for(int r = 0; r < N; ++r) - { - for (int c = 0; c < M; ++c) - { - B[c*ldb + r] = A[r*lda + c]; - } - } - -#endif - } - - void sgemm_row_major_transpose_right( - ThreadPool& thread_pool, - const int M, const int N, const int K, - const float alpha, - const float * SF_BLAS_RESTRICT A, const int lda, - const float * SF_BLAS_RESTRICT B, const int ldb, - const float beta, - float * SF_BLAS_RESTRICT C, const int ldc - ) - { - -#if defined(USE_SSE3) - - const __m128 alpha4 = _mm_set1_ps(alpha); - const __m128 beta4 = _mm_set1_ps(beta); - - std::atomic m_atomic = 0; - thread_pool.execute_with_workers( - [ - M, N, K, - alpha, alpha4, - A, lda, - B, ldb, - beta, beta4, - C, ldc, - &m_atomic - ](Thread&) { - for (;;) - { - const int m = m_atomic.fetch_add(2); - if (m >= M - 1) - break; - - int n = 0; - for (; n < N - 3; n += 4) - { - // mn - __m128 sum00 = _mm_setzero_ps(); - __m128 sum01 = _mm_setzero_ps(); - __m128 sum02 = _mm_setzero_ps(); - __m128 sum03 = _mm_setzero_ps(); - __m128 sum10 = _mm_setzero_ps(); - __m128 sum11 = _mm_setzero_ps(); - __m128 sum12 = _mm_setzero_ps(); - __m128 sum13 = _mm_setzero_ps(); - - // Horizontal sum of elements in sum[m][n] corresponds to - // the final element in the C. 
- - int k = 0; - for (; k < K - 3; k += 4) - { - const __m128 a0 = _mm_loadu_ps(&A[(m+0)*lda+k+0]); - const __m128 a1 = _mm_loadu_ps(&A[(m+1)*lda+k+0]); - - const __m128 b0 = _mm_loadu_ps(&B[(n+0)*ldb+k+0]); - const __m128 b1 = _mm_loadu_ps(&B[(n+1)*ldb+k+0]); - const __m128 b2 = _mm_loadu_ps(&B[(n+2)*ldb+k+0]); - const __m128 b3 = _mm_loadu_ps(&B[(n+3)*ldb+k+0]); - - sum00 = _mm_add_ps(sum00, _mm_mul_ps(a0, b0)); - sum01 = _mm_add_ps(sum01, _mm_mul_ps(a0, b1)); - sum02 = _mm_add_ps(sum02, _mm_mul_ps(a0, b2)); - sum03 = _mm_add_ps(sum03, _mm_mul_ps(a0, b3)); - sum10 = _mm_add_ps(sum10, _mm_mul_ps(a1, b0)); - sum11 = _mm_add_ps(sum11, _mm_mul_ps(a1, b1)); - sum12 = _mm_add_ps(sum12, _mm_mul_ps(a1, b2)); - sum13 = _mm_add_ps(sum13, _mm_mul_ps(a1, b3)); - } - - for(; k < K; k += 1) - { - const float a0 = A[(m+0)*lda+k+0]; - const float a1 = A[(m+1)*lda+k+0]; - - const float b0 = B[(n+0)*ldb+k+0]; - const float b1 = B[(n+1)*ldb+k+0]; - const float b2 = B[(n+2)*ldb+k+0]; - const float b3 = B[(n+3)*ldb+k+0]; - - // Since all will be summed vertically anyway we can - // just add to the first element. - // Other elements are left unmodified. - sum00 = _mm_add_ss(sum00, _mm_set_ss(a0 * b0)); - sum01 = _mm_add_ss(sum01, _mm_set_ss(a0 * b1)); - sum02 = _mm_add_ss(sum02, _mm_set_ss(a0 * b2)); - sum03 = _mm_add_ss(sum03, _mm_set_ss(a0 * b3)); - sum10 = _mm_add_ss(sum10, _mm_set_ss(a1 * b0)); - sum11 = _mm_add_ss(sum11, _mm_set_ss(a1 * b1)); - sum12 = _mm_add_ss(sum12, _mm_set_ss(a1 * b2)); - sum13 = _mm_add_ss(sum13, _mm_set_ss(a1 * b3)); - } - - __m128 s0 = m128_hadd_ps(sum00, sum01, sum02, sum03); - __m128 s1 = m128_hadd_ps(sum10, sum11, sum12, sum13); - s0 = _mm_mul_ps(s0, alpha4); - s1 = _mm_mul_ps(s1, alpha4); - - __m128 c0 = _mm_loadu_ps(&C[(m+0)*ldc+(n+0)]); - __m128 c1 = _mm_loadu_ps(&C[(m+1)*ldc+(n+0)]); - c0 = _mm_mul_ps(c0, beta4); - c1 = _mm_mul_ps(c1, beta4); - - c0 = _mm_add_ps(c0, s0); - c1 = _mm_add_ps(c1, s1); - - _mm_storeu_ps(&C[(m+0)*ldc+(n+0)], c0); - _mm_storeu_ps(&C[(m+1)*ldc+(n+0)], c1); - } - - for(; n < N; n += 1) - { - float sum0 = 0.0f; - float sum1 = 0.0f; - - for (int k = 0; k < K; ++k) - { - const float a0 = A[(m+0)*lda+k+0]; - const float a1 = A[(m+1)*lda+k+0]; - - const float b0 = B[(n+0)*ldb+k+0]; - - sum0 += a0 * b0; - sum1 += a1 * b0; - } - - C[(m+0)*ldc+(n+0)] = C[(m+0)*ldc+(n+0)] * beta + sum0 * alpha; - C[(m+1)*ldc+(n+0)] = C[(m+1)*ldc+(n+0)] * beta + sum1 * alpha; - } - } - } - ); - - int m = M - (M % 2); - for (; m < M; m += 1) - { - for (int n = 0; n < N; n += 1) - { - float sum = 0.0f; - - for (int k = 0; k < K; k += 1) - { - sum += A[m*lda + k] * B[n*ldb + k]; - } - - C[m*ldc + n] = C[m*ldc + n] * beta + sum * alpha; - } - } - - thread_pool.wait_for_workers_finished(); - -#else - - thread_pool.for_each_index_with_workers( - 0, M, - [&](Thread&, int m) { - for (int n = 0; n < N; n += 1) - { - float sum = 0.0f; - - for (int k = 0; k < K; k += 1) - { - sum += A[m*lda + k] * B[n*ldb + k]; - } - - C[m*ldc + n] = C[m*ldc + n] * beta + sum * alpha; - } - } - ); - thread_pool.wait_for_workers_finished(); - -#endif - } - - void sgemm_row_major_transpose_right( - const int M, const int N, const int K, - const float alpha, - const float * SF_BLAS_RESTRICT A, const int lda, - const float * SF_BLAS_RESTRICT B, const int ldb, - const float beta, - float * SF_BLAS_RESTRICT C, const int ldc - ) - { - -#if defined(USE_SSE3) - - const __m128 alpha4 = _mm_set1_ps(alpha); - const __m128 beta4 = _mm_set1_ps(beta); - - int m = 0; - for (; m < M - 1; m += 2) - { - int n = 0; - for 
(; n < N - 3; n += 4) - { - // mn - __m128 sum00 = _mm_setzero_ps(); - __m128 sum01 = _mm_setzero_ps(); - __m128 sum02 = _mm_setzero_ps(); - __m128 sum03 = _mm_setzero_ps(); - __m128 sum10 = _mm_setzero_ps(); - __m128 sum11 = _mm_setzero_ps(); - __m128 sum12 = _mm_setzero_ps(); - __m128 sum13 = _mm_setzero_ps(); - - // Horizontal sum of elements in sum[m][n] corresponds to - // the final element in the C. - - int k = 0; - for (; k < K - 3; k += 4) - { - const __m128 a0 = _mm_loadu_ps(&A[(m+0)*lda+k+0]); - const __m128 a1 = _mm_loadu_ps(&A[(m+1)*lda+k+0]); - - const __m128 b0 = _mm_loadu_ps(&B[(n+0)*ldb+k+0]); - const __m128 b1 = _mm_loadu_ps(&B[(n+1)*ldb+k+0]); - const __m128 b2 = _mm_loadu_ps(&B[(n+2)*ldb+k+0]); - const __m128 b3 = _mm_loadu_ps(&B[(n+3)*ldb+k+0]); - - sum00 = _mm_add_ps(sum00, _mm_mul_ps(a0, b0)); - sum01 = _mm_add_ps(sum01, _mm_mul_ps(a0, b1)); - sum02 = _mm_add_ps(sum02, _mm_mul_ps(a0, b2)); - sum03 = _mm_add_ps(sum03, _mm_mul_ps(a0, b3)); - sum10 = _mm_add_ps(sum10, _mm_mul_ps(a1, b0)); - sum11 = _mm_add_ps(sum11, _mm_mul_ps(a1, b1)); - sum12 = _mm_add_ps(sum12, _mm_mul_ps(a1, b2)); - sum13 = _mm_add_ps(sum13, _mm_mul_ps(a1, b3)); - } - - for(; k < K; k += 1) - { - const float a0 = A[(m+0)*lda+k+0]; - const float a1 = A[(m+1)*lda+k+0]; - - const float b0 = B[(n+0)*ldb+k+0]; - const float b1 = B[(n+1)*ldb+k+0]; - const float b2 = B[(n+2)*ldb+k+0]; - const float b3 = B[(n+3)*ldb+k+0]; - - // Since all will be summed vertically anyway we can - // just add to the first element. - // Other elements are left unmodified. - sum00 = _mm_add_ss(sum00, _mm_set_ss(a0 * b0)); - sum01 = _mm_add_ss(sum01, _mm_set_ss(a0 * b1)); - sum02 = _mm_add_ss(sum02, _mm_set_ss(a0 * b2)); - sum03 = _mm_add_ss(sum03, _mm_set_ss(a0 * b3)); - sum10 = _mm_add_ss(sum10, _mm_set_ss(a1 * b0)); - sum11 = _mm_add_ss(sum11, _mm_set_ss(a1 * b1)); - sum12 = _mm_add_ss(sum12, _mm_set_ss(a1 * b2)); - sum13 = _mm_add_ss(sum13, _mm_set_ss(a1 * b3)); - } - - __m128 s0 = m128_hadd_ps(sum00, sum01, sum02, sum03); - __m128 s1 = m128_hadd_ps(sum10, sum11, sum12, sum13); - s0 = _mm_mul_ps(s0, alpha4); - s1 = _mm_mul_ps(s1, alpha4); - - __m128 c0 = _mm_loadu_ps(&C[(m+0)*ldc+(n+0)]); - __m128 c1 = _mm_loadu_ps(&C[(m+1)*ldc+(n+0)]); - c0 = _mm_mul_ps(c0, beta4); - c1 = _mm_mul_ps(c1, beta4); - - c0 = _mm_add_ps(c0, s0); - c1 = _mm_add_ps(c1, s1); - - _mm_storeu_ps(&C[(m+0)*ldc+(n+0)], c0); - _mm_storeu_ps(&C[(m+1)*ldc+(n+0)], c1); - } - - for(; n < N; n += 1) - { - float sum0 = 0.0f; - float sum1 = 0.0f; - - for (int k = 0; k < K; ++k) - { - const float a0 = A[(m+0)*lda+k+0]; - const float a1 = A[(m+1)*lda+k+0]; - - const float b0 = B[(n+0)*ldb+k+0]; - - sum0 += a0 * b0; - sum1 += a1 * b0; - } - - C[(m+0)*ldc+(n+0)] = C[(m+0)*ldc+(n+0)] * beta + sum0 * alpha; - C[(m+1)*ldc+(n+0)] = C[(m+1)*ldc+(n+0)] * beta + sum1 * alpha; - } - } - - for (; m < M; m += 1) - { - for (int n = 0; n < N; n += 1) - { - float sum = 0.0f; - - for (int k = 0; k < K; k += 1) - { - sum += A[m*lda + k] * B[n*ldb + k]; - } - - C[m*ldc + n] = C[m*ldc + n] * beta + sum * alpha; - } - } - -#else - - for (int m = 0; m < M; m += 1) - { - for (int n = 0; n < N; n += 1) - { - float sum = 0.0f; - - for (int k = 0; k < K; k += 1) - { - sum += A[m*lda + k] * B[n*ldb + k]; - } - - C[m*ldc + n] = C[m*ldc + n] * beta + sum * alpha; - } - } - -#endif - } - - // The pointer to the storage returned by this function - // is valid until the next call to this function from - // the same thread with the same idx. 
- // This is an unsafe function and should be used with caution - // and only within this translation unit. - // The number of buffers available is just enough to make - // all functions here work. - float* get_thread_local_temporary_storage( - int requested_size, int idx - ) - { - static constexpr int MAX_NUM_BUFFERS = 2; - - static thread_local int s_data_size[MAX_NUM_BUFFERS] = {0}; - static thread_local std::unique_ptr s_data[MAX_NUM_BUFFERS]; - - if (requested_size > s_data_size[idx]) - { - s_data[idx] = std::make_unique(requested_size); - s_data_size[idx] = requested_size; - } - - return s_data[idx].get(); - } - - void sgemm_row_major_transpose_none( - ThreadPool& thread_pool, - const int M, const int N, const int K, - const float alpha, - const float * SF_BLAS_RESTRICT A, const int lda, - const float * SF_BLAS_RESTRICT B, const int ldb, - const float beta, - float * SF_BLAS_RESTRICT C, const int ldc - ) - { - constexpr static int temporary_buffer_index = 1; - - auto B_tr = get_thread_local_temporary_storage(K * N, temporary_buffer_index); - - transpose( - K, N, - B, ldb, - B_tr, K - ); - - sgemm_row_major_transpose_right( - thread_pool, - M, N, K, - alpha, - A, lda, - B_tr, K, - beta, - C, ldc - ); - } - - void sgemm_row_major_transpose_none( - const int M, const int N, const int K, - const float alpha, - const float * SF_BLAS_RESTRICT A, const int lda, - const float * SF_BLAS_RESTRICT B, const int ldb, - const float beta, - float * SF_BLAS_RESTRICT C, const int ldc - ) - { - constexpr static int temporary_buffer_index = 1; - - auto B_tr = get_thread_local_temporary_storage(K * N, temporary_buffer_index); - - transpose( - K, N, - B, ldb, - B_tr, K - ); - - sgemm_row_major_transpose_right( - M, N, K, - alpha, - A, lda, - B_tr, K, - beta, - C, ldc - ); - } - - void sgemm_row_major( - ThreadPool& thread_pool, - MatrixTranspose TransA, MatrixTranspose TransB, - const int M, const int N, const int K, - const float alpha, - const float * SF_BLAS_RESTRICT A, const int lda, - const float * SF_BLAS_RESTRICT B, const int ldb, - const float beta, - float * SF_BLAS_RESTRICT C, const int ldc - ) - { - constexpr static int temporary_buffer_index = 0; - - if (TransA == MatrixTranspose::Trans && TransB == MatrixTranspose::Trans) - { - auto A_tr = get_thread_local_temporary_storage(K * M, temporary_buffer_index); - - transpose( - K, M, - A, lda, - A_tr, K - ); - - sgemm_row_major_transpose_right( - thread_pool, - M, N, K, - alpha, - A_tr, K, - B, ldb, - beta, - C, ldc - ); - } - else if (TransA == MatrixTranspose::NoTrans && TransB == MatrixTranspose::Trans) - { - sgemm_row_major_transpose_right( - thread_pool, - M, N, K, - alpha, - A, lda, - B, ldb, - beta, - C, ldc - ); - } - else if (TransA == MatrixTranspose::Trans && TransB == MatrixTranspose::NoTrans) - { - auto A_tr = get_thread_local_temporary_storage(K * M, temporary_buffer_index); - - transpose( - K, M, - A, lda, - A_tr, K - ); - - sgemm_row_major_transpose_none( - thread_pool, - M, N, K, - alpha, - A_tr, K, - B, ldb, - beta, - C, ldc - ); - } - else // no transpositions - { - sgemm_row_major_transpose_none( - thread_pool, - M, N, K, - alpha, - A, lda, - B, ldb, - beta, - C, ldc - ); - } - } - - void sgemm_row_major( - MatrixTranspose TransA, MatrixTranspose TransB, - const int M, const int N, const int K, - const float alpha, - const float * SF_BLAS_RESTRICT A, const int lda, - const float * SF_BLAS_RESTRICT B, const int ldb, - const float beta, - float * SF_BLAS_RESTRICT C, const int ldc - ) - { - constexpr static int 
temporary_buffer_index = 0; - - if (TransA == MatrixTranspose::Trans && TransB == MatrixTranspose::Trans) - { - auto A_tr = get_thread_local_temporary_storage(K * M, temporary_buffer_index); - - transpose( - K, M, - A, lda, - A_tr, K - ); - - sgemm_row_major_transpose_right( - M, N, K, - alpha, - A_tr, K, - B, ldb, - beta, - C, ldc - ); - } - else if (TransA == MatrixTranspose::NoTrans && TransB == MatrixTranspose::Trans) - { - sgemm_row_major_transpose_right( - M, N, K, - alpha, - A, lda, - B, ldb, - beta, - C, ldc - ); - } - else if (TransA == MatrixTranspose::Trans && TransB == MatrixTranspose::NoTrans) - { - auto A_tr = get_thread_local_temporary_storage(K * M, temporary_buffer_index); - - transpose( - K, M, - A, lda, - A_tr, K - ); - - sgemm_row_major_transpose_none( - M, N, K, - alpha, - A_tr, K, - B, ldb, - beta, - C, ldc - ); - } - else // no transpositions - { - sgemm_row_major_transpose_none( - M, N, K, - alpha, - A, lda, - B, ldb, - beta, - C, ldc - ); - } - } - - void sgemm( - ThreadPool& thread_pool, - MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB, - const int M, const int N, const int K, - const float alpha, - const float * SF_BLAS_RESTRICT A, const int lda, - const float * SF_BLAS_RESTRICT B, const int ldb, - const float beta, - float * SF_BLAS_RESTRICT C, const int ldc - ) - { - if (layout == MatrixLayout::RowMajor) - { - sgemm_row_major( - thread_pool, - TransA, TransB, - M, N, K, - alpha, - A, lda, - B, ldb, - beta, - C, ldc - ); - } - else - { - sgemm_row_major( - thread_pool, - TransB, TransA, - N, M, K, - alpha, - B, ldb, - A, lda, - beta, - C, ldc - ); - } - } - - - void sgemm( - MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB, - const int M, const int N, const int K, - const float alpha, - const float * SF_BLAS_RESTRICT A, const int lda, - const float * SF_BLAS_RESTRICT B, const int ldb, - const float beta, - float * SF_BLAS_RESTRICT C, const int ldc - ) - { - if (layout == MatrixLayout::RowMajor) - { - sgemm_row_major( - TransA, TransB, - M, N, K, - alpha, - A, lda, - B, ldb, - beta, - C, ldc - ); - } - else - { - sgemm_row_major( - TransB, TransA, - N, M, K, - alpha, - B, ldb, - A, lda, - beta, - C, ldc - ); - } - } - - std::vector generate_random_matrix(int rows, int cols) - { - std::vector m(rows * cols); - - std::mt19937_64 rng; - std::uniform_real_distribution d(-1.0, 1.0); - - for(auto& v : m) - { - v = d(rng); - } - - return m; - } - - std::vector generate_zero_matrix(int rows, int cols) - { - return std::vector(rows * cols, 0.0f); - } - - float matrix_relative_error( - const std::vector& ref, - const std::vector& our - ) - { - double sum = 0.0; - double diff_sum = 0.0; - - for(size_t i = 0; i < ref.size(); ++i) - { - sum += std::abs(ref[i]); - diff_sum += std::abs(ref[i] - our[i]); - } - - return diff_sum / sum; - } - - float norm( - const std::vector& v - ) - { - double sum = 0.0; - - for(auto& e : v) - { - sum += e * e; - } - - return std::sqrt(sum); - } - -#if defined (USE_BLAS) - - CBLAS_LAYOUT matrix_layout_to_blas_layout(MatrixLayout layout) - { - if (layout == MatrixLayout::RowMajor) - return CblasRowMajor; - else if (layout == MatrixLayout::ColMajor) - return CblasColMajor; - - return static_cast(-1); - } - - const char* matrix_layout_to_string(MatrixLayout layout) - { - if (layout == MatrixLayout::RowMajor) - return "RowMajor"; - else if (layout == MatrixLayout::ColMajor) - return "ColMajor"; - - return "INVALID"; - } - - CBLAS_TRANSPOSE matrix_transpose_to_blas_transpose(MatrixTranspose tr) - { - if (tr == 
MatrixTranspose::NoTrans) - return CblasNoTrans; - else if (tr == MatrixTranspose::Trans) - return CblasTrans; - - return static_cast(-1); - } - - const char* matrix_transpose_to_string(MatrixTranspose tr) - { - if (tr == MatrixTranspose::NoTrans) - return "NoTrans"; - else if (tr == MatrixTranspose::Trans) - return "Trans"; - - return "INVALID"; - } - - void test_sgemm( - ThreadPool& thread_pool, - MatrixLayout layout, MatrixTranspose trA, MatrixTranspose trB, - int M, int N, int K - ) - { - auto A = generate_random_matrix(M * 2, K * 2); - auto B = generate_random_matrix(K * 2, N * 2); - auto C_ref = generate_random_matrix(M * 2, N * 2); - auto C_our = C_ref; - - std::cout - << matrix_layout_to_string(layout) << ' ' - << matrix_transpose_to_string(trA) << ' ' - << matrix_transpose_to_string(trB) << '\n'; - - std::cout << "A norm: " << norm(A) << '\n'; - std::cout << "B norm: " << norm(B) << '\n'; - std::cout << "C norm: " << norm(C_ref) << '\n'; - - const int lda = (trA == MatrixTranspose::NoTrans) == (layout == MatrixLayout::RowMajor) ? K * 2 : M * 2; - const int ldb = (trB == MatrixTranspose::NoTrans) == (layout == MatrixLayout::RowMajor) ? N * 2 : K * 2; - const int ldc = (layout == MatrixLayout::RowMajor) ? N * 2 : M * 2; - - cblas_sgemm( - matrix_layout_to_blas_layout(layout), - matrix_transpose_to_blas_transpose(trA), - matrix_transpose_to_blas_transpose(trB), - M, N, K, - 1.0, - A.data(), lda, - B.data(), ldb, - 1.0, - C_ref.data(), ldc - ); - - sgemm( - thread_pool, - layout, trA, trB, - M, N, K, - 1.0, - A.data(), lda, - B.data(), ldb, - 1.0, - C_our.data(), ldc - ); - - std::cout << "C_ref norm: " << norm(C_ref) << '\n'; - std::cout << "C_our norm: " << norm(C_our) << '\n'; - std::cout << "Relative error: " << matrix_relative_error(C_ref, C_our) << '\n'; - - std::cout << '\n'; - } - - void test_sgemm( - ThreadPool& thread_pool - ) - { - constexpr int M = 57; - constexpr int N = 127; - constexpr int K = 31; - - std::cout << "SGEMM test:\n"; - - for(auto layout : { MatrixLayout::RowMajor, MatrixLayout::ColMajor }) - { - for(auto trA : { MatrixTranspose::NoTrans, MatrixTranspose::Trans }) - { - for(auto trB : { MatrixTranspose::NoTrans, MatrixTranspose::Trans }) - { - test_sgemm( - thread_pool, - layout, trA, trB, - M, N, K - ); - } - } - } - } - - void bench_sgemm( - ThreadPool& thread_pool, - MatrixLayout layout, MatrixTranspose trA, MatrixTranspose trB, - int M, int N, int K - ) - { - constexpr int num_iters = 1000; - - auto A = generate_random_matrix(M * 2, K * 2); - auto B = generate_random_matrix(K * 2, N * 2); - auto C_ref = generate_random_matrix(M * 2, N * 2); - auto C_our = C_ref; - - std::cout - << matrix_layout_to_string(layout) << ' ' - << matrix_transpose_to_string(trA) << ' ' - << matrix_transpose_to_string(trB) << '\n'; - - std::cout << "A norm: " << norm(A) << '\n'; - std::cout << "B norm: " << norm(B) << '\n'; - std::cout << "C norm: " << norm(C_ref) << '\n'; - - const int lda = (trA == MatrixTranspose::NoTrans) == (layout == MatrixLayout::RowMajor) ? K * 2 : M * 2; - const int ldb = (trB == MatrixTranspose::NoTrans) == (layout == MatrixLayout::RowMajor) ? N * 2 : K * 2; - const int ldc = (layout == MatrixLayout::RowMajor) ? 
N * 2 : M * 2; - - auto t0_ref = std::chrono::high_resolution_clock::now(); - for(int i = 0; i < num_iters; ++i) - { - cblas_sgemm( - matrix_layout_to_blas_layout(layout), - matrix_transpose_to_blas_transpose(trA), - matrix_transpose_to_blas_transpose(trB), - M, N, K, - 1.0, - A.data(), lda, - B.data(), ldb, - -0.5, - C_ref.data(), ldc - ); - } - auto t1_ref = std::chrono::high_resolution_clock::now(); - auto diff_ref = t1_ref - t0_ref; - - auto t0_our = std::chrono::high_resolution_clock::now(); - for(int i = 0; i < num_iters; ++i) - { - sgemm( - thread_pool, - layout, trA, trB, - M, N, K, - 1.0, - A.data(), lda, - B.data(), ldb, - -0.5, - C_our.data(), ldc - ); - } - auto t1_our = std::chrono::high_resolution_clock::now(); - auto diff_our = t1_our - t0_our; - - std::cout << "C_ref norm: " << norm(C_ref) << '\n'; - std::cout << "C_our norm: " << norm(C_our) << '\n'; - std::cout << "Relative error: " << matrix_relative_error(C_ref, C_our) << '\n'; - std::cout << "Ref time: " << std::chrono::duration_cast(diff_ref).count() << " [ns]\n"; - std::cout << "Our time: " << std::chrono::duration_cast(diff_our).count() << " [ns]\n"; - - std::cout << '\n'; - } - - void bench_sgemm( - ThreadPool& thread_pool - ) - { - constexpr int M = 107; - constexpr int N = 213; - constexpr int K = 57; - - std::cout << "SGEMM benchmark:\n"; - - for(auto layout : { MatrixLayout::RowMajor, MatrixLayout::ColMajor }) - { - for(auto trA : { MatrixTranspose::NoTrans, MatrixTranspose::Trans }) - { - for(auto trB : { MatrixTranspose::NoTrans, MatrixTranspose::Trans }) - { - bench_sgemm( - thread_pool, - layout, trA, trB, - M, N, K - ); - } - } - } - } - -#endif - - void print_arch() - { -#if defined (USE_SSE3) - std::cout << "Using the sse3 implementation.\n"; -#elif defined (USE_SSE2) - std::cout << "Using the sse2 implementation.\n"; -#else - std::cout << "Using the base implementation.\n"; -#endif - } - - void test( - ThreadPool& thread_pool - ) - { -#if defined (USE_BLAS) - print_arch(); - test_sgemm(thread_pool); -#else - std::cout << "Blas tests are only runnable when USE_BLAS is defined.\n"; - (void)thread_pool; -#endif - } - - void bench( - ThreadPool& thread_pool - ) - { -#if defined (USE_BLAS) - print_arch(); - bench_sgemm(thread_pool); -#else - std::cout << "Blas benchmarks are only runnable when USE_BLAS is defined.\n"; - (void)thread_pool; -#endif - } -} \ No newline at end of file diff --git a/src/extra/stockfish_blas.h b/src/extra/stockfish_blas.h deleted file mode 100644 index f551bbf2..00000000 --- a/src/extra/stockfish_blas.h +++ /dev/null @@ -1,140 +0,0 @@ -#ifndef _STOCKFISH_BLAS_H_ -#define _STOCKFISH_BLAS_H_ - -struct ThreadPool; - -#if defined (_MSC_VER) -#define SF_BLAS_RESTRICT __restrict -#elif defined (__INTEL_COMPILER) -#define SF_BLAS_RESTRICT restrict -#elif defined (__clang__) -#define SF_BLAS_RESTRICT __restrict__ -#elif defined (__GNUC__) -#define SF_BLAS_RESTRICT __restrict__ -#endif - -namespace Blas { - - enum struct MatrixLayout { - RowMajor = 101, - ColMajor = 102 - }; - - enum struct MatrixTranspose { - NoTrans = 111, - Trans = 112 - }; - - void scopy( - const int N, - const float * SF_BLAS_RESTRICT X, - float * SF_BLAS_RESTRICT Y - ); - - void scopy( - const int N, - const float * SF_BLAS_RESTRICT X, const int incX, - float * SF_BLAS_RESTRICT Y, const int incY - ); - - void scopy( - ThreadPool& thread_pool, - const int N, - const float * SF_BLAS_RESTRICT X, - float * SF_BLAS_RESTRICT Y - ); - - void scopy( - ThreadPool& thread_pool, - const int N, - const float * SF_BLAS_RESTRICT X, 
const int incX, - float * SF_BLAS_RESTRICT Y, const int incY - ); - - void sscal( - const int N, - const float alpha, - float * SF_BLAS_RESTRICT X - ); - - void sscal( - const int N, - const float alpha, - float * SF_BLAS_RESTRICT X, const int incX - ); - - void sscal( - ThreadPool& thread_pool, - const int N, - const float alpha, - float * SF_BLAS_RESTRICT X - ); - - void sscal( - ThreadPool& thread_pool, - const int N, - const float alpha, - float * SF_BLAS_RESTRICT X, const int incX - ); - - void saxpy( - const int N, - const float alpha, - const float * SF_BLAS_RESTRICT X, - float * SF_BLAS_RESTRICT Y - ); - - void saxpy( - const int N, - const float alpha, - const float * SF_BLAS_RESTRICT X, const int incX, - float * SF_BLAS_RESTRICT Y, const int incY - ); - - void saxpy( - ThreadPool& thread_pool, - const int N, - const float alpha, - const float * SF_BLAS_RESTRICT X, - float * SF_BLAS_RESTRICT Y - ); - - void saxpy( - ThreadPool& thread_pool, - const int N, - const float alpha, - const float * SF_BLAS_RESTRICT X, const int incX, - float * SF_BLAS_RESTRICT Y, const int incY - ); - - void sgemm( - ThreadPool& thread_pool, - MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB, - const int M, const int N, const int K, - const float alpha, - const float * SF_BLAS_RESTRICT A, const int lda, - const float * SF_BLAS_RESTRICT B, const int ldb, - const float beta, - float * SF_BLAS_RESTRICT C, const int ldc - ); - - void sgemm( - MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB, - const int M, const int N, const int K, - const float alpha, - const float * SF_BLAS_RESTRICT A, const int lda, - const float * SF_BLAS_RESTRICT B, const int ldb, - const float beta, - float * SF_BLAS_RESTRICT C, const int ldc - ); - - void test( - ThreadPool& thread_pool - ); - - void bench( - ThreadPool& thread_pool - ); -} - -#endif diff --git a/src/learn/autograd.h b/src/learn/autograd.h deleted file mode 100644 index 7b2853df..00000000 --- a/src/learn/autograd.h +++ /dev/null @@ -1,667 +0,0 @@ -#ifndef LEARNER_AUTOGRAD_H -#define LEARNER_AUTOGRAD_H - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace Learner -{ - template - struct ValueWithGrad - { - T value; - T grad; - - ValueWithGrad& operator+=(const ValueWithGrad& rhs) - { - value += rhs.value; - grad += rhs.grad; - return *this; - } - - ValueWithGrad& operator-=(const ValueWithGrad& rhs) - { - value -= rhs.value; - grad -= rhs.grad; - return *this; - } - - ValueWithGrad& operator*=(T rhs) - { - value *= rhs; - grad *= rhs; - return *this; - } - - ValueWithGrad& operator/=(T rhs) - { - value /= rhs; - grad /= rhs; - return *this; - } - - [[nodiscard]] ValueWithGrad abs() const - { - return { std::abs(value), std::abs(grad) }; - } - - [[nodiscard]] ValueWithGrad clamp_grad(T max) const - { - return { value, std::clamp(grad, -max, max) }; - } - }; -} - -namespace Learner::Autograd::UnivariateStatic -{ - - template - struct Identity - { - using type = T; - }; - - template - using Id = typename Identity::type; - - template - using StoreValueOrRef = std::conditional_t< - std::is_rvalue_reference_v, - std::remove_reference_t, - const std::remove_reference_t& - >; - - namespace Detail - { - using CallIdType = std::uint32_t; - - struct CallId - { - CallIdType call_id{}; - - constexpr CallId() : - call_id(0) - { - } - - constexpr CallId(CallIdType id) : - call_id(id) - { - } - - [[nodiscard]] bool operator==(CallId rhs) const noexcept - { - return call_id == rhs.call_id; - } - - 
[[nodiscard]] bool operator!=(CallId rhs) const noexcept - { - return call_id != rhs.call_id; - } - }; - - [[nodiscard]] inline CallId next_call_id() - { - static thread_local CallIdType s_call_id = 0; - return CallId{ s_call_id++ }; - } - - template - struct TupleContains; - - template - struct TupleContains> : std::disjunction...> {}; - - template - constexpr bool TupleContainsV = TupleContains::value; - - template - constexpr bool AreAllConstantV = (std::remove_reference_t::is_constant && ...); - } - - template - struct Evaluable - { - constexpr Evaluable() = default; - - // We append a unique call id so that we can invalidate the cache when - // the next computation starts. A single evaluation should see - // the same call_id at every node. - template - [[nodiscard]] auto eval(const std::tuple& args) const - { - const auto call_id = Detail::next_call_id(); - const auto new_args = std::tuple_cat(args, std::tuple(call_id)); - return ValueWithGrad{ value(new_args), grad(new_args) }; - } - - template >>> - [[nodiscard]] auto value(const std::tuple& args) const - { - const ChildT* this_ = static_cast(this); - - const auto call_id = std::get(args); - if (!value_cache.has_value() || value_cache_call_id != call_id) - { - value_cache_call_id = call_id; - value_cache = this_->calculate_value(args); - } - - return *value_cache; - } - - template >>> - [[nodiscard]] auto value(const std::tuple& args, ...) const - { - const auto call_id = Detail::next_call_id(); - const auto new_args = std::tuple_cat(args, std::tuple(call_id)); - return value(new_args); - } - - template >>> - [[nodiscard]] auto grad(const std::tuple& args) const - { - if constexpr (ChildT::is_constant) - { - return T(0.0); - } - else - { - const ChildT* this_ = static_cast(this); - - const auto call_id = std::get(args); - if (!grad_cache.has_value() || grad_cache_call_id != call_id) - { - grad_cache_call_id = call_id; - grad_cache = this_->calculate_grad(args); - } - - return *grad_cache; - } - } - - template >>> - [[nodiscard]] auto grad(const std::tuple& args, ...) 
const - { - const auto call_id = Detail::next_call_id(); - const auto new_args = std::tuple_cat(args, std::tuple(call_id)); - return grad(new_args); - } - - private: - mutable std::optional value_cache; - mutable std::optional grad_cache; - mutable Detail::CallId value_cache_call_id{}; - mutable Detail::CallId grad_cache_call_id{}; - }; - - template - struct VariableParameter : Evaluable> - { - using ValueType = T; - - static constexpr bool is_constant = false; - - constexpr VariableParameter() - { - } - - template - [[nodiscard]] T calculate_value(const std::tuple& args) const - { - return std::get(args); - } - - template - [[nodiscard]] T calculate_grad(const std::tuple&) const - { - return T(1.0); - } - }; - - template - struct ConstantParameter : Evaluable> - { - using ValueType = T; - - static constexpr bool is_constant = true; - - constexpr ConstantParameter() - { - } - - template - [[nodiscard]] T calculate_value(const std::tuple& args) const - { - return std::get(args); - } - - template - [[nodiscard]] T calculate_grad(const std::tuple&) const - { - return T(0.0); - } - }; - - template - struct Constant : Evaluable> - { - using ValueType = T; - - static constexpr bool is_constant = true; - - constexpr Constant(T x) : - m_x(std::move(x)) - { - } - - template - [[nodiscard]] T calculate_value(const std::tuple&) const - { - return m_x; - } - - template - [[nodiscard]] T calculate_grad(const std::tuple&) const - { - return T(0.0); - } - - private: - T m_x; - }; - - // The "constant" may change between executions, but is assumed to be - // constant during a single evaluation. - template - struct ConstantRef : Evaluable> - { - using ValueType = T; - - static constexpr bool is_constant = true; - - constexpr ConstantRef(const T& x) : - m_x(x) - { - } - - template - [[nodiscard]] T calculate_value(const std::tuple&) const - { - return m_x; - } - - template - [[nodiscard]] T calculate_grad(const std::tuple&) const - { - return T(0.0); - } - - private: - const T& m_x; - }; - - template ::ValueType> - struct Sum : Evaluable> - { - using ValueType = T; - - static constexpr bool is_constant = Detail::AreAllConstantV; - - constexpr Sum(LhsT&& lhs, RhsT&& rhs) : - m_lhs(std::forward(lhs)), - m_rhs(std::forward(rhs)) - { - } - - template - [[nodiscard]] T calculate_value(const std::tuple& args) const - { - return m_lhs.value(args) + m_rhs.value(args); - } - - template - [[nodiscard]] T calculate_grad(const std::tuple& args) const - { - return m_lhs.grad(args) + m_rhs.grad(args); - } - - private: - StoreValueOrRef m_lhs; - StoreValueOrRef m_rhs; - }; - - template ::ValueType> - [[nodiscard]] constexpr auto operator+(LhsT&& lhs, RhsT&& rhs) - { - return Sum(std::forward(lhs), std::forward(rhs)); - } - - template ::ValueType> - [[nodiscard]] constexpr auto operator+(LhsT&& lhs, Id rhs) - { - return Sum&&>(std::forward(lhs), Constant(rhs)); - } - - template ::ValueType> - [[nodiscard]] constexpr auto operator+(Id lhs, RhsT&& rhs) - { - return Sum&&, RhsT&&>(Constant(lhs), std::forward(rhs)); - } - - template ::ValueType> - struct Difference : Evaluable> - { - using ValueType = T; - - static constexpr bool is_constant = Detail::AreAllConstantV; - - constexpr Difference(LhsT&& lhs, RhsT&& rhs) : - m_lhs(std::forward(lhs)), - m_rhs(std::forward(rhs)) - { - } - - template - [[nodiscard]] T calculate_value(const std::tuple& args) const - { - return m_lhs.value(args) - m_rhs.value(args); - } - - template - [[nodiscard]] T calculate_grad(const std::tuple& args) const - { - return m_lhs.grad(args) - 
m_rhs.grad(args); - } - - private: - StoreValueOrRef m_lhs; - StoreValueOrRef m_rhs; - }; - - template ::ValueType> - [[nodiscard]] constexpr auto operator-(LhsT&& lhs, RhsT&& rhs) - { - return Difference(std::forward(lhs), std::forward(rhs)); - } - - template ::ValueType> - [[nodiscard]] constexpr auto operator-(LhsT&& lhs, Id rhs) - { - return Difference&&>(std::forward(lhs), Constant(rhs)); - } - - template ::ValueType> - [[nodiscard]] constexpr auto operator-(Id lhs, RhsT&& rhs) - { - return Difference&&, RhsT&&>(Constant(lhs), std::forward(rhs)); - } - - template ::ValueType> - struct Product : Evaluable> - { - using ValueType = T; - - static constexpr bool is_constant = Detail::AreAllConstantV; - - constexpr Product(LhsT&& lhs, RhsT&& rhs) : - m_lhs(std::forward(lhs)), - m_rhs(std::forward(rhs)) - { - } - - template - [[nodiscard]] T calculate_value(const std::tuple& args) const - { - return m_lhs.value(args) * m_rhs.value(args); - } - - template - [[nodiscard]] T calculate_grad(const std::tuple& args) const - { - return m_lhs.grad(args) * m_rhs.value(args) + m_lhs.value(args) * m_rhs.grad(args); - } - - private: - StoreValueOrRef m_lhs; - StoreValueOrRef m_rhs; - }; - - template ::ValueType> - [[nodiscard]] constexpr auto operator*(LhsT&& lhs, RhsT&& rhs) - { - return Product(std::forward(lhs), std::forward(rhs)); - } - - template ::ValueType> - [[nodiscard]] constexpr auto operator*(LhsT&& lhs, Id rhs) - { - return Product&&>(std::forward(lhs), Constant(rhs)); - } - - template ::ValueType> - [[nodiscard]] constexpr auto operator*(Id lhs, RhsT&& rhs) - { - return Product&&, RhsT&&>(Constant(lhs), std::forward(rhs)); - } - - template ::ValueType> - struct Quotient : Evaluable> - { - using ValueType = T; - - static constexpr bool is_constant = Detail::AreAllConstantV; - - constexpr Quotient(LhsT&& lhs, RhsT&& rhs) : - m_lhs(std::forward(lhs)), - m_rhs(std::forward(rhs)) - { - } - - template - [[nodiscard]] T calculate_value(const std::tuple& args) const - { - return m_lhs.value(args) / m_rhs.value(args); - } - - template - [[nodiscard]] T calculate_grad(const std::tuple& args) const - { - auto g = m_rhs.value(args); - return (m_lhs.grad(args) * g - m_lhs.value(args) * m_rhs.grad(args)) / (g * g); - } - - private: - StoreValueOrRef m_lhs; - StoreValueOrRef m_rhs; - }; - - template ::ValueType> - [[nodiscard]] constexpr auto operator/(LhsT&& lhs, RhsT&& rhs) - { - return Quotient(std::forward(lhs), std::forward(rhs)); - } - - template ::ValueType> - [[nodiscard]] constexpr auto operator/(LhsT&& lhs, Id rhs) - { - return Quotient&&>(std::forward(lhs), Constant(rhs)); - } - - template ::ValueType> - [[nodiscard]] constexpr auto operator/(Id lhs, RhsT&& rhs) - { - return Quotient&&, RhsT&&>(Constant(lhs), std::forward(rhs)); - } - - template ::ValueType> - struct Negation : Evaluable> - { - using ValueType = T; - - static constexpr bool is_constant = Detail::AreAllConstantV; - - constexpr explicit Negation(ArgT&& x) : - m_x(std::forward(x)) - { - } - - template - [[nodiscard]] T calculate_value(const std::tuple& args) const - { - return -m_x.value(args); - } - - template - [[nodiscard]] T calculate_grad(const std::tuple& args) const - { - return -m_x.grad(args); - } - - private: - StoreValueOrRef m_x; - }; - - template ::ValueType> - [[nodiscard]] constexpr auto operator-(ArgT&& x) - { - return Negation(std::forward(x)); - } - - template ::ValueType> - struct Sigmoid : Evaluable> - { - using ValueType = T; - - static constexpr bool is_constant = Detail::AreAllConstantV; - - constexpr 
explicit Sigmoid(ArgT&& x) : - m_x(std::forward(x)) - { - } - - template - [[nodiscard]] T calculate_value(const std::tuple& args) const - { - return value_(m_x.value(args)); - } - - template - [[nodiscard]] T calculate_grad(const std::tuple& args) const - { - return m_x.grad(args) * grad_(m_x.value(args)); - } - - private: - StoreValueOrRef m_x; - - [[nodiscard]] T value_(T x) const - { - return 1.0 / (1.0 + std::exp(-x)); - } - - [[nodiscard]] T grad_(T x) const - { - return value_(x) * (1.0 - value_(x)); - } - }; - - template ::ValueType> - [[nodiscard]] constexpr auto sigmoid(ArgT&& x) - { - return Sigmoid(std::forward(x)); - } - - template ::ValueType> - struct Pow : Evaluable> - { - using ValueType = T; - - static constexpr bool is_constant = Detail::AreAllConstantV; - - constexpr explicit Pow(ArgT&& x, Id exponent) : - m_x(std::forward(x)), - m_exponent(std::move(exponent)) - { - } - - template - [[nodiscard]] T calculate_value(const std::tuple& args) const - { - return std::pow(m_x.value(args), m_exponent); - } - - template - [[nodiscard]] T calculate_grad(const std::tuple& args) const - { - return m_exponent * std::pow(m_x.value(args), m_exponent - T(1.0)) * m_x.grad(args); - } - - private: - StoreValueOrRef m_x; - T m_exponent; - }; - - template ::ValueType> - [[nodiscard]] constexpr auto pow(ArgT&& x, Id exp) - { - return Pow(std::forward(x), std::move(exp)); - } - - template ::ValueType> - struct Log : Evaluable> - { - using ValueType = T; - - static constexpr bool is_constant = Detail::AreAllConstantV; - - constexpr explicit Log(ArgT&& x) : - m_x(std::forward(x)) - { - } - - template - [[nodiscard]] T calculate_value(const std::tuple& args) const - { - return value_(m_x.value(args)); - } - - template - [[nodiscard]] T calculate_grad(const std::tuple& args) const - { - return m_x.grad(args) * grad_(m_x.value(args)); - } - - private: - StoreValueOrRef m_x; - - T value_(T x) const - { - return std::log(x); - } - - T grad_(T x) const - { - return 1.0 / x; - } - }; - - template ::ValueType> - [[nodiscard]] constexpr auto log(ArgT&& x) - { - return Log(std::forward(x)); - } - -} - -#endif \ No newline at end of file diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp index b28afa13..e5ddd6aa 100644 --- a/src/learn/gensfen.cpp +++ b/src/learn/gensfen.cpp @@ -13,7 +13,6 @@ #include "extra/nnue_data_binpack_format.h" #include "nnue/evaluate_nnue.h" -#include "nnue/evaluate_nnue_learner.h" #include "syzygy/tbprobe.h" @@ -493,8 +492,8 @@ namespace Learner // has it reached the max length or is a draw by fifty-move rule // or by 3-fold repetition - if (ply >= params.write_maxply - || pos.is_fifty_move_draw() + if (ply >= params.write_maxply + || pos.is_fifty_move_draw() || pos.is_three_fold_repetition()) { return 0; diff --git a/src/learn/gensfen_nonpv.cpp b/src/learn/gensfen_nonpv.cpp index ca365034..098511fe 100644 --- a/src/learn/gensfen_nonpv.cpp +++ b/src/learn/gensfen_nonpv.cpp @@ -13,7 +13,6 @@ #include "extra/nnue_data_binpack_format.h" #include "nnue/evaluate_nnue.h" -#include "nnue/evaluate_nnue_learner.h" #include "syzygy/tbprobe.h" diff --git a/src/learn/half_float.h b/src/learn/half_float.h deleted file mode 100644 index 5808a786..00000000 --- a/src/learn/half_float.h +++ /dev/null @@ -1,133 +0,0 @@ -#ifndef __HALF_FLOAT_H__ -#define __HALF_FLOAT_H__ - -// Half Float Library by yaneurao -// (16-bit float) - -// Floating point operation by 16bit type -// Assume that the float type code generated by the compiler is in IEEE 754 format and use it. 
- -#include "types.h" - -namespace HalfFloat -{ - // IEEE 754 float 32 format is : - // sign(1bit) + exponent(8bits) + fraction(23bits) = 32bits - // - // Our float16 format is : - // sign(1bit) + exponent(5bits) + fraction(10bits) = 16bits - union float32_converter - { - int32_t n; - float f; - }; - - - // 16-bit float - struct float16 - { - // --- constructors - - float16() {} - float16(int16_t n) { from_float((float)n); } - float16(int32_t n) { from_float((float)n); } - float16(float n) { from_float(n); } - float16(double n) { from_float((float)n); } - - // build from a float - void from_float(float f) { *this = to_float16(f); } - - // --- implicit converters - - operator int32_t() const { return (int32_t)to_float(*this); } - operator float() const { return to_float(*this); } - operator double() const { return double(to_float(*this)); } - - // --- operators - - float16 operator += (float16 rhs) { from_float(to_float(*this) + to_float(rhs)); return *this; } - float16 operator -= (float16 rhs) { from_float(to_float(*this) - to_float(rhs)); return *this; } - float16 operator *= (float16 rhs) { from_float(to_float(*this) * to_float(rhs)); return *this; } - float16 operator /= (float16 rhs) { from_float(to_float(*this) / to_float(rhs)); return *this; } - float16 operator + (float16 rhs) const { return float16(*this) += rhs; } - float16 operator - (float16 rhs) const { return float16(*this) -= rhs; } - float16 operator * (float16 rhs) const { return float16(*this) *= rhs; } - float16 operator / (float16 rhs) const { return float16(*this) /= rhs; } - float16 operator - () const { return float16(-to_float(*this)); } - bool operator == (float16 rhs) const { return this->v_ == rhs.v_; } - bool operator != (float16 rhs) const { return !(*this == rhs); } - - static void UnitTest() { unit_test(); } - - private: - - // --- entity - - uint16_t v_; - - // --- conversion between float and float16 - - static float16 to_float16(float f) - { - float32_converter c; - c.f = f; - u32 n = c.n; - - // The sign bit is MSB in common. - uint16_t sign_bit = (n >> 16) & 0x8000; - - // The exponent of IEEE 754's float 32 is biased +127 , so we change this bias into +15 and limited to 5-bit. - uint16_t exponent = (((n >> 23) - 127 + 15) & 0x1f) << 10; - - // The fraction is limited to 10-bit. - uint16_t fraction = (n >> (23-10)) & 0x3ff; - - float16 f_; - f_.v_ = sign_bit | exponent | fraction; - - return f_; - } - - static float to_float(float16 v) - { - u32 sign_bit = (v.v_ & 0x8000) << 16; - u32 exponent = ((((v.v_ >> 10) & 0x1f) - 15 + 127) & 0xff) << 23; - u32 fraction = (v.v_ & 0x3ff) << (23 - 10); - - float32_converter c; - c.n = sign_bit | exponent | fraction; - return c.f; - } - - // It is not a unit test, but I confirmed that it can be calculated. I'll fix the code later (maybe). 
- static void unit_test() - { - float16 a, b, c, d; - a = 1; - std::cout << (float)a << std::endl; - b = -118.625; - std::cout << (float)b << std::endl; - c = 2.5; - std::cout << (float)c << std::endl; - d = a + c; - std::cout << (float)d << std::endl; - - c *= 1.5; - std::cout << (float)c << std::endl; - - b /= 3; - std::cout << (float)b << std::endl; - - float f1 = 1.5; - a += f1; - std::cout << (float)a << std::endl; - - a += f1 * (float)a; - std::cout << (float)a << std::endl; - } - - }; - -} - -#endif // __HALF_FLOAT_H__ diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp deleted file mode 100644 index 9c4546a6..00000000 --- a/src/learn/learn.cpp +++ /dev/null @@ -1,1474 +0,0 @@ -// Learning routines: -// -// 1) Automatic generation of game records in .bin format -// → "gensfen" command -// -// 2) Learning evaluation function parameters from the generated .bin files -// → "learn" command -// -// → Shuffle in the teacher phase is also an extension of this command. -// Example) "learn shuffle" -// -// 3) Automatic generation of fixed traces -// → "makebook think" command -// → implemented in extra/book/book.cpp -// -// 4) Post-station automatic review mode -// → I will not be involved in the engine because it is a problem that the GUI should assist. -// etc.. - -#include "learn.h" - -#include "autograd.h" -#include "sfen_reader.h" - -#include "misc.h" -#include "position.h" -#include "thread.h" -#include "tt.h" -#include "uci.h" -#include "search.h" -#include "timeman.h" - -#include "nnue/evaluate_nnue.h" -#include "nnue/evaluate_nnue_learner.h" - -#include "syzygy/tbprobe.h" - -#include -#include -#include // std::exp(),std::pow(),std::log() -#include // memcpy() -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined (_OPENMP) -#include -#endif - -using namespace std; - -template -T operator +=(std::atomic& x, const T rhs) -{ - T old = x.load(std::memory_order_consume); - - // It is allowed that the value is rewritten from other thread at this timing. - // The idea that the value is not destroyed is good. - T desired = old + rhs; - while (!x.compare_exchange_weak(old, desired, std::memory_order_release, std::memory_order_consume)) - desired = old + rhs; - return desired; -} -template -T operator -= (std::atomic& x, const T rhs) { return x += -rhs; } - -namespace Learner -{ - static double winning_probability_coefficient = 1.0 / PawnValueEg / 4.0 * std::log(10.0); - - // Score scale factors. ex) If we set src_score_min_value = 0.0, - // src_score_max_value = 1.0, dest_score_min_value = 0.0, - // dest_score_max_value = 10000.0, [0.0, 1.0] will be scaled to [0, 10000]. - static double src_score_min_value = 0.0; - static double src_score_max_value = 1.0; - static double dest_score_min_value = 0.0; - static double dest_score_max_value = 1.0; - - // A constant used in elmo (WCSC27). Adjustment required. - // Since elmo does not internally divide the expression, the value is different. - // You can set this value with the learn command. 
- // 0.33 is equivalent to the constant (0.5) used in elmo (WCSC27) - static double elmo_lambda_low = 1.0; - static double elmo_lambda_high = 1.0; - static double elmo_lambda_limit = 32000; - - // Using stockfish's WDL with win rate model instead of sigmoid - static bool use_wdl = false; - - static void append_files_from_dir( - std::vector& filenames, - const std::string& base_dir, - const std::string& target_dir) - { - string kif_base_dir = Path::combine(base_dir, target_dir); - - sys::path p(kif_base_dir); // Origin of enumeration - std::for_each(sys::directory_iterator(p), sys::directory_iterator(), - [&](const sys::path& path) { - if (sys::is_regular_file(path)) - filenames.push_back(Path::combine(target_dir, path.filename().generic_string())); - }); - } - - static void rebase_files( - std::vector& filenames, - const std::string& base_dir) - { - for (auto& file : filenames) - { - file = Path::combine(base_dir, file); - } - } - - static double calculate_lambda(double teacher_signal) - { - // If the evaluation value in deep search exceeds elmo_lambda_limit - // then apply elmo_lambda_high instead of elmo_lambda_low. - const double lambda = - (std::abs(teacher_signal) >= elmo_lambda_limit) - ? elmo_lambda_high - : elmo_lambda_low; - - return lambda; - } - - // We use our own simple static autograd for automatic - // differentiation of the loss function. While it works it has it's caveats. - // To work fast enough it requires memoization and reference semantics. - // Memoization is mostly opaque to the user and is only per eval basis. - // As for reference semantics, we cannot copy every node, - // because we need a way to reuse computation. - // But we can't really use shared_ptr because of the overhead. That means - // that we have to ensure all parts of a loss expression are not destroyed - // before use. When lvalue references are used to construct a node it will - // store just a reference, it only perform a copy of the rvalue reference arguments. - // This means that we need some storage for the whole computation tree - // that keeps the values after function returns and never moves them to - // a different memory location. This means that we cannot use local - // variables and just return by value - because there may be dangling references left. - // We also cannot create a struct with this tree on demand because one cannot - // use `auto` as a struct members. This is a big issue, and the only way - // to solve it as of now is to use static thread_local variables and rely on the - // following assumptions: - // 1. the expression node must not change for the duration of the program - // within a single instance of a function. This is usually not a problem - // because almost all information is carried by the type. There is an - // exception though, we have ConstantRef and Constant nodes that - // do not encode the constants in the type, so it's possible - // that these nodes are different on the first call to the function - // then later. We MUST ensure that one function is only ever used - // for one specific expression. - // 2. thread_local variables are not expensive. Usually after creation - // it only requires a single unsynchronized boolean check and that's - // how most compilers implement it. - // - // So the general way to do things right now is to use static thread_local - // variables for all named autograd nodes. Results being nodes should be - // returned by reference, so that there's no need to copy the returned objects. 
- // Parameters being nodes should be taken by lvalue reference if they are - // used more than once (to enable reference semantics to reuse computation), - // but they can be rvalues and forward on first use if there's only one use - // of the node in the scope. - // We must keep in mind that the node tree created by such a function - // is never going to change as thread_local variables are initialized - // on first call. This means that one cannot use one function as a factory - // for different autograd expression trees. - - template - static auto& cross_entropy_( - ShallowT& q_, - TeacherT& p_, - ResultT& t_, - LambdaT& lambda_ - ) - { - using namespace Learner::Autograd::UnivariateStatic; - - constexpr double epsilon = 1e-12; - - static thread_local auto teacher_entropy_ = -(p_ * log(p_ + epsilon) + (1.0 - p_) * log(1.0 - p_ + epsilon)); - static thread_local auto outcome_entropy_ = -(t_ * log(t_ + epsilon) + (1.0 - t_) * log(1.0 - t_ + epsilon)); - static thread_local auto teacher_loss_ = -(p_ * log(q_ + epsilon) + (1.0 - p_) * log(1.0 - q_ + epsilon)); - static thread_local auto outcome_loss_ = -(t_ * log(q_ + epsilon) + (1.0 - t_) * log(1.0 - q_ + epsilon)); - static thread_local auto result_ = lambda_ * teacher_loss_ + (1.0 - lambda_) * outcome_loss_; - static thread_local auto entropy_ = lambda_ * teacher_entropy_ + (1.0 - lambda_) * outcome_entropy_; - static thread_local auto cross_entropy_ = result_ - entropy_; - - return cross_entropy_; - } - - template - static auto& scale_score_(ValueT&& v_) - { - using namespace Learner::Autograd::UnivariateStatic; - - // Normalize to [0.0, 1.0]. - static thread_local auto normalized_ = - (std::forward(v_) - ConstantRef(src_score_min_value)) - / (ConstantRef(src_score_max_value) - ConstantRef(src_score_min_value)); - - // Scale to [dest_score_min_value, dest_score_max_value]. - static thread_local auto scaled_ = - normalized_ - * (ConstantRef(dest_score_max_value) - ConstantRef(dest_score_min_value)) - + ConstantRef(dest_score_min_value); - - return scaled_; - } - - static Value scale_score(Value v) - { - // Normalize to [0.0, 1.0]. - auto normalized = - ((double)v - src_score_min_value) - / (src_score_max_value - src_score_min_value); - - // Scale to [dest_score_min_value, dest_score_max_value]. - auto scaled = - normalized - * (dest_score_max_value - dest_score_min_value) - + dest_score_min_value; - - return Value(scaled); - } - - template - static auto& expected_perf_(ValueT&& v_) - { - using namespace Learner::Autograd::UnivariateStatic; - - static thread_local auto perf_ = sigmoid(std::forward(v_) * ConstantRef(winning_probability_coefficient)); - - return perf_; - } - - template - static auto& expected_perf_use_wdl_( - ValueT& v_, - PlyT&& ply_ - ) - { - using namespace Learner::Autograd::UnivariateStatic; - - // Coefficients of a 3rd order polynomial fit based on fishtest data - // for two parameters needed to transform eval to the argument of a - // logistic function. 
- static constexpr T as[] = { -8.24404295, 64.23892342, -95.73056462, 153.86478679 }; - static constexpr T bs[] = { -3.37154371, 28.44489198, -56.67657741, 72.05858751 }; - - // The model captures only up to 240 plies, so limit input (and rescale) - static thread_local auto m_ = std::forward(ply_) / 64.0; - - static thread_local auto a_ = (((as[0] * m_ + as[1]) * m_ + as[2]) * m_) + as[3]; - static thread_local auto b_ = (((bs[0] * m_ + bs[1]) * m_ + bs[2]) * m_) + bs[3]; - - // Return win rate in per mille - static thread_local auto sv_ = (v_ - a_) / b_; - static thread_local auto svn_ = (-v_ - a_) / b_; - - static thread_local auto win_pct_ = sigmoid(sv_); - static thread_local auto loss_pct_ = sigmoid(svn_); - - static thread_local auto draw_pct_ = 1.0 - win_pct_ - loss_pct_; - - static thread_local auto perf_ = win_pct_ + draw_pct_ * 0.5; - - return perf_; - } - - static double expected_perf_use_wdl( - Value v, - int ply - ) - { - // Coefficients of a 3rd order polynomial fit based on fishtest data - // for two parameters needed to transform eval to the argument of a - // logistic function. - static constexpr double as[] = { -8.24404295, 64.23892342, -95.73056462, 153.86478679 }; - static constexpr double bs[] = { -3.37154371, 28.44489198, -56.67657741, 72.05858751 }; - - // The model captures only up to 240 plies, so limit input (and rescale) - auto m = ply / 64.0; - - auto a = (((as[0] * m + as[1]) * m + as[2]) * m) + as[3]; - auto b = (((bs[0] * m + bs[1]) * m + bs[2]) * m) + bs[3]; - - // Return win rate in per mille - auto sv = ((double)v - a) / b; - auto svn = ((double)-v - a) / b; - - auto win_pct = Math::sigmoid(sv); - auto loss_pct = Math::sigmoid(svn); - - auto draw_pct = 1.0 - win_pct - loss_pct; - - auto perf = win_pct + draw_pct * 0.5; - - return perf; - } - - [[maybe_unused]] static ValueWithGrad get_loss_noob( - Value shallow, Value teacher_signal, int result, int /* ply */) - { - using namespace Learner::Autograd::UnivariateStatic; - - static thread_local auto q_ = VariableParameter{}; - static thread_local auto p_ = ConstantParameter{}; - static thread_local auto loss_ = pow(q_ - p_, 2.0) * (1.0 / (2400.0 * 2.0 * 600.0)); - - auto args = std::tuple( - (double)shallow, - (double)teacher_signal, - (double)result, - calculate_lambda(teacher_signal) - ); - - return loss_.eval(args); - } - - static auto& get_loss_cross_entropy_() - { - using namespace Learner::Autograd::UnivariateStatic; - - static thread_local auto& q_ = expected_perf_(VariableParameter{}); - static thread_local auto& p_ = expected_perf_(scale_score_(ConstantParameter{})); - static thread_local auto t_ = (ConstantParameter{} + 1.0) * 0.5; - static thread_local auto lambda_ = ConstantParameter{}; - static thread_local auto& loss_ = cross_entropy_(q_, p_, t_, lambda_); - - return loss_; - } - - static auto get_loss_cross_entropy_args( - Value shallow, Value teacher_signal, int result) - { - return std::tuple( - (double)shallow, - (double)teacher_signal, - (double)result, - calculate_lambda(teacher_signal) - ); - } - - static ValueWithGrad get_loss_cross_entropy( - Value shallow, Value teacher_signal, int result, int /* ply */) - { - using namespace Learner::Autograd::UnivariateStatic; - - static thread_local auto& loss_ = get_loss_cross_entropy_(); - - auto args = get_loss_cross_entropy_args(shallow, teacher_signal, result); - - return loss_.eval(args); - } - - static ValueWithGrad get_loss_cross_entropy_no_grad( - Value shallow, Value teacher_signal, int result, int /* ply */) - { - using namespace 
Learner::Autograd::UnivariateStatic; - - static thread_local auto& loss_ = get_loss_cross_entropy_(); - - auto args = get_loss_cross_entropy_args(shallow, teacher_signal, result); - - return { loss_.value(args), 0.0 }; - } - - static auto& get_loss_cross_entropy_use_wdl_() - { - using namespace Learner::Autograd::UnivariateStatic; - - static thread_local auto ply_ = ConstantParameter{}; - static thread_local auto shallow_ = VariableParameter{}; - static thread_local auto& q_ = expected_perf_use_wdl_(shallow_, ply_); - // We could do just this but MSVC crashes with an internal compiler error :( - // static thread_local auto& scaled_teacher_ = scale_score_(ConstantParameter{}); - // static thread_local auto& p_ = expected_perf_use_wdl_(scaled_teacher_, ply_); - static thread_local auto p_ = ConstantParameter{}; - static thread_local auto t_ = (ConstantParameter{} + 1.0) * 0.5; - static thread_local auto lambda_ = ConstantParameter{}; - static thread_local auto& loss_ = cross_entropy_(q_, p_, t_, lambda_); - - return loss_; - } - - static auto get_loss_cross_entropy_use_wdl_args( - Value shallow, Value teacher_signal, int result, int ply) - { - return std::tuple( - (double)shallow, - // This is required because otherwise MSVC crashes :( - expected_perf_use_wdl(scale_score(teacher_signal), ply), - (double)result, - calculate_lambda(teacher_signal), - (double)std::min(240, ply) - ); - } - - static ValueWithGrad get_loss_cross_entropy_use_wdl( - Value shallow, Value teacher_signal, int result, int ply) - { - using namespace Learner::Autograd::UnivariateStatic; - - static thread_local auto& loss_ = get_loss_cross_entropy_use_wdl_(); - - auto args = get_loss_cross_entropy_use_wdl_args(shallow, teacher_signal, result, ply); - - return loss_.eval(args); - } - - static ValueWithGrad get_loss_cross_entropy_use_wdl_no_grad( - Value shallow, Value teacher_signal, int result, int ply) - { - using namespace Learner::Autograd::UnivariateStatic; - - static thread_local auto& loss_ = get_loss_cross_entropy_use_wdl_(); - - auto args = get_loss_cross_entropy_use_wdl_args(shallow, teacher_signal, result, ply); - - return { loss_.value(args), 0.0 }; - } - - static auto get_loss(Value shallow, Value teacher_signal, int result, int ply) - { - using namespace Learner::Autograd::UnivariateStatic; - - if (use_wdl) - { - return get_loss_cross_entropy_use_wdl(shallow, teacher_signal, result, ply); - } - else - { - return get_loss_cross_entropy(shallow, teacher_signal, result, ply); - } - } - - static auto get_loss_no_grad(Value shallow, Value teacher_signal, int result, int ply) - { - using namespace Learner::Autograd::UnivariateStatic; - - if (use_wdl) - { - return get_loss_cross_entropy_use_wdl_no_grad(shallow, teacher_signal, result, ply); - } - else - { - return get_loss_cross_entropy_no_grad(shallow, teacher_signal, result, ply); - } - } - - [[maybe_unused]] static auto get_loss( - Value teacher_signal, - Value shallow, - const PackedSfenValue& psv) - { - return get_loss(shallow, teacher_signal, psv.game_result, psv.gamePly); - } - - static auto get_loss_no_grad( - Value teacher_signal, - Value shallow, - const PackedSfenValue& psv) - { - return get_loss_no_grad(shallow, teacher_signal, psv.game_result, psv.gamePly); - } - - // Class to generate sfen with multiple threads - struct LearnerThink - { - struct Params - { - // Mini batch size size. Be sure to set it on the side that uses this class. 
- uint64_t mini_batch_size = LEARN_MINI_BATCH_SIZE; - - // Number of phases used for calculation such as mse - // mini-batch size = 1M is standard, so 0.2% of that should be negligible in terms of time. - // Since search() is performed with depth = 1 in calculation of - // move match rate, simple comparison is not possible... - uint64_t validation_count = 2000; - - // Option to exclude early stage from learning - int reduction_gameply = 1; - - // If the absolute value of the evaluation value of the deep search - // of the teacher phase exceeds this value, discard the teacher phase. - int eval_limit = 32000; - - // Flag whether to dig a folder each time the evaluation function is saved. - // If true, do not dig the folder. - bool save_only_once = false; - - bool shuffle = true; - - bool verbose = false; - - double newbob_decay = 0.5; - int newbob_num_trials = 4; - uint64_t auto_lr_drop = 0; - - std::string best_nn_directory; - - uint64_t eval_save_interval = LEARN_EVAL_SAVE_INTERVAL; - uint64_t loss_output_interval = 1'000'000; - - size_t sfen_read_size = SfenReader::DEFAULT_SFEN_READ_SIZE; - size_t thread_buffer_size = SfenReader::DEFAULT_THREAD_BUFFER_SIZE; - - bool use_draw_games_in_training = true; - bool use_draw_games_in_validation = true; - bool skip_duplicated_positions_in_training = true; - - bool assume_quiet = false; - bool smart_fen_skipping = false; - bool smart_fen_skipping_for_validation = false; - - double learning_rate = 1.0; - double warmup_learning_rate = 0.1; - double max_grad = 1.0; - - string validation_set_file_name; - string seed; - - std::vector filenames; - - uint64_t num_threads; - - void enforce_constraints() - { - num_threads = Options["Threads"]; - - if (loss_output_interval == 0) - { - loss_output_interval = LEARN_RMSE_OUTPUT_INTERVAL * mini_batch_size; - } - - // If reduction_gameply is set to 0, rand(0) will be divided by 0, so correct it to 1. - reduction_gameply = max(reduction_gameply, 1); - - if (newbob_decay != 1.0 && !Options["SkipLoadingEval"]) { - // Save the current net to [EvalSaveDir]\original. - Eval::NNUE::save_eval("original"); - - // Set the folder above to best_nn_directory so that the trainer can - // resotre the network parameters from the original net file. - best_nn_directory = - Path::combine(Options["EvalSaveDir"], "original"); - } - } - }; - - LearnerThink(const Params& prm) : - params(prm), - init_prng(prm.seed), - train_sr( - prm.filenames, - prm.shuffle, - SfenReaderMode::Cyclic, - prm.num_threads, - std::to_string(init_prng.next_random_seed()), - prm.sfen_read_size, - prm.thread_buffer_size), - validation_sr( - prm.validation_set_file_name.empty() ? 
prm.filenames : std::vector{ prm.validation_set_file_name }, - prm.shuffle, - SfenReaderMode::Cyclic, - 1, - std::to_string(init_prng.next_random_seed()), - std::min(prm.validation_count * 10, 1000000), - prm.thread_buffer_size), - learn_loss_sum{} - { - save_count = 0; - loss_output_count = 0; - last_lr_drop = 0; - best_loss = std::numeric_limits::infinity(); - latest_loss_sum = 0.0; - latest_loss_count = 0; - total_done = 0; - trials = params.newbob_num_trials; - dir_number = 0; - - prngs.reserve(prm.num_threads); - for (uint64_t i = 0; i < prm.num_threads; ++i) - { - prngs.emplace_back(init_prng.next_random_seed()); - } - } - - void learn(uint64_t epochs, uint64_t warmup_epochs = 0); - - private: - static void set_learning_search_limits(); - - PSVector fetch_next_validation_set(); - - void learn_worker(Thread& th, std::atomic& counter, uint64_t limit); - - void update_weights(const PSVector& psv, uint64_t epoch); - void update_weights_warmup(uint64_t warmup_epoch); - - void calc_loss(const PSVector& psv, uint64_t epoch); - - void calc_loss_worker( - Thread& th, - std::atomic& counter, - const PSVector& psv, - Loss& test_loss_sum, - atomic& sum_norm, - atomic& move_accord_count, - atomic& sum_one_over_move_count - ); - - bool has_depth1_move_agreement(Position& pos, Move pvmove); - - bool check_progress(); - - // save merit function parameters to a file - bool save(bool is_final = false); - - Params params; - - PRNG init_prng; - std::vector prngs; - - // sfen reader - SfenReader train_sr; - SfenReader validation_sr; - - uint64_t save_count; - uint64_t loss_output_count; - - std::atomic stop_flag; - - uint64_t total_done; - - uint64_t last_lr_drop; - double best_loss; - double latest_loss_sum; - uint64_t latest_loss_count; - - int trials; - int dir_number; - - // For calculation of learning data loss - Loss learn_loss_sum; - }; - - void LearnerThink::set_learning_search_limits() - { - Threads.main()->ponder = false; - - // About Search::Limits - // Be careful because this member variable is global and affects other threads. - auto& limits = Search::Limits; - - limits.startTime = now(); - - // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done) - limits.infinite = true; - - // Since PV is an obstacle when displayed, erase it. - limits.silent = true; - - // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it. - limits.nodes = 0; - - // depth is also processed by the one passed as an argument of Learner::search(). - limits.depth = 0; - } - - PSVector LearnerThink::fetch_next_validation_set() - { - PSVector validation_data; - - auto mainThread = Threads.main(); - mainThread->execute_with_worker([&validation_data, this](auto& th){ - auto do_include_predicate = [&th, this](const PackedSfenValue& ps) -> bool { - if (params.eval_limit < abs(ps.score)) - return false; - - if (!params.use_draw_games_in_validation && ps.game_result == 0) - return false; - - if (params.smart_fen_skipping_for_validation) - { - StateInfo si; - auto& pos = th.rootPos; - if (pos.set_from_packed_sfen(ps.sfen, &si, &th) != 0) - return false; - - if (pos.capture_or_promotion((Move)ps.move) || pos.checkers()) - return false; - } - - return true; - }; - - validation_data = validation_sr.read_some( - params.validation_count, - params.validation_count * 100, // to have a reasonable bound on the running time. 
- do_include_predicate - ); - }); - mainThread->wait_for_worker_finished(); - - return validation_data; - } - - void LearnerThink::learn(uint64_t epochs, uint64_t warmup_epochs) - { -#if defined(_OPENMP) - omp_set_num_threads((int)Options["Threads"]); -#endif - - set_learning_search_limits(); - - Eval::NNUE::verify_any_net_loaded(); - - const PSVector validation_data = fetch_next_validation_set(); - - if (validation_data.size() != params.validation_count) - { - auto out = sync_region_cout.new_region(); - out - << "INFO (learn): Error reading validation data. Read " << validation_data.size() - << " out of " << params.validation_count << '\n' - << "INFO (learn): This either means that less than 1% of the validation data passed the filter" - << " or the file is empty\n"; - - return; - } - - stop_flag = false; - - if (warmup_epochs > 0) - { - cout << "Doing " << warmup_epochs << " warmup epochs." << endl; - } - - for(uint64_t warmup_epoch = 1; warmup_epoch <= warmup_epochs; ++warmup_epoch) - { - std::atomic counter{0}; - - Threads.execute_with_workers([this, &counter](auto& th){ - learn_worker(th, counter, params.mini_batch_size); - }); - - total_done += params.mini_batch_size; - - Threads.wait_for_workers_finished(); - - if (stop_flag) - break; - - update_weights_warmup(warmup_epoch); - - if (stop_flag) - break; - - cout << "Finished " << warmup_epoch << " out of " << warmup_epochs << " warmup epochs." << endl; - } - - if (params.newbob_decay != 1.0) { - - calc_loss(validation_data, 0); - - best_loss = latest_loss_sum / latest_loss_count; - latest_loss_sum = 0.0; - latest_loss_count = 0; - - auto out = sync_region_cout.new_region(); - out << "INFO (learn): initial loss = " << best_loss << endl; - } - - for(uint64_t epoch = 1; epoch <= epochs; ++epoch) - { - std::atomic counter{0}; - - Threads.execute_with_workers([this, &counter](auto& th){ - learn_worker(th, counter, params.mini_batch_size); - }); - - total_done += params.mini_batch_size; - - Threads.wait_for_workers_finished(); - - if (stop_flag) - break; - - update_weights(validation_data, epoch); - - if (stop_flag) - break; - } - - Eval::NNUE::finalize_net(); - - save(true); - } - - void LearnerThink::learn_worker(Thread& th, std::atomic& counter, uint64_t limit) - { - const auto thread_id = th.thread_idx(); - auto& pos = th.rootPos; - auto& prng = prngs[th.thread_idx()]; - - std::vector> state(MAX_PLY); - - while(!stop_flag) - { - const auto iter = counter.fetch_add(1); - if (iter >= limit) - break; - - PackedSfenValue ps; - - RETRY_READ:; - - if (!train_sr.read_to_thread_buffer(thread_id, ps)) - { - // If we ran out of data we stop completely - // because there's nothing left to do. - stop_flag = true; - break; - } - - if (params.eval_limit < abs(ps.score)) - goto RETRY_READ; - - if (!params.use_draw_games_in_training && ps.game_result == 0) - goto RETRY_READ; - - // Skip over the opening phase - if (ps.gamePly < prng.rand(params.reduction_gameply)) - goto RETRY_READ; - - StateInfo si; - if (pos.set_from_packed_sfen(ps.sfen, &si, &th) != 0) - { - // Malformed sfen - auto out = sync_region_cout.new_region(); - out << "ERROR: illigal packed sfen = " << pos.fen() << endl; - goto RETRY_READ; - } - - const auto rootColor = pos.side_to_move(); - - // A function that adds the current `pos` and `ps` - // to the training set. 
- auto pos_add_grad = [&]() { - - // Evaluation value of deep search - const Value shallow_value = Eval::evaluate(pos); - - Eval::NNUE::add_example(pos, rootColor, shallow_value, ps, 1.0); - }; - - if (!pos.pseudo_legal((Move)ps.move) || !pos.legal((Move)ps.move)) - { - goto RETRY_READ; - } - - // We don't need to qsearch when doing smart skipping - if (!params.assume_quiet && !params.smart_fen_skipping) - { - int ply = 0; - pos.do_move((Move)ps.move, state[ply++]); - - // Evaluation value of shallow search (qsearch) - const auto [_, pv] = Search::qsearch(pos); - - for (auto m : pv) - { - pos.do_move(m, state[ply++]); - } - } - - if (params.smart_fen_skipping - && (pos.capture_or_promotion((Move)ps.move) - || pos.checkers())) - { - goto RETRY_READ; - } - - // We want to position being trained on not to be terminal - if (MoveList(pos).size() == 0) - goto RETRY_READ; - - // Since we have reached the end phase of PV, add the slope here. - pos_add_grad(); - } - } - - void LearnerThink::update_weights_warmup(uint64_t warmup_epoch) - { - // I'm not sure this fencing is correct. But either way there - // should be no real issues happening since - // the read/write phases are isolated. - atomic_thread_fence(memory_order_seq_cst); - Eval::NNUE::update_parameters( - Threads, warmup_epoch, params.verbose, params.warmup_learning_rate, params.max_grad, get_loss); - atomic_thread_fence(memory_order_seq_cst); - } - - void LearnerThink::update_weights(const PSVector& psv, uint64_t epoch) - { - // I'm not sure this fencing is correct. But either way there - // should be no real issues happening since - // the read/write phases are isolated. - atomic_thread_fence(memory_order_seq_cst); - learn_loss_sum += Eval::NNUE::update_parameters( - Threads, epoch, params.verbose, params.learning_rate, params.max_grad, get_loss); - atomic_thread_fence(memory_order_seq_cst); - - if (++save_count * params.mini_batch_size >= params.eval_save_interval) - { - save_count = 0; - - const bool converged = save(); - if (converged) - { - stop_flag = true; - return; - } - } - - if (++loss_output_count * params.mini_batch_size >= params.loss_output_interval) - { - loss_output_count = 0; - - // loss calculation - calc_loss(psv, epoch); - - Eval::NNUE::check_health(); - } - } - - void LearnerThink::calc_loss(const PSVector& psv, uint64_t epoch) - { - TT.new_search(); - TimePoint elapsed = now() - Search::Limits.startTime + 1; - - auto out = sync_region_cout.new_region(); - - out << "\n"; - out << "PROGRESS (calc_loss): " << now_string() - << ", " << total_done << " sfens" - << ", " << total_done * 1000 / elapsed << " sfens/second" - << ", epoch " << epoch - << endl; - - out << " - learning rate = " << params.learning_rate << endl; - - // For calculation of verification data loss - Loss test_loss_sum{}; - - // norm for learning - atomic sum_norm{0.0}; - - // The number of times the pv first move of deep - // search matches the pv first move of search(1). - atomic move_accord_count{0}; - - // If there is 10 legal moves then 0.1 will be added. - // This happens for each position tested. 
- // Effectively at the end we have the random move accuracy - // multiplied by the number of positions, which is psv.size() - atomic sum_one_over_move_count{0.0}; - - auto mainThread = Threads.main(); - mainThread->execute_with_worker([&out](auto& th){ - auto& pos = th.rootPos; - StateInfo si; - pos.set(StartFEN, false, &si, &th); - out << " - startpos eval = " << Eval::evaluate(pos) << endl; - }); - mainThread->wait_for_worker_finished(); - - // The number of tasks to do. - atomic counter{0}; - Threads.execute_with_workers([&](auto& th){ - calc_loss_worker( - th, - counter, - psv, - test_loss_sum, - sum_norm, - move_accord_count, - sum_one_over_move_count - ); - }); - Threads.wait_for_workers_finished(); - - latest_loss_sum += test_loss_sum.value(); - latest_loss_count += psv.size(); - - if (psv.size() && test_loss_sum.count() > 0) - { - test_loss_sum.print_only_loss("val", out); - - if (learn_loss_sum.count() > 0) - { - learn_loss_sum.print_with_grad("train", out); - } - - out << " - norm = " << sum_norm << endl; - out << " - move accuracy = " << (move_accord_count * 100.0 / psv.size()) << "%" << endl; - out << " - random move accuracy = " << (sum_one_over_move_count * 100.0 / psv.size()) << "%" << endl; - } - else - { - out << "ERROR: psv.size() = " << psv.size() << " , done = " << test_loss_sum.count() << endl; - } - - learn_loss_sum.reset(); - } - - void LearnerThink::calc_loss_worker( - Thread& th, - std::atomic& counter, - const PSVector& psv, - Loss& test_loss_sum, - atomic& sum_norm, - atomic& move_accord_count, - atomic& sum_one_over_move_count - ) - { - Loss local_loss_sum{}; - double local_sum_one_over_move_count = 0.0; - auto& pos = th.rootPos; - - for(;;) - { - const auto task_id = counter.fetch_add(1); - if (task_id >= psv.size()) - { - break; - } - - const auto& ps = psv[task_id]; - - StateInfo si; - if (pos.set_from_packed_sfen(ps.sfen, &si, &th) != 0) - { - cout << "Error! : illegal packed sfen " << pos.fen() << endl; - continue; - } - - const Value shallow_value = Eval::evaluate(pos); - - // Evaluation value of deep search - const auto deep_value = (Value)ps.score; - - const auto loss = get_loss_no_grad( - deep_value, - shallow_value, - ps); - - local_loss_sum += loss; - sum_norm += (double)abs(shallow_value); - - // Threat all moves with equal scores as first. This is up to move ordering. - if (has_depth1_move_agreement(pos, (Move)ps.move)) - move_accord_count.fetch_add(1, std::memory_order_relaxed); - - local_sum_one_over_move_count += 1.0 / static_cast(MoveList(pos).size()); - } - - sum_one_over_move_count += local_sum_one_over_move_count; - test_loss_sum += local_loss_sum; - } - - bool LearnerThink::has_depth1_move_agreement(Position& pos, Move pvmove) - { - // Determine if the depth 1 search pv matches the move from the dataset. - // Do a manual depth 1 search so we're not affected by previous searches. - std::vector> child_scores; - - // Call evaluate once for the rootpos so that the evals - // for children moves use incremental feature transformer updates. - (void)Eval::evaluate(pos); - - // Just to get guaranteed alignment. - std::vector> states(1); - auto legal_moves = MoveList(pos); - for (auto m : legal_moves) - { - pos.do_move(m, states[0]); - // We don't care if the king is in check or stuff like that. - // not a big issue and nnue should digest all. 
- auto value = -Eval::evaluate(pos); - child_scores.emplace_back(m, value); - pos.undo_move(m); - } - - if (child_scores.empty()) - return false; - - std::sort( - child_scores.begin(), - child_scores.end(), - [](auto& lhs, auto& rhs) { return lhs.second > rhs.second; } - ); - - // Require the best move to have strictly higher score than the next one. - return - child_scores[0].first == pvmove - && (child_scores.size() == 1 - || child_scores[1].second != child_scores[0].second); - } - - bool LearnerThink::check_progress() - { - auto out = sync_region_cout.new_region(); - - const double latest_loss = latest_loss_sum / latest_loss_count; - bool converged = false; - latest_loss_sum = 0.0; - latest_loss_count = 0; - - auto drop_lr = [&]() { - last_lr_drop = total_done; - - out - << " - reducing learning rate from " << params.learning_rate - << " to " << (params.learning_rate * params.newbob_decay) - << " (" << trials << " more trials)" << endl; - - params.learning_rate *= params.newbob_decay; - }; - - auto accept = [&]() { - out << " - loss = " << latest_loss << " < best (" << best_loss << "), accepted" << endl; - - best_loss = latest_loss; - trials = params.newbob_num_trials; - }; - - auto reject = [&]() { - out << " - loss = " << latest_loss << " >= best (" << best_loss << "), rejected" << endl; - - --trials; - if (trials > 0) - { - drop_lr(); - return false; - } - else - { - return true; - } - }; - - out << "INFO (learning_rate):" << endl; - - if (params.auto_lr_drop) - { - accept(); - - if (total_done >= last_lr_drop + params.auto_lr_drop) - { - drop_lr(); - } - } - else if (latest_loss < best_loss) - { - accept(); - } - else - { - converged = reject(); - } - - if (converged) - { - out << " - converged" << endl; - } - - return converged; - } - - // Write evaluation function file. - bool LearnerThink::save(bool is_final) - { - // Each time you save, change the extension part of the file name like "0","1","2",.. - // (Because I want to compare the winning rate for each evaluation function parameter later) - - bool converged = false; - - if (params.save_only_once) - { - // When EVAL_SAVE_ONLY_ONCE is defined, - // Do not dig a subfolder because I want to save it only once. - Eval::NNUE::save_eval(""); - } - else if (is_final) - { - Eval::NNUE::save_eval("final"); - converged = true; - } - else - { - // TODO: consider naming the output directory by epoch. - const std::string dir_name = std::to_string(dir_number++); - Eval::NNUE::save_eval(dir_name); - - if (params.newbob_decay != 1.0 && latest_loss_count > 0) - { - converged = check_progress(); - params.best_nn_directory = Path::combine((std::string)Options["EvalSaveDir"], dir_name); - } - } - - return converged; - } - - // Learning from the generated game record - void learn(istringstream& is) - { - LearnerThink::Params params; - - // Number of epochs - uint64_t epochs = std::numeric_limits::max(); - uint64_t warmup_epochs = 0; - - // Game file storage folder (get game file with relative path from here) - string base_dir; - string target_dir; - - uint64_t nn_batch_size = 1000; - string nn_options; - - auto out = sync_region_cout.new_region(); - - // Assume the filenames are staggered. - while (true) - { - string option; - is >> option; - - if (option == "") - break; - - // specify the number of phases of mini-batch - if (option == "bat") - { - is >> params.mini_batch_size; - params.mini_batch_size *= 10000; // Unit is ten thousand - } - - // Specify the folder in which the game record is stored and make it the rooting target. 
- else if (option == "targetdir") is >> target_dir; - else if (option == "targetfile") - { - std::string filename; - is >> filename; - params.filenames.push_back(filename); - } - else if (option == "validation_count") is >> params.validation_count; - - // Specify the number of loops - else if (option == "epochs") is >> epochs; - else if (option == "warmup_epochs") is >> warmup_epochs; - - // Game file storage folder (get game file with relative path from here) - else if (option == "basedir") is >> base_dir; - - // Mini batch size - else if (option == "batchsize" - || option == "epoch_size") - is >> params.mini_batch_size; - - // learning rate - else if (option == "lr") is >> params.learning_rate; - else if (option == "warmup_lr") is >> params.warmup_learning_rate; - else if (option == "max_grad") is >> params.max_grad; - - // Accept also the old option name. - else if (option == "use_draw_in_training" - || option == "use_draw_games_in_training") - is >> params.use_draw_games_in_training; - - // Accept also the old option name. - else if (option == "use_draw_in_validation" - || option == "use_draw_games_in_validation") - is >> params.use_draw_games_in_validation; - - // Accept also the old option name. - else if (option == "use_hash_in_training" - || option == "skip_duplicated_positions_in_training") - is >> params.skip_duplicated_positions_in_training; - - else if (option == "winning_probability_coefficient") - is >> winning_probability_coefficient; - - // Using WDL with win rate model instead of sigmoid - else if (option == "use_wdl") is >> use_wdl; - - - // LAMBDA - else if (option == "lambda") is >> elmo_lambda_low; - else if (option == "lambda2") is >> elmo_lambda_high; - else if (option == "lambda_limit") is >> elmo_lambda_limit; - - else if (option == "reduction_gameply") is >> params.reduction_gameply; - - else if (option == "eval_limit") is >> params.eval_limit; - else if (option == "save_only_once") params.save_only_once = true; - else if (option == "no_shuffle") params.shuffle = false; - - else if (option == "nn_batch_size" - || option == "batch_size") - is >> nn_batch_size; - else if (option == "newbob_decay" - || option == "lr_step") - is >> params.newbob_decay; - else if (option == "newbob_num_trials" - || option == "max_consecutive_rejections") - is >> params.newbob_num_trials; - else if (option == "nn_options") is >> nn_options; - else if (option == "auto_lr_drop") is >> params.auto_lr_drop; - - else if (option == "eval_save_interval") is >> params.eval_save_interval; - else if (option == "loss_output_interval") is >> params.loss_output_interval; - else if (option == "validation_set_file_name") is >> params.validation_set_file_name; - - else if (option == "src_score_min_value") is >> src_score_min_value; - else if (option == "src_score_max_value") is >> src_score_max_value; - else if (option == "dest_score_min_value") is >> dest_score_min_value; - else if (option == "dest_score_max_value") is >> dest_score_max_value; - - else if (option == "sfen_read_size") is >> params.sfen_read_size; - else if (option == "thread_buffer_size") is >> params.thread_buffer_size; - - else if (option == "seed") is >> params.seed; - else if (option == "set_recommended_uci_options") - { - UCI::setoption("Use NNUE", "pure"); - UCI::setoption("MultiPV", "1"); - UCI::setoption("Contempt", "0"); - UCI::setoption("Skill Level", "20"); - UCI::setoption("UCI_Chess960", "false"); - UCI::setoption("UCI_AnalyseMode", "false"); - UCI::setoption("UCI_LimitStrength", "false"); - 
UCI::setoption("PruneAtShallowDepth", "false"); - UCI::setoption("EnableTranspositionTable", "false"); - } - else if (option == "verbose") params.verbose = true; - else if (option == "assume_quiet") params.assume_quiet = true; - else if (option == "smart_fen_skipping") params.smart_fen_skipping = true; - else if (option == "smart_fen_skipping_for_validation") params.smart_fen_skipping_for_validation = true; - else - { - out << "INFO: Unknown option: " << option << ". Ignoring.\n"; - } - } - - out << "INFO: Executing learn command\n"; - - // Issue a warning if OpenMP is disabled. -#if !defined(_OPENMP) - out << "WARNING: OpenMP disabled." << endl; -#endif - - params.enforce_constraints(); - - // Right now we only have the individual files. - // We need to apply base_dir here - if (!target_dir.empty()) - { - append_files_from_dir(params.filenames, base_dir, target_dir); - } - rebase_files(params.filenames, base_dir); - - out << "INFO: Input files:\n"; - for (auto s : params.filenames) - out << " - " << s << '\n'; - - out << "INFO: Parameters:\n"; - if (!params.validation_set_file_name.empty()) - { - out << " - validation set : " << params.validation_set_file_name << endl; - } - - out << " - validation count : " << params.validation_count << endl; - out << " - epochs : " << epochs << endl; - out << " - positions : " << epochs * params.mini_batch_size << endl; - out << " - warmup epochs : " << warmup_epochs << endl; - out << " - warmup positions : " << warmup_epochs * params.mini_batch_size << endl; - out << " - eval_limit : " << params.eval_limit << endl; - out << " - save_only_once : " << (params.save_only_once ? "true" : "false") << endl; - out << " - shuffle on read : " << (params.shuffle ? "true" : "false") << endl; - - out << " - Loss Function : " << LOSS_FUNCTION << endl; - out << " - minibatch size : " << params.mini_batch_size << endl; - - out << " - nn_batch_size : " << nn_batch_size << endl; - out << " - nn_options : " << nn_options << endl; - - out << " - learning rate : " << params.learning_rate << endl; - out << " - warmup learning rate : " << params.warmup_learning_rate << endl; - out << " - max_grad : " << params.max_grad << endl; - out << " - use draws in training : " << params.use_draw_games_in_training << endl; - out << " - use draws in validation : " << params.use_draw_games_in_validation << endl; - out << " - skip repeated positions : " << params.skip_duplicated_positions_in_training << endl; - - out << " - winning prob coeff : " << winning_probability_coefficient << endl; - out << " - use_wdl : " << use_wdl << endl; - - out << " - src_score_min_value : " << src_score_min_value << endl; - out << " - src_score_max_value : " << src_score_max_value << endl; - out << " - dest_score_min_value : " << dest_score_min_value << endl; - out << " - dest_score_max_value : " << dest_score_max_value << endl; - - out << " - reduction_gameply : " << params.reduction_gameply << endl; - - out << " - elmo_lambda_low : " << elmo_lambda_low << endl; - out << " - elmo_lambda_high : " << elmo_lambda_high << endl; - out << " - elmo_lambda_limit : " << elmo_lambda_limit << endl; - out << " - eval_save_interval : " << params.eval_save_interval << " sfens" << endl; - out << " - loss_output_interval : " << params.loss_output_interval << " sfens" << endl; - - out << " - sfen_read_size : " << params.sfen_read_size << endl; - out << " - thread_buffer_size : " << params.thread_buffer_size << endl; - - out << " - smart_fen_skipping : " << params.smart_fen_skipping << endl; - out << " - 
smart_fen_skipping_val : " << params.smart_fen_skipping_for_validation << endl; - - out << " - seed : " << params.seed << endl; - out << " - verbose : " << (params.verbose ? "true" : "false") << endl; - - if (params.auto_lr_drop) { - out << " - learning rate scheduling : every " << params.auto_lr_drop << " sfens" << endl; - } - else if (params.newbob_decay != 1.0) { - out << " - learning rate scheduling : newbob with decay" << endl; - out << " - newbob_decay : " << params.newbob_decay << endl; - out << " - newbob_num_trials : " << params.newbob_num_trials << endl; - } - else { - out << " - learning rate scheduling : fixed learning rate" << endl; - } - - out << endl; - - out << "INFO: Started initialization." << endl; - - Eval::NNUE::initialize_training(params.seed, out); - Eval::NNUE::set_batch_size(nn_batch_size); - Eval::NNUE::set_options(nn_options); - - LearnerThink learn_think(params); - - out << "Finished initialization." << endl; - - out.unlock(); - - // Start learning. - learn_think.learn(epochs, warmup_epochs); - } - -} // namespace Learner diff --git a/src/learn/learn.h b/src/learn/learn.h deleted file mode 100644 index 842ffad0..00000000 --- a/src/learn/learn.h +++ /dev/null @@ -1,148 +0,0 @@ -#ifndef _LEARN_H_ -#define _LEARN_H_ - -// ---------------------- -// Floating point for learning -// ---------------------- - -// If this is set to double, the calculation accuracy will be higher, but the weight array entangled memory will be doubled. -// Currently, if this is float, the weight array is 4.5 times the size of the evaluation function file. (About 4.5GB with KPPT) -// Even if it is a double type, there is almost no difference in the way of convergence, so fix it to float. - -// when using float -using LearnFloatType = float; - -// when using double -//typedef double LearnFloatType; - -// when using float16 -//#include "half_float.h" -//typedef HalfFloat::float16 LearnFloatType; - -// ====================== -// configure -// ====================== - -// ---------------------- -// Learning with the method of elmo (WCSC27) -// ---------------------- - -#define LOSS_FUNCTION "ELMO_METHOD(WCSC27)" - -// ---------------------- -// Definition of struct used in Learner -// ---------------------- - -#include "autograd.h" -#include "packed_sfen.h" - -#include "position.h" - -#include -#include -#include -#include - -namespace Learner -{ - // ---------------------- - // Settings for learning - // ---------------------- - - // mini-batch size. - // Calculate the gradient by combining this number of phases. - // If you make it smaller, the number of update_weights() will increase and the convergence will be faster. The gradient is incorrect. - // If you increase it, the number of update_weights() decreases, so the convergence will be slow. The slope will come out accurately. - // I don't think you need to change this value in most cases. - - constexpr std::size_t LEARN_MINI_BATCH_SIZE = 1000 * 1000 * 1; - - // Saving interval of evaluation function at learning. Save each time you learn this number of phases. - // Needless to say, the longer the saving interval, the shorter the learning time. - // Folder name is incremented for each save like 0/, 1/, 2/... - // By default, once every 1 billion phases. - constexpr std::size_t LEARN_EVAL_SAVE_INTERVAL = 100'000'000ULL; - - // Reduce the output of rmse during learning to 1 for this number of times. - // rmse calculation is done in one thread, so it takes some time, so reducing the output is effective. 
- constexpr std::size_t LEARN_RMSE_OUTPUT_INTERVAL = 1; - - // Learning from the generated game record - void learn(std::istringstream& is); - - using CalcLossFunc = ValueWithGrad(Value, Value, int, int); - - struct Loss - { - double value() const - { - return m_loss.value; - } - - double grad() const - { - return m_loss.grad; - } - - uint64_t count() const - { - return m_count; - } - - Loss() = default; - - Loss(const Loss& other) : - m_loss(other.m_loss), - m_count(other.m_count) - { - } - - Loss& operator += (const ValueWithGrad& rhs) - { - std::unique_lock lock(m_mutex); - - m_loss += rhs.abs(); - m_count += 1; - - return *this; - } - - Loss& operator += (const Loss& rhs) - { - std::unique_lock lock(m_mutex); - - m_loss += rhs.m_loss.abs(); - m_count += rhs.m_count; - - return *this; - } - - void reset() - { - std::unique_lock lock(m_mutex); - - m_loss = ValueWithGrad{ 0.0, 0.0 }; - m_count = 0; - } - - template - void print_with_grad(const std::string& prefix, StreamT& s) const - { - s << " - " << prefix << "_loss = " << m_loss.value / (double)m_count << std::endl; - s << " - " << prefix << "_grad_norm = " << m_loss.grad / (double)m_count << std::endl; - } - - template - void print_only_loss(const std::string& prefix, StreamT& s) const - { - s << " - " << prefix << "_loss = " << m_loss.value / (double)m_count << std::endl; - } - - private: - ValueWithGrad m_loss{ 0.0, 0.0 }; - uint64_t m_count{0}; - std::mutex m_mutex; - }; -} - -#endif // ifndef _LEARN_H_ diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp deleted file mode 100644 index 8d95221c..00000000 --- a/src/nnue/evaluate_nnue_learner.cpp +++ /dev/null @@ -1,341 +0,0 @@ -#include -#include - -#include "evaluate_nnue.h" -#include "evaluate_nnue_learner.h" - -#include "trainer/features/all_factorizers.h" - -#include "trainer/trainer_feature_transformer.h" -#include "trainer/trainer_input_slice.h" -#include "trainer/trainer_affine_transform.h" -#include "trainer/trainer_clipped_relu.h" -#include "trainer/trainer_sum.h" - -#include "position.h" -#include "uci.h" -#include "misc.h" -#include "thread_win32_osx.h" -#include "thread.h" - -// Code for learning NNUE evaluation function -namespace Eval::NNUE { - - namespace { - - // learning data - std::vector examples; - - // Mutex for exclusive control of examples - std::mutex examples_mutex; - - // number of samples in mini-batch - uint64_t batch_size; - - // random number generator - std::mt19937 rng; - - // learner - std::shared_ptr> trainer; - - // Tell the learner options such as hyperparameters - void send_messages(std::vector messages) { - for (auto& message : messages) { - trainer->send_message(&message); - assert(message.num_receivers > 0); - } - } - - } // namespace - - // Initialize learning - void initialize_training( - const std::string& seed, - SynchronizedRegionLogger::Region& out) { - -#if defined (OPENBLAS_VERSION) - openblas_set_num_threads(1); -#elif defined (INTEL_MKL_VERSION) - mkl_set_num_threads(1); -#endif - - out << "INFO (initialize_training): Initializing NN training for " - << get_architecture_string() << std::endl; - - out << std::endl; - - out << "Layers:\n" - << get_layers_info() << std::endl; - - out << std::endl; - - out << "Factorizers:\n" - << Features::Factorizer::get_factorizers_string() << std::endl; - - out << std::endl; - - assert(feature_transformer); - assert(network); - - trainer = Trainer::create(network.get(), feature_transformer.get()); - rng.seed(PRNG(seed).rand()); - - if (Options["SkipLoadingEval"]) { - 
out << "INFO (initialize_training): Performing random net initialization.\n"; - trainer->initialize(rng); - } - } - - // set the number of samples in the mini-batch - void set_batch_size(uint64_t size) { - assert(size > 0); - batch_size = size; - } - - // Set options such as hyperparameters - void set_options(const std::string& options) { - std::vector messages; - for (const auto& option : Algo::split(options, ',')) { - const auto fields = Algo::split(option, '='); - assert(fields.size() == 1 || fields.size() == 2); - - if (fields.size() == 1) { - messages.emplace_back(fields[0]); - } else { - messages.emplace_back(fields[0], fields[1]); - } - } - - send_messages(std::move(messages)); - } - - // Reread the evaluation function parameters for learning from the file - void restore_parameters(const std::string& dir_name) { - const std::string file_name = Path::combine(dir_name, NNUE::savedfileName); - std::ifstream stream(file_name, std::ios::binary); -#ifndef NDEBUG - bool result = -#endif - ReadParameters(stream); -#ifndef NDEBUG - assert(result); -#endif - - send_messages({{"reset"}}); - } - - void finalize_net() { - send_messages({{"clear_unobserved_feature_weights"}}); - } - - // Add 1 sample of learning data - void add_example( - Position& pos, - Color rootColor, - Value discrete_nn_eval, - const Learner::PackedSfenValue& psv, - double weight) { - - Example example; - if (rootColor == pos.side_to_move()) { - example.sign = 1; - } else { - example.sign = -1; - } - - example.discrete_nn_eval = discrete_nn_eval; - example.psv = psv; - example.weight = weight; - - Features::IndexList active_indices[2]; - for (const auto trigger : kRefreshTriggers) { - RawFeatures::append_active_indices(pos, trigger, active_indices); - } - - if (pos.side_to_move() != WHITE) { - active_indices[0].swap(active_indices[1]); - } - - static thread_local std::vector s_training_features; - auto& training_features = s_training_features; - - for (const auto color : Colors) { - training_features.clear(); - - for (const auto base_index : active_indices[color]) { - static_assert(Features::Factorizer::get_dimensions() < - (1 << TrainingFeature::kIndexBits), ""); - Features::Factorizer::append_training_features( - base_index, &training_features); - } - - std::sort(training_features.begin(), training_features.end()); - - auto& unique_features = example.training_features[color]; - unique_features.reserve(training_features.size()); - for (const auto& feature : training_features) { - if (!unique_features.empty() && - feature.get_index() == unique_features.back().get_index()) { - - unique_features.back() += feature; - } else { - unique_features.push_back(feature); - } - } - } - - std::lock_guard lock(examples_mutex); - examples.push_back(std::move(example)); - } - - // update the evaluation function parameters - Learner::Loss update_parameters( - ThreadPool& thread_pool, - uint64_t epoch, - bool verbose, - double learning_rate, - double max_grad, - Learner::CalcLossFunc calc_loss) - { - using namespace Learner::Autograd::UnivariateStatic; - - assert(batch_size > 0); - - learning_rate /= batch_size; - - std::lock_guard lock(examples_mutex); - - double abs_eval_diff_sum = 0.0; - double abs_discrete_eval_sum = 0.0; - double gradient_norm = 0.0; - - bool collect_stats = verbose; - - Learner::Loss loss_sum{}; - - std::vector abs_eval_diff_sum_local(thread_pool.size(), 0.0); - std::vector abs_discrete_eval_sum_local(thread_pool.size(), 0.0); - std::vector gradient_norm_local(thread_pool.size(), 0.0); - std::vector 
loss_sum_local(thread_pool.size()); - - auto prev_batch_begin = examples.end(); - while ((long)(prev_batch_begin - examples.begin()) >= (long)batch_size) { - auto batch_begin = prev_batch_begin - batch_size; - auto batch_end = prev_batch_begin; - auto size = batch_end - batch_begin; - const auto network_output = trainer->step_start(thread_pool, batch_begin, batch_end); - std::vector gradients(size); - - thread_pool.for_each_index_chunk_with_workers( - std::size_t(0), size, - [&](Thread& th, std::size_t offset, std::size_t count) { - const auto thread_id = th.thread_idx(); - - trainer->propagate(th, offset, count); - - for (std::size_t b = offset; b < offset + count; ++b) { - const auto& e = *(batch_begin + b); - const auto shallow = static_cast(round( - e.sign * network_output[b] * kPonanzaConstant)); - const auto discrete = e.sign * e.discrete_nn_eval; - const auto& psv = e.psv; - auto loss = calc_loss(shallow, (Value)psv.score, psv.game_result, psv.gamePly); - loss.grad = std::clamp( - loss.grad * e.sign * kPonanzaConstant * e.weight, -max_grad, max_grad); - gradients[b] = static_cast(loss.grad); - loss_sum_local[thread_id] += loss; - - // The discrete eval will only be valid before first backpropagation, - // that is only for the first batch. - // Similarily we want only gradients from one batch. - if (collect_stats) - { - abs_eval_diff_sum_local[thread_id] += std::abs(discrete - shallow); - abs_discrete_eval_sum_local[thread_id] += std::abs(discrete); - gradient_norm_local[thread_id] += std::abs(loss.grad); - } - } - - trainer->backpropagate(th, gradients.data(), offset, count); - } - ); - - // We can asyncronously erase the examples that we used in the previous - // step. This can be done safely because we're no longer using these - // examples and erase won't invalidate iterators. 
- examples.erase(prev_batch_begin, examples.end()); - prev_batch_begin = batch_begin; - - thread_pool.wait_for_workers_finished(); - - trainer->step_end(thread_pool, learning_rate); - - collect_stats = false; - } - examples.erase(prev_batch_begin, examples.end()); - - if (verbose) - { - abs_eval_diff_sum = std::accumulate(abs_eval_diff_sum_local.begin(), abs_eval_diff_sum_local.end(), 0.0); - abs_discrete_eval_sum = std::accumulate(abs_discrete_eval_sum_local.begin(), abs_discrete_eval_sum_local.end(), 0.0); - gradient_norm = std::accumulate(gradient_norm_local.begin(), gradient_norm_local.end(), 0.0); - - const double avg_abs_eval_diff = abs_eval_diff_sum / batch_size; - const double avg_abs_discrete_eval = abs_discrete_eval_sum / batch_size; - - auto out = sync_region_cout.new_region(); - - out << "INFO (update_parameters):" - << " epoch = " << epoch - << " , avg_abs(trainer_eval-nnue_eval) = " << avg_abs_eval_diff - << " , avg_abs(nnue_eval) = " << avg_abs_discrete_eval - << " , avg_relative_error = " << avg_abs_eval_diff / avg_abs_discrete_eval - << " , batch_size = " << batch_size - << " , grad_norm = " << gradient_norm - << std::endl; - } else { - // Display some progress but don't synchronize as - // we can't really decide when to release the output lock here - std::cout << '.'; - } - - send_messages({{"quantize_parameters"}}); - - for(auto& loss : loss_sum_local) - { - loss_sum += loss; - } - - return loss_sum; - } - - // Check if there are any problems with learning - void check_health() { - send_messages({{"check_health"}}); - } - - // save merit function parameters to a file - void save_eval(std::string dir_name) { - auto eval_dir = Path::combine(Options["EvalSaveDir"], dir_name); - - auto out = sync_region_cout.new_region(); - - out << "INFO (save_eval): Saving current evaluation file in " << eval_dir << std::endl; - - // mkdir() will fail if this folder already exists, but - // Apart from that. If not, I just want you to make it. - // Also, assume that the folders up to EvalSaveDir have been dug. 
- sys::create_directories(eval_dir); - - const std::string file_name = Path::combine(eval_dir, NNUE::savedfileName); - std::ofstream stream(file_name, std::ios::binary); -#ifndef NDEBUG - bool result = -#endif - WriteParameters(stream); -#ifndef NDEBUG - assert(result); -#endif - out << "INFO (save_eval): Finished saving evaluation file in " << eval_dir << std::endl; - } -} // namespace Eval::NNUE diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h deleted file mode 100644 index 3d9f5b31..00000000 --- a/src/nnue/evaluate_nnue_learner.h +++ /dev/null @@ -1,52 +0,0 @@ -#ifndef _EVALUATE_NNUE_LEARNER_H_ -#define _EVALUATE_NNUE_LEARNER_H_ - -#include "learn/learn.h" - -#include "misc.h" - -struct ThreadPool; - -// Interface used for learning NNUE evaluation function -namespace Eval::NNUE { - - // Initialize learning - void initialize_training( - const std::string& seed, - SynchronizedRegionLogger::Region& out); - - // set the number of samples in the mini-batch - void set_batch_size(uint64_t size); - - // Set options such as hyperparameters - void set_options(const std::string& options); - - // Reread the evaluation function parameters for learning from the file - void restore_parameters(const std::string& dir_name); - - // Add 1 sample of learning data - void add_example( - Position& pos, - Color rootColor, - Value discrete_nn_eval, - const Learner::PackedSfenValue& psv, - double weight); - - // update the evaluation function parameters - Learner::Loss update_parameters( - ThreadPool& thread_pool, - uint64_t epoch, - bool verbose, - double learning_rate, - double max_grad, - Learner::CalcLossFunc calc_loss); - - // Check if there are any problems with learning - void check_health(); - - void finalize_net(); - - void save_eval(std::string suffix); -} // namespace Eval::NNUE - -#endif diff --git a/src/nnue/nnue_test_command.cpp b/src/nnue/nnue_test_command.cpp deleted file mode 100644 index d892222b..00000000 --- a/src/nnue/nnue_test_command.cpp +++ /dev/null @@ -1,215 +0,0 @@ -#include "evaluate_nnue.h" -#include "nnue_test_command.h" - -#include "thread.h" -#include "uci.h" - -#include -#include - -#define ASSERT(X) { \ - if (!(X)) { \ - std::cout \ - << "\nError : ASSERT(" << #X << "), " \ - << __FILE__ << "(" << __LINE__ << "): " \ - << __func__ << std::endl; \ - std::this_thread::sleep_for(std::chrono::microseconds(3000)); \ - *(int*)1 =0; \ - } \ -} - -// USI extended command for NNUE evaluation function -namespace Eval::NNUE { - - namespace { - - // Testing RawFeatures mainly for difference calculation - void test_features(Position& pos) { - const std::uint64_t num_games = 1000; - StateInfo si; - pos.set(StartFEN, false, &si, Threads.main()); - const int MAX_PLY = 256; // test up to 256 hands - - StateInfo state[MAX_PLY]; // StateInfo only for the maximum number of steps - int ply; // Trouble from the initial phase - - PRNG prng(20171128); - - std::uint64_t num_moves = 0; - std::vector num_updates(kRefreshTriggers.size() + 1); - std::vector num_resets(kRefreshTriggers.size()); - constexpr IndexType kUnknown = -1; - std::vector trigger_map(RawFeatures::kDimensions, kUnknown); - - auto make_index_sets = [&](const Position& position) { - std::vector>> index_sets( - kRefreshTriggers.size(), std::vector>(2)); - - for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) { - Features::IndexList active_indices[2]; - RawFeatures::append_active_indices(position, kRefreshTriggers[i], - active_indices); - - for (const auto perspective : Colors) { - for (const auto index : 
active_indices[perspective]) { - ASSERT(index < RawFeatures::kDimensions); - ASSERT(index_sets[i][perspective].count(index) == 0); - ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i); - index_sets[i][perspective].insert(index); - trigger_map[index] = i; - } - } - } - - return index_sets; - }; - - auto update_index_sets = [&](const Position& position, auto* index_sets) { - for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) { - Features::IndexList removed_indices[2], added_indices[2]; - bool reset[2] = { false, false }; - RawFeatures::append_changed_indices(position, kRefreshTriggers[i], - removed_indices, added_indices, reset); - for (const auto perspective : Colors) { - if (reset[perspective]) { - (*index_sets)[i][perspective].clear(); - ++num_resets[i]; - } else { - for (const auto index : removed_indices[perspective]) { - ASSERT(index < RawFeatures::kDimensions); - ASSERT((*index_sets)[i][perspective].count(index) == 1); - ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i); - (*index_sets)[i][perspective].erase(index); - ++num_updates.back(); - ++num_updates[i]; - trigger_map[index] = i; - } - } - - for (const auto index : added_indices[perspective]) { - ASSERT(index < RawFeatures::kDimensions); - ASSERT((*index_sets)[i][perspective].count(index) == 0); - ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i); - (*index_sets)[i][perspective].insert(index); - ++num_updates.back(); - ++num_updates[i]; - trigger_map[index] = i; - } - } - } - }; - - std::cout << "feature set: " << RawFeatures::get_name() - << "[" << RawFeatures::kDimensions << "]" << std::endl; - std::cout << "start testing with random games"; - - for (std::uint64_t i = 0; i < num_games; ++i) { - auto index_sets = make_index_sets(pos); - for (ply = 0; ply < MAX_PLY; ++ply) { - MoveList mg(pos); // Generate all legal hands - - // There was no legal move == Clog - if (mg.size() == 0) - break; - - // Randomly choose from the generated moves and advance the phase with the moves. - Move m = mg.begin()[prng.rand(mg.size())]; - pos.do_move(m, state[ply]); - - ++num_moves; - update_index_sets(pos, &index_sets); - ASSERT(index_sets == make_index_sets(pos)); - } - - pos.set(StartFEN, false, &si, Threads.main()); - - // Output'.' every 100 times (so you can see that it's progressing) - if ((i % 100) == 0) - std::cout << "." << std::flush; - } - - std::cout << "passed." 
<< std::endl; - std::cout << num_games << " games, " << num_moves << " moves, " - << num_updates.back() << " updates, " - << (1.0 * num_updates.back() / num_moves) - << " updates per move" << std::endl; - std::size_t num_observed_indices = 0; - - for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) { - const auto count = std::count(trigger_map.begin(), trigger_map.end(), i); - num_observed_indices += count; - std::cout << "TriggerEvent(" << static_cast(kRefreshTriggers[i]) - << "): " << count << " features (" - << (100.0 * count / RawFeatures::kDimensions) << "%), " - << num_updates[i] << " updates (" - << (1.0 * num_updates[i] / num_moves) << " per move), " - << num_resets[i] << " resets (" - << (100.0 * num_resets[i] / num_moves) << "%)" - << std::endl; - } - std::cout << "observed " << num_observed_indices << " (" - << (100.0 * num_observed_indices / RawFeatures::kDimensions) - << "% of " << RawFeatures::kDimensions - << ") features" << std::endl; - } - - // Output a string that represents the structure of the evaluation function - void print_info(std::istream& stream) { - std::cout << "network architecture: " << get_architecture_string() << std::endl; - - while (true) { - std::string file_name; - stream >> file_name; - if (file_name.empty()) - break; - - std::uint32_t hash_value; - std::string architecture; - const bool success = [&]() { - std::ifstream file_stream(file_name, std::ios::binary); - - if (!file_stream) - return false; - if (!read_header(file_stream, &hash_value, &architecture)) - return false; - - return true; - }(); - - std::cout << file_name << ": "; - if (success) { - if (hash_value == kHashValue) { - std::cout << "matches with this binary"; - if (architecture != get_architecture_string()) { - std::cout << ", but architecture string differs: " << architecture; - } - - std::cout << std::endl; - } else { - std::cout << architecture << std::endl; - } - } else { - std::cout << "failed to read header" << std::endl; - } - } - } - - } // namespace - - // USI extended command for NNUE evaluation function - void test_command(Position& pos, std::istream& stream) { - std::string sub_command; - stream >> sub_command; - - if (sub_command == "test_features") { - test_features(pos); - } else if (sub_command == "info") { - print_info(stream); - } else { - std::cout << "usage:" << std::endl; - std::cout << " test nnue test_features" << std::endl; - std::cout << " test nnue info [path/to/" << fileName << "...]" << std::endl; - } - } - -} // namespace Eval::NNUE diff --git a/src/nnue/nnue_test_command.h b/src/nnue/nnue_test_command.h deleted file mode 100644 index fcfe16f6..00000000 --- a/src/nnue/nnue_test_command.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef _NNUE_TEST_COMMAND_H_ -#define _NNUE_TEST_COMMAND_H_ - -// USI extended command interface for NNUE evaluation function -namespace Eval::NNUE { - - // USI extended command for NNUE evaluation function - void test_command(Position& pos, std::istream& stream); - -} // namespace Eval::NNUE - -#endif diff --git a/src/nnue/trainer/features/all_factorizers.h b/src/nnue/trainer/features/all_factorizers.h deleted file mode 100644 index 75d62ec8..00000000 --- a/src/nnue/trainer/features/all_factorizers.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef _NNUE_TRAINER_FEATURES_ALL_FACTORIZERS_H_ -#define _NNUE_TRAINER_FEATURES_ALL_FACTORIZERS_H_ - -#include "factorizer.h" -#include "factorizer_feature_set.h" - -#include "factorizer_half_kp.h" -#include "factorizer_half_ka.h" - -#endif diff --git a/src/nnue/trainer/features/factorizer.h 
b/src/nnue/trainer/features/factorizer.h deleted file mode 100644 index b64b0c74..00000000 --- a/src/nnue/trainer/features/factorizer.h +++ /dev/null @@ -1,117 +0,0 @@ -#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_H_ -#define _NNUE_TRAINER_FEATURES_FACTORIZER_H_ - -#include "nnue/nnue_common.h" - -#include "nnue/trainer/trainer.h" - -// NNUE evaluation function feature conversion class template -namespace Eval::NNUE::Features { - - // Class template that converts input features into learning features - // By default, the learning feature is the same as the original input feature, and specialized as necessary - template - class Factorizer { - public: - static constexpr std::string get_name() { - return "Factorizer<" + FeatureType::get_name() + "> -> " + std::string("No factorizer"); - } - - static constexpr std::string get_factorizers_string() { - return " - " + get_name(); - } - - // Get the dimensionality of the learning feature - static constexpr IndexType get_dimensions() { - return FeatureType::kDimensions; - } - - // Get index of learning feature and scale of learning rate - static void append_training_features( - IndexType base_index, std::vector* training_features) { - - assert(base_index emplace_back(base_index); - } - }; - - // Learning feature information - struct FeatureProperties { - bool active; - IndexType dimensions; - }; - - // Add the original input features to the learning features - template - IndexType append_base_feature( - FeatureProperties properties, IndexType base_index, - std::vector* training_features) { - - assert(properties.dimensions == FeatureType::kDimensions); - assert(base_index < FeatureType::kDimensions); - training_features->emplace_back(base_index); - return properties.dimensions; - } - - // If the learning rate scale is not 0, inherit other types of learning features - template - IndexType inherit_features_if_required( - IndexType index_offset, FeatureProperties properties, IndexType base_index, - std::vector* training_features) { - - if (!properties.active) { - return 0; - } - - assert(properties.dimensions == Factorizer::get_dimensions()); - assert(base_index < FeatureType::kDimensions); - - const auto start = training_features->size(); - Factorizer::append_training_features( - base_index, training_features); - - for (auto i = start; i < training_features->size(); ++i) { - auto& feature = (*training_features)[i]; - assert(feature.get_index() < Factorizer::get_dimensions()); - feature.shift_index(index_offset); - } - - return properties.dimensions; - } - - // Return the index difference as needed, without adding learning features - // Call instead of InheritFeaturesIfRequired() if there are no corresponding features - IndexType skip_features(FeatureProperties properties) { - if (!properties.active) - return 0; - - return properties.dimensions; - } - - // Get the dimensionality of the learning feature - template - constexpr IndexType get_active_dimensions( - const FeatureProperties (&properties)[N]) { - - static_assert(N > 0, ""); - - IndexType dimensions = properties[0].dimensions; - - for (std::size_t i = 1; i < N; ++i) { - if (properties[i].active) { - dimensions += properties[i].dimensions; - } - } - - return dimensions; - } - - // get the number of elements in the array - template - constexpr std::size_t get_array_length(const T (&/*array*/)[N]) { - return N; - } - -} // namespace Eval::NNUE::Features - -#endif diff --git a/src/nnue/trainer/features/factorizer_feature_set.h b/src/nnue/trainer/features/factorizer_feature_set.h deleted file mode 
100644 index 60f42166..00000000 --- a/src/nnue/trainer/features/factorizer_feature_set.h +++ /dev/null @@ -1,121 +0,0 @@ -#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_ -#define _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_ - -#include "factorizer.h" - -#include "nnue/features/feature_set.h" - -// Specialization for feature set of feature conversion class template of NNUE evaluation function -namespace Eval::NNUE::Features { - - // Class template that converts input features into learning features - // Specialization for FeatureSet - template - class Factorizer> { - private: - using Head = Factorizer>; - using Tail = Factorizer>; - - public: - // number of dimensions of original input features - static constexpr IndexType kBaseDimensions = - FeatureSet::kDimensions; - - static constexpr std::string get_factorizers_string() { - std::string str = " - "; - str += Head::get_name(); - str += '\n'; - str += Tail::get_factorizers_string(); - return str; - } - - // Get the dimensionality of the learning feature - static constexpr IndexType get_dimensions() { - return Head::get_dimensions() + Tail::get_dimensions(); - } - - // Get index of learning feature and scale of learning rate - static void append_training_features( - IndexType base_index, std::vector* training_features, - IndexType base_dimensions = kBaseDimensions) { - - assert(base_index < kBaseDimensions); - - constexpr auto boundary = FeatureSet::kDimensions; - - if (base_index < boundary) { - Tail::append_training_features( - base_index, training_features, base_dimensions); - } - else { - const auto start = training_features->size(); - - Head::append_training_features( - base_index - boundary, training_features, base_dimensions); - - for (auto i = start; i < training_features->size(); ++i) { - auto& feature = (*training_features)[i]; - const auto index = feature.get_index(); - - assert(index < Head::get_dimensions() || - (index >= base_dimensions && - index < base_dimensions + - Head::get_dimensions() - Head::kBaseDimensions)); - - if (index < Head::kBaseDimensions) { - feature.shift_index(Tail::kBaseDimensions); - } - else { - feature.shift_index(Tail::get_dimensions() - Tail::kBaseDimensions); - } - } - } - } - }; - - // Class template that converts input features into learning features - // Specialization when FeatureSet has one template argument - template - class Factorizer> { - public: - // number of dimensions of original input features - static constexpr IndexType kBaseDimensions = FeatureType::kDimensions; - - static constexpr std::string get_name() { - return Factorizer::get_name(); - } - - static constexpr std::string get_factorizers_string() { - return " - " + get_name(); - } - - // Get the dimensionality of the learning feature - static constexpr IndexType get_dimensions() { - return Factorizer::get_dimensions(); - } - - // Get index of learning feature and scale of learning rate - static void append_training_features( - IndexType base_index, std::vector* training_features, - IndexType base_dimensions = kBaseDimensions) { - - assert(base_index < kBaseDimensions); - - const auto start = training_features->size(); - - Factorizer::append_training_features( - base_index, training_features); - - for (auto i = start; i < training_features->size(); ++i) { - auto& feature = (*training_features)[i]; - assert(feature.get_index() < Factorizer::get_dimensions()); - if (feature.get_index() >= kBaseDimensions) { - feature.shift_index(base_dimensions - kBaseDimensions); - } - } - } - }; - -} // namespace Eval::NNUE::Features 
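The factorizer templates above reduce to one bookkeeping rule: the base feature index is always emitted unchanged, and each active factored family appends its own indices shifted by the combined width of every block that precedes it, so the concatenated training-feature space stays densely packed and get_active_dimensions() is just the sum of the active blocks. A minimal standalone sketch of that rule, using plain integer indices and a hypothetical Family record instead of the real TrainingFeature/FeatureProperties types, might look like this:

    #include <cstdint>
    #include <functional>
    #include <optional>
    #include <vector>

    // A factored feature family: a contiguous block of `dimensions` indices and a
    // projection from the base feature into that block (nullopt when the base
    // feature has no counterpart, mirroring skip_features() above).
    struct Family {
        bool active;
        std::uint32_t dimensions;
        std::function<std::optional<std::uint32_t>(std::uint32_t)> project;
    };

    std::vector<std::uint32_t> factorize(std::uint32_t base_index,
                                         std::uint32_t base_dimensions,
                                         const std::vector<Family>& families) {
        std::vector<std::uint32_t> features;
        features.push_back(base_index);          // append_base_feature(): raw index as-is
        std::uint32_t offset = base_dimensions;  // factored blocks start after the base block
        for (const Family& f : families) {
            if (!f.active)
                continue;                        // inactive: contributes no indices and no offset
            if (auto idx = f.project(base_index))
                features.push_back(offset + *idx);  // shift_index() into this family's block
            offset += f.dimensions;              // the block is reserved even if nothing matched
        }
        return features;
    }

For HalfKP, say, the families would be HalfK, P and HalfRelativeKP, with the last projection returning nullopt whenever the piece slot is empty, which is exactly the case the deleted code routes through skip_features().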
- -#endif diff --git a/src/nnue/trainer/features/factorizer_half_ka.h b/src/nnue/trainer/features/factorizer_half_ka.h deleted file mode 100644 index 36d36a2d..00000000 --- a/src/nnue/trainer/features/factorizer_half_ka.h +++ /dev/null @@ -1,93 +0,0 @@ -#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KA_H_ -#define _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KA_H_ - -#include "factorizer.h" - -#include "nnue/features/half_ka.h" -#include "nnue/features/a.h" -#include "nnue/features/half_relative_ka.h" - -// Specialization of NNUE evaluation function feature conversion class template for HalfKA -namespace Eval::NNUE::Features { - - // Class template that converts input features into learning features - // Specialization for HalfKA - template - class Factorizer> { - private: - using FeatureType = HalfKA; - - // The maximum value of the number of indexes whose value is 1 at the same time among the feature values - static constexpr IndexType kMaxActiveDimensions = - FeatureType::kMaxActiveDimensions; - - // Type of learning feature - enum TrainingFeatureType { - kFeaturesHalfKA, - kFeaturesA, - kFeaturesHalfRelativeKA, - kNumTrainingFeatureTypes, - }; - - // Learning feature information - static constexpr FeatureProperties kProperties[] = { - // kFeaturesHalfA - {true, FeatureType::kDimensions}, - // kFeaturesA - {true, Factorizer::get_dimensions()}, - // kFeaturesHalfRelativeKA - {true, Factorizer>::get_dimensions()}, - }; - - static_assert(get_array_length(kProperties) == kNumTrainingFeatureTypes, ""); - - public: - static constexpr std::string get_name() { - return std::string("Factorizer<") + FeatureType::kName + "> -> " + "A, HalfRelativeKA"; - } - - static constexpr std::string get_factorizers_string() { - return " - " + get_name(); - } - - // Get the dimensionality of the learning feature - static constexpr IndexType get_dimensions() { - return get_active_dimensions(kProperties); - } - - // Get index of learning feature and scale of learning rate - static void append_training_features( - IndexType base_index, std::vector* training_features) { - - // kFeaturesHalfA - IndexType index_offset = append_base_feature( - kProperties[kFeaturesHalfKA], base_index, training_features); - - const auto sq_k = static_cast(base_index / PS_END2); - const auto a = static_cast(base_index % PS_END2); - - // kFeaturesA - index_offset += inherit_features_if_required( - index_offset, kProperties[kFeaturesA], a, training_features); - - // kFeaturesHalfRelativeKA - if (a >= PS_W_PAWN) { - index_offset += inherit_features_if_required>( - index_offset, kProperties[kFeaturesHalfRelativeKA], - HalfRelativeKA::make_index(sq_k, a), - training_features); - } - else { - index_offset += skip_features(kProperties[kFeaturesHalfRelativeKA]); - } - - assert(index_offset == get_dimensions()); - } - }; - - template - constexpr FeatureProperties Factorizer>::kProperties[]; - -} // namespace Eval::NNUE::Features - -#endif // #ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KA_H_ diff --git a/src/nnue/trainer/features/factorizer_half_kp.h b/src/nnue/trainer/features/factorizer_half_kp.h deleted file mode 100644 index c554f0fc..00000000 --- a/src/nnue/trainer/features/factorizer_half_kp.h +++ /dev/null @@ -1,104 +0,0 @@ -#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_ -#define _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_ - -#include "factorizer.h" - -#include "nnue/features/half_kp.h" -#include "nnue/features/p.h" -#include "nnue/features/half_relative_kp.h" - -// Specialization of NNUE evaluation function feature conversion class 
template for HalfKP -namespace Eval::NNUE::Features { - - // Class template that converts input features into learning features - // Specialization for HalfKP - template - class Factorizer> { - private: - using FeatureType = HalfKP; - - // The maximum value of the number of indexes whose value is 1 at the same time among the feature values - static constexpr IndexType kMaxActiveDimensions = - FeatureType::kMaxActiveDimensions; - - // Type of learning feature - enum TrainingFeatureType { - kFeaturesHalfKP, - kFeaturesHalfK, - kFeaturesP, - kFeaturesHalfRelativeKP, - kNumTrainingFeatureTypes, - }; - - // Learning feature information - static constexpr FeatureProperties kProperties[] = { - // kFeaturesHalfKP - {true, FeatureType::kDimensions}, - // kFeaturesHalfK - {true, SQUARE_NB}, - // kFeaturesP - {true, Factorizer
<P>
::get_dimensions()}, - // kFeaturesHalfRelativeKP - {true, Factorizer>::get_dimensions()}, - }; - - static_assert(get_array_length(kProperties) == kNumTrainingFeatureTypes, ""); - - public: - static constexpr std::string get_name() { - return std::string("Factorizer<") + FeatureType::kName + "> -> " + "HalfK, P, HalfRelativeKP"; - } - - static constexpr std::string get_factorizers_string() { - return " - " + get_name(); - } - - // Get the dimensionality of the learning feature - static constexpr IndexType get_dimensions() { - return get_active_dimensions(kProperties); - } - - // Get index of learning feature and scale of learning rate - static void append_training_features( - IndexType base_index, std::vector* training_features) { - - // kFeaturesHalfKP - IndexType index_offset = append_base_feature( - kProperties[kFeaturesHalfKP], base_index, training_features); - - const auto sq_k = static_cast(base_index / PS_END); - const auto p = static_cast(base_index % PS_END); - - // kFeaturesHalfK - { - const auto& properties = kProperties[kFeaturesHalfK]; - if (properties.active) { - training_features->emplace_back(index_offset + sq_k); - index_offset += properties.dimensions; - } - } - - // kFeaturesP - index_offset += inherit_features_if_required
<P>
( - index_offset, kProperties[kFeaturesP], p, training_features); - // kFeaturesHalfRelativeKP - if (p >= PS_W_PAWN) { - index_offset += inherit_features_if_required>( - index_offset, kProperties[kFeaturesHalfRelativeKP], - HalfRelativeKP::make_index(sq_k, p), - training_features); - } - else { - index_offset += skip_features(kProperties[kFeaturesHalfRelativeKP]); - } - - assert(index_offset == get_dimensions()); - } - }; - - template - constexpr FeatureProperties Factorizer>::kProperties[]; - -} // namespace Eval::NNUE::Features - -#endif diff --git a/src/nnue/trainer/trainer.h b/src/nnue/trainer/trainer.h deleted file mode 100644 index 973bc898..00000000 --- a/src/nnue/trainer/trainer.h +++ /dev/null @@ -1,122 +0,0 @@ -#ifndef _NNUE_TRAINER_H_ -#define _NNUE_TRAINER_H_ - -#include "nnue/nnue_common.h" -#include "nnue/features/index_list.h" - -#include - -#if defined(USE_BLAS) -static_assert(std::is_same::value, ""); -#include -#endif - -// Common header of class template for learning NNUE evaluation function -namespace Eval::NNUE { - - // Ponanza constant used in the relation between evaluation value and winning percentage - constexpr double kPonanzaConstant = 600.0; - - // Class that represents one index of learning feature - class TrainingFeature { - using StorageType = std::uint32_t; - static_assert(std::is_unsigned::value, ""); - - public: - static constexpr std::uint32_t kIndexBits = 24; - - static_assert(kIndexBits < std::numeric_limits::digits, ""); - - static constexpr std::uint32_t kCountBits = - std::numeric_limits::digits - kIndexBits; - - explicit TrainingFeature(IndexType index) : - index_and_count_((index << kCountBits) | 1) { - - assert(index < (1 << kIndexBits)); - } - - TrainingFeature& operator+=(const TrainingFeature& other) { - assert(other.get_index() == get_index()); - assert(other.get_count() + get_count() < (1 << kCountBits)); - index_and_count_ += other.get_count(); - return *this; - } - - IndexType get_index() const { - return static_cast(index_and_count_ >> kCountBits); - } - - void shift_index(IndexType offset) { - assert(get_index() + offset < (1 << kIndexBits)); - index_and_count_ += offset << kCountBits; - } - - IndexType get_count() const { - return static_cast(index_and_count_ & ((1 << kCountBits) - 1)); - } - - bool operator<(const TrainingFeature& other) const { - return index_and_count_ < other.index_and_count_; - } - - private: - StorageType index_and_count_; - }; - - // Structure that represents one sample of training data - struct Example { - std::vector training_features[2]; - Learner::PackedSfenValue psv; - Value discrete_nn_eval; - int sign; - double weight; - }; - - // Message used for setting hyperparameters - struct Message { - Message(const std::string& message_name, const std::string& message_value = "") : - name(message_name), value(message_value), num_peekers(0), num_receivers(0) - { - } - - const std::string name; - const std::string value; - std::uint32_t num_peekers; - std::uint32_t num_receivers; - }; - - // determine whether to accept the message - bool receive_message(const std::string& name, Message* message) { - const auto subscript = "[" + std::to_string(message->num_peekers) + "]"; - - if (message->name.substr(0, name.size() + 1) == name + "[") { - ++message->num_peekers; - } - - if (message->name == name || message->name == name + subscript) { - ++message->num_receivers; - return true; - } - - return false; - } - - // round a floating point number to an integer - template - IntType round(double value) { - return 
static_cast(std::floor(value + 0.5)); - } - - // make_shared with alignment - template - std::shared_ptr make_aligned_shared_ptr(ArgumentTypes&&... arguments) { - const auto ptr = new(std_aligned_alloc(alignof(T), sizeof(T))) - T(std::forward(arguments)...); - - return std::shared_ptr(ptr, AlignedDeleter()); - } - -} // namespace Eval::NNUE - -#endif diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h deleted file mode 100644 index 53e8f904..00000000 --- a/src/nnue/trainer/trainer_affine_transform.h +++ /dev/null @@ -1,476 +0,0 @@ -#ifndef _NNUE_TRAINER_AFFINE_TRANSFORM_H_ -#define _NNUE_TRAINER_AFFINE_TRANSFORM_H_ - -#include "trainer.h" - -#include "extra/stockfish_blas.h" - -#include "learn/learn.h" - -#include "nnue/layers/affine_transform.h" - -#include "thread.h" - -#include - -// Specialization of NNUE evaluation function learning class template for AffineTransform -namespace Eval::NNUE { - - // Learning: Affine transformation layer - template - class Trainer> { - private: - // Type of layer to learn - using LayerType = Layers::AffineTransform; - - public: - // factory function - static std::shared_ptr create( - LayerType* target_layer, FeatureTransformer* ft) { - - return std::shared_ptr( - new Trainer(target_layer, ft)); - } - - // Set options such as hyperparameters - void send_message(Message* message) { - previous_layer_trainer_->send_message(message); - - if (receive_message("momentum", message)) { - momentum_ = static_cast(std::stod(message->value)); - } - - if (receive_message("learning_rate_scale", message)) { - learning_rate_scale_ = - static_cast(std::stod(message->value)); - } - - if (receive_message("reset", message)) { - dequantize_parameters(); - } - - if (receive_message("quantize_parameters", message)) { - quantize_parameters(); - } - - if (receive_message("check_health", message)) { - check_health(); - } - } - - // Initialize the parameters with random numbers - template - void initialize(RNG& rng) { - previous_layer_trainer_->initialize(rng); - - if (kIsOutputLayer) { - // Initialize output layer with 0 - std::fill(std::begin(biases_), std::end(biases_), - static_cast(0.0)); - std::fill(std::begin(weights_), std::end(weights_), - static_cast(0.0)); - } - else { - // Assuming that the input distribution is unit-mean 0.5, equal variance, - // Initialize the output distribution so that each unit has a mean of 0.5 and the same variance as the input - const double kSigma = 1.0 / std::sqrt(kInputDimensions); - auto distribution = std::normal_distribution(0.0, kSigma); - - for (IndexType i = 0; i < kOutputDimensions; ++i) { - double sum = 0.0; - for (IndexType j = 0; j < kInputDimensions; ++j) { - const auto weight = static_cast(distribution(rng)); - weights_[kInputDimensions * i + j] = weight; - sum += weight; - } - - biases_[i] = static_cast(0.5 - 0.5 * sum); - } - } - - quantize_parameters(); - } - - const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector::const_iterator batch_begin, std::vector::const_iterator batch_end) - { - const auto size = batch_end - batch_begin; - - if ((long)output_.size() < (long)kOutputDimensions * size) { - output_.resize(kOutputDimensions * size); - gradients_.resize(kInputDimensions * size); - } - - if (thread_states_.size() < thread_pool.size()) - { - thread_states_.resize(thread_pool.size()); - } - - combined_batch_size_ = size; - combined_batch_input_ = previous_layer_trainer_->step_start(thread_pool, batch_begin, batch_end); - - auto& main_thread_state = 
thread_states_[0]; - -#if defined(USE_BLAS) - - // update - cblas_sscal( - kOutputDimensions, momentum_, main_thread_state.biases_diff_, 1 - ); - -#else - - Blas::sscal( - kOutputDimensions, momentum_, main_thread_state.biases_diff_, 1 - ); - -#endif - - for (IndexType i = 1; i < thread_states_.size(); ++i) - thread_states_[i].reset_biases(); - - return output_.data(); - } - - // forward propagation - void propagate(Thread& th, const uint64_t offset, const uint64_t count) { - - previous_layer_trainer_->propagate(th, offset, count); - -#if defined(USE_BLAS) - - for (IndexType b = offset; b < offset + count; ++b) { - const IndexType batch_offset = kOutputDimensions * b; - cblas_scopy( - kOutputDimensions, biases_, 1, &output_[batch_offset], 1 - ); - } - - cblas_sgemm( - CblasColMajor, CblasTrans, CblasNoTrans, - kOutputDimensions, count, kInputDimensions, - 1.0, - weights_, kInputDimensions, - combined_batch_input_ + offset * kInputDimensions, kInputDimensions, - 1.0, - &output_[offset * kOutputDimensions], kOutputDimensions - ); -#else - - for (IndexType b = offset; b < offset + count; ++b) { - const IndexType batch_offset = kOutputDimensions * b; - Blas::scopy( - kOutputDimensions, biases_, 1, &output_[batch_offset], 1 - ); - } - - Blas::sgemm( - Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans, - kOutputDimensions, count, kInputDimensions, - 1.0, - weights_, kInputDimensions, - combined_batch_input_ + offset * kInputDimensions, kInputDimensions, - 1.0, - &output_[offset * kOutputDimensions], kOutputDimensions - ); - -#endif - } - - // backpropagation - void backpropagate(Thread& th, - const LearnFloatType* gradients, - uint64_t offset, - uint64_t count) { - - auto& thread_state = thread_states_[th.thread_idx()]; - const auto momentum = th.thread_idx() == 0 ? 
momentum_ : 0.0f; -#if defined(USE_BLAS) - - cblas_sgemm( - CblasColMajor, CblasNoTrans, CblasNoTrans, - kInputDimensions, count, kOutputDimensions, - 1.0, - weights_, kInputDimensions, - gradients + offset * kOutputDimensions, kOutputDimensions, - 0.0, - &gradients_[offset * kInputDimensions], kInputDimensions - ); - - for (IndexType b = offset; b < offset + count; ++b) { - const IndexType batch_offset = kOutputDimensions * b; - cblas_saxpy( - kOutputDimensions, 1.0, - &gradients[batch_offset], 1, thread_state.biases_diff_, 1 - ); - } - - cblas_sgemm( - CblasRowMajor, CblasTrans, CblasNoTrans, - kOutputDimensions, kInputDimensions, count, - 1.0, - gradients + offset * kOutputDimensions, kOutputDimensions, - combined_batch_input_ + offset * kInputDimensions, kInputDimensions, - momentum, - thread_state.weights_diff_, kInputDimensions - ); - -#else - - // backpropagate - Blas::sgemm( - Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::NoTrans, Blas::MatrixTranspose::NoTrans, - kInputDimensions, count, kOutputDimensions, - 1.0, - weights_, kInputDimensions, - gradients + offset * kOutputDimensions, kOutputDimensions, - 0.0, - &gradients_[offset * kInputDimensions], kInputDimensions - ); - - for (IndexType b = offset; b < offset + count; ++b) { - const IndexType batch_offset = kOutputDimensions * b; - Blas::saxpy(kOutputDimensions, 1.0, - &gradients[batch_offset], 1, thread_state.biases_diff_, 1); - } - - Blas::sgemm( - Blas::MatrixLayout::RowMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans, - kOutputDimensions, kInputDimensions, count, - 1.0, - gradients + offset * kOutputDimensions, kOutputDimensions, - combined_batch_input_ + offset * kInputDimensions, kInputDimensions, - momentum, - thread_state.weights_diff_, kInputDimensions - ); - -#endif - - previous_layer_trainer_->backpropagate(th, gradients_.data(), offset, count); - } - - void reduce_thread_state() - { - for (IndexType i = 1; i < thread_states_.size(); ++i) - { - thread_states_[0] += thread_states_[i]; - } - } - - void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) - { - const LearnFloatType local_learning_rate = - learning_rate * learning_rate_scale_; - - reduce_thread_state(); - - auto& main_thread_state = thread_states_[0]; - - for (IndexType i = 0; i < kOutputDimensions; ++i) { - const double d = local_learning_rate * main_thread_state.biases_diff_[i]; - biases_[i] -= d; - abs_biases_diff_sum_ += std::abs(d); - } - num_biases_diffs_ += kOutputDimensions; - - for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) { - const double d = local_learning_rate * main_thread_state.weights_diff_[i]; - weights_[i] -= d; - abs_weights_diff_sum_ += std::abs(d); - } - num_weights_diffs_ += kOutputDimensions * kInputDimensions; - - previous_layer_trainer_->step_end(thread_pool, learning_rate); - } - - private: - // constructor - Trainer(LayerType* target_layer, FeatureTransformer* ft) : - combined_batch_size_(0), - combined_batch_input_(nullptr), - previous_layer_trainer_(Trainer::create( - &target_layer->previous_layer_, ft)), - target_layer_(target_layer), - biases_(), - weights_(), - momentum_(0.2), - learning_rate_scale_(1.0) { - - dequantize_parameters(); - } - - void reset_stats() { - abs_biases_diff_sum_ = 0.0; - abs_weights_diff_sum_ = 0.0; - num_biases_diffs_ = 0; - num_weights_diffs_ = 0; - } - - void check_health() { - - double abs_bias_sum = 0.0; - double abs_weight_sum = 0.0; - - for(auto b : biases_) - abs_bias_sum += std::abs(b); - - for(auto w : weights_) - abs_weight_sum 
+= std::abs(w); - - auto out = sync_region_cout.new_region(); - - out << "INFO (check_health):" - << " layer " << LayerType::kLayerIndex - << " - " << LayerType::get_name() - << std::endl; - - out << " - avg_abs_bias = " << abs_bias_sum / std::size(biases_) << std::endl; - out << " - avg_abs_bias_diff = " << abs_biases_diff_sum_ / num_biases_diffs_ << std::endl; - out << " - avg_abs_weight = " << abs_weight_sum / std::size(weights_) << std::endl; - out << " - avg_abs_weight_diff = " << abs_weights_diff_sum_ / num_weights_diffs_ << std::endl; - - out.unlock(); - - reset_stats(); - } - - // Weight saturation and parameterization - void quantize_parameters() { - for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) { - weights_[i] = std::max(-kMaxWeightMagnitude, - std::min(+kMaxWeightMagnitude, weights_[i])); - } - - for (IndexType i = 0; i < kOutputDimensions; ++i) { - target_layer_->biases_[i] = - round(biases_[i] * kBiasScale); - } - - for (IndexType i = 0; i < kOutputDimensions; ++i) { - const auto offset = kInputDimensions * i; - const auto padded_offset = LayerType::kPaddedInputDimensions * i; - for (IndexType j = 0; j < kInputDimensions; ++j) { - target_layer_->weights_[padded_offset + j] = - round( - weights_[offset + j] * kWeightScale); - } - } - } - - // read parameterized integer - void dequantize_parameters() { - for (IndexType i = 0; i < kOutputDimensions; ++i) { - biases_[i] = static_cast( - target_layer_->biases_[i] / kBiasScale); - } - - for (IndexType i = 0; i < kOutputDimensions; ++i) { - const auto offset = kInputDimensions * i; - const auto padded_offset = LayerType::kPaddedInputDimensions * i; - for (IndexType j = 0; j < kInputDimensions; ++j) { - weights_[offset + j] = static_cast( - target_layer_->weights_[padded_offset + j] / kWeightScale); - } - } - - for (auto& state : thread_states_) - { - state.reset_weights(); - state.reset_biases(); - } - - - reset_stats(); - } - - // number of input/output dimensions - static constexpr IndexType kInputDimensions = LayerType::kInputDimensions; - static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions; - - // If the output dimensionality is 1, the output layer - static constexpr bool kIsOutputLayer = kOutputDimensions == 1; - - // Coefficient used for parameterization - static constexpr LearnFloatType kActivationScale = - std::numeric_limits::max(); - - static constexpr LearnFloatType kBiasScale = kIsOutputLayer ? 
- (kPonanzaConstant * FV_SCALE) : - ((1 << kWeightScaleBits) * kActivationScale); - - static constexpr LearnFloatType kWeightScale = kBiasScale / kActivationScale; - - // Upper limit of absolute value of weight used to prevent overflow when parameterizing integers - static constexpr LearnFloatType kMaxWeightMagnitude = - std::numeric_limits::max() / kWeightScale; - - // number of samples in mini-batch - IndexType combined_batch_size_; - - double abs_biases_diff_sum_; - double abs_weights_diff_sum_; - uint64_t num_biases_diffs_; - uint64_t num_weights_diffs_; - - // Input mini batch - const LearnFloatType* combined_batch_input_; - - // Trainer of the previous layer - const std::shared_ptr> previous_layer_trainer_; - - // layer to learn - LayerType* const target_layer_; - - // parameter - struct alignas(kCacheLineSize) ThreadState - { - // Buffer used for updating parameters - alignas(kCacheLineSize) LearnFloatType biases_diff_[kOutputDimensions]; - alignas(kCacheLineSize) LearnFloatType weights_diff_[kOutputDimensions * kInputDimensions]; - - ThreadState() { reset_weights(); reset_biases(); } - - ThreadState& operator+=(const ThreadState& other) - { - for (IndexType i = 0; i < kOutputDimensions; ++i) - { - biases_diff_[i] += other.biases_diff_[i]; - } - - for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) - { - weights_diff_[i] += other.weights_diff_[i]; - } - - return *this; - } - - void reset_weights() - { - std::fill(std::begin(weights_diff_), std::end(weights_diff_), 0.0f); - } - - void reset_biases() - { - std::fill(std::begin(biases_diff_), std::end(biases_diff_), 0.0f); - } - }; - - alignas(kCacheLineSize) LearnFloatType biases_[kOutputDimensions]; - alignas(kCacheLineSize) LearnFloatType weights_[kOutputDimensions * kInputDimensions]; - - std::vector> thread_states_; - - // Forward propagation buffer - std::vector> output_; - - // buffer for back propagation - std::vector> gradients_; - - // hyper parameter - LearnFloatType momentum_; - LearnFloatType learning_rate_scale_; - }; - -} // namespace Eval::NNUE - -#endif diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h deleted file mode 100644 index 48dec8be..00000000 --- a/src/nnue/trainer/trainer_clipped_relu.h +++ /dev/null @@ -1,354 +0,0 @@ -#ifndef _NNUE_TRAINER_CLIPPED_RELU_H_ -#define _NNUE_TRAINER_CLIPPED_RELU_H_ - -#include "trainer.h" - -#include "learn/learn.h" - -#include "nnue/layers/clipped_relu.h" - -#include "thread.h" - -// Specialization of NNUE evaluation function learning class template for ClippedReLU -namespace Eval::NNUE { - - // Learning: Affine transformation layer - template - class Trainer> { - private: - // Type of layer to learn - using LayerType = Layers::ClippedReLU; - - public: - // factory function - static std::shared_ptr create( - LayerType* target_layer, FeatureTransformer* ft) { - - return std::shared_ptr( - new Trainer(target_layer, ft)); - } - - // Set options such as hyperparameters - void send_message(Message* message) { - previous_layer_trainer_->send_message(message); - if (receive_message("check_health", message)) { - check_health(); - } - } - - // Initialize the parameters with random numbers - template - void initialize(RNG& rng) { - previous_layer_trainer_->initialize(rng); - } - - const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector::const_iterator batch_begin, std::vector::const_iterator batch_end) - { - const auto size = batch_end - batch_begin; - - if ((long)output_.size() < (long)kOutputDimensions * size) { 
- output_.resize(kOutputDimensions * size); - gradients_.resize(kInputDimensions * size); - } - - if (thread_states_.size() < thread_pool.size()) - { - thread_states_.resize(thread_pool.size()); - } - - input_ = previous_layer_trainer_->step_start(thread_pool, batch_begin, batch_end); - - batch_size_ = size; - - return output_.data(); - } - - // forward propagation - void propagate(Thread& th, const uint64_t offset, const uint64_t count) { - - auto& thread_state = thread_states_[th.thread_idx()]; - - previous_layer_trainer_->propagate(th, offset, count); - -#if defined (USE_SSE2) - - { - static_assert(kOutputDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time"); - - const __m128 kZero4 = _mm_set1_ps(+kZero); - const __m128 kOne4 = _mm_set1_ps(+kOne); - - for (IndexType b = offset; b < offset + count; ++b) - { - const IndexType batch_offset = kOutputDimensions * b; - - for (IndexType i = 0; i < kOutputDimensions; i += 16) - { - __m128 out0 = _mm_loadu_ps(&input_[i + 0 + batch_offset]); - __m128 out1 = _mm_loadu_ps(&input_[i + 4 + batch_offset]); - __m128 out2 = _mm_loadu_ps(&input_[i + 8 + batch_offset]); - __m128 out3 = _mm_loadu_ps(&input_[i + 12 + batch_offset]); - - out0 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out0)); - out1 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out1)); - out2 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out2)); - out3 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out3)); - - _mm_storeu_ps(&output_[i + 0 + batch_offset], out0); - _mm_storeu_ps(&output_[i + 4 + batch_offset], out1); - _mm_storeu_ps(&output_[i + 8 + batch_offset], out2); - _mm_storeu_ps(&output_[i + 12 + batch_offset], out3); - - __m128 minact0 = _mm_loadu_ps(&thread_state.min_activations_[i + 0]); - __m128 minact1 = _mm_loadu_ps(&thread_state.min_activations_[i + 4]); - __m128 minact2 = _mm_loadu_ps(&thread_state.min_activations_[i + 8]); - __m128 minact3 = _mm_loadu_ps(&thread_state.min_activations_[i + 12]); - - __m128 maxact0 = _mm_loadu_ps(&thread_state.max_activations_[i + 0]); - __m128 maxact1 = _mm_loadu_ps(&thread_state.max_activations_[i + 4]); - __m128 maxact2 = _mm_loadu_ps(&thread_state.max_activations_[i + 8]); - __m128 maxact3 = _mm_loadu_ps(&thread_state.max_activations_[i + 12]); - - minact0 = _mm_min_ps(out0, minact0); - minact1 = _mm_min_ps(out1, minact1); - minact2 = _mm_min_ps(out2, minact2); - minact3 = _mm_min_ps(out3, minact3); - - maxact0 = _mm_max_ps(out0, maxact0); - maxact1 = _mm_max_ps(out1, maxact1); - maxact2 = _mm_max_ps(out2, maxact2); - maxact3 = _mm_max_ps(out3, maxact3); - - _mm_storeu_ps(&thread_state.min_activations_[i + 0], minact0); - _mm_storeu_ps(&thread_state.min_activations_[i + 4], minact1); - _mm_storeu_ps(&thread_state.min_activations_[i + 8], minact2); - _mm_storeu_ps(&thread_state.min_activations_[i + 12], minact3); - - _mm_storeu_ps(&thread_state.max_activations_[i + 0], maxact0); - _mm_storeu_ps(&thread_state.max_activations_[i + 4], maxact1); - _mm_storeu_ps(&thread_state.max_activations_[i + 8], maxact2); - _mm_storeu_ps(&thread_state.max_activations_[i + 12], maxact3); - } - } - } - -#else - - for (IndexType b = offset; b < offset + count; ++b) { - const IndexType batch_offset = kOutputDimensions * b; - for (IndexType i = 0; i < kOutputDimensions; ++i) { - const IndexType index = batch_offset + i; - output_[index] = std::max(+kZero, std::min(+kOne, input_[index])); - thread_state.min_activations_[i] = std::min(thread_state.min_activations_[i], output_[index]); - thread_state.max_activations_[i] = 
std::max(thread_state.max_activations_[i], output_[index]); - } - } - -#endif - } - - // backpropagation - void backpropagate(Thread& th, - const LearnFloatType* gradients, - const uint64_t offset, - const uint64_t count) { - - auto& thread_state = thread_states_[th.thread_idx()]; - -#if defined (USE_SSE2) - - { - static_assert(kOutputDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time"); - - const __m128 kZero4 = _mm_set1_ps(+kZero); - const __m128 kOne4 = _mm_set1_ps(+kOne); - - for (IndexType b = offset; b < offset + count; ++b) - { - const IndexType batch_offset = kOutputDimensions * b; - - for (IndexType i = 0; i < kOutputDimensions; i += 16) - { - __m128 out0 = _mm_loadu_ps(&output_[batch_offset + i + 0]); - __m128 out1 = _mm_loadu_ps(&output_[batch_offset + i + 4]); - __m128 out2 = _mm_loadu_ps(&output_[batch_offset + i + 8]); - __m128 out3 = _mm_loadu_ps(&output_[batch_offset + i + 12]); - - __m128 clipped0 = _mm_or_ps(_mm_cmple_ps(out0, kZero4), _mm_cmpge_ps(out0, kOne4)); - __m128 clipped1 = _mm_or_ps(_mm_cmple_ps(out1, kZero4), _mm_cmpge_ps(out1, kOne4)); - __m128 clipped2 = _mm_or_ps(_mm_cmple_ps(out2, kZero4), _mm_cmpge_ps(out2, kOne4)); - __m128 clipped3 = _mm_or_ps(_mm_cmple_ps(out3, kZero4), _mm_cmpge_ps(out3, kOne4)); - - __m128 grad0 = _mm_loadu_ps(&gradients[batch_offset + i + 0]); - __m128 grad1 = _mm_loadu_ps(&gradients[batch_offset + i + 4]); - __m128 grad2 = _mm_loadu_ps(&gradients[batch_offset + i + 8]); - __m128 grad3 = _mm_loadu_ps(&gradients[batch_offset + i + 12]); - - grad0 = _mm_andnot_ps(clipped0, grad0); - grad1 = _mm_andnot_ps(clipped1, grad1); - grad2 = _mm_andnot_ps(clipped2, grad2); - grad3 = _mm_andnot_ps(clipped3, grad3); - - _mm_storeu_ps(&gradients_[batch_offset + i + 0], grad0); - _mm_storeu_ps(&gradients_[batch_offset + i + 4], grad1); - _mm_storeu_ps(&gradients_[batch_offset + i + 8], grad2); - _mm_storeu_ps(&gradients_[batch_offset + i + 12], grad3); - - const int clipped_mask = - (_mm_movemask_ps(clipped0) << 0) - | (_mm_movemask_ps(clipped1) << 4) - | (_mm_movemask_ps(clipped2) << 8) - | (_mm_movemask_ps(clipped3) << 12); - - thread_state.num_clipped_ += popcount(clipped_mask); - } - } - } - -#else - - for (IndexType b = offset; b < offset + count; ++b) { - const IndexType batch_offset = kOutputDimensions * b; - for (IndexType i = 0; i < kOutputDimensions; ++i) { - const IndexType index = batch_offset + i; - const bool clipped = (output_[index] <= kZero) | (output_[index] >= kOne); - gradients_[index] = gradients[index] * !clipped; - thread_state.num_clipped_ += clipped; - } - } - -#endif - - thread_state.num_total_ += count * kOutputDimensions; - - previous_layer_trainer_->backpropagate(th, gradients_.data(), offset, count); - } - - void reduce_thread_state() - { - for (IndexType i = 1; i < thread_states_.size(); ++i) - { - thread_states_[0] += thread_states_[i]; - } - } - - void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) - { - previous_layer_trainer_->step_end(thread_pool, learning_rate); - } - - private: - // constructor - Trainer(LayerType* target_layer, FeatureTransformer* ft) : - batch_size_(0), - previous_layer_trainer_(Trainer::create( - &target_layer->previous_layer_, ft)), - target_layer_(target_layer) { - - reset_stats(); - } - - void reset_stats() { - for(auto& state : thread_states_) - state.reset(); - } - - // Check if there are any problems with learning - void check_health() { - - reduce_thread_state(); - - auto& main_thread_state = thread_states_[0]; - - const auto 
largest_min_activation = *std::max_element( - std::begin(main_thread_state.min_activations_), std::end(main_thread_state.min_activations_)); - const auto smallest_max_activation = *std::min_element( - std::begin(main_thread_state.max_activations_), std::end(main_thread_state.max_activations_)); - - auto out = sync_region_cout.new_region(); - - out << "INFO (check_health):" - << " layer " << LayerType::kLayerIndex - << " - " << LayerType::get_name() - << std::endl; - - out << " - largest min activation = " << largest_min_activation - << " , smallest max activation = " << smallest_max_activation - << std::endl; - - out << " - clipped " << static_cast(main_thread_state.num_clipped_) / main_thread_state.num_total_ * 100.0 << "% of outputs" - << std::endl; - - out.unlock(); - - reset_stats(); - } - - // number of input/output dimensions - static constexpr IndexType kInputDimensions = LayerType::kOutputDimensions; - static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions; - - // LearnFloatType constant - static constexpr LearnFloatType kZero = static_cast(0.0); - static constexpr LearnFloatType kOne = static_cast(1.0); - - // number of samples in mini-batch - IndexType batch_size_; - - const LearnFloatType* input_; - - // Trainer of the previous layer - const std::shared_ptr> previous_layer_trainer_; - - // layer to learn - LayerType* const target_layer_; - - // Forward propagation buffer - std::vector> output_; - - // buffer for back propagation - std::vector> gradients_; - - struct alignas(kCacheLineSize) ThreadState - { - // Health check statistics - LearnFloatType min_activations_[kOutputDimensions]; - LearnFloatType max_activations_[kOutputDimensions]; - uint64_t num_clipped_; - uint64_t num_total_; - - ThreadState() { reset(); } - - ThreadState& operator+=(const ThreadState& other) - { - for (IndexType i = 0; i < kOutputDimensions; ++i) - { - min_activations_[i] = std::min(min_activations_[i], other.min_activations_[i]); - } - - for (IndexType i = 0; i < kOutputDimensions; ++i) - { - max_activations_[i] = std::max(max_activations_[i], other.max_activations_[i]); - } - - num_clipped_ += other.num_clipped_; - num_total_ += other.num_total_; - - return *this; - } - - void reset() - { - std::fill(std::begin(min_activations_), std::end(min_activations_), std::numeric_limits::max()); - std::fill(std::begin(max_activations_), std::end(max_activations_), std::numeric_limits::lowest()); - num_clipped_ = 0; - num_total_ = 0; - } - }; - - std::vector> thread_states_; - }; - -} // namespace Eval::NNUE - -#endif diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h deleted file mode 100644 index b0e0ebba..00000000 --- a/src/nnue/trainer/trainer_feature_transformer.h +++ /dev/null @@ -1,783 +0,0 @@ -#ifndef _NNUE_TRAINER_FEATURE_TRANSFORMER_H_ -#define _NNUE_TRAINER_FEATURE_TRANSFORMER_H_ - -#include "trainer.h" - -#include "extra/stockfish_blas.h" - -#include "features/all_factorizers.h" - -#include "learn/learn.h" - -#include "nnue/nnue_feature_transformer.h" - -#include "thread.h" - -#include -#include -#include -#include -#include - -// Specialization for feature transformer of learning class template of NNUE evaluation function -namespace Eval::NNUE { - - // Learning: Input feature converter - template <> - class Trainer { - private: - // Type of layer to learn - using LayerType = FeatureTransformer; - - public: - template - friend struct AlignedDeleter; - - template - friend std::shared_ptr 
make_aligned_shared_ptr(ArgumentTypes&&... arguments); - - // factory function - static std::shared_ptr create(LayerType* target_layer) { - return make_aligned_shared_ptr(target_layer); - } - - // Set options such as hyperparameters - void send_message(Message* message) { - if (receive_message("momentum", message)) { - momentum_ = static_cast(std::stod(message->value)); - } - - if (receive_message("learning_rate_scale", message)) { - learning_rate_scale_ = - static_cast(std::stod(message->value)); - } - - if (receive_message("reset", message)) { - dequantize_parameters(); - } - - if (receive_message("quantize_parameters", message)) { - quantize_parameters(); - } - - if (receive_message("clear_unobserved_feature_weights", message)) { - clear_unobserved_feature_weights(); - } - - if (receive_message("check_health", message)) { - check_health(); - } - } - - // Initialize the parameters with random numbers - template - void initialize(RNG& rng) { - std::fill(std::begin(weights_), std::end(weights_), +kZero); - - const double kSigma = 0.1 / std::sqrt(RawFeatures::kMaxActiveDimensions); - auto distribution = std::normal_distribution(0.0, kSigma); - - for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) { - const auto weight = static_cast(distribution(rng)); - weights_[i] = weight; - } - - for (IndexType i = 0; i < kHalfDimensions; ++i) { - biases_[i] = static_cast(0.5); - } - - quantize_parameters(); - } - - const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector::const_iterator batch_begin, std::vector::const_iterator batch_end) - { - const auto size = batch_end - batch_begin; - - if ((long)output_.size() < (long)kOutputDimensions * size) { - output_.resize(kOutputDimensions * size); - gradients_.resize(kOutputDimensions * size); - } - - if (thread_stat_states_.size() < thread_pool.size()) - { - thread_stat_states_.resize(thread_pool.size()); - } - - if (thread_bias_states_.size() < thread_pool.size()) - { - thread_bias_states_.resize(thread_pool.size()); - } - - batch_ = &*batch_begin; - batch_size_ = size; - - auto& main_thread_bias_state = thread_bias_states_[0]; - -#if defined(USE_BLAS) - - cblas_sscal( - kHalfDimensions, momentum_, main_thread_bias_state.biases_diff_, 1 - ); - -#else - - Blas::sscal( - kHalfDimensions, momentum_, main_thread_bias_state.biases_diff_, 1 - ); - -#endif - - for (IndexType i = 1; i < thread_bias_states_.size(); ++i) - thread_bias_states_[i].reset(); - - return output_.data(); - } - - // forward propagation - void propagate(Thread& th, uint64_t offset, uint64_t count) { - - auto& thread_stat_state = thread_stat_states_[th.thread_idx()]; - - for (IndexType b = offset; b < offset + count; ++b) - { - const IndexType batch_offset = kOutputDimensions * b; - - for (IndexType c = 0; c < 2; ++c) { - const IndexType output_offset = batch_offset + kHalfDimensions * c; - -#if defined(USE_BLAS) - - cblas_scopy( - kHalfDimensions, biases_, 1, &output_[output_offset], 1 - ); - - for (const auto& feature : batch_[b].training_features[c]) { - const IndexType weights_offset = kHalfDimensions * feature.get_index(); - cblas_saxpy( - kHalfDimensions, (float)feature.get_count(), - &weights_[weights_offset], 1, &output_[output_offset], 1 - ); - } - -#else - - Blas::scopy( - kHalfDimensions, biases_, 1, &output_[output_offset], 1 - ); - for (const auto& feature : batch_[b].training_features[c]) { - const IndexType weights_offset = kHalfDimensions * feature.get_index(); - Blas::saxpy( - kHalfDimensions, (float)feature.get_count(), - 
&weights_[weights_offset], &output_[output_offset] - ); - } - -#endif - } - } - -#if defined (USE_SSE2) - - { - static_assert(kHalfDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time"); - - auto m128_hmin_ps = [](__m128 x3210) { - __m128 x0032 = _mm_shuffle_ps(x3210, x3210, _MM_SHUFFLE(0, 0, 3, 2)); - __m128 min_x_x_13_20 = _mm_min_ps(x3210, x0032); - // a = [ # , # , min(x[1], x[3]) , min(x[2], x[0]) ] - __m128 min_x_x_20_13 = _mm_shuffle_ps(min_x_x_13_20, min_x_x_13_20, _MM_SHUFFLE(0, 0, 0, 1)); - return _mm_cvtss_f32(_mm_min_ps(min_x_x_13_20, min_x_x_20_13)); - }; - - auto m128_hmax_ps = [](__m128 x3210) { - __m128 x0032 = _mm_shuffle_ps(x3210, x3210, _MM_SHUFFLE(0, 0, 3, 2)); - __m128 max_x_x_13_20 = _mm_max_ps(x3210, x0032); - // a = [ # , # , max(x[1], x[3]) , max(x[2], x[0]) ] - __m128 max_x_x_20_13 = _mm_shuffle_ps(max_x_x_13_20, max_x_x_13_20, _MM_SHUFFLE(0, 0, 0, 1)); - return _mm_cvtss_f32(_mm_max_ps(max_x_x_13_20, max_x_x_20_13)); - }; - - const __m128 kZero4 = _mm_set1_ps(+kZero); - const __m128 kOne4 = _mm_set1_ps(+kOne); - - __m128 min_pre_activation0 = _mm_set1_ps(thread_stat_state.min_pre_activation_); - __m128 min_pre_activation1 = _mm_set1_ps(thread_stat_state.min_pre_activation_); - __m128 max_pre_activation0 = _mm_set1_ps(thread_stat_state.max_pre_activation_); - __m128 max_pre_activation1 = _mm_set1_ps(thread_stat_state.max_pre_activation_); - - for (IndexType b = offset; b < offset + count; ++b) - { - const IndexType batch_offset = kOutputDimensions * b; - for (IndexType i = 0; i < kOutputDimensions; i += 16) - { - __m128 out0 = _mm_loadu_ps(&output_[batch_offset + i + 0]); - __m128 out1 = _mm_loadu_ps(&output_[batch_offset + i + 4]); - __m128 out2 = _mm_loadu_ps(&output_[batch_offset + i + 8]); - __m128 out3 = _mm_loadu_ps(&output_[batch_offset + i + 12]); - - __m128 min01 = _mm_min_ps(out0, out1); - __m128 min23 = _mm_min_ps(out2, out3); - - __m128 max01 = _mm_max_ps(out0, out1); - __m128 max23 = _mm_max_ps(out2, out3); - - min_pre_activation0 = _mm_min_ps(min_pre_activation0, min01); - min_pre_activation1 = _mm_min_ps(min_pre_activation1, min23); - max_pre_activation0 = _mm_max_ps(max_pre_activation0, max01); - max_pre_activation1 = _mm_max_ps(max_pre_activation1, max23); - - out0 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out0)); - out1 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out1)); - out2 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out2)); - out3 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out3)); - - _mm_storeu_ps(&output_[batch_offset + i + 0], out0); - _mm_storeu_ps(&output_[batch_offset + i + 4], out1); - _mm_storeu_ps(&output_[batch_offset + i + 8], out2); - _mm_storeu_ps(&output_[batch_offset + i + 12], out3); - } - } - - thread_stat_state.min_pre_activation_ = m128_hmin_ps(_mm_min_ps(min_pre_activation0, min_pre_activation1)); - thread_stat_state.max_pre_activation_ = m128_hmax_ps(_mm_max_ps(max_pre_activation0, max_pre_activation1)); - - for (IndexType b = offset; b < offset + count; ++b) - { - const IndexType batch_offset = kOutputDimensions * b; - - for (IndexType half = 0; half < 2; ++half) - { - const IndexType half_offset = batch_offset + half * kHalfDimensions; - for (IndexType i = 0; i < kHalfDimensions; i += 16) - { - const __m128 out0 = _mm_loadu_ps(&output_[i + 0 + half_offset]); - const __m128 out1 = _mm_loadu_ps(&output_[i + 4 + half_offset]); - const __m128 out2 = _mm_loadu_ps(&output_[i + 8 + half_offset]); - const __m128 out3 = _mm_loadu_ps(&output_[i + 12 + half_offset]); - - __m128 minact0 = 
_mm_loadu_ps(&thread_stat_state.min_activations_[i + 0]); - __m128 minact1 = _mm_loadu_ps(&thread_stat_state.min_activations_[i + 4]); - __m128 minact2 = _mm_loadu_ps(&thread_stat_state.min_activations_[i + 8]); - __m128 minact3 = _mm_loadu_ps(&thread_stat_state.min_activations_[i + 12]); - - __m128 maxact0 = _mm_loadu_ps(&thread_stat_state.max_activations_[i + 0]); - __m128 maxact1 = _mm_loadu_ps(&thread_stat_state.max_activations_[i + 4]); - __m128 maxact2 = _mm_loadu_ps(&thread_stat_state.max_activations_[i + 8]); - __m128 maxact3 = _mm_loadu_ps(&thread_stat_state.max_activations_[i + 12]); - - minact0 = _mm_min_ps(out0, minact0); - minact1 = _mm_min_ps(out1, minact1); - minact2 = _mm_min_ps(out2, minact2); - minact3 = _mm_min_ps(out3, minact3); - - maxact0 = _mm_max_ps(out0, maxact0); - maxact1 = _mm_max_ps(out1, maxact1); - maxact2 = _mm_max_ps(out2, maxact2); - maxact3 = _mm_max_ps(out3, maxact3); - - _mm_storeu_ps(&thread_stat_state.min_activations_[i + 0], minact0); - _mm_storeu_ps(&thread_stat_state.min_activations_[i + 4], minact1); - _mm_storeu_ps(&thread_stat_state.min_activations_[i + 8], minact2); - _mm_storeu_ps(&thread_stat_state.min_activations_[i + 12], minact3); - - _mm_storeu_ps(&thread_stat_state.max_activations_[i + 0], maxact0); - _mm_storeu_ps(&thread_stat_state.max_activations_[i + 4], maxact1); - _mm_storeu_ps(&thread_stat_state.max_activations_[i + 8], maxact2); - _mm_storeu_ps(&thread_stat_state.max_activations_[i + 12], maxact3); - } - } - } - } - -#else - - // clipped ReLU - for (IndexType b = offset; b < offset + count; ++b) { - const IndexType batch_offset = kOutputDimensions * b; - for (IndexType i = 0; i < kOutputDimensions; ++i) { - const IndexType index = batch_offset + i; - thread_stat_state.min_pre_activation_ = std::min(thread_stat_state.min_pre_activation_, output_[index]); - thread_stat_state.max_pre_activation_ = std::max(thread_stat_state.max_pre_activation_, output_[index]); - output_[index] = std::max(+kZero, std::min(+kOne, output_[index])); - const IndexType t = i % kHalfDimensions; - thread_stat_state.min_activations_[t] = std::min(thread_stat_state.min_activations_[t], output_[index]); - thread_stat_state.max_activations_[t] = std::max(thread_stat_state.max_activations_[t], output_[index]); - } - } - -#endif - } - - // backpropagation - void backpropagate(Thread& th, - const LearnFloatType* gradients, - uint64_t offset, - uint64_t count) { - - auto& thread_stat_state = thread_stat_states_[th.thread_idx()]; - auto& thread_bias_state = thread_bias_states_[th.thread_idx()]; - -#if defined (USE_SSE2) - - { - static_assert(kHalfDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time"); - - const __m128 kZero4 = _mm_set1_ps(+kZero); - const __m128 kOne4 = _mm_set1_ps(+kOne); - - for (IndexType b = offset; b < offset + count; ++b) - { - const IndexType batch_offset = kOutputDimensions * b; - for (IndexType i = 0; i < kOutputDimensions; i += 16) - { - __m128 out0 = _mm_loadu_ps(&output_[batch_offset + i + 0]); - __m128 out1 = _mm_loadu_ps(&output_[batch_offset + i + 4]); - __m128 out2 = _mm_loadu_ps(&output_[batch_offset + i + 8]); - __m128 out3 = _mm_loadu_ps(&output_[batch_offset + i + 12]); - - __m128 clipped0 = _mm_or_ps(_mm_cmple_ps(out0, kZero4), _mm_cmpge_ps(out0, kOne4)); - __m128 clipped1 = _mm_or_ps(_mm_cmple_ps(out1, kZero4), _mm_cmpge_ps(out1, kOne4)); - __m128 clipped2 = _mm_or_ps(_mm_cmple_ps(out2, kZero4), _mm_cmpge_ps(out2, kOne4)); - __m128 clipped3 = _mm_or_ps(_mm_cmple_ps(out3, kZero4), 
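// Note on the four `clipped` masks built just above: _mm_cmple_ps/_mm_cmpge_ps
// produce all-ones lanes wherever the clipped-ReLU output saturated at 0 or 1,
// and _mm_or_ps folds both saturation cases into one mask per group of four
// floats. The _mm_andnot_ps calls that follow zero the incoming gradient in
// exactly those lanes (the derivative of the clip is 0 outside (0, 1) and 1
// inside), while _mm_movemask_ps plus popcount merely count the clipped lanes
// for the check_health() statistics.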
_mm_cmpge_ps(out3, kOne4)); - - __m128 grad0 = _mm_loadu_ps(&gradients[batch_offset + i + 0]); - __m128 grad1 = _mm_loadu_ps(&gradients[batch_offset + i + 4]); - __m128 grad2 = _mm_loadu_ps(&gradients[batch_offset + i + 8]); - __m128 grad3 = _mm_loadu_ps(&gradients[batch_offset + i + 12]); - - grad0 = _mm_andnot_ps(clipped0, grad0); - grad1 = _mm_andnot_ps(clipped1, grad1); - grad2 = _mm_andnot_ps(clipped2, grad2); - grad3 = _mm_andnot_ps(clipped3, grad3); - - _mm_storeu_ps(&gradients_[batch_offset + i + 0], grad0); - _mm_storeu_ps(&gradients_[batch_offset + i + 4], grad1); - _mm_storeu_ps(&gradients_[batch_offset + i + 8], grad2); - _mm_storeu_ps(&gradients_[batch_offset + i + 12], grad3); - - const int clipped_mask = - (_mm_movemask_ps(clipped0) << 0) - | (_mm_movemask_ps(clipped1) << 4) - | (_mm_movemask_ps(clipped2) << 8) - | (_mm_movemask_ps(clipped3) << 12); - - thread_stat_state.num_clipped_ += popcount(clipped_mask); - } - } - } - -#else - - for (IndexType b = offset; b < offset + count; ++b) { - const IndexType batch_offset = kOutputDimensions * b; - for (IndexType i = 0; i < kOutputDimensions; ++i) { - const IndexType index = batch_offset + i; - const bool clipped = (output_[index] <= kZero) | (output_[index] >= kOne); - gradients_[index] = gradients[index] * !clipped; - thread_stat_state.num_clipped_ += clipped; - } - } - -#endif - - thread_stat_state.num_total_ += count * kOutputDimensions; - -#if defined(USE_BLAS) - - for (IndexType b = offset; b < offset + count; ++b) { - const IndexType batch_offset = kOutputDimensions * b; - for (IndexType c = 0; c < 2; ++c) { - const IndexType output_offset = batch_offset + kHalfDimensions * c; - cblas_saxpy( - kHalfDimensions, 1.0, - &gradients_[output_offset], 1, thread_bias_state.biases_diff_, 1 - ); - } - } - -#else - - for (IndexType b = offset; b < offset + count; ++b) { - const IndexType batch_offset = kOutputDimensions * b; - for (IndexType c = 0; c < 2; ++c) { - const IndexType output_offset = batch_offset + kHalfDimensions * c; - Blas::saxpy( - kHalfDimensions, 1.0, - &gradients_[output_offset], 1, thread_bias_state.biases_diff_, 1 - ); - } - } - -#endif - } - - void reduce_thread_stat_state() - { - for (IndexType i = 1; i < thread_stat_states_.size(); ++i) - { - thread_stat_states_[0] += thread_stat_states_[i]; - } - } - - void reduce_thread_bias_state() - { - for (IndexType i = 1; i < thread_bias_states_.size(); ++i) - { - thread_bias_states_[0] += thread_bias_states_[i]; - } - } - - void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) { - - const LearnFloatType local_learning_rate = - learning_rate * learning_rate_scale_; - - // Since the weight matrix updates only the columns corresponding to the features that appeared in the input, - // Correct the learning rate and adjust the scale without using momentum - const LearnFloatType effective_learning_rate = - static_cast(local_learning_rate / (1.0 - momentum_)); - - reduce_thread_bias_state(); - - auto& main_thread_state = thread_bias_states_[0]; - -#if defined(USE_BLAS) - - cblas_saxpy( - kHalfDimensions, -local_learning_rate, - main_thread_state.biases_diff_, 1, biases_, 1 - ); - -#else - - Blas::saxpy( - kHalfDimensions, -local_learning_rate, - main_thread_state.biases_diff_, 1, biases_, 1 - ); - -#endif - - thread_pool.execute_with_workers( - [&, num_threads = thread_pool.size()](Thread& th) { - const auto thread_index = th.thread_idx(); - - for (IndexType b = 0; b < batch_size_; ++b) { - const IndexType batch_offset = kOutputDimensions * b; - - for 
(IndexType c = 0; c < 2; ++c) { - const IndexType output_offset = batch_offset + kHalfDimensions * c; - for (const auto& feature : batch_[b].training_features[c]) { - const IndexType feature_index = feature.get_index(); - const IndexType weights_offset = - kHalfDimensions * feature_index; -#if defined (USE_SSE2) - _mm_prefetch(reinterpret_cast(&weights_[weights_offset]), _MM_HINT_T2); -#endif - - // We assign each bucket a continuous range of bits at least - // of cache line size to prevent false sharing. - // For HalfKP this is enough to saturate about 80 threads. - const IndexType thread_bucket = - (feature_index / BitsetType::best_concurrent_access_stride) - % num_threads; - - if (thread_bucket != thread_index) - continue; - - // This operation can be performed safely because - // each thread accesses a different memory location - // (even a different cache line) - observed_features.set(feature_index); - - const auto scale = static_cast( - effective_learning_rate / feature.get_count()); - -#if defined (USE_BLAS) - - cblas_saxpy( - kHalfDimensions, -scale, - &gradients_[output_offset], 1, - &weights_[weights_offset], 1 - ); - -#else - - Blas::saxpy( - kHalfDimensions, -scale, - &gradients_[output_offset], - &weights_[weights_offset] - ); - -#endif - } - } - } - } - ); - - thread_pool.wait_for_workers_finished(); - } - - private: - // constructor - Trainer(LayerType* target_layer) : - batch_(nullptr), - batch_size_(0), - target_layer_(target_layer), - biases_(), - weights_(), - momentum_(0.2), - learning_rate_scale_(1.0) { - - dequantize_parameters(); - } - - // Weight saturation and parameterization - void quantize_parameters() { - for (IndexType i = 0; i < kHalfDimensions; ++i) { - target_layer_->biases_[i] = - round(biases_[i] * kBiasScale); - } - - std::vector training_features; - - Threads.for_each_index_with_workers( - 0, RawFeatures::kDimensions, - [this, training_features](Thread&, int j) mutable { - training_features.clear(); - Features::Factorizer::append_training_features( - j, &training_features); - - for (IndexType i = 0; i < kHalfDimensions; ++i) { - double sum = 0.0; - for (const auto& feature : training_features) { - sum += weights_[kHalfDimensions * feature.get_index() + i]; - } - - target_layer_->weights_[kHalfDimensions * j + i] = - round(sum * kWeightScale); - } - } - ); - Threads.wait_for_workers_finished(); - } - - void reset_stats() { - for (auto& state : thread_stat_states_) - state.reset(); - } - - // read parameterized integer - void dequantize_parameters() { - for (IndexType i = 0; i < kHalfDimensions; ++i) { - biases_[i] = static_cast( - target_layer_->biases_[i] / kBiasScale); - } - - std::fill(std::begin(weights_), std::end(weights_), +kZero); - - for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) { - weights_[i] = static_cast( - target_layer_->weights_[i] / kWeightScale); - } - - reset_stats(); - - for (auto& state : thread_bias_states_) - state.reset(); - } - - // Set the weight corresponding to the feature that does not appear in the learning data to 0 - void clear_unobserved_feature_weights() { - for (IndexType i = 0; i < kInputDimensions; ++i) { - if (!observed_features.test(i)) { - std::fill(std::begin(weights_) + kHalfDimensions * i, - std::begin(weights_) + kHalfDimensions * (i + 1), +kZero); - } - } - - quantize_parameters(); - } - - // Check if there are any problems with learning - void check_health() { - - constexpr LearnFloatType kPreActivationLimit = - std::numeric_limits::max() / - kWeightScale; - - 
reduce_thread_stat_state(); - - auto& main_thread_state = thread_stat_states_[0]; - - const auto largest_min_activation = *std::max_element( - std::begin(main_thread_state.min_activations_), std::end(main_thread_state.min_activations_)); - const auto smallest_max_activation = *std::min_element( - std::begin(main_thread_state.max_activations_), std::end(main_thread_state.max_activations_)); - - double abs_bias_sum = 0.0; - double abs_weight_sum = 0.0; - - for(auto b : biases_) - abs_bias_sum += std::abs(b); - - std::vector training_features; - for (IndexType j = 0; j < RawFeatures::kDimensions; ++j) - { - training_features.clear(); - Features::Factorizer::append_training_features( - j, &training_features); - - for (const auto& feature : training_features) { - for (IndexType i = 0; i < kHalfDimensions; ++i) { - abs_weight_sum += std::abs(weights_[kHalfDimensions * feature.get_index() + i]); - } - } - } - - auto out = sync_region_cout.new_region(); - - out << "INFO (check_health):" - << " layer " << LayerType::kLayerIndex - << " - " << LayerType::get_name() - << std::endl; - - out << " - observed " << observed_features.count() - << " (out of " << kInputDimensions << ") features" - << std::endl; - - out << " - (min, max) of pre-activations = " - << main_thread_state.min_pre_activation_ << ", " - << main_thread_state.max_pre_activation_ << " (limit = " - << kPreActivationLimit << ")" - << std::endl; - - out << " - largest min activation = " << largest_min_activation - << " , smallest max activation = " << smallest_max_activation - << std::endl; - - out << " - avg_abs_bias = " << abs_bias_sum / std::size(biases_) << std::endl; - out << " - avg_abs_weight = " << abs_weight_sum / std::size(weights_) << std::endl; - - out << " - clipped " << static_cast(main_thread_state.num_clipped_) / main_thread_state.num_total_ * 100.0 << "% of outputs" - << std::endl; - - out.unlock(); - - reset_stats(); - } - - // number of input/output dimensions - static constexpr IndexType kInputDimensions = - Features::Factorizer::get_dimensions(); - static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions; - static constexpr IndexType kHalfDimensions = LayerType::kHalfDimensions; - - // Coefficient used for parameterization - static constexpr LearnFloatType kActivationScale = - std::numeric_limits::max(); - static constexpr LearnFloatType kBiasScale = kActivationScale; - static constexpr LearnFloatType kWeightScale = kActivationScale; - - // LearnFloatType constant - static constexpr LearnFloatType kZero = static_cast(0.0); - static constexpr LearnFloatType kOne = static_cast(1.0); - - // mini batch - const Example* batch_; - IndexType batch_size_; - - // layer to learn - LayerType* const target_layer_; - - // parameter - alignas(kCacheLineSize) LearnFloatType biases_[kHalfDimensions]; - alignas(kCacheLineSize) - LearnFloatType weights_[kHalfDimensions * kInputDimensions]; - - // Buffer used for updating parameters - std::vector> gradients_; - - // Forward propagation buffer - std::vector> output_; - - // Features that appeared in the training data - using BitsetType = LargeBitset; - BitsetType observed_features; - - // hyper parameter - LearnFloatType momentum_; - LearnFloatType learning_rate_scale_; - - struct alignas(kCacheLineSize) ThreadStatState - { - alignas(kCacheLineSize) LearnFloatType min_activations_[kHalfDimensions]; - alignas(kCacheLineSize) LearnFloatType max_activations_[kHalfDimensions]; - LearnFloatType min_pre_activation_; - LearnFloatType max_pre_activation_; - uint64_t 
num_clipped_; - uint64_t num_total_; - - ThreadStatState() { reset(); } - - ThreadStatState& operator+=(const ThreadStatState& other) - { - for (IndexType i = 0; i < kHalfDimensions; ++i) - { - min_activations_[i] = std::min(min_activations_[i], other.min_activations_[i]); - } - - for (IndexType i = 0; i < kHalfDimensions; ++i) - { - max_activations_[i] = std::max(max_activations_[i], other.max_activations_[i]); - } - - min_pre_activation_ = std::min(min_pre_activation_, other.min_pre_activation_); - max_pre_activation_ = std::max(max_pre_activation_, other.max_pre_activation_); - - num_clipped_ += other.num_clipped_; - num_total_ += other.num_total_; - - return *this; - } - - void reset() - { - std::fill(std::begin(min_activations_), std::end(min_activations_), std::numeric_limits::max()); - std::fill(std::begin(max_activations_), std::end(max_activations_), std::numeric_limits::lowest()); - min_pre_activation_ = std::numeric_limits::max(); - max_pre_activation_ = std::numeric_limits::lowest(); - num_clipped_ = 0; - num_total_ = 0; - } - }; - - struct alignas(kCacheLineSize) ThreadBiasState - { - alignas(kCacheLineSize) LearnFloatType biases_diff_[kHalfDimensions]; - - ThreadBiasState() { reset(); } - - ThreadBiasState& operator+=(const ThreadBiasState& other) - { - for (IndexType i = 0; i < kHalfDimensions; ++i) - { - biases_diff_[i] += other.biases_diff_[i]; - } - - return *this; - } - - void reset() - { - std::fill(std::begin(biases_diff_), std::end(biases_diff_), 0.0f); - } - }; - - std::vector> thread_stat_states_; - std::vector> thread_bias_states_; - }; - -} // namespace Eval::NNUE - -#endif diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h deleted file mode 100644 index ff1265dc..00000000 --- a/src/nnue/trainer/trainer_input_slice.h +++ /dev/null @@ -1,383 +0,0 @@ -#ifndef _NNUE_TRAINER_INPUT_SLICE_H_ -#define _NNUE_TRAINER_INPUT_SLICE_H_ - -#include "trainer.h" - -#include "extra/stockfish_blas.h" - -#include "learn/learn.h" - -#include "nnue/layers/input_slice.h" - -#include "thread.h" - -// Specialization of NNUE evaluation function learning class template for InputSlice -namespace Eval::NNUE { - - // Learning: Input layer - // This is tricky. It exists because when there's more than one trainer - // on top of a single feature transformer we want to only call propagate/backpropagate - // on the feature transformer once. This is straightforward in the old - // multithreading case, because propagate/backpropagate is called just once from the - // main thread. But with the current implementation of coarser multithreading - // we end up calling each method from each thread. Therefore we have to keep - // the num_calls and current_operation per thread basis, each thread must work - // on its designated batch slice, and the only synchronization points are - // step_start and step_end - for which we use state of the first thread. - // Each thread requires their own bookkeeping because it's possible that - // one thread is still in propagate of some batch slice while the other thread - // is doing backpropagate of some other slice. We also ensure the thread state - // isn't suspectible to false sharing by using a full cache line for the state. 
- class SharedInputTrainer { - public: - // factory function - static std::shared_ptr create( - FeatureTransformer* ft) { - - static std::shared_ptr instance; - - if (!instance) { - instance.reset(new SharedInputTrainer(ft)); - } - - ++instance->num_referrers_; - - return instance; - } - - // Set options such as hyperparameters - void send_message(Message* message) { - auto& thread_state = thread_states_[0]; - - if (thread_state.num_calls == 0) { - thread_state.current_operation = Operation::kSendMessage; - feature_transformer_trainer_->send_message(message); - } - - assert(thread_state.current_operation == Operation::kSendMessage); - - if (++thread_state.num_calls == num_referrers_) { - thread_state.num_calls = 0; - thread_state.current_operation = Operation::kNone; - } - } - - // Initialize the parameters with random numbers - template - void initialize(RNG& rng) { - auto& thread_state = thread_states_[0]; - - if (thread_state.num_calls == 0) { - thread_state.current_operation = Operation::kInitialize; - feature_transformer_trainer_->initialize(rng); - } - - assert(thread_state.current_operation == Operation::kInitialize); - - if (++thread_state.num_calls == num_referrers_) { - thread_state.num_calls = 0; - thread_state.current_operation = Operation::kNone; - } - } - - const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector::const_iterator batch_begin, std::vector::const_iterator batch_end) - { - const auto size = batch_end - batch_begin; - - if ((long)gradients_.size() < (long)kInputDimensions * size) { - gradients_.resize(kInputDimensions * size); - } - - if (thread_states_.size() < thread_pool.size()) - { - thread_states_.resize(thread_pool.size()); - } - - batch_size_ = size; - - auto& thread_state = thread_states_[0]; - - if (thread_state.num_calls == 0) { - thread_state.current_operation = Operation::kStepStart; - output_ = feature_transformer_trainer_->step_start(thread_pool, batch_begin, batch_end); - } - - assert(thread_state.current_operation == Operation::kStepStart); - - if (++thread_state.num_calls == num_referrers_) { - thread_state.num_calls = 0; - thread_state.current_operation = Operation::kNone; - } - - return output_; - } - - // forward propagation - void propagate(Thread& th, uint64_t offset, uint64_t count) { - const auto thread_id = th.thread_idx(); - - auto& thread_state = thread_states_[thread_id]; - - if (thread_state.num_calls == 0) { - thread_state.current_operation = Operation::kPropagate; - feature_transformer_trainer_->propagate(th, offset, count); - } - - assert(thread_state.current_operation == Operation::kPropagate); - - if (++thread_state.num_calls == num_referrers_) { - thread_state.num_calls = 0; - thread_state.current_operation = Operation::kNone; - } - } - - // backpropagation - void backpropagate(Thread& th, - const LearnFloatType* gradients, - uint64_t offset, - uint64_t count) { - - const auto thread_id = th.thread_idx(); - - auto& thread_state = thread_states_[thread_id]; - - if (num_referrers_ == 1) { - feature_transformer_trainer_->backpropagate(th, gradients, offset, count); - return; - } - - if (thread_state.num_calls == 0) { - thread_state.current_operation = Operation::kBackPropagate; - for (IndexType b = offset; b < offset + count; ++b) { - const IndexType batch_offset = kInputDimensions * b; - for (IndexType i = 0; i < kInputDimensions; ++i) { - gradients_[batch_offset + i] = static_cast(0.0); - } - } - } - - assert(thread_state.current_operation == Operation::kBackPropagate); - - for (IndexType b = offset; b < offset + 
count; ++b) { - const IndexType batch_offset = kInputDimensions * b; - for (IndexType i = 0; i < kInputDimensions; ++i) { - gradients_[batch_offset + i] += gradients[batch_offset + i]; - } - } - - if (++thread_state.num_calls == num_referrers_) { - feature_transformer_trainer_->backpropagate( - th, gradients_.data(), offset, count); - thread_state.num_calls = 0; - thread_state.current_operation = Operation::kNone; - } - } - - void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) { - auto& thread_state = thread_states_[0]; - - if (thread_state.num_calls == 0) { - thread_state.current_operation = Operation::kStepEnd; - feature_transformer_trainer_->step_end(thread_pool, learning_rate); - } - - assert(thread_state.current_operation == Operation::kStepEnd); - - if (++thread_state.num_calls == num_referrers_) { - thread_state.num_calls = 0; - thread_state.current_operation = Operation::kNone; - } - } - - private: - // constructor - SharedInputTrainer(FeatureTransformer* ft) : - batch_size_(0), - num_referrers_(0), - thread_states_(1), - feature_transformer_trainer_(Trainer::create( - ft)), - output_(nullptr) { - } - - // number of input/output dimensions - static constexpr IndexType kInputDimensions = - FeatureTransformer::kOutputDimensions; - - // type of processing - enum class Operation { - kNone, - kSendMessage, - kInitialize, - kStepStart, - kPropagate, - kBackPropagate, - kStepEnd, - }; - - // number of samples in mini-batch - IndexType batch_size_; - - // number of layers sharing this layer as input - std::uint32_t num_referrers_; - - struct alignas(kCacheLineSize) ThreadState - { - std::uint32_t num_calls{0}; - - // current processing type - Operation current_operation = Operation::kNone; - }; - - // Number of times the current process has been called - std::vector> thread_states_; - - // Trainer of input feature converter - const std::shared_ptr> - feature_transformer_trainer_; - - // pointer to output shared for forward propagation - const LearnFloatType* output_; - - // buffer for back propagation - std::vector> gradients_; - }; - - // Learning: Input layer - template - class Trainer> { - private: - // Type of layer to learn - using LayerType = Layers::InputSlice; - - public: - // factory function - static std::shared_ptr create( - LayerType* /*target_layer*/, FeatureTransformer* ft) { - - return std::shared_ptr(new Trainer(ft)); - } - - // Set options such as hyperparameters - void send_message(Message* message) { - shared_input_trainer_->send_message(message); - } - - // Initialize the parameters with random numbers - template - void initialize(RNG& rng) { - shared_input_trainer_->initialize(rng); - } - - const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector::const_iterator batch_begin, std::vector::const_iterator batch_end) - { - const auto size = batch_end - batch_begin; - - if ((long)output_.size() < (long)kOutputDimensions * size) { - output_.resize(kOutputDimensions * size); - gradients_.resize(kInputDimensions * size); - } - - batch_size_ = size; - - input_ = shared_input_trainer_->step_start(thread_pool, batch_begin, batch_end); - - return output_.data(); - } - - // forward propagation - void propagate(Thread& th, uint64_t offset, uint64_t count) { - - shared_input_trainer_->propagate(th, offset, count); - - for (IndexType b = offset; b < offset + count; ++b) { - const IndexType input_offset = kInputDimensions * b; - const IndexType output_offset = kOutputDimensions * b; - -#if defined(USE_BLAS) - - cblas_scopy( - kOutputDimensions, 
&input_[input_offset + Offset], 1, - &output_[output_offset], 1 - ); -#else - - Blas::scopy( - kOutputDimensions, &input_[input_offset + Offset], 1, - &output_[output_offset], 1 - ); - -#endif - } - } - - // backpropagation - void backpropagate(Thread& th, - const LearnFloatType* gradients, - uint64_t offset, - uint64_t count) { - - for (IndexType b = offset; b < offset + count; ++b) - { - const IndexType input_offset = kInputDimensions * b; - const IndexType output_offset = kOutputDimensions * b; - - IndexType i = 0; - if constexpr (Offset > 0) - { - for (; i < Offset; ++i) { - gradients_[input_offset + i] = static_cast(0.0); - } - } - - for (; i < Offset + kOutputDimensions; ++i) { - gradients_[input_offset + i] = gradients[output_offset + i - Offset]; - } - - if constexpr (Offset + kOutputDimensions < kInputDimensions) - { - for (; i < kInputDimensions; ++i) - { - gradients_[input_offset + i] = static_cast(0.0); - } - } - } - - shared_input_trainer_->backpropagate(th, gradients_.data(), offset, count); - } - - void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) { - shared_input_trainer_->step_end(thread_pool, learning_rate); - } - - private: - // constructor - Trainer(FeatureTransformer* ft) : - batch_size_(0), - shared_input_trainer_(SharedInputTrainer::create(ft)) { - } - - // number of input/output dimensions - static constexpr IndexType kInputDimensions = - FeatureTransformer::kOutputDimensions; - static constexpr IndexType kOutputDimensions = OutputDimensions; - static_assert(Offset + kOutputDimensions <= kInputDimensions, ""); - - // number of samples in mini-batch - IndexType batch_size_; - - const LearnFloatType* input_; - - // Trainer of shared input layer - const std::shared_ptr shared_input_trainer_; - - // Forward propagation buffer - std::vector> output_; - - // buffer for back propagation - std::vector> gradients_; - }; - -} // namespace Eval::NNUE - -#endif diff --git a/src/nnue/trainer/trainer_sum.h b/src/nnue/trainer/trainer_sum.h deleted file mode 100644 index 88ff302c..00000000 --- a/src/nnue/trainer/trainer_sum.h +++ /dev/null @@ -1,201 +0,0 @@ -#ifndef _NNUE_TRAINER_SUM_H_ -#define _NNUE_TRAINER_SUM_H_ - -#include "trainer.h" - -#include "extra/stockfish_blas.h" - -#include "learn/learn.h" - -#include "nnue/layers/sum.h" - -#include "thread.h" - -// Specialization of NNUE evaluation function learning class template for Sum -namespace Eval::NNUE { - - // Learning: A layer that sums the outputs of multiple layers - template - class Trainer> : - Trainer> { - private: - // Type of layer to learn - using LayerType = Layers::Sum; - using Tail = Trainer>; - - public: - // factory function - static std::shared_ptr create( - LayerType* target_layer, FeatureTransformer* ft) { - - return std::shared_ptr( - new Trainer(target_layer, ft)); - } - - // Set options such as hyperparameters - void send_message(Message* message) { - // The results of other member functions do not depend on the processing order, so - // Tail is processed first for the purpose of simplifying the implementation, but - // SendMessage processes Head first to make it easier to understand subscript correspondence - previous_layer_trainer_->send_message(message); - Tail::send_message(message); - } - - // Initialize the parameters with random numbers - template - void initialize(RNG& rng) { - Tail::initialize(rng); - previous_layer_trainer_->initialize(rng); - } - - // forward propagation - /*const*/ LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector& batch) { - batch_size_ = 
static_cast(batch.size()); - auto output = Tail::propagate(thread_pool, batch); - const auto head_output = previous_layer_trainer_->propagate(thread_pool, batch); - -#if defined(USE_BLAS) - - cblas_saxpy( - kOutputDimensions * batch_size_, 1.0, - head_output, 1, output, 1 - ); - -#else - - Blas::saxpy( - thread_pool, - kOutputDimensions * batch_size_, 1.0, - head_output, 1, output, 1 - ); - -#endif - return output; - } - - // backpropagation - void backpropagate(ThreadPool& thread_pool, - const LearnFloatType* gradients, - LearnFloatType learning_rate) { - - Tail::backpropagate(thread_pool, gradients, learning_rate); - previous_layer_trainer_->backpropagate(thread_pool, gradients, learning_rate); - } - - private: - // constructor - Trainer(LayerType* target_layer, FeatureTransformer* ft): - Tail(target_layer, ft), - batch_size_(0), - previous_layer_trainer_(Trainer::create( - &target_layer->previous_layer_, ft)), - target_layer_(target_layer) { - } - - // number of input/output dimensions - static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions; - - // make subclass friend - template - friend class Trainer; - - // number of samples in mini-batch - IndexType batch_size_; - - // Trainer of the previous layer - const std::shared_ptr> previous_layer_trainer_; - - // layer to learn - LayerType* const target_layer_; - }; - - - // Learning: Layer that takes the sum of the outputs of multiple layers (when there is one template argument) - template - class Trainer> { - private: - // Type of layer to learn - using LayerType = Layers::Sum; - - public: - // factory function - static std::shared_ptr create( - LayerType* target_layer, FeatureTransformer* ft) { - - return std::shared_ptr( - new Trainer(target_layer, ft)); - } - - // Set options such as hyperparameters - void send_message(Message* message) { - previous_layer_trainer_->send_message(message); - } - - // Initialize the parameters with random numbers - template - void initialize(RNG& rng) { - previous_layer_trainer_->initialize(rng); - } - - // forward propagation - /*const*/ LearnFloatType* propagate(const std::vector& batch) { - if (output_.size() < kOutputDimensions * batch.size()) { - output_.resize(kOutputDimensions * batch.size()); - } - - batch_size_ = static_cast(batch.size()); - const auto output = previous_layer_trainer_->propagate(batch); - -#if defined(USE_BLAS) - cblas_scopy(kOutputDimensions * batch_size_, output, 1, &output_[0], 1); -#else - for (IndexType b = 0; b < batch_size_; ++b) { - const IndexType batch_offset = kOutputDimensions * b; - for (IndexType i = 0; i < kOutputDimensions; ++i) { - output_[batch_offset + i] = output[batch_offset + i]; - } - } - -#endif - return output_.data(); - } - - // backpropagation - void backpropagate(const LearnFloatType* gradients, - LearnFloatType learning_rate) { - - previous_layer_trainer_->backpropagate(gradients, learning_rate); - } - - private: - // constructor - Trainer(LayerType* target_layer, FeatureTransformer* ft) : - batch_size_(0), - previous_layer_trainer_(Trainer::create( - &target_layer->previous_layer_, ft)), - target_layer_(target_layer) { - } - - // number of input/output dimensions - static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions; - - // make subclass friend - template - friend class Trainer; - - // number of samples in mini-batch - IndexType batch_size_; - - // Trainer of the previous layer - const std::shared_ptr> previous_layer_trainer_; - - // layer to learn - LayerType* const target_layer_; - - // Forward 
propagation buffer - std::vector> output_; - }; - -} // namespace Eval::NNUE - -#endif diff --git a/src/uci.cpp b/src/uci.cpp index 7da2881f..9a9a9e3c 100644 --- a/src/uci.cpp +++ b/src/uci.cpp @@ -22,11 +22,9 @@ #include #include -#include "extra/stockfish_blas.h" #include "nnue/evaluate_nnue.h" #include "evaluate.h" #include "movegen.h" -#include "nnue/nnue_test_command.h" #include "position.h" #include "search.h" #include "syzygy/tbprobe.h" @@ -37,7 +35,6 @@ #include "learn/gensfen.h" #include "learn/gensfen_nonpv.h" -#include "learn/learn.h" #include "learn/convert.h" #include "learn/transform.h" #include "learn/stats.h" @@ -49,17 +46,6 @@ extern vector setup_bench(const Position&, istream&); // FEN string of the initial position, normal chess const char* StartFEN = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1"; -void test_cmd(Position& pos, istringstream& is) -{ - // Initialize as it may be searched. - Eval::NNUE::init(); - - std::string param; - is >> param; - - if (param == "nnue") Eval::NNUE::test_command(pos, is); -} - namespace { // position() is called when engine receives the "position" UCI command. @@ -344,7 +330,6 @@ void UCI::loop(int argc, char* argv[]) { else if (token == "gensfen") Learner::gensfen(is); else if (token == "gensfen_nonpv") Learner::gensfen_nonpv(is); - else if (token == "learn") Learner::learn(is); else if (token == "convert") Learner::convert(is); else if (token == "convert_bin") Learner::convert_bin(is); else if (token == "convert_plain") Learner::convert_plain(is); @@ -361,17 +346,7 @@ void UCI::loop(int argc, char* argv[]) { std::cout << th.thread_idx() << '\n'; }); } - else if (token == "blastest") - { - Blas::test(Threads); - } - else if (token == "blasbench") - { - Blas::bench(Threads); - } - // test command - else if (token == "test") test_cmd(pos, is); else sync_cout << "Unknown command: " << cmd << sync_endl;