remove learn

Tomasz Sobczyk
2021-04-18 19:04:14 +02:00
parent 17946c5954
commit 3101ae7973
25 changed files with 2 additions and 7270 deletions


@@ -47,9 +47,7 @@ PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 1000 sfen_format bin output_file_name
SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \
material.cpp misc.cpp movegen.cpp movepick.cpp pawns.cpp position.cpp psqt.cpp \
search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \
extra/stockfish_blas.cpp \
nnue/evaluate_nnue.cpp \
nnue/evaluate_nnue_learner.cpp \
nnue/features/half_kp.cpp \
nnue/features/half_ka.cpp \
nnue/features/half_relative_kp.cpp \
@@ -59,9 +57,7 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
nnue/features/a.cpp \
nnue/features/castling_right.cpp \
nnue/features/enpassant.cpp \
nnue/nnue_test_command.cpp \
learn/sfen_packer.cpp \
learn/learn.cpp \
learn/gensfen.cpp \
learn/gensfen_nonpv.cpp \
learn/opening_book.cpp \

File diff suppressed because it is too large.


@@ -1,140 +0,0 @@
#ifndef _STOCKFISH_BLAS_H_
#define _STOCKFISH_BLAS_H_
struct ThreadPool;
#if defined (_MSC_VER)
#define SF_BLAS_RESTRICT __restrict
#elif defined (__INTEL_COMPILER)
#define SF_BLAS_RESTRICT restrict
#elif defined (__clang__)
#define SF_BLAS_RESTRICT __restrict__
#elif defined (__GNUC__)
#define SF_BLAS_RESTRICT __restrict__
#endif
namespace Blas {
enum struct MatrixLayout {
RowMajor = 101,
ColMajor = 102
};
enum struct MatrixTranspose {
NoTrans = 111,
Trans = 112
};
void scopy(
const int N,
const float * SF_BLAS_RESTRICT X,
float * SF_BLAS_RESTRICT Y
);
void scopy(
const int N,
const float * SF_BLAS_RESTRICT X, const int incX,
float * SF_BLAS_RESTRICT Y, const int incY
);
void scopy(
ThreadPool& thread_pool,
const int N,
const float * SF_BLAS_RESTRICT X,
float * SF_BLAS_RESTRICT Y
);
void scopy(
ThreadPool& thread_pool,
const int N,
const float * SF_BLAS_RESTRICT X, const int incX,
float * SF_BLAS_RESTRICT Y, const int incY
);
void sscal(
const int N,
const float alpha,
float * SF_BLAS_RESTRICT X
);
void sscal(
const int N,
const float alpha,
float * SF_BLAS_RESTRICT X, const int incX
);
void sscal(
ThreadPool& thread_pool,
const int N,
const float alpha,
float * SF_BLAS_RESTRICT X
);
void sscal(
ThreadPool& thread_pool,
const int N,
const float alpha,
float * SF_BLAS_RESTRICT X, const int incX
);
void saxpy(
const int N,
const float alpha,
const float * SF_BLAS_RESTRICT X,
float * SF_BLAS_RESTRICT Y
);
void saxpy(
const int N,
const float alpha,
const float * SF_BLAS_RESTRICT X, const int incX,
float * SF_BLAS_RESTRICT Y, const int incY
);
void saxpy(
ThreadPool& thread_pool,
const int N,
const float alpha,
const float * SF_BLAS_RESTRICT X,
float * SF_BLAS_RESTRICT Y
);
void saxpy(
ThreadPool& thread_pool,
const int N,
const float alpha,
const float * SF_BLAS_RESTRICT X, const int incX,
float * SF_BLAS_RESTRICT Y, const int incY
);
void sgemm(
ThreadPool& thread_pool,
MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB,
const int M, const int N, const int K,
const float alpha,
const float * SF_BLAS_RESTRICT A, const int lda,
const float * SF_BLAS_RESTRICT B, const int ldb,
const float beta,
float * SF_BLAS_RESTRICT C, const int ldc
);
void sgemm(
MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB,
const int M, const int N, const int K,
const float alpha,
const float * SF_BLAS_RESTRICT A, const int lda,
const float * SF_BLAS_RESTRICT B, const int ldb,
const float beta,
float * SF_BLAS_RESTRICT C, const int ldc
);
void test(
ThreadPool& thread_pool
);
void bench(
ThreadPool& thread_pool
);
}
#endif
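For context, the removed shim mirrors standard CBLAS conventions (the enum values 101/102 and 111/112 match CBLAS), so a single-threaded matrix product through the sgemm overload declared above would presumably have looked like the sketch below; the include path, matrices and leading dimensions are illustrative assumptions only:

#include "extra/stockfish_blas.h"   // header shown above; path assumed from the Makefile entry

void sgemm_example()
{
    // Hypothetical usage sketch: C = 1.0 * A * B + 0.0 * C for a row-major 2x3 times 3x2 product.
    float A[2 * 3] = { 1, 2, 3,
                       4, 5, 6 };
    float B[3 * 2] = { 1, 0,
                       0, 1,
                       1, 1 };
    float C[2 * 2] = {};
    Blas::sgemm(Blas::MatrixLayout::RowMajor,
                Blas::MatrixTranspose::NoTrans, Blas::MatrixTranspose::NoTrans,
                /*M=*/2, /*N=*/2, /*K=*/3,
                /*alpha=*/1.0f, A, /*lda=*/3,
                B, /*ldb=*/2,
                /*beta=*/0.0f, C, /*ldc=*/2);
    // Assuming cblas_sgemm semantics, C now holds { 4, 5, 10, 11 }.
}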


@@ -1,667 +0,0 @@
#ifndef LEARNER_AUTOGRAD_H
#define LEARNER_AUTOGRAD_H
#include <cmath>
#include <utility>
#include <type_traits>
#include <memory>
#include <tuple>
#include <optional>
#include <algorithm>
#include <cstdint>
namespace Learner
{
template <typename T>
struct ValueWithGrad
{
T value;
T grad;
ValueWithGrad& operator+=(const ValueWithGrad<T>& rhs)
{
value += rhs.value;
grad += rhs.grad;
return *this;
}
ValueWithGrad& operator-=(const ValueWithGrad<T>& rhs)
{
value -= rhs.value;
grad -= rhs.grad;
return *this;
}
ValueWithGrad& operator*=(T rhs)
{
value *= rhs;
grad *= rhs;
return *this;
}
ValueWithGrad& operator/=(T rhs)
{
value /= rhs;
grad /= rhs;
return *this;
}
[[nodiscard]] ValueWithGrad abs() const
{
return { std::abs(value), std::abs(grad) };
}
[[nodiscard]] ValueWithGrad clamp_grad(T max) const
{
return { value, std::clamp(grad, -max, max) };
}
};
}
namespace Learner::Autograd::UnivariateStatic
{
template <typename T>
struct Identity
{
using type = T;
};
template <typename T>
using Id = typename Identity<T>::type;
template <typename T>
using StoreValueOrRef = std::conditional_t<
std::is_rvalue_reference_v<T>,
std::remove_reference_t<T>,
const std::remove_reference_t<T>&
>;
namespace Detail
{
using CallIdType = std::uint32_t;
struct CallId
{
CallIdType call_id{};
constexpr CallId() :
call_id(0)
{
}
constexpr CallId(CallIdType id) :
call_id(id)
{
}
[[nodiscard]] bool operator==(CallId rhs) const noexcept
{
return call_id == rhs.call_id;
}
[[nodiscard]] bool operator!=(CallId rhs) const noexcept
{
return call_id != rhs.call_id;
}
};
[[nodiscard]] inline CallId next_call_id()
{
static thread_local CallIdType s_call_id = 0;
return CallId{ s_call_id++ };
}
template <typename T, typename Tuple>
struct TupleContains;
template <typename T, typename... Us>
struct TupleContains<T, std::tuple<Us...>> : std::disjunction<std::is_same<T, Us>...> {};
template <typename T, typename Tuple>
constexpr bool TupleContainsV = TupleContains<T, Tuple>::value;
template <typename... Ts>
constexpr bool AreAllConstantV = (std::remove_reference_t<Ts>::is_constant && ...);
}
template <typename T, typename ChildT>
struct Evaluable
{
constexpr Evaluable() = default;
// We append a unique call id so that we can invalidate the cache when
// the next computation starts. A single evaluation should see
// the same call_id at every node.
template <typename... ArgsTs>
[[nodiscard]] auto eval(const std::tuple<ArgsTs...>& args) const
{
const auto call_id = Detail::next_call_id();
const auto new_args = std::tuple_cat(args, std::tuple(call_id));
return ValueWithGrad<T>{ value(new_args), grad(new_args) };
}
template <typename... ArgsTs,
typename SFINAE = std::enable_if_t<Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
[[nodiscard]] auto value(const std::tuple<ArgsTs...>& args) const
{
const ChildT* this_ = static_cast<const ChildT*>(this);
const auto call_id = std::get<Detail::CallId>(args);
if (!value_cache.has_value() || value_cache_call_id != call_id)
{
value_cache_call_id = call_id;
value_cache = this_->calculate_value(args);
}
return *value_cache;
}
template <typename... ArgsTs,
typename SFINAE = std::enable_if_t<!Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
[[nodiscard]] auto value(const std::tuple<ArgsTs...>& args, ...) const
{
const auto call_id = Detail::next_call_id();
const auto new_args = std::tuple_cat(args, std::tuple(call_id));
return value(new_args);
}
template <typename... ArgsTs,
typename SFINAE = std::enable_if_t<Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
[[nodiscard]] auto grad(const std::tuple<ArgsTs...>& args) const
{
if constexpr (ChildT::is_constant)
{
return T(0.0);
}
else
{
const ChildT* this_ = static_cast<const ChildT*>(this);
const auto call_id = std::get<Detail::CallId>(args);
if (!grad_cache.has_value() || grad_cache_call_id != call_id)
{
grad_cache_call_id = call_id;
grad_cache = this_->calculate_grad(args);
}
return *grad_cache;
}
}
template <typename... ArgsTs,
typename SFINAE = std::enable_if_t<!Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
[[nodiscard]] auto grad(const std::tuple<ArgsTs...>& args, ...) const
{
const auto call_id = Detail::next_call_id();
const auto new_args = std::tuple_cat(args, std::tuple(call_id));
return grad(new_args);
}
private:
mutable std::optional<T> value_cache;
mutable std::optional<T> grad_cache;
mutable Detail::CallId value_cache_call_id{};
mutable Detail::CallId grad_cache_call_id{};
};
template <typename T, int I>
struct VariableParameter : Evaluable<T, VariableParameter<T, I>>
{
using ValueType = T;
static constexpr bool is_constant = false;
constexpr VariableParameter()
{
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
{
return std::get<I>(args);
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>&) const
{
return T(1.0);
}
};
template <typename T, int I>
struct ConstantParameter : Evaluable<T, ConstantParameter<T, I>>
{
using ValueType = T;
static constexpr bool is_constant = true;
constexpr ConstantParameter()
{
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
{
return std::get<I>(args);
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>&) const
{
return T(0.0);
}
};
template <typename T>
struct Constant : Evaluable<T, Constant<T>>
{
using ValueType = T;
static constexpr bool is_constant = true;
constexpr Constant(T x) :
m_x(std::move(x))
{
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>&) const
{
return m_x;
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>&) const
{
return T(0.0);
}
private:
T m_x;
};
// The "constant" may change between executions, but is assumed to be
// constant during a single evaluation.
template <typename T>
struct ConstantRef : Evaluable<T, ConstantRef<T>>
{
using ValueType = T;
static constexpr bool is_constant = true;
constexpr ConstantRef(const T& x) :
m_x(x)
{
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>&) const
{
return m_x;
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>&) const
{
return T(0.0);
}
private:
const T& m_x;
};
template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
struct Sum : Evaluable<T, Sum<LhsT, RhsT, T>>
{
using ValueType = T;
static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
constexpr Sum(LhsT&& lhs, RhsT&& rhs) :
m_lhs(std::forward<LhsT>(lhs)),
m_rhs(std::forward<RhsT>(rhs))
{
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
{
return m_lhs.value(args) + m_rhs.value(args);
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
{
return m_lhs.grad(args) + m_rhs.grad(args);
}
private:
StoreValueOrRef<LhsT> m_lhs;
StoreValueOrRef<RhsT> m_rhs;
};
template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
[[nodiscard]] constexpr auto operator+(LhsT&& lhs, RhsT&& rhs)
{
return Sum<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
}
template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
[[nodiscard]] constexpr auto operator+(LhsT&& lhs, Id<T> rhs)
{
return Sum<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
}
template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
[[nodiscard]] constexpr auto operator+(Id<T> lhs, RhsT&& rhs)
{
return Sum<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
}
template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
struct Difference : Evaluable<T, Difference<LhsT, RhsT, T>>
{
using ValueType = T;
static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
constexpr Difference(LhsT&& lhs, RhsT&& rhs) :
m_lhs(std::forward<LhsT>(lhs)),
m_rhs(std::forward<RhsT>(rhs))
{
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
{
return m_lhs.value(args) - m_rhs.value(args);
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
{
return m_lhs.grad(args) - m_rhs.grad(args);
}
private:
StoreValueOrRef<LhsT> m_lhs;
StoreValueOrRef<RhsT> m_rhs;
};
template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
[[nodiscard]] constexpr auto operator-(LhsT&& lhs, RhsT&& rhs)
{
return Difference<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
}
template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
[[nodiscard]] constexpr auto operator-(LhsT&& lhs, Id<T> rhs)
{
return Difference<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
}
template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
[[nodiscard]] constexpr auto operator-(Id<T> lhs, RhsT&& rhs)
{
return Difference<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
}
template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
struct Product : Evaluable<T, Product<LhsT, RhsT, T>>
{
using ValueType = T;
static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
constexpr Product(LhsT&& lhs, RhsT&& rhs) :
m_lhs(std::forward<LhsT>(lhs)),
m_rhs(std::forward<RhsT>(rhs))
{
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
{
return m_lhs.value(args) * m_rhs.value(args);
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
{
return m_lhs.grad(args) * m_rhs.value(args) + m_lhs.value(args) * m_rhs.grad(args);
}
private:
StoreValueOrRef<LhsT> m_lhs;
StoreValueOrRef<RhsT> m_rhs;
};
template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
[[nodiscard]] constexpr auto operator*(LhsT&& lhs, RhsT&& rhs)
{
return Product<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
}
template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
[[nodiscard]] constexpr auto operator*(LhsT&& lhs, Id<T> rhs)
{
return Product<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
}
template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
[[nodiscard]] constexpr auto operator*(Id<T> lhs, RhsT&& rhs)
{
return Product<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
}
template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
struct Quotient : Evaluable<T, Quotient<LhsT, RhsT, T>>
{
using ValueType = T;
static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
constexpr Quotient(LhsT&& lhs, RhsT&& rhs) :
m_lhs(std::forward<LhsT>(lhs)),
m_rhs(std::forward<RhsT>(rhs))
{
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
{
return m_lhs.value(args) / m_rhs.value(args);
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
{
auto g = m_rhs.value(args);
return (m_lhs.grad(args) * g - m_lhs.value(args) * m_rhs.grad(args)) / (g * g);
}
private:
StoreValueOrRef<LhsT> m_lhs;
StoreValueOrRef<RhsT> m_rhs;
};
template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
[[nodiscard]] constexpr auto operator/(LhsT&& lhs, RhsT&& rhs)
{
return Quotient<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
}
template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
[[nodiscard]] constexpr auto operator/(LhsT&& lhs, Id<T> rhs)
{
return Quotient<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
}
template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
[[nodiscard]] constexpr auto operator/(Id<T> lhs, RhsT&& rhs)
{
return Quotient<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
}
template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
struct Negation : Evaluable<T, Negation<ArgT, T>>
{
using ValueType = T;
static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
constexpr explicit Negation(ArgT&& x) :
m_x(std::forward<ArgT>(x))
{
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
{
return -m_x.value(args);
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
{
return -m_x.grad(args);
}
private:
StoreValueOrRef<ArgT> m_x;
};
template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
[[nodiscard]] constexpr auto operator-(ArgT&& x)
{
return Negation<ArgT&&>(std::forward<ArgT>(x));
}
template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
struct Sigmoid : Evaluable<T, Sigmoid<ArgT, T>>
{
using ValueType = T;
static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
constexpr explicit Sigmoid(ArgT&& x) :
m_x(std::forward<ArgT>(x))
{
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
{
return value_(m_x.value(args));
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
{
return m_x.grad(args) * grad_(m_x.value(args));
}
private:
StoreValueOrRef<ArgT> m_x;
[[nodiscard]] T value_(T x) const
{
return 1.0 / (1.0 + std::exp(-x));
}
[[nodiscard]] T grad_(T x) const
{
return value_(x) * (1.0 - value_(x));
}
};
template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
[[nodiscard]] constexpr auto sigmoid(ArgT&& x)
{
return Sigmoid<ArgT&&>(std::forward<ArgT>(x));
}
template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
struct Pow : Evaluable<T, Pow<ArgT, T>>
{
using ValueType = T;
static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
constexpr explicit Pow(ArgT&& x, Id<T> exponent) :
m_x(std::forward<ArgT>(x)),
m_exponent(std::move(exponent))
{
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
{
return std::pow(m_x.value(args), m_exponent);
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
{
return m_exponent * std::pow(m_x.value(args), m_exponent - T(1.0)) * m_x.grad(args);
}
private:
StoreValueOrRef<ArgT> m_x;
T m_exponent;
};
template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
[[nodiscard]] constexpr auto pow(ArgT&& x, Id<T> exp)
{
return Pow<ArgT&&>(std::forward<ArgT>(x), std::move(exp));
}
template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
struct Log : Evaluable<T, Log<ArgT, T>>
{
using ValueType = T;
static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
constexpr explicit Log(ArgT&& x) :
m_x(std::forward<ArgT>(x))
{
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
{
return value_(m_x.value(args));
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
{
return m_x.grad(args) * grad_(m_x.value(args));
}
private:
StoreValueOrRef<ArgT> m_x;
T value_(T x) const
{
return std::log(x);
}
T grad_(T x) const
{
return 1.0 / x;
}
};
template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
[[nodiscard]] constexpr auto log(ArgT&& x)
{
return Log<ArgT&&>(std::forward<ArgT>(x));
}
}
#endif
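A minimal usage sketch of the expression templates above (not taken from the removed learner code): an expression is built over argument slot 0 and evaluated once, yielding both the value and the derivative with respect to that slot.

#include "autograd.h"
#include <tuple>

void autograd_example()
{
    using namespace Learner::Autograd::UnivariateStatic;
    auto x = VariableParameter<double, 0>{};   // reads argument slot 0, d/dx = 1
    auto expr = x * x + 3.0;                   // f(x) = x^2 + 3
    auto r = expr.eval(std::tuple(2.0));       // ValueWithGrad<double>
    // r.value == 7.0, r.grad == 4.0
}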


@@ -13,7 +13,6 @@
#include "extra/nnue_data_binpack_format.h"
#include "nnue/evaluate_nnue.h"
#include "nnue/evaluate_nnue_learner.h"
#include "syzygy/tbprobe.h"
@@ -493,8 +492,8 @@ namespace Learner
// has it reached the max length or is a draw by fifty-move rule
// or by 3-fold repetition
if (ply >= params.write_maxply
|| pos.is_fifty_move_draw()
if (ply >= params.write_maxply
|| pos.is_fifty_move_draw()
|| pos.is_three_fold_repetition())
{
return 0;


@@ -13,7 +13,6 @@
#include "extra/nnue_data_binpack_format.h"
#include "nnue/evaluate_nnue.h"
#include "nnue/evaluate_nnue_learner.h"
#include "syzygy/tbprobe.h"


@@ -1,133 +0,0 @@
#ifndef __HALF_FLOAT_H__
#define __HALF_FLOAT_H__
// Half Float Library by yaneurao
// (16-bit float)
// Floating point arithmetic using a 16-bit type.
// Assumes that the float type generated by the compiler is in IEEE 754 format.
#include "types.h"
namespace HalfFloat
{
// IEEE 754 float 32 format is :
// sign(1bit) + exponent(8bits) + fraction(23bits) = 32bits
//
// Our float16 format is :
// sign(1bit) + exponent(5bits) + fraction(10bits) = 16bits
union float32_converter
{
int32_t n;
float f;
};
// 16-bit float
struct float16
{
// --- constructors
float16() {}
float16(int16_t n) { from_float((float)n); }
float16(int32_t n) { from_float((float)n); }
float16(float n) { from_float(n); }
float16(double n) { from_float((float)n); }
// build from a float
void from_float(float f) { *this = to_float16(f); }
// --- implicit converters
operator int32_t() const { return (int32_t)to_float(*this); }
operator float() const { return to_float(*this); }
operator double() const { return double(to_float(*this)); }
// --- operators
float16 operator += (float16 rhs) { from_float(to_float(*this) + to_float(rhs)); return *this; }
float16 operator -= (float16 rhs) { from_float(to_float(*this) - to_float(rhs)); return *this; }
float16 operator *= (float16 rhs) { from_float(to_float(*this) * to_float(rhs)); return *this; }
float16 operator /= (float16 rhs) { from_float(to_float(*this) / to_float(rhs)); return *this; }
float16 operator + (float16 rhs) const { return float16(*this) += rhs; }
float16 operator - (float16 rhs) const { return float16(*this) -= rhs; }
float16 operator * (float16 rhs) const { return float16(*this) *= rhs; }
float16 operator / (float16 rhs) const { return float16(*this) /= rhs; }
float16 operator - () const { return float16(-to_float(*this)); }
bool operator == (float16 rhs) const { return this->v_ == rhs.v_; }
bool operator != (float16 rhs) const { return !(*this == rhs); }
static void UnitTest() { unit_test(); }
private:
// --- entity
uint16_t v_;
// --- conversion between float and float16
static float16 to_float16(float f)
{
float32_converter c;
c.f = f;
u32 n = c.n;
// The sign bit is the MSB in both formats.
uint16_t sign_bit = (n >> 16) & 0x8000;
// The exponent of an IEEE 754 float32 is biased by +127, so we change the bias to +15 and limit it to 5 bits.
uint16_t exponent = (((n >> 23) - 127 + 15) & 0x1f) << 10;
// The fraction is limited to 10-bit.
uint16_t fraction = (n >> (23-10)) & 0x3ff;
float16 f_;
f_.v_ = sign_bit | exponent | fraction;
return f_;
}
static float to_float(float16 v)
{
u32 sign_bit = (v.v_ & 0x8000) << 16;
u32 exponent = ((((v.v_ >> 10) & 0x1f) - 15 + 127) & 0xff) << 23;
u32 fraction = (v.v_ & 0x3ff) << (23 - 10);
float32_converter c;
c.n = sign_bit | exponent | fraction;
return c.f;
}
// Not a real unit test, but it confirms that the arithmetic works. I'll fix the code later (maybe).
static void unit_test()
{
float16 a, b, c, d;
a = 1;
std::cout << (float)a << std::endl;
b = -118.625;
std::cout << (float)b << std::endl;
c = 2.5;
std::cout << (float)c << std::endl;
d = a + c;
std::cout << (float)d << std::endl;
c *= 1.5;
std::cout << (float)c << std::endl;
b /= 3;
std::cout << (float)b << std::endl;
float f1 = 1.5;
a += f1;
std::cout << (float)a << std::endl;
a += f1 * (float)a;
std::cout << (float)a << std::endl;
}
};
}
#endif // __HALF_FLOAT_H__
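As a worked example of the conversion above: 1.0f has the IEEE 754 float32 bit pattern 0x3F800000 (sign 0, biased exponent 127, fraction 0); re-biasing the exponent from +127 to +15 and keeping the top 10 fraction bits yields the float16 bit pattern 0x3C00, so the round trip below is exact. Values needing more than 10 fraction bits lose precision.

HalfFloat::float16 one = 1.0f;   // stored internally as 0x3C00
float back = (float)one;         // exactly 1.0f again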

File diff suppressed because it is too large.


@@ -1,148 +0,0 @@
#ifndef _LEARN_H_
#define _LEARN_H_
// ----------------------
// Floating point for learning
// ----------------------
// If this is set to double, the calculation is more accurate, but the memory used by the weight array doubles.
// Currently, with float, the weight array is about 4.5 times the size of the evaluation function file (about 4.5GB with KPPT).
// Even with double there is almost no difference in convergence, so this is fixed to float.
// when using float
using LearnFloatType = float;
// when using double
//typedef double LearnFloatType;
// when using float16
//#include "half_float.h"
//typedef HalfFloat::float16 LearnFloatType;
// ======================
// configure
// ======================
// ----------------------
// Learning with the method of elmo (WCSC27)
// ----------------------
#define LOSS_FUNCTION "ELMO_METHOD(WCSC27)"
// ----------------------
// Definition of struct used in Learner
// ----------------------
#include "autograd.h"
#include "packed_sfen.h"
#include "position.h"
#include <sstream>
#include <vector>
#include <mutex>
#include <string>
namespace Learner
{
// ----------------------
// Settings for learning
// ----------------------
// mini-batch size.
// Calculate the gradient by accumulating this number of positions.
// Making it smaller increases the number of update_weights() calls, so convergence is faster, but the gradient estimate is noisier.
// Making it larger decreases the number of update_weights() calls, so convergence is slower, but the gradient estimate is more accurate.
// I don't think you need to change this value in most cases.
constexpr std::size_t LEARN_MINI_BATCH_SIZE = 1000 * 1000 * 1;
// Saving interval of the evaluation function during learning: save every time this many positions have been processed.
// Naturally, the longer the saving interval, the shorter the learning time.
// The folder name is incremented for each save: 0/, 1/, 2/, ...
// By default, once every 100 million positions.
constexpr std::size_t LEARN_EVAL_SAVE_INTERVAL = 100'000'000ULL;
// Output rmse during learning only once every this many iterations.
// The rmse calculation runs on a single thread and takes some time, so reducing the output frequency is effective.
constexpr std::size_t LEARN_RMSE_OUTPUT_INTERVAL = 1;
// Learning from the generated game record
void learn(std::istringstream& is);
using CalcLossFunc = ValueWithGrad<double>(Value, Value, int, int);
struct Loss
{
double value() const
{
return m_loss.value;
}
double grad() const
{
return m_loss.grad;
}
uint64_t count() const
{
return m_count;
}
Loss() = default;
Loss(const Loss& other) :
m_loss(other.m_loss),
m_count(other.m_count)
{
}
Loss& operator += (const ValueWithGrad<double>& rhs)
{
std::unique_lock lock(m_mutex);
m_loss += rhs.abs();
m_count += 1;
return *this;
}
Loss& operator += (const Loss& rhs)
{
std::unique_lock lock(m_mutex);
m_loss += rhs.m_loss.abs();
m_count += rhs.m_count;
return *this;
}
void reset()
{
std::unique_lock lock(m_mutex);
m_loss = ValueWithGrad<double>{ 0.0, 0.0 };
m_count = 0;
}
template <typename StreamT>
void print_with_grad(const std::string& prefix, StreamT& s) const
{
s << " - " << prefix << "_loss = " << m_loss.value / (double)m_count << std::endl;
s << " - " << prefix << "_grad_norm = " << m_loss.grad / (double)m_count << std::endl;
}
template <typename StreamT>
void print_only_loss(const std::string& prefix, StreamT& s) const
{
s << " - " << prefix << "_loss = " << m_loss.value / (double)m_count << std::endl;
}
private:
ValueWithGrad<double> m_loss{ 0.0, 0.0 };
uint64_t m_count{0};
std::mutex m_mutex;
};
}
#endif // ifndef _LEARN_H_
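For orientation, a CalcLossFunc has the shape sketched below. This is not the removed elmo loss implementation, only an illustrative example of how ValueWithGrad and the removed autograd header fit together, assuming the usual 600-centipawn sigmoid scaling used elsewhere in the trainer:

#include "learn/learn.h"
#include <tuple>

Learner::ValueWithGrad<double> example_calc_loss(
    Value shallow, Value teacher, int /*game_result*/, int /*gamePly*/)
{
    using namespace Learner::Autograd::UnivariateStatic;
    auto x = VariableParameter<double, 0>{};   // shallow (trainer) eval, differentiated
    auto t = ConstantParameter<double, 1>{};   // teacher eval, treated as a constant
    // Squared difference of win probabilities.
    auto loss = pow(sigmoid(x / 600.0) - sigmoid(t / 600.0), 2.0);
    return loss.eval(std::tuple(double(shallow), double(teacher)));
}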


@@ -1,341 +0,0 @@
#include <random>
#include <fstream>
#include "evaluate_nnue.h"
#include "evaluate_nnue_learner.h"
#include "trainer/features/all_factorizers.h"
#include "trainer/trainer_feature_transformer.h"
#include "trainer/trainer_input_slice.h"
#include "trainer/trainer_affine_transform.h"
#include "trainer/trainer_clipped_relu.h"
#include "trainer/trainer_sum.h"
#include "position.h"
#include "uci.h"
#include "misc.h"
#include "thread_win32_osx.h"
#include "thread.h"
// Code for learning NNUE evaluation function
namespace Eval::NNUE {
namespace {
// learning data
std::vector<Example> examples;
// Mutex for exclusive control of examples
std::mutex examples_mutex;
// number of samples in mini-batch
uint64_t batch_size;
// random number generator
std::mt19937 rng;
// learner
std::shared_ptr<Trainer<Network>> trainer;
// Tell the learner options such as hyperparameters
void send_messages(std::vector<Message> messages) {
for (auto& message : messages) {
trainer->send_message(&message);
assert(message.num_receivers > 0);
}
}
} // namespace
// Initialize learning
void initialize_training(
const std::string& seed,
SynchronizedRegionLogger::Region& out) {
#if defined (OPENBLAS_VERSION)
openblas_set_num_threads(1);
#elif defined (INTEL_MKL_VERSION)
mkl_set_num_threads(1);
#endif
out << "INFO (initialize_training): Initializing NN training for "
<< get_architecture_string() << std::endl;
out << std::endl;
out << "Layers:\n"
<< get_layers_info() << std::endl;
out << std::endl;
out << "Factorizers:\n"
<< Features::Factorizer<RawFeatures>::get_factorizers_string() << std::endl;
out << std::endl;
assert(feature_transformer);
assert(network);
trainer = Trainer<Network>::create(network.get(), feature_transformer.get());
rng.seed(PRNG(seed).rand<uint64_t>());
if (Options["SkipLoadingEval"]) {
out << "INFO (initialize_training): Performing random net initialization.\n";
trainer->initialize(rng);
}
}
// set the number of samples in the mini-batch
void set_batch_size(uint64_t size) {
assert(size > 0);
batch_size = size;
}
// Set options such as hyperparameters
void set_options(const std::string& options) {
std::vector<Message> messages;
for (const auto& option : Algo::split(options, ',')) {
const auto fields = Algo::split(option, '=');
assert(fields.size() == 1 || fields.size() == 2);
if (fields.size() == 1) {
messages.emplace_back(fields[0]);
} else {
messages.emplace_back(fields[0], fields[1]);
}
}
send_messages(std::move(messages));
}
// Reread the evaluation function parameters for learning from the file
void restore_parameters(const std::string& dir_name) {
const std::string file_name = Path::combine(dir_name, NNUE::savedfileName);
std::ifstream stream(file_name, std::ios::binary);
#ifndef NDEBUG
bool result =
#endif
ReadParameters(stream);
#ifndef NDEBUG
assert(result);
#endif
send_messages({{"reset"}});
}
void finalize_net() {
send_messages({{"clear_unobserved_feature_weights"}});
}
// Add 1 sample of learning data
void add_example(
Position& pos,
Color rootColor,
Value discrete_nn_eval,
const Learner::PackedSfenValue& psv,
double weight) {
Example example;
if (rootColor == pos.side_to_move()) {
example.sign = 1;
} else {
example.sign = -1;
}
example.discrete_nn_eval = discrete_nn_eval;
example.psv = psv;
example.weight = weight;
Features::IndexList active_indices[2];
for (const auto trigger : kRefreshTriggers) {
RawFeatures::append_active_indices(pos, trigger, active_indices);
}
if (pos.side_to_move() != WHITE) {
active_indices[0].swap(active_indices[1]);
}
static thread_local std::vector<TrainingFeature> s_training_features;
auto& training_features = s_training_features;
for (const auto color : Colors) {
training_features.clear();
for (const auto base_index : active_indices[color]) {
static_assert(Features::Factorizer<RawFeatures>::get_dimensions() <
(1 << TrainingFeature::kIndexBits), "");
Features::Factorizer<RawFeatures>::append_training_features(
base_index, &training_features);
}
std::sort(training_features.begin(), training_features.end());
auto& unique_features = example.training_features[color];
unique_features.reserve(training_features.size());
for (const auto& feature : training_features) {
if (!unique_features.empty() &&
feature.get_index() == unique_features.back().get_index()) {
unique_features.back() += feature;
} else {
unique_features.push_back(feature);
}
}
}
std::lock_guard<std::mutex> lock(examples_mutex);
examples.push_back(std::move(example));
}
// update the evaluation function parameters
Learner::Loss update_parameters(
ThreadPool& thread_pool,
uint64_t epoch,
bool verbose,
double learning_rate,
double max_grad,
Learner::CalcLossFunc calc_loss)
{
using namespace Learner::Autograd::UnivariateStatic;
assert(batch_size > 0);
learning_rate /= batch_size;
std::lock_guard<std::mutex> lock(examples_mutex);
double abs_eval_diff_sum = 0.0;
double abs_discrete_eval_sum = 0.0;
double gradient_norm = 0.0;
bool collect_stats = verbose;
Learner::Loss loss_sum{};
std::vector<double> abs_eval_diff_sum_local(thread_pool.size(), 0.0);
std::vector<double> abs_discrete_eval_sum_local(thread_pool.size(), 0.0);
std::vector<double> gradient_norm_local(thread_pool.size(), 0.0);
std::vector<Learner::Loss> loss_sum_local(thread_pool.size());
auto prev_batch_begin = examples.end();
while ((long)(prev_batch_begin - examples.begin()) >= (long)batch_size) {
auto batch_begin = prev_batch_begin - batch_size;
auto batch_end = prev_batch_begin;
auto size = batch_end - batch_begin;
const auto network_output = trainer->step_start(thread_pool, batch_begin, batch_end);
std::vector<LearnFloatType> gradients(size);
thread_pool.for_each_index_chunk_with_workers(
std::size_t(0), size,
[&](Thread& th, std::size_t offset, std::size_t count) {
const auto thread_id = th.thread_idx();
trainer->propagate(th, offset, count);
for (std::size_t b = offset; b < offset + count; ++b) {
const auto& e = *(batch_begin + b);
const auto shallow = static_cast<Value>(round<std::int32_t>(
e.sign * network_output[b] * kPonanzaConstant));
const auto discrete = e.sign * e.discrete_nn_eval;
const auto& psv = e.psv;
auto loss = calc_loss(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
loss.grad = std::clamp(
loss.grad * e.sign * kPonanzaConstant * e.weight, -max_grad, max_grad);
gradients[b] = static_cast<LearnFloatType>(loss.grad);
loss_sum_local[thread_id] += loss;
// The discrete eval is only valid before the first backpropagation,
// that is, only for the first batch.
// Similarly, we only want gradients from one batch.
if (collect_stats)
{
abs_eval_diff_sum_local[thread_id] += std::abs(discrete - shallow);
abs_discrete_eval_sum_local[thread_id] += std::abs(discrete);
gradient_norm_local[thread_id] += std::abs(loss.grad);
}
}
trainer->backpropagate(th, gradients.data(), offset, count);
}
);
// We can asynchronously erase the examples that we used in the previous
// step. This can be done safely because we're no longer using those
// examples and the erase won't invalidate the iterators we still need.
examples.erase(prev_batch_begin, examples.end());
prev_batch_begin = batch_begin;
thread_pool.wait_for_workers_finished();
trainer->step_end(thread_pool, learning_rate);
collect_stats = false;
}
examples.erase(prev_batch_begin, examples.end());
if (verbose)
{
abs_eval_diff_sum = std::accumulate(abs_eval_diff_sum_local.begin(), abs_eval_diff_sum_local.end(), 0.0);
abs_discrete_eval_sum = std::accumulate(abs_discrete_eval_sum_local.begin(), abs_discrete_eval_sum_local.end(), 0.0);
gradient_norm = std::accumulate(gradient_norm_local.begin(), gradient_norm_local.end(), 0.0);
const double avg_abs_eval_diff = abs_eval_diff_sum / batch_size;
const double avg_abs_discrete_eval = abs_discrete_eval_sum / batch_size;
auto out = sync_region_cout.new_region();
out << "INFO (update_parameters):"
<< " epoch = " << epoch
<< " , avg_abs(trainer_eval-nnue_eval) = " << avg_abs_eval_diff
<< " , avg_abs(nnue_eval) = " << avg_abs_discrete_eval
<< " , avg_relative_error = " << avg_abs_eval_diff / avg_abs_discrete_eval
<< " , batch_size = " << batch_size
<< " , grad_norm = " << gradient_norm
<< std::endl;
} else {
// Display some progress but don't synchronize as
// we can't really decide when to release the output lock here
std::cout << '.';
}
send_messages({{"quantize_parameters"}});
for(auto& loss : loss_sum_local)
{
loss_sum += loss;
}
return loss_sum;
}
// Check if there are any problems with learning
void check_health() {
send_messages({{"check_health"}});
}
// save evaluation function parameters to a file
void save_eval(std::string dir_name) {
auto eval_dir = Path::combine(Options["EvalSaveDir"], dir_name);
auto out = sync_region_cout.new_region();
out << "INFO (save_eval): Saving current evaluation file in " << eval_dir << std::endl;
// mkdir() will fail if this folder already exists, but that is fine;
// we just want the folder to be created if it does not exist yet.
// Also, assume that the directories up to EvalSaveDir already exist.
sys::create_directories(eval_dir);
const std::string file_name = Path::combine(eval_dir, NNUE::savedfileName);
std::ofstream stream(file_name, std::ios::binary);
#ifndef NDEBUG
bool result =
#endif
WriteParameters(stream);
#ifndef NDEBUG
assert(result);
#endif
out << "INFO (save_eval): Finished saving evaluation file in " << eval_dir << std::endl;
}
} // namespace Eval::NNUE
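For reference, set_options() above takes a single comma-separated string of "name" or "name=value" entries, each forwarded to the trainer as a Message. A call might have looked like the sketch below; the option names are placeholders, not the trainer's actual message names:

// Hypothetical call; real option names depend on the trainer layers' message handlers.
Eval::NNUE::set_options("example_flag,example_scale=0.5");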


@@ -1,52 +0,0 @@
#ifndef _EVALUATE_NNUE_LEARNER_H_
#define _EVALUATE_NNUE_LEARNER_H_
#include "learn/learn.h"
#include "misc.h"
struct ThreadPool;
// Interface used for learning NNUE evaluation function
namespace Eval::NNUE {
// Initialize learning
void initialize_training(
const std::string& seed,
SynchronizedRegionLogger::Region& out);
// set the number of samples in the mini-batch
void set_batch_size(uint64_t size);
// Set options such as hyperparameters
void set_options(const std::string& options);
// Reread the evaluation function parameters for learning from the file
void restore_parameters(const std::string& dir_name);
// Add 1 sample of learning data
void add_example(
Position& pos,
Color rootColor,
Value discrete_nn_eval,
const Learner::PackedSfenValue& psv,
double weight);
// update the evaluation function parameters
Learner::Loss update_parameters(
ThreadPool& thread_pool,
uint64_t epoch,
bool verbose,
double learning_rate,
double max_grad,
Learner::CalcLossFunc calc_loss);
// Check if there are any problems with learning
void check_health();
void finalize_net();
void save_eval(std::string suffix);
} // namespace Eval::NNUE
#endif


@@ -1,215 +0,0 @@
#include "evaluate_nnue.h"
#include "nnue_test_command.h"
#include "thread.h"
#include "uci.h"
#include <set>
#include <fstream>
#define ASSERT(X) { \
if (!(X)) { \
std::cout \
<< "\nError : ASSERT(" << #X << "), " \
<< __FILE__ << "(" << __LINE__ << "): " \
<< __func__ << std::endl; \
std::this_thread::sleep_for(std::chrono::microseconds(3000)); \
*(int*)1 =0; \
} \
}
// UCI extended command for NNUE evaluation function
namespace Eval::NNUE {
namespace {
// Testing RawFeatures mainly for difference calculation
void test_features(Position& pos) {
const std::uint64_t num_games = 1000;
StateInfo si;
pos.set(StartFEN, false, &si, Threads.main());
const int MAX_PLY = 256; // test up to 256 plies
StateInfo state[MAX_PLY]; // StateInfo for up to the maximum number of plies
int ply; // number of plies from the initial position
PRNG prng(20171128);
std::uint64_t num_moves = 0;
std::vector<std::uint64_t> num_updates(kRefreshTriggers.size() + 1);
std::vector<std::uint64_t> num_resets(kRefreshTriggers.size());
constexpr IndexType kUnknown = -1;
std::vector<IndexType> trigger_map(RawFeatures::kDimensions, kUnknown);
auto make_index_sets = [&](const Position& position) {
std::vector<std::vector<std::set<IndexType>>> index_sets(
kRefreshTriggers.size(), std::vector<std::set<IndexType>>(2));
for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
Features::IndexList active_indices[2];
RawFeatures::append_active_indices(position, kRefreshTriggers[i],
active_indices);
for (const auto perspective : Colors) {
for (const auto index : active_indices[perspective]) {
ASSERT(index < RawFeatures::kDimensions);
ASSERT(index_sets[i][perspective].count(index) == 0);
ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
index_sets[i][perspective].insert(index);
trigger_map[index] = i;
}
}
}
return index_sets;
};
auto update_index_sets = [&](const Position& position, auto* index_sets) {
for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
Features::IndexList removed_indices[2], added_indices[2];
bool reset[2] = { false, false };
RawFeatures::append_changed_indices(position, kRefreshTriggers[i],
removed_indices, added_indices, reset);
for (const auto perspective : Colors) {
if (reset[perspective]) {
(*index_sets)[i][perspective].clear();
++num_resets[i];
} else {
for (const auto index : removed_indices[perspective]) {
ASSERT(index < RawFeatures::kDimensions);
ASSERT((*index_sets)[i][perspective].count(index) == 1);
ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
(*index_sets)[i][perspective].erase(index);
++num_updates.back();
++num_updates[i];
trigger_map[index] = i;
}
}
for (const auto index : added_indices[perspective]) {
ASSERT(index < RawFeatures::kDimensions);
ASSERT((*index_sets)[i][perspective].count(index) == 0);
ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
(*index_sets)[i][perspective].insert(index);
++num_updates.back();
++num_updates[i];
trigger_map[index] = i;
}
}
}
};
std::cout << "feature set: " << RawFeatures::get_name()
<< "[" << RawFeatures::kDimensions << "]" << std::endl;
std::cout << "start testing with random games";
for (std::uint64_t i = 0; i < num_games; ++i) {
auto index_sets = make_index_sets(pos);
for (ply = 0; ply < MAX_PLY; ++ply) {
MoveList<LEGAL> mg(pos); // Generate all legal moves
// No legal moves == the game has ended
if (mg.size() == 0)
break;
// Randomly choose one of the generated moves and advance the position with it.
Move m = mg.begin()[prng.rand(mg.size())];
pos.do_move(m, state[ply]);
++num_moves;
update_index_sets(pos, &index_sets);
ASSERT(index_sets == make_index_sets(pos));
}
pos.set(StartFEN, false, &si, Threads.main());
// Output '.' every 100 games (so you can see that it's progressing)
if ((i % 100) == 0)
std::cout << "." << std::flush;
}
std::cout << "passed." << std::endl;
std::cout << num_games << " games, " << num_moves << " moves, "
<< num_updates.back() << " updates, "
<< (1.0 * num_updates.back() / num_moves)
<< " updates per move" << std::endl;
std::size_t num_observed_indices = 0;
for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
const auto count = std::count(trigger_map.begin(), trigger_map.end(), i);
num_observed_indices += count;
std::cout << "TriggerEvent(" << static_cast<int>(kRefreshTriggers[i])
<< "): " << count << " features ("
<< (100.0 * count / RawFeatures::kDimensions) << "%), "
<< num_updates[i] << " updates ("
<< (1.0 * num_updates[i] / num_moves) << " per move), "
<< num_resets[i] << " resets ("
<< (100.0 * num_resets[i] / num_moves) << "%)"
<< std::endl;
}
std::cout << "observed " << num_observed_indices << " ("
<< (100.0 * num_observed_indices / RawFeatures::kDimensions)
<< "% of " << RawFeatures::kDimensions
<< ") features" << std::endl;
}
// Output a string that represents the structure of the evaluation function
void print_info(std::istream& stream) {
std::cout << "network architecture: " << get_architecture_string() << std::endl;
while (true) {
std::string file_name;
stream >> file_name;
if (file_name.empty())
break;
std::uint32_t hash_value;
std::string architecture;
const bool success = [&]() {
std::ifstream file_stream(file_name, std::ios::binary);
if (!file_stream)
return false;
if (!read_header(file_stream, &hash_value, &architecture))
return false;
return true;
}();
std::cout << file_name << ": ";
if (success) {
if (hash_value == kHashValue) {
std::cout << "matches with this binary";
if (architecture != get_architecture_string()) {
std::cout << ", but architecture string differs: " << architecture;
}
std::cout << std::endl;
} else {
std::cout << architecture << std::endl;
}
} else {
std::cout << "failed to read header" << std::endl;
}
}
}
} // namespace
// UCI extended command for NNUE evaluation function
void test_command(Position& pos, std::istream& stream) {
std::string sub_command;
stream >> sub_command;
if (sub_command == "test_features") {
test_features(pos);
} else if (sub_command == "info") {
print_info(stream);
} else {
std::cout << "usage:" << std::endl;
std::cout << " test nnue test_features" << std::endl;
std::cout << " test nnue info [path/to/" << fileName << "...]" << std::endl;
}
}
} // namespace Eval::NNUE


@@ -1,12 +0,0 @@
#ifndef _NNUE_TEST_COMMAND_H_
#define _NNUE_TEST_COMMAND_H_
// UCI extended command interface for NNUE evaluation function
namespace Eval::NNUE {
// UCI extended command for NNUE evaluation function
void test_command(Position& pos, std::istream& stream);
} // namespace Eval::NNUE
#endif


@@ -1,10 +0,0 @@
#ifndef _NNUE_TRAINER_FEATURES_ALL_FACTORIZERS_H_
#define _NNUE_TRAINER_FEATURES_ALL_FACTORIZERS_H_
#include "factorizer.h"
#include "factorizer_feature_set.h"
#include "factorizer_half_kp.h"
#include "factorizer_half_ka.h"
#endif


@@ -1,117 +0,0 @@
#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_H_
#define _NNUE_TRAINER_FEATURES_FACTORIZER_H_
#include "nnue/nnue_common.h"
#include "nnue/trainer/trainer.h"
// NNUE evaluation function feature conversion class template
namespace Eval::NNUE::Features {
// Class template that converts input features into learning features
// By default the learning features are the same as the original input features; this template is specialized as necessary
template <typename FeatureType>
class Factorizer {
public:
static constexpr std::string get_name() {
return "Factorizer<" + FeatureType::get_name() + "> -> " + std::string("No factorizer");
}
static constexpr std::string get_factorizers_string() {
return " - " + get_name();
}
// Get the dimensionality of the learning feature
static constexpr IndexType get_dimensions() {
return FeatureType::kDimensions;
}
// Get index of learning feature and scale of learning rate
static void append_training_features(
IndexType base_index, std::vector<TrainingFeature>* training_features) {
assert(base_index < FeatureType::kDimensions);
training_features->emplace_back(base_index);
}
};
// Learning feature information
struct FeatureProperties {
bool active;
IndexType dimensions;
};
// Add the original input features to the learning features
template <typename FeatureType>
IndexType append_base_feature(
FeatureProperties properties, IndexType base_index,
std::vector<TrainingFeature>* training_features) {
assert(properties.dimensions == FeatureType::kDimensions);
assert(base_index < FeatureType::kDimensions);
training_features->emplace_back(base_index);
return properties.dimensions;
}
// If the learning rate scale is not 0, inherit other types of learning features
template <typename FeatureType>
IndexType inherit_features_if_required(
IndexType index_offset, FeatureProperties properties, IndexType base_index,
std::vector<TrainingFeature>* training_features) {
if (!properties.active) {
return 0;
}
assert(properties.dimensions == Factorizer<FeatureType>::get_dimensions());
assert(base_index < FeatureType::kDimensions);
const auto start = training_features->size();
Factorizer<FeatureType>::append_training_features(
base_index, training_features);
for (auto i = start; i < training_features->size(); ++i) {
auto& feature = (*training_features)[i];
assert(feature.get_index() < Factorizer<FeatureType>::get_dimensions());
feature.shift_index(index_offset);
}
return properties.dimensions;
}
// Return the index offset as needed, without adding learning features.
// Call this instead of inherit_features_if_required() when there are no corresponding features.
IndexType skip_features(FeatureProperties properties) {
if (!properties.active)
return 0;
return properties.dimensions;
}
// Get the dimensionality of the learning feature
template <std::size_t N>
constexpr IndexType get_active_dimensions(
const FeatureProperties (&properties)[N]) {
static_assert(N > 0, "");
IndexType dimensions = properties[0].dimensions;
for (std::size_t i = 1; i < N; ++i) {
if (properties[i].active) {
dimensions += properties[i].dimensions;
}
}
return dimensions;
}
// get the number of elements in the array
template <typename T, std::size_t N>
constexpr std::size_t get_array_length(const T (&/*array*/)[N]) {
return N;
}
} // namespace Eval::NNUE::Features
#endif


@@ -1,121 +0,0 @@
#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
#define _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
#include "factorizer.h"
#include "nnue/features/feature_set.h"
// Specialization of the NNUE evaluation function feature conversion class template for FeatureSet
namespace Eval::NNUE::Features {
// Class template that converts input features into learning features
// Specialization for FeatureSet
template <typename FirstFeatureType, typename... RemainingFeatureTypes>
class Factorizer<FeatureSet<FirstFeatureType, RemainingFeatureTypes...>> {
private:
using Head = Factorizer<FeatureSet<FirstFeatureType>>;
using Tail = Factorizer<FeatureSet<RemainingFeatureTypes...>>;
public:
// number of dimensions of original input features
static constexpr IndexType kBaseDimensions =
FeatureSet<FirstFeatureType, RemainingFeatureTypes...>::kDimensions;
static constexpr std::string get_factorizers_string() {
std::string str = " - ";
str += Head::get_name();
str += '\n';
str += Tail::get_factorizers_string();
return str;
}
// Get the dimensionality of the learning feature
static constexpr IndexType get_dimensions() {
return Head::get_dimensions() + Tail::get_dimensions();
}
// Get index of learning feature and scale of learning rate
static void append_training_features(
IndexType base_index, std::vector<TrainingFeature>* training_features,
IndexType base_dimensions = kBaseDimensions) {
assert(base_index < kBaseDimensions);
constexpr auto boundary = FeatureSet<RemainingFeatureTypes...>::kDimensions;
if (base_index < boundary) {
Tail::append_training_features(
base_index, training_features, base_dimensions);
}
else {
const auto start = training_features->size();
Head::append_training_features(
base_index - boundary, training_features, base_dimensions);
for (auto i = start; i < training_features->size(); ++i) {
auto& feature = (*training_features)[i];
const auto index = feature.get_index();
assert(index < Head::get_dimensions() ||
(index >= base_dimensions &&
index < base_dimensions +
Head::get_dimensions() - Head::kBaseDimensions));
if (index < Head::kBaseDimensions) {
feature.shift_index(Tail::kBaseDimensions);
}
else {
feature.shift_index(Tail::get_dimensions() - Tail::kBaseDimensions);
}
}
}
}
};
// Class template that converts input features into learning features
// Specialization when FeatureSet has one template argument
template <typename FeatureType>
class Factorizer<FeatureSet<FeatureType>> {
public:
// number of dimensions of original input features
static constexpr IndexType kBaseDimensions = FeatureType::kDimensions;
static constexpr std::string get_name() {
return Factorizer<FeatureType>::get_name();
}
static constexpr std::string get_factorizers_string() {
return " - " + get_name();
}
// Get the dimensionality of the learning feature
static constexpr IndexType get_dimensions() {
return Factorizer<FeatureType>::get_dimensions();
}
// Get index of learning feature and scale of learning rate
static void append_training_features(
IndexType base_index, std::vector<TrainingFeature>* training_features,
IndexType base_dimensions = kBaseDimensions) {
assert(base_index < kBaseDimensions);
const auto start = training_features->size();
Factorizer<FeatureType>::append_training_features(
base_index, training_features);
for (auto i = start; i < training_features->size(); ++i) {
auto& feature = (*training_features)[i];
assert(feature.get_index() < Factorizer<FeatureType>::get_dimensions());
if (feature.get_index() >= kBaseDimensions) {
feature.shift_index(base_dimensions - kBaseDimensions);
}
}
}
};
} // namespace Eval::NNUE::Features
#endif


@@ -1,93 +0,0 @@
#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KA_H_
#define _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KA_H_
#include "factorizer.h"
#include "nnue/features/half_ka.h"
#include "nnue/features/a.h"
#include "nnue/features/half_relative_ka.h"
// Specialization of NNUE evaluation function feature conversion class template for HalfKA
namespace Eval::NNUE::Features {
// Class template that converts input features into learning features
// Specialization for HalfKA
template <Side AssociatedKing>
class Factorizer<HalfKA<AssociatedKing>> {
private:
using FeatureType = HalfKA<AssociatedKing>;
// Maximum number of feature indices that can be active (equal to 1) at the same time
static constexpr IndexType kMaxActiveDimensions =
FeatureType::kMaxActiveDimensions;
// Type of learning feature
enum TrainingFeatureType {
kFeaturesHalfKA,
kFeaturesA,
kFeaturesHalfRelativeKA,
kNumTrainingFeatureTypes,
};
// Learning feature information
static constexpr FeatureProperties kProperties[] = {
// kFeaturesHalfKA
{true, FeatureType::kDimensions},
// kFeaturesA
{true, Factorizer<A>::get_dimensions()},
// kFeaturesHalfRelativeKA
{true, Factorizer<HalfRelativeKA<AssociatedKing>>::get_dimensions()},
};
static_assert(get_array_length(kProperties) == kNumTrainingFeatureTypes, "");
public:
static constexpr std::string get_name() {
return std::string("Factorizer<") + FeatureType::kName + "> -> " + "A, HalfRelativeKA";
}
static constexpr std::string get_factorizers_string() {
return " - " + get_name();
}
// Get the dimensionality of the learning feature
static constexpr IndexType get_dimensions() {
return get_active_dimensions(kProperties);
}
// Get index of learning feature and scale of learning rate
static void append_training_features(
IndexType base_index, std::vector<TrainingFeature>* training_features) {
// kFeaturesHalfKA
IndexType index_offset = append_base_feature<FeatureType>(
kProperties[kFeaturesHalfKA], base_index, training_features);
const auto sq_k = static_cast<Square>(base_index / PS_END2);
const auto a = static_cast<IndexType>(base_index % PS_END2);
// kFeaturesA
index_offset += inherit_features_if_required<A>(
index_offset, kProperties[kFeaturesA], a, training_features);
// kFeaturesHalfRelativeKA
if (a >= PS_W_PAWN) {
index_offset += inherit_features_if_required<HalfRelativeKA<AssociatedKing>>(
index_offset, kProperties[kFeaturesHalfRelativeKA],
HalfRelativeKA<AssociatedKing>::make_index(sq_k, a),
training_features);
}
else {
index_offset += skip_features(kProperties[kFeaturesHalfRelativeKA]);
}
assert(index_offset == get_dimensions());
}
};
template <Side AssociatedKing>
constexpr FeatureProperties Factorizer<HalfKA<AssociatedKing>>::kProperties[];
} // namespace Eval::NNUE::Features
#endif // #ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KA_H_


@@ -1,104 +0,0 @@
#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
#define _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
#include "factorizer.h"
#include "nnue/features/half_kp.h"
#include "nnue/features/p.h"
#include "nnue/features/half_relative_kp.h"
// Specialization of NNUE evaluation function feature conversion class template for HalfKP
namespace Eval::NNUE::Features {
// Class template that converts input features into learning features
// Specialization for HalfKP
template <Side AssociatedKing>
class Factorizer<HalfKP<AssociatedKing>> {
private:
using FeatureType = HalfKP<AssociatedKing>;
// Maximum number of feature indices that can be active (equal to 1) at the same time
static constexpr IndexType kMaxActiveDimensions =
FeatureType::kMaxActiveDimensions;
// Type of learning feature
enum TrainingFeatureType {
kFeaturesHalfKP,
kFeaturesHalfK,
kFeaturesP,
kFeaturesHalfRelativeKP,
kNumTrainingFeatureTypes,
};
// Learning feature information
static constexpr FeatureProperties kProperties[] = {
// kFeaturesHalfKP
{true, FeatureType::kDimensions},
// kFeaturesHalfK
{true, SQUARE_NB},
// kFeaturesP
{true, Factorizer<P>::get_dimensions()},
// kFeaturesHalfRelativeKP
{true, Factorizer<HalfRelativeKP<AssociatedKing>>::get_dimensions()},
};
static_assert(get_array_length(kProperties) == kNumTrainingFeatureTypes, "");
public:
static constexpr std::string get_name() {
return std::string("Factorizer<") + FeatureType::kName + "> -> " + "HalfK, P, HalfRelativeKP";
}
static constexpr std::string get_factorizers_string() {
return " - " + get_name();
}
// Get the dimensionality of the learning feature
static constexpr IndexType get_dimensions() {
return get_active_dimensions(kProperties);
}
// Get index of learning feature and scale of learning rate
static void append_training_features(
IndexType base_index, std::vector<TrainingFeature>* training_features) {
// kFeaturesHalfKP
IndexType index_offset = append_base_feature<FeatureType>(
kProperties[kFeaturesHalfKP], base_index, training_features);
const auto sq_k = static_cast<Square>(base_index / PS_END);
const auto p = static_cast<IndexType>(base_index % PS_END);
// kFeaturesHalfK
{
const auto& properties = kProperties[kFeaturesHalfK];
if (properties.active) {
training_features->emplace_back(index_offset + sq_k);
index_offset += properties.dimensions;
}
}
// kFeaturesP
index_offset += inherit_features_if_required<P>(
index_offset, kProperties[kFeaturesP], p, training_features);
// kFeaturesHalfRelativeKP
if (p >= PS_W_PAWN) {
index_offset += inherit_features_if_required<HalfRelativeKP<AssociatedKing>>(
index_offset, kProperties[kFeaturesHalfRelativeKP],
HalfRelativeKP<AssociatedKing>::make_index(sq_k, p),
training_features);
}
else {
index_offset += skip_features(kProperties[kFeaturesHalfRelativeKP]);
}
assert(index_offset == get_dimensions());
}
};
template <Side AssociatedKing>
constexpr FeatureProperties Factorizer<HalfKP<AssociatedKing>>::kProperties[];
} // namespace Eval::NNUE::Features
#endif

View File

@@ -1,122 +0,0 @@
#ifndef _NNUE_TRAINER_H_
#define _NNUE_TRAINER_H_
#include "nnue/nnue_common.h"
#include "nnue/features/index_list.h"
#include <sstream>
#if defined(USE_BLAS)
static_assert(std::is_same<LearnFloatType, float>::value, "");
#include <cblas.h>
#endif
// Common header for the NNUE evaluation function learning class templates
namespace Eval::NNUE {
// Ponanza constant used in the relation between evaluation value and winning percentage
constexpr double kPonanzaConstant = 600.0;
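// It maps an evaluation to an expected score roughly as
// win_rate = 1 / (1 + 10^(-eval / kPonanzaConstant)).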
// Class that represents one learning feature index together with its occurrence count
class TrainingFeature {
using StorageType = std::uint32_t;
static_assert(std::is_unsigned<StorageType>::value, "");
public:
static constexpr std::uint32_t kIndexBits = 24;
static_assert(kIndexBits < std::numeric_limits<StorageType>::digits, "");
static constexpr std::uint32_t kCountBits =
std::numeric_limits<StorageType>::digits - kIndexBits;
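// The feature index lives in the upper kIndexBits of the storage word and the
// occurrence count in the lower kCountBits, so a single 32-bit value encodes
// (index, count) and operator< orders primarily by index.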
explicit TrainingFeature(IndexType index) :
index_and_count_((index << kCountBits) | 1) {
assert(index < (1 << kIndexBits));
}
TrainingFeature& operator+=(const TrainingFeature& other) {
assert(other.get_index() == get_index());
assert(other.get_count() + get_count() < (1 << kCountBits));
index_and_count_ += other.get_count();
return *this;
}
IndexType get_index() const {
return static_cast<IndexType>(index_and_count_ >> kCountBits);
}
void shift_index(IndexType offset) {
assert(get_index() + offset < (1 << kIndexBits));
index_and_count_ += offset << kCountBits;
}
IndexType get_count() const {
return static_cast<IndexType>(index_and_count_ & ((1 << kCountBits) - 1));
}
bool operator<(const TrainingFeature& other) const {
return index_and_count_ < other.index_and_count_;
}
private:
StorageType index_and_count_;
};
// Structure that represents one sample of training data
struct Example {
std::vector<TrainingFeature> training_features[2];
Learner::PackedSfenValue psv;
Value discrete_nn_eval;
int sign;
double weight;
};
// Message used for setting hyperparameters
struct Message {
Message(const std::string& message_name, const std::string& message_value = "") :
name(message_name), value(message_value), num_peekers(0), num_receivers(0)
{
}
const std::string name;
const std::string value;
std::uint32_t num_peekers;
std::uint32_t num_receivers;
};
// determine whether to accept the message
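// A message can be addressed by plain name ("momentum") or with a subscript
// ("momentum[2]"); the subscripted form is accepted only by the layer whose
// peek order matches the subscript, which allows targeting a single layer.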
bool receive_message(const std::string& name, Message* message) {
const auto subscript = "[" + std::to_string(message->num_peekers) + "]";
if (message->name.substr(0, name.size() + 1) == name + "[") {
++message->num_peekers;
}
if (message->name == name || message->name == name + subscript) {
++message->num_receivers;
return true;
}
return false;
}
// round a floating point number to an integer
template <typename IntType>
IntType round(double value) {
return static_cast<IntType>(std::floor(value + 0.5));
}
// make_shared with alignment
template <typename T, typename... ArgumentTypes>
std::shared_ptr<T> make_aligned_shared_ptr(ArgumentTypes&&... arguments) {
const auto ptr = new(std_aligned_alloc(alignof(T), sizeof(T)))
T(std::forward<ArgumentTypes>(arguments)...);
return std::shared_ptr<T>(ptr, AlignedDeleter<T>());
}
} // namespace Eval::NNUE
#endif

View File

@@ -1,476 +0,0 @@
#ifndef _NNUE_TRAINER_AFFINE_TRANSFORM_H_
#define _NNUE_TRAINER_AFFINE_TRANSFORM_H_
#include "trainer.h"
#include "extra/stockfish_blas.h"
#include "learn/learn.h"
#include "nnue/layers/affine_transform.h"
#include "thread.h"
#include <random>
// Specialization of NNUE evaluation function learning class template for AffineTransform
namespace Eval::NNUE {
// Learning: Affine transformation layer
template <typename PreviousLayer, IndexType OutputDimensions>
class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
private:
// Type of layer to learn
using LayerType = Layers::AffineTransform<PreviousLayer, OutputDimensions>;
public:
// factory function
static std::shared_ptr<Trainer> create(
LayerType* target_layer, FeatureTransformer* ft) {
return std::shared_ptr<Trainer>(
new Trainer(target_layer, ft));
}
// Set options such as hyperparameters
void send_message(Message* message) {
previous_layer_trainer_->send_message(message);
if (receive_message("momentum", message)) {
momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
}
if (receive_message("learning_rate_scale", message)) {
learning_rate_scale_ =
static_cast<LearnFloatType>(std::stod(message->value));
}
if (receive_message("reset", message)) {
dequantize_parameters();
}
if (receive_message("quantize_parameters", message)) {
quantize_parameters();
}
if (receive_message("check_health", message)) {
check_health();
}
}
// Initialize the parameters with random numbers
template <typename RNG>
void initialize(RNG& rng) {
previous_layer_trainer_->initialize(rng);
if (kIsOutputLayer) {
// Initialize output layer with 0
std::fill(std::begin(biases_), std::end(biases_),
static_cast<LearnFloatType>(0.0));
std::fill(std::begin(weights_), std::end(weights_),
static_cast<LearnFloatType>(0.0));
}
else {
// Assuming the inputs have mean 0.5 and equal variance, initialize the
// weights and biases so that each output unit also has mean 0.5 and the same
// variance as the inputs (the bias below, 0.5 - 0.5 * sum(weights), cancels
// the expected contribution of the weighted inputs).
const double kSigma = 1.0 / std::sqrt(kInputDimensions);
auto distribution = std::normal_distribution<double>(0.0, kSigma);
for (IndexType i = 0; i < kOutputDimensions; ++i) {
double sum = 0.0;
for (IndexType j = 0; j < kInputDimensions; ++j) {
const auto weight = static_cast<LearnFloatType>(distribution(rng));
weights_[kInputDimensions * i + j] = weight;
sum += weight;
}
biases_[i] = static_cast<LearnFloatType>(0.5 - 0.5 * sum);
}
}
quantize_parameters();
}
const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
{
const auto size = batch_end - batch_begin;
if ((long)output_.size() < (long)kOutputDimensions * size) {
output_.resize(kOutputDimensions * size);
gradients_.resize(kInputDimensions * size);
}
if (thread_states_.size() < thread_pool.size())
{
thread_states_.resize(thread_pool.size());
}
combined_batch_size_ = size;
combined_batch_input_ = previous_layer_trainer_->step_start(thread_pool, batch_begin, batch_end);
auto& main_thread_state = thread_states_[0];
#if defined(USE_BLAS)
// Scale the bias gradient accumulated in the previous step by the momentum
// factor; backpropagation will add the new gradients on top.
cblas_sscal(
kOutputDimensions, momentum_, main_thread_state.biases_diff_, 1
);
#else
Blas::sscal(
kOutputDimensions, momentum_, main_thread_state.biases_diff_, 1
);
#endif
for (IndexType i = 1; i < thread_states_.size(); ++i)
thread_states_[i].reset_biases();
return output_.data();
}
// forward propagation
void propagate(Thread& th, const uint64_t offset, const uint64_t count) {
previous_layer_trainer_->propagate(th, offset, count);
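// For each sample b in [offset, offset + count): copy the biases into the
// output column, then accumulate output[:, b] += weights_ * input[:, b]
// via a single GEMM with beta = 1.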
#if defined(USE_BLAS)
for (IndexType b = offset; b < offset + count; ++b) {
const IndexType batch_offset = kOutputDimensions * b;
cblas_scopy(
kOutputDimensions, biases_, 1, &output_[batch_offset], 1
);
}
cblas_sgemm(
CblasColMajor, CblasTrans, CblasNoTrans,
kOutputDimensions, count, kInputDimensions,
1.0,
weights_, kInputDimensions,
combined_batch_input_ + offset * kInputDimensions, kInputDimensions,
1.0,
&output_[offset * kOutputDimensions], kOutputDimensions
);
#else
for (IndexType b = offset; b < offset + count; ++b) {
const IndexType batch_offset = kOutputDimensions * b;
Blas::scopy(
kOutputDimensions, biases_, 1, &output_[batch_offset], 1
);
}
Blas::sgemm(
Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans,
kOutputDimensions, count, kInputDimensions,
1.0,
weights_, kInputDimensions,
combined_batch_input_ + offset * kInputDimensions, kInputDimensions,
1.0,
&output_[offset * kOutputDimensions], kOutputDimensions
);
#endif
}
// backpropagation
void backpropagate(Thread& th,
const LearnFloatType* gradients,
uint64_t offset,
uint64_t count) {
auto& thread_state = thread_states_[th.thread_idx()];
const auto momentum = th.thread_idx() == 0 ? momentum_ : 0.0f;
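// Three steps: (1) dL/dinput = weights_^T * dL/doutput is handed down to the
// previous layer, (2) bias gradients are summed over the batch slice, and
// (3) dL/dweights += dL/doutput * input^T, with beta = momentum on the main
// thread so the previous step's accumulated gradient decays instead of being
// overwritten.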
#if defined(USE_BLAS)
cblas_sgemm(
CblasColMajor, CblasNoTrans, CblasNoTrans,
kInputDimensions, count, kOutputDimensions,
1.0,
weights_, kInputDimensions,
gradients + offset * kOutputDimensions, kOutputDimensions,
0.0,
&gradients_[offset * kInputDimensions], kInputDimensions
);
for (IndexType b = offset; b < offset + count; ++b) {
const IndexType batch_offset = kOutputDimensions * b;
cblas_saxpy(
kOutputDimensions, 1.0,
&gradients[batch_offset], 1, thread_state.biases_diff_, 1
);
}
cblas_sgemm(
CblasRowMajor, CblasTrans, CblasNoTrans,
kOutputDimensions, kInputDimensions, count,
1.0,
gradients + offset * kOutputDimensions, kOutputDimensions,
combined_batch_input_ + offset * kInputDimensions, kInputDimensions,
momentum,
thread_state.weights_diff_, kInputDimensions
);
#else
// backpropagate
Blas::sgemm(
Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::NoTrans, Blas::MatrixTranspose::NoTrans,
kInputDimensions, count, kOutputDimensions,
1.0,
weights_, kInputDimensions,
gradients + offset * kOutputDimensions, kOutputDimensions,
0.0,
&gradients_[offset * kInputDimensions], kInputDimensions
);
for (IndexType b = offset; b < offset + count; ++b) {
const IndexType batch_offset = kOutputDimensions * b;
Blas::saxpy(kOutputDimensions, 1.0,
&gradients[batch_offset], 1, thread_state.biases_diff_, 1);
}
Blas::sgemm(
Blas::MatrixLayout::RowMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans,
kOutputDimensions, kInputDimensions, count,
1.0,
gradients + offset * kOutputDimensions, kOutputDimensions,
combined_batch_input_ + offset * kInputDimensions, kInputDimensions,
momentum,
thread_state.weights_diff_, kInputDimensions
);
#endif
previous_layer_trainer_->backpropagate(th, gradients_.data(), offset, count);
}
void reduce_thread_state()
{
for (IndexType i = 1; i < thread_states_.size(); ++i)
{
thread_states_[0] += thread_states_[i];
}
}
void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate)
{
const LearnFloatType local_learning_rate =
learning_rate * learning_rate_scale_;
reduce_thread_state();
auto& main_thread_state = thread_states_[0];
for (IndexType i = 0; i < kOutputDimensions; ++i) {
const double d = local_learning_rate * main_thread_state.biases_diff_[i];
biases_[i] -= d;
abs_biases_diff_sum_ += std::abs(d);
}
num_biases_diffs_ += kOutputDimensions;
for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
const double d = local_learning_rate * main_thread_state.weights_diff_[i];
weights_[i] -= d;
abs_weights_diff_sum_ += std::abs(d);
}
num_weights_diffs_ += kOutputDimensions * kInputDimensions;
previous_layer_trainer_->step_end(thread_pool, learning_rate);
}
private:
// constructor
Trainer(LayerType* target_layer, FeatureTransformer* ft) :
combined_batch_size_(0),
combined_batch_input_(nullptr),
previous_layer_trainer_(Trainer<PreviousLayer>::create(
&target_layer->previous_layer_, ft)),
target_layer_(target_layer),
biases_(),
weights_(),
momentum_(0.2),
learning_rate_scale_(1.0) {
dequantize_parameters();
}
void reset_stats() {
abs_biases_diff_sum_ = 0.0;
abs_weights_diff_sum_ = 0.0;
num_biases_diffs_ = 0;
num_weights_diffs_ = 0;
}
void check_health() {
double abs_bias_sum = 0.0;
double abs_weight_sum = 0.0;
for(auto b : biases_)
abs_bias_sum += std::abs(b);
for(auto w : weights_)
abs_weight_sum += std::abs(w);
auto out = sync_region_cout.new_region();
out << "INFO (check_health):"
<< " layer " << LayerType::kLayerIndex
<< " - " << LayerType::get_name()
<< std::endl;
out << " - avg_abs_bias = " << abs_bias_sum / std::size(biases_) << std::endl;
out << " - avg_abs_bias_diff = " << abs_biases_diff_sum_ / num_biases_diffs_ << std::endl;
out << " - avg_abs_weight = " << abs_weight_sum / std::size(weights_) << std::endl;
out << " - avg_abs_weight_diff = " << abs_weights_diff_sum_ / num_weights_diffs_ << std::endl;
out.unlock();
reset_stats();
}
// Clamp the weights and write the quantized integer parameters to the target layer
void quantize_parameters() {
for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
weights_[i] = std::max(-kMaxWeightMagnitude,
std::min(+kMaxWeightMagnitude, weights_[i]));
}
for (IndexType i = 0; i < kOutputDimensions; ++i) {
target_layer_->biases_[i] =
round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
}
for (IndexType i = 0; i < kOutputDimensions; ++i) {
const auto offset = kInputDimensions * i;
const auto padded_offset = LayerType::kPaddedInputDimensions * i;
for (IndexType j = 0; j < kInputDimensions; ++j) {
target_layer_->weights_[padded_offset + j] =
round<typename LayerType::WeightType>(
weights_[offset + j] * kWeightScale);
}
}
}
// Read the quantized integer parameters back as floats
void dequantize_parameters() {
for (IndexType i = 0; i < kOutputDimensions; ++i) {
biases_[i] = static_cast<LearnFloatType>(
target_layer_->biases_[i] / kBiasScale);
}
for (IndexType i = 0; i < kOutputDimensions; ++i) {
const auto offset = kInputDimensions * i;
const auto padded_offset = LayerType::kPaddedInputDimensions * i;
for (IndexType j = 0; j < kInputDimensions; ++j) {
weights_[offset + j] = static_cast<LearnFloatType>(
target_layer_->weights_[padded_offset + j] / kWeightScale);
}
}
for (auto& state : thread_states_)
{
state.reset_weights();
state.reset_biases();
}
reset_stats();
}
// number of input/output dimensions
static constexpr IndexType kInputDimensions = LayerType::kInputDimensions;
static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
// If the output dimensionality is 1, this layer is treated as the output layer
static constexpr bool kIsOutputLayer = kOutputDimensions == 1;
// Coefficients used for quantization
static constexpr LearnFloatType kActivationScale =
std::numeric_limits<std::int8_t>::max();
static constexpr LearnFloatType kBiasScale = kIsOutputLayer ?
(kPonanzaConstant * FV_SCALE) :
((1 << kWeightScaleBits) * kActivationScale);
static constexpr LearnFloatType kWeightScale = kBiasScale / kActivationScale;
// Upper limit on the absolute value of a weight, used to prevent overflow when quantizing to integers
static constexpr LearnFloatType kMaxWeightMagnitude =
std::numeric_limits<typename LayerType::WeightType>::max() / kWeightScale;
// number of samples in mini-batch
IndexType combined_batch_size_;
double abs_biases_diff_sum_;
double abs_weights_diff_sum_;
uint64_t num_biases_diffs_;
uint64_t num_weights_diffs_;
// Input mini batch
const LearnFloatType* combined_batch_input_;
// Trainer of the previous layer
const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
// layer to learn
LayerType* const target_layer_;
// parameter
struct alignas(kCacheLineSize) ThreadState
{
// Buffer used for updating parameters
alignas(kCacheLineSize) LearnFloatType biases_diff_[kOutputDimensions];
alignas(kCacheLineSize) LearnFloatType weights_diff_[kOutputDimensions * kInputDimensions];
ThreadState() { reset_weights(); reset_biases(); }
ThreadState& operator+=(const ThreadState& other)
{
for (IndexType i = 0; i < kOutputDimensions; ++i)
{
biases_diff_[i] += other.biases_diff_[i];
}
for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i)
{
weights_diff_[i] += other.weights_diff_[i];
}
return *this;
}
void reset_weights()
{
std::fill(std::begin(weights_diff_), std::end(weights_diff_), 0.0f);
}
void reset_biases()
{
std::fill(std::begin(biases_diff_), std::end(biases_diff_), 0.0f);
}
};
alignas(kCacheLineSize) LearnFloatType biases_[kOutputDimensions];
alignas(kCacheLineSize) LearnFloatType weights_[kOutputDimensions * kInputDimensions];
std::vector<ThreadState, CacheLineAlignedAllocator<ThreadState>> thread_states_;
// Forward propagation buffer
std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
// buffer for back propagation
std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
// hyper parameter
LearnFloatType momentum_;
LearnFloatType learning_rate_scale_;
};
} // namespace Eval::NNUE
#endif

View File

@@ -1,354 +0,0 @@
#ifndef _NNUE_TRAINER_CLIPPED_RELU_H_
#define _NNUE_TRAINER_CLIPPED_RELU_H_
#include "trainer.h"
#include "learn/learn.h"
#include "nnue/layers/clipped_relu.h"
#include "thread.h"
// Specialization of NNUE evaluation function learning class template for ClippedReLU
namespace Eval::NNUE {
// Learning: Clipped ReLU layer
template <typename PreviousLayer>
class Trainer<Layers::ClippedReLU<PreviousLayer>> {
private:
// Type of layer to learn
using LayerType = Layers::ClippedReLU<PreviousLayer>;
public:
// factory function
static std::shared_ptr<Trainer> create(
LayerType* target_layer, FeatureTransformer* ft) {
return std::shared_ptr<Trainer>(
new Trainer(target_layer, ft));
}
// Set options such as hyperparameters
void send_message(Message* message) {
previous_layer_trainer_->send_message(message);
if (receive_message("check_health", message)) {
check_health();
}
}
// Initialize the parameters with random numbers
template <typename RNG>
void initialize(RNG& rng) {
previous_layer_trainer_->initialize(rng);
}
const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
{
const auto size = batch_end - batch_begin;
if ((long)output_.size() < (long)kOutputDimensions * size) {
output_.resize(kOutputDimensions * size);
gradients_.resize(kInputDimensions * size);
}
if (thread_states_.size() < thread_pool.size())
{
thread_states_.resize(thread_pool.size());
}
input_ = previous_layer_trainer_->step_start(thread_pool, batch_begin, batch_end);
batch_size_ = size;
return output_.data();
}
// forward propagation
void propagate(Thread& th, const uint64_t offset, const uint64_t count) {
auto& thread_state = thread_states_[th.thread_idx()];
previous_layer_trainer_->propagate(th, offset, count);
#if defined (USE_SSE2)
{
static_assert(kOutputDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time");
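// Clamp each forward output to [0, 1] and simultaneously update the per-unit
// min/max activation statistics reported by check_health().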
const __m128 kZero4 = _mm_set1_ps(+kZero);
const __m128 kOne4 = _mm_set1_ps(+kOne);
for (IndexType b = offset; b < offset + count; ++b)
{
const IndexType batch_offset = kOutputDimensions * b;
for (IndexType i = 0; i < kOutputDimensions; i += 16)
{
__m128 out0 = _mm_loadu_ps(&input_[i + 0 + batch_offset]);
__m128 out1 = _mm_loadu_ps(&input_[i + 4 + batch_offset]);
__m128 out2 = _mm_loadu_ps(&input_[i + 8 + batch_offset]);
__m128 out3 = _mm_loadu_ps(&input_[i + 12 + batch_offset]);
out0 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out0));
out1 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out1));
out2 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out2));
out3 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out3));
_mm_storeu_ps(&output_[i + 0 + batch_offset], out0);
_mm_storeu_ps(&output_[i + 4 + batch_offset], out1);
_mm_storeu_ps(&output_[i + 8 + batch_offset], out2);
_mm_storeu_ps(&output_[i + 12 + batch_offset], out3);
__m128 minact0 = _mm_loadu_ps(&thread_state.min_activations_[i + 0]);
__m128 minact1 = _mm_loadu_ps(&thread_state.min_activations_[i + 4]);
__m128 minact2 = _mm_loadu_ps(&thread_state.min_activations_[i + 8]);
__m128 minact3 = _mm_loadu_ps(&thread_state.min_activations_[i + 12]);
__m128 maxact0 = _mm_loadu_ps(&thread_state.max_activations_[i + 0]);
__m128 maxact1 = _mm_loadu_ps(&thread_state.max_activations_[i + 4]);
__m128 maxact2 = _mm_loadu_ps(&thread_state.max_activations_[i + 8]);
__m128 maxact3 = _mm_loadu_ps(&thread_state.max_activations_[i + 12]);
minact0 = _mm_min_ps(out0, minact0);
minact1 = _mm_min_ps(out1, minact1);
minact2 = _mm_min_ps(out2, minact2);
minact3 = _mm_min_ps(out3, minact3);
maxact0 = _mm_max_ps(out0, maxact0);
maxact1 = _mm_max_ps(out1, maxact1);
maxact2 = _mm_max_ps(out2, maxact2);
maxact3 = _mm_max_ps(out3, maxact3);
_mm_storeu_ps(&thread_state.min_activations_[i + 0], minact0);
_mm_storeu_ps(&thread_state.min_activations_[i + 4], minact1);
_mm_storeu_ps(&thread_state.min_activations_[i + 8], minact2);
_mm_storeu_ps(&thread_state.min_activations_[i + 12], minact3);
_mm_storeu_ps(&thread_state.max_activations_[i + 0], maxact0);
_mm_storeu_ps(&thread_state.max_activations_[i + 4], maxact1);
_mm_storeu_ps(&thread_state.max_activations_[i + 8], maxact2);
_mm_storeu_ps(&thread_state.max_activations_[i + 12], maxact3);
}
}
}
#else
for (IndexType b = offset; b < offset + count; ++b) {
const IndexType batch_offset = kOutputDimensions * b;
for (IndexType i = 0; i < kOutputDimensions; ++i) {
const IndexType index = batch_offset + i;
output_[index] = std::max(+kZero, std::min(+kOne, input_[index]));
thread_state.min_activations_[i] = std::min(thread_state.min_activations_[i], output_[index]);
thread_state.max_activations_[i] = std::max(thread_state.max_activations_[i], output_[index]);
}
}
#endif
}
// backpropagation
void backpropagate(Thread& th,
const LearnFloatType* gradients,
const uint64_t offset,
const uint64_t count) {
auto& thread_state = thread_states_[th.thread_idx()];
#if defined (USE_SSE2)
{
static_assert(kOutputDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time");
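// A unit counts as clipped when its forward output saturated at 0 or 1; its
// gradient is zeroed by the andnot mask below and the clip is recorded for
// check_health().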
const __m128 kZero4 = _mm_set1_ps(+kZero);
const __m128 kOne4 = _mm_set1_ps(+kOne);
for (IndexType b = offset; b < offset + count; ++b)
{
const IndexType batch_offset = kOutputDimensions * b;
for (IndexType i = 0; i < kOutputDimensions; i += 16)
{
__m128 out0 = _mm_loadu_ps(&output_[batch_offset + i + 0]);
__m128 out1 = _mm_loadu_ps(&output_[batch_offset + i + 4]);
__m128 out2 = _mm_loadu_ps(&output_[batch_offset + i + 8]);
__m128 out3 = _mm_loadu_ps(&output_[batch_offset + i + 12]);
__m128 clipped0 = _mm_or_ps(_mm_cmple_ps(out0, kZero4), _mm_cmpge_ps(out0, kOne4));
__m128 clipped1 = _mm_or_ps(_mm_cmple_ps(out1, kZero4), _mm_cmpge_ps(out1, kOne4));
__m128 clipped2 = _mm_or_ps(_mm_cmple_ps(out2, kZero4), _mm_cmpge_ps(out2, kOne4));
__m128 clipped3 = _mm_or_ps(_mm_cmple_ps(out3, kZero4), _mm_cmpge_ps(out3, kOne4));
__m128 grad0 = _mm_loadu_ps(&gradients[batch_offset + i + 0]);
__m128 grad1 = _mm_loadu_ps(&gradients[batch_offset + i + 4]);
__m128 grad2 = _mm_loadu_ps(&gradients[batch_offset + i + 8]);
__m128 grad3 = _mm_loadu_ps(&gradients[batch_offset + i + 12]);
grad0 = _mm_andnot_ps(clipped0, grad0);
grad1 = _mm_andnot_ps(clipped1, grad1);
grad2 = _mm_andnot_ps(clipped2, grad2);
grad3 = _mm_andnot_ps(clipped3, grad3);
_mm_storeu_ps(&gradients_[batch_offset + i + 0], grad0);
_mm_storeu_ps(&gradients_[batch_offset + i + 4], grad1);
_mm_storeu_ps(&gradients_[batch_offset + i + 8], grad2);
_mm_storeu_ps(&gradients_[batch_offset + i + 12], grad3);
const int clipped_mask =
(_mm_movemask_ps(clipped0) << 0)
| (_mm_movemask_ps(clipped1) << 4)
| (_mm_movemask_ps(clipped2) << 8)
| (_mm_movemask_ps(clipped3) << 12);
thread_state.num_clipped_ += popcount(clipped_mask);
}
}
}
#else
for (IndexType b = offset; b < offset + count; ++b) {
const IndexType batch_offset = kOutputDimensions * b;
for (IndexType i = 0; i < kOutputDimensions; ++i) {
const IndexType index = batch_offset + i;
const bool clipped = (output_[index] <= kZero) | (output_[index] >= kOne);
gradients_[index] = gradients[index] * !clipped;
thread_state.num_clipped_ += clipped;
}
}
#endif
thread_state.num_total_ += count * kOutputDimensions;
previous_layer_trainer_->backpropagate(th, gradients_.data(), offset, count);
}
void reduce_thread_state()
{
for (IndexType i = 1; i < thread_states_.size(); ++i)
{
thread_states_[0] += thread_states_[i];
}
}
void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate)
{
previous_layer_trainer_->step_end(thread_pool, learning_rate);
}
private:
// constructor
Trainer(LayerType* target_layer, FeatureTransformer* ft) :
batch_size_(0),
previous_layer_trainer_(Trainer<PreviousLayer>::create(
&target_layer->previous_layer_, ft)),
target_layer_(target_layer) {
reset_stats();
}
void reset_stats() {
for(auto& state : thread_states_)
state.reset();
}
// Check if there are any problems with learning
void check_health() {
reduce_thread_state();
auto& main_thread_state = thread_states_[0];
const auto largest_min_activation = *std::max_element(
std::begin(main_thread_state.min_activations_), std::end(main_thread_state.min_activations_));
const auto smallest_max_activation = *std::min_element(
std::begin(main_thread_state.max_activations_), std::end(main_thread_state.max_activations_));
auto out = sync_region_cout.new_region();
out << "INFO (check_health):"
<< " layer " << LayerType::kLayerIndex
<< " - " << LayerType::get_name()
<< std::endl;
out << " - largest min activation = " << largest_min_activation
<< " , smallest max activation = " << smallest_max_activation
<< std::endl;
out << " - clipped " << static_cast<double>(main_thread_state.num_clipped_) / main_thread_state.num_total_ * 100.0 << "% of outputs"
<< std::endl;
out.unlock();
reset_stats();
}
// number of input/output dimensions
static constexpr IndexType kInputDimensions = LayerType::kOutputDimensions;
static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
// LearnFloatType constant
static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
// number of samples in mini-batch
IndexType batch_size_;
const LearnFloatType* input_;
// Trainer of the previous layer
const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
// layer to learn
LayerType* const target_layer_;
// Forward propagation buffer
std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
// buffer for back propagation
std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
struct alignas(kCacheLineSize) ThreadState
{
// Health check statistics
LearnFloatType min_activations_[kOutputDimensions];
LearnFloatType max_activations_[kOutputDimensions];
uint64_t num_clipped_;
uint64_t num_total_;
ThreadState() { reset(); }
ThreadState& operator+=(const ThreadState& other)
{
for (IndexType i = 0; i < kOutputDimensions; ++i)
{
min_activations_[i] = std::min(min_activations_[i], other.min_activations_[i]);
}
for (IndexType i = 0; i < kOutputDimensions; ++i)
{
max_activations_[i] = std::max(max_activations_[i], other.max_activations_[i]);
}
num_clipped_ += other.num_clipped_;
num_total_ += other.num_total_;
return *this;
}
void reset()
{
std::fill(std::begin(min_activations_), std::end(min_activations_), std::numeric_limits<float>::max());
std::fill(std::begin(max_activations_), std::end(max_activations_), std::numeric_limits<float>::lowest());
num_clipped_ = 0;
num_total_ = 0;
}
};
std::vector<ThreadState, CacheLineAlignedAllocator<ThreadState>> thread_states_;
};
} // namespace Eval::NNUE
#endif

View File

@@ -1,783 +0,0 @@
#ifndef _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
#define _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
#include "trainer.h"
#include "extra/stockfish_blas.h"
#include "features/all_factorizers.h"
#include "learn/learn.h"
#include "nnue/nnue_feature_transformer.h"
#include "thread.h"
#include <array>
#include <bitset>
#include <numeric>
#include <random>
#include <set>
// Specialization of NNUE evaluation function learning class template for FeatureTransformer
namespace Eval::NNUE {
// Learning: Input feature converter
template <>
class Trainer<FeatureTransformer> {
private:
// Type of layer to learn
using LayerType = FeatureTransformer;
public:
template <typename T>
friend struct AlignedDeleter;
template <typename T, typename... ArgumentTypes>
friend std::shared_ptr<T> make_aligned_shared_ptr(ArgumentTypes&&... arguments);
// factory function
static std::shared_ptr<Trainer> create(LayerType* target_layer) {
return make_aligned_shared_ptr<Trainer>(target_layer);
}
// Set options such as hyperparameters
void send_message(Message* message) {
if (receive_message("momentum", message)) {
momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
}
if (receive_message("learning_rate_scale", message)) {
learning_rate_scale_ =
static_cast<LearnFloatType>(std::stod(message->value));
}
if (receive_message("reset", message)) {
dequantize_parameters();
}
if (receive_message("quantize_parameters", message)) {
quantize_parameters();
}
if (receive_message("clear_unobserved_feature_weights", message)) {
clear_unobserved_feature_weights();
}
if (receive_message("check_health", message)) {
check_health();
}
}
// Initialize the parameters with random numbers
template <typename RNG>
void initialize(RNG& rng) {
std::fill(std::begin(weights_), std::end(weights_), +kZero);
const double kSigma = 0.1 / std::sqrt(RawFeatures::kMaxActiveDimensions);
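// The accumulator sums up to kMaxActiveDimensions weight columns, so the
// per-weight standard deviation is divided by sqrt(kMaxActiveDimensions) to
// keep the variance of the summed pre-activation modest; the 0.1 factor
// looks like an empirical choice.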
auto distribution = std::normal_distribution<double>(0.0, kSigma);
for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) {
const auto weight = static_cast<LearnFloatType>(distribution(rng));
weights_[i] = weight;
}
for (IndexType i = 0; i < kHalfDimensions; ++i) {
biases_[i] = static_cast<LearnFloatType>(0.5);
}
quantize_parameters();
}
const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
{
const auto size = batch_end - batch_begin;
if ((long)output_.size() < (long)kOutputDimensions * size) {
output_.resize(kOutputDimensions * size);
gradients_.resize(kOutputDimensions * size);
}
if (thread_stat_states_.size() < thread_pool.size())
{
thread_stat_states_.resize(thread_pool.size());
}
if (thread_bias_states_.size() < thread_pool.size())
{
thread_bias_states_.resize(thread_pool.size());
}
batch_ = &*batch_begin;
batch_size_ = size;
auto& main_thread_bias_state = thread_bias_states_[0];
#if defined(USE_BLAS)
cblas_sscal(
kHalfDimensions, momentum_, main_thread_bias_state.biases_diff_, 1
);
#else
Blas::sscal(
kHalfDimensions, momentum_, main_thread_bias_state.biases_diff_, 1
);
#endif
for (IndexType i = 1; i < thread_bias_states_.size(); ++i)
thread_bias_states_[i].reset();
return output_.data();
}
// forward propagation
void propagate(Thread& th, uint64_t offset, uint64_t count) {
auto& thread_stat_state = thread_stat_states_[th.thread_idx()];
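// Accumulator pass: for each sample and each of the two perspectives, start
// from the biases and add the weight column of every active feature, scaled
// by how many times that feature occurs in the training example.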
for (IndexType b = offset; b < offset + count; ++b)
{
const IndexType batch_offset = kOutputDimensions * b;
for (IndexType c = 0; c < 2; ++c) {
const IndexType output_offset = batch_offset + kHalfDimensions * c;
#if defined(USE_BLAS)
cblas_scopy(
kHalfDimensions, biases_, 1, &output_[output_offset], 1
);
for (const auto& feature : batch_[b].training_features[c]) {
const IndexType weights_offset = kHalfDimensions * feature.get_index();
cblas_saxpy(
kHalfDimensions, (float)feature.get_count(),
&weights_[weights_offset], 1, &output_[output_offset], 1
);
}
#else
Blas::scopy(
kHalfDimensions, biases_, 1, &output_[output_offset], 1
);
for (const auto& feature : batch_[b].training_features[c]) {
const IndexType weights_offset = kHalfDimensions * feature.get_index();
Blas::saxpy(
kHalfDimensions, (float)feature.get_count(),
&weights_[weights_offset], &output_[output_offset]
);
}
#endif
}
}
#if defined (USE_SSE2)
{
static_assert(kHalfDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time");
auto m128_hmin_ps = [](__m128 x3210) {
__m128 x0032 = _mm_shuffle_ps(x3210, x3210, _MM_SHUFFLE(0, 0, 3, 2));
__m128 min_x_x_13_20 = _mm_min_ps(x3210, x0032);
// a = [ # , # , min(x[1], x[3]) , min(x[2], x[0]) ]
__m128 min_x_x_20_13 = _mm_shuffle_ps(min_x_x_13_20, min_x_x_13_20, _MM_SHUFFLE(0, 0, 0, 1));
return _mm_cvtss_f32(_mm_min_ps(min_x_x_13_20, min_x_x_20_13));
};
auto m128_hmax_ps = [](__m128 x3210) {
__m128 x0032 = _mm_shuffle_ps(x3210, x3210, _MM_SHUFFLE(0, 0, 3, 2));
__m128 max_x_x_13_20 = _mm_max_ps(x3210, x0032);
// a = [ # , # , max(x[1], x[3]) , max(x[2], x[0]) ]
__m128 max_x_x_20_13 = _mm_shuffle_ps(max_x_x_13_20, max_x_x_13_20, _MM_SHUFFLE(0, 0, 0, 1));
return _mm_cvtss_f32(_mm_max_ps(max_x_x_13_20, max_x_x_20_13));
};
const __m128 kZero4 = _mm_set1_ps(+kZero);
const __m128 kOne4 = _mm_set1_ps(+kOne);
__m128 min_pre_activation0 = _mm_set1_ps(thread_stat_state.min_pre_activation_);
__m128 min_pre_activation1 = _mm_set1_ps(thread_stat_state.min_pre_activation_);
__m128 max_pre_activation0 = _mm_set1_ps(thread_stat_state.max_pre_activation_);
__m128 max_pre_activation1 = _mm_set1_ps(thread_stat_state.max_pre_activation_);
for (IndexType b = offset; b < offset + count; ++b)
{
const IndexType batch_offset = kOutputDimensions * b;
for (IndexType i = 0; i < kOutputDimensions; i += 16)
{
__m128 out0 = _mm_loadu_ps(&output_[batch_offset + i + 0]);
__m128 out1 = _mm_loadu_ps(&output_[batch_offset + i + 4]);
__m128 out2 = _mm_loadu_ps(&output_[batch_offset + i + 8]);
__m128 out3 = _mm_loadu_ps(&output_[batch_offset + i + 12]);
__m128 min01 = _mm_min_ps(out0, out1);
__m128 min23 = _mm_min_ps(out2, out3);
__m128 max01 = _mm_max_ps(out0, out1);
__m128 max23 = _mm_max_ps(out2, out3);
min_pre_activation0 = _mm_min_ps(min_pre_activation0, min01);
min_pre_activation1 = _mm_min_ps(min_pre_activation1, min23);
max_pre_activation0 = _mm_max_ps(max_pre_activation0, max01);
max_pre_activation1 = _mm_max_ps(max_pre_activation1, max23);
out0 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out0));
out1 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out1));
out2 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out2));
out3 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out3));
_mm_storeu_ps(&output_[batch_offset + i + 0], out0);
_mm_storeu_ps(&output_[batch_offset + i + 4], out1);
_mm_storeu_ps(&output_[batch_offset + i + 8], out2);
_mm_storeu_ps(&output_[batch_offset + i + 12], out3);
}
}
thread_stat_state.min_pre_activation_ = m128_hmin_ps(_mm_min_ps(min_pre_activation0, min_pre_activation1));
thread_stat_state.max_pre_activation_ = m128_hmax_ps(_mm_max_ps(max_pre_activation0, max_pre_activation1));
for (IndexType b = offset; b < offset + count; ++b)
{
const IndexType batch_offset = kOutputDimensions * b;
for (IndexType half = 0; half < 2; ++half)
{
const IndexType half_offset = batch_offset + half * kHalfDimensions;
for (IndexType i = 0; i < kHalfDimensions; i += 16)
{
const __m128 out0 = _mm_loadu_ps(&output_[i + 0 + half_offset]);
const __m128 out1 = _mm_loadu_ps(&output_[i + 4 + half_offset]);
const __m128 out2 = _mm_loadu_ps(&output_[i + 8 + half_offset]);
const __m128 out3 = _mm_loadu_ps(&output_[i + 12 + half_offset]);
__m128 minact0 = _mm_loadu_ps(&thread_stat_state.min_activations_[i + 0]);
__m128 minact1 = _mm_loadu_ps(&thread_stat_state.min_activations_[i + 4]);
__m128 minact2 = _mm_loadu_ps(&thread_stat_state.min_activations_[i + 8]);
__m128 minact3 = _mm_loadu_ps(&thread_stat_state.min_activations_[i + 12]);
__m128 maxact0 = _mm_loadu_ps(&thread_stat_state.max_activations_[i + 0]);
__m128 maxact1 = _mm_loadu_ps(&thread_stat_state.max_activations_[i + 4]);
__m128 maxact2 = _mm_loadu_ps(&thread_stat_state.max_activations_[i + 8]);
__m128 maxact3 = _mm_loadu_ps(&thread_stat_state.max_activations_[i + 12]);
minact0 = _mm_min_ps(out0, minact0);
minact1 = _mm_min_ps(out1, minact1);
minact2 = _mm_min_ps(out2, minact2);
minact3 = _mm_min_ps(out3, minact3);
maxact0 = _mm_max_ps(out0, maxact0);
maxact1 = _mm_max_ps(out1, maxact1);
maxact2 = _mm_max_ps(out2, maxact2);
maxact3 = _mm_max_ps(out3, maxact3);
_mm_storeu_ps(&thread_stat_state.min_activations_[i + 0], minact0);
_mm_storeu_ps(&thread_stat_state.min_activations_[i + 4], minact1);
_mm_storeu_ps(&thread_stat_state.min_activations_[i + 8], minact2);
_mm_storeu_ps(&thread_stat_state.min_activations_[i + 12], minact3);
_mm_storeu_ps(&thread_stat_state.max_activations_[i + 0], maxact0);
_mm_storeu_ps(&thread_stat_state.max_activations_[i + 4], maxact1);
_mm_storeu_ps(&thread_stat_state.max_activations_[i + 8], maxact2);
_mm_storeu_ps(&thread_stat_state.max_activations_[i + 12], maxact3);
}
}
}
}
#else
// clipped ReLU
for (IndexType b = offset; b < offset + count; ++b) {
const IndexType batch_offset = kOutputDimensions * b;
for (IndexType i = 0; i < kOutputDimensions; ++i) {
const IndexType index = batch_offset + i;
thread_stat_state.min_pre_activation_ = std::min(thread_stat_state.min_pre_activation_, output_[index]);
thread_stat_state.max_pre_activation_ = std::max(thread_stat_state.max_pre_activation_, output_[index]);
output_[index] = std::max(+kZero, std::min(+kOne, output_[index]));
const IndexType t = i % kHalfDimensions;
thread_stat_state.min_activations_[t] = std::min(thread_stat_state.min_activations_[t], output_[index]);
thread_stat_state.max_activations_[t] = std::max(thread_stat_state.max_activations_[t], output_[index]);
}
}
#endif
}
// backpropagation
void backpropagate(Thread& th,
const LearnFloatType* gradients,
uint64_t offset,
uint64_t count) {
auto& thread_stat_state = thread_stat_states_[th.thread_idx()];
auto& thread_bias_state = thread_bias_states_[th.thread_idx()];
#if defined (USE_SSE2)
{
static_assert(kHalfDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time");
const __m128 kZero4 = _mm_set1_ps(+kZero);
const __m128 kOne4 = _mm_set1_ps(+kOne);
for (IndexType b = offset; b < offset + count; ++b)
{
const IndexType batch_offset = kOutputDimensions * b;
for (IndexType i = 0; i < kOutputDimensions; i += 16)
{
__m128 out0 = _mm_loadu_ps(&output_[batch_offset + i + 0]);
__m128 out1 = _mm_loadu_ps(&output_[batch_offset + i + 4]);
__m128 out2 = _mm_loadu_ps(&output_[batch_offset + i + 8]);
__m128 out3 = _mm_loadu_ps(&output_[batch_offset + i + 12]);
__m128 clipped0 = _mm_or_ps(_mm_cmple_ps(out0, kZero4), _mm_cmpge_ps(out0, kOne4));
__m128 clipped1 = _mm_or_ps(_mm_cmple_ps(out1, kZero4), _mm_cmpge_ps(out1, kOne4));
__m128 clipped2 = _mm_or_ps(_mm_cmple_ps(out2, kZero4), _mm_cmpge_ps(out2, kOne4));
__m128 clipped3 = _mm_or_ps(_mm_cmple_ps(out3, kZero4), _mm_cmpge_ps(out3, kOne4));
__m128 grad0 = _mm_loadu_ps(&gradients[batch_offset + i + 0]);
__m128 grad1 = _mm_loadu_ps(&gradients[batch_offset + i + 4]);
__m128 grad2 = _mm_loadu_ps(&gradients[batch_offset + i + 8]);
__m128 grad3 = _mm_loadu_ps(&gradients[batch_offset + i + 12]);
grad0 = _mm_andnot_ps(clipped0, grad0);
grad1 = _mm_andnot_ps(clipped1, grad1);
grad2 = _mm_andnot_ps(clipped2, grad2);
grad3 = _mm_andnot_ps(clipped3, grad3);
_mm_storeu_ps(&gradients_[batch_offset + i + 0], grad0);
_mm_storeu_ps(&gradients_[batch_offset + i + 4], grad1);
_mm_storeu_ps(&gradients_[batch_offset + i + 8], grad2);
_mm_storeu_ps(&gradients_[batch_offset + i + 12], grad3);
const int clipped_mask =
(_mm_movemask_ps(clipped0) << 0)
| (_mm_movemask_ps(clipped1) << 4)
| (_mm_movemask_ps(clipped2) << 8)
| (_mm_movemask_ps(clipped3) << 12);
thread_stat_state.num_clipped_ += popcount(clipped_mask);
}
}
}
#else
for (IndexType b = offset; b < offset + count; ++b) {
const IndexType batch_offset = kOutputDimensions * b;
for (IndexType i = 0; i < kOutputDimensions; ++i) {
const IndexType index = batch_offset + i;
const bool clipped = (output_[index] <= kZero) | (output_[index] >= kOne);
gradients_[index] = gradients[index] * !clipped;
thread_stat_state.num_clipped_ += clipped;
}
}
#endif
thread_stat_state.num_total_ += count * kOutputDimensions;
#if defined(USE_BLAS)
for (IndexType b = offset; b < offset + count; ++b) {
const IndexType batch_offset = kOutputDimensions * b;
for (IndexType c = 0; c < 2; ++c) {
const IndexType output_offset = batch_offset + kHalfDimensions * c;
cblas_saxpy(
kHalfDimensions, 1.0,
&gradients_[output_offset], 1, thread_bias_state.biases_diff_, 1
);
}
}
#else
for (IndexType b = offset; b < offset + count; ++b) {
const IndexType batch_offset = kOutputDimensions * b;
for (IndexType c = 0; c < 2; ++c) {
const IndexType output_offset = batch_offset + kHalfDimensions * c;
Blas::saxpy(
kHalfDimensions, 1.0,
&gradients_[output_offset], 1, thread_bias_state.biases_diff_, 1
);
}
}
#endif
}
void reduce_thread_stat_state()
{
for (IndexType i = 1; i < thread_stat_states_.size(); ++i)
{
thread_stat_states_[0] += thread_stat_states_[i];
}
}
void reduce_thread_bias_state()
{
for (IndexType i = 1; i < thread_bias_states_.size(); ++i)
{
thread_bias_states_[0] += thread_bias_states_[i];
}
}
void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) {
const LearnFloatType local_learning_rate =
learning_rate * learning_rate_scale_;
// Since only the columns of the weight matrix corresponding to features that
// appeared in the input are updated, compensate by scaling the learning rate
// instead of applying momentum to the weights.
const LearnFloatType effective_learning_rate =
static_cast<LearnFloatType>(local_learning_rate / (1.0 - momentum_));
reduce_thread_bias_state();
auto& main_thread_state = thread_bias_states_[0];
#if defined(USE_BLAS)
cblas_saxpy(
kHalfDimensions, -local_learning_rate,
main_thread_state.biases_diff_, 1, biases_, 1
);
#else
Blas::saxpy(
kHalfDimensions, -local_learning_rate,
main_thread_state.biases_diff_, 1, biases_, 1
);
#endif
thread_pool.execute_with_workers(
[&, num_threads = thread_pool.size()](Thread& th) {
const auto thread_index = th.thread_idx();
for (IndexType b = 0; b < batch_size_; ++b) {
const IndexType batch_offset = kOutputDimensions * b;
for (IndexType c = 0; c < 2; ++c) {
const IndexType output_offset = batch_offset + kHalfDimensions * c;
for (const auto& feature : batch_[b].training_features[c]) {
const IndexType feature_index = feature.get_index();
const IndexType weights_offset =
kHalfDimensions * feature_index;
#if defined (USE_SSE2)
_mm_prefetch(reinterpret_cast<const char*>(&weights_[weights_offset]), _MM_HINT_T2);
#endif
// We assign each bucket a continuous range of bits at least
// of cache line size to prevent false sharing.
// For HalfKP this is enough to saturate about 80 threads.
const IndexType thread_bucket =
(feature_index / BitsetType::best_concurrent_access_stride)
% num_threads;
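// For illustration only: if the stride were 512 and there were 8 threads,
// features 0..511 would belong to thread 0, 512..1023 to thread 1, and so
// on, wrapping around every 4096 features.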
if (thread_bucket != thread_index)
continue;
// This operation can be performed safely because
// each thread accesses a different memory location
// (even a different cache line)
observed_features.set(feature_index);
const auto scale = static_cast<LearnFloatType>(
effective_learning_rate / feature.get_count());
#if defined (USE_BLAS)
cblas_saxpy(
kHalfDimensions, -scale,
&gradients_[output_offset], 1,
&weights_[weights_offset], 1
);
#else
Blas::saxpy(
kHalfDimensions, -scale,
&gradients_[output_offset],
&weights_[weights_offset]
);
#endif
}
}
}
}
);
thread_pool.wait_for_workers_finished();
}
private:
// constructor
Trainer(LayerType* target_layer) :
batch_(nullptr),
batch_size_(0),
target_layer_(target_layer),
biases_(),
weights_(),
momentum_(0.2),
learning_rate_scale_(1.0) {
dequantize_parameters();
}
// Quantize the parameters and write them to the target layer
void quantize_parameters() {
for (IndexType i = 0; i < kHalfDimensions; ++i) {
target_layer_->biases_[i] =
round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
}
std::vector<TrainingFeature> training_features;
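// The quantized weight of each real feature is the rounded sum of the float
// weights of all learning (factored) features it expands to, folding the
// factorization back into the compact network.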
Threads.for_each_index_with_workers(
0, RawFeatures::kDimensions,
[this, training_features](Thread&, int j) mutable {
training_features.clear();
Features::Factorizer<RawFeatures>::append_training_features(
j, &training_features);
for (IndexType i = 0; i < kHalfDimensions; ++i) {
double sum = 0.0;
for (const auto& feature : training_features) {
sum += weights_[kHalfDimensions * feature.get_index() + i];
}
target_layer_->weights_[kHalfDimensions * j + i] =
round<typename LayerType::WeightType>(sum * kWeightScale);
}
}
);
Threads.wait_for_workers_finished();
}
void reset_stats() {
for (auto& state : thread_stat_states_)
state.reset();
}
// Read the quantized integer parameters back as floats
void dequantize_parameters() {
for (IndexType i = 0; i < kHalfDimensions; ++i) {
biases_[i] = static_cast<LearnFloatType>(
target_layer_->biases_[i] / kBiasScale);
}
std::fill(std::begin(weights_), std::end(weights_), +kZero);
for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) {
weights_[i] = static_cast<LearnFloatType>(
target_layer_->weights_[i] / kWeightScale);
}
reset_stats();
for (auto& state : thread_bias_states_)
state.reset();
}
// Zero the weights of features that never appeared in the training data
void clear_unobserved_feature_weights() {
for (IndexType i = 0; i < kInputDimensions; ++i) {
if (!observed_features.test(i)) {
std::fill(std::begin(weights_) + kHalfDimensions * i,
std::begin(weights_) + kHalfDimensions * (i + 1), +kZero);
}
}
quantize_parameters();
}
// Check if there are any problems with learning
void check_health() {
constexpr LearnFloatType kPreActivationLimit =
std::numeric_limits<typename LayerType::WeightType>::max() /
kWeightScale;
reduce_thread_stat_state();
auto& main_thread_state = thread_stat_states_[0];
const auto largest_min_activation = *std::max_element(
std::begin(main_thread_state.min_activations_), std::end(main_thread_state.min_activations_));
const auto smallest_max_activation = *std::min_element(
std::begin(main_thread_state.max_activations_), std::end(main_thread_state.max_activations_));
double abs_bias_sum = 0.0;
double abs_weight_sum = 0.0;
for(auto b : biases_)
abs_bias_sum += std::abs(b);
std::vector<TrainingFeature> training_features;
for (IndexType j = 0; j < RawFeatures::kDimensions; ++j)
{
training_features.clear();
Features::Factorizer<RawFeatures>::append_training_features(
j, &training_features);
for (const auto& feature : training_features) {
for (IndexType i = 0; i < kHalfDimensions; ++i) {
abs_weight_sum += std::abs(weights_[kHalfDimensions * feature.get_index() + i]);
}
}
}
auto out = sync_region_cout.new_region();
out << "INFO (check_health):"
<< " layer " << LayerType::kLayerIndex
<< " - " << LayerType::get_name()
<< std::endl;
out << " - observed " << observed_features.count()
<< " (out of " << kInputDimensions << ") features"
<< std::endl;
out << " - (min, max) of pre-activations = "
<< main_thread_state.min_pre_activation_ << ", "
<< main_thread_state.max_pre_activation_ << " (limit = "
<< kPreActivationLimit << ")"
<< std::endl;
out << " - largest min activation = " << largest_min_activation
<< " , smallest max activation = " << smallest_max_activation
<< std::endl;
out << " - avg_abs_bias = " << abs_bias_sum / std::size(biases_) << std::endl;
out << " - avg_abs_weight = " << abs_weight_sum / std::size(weights_) << std::endl;
out << " - clipped " << static_cast<double>(main_thread_state.num_clipped_) / main_thread_state.num_total_ * 100.0 << "% of outputs"
<< std::endl;
out.unlock();
reset_stats();
}
// number of input/output dimensions
static constexpr IndexType kInputDimensions =
Features::Factorizer<RawFeatures>::get_dimensions();
static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
static constexpr IndexType kHalfDimensions = LayerType::kHalfDimensions;
// Coefficient used for parameterization
static constexpr LearnFloatType kActivationScale =
std::numeric_limits<std::int8_t>::max();
static constexpr LearnFloatType kBiasScale = kActivationScale;
static constexpr LearnFloatType kWeightScale = kActivationScale;
// LearnFloatType constant
static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
// mini batch
const Example* batch_;
IndexType batch_size_;
// layer to learn
LayerType* const target_layer_;
// parameter
alignas(kCacheLineSize) LearnFloatType biases_[kHalfDimensions];
alignas(kCacheLineSize)
LearnFloatType weights_[kHalfDimensions * kInputDimensions];
// Buffer used for updating parameters
std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
// Forward propagation buffer
std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
// Features that appeared in the training data
using BitsetType = LargeBitset<kInputDimensions>;
BitsetType observed_features;
// hyper parameter
LearnFloatType momentum_;
LearnFloatType learning_rate_scale_;
struct alignas(kCacheLineSize) ThreadStatState
{
alignas(kCacheLineSize) LearnFloatType min_activations_[kHalfDimensions];
alignas(kCacheLineSize) LearnFloatType max_activations_[kHalfDimensions];
LearnFloatType min_pre_activation_;
LearnFloatType max_pre_activation_;
uint64_t num_clipped_;
uint64_t num_total_;
ThreadStatState() { reset(); }
ThreadStatState& operator+=(const ThreadStatState& other)
{
for (IndexType i = 0; i < kHalfDimensions; ++i)
{
min_activations_[i] = std::min(min_activations_[i], other.min_activations_[i]);
}
for (IndexType i = 0; i < kHalfDimensions; ++i)
{
max_activations_[i] = std::max(max_activations_[i], other.max_activations_[i]);
}
min_pre_activation_ = std::min(min_pre_activation_, other.min_pre_activation_);
max_pre_activation_ = std::max(max_pre_activation_, other.max_pre_activation_);
num_clipped_ += other.num_clipped_;
num_total_ += other.num_total_;
return *this;
}
void reset()
{
std::fill(std::begin(min_activations_), std::end(min_activations_), std::numeric_limits<float>::max());
std::fill(std::begin(max_activations_), std::end(max_activations_), std::numeric_limits<float>::lowest());
min_pre_activation_ = std::numeric_limits<float>::max();
max_pre_activation_ = std::numeric_limits<float>::lowest();
num_clipped_ = 0;
num_total_ = 0;
}
};
struct alignas(kCacheLineSize) ThreadBiasState
{
alignas(kCacheLineSize) LearnFloatType biases_diff_[kHalfDimensions];
ThreadBiasState() { reset(); }
ThreadBiasState& operator+=(const ThreadBiasState& other)
{
for (IndexType i = 0; i < kHalfDimensions; ++i)
{
biases_diff_[i] += other.biases_diff_[i];
}
return *this;
}
void reset()
{
std::fill(std::begin(biases_diff_), std::end(biases_diff_), 0.0f);
}
};
std::vector<ThreadStatState, CacheLineAlignedAllocator<ThreadStatState>> thread_stat_states_;
std::vector<ThreadBiasState, CacheLineAlignedAllocator<ThreadBiasState>> thread_bias_states_;
};
} // namespace Eval::NNUE
#endif

View File

@@ -1,383 +0,0 @@
#ifndef _NNUE_TRAINER_INPUT_SLICE_H_
#define _NNUE_TRAINER_INPUT_SLICE_H_
#include "trainer.h"
#include "extra/stockfish_blas.h"
#include "learn/learn.h"
#include "nnue/layers/input_slice.h"
#include "thread.h"
// Specialization of NNUE evaluation function learning class template for InputSlice
namespace Eval::NNUE {
// Learning: Input layer
// This is tricky. It exists because when there's more than one trainer
// on top of a single feature transformer we want to only call propagate/backpropagate
// on the feature transformer once. This is straightforward in the old
// multithreading case, because propagate/backpropagate is called just once from the
// main thread. But with the current implementation of coarser multithreading
// we end up calling each method from each thread. Therefore we have to keep
// num_calls and current_operation on a per-thread basis, each thread must work
// on its designated batch slice, and the only synchronization points are
// step_start and step_end - for which we use the state of the first thread.
// Each thread requires its own bookkeeping because it's possible that
// one thread is still in propagate of some batch slice while another thread
// is doing backpropagate of some other slice. We also ensure the thread state
// isn't susceptible to false sharing by giving each state a full cache line.
class SharedInputTrainer {
public:
// factory function
static std::shared_ptr<SharedInputTrainer> create(
FeatureTransformer* ft) {
static std::shared_ptr<SharedInputTrainer> instance;
if (!instance) {
instance.reset(new SharedInputTrainer(ft));
}
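// Every layer sharing this input bumps the referrer count; the per-thread
// call counters below use it to tell when a full round of calls from all
// consumers has completed, so the feature transformer is invoked only once
// per round.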
++instance->num_referrers_;
return instance;
}
// Set options such as hyperparameters
void send_message(Message* message) {
auto& thread_state = thread_states_[0];
if (thread_state.num_calls == 0) {
thread_state.current_operation = Operation::kSendMessage;
feature_transformer_trainer_->send_message(message);
}
assert(thread_state.current_operation == Operation::kSendMessage);
if (++thread_state.num_calls == num_referrers_) {
thread_state.num_calls = 0;
thread_state.current_operation = Operation::kNone;
}
}
// Initialize the parameters with random numbers
template <typename RNG>
void initialize(RNG& rng) {
auto& thread_state = thread_states_[0];
if (thread_state.num_calls == 0) {
thread_state.current_operation = Operation::kInitialize;
feature_transformer_trainer_->initialize(rng);
}
assert(thread_state.current_operation == Operation::kInitialize);
if (++thread_state.num_calls == num_referrers_) {
thread_state.num_calls = 0;
thread_state.current_operation = Operation::kNone;
}
}
const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
{
const auto size = batch_end - batch_begin;
if ((long)gradients_.size() < (long)kInputDimensions * size) {
gradients_.resize(kInputDimensions * size);
}
if (thread_states_.size() < thread_pool.size())
{
thread_states_.resize(thread_pool.size());
}
batch_size_ = size;
auto& thread_state = thread_states_[0];
if (thread_state.num_calls == 0) {
thread_state.current_operation = Operation::kStepStart;
output_ = feature_transformer_trainer_->step_start(thread_pool, batch_begin, batch_end);
}
assert(thread_state.current_operation == Operation::kStepStart);
if (++thread_state.num_calls == num_referrers_) {
thread_state.num_calls = 0;
thread_state.current_operation = Operation::kNone;
}
return output_;
}
// forward propagation
void propagate(Thread& th, uint64_t offset, uint64_t count) {
const auto thread_id = th.thread_idx();
auto& thread_state = thread_states_[thread_id];
if (thread_state.num_calls == 0) {
thread_state.current_operation = Operation::kPropagate;
feature_transformer_trainer_->propagate(th, offset, count);
}
assert(thread_state.current_operation == Operation::kPropagate);
if (++thread_state.num_calls == num_referrers_) {
thread_state.num_calls = 0;
thread_state.current_operation = Operation::kNone;
}
}
// backpropagation
void backpropagate(Thread& th,
const LearnFloatType* gradients,
uint64_t offset,
uint64_t count) {
const auto thread_id = th.thread_idx();
auto& thread_state = thread_states_[thread_id];
if (num_referrers_ == 1) {
feature_transformer_trainer_->backpropagate(th, gradients, offset, count);
return;
}
if (thread_state.num_calls == 0) {
thread_state.current_operation = Operation::kBackPropagate;
for (IndexType b = offset; b < offset + count; ++b) {
const IndexType batch_offset = kInputDimensions * b;
for (IndexType i = 0; i < kInputDimensions; ++i) {
gradients_[batch_offset + i] = static_cast<LearnFloatType>(0.0);
}
}
}
assert(thread_state.current_operation == Operation::kBackPropagate);
for (IndexType b = offset; b < offset + count; ++b) {
const IndexType batch_offset = kInputDimensions * b;
for (IndexType i = 0; i < kInputDimensions; ++i) {
gradients_[batch_offset + i] += gradients[batch_offset + i];
}
}
if (++thread_state.num_calls == num_referrers_) {
feature_transformer_trainer_->backpropagate(
th, gradients_.data(), offset, count);
thread_state.num_calls = 0;
thread_state.current_operation = Operation::kNone;
}
}
void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) {
auto& thread_state = thread_states_[0];
if (thread_state.num_calls == 0) {
thread_state.current_operation = Operation::kStepEnd;
feature_transformer_trainer_->step_end(thread_pool, learning_rate);
}
assert(thread_state.current_operation == Operation::kStepEnd);
if (++thread_state.num_calls == num_referrers_) {
thread_state.num_calls = 0;
thread_state.current_operation = Operation::kNone;
}
}
private:
// constructor
SharedInputTrainer(FeatureTransformer* ft) :
batch_size_(0),
num_referrers_(0),
thread_states_(1),
feature_transformer_trainer_(Trainer<FeatureTransformer>::create(
ft)),
output_(nullptr) {
}
// number of input/output dimensions
static constexpr IndexType kInputDimensions =
FeatureTransformer::kOutputDimensions;
// type of processing
enum class Operation {
kNone,
kSendMessage,
kInitialize,
kStepStart,
kPropagate,
kBackPropagate,
kStepEnd,
};
// number of samples in mini-batch
IndexType batch_size_;
// number of layers sharing this layer as input
std::uint32_t num_referrers_;
struct alignas(kCacheLineSize) ThreadState
{
std::uint32_t num_calls{0};
// current processing type
Operation current_operation = Operation::kNone;
};
// Per-thread bookkeeping of the current operation and how many consumers have issued it
std::vector<ThreadState, CacheLineAlignedAllocator<ThreadState>> thread_states_;
// Trainer of input feature converter
const std::shared_ptr<Trainer<FeatureTransformer>>
feature_transformer_trainer_;
// pointer to output shared for forward propagation
const LearnFloatType* output_;
// buffer for back propagation
std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
};
// Learning: Input layer
template <IndexType OutputDimensions, IndexType Offset>
class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
private:
// Type of layer to learn
using LayerType = Layers::InputSlice<OutputDimensions, Offset>;
public:
// factory function
static std::shared_ptr<Trainer> create(
LayerType* /*target_layer*/, FeatureTransformer* ft) {
return std::shared_ptr<Trainer>(new Trainer(ft));
}
// Set options such as hyperparameters
void send_message(Message* message) {
shared_input_trainer_->send_message(message);
}
// Initialize the parameters with random numbers
template <typename RNG>
void initialize(RNG& rng) {
shared_input_trainer_->initialize(rng);
}
const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
{
const auto size = static_cast<IndexType>(batch_end - batch_begin);
if (output_.size() < kOutputDimensions * size) {
output_.resize(kOutputDimensions * size);
gradients_.resize(kInputDimensions * size);
}
batch_size_ = size;
input_ = shared_input_trainer_->step_start(thread_pool, batch_begin, batch_end);
return output_.data();
}
// forward propagation
void propagate(Thread& th, uint64_t offset, uint64_t count) {
shared_input_trainer_->propagate(th, offset, count);
for (IndexType b = offset; b < offset + count; ++b) {
const IndexType input_offset = kInputDimensions * b;
const IndexType output_offset = kOutputDimensions * b;
#if defined(USE_BLAS)
cblas_scopy(
kOutputDimensions, &input_[input_offset + Offset], 1,
&output_[output_offset], 1
);
#else
Blas::scopy(
kOutputDimensions, &input_[input_offset + Offset], 1,
&output_[output_offset], 1
);
#endif
}
}
// backpropagation
void backpropagate(Thread& th,
const LearnFloatType* gradients,
uint64_t offset,
uint64_t count) {
for (IndexType b = offset; b < offset + count; ++b)
{
const IndexType input_offset = kInputDimensions * b;
const IndexType output_offset = kOutputDimensions * b;
IndexType i = 0;
if constexpr (Offset > 0)
{
for (; i < Offset; ++i) {
gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
}
}
for (; i < Offset + kOutputDimensions; ++i) {
gradients_[input_offset + i] = gradients[output_offset + i - Offset];
}
if constexpr (Offset + kOutputDimensions < kInputDimensions)
{
for (; i < kInputDimensions; ++i)
{
gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
}
}
}
shared_input_trainer_->backpropagate(th, gradients_.data(), offset, count);
}
void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) {
shared_input_trainer_->step_end(thread_pool, learning_rate);
}
private:
// constructor
Trainer(FeatureTransformer* ft) :
batch_size_(0),
shared_input_trainer_(SharedInputTrainer::create(ft)) {
}
// number of input/output dimensions
static constexpr IndexType kInputDimensions =
FeatureTransformer::kOutputDimensions;
static constexpr IndexType kOutputDimensions = OutputDimensions;
static_assert(Offset + kOutputDimensions <= kInputDimensions, "");
// number of samples in mini-batch
IndexType batch_size_;
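// output of the shared input trainer, i.e. the full feature-transformer output this slice reads from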
const LearnFloatType* input_;
// Trainer of shared input layer
const std::shared_ptr<SharedInputTrainer> shared_input_trainer_;
// Forward propagation buffer
std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
// buffer for back propagation
std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
};
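// A minimal, self-contained sketch (an assumption about intent, not engine API;
// the function names are invented) of the per-sample arithmetic above: forward
// copies a window of width kOutputDimensions starting at Offset out of the shared
// input, and backward scatters the incoming gradient into a zero-padded buffer of
// width kInputDimensions, so gradient flows only through the selected slice.
inline void slice_forward_sketch(const LearnFloatType* input, LearnFloatType* output,
                                 IndexType offset, IndexType out_dims) {
    for (IndexType i = 0; i < out_dims; ++i)
        output[i] = input[offset + i];        // copy the slice
}

inline void slice_backward_sketch(const LearnFloatType* grad_out, LearnFloatType* grad_in,
                                  IndexType offset, IndexType out_dims, IndexType in_dims) {
    for (IndexType i = 0; i < in_dims; ++i)
        grad_in[i] = static_cast<LearnFloatType>(0.0);   // outside the slice the gradient is zero
    for (IndexType i = 0; i < out_dims; ++i)
        grad_in[offset + i] = grad_out[i];               // inside the slice it passes through unchanged
}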
} // namespace Eval::NNUE
#endif

View File

@@ -1,201 +0,0 @@
#ifndef _NNUE_TRAINER_SUM_H_
#define _NNUE_TRAINER_SUM_H_
#include "trainer.h"
#include "extra/stockfish_blas.h"
#include "learn/learn.h"
#include "nnue/layers/sum.h"
#include "thread.h"
// Specialization of the NNUE trainer class template for the Sum layer
namespace Eval::NNUE {
// Learning: A layer that sums the outputs of multiple layers
template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
class Trainer<Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>> :
Trainer<Layers::Sum<RemainingPreviousLayers...>> {
private:
// Type of layer to learn
using LayerType = Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>;
using Tail = Trainer<Layers::Sum<RemainingPreviousLayers...>>;
public:
// factory function
static std::shared_ptr<Trainer> create(
LayerType* target_layer, FeatureTransformer* ft) {
return std::shared_ptr<Trainer>(
new Trainer(target_layer, ft));
}
// Set options such as hyperparameters
void send_message(Message* message) {
// The results of the other member functions do not depend on the processing order,
// so Tail is handled first there to simplify the implementation; send_message,
// however, handles the head first so that the subscript correspondence stays easy to follow.
previous_layer_trainer_->send_message(message);
Tail::send_message(message);
}
// Initialize the parameters with random numbers
template <typename RNG>
void initialize(RNG& rng) {
Tail::initialize(rng);
previous_layer_trainer_->initialize(rng);
}
// forward propagation
/*const*/ LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector<Example>& batch) {
batch_size_ = static_cast<IndexType>(batch.size());
auto output = Tail::propagate(thread_pool, batch);
const auto head_output = previous_layer_trainer_->propagate(thread_pool, batch);
#if defined(USE_BLAS)
cblas_saxpy(
kOutputDimensions * batch_size_, 1.0,
head_output, 1, output, 1
);
#else
Blas::saxpy(
thread_pool,
kOutputDimensions * batch_size_, 1.0,
head_output, 1, output, 1
);
#endif
return output;
}
// backpropagation
void backpropagate(ThreadPool& thread_pool,
const LearnFloatType* gradients,
LearnFloatType learning_rate) {
Tail::backpropagate(thread_pool, gradients, learning_rate);
previous_layer_trainer_->backpropagate(thread_pool, gradients, learning_rate);
}
private:
// constructor
Trainer(LayerType* target_layer, FeatureTransformer* ft):
Tail(target_layer, ft),
batch_size_(0),
previous_layer_trainer_(Trainer<FirstPreviousLayer>::create(
&target_layer->previous_layer_, ft)),
target_layer_(target_layer) {
}
// number of input/output dimensions
static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
// make the derived Sum-layer trainers friends
template <typename SumLayer>
friend class Trainer;
// number of samples in mini-batch
IndexType batch_size_;
// Trainer of the previous layer
const std::shared_ptr<Trainer<FirstPreviousLayer>> previous_layer_trainer_;
// layer to learn
LayerType* const target_layer_;
};
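// For reference, a plain-loop equivalent (a sketch of the assumed semantics, not
// the code path actually compiled) of the saxpy call above with alpha == 1:
// every element of the head output is added onto the already accumulated tail output.
inline void axpy_one_sketch(IndexType n, const LearnFloatType* x, LearnFloatType* y) {
    for (IndexType i = 0; i < n; ++i)
        y[i] += x[i];   // y := 1.0 * x + y
}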
// Learning: layer that sums the outputs of multiple layers (base case with a single template argument)
template <typename PreviousLayer>
class Trainer<Layers::Sum<PreviousLayer>> {
private:
// Type of layer to learn
using LayerType = Layers::Sum<PreviousLayer>;
public:
// factory function
static std::shared_ptr<Trainer> create(
LayerType* target_layer, FeatureTransformer* ft) {
return std::shared_ptr<Trainer>(
new Trainer(target_layer, ft));
}
// Set options such as hyperparameters
void send_message(Message* message) {
previous_layer_trainer_->send_message(message);
}
// Initialize the parameters with random numbers
template <typename RNG>
void initialize(RNG& rng) {
previous_layer_trainer_->initialize(rng);
}
// forward propagation
/*const*/ LearnFloatType* propagate(const std::vector<Example>& batch) {
if (output_.size() < kOutputDimensions * batch.size()) {
output_.resize(kOutputDimensions * batch.size());
}
batch_size_ = static_cast<IndexType>(batch.size());
const auto output = previous_layer_trainer_->propagate(batch);
#if defined(USE_BLAS)
cblas_scopy(kOutputDimensions * batch_size_, output, 1, &output_[0], 1);
#else
for (IndexType b = 0; b < batch_size_; ++b) {
const IndexType batch_offset = kOutputDimensions * b;
for (IndexType i = 0; i < kOutputDimensions; ++i) {
output_[batch_offset + i] = output[batch_offset + i];
}
}
#endif
return output_.data();
}
// backpropagation
void backpropagate(const LearnFloatType* gradients,
LearnFloatType learning_rate) {
previous_layer_trainer_->backpropagate(gradients, learning_rate);
}
private:
// constructor
Trainer(LayerType* target_layer, FeatureTransformer* ft) :
batch_size_(0),
previous_layer_trainer_(Trainer<PreviousLayer>::create(
&target_layer->previous_layer_, ft)),
target_layer_(target_layer) {
}
// number of input/output dimensions
static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
// make the derived Sum-layer trainers friends
template <typename SumLayer>
friend class Trainer;
// number of samples in mini-batch
IndexType batch_size_;
// Trainer of the previous layer
const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
// layer to learn
LayerType* const target_layer_;
// Forward propagation buffer
std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
};
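// A minimal sketch (illustrative assumption, not the engine's classes) of how the
// variadic recursion above unrolls: Trainer<Sum<A, B, C>> derives from
// Trainer<Sum<B, C>>, which derives from the single-argument base case, so the
// output is accumulated tail-first and the head contribution is added last.
template <typename... Layers>
struct SumUnrollSketch;

template <typename First, typename... Rest>
struct SumUnrollSketch<First, Rest...> : SumUnrollSketch<Rest...> {
    static LearnFloatType propagate(const LearnFloatType* per_layer_outputs) {
        // tail first, then add this (head) layer's contribution
        return SumUnrollSketch<Rest...>::propagate(per_layer_outputs + 1)
             + per_layer_outputs[0];
    }
};

template <typename Last>
struct SumUnrollSketch<Last> {
    static LearnFloatType propagate(const LearnFloatType* per_layer_outputs) {
        return per_layer_outputs[0];  // base case: a single previous layer
    }
};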
} // namespace Eval::NNUE
#endif

View File

@@ -22,11 +22,9 @@
#include <sstream>
#include <string>
#include "extra/stockfish_blas.h"
#include "nnue/evaluate_nnue.h"
#include "evaluate.h"
#include "movegen.h"
#include "nnue/nnue_test_command.h"
#include "position.h"
#include "search.h"
#include "syzygy/tbprobe.h"
@@ -37,7 +35,6 @@
#include "learn/gensfen.h"
#include "learn/gensfen_nonpv.h"
#include "learn/learn.h"
#include "learn/convert.h"
#include "learn/transform.h"
#include "learn/stats.h"
@@ -49,17 +46,6 @@ extern vector<string> setup_bench(const Position&, istream&);
// FEN string of the initial position, normal chess
const char* StartFEN = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1";
void test_cmd(Position& pos, istringstream& is)
{
// Initialize NNUE, since the test may trigger a search.
Eval::NNUE::init();
std::string param;
is >> param;
if (param == "nnue") Eval::NNUE::test_command(pos, is);
}
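// Usage note (inferred from test_cmd above and the "test" dispatch in UCI::loop below):
// typing "test nnue" at the UCI prompt runs Eval::NNUE::test_command() on the current position.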
namespace {
// position() is called when engine receives the "position" UCI command.
@@ -344,7 +330,6 @@ void UCI::loop(int argc, char* argv[]) {
else if (token == "gensfen") Learner::gensfen(is);
else if (token == "gensfen_nonpv") Learner::gensfen_nonpv(is);
else if (token == "learn") Learner::learn(is);
else if (token == "convert") Learner::convert(is);
else if (token == "convert_bin") Learner::convert_bin(is);
else if (token == "convert_plain") Learner::convert_plain(is);
@@ -361,17 +346,7 @@ void UCI::loop(int argc, char* argv[]) {
std::cout << th.thread_idx() << '\n';
});
}
else if (token == "blastest")
{
Blas::test(Threads);
}
else if (token == "blasbench")
{
Blas::bench(Threads);
}
// test command
else if (token == "test") test_cmd(pos, is);
else
sync_cout << "Unknown command: " << cmd << sync_endl;