remove learn

Tomasz Sobczyk
2021-04-18 19:04:14 +02:00
parent 17946c5954
commit 3101ae7973
25 changed files with 2 additions and 7270 deletions


@@ -47,9 +47,7 @@ PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 1000 sfen_format bin output_file_name
SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \
material.cpp misc.cpp movegen.cpp movepick.cpp pawns.cpp position.cpp psqt.cpp \
search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \
extra/stockfish_blas.cpp \
nnue/evaluate_nnue.cpp \
nnue/evaluate_nnue_learner.cpp \
nnue/features/half_kp.cpp \
nnue/features/half_ka.cpp \
nnue/features/half_relative_kp.cpp \
@@ -59,9 +57,7 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
nnue/features/a.cpp \
nnue/features/castling_right.cpp \
nnue/features/enpassant.cpp \
nnue/nnue_test_command.cpp \
learn/sfen_packer.cpp \
learn/learn.cpp \
learn/gensfen.cpp \
learn/gensfen_nonpv.cpp \
learn/opening_book.cpp \

File diff suppressed because it is too large.


@@ -1,140 +0,0 @@
#ifndef _STOCKFISH_BLAS_H_
#define _STOCKFISH_BLAS_H_
struct ThreadPool;
#if defined (_MSC_VER)
#define SF_BLAS_RESTRICT __restrict
#elif defined (__INTEL_COMPILER)
#define SF_BLAS_RESTRICT restrict
#elif defined (__clang__)
#define SF_BLAS_RESTRICT __restrict__
#elif defined (__GNUC__)
#define SF_BLAS_RESTRICT __restrict__
#endif
namespace Blas {
enum struct MatrixLayout {
RowMajor = 101,
ColMajor = 102
};
enum struct MatrixTranspose {
NoTrans = 111,
Trans = 112
};
void scopy(
const int N,
const float * SF_BLAS_RESTRICT X,
float * SF_BLAS_RESTRICT Y
);
void scopy(
const int N,
const float * SF_BLAS_RESTRICT X, const int incX,
float * SF_BLAS_RESTRICT Y, const int incY
);
void scopy(
ThreadPool& thread_pool,
const int N,
const float * SF_BLAS_RESTRICT X,
float * SF_BLAS_RESTRICT Y
);
void scopy(
ThreadPool& thread_pool,
const int N,
const float * SF_BLAS_RESTRICT X, const int incX,
float * SF_BLAS_RESTRICT Y, const int incY
);
void sscal(
const int N,
const float alpha,
float * SF_BLAS_RESTRICT X
);
void sscal(
const int N,
const float alpha,
float * SF_BLAS_RESTRICT X, const int incX
);
void sscal(
ThreadPool& thread_pool,
const int N,
const float alpha,
float * SF_BLAS_RESTRICT X
);
void sscal(
ThreadPool& thread_pool,
const int N,
const float alpha,
float * SF_BLAS_RESTRICT X, const int incX
);
void saxpy(
const int N,
const float alpha,
const float * SF_BLAS_RESTRICT X,
float * SF_BLAS_RESTRICT Y
);
void saxpy(
const int N,
const float alpha,
const float * SF_BLAS_RESTRICT X, const int incX,
float * SF_BLAS_RESTRICT Y, const int incY
);
void saxpy(
ThreadPool& thread_pool,
const int N,
const float alpha,
const float * SF_BLAS_RESTRICT X,
float * SF_BLAS_RESTRICT Y
);
void saxpy(
ThreadPool& thread_pool,
const int N,
const float alpha,
const float * SF_BLAS_RESTRICT X, const int incX,
float * SF_BLAS_RESTRICT Y, const int incY
);
void sgemm(
ThreadPool& thread_pool,
MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB,
const int M, const int N, const int K,
const float alpha,
const float * SF_BLAS_RESTRICT A, const int lda,
const float * SF_BLAS_RESTRICT B, const int ldb,
const float beta,
float * SF_BLAS_RESTRICT C, const int ldc
);
void sgemm(
MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB,
const int M, const int N, const int K,
const float alpha,
const float * SF_BLAS_RESTRICT A, const int lda,
const float * SF_BLAS_RESTRICT B, const int ldb,
const float beta,
float * SF_BLAS_RESTRICT C, const int ldc
);
void test(
ThreadPool& thread_pool
);
void bench(
ThreadPool& thread_pool
);
}
#endif
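For context, the removed shim mirrors standard CBLAS conventions (the enum values 101/102 and 111/112 match CBLAS), so a single-threaded matrix product through the sgemm overload declared above would presumably have looked like the sketch below; the include path, matrices and leading dimensions are illustrative assumptions only:

#include "extra/stockfish_blas.h"   // header shown above; path assumed from the Makefile entry

void sgemm_example()
{
    // Hypothetical usage sketch: C = 1.0 * A * B + 0.0 * C for a row-major 2x3 times 3x2 product.
    float A[2 * 3] = { 1, 2, 3,
                       4, 5, 6 };
    float B[3 * 2] = { 1, 0,
                       0, 1,
                       1, 1 };
    float C[2 * 2] = {};
    Blas::sgemm(Blas::MatrixLayout::RowMajor,
                Blas::MatrixTranspose::NoTrans, Blas::MatrixTranspose::NoTrans,
                /*M=*/2, /*N=*/2, /*K=*/3,
                /*alpha=*/1.0f, A, /*lda=*/3,
                B, /*ldb=*/2,
                /*beta=*/0.0f, C, /*ldc=*/2);
    // Assuming cblas_sgemm semantics, C now holds { 4, 5, 10, 11 }.
}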


@@ -1,667 +0,0 @@
#ifndef LEARNER_AUTOGRAD_H
#define LEARNER_AUTOGRAD_H
#include <cmath>
#include <utility>
#include <type_traits>
#include <memory>
#include <tuple>
#include <optional>
#include <algorithm>
#include <cstdint>
namespace Learner
{
template <typename T>
struct ValueWithGrad
{
T value;
T grad;
ValueWithGrad& operator+=(const ValueWithGrad<T>& rhs)
{
value += rhs.value;
grad += rhs.grad;
return *this;
}
ValueWithGrad& operator-=(const ValueWithGrad<T>& rhs)
{
value -= rhs.value;
grad -= rhs.grad;
return *this;
}
ValueWithGrad& operator*=(T rhs)
{
value *= rhs;
grad *= rhs;
return *this;
}
ValueWithGrad& operator/=(T rhs)
{
value /= rhs;
grad /= rhs;
return *this;
}
[[nodiscard]] ValueWithGrad abs() const
{
return { std::abs(value), std::abs(grad) };
}
[[nodiscard]] ValueWithGrad clamp_grad(T max) const
{
return { value, std::clamp(grad, -max, max) };
}
};
}
namespace Learner::Autograd::UnivariateStatic
{
template <typename T>
struct Identity
{
using type = T;
};
template <typename T>
using Id = typename Identity<T>::type;
template <typename T>
using StoreValueOrRef = std::conditional_t<
std::is_rvalue_reference_v<T>,
std::remove_reference_t<T>,
const std::remove_reference_t<T>&
>;
namespace Detail
{
using CallIdType = std::uint32_t;
struct CallId
{
CallIdType call_id{};
constexpr CallId() :
call_id(0)
{
}
constexpr CallId(CallIdType id) :
call_id(id)
{
}
[[nodiscard]] bool operator==(CallId rhs) const noexcept
{
return call_id == rhs.call_id;
}
[[nodiscard]] bool operator!=(CallId rhs) const noexcept
{
return call_id != rhs.call_id;
}
};
[[nodiscard]] inline CallId next_call_id()
{
static thread_local CallIdType s_call_id = 0;
return CallId{ s_call_id++ };
}
template <typename T, typename Tuple>
struct TupleContains;
template <typename T, typename... Us>
struct TupleContains<T, std::tuple<Us...>> : std::disjunction<std::is_same<T, Us>...> {};
template <typename T, typename Tuple>
constexpr bool TupleContainsV = TupleContains<T, Tuple>::value;
template <typename... Ts>
constexpr bool AreAllConstantV = (std::remove_reference_t<Ts>::is_constant && ...);
}
template <typename T, typename ChildT>
struct Evaluable
{
constexpr Evaluable() = default;
// We append a unique call id so that we can invalidate the cache when
// the next computation starts. A single evaluation should see
// the same call_id at every node.
template <typename... ArgsTs>
[[nodiscard]] auto eval(const std::tuple<ArgsTs...>& args) const
{
const auto call_id = Detail::next_call_id();
const auto new_args = std::tuple_cat(args, std::tuple(call_id));
return ValueWithGrad<T>{ value(new_args), grad(new_args) };
}
template <typename... ArgsTs,
typename SFINAE = std::enable_if_t<Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
[[nodiscard]] auto value(const std::tuple<ArgsTs...>& args) const
{
const ChildT* this_ = static_cast<const ChildT*>(this);
const auto call_id = std::get<Detail::CallId>(args);
if (!value_cache.has_value() || value_cache_call_id != call_id)
{
value_cache_call_id = call_id;
value_cache = this_->calculate_value(args);
}
return *value_cache;
}
template <typename... ArgsTs,
typename SFINAE = std::enable_if_t<!Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
[[nodiscard]] auto value(const std::tuple<ArgsTs...>& args, ...) const
{
const auto call_id = Detail::next_call_id();
const auto new_args = std::tuple_cat(args, std::tuple(call_id));
return value(new_args);
}
template <typename... ArgsTs,
typename SFINAE = std::enable_if_t<Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
[[nodiscard]] auto grad(const std::tuple<ArgsTs...>& args) const
{
if constexpr (ChildT::is_constant)
{
return T(0.0);
}
else
{
const ChildT* this_ = static_cast<const ChildT*>(this);
const auto call_id = std::get<Detail::CallId>(args);
if (!grad_cache.has_value() || grad_cache_call_id != call_id)
{
grad_cache_call_id = call_id;
grad_cache = this_->calculate_grad(args);
}
return *grad_cache;
}
}
template <typename... ArgsTs,
typename SFINAE = std::enable_if_t<!Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
[[nodiscard]] auto grad(const std::tuple<ArgsTs...>& args, ...) const
{
const auto call_id = Detail::next_call_id();
const auto new_args = std::tuple_cat(args, std::tuple(call_id));
return grad(new_args);
}
private:
mutable std::optional<T> value_cache;
mutable std::optional<T> grad_cache;
mutable Detail::CallId value_cache_call_id{};
mutable Detail::CallId grad_cache_call_id{};
};
template <typename T, int I>
struct VariableParameter : Evaluable<T, VariableParameter<T, I>>
{
using ValueType = T;
static constexpr bool is_constant = false;
constexpr VariableParameter()
{
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
{
return std::get<I>(args);
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>&) const
{
return T(1.0);
}
};
template <typename T, int I>
struct ConstantParameter : Evaluable<T, ConstantParameter<T, I>>
{
using ValueType = T;
static constexpr bool is_constant = true;
constexpr ConstantParameter()
{
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
{
return std::get<I>(args);
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>&) const
{
return T(0.0);
}
};
template <typename T>
struct Constant : Evaluable<T, Constant<T>>
{
using ValueType = T;
static constexpr bool is_constant = true;
constexpr Constant(T x) :
m_x(std::move(x))
{
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>&) const
{
return m_x;
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>&) const
{
return T(0.0);
}
private:
T m_x;
};
// The "constant" may change between executions, but is assumed to be
// constant during a single evaluation.
template <typename T>
struct ConstantRef : Evaluable<T, ConstantRef<T>>
{
using ValueType = T;
static constexpr bool is_constant = true;
constexpr ConstantRef(const T& x) :
m_x(x)
{
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>&) const
{
return m_x;
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>&) const
{
return T(0.0);
}
private:
const T& m_x;
};
template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
struct Sum : Evaluable<T, Sum<LhsT, RhsT, T>>
{
using ValueType = T;
static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
constexpr Sum(LhsT&& lhs, RhsT&& rhs) :
m_lhs(std::forward<LhsT>(lhs)),
m_rhs(std::forward<RhsT>(rhs))
{
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
{
return m_lhs.value(args) + m_rhs.value(args);
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
{
return m_lhs.grad(args) + m_rhs.grad(args);
}
private:
StoreValueOrRef<LhsT> m_lhs;
StoreValueOrRef<RhsT> m_rhs;
};
template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
[[nodiscard]] constexpr auto operator+(LhsT&& lhs, RhsT&& rhs)
{
return Sum<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
}
template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
[[nodiscard]] constexpr auto operator+(LhsT&& lhs, Id<T> rhs)
{
return Sum<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
}
template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
[[nodiscard]] constexpr auto operator+(Id<T> lhs, RhsT&& rhs)
{
return Sum<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
}
template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
struct Difference : Evaluable<T, Difference<LhsT, RhsT, T>>
{
using ValueType = T;
static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
constexpr Difference(LhsT&& lhs, RhsT&& rhs) :
m_lhs(std::forward<LhsT>(lhs)),
m_rhs(std::forward<RhsT>(rhs))
{
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
{
return m_lhs.value(args) - m_rhs.value(args);
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
{
return m_lhs.grad(args) - m_rhs.grad(args);
}
private:
StoreValueOrRef<LhsT> m_lhs;
StoreValueOrRef<RhsT> m_rhs;
};
template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
[[nodiscard]] constexpr auto operator-(LhsT&& lhs, RhsT&& rhs)
{
return Difference<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
}
template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
[[nodiscard]] constexpr auto operator-(LhsT&& lhs, Id<T> rhs)
{
return Difference<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
}
template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
[[nodiscard]] constexpr auto operator-(Id<T> lhs, RhsT&& rhs)
{
return Difference<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
}
template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
struct Product : Evaluable<T, Product<LhsT, RhsT, T>>
{
using ValueType = T;
static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
constexpr Product(LhsT&& lhs, RhsT&& rhs) :
m_lhs(std::forward<LhsT>(lhs)),
m_rhs(std::forward<RhsT>(rhs))
{
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
{
return m_lhs.value(args) * m_rhs.value(args);
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
{
return m_lhs.grad(args) * m_rhs.value(args) + m_lhs.value(args) * m_rhs.grad(args);
}
private:
StoreValueOrRef<LhsT> m_lhs;
StoreValueOrRef<RhsT> m_rhs;
};
template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
[[nodiscard]] constexpr auto operator*(LhsT&& lhs, RhsT&& rhs)
{
return Product<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
}
template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
[[nodiscard]] constexpr auto operator*(LhsT&& lhs, Id<T> rhs)
{
return Product<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
}
template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
[[nodiscard]] constexpr auto operator*(Id<T> lhs, RhsT&& rhs)
{
return Product<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
}
template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
struct Quotient : Evaluable<T, Quotient<LhsT, RhsT, T>>
{
using ValueType = T;
static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
constexpr Quotient(LhsT&& lhs, RhsT&& rhs) :
m_lhs(std::forward<LhsT>(lhs)),
m_rhs(std::forward<RhsT>(rhs))
{
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
{
return m_lhs.value(args) / m_rhs.value(args);
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
{
auto g = m_rhs.value(args);
return (m_lhs.grad(args) * g - m_lhs.value(args) * m_rhs.grad(args)) / (g * g);
}
private:
StoreValueOrRef<LhsT> m_lhs;
StoreValueOrRef<RhsT> m_rhs;
};
template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
[[nodiscard]] constexpr auto operator/(LhsT&& lhs, RhsT&& rhs)
{
return Quotient<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
}
template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
[[nodiscard]] constexpr auto operator/(LhsT&& lhs, Id<T> rhs)
{
return Quotient<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
}
template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
[[nodiscard]] constexpr auto operator/(Id<T> lhs, RhsT&& rhs)
{
return Quotient<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
}
template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
struct Negation : Evaluable<T, Negation<ArgT, T>>
{
using ValueType = T;
static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
constexpr explicit Negation(ArgT&& x) :
m_x(std::forward<ArgT>(x))
{
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
{
return -m_x.value(args);
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
{
return -m_x.grad(args);
}
private:
StoreValueOrRef<ArgT> m_x;
};
template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
[[nodiscard]] constexpr auto operator-(ArgT&& x)
{
return Negation<ArgT&&>(std::forward<ArgT>(x));
}
template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
struct Sigmoid : Evaluable<T, Sigmoid<ArgT, T>>
{
using ValueType = T;
static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
constexpr explicit Sigmoid(ArgT&& x) :
m_x(std::forward<ArgT>(x))
{
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
{
return value_(m_x.value(args));
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
{
return m_x.grad(args) * grad_(m_x.value(args));
}
private:
StoreValueOrRef<ArgT> m_x;
[[nodiscard]] T value_(T x) const
{
return 1.0 / (1.0 + std::exp(-x));
}
[[nodiscard]] T grad_(T x) const
{
return value_(x) * (1.0 - value_(x));
}
};
template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
[[nodiscard]] constexpr auto sigmoid(ArgT&& x)
{
return Sigmoid<ArgT&&>(std::forward<ArgT>(x));
}
template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
struct Pow : Evaluable<T, Pow<ArgT, T>>
{
using ValueType = T;
static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
constexpr explicit Pow(ArgT&& x, Id<T> exponent) :
m_x(std::forward<ArgT>(x)),
m_exponent(std::move(exponent))
{
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
{
return std::pow(m_x.value(args), m_exponent);
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
{
return m_exponent * std::pow(m_x.value(args), m_exponent - T(1.0)) * m_x.grad(args);
}
private:
StoreValueOrRef<ArgT> m_x;
T m_exponent;
};
template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
[[nodiscard]] constexpr auto pow(ArgT&& x, Id<T> exp)
{
return Pow<ArgT&&>(std::forward<ArgT>(x), std::move(exp));
}
template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
struct Log : Evaluable<T, Log<ArgT, T>>
{
using ValueType = T;
static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
constexpr explicit Log(ArgT&& x) :
m_x(std::forward<ArgT>(x))
{
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
{
return value_(m_x.value(args));
}
template <typename... ArgsTs>
[[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
{
return m_x.grad(args) * grad_(m_x.value(args));
}
private:
StoreValueOrRef<ArgT> m_x;
T value_(T x) const
{
return std::log(x);
}
T grad_(T x) const
{
return 1.0 / x;
}
};
template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
[[nodiscard]] constexpr auto log(ArgT&& x)
{
return Log<ArgT&&>(std::forward<ArgT>(x));
}
}
#endif
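A minimal usage sketch of the expression templates above (not taken from the removed learner code): an expression is built over argument slot 0 and evaluated once, yielding both the value and the derivative with respect to that slot.

#include "autograd.h"
#include <tuple>

void autograd_example()
{
    using namespace Learner::Autograd::UnivariateStatic;
    auto x = VariableParameter<double, 0>{};   // reads argument slot 0, d/dx = 1
    auto expr = x * x + 3.0;                   // f(x) = x^2 + 3
    auto r = expr.eval(std::tuple(2.0));       // ValueWithGrad<double>
    // r.value == 7.0, r.grad == 4.0
}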


@@ -13,7 +13,6 @@
#include "extra/nnue_data_binpack_format.h"
#include "nnue/evaluate_nnue.h"
#include "nnue/evaluate_nnue_learner.h"
#include "syzygy/tbprobe.h"
@@ -493,8 +492,8 @@ namespace Learner
// has it reached the max length or is a draw by fifty-move rule
// or by 3-fold repetition
if (ply >= params.write_maxply
|| pos.is_fifty_move_draw()
if (ply >= params.write_maxply
|| pos.is_fifty_move_draw()
|| pos.is_three_fold_repetition())
{
return 0;


@@ -13,7 +13,6 @@
#include "extra/nnue_data_binpack_format.h"
#include "nnue/evaluate_nnue.h"
#include "nnue/evaluate_nnue_learner.h"
#include "syzygy/tbprobe.h"


@@ -1,133 +0,0 @@
#ifndef __HALF_FLOAT_H__
#define __HALF_FLOAT_H__
// Half Float Library by yaneurao
// (16-bit float)
// Floating point arithmetic using a 16-bit type.
// Assumes that the float type generated by the compiler is in IEEE 754 format.
#include "types.h"
namespace HalfFloat
{
// IEEE 754 float 32 format is :
// sign(1bit) + exponent(8bits) + fraction(23bits) = 32bits
//
// Our float16 format is :
// sign(1bit) + exponent(5bits) + fraction(10bits) = 16bits
union float32_converter
{
int32_t n;
float f;
};
// 16-bit float
struct float16
{
// --- constructors
float16() {}
float16(int16_t n) { from_float((float)n); }
float16(int32_t n) { from_float((float)n); }
float16(float n) { from_float(n); }
float16(double n) { from_float((float)n); }
// build from a float
void from_float(float f) { *this = to_float16(f); }
// --- implicit converters
operator int32_t() const { return (int32_t)to_float(*this); }
operator float() const { return to_float(*this); }
operator double() const { return double(to_float(*this)); }
// --- operators
float16 operator += (float16 rhs) { from_float(to_float(*this) + to_float(rhs)); return *this; }
float16 operator -= (float16 rhs) { from_float(to_float(*this) - to_float(rhs)); return *this; }
float16 operator *= (float16 rhs) { from_float(to_float(*this) * to_float(rhs)); return *this; }
float16 operator /= (float16 rhs) { from_float(to_float(*this) / to_float(rhs)); return *this; }
float16 operator + (float16 rhs) const { return float16(*this) += rhs; }
float16 operator - (float16 rhs) const { return float16(*this) -= rhs; }
float16 operator * (float16 rhs) const { return float16(*this) *= rhs; }
float16 operator / (float16 rhs) const { return float16(*this) /= rhs; }
float16 operator - () const { return float16(-to_float(*this)); }
bool operator == (float16 rhs) const { return this->v_ == rhs.v_; }
bool operator != (float16 rhs) const { return !(*this == rhs); }
static void UnitTest() { unit_test(); }
private:
// --- entity
uint16_t v_;
// --- conversion between float and float16
static float16 to_float16(float f)
{
float32_converter c;
c.f = f;
u32 n = c.n;
// The sign bit is the MSB in both formats.
uint16_t sign_bit = (n >> 16) & 0x8000;
// The exponent of an IEEE 754 float32 is biased by +127, so we change the bias to +15 and limit it to 5 bits.
uint16_t exponent = (((n >> 23) - 127 + 15) & 0x1f) << 10;
// The fraction is limited to 10-bit.
uint16_t fraction = (n >> (23-10)) & 0x3ff;
float16 f_;
f_.v_ = sign_bit | exponent | fraction;
return f_;
}
static float to_float(float16 v)
{
u32 sign_bit = (v.v_ & 0x8000) << 16;
u32 exponent = ((((v.v_ >> 10) & 0x1f) - 15 + 127) & 0xff) << 23;
u32 fraction = (v.v_ & 0x3ff) << (23 - 10);
float32_converter c;
c.n = sign_bit | exponent | fraction;
return c.f;
}
// Not a real unit test, but it confirms that the arithmetic works. I'll fix the code later (maybe).
static void unit_test()
{
float16 a, b, c, d;
a = 1;
std::cout << (float)a << std::endl;
b = -118.625;
std::cout << (float)b << std::endl;
c = 2.5;
std::cout << (float)c << std::endl;
d = a + c;
std::cout << (float)d << std::endl;
c *= 1.5;
std::cout << (float)c << std::endl;
b /= 3;
std::cout << (float)b << std::endl;
float f1 = 1.5;
a += f1;
std::cout << (float)a << std::endl;
a += f1 * (float)a;
std::cout << (float)a << std::endl;
}
};
}
#endif // __HALF_FLOAT_H__
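As a worked example of the conversion above: 1.0f has the IEEE 754 float32 bit pattern 0x3F800000 (sign 0, biased exponent 127, fraction 0); re-biasing the exponent from +127 to +15 and keeping the top 10 fraction bits yields the float16 bit pattern 0x3C00, so the round trip below is exact. Values needing more than 10 fraction bits lose precision.

HalfFloat::float16 one = 1.0f;   // stored internally as 0x3C00
float back = (float)one;         // exactly 1.0f again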

File diff suppressed because it is too large.


@@ -1,148 +0,0 @@
#ifndef _LEARN_H_
#define _LEARN_H_
// ----------------------
// Floating point for learning
// ----------------------
// If this is set to double, the calculation is more accurate, but the memory used by the weight array doubles.
// Currently, with float, the weight array is about 4.5 times the size of the evaluation function file (about 4.5GB with KPPT).
// Even with double there is almost no difference in convergence, so this is fixed to float.
// when using float
using LearnFloatType = float;
// when using double
//typedef double LearnFloatType;
// when using float16
//#include "half_float.h"
//typedef HalfFloat::float16 LearnFloatType;
// ======================
// configure
// ======================
// ----------------------
// Learning with the method of elmo (WCSC27)
// ----------------------
#define LOSS_FUNCTION "ELMO_METHOD(WCSC27)"
// ----------------------
// Definition of struct used in Learner
// ----------------------
#include "autograd.h"
#include "packed_sfen.h"
#include "position.h"
#include <sstream>
#include <vector>
#include <mutex>
#include <string>
namespace Learner
{
// ----------------------
// Settings for learning
// ----------------------
// mini-batch size.
// Calculate the gradient by accumulating this number of positions.
// Making it smaller increases the number of update_weights() calls, so convergence is faster, but the gradient estimate is noisier.
// Making it larger decreases the number of update_weights() calls, so convergence is slower, but the gradient estimate is more accurate.
// I don't think you need to change this value in most cases.
constexpr std::size_t LEARN_MINI_BATCH_SIZE = 1000 * 1000 * 1;
// Saving interval of the evaluation function during learning: save every time this many positions have been processed.
// Naturally, the longer the saving interval, the shorter the learning time.
// The folder name is incremented for each save: 0/, 1/, 2/, ...
// By default, once every 100 million positions.
constexpr std::size_t LEARN_EVAL_SAVE_INTERVAL = 100'000'000ULL;
// Output rmse during learning only once every this many iterations.
// The rmse calculation runs on a single thread and takes some time, so reducing the output frequency is effective.
constexpr std::size_t LEARN_RMSE_OUTPUT_INTERVAL = 1;
// Learning from the generated game record
void learn(std::istringstream& is);
using CalcLossFunc = ValueWithGrad<double>(Value, Value, int, int);
struct Loss
{
double value() const
{
return m_loss.value;
}
double grad() const
{
return m_loss.grad;
}
uint64_t count() const
{
return m_count;
}
Loss() = default;
Loss(const Loss& other) :
m_loss(other.m_loss),
m_count(other.m_count)
{
}
Loss& operator += (const ValueWithGrad<double>& rhs)
{
std::unique_lock lock(m_mutex);
m_loss += rhs.abs();
m_count += 1;
return *this;
}
Loss& operator += (const Loss& rhs)
{
std::unique_lock lock(m_mutex);
m_loss += rhs.m_loss.abs();
m_count += rhs.m_count;
return *this;
}
void reset()
{
std::unique_lock lock(m_mutex);
m_loss = ValueWithGrad<double>{ 0.0, 0.0 };
m_count = 0;
}
template <typename StreamT>
void print_with_grad(const std::string& prefix, StreamT& s) const
{
s << " - " << prefix << "_loss = " << m_loss.value / (double)m_count << std::endl;
s << " - " << prefix << "_grad_norm = " << m_loss.grad / (double)m_count << std::endl;
}
template <typename StreamT>
void print_only_loss(const std::string& prefix, StreamT& s) const
{
s << " - " << prefix << "_loss = " << m_loss.value / (double)m_count << std::endl;
}
private:
ValueWithGrad<double> m_loss{ 0.0, 0.0 };
uint64_t m_count{0};
std::mutex m_mutex;
};
}
#endif // ifndef _LEARN_H_
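For orientation, a CalcLossFunc has the shape sketched below. This is not the removed elmo loss implementation, only an illustrative example of how ValueWithGrad and the removed autograd header fit together, assuming the usual 600-centipawn sigmoid scaling used elsewhere in the trainer:

#include "learn/learn.h"
#include <tuple>

Learner::ValueWithGrad<double> example_calc_loss(
    Value shallow, Value teacher, int /*game_result*/, int /*gamePly*/)
{
    using namespace Learner::Autograd::UnivariateStatic;
    auto x = VariableParameter<double, 0>{};   // shallow (trainer) eval, differentiated
    auto t = ConstantParameter<double, 1>{};   // teacher eval, treated as a constant
    // Squared difference of win probabilities.
    auto loss = pow(sigmoid(x / 600.0) - sigmoid(t / 600.0), 2.0);
    return loss.eval(std::tuple(double(shallow), double(teacher)));
}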


@@ -1,341 +0,0 @@
#include <random>
#include <fstream>
#include "evaluate_nnue.h"
#include "evaluate_nnue_learner.h"
#include "trainer/features/all_factorizers.h"
#include "trainer/trainer_feature_transformer.h"
#include "trainer/trainer_input_slice.h"
#include "trainer/trainer_affine_transform.h"
#include "trainer/trainer_clipped_relu.h"
#include "trainer/trainer_sum.h"
#include "position.h"
#include "uci.h"
#include "misc.h"
#include "thread_win32_osx.h"
#include "thread.h"
// Code for learning NNUE evaluation function
namespace Eval::NNUE {
namespace {
// learning data
std::vector<Example> examples;
// Mutex for exclusive control of examples
std::mutex examples_mutex;
// number of samples in mini-batch
uint64_t batch_size;
// random number generator
std::mt19937 rng;
// learner
std::shared_ptr<Trainer<Network>> trainer;
// Tell the learner options such as hyperparameters
void send_messages(std::vector<Message> messages) {
for (auto& message : messages) {
trainer->send_message(&message);
assert(message.num_receivers > 0);
}
}
} // namespace
// Initialize learning
void initialize_training(
const std::string& seed,
SynchronizedRegionLogger::Region& out) {
#if defined (OPENBLAS_VERSION)
openblas_set_num_threads(1);
#elif defined (INTEL_MKL_VERSION)
mkl_set_num_threads(1);
#endif
out << "INFO (initialize_training): Initializing NN training for "
<< get_architecture_string() << std::endl;
out << std::endl;
out << "Layers:\n"
<< get_layers_info() << std::endl;
out << std::endl;
out << "Factorizers:\n"
<< Features::Factorizer<RawFeatures>::get_factorizers_string() << std::endl;
out << std::endl;
assert(feature_transformer);
assert(network);
trainer = Trainer<Network>::create(network.get(), feature_transformer.get());
rng.seed(PRNG(seed).rand<uint64_t>());
if (Options["SkipLoadingEval"]) {
out << "INFO (initialize_training): Performing random net initialization.\n";
trainer->initialize(rng);
}
}
// set the number of samples in the mini-batch
void set_batch_size(uint64_t size) {
assert(size > 0);
batch_size = size;
}
// Set options such as hyperparameters
void set_options(const std::string& options) {
std::vector<Message> messages;
for (const auto& option : Algo::split(options, ',')) {
const auto fields = Algo::split(option, '=');
assert(fields.size() == 1 || fields.size() == 2);
if (fields.size() == 1) {
messages.emplace_back(fields[0]);
} else {
messages.emplace_back(fields[0], fields[1]);
}
}
send_messages(std::move(messages));
}
// Reread the evaluation function parameters for learning from the file
void restore_parameters(const std::string& dir_name) {
const std::string file_name = Path::combine(dir_name, NNUE::savedfileName);
std::ifstream stream(file_name, std::ios::binary);
#ifndef NDEBUG
bool result =
#endif
ReadParameters(stream);
#ifndef NDEBUG
assert(result);
#endif
send_messages({{"reset"}});
}
void finalize_net() {
send_messages({{"clear_unobserved_feature_weights"}});
}
// Add 1 sample of learning data
void add_example(
Position& pos,
Color rootColor,
Value discrete_nn_eval,
const Learner::PackedSfenValue& psv,
double weight) {
Example example;
if (rootColor == pos.side_to_move()) {
example.sign = 1;
} else {
example.sign = -1;
}
example.discrete_nn_eval = discrete_nn_eval;
example.psv = psv;
example.weight = weight;
Features::IndexList active_indices[2];
for (const auto trigger : kRefreshTriggers) {
RawFeatures::append_active_indices(pos, trigger, active_indices);
}
if (pos.side_to_move() != WHITE) {
active_indices[0].swap(active_indices[1]);
}
static thread_local std::vector<TrainingFeature> s_training_features;
auto& training_features = s_training_features;
for (const auto color : Colors) {
training_features.clear();
for (const auto base_index : active_indices[color]) {
static_assert(Features::Factorizer<RawFeatures>::get_dimensions() <
(1 << TrainingFeature::kIndexBits), "");
Features::Factorizer<RawFeatures>::append_training_features(
base_index, &training_features);
}
std::sort(training_features.begin(), training_features.end());
auto& unique_features = example.training_features[color];
unique_features.reserve(training_features.size());
for (const auto& feature : training_features) {
if (!unique_features.empty() &&
feature.get_index() == unique_features.back().get_index()) {
unique_features.back() += feature;
} else {
unique_features.push_back(feature);
}
}
}
std::lock_guard<std::mutex> lock(examples_mutex);
examples.push_back(std::move(example));
}
// update the evaluation function parameters
Learner::Loss update_parameters(
ThreadPool& thread_pool,
uint64_t epoch,
bool verbose,
double learning_rate,
double max_grad,
Learner::CalcLossFunc calc_loss)
{
using namespace Learner::Autograd::UnivariateStatic;
assert(batch_size > 0);
learning_rate /= batch_size;
std::lock_guard<std::mutex> lock(examples_mutex);
double abs_eval_diff_sum = 0.0;
double abs_discrete_eval_sum = 0.0;
double gradient_norm = 0.0;
bool collect_stats = verbose;
Learner::Loss loss_sum{};
std::vector<double> abs_eval_diff_sum_local(thread_pool.size(), 0.0);
std::vector<double> abs_discrete_eval_sum_local(thread_pool.size(), 0.0);
std::vector<double> gradient_norm_local(thread_pool.size(), 0.0);
std::vector<Learner::Loss> loss_sum_local(thread_pool.size());
auto prev_batch_begin = examples.end();
while ((long)(prev_batch_begin - examples.begin()) >= (long)batch_size) {
auto batch_begin = prev_batch_begin - batch_size;
auto batch_end = prev_batch_begin;
auto size = batch_end - batch_begin;
const auto network_output = trainer->step_start(thread_pool, batch_begin, batch_end);
std::vector<LearnFloatType> gradients(size);
thread_pool.for_each_index_chunk_with_workers(
std::size_t(0), size,
[&](Thread& th, std::size_t offset, std::size_t count) {
const auto thread_id = th.thread_idx();
trainer->propagate(th, offset, count);
for (std::size_t b = offset; b < offset + count; ++b) {
const auto& e = *(batch_begin + b);
const auto shallow = static_cast<Value>(round<std::int32_t>(
e.sign * network_output[b] * kPonanzaConstant));
const auto discrete = e.sign * e.discrete_nn_eval;
const auto& psv = e.psv;
auto loss = calc_loss(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
loss.grad = std::clamp(
loss.grad * e.sign * kPonanzaConstant * e.weight, -max_grad, max_grad);
gradients[b] = static_cast<LearnFloatType>(loss.grad);
loss_sum_local[thread_id] += loss;
// The discrete eval is only valid before the first backpropagation,
// that is, only for the first batch.
// Similarly, we only want gradients from one batch.
if (collect_stats)
{
abs_eval_diff_sum_local[thread_id] += std::abs(discrete - shallow);
abs_discrete_eval_sum_local[thread_id] += std::abs(discrete);
gradient_norm_local[thread_id] += std::abs(loss.grad);
}
}
trainer->backpropagate(th, gradients.data(), offset, count);
}
);
// We can asynchronously erase the examples that we used in the previous
// step. This can be done safely because we're no longer using those
// examples and the erase won't invalidate the iterators we still need.
examples.erase(prev_batch_begin, examples.end());
prev_batch_begin = batch_begin;
thread_pool.wait_for_workers_finished();
trainer->step_end(thread_pool, learning_rate);
collect_stats = false;
}
examples.erase(prev_batch_begin, examples.end());
if (verbose)
{
abs_eval_diff_sum = std::accumulate(abs_eval_diff_sum_local.begin(), abs_eval_diff_sum_local.end(), 0.0);
abs_discrete_eval_sum = std::accumulate(abs_discrete_eval_sum_local.begin(), abs_discrete_eval_sum_local.end(), 0.0);
gradient_norm = std::accumulate(gradient_norm_local.begin(), gradient_norm_local.end(), 0.0);
const double avg_abs_eval_diff = abs_eval_diff_sum / batch_size;
const double avg_abs_discrete_eval = abs_discrete_eval_sum / batch_size;
auto out = sync_region_cout.new_region();
out << "INFO (update_parameters):"
<< " epoch = " << epoch
<< " , avg_abs(trainer_eval-nnue_eval) = " << avg_abs_eval_diff
<< " , avg_abs(nnue_eval) = " << avg_abs_discrete_eval
<< " , avg_relative_error = " << avg_abs_eval_diff / avg_abs_discrete_eval
<< " , batch_size = " << batch_size
<< " , grad_norm = " << gradient_norm
<< std::endl;
} else {
// Display some progress but don't synchronize as
// we can't really decide when to release the output lock here
std::cout << '.';
}
send_messages({{"quantize_parameters"}});
for(auto& loss : loss_sum_local)
{
loss_sum += loss;
}
return loss_sum;
}
// Check if there are any problems with learning
void check_health() {
send_messages({{"check_health"}});
}
// save evaluation function parameters to a file
void save_eval(std::string dir_name) {
auto eval_dir = Path::combine(Options["EvalSaveDir"], dir_name);
auto out = sync_region_cout.new_region();
out << "INFO (save_eval): Saving current evaluation file in " << eval_dir << std::endl;
// mkdir() will fail if this folder already exists, but that is fine;
// we just want the folder to be created if it does not exist yet.
// Also, assume that the directories up to EvalSaveDir already exist.
sys::create_directories(eval_dir);
const std::string file_name = Path::combine(eval_dir, NNUE::savedfileName);
std::ofstream stream(file_name, std::ios::binary);
#ifndef NDEBUG
bool result =
#endif
WriteParameters(stream);
#ifndef NDEBUG
assert(result);
#endif
out << "INFO (save_eval): Finished saving evaluation file in " << eval_dir << std::endl;
}
} // namespace Eval::NNUE
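For reference, set_options() above takes a single comma-separated string of "name" or "name=value" entries, each forwarded to the trainer as a Message. A call might have looked like the sketch below; the option names are placeholders, not the trainer's actual message names:

// Hypothetical call; real option names depend on the trainer layers' message handlers.
Eval::NNUE::set_options("example_flag,example_scale=0.5");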


@@ -1,52 +0,0 @@
#ifndef _EVALUATE_NNUE_LEARNER_H_
#define _EVALUATE_NNUE_LEARNER_H_
#include "learn/learn.h"
#include "misc.h"
struct ThreadPool;
// Interface used for learning NNUE evaluation function
namespace Eval::NNUE {
// Initialize learning
void initialize_training(
const std::string& seed,
SynchronizedRegionLogger::Region& out);
// set the number of samples in the mini-batch
void set_batch_size(uint64_t size);
// Set options such as hyperparameters
void set_options(const std::string& options);
// Reread the evaluation function parameters for learning from the file
void restore_parameters(const std::string& dir_name);
// Add 1 sample of learning data
void add_example(
Position& pos,
Color rootColor,
Value discrete_nn_eval,
const Learner::PackedSfenValue& psv,
double weight);
// update the evaluation function parameters
Learner::Loss update_parameters(
ThreadPool& thread_pool,
uint64_t epoch,
bool verbose,
double learning_rate,
double max_grad,
Learner::CalcLossFunc calc_loss);
// Check if there are any problems with learning
void check_health();
void finalize_net();
void save_eval(std::string suffix);
} // namespace Eval::NNUE
#endif


@@ -1,215 +0,0 @@
#include "evaluate_nnue.h"
#include "nnue_test_command.h"
#include "thread.h"
#include "uci.h"
#include <set>
#include <fstream>
#define ASSERT(X) { \
if (!(X)) { \
std::cout \
<< "\nError : ASSERT(" << #X << "), " \
<< __FILE__ << "(" << __LINE__ << "): " \
<< __func__ << std::endl; \
std::this_thread::sleep_for(std::chrono::microseconds(3000)); \
*(int*)1 =0; \
} \
}
// UCI extended command for NNUE evaluation function
namespace Eval::NNUE {
namespace {
// Testing RawFeatures mainly for difference calculation
void test_features(Position& pos) {
const std::uint64_t num_games = 1000;
StateInfo si;
pos.set(StartFEN, false, &si, Threads.main());
const int MAX_PLY = 256; // test up to 256 plies
StateInfo state[MAX_PLY]; // StateInfo for up to the maximum number of plies
int ply; // number of plies from the initial position
PRNG prng(20171128);
std::uint64_t num_moves = 0;
std::vector<std::uint64_t> num_updates(kRefreshTriggers.size() + 1);
std::vector<std::uint64_t> num_resets(kRefreshTriggers.size());
constexpr IndexType kUnknown = -1;
std::vector<IndexType> trigger_map(RawFeatures::kDimensions, kUnknown);
auto make_index_sets = [&](const Position& position) {
std::vector<std::vector<std::set<IndexType>>> index_sets(
kRefreshTriggers.size(), std::vector<std::set<IndexType>>(2));
for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
Features::IndexList active_indices[2];
RawFeatures::append_active_indices(position, kRefreshTriggers[i],
active_indices);
for (const auto perspective : Colors) {
for (const auto index : active_indices[perspective]) {
ASSERT(index < RawFeatures::kDimensions);
ASSERT(index_sets[i][perspective].count(index) == 0);
ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
index_sets[i][perspective].insert(index);
trigger_map[index] = i;
}
}
}
return index_sets;
};
auto update_index_sets = [&](const Position& position, auto* index_sets) {
for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
Features::IndexList removed_indices[2], added_indices[2];
bool reset[2] = { false, false };
RawFeatures::append_changed_indices(position, kRefreshTriggers[i],
removed_indices, added_indices, reset);
for (const auto perspective : Colors) {
if (reset[perspective]) {
(*index_sets)[i][perspective].clear();
++num_resets[i];
} else {
for (const auto index : removed_indices[perspective]) {
ASSERT(index < RawFeatures::kDimensions);
ASSERT((*index_sets)[i][perspective].count(index) == 1);
ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
(*index_sets)[i][perspective].erase(index);
++num_updates.back();
++num_updates[i];
trigger_map[index] = i;
}
}
for (const auto index : added_indices[perspective]) {
ASSERT(index < RawFeatures::kDimensions);
ASSERT((*index_sets)[i][perspective].count(index) == 0);
ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
(*index_sets)[i][perspective].insert(index);
++num_updates.back();
++num_updates[i];
trigger_map[index] = i;
}
}
}
};
std::cout << "feature set: " << RawFeatures::get_name()
<< "[" << RawFeatures::kDimensions << "]" << std::endl;
std::cout << "start testing with random games";
for (std::uint64_t i = 0; i < num_games; ++i) {
auto index_sets = make_index_sets(pos);
for (ply = 0; ply < MAX_PLY; ++ply) {
MoveList<LEGAL> mg(pos); // Generate all legal moves
// No legal moves == the game has ended
if (mg.size() == 0)
break;
// Randomly choose one of the generated moves and advance the position with it.
Move m = mg.begin()[prng.rand(mg.size())];
pos.do_move(m, state[ply]);
++num_moves;
update_index_sets(pos, &index_sets);
ASSERT(index_sets == make_index_sets(pos));
}
pos.set(StartFEN, false, &si, Threads.main());
// Output '.' every 100 games (so you can see that it's progressing)
if ((i % 100) == 0)
std::cout << "." << std::flush;
}
std::cout << "passed." << std::endl;
std::cout << num_games << " games, " << num_moves << " moves, "
<< num_updates.back() << " updates, "
<< (1.0 * num_updates.back() / num_moves)
<< " updates per move" << std::endl;
std::size_t num_observed_indices = 0;
for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
const auto count = std::count(trigger_map.begin(), trigger_map.end(), i);
num_observed_indices += count;
std::cout << "TriggerEvent(" << static_cast<int>(kRefreshTriggers[i])
<< "): " << count << " features ("
<< (100.0 * count / RawFeatures::kDimensions) << "%), "
<< num_updates[i] << " updates ("
<< (1.0 * num_updates[i] / num_moves) << " per move), "
<< num_resets[i] << " resets ("
<< (100.0 * num_resets[i] / num_moves) << "%)"
<< std::endl;
}
std::cout << "observed " << num_observed_indices << " ("
<< (100.0 * num_observed_indices / RawFeatures::kDimensions)
<< "% of " << RawFeatures::kDimensions
<< ") features" << std::endl;
}
// Output a string that represents the structure of the evaluation function
void print_info(std::istream& stream) {
std::cout << "network architecture: " << get_architecture_string() << std::endl;
while (true) {
std::string file_name;
stream >> file_name;
if (file_name.empty())
break;
std::uint32_t hash_value;
std::string architecture;
const bool success = [&]() {
std::ifstream file_stream(file_name, std::ios::binary);
if (!file_stream)
return false;
if (!read_header(file_stream, &hash_value, &architecture))
return false;
return true;
}();
std::cout << file_name << ": ";
if (success) {
if (hash_value == kHashValue) {
std::cout << "matches with this binary";
if (architecture != get_architecture_string()) {
std::cout << ", but architecture string differs: " << architecture;
}
std::cout << std::endl;
} else {
std::cout << architecture << std::endl;
}
} else {
std::cout << "failed to read header" << std::endl;
}
}
}
} // namespace
// UCI extended command for NNUE evaluation function
void test_command(Position& pos, std::istream& stream) {
std::string sub_command;
stream >> sub_command;
if (sub_command == "test_features") {
test_features(pos);
} else if (sub_command == "info") {
print_info(stream);
} else {
std::cout << "usage:" << std::endl;
std::cout << " test nnue test_features" << std::endl;
std::cout << " test nnue info [path/to/" << fileName << "...]" << std::endl;
}
}
} // namespace Eval::NNUE


@@ -1,12 +0,0 @@
#ifndef _NNUE_TEST_COMMAND_H_
#define _NNUE_TEST_COMMAND_H_
// UCI extended command interface for NNUE evaluation function
namespace Eval::NNUE {
// UCI extended command for NNUE evaluation function
void test_command(Position& pos, std::istream& stream);
} // namespace Eval::NNUE
#endif


@@ -1,10 +0,0 @@
#ifndef _NNUE_TRAINER_FEATURES_ALL_FACTORIZERS_H_
#define _NNUE_TRAINER_FEATURES_ALL_FACTORIZERS_H_
#include "factorizer.h"
#include "factorizer_feature_set.h"
#include "factorizer_half_kp.h"
#include "factorizer_half_ka.h"
#endif


@@ -1,117 +0,0 @@
#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_H_
#define _NNUE_TRAINER_FEATURES_FACTORIZER_H_
#include "nnue/nnue_common.h"
#include "nnue/trainer/trainer.h"
// NNUE evaluation function feature conversion class template
namespace Eval::NNUE::Features {
// Class template that converts input features into learning features
// By default the learning features are the same as the original input features; this template is specialized as necessary
template <typename FeatureType>
class Factorizer {
public:
static constexpr std::string get_name() {
return "Factorizer<" + FeatureType::get_name() + "> -> " + std::string("No factorizer");
}
static constexpr std::string get_factorizers_string() {
return " - " + get_name();
}
// Get the dimensionality of the learning feature
static constexpr IndexType get_dimensions() {
return FeatureType::kDimensions;
}
// Get index of learning feature and scale of learning rate
static void append_training_features(
IndexType base_index, std::vector<TrainingFeature>* training_features) {
assert(base_index < FeatureType::kDimensions);
training_features->emplace_back(base_index);
}
};
// Learning feature information
struct FeatureProperties {
bool active;
IndexType dimensions;
};
// Add the original input features to the learning features
template <typename FeatureType>
IndexType append_base_feature(
FeatureProperties properties, IndexType base_index,
std::vector<TrainingFeature>* training_features) {
assert(properties.dimensions == FeatureType::kDimensions);
assert(base_index < FeatureType::kDimensions);
training_features->emplace_back(base_index);
return properties.dimensions;
}
// If the learning rate scale is not 0, inherit other types of learning features
template <typename FeatureType>
IndexType inherit_features_if_required(
IndexType index_offset, FeatureProperties properties, IndexType base_index,
std::vector<TrainingFeature>* training_features) {
if (!properties.active) {
return 0;
}
assert(properties.dimensions == Factorizer<FeatureType>::get_dimensions());
assert(base_index < FeatureType::kDimensions);
const auto start = training_features->size();
Factorizer<FeatureType>::append_training_features(
base_index, training_features);
for (auto i = start; i < training_features->size(); ++i) {
auto& feature = (*training_features)[i];
assert(feature.get_index() < Factorizer<FeatureType>::get_dimensions());
feature.shift_index(index_offset);
}
return properties.dimensions;
}
// Return the index offset as needed, without adding learning features.
// Call this instead of inherit_features_if_required() when there are no corresponding features.
IndexType skip_features(FeatureProperties properties) {
if (!properties.active)
return 0;
return properties.dimensions;
}
// Get the dimensionality of the learning feature
template <std::size_t N>
constexpr IndexType get_active_dimensions(
const FeatureProperties (&properties)[N]) {
static_assert(N > 0, "");
IndexType dimensions = properties[0].dimensions;
for (std::size_t i = 1; i < N; ++i) {
if (properties[i].active) {
dimensions += properties[i].dimensions;
}
}
return dimensions;
}
// get the number of elements in the array
template <typename T, std::size_t N>
constexpr std::size_t get_array_length(const T (&/*array*/)[N]) {
return N;
}
} // namespace Eval::NNUE::Features
#endif


@@ -1,121 +0,0 @@
#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
#define _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
#include "factorizer.h"
#include "nnue/features/feature_set.h"
// Specialization of the NNUE evaluation function feature conversion class template for FeatureSet
namespace Eval::NNUE::Features {
// Class template that converts input features into learning features
// Specialization for FeatureSet
template <typename FirstFeatureType, typename... RemainingFeatureTypes>
class Factorizer<FeatureSet<FirstFeatureType, RemainingFeatureTypes...>> {
private:
using Head = Factorizer<FeatureSet<FirstFeatureType>>;
using Tail = Factorizer<FeatureSet<RemainingFeatureTypes...>>;
public:
// number of dimensions of original input features
static constexpr IndexType kBaseDimensions =
FeatureSet<FirstFeatureType, RemainingFeatureTypes...>::kDimensions;
static constexpr std::string get_factorizers_string() {
std::string str = " - ";
str += Head::get_name();
str += '\n';
str += Tail::get_factorizers_string();
return str;
}
// Get the dimensionality of the learning feature
static constexpr IndexType get_dimensions() {
return Head::get_dimensions() + Tail::get_dimensions();
}
// Get index of learning feature and scale of learning rate
static void append_training_features(
IndexType base_index, std::vector<TrainingFeature>* training_features,
IndexType base_dimensions = kBaseDimensions) {
assert(base_index < kBaseDimensions);
constexpr auto boundary = FeatureSet<RemainingFeatureTypes...>::kDimensions;
if (base_index < boundary) {
Tail::append_training_features(
base_index, training_features, base_dimensions);
}
else {
const auto start = training_features->size();
Head::append_training_features(
base_index - boundary, training_features, base_dimensions);
for (auto i = start; i < training_features->size(); ++i) {
auto& feature = (*training_features)[i];
const auto index = feature.get_index();
assert(index < Head::get_dimensions() ||
(index >= base_dimensions &&
index < base_dimensions +
Head::get_dimensions() - Head::kBaseDimensions));
if (index < Head::kBaseDimensions) {
feature.shift_index(Tail::kBaseDimensions);
}
else {
feature.shift_index(Tail::get_dimensions() - Tail::kBaseDimensions);
}
}
}
}
};
// Class template that converts input features into learning features
// Specialization when FeatureSet has one template argument
template <typename FeatureType>
class Factorizer<FeatureSet<FeatureType>> {
public:
// number of dimensions of original input features
static constexpr IndexType kBaseDimensions = FeatureType::kDimensions;
static constexpr std::string get_name() {
return Factorizer<FeatureType>::get_name();
}
static constexpr std::string get_factorizers_string() {
return " - " + get_name();
}
// Get the dimensionality of the learning feature
static constexpr IndexType get_dimensions() {
return Factorizer<FeatureType>::get_dimensions();
}
// Get index of learning feature and scale of learning rate
static void append_training_features(
IndexType base_index, std::vector<TrainingFeature>* training_features,
IndexType base_dimensions = kBaseDimensions) {
assert(base_index < kBaseDimensions);
const auto start = training_features->size();
Factorizer<FeatureType>::append_training_features(
base_index, training_features);
for (auto i = start; i < training_features->size(); ++i) {
auto& feature = (*training_features)[i];
assert(feature.get_index() < Factorizer<FeatureType>::get_dimensions());
if (feature.get_index() >= kBaseDimensions) {
feature.shift_index(base_dimensions - kBaseDimensions);
}
}
}
};
} // namespace Eval::NNUE::Features
#endif


@@ -1,93 +0,0 @@
#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KA_H_
#define _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KA_H_
#include "factorizer.h"
#include "nnue/features/half_ka.h"
#include "nnue/features/a.h"
#include "nnue/features/half_relative_ka.h"
// Specialization of NNUE evaluation function feature conversion class template for HalfKA
namespace Eval::NNUE::Features {
// Class template that converts input features into learning features
// Specialization for HalfKA
template <Side AssociatedKing>
class Factorizer<HalfKA<AssociatedKing>> {
private:
using FeatureType = HalfKA<AssociatedKing>;
// Maximum number of feature indices that can be active (equal to 1) at the same time
static constexpr IndexType kMaxActiveDimensions =
FeatureType::kMaxActiveDimensions;
// Type of learning feature
enum TrainingFeatureType {
kFeaturesHalfKA,
kFeaturesA,
kFeaturesHalfRelativeKA,
kNumTrainingFeatureTypes,
};
// Learning feature information
static constexpr FeatureProperties kProperties[] = {
// kFeaturesHalfKA
{true, FeatureType::kDimensions},
// kFeaturesA
{true, Factorizer<A>::get_dimensions()},
// kFeaturesHalfRelativeKA
{true, Factorizer<HalfRelativeKA<AssociatedKing>>::get_dimensions()},
};
static_assert(get_array_length(kProperties) == kNumTrainingFeatureTypes, "");
public:
static constexpr std::string get_name() {
return std::string("Factorizer<") + FeatureType::kName + "> -> " + "A, HalfRelativeKA";
}
static constexpr std::string get_factorizers_string() {
return " - " + get_name();
}
// Get the dimensionality of the learning feature
static constexpr IndexType get_dimensions() {
return get_active_dimensions(kProperties);
}
// Get index of learning feature and scale of learning rate
static void append_training_features(
IndexType base_index, std::vector<TrainingFeature>* training_features) {
// kFeaturesHalfKA
IndexType index_offset = append_base_feature<FeatureType>(
kProperties[kFeaturesHalfKA], base_index, training_features);
const auto sq_k = static_cast<Square>(base_index / PS_END2);
const auto a = static_cast<IndexType>(base_index % PS_END2);
// kFeaturesA
index_offset += inherit_features_if_required<A>(
index_offset, kProperties[kFeaturesA], a, training_features);
// kFeaturesHalfRelativeKA
if (a >= PS_W_PAWN) {
index_offset += inherit_features_if_required<HalfRelativeKA<AssociatedKing>>(
index_offset, kProperties[kFeaturesHalfRelativeKA],
HalfRelativeKA<AssociatedKing>::make_index(sq_k, a),
training_features);
}
else {
index_offset += skip_features(kProperties[kFeaturesHalfRelativeKA]);
}
assert(index_offset == get_dimensions());
}
};
template <Side AssociatedKing>
constexpr FeatureProperties Factorizer<HalfKA<AssociatedKing>>::kProperties[];
} // namespace Eval::NNUE::Features
#endif // #ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KA_H_


@@ -1,104 +0,0 @@
#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
#define _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
#include "factorizer.h"
#include "nnue/features/half_kp.h"
#include "nnue/features/p.h"
#include "nnue/features/half_relative_kp.h"
// Specialization of NNUE evaluation function feature conversion class template for HalfKP
namespace Eval::NNUE::Features {
// Class template that converts input features into learning features
// Specialization for HalfKP
template <Side AssociatedKing>
class Factorizer<HalfKP<AssociatedKing>> {
private:
using FeatureType = HalfKP<AssociatedKing>;
// Maximum number of feature indices that can be active (equal to 1) at the same time
static constexpr IndexType kMaxActiveDimensions =
FeatureType::kMaxActiveDimensions;
// Type of learning feature
enum TrainingFeatureType {
kFeaturesHalfKP,
kFeaturesHalfK,
kFeaturesP,
kFeaturesHalfRelativeKP,
kNumTrainingFeatureTypes,
};
// Learning feature information
static constexpr FeatureProperties kProperties[] = {
// kFeaturesHalfKP
{true, FeatureType::kDimensions},
// kFeaturesHalfK
{true, SQUARE_NB},
// kFeaturesP
{true, Factorizer<P>::get_dimensions()},
// kFeaturesHalfRelativeKP
{true, Factorizer<HalfRelativeKP<AssociatedKing>>::get_dimensions()},
};
static_assert(get_array_length(kProperties) == kNumTrainingFeatureTypes, "");
public:
static constexpr std::string get_name() {
return std::string("Factorizer<") + FeatureType::kName + "> -> " + "HalfK, P, HalfRelativeKP";
}
static constexpr std::string get_factorizers_string() {
return " - " + get_name();
}
// Get the dimensionality of the learning feature
static constexpr IndexType get_dimensions() {
return get_active_dimensions(kProperties);
}
// Get index of learning feature and scale of learning rate
static void append_training_features(
IndexType base_index, std::vector<TrainingFeature>* training_features) {
// kFeaturesHalfKP
IndexType index_offset = append_base_feature<FeatureType>(
kProperties[kFeaturesHalfKP], base_index, training_features);
const auto sq_k = static_cast<Square>(base_index / PS_END);
const auto p = static_cast<IndexType>(base_index % PS_END);
// kFeaturesHalfK
{
const auto& properties = kProperties[kFeaturesHalfK];
if (properties.active) {
training_features->emplace_back(index_offset + sq_k);
index_offset += properties.dimensions;
}
}
// kFeaturesP
index_offset += inherit_features_if_required<P>(
index_offset, kProperties[kFeaturesP], p, training_features);
// kFeaturesHalfRelativeKP
if (p >= PS_W_PAWN) {
index_offset += inherit_features_if_required<HalfRelativeKP<AssociatedKing>>(
index_offset, kProperties[kFeaturesHalfRelativeKP],
HalfRelativeKP<AssociatedKing>::make_index(sq_k, p),
training_features);
}
else {
index_offset += skip_features(kProperties[kFeaturesHalfRelativeKP]);
}
assert(index_offset == get_dimensions());
}
};
template <Side AssociatedKing>
constexpr FeatureProperties Factorizer<HalfKP<AssociatedKing>>::kProperties[];
} // namespace Eval::NNUE::Features
#endif

View File

@@ -1,122 +0,0 @@
#ifndef _NNUE_TRAINER_H_
#define _NNUE_TRAINER_H_
#include "nnue/nnue_common.h"
#include "nnue/features/index_list.h"
#include <sstream>
#if defined(USE_BLAS)
static_assert(std::is_same<LearnFloatType, float>::value, "");
#include <cblas.h>
#endif
// Common header for the NNUE evaluation function learning class templates
namespace Eval::NNUE {
// Ponanza constant used in the relation between evaluation value and winning percentage
constexpr double kPonanzaConstant = 600.0;
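// It maps an evaluation to an expected score roughly as
// win_rate = 1 / (1 + 10^(-eval / kPonanzaConstant)).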
// Class that represents one learning feature index together with its occurrence count
class TrainingFeature {
using StorageType = std::uint32_t;
static_assert(std::is_unsigned<StorageType>::value, "");
public:
static constexpr std::uint32_t kIndexBits = 24;
static_assert(kIndexBits < std::numeric_limits<StorageType>::digits, "");
static constexpr std::uint32_t kCountBits =
std::numeric_limits<StorageType>::digits - kIndexBits;
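// The feature index lives in the upper kIndexBits of the storage word and the
// occurrence count in the lower kCountBits, so a single 32-bit value encodes
// (index, count) and operator< orders primarily by index.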
explicit TrainingFeature(IndexType index) :
index_and_count_((index << kCountBits) | 1) {
assert(index < (1 << kIndexBits));
}
TrainingFeature& operator+=(const TrainingFeature& other) {
assert(other.get_index() == get_index());
assert(other.get_count() + get_count() < (1 << kCountBits));
index_and_count_ += other.get_count();
return *this;
}
IndexType get_index() const {
return static_cast<IndexType>(index_and_count_ >> kCountBits);
}
void shift_index(IndexType offset) {
assert(get_index() + offset < (1 << kIndexBits));
index_and_count_ += offset << kCountBits;
}
IndexType get_count() const {
return static_cast<IndexType>(index_and_count_ & ((1 << kCountBits) - 1));
}
bool operator<(const TrainingFeature& other) const {
return index_and_count_ < other.index_and_count_;
}
private:
StorageType index_and_count_;
};
// Structure that represents one sample of training data
struct Example {
std::vector<TrainingFeature> training_features[2];
Learner::PackedSfenValue psv;
Value discrete_nn_eval;
int sign;
double weight;
};
// Message used for setting hyperparameters
struct Message {
Message(const std::string& message_name, const std::string& message_value = "") :
name(message_name), value(message_value), num_peekers(0), num_receivers(0)
{
}
const std::string name;
const std::string value;
std::uint32_t num_peekers;
std::uint32_t num_receivers;
};
// determine whether to accept the message
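// A message can be addressed by plain name ("momentum") or with a subscript
// ("momentum[2]"); the subscripted form is accepted only by the layer whose
// peek order matches the subscript, which allows targeting a single layer.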
bool receive_message(const std::string& name, Message* message) {
const auto subscript = "[" + std::to_string(message->num_peekers) + "]";
if (message->name.substr(0, name.size() + 1) == name + "[") {
++message->num_peekers;
}
if (message->name == name || message->name == name + subscript) {
++message->num_receivers;
return true;
}
return false;
}
// round a floating point number to an integer
template <typename IntType>
IntType round(double value) {
return static_cast<IntType>(std::floor(value + 0.5));
}
// make_shared with alignment
template <typename T, typename... ArgumentTypes>
std::shared_ptr<T> make_aligned_shared_ptr(ArgumentTypes&&... arguments) {
const auto ptr = new(std_aligned_alloc(alignof(T), sizeof(T)))
T(std::forward<ArgumentTypes>(arguments)...);
return std::shared_ptr<T>(ptr, AlignedDeleter<T>());
}
} // namespace Eval::NNUE
#endif

View File

@@ -1,476 +0,0 @@
#ifndef _NNUE_TRAINER_AFFINE_TRANSFORM_H_
#define _NNUE_TRAINER_AFFINE_TRANSFORM_H_
#include "trainer.h"
#include "extra/stockfish_blas.h"
#include "learn/learn.h"
#include "nnue/layers/affine_transform.h"
#include "thread.h"
#include <random>
// Specialization of NNUE evaluation function learning class template for AffineTransform
namespace Eval::NNUE {
// Learning: Affine transformation layer
template <typename PreviousLayer, IndexType OutputDimensions>
class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
private:
// Type of layer to learn
using LayerType = Layers::AffineTransform<PreviousLayer, OutputDimensions>;
public:
// factory function
static std::shared_ptr<Trainer> create(
LayerType* target_layer, FeatureTransformer* ft) {
return std::shared_ptr<Trainer>(
new Trainer(target_layer, ft));
}
// Set options such as hyperparameters
void send_message(Message* message) {
previous_layer_trainer_->send_message(message);
if (receive_message("momentum", message)) {
momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
}
if (receive_message("learning_rate_scale", message)) {
learning_rate_scale_ =
static_cast<LearnFloatType>(std::stod(message->value));
}
if (receive_message("reset", message)) {
dequantize_parameters();
}
if (receive_message("quantize_parameters", message)) {
quantize_parameters();
}
if (receive_message("check_health", message)) {
check_health();
}
}
// Initialize the parameters with random numbers
template <typename RNG>
void initialize(RNG& rng) {
previous_layer_trainer_->initialize(rng);
if (kIsOutputLayer) {
// Initialize output layer with 0
std::fill(std::begin(biases_), std::end(biases_),
static_cast<LearnFloatType>(0.0));
std::fill(std::begin(weights_), std::end(weights_),
static_cast<LearnFloatType>(0.0));
}
else {
// Assuming the inputs have mean 0.5 and equal variance, initialize the
// weights and biases so that each output unit also has mean 0.5 and the same
// variance as the inputs (the bias below, 0.5 - 0.5 * sum(weights), cancels
// the expected contribution of the weighted inputs).
const double kSigma = 1.0 / std::sqrt(kInputDimensions);
auto distribution = std::normal_distribution<double>(0.0, kSigma);
for (IndexType i = 0; i < kOutputDimensions; ++i) {
double sum = 0.0;
for (IndexType j = 0; j < kInputDimensions; ++j) {
const auto weight = static_cast<LearnFloatType>(distribution(rng));
weights_[kInputDimensions * i + j] = weight;
sum += weight;
}
biases_[i] = static_cast<LearnFloatType>(0.5 - 0.5 * sum);
}
}
quantize_parameters();
}
const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
{
const auto size = batch_end - batch_begin;
if ((long)output_.size() < (long)kOutputDimensions * size) {
output_.resize(kOutputDimensions * size);
gradients_.resize(kInputDimensions * size);
}
if (thread_states_.size() < thread_pool.size())
{
thread_states_.resize(thread_pool.size());
}
combined_batch_size_ = size;
combined_batch_input_ = previous_layer_trainer_->step_start(thread_pool, batch_begin, batch_end);
auto& main_thread_state = thread_states_[0];
#if defined(USE_BLAS)
// Scale the bias gradient accumulated in the previous step by the momentum
// factor; backpropagation will add the new gradients on top.
cblas_sscal(
kOutputDimensions, momentum_, main_thread_state.biases_diff_, 1
);
#else
Blas::sscal(
kOutputDimensions, momentum_, main_thread_state.biases_diff_, 1
);
#endif
for (IndexType i = 1; i < thread_states_.size(); ++i)
thread_states_[i].reset_biases();
return output_.data();
}
// forward propagation
void propagate(Thread& th, const uint64_t offset, const uint64_t count) {
previous_layer_trainer_->propagate(th, offset, count);
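// For each sample b in [offset, offset + count): copy the biases into the
// output column, then accumulate output[:, b] += weights_ * input[:, b]
// via a single GEMM with beta = 1.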
#if defined(USE_BLAS)
for (IndexType b = offset; b < offset + count; ++b) {
const IndexType batch_offset = kOutputDimensions * b;
cblas_scopy(
kOutputDimensions, biases_, 1, &output_[batch_offset], 1
);
}
cblas_sgemm(
CblasColMajor, CblasTrans, CblasNoTrans,
kOutputDimensions, count, kInputDimensions,
1.0,
weights_, kInputDimensions,
combined_batch_input_ + offset * kInputDimensions, kInputDimensions,
1.0,
&output_[offset * kOutputDimensions], kOutputDimensions
);
#else
for (IndexType b = offset; b < offset + count; ++b) {
const IndexType batch_offset = kOutputDimensions * b;
Blas::scopy(
kOutputDimensions, biases_, 1, &output_[batch_offset], 1
);
}
Blas::sgemm(
Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans,
kOutputDimensions, count, kInputDimensions,
1.0,
weights_, kInputDimensions,
combined_batch_input_ + offset * kInputDimensions, kInputDimensions,
1.0,
&output_[offset * kOutputDimensions], kOutputDimensions
);
#endif
}
// backpropagation
void backpropagate(Thread& th,
const LearnFloatType* gradients,
uint64_t offset,
uint64_t count) {
auto& thread_state = thread_states_[th.thread_idx()];
const auto momentum = th.thread_idx() == 0 ? momentum_ : 0.0f;
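// Three steps: (1) dL/dinput = weights_^T * dL/doutput is handed down to the
// previous layer, (2) bias gradients are summed over the batch slice, and
// (3) dL/dweights += dL/doutput * input^T, with beta = momentum on the main
// thread so the previous step's accumulated gradient decays instead of being
// overwritten.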
#if defined(USE_BLAS)
cblas_sgemm(
CblasColMajor, CblasNoTrans, CblasNoTrans,
kInputDimensions, count, kOutputDimensions,
1.0,
weights_, kInputDimensions,
gradients + offset * kOutputDimensions, kOutputDimensions,
0.0,
&gradients_[offset * kInputDimensions], kInputDimensions
);
for (IndexType b = offset; b < offset + count; ++b) {
const IndexType batch_offset = kOutputDimensions * b;
cblas_saxpy(
kOutputDimensions, 1.0,
&gradients[batch_offset], 1, thread_state.biases_diff_, 1
);
}
cblas_sgemm(
CblasRowMajor, CblasTrans, CblasNoTrans,
kOutputDimensions, kInputDimensions, count,
1.0,
gradients + offset * kOutputDimensions, kOutputDimensions,
combined_batch_input_ + offset * kInputDimensions, kInputDimensions,
momentum,
thread_state.weights_diff_, kInputDimensions
);
#else
// backpropagate
Blas::sgemm(
Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::NoTrans, Blas::MatrixTranspose::NoTrans,
kInputDimensions, count, kOutputDimensions,
1.0,
weights_, kInputDimensions,
gradients + offset * kOutputDimensions, kOutputDimensions,
0.0,
&gradients_[offset * kInputDimensions], kInputDimensions
);
for (IndexType b = offset; b < offset + count; ++b) {
const IndexType batch_offset = kOutputDimensions * b;
Blas::saxpy(kOutputDimensions, 1.0,
&gradients[batch_offset], 1, thread_state.biases_diff_, 1);
}
Blas::sgemm(
Blas::MatrixLayout::RowMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans,
kOutputDimensions, kInputDimensions, count,
1.0,
gradients + offset * kOutputDimensions, kOutputDimensions,
combined_batch_input_ + offset * kInputDimensions, kInputDimensions,
momentum,
thread_state.weights_diff_, kInputDimensions
);
#endif
previous_layer_trainer_->backpropagate(th, gradients_.data(), offset, count);
}
void reduce_thread_state()
{
for (IndexType i = 1; i < thread_states_.size(); ++i)
{
thread_states_[0] += thread_states_[i];
}
}
void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate)
{
const LearnFloatType local_learning_rate =
learning_rate * learning_rate_scale_;
reduce_thread_state();
auto& main_thread_state = thread_states_[0];
for (IndexType i = 0; i < kOutputDimensions; ++i) {
const double d = local_learning_rate * main_thread_state.biases_diff_[i];
biases_[i] -= d;
abs_biases_diff_sum_ += std::abs(d);
}
num_biases_diffs_ += kOutputDimensions;
for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
const double d = local_learning_rate * main_thread_state.weights_diff_[i];
weights_[i] -= d;
abs_weights_diff_sum_ += std::abs(d);
}
num_weights_diffs_ += kOutputDimensions * kInputDimensions;
previous_layer_trainer_->step_end(thread_pool, learning_rate);
}
private:
// constructor
Trainer(LayerType* target_layer, FeatureTransformer* ft) :
combined_batch_size_(0),
combined_batch_input_(nullptr),
previous_layer_trainer_(Trainer<PreviousLayer>::create(
&target_layer->previous_layer_, ft)),
target_layer_(target_layer),
biases_(),
weights_(),
momentum_(0.2),
learning_rate_scale_(1.0) {
dequantize_parameters();
}
void reset_stats() {
abs_biases_diff_sum_ = 0.0;
abs_weights_diff_sum_ = 0.0;
num_biases_diffs_ = 0;
num_weights_diffs_ = 0;
}
void check_health() {
double abs_bias_sum = 0.0;
double abs_weight_sum = 0.0;
for(auto b : biases_)
abs_bias_sum += std::abs(b);
for(auto w : weights_)
abs_weight_sum += std::abs(w);
auto out = sync_region_cout.new_region();
out << "INFO (check_health):"
<< " layer " << LayerType::kLayerIndex
<< " - " << LayerType::get_name()
<< std::endl;
out << " - avg_abs_bias = " << abs_bias_sum / std::size(biases_) << std::endl;
out << " - avg_abs_bias_diff = " << abs_biases_diff_sum_ / num_biases_diffs_ << std::endl;
out << " - avg_abs_weight = " << abs_weight_sum / std::size(weights_) << std::endl;
out << " - avg_abs_weight_diff = " << abs_weights_diff_sum_ / num_weights_diffs_ << std::endl;
out.unlock();
reset_stats();
}
// Clamp the weights and write the quantized integer parameters to the target layer
void quantize_parameters() {
for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
weights_[i] = std::max(-kMaxWeightMagnitude,
std::min(+kMaxWeightMagnitude, weights_[i]));
}
for (IndexType i = 0; i < kOutputDimensions; ++i) {
target_layer_->biases_[i] =
round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
}
for (IndexType i = 0; i < kOutputDimensions; ++i) {
const auto offset = kInputDimensions * i;
const auto padded_offset = LayerType::kPaddedInputDimensions * i;
for (IndexType j = 0; j < kInputDimensions; ++j) {
target_layer_->weights_[padded_offset + j] =
round<typename LayerType::WeightType>(
weights_[offset + j] * kWeightScale);
}
}
}
// Read the quantized integer parameters back as floats
void dequantize_parameters() {
for (IndexType i = 0; i < kOutputDimensions; ++i) {
biases_[i] = static_cast<LearnFloatType>(
target_layer_->biases_[i] / kBiasScale);
}
for (IndexType i = 0; i < kOutputDimensions; ++i) {
const auto offset = kInputDimensions * i;
const auto padded_offset = LayerType::kPaddedInputDimensions * i;
for (IndexType j = 0; j < kInputDimensions; ++j) {
weights_[offset + j] = static_cast<LearnFloatType>(
target_layer_->weights_[padded_offset + j] / kWeightScale);
}
}
for (auto& state : thread_states_)
{
state.reset_weights();
state.reset_biases();
}
reset_stats();
}
// number of input/output dimensions
static constexpr IndexType kInputDimensions = LayerType::kInputDimensions;
static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
// If the output dimensionality is 1, this layer is treated as the output layer
static constexpr bool kIsOutputLayer = kOutputDimensions == 1;
// Coefficients used for quantization
static constexpr LearnFloatType kActivationScale =
std::numeric_limits<std::int8_t>::max();
static constexpr LearnFloatType kBiasScale = kIsOutputLayer ?
(kPonanzaConstant * FV_SCALE) :
((1 << kWeightScaleBits) * kActivationScale);
static constexpr LearnFloatType kWeightScale = kBiasScale / kActivationScale;
// Upper limit on the absolute value of a weight, used to prevent overflow when quantizing to integers
static constexpr LearnFloatType kMaxWeightMagnitude =
std::numeric_limits<typename LayerType::WeightType>::max() / kWeightScale;
// number of samples in mini-batch
IndexType combined_batch_size_;
double abs_biases_diff_sum_;
double abs_weights_diff_sum_;
uint64_t num_biases_diffs_;
uint64_t num_weights_diffs_;
// Input mini batch
const LearnFloatType* combined_batch_input_;
// Trainer of the previous layer
const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
// layer to learn
LayerType* const target_layer_;
// parameter
struct alignas(kCacheLineSize) ThreadState
{
// Buffer used for updating parameters
alignas(kCacheLineSize) LearnFloatType biases_diff_[kOutputDimensions];
alignas(kCacheLineSize) LearnFloatType weights_diff_[kOutputDimensions * kInputDimensions];
ThreadState() { reset_weights(); reset_biases(); }
ThreadState& operator+=(const ThreadState& other)
{
for (IndexType i = 0; i < kOutputDimensions; ++i)
{
biases_diff_[i] += other.biases_diff_[i];
}
for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i)
{
weights_diff_[i] += other.weights_diff_[i];
}
return *this;
}
void reset_weights()
{
std::fill(std::begin(weights_diff_), std::end(weights_diff_), 0.0f);
}
void reset_biases()
{
std::fill(std::begin(biases_diff_), std::end(biases_diff_), 0.0f);
}
};
alignas(kCacheLineSize) LearnFloatType biases_[kOutputDimensions];
alignas(kCacheLineSize) LearnFloatType weights_[kOutputDimensions * kInputDimensions];
std::vector<ThreadState, CacheLineAlignedAllocator<ThreadState>> thread_states_;
// Forward propagation buffer
std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
// buffer for back propagation
std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
// hyper parameter
LearnFloatType momentum_;
LearnFloatType learning_rate_scale_;
};
} // namespace Eval::NNUE
#endif

View File

@@ -1,354 +0,0 @@
#ifndef _NNUE_TRAINER_CLIPPED_RELU_H_
#define _NNUE_TRAINER_CLIPPED_RELU_H_
#include "trainer.h"
#include "learn/learn.h"
#include "nnue/layers/clipped_relu.h"
#include "thread.h"
// Specialization of NNUE evaluation function learning class template for ClippedReLU
namespace Eval::NNUE {
// Learning: Clipped ReLU layer
template <typename PreviousLayer>
class Trainer<Layers::ClippedReLU<PreviousLayer>> {
private:
// Type of layer to learn
using LayerType = Layers::ClippedReLU<PreviousLayer>;
public:
// factory function
static std::shared_ptr<Trainer> create(
LayerType* target_layer, FeatureTransformer* ft) {
return std::shared_ptr<Trainer>(
new Trainer(target_layer, ft));
}
// Set options such as hyperparameters
void send_message(Message* message) {
previous_layer_trainer_->send_message(message);
if (receive_message("check_health", message)) {
check_health();
}
}
// Initialize the parameters with random numbers
template <typename RNG>
void initialize(RNG& rng) {
previous_layer_trainer_->initialize(rng);
}
const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
{
const auto size = batch_end - batch_begin;
if ((long)output_.size() < (long)kOutputDimensions * size) {
output_.resize(kOutputDimensions * size);
gradients_.resize(kInputDimensions * size);
}
if (thread_states_.size() < thread_pool.size())
{
thread_states_.resize(thread_pool.size());
}
input_ = previous_layer_trainer_->step_start(thread_pool, batch_begin, batch_end);
batch_size_ = size;
return output_.data();
}
// forward propagation
void propagate(Thread& th, const uint64_t offset, const uint64_t count) {
auto& thread_state = thread_states_[th.thread_idx()];
previous_layer_trainer_->propagate(th, offset, count);
#if defined (USE_SSE2)
{
static_assert(kOutputDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time");
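// Clamp each forward output to [0, 1] and simultaneously update the per-unit
// min/max activation statistics reported by check_health().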
const __m128 kZero4 = _mm_set1_ps(+kZero);
const __m128 kOne4 = _mm_set1_ps(+kOne);
for (IndexType b = offset; b < offset + count; ++b)
{
const IndexType batch_offset = kOutputDimensions * b;
for (IndexType i = 0; i < kOutputDimensions; i += 16)
{
__m128 out0 = _mm_loadu_ps(&input_[i + 0 + batch_offset]);
__m128 out1 = _mm_loadu_ps(&input_[i + 4 + batch_offset]);
__m128 out2 = _mm_loadu_ps(&input_[i + 8 + batch_offset]);
__m128 out3 = _mm_loadu_ps(&input_[i + 12 + batch_offset]);
out0 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out0));
out1 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out1));
out2 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out2));
out3 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out3));
_mm_storeu_ps(&output_[i + 0 + batch_offset], out0);
_mm_storeu_ps(&output_[i + 4 + batch_offset], out1);
_mm_storeu_ps(&output_[i + 8 + batch_offset], out2);
_mm_storeu_ps(&output_[i + 12 + batch_offset], out3);
__m128 minact0 = _mm_loadu_ps(&thread_state.min_activations_[i + 0]);
__m128 minact1 = _mm_loadu_ps(&thread_state.min_activations_[i + 4]);
__m128 minact2 = _mm_loadu_ps(&thread_state.min_activations_[i + 8]);
__m128 minact3 = _mm_loadu_ps(&thread_state.min_activations_[i + 12]);
__m128 maxact0 = _mm_loadu_ps(&thread_state.max_activations_[i + 0]);
__m128 maxact1 = _mm_loadu_ps(&thread_state.max_activations_[i + 4]);
__m128 maxact2 = _mm_loadu_ps(&thread_state.max_activations_[i + 8]);
__m128 maxact3 = _mm_loadu_ps(&thread_state.max_activations_[i + 12]);
minact0 = _mm_min_ps(out0, minact0);
minact1 = _mm_min_ps(out1, minact1);
minact2 = _mm_min_ps(out2, minact2);
minact3 = _mm_min_ps(out3, minact3);
maxact0 = _mm_max_ps(out0, maxact0);
maxact1 = _mm_max_ps(out1, maxact1);
maxact2 = _mm_max_ps(out2, maxact2);
maxact3 = _mm_max_ps(out3, maxact3);
_mm_storeu_ps(&thread_state.min_activations_[i + 0], minact0);
_mm_storeu_ps(&thread_state.min_activations_[i + 4], minact1);
_mm_storeu_ps(&thread_state.min_activations_[i + 8], minact2);
_mm_storeu_ps(&thread_state.min_activations_[i + 12], minact3);
_mm_storeu_ps(&thread_state.max_activations_[i + 0], maxact0);
_mm_storeu_ps(&thread_state.max_activations_[i + 4], maxact1);
_mm_storeu_ps(&thread_state.max_activations_[i + 8], maxact2);
_mm_storeu_ps(&thread_state.max_activations_[i + 12], maxact3);
}
}
}
#else
for (IndexType b = offset; b < offset + count; ++b) {
const IndexType batch_offset = kOutputDimensions * b;
for (IndexType i = 0; i < kOutputDimensions; ++i) {
const IndexType index = batch_offset + i;
output_[index] = std::max(+kZero, std::min(+kOne, input_[index]));
thread_state.min_activations_[i] = std::min(thread_state.min_activations_[i], output_[index]);
thread_state.max_activations_[i] = std::max(thread_state.max_activations_[i], output_[index]);
}
}
#endif
}
// backpropagation
void backpropagate(Thread& th,
const LearnFloatType* gradients,
const uint64_t offset,
const uint64_t count) {
auto& thread_state = thread_states_[th.thread_idx()];
#if defined (USE_SSE2)
{
static_assert(kOutputDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time");
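// A unit counts as clipped when its forward output saturated at 0 or 1; its
// gradient is zeroed by the andnot mask below and the clip is recorded for
// check_health().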
const __m128 kZero4 = _mm_set1_ps(+kZero);
const __m128 kOne4 = _mm_set1_ps(+kOne);
for (IndexType b = offset; b < offset + count; ++b)
{
const IndexType batch_offset = kOutputDimensions * b;
for (IndexType i = 0; i < kOutputDimensions; i += 16)
{
__m128 out0 = _mm_loadu_ps(&output_[batch_offset + i + 0]);
__m128 out1 = _mm_loadu_ps(&output_[batch_offset + i + 4]);
__m128 out2 = _mm_loadu_ps(&output_[batch_offset + i + 8]);
__m128 out3 = _mm_loadu_ps(&output_[batch_offset + i + 12]);
__m128 clipped0 = _mm_or_ps(_mm_cmple_ps(out0, kZero4), _mm_cmpge_ps(out0, kOne4));
__m128 clipped1 = _mm_or_ps(_mm_cmple_ps(out1, kZero4), _mm_cmpge_ps(out1, kOne4));
__m128 clipped2 = _mm_or_ps(_mm_cmple_ps(out2, kZero4), _mm_cmpge_ps(out2, kOne4));
__m128 clipped3 = _mm_or_ps(_mm_cmple_ps(out3, kZero4), _mm_cmpge_ps(out3, kOne4));
__m128 grad0 = _mm_loadu_ps(&gradients[batch_offset + i + 0]);
__m128 grad1 = _mm_loadu_ps(&gradients[batch_offset + i + 4]);
__m128 grad2 = _mm_loadu_ps(&gradients[batch_offset + i + 8]);
__m128 grad3 = _mm_loadu_ps(&gradients[batch_offset + i + 12]);
grad0 = _mm_andnot_ps(clipped0, grad0);
grad1 = _mm_andnot_ps(clipped1, grad1);
grad2 = _mm_andnot_ps(clipped2, grad2);
grad3 = _mm_andnot_ps(clipped3, grad3);
_mm_storeu_ps(&gradients_[batch_offset + i + 0], grad0);
_mm_storeu_ps(&gradients_[batch_offset + i + 4], grad1);
_mm_storeu_ps(&gradients_[batch_offset + i + 8], grad2);
_mm_storeu_ps(&gradients_[batch_offset + i + 12], grad3);
const int clipped_mask =
(_mm_movemask_ps(clipped0) << 0)
| (_mm_movemask_ps(clipped1) << 4)
| (_mm_movemask_ps(clipped2) << 8)
| (_mm_movemask_ps(clipped3) << 12);
thread_state.num_clipped_ += popcount(clipped_mask);
}
}
}
#else
for (IndexType b = offset; b < offset + count; ++b) {
const IndexType batch_offset = kOutputDimensions * b;
for (IndexType i = 0; i < kOutputDimensions; ++i) {
const IndexType index = batch_offset + i;
const bool clipped = (output_[index] <= kZero) | (output_[index] >= kOne);
gradients_[index] = gradients[index] * !clipped;
thread_state.num_clipped_ += clipped;
}
}
#endif
thread_state.num_total_ += count * kOutputDimensions;
previous_layer_trainer_->backpropagate(th, gradients_.data(), offset, count);
}
void reduce_thread_state()
{
for (IndexType i = 1; i < thread_states_.size(); ++i)
{
thread_states_[0] += thread_states_[i];
}
}
void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate)
{
previous_layer_trainer_->step_end(thread_pool, learning_rate);
}
private:
// constructor
Trainer(LayerType* target_layer, FeatureTransformer* ft) :
batch_size_(0),
previous_layer_trainer_(Trainer<PreviousLayer>::create(
&target_layer->previous_layer_, ft)),
target_layer_(target_layer) {
reset_stats();
}
void reset_stats() {
for(auto& state : thread_states_)
state.reset();
}
// Check if there are any problems with learning
void check_health() {
reduce_thread_state();
auto& main_thread_state = thread_states_[0];
const auto largest_min_activation = *std::max_element(
std::begin(main_thread_state.min_activations_), std::end(main_thread_state.min_activations_));
const auto smallest_max_activation = *std::min_element(
std::begin(main_thread_state.max_activations_), std::end(main_thread_state.max_activations_));
auto out = sync_region_cout.new_region();
out << "INFO (check_health):"
<< " layer " << LayerType::kLayerIndex
<< " - " << LayerType::get_name()
<< std::endl;
out << " - largest min activation = " << largest_min_activation
<< " , smallest max activation = " << smallest_max_activation
<< std::endl;
out << " - clipped " << static_cast<double>(main_thread_state.num_clipped_) / main_thread_state.num_total_ * 100.0 << "% of outputs"
<< std::endl;
out.unlock();
reset_stats();
}
// number of input/output dimensions
static constexpr IndexType kInputDimensions = LayerType::kOutputDimensions;
static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
// LearnFloatType constant
static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
// number of samples in mini-batch
IndexType batch_size_;
const LearnFloatType* input_;
// Trainer of the previous layer
const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
// layer to learn
LayerType* const target_layer_;
// Forward propagation buffer
std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
// buffer for back propagation
std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
struct alignas(kCacheLineSize) ThreadState
{
// Health check statistics
LearnFloatType min_activations_[kOutputDimensions];
LearnFloatType max_activations_[kOutputDimensions];
uint64_t num_clipped_;
uint64_t num_total_;
ThreadState() { reset(); }
ThreadState& operator+=(const ThreadState& other)
{
for (IndexType i = 0; i < kOutputDimensions; ++i)
{
min_activations_[i] = std::min(min_activations_[i], other.min_activations_[i]);
}
for (IndexType i = 0; i < kOutputDimensions; ++i)
{
max_activations_[i] = std::max(max_activations_[i], other.max_activations_[i]);
}
num_clipped_ += other.num_clipped_;
num_total_ += other.num_total_;
return *this;
}
void reset()
{
std::fill(std::begin(min_activations_), std::end(min_activations_), std::numeric_limits<float>::max());
std::fill(std::begin(max_activations_), std::end(max_activations_), std::numeric_limits<float>::lowest());
num_clipped_ = 0;
num_total_ = 0;
}
};
std::vector<ThreadState, CacheLineAlignedAllocator<ThreadState>> thread_states_;
};
} // namespace Eval::NNUE
#endif

View File

@@ -1,783 +0,0 @@
#ifndef _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
#define _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
#include "trainer.h"
#include "extra/stockfish_blas.h"
#include "features/all_factorizers.h"
#include "learn/learn.h"
#include "nnue/nnue_feature_transformer.h"
#include "thread.h"
#include <array>
#include <bitset>
#include <numeric>
#include <random>
#include <set>
// Specialization of NNUE evaluation function learning class template for FeatureTransformer
namespace Eval::NNUE {
// Learning: Input feature converter
template <>
class Trainer<FeatureTransformer> {
private:
// Type of layer to learn
using LayerType = FeatureTransformer;
public:
template <typename T>
friend struct AlignedDeleter;
template <typename T, typename... ArgumentTypes>
friend std::shared_ptr<T> make_aligned_shared_ptr(ArgumentTypes&&... arguments);
// factory function
static std::shared_ptr<Trainer> create(LayerType* target_layer) {
return make_aligned_shared_ptr<Trainer>(target_layer);
}
// Set options such as hyperparameters
void send_message(Message* message) {
if (receive_message("momentum", message)) {
momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
}
if (receive_message("learning_rate_scale", message)) {
learning_rate_scale_ =
static_cast<LearnFloatType>(std::stod(message->value));
}
if (receive_message("reset", message)) {
dequantize_parameters();
}
if (receive_message("quantize_parameters", message)) {
quantize_parameters();
}
if (receive_message("clear_unobserved_feature_weights", message)) {
clear_unobserved_feature_weights();
}
if (receive_message("check_health", message)) {
check_health();
}
}
// Initialize the parameters with random numbers
template <typename RNG>
void initialize(RNG& rng) {
std::fill(std::begin(weights_), std::end(weights_), +kZero);
const double kSigma = 0.1 / std::sqrt(RawFeatures::kMaxActiveDimensions);
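// The accumulator sums up to kMaxActiveDimensions weight columns, so the
// per-weight standard deviation is divided by sqrt(kMaxActiveDimensions) to
// keep the variance of the summed pre-activation modest; the 0.1 factor
// looks like an empirical choice.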
auto distribution = std::normal_distribution<double>(0.0, kSigma);
for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) {
const auto weight = static_cast<LearnFloatType>(distribution(rng));
weights_[i] = weight;
}
for (IndexType i = 0; i < kHalfDimensions; ++i) {
biases_[i] = static_cast<LearnFloatType>(0.5);
}
quantize_parameters();
}
const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
{
const auto size = batch_end - batch_begin;
if ((long)output_.size() < (long)kOutputDimensions * size) {
output_.resize(kOutputDimensions * size);
gradients_.resize(kOutputDimensions * size);
}
if (thread_stat_states_.size() < thread_pool.size())
{
thread_stat_states_.resize(thread_pool.size());
}
if (thread_bias_states_.size() < thread_pool.size())
{
thread_bias_states_.resize(thread_pool.size());
}
batch_ = &*batch_begin;
batch_size_ = size;
auto& main_thread_bias_state = thread_bias_states_[0];
#if defined(USE_BLAS)
cblas_sscal(
kHalfDimensions, momentum_, main_thread_bias_state.biases_diff_, 1
);
#else
Blas::sscal(
kHalfDimensions, momentum_, main_thread_bias_state.biases_diff_, 1
);
#endif
for (IndexType i = 1; i < thread_bias_states_.size(); ++i)
thread_bias_states_[i].reset();
return output_.data();
}
// forward propagation
void propagate(Thread& th, uint64_t offset, uint64_t count) {
auto& thread_stat_state = thread_stat_states_[th.thread_idx()];
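// Accumulator pass: for each sample and each of the two perspectives, start
// from the biases and add the weight column of every active feature, scaled
// by how many times that feature occurs in the training example.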
for (IndexType b = offset; b < offset + count; ++b)
{
const IndexType batch_offset = kOutputDimensions * b;
for (IndexType c = 0; c < 2; ++c) {
const IndexType output_offset = batch_offset + kHalfDimensions * c;
#if defined(USE_BLAS)
cblas_scopy(
kHalfDimensions, biases_, 1, &output_[output_offset], 1
);
for (const auto& feature : batch_[b].training_features[c]) {
const IndexType weights_offset = kHalfDimensions * feature.get_index();
cblas_saxpy(
kHalfDimensions, (float)feature.get_count(),
&weights_[weights_offset], 1, &output_[output_offset], 1
);
}
#else
Blas::scopy(
kHalfDimensions, biases_, 1, &output_[output_offset], 1
);
for (const auto& feature : batch_[b].training_features[c]) {
const IndexType weights_offset = kHalfDimensions * feature.get_index();
Blas::saxpy(
kHalfDimensions, (float)feature.get_count(),
&weights_[weights_offset], &output_[output_offset]
);
}
#endif
}
}
#if defined (USE_SSE2)
{
static_assert(kHalfDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time");
auto m128_hmin_ps = [](__m128 x3210) {
__m128 x0032 = _mm_shuffle_ps(x3210, x3210, _MM_SHUFFLE(0, 0, 3, 2));
__m128 min_x_x_13_20 = _mm_min_ps(x3210, x0032);
// a = [ # , # , min(x[1], x[3]) , min(x[2], x[0]) ]
__m128 min_x_x_20_13 = _mm_shuffle_ps(min_x_x_13_20, min_x_x_13_20, _MM_SHUFFLE(0, 0, 0, 1));
return _mm_cvtss_f32(_mm_min_ps(min_x_x_13_20, min_x_x_20_13));
};
auto m128_hmax_ps = [](__m128 x3210) {
__m128 x0032 = _mm_shuffle_ps(x3210, x3210, _MM_SHUFFLE(0, 0, 3, 2));
__m128 max_x_x_13_20 = _mm_max_ps(x3210, x0032);
// a = [ # , # , max(x[1], x[3]) , max(x[2], x[0]) ]
__m128 max_x_x_20_13 = _mm_shuffle_ps(max_x_x_13_20, max_x_x_13_20, _MM_SHUFFLE(0, 0, 0, 1));
return _mm_cvtss_f32(_mm_max_ps(max_x_x_13_20, max_x_x_20_13));
};
const __m128 kZero4 = _mm_set1_ps(+kZero);
const __m128 kOne4 = _mm_set1_ps(+kOne);
__m128 min_pre_activation0 = _mm_set1_ps(thread_stat_state.min_pre_activation_);
__m128 min_pre_activation1 = _mm_set1_ps(thread_stat_state.min_pre_activation_);
__m128 max_pre_activation0 = _mm_set1_ps(thread_stat_state.max_pre_activation_);
__m128 max_pre_activation1 = _mm_set1_ps(thread_stat_state.max_pre_activation_);
for (IndexType b = offset; b < offset + count; ++b)
{
const IndexType batch_offset = kOutputDimensions * b;
for (IndexType i = 0; i < kOutputDimensions; i += 16)
{
__m128 out0 = _mm_loadu_ps(&output_[batch_offset + i + 0]);
__m128 out1 = _mm_loadu_ps(&output_[batch_offset + i + 4]);
__m128 out2 = _mm_loadu_ps(&output_[batch_offset + i + 8]);
__m128 out3 = _mm_loadu_ps(&output_[batch_offset + i + 12]);
__m128 min01 = _mm_min_ps(out0, out1);
__m128 min23 = _mm_min_ps(out2, out3);
__m128 max01 = _mm_max_ps(out0, out1);
__m128 max23 = _mm_max_ps(out2, out3);
min_pre_activation0 = _mm_min_ps(min_pre_activation0, min01);
min_pre_activation1 = _mm_min_ps(min_pre_activation1, min23);
max_pre_activation0 = _mm_max_ps(max_pre_activation0, max01);
max_pre_activation1 = _mm_max_ps(max_pre_activation1, max23);
out0 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out0));
out1 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out1));
out2 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out2));
out3 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out3));
_mm_storeu_ps(&output_[batch_offset + i + 0], out0);
_mm_storeu_ps(&output_[batch_offset + i + 4], out1);
_mm_storeu_ps(&output_[batch_offset + i + 8], out2);
_mm_storeu_ps(&output_[batch_offset + i + 12], out3);
}
}
thread_stat_state.min_pre_activation_ = m128_hmin_ps(_mm_min_ps(min_pre_activation0, min_pre_activation1));
thread_stat_state.max_pre_activation_ = m128_hmax_ps(_mm_max_ps(max_pre_activation0, max_pre_activation1));
for (IndexType b = offset; b < offset + count; ++b)
{
const IndexType batch_offset = kOutputDimensions * b;
for (IndexType half = 0; half < 2; ++half)
{
const IndexType half_offset = batch_offset + half * kHalfDimensions;
for (IndexType i = 0; i < kHalfDimensions; i += 16)
{
const __m128 out0 = _mm_loadu_ps(&output_[i + 0 + half_offset]);
const __m128 out1 = _mm_loadu_ps(&output_[i + 4 + half_offset]);
const __m128 out2 = _mm_loadu_ps(&output_[i + 8 + half_offset]);
const __m128 out3 = _mm_loadu_ps(&output_[i + 12 + half_offset]);
__m128 minact0 = _mm_loadu_ps(&thread_stat_state.min_activations_[i + 0]);
__m128 minact1 = _mm_loadu_ps(&thread_stat_state.min_activations_[i + 4]);
__m128 minact2 = _mm_loadu_ps(&thread_stat_state.min_activations_[i + 8]);
__m128 minact3 = _mm_loadu_ps(&thread_stat_state.min_activations_[i + 12]);
__m128 maxact0 = _mm_loadu_ps(&thread_stat_state.max_activations_[i + 0]);
__m128 maxact1 = _mm_loadu_ps(&thread_stat_state.max_activations_[i + 4]);
__m128 maxact2 = _mm_loadu_ps(&thread_stat_state.max_activations_[i + 8]);
__m128 maxact3 = _mm_loadu_ps(&thread_stat_state.max_activations_[i + 12]);
minact0 = _mm_min_ps(out0, minact0);
minact1 = _mm_min_ps(out1, minact1);
minact2 = _mm_min_ps(out2, minact2);
minact3 = _mm_min_ps(out3, minact3);
maxact0 = _mm_max_ps(out0, maxact0);
maxact1 = _mm_max_ps(out1, maxact1);
maxact2 = _mm_max_ps(out2, maxact2);
maxact3 = _mm_max_ps(out3, maxact3);
_mm_storeu_ps(&thread_stat_state.min_activations_[i + 0], minact0);
_mm_storeu_ps(&thread_stat_state.min_activations_[i + 4], minact1);
_mm_storeu_ps(&thread_stat_state.min_activations_[i + 8], minact2);
_mm_storeu_ps(&thread_stat_state.min_activations_[i + 12], minact3);
_mm_storeu_ps(&thread_stat_state.max_activations_[i + 0], maxact0);
_mm_storeu_ps(&thread_stat_state.max_activations_[i + 4], maxact1);
_mm_storeu_ps(&thread_stat_state.max_activations_[i + 8], maxact2);
_mm_storeu_ps(&thread_stat_state.max_activations_[i + 12], maxact3);
}
}
}
}
#else
// clipped ReLU
for (IndexType b = offset; b < offset + count; ++b) {
const IndexType batch_offset = kOutputDimensions * b;
for (IndexType i = 0; i < kOutputDimensions; ++i) {
const IndexType index = batch_offset + i;
thread_stat_state.min_pre_activation_ = std::min(thread_stat_state.min_pre_activation_, output_[index]);
thread_stat_state.max_pre_activation_ = std::max(thread_stat_state.max_pre_activation_, output_[index]);
output_[index] = std::max(+kZero, std::min(+kOne, output_[index]));
const IndexType t = i % kHalfDimensions;
thread_stat_state.min_activations_[t] = std::min(thread_stat_state.min_activations_[t], output_[index]);
thread_stat_state.max_activations_[t] = std::max(thread_stat_state.max_activations_[t], output_[index]);
}
}
#endif
}
// backpropagation
void backpropagate(Thread& th,
const LearnFloatType* gradients,
uint64_t offset,
uint64_t count) {
auto& thread_stat_state = thread_stat_states_[th.thread_idx()];
auto& thread_bias_state = thread_bias_states_[th.thread_idx()];
#if defined (USE_SSE2)
{
static_assert(kHalfDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time");
const __m128 kZero4 = _mm_set1_ps(+kZero);
const __m128 kOne4 = _mm_set1_ps(+kOne);
for (IndexType b = offset; b < offset + count; ++b)
{
const IndexType batch_offset = kOutputDimensions * b;
for (IndexType i = 0; i < kOutputDimensions; i += 16)
{
__m128 out0 = _mm_loadu_ps(&output_[batch_offset + i + 0]);
__m128 out1 = _mm_loadu_ps(&output_[batch_offset + i + 4]);
__m128 out2 = _mm_loadu_ps(&output_[batch_offset + i + 8]);
__m128 out3 = _mm_loadu_ps(&output_[batch_offset + i + 12]);
__m128 clipped0 = _mm_or_ps(_mm_cmple_ps(out0, kZero4), _mm_cmpge_ps(out0, kOne4));
__m128 clipped1 = _mm_or_ps(_mm_cmple_ps(out1, kZero4), _mm_cmpge_ps(out1, kOne4));
__m128 clipped2 = _mm_or_ps(_mm_cmple_ps(out2, kZero4), _mm_cmpge_ps(out2, kOne4));
__m128 clipped3 = _mm_or_ps(_mm_cmple_ps(out3, kZero4), _mm_cmpge_ps(out3, kOne4));
__m128 grad0 = _mm_loadu_ps(&gradients[batch_offset + i + 0]);
__m128 grad1 = _mm_loadu_ps(&gradients[batch_offset + i + 4]);
__m128 grad2 = _mm_loadu_ps(&gradients[batch_offset + i + 8]);
__m128 grad3 = _mm_loadu_ps(&gradients[batch_offset + i + 12]);
grad0 = _mm_andnot_ps(clipped0, grad0);
grad1 = _mm_andnot_ps(clipped1, grad1);
grad2 = _mm_andnot_ps(clipped2, grad2);
grad3 = _mm_andnot_ps(clipped3, grad3);
_mm_storeu_ps(&gradients_[batch_offset + i + 0], grad0);
_mm_storeu_ps(&gradients_[batch_offset + i + 4], grad1);
_mm_storeu_ps(&gradients_[batch_offset + i + 8], grad2);
_mm_storeu_ps(&gradients_[batch_offset + i + 12], grad3);
const int clipped_mask =
(_mm_movemask_ps(clipped0) << 0)
| (_mm_movemask_ps(clipped1) << 4)
| (_mm_movemask_ps(clipped2) << 8)
| (_mm_movemask_ps(clipped3) << 12);
thread_stat_state.num_clipped_ += popcount(clipped_mask);
}
}
}
#else
for (IndexType b = offset; b < offset + count; ++b) {
const IndexType batch_offset = kOutputDimensions * b;
for (IndexType i = 0; i < kOutputDimensions; ++i) {
const IndexType index = batch_offset + i;
const bool clipped = (output_[index] <= kZero) | (output_[index] >= kOne);
gradients_[index] = gradients[index] * !clipped;
thread_stat_state.num_clipped_ += clipped;
}
}
#endif
thread_stat_state.num_total_ += count * kOutputDimensions;
#if defined(USE_BLAS)
for (IndexType b = offset; b < offset + count; ++b) {
const IndexType batch_offset = kOutputDimensions * b;
for (IndexType c = 0; c < 2; ++c) {
const IndexType output_offset = batch_offset + kHalfDimensions * c;
cblas_saxpy(
kHalfDimensions, 1.0,
&gradients_[output_offset], 1, thread_bias_state.biases_diff_, 1
);
}
}
#else
for (IndexType b = offset; b < offset + count; ++b) {
const IndexType batch_offset = kOutputDimensions * b;
for (IndexType c = 0; c < 2; ++c) {
const IndexType output_offset = batch_offset + kHalfDimensions * c;
Blas::saxpy(
kHalfDimensions, 1.0,
&gradients_[output_offset], 1, thread_bias_state.biases_diff_, 1
);
}
}
#endif
}
void reduce_thread_stat_state()
{
for (IndexType i = 1; i < thread_stat_states_.size(); ++i)
{
thread_stat_states_[0] += thread_stat_states_[i];
}
}
void reduce_thread_bias_state()
{
for (IndexType i = 1; i < thread_bias_states_.size(); ++i)
{
thread_bias_states_[0] += thread_bias_states_[i];
}
}
void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) {
const LearnFloatType local_learning_rate =
learning_rate * learning_rate_scale_;
// Since only the columns of the weight matrix corresponding to features that
// appeared in the input are updated, compensate by scaling the learning rate
// instead of applying momentum to the weights.
const LearnFloatType effective_learning_rate =
static_cast<LearnFloatType>(local_learning_rate / (1.0 - momentum_));
reduce_thread_bias_state();
auto& main_thread_state = thread_bias_states_[0];
#if defined(USE_BLAS)
cblas_saxpy(
kHalfDimensions, -local_learning_rate,
main_thread_state.biases_diff_, 1, biases_, 1
);
#else
Blas::saxpy(
kHalfDimensions, -local_learning_rate,
main_thread_state.biases_diff_, 1, biases_, 1
);
#endif
thread_pool.execute_with_workers(
[&, num_threads = thread_pool.size()](Thread& th) {
const auto thread_index = th.thread_idx();
for (IndexType b = 0; b < batch_size_; ++b) {
const IndexType batch_offset = kOutputDimensions * b;
for (IndexType c = 0; c < 2; ++c) {
const IndexType output_offset = batch_offset + kHalfDimensions * c;
for (const auto& feature : batch_[b].training_features[c]) {
const IndexType feature_index = feature.get_index();
const IndexType weights_offset =
kHalfDimensions * feature_index;
#if defined (USE_SSE2)
_mm_prefetch(reinterpret_cast<const char*>(&weights_[weights_offset]), _MM_HINT_T2);
#endif
// We assign each bucket a continuous range of bits at least
// of cache line size to prevent false sharing.
// For HalfKP this is enough to saturate about 80 threads.
const IndexType thread_bucket =
(feature_index / BitsetType::best_concurrent_access_stride)
% num_threads;
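// For illustration only: if the stride were 512 and there were 8 threads,
// features 0..511 would belong to thread 0, 512..1023 to thread 1, and so
// on, wrapping around every 4096 features.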
if (thread_bucket != thread_index)
continue;
// This operation can be performed safely because
// each thread accesses a different memory location
// (even a different cache line)
observed_features.set(feature_index);
const auto scale = static_cast<LearnFloatType>(
effective_learning_rate / feature.get_count());
#if defined (USE_BLAS)
cblas_saxpy(
kHalfDimensions, -scale,
&gradients_[output_offset], 1,
&weights_[weights_offset], 1
);
#else
Blas::saxpy(
kHalfDimensions, -scale,
&gradients_[output_offset],
&weights_[weights_offset]
);
#endif
}
}
}
}
);
thread_pool.wait_for_workers_finished();
}
private:
// constructor
Trainer(LayerType* target_layer) :
batch_(nullptr),
batch_size_(0),
target_layer_(target_layer),
biases_(),
weights_(),
momentum_(0.2),
learning_rate_scale_(1.0) {
dequantize_parameters();
}
// Quantize the parameters and write them to the target layer
void quantize_parameters() {
for (IndexType i = 0; i < kHalfDimensions; ++i) {
target_layer_->biases_[i] =
round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
}
std::vector<TrainingFeature> training_features;
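// The quantized weight of each real feature is the rounded sum of the float
// weights of all learning (factored) features it expands to, folding the
// factorization back into the compact network.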
Threads.for_each_index_with_workers(
0, RawFeatures::kDimensions,
[this, training_features](Thread&, int j) mutable {
training_features.clear();
Features::Factorizer<RawFeatures>::append_training_features(
j, &training_features);
for (IndexType i = 0; i < kHalfDimensions; ++i) {
double sum = 0.0;
for (const auto& feature : training_features) {
sum += weights_[kHalfDimensions * feature.get_index() + i];
}
target_layer_->weights_[kHalfDimensions * j + i] =
round<typename LayerType::WeightType>(sum * kWeightScale);
}
}
);
Threads.wait_for_workers_finished();
}
void reset_stats() {
for (auto& state : thread_stat_states_)
state.reset();
}
// Read the quantized integer parameters back as floats
void dequantize_parameters() {
for (IndexType i = 0; i < kHalfDimensions; ++i) {
biases_[i] = static_cast<LearnFloatType>(
target_layer_->biases_[i] / kBiasScale);
}
std::fill(std::begin(weights_), std::end(weights_), +kZero);
for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) {
weights_[i] = static_cast<LearnFloatType>(
target_layer_->weights_[i] / kWeightScale);
}
reset_stats();
for (auto& state : thread_bias_states_)
state.reset();
}
// Zero the weights of features that never appeared in the training data
void clear_unobserved_feature_weights() {
for (IndexType i = 0; i < kInputDimensions; ++i) {
if (!observed_features.test(i)) {
std::fill(std::begin(weights_) + kHalfDimensions * i,
std::begin(weights_) + kHalfDimensions * (i + 1), +kZero);
}
}
quantize_parameters();
}
// Check if there are any problems with learning
void check_health() {
constexpr LearnFloatType kPreActivationLimit =
std::numeric_limits<typename LayerType::WeightType>::max() /
kWeightScale;
reduce_thread_stat_state();
auto& main_thread_state = thread_stat_states_[0];
const auto largest_min_activation = *std::max_element(
std::begin(main_thread_state.min_activations_), std::end(main_thread_state.min_activations_));
const auto smallest_max_activation = *std::min_element(
std::begin(main_thread_state.max_activations_), std::end(main_thread_state.max_activations_));
double abs_bias_sum = 0.0;
double abs_weight_sum = 0.0;
for(auto b : biases_)
abs_bias_sum += std::abs(b);
std::vector<TrainingFeature> training_features;
for (IndexType j = 0; j < RawFeatures::kDimensions; ++j)
{
training_features.clear();
Features::Factorizer<RawFeatures>::append_training_features(
j, &training_features);
for (const auto& feature : training_features) {
for (IndexType i = 0; i < kHalfDimensions; ++i) {
abs_weight_sum += std::abs(weights_[kHalfDimensions * feature.get_index() + i]);
}
}
}
auto out = sync_region_cout.new_region();
out << "INFO (check_health):"
<< " layer " << LayerType::kLayerIndex
<< " - " << LayerType::get_name()
<< std::endl;
out << " - observed " << observed_features.count()
<< " (out of " << kInputDimensions << ") features"
<< std::endl;
out << " - (min, max) of pre-activations = "
<< main_thread_state.min_pre_activation_ << ", "
<< main_thread_state.max_pre_activation_ << " (limit = "
<< kPreActivationLimit << ")"
<< std::endl;
out << " - largest min activation = " << largest_min_activation
<< " , smallest max activation = " << smallest_max_activation
<< std::endl;
out << " - avg_abs_bias = " << abs_bias_sum / std::size(biases_) << std::endl;
out << " - avg_abs_weight = " << abs_weight_sum / std::size(weights_) << std::endl;
out << " - clipped " << static_cast<double>(main_thread_state.num_clipped_) / main_thread_state.num_total_ * 100.0 << "% of outputs"
<< std::endl;
out.unlock();
reset_stats();
}
// number of input/output dimensions
static constexpr IndexType kInputDimensions =
Features::Factorizer<RawFeatures>::get_dimensions();
static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
static constexpr IndexType kHalfDimensions = LayerType::kHalfDimensions;
// Coefficient used for parameterization
static constexpr LearnFloatType kActivationScale =
std::numeric_limits<std::int8_t>::max();
static constexpr LearnFloatType kBiasScale = kActivationScale;
static constexpr LearnFloatType kWeightScale = kActivationScale;
// LearnFloatType constant
static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
// mini batch
const Example* batch_;
IndexType batch_size_;
// layer to learn
LayerType* const target_layer_;
// parameter
alignas(kCacheLineSize) LearnFloatType biases_[kHalfDimensions];
alignas(kCacheLineSize)
LearnFloatType weights_[kHalfDimensions * kInputDimensions];
// Buffer used for updating parameters
std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
// Forward propagation buffer
std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
// Features that appeared in the training data
using BitsetType = LargeBitset<kInputDimensions>;
BitsetType observed_features;
// hyper parameter
LearnFloatType momentum_;
LearnFloatType learning_rate_scale_;
struct alignas(kCacheLineSize) ThreadStatState
{
alignas(kCacheLineSize) LearnFloatType min_activations_[kHalfDimensions];
alignas(kCacheLineSize) LearnFloatType max_activations_[kHalfDimensions];
LearnFloatType min_pre_activation_;
LearnFloatType max_pre_activation_;
uint64_t num_clipped_;
uint64_t num_total_;
ThreadStatState() { reset(); }
ThreadStatState& operator+=(const ThreadStatState& other)
{
for (IndexType i = 0; i < kHalfDimensions; ++i)
{
min_activations_[i] = std::min(min_activations_[i], other.min_activations_[i]);
}
for (IndexType i = 0; i < kHalfDimensions; ++i)
{
max_activations_[i] = std::max(max_activations_[i], other.max_activations_[i]);
}
min_pre_activation_ = std::min(min_pre_activation_, other.min_pre_activation_);
max_pre_activation_ = std::max(max_pre_activation_, other.max_pre_activation_);
num_clipped_ += other.num_clipped_;
num_total_ += other.num_total_;
return *this;
}
void reset()
{
std::fill(std::begin(min_activations_), std::end(min_activations_), std::numeric_limits<float>::max());
std::fill(std::begin(max_activations_), std::end(max_activations_), std::numeric_limits<float>::lowest());
min_pre_activation_ = std::numeric_limits<float>::max();
max_pre_activation_ = std::numeric_limits<float>::lowest();
num_clipped_ = 0;
num_total_ = 0;
}
};
struct alignas(kCacheLineSize) ThreadBiasState
{
alignas(kCacheLineSize) LearnFloatType biases_diff_[kHalfDimensions];
ThreadBiasState() { reset(); }
ThreadBiasState& operator+=(const ThreadBiasState& other)
{
for (IndexType i = 0; i < kHalfDimensions; ++i)
{
biases_diff_[i] += other.biases_diff_[i];
}
return *this;
}
void reset()
{
std::fill(std::begin(biases_diff_), std::end(biases_diff_), 0.0f);
}
};
std::vector<ThreadStatState, CacheLineAlignedAllocator<ThreadStatState>> thread_stat_states_;
std::vector<ThreadBiasState, CacheLineAlignedAllocator<ThreadBiasState>> thread_bias_states_;
};
} // namespace Eval::NNUE
#endif

View File

@@ -1,383 +0,0 @@
#ifndef _NNUE_TRAINER_INPUT_SLICE_H_
#define _NNUE_TRAINER_INPUT_SLICE_H_
#include "trainer.h"
#include "extra/stockfish_blas.h"
#include "learn/learn.h"
#include "nnue/layers/input_slice.h"
#include "thread.h"
// Specialization of NNUE evaluation function learning class template for InputSlice
namespace Eval::NNUE {
// Learning: Input layer
// This is tricky. It exists because when there's more than one trainer
// on top of a single feature transformer we want to only call propagate/backpropagate
// on the feature transformer once. This is straightforward in the old
// multithreading case, because propagate/backpropagate is called just once from the
// main thread. But with the current implementation of coarser multithreading
// we end up calling each method from each thread. Therefore we have to keep
// num_calls and current_operation on a per-thread basis, each thread must work
// on its designated batch slice, and the only synchronization points are
// step_start and step_end - for which we use the state of the first thread.
// Each thread requires its own bookkeeping because it's possible that
// one thread is still in propagate of some batch slice while another thread
// is doing backpropagate of some other slice. We also ensure the thread state
// isn't susceptible to false sharing by giving each state a full cache line.
class SharedInputTrainer {
public:
// factory function
static std::shared_ptr<SharedInputTrainer> create(
FeatureTransformer* ft) {
static std::shared_ptr<SharedInputTrainer> instance;
if (!instance) {
instance.reset(new SharedInputTrainer(ft));
}
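// Every layer sharing this input bumps the referrer count; the per-thread
// call counters below use it to tell when a full round of calls from all
// consumers has completed, so the feature transformer is invoked only once
// per round.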
++instance->num_referrers_;
return instance;
}
// Set options such as hyperparameters
void send_message(Message* message) {
auto& thread_state = thread_states_[0];
if (thread_state.num_calls == 0) {
thread_state.current_operation = Operation::kSendMessage;
feature_transformer_trainer_->send_message(message);
}
assert(thread_state.current_operation == Operation::kSendMessage);
if (++thread_state.num_calls == num_referrers_) {
thread_state.num_calls = 0;
thread_state.current_operation = Operation::kNone;
}
}
// Initialize the parameters with random numbers
template <typename RNG>
void initialize(RNG& rng) {
auto& thread_state = thread_states_[0];
if (thread_state.num_calls == 0) {
thread_state.current_operation = Operation::kInitialize;
feature_transformer_trainer_->initialize(rng);
}
assert(thread_state.current_operation == Operation::kInitialize);
if (++thread_state.num_calls == num_referrers_) {
thread_state.num_calls = 0;
thread_state.current_operation = Operation::kNone;
}
}
const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
{
const auto size = batch_end - batch_begin;
if ((long)gradients_.size() < (long)kInputDimensions * size) {
gradients_.resize(kInputDimensions * size);
}
if (thread_states_.size() < thread_pool.size())
{
thread_states_.resize(thread_pool.size());
}
batch_size_ = size;
auto& thread_state = thread_states_[0];
if (thread_state.num_calls == 0) {
thread_state.current_operation = Operation::kStepStart;
output_ = feature_transformer_trainer_->step_start(thread_pool, batch_begin, batch_end);
}
assert(thread_state.current_operation == Operation::kStepStart);
if (++thread_state.num_calls == num_referrers_) {
thread_state.num_calls = 0;
thread_state.current_operation = Operation::kNone;
}
return output_;
}
// forward propagation
void propagate(Thread& th, uint64_t offset, uint64_t count) {
const auto thread_id = th.thread_idx();
auto& thread_state = thread_states_[thread_id];
if (thread_state.num_calls == 0) {
thread_state.current_operation = Operation::kPropagate;
feature_transformer_trainer_->propagate(th, offset, count);
}
assert(thread_state.current_operation == Operation::kPropagate);
if (++thread_state.num_calls == num_referrers_) {
thread_state.num_calls = 0;
thread_state.current_operation = Operation::kNone;
}
}
// backpropagation
void backpropagate(Thread& th,
const LearnFloatType* gradients,
uint64_t offset,
uint64_t count) {
const auto thread_id = th.thread_idx();
auto& thread_state = thread_states_[thread_id];
if (num_referrers_ == 1) {
feature_transformer_trainer_->backpropagate(th, gradients, offset, count);
return;
}
if (thread_state.num_calls == 0) {
thread_state.current_operation = Operation::kBackPropagate;
for (IndexType b = offset; b < offset + count; ++b) {
const IndexType batch_offset = kInputDimensions * b;
for (IndexType i = 0; i < kInputDimensions; ++i) {
gradients_[batch_offset + i] = static_cast<LearnFloatType>(0.0);
}
}
}
assert(thread_state.current_operation == Operation::kBackPropagate);
for (IndexType b = offset; b < offset + count; ++b) {
const IndexType batch_offset = kInputDimensions * b;
for (IndexType i = 0; i < kInputDimensions; ++i) {
gradients_[batch_offset + i] += gradients[batch_offset + i];
}
}
if (++thread_state.num_calls == num_referrers_) {
feature_transformer_trainer_->backpropagate(
th, gradients_.data(), offset, count);
thread_state.num_calls = 0;
thread_state.current_operation = Operation::kNone;
}
}
void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) {
auto& thread_state = thread_states_[0];
if (thread_state.num_calls == 0) {
thread_state.current_operation = Operation::kStepEnd;
feature_transformer_trainer_->step_end(thread_pool, learning_rate);
}
assert(thread_state.current_operation == Operation::kStepEnd);
if (++thread_state.num_calls == num_referrers_) {
thread_state.num_calls = 0;
thread_state.current_operation = Operation::kNone;
}
}
private:
// constructor
SharedInputTrainer(FeatureTransformer* ft) :
batch_size_(0),
num_referrers_(0),
thread_states_(1),
feature_transformer_trainer_(Trainer<FeatureTransformer>::create(
ft)),
output_(nullptr) {
}
// number of input/output dimensions
static constexpr IndexType kInputDimensions =
FeatureTransformer::kOutputDimensions;
// type of processing
enum class Operation {
kNone,
kSendMessage,
kInitialize,
kStepStart,
kPropagate,
kBackPropagate,
kStepEnd,
};
// number of samples in mini-batch
IndexType batch_size_;
// number of layers sharing this layer as input
std::uint32_t num_referrers_;
struct alignas(kCacheLineSize) ThreadState
{
std::uint32_t num_calls{0};
// current processing type
Operation current_operation = Operation::kNone;
};
// Per-thread bookkeeping of the current operation and how many consumers have issued it
std::vector<ThreadState, CacheLineAlignedAllocator<ThreadState>> thread_states_;
// Trainer of input feature converter
const std::shared_ptr<Trainer<FeatureTransformer>>
feature_transformer_trainer_;
// pointer to output shared for forward propagation
const LearnFloatType* output_;
// buffer for back propagation
std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
};
// Learning: Input layer
template <IndexType OutputDimensions, IndexType Offset>
class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
private:
// Type of layer to learn
using LayerType = Layers::InputSlice<OutputDimensions, Offset>;
public:
// factory function
static std::shared_ptr<Trainer> create(
LayerType* /*target_layer*/, FeatureTransformer* ft) {
return std::shared_ptr<Trainer>(new Trainer(ft));
}
// Set options such as hyperparameters
void send_message(Message* message) {
shared_input_trainer_->send_message(message);
}
// Initialize the parameters with random numbers
template <typename RNG>
void initialize(RNG& rng) {
shared_input_trainer_->initialize(rng);
}
const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
{
const auto size = static_cast<IndexType>(batch_end - batch_begin);
if (output_.size() < kOutputDimensions * size) {
output_.resize(kOutputDimensions * size);
gradients_.resize(kInputDimensions * size);
}
batch_size_ = size;
input_ = shared_input_trainer_->step_start(thread_pool, batch_begin, batch_end);
return output_.data();
}
// forward propagation
void propagate(Thread& th, uint64_t offset, uint64_t count) {
shared_input_trainer_->propagate(th, offset, count);
for (IndexType b = offset; b < offset + count; ++b) {
const IndexType input_offset = kInputDimensions * b;
const IndexType output_offset = kOutputDimensions * b;
#if defined(USE_BLAS)
cblas_scopy(
kOutputDimensions, &input_[input_offset + Offset], 1,
&output_[output_offset], 1
);
#else
Blas::scopy(
kOutputDimensions, &input_[input_offset + Offset], 1,
&output_[output_offset], 1
);
#endif
}
}
// backpropagation
void backpropagate(Thread& th,
const LearnFloatType* gradients,
uint64_t offset,
uint64_t count) {
for (IndexType b = offset; b < offset + count; ++b)
{
const IndexType input_offset = kInputDimensions * b;
const IndexType output_offset = kOutputDimensions * b;
IndexType i = 0;
if constexpr (Offset > 0)
{
for (; i < Offset; ++i) {
gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
}
}
for (; i < Offset + kOutputDimensions; ++i) {
gradients_[input_offset + i] = gradients[output_offset + i - Offset];
}
if constexpr (Offset + kOutputDimensions < kInputDimensions)
{
for (; i < kInputDimensions; ++i)
{
gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
}
}
}
shared_input_trainer_->backpropagate(th, gradients_.data(), offset, count);
}
void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) {
shared_input_trainer_->step_end(thread_pool, learning_rate);
}
private:
// constructor
Trainer(FeatureTransformer* ft) :
batch_size_(0),
shared_input_trainer_(SharedInputTrainer::create(ft)) {
}
// number of input/output dimensions
static constexpr IndexType kInputDimensions =
FeatureTransformer::kOutputDimensions;
static constexpr IndexType kOutputDimensions = OutputDimensions;
static_assert(Offset + kOutputDimensions <= kInputDimensions, "");
// number of samples in mini-batch
IndexType batch_size_;
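// output of the shared input trainer, i.e. the full feature-transformer output this slice reads from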
const LearnFloatType* input_;
// Trainer of shared input layer
const std::shared_ptr<SharedInputTrainer> shared_input_trainer_;
// Forward propagation buffer
std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
// buffer for back propagation
std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
};
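// A minimal, self-contained sketch (an assumption about intent, not engine API;
// the function names are invented) of the per-sample arithmetic above: forward
// copies a window of width kOutputDimensions starting at Offset out of the shared
// input, and backward scatters the incoming gradient into a zero-padded buffer of
// width kInputDimensions, so gradient flows only through the selected slice.
inline void slice_forward_sketch(const LearnFloatType* input, LearnFloatType* output,
                                 IndexType offset, IndexType out_dims) {
    for (IndexType i = 0; i < out_dims; ++i)
        output[i] = input[offset + i];        // copy the slice
}

inline void slice_backward_sketch(const LearnFloatType* grad_out, LearnFloatType* grad_in,
                                  IndexType offset, IndexType out_dims, IndexType in_dims) {
    for (IndexType i = 0; i < in_dims; ++i)
        grad_in[i] = static_cast<LearnFloatType>(0.0);   // outside the slice the gradient is zero
    for (IndexType i = 0; i < out_dims; ++i)
        grad_in[offset + i] = grad_out[i];               // inside the slice it passes through unchanged
}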
} // namespace Eval::NNUE
#endif

View File

@@ -1,201 +0,0 @@
#ifndef _NNUE_TRAINER_SUM_H_
#define _NNUE_TRAINER_SUM_H_
#include "trainer.h"
#include "extra/stockfish_blas.h"
#include "learn/learn.h"
#include "nnue/layers/sum.h"
#include "thread.h"
// Specialization of the NNUE trainer class template for the Sum layer
namespace Eval::NNUE {
// Learning: A layer that sums the outputs of multiple layers
template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
class Trainer<Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>> :
Trainer<Layers::Sum<RemainingPreviousLayers...>> {
private:
// Type of layer to learn
using LayerType = Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>;
using Tail = Trainer<Layers::Sum<RemainingPreviousLayers...>>;
public:
// factory function
static std::shared_ptr<Trainer> create(
LayerType* target_layer, FeatureTransformer* ft) {
return std::shared_ptr<Trainer>(
new Trainer(target_layer, ft));
}
// Set options such as hyperparameters
void send_message(Message* message) {
// The results of the other member functions do not depend on the processing order,
// so Tail is handled first there to simplify the implementation; send_message,
// however, handles the head first so that the subscript correspondence stays easy to follow.
previous_layer_trainer_->send_message(message);
Tail::send_message(message);
}
// Initialize the parameters with random numbers
template <typename RNG>
void initialize(RNG& rng) {
Tail::initialize(rng);
previous_layer_trainer_->initialize(rng);
}
// forward propagation
/*const*/ LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector<Example>& batch) {
batch_size_ = static_cast<IndexType>(batch.size());
auto output = Tail::propagate(thread_pool, batch);
const auto head_output = previous_layer_trainer_->propagate(thread_pool, batch);
#if defined(USE_BLAS)
cblas_saxpy(
kOutputDimensions * batch_size_, 1.0,
head_output, 1, output, 1
);
#else
Blas::saxpy(
thread_pool,
kOutputDimensions * batch_size_, 1.0,
head_output, 1, output, 1
);
#endif
return output;
}
// backpropagation
void backpropagate(ThreadPool& thread_pool,
const LearnFloatType* gradients,
LearnFloatType learning_rate) {
Tail::backpropagate(thread_pool, gradients, learning_rate);
previous_layer_trainer_->backpropagate(thread_pool, gradients, learning_rate);
}
private:
// constructor
Trainer(LayerType* target_layer, FeatureTransformer* ft):
Tail(target_layer, ft),
batch_size_(0),
previous_layer_trainer_(Trainer<FirstPreviousLayer>::create(
&target_layer->previous_layer_, ft)),
target_layer_(target_layer) {
}
// number of input/output dimensions
static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
// make the derived Sum-layer trainers friends
template <typename SumLayer>
friend class Trainer;
// number of samples in mini-batch
IndexType batch_size_;
// Trainer of the previous layer
const std::shared_ptr<Trainer<FirstPreviousLayer>> previous_layer_trainer_;
// layer to learn
LayerType* const target_layer_;
};
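// For reference, a plain-loop equivalent (a sketch of the assumed semantics, not
// the code path actually compiled) of the saxpy call above with alpha == 1:
// every element of the head output is added onto the already accumulated tail output.
inline void axpy_one_sketch(IndexType n, const LearnFloatType* x, LearnFloatType* y) {
    for (IndexType i = 0; i < n; ++i)
        y[i] += x[i];   // y := 1.0 * x + y
}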
// Learning: layer that sums the outputs of multiple layers (base case with a single template argument)
template <typename PreviousLayer>
class Trainer<Layers::Sum<PreviousLayer>> {
private:
// Type of layer to learn
using LayerType = Layers::Sum<PreviousLayer>;
public:
// factory function
static std::shared_ptr<Trainer> create(
LayerType* target_layer, FeatureTransformer* ft) {
return std::shared_ptr<Trainer>(
new Trainer(target_layer, ft));
}
// Set options such as hyperparameters
void send_message(Message* message) {
previous_layer_trainer_->send_message(message);
}
// Initialize the parameters with random numbers
template <typename RNG>
void initialize(RNG& rng) {
previous_layer_trainer_->initialize(rng);
}
// forward propagation
/*const*/ LearnFloatType* propagate(const std::vector<Example>& batch) {
if (output_.size() < kOutputDimensions * batch.size()) {
output_.resize(kOutputDimensions * batch.size());
}
batch_size_ = static_cast<IndexType>(batch.size());
const auto output = previous_layer_trainer_->propagate(batch);
#if defined(USE_BLAS)
cblas_scopy(kOutputDimensions * batch_size_, output, 1, &output_[0], 1);
#else
for (IndexType b = 0; b < batch_size_; ++b) {
const IndexType batch_offset = kOutputDimensions * b;
for (IndexType i = 0; i < kOutputDimensions; ++i) {
output_[batch_offset + i] = output[batch_offset + i];
}
}
#endif
return output_.data();
}
// backpropagation
void backpropagate(const LearnFloatType* gradients,
LearnFloatType learning_rate) {
previous_layer_trainer_->backpropagate(gradients, learning_rate);
}
private:
// constructor
Trainer(LayerType* target_layer, FeatureTransformer* ft) :
batch_size_(0),
previous_layer_trainer_(Trainer<PreviousLayer>::create(
&target_layer->previous_layer_, ft)),
target_layer_(target_layer) {
}
// number of input/output dimensions
static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
// make the derived Sum-layer trainers friends
template <typename SumLayer>
friend class Trainer;
// number of samples in mini-batch
IndexType batch_size_;
// Trainer of the previous layer
const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
// layer to learn
LayerType* const target_layer_;
// Forward propagation buffer
std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
};
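// A minimal sketch (illustrative assumption, not the engine's classes) of how the
// variadic recursion above unrolls: Trainer<Sum<A, B, C>> derives from
// Trainer<Sum<B, C>>, which derives from the single-argument base case, so the
// output is accumulated tail-first and the head contribution is added last.
template <typename... Layers>
struct SumUnrollSketch;

template <typename First, typename... Rest>
struct SumUnrollSketch<First, Rest...> : SumUnrollSketch<Rest...> {
    static LearnFloatType propagate(const LearnFloatType* per_layer_outputs) {
        // tail first, then add this (head) layer's contribution
        return SumUnrollSketch<Rest...>::propagate(per_layer_outputs + 1)
             + per_layer_outputs[0];
    }
};

template <typename Last>
struct SumUnrollSketch<Last> {
    static LearnFloatType propagate(const LearnFloatType* per_layer_outputs) {
        return per_layer_outputs[0];  // base case: a single previous layer
    }
};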
} // namespace Eval::NNUE
#endif

View File

@@ -22,11 +22,9 @@
#include <sstream>
#include <string>
#include "extra/stockfish_blas.h"
#include "nnue/evaluate_nnue.h"
#include "evaluate.h"
#include "movegen.h"
#include "nnue/nnue_test_command.h"
#include "position.h"
#include "search.h"
#include "syzygy/tbprobe.h"
@@ -37,7 +35,6 @@
#include "learn/gensfen.h"
#include "learn/gensfen_nonpv.h"
#include "learn/learn.h"
#include "learn/convert.h"
#include "learn/transform.h"
#include "learn/stats.h"
@@ -49,17 +46,6 @@ extern vector<string> setup_bench(const Position&, istream&);
// FEN string of the initial position, normal chess
const char* StartFEN = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1";
void test_cmd(Position& pos, istringstream& is)
{
// Initialize NNUE, since the test may trigger a search.
Eval::NNUE::init();
std::string param;
is >> param;
if (param == "nnue") Eval::NNUE::test_command(pos, is);
}
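// Usage note (inferred from test_cmd above and the "test" dispatch in UCI::loop below):
// typing "test nnue" at the UCI prompt runs Eval::NNUE::test_command() on the current position.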
namespace {
// position() is called when engine receives the "position" UCI command.
@@ -344,7 +330,6 @@ void UCI::loop(int argc, char* argv[]) {
else if (token == "gensfen") Learner::gensfen(is);
else if (token == "gensfen_nonpv") Learner::gensfen_nonpv(is);
else if (token == "learn") Learner::learn(is);
else if (token == "convert") Learner::convert(is);
else if (token == "convert_bin") Learner::convert_bin(is);
else if (token == "convert_plain") Learner::convert_plain(is);
@@ -361,17 +346,7 @@ void UCI::loop(int argc, char* argv[]) {
std::cout << th.thread_idx() << '\n';
});
}
else if (token == "blastest")
{
Blas::test(Threads);
}
else if (token == "blasbench")
{
Blas::bench(Threads);
}
// test command
else if (token == "test") test_cmd(pos, is);
else
sync_cout << "Unknown command: " << cmd << sync_endl;