Added files for NNUE.

2025-12-25 19:46:55 +08:00 · 2019-06-09 10:40:12 +09:00
parent 3edf0e6b37
commit db02ddcc90
37 changed files with 4501 additions and 909 deletions
--- a/src/eval/nnue/architectures/halfkp_256x2-32-32.h
+++ b/src/eval/nnue/architectures/halfkp_256x2-32-32.h
@@ -0,0 +1,35 @@
+// Definition of the input features and network structure used by the NNUE evaluation function
+
+#include "../features/feature_set.h"
+#include "../features/half_kp.h"
+
+#include "../layers/input_slice.h"
+#include "../layers/affine_transform.h"
+#include "../layers/clipped_relu.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+// Input features used by the evaluation function
+using RawFeatures = Features::FeatureSet<
+    Features::HalfKP<Features::Side::kFriend>>;
+
+// Number of dimensions of the input features after transformation
+constexpr IndexType kTransformedFeatureDimensions = 256;
+
+namespace Layers {
+
+// Definition of the network structure
+using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+
+}  // namespace Layers
+
+using Network = Layers::OutputLayer;
+
+}  // namespace NNUE
+
+}  // namespace Eval
--- a/src/eval/nnue/architectures/k-p_256x2-32-32.h
+++ b/src/eval/nnue/architectures/k-p_256x2-32-32.h
@@ -0,0 +1,35 @@
+// Definition of the input features and network structure used by the NNUE evaluation function
+
+#include "../features/feature_set.h"
+#include "../features/k.h"
+#include "../features/p.h"
+
+#include "../layers/input_slice.h"
+#include "../layers/affine_transform.h"
+#include "../layers/clipped_relu.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+// Input features used by the evaluation function
+using RawFeatures = Features::FeatureSet<Features::K, Features::P>;
+
+// Number of dimensions of the input features after transformation
+constexpr IndexType kTransformedFeatureDimensions = 256;
+
+namespace Layers {
+
+// Definition of the network structure
+using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+
+}  // namespace Layers
+
+using Network = Layers::OutputLayer;
+
+}  // namespace NNUE
+
+}  // namespace Eval
--- a/src/eval/nnue/evaluate_nnue.cpp
+++ b/src/eval/nnue/evaluate_nnue.cpp
@@ -0,0 +1,314 @@
+// Code for computing the NNUE evaluation function
+
+#include <fstream>
+
+#include "../../evaluate.h"
+#include "../../position.h"
+#include "../../misc.h"
+
+#include "evaluate_nnue.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+// Input feature transformer
+AlignedPtr<FeatureTransformer> feature_transformer;
+
+// Evaluation function (the network)
+AlignedPtr<Network> network;
+
+// File name of the evaluation function
+const char* const kFileName = "nn.bin";
+
+// Builds the string that describes the structure of the evaluation function
+std::string GetArchitectureString() {
+  const std::string features = "Features=" + FeatureTransformer::GetStructureString();
+  return features + ",Network=" + Network::GetStructureString();
+}
+
+namespace {
+
+namespace Detail {
+
+// Initializes evaluation function parameters: allocates aligned storage for T and zero-fills it
+template <typename T>
+void Initialize(AlignedPtr<T>& pointer) {
+  pointer.reset(reinterpret_cast<T*>(aligned_malloc(sizeof(T), alignof(T))));
+  std::memset(pointer.get(), 0, sizeof(T));
+}
+
+// Reads evaluation function parameters; fails if the stored 32-bit header differs from T's hash
+template <typename T>
+bool ReadParameters(std::istream& stream, const AlignedPtr<T>& pointer) {
+  std::uint32_t header;
+  stream.read(reinterpret_cast<char*>(&header), sizeof(header));
+  if (!stream || header != T::GetHashValue()) return false;
+  return pointer->ReadParameters(stream);
+}
+
+// Writes evaluation function parameters, preceded by T's hash value as a 32-bit header
+template <typename T>
+bool WriteParameters(std::ostream& stream, const AlignedPtr<T>& pointer) {
+  constexpr std::uint32_t header = T::GetHashValue();
+  stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
+  return pointer->WriteParameters(stream);
+}
+
+}  // namespace Detail
+
+// Initializes the evaluation function parameters (feature transformer and network)
+void Initialize() {
+  Detail::Initialize(feature_transformer);
+  Detail::Initialize(network);
+}
+
+}  // namespace
+
+// Reads the header: version, hash value, then the length-prefixed architecture string
+bool ReadHeader(std::istream& stream,
+                std::uint32_t* hash_value, std::string* architecture) {
+  std::uint32_t file_version, name_length;
+  stream.read(reinterpret_cast<char*>(&file_version), sizeof(file_version));
+  stream.read(reinterpret_cast<char*>(hash_value), sizeof(*hash_value));
+  stream.read(reinterpret_cast<char*>(&name_length), sizeof(name_length));
+  if (stream.fail() || file_version != kVersion) return false;
+  architecture->resize(name_length);
+  stream.read(&(*architecture)[0], name_length);
+  return static_cast<bool>(stream);
+}
+
+// Writes the header: version, hash value, then the length-prefixed architecture string
+bool WriteHeader(std::ostream& stream,
+                 std::uint32_t hash_value, const std::string& architecture) {
+  const auto name_length = static_cast<std::uint32_t>(architecture.size());
+  stream.write(reinterpret_cast<const char*>(&kVersion), sizeof(kVersion));
+  stream.write(reinterpret_cast<const char*>(&hash_value), sizeof(hash_value));
+  stream.write(reinterpret_cast<const char*>(&name_length), sizeof(name_length));
+  stream.write(architecture.data(), name_length);
+  return static_cast<bool>(stream);
+}
+
+// Reads the evaluation function parameters; succeeds only if the stream ends right after them
+bool ReadParameters(std::istream& stream) {
+  std::uint32_t stored_hash;
+  std::string stored_architecture;
+  const bool loaded = ReadHeader(stream, &stored_hash, &stored_architecture) &&
+      stored_hash == kHashValue &&
+      Detail::ReadParameters(stream, feature_transformer) &&
+      Detail::ReadParameters(stream, network);
+  return loaded && stream && stream.peek() == std::ios::traits_type::eof();
+}
+
+// Writes the evaluation function parameters: header, feature transformer, then network
+bool WriteParameters(std::ostream& stream) {
+  const bool written = WriteHeader(stream, kHashValue, GetArchitectureString()) &&
+      Detail::WriteParameters(stream, feature_transformer) &&
+      Detail::WriteParameters(stream, network);
+  return written && !stream.fail();
+}
+
+// Advances the incremental computation if it is possible
+static void UpdateAccumulatorIfPossible(const Position& pos) {
+  feature_transformer->UpdateAccumulatorIfPossible(pos);
+}
+
+// Computes the evaluation score; a cached score is reused unless refresh is true
+static Value ComputeScore(const Position& pos, bool refresh = false) {
+  auto& accumulator = pos.state()->accumulator;
+  if (!refresh && accumulator.computed_score) {
+    return accumulator.score;
+  }
+
+  alignas(kCacheLineSize) TransformedFeatureType
+      transformed_features[FeatureTransformer::kBufferSize];
+  feature_transformer->Transform(pos, transformed_features, refresh);
+  alignas(kCacheLineSize) char buffer[Network::kBufferSize];
+  const auto output = network->Propagate(transformed_features, buffer);
+
+  // If a value larger than VALUE_MAX_EVAL is returned, aspiration search fails high and
+  // the search never terminates, so the value must be guaranteed to be <= VALUE_MAX_EVAL.
+
+  // Even when this happens, in games with e.g. a fixed time per move the search is cut off,
+  // and the best move of the previous iteration is played as bestmove, so on the surface
+  // there is no problem. Positions where VALUE_MAX_EVAL is returned are nearly mated,
+  // and such positions mostly appear in the endgame when one side is already far ahead,
+  // so the game result is hardly affected.
+
+  // However, when searching to a fixed depth (e.g. while generating training data) the
+  // search never returns, wasting that thread's time; fixed-depth games would also time out.
+
+  auto score = static_cast<Value>(output[0] / FV_SCALE);
+
+  // 1) Clipping carelessly here might have an effect during training, though...
+  // 2) accumulator.score is not used for incremental computation, so overwriting it is fine.
+  score = Math::clamp(score , -VALUE_MAX_EVAL , VALUE_MAX_EVAL);
+
+  accumulator.score = score;
+  accumulator.computed_score = true;
+  return accumulator.score;
+}
+
+}  // namespace NNUE
+
+#if defined(USE_EVAL_HASH)
+// Class used to store evaluation scores in the HashTable
+struct alignas(16) ScoreKeyValue {
+#if defined(USE_SSE2)
+  ScoreKeyValue() = default;
+  ScoreKeyValue(const ScoreKeyValue& other) {
+    static_assert(sizeof(ScoreKeyValue) == sizeof(__m128i),
+                  "sizeof(ScoreKeyValue) should be equal to sizeof(__m128i)");
+    _mm_store_si128(&as_m128i, other.as_m128i);
+  }
+  ScoreKeyValue& operator=(const ScoreKeyValue& other) {
+    _mm_store_si128(&as_m128i, other.as_m128i);
+    return *this;
+  }
+#endif
+
+  // The evaluate hash must be operated on atomically; these are the operators for that
+  void encode() {
+#if defined(USE_SSE2)
+    // ScoreKeyValue is copied atomically, so if the key matches the data matches as well.
+#else
+    key ^= score;
+#endif
+  }
+  // decode() is the inverse of encode(); since it is an xor, the inverse is the same operation.
+  void decode() { encode(); }
+
+  union {
+    struct {
+      std::uint64_t key;
+      std::uint64_t score;
+    };
+#if defined(USE_SSE2)
+    __m128i as_m128i;
+#endif
+  };
+};
+
+// A simple HashTable implementation.
+// Size must be a power of two.
+template <typename T, size_t Size>
+struct HashTable {
+  HashTable() { clear(); }
+  T* operator [] (const Key k) { return entries_ + (static_cast<size_t>(k) & (Size - 1)); }
+  void clear() { memset(entries_, 0, sizeof(T)*Size); }
+
+  // Check that Size is a power of two
+  static_assert((Size & (Size - 1)) == 0, "");
+
+ private:
+  T entries_[Size];
+};
+
+// HashTable that stores evaluated scores (commonly called "ehash")
+
+#if !defined(USE_LARGE_EVAL_HASH)
+// 134MB (the setting used by "Majo" when not on AVX2)
+struct EvaluateHashTable : HashTable<ScoreKeyValue, 0x800000> {};
+#else
+// With prefetch available, a larger table might be better...
+// -> It makes little difference and wastes memory, so the setting above is fine as the default.
+// 1GB (the setting used by "Majo" on AVX2)
+struct EvaluateHashTable : HashTable<ScoreKeyValue, 0x4000000> {};
+#endif
+
+EvaluateHashTable g_evalTable;
+
+// Prefetches the eval-hash entry for key, with the low 5 address bits masked off
+void prefetch_evalhash(const Key key) {
+  constexpr u64 kAlignMask = ~static_cast<u64>(0x1f);
+  prefetch(reinterpret_cast<void*>(reinterpret_cast<u64>(g_evalTable[key]) & kAlignMask));
+}
+#endif
+
+// Loads the evaluation function file.
+// Commands such as bench save and restore Options, which counts as a change of EvalDir and
+// raises the "evaluation function must be reloaded" flag, so this may be called twice.
+void load_eval() {
+  NNUE::Initialize();
+
+#if defined(EVAL_LEARN)
+  if (!Options["SkipLoadingEval"])
+#endif
+  {
+    const std::string eval_dir = Options["EvalDir"];
+    const std::string eval_path = Path::Combine(eval_dir, NNUE::kFileName);
+    std::ifstream input(eval_path, std::ios::binary);
+    const bool loaded = NNUE::ReadParameters(input);
+
+    // It would be a problem if the program kept running after a
+    // read error, so the failure is reported and the process is
+    // terminated explicitly.
+    if (!loaded) {
+      std::cout << "Error! : failed to read " << NNUE::kFileName << std::endl;
+      my_exit();
+    }
+  }
+}
+
+// Initialization (the body is empty)
+void init() {
+}
+
+// Evaluation function. Performs a full computation rather than an incremental one.
+// Called only once, from Position::set(). (Incremental computation is used afterwards.)
+// Note that it returns the score as seen from the side to move. (The design differs from the other evaluation functions in this respect.)
+// Therefore no effort is spent on optimizing this function.
+Value compute_eval(const Position& pos) {
+  return NNUE::ComputeScore(pos, true);
+}
+
+// Evaluation function: returns the cached score, an eval-hash hit, or a freshly computed score
+Value evaluate(const Position& pos) {
+  const auto& accumulator = pos.state()->accumulator;
+  if (accumulator.computed_score) {
+    return accumulator.score;
+  }
+
+#if defined(USE_GLOBAL_OPTIONS)
+  // If GlobalOptions is configured not to use the eval hash,
+  // skip the eval hash lookup.
+  if (!GlobalOptions.use_eval_hash) {
+    ASSERT_LV5(pos.state()->materialValue == Eval::material(pos));
+    return NNUE::ComputeScore(pos);
+  }
+#endif
+
+#if defined(USE_EVAL_HASH)
+  // It may be in the evaluate hash table.
+  const Key key = pos.state()->key();
+  ScoreKeyValue entry = *g_evalTable[key];
+  entry.decode();
+  if (entry.key == key) {
+    // Found it!
+    return Value(entry.score);
+  }
+#endif
+
+  Value score = NNUE::ComputeScore(pos);
+#if defined(USE_EVAL_HASH)
+  // Since it has been computed anyway, store it in the evaluate hash table.
+  entry.key = key;
+  entry.score = score;
+  entry.encode();
+  *g_evalTable[key] = entry;
+#endif
+
+  return score;
+}
+
+// Advances the incremental computation if it is possible
+void evaluate_with_no_return(const Position& pos) {
+  NNUE::UpdateAccumulatorIfPossible(pos);
+}
+
+// Prints a breakdown of the evaluation of the current position (not implemented for NNUE)
+void print_eval_stat(Position& /*pos*/) {
+  std::cout << "--- EVAL STAT: not implemented" << std::endl;
+}
+
+}  // namespace Eval
--- a/src/eval/nnue/evaluate_nnue.h
+++ b/src/eval/nnue/evaluate_nnue.h
@@ -0,0 +1,64 @@
+// Header used by the NNUE evaluation function
+
+#ifndef _EVALUATE_NNUE_H_
+#define _EVALUATE_NNUE_H_
+
+#if defined(EVAL_NNUE)
+
+#include "nnue_feature_transformer.h"
+#include "nnue_architecture.h"
+
+#include <memory>
+
+namespace Eval {
+
+namespace NNUE {
+
+// Hash value of the evaluation function structure
+constexpr std::uint32_t kHashValue =
+    FeatureTransformer::GetHashValue() ^ Network::GetHashValue();
+
+// Deleter that automates releasing the memory: runs the destructor, then aligned_free
+template <typename T>
+struct AlignedDeleter {
+  void operator()(T* ptr) const {
+    ptr->~T();
+    aligned_free(ptr);
+  }
+};
+template <typename T>
+using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>;
+
+// Input feature transformer
+extern AlignedPtr<FeatureTransformer> feature_transformer;
+
+// Evaluation function (the network)
+extern AlignedPtr<Network> network;
+
+// File name of the evaluation function
+extern const char* const kFileName;
+
+// Returns the string that describes the structure of the evaluation function
+std::string GetArchitectureString();
+
+// Reads the header
+bool ReadHeader(std::istream& stream,
+    std::uint32_t* hash_value, std::string* architecture);
+
+// Writes the header
+bool WriteHeader(std::ostream& stream,
+    std::uint32_t hash_value, const std::string& architecture);
+
+// Reads the evaluation function parameters
+bool ReadParameters(std::istream& stream);
+
+// Writes the evaluation function parameters
+bool WriteParameters(std::ostream& stream);
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
--- a/src/eval/nnue/evaluate_nnue_learner.cpp
+++ b/src/eval/nnue/evaluate_nnue_learner.cpp
@@ -0,0 +1,230 @@
+// Code used when training the NNUE evaluation function
+
+#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#include <random>
+#include <fstream>
+
+#include "../../learn/learn.h"
+#include "../../learn/learning_tools.h"
+
+#include "../../position.h"
+#include "../../usi.h"
+#include "../../misc.h"
+
+#include "../evaluate_common.h"
+
+#include "evaluate_nnue.h"
+#include "evaluate_nnue_learner.h"
+#include "trainer/features/factorizer_feature_set.h"
+#include "trainer/features/factorizer_half_kp.h"
+#include "trainer/trainer_feature_transformer.h"
+#include "trainer/trainer_input_slice.h"
+#include "trainer/trainer_affine_transform.h"
+#include "trainer/trainer_clipped_relu.h"
+#include "trainer/trainer_sum.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace {
+
+// Training data
+std::vector<Example> examples;
+
+// Mutex for exclusive access to examples
+Mutex examples_mutex;
+
+// Number of samples in a mini-batch
+u64 batch_size;
+
+// Random number generator
+std::mt19937 rng;
+
+// Trainer
+std::shared_ptr<Trainer<Network>> trainer;
+
+// Scale of the learning rate
+double global_learning_rate_scale;
+
+// Returns the scale of the learning rate
+double GetGlobalLearningRateScale() {
+  return global_learning_rate_scale;
+}
+
+// Passes options such as hyperparameters to the trainer
+void SendMessages(std::vector<Message> messages) {
+  for (auto& message : messages) {
+    trainer->SendMessage(&message);
+    ASSERT_LV3(message.num_receivers > 0);
+  }
+}
+
+}  // namespace
+
+// Initializes training: creates the trainer and sets up the learning-rate schedule
+void InitializeTraining(double eta1, u64 eta1_epoch,
+                        double eta2, u64 eta2_epoch, double eta3) {
+  std::cout << "Initializing NN training for "
+            << GetArchitectureString() << std::endl;
+
+  ASSERT(feature_transformer);
+  ASSERT(network);
+  trainer = Trainer<Network>::Create(network.get(), feature_transformer.get());
+
+  if (Options["SkipLoadingEval"]) {
+    trainer->Initialize(rng);
+  }
+
+  global_learning_rate_scale = 1.0;
+  EvalLearningTools::Weight::init_eta(eta1, eta2, eta3, eta1_epoch, eta2_epoch);
+}
+
+// Sets the number of samples in a mini-batch
+void SetBatchSize(u64 size) {
+  ASSERT_LV3(size > 0);
+  batch_size = size;
+}
+
+// Sets the scale of the learning rate
+void SetGlobalLearningRateScale(double scale) {
+  global_learning_rate_scale = scale;
+}
+
+// Splits options on ',' into "name" or "name=value" items and sends them as messages
+void SetOptions(const std::string& options) {
+  std::vector<Message> pending;
+  for (const auto& item : Split(options, ',')) {
+    const auto parts = Split(item, '=');
+    ASSERT_LV3(parts.size() == 1 || parts.size() == 2);
+    if (parts.size() != 1) {
+      pending.emplace_back(parts[0], parts[1]);
+      continue;
+    }
+    pending.emplace_back(parts[0]);
+  }
+  SendMessages(std::move(pending));
+}
+
+// Re-reads the evaluation function parameters for training from a file, then sends "reset"
+void RestoreParameters(const std::string& dir_name) {
+  const std::string file_name = Path::Combine(dir_name, NNUE::kFileName);
+  std::ifstream stream(file_name, std::ios::binary);
+  bool result = ReadParameters(stream);
+  ASSERT(result);
+
+  SendMessages({{"reset"}});
+}
+
+// Adds one sample of training data
+void AddExample(Position& pos, Color rootColor,
+                const Learner::PackedSfenValue& psv, double weight) {
+  Example example;
+  if (rootColor == pos.side_to_move()) {
+    example.sign = 1;
+  } else {
+    example.sign = -1;
+  }
+  example.psv = psv;
+  example.weight = weight;
+
+  Features::IndexList active_indices[2];
+  for (const auto trigger : kRefreshTriggers) {
+    RawFeatures::AppendActiveIndices(pos, trigger, active_indices);
+  }
+  if (pos.side_to_move() != BLACK) {  // presumably puts the side to move at index 0 -- TODO confirm
+    active_indices[0].swap(active_indices[1]);
+  }
+  for (const auto color : COLOR) {
+    std::vector<TrainingFeature> training_features;
+    for (const auto base_index : active_indices[color]) {
+      static_assert(Features::Factorizer<RawFeatures>::GetDimensions() <
+                    (1 << TrainingFeature::kIndexBits), "");
+      Features::Factorizer<RawFeatures>::AppendTrainingFeatures(
+          base_index, &training_features);
+    }
+    std::sort(training_features.begin(), training_features.end());
+
+    auto& unique_features = example.training_features[color];
+    for (const auto& feature : training_features) {  // merge entries that share an index
+      if (!unique_features.empty() &&
+          feature.GetIndex() == unique_features.back().GetIndex()) {
+        unique_features.back() += feature;
+      } else {
+        unique_features.push_back(feature);
+      }
+    }
+  }
+
+  std::lock_guard<Mutex> lock(examples_mutex);
+  examples.push_back(std::move(example));
+}
+
+// Updates the evaluation function parameters, one mini-batch at a time
+void UpdateParameters(u64 epoch) {
+  ASSERT_LV3(batch_size > 0);
+
+  EvalLearningTools::Weight::calc_eta(epoch);
+  const auto learning_rate = static_cast<LearnFloatType>(
+      get_eta() / batch_size);
+
+  std::lock_guard<Mutex> lock(examples_mutex);
+  std::shuffle(examples.begin(), examples.end(), rng);
+  while (examples.size() >= batch_size) {  // leftover examples (< batch_size) stay queued
+    std::vector<Example> batch(examples.end() - batch_size, examples.end());
+    examples.resize(examples.size() - batch_size);
+
+    const auto network_output = trainer->Propagate(batch);
+
+    std::vector<LearnFloatType> gradients(batch.size());
+    for (std::size_t b = 0; b < batch.size(); ++b) {
+      const auto shallow = static_cast<Value>(Round<std::int32_t>(
+          batch[b].sign * network_output[b] * kPonanzaConstant));
+      const auto& psv = batch[b].psv;
+      const double gradient = batch[b].sign * Learner::calc_grad(shallow, psv);
+      gradients[b] = static_cast<LearnFloatType>(gradient * batch[b].weight);
+    }
+
+    trainer->Backpropagate(gradients.data(), learning_rate);
+  }
+  SendMessages({{"quantize_parameters"}});
+}
+
+// Checks whether any problem has occurred in training
+void CheckHealth() {
+  SendMessages({{"check_health"}});
+}
+
+}  // namespace NNUE
+
+// Saves the evaluation function parameters to a file
+void save_eval(std::string dir_name) {
+  auto eval_dir = Path::Combine(Options["EvalSaveDir"], dir_name);
+  std::cout << "save_eval() start. folder = " << eval_dir << std::endl;
+
+  // If this folder already exists mkdir() fails, but
+  // that does not matter; we only want it created when missing.
+  // The folders up to EvalSaveDir are assumed to exist already.
+  Dependency::mkdir(eval_dir);
+
+  if (Options["SkipLoadingEval"] && NNUE::trainer) {
+    NNUE::SendMessages({{"clear_unobserved_feature_weights"}});
+  }
+
+  const std::string file_name = Path::Combine(eval_dir, NNUE::kFileName);
+  std::ofstream stream(file_name, std::ios::binary);
+  const bool result = NNUE::WriteParameters(stream);
+  ASSERT(result);
+
+  std::cout << "save_eval() finished. folder = " << eval_dir << std::endl;
+}
+
+// Returns the current eta (the global learning-rate scale multiplied by Weight::eta)
+double get_eta() {
+  return NNUE::GetGlobalLearningRateScale() * EvalLearningTools::Weight::eta;
+}
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
--- a/src/eval/nnue/evaluate_nnue_learner.h
+++ b/src/eval/nnue/evaluate_nnue_learner.h
@@ -0,0 +1,48 @@
+// Interface used for training the NNUE evaluation function
+
+#ifndef _EVALUATE_NNUE_LEARNER_H_
+#define _EVALUATE_NNUE_LEARNER_H_
+
+#include "../../config.h"
+
+#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#include "../../learn/learn.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+// Initializes training
+void InitializeTraining(double eta1, u64 eta1_epoch,
+                        double eta2, u64 eta2_epoch, double eta3);
+
+// Sets the number of samples in a mini-batch
+void SetBatchSize(u64 size);
+
+// Sets the scale of the learning rate
+void SetGlobalLearningRateScale(double scale);
+
+// Sets options such as hyperparameters
+void SetOptions(const std::string& options);
+
+// Re-reads the evaluation function parameters for training from a file
+void RestoreParameters(const std::string& dir_name);
+
+// Adds one sample of training data
+void AddExample(Position& pos, Color rootColor,
+                const Learner::PackedSfenValue& psv, double weight);
+
+// Updates the evaluation function parameters
+void UpdateParameters(u64 epoch);
+
+// Checks whether any problem has occurred in training
+void CheckHealth();
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#endif
--- a/src/eval/nnue/features/feature_set.h
+++ b/src/eval/nnue/features/feature_set.h
@@ -0,0 +1,249 @@
+// Class template that represents the input feature set of the NNUE evaluation function
+
+#ifndef _NNUE_FEATURE_SET_H_
+#define _NNUE_FEATURE_SET_H_
+
+#if defined(EVAL_NNUE)
+
+#include "features_common.h"
+#include <array>
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// Class template that represents a list of values
+template <typename T, T... Values>
+struct CompileTimeList;
+template <typename T, T First, T... Remaining>
+struct CompileTimeList<T, First, Remaining...> {
+  static constexpr bool Contains(T value) {
+    return value == First || CompileTimeList<T, Remaining...>::Contains(value);
+  }
+  static constexpr std::array<T, sizeof...(Remaining) + 1>
+      kValues = {{First, Remaining...}};
+};
+template <typename T, T First, T... Remaining>
+constexpr std::array<T, sizeof...(Remaining) + 1>
+    CompileTimeList<T, First, Remaining...>::kValues;
+template <typename T>
+struct CompileTimeList<T> {
+  static constexpr bool Contains(T /*value*/) {
+    return false;
+  }
+  static constexpr std::array<T, 0> kValues = {{}};
+};
+
+// Class template that adds a value to the front of a list
+template <typename T, typename ListType, T Value>
+struct AppendToList;
+template <typename T, T... Values, T AnotherValue>
+struct AppendToList<T, CompileTimeList<T, Values...>, AnotherValue> {
+  using Result = CompileTimeList<T, AnotherValue, Values...>;
+};
+
+// Class template that inserts a value into a sorted list without duplicates
+template <typename T, typename ListType, T Value>
+struct InsertToSet;
+template <typename T, T First, T... Remaining, T AnotherValue>
+struct InsertToSet<T, CompileTimeList<T, First, Remaining...>, AnotherValue> {
+  using Result = std::conditional_t<
+      CompileTimeList<T, First, Remaining...>::Contains(AnotherValue),
+      CompileTimeList<T, First, Remaining...>,
+      std::conditional_t<(AnotherValue < First),
+          CompileTimeList<T, AnotherValue, First, Remaining...>,
+          typename AppendToList<T, typename InsertToSet<
+              T, CompileTimeList<T, Remaining...>, AnotherValue>::Result,
+              First>::Result>>;
+};
+template <typename T, T Value>
+struct InsertToSet<T, CompileTimeList<T>, Value> {
+  using Result = CompileTimeList<T, Value>;
+};
+
+// Base class of feature sets; Derived supplies CollectActiveIndices / CollectChangedIndices
+template <typename Derived>
+class FeatureSetBase {
+ public:
+  // Collects, for each perspective, the indices of the features whose value is 1
+  template <typename IndexListType>
+  static void AppendActiveIndices(
+      const Position& pos, TriggerEvent trigger, IndexListType active[2]) {
+    for (const auto perspective : COLOR) {
+      Derived::CollectActiveIndices(
+          pos, trigger, perspective, &active[perspective]);
+    }
+  }
+
+  // Collects the indices whose value changed since the previous move; reset[] is always written
+  template <typename PositionType, typename IndexListType>
+  static void AppendChangedIndices(
+      const PositionType& pos, TriggerEvent trigger,
+      IndexListType removed[2], IndexListType added[2], bool reset[2]) {
+    const auto& dp = pos.state()->dirtyPiece;
+
+    for (const auto perspective : COLOR) {
+      reset[perspective] = false;
+      if (dp.dirty_num == 0) continue;  // nothing moved: no changes, but reset[] stays defined
+      switch (trigger) {
+        case TriggerEvent::kNone:
+          break;
+        case TriggerEvent::kFriendKingMoved:
+          reset[perspective] =
+              dp.pieceNo[0] == PIECE_NUMBER_KING + perspective;
+          break;
+        case TriggerEvent::kEnemyKingMoved:
+          reset[perspective] =
+              dp.pieceNo[0] == PIECE_NUMBER_KING + ~perspective;
+          break;
+        case TriggerEvent::kAnyKingMoved:
+          reset[perspective] = dp.pieceNo[0] >= PIECE_NUMBER_KING;
+          break;
+        case TriggerEvent::kAnyPieceMoved:
+          reset[perspective] = true;
+          break;
+        default:
+          ASSERT_LV5(false);
+          break;
+      }
+      if (reset[perspective]) {
+        Derived::CollectActiveIndices(
+            pos, trigger, perspective, &added[perspective]);
+      } else {
+        Derived::CollectChangedIndices(
+            pos, trigger, perspective,
+            &removed[perspective], &added[perspective]);
+      }
+    }
+  }
+};
+
+// Class template that represents a feature set
+// To keep the run-time cost linear, internal processing is done in reverse order of the template arguments
+template <typename FirstFeatureType, typename... RemainingFeatureTypes>
+class FeatureSet<FirstFeatureType, RemainingFeatureTypes...> :
+    public FeatureSetBase<
+        FeatureSet<FirstFeatureType, RemainingFeatureTypes...>> {
+ private:
+  using Head = FirstFeatureType;
+  using Tail = FeatureSet<RemainingFeatureTypes...>;
+
+ public:
+  // Hash value embedded in the evaluation function file
+  static constexpr std::uint32_t kHashValue =
+      Head::kHashValue ^ (Tail::kHashValue << 1) ^ (Tail::kHashValue >> 31);
+  // Number of feature dimensions
+  static constexpr IndexType kDimensions =
+      Head::kDimensions + Tail::kDimensions;
+  // Maximum number of indices whose value is 1 at the same time
+  static constexpr IndexType kMaxActiveDimensions =
+      Head::kMaxActiveDimensions + Tail::kMaxActiveDimensions;
+  // List of the timings at which a full computation is done instead of an incremental one
+  using SortedTriggerSet = typename InsertToSet<TriggerEvent,
+      typename Tail::SortedTriggerSet, Head::kRefreshTrigger>::Result;
+  static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
+
+  // Returns the feature name
+  static std::string GetName() {
+    return std::string(Head::kName) + "+" + Tail::GetName();
+  }
+
+ private:
+  // Collects the indices of the features whose value is 1
+  template <typename IndexListType>
+  static void CollectActiveIndices(
+      const Position& pos, const TriggerEvent trigger, const Color perspective,
+      IndexListType* const active) {
+    Tail::CollectActiveIndices(pos, trigger, perspective, active);
+    if (Head::kRefreshTrigger == trigger) {
+      const auto start = active->size();
+      Head::AppendActiveIndices(pos, perspective, active);
+      for (auto i = start; i < active->size(); ++i) {
+        (*active)[i] += Tail::kDimensions;  // Head's indices are placed after Tail's range
+      }
+    }
+  }
+
+  // Collects the indices of the features whose value changed since the previous move
+  template <typename IndexListType>
+  static void CollectChangedIndices(
+      const Position& pos, const TriggerEvent trigger, const Color perspective,
+      IndexListType* const removed, IndexListType* const added) {
+    Tail::CollectChangedIndices(pos, trigger, perspective, removed, added);
+    if (Head::kRefreshTrigger == trigger) {
+      const auto start_removed = removed->size();
+      const auto start_added = added->size();
+      Head::AppendChangedIndices(pos, perspective, removed, added);
+      for (auto i = start_removed; i < removed->size(); ++i) {
+        (*removed)[i] += Tail::kDimensions;
+      }
+      for (auto i = start_added; i < added->size(); ++i) {
+        (*added)[i] += Tail::kDimensions;
+      }
+    }
+  }
+
+  // Make the base class and the class template that uses itself recursively friends
+  friend class FeatureSetBase<FeatureSet>;
+  template <typename... FeatureTypes>
+  friend class FeatureSet;
+};
+
+// Class template that represents a feature set
+// Specialization for the case of a single template argument
+template <typename FeatureType>
+class FeatureSet<FeatureType> : public FeatureSetBase<FeatureSet<FeatureType>> {
+ public:
+  // Hash value embedded in the evaluation function file
+  static constexpr std::uint32_t kHashValue = FeatureType::kHashValue;
+  // Number of feature dimensions
+  static constexpr IndexType kDimensions = FeatureType::kDimensions;
+  // Maximum number of indices whose value is 1 at the same time
+  static constexpr IndexType kMaxActiveDimensions =
+      FeatureType::kMaxActiveDimensions;
+  // List of the timings at which a full computation is done instead of an incremental one
+  using SortedTriggerSet =
+      CompileTimeList<TriggerEvent, FeatureType::kRefreshTrigger>;
+  static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
+
+  // Returns the feature name
+  static std::string GetName() {
+    return FeatureType::kName;
+  }
+
+ private:
+  // Collects the indices of the features whose value is 1
+  static void CollectActiveIndices(
+      const Position& pos, const TriggerEvent trigger, const Color perspective,
+      IndexList* const active) {
+    if (FeatureType::kRefreshTrigger == trigger) {
+      FeatureType::AppendActiveIndices(pos, perspective, active);
+    }
+  }
+
+  // Collects the indices of the features whose value changed since the previous move
+  static void CollectChangedIndices(
+      const Position& pos, const TriggerEvent trigger, const Color perspective,
+      IndexList* const removed, IndexList* const added) {
+    if (FeatureType::kRefreshTrigger == trigger) {
+      FeatureType::AppendChangedIndices(pos, perspective, removed, added);
+    }
+  }
+
+  // Make the base class and the class template that uses itself recursively friends
+  friend class FeatureSetBase<FeatureSet>;
+  template <typename... FeatureTypes>
+  friend class FeatureSet;
+};
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
--- a/src/eval/nnue/features/features_common.h
+++ b/src/eval/nnue/features/features_common.h
@@ -0,0 +1,47 @@
+// Common header for the input features of the NNUE evaluation function
+
+#ifndef _NNUE_FEATURES_COMMON_H_
+#define _NNUE_FEATURES_COMMON_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../../../evaluate.h"
+#include "../nnue_common.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// Type of the index list
+class IndexList;
+
+// Class template that represents a feature set
+template <typename... FeatureTypes>
+class FeatureSet;
+
+// Kinds of timing at which a full computation is done instead of an incremental one
+enum class TriggerEvent {
+  kNone,             // always compute incrementally when possible
+  kFriendKingMoved,  // full computation when the own king has moved
+  kEnemyKingMoved,   // full computation when the enemy king has moved
+  kAnyKingMoved,     // full computation when either king has moved
+  kAnyPieceMoved,    // always do a full computation
+};
+
+// Side to move or the opponent
+enum class Side {
+  kFriend,  // side to move
+  kEnemy,   // opponent
+};
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
--- a/src/eval/nnue/features/half_kp.cpp
+++ b/src/eval/nnue/features/half_kp.cpp
@@ -0,0 +1,78 @@
+// Definition of the input feature HalfKP of the NNUE evaluation function
+
+#if defined(EVAL_NNUE)
+
+#include "half_kp.h"
+#include "index_list.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// Computes the feature index from the king square and a BonaPiece: fe_end * sq_k + p
+template <Side AssociatedKing>
+inline IndexType HalfKP<AssociatedKing>::MakeIndex(Square sq_k, BonaPiece p) {
+  return static_cast<IndexType>(fe_end) * static_cast<IndexType>(sq_k) + p;
+}
+
+// Gets the piece information: the piece list for the perspective and the target king's square
+template <Side AssociatedKing>
+inline void HalfKP<AssociatedKing>::GetPieces(
+    const Position& pos, Color perspective,
+    BonaPiece** pieces, Square* sq_target_k) {
+  *pieces = (perspective == BLACK) ?
+      pos.eval_list()->piece_list_fb() :
+      pos.eval_list()->piece_list_fw();
+  const PieceNumber target = (AssociatedKing == Side::kFriend) ?
+      static_cast<PieceNumber>(PIECE_NUMBER_KING + perspective) :
+      static_cast<PieceNumber>(PIECE_NUMBER_KING + ~perspective);
+  *sq_target_k = static_cast<Square>(((*pieces)[target] - f_king) % SQ_NB);
+}
+
+// Appends the indices of the features whose value is 1
+template <Side AssociatedKing>
+void HalfKP<AssociatedKing>::AppendActiveIndices(
+    const Position& pos, Color perspective, IndexList* active) {
+  // To avoid a compiler warning, do nothing when the array size is too small
+  if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
+
+  BonaPiece* pieces;
+  Square sq_target_k;
+  GetPieces(pos, perspective, &pieces, &sq_target_k);
+  for (PieceNumber i = PIECE_NUMBER_ZERO; i < PIECE_NUMBER_KING; ++i) {
+    active->push_back(MakeIndex(sq_target_k, pieces[i]));
+  }
+}
+
+// Appends the indices of the features whose value changed since the previous move
+template <Side AssociatedKing>
+void HalfKP<AssociatedKing>::AppendChangedIndices(
+    const Position& pos, Color perspective,
+    IndexList* removed, IndexList* added) {
+  BonaPiece* pieces;
+  Square sq_target_k;
+  GetPieces(pos, perspective, &pieces, &sq_target_k);
+  const auto& dp = pos.state()->dirtyPiece;
+  for (int i = 0; i < dp.dirty_num; ++i) {
+    if (dp.pieceNo[i] >= PIECE_NUMBER_KING) continue;  // king entries are skipped
+    const auto old_p = static_cast<BonaPiece>(
+        dp.changed_piece[i].old_piece.from[perspective]);
+    removed->push_back(MakeIndex(sq_target_k, old_p));
+    const auto new_p = static_cast<BonaPiece>(
+        dp.changed_piece[i].new_piece.from[perspective]);
+    added->push_back(MakeIndex(sq_target_k, new_p));
+  }
+}
+
+template class HalfKP<Side::kFriend>;  // explicit instantiation for the friend-king variant
+template class HalfKP<Side::kEnemy>;   // explicit instantiation for the enemy-king variant
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
--- a/src/eval/nnue/features/half_kp.h
+++ b/src/eval/nnue/features/half_kp.h
@@ -0,0 +1,62 @@
+// Definition of the input feature HalfKP for the NNUE evaluation function
+
+#ifndef _NNUE_FEATURES_HALF_KP_H_
+#define _NNUE_FEATURES_HALF_KP_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../../../evaluate.h"
+#include "features_common.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// Feature HalfKP: combination of the position of the friendly or enemy king and the position of a non-king piece
+template <Side AssociatedKing>
+class HalfKP {
+ public:
+  // Feature name
+  static constexpr const char* kName =
+      (AssociatedKing == Side::kFriend) ? "HalfKP(Friend)" : "HalfKP(Enemy)";
+  // Hash value embedded in the evaluation function file
+  static constexpr std::uint32_t kHashValue =
+      0x5D69D5B9u ^ (AssociatedKing == Side::kFriend);
+  // Number of feature dimensions
+  static constexpr IndexType kDimensions =
+      static_cast<IndexType>(SQ_NB) * static_cast<IndexType>(fe_end);
+  // Maximum number of feature indices whose value can be 1 at the same time
+  static constexpr IndexType kMaxActiveDimensions = PIECE_NUMBER_KING;
+  // Event that triggers a full recomputation instead of an incremental (difference) update
+  static constexpr TriggerEvent kRefreshTrigger =
+      (AssociatedKing == Side::kFriend) ?
+      TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
+
+  // Get the list of feature indices whose value is 1 (the active features)
+  static void AppendActiveIndices(const Position& pos, Color perspective,
+                                  IndexList* active);
+
+  // Get the list of feature indices whose value changed since the previous move
+  static void AppendChangedIndices(const Position& pos, Color perspective,
+                                   IndexList* removed, IndexList* added);
+
+  // Compute the feature index from the king square and a BonaPiece
+  static IndexType MakeIndex(Square sq_k, BonaPiece p);
+
+ private:
+  // Get piece information
+  static void GetPieces(const Position& pos, Color perspective,
+                        BonaPiece** pieces, Square* sq_target_k);
+};
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
--- a/src/eval/nnue/features/half_relative_kp.cpp
+++ b/src/eval/nnue/features/half_relative_kp.cpp
@@ -0,0 +1,91 @@
+// Definition of the input feature HalfRelativeKP for the NNUE evaluation function
+
+#if defined(EVAL_NNUE)
+
+#include "half_relative_kp.h"
+#include "index_list.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// Compute the feature index from the king square and a BonaPiece
+template <Side AssociatedKing>
+inline IndexType HalfRelativeKP<AssociatedKing>::MakeIndex(
+    Square sq_k, BonaPiece p) {
+  constexpr IndexType W = kBoardWidth;
+  constexpr IndexType H = kBoardHeight;
+  const IndexType piece_index = (p - fe_hand_end) / SQ_NB;
+  const Square sq_p = static_cast<Square>((p - fe_hand_end) % SQ_NB);
+  const IndexType relative_file = file_of(sq_p) - file_of(sq_k) + (W / 2);
+  const IndexType relative_rank = rank_of(sq_p) - rank_of(sq_k) + (H / 2);
+  return H * W * piece_index + H * relative_file + relative_rank;
+}
+
+// Get piece information: the piece list selected by `perspective` and the square of the target king
+template <Side AssociatedKing>
+inline void HalfRelativeKP<AssociatedKing>::GetPieces(
+    const Position& pos, Color perspective,
+    BonaPiece** pieces, Square* sq_target_k) {
+  *pieces = (perspective == BLACK) ?
+      pos.eval_list()->piece_list_fb() :
+      pos.eval_list()->piece_list_fw();
+  const PieceNumber target = (AssociatedKing == Side::kFriend) ?
+      static_cast<PieceNumber>(PIECE_NUMBER_KING + perspective) :
+      static_cast<PieceNumber>(PIECE_NUMBER_KING + ~perspective);
+  *sq_target_k = static_cast<Square>(((*pieces)[target] - f_king) % SQ_NB);
+}
+
+// Get the list of feature indices whose value is 1 (the active features)
+template <Side AssociatedKing>
+void HalfRelativeKP<AssociatedKing>::AppendActiveIndices(
+    const Position& pos, Color perspective, IndexList* active) {
+  // To avoid a compiler warning, do nothing when the array size is too small
+  if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
+
+  BonaPiece* pieces;
+  Square sq_target_k;
+  GetPieces(pos, perspective, &pieces, &sq_target_k);
+  for (PieceNumber i = PIECE_NUMBER_ZERO; i < PIECE_NUMBER_KING; ++i) {
+    if (pieces[i] >= fe_hand_end) {  // presumably skips pieces in hand (below fe_hand_end)
+      active->push_back(MakeIndex(sq_target_k, pieces[i]));
+    }
+  }
+}
+
+// Get the list of feature indices whose value changed since the previous move
+template <Side AssociatedKing>
+void HalfRelativeKP<AssociatedKing>::AppendChangedIndices(
+    const Position& pos, Color perspective,
+    IndexList* removed, IndexList* added) {
+  BonaPiece* pieces;
+  Square sq_target_k;
+  GetPieces(pos, perspective, &pieces, &sq_target_k);
+  const auto& dp = pos.state()->dirtyPiece;
+  for (int i = 0; i < dp.dirty_num; ++i) {
+    if (dp.pieceNo[i] >= PIECE_NUMBER_KING) continue;  // presumably king entries; not handled here
+    const auto old_p = static_cast<BonaPiece>(
+        dp.changed_piece[i].old_piece.from[perspective]);
+    if (old_p >= fe_hand_end) {
+      removed->push_back(MakeIndex(sq_target_k, old_p));
+    }
+    const auto new_p = static_cast<BonaPiece>(
+        dp.changed_piece[i].new_piece.from[perspective]);
+    if (new_p >= fe_hand_end) {
+      added->push_back(MakeIndex(sq_target_k, new_p));
+    }
+  }
+}
+
+template class HalfRelativeKP<Side::kFriend>;  // explicit instantiation
+template class HalfRelativeKP<Side::kEnemy>;  // explicit instantiation
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
--- a/src/eval/nnue/features/half_relative_kp.h
+++ b/src/eval/nnue/features/half_relative_kp.h
@@ -0,0 +1,68 @@
+// Definition of the input feature HalfRelativeKP for the NNUE evaluation function
+
+#ifndef _NNUE_FEATURES_HALF_RELATIVE_KP_H_
+#define _NNUE_FEATURES_HALF_RELATIVE_KP_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../../../evaluate.h"
+#include "features_common.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// Feature HalfRelativeKP: position of each non-king piece relative to the friendly or enemy king
+template <Side AssociatedKing>
+class HalfRelativeKP {
+ public:
+  // Feature name
+  static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
+      "HalfRelativeKP(Friend)" : "HalfRelativeKP(Enemy)";
+  // Hash value embedded in the evaluation function file
+  static constexpr std::uint32_t kHashValue =
+      0xF9180919u ^ (AssociatedKing == Side::kFriend);
+  // Number of piece kinds, excluding the king
+  static constexpr IndexType kNumPieceKinds = (fe_end - fe_hand_end) / SQ_NB;
+  // Width of the virtual board that has the king placed at its center
+  static constexpr IndexType kBoardWidth = FILE_NB * 2 - 1;
+  // Height of the virtual board that has the king placed at its center
+  static constexpr IndexType kBoardHeight = RANK_NB * 2 - 1;
+  // Number of feature dimensions
+  static constexpr IndexType kDimensions =
+      kNumPieceKinds * kBoardHeight * kBoardWidth;
+  // Maximum number of feature indices whose value can be 1 at the same time
+  static constexpr IndexType kMaxActiveDimensions = PIECE_NUMBER_KING;
+  // Event that triggers a full recomputation instead of an incremental (difference) update
+  static constexpr TriggerEvent kRefreshTrigger =
+      (AssociatedKing == Side::kFriend) ?
+      TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
+
+  // Get the list of feature indices whose value is 1 (the active features)
+  static void AppendActiveIndices(const Position& pos, Color perspective,
+                                  IndexList* active);
+
+  // Get the list of feature indices whose value changed since the previous move
+  static void AppendChangedIndices(const Position& pos, Color perspective,
+                                   IndexList* removed, IndexList* added);
+
+  // Compute the feature index from the king square and a BonaPiece
+  static IndexType MakeIndex(Square sq_k, BonaPiece p);
+
+ private:
+  // Get piece information
+  static void GetPieces(const Position& pos, Color perspective,
+                        BonaPiece** pieces, Square* sq_target_k);
+};
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
--- a/src/eval/nnue/features/index_list.h
+++ b/src/eval/nnue/features/index_list.h
@@ -0,0 +1,55 @@
+// Definition of the index list for input features
+
+#ifndef _NNUE_FEATURES_INDEX_LIST_H_
+#define _NNUE_FEATURES_INDEX_LIST_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../../../position.h"
+#include "../nnue_architecture.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// Class template used for feature index lists (fixed-capacity array of MaxSize elements plus a size counter)
+template <typename T, std::size_t MaxSize>
+class ValueList {
+ public:
+  std::size_t size() const { return size_; }
+  void resize(std::size_t size) { size_ = size; }  // only updates the counter; elements are untouched
+  void push_back(const T& value) { values_[size_++] = value; }  // no bounds check against MaxSize
+  T& operator[](std::size_t index) { return values_[index]; }
+  T* begin() { return values_; }
+  T* end() { return values_ + size_; }
+  const T& operator[](std::size_t index) const { return values_[index]; }
+  const T* begin() const { return values_; }
+  const T* end() const { return values_ + size_; }
+  void swap(ValueList& other) {
+    const std::size_t max_size = std::max(size_, other.size_);  // cover the longer of the two lists
+    for (std::size_t i = 0; i < max_size; ++i) {
+      std::swap(values_[i], other.values_[i]);
+    }
+    std::swap(size_, other.size_);
+  }
+ private:
+  T values_[MaxSize];
+  std::size_t size_ = 0;
+};
+
+// Type of the feature index list (capacity: RawFeatures::kMaxActiveDimensions)
+class IndexList
+    : public ValueList<IndexType, RawFeatures::kMaxActiveDimensions> {
+};
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
--- a/src/eval/nnue/features/k.cpp
+++ b/src/eval/nnue/features/k.cpp
@@ -0,0 +1,49 @@
+// Definition of the input feature K for the NNUE evaluation function
+
+#if defined(EVAL_NNUE)
+
+#include "k.h"
+#include "index_list.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// Get the list of feature indices whose value is 1 (the active features)
+void K::AppendActiveIndices(
+    const Position& pos, Color perspective, IndexList* active) {
+  // To avoid a compiler warning, do nothing when the array size is too small
+  if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
+
+  const BonaPiece* pieces = (perspective == BLACK) ?
+      pos.eval_list()->piece_list_fb() :
+      pos.eval_list()->piece_list_fw();
+  ASSERT_LV5(pieces[PIECE_NUMBER_BKING] != BONA_PIECE_ZERO);
+  ASSERT_LV5(pieces[PIECE_NUMBER_WKING] != BONA_PIECE_ZERO);
+  for (PieceNumber i = PIECE_NUMBER_KING; i < PIECE_NUMBER_NB; ++i) {
+    active->push_back(pieces[i] - fe_end);  // index is the BonaPiece offset from fe_end
+  }
+}
+
+// Get the list of feature indices whose value changed since the previous move
+void K::AppendChangedIndices(
+    const Position& pos, Color perspective,
+    IndexList* removed, IndexList* added) {
+  const auto& dp = pos.state()->dirtyPiece;
+  if (dp.pieceNo[0] >= PIECE_NUMBER_KING) {  // only dirty entry 0 is inspected
+    removed->push_back(
+        dp.changed_piece[0].old_piece.from[perspective] - fe_end);
+    added->push_back(
+        dp.changed_piece[0].new_piece.from[perspective] - fe_end);
+  }
+}
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
--- a/src/eval/nnue/features/k.h
+++ b/src/eval/nnue/features/k.h
@@ -0,0 +1,48 @@
+// Definition of the input feature K for the NNUE evaluation function
+
+#ifndef _NNUE_FEATURES_K_H_
+#define _NNUE_FEATURES_K_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../../../evaluate.h"
+#include "features_common.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// Feature K: position of the kings
+class K {
+ public:
+  // Feature name
+  static constexpr const char* kName = "K";
+  // Hash value embedded in the evaluation function file
+  static constexpr std::uint32_t kHashValue = 0xD3CEE169u;
+  // Number of feature dimensions
+  static constexpr IndexType kDimensions = SQ_NB * 2;
+  // Maximum number of feature indices whose value can be 1 at the same time
+  static constexpr IndexType kMaxActiveDimensions = 2;
+  // Event that triggers a full recomputation instead of an incremental (difference) update
+  static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+
+  // Get the list of feature indices whose value is 1 (the active features)
+  static void AppendActiveIndices(const Position& pos, Color perspective,
+                                  IndexList* active);
+
+  // Get the list of feature indices whose value changed since the previous move
+  static void AppendChangedIndices(const Position& pos, Color perspective,
+                                   IndexList* removed, IndexList* added);
+};
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
--- a/src/eval/nnue/features/p.cpp
+++ b/src/eval/nnue/features/p.cpp
@@ -0,0 +1,46 @@
+// Definition of the input feature P for the NNUE evaluation function
+
+#if defined(EVAL_NNUE)
+
+#include "p.h"
+#include "index_list.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// Get the list of feature indices whose value is 1 (the active features)
+void P::AppendActiveIndices(
+    const Position& pos, Color perspective, IndexList* active) {
+  // To avoid a compiler warning, do nothing when the array size is too small
+  if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
+
+  const BonaPiece* pieces = (perspective == BLACK) ?
+      pos.eval_list()->piece_list_fb() :
+      pos.eval_list()->piece_list_fw();
+  for (PieceNumber i = PIECE_NUMBER_ZERO; i < PIECE_NUMBER_KING; ++i) {
+    active->push_back(pieces[i]);  // the BonaPiece value itself is the feature index
+  }
+}
+
+// Get the list of feature indices whose value changed since the previous move
+void P::AppendChangedIndices(
+    const Position& pos, Color perspective,
+    IndexList* removed, IndexList* added) {
+  const auto& dp = pos.state()->dirtyPiece;
+  for (int i = 0; i < dp.dirty_num; ++i) {
+    if (dp.pieceNo[i] >= PIECE_NUMBER_KING) continue;  // presumably king entries; not handled here
+    removed->push_back(dp.changed_piece[i].old_piece.from[perspective]);
+    added->push_back(dp.changed_piece[i].new_piece.from[perspective]);
+  }
+}
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
--- a/src/eval/nnue/features/p.h
+++ b/src/eval/nnue/features/p.h
@@ -0,0 +1,48 @@
+// Definition of the input feature P for the NNUE evaluation function
+
+#ifndef _NNUE_FEATURES_P_H_
+#define _NNUE_FEATURES_P_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../../../evaluate.h"
+#include "features_common.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// Feature P: BonaPiece of each non-king piece
+class P {
+ public:
+  // Feature name
+  static constexpr const char* kName = "P";
+  // Hash value embedded in the evaluation function file
+  static constexpr std::uint32_t kHashValue = 0x764CFB4Bu;
+  // Number of feature dimensions
+  static constexpr IndexType kDimensions = fe_end;
+  // Maximum number of feature indices whose value can be 1 at the same time
+  static constexpr IndexType kMaxActiveDimensions = PIECE_NUMBER_KING;
+  // Event that triggers a full recomputation instead of an incremental (difference) update
+  static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+
+  // Get the list of feature indices whose value is 1 (the active features)
+  static void AppendActiveIndices(const Position& pos, Color perspective,
+                                  IndexList* active);
+
+  // Get the list of feature indices whose value changed since the previous move
+  static void AppendChangedIndices(const Position& pos, Color perspective,
+                                   IndexList* removed, IndexList* added);
+};
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
--- a/src/eval/nnue/layers/affine_transform.h
+++ b/src/eval/nnue/layers/affine_transform.h
@@ -0,0 +1,170 @@
+// Definition of the AffineTransform layer of the NNUE evaluation function
+
+#ifndef _NNUE_LAYERS_AFFINE_TRANSFORM_H_
+#define _NNUE_LAYERS_AFFINE_TRANSFORM_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../nnue_common.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Layers {
+
+// Affine transformation layer: output[i] = biases_[i] + sum_j weights_[i][j] * input[j]
+template <typename PreviousLayer, IndexType OutputDimensions>
+class AffineTransform {
+ public:
+  // Input/output types
+  using InputType = typename PreviousLayer::OutputType;
+  using OutputType = std::int32_t;
+  static_assert(std::is_same<InputType, std::uint8_t>::value, "");
+
+  // Number of input/output dimensions
+  static constexpr IndexType kInputDimensions =
+      PreviousLayer::kOutputDimensions;
+  static constexpr IndexType kOutputDimensions = OutputDimensions;
+  static constexpr IndexType kPaddedInputDimensions =
+      CeilToMultiple<IndexType>(kInputDimensions, kMaxSimdWidth);
+
+  // Size of the forward-propagation buffer used by this layer
+  static constexpr std::size_t kSelfBufferSize =
+      CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+
+  // Size of the forward-propagation buffer used from the input layer up to this layer
+  static constexpr std::size_t kBufferSize =
+      PreviousLayer::kBufferSize + kSelfBufferSize;
+
+  // Hash value embedded in the evaluation function file
+  static constexpr std::uint32_t GetHashValue() {
+    std::uint32_t hash_value = 0xCC03DAE4u;
+    hash_value += kOutputDimensions;
+    hash_value ^= PreviousLayer::GetHashValue() >> 1;
+    hash_value ^= PreviousLayer::GetHashValue() << 31;
+    return hash_value;
+  }
+
+  // String representing the structure from the input layer up to this layer
+  static std::string GetStructureString() {
+    return "AffineTransform[" +
+        std::to_string(kOutputDimensions) + "<-" +
+        std::to_string(kInputDimensions) + "](" +
+        PreviousLayer::GetStructureString() + ")";
+  }
+
+  // Read the parameters (previous layers first, then this layer's biases and weights)
+  bool ReadParameters(std::istream& stream) {
+    if (!previous_layer_.ReadParameters(stream)) return false;
+    stream.read(reinterpret_cast<char*>(biases_),
+                kOutputDimensions * sizeof(BiasType));
+    stream.read(reinterpret_cast<char*>(weights_),
+                kOutputDimensions * kPaddedInputDimensions *
+                sizeof(WeightType));
+    return !stream.fail();
+  }
+
+  // Write the parameters (previous layers first, then this layer's biases and weights)
+  bool WriteParameters(std::ostream& stream) const {
+    if (!previous_layer_.WriteParameters(stream)) return false;
+    stream.write(reinterpret_cast<const char*>(biases_),
+                 kOutputDimensions * sizeof(BiasType));
+    stream.write(reinterpret_cast<const char*>(weights_),
+                 kOutputDimensions * kPaddedInputDimensions *
+                 sizeof(WeightType));
+    return !stream.fail();
+  }
+
+  // Forward propagation; this layer's output occupies the start of `buffer`
+  const OutputType* Propagate(
+      const TransformedFeatureType* transformed_features, char* buffer) const {
+    const auto input = previous_layer_.Propagate(
+        transformed_features, buffer + kSelfBufferSize);
+    const auto output = reinterpret_cast<OutputType*>(buffer);
+#if defined(USE_AVX2)
+    constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+    const __m256i kOnes = _mm256_set1_epi16(1);
+    const auto input_vector = reinterpret_cast<const __m256i*>(input);
+#elif defined(USE_SSE41)
+    constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+    const __m128i kOnes = _mm_set1_epi16(1);
+    const auto input_vector = reinterpret_cast<const __m128i*>(input);
+#elif defined(IS_ARM)
+    constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+    const auto input_vector = reinterpret_cast<const int8x8_t*>(input);
+#endif
+    for (IndexType i = 0; i < kOutputDimensions; ++i) {
+      const IndexType offset = i * kPaddedInputDimensions;  // start of weight row i
+#if defined(USE_AVX2)
+      __m256i sum = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, biases_[i]);
+      const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
+      for (IndexType j = 0; j < kNumChunks; ++j) {
+        __m256i product = _mm256_maddubs_epi16(
+            _mm256_load_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
+        product = _mm256_madd_epi16(product, kOnes);
+        sum = _mm256_add_epi32(sum, product);
+      }
+      sum = _mm256_hadd_epi32(sum, sum);
+      sum = _mm256_hadd_epi32(sum, sum);
+      const __m128i lo = _mm256_extracti128_si256(sum, 0);
+      const __m128i hi = _mm256_extracti128_si256(sum, 1);
+      output[i] = _mm_cvtsi128_si32(lo) + _mm_cvtsi128_si32(hi);
+#elif defined(USE_SSE41)
+      __m128i sum = _mm_cvtsi32_si128(biases_[i]);
+      const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
+      for (IndexType j = 0; j < kNumChunks; ++j) {
+        __m128i product = _mm_maddubs_epi16(
+            _mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j]));
+        product = _mm_madd_epi16(product, kOnes);
+        sum = _mm_add_epi32(sum, product);
+      }
+      sum = _mm_hadd_epi32(sum, sum);
+      sum = _mm_hadd_epi32(sum, sum);
+      output[i] = _mm_cvtsi128_si32(sum);
+#elif defined(IS_ARM)
+      int32x4_t sum = {biases_[i]};
+      const auto row = reinterpret_cast<const int8x8_t*>(&weights_[offset]);
+      for (IndexType j = 0; j < kNumChunks; ++j) {
+        int16x8_t product = vmull_s8(input_vector[j * 2], row[j * 2]);
+        product = vmlal_s8(product, input_vector[j * 2 + 1], row[j * 2 + 1]);
+        sum = vpadalq_s16(sum, product);
+      }
+      output[i] = sum[0] + sum[1] + sum[2] + sum[3];
+#else
+      OutputType sum = biases_[i];
+      for (IndexType j = 0; j < kInputDimensions; ++j) {
+        sum += weights_[offset + j] * input[j];
+      }
+      output[i] = sum;
+#endif
+    }
+    return output;
+  }
+
+ private:
+  // Parameter types
+  using BiasType = OutputType;
+  using WeightType = std::int8_t;
+
+  // Make the trainer class a friend
+  friend class Trainer<AffineTransform>;
+
+  // The layer immediately preceding this one
+  PreviousLayer previous_layer_;
+
+  // Parameters (weights_ is stored row by row, each row padded to kPaddedInputDimensions)
+  alignas(kCacheLineSize) BiasType biases_[kOutputDimensions];
+  alignas(kCacheLineSize)
+      WeightType weights_[kOutputDimensions * kPaddedInputDimensions];
+};
+
+}  // namespace Layers
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
--- a/src/eval/nnue/layers/clipped_relu.h
+++ b/src/eval/nnue/layers/clipped_relu.h
@@ -0,0 +1,140 @@
+// Definition of the ClippedReLU layer of the NNUE evaluation function
+
+#ifndef _NNUE_LAYERS_CLIPPED_RELU_H_
+#define _NNUE_LAYERS_CLIPPED_RELU_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../nnue_common.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Layers {
+
+// Clipped ReLU: shifts each input right by kWeightScaleBits and clamps it to [0, 127]
+template <typename PreviousLayer>
+class ClippedReLU {
+ public:
+  // Input/output types
+  using InputType = typename PreviousLayer::OutputType;
+  using OutputType = std::uint8_t;
+  static_assert(std::is_same<InputType, std::int32_t>::value, "");
+
+  // Number of input/output dimensions
+  static constexpr IndexType kInputDimensions =
+      PreviousLayer::kOutputDimensions;
+  static constexpr IndexType kOutputDimensions = kInputDimensions;
+
+  // Size of the forward-propagation buffer used by this layer
+  static constexpr std::size_t kSelfBufferSize =
+      CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+
+  // Size of the forward-propagation buffer used from the input layer up to this layer
+  static constexpr std::size_t kBufferSize =
+      PreviousLayer::kBufferSize + kSelfBufferSize;
+
+  // Hash value embedded in the evaluation function file
+  static constexpr std::uint32_t GetHashValue() {
+    std::uint32_t hash_value = 0x538D24C7u;
+    hash_value += PreviousLayer::GetHashValue();
+    return hash_value;
+  }
+
+  // String representing the structure from the input layer up to this layer
+  static std::string GetStructureString() {
+    return "ClippedReLU[" +
+        std::to_string(kOutputDimensions) + "](" +
+        PreviousLayer::GetStructureString() + ")";
+  }
+
+  // Read the parameters (this layer has none; delegates to the previous layer)
+  bool ReadParameters(std::istream& stream) {
+    return previous_layer_.ReadParameters(stream);
+  }
+
+  // Write the parameters (this layer has none; delegates to the previous layer)
+  bool WriteParameters(std::ostream& stream) const {
+    return previous_layer_.WriteParameters(stream);
+  }
+
+  // Forward propagation; this layer's output occupies the start of `buffer`
+  const OutputType* Propagate(
+      const TransformedFeatureType* transformed_features, char* buffer) const {
+    const auto input = previous_layer_.Propagate(
+        transformed_features, buffer + kSelfBufferSize);
+    const auto output = reinterpret_cast<OutputType*>(buffer);
+#if defined(USE_AVX2)
+    constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
+    const __m256i kZero = _mm256_setzero_si256();
+    const __m256i kOffsets = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+    const auto in = reinterpret_cast<const __m256i*>(input);
+    const auto out = reinterpret_cast<__m256i*>(output);
+    for (IndexType i = 0; i < kNumChunks; ++i) {
+      const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
+          _mm256_load_si256(&in[i * 4 + 0]),
+          _mm256_load_si256(&in[i * 4 + 1])), kWeightScaleBits);
+      const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
+          _mm256_load_si256(&in[i * 4 + 2]),
+          _mm256_load_si256(&in[i * 4 + 3])), kWeightScaleBits);
+      _mm256_store_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
+          _mm256_packs_epi16(words0, words1), kZero), kOffsets));
+    }
+    constexpr IndexType kStart = kNumChunks * kSimdWidth;
+#elif defined(USE_SSE41)
+    constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
+    const __m128i kZero = _mm_setzero_si128();
+    const auto in = reinterpret_cast<const __m128i*>(input);
+    const auto out = reinterpret_cast<__m128i*>(output);
+    for (IndexType i = 0; i < kNumChunks; ++i) {
+      const __m128i words0 = _mm_srai_epi16(_mm_packs_epi32(
+          _mm_load_si128(&in[i * 4 + 0]),
+          _mm_load_si128(&in[i * 4 + 1])), kWeightScaleBits);
+      const __m128i words1 = _mm_srai_epi16(_mm_packs_epi32(
+          _mm_load_si128(&in[i * 4 + 2]),
+          _mm_load_si128(&in[i * 4 + 3])), kWeightScaleBits);
+      _mm_store_si128(&out[i], _mm_max_epi8(
+          _mm_packs_epi16(words0, words1), kZero));
+    }
+    constexpr IndexType kStart = kNumChunks * kSimdWidth;
+#elif defined(IS_ARM)
+    constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2);
+    const int8x8_t kZero = {0};
+    const auto in = reinterpret_cast<const int32x4_t*>(input);
+    const auto out = reinterpret_cast<int8x8_t*>(output);
+    for (IndexType i = 0; i < kNumChunks; ++i) {
+      int16x8_t shifted;
+      const auto pack = reinterpret_cast<int16x4_t*>(&shifted);
+      pack[0] = vqshrn_n_s32(in[i * 2 + 0], kWeightScaleBits);
+      pack[1] = vqshrn_n_s32(in[i * 2 + 1], kWeightScaleBits);
+      out[i] = vmax_s8(vqmovn_s16(shifted), kZero);
+    }
+    constexpr IndexType kStart = kNumChunks * (kSimdWidth / 2);
+#else
+    constexpr IndexType kStart = 0;
+#endif
+    for (IndexType i = kStart; i < kInputDimensions; ++i) {  // scalar path for elements not covered above
+      output[i] = static_cast<OutputType>(
+          std::max(0, std::min(127, input[i] >> kWeightScaleBits)));
+    }
+    return output;
+  }
+
+ private:
+  // Make the trainer class a friend
+  friend class Trainer<ClippedReLU>;
+
+  // The layer immediately preceding this one
+  PreviousLayer previous_layer_;
+};
+
+}  // namespace Layers
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
--- a/src/eval/nnue/layers/input_slice.h
+++ b/src/eval/nnue/layers/input_slice.h
@@ -0,0 +1,74 @@
+// Definition of the InputSlice layer of the NNUE evaluation function
+
+#ifndef _NNUE_LAYERS_INPUT_SLICE_H_
+#define _NNUE_LAYERS_INPUT_SLICE_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../nnue_common.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Layers {
+
+// Input layer: exposes the transformed features starting at element `Offset`
+template <IndexType OutputDimensions, IndexType Offset = 0>
+class InputSlice {
+ public:
+  // Alignment must be preserved
+  static_assert(Offset % kMaxSimdWidth == 0, "");
+
+  // Output type
+  using OutputType = TransformedFeatureType;
+
+  // Number of output dimensions
+  static constexpr IndexType kOutputDimensions = OutputDimensions;
+
+  // Size of the forward-propagation buffer used from the input layer up to this layer
+  static constexpr std::size_t kBufferSize = 0;
+
+  // Hash value embedded in the evaluation function file
+  static constexpr std::uint32_t GetHashValue() {
+    std::uint32_t hash_value = 0xEC42E90Du;
+    hash_value ^= kOutputDimensions ^ (Offset << 10);
+    return hash_value;
+  }
+
+  // String representing the structure from the input layer up to this layer
+  static std::string GetStructureString() {
+    return "InputSlice[" + std::to_string(kOutputDimensions) + "(" +
+        std::to_string(Offset) + ":" +
+        std::to_string(Offset + kOutputDimensions) + ")]";
+  }
+
+  // Read the parameters (nothing to read; always succeeds)
+  bool ReadParameters(std::istream& /*stream*/) {
+    return true;
+  }
+
+  // Write the parameters (nothing to write; always succeeds)
+  bool WriteParameters(std::ostream& /*stream*/) const {
+    return true;
+  }
+
+  // Forward propagation: returns a pointer into the input, the buffer is unused
+  const OutputType* Propagate(
+      const TransformedFeatureType* transformed_features,
+      char* /*buffer*/) const {
+    return transformed_features + Offset;
+  }
+
+ private:
+};
+
+}  // namespace Layers
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
--- a/src/eval/nnue/layers/sum.h
+++ b/src/eval/nnue/layers/sum.h
@@ -0,0 +1,165 @@
+// Definition of the Sum layer of the NNUE evaluation function
+
+#ifndef _NNUE_LAYERS_SUM_H_
+#define _NNUE_LAYERS_SUM_H_
+
+#include "../../../config.h"
+
+#if defined(EVAL_NNUE)
+
+#include "../nnue_common.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Layers {
+
+// Layer that takes the sum of the outputs of multiple layers
+template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
+class Sum : public Sum<RemainingPreviousLayers...> {
+ private:
+  using Head = FirstPreviousLayer;
+  using Tail = Sum<RemainingPreviousLayers...>;
+
+ public:
+  // Input/output types
+  using InputType = typename Head::OutputType;
+  using OutputType = InputType;
+  static_assert(std::is_same<InputType, typename Tail::InputType>::value, "");
+
+  // Number of input/output dimensions
+  static constexpr IndexType kInputDimensions = Head::kOutputDimensions;
+  static constexpr IndexType kOutputDimensions = kInputDimensions;
+  static_assert(kInputDimensions == Tail::kInputDimensions , "");
+
+  // Size of the forward-propagation buffer used by this layer
+  static constexpr std::size_t kSelfBufferSize =
+      CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+
+  // Size of the forward-propagation buffer used from the input layer up to this layer
+  static constexpr std::size_t kBufferSize =
+      std::max(Head::kBufferSize + kSelfBufferSize, Tail::kBufferSize);
+
+  // Hash value embedded in the evaluation function file
+  static constexpr std::uint32_t GetHashValue() {
+    std::uint32_t hash_value = 0xBCE400B4u;
+    hash_value ^= Head::GetHashValue() >> 1;
+    hash_value ^= Head::GetHashValue() << 31;
+    hash_value ^= Tail::GetHashValue() >> 2;
+    hash_value ^= Tail::GetHashValue() << 30;
+    return hash_value;
+  }
+
+  // String representing the structure from the input layer up to this layer
+  static std::string GetStructureString() {
+    return "Sum[" +
+        std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
+  }
+
+  // Read the parameters (the Tail summands first, then the Head layer)
+  bool ReadParameters(std::istream& stream) {
+    if (!Tail::ReadParameters(stream)) return false;
+    return previous_layer_.ReadParameters(stream);
+  }
+
+  // Write the parameters (the Tail summands first, then the Head layer)
+  bool WriteParameters(std::ostream& stream) const {
+    if (!Tail::WriteParameters(stream)) return false;
+    return previous_layer_.WriteParameters(stream);
+  }
+
+  // Forward propagation
+  const OutputType* Propagate(
+      const TransformedFeatureType* transformed_features, char* buffer) const {
+    Tail::Propagate(transformed_features, buffer);  // NOTE(review): assumes Tail's result lands at `buffer` -- confirm
+    const auto head_output = previous_layer_.Propagate(
+        transformed_features, buffer + kSelfBufferSize);
+    const auto output = reinterpret_cast<OutputType*>(buffer);
+    for (IndexType i = 0; i < kOutputDimensions; ++i) {
+      output[i] += head_output[i];
+    }
+    return output;
+  }
+
+ protected:
+  // String representing the list of layers that are summed
+  static std::string GetSummandsString() {
+    return Head::GetStructureString() + "," + Tail::GetSummandsString();
+  }
+
+  // Make the trainer class a friend
+  friend class Trainer<Sum>;
+
+  // The layer immediately preceding this one
+  FirstPreviousLayer previous_layer_;
+};
+
+// Layer that takes the sum of the outputs of multiple layers (specialization for a single template argument)
+template <typename PreviousLayer>
+class Sum<PreviousLayer> {
+ public:
+  // Input/output types
+  using InputType = typename PreviousLayer::OutputType;
+  using OutputType = InputType;
+
+  // Number of input/output dimensions
+  static constexpr IndexType kInputDimensions =
+      PreviousLayer::kOutputDimensions;
+  static constexpr IndexType kOutputDimensions = kInputDimensions;
+
+  // Size of the forward-propagation buffer used from the input layer up to this layer
+  static constexpr std::size_t kBufferSize = PreviousLayer::kBufferSize;
+
+  // Hash value embedded in the evaluation function file
+  static constexpr std::uint32_t GetHashValue() {
+    std::uint32_t hash_value = 0xBCE400B4u;
+    hash_value ^= PreviousLayer::GetHashValue() >> 1;
+    hash_value ^= PreviousLayer::GetHashValue() << 31;
+    return hash_value;
+  }
+
+  // String representing the structure from the input layer up to this layer
+  static std::string GetStructureString() {
+    return "Sum[" +
+        std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
+  }
+
+  // Read the parameters (delegates to the single previous layer)
+  bool ReadParameters(std::istream& stream) {
+    return previous_layer_.ReadParameters(stream);
+  }
+
+  // Write the parameters (delegates to the single previous layer)
+  bool WriteParameters(std::ostream& stream) const {
+    return previous_layer_.WriteParameters(stream);
+  }
+
+  // Forward propagation (with one summand, the previous layer's output is returned unchanged)
+  const OutputType* Propagate(
+      const TransformedFeatureType* transformed_features, char* buffer) const {
+    return previous_layer_.Propagate(transformed_features, buffer);
+  }
+
+ protected:
+  // String representing the list of layers that are summed
+  static std::string GetSummandsString() {
+    return PreviousLayer::GetStructureString();
+  }
+
+  // Make the trainer class a friend
+  friend class Trainer<Sum>;
+
+  // The layer immediately preceding this one
+  PreviousLayer previous_layer_;
+};
+
+}  // namespace Layers
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
--- a/src/eval/nnue/nnue_accumulator.h
+++ b/src/eval/nnue/nnue_accumulator.h
@@ -0,0 +1,32 @@
+// Class used for the incremental (difference) computation of the NNUE evaluation function
+
+#ifndef _NNUE_ACCUMULATOR_H_
+#define _NNUE_ACCUMULATOR_H_
+
+#include "../../config.h"
+
+#if defined(EVAL_NNUE)
+
+#include "nnue_architecture.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+// Class that holds the result of applying the affine transformation to the input features
+// The evaluation value, which is the final output, is stored alongside it
+struct alignas(32) Accumulator {
+  std::int16_t
+      accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];
+  Value score = VALUE_ZERO;
+  bool computed_accumulation = false;
+  bool computed_score = false;
+};
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
--- a/src/eval/nnue/nnue_architecture.h
+++ b/src/eval/nnue/nnue_architecture.h
@@ -0,0 +1,36 @@
+// NNUE評価関数で用いる入力特徴量とネットワーク構造
+
+#ifndef _NNUE_ARCHITECTURE_H_
+#define _NNUE_ARCHITECTURE_H_
+
+#if defined(EVAL_NNUE)
+
+// 入力特徴量とネットワーク構造が定義されたヘッダをincludeする
+
+// KP256型を使いたいときは、これを事前にdefineする。
+#if defined(EVAL_NNUE_KP256)
+#include "architectures/k-p_256x2-32-32.h"
+#else // #if defined(EVAL_NNUE_HALFKP256)
+
+// NNUE評価関数のデフォルトは、halfKP256
+#include "architectures/halfkp_256x2-32-32.h"
+#endif
+
+namespace Eval {
+
+namespace NNUE {
+
+static_assert(kTransformedFeatureDimensions % kMaxSimdWidth == 0, "");
+static_assert(Network::kOutputDimensions == 1, "");  // single output value
+static_assert(std::is_same<Network::OutputType, std::int32_t>::value, "");
+
+// List of the timings at which a full computation replaces the incremental one
+constexpr auto kRefreshTriggers = RawFeatures::kRefreshTriggers;
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
--- a/src/eval/nnue/nnue_common.h
+++ b/src/eval/nnue/nnue_common.h
@@ -0,0 +1,54 @@
+// NNUE評価関数で用いる定数など
+
+#ifndef _NNUE_COMMON_H_
+#define _NNUE_COMMON_H_
+
+#if defined(EVAL_NNUE)
+
+namespace Eval {
+
+namespace NNUE {
+
+// Constant representing the version of the evaluation function file
+constexpr std::uint32_t kVersion = 0x7AF32F16u;
+
+// Constants used when computing the evaluation value
+constexpr int FV_SCALE = 16;
+constexpr int kWeightScaleBits = 6;
+
+// Size of a cache line (in bytes)
+constexpr std::size_t kCacheLineSize = 64;
+
+// SIMD width (in bytes)
+#if defined(USE_AVX2)
+constexpr std::size_t kSimdWidth = 32;
+#elif defined(USE_SSE2)
+constexpr std::size_t kSimdWidth = 16;
+#elif defined(IS_ARM)
+constexpr std::size_t kSimdWidth = 16;
+#endif  // kSimdWidth stays undefined when none of the macros above is set
+constexpr std::size_t kMaxSimdWidth = 32;
+
+// Type of the transformed input features
+using TransformedFeatureType = std::uint8_t;
+
+// Type of indices
+using IndexType = std::uint32_t;
+
+// Forward declaration of the class template used for training
+template <typename Layer>
+class Trainer;
+
+// Find the smallest multiple of base that is greater than or equal to n
+template <typename IntType>
+constexpr IntType CeilToMultiple(IntType n, IntType base) {
+  return base * ((n + base - 1) / base);
+}
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
--- a/src/eval/nnue/nnue_feature_transformer.h
+++ b/src/eval/nnue/nnue_feature_transformer.h
@@ -0,0 +1,323 @@
+// NNUE評価関数の入力特徴量の変換を行うクラス
+
+#ifndef _NNUE_FEATURE_TRANSFORMER_H_
+#define _NNUE_FEATURE_TRANSFORMER_H_
+
+#if defined(EVAL_NNUE)
+
+#include "nnue_common.h"
+#include "nnue_architecture.h"
+#include "features/index_list.h"
+
+#include <cstring> // std::memset()
+
+namespace Eval {
+
+namespace NNUE {
+
+// 入力特徴量変換器
+class FeatureTransformer {
+ private:
+  // Number of output dimensions for one side (one perspective)
+  static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;
+
+ public:
+  // Output type
+  using OutputType = TransformedFeatureType;
+
+  // Number of input and output dimensions
+  static constexpr IndexType kInputDimensions = RawFeatures::kDimensions;
+  static constexpr IndexType kOutputDimensions = kHalfDimensions * 2;
+
+  // Size of the buffer used for forward propagation
+  static constexpr std::size_t kBufferSize =
+      kOutputDimensions * sizeof(OutputType);
+
+  // Hash value embedded in the evaluation function file
+  static constexpr std::uint32_t GetHashValue() {
+    return RawFeatures::kHashValue ^ kOutputDimensions;
+  }
+
+  // String describing the structure, e.g. Name[inputs->halfx2]
+  static std::string GetStructureString() {
+    const std::string inputs = std::to_string(kInputDimensions);
+    const std::string outputs = std::to_string(kHalfDimensions);
+    return RawFeatures::GetName() + "[" + inputs + "->" + outputs + "x2]";
+  }
+
+  // Read the parameters: the biases first, immediately followed by the weights
+  bool ReadParameters(std::istream& stream) {
+    stream.read(reinterpret_cast<char*>(biases_),
+                kHalfDimensions * sizeof(BiasType))
+        .read(reinterpret_cast<char*>(weights_),
+              kHalfDimensions * kInputDimensions * sizeof(WeightType));
+    return !stream.fail();
+  }
+
+  // Write the parameters: the biases first, immediately followed by the weights
+  bool WriteParameters(std::ostream& stream) const {
+    stream.write(reinterpret_cast<const char*>(biases_),
+                 kHalfDimensions * sizeof(BiasType))
+        .write(reinterpret_cast<const char*>(weights_),
+               kHalfDimensions * kInputDimensions * sizeof(WeightType));
+    return !stream.fail();
+  }
+
+  // Bring the accumulator up to date incrementally when that is possible
+  bool UpdateAccumulatorIfPossible(const Position& pos) const {
+    const auto current = pos.state();
+    if (!current->accumulator.computed_accumulation) {
+      // Only the directly preceding state can serve as the starting point.
+      const auto parent = current->previous;
+      if (!parent || !parent->accumulator.computed_accumulation) {
+        return false;
+      }
+      UpdateAccumulator(pos);
+    }
+    return true;
+  }
+
+  // Transform input features: update accumulator, write clamped sums to output
+  void Transform(const Position& pos, OutputType* output, bool refresh) const {
+    if (refresh || !UpdateAccumulatorIfPossible(pos)) {
+      RefreshAccumulator(pos);  // forced, or no incremental update possible
+    }
+    const auto& accumulation = pos.state()->accumulator.accumulation;
+#if defined(USE_AVX2)
+    constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+    constexpr int kControl = 0b11011000;  // 64-bit lane order 0, 2, 1, 3
+    const __m256i kZero = _mm256_setzero_si256();
+#elif defined(USE_SSE41)
+    constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+    const __m128i kZero = _mm_setzero_si128();
+#elif defined(IS_ARM)
+    constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+    const int8x8_t kZero = {0};
+#endif
+    const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
+    for (IndexType p = 0; p < 2; ++p) {  // p == 0 is the side to move
+      const IndexType offset = kHalfDimensions * p;  // half of output per side
+#if defined(USE_AVX2)
+      auto out = reinterpret_cast<__m256i*>(&output[offset]);
+      for (IndexType j = 0; j < kNumChunks; ++j) {
+        __m256i sum0 = _mm256_load_si256(&reinterpret_cast<const __m256i*>(
+            accumulation[perspectives[p]][0])[j * 2 + 0]);
+        __m256i sum1 = _mm256_load_si256(&reinterpret_cast<const __m256i*>(
+            accumulation[perspectives[p]][0])[j * 2 + 1]);
+        for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+          sum0 = _mm256_add_epi16(sum0, reinterpret_cast<const __m256i*>(
+              accumulation[perspectives[p]][i])[j * 2 + 0]);
+          sum1 = _mm256_add_epi16(sum1, reinterpret_cast<const __m256i*>(
+              accumulation[perspectives[p]][i])[j * 2 + 1]);
+        }
+        _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
+            _mm256_packs_epi16(sum0, sum1), kZero), kControl));
+      }
+#elif defined(USE_SSE41)
+      auto out = reinterpret_cast<__m128i*>(&output[offset]);
+      for (IndexType j = 0; j < kNumChunks; ++j) {
+        __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
+            accumulation[perspectives[p]][0])[j * 2 + 0]);
+        __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
+            accumulation[perspectives[p]][0])[j * 2 + 1]);
+        for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+          sum0 = _mm_add_epi16(sum0, reinterpret_cast<const __m128i*>(
+              accumulation[perspectives[p]][i])[j * 2 + 0]);
+          sum1 = _mm_add_epi16(sum1, reinterpret_cast<const __m128i*>(
+              accumulation[perspectives[p]][i])[j * 2 + 1]);
+        }
+        _mm_store_si128(&out[j], _mm_max_epi8(
+            _mm_packs_epi16(sum0, sum1), kZero));
+      }
+#elif defined(IS_ARM)
+      const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
+      for (IndexType j = 0; j < kNumChunks; ++j) {
+        int16x8_t sum = reinterpret_cast<const int16x8_t*>(
+            accumulation[perspectives[p]][0])[j];
+        for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+          sum = vaddq_s16(sum, reinterpret_cast<const int16x8_t*>(
+              accumulation[perspectives[p]][i])[j]);
+        }
+        out[j] = vmax_s8(vqmovn_s16(sum), kZero);
+      }
+#else
+      for (IndexType j = 0; j < kHalfDimensions; ++j) {
+        BiasType sum = accumulation[perspectives[p]][0][j];
+        for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+          sum += accumulation[perspectives[p]][i][j];  // add every trigger
+        }
+        output[offset + j] = static_cast<OutputType>(
+            std::max<int>(0, std::min<int>(127, sum)));  // clamp to [0, 127]
+      }
+#endif
+    }
+  }
+
+ private:
+  // Compute the accumulated values from scratch, without incremental updates
+  void RefreshAccumulator(const Position& pos) const {
+    auto& accumulator = pos.state()->accumulator;
+    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+      Features::IndexList active_indices[2];
+      RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
+                                       active_indices);
+      for (const auto perspective : COLOR) {
+        if (i == 0) {  // only the first trigger starts from the biases
+          std::memcpy(accumulator.accumulation[perspective][i], biases_,
+                      kHalfDimensions * sizeof(BiasType));
+        } else {  // the other triggers start from zero
+          std::memset(accumulator.accumulation[perspective][i], 0,
+                      kHalfDimensions * sizeof(BiasType));
+        }
+        for (const auto index : active_indices[perspective]) {
+          const IndexType offset = kHalfDimensions * index;  // weight column
+#if defined(USE_AVX2)
+          auto accumulation = reinterpret_cast<__m256i*>(
+              &accumulator.accumulation[perspective][i][0]);
+          auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
+          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+          for (IndexType j = 0; j < kNumChunks; ++j) {
+            accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]);
+          }
+#elif defined(USE_SSE2)
+          auto accumulation = reinterpret_cast<__m128i*>(
+              &accumulator.accumulation[perspective][i][0]);
+          auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
+          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+          for (IndexType j = 0; j < kNumChunks; ++j) {
+            accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
+          }
+#elif defined(IS_ARM)
+          auto accumulation = reinterpret_cast<int16x8_t*>(
+              &accumulator.accumulation[perspective][i][0]);
+          auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
+          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+          for (IndexType j = 0; j < kNumChunks; ++j) {
+            accumulation[j] = vaddq_s16(accumulation[j], column[j]);
+          }
+#else
+          for (IndexType j = 0; j < kHalfDimensions; ++j) {
+            accumulator.accumulation[perspective][i][j] += weights_[offset + j];
+          }
+#endif
+        }
+      }
+    }
+
+    accumulator.computed_accumulation = true;
+    accumulator.computed_score = false;
+  }
+
+  // Compute the accumulation incrementally, reading the previous one in place
+  void UpdateAccumulator(const Position& pos) const {
+    const auto& prev_accumulator = pos.state()->previous->accumulator;
+    auto& accumulator = pos.state()->accumulator;  // the one being filled
+    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+      Features::IndexList removed_indices[2], added_indices[2];
+      bool reset[2];
+      RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
+                                        removed_indices, added_indices, reset);
+      for (const auto perspective : COLOR) {
+#if defined(USE_AVX2)
+        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+        auto accumulation = reinterpret_cast<__m256i*>(
+            &accumulator.accumulation[perspective][i][0]);
+#elif defined(USE_SSE2)
+        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+        auto accumulation = reinterpret_cast<__m128i*>(
+            &accumulator.accumulation[perspective][i][0]);
+#elif defined(IS_ARM)
+        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+        auto accumulation = reinterpret_cast<int16x8_t*>(
+            &accumulator.accumulation[perspective][i][0]);
+#endif
+        if (reset[perspective]) {  // restart from the initial values
+          if (i == 0) {
+            std::memcpy(accumulator.accumulation[perspective][i], biases_,
+                        kHalfDimensions * sizeof(BiasType));
+          } else {
+            std::memset(accumulator.accumulation[perspective][i], 0,
+                        kHalfDimensions * sizeof(BiasType));
+          }
+        } else {  // incremental update for features that changed from 1 to 0
+          std::memcpy(accumulator.accumulation[perspective][i],
+                      prev_accumulator.accumulation[perspective][i],
+                      kHalfDimensions * sizeof(BiasType));
+          for (const auto index : removed_indices[perspective]) {
+            const IndexType offset = kHalfDimensions * index;
+#if defined(USE_AVX2)
+            auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {
+              accumulation[j] = _mm256_sub_epi16(accumulation[j], column[j]);
+            }
+#elif defined(USE_SSE2)
+            auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {
+              accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]);
+            }
+#elif defined(IS_ARM)
+            auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {
+              accumulation[j] = vsubq_s16(accumulation[j], column[j]);
+            }
+#else
+            for (IndexType j = 0; j < kHalfDimensions; ++j) {
+              accumulator.accumulation[perspective][i][j] -=
+                  weights_[offset + j];
+            }
+#endif
+          }
+        }
+        {  // incremental update for features that changed from 0 to 1
+          for (const auto index : added_indices[perspective]) {
+            const IndexType offset = kHalfDimensions * index;
+#if defined(USE_AVX2)
+            auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {
+              accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]);
+            }
+#elif defined(USE_SSE2)
+            auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {
+              accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
+            }
+#elif defined(IS_ARM)
+            auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {
+              accumulation[j] = vaddq_s16(accumulation[j], column[j]);
+            }
+#else
+            for (IndexType j = 0; j < kHalfDimensions; ++j) {
+              accumulator.accumulation[perspective][i][j] +=
+                  weights_[offset + j];
+            }
+#endif
+          }
+        }
+      }
+    }
+
+    accumulator.computed_accumulation = true;
+    accumulator.computed_score = false;
+  }
+
+  // Parameter types
+  using BiasType = std::int16_t;
+  using WeightType = std::int16_t;
+
+  // Make the trainer class a friend
+  friend class Trainer<FeatureTransformer>;
+
+  // Parameters
+  alignas(kCacheLineSize) BiasType biases_[kHalfDimensions];
+  alignas(kCacheLineSize)  // kHalfDimensions consecutive weights per feature
+      WeightType weights_[kHalfDimensions * kInputDimensions];
+};
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
--- a/src/eval/nnue/nnue_test_command.cpp
+++ b/src/eval/nnue/nnue_test_command.cpp
@@ -0,0 +1,196 @@
+// NNUE評価関数に関するUSI拡張コマンド
+
+#if defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
+
+#include "../../extra/all.h"
+#include "evaluate_nnue.h"
+#include "nnue_test_command.h"
+
+#include <set>
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace {
+
+// Test of RawFeatures, mainly of the incremental (difference) calculation
+void TestFeatures(Position& pos) {
+  const std::uint64_t num_games = 1000;
+  StateInfo si;
+  pos.set_hirate(&si,Threads.main());
+  const int MAX_PLY = 256; // test up to 256 plies
+
+  StateInfo state[MAX_PLY]; // one StateInfo per ply, up to the maximum
+  int ply; // number of plies from the initial position
+
+  PRNG prng(20171128);
+
+  std::uint64_t num_moves = 0;  // moves played over all games
+  std::vector<std::uint64_t> num_updates(kRefreshTriggers.size() + 1);
+  std::vector<std::uint64_t> num_resets(kRefreshTriggers.size());
+  constexpr IndexType kUnknown = -1;  // marks an index not yet observed
+  std::vector<IndexType> trigger_map(RawFeatures::kDimensions, kUnknown);
+  auto make_index_sets = [&](const Position& pos) {  // built from scratch
+    std::vector<std::vector<std::set<IndexType>>> index_sets(
+        kRefreshTriggers.size(), std::vector<std::set<IndexType>>(2));
+    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+      Features::IndexList active_indices[2];
+      RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
+                                       active_indices);
+      for (const auto perspective : COLOR) {
+        for (const auto index : active_indices[perspective]) {
+          ASSERT(index < RawFeatures::kDimensions);
+          ASSERT(index_sets[i][perspective].count(index) == 0);
+          ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
+          index_sets[i][perspective].insert(index);
+          trigger_map[index] = i;
+        }
+      }
+    }
+    return index_sets;
+  };
+  auto update_index_sets = [&](const Position& pos, auto* index_sets) {
+    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+      Features::IndexList removed_indices[2], added_indices[2];
+      bool reset[2];
+      RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
+                                        removed_indices, added_indices, reset);
+      for (const auto perspective : COLOR) {
+        if (reset[perspective]) {
+          (*index_sets)[i][perspective].clear();
+          ++num_resets[i];
+        } else {
+          for (const auto index : removed_indices[perspective]) {
+            ASSERT(index < RawFeatures::kDimensions);
+            ASSERT((*index_sets)[i][perspective].count(index) == 1);
+            ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
+            (*index_sets)[i][perspective].erase(index);
+            ++num_updates.back();  // last element counts all triggers
+            ++num_updates[i];
+            trigger_map[index] = i;
+          }
+        }
+        for (const auto index : added_indices[perspective]) {
+          ASSERT(index < RawFeatures::kDimensions);
+          ASSERT((*index_sets)[i][perspective].count(index) == 0);
+          ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
+          (*index_sets)[i][perspective].insert(index);
+          ++num_updates.back();
+          ++num_updates[i];
+          trigger_map[index] = i;
+        }
+      }
+    }
+  };
+
+  std::cout << "feature set: " << RawFeatures::GetName()
+            << "[" << RawFeatures::kDimensions << "]" << std::endl;
+  std::cout << "start testing with random games";
+
+  for (std::uint64_t i = 0; i < num_games; ++i) {
+    auto index_sets = make_index_sets(pos);
+    for (ply = 0; ply < MAX_PLY; ++ply) {
+      MoveList<LEGAL_ALL> mg(pos); // generate all legal moves
+
+      // No legal move was found == checkmate
+      if (mg.size() == 0)
+        break;
+
+      // Pick one of the generated moves at random and advance the position.
+      Move m = mg.begin()[prng.rand(mg.size())];
+      pos.do_move(m, state[ply]);
+
+      ++num_moves;
+      update_index_sets(pos, &index_sets);
+      ASSERT(index_sets == make_index_sets(pos));  // must match a full rebuild
+    }
+
+    pos.set_hirate(&si,Threads.main());
+
+    // Print '.' once every 100 games (to show that progress is being made)
+    if ((i % 100) == 0)
+      std::cout << "." << std::flush;
+  }
+  std::cout << "passed." << std::endl;
+  std::cout << num_games << " games, " << num_moves << " moves, "
+            << num_updates.back() << " updates, "
+            << (1.0 * num_updates.back() / num_moves)
+            << " updates per move" << std::endl;
+  std::size_t num_observed_indices = 0;
+  for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+    const auto count = std::count(trigger_map.begin(), trigger_map.end(), i);
+    num_observed_indices += count;
+    std::cout << "TriggerEvent(" << static_cast<int>(kRefreshTriggers[i])
+              << "): " << count << " features ("
+              << (100.0 * count / RawFeatures::kDimensions) << "%), "
+              << num_updates[i] << " updates ("
+              << (1.0 * num_updates[i] / num_moves) << " per move), "
+              << num_resets[i] << " resets ("
+              << (100.0 * num_resets[i] / num_moves) << "%)"
+              << std::endl;
+  }
+  std::cout << "observed " << num_observed_indices << " ("
+            << (100.0 * num_observed_indices / RawFeatures::kDimensions)
+            << "% of " << RawFeatures::kDimensions
+            << ") features" << std::endl;
+}
+
+// Print the string that describes the structure of the evaluation function
+void PrintInfo(std::istream& stream) {
+  std::cout << "network architecture: " << GetArchitectureString() << std::endl;
+
+  // Every remaining token of the stream names a file whose header is reported.
+  for (;;) {
+    std::string path;
+    stream >> path;
+    if (path.empty()) return;
+
+    std::uint32_t hash_value;
+    std::string architecture;
+    bool header_ok = false;
+    {  // the file is closed again before the report is printed
+      std::ifstream file(path, std::ios::binary);
+      header_ok = file && ReadHeader(file, &hash_value, &architecture);
+    }
+
+    std::cout << path << ": ";
+    if (!header_ok) {
+      std::cout << "failed to read header" << std::endl;
+      continue;
+    }
+    if (hash_value != kHashValue) {
+      std::cout << architecture << std::endl;
+      continue;
+    }
+    std::cout << "matches with this binary";
+    if (architecture != GetArchitectureString()) {
+      std::cout << ", but architecture string differs: " << architecture;
+    }
+    std::cout << std::endl;
+  }
+}
+
+}  // namespace
+
+// USI extension command for the NNUE evaluation function
+void TestCommand(Position& pos, std::istream& stream) {
+  std::string sub_command;
+  stream >> sub_command;
+
+  // Dispatch on the first token read from the stream.
+  if (sub_command == "test_features") return TestFeatures(pos);
+  if (sub_command == "info") return PrintInfo(stream);
+
+  // Any other token, or none at all, prints the usage.
+  std::cout << "usage:" << std::endl
+            << " test nn test_features" << std::endl
+            << " test nn info [path/to/" << kFileName << "...]"
+            << std::endl;
+}
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
--- a/src/eval/nnue/nnue_test_command.h
+++ b/src/eval/nnue/nnue_test_command.h
@@ -0,0 +1,23 @@
+// NNUE評価関数に関するUSI拡張コマンドのインターフェイス
+
+#ifndef _NNUE_TEST_COMMAND_H_
+#define _NNUE_TEST_COMMAND_H_
+
+#include "../../config.h"
+
+#if defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
+
+namespace Eval {
+
+namespace NNUE {
+
+// USI extension command for the NNUE evaluation function
+void TestCommand(Position& pos, std::istream& stream);
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
+
+#endif
--- a/src/eval/nnue/trainer/features/factorizer.h
+++ b/src/eval/nnue/trainer/features/factorizer.h
@@ -0,0 +1,112 @@
+// NNUE評価関数の特徴量変換クラステンプレート
+
+#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_H_
+#define _NNUE_TRAINER_FEATURES_FACTORIZER_H_
+
+#include "../../../../config.h"
+
+#if defined(EVAL_NNUE)
+
+#include "../../nnue_common.h"
+#include "../trainer.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// Class template that converts input features into training features
+// By default they equal the original input features; specialize as needed
+template <typename FeatureType>
+class Factorizer {
+ public:
+  // Get the number of dimensions of the training features
+  static constexpr IndexType GetDimensions() {
+    return FeatureType::kDimensions;
+  }
+
+  // Get the indices of the training features and the learning-rate scale
+  static void AppendTrainingFeatures(
+      IndexType base_index, std::vector<TrainingFeature>* training_features) {
+    ASSERT_LV5(base_index < FeatureType::kDimensions);
+    training_features->emplace_back(base_index);  // the index itself, as is
+  }
+};
+
+// Information about one kind of training feature
+struct FeatureProperties {
+  bool active;  // inactive kinds contribute neither features nor dimensions
+  IndexType dimensions;  // number of dimensions of this kind
+};
+
+// Add the original input feature to the training features
+template <typename FeatureType>
+IndexType AppendBaseFeature(
+    FeatureProperties properties, IndexType base_index,
+    std::vector<TrainingFeature>* training_features) {
+  ASSERT_LV5(properties.dimensions == FeatureType::kDimensions);
+  ASSERT_LV5(base_index < FeatureType::kDimensions);
+  training_features->emplace_back(base_index);
+  return properties.dimensions;  // number of indices taken by the base feature
+}
+
+// Inherit the training features of another feature type if it is active
+template <typename FeatureType>
+IndexType InheritFeaturesIfRequired(
+    IndexType index_offset, FeatureProperties properties, IndexType base_index,
+    std::vector<TrainingFeature>* training_features) {
+  using Inherited = Factorizer<FeatureType>;
+  if (!properties.active) return 0;
+  ASSERT_LV5(properties.dimensions == Inherited::GetDimensions());
+  ASSERT_LV5(base_index < FeatureType::kDimensions);
+  // Everything from this position onwards is appended by the call below.
+  const auto first_new = training_features->size();
+  Inherited::AppendTrainingFeatures(base_index, training_features);
+  // Offset each appended index by index_offset.
+  for (auto it = training_features->begin() + first_new;
+       it != training_features->end(); ++it) {
+    ASSERT_LV5(it->GetIndex() < Inherited::GetDimensions());
+    it->ShiftIndex(index_offset);
+  }
+  return properties.dimensions;
+}
+
+// Add no training features; only return the index offset to skip, if any
+// Call this instead of InheritFeaturesIfRequired() when there is no
+// corresponding feature
+// inline: this non-template function is defined in a header, so without it
+// every translation unit including the header would emit its own definition
+inline IndexType SkipFeatures(FeatureProperties properties) {
+  return properties.active ? properties.dimensions : 0;
+}
+
+// Get the total number of dimensions of the training features
+template <std::size_t N>
+constexpr IndexType GetActiveDimensions(
+    const FeatureProperties (&properties)[N]) {
+  static_assert(N > 0, "");
+  // The first entry is always counted, whether or not it is flagged active.
+  IndexType total = properties[0].dimensions;
+  for (std::size_t i = 1; i != N; ++i) {
+    const FeatureProperties& entry = properties[i];
+    total += entry.active ? entry.dimensions : 0;
+  }
+  return total;
+}
+
+// Get the number of elements of an array
+template <typename T, std::size_t N>
+constexpr std::size_t GetArrayLength(const T (&/*array*/)[N]) {
+  return N;  // deduced from the array type; the argument itself is unused
+}
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
--- a/src/eval/nnue/trainer/features/factorizer_feature_set.h
+++ b/src/eval/nnue/trainer/features/factorizer_feature_set.h
@@ -0,0 +1,106 @@
+// NNUE評価関数の特徴量変換クラステンプレートのFeatureSet用特殊化
+
+#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
+#define _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
+
+#include "../../../../config.h"
+
+#if defined(EVAL_NNUE)
+
+#include "../../features/feature_set.h"
+#include "factorizer.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// Class template that converts input features into training features
+// Specialization for FeatureSet
+template <typename FirstFeatureType, typename... RemainingFeatureTypes>
+class Factorizer<FeatureSet<FirstFeatureType, RemainingFeatureTypes...>> {
+ private:
+  using Head = Factorizer<FeatureSet<FirstFeatureType>>;
+  using Tail = Factorizer<FeatureSet<RemainingFeatureTypes...>>;
+
+ public:
+  // Number of dimensions of the original input features
+  static constexpr IndexType kBaseDimensions =
+      FeatureSet<FirstFeatureType, RemainingFeatureTypes...>::kDimensions;
+
+  // Get the number of dimensions of the training features
+  static constexpr IndexType GetDimensions() {
+    return Head::GetDimensions() + Tail::GetDimensions();
+  }
+
+  // Get the indices of the training features and the learning-rate scale
+  static void AppendTrainingFeatures(
+      IndexType base_index, std::vector<TrainingFeature>* training_features,
+      IndexType base_dimensions = kBaseDimensions) {
+    ASSERT_LV5(base_index < kBaseDimensions);
+    constexpr auto boundary = FeatureSet<RemainingFeatureTypes...>::kDimensions;
+    if (base_index < boundary) {  // the index belongs to the remaining types
+      Tail::AppendTrainingFeatures(
+          base_index, training_features, base_dimensions);
+    } else {  // the index belongs to the first feature type
+      const auto start = training_features->size();
+      Head::AppendTrainingFeatures(
+          base_index - boundary, training_features, base_dimensions);
+      for (auto i = start; i < training_features->size(); ++i) {
+        auto& feature = (*training_features)[i];
+        const auto index = feature.GetIndex();
+        ASSERT_LV5(index < Head::GetDimensions() ||
+                   (index >= base_dimensions &&
+                    index < base_dimensions +
+                            Head::GetDimensions() - Head::kBaseDimensions));
+        if (index < Head::kBaseDimensions) {
+          feature.ShiftIndex(Tail::kBaseDimensions);
+        } else {
+          feature.ShiftIndex(Tail::GetDimensions() - Tail::kBaseDimensions);
+        }
+      }
+    }
+  }
+};
+
+// Class template that converts input features into training features
+// Specialization for a FeatureSet that has a single template argument
+template <typename FeatureType>
+class Factorizer<FeatureSet<FeatureType>> {
+public:
+  // Number of dimensions of the original input features
+  static constexpr IndexType kBaseDimensions = FeatureType::kDimensions;
+
+  // Number of dimensions of the training features
+  static constexpr IndexType GetDimensions() {
+    return Factorizer<FeatureType>::GetDimensions();
+  }
+
+  // Append the training features for base_index. Appended indices at or
+  // beyond kBaseDimensions are shifted by base_dimensions - kBaseDimensions.
+  static void AppendTrainingFeatures(
+      IndexType base_index, std::vector<TrainingFeature>* training_features,
+      IndexType base_dimensions = kBaseDimensions) {
+    ASSERT_LV5(base_index < kBaseDimensions);
+    const auto first_new = training_features->size();
+    Factorizer<FeatureType>::AppendTrainingFeatures(
+        base_index, training_features);
+    for (auto it = training_features->begin() + first_new;
+         it != training_features->end(); ++it) {
+      ASSERT_LV5(it->GetIndex() < Factorizer<FeatureType>::GetDimensions());
+      if (it->GetIndex() < kBaseDimensions) continue;
+      it->ShiftIndex(base_dimensions - kBaseDimensions);
+    }
+  }
+};
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
--- a/src/eval/nnue/trainer/features/factorizer_half_kp.h
+++ b/src/eval/nnue/trainer/features/factorizer_half_kp.h
@@ -0,0 +1,105 @@
+// NNUE評価関数の特徴量変換クラステンプレートのHalfKP用特殊化
+
+#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
+#define _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
+
+#include "../../../../config.h"
+
+#if defined(EVAL_NNUE)
+
+#include "../../features/half_kp.h"
+#include "../../features/p.h"
+#include "../../features/half_relative_kp.h"
+#include "factorizer.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// Class template that converts input features into training features
+// Specialization for HalfKP
+template <Side AssociatedKing>
+class Factorizer<HalfKP<AssociatedKing>> {
+ private:
+  using FeatureType = HalfKP<AssociatedKing>;
+
+  // Maximum number of feature indices whose value is 1 at the same time
+  static constexpr IndexType kMaxActiveDimensions =
+      FeatureType::kMaxActiveDimensions;
+
+  // Kinds of training features
+  enum TrainingFeatureType {
+    kFeaturesHalfKP,
+    kFeaturesHalfK,
+    kFeaturesP,
+    kFeaturesHalfRelativeKP,
+    kNumTrainingFeatureTypes,
+  };
+
+  // Information about each kind of training feature
+  static constexpr FeatureProperties kProperties[] = {
+    // kFeaturesHalfKP
+    {true, FeatureType::kDimensions},
+    // kFeaturesHalfK
+    {true, SQ_NB},
+    // kFeaturesP
+    {true, Factorizer<P>::GetDimensions()},
+    // kFeaturesHalfRelativeKP
+    {true, Factorizer<HalfRelativeKP<AssociatedKing>>::GetDimensions()},
+  };
+  static_assert(GetArrayLength(kProperties) == kNumTrainingFeatureTypes, "");
+
+ public:
+  // Get the number of dimensions of the training features
+  static constexpr IndexType GetDimensions() {
+    return GetActiveDimensions(kProperties);
+  }
+
+  // Get the indices of the training features and the learning-rate scale
+  static void AppendTrainingFeatures(
+      IndexType base_index, std::vector<TrainingFeature>* training_features) {
+    // kFeaturesHalfKP
+    IndexType index_offset = AppendBaseFeature<FeatureType>(
+        kProperties[kFeaturesHalfKP], base_index, training_features);
+
+    const auto sq_k = static_cast<Square>(base_index / fe_end);
+    const auto p = static_cast<BonaPiece>(base_index % fe_end);
+    // kFeaturesHalfK
+    {
+      const auto& properties = kProperties[kFeaturesHalfK];
+      if (properties.active) {
+        training_features->emplace_back(index_offset + sq_k);
+        index_offset += properties.dimensions;
+      }
+    }
+    // kFeaturesP
+    index_offset += InheritFeaturesIfRequired<P>(
+        index_offset, kProperties[kFeaturesP], p, training_features);
+    // kFeaturesHalfRelativeKP
+    if (p >= fe_hand_end) {
+      index_offset += InheritFeaturesIfRequired<HalfRelativeKP<AssociatedKing>>(
+          index_offset, kProperties[kFeaturesHalfRelativeKP],
+          HalfRelativeKP<AssociatedKing>::MakeIndex(sq_k, p),
+          training_features);
+    } else {  // nothing is added here; only the offset advances
+      index_offset += SkipFeatures(kProperties[kFeaturesHalfRelativeKP]);
+    }
+
+    ASSERT_LV5(index_offset == GetDimensions());  // every kind was accounted for
+  }
+};
+
+template <Side AssociatedKing>  // out-of-class definition of the static array
+constexpr FeatureProperties Factorizer<HalfKP<AssociatedKing>>::kProperties[];
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
--- a/src/eval/nnue/trainer/trainer.h
+++ b/src/eval/nnue/trainer/trainer.h
@@ -0,0 +1,127 @@
+// NNUE評価関数の学習用クラステンプレートの共通ヘッダ
+
+#ifndef _NNUE_TRAINER_H_
+#define _NNUE_TRAINER_H_
+
+#include "../../../config.h"
+
+#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#include "../nnue_common.h"
+#include "../features/index_list.h"
+
+#include <sstream>
+#if defined(USE_BLAS)
+static_assert(std::is_same<LearnFloatType, float>::value, "");
+#include <cblas.h>
+#endif
+
+namespace Eval {
+
+namespace NNUE {
+
+// Ponanza constant used in the formula relating evaluation value and win rate
+constexpr double kPonanzaConstant = 600.0;
+
+// Class representing one training-feature index (packed with an occurrence count)
+class TrainingFeature {
+  using StorageType = std::uint32_t;
+  static_assert(std::is_unsigned<StorageType>::value, "");
+
+ public:
+  static constexpr std::uint32_t kIndexBits = 24;  // width of the index field
+  static_assert(kIndexBits < std::numeric_limits<StorageType>::digits, "");
+  static constexpr std::uint32_t kCountBits =  // remaining low bits hold the count
+      std::numeric_limits<StorageType>::digits - kIndexBits;
+
+  explicit TrainingFeature(IndexType index) :
+      index_and_count_((index << kCountBits) | 1) {  // index in high bits, count = 1
+    ASSERT_LV3(index < (1 << kIndexBits));
+  }
+  TrainingFeature& operator+=(const TrainingFeature& other) {  // adds other's count
+    ASSERT_LV3(other.GetIndex() == GetIndex());
+    ASSERT_LV3(other.GetCount() + GetCount() < (1 << kCountBits));
+    index_and_count_ += other.GetCount();
+    return *this;
+  }
+  IndexType GetIndex() const {
+    return static_cast<IndexType>(index_and_count_ >> kCountBits);
+  }
+  void ShiftIndex(IndexType offset) {  // adds offset to the index; count is untouched
+    ASSERT_LV3(GetIndex() + offset < (1 << kIndexBits));
+    index_and_count_ += offset << kCountBits;
+  }
+  IndexType GetCount() const {
+    return static_cast<IndexType>(index_and_count_ & ((1 << kCountBits) - 1));
+  }
+  bool operator<(const TrainingFeature& other) const {  // orders by index, then count
+    return index_and_count_ < other.index_and_count_;
+  }
+
+ private:
+  StorageType index_and_count_;
+};
+
+// Structure representing one sample of training data
+struct Example {
+  std::vector<TrainingFeature> training_features[2];  // two feature lists, indexed 0 and 1
+  Learner::PackedSfenValue psv;
+  int sign;
+  double weight;
+};
+
+// Message used for setting hyperparameters and similar options
+struct Message {
+  Message(const std::string& name, const std::string& value = "") :
+      name(name), value(value), num_peekers(0), num_receivers(0) {}
+  const std::string name;
+  const std::string value;
+  std::uint32_t num_peekers;    // incremented by ReceiveMessage on a "name[" prefix match
+  std::uint32_t num_receivers;  // incremented by ReceiveMessage when the message is accepted
+};
+
+// Decides whether the option `name` accepts `message`. A message named "name"
+// is always accepted; "name[k]" only while message->num_peekers equals k.
+// Declared inline because it is defined in a header (avoids ODR violations).
+inline bool ReceiveMessage(const std::string& name, Message* message) {
+  const auto subscript = "[" + std::to_string(message->num_peekers) + "]";
+  if (message->name.substr(0, name.size() + 1) == name + "[") {
+    ++message->num_peekers;  // counted after the subscript above was formed
+  }
+  const bool accepted =
+      message->name == name || message->name == name + subscript;
+  if (accepted) ++message->num_receivers;
+  return accepted;
+}
+
+// Splits input at each delimiter. Empty input yields no fields and a trailing
+// delimiter adds no empty field. inline: defined in a header (ODR-safe).
+inline std::vector<std::string> Split(const std::string& input, char delimiter) {
+  std::istringstream stream(input);
+  std::vector<std::string> fields;
+  for (std::string field; std::getline(stream, field, delimiter);) {
+    fields.push_back(field);
+  }
+  return fields;
+}
+
+// Rounds a floating-point value to the nearest integer (halves round upward)
+template <typename IntType> IntType Round(double value) {
+  const double nearest = std::floor(value + 0.5);
+  return static_cast<IntType>(nearest);
+}
+
+// make_shared with alignment: storage comes from aligned_malloc(sizeof(T), alignof(T))
+template <typename T, typename... ArgumentTypes>
+std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments) {
+  const auto ptr = new(aligned_malloc(sizeof(T), alignof(T)))  // placement new
+      T(std::forward<ArgumentTypes>(arguments)...);
+  return std::shared_ptr<T>(ptr, AlignedDeleter<T>());  // AlignedDeleter<T> is the custom deleter
+}
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#endif
--- a/src/eval/nnue/trainer/trainer_affine_transform.h
+++ b/src/eval/nnue/trainer/trainer_affine_transform.h
@@ -0,0 +1,303 @@
+// NNUE評価関数の学習クラステンプレートのAffineTransform用特殊化
+
+#ifndef _NNUE_TRAINER_AFFINE_TRANSFORM_H_
+#define _NNUE_TRAINER_AFFINE_TRANSFORM_H_
+
+#include "../../../config.h"
+
+#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#include "../../../learn/learn.h"
+#include "../layers/affine_transform.h"
+#include "trainer.h"
+
+#include <random>
+
+namespace Eval {
+
+namespace NNUE {
+
+// Training: affine transform layer
+template <typename PreviousLayer, IndexType OutputDimensions>
+class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
+ private:
+  // Type of the layer being trained
+  using LayerType = Layers::AffineTransform<PreviousLayer, OutputDimensions>;
+
+ public:
+  // Factory function
+  static std::shared_ptr<Trainer> Create(
+      LayerType* target_layer, FeatureTransformer* feature_transformer) {
+    return std::shared_ptr<Trainer>(
+        new Trainer(target_layer, feature_transformer));
+  }
+
+  // Set options such as hyperparameters
+  void SendMessage(Message* message) {
+    previous_layer_trainer_->SendMessage(message);  // earlier layers see the message first
+    if (ReceiveMessage("momentum", message)) {
+      momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
+    }
+    if (ReceiveMessage("learning_rate_scale", message)) {
+      learning_rate_scale_ =
+          static_cast<LearnFloatType>(std::stod(message->value));
+    }
+    if (ReceiveMessage("reset", message)) {
+      DequantizeParameters();
+    }
+    if (ReceiveMessage("quantize_parameters", message)) {
+      QuantizeParameters();
+    }
+  }
+
+  // Initialize the parameters with random numbers
+  template <typename RNG>
+  void Initialize(RNG& rng) {
+    previous_layer_trainer_->Initialize(rng);
+    if (kIsOutputLayer) {
+      // The output layer is initialized to zero
+      std::fill(std::begin(biases_), std::end(biases_),
+                static_cast<LearnFloatType>(0.0));
+      std::fill(std::begin(weights_), std::end(weights_),
+                static_cast<LearnFloatType>(0.0));
+    } else {
+      // Assuming the inputs have mean 0.5 per unit and equal variance,
+      // initialize so the outputs also have mean 0.5 and the same variance
+      const double kSigma = 1.0 / std::sqrt(kInputDimensions);
+      auto distribution = std::normal_distribution<double>(0.0, kSigma);
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        double sum = 0.0;
+        for (IndexType j = 0; j < kInputDimensions; ++j) {
+          const auto weight = static_cast<LearnFloatType>(distribution(rng));
+          weights_[kInputDimensions * i + j] = weight;
+          sum += weight;
+        }
+        biases_[i] = static_cast<LearnFloatType>(0.5 - 0.5 * sum);  // output is 0.5 when all inputs are 0.5
+      }
+    }
+    QuantizeParameters();
+  }
+
+  // Forward propagation
+  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+    if (output_.size() < kOutputDimensions * batch.size()) {  // buffers only ever grow
+      output_.resize(kOutputDimensions * batch.size());
+      gradients_.resize(kInputDimensions * batch.size());
+    }
+    batch_size_ = static_cast<IndexType>(batch.size());
+    batch_input_ = previous_layer_trainer_->Propagate(batch);  // kept for Backpropagate
+#if defined(USE_BLAS)
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      cblas_scopy(kOutputDimensions, biases_, 1, &output_[batch_offset], 1);
+    }
+    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
+                kOutputDimensions, batch_size_, kInputDimensions, 1.0,
+                weights_, kInputDimensions,
+                batch_input_, kInputDimensions,
+                1.0, &output_[0], kOutputDimensions);
+#else
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType input_batch_offset = kInputDimensions * b;
+      const IndexType output_batch_offset = kOutputDimensions * b;
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        double sum = biases_[i];
+        for (IndexType j = 0; j < kInputDimensions; ++j) {
+          const IndexType index = kInputDimensions * i + j;  // weights_ is [output][input]
+          sum += weights_[index] * batch_input_[input_batch_offset + j];
+        }
+        output_[output_batch_offset + i] = static_cast<LearnFloatType>(sum);
+      }
+    }
+#endif
+    return output_.data();
+  }
+
+  // Backpropagation
+  void Backpropagate(const LearnFloatType* gradients,
+                     LearnFloatType learning_rate) {
+    const LearnFloatType local_learning_rate =
+        learning_rate * learning_rate_scale_;
+#if defined(USE_BLAS)
+    // backpropagate
+    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+                kInputDimensions, batch_size_, kOutputDimensions, 1.0,
+                weights_, kInputDimensions,
+                gradients, kOutputDimensions,
+                0.0, &gradients_[0], kInputDimensions);
+    // update
+    cblas_sscal(kOutputDimensions, momentum_, biases_diff_, 1);
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      cblas_saxpy(kOutputDimensions, 1.0,
+                  &gradients[batch_offset], 1, biases_diff_, 1);
+    }
+    cblas_saxpy(kOutputDimensions, -local_learning_rate,
+                biases_diff_, 1, biases_, 1);
+    cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
+                kOutputDimensions, kInputDimensions, batch_size_, 1.0,
+                gradients, kOutputDimensions,
+                batch_input_, kInputDimensions,
+                momentum_, weights_diff_, kInputDimensions);
+    cblas_saxpy(kOutputDimensions * kInputDimensions, -local_learning_rate,
+                weights_diff_, 1, weights_, 1);
+#else
+    // backpropagate
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType input_batch_offset = kInputDimensions * b;
+      const IndexType output_batch_offset = kOutputDimensions * b;
+      for (IndexType j = 0; j < kInputDimensions; ++j) {
+        double sum = 0.0;
+        for (IndexType i = 0; i < kOutputDimensions; ++i) {
+          const IndexType index = kInputDimensions * i + j;
+          sum += weights_[index] * gradients[output_batch_offset + i];  // uses pre-update weights
+        }
+        gradients_[input_batch_offset + j] = static_cast<LearnFloatType>(sum);
+      }
+    }
+    // update
+    for (IndexType i = 0; i < kOutputDimensions; ++i) {
+      biases_diff_[i] *= momentum_;
+    }
+    for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
+      weights_diff_[i] *= momentum_;
+    }
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType input_batch_offset = kInputDimensions * b;
+      const IndexType output_batch_offset = kOutputDimensions * b;
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        biases_diff_[i] += gradients[output_batch_offset + i];
+      }
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        for (IndexType j = 0; j < kInputDimensions; ++j) {
+          const IndexType index = kInputDimensions * i + j;
+          weights_diff_[index] += gradients[output_batch_offset + i] *
+              batch_input_[input_batch_offset + j];
+        }
+      }
+    }
+    for (IndexType i = 0; i < kOutputDimensions; ++i) {
+      biases_[i] -= local_learning_rate * biases_diff_[i];
+    }
+    for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
+      weights_[i] -= local_learning_rate * weights_diff_[i];
+    }
+#endif
+    previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate);  // unscaled rate
+  }
+
+ private:
+  // Constructor
+  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer) :
+      batch_size_(0),
+      batch_input_(nullptr),
+      previous_layer_trainer_(Trainer<PreviousLayer>::Create(
+          &target_layer->previous_layer_, feature_transformer)),
+      target_layer_(target_layer),
+      biases_(),
+      weights_(),
+      biases_diff_(),
+      weights_diff_(),
+      momentum_(0.0),
+      learning_rate_scale_(1.0) {
+    DequantizeParameters();  // start from the target layer's current integer parameters
+  }
+
+  // Saturate the weights and convert the parameters to integers
+  void QuantizeParameters() {
+    for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
+      weights_[i] = std::max(-kMaxWeightMagnitude,
+                             std::min(+kMaxWeightMagnitude, weights_[i]));
+    }
+    for (IndexType i = 0; i < kOutputDimensions; ++i) {
+      target_layer_->biases_[i] =
+          Round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
+    }
+    for (IndexType i = 0; i < kOutputDimensions; ++i) {
+      const auto offset = kInputDimensions * i;
+      const auto padded_offset = LayerType::kPaddedInputDimensions * i;  // target row stride
+      for (IndexType j = 0; j < kInputDimensions; ++j) {
+        target_layer_->weights_[padded_offset + j] =
+            Round<typename LayerType::WeightType>(
+                weights_[offset + j] * kWeightScale);
+      }
+    }
+  }
+
+  // Load the integer (quantized) parameters
+  void DequantizeParameters() {
+    for (IndexType i = 0; i < kOutputDimensions; ++i) {
+      biases_[i] = static_cast<LearnFloatType>(
+          target_layer_->biases_[i] / kBiasScale);
+    }
+    for (IndexType i = 0; i < kOutputDimensions; ++i) {
+      const auto offset = kInputDimensions * i;
+      const auto padded_offset = LayerType::kPaddedInputDimensions * i;
+      for (IndexType j = 0; j < kInputDimensions; ++j) {
+        weights_[offset + j] = static_cast<LearnFloatType>(
+            target_layer_->weights_[padded_offset + j] / kWeightScale);
+      }
+    }
+    std::fill(std::begin(biases_diff_), std::end(biases_diff_),  // also clears the diff buffers
+              static_cast<LearnFloatType>(0.0));
+    std::fill(std::begin(weights_diff_), std::end(weights_diff_),
+              static_cast<LearnFloatType>(0.0));
+  }
+
+  // Number of input and output dimensions
+  static constexpr IndexType kInputDimensions = LayerType::kInputDimensions;
+  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+
+  // A layer whose output has one dimension is the output layer
+  static constexpr bool kIsOutputLayer = kOutputDimensions == 1;
+
+  // Coefficients used when converting the parameters to integers
+  static constexpr LearnFloatType kActivationScale =
+      std::numeric_limits<std::int8_t>::max();
+  static constexpr LearnFloatType kBiasScale = kIsOutputLayer ?
+      (kPonanzaConstant * FV_SCALE) :
+      ((1 << kWeightScaleBits) * kActivationScale);
+  static constexpr LearnFloatType kWeightScale = kBiasScale / kActivationScale;
+
+  // Upper bound on the weight magnitude so that quantization does not overflow
+  static constexpr LearnFloatType kMaxWeightMagnitude =
+      std::numeric_limits<typename LayerType::WeightType>::max() / kWeightScale;
+
+  // Number of samples in the mini-batch
+  IndexType batch_size_;
+
+  // Input of the mini-batch
+  const LearnFloatType* batch_input_;
+
+  // Trainer of the preceding layer
+  const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
+
+  // Layer being trained
+  LayerType* const target_layer_;
+
+  // Parameters
+  LearnFloatType biases_[kOutputDimensions];
+  LearnFloatType weights_[kOutputDimensions * kInputDimensions];
+
+  // Buffers used when updating the parameters
+  LearnFloatType biases_diff_[kOutputDimensions];
+  LearnFloatType weights_diff_[kOutputDimensions * kInputDimensions];
+
+  // Buffer for forward propagation
+  std::vector<LearnFloatType> output_;
+
+  // Buffer for backpropagation
+  std::vector<LearnFloatType> gradients_;
+
+  // Hyperparameters
+  LearnFloatType momentum_;
+  LearnFloatType learning_rate_scale_;
+};
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#endif
--- a/src/eval/nnue/trainer/trainer_clipped_relu.h
+++ b/src/eval/nnue/trainer/trainer_clipped_relu.h
@@ -0,0 +1,144 @@
+// NNUE評価関数の学習クラステンプレートのClippedReLU用特殊化
+
+#ifndef _NNUE_TRAINER_CLIPPED_RELU_H_
+#define _NNUE_TRAINER_CLIPPED_RELU_H_
+
+#include "../../../config.h"
+
+#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#include "../../../learn/learn.h"
+#include "../layers/clipped_relu.h"
+#include "trainer.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+// Training: clipped ReLU layer
+template <typename PreviousLayer>
+class Trainer<Layers::ClippedReLU<PreviousLayer>> {
+ private:
+  // Type of the layer being trained
+  using LayerType = Layers::ClippedReLU<PreviousLayer>;
+
+ public:
+  // Factory function
+  static std::shared_ptr<Trainer> Create(
+      LayerType* target_layer, FeatureTransformer* feature_transformer) {
+    return std::shared_ptr<Trainer>(
+        new Trainer(target_layer, feature_transformer));
+  }
+
+  // Set options such as hyperparameters
+  void SendMessage(Message* message) {
+    previous_layer_trainer_->SendMessage(message);  // earlier layers see the message first
+    if (ReceiveMessage("check_health", message)) {
+      CheckHealth();
+    }
+  }
+
+  // Initialize the parameters with random numbers
+  template <typename RNG>
+  void Initialize(RNG& rng) {
+    previous_layer_trainer_->Initialize(rng);  // this layer has no parameters of its own
+  }
+
+  // Forward propagation
+  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+    if (output_.size() < kOutputDimensions * batch.size()) {  // buffers only ever grow
+      output_.resize(kOutputDimensions * batch.size());
+      gradients_.resize(kInputDimensions * batch.size());
+    }
+    const auto input = previous_layer_trainer_->Propagate(batch);
+    batch_size_ = static_cast<IndexType>(batch.size());
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        const IndexType index = batch_offset + i;
+        output_[index] = std::max(+kZero, std::min(+kOne, input[index]));  // clamp to [0, 1]
+        min_activations_[i] = std::min(min_activations_[i], output_[index]);
+        max_activations_[i] = std::max(max_activations_[i], output_[index]);
+      }
+    }
+    return output_.data();
+  }
+
+  // Backpropagation
+  void Backpropagate(const LearnFloatType* gradients,
+                     LearnFloatType learning_rate) {
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        const IndexType index = batch_offset + i;
+        gradients_[index] = gradients[index] *  // zero where the output was clipped
+            (output_[index] > kZero) * (output_[index] < kOne);
+      }
+    }
+    previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate);
+  }
+
+ private:
+  // Constructor
+  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer) :
+      batch_size_(0),
+      previous_layer_trainer_(Trainer<PreviousLayer>::Create(
+          &target_layer->previous_layer_, feature_transformer)),
+      target_layer_(target_layer) {
+    std::fill(std::begin(min_activations_), std::end(min_activations_),
+              std::numeric_limits<LearnFloatType>::max());
+    std::fill(std::begin(max_activations_), std::end(max_activations_),
+              std::numeric_limits<LearnFloatType>::lowest());
+  }
+
+  // Check whether any problem has occurred during training
+  void CheckHealth() {
+    const auto largest_min_activation = *std::max_element(
+        std::begin(min_activations_), std::end(min_activations_));
+    const auto smallest_max_activation = *std::min_element(
+        std::begin(max_activations_), std::end(max_activations_));
+    std::cout << "INFO: largest min activation = " << largest_min_activation
+              << ", smallest max activation = " << smallest_max_activation
+              << std::endl;
+
+    std::fill(std::begin(min_activations_), std::end(min_activations_),  // reset statistics
+              std::numeric_limits<LearnFloatType>::max());
+    std::fill(std::begin(max_activations_), std::end(max_activations_),
+              std::numeric_limits<LearnFloatType>::lowest());
+  }
+
+  // Number of input and output dimensions
+  static constexpr IndexType kInputDimensions = LayerType::kOutputDimensions;
+  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+
+  // Constants of type LearnFloatType
+  static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
+  static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
+
+  // Number of samples in the mini-batch
+  IndexType batch_size_;
+
+  // Trainer of the preceding layer
+  const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
+
+  // Layer being trained
+  LayerType* const target_layer_;
+
+  // Buffer for forward propagation
+  std::vector<LearnFloatType> output_;
+
+  // Buffer for backpropagation
+  std::vector<LearnFloatType> gradients_;
+
+  // Statistics for the health check
+  LearnFloatType min_activations_[kOutputDimensions];
+  LearnFloatType max_activations_[kOutputDimensions];
+};
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#endif
--- a/src/eval/nnue/trainer/trainer_feature_transformer.h
+++ b/src/eval/nnue/trainer/trainer_feature_transformer.h
@@ -0,0 +1,379 @@
+// NNUE評価関数の学習クラステンプレートのFeatureTransformer用特殊化
+
+#ifndef _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
+#define _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
+
+#include "../../../config.h"
+
+#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#include "../../../learn/learn.h"
+#include "../nnue_feature_transformer.h"
+#include "trainer.h"
+#include "features/factorizer_feature_set.h"
+
+#include <array>
+#include <bitset>
+#include <numeric>
+#include <random>
+#include <set>
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+namespace Eval {
+
+namespace NNUE {
+
+// Training: input feature transformer
+template <>
+class Trainer<FeatureTransformer> {
+ private:
+  // Type of the layer being trained
+  using LayerType = FeatureTransformer;
+
+ public:
+  template <typename T>
+  friend struct AlignedDeleter;
+  template <typename T, typename... ArgumentTypes>
+  friend std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments);
+
+  // Factory function
+  static std::shared_ptr<Trainer> Create(LayerType* target_layer) {
+    return MakeAlignedSharedPtr<Trainer>(target_layer);
+  }
+
+  // Set options such as hyperparameters
+  void SendMessage(Message* message) {
+    if (ReceiveMessage("momentum", message)) {
+      momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
+    }
+    if (ReceiveMessage("learning_rate_scale", message)) {
+      learning_rate_scale_ =
+          static_cast<LearnFloatType>(std::stod(message->value));
+    }
+    if (ReceiveMessage("reset", message)) {
+      DequantizeParameters();
+    }
+    if (ReceiveMessage("quantize_parameters", message)) {
+      QuantizeParameters();
+    }
+    if (ReceiveMessage("clear_unobserved_feature_weights", message)) {
+      ClearUnobservedFeatureWeights();
+    }
+    if (ReceiveMessage("check_health", message)) {
+      CheckHealth();
+    }
+  }
+
+  // Initialize the parameters with random numbers
+  template <typename RNG>
+  void Initialize(RNG& rng) {
+    std::fill(std::begin(weights_), std::end(weights_), +kZero);
+    const double kSigma = 0.1 / std::sqrt(RawFeatures::kMaxActiveDimensions);
+    auto distribution = std::normal_distribution<double>(0.0, kSigma);
+    for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) {  // raw range only
+      const auto weight = static_cast<LearnFloatType>(distribution(rng));
+      weights_[i] = weight;
+    }
+    for (IndexType i = 0; i < kHalfDimensions; ++i) {
+      biases_[i] = static_cast<LearnFloatType>(0.5);
+    }
+    QuantizeParameters();
+  }
+
+  // Forward propagation
+  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+    if (output_.size() < kOutputDimensions * batch.size()) {  // buffers only ever grow
+      output_.resize(kOutputDimensions * batch.size());
+      gradients_.resize(kOutputDimensions * batch.size());
+    }
+    batch_ = &batch;  // remembered for Backpropagate
+    // affine transform
+#pragma omp parallel for
+    for (IndexType b = 0; b < batch.size(); ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      for (IndexType c = 0; c < 2; ++c) {
+        const IndexType output_offset = batch_offset + kHalfDimensions * c;
+#if defined(USE_BLAS)
+        cblas_scopy(kHalfDimensions, biases_, 1, &output_[output_offset], 1);
+        for (const auto& feature : batch[b].training_features[c]) {
+          const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
+          cblas_saxpy(kHalfDimensions, (float)feature.GetCount(),
+                      &weights_[weights_offset], 1, &output_[output_offset], 1);
+        }
+#else
+        for (IndexType i = 0; i < kHalfDimensions; ++i) {
+          output_[output_offset + i] = biases_[i];
+        }
+        for (const auto& feature : batch[b].training_features[c]) {
+          const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
+          for (IndexType i = 0; i < kHalfDimensions; ++i) {
+            output_[output_offset + i] +=
+                feature.GetCount() * weights_[weights_offset + i];
+          }
+        }
+#endif
+      }
+    }
+    // clipped ReLU
+    for (IndexType b = 0; b < batch.size(); ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        const IndexType index = batch_offset + i;
+        min_pre_activation_ = std::min(min_pre_activation_, output_[index]);  // before clipping
+        max_pre_activation_ = std::max(max_pre_activation_, output_[index]);
+        output_[index] = std::max(+kZero, std::min(+kOne, output_[index]));
+        const IndexType t = i % kHalfDimensions;  // both halves share one statistics slot
+        min_activations_[t] = std::min(min_activations_[t], output_[index]);
+        max_activations_[t] = std::max(max_activations_[t], output_[index]);
+      }
+    }
+    return output_.data();
+  }
+
+  // Backpropagation
+  void Backpropagate(const LearnFloatType* gradients,
+                     LearnFloatType learning_rate) {
+    const LearnFloatType local_learning_rate =
+        learning_rate * learning_rate_scale_;
+    for (IndexType b = 0; b < batch_->size(); ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        const IndexType index = batch_offset + i;
+        gradients_[index] = gradients[index] *  // zero where the output was clipped
+            ((output_[index] > kZero) * (output_[index] < kOne));
+      }
+    }
+    // Only the weight columns of features that appeared in the input are updated,
+    // so momentum is not used; the learning rate is corrected to match the scale
+    const LearnFloatType effective_learning_rate =
+        static_cast<LearnFloatType>(local_learning_rate / (1.0 - momentum_));
+#if defined(USE_BLAS)
+    cblas_sscal(kHalfDimensions, momentum_, biases_diff_, 1);
+    for (IndexType b = 0; b < batch_->size(); ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      for (IndexType c = 0; c < 2; ++c) {
+        const IndexType output_offset = batch_offset + kHalfDimensions * c;
+        cblas_saxpy(kHalfDimensions, 1.0,
+                    &gradients_[output_offset], 1, biases_diff_, 1);
+      }
+    }
+    cblas_saxpy(kHalfDimensions, -local_learning_rate,
+                biases_diff_, 1, biases_, 1);
+#pragma omp parallel
+    {
+#if defined(_OPENMP)
+      const IndexType num_threads = omp_get_num_threads();
+      const IndexType thread_index = omp_get_thread_num();
+#endif
+      for (IndexType b = 0; b < batch_->size(); ++b) {
+        const IndexType batch_offset = kOutputDimensions * b;
+        for (IndexType c = 0; c < 2; ++c) {
+          const IndexType output_offset = batch_offset + kHalfDimensions * c;
+          for (const auto& feature : (*batch_)[b].training_features[c]) {
+#if defined(_OPENMP)
+            if (feature.GetIndex() % num_threads != thread_index) continue;  // per-thread share
+#endif
+            const IndexType weights_offset =
+                kHalfDimensions * feature.GetIndex();
+            const auto scale = static_cast<LearnFloatType>(
+                effective_learning_rate / feature.GetCount());
+            cblas_saxpy(kHalfDimensions, -scale,
+                        &gradients_[output_offset], 1,
+                        &weights_[weights_offset], 1);
+          }
+        }
+      }
+    }
+#else
+    for (IndexType i = 0; i < kHalfDimensions; ++i) {
+      biases_diff_[i] *= momentum_;
+    }
+    for (IndexType b = 0; b < batch_->size(); ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      for (IndexType c = 0; c < 2; ++c) {
+        const IndexType output_offset = batch_offset + kHalfDimensions * c;
+        for (IndexType i = 0; i < kHalfDimensions; ++i) {
+          biases_diff_[i] += gradients_[output_offset + i];
+        }
+      }
+    }
+    for (IndexType i = 0; i < kHalfDimensions; ++i) {
+      biases_[i] -= local_learning_rate * biases_diff_[i];
+    }
+    for (IndexType b = 0; b < batch_->size(); ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      for (IndexType c = 0; c < 2; ++c) {
+        const IndexType output_offset = batch_offset + kHalfDimensions * c;
+        for (const auto& feature : (*batch_)[b].training_features[c]) {
+          const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
+          const auto scale = static_cast<LearnFloatType>(
+              effective_learning_rate / feature.GetCount());
+          for (IndexType i = 0; i < kHalfDimensions; ++i) {
+            weights_[weights_offset + i] -=
+                scale * gradients_[output_offset + i];
+          }
+        }
+      }
+    }
+#endif
+    for (IndexType b = 0; b < batch_->size(); ++b) {
+      for (IndexType c = 0; c < 2; ++c) {
+        for (const auto& feature : (*batch_)[b].training_features[c]) {
+          observed_features.set(feature.GetIndex());  // record which features were seen
+        }
+      }
+    }
+  }
+
+ private:
+  // Constructor
+  Trainer(LayerType* target_layer) :
+      batch_(nullptr),
+      target_layer_(target_layer),
+      biases_(),
+      weights_(),
+      biases_diff_(),
+      momentum_(0.0),
+      learning_rate_scale_(1.0) {
+    min_pre_activation_ = std::numeric_limits<LearnFloatType>::max();
+    max_pre_activation_ = std::numeric_limits<LearnFloatType>::lowest();
+    std::fill(std::begin(min_activations_), std::end(min_activations_),
+              std::numeric_limits<LearnFloatType>::max());
+    std::fill(std::begin(max_activations_), std::end(max_activations_),
+              std::numeric_limits<LearnFloatType>::lowest());
+    DequantizeParameters();  // start from the target layer's current integer parameters
+  }
+
+  // Saturate the weights and convert the parameters to integers
+  void QuantizeParameters() {
+    for (IndexType i = 0; i < kHalfDimensions; ++i) {
+      target_layer_->biases_[i] =
+          Round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
+    }
+    std::vector<TrainingFeature> training_features;
+#pragma omp parallel for private(training_features)
+    for (IndexType j = 0; j < RawFeatures::kDimensions; ++j) {
+      training_features.clear();
+      Features::Factorizer<RawFeatures>::AppendTrainingFeatures(
+          j, &training_features);
+      for (IndexType i = 0; i < kHalfDimensions; ++i) {
+        double sum = 0.0;
+        for (const auto& feature : training_features) {  // sum over all training features of j
+          sum += weights_[kHalfDimensions * feature.GetIndex() + i];
+        }
+        target_layer_->weights_[kHalfDimensions * j + i] =
+            Round<typename LayerType::WeightType>(sum * kWeightScale);
+      }
+    }
+  }
+
+  // Load the integer (quantized) parameters
+  void DequantizeParameters() {
+    for (IndexType i = 0; i < kHalfDimensions; ++i) {
+      biases_[i] = static_cast<LearnFloatType>(
+          target_layer_->biases_[i] / kBiasScale);
+    }
+    std::fill(std::begin(weights_), std::end(weights_), +kZero);
+    for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) {  // raw range only
+      weights_[i] = static_cast<LearnFloatType>(
+          target_layer_->weights_[i] / kWeightScale);
+    }
+    std::fill(std::begin(biases_diff_), std::end(biases_diff_), +kZero);
+  }
+
+  // Set to zero the weights of features that did not appear in the training data
+  void ClearUnobservedFeatureWeights() {
+    for (IndexType i = 0; i < kInputDimensions; ++i) {
+      if (!observed_features.test(i)) {
+        std::fill(std::begin(weights_) + kHalfDimensions * i,
+                  std::begin(weights_) + kHalfDimensions * (i + 1), +kZero);
+      }
+    }
+    QuantizeParameters();
+  }
+
+  // Check whether any problem has occurred during training
+  void CheckHealth() {
+    std::cout << "INFO: observed " << observed_features.count()
+              << " (out of " << kInputDimensions << ") features" << std::endl;
+
+    constexpr LearnFloatType kPreActivationLimit =
+        std::numeric_limits<typename LayerType::WeightType>::max() /
+        kWeightScale;
+    std::cout << "INFO: (min, max) of pre-activations = "
+              << min_pre_activation_ << ", "
+              << max_pre_activation_ << " (limit = "
+              << kPreActivationLimit << ")" << std::endl;
+
+    const auto largest_min_activation = *std::max_element(
+        std::begin(min_activations_), std::end(min_activations_));
+    const auto smallest_max_activation = *std::min_element(
+        std::begin(max_activations_), std::end(max_activations_));
+    std::cout << "INFO: largest min activation = " << largest_min_activation
+              << ", smallest max activation = " << smallest_max_activation
+              << std::endl;
+
+    std::fill(std::begin(min_activations_), std::end(min_activations_),  // only these are reset
+              std::numeric_limits<LearnFloatType>::max());
+    std::fill(std::begin(max_activations_), std::end(max_activations_),
+              std::numeric_limits<LearnFloatType>::lowest());
+  }
+
+  // Number of input and output dimensions
+  static constexpr IndexType kInputDimensions =
+      Features::Factorizer<RawFeatures>::GetDimensions();
+  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+  static constexpr IndexType kHalfDimensions = LayerType::kHalfDimensions;
+
+  // Coefficients used when converting the parameters to integers
+  static constexpr LearnFloatType kActivationScale =
+      std::numeric_limits<std::int8_t>::max();
+  static constexpr LearnFloatType kBiasScale = kActivationScale;
+  static constexpr LearnFloatType kWeightScale = kActivationScale;
+
+  // Constants of type LearnFloatType
+  static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
+  static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
+
+  // Mini-batch
+  const std::vector<Example>* batch_;
+
+  // Layer being trained
+  LayerType* const target_layer_;
+
+  // Parameters
+  alignas(kCacheLineSize) LearnFloatType biases_[kHalfDimensions];
+  alignas(kCacheLineSize)
+      LearnFloatType weights_[kHalfDimensions * kInputDimensions];
+
+  // Buffers used when updating the parameters
+  LearnFloatType biases_diff_[kHalfDimensions];
+  std::vector<LearnFloatType> gradients_;
+
+  // Buffer for forward propagation
+  std::vector<LearnFloatType> output_;
+
+  // Features that appeared in the training data
+  std::bitset<kInputDimensions> observed_features;
+
+  // Hyperparameters
+  LearnFloatType momentum_;
+  LearnFloatType learning_rate_scale_;
+
+  // Statistics for the health check
+  LearnFloatType min_pre_activation_;
+  LearnFloatType max_pre_activation_;
+  LearnFloatType min_activations_[kHalfDimensions];
+  LearnFloatType max_activations_[kHalfDimensions];
+};
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#endif
--- a/src/eval/nnue/trainer/trainer_input_slice.h
+++ b/src/eval/nnue/trainer/trainer_input_slice.h
@@ -0,0 +1,253 @@
+// NNUE評価関数の学習クラステンプレートのInputSlice用特殊化
+
+#ifndef _NNUE_TRAINER_INPUT_SLICE_H_
+#define _NNUE_TRAINER_INPUT_SLICE_H_
+
+#include "../../../config.h"
+
+#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#include "../../../learn/learn.h"
+#include "../layers/input_slice.h"
+#include "trainer.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+// Training: input layer.
+// One SharedInputTrainer sits between the InputSlice trainers and the trainer
+// of the feature transformer. Each operation is expected to be invoked once by
+// every referrer; the calls of a round are counted so that the feature
+// transformer trainer is driven only once per round.
+class SharedInputTrainer {
+ public:
+  // Factory function.
+  // The instance is created on the first call only (the argument of later
+  // calls is not used), and every call registers one more referrer.
+  static std::shared_ptr<SharedInputTrainer> Create(
+      FeatureTransformer* feature_transformer) {
+    static std::shared_ptr<SharedInputTrainer> shared;
+    if (shared == nullptr) {
+      shared.reset(new SharedInputTrainer(feature_transformer));
+    }
+    shared->num_referrers_ += 1;
+    return shared;
+  }
+
+  // Sets options such as hyperparameters.
+  // Only the first call of a round is forwarded to the feature transformer
+  // trainer.
+  void SendMessage(Message* message) {
+    const bool first_call = (num_calls_ == 0);
+    if (first_call) {
+      current_operation_ = Operation::kSendMessage;
+      feature_transformer_trainer_->SendMessage(message);
+    }
+    ASSERT_LV3(current_operation_ == Operation::kSendMessage);
+    FinishCall();
+  }
+
+  // Initializes the parameters with random numbers.
+  // Only the first call of a round is forwarded to the feature transformer
+  // trainer.
+  template <typename RNG>
+  void Initialize(RNG& rng) {
+    const bool first_call = (num_calls_ == 0);
+    if (first_call) {
+      current_operation_ = Operation::kInitialize;
+      feature_transformer_trainer_->Initialize(rng);
+    }
+    ASSERT_LV3(current_operation_ == Operation::kInitialize);
+    FinishCall();
+  }
+
+  // Forward propagation.
+  // The feature transformer trainer is run on the first call of a round; its
+  // output pointer is cached and handed back to every referrer.
+  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+    // Make sure the accumulation buffer can hold one gradient per input
+    // dimension for every sample of the batch.
+    const auto required = kInputDimensions * batch.size();
+    if (gradients_.size() < required) {
+      gradients_.resize(required);
+    }
+    batch_size_ = static_cast<IndexType>(batch.size());
+    const bool first_call = (num_calls_ == 0);
+    if (first_call) {
+      current_operation_ = Operation::kPropagate;
+      output_ = feature_transformer_trainer_->Propagate(batch);
+    }
+    ASSERT_LV3(current_operation_ == Operation::kPropagate);
+    FinishCall();
+    return output_;
+  }
+
+  // Backpropagation.
+  // Gradients coming from the referrers are summed up; the sum is passed to
+  // the feature transformer trainer once the last referrer has reported.
+  void Backpropagate(const LearnFloatType* gradients,
+                     LearnFloatType learning_rate) {
+    // A single referrer needs no accumulation: pass its gradients through.
+    if (num_referrers_ == 1) {
+      feature_transformer_trainer_->Backpropagate(gradients, learning_rate);
+      return;
+    }
+    // Number of gradient values in the current mini-batch.
+    const IndexType num_values = kInputDimensions * batch_size_;
+    if (num_calls_ == 0) {
+      // First referrer of the round: start from an all-zero accumulator.
+      current_operation_ = Operation::kBackPropagate;
+      for (IndexType k = 0; k < num_values; ++k) {
+        gradients_[k] = static_cast<LearnFloatType>(0.0);
+      }
+    }
+    ASSERT_LV3(current_operation_ == Operation::kBackPropagate);
+    for (IndexType k = 0; k < num_values; ++k) {
+      gradients_[k] += gradients[k];
+    }
+    // The downstream call is made before the round state is reset.
+    const bool last_call = (num_calls_ + 1 == num_referrers_);
+    if (last_call) {
+      feature_transformer_trainer_->Backpropagate(
+          gradients_.data(), learning_rate);
+    }
+    FinishCall();
+  }
+
+ private:
+  // Constructor
+  SharedInputTrainer(FeatureTransformer* feature_transformer) :
+      batch_size_(0),
+      num_referrers_(0),
+      num_calls_(0),
+      current_operation_(Operation::kNone),
+      feature_transformer_trainer_(Trainer<FeatureTransformer>::Create(
+          feature_transformer)),
+      output_(nullptr) {
+  }
+
+  // Counts one call of the current operation. Once the count reaches the
+  // number of referrers, the round is over and the state is reset.
+  void FinishCall() {
+    if (++num_calls_ == num_referrers_) {
+      num_calls_ = 0;
+      current_operation_ = Operation::kNone;
+    }
+  }
+
+  // Number of input/output dimensions
+  static constexpr IndexType kInputDimensions =
+      FeatureTransformer::kOutputDimensions;
+
+  // Kind of operation
+  enum class Operation {
+    kNone,
+    kSendMessage,
+    kInitialize,
+    kPropagate,
+    kBackPropagate,
+  };
+
+  // Number of samples in the mini-batch
+  IndexType batch_size_;
+
+  // Number of layers that share this layer as their input
+  std::uint32_t num_referrers_;
+
+  // Number of times the current operation has been called
+  std::uint32_t num_calls_;
+
+  // Kind of the operation currently in progress
+  Operation current_operation_;
+
+  // Trainer of the input feature transformer
+  const std::shared_ptr<Trainer<FeatureTransformer>>
+      feature_transformer_trainer_;
+
+  // Pointer to the output shared among referrers during forward propagation
+  const LearnFloatType* output_;
+
+  // Buffer for backpropagation
+  std::vector<LearnFloatType> gradients_;
+};
+
+// Training: input layer.
+// Specialization for InputSlice: exposes OutputDimensions consecutive values,
+// starting at Offset, of the shared input trainer's output.
+template <IndexType OutputDimensions, IndexType Offset>
+class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
+ private:
+  // Type of the layer being trained
+  using LayerType = Layers::InputSlice<OutputDimensions, Offset>;
+
+ public:
+  // Factory function. The target layer pointer is not used.
+  static std::shared_ptr<Trainer> Create(
+      LayerType* /*target_layer*/, FeatureTransformer* feature_transformer) {
+    return std::shared_ptr<Trainer>(new Trainer(feature_transformer));
+  }
+
+  // Sets options such as hyperparameters (forwarded to the shared input
+  // trainer).
+  void SendMessage(Message* message) {
+    shared_input_trainer_->SendMessage(message);
+  }
+
+  // Initializes the parameters with random numbers (forwarded to the shared
+  // input trainer).
+  template <typename RNG>
+  void Initialize(RNG& rng) {
+    shared_input_trainer_->Initialize(rng);
+  }
+
+  // Forward propagation.
+  // Copies this slice of every sample's input row into the output buffer and
+  // returns a pointer to that buffer.
+  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+    const auto num_samples = batch.size();
+    if (output_.size() < kOutputDimensions * num_samples) {
+      // Both buffers grow together.
+      output_.resize(kOutputDimensions * num_samples);
+      gradients_.resize(kInputDimensions * num_samples);
+    }
+    batch_size_ = static_cast<IndexType>(num_samples);
+    const LearnFloatType* const input = shared_input_trainer_->Propagate(batch);
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      // Start of the slice inside sample b's input row, and of its output row.
+      const LearnFloatType* const src = input + (kInputDimensions * b + Offset);
+      LearnFloatType* const dst = output_.data() + kOutputDimensions * b;
+#if defined(USE_BLAS)
+      cblas_scopy(kOutputDimensions, src, 1, dst, 1);
+#else
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        dst[i] = src[i];
+      }
+#endif
+    }
+    return output_.data();
+  }
+
+  // Backpropagation.
+  // Expands the slice gradients to full input rows (zero outside the slice)
+  // and hands them to the shared input trainer.
+  void Backpropagate(const LearnFloatType* gradients,
+                     LearnFloatType learning_rate) {
+    const LearnFloatType zero = static_cast<LearnFloatType>(0.0);
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      LearnFloatType* const row = gradients_.data() + kInputDimensions * b;
+      const LearnFloatType* const slice = gradients + kOutputDimensions * b;
+      IndexType i = 0;
+      // Dimensions before the slice
+      for (; i < Offset; ++i) {
+        row[i] = zero;
+      }
+      // Dimensions covered by the slice
+      for (; i < Offset + kOutputDimensions; ++i) {
+        row[i] = slice[i - Offset];
+      }
+      // Dimensions after the slice
+      for (; i < kInputDimensions; ++i) {
+        row[i] = zero;
+      }
+    }
+    shared_input_trainer_->Backpropagate(gradients_.data(), learning_rate);
+  }
+
+ private:
+  // Constructor
+  Trainer(FeatureTransformer* feature_transformer) :
+      batch_size_(0),
+      shared_input_trainer_(SharedInputTrainer::Create(feature_transformer)) {
+  }
+
+  // Number of input/output dimensions
+  static constexpr IndexType kInputDimensions =
+      FeatureTransformer::kOutputDimensions;
+  static constexpr IndexType kOutputDimensions = OutputDimensions;
+  static_assert(Offset + kOutputDimensions <= kInputDimensions, "");
+
+  // Number of samples in the mini-batch
+  IndexType batch_size_;
+
+  // Trainer of the shared input layer
+  const std::shared_ptr<SharedInputTrainer> shared_input_trainer_;
+
+  // Buffer for forward propagation
+  std::vector<LearnFloatType> output_;
+
+  // Buffer for backpropagation
+  std::vector<LearnFloatType> gradients_;
+};
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#endif
--- a/src/eval/nnue/trainer/trainer_sum.h
+++ b/src/eval/nnue/trainer/trainer_sum.h
@@ -0,0 +1,192 @@
+// NNUE評価関数の学習クラステンプレートのSum用特殊化
+
+#ifndef _NNUE_TRAINER_SUM_H_
+#define _NNUE_TRAINER_SUM_H_
+
+#include "../../../config.h"
+
+#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#include "../../../learn/learn.h"
+#include "../layers/sum.h"
+#include "trainer.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+// Training: layer that takes the sum of the outputs of several layers.
+// The first previous layer (the head) is handled here; the remaining ones are
+// handled by the base class Tail.
+template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
+class Trainer<Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>> :
+      Trainer<Layers::Sum<RemainingPreviousLayers...>> {
+ private:
+  // Type of the layer being trained
+  using LayerType = Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>;
+  using Tail = Trainer<Layers::Sum<RemainingPreviousLayers...>>;
+
+ public:
+  // Factory function
+  static std::shared_ptr<Trainer> Create(
+      LayerType* target_layer, FeatureTransformer* feature_transformer) {
+    return std::shared_ptr<Trainer>(
+        new Trainer(target_layer, feature_transformer));
+  }
+
+  // Sets options such as hyperparameters
+  void SendMessage(Message* message) {
+    // The results of the other member functions do not depend on the order of
+    // processing, so they handle Tail first to keep the implementation simple.
+    // SendMessage handles the head first so that the correspondence of
+    // indices is easier to follow.
+    previous_layer_trainer_->SendMessage(message);
+    Tail::SendMessage(message);
+  }
+
+  // Initializes the parameters with random numbers (Tail first, then head)
+  template <typename RNG>
+  void Initialize(RNG& rng) {
+    Tail::Initialize(rng);
+    previous_layer_trainer_->Initialize(rng);
+  }
+
+  // Forward propagation.
+  // Adds the head's output, in place, to the buffer returned by Tail and
+  // returns that buffer.
+  /*const*/ LearnFloatType* Propagate(const std::vector<Example>& batch) {
+    batch_size_ = static_cast<IndexType>(batch.size());
+    LearnFloatType* const sum = Tail::Propagate(batch);
+    const auto head_output = previous_layer_trainer_->Propagate(batch);
+#if defined(USE_BLAS)
+    cblas_saxpy(kOutputDimensions * batch_size_, 1.0,
+                head_output, 1, sum, 1);
+#else
+    // Same flat element range as the BLAS call above.
+    const IndexType num_values = kOutputDimensions * batch_size_;
+    for (IndexType k = 0; k < num_values; ++k) {
+      sum[k] += head_output[k];
+    }
+#endif
+    return sum;
+  }
+
+  // Backpropagation.
+  // The same gradients are passed, unchanged, to Tail and to the head.
+  void Backpropagate(const LearnFloatType* gradients,
+                     LearnFloatType learning_rate) {
+    Tail::Backpropagate(gradients, learning_rate);
+    previous_layer_trainer_->Backpropagate(gradients, learning_rate);
+  }
+
+ private:
+  // Constructor
+  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer) :
+      Tail(target_layer, feature_transformer),
+      batch_size_(0),
+      previous_layer_trainer_(Trainer<FirstPreviousLayer>::Create(
+          &target_layer->previous_layer_, feature_transformer)),
+      target_layer_(target_layer) {
+  }
+
+  // Number of input/output dimensions
+  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+
+  // Make the subclasses friends
+  template <typename SumLayer>
+  friend class Trainer;
+
+  // Number of samples in the mini-batch
+  IndexType batch_size_;
+
+  // Trainer of the immediately preceding layer
+  const std::shared_ptr<Trainer<FirstPreviousLayer>> previous_layer_trainer_;
+
+  // Layer being trained
+  LayerType* const target_layer_;
+};
+
+
+// Training: layer that takes the sum of the outputs of several layers
+// (case with a single template argument).
+// With only one previous layer the "sum" is a copy of that layer's output into
+// a buffer owned by this trainer.
+template <typename PreviousLayer>
+class Trainer<Layers::Sum<PreviousLayer>> {
+ private:
+  // Type of the layer being trained
+  using LayerType = Layers::Sum<PreviousLayer>;
+
+ public:
+  // Factory function
+  static std::shared_ptr<Trainer> Create(
+      LayerType* target_layer, FeatureTransformer* feature_transformer) {
+    return std::shared_ptr<Trainer>(
+        new Trainer(target_layer, feature_transformer));
+  }
+
+  // Sets options such as hyperparameters (forwarded to the previous layer)
+  void SendMessage(Message* message) {
+    previous_layer_trainer_->SendMessage(message);
+  }
+
+  // Initializes the parameters with random numbers (forwarded to the previous
+  // layer)
+  template <typename RNG>
+  void Initialize(RNG& rng) {
+    previous_layer_trainer_->Initialize(rng);
+  }
+
+  // Forward propagation.
+  // Copies the previous layer's output into output_ and returns a mutable
+  // pointer to that copy.
+  /*const*/ LearnFloatType* Propagate(const std::vector<Example>& batch) {
+    if (output_.size() < kOutputDimensions * batch.size()) {
+      output_.resize(kOutputDimensions * batch.size());
+    }
+    batch_size_ = static_cast<IndexType>(batch.size());
+    const auto previous_output = previous_layer_trainer_->Propagate(batch);
+#if defined(USE_BLAS)
+    // output_.data() rather than &output_[0]: the buffer is still empty when
+    // an empty batch arrives before any resize, and indexing an empty vector
+    // is undefined behaviour.
+    cblas_scopy(kOutputDimensions * batch_size_, previous_output, 1,
+                output_.data(), 1);
+#else
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        output_[batch_offset + i] = previous_output[batch_offset + i];
+      }
+    }
+#endif
+    return output_.data();
+  }
+
+  // Backpropagation.
+  // The gradients are passed to the previous layer unchanged.
+  void Backpropagate(const LearnFloatType* gradients,
+                     LearnFloatType learning_rate) {
+    previous_layer_trainer_->Backpropagate(gradients, learning_rate);
+  }
+
+ private:
+  // Constructor
+  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer) :
+      batch_size_(0),
+      previous_layer_trainer_(Trainer<PreviousLayer>::Create(
+          &target_layer->previous_layer_, feature_transformer)),
+      target_layer_(target_layer) {
+  }
+
+  // Number of input/output dimensions
+  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+
+  // Make the subclasses friends
+  template <typename SumLayer>
+  friend class Trainer;
+
+  // Number of samples in the mini-batch
+  IndexType batch_size_;
+
+  // Trainer of the immediately preceding layer
+  const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
+
+  // Layer being trained
+  LayerType* const target_layer_;
+
+  // Buffer for forward propagation
+  std::vector<LearnFloatType> output_;
+};
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#endif