Cleanup nnue

2025-12-24 19:16:49 +08:00 · 2020-10-14 21:45:38 +02:00
parent c286f9cd7d
commit 497f689aa3
10 changed files with 1003 additions and 970 deletions
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -214,13 +214,13 @@ namespace Eval::NNUE {

    std::string eval_file = std::string(Options["EvalFile"]);

-    #if defined(DEFAULT_NNUE_DIRECTORY)
-    #define stringify2(x) #x
-    #define stringify(x) stringify2(x)
+#if defined(DEFAULT_NNUE_DIRECTORY)
+#define stringify2(x) #x
+#define stringify(x) stringify2(x)
    std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory , stringify(DEFAULT_NNUE_DIRECTORY) };
-    #else
+#else
    std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory };
-    #endif
+#endif

    for (std::string directory : dirs)
        if (eval_file_loaded != eval_file)
@@ -238,8 +238,8 @@ namespace Eval::NNUE {
            }
        }

-    #undef stringify2
-    #undef stringify
+#undef stringify2
+#undef stringify
  }

  /// NNUE::verify() verifies that the last net used was loaded successfully
--- a/src/nnue/evaluate_nnue.h
+++ b/src/nnue/evaluate_nnue.h
@@ -1,23 +1,21 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)

-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.

-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.

-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

-// header used in NNUE evaluation function
-
 #ifndef NNUE_EVALUATE_NNUE_H_INCLUDED
 #define NNUE_EVALUATE_NNUE_H_INCLUDED

@@ -25,79 +23,82 @@

 #include <memory>

+// header used in NNUE evaluation function
 namespace Eval::NNUE {

-  enum struct UseNNUEMode
-  {
-    False,
-    True,
-    Pure
-  };
+    enum struct UseNNUEMode
+    {
+        False,
+        True,
+        Pure
+    };

-  // Hash value of evaluation function structure
-  constexpr std::uint32_t kHashValue =
-      FeatureTransformer::GetHashValue() ^ Network::GetHashValue();
+    // Hash value of evaluation function structure
+    constexpr std::uint32_t kHashValue =
+        FeatureTransformer::GetHashValue() ^ Network::GetHashValue();

-  // Deleter for automating release of memory area
-  template <typename T>
-  struct AlignedDeleter {
-    void operator()(T* ptr) const {
-      ptr->~T();
-      std_aligned_free(ptr);
-    }
-  };
+    // Deleter for automating release of memory area
+    template <typename T>
+    struct AlignedDeleter {
+        void operator()(T* ptr) const {
+            ptr->~T();
+            std_aligned_free(ptr);
+        }
+    };

-  template <typename T>
-  struct LargePageDeleter {
-    void operator()(T* ptr) const {
-      ptr->~T();
-      aligned_large_pages_free(ptr);
-    }
-  };
+    template <typename T>
+    struct LargePageDeleter {
+        void operator()(T* ptr) const {
+            ptr->~T();
+            aligned_large_pages_free(ptr);
+        }
+    };

-  template <typename T>
-  using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>;
+    template <typename T>
+    using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>;

-  template <typename T>
-  using LargePagePtr = std::unique_ptr<T, LargePageDeleter<T>>;
+    template <typename T>
+    using LargePagePtr = std::unique_ptr<T, LargePageDeleter<T>>;

-  // Input feature converter
-  extern LargePagePtr<FeatureTransformer> feature_transformer;
+    // Input feature converter
+    extern LargePagePtr<FeatureTransformer> feature_transformer;

-  // Evaluation function
-  extern AlignedPtr<Network> network;
+    // Evaluation function
+    extern AlignedPtr<Network> network;

-  // Evaluation function file name
-  extern std::string fileName;
+    // Evaluation function file name
+    extern std::string fileName;

-  // Saved evaluation function file name
-  extern std::string savedfileName;
+    // Saved evaluation function file name
+    extern std::string savedfileName;

-  extern UseNNUEMode useNNUE;
-  extern std::string eval_file_loaded;
+    extern UseNNUEMode useNNUE;

-  // Get a string that represents the structure of the evaluation function
-  std::string GetArchitectureString();
+    extern std::string eval_file_loaded;

-  // read the header
-  bool ReadHeader(std::istream& stream,
-    std::uint32_t* hash_value, std::string* architecture);
+    // Get a string that represents the structure of the evaluation function
+    std::string GetArchitectureString();

-  // write the header
-  bool WriteHeader(std::ostream& stream,
-    std::uint32_t hash_value, const std::string& architecture);
+    // read the header
+    bool ReadHeader(std::istream& stream,
+        std::uint32_t* hash_value, std::string* architecture);

-  // read evaluation function parameters
-  bool ReadParameters(std::istream& stream);
+    // write the header
+    bool WriteHeader(std::ostream& stream,
+        std::uint32_t hash_value, const std::string& architecture);

-  // write evaluation function parameters
-  bool WriteParameters(std::ostream& stream);
+    // read evaluation function parameters
+    bool ReadParameters(std::istream& stream);

-  Value evaluate(const Position& pos);
-  bool load_eval(std::string name, std::istream& stream);
-  void init();
-  void verify_eval_file_loaded();
-  void verify_any_net_loaded();
+    // write evaluation function parameters
+    bool WriteParameters(std::ostream& stream);
+
+    Value evaluate(const Position& pos);
+    bool load_eval(std::string name, std::istream& stream);
+    void init();
+
+    void verify_eval_file_loaded();
+    void verify_any_net_loaded();

 }  // namespace Eval::NNUE

--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -1,18 +1,10 @@
-// Code for learning NNUE evaluation function
-
-#include <random>
+#include <random>
 #include <fstream>
 #include <filesystem>

-#include "../learn/learn.h"
-
-#include "../position.h"
-#include "../uci.h"
-#include "../misc.h"
-#include "../thread_win32_osx.h"
-
 #include "evaluate_nnue.h"
 #include "evaluate_nnue_learner.h"
+
 #include "trainer/features/factorizer_feature_set.h"
 #include "trainer/features/factorizer_half_kp.h"
 #include "trainer/trainer_feature_transformer.h"
@@ -21,191 +13,207 @@
 #include "trainer/trainer_clipped_relu.h"
 #include "trainer/trainer_sum.h"

+#include "position.h"
+#include "uci.h"
+#include "misc.h"
+#include "thread_win32_osx.h"
+
+#include "learn/learn.h"
+
 // Learning rate scale
 double global_learning_rate;

+// Code for learning NNUE evaluation function
 namespace Eval::NNUE {

-  namespace {
+    namespace {

-    // learning data
-    std::vector<Example> examples;
+        // learning data
+        std::vector<Example> examples;

-    // Mutex for exclusive control of examples
-    std::mutex examples_mutex;
+        // Mutex for exclusive control of examples
+        std::mutex examples_mutex;

-    // number of samples in mini-batch
-    uint64_t batch_size;
+        // number of samples in mini-batch
+        uint64_t batch_size;

-    // random number generator
-    std::mt19937 rng;
+        // random number generator
+        std::mt19937 rng;

-    // learner
-    std::shared_ptr<Trainer<Network>> trainer;
+        // learner
+        std::shared_ptr<Trainer<Network>> trainer;

-    // Tell the learner options such as hyperparameters
-    void SendMessages(std::vector<Message> messages) {
-      for (auto& message : messages) {
-        trainer->SendMessage(&message);
-        assert(message.num_receivers > 0);
-      }
-    }
-
-  }  // namespace
-
-  // Initialize learning
-  void InitializeTraining(const std::string& seed) {
-    std::cout << "Initializing NN training for "
-              << GetArchitectureString() << std::endl;
-
-    assert(feature_transformer);
-    assert(network);
-    trainer = Trainer<Network>::Create(network.get(), feature_transformer.get());
-    rng.seed(PRNG(seed).rand<uint64_t>());
-
-    if (Options["SkipLoadingEval"]) {
-      trainer->Initialize(rng);
-    }
-  }
-
-  // set the number of samples in the mini-batch
-  void SetBatchSize(uint64_t size) {
-    assert(size > 0);
-    batch_size = size;
-  }
-  
-  // Set options such as hyperparameters
-  void SetOptions(const std::string& options) {
-    std::vector<Message> messages;
-    for (const auto& option : Split(options, ',')) {
-      const auto fields = Split(option, '=');
-      assert(fields.size() == 1 || fields.size() == 2);
-      if (fields.size() == 1) {
-        messages.emplace_back(fields[0]);
-      } else {
-        messages.emplace_back(fields[0], fields[1]);
-      }
-    }
-    SendMessages(std::move(messages));
-  }
-
-  // Reread the evaluation function parameters for learning from the file
-  void RestoreParameters(const std::string& dir_name) {
-    const std::string file_name = Path::Combine(dir_name, NNUE::savedfileName);
-    std::ifstream stream(file_name, std::ios::binary);
-#ifndef NDEBUG
-    bool result =
-#endif
-    ReadParameters(stream);
-#ifndef NDEBUG
-    assert(result);
-#endif
-
-    SendMessages({{"reset"}});
-  }
-
-  void FinalizeNet() {
-    SendMessages({{"clear_unobserved_feature_weights"}});
-  }
-
-  // Add 1 sample of learning data
-  void AddExample(Position& pos, Color rootColor,
-                  const Learner::PackedSfenValue& psv, double weight) {
-    Example example;
-    if (rootColor == pos.side_to_move()) {
-      example.sign = 1;
-    } else {
-      example.sign = -1;
-    }
-    example.psv = psv;
-    example.weight = weight;
-
-    Features::IndexList active_indices[2];
-    for (const auto trigger : kRefreshTriggers) {
-      RawFeatures::AppendActiveIndices(pos, trigger, active_indices);
-    }
-    if (pos.side_to_move() != WHITE) {
-      active_indices[0].swap(active_indices[1]);
-    }
-    for (const auto color : Colors) {
-      std::vector<TrainingFeature> training_features;
-      for (const auto base_index : active_indices[color]) {
-        static_assert(Features::Factorizer<RawFeatures>::GetDimensions() <
-                      (1 << TrainingFeature::kIndexBits), "");
-        Features::Factorizer<RawFeatures>::AppendTrainingFeatures(
-            base_index, &training_features);
-      }
-      std::sort(training_features.begin(), training_features.end());
-
-      auto& unique_features = example.training_features[color];
-      for (const auto& feature : training_features) {
-        if (!unique_features.empty() &&
-            feature.GetIndex() == unique_features.back().GetIndex()) {
-          unique_features.back() += feature;
-        } else {
-          unique_features.push_back(feature);
+        // Forward messages (e.g. hyperparameter options) to the trainer
+        void SendMessages(std::vector<Message> messages) {
+            for (auto& message : messages) {
+                trainer->SendMessage(&message);
+                assert(message.num_receivers > 0);
+            }
+        }
+
+    }  // namespace
+
+    // Initialize learning
+    void InitializeTraining(const std::string& seed) {
+        std::cout << "Initializing NN training for "
+                  << GetArchitectureString() << std::endl;
+
+        assert(feature_transformer);
+        assert(network);
+        trainer = Trainer<Network>::Create(network.get(), feature_transformer.get());
+        rng.seed(PRNG(seed).rand<uint64_t>());
+
+        if (Options["SkipLoadingEval"]) {
+            trainer->Initialize(rng);
        }
-      }
    }

-    std::lock_guard<std::mutex> lock(examples_mutex);
-    examples.push_back(std::move(example));
-  }
-
-  // update the evaluation function parameters
-  void UpdateParameters() {
-    assert(batch_size > 0);
-
-    const auto learning_rate = static_cast<LearnFloatType>(
-        global_learning_rate / batch_size);
-
-    std::lock_guard<std::mutex> lock(examples_mutex);
-    std::shuffle(examples.begin(), examples.end(), rng);
-    while (examples.size() >= batch_size) {
-      std::vector<Example> batch(examples.end() - batch_size, examples.end());
-      examples.resize(examples.size() - batch_size);
-
-      const auto network_output = trainer->Propagate(batch);
-
-      std::vector<LearnFloatType> gradients(batch.size());
-      for (std::size_t b = 0; b < batch.size(); ++b) {
-        const auto shallow = static_cast<Value>(Round<std::int32_t>(
-            batch[b].sign * network_output[b] * kPonanzaConstant));
-        const auto& psv = batch[b].psv;
-        const double gradient = batch[b].sign * Learner::calc_grad(shallow, psv);
-        gradients[b] = static_cast<LearnFloatType>(gradient * batch[b].weight);
-      }
-
-      trainer->Backpropagate(gradients.data(), learning_rate);
+    // set the number of samples in the mini-batch
+    void SetBatchSize(uint64_t size) {
+        assert(size > 0);
+        batch_size = size;
    }
-    SendMessages({{"quantize_parameters"}});
-  }

-  // Check if there are any problems with learning
-  void CheckHealth() {
-    SendMessages({{"check_health"}});
-  }
+    // Set options such as hyperparameters
+    void SetOptions(const std::string& options) {
+        std::vector<Message> messages;
+        for (const auto& option : Split(options, ',')) {
+          const auto fields = Split(option, '=');
+          assert(fields.size() == 1 || fields.size() == 2);

-  // save merit function parameters to a file
-  void save_eval(std::string dir_name) {
-    auto eval_dir = Path::Combine(Options["EvalSaveDir"], dir_name);
-    std::cout << "save_eval() start. folder = " << eval_dir << std::endl;
+          if (fields.size() == 1) {
+              messages.emplace_back(fields[0]);
+          } else {
+              messages.emplace_back(fields[0], fields[1]);
+          }
+        }

-    // mkdir() will fail if this folder already exists, but
-    // Apart from that. If not, I just want you to make it.
-    // Also, assume that the folders up to EvalSaveDir have been dug.
-    std::filesystem::create_directories(eval_dir);
+        SendMessages(std::move(messages));
+    }

-    const std::string file_name = Path::Combine(eval_dir, NNUE::savedfileName);
-    std::ofstream stream(file_name, std::ios::binary);
+    // Reread the evaluation function parameters for learning from the file
+    void RestoreParameters(const std::string& dir_name) {
+        const std::string file_name = Path::Combine(dir_name, NNUE::savedfileName);
+        std::ifstream stream(file_name, std::ios::binary);
 #ifndef NDEBUG
-    bool result =
+        bool result =
 #endif
-    WriteParameters(stream);
+        ReadParameters(stream);
 #ifndef NDEBUG
-    assert(result);
+        assert(result);
 #endif

-    std::cout << "save_eval() finished. folder = " << eval_dir << std::endl;
-  }
+        SendMessages({{"reset"}});
+    }
+
+    void FinalizeNet() {
+        SendMessages({{"clear_unobserved_feature_weights"}});
+    }
+
+    // Add 1 sample of learning data
+    void AddExample(Position& pos, Color rootColor,
+                    const Learner::PackedSfenValue& psv, double weight) {
+
+        Example example;
+        if (rootColor == pos.side_to_move()) {
+            example.sign = 1;
+        } else {
+            example.sign = -1;
+        }
+
+        example.psv = psv;
+        example.weight = weight;
+
+        Features::IndexList active_indices[2];
+        for (const auto trigger : kRefreshTriggers) {
+            RawFeatures::AppendActiveIndices(pos, trigger, active_indices);
+        }
+
+        if (pos.side_to_move() != WHITE) {
+            active_indices[0].swap(active_indices[1]);
+        }
+
+        for (const auto color : Colors) {
+            std::vector<TrainingFeature> training_features;
+            for (const auto base_index : active_indices[color]) {
+                static_assert(Features::Factorizer<RawFeatures>::GetDimensions() <
+                              (1 << TrainingFeature::kIndexBits), "");
+                Features::Factorizer<RawFeatures>::AppendTrainingFeatures(
+                    base_index, &training_features);
+            }
+
+            std::sort(training_features.begin(), training_features.end());
+
+            auto& unique_features = example.training_features[color];
+            for (const auto& feature : training_features) {
+                if (!unique_features.empty() &&
+                    feature.GetIndex() == unique_features.back().GetIndex()) {
+
+                    unique_features.back() += feature;
+                } else {
+                    unique_features.push_back(feature);
+                }
+            }
+        }
+
+        std::lock_guard<std::mutex> lock(examples_mutex);
+        examples.push_back(std::move(example));
+    }
+
+    // update the evaluation function parameters
+    void UpdateParameters() {
+        assert(batch_size > 0);
+
+        const auto learning_rate = static_cast<LearnFloatType>(
+            global_learning_rate / batch_size);
+
+        std::lock_guard<std::mutex> lock(examples_mutex);
+        std::shuffle(examples.begin(), examples.end(), rng);
+        while (examples.size() >= batch_size) {
+            std::vector<Example> batch(examples.end() - batch_size, examples.end());
+            examples.resize(examples.size() - batch_size);
+
+            const auto network_output = trainer->Propagate(batch);
+
+            std::vector<LearnFloatType> gradients(batch.size());
+            for (std::size_t b = 0; b < batch.size(); ++b) {
+                const auto shallow = static_cast<Value>(Round<std::int32_t>(
+                    batch[b].sign * network_output[b] * kPonanzaConstant));
+                const auto& psv = batch[b].psv;
+                const double gradient = batch[b].sign * Learner::calc_grad(shallow, psv);
+                gradients[b] = static_cast<LearnFloatType>(gradient * batch[b].weight);
+            }
+
+            trainer->Backpropagate(gradients.data(), learning_rate);
+        }
+        SendMessages({{"quantize_parameters"}});
+    }
+
+    // Check if there are any problems with learning
+    void CheckHealth() {
+        SendMessages({{"check_health"}});
+    }
+
+    // save evaluation function parameters to a file
+    void save_eval(std::string dir_name) {
+        auto eval_dir = Path::Combine(Options["EvalSaveDir"], dir_name);
+        std::cout << "save_eval() start. folder = " << eval_dir << std::endl;
+
+        // Make sure the output folder exists. create_directories() also
+        // creates any missing parent folders and does not treat an
+        // already-existing folder as an error.
+        std::filesystem::create_directories(eval_dir);
+
+        const std::string file_name = Path::Combine(eval_dir, NNUE::savedfileName);
+        std::ofstream stream(file_name, std::ios::binary);
+#ifndef NDEBUG
+        bool result =
+#endif
+        WriteParameters(stream);
+#ifndef NDEBUG
+        assert(result);
+#endif
+
+        std::cout << "save_eval() finished. folder = " << eval_dir << std::endl;
+    }
 }  // namespace Eval::NNUE
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -1,37 +1,36 @@
-// Interface used for learning NNUE evaluation function
-
-#ifndef _EVALUATE_NNUE_LEARNER_H_
+#ifndef _EVALUATE_NNUE_LEARNER_H_
 #define _EVALUATE_NNUE_LEARNER_H_

-#include "../learn/learn.h"
+#include "learn/learn.h"

+// Interface used for learning NNUE evaluation function
 namespace Eval::NNUE {

-  // Initialize learning
-  void InitializeTraining(const std::string& seed);
+    // Initialize learning
+    void InitializeTraining(const std::string& seed);

-  // set the number of samples in the mini-batch
-  void SetBatchSize(uint64_t size);
+    // set the number of samples in the mini-batch
+    void SetBatchSize(uint64_t size);

-  // Set options such as hyperparameters
-  void SetOptions(const std::string& options);
+    // Set options such as hyperparameters
+    void SetOptions(const std::string& options);

-  // Reread the evaluation function parameters for learning from the file
-  void RestoreParameters(const std::string& dir_name);
+    // Reread the evaluation function parameters for learning from the file
+    void RestoreParameters(const std::string& dir_name);

-// Add 1 sample of learning data
-  void AddExample(Position& pos, Color rootColor,
-  	const Learner::PackedSfenValue& psv, double weight);
+    // Add 1 sample of learning data
+    void AddExample(Position& pos, Color rootColor,
+    	 const Learner::PackedSfenValue& psv, double weight);

-  // update the evaluation function parameters
-  void UpdateParameters();
+    // update the evaluation function parameters
+    void UpdateParameters();

-  // Check if there are any problems with learning
-  void CheckHealth();
+    // Check if there are any problems with learning
+    void CheckHealth();

-  void FinalizeNet();
+    void FinalizeNet();

-  void save_eval(std::string suffix);
+    void save_eval(std::string suffix);
 }  // namespace Eval::NNUE

 #endif
--- a/src/nnue/nnue_accumulator.h
+++ b/src/nnue/nnue_accumulator.h
@@ -1,36 +1,34 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)

-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.

-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.

-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

-// Class for difference calculation of NNUE evaluation function
-
 #ifndef NNUE_ACCUMULATOR_H_INCLUDED
 #define NNUE_ACCUMULATOR_H_INCLUDED

 #include "nnue_architecture.h"

+// Class for difference calculation of NNUE evaluation function
 namespace Eval::NNUE {

-  // Class that holds the result of affine transformation of input features
-  struct alignas(kCacheLineSize) Accumulator {
-    std::int16_t
-        accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];
-    bool computed_accumulation;
-  };
+    // Class that holds the result of affine transformation of input features
+    struct alignas(kCacheLineSize) Accumulator {
+        std::int16_t accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];
+        bool computed_accumulation;
+    };

 }  // namespace Eval::NNUE

--- a/src/nnue/nnue_architecture.h
+++ b/src/nnue/nnue_architecture.h
@@ -1,37 +1,36 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)

-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.

-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.

-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

-// Input features and network structure used in NNUE evaluation function
-
 #ifndef NNUE_ARCHITECTURE_H_INCLUDED
 #define NNUE_ARCHITECTURE_H_INCLUDED

 // Defines the network structure
 #include "architectures/halfkp_256x2-32-32.h"

+// Input features and network structure used in NNUE evaluation function
 namespace Eval::NNUE {

-  static_assert(kTransformedFeatureDimensions % kMaxSimdWidth == 0, "");
-  static_assert(Network::kOutputDimensions == 1, "");
-  static_assert(std::is_same<Network::OutputType, std::int32_t>::value, "");
+    static_assert(kTransformedFeatureDimensions % kMaxSimdWidth == 0, "");
+    static_assert(Network::kOutputDimensions == 1, "");
+    static_assert(std::is_same<Network::OutputType, std::int32_t>::value, "");

-  // Trigger for full calculation instead of difference calculation
-  constexpr auto kRefreshTriggers = RawFeatures::kRefreshTriggers;
+    // Trigger for full calculation instead of difference calculation
+    constexpr auto kRefreshTriggers = RawFeatures::kRefreshTriggers;

 }  // namespace Eval::NNUE

--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)

-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.

-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.

-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

 // Constants used in NNUE evaluation function
@@ -21,11 +21,11 @@
 #ifndef NNUE_COMMON_H_INCLUDED
 #define NNUE_COMMON_H_INCLUDED

+#include "types.h"
+
 #include <cstring>
 #include <iostream>

-#include "../types.h"
-
 #if defined(USE_AVX2)
 #include <immintrin.h>

@@ -70,84 +70,84 @@

 namespace Eval::NNUE {

-  // Version of the evaluation file
-  constexpr std::uint32_t kVersion = 0x7AF32F17u;
+    // Version of the evaluation file
+    constexpr std::uint32_t kVersion = 0x7AF32F17u;

-  // Constant used in evaluation value calculation
-  constexpr int FV_SCALE = 16;
-  constexpr int kWeightScaleBits = 6;
+    // Constant used in evaluation value calculation
+    constexpr int FV_SCALE = 16;
+    constexpr int kWeightScaleBits = 6;

-  // Size of cache line (in bytes)
-  constexpr std::size_t kCacheLineSize = 64;
+    // Size of cache line (in bytes)
+    constexpr std::size_t kCacheLineSize = 64;

-  // SIMD width (in bytes)
-  #if defined(USE_AVX2)
-  constexpr std::size_t kSimdWidth = 32;
+    // SIMD width (in bytes)
+#if defined(USE_AVX2)
+    constexpr std::size_t kSimdWidth = 32;

-  #elif defined(USE_SSE2)
-  constexpr std::size_t kSimdWidth = 16;
+#elif defined(USE_SSE2)
+    constexpr std::size_t kSimdWidth = 16;

-  #elif defined(USE_MMX)
-  constexpr std::size_t kSimdWidth = 8;
+#elif defined(USE_MMX)
+    constexpr std::size_t kSimdWidth = 8;

-  #elif defined(USE_NEON)
-  constexpr std::size_t kSimdWidth = 16;
-  #endif
+#elif defined(USE_NEON)
+    constexpr std::size_t kSimdWidth = 16;
+#endif

-  constexpr std::size_t kMaxSimdWidth = 32;
+    constexpr std::size_t kMaxSimdWidth = 32;

-  // unique number for each piece type on each square
-  enum {
-    PS_NONE     =  0,
-    PS_W_PAWN   =  1,
-    PS_B_PAWN   =  1 * SQUARE_NB + 1,
-    PS_W_KNIGHT =  2 * SQUARE_NB + 1,
-    PS_B_KNIGHT =  3 * SQUARE_NB + 1,
-    PS_W_BISHOP =  4 * SQUARE_NB + 1,
-    PS_B_BISHOP =  5 * SQUARE_NB + 1,
-    PS_W_ROOK   =  6 * SQUARE_NB + 1,
-    PS_B_ROOK   =  7 * SQUARE_NB + 1,
-    PS_W_QUEEN  =  8 * SQUARE_NB + 1,
-    PS_B_QUEEN  =  9 * SQUARE_NB + 1,
-    PS_W_KING   = 10 * SQUARE_NB + 1,
-    PS_END      = PS_W_KING, // pieces without kings (pawns included)
-    PS_B_KING   = 11 * SQUARE_NB + 1,
-    PS_END2     = 12 * SQUARE_NB + 1
-  };
+    // unique number for each piece type on each square
+    enum {
+        PS_NONE     =  0,
+        PS_W_PAWN   =  1,
+        PS_B_PAWN   =  1 * SQUARE_NB + 1,
+        PS_W_KNIGHT =  2 * SQUARE_NB + 1,
+        PS_B_KNIGHT =  3 * SQUARE_NB + 1,
+        PS_W_BISHOP =  4 * SQUARE_NB + 1,
+        PS_B_BISHOP =  5 * SQUARE_NB + 1,
+        PS_W_ROOK   =  6 * SQUARE_NB + 1,
+        PS_B_ROOK   =  7 * SQUARE_NB + 1,
+        PS_W_QUEEN  =  8 * SQUARE_NB + 1,
+        PS_B_QUEEN  =  9 * SQUARE_NB + 1,
+        PS_W_KING   = 10 * SQUARE_NB + 1,
+        PS_END      = PS_W_KING, // pieces without kings (pawns included)
+        PS_B_KING   = 11 * SQUARE_NB + 1,
+        PS_END2     = 12 * SQUARE_NB + 1
+    };

-  extern const uint32_t kpp_board_index[PIECE_NB][COLOR_NB];
+    extern const uint32_t kpp_board_index[PIECE_NB][COLOR_NB];

-  // Type of input feature after conversion
-  using TransformedFeatureType = std::uint8_t;
-  using IndexType = std::uint32_t;
+    // Type of input feature after conversion
+    using TransformedFeatureType = std::uint8_t;
+    using IndexType = std::uint32_t;

-  // Forward declaration of learning class template
-  template <typename Layer>
-  class Trainer;
+    // Forward declaration of learning class template
+    template <typename Layer>
+    class Trainer;

-  // Round n up to be a multiple of base
-  template <typename IntType>
-  constexpr IntType CeilToMultiple(IntType n, IntType base) {
-      return (n + base - 1) / base * base;
-  }
+    // Round n up to be a multiple of base
+    template <typename IntType>
+    constexpr IntType CeilToMultiple(IntType n, IntType base) {
+        return (n + base - 1) / base * base;
+    }

-  // read_little_endian() is our utility to read an integer (signed or unsigned, any size)
-  // from a stream in little-endian order. We swap the byte order after the read if
-  // necessary to return a result with the byte ordering of the compiling machine.
-  template <typename IntType>
-  inline IntType read_little_endian(std::istream& stream) {
+    // read_little_endian() is our utility to read an integer (signed or unsigned, any size)
+    // from a stream in little-endian order. We swap the byte order after the read if
+    // necessary to return a result with the byte ordering of the compiling machine.
+    template <typename IntType>
+    inline IntType read_little_endian(std::istream& stream) {

-      IntType result;
-      std::uint8_t u[sizeof(IntType)];
-      typename std::make_unsigned<IntType>::type v = 0;
+        IntType result;
+        std::uint8_t u[sizeof(IntType)];
+        typename std::make_unsigned<IntType>::type v = 0;

-      stream.read(reinterpret_cast<char*>(u), sizeof(IntType));
-      for (std::size_t i = 0; i < sizeof(IntType); ++i)
-          v = (v << 8) | u[sizeof(IntType) - i - 1];
+        stream.read(reinterpret_cast<char*>(u), sizeof(IntType));
+        for (std::size_t i = 0; i < sizeof(IntType); ++i)
+            v = (v << 8) | u[sizeof(IntType) - i - 1];

-      std::memcpy(&result, &v, sizeof(IntType));
-      return result;
-  }
+        std::memcpy(&result, &v, sizeof(IntType));
+        return result;
+    }

 }  // namespace Eval::NNUE

--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)

-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.

-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.

-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

 // A class that converts the input features of the NNUE evaluation function
@@ -23,435 +23,450 @@

 #include "nnue_common.h"
 #include "nnue_architecture.h"
+
 #include "features/index_list.h"

-#include <cstring> // std::memset()
+#include <cstring>
+#include <string>

 namespace Eval::NNUE {

-  // If vector instructions are enabled, we update and refresh the
-  // accumulator tile by tile such that each tile fits in the CPU's
-  // vector registers.
-  #define TILING
+    // If vector instructions are enabled, we update and refresh the
+    // accumulator tile by tile such that each tile fits in the CPU's
+    // vector registers (vec_t is the register type, kNumRegs the registers used per tile).
+#define TILING

-  #ifdef USE_AVX512
-  typedef __m512i vec_t;
-  #define vec_load(a) _mm512_loadA_si512(a)
-  #define vec_store(a,b) _mm512_storeA_si512(a,b)
-  #define vec_add_16(a,b) _mm512_add_epi16(a,b)
-  #define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
-  #define vec_zero _mm512_setzero_si512()
-  static constexpr IndexType kNumRegs = 8; // only 8 are needed
+#ifdef USE_AVX512
+    typedef __m512i vec_t;
+#define vec_load(a) _mm512_loadA_si512(a)
+#define vec_store(a,b) _mm512_storeA_si512(a,b)
+#define vec_add_16(a,b) _mm512_add_epi16(a,b)
+#define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
+#define vec_zero _mm512_setzero_si512()
+    static constexpr IndexType kNumRegs = 8; // only 8 are needed

-  #elif USE_AVX2
-  typedef __m256i vec_t;
-  #define vec_load(a) _mm256_loadA_si256(a)
-  #define vec_store(a,b) _mm256_storeA_si256(a,b)
-  #define vec_add_16(a,b) _mm256_add_epi16(a,b)
-  #define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
-  #define vec_zero _mm256_setzero_si256()
-  static constexpr IndexType kNumRegs = 16;
+#elif USE_AVX2
+    typedef __m256i vec_t;
+#define vec_load(a) _mm256_loadA_si256(a)
+#define vec_store(a,b) _mm256_storeA_si256(a,b)
+#define vec_add_16(a,b) _mm256_add_epi16(a,b)
+#define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
+#define vec_zero _mm256_setzero_si256()
+    static constexpr IndexType kNumRegs = 16;

-  #elif USE_SSE2
-  typedef __m128i vec_t;
-  #define vec_load(a) (*(a))
-  #define vec_store(a,b) *(a)=(b)
-  #define vec_add_16(a,b) _mm_add_epi16(a,b)
-  #define vec_sub_16(a,b) _mm_sub_epi16(a,b)
-  #define vec_zero _mm_setzero_si128()
-  static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8;
+#elif USE_SSE2
+    typedef __m128i vec_t;
+#define vec_load(a) (*(a))
+#define vec_store(a,b) *(a)=(b)
+#define vec_add_16(a,b) _mm_add_epi16(a,b)
+#define vec_sub_16(a,b) _mm_sub_epi16(a,b)
+#define vec_zero _mm_setzero_si128()
+    static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8;

-  #elif USE_MMX
-  typedef __m64 vec_t;
-  #define vec_load(a) (*(a))
-  #define vec_store(a,b) *(a)=(b)
-  #define vec_add_16(a,b) _mm_add_pi16(a,b)
-  #define vec_sub_16(a,b) _mm_sub_pi16(a,b)
-  #define vec_zero _mm_setzero_si64()
-  static constexpr IndexType kNumRegs = 8;
+#elif USE_MMX
+    typedef __m64 vec_t;
+#define vec_load(a) (*(a))
+#define vec_store(a,b) *(a)=(b)
+#define vec_add_16(a,b) _mm_add_pi16(a,b)
+#define vec_sub_16(a,b) _mm_sub_pi16(a,b)
+#define vec_zero _mm_setzero_si64()
+    static constexpr IndexType kNumRegs = 8;

-  #elif USE_NEON
-  typedef int16x8_t vec_t;
-  #define vec_load(a) (*(a))
-  #define vec_store(a,b) *(a)=(b)
-  #define vec_add_16(a,b) vaddq_s16(a,b)
-  #define vec_sub_16(a,b) vsubq_s16(a,b)
-  #define vec_zero {0}
-  static constexpr IndexType kNumRegs = 16;
+#elif USE_NEON
+    typedef int16x8_t vec_t;
+#define vec_load(a) (*(a))
+#define vec_store(a,b) *(a)=(b)
+#define vec_add_16(a,b) vaddq_s16(a,b)
+#define vec_sub_16(a,b) vsubq_s16(a,b)
+#define vec_zero {0}
+    static constexpr IndexType kNumRegs = 16;

-  #else
-  #undef TILING
+#else // none of the vector extensions above is enabled: turn tiling off so the scalar code paths are compiled
+#undef TILING

-  #endif
+#endif // vector extension selection

-  // Input feature converter
-  class FeatureTransformer {
+    // Input feature converter
+    class FeatureTransformer {

-   private:
-    // Number of output dimensions for one side
-    static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;
+    private:
+        // Number of output dimensions for one side
+        static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;

-    #ifdef TILING
-    static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2;
-    static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions");
-    #endif
+#ifdef TILING
+        static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2;  // elements per tile: kNumRegs registers, 2 bytes per element
+        static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions");
+#endif

-   public:
-    // Output type
-    using OutputType = TransformedFeatureType;
+    public:
+        // Output type (element type of the buffer filled by Transform)
+        using OutputType = TransformedFeatureType;

-    // Number of input/output dimensions
-    static constexpr IndexType kInputDimensions = RawFeatures::kDimensions;
-    static constexpr IndexType kOutputDimensions = kHalfDimensions * 2;
+        // Number of input/output dimensions
+        static constexpr IndexType kInputDimensions = RawFeatures::kDimensions;
+        static constexpr IndexType kOutputDimensions = kHalfDimensions * 2;  // one kHalfDimensions-sized half per perspective

-    // Size of forward propagation buffer
-    static constexpr std::size_t kBufferSize =
-        kOutputDimensions * sizeof(OutputType);
+        // Size of forward propagation buffer, in bytes
+        static constexpr std::size_t kBufferSize =
+            kOutputDimensions * sizeof(OutputType);

-    // Hash value embedded in the evaluation file
-    static constexpr std::uint32_t GetHashValue() {
+        // Hash value embedded in the evaluation file: the raw feature hash XORed with the output width
+        static constexpr std::uint32_t GetHashValue() {

-      return RawFeatures::kHashValue ^ kOutputDimensions;
-    }
-
-    // a string representing the structure
-    static std::string GetStructureString() {
-      return RawFeatures::GetName() + "[" +
-        std::to_string(kInputDimensions) + "->" +
-        std::to_string(kHalfDimensions) + "x2]";
-    }
-
-    // Read network parameters
-    bool ReadParameters(std::istream& stream) {
-
-      for (std::size_t i = 0; i < kHalfDimensions; ++i)
-        biases_[i] = read_little_endian<BiasType>(stream);
-      for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i)
-        weights_[i] = read_little_endian<WeightType>(stream);
-      return !stream.fail();
-    }
-
-    // write parameters
-    bool WriteParameters(std::ostream& stream) const {
-      stream.write(reinterpret_cast<const char*>(biases_),
-        kHalfDimensions * sizeof(BiasType));
-      stream.write(reinterpret_cast<const char*>(weights_),
-        kHalfDimensions * kInputDimensions * sizeof(WeightType));
-      return !stream.fail();
-    }
-
-    // Proceed with the difference calculation if possible
-    bool UpdateAccumulatorIfPossible(const Position& pos) const {
-
-      const auto now = pos.state();
-      if (now->accumulator.computed_accumulation)
-        return true;
-
-      const auto prev = now->previous;
-      if (prev && prev->accumulator.computed_accumulation) {
-        UpdateAccumulator(pos);
-        return true;
-      }
-
-      return false;
-    }
-
-    // Convert input features
-    void Transform(const Position& pos, OutputType* output) const {
-
-      if (!UpdateAccumulatorIfPossible(pos))
-        RefreshAccumulator(pos);
-
-      const auto& accumulation = pos.state()->accumulator.accumulation;
-
-  #if defined(USE_AVX2)
-      constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
-      constexpr int kControl = 0b11011000;
-      const __m256i kZero = _mm256_setzero_si256();
-
-  #elif defined(USE_SSE2)
-      constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
-
-  #ifdef USE_SSE41
-      const __m128i kZero = _mm_setzero_si128();
-  #else
-      const __m128i k0x80s = _mm_set1_epi8(-128);
-  #endif
-
-  #elif defined(USE_MMX)
-      constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
-      const __m64 k0x80s = _mm_set1_pi8(-128);
-
-  #elif defined(USE_NEON)
-      constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-      const int8x8_t kZero = {0};
-  #endif
-
-      const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
-      for (IndexType p = 0; p < 2; ++p) {
-        const IndexType offset = kHalfDimensions * p;
-
-  #if defined(USE_AVX2)
-        auto out = reinterpret_cast<__m256i*>(&output[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m256i sum0 = _mm256_loadA_si256(
-              &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
-          __m256i sum1 = _mm256_loadA_si256(
-            &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
-          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-            sum0 = _mm256_add_epi16(sum0, reinterpret_cast<const __m256i*>(
-                accumulation[perspectives[p]][i])[j * 2 + 0]);
-            sum1 = _mm256_add_epi16(sum1, reinterpret_cast<const __m256i*>(
-                accumulation[perspectives[p]][i])[j * 2 + 1]);
-          }
-          _mm256_storeA_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
-              _mm256_packs_epi16(sum0, sum1), kZero), kControl));
+            return RawFeatures::kHashValue ^ kOutputDimensions;  // presumably compared against the value stored in the net file - verify against the loader
         }

-  #elif defined(USE_SSE2)
-        auto out = reinterpret_cast<__m128i*>(&output[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
-              accumulation[perspectives[p]][0])[j * 2 + 0]);
-          __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
-              accumulation[perspectives[p]][0])[j * 2 + 1]);
-          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-            sum0 = _mm_add_epi16(sum0, reinterpret_cast<const __m128i*>(
-                accumulation[perspectives[p]][i])[j * 2 + 0]);
-            sum1 = _mm_add_epi16(sum1, reinterpret_cast<const __m128i*>(
-                accumulation[perspectives[p]][i])[j * 2 + 1]);
-          }
-      const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
-
-          _mm_store_si128(&out[j],
-
-  #ifdef USE_SSE41
-            _mm_max_epi8(packedbytes, kZero)
-  #else
-            _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
-  #endif
-
-          );
+        // A string representing the structure, of the form "<RawFeatures name>[<inputs>-><half dimensions>x2]"
+        static std::string GetStructureString() {
+            return RawFeatures::GetName() + "[" +
+                std::to_string(kInputDimensions) + "->" +
+                std::to_string(kHalfDimensions) + "x2]";
         }

-  #elif defined(USE_MMX)
-        auto out = reinterpret_cast<__m64*>(&output[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m64 sum0 = *(&reinterpret_cast<const __m64*>(
-              accumulation[perspectives[p]][0])[j * 2 + 0]);
-          __m64 sum1 = *(&reinterpret_cast<const __m64*>(
-              accumulation[perspectives[p]][0])[j * 2 + 1]);
-          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-            sum0 = _mm_add_pi16(sum0, reinterpret_cast<const __m64*>(
-                accumulation[perspectives[p]][i])[j * 2 + 0]);
-            sum1 = _mm_add_pi16(sum1, reinterpret_cast<const __m64*>(
-                accumulation[perspectives[p]][i])[j * 2 + 1]);
-          }
-          const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
-          out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
+        // Read network parameters: all biases first, then all weights. Returns false if the stream has failed.
+        bool ReadParameters(std::istream& stream) {
+
+            for (std::size_t i = 0; i < kHalfDimensions; ++i)
+                biases_[i] = read_little_endian<BiasType>(stream);  // each value is decoded as little-endian
+
+            for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i)
+                weights_[i] = read_little_endian<WeightType>(stream);
+
+            return !stream.fail();  // a short or failed read is only reported here, after both loops
         }

-  #elif defined(USE_NEON)
-        const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-          int16x8_t sum = reinterpret_cast<const int16x8_t*>(
-              accumulation[perspectives[p]][0])[j];
-          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-            sum = vaddq_s16(sum, reinterpret_cast<const int16x8_t*>(
-                accumulation[perspectives[p]][i])[j]);
-          }
-          out[j] = vmax_s8(vqmovn_s16(sum), kZero);
+        // Write network parameters: the biases_ array followed by the weights_ array. Returns false if the stream has failed.
+        bool WriteParameters(std::ostream& stream) const {
+            stream.write(reinterpret_cast<const char*>(biases_),  // NOTE(review): raw memory in host byte order, whereas
+                kHalfDimensions * sizeof(BiasType));              // ReadParameters decodes little-endian - confirm for big-endian hosts
+
+            stream.write(reinterpret_cast<const char*>(weights_),
+                kHalfDimensions * kInputDimensions * sizeof(WeightType));
+
+            return !stream.fail();
         }

-  #else
-        for (IndexType j = 0; j < kHalfDimensions; ++j) {
-          BiasType sum = accumulation[static_cast<int>(perspectives[p])][0][j];
-          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-            sum += accumulation[static_cast<int>(perspectives[p])][i][j];
-          }
-          output[offset + j] = static_cast<OutputType>(
-              std::max<int>(0, std::min<int>(127, sum)));
-        }
-  #endif
+        // Proceed with the difference calculation if possible. Returns true when the accumulator is valid on exit.
+        bool UpdateAccumulatorIfPossible(const Position& pos) const {

-      }
-  #if defined(USE_MMX)
-      _mm_empty();
-  #endif
-    }
+            const auto now = pos.state();
+            if (now->accumulator.computed_accumulation)  // already computed for this state: nothing to do
+                return true;

-   private:
-    // Calculate cumulative value without using difference calculation
-    void RefreshAccumulator(const Position& pos) const {
-
-      auto& accumulator = pos.state()->accumulator;
-      for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-        Features::IndexList active_indices[2];
-        RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
-                                         active_indices);
-        for (Color perspective : { WHITE, BLACK }) {
-    #ifdef TILING
-          for (unsigned j = 0; j < kHalfDimensions / kTileHeight; ++j) {
-            auto accTile = reinterpret_cast<vec_t*>(
-                &accumulator.accumulation[perspective][i][j * kTileHeight]);
-            vec_t acc[kNumRegs];
-
-            if (i == 0) {
-              auto biasesTile = reinterpret_cast<const vec_t*>(
-                  &biases_[j * kTileHeight]);
-              for (unsigned k = 0; k < kNumRegs; ++k)
-                acc[k] = biasesTile[k];
-            } else {
-              for (unsigned k = 0; k < kNumRegs; ++k)
-                acc[k] = vec_zero;
-            }
-            for (const auto index : active_indices[perspective]) {
-              const IndexType offset = kHalfDimensions * index + j * kTileHeight;
-              auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
-
-              for (unsigned k = 0; k < kNumRegs; ++k)
-                acc[k] = vec_add_16(acc[k], column[k]);
+            const auto prev = now->previous;
+            if (prev && prev->accumulator.computed_accumulation) {  // only the immediately preceding state is considered
+                UpdateAccumulator(pos);
+                return true;
             }

-            for (unsigned k = 0; k < kNumRegs; k++)
-              vec_store(&accTile[k], acc[k]);
-          }
-    #else
-          if (i == 0) {
-            std::memcpy(accumulator.accumulation[perspective][i], biases_,
-                        kHalfDimensions * sizeof(BiasType));
-          } else {
-            std::memset(accumulator.accumulation[perspective][i], 0,
-                        kHalfDimensions * sizeof(BiasType));
-          }
-
-          for (const auto index : active_indices[perspective]) {
-            const IndexType offset = kHalfDimensions * index;
-
-            for (IndexType j = 0; j < kHalfDimensions; ++j)
-              accumulator.accumulation[perspective][i][j] += weights_[offset + j];
-          }
-    #endif
+            return false;  // no usable predecessor: the caller has to do a full refresh
         }

-      }
+        // Convert input features: bring the accumulator up to date, then write kOutputDimensions clamped values to output
+        void Transform(const Position& pos, OutputType* output) const {

-  #if defined(USE_MMX)
-      _mm_empty();
-  #endif
+            if (!UpdateAccumulatorIfPossible(pos))  // incremental update when possible, full refresh otherwise
+              RefreshAccumulator(pos);

-      accumulator.computed_accumulation = true;
-    }
+            const auto& accumulation = pos.state()->accumulator.accumulation;

-    // Calculate cumulative value using difference calculation
-    void UpdateAccumulator(const Position& pos) const {
+#if defined(USE_AVX2)
+            constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+            constexpr int kControl = 0b11011000;  // 64-bit element order 0,2,1,3, applied after the pack below
+            const __m256i kZero = _mm256_setzero_si256();

-      const auto& prev_accumulator = pos.state()->previous->accumulator;
-      auto& accumulator = pos.state()->accumulator;
-      for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-        Features::IndexList removed_indices[2], added_indices[2];
-        bool reset[2] = { false, false };
-        RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
-                                          removed_indices, added_indices, reset);
+#elif defined(USE_SSE2)
+            constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;

-    #ifdef TILING
-        for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
-          for (Color perspective : { WHITE, BLACK }) {
-            auto accTile = reinterpret_cast<vec_t*>(
-                &accumulator.accumulation[perspective][i][j * kTileHeight]);
-            vec_t acc[kNumRegs];
+#ifdef USE_SSE41
+            const __m128i kZero = _mm_setzero_si128();
+#else
+            const __m128i k0x80s = _mm_set1_epi8(-128);  // used below in place of the max-with-zero of the SSE4.1 branch
+#endif

-            if (reset[perspective]) {
-              if (i == 0) {
-                auto biasesTile = reinterpret_cast<const vec_t*>(
-                    &biases_[j * kTileHeight]);
-                for (unsigned k = 0; k < kNumRegs; ++k)
-                  acc[k] = biasesTile[k];
-              } else {
-                for (unsigned k = 0; k < kNumRegs; ++k)
-                  acc[k] = vec_zero;
-              }
-            } else {
-              auto prevAccTile = reinterpret_cast<const vec_t*>(
-                  &prev_accumulator.accumulation[perspective][i][j * kTileHeight]);
-              for (IndexType k = 0; k < kNumRegs; ++k)
-                acc[k] = vec_load(&prevAccTile[k]);
+#elif defined(USE_MMX)
+            constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+            const __m64 k0x80s = _mm_set1_pi8(-128);

-              // Difference calculation for the deactivated features
-              for (const auto index : removed_indices[perspective]) {
-                const IndexType offset = kHalfDimensions * index + j * kTileHeight;
-                auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+#elif defined(USE_NEON)
+            constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+            const int8x8_t kZero = {0};
+#endif
+
+            const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};  // side to move fills the first half of output
+            for (IndexType p = 0; p < 2; ++p) {
+                const IndexType offset = kHalfDimensions * p;  // start of this perspective's half in output
+
+#if defined(USE_AVX2)
+                auto out = reinterpret_cast<__m256i*>(&output[offset]);
+                for (IndexType j = 0; j < kNumChunks; ++j) {
+                    __m256i sum0 = _mm256_loadA_si256(
+                        &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
+                    __m256i sum1 = _mm256_loadA_si256(
+                      &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
+                    for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {  // add the accumulations of the remaining triggers
+                        sum0 = _mm256_add_epi16(sum0, reinterpret_cast<const __m256i*>(
+                            accumulation[perspectives[p]][i])[j * 2 + 0]);
+                        sum1 = _mm256_add_epi16(sum1, reinterpret_cast<const __m256i*>(
+                            accumulation[perspectives[p]][i])[j * 2 + 1]);
+                    }
+
+                    _mm256_storeA_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(  // pack to 8 bit, floor at 0, reorder
+                        _mm256_packs_epi16(sum0, sum1), kZero), kControl));
+                }
+
+#elif defined(USE_SSE2)
+                auto out = reinterpret_cast<__m128i*>(&output[offset]);
+                for (IndexType j = 0; j < kNumChunks; ++j) {
+                    __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
+                        accumulation[perspectives[p]][0])[j * 2 + 0]);
+                    __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
+                        accumulation[perspectives[p]][0])[j * 2 + 1]);
+                    for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+                        sum0 = _mm_add_epi16(sum0, reinterpret_cast<const __m128i*>(
+                            accumulation[perspectives[p]][i])[j * 2 + 0]);
+                        sum1 = _mm_add_epi16(sum1, reinterpret_cast<const __m128i*>(
+                            accumulation[perspectives[p]][i])[j * 2 + 1]);
+                    }
+
+                    const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
+
+                    _mm_store_si128(&out[j],
+
+#ifdef USE_SSE41
+                        _mm_max_epi8(packedbytes, kZero)
+#else
+                        _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)  // saturating -128 then +128: same result as max(x, 0)
+#endif
+
+                    );
+                }
+
+#elif defined(USE_MMX)
+                auto out = reinterpret_cast<__m64*>(&output[offset]);
+                for (IndexType j = 0; j < kNumChunks; ++j) {
+                    __m64 sum0 = *(&reinterpret_cast<const __m64*>(
+                        accumulation[perspectives[p]][0])[j * 2 + 0]);
+                    __m64 sum1 = *(&reinterpret_cast<const __m64*>(
+                        accumulation[perspectives[p]][0])[j * 2 + 1]);
+                    for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+                        sum0 = _mm_add_pi16(sum0, reinterpret_cast<const __m64*>(
+                            accumulation[perspectives[p]][i])[j * 2 + 0]);
+                        sum1 = _mm_add_pi16(sum1, reinterpret_cast<const __m64*>(
+                            accumulation[perspectives[p]][i])[j * 2 + 1]);
+                    }
+
+                    const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
+                    out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);  // same saturating trick as the SSE2 fallback
+                }
+
+#elif defined(USE_NEON)
+                const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
+                for (IndexType j = 0; j < kNumChunks; ++j) {
+                    int16x8_t sum = reinterpret_cast<const int16x8_t*>(
+                        accumulation[perspectives[p]][0])[j];
+
+                    for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+                        sum = vaddq_s16(sum, reinterpret_cast<const int16x8_t*>(
+                            accumulation[perspectives[p]][i])[j]);
+                    }
+
+                    out[j] = vmax_s8(vqmovn_s16(sum), kZero);
+                }
+
+#else
+                for (IndexType j = 0; j < kHalfDimensions; ++j) {  // scalar fallback: one output element at a time
+                    BiasType sum = accumulation[static_cast<int>(perspectives[p])][0][j];
+                    for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+                        sum += accumulation[static_cast<int>(perspectives[p])][i][j];
+                    }
+
+                    output[offset + j] = static_cast<OutputType>(
+                        std::max<int>(0, std::min<int>(127, sum)));  // clamp to [0, 127]
+                }
+#endif

-                for (IndexType k = 0; k < kNumRegs; ++k)
-                  acc[k] = vec_sub_16(acc[k], column[k]);
-              }
             }
-            { // Difference calculation for the activated features
-              for (const auto index : added_indices[perspective]) {
-                const IndexType offset = kHalfDimensions * index + j * kTileHeight;
-                auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
-
-                for (IndexType k = 0; k < kNumRegs; ++k)
-                  acc[k] = vec_add_16(acc[k], column[k]);
-              }
-            }
-
-            for (IndexType k = 0; k < kNumRegs; ++k)
-              vec_store(&accTile[k], acc[k]);
-          }
+#if defined(USE_MMX)
+            _mm_empty();
+#endif
         }
-    #if defined(USE_MMX)
-        _mm_empty();
-    #endif

-    #else
-        for (Color perspective : { WHITE, BLACK }) {
+    private:
+        // Calculate cumulative value without using difference calculation: rebuild from biases and all active features
+        void RefreshAccumulator(const Position& pos) const {
+
+            auto& accumulator = pos.state()->accumulator;
+            for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+                Features::IndexList active_indices[2];  // one list per perspective
+                RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
+                                                 active_indices);
+                for (Color perspective : { WHITE, BLACK }) {
+#ifdef TILING
+                    for (unsigned j = 0; j < kHalfDimensions / kTileHeight; ++j) {  // one tile of kTileHeight elements per pass
+                        auto accTile = reinterpret_cast<vec_t*>(
+                            &accumulator.accumulation[perspective][i][j * kTileHeight]);
+                        vec_t acc[kNumRegs];
+
+                        if (i == 0) {  // only the first trigger starts from the biases, the others start from zero
+                            auto biasesTile = reinterpret_cast<const vec_t*>(
+                                &biases_[j * kTileHeight]);
+                            for (unsigned k = 0; k < kNumRegs; ++k)
+                                acc[k] = biasesTile[k];
+                        } else {
+                            for (unsigned k = 0; k < kNumRegs; ++k)
+                                acc[k] = vec_zero;
+                        }
+
+                        for (const auto index : active_indices[perspective]) {
+                            const IndexType offset = kHalfDimensions * index + j * kTileHeight;  // this tile's slice of the feature's weights
+                            auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+
+                            for (unsigned k = 0; k < kNumRegs; ++k)
+                                acc[k] = vec_add_16(acc[k], column[k]);
+                        }
+
+                        for (unsigned k = 0; k < kNumRegs; k++)
+                            vec_store(&accTile[k], acc[k]);  // write the finished tile back to the accumulator
+                    }
+#else
+                    if (i == 0) {  // same initialisation as the tiled path, done on the whole array
+                        std::memcpy(accumulator.accumulation[perspective][i], biases_,
+                                    kHalfDimensions * sizeof(BiasType));
+                    } else {
+                        std::memset(accumulator.accumulation[perspective][i], 0,
+                                    kHalfDimensions * sizeof(BiasType));
+                    }
+
+                    for (const auto index : active_indices[perspective]) {
+                        const IndexType offset = kHalfDimensions * index;
+
+                        for (IndexType j = 0; j < kHalfDimensions; ++j)
+                            accumulator.accumulation[perspective][i][j] += weights_[offset + j];
+                    }
+#endif
+                }

-          if (reset[perspective]) {
-            if (i == 0) {
-              std::memcpy(accumulator.accumulation[perspective][i], biases_,
-                          kHalfDimensions * sizeof(BiasType));
-            } else {
-              std::memset(accumulator.accumulation[perspective][i], 0,
-                          kHalfDimensions * sizeof(BiasType));
             }
-          } else {
-            std::memcpy(accumulator.accumulation[perspective][i],
-                        prev_accumulator.accumulation[perspective][i],
-                        kHalfDimensions * sizeof(BiasType));
-            // Difference calculation for the deactivated features
-            for (const auto index : removed_indices[perspective]) {
-              const IndexType offset = kHalfDimensions * index;

-              for (IndexType j = 0; j < kHalfDimensions; ++j)
-                accumulator.accumulation[perspective][i][j] -= weights_[offset + j];
-            }
-          }
-          { // Difference calculation for the activated features
-            for (const auto index : added_indices[perspective]) {
-              const IndexType offset = kHalfDimensions * index;
+#if defined(USE_MMX)
+            _mm_empty();
+#endif

-              for (IndexType j = 0; j < kHalfDimensions; ++j)
-                accumulator.accumulation[perspective][i][j] += weights_[offset + j];
-            }
-          }
+            accumulator.computed_accumulation = true;  // lets UpdateAccumulatorIfPossible return early for this state
         }
-    #endif
-      }
-      accumulator.computed_accumulation = true;
-    }

-    using BiasType = std::int16_t;
-    using WeightType = std::int16_t;
+        // Incrementally compute this state's accumulator from the previous state's accumulator
+        void UpdateAccumulator(const Position& pos) const {

-    // Make the learning class a friend
-    friend class Trainer<FeatureTransformer>;
+            const auto& prev_accumulator = pos.state()->previous->accumulator; // source: accumulator of the previous state
+            auto& accumulator = pos.state()->accumulator; // destination: accumulator of the current state
+            for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+                Features::IndexList removed_indices[2], added_indices[2];
+                bool reset[2] = { false, false }; // per perspective: true means rebuild instead of starting from the previous values
+                RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
+                                                  removed_indices, added_indices, reset);

-    alignas(kCacheLineSize) BiasType biases_[kHalfDimensions];
-    alignas(kCacheLineSize)
-        WeightType weights_[kHalfDimensions * kInputDimensions];
-  };
+#ifdef TILING
+                for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
+                    for (Color perspective : { WHITE, BLACK }) {
+                        auto accTile = reinterpret_cast<vec_t*>(
+                            &accumulator.accumulation[perspective][i][j * kTileHeight]);
+                        vec_t acc[kNumRegs];
+
+                        if (reset[perspective]) { // rebuild: start from the biases (i == 0) or from zero
+                            if (i == 0) {
+                                auto biasesTile = reinterpret_cast<const vec_t*>(
+                                    &biases_[j * kTileHeight]);
+                                for (unsigned k = 0; k < kNumRegs; ++k)
+                                    acc[k] = biasesTile[k];
+                            } else {
+                                for (unsigned k = 0; k < kNumRegs; ++k)
+                                    acc[k] = vec_zero;
+                            }
+                        } else { // incremental: start from the previous accumulator's tile
+                            auto prevAccTile = reinterpret_cast<const vec_t*>(
+                                &prev_accumulator.accumulation[perspective][i][j * kTileHeight]);
+
+                            for (IndexType k = 0; k < kNumRegs; ++k)
+                                acc[k] = vec_load(&prevAccTile[k]);
+
+                            // Subtract the weight columns of the deactivated (removed) features
+                            for (const auto index : removed_indices[perspective]) {
+                                const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+                                auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+
+                                for (IndexType k = 0; k < kNumRegs; ++k)
+                                    acc[k] = vec_sub_16(acc[k], column[k]);
+                            }
+                        }
+
+                        { // Add the weight columns of the activated (added) features; done in both the reset and the incremental case
+                          for (const auto index : added_indices[perspective]) {
+                              const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+                              auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+
+                              for (IndexType k = 0; k < kNumRegs; ++k)
+                                  acc[k] = vec_add_16(acc[k], column[k]);
+                          }
+                        }
+
+                        for (IndexType k = 0; k < kNumRegs; ++k)
+                          vec_store(&accTile[k], acc[k]);
+                    }
+                }
+#if defined(USE_MMX)
+                _mm_empty(); // clear the MMX state once the MMX work for this trigger is done
+#endif
+
+#else
+                for (Color perspective : { WHITE, BLACK }) {
+
+                    if (reset[perspective]) { // rebuild: start from the biases (i == 0) or from zero
+                        if (i == 0) {
+                            std::memcpy(accumulator.accumulation[perspective][i], biases_,
+                                        kHalfDimensions * sizeof(BiasType));
+                        } else {
+                            std::memset(accumulator.accumulation[perspective][i], 0,
+                                        kHalfDimensions * sizeof(BiasType));
+                        }
+                    } else { // incremental: start from a copy of the previous accumulator
+                        std::memcpy(accumulator.accumulation[perspective][i],
+                                    prev_accumulator.accumulation[perspective][i],
+                                    kHalfDimensions * sizeof(BiasType));
+                        // Subtract the weight columns of the deactivated (removed) features
+                        for (const auto index : removed_indices[perspective]) {
+                            const IndexType offset = kHalfDimensions * index;
+
+                            for (IndexType j = 0; j < kHalfDimensions; ++j)
+                                accumulator.accumulation[perspective][i][j] -= weights_[offset + j];
+                        }
+                    }
+                    { // Add the weight columns of the activated (added) features; done in both the reset and the incremental case
+                        for (const auto index : added_indices[perspective]) {
+                          const IndexType offset = kHalfDimensions * index;
+
+                          for (IndexType j = 0; j < kHalfDimensions; ++j)
+                              accumulator.accumulation[perspective][i][j] += weights_[offset + j];
+                        }
+                    }
+                }
+#endif
+            }
+            accumulator.computed_accumulation = true; // flag that this state's accumulator has been computed
+        }
+
+        using BiasType = std::int16_t;
+        using WeightType = std::int16_t;
+
+        // Make the learning class a friend
+        friend class Trainer<FeatureTransformer>;
+
+        alignas(kCacheLineSize) BiasType biases_[kHalfDimensions];
+        alignas(kCacheLineSize)
+            WeightType weights_[kHalfDimensions * kInputDimensions];
+    };

 }  // namespace Eval::NNUE

-#endif // #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
+#endif //#ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
--- a/src/nnue/nnue_test_command.cpp
+++ b/src/nnue/nnue_test_command.cpp
@@ -1,197 +1,215 @@
-// USI extended command for NNUE evaluation function
-
-#include "../thread.h"
-#include "../uci.h"
-#include "evaluate_nnue.h"
+#include "evaluate_nnue.h"
 #include "nnue_test_command.h"

+#include "thread.h"
+#include "uci.h"
+
 #include <set>
 #include <fstream>

-#define ASSERT(X) { if (!(X)) { std::cout << "\nError : ASSERT(" << #X << "), " << __FILE__ << "(" << __LINE__ << "): " << __func__ << std::endl; \
- std::this_thread::sleep_for(std::chrono::microseconds(3000)); *(int*)1 =0;} }
-
-namespace Eval {
-
-namespace NNUE {
-
-namespace {
-
-// Testing RawFeatures mainly for difference calculation
-void TestFeatures(Position& pos) {
-  const std::uint64_t num_games = 1000;
-  StateInfo si;
-  pos.set(StartFEN, false, &si, Threads.main());
-  const int MAX_PLY = 256; // test up to 256 hands
-
-  StateInfo state[MAX_PLY]; // StateInfo only for the maximum number of steps
-  int ply; // Trouble from the initial phase
-
-  PRNG prng(20171128);
-
-  std::uint64_t num_moves = 0;
-  std::vector<std::uint64_t> num_updates(kRefreshTriggers.size() + 1);
-  std::vector<std::uint64_t> num_resets(kRefreshTriggers.size());
-  constexpr IndexType kUnknown = -1;
-  std::vector<IndexType> trigger_map(RawFeatures::kDimensions, kUnknown);
-  auto make_index_sets = [&](const Position& position) {
-    std::vector<std::vector<std::set<IndexType>>> index_sets(
-        kRefreshTriggers.size(), std::vector<std::set<IndexType>>(2));
-    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-      Features::IndexList active_indices[2];
-      RawFeatures::AppendActiveIndices(position, kRefreshTriggers[i],
-                                       active_indices);
-      for (const auto perspective : Colors) {
-        for (const auto index : active_indices[perspective]) {
-          ASSERT(index < RawFeatures::kDimensions);
-          ASSERT(index_sets[i][perspective].count(index) == 0);
-          ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
-          index_sets[i][perspective].insert(index);
-          trigger_map[index] = i;
-        }
-      }
-    }
-    return index_sets;
-  };
-  auto update_index_sets = [&](const Position& position, auto* index_sets) {
-    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-      Features::IndexList removed_indices[2], added_indices[2];
-      bool reset[2] = { false, false };
-      RawFeatures::AppendChangedIndices(position, kRefreshTriggers[i],
-                                        removed_indices, added_indices, reset);
-      for (const auto perspective : Colors) {
-        if (reset[perspective]) {
-          (*index_sets)[i][perspective].clear();
-          ++num_resets[i];
-        } else {
-          for (const auto index : removed_indices[perspective]) {
-            ASSERT(index < RawFeatures::kDimensions);
-            ASSERT((*index_sets)[i][perspective].count(index) == 1);
-            ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
-            (*index_sets)[i][perspective].erase(index);
-            ++num_updates.back();
-            ++num_updates[i];
-            trigger_map[index] = i;
-          }
-        }
-        for (const auto index : added_indices[perspective]) {
-          ASSERT(index < RawFeatures::kDimensions);
-          ASSERT((*index_sets)[i][perspective].count(index) == 0);
-          ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
-          (*index_sets)[i][perspective].insert(index);
-          ++num_updates.back();
-          ++num_updates[i];
-          trigger_map[index] = i;
-        }
-      }
-    }
-  };
-
-  std::cout << "feature set: " << RawFeatures::GetName()
-            << "[" << RawFeatures::kDimensions << "]" << std::endl;
-  std::cout << "start testing with random games";
-
-  for (std::uint64_t i = 0; i < num_games; ++i) {
-    auto index_sets = make_index_sets(pos);
-    for (ply = 0; ply < MAX_PLY; ++ply) {
-      MoveList<LEGAL> mg(pos); // Generate all legal hands
-
-      // There was no legal move == Clog
-      if (mg.size() == 0)
-        break;
-
-      // Randomly choose from the generated moves and advance the phase with the moves.
-      Move m = mg.begin()[prng.rand(mg.size())];
-      pos.do_move(m, state[ply]);
-
-      ++num_moves;
-      update_index_sets(pos, &index_sets);
-      ASSERT(index_sets == make_index_sets(pos));
-    }
-
-    pos.set(StartFEN, false, &si, Threads.main());
-
-    // Output'.' every 100 times (so you can see that it's progressing)
-    if ((i % 100) == 0)
-      std::cout << "." << std::flush;
-  }
-  std::cout << "passed." << std::endl;
-  std::cout << num_games << " games, " << num_moves << " moves, "
-            << num_updates.back() << " updates, "
-            << (1.0 * num_updates.back() / num_moves)
-            << " updates per move" << std::endl;
-  std::size_t num_observed_indices = 0;
-  for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-    const auto count = std::count(trigger_map.begin(), trigger_map.end(), i);
-    num_observed_indices += count;
-    std::cout << "TriggerEvent(" << static_cast<int>(kRefreshTriggers[i])
-              << "): " << count << " features ("
-              << (100.0 * count / RawFeatures::kDimensions) << "%), "
-              << num_updates[i] << " updates ("
-              << (1.0 * num_updates[i] / num_moves) << " per move), "
-              << num_resets[i] << " resets ("
-              << (100.0 * num_resets[i] / num_moves) << "%)"
-              << std::endl;
-  }
-  std::cout << "observed " << num_observed_indices << " ("
-            << (100.0 * num_observed_indices / RawFeatures::kDimensions)
-            << "% of " << RawFeatures::kDimensions
-            << ") features" << std::endl;
+#define ASSERT(X) { /* test-only check: prints the failed expression with file, line and function */ \
+    if (!(X)) { \
+        std::cout \
+            << "\nError : ASSERT(" << #X << "), " \
+            << __FILE__ << "(" << __LINE__ << "): " \
+            << __func__ << std::endl; \
+            std::this_thread::sleep_for(std::chrono::microseconds(3000)); /* short pause before the write below */ \
+            *(int*)1 =0; /* NOTE(review): write through an invalid pointer (undefined behaviour), presumably to force a crash */ \
+    } \
 }

-// Output a string that represents the structure of the evaluation function
-void PrintInfo(std::istream& stream) {
-  std::cout << "network architecture: " << GetArchitectureString() << std::endl;
-
-  while (true) {
-    std::string file_name;
-    stream >> file_name;
-    if (file_name.empty()) break;
-
-    std::uint32_t hash_value;
-    std::string architecture;
-    const bool success = [&]() {
-      std::ifstream file_stream(file_name, std::ios::binary);
-      if (!file_stream) return false;
-      if (!ReadHeader(file_stream, &hash_value, &architecture)) return false;
-      return true;
-    }();
-
-    std::cout << file_name << ": ";
-    if (success) {
-      if (hash_value == kHashValue) {
-        std::cout << "matches with this binary";
-        if (architecture != GetArchitectureString()) {
-          std::cout << ", but architecture string differs: " << architecture;
-        }
-        std::cout << std::endl;
-      } else {
-        std::cout << architecture << std::endl;
-      }
-    } else {
-      std::cout << "failed to read header" << std::endl;
-    }
-  }
-}
-
-}  // namespace
-
 // USI extended command for NNUE evaluation function
-void TestCommand(Position& pos, std::istream& stream) {
-  std::string sub_command;
-  stream >> sub_command;
+namespace Eval::NNUE {

-  if (sub_command == "test_features") {
-    TestFeatures(pos);
-  } else if (sub_command == "info") {
-    PrintInfo(stream);
-  } else {
-    std::cout << "usage:" << std::endl;
-    std::cout << " test nnue test_features" << std::endl;
-    std::cout << " test nnue info [path/to/" << fileName << "...]" << std::endl;
-  }
-}
+    namespace {

-}  // namespace NNUE
+        // Self-test of RawFeatures: plays random games and checks after every move that the incrementally updated index sets equal a full recomputation
+        void TestFeatures(Position& pos) {
+            const std::uint64_t num_games = 1000;
+            StateInfo si;
+            pos.set(StartFEN, false, &si, Threads.main());
+            const int MAX_PLY = 256; // play at most 256 plies per random game

-}  // namespace Eval
+            StateInfo state[MAX_PLY]; // one StateInfo per ply, up to MAX_PLY
+            int ply; // number of plies played from the starting position
+
+            PRNG prng(20171128);
+
+            std::uint64_t num_moves = 0;
+            std::vector<std::uint64_t> num_updates(kRefreshTriggers.size() + 1);
+            std::vector<std::uint64_t> num_resets(kRefreshTriggers.size());
+            constexpr IndexType kUnknown = -1;
+            std::vector<IndexType> trigger_map(RawFeatures::kDimensions, kUnknown);
+
+            auto make_index_sets = [&](const Position& position) {
+                std::vector<std::vector<std::set<IndexType>>> index_sets(
+                    kRefreshTriggers.size(), std::vector<std::set<IndexType>>(2));
+
+                for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+                    Features::IndexList active_indices[2];
+                    RawFeatures::AppendActiveIndices(position, kRefreshTriggers[i],
+                                                     active_indices);
+
+                    for (const auto perspective : Colors) {
+                        for (const auto index : active_indices[perspective]) {
+                            ASSERT(index < RawFeatures::kDimensions);
+                            ASSERT(index_sets[i][perspective].count(index) == 0);
+                            ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
+                            index_sets[i][perspective].insert(index);
+                            trigger_map[index] = i;
+                        }
+                    }
+                }
+
+                return index_sets;
+            };
+
+            auto update_index_sets = [&](const Position& position, auto* index_sets) {
+                for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+                    Features::IndexList removed_indices[2], added_indices[2];
+                    bool reset[2] = { false, false };
+                    RawFeatures::AppendChangedIndices(position, kRefreshTriggers[i],
+                                                      removed_indices, added_indices, reset);
+                    for (const auto perspective : Colors) {
+                        if (reset[perspective]) {
+                            (*index_sets)[i][perspective].clear();
+                            ++num_resets[i];
+                        } else {
+                            for (const auto index : removed_indices[perspective]) {
+                                ASSERT(index < RawFeatures::kDimensions);
+                                ASSERT((*index_sets)[i][perspective].count(index) == 1);
+                                ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
+                                (*index_sets)[i][perspective].erase(index);
+                                ++num_updates.back();
+                                ++num_updates[i];
+                                trigger_map[index] = i;
+                            }
+                        }
+
+                        for (const auto index : added_indices[perspective]) {
+                            ASSERT(index < RawFeatures::kDimensions);
+                            ASSERT((*index_sets)[i][perspective].count(index) == 0);
+                            ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
+                            (*index_sets)[i][perspective].insert(index);
+                            ++num_updates.back();
+                            ++num_updates[i];
+                            trigger_map[index] = i;
+                        }
+                    }
+                }
+            };
+
+            std::cout << "feature set: " << RawFeatures::GetName()
+                      << "[" << RawFeatures::kDimensions << "]" << std::endl;
+            std::cout << "start testing with random games";
+
+            for (std::uint64_t i = 0; i < num_games; ++i) {
+                auto index_sets = make_index_sets(pos);
+                for (ply = 0; ply < MAX_PLY; ++ply) {
+                    MoveList<LEGAL> mg(pos); // generate all legal moves
+
+                    // No legal move is available: this random game ends here
+                    if (mg.size() == 0)
+                        break;
+
+                    // Pick one of the generated moves at random and play it on the position.
+                    Move m = mg.begin()[prng.rand(mg.size())];
+                    pos.do_move(m, state[ply]);
+
+                    ++num_moves;
+                    update_index_sets(pos, &index_sets);
+                    ASSERT(index_sets == make_index_sets(pos));
+                }
+
+                pos.set(StartFEN, false, &si, Threads.main());
+
+                // Print a '.' every 100 games as a progress indicator
+                if ((i % 100) == 0)
+                    std::cout << "." << std::flush;
+            }
+
+            std::cout << "passed." << std::endl;
+            std::cout << num_games << " games, " << num_moves << " moves, "
+                      << num_updates.back() << " updates, "
+                      << (1.0 * num_updates.back() / num_moves)
+                      << " updates per move" << std::endl;
+            std::size_t num_observed_indices = 0;
+
+            for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+                const auto count = std::count(trigger_map.begin(), trigger_map.end(), i);
+                num_observed_indices += count;
+                std::cout << "TriggerEvent(" << static_cast<int>(kRefreshTriggers[i])
+                          << "): " << count << " features ("
+                          << (100.0 * count / RawFeatures::kDimensions) << "%), "
+                          << num_updates[i] << " updates ("
+                          << (1.0 * num_updates[i] / num_moves) << " per move), "
+                          << num_resets[i] << " resets ("
+                          << (100.0 * num_resets[i] / num_moves) << "%)"
+                          << std::endl;
+            }
+            std::cout << "observed " << num_observed_indices << " ("
+                      << (100.0 * num_observed_indices / RawFeatures::kDimensions)
+                      << "% of " << RawFeatures::kDimensions
+                      << ") features" << std::endl;
+        }
+
+        // Print this binary's architecture string, then report for each file name read from stream whether its header could be read and matches
+        void PrintInfo(std::istream& stream) {
+            std::cout << "network architecture: " << GetArchitectureString() << std::endl;
+
+            while (true) {
+                std::string file_name;
+                stream >> file_name;
+                if (file_name.empty()) // no further file name could be read from the stream
+                    break;
+
+                std::uint32_t hash_value;
+                std::string architecture;
+                const bool success = [&]() { // true only if the file opens and ReadHeader succeeds
+                    std::ifstream file_stream(file_name, std::ios::binary);
+
+                    if (!file_stream)
+                        return false;
+                    if (!ReadHeader(file_stream, &hash_value, &architecture))
+                        return false;
+
+                    return true;
+                }();
+
+                std::cout << file_name << ": ";
+                if (success) {
+                    if (hash_value == kHashValue) { // header hash equals this binary's kHashValue
+                        std::cout << "matches with this binary";
+                        if (architecture != GetArchitectureString()) {
+                            std::cout << ", but architecture string differs: " << architecture;
+                        }
+
+                        std::cout << std::endl;
+                    } else {
+                        std::cout << architecture << std::endl;
+                    }
+                } else {
+                    std::cout << "failed to read header" << std::endl;
+                }
+            }
+        }
+
+    }  // namespace
+
+    // Entry point of the extended "test nnue" UCI command: reads the sub-command from stream and dispatches it
+    void TestCommand(Position& pos, std::istream& stream) {
+        std::string sub_command;
+        stream >> sub_command;
+
+        if (sub_command == "test_features") {
+            TestFeatures(pos);
+        } else if (sub_command == "info") {
+            PrintInfo(stream);
+        } else { // unknown or missing sub-command: print the usage text
+            std::cout << "usage:" << std::endl;
+            std::cout << " test nnue test_features" << std::endl;
+            std::cout << " test nnue info [path/to/" << fileName << "...]" << std::endl;
+        }
+    }
+
+}  // namespace Eval::NNUE
--- a/src/nnue/nnue_test_command.h
+++ b/src/nnue/nnue_test_command.h
@@ -1,17 +1,12 @@
-// USI extended command interface for NNUE evaluation function
-
-#ifndef _NNUE_TEST_COMMAND_H_
+#ifndef _NNUE_TEST_COMMAND_H_
 #define _NNUE_TEST_COMMAND_H_

-namespace Eval {
+// Interface of the extended UCI test command for the NNUE evaluation function
+namespace Eval::NNUE {

-namespace NNUE {
+    // Runs the extended NNUE test command; the sub-command and its arguments are read from stream
+    void TestCommand(Position& pos, std::istream& stream);

-// USI extended command for NNUE evaluation function
-void TestCommand(Position& pos, std::istream& stream);
-
-}  // namespace NNUE
-
-}  // namespace Eval
+}  // namespace Eval::NNUE

 #endif