Merge remote-tracking branch 'remotes/official/master' into merge

2025-12-26 20:16:14 +08:00 · 2020-11-28 06:19:16 +08:00
parent 92b14a5ba2 190dd26b9f
commit 0b2ae6cb64
16 changed files with 1086 additions and 988 deletions
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -1,19 +1,19 @@
 /*
-    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)

-    Stockfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.

-    Stockfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.

-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

 // Code for calculating NNUE evaluation function
@@ -40,330 +40,313 @@

 namespace Eval::NNUE {

-    const uint32_t kpp_board_index[PIECE_NB][COLOR_NB] = {
-        // convention: W - us, B - them
-        // viewed from other side, W and B are reversed
-        { PS_NONE,     PS_NONE     },
-        { PS_W_PAWN,   PS_B_PAWN   },
-        { PS_W_KNIGHT, PS_B_KNIGHT },
-        { PS_W_BISHOP, PS_B_BISHOP },
-        { PS_W_ROOK,   PS_B_ROOK   },
-        { PS_W_QUEEN,  PS_B_QUEEN  },
-        { PS_W_KING,   PS_B_KING   },
-        { PS_NONE,     PS_NONE     },
-        { PS_NONE,     PS_NONE     },
-        { PS_B_PAWN,   PS_W_PAWN   },
-        { PS_B_KNIGHT, PS_W_KNIGHT },
-        { PS_B_BISHOP, PS_W_BISHOP },
-        { PS_B_ROOK,   PS_W_ROOK   },
-        { PS_B_QUEEN,  PS_W_QUEEN  },
-        { PS_B_KING,   PS_W_KING   },
-        { PS_NONE,     PS_NONE     }
-    };
+  const uint32_t kpp_board_index[PIECE_NB][COLOR_NB] = {
+   // convention: W - us, B - them
+   // viewed from other side, W and B are reversed
+      { PS_NONE,     PS_NONE     },
+      { PS_W_PAWN,   PS_B_PAWN   },
+      { PS_W_KNIGHT, PS_B_KNIGHT },
+      { PS_W_BISHOP, PS_B_BISHOP },
+      { PS_W_ROOK,   PS_B_ROOK   },
+      { PS_W_QUEEN,  PS_B_QUEEN  },
+      { PS_W_KING,   PS_B_KING   },
+      { PS_NONE,     PS_NONE     },
+      { PS_NONE,     PS_NONE     },
+      { PS_B_PAWN,   PS_W_PAWN   },
+      { PS_B_KNIGHT, PS_W_KNIGHT },
+      { PS_B_BISHOP, PS_W_BISHOP },
+      { PS_B_ROOK,   PS_W_ROOK   },
+      { PS_B_QUEEN,  PS_W_QUEEN  },
+      { PS_B_KING,   PS_W_KING   },
+      { PS_NONE,     PS_NONE     }
+  };

-    // Input feature converter
-    LargePagePtr<FeatureTransformer> feature_transformer;
+  // Input feature converter
+  LargePagePtr<FeatureTransformer> feature_transformer;

-    // Evaluation function
-    AlignedPtr<Network> network;
+  // Evaluation function
+  AlignedPtr<Network> network;

-    // Evaluation function file name
-    std::string fileName;
+  // Evaluation function file name
+  std::string fileName;

-    // Saved evaluation function file name
-    std::string savedfileName = "nn.bin";
+  // Saved evaluation function file name
+  std::string savedfileName = "nn.bin";

-    // Get a string that represents the structure of the evaluation function
-    std::string get_architecture_string() {
-        return "Features=" + FeatureTransformer::get_structure_string() +
-            ",Network=" + Network::get_structure_string();
-    }
+  // Get a string that represents the structure of the evaluation function
+  std::string get_architecture_string() {
+    return "Features=" + FeatureTransformer::get_structure_string() +
+        ",Network=" + Network::get_structure_string();
+  }

-    std::string get_layers_info() {
-        return
-            FeatureTransformer::get_layers_info()
-            + '\n' + Network::get_layers_info();
-    }
+  std::string get_layers_info() {
+    return
+        FeatureTransformer::get_layers_info()
+        + '\n' + Network::get_layers_info();
+  }

-    UseNNUEMode useNNUE;
-    std::string eval_file_loaded = "None";
+  UseNNUEMode useNNUE;
+  std::string eval_file_loaded = "None";

-    namespace Detail {
+  namespace Detail {

-        // Initialize the evaluation function parameters
-        template <typename T>
-        void initialize(AlignedPtr<T>& pointer) {
+  // Initialize the evaluation function parameters
+  template <typename T>
+  void initialize(AlignedPtr<T>& pointer) {

-            pointer.reset(reinterpret_cast<T*>(std_aligned_alloc(alignof(T), sizeof(T))));
-            std::memset(pointer.get(), 0, sizeof(T));
-        }
+    pointer.reset(reinterpret_cast<T*>(std_aligned_alloc(alignof(T), sizeof(T))));
+    std::memset(pointer.get(), 0, sizeof(T));
+  }

-        template <typename T>
-        void initialize(LargePagePtr<T>& pointer) {
+  template <typename T>
+  void initialize(LargePagePtr<T>& pointer) {

-            static_assert(alignof(T) <= 4096, "aligned_large_pages_alloc() may fail for such a big alignment requirement of T");
+    static_assert(alignof(T) <= 4096, "aligned_large_pages_alloc() may fail for such a big alignment requirement of T");
+    pointer.reset(reinterpret_cast<T*>(aligned_large_pages_alloc(sizeof(T))));
+    std::memset(pointer.get(), 0, sizeof(T));
+  }

-            pointer.reset(reinterpret_cast<T*>(aligned_large_pages_alloc(sizeof(T))));
-            std::memset(pointer.get(), 0, sizeof(T));
-        }
+  // Read evaluation function parameters, after checking that the leading hash value matches T::GetHashValue()
+  template <typename T>
+  bool ReadParameters(std::istream& stream, T& reference) {

-        // Read evaluation function parameters
-        template <typename T>
-        bool ReadParameters(std::istream& stream, T& reference) {
+    std::uint32_t header;
+    header = read_little_endian<std::uint32_t>(stream);
+    if (!stream || header != T::GetHashValue()) return false;
+    return reference.ReadParameters(stream);
+  }

-            std::uint32_t header;
-            header = read_little_endian<std::uint32_t>(stream);
+  // Write evaluation function parameters, preceded by the hash value T::GetHashValue()
+  template <typename T>
+  bool WriteParameters(std::ostream& stream, const AlignedPtr<T>& pointer) {
+    constexpr std::uint32_t header = T::GetHashValue();

-            if (!stream || header != T::GetHashValue())
-                return false;
+    stream.write(reinterpret_cast<const char*>(&header), sizeof(header));

-            return reference.ReadParameters(stream);
-        }
+    return pointer->WriteParameters(stream);
+  }

-        // write evaluation function parameters
-        template <typename T>
-        bool WriteParameters(std::ostream& stream, const AlignedPtr<T>& pointer) {
-            constexpr std::uint32_t header = T::GetHashValue();
+  template <typename T>
+  bool WriteParameters(std::ostream& stream, const LargePagePtr<T>& pointer) {
+    constexpr std::uint32_t header = T::GetHashValue();

-            stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
+    stream.write(reinterpret_cast<const char*>(&header), sizeof(header));

-            return pointer->WriteParameters(stream);
-        }
+    return pointer->WriteParameters(stream);
+  }
+  }  // namespace Detail

-        template <typename T>
-        bool WriteParameters(std::ostream& stream, const LargePagePtr<T>& pointer) {
-            constexpr std::uint32_t header = T::GetHashValue();
+  // Initialize the evaluation function parameters
+  void initialize() {

-            stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
+    Detail::initialize(feature_transformer);
+    Detail::initialize(network);
+  }

-            return pointer->WriteParameters(stream);
-        }
-    }  // namespace Detail
+  // Read network header
+  bool read_header(std::istream& stream, std::uint32_t* hash_value, std::string* architecture)
+  {
+    std::uint32_t version, size;

-    // Initialize the evaluation function parameters
-    void initialize() {
+    version     = read_little_endian<std::uint32_t>(stream);
+    *hash_value = read_little_endian<std::uint32_t>(stream);
+    size        = read_little_endian<std::uint32_t>(stream);
+    if (!stream || version != kVersion) return false;
+    architecture->resize(size);
+    stream.read(&(*architecture)[0], size);
+    return !stream.fail();
+  }

-        Detail::initialize(feature_transformer);
-        Detail::initialize(network);
-    }
+  // write the header
+  bool write_header(std::ostream& stream,
+    std::uint32_t hash_value, const std::string& architecture) {

-    // Read network header
-    bool read_header(std::istream& stream, std::uint32_t* hash_value, std::string* architecture)
-    {
-        std::uint32_t version, size;
+    stream.write(reinterpret_cast<const char*>(&kVersion), sizeof(kVersion));
+    stream.write(reinterpret_cast<const char*>(&hash_value), sizeof(hash_value));

-        version     = read_little_endian<std::uint32_t>(stream);
-        *hash_value = read_little_endian<std::uint32_t>(stream);
-        size        = read_little_endian<std::uint32_t>(stream);
+    const std::uint32_t size = static_cast<std::uint32_t>(architecture.size());

-        if (!stream || version != kVersion)
-            return false;
+    stream.write(reinterpret_cast<const char*>(&size), sizeof(size));
+    stream.write(architecture.data(), size);

-        architecture->resize(size);
-        stream.read(&(*architecture)[0], size);
+    return !stream.fail();
+  }

-        return !stream.fail();
-    }
+  // Read network parameters
+  bool ReadParameters(std::istream& stream) {

-    // write the header
-    bool write_header(std::ostream& stream,
-        std::uint32_t hash_value, const std::string& architecture) {
+    std::uint32_t hash_value;
+    std::string architecture;
+    if (!read_header(stream, &hash_value, &architecture)) return false;
+    if (hash_value != kHashValue) return false;
+    if (!Detail::ReadParameters(stream, *feature_transformer)) return false;
+    if (!Detail::ReadParameters(stream, *network)) return false;
+    return stream && stream.peek() == std::ios::traits_type::eof();
+  }

-        stream.write(reinterpret_cast<const char*>(&kVersion), sizeof(kVersion));
-        stream.write(reinterpret_cast<const char*>(&hash_value), sizeof(hash_value));
+  // write evaluation function parameters
+  bool WriteParameters(std::ostream& stream) {

-        const std::uint32_t size = static_cast<std::uint32_t>(architecture.size());
+    if (!write_header(stream, kHashValue, get_architecture_string()))
+        return false;

-        stream.write(reinterpret_cast<const char*>(&size), sizeof(size));
-        stream.write(architecture.data(), size);
+    if (!Detail::WriteParameters(stream, feature_transformer))
+        return false;

-        return !stream.fail();
-    }
+    if (!Detail::WriteParameters(stream, network))
+        return false;

-    // Read network parameters
-    bool ReadParameters(std::istream& stream) {
+    return !stream.fail();
+}

-        std::uint32_t hash_value;
-        std::string architecture;
-        if (!read_header(stream, &hash_value, &architecture))
-            return false;
+  // Evaluation function. Perform differential calculation.
+  Value evaluate(const Position& pos) {

-        if (hash_value != kHashValue)
-            return false;
+    // We manually align the arrays on the stack because with gcc < 9.3
+    // overaligning stack variables with alignas() doesn't work correctly.

-        if (!Detail::ReadParameters(stream, *feature_transformer))
-            return false;
-
-        if (!Detail::ReadParameters(stream, *network))
-            return false;
-
-        return stream && stream.peek() == std::ios::traits_type::eof();
-    }
-    // write evaluation function parameters
-    bool WriteParameters(std::ostream& stream) {
-
-        if (!write_header(stream, kHashValue, get_architecture_string()))
-            return false;
-
-        if (!Detail::WriteParameters(stream, feature_transformer))
-            return false;
-
-        if (!Detail::WriteParameters(stream, network))
-            return false;
-
-        return !stream.fail();
-    }
-    // Evaluation function. Perform differential calculation.
-    Value evaluate(const Position& pos) {
-
-        // We manually align the arrays on the stack because with gcc < 9.3
-        // overaligning stack variables with alignas() doesn't work correctly.
-
-        constexpr uint64_t alignment = kCacheLineSize;
+    constexpr uint64_t alignment = kCacheLineSize;

 #if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN)
-        TransformedFeatureType transformed_features_unaligned[
-          FeatureTransformer::kBufferSize + alignment / sizeof(TransformedFeatureType)];
-        char buffer_unaligned[Network::kBufferSize + alignment];
+    TransformedFeatureType transformed_features_unaligned[
+      FeatureTransformer::kBufferSize + alignment / sizeof(TransformedFeatureType)];
+    char buffer_unaligned[Network::kBufferSize + alignment];

-        auto* transformed_features = align_ptr_up<alignment>(&transformed_features_unaligned[0]);
-        auto* buffer = align_ptr_up<alignment>(&buffer_unaligned[0]);
+    auto* transformed_features = align_ptr_up<alignment>(&transformed_features_unaligned[0]);
+    auto* buffer = align_ptr_up<alignment>(&buffer_unaligned[0]);
 #else
-        alignas(alignment)
-          TransformedFeatureType transformed_features[FeatureTransformer::kBufferSize];
-        alignas(alignment) char buffer[Network::kBufferSize];
+    alignas(alignment)
+      TransformedFeatureType transformed_features[FeatureTransformer::kBufferSize];
+    alignas(alignment) char buffer[Network::kBufferSize];
 #endif

-        ASSERT_ALIGNED(transformed_features, alignment);
-        ASSERT_ALIGNED(buffer, alignment);
+    ASSERT_ALIGNED(transformed_features, alignment);
+    ASSERT_ALIGNED(buffer, alignment);

-        feature_transformer->Transform(pos, transformed_features);
+    feature_transformer->Transform(pos, transformed_features);
+    const auto output = network->Propagate(transformed_features, buffer);

+    return static_cast<Value>(output[0] / FV_SCALE);
+  }

-        const auto output = network->Propagate(transformed_features, buffer);
+  // Load eval, from a file stream or a memory stream
+  bool load_eval(std::string name, std::istream& stream) {

-        return static_cast<Value>(output[0] / FV_SCALE);
-    }
+    initialize();
+    fileName = name;
+    return ReadParameters(stream);
+}

-    // Load eval, from a file stream or a memory stream
-    bool load_eval(std::string name, std::istream& stream) {
+static UseNNUEMode nnue_mode_from_option(const UCI::Option& mode)
+{
+  if (mode == "false")
+    return UseNNUEMode::False;
+  else if (mode == "true")
+     return UseNNUEMode::True;
+  else if (mode == "pure")
+    return UseNNUEMode::Pure;

-        initialize();
+  return UseNNUEMode::False;
+}

-        fileName = name;
-        return ReadParameters(stream);
-    }
+void init() {

-    static UseNNUEMode nnue_mode_from_option(const UCI::Option& mode)
-    {
-        if (mode == "false")
-          return UseNNUEMode::False;
-        else if (mode == "true")
-          return UseNNUEMode::True;
-        else if (mode == "pure")
-          return UseNNUEMode::Pure;
+  useNNUE = nnue_mode_from_option(Options["Use NNUE"]);

-        return UseNNUEMode::False;
-    }
+  if (Options["SkipLoadingEval"] || useNNUE == UseNNUEMode::False)
+  {
+    eval_file_loaded.clear();
+    return;
+  }

-    void init() {
-
-        useNNUE = nnue_mode_from_option(Options["Use NNUE"]);
-
-        if (Options["SkipLoadingEval"] || useNNUE == UseNNUEMode::False)
-        {
-            eval_file_loaded.clear();
-            return;
-        }
-
-        std::string eval_file = std::string(Options["EvalFile"]);
+  std::string eval_file = std::string(Options["EvalFile"]);

 #if defined(DEFAULT_NNUE_DIRECTORY)
 #define stringify2(x) #x
 #define stringify(x) stringify2(x)
-        std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory , stringify(DEFAULT_NNUE_DIRECTORY) };
+  std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory , stringify(DEFAULT_NNUE_DIRECTORY) };
 #else
-        std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory };
+  std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory };
 #endif

-        for (std::string directory : dirs)
-        {
-            if (eval_file_loaded != eval_file)
-            {
-                std::ifstream stream(directory + eval_file, std::ios::binary);
-                if (load_eval(eval_file, stream))
-                {
-                    sync_cout << "info string Loaded eval file " << directory + eval_file << sync_endl;
-                    eval_file_loaded = eval_file;
-                }
-                else
-                {
-                    sync_cout << "info string ERROR: failed to load eval file " << directory + eval_file << sync_endl;
-                    eval_file_loaded.clear();
-                }
-            }
-        }
+  for (std::string directory : dirs)
+  {
+    if (eval_file_loaded != eval_file)
+    {
+      std::ifstream stream(directory + eval_file, std::ios::binary);
+      if (load_eval(eval_file, stream))
+      {
+        sync_cout << "info string Loaded eval file " << directory + eval_file << sync_endl;
+        eval_file_loaded = eval_file;
+      }
+      else
+      {
+        sync_cout << "info string ERROR: failed to load eval file " << directory + eval_file << sync_endl;
+        eval_file_loaded.clear();
+      }
+    }
+  }

 #undef stringify2
 #undef stringify
-    }
+}

-    /// NNUE::verify() verifies that the last net used was loaded successfully
-    void verify_eval_file_loaded() {
+/// verify_eval_file_loaded() verifies that the net named by the EvalFile option was loaded successfully
+void verify_eval_file_loaded() {

-        std::string eval_file = std::string(Options["EvalFile"]);
+  std::string eval_file = std::string(Options["EvalFile"]);

-        if (useNNUE != UseNNUEMode::False && eval_file_loaded != eval_file)
-        {
-            UCI::OptionsMap defaults;
-            UCI::init(defaults);
+  if (useNNUE != UseNNUEMode::False && eval_file_loaded != eval_file)
+  {
+    UCI::OptionsMap defaults;
+    UCI::init(defaults);

-            std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
-            std::string msg2 = "The option is set to true, but the network file " + eval_file + " was not loaded successfully.";
-            std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
-            std::string msg4 = "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/" + std::string(defaults["EvalFile"]);
-            std::string msg5 = "The engine will be terminated now.";
+    std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
+    std::string msg2 = "The option is set to true, but the network file " + eval_file + " was not loaded successfully.";
+    std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
+    std::string msg4 = "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/" + std::string(defaults["EvalFile"]);
+    std::string msg5 = "The engine will be terminated now.";

-            sync_cout << "info string ERROR: " << msg1 << sync_endl;
-            sync_cout << "info string ERROR: " << msg2 << sync_endl;
-            sync_cout << "info string ERROR: " << msg3 << sync_endl;
-            sync_cout << "info string ERROR: " << msg4 << sync_endl;
-            sync_cout << "info string ERROR: " << msg5 << sync_endl;
+    sync_cout << "info string ERROR: " << msg1 << sync_endl;
+    sync_cout << "info string ERROR: " << msg2 << sync_endl;
+    sync_cout << "info string ERROR: " << msg3 << sync_endl;
+    sync_cout << "info string ERROR: " << msg4 << sync_endl;
+    sync_cout << "info string ERROR: " << msg5 << sync_endl;

-            std::exit(EXIT_FAILURE);
-        }
+    std::exit(EXIT_FAILURE);
+  }

-        if (useNNUE != UseNNUEMode::False)
-            sync_cout << "info string NNUE evaluation using " << eval_file << " enabled" << sync_endl;
-        else
-            sync_cout << "info string classical evaluation enabled" << sync_endl;
-    }
+  if (useNNUE != UseNNUEMode::False)
+    sync_cout << "info string NNUE evaluation using " << eval_file << " enabled" << sync_endl;
+  else
+    sync_cout << "info string classical evaluation enabled" << sync_endl;
+}

-    /// In training we override eval file so this is useful.
-    void verify_any_net_loaded() {
+/// Verifies that any net at all has been loaded. Useful in training, where we override the eval file.
+void verify_any_net_loaded() {

-        if (!Options["SkipLoadingEval"] && useNNUE != UseNNUEMode::False && eval_file_loaded.empty())
-        {
-            UCI::OptionsMap defaults;
-            UCI::init(defaults);
+  if (!Options["SkipLoadingEval"] && useNNUE != UseNNUEMode::False && eval_file_loaded.empty())
+  {
+    UCI::OptionsMap defaults;
+    UCI::init(defaults);

-            std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
-            std::string msg2 = "The option is set to true, but the network file was not loaded successfully.";
-            std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
-            std::string msg5 = "The engine will be terminated now.";
+    std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
+    std::string msg2 = "The option is set to true, but the network file was not loaded successfully.";
+    std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
+    std::string msg5 = "The engine will be terminated now.";

-            sync_cout << "info string ERROR: " << msg1 << sync_endl;
-            sync_cout << "info string ERROR: " << msg2 << sync_endl;
-            sync_cout << "info string ERROR: " << msg3 << sync_endl;
-            sync_cout << "info string ERROR: " << msg5 << sync_endl;
+    sync_cout << "info string ERROR: " << msg1 << sync_endl;
+    sync_cout << "info string ERROR: " << msg2 << sync_endl;
+    sync_cout << "info string ERROR: " << msg3 << sync_endl;
+    sync_cout << "info string ERROR: " << msg5 << sync_endl;

-            std::exit(EXIT_FAILURE);
-        }
+    std::exit(EXIT_FAILURE);
+  }

-        if (useNNUE != UseNNUEMode::False)
-            sync_cout << "info string NNUE evaluation using " << eval_file_loaded << " enabled" << sync_endl;
-        else
-            sync_cout << "info string classical evaluation enabled" << sync_endl;
-    }
+  if (useNNUE != UseNNUEMode::False)
+    sync_cout << "info string NNUE evaluation using " << eval_file_loaded << " enabled" << sync_endl;
+  else
+    sync_cout << "info string classical evaluation enabled" << sync_endl;
+}

 } // namespace Eval::NNUE
--- a/src/nnue/evaluate_nnue.h
+++ b/src/nnue/evaluate_nnue.h
@@ -1,21 +1,23 @@
 /*
-    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)

-    Stockfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.

-    Stockfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.

-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

+// header used in NNUE evaluation function
+
 #ifndef NNUE_EVALUATE_NNUE_H_INCLUDED
 #define NNUE_EVALUATE_NNUE_H_INCLUDED

@@ -25,84 +27,83 @@

 #include <memory>

-// header used in NNUE evaluation function
 namespace Eval::NNUE {

-    enum struct UseNNUEMode
-    {
-        False,
-        True,
-        Pure
-    };
+  enum struct UseNNUEMode
+  {
+    False,
+    True,
+    Pure
+  };

-    // Hash value of evaluation function structure
-    constexpr std::uint32_t kHashValue =
-        FeatureTransformer::GetHashValue() ^ Network::GetHashValue();
+  // Hash value of evaluation function structure
+  constexpr std::uint32_t kHashValue =
+      FeatureTransformer::GetHashValue() ^ Network::GetHashValue();

-    // Deleter for automating release of memory area
-    template <typename T>
-    struct AlignedDeleter {
-        void operator()(T* ptr) const {
-            ptr->~T();
-            std_aligned_free(ptr);
-        }
-    };
+  // Deleter for automating release of memory area
+  template <typename T>
+  struct AlignedDeleter {
+    void operator()(T* ptr) const {
+      ptr->~T();
+      std_aligned_free(ptr);
+    }
+  };

-    template <typename T>
-    struct LargePageDeleter {
-        void operator()(T* ptr) const {
-            ptr->~T();
-            aligned_large_pages_free(ptr);
-        }
-    };
+  template <typename T>
+  struct LargePageDeleter {
+    void operator()(T* ptr) const {
+      ptr->~T();
+      aligned_large_pages_free(ptr);
+    }
+  };

-    template <typename T>
-    using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>;
+  template <typename T>
+  using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>;

-    template <typename T>
-    using LargePagePtr = std::unique_ptr<T, LargePageDeleter<T>>;
+  template <typename T>
+  using LargePagePtr = std::unique_ptr<T, LargePageDeleter<T>>;

-    // Input feature converter
-    extern LargePagePtr<FeatureTransformer> feature_transformer;
+  // Input feature converter
+  extern LargePagePtr<FeatureTransformer> feature_transformer;

-    // Evaluation function
-    extern AlignedPtr<Network> network;
+  // Evaluation function
+  extern AlignedPtr<Network> network;

-    // Evaluation function file name
-    extern std::string fileName;
+  // Evaluation function file name
+  extern std::string fileName;

-    // Saved evaluation function file name
-    extern std::string savedfileName;
+  // Saved evaluation function file name
+  extern std::string savedfileName;

-    extern UseNNUEMode useNNUE;
+  extern UseNNUEMode useNNUE;

-    extern std::string eval_file_loaded;
+  extern std::string eval_file_loaded;

-    // Get a string that represents the structure of the evaluation function
-    std::string get_architecture_string();
+  // Get a string that represents the structure of the evaluation function
+  std::string get_architecture_string();

-    std::string get_layers_info();
+  std::string get_layers_info();

-    // read the header
-    bool read_header(std::istream& stream,
-        std::uint32_t* hash_value, std::string* architecture);
+  // read the header
+  bool read_header(std::istream& stream,
+      std::uint32_t* hash_value, std::string* architecture);

-    // write the header
-    bool write_header(std::ostream& stream,
-        std::uint32_t hash_value, const std::string& architecture);
+  // write the header
+  bool write_header(std::ostream& stream,
+      std::uint32_t hash_value, const std::string& architecture);

-    // read evaluation function parameters
-    bool ReadParameters(std::istream& stream);
+  // read evaluation function parameters
+  bool ReadParameters(std::istream& stream);

-    // write evaluation function parameters
-    bool WriteParameters(std::ostream& stream);
+  // write evaluation function parameters
+  bool WriteParameters(std::ostream& stream);

-    Value evaluate(const Position& pos);
-    bool load_eval(std::string name, std::istream& stream);
-    void init();
+  Value evaluate(const Position& pos);
+  bool load_eval(std::string name, std::istream& stream);
+  void init();

-    void verify_eval_file_loaded();
-    void verify_any_net_loaded();
+  void verify_eval_file_loaded();
+  void verify_any_net_loaded();

 }  // namespace Eval::NNUE

--- a/src/nnue/features/feature_set.h
+++ b/src/nnue/features/feature_set.h
@@ -1,19 +1,19 @@
 /*
-    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)

-    Stockfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.

-    Stockfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.

-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

 // A class template that represents the input feature set of the NNUE evaluation function
@@ -22,7 +22,6 @@
 #define NNUE_FEATURE_SET_H_INCLUDED

 #include "features_common.h"
-
 #include <array>

 namespace Eval::NNUE::Features {
--- a/src/nnue/features/features_common.h
+++ b/src/nnue/features/features_common.h
@@ -1,19 +1,19 @@
 /*
-    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)

-    Stockfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.

-    Stockfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.

-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

 //Common header of input features of NNUE evaluation function
@@ -21,30 +21,29 @@
 #ifndef NNUE_FEATURES_COMMON_H_INCLUDED
 #define NNUE_FEATURES_COMMON_H_INCLUDED

-#include "evaluate.h"
-
-#include "nnue/nnue_common.h"
+#include "../../evaluate.h"
+#include "../nnue_common.h"

 namespace Eval::NNUE::Features {

-    class IndexList;
+  class IndexList;

-    template <typename... FeatureTypes>
-    class FeatureSet;
+  template <typename... FeatureTypes>
+  class FeatureSet;

-    // Trigger to perform full calculations instead of difference only
-    enum class TriggerEvent {
-        kNone, // Calculate the difference whenever possible
-        kFriendKingMoved, // calculate full evaluation when own king moves
-        kEnemyKingMoved, // calculate full evaluation when opponent king moves
-        kAnyKingMoved, // calculate full evaluation when any king moves
-        kAnyPieceMoved, // always calculate full evaluation
-    };
+  // Trigger to perform full calculations instead of difference only
+  enum class TriggerEvent {
+    kNone, // calculate the difference whenever possible
+    kFriendKingMoved, // calculate full evaluation when own king moves
+    kEnemyKingMoved, // calculate full evaluation when opponent king moves
+    kAnyKingMoved, // calculate full evaluation when any king moves
+    kAnyPieceMoved, // always calculate full evaluation
+  };

-    enum class Side {
-        kFriend, // side to move
-        kEnemy, // opponent
-    };
+  enum class Side {
+    kFriend, // side to move
+    kEnemy, // opponent
+  };

 }  // namespace Eval::NNUE::Features

--- a/src/nnue/features/index_list.h
+++ b/src/nnue/features/index_list.h
@@ -1,19 +1,19 @@
 /*
-    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)

-    Stockfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.

-    Stockfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.

-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

 // Definition of index list of input features
@@ -21,43 +21,43 @@
 #ifndef NNUE_FEATURES_INDEX_LIST_H_INCLUDED
 #define NNUE_FEATURES_INDEX_LIST_H_INCLUDED

-#include "position.h"
-
-#include "nnue/nnue_architecture.h"
+#include "../../position.h"
+#include "../nnue_architecture.h"

 namespace Eval::NNUE::Features {

-    // Class template used for feature index list
-    template <typename T, std::size_t MaxSize>
-    class ValueList {
+  // Class template used for feature index list
+  template <typename T, std::size_t MaxSize>
+  class ValueList {

-    public:
-        std::size_t size() const { return size_; }
-        void resize(std::size_t size) { size_ = size; }
-        void push_back(const T& value) { values_[size_++] = value; }
-        T& operator[](std::size_t index) { return values_[index]; }
-        T* begin() { return values_; }
-        T* end() { return values_ + size_; }
-        const T& operator[](std::size_t index) const { return values_[index]; }
-        const T* begin() const { return values_; }
-        const T* end() const { return values_ + size_; }
+   public:
+    std::size_t size() const { return size_; }
+    void resize(std::size_t size) { size_ = size; }
+    void push_back(const T& value) { values_[size_++] = value; }
+    T& operator[](std::size_t index) { return values_[index]; }
+    T* begin() { return values_; }
+    T* end() { return values_ + size_; }
+    const T& operator[](std::size_t index) const { return values_[index]; }
+    const T* begin() const { return values_; }
+    const T* end() const { return values_ + size_; }

-        void swap(ValueList& other) {
-            const std::size_t max_size = std::max(size_, other.size_);
-            for (std::size_t i = 0; i < max_size; ++i) {
-                std::swap(values_[i], other.values_[i]);
-            }
-            std::swap(size_, other.size_);
-        }
+    void swap(ValueList& other) {
+      const std::size_t max_size = std::max(size_, other.size_);
+      for (std::size_t i = 0; i < max_size; ++i) {
+        std::swap(values_[i], other.values_[i]);
+      }
+      std::swap(size_, other.size_);
+    }

-    private:
-        T values_[MaxSize] = {};
-        std::size_t size_ = 0;
-    };
+   private:
+    T values_[MaxSize];
+    std::size_t size_ = 0;
+  };

-    //Type of feature index list
-    class IndexList : public ValueList<IndexType, RawFeatures::kMaxActiveDimensions> {
-    };
+  // Type of feature index list
+  class IndexList
+      : public ValueList<IndexType, RawFeatures::kMaxActiveDimensions> {
+  };

 }  // namespace Eval::NNUE::Features

--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -223,13 +223,13 @@ namespace Eval::NNUE::Layers {
        return _mm512_add_epi32(_mm512_permutexvar_epi32(indices, x), bias);
      };

-      [[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) {
 #if defined (USE_VNNI)
+      [[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) {
        acc = _mm512_dpbusd_epi32(acc, a, b);
 #else
+      [[maybe_unused]] auto m512_dpbusd_epi32 = [=](__m512i a, __m512i b) -> __m512i {
        __m512i product0 = _mm512_maddubs_epi16(a, b);
-        product0 = _mm512_madd_epi16(product0, kOnes512);
-        acc = _mm512_add_epi32(acc, product0);
+        return _mm512_madd_epi16(product0, kOnes512);
 #endif
      };

@@ -256,14 +256,13 @@ namespace Eval::NNUE::Layers {

        return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias);
      };
-
-      [[maybe_unused]] auto m256_add_dpbusd_epi32 = [=](__m256i& acc, __m256i a, __m256i b) {
 #if defined (USE_VNNI)
+      [[maybe_unused]] auto m256_add_dpbusd_epi32 = [=](__m256i& acc, __m256i a, __m256i b) {
        acc = _mm256_dpbusd_epi32(acc, a, b);
 #else
+      [[maybe_unused]] auto m256_dpbusd_epi32 = [=](__m256i a, __m256i b) -> __m256i {
        __m256i product0 = _mm256_maddubs_epi16(a, b);
-        product0 = _mm256_madd_epi16(product0, kOnes256);
-        acc = _mm256_add_epi32(acc, product0);
+        return _mm256_madd_epi16(product0, kOnes256);
 #endif
      };

@@ -288,10 +287,9 @@ namespace Eval::NNUE::Layers {
        return _mm_add_epi32(sum0, bias);
      };

-      [[maybe_unused]] auto m128_add_dpbusd_epi32 = [=](__m128i& acc, __m128i a, __m128i b) {
+      [[maybe_unused]] auto m128_dpbusd_epi32 = [=](__m128i a, __m128i b) -> __m128i {
        __m128i product0 = _mm_maddubs_epi16(a, b);
-        product0 = _mm_madd_epi16(product0, kOnes128);
-        acc = _mm_add_epi32(acc, product0);
+        return _mm_madd_epi16(product0, kOnes128);
      };

 #endif
@@ -335,15 +333,6 @@ namespace Eval::NNUE::Layers {
          const __m512i bias = *reinterpret_cast<const __m512i*>(&biases_[i]);
          __m512i* outptr = reinterpret_cast<__m512i*>(&output[i]);

-          __m512i sum01a = _mm512_setzero_si512();
-          __m512i sum23a = _mm512_setzero_si512();
-          __m512i sum45a = _mm512_setzero_si512();
-          __m512i sum67a = _mm512_setzero_si512();
-          __m512i sum01b = _mm512_setzero_si512();
-          __m512i sum23b = _mm512_setzero_si512();
-          __m512i sum45b = _mm512_setzero_si512();
-          __m512i sum67b = _mm512_setzero_si512();
-
          const auto row01a = *reinterpret_cast<const __m512i*>(&weights_[offset01a]);
          const auto row23a = *reinterpret_cast<const __m512i*>(&weights_[offset23a]);
          const auto row45a = *reinterpret_cast<const __m512i*>(&weights_[offset45a]);
@@ -356,6 +345,16 @@ namespace Eval::NNUE::Layers {
          const __m256i in256 = input_vector256[0];
          const __m512i in = _mm512_inserti64x4(_mm512_castsi256_si512(in256), in256, 1);

+#if defined (USE_VNNI)
+          __m512i sum01a = _mm512_setzero_si512();
+          __m512i sum23a = _mm512_setzero_si512();
+          __m512i sum45a = _mm512_setzero_si512();
+          __m512i sum67a = _mm512_setzero_si512();
+          __m512i sum01b = _mm512_setzero_si512();
+          __m512i sum23b = _mm512_setzero_si512();
+          __m512i sum45b = _mm512_setzero_si512();
+          __m512i sum67b = _mm512_setzero_si512();
+
          m512_add_dpbusd_epi32(sum01a, in, row01a);
          m512_add_dpbusd_epi32(sum23a, in, row23a);
          m512_add_dpbusd_epi32(sum45a, in, row45a);
@@ -364,6 +363,16 @@ namespace Eval::NNUE::Layers {
          m512_add_dpbusd_epi32(sum23b, in, row23b);
          m512_add_dpbusd_epi32(sum45b, in, row45b);
          m512_add_dpbusd_epi32(sum67b, in, row67b);
+#else
+          __m512i sum01a = m512_dpbusd_epi32(in, row01a);
+          __m512i sum23a = m512_dpbusd_epi32(in, row23a);
+          __m512i sum45a = m512_dpbusd_epi32(in, row45a);
+          __m512i sum67a = m512_dpbusd_epi32(in, row67a);
+          __m512i sum01b = m512_dpbusd_epi32(in, row01b);
+          __m512i sum23b = m512_dpbusd_epi32(in, row23b);
+          __m512i sum45b = m512_dpbusd_epi32(in, row45b);
+          __m512i sum67b = m512_dpbusd_epi32(in, row67b);
+#endif

          *outptr = m512_hadd256x16(
            sum01a, sum23a, sum45a, sum67a,
@@ -384,48 +393,80 @@ namespace Eval::NNUE::Layers {

          if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0)
          {
-            __m512i sum0 = _mm512_setzero_si512();
-            __m512i sum1 = _mm512_setzero_si512();
-            __m512i sum2 = _mm512_setzero_si512();
-            __m512i sum3 = _mm512_setzero_si512();
-
            const auto row0 = reinterpret_cast<const __m512i*>(&weights_[offset0]);
            const auto row1 = reinterpret_cast<const __m512i*>(&weights_[offset1]);
            const auto row2 = reinterpret_cast<const __m512i*>(&weights_[offset2]);
            const auto row3 = reinterpret_cast<const __m512i*>(&weights_[offset3]);

-            for (IndexType j = 0; j < kNumChunks512; ++j)
+#if defined (USE_VNNI)
+            __m512i sum0 = _mm512_setzero_si512();
+            __m512i sum1 = _mm512_setzero_si512();
+            __m512i sum2 = _mm512_setzero_si512();
+            __m512i sum3 = _mm512_setzero_si512();
+            const IndexType kStart = 0;
+#else
+            __m512i sum0 = m512_dpbusd_epi32(input_vector512[0], row0[0]);
+            __m512i sum1 = m512_dpbusd_epi32(input_vector512[0], row1[0]);
+            __m512i sum2 = m512_dpbusd_epi32(input_vector512[0], row2[0]);
+            __m512i sum3 = m512_dpbusd_epi32(input_vector512[0], row3[0]);
+            const IndexType kStart = 1;
+#endif
+
+            for (IndexType j = kStart; j < kNumChunks512; ++j)
            {
              const __m512i in = input_vector512[j];

+#if defined (USE_VNNI)
              m512_add_dpbusd_epi32(sum0, in, row0[j]);
              m512_add_dpbusd_epi32(sum1, in, row1[j]);
              m512_add_dpbusd_epi32(sum2, in, row2[j]);
              m512_add_dpbusd_epi32(sum3, in, row3[j]);
+#else
+              sum0 = _mm512_add_epi32(sum0, m512_dpbusd_epi32(in, row0[j]));
+              sum1 = _mm512_add_epi32(sum1, m512_dpbusd_epi32(in, row1[j]));
+              sum2 = _mm512_add_epi32(sum2, m512_dpbusd_epi32(in, row2[j]));
+              sum3 = _mm512_add_epi32(sum3, m512_dpbusd_epi32(in, row3[j]));
+#endif
            }

            *outptr = m512_haddx4(sum0, sum1, sum2, sum3, bias);
          }
          else
          {
-            __m256i sum0 = _mm256_setzero_si256();
-            __m256i sum1 = _mm256_setzero_si256();
-            __m256i sum2 = _mm256_setzero_si256();
-            __m256i sum3 = _mm256_setzero_si256();
-
            const auto row0 = reinterpret_cast<const __m256i*>(&weights_[offset0]);
            const auto row1 = reinterpret_cast<const __m256i*>(&weights_[offset1]);
            const auto row2 = reinterpret_cast<const __m256i*>(&weights_[offset2]);
            const auto row3 = reinterpret_cast<const __m256i*>(&weights_[offset3]);

-            for (IndexType j = 0; j < kNumChunks256; ++j)
+#if defined (USE_VNNI)
+            __m256i sum0 = _mm256_setzero_si256();
+            __m256i sum1 = _mm256_setzero_si256();
+            __m256i sum2 = _mm256_setzero_si256();
+            __m256i sum3 = _mm256_setzero_si256();
+            const IndexType kStart = 0;
+#else
+            __m256i sum0 = m256_dpbusd_epi32(input_vector256[0], row0[0]);
+            __m256i sum1 = m256_dpbusd_epi32(input_vector256[0], row1[0]);
+            __m256i sum2 = m256_dpbusd_epi32(input_vector256[0], row2[0]);
+            __m256i sum3 = m256_dpbusd_epi32(input_vector256[0], row3[0]);
+            const IndexType kStart = 1;
+#endif
+
+            for (IndexType j = kStart; j < kNumChunks256; ++j)
            {
              const __m256i in = input_vector256[j];

+#if defined (USE_VNNI)
              m256_add_dpbusd_epi32(sum0, in, row0[j]);
              m256_add_dpbusd_epi32(sum1, in, row1[j]);
              m256_add_dpbusd_epi32(sum2, in, row2[j]);
              m256_add_dpbusd_epi32(sum3, in, row3[j]);
+#else
+              sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
+              sum1 = _mm256_add_epi32(sum1, m256_dpbusd_epi32(in, row1[j]));
+              sum2 = _mm256_add_epi32(sum2, m256_dpbusd_epi32(in, row2[j]));
+              sum3 = _mm256_add_epi32(sum3, m256_dpbusd_epi32(in, row3[j]));
+#endif
            }

            *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias);
@@ -436,30 +477,50 @@ namespace Eval::NNUE::Layers {
      {
        if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0)
        {
-          __m512i sum0 = _mm512_setzero_si512();
-
          const auto row0 = reinterpret_cast<const __m512i*>(&weights_[0]);

-          for (IndexType j = 0; j < kNumChunks512; ++j)
+#if defined (USE_VNNI)
+          __m512i sum0 = _mm512_setzero_si512();
+          const IndexType kStart = 0;
+#else
+          __m512i sum0 = m512_dpbusd_epi32(input_vector512[0], row0[0]);
+          const IndexType kStart = 1;
+#endif
+
+          for (IndexType j = kStart; j < kNumChunks512; ++j)
          {
            const __m512i in = input_vector512[j];

+#if defined (USE_VNNI)
            m512_add_dpbusd_epi32(sum0, in, row0[j]);
+#else
+            sum0 = _mm512_add_epi32(sum0, m512_dpbusd_epi32(in, row0[j]));
+#endif
          }

          output[0] = m512_hadd(sum0, biases_[0]);
        }
        else
        {
-          __m256i sum0 = _mm256_setzero_si256();
-
          const auto row0 = reinterpret_cast<const __m256i*>(&weights_[0]);

-          for (IndexType j = 0; j < kNumChunks256; ++j)
+#if defined (USE_VNNI)
+          __m256i sum0 = _mm256_setzero_si256();
+          const IndexType kStart = 0;
+#else
+          __m256i sum0 = m256_dpbusd_epi32(input_vector256[0], row0[0]);
+          const IndexType kStart = 1;
+#endif
+
+          for (IndexType j = kStart; j < kNumChunks256; ++j)
          {
            const __m256i in = input_vector256[j];

+#if defined (USE_VNNI)
            m256_add_dpbusd_epi32(sum0, in, row0[j]);
+#else
+            sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
+#endif
          }

          output[0] = m256_hadd(sum0, biases_[0]);
@@ -493,24 +554,40 @@ namespace Eval::NNUE::Layers {
          const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]);
          __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]);

-          __m256i sum0 = _mm256_setzero_si256();
-          __m256i sum1 = _mm256_setzero_si256();
-          __m256i sum2 = _mm256_setzero_si256();
-          __m256i sum3 = _mm256_setzero_si256();
-
          const auto row0 = reinterpret_cast<const __m256i*>(&weights_[offset0]);
          const auto row1 = reinterpret_cast<const __m256i*>(&weights_[offset1]);
          const auto row2 = reinterpret_cast<const __m256i*>(&weights_[offset2]);
          const auto row3 = reinterpret_cast<const __m256i*>(&weights_[offset3]);

-          for (IndexType j = 0; j < kNumChunks; ++j)
+#if defined (USE_VNNI)
+          __m256i sum0 = _mm256_setzero_si256();
+          __m256i sum1 = _mm256_setzero_si256();
+          __m256i sum2 = _mm256_setzero_si256();
+          __m256i sum3 = _mm256_setzero_si256();
+          const IndexType kStart = 0;
+#else
+          __m256i sum0 = m256_dpbusd_epi32(input_vector[0], row0[0]);
+          __m256i sum1 = m256_dpbusd_epi32(input_vector[0], row1[0]);
+          __m256i sum2 = m256_dpbusd_epi32(input_vector[0], row2[0]);
+          __m256i sum3 = m256_dpbusd_epi32(input_vector[0], row3[0]);
+          const IndexType kStart = 1;
+#endif
+
+          for (IndexType j = kStart; j < kNumChunks; ++j)
          {
            const __m256i in = input_vector[j];

+#if defined (USE_VNNI)
            m256_add_dpbusd_epi32(sum0, in, row0[j]);
            m256_add_dpbusd_epi32(sum1, in, row1[j]);
            m256_add_dpbusd_epi32(sum2, in, row2[j]);
            m256_add_dpbusd_epi32(sum3, in, row3[j]);
+#else
+            sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
+            sum1 = _mm256_add_epi32(sum1, m256_dpbusd_epi32(in, row1[j]));
+            sum2 = _mm256_add_epi32(sum2, m256_dpbusd_epi32(in, row2[j]));
+            sum3 = _mm256_add_epi32(sum3, m256_dpbusd_epi32(in, row3[j]));
+#endif
          }

          *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias);
@@ -518,15 +595,25 @@ namespace Eval::NNUE::Layers {
      }
      else if constexpr (kOutputDimensions == 1)
      {
-        __m256i sum0 = _mm256_setzero_si256();
-
        const auto row0 = reinterpret_cast<const __m256i*>(&weights_[0]);

-        for (IndexType j = 0; j < kNumChunks; ++j)
+#if defined (USE_VNNI)
+        __m256i sum0 = _mm256_setzero_si256();
+        const IndexType kStart = 0;
+#else
+        __m256i sum0 = m256_dpbusd_epi32(input_vector[0], row0[0]);
+        const IndexType kStart = 1;
+#endif
+
+        for (IndexType j = kStart; j < kNumChunks; ++j)
        {
          const __m256i in = input_vector[j];

-            m256_add_dpbusd_epi32(sum0, in, row0[j]);
+#if defined (USE_VNNI)
+          m256_add_dpbusd_epi32(sum0, in, row0[j]);
+#else
+          sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
+#endif
        }

        output[0] = m256_hadd(sum0, biases_[0]);
@@ -559,24 +646,24 @@ namespace Eval::NNUE::Layers {
          const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]);
          __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]);

-          __m128i sum0 = _mm_setzero_si128();
-          __m128i sum1 = _mm_setzero_si128();
-          __m128i sum2 = _mm_setzero_si128();
-          __m128i sum3 = _mm_setzero_si128();
-
          const auto row0 = reinterpret_cast<const __m128i*>(&weights_[offset0]);
          const auto row1 = reinterpret_cast<const __m128i*>(&weights_[offset1]);
          const auto row2 = reinterpret_cast<const __m128i*>(&weights_[offset2]);
          const auto row3 = reinterpret_cast<const __m128i*>(&weights_[offset3]);

-          for (int j = 0; j < (int)kNumChunks; j += 1)
+          __m128i sum0 = m128_dpbusd_epi32(input_vector[0], row0[0]);
+          __m128i sum1 = m128_dpbusd_epi32(input_vector[0], row1[0]);
+          __m128i sum2 = m128_dpbusd_epi32(input_vector[0], row2[0]);
+          __m128i sum3 = m128_dpbusd_epi32(input_vector[0], row3[0]);
+
+          for (int j = 1; j < (int)kNumChunks; ++j)
          {
            const __m128i in = input_vector[j];

-            m128_add_dpbusd_epi32(sum0, in, row0[j]);
-            m128_add_dpbusd_epi32(sum1, in, row1[j]);
-            m128_add_dpbusd_epi32(sum2, in, row2[j]);
-            m128_add_dpbusd_epi32(sum3, in, row3[j]);
+            sum0 = _mm_add_epi32(sum0, m128_dpbusd_epi32(in, row0[j]));
+            sum1 = _mm_add_epi32(sum1, m128_dpbusd_epi32(in, row1[j]));
+            sum2 = _mm_add_epi32(sum2, m128_dpbusd_epi32(in, row2[j]));
+            sum3 = _mm_add_epi32(sum3, m128_dpbusd_epi32(in, row3[j]));
          }

          *outptr = m128_haddx4(sum0, sum1, sum2, sum3, bias);
@@ -584,16 +671,12 @@ namespace Eval::NNUE::Layers {
      }
      else if constexpr (kOutputDimensions == 1)
      {
-        __m128i sum0 = _mm_setzero_si128();
-
        const auto row0 = reinterpret_cast<const __m128i*>(&weights_[0]);

-        for (int j = 0; j < (int)kNumChunks; j += 1)
-        {
-          const __m128i in = input_vector[j];
+        __m128i sum0 = m128_dpbusd_epi32(input_vector[0], row0[0]);

-          m128_add_dpbusd_epi32(sum0, in, row0[j]);
-        }
+        for (int j = 1; j < (int)kNumChunks; ++j)
+          sum0 = _mm_add_epi32(sum0, m128_dpbusd_epi32(input_vector[j], row0[j]));

        output[0] = m128_hadd(sum0, biases_[0]);
      }
--- a/src/nnue/nnue_accumulator.h
+++ b/src/nnue/nnue_accumulator.h
@@ -1,34 +1,35 @@
 /*
-    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)

-    Stockfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.

-    Stockfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.

-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

+// Class for difference calculation of NNUE evaluation function
+
 #ifndef NNUE_ACCUMULATOR_H_INCLUDED
 #define NNUE_ACCUMULATOR_H_INCLUDED

 #include "nnue_architecture.h"

-// Class for difference calculation of NNUE evaluation function
 namespace Eval::NNUE {

-    // Class that holds the result of affine transformation of input features
-    struct alignas(kCacheLineSize) Accumulator {
-        std::int16_t accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];
-        bool computed_accumulation;
-    };
+  // Class that holds the result of affine transformation of input features
+  struct alignas(kCacheLineSize) Accumulator {
+      std::int16_t accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];
+      bool computed_accumulation;
+  };

 }  // namespace Eval::NNUE

--- a/src/nnue/nnue_architecture.h
+++ b/src/nnue/nnue_architecture.h
@@ -1,36 +1,37 @@
 /*
-    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)

-    Stockfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.

-    Stockfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.

-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

+// Input features and network structure used in NNUE evaluation function
+
 #ifndef NNUE_ARCHITECTURE_H_INCLUDED
 #define NNUE_ARCHITECTURE_H_INCLUDED

 // Defines the network structure
 #include "architectures/halfkp_256x2-32-32.h"

-// Input features and network structure used in NNUE evaluation function
 namespace Eval::NNUE {

-    static_assert(kTransformedFeatureDimensions % kMaxSimdWidth == 0, "");
-    static_assert(Network::kOutputDimensions == 1, "");
-    static_assert(std::is_same<Network::OutputType, std::int32_t>::value, "");
+  static_assert(kTransformedFeatureDimensions % kMaxSimdWidth == 0, "");
+  static_assert(Network::kOutputDimensions == 1, "");
+  static_assert(std::is_same<Network::OutputType, std::int32_t>::value, "");

-    // Trigger for full calculation instead of difference calculation
-    constexpr auto kRefreshTriggers = RawFeatures::kRefreshTriggers;
+  // Trigger for full calculation instead of difference calculation
+  constexpr auto kRefreshTriggers = RawFeatures::kRefreshTriggers;

 }  // namespace Eval::NNUE

--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -1,19 +1,19 @@
 /*
-    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)

-    Stockfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.

-    Stockfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.

-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

 // A class that converts the input features of the NNUE evaluation function
@@ -23,7 +23,6 @@

 #include "nnue_common.h"
 #include "nnue_architecture.h"
-
 #include "features/index_list.h"

 #include <cstring>
@@ -31,456 +30,486 @@

 namespace Eval::NNUE {

-    // If vector instructions are enabled, we update and refresh the
-    // accumulator tile by tile such that each tile fits in the CPU's
-    // vector registers.
-#define TILING
+  // If vector instructions are enabled, we update and refresh the
+  // accumulator tile by tile such that each tile fits in the CPU's
+  // vector registers.
+  #define VECTOR

-#ifdef USE_AVX512
-    typedef __m512i vec_t;
-#define vec_load(a) _mm512_load_si512(a)
-#define vec_store(a,b) _mm512_store_si512(a,b)
-#define vec_add_16(a,b) _mm512_add_epi16(a,b)
-#define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
-#define vec_zero _mm512_setzero_si512()
-    static constexpr IndexType kNumRegs = 8; // only 8 are needed
+  #ifdef USE_AVX512
+  typedef __m512i vec_t;
+  #define vec_load(a) _mm512_load_si512(a)
+  #define vec_store(a,b) _mm512_store_si512(a,b)
+  #define vec_add_16(a,b) _mm512_add_epi16(a,b)
+  #define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
+  #define vec_zero _mm512_setzero_si512()
+  static constexpr IndexType kNumRegs = 8; // only 8 are needed

-#elif USE_AVX2
-    typedef __m256i vec_t;
-#define vec_load(a) _mm256_load_si256(a)
-#define vec_store(a,b) _mm256_store_si256(a,b)
-#define vec_add_16(a,b) _mm256_add_epi16(a,b)
-#define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
-#define vec_zero _mm256_setzero_si256()
-    static constexpr IndexType kNumRegs = 16;
-
-#elif USE_SSE2
-    typedef __m128i vec_t;
-#define vec_load(a) (*(a))
-#define vec_store(a,b) *(a)=(b)
-#define vec_add_16(a,b) _mm_add_epi16(a,b)
-#define vec_sub_16(a,b) _mm_sub_epi16(a,b)
-#define vec_zero _mm_setzero_si128()
-    static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8;
-
-#elif USE_MMX
-    typedef __m64 vec_t;
-#define vec_load(a) (*(a))
-#define vec_store(a,b) *(a)=(b)
-#define vec_add_16(a,b) _mm_add_pi16(a,b)
-#define vec_sub_16(a,b) _mm_sub_pi16(a,b)
-#define vec_zero _mm_setzero_si64()
-    static constexpr IndexType kNumRegs = 8;
-
-#elif USE_NEON
-    typedef int16x8_t vec_t;
-#define vec_load(a) (*(a))
-#define vec_store(a,b) *(a)=(b)
-#define vec_add_16(a,b) vaddq_s16(a,b)
-#define vec_sub_16(a,b) vsubq_s16(a,b)
-#define vec_zero {0}
+  #elif USE_AVX2
+  typedef __m256i vec_t;
+  #define vec_load(a) _mm256_load_si256(a)
+  #define vec_store(a,b) _mm256_store_si256(a,b)
+  #define vec_add_16(a,b) _mm256_add_epi16(a,b)
+  #define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
+  #define vec_zero _mm256_setzero_si256()
  static constexpr IndexType kNumRegs = 16;

+  #elif USE_SSE2
+  typedef __m128i vec_t;
+  #define vec_load(a) (*(a))
+  #define vec_store(a,b) *(a)=(b)
+  #define vec_add_16(a,b) _mm_add_epi16(a,b)
+  #define vec_sub_16(a,b) _mm_sub_epi16(a,b)
+  #define vec_zero _mm_setzero_si128()
+  static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8;
+
+  #elif USE_MMX
+  typedef __m64 vec_t;
+  #define vec_load(a) (*(a))
+  #define vec_store(a,b) *(a)=(b)
+  #define vec_add_16(a,b) _mm_add_pi16(a,b)
+  #define vec_sub_16(a,b) _mm_sub_pi16(a,b)
+  #define vec_zero _mm_setzero_si64()
+  static constexpr IndexType kNumRegs = 8;
+
+  #elif USE_NEON
+  typedef int16x8_t vec_t;
+  #define vec_load(a) (*(a))
+  #define vec_store(a,b) *(a)=(b)
+  #define vec_add_16(a,b) vaddq_s16(a,b)
+  #define vec_sub_16(a,b) vsubq_s16(a,b)
+  #define vec_zero {0}
+  static constexpr IndexType kNumRegs = 16;
+
+  #else
+  #undef VECTOR
+
+  #endif
+
+  // Input feature converter
+  class FeatureTransformer {
+
+   private:
+    // Number of output dimensions for one side
+    static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;
+
+    #ifdef VECTOR
+    static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2;
+    static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions");
+    #endif
+
+   public:
+    // Output type
+    using OutputType = TransformedFeatureType;
+
+    // Number of input/output dimensions
+    static constexpr IndexType kInputDimensions = RawFeatures::kDimensions;
+    static constexpr IndexType kOutputDimensions = kHalfDimensions * 2;
+
+    // Size of forward propagation buffer
+    static constexpr std::size_t kBufferSize =
+        kOutputDimensions * sizeof(OutputType);
+
+    static constexpr int kLayerIndex = 0;
+
+    // Hash value embedded in the evaluation file
+    static constexpr std::uint32_t GetHashValue() {
+
+      return RawFeatures::kHashValue ^ kOutputDimensions;
+    }
+
+    static std::string get_name() {
+      return RawFeatures::get_name() + "[" +
+          std::to_string(kInputDimensions) + "->" +
+          std::to_string(kHalfDimensions) + "x2]";
+    }
+
+    // Returns a string describing the network structure
+    static std::string get_structure_string() {
+      return get_name();
+    }
+
+    static std::string get_layers_info() {
+      std::string info = "  - ";
+      info += std::to_string(kLayerIndex);
+      info += " - ";
+      info += get_name();
+      return info;
+    }
+
+    // Read network parameters
+    bool ReadParameters(std::istream& stream) {
+
+      for (std::size_t i = 0; i < kHalfDimensions; ++i)
+        biases_[i] = read_little_endian<BiasType>(stream);
+      for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i)
+        weights_[i] = read_little_endian<WeightType>(stream);
+      return !stream.fail();
+    }
+
+    // Write network parameters
+    bool WriteParameters(std::ostream& stream) const {
+      stream.write(reinterpret_cast<const char*>(biases_),
+          kHalfDimensions * sizeof(BiasType));
+
+      stream.write(reinterpret_cast<const char*>(weights_),
+          kHalfDimensions * kInputDimensions * sizeof(WeightType));
+
+      return !stream.fail();
+    }
+
+    // Proceed with the difference calculation if possible
+    bool update_accumulator_if_possible(const Position& pos) const {
+
+      const auto now = pos.state();
+      if (now->accumulator.computed_accumulation)
+        return true;
+
+      const auto prev = now->previous;
+      if (prev && prev->accumulator.computed_accumulation) {
+        update_accumulator(pos);
+        return true;
+      }
+
+      return false;
+    }
+
+    // Convert input features
+    void Transform(const Position& pos, OutputType* output) const {
+
+      if (!update_accumulator_if_possible(pos))
+        refresh_accumulator(pos);
+
+      const auto& accumulation = pos.state()->accumulator.accumulation;
+
+  #if defined(USE_AVX512)
+      constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth * 2);
+      static_assert(kHalfDimensions % (kSimdWidth * 2) == 0);
+      const __m512i kControl = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
+      const __m512i kZero = _mm512_setzero_si512();
+
+  #elif defined(USE_AVX2)
+      constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+      constexpr int kControl = 0b11011000;
+      const __m256i kZero = _mm256_setzero_si256();
+
+  #elif defined(USE_SSE2)
+      constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+
+  #ifdef USE_SSE41
+      const __m128i kZero = _mm_setzero_si128();
+  #else
+      const __m128i k0x80s = _mm_set1_epi8(-128);
+  #endif
+
+  #elif defined(USE_MMX)
+      constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+      const __m64 k0x80s = _mm_set1_pi8(-128);
+
+  #elif defined(USE_NEON)
+      constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+      const int8x8_t kZero = {0};
+  #endif
+
+      const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
+      for (IndexType p = 0; p < 2; ++p) {
+        const IndexType offset = kHalfDimensions * p;
+
+  #if defined(USE_AVX512)
+        auto out = reinterpret_cast<__m512i*>(&output[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m512i sum0 = _mm512_load_si512(
+              &reinterpret_cast<const __m512i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
+          __m512i sum1 = _mm512_load_si512(
+              &reinterpret_cast<const __m512i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
+          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+              sum0 = _mm512_add_epi16(sum0, reinterpret_cast<const __m512i*>(
+                  accumulation[perspectives[p]][i])[j * 2 + 0]);
+              sum1 = _mm512_add_epi16(sum1, reinterpret_cast<const __m512i*>(
+                  accumulation[perspectives[p]][i])[j * 2 + 1]);
+          }
+
+          _mm512_store_si512(&out[j], _mm512_permutexvar_epi64(kControl,
+              _mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), kZero)));
+        }
+
+  #elif defined(USE_AVX2)
+        auto out = reinterpret_cast<__m256i*>(&output[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m256i sum0 = _mm256_load_si256(
+              &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
+          __m256i sum1 = _mm256_load_si256(
+              &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
+          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+              sum0 = _mm256_add_epi16(sum0, reinterpret_cast<const __m256i*>(
+                  accumulation[perspectives[p]][i])[j * 2 + 0]);
+              sum1 = _mm256_add_epi16(sum1, reinterpret_cast<const __m256i*>(
+                  accumulation[perspectives[p]][i])[j * 2 + 1]);
+          }
+
+          _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
+              _mm256_packs_epi16(sum0, sum1), kZero), kControl));
+        }
+
+  #elif defined(USE_SSE2)
+        auto out = reinterpret_cast<__m128i*>(&output[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
+              accumulation[perspectives[p]][0])[j * 2 + 0]);
+          __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
+              accumulation[perspectives[p]][0])[j * 2 + 1]);
+          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+            sum0 = _mm_add_epi16(sum0, reinterpret_cast<const __m128i*>(
+                accumulation[perspectives[p]][i])[j * 2 + 0]);
+            sum1 = _mm_add_epi16(sum1, reinterpret_cast<const __m128i*>(
+                accumulation[perspectives[p]][i])[j * 2 + 1]);
+          }
+
+      const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
+
+          _mm_store_si128(&out[j],
+
+  #ifdef USE_SSE41
+              _mm_max_epi8(packedbytes, kZero)
+  #else
+              _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
+  #endif
+
+          );
+        }
+
+  #elif defined(USE_MMX)
+        auto out = reinterpret_cast<__m64*>(&output[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m64 sum0 = *(&reinterpret_cast<const __m64*>(
+              accumulation[perspectives[p]][0])[j * 2 + 0]);
+          __m64 sum1 = *(&reinterpret_cast<const __m64*>(
+              accumulation[perspectives[p]][0])[j * 2 + 1]);
+          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+              sum0 = _mm_add_pi16(sum0, reinterpret_cast<const __m64*>(
+                  accumulation[perspectives[p]][i])[j * 2 + 0]);
+              sum1 = _mm_add_pi16(sum1, reinterpret_cast<const __m64*>(
+                  accumulation[perspectives[p]][i])[j * 2 + 1]);
+          }
+
+          const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
+          out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
+        }
+
+  #elif defined(USE_NEON)
+        const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          int16x8_t sum = reinterpret_cast<const int16x8_t*>(
+              accumulation[perspectives[p]][0])[j];
+
+          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+              sum = vaddq_s16(sum, reinterpret_cast<const int16x8_t*>(
+                  accumulation[perspectives[p]][i])[j]);
+          }
+
+          out[j] = vmax_s8(vqmovn_s16(sum), kZero);
+        }
+
+  #else
+        for (IndexType j = 0; j < kHalfDimensions; ++j) {
+          BiasType sum = accumulation[static_cast<int>(perspectives[p])][0][j];
+          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+              sum += accumulation[static_cast<int>(perspectives[p])][i][j];
+          }
+
+          output[offset + j] = static_cast<OutputType>(
+              std::max<int>(0, std::min<int>(127, sum)));
+        }
+  #endif
+
+      }
+  #if defined(USE_MMX)
+      _mm_empty();
+  #endif
+    }
+
+   private:
+    // Compute the accumulator from scratch, without using difference calculation
+    void refresh_accumulator(const Position& pos) const {
+
+  #ifdef VECTOR
+      // Gcc-10.2 unnecessarily spills AVX2 registers if this array
+      // is defined in the VECTOR code below, once in each branch
+      vec_t acc[kNumRegs];
+  #endif
+      auto& accumulator = pos.state()->accumulator;
+      for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+        Features::IndexList active_indices[2];
+        RawFeatures::append_active_indices(pos, kRefreshTriggers[i],
+                                           active_indices);
+          for (Color perspective : { WHITE, BLACK }) {
+#ifdef VECTOR
+            for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
+              auto accTile = reinterpret_cast<vec_t*>(
+                  &accumulator.accumulation[perspective][i][j * kTileHeight]);
+
+              if (i == 0) {
+                auto biasesTile = reinterpret_cast<const vec_t*>(
+                    &biases_[j * kTileHeight]);
+                for (IndexType k = 0; k < kNumRegs; ++k)
+                  acc[k] = biasesTile[k];
+              } else {
+                for (IndexType k = 0; k < kNumRegs; ++k)
+                  acc[k] = vec_zero;
+              }
+
+              for (const auto index : active_indices[perspective]) {
+                const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+                auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+
+                for (IndexType k = 0; k < kNumRegs; ++k)
+                  acc[k] = vec_add_16(acc[k], column[k]);
+              }
+
+              for (IndexType k = 0; k < kNumRegs; k++)
+                vec_store(&accTile[k], acc[k]);
+            }
 #else
-#undef TILING
-
-#endif
-
-    // Input feature converter
-    class FeatureTransformer {
-
-    private:
-        // Number of output dimensions for one side
-        static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;
-
-#ifdef TILING
-        static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2;
-        static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions");
-#endif
-
-    public:
-        // Output type
-        using OutputType = TransformedFeatureType;
-
-        // Number of input/output dimensions
-        static constexpr IndexType kInputDimensions = RawFeatures::kDimensions;
-        static constexpr IndexType kOutputDimensions = kHalfDimensions * 2;
-
-        // Size of forward propagation buffer
-        static constexpr std::size_t kBufferSize =
-            kOutputDimensions * sizeof(OutputType);
-
-        static constexpr int kLayerIndex = 0;
-
-        // Hash value embedded in the evaluation file
-        static constexpr std::uint32_t GetHashValue() {
-
-            return RawFeatures::kHashValue ^ kOutputDimensions;
-        }
-
-        static std::string get_name() {
-            return RawFeatures::get_name() + "[" +
-                std::to_string(kInputDimensions) + "->" +
-                std::to_string(kHalfDimensions) + "x2]";
-        }
-
-        // a string representing the structure
-        static std::string get_structure_string() {
-            return get_name();
-        }
-
-        static std::string get_layers_info() {
-            std::string info = "  - ";
-            info += std::to_string(kLayerIndex);
-            info += " - ";
-            info += get_name();
-            return info;
-        }
-
-        // Read network parameters
-        bool ReadParameters(std::istream& stream) {
-
-            for (std::size_t i = 0; i < kHalfDimensions; ++i)
-                biases_[i] = read_little_endian<BiasType>(stream);
-
-            for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i)
-                weights_[i] = read_little_endian<WeightType>(stream);
-
-            return !stream.fail();
-        }
-
-        // write parameters
-        bool WriteParameters(std::ostream& stream) const {
-            stream.write(reinterpret_cast<const char*>(biases_),
-                kHalfDimensions * sizeof(BiasType));
-
-            stream.write(reinterpret_cast<const char*>(weights_),
-                kHalfDimensions * kInputDimensions * sizeof(WeightType));
-
-            return !stream.fail();
-        }
-
-        // Proceed with the difference calculation if possible
-        bool update_accumulator_if_possible(const Position& pos) const {
-
-            const auto now = pos.state();
-            if (now->accumulator.computed_accumulation)
-                return true;
-
-            const auto prev = now->previous;
-            if (prev && prev->accumulator.computed_accumulation) {
-                update_accumulator(pos);
-                return true;
+            if (i == 0) {
+              std::memcpy(accumulator.accumulation[perspective][i], biases_,
+                          kHalfDimensions * sizeof(BiasType));
+            } else {
+              std::memset(accumulator.accumulation[perspective][i], 0,
+                          kHalfDimensions * sizeof(BiasType));
            }

-            return false;
-        }
-
-        // Convert input features
-        void Transform(const Position& pos, OutputType* output) const {
-
-            if (!update_accumulator_if_possible(pos))
-              refresh_accumulator(pos);
-
-            const auto& accumulation = pos.state()->accumulator.accumulation;
-
-#if defined(USE_AVX2)
-            constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
-            constexpr int kControl = 0b11011000;
-            const __m256i kZero = _mm256_setzero_si256();
-
-#elif defined(USE_SSE2)
-            constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
-
-#ifdef USE_SSE41
-            const __m128i kZero = _mm_setzero_si128();
-#else
-            const __m128i k0x80s = _mm_set1_epi8(-128);
-#endif
-
-#elif defined(USE_MMX)
-            constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
-            const __m64 k0x80s = _mm_set1_pi8(-128);
-
-#elif defined(USE_NEON)
-            constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-            const int8x8_t kZero = {0};
-#endif
-
-            const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
-            for (IndexType p = 0; p < 2; ++p) {
-                const IndexType offset = kHalfDimensions * p;
-
-#if defined(USE_AVX2)
-                auto out = reinterpret_cast<__m256i*>(&output[offset]);
-                for (IndexType j = 0; j < kNumChunks; ++j) {
-                    __m256i sum0 = _mm256_load_si256(
-                        &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
-                    __m256i sum1 = _mm256_load_si256(
-                      &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
-                    for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-                        sum0 = _mm256_add_epi16(sum0, reinterpret_cast<const __m256i*>(
-                            accumulation[perspectives[p]][i])[j * 2 + 0]);
-                        sum1 = _mm256_add_epi16(sum1, reinterpret_cast<const __m256i*>(
-                            accumulation[perspectives[p]][i])[j * 2 + 1]);
-                    }
-
-                    _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
-                        _mm256_packs_epi16(sum0, sum1), kZero), kControl));
-                }
-
-#elif defined(USE_SSE2)
-                auto out = reinterpret_cast<__m128i*>(&output[offset]);
-                for (IndexType j = 0; j < kNumChunks; ++j) {
-                    __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
-                        accumulation[perspectives[p]][0])[j * 2 + 0]);
-                    __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
-                        accumulation[perspectives[p]][0])[j * 2 + 1]);
-                    for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-                        sum0 = _mm_add_epi16(sum0, reinterpret_cast<const __m128i*>(
-                            accumulation[perspectives[p]][i])[j * 2 + 0]);
-                        sum1 = _mm_add_epi16(sum1, reinterpret_cast<const __m128i*>(
-                            accumulation[perspectives[p]][i])[j * 2 + 1]);
-                    }
-
-                    const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
-
-                    _mm_store_si128(&out[j],
-
-#ifdef USE_SSE41
-                        _mm_max_epi8(packedbytes, kZero)
-#else
-                        _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
-#endif
-
-                    );
-                }
-
-#elif defined(USE_MMX)
-                auto out = reinterpret_cast<__m64*>(&output[offset]);
-                for (IndexType j = 0; j < kNumChunks; ++j) {
-                    __m64 sum0 = *(&reinterpret_cast<const __m64*>(
-                        accumulation[perspectives[p]][0])[j * 2 + 0]);
-                    __m64 sum1 = *(&reinterpret_cast<const __m64*>(
-                        accumulation[perspectives[p]][0])[j * 2 + 1]);
-                    for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-                        sum0 = _mm_add_pi16(sum0, reinterpret_cast<const __m64*>(
-                            accumulation[perspectives[p]][i])[j * 2 + 0]);
-                        sum1 = _mm_add_pi16(sum1, reinterpret_cast<const __m64*>(
-                            accumulation[perspectives[p]][i])[j * 2 + 1]);
-                    }
-
-                    const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
-                    out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
-                }
-
-#elif defined(USE_NEON)
-                const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
-                for (IndexType j = 0; j < kNumChunks; ++j) {
-                    int16x8_t sum = reinterpret_cast<const int16x8_t*>(
-                        accumulation[perspectives[p]][0])[j];
-
-                    for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-                        sum = vaddq_s16(sum, reinterpret_cast<const int16x8_t*>(
-                            accumulation[perspectives[p]][i])[j]);
-                    }
-
-                    out[j] = vmax_s8(vqmovn_s16(sum), kZero);
-                }
-
-#else
-                for (IndexType j = 0; j < kHalfDimensions; ++j) {
-                    BiasType sum = accumulation[static_cast<int>(perspectives[p])][0][j];
-                    for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-                        sum += accumulation[static_cast<int>(perspectives[p])][i][j];
-                    }
-
-                    output[offset + j] = static_cast<OutputType>(
-                        std::max<int>(0, std::min<int>(127, sum)));
-                }
-#endif
+            for (const auto index : active_indices[perspective]) {
+              const IndexType offset = kHalfDimensions * index;

+              for (IndexType j = 0; j < kHalfDimensions; ++j)
+                accumulator.accumulation[perspective][i][j] += weights_[offset + j];
            }
-#if defined(USE_MMX)
-            _mm_empty();
 #endif
+          }
+
        }

-    private:
-        // Calculate cumulative value without using difference calculation
-        void refresh_accumulator(const Position& pos) const {
-
-            auto& accumulator = pos.state()->accumulator;
-            for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-                Features::IndexList active_indices[2];
-                RawFeatures::append_active_indices(pos, kRefreshTriggers[i],
-                                                   active_indices);
-                for (Color perspective : { WHITE, BLACK }) {
-#ifdef TILING
-                    for (unsigned j = 0; j < kHalfDimensions / kTileHeight; ++j) {
-                        auto accTile = reinterpret_cast<vec_t*>(
-                            &accumulator.accumulation[perspective][i][j * kTileHeight]);
-                        vec_t acc[kNumRegs];
-
-                        if (i == 0) {
-                            auto biasesTile = reinterpret_cast<const vec_t*>(
-                                &biases_[j * kTileHeight]);
-                            for (unsigned k = 0; k < kNumRegs; ++k)
-                                acc[k] = biasesTile[k];
-                        } else {
-                            for (unsigned k = 0; k < kNumRegs; ++k)
-                                acc[k] = vec_zero;
-                        }
-
-                        for (const auto index : active_indices[perspective]) {
-                            const IndexType offset = kHalfDimensions * index + j * kTileHeight;
-                            auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
-
-                            for (unsigned k = 0; k < kNumRegs; ++k)
-                                acc[k] = vec_add_16(acc[k], column[k]);
-                        }
-
-                        for (unsigned k = 0; k < kNumRegs; k++)
-                            vec_store(&accTile[k], acc[k]);
-                    }
-#else
-                    if (i == 0) {
-                        std::memcpy(accumulator.accumulation[perspective][i], biases_,
-                                    kHalfDimensions * sizeof(BiasType));
-                    } else {
-                        std::memset(accumulator.accumulation[perspective][i], 0,
-                                    kHalfDimensions * sizeof(BiasType));
-                    }
-
-                    for (const auto index : active_indices[perspective]) {
-                        const IndexType offset = kHalfDimensions * index;
-
-                        for (IndexType j = 0; j < kHalfDimensions; ++j)
-                            accumulator.accumulation[perspective][i][j] += weights_[offset + j];
-                    }
-#endif
-                }
-
-            }
-
 #if defined(USE_MMX)
-            _mm_empty();
+        _mm_empty();
 #endif

-            accumulator.computed_accumulation = true;
+        accumulator.computed_accumulation = true;
+    }
+
+    // Update the accumulator from the previous state's accumulator (difference calculation)
+    void update_accumulator(const Position& pos) const {
+
+  #ifdef VECTOR
+      // Gcc-10.2 unnecessarily spills AVX2 registers if this array
+      // is defined in the VECTOR code below, once in each branch
+      vec_t acc[kNumRegs];
+  #endif
+    const auto& prev_accumulator = pos.state()->previous->accumulator;
+    auto& accumulator = pos.state()->accumulator;
+    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+      Features::IndexList removed_indices[2], added_indices[2];
+      bool reset[2] = { false, false };
+      RawFeatures::append_changed_indices(pos, kRefreshTriggers[i],
+                                          removed_indices, added_indices, reset);
+
+#ifdef VECTOR
+      for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
+        for (Color perspective : { WHITE, BLACK }) {
+          auto accTile = reinterpret_cast<vec_t*>(
+              &accumulator.accumulation[perspective][i][j * kTileHeight]);
+
+          if (reset[perspective]) {
+            if (i == 0) {
+              auto biasesTile = reinterpret_cast<const vec_t*>(
+                  &biases_[j * kTileHeight]);
+              for (IndexType k = 0; k < kNumRegs; ++k)
+                acc[k] = biasesTile[k];
+            } else {
+              for (IndexType k = 0; k < kNumRegs; ++k)
+                acc[k] = vec_zero;
+            }
+          } else {
+            auto prevAccTile = reinterpret_cast<const vec_t*>(
+                &prev_accumulator.accumulation[perspective][i][j * kTileHeight]);
+
+            for (IndexType k = 0; k < kNumRegs; ++k)
+              acc[k] = vec_load(&prevAccTile[k]);
+
+            // Difference calculation for the deactivated features
+            for (const auto index : removed_indices[perspective]) {
+              const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+              auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+
+              for (IndexType k = 0; k < kNumRegs; ++k)
+                acc[k] = vec_sub_16(acc[k], column[k]);
+            }
+          }
+
+          { // Difference calculation for the activated features
+            for (const auto index : added_indices[perspective]) {
+              const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+              auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+
+              for (IndexType k = 0; k < kNumRegs; ++k)
+                acc[k] = vec_add_16(acc[k], column[k]);
+            }
+          }
+
+          for (IndexType k = 0; k < kNumRegs; ++k)
+            vec_store(&accTile[k], acc[k]);
        }
-
-        // Calculate cumulative value using difference calculation
-        void update_accumulator(const Position& pos) const {
-
-            const auto& prev_accumulator = pos.state()->previous->accumulator;
-            auto& accumulator = pos.state()->accumulator;
-            for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-                Features::IndexList removed_indices[2], added_indices[2];
-                bool reset[2] = { false, false };
-                RawFeatures::append_changed_indices(pos, kRefreshTriggers[i],
-                                                    removed_indices, added_indices, reset);
-
-#ifdef TILING
-                for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
-                    for (Color perspective : { WHITE, BLACK }) {
-                        auto accTile = reinterpret_cast<vec_t*>(
-                            &accumulator.accumulation[perspective][i][j * kTileHeight]);
-                        vec_t acc[kNumRegs];
-
-                        if (reset[perspective]) {
-                            if (i == 0) {
-                                auto biasesTile = reinterpret_cast<const vec_t*>(
-                                    &biases_[j * kTileHeight]);
-                                for (unsigned k = 0; k < kNumRegs; ++k)
-                                    acc[k] = biasesTile[k];
-                            } else {
-                                for (unsigned k = 0; k < kNumRegs; ++k)
-                                    acc[k] = vec_zero;
-                            }
-                        } else {
-                            auto prevAccTile = reinterpret_cast<const vec_t*>(
-                                &prev_accumulator.accumulation[perspective][i][j * kTileHeight]);
-
-                            for (IndexType k = 0; k < kNumRegs; ++k)
-                                acc[k] = vec_load(&prevAccTile[k]);
-
-                            // Difference calculation for the deactivated features
-                            for (const auto index : removed_indices[perspective]) {
-                                const IndexType offset = kHalfDimensions * index + j * kTileHeight;
-                                auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
-
-                                for (IndexType k = 0; k < kNumRegs; ++k)
-                                    acc[k] = vec_sub_16(acc[k], column[k]);
-                            }
-                        }
-
-                        { // Difference calculation for the activated features
-                          for (const auto index : added_indices[perspective]) {
-                              const IndexType offset = kHalfDimensions * index + j * kTileHeight;
-                              auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
-
-                              for (IndexType k = 0; k < kNumRegs; ++k)
-                                  acc[k] = vec_add_16(acc[k], column[k]);
-                          }
-                        }
-
-                        for (IndexType k = 0; k < kNumRegs; ++k)
-                          vec_store(&accTile[k], acc[k]);
-                    }
-                }
+      }
 #if defined(USE_MMX)
-                _mm_empty();
+      _mm_empty();
 #endif

 #else
-                for (Color perspective : { WHITE, BLACK }) {
+      for (Color perspective : { WHITE, BLACK }) {

-                    if (reset[perspective]) {
-                        if (i == 0) {
-                            std::memcpy(accumulator.accumulation[perspective][i], biases_,
-                                        kHalfDimensions * sizeof(BiasType));
-                        } else {
-                            std::memset(accumulator.accumulation[perspective][i], 0,
-                                        kHalfDimensions * sizeof(BiasType));
-                        }
-                    } else {
-                        std::memcpy(accumulator.accumulation[perspective][i],
-                                    prev_accumulator.accumulation[perspective][i],
-                                    kHalfDimensions * sizeof(BiasType));
-                        // Difference calculation for the deactivated features
-                        for (const auto index : removed_indices[perspective]) {
-                            const IndexType offset = kHalfDimensions * index;
+        if (reset[perspective]) {
+          if (i == 0) {
+            std::memcpy(accumulator.accumulation[perspective][i], biases_,
+                        kHalfDimensions * sizeof(BiasType));
+          } else {
+            std::memset(accumulator.accumulation[perspective][i], 0,
+                        kHalfDimensions * sizeof(BiasType));
+          }
+        } else {
+          std::memcpy(accumulator.accumulation[perspective][i],
+                      prev_accumulator.accumulation[perspective][i],
+                      kHalfDimensions * sizeof(BiasType));
+          // Difference calculation for the deactivated features
+          for (const auto index : removed_indices[perspective]) {
+            const IndexType offset = kHalfDimensions * index;

-                            for (IndexType j = 0; j < kHalfDimensions; ++j)
-                                accumulator.accumulation[perspective][i][j] -= weights_[offset + j];
-                        }
-                    }
-                    { // Difference calculation for the activated features
-                        for (const auto index : added_indices[perspective]) {
-                          const IndexType offset = kHalfDimensions * index;
-
-                          for (IndexType j = 0; j < kHalfDimensions; ++j)
-                              accumulator.accumulation[perspective][i][j] += weights_[offset + j];
-                        }
-                    }
-                }
-#endif
-            }
-            accumulator.computed_accumulation = true;
+            for (IndexType j = 0; j < kHalfDimensions; ++j)
+              accumulator.accumulation[perspective][i][j] -= weights_[offset + j];
+          }
        }
+        { // Difference calculation for the activated features
+          for (const auto index : added_indices[perspective]) {
+            const IndexType offset = kHalfDimensions * index;

-        using BiasType = std::int16_t;
-        using WeightType = std::int16_t;
+            for (IndexType j = 0; j < kHalfDimensions; ++j)
+              accumulator.accumulation[perspective][i][j] += weights_[offset + j];
+          }
+        }
+      }
+#endif
+      }
+      accumulator.computed_accumulation = true;
+    }

-        // Make the learning class a friend
-        friend class Trainer<FeatureTransformer>;
+    using BiasType = std::int16_t;
+    using WeightType = std::int16_t;

-        alignas(kCacheLineSize) BiasType biases_[kHalfDimensions];
-        alignas(kCacheLineSize)
-            WeightType weights_[kHalfDimensions * kInputDimensions];
-    };
+    // Make the learning class a friend
+    friend class Trainer<FeatureTransformer>;
+
+    alignas(kCacheLineSize) BiasType biases_[kHalfDimensions];
+    alignas(kCacheLineSize)
+        WeightType weights_[kHalfDimensions * kInputDimensions];
+  };

 }  // namespace Eval::NNUE

-#endif //#ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
+#endif // #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED