diff --git a/AUTHORS b/AUTHORS
index f30be4de..b31a36e9 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -44,6 +44,7 @@ Daniel Dugovic (ddugovic)
 Dariusz Orzechowski (dorzechowski)
 David Zar
 Daylen Yang (daylen)
+Deshawn Mohan-Smith (GoldenRare)
 DiscanX
 Dominik Schlösser (domschl)
 double-beep
@@ -64,7 +65,6 @@ Gary Heckman (gheckman)
 George Sobala (gsobala)
 gguliash
 Gian-Carlo Pascutto (gcp)
-Deshawn Mohan-Smith (GoldenRare)
 Gontran Lemaire (gonlem)
 Goodkov Vasiliy Aleksandrovich (goodkov)
 Gregor Cramer
@@ -112,6 +112,7 @@ Mark Tenzer (31m059)
 marotear
 Matthew Lai (matthewlai)
 Matthew Sullivan (Matt14916)
+Maxim Molchanov (Maxim)
 Michael An (man)
 Michael Byrne (MichaelB7)
 Michael Chaly (Vizvezdenec)
diff --git a/src/Makefile b/src/Makefile
index 7f00bfff..7f1eaa86 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -41,7 +41,7 @@ BINDIR = $(PREFIX)/bin
 ### Built-in benchmark for pgo-builds
 PGO_TRAINING_DATA_FILE = pgo_training_data.bin
 PGOBENCH = ./$(EXE) bench
-PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 1000 output_file_name $(PGO_TRAINING_DATA_FILE)
+PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 1000 sfen_format bin output_file_name $(PGO_TRAINING_DATA_FILE)
 
 ### Source and object files
 SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \
diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 2abc6ac8..dd204a52 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -84,11 +84,11 @@ using namespace Trace;
 namespace {
 
   // Threshold for lazy and space evaluation
-  constexpr Value LazyThreshold1 =  Value(1400);
-  constexpr Value LazyThreshold2 =  Value(1300);
-  constexpr Value SpaceThreshold = Value(12222);
-  constexpr Value NNUEThreshold1 =   Value(550);
-  constexpr Value NNUEThreshold2 =   Value(150);
+  constexpr Value LazyThreshold1 =  Value(1565);
+  constexpr Value LazyThreshold2 =  Value(1102);
+  constexpr Value SpaceThreshold = Value(11551);
+  constexpr Value NNUEThreshold1 =   Value(682);
+  constexpr Value NNUEThreshold2 =   Value(176);
 
   // KingAttackWeights[PieceType] contains king attack weights by piece type
   constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 81, 52, 44, 10 };
@@ -930,7 +930,7 @@ Value Eval::evaluate(const Position& pos) {
   {
       // Scale and shift NNUE for compatibility with search and classical evaluation
       auto  adjusted_NNUE = [&](){
-         int mat = pos.non_pawn_material() + PieceValue[MG][PAWN] * pos.count<PAWN>();
+         int mat = pos.non_pawn_material() + PawnValueMg * pos.count<PAWN>();
          return NNUE::evaluate(pos) * (720 + mat / 32) / 1024 + Tempo;
       };
 
@@ -940,16 +940,18 @@ Value Eval::evaluate(const Position& pos) {
       bool  largePsq = psq * 16 > (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50;
       bool  classical = largePsq || (psq > PawnValueMg / 4 && !(pos.this_thread()->nodes & 0xB));
 
-      v = classical ? Evaluation<NO_TRACE>(pos).value() : adjusted_NNUE();
+      bool strongClassical = pos.non_pawn_material() < 2 * RookValueMg && pos.count<PAWN>() < 2;
+
+      v = classical || strongClassical ? Evaluation<NO_TRACE>(pos).value() : adjusted_NNUE();
 
       // If the classical eval is small and imbalance large, use NNUE nevertheless.
       // For the case of opposite colored bishops, switch to NNUE eval with
       // small probability if the classical eval is less than the threshold.
-      if (   largePsq
-          && (abs(v) * 16 < NNUEThreshold2 * r50
-          || (   pos.opposite_bishops()
-              && abs(v) * 16 < (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50
-              && !(pos.this_thread()->nodes & 0xB))))
+      if (   largePsq && !strongClassical
+          && (   abs(v) * 16 < NNUEThreshold2 * r50
+              || (   pos.opposite_bishops()
+                  && abs(v) * 16 < (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50
+                  && !(pos.this_thread()->nodes & 0xB))))
           v = adjusted_NNUE();
   }
 
diff --git a/src/misc.cpp b/src/misc.cpp
index 879f4462..eb68e842 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -585,11 +585,10 @@ namespace CommandLine {
 string argv0;            // path+name of the executable binary, as given by argv[0]
 string binaryDirectory;  // path of the executable directory
 string workingDirectory; // path of the working directory
-string pathSeparator;    // Separator for our current OS
 
 void init(int argc, char* argv[]) {
     (void)argc;
-    string separator;
+    string pathSeparator;
 
     // extract the path+name of the executable binary
     argv0 = argv[0];
diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index c9a3ddbb..c7bd681f 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -1,19 +1,19 @@
 /*
-    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-    Stockfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
 
-    Stockfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
 
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 // Code for calculating NNUE evaluation function
@@ -40,330 +40,313 @@
 
 namespace Eval::NNUE {
 
-    const uint32_t kpp_board_index[PIECE_NB][COLOR_NB] = {
-        // convention: W - us, B - them
-        // viewed from other side, W and B are reversed
-        { PS_NONE,     PS_NONE     },
-        { PS_W_PAWN,   PS_B_PAWN   },
-        { PS_W_KNIGHT, PS_B_KNIGHT },
-        { PS_W_BISHOP, PS_B_BISHOP },
-        { PS_W_ROOK,   PS_B_ROOK   },
-        { PS_W_QUEEN,  PS_B_QUEEN  },
-        { PS_W_KING,   PS_B_KING   },
-        { PS_NONE,     PS_NONE     },
-        { PS_NONE,     PS_NONE     },
-        { PS_B_PAWN,   PS_W_PAWN   },
-        { PS_B_KNIGHT, PS_W_KNIGHT },
-        { PS_B_BISHOP, PS_W_BISHOP },
-        { PS_B_ROOK,   PS_W_ROOK   },
-        { PS_B_QUEEN,  PS_W_QUEEN  },
-        { PS_B_KING,   PS_W_KING   },
-        { PS_NONE,     PS_NONE     }
-    };
+  const uint32_t kpp_board_index[PIECE_NB][COLOR_NB] = {
+   // convention: W - us, B - them
+   // viewed from other side, W and B are reversed
+      { PS_NONE,     PS_NONE     },
+      { PS_W_PAWN,   PS_B_PAWN   },
+      { PS_W_KNIGHT, PS_B_KNIGHT },
+      { PS_W_BISHOP, PS_B_BISHOP },
+      { PS_W_ROOK,   PS_B_ROOK   },
+      { PS_W_QUEEN,  PS_B_QUEEN  },
+      { PS_W_KING,   PS_B_KING   },
+      { PS_NONE,     PS_NONE     },
+      { PS_NONE,     PS_NONE     },
+      { PS_B_PAWN,   PS_W_PAWN   },
+      { PS_B_KNIGHT, PS_W_KNIGHT },
+      { PS_B_BISHOP, PS_W_BISHOP },
+      { PS_B_ROOK,   PS_W_ROOK   },
+      { PS_B_QUEEN,  PS_W_QUEEN  },
+      { PS_B_KING,   PS_W_KING   },
+      { PS_NONE,     PS_NONE     }
+  };
 
-    // Input feature converter
-    LargePagePtr<FeatureTransformer> feature_transformer;
+  // Input feature converter
+  LargePagePtr<FeatureTransformer> feature_transformer;
 
-    // Evaluation function
-    AlignedPtr<Network> network;
+  // Evaluation function
+  AlignedPtr<Network> network;
 
-    // Evaluation function file name
-    std::string fileName;
+  // Evaluation function file name
+  std::string fileName;
 
-    // Saved evaluation function file name
-    std::string savedfileName = "nn.bin";
+  // Saved evaluation function file name
+  std::string savedfileName = "nn.bin";
 
-    // Get a string that represents the structure of the evaluation function
-    std::string get_architecture_string() {
-        return "Features=" + FeatureTransformer::get_structure_string() +
-            ",Network=" + Network::get_structure_string();
-    }
+  // Get a string that represents the structure of the evaluation function
+  std::string get_architecture_string() {
+    return "Features=" + FeatureTransformer::get_structure_string() +
+        ",Network=" + Network::get_structure_string();
+  }
 
-    std::string get_layers_info() {
-        return
-            FeatureTransformer::get_layers_info()
-            + '\n' + Network::get_layers_info();
-    }
+  std::string get_layers_info() {
+    return
+        FeatureTransformer::get_layers_info()
+        + '\n' + Network::get_layers_info();
+  }
 
-    UseNNUEMode useNNUE;
-    std::string eval_file_loaded = "None";
+  UseNNUEMode useNNUE;
+  std::string eval_file_loaded = "None";
 
-    namespace Detail {
+  namespace Detail {
 
-        // Initialize the evaluation function parameters
-        template <typename T>
-        void initialize(AlignedPtr<T>& pointer) {
+  // Initialize the evaluation function parameters
+  template <typename T>
+  void initialize(AlignedPtr<T>& pointer) {
 
-            pointer.reset(reinterpret_cast<T*>(std_aligned_alloc(alignof(T), sizeof(T))));
-            std::memset(pointer.get(), 0, sizeof(T));
-        }
+    pointer.reset(reinterpret_cast<T*>(std_aligned_alloc(alignof(T), sizeof(T))));
+    std::memset(pointer.get(), 0, sizeof(T));
+  }
 
-        template <typename T>
-        void initialize(LargePagePtr<T>& pointer) {
+  template <typename T>
+  void initialize(LargePagePtr<T>& pointer) {
 
-            static_assert(alignof(T) <= 4096, "aligned_large_pages_alloc() may fail for such a big alignment requirement of T");
+    static_assert(alignof(T) <= 4096, "aligned_large_pages_alloc() may fail for such a big alignment requirement of T");
+    pointer.reset(reinterpret_cast<T*>(aligned_large_pages_alloc(sizeof(T))));
+    std::memset(pointer.get(), 0, sizeof(T));
+  }
 
-            pointer.reset(reinterpret_cast<T*>(aligned_large_pages_alloc(sizeof(T))));
-            std::memset(pointer.get(), 0, sizeof(T));
-        }
+  // Read evaluation function parameters
+  template <typename T>
+  bool ReadParameters(std::istream& stream, T& reference) {
 
-        // Read evaluation function parameters
-        template <typename T>
-        bool ReadParameters(std::istream& stream, T& reference) {
+    std::uint32_t header;
+    header = read_little_endian<std::uint32_t>(stream);
+    if (!stream || header != T::GetHashValue()) return false;
+    return reference.ReadParameters(stream);
+  }
 
-            std::uint32_t header;
-            header = read_little_endian<std::uint32_t>(stream);
+  // Write evaluation function parameters
+  template <typename T>
+  bool WriteParameters(std::ostream& stream, const AlignedPtr<T>& pointer) {
+    constexpr std::uint32_t header = T::GetHashValue();
 
-            if (!stream || header != T::GetHashValue())
-                return false;
+    stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
 
-            return reference.ReadParameters(stream);
-        }
+    return pointer->WriteParameters(stream);
+  }
 
-        // write evaluation function parameters
-        template <typename T>
-        bool WriteParameters(std::ostream& stream, const AlignedPtr<T>& pointer) {
-            constexpr std::uint32_t header = T::GetHashValue();
+  template <typename T>
+  bool WriteParameters(std::ostream& stream, const LargePagePtr<T>& pointer) {
+    constexpr std::uint32_t header = T::GetHashValue();
 
-            stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
+    stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
 
-            return pointer->WriteParameters(stream);
-        }
+    return pointer->WriteParameters(stream);
+  }
+  }  // namespace Detail
 
-        template <typename T>
-        bool WriteParameters(std::ostream& stream, const LargePagePtr<T>& pointer) {
-            constexpr std::uint32_t header = T::GetHashValue();
+  // Initialize the evaluation function parameters
+  void initialize() {
 
-            stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
+    Detail::initialize(feature_transformer);
+    Detail::initialize(network);
+  }
 
-            return pointer->WriteParameters(stream);
-        }
-    }  // namespace Detail
+  // Read network header
+  bool read_header(std::istream& stream, std::uint32_t* hash_value, std::string* architecture)
+  {
+    std::uint32_t version, size;
 
-    // Initialize the evaluation function parameters
-    void initialize() {
+    version     = read_little_endian<std::uint32_t>(stream);
+    *hash_value = read_little_endian<std::uint32_t>(stream);
+    size        = read_little_endian<std::uint32_t>(stream);
+    if (!stream || version != kVersion) return false;
+    architecture->resize(size);
+    stream.read(&(*architecture)[0], size);
+    return !stream.fail();
+  }
 
-        Detail::initialize(feature_transformer);
-        Detail::initialize(network);
-    }
+  // Write network header
+  bool write_header(std::ostream& stream,
+    std::uint32_t hash_value, const std::string& architecture) {
 
-    // Read network header
-    bool read_header(std::istream& stream, std::uint32_t* hash_value, std::string* architecture)
-    {
-        std::uint32_t version, size;
+    stream.write(reinterpret_cast<const char*>(&kVersion), sizeof(kVersion));
+    stream.write(reinterpret_cast<const char*>(&hash_value), sizeof(hash_value));
 
-        version     = read_little_endian<std::uint32_t>(stream);
-        *hash_value = read_little_endian<std::uint32_t>(stream);
-        size        = read_little_endian<std::uint32_t>(stream);
+    const std::uint32_t size = static_cast<std::uint32_t>(architecture.size());
 
-        if (!stream || version != kVersion)
-            return false;
+    stream.write(reinterpret_cast<const char*>(&size), sizeof(size));
+    stream.write(architecture.data(), size);
 
-        architecture->resize(size);
-        stream.read(&(*architecture)[0], size);
+    return !stream.fail();
+  }
 
-        return !stream.fail();
-    }
+  // Read network parameters
+  bool ReadParameters(std::istream& stream) {
 
-    // write the header
-    bool write_header(std::ostream& stream,
-        std::uint32_t hash_value, const std::string& architecture) {
+    std::uint32_t hash_value;
+    std::string architecture;
+    if (!read_header(stream, &hash_value, &architecture)) return false;
+    if (hash_value != kHashValue) return false;
+    if (!Detail::ReadParameters(stream, *feature_transformer)) return false;
+    if (!Detail::ReadParameters(stream, *network)) return false;
+    return stream && stream.peek() == std::ios::traits_type::eof();
+  }
 
-        stream.write(reinterpret_cast<const char*>(&kVersion), sizeof(kVersion));
-        stream.write(reinterpret_cast<const char*>(&hash_value), sizeof(hash_value));
+  // Write network parameters
+  bool WriteParameters(std::ostream& stream) {
 
-        const std::uint32_t size = static_cast<std::uint32_t>(architecture.size());
+    if (!write_header(stream, kHashValue, get_architecture_string()))
+        return false;
 
-        stream.write(reinterpret_cast<const char*>(&size), sizeof(size));
-        stream.write(architecture.data(), size);
+    if (!Detail::WriteParameters(stream, feature_transformer))
+        return false;
 
-        return !stream.fail();
-    }
+    if (!Detail::WriteParameters(stream, network))
+        return false;
 
-    // Read network parameters
-    bool ReadParameters(std::istream& stream) {
+    return !stream.fail();
+}
 
-        std::uint32_t hash_value;
-        std::string architecture;
-        if (!read_header(stream, &hash_value, &architecture))
-            return false;
+  // Evaluation function. Perform differential calculation.
+  Value evaluate(const Position& pos) {
 
-        if (hash_value != kHashValue)
-            return false;
+    // We manually align the arrays on the stack because with gcc < 9.3
+    // overaligning stack variables with alignas() doesn't work correctly.
 
-        if (!Detail::ReadParameters(stream, *feature_transformer))
-            return false;
-
-        if (!Detail::ReadParameters(stream, *network))
-            return false;
-
-        return stream && stream.peek() == std::ios::traits_type::eof();
-    }
-    // write evaluation function parameters
-    bool WriteParameters(std::ostream& stream) {
-
-        if (!write_header(stream, kHashValue, get_architecture_string()))
-            return false;
-
-        if (!Detail::WriteParameters(stream, feature_transformer))
-            return false;
-
-        if (!Detail::WriteParameters(stream, network))
-            return false;
-
-        return !stream.fail();
-    }
-    // Evaluation function. Perform differential calculation.
-    Value evaluate(const Position& pos) {
-
-        // We manually align the arrays on the stack because with gcc < 9.3
-        // overaligning stack variables with alignas() doesn't work correctly.
-
-        constexpr uint64_t alignment = kCacheLineSize;
+    constexpr uint64_t alignment = kCacheLineSize;
 
 #if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN)
-        TransformedFeatureType transformed_features_unaligned[
-          FeatureTransformer::kBufferSize + alignment / sizeof(TransformedFeatureType)];
-        char buffer_unaligned[Network::kBufferSize + alignment];
+    TransformedFeatureType transformed_features_unaligned[
+      FeatureTransformer::kBufferSize + alignment / sizeof(TransformedFeatureType)];
+    char buffer_unaligned[Network::kBufferSize + alignment];
 
-        auto* transformed_features = align_ptr_up<alignment>(&transformed_features_unaligned[0]);
-        auto* buffer = align_ptr_up<alignment>(&buffer_unaligned[0]);
+    auto* transformed_features = align_ptr_up<alignment>(&transformed_features_unaligned[0]);
+    auto* buffer = align_ptr_up<alignment>(&buffer_unaligned[0]);
 #else
-        alignas(alignment)
-          TransformedFeatureType transformed_features[FeatureTransformer::kBufferSize];
-        alignas(alignment) char buffer[Network::kBufferSize];
+    alignas(alignment)
+      TransformedFeatureType transformed_features[FeatureTransformer::kBufferSize];
+    alignas(alignment) char buffer[Network::kBufferSize];
 #endif
 
-        ASSERT_ALIGNED(transformed_features, alignment);
-        ASSERT_ALIGNED(buffer, alignment);
+    ASSERT_ALIGNED(transformed_features, alignment);
+    ASSERT_ALIGNED(buffer, alignment);
 
-        feature_transformer->Transform(pos, transformed_features);
+    feature_transformer->Transform(pos, transformed_features);
+    const auto output = network->Propagate(transformed_features, buffer);
 
+    return static_cast<Value>(output[0] / FV_SCALE);
+  }
 
-        const auto output = network->Propagate(transformed_features, buffer);
+  // Load eval, from a file stream or a memory stream
+  bool load_eval(std::string name, std::istream& stream) {
 
-        return static_cast<Value>(output[0] / FV_SCALE);
-    }
+    initialize();
+    fileName = name;
+    return ReadParameters(stream);
+}
 
-    // Load eval, from a file stream or a memory stream
-    bool load_eval(std::string name, std::istream& stream) {
+static UseNNUEMode nnue_mode_from_option(const UCI::Option& mode)
+{
+  if (mode == "false")
+    return UseNNUEMode::False;
+  else if (mode == "true")
+     return UseNNUEMode::True;
+  else if (mode == "pure")
+    return UseNNUEMode::Pure;
 
-        initialize();
+  return UseNNUEMode::False;
+}
 
-        fileName = name;
-        return ReadParameters(stream);
-    }
+void init() {
 
-    static UseNNUEMode nnue_mode_from_option(const UCI::Option& mode)
-    {
-        if (mode == "false")
-          return UseNNUEMode::False;
-        else if (mode == "true")
-          return UseNNUEMode::True;
-        else if (mode == "pure")
-          return UseNNUEMode::Pure;
+  useNNUE = nnue_mode_from_option(Options["Use NNUE"]);
 
-        return UseNNUEMode::False;
-    }
+  if (Options["SkipLoadingEval"] || useNNUE == UseNNUEMode::False)
+  {
+    eval_file_loaded.clear();
+    return;
+  }
 
-    void init() {
-
-        useNNUE = nnue_mode_from_option(Options["Use NNUE"]);
-
-        if (Options["SkipLoadingEval"] || useNNUE == UseNNUEMode::False)
-        {
-            eval_file_loaded.clear();
-            return;
-        }
-
-        std::string eval_file = std::string(Options["EvalFile"]);
+  std::string eval_file = std::string(Options["EvalFile"]);
 
 #if defined(DEFAULT_NNUE_DIRECTORY)
 #define stringify2(x) #x
 #define stringify(x) stringify2(x)
-        std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory , stringify(DEFAULT_NNUE_DIRECTORY) };
+  std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory , stringify(DEFAULT_NNUE_DIRECTORY) };
 #else
-        std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory };
+  std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory };
 #endif
 
-        for (std::string directory : dirs)
-        {
-            if (eval_file_loaded != eval_file)
-            {
-                std::ifstream stream(directory + eval_file, std::ios::binary);
-                if (load_eval(eval_file, stream))
-                {
-                    sync_cout << "info string Loaded eval file " << directory + eval_file << sync_endl;
-                    eval_file_loaded = eval_file;
-                }
-                else
-                {
-                    sync_cout << "info string ERROR: failed to load eval file " << directory + eval_file << sync_endl;
-                    eval_file_loaded.clear();
-                }
-            }
-        }
+  for (std::string directory : dirs)
+  {
+    if (eval_file_loaded != eval_file)
+    {
+      std::ifstream stream(directory + eval_file, std::ios::binary);
+      if (load_eval(eval_file, stream))
+      {
+        sync_cout << "info string Loaded eval file " << directory + eval_file << sync_endl;
+        eval_file_loaded = eval_file;
+      }
+      else
+      {
+        sync_cout << "info string ERROR: failed to load eval file " << directory + eval_file << sync_endl;
+        eval_file_loaded.clear();
+      }
+    }
+  }
 
 #undef stringify2
 #undef stringify
-    }
+}
 
-    /// NNUE::verify() verifies that the last net used was loaded successfully
-    void verify_eval_file_loaded() {
+/// NNUE::verify_eval_file_loaded() verifies that the net named by the EvalFile option was loaded successfully
+void verify_eval_file_loaded() {
 
-        std::string eval_file = std::string(Options["EvalFile"]);
+  std::string eval_file = std::string(Options["EvalFile"]);
 
-        if (useNNUE != UseNNUEMode::False && eval_file_loaded != eval_file)
-        {
-            UCI::OptionsMap defaults;
-            UCI::init(defaults);
+  if (useNNUE != UseNNUEMode::False && eval_file_loaded != eval_file)
+  {
+    UCI::OptionsMap defaults;
+    UCI::init(defaults);
 
-            std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
-            std::string msg2 = "The option is set to true, but the network file " + eval_file + " was not loaded successfully.";
-            std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
-            std::string msg4 = "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/" + std::string(defaults["EvalFile"]);
-            std::string msg5 = "The engine will be terminated now.";
+    std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
+    std::string msg2 = "The option is set to true, but the network file " + eval_file + " was not loaded successfully.";
+    std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
+    std::string msg4 = "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/" + std::string(defaults["EvalFile"]);
+    std::string msg5 = "The engine will be terminated now.";
 
-            sync_cout << "info string ERROR: " << msg1 << sync_endl;
-            sync_cout << "info string ERROR: " << msg2 << sync_endl;
-            sync_cout << "info string ERROR: " << msg3 << sync_endl;
-            sync_cout << "info string ERROR: " << msg4 << sync_endl;
-            sync_cout << "info string ERROR: " << msg5 << sync_endl;
+    sync_cout << "info string ERROR: " << msg1 << sync_endl;
+    sync_cout << "info string ERROR: " << msg2 << sync_endl;
+    sync_cout << "info string ERROR: " << msg3 << sync_endl;
+    sync_cout << "info string ERROR: " << msg4 << sync_endl;
+    sync_cout << "info string ERROR: " << msg5 << sync_endl;
 
-            std::exit(EXIT_FAILURE);
-        }
+    std::exit(EXIT_FAILURE);
+  }
 
-        if (useNNUE != UseNNUEMode::False)
-            sync_cout << "info string NNUE evaluation using " << eval_file << " enabled" << sync_endl;
-        else
-            sync_cout << "info string classical evaluation enabled" << sync_endl;
-    }
+  if (useNNUE != UseNNUEMode::False)
+    sync_cout << "info string NNUE evaluation using " << eval_file << " enabled" << sync_endl;
+  else
+    sync_cout << "info string classical evaluation enabled" << sync_endl;
+}
 
-    /// In training we override eval file so this is useful.
-    void verify_any_net_loaded() {
+/// NNUE::verify_any_net_loaded() verifies that some net was loaded (not necessarily EvalFile); useful in training, where we override the eval file.
+void verify_any_net_loaded() {
 
-        if (!Options["SkipLoadingEval"] && useNNUE != UseNNUEMode::False && eval_file_loaded.empty())
-        {
-            UCI::OptionsMap defaults;
-            UCI::init(defaults);
+  if (!Options["SkipLoadingEval"] && useNNUE != UseNNUEMode::False && eval_file_loaded.empty())
+  {
+    UCI::OptionsMap defaults;
+    UCI::init(defaults);
 
-            std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
-            std::string msg2 = "The option is set to true, but the network file was not loaded successfully.";
-            std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
-            std::string msg5 = "The engine will be terminated now.";
+    std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
+    std::string msg2 = "The option is set to true, but the network file was not loaded successfully.";
+    std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
+    std::string msg5 = "The engine will be terminated now.";
 
-            sync_cout << "info string ERROR: " << msg1 << sync_endl;
-            sync_cout << "info string ERROR: " << msg2 << sync_endl;
-            sync_cout << "info string ERROR: " << msg3 << sync_endl;
-            sync_cout << "info string ERROR: " << msg5 << sync_endl;
+    sync_cout << "info string ERROR: " << msg1 << sync_endl;
+    sync_cout << "info string ERROR: " << msg2 << sync_endl;
+    sync_cout << "info string ERROR: " << msg3 << sync_endl;
+    sync_cout << "info string ERROR: " << msg5 << sync_endl;
 
-            std::exit(EXIT_FAILURE);
-        }
+    std::exit(EXIT_FAILURE);
+  }
 
-        if (useNNUE != UseNNUEMode::False)
-            sync_cout << "info string NNUE evaluation using " << eval_file_loaded << " enabled" << sync_endl;
-        else
-            sync_cout << "info string classical evaluation enabled" << sync_endl;
-    }
+  if (useNNUE != UseNNUEMode::False)
+    sync_cout << "info string NNUE evaluation using " << eval_file_loaded << " enabled" << sync_endl;
+  else
+    sync_cout << "info string classical evaluation enabled" << sync_endl;
+}
 
 } // namespace Eval::NNUE
diff --git a/src/nnue/evaluate_nnue.h b/src/nnue/evaluate_nnue.h
index b33969fc..a1051abe 100644
--- a/src/nnue/evaluate_nnue.h
+++ b/src/nnue/evaluate_nnue.h
@@ -1,21 +1,23 @@
 /*
-    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-    Stockfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
 
-    Stockfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
 
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
+// Header used in NNUE evaluation function
+
 #ifndef NNUE_EVALUATE_NNUE_H_INCLUDED
 #define NNUE_EVALUATE_NNUE_H_INCLUDED
 
@@ -25,84 +27,83 @@
 
 #include <memory>
 
-// header used in NNUE evaluation function
 namespace Eval::NNUE {
 
-    enum struct UseNNUEMode
-    {
-        False,
-        True,
-        Pure
-    };
+  enum struct UseNNUEMode
+  {
+    False,
+    True,
+    Pure
+  };
 
-    // Hash value of evaluation function structure
-    constexpr std::uint32_t kHashValue =
-        FeatureTransformer::GetHashValue() ^ Network::GetHashValue();
+  // Hash value of evaluation function structure
+  constexpr std::uint32_t kHashValue =
+      FeatureTransformer::GetHashValue() ^ Network::GetHashValue();
 
-    // Deleter for automating release of memory area
-    template <typename T>
-    struct AlignedDeleter {
-        void operator()(T* ptr) const {
-            ptr->~T();
-            std_aligned_free(ptr);
-        }
-    };
+  // Deleter for automating release of memory area
+  template <typename T>
+  struct AlignedDeleter {
+    void operator()(T* ptr) const {
+      ptr->~T();
+      std_aligned_free(ptr);
+    }
+  };
 
-    template <typename T>
-    struct LargePageDeleter {
-        void operator()(T* ptr) const {
-            ptr->~T();
-            aligned_large_pages_free(ptr);
-        }
-    };
+  template <typename T>
+  struct LargePageDeleter {
+    void operator()(T* ptr) const {
+      ptr->~T();
+      aligned_large_pages_free(ptr);
+    }
+  };
 
-    template <typename T>
-    using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>;
+  template <typename T>
+  using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>;
 
-    template <typename T>
-    using LargePagePtr = std::unique_ptr<T, LargePageDeleter<T>>;
+  template <typename T>
+  using LargePagePtr = std::unique_ptr<T, LargePageDeleter<T>>;
 
-    // Input feature converter
-    extern LargePagePtr<FeatureTransformer> feature_transformer;
+  // Input feature converter
+  extern LargePagePtr<FeatureTransformer> feature_transformer;
 
-    // Evaluation function
-    extern AlignedPtr<Network> network;
+  // Evaluation function
+  extern AlignedPtr<Network> network;
 
-    // Evaluation function file name
-    extern std::string fileName;
+  // Evaluation function file name
+  extern std::string fileName;
 
-    // Saved evaluation function file name
-    extern std::string savedfileName;
+  // Saved evaluation function file name
+  extern std::string savedfileName;
 
-    extern UseNNUEMode useNNUE;
+  extern UseNNUEMode useNNUE;
 
-    extern std::string eval_file_loaded;
+  extern std::string eval_file_loaded;
 
-    // Get a string that represents the structure of the evaluation function
-    std::string get_architecture_string();
+  // Get a string that represents the structure of the evaluation function
+  std::string get_architecture_string();
 
-    std::string get_layers_info();
+  std::string get_layers_info();
 
-    // read the header
-    bool read_header(std::istream& stream,
-        std::uint32_t* hash_value, std::string* architecture);
+  // Read the header (hash value and architecture string) from the stream
+  bool read_header(std::istream& stream,
+      std::uint32_t* hash_value, std::string* architecture);
 
-    // write the header
-    bool write_header(std::ostream& stream,
-        std::uint32_t hash_value, const std::string& architecture);
+  // Write the header (hash value and architecture string) to the stream
+  bool write_header(std::ostream& stream,
+      std::uint32_t hash_value, const std::string& architecture);
 
-    // read evaluation function parameters
-    bool ReadParameters(std::istream& stream);
+  // read evaluation function parameters
+  bool ReadParameters(std::istream& stream);
 
-    // write evaluation function parameters
-    bool WriteParameters(std::ostream& stream);
+  // write evaluation function parameters
+  bool WriteParameters(std::ostream& stream);
 
-    Value evaluate(const Position& pos);
-    bool load_eval(std::string name, std::istream& stream);
-    void init();
+  Value evaluate(const Position& pos);
+  bool load_eval(std::string name, std::istream& stream);
+  void init();
 
-    void verify_eval_file_loaded();
-    void verify_any_net_loaded();
+  void verify_eval_file_loaded();
+  void verify_any_net_loaded();
 
 }  // namespace Eval::NNUE
 
diff --git a/src/nnue/features/feature_set.h b/src/nnue/features/feature_set.h
index 32ef24ef..6602eae7 100644
--- a/src/nnue/features/feature_set.h
+++ b/src/nnue/features/feature_set.h
@@ -1,19 +1,19 @@
 /*
-    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-    Stockfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
 
-    Stockfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
 
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 // A class template that represents the input feature set of the NNUE evaluation function
@@ -22,7 +22,6 @@
 #define NNUE_FEATURE_SET_H_INCLUDED
 
 #include "features_common.h"
-
 #include <array>
 
 namespace Eval::NNUE::Features {
diff --git a/src/nnue/features/features_common.h b/src/nnue/features/features_common.h
index 671ceeb9..656502a3 100644
--- a/src/nnue/features/features_common.h
+++ b/src/nnue/features/features_common.h
@@ -1,19 +1,19 @@
 /*
-    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-    Stockfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
 
-    Stockfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
 
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 //Common header of input features of NNUE evaluation function
@@ -21,30 +21,29 @@
 #ifndef NNUE_FEATURES_COMMON_H_INCLUDED
 #define NNUE_FEATURES_COMMON_H_INCLUDED
 
-#include "evaluate.h"
-
-#include "nnue/nnue_common.h"
+#include "../../evaluate.h"
+#include "../nnue_common.h"
 
 namespace Eval::NNUE::Features {
 
-    class IndexList;
+  class IndexList;
 
-    template <typename... FeatureTypes>
-    class FeatureSet;
+  template <typename... FeatureTypes>
+  class FeatureSet;
 
-    // Trigger to perform full calculations instead of difference only
-    enum class TriggerEvent {
-        kNone, // Calculate the difference whenever possible
-        kFriendKingMoved, // calculate full evaluation when own king moves
-        kEnemyKingMoved, // calculate full evaluation when opponent king moves
-        kAnyKingMoved, // calculate full evaluation when any king moves
-        kAnyPieceMoved, // always calculate full evaluation
-    };
+  // Trigger to perform full calculations instead of difference only
+  enum class TriggerEvent {
+    kNone, // Calculate the difference whenever possible
+    kFriendKingMoved, // calculate full evaluation when own king moves
+    kEnemyKingMoved, // calculate full evaluation when opponent king moves
+    kAnyKingMoved, // calculate full evaluation when any king moves
+    kAnyPieceMoved, // always calculate full evaluation
+  };
 
-    enum class Side {
-        kFriend, // side to move
-        kEnemy, // opponent
-    };
+  enum class Side {
+    kFriend, // side to move
+    kEnemy, // opponent
+  };
 
 }  // namespace Eval::NNUE::Features
 
diff --git a/src/nnue/features/index_list.h b/src/nnue/features/index_list.h
index 6751b26c..d9ad680a 100644
--- a/src/nnue/features/index_list.h
+++ b/src/nnue/features/index_list.h
@@ -1,19 +1,19 @@
 /*
-    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-    Stockfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
 
-    Stockfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
 
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 // Definition of index list of input features
@@ -21,43 +21,43 @@
 #ifndef NNUE_FEATURES_INDEX_LIST_H_INCLUDED
 #define NNUE_FEATURES_INDEX_LIST_H_INCLUDED
 
-#include "position.h"
-
-#include "nnue/nnue_architecture.h"
+#include "../../position.h"
+#include "../nnue_architecture.h"
 
 namespace Eval::NNUE::Features {
 
-    // Class template used for feature index list
-    template <typename T, std::size_t MaxSize>
-    class ValueList {
+  // Class template used for feature index list
+  template <typename T, std::size_t MaxSize>
+  class ValueList {
 
-    public:
-        std::size_t size() const { return size_; }
-        void resize(std::size_t size) { size_ = size; }
-        void push_back(const T& value) { values_[size_++] = value; }
-        T& operator[](std::size_t index) { return values_[index]; }
-        T* begin() { return values_; }
-        T* end() { return values_ + size_; }
-        const T& operator[](std::size_t index) const { return values_[index]; }
-        const T* begin() const { return values_; }
-        const T* end() const { return values_ + size_; }
+   public:
+    std::size_t size() const { return size_; }
+    void resize(std::size_t size) { size_ = size; }
+    void push_back(const T& value) { values_[size_++] = value; }
+    T& operator[](std::size_t index) { return values_[index]; }
+    T* begin() { return values_; }
+    T* end() { return values_ + size_; }
+    const T& operator[](std::size_t index) const { return values_[index]; }
+    const T* begin() const { return values_; }
+    const T* end() const { return values_ + size_; }
 
-        void swap(ValueList& other) {
-            const std::size_t max_size = std::max(size_, other.size_);
-            for (std::size_t i = 0; i < max_size; ++i) {
-                std::swap(values_[i], other.values_[i]);
-            }
-            std::swap(size_, other.size_);
-        }
+    void swap(ValueList& other) {
+      const std::size_t max_size = std::max(size_, other.size_);
+      for (std::size_t i = 0; i < max_size; ++i) {
+        std::swap(values_[i], other.values_[i]);
+      }
+      std::swap(size_, other.size_);
+    }
 
-    private:
-        T values_[MaxSize] = {};
-        std::size_t size_ = 0;
-    };
+   private:
+    T values_[MaxSize];
+    std::size_t size_ = 0;
+  };
 
-    //Type of feature index list
-    class IndexList : public ValueList<IndexType, RawFeatures::kMaxActiveDimensions> {
-    };
+  // Type of feature index list
+  class IndexList
+      : public ValueList<IndexType, RawFeatures::kMaxActiveDimensions> {
+  };
 
 }  // namespace Eval::NNUE::Features
 
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index d290bc12..1f2ff7c5 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -223,13 +223,13 @@ namespace Eval::NNUE::Layers {
         return _mm512_add_epi32(_mm512_permutexvar_epi32(indices, x), bias);
       };
 
-      [[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) {
 #if defined (USE_VNNI)
+      [[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) {
         acc = _mm512_dpbusd_epi32(acc, a, b);
 #else
+      [[maybe_unused]] auto m512_dpbusd_epi32 = [=](__m512i a, __m512i b) -> __m512i {
         __m512i product0 = _mm512_maddubs_epi16(a, b);
-        product0 = _mm512_madd_epi16(product0, kOnes512);
-        acc = _mm512_add_epi32(acc, product0);
+        return _mm512_madd_epi16(product0, kOnes512);
 #endif
       };
 
@@ -256,14 +256,13 @@ namespace Eval::NNUE::Layers {
 
         return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias);
       };
-
-      [[maybe_unused]] auto m256_add_dpbusd_epi32 = [=](__m256i& acc, __m256i a, __m256i b) {
 #if defined (USE_VNNI)
+      [[maybe_unused]] auto m256_add_dpbusd_epi32 = [=](__m256i& acc, __m256i a, __m256i b) {
         acc = _mm256_dpbusd_epi32(acc, a, b);
 #else
+      [[maybe_unused]] auto m256_dpbusd_epi32 = [=](__m256i a, __m256i b) -> __m256i {
         __m256i product0 = _mm256_maddubs_epi16(a, b);
-        product0 = _mm256_madd_epi16(product0, kOnes256);
-        acc = _mm256_add_epi32(acc, product0);
+        return _mm256_madd_epi16(product0, kOnes256);
 #endif
       };
 
@@ -288,10 +287,9 @@ namespace Eval::NNUE::Layers {
         return _mm_add_epi32(sum0, bias);
       };
 
-      [[maybe_unused]] auto m128_add_dpbusd_epi32 = [=](__m128i& acc, __m128i a, __m128i b) {
+      [[maybe_unused]] auto m128_dpbusd_epi32 = [=](__m128i a, __m128i b) -> __m128i {
         __m128i product0 = _mm_maddubs_epi16(a, b);
-        product0 = _mm_madd_epi16(product0, kOnes128);
-        acc = _mm_add_epi32(acc, product0);
+        return _mm_madd_epi16(product0, kOnes128);
       };
 
 #endif
@@ -335,15 +333,6 @@ namespace Eval::NNUE::Layers {
           const __m512i bias = *reinterpret_cast<const __m512i*>(&biases_[i]);
           __m512i* outptr = reinterpret_cast<__m512i*>(&output[i]);
 
-          __m512i sum01a = _mm512_setzero_si512();
-          __m512i sum23a = _mm512_setzero_si512();
-          __m512i sum45a = _mm512_setzero_si512();
-          __m512i sum67a = _mm512_setzero_si512();
-          __m512i sum01b = _mm512_setzero_si512();
-          __m512i sum23b = _mm512_setzero_si512();
-          __m512i sum45b = _mm512_setzero_si512();
-          __m512i sum67b = _mm512_setzero_si512();
-
           const auto row01a = *reinterpret_cast<const __m512i*>(&weights_[offset01a]);
           const auto row23a = *reinterpret_cast<const __m512i*>(&weights_[offset23a]);
           const auto row45a = *reinterpret_cast<const __m512i*>(&weights_[offset45a]);
@@ -356,6 +345,16 @@ namespace Eval::NNUE::Layers {
           const __m256i in256 = input_vector256[0];
           const __m512i in = _mm512_inserti64x4(_mm512_castsi256_si512(in256), in256, 1);
 
+#if defined (USE_VNNI)
+          __m512i sum01a = _mm512_setzero_si512();
+          __m512i sum23a = _mm512_setzero_si512();
+          __m512i sum45a = _mm512_setzero_si512();
+          __m512i sum67a = _mm512_setzero_si512();
+          __m512i sum01b = _mm512_setzero_si512();
+          __m512i sum23b = _mm512_setzero_si512();
+          __m512i sum45b = _mm512_setzero_si512();
+          __m512i sum67b = _mm512_setzero_si512();
+
           m512_add_dpbusd_epi32(sum01a, in, row01a);
           m512_add_dpbusd_epi32(sum23a, in, row23a);
           m512_add_dpbusd_epi32(sum45a, in, row45a);
@@ -364,6 +363,16 @@ namespace Eval::NNUE::Layers {
           m512_add_dpbusd_epi32(sum23b, in, row23b);
           m512_add_dpbusd_epi32(sum45b, in, row45b);
           m512_add_dpbusd_epi32(sum67b, in, row67b);
+#else
+          __m512i sum01a = m512_dpbusd_epi32(in, row01a);
+          __m512i sum23a = m512_dpbusd_epi32(in, row23a);
+          __m512i sum45a = m512_dpbusd_epi32(in, row45a);
+          __m512i sum67a = m512_dpbusd_epi32(in, row67a);
+          __m512i sum01b = m512_dpbusd_epi32(in, row01b);
+          __m512i sum23b = m512_dpbusd_epi32(in, row23b);
+          __m512i sum45b = m512_dpbusd_epi32(in, row45b);
+          __m512i sum67b = m512_dpbusd_epi32(in, row67b);
+#endif
 
           *outptr = m512_hadd256x16(
             sum01a, sum23a, sum45a, sum67a,
@@ -384,48 +393,80 @@ namespace Eval::NNUE::Layers {
 
           if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0)
           {
-            __m512i sum0 = _mm512_setzero_si512();
-            __m512i sum1 = _mm512_setzero_si512();
-            __m512i sum2 = _mm512_setzero_si512();
-            __m512i sum3 = _mm512_setzero_si512();
-
             const auto row0 = reinterpret_cast<const __m512i*>(&weights_[offset0]);
             const auto row1 = reinterpret_cast<const __m512i*>(&weights_[offset1]);
             const auto row2 = reinterpret_cast<const __m512i*>(&weights_[offset2]);
             const auto row3 = reinterpret_cast<const __m512i*>(&weights_[offset3]);
 
-            for (IndexType j = 0; j < kNumChunks512; ++j)
+#if defined (USE_VNNI)
+            __m512i sum0 = _mm512_setzero_si512();
+            __m512i sum1 = _mm512_setzero_si512();
+            __m512i sum2 = _mm512_setzero_si512();
+            __m512i sum3 = _mm512_setzero_si512();
+            const IndexType kStart = 0;
+#else
+            __m512i sum0 = m512_dpbusd_epi32(input_vector512[0], row0[0]);
+            __m512i sum1 = m512_dpbusd_epi32(input_vector512[0], row1[0]);
+            __m512i sum2 = m512_dpbusd_epi32(input_vector512[0], row2[0]);
+            __m512i sum3 = m512_dpbusd_epi32(input_vector512[0], row3[0]);
+            const IndexType kStart = 1;
+#endif
+
+            for (IndexType j = kStart; j < kNumChunks512; ++j)
             {
               const __m512i in = input_vector512[j];
 
+#if defined (USE_VNNI)
               m512_add_dpbusd_epi32(sum0, in, row0[j]);
               m512_add_dpbusd_epi32(sum1, in, row1[j]);
               m512_add_dpbusd_epi32(sum2, in, row2[j]);
               m512_add_dpbusd_epi32(sum3, in, row3[j]);
+#else
+              sum0 = _mm512_add_epi32(sum0, m512_dpbusd_epi32(in, row0[j]));
+              sum1 = _mm512_add_epi32(sum1, m512_dpbusd_epi32(in, row1[j]));
+              sum2 = _mm512_add_epi32(sum2, m512_dpbusd_epi32(in, row2[j]));
+              sum3 = _mm512_add_epi32(sum3, m512_dpbusd_epi32(in, row3[j]));
+#endif
             }
 
             *outptr = m512_haddx4(sum0, sum1, sum2, sum3, bias);
           }
           else
           {
-            __m256i sum0 = _mm256_setzero_si256();
-            __m256i sum1 = _mm256_setzero_si256();
-            __m256i sum2 = _mm256_setzero_si256();
-            __m256i sum3 = _mm256_setzero_si256();
-
             const auto row0 = reinterpret_cast<const __m256i*>(&weights_[offset0]);
             const auto row1 = reinterpret_cast<const __m256i*>(&weights_[offset1]);
             const auto row2 = reinterpret_cast<const __m256i*>(&weights_[offset2]);
             const auto row3 = reinterpret_cast<const __m256i*>(&weights_[offset3]);
 
-            for (IndexType j = 0; j < kNumChunks256; ++j)
+#if defined (USE_VNNI)
+            __m256i sum0 = _mm256_setzero_si256();
+            __m256i sum1 = _mm256_setzero_si256();
+            __m256i sum2 = _mm256_setzero_si256();
+            __m256i sum3 = _mm256_setzero_si256();
+            const IndexType kStart = 0;
+#else
+            __m256i sum0 = m256_dpbusd_epi32(input_vector256[0], row0[0]);
+            __m256i sum1 = m256_dpbusd_epi32(input_vector256[0], row1[0]);
+            __m256i sum2 = m256_dpbusd_epi32(input_vector256[0], row2[0]);
+            __m256i sum3 = m256_dpbusd_epi32(input_vector256[0], row3[0]);
+            const IndexType kStart = 1;
+#endif
+
+            for (IndexType j = kStart; j < kNumChunks256; ++j)
             {
               const __m256i in = input_vector256[j];
 
+#if defined (USE_VNNI)
               m256_add_dpbusd_epi32(sum0, in, row0[j]);
               m256_add_dpbusd_epi32(sum1, in, row1[j]);
               m256_add_dpbusd_epi32(sum2, in, row2[j]);
               m256_add_dpbusd_epi32(sum3, in, row3[j]);
+#else
+              sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
+              sum1 = _mm256_add_epi32(sum1, m256_dpbusd_epi32(in, row1[j]));
+              sum2 = _mm256_add_epi32(sum2, m256_dpbusd_epi32(in, row2[j]));
+              sum3 = _mm256_add_epi32(sum3, m256_dpbusd_epi32(in, row3[j]));
+#endif
             }
 
             *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias);
@@ -436,30 +477,50 @@ namespace Eval::NNUE::Layers {
       {
         if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0)
         {
-          __m512i sum0 = _mm512_setzero_si512();
-
           const auto row0 = reinterpret_cast<const __m512i*>(&weights_[0]);
 
-          for (IndexType j = 0; j < kNumChunks512; ++j)
+#if defined (USE_VNNI)
+          __m512i sum0 = _mm512_setzero_si512();
+          const IndexType kStart = 0;
+#else
+          __m512i sum0 = m512_dpbusd_epi32(input_vector512[0], row0[0]);
+          const IndexType kStart = 1;
+#endif
+
+          for (IndexType j = kStart; j < kNumChunks512; ++j)
           {
             const __m512i in = input_vector512[j];
 
+#if defined (USE_VNNI)
             m512_add_dpbusd_epi32(sum0, in, row0[j]);
+#else
+            sum0 = _mm512_add_epi32(sum0, m512_dpbusd_epi32(in, row0[j]));
+#endif
           }
 
           output[0] = m512_hadd(sum0, biases_[0]);
         }
         else
         {
-          __m256i sum0 = _mm256_setzero_si256();
-
           const auto row0 = reinterpret_cast<const __m256i*>(&weights_[0]);
 
-          for (IndexType j = 0; j < kNumChunks256; ++j)
+#if defined (USE_VNNI)
+          __m256i sum0 = _mm256_setzero_si256();
+          const IndexType kStart = 0;
+#else
+          __m256i sum0 = m256_dpbusd_epi32(input_vector256[0], row0[0]);
+          const IndexType kStart = 1;
+#endif
+
+          for (IndexType j = kStart; j < kNumChunks256; ++j)
           {
             const __m256i in = input_vector256[j];
 
+#if defined (USE_VNNI)
             m256_add_dpbusd_epi32(sum0, in, row0[j]);
+#else
+            sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
+#endif
           }
 
           output[0] = m256_hadd(sum0, biases_[0]);
@@ -493,24 +554,40 @@ namespace Eval::NNUE::Layers {
           const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]);
           __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]);
 
-          __m256i sum0 = _mm256_setzero_si256();
-          __m256i sum1 = _mm256_setzero_si256();
-          __m256i sum2 = _mm256_setzero_si256();
-          __m256i sum3 = _mm256_setzero_si256();
-
           const auto row0 = reinterpret_cast<const __m256i*>(&weights_[offset0]);
           const auto row1 = reinterpret_cast<const __m256i*>(&weights_[offset1]);
           const auto row2 = reinterpret_cast<const __m256i*>(&weights_[offset2]);
           const auto row3 = reinterpret_cast<const __m256i*>(&weights_[offset3]);
 
-          for (IndexType j = 0; j < kNumChunks; ++j)
+#if defined (USE_VNNI)
+          __m256i sum0 = _mm256_setzero_si256();
+          __m256i sum1 = _mm256_setzero_si256();
+          __m256i sum2 = _mm256_setzero_si256();
+          __m256i sum3 = _mm256_setzero_si256();
+          const IndexType kStart = 0;
+#else
+          __m256i sum0 = m256_dpbusd_epi32(input_vector[0], row0[0]);
+          __m256i sum1 = m256_dpbusd_epi32(input_vector[0], row1[0]);
+          __m256i sum2 = m256_dpbusd_epi32(input_vector[0], row2[0]);
+          __m256i sum3 = m256_dpbusd_epi32(input_vector[0], row3[0]);
+          const IndexType kStart = 1;
+#endif
+
+          for (IndexType j = kStart; j < kNumChunks; ++j)
           {
             const __m256i in = input_vector[j];
 
+#if defined (USE_VNNI)
             m256_add_dpbusd_epi32(sum0, in, row0[j]);
             m256_add_dpbusd_epi32(sum1, in, row1[j]);
             m256_add_dpbusd_epi32(sum2, in, row2[j]);
             m256_add_dpbusd_epi32(sum3, in, row3[j]);
+#else
+            sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
+            sum1 = _mm256_add_epi32(sum1, m256_dpbusd_epi32(in, row1[j]));
+            sum2 = _mm256_add_epi32(sum2, m256_dpbusd_epi32(in, row2[j]));
+            sum3 = _mm256_add_epi32(sum3, m256_dpbusd_epi32(in, row3[j]));
+#endif
           }
 
           *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias);
@@ -518,15 +595,25 @@ namespace Eval::NNUE::Layers {
       }
       else if constexpr (kOutputDimensions == 1)
       {
-        __m256i sum0 = _mm256_setzero_si256();
-
         const auto row0 = reinterpret_cast<const __m256i*>(&weights_[0]);
 
-        for (IndexType j = 0; j < kNumChunks; ++j)
+#if defined (USE_VNNI)
+        __m256i sum0 = _mm256_setzero_si256();
+        const IndexType kStart = 0;
+#else
+        __m256i sum0 = m256_dpbusd_epi32(input_vector[0], row0[0]);
+        const IndexType kStart = 1;
+#endif
+
+        for (IndexType j = kStart; j < kNumChunks; ++j)
         {
           const __m256i in = input_vector[j];
 
-            m256_add_dpbusd_epi32(sum0, in, row0[j]);
+#if defined (USE_VNNI)
+          m256_add_dpbusd_epi32(sum0, in, row0[j]);
+#else
+          sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
+#endif
         }
 
         output[0] = m256_hadd(sum0, biases_[0]);
@@ -559,24 +646,24 @@ namespace Eval::NNUE::Layers {
           const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]);
           __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]);
 
-          __m128i sum0 = _mm_setzero_si128();
-          __m128i sum1 = _mm_setzero_si128();
-          __m128i sum2 = _mm_setzero_si128();
-          __m128i sum3 = _mm_setzero_si128();
-
           const auto row0 = reinterpret_cast<const __m128i*>(&weights_[offset0]);
           const auto row1 = reinterpret_cast<const __m128i*>(&weights_[offset1]);
           const auto row2 = reinterpret_cast<const __m128i*>(&weights_[offset2]);
           const auto row3 = reinterpret_cast<const __m128i*>(&weights_[offset3]);
 
-          for (int j = 0; j < (int)kNumChunks; j += 1)
+          __m128i sum0 = m128_dpbusd_epi32(input_vector[0], row0[0]);
+          __m128i sum1 = m128_dpbusd_epi32(input_vector[0], row1[0]);
+          __m128i sum2 = m128_dpbusd_epi32(input_vector[0], row2[0]);
+          __m128i sum3 = m128_dpbusd_epi32(input_vector[0], row3[0]);
+
+          for (int j = 1; j < (int)kNumChunks; ++j)
           {
             const __m128i in = input_vector[j];
 
-            m128_add_dpbusd_epi32(sum0, in, row0[j]);
-            m128_add_dpbusd_epi32(sum1, in, row1[j]);
-            m128_add_dpbusd_epi32(sum2, in, row2[j]);
-            m128_add_dpbusd_epi32(sum3, in, row3[j]);
+            sum0 = _mm_add_epi32(sum0, m128_dpbusd_epi32(in, row0[j]));
+            sum1 = _mm_add_epi32(sum1, m128_dpbusd_epi32(in, row1[j]));
+            sum2 = _mm_add_epi32(sum2, m128_dpbusd_epi32(in, row2[j]));
+            sum3 = _mm_add_epi32(sum3, m128_dpbusd_epi32(in, row3[j]));
           }
 
           *outptr = m128_haddx4(sum0, sum1, sum2, sum3, bias);
@@ -584,16 +671,12 @@ namespace Eval::NNUE::Layers {
       }
       else if constexpr (kOutputDimensions == 1)
       {
-        __m128i sum0 = _mm_setzero_si128();
-
         const auto row0 = reinterpret_cast<const __m128i*>(&weights_[0]);
 
-        for (int j = 0; j < (int)kNumChunks; j += 1)
-        {
-          const __m128i in = input_vector[j];
+        __m128i sum0 = m128_dpbusd_epi32(input_vector[0], row0[0]);
 
-          m128_add_dpbusd_epi32(sum0, in, row0[j]);
-        }
+        for (int j = 1; j < (int)kNumChunks; ++j)
+          sum0 = _mm_add_epi32(sum0, m128_dpbusd_epi32(input_vector[j], row0[j]));
 
         output[0] = m128_hadd(sum0, biases_[0]);
       }
diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h
index 8b60dafc..3d2f5bb4 100644
--- a/src/nnue/nnue_accumulator.h
+++ b/src/nnue/nnue_accumulator.h
@@ -1,34 +1,35 @@
 /*
-    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-    Stockfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
 
-    Stockfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
 
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
+// Class for difference calculation of NNUE evaluation function
+
 #ifndef NNUE_ACCUMULATOR_H_INCLUDED
 #define NNUE_ACCUMULATOR_H_INCLUDED
 
 #include "nnue_architecture.h"
 
-// Class for difference calculation of NNUE evaluation function
 namespace Eval::NNUE {
 
-    // Class that holds the result of affine transformation of input features
-    struct alignas(kCacheLineSize) Accumulator {
-        std::int16_t accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];
-        bool computed_accumulation;
-    };
+  // Holds the result of the affine transformation of the input features (storage is aligned to kCacheLineSize)
+  struct alignas(kCacheLineSize) Accumulator {
+      std::int16_t accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];  // 16-bit sums, indexed [perspective][refresh trigger][dimension]
+      bool computed_accumulation;  // NOTE(review): presumably true once `accumulation` is valid for the position - confirm against the code that sets it
+  };
 
 }  // namespace Eval::NNUE
 
diff --git a/src/nnue/nnue_architecture.h b/src/nnue/nnue_architecture.h
index 2ecb6999..91cdc4bd 100644
--- a/src/nnue/nnue_architecture.h
+++ b/src/nnue/nnue_architecture.h
@@ -1,36 +1,37 @@
 /*
-    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-    Stockfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
 
-    Stockfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
 
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
+// Input features and network structure used in NNUE evaluation function
+
 #ifndef NNUE_ARCHITECTURE_H_INCLUDED
 #define NNUE_ARCHITECTURE_H_INCLUDED
 
 // Defines the network structure
 #include "architectures/halfkp_256x2-32-32.h"
 
-// Input features and network structure used in NNUE evaluation function
 namespace Eval::NNUE {
 
-    static_assert(kTransformedFeatureDimensions % kMaxSimdWidth == 0, "");
-    static_assert(Network::kOutputDimensions == 1, "");
-    static_assert(std::is_same<Network::OutputType, std::int32_t>::value, "");
+  static_assert(kTransformedFeatureDimensions % kMaxSimdWidth == 0, "");  // transformed feature width must be a multiple of kMaxSimdWidth
+  static_assert(Network::kOutputDimensions == 1, "");  // the network must produce exactly one output value
+  static_assert(std::is_same<Network::OutputType, std::int32_t>::value, "");  // ... and that output must be a std::int32_t
 
-    // Trigger for full calculation instead of difference calculation
-    constexpr auto kRefreshTriggers = RawFeatures::kRefreshTriggers;
+  // Triggers for a full calculation instead of a difference calculation (taken from the raw feature set)
+  constexpr auto kRefreshTriggers = RawFeatures::kRefreshTriggers;
 
 }  // namespace Eval::NNUE
 
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 5a52e0cb..8c17c959 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -1,19 +1,19 @@
 /*
-    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-    Stockfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
 
-    Stockfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
 
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 // A class that converts the input features of the NNUE evaluation function
@@ -23,7 +23,6 @@
 
 #include "nnue_common.h"
 #include "nnue_architecture.h"
-
 #include "features/index_list.h"
 
 #include <cstring>
@@ -31,456 +30,486 @@
 
 namespace Eval::NNUE {
 
-    // If vector instructions are enabled, we update and refresh the
-    // accumulator tile by tile such that each tile fits in the CPU's
-    // vector registers.
-#define TILING
+  // VECTOR stays defined only when one of the SIMD back ends below is selected
+  // (the fallback branch #undef's it). With VECTOR, the accumulator is updated and
+  // refreshed tile by tile such that each tile fits in the CPU's vector registers.
+  #define VECTOR
 
-#ifdef USE_AVX512
-    typedef __m512i vec_t;
-#define vec_load(a) _mm512_load_si512(a)
-#define vec_store(a,b) _mm512_store_si512(a,b)
-#define vec_add_16(a,b) _mm512_add_epi16(a,b)
-#define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
-#define vec_zero _mm512_setzero_si512()
-    static constexpr IndexType kNumRegs = 8; // only 8 are needed
+  #ifdef USE_AVX512
+  typedef __m512i vec_t;  // 512-bit integer register type used for the accumulator tiles
+  #define vec_load(a) _mm512_load_si512(a)
+  #define vec_store(a,b) _mm512_store_si512(a,b)
+  #define vec_add_16(a,b) _mm512_add_epi16(a,b)
+  #define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
+  #define vec_zero _mm512_setzero_si512()
+  static constexpr IndexType kNumRegs = 8; // number of vec_t values held per tile; only 8 are needed
 
-#elif USE_AVX2
-    typedef __m256i vec_t;
-#define vec_load(a) _mm256_load_si256(a)
-#define vec_store(a,b) _mm256_store_si256(a,b)
-#define vec_add_16(a,b) _mm256_add_epi16(a,b)
-#define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
-#define vec_zero _mm256_setzero_si256()
-    static constexpr IndexType kNumRegs = 16;
-
-#elif USE_SSE2
-    typedef __m128i vec_t;
-#define vec_load(a) (*(a))
-#define vec_store(a,b) *(a)=(b)
-#define vec_add_16(a,b) _mm_add_epi16(a,b)
-#define vec_sub_16(a,b) _mm_sub_epi16(a,b)
-#define vec_zero _mm_setzero_si128()
-    static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8;
-
-#elif USE_MMX
-    typedef __m64 vec_t;
-#define vec_load(a) (*(a))
-#define vec_store(a,b) *(a)=(b)
-#define vec_add_16(a,b) _mm_add_pi16(a,b)
-#define vec_sub_16(a,b) _mm_sub_pi16(a,b)
-#define vec_zero _mm_setzero_si64()
-    static constexpr IndexType kNumRegs = 8;
-
-#elif USE_NEON
-    typedef int16x8_t vec_t;
-#define vec_load(a) (*(a))
-#define vec_store(a,b) *(a)=(b)
-#define vec_add_16(a,b) vaddq_s16(a,b)
-#define vec_sub_16(a,b) vsubq_s16(a,b)
-#define vec_zero {0}
+  #elif defined(USE_AVX2)
+  typedef __m256i vec_t;  // 256-bit integer register type used for the accumulator tiles
+  #define vec_load(a) _mm256_load_si256(a)
+  #define vec_store(a,b) _mm256_store_si256(a,b)
+  #define vec_add_16(a,b) _mm256_add_epi16(a,b)
+  #define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
+  #define vec_zero _mm256_setzero_si256()
   static constexpr IndexType kNumRegs = 16;
 
+  #elif defined(USE_SSE2)
+  typedef __m128i vec_t;  // 128-bit integer register type used for the accumulator tiles
+  #define vec_load(a) (*(a))
+  #define vec_store(a,b) *(a)=(b)
+  #define vec_add_16(a,b) _mm_add_epi16(a,b)
+  #define vec_sub_16(a,b) _mm_sub_epi16(a,b)
+  #define vec_zero _mm_setzero_si128()
+  static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8;  // vec_t values held per tile: 16 when Is64Bit, otherwise 8
+
+  #elif defined(USE_MMX)
+  typedef __m64 vec_t;  // 64-bit MMX register type used for the accumulator tiles
+  #define vec_load(a) (*(a))
+  #define vec_store(a,b) *(a)=(b)
+  #define vec_add_16(a,b) _mm_add_pi16(a,b)
+  #define vec_sub_16(a,b) _mm_sub_pi16(a,b)
+  #define vec_zero _mm_setzero_si64()
+  static constexpr IndexType kNumRegs = 8;  // number of vec_t values held per tile
+
+  #elif defined(USE_NEON)
+  typedef int16x8_t vec_t;  // NEON vector of eight 16-bit lanes used for the accumulator tiles
+  #define vec_load(a) (*(a))
+  #define vec_store(a,b) *(a)=(b)
+  #define vec_add_16(a,b) vaddq_s16(a,b)
+  #define vec_sub_16(a,b) vsubq_s16(a,b)
+  #define vec_zero {0}
+  static constexpr IndexType kNumRegs = 16;  // number of vec_t values held per tile
+
+  #else
+  #undef VECTOR  // no SIMD back end selected: the non-VECTOR code paths are compiled instead
+
+  #endif
+
+  // Input feature converter
+  class FeatureTransformer {
+
+   private:
+    // Number of output dimensions for one side (one perspective)
+    static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;
+
+    #ifdef VECTOR
+    static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2;  // number of 2-byte elements that fit in kNumRegs values of vec_t
+    static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions");  // tiles must cover a half exactly
+    #endif
+
+   public:
+    // Element type of the values written by Transform()
+    using OutputType = TransformedFeatureType;
+
+    // Number of input/output dimensions
+    static constexpr IndexType kInputDimensions = RawFeatures::kDimensions;
+    static constexpr IndexType kOutputDimensions = kHalfDimensions * 2;  // two halves of kHalfDimensions each
+
+    // Size in bytes of the forward propagation buffer
+    static constexpr std::size_t kBufferSize =
+        kOutputDimensions * sizeof(OutputType);
+
+    static constexpr int kLayerIndex = 0;  // index reported by get_layers_info()
+
+    // Hash value embedded in the evaluation file: RawFeatures::kHashValue XOR kOutputDimensions
+    static constexpr std::uint32_t GetHashValue() {
+
+      return RawFeatures::kHashValue ^ kOutputDimensions;
+    }
+
+    static std::string get_name() {  // returns "<RawFeatures name>[<kInputDimensions>-><kHalfDimensions>x2]"
+      return RawFeatures::get_name() + "[" +
+          std::to_string(kInputDimensions) + "->" +
+          std::to_string(kHalfDimensions) + "x2]";
+    }
+
+    // Returns a string representing the structure; currently the same text as get_name()
+    static std::string get_structure_string() {
+      return get_name();
+    }
+
+    static std::string get_layers_info() {  // returns "  - <kLayerIndex> - <get_name()>"
+      std::string info = "  - ";
+      info += std::to_string(kLayerIndex);
+      info += " - ";
+      info += get_name();
+      return info;
+    }
+
+    // Read network parameters from `stream`: first the biases, then the weights; returns false if the stream has failed
+    bool ReadParameters(std::istream& stream) {
+
+      for (std::size_t i = 0; i < kHalfDimensions; ++i)
+        biases_[i] = read_little_endian<BiasType>(stream);  // one bias per output dimension of a half
+      for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i)
+        weights_[i] = read_little_endian<WeightType>(stream);  // kHalfDimensions weights for each input feature
+      return !stream.fail();
+    }
+
+    // Write network parameters to `stream`: first the biases, then the weights; returns false if the stream has failed
+    bool WriteParameters(std::ostream& stream) const {
+      stream.write(reinterpret_cast<const char*>(biases_),  // raw in-memory bytes of the bias array
+          kHalfDimensions * sizeof(BiasType));
+
+      stream.write(reinterpret_cast<const char*>(weights_),  // NOTE(review): written in host byte order, while ReadParameters uses read_little_endian - confirm for big-endian targets
+          kHalfDimensions * kInputDimensions * sizeof(WeightType));
+
+      return !stream.fail();
+    }
+
+    // Proceed with the difference calculation if possible; returns false when neither the current nor the previous state has a computed accumulator
+    bool update_accumulator_if_possible(const Position& pos) const {
+
+      const auto now = pos.state();
+      if (now->accumulator.computed_accumulation)  // already computed for this state: nothing to do
+        return true;
+
+      const auto prev = now->previous;
+      if (prev && prev->accumulator.computed_accumulation) {  // a previous state exists and its accumulator is computed
+        update_accumulator(pos);
+        return true;
+      }
+
+      return false;  // the caller has to fall back to a full refresh (see Transform)
+    }
+
+    // Convert input features: writes 2 * kHalfDimensions values to `output` (side to move first, then the other side), each the sum over all refresh triggers clamped to [0, 127]
+    void Transform(const Position& pos, OutputType* output) const {
+
+      if (!update_accumulator_if_possible(pos))  // no usable accumulator: recompute it from scratch
+        refresh_accumulator(pos);
+
+      const auto& accumulation = pos.state()->accumulator.accumulation;
+
+  #if defined(USE_AVX512)
+      constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth * 2);
+      static_assert(kHalfDimensions % (kSimdWidth * 2) == 0);
+      const __m512i kControl = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);  // 64-bit element order applied after the pack below
+      const __m512i kZero = _mm512_setzero_si512();
+
+  #elif defined(USE_AVX2)
+      constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+      constexpr int kControl = 0b11011000;  // 64-bit element order (0, 2, 1, 3) applied after the pack below
+      const __m256i kZero = _mm256_setzero_si256();
+
+  #elif defined(USE_SSE2)
+      constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+
+  #ifdef USE_SSE41
+      const __m128i kZero = _mm_setzero_si128();
+  #else
+      const __m128i k0x80s = _mm_set1_epi8(-128);  // used with saturating add/sub below in place of _mm_max_epi8
+  #endif
+
+  #elif defined(USE_MMX)
+      constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+      const __m64 k0x80s = _mm_set1_pi8(-128);  // used with saturating add/sub below to clamp negatives to zero
+
+  #elif defined(USE_NEON)
+      constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+      const int8x8_t kZero = {0};
+  #endif
+
+      const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};  // side to move first, then the other side
+      for (IndexType p = 0; p < 2; ++p) {
+        const IndexType offset = kHalfDimensions * p;  // start of this perspective's half in `output`
+
+  #if defined(USE_AVX512)
+        auto out = reinterpret_cast<__m512i*>(&output[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m512i sum0 = _mm512_load_si512(
+              &reinterpret_cast<const __m512i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
+          __m512i sum1 = _mm512_load_si512(
+              &reinterpret_cast<const __m512i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
+          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {  // add the contributions of the remaining triggers
+              sum0 = _mm512_add_epi16(sum0, reinterpret_cast<const __m512i*>(
+                  accumulation[perspectives[p]][i])[j * 2 + 0]);
+              sum1 = _mm512_add_epi16(sum1, reinterpret_cast<const __m512i*>(
+                  accumulation[perspectives[p]][i])[j * 2 + 1]);
+          }
+
+          _mm512_store_si512(&out[j], _mm512_permutexvar_epi64(kControl,  // pack to saturated 8-bit, clamp at zero, then reorder by kControl
+              _mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), kZero)));
+        }
+
+  #elif defined(USE_AVX2)
+        auto out = reinterpret_cast<__m256i*>(&output[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m256i sum0 = _mm256_load_si256(
+              &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
+          __m256i sum1 = _mm256_load_si256(
+              &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
+          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {  // add the contributions of the remaining triggers
+              sum0 = _mm256_add_epi16(sum0, reinterpret_cast<const __m256i*>(
+                  accumulation[perspectives[p]][i])[j * 2 + 0]);
+              sum1 = _mm256_add_epi16(sum1, reinterpret_cast<const __m256i*>(
+                  accumulation[perspectives[p]][i])[j * 2 + 1]);
+          }
+
+          _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(  // pack to saturated 8-bit, clamp at zero, then reorder by kControl
+              _mm256_packs_epi16(sum0, sum1), kZero), kControl));
+        }
+
+  #elif defined(USE_SSE2)
+        auto out = reinterpret_cast<__m128i*>(&output[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
+              accumulation[perspectives[p]][0])[j * 2 + 0]);
+          __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
+              accumulation[perspectives[p]][0])[j * 2 + 1]);
+          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {  // add the contributions of the remaining triggers
+            sum0 = _mm_add_epi16(sum0, reinterpret_cast<const __m128i*>(
+                accumulation[perspectives[p]][i])[j * 2 + 0]);
+            sum1 = _mm_add_epi16(sum1, reinterpret_cast<const __m128i*>(
+                accumulation[perspectives[p]][i])[j * 2 + 1]);
+          }
+
+      const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);  // saturating narrow of both halves to signed 8-bit
+
+          _mm_store_si128(&out[j],
+
+  #ifdef USE_SSE41
+              _mm_max_epi8(packedbytes, kZero)
+  #else
+              _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
+  #endif
+
+          );
+        }
+
+  #elif defined(USE_MMX)
+        auto out = reinterpret_cast<__m64*>(&output[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m64 sum0 = *(&reinterpret_cast<const __m64*>(
+              accumulation[perspectives[p]][0])[j * 2 + 0]);
+          __m64 sum1 = *(&reinterpret_cast<const __m64*>(
+              accumulation[perspectives[p]][0])[j * 2 + 1]);
+          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {  // add the contributions of the remaining triggers
+              sum0 = _mm_add_pi16(sum0, reinterpret_cast<const __m64*>(
+                  accumulation[perspectives[p]][i])[j * 2 + 0]);
+              sum1 = _mm_add_pi16(sum1, reinterpret_cast<const __m64*>(
+                  accumulation[perspectives[p]][i])[j * 2 + 1]);
+          }
+
+          const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);  // saturating narrow of both halves to signed 8-bit
+          out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);  // saturating add then subtract of -128 turns negatives into 0
+        }
+
+  #elif defined(USE_NEON)
+        const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          int16x8_t sum = reinterpret_cast<const int16x8_t*>(
+              accumulation[perspectives[p]][0])[j];
+
+          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {  // add the contributions of the remaining triggers
+              sum = vaddq_s16(sum, reinterpret_cast<const int16x8_t*>(
+                  accumulation[perspectives[p]][i])[j]);
+          }
+
+          out[j] = vmax_s8(vqmovn_s16(sum), kZero);  // saturating narrow to 8-bit, then clamp at zero
+        }
+
+  #else
+        for (IndexType j = 0; j < kHalfDimensions; ++j) {  // scalar fallback: one output element per iteration
+          BiasType sum = accumulation[static_cast<int>(perspectives[p])][0][j];
+          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+              sum += accumulation[static_cast<int>(perspectives[p])][i][j];
+          }
+
+          output[offset + j] = static_cast<OutputType>(
+              std::max<int>(0, std::min<int>(127, sum)));  // clamp to [0, 127]
+        }
+  #endif
+
+      }
+  #if defined(USE_MMX)
+      _mm_empty();  // leave MMX state so that x87 floating-point code can run afterwards
+  #endif
+    }
+
+   private:
+    // Calculate cumulative value without using difference calculation
+    void refresh_accumulator(const Position& pos) const {
+
+  #ifdef VECTOR
+      // Gcc-10.2 unnecessarily spills AVX2 registers if this array
+      // is defined in the VECTOR code below, once in each branch
+      vec_t acc[kNumRegs];
+  #endif
+      auto& accumulator = pos.state()->accumulator;
+      for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+        Features::IndexList active_indices[2];
+        RawFeatures::append_active_indices(pos, kRefreshTriggers[i],
+                                           active_indices);
+          for (Color perspective : { WHITE, BLACK }) {
+#ifdef VECTOR
+            for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
+              auto accTile = reinterpret_cast<vec_t*>(
+                  &accumulator.accumulation[perspective][i][j * kTileHeight]);
+
+              if (i == 0) {
+                auto biasesTile = reinterpret_cast<const vec_t*>(
+                    &biases_[j * kTileHeight]);
+                for (IndexType k = 0; k < kNumRegs; ++k)
+                  acc[k] = biasesTile[k];
+              } else {
+                for (IndexType k = 0; k < kNumRegs; ++k)
+                  acc[k] = vec_zero;
+              }
+
+              for (const auto index : active_indices[perspective]) {
+                const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+                auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+
+                for (IndexType k = 0; k < kNumRegs; ++k)
+                  acc[k] = vec_add_16(acc[k], column[k]);
+              }
+
+              for (IndexType k = 0; k < kNumRegs; k++)
+                vec_store(&accTile[k], acc[k]);
+            }
 #else
-#undef TILING
-
-#endif
-
-    // Input feature converter
-    class FeatureTransformer {
-
-    private:
-        // Number of output dimensions for one side
-        static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;
-
-#ifdef TILING
-        static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2;
-        static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions");
-#endif
-
-    public:
-        // Output type
-        using OutputType = TransformedFeatureType;
-
-        // Number of input/output dimensions
-        static constexpr IndexType kInputDimensions = RawFeatures::kDimensions;
-        static constexpr IndexType kOutputDimensions = kHalfDimensions * 2;
-
-        // Size of forward propagation buffer
-        static constexpr std::size_t kBufferSize =
-            kOutputDimensions * sizeof(OutputType);
-
-        static constexpr int kLayerIndex = 0;
-
-        // Hash value embedded in the evaluation file
-        static constexpr std::uint32_t GetHashValue() {
-
-            return RawFeatures::kHashValue ^ kOutputDimensions;
-        }
-
-        static std::string get_name() {
-            return RawFeatures::get_name() + "[" +
-                std::to_string(kInputDimensions) + "->" +
-                std::to_string(kHalfDimensions) + "x2]";
-        }
-
-        // a string representing the structure
-        static std::string get_structure_string() {
-            return get_name();
-        }
-
-        static std::string get_layers_info() {
-            std::string info = "  - ";
-            info += std::to_string(kLayerIndex);
-            info += " - ";
-            info += get_name();
-            return info;
-        }
-
-        // Read network parameters
-        bool ReadParameters(std::istream& stream) {
-
-            for (std::size_t i = 0; i < kHalfDimensions; ++i)
-                biases_[i] = read_little_endian<BiasType>(stream);
-
-            for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i)
-                weights_[i] = read_little_endian<WeightType>(stream);
-
-            return !stream.fail();
-        }
-
-        // write parameters
-        bool WriteParameters(std::ostream& stream) const {
-            stream.write(reinterpret_cast<const char*>(biases_),
-                kHalfDimensions * sizeof(BiasType));
-
-            stream.write(reinterpret_cast<const char*>(weights_),
-                kHalfDimensions * kInputDimensions * sizeof(WeightType));
-
-            return !stream.fail();
-        }
-
-        // Proceed with the difference calculation if possible
-        bool update_accumulator_if_possible(const Position& pos) const {
-
-            const auto now = pos.state();
-            if (now->accumulator.computed_accumulation)
-                return true;
-
-            const auto prev = now->previous;
-            if (prev && prev->accumulator.computed_accumulation) {
-                update_accumulator(pos);
-                return true;
+            if (i == 0) {
+              std::memcpy(accumulator.accumulation[perspective][i], biases_,
+                          kHalfDimensions * sizeof(BiasType));
+            } else {
+              std::memset(accumulator.accumulation[perspective][i], 0,
+                          kHalfDimensions * sizeof(BiasType));
             }
 
-            return false;
-        }
-
-        // Convert input features
-        void Transform(const Position& pos, OutputType* output) const {
-
-            if (!update_accumulator_if_possible(pos))
-              refresh_accumulator(pos);
-
-            const auto& accumulation = pos.state()->accumulator.accumulation;
-
-#if defined(USE_AVX2)
-            constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
-            constexpr int kControl = 0b11011000;
-            const __m256i kZero = _mm256_setzero_si256();
-
-#elif defined(USE_SSE2)
-            constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
-
-#ifdef USE_SSE41
-            const __m128i kZero = _mm_setzero_si128();
-#else
-            const __m128i k0x80s = _mm_set1_epi8(-128);
-#endif
-
-#elif defined(USE_MMX)
-            constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
-            const __m64 k0x80s = _mm_set1_pi8(-128);
-
-#elif defined(USE_NEON)
-            constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-            const int8x8_t kZero = {0};
-#endif
-
-            const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
-            for (IndexType p = 0; p < 2; ++p) {
-                const IndexType offset = kHalfDimensions * p;
-
-#if defined(USE_AVX2)
-                auto out = reinterpret_cast<__m256i*>(&output[offset]);
-                for (IndexType j = 0; j < kNumChunks; ++j) {
-                    __m256i sum0 = _mm256_load_si256(
-                        &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
-                    __m256i sum1 = _mm256_load_si256(
-                      &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
-                    for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-                        sum0 = _mm256_add_epi16(sum0, reinterpret_cast<const __m256i*>(
-                            accumulation[perspectives[p]][i])[j * 2 + 0]);
-                        sum1 = _mm256_add_epi16(sum1, reinterpret_cast<const __m256i*>(
-                            accumulation[perspectives[p]][i])[j * 2 + 1]);
-                    }
-
-                    _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
-                        _mm256_packs_epi16(sum0, sum1), kZero), kControl));
-                }
-
-#elif defined(USE_SSE2)
-                auto out = reinterpret_cast<__m128i*>(&output[offset]);
-                for (IndexType j = 0; j < kNumChunks; ++j) {
-                    __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
-                        accumulation[perspectives[p]][0])[j * 2 + 0]);
-                    __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
-                        accumulation[perspectives[p]][0])[j * 2 + 1]);
-                    for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-                        sum0 = _mm_add_epi16(sum0, reinterpret_cast<const __m128i*>(
-                            accumulation[perspectives[p]][i])[j * 2 + 0]);
-                        sum1 = _mm_add_epi16(sum1, reinterpret_cast<const __m128i*>(
-                            accumulation[perspectives[p]][i])[j * 2 + 1]);
-                    }
-
-                    const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
-
-                    _mm_store_si128(&out[j],
-
-#ifdef USE_SSE41
-                        _mm_max_epi8(packedbytes, kZero)
-#else
-                        _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
-#endif
-
-                    );
-                }
-
-#elif defined(USE_MMX)
-                auto out = reinterpret_cast<__m64*>(&output[offset]);
-                for (IndexType j = 0; j < kNumChunks; ++j) {
-                    __m64 sum0 = *(&reinterpret_cast<const __m64*>(
-                        accumulation[perspectives[p]][0])[j * 2 + 0]);
-                    __m64 sum1 = *(&reinterpret_cast<const __m64*>(
-                        accumulation[perspectives[p]][0])[j * 2 + 1]);
-                    for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-                        sum0 = _mm_add_pi16(sum0, reinterpret_cast<const __m64*>(
-                            accumulation[perspectives[p]][i])[j * 2 + 0]);
-                        sum1 = _mm_add_pi16(sum1, reinterpret_cast<const __m64*>(
-                            accumulation[perspectives[p]][i])[j * 2 + 1]);
-                    }
-
-                    const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
-                    out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
-                }
-
-#elif defined(USE_NEON)
-                const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
-                for (IndexType j = 0; j < kNumChunks; ++j) {
-                    int16x8_t sum = reinterpret_cast<const int16x8_t*>(
-                        accumulation[perspectives[p]][0])[j];
-
-                    for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-                        sum = vaddq_s16(sum, reinterpret_cast<const int16x8_t*>(
-                            accumulation[perspectives[p]][i])[j]);
-                    }
-
-                    out[j] = vmax_s8(vqmovn_s16(sum), kZero);
-                }
-
-#else
-                for (IndexType j = 0; j < kHalfDimensions; ++j) {
-                    BiasType sum = accumulation[static_cast<int>(perspectives[p])][0][j];
-                    for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-                        sum += accumulation[static_cast<int>(perspectives[p])][i][j];
-                    }
-
-                    output[offset + j] = static_cast<OutputType>(
-                        std::max<int>(0, std::min<int>(127, sum)));
-                }
-#endif
+            for (const auto index : active_indices[perspective]) {
+              const IndexType offset = kHalfDimensions * index;
 
+              for (IndexType j = 0; j < kHalfDimensions; ++j)
+                accumulator.accumulation[perspective][i][j] += weights_[offset + j];
             }
-#if defined(USE_MMX)
-            _mm_empty();
 #endif
+          }
+
         }
 
-    private:
-        // Calculate cumulative value without using difference calculation
-        void refresh_accumulator(const Position& pos) const {
-
-            auto& accumulator = pos.state()->accumulator;
-            for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-                Features::IndexList active_indices[2];
-                RawFeatures::append_active_indices(pos, kRefreshTriggers[i],
-                                                   active_indices);
-                for (Color perspective : { WHITE, BLACK }) {
-#ifdef TILING
-                    for (unsigned j = 0; j < kHalfDimensions / kTileHeight; ++j) {
-                        auto accTile = reinterpret_cast<vec_t*>(
-                            &accumulator.accumulation[perspective][i][j * kTileHeight]);
-                        vec_t acc[kNumRegs];
-
-                        if (i == 0) {
-                            auto biasesTile = reinterpret_cast<const vec_t*>(
-                                &biases_[j * kTileHeight]);
-                            for (unsigned k = 0; k < kNumRegs; ++k)
-                                acc[k] = biasesTile[k];
-                        } else {
-                            for (unsigned k = 0; k < kNumRegs; ++k)
-                                acc[k] = vec_zero;
-                        }
-
-                        for (const auto index : active_indices[perspective]) {
-                            const IndexType offset = kHalfDimensions * index + j * kTileHeight;
-                            auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
-
-                            for (unsigned k = 0; k < kNumRegs; ++k)
-                                acc[k] = vec_add_16(acc[k], column[k]);
-                        }
-
-                        for (unsigned k = 0; k < kNumRegs; k++)
-                            vec_store(&accTile[k], acc[k]);
-                    }
-#else
-                    if (i == 0) {
-                        std::memcpy(accumulator.accumulation[perspective][i], biases_,
-                                    kHalfDimensions * sizeof(BiasType));
-                    } else {
-                        std::memset(accumulator.accumulation[perspective][i], 0,
-                                    kHalfDimensions * sizeof(BiasType));
-                    }
-
-                    for (const auto index : active_indices[perspective]) {
-                        const IndexType offset = kHalfDimensions * index;
-
-                        for (IndexType j = 0; j < kHalfDimensions; ++j)
-                            accumulator.accumulation[perspective][i][j] += weights_[offset + j];
-                    }
-#endif
-                }
-
-            }
-
 #if defined(USE_MMX)
-            _mm_empty();
+        _mm_empty();
 #endif
 
-            accumulator.computed_accumulation = true;
+        accumulator.computed_accumulation = true;
+    }
+
+    // Calculate cumulative value using difference calculation
+    void update_accumulator(const Position& pos) const {
+
+  #ifdef VECTOR
+      // Gcc-10.2 unnecessarily spills AVX2 registers if this array
+      // is defined in the VECTOR code below, once in each branch
+      vec_t acc[kNumRegs];
+  #endif
+    const auto& prev_accumulator = pos.state()->previous->accumulator;
+    auto& accumulator = pos.state()->accumulator;
+    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+      Features::IndexList removed_indices[2], added_indices[2];
+      bool reset[2] = { false, false };
+      RawFeatures::append_changed_indices(pos, kRefreshTriggers[i],
+                                          removed_indices, added_indices, reset);
+
+#ifdef VECTOR
+      for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
+        for (Color perspective : { WHITE, BLACK }) {
+          auto accTile = reinterpret_cast<vec_t*>(
+              &accumulator.accumulation[perspective][i][j * kTileHeight]);
+
+          if (reset[perspective]) {
+            if (i == 0) {
+              auto biasesTile = reinterpret_cast<const vec_t*>(
+                  &biases_[j * kTileHeight]);
+              for (IndexType k = 0; k < kNumRegs; ++k)
+                acc[k] = biasesTile[k];
+            } else {
+              for (IndexType k = 0; k < kNumRegs; ++k)
+                acc[k] = vec_zero;
+            }
+          } else {
+            auto prevAccTile = reinterpret_cast<const vec_t*>(
+                &prev_accumulator.accumulation[perspective][i][j * kTileHeight]);
+
+            for (IndexType k = 0; k < kNumRegs; ++k)
+              acc[k] = vec_load(&prevAccTile[k]);
+
+            // Difference calculation for the deactivated features
+            for (const auto index : removed_indices[perspective]) {
+              const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+              auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+
+              for (IndexType k = 0; k < kNumRegs; ++k)
+                acc[k] = vec_sub_16(acc[k], column[k]);
+            }
+          }
+
+          { // Difference calculation for the activated features
+            for (const auto index : added_indices[perspective]) {
+              const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+              auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+
+              for (IndexType k = 0; k < kNumRegs; ++k)
+                acc[k] = vec_add_16(acc[k], column[k]);
+            }
+          }
+
+          for (IndexType k = 0; k < kNumRegs; ++k)
+            vec_store(&accTile[k], acc[k]);
         }
-
-        // Calculate cumulative value using difference calculation
-        void update_accumulator(const Position& pos) const {
-
-            const auto& prev_accumulator = pos.state()->previous->accumulator;
-            auto& accumulator = pos.state()->accumulator;
-            for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-                Features::IndexList removed_indices[2], added_indices[2];
-                bool reset[2] = { false, false };
-                RawFeatures::append_changed_indices(pos, kRefreshTriggers[i],
-                                                    removed_indices, added_indices, reset);
-
-#ifdef TILING
-                for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
-                    for (Color perspective : { WHITE, BLACK }) {
-                        auto accTile = reinterpret_cast<vec_t*>(
-                            &accumulator.accumulation[perspective][i][j * kTileHeight]);
-                        vec_t acc[kNumRegs];
-
-                        if (reset[perspective]) {
-                            if (i == 0) {
-                                auto biasesTile = reinterpret_cast<const vec_t*>(
-                                    &biases_[j * kTileHeight]);
-                                for (unsigned k = 0; k < kNumRegs; ++k)
-                                    acc[k] = biasesTile[k];
-                            } else {
-                                for (unsigned k = 0; k < kNumRegs; ++k)
-                                    acc[k] = vec_zero;
-                            }
-                        } else {
-                            auto prevAccTile = reinterpret_cast<const vec_t*>(
-                                &prev_accumulator.accumulation[perspective][i][j * kTileHeight]);
-
-                            for (IndexType k = 0; k < kNumRegs; ++k)
-                                acc[k] = vec_load(&prevAccTile[k]);
-
-                            // Difference calculation for the deactivated features
-                            for (const auto index : removed_indices[perspective]) {
-                                const IndexType offset = kHalfDimensions * index + j * kTileHeight;
-                                auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
-
-                                for (IndexType k = 0; k < kNumRegs; ++k)
-                                    acc[k] = vec_sub_16(acc[k], column[k]);
-                            }
-                        }
-
-                        { // Difference calculation for the activated features
-                          for (const auto index : added_indices[perspective]) {
-                              const IndexType offset = kHalfDimensions * index + j * kTileHeight;
-                              auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
-
-                              for (IndexType k = 0; k < kNumRegs; ++k)
-                                  acc[k] = vec_add_16(acc[k], column[k]);
-                          }
-                        }
-
-                        for (IndexType k = 0; k < kNumRegs; ++k)
-                          vec_store(&accTile[k], acc[k]);
-                    }
-                }
+      }
 #if defined(USE_MMX)
-                _mm_empty();
+      _mm_empty();
 #endif
 
 #else
-                for (Color perspective : { WHITE, BLACK }) {
+      for (Color perspective : { WHITE, BLACK }) {
 
-                    if (reset[perspective]) {
-                        if (i == 0) {
-                            std::memcpy(accumulator.accumulation[perspective][i], biases_,
-                                        kHalfDimensions * sizeof(BiasType));
-                        } else {
-                            std::memset(accumulator.accumulation[perspective][i], 0,
-                                        kHalfDimensions * sizeof(BiasType));
-                        }
-                    } else {
-                        std::memcpy(accumulator.accumulation[perspective][i],
-                                    prev_accumulator.accumulation[perspective][i],
-                                    kHalfDimensions * sizeof(BiasType));
-                        // Difference calculation for the deactivated features
-                        for (const auto index : removed_indices[perspective]) {
-                            const IndexType offset = kHalfDimensions * index;
+        if (reset[perspective]) {
+          if (i == 0) {
+            std::memcpy(accumulator.accumulation[perspective][i], biases_,
+                        kHalfDimensions * sizeof(BiasType));
+          } else {
+            std::memset(accumulator.accumulation[perspective][i], 0,
+                        kHalfDimensions * sizeof(BiasType));
+          }
+        } else {
+          std::memcpy(accumulator.accumulation[perspective][i],
+                      prev_accumulator.accumulation[perspective][i],
+                      kHalfDimensions * sizeof(BiasType));
+          // Difference calculation for the deactivated features
+          for (const auto index : removed_indices[perspective]) {
+            const IndexType offset = kHalfDimensions * index;
 
-                            for (IndexType j = 0; j < kHalfDimensions; ++j)
-                                accumulator.accumulation[perspective][i][j] -= weights_[offset + j];
-                        }
-                    }
-                    { // Difference calculation for the activated features
-                        for (const auto index : added_indices[perspective]) {
-                          const IndexType offset = kHalfDimensions * index;
-
-                          for (IndexType j = 0; j < kHalfDimensions; ++j)
-                              accumulator.accumulation[perspective][i][j] += weights_[offset + j];
-                        }
-                    }
-                }
-#endif
-            }
-            accumulator.computed_accumulation = true;
+            for (IndexType j = 0; j < kHalfDimensions; ++j)
+              accumulator.accumulation[perspective][i][j] -= weights_[offset + j];
+          }
         }
+        { // Difference calculation for the activated features
+          for (const auto index : added_indices[perspective]) {
+            const IndexType offset = kHalfDimensions * index;
 
-        using BiasType = std::int16_t;
-        using WeightType = std::int16_t;
+            for (IndexType j = 0; j < kHalfDimensions; ++j)
+              accumulator.accumulation[perspective][i][j] += weights_[offset + j];
+          }
+        }
+      }
+#endif
+      }
+      accumulator.computed_accumulation = true;
+    }
 
-        // Make the learning class a friend
-        friend class Trainer<FeatureTransformer>;
+    using BiasType = std::int16_t;
+    using WeightType = std::int16_t;
 
-        alignas(kCacheLineSize) BiasType biases_[kHalfDimensions];
-        alignas(kCacheLineSize)
-            WeightType weights_[kHalfDimensions * kInputDimensions];
-    };
+    // Make the learning class a friend
+    friend class Trainer<FeatureTransformer>;
+
+    alignas(kCacheLineSize) BiasType biases_[kHalfDimensions];
+    alignas(kCacheLineSize)
+        WeightType weights_[kHalfDimensions * kInputDimensions];
+  };
 
 }  // namespace Eval::NNUE
 
-#endif //#ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
+#endif // #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
diff --git a/src/pawns.cpp b/src/pawns.cpp
index fde70ba5..68aaf331 100644
--- a/src/pawns.cpp
+++ b/src/pawns.cpp
@@ -176,8 +176,8 @@ namespace {
             score -=  Doubled * doubled
                     + WeakLever * more_than_one(lever);
 
-        if (blocked && r > RANK_4)
-            score += BlockedPawn[r-4];
+        if (blocked && r >= RANK_5)
+            score += BlockedPawn[r - RANK_5];
     }
 
     return score;
diff --git a/src/search.cpp b/src/search.cpp
index 8d057f42..30384868 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -59,7 +59,7 @@ namespace {
   // Razor and futility margins
   constexpr int RazorMargin = 510;
   Value futility_margin(Depth d, bool improving) {
-    return Value(223 * (d - improving));
+    return Value(234 * (d - improving));
   }
 
   // Reductions lookup table, initialized at startup
@@ -67,7 +67,7 @@ namespace {
 
   Depth reduction(bool i, Depth d, int mn) {
     int r = Reductions[d] * Reductions[mn];
-    return (r + 509) / 1024 + (!i && r > 894);
+    return (r + 503) / 1024 + (!i && r > 915);
   }
 
   constexpr int futility_move_count(bool improving, Depth depth) {
@@ -188,7 +188,7 @@ namespace {
 void Search::init() {
 
   for (int i = 1; i < MAX_MOVES; ++i)
-      Reductions[i] = int((22.0 + 2 * std::log(Threads.size())) * std::log(i + 0.25 * std::log(i)));
+      Reductions[i] = int((21.3 + 2 * std::log(Threads.size())) * std::log(i + 0.25 * std::log(i)));
 }
 
 
@@ -404,7 +404,7 @@ void Thread::search() {
               beta  = std::min(prev + delta, VALUE_INFINITE);
 
               // Adjust contempt based on root move's previousScore (dynamic contempt)
-              int dct = ct + (105 - ct / 2) * prev / (abs(prev) + 149);
+              int dct = ct + (113 - ct / 2) * prev / (abs(prev) + 147);
 
               contempt = (us == WHITE ?  make_score(dct, dct / 2)
                                       : -make_score(dct, dct / 2));
@@ -824,7 +824,7 @@ namespace {
         && (ss-1)->statScore < 22977
         &&  eval >= beta
         &&  eval >= ss->staticEval
-        &&  ss->staticEval >= beta - 30 * depth - 28 * improving + 84 * ss->ttPv + 182
+        &&  ss->staticEval >= beta - 30 * depth - 28 * improving + 84 * ss->ttPv + 168
         && !excludedMove
         &&  pos.non_pawn_material(us)
         && (ss->ply >= thisThread->nmpMinPly || us != thisThread->nmpColor))
@@ -832,7 +832,7 @@ namespace {
         assert(eval - beta >= 0);
 
         // Null move dynamic reduction based on depth and value
-        Depth R = (982 + 85 * depth) / 256 + std::min(int(eval - beta) / 192, 3);
+        Depth R = (1015 + 85 * depth) / 256 + std::min(int(eval - beta) / 191, 3);
 
         ss->currentMove = MOVE_NULL;
         ss->continuationHistory = &thisThread->continuationHistory[0][0][NO_PIECE][0];
@@ -849,7 +849,7 @@ namespace {
             if (nullValue >= VALUE_TB_WIN_IN_MAX_PLY)
                 nullValue = beta;
 
-            if (thisThread->nmpMinPly || (abs(beta) < VALUE_KNOWN_WIN && depth < 13))
+            if (thisThread->nmpMinPly || (abs(beta) < VALUE_KNOWN_WIN && depth < 14))
                 return nullValue;
 
             assert(!thisThread->nmpMinPly); // Recursive verification is not allowed
@@ -868,7 +868,7 @@ namespace {
         }
     }
 
-    probCutBeta = beta + 176 - 49 * improving;
+    probCutBeta = beta + 183 - 49 * improving;
 
     // Step 10. ProbCut (~10 Elo)
     // If we have a good enough capture and a reduced search returns a value
@@ -1036,7 +1036,7 @@ moves_loop: // When in check, search starts from here
               // Futility pruning: parent node (~5 Elo)
               if (   lmrDepth < 7
                   && !ss->inCheck
-                  && ss->staticEval + 283 + 170 * lmrDepth <= alpha
+                  && ss->staticEval + 266 + 170 * lmrDepth <= alpha
                   &&  (*contHist[0])[movedPiece][to_sq(move)]
                     + (*contHist[1])[movedPiece][to_sq(move)]
                     + (*contHist[3])[movedPiece][to_sq(move)]
@@ -1044,7 +1044,7 @@ moves_loop: // When in check, search starts from here
                   continue;
 
               // Prune moves with negative SEE (~20 Elo)
-              if (!pos.see_ge(move, Value(-(29 - std::min(lmrDepth, 18)) * lmrDepth * lmrDepth)))
+              if (!pos.see_ge(move, Value(-(30 - std::min(lmrDepth, 18)) * lmrDepth * lmrDepth)))
                   continue;
           }
           else
@@ -1055,8 +1055,8 @@ moves_loop: // When in check, search starts from here
                   && captureHistory[movedPiece][to_sq(move)][type_of(pos.piece_on(to_sq(move)))] < 0)
                   continue;
 
-              // See based pruning
-              if (!pos.see_ge(move, Value(-221) * depth)) // (~25 Elo)
+              // SEE based pruning
+              if (!pos.see_ge(move, Value(-213) * depth)) // (~25 Elo)
                   continue;
           }
       }
@@ -1150,12 +1150,12 @@ moves_loop: // When in check, search starts from here
               || moveCountPruning
               || ss->staticEval + PieceValue[EG][pos.captured_piece()] <= alpha
               || cutNode
-              || thisThread->ttHitAverage < 427 * TtHitAverageResolution * TtHitAverageWindow / 1024))
+              || thisThread->ttHitAverage < 432 * TtHitAverageResolution * TtHitAverageWindow / 1024))
       {
           Depth r = reduction(improving, depth, moveCount);
 
           // Decrease reduction if the ttHit running average is large
-          if (thisThread->ttHitAverage > 509 * TtHitAverageResolution * TtHitAverageWindow / 1024)
+          if (thisThread->ttHitAverage > 537 * TtHitAverageResolution * TtHitAverageWindow / 1024)
               r--;
 
           // Increase reduction if other threads are searching this position
@@ -1208,10 +1208,10 @@ moves_loop: // When in check, search starts from here
                              - 5287;
 
               // Decrease/increase reduction by comparing opponent's stat score (~10 Elo)
-              if (ss->statScore >= -106 && (ss-1)->statScore < -104)
+              if (ss->statScore >= -105 && (ss-1)->statScore < -103)
                   r--;
 
-              else if ((ss-1)->statScore >= -119 && ss->statScore < -140)
+              else if ((ss-1)->statScore >= -122 && ss->statScore < -129)
                   r++;
 
               // Decrease/increase reduction for moves with a good/bad history (~30 Elo)
@@ -1225,7 +1225,7 @@ moves_loop: // When in check, search starts from here
 
               // Unless giving check, this capture is likely bad
               if (   !givesCheck
-                  && ss->staticEval + PieceValue[EG][pos.captured_piece()] + 213 * depth <= alpha)
+                  && ss->staticEval + PieceValue[EG][pos.captured_piece()] + 210 * depth <= alpha)
                   r++;
           }
 
@@ -1499,7 +1499,7 @@ moves_loop: // When in check, search starts from here
         if (PvNode && bestValue > alpha)
             alpha = bestValue;
 
-        futilityBase = bestValue + 145;
+        futilityBase = bestValue + 155;
     }
 
     const PieceToHistory* contHist[] = { (ss-1)->continuationHistory, (ss-2)->continuationHistory,
diff --git a/src/types.h b/src/types.h
index f55a20f7..4918e8ff 100644
--- a/src/types.h
+++ b/src/types.h
@@ -204,8 +204,8 @@ enum PieceType {
 
 enum Piece {
   NO_PIECE,
-  W_PAWN = 1, W_KNIGHT, W_BISHOP, W_ROOK, W_QUEEN, W_KING,
-  B_PAWN = 9, B_KNIGHT, B_BISHOP, B_ROOK, B_QUEEN, B_KING,
+  W_PAWN = PAWN,     W_KNIGHT, W_BISHOP, W_ROOK, W_QUEEN, W_KING,
+  B_PAWN = PAWN + 8, B_KNIGHT, B_BISHOP, B_ROOK, B_QUEEN, B_KING,
   PIECE_NB = 16
 };