Cleanup layers.

2025-12-24 19:16:49 +08:00 · 2020-10-14 20:55:35 +02:00
parent 0d4c3014ca
commit 3041adb080
4 changed files with 616 additions and 573 deletions
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)

-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.

-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.

-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

 // Definition of layer AffineTransform of NNUE evaluation function
@@ -21,267 +21,290 @@
 #ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
 #define NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED

-#include <iostream>
-#include "../nnue_common.h"
+#include "nnue/nnue_common.h"
+
+#include <string>
+#include <type_traits>
+#include <cstdint>

 namespace Eval::NNUE::Layers {

-  // Affine transformation layer
-  template <typename PreviousLayer, IndexType OutputDimensions>
-  class AffineTransform {
-   public:
-    // Input/output type
-    using InputType = typename PreviousLayer::OutputType;
-    using OutputType = std::int32_t;
-    static_assert(std::is_same<InputType, std::uint8_t>::value, "");
+    // Affine transformation layer
+    template <typename PreviousLayer, IndexType OutputDimensions>
+    class AffineTransform {
+    public:
+        // Input/output type
+        using InputType = typename PreviousLayer::OutputType;

-    // Number of input/output dimensions
-    static constexpr IndexType kInputDimensions =
-        PreviousLayer::kOutputDimensions;
-    static constexpr IndexType kOutputDimensions = OutputDimensions;
-    static constexpr IndexType kPaddedInputDimensions =
-        CeilToMultiple<IndexType>(kInputDimensions, kMaxSimdWidth);
+        using OutputType = std::int32_t;

-    // Size of forward propagation buffer used in this layer
-    static constexpr std::size_t kSelfBufferSize =
-        CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+        static_assert(std::is_same<InputType, std::uint8_t>::value, "");

-    // Size of the forward propagation buffer used from the input layer to this layer
-    static constexpr std::size_t kBufferSize =
-        PreviousLayer::kBufferSize + kSelfBufferSize;
+        // Number of input/output dimensions
+        static constexpr IndexType kInputDimensions =
+            PreviousLayer::kOutputDimensions;

-    // Hash value embedded in the evaluation file
-    static constexpr std::uint32_t GetHashValue() {
-      std::uint32_t hash_value = 0xCC03DAE4u;
-      hash_value += kOutputDimensions;
-      hash_value ^= PreviousLayer::GetHashValue() >> 1;
-      hash_value ^= PreviousLayer::GetHashValue() << 31;
-      return hash_value;
-    }
+        static constexpr IndexType kOutputDimensions = OutputDimensions;

-    // A string that represents the structure from the input layer to this layer
-    static std::string GetStructureString() {
-      return "AffineTransform[" +
-        std::to_string(kOutputDimensions) + "<-" +
-        std::to_string(kInputDimensions) + "](" +
-        PreviousLayer::GetStructureString() + ")";
-    }
-    
-   // Read network parameters
-    bool ReadParameters(std::istream& stream) {
-      if (!previous_layer_.ReadParameters(stream)) return false;
-      for (std::size_t i = 0; i < kOutputDimensions; ++i)
-        biases_[i] = read_little_endian<BiasType>(stream);
-      for (std::size_t i = 0; i < kOutputDimensions * kPaddedInputDimensions; ++i)
-        weights_[i] = read_little_endian<WeightType>(stream);
-      return !stream.fail();
-    }
+        static constexpr IndexType kPaddedInputDimensions =
+            CeilToMultiple<IndexType>(kInputDimensions, kMaxSimdWidth);

-    // write parameters
-    bool WriteParameters(std::ostream& stream) const {
-      if (!previous_layer_.WriteParameters(stream)) return false;
-      stream.write(reinterpret_cast<const char*>(biases_),
-        kOutputDimensions * sizeof(BiasType));
-      stream.write(reinterpret_cast<const char*>(weights_),
-        kOutputDimensions * kPaddedInputDimensions *
-        sizeof(WeightType));
-      return !stream.fail();
-    }
+        // Size of forward propagation buffer used in this layer
+        static constexpr std::size_t kSelfBufferSize =
+            CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);

-    // Forward propagation
-    const OutputType* Propagate(
-        const TransformedFeatureType* transformed_features, char* buffer) const {
-      const auto input = previous_layer_.Propagate(
-          transformed_features, buffer + kSelfBufferSize);
-      const auto output = reinterpret_cast<OutputType*>(buffer);
+        // Size of the forward propagation buffer used from the input layer to this layer
+        static constexpr std::size_t kBufferSize =
+            PreviousLayer::kBufferSize + kSelfBufferSize;

-  #if defined(USE_AVX512)
-      constexpr IndexType kNumChunks = kPaddedInputDimensions / (kSimdWidth * 2);
-      const auto input_vector = reinterpret_cast<const __m512i*>(input);
-  #if !defined(USE_VNNI)
-      const __m512i kOnes = _mm512_set1_epi16(1);
-  #endif
-
-  #elif defined(USE_AVX2)
-      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
-      const auto input_vector = reinterpret_cast<const __m256i*>(input);
-  #if !defined(USE_VNNI)
-      const __m256i kOnes = _mm256_set1_epi16(1);
-  #endif
-
-  #elif defined(USE_SSE2)
-      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
-  #ifndef USE_SSSE3
-      const __m128i kZeros = _mm_setzero_si128();
-  #else
-      const __m128i kOnes = _mm_set1_epi16(1);
-  #endif
-      const auto input_vector = reinterpret_cast<const __m128i*>(input);
-
-  #elif defined(USE_MMX)
-      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
-      const __m64 kZeros = _mm_setzero_si64();
-      const auto input_vector = reinterpret_cast<const __m64*>(input);
-
-  #elif defined(USE_NEON)
-      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
-      const auto input_vector = reinterpret_cast<const int8x8_t*>(input);
-  #endif
-
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        const IndexType offset = i * kPaddedInputDimensions;
-
-  #if defined(USE_AVX512)
-        __m512i sum = _mm512_setzero_si512();
-        const auto row = reinterpret_cast<const __m512i*>(&weights_[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-  #if defined(USE_VNNI)
-            sum = _mm512_dpbusd_epi32(sum, _mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
-  #else
-            __m512i product = _mm512_maddubs_epi16(_mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
-            product = _mm512_madd_epi16(product, kOnes);
-            sum = _mm512_add_epi32(sum, product);
-  #endif
+        // Hash value embedded in the evaluation file
+        static constexpr std::uint32_t GetHashValue() {
+            std::uint32_t hash_value = 0xCC03DAE4u;
+            hash_value += kOutputDimensions;
+            hash_value ^= PreviousLayer::GetHashValue() >> 1;
+            hash_value ^= PreviousLayer::GetHashValue() << 31;
+            return hash_value;
        }

-        // Note: Changing kMaxSimdWidth from 32 to 64 breaks loading existing networks.
-        // As a result kPaddedInputDimensions may not be an even multiple of 64(512bit)
-        // and we have to do one more 256bit chunk.
-        if (kPaddedInputDimensions != kNumChunks * kSimdWidth * 2)
-        {
-            const auto iv256  = reinterpret_cast<const __m256i*>(&input_vector[kNumChunks]);
-            const auto row256 = reinterpret_cast<const __m256i*>(&row[kNumChunks]);
-  #if defined(USE_VNNI)
-            __m256i product256 = _mm256_dpbusd_epi32(
-                _mm512_castsi512_si256(sum), _mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
-            sum = _mm512_inserti32x8(sum, product256, 0);
-  #else
-            __m256i product256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
-            sum = _mm512_add_epi32(sum, _mm512_cvtepi16_epi32(product256));
-  #endif
+        // A string that represents the structure from the input layer to this layer
+        static std::string GetStructureString() {
+            return "AffineTransform[" +
+                std::to_string(kOutputDimensions) + "<-" +
+                std::to_string(kInputDimensions) + "](" +
+                PreviousLayer::GetStructureString() + ")";
        }
-        output[i] = _mm512_reduce_add_epi32(sum) + biases_[i];

-  #elif defined(USE_AVX2)
-        __m256i sum = _mm256_setzero_si256();
-        const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-  #if defined(USE_VNNI)
-          sum = _mm256_dpbusd_epi32(sum, _mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
-  #else
-          __m256i product = _mm256_maddubs_epi16(_mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
-          product = _mm256_madd_epi16(product, kOnes);
-          sum = _mm256_add_epi32(sum, product);
-  #endif
+       // Read network parameters
+        bool ReadParameters(std::istream& stream) {
+            if (!previous_layer_.ReadParameters(stream))
+                return false;
+
+            for (std::size_t i = 0; i < kOutputDimensions; ++i)
+                biases_[i] = read_little_endian<BiasType>(stream);
+
+            for (std::size_t i = 0; i < kOutputDimensions * kPaddedInputDimensions; ++i)
+                weights_[i] = read_little_endian<WeightType>(stream);
+
+            return !stream.fail();
        }
-        __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
-        sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));
-        sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB));
-        output[i] = _mm_cvtsi128_si32(sum128) + biases_[i];

-  #elif defined(USE_SSSE3)
-        __m128i sum = _mm_setzero_si128();
-        const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
-        for (int j = 0; j < (int)kNumChunks - 1; j += 2) {
-          __m128i product0 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j]));
-          product0 = _mm_madd_epi16(product0, kOnes);
-          sum = _mm_add_epi32(sum, product0);
-          __m128i product1 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j+1]), _mm_load_si128(&row[j+1]));
-          product1 = _mm_madd_epi16(product1, kOnes);
-          sum = _mm_add_epi32(sum, product1);
+        // write parameters
+        bool WriteParameters(std::ostream& stream) const {
+            if (!previous_layer_.WriteParameters(stream))
+                return false;
+
+            stream.write(reinterpret_cast<const char*>(biases_),
+                kOutputDimensions * sizeof(BiasType));
+
+            stream.write(reinterpret_cast<const char*>(weights_),
+                kOutputDimensions * kPaddedInputDimensions *
+                sizeof(WeightType));
+
+            return !stream.fail();
        }
-        if (kNumChunks & 0x1) {
-          __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[kNumChunks-1]), _mm_load_si128(&row[kNumChunks-1]));
-          product = _mm_madd_epi16(product, kOnes);
-          sum = _mm_add_epi32(sum, product);
+
+        // Forward propagation
+        const OutputType* Propagate(
+            const TransformedFeatureType* transformed_features, char* buffer) const {
+
+            const auto input = previous_layer_.Propagate(
+                transformed_features, buffer + kSelfBufferSize);
+            const auto output = reinterpret_cast<OutputType*>(buffer);
+
+#if defined(USE_AVX512)
+            constexpr IndexType kNumChunks = kPaddedInputDimensions / (kSimdWidth * 2);
+            const auto input_vector = reinterpret_cast<const __m512i*>(input);
+#if !defined(USE_VNNI)
+            const __m512i kOnes = _mm512_set1_epi16(1);
+#endif
+
+#elif defined(USE_AVX2)
+            constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+            const auto input_vector = reinterpret_cast<const __m256i*>(input);
+#if !defined(USE_VNNI)
+            const __m256i kOnes = _mm256_set1_epi16(1);
+#endif
+
+#elif defined(USE_SSE2)
+            constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+#ifndef USE_SSSE3
+            const __m128i kZeros = _mm_setzero_si128();
+#else
+            const __m128i kOnes = _mm_set1_epi16(1);
+#endif
+            const auto input_vector = reinterpret_cast<const __m128i*>(input);
+
+#elif defined(USE_MMX)
+            constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+            const __m64 kZeros = _mm_setzero_si64();
+            const auto input_vector = reinterpret_cast<const __m64*>(input);
+
+#elif defined(USE_NEON)
+            constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+            const auto input_vector = reinterpret_cast<const int8x8_t*>(input);
+#endif
+
+            for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                const IndexType offset = i * kPaddedInputDimensions;
+
+#if defined(USE_AVX512)
+                __m512i sum = _mm512_setzero_si512();
+                const auto row = reinterpret_cast<const __m512i*>(&weights_[offset]);
+                for (IndexType j = 0; j < kNumChunks; ++j) {
+#if defined(USE_VNNI)
+                    sum = _mm512_dpbusd_epi32(sum, _mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
+#else
+                    __m512i product = _mm512_maddubs_epi16(_mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
+                    product = _mm512_madd_epi16(product, kOnes);
+                    sum = _mm512_add_epi32(sum, product);
+#endif
+                }
+
+                // Note: Changing kMaxSimdWidth from 32 to 64 breaks loading existing networks.
+                // As a result kPaddedInputDimensions may not be an even multiple of 64(512bit)
+                // and we have to do one more 256bit chunk.
+                if (kPaddedInputDimensions != kNumChunks * kSimdWidth * 2)
+                {
+                    const auto iv256  = reinterpret_cast<const __m256i*>(&input_vector[kNumChunks]);
+                    const auto row256 = reinterpret_cast<const __m256i*>(&row[kNumChunks]);
+#if defined(USE_VNNI)
+                    __m256i product256 = _mm256_dpbusd_epi32(
+                        _mm512_castsi512_si256(sum), _mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
+                    sum = _mm512_inserti32x8(sum, product256, 0);
+#else
+                    __m256i product256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
+                    sum = _mm512_add_epi32(sum, _mm512_cvtepi16_epi32(product256));
+#endif
+                }
+
+                output[i] = _mm512_reduce_add_epi32(sum) + biases_[i];
+
+#elif defined(USE_AVX2)
+                __m256i sum = _mm256_setzero_si256();
+                const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
+                for (IndexType j = 0; j < kNumChunks; ++j) {
+#if defined(USE_VNNI)
+                    sum = _mm256_dpbusd_epi32(sum, _mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
+#else
+                    __m256i product = _mm256_maddubs_epi16(_mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
+                    product = _mm256_madd_epi16(product, kOnes);
+                    sum = _mm256_add_epi32(sum, product);
+#endif
+                }
+
+                __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
+                sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));
+                sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB));
+                output[i] = _mm_cvtsi128_si32(sum128) + biases_[i];
+
+#elif defined(USE_SSSE3)
+                __m128i sum = _mm_setzero_si128();
+                const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
+                for (int j = 0; j < (int)kNumChunks - 1; j += 2) {
+                    __m128i product0 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j]));
+                    product0 = _mm_madd_epi16(product0, kOnes);
+                    sum = _mm_add_epi32(sum, product0);
+                    __m128i product1 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j+1]), _mm_load_si128(&row[j+1]));
+                    product1 = _mm_madd_epi16(product1, kOnes);
+                    sum = _mm_add_epi32(sum, product1);
+                }
+
+                if (kNumChunks & 0x1) {
+                    __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[kNumChunks-1]), _mm_load_si128(&row[kNumChunks-1]));
+                    product = _mm_madd_epi16(product, kOnes);
+                    sum = _mm_add_epi32(sum, product);
+                }
+
+                sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC
+                sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
+                output[i] = _mm_cvtsi128_si32(sum) + biases_[i];
+
+#elif defined(USE_SSE2)
+                __m128i sum_lo = _mm_cvtsi32_si128(biases_[i]);
+                __m128i sum_hi = kZeros;
+                const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
+                for (IndexType j = 0; j < kNumChunks; ++j) {
+                    __m128i row_j = _mm_load_si128(&row[j]);
+                    __m128i input_j = _mm_load_si128(&input_vector[j]);
+                    __m128i row_signs = _mm_cmpgt_epi8(kZeros, row_j);
+                    __m128i extended_row_lo = _mm_unpacklo_epi8(row_j, row_signs);
+                    __m128i extended_row_hi = _mm_unpackhi_epi8(row_j, row_signs);
+                    __m128i extended_input_lo = _mm_unpacklo_epi8(input_j, kZeros);
+                    __m128i extended_input_hi = _mm_unpackhi_epi8(input_j, kZeros);
+                    __m128i product_lo = _mm_madd_epi16(extended_row_lo, extended_input_lo);
+                    __m128i product_hi = _mm_madd_epi16(extended_row_hi, extended_input_hi);
+                    sum_lo = _mm_add_epi32(sum_lo, product_lo);
+                    sum_hi = _mm_add_epi32(sum_hi, product_hi);
+                }
+
+                __m128i sum = _mm_add_epi32(sum_lo, sum_hi);
+                __m128i sum_high_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2));
+                sum = _mm_add_epi32(sum, sum_high_64);
+                __m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2));
+                sum = _mm_add_epi32(sum, sum_second_32);
+                output[i] = _mm_cvtsi128_si32(sum);
+
+#elif defined(USE_MMX)
+                __m64 sum_lo = _mm_cvtsi32_si64(biases_[i]);
+                __m64 sum_hi = kZeros;
+                const auto row = reinterpret_cast<const __m64*>(&weights_[offset]);
+                for (IndexType j = 0; j < kNumChunks; ++j) {
+                    __m64 row_j = row[j];
+                    __m64 input_j = input_vector[j];
+                    __m64 row_signs = _mm_cmpgt_pi8(kZeros, row_j);
+                    __m64 extended_row_lo = _mm_unpacklo_pi8(row_j, row_signs);
+                    __m64 extended_row_hi = _mm_unpackhi_pi8(row_j, row_signs);
+                    __m64 extended_input_lo = _mm_unpacklo_pi8(input_j, kZeros);
+                    __m64 extended_input_hi = _mm_unpackhi_pi8(input_j, kZeros);
+                    __m64 product_lo = _mm_madd_pi16(extended_row_lo, extended_input_lo);
+                    __m64 product_hi = _mm_madd_pi16(extended_row_hi, extended_input_hi);
+                    sum_lo = _mm_add_pi32(sum_lo, product_lo);
+                    sum_hi = _mm_add_pi32(sum_hi, product_hi);
+                }
+
+                __m64 sum = _mm_add_pi32(sum_lo, sum_hi);
+                sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum));
+                output[i] = _mm_cvtsi64_si32(sum);
+
+#elif defined(USE_NEON)
+                int32x4_t sum = {biases_[i]};
+                const auto row = reinterpret_cast<const int8x8_t*>(&weights_[offset]);
+                for (IndexType j = 0; j < kNumChunks; ++j) {
+                    int16x8_t product = vmull_s8(input_vector[j * 2], row[j * 2]);
+                    product = vmlal_s8(product, input_vector[j * 2 + 1], row[j * 2 + 1]);
+                    sum = vpadalq_s16(sum, product);
+                }
+
+                output[i] = sum[0] + sum[1] + sum[2] + sum[3];
+
+#else
+                OutputType sum = biases_[i];
+                for (IndexType j = 0; j < kInputDimensions; ++j) {
+                    sum += weights_[offset + j] * input[j];
+                }
+
+                output[i] = sum;
+#endif
+
+            }
+#if defined(USE_MMX)
+            _mm_empty();
+#endif
+            return output;
        }
-        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC
-        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
-        output[i] = _mm_cvtsi128_si32(sum) + biases_[i];

-  #elif defined(USE_SSE2)
-        __m128i sum_lo = _mm_cvtsi32_si128(biases_[i]);
-        __m128i sum_hi = kZeros;
-        const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m128i row_j = _mm_load_si128(&row[j]);
-          __m128i input_j = _mm_load_si128(&input_vector[j]);
-          __m128i row_signs = _mm_cmpgt_epi8(kZeros, row_j);
-          __m128i extended_row_lo = _mm_unpacklo_epi8(row_j, row_signs);
-          __m128i extended_row_hi = _mm_unpackhi_epi8(row_j, row_signs);
-          __m128i extended_input_lo = _mm_unpacklo_epi8(input_j, kZeros);
-          __m128i extended_input_hi = _mm_unpackhi_epi8(input_j, kZeros);
-          __m128i product_lo = _mm_madd_epi16(extended_row_lo, extended_input_lo);
-          __m128i product_hi = _mm_madd_epi16(extended_row_hi, extended_input_hi);
-          sum_lo = _mm_add_epi32(sum_lo, product_lo);
-          sum_hi = _mm_add_epi32(sum_hi, product_hi);
-        }
-        __m128i sum = _mm_add_epi32(sum_lo, sum_hi);
-        __m128i sum_high_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2));
-        sum = _mm_add_epi32(sum, sum_high_64);
-        __m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2));
-        sum = _mm_add_epi32(sum, sum_second_32);
-        output[i] = _mm_cvtsi128_si32(sum);
+    private:
+        using BiasType = OutputType;
+        using WeightType = std::int8_t;

-  #elif defined(USE_MMX)
-        __m64 sum_lo = _mm_cvtsi32_si64(biases_[i]);
-        __m64 sum_hi = kZeros;
-        const auto row = reinterpret_cast<const __m64*>(&weights_[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m64 row_j = row[j];
-          __m64 input_j = input_vector[j];
-          __m64 row_signs = _mm_cmpgt_pi8(kZeros, row_j);
-          __m64 extended_row_lo = _mm_unpacklo_pi8(row_j, row_signs);
-          __m64 extended_row_hi = _mm_unpackhi_pi8(row_j, row_signs);
-          __m64 extended_input_lo = _mm_unpacklo_pi8(input_j, kZeros);
-          __m64 extended_input_hi = _mm_unpackhi_pi8(input_j, kZeros);
-          __m64 product_lo = _mm_madd_pi16(extended_row_lo, extended_input_lo);
-          __m64 product_hi = _mm_madd_pi16(extended_row_hi, extended_input_hi);
-          sum_lo = _mm_add_pi32(sum_lo, product_lo);
-          sum_hi = _mm_add_pi32(sum_hi, product_hi);
-        }
-        __m64 sum = _mm_add_pi32(sum_lo, sum_hi);
-        sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum));
-        output[i] = _mm_cvtsi64_si32(sum);
+        // Make the learning class a friend
+        friend class Trainer<AffineTransform>;

-  #elif defined(USE_NEON)
-        int32x4_t sum = {biases_[i]};
-        const auto row = reinterpret_cast<const int8x8_t*>(&weights_[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-          int16x8_t product = vmull_s8(input_vector[j * 2], row[j * 2]);
-          product = vmlal_s8(product, input_vector[j * 2 + 1], row[j * 2 + 1]);
-          sum = vpadalq_s16(sum, product);
-        }
-        output[i] = sum[0] + sum[1] + sum[2] + sum[3];
+        PreviousLayer previous_layer_;

-  #else
-        OutputType sum = biases_[i];
-        for (IndexType j = 0; j < kInputDimensions; ++j) {
-          sum += weights_[offset + j] * input[j];
-        }
-        output[i] = sum;
-  #endif
-
-      }
-  #if defined(USE_MMX)
-      _mm_empty();
-  #endif
-      return output;
-    }
-
-   private:
-    using BiasType = OutputType;
-    using WeightType = std::int8_t;
-
-    // Make the learning class a friend
-    friend class Trainer<AffineTransform>;
-
-    PreviousLayer previous_layer_;
-
-    alignas(kCacheLineSize) BiasType biases_[kOutputDimensions];
-    alignas(kCacheLineSize)
-        WeightType weights_[kOutputDimensions * kPaddedInputDimensions];
-  };
+        alignas(kCacheLineSize) BiasType biases_[kOutputDimensions];
+        alignas(kCacheLineSize) WeightType weights_[kOutputDimensions * kPaddedInputDimensions];
+    };

 }  // namespace Eval::NNUE::Layers

--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)

-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.

-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.

-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

 // Definition of layer ClippedReLU of NNUE evaluation function
@@ -21,160 +21,169 @@
 #ifndef NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
 #define NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED

-#include "../nnue_common.h"
+#include "nnue/nnue_common.h"
+
+#include <string>
+#include <cstdint>
+#include <type_traits>

 namespace Eval::NNUE::Layers {

-  // Clipped ReLU
-  template <typename PreviousLayer>
-  class ClippedReLU {
-   public:
-    // Input/output type
-    using InputType = typename PreviousLayer::OutputType;
-    using OutputType = std::uint8_t;
-    static_assert(std::is_same<InputType, std::int32_t>::value, "");
+    // Clipped ReLU
+    template <typename PreviousLayer>
+    class ClippedReLU {
+    public:
+        // Input/output type
+        using InputType = typename PreviousLayer::OutputType;

-    // Number of input/output dimensions
-    static constexpr IndexType kInputDimensions =
-        PreviousLayer::kOutputDimensions;
-    static constexpr IndexType kOutputDimensions = kInputDimensions;
+        using OutputType = std::uint8_t;

-    // Size of forward propagation buffer used in this layer
-    static constexpr std::size_t kSelfBufferSize =
-        CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+        static_assert(std::is_same<InputType, std::int32_t>::value, "");

-    // Size of the forward propagation buffer used from the input layer to this layer
-    static constexpr std::size_t kBufferSize =
-        PreviousLayer::kBufferSize + kSelfBufferSize;
+        // Number of input/output dimensions
+        static constexpr IndexType kInputDimensions =
+            PreviousLayer::kOutputDimensions;

-    // Hash value embedded in the evaluation file
-    static constexpr std::uint32_t GetHashValue() {
-      std::uint32_t hash_value = 0x538D24C7u;
-      hash_value += PreviousLayer::GetHashValue();
-      return hash_value;
-    }
+        static constexpr IndexType kOutputDimensions = kInputDimensions;

-    // A string that represents the structure from the input layer to this layer
-    static std::string GetStructureString() {
-      return "ClippedReLU[" +
-        std::to_string(kOutputDimensions) + "](" +
-        PreviousLayer::GetStructureString() + ")";
-    }
+        // Size of forward propagation buffer used in this layer
+        static constexpr std::size_t kSelfBufferSize =
+            CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);

-    // Read network parameters
-    bool ReadParameters(std::istream& stream) {
-      return previous_layer_.ReadParameters(stream);
-    }
+        // Size of the forward propagation buffer used from the input layer to this layer
+        static constexpr std::size_t kBufferSize =
+            PreviousLayer::kBufferSize + kSelfBufferSize;

-    // write parameters
-    bool WriteParameters(std::ostream& stream) const {
-      return previous_layer_.WriteParameters(stream);
-    }
+        // Hash value embedded in the evaluation file
+        static constexpr std::uint32_t GetHashValue() {
+            std::uint32_t hash_value = 0x538D24C7u;
+            hash_value += PreviousLayer::GetHashValue();
+            return hash_value;
+        }

-    // Forward propagation
-    const OutputType* Propagate(
-        const TransformedFeatureType* transformed_features, char* buffer) const {
-      const auto input = previous_layer_.Propagate(
-          transformed_features, buffer + kSelfBufferSize);
-      const auto output = reinterpret_cast<OutputType*>(buffer);
+        // A string that represents the structure from the input layer to this layer
+        static std::string GetStructureString() {
+            return "ClippedReLU[" +
+                std::to_string(kOutputDimensions) + "](" +
+                PreviousLayer::GetStructureString() + ")";
+        }

-  #if defined(USE_AVX2)
-      constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
-      const __m256i kZero = _mm256_setzero_si256();
-      const __m256i kOffsets = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
-      const auto in = reinterpret_cast<const __m256i*>(input);
-      const auto out = reinterpret_cast<__m256i*>(output);
-      for (IndexType i = 0; i < kNumChunks; ++i) {
-        const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
-            _mm256_loadA_si256(&in[i * 4 + 0]),
-            _mm256_loadA_si256(&in[i * 4 + 1])), kWeightScaleBits);
-        const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
-            _mm256_loadA_si256(&in[i * 4 + 2]),
-            _mm256_loadA_si256(&in[i * 4 + 3])), kWeightScaleBits);
-        _mm256_storeA_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
-            _mm256_packs_epi16(words0, words1), kZero), kOffsets));
-      }
-      constexpr IndexType kStart = kNumChunks * kSimdWidth;
+        // Read network parameters
+        bool ReadParameters(std::istream& stream) {
+            return previous_layer_.ReadParameters(stream);
+        }

-  #elif defined(USE_SSE2)
-      constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
+        // write parameters
+        bool WriteParameters(std::ostream& stream) const {
+            return previous_layer_.WriteParameters(stream);
+        }

-  #ifdef USE_SSE41
-      const __m128i kZero = _mm_setzero_si128();
-  #else
-      const __m128i k0x80s = _mm_set1_epi8(-128);
-  #endif
+        // Forward propagation
+        const OutputType* Propagate(
+            const TransformedFeatureType* transformed_features, char* buffer) const {

-      const auto in = reinterpret_cast<const __m128i*>(input);
-      const auto out = reinterpret_cast<__m128i*>(output);
-      for (IndexType i = 0; i < kNumChunks; ++i) {
-        const __m128i words0 = _mm_srai_epi16(_mm_packs_epi32(
-            _mm_load_si128(&in[i * 4 + 0]),
-            _mm_load_si128(&in[i * 4 + 1])), kWeightScaleBits);
-        const __m128i words1 = _mm_srai_epi16(_mm_packs_epi32(
-            _mm_load_si128(&in[i * 4 + 2]),
-            _mm_load_si128(&in[i * 4 + 3])), kWeightScaleBits);
-        const __m128i packedbytes = _mm_packs_epi16(words0, words1);
-        _mm_store_si128(&out[i],
+            const auto input = previous_layer_.Propagate(
+                transformed_features, buffer + kSelfBufferSize);
+            const auto output = reinterpret_cast<OutputType*>(buffer);

-  #ifdef USE_SSE41
-          _mm_max_epi8(packedbytes, kZero)
-  #else
-          _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
-  #endif
+#if defined(USE_AVX2)
+            constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
+            const __m256i kZero = _mm256_setzero_si256();
+            const __m256i kOffsets = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+            const auto in = reinterpret_cast<const __m256i*>(input);
+            const auto out = reinterpret_cast<__m256i*>(output);
+            for (IndexType i = 0; i < kNumChunks; ++i) {
+                const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
+                    _mm256_loadA_si256(&in[i * 4 + 0]),
+                    _mm256_loadA_si256(&in[i * 4 + 1])), kWeightScaleBits);
+                const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
+                    _mm256_loadA_si256(&in[i * 4 + 2]),
+                    _mm256_loadA_si256(&in[i * 4 + 3])), kWeightScaleBits);
+                _mm256_storeA_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
+                    _mm256_packs_epi16(words0, words1), kZero), kOffsets));
+            }

-        );
-      }
-      constexpr IndexType kStart = kNumChunks * kSimdWidth;
+            constexpr IndexType kStart = kNumChunks * kSimdWidth;

-  #elif defined(USE_MMX)
-      constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
-      const __m64 k0x80s = _mm_set1_pi8(-128);
-      const auto in = reinterpret_cast<const __m64*>(input);
-      const auto out = reinterpret_cast<__m64*>(output);
-      for (IndexType i = 0; i < kNumChunks; ++i) {
-        const __m64 words0 = _mm_srai_pi16(
-            _mm_packs_pi32(in[i * 4 + 0], in[i * 4 + 1]),
-            kWeightScaleBits);
-        const __m64 words1 = _mm_srai_pi16(
-            _mm_packs_pi32(in[i * 4 + 2], in[i * 4 + 3]),
-            kWeightScaleBits);
-        const __m64 packedbytes = _mm_packs_pi16(words0, words1);
-        out[i] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
-      }
-      _mm_empty();
-      constexpr IndexType kStart = kNumChunks * kSimdWidth;
+#elif defined(USE_SSE2)
+            constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;

-  #elif defined(USE_NEON)
-      constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2);
-      const int8x8_t kZero = {0};
-      const auto in = reinterpret_cast<const int32x4_t*>(input);
-      const auto out = reinterpret_cast<int8x8_t*>(output);
-      for (IndexType i = 0; i < kNumChunks; ++i) {
-        int16x8_t shifted;
-        const auto pack = reinterpret_cast<int16x4_t*>(&shifted);
-        pack[0] = vqshrn_n_s32(in[i * 2 + 0], kWeightScaleBits);
-        pack[1] = vqshrn_n_s32(in[i * 2 + 1], kWeightScaleBits);
-        out[i] = vmax_s8(vqmovn_s16(shifted), kZero);
-      }
-      constexpr IndexType kStart = kNumChunks * (kSimdWidth / 2);
-  #else
-      constexpr IndexType kStart = 0;
-  #endif
+#if defined(USE_SSE41)
+            const __m128i kZero = _mm_setzero_si128();
+#else
+            const __m128i k0x80s = _mm_set1_epi8(-128);
+#endif

-      for (IndexType i = kStart; i < kInputDimensions; ++i) {
-        output[i] = static_cast<OutputType>(
-            std::max(0, std::min(127, input[i] >> kWeightScaleBits)));
-      }
-      return output;
-    }
+            const auto in = reinterpret_cast<const __m128i*>(input);
+            const auto out = reinterpret_cast<__m128i*>(output);
+            for (IndexType i = 0; i < kNumChunks; ++i) {
+                const __m128i words0 = _mm_srai_epi16(_mm_packs_epi32(
+                    _mm_load_si128(&in[i * 4 + 0]),
+                    _mm_load_si128(&in[i * 4 + 1])), kWeightScaleBits);
+                const __m128i words1 = _mm_srai_epi16(_mm_packs_epi32(
+                    _mm_load_si128(&in[i * 4 + 2]),
+                    _mm_load_si128(&in[i * 4 + 3])), kWeightScaleBits);
+                const __m128i packedbytes = _mm_packs_epi16(words0, words1);
+                _mm_store_si128(&out[i],

-   private:
-     // Make the learning class a friend
-     friend class Trainer<ClippedReLU>;
-     
-    PreviousLayer previous_layer_;
-  };
+#if defined(USE_SSE41)
+                    _mm_max_epi8(packedbytes, kZero)
+            #else
+                    _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
+#endif
+
+                );
+            }
+            constexpr IndexType kStart = kNumChunks * kSimdWidth;
+
+#elif defined(USE_MMX)
+            constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
+            const __m64 k0x80s = _mm_set1_pi8(-128);
+            const auto in = reinterpret_cast<const __m64*>(input);
+            const auto out = reinterpret_cast<__m64*>(output);
+            for (IndexType i = 0; i < kNumChunks; ++i) {
+                const __m64 words0 = _mm_srai_pi16(
+                    _mm_packs_pi32(in[i * 4 + 0], in[i * 4 + 1]),
+                    kWeightScaleBits);
+                const __m64 words1 = _mm_srai_pi16(
+                    _mm_packs_pi32(in[i * 4 + 2], in[i * 4 + 3]),
+                    kWeightScaleBits);
+                const __m64 packedbytes = _mm_packs_pi16(words0, words1);
+                out[i] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
+            }
+            _mm_empty();
+            constexpr IndexType kStart = kNumChunks * kSimdWidth;
+
+#elif defined(USE_NEON)
+            constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2);
+            const int8x8_t kZero = {0};
+            const auto in = reinterpret_cast<const int32x4_t*>(input);
+            const auto out = reinterpret_cast<int8x8_t*>(output);
+            for (IndexType i = 0; i < kNumChunks; ++i) {
+                int16x8_t shifted;
+                const auto pack = reinterpret_cast<int16x4_t*>(&shifted);
+                pack[0] = vqshrn_n_s32(in[i * 2 + 0], kWeightScaleBits);
+                pack[1] = vqshrn_n_s32(in[i * 2 + 1], kWeightScaleBits);
+                out[i] = vmax_s8(vqmovn_s16(shifted), kZero);
+            }
+            constexpr IndexType kStart = kNumChunks * (kSimdWidth / 2);
+#else
+            constexpr IndexType kStart = 0;
+#endif
+
+            for (IndexType i = kStart; i < kInputDimensions; ++i) {
+                output[i] = static_cast<OutputType>(
+                    std::max(0, std::min(127, input[i] >> kWeightScaleBits)));
+            }
+            return output;
+        }
+
+    private:
+        // Make the learning class a friend
+        friend class Trainer<ClippedReLU>;
+
+        PreviousLayer previous_layer_;
+    };

 }  // namespace Eval::NNUE::Layers

--- a/src/nnue/layers/input_slice.h
+++ b/src/nnue/layers/input_slice.h
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)

-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.

-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.

-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

 // NNUE evaluation function layer InputSlice definition
@@ -21,59 +21,63 @@
 #ifndef NNUE_LAYERS_INPUT_SLICE_H_INCLUDED
 #define NNUE_LAYERS_INPUT_SLICE_H_INCLUDED

-#include "../nnue_common.h"
+#include "nnue/nnue_common.h"
+
+#include <string>
+#include <cstdint>

 namespace Eval::NNUE::Layers {

-// Input layer
-template <IndexType OutputDimensions, IndexType Offset = 0>
-class InputSlice {
- public:
-  // Need to maintain alignment
-  static_assert(Offset % kMaxSimdWidth == 0, "");
+  // Input layer
+  template <IndexType OutputDimensions, IndexType Offset = 0>
+  class InputSlice {
+  public:
+      // Need to maintain alignment
+      static_assert(Offset % kMaxSimdWidth == 0, "");

-  // Output type
-  using OutputType = TransformedFeatureType;
+      // Output type
+      using OutputType = TransformedFeatureType;

-  // Output dimensionality
-  static constexpr IndexType kOutputDimensions = OutputDimensions;
+      // Output dimensionality
+      static constexpr IndexType kOutputDimensions = OutputDimensions;

-  // Size of forward propagation buffer used from the input layer to this layer
-  static constexpr std::size_t kBufferSize = 0;
+      // Size of forward propagation buffer used from the input layer to this layer
+      static constexpr std::size_t kBufferSize = 0;

-  // Hash value embedded in the evaluation file
-  static constexpr std::uint32_t GetHashValue() {
-    std::uint32_t hash_value = 0xEC42E90Du;
-    hash_value ^= kOutputDimensions ^ (Offset << 10);
-    return hash_value;
-  }
+      // Hash value embedded in the evaluation file
+      static constexpr std::uint32_t GetHashValue() {
+          std::uint32_t hash_value = 0xEC42E90Du;
+          hash_value ^= kOutputDimensions ^ (Offset << 10);
+          return hash_value;
+      }

-  // A string that represents the structure from the input layer to this layer
-  static std::string GetStructureString() {
-    return "InputSlice[" + std::to_string(kOutputDimensions) + "(" +
-      std::to_string(Offset) + ":" +
-      std::to_string(Offset + kOutputDimensions) + ")]";
-  }
+      // A string that represents the structure from the input layer to this layer
+      static std::string GetStructureString() {
+          return "InputSlice[" + std::to_string(kOutputDimensions) + "(" +
+              std::to_string(Offset) + ":" +
+              std::to_string(Offset + kOutputDimensions) + ")]";
+      }

-  // Read network parameters
-  bool ReadParameters(std::istream& /*stream*/) {
-    return true;
-  }
+      // Read network parameters
+      bool ReadParameters(std::istream& /*stream*/) {
+          return true;
+      }

-  // write parameters
-  bool WriteParameters(std::ostream& /*stream*/) const {
-    return true;
-  }
+      // write parameters
+      bool WriteParameters(std::ostream& /*stream*/) const {
+          return true;
+      }

-  // Forward propagation
-  const OutputType* Propagate(
-      const TransformedFeatureType* transformed_features,
-      char* /*buffer*/) const {
-    return transformed_features + Offset;
-  }
+      // Forward propagation
+      const OutputType* Propagate(
+          const TransformedFeatureType* transformed_features,
+          char* /*buffer*/) const {

- private:
-};
+          return transformed_features + Offset;
+      }
+
+  private:
+  };

 }  // namespace Layers

--- a/src/nnue/layers/sum.h
+++ b/src/nnue/layers/sum.h
@@ -1,159 +1,166 @@
-// Definition of layer Sum of NNUE evaluation function
-
-#ifndef _NNUE_LAYERS_SUM_H_
+#ifndef _NNUE_LAYERS_SUM_H_
 #define _NNUE_LAYERS_SUM_H_

-#include "../nnue_common.h"
+#include "nnue/nnue_common.h"

-namespace Eval {
+// Definition of layer Sum of NNUE evaluation function
+namespace Eval::NNUE::Layers {

-namespace NNUE {
+    // Layer that sums the output of multiple layers
+    template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
+    class Sum : public Sum<RemainingPreviousLayers...> {
+    private:
+        using Head = FirstPreviousLayer;
+        using Tail = Sum<RemainingPreviousLayers...>;

-namespace Layers {
+     public:
+        // Input/output type
+        using InputType = typename Head::OutputType;

-// Layer that sums the output of multiple layers
-template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
-class Sum : public Sum<RemainingPreviousLayers...> {
- private:
-  using Head = FirstPreviousLayer;
-  using Tail = Sum<RemainingPreviousLayers...>;
+        using OutputType = InputType;

- public:
-  // Input/output type
-  using InputType = typename Head::OutputType;
-  using OutputType = InputType;
-  static_assert(std::is_same<InputType, typename Tail::InputType>::value, "");
+        static_assert(std::is_same<InputType, typename Tail::InputType>::value, "");

-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions = Head::kOutputDimensions;
-  static constexpr IndexType kOutputDimensions = kInputDimensions;
-  static_assert(kInputDimensions == Tail::kInputDimensions ,"");
+        // number of input/output dimensions
+        static constexpr IndexType kInputDimensions = Head::kOutputDimensions;

-  // Size of forward propagation buffer used in this layer
-  static constexpr std::size_t kSelfBufferSize =
-      CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+        static constexpr IndexType kOutputDimensions = kInputDimensions;

-  // Size of the forward propagation buffer used from the input layer to this layer
-  static constexpr std::size_t kBufferSize =
-      std::max(Head::kBufferSize + kSelfBufferSize, Tail::kBufferSize);
+        static_assert(kInputDimensions == Tail::kInputDimensions ,"");

-  // Hash value embedded in the evaluation function file
-  static constexpr std::uint32_t GetHashValue() {
-    std::uint32_t hash_value = 0xBCE400B4u;
-    hash_value ^= Head::GetHashValue() >> 1;
-    hash_value ^= Head::GetHashValue() << 31;
-    hash_value ^= Tail::GetHashValue() >> 2;
-    hash_value ^= Tail::GetHashValue() << 30;
-    return hash_value;
-  }
+        // Size of forward propagation buffer used in this layer
+        static constexpr std::size_t kSelfBufferSize =
+            CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);

-  // A string that represents the structure from the input layer to this layer
-  static std::string GetStructureString() {
-    return "Sum[" +
-        std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
-  }
+        // Size of the forward propagation buffer used from the input layer to this layer
+        static constexpr std::size_t kBufferSize =
+            std::max(Head::kBufferSize + kSelfBufferSize, Tail::kBufferSize);

-  // read parameters
-  bool ReadParameters(std::istream& stream) {
-    if (!Tail::ReadParameters(stream)) return false;
-    return previous_layer_.ReadParameters(stream);
-  }
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t GetHashValue() {
+            std::uint32_t hash_value = 0xBCE400B4u;
+            hash_value ^= Head::GetHashValue() >> 1;
+            hash_value ^= Head::GetHashValue() << 31;
+            hash_value ^= Tail::GetHashValue() >> 2;
+            hash_value ^= Tail::GetHashValue() << 30;
+            return hash_value;
+        }

-  // write parameters
-  bool WriteParameters(std::ostream& stream) const {
-    if (!Tail::WriteParameters(stream)) return false;
-    return previous_layer_.WriteParameters(stream);
-  }
+        // A string that represents the structure from the input layer to this layer
+        static std::string GetStructureString() {
+            return "Sum[" +
+                std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
+        }

-  // forward propagation
-  const OutputType* Propagate(
-      const TransformedFeatureType* transformed_features, char* buffer) const {
-    Tail::Propagate(transformed_features, buffer);
-    const auto head_output = previous_layer_.Propagate(
-        transformed_features, buffer + kSelfBufferSize);
-    const auto output = reinterpret_cast<OutputType*>(buffer);
-    for (IndexType i = 0; i <kOutputDimensions; ++i) {
-      output[i] += head_output[i];
-    }
-    return output;
-  }
+        // read parameters
+        bool ReadParameters(std::istream& stream) {
+            if (!Tail::ReadParameters(stream))
+                return false;

- protected:
-  // A string that represents the list of layers to be summed
-  static std::string GetSummandsString() {
-    return Head::GetStructureString() + "," + Tail::GetSummandsString();
-  }
+            return previous_layer_.ReadParameters(stream);
+        }

-  // Make the learning class a friend
-  friend class Trainer<Sum>;
+        // write parameters
+        bool WriteParameters(std::ostream& stream) const {
+            if (!Tail::WriteParameters(stream))
+                return false;

-  // the layer immediately before this layer
-  FirstPreviousLayer previous_layer_;
-};
+            return previous_layer_.WriteParameters(stream);
+        }

-// Layer that sums the output of multiple layers (when there is one template argument)
-template <typename PreviousLayer>
-class Sum<PreviousLayer> {
- public:
-  // Input/output type
-  using InputType = typename PreviousLayer::OutputType;
-  using OutputType = InputType;
+        // forward propagation
+        const OutputType* Propagate(
+            const TransformedFeatureType* transformed_features, char* buffer) const {

-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions =
-      PreviousLayer::kOutputDimensions;
-  static constexpr IndexType kOutputDimensions = kInputDimensions;
+            Tail::Propagate(transformed_features, buffer);

-  // Size of the forward propagation buffer used from the input layer to this layer
-  static constexpr std::size_t kBufferSize = PreviousLayer::kBufferSize;
+            const auto head_output = previous_layer_.Propagate(
+                transformed_features, buffer + kSelfBufferSize);

-  // Hash value embedded in the evaluation function file
-  static constexpr std::uint32_t GetHashValue() {
-    std::uint32_t hash_value = 0xBCE400B4u;
-    hash_value ^= PreviousLayer::GetHashValue() >> 1;
-    hash_value ^= PreviousLayer::GetHashValue() << 31;
-    return hash_value;
-  }
+            const auto output = reinterpret_cast<OutputType*>(buffer);

-  // A string that represents the structure from the input layer to this layer
-  static std::string GetStructureString() {
-    return "Sum[" +
-        std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
-  }
+            for (IndexType i = 0; i <kOutputDimensions; ++i) {
+                output[i] += head_output[i];
+            }

-  // read parameters
-  bool ReadParameters(std::istream& stream) {
-    return previous_layer_.ReadParameters(stream);
-  }
+            return output;
+        }

-  // write parameters
-  bool WriteParameters(std::ostream& stream) const {
-    return previous_layer_.WriteParameters(stream);
-  }
+    protected:
+        // A string that represents the list of layers to be summed
+        static std::string GetSummandsString() {
+            return Head::GetStructureString() + "," + Tail::GetSummandsString();
+        }

-  // forward propagation
-  const OutputType* Propagate(
-      const TransformedFeatureType* transformed_features, char* buffer) const {
-    return previous_layer_.Propagate(transformed_features, buffer);
-  }
+        // Make the learning class a friend
+        friend class Trainer<Sum>;

- protected:
-  // A string that represents the list of layers to be summed
-  static std::string GetSummandsString() {
-    return PreviousLayer::GetStructureString();
-  }
+        // the layer immediately before this layer
+        FirstPreviousLayer previous_layer_;
+    };

-  // Make the learning class a friend
-  friend class Trainer<Sum>;
+    // Layer that sums the output of multiple layers (when there is one template argument)
+    template <typename PreviousLayer>
+    class Sum<PreviousLayer> {
+    public:
+        // Input/output type
+        using InputType = typename PreviousLayer::OutputType;

-  // the layer immediately before this layer
-  PreviousLayer previous_layer_;
-};
+        using OutputType = InputType;

-}  // namespace Layers
+        // number of input/output dimensions
+        static constexpr IndexType kInputDimensions =
+            PreviousLayer::kOutputDimensions;

-}  // namespace NNUE
+        static constexpr IndexType kOutputDimensions = kInputDimensions;

-}  // namespace Eval
+        // Size of the forward propagation buffer used from the input layer to this layer
+        static constexpr std::size_t kBufferSize = PreviousLayer::kBufferSize;
+
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t GetHashValue() {
+            std::uint32_t hash_value = 0xBCE400B4u;
+            hash_value ^= PreviousLayer::GetHashValue() >> 1;
+            hash_value ^= PreviousLayer::GetHashValue() << 31;
+            return hash_value;
+        }
+
+        // A string that represents the structure from the input layer to this layer
+        static std::string GetStructureString() {
+            return "Sum[" +
+                std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
+        }
+
+        // read parameters
+        bool ReadParameters(std::istream& stream) {
+            return previous_layer_.ReadParameters(stream);
+        }
+
+        // write parameters
+        bool WriteParameters(std::ostream& stream) const {
+            return previous_layer_.WriteParameters(stream);
+        }
+
+        // forward propagation
+        const OutputType* Propagate(
+            const TransformedFeatureType* transformed_features, char* buffer) const {
+
+            return previous_layer_.Propagate(transformed_features, buffer);
+        }
+
+    protected:
+        // A string that represents the list of layers to be summed
+        static std::string GetSummandsString() {
+            return PreviousLayer::GetStructureString();
+        }
+
+        // Make the learning class a friend
+        friend class Trainer<Sum>;
+
+        // the layer immediately before this layer
+        PreviousLayer previous_layer_;
+    };
+
+}  // namespace Eval::NNUE::Layers

 #endif