diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index f24578a8..cc5e5eef 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -1,19 +1,19 @@ /* - Stockfish, a UCI chess playing engine derived from Glaurung 2.1 - Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file) + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file) - Stockfish is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. - Stockfish is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. - You should have received a copy of the GNU General Public License - along with this program. If not, see . + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ // Definition of layer AffineTransform of NNUE evaluation function @@ -21,267 +21,290 @@ #ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED #define NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED -#include -#include "../nnue_common.h" +#include "nnue/nnue_common.h" + +#include +#include +#include namespace Eval::NNUE::Layers { - // Affine transformation layer - template - class AffineTransform { - public: - // Input/output type - using InputType = typename PreviousLayer::OutputType; - using OutputType = std::int32_t; - static_assert(std::is_same::value, ""); + // Affine transformation layer + template + class AffineTransform { + public: + // Input/output type + using InputType = typename PreviousLayer::OutputType; - // Number of input/output dimensions - static constexpr IndexType kInputDimensions = - PreviousLayer::kOutputDimensions; - static constexpr IndexType kOutputDimensions = OutputDimensions; - static constexpr IndexType kPaddedInputDimensions = - CeilToMultiple(kInputDimensions, kMaxSimdWidth); + using OutputType = std::int32_t; - // Size of forward propagation buffer used in this layer - static constexpr std::size_t kSelfBufferSize = - CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize); + static_assert(std::is_same::value, ""); - // Size of the forward propagation buffer used from the input layer to this layer - static constexpr std::size_t kBufferSize = - PreviousLayer::kBufferSize + kSelfBufferSize; + // Number of input/output dimensions + static constexpr IndexType kInputDimensions = + PreviousLayer::kOutputDimensions; - // Hash value embedded in the evaluation file - static constexpr std::uint32_t GetHashValue() { - std::uint32_t hash_value = 0xCC03DAE4u; - hash_value += kOutputDimensions; - hash_value ^= PreviousLayer::GetHashValue() >> 1; - hash_value ^= PreviousLayer::GetHashValue() << 31; - return hash_value; - } + 
static constexpr IndexType kOutputDimensions = OutputDimensions; - // A string that represents the structure from the input layer to this layer - static std::string GetStructureString() { - return "AffineTransform[" + - std::to_string(kOutputDimensions) + "<-" + - std::to_string(kInputDimensions) + "](" + - PreviousLayer::GetStructureString() + ")"; - } - - // Read network parameters - bool ReadParameters(std::istream& stream) { - if (!previous_layer_.ReadParameters(stream)) return false; - for (std::size_t i = 0; i < kOutputDimensions; ++i) - biases_[i] = read_little_endian(stream); - for (std::size_t i = 0; i < kOutputDimensions * kPaddedInputDimensions; ++i) - weights_[i] = read_little_endian(stream); - return !stream.fail(); - } + static constexpr IndexType kPaddedInputDimensions = + CeilToMultiple(kInputDimensions, kMaxSimdWidth); - // write parameters - bool WriteParameters(std::ostream& stream) const { - if (!previous_layer_.WriteParameters(stream)) return false; - stream.write(reinterpret_cast(biases_), - kOutputDimensions * sizeof(BiasType)); - stream.write(reinterpret_cast(weights_), - kOutputDimensions * kPaddedInputDimensions * - sizeof(WeightType)); - return !stream.fail(); - } + // Size of forward propagation buffer used in this layer + static constexpr std::size_t kSelfBufferSize = + CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize); - // Forward propagation - const OutputType* Propagate( - const TransformedFeatureType* transformed_features, char* buffer) const { - const auto input = previous_layer_.Propagate( - transformed_features, buffer + kSelfBufferSize); - const auto output = reinterpret_cast(buffer); + // Size of the forward propagation buffer used from the input layer to this layer + static constexpr std::size_t kBufferSize = + PreviousLayer::kBufferSize + kSelfBufferSize; - #if defined(USE_AVX512) - constexpr IndexType kNumChunks = kPaddedInputDimensions / (kSimdWidth * 2); - const auto input_vector = reinterpret_cast(input); - #if !defined(USE_VNNI) - const __m512i kOnes = _mm512_set1_epi16(1); - #endif - - #elif defined(USE_AVX2) - constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; - const auto input_vector = reinterpret_cast(input); - #if !defined(USE_VNNI) - const __m256i kOnes = _mm256_set1_epi16(1); - #endif - - #elif defined(USE_SSE2) - constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; - #ifndef USE_SSSE3 - const __m128i kZeros = _mm_setzero_si128(); - #else - const __m128i kOnes = _mm_set1_epi16(1); - #endif - const auto input_vector = reinterpret_cast(input); - - #elif defined(USE_MMX) - constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; - const __m64 kZeros = _mm_setzero_si64(); - const auto input_vector = reinterpret_cast(input); - - #elif defined(USE_NEON) - constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; - const auto input_vector = reinterpret_cast(input); - #endif - - for (IndexType i = 0; i < kOutputDimensions; ++i) { - const IndexType offset = i * kPaddedInputDimensions; - - #if defined(USE_AVX512) - __m512i sum = _mm512_setzero_si512(); - const auto row = reinterpret_cast(&weights_[offset]); - for (IndexType j = 0; j < kNumChunks; ++j) { - #if defined(USE_VNNI) - sum = _mm512_dpbusd_epi32(sum, _mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j])); - #else - __m512i product = _mm512_maddubs_epi16(_mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j])); - product = _mm512_madd_epi16(product, kOnes); - sum = _mm512_add_epi32(sum, 
product); - #endif + // Hash value embedded in the evaluation file + static constexpr std::uint32_t GetHashValue() { + std::uint32_t hash_value = 0xCC03DAE4u; + hash_value += kOutputDimensions; + hash_value ^= PreviousLayer::GetHashValue() >> 1; + hash_value ^= PreviousLayer::GetHashValue() << 31; + return hash_value; } - // Note: Changing kMaxSimdWidth from 32 to 64 breaks loading existing networks. - // As a result kPaddedInputDimensions may not be an even multiple of 64(512bit) - // and we have to do one more 256bit chunk. - if (kPaddedInputDimensions != kNumChunks * kSimdWidth * 2) - { - const auto iv256 = reinterpret_cast(&input_vector[kNumChunks]); - const auto row256 = reinterpret_cast(&row[kNumChunks]); - #if defined(USE_VNNI) - __m256i product256 = _mm256_dpbusd_epi32( - _mm512_castsi512_si256(sum), _mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0])); - sum = _mm512_inserti32x8(sum, product256, 0); - #else - __m256i product256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0])); - sum = _mm512_add_epi32(sum, _mm512_cvtepi16_epi32(product256)); - #endif + // A string that represents the structure from the input layer to this layer + static std::string GetStructureString() { + return "AffineTransform[" + + std::to_string(kOutputDimensions) + "<-" + + std::to_string(kInputDimensions) + "](" + + PreviousLayer::GetStructureString() + ")"; } - output[i] = _mm512_reduce_add_epi32(sum) + biases_[i]; - #elif defined(USE_AVX2) - __m256i sum = _mm256_setzero_si256(); - const auto row = reinterpret_cast(&weights_[offset]); - for (IndexType j = 0; j < kNumChunks; ++j) { - #if defined(USE_VNNI) - sum = _mm256_dpbusd_epi32(sum, _mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j])); - #else - __m256i product = _mm256_maddubs_epi16(_mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j])); - product = _mm256_madd_epi16(product, kOnes); - sum = _mm256_add_epi32(sum, product); - #endif + // Read network parameters + bool ReadParameters(std::istream& stream) { + if (!previous_layer_.ReadParameters(stream)) + return false; + + for (std::size_t i = 0; i < kOutputDimensions; ++i) + biases_[i] = read_little_endian(stream); + + for (std::size_t i = 0; i < kOutputDimensions * kPaddedInputDimensions; ++i) + weights_[i] = read_little_endian(stream); + + return !stream.fail(); } - __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1)); - sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC)); - sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB)); - output[i] = _mm_cvtsi128_si32(sum128) + biases_[i]; - #elif defined(USE_SSSE3) - __m128i sum = _mm_setzero_si128(); - const auto row = reinterpret_cast(&weights_[offset]); - for (int j = 0; j < (int)kNumChunks - 1; j += 2) { - __m128i product0 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j])); - product0 = _mm_madd_epi16(product0, kOnes); - sum = _mm_add_epi32(sum, product0); - __m128i product1 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j+1]), _mm_load_si128(&row[j+1])); - product1 = _mm_madd_epi16(product1, kOnes); - sum = _mm_add_epi32(sum, product1); + // write parameters + bool WriteParameters(std::ostream& stream) const { + if (!previous_layer_.WriteParameters(stream)) + return false; + + stream.write(reinterpret_cast(biases_), + kOutputDimensions * sizeof(BiasType)); + + stream.write(reinterpret_cast(weights_), + kOutputDimensions * kPaddedInputDimensions * + sizeof(WeightType)); + + 
return !stream.fail(); } - if (kNumChunks & 0x1) { - __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[kNumChunks-1]), _mm_load_si128(&row[kNumChunks-1])); - product = _mm_madd_epi16(product, kOnes); - sum = _mm_add_epi32(sum, product); + + // Forward propagation + const OutputType* Propagate( + const TransformedFeatureType* transformed_features, char* buffer) const { + + const auto input = previous_layer_.Propagate( + transformed_features, buffer + kSelfBufferSize); + const auto output = reinterpret_cast(buffer); + +#if defined(USE_AVX512) + constexpr IndexType kNumChunks = kPaddedInputDimensions / (kSimdWidth * 2); + const auto input_vector = reinterpret_cast(input); +#if !defined(USE_VNNI) + const __m512i kOnes = _mm512_set1_epi16(1); +#endif + +#elif defined(USE_AVX2) + constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; + const auto input_vector = reinterpret_cast(input); +#if !defined(USE_VNNI) + const __m256i kOnes = _mm256_set1_epi16(1); +#endif + +#elif defined(USE_SSE2) + constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; +#ifndef USE_SSSE3 + const __m128i kZeros = _mm_setzero_si128(); +#else + const __m128i kOnes = _mm_set1_epi16(1); +#endif + const auto input_vector = reinterpret_cast(input); + +#elif defined(USE_MMX) + constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; + const __m64 kZeros = _mm_setzero_si64(); + const auto input_vector = reinterpret_cast(input); + +#elif defined(USE_NEON) + constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; + const auto input_vector = reinterpret_cast(input); +#endif + + for (IndexType i = 0; i < kOutputDimensions; ++i) { + const IndexType offset = i * kPaddedInputDimensions; + +#if defined(USE_AVX512) + __m512i sum = _mm512_setzero_si512(); + const auto row = reinterpret_cast(&weights_[offset]); + for (IndexType j = 0; j < kNumChunks; ++j) { +#if defined(USE_VNNI) + sum = _mm512_dpbusd_epi32(sum, _mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j])); +#else + __m512i product = _mm512_maddubs_epi16(_mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j])); + product = _mm512_madd_epi16(product, kOnes); + sum = _mm512_add_epi32(sum, product); +#endif + } + + // Note: Changing kMaxSimdWidth from 32 to 64 breaks loading existing networks. + // As a result kPaddedInputDimensions may not be an even multiple of 64(512bit) + // and we have to do one more 256bit chunk. 
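+          // Hedged worked example (kSimdWidth = 32 bytes is assumed here from
+          // nnue_common.h, not restated in this patch): each iteration of the
+          // loop above consumes kSimdWidth * 2 = 64 bytes, so a padded input
+          // width of, say, 96 gives kNumChunks = 1 with a 32-byte remainder for
+          // the 256-bit fix-up below, while a width of 32 gives kNumChunks = 0
+          // and the fix-up processes the whole row on its own.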
+ if (kPaddedInputDimensions != kNumChunks * kSimdWidth * 2) + { + const auto iv256 = reinterpret_cast(&input_vector[kNumChunks]); + const auto row256 = reinterpret_cast(&row[kNumChunks]); +#if defined(USE_VNNI) + __m256i product256 = _mm256_dpbusd_epi32( + _mm512_castsi512_si256(sum), _mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0])); + sum = _mm512_inserti32x8(sum, product256, 0); +#else + __m256i product256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0])); + sum = _mm512_add_epi32(sum, _mm512_cvtepi16_epi32(product256)); +#endif + } + + output[i] = _mm512_reduce_add_epi32(sum) + biases_[i]; + +#elif defined(USE_AVX2) + __m256i sum = _mm256_setzero_si256(); + const auto row = reinterpret_cast(&weights_[offset]); + for (IndexType j = 0; j < kNumChunks; ++j) { +#if defined(USE_VNNI) + sum = _mm256_dpbusd_epi32(sum, _mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j])); +#else + __m256i product = _mm256_maddubs_epi16(_mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j])); + product = _mm256_madd_epi16(product, kOnes); + sum = _mm256_add_epi32(sum, product); +#endif + } + + __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1)); + sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC)); + sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB)); + output[i] = _mm_cvtsi128_si32(sum128) + biases_[i]; + +#elif defined(USE_SSSE3) + __m128i sum = _mm_setzero_si128(); + const auto row = reinterpret_cast(&weights_[offset]); + for (int j = 0; j < (int)kNumChunks - 1; j += 2) { + __m128i product0 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j])); + product0 = _mm_madd_epi16(product0, kOnes); + sum = _mm_add_epi32(sum, product0); + __m128i product1 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j+1]), _mm_load_si128(&row[j+1])); + product1 = _mm_madd_epi16(product1, kOnes); + sum = _mm_add_epi32(sum, product1); + } + + if (kNumChunks & 0x1) { + __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[kNumChunks-1]), _mm_load_si128(&row[kNumChunks-1])); + product = _mm_madd_epi16(product, kOnes); + sum = _mm_add_epi32(sum, product); + } + + sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC + sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB + output[i] = _mm_cvtsi128_si32(sum) + biases_[i]; + +#elif defined(USE_SSE2) + __m128i sum_lo = _mm_cvtsi32_si128(biases_[i]); + __m128i sum_hi = kZeros; + const auto row = reinterpret_cast(&weights_[offset]); + for (IndexType j = 0; j < kNumChunks; ++j) { + __m128i row_j = _mm_load_si128(&row[j]); + __m128i input_j = _mm_load_si128(&input_vector[j]); + __m128i row_signs = _mm_cmpgt_epi8(kZeros, row_j); + __m128i extended_row_lo = _mm_unpacklo_epi8(row_j, row_signs); + __m128i extended_row_hi = _mm_unpackhi_epi8(row_j, row_signs); + __m128i extended_input_lo = _mm_unpacklo_epi8(input_j, kZeros); + __m128i extended_input_hi = _mm_unpackhi_epi8(input_j, kZeros); + __m128i product_lo = _mm_madd_epi16(extended_row_lo, extended_input_lo); + __m128i product_hi = _mm_madd_epi16(extended_row_hi, extended_input_hi); + sum_lo = _mm_add_epi32(sum_lo, product_lo); + sum_hi = _mm_add_epi32(sum_hi, product_hi); + } + + __m128i sum = _mm_add_epi32(sum_lo, sum_hi); + __m128i sum_high_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2)); + sum = _mm_add_epi32(sum, sum_high_64); + __m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2)); + sum 
= _mm_add_epi32(sum, sum_second_32); + output[i] = _mm_cvtsi128_si32(sum); + +#elif defined(USE_MMX) + __m64 sum_lo = _mm_cvtsi32_si64(biases_[i]); + __m64 sum_hi = kZeros; + const auto row = reinterpret_cast(&weights_[offset]); + for (IndexType j = 0; j < kNumChunks; ++j) { + __m64 row_j = row[j]; + __m64 input_j = input_vector[j]; + __m64 row_signs = _mm_cmpgt_pi8(kZeros, row_j); + __m64 extended_row_lo = _mm_unpacklo_pi8(row_j, row_signs); + __m64 extended_row_hi = _mm_unpackhi_pi8(row_j, row_signs); + __m64 extended_input_lo = _mm_unpacklo_pi8(input_j, kZeros); + __m64 extended_input_hi = _mm_unpackhi_pi8(input_j, kZeros); + __m64 product_lo = _mm_madd_pi16(extended_row_lo, extended_input_lo); + __m64 product_hi = _mm_madd_pi16(extended_row_hi, extended_input_hi); + sum_lo = _mm_add_pi32(sum_lo, product_lo); + sum_hi = _mm_add_pi32(sum_hi, product_hi); + } + + __m64 sum = _mm_add_pi32(sum_lo, sum_hi); + sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum)); + output[i] = _mm_cvtsi64_si32(sum); + +#elif defined(USE_NEON) + int32x4_t sum = {biases_[i]}; + const auto row = reinterpret_cast(&weights_[offset]); + for (IndexType j = 0; j < kNumChunks; ++j) { + int16x8_t product = vmull_s8(input_vector[j * 2], row[j * 2]); + product = vmlal_s8(product, input_vector[j * 2 + 1], row[j * 2 + 1]); + sum = vpadalq_s16(sum, product); + } + + output[i] = sum[0] + sum[1] + sum[2] + sum[3]; + +#else + OutputType sum = biases_[i]; + for (IndexType j = 0; j < kInputDimensions; ++j) { + sum += weights_[offset + j] * input[j]; + } + + output[i] = sum; +#endif + + } +#if defined(USE_MMX) + _mm_empty(); +#endif + return output; } - sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC - sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB - output[i] = _mm_cvtsi128_si32(sum) + biases_[i]; - #elif defined(USE_SSE2) - __m128i sum_lo = _mm_cvtsi32_si128(biases_[i]); - __m128i sum_hi = kZeros; - const auto row = reinterpret_cast(&weights_[offset]); - for (IndexType j = 0; j < kNumChunks; ++j) { - __m128i row_j = _mm_load_si128(&row[j]); - __m128i input_j = _mm_load_si128(&input_vector[j]); - __m128i row_signs = _mm_cmpgt_epi8(kZeros, row_j); - __m128i extended_row_lo = _mm_unpacklo_epi8(row_j, row_signs); - __m128i extended_row_hi = _mm_unpackhi_epi8(row_j, row_signs); - __m128i extended_input_lo = _mm_unpacklo_epi8(input_j, kZeros); - __m128i extended_input_hi = _mm_unpackhi_epi8(input_j, kZeros); - __m128i product_lo = _mm_madd_epi16(extended_row_lo, extended_input_lo); - __m128i product_hi = _mm_madd_epi16(extended_row_hi, extended_input_hi); - sum_lo = _mm_add_epi32(sum_lo, product_lo); - sum_hi = _mm_add_epi32(sum_hi, product_hi); - } - __m128i sum = _mm_add_epi32(sum_lo, sum_hi); - __m128i sum_high_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2)); - sum = _mm_add_epi32(sum, sum_high_64); - __m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2)); - sum = _mm_add_epi32(sum, sum_second_32); - output[i] = _mm_cvtsi128_si32(sum); + private: + using BiasType = OutputType; + using WeightType = std::int8_t; - #elif defined(USE_MMX) - __m64 sum_lo = _mm_cvtsi32_si64(biases_[i]); - __m64 sum_hi = kZeros; - const auto row = reinterpret_cast(&weights_[offset]); - for (IndexType j = 0; j < kNumChunks; ++j) { - __m64 row_j = row[j]; - __m64 input_j = input_vector[j]; - __m64 row_signs = _mm_cmpgt_pi8(kZeros, row_j); - __m64 extended_row_lo = _mm_unpacklo_pi8(row_j, row_signs); - __m64 extended_row_hi = _mm_unpackhi_pi8(row_j, row_signs); - __m64 
extended_input_lo = _mm_unpacklo_pi8(input_j, kZeros); - __m64 extended_input_hi = _mm_unpackhi_pi8(input_j, kZeros); - __m64 product_lo = _mm_madd_pi16(extended_row_lo, extended_input_lo); - __m64 product_hi = _mm_madd_pi16(extended_row_hi, extended_input_hi); - sum_lo = _mm_add_pi32(sum_lo, product_lo); - sum_hi = _mm_add_pi32(sum_hi, product_hi); - } - __m64 sum = _mm_add_pi32(sum_lo, sum_hi); - sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum)); - output[i] = _mm_cvtsi64_si32(sum); + // Make the learning class a friend + friend class Trainer; - #elif defined(USE_NEON) - int32x4_t sum = {biases_[i]}; - const auto row = reinterpret_cast(&weights_[offset]); - for (IndexType j = 0; j < kNumChunks; ++j) { - int16x8_t product = vmull_s8(input_vector[j * 2], row[j * 2]); - product = vmlal_s8(product, input_vector[j * 2 + 1], row[j * 2 + 1]); - sum = vpadalq_s16(sum, product); - } - output[i] = sum[0] + sum[1] + sum[2] + sum[3]; + PreviousLayer previous_layer_; - #else - OutputType sum = biases_[i]; - for (IndexType j = 0; j < kInputDimensions; ++j) { - sum += weights_[offset + j] * input[j]; - } - output[i] = sum; - #endif - - } - #if defined(USE_MMX) - _mm_empty(); - #endif - return output; - } - - private: - using BiasType = OutputType; - using WeightType = std::int8_t; - - // Make the learning class a friend - friend class Trainer; - - PreviousLayer previous_layer_; - - alignas(kCacheLineSize) BiasType biases_[kOutputDimensions]; - alignas(kCacheLineSize) - WeightType weights_[kOutputDimensions * kPaddedInputDimensions]; - }; + alignas(kCacheLineSize) BiasType biases_[kOutputDimensions]; + alignas(kCacheLineSize) WeightType weights_[kOutputDimensions * kPaddedInputDimensions]; + }; } // namespace Eval::NNUE::Layers diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h index d923986e..0846f3df 100644 --- a/src/nnue/layers/clipped_relu.h +++ b/src/nnue/layers/clipped_relu.h @@ -1,19 +1,19 @@ /* - Stockfish, a UCI chess playing engine derived from Glaurung 2.1 - Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file) + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file) - Stockfish is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. - Stockfish is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. - You should have received a copy of the GNU General Public License - along with this program. If not, see . + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
*/ // Definition of layer ClippedReLU of NNUE evaluation function @@ -21,160 +21,169 @@ #ifndef NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED #define NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED -#include "../nnue_common.h" +#include "nnue/nnue_common.h" + +#include +#include +#include namespace Eval::NNUE::Layers { - // Clipped ReLU - template - class ClippedReLU { - public: - // Input/output type - using InputType = typename PreviousLayer::OutputType; - using OutputType = std::uint8_t; - static_assert(std::is_same::value, ""); + // Clipped ReLU + template + class ClippedReLU { + public: + // Input/output type + using InputType = typename PreviousLayer::OutputType; - // Number of input/output dimensions - static constexpr IndexType kInputDimensions = - PreviousLayer::kOutputDimensions; - static constexpr IndexType kOutputDimensions = kInputDimensions; + using OutputType = std::uint8_t; - // Size of forward propagation buffer used in this layer - static constexpr std::size_t kSelfBufferSize = - CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize); + static_assert(std::is_same::value, ""); - // Size of the forward propagation buffer used from the input layer to this layer - static constexpr std::size_t kBufferSize = - PreviousLayer::kBufferSize + kSelfBufferSize; + // Number of input/output dimensions + static constexpr IndexType kInputDimensions = + PreviousLayer::kOutputDimensions; - // Hash value embedded in the evaluation file - static constexpr std::uint32_t GetHashValue() { - std::uint32_t hash_value = 0x538D24C7u; - hash_value += PreviousLayer::GetHashValue(); - return hash_value; - } + static constexpr IndexType kOutputDimensions = kInputDimensions; - // A string that represents the structure from the input layer to this layer - static std::string GetStructureString() { - return "ClippedReLU[" + - std::to_string(kOutputDimensions) + "](" + - PreviousLayer::GetStructureString() + ")"; - } + // Size of forward propagation buffer used in this layer + static constexpr std::size_t kSelfBufferSize = + CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize); - // Read network parameters - bool ReadParameters(std::istream& stream) { - return previous_layer_.ReadParameters(stream); - } + // Size of the forward propagation buffer used from the input layer to this layer + static constexpr std::size_t kBufferSize = + PreviousLayer::kBufferSize + kSelfBufferSize; - // write parameters - bool WriteParameters(std::ostream& stream) const { - return previous_layer_.WriteParameters(stream); - } + // Hash value embedded in the evaluation file + static constexpr std::uint32_t GetHashValue() { + std::uint32_t hash_value = 0x538D24C7u; + hash_value += PreviousLayer::GetHashValue(); + return hash_value; + } - // Forward propagation - const OutputType* Propagate( - const TransformedFeatureType* transformed_features, char* buffer) const { - const auto input = previous_layer_.Propagate( - transformed_features, buffer + kSelfBufferSize); - const auto output = reinterpret_cast(buffer); + // A string that represents the structure from the input layer to this layer + static std::string GetStructureString() { + return "ClippedReLU[" + + std::to_string(kOutputDimensions) + "](" + + PreviousLayer::GetStructureString() + ")"; + } - #if defined(USE_AVX2) - constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth; - const __m256i kZero = _mm256_setzero_si256(); - const __m256i kOffsets = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); - const auto in = reinterpret_cast(input); - const auto out = 
reinterpret_cast<__m256i*>(output); - for (IndexType i = 0; i < kNumChunks; ++i) { - const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32( - _mm256_loadA_si256(&in[i * 4 + 0]), - _mm256_loadA_si256(&in[i * 4 + 1])), kWeightScaleBits); - const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32( - _mm256_loadA_si256(&in[i * 4 + 2]), - _mm256_loadA_si256(&in[i * 4 + 3])), kWeightScaleBits); - _mm256_storeA_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8( - _mm256_packs_epi16(words0, words1), kZero), kOffsets)); - } - constexpr IndexType kStart = kNumChunks * kSimdWidth; + // Read network parameters + bool ReadParameters(std::istream& stream) { + return previous_layer_.ReadParameters(stream); + } - #elif defined(USE_SSE2) - constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth; + // write parameters + bool WriteParameters(std::ostream& stream) const { + return previous_layer_.WriteParameters(stream); + } - #ifdef USE_SSE41 - const __m128i kZero = _mm_setzero_si128(); - #else - const __m128i k0x80s = _mm_set1_epi8(-128); - #endif + // Forward propagation + const OutputType* Propagate( + const TransformedFeatureType* transformed_features, char* buffer) const { - const auto in = reinterpret_cast(input); - const auto out = reinterpret_cast<__m128i*>(output); - for (IndexType i = 0; i < kNumChunks; ++i) { - const __m128i words0 = _mm_srai_epi16(_mm_packs_epi32( - _mm_load_si128(&in[i * 4 + 0]), - _mm_load_si128(&in[i * 4 + 1])), kWeightScaleBits); - const __m128i words1 = _mm_srai_epi16(_mm_packs_epi32( - _mm_load_si128(&in[i * 4 + 2]), - _mm_load_si128(&in[i * 4 + 3])), kWeightScaleBits); - const __m128i packedbytes = _mm_packs_epi16(words0, words1); - _mm_store_si128(&out[i], + const auto input = previous_layer_.Propagate( + transformed_features, buffer + kSelfBufferSize); + const auto output = reinterpret_cast(buffer); - #ifdef USE_SSE41 - _mm_max_epi8(packedbytes, kZero) - #else - _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s) - #endif +#if defined(USE_AVX2) + constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth; + const __m256i kZero = _mm256_setzero_si256(); + const __m256i kOffsets = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + const auto in = reinterpret_cast(input); + const auto out = reinterpret_cast<__m256i*>(output); + for (IndexType i = 0; i < kNumChunks; ++i) { + const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32( + _mm256_loadA_si256(&in[i * 4 + 0]), + _mm256_loadA_si256(&in[i * 4 + 1])), kWeightScaleBits); + const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32( + _mm256_loadA_si256(&in[i * 4 + 2]), + _mm256_loadA_si256(&in[i * 4 + 3])), kWeightScaleBits); + _mm256_storeA_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8( + _mm256_packs_epi16(words0, words1), kZero), kOffsets)); + } - ); - } - constexpr IndexType kStart = kNumChunks * kSimdWidth; + constexpr IndexType kStart = kNumChunks * kSimdWidth; - #elif defined(USE_MMX) - constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth; - const __m64 k0x80s = _mm_set1_pi8(-128); - const auto in = reinterpret_cast(input); - const auto out = reinterpret_cast<__m64*>(output); - for (IndexType i = 0; i < kNumChunks; ++i) { - const __m64 words0 = _mm_srai_pi16( - _mm_packs_pi32(in[i * 4 + 0], in[i * 4 + 1]), - kWeightScaleBits); - const __m64 words1 = _mm_srai_pi16( - _mm_packs_pi32(in[i * 4 + 2], in[i * 4 + 3]), - kWeightScaleBits); - const __m64 packedbytes = _mm_packs_pi16(words0, words1); - out[i] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s); - 
} - _mm_empty(); - constexpr IndexType kStart = kNumChunks * kSimdWidth; +#elif defined(USE_SSE2) + constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth; - #elif defined(USE_NEON) - constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2); - const int8x8_t kZero = {0}; - const auto in = reinterpret_cast(input); - const auto out = reinterpret_cast(output); - for (IndexType i = 0; i < kNumChunks; ++i) { - int16x8_t shifted; - const auto pack = reinterpret_cast(&shifted); - pack[0] = vqshrn_n_s32(in[i * 2 + 0], kWeightScaleBits); - pack[1] = vqshrn_n_s32(in[i * 2 + 1], kWeightScaleBits); - out[i] = vmax_s8(vqmovn_s16(shifted), kZero); - } - constexpr IndexType kStart = kNumChunks * (kSimdWidth / 2); - #else - constexpr IndexType kStart = 0; - #endif +#if defined(USE_SSE41) + const __m128i kZero = _mm_setzero_si128(); +#else + const __m128i k0x80s = _mm_set1_epi8(-128); +#endif - for (IndexType i = kStart; i < kInputDimensions; ++i) { - output[i] = static_cast( - std::max(0, std::min(127, input[i] >> kWeightScaleBits))); - } - return output; - } + const auto in = reinterpret_cast(input); + const auto out = reinterpret_cast<__m128i*>(output); + for (IndexType i = 0; i < kNumChunks; ++i) { + const __m128i words0 = _mm_srai_epi16(_mm_packs_epi32( + _mm_load_si128(&in[i * 4 + 0]), + _mm_load_si128(&in[i * 4 + 1])), kWeightScaleBits); + const __m128i words1 = _mm_srai_epi16(_mm_packs_epi32( + _mm_load_si128(&in[i * 4 + 2]), + _mm_load_si128(&in[i * 4 + 3])), kWeightScaleBits); + const __m128i packedbytes = _mm_packs_epi16(words0, words1); + _mm_store_si128(&out[i], - private: - // Make the learning class a friend - friend class Trainer; - - PreviousLayer previous_layer_; - }; +#if defined(USE_SSE41) + _mm_max_epi8(packedbytes, kZero) + #else + _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s) +#endif + + ); + } + constexpr IndexType kStart = kNumChunks * kSimdWidth; + +#elif defined(USE_MMX) + constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth; + const __m64 k0x80s = _mm_set1_pi8(-128); + const auto in = reinterpret_cast(input); + const auto out = reinterpret_cast<__m64*>(output); + for (IndexType i = 0; i < kNumChunks; ++i) { + const __m64 words0 = _mm_srai_pi16( + _mm_packs_pi32(in[i * 4 + 0], in[i * 4 + 1]), + kWeightScaleBits); + const __m64 words1 = _mm_srai_pi16( + _mm_packs_pi32(in[i * 4 + 2], in[i * 4 + 3]), + kWeightScaleBits); + const __m64 packedbytes = _mm_packs_pi16(words0, words1); + out[i] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s); + } + _mm_empty(); + constexpr IndexType kStart = kNumChunks * kSimdWidth; + +#elif defined(USE_NEON) + constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2); + const int8x8_t kZero = {0}; + const auto in = reinterpret_cast(input); + const auto out = reinterpret_cast(output); + for (IndexType i = 0; i < kNumChunks; ++i) { + int16x8_t shifted; + const auto pack = reinterpret_cast(&shifted); + pack[0] = vqshrn_n_s32(in[i * 2 + 0], kWeightScaleBits); + pack[1] = vqshrn_n_s32(in[i * 2 + 1], kWeightScaleBits); + out[i] = vmax_s8(vqmovn_s16(shifted), kZero); + } + constexpr IndexType kStart = kNumChunks * (kSimdWidth / 2); +#else + constexpr IndexType kStart = 0; +#endif + + for (IndexType i = kStart; i < kInputDimensions; ++i) { + output[i] = static_cast( + std::max(0, std::min(127, input[i] >> kWeightScaleBits))); + } + return output; + } + + private: + // Make the learning class a friend + friend class Trainer; + + PreviousLayer previous_layer_; + }; } // namespace Eval::NNUE::Layers 
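The SSSE3, AVX2 and AVX-512 rows of AffineTransform::Propagate above all build on the same two-instruction idiom: _mm*_maddubs_epi16 multiplies unsigned 8-bit activations by signed 8-bit weights and adds adjacent pairs into saturating signed 16-bit lanes, and _mm*_madd_epi16 against kOnes widens those pairs into 32-bit accumulators. A minimal scalar sketch of one output neuron follows, assuming the activations are ClippedReLU outputs in [0, 127] (so the 16-bit saturation can never fire); the function name is illustrative and not part of this patch:

    #include <cstddef>
    #include <cstdint>

    // Scalar mirror of the maddubs/madd accumulation used in the SIMD paths above.
    std::int32_t affine_row_reference(const std::uint8_t* input,  // activations in [0, 127]
                                      const std::int8_t*  row,    // one padded weight row
                                      std::size_t padded_dims,    // kPaddedInputDimensions
                                      std::int32_t bias) {
        std::int32_t sum = bias;
        for (std::size_t j = 0; j + 1 < padded_dims; j += 2) {
            // maddubs step: two u8 * i8 products summed into one signed 16-bit lane.
            // With activations <= 127 the sum fits in int16, so saturation is a no-op.
            std::int16_t pair = static_cast<std::int16_t>(
                int(input[j]) * row[j] + int(input[j + 1]) * row[j + 1]);
            // madd-with-kOnes step: adjacent 16-bit lanes are widened and added into
            // 32-bit lanes, i.e. plain accumulation into the running sum.
            sum += pair;
        }
        return sum;
    }
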
diff --git a/src/nnue/layers/input_slice.h b/src/nnue/layers/input_slice.h index 78756a39..9d9476a5 100644 --- a/src/nnue/layers/input_slice.h +++ b/src/nnue/layers/input_slice.h @@ -1,19 +1,19 @@ /* - Stockfish, a UCI chess playing engine derived from Glaurung 2.1 - Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file) + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file) - Stockfish is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. - Stockfish is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. - You should have received a copy of the GNU General Public License - along with this program. If not, see . + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ // NNUE evaluation function layer InputSlice definition @@ -21,59 +21,63 @@ #ifndef NNUE_LAYERS_INPUT_SLICE_H_INCLUDED #define NNUE_LAYERS_INPUT_SLICE_H_INCLUDED -#include "../nnue_common.h" +#include "nnue/nnue_common.h" + +#include +#include namespace Eval::NNUE::Layers { -// Input layer -template -class InputSlice { - public: - // Need to maintain alignment - static_assert(Offset % kMaxSimdWidth == 0, ""); + // Input layer + template + class InputSlice { + public: + // Need to maintain alignment + static_assert(Offset % kMaxSimdWidth == 0, ""); - // Output type - using OutputType = TransformedFeatureType; + // Output type + using OutputType = TransformedFeatureType; - // Output dimensionality - static constexpr IndexType kOutputDimensions = OutputDimensions; + // Output dimensionality + static constexpr IndexType kOutputDimensions = OutputDimensions; - // Size of forward propagation buffer used from the input layer to this layer - static constexpr std::size_t kBufferSize = 0; + // Size of forward propagation buffer used from the input layer to this layer + static constexpr std::size_t kBufferSize = 0; - // Hash value embedded in the evaluation file - static constexpr std::uint32_t GetHashValue() { - std::uint32_t hash_value = 0xEC42E90Du; - hash_value ^= kOutputDimensions ^ (Offset << 10); - return hash_value; - } + // Hash value embedded in the evaluation file + static constexpr std::uint32_t GetHashValue() { + std::uint32_t hash_value = 0xEC42E90Du; + hash_value ^= kOutputDimensions ^ (Offset << 10); + return hash_value; + } - // A string that represents the structure from the input layer to this layer - static std::string GetStructureString() { - return "InputSlice[" + std::to_string(kOutputDimensions) + "(" + - std::to_string(Offset) + ":" + - std::to_string(Offset + kOutputDimensions) + ")]"; - } + // A string that represents the structure from the input 
layer to this layer + static std::string GetStructureString() { + return "InputSlice[" + std::to_string(kOutputDimensions) + "(" + + std::to_string(Offset) + ":" + + std::to_string(Offset + kOutputDimensions) + ")]"; + } - // Read network parameters - bool ReadParameters(std::istream& /*stream*/) { - return true; - } + // Read network parameters + bool ReadParameters(std::istream& /*stream*/) { + return true; + } - // write parameters - bool WriteParameters(std::ostream& /*stream*/) const { - return true; - } + // write parameters + bool WriteParameters(std::ostream& /*stream*/) const { + return true; + } - // Forward propagation - const OutputType* Propagate( - const TransformedFeatureType* transformed_features, - char* /*buffer*/) const { - return transformed_features + Offset; - } + // Forward propagation + const OutputType* Propagate( + const TransformedFeatureType* transformed_features, + char* /*buffer*/) const { - private: -}; + return transformed_features + Offset; + } + + private: + }; } // namespace Layers diff --git a/src/nnue/layers/sum.h b/src/nnue/layers/sum.h index 419ced89..c81f5850 100644 --- a/src/nnue/layers/sum.h +++ b/src/nnue/layers/sum.h @@ -1,159 +1,166 @@ -// Definition of layer Sum of NNUE evaluation function - -#ifndef _NNUE_LAYERS_SUM_H_ +#ifndef _NNUE_LAYERS_SUM_H_ #define _NNUE_LAYERS_SUM_H_ -#include "../nnue_common.h" +#include "nnue/nnue_common.h" -namespace Eval { +// Definition of layer Sum of NNUE evaluation function +namespace Eval::NNUE::Layers { -namespace NNUE { + // Layer that sums the output of multiple layers + template + class Sum : public Sum { + private: + using Head = FirstPreviousLayer; + using Tail = Sum; -namespace Layers { + public: + // Input/output type + using InputType = typename Head::OutputType; -// Layer that sums the output of multiple layers -template -class Sum : public Sum { - private: - using Head = FirstPreviousLayer; - using Tail = Sum; + using OutputType = InputType; - public: - // Input/output type - using InputType = typename Head::OutputType; - using OutputType = InputType; - static_assert(std::is_same::value, ""); + static_assert(std::is_same::value, ""); - // number of input/output dimensions - static constexpr IndexType kInputDimensions = Head::kOutputDimensions; - static constexpr IndexType kOutputDimensions = kInputDimensions; - static_assert(kInputDimensions == Tail::kInputDimensions ,""); + // number of input/output dimensions + static constexpr IndexType kInputDimensions = Head::kOutputDimensions; - // Size of forward propagation buffer used in this layer - static constexpr std::size_t kSelfBufferSize = - CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize); + static constexpr IndexType kOutputDimensions = kInputDimensions; - // Size of the forward propagation buffer used from the input layer to this layer - static constexpr std::size_t kBufferSize = - std::max(Head::kBufferSize + kSelfBufferSize, Tail::kBufferSize); + static_assert(kInputDimensions == Tail::kInputDimensions ,""); - // Hash value embedded in the evaluation function file - static constexpr std::uint32_t GetHashValue() { - std::uint32_t hash_value = 0xBCE400B4u; - hash_value ^= Head::GetHashValue() >> 1; - hash_value ^= Head::GetHashValue() << 31; - hash_value ^= Tail::GetHashValue() >> 2; - hash_value ^= Tail::GetHashValue() << 30; - return hash_value; - } + // Size of forward propagation buffer used in this layer + static constexpr std::size_t kSelfBufferSize = + CeilToMultiple(kOutputDimensions * sizeof(OutputType), 
kCacheLineSize); - // A string that represents the structure from the input layer to this layer - static std::string GetStructureString() { - return "Sum[" + - std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")"; - } + // Size of the forward propagation buffer used from the input layer to this layer + static constexpr std::size_t kBufferSize = + std::max(Head::kBufferSize + kSelfBufferSize, Tail::kBufferSize); - // read parameters - bool ReadParameters(std::istream& stream) { - if (!Tail::ReadParameters(stream)) return false; - return previous_layer_.ReadParameters(stream); - } + // Hash value embedded in the evaluation function file + static constexpr std::uint32_t GetHashValue() { + std::uint32_t hash_value = 0xBCE400B4u; + hash_value ^= Head::GetHashValue() >> 1; + hash_value ^= Head::GetHashValue() << 31; + hash_value ^= Tail::GetHashValue() >> 2; + hash_value ^= Tail::GetHashValue() << 30; + return hash_value; + } - // write parameters - bool WriteParameters(std::ostream& stream) const { - if (!Tail::WriteParameters(stream)) return false; - return previous_layer_.WriteParameters(stream); - } + // A string that represents the structure from the input layer to this layer + static std::string GetStructureString() { + return "Sum[" + + std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")"; + } - // forward propagation - const OutputType* Propagate( - const TransformedFeatureType* transformed_features, char* buffer) const { - Tail::Propagate(transformed_features, buffer); - const auto head_output = previous_layer_.Propagate( - transformed_features, buffer + kSelfBufferSize); - const auto output = reinterpret_cast(buffer); - for (IndexType i = 0; i ; + // write parameters + bool WriteParameters(std::ostream& stream) const { + if (!Tail::WriteParameters(stream)) + return false; - // the layer immediately before this layer - FirstPreviousLayer previous_layer_; -}; + return previous_layer_.WriteParameters(stream); + } -// Layer that sums the output of multiple layers (when there is one template argument) -template -class Sum { - public: - // Input/output type - using InputType = typename PreviousLayer::OutputType; - using OutputType = InputType; + // forward propagation + const OutputType* Propagate( + const TransformedFeatureType* transformed_features, char* buffer) const { - // number of input/output dimensions - static constexpr IndexType kInputDimensions = - PreviousLayer::kOutputDimensions; - static constexpr IndexType kOutputDimensions = kInputDimensions; + Tail::Propagate(transformed_features, buffer); - // Size of the forward propagation buffer used from the input layer to this layer - static constexpr std::size_t kBufferSize = PreviousLayer::kBufferSize; + const auto head_output = previous_layer_.Propagate( + transformed_features, buffer + kSelfBufferSize); - // Hash value embedded in the evaluation function file - static constexpr std::uint32_t GetHashValue() { - std::uint32_t hash_value = 0xBCE400B4u; - hash_value ^= PreviousLayer::GetHashValue() >> 1; - hash_value ^= PreviousLayer::GetHashValue() << 31; - return hash_value; - } + const auto output = reinterpret_cast(buffer); - // A string that represents the structure from the input layer to this layer - static std::string GetStructureString() { - return "Sum[" + - std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")"; - } + for (IndexType i = 0; i ; - protected: - // A string that represents the list of layers to be summed - static std::string GetSummandsString() { - return 
PreviousLayer::GetStructureString(); - } + // the layer immediately before this layer + FirstPreviousLayer previous_layer_; + }; - // Make the learning class a friend - friend class Trainer; + // Layer that sums the output of multiple layers (when there is one template argument) + template + class Sum { + public: + // Input/output type + using InputType = typename PreviousLayer::OutputType; - // the layer immediately before this layer - PreviousLayer previous_layer_; -}; + using OutputType = InputType; -} // namespace Layers + // number of input/output dimensions + static constexpr IndexType kInputDimensions = + PreviousLayer::kOutputDimensions; -} // namespace NNUE + static constexpr IndexType kOutputDimensions = kInputDimensions; -} // namespace Eval + // Size of the forward propagation buffer used from the input layer to this layer + static constexpr std::size_t kBufferSize = PreviousLayer::kBufferSize; + + // Hash value embedded in the evaluation function file + static constexpr std::uint32_t GetHashValue() { + std::uint32_t hash_value = 0xBCE400B4u; + hash_value ^= PreviousLayer::GetHashValue() >> 1; + hash_value ^= PreviousLayer::GetHashValue() << 31; + return hash_value; + } + + // A string that represents the structure from the input layer to this layer + static std::string GetStructureString() { + return "Sum[" + + std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")"; + } + + // read parameters + bool ReadParameters(std::istream& stream) { + return previous_layer_.ReadParameters(stream); + } + + // write parameters + bool WriteParameters(std::ostream& stream) const { + return previous_layer_.WriteParameters(stream); + } + + // forward propagation + const OutputType* Propagate( + const TransformedFeatureType* transformed_features, char* buffer) const { + + return previous_layer_.Propagate(transformed_features, buffer); + } + + protected: + // A string that represents the list of layers to be summed + static std::string GetSummandsString() { + return PreviousLayer::GetStructureString(); + } + + // Make the learning class a friend + friend class Trainer; + + // the layer immediately before this layer + PreviousLayer previous_layer_; + }; + +} // namespace Eval::NNUE::Layers #endif
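
Taken together, the templates in this patch nest to form the evaluation network, each layer naming its predecessor as a template argument. A sketch of the composition and of how the shared propagation buffer is sized; the 512-32-32-1 dimensions follow the HalfKP architecture header and, like the include paths, are assumptions rather than part of this diff:

    #include "nnue/layers/affine_transform.h"
    #include "nnue/layers/clipped_relu.h"
    #include "nnue/layers/input_slice.h"

    #include <cstdint>

    namespace Eval::NNUE::Layers {

      // Assumed layer stack (HalfKP 256x2-32-32): slice -> affine -> relu -> ... -> affine.
      using InputLayer   = InputSlice<512>;
      using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
      using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
      using OutputLayer  = AffineTransform<HiddenLayer2, 1>;

    }  // namespace Eval::NNUE::Layers

    // Each layer keeps the first kSelfBufferSize bytes of `buffer` for its own output
    // and hands `buffer + kSelfBufferSize` to its previous layer, so one allocation of
    // OutputLayer::kBufferSize bytes serves the whole chain:
    //
    //   alignas(kCacheLineSize) char buffer[Eval::NNUE::Layers::OutputLayer::kBufferSize];
    //   Eval::NNUE::Layers::OutputLayer network;            // after ReadParameters(...)
    //   const std::int32_t* value = network.Propagate(transformed_features, buffer);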