diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index f24578a8..cc5e5eef 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -1,19 +1,19 @@
/*
- Stockfish, a UCI chess playing engine derived from Glaurung 2.1
- Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+ Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+ Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
- Stockfish is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
+ Stockfish is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
- Stockfish is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
+ Stockfish is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
// Definition of layer AffineTransform of NNUE evaluation function
@@ -21,267 +21,290 @@
#ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
#define NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
-#include <iostream>
-#include "../nnue_common.h"
+#include "nnue/nnue_common.h"
+
+#include <cstdint>
+#include <iostream>
+#include <type_traits>
namespace Eval::NNUE::Layers {
- // Affine transformation layer
- template <typename PreviousLayer, IndexType OutputDimensions>
- class AffineTransform {
- public:
- // Input/output type
- using InputType = typename PreviousLayer::OutputType;
- using OutputType = std::int32_t;
- static_assert(std::is_same<InputType, std::uint8_t>::value, "");
+ // Affine transformation layer
+ template <typename PreviousLayer, IndexType OutputDimensions>
+ class AffineTransform {
+ public:
+ // Input/output type
+ using InputType = typename PreviousLayer::OutputType;
- // Number of input/output dimensions
- static constexpr IndexType kInputDimensions =
- PreviousLayer::kOutputDimensions;
- static constexpr IndexType kOutputDimensions = OutputDimensions;
- static constexpr IndexType kPaddedInputDimensions =
- CeilToMultiple(kInputDimensions, kMaxSimdWidth);
+ using OutputType = std::int32_t;
- // Size of forward propagation buffer used in this layer
- static constexpr std::size_t kSelfBufferSize =
- CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+ static_assert(std::is_same<InputType, std::uint8_t>::value, "");
- // Size of the forward propagation buffer used from the input layer to this layer
- static constexpr std::size_t kBufferSize =
- PreviousLayer::kBufferSize + kSelfBufferSize;
+ // Number of input/output dimensions
+ static constexpr IndexType kInputDimensions =
+ PreviousLayer::kOutputDimensions;
- // Hash value embedded in the evaluation file
- static constexpr std::uint32_t GetHashValue() {
- std::uint32_t hash_value = 0xCC03DAE4u;
- hash_value += kOutputDimensions;
- hash_value ^= PreviousLayer::GetHashValue() >> 1;
- hash_value ^= PreviousLayer::GetHashValue() << 31;
- return hash_value;
- }
+ static constexpr IndexType kOutputDimensions = OutputDimensions;
- // A string that represents the structure from the input layer to this layer
- static std::string GetStructureString() {
- return "AffineTransform[" +
- std::to_string(kOutputDimensions) + "<-" +
- std::to_string(kInputDimensions) + "](" +
- PreviousLayer::GetStructureString() + ")";
- }
-
- // Read network parameters
- bool ReadParameters(std::istream& stream) {
- if (!previous_layer_.ReadParameters(stream)) return false;
- for (std::size_t i = 0; i < kOutputDimensions; ++i)
- biases_[i] = read_little_endian<BiasType>(stream);
- for (std::size_t i = 0; i < kOutputDimensions * kPaddedInputDimensions; ++i)
- weights_[i] = read_little_endian<WeightType>(stream);
- return !stream.fail();
- }
+ static constexpr IndexType kPaddedInputDimensions =
+ CeilToMultiple(kInputDimensions, kMaxSimdWidth);
- // write parameters
- bool WriteParameters(std::ostream& stream) const {
- if (!previous_layer_.WriteParameters(stream)) return false;
- stream.write(reinterpret_cast<const char*>(biases_),
- kOutputDimensions * sizeof(BiasType));
- stream.write(reinterpret_cast<const char*>(weights_),
- kOutputDimensions * kPaddedInputDimensions *
- sizeof(WeightType));
- return !stream.fail();
- }
+ // Size of forward propagation buffer used in this layer
+ static constexpr std::size_t kSelfBufferSize =
+ CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
- // Forward propagation
- const OutputType* Propagate(
- const TransformedFeatureType* transformed_features, char* buffer) const {
- const auto input = previous_layer_.Propagate(
- transformed_features, buffer + kSelfBufferSize);
- const auto output = reinterpret_cast<OutputType*>(buffer);
+ // Size of the forward propagation buffer used from the input layer to this layer
+ static constexpr std::size_t kBufferSize =
+ PreviousLayer::kBufferSize + kSelfBufferSize;
- #if defined(USE_AVX512)
- constexpr IndexType kNumChunks = kPaddedInputDimensions / (kSimdWidth * 2);
- const auto input_vector = reinterpret_cast<const __m512i*>(input);
- #if !defined(USE_VNNI)
- const __m512i kOnes = _mm512_set1_epi16(1);
- #endif
-
- #elif defined(USE_AVX2)
- constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
- const auto input_vector = reinterpret_cast<const __m256i*>(input);
- #if !defined(USE_VNNI)
- const __m256i kOnes = _mm256_set1_epi16(1);
- #endif
-
- #elif defined(USE_SSE2)
- constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
- #ifndef USE_SSSE3
- const __m128i kZeros = _mm_setzero_si128();
- #else
- const __m128i kOnes = _mm_set1_epi16(1);
- #endif
- const auto input_vector = reinterpret_cast<const __m128i*>(input);
-
- #elif defined(USE_MMX)
- constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
- const __m64 kZeros = _mm_setzero_si64();
- const auto input_vector = reinterpret_cast<const __m64*>(input);
-
- #elif defined(USE_NEON)
- constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
- const auto input_vector = reinterpret_cast<const int8x8_t*>(input);
- #endif
-
- for (IndexType i = 0; i < kOutputDimensions; ++i) {
- const IndexType offset = i * kPaddedInputDimensions;
-
- #if defined(USE_AVX512)
- __m512i sum = _mm512_setzero_si512();
- const auto row = reinterpret_cast<const __m512i*>(&weights_[offset]);
- for (IndexType j = 0; j < kNumChunks; ++j) {
- #if defined(USE_VNNI)
- sum = _mm512_dpbusd_epi32(sum, _mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
- #else
- __m512i product = _mm512_maddubs_epi16(_mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
- product = _mm512_madd_epi16(product, kOnes);
- sum = _mm512_add_epi32(sum, product);
- #endif
+ // Hash value embedded in the evaluation file
+ static constexpr std::uint32_t GetHashValue() {
+ std::uint32_t hash_value = 0xCC03DAE4u;
+ hash_value += kOutputDimensions;
+ hash_value ^= PreviousLayer::GetHashValue() >> 1;
+ hash_value ^= PreviousLayer::GetHashValue() << 31;
+ return hash_value;
}
- // Note: Changing kMaxSimdWidth from 32 to 64 breaks loading existing networks.
- // As a result kPaddedInputDimensions may not be an even multiple of 64(512bit)
- // and we have to do one more 256bit chunk.
- if (kPaddedInputDimensions != kNumChunks * kSimdWidth * 2)
- {
- const auto iv256 = reinterpret_cast<const __m256i*>(&input_vector[kNumChunks]);
- const auto row256 = reinterpret_cast<const __m256i*>(&row[kNumChunks]);
- #if defined(USE_VNNI)
- __m256i product256 = _mm256_dpbusd_epi32(
- _mm512_castsi512_si256(sum), _mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
- sum = _mm512_inserti32x8(sum, product256, 0);
- #else
- __m256i product256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
- sum = _mm512_add_epi32(sum, _mm512_cvtepi16_epi32(product256));
- #endif
+ // A string that represents the structure from the input layer to this layer
+ static std::string GetStructureString() {
+ return "AffineTransform[" +
+ std::to_string(kOutputDimensions) + "<-" +
+ std::to_string(kInputDimensions) + "](" +
+ PreviousLayer::GetStructureString() + ")";
}
- output[i] = _mm512_reduce_add_epi32(sum) + biases_[i];
- #elif defined(USE_AVX2)
- __m256i sum = _mm256_setzero_si256();
- const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
- for (IndexType j = 0; j < kNumChunks; ++j) {
- #if defined(USE_VNNI)
- sum = _mm256_dpbusd_epi32(sum, _mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
- #else
- __m256i product = _mm256_maddubs_epi16(_mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
- product = _mm256_madd_epi16(product, kOnes);
- sum = _mm256_add_epi32(sum, product);
- #endif
+ // Read network parameters
+ bool ReadParameters(std::istream& stream) {
+ if (!previous_layer_.ReadParameters(stream))
+ return false;
+
+ for (std::size_t i = 0; i < kOutputDimensions; ++i)
+ biases_[i] = read_little_endian<BiasType>(stream);
+
+ for (std::size_t i = 0; i < kOutputDimensions * kPaddedInputDimensions; ++i)
+ weights_[i] = read_little_endian<WeightType>(stream);
+
+ return !stream.fail();
}
- __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
- sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));
- sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB));
- output[i] = _mm_cvtsi128_si32(sum128) + biases_[i];
- #elif defined(USE_SSSE3)
- __m128i sum = _mm_setzero_si128();
- const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
- for (int j = 0; j < (int)kNumChunks - 1; j += 2) {
- __m128i product0 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j]));
- product0 = _mm_madd_epi16(product0, kOnes);
- sum = _mm_add_epi32(sum, product0);
- __m128i product1 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j+1]), _mm_load_si128(&row[j+1]));
- product1 = _mm_madd_epi16(product1, kOnes);
- sum = _mm_add_epi32(sum, product1);
+ // write parameters
+ bool WriteParameters(std::ostream& stream) const {
+ if (!previous_layer_.WriteParameters(stream))
+ return false;
+
+ stream.write(reinterpret_cast<const char*>(biases_),
+ kOutputDimensions * sizeof(BiasType));
+
+ stream.write(reinterpret_cast<const char*>(weights_),
+ kOutputDimensions * kPaddedInputDimensions *
+ sizeof(WeightType));
+
+ return !stream.fail();
}
- if (kNumChunks & 0x1) {
- __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[kNumChunks-1]), _mm_load_si128(&row[kNumChunks-1]));
- product = _mm_madd_epi16(product, kOnes);
- sum = _mm_add_epi32(sum, product);
+
+ // Forward propagation
+ const OutputType* Propagate(
+ const TransformedFeatureType* transformed_features, char* buffer) const {
+
+ const auto input = previous_layer_.Propagate(
+ transformed_features, buffer + kSelfBufferSize);
+ const auto output = reinterpret_cast<OutputType*>(buffer);
+
+#if defined(USE_AVX512)
+ constexpr IndexType kNumChunks = kPaddedInputDimensions / (kSimdWidth * 2);
+ const auto input_vector = reinterpret_cast<const __m512i*>(input);
+#if !defined(USE_VNNI)
+ const __m512i kOnes = _mm512_set1_epi16(1);
+#endif
+
+#elif defined(USE_AVX2)
+ constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+ const auto input_vector = reinterpret_cast<const __m256i*>(input);
+#if !defined(USE_VNNI)
+ const __m256i kOnes = _mm256_set1_epi16(1);
+#endif
+
+#elif defined(USE_SSE2)
+ constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+#ifndef USE_SSSE3
+ const __m128i kZeros = _mm_setzero_si128();
+#else
+ const __m128i kOnes = _mm_set1_epi16(1);
+#endif
+ const auto input_vector = reinterpret_cast<const __m128i*>(input);
+
+#elif defined(USE_MMX)
+ constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+ const __m64 kZeros = _mm_setzero_si64();
+ const auto input_vector = reinterpret_cast<const __m64*>(input);
+
+#elif defined(USE_NEON)
+ constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+ const auto input_vector = reinterpret_cast<const int8x8_t*>(input);
+#endif
+
+ for (IndexType i = 0; i < kOutputDimensions; ++i) {
+ const IndexType offset = i * kPaddedInputDimensions;
+
+#if defined(USE_AVX512)
+ __m512i sum = _mm512_setzero_si512();
+ const auto row = reinterpret_cast<const __m512i*>(&weights_[offset]);
+ for (IndexType j = 0; j < kNumChunks; ++j) {
+#if defined(USE_VNNI)
+ sum = _mm512_dpbusd_epi32(sum, _mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
+#else
+ __m512i product = _mm512_maddubs_epi16(_mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
+ product = _mm512_madd_epi16(product, kOnes);
+ sum = _mm512_add_epi32(sum, product);
+#endif
+ }
+
+ // Note: Changing kMaxSimdWidth from 32 to 64 breaks loading existing networks.
+ // As a result kPaddedInputDimensions may not be an even multiple of 64(512bit)
+ // and we have to do one more 256bit chunk.
+ if (kPaddedInputDimensions != kNumChunks * kSimdWidth * 2)
+ {
+ const auto iv256 = reinterpret_cast<const __m256i*>(&input_vector[kNumChunks]);
+ const auto row256 = reinterpret_cast<const __m256i*>(&row[kNumChunks]);
+#if defined(USE_VNNI)
+ __m256i product256 = _mm256_dpbusd_epi32(
+ _mm512_castsi512_si256(sum), _mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
+ sum = _mm512_inserti32x8(sum, product256, 0);
+#else
+ __m256i product256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
+ sum = _mm512_add_epi32(sum, _mm512_cvtepi16_epi32(product256));
+#endif
+ }
+
+ output[i] = _mm512_reduce_add_epi32(sum) + biases_[i];
+
+#elif defined(USE_AVX2)
+ __m256i sum = _mm256_setzero_si256();
+ const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
+ for (IndexType j = 0; j < kNumChunks; ++j) {
+#if defined(USE_VNNI)
+ sum = _mm256_dpbusd_epi32(sum, _mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
+#else
+ __m256i product = _mm256_maddubs_epi16(_mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
+ product = _mm256_madd_epi16(product, kOnes);
+ sum = _mm256_add_epi32(sum, product);
+#endif
+ }
+
+ __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
+ sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));
+ sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB));
+ output[i] = _mm_cvtsi128_si32(sum128) + biases_[i];
+
+#elif defined(USE_SSSE3)
+ __m128i sum = _mm_setzero_si128();
+ const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
+ for (int j = 0; j < (int)kNumChunks - 1; j += 2) {
+ __m128i product0 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j]));
+ product0 = _mm_madd_epi16(product0, kOnes);
+ sum = _mm_add_epi32(sum, product0);
+ __m128i product1 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j+1]), _mm_load_si128(&row[j+1]));
+ product1 = _mm_madd_epi16(product1, kOnes);
+ sum = _mm_add_epi32(sum, product1);
+ }
+
+ if (kNumChunks & 0x1) {
+ __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[kNumChunks-1]), _mm_load_si128(&row[kNumChunks-1]));
+ product = _mm_madd_epi16(product, kOnes);
+ sum = _mm_add_epi32(sum, product);
+ }
+
+ sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC
+ sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
+ output[i] = _mm_cvtsi128_si32(sum) + biases_[i];
+
+#elif defined(USE_SSE2)
+ __m128i sum_lo = _mm_cvtsi32_si128(biases_[i]);
+ __m128i sum_hi = kZeros;
+ const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
+ for (IndexType j = 0; j < kNumChunks; ++j) {
+ __m128i row_j = _mm_load_si128(&row[j]);
+ __m128i input_j = _mm_load_si128(&input_vector[j]);
+ __m128i row_signs = _mm_cmpgt_epi8(kZeros, row_j);
+ __m128i extended_row_lo = _mm_unpacklo_epi8(row_j, row_signs);
+ __m128i extended_row_hi = _mm_unpackhi_epi8(row_j, row_signs);
+ __m128i extended_input_lo = _mm_unpacklo_epi8(input_j, kZeros);
+ __m128i extended_input_hi = _mm_unpackhi_epi8(input_j, kZeros);
+ __m128i product_lo = _mm_madd_epi16(extended_row_lo, extended_input_lo);
+ __m128i product_hi = _mm_madd_epi16(extended_row_hi, extended_input_hi);
+ sum_lo = _mm_add_epi32(sum_lo, product_lo);
+ sum_hi = _mm_add_epi32(sum_hi, product_hi);
+ }
+
+ __m128i sum = _mm_add_epi32(sum_lo, sum_hi);
+ __m128i sum_high_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2));
+ sum = _mm_add_epi32(sum, sum_high_64);
+ __m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2));
+ sum = _mm_add_epi32(sum, sum_second_32);
+ output[i] = _mm_cvtsi128_si32(sum);
+
+#elif defined(USE_MMX)
+ __m64 sum_lo = _mm_cvtsi32_si64(biases_[i]);
+ __m64 sum_hi = kZeros;
+ const auto row = reinterpret_cast<const __m64*>(&weights_[offset]);
+ for (IndexType j = 0; j < kNumChunks; ++j) {
+ __m64 row_j = row[j];
+ __m64 input_j = input_vector[j];
+ __m64 row_signs = _mm_cmpgt_pi8(kZeros, row_j);
+ __m64 extended_row_lo = _mm_unpacklo_pi8(row_j, row_signs);
+ __m64 extended_row_hi = _mm_unpackhi_pi8(row_j, row_signs);
+ __m64 extended_input_lo = _mm_unpacklo_pi8(input_j, kZeros);
+ __m64 extended_input_hi = _mm_unpackhi_pi8(input_j, kZeros);
+ __m64 product_lo = _mm_madd_pi16(extended_row_lo, extended_input_lo);
+ __m64 product_hi = _mm_madd_pi16(extended_row_hi, extended_input_hi);
+ sum_lo = _mm_add_pi32(sum_lo, product_lo);
+ sum_hi = _mm_add_pi32(sum_hi, product_hi);
+ }
+
+ __m64 sum = _mm_add_pi32(sum_lo, sum_hi);
+ sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum));
+ output[i] = _mm_cvtsi64_si32(sum);
+
+#elif defined(USE_NEON)
+ int32x4_t sum = {biases_[i]};
+ const auto row = reinterpret_cast<const int8x8_t*>(&weights_[offset]);
+ for (IndexType j = 0; j < kNumChunks; ++j) {
+ int16x8_t product = vmull_s8(input_vector[j * 2], row[j * 2]);
+ product = vmlal_s8(product, input_vector[j * 2 + 1], row[j * 2 + 1]);
+ sum = vpadalq_s16(sum, product);
+ }
+
+ output[i] = sum[0] + sum[1] + sum[2] + sum[3];
+
+#else
+ OutputType sum = biases_[i];
+ for (IndexType j = 0; j < kInputDimensions; ++j) {
+ sum += weights_[offset + j] * input[j];
+ }
+
+ output[i] = sum;
+#endif
+
+ }
+#if defined(USE_MMX)
+ _mm_empty();
+#endif
+ return output;
}
- sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC
- sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
- output[i] = _mm_cvtsi128_si32(sum) + biases_[i];
- #elif defined(USE_SSE2)
- __m128i sum_lo = _mm_cvtsi32_si128(biases_[i]);
- __m128i sum_hi = kZeros;
- const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
- for (IndexType j = 0; j < kNumChunks; ++j) {
- __m128i row_j = _mm_load_si128(&row[j]);
- __m128i input_j = _mm_load_si128(&input_vector[j]);
- __m128i row_signs = _mm_cmpgt_epi8(kZeros, row_j);
- __m128i extended_row_lo = _mm_unpacklo_epi8(row_j, row_signs);
- __m128i extended_row_hi = _mm_unpackhi_epi8(row_j, row_signs);
- __m128i extended_input_lo = _mm_unpacklo_epi8(input_j, kZeros);
- __m128i extended_input_hi = _mm_unpackhi_epi8(input_j, kZeros);
- __m128i product_lo = _mm_madd_epi16(extended_row_lo, extended_input_lo);
- __m128i product_hi = _mm_madd_epi16(extended_row_hi, extended_input_hi);
- sum_lo = _mm_add_epi32(sum_lo, product_lo);
- sum_hi = _mm_add_epi32(sum_hi, product_hi);
- }
- __m128i sum = _mm_add_epi32(sum_lo, sum_hi);
- __m128i sum_high_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2));
- sum = _mm_add_epi32(sum, sum_high_64);
- __m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2));
- sum = _mm_add_epi32(sum, sum_second_32);
- output[i] = _mm_cvtsi128_si32(sum);
+ private:
+ using BiasType = OutputType;
+ using WeightType = std::int8_t;
- #elif defined(USE_MMX)
- __m64 sum_lo = _mm_cvtsi32_si64(biases_[i]);
- __m64 sum_hi = kZeros;
- const auto row = reinterpret_cast<const __m64*>(&weights_[offset]);
- for (IndexType j = 0; j < kNumChunks; ++j) {
- __m64 row_j = row[j];
- __m64 input_j = input_vector[j];
- __m64 row_signs = _mm_cmpgt_pi8(kZeros, row_j);
- __m64 extended_row_lo = _mm_unpacklo_pi8(row_j, row_signs);
- __m64 extended_row_hi = _mm_unpackhi_pi8(row_j, row_signs);
- __m64 extended_input_lo = _mm_unpacklo_pi8(input_j, kZeros);
- __m64 extended_input_hi = _mm_unpackhi_pi8(input_j, kZeros);
- __m64 product_lo = _mm_madd_pi16(extended_row_lo, extended_input_lo);
- __m64 product_hi = _mm_madd_pi16(extended_row_hi, extended_input_hi);
- sum_lo = _mm_add_pi32(sum_lo, product_lo);
- sum_hi = _mm_add_pi32(sum_hi, product_hi);
- }
- __m64 sum = _mm_add_pi32(sum_lo, sum_hi);
- sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum));
- output[i] = _mm_cvtsi64_si32(sum);
+ // Make the learning class a friend
+ friend class Trainer<AffineTransform>;
- #elif defined(USE_NEON)
- int32x4_t sum = {biases_[i]};
- const auto row = reinterpret_cast<const int8x8_t*>(&weights_[offset]);
- for (IndexType j = 0; j < kNumChunks; ++j) {
- int16x8_t product = vmull_s8(input_vector[j * 2], row[j * 2]);
- product = vmlal_s8(product, input_vector[j * 2 + 1], row[j * 2 + 1]);
- sum = vpadalq_s16(sum, product);
- }
- output[i] = sum[0] + sum[1] + sum[2] + sum[3];
+ PreviousLayer previous_layer_;
- #else
- OutputType sum = biases_[i];
- for (IndexType j = 0; j < kInputDimensions; ++j) {
- sum += weights_[offset + j] * input[j];
- }
- output[i] = sum;
- #endif
-
- }
- #if defined(USE_MMX)
- _mm_empty();
- #endif
- return output;
- }
-
- private:
- using BiasType = OutputType;
- using WeightType = std::int8_t;
-
- // Make the learning class a friend
- friend class Trainer<AffineTransform>;
-
- PreviousLayer previous_layer_;
-
- alignas(kCacheLineSize) BiasType biases_[kOutputDimensions];
- alignas(kCacheLineSize)
- WeightType weights_[kOutputDimensions * kPaddedInputDimensions];
- };
+ alignas(kCacheLineSize) BiasType biases_[kOutputDimensions];
+ alignas(kCacheLineSize) WeightType weights_[kOutputDimensions * kPaddedInputDimensions];
+ };
} // namespace Eval::NNUE::Layers
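Note (illustrative, outside the patch): every SIMD branch in AffineTransform::Propagate computes the same value as the generic #else fallback, namely output[i] = biases_[i] + sum over j of weights_[i * kPaddedInputDimensions + j] * input[j], with uint8 inputs, int8 weights and int32 accumulation. A minimal scalar sketch of that contract, using hypothetical free-function names (affine_reference, padded_in) rather than the class members:

    #include <cstddef>
    #include <cstdint>

    // Reference semantics of one AffineTransform forward pass.
    inline void affine_reference(const std::uint8_t* input, const std::int8_t* weights,
                                 const std::int32_t* biases, std::int32_t* output,
                                 std::size_t out_dims, std::size_t in_dims, std::size_t padded_in) {
        for (std::size_t i = 0; i < out_dims; ++i) {
            std::int32_t sum = biases[i];
            for (std::size_t j = 0; j < in_dims; ++j)
                sum += weights[i * padded_in + j] * input[j];  // promoted to int before accumulating
            output[i] = sum;
        }
    }

The AVX-512/AVX2/SSSE3/SSE2/MMX/NEON paths only change how many of these multiply-accumulates happen per instruction; padding each row to kPaddedInputDimensions keeps the rows aligned for the wide loads.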
diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h
index d923986e..0846f3df 100644
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -1,19 +1,19 @@
/*
- Stockfish, a UCI chess playing engine derived from Glaurung 2.1
- Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+ Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+ Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
- Stockfish is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
+ Stockfish is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
- Stockfish is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
+ Stockfish is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
// Definition of layer ClippedReLU of NNUE evaluation function
@@ -21,160 +21,169 @@
#ifndef NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
#define NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
-#include "../nnue_common.h"
+#include "nnue/nnue_common.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <type_traits>
namespace Eval::NNUE::Layers {
- // Clipped ReLU
- template <typename PreviousLayer>
- class ClippedReLU {
- public:
- // Input/output type
- using InputType = typename PreviousLayer::OutputType;
- using OutputType = std::uint8_t;
- static_assert(std::is_same<InputType, std::int32_t>::value, "");
+ // Clipped ReLU
+ template <typename PreviousLayer>
+ class ClippedReLU {
+ public:
+ // Input/output type
+ using InputType = typename PreviousLayer::OutputType;
- // Number of input/output dimensions
- static constexpr IndexType kInputDimensions =
- PreviousLayer::kOutputDimensions;
- static constexpr IndexType kOutputDimensions = kInputDimensions;
+ using OutputType = std::uint8_t;
- // Size of forward propagation buffer used in this layer
- static constexpr std::size_t kSelfBufferSize =
- CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+ static_assert(std::is_same<InputType, std::int32_t>::value, "");
- // Size of the forward propagation buffer used from the input layer to this layer
- static constexpr std::size_t kBufferSize =
- PreviousLayer::kBufferSize + kSelfBufferSize;
+ // Number of input/output dimensions
+ static constexpr IndexType kInputDimensions =
+ PreviousLayer::kOutputDimensions;
- // Hash value embedded in the evaluation file
- static constexpr std::uint32_t GetHashValue() {
- std::uint32_t hash_value = 0x538D24C7u;
- hash_value += PreviousLayer::GetHashValue();
- return hash_value;
- }
+ static constexpr IndexType kOutputDimensions = kInputDimensions;
- // A string that represents the structure from the input layer to this layer
- static std::string GetStructureString() {
- return "ClippedReLU[" +
- std::to_string(kOutputDimensions) + "](" +
- PreviousLayer::GetStructureString() + ")";
- }
+ // Size of forward propagation buffer used in this layer
+ static constexpr std::size_t kSelfBufferSize =
+ CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
- // Read network parameters
- bool ReadParameters(std::istream& stream) {
- return previous_layer_.ReadParameters(stream);
- }
+ // Size of the forward propagation buffer used from the input layer to this layer
+ static constexpr std::size_t kBufferSize =
+ PreviousLayer::kBufferSize + kSelfBufferSize;
- // write parameters
- bool WriteParameters(std::ostream& stream) const {
- return previous_layer_.WriteParameters(stream);
- }
+ // Hash value embedded in the evaluation file
+ static constexpr std::uint32_t GetHashValue() {
+ std::uint32_t hash_value = 0x538D24C7u;
+ hash_value += PreviousLayer::GetHashValue();
+ return hash_value;
+ }
- // Forward propagation
- const OutputType* Propagate(
- const TransformedFeatureType* transformed_features, char* buffer) const {
- const auto input = previous_layer_.Propagate(
- transformed_features, buffer + kSelfBufferSize);
- const auto output = reinterpret_cast<OutputType*>(buffer);
+ // A string that represents the structure from the input layer to this layer
+ static std::string GetStructureString() {
+ return "ClippedReLU[" +
+ std::to_string(kOutputDimensions) + "](" +
+ PreviousLayer::GetStructureString() + ")";
+ }
- #if defined(USE_AVX2)
- constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
- const __m256i kZero = _mm256_setzero_si256();
- const __m256i kOffsets = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
- const auto in = reinterpret_cast<const __m256i*>(input);
- const auto out = reinterpret_cast<__m256i*>(output);
- for (IndexType i = 0; i < kNumChunks; ++i) {
- const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
- _mm256_loadA_si256(&in[i * 4 + 0]),
- _mm256_loadA_si256(&in[i * 4 + 1])), kWeightScaleBits);
- const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
- _mm256_loadA_si256(&in[i * 4 + 2]),
- _mm256_loadA_si256(&in[i * 4 + 3])), kWeightScaleBits);
- _mm256_storeA_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
- _mm256_packs_epi16(words0, words1), kZero), kOffsets));
- }
- constexpr IndexType kStart = kNumChunks * kSimdWidth;
+ // Read network parameters
+ bool ReadParameters(std::istream& stream) {
+ return previous_layer_.ReadParameters(stream);
+ }
- #elif defined(USE_SSE2)
- constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
+ // write parameters
+ bool WriteParameters(std::ostream& stream) const {
+ return previous_layer_.WriteParameters(stream);
+ }
- #ifdef USE_SSE41
- const __m128i kZero = _mm_setzero_si128();
- #else
- const __m128i k0x80s = _mm_set1_epi8(-128);
- #endif
+ // Forward propagation
+ const OutputType* Propagate(
+ const TransformedFeatureType* transformed_features, char* buffer) const {
- const auto in = reinterpret_cast<const __m128i*>(input);
- const auto out = reinterpret_cast<__m128i*>(output);
- for (IndexType i = 0; i < kNumChunks; ++i) {
- const __m128i words0 = _mm_srai_epi16(_mm_packs_epi32(
- _mm_load_si128(&in[i * 4 + 0]),
- _mm_load_si128(&in[i * 4 + 1])), kWeightScaleBits);
- const __m128i words1 = _mm_srai_epi16(_mm_packs_epi32(
- _mm_load_si128(&in[i * 4 + 2]),
- _mm_load_si128(&in[i * 4 + 3])), kWeightScaleBits);
- const __m128i packedbytes = _mm_packs_epi16(words0, words1);
- _mm_store_si128(&out[i],
+ const auto input = previous_layer_.Propagate(
+ transformed_features, buffer + kSelfBufferSize);
+ const auto output = reinterpret_cast<OutputType*>(buffer);
- #ifdef USE_SSE41
- _mm_max_epi8(packedbytes, kZero)
- #else
- _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
- #endif
+#if defined(USE_AVX2)
+ constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
+ const __m256i kZero = _mm256_setzero_si256();
+ const __m256i kOffsets = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+ const auto in = reinterpret_cast<const __m256i*>(input);
+ const auto out = reinterpret_cast<__m256i*>(output);
+ for (IndexType i = 0; i < kNumChunks; ++i) {
+ const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
+ _mm256_loadA_si256(&in[i * 4 + 0]),
+ _mm256_loadA_si256(&in[i * 4 + 1])), kWeightScaleBits);
+ const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
+ _mm256_loadA_si256(&in[i * 4 + 2]),
+ _mm256_loadA_si256(&in[i * 4 + 3])), kWeightScaleBits);
+ _mm256_storeA_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
+ _mm256_packs_epi16(words0, words1), kZero), kOffsets));
+ }
- );
- }
- constexpr IndexType kStart = kNumChunks * kSimdWidth;
+ constexpr IndexType kStart = kNumChunks * kSimdWidth;
- #elif defined(USE_MMX)
- constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
- const __m64 k0x80s = _mm_set1_pi8(-128);
- const auto in = reinterpret_cast<const __m64*>(input);
- const auto out = reinterpret_cast<__m64*>(output);
- for (IndexType i = 0; i < kNumChunks; ++i) {
- const __m64 words0 = _mm_srai_pi16(
- _mm_packs_pi32(in[i * 4 + 0], in[i * 4 + 1]),
- kWeightScaleBits);
- const __m64 words1 = _mm_srai_pi16(
- _mm_packs_pi32(in[i * 4 + 2], in[i * 4 + 3]),
- kWeightScaleBits);
- const __m64 packedbytes = _mm_packs_pi16(words0, words1);
- out[i] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
- }
- _mm_empty();
- constexpr IndexType kStart = kNumChunks * kSimdWidth;
+#elif defined(USE_SSE2)
+ constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
- #elif defined(USE_NEON)
- constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2);
- const int8x8_t kZero = {0};
- const auto in = reinterpret_cast<const int32x4_t*>(input);
- const auto out = reinterpret_cast<int8x8_t*>(output);
- for (IndexType i = 0; i < kNumChunks; ++i) {
- int16x8_t shifted;
- const auto pack = reinterpret_cast<int16x4_t*>(&shifted);
- pack[0] = vqshrn_n_s32(in[i * 2 + 0], kWeightScaleBits);
- pack[1] = vqshrn_n_s32(in[i * 2 + 1], kWeightScaleBits);
- out[i] = vmax_s8(vqmovn_s16(shifted), kZero);
- }
- constexpr IndexType kStart = kNumChunks * (kSimdWidth / 2);
- #else
- constexpr IndexType kStart = 0;
- #endif
+#if defined(USE_SSE41)
+ const __m128i kZero = _mm_setzero_si128();
+#else
+ const __m128i k0x80s = _mm_set1_epi8(-128);
+#endif
- for (IndexType i = kStart; i < kInputDimensions; ++i) {
- output[i] = static_cast<OutputType>(
- std::max(0, std::min(127, input[i] >> kWeightScaleBits)));
- }
- return output;
- }
+ const auto in = reinterpret_cast<const __m128i*>(input);
+ const auto out = reinterpret_cast<__m128i*>(output);
+ for (IndexType i = 0; i < kNumChunks; ++i) {
+ const __m128i words0 = _mm_srai_epi16(_mm_packs_epi32(
+ _mm_load_si128(&in[i * 4 + 0]),
+ _mm_load_si128(&in[i * 4 + 1])), kWeightScaleBits);
+ const __m128i words1 = _mm_srai_epi16(_mm_packs_epi32(
+ _mm_load_si128(&in[i * 4 + 2]),
+ _mm_load_si128(&in[i * 4 + 3])), kWeightScaleBits);
+ const __m128i packedbytes = _mm_packs_epi16(words0, words1);
+ _mm_store_si128(&out[i],
- private:
- // Make the learning class a friend
- friend class Trainer<ClippedReLU>;
-
- PreviousLayer previous_layer_;
- };
+#if defined(USE_SSE41)
+ _mm_max_epi8(packedbytes, kZero)
+#else
+ _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
+#endif
+
+ );
+ }
+ constexpr IndexType kStart = kNumChunks * kSimdWidth;
+
+#elif defined(USE_MMX)
+ constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
+ const __m64 k0x80s = _mm_set1_pi8(-128);
+ const auto in = reinterpret_cast<const __m64*>(input);
+ const auto out = reinterpret_cast<__m64*>(output);
+ for (IndexType i = 0; i < kNumChunks; ++i) {
+ const __m64 words0 = _mm_srai_pi16(
+ _mm_packs_pi32(in[i * 4 + 0], in[i * 4 + 1]),
+ kWeightScaleBits);
+ const __m64 words1 = _mm_srai_pi16(
+ _mm_packs_pi32(in[i * 4 + 2], in[i * 4 + 3]),
+ kWeightScaleBits);
+ const __m64 packedbytes = _mm_packs_pi16(words0, words1);
+ out[i] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
+ }
+ _mm_empty();
+ constexpr IndexType kStart = kNumChunks * kSimdWidth;
+
+#elif defined(USE_NEON)
+ constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2);
+ const int8x8_t kZero = {0};
+ const auto in = reinterpret_cast<const int32x4_t*>(input);
+ const auto out = reinterpret_cast<int8x8_t*>(output);
+ for (IndexType i = 0; i < kNumChunks; ++i) {
+ int16x8_t shifted;
+ const auto pack = reinterpret_cast<int16x4_t*>(&shifted);
+ pack[0] = vqshrn_n_s32(in[i * 2 + 0], kWeightScaleBits);
+ pack[1] = vqshrn_n_s32(in[i * 2 + 1], kWeightScaleBits);
+ out[i] = vmax_s8(vqmovn_s16(shifted), kZero);
+ }
+ constexpr IndexType kStart = kNumChunks * (kSimdWidth / 2);
+#else
+ constexpr IndexType kStart = 0;
+#endif
+
+ for (IndexType i = kStart; i < kInputDimensions; ++i) {
+ output[i] = static_cast<OutputType>(
+ std::max(0, std::min(127, input[i] >> kWeightScaleBits)));
+ }
+ return output;
+ }
+
+ private:
+ // Make the learning class a friend
+ friend class Trainer<ClippedReLU>;
+
+ PreviousLayer previous_layer_;
+ };
} // namespace Eval::NNUE::Layers
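Note (illustrative, outside the patch): all of the ClippedReLU SIMD branches implement the same per-element activation as the scalar tail loop at the end of Propagate: shift the 32-bit pre-activation right by kWeightScaleBits and clamp to [0, 127]. A standalone sketch, assuming kWeightScaleBits = 6 (the constant actually lives in nnue_common.h):

    #include <algorithm>
    #include <cstdint>

    constexpr int kWeightScaleBits = 6;  // assumed value; defined in nnue_common.h in the real code

    inline std::uint8_t clipped_relu_reference(std::int32_t x) {
        return static_cast<std::uint8_t>(std::max(0, std::min(127, x >> kWeightScaleBits)));
    }

The packed variants get the upper clamp at 127 for free from signed saturation (_mm*_packs_*, vqshrn/vqmovn); only the clamp at zero needs an explicit max, or the 0x80 add/sub trick where _mm_max_epi8 is unavailable.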
diff --git a/src/nnue/layers/input_slice.h b/src/nnue/layers/input_slice.h
index 78756a39..9d9476a5 100644
--- a/src/nnue/layers/input_slice.h
+++ b/src/nnue/layers/input_slice.h
@@ -1,19 +1,19 @@
/*
- Stockfish, a UCI chess playing engine derived from Glaurung 2.1
- Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+ Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+ Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
- Stockfish is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
+ Stockfish is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
- Stockfish is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
+ Stockfish is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
// NNUE evaluation function layer InputSlice definition
@@ -21,59 +21,63 @@
#ifndef NNUE_LAYERS_INPUT_SLICE_H_INCLUDED
#define NNUE_LAYERS_INPUT_SLICE_H_INCLUDED
-#include "../nnue_common.h"
+#include "nnue/nnue_common.h"
+
+#include <cstdint>
+#include <string>
namespace Eval::NNUE::Layers {
-// Input layer
-template <IndexType OutputDimensions, IndexType Offset = 0>
-class InputSlice {
- public:
- // Need to maintain alignment
- static_assert(Offset % kMaxSimdWidth == 0, "");
+ // Input layer
+ template <IndexType OutputDimensions, IndexType Offset = 0>
+ class InputSlice {
+ public:
+ // Need to maintain alignment
+ static_assert(Offset % kMaxSimdWidth == 0, "");
- // Output type
- using OutputType = TransformedFeatureType;
+ // Output type
+ using OutputType = TransformedFeatureType;
- // Output dimensionality
- static constexpr IndexType kOutputDimensions = OutputDimensions;
+ // Output dimensionality
+ static constexpr IndexType kOutputDimensions = OutputDimensions;
- // Size of forward propagation buffer used from the input layer to this layer
- static constexpr std::size_t kBufferSize = 0;
+ // Size of forward propagation buffer used from the input layer to this layer
+ static constexpr std::size_t kBufferSize = 0;
- // Hash value embedded in the evaluation file
- static constexpr std::uint32_t GetHashValue() {
- std::uint32_t hash_value = 0xEC42E90Du;
- hash_value ^= kOutputDimensions ^ (Offset << 10);
- return hash_value;
- }
+ // Hash value embedded in the evaluation file
+ static constexpr std::uint32_t GetHashValue() {
+ std::uint32_t hash_value = 0xEC42E90Du;
+ hash_value ^= kOutputDimensions ^ (Offset << 10);
+ return hash_value;
+ }
- // A string that represents the structure from the input layer to this layer
- static std::string GetStructureString() {
- return "InputSlice[" + std::to_string(kOutputDimensions) + "(" +
- std::to_string(Offset) + ":" +
- std::to_string(Offset + kOutputDimensions) + ")]";
- }
+ // A string that represents the structure from the input layer to this layer
+ static std::string GetStructureString() {
+ return "InputSlice[" + std::to_string(kOutputDimensions) + "(" +
+ std::to_string(Offset) + ":" +
+ std::to_string(Offset + kOutputDimensions) + ")]";
+ }
- // Read network parameters
- bool ReadParameters(std::istream& /*stream*/) {
- return true;
- }
+ // Read network parameters
+ bool ReadParameters(std::istream& /*stream*/) {
+ return true;
+ }
- // write parameters
- bool WriteParameters(std::ostream& /*stream*/) const {
- return true;
- }
+ // write parameters
+ bool WriteParameters(std::ostream& /*stream*/) const {
+ return true;
+ }
- // Forward propagation
- const OutputType* Propagate(
- const TransformedFeatureType* transformed_features,
- char* /*buffer*/) const {
- return transformed_features + Offset;
- }
+ // Forward propagation
+ const OutputType* Propagate(
+ const TransformedFeatureType* transformed_features,
+ char* /*buffer*/) const {
- private:
-};
+ return transformed_features + Offset;
+ }
+
+ private:
+ };
} // namespace Layers
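Note (illustrative, outside the patch): InputSlice does no work at runtime; Propagate simply returns transformed_features + Offset, and the static_assert(Offset % kMaxSimdWidth == 0) guards that this offset pointer stays aligned for the SIMD loads of whichever layer consumes the slice. A minimal sketch of the idea, with an assumed stand-in value for kMaxSimdWidth:

    #include <cstddef>
    #include <cstdint>

    constexpr std::size_t kMaxSimdWidthSketch = 32;  // stand-in for kMaxSimdWidth in nnue_common.h

    template <std::size_t Offset>
    const std::uint8_t* input_slice(const std::uint8_t* transformed_features) {
        static_assert(Offset % kMaxSimdWidthSketch == 0, "slice would break SIMD alignment");
        return transformed_features + Offset;  // zero-copy view into the feature transformer output
    }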
diff --git a/src/nnue/layers/sum.h b/src/nnue/layers/sum.h
index 419ced89..c81f5850 100644
--- a/src/nnue/layers/sum.h
+++ b/src/nnue/layers/sum.h
@@ -1,159 +1,166 @@
-// Definition of layer Sum of NNUE evaluation function
-
-#ifndef _NNUE_LAYERS_SUM_H_
+#ifndef _NNUE_LAYERS_SUM_H_
#define _NNUE_LAYERS_SUM_H_
-#include "../nnue_common.h"
+#include "nnue/nnue_common.h"
-namespace Eval {
+// Definition of layer Sum of NNUE evaluation function
+namespace Eval::NNUE::Layers {
-namespace NNUE {
+ // Layer that sums the output of multiple layers
+ template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
+ class Sum : public Sum<RemainingPreviousLayers...> {
+ private:
+ using Head = FirstPreviousLayer;
+ using Tail = Sum<RemainingPreviousLayers...>;
-namespace Layers {
+ public:
+ // Input/output type
+ using InputType = typename Head::OutputType;
-// Layer that sums the output of multiple layers
-template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
-class Sum : public Sum<RemainingPreviousLayers...> {
- private:
- using Head = FirstPreviousLayer;
- using Tail = Sum<RemainingPreviousLayers...>;
+ using OutputType = InputType;
- public:
- // Input/output type
- using InputType = typename Head::OutputType;
- using OutputType = InputType;
- static_assert(std::is_same<InputType, typename Tail::InputType>::value, "");
+ static_assert(std::is_same<InputType, typename Tail::InputType>::value, "");
- // number of input/output dimensions
- static constexpr IndexType kInputDimensions = Head::kOutputDimensions;
- static constexpr IndexType kOutputDimensions = kInputDimensions;
- static_assert(kInputDimensions == Tail::kInputDimensions ,"");
+ // number of input/output dimensions
+ static constexpr IndexType kInputDimensions = Head::kOutputDimensions;
- // Size of forward propagation buffer used in this layer
- static constexpr std::size_t kSelfBufferSize =
- CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+ static constexpr IndexType kOutputDimensions = kInputDimensions;
- // Size of the forward propagation buffer used from the input layer to this layer
- static constexpr std::size_t kBufferSize =
- std::max(Head::kBufferSize + kSelfBufferSize, Tail::kBufferSize);
+ static_assert(kInputDimensions == Tail::kInputDimensions ,"");
- // Hash value embedded in the evaluation function file
- static constexpr std::uint32_t GetHashValue() {
- std::uint32_t hash_value = 0xBCE400B4u;
- hash_value ^= Head::GetHashValue() >> 1;
- hash_value ^= Head::GetHashValue() << 31;
- hash_value ^= Tail::GetHashValue() >> 2;
- hash_value ^= Tail::GetHashValue() << 30;
- return hash_value;
- }
+ // Size of forward propagation buffer used in this layer
+ static constexpr std::size_t kSelfBufferSize =
+ CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
- // A string that represents the structure from the input layer to this layer
- static std::string GetStructureString() {
- return "Sum[" +
- std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
- }
+ // Size of the forward propagation buffer used from the input layer to this layer
+ static constexpr std::size_t kBufferSize =
+ std::max(Head::kBufferSize + kSelfBufferSize, Tail::kBufferSize);
- // read parameters
- bool ReadParameters(std::istream& stream) {
- if (!Tail::ReadParameters(stream)) return false;
- return previous_layer_.ReadParameters(stream);
- }
+ // Hash value embedded in the evaluation function file
+ static constexpr std::uint32_t GetHashValue() {
+ std::uint32_t hash_value = 0xBCE400B4u;
+ hash_value ^= Head::GetHashValue() >> 1;
+ hash_value ^= Head::GetHashValue() << 31;
+ hash_value ^= Tail::GetHashValue() >> 2;
+ hash_value ^= Tail::GetHashValue() << 30;
+ return hash_value;
+ }
- // write parameters
- bool WriteParameters(std::ostream& stream) const {
- if (!Tail::WriteParameters(stream)) return false;
- return previous_layer_.WriteParameters(stream);
- }
+ // A string that represents the structure from the input layer to this layer
+ static std::string GetStructureString() {
+ return "Sum[" +
+ std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
+ }
- // forward propagation
- const OutputType* Propagate(
- const TransformedFeatureType* transformed_features, char* buffer) const {
- Tail::Propagate(transformed_features, buffer);
- const auto head_output = previous_layer_.Propagate(
- transformed_features, buffer + kSelfBufferSize);
- const auto output = reinterpret_cast<OutputType*>(buffer);
- for (IndexType i = 0; i < kOutputDimensions; ++i) {
- output[i] += head_output[i];
- }
- return output;
- }
-
- protected:
- // A string that represents the list of layers to be summed
- static std::string GetSummandsString() {
- return Head::GetStructureString() + "," + Tail::GetSummandsString();
- }
-
- // Make the learning class a friend
- friend class Trainer<Sum>;
+ // read parameters
+ bool ReadParameters(std::istream& stream) {
+ if (!Tail::ReadParameters(stream))
+ return false;
+
+ return previous_layer_.ReadParameters(stream);
+ }
+
+ // write parameters
+ bool WriteParameters(std::ostream& stream) const {
+ if (!Tail::WriteParameters(stream))
+ return false;
- // the layer immediately before this layer
- FirstPreviousLayer previous_layer_;
-};
+ return previous_layer_.WriteParameters(stream);
+ }
-// Layer that sums the output of multiple layers (when there is one template argument)
-template <typename PreviousLayer>
-class Sum<PreviousLayer> {
- public:
- // Input/output type
- using InputType = typename PreviousLayer::OutputType;
- using OutputType = InputType;
+ // forward propagation
+ const OutputType* Propagate(
+ const TransformedFeatureType* transformed_features, char* buffer) const {
- // number of input/output dimensions
- static constexpr IndexType kInputDimensions =
- PreviousLayer::kOutputDimensions;
- static constexpr IndexType kOutputDimensions = kInputDimensions;
+ Tail::Propagate(transformed_features, buffer);
- // Size of the forward propagation buffer used from the input layer to this layer
- static constexpr std::size_t kBufferSize = PreviousLayer::kBufferSize;
+ const auto head_output = previous_layer_.Propagate(
+ transformed_features, buffer + kSelfBufferSize);
- // Hash value embedded in the evaluation function file
- static constexpr std::uint32_t GetHashValue() {
- std::uint32_t hash_value = 0xBCE400B4u;
- hash_value ^= PreviousLayer::GetHashValue() >> 1;
- hash_value ^= PreviousLayer::GetHashValue() << 31;
- return hash_value;
- }
+ const auto output = reinterpret_cast<OutputType*>(buffer);
- // A string that represents the structure from the input layer to this layer
- static std::string GetStructureString() {
- return "Sum[" +
- std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
- }
+ for (IndexType i = 0; i < kOutputDimensions; ++i)
+ output[i] += head_output[i];
+
+ return output;
+ }
+
+ protected:
+ // A string that represents the list of layers to be summed
+ static std::string GetSummandsString() {
+ return Head::GetStructureString() + "," + Tail::GetSummandsString();
+ }
+
+ // Make the learning class a friend
+ friend class Trainer<Sum>;
- protected:
- // A string that represents the list of layers to be summed
- static std::string GetSummandsString() {
- return PreviousLayer::GetStructureString();
- }
+ // the layer immediately before this layer
+ FirstPreviousLayer previous_layer_;
+ };
- // Make the learning class a friend
- friend class Trainer<Sum>;
+ // Layer that sums the output of multiple layers (when there is one template argument)
+ template <typename PreviousLayer>
+ class Sum<PreviousLayer> {
+ public:
+ // Input/output type
+ using InputType = typename PreviousLayer::OutputType;
- // the layer immediately before this layer
- PreviousLayer previous_layer_;
-};
+ using OutputType = InputType;
-} // namespace Layers
+ // number of input/output dimensions
+ static constexpr IndexType kInputDimensions =
+ PreviousLayer::kOutputDimensions;
-} // namespace NNUE
+ static constexpr IndexType kOutputDimensions = kInputDimensions;
-} // namespace Eval
+ // Size of the forward propagation buffer used from the input layer to this layer
+ static constexpr std::size_t kBufferSize = PreviousLayer::kBufferSize;
+
+ // Hash value embedded in the evaluation function file
+ static constexpr std::uint32_t GetHashValue() {
+ std::uint32_t hash_value = 0xBCE400B4u;
+ hash_value ^= PreviousLayer::GetHashValue() >> 1;
+ hash_value ^= PreviousLayer::GetHashValue() << 31;
+ return hash_value;
+ }
+
+ // A string that represents the structure from the input layer to this layer
+ static std::string GetStructureString() {
+ return "Sum[" +
+ std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
+ }
+
+ // read parameters
+ bool ReadParameters(std::istream& stream) {
+ return previous_layer_.ReadParameters(stream);
+ }
+
+ // write parameters
+ bool WriteParameters(std::ostream& stream) const {
+ return previous_layer_.WriteParameters(stream);
+ }
+
+ // forward propagation
+ const OutputType* Propagate(
+ const TransformedFeatureType* transformed_features, char* buffer) const {
+
+ return previous_layer_.Propagate(transformed_features, buffer);
+ }
+
+ protected:
+ // A string that represents the list of layers to be summed
+ static std::string GetSummandsString() {
+ return PreviousLayer::GetStructureString();
+ }
+
+ // Make the learning class a friend
+ friend class Trainer<Sum>;
+
+ // the layer immediately before this layer
+ PreviousLayer previous_layer_;
+ };
+
+} // namespace Eval::NNUE::Layers
#endif
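Note (illustrative, outside the patch): Sum works by template recursion. The tail Sum propagates first and leaves its result at the start of the shared buffer, then the head layer's output is added element-wise on top; the single-layer specialization is the base case. A toy sketch of that accumulation pattern over plain arrays (SumSketch and its propagate are made-up names, not the patch's API):

    #include <cstddef>
    #include <cstdint>

    template <typename Head, typename... Rest>
    struct SumSketch {
        static void propagate(const std::int32_t* const* outs, std::int32_t* acc, std::size_t dims) {
            SumSketch<Rest...>::propagate(outs + 1, acc, dims);  // tail first, like Tail::Propagate
            for (std::size_t i = 0; i < dims; ++i)
                acc[i] += outs[0][i];                            // then accumulate the head layer
        }
    };

    template <typename Last>
    struct SumSketch<Last> {  // base case: a single summand
        static void propagate(const std::int32_t* const* outs, std::int32_t* acc, std::size_t dims) {
            for (std::size_t i = 0; i < dims; ++i)
                acc[i] = outs[0][i];
        }
    };

This mirrors why kBufferSize is std::max(Head::kBufferSize + kSelfBufferSize, Tail::kBufferSize): head and tail reuse one scratch buffer instead of each getting its own.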