New NNUE architecture and net

Introduces a new NNUE network architecture and associated network parameters, as obtained by a new pytorch trainer. The network is already very strong at short TC, without regression at longer TC, and has potential for further improvements. https://tests.stockfishchess.org/tests/view/60a159c65085663412d0921d TC: 10s+0.1s, 1 thread ELO: 21.74 +-3.4 (95%) LOS: 100.0% Total: 10000 W: 1559 L: 934 D: 7507 Ptnml(0-2): 38, 701, 2972, 1176, 113 https://tests.stockfishchess.org/tests/view/60a187005085663412d0925b TC: 60s+0.6s, 1 thread ELO: 5.85 +-1.7 (95%) LOS: 100.0% Total: 20000 W: 1381 L: 1044 D: 17575 Ptnml(0-2): 27, 885, 7864, 1172, 52 https://tests.stockfishchess.org/tests/view/60a2beede229097940a03806 TC: 20s+0.2s, 8 threads LLR: 2.93 (-2.94,2.94) <0.50,3.50> Total: 34272 W: 1610 L: 1452 D: 31210 Ptnml(0-2): 30, 1285, 14350, 1439, 32 https://tests.stockfishchess.org/tests/view/60a2d687e229097940a03c72 TC: 60s+0.6s, 8 threads LLR: 2.94 (-2.94,2.94) <-2.50,0.50> Total: 45544 W: 1262 L: 1214 D: 43068 Ptnml(0-2): 12, 1129, 20442, 1177, 12 The network has been trained (by vondele) using the https://github.com/glinscott/nnue-pytorch/ trainer (started by glinscott), specifically the branch https://github.com/Sopel97/nnue-pytorch/tree/experiment_56. The data used are in 64 billion positions (193GB total) generated and scored with the current master net d8: https://drive.google.com/file/d/1hOOYSDKgOOp38ZmD0N4DV82TOLHzjUiF/view?usp=sharing d9: https://drive.google.com/file/d/1VlhnHL8f-20AXhGkILujnNXHwy9T-MQw/view?usp=sharing d10: https://drive.google.com/file/d/1ZC5upzBYMmMj1gMYCkt6rCxQG0GnO3Kk/view?usp=sharing fishtest_d9: https://drive.google.com/file/d/1GQHt0oNgKaHazwJFTRbXhlCN3FbUedFq/view?usp=sharing This network also contains a few architectural changes with respect to the current master: Size changed from 256x2-32-32-1 to 512x2-16-32-1 ~15-20% slower ~2x larger adds a special path for 16 valued ClippedReLU fixes affine transform code for 16 inputs/outputs, buy using InputDimensions instead of PaddedInputDimensions this is safe now because the inputs are processed in groups of 4 in the current affine transform code The feature set changed from HalfKP to HalfKAv2 Includes information about the kings like HalfKA Packs king features better, resulting in 8% size reduction compared to HalfKA The board is flipped for the black's perspective, instead of rotated like in the current master PSQT values for each feature the feature transformer now outputs a part that is fowarded directly to the output and allows learning piece values more directly than the previous network architecture. The effect is visible for high imbalance positions, where the current master network outputs evaluations skewed towards zero. 8 PSQT values per feature, chosen based on (popcount(pos.pieces()) - 1) / 4 initialized to classical material values on the start of the training 8 subnetworks (512x2->16->32->1), chosen based on (popcount(pos.pieces()) - 1) / 4 only one subnetwork is evaluated for any position, no or marginal speed loss A diagram of the network is available: https://user-images.githubusercontent.com/8037982/118656988-553a1700-b7eb-11eb-82ef-56a11cbebbf2.png A more complete description: https://github.com/glinscott/nnue-pytorch/blob/master/docs/nnue.md closes https://github.com/official-stockfish/Stockfish/pull/3474 Bench: 3806488
2025-12-26 20:16:14 +08:00 · 2021-05-18 17:36:26 +02:00
parent f90274d8ce
commit e8d64af123
13 changed files with 265 additions and 173 deletions
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -69,62 +69,15 @@ namespace Stockfish::Eval::NNUE::Layers {
      if (!previousLayer.read_parameters(stream)) return false;
      for (std::size_t i = 0; i < OutputDimensions; ++i)
        biases[i] = read_little_endian<BiasType>(stream);
-#if !defined (USE_SSSE3)
      for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
+#if !defined (USE_SSSE3)
        weights[i] = read_little_endian<WeightType>(stream);
 #else
-      std::unique_ptr<uint32_t[]> indexMap = std::make_unique<uint32_t[]>(OutputDimensions * PaddedInputDimensions);
-      for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i) {
-        const uint32_t scrambledIdx =
+        weights[
          (i / 4) % (PaddedInputDimensions / 4) * OutputDimensions * 4 +
          i / PaddedInputDimensions * 4 +
-          i % 4;
-        weights[scrambledIdx] = read_little_endian<WeightType>(stream);
-        indexMap[scrambledIdx] = i;
-      }
-
-      // Determine if eights of weight and input products can be summed using 16bits
-      // without saturation. We assume worst case combinations of 0 and 127 for all inputs.
-      if (OutputDimensions > 1 && !stream.fail())
-      {
-          canSaturate16.count = 0;
-#if !defined(USE_VNNI)
-          for (IndexType i = 0; i < PaddedInputDimensions; i += 16)
-              for (IndexType j = 0; j < OutputDimensions; ++j)
-                  for (int x = 0; x < 2; ++x)
-                  {
-                      WeightType* w = &weights[i * OutputDimensions + j * 4 + x * 2];
-                      int sum[2] = {0, 0};
-                      for (int k = 0; k < 8; ++k)
-                      {
-                          IndexType idx = k / 2 * OutputDimensions * 4 + k % 2;
-                          sum[w[idx] < 0] += w[idx];
-                      }
-                      for (int sign : { -1, 1 })
-                          while (sign * sum[sign == -1] > 258)
-                          {
-                              int maxK = 0, maxW = 0;
-                              for (int k = 0; k < 8; ++k)
-                              {
-                                  IndexType idx = k / 2 * OutputDimensions * 4 + k % 2;
-                                  if (maxW < sign * w[idx])
-                                      maxK = k, maxW = sign * w[idx];
-                              }
-
-                              IndexType idx = maxK / 2 * OutputDimensions * 4 + maxK % 2;
-                              sum[sign == -1] -= w[idx];
-                              const uint32_t scrambledIdx = idx + i * OutputDimensions + j * 4 + x * 2;
-                              canSaturate16.add(j, i + maxK / 2 * 4 + maxK % 2 + x * 2, w[idx], indexMap[scrambledIdx]);
-                              w[idx] = 0;
-                          }
-                  }
-
-          // Non functional optimization for faster more linear access
-          std::sort(canSaturate16.ids, canSaturate16.ids + canSaturate16.count,
-                    [](const typename CanSaturate::Entry& e1, const typename CanSaturate::Entry& e2)
-                    { return e1.in == e2.in ? e1.out < e2.out : e1.in < e2.in; });
-#endif
-      }
+          i % 4
+        ] = read_little_endian<WeightType>(stream);
 #endif

      return !stream.fail();
@@ -148,8 +101,6 @@ namespace Stockfish::Eval::NNUE::Layers {
                i % 4
              ];
      }
-      for (int i = 0; i < canSaturate16.count; ++i)
-          unscrambledWeights[canSaturate16.ids[i].wIdx] = canSaturate16.ids[i].w;

      for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
          write_little_endian<WeightType>(stream, unscrambledWeights[i]);
@@ -194,11 +145,11 @@ namespace Stockfish::Eval::NNUE::Layers {
        __m512i product1 = _mm512_maddubs_epi16(a1, b1);
        __m512i product2 = _mm512_maddubs_epi16(a2, b2);
        __m512i product3 = _mm512_maddubs_epi16(a3, b3);
-        product0 = _mm512_add_epi16(product0, product1);
-        product2 = _mm512_add_epi16(product2, product3);
-        product0 = _mm512_add_epi16(product0, product2);
+        product0 = _mm512_adds_epi16(product0, product1);
        product0 = _mm512_madd_epi16(product0, Ones512);
-        acc = _mm512_add_epi32(acc, product0);
+        product2 = _mm512_adds_epi16(product2, product3);
+        product2 = _mm512_madd_epi16(product2, Ones512);
+        acc = _mm512_add_epi32(acc, _mm512_add_epi32(product0, product2));
 #endif
      };

@@ -236,11 +187,11 @@ namespace Stockfish::Eval::NNUE::Layers {
        __m256i product1 = _mm256_maddubs_epi16(a1, b1);
        __m256i product2 = _mm256_maddubs_epi16(a2, b2);
        __m256i product3 = _mm256_maddubs_epi16(a3, b3);
-        product0 = _mm256_add_epi16(product0, product1);
-        product2 = _mm256_add_epi16(product2, product3);
-        product0 = _mm256_add_epi16(product0, product2);
+        product0 = _mm256_adds_epi16(product0, product1);
        product0 = _mm256_madd_epi16(product0, Ones256);
-        acc = _mm256_add_epi32(acc, product0);
+        product2 = _mm256_adds_epi16(product2, product3);
+        product2 = _mm256_madd_epi16(product2, Ones256);
+        acc = _mm256_add_epi32(acc, _mm256_add_epi32(product0, product2));
 #endif
      };

@@ -267,11 +218,11 @@ namespace Stockfish::Eval::NNUE::Layers {
        __m128i product1 = _mm_maddubs_epi16(a1, b1);
        __m128i product2 = _mm_maddubs_epi16(a2, b2);
        __m128i product3 = _mm_maddubs_epi16(a3, b3);
-        product0 = _mm_add_epi16(product0, product1);
-        product2 = _mm_add_epi16(product2, product3);
-        product0 = _mm_add_epi16(product0, product2);
+        product0 = _mm_adds_epi16(product0, product1);
        product0 = _mm_madd_epi16(product0, Ones128);
-        acc = _mm_add_epi32(acc, product0);
+        product2 = _mm_adds_epi16(product2, product3);
+        product2 = _mm_madd_epi16(product2, Ones128);
+        acc = _mm_add_epi32(acc, _mm_add_epi32(product0, product2));
      };

 #endif
@@ -300,6 +251,8 @@ namespace Stockfish::Eval::NNUE::Layers {
 #endif

 #if defined (USE_SSSE3)
+      // Different layout, we process 4 inputs at a time, always.
+      static_assert(InputDimensions % 4 == 0);

      const auto output = reinterpret_cast<OutputType*>(buffer);
      const auto inputVector = reinterpret_cast<const vec_t*>(input);
@@ -310,7 +263,7 @@ namespace Stockfish::Eval::NNUE::Layers {
      // because then it is also an input dimension.
      if constexpr (OutputDimensions % OutputSimdWidth == 0)
      {
-          constexpr IndexType NumChunks = PaddedInputDimensions / 4;
+          constexpr IndexType NumChunks = InputDimensions / 4;

          const auto input32 = reinterpret_cast<const std::int32_t*>(input);
          vec_t* outptr = reinterpret_cast<vec_t*>(output);
@@ -329,8 +282,6 @@ namespace Stockfish::Eval::NNUE::Layers {
              for (int j = 0; j * OutputSimdWidth < OutputDimensions; ++j)
                  vec_add_dpbusd_32x4(outptr[j], in0, col0[j], in1, col1[j], in2, col2[j], in3, col3[j]);
          }
-          for (int i = 0; i < canSaturate16.count; ++i)
-              output[canSaturate16.ids[i].out] += input[canSaturate16.ids[i].in] * canSaturate16.ids[i].w;
      }
      else if constexpr (OutputDimensions == 1)
      {
@@ -377,17 +328,21 @@ namespace Stockfish::Eval::NNUE::Layers {
      auto output = reinterpret_cast<OutputType*>(buffer);

 #if defined(USE_SSE2)
-      constexpr IndexType NumChunks = PaddedInputDimensions / SimdWidth;
+      // At least a multiple of 16, with SSE2.
+      static_assert(InputDimensions % SimdWidth == 0);
+      constexpr IndexType NumChunks = InputDimensions / SimdWidth;
      const __m128i Zeros = _mm_setzero_si128();
      const auto inputVector = reinterpret_cast<const __m128i*>(input);

 #elif defined(USE_MMX)
-      constexpr IndexType NumChunks = PaddedInputDimensions / SimdWidth;
+      static_assert(InputDimensions % SimdWidth == 0);
+      constexpr IndexType NumChunks = InputDimensions / SimdWidth;
      const __m64 Zeros = _mm_setzero_si64();
      const auto inputVector = reinterpret_cast<const __m64*>(input);

 #elif defined(USE_NEON)
-      constexpr IndexType NumChunks = PaddedInputDimensions / SimdWidth;
+      static_assert(InputDimensions % SimdWidth == 0);
+      constexpr IndexType NumChunks = InputDimensions / SimdWidth;
      const auto inputVector = reinterpret_cast<const int8x8_t*>(input);
 #endif

@@ -473,25 +428,6 @@ namespace Stockfish::Eval::NNUE::Layers {

    alignas(CacheLineSize) BiasType biases[OutputDimensions];
    alignas(CacheLineSize) WeightType weights[OutputDimensions * PaddedInputDimensions];
-#if defined (USE_SSSE3)
-    struct CanSaturate {
-        int count;
-        struct Entry {
-            uint32_t wIdx;
-            uint16_t out;
-            uint16_t in;
-            int8_t w;
-        } ids[PaddedInputDimensions * OutputDimensions * 3 / 4];
-
-        void add(int i, int j, int8_t w, uint32_t wIdx) {
-            ids[count].wIdx = wIdx;
-            ids[count].out = i;
-            ids[count].in = j;
-            ids[count].w = w;
-            ++count;
-        }
-    } canSaturate16;
-#endif
  };

 }  // namespace Stockfish::Eval::NNUE::Layers
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -72,22 +72,42 @@ namespace Stockfish::Eval::NNUE::Layers {
      const auto output = reinterpret_cast<OutputType*>(buffer);

  #if defined(USE_AVX2)
-      constexpr IndexType NumChunks = InputDimensions / SimdWidth;
-      const __m256i Zero = _mm256_setzero_si256();
-      const __m256i Offsets = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
-      const auto in = reinterpret_cast<const __m256i*>(input);
-      const auto out = reinterpret_cast<__m256i*>(output);
-      for (IndexType i = 0; i < NumChunks; ++i) {
-        const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
-            _mm256_load_si256(&in[i * 4 + 0]),
-            _mm256_load_si256(&in[i * 4 + 1])), WeightScaleBits);
-        const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
-            _mm256_load_si256(&in[i * 4 + 2]),
-            _mm256_load_si256(&in[i * 4 + 3])), WeightScaleBits);
-        _mm256_store_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
-            _mm256_packs_epi16(words0, words1), Zero), Offsets));
+      if constexpr (InputDimensions % SimdWidth == 0) {
+        constexpr IndexType NumChunks = InputDimensions / SimdWidth;
+        const __m256i Zero = _mm256_setzero_si256();
+        const __m256i Offsets = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+        const auto in = reinterpret_cast<const __m256i*>(input);
+        const auto out = reinterpret_cast<__m256i*>(output);
+        for (IndexType i = 0; i < NumChunks; ++i) {
+          const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
+              _mm256_load_si256(&in[i * 4 + 0]),
+              _mm256_load_si256(&in[i * 4 + 1])), WeightScaleBits);
+          const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
+              _mm256_load_si256(&in[i * 4 + 2]),
+              _mm256_load_si256(&in[i * 4 + 3])), WeightScaleBits);
+          _mm256_store_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
+              _mm256_packs_epi16(words0, words1), Zero), Offsets));
+        }
+      } else {
+        constexpr IndexType NumChunks = InputDimensions / (SimdWidth / 2);
+        const __m128i Zero = _mm_setzero_si128();
+        const auto in = reinterpret_cast<const __m128i*>(input);
+        const auto out = reinterpret_cast<__m128i*>(output);
+        for (IndexType i = 0; i < NumChunks; ++i) {
+          const __m128i words0 = _mm_srai_epi16(_mm_packs_epi32(
+              _mm_load_si128(&in[i * 4 + 0]),
+              _mm_load_si128(&in[i * 4 + 1])), WeightScaleBits);
+          const __m128i words1 = _mm_srai_epi16(_mm_packs_epi32(
+              _mm_load_si128(&in[i * 4 + 2]),
+              _mm_load_si128(&in[i * 4 + 3])), WeightScaleBits);
+          const __m128i packedbytes = _mm_packs_epi16(words0, words1);
+          _mm_store_si128(&out[i], _mm_max_epi8(packedbytes, Zero));
+        }
      }
-      constexpr IndexType Start = NumChunks * SimdWidth;
+      constexpr IndexType Start =
+        InputDimensions % SimdWidth == 0
+        ? InputDimensions / SimdWidth * SimdWidth
+        : InputDimensions / (SimdWidth / 2) * (SimdWidth / 2);

  #elif defined(USE_SSE2)
      constexpr IndexType NumChunks = InputDimensions / SimdWidth;