Simplify sudden death time optimization

Passed Sudden Death STC:
https://tests.stockfishchess.org/tests/view/68455fe5375c2b77d9855351
LLR: 2.91 (-2.94,2.94) <-1.75,0.25>
Total: 49248 W: 13008 L: 12798 D: 23442
Ptnml(0-2): 309, 5491, 12821, 5687, 316

Passed Sudden Death LTC:
https://tests.stockfishchess.org/tests/view/6845a392375c2b77d98553cf
LLR: 3.01 (-2.94,2.94) <-1.75,0.25>
Total: 551070 W: 141699 L: 142031 D: 267340
Ptnml(0-2): 1923, 60608, 150916, 60054, 2034

Passed Standard STC:
https://tests.stockfishchess.org/tests/view/683c5ebb6ec7634154f9d989
LLR: 2.95 (-2.94,2.94) <-1.75,0.25>
Total: 142624 W: 36808 L: 36709 D: 69107
Ptnml(0-2): 302, 15448, 39745, 15483, 334

Passed Standard LTC:
https://tests.stockfishchess.org/tests/view/683f1a4f6ec7634154f9dc5a
LLR: 2.95 (-2.94,2.94) <-1.75,0.25>
Total: 146922 W: 37381 L: 37296 D: 72245
Ptnml(0-2): 69, 13552, 46117, 13671, 52

closes https://github.com/official-stockfish/Stockfish/pull/6132

Bench: 2249459
Remove eval & beta diff from NM reduction
2025-12-06 10:53:50 +08:00 · 2025-07-02 18:41:46 +02:00 · 2025-07-02 18:41:46 +02:00 · 2025-07-02 18:41:45 +02:00 · 2025-07-02 18:32:12 +02:00 · 2025-07-02 18:32:02 +02:00
47 changed files with 1568 additions and 1395 deletions
--- a/.clang-format
+++ b/.clang-format
@@ -9,14 +9,14 @@ AllowAllParametersOfDeclarationOnNextLine: true
 AllowShortCaseLabelsOnASingleLine: false
 AllowShortEnumsOnASingleLine: false
 AllowShortIfStatementsOnASingleLine: false
-AlwaysBreakTemplateDeclarations: Yes
+BreakTemplateDeclarations: Yes
 BasedOnStyle: WebKit
 BitFieldColonSpacing: After
 BinPackParameters: false
 BreakBeforeBinaryOperators: NonAssignment
 BreakBeforeBraces: Custom
 BraceWrapping:
-  AfterFunction: false 
+  AfterFunction: false
  AfterClass: false
  AfterControlStatement: true
  BeforeElse: true
--- a/.github/ci/arm_matrix.json
+++ b/.github/ci/arm_matrix.json
@@ -4,7 +4,7 @@
      "name": "Android NDK aarch64",
      "os": "ubuntu-22.04",
      "simple_name": "android",
-      "compiler": "aarch64-linux-android21-clang++",
+      "compiler": "aarch64-linux-android29-clang++",
      "emu": "qemu-aarch64",
      "comp": "ndk",
      "shell": "bash",
@@ -14,7 +14,7 @@
      "name": "Android NDK arm",
      "os": "ubuntu-22.04",
      "simple_name": "android",
-      "compiler": "armv7a-linux-androideabi21-clang++",
+      "compiler": "armv7a-linux-androideabi29-clang++",
      "emu": "qemu-arm",
      "comp": "ndk",
      "shell": "bash",
@@ -26,25 +26,25 @@
    {
      "binaries": "armv8-dotprod",
      "config": {
-        "compiler": "armv7a-linux-androideabi21-clang++"
+        "compiler": "armv7a-linux-androideabi29-clang++"
      }
    },
    {
      "binaries": "armv8",
      "config": {
-        "compiler": "armv7a-linux-androideabi21-clang++"
+        "compiler": "armv7a-linux-androideabi29-clang++"
      }
    },
    {
      "binaries": "armv7",
      "config": {
-        "compiler": "aarch64-linux-android21-clang++"
+        "compiler": "aarch64-linux-android29-clang++"
      }
    },
    {
      "binaries": "armv7-neon",
      "config": {
-        "compiler": "aarch64-linux-android21-clang++"
+        "compiler": "aarch64-linux-android29-clang++"
      }
    }
  ]
--- a/.github/ci/matrix.json
+++ b/.github/ci/matrix.json
@@ -40,6 +40,18 @@
      "ext": ".exe",
      "sde": "/d/a/Stockfish/Stockfish/.output/sde-temp-files/sde-external-9.27.0-2023-09-13-win/sde.exe -future --",
      "archive_ext": "zip"
+    },
+    {
+      "name": "Windows 11 Mingw-w64 Clang arm64",
+      "os": "windows-11-arm",
+      "simple_name": "windows",
+      "compiler": "clang++",
+      "comp": "clang",
+      "msys_sys": "clangarm64",
+      "msys_env": "clang-aarch64-clang",
+      "shell": "msys2 {0}",
+      "ext": ".exe",
+      "archive_ext": "zip"
    }
  ],
  "binaries": [
@@ -51,7 +63,9 @@
    "x86-64-avx512",
    "x86-64-vnni256",
    "x86-64-vnni512",
-    "apple-silicon"
+    "apple-silicon",
+    "armv8",
+    "armv8-dotprod"
  ],
  "exclude": [
    {
@@ -84,12 +98,6 @@
        "os": "macos-14"
      }
    },
-    {
-      "binaries": "x86-64-avxvnni",
-      "config": {
-        "os": "macos-14"
-      }
-    },
    {
      "binaries": "x86-64-avx512",
      "config": {
@@ -108,12 +116,6 @@
        "os": "macos-14"
      }
    },
-    {
-      "binaries": "x86-64-avxvnni",
-      "config": {
-        "ubuntu-22.04": null
-      }
-    },
    {
      "binaries": "x86-64-avxvnni",
      "config": {
@@ -138,6 +140,54 @@
        "os": "macos-13"
      }
    },
+    {
+      "binaries": "x86-64",
+      "config": {
+        "os": "windows-11-arm"
+      }
+    },
+    {
+      "binaries": "x86-64-sse41-popcnt",
+      "config": {
+        "os": "windows-11-arm"
+      }
+    },
+    {
+      "binaries": "x86-64-avx2",
+      "config": {
+        "os": "windows-11-arm"
+      }
+    },
+    {
+      "binaries": "x86-64-bmi2",
+      "config": {
+        "os": "windows-11-arm"
+      }
+    },
+    {
+      "binaries": "x86-64-avxvnni",
+      "config": {
+        "os": "windows-11-arm"
+      }
+    },
+    {
+      "binaries": "x86-64-avx512",
+      "config": {
+        "os": "windows-11-arm"
+      }
+    },
+    {
+      "binaries": "x86-64-vnni256",
+      "config": {
+        "os": "windows-11-arm"
+      }
+    },
+    {
+      "binaries": "x86-64-vnni512",
+      "config": {
+        "os": "windows-11-arm"
+      }
+    },
    {
      "binaries": "apple-silicon",
      "config": {
@@ -147,7 +197,13 @@
    {
      "binaries": "apple-silicon",
      "config": {
-        "os": "macos-13"
+        "os": "windows-11-arm"
+      }
+    },
+    {
+      "binaries": "apple-silicon",
+      "config": {
+        "os": "ubuntu-20.04"
      }
    },
    {
@@ -155,6 +211,72 @@
      "config": {
        "os": "ubuntu-22.04"
      }
+    },
+    {
+      "binaries": "apple-silicon",
+      "config": {
+        "os": "macos-13"
+      }
+    },
+    {
+      "binaries": "armv8",
+      "config": {
+        "os": "windows-2022"
+      }
+    },
+    {
+      "binaries": "armv8",
+      "config": {
+        "os": "ubuntu-20.04"
+      }
+    },
+    {
+      "binaries": "armv8",
+      "config": {
+        "os": "ubuntu-22.04"
+      }
+    },
+    {
+      "binaries": "armv8",
+      "config": {
+        "os": "macos-13"
+      }
+    },
+    {
+      "binaries": "armv8",
+      "config": {
+        "os": "macos-14"
+      }
+    },
+    {
+      "binaries": "armv8-dotprod",
+      "config": {
+        "os": "windows-2022"
+      }
+    },
+    {
+      "binaries": "armv8-dotprod",
+      "config": {
+        "os": "ubuntu-20.04"
+      }
+    },
+    {
+      "binaries": "armv8-dotprod",
+      "config": {
+        "os": "ubuntu-22.04"
+      }
+    },
+    {
+      "binaries": "armv8-dotprod",
+      "config": {
+        "os": "macos-13"
+      }
+    },
+    {
+      "binaries": "armv8-dotprod",
+      "config": {
+        "os": "macos-14"
+      }
    }
  ]
 }
--- a/.github/workflows/arm_compilation.yml
+++ b/.github/workflows/arm_compilation.yml
@@ -38,7 +38,7 @@ jobs:
        if: runner.os == 'Linux'
        run: |
          if [ $COMP == ndk ]; then
-            NDKV="21.4.7075529"
+            NDKV="27.2.12479018"
            ANDROID_ROOT=/usr/local/lib/android
            ANDROID_SDK_ROOT=$ANDROID_ROOT/sdk
            SDKMANAGER=$ANDROID_SDK_ROOT/cmdline-tools/latest/bin/sdkmanager
--- a/.github/workflows/clang-format.yml
+++ b/.github/workflows/clang-format.yml
@@ -25,11 +25,11 @@ jobs:
          ref: ${{ github.event.pull_request.head.sha }}

      - name: Run clang-format style check
-        uses: jidicula/clang-format-action@f62da5e3d3a2d88ff364771d9d938773a618ab5e # @v4.11.0
+        uses: jidicula/clang-format-action@4726374d1aa3c6aecf132e5197e498979588ebc8 # @v4.15.0
        id: clang-format
        continue-on-error: true
        with:
-          clang-format-version: "18"
+          clang-format-version: "20"
          exclude-regex: "incbin"

      - name: Comment on PR
@@ -37,9 +37,9 @@ jobs:
        uses: thollander/actions-comment-pull-request@fabd468d3a1a0b97feee5f6b9e499eab0dd903f6 # @v2.5.0
        with:
          message: |
-            clang-format 18 needs to be run on this PR.
+            clang-format 20 needs to be run on this PR.
            If you do not have clang-format installed, the maintainer will run it when merging.
-            For the exact version please see https://packages.ubuntu.com/noble/clang-format-18.
+            For the exact version please see https://packages.ubuntu.com/plucky/clang-format-20.

            _(execution **${{ github.run_id }}** / attempt **${{ github.run_attempt }}**)_
          comment_tag: execution
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -47,7 +47,7 @@ jobs:

      - name: Build
        working-directory: src
-        run: make -j build ARCH=x86-64-modern
+        run: make -j build

      - name: Perform CodeQL Analysis
        uses: github/codeql-action/analyze@v3
--- a/.github/workflows/compilation.yml
+++ b/.github/workflows/compilation.yml
@@ -63,13 +63,13 @@ jobs:
      - name: Check compiler
        run: $COMPCXX -v

-      - name: Show g++ cpu info
-        if: runner.os != 'macOS'
-        run: g++ -Q -march=native --help=target
-
-      - name: Show clang++ cpu info
-        if: runner.os == 'macOS'
-        run: clang++ -E - -march=native -###
+      - name: Show compiler cpu info
+        run: |
+          if [[ "$COMPCXX" == clang* ]]; then
+             $COMPCXX -E - -march=native -###
+          else
+            $COMPCXX -Q -march=native --help=target
+          fi

      # x86-64 with newer extensions tests

--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -29,24 +29,25 @@ jobs:
            shell: bash
          - name: Android NDK aarch64
            os: ubuntu-22.04
-            compiler: aarch64-linux-android21-clang++
+            compiler: aarch64-linux-android29-clang++
            comp: ndk
            run_armv8_tests: true
            shell: bash
          - name: Android NDK arm
            os: ubuntu-22.04
-            compiler: armv7a-linux-androideabi21-clang++
+            compiler: armv7a-linux-androideabi29-clang++
            comp: ndk
            run_armv7_tests: true
            shell: bash
-          - name: Linux GCC riscv64
-            os: ubuntu-22.04
-            compiler: g++
-            comp: gcc
-            run_riscv64_tests: true
-            base_image: "riscv64/alpine:edge"
-            platform: linux/riscv64
-            shell: bash
+          # Currently segfaults in the CI unrelated to a Stockfish change.
+          # - name: Linux GCC riscv64
+          #   os: ubuntu-22.04
+          #   compiler: g++
+          #   comp: gcc
+          #   run_riscv64_tests: true
+          #   base_image: "riscv64/alpine:edge"
+          #   platform: linux/riscv64
+          #   shell: bash
          - name: Linux GCC ppc64
            os: ubuntu-22.04
            compiler: g++
@@ -98,6 +99,14 @@ jobs:
            msys_sys: clang64
            msys_env: clang-x86_64-clang
            shell: msys2 {0}
+          - name: Windows 11 Mingw-w64 Clang arm64
+            os: windows-11-arm
+            compiler: clang++
+            comp: clang
+            run_armv8_tests: true
+            msys_sys: clangarm64
+            msys_env: clang-aarch64-clang
+            shell: msys2 {0}
    defaults:
      run:
        working-directory: src
@@ -118,7 +127,7 @@ jobs:
        if: runner.os == 'Linux'
        run: |
          if [ $COMP == ndk ]; then
-            NDKV="21.4.7075529"
+            NDKV="27.2.12479018"
            ANDROID_ROOT=/usr/local/lib/android
            ANDROID_SDK_ROOT=$ANDROID_ROOT/sdk
            SDKMANAGER=$ANDROID_SDK_ROOT/cmdline-tools/latest/bin/sdkmanager
@@ -302,8 +311,10 @@ jobs:
      - name: Test armv8 build
        if: matrix.config.run_armv8_tests
        run: |
-          export PATH=${{ env.ANDROID_NDK_BIN }}:$PATH
-          export LDFLAGS="-static -Wno-unused-command-line-argument"
+          if [ $COMP == ndk ]; then
+            export PATH=${{ env.ANDROID_NDK_BIN }}:$PATH
+            export LDFLAGS="-static -Wno-unused-command-line-argument"
+          fi
          make clean
          make -j4 ARCH=armv8 build
          ../tests/signature.sh $benchref
@@ -311,8 +322,10 @@ jobs:
      - name: Test armv8-dotprod build
        if: matrix.config.run_armv8_tests
        run: |
-          export PATH=${{ env.ANDROID_NDK_BIN }}:$PATH
-          export LDFLAGS="-static -Wno-unused-command-line-argument"
+          if [ $COMP == ndk ]; then
+            export PATH=${{ env.ANDROID_NDK_BIN }}:$PATH
+            export LDFLAGS="-static -Wno-unused-command-line-argument"
+          fi
          make clean
          make -j4 ARCH=armv8-dotprod build
          ../tests/signature.sh $benchref
--- a/5
+++ b/5
@@ -20,6 +20,7 @@ Alexander Kure
 Alexander Pagel (Lolligerhans)
 Alfredo Menezes (lonfom169)
 Ali AlZhrani (Cooffe)
+AliceRoselia
 Andreas Jan van der Meulen (Andyson007)
 Andreas Matthies (Matthies)
 Andrei Vetrov (proukornew)
@@ -33,6 +34,7 @@ Artem Solopiy (EntityFX)
 Auguste Pop
 Balazs Szilagyi
 Balint Pfliegel
+Baptiste Rech (breatn)
 Ben Chaney (Chaneybenjamini)
 Ben Koshy (BKSpurgeon)
 Bill Henry (VoyagerOne)
@@ -57,6 +59,7 @@ Dale Weiler (graphitemaster)
 Daniel Axtens (daxtens)
 Daniel Dugovic (ddugovic)
 Daniel Monroe (Ergodice)
+Daniel Samek (DanSamek)
 Dan Schmidt (dfannius)
 Dariusz Orzechowski (dorzechowski)
 David (dav1312)
@@ -129,6 +132,7 @@ Kenneth Lee (kennethlee33)
 Kian E (KJE-98)
 kinderchocolate
 Kiran Panditrao (Krgp)
+Kirill Zaripov (kokodio)
 Kojirion
 Krisztián Peőcz
 Krystian Kuzniarek (kuzkry)
@@ -145,6 +149,7 @@ Lucas Braesch (lucasart)
 Lyudmil Antonov (lantonov)
 Maciej Żenczykowski (zenczykowski)
 Malcolm Campbell (xoto10)
+Mark Marosi (Mapika)
 Mark Tenzer (31m059)
 marotear
 Mathias Parnaudeau (mparnaudeau)
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -59,7 +59,7 @@ discussion._

 Changes to Stockfish C++ code should respect our coding style defined by
 [.clang-format](.clang-format). You can format your changes by running
-`make format`. This requires clang-format version 18 to be installed on your system.
+`make format`. This requires clang-format version 20 to be installed on your system.

 ## Navigate

--- a/scripts/get_native_properties.sh
+++ b/scripts/get_native_properties.sh
@@ -130,7 +130,13 @@ case $uname_s in
    esac
    file_ext='tar'
    ;;
-  'CYGWIN'*|'MINGW'*|'MSYS'*) # Windows system with POSIX compatibility layer
+  'MINGW'*'ARM64'*) # Windows ARM64 system with POSIX compatibility layer
+    # TODO: older chips might be armv8, but we have no good way to detect, /proc/cpuinfo shows x86 info
+    file_os='windows'
+    true_arch='armv8-dotprod'
+    file_ext='zip'
+    ;;
+  'CYGWIN'*|'MINGW'*|'MSYS'*) # Windows x86_64 system with POSIX compatibility layer
    get_flags
    check_znver_1_2
    set_arch_x86_64
--- a/src/Makefile
+++ b/src/Makefile
@@ -60,9 +60,9 @@ SRCS = benchmark.cpp bitboard.cpp evaluate.cpp main.cpp \

 HEADERS = benchmark.h bitboard.h evaluate.h misc.h movegen.h movepick.h history.h \
 		nnue/nnue_misc.h nnue/features/half_ka_v2_hm.h nnue/layers/affine_transform.h \
-		nnue/layers/affine_transform_sparse_input.h nnue/layers/clipped_relu.h nnue/layers/simd.h \
+		nnue/layers/affine_transform_sparse_input.h nnue/layers/clipped_relu.h \
 		nnue/layers/sqr_clipped_relu.h nnue/nnue_accumulator.h nnue/nnue_architecture.h \
-		nnue/nnue_common.h nnue/nnue_feature_transformer.h position.h \
+		nnue/nnue_common.h nnue/nnue_feature_transformer.h nnue/simd.h position.h \
 		search.h syzygy/tbprobe.h thread.h thread_win32_osx.h timeman.h \
 		tt.h tune.h types.h uci.h ucioption.h perft.h nnue/network.h engine.h score.h numa.h memory.h

@@ -163,8 +163,8 @@ lsx = no
 lasx = no
 STRIP = strip

-ifneq ($(shell which clang-format-18 2> /dev/null),)
-	CLANG-FORMAT = clang-format-18
+ifneq ($(shell which clang-format-20 2> /dev/null),)
+	CLANG-FORMAT = clang-format-20
 else
 	CLANG-FORMAT = clang-format
 endif
@@ -533,14 +533,12 @@ ifeq ($(KERNEL),Darwin)
 	XCRUN = xcrun
 endif

-# To cross-compile for Android, NDK version r21 or later is recommended.
-# In earlier NDK versions, you'll need to pass -fno-addrsig if using GNU binutils.
-# Currently we don't know how to make PGO builds with the NDK yet.
+# To cross-compile for Android, use NDK version r27c or later.
 ifeq ($(COMP),ndk)
-	CXXFLAGS += -stdlib=libc++ -fPIE
+	CXXFLAGS += -stdlib=libc++
 	comp=clang
 	ifeq ($(arch),armv7)
-		CXX=armv7a-linux-androideabi16-clang++
+		CXX=armv7a-linux-androideabi29-clang++
 		CXXFLAGS += -mthumb -march=armv7-a -mfloat-abi=softfp -mfpu=neon
 		ifneq ($(shell which arm-linux-androideabi-strip 2>/dev/null),)
 			STRIP=arm-linux-androideabi-strip
@@ -549,7 +547,7 @@ ifeq ($(COMP),ndk)
 		endif
 	endif
 	ifeq ($(arch),armv8)
-		CXX=aarch64-linux-android21-clang++
+		CXX=aarch64-linux-android29-clang++
 		ifneq ($(shell which aarch64-linux-android-strip 2>/dev/null),)
 			STRIP=aarch64-linux-android-strip
 		else
@@ -557,14 +555,28 @@ ifeq ($(COMP),ndk)
 		endif
 	endif
 	ifeq ($(arch),x86_64)
-		CXX=x86_64-linux-android21-clang++
+		CXX=x86_64-linux-android29-clang++
 		ifneq ($(shell which x86_64-linux-android-strip 2>/dev/null),)
 			STRIP=x86_64-linux-android-strip
 		else
 			STRIP=llvm-strip
 		endif
 	endif
-	LDFLAGS += -static-libstdc++ -pie -lm -latomic
+	LDFLAGS += -static-libstdc++
+endif
+
+### Allow overwriting CXX from command line
+ifdef COMPCXX
+	CXX=$(COMPCXX)
+endif
+
+# llvm-profdata must be version compatible with the specified CXX (be it clang, or the gcc alias)
+# make -j profile-build CXX=clang++-20 COMP=clang
+# Locate the version in the same directory as the compiler used,
+# with fallback to a generic one if it can't be located
+	LLVM_PROFDATA := $(dir $(realpath $(shell which $(CXX) 2> /dev/null)))llvm-profdata
+ifeq ($(wildcard $(LLVM_PROFDATA)),)
+	LLVM_PROFDATA := llvm-profdata
 endif

 ifeq ($(comp),icx)
@@ -581,11 +593,6 @@ else
 	endif
 endif

-### Allow overwriting CXX from command line
-ifdef COMPCXX
-	CXX=$(COMPCXX)
-endif
-
 ### Sometimes gcc is really clang
 ifeq ($(COMP),gcc)
 	gccversion := $(shell $(CXX) --version 2>/dev/null)
@@ -694,7 +701,7 @@ endif
 ifeq ($(avx512),yes)
 	CXXFLAGS += -DUSE_AVX512
 	ifeq ($(comp),$(filter $(comp),gcc clang mingw icx))
-		CXXFLAGS += -mavx512f -mavx512bw
+		CXXFLAGS += -mavx512f -mavx512bw -mavx512dq -mavx512vl
 	endif
 endif

@@ -989,10 +996,6 @@ net:
 format:
 	$(CLANG-FORMAT) -i $(SRCS) $(HEADERS) -style=file

-# default target
-default:
-	help
-
 ### ==========================================================================
 ### Section 5. Private Targets
 ### ==========================================================================
@@ -1081,7 +1084,7 @@ clang-profile-make:
 	all

 clang-profile-use:
-	$(XCRUN) llvm-profdata merge -output=stockfish.profdata *.profraw
+	$(XCRUN) $(LLVM_PROFDATA) merge -output=stockfish.profdata *.profraw
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
 	EXTRACXXFLAGS='-fprofile-use=stockfish.profdata' \
 	EXTRALDFLAGS='-fprofile-use ' \
@@ -1118,6 +1121,6 @@ icx-profile-use:
 .depend: $(SRCS)
 	-@$(CXX) $(DEPENDFLAGS) -MM $(SRCS) > $@ 2> /dev/null

-ifeq (, $(filter $(MAKECMDGOALS), help strip install clean net objclean profileclean config-sanity))
+ifeq (, $(filter $(MAKECMDGOALS), help strip install clean net objclean profileclean format config-sanity))
 -include .depend
 endif
--- a/src/bitboard.cpp
+++ b/src/bitboard.cpp
@@ -32,7 +32,6 @@ uint8_t SquareDistance[SQUARE_NB][SQUARE_NB];
 Bitboard LineBB[SQUARE_NB][SQUARE_NB];
 Bitboard BetweenBB[SQUARE_NB][SQUARE_NB];
 Bitboard PseudoAttacks[PIECE_TYPE_NB][SQUARE_NB];
-Bitboard PawnAttacks[COLOR_NB][SQUARE_NB];

 alignas(64) Magic Magics[SQUARE_NB][2];

@@ -86,8 +85,8 @@ void Bitboards::init() {

    for (Square s1 = SQ_A1; s1 <= SQ_H8; ++s1)
    {
-        PawnAttacks[WHITE][s1] = pawn_attacks_bb<WHITE>(square_bb(s1));
-        PawnAttacks[BLACK][s1] = pawn_attacks_bb<BLACK>(square_bb(s1));
+        PseudoAttacks[WHITE][s1] = pawn_attacks_bb<WHITE>(square_bb(s1));
+        PseudoAttacks[BLACK][s1] = pawn_attacks_bb<BLACK>(square_bb(s1));

        for (int step : {-9, -8, -7, -1, 1, 7, 8, 9})
            PseudoAttacks[KING][s1] |= safe_destination(s1, step);
--- a/src/bitboard.h
+++ b/src/bitboard.h
@@ -62,7 +62,6 @@ extern uint8_t SquareDistance[SQUARE_NB][SQUARE_NB];
 extern Bitboard BetweenBB[SQUARE_NB][SQUARE_NB];
 extern Bitboard LineBB[SQUARE_NB][SQUARE_NB];
 extern Bitboard PseudoAttacks[PIECE_TYPE_NB][SQUARE_NB];
-extern Bitboard PawnAttacks[COLOR_NB][SQUARE_NB];


 // Magic holds all magic bitboards relevant data for a single square
@@ -103,17 +102,17 @@ constexpr Bitboard square_bb(Square s) {
 // Overloads of bitwise operators between a Bitboard and a Square for testing
 // whether a given bit is set in a bitboard, and for setting and clearing bits.

-inline Bitboard  operator&(Bitboard b, Square s) { return b & square_bb(s); }
-inline Bitboard  operator|(Bitboard b, Square s) { return b | square_bb(s); }
-inline Bitboard  operator^(Bitboard b, Square s) { return b ^ square_bb(s); }
-inline Bitboard& operator|=(Bitboard& b, Square s) { return b |= square_bb(s); }
-inline Bitboard& operator^=(Bitboard& b, Square s) { return b ^= square_bb(s); }
+constexpr Bitboard  operator&(Bitboard b, Square s) { return b & square_bb(s); }
+constexpr Bitboard  operator|(Bitboard b, Square s) { return b | square_bb(s); }
+constexpr Bitboard  operator^(Bitboard b, Square s) { return b ^ square_bb(s); }
+constexpr Bitboard& operator|=(Bitboard& b, Square s) { return b |= square_bb(s); }
+constexpr Bitboard& operator^=(Bitboard& b, Square s) { return b ^= square_bb(s); }

-inline Bitboard operator&(Square s, Bitboard b) { return b & s; }
-inline Bitboard operator|(Square s, Bitboard b) { return b | s; }
-inline Bitboard operator^(Square s, Bitboard b) { return b ^ s; }
+constexpr Bitboard operator&(Square s, Bitboard b) { return b & s; }
+constexpr Bitboard operator|(Square s, Bitboard b) { return b | s; }
+constexpr Bitboard operator^(Square s, Bitboard b) { return b ^ s; }

-inline Bitboard operator|(Square s1, Square s2) { return square_bb(s1) | s2; }
+constexpr Bitboard operator|(Square s1, Square s2) { return square_bb(s1) | s2; }

 constexpr bool more_than_one(Bitboard b) { return b & (b - 1); }

@@ -155,11 +154,6 @@ constexpr Bitboard pawn_attacks_bb(Bitboard b) {
                      : shift<SOUTH_WEST>(b) | shift<SOUTH_EAST>(b);
 }

-inline Bitboard pawn_attacks_bb(Color c, Square s) {
-
-    assert(is_ok(s));
-    return PawnAttacks[c][s];
-}

 // Returns a bitboard representing an entire line (from board edge
 // to board edge) that intersects the two given squares. If the given squares
@@ -216,10 +210,10 @@ inline int edge_distance(File f) { return std::min(f, File(FILE_H - f)); }
 // Returns the pseudo attacks of the given piece type
 // assuming an empty board.
 template<PieceType Pt>
-inline Bitboard attacks_bb(Square s) {
+inline Bitboard attacks_bb(Square s, Color c = COLOR_NB) {

-    assert((Pt != PAWN) && (is_ok(s)));
-    return PseudoAttacks[Pt][s];
+    assert((Pt != PAWN || c < COLOR_NB) && (is_ok(s)));
+    return Pt == PAWN ? PseudoAttacks[c][s] : PseudoAttacks[Pt][s];
 }


--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -38,17 +38,15 @@
 namespace Stockfish {

 // Returns a static, purely materialistic evaluation of the position from
-// the point of view of the given color. It can be divided by PawnValue to get
+// the point of view of the side to move. It can be divided by PawnValue to get
 // an approximation of the material advantage on the board in terms of pawns.
-int Eval::simple_eval(const Position& pos, Color c) {
+int Eval::simple_eval(const Position& pos) {
+    Color c = pos.side_to_move();
    return PawnValue * (pos.count<PAWN>(c) - pos.count<PAWN>(~c))
         + (pos.non_pawn_material(c) - pos.non_pawn_material(~c));
 }

-bool Eval::use_smallnet(const Position& pos) {
-    int simpleEval = simple_eval(pos, pos.side_to_move());
-    return std::abs(simpleEval) > 962;
-}
+bool Eval::use_smallnet(const Position& pos) { return std::abs(simple_eval(pos)) > 962; }

 // Evaluate is the evaluator for the outer world. It returns a static evaluation
 // of the position from the point of view of the side to move.
@@ -103,8 +101,6 @@ std::string Eval::trace(Position& pos, const Eval::NNUE::Networks& networks) {
    Eval::NNUE::AccumulatorStack accumulators;
    auto                         caches = std::make_unique<Eval::NNUE::AccumulatorCaches>(networks);

-    accumulators.reset(pos, networks, *caches);
-
    std::stringstream ss;
    ss << std::showpoint << std::noshowpos << std::fixed << std::setprecision(2);
    ss << '\n' << NNUE::trace(pos, networks, *caches) << '\n';
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -44,7 +44,7 @@ class AccumulatorStack;

 std::string trace(Position& pos, const Eval::NNUE::Networks& networks);

-int   simple_eval(const Position& pos, Color c);
+int   simple_eval(const Position& pos);
 bool  use_smallnet(const Position& pos);
 Value evaluate(const NNUE::Networks&          networks,
               const Position&                pos,
--- a/src/history.h
+++ b/src/history.h
@@ -36,7 +36,7 @@ namespace Stockfish {
 constexpr int PAWN_HISTORY_SIZE        = 512;    // has to be a power of 2
 constexpr int CORRECTION_HISTORY_SIZE  = 32768;  // has to be a power of 2
 constexpr int CORRECTION_HISTORY_LIMIT = 1024;
-constexpr int LOW_PLY_HISTORY_SIZE     = 4;
+constexpr int LOW_PLY_HISTORY_SIZE     = 5;

 static_assert((PAWN_HISTORY_SIZE & (PAWN_HISTORY_SIZE - 1)) == 0,
              "PAWN_HISTORY_SIZE has to be a power of 2");
@@ -166,6 +166,8 @@ struct CorrHistTypedef<NonPawn> {
 template<CorrHistType T>
 using CorrectionHistory = typename Detail::CorrHistTypedef<T>::type;

+using TTMoveHistory = StatsEntry<std::int16_t, 8192>;
+
 }  // namespace Stockfish

 #endif  // #ifndef HISTORY_H_INCLUDED
--- a/src/memory.h
+++ b/src/memory.h
@@ -52,7 +52,6 @@ void memory_deleter(T* ptr, FREE_FUNC free_func) {
        ptr->~T();

    free_func(ptr);
-    return;
 }

 // Frees memory which was placed there with placement new.
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -40,7 +40,7 @@ namespace Stockfish {
 namespace {

 // Version number or dev.
-constexpr std::string_view version = "17.1";
+constexpr std::string_view version = "dev";

 // Our fancy logging facility. The trick here is to replace cin.rdbuf() and
 // cout.rdbuf() with two Tie objects that tie cin and cout to a file stream. We
--- a/src/misc.h
+++ b/src/misc.h
@@ -317,6 +317,22 @@ void move_to_front(std::vector<T>& vec, Predicate pred) {
 }
 }

+#if defined(__GNUC__) && !defined(__clang__)
+    #if __GNUC__ >= 13
+        #define sf_assume(cond) __attribute__((assume(cond)))
+    #else
+        #define sf_assume(cond) \
+            do \
+            { \
+                if (!(cond)) \
+                    __builtin_unreachable(); \
+            } while (0)
+    #endif
+#else
+    // do nothing for other compilers
+    #define sf_assume(cond)
+#endif
+
 }  // namespace Stockfish

 #endif  // #ifndef MISC_H_INCLUDED
--- a/src/movegen.cpp
+++ b/src/movegen.cpp
@@ -134,7 +134,7 @@ ExtMove* generate_pawn_moves(const Position& pos, ExtMove* moveList, Bitboard ta
            if (Type == EVASIONS && (target & (pos.ep_square() + Up)))
                return moveList;

-            b1 = pawnsNotOn7 & pawn_attacks_bb(Them, pos.ep_square());
+            b1 = pawnsNotOn7 & attacks_bb<PAWN>(pos.ep_square(), Them);

            assert(b1);

--- a/src/movepick.cpp
+++ b/src/movepick.cpp
@@ -20,6 +20,7 @@

 #include <cassert>
 #include <limits>
+#include <utility>

 #include "bitboard.h"
 #include "misc.h"
@@ -55,6 +56,7 @@ enum Stages {
    QCAPTURE
 };

+
 // Sort moves in descending order up to and including a given limit.
 // The order of moves smaller than the limit is left unspecified.
 void partial_insertion_sort(ExtMove* begin, ExtMove* end, int limit) {
@@ -125,74 +127,68 @@ void MovePicker::score() {

    static_assert(Type == CAPTURES || Type == QUIETS || Type == EVASIONS, "Wrong type");

-    [[maybe_unused]] Bitboard threatenedByPawn, threatenedByMinor, threatenedByRook,
-      threatenedPieces;
+    Color us = pos.side_to_move();
+
+    [[maybe_unused]] Bitboard threatByLesser[QUEEN + 1];
    if constexpr (Type == QUIETS)
    {
-        Color us = pos.side_to_move();
-
-        threatenedByPawn = pos.attacks_by<PAWN>(~us);
-        threatenedByMinor =
-          pos.attacks_by<KNIGHT>(~us) | pos.attacks_by<BISHOP>(~us) | threatenedByPawn;
-        threatenedByRook = pos.attacks_by<ROOK>(~us) | threatenedByMinor;
-
-        // Pieces threatened by pieces of lesser material value
-        threatenedPieces = (pos.pieces(us, QUEEN) & threatenedByRook)
-                         | (pos.pieces(us, ROOK) & threatenedByMinor)
-                         | (pos.pieces(us, KNIGHT, BISHOP) & threatenedByPawn);
+        threatByLesser[KNIGHT] = threatByLesser[BISHOP] = pos.attacks_by<PAWN>(~us);
+        threatByLesser[ROOK] =
+          pos.attacks_by<KNIGHT>(~us) | pos.attacks_by<BISHOP>(~us) | threatByLesser[KNIGHT];
+        threatByLesser[QUEEN] = pos.attacks_by<ROOK>(~us) | threatByLesser[ROOK];
    }

    for (auto& m : *this)
+    {
+        const Square    from          = m.from_sq();
+        const Square    to            = m.to_sq();
+        const Piece     pc            = pos.moved_piece(m);
+        const PieceType pt            = type_of(pc);
+        const Piece     capturedPiece = pos.piece_on(to);
+
        if constexpr (Type == CAPTURES)
-            m.value =
-              7 * int(PieceValue[pos.piece_on(m.to_sq())])
-              + (*captureHistory)[pos.moved_piece(m)][m.to_sq()][type_of(pos.piece_on(m.to_sq()))];
+            m.value = (*captureHistory)[pc][to][type_of(capturedPiece)]
+                    + 7 * int(PieceValue[capturedPiece]) + 1024 * bool(pos.check_squares(pt) & to);

        else if constexpr (Type == QUIETS)
        {
-            Piece     pc   = pos.moved_piece(m);
-            PieceType pt   = type_of(pc);
-            Square    from = m.from_sq();
-            Square    to   = m.to_sq();
-
            // histories
-            m.value = 2 * (*mainHistory)[pos.side_to_move()][m.from_to()];
+            m.value = 2 * (*mainHistory)[us][m.from_to()];
            m.value += 2 * (*pawnHistory)[pawn_structure_index(pos)][pc][to];
            m.value += (*continuationHistory[0])[pc][to];
            m.value += (*continuationHistory[1])[pc][to];
            m.value += (*continuationHistory[2])[pc][to];
            m.value += (*continuationHistory[3])[pc][to];
-            m.value += (*continuationHistory[4])[pc][to] / 3;
            m.value += (*continuationHistory[5])[pc][to];

            // bonus for checks
-            m.value += bool(pos.check_squares(pt) & to) * 16384;
+            m.value += (bool(pos.check_squares(pt) & to) && pos.see_ge(m, -75)) * 16384;

-            // bonus for escaping from capture
-            m.value += threatenedPieces & from ? (pt == QUEEN && !(to & threatenedByRook)   ? 51700
-                                                  : pt == ROOK && !(to & threatenedByMinor) ? 25600
-                                                  : !(to & threatenedByPawn)                ? 14450
-                                                                                            : 0)
-                                               : 0;
-
-            // malus for putting piece en prise
-            m.value -= (pt == QUEEN ? bool(to & threatenedByRook) * 49000
-                        : pt == ROOK && bool(to & threatenedByMinor) ? 24335
-                                                                     : 0);
+            // penalty for moving to a square threatened by a lesser piece
+            // or bonus for escaping an attack by a lesser piece.
+            if (KNIGHT <= pt && pt <= QUEEN)
+            {
+                static constexpr int bonus[QUEEN + 1] = {0, 0, 144, 144, 256, 517};
+                int v = threatByLesser[pt] & to ? -95 : 100 * bool(threatByLesser[pt] & from);
+                m.value += bonus[pt] * v;
+            }

            if (ply < LOW_PLY_HISTORY_SIZE)
-                m.value += 8 * (*lowPlyHistory)[ply][m.from_to()] / (1 + 2 * ply);
+                m.value += 8 * (*lowPlyHistory)[ply][m.from_to()] / (1 + ply);
        }

        else  // Type == EVASIONS
        {
            if (pos.capture_stage(m))
-                m.value = PieceValue[pos.piece_on(m.to_sq())] + (1 << 28);
+                m.value = PieceValue[capturedPiece] + (1 << 28);
            else
-                m.value = (*mainHistory)[pos.side_to_move()][m.from_to()]
-                        + (*continuationHistory[0])[pos.moved_piece(m)][m.to_sq()]
-                        + (*pawnHistory)[pawn_structure_index(pos)][pos.moved_piece(m)][m.to_sq()];
+            {
+                m.value = (*mainHistory)[us][m.from_to()] + (*continuationHistory[0])[pc][to];
+                if (ply < LOW_PLY_HISTORY_SIZE)
+                    m.value += 2 * (*lowPlyHistory)[ply][m.from_to()] / (1 + ply);
+            }
        }
+    }
 }

 // Returns the next move satisfying a predicate function.
@@ -200,7 +196,7 @@ void MovePicker::score() {
 template<typename Pred>
 Move MovePicker::select(Pred filter) {

-    for (; cur < endMoves; ++cur)
+    for (; cur < endCur; ++cur)
        if (*cur != ttMove && filter())
            return *cur++;

@@ -212,8 +208,7 @@ Move MovePicker::select(Pred filter) {
 // picking the move with the highest score from a list of generated moves.
 Move MovePicker::next_move() {

-    auto quiet_threshold = [](Depth d) { return -3560 * d; };
-
+    constexpr int goodQuietThreshold = -14000;
 top:
    switch (stage)
    {
@@ -229,18 +224,19 @@ top:
    case PROBCUT_INIT :
    case QCAPTURE_INIT :
        cur = endBadCaptures = moves;
-        endMoves             = generate<CAPTURES>(pos, cur);
+        endCur = endCaptures = generate<CAPTURES>(pos, cur);

        score<CAPTURES>();
-        partial_insertion_sort(cur, endMoves, std::numeric_limits<int>::min());
+        partial_insertion_sort(cur, endCur, std::numeric_limits<int>::min());
        ++stage;
        goto top;

    case GOOD_CAPTURE :
        if (select([&]() {
-                // Move losing capture to endBadCaptures to be tried later
-                return pos.see_ge(*cur, -cur->value / 18) ? true
-                                                          : (*endBadCaptures++ = *cur, false);
+                if (pos.see_ge(*cur, -cur->value / 18))
+                    return true;
+                std::swap(*endBadCaptures++, *cur);
+                return false;
            }))
            return *(cur - 1);

@@ -250,29 +246,22 @@ top:
    case QUIET_INIT :
        if (!skipQuiets)
        {
-            cur      = endBadCaptures;
-            endMoves = beginBadQuiets = endBadQuiets = generate<QUIETS>(pos, cur);
+            endCur = endGenerated = generate<QUIETS>(pos, cur);

            score<QUIETS>();
-            partial_insertion_sort(cur, endMoves, quiet_threshold(depth));
+            partial_insertion_sort(cur, endCur, -3560 * depth);
        }

        ++stage;
        [[fallthrough]];

    case GOOD_QUIET :
-        if (!skipQuiets && select([]() { return true; }))
-        {
-            if ((cur - 1)->value > -7998 || (cur - 1)->value <= quiet_threshold(depth))
-                return *(cur - 1);
-
-            // Remaining quiets are bad
-            beginBadQuiets = cur - 1;
-        }
+        if (!skipQuiets && select([&]() { return cur->value > goodQuietThreshold; }))
+            return *(cur - 1);

        // Prepare the pointers to loop over the bad captures
-        cur      = moves;
-        endMoves = endBadCaptures;
+        cur    = moves;
+        endCur = endBadCaptures;

        ++stage;
        [[fallthrough]];
@@ -281,25 +270,25 @@ top:
        if (select([]() { return true; }))
            return *(cur - 1);

-        // Prepare the pointers to loop over the bad quiets
-        cur      = beginBadQuiets;
-        endMoves = endBadQuiets;
+        // Prepare the pointers to loop over quiets again
+        cur    = endCaptures;
+        endCur = endGenerated;

        ++stage;
        [[fallthrough]];

    case BAD_QUIET :
        if (!skipQuiets)
-            return select([]() { return true; });
+            return select([&]() { return cur->value <= goodQuietThreshold; });

        return Move::none();

    case EVASION_INIT :
-        cur      = moves;
-        endMoves = generate<EVASIONS>(pos, cur);
+        cur    = moves;
+        endCur = endGenerated = generate<EVASIONS>(pos, cur);

        score<EVASIONS>();
-        partial_insertion_sort(cur, endMoves, std::numeric_limits<int>::min());
+        partial_insertion_sort(cur, endCur, std::numeric_limits<int>::min());
        ++stage;
        [[fallthrough]];

@@ -317,4 +306,18 @@ top:

 void MovePicker::skip_quiet_moves() { skipQuiets = true; }

+// Returns true if any generated king or pawn move is legal; call only after all quiets and captures are generated.
+bool MovePicker::can_move_king_or_pawn() const {
+    // SEE negative captures shouldn't be returned in GOOD_CAPTURE stage
+    assert(stage > GOOD_CAPTURE && stage != EVASION_INIT);
+
+    for (const ExtMove* m = moves; m < endGenerated; ++m)
+    {
+        PieceType movedPieceType = type_of(pos.moved_piece(*m));
+        if ((movedPieceType == PAWN || movedPieceType == KING) && pos.legal(*m))
+            return true;
+    }
+    return false;
+}
+
 }  // namespace Stockfish
--- a/src/movepick.h
+++ b/src/movepick.h
@@ -50,6 +50,7 @@ class MovePicker {
    MovePicker(const Position&, Move, int, const CapturePieceToHistory*);
    Move next_move();
    void skip_quiet_moves();
+    bool can_move_king_or_pawn() const;

   private:
    template<typename Pred>
@@ -57,7 +58,7 @@ class MovePicker {
    template<GenType>
    void     score();
    ExtMove* begin() { return cur; }
-    ExtMove* end() { return endMoves; }
+    ExtMove* end() { return endCur; }

    const Position&              pos;
    const ButterflyHistory*      mainHistory;
@@ -66,7 +67,7 @@ class MovePicker {
    const PieceToHistory**       continuationHistory;
    const PawnHistory*           pawnHistory;
    Move                         ttMove;
-    ExtMove *                    cur, *endMoves, *endBadCaptures, *beginBadQuiets, *endBadQuiets;
+    ExtMove *                    cur, *endCur, *endBadCaptures, *endCaptures, *endGenerated;
    int                          stage;
    int                          threshold;
    Depth                        depth;
--- a/src/nnue/features/half_ka_v2_hm.cpp
+++ b/src/nnue/features/half_ka_v2_hm.cpp
@@ -23,7 +23,7 @@
 #include "../../bitboard.h"
 #include "../../position.h"
 #include "../../types.h"
-#include "../nnue_accumulator.h"
+#include "../nnue_common.h"

 namespace Stockfish::Eval::NNUE::Features {

@@ -58,13 +58,15 @@ void HalfKAv2_hm::append_changed_indices(Square            ksq,
                                         const DirtyPiece& dp,
                                         IndexList&        removed,
                                         IndexList&        added) {
-    for (int i = 0; i < dp.dirty_num; ++i)
-    {
-        if (dp.from[i] != SQ_NONE)
-            removed.push_back(make_index<Perspective>(dp.from[i], dp.piece[i], ksq));
-        if (dp.to[i] != SQ_NONE)
-            added.push_back(make_index<Perspective>(dp.to[i], dp.piece[i], ksq));
-    }
+    removed.push_back(make_index<Perspective>(dp.from, dp.pc, ksq));
+    if (dp.to != SQ_NONE)
+        added.push_back(make_index<Perspective>(dp.to, dp.pc, ksq));
+
+    if (dp.remove_sq != SQ_NONE)
+        removed.push_back(make_index<Perspective>(dp.remove_sq, dp.remove_pc, ksq));
+
+    if (dp.add_sq != SQ_NONE)
+        added.push_back(make_index<Perspective>(dp.add_sq, dp.add_pc, ksq));
 }

 // Explicit template instantiations
@@ -78,7 +80,7 @@ template void HalfKAv2_hm::append_changed_indices<BLACK>(Square            ksq,
                                                         IndexList&        added);

 bool HalfKAv2_hm::requires_refresh(const DirtyPiece& dirtyPiece, Color perspective) {
-    return dirtyPiece.piece[0] == make_piece(perspective, KING);
+    return dirtyPiece.pc == make_piece(perspective, KING);
 }

 }  // namespace Stockfish::Eval::NNUE::Features
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -25,7 +25,7 @@
 #include <iostream>

 #include "../nnue_common.h"
-#include "simd.h"
+#include "../simd.h"

 /*
  This file contains the definition for a fully connected layer (aka affine transform).
@@ -102,7 +102,7 @@ static void affine_transform_non_ssse3(std::int32_t*       output,
            product           = vmlal_s8(product, inputVector[j * 2 + 1], row[j * 2 + 1]);
            sum               = vpadalq_s16(sum, product);
        }
-        output[i] = Simd::neon_m128_reduce_add_epi32(sum);
+        output[i] = SIMD::neon_m128_reduce_add_epi32(sum);

        #endif
    }
@@ -191,20 +191,20 @@ class AffineTransform {
    #if defined(USE_AVX512)
            using vec_t = __m512i;
        #define vec_set_32 _mm512_set1_epi32
-        #define vec_add_dpbusd_32 Simd::m512_add_dpbusd_epi32
+        #define vec_add_dpbusd_32 SIMD::m512_add_dpbusd_epi32
    #elif defined(USE_AVX2)
            using vec_t = __m256i;
        #define vec_set_32 _mm256_set1_epi32
-        #define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32
+        #define vec_add_dpbusd_32 SIMD::m256_add_dpbusd_epi32
    #elif defined(USE_SSSE3)
            using vec_t = __m128i;
        #define vec_set_32 _mm_set1_epi32
-        #define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32
+        #define vec_add_dpbusd_32 SIMD::m128_add_dpbusd_epi32
    #elif defined(USE_NEON_DOTPROD)
            using vec_t = int32x4_t;
        #define vec_set_32 vdupq_n_s32
        #define vec_add_dpbusd_32(acc, a, b) \
-            Simd::dotprod_m128_add_dpbusd_epi32(acc, vreinterpretq_s8_s32(a), \
+            SIMD::dotprod_m128_add_dpbusd_epi32(acc, vreinterpretq_s8_s32(a), \
                                                vreinterpretq_s8_s32(b))
    #endif

@@ -245,23 +245,20 @@ class AffineTransform {
    #if defined(USE_AVX2)
            using vec_t = __m256i;
        #define vec_setzero() _mm256_setzero_si256()
-        #define vec_set_32 _mm256_set1_epi32
-        #define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32
-        #define vec_hadd Simd::m256_hadd
+        #define vec_add_dpbusd_32 SIMD::m256_add_dpbusd_epi32
+        #define vec_hadd SIMD::m256_hadd
    #elif defined(USE_SSSE3)
            using vec_t = __m128i;
        #define vec_setzero() _mm_setzero_si128()
-        #define vec_set_32 _mm_set1_epi32
-        #define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32
-        #define vec_hadd Simd::m128_hadd
+        #define vec_add_dpbusd_32 SIMD::m128_add_dpbusd_epi32
+        #define vec_hadd SIMD::m128_hadd
    #elif defined(USE_NEON_DOTPROD)
            using vec_t = int32x4_t;
        #define vec_setzero() vdupq_n_s32(0)
-        #define vec_set_32 vdupq_n_s32
        #define vec_add_dpbusd_32(acc, a, b) \
-            Simd::dotprod_m128_add_dpbusd_epi32(acc, vreinterpretq_s8_s32(a), \
+            SIMD::dotprod_m128_add_dpbusd_epi32(acc, vreinterpretq_s8_s32(a), \
                                                vreinterpretq_s8_s32(b))
-        #define vec_hadd Simd::neon_m128_hadd
+        #define vec_hadd SIMD::neon_m128_hadd
    #endif

            const auto inputVector = reinterpret_cast<const vec_t*>(input);
@@ -282,7 +279,6 @@ class AffineTransform {
            output[0] = vec_hadd(sum0, biases[0]);

    #undef vec_setzero
-    #undef vec_set_32
    #undef vec_add_dpbusd_32
    #undef vec_hadd
        }
--- a/src/nnue/layers/affine_transform_sparse_input.h
+++ b/src/nnue/layers/affine_transform_sparse_input.h
@@ -22,14 +22,12 @@
 #define NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED

 #include <algorithm>
-#include <array>
 #include <cstdint>
 #include <iostream>

 #include "../../bitboard.h"
+#include "../simd.h"
 #include "../nnue_common.h"
-#include "affine_transform.h"
-#include "simd.h"

 /*
  This file contains the definition for a fully connected layer (aka affine transform) with block sparse input.
@@ -51,11 +49,7 @@ constexpr int constexpr_lsb(uint64_t bb) {

 alignas(CacheLineSize) static constexpr struct OffsetIndices {

-    #if (USE_SSE41)
-    std::uint8_t offset_indices[256][8];
-    #else
    std::uint16_t offset_indices[256][8];
-    #endif

    constexpr OffsetIndices() :
        offset_indices() {
@@ -74,56 +68,52 @@ alignas(CacheLineSize) static constexpr struct OffsetIndices {

 } Lookup;

+    #if defined(__GNUC__) || defined(__clang__)
+        #define RESTRICT __restrict__
+    #elif defined(_MSC_VER)
+        #define RESTRICT __restrict
+    #else
+        #define RESTRICT
+    #endif
+
 // Find indices of nonzero numbers in an int32_t array
 template<const IndexType InputDimensions>
-void find_nnz(const std::int32_t* input, std::uint16_t* out, IndexType& count_out) {
-    #if defined(USE_SSSE3)
-        #if defined(USE_AVX512)
-    using vec_t = __m512i;
-            #define vec_nnz(a) _mm512_cmpgt_epi32_mask(a, _mm512_setzero_si512())
-        #elif defined(USE_AVX2)
-    using vec_t = __m256i;
-            #if defined(USE_VNNI) && !defined(USE_AVXVNNI)
-                #define vec_nnz(a) _mm256_cmpgt_epi32_mask(a, _mm256_setzero_si256())
-            #else
-                #define vec_nnz(a) \
-                    _mm256_movemask_ps( \
-                      _mm256_castsi256_ps(_mm256_cmpgt_epi32(a, _mm256_setzero_si256())))
-            #endif
-        #elif defined(USE_SSSE3)
-    using vec_t = __m128i;
-            #define vec_nnz(a) \
-                _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpgt_epi32(a, _mm_setzero_si128())))
-        #endif
-    using vec128_t = __m128i;
-        #define vec128_zero _mm_setzero_si128()
-        #define vec128_set_16(a) _mm_set1_epi16(a)
-        #if (USE_SSE41)
-            #define vec128_load(a) _mm_cvtepu8_epi16(_mm_loadl_epi64(a))
-        #else
-            #define vec128_load(a) _mm_load_si128(a)
-        #endif
-        #define vec128_storeu(a, b) _mm_storeu_si128(a, b)
-        #define vec128_add(a, b) _mm_add_epi16(a, b)
-    #elif defined(USE_NEON)
-    using vec_t                        = uint32x4_t;
-    static const std::uint32_t Mask[4] = {1, 2, 4, 8};
-        #define vec_nnz(a) vaddvq_u32(vandq_u32(vtstq_u32(a, a), vld1q_u32(Mask)))
-    using vec128_t                     = uint16x8_t;
-        #define vec128_zero vdupq_n_u16(0)
-        #define vec128_set_16(a) vdupq_n_u16(a)
-        #define vec128_load(a) vld1q_u16(reinterpret_cast<const std::uint16_t*>(a))
-        #define vec128_storeu(a, b) vst1q_u16(reinterpret_cast<std::uint16_t*>(a), b)
-        #define vec128_add(a, b) vaddq_u16(a, b)
-    #endif
-    constexpr IndexType InputSimdWidth = sizeof(vec_t) / sizeof(std::int32_t);
+void find_nnz(const std::int32_t* RESTRICT input,
+              std::uint16_t* RESTRICT      out,
+              IndexType&                   count_out) {
+
+    #ifdef USE_AVX512
+    constexpr IndexType SimdWidth = 16;  // 512 bits / 32 bits
+    constexpr IndexType NumChunks = InputDimensions / SimdWidth;
+    const __m512i       increment = _mm512_set1_epi32(SimdWidth);
+    __m512i base = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+
+    IndexType count = 0;
+    for (IndexType i = 0; i < NumChunks; ++i)
+    {
+        const __m512i inputV = _mm512_load_si512(input + i * SimdWidth);
+
+        // Get a bitmask of the nonzero lanes and gather their indices
+        const __mmask16 nnzMask = _mm512_test_epi32_mask(inputV, inputV);
+        const __m512i   nnzV    = _mm512_maskz_compress_epi32(nnzMask, base);
+        _mm512_mask_cvtepi32_storeu_epi16(out + count, 0xFFFF, nnzV);
+        count += popcount(nnzMask);
+        base = _mm512_add_epi32(base, increment);
+    }
+    count_out = count;
+
+    #else
+
+    using namespace SIMD;
+
+    constexpr IndexType InputSimdWidth = sizeof(vec_uint_t) / sizeof(std::int32_t);
    // Inputs are processed InputSimdWidth at a time and outputs are processed 8 at a time so we process in chunks of max(InputSimdWidth, 8)
    constexpr IndexType ChunkSize       = std::max<IndexType>(InputSimdWidth, 8);
    constexpr IndexType NumChunks       = InputDimensions / ChunkSize;
    constexpr IndexType InputsPerChunk  = ChunkSize / InputSimdWidth;
    constexpr IndexType OutputsPerChunk = ChunkSize / 8;

-    const auto     inputVector = reinterpret_cast<const vec_t*>(input);
+    const auto     inputVector = reinterpret_cast<const vec_uint_t*>(input);
    IndexType      count       = 0;
    vec128_t       base        = vec128_zero;
    const vec128_t increment   = vec128_set_16(8);
@@ -133,7 +123,7 @@ void find_nnz(const std::int32_t* input, std::uint16_t* out, IndexType& count_ou
        unsigned nnz = 0;
        for (IndexType j = 0; j < InputsPerChunk; ++j)
        {
-            const vec_t inputChunk = inputVector[i * InputsPerChunk + j];
+            const vec_uint_t inputChunk = inputVector[i * InputsPerChunk + j];
            nnz |= unsigned(vec_nnz(inputChunk)) << (j * InputSimdWidth);
        }
        for (IndexType j = 0; j < OutputsPerChunk; ++j)
@@ -147,13 +137,9 @@ void find_nnz(const std::int32_t* input, std::uint16_t* out, IndexType& count_ou
        }
    }
    count_out = count;
+    #endif
 }
-    #undef vec_nnz
-    #undef vec128_zero
-    #undef vec128_set_16
-    #undef vec128_load
-    #undef vec128_storeu
-    #undef vec128_add
+
 #endif

 // Sparse input implementation
@@ -232,27 +218,27 @@ class AffineTransformSparseInput {
        using invec_t  = __m512i;
        using outvec_t = __m512i;
        #define vec_set_32 _mm512_set1_epi32
-        #define vec_add_dpbusd_32 Simd::m512_add_dpbusd_epi32
+        #define vec_add_dpbusd_32 SIMD::m512_add_dpbusd_epi32
    #elif defined(USE_AVX2)
        using invec_t  = __m256i;
        using outvec_t = __m256i;
        #define vec_set_32 _mm256_set1_epi32
-        #define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32
+        #define vec_add_dpbusd_32 SIMD::m256_add_dpbusd_epi32
    #elif defined(USE_SSSE3)
        using invec_t  = __m128i;
        using outvec_t = __m128i;
        #define vec_set_32 _mm_set1_epi32
-        #define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32
+        #define vec_add_dpbusd_32 SIMD::m128_add_dpbusd_epi32
    #elif defined(USE_NEON_DOTPROD)
        using invec_t  = int8x16_t;
        using outvec_t = int32x4_t;
        #define vec_set_32(a) vreinterpretq_s8_u32(vdupq_n_u32(a))
-        #define vec_add_dpbusd_32 Simd::dotprod_m128_add_dpbusd_epi32
+        #define vec_add_dpbusd_32 SIMD::dotprod_m128_add_dpbusd_epi32
    #elif defined(USE_NEON)
        using invec_t  = int8x16_t;
        using outvec_t = int32x4_t;
        #define vec_set_32(a) vreinterpretq_s8_u32(vdupq_n_u32(a))
-        #define vec_add_dpbusd_32 Simd::neon_m128_add_dpbusd_epi32
+        #define vec_add_dpbusd_32 SIMD::neon_m128_add_dpbusd_epi32
    #endif
        static constexpr IndexType OutputSimdWidth = sizeof(outvec_t) / sizeof(OutputType);

--- a/src/nnue/layers/simd.h
+++ b/src/nnue/layers/simd.h
@@ -1,134 +0,0 @@
-/*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2025 The Stockfish developers (see AUTHORS file)
-
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef STOCKFISH_SIMD_H_INCLUDED
-#define STOCKFISH_SIMD_H_INCLUDED
-
-#if defined(USE_AVX2)
-    #include <immintrin.h>
-
-#elif defined(USE_SSE41)
-    #include <smmintrin.h>
-
-#elif defined(USE_SSSE3)
-    #include <tmmintrin.h>
-
-#elif defined(USE_SSE2)
-    #include <emmintrin.h>
-
-#elif defined(USE_NEON)
-    #include <arm_neon.h>
-#endif
-
-namespace Stockfish::Simd {
-
-#if defined(USE_AVX512)
-
-[[maybe_unused]] static int m512_hadd(__m512i sum, int bias) {
-    return _mm512_reduce_add_epi32(sum) + bias;
-}
-
-[[maybe_unused]] static void m512_add_dpbusd_epi32(__m512i& acc, __m512i a, __m512i b) {
-
-    #if defined(USE_VNNI)
-    acc = _mm512_dpbusd_epi32(acc, a, b);
-    #else
-    __m512i product0 = _mm512_maddubs_epi16(a, b);
-    product0         = _mm512_madd_epi16(product0, _mm512_set1_epi16(1));
-    acc              = _mm512_add_epi32(acc, product0);
-    #endif
-}
-
-#endif
-
-#if defined(USE_AVX2)
-
-[[maybe_unused]] static int m256_hadd(__m256i sum, int bias) {
-    __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
-    sum128         = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));
-    sum128         = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB));
-    return _mm_cvtsi128_si32(sum128) + bias;
-}
-
-[[maybe_unused]] static void m256_add_dpbusd_epi32(__m256i& acc, __m256i a, __m256i b) {
-
-    #if defined(USE_VNNI)
-    acc = _mm256_dpbusd_epi32(acc, a, b);
-    #else
-    __m256i product0 = _mm256_maddubs_epi16(a, b);
-    product0         = _mm256_madd_epi16(product0, _mm256_set1_epi16(1));
-    acc              = _mm256_add_epi32(acc, product0);
-    #endif
-}
-
-#endif
-
-#if defined(USE_SSSE3)
-
-[[maybe_unused]] static int m128_hadd(__m128i sum, int bias) {
-    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E));  //_MM_PERM_BADC
-    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1));  //_MM_PERM_CDAB
-    return _mm_cvtsi128_si32(sum) + bias;
-}
-
-[[maybe_unused]] static void m128_add_dpbusd_epi32(__m128i& acc, __m128i a, __m128i b) {
-
-    __m128i product0 = _mm_maddubs_epi16(a, b);
-    product0         = _mm_madd_epi16(product0, _mm_set1_epi16(1));
-    acc              = _mm_add_epi32(acc, product0);
-}
-
-#endif
-
-#if defined(USE_NEON_DOTPROD)
-
-[[maybe_unused]] static void
-dotprod_m128_add_dpbusd_epi32(int32x4_t& acc, int8x16_t a, int8x16_t b) {
-
-    acc = vdotq_s32(acc, a, b);
-}
-#endif
-
-#if defined(USE_NEON)
-
-[[maybe_unused]] static int neon_m128_reduce_add_epi32(int32x4_t s) {
-    #if USE_NEON >= 8
-    return vaddvq_s32(s);
-    #else
-    return s[0] + s[1] + s[2] + s[3];
-    #endif
-}
-
-[[maybe_unused]] static int neon_m128_hadd(int32x4_t sum, int bias) {
-    return neon_m128_reduce_add_epi32(sum) + bias;
-}
-
-#endif
-
-#if USE_NEON >= 8
-[[maybe_unused]] static void neon_m128_add_dpbusd_epi32(int32x4_t& acc, int8x16_t a, int8x16_t b) {
-
-    int16x8_t product0 = vmull_s8(vget_low_s8(a), vget_low_s8(b));
-    int16x8_t product1 = vmull_high_s8(a, b);
-    int16x8_t sum      = vpaddq_s16(product0, product1);
-    acc                = vpadalq_s16(acc, sum);
-}
-#endif
-}
-
-#endif  // STOCKFISH_SIMD_H_INCLUDED
--- a/src/nnue/network.cpp
+++ b/src/nnue/network.cpp
@@ -212,21 +212,11 @@ NetworkOutput
 Network<Arch, Transformer>::evaluate(const Position&                         pos,
                                     AccumulatorStack&                       accumulatorStack,
                                     AccumulatorCaches::Cache<FTDimensions>* cache) const {
-    // We manually align the arrays on the stack because with gcc < 9.3
-    // overaligning stack variables with alignas() doesn't work correctly.

    constexpr uint64_t alignment = CacheLineSize;

-#if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN)
-    TransformedFeatureType
-      transformedFeaturesUnaligned[FeatureTransformer<FTDimensions, nullptr>::BufferSize
-                                   + alignment / sizeof(TransformedFeatureType)];
-
-    auto* transformedFeatures = align_ptr_up<alignment>(&transformedFeaturesUnaligned[0]);
-#else
-    alignas(alignment) TransformedFeatureType
-      transformedFeatures[FeatureTransformer<FTDimensions, nullptr>::BufferSize];
-#endif
+    alignas(alignment)
+      TransformedFeatureType transformedFeatures[FeatureTransformer<FTDimensions>::BufferSize];

    ASSERT_ALIGNED(transformedFeatures, alignment);

@@ -284,20 +274,11 @@ NnueEvalTrace
 Network<Arch, Transformer>::trace_evaluate(const Position&                         pos,
                                           AccumulatorStack&                       accumulatorStack,
                                           AccumulatorCaches::Cache<FTDimensions>* cache) const {
-    // We manually align the arrays on the stack because with gcc < 9.3
-    // overaligning stack variables with alignas() doesn't work correctly.
+
    constexpr uint64_t alignment = CacheLineSize;

-#if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN)
-    TransformedFeatureType
-      transformedFeaturesUnaligned[FeatureTransformer<FTDimensions, nullptr>::BufferSize
-                                   + alignment / sizeof(TransformedFeatureType)];
-
-    auto* transformedFeatures = align_ptr_up<alignment>(&transformedFeaturesUnaligned[0]);
-#else
-    alignas(alignment) TransformedFeatureType
-      transformedFeatures[FeatureTransformer<FTDimensions, nullptr>::BufferSize];
-#endif
+    alignas(alignment)
+      TransformedFeatureType transformedFeatures[FeatureTransformer<FTDimensions>::BufferSize];

    ASSERT_ALIGNED(transformedFeatures, alignment);

@@ -452,12 +433,10 @@ bool Network<Arch, Transformer>::write_parameters(std::ostream&      stream,

 // Explicit template instantiations

-template class Network<
-  NetworkArchitecture<TransformedFeatureDimensionsBig, L2Big, L3Big>,
-  FeatureTransformer<TransformedFeatureDimensionsBig, &AccumulatorState::accumulatorBig>>;
+template class Network<NetworkArchitecture<TransformedFeatureDimensionsBig, L2Big, L3Big>,
+                       FeatureTransformer<TransformedFeatureDimensionsBig>>;

-template class Network<
-  NetworkArchitecture<TransformedFeatureDimensionsSmall, L2Small, L3Small>,
-  FeatureTransformer<TransformedFeatureDimensionsSmall, &AccumulatorState::accumulatorSmall>>;
+template class Network<NetworkArchitecture<TransformedFeatureDimensionsSmall, L2Small, L3Small>,
+                       FeatureTransformer<TransformedFeatureDimensionsSmall>>;

 }  // namespace Stockfish::Eval::NNUE
--- a/src/nnue/network.h
+++ b/src/nnue/network.h
@@ -32,6 +32,7 @@
 #include "../types.h"
 #include "nnue_accumulator.h"
 #include "nnue_architecture.h"
+#include "nnue_common.h"
 #include "nnue_feature_transformer.h"
 #include "nnue_misc.h"

@@ -110,13 +111,11 @@ class Network {
 };

 // Definitions of the network types
-using SmallFeatureTransformer =
-  FeatureTransformer<TransformedFeatureDimensionsSmall, &AccumulatorState::accumulatorSmall>;
+using SmallFeatureTransformer = FeatureTransformer<TransformedFeatureDimensionsSmall>;
 using SmallNetworkArchitecture =
  NetworkArchitecture<TransformedFeatureDimensionsSmall, L2Small, L3Small>;

-using BigFeatureTransformer =
-  FeatureTransformer<TransformedFeatureDimensionsBig, &AccumulatorState::accumulatorBig>;
+using BigFeatureTransformer  = FeatureTransformer<TransformedFeatureDimensionsBig>;
 using BigNetworkArchitecture = NetworkArchitecture<TransformedFeatureDimensionsBig, L2Big, L3Big>;

 using NetworkBig   = Network<BigNetworkArchitecture, BigFeatureTransformer>;
--- a/src/nnue/nnue_accumulator.cpp
+++ b/src/nnue/nnue_accumulator.cpp
@@ -19,49 +19,43 @@
 #include "nnue_accumulator.h"

 #include <cassert>
+#include <cstdint>
 #include <initializer_list>
-#include <memory>
+#include <type_traits>

 #include "../bitboard.h"
+#include "../misc.h"
 #include "../position.h"
 #include "../types.h"
-#include "network.h"
 #include "nnue_architecture.h"
-#include "nnue_common.h"
-#include "nnue_feature_transformer.h"
+#include "nnue_feature_transformer.h"  // IWYU pragma: keep
+#include "simd.h"

 namespace Stockfish::Eval::NNUE {

-#if defined(__GNUC__) && !defined(__clang__)
-    #define sf_assume(cond) \
-        do \
-        { \
-            if (!(cond)) \
-                __builtin_unreachable(); \
-        } while (0)
-#else
-    // do nothing for other compilers
-    #define sf_assume(cond)
-#endif
+using namespace SIMD;

 namespace {

-template<Color                                     Perspective,
-         IncUpdateDirection                        Direction = FORWARD,
-         IndexType                                 TransformedFeatureDimensions,
-         Accumulator<TransformedFeatureDimensions> AccumulatorState::*accPtr>
-void update_accumulator_incremental(
-  const FeatureTransformer<TransformedFeatureDimensions, accPtr>& featureTransformer,
-  const Square                                                    ksq,
-  AccumulatorState&                                               target_state,
-  const AccumulatorState&                                         computed);
+template<Color Perspective, IndexType TransformedFeatureDimensions>
+void double_inc_update(const FeatureTransformer<TransformedFeatureDimensions>& featureTransformer,
+                       const Square                                            ksq,
+                       AccumulatorState&                                       middle_state,
+                       AccumulatorState&                                       target_state,
+                       const AccumulatorState&                                 computed);

-template<Color Perspective, IndexType Dimensions, Accumulator<Dimensions> AccumulatorState::*accPtr>
-void update_accumulator_refresh_cache(
-  const FeatureTransformer<Dimensions, accPtr>& featureTransformer,
-  const Position&                               pos,
-  AccumulatorState&                             accumulatorState,
-  AccumulatorCaches::Cache<Dimensions>&         cache);
+template<Color Perspective, bool Forward, IndexType TransformedFeatureDimensions>
+void update_accumulator_incremental(
+  const FeatureTransformer<TransformedFeatureDimensions>& featureTransformer,
+  const Square                                            ksq,
+  AccumulatorState&                                       target_state,
+  const AccumulatorState&                                 computed);
+
+template<Color Perspective, IndexType Dimensions>
+void update_accumulator_refresh_cache(const FeatureTransformer<Dimensions>& featureTransformer,
+                                      const Position&                       pos,
+                                      AccumulatorState&                     accumulatorState,
+                                      AccumulatorCaches::Cache<Dimensions>& cache);

 }

@@ -71,63 +65,43 @@ void AccumulatorState::reset(const DirtyPiece& dp) noexcept {
    accumulatorSmall.computed.fill(false);
 }

-const AccumulatorState& AccumulatorStack::latest() const noexcept {
-    return m_accumulators[m_current_idx - 1];
-}
+const AccumulatorState& AccumulatorStack::latest() const noexcept { return accumulators[size - 1]; }

-AccumulatorState& AccumulatorStack::mut_latest() noexcept {
-    return m_accumulators[m_current_idx - 1];
-}
+AccumulatorState& AccumulatorStack::mut_latest() noexcept { return accumulators[size - 1]; }

-void AccumulatorStack::reset(const Position&    rootPos,
-                             const Networks&    networks,
-                             AccumulatorCaches& caches) noexcept {
-    m_current_idx = 1;
-
-    update_accumulator_refresh_cache<WHITE, TransformedFeatureDimensionsBig,
-                                     &AccumulatorState::accumulatorBig>(
-      *networks.big.featureTransformer, rootPos, m_accumulators[0], caches.big);
-    update_accumulator_refresh_cache<BLACK, TransformedFeatureDimensionsBig,
-                                     &AccumulatorState::accumulatorBig>(
-      *networks.big.featureTransformer, rootPos, m_accumulators[0], caches.big);
-
-    update_accumulator_refresh_cache<WHITE, TransformedFeatureDimensionsSmall,
-                                     &AccumulatorState::accumulatorSmall>(
-      *networks.small.featureTransformer, rootPos, m_accumulators[0], caches.small);
-    update_accumulator_refresh_cache<BLACK, TransformedFeatureDimensionsSmall,
-                                     &AccumulatorState::accumulatorSmall>(
-      *networks.small.featureTransformer, rootPos, m_accumulators[0], caches.small);
+void AccumulatorStack::reset() noexcept {
+    accumulators[0].reset({});
+    size = 1;
 }

 void AccumulatorStack::push(const DirtyPiece& dirtyPiece) noexcept {
-    assert(m_current_idx + 1 < m_accumulators.size());
-    m_accumulators[m_current_idx].reset(dirtyPiece);
-    m_current_idx++;
+    assert(size + 1 < accumulators.size());
+    accumulators[size].reset(dirtyPiece);
+    size++;
 }

 void AccumulatorStack::pop() noexcept {
-    assert(m_current_idx > 1);
-    m_current_idx--;
+    assert(size > 1);
+    size--;
 }

-template<IndexType Dimensions, Accumulator<Dimensions> AccumulatorState::*accPtr>
-void AccumulatorStack::evaluate(const Position&                               pos,
-                                const FeatureTransformer<Dimensions, accPtr>& featureTransformer,
-                                AccumulatorCaches::Cache<Dimensions>&         cache) noexcept {
+template<IndexType Dimensions>
+void AccumulatorStack::evaluate(const Position&                       pos,
+                                const FeatureTransformer<Dimensions>& featureTransformer,
+                                AccumulatorCaches::Cache<Dimensions>& cache) noexcept {

    evaluate_side<WHITE>(pos, featureTransformer, cache);
    evaluate_side<BLACK>(pos, featureTransformer, cache);
 }

-template<Color Perspective, IndexType Dimensions, Accumulator<Dimensions> AccumulatorState::*accPtr>
-void AccumulatorStack::evaluate_side(
-  const Position&                               pos,
-  const FeatureTransformer<Dimensions, accPtr>& featureTransformer,
-  AccumulatorCaches::Cache<Dimensions>&         cache) noexcept {
+template<Color Perspective, IndexType Dimensions>
+void AccumulatorStack::evaluate_side(const Position&                       pos,
+                                     const FeatureTransformer<Dimensions>& featureTransformer,
+                                     AccumulatorCaches::Cache<Dimensions>& cache) noexcept {

-    const auto last_usable_accum = find_last_usable_accumulator<Perspective, Dimensions, accPtr>();
+    const auto last_usable_accum = find_last_usable_accumulator<Perspective, Dimensions>();

-    if ((m_accumulators[last_usable_accum].*accPtr).computed[Perspective])
+    if ((accumulators[last_usable_accum].template acc<Dimensions>()).computed[Perspective])
        forward_update_incremental<Perspective>(pos, featureTransformer, last_usable_accum);

    else
@@ -139,91 +113,202 @@ void AccumulatorStack::evaluate_side(

 // Find the earliest usable accumulator, this can either be a computed accumulator or the accumulator
 // state just before a change that requires full refresh.
-template<Color Perspective, IndexType Dimensions, Accumulator<Dimensions> AccumulatorState::*accPtr>
+template<Color Perspective, IndexType Dimensions>
 std::size_t AccumulatorStack::find_last_usable_accumulator() const noexcept {

-    for (std::size_t curr_idx = m_current_idx - 1; curr_idx > 0; curr_idx--)
+    for (std::size_t curr_idx = size - 1; curr_idx > 0; curr_idx--)
    {
-        if ((m_accumulators[curr_idx].*accPtr).computed[Perspective])
+        if ((accumulators[curr_idx].template acc<Dimensions>()).computed[Perspective])
            return curr_idx;

-        if (FeatureSet::requires_refresh(m_accumulators[curr_idx].dirtyPiece, Perspective))
+        if (FeatureSet::requires_refresh(accumulators[curr_idx].dirtyPiece, Perspective))
            return curr_idx;
    }

    return 0;
 }

-template<Color Perspective, IndexType Dimensions, Accumulator<Dimensions> AccumulatorState::*accPtr>
+template<Color Perspective, IndexType Dimensions>
 void AccumulatorStack::forward_update_incremental(
-  const Position&                               pos,
-  const FeatureTransformer<Dimensions, accPtr>& featureTransformer,
-  const std::size_t                             begin) noexcept {
+  const Position&                       pos,
+  const FeatureTransformer<Dimensions>& featureTransformer,
+  const std::size_t                     begin) noexcept {

-    assert(begin < m_accumulators.size());
-    assert((m_accumulators[begin].*accPtr).computed[Perspective]);
+    assert(begin < accumulators.size());
+    assert((accumulators[begin].acc<Dimensions>()).computed[Perspective]);

    const Square ksq = pos.square<KING>(Perspective);

-    for (std::size_t next = begin + 1; next < m_current_idx; next++)
-        update_accumulator_incremental<Perspective>(featureTransformer, ksq, m_accumulators[next],
-                                                    m_accumulators[next - 1]);
+    for (std::size_t next = begin + 1; next < size; next++)
+    {
+        if (next + 1 < size)
+        {
+            DirtyPiece& dp1 = accumulators[next].dirtyPiece;
+            DirtyPiece& dp2 = accumulators[next + 1].dirtyPiece;

-    assert((latest().*accPtr).computed[Perspective]);
+            if (dp1.to != SQ_NONE && dp1.to == dp2.remove_sq)
+            {
+                const Square captureSq = dp1.to;
+                dp1.to = dp2.remove_sq = SQ_NONE;
+                double_inc_update<Perspective>(featureTransformer, ksq, accumulators[next],
+                                               accumulators[next + 1], accumulators[next - 1]);
+                dp1.to = dp2.remove_sq = captureSq;
+
+                next++;
+                continue;
+            }
+        }
+        update_accumulator_incremental<Perspective, true>(
+          featureTransformer, ksq, accumulators[next], accumulators[next - 1]);
+    }
+
+    assert((latest().acc<Dimensions>()).computed[Perspective]);
 }

-template<Color Perspective, IndexType Dimensions, Accumulator<Dimensions> AccumulatorState::*accPtr>
+template<Color Perspective, IndexType Dimensions>
 void AccumulatorStack::backward_update_incremental(
-  const Position&                               pos,
-  const FeatureTransformer<Dimensions, accPtr>& featureTransformer,
-  const std::size_t                             end) noexcept {
+  const Position&                       pos,
+  const FeatureTransformer<Dimensions>& featureTransformer,
+  const std::size_t                     end) noexcept {

-    assert(end < m_accumulators.size());
-    assert(end < m_current_idx);
-    assert((latest().*accPtr).computed[Perspective]);
+    assert(end < accumulators.size());
+    assert(end < size);
+    assert((latest().acc<Dimensions>()).computed[Perspective]);

    const Square ksq = pos.square<KING>(Perspective);

-    for (std::size_t next = m_current_idx - 2; next >= end; next--)
-        update_accumulator_incremental<Perspective, BACKWARDS>(
-          featureTransformer, ksq, m_accumulators[next], m_accumulators[next + 1]);
+    for (std::int64_t next = std::int64_t(size) - 2; next >= std::int64_t(end); next--)
+        update_accumulator_incremental<Perspective, false>(
+          featureTransformer, ksq, accumulators[next], accumulators[next + 1]);

-    assert((m_accumulators[end].*accPtr).computed[Perspective]);
+    assert((accumulators[end].acc<Dimensions>()).computed[Perspective]);
 }

 // Explicit template instantiations
-template void
-AccumulatorStack::evaluate<TransformedFeatureDimensionsBig, &AccumulatorState::accumulatorBig>(
-  const Position& pos,
-  const FeatureTransformer<TransformedFeatureDimensionsBig, &AccumulatorState::accumulatorBig>&
-                                                             featureTransformer,
+template void AccumulatorStack::evaluate<TransformedFeatureDimensionsBig>(
+  const Position&                                            pos,
+  const FeatureTransformer<TransformedFeatureDimensionsBig>& featureTransformer,
  AccumulatorCaches::Cache<TransformedFeatureDimensionsBig>& cache) noexcept;
-template void
-AccumulatorStack::evaluate<TransformedFeatureDimensionsSmall, &AccumulatorState::accumulatorSmall>(
-  const Position& pos,
-  const FeatureTransformer<TransformedFeatureDimensionsSmall, &AccumulatorState::accumulatorSmall>&
-                                                               featureTransformer,
+template void AccumulatorStack::evaluate<TransformedFeatureDimensionsSmall>(
+  const Position&                                              pos,
+  const FeatureTransformer<TransformedFeatureDimensionsSmall>& featureTransformer,
  AccumulatorCaches::Cache<TransformedFeatureDimensionsSmall>& cache) noexcept;


 namespace {

-template<Color                                     Perspective,
-         IncUpdateDirection                        Direction,
-         IndexType                                 TransformedFeatureDimensions,
-         Accumulator<TransformedFeatureDimensions> AccumulatorState::*accPtr>
+template<typename VectorWrapper,
+         IndexType Width,
+         UpdateOperation... ops,
+         typename ElementType,
+         typename... Ts,
+         std::enable_if_t<is_all_same_v<ElementType, Ts...>, bool> = true>
+void fused_row_reduce(const ElementType* in, ElementType* out, const Ts* const... rows) {
+    constexpr IndexType size = Width * sizeof(ElementType) / sizeof(typename VectorWrapper::type);
+
+    auto* vecIn  = reinterpret_cast<const typename VectorWrapper::type*>(in);
+    auto* vecOut = reinterpret_cast<typename VectorWrapper::type*>(out);
+
+    for (IndexType i = 0; i < size; ++i)
+        vecOut[i] = fused<VectorWrapper, ops...>(
+          vecIn[i], reinterpret_cast<const typename VectorWrapper::type*>(rows)[i]...);
+}
+
+template<Color Perspective, IndexType Dimensions>
+struct AccumulatorUpdateContext {
+    const FeatureTransformer<Dimensions>& featureTransformer;
+    const AccumulatorState&               from;
+    AccumulatorState&                     to;
+
+    AccumulatorUpdateContext(const FeatureTransformer<Dimensions>& ft,
+                             const AccumulatorState&               accF,
+                             AccumulatorState&                     accT) noexcept :
+        featureTransformer{ft},
+        from{accF},
+        to{accT} {}
+
+    template<UpdateOperation... ops,
+             typename... Ts,
+             std::enable_if_t<is_all_same_v<IndexType, Ts...>, bool> = true>
+    void apply(const Ts... indices) {
+        auto to_weight_vector = [&](const IndexType index) {
+            return &featureTransformer.weights[index * Dimensions];
+        };
+
+        auto to_psqt_weight_vector = [&](const IndexType index) {
+            return &featureTransformer.psqtWeights[index * PSQTBuckets];
+        };
+
+        fused_row_reduce<Vec16Wrapper, Dimensions, ops...>(
+          (from.acc<Dimensions>()).accumulation[Perspective],
+          (to.acc<Dimensions>()).accumulation[Perspective], to_weight_vector(indices)...);
+
+        fused_row_reduce<Vec32Wrapper, PSQTBuckets, ops...>(
+          (from.acc<Dimensions>()).psqtAccumulation[Perspective],
+          (to.acc<Dimensions>()).psqtAccumulation[Perspective], to_psqt_weight_vector(indices)...);
+    }
+};
+
+template<Color Perspective, IndexType Dimensions>
+auto make_accumulator_update_context(const FeatureTransformer<Dimensions>& featureTransformer,
+                                     const AccumulatorState&               accumulatorFrom,
+                                     AccumulatorState&                     accumulatorTo) noexcept {
+    return AccumulatorUpdateContext<Perspective, Dimensions>{featureTransformer, accumulatorFrom,
+                                                             accumulatorTo};
+}
+
+template<Color Perspective, IndexType TransformedFeatureDimensions>
+void double_inc_update(const FeatureTransformer<TransformedFeatureDimensions>& featureTransformer,
+                       const Square                                            ksq,
+                       AccumulatorState&                                       middle_state,
+                       AccumulatorState&                                       target_state,
+                       const AccumulatorState&                                 computed) {
+
+    assert(computed.acc<TransformedFeatureDimensions>().computed[Perspective]);
+    assert(!middle_state.acc<TransformedFeatureDimensions>().computed[Perspective]);
+    assert(!target_state.acc<TransformedFeatureDimensions>().computed[Perspective]);
+
+    FeatureSet::IndexList removed, added;
+    FeatureSet::append_changed_indices<Perspective>(ksq, middle_state.dirtyPiece, removed, added);
+    // You can't capture a piece that was just involved in castling, since the rook ends up
+    // on a square that the king passed over.
+    assert(added.size() < 2);
+    FeatureSet::append_changed_indices<Perspective>(ksq, target_state.dirtyPiece, removed, added);
+
+    assert(added.size() == 1);
+    assert(removed.size() == 2 || removed.size() == 3);
+
+    // Work around a compiler warning about uninitialized variables, reproduced on
+    // profile builds on Windows with GCC 14.2.0.
+    // TODO: remove once no longer needed.
+    sf_assume(added.size() == 1);
+    sf_assume(removed.size() == 2 || removed.size() == 3);
+
+    auto updateContext =
+      make_accumulator_update_context<Perspective>(featureTransformer, computed, target_state);
+
+    if (removed.size() == 2)
+    {
+        updateContext.template apply<Add, Sub, Sub>(added[0], removed[0], removed[1]);
+    }
+    else
+    {
+        updateContext.template apply<Add, Sub, Sub, Sub>(added[0], removed[0], removed[1],
+                                                         removed[2]);
+    }
+
+    target_state.acc<TransformedFeatureDimensions>().computed[Perspective] = true;
+}
+
+template<Color Perspective, bool Forward, IndexType TransformedFeatureDimensions>
 void update_accumulator_incremental(
-  const FeatureTransformer<TransformedFeatureDimensions, accPtr>& featureTransformer,
-  const Square                                                    ksq,
-  AccumulatorState&                                               target_state,
-  const AccumulatorState&                                         computed) {
-    [[maybe_unused]] constexpr bool Forward   = Direction == FORWARD;
-    [[maybe_unused]] constexpr bool Backwards = Direction == BACKWARDS;
+  const FeatureTransformer<TransformedFeatureDimensions>& featureTransformer,
+  const Square                                            ksq,
+  AccumulatorState&                                       target_state,
+  const AccumulatorState&                                 computed) {

-    assert(Forward != Backwards);
-
-    assert((computed.*accPtr).computed[Perspective]);
-    assert(!(target_state.*accPtr).computed[Perspective]);
+    assert((computed.acc<TransformedFeatureDimensions>()).computed[Perspective]);
+    assert(!(target_state.acc<TransformedFeatureDimensions>()).computed[Perspective]);

    // The size must be enough to contain the largest possible update.
    // That might depend on the feature set and generally relies on the
@@ -238,188 +323,52 @@ void update_accumulator_incremental(
    else
        FeatureSet::append_changed_indices<Perspective>(ksq, computed.dirtyPiece, added, removed);

-    if (removed.size() == 0 && added.size() == 0)
+    assert(added.size() == 1 || added.size() == 2);
+    assert(removed.size() == 1 || removed.size() == 2);
+    assert((Forward && added.size() <= removed.size())
+           || (!Forward && added.size() >= removed.size()));
+
+    // Work around a compiler warning about uninitialized variables, reproduced on
+    // profile builds on Windows with GCC 14.2.0.
+    // TODO: remove once no longer needed.
+    sf_assume(added.size() == 1 || added.size() == 2);
+    sf_assume(removed.size() == 1 || removed.size() == 2);
+
+    auto updateContext =
+      make_accumulator_update_context<Perspective>(featureTransformer, computed, target_state);
+
+    if ((Forward && removed.size() == 1) || (!Forward && added.size() == 1))
    {
-        std::memcpy((target_state.*accPtr).accumulation[Perspective],
-                    (computed.*accPtr).accumulation[Perspective],
-                    TransformedFeatureDimensions * sizeof(BiasType));
-        std::memcpy((target_state.*accPtr).psqtAccumulation[Perspective],
-                    (computed.*accPtr).psqtAccumulation[Perspective],
-                    PSQTBuckets * sizeof(PSQTWeightType));
+        assert(added.size() == 1 && removed.size() == 1);
+        updateContext.template apply<Add, Sub>(added[0], removed[0]);
+    }
+    else if (Forward && added.size() == 1)
+    {
+        assert(removed.size() == 2);
+        updateContext.template apply<Add, Sub, Sub>(added[0], removed[0], removed[1]);
+    }
+    else if (!Forward && removed.size() == 1)
+    {
+        assert(added.size() == 2);
+        updateContext.template apply<Add, Add, Sub>(added[0], added[1], removed[0]);
    }
    else
    {
-        assert(added.size() == 1 || added.size() == 2);
-        assert(removed.size() == 1 || removed.size() == 2);
-
-        if (Forward)
-            assert(added.size() <= removed.size());
-        else
-            assert(removed.size() <= added.size());
-
-        // Workaround compiler warning for uninitialized variables, replicated on
-        // profile builds on windows with gcc 14.2.0.
-        // TODO remove once unneeded
-        sf_assume(added.size() == 1 || added.size() == 2);
-        sf_assume(removed.size() == 1 || removed.size() == 2);
-
-#ifdef VECTOR
-        auto* accIn =
-          reinterpret_cast<const vec_t*>(&(computed.*accPtr).accumulation[Perspective][0]);
-        auto* accOut =
-          reinterpret_cast<vec_t*>(&(target_state.*accPtr).accumulation[Perspective][0]);
-
-        const IndexType offsetA0 = TransformedFeatureDimensions * added[0];
-        auto* columnA0 = reinterpret_cast<const vec_t*>(&featureTransformer.weights[offsetA0]);
-        const IndexType offsetR0 = TransformedFeatureDimensions * removed[0];
-        auto* columnR0 = reinterpret_cast<const vec_t*>(&featureTransformer.weights[offsetR0]);
-
-        if ((Forward && removed.size() == 1) || (Backwards && added.size() == 1))
-        {
-            assert(added.size() == 1 && removed.size() == 1);
-            for (IndexType i = 0;
-                 i < TransformedFeatureDimensions * sizeof(WeightType) / sizeof(vec_t); ++i)
-                accOut[i] = vec_add_16(vec_sub_16(accIn[i], columnR0[i]), columnA0[i]);
-        }
-        else if (Forward && added.size() == 1)
-        {
-            assert(removed.size() == 2);
-            const IndexType offsetR1 = TransformedFeatureDimensions * removed[1];
-            auto* columnR1 = reinterpret_cast<const vec_t*>(&featureTransformer.weights[offsetR1]);
-
-            for (IndexType i = 0;
-                 i < TransformedFeatureDimensions * sizeof(WeightType) / sizeof(vec_t); ++i)
-                accOut[i] = vec_sub_16(vec_add_16(accIn[i], columnA0[i]),
-                                       vec_add_16(columnR0[i], columnR1[i]));
-        }
-        else if (Backwards && removed.size() == 1)
-        {
-            assert(added.size() == 2);
-            const IndexType offsetA1 = TransformedFeatureDimensions * added[1];
-            auto* columnA1 = reinterpret_cast<const vec_t*>(&featureTransformer.weights[offsetA1]);
-
-            for (IndexType i = 0;
-                 i < TransformedFeatureDimensions * sizeof(WeightType) / sizeof(vec_t); ++i)
-                accOut[i] = vec_add_16(vec_add_16(accIn[i], columnA0[i]),
-                                       vec_sub_16(columnA1[i], columnR0[i]));
-        }
-        else
-        {
-            assert(added.size() == 2 && removed.size() == 2);
-            const IndexType offsetA1 = TransformedFeatureDimensions * added[1];
-            auto* columnA1 = reinterpret_cast<const vec_t*>(&featureTransformer.weights[offsetA1]);
-            const IndexType offsetR1 = TransformedFeatureDimensions * removed[1];
-            auto* columnR1 = reinterpret_cast<const vec_t*>(&featureTransformer.weights[offsetR1]);
-
-            for (IndexType i = 0;
-                 i < TransformedFeatureDimensions * sizeof(WeightType) / sizeof(vec_t); ++i)
-                accOut[i] = vec_add_16(accIn[i], vec_sub_16(vec_add_16(columnA0[i], columnA1[i]),
-                                                            vec_add_16(columnR0[i], columnR1[i])));
-        }
-
-        auto* accPsqtIn =
-          reinterpret_cast<const psqt_vec_t*>(&(computed.*accPtr).psqtAccumulation[Perspective][0]);
-        auto* accPsqtOut =
-          reinterpret_cast<psqt_vec_t*>(&(target_state.*accPtr).psqtAccumulation[Perspective][0]);
-
-        const IndexType offsetPsqtA0 = PSQTBuckets * added[0];
-        auto*           columnPsqtA0 =
-          reinterpret_cast<const psqt_vec_t*>(&featureTransformer.psqtWeights[offsetPsqtA0]);
-        const IndexType offsetPsqtR0 = PSQTBuckets * removed[0];
-        auto*           columnPsqtR0 =
-          reinterpret_cast<const psqt_vec_t*>(&featureTransformer.psqtWeights[offsetPsqtR0]);
-
-        if ((Forward && removed.size() == 1)
-            || (Backwards && added.size() == 1))  // added.size() == removed.size() == 1
-        {
-            for (std::size_t i = 0; i < PSQTBuckets * sizeof(PSQTWeightType) / sizeof(psqt_vec_t);
-                 ++i)
-                accPsqtOut[i] =
-                  vec_add_psqt_32(vec_sub_psqt_32(accPsqtIn[i], columnPsqtR0[i]), columnPsqtA0[i]);
-        }
-        else if (Forward && added.size() == 1)
-        {
-            const IndexType offsetPsqtR1 = PSQTBuckets * removed[1];
-            auto*           columnPsqtR1 =
-              reinterpret_cast<const psqt_vec_t*>(&featureTransformer.psqtWeights[offsetPsqtR1]);
-
-            for (std::size_t i = 0; i < PSQTBuckets * sizeof(PSQTWeightType) / sizeof(psqt_vec_t);
-                 ++i)
-                accPsqtOut[i] = vec_sub_psqt_32(vec_add_psqt_32(accPsqtIn[i], columnPsqtA0[i]),
-                                                vec_add_psqt_32(columnPsqtR0[i], columnPsqtR1[i]));
-        }
-        else if (Backwards && removed.size() == 1)
-        {
-            const IndexType offsetPsqtA1 = PSQTBuckets * added[1];
-            auto*           columnPsqtA1 =
-              reinterpret_cast<const psqt_vec_t*>(&featureTransformer.psqtWeights[offsetPsqtA1]);
-
-            for (std::size_t i = 0; i < PSQTBuckets * sizeof(PSQTWeightType) / sizeof(psqt_vec_t);
-                 ++i)
-                accPsqtOut[i] = vec_add_psqt_32(vec_add_psqt_32(accPsqtIn[i], columnPsqtA0[i]),
-                                                vec_sub_psqt_32(columnPsqtA1[i], columnPsqtR0[i]));
-        }
-        else
-        {
-            const IndexType offsetPsqtA1 = PSQTBuckets * added[1];
-            auto*           columnPsqtA1 =
-              reinterpret_cast<const psqt_vec_t*>(&featureTransformer.psqtWeights[offsetPsqtA1]);
-            const IndexType offsetPsqtR1 = PSQTBuckets * removed[1];
-            auto*           columnPsqtR1 =
-              reinterpret_cast<const psqt_vec_t*>(&featureTransformer.psqtWeights[offsetPsqtR1]);
-
-            for (std::size_t i = 0; i < PSQTBuckets * sizeof(PSQTWeightType) / sizeof(psqt_vec_t);
-                 ++i)
-                accPsqtOut[i] = vec_add_psqt_32(
-                  accPsqtIn[i], vec_sub_psqt_32(vec_add_psqt_32(columnPsqtA0[i], columnPsqtA1[i]),
-                                                vec_add_psqt_32(columnPsqtR0[i], columnPsqtR1[i])));
-        }
-#else
-        std::memcpy((target_state.*accPtr).accumulation[Perspective],
-                    (computed.*accPtr).accumulation[Perspective],
-                    TransformedFeatureDimensions * sizeof(BiasType));
-        std::memcpy((target_state.*accPtr).psqtAccumulation[Perspective],
-                    (computed.*accPtr).psqtAccumulation[Perspective],
-                    PSQTBuckets * sizeof(PSQTWeightType));
-
-        // Difference calculation for the deactivated features
-        for (const auto index : removed)
-        {
-            const IndexType offset = TransformedFeatureDimensions * index;
-            for (IndexType i = 0; i < TransformedFeatureDimensions; ++i)
-                (target_state.*accPtr).accumulation[Perspective][i] -=
-                  featureTransformer.weights[offset + i];
-
-            for (std::size_t i = 0; i < PSQTBuckets; ++i)
-                (target_state.*accPtr).psqtAccumulation[Perspective][i] -=
-                  featureTransformer.psqtWeights[index * PSQTBuckets + i];
-        }
-
-        // Difference calculation for the activated features
-        for (const auto index : added)
-        {
-            const IndexType offset = TransformedFeatureDimensions * index;
-            for (IndexType i = 0; i < TransformedFeatureDimensions; ++i)
-                (target_state.*accPtr).accumulation[Perspective][i] +=
-                  featureTransformer.weights[offset + i];
-
-            for (std::size_t i = 0; i < PSQTBuckets; ++i)
-                (target_state.*accPtr).psqtAccumulation[Perspective][i] +=
-                  featureTransformer.psqtWeights[index * PSQTBuckets + i];
-        }
-#endif
+        assert(added.size() == 2 && removed.size() == 2);
+        updateContext.template apply<Add, Add, Sub, Sub>(added[0], added[1], removed[0],
+                                                         removed[1]);
    }

-    (target_state.*accPtr).computed[Perspective] = true;
+    (target_state.acc<TransformedFeatureDimensions>()).computed[Perspective] = true;
 }

-template<Color Perspective, IndexType Dimensions, Accumulator<Dimensions> AccumulatorState::*accPtr>
-void update_accumulator_refresh_cache(
-  const FeatureTransformer<Dimensions, accPtr>& featureTransformer,
-  const Position&                               pos,
-  AccumulatorState&                             accumulatorState,
-  AccumulatorCaches::Cache<Dimensions>&         cache) {
-    using Tiling [[maybe_unused]] = SIMDTiling<Dimensions, Dimensions>;
+template<Color Perspective, IndexType Dimensions>
+void update_accumulator_refresh_cache(const FeatureTransformer<Dimensions>& featureTransformer,
+                                      const Position&                       pos,
+                                      AccumulatorState&                     accumulatorState,
+                                      AccumulatorCaches::Cache<Dimensions>& cache) {
+
+    using Tiling [[maybe_unused]] = SIMDTiling<Dimensions, Dimensions, PSQTBuckets>;

    const Square          ksq   = pos.square<KING>(Perspective);
    auto&                 entry = cache[ksq][Perspective];
@@ -448,12 +397,10 @@ void update_accumulator_refresh_cache(
        }
    }

-    auto& accumulator                 = accumulatorState.*accPtr;
+    auto& accumulator                 = accumulatorState.acc<Dimensions>();
    accumulator.computed[Perspective] = true;

 #ifdef VECTOR
-    const bool combineLast3 =
-      std::abs((int) removed.size() - (int) added.size()) == 1 && removed.size() + added.size() > 2;
    vec_t      acc[Tiling::NumRegs];
    psqt_vec_t psqt[Tiling::NumPsqtRegs];

@@ -466,8 +413,8 @@ void update_accumulator_refresh_cache(
        for (IndexType k = 0; k < Tiling::NumRegs; ++k)
            acc[k] = entryTile[k];

-        std::size_t i = 0;
-        for (; i < std::min(removed.size(), added.size()) - combineLast3; ++i)
+        IndexType i = 0;
+        for (; i < std::min(removed.size(), added.size()); ++i)
        {
            IndexType       indexR  = removed[i];
            const IndexType offsetR = Dimensions * indexR + j * Tiling::TileHeight;
@@ -477,60 +424,25 @@ void update_accumulator_refresh_cache(
            auto* columnA = reinterpret_cast<const vec_t*>(&featureTransformer.weights[offsetA]);

            for (IndexType k = 0; k < Tiling::NumRegs; ++k)
-                acc[k] = vec_add_16(acc[k], vec_sub_16(columnA[k], columnR[k]));
+                acc[k] = fused<Vec16Wrapper, Add, Sub>(acc[k], columnA[k], columnR[k]);
        }
-        if (combineLast3)
+        for (; i < removed.size(); ++i)
        {
-            IndexType       indexR  = removed[i];
-            const IndexType offsetR = Dimensions * indexR + j * Tiling::TileHeight;
-            auto* columnR = reinterpret_cast<const vec_t*>(&featureTransformer.weights[offsetR]);
-            IndexType       indexA  = added[i];
-            const IndexType offsetA = Dimensions * indexA + j * Tiling::TileHeight;
-            auto* columnA = reinterpret_cast<const vec_t*>(&featureTransformer.weights[offsetA]);
+            IndexType       index  = removed[i];
+            const IndexType offset = Dimensions * index + j * Tiling::TileHeight;
+            auto* column = reinterpret_cast<const vec_t*>(&featureTransformer.weights[offset]);

-            if (removed.size() > added.size())
-            {
-                IndexType       indexR2  = removed[i + 1];
-                const IndexType offsetR2 = Dimensions * indexR2 + j * Tiling::TileHeight;
-                auto*           columnR2 =
-                  reinterpret_cast<const vec_t*>(&featureTransformer.weights[offsetR2]);
-
-                for (IndexType k = 0; k < Tiling::NumRegs; ++k)
-                    acc[k] = vec_sub_16(vec_add_16(acc[k], columnA[k]),
-                                        vec_add_16(columnR[k], columnR2[k]));
-            }
-            else
-            {
-                IndexType       indexA2  = added[i + 1];
-                const IndexType offsetA2 = Dimensions * indexA2 + j * Tiling::TileHeight;
-                auto*           columnA2 =
-                  reinterpret_cast<const vec_t*>(&featureTransformer.weights[offsetA2]);
-
-                for (IndexType k = 0; k < Tiling::NumRegs; ++k)
-                    acc[k] = vec_add_16(vec_sub_16(acc[k], columnR[k]),
-                                        vec_add_16(columnA[k], columnA2[k]));
-            }
+            for (IndexType k = 0; k < Tiling::NumRegs; ++k)
+                acc[k] = vec_sub_16(acc[k], column[k]);
        }
-        else
+        for (; i < added.size(); ++i)
        {
-            for (; i < removed.size(); ++i)
-            {
-                IndexType       index  = removed[i];
-                const IndexType offset = Dimensions * index + j * Tiling::TileHeight;
-                auto* column = reinterpret_cast<const vec_t*>(&featureTransformer.weights[offset]);
+            IndexType       index  = added[i];
+            const IndexType offset = Dimensions * index + j * Tiling::TileHeight;
+            auto* column = reinterpret_cast<const vec_t*>(&featureTransformer.weights[offset]);

-                for (IndexType k = 0; k < Tiling::NumRegs; ++k)
-                    acc[k] = vec_sub_16(acc[k], column[k]);
-            }
-            for (; i < added.size(); ++i)
-            {
-                IndexType       index  = added[i];
-                const IndexType offset = Dimensions * index + j * Tiling::TileHeight;
-                auto* column = reinterpret_cast<const vec_t*>(&featureTransformer.weights[offset]);
-
-                for (IndexType k = 0; k < Tiling::NumRegs; ++k)
-                    acc[k] = vec_add_16(acc[k], column[k]);
-            }
+            for (IndexType k = 0; k < Tiling::NumRegs; ++k)
+                acc[k] = vec_add_16(acc[k], column[k]);
        }

        for (IndexType k = 0; k < Tiling::NumRegs; k++)
@@ -546,10 +458,10 @@ void update_accumulator_refresh_cache(
        auto* entryTilePsqt =
          reinterpret_cast<psqt_vec_t*>(&entry.psqtAccumulation[j * Tiling::PsqtTileHeight]);

-        for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k)
+        for (IndexType k = 0; k < Tiling::NumPsqtRegs; ++k)
            psqt[k] = entryTilePsqt[k];

-        for (std::size_t i = 0; i < removed.size(); ++i)
+        for (IndexType i = 0; i < removed.size(); ++i)
        {
            IndexType       index  = removed[i];
            const IndexType offset = PSQTBuckets * index + j * Tiling::PsqtTileHeight;
@@ -559,7 +471,7 @@ void update_accumulator_refresh_cache(
            for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k)
                psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]);
        }
-        for (std::size_t i = 0; i < added.size(); ++i)
+        for (IndexType i = 0; i < added.size(); ++i)
        {
            IndexType       index  = added[i];
            const IndexType offset = PSQTBuckets * index + j * Tiling::PsqtTileHeight;
@@ -570,9 +482,9 @@ void update_accumulator_refresh_cache(
                psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]);
        }

-        for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k)
+        for (IndexType k = 0; k < Tiling::NumPsqtRegs; ++k)
            vec_store_psqt(&entryTilePsqt[k], psqt[k]);
-        for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k)
+        for (IndexType k = 0; k < Tiling::NumPsqtRegs; ++k)
            vec_store_psqt(&accTilePsqt[k], psqt[k]);
    }

--- a/src/nnue/nnue_accumulator.h
+++ b/src/nnue/nnue_accumulator.h
@@ -37,19 +37,10 @@ class Position;

 namespace Stockfish::Eval::NNUE {

-using BiasType       = std::int16_t;
-using PSQTWeightType = std::int32_t;
-using IndexType      = std::uint32_t;
-
-struct Networks;
-
 template<IndexType Size>
 struct alignas(CacheLineSize) Accumulator;

-struct AccumulatorState;
-
-template<IndexType                                 TransformedFeatureDimensions,
-         Accumulator<TransformedFeatureDimensions> AccumulatorState::*accPtr>
+template<IndexType TransformedFeatureDimensions>
 class FeatureTransformer;

 // Class that holds the result of affine transformation of input features
@@ -121,6 +112,30 @@ struct AccumulatorState {
    Accumulator<TransformedFeatureDimensionsSmall> accumulatorSmall;
    DirtyPiece                                     dirtyPiece;

+    template<IndexType Size>
+    auto& acc() noexcept {
+        static_assert(Size == TransformedFeatureDimensionsBig
+                        || Size == TransformedFeatureDimensionsSmall,
+                      "Invalid size for accumulator");
+
+        if constexpr (Size == TransformedFeatureDimensionsBig)
+            return accumulatorBig;
+        else if constexpr (Size == TransformedFeatureDimensionsSmall)
+            return accumulatorSmall;
+    }
+
+    template<IndexType Size>
+    const auto& acc() const noexcept {
+        static_assert(Size == TransformedFeatureDimensionsBig
+                        || Size == TransformedFeatureDimensionsSmall,
+                      "Invalid size for accumulator");
+
+        if constexpr (Size == TransformedFeatureDimensionsBig)
+            return accumulatorBig;
+        else if constexpr (Size == TransformedFeatureDimensionsSmall)
+            return accumulatorSmall;
+    }
+
    void reset(const DirtyPiece& dp) noexcept;
 };

@@ -128,54 +143,43 @@ struct AccumulatorState {
 class AccumulatorStack {
   public:
    AccumulatorStack() :
-        m_accumulators(MAX_PLY + 1),
-        m_current_idx{} {}
+        accumulators(MAX_PLY + 1),
+        size{1} {}

    [[nodiscard]] const AccumulatorState& latest() const noexcept;

-    void
-    reset(const Position& rootPos, const Networks& networks, AccumulatorCaches& caches) noexcept;
+    void reset() noexcept;
    void push(const DirtyPiece& dirtyPiece) noexcept;
    void pop() noexcept;

-    template<IndexType Dimensions, Accumulator<Dimensions> AccumulatorState::*accPtr>
-    void evaluate(const Position&                               pos,
-                  const FeatureTransformer<Dimensions, accPtr>& featureTransformer,
-                  AccumulatorCaches::Cache<Dimensions>&         cache) noexcept;
+    template<IndexType Dimensions>
+    void evaluate(const Position&                       pos,
+                  const FeatureTransformer<Dimensions>& featureTransformer,
+                  AccumulatorCaches::Cache<Dimensions>& cache) noexcept;

   private:
    [[nodiscard]] AccumulatorState& mut_latest() noexcept;

-    template<Color                   Perspective,
-             IndexType               Dimensions,
-             Accumulator<Dimensions> AccumulatorState::*accPtr>
-    void evaluate_side(const Position&                               pos,
-                       const FeatureTransformer<Dimensions, accPtr>& featureTransformer,
-                       AccumulatorCaches::Cache<Dimensions>&         cache) noexcept;
+    template<Color Perspective, IndexType Dimensions>
+    void evaluate_side(const Position&                       pos,
+                       const FeatureTransformer<Dimensions>& featureTransformer,
+                       AccumulatorCaches::Cache<Dimensions>& cache) noexcept;

-    template<Color                   Perspective,
-             IndexType               Dimensions,
-             Accumulator<Dimensions> AccumulatorState::*accPtr>
+    template<Color Perspective, IndexType Dimensions>
    [[nodiscard]] std::size_t find_last_usable_accumulator() const noexcept;

-    template<Color                   Perspective,
-             IndexType               Dimensions,
-             Accumulator<Dimensions> AccumulatorState::*accPtr>
-    void
-    forward_update_incremental(const Position&                               pos,
-                               const FeatureTransformer<Dimensions, accPtr>& featureTransformer,
-                               const std::size_t                             begin) noexcept;
+    template<Color Perspective, IndexType Dimensions>
+    void forward_update_incremental(const Position&                       pos,
+                                    const FeatureTransformer<Dimensions>& featureTransformer,
+                                    const std::size_t                     begin) noexcept;

-    template<Color                   Perspective,
-             IndexType               Dimensions,
-             Accumulator<Dimensions> AccumulatorState::*accPtr>
-    void
-    backward_update_incremental(const Position&                               pos,
-                                const FeatureTransformer<Dimensions, accPtr>& featureTransformer,
-                                const std::size_t                             end) noexcept;
+    template<Color Perspective, IndexType Dimensions>
+    void backward_update_incremental(const Position&                       pos,
+                                     const FeatureTransformer<Dimensions>& featureTransformer,
+                                     const std::size_t                     end) noexcept;

-    std::vector<AccumulatorState> m_accumulators;
-    std::size_t                   m_current_idx;
+    std::vector<AccumulatorState> accumulators;
+    std::size_t                   size;
 };

 }  // namespace Stockfish::Eval::NNUE
--- a/src/nnue/nnue_architecture.h
+++ b/src/nnue/nnue_architecture.h
@@ -49,6 +49,12 @@ constexpr int       L3Small                           = 32;
 constexpr IndexType PSQTBuckets = 8;
 constexpr IndexType LayerStacks = 8;

+// If vector instructions are enabled, we update and refresh the
+// accumulator tile by tile such that each tile fits in the CPU's
+// vector registers. PSQT values are processed at least 8 at a time.
+static_assert(PSQTBuckets % 8 == 0,
+              "Per feature PSQT values cannot be processed at granularity lower than 8 at a time.");
+
 template<IndexType L1, int L2, int L3>
 struct NetworkArchitecture {
    static constexpr IndexType TransformedFeatureDimensions = L1;
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -48,6 +48,11 @@

 namespace Stockfish::Eval::NNUE {

+using BiasType       = std::int16_t;
+using WeightType     = std::int16_t;
+using PSQTWeightType = std::int32_t;
+using IndexType      = std::uint32_t;
+
 // Version of the evaluation file
 constexpr std::uint32_t Version = 0x7AF32F20u;

@@ -76,7 +81,6 @@ constexpr std::size_t MaxSimdWidth = 32;

 // Type of input feature after conversion
 using TransformedFeatureType = std::uint8_t;
-using IndexType              = std::uint32_t;

 // Round n up to be a multiple of base
 template<typename IntType>
@@ -279,11 +283,6 @@ inline void write_leb_128(std::ostream& stream, const IntType* values, std::size
    flush();
 }

-enum IncUpdateDirection {
-    FORWARD,
-    BACKWARDS
-};
-
 }  // namespace Stockfish::Eval::NNUE

 #endif  // #ifndef NNUE_COMMON_H_INCLUDED
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -31,118 +31,10 @@
 #include "nnue_accumulator.h"
 #include "nnue_architecture.h"
 #include "nnue_common.h"
+#include "simd.h"

 namespace Stockfish::Eval::NNUE {

-using BiasType       = std::int16_t;
-using WeightType     = std::int16_t;
-using PSQTWeightType = std::int32_t;
-
-// If vector instructions are enabled, we update and refresh the
-// accumulator tile by tile such that each tile fits in the CPU's
-// vector registers.
-#define VECTOR
-
-static_assert(PSQTBuckets % 8 == 0,
-              "Per feature PSQT values cannot be processed at granularity lower than 8 at a time.");
-
-#ifdef USE_AVX512
-using vec_t      = __m512i;
-using psqt_vec_t = __m256i;
-    #define vec_load(a) _mm512_load_si512(a)
-    #define vec_store(a, b) _mm512_store_si512(a, b)
-    #define vec_add_16(a, b) _mm512_add_epi16(a, b)
-    #define vec_sub_16(a, b) _mm512_sub_epi16(a, b)
-    #define vec_mulhi_16(a, b) _mm512_mulhi_epi16(a, b)
-    #define vec_zero() _mm512_setzero_epi32()
-    #define vec_set_16(a) _mm512_set1_epi16(a)
-    #define vec_max_16(a, b) _mm512_max_epi16(a, b)
-    #define vec_min_16(a, b) _mm512_min_epi16(a, b)
-    #define vec_slli_16(a, b) _mm512_slli_epi16(a, b)
-    // Inverse permuted at load time
-    #define vec_packus_16(a, b) _mm512_packus_epi16(a, b)
-    #define vec_load_psqt(a) _mm256_load_si256(a)
-    #define vec_store_psqt(a, b) _mm256_store_si256(a, b)
-    #define vec_add_psqt_32(a, b) _mm256_add_epi32(a, b)
-    #define vec_sub_psqt_32(a, b) _mm256_sub_epi32(a, b)
-    #define vec_zero_psqt() _mm256_setzero_si256()
-    #define NumRegistersSIMD 16
-    #define MaxChunkSize 64
-
-#elif USE_AVX2
-using vec_t      = __m256i;
-using psqt_vec_t = __m256i;
-    #define vec_load(a) _mm256_load_si256(a)
-    #define vec_store(a, b) _mm256_store_si256(a, b)
-    #define vec_add_16(a, b) _mm256_add_epi16(a, b)
-    #define vec_sub_16(a, b) _mm256_sub_epi16(a, b)
-    #define vec_mulhi_16(a, b) _mm256_mulhi_epi16(a, b)
-    #define vec_zero() _mm256_setzero_si256()
-    #define vec_set_16(a) _mm256_set1_epi16(a)
-    #define vec_max_16(a, b) _mm256_max_epi16(a, b)
-    #define vec_min_16(a, b) _mm256_min_epi16(a, b)
-    #define vec_slli_16(a, b) _mm256_slli_epi16(a, b)
-    // Inverse permuted at load time
-    #define vec_packus_16(a, b) _mm256_packus_epi16(a, b)
-    #define vec_load_psqt(a) _mm256_load_si256(a)
-    #define vec_store_psqt(a, b) _mm256_store_si256(a, b)
-    #define vec_add_psqt_32(a, b) _mm256_add_epi32(a, b)
-    #define vec_sub_psqt_32(a, b) _mm256_sub_epi32(a, b)
-    #define vec_zero_psqt() _mm256_setzero_si256()
-    #define NumRegistersSIMD 16
-    #define MaxChunkSize 32
-
-#elif USE_SSE2
-using vec_t      = __m128i;
-using psqt_vec_t = __m128i;
-    #define vec_load(a) (*(a))
-    #define vec_store(a, b) *(a) = (b)
-    #define vec_add_16(a, b) _mm_add_epi16(a, b)
-    #define vec_sub_16(a, b) _mm_sub_epi16(a, b)
-    #define vec_mulhi_16(a, b) _mm_mulhi_epi16(a, b)
-    #define vec_zero() _mm_setzero_si128()
-    #define vec_set_16(a) _mm_set1_epi16(a)
-    #define vec_max_16(a, b) _mm_max_epi16(a, b)
-    #define vec_min_16(a, b) _mm_min_epi16(a, b)
-    #define vec_slli_16(a, b) _mm_slli_epi16(a, b)
-    #define vec_packus_16(a, b) _mm_packus_epi16(a, b)
-    #define vec_load_psqt(a) (*(a))
-    #define vec_store_psqt(a, b) *(a) = (b)
-    #define vec_add_psqt_32(a, b) _mm_add_epi32(a, b)
-    #define vec_sub_psqt_32(a, b) _mm_sub_epi32(a, b)
-    #define vec_zero_psqt() _mm_setzero_si128()
-    #define NumRegistersSIMD (Is64Bit ? 16 : 8)
-    #define MaxChunkSize 16
-
-#elif USE_NEON
-using vec_t      = int16x8_t;
-using psqt_vec_t = int32x4_t;
-    #define vec_load(a) (*(a))
-    #define vec_store(a, b) *(a) = (b)
-    #define vec_add_16(a, b) vaddq_s16(a, b)
-    #define vec_sub_16(a, b) vsubq_s16(a, b)
-    #define vec_mulhi_16(a, b) vqdmulhq_s16(a, b)
-    #define vec_zero() \
-        vec_t { 0 }
-    #define vec_set_16(a) vdupq_n_s16(a)
-    #define vec_max_16(a, b) vmaxq_s16(a, b)
-    #define vec_min_16(a, b) vminq_s16(a, b)
-    #define vec_slli_16(a, b) vshlq_s16(a, vec_set_16(b))
-    #define vec_packus_16(a, b) reinterpret_cast<vec_t>(vcombine_u8(vqmovun_s16(a), vqmovun_s16(b)))
-    #define vec_load_psqt(a) (*(a))
-    #define vec_store_psqt(a, b) *(a) = (b)
-    #define vec_add_psqt_32(a, b) vaddq_s32(a, b)
-    #define vec_sub_psqt_32(a, b) vsubq_s32(a, b)
-    #define vec_zero_psqt() \
-        psqt_vec_t { 0 }
-    #define NumRegistersSIMD 16
-    #define MaxChunkSize 16
-
-#else
-    #undef VECTOR
-
-#endif
-
 // Returns the inverse of a permutation
 template<std::size_t Len>
 constexpr std::array<std::size_t, Len>
@@ -184,64 +76,8 @@ void permute(T (&data)[N], const std::array<std::size_t, OrderSize>& order) {
    }
 }

-// Compute optimal SIMD register count for feature transformer accumulation.
-template<IndexType TransformedFeatureWidth, IndexType HalfDimensions>
-class SIMDTiling {
-#ifdef VECTOR
-    // We use __m* types as template arguments, which causes GCC to emit warnings
-    // about losing some attribute information. This is irrelevant to us as we
-    // only take their size, so the following pragma are harmless.
-    #if defined(__GNUC__)
-        #pragma GCC diagnostic push
-        #pragma GCC diagnostic ignored "-Wignored-attributes"
-    #endif
-
-    template<typename SIMDRegisterType, typename LaneType, int NumLanes, int MaxRegisters>
-    static constexpr int BestRegisterCount() {
-        constexpr std::size_t RegisterSize = sizeof(SIMDRegisterType);
-        constexpr std::size_t LaneSize     = sizeof(LaneType);
-
-        static_assert(RegisterSize >= LaneSize);
-        static_assert(MaxRegisters <= NumRegistersSIMD);
-        static_assert(MaxRegisters > 0);
-        static_assert(NumRegistersSIMD > 0);
-        static_assert(RegisterSize % LaneSize == 0);
-        static_assert((NumLanes * LaneSize) % RegisterSize == 0);
-
-        const int ideal = (NumLanes * LaneSize) / RegisterSize;
-        if (ideal <= MaxRegisters)
-            return ideal;
-
-        // Look for the largest divisor of the ideal register count that is smaller than MaxRegisters
-        for (int divisor = MaxRegisters; divisor > 1; --divisor)
-            if (ideal % divisor == 0)
-                return divisor;
-
-        return 1;
-    }
-
-    #if defined(__GNUC__)
-        #pragma GCC diagnostic pop
-    #endif
-
-   public:
-    static constexpr int NumRegs =
-      BestRegisterCount<vec_t, WeightType, TransformedFeatureWidth, NumRegistersSIMD>();
-    static constexpr int NumPsqtRegs =
-      BestRegisterCount<psqt_vec_t, PSQTWeightType, PSQTBuckets, NumRegistersSIMD>();
-
-    static constexpr IndexType TileHeight     = NumRegs * sizeof(vec_t) / 2;
-    static constexpr IndexType PsqtTileHeight = NumPsqtRegs * sizeof(psqt_vec_t) / 4;
-
-    static_assert(HalfDimensions % TileHeight == 0, "TileHeight must divide HalfDimensions");
-    static_assert(PSQTBuckets % PsqtTileHeight == 0, "PsqtTileHeight must divide PSQTBuckets");
-#endif
-};
-
-
 // Input feature converter
-template<IndexType                                 TransformedFeatureDimensions,
-         Accumulator<TransformedFeatureDimensions> AccumulatorState::*accPtr>
+template<IndexType TransformedFeatureDimensions>
 class FeatureTransformer {

    // Number of output dimensions for one side
@@ -342,16 +178,18 @@ class FeatureTransformer {
                           OutputType*                               output,
                           int                                       bucket) const {

+        using namespace SIMD;
+
        accumulatorStack.evaluate(pos, *this, *cache);
        const auto& accumulatorState = accumulatorStack.latest();

        const Color perspectives[2]  = {pos.side_to_move(), ~pos.side_to_move()};
-        const auto& psqtAccumulation = (accumulatorState.*accPtr).psqtAccumulation;
+        const auto& psqtAccumulation = (accumulatorState.acc<HalfDimensions>()).psqtAccumulation;
        const auto  psqt =
          (psqtAccumulation[perspectives[0]][bucket] - psqtAccumulation[perspectives[1]][bucket])
          / 2;

-        const auto& accumulation = (accumulatorState.*accPtr).accumulation;
+        const auto& accumulation = (accumulatorState.acc<HalfDimensions>()).accumulation;

        for (IndexType p = 0; p < 2; ++p)
        {
--- a/src/nnue/nnue_misc.cpp
+++ b/src/nnue/nnue_misc.cpp
@@ -121,7 +121,6 @@ trace(Position& pos, const Eval::NNUE::Networks& networks, Eval::NNUE::Accumulat
    };

    AccumulatorStack accumulators;
-    accumulators.reset(pos, networks, caches);

    // We estimate the value of each piece by doing a differential evaluation from
    // the current base eval, simulating the removal of the piece from its square.
@@ -140,7 +139,7 @@ trace(Position& pos, const Eval::NNUE::Networks& networks, Eval::NNUE::Accumulat
            {
                pos.remove_piece(sq);

-                accumulators.reset(pos, networks, caches);
+                accumulators.reset();
                std::tie(psqt, positional) = networks.big.evaluate(pos, accumulators, &caches.big);
                Value eval                 = psqt + positional;
                eval                       = pos.side_to_move() == WHITE ? eval : -eval;
@@ -157,7 +156,7 @@ trace(Position& pos, const Eval::NNUE::Networks& networks, Eval::NNUE::Accumulat
        ss << board[row] << '\n';
    ss << '\n';

-    accumulators.reset(pos, networks, caches);
+    accumulators.reset();
    auto t = networks.big.trace_evaluate(pos, accumulators, &caches.big);

    ss << " NNUE network contributions "
--- a/src/nnue/simd.h
+++ b/src/nnue/simd.h
@@ -0,0 +1,406 @@
+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2025 The Stockfish developers (see AUTHORS file)
+
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef NNUE_SIMD_H_INCLUDED
+#define NNUE_SIMD_H_INCLUDED
+
+#if defined(USE_AVX2)
+    #include <immintrin.h>
+
+#elif defined(USE_SSE41)
+    #include <smmintrin.h>
+
+#elif defined(USE_SSSE3)
+    #include <tmmintrin.h>
+
+#elif defined(USE_SSE2)
+    #include <emmintrin.h>
+
+#elif defined(USE_NEON)
+    #include <arm_neon.h>
+#endif
+
+#include "../types.h"
+#include "nnue_common.h"
+
+namespace Stockfish::Eval::NNUE::SIMD {
+
+// If vector instructions are enabled, we update and refresh the
+// accumulator tile by tile such that each tile fits in the CPU's
+// vector registers.
+#define VECTOR
+
+#ifdef USE_AVX512
+using vec_t      = __m512i;
+using vec128_t   = __m128i;
+using psqt_vec_t = __m256i;
+using vec_uint_t = __m512i;
+    #define vec_load(a) _mm512_load_si512(a)
+    #define vec_store(a, b) _mm512_store_si512(a, b)
+    #define vec_add_16(a, b) _mm512_add_epi16(a, b)
+    #define vec_sub_16(a, b) _mm512_sub_epi16(a, b)
+    #define vec_mulhi_16(a, b) _mm512_mulhi_epi16(a, b)
+    #define vec_zero() _mm512_setzero_epi32()
+    #define vec_set_16(a) _mm512_set1_epi16(a)
+    #define vec_max_16(a, b) _mm512_max_epi16(a, b)
+    #define vec_min_16(a, b) _mm512_min_epi16(a, b)
+    #define vec_slli_16(a, b) _mm512_slli_epi16(a, b)
+    // Inverse permuted at load time
+    #define vec_packus_16(a, b) _mm512_packus_epi16(a, b)
+    #define vec_load_psqt(a) _mm256_load_si256(a)
+    #define vec_store_psqt(a, b) _mm256_store_si256(a, b)
+    #define vec_add_psqt_32(a, b) _mm256_add_epi32(a, b)
+    #define vec_sub_psqt_32(a, b) _mm256_sub_epi32(a, b)
+    #define vec_zero_psqt() _mm256_setzero_si256()
+
+    #ifdef USE_SSSE3
+        #define vec_nnz(a) _mm512_cmpgt_epi32_mask(a, _mm512_setzero_si512())
+    #endif
+
+    #define vec128_zero _mm_setzero_si128()
+    #define vec128_set_16(a) _mm_set1_epi16(a)
+    #define vec128_load(a) _mm_load_si128(a)
+    #define vec128_storeu(a, b) _mm_storeu_si128(a, b)
+    #define vec128_add(a, b) _mm_add_epi16(a, b)
+    #define NumRegistersSIMD 16
+    #define MaxChunkSize 64
+
+#elif USE_AVX2
+using vec_t      = __m256i;
+using vec128_t   = __m128i;
+using psqt_vec_t = __m256i;
+using vec_uint_t = __m256i;
+    #define vec_load(a) _mm256_load_si256(a)
+    #define vec_store(a, b) _mm256_store_si256(a, b)
+    #define vec_add_16(a, b) _mm256_add_epi16(a, b)
+    #define vec_sub_16(a, b) _mm256_sub_epi16(a, b)
+    #define vec_mulhi_16(a, b) _mm256_mulhi_epi16(a, b)
+    #define vec_zero() _mm256_setzero_si256()
+    #define vec_set_16(a) _mm256_set1_epi16(a)
+    #define vec_max_16(a, b) _mm256_max_epi16(a, b)
+    #define vec_min_16(a, b) _mm256_min_epi16(a, b)
+    #define vec_slli_16(a, b) _mm256_slli_epi16(a, b)
+    // Inverse permuted at load time
+    #define vec_packus_16(a, b) _mm256_packus_epi16(a, b)
+    #define vec_load_psqt(a) _mm256_load_si256(a)
+    #define vec_store_psqt(a, b) _mm256_store_si256(a, b)
+    #define vec_add_psqt_32(a, b) _mm256_add_epi32(a, b)
+    #define vec_sub_psqt_32(a, b) _mm256_sub_epi32(a, b)
+    #define vec_zero_psqt() _mm256_setzero_si256()
+
+    #ifdef USE_SSSE3
+        #if defined(USE_VNNI) && !defined(USE_AVXVNNI)
+            #define vec_nnz(a) _mm256_cmpgt_epi32_mask(a, _mm256_setzero_si256())
+        #else
+            #define vec_nnz(a) \
+                _mm256_movemask_ps( \
+                  _mm256_castsi256_ps(_mm256_cmpgt_epi32(a, _mm256_setzero_si256())))
+        #endif
+    #endif
+
+    #define vec128_zero _mm_setzero_si128()
+    #define vec128_set_16(a) _mm_set1_epi16(a)
+    #define vec128_load(a) _mm_load_si128(a)
+    #define vec128_storeu(a, b) _mm_storeu_si128(a, b)
+    #define vec128_add(a, b) _mm_add_epi16(a, b)
+
+    #define NumRegistersSIMD 16
+    #define MaxChunkSize 32
+
+#elif USE_SSE2
+using vec_t      = __m128i;
+using vec128_t   = __m128i;
+using psqt_vec_t = __m128i;
+using vec_uint_t = __m128i;
+    #define vec_load(a) (*(a))
+    #define vec_store(a, b) *(a) = (b)
+    #define vec_add_16(a, b) _mm_add_epi16(a, b)
+    #define vec_sub_16(a, b) _mm_sub_epi16(a, b)
+    #define vec_mulhi_16(a, b) _mm_mulhi_epi16(a, b)
+    #define vec_zero() _mm_setzero_si128()
+    #define vec_set_16(a) _mm_set1_epi16(a)
+    #define vec_max_16(a, b) _mm_max_epi16(a, b)
+    #define vec_min_16(a, b) _mm_min_epi16(a, b)
+    #define vec_slli_16(a, b) _mm_slli_epi16(a, b)
+    #define vec_packus_16(a, b) _mm_packus_epi16(a, b)
+    #define vec_load_psqt(a) (*(a))
+    #define vec_store_psqt(a, b) *(a) = (b)
+    #define vec_add_psqt_32(a, b) _mm_add_epi32(a, b)
+    #define vec_sub_psqt_32(a, b) _mm_sub_epi32(a, b)
+    #define vec_zero_psqt() _mm_setzero_si128()
+
+    #ifdef USE_SSSE3
+        #define vec_nnz(a) \
+            _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpgt_epi32(a, _mm_setzero_si128())))
+    #endif
+
+    #define vec128_zero _mm_setzero_si128()
+    #define vec128_set_16(a) _mm_set1_epi16(a)
+    #define vec128_load(a) _mm_load_si128(a)
+    #define vec128_storeu(a, b) _mm_storeu_si128(a, b)
+    #define vec128_add(a, b) _mm_add_epi16(a, b)
+
+    #define NumRegistersSIMD (Is64Bit ? 16 : 8)
+    #define MaxChunkSize 16
+
+#elif USE_NEON
+using vec_t      = int16x8_t;
+using psqt_vec_t = int32x4_t;
+using vec128_t   = uint16x8_t;
+using vec_uint_t = uint32x4_t;
+    #define vec_load(a) (*(a))
+    #define vec_store(a, b) *(a) = (b)
+    #define vec_add_16(a, b) vaddq_s16(a, b)
+    #define vec_sub_16(a, b) vsubq_s16(a, b)
+    #define vec_mulhi_16(a, b) vqdmulhq_s16(a, b)
+    #define vec_zero() vec_t{0}
+    #define vec_set_16(a) vdupq_n_s16(a)
+    #define vec_max_16(a, b) vmaxq_s16(a, b)
+    #define vec_min_16(a, b) vminq_s16(a, b)
+    #define vec_slli_16(a, b) vshlq_s16(a, vec_set_16(b))
+    #define vec_packus_16(a, b) reinterpret_cast<vec_t>(vcombine_u8(vqmovun_s16(a), vqmovun_s16(b)))
+    #define vec_load_psqt(a) (*(a))
+    #define vec_store_psqt(a, b) *(a) = (b)
+    #define vec_add_psqt_32(a, b) vaddq_s32(a, b)
+    #define vec_sub_psqt_32(a, b) vsubq_s32(a, b)
+    #define vec_zero_psqt() psqt_vec_t{0}
+
+static constexpr std::uint32_t Mask[4] = {1, 2, 4, 8};
+    #define vec_nnz(a) vaddvq_u32(vandq_u32(vtstq_u32(a, a), vld1q_u32(Mask)))
+    #define vec128_zero vdupq_n_u16(0)
+    #define vec128_set_16(a) vdupq_n_u16(a)
+    #define vec128_load(a) vld1q_u16(reinterpret_cast<const std::uint16_t*>(a))
+    #define vec128_storeu(a, b) vst1q_u16(reinterpret_cast<std::uint16_t*>(a), b)
+    #define vec128_add(a, b) vaddq_u16(a, b)
+
+    #define NumRegistersSIMD 16
+    #define MaxChunkSize 16
+
+#else
+    #undef VECTOR
+
+#endif
+
+struct Vec16Wrapper {
+#ifdef VECTOR
+    using type = vec_t;
+    static type add(const type& lhs, const type& rhs) { return vec_add_16(lhs, rhs); }
+    static type sub(const type& lhs, const type& rhs) { return vec_sub_16(lhs, rhs); }
+#else
+    using type = BiasType;
+    static type add(const type& lhs, const type& rhs) { return lhs + rhs; }
+    static type sub(const type& lhs, const type& rhs) { return lhs - rhs; }
+#endif
+};
+
+struct Vec32Wrapper {
+#ifdef VECTOR
+    using type = psqt_vec_t;
+    static type add(const type& lhs, const type& rhs) { return vec_add_psqt_32(lhs, rhs); }
+    static type sub(const type& lhs, const type& rhs) { return vec_sub_psqt_32(lhs, rhs); }
+#else
+    using type = PSQTWeightType;
+    static type add(const type& lhs, const type& rhs) { return lhs + rhs; }
+    static type sub(const type& lhs, const type& rhs) { return lhs - rhs; }
+#endif
+};
+
+enum UpdateOperation {
+    Add,
+    Sub
+};
+
+template<typename VecWrapper,
+         UpdateOperation... ops,
+         std::enable_if_t<sizeof...(ops) == 0, bool> = true>
+typename VecWrapper::type fused(const typename VecWrapper::type& in) {
+    return in;
+}
+
+template<typename VecWrapper,
+         UpdateOperation update_op,
+         UpdateOperation... ops,
+         typename T,
+         typename... Ts,
+         std::enable_if_t<is_all_same_v<typename VecWrapper::type, T, Ts...>, bool> = true,
+         std::enable_if_t<sizeof...(ops) == sizeof...(Ts), bool>                    = true>
+typename VecWrapper::type
+fused(const typename VecWrapper::type& in, const T& operand, const Ts&... operands) {
+    switch (update_op)
+    {
+    case Add :
+        return fused<VecWrapper, ops...>(VecWrapper::add(in, operand), operands...);
+    case Sub :
+        return fused<VecWrapper, ops...>(VecWrapper::sub(in, operand), operands...);
+    default :
+        static_assert(update_op == Add || update_op == Sub,
+                      "Only Add and Sub are currently supported.");
+        return typename VecWrapper::type();
+    }
+}
+
+#if defined(USE_AVX512)
+
+[[maybe_unused]] static int m512_hadd(__m512i sum, int bias) {
+    return _mm512_reduce_add_epi32(sum) + bias;
+}
+
+[[maybe_unused]] static void m512_add_dpbusd_epi32(__m512i& acc, __m512i a, __m512i b) {
+
+    #if defined(USE_VNNI)
+    acc = _mm512_dpbusd_epi32(acc, a, b);
+    #else
+    __m512i product0 = _mm512_maddubs_epi16(a, b);
+    product0         = _mm512_madd_epi16(product0, _mm512_set1_epi16(1));
+    acc              = _mm512_add_epi32(acc, product0);
+    #endif
+}
+
+#endif
+
+#if defined(USE_AVX2)
+
+[[maybe_unused]] static int m256_hadd(__m256i sum, int bias) {
+    __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
+    sum128         = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));
+    sum128         = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB));
+    return _mm_cvtsi128_si32(sum128) + bias;
+}
+
+[[maybe_unused]] static void m256_add_dpbusd_epi32(__m256i& acc, __m256i a, __m256i b) {
+
+    #if defined(USE_VNNI)
+    acc = _mm256_dpbusd_epi32(acc, a, b);
+    #else
+    __m256i product0 = _mm256_maddubs_epi16(a, b);
+    product0         = _mm256_madd_epi16(product0, _mm256_set1_epi16(1));
+    acc              = _mm256_add_epi32(acc, product0);
+    #endif
+}
+
+#endif
+
+#if defined(USE_SSSE3)
+
+[[maybe_unused]] static int m128_hadd(__m128i sum, int bias) {
+    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E));  //_MM_PERM_BADC
+    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1));  //_MM_PERM_CDAB
+    return _mm_cvtsi128_si32(sum) + bias;
+}
+
+[[maybe_unused]] static void m128_add_dpbusd_epi32(__m128i& acc, __m128i a, __m128i b) {
+
+    __m128i product0 = _mm_maddubs_epi16(a, b);
+    product0         = _mm_madd_epi16(product0, _mm_set1_epi16(1));
+    acc              = _mm_add_epi32(acc, product0);
+}
+
+#endif
+
+#if defined(USE_NEON_DOTPROD)
+
+[[maybe_unused]] static void
+dotprod_m128_add_dpbusd_epi32(int32x4_t& acc, int8x16_t a, int8x16_t b) {
+
+    acc = vdotq_s32(acc, a, b);
+}
+#endif
+
+#if defined(USE_NEON)
+
+[[maybe_unused]] static int neon_m128_reduce_add_epi32(int32x4_t s) {
+    #if USE_NEON >= 8
+    return vaddvq_s32(s);
+    #else
+    return s[0] + s[1] + s[2] + s[3];
+    #endif
+}
+
+[[maybe_unused]] static int neon_m128_hadd(int32x4_t sum, int bias) {
+    return neon_m128_reduce_add_epi32(sum) + bias;
+}
+
+#endif
+
+#if USE_NEON >= 8
+[[maybe_unused]] static void neon_m128_add_dpbusd_epi32(int32x4_t& acc, int8x16_t a, int8x16_t b) {
+
+    int16x8_t product0 = vmull_s8(vget_low_s8(a), vget_low_s8(b));
+    int16x8_t product1 = vmull_high_s8(a, b);
+    int16x8_t sum      = vpaddq_s16(product0, product1);
+    acc                = vpadalq_s16(acc, sum);
+}
+#endif
+
+
+// Compute optimal SIMD register count for feature transformer accumulation.
+template<IndexType TransformedFeatureWidth, IndexType HalfDimensions, IndexType PSQTBuckets>
+class SIMDTiling {
+#ifdef VECTOR
+        // We use __m* types as template arguments, which causes GCC to emit warnings
+        // about losing some attribute information. This is irrelevant to us as we
+        // only take their size, so the following pragmas are harmless.
+    #if defined(__GNUC__)
+        #pragma GCC diagnostic push
+        #pragma GCC diagnostic ignored "-Wignored-attributes"
+    #endif
+
+    template<typename SIMDRegisterType, typename LaneType, int NumLanes, int MaxRegisters>
+    static constexpr int BestRegisterCount() {
+        constexpr std::size_t RegisterSize = sizeof(SIMDRegisterType);
+        constexpr std::size_t LaneSize     = sizeof(LaneType);
+
+        static_assert(RegisterSize >= LaneSize);
+        static_assert(MaxRegisters <= NumRegistersSIMD);
+        static_assert(MaxRegisters > 0);
+        static_assert(NumRegistersSIMD > 0);
+        static_assert(RegisterSize % LaneSize == 0);
+        static_assert((NumLanes * LaneSize) % RegisterSize == 0);
+
+        const int ideal = (NumLanes * LaneSize) / RegisterSize;
+        if (ideal <= MaxRegisters)
+            return ideal;
+
+        // Look for the largest divisor of the ideal register count that does not exceed MaxRegisters
+        for (int divisor = MaxRegisters; divisor > 1; --divisor)
+            if (ideal % divisor == 0)
+                return divisor;
+
+        return 1;
+    }
+
+    #if defined(__GNUC__)
+        #pragma GCC diagnostic pop
+    #endif
+
+   public:
+    static constexpr int NumRegs =
+      BestRegisterCount<vec_t, WeightType, TransformedFeatureWidth, NumRegistersSIMD>();
+    static constexpr int NumPsqtRegs =
+      BestRegisterCount<psqt_vec_t, PSQTWeightType, PSQTBuckets, NumRegistersSIMD>();
+
+    static constexpr IndexType TileHeight     = NumRegs * sizeof(vec_t) / 2;
+    static constexpr IndexType PsqtTileHeight = NumPsqtRegs * sizeof(psqt_vec_t) / 4;
+
+    static_assert(HalfDimensions % TileHeight == 0, "TileHeight must divide HalfDimensions");
+    static_assert(PSQTBuckets % PsqtTileHeight == 0, "PsqtTileHeight must divide PSQTBuckets");
+#endif
+};
+}
+
+#endif
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -54,8 +54,8 @@ namespace {

 constexpr std::string_view PieceToChar(" PNBRQK  pnbrqk");

-constexpr Piece Pieces[] = {W_PAWN, W_KNIGHT, W_BISHOP, W_ROOK, W_QUEEN, W_KING,
-                            B_PAWN, B_KNIGHT, B_BISHOP, B_ROOK, B_QUEEN, B_KING};
+static constexpr Piece Pieces[] = {W_PAWN, W_KNIGHT, W_BISHOP, W_ROOK, W_QUEEN, W_KING,
+                                   B_PAWN, B_KNIGHT, B_BISHOP, B_ROOK, B_QUEEN, B_KING};
 }  // namespace


@@ -270,7 +270,7 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si) {
        // a) side to move have a pawn threatening epSquare
        // b) there is an enemy pawn in front of epSquare
        // c) there is no piece on epSquare or behind epSquare
-        enpassant = pawn_attacks_bb(~sideToMove, st->epSquare) & pieces(sideToMove, PAWN)
+        enpassant = attacks_bb<PAWN>(st->epSquare, ~sideToMove) & pieces(sideToMove, PAWN)
                 && (pieces(~sideToMove, PAWN) & (st->epSquare + pawn_push(~sideToMove)))
                 && !(pieces() & (st->epSquare | (st->epSquare + pawn_push(sideToMove))));
    }
@@ -321,7 +321,7 @@ void Position::set_check_info() const {

    Square ksq = square<KING>(~sideToMove);

-    st->checkSquares[PAWN]   = pawn_attacks_bb(~sideToMove, ksq);
+    st->checkSquares[PAWN]   = attacks_bb<PAWN>(ksq, ~sideToMove);
    st->checkSquares[KNIGHT] = attacks_bb<KNIGHT>(ksq);
    st->checkSquares[BISHOP] = attacks_bb<BISHOP>(ksq, pieces());
    st->checkSquares[ROOK]   = attacks_bb<ROOK>(ksq, pieces());
@@ -487,8 +487,8 @@ Bitboard Position::attackers_to(Square s, Bitboard occupied) const {

    return (attacks_bb<ROOK>(s, occupied) & pieces(ROOK, QUEEN))
         | (attacks_bb<BISHOP>(s, occupied) & pieces(BISHOP, QUEEN))
-         | (pawn_attacks_bb(BLACK, s) & pieces(WHITE, PAWN))
-         | (pawn_attacks_bb(WHITE, s) & pieces(BLACK, PAWN))
+         | (attacks_bb<PAWN>(s, BLACK) & pieces(WHITE, PAWN))
+         | (attacks_bb<PAWN>(s, WHITE) & pieces(BLACK, PAWN))
         | (attacks_bb<KNIGHT>(s) & pieces(KNIGHT)) | (attacks_bb<KING>(s) & pieces(KING));
 }

@@ -498,7 +498,7 @@ bool Position::attackers_to_exist(Square s, Bitboard occupied, Color c) const {
            && (attacks_bb<ROOK>(s, occupied) & pieces(c, ROOK, QUEEN)))
        || ((attacks_bb<BISHOP>(s) & pieces(c, BISHOP, QUEEN))
            && (attacks_bb<BISHOP>(s, occupied) & pieces(c, BISHOP, QUEEN)))
-        || (((pawn_attacks_bb(~c, s) & pieces(PAWN)) | (attacks_bb<KNIGHT>(s) & pieces(KNIGHT))
+        || (((attacks_bb<PAWN>(s, ~c) & pieces(PAWN)) | (attacks_bb<KNIGHT>(s) & pieces(KNIGHT))
             | (attacks_bb<KING>(s) & pieces(KING)))
            & pieces(c));
 }
@@ -597,10 +597,14 @@ bool Position::pseudo_legal(const Move m) const {
        if ((Rank8BB | Rank1BB) & to)
            return false;

-        if (!(pawn_attacks_bb(us, from) & pieces(~us) & to)  // Not a capture
-            && !((from + pawn_push(us) == to) && empty(to))  // Not a single push
-            && !((from + 2 * pawn_push(us) == to)            // Not a double push
-                 && (relative_rank(us, from) == RANK_2) && empty(to) && empty(to - pawn_push(us))))
+        // Check if it's a valid capture, single push, or double push
+        const bool isCapture    = bool(attacks_bb<PAWN>(from, us) & pieces(~us) & to);
+        const bool isSinglePush = (from + pawn_push(us) == to) && empty(to);
+        const bool isDoublePush = (from + 2 * pawn_push(us) == to)
+                               && (relative_rank(us, from) == RANK_2) && empty(to)
+                               && empty(to - pawn_push(us));
+
+        if (!(isCapture || isSinglePush || isDoublePush))
            return false;
    }
    else if (!(attacks_bb(type_of(pc), from, pieces()) & to))
@@ -698,7 +702,6 @@ DirtyPiece Position::do_move(Move                      m,
    // our state pointer to point to the new (ready to be updated) state.
    std::memcpy(&newSt, st, offsetof(StateInfo, key));
    newSt.previous = st;
-    st->next       = &newSt;
    st             = &newSt;

    // Increment ply counters. In particular, rule50 will be reset to zero later on
@@ -707,9 +710,6 @@ DirtyPiece Position::do_move(Move                      m,
    ++st->rule50;
    ++st->pliesFromNull;

-    DirtyPiece dp;
-    dp.dirty_num = 1;
-
    Color  us       = sideToMove;
    Color  them     = ~us;
    Square from     = m.from_sq();
@@ -717,6 +717,12 @@ DirtyPiece Position::do_move(Move                      m,
    Piece  pc       = piece_on(from);
    Piece  captured = m.type_of() == EN_PASSANT ? make_piece(them, PAWN) : piece_on(to);

+    DirtyPiece dp;
+    dp.pc     = pc;
+    dp.from   = from;
+    dp.to     = to;
+    dp.add_sq = SQ_NONE;
+
    assert(color_of(pc) == us);
    assert(captured == NO_PIECE || color_of(captured) == (m.type_of() != CASTLING ? them : us));
    assert(type_of(captured) != KING);
@@ -733,8 +739,7 @@ DirtyPiece Position::do_move(Move                      m,
        st->nonPawnKey[us] ^= Zobrist::psq[captured][rfrom] ^ Zobrist::psq[captured][rto];
        captured = NO_PIECE;
    }
-
-    if (captured)
+    else if (captured)
    {
        Square capsq = to;

@@ -764,10 +769,8 @@ DirtyPiece Position::do_move(Move                      m,
                st->minorPieceKey ^= Zobrist::psq[captured][capsq];
        }

-        dp.dirty_num = 2;  // 1 piece moved, 1 piece captured
-        dp.piece[1]  = captured;
-        dp.from[1]   = capsq;
-        dp.to[1]     = SQ_NONE;
+        dp.remove_pc = captured;
+        dp.remove_sq = capsq;

        // Update board and piece lists
        remove_piece(capsq);
@@ -778,6 +781,8 @@ DirtyPiece Position::do_move(Move                      m,
        // Reset rule 50 counter
        st->rule50 = 0;
    }
+    else
+        dp.remove_sq = SQ_NONE;

    // Update hash key
    k ^= Zobrist::psq[pc][from] ^ Zobrist::psq[pc][to];
@@ -800,9 +805,6 @@ DirtyPiece Position::do_move(Move                      m,
    // Move the piece. The tricky Chess960 castling is handled earlier
    if (m.type_of() != CASTLING)
    {
-        dp.piece[0] = pc;
-        dp.from[0]  = from;
-        dp.to[0]    = to;

        move_piece(from, to);
    }
@@ -812,7 +814,7 @@ DirtyPiece Position::do_move(Move                      m,
    {
        // Set en passant square if the moved pawn can be captured
        if ((int(to) ^ int(from)) == 16
-            && (pawn_attacks_bb(us, to - pawn_push(us)) & pieces(them, PAWN)))
+            && (attacks_bb<PAWN>(to - pawn_push(us), us) & pieces(them, PAWN)))
        {
            st->epSquare = to - pawn_push(us);
            k ^= Zobrist::enpassant[file_of(st->epSquare)];
@@ -829,12 +831,9 @@ DirtyPiece Position::do_move(Move                      m,
            remove_piece(to);
            put_piece(promotion, to);

-            // Promoting pawn to SQ_NONE, promoted piece from SQ_NONE
-            dp.to[0]               = SQ_NONE;
-            dp.piece[dp.dirty_num] = promotion;
-            dp.from[dp.dirty_num]  = SQ_NONE;
-            dp.to[dp.dirty_num]    = to;
-            dp.dirty_num++;
+            dp.add_pc = promotion;
+            dp.add_sq = to;
+            dp.to     = SQ_NONE;

            // Update hash keys
            // Zobrist::psq[pc][to] is zero, so we don't need to clear it
@@ -901,6 +900,10 @@ DirtyPiece Position::do_move(Move                      m,

    assert(pos_is_ok());

+    assert(dp.pc != NO_PIECE);
+    assert(dp.from != SQ_NONE);
+    assert((dp.remove_sq != SQ_NONE) == (bool(captured) || m.type_of() == CASTLING));
+    assert((dp.add_sq != SQ_NONE) == (m.type_of() == PROMOTION || m.type_of() == CASTLING));
    return dp;
 }

@@ -983,13 +986,10 @@ void Position::do_castling(

    if (Do)
    {
-        dp->piece[0]  = make_piece(us, KING);
-        dp->from[0]   = from;
-        dp->to[0]     = to;
-        dp->piece[1]  = make_piece(us, ROOK);
-        dp->from[1]   = rfrom;
-        dp->to[1]     = rto;
-        dp->dirty_num = 2;
+        dp->to        = to;
+        dp->remove_pc = dp->add_pc = make_piece(us, ROOK);
+        dp->remove_sq              = rfrom;
+        dp->add_sq                 = rto;
    }

    // Remove both pieces first since squares could overlap in Chess960
@@ -1012,7 +1012,6 @@ void Position::do_null_move(StateInfo& newSt, const TranspositionTable& tt) {
    std::memcpy(&newSt, st, sizeof(StateInfo));

    newSt.previous = st;
-    st->next       = &newSt;
    st             = &newSt;

    if (st->epSquare != SQ_NONE)
--- a/src/position.h
+++ b/src/position.h
@@ -53,7 +53,6 @@ struct StateInfo {
    Key        key;
    Bitboard   checkersBB;
    StateInfo* previous;
-    StateInfo* next;
    Bitboard   blockersForKing[COLOR_NB];
    Bitboard   pinners[COLOR_NB];
    Bitboard   checkSquares[PIECE_TYPE_NB];
@@ -87,9 +86,9 @@ class Position {
    std::string fen() const;

    // Position representation
-    Bitboard pieces(PieceType pt = ALL_PIECES) const;
+    Bitboard pieces() const;  // All pieces
    template<typename... PieceTypes>
-    Bitboard pieces(PieceType pt, PieceTypes... pts) const;
+    Bitboard pieces(PieceTypes... pts) const;
    Bitboard pieces(Color c) const;
    template<typename... PieceTypes>
    Bitboard pieces(Color c, PieceTypes... pts) const;
@@ -165,7 +164,6 @@ class Position {
    bool pos_is_ok() const;
    void flip();

-    // Used by NNUE
    StateInfo* state() const;

    void put_piece(Piece pc, Square s);
@@ -216,11 +214,11 @@ inline bool Position::empty(Square s) const { return piece_on(s) == NO_PIECE; }

 inline Piece Position::moved_piece(Move m) const { return piece_on(m.from_sq()); }

-inline Bitboard Position::pieces(PieceType pt) const { return byTypeBB[pt]; }
+inline Bitboard Position::pieces() const { return byTypeBB[ALL_PIECES]; }

 template<typename... PieceTypes>
-inline Bitboard Position::pieces(PieceType pt, PieceTypes... pts) const {
-    return pieces(pt) | pieces(pts...);
+inline Bitboard Position::pieces(PieceTypes... pts) const {
+    return (byTypeBB[pts] | ...);
 }

 inline Bitboard Position::pieces(Color c) const { return byColorBB[c]; }
--- a/src/search.cpp
+++ b/src/search.cpp
--- a/src/search.h
+++ b/src/search.h
@@ -75,7 +75,8 @@ struct Stack {
    bool                        ttHit;
    int                         cutoffCnt;
    int                         reduction;
-    bool                        isTTMove;
+    bool                        isPvNode;
+    int                         quietMoveStreak;
 };


@@ -292,6 +293,8 @@ class Worker {
    CorrectionHistory<NonPawn>      nonPawnCorrectionHistory;
    CorrectionHistory<Continuation> continuationCorrectionHistory;

+    TTMoveHistory ttMoveHistory;
+
   private:
    void iterative_deepening();

--- a/src/thread.h
+++ b/src/thread.h
@@ -164,7 +164,7 @@ class ThreadPool {
    std::vector<std::unique_ptr<Thread>> threads;
    std::vector<NumaIndex>               boundThreadToNumaNode;

-    uint64_t accumulate(std::atomic<uint64_t> Search::Worker::*member) const {
+    uint64_t accumulate(std::atomic<uint64_t> Search::Worker::* member) const {

        uint64_t sum = 0;
        for (auto&& th : threads)
--- a/src/timeman.cpp
+++ b/src/timeman.cpp
@@ -85,16 +85,13 @@ void TimeManagement::init(Search::LimitsType& limits,
    // with constants are involved.
    const int64_t   scaleFactor = useNodesTime ? npmsec : 1;
    const TimePoint scaledTime  = limits.time[us] / scaleFactor;
-    const TimePoint scaledInc   = limits.inc[us] / scaleFactor;

    // Maximum move horizon
    int centiMTG = limits.movestogo ? std::min(limits.movestogo * 100, 5000) : 5051;

    // If less than one second, gradually reduce mtg
-    if (scaledTime < 1000 && double(centiMTG) / scaledInc > 5.051)
-    {
+    if (scaledTime < 1000)
        centiMTG = scaledTime * 5.051;
-    }

    // Make sure timeLeft is > 0 since we may use it as a divisor
    TimePoint timeLeft =
--- a/src/timeman.h
+++ b/src/timeman.h
@@ -22,11 +22,11 @@
 #include <cstdint>

 #include "misc.h"
-#include "types.h"

 namespace Stockfish {

 class OptionsMap;
+enum Color : int8_t;

 namespace Search {
 struct LimitsType;
--- a/src/tt.cpp
+++ b/src/tt.cpp
@@ -110,6 +110,8 @@ void TTEntry::save(
        value16   = int16_t(v);
        eval16    = int16_t(ev);
    }
+    else if (depth8 + DEPTH_ENTRY_OFFSET >= 5 && Bound(genBound8 & 0x3) != BOUND_EXACT)
+        depth8--;
 }


@@ -234,8 +236,8 @@ std::tuple<bool, TTData, TTWriter> TranspositionTable::probe(const Key key) cons
    // Find an entry to be replaced according to the replacement strategy
    TTEntry* replace = tte;
    for (int i = 1; i < ClusterSize; ++i)
-        if (replace->depth8 - replace->relative_age(generation8) * 2
-            > tte[i].depth8 - tte[i].relative_age(generation8) * 2)
+        if (replace->depth8 - replace->relative_age(generation8)
+            > tte[i].depth8 - tte[i].relative_age(generation8))
            replace = &tte[i];

    return {false,
--- a/src/types.h
+++ b/src/types.h
@@ -37,7 +37,9 @@
 //               | only in 64-bit mode and requires hardware with pext support.

    #include <cassert>
+    #include <cstddef>
    #include <cstdint>
+    #include <type_traits>

    #if defined(_MSC_VER)
        // Disable some silly and noisy warnings from MSVC compiler
@@ -55,9 +57,15 @@
 // _WIN32                  Building on Windows (any)
 // _WIN64                  Building on Windows 64 bit

-    #if defined(__GNUC__) && (__GNUC__ < 9 || (__GNUC__ == 9 && __GNUC_MINOR__ <= 2)) \
-      && defined(_WIN32) && !defined(__clang__)
-        #define ALIGNAS_ON_STACK_VARIABLES_BROKEN
+// Enforce minimum GCC version
+    #if defined(__GNUC__) && !defined(__clang__) \
+      && (__GNUC__ < 9 || (__GNUC__ == 9 && __GNUC_MINOR__ < 3))
+        #error "Stockfish requires GCC 9.3 or later for correct compilation"
+    #endif
+
+    // Enforce minimum Clang version
+    #if defined(__clang__) && (__clang_major__ < 10)
+        #error "Stockfish requires Clang 10.0 or later for correct compilation"
    #endif

    #define ASSERT_ALIGNED(ptr, alignment) assert(reinterpret_cast<uintptr_t>(ptr) % alignment == 0)
@@ -108,13 +116,13 @@ using Bitboard = uint64_t;
 constexpr int MAX_MOVES = 256;
 constexpr int MAX_PLY   = 246;

-enum Color {
+enum Color : int8_t {
    WHITE,
    BLACK,
    COLOR_NB = 2
 };

-enum CastlingRights {
+enum CastlingRights : int8_t {
    NO_CASTLING,
    WHITE_OO,
    WHITE_OOO = WHITE_OO << 1,
@@ -130,7 +138,7 @@ enum CastlingRights {
    CASTLING_RIGHT_NB = 16
 };

-enum Bound {
+enum Bound : int8_t {
    BOUND_NONE,
    BOUND_UPPER,
    BOUND_LOWER,
@@ -181,13 +189,13 @@ constexpr Value QueenValue  = 2538;


 // clang-format off
-enum PieceType {
+enum PieceType : std::int8_t {
    NO_PIECE_TYPE, PAWN, KNIGHT, BISHOP, ROOK, QUEEN, KING,
    ALL_PIECES = 0,
    PIECE_TYPE_NB = 8
 };

-enum Piece {
+enum Piece : std::int8_t {
    NO_PIECE,
    W_PAWN = PAWN,     W_KNIGHT, W_BISHOP, W_ROOK, W_QUEEN, W_KING,
    B_PAWN = PAWN + 8, B_KNIGHT, B_BISHOP, B_ROOK, B_QUEEN, B_KING,
@@ -201,26 +209,24 @@ constexpr Value PieceValue[PIECE_NB] = {

 using Depth = int;

-enum : int {
-    // The following DEPTH_ constants are used for transposition table entries
-    // and quiescence search move generation stages. In regular search, the
-    // depth stored in the transposition table is literal: the search depth
-    // (effort) used to make the corresponding transposition table value. In
-    // quiescence search, however, the transposition table entries only store
-    // the current quiescence move generation stage (which should thus compare
-    // lower than any regular search depth).
-    DEPTH_QS = 0,
-    // For transposition table entries where no searching at all was done
-    // (whether regular or qsearch) we use DEPTH_UNSEARCHED, which should thus
-    // compare lower than any quiescence or regular depth. DEPTH_ENTRY_OFFSET
-    // is used only for the transposition table entry occupancy check (see tt.cpp),
-    // and should thus be lower than DEPTH_UNSEARCHED.
-    DEPTH_UNSEARCHED   = -2,
-    DEPTH_ENTRY_OFFSET = -3
-};
+// The following DEPTH_ constants are used for transposition table entries
+// and quiescence search move generation stages. In regular search, the
+// depth stored in the transposition table is literal: the search depth
+// (effort) used to make the corresponding transposition table value. In
+// quiescence search, however, the transposition table entries only store
+// the current quiescence move generation stage (which should thus compare
+// lower than any regular search depth).
+constexpr Depth DEPTH_QS = 0;
+// For transposition table entries where no searching at all was done
+// (whether regular or qsearch) we use DEPTH_UNSEARCHED, which should thus
+// compare lower than any quiescence or regular depth. DEPTH_ENTRY_OFFSET
+// is used only for the transposition table entry occupancy check (see tt.cpp),
+// and should thus be lower than DEPTH_UNSEARCHED.
+constexpr Depth DEPTH_UNSEARCHED   = -2;
+constexpr Depth DEPTH_ENTRY_OFFSET = -3;

 // clang-format off
-enum Square : int {
+enum Square : int8_t {
    SQ_A1, SQ_B1, SQ_C1, SQ_D1, SQ_E1, SQ_F1, SQ_G1, SQ_H1,
    SQ_A2, SQ_B2, SQ_C2, SQ_D2, SQ_E2, SQ_F2, SQ_G2, SQ_H2,
    SQ_A3, SQ_B3, SQ_C3, SQ_D3, SQ_E3, SQ_F3, SQ_G3, SQ_H3,
@@ -236,7 +242,7 @@ enum Square : int {
 };
 // clang-format on

-enum Direction : int {
+enum Direction : int8_t {
    NORTH = 8,
    EAST  = 1,
    SOUTH = -NORTH,
@@ -248,7 +254,7 @@ enum Direction : int {
    NORTH_WEST = NORTH + WEST
 };

-enum File : int {
+enum File : int8_t {
    FILE_A,
    FILE_B,
    FILE_C,
@@ -260,7 +266,7 @@ enum File : int {
    FILE_NB
 };

-enum Rank : int {
+enum Rank : int8_t {
    RANK_1,
    RANK_2,
    RANK_3,
@@ -274,23 +280,19 @@ enum Rank : int {

 // Keep track of what a move changes on the board (used by NNUE)
 struct DirtyPiece {
+    Piece  pc;        // this is never allowed to be NO_PIECE
+    Square from, to;  // to should be SQ_NONE for promotions

-    // Number of changed pieces
-    int dirty_num;
-
-    // Max 3 pieces can change in one move. A promotion with capture moves
-    // both the pawn and the captured piece to SQ_NONE and the piece promoted
-    // to from SQ_NONE to the capture square.
-    Piece piece[3];
-
-    // From and to squares, which may be SQ_NONE
-    Square from[3];
-    Square to[3];
+    // If {add,remove}_sq is SQ_NONE, {add,remove}_pc is allowed to be
+    // uninitialized.
+    // Castling uses remove_sq/add_sq for the rook's origin and destination squares.
+    Square remove_sq, add_sq;
+    Piece  remove_pc, add_pc;
 };

    #define ENABLE_INCR_OPERATORS_ON(T) \
-        inline T& operator++(T& d) { return d = T(int(d) + 1); } \
-        inline T& operator--(T& d) { return d = T(int(d) - 1); }
+        constexpr T& operator++(T& d) { return d = T(int(d) + 1); } \
+        constexpr T& operator--(T& d) { return d = T(int(d) - 1); }

 ENABLE_INCR_OPERATORS_ON(PieceType)
 ENABLE_INCR_OPERATORS_ON(Square)
@@ -303,10 +305,10 @@ constexpr Direction operator+(Direction d1, Direction d2) { return Direction(int
 constexpr Direction operator*(int i, Direction d) { return Direction(i * int(d)); }

 // Additional operators to add a Direction to a Square
-constexpr Square operator+(Square s, Direction d) { return Square(int(s) + int(d)); }
-constexpr Square operator-(Square s, Direction d) { return Square(int(s) - int(d)); }
-inline Square&   operator+=(Square& s, Direction d) { return s = s + d; }
-inline Square&   operator-=(Square& s, Direction d) { return s = s - d; }
+constexpr Square  operator+(Square s, Direction d) { return Square(int(s) + int(d)); }
+constexpr Square  operator-(Square s, Direction d) { return Square(int(s) - int(d)); }
+constexpr Square& operator+=(Square& s, Direction d) { return s = s + d; }
+constexpr Square& operator-=(Square& s, Direction d) { return s = s - d; }

 // Toggle color
 constexpr Color operator~(Color c) { return Color(c ^ BLACK); }
@@ -334,7 +336,7 @@ constexpr Piece make_piece(Color c, PieceType pt) { return Piece((c << 3) + pt);

 constexpr PieceType type_of(Piece pc) { return PieceType(pc & 7); }

-inline Color color_of(Piece pc) {
+constexpr Color color_of(Piece pc) {
    assert(pc != NO_PIECE);
    return Color(pc >> 3);
 }
@@ -429,6 +431,14 @@ class Move {
    std::uint16_t data;
 };

+// Trait: value is true iff every type in Ts... is the same type as T
+template<typename T, typename... Ts>
+struct is_all_same: std::bool_constant<(std::is_same_v<T, Ts> && ...)> {};
+
+// Shorthand for is_all_same<Ts...>::value (first type is compared to the rest)
+template<typename... Ts>
+inline constexpr bool is_all_same_v = is_all_same<Ts...>::value;
+
 }  // namespace Stockfish

 #endif  // #ifndef TYPES_H_INCLUDED
--- a/src/uci.h
+++ b/src/uci.h
@@ -33,7 +33,7 @@ namespace Stockfish {
 class Position;
 class Move;
 class Score;
-enum Square : int;
+enum Square : int8_t;
 using Value = int;

 class UCIEngine {
--- a/tests/signature.sh
+++ b/tests/signature.sh
@@ -2,16 +2,26 @@
 # obtain and optionally verify Bench / signature
 # if no reference is given, the output is deliberately limited to just the signature

+STDOUT_FILE=$(mktemp)
+STDERR_FILE=$(mktemp)
+
 error()
 {
  echo "running bench for signature failed on line $1"
+  echo "===== STDOUT ====="
+  cat "$STDOUT_FILE"
+  echo "===== STDERR ====="
+  cat "$STDERR_FILE"
+  rm -f "$STDOUT_FILE" "$STDERR_FILE"
  exit 1
 }
 trap 'error ${LINENO}' ERR

 # obtain
+eval "$WINE_PATH ./stockfish bench" > "$STDOUT_FILE" 2> "$STDERR_FILE" || error ${LINENO}
+signature=$(grep "Nodes searched  : " "$STDERR_FILE" | awk '{print $4}')

-signature=`eval "$WINE_PATH ./stockfish bench 2>&1" | grep "Nodes searched  : " | awk '{print $4}'`
+rm -f "$STDOUT_FILE" "$STDERR_FILE"

 if [ $# -gt 0 ]; then
   # compare to given reference
@@ -28,4 +38,4 @@ if [ $# -gt 0 ]; then
 else
   # just report signature
   echo $signature
-fi
+fi