diff --git a/.travis.yml b/.travis.yml
index d563a1e1..092c7f53 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -43,26 +43,47 @@ before_script:
   - cd src
 
 script:
+  # Download net
+  - make net
+
   # Obtain bench reference from git log
   - git log HEAD | grep "\b[Bb]ench[ :]\+[0-9]\{7\}" | head -n 1 | sed "s/[^0-9]*\([0-9]*\).*/\1/g" > git_sig
   - export benchref=$(cat git_sig)
   - echo "Reference bench:" $benchref
 
-  #
   # Compiler version string
   - $COMPILER -v
 
-  #
+  # test help target
+  - make help
+
   # Verify bench number against various builds
   - export CXXFLAGS="-Werror -D_GLIBCXX_DEBUG"
-  - make clean && make -j2 ARCH=x86-64 optimize=no debug=yes build && ../tests/signature.sh $benchref
+  - make clean && make -j2 ARCH=x86-64-modern optimize=no debug=yes build && ../tests/signature.sh $benchref
+  - export CXXFLAGS="-Werror"
+  - make clean && make -j2 ARCH=x86-64-modern build && ../tests/signature.sh $benchref
+  - make clean && make -j2 ARCH=x86-64-ssse3 build && ../tests/signature.sh $benchref
+  - make clean && make -j2 ARCH=x86-64-sse3-popcnt build && ../tests/signature.sh $benchref
+  - make clean && make -j2 ARCH=x86-64 build && ../tests/signature.sh $benchref
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-64 build && ../tests/signature.sh $benchref; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 optimize=no debug=yes build && ../tests/signature.sh $benchref; fi
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-sse41-popcnt build && ../tests/signature.sh $benchref; fi
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-sse2 build && ../tests/signature.sh $benchref; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 build && ../tests/signature.sh $benchref; fi
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-32 build && ../tests/signature.sh $benchref; fi
+  # workaround: exclude a custom version of llvm+clang, which doesn't find llvm-profdata on ubuntu
+  - if [[ "$TRAVIS_OS_NAME" != "linux" || "$COMP" == "gcc" ]]; then make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref; fi
+
+  # compile only for some more advanced architectures (might not run in travis)
+  - make clean && make -j2 ARCH=x86-64-avx2 build
+  - make clean && make -j2 ARCH=x86-64-bmi2 build
+  - make clean && make -j2 ARCH=x86-64-avx512 build
+  - make clean && make -j2 ARCH=x86-64-vnni512 build
+  - make clean && make -j2 ARCH=x86-64-vnni256 build
 
   #
   # Check perft and reproducible search
-  - export CXXFLAGS="-Werror"
-  - make clean && make -j2 ARCH=x86-64 build
+  - make clean && make -j2 ARCH=x86-64-modern build
   - ../tests/perft.sh
   - ../tests/reprosearch.sh
 
@@ -70,11 +91,11 @@ script:
   # Valgrind
   #
   - export CXXFLAGS="-O1 -fno-inline"
-  - if [ -x "$(command -v valgrind )" ]; then make clean && make -j2 ARCH=x86-64 debug=yes optimize=no build > /dev/null && ../tests/instrumented.sh --valgrind; fi
+  - if [ -x "$(command -v valgrind )" ]; then make clean && make -j2 ARCH=x86-64-modern debug=yes optimize=no build > /dev/null && ../tests/instrumented.sh --valgrind; fi
   - if [ -x "$(command -v valgrind )" ]; then ../tests/instrumented.sh --valgrind-thread; fi
 
   #
   # Sanitizer
   #
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64 sanitize=undefined optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-undefined; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64 sanitize=thread    optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-thread; fi
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-undefined; fi
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64-modern sanitize=thread    optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-thread; fi
diff --git a/AUTHORS b/AUTHORS
index 21ef3e50..c96f870a 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -53,11 +53,13 @@ Ernesto Gatti
 Linmiao Xu (linrock)
 Fabian Beuke (madnight)
 Fabian Fichter (ianfab)
+Fanael Linithien (Fanael)
 fanon
 Fauzi Akram Dabat (FauziAkram)
 Felix Wittmann
 gamander
 Gary Heckman (gheckman)
+George Sobala (gsobala)
 gguliash
 Gian-Carlo Pascutto (gcp)
 Gontran Lemaire (gonlem)
@@ -126,6 +128,7 @@ Niklas Fiekas (niklasf)
 Nikolay Kostov (NikolayIT)
 Nguyen Pham (nguyenpham)
 Norman Schmidt (FireFather)
+notruck
 Ondrej Mosnáček (WOnder93)
 Oskar Werkelin Ahlin
 Pablo Vazquez
diff --git a/appveyor.yml b/appveyor.yml
index d356ba2f..a3732a23 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -61,6 +61,20 @@ before_build:
 
 build_script:
   - cmake --build . --config %CONFIGURATION% -- /verbosity:minimal
+  - ps: |
+      # Download the default NNUE net from fishtest; its name (nn-<12 chars>.nnue) is read from the Option line in src\ucioption.cpp
+      $nnuenet = Get-Content -Path src\ucioption.cpp | Select-String -CaseSensitive -Pattern "Option" | Select-String -CaseSensitive -Pattern "nn-[a-z0-9]{12}\.nnue"
+      $dummy = $nnuenet -match "(?<nnuenet>nn-[a-z0-9]{12}\.nnue)"
+      $nnuenet = $Matches.nnuenet
+      Write-Host "Default net:" $nnuenet
+      $nnuedownloadurl = "https://tests.stockfishchess.org/api/nn/$nnuenet"
+      $nnuefilepath = "src\${env:CONFIGURATION}\$nnuenet"
+      if (Test-Path -Path $nnuefilepath) {
+            Write-Host "Already available."
+      } else {
+            Write-Host "Downloading $nnuedownloadurl to $nnuefilepath"
+            Invoke-WebRequest -Uri $nnuedownloadurl -OutFile $nnuefilepath
+      }
 
 before_test:
   - cd src/%CONFIGURATION%
diff --git a/src/Makefile b/src/Makefile
index a8736a15..75e39557 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -82,14 +82,16 @@ endif
 # bits = 64/32        --- -DIS_64BIT       --- 64-/32-bit operating system
 # prefetch = yes/no   --- -DUSE_PREFETCH   --- Use prefetch asm-instruction
 # popcnt = yes/no     --- -DUSE_POPCNT     --- Use popcnt asm-instruction
+# pext = yes/no       --- -DUSE_PEXT       --- Use pext x86_64 asm-instruction
 # sse = yes/no        --- -msse            --- Use Intel Streaming SIMD Extensions
-# sse3 = yes/no       --- -msse3           --- Use Intel Streaming SIMD Extensions 3
+# mmx = yes/no        --- -mmmx            --- Use Intel MMX instructions
+# sse2 = yes/no       --- -msse2           --- Use Intel Streaming SIMD Extensions 2
 # ssse3 = yes/no      --- -mssse3          --- Use Intel Supplemental Streaming SIMD Extensions 3
 # sse41 = yes/no      --- -msse4.1         --- Use Intel Streaming SIMD Extensions 4.1
-# sse42 = yes/no      --- -msse4.2         --- Use Intel Streaming SIMD Extensions 4.2
 # avx2 = yes/no       --- -mavx2           --- Use Intel Advanced Vector Extensions 2
-# pext = yes/no       --- -DUSE_PEXT       --- Use pext x86_64 asm-instruction
 # avx512 = yes/no     --- -mavx512bw       --- Use Intel Advanced Vector Extensions 512
+# vnni256 = yes/no    --- -mavx512vnni     --- Use Intel Vector Neural Network Instructions 256
+# vnni512 = yes/no    --- -mavx512vnni     --- Use Intel Vector Neural Network Instructions 512
 # neon = yes/no       --- -DUSE_NEON       --- Use ARM SIMD architecture
 #
 # Note that Makefile is space sensitive, so when adding new architectures
@@ -97,152 +99,184 @@ endif
 # at the end of the line for flag values.
 
 ### 2.1. General and architecture defaults
+
+# explicitly check for the list of supported architectures (as listed with make help),
+# the user can override with `make ARCH=x86-32-vnni256 SUPPORTED_ARCH=true`
+ifeq ($(ARCH),$(filter $(ARCH),x86-64-vnni512 x86-64-vnni256 x86-64-avx512 x86-64-bmi2 x86-64-avx2 \
+                               x86-64-sse41-popcnt x86-64-modern x86-64-ssse3 x86-64-sse3-popcnt \
+                               x86-64 x86-32-sse41-popcnt x86-32-sse2 x86-32 ppc-64 ppc-32 \
+                               armv7 armv7-neon armv8 apple-silicon general-64 general-32))
+   SUPPORTED_ARCH=true
+else
+   SUPPORTED_ARCH=false
+endif
+
 optimize = yes
 debug = no
 sanitize = no
 bits = 64
 prefetch = no
 popcnt = no
+pext = no
 sse = no
-sse3 = no
+mmx = no
+sse2 = no
 ssse3 = no
 sse41 = no
-sse42 = no
 avx2 = no
-pext = no
 avx512 = no
+vnni256 = no
+vnni512 = no
 neon = no
 ARCH = x86-64-modern
+STRIP = strip
 
 ### 2.2 Architecture specific
+
+ifeq ($(findstring x86,$(ARCH)),x86)
+
+# x86-32/64
+
+ifeq ($(findstring x86-32,$(ARCH)),x86-32)
+	arch = i386
+	bits = 32
+	sse = yes
+	mmx = yes
+else
+	arch = x86_64
+	sse = yes
+	sse2 = yes
+endif
+
+ifeq ($(findstring -sse,$(ARCH)),-sse)
+	sse = yes
+endif
+
+ifeq ($(findstring -popcnt,$(ARCH)),-popcnt)
+	popcnt = yes
+endif
+
+ifeq ($(findstring -mmx,$(ARCH)),-mmx)
+	mmx = yes
+endif
+
+ifeq ($(findstring -sse2,$(ARCH)),-sse2)
+	sse = yes
+	sse2 = yes
+endif
+
+ifeq ($(findstring -ssse3,$(ARCH)),-ssse3)
+	sse = yes
+	sse2 = yes
+	ssse3 = yes
+endif
+
+ifeq ($(findstring -sse41,$(ARCH)),-sse41)
+	sse = yes
+	sse2 = yes
+	ssse3 = yes
+	sse41 = yes
+endif
+
+ifeq ($(findstring -modern,$(ARCH)),-modern)
+	popcnt = yes
+	sse = yes
+	sse2 = yes
+	ssse3 = yes
+	sse41 = yes
+endif
+
+ifeq ($(findstring -avx2,$(ARCH)),-avx2)
+	popcnt = yes
+	sse = yes
+	sse2 = yes
+	ssse3 = yes
+	sse41 = yes
+	avx2 = yes
+endif
+
+ifeq ($(findstring -bmi2,$(ARCH)),-bmi2)
+	popcnt = yes
+	sse = yes
+	sse2 = yes
+	ssse3 = yes
+	sse41 = yes
+	avx2 = yes
+	pext = yes
+endif
+
+ifeq ($(findstring -avx512,$(ARCH)),-avx512)
+	popcnt = yes
+	sse = yes
+	sse2 = yes
+	ssse3 = yes
+	sse41 = yes
+	avx2 = yes
+	pext = yes
+	avx512 = yes
+endif
+
+ifeq ($(findstring -vnni256,$(ARCH)),-vnni256)
+	popcnt = yes
+	sse = yes
+	sse2 = yes
+	ssse3 = yes
+	sse41 = yes
+	avx2 = yes
+	pext = yes
+	vnni256 = yes
+endif
+
+ifeq ($(findstring -vnni512,$(ARCH)),-vnni512)
+	popcnt = yes
+	sse = yes
+	sse2 = yes
+	ssse3 = yes
+	sse41 = yes
+	avx2 = yes
+	pext = yes
+	avx512 = yes
+	vnni512 = yes
+endif
+
+ifeq ($(sse),yes)
+	prefetch = yes
+endif
+
+# 64-bit pext is not available on x86-32
+ifeq ($(bits),32)
+	pext = no
+endif
+
+else
+
+# all other architectures
+
 ifeq ($(ARCH),general-32)
 	arch = any
 	bits = 32
 endif
 
-ifeq ($(ARCH),x86-32-old)
-	arch = i386
-	bits = 32
-endif
-
-ifeq ($(ARCH),x86-32)
-	arch = i386
-	bits = 32
-	prefetch = yes
-	sse = yes
-endif
-
 ifeq ($(ARCH),general-64)
 	arch = any
 endif
 
-ifeq ($(ARCH),x86-64)
-	arch = x86_64
-	prefetch = yes
-	sse = yes
-endif
-
-ifeq ($(ARCH),x86-64-sse3)
-	arch = x86_64
-	prefetch = yes
-	sse = yes
-	sse3 = yes
-endif
-
-ifeq ($(ARCH),x86-64-sse3-popcnt)
-	arch = x86_64
-	prefetch = yes
-	sse = yes
-	sse3 = yes
-	popcnt = yes
-endif
-
-ifeq ($(ARCH),x86-64-ssse3)
-	arch = x86_64
-	prefetch = yes
-	sse = yes
-	sse3 = yes
-	ssse3 = yes
-endif
-
-ifeq ($(ARCH),x86-64-sse41)
-	arch = x86_64
-	prefetch = yes
-	popcnt = yes
-	sse = yes
-	sse3 = yes
-	ssse3 = yes
-	sse41 = yes
-endif
-
-ifeq ($(ARCH),x86-64-modern)
-	arch = x86_64
-	prefetch = yes
-	popcnt = yes
-	sse = yes
-	sse3 = yes
-	ssse3 = yes
-	sse41 = yes
-endif
-
-ifeq ($(ARCH),x86-64-sse42)
-	arch = x86_64
-	prefetch = yes
-	popcnt = yes
-	sse = yes
-	sse3 = yes
-	ssse3 = yes
-	sse41 = yes
-	sse42 = yes
-endif
-
-ifeq ($(ARCH),x86-64-avx2)
-	arch = x86_64
-	prefetch = yes
-	popcnt = yes
-	sse = yes
-	sse3 = yes
-	ssse3 = yes
-	sse41 = yes
-	sse42 = yes
-	avx2 = yes
-endif
-
-ifeq ($(ARCH),x86-64-bmi2)
-	arch = x86_64
-	prefetch = yes
-	popcnt = yes
-	sse = yes
-	sse3 = yes
-	ssse3 = yes
-	sse41 = yes
-	sse42 = yes
-	avx2 = yes
-	pext = yes
-endif
-
-ifeq ($(ARCH),x86-64-avx512)
-	arch = x86_64
-	prefetch = yes
-	popcnt = yes
-	sse = yes
-	sse3 = yes
-	ssse3 = yes
-	sse41 = yes
-	sse42 = yes
-	avx2 = yes
-	pext = yes
-	avx512 = yes
-endif
-
 ifeq ($(ARCH),armv7)
 	arch = armv7
 	prefetch = yes
 	bits = 32
 endif
 
+ifeq ($(ARCH),armv7-neon)
+	arch = armv7
+	prefetch = yes
+	popcnt = yes
+	neon = yes
+	bits = 32
+endif
+
 ifeq ($(ARCH),armv8)
-	arch = armv8-a
+	arch = armv8
 	prefetch = yes
 	popcnt = yes
 	neon = yes
@@ -266,6 +300,8 @@ ifeq ($(ARCH),ppc-64)
 	prefetch = yes
 endif
 
+endif
+
 ### ==========================================================================
 ### Section 3. Low-level Configuration
 ### ==========================================================================
@@ -284,7 +320,7 @@ ifeq ($(COMP),gcc)
 	CXX=g++
 	CXXFLAGS += -pedantic -Wextra -Wshadow
 
-	ifeq ($(ARCH),$(filter $(ARCH),armv7 armv8))
+	ifeq ($(arch),$(filter $(arch),armv7 armv8))
 		ifeq ($(OS),Android)
 			CXXFLAGS += -m$(bits)
 			LDFLAGS += -m$(bits)
@@ -294,12 +330,13 @@ ifeq ($(COMP),gcc)
 		LDFLAGS += -m$(bits)
 	endif
 
+	ifeq ($(arch),$(filter $(arch),armv7))
+		LDFLAGS += -latomic
+	endif
+
 	ifneq ($(KERNEL),Darwin)
 	   LDFLAGS += -Wl,--no-as-needed
 	endif
-	
-	gccversion = $(shell $(CXX) --version)
-	gccisclang = $(findstring clang,$(gccversion))
 endif
 
 ifeq ($(COMP),mingw)
@@ -344,7 +381,7 @@ ifeq ($(COMP),clang)
 	endif
 	endif
 
-	ifeq ($(ARCH),$(filter $(ARCH),armv7 armv8))
+	ifeq ($(arch),$(filter $(arch),armv7 armv8))
 		ifeq ($(OS),Android)
 			CXXFLAGS += -m$(bits)
 			LDFLAGS += -m$(bits)
@@ -371,6 +408,26 @@ endif
 ifeq ($(KERNEL),Darwin)
 	CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.14
 	LDFLAGS += -arch $(arch) -mmacosx-version-min=10.14
+	XCRUN = xcrun
+endif
+
+# To cross-compile for Android, NDK version r21 or later is recommended.
+# In earlier NDK versions, you'll need to pass -fno-addrsig if using GNU binutils.
+# Currently we don't know how to make PGO builds with the NDK yet.
+ifeq ($(COMP),ndk)
+	CXXFLAGS += -stdlib=libc++ -fPIE
+	ifeq ($(arch),armv7)
+		comp=armv7a-linux-androideabi16-clang
+		CXX=armv7a-linux-androideabi16-clang++
+		CXXFLAGS += -mthumb -march=armv7-a -mfloat-abi=softfp -mfpu=neon
+		STRIP=arm-linux-androideabi-strip
+	endif
+	ifeq ($(arch),armv8)
+		comp=aarch64-linux-android21-clang
+		CXX=aarch64-linux-android21-clang++
+		STRIP=aarch64-linux-android-strip
+	endif
+	LDFLAGS += -static-libstdc++ -pie -lm -latomic
 endif
 
 ### Travis CI script uses COMPILER to overwrite CXX
@@ -383,16 +440,29 @@ ifdef COMPCXX
 	CXX=$(COMPCXX)
 endif
 
+### Sometimes gcc is really clang
+ifeq ($(COMP),gcc)
+	gccversion = $(shell $(CXX) --version)
+	gccisclang = $(findstring clang,$(gccversion))
+	ifneq ($(gccisclang),)
+		profile_make = clang-profile-make
+		profile_use = clang-profile-use
+	endif
+endif
+
 ### On mingw use Windows threads, otherwise POSIX
 ifneq ($(comp),mingw)
+	CXXFLAGS += -DUSE_PTHREADS
 	# On Android Bionic's C library comes with its own pthread implementation bundled in
 	ifneq ($(OS),Android)
 		# Haiku has pthreads in its libroot, so only link it in on other platforms
 		ifneq ($(KERNEL),Haiku)
+			ifneq ($(COMP),ndk)
-			LDFLAGS += -lpthread
+				LDFLAGS += -lpthread
+			endif
 		endif
 	endif
 endif
 
 ### 3.2.1 Debugging
 ifeq ($(debug),no)
@@ -434,7 +504,6 @@ endif
 ifeq ($(prefetch),yes)
 	ifeq ($(sse),yes)
 		CXXFLAGS += -msse
-		DEPENDFLAGS += -msse
 	endif
 else
 	CXXFLAGS += -DNO_PREFETCH
@@ -442,7 +511,7 @@ endif
 
 ### 3.6 popcnt
 ifeq ($(popcnt),yes)
-	ifeq ($(arch),$(filter $(arch),ppc64 armv8-a arm64))
+	ifeq ($(arch),$(filter $(arch),ppc64 armv7 armv8 arm64))
 		CXXFLAGS += -DUSE_POPCNT
 	else ifeq ($(comp),icc)
 		CXXFLAGS += -msse3 -DUSE_POPCNT
@@ -451,6 +520,7 @@ ifeq ($(popcnt),yes)
 	endif
 endif
 
+
 ifeq ($(avx2),yes)
 	CXXFLAGS += -DUSE_AVX2
 	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
@@ -461,14 +531,21 @@ endif
 ifeq ($(avx512),yes)
 	CXXFLAGS += -DUSE_AVX512
 	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
-		CXXFLAGS += -mavx512bw
+		CXXFLAGS += -mavx512f -mavx512bw
 	endif
 endif
 
-ifeq ($(sse42),yes)
-	CXXFLAGS += -DUSE_SSE42
+ifeq ($(vnni256),yes)
+	CXXFLAGS += -DUSE_VNNI
 	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
-		CXXFLAGS += -msse4.2
+		CXXFLAGS += -mavx512f -mavx512bw -mavx512vnni -mavx512dq -mavx512vl -mprefer-vector-width=256
+	endif
+endif
+
+ifeq ($(vnni512),yes)
+	CXXFLAGS += -DUSE_VNNI
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
+		CXXFLAGS += -mavx512vnni -mavx512dq -mavx512vl
 	endif
 endif
 
@@ -486,19 +563,29 @@ ifeq ($(ssse3),yes)
 	endif
 endif
 
-ifeq ($(sse3),yes)
-	CXXFLAGS += -DUSE_SSE3
+ifeq ($(sse2),yes)
+	CXXFLAGS += -DUSE_SSE2
 	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
-		CXXFLAGS += -msse3
+		CXXFLAGS += -msse2
+	endif
+endif
+
+ifeq ($(mmx),yes)
+	CXXFLAGS += -DUSE_MMX
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
+		CXXFLAGS += -mmmx
 	endif
 endif
 
 ifeq ($(neon),yes)
 	CXXFLAGS += -DUSE_NEON
+	ifeq ($(KERNEL),Linux)
+	ifneq ($(COMP),ndk)
+	ifneq ($(arch),armv8)
+		CXXFLAGS += -mfpu=neon
+	endif
+	endif
 endif
-
-ifeq ($(arch),x86_64)
-	CXXFLAGS += -DUSE_SSE2
 endif
 
 ### 3.7 pext
@@ -514,7 +601,10 @@ endif
 ### needs access to the optimization flags.
 ifeq ($(optimize),yes)
 ifeq ($(debug), no)
-	ifeq ($(comp),clang)
+	ifeq ($(COMP),ndk)
+		CXXFLAGS += -flto=thin
+		LDFLAGS += $(CXXFLAGS)
+	else ifeq ($(comp),clang)
 		CXXFLAGS += -flto=thin
 		LDFLAGS += $(CXXFLAGS)
 
@@ -524,13 +614,18 @@ ifeq ($(debug), no)
 	ifeq ($(gccisclang),)
 		CXXFLAGS += -flto
 		LDFLAGS += $(CXXFLAGS) -flto=jobserver
+		ifneq ($(findstring MINGW,$(KERNEL)),)
+			LDFLAGS += -save-temps
+		else ifneq ($(findstring MSYS,$(KERNEL)),)
+			LDFLAGS += -save-temps
+		endif
 	else
 		CXXFLAGS += -flto=thin
 		LDFLAGS += $(CXXFLAGS)
 	endif
 
 # To use LTO and static linking on windows, the tool chain requires a recent gcc:
-# gcc version 10.1 in msys2 or TDM-GCC version 9.2 are know to work, older might not.
+# gcc version 10.1 in msys2 or TDM-GCC version 9.2 are known to work, older might not.
 # So, only enable it for a cross from Linux by default.
 	else ifeq ($(comp),mingw)
 	ifeq ($(KERNEL),Linux)
@@ -552,6 +647,7 @@ endif
 ### Section 4. Public Targets
 ### ==========================================================================
 
+
 help:
 	@echo ""
 	@echo "To compile stockfish, type: "
@@ -560,31 +656,34 @@ help:
 	@echo ""
 	@echo "Supported targets:"
 	@echo ""
+	@echo "help                    > Display architecture details"
 	@echo "build                   > Standard build"
-	@echo "profile-build           > Standard build with PGO"
+	@echo "net                     > Download the default nnue net"
+	@echo "profile-build           > Faster build (with profile-guided optimization)"
 	@echo "strip                   > Strip executable"
 	@echo "install                 > Install executable"
 	@echo "clean                   > Clean up"
-	@echo "net                     > Download the default nnue net"
 	@echo ""
 	@echo "Supported archs:"
 	@echo ""
+	@echo "x86-64-vnni512          > x86 64-bit with vnni support 512bit wide"
+	@echo "x86-64-vnni256          > x86 64-bit with vnni support 256bit wide"
 	@echo "x86-64-avx512           > x86 64-bit with avx512 support"
 	@echo "x86-64-bmi2             > x86 64-bit with bmi2 support"
 	@echo "x86-64-avx2             > x86 64-bit with avx2 support"
-	@echo "x86-64-sse42            > x86 64-bit with sse42 support"
-	@echo "x86-64-modern           > x86 64-bit with sse41 support (x86-64-sse41)"
-	@echo "x86-64-sse41            > x86 64-bit with sse41 support"
+	@echo "x86-64-sse41-popcnt     > x86 64-bit with sse41 and popcnt support"
+	@echo "x86-64-modern           > common modern CPU, currently x86-64-sse41-popcnt"
 	@echo "x86-64-ssse3            > x86 64-bit with ssse3 support"
 	@echo "x86-64-sse3-popcnt      > x86 64-bit with sse3 and popcnt support"
-	@echo "x86-64-sse3             > x86 64-bit with sse3 support"
-	@echo "x86-64                  > x86 64-bit generic"
-	@echo "x86-32                  > x86 32-bit (also enables SSE)"
-	@echo "x86-32-old              > x86 32-bit fall back for old hardware"
+	@echo "x86-64                  > x86 64-bit generic (with sse2 support)"
+	@echo "x86-32-sse41-popcnt     > x86 32-bit with sse41 and popcnt support"
+	@echo "x86-32-sse2             > x86 32-bit with sse2 support"
+	@echo "x86-32                  > x86 32-bit generic (with mmx and sse support)"
 	@echo "ppc-64                  > PPC 64-bit"
 	@echo "ppc-32                  > PPC 32-bit"
 	@echo "armv7                   > ARMv7 32-bit"
-	@echo "armv8                   > ARMv8 64-bit"
+	@echo "armv7-neon              > ARMv7 32-bit with popcnt and neon"
+	@echo "armv8                   > ARMv8 64-bit with popcnt and neon"
 	@echo "apple-silicon           > Apple silicon ARM64"
 	@echo "general-64              > unspecified 64-bit"
 	@echo "general-32              > unspecified 32-bit"
@@ -595,20 +694,26 @@ help:
 	@echo "mingw                   > Gnu compiler with MinGW under Windows"
 	@echo "clang                   > LLVM Clang compiler"
 	@echo "icc                     > Intel compiler"
+	@echo "ndk                     > Google NDK to cross-compile for Android"
 	@echo ""
 	@echo "Simple examples. If you don't know what to do, you likely want to run: "
 	@echo ""
-	@echo "make -j build ARCH=x86-64    (This is for 64-bit systems)"
-	@echo "make -j build ARCH=x86-32    (This is for 32-bit systems)"
+	@echo "make -j build ARCH=x86-64  (A portable, slow compile for 64-bit systems)"
+	@echo "make -j build ARCH=x86-32  (A portable, slow compile for 32-bit systems)"
 	@echo ""
-	@echo "Advanced examples, for experienced users: "
+	@echo "Advanced examples, for experienced users looking for performance: "
 	@echo ""
-	@echo "make -j build ARCH=x86-64-modern COMP=clang"
-	@echo "make -j profile-build ARCH=x86-64-bmi2 COMP=gcc COMPCXX=g++-4.8"
-	@echo ""
-	@echo "The selected architecture $(ARCH) enables the following configuration: "
+	@echo "make    help  ARCH=x86-64-bmi2"
+	@echo "make -j profile-build ARCH=x86-64-bmi2 COMP=gcc COMPCXX=g++-9.0"
+	@echo "make -j build ARCH=x86-64-ssse3 COMP=clang"
 	@echo ""
+	@echo "-------------------------------"
+ifeq ($(SUPPORTED_ARCH), true)
+	@echo "The selected architecture $(ARCH) will enable the following configuration: "
 	@$(MAKE) ARCH=$(ARCH) COMP=$(COMP) config-sanity
+else
+	@echo "Specify a supported architecture with the ARCH option for more details"
+endif
 
 
 .PHONY: help build profile-build strip install clean net objclean profileclean \
@@ -618,7 +723,7 @@ help:
 build: config-sanity
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) all
 
-profile-build: config-sanity objclean profileclean
+profile-build: net config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 1/4. Building instrumented executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make)
@@ -634,7 +739,7 @@ profile-build: config-sanity objclean profileclean
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) profileclean
 
 strip:
-	strip $(EXE)
+	$(STRIP) $(EXE)
 
 install:
 	-mkdir -p -m 755 $(BINDIR)
@@ -649,17 +754,34 @@ net:
 	$(eval nnuenet := $(shell grep EvalFile ucioption.cpp | grep Option | sed 's/.*\(nn-[a-z0-9]\{12\}.nnue\).*/\1/'))
 	@echo "Default net: $(nnuenet)"
 	$(eval nnuedownloadurl := https://tests.stockfishchess.org/api/nn/$(nnuenet))
-	$(eval curl_or_wget := $(shell if hash curl 2>/dev/null; then echo "curl -sL"; elif hash wget 2>/dev/null; then echo "wget -qO-"; fi))
-	@if test -f "$(nnuenet)"; then echo "Already available."; else echo "Downloading $(nnuedownloadurl)"; $(curl_or_wget) $(nnuedownloadurl) > $(nnuenet); fi
+	$(eval curl_or_wget := $(shell if hash curl 2>/dev/null; then echo "curl -skL"; elif hash wget 2>/dev/null; then echo "wget -qO-"; fi))
+	@if test -f "$(nnuenet)"; then \
+            echo "Already available."; \
+         else \
+            if [ "x$(curl_or_wget)" = "x" ]; then \
+               echo "Automatic download failed: neither curl nor wget is installed. Install one of these tools or download the net manually"; exit 1; \
+            else \
+               echo "Downloading $(nnuedownloadurl)"; $(curl_or_wget) $(nnuedownloadurl) > $(nnuenet);\
+            fi; \
+        fi;
+	$(eval shasum_command := $(shell if hash shasum 2>/dev/null; then echo "shasum -a 256 "; elif hash sha256sum 2>/dev/null; then echo "sha256sum "; fi))
+	@if [ "x$(shasum_command)" != "x" ]; then \
+	    if [ "$(nnuenet)" != "nn-"`$(shasum_command) $(nnuenet) | cut -c1-12`".nnue" ]; then \
+                echo "Failed download or $(nnuenet) corrupted, please delete!"; exit 1; \
+            fi \
+         else \
+            echo "shasum / sha256sum not found, skipping net validation"; \
+        fi
+
 
 # clean binaries and objects
 objclean:
-	@rm -f $(EXE) *.o ./syzygy/*.o ./learn/*.o ./extra/*.o ./eval/*.o ./nnue/*.o ./nnue/features/*.o
+	@rm -f $(EXE) *.o ./syzygy/*.o ./nnue/*.o ./nnue/features/*.o ./learn/*.o ./extra/*.o ./eval/*.o
 
 # clean auxiliary profiling files
 profileclean:
 	@rm -rf profdir
-	@rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./learn/*.gcda ./extra/*.gcda ./eval/*.gcda ./nnue/*.gcda ./nnue/features/*.gcda
+	@rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./nnue/*.gcda ./nnue/features/*.gcda *.s ./learn/*.gcda ./extra/*.gcda ./eval/*.gcda
 	@rm -f stockfish.profdata *.profraw
 
 default:
@@ -683,14 +805,16 @@ config-sanity:
 	@echo "os: '$(OS)'"
 	@echo "prefetch: '$(prefetch)'"
 	@echo "popcnt: '$(popcnt)'"
+	@echo "pext: '$(pext)'"
 	@echo "sse: '$(sse)'"
-	@echo "sse3: '$(sse3)'"
+	@echo "mmx: '$(mmx)'"
+	@echo "sse2: '$(sse2)'"
 	@echo "ssse3: '$(ssse3)'"
 	@echo "sse41: '$(sse41)'"
-	@echo "sse42: '$(sse42)'"
 	@echo "avx2: '$(avx2)'"
-	@echo "pext: '$(pext)'"
 	@echo "avx512: '$(avx512)'"
+	@echo "vnni256: '$(vnni256)'"
+	@echo "vnni512: '$(vnni512)'"
 	@echo "neon: '$(neon)'"
 	@echo ""
 	@echo "Flags:"
@@ -703,22 +827,26 @@ config-sanity:
 	@test "$(debug)" = "yes" || test "$(debug)" = "no"
 	@test "$(sanitize)" = "undefined" || test "$(sanitize)" = "thread" || test "$(sanitize)" = "address" || test "$(sanitize)" = "no"
 	@test "$(optimize)" = "yes" || test "$(optimize)" = "no"
+	@test "$(SUPPORTED_ARCH)" = "true"
 	@test "$(arch)" = "any" || test "$(arch)" = "x86_64" || test "$(arch)" = "i386" || \
 	 test "$(arch)" = "ppc64" || test "$(arch)" = "ppc" || \
-	 test "$(arch)" = "armv7" || test "$(arch)" = "armv8-a" || test "$(arch)" = "arm64"
+	 test "$(arch)" = "armv7" || test "$(arch)" = "armv8" || test "$(arch)" = "arm64"
 	@test "$(bits)" = "32" || test "$(bits)" = "64"
 	@test "$(prefetch)" = "yes" || test "$(prefetch)" = "no"
 	@test "$(popcnt)" = "yes" || test "$(popcnt)" = "no"
+	@test "$(pext)" = "yes" || test "$(pext)" = "no"
 	@test "$(sse)" = "yes" || test "$(sse)" = "no"
-	@test "$(sse3)" = "yes" || test "$(sse3)" = "no"
+	@test "$(mmx)" = "yes" || test "$(mmx)" = "no"
+	@test "$(sse2)" = "yes" || test "$(sse2)" = "no"
 	@test "$(ssse3)" = "yes" || test "$(ssse3)" = "no"
 	@test "$(sse41)" = "yes" || test "$(sse41)" = "no"
-	@test "$(sse42)" = "yes" || test "$(sse42)" = "no"
 	@test "$(avx2)" = "yes" || test "$(avx2)" = "no"
-	@test "$(pext)" = "yes" || test "$(pext)" = "no"
 	@test "$(avx512)" = "yes" || test "$(avx512)" = "no"
+	@test "$(vnni256)" = "yes" || test "$(vnni256)" = "no"
+	@test "$(vnni512)" = "yes" || test "$(vnni512)" = "no"
 	@test "$(neon)" = "yes" || test "$(neon)" = "no"
-	@test "$(comp)" = "gcc" || test "$(comp)" = "icc" || test "$(comp)" = "mingw" || test "$(comp)" = "clang"
+	@test "$(comp)" = "gcc" || test "$(comp)" = "icc" || test "$(comp)" = "mingw" || test "$(comp)" = "clang" \
+	|| test "$(comp)" = "armv7a-linux-androideabi16-clang"  || test "$(comp)" = "aarch64-linux-android21-clang"
 
 $(EXE): $(OBJS)
 	+$(CXX) -o $@ $(OBJS) $(LDFLAGS)
@@ -730,7 +858,7 @@ clang-profile-make:
 	all
 
 clang-profile-use:
-	llvm-profdata merge -output=stockfish.profdata *.profraw
+	$(XCRUN) llvm-profdata merge -output=stockfish.profdata *.profraw
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
 	EXTRACXXFLAGS='-fprofile-instr-use=stockfish.profdata' \
 	EXTRALDFLAGS='-fprofile-use ' \
diff --git a/src/benchmark.cpp b/src/benchmark.cpp
index 6041d642..806e9840 100644
--- a/src/benchmark.cpp
+++ b/src/benchmark.cpp
@@ -95,8 +95,9 @@ const vector<string> Defaults = {
 /// setup_bench() builds a list of UCI commands to be run by bench. There
-/// are five parameters: TT size in MB, number of search threads that
+/// are six parameters: TT size in MB, number of search threads that
 /// should be used, the limit value spent for each position, a file name
-/// where to look for positions in FEN format and the type of the limit:
-/// depth, perft, nodes and movetime (in millisecs).
+/// where to look for positions in FEN format, the type of the limit
+/// (depth, perft, nodes or movetime in millisecs), and the evaluation
+/// type: mixed (default), classical or NNUE.
 ///
 /// bench -> search default positions up to depth 13
 /// bench 64 1 15 -> search default positions up to depth 15 (TT = 64MB)
@@ -115,6 +116,7 @@ vector<string> setup_bench(const Position& current, istream& is) {
   string limit     = (is >> token) ? token : "13";
   string fenFile   = (is >> token) ? token : "default";
   string limitType = (is >> token) ? token : "depth";
+  string evalType  = (is >> token) ? token : "mixed";
 
   go = limitType == "eval" ? "eval" : "go " + limitType + " " + limit;
 
@@ -146,13 +148,20 @@ vector<string> setup_bench(const Position& current, istream& is) {
   list.emplace_back("setoption name Hash value " + ttSize);
   list.emplace_back("ucinewgame");
 
+  size_t posCounter = 0;
+
   for (const string& fen : fens)
       if (fen.find("setoption") != string::npos)
           list.emplace_back(fen);
       else
       {
+          if (evalType == "classical" || (evalType == "mixed" && posCounter % 2 == 0))
+              list.emplace_back("setoption name Use NNUE value false");
+          else if (evalType == "NNUE" || (evalType == "mixed" && posCounter % 2 != 0))
+              list.emplace_back("setoption name Use NNUE value true");
           list.emplace_back("position fen " + fen);
           list.emplace_back(go);
+          ++posCounter;
       }
 
   return list;
diff --git a/src/bitboard.cpp b/src/bitboard.cpp
index f531010c..80206b58 100644
--- a/src/bitboard.cpp
+++ b/src/bitboard.cpp
@@ -39,6 +39,16 @@ namespace {
   Bitboard BishopTable[0x1480]; // To store bishop attacks
 
   void init_magics(PieceType pt, Bitboard table[], Magic magics[]);
+
+}
+
+
+/// safe_destination() returns the bitboard of target square for the given step
+/// from the given square. If the step is off the board, returns empty bitboard.
+
+inline Bitboard safe_destination(Square s, int step) {
+    Square to = Square(s + step);
+    return is_ok(to) && distance(s, to) <= 2 ? square_bb(to) : Bitboard(0);
 }
 
 
@@ -110,7 +120,7 @@ namespace {
     Direction   RookDirections[4] = {NORTH, SOUTH, EAST, WEST};
     Direction BishopDirections[4] = {NORTH_EAST, SOUTH_EAST, SOUTH_WEST, NORTH_WEST};
 
-    for(Direction d : (pt == ROOK ? RookDirections : BishopDirections))
+    for (Direction d : (pt == ROOK ? RookDirections : BishopDirections))
     {
         Square s = sq;
         while(safe_destination(s, d) && !(occupied & s))
diff --git a/src/bitboard.h b/src/bitboard.h
index a899d879..29d8f66d 100644
--- a/src/bitboard.h
+++ b/src/bitboard.h
@@ -279,16 +279,6 @@ inline int edge_distance(File f) { return std::min(f, File(FILE_H - f)); }
 inline int edge_distance(Rank r) { return std::min(r, Rank(RANK_8 - r)); }
 
 
-/// safe_destination() returns the bitboard of target square for the given step
-/// from the given square. If the step is off the board, returns empty bitboard.
-
-inline Bitboard safe_destination(Square s, int step)
-{
-    Square to = Square(s + step);
-    return is_ok(to) && distance(s, to) <= 2 ? square_bb(to) : Bitboard(0);
-}
-
-
 /// attacks_bb(Square) returns the pseudo attacks of the give piece type
 /// assuming an empty board.
 
diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 4ba89675..5cbf821d 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -61,10 +61,11 @@ namespace Eval {
         UCI::OptionsMap defaults;
         UCI::init(defaults);
 
-        std::cerr << "NNUE evaluation used, but the network file " << eval_file << " was not loaded successfully. "
-                  << "These network evaluation parameters must be available, and compatible with this version of the code. "
-                  << "The UCI option EvalFile might need to specify the full path, including the directory/folder name, to the file. "
-                  << "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/"+std::string(defaults["EvalFile"]) << std::endl;
+        sync_cout << "info string ERROR: NNUE evaluation used, but the network file " << eval_file << " was not loaded successfully." << sync_endl;
+        sync_cout << "info string ERROR: The UCI option EvalFile might need to specify the full path, including the directory/folder name, to the file." << sync_endl;
+        sync_cout << "info string ERROR: The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/"+std::string(defaults["EvalFile"]) << sync_endl;
+        sync_cout << "info string ERROR: If the UCI option Use NNUE is set to true, network evaluation parameters compatible with the program must be available." << sync_endl;
+        sync_cout << "info string ERROR: The engine will be terminated now." << sync_endl;
         std::exit(EXIT_FAILURE);
     }
 
@@ -122,7 +123,8 @@ namespace {
   constexpr Value LazyThreshold1 =  Value(1400);
   constexpr Value LazyThreshold2 =  Value(1300);
   constexpr Value SpaceThreshold = Value(12222);
-  constexpr Value NNUEThreshold  =   Value(460);
+  constexpr Value NNUEThreshold1 =   Value(550);
+  constexpr Value NNUEThreshold2 =   Value(150);
 
   // KingAttackWeights[PieceType] contains king attack weights by piece type
   constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 81, 52, 44, 10 };
@@ -294,8 +296,8 @@ namespace {
     attackedBy2[Us] = dblAttackByPawn | (attackedBy[Us][KING] & attackedBy[Us][PAWN]);
 
     // Init our king safety tables
-    Square s = make_square(Utility::clamp(file_of(ksq), FILE_B, FILE_G),
-                           Utility::clamp(rank_of(ksq), RANK_2, RANK_7));
+    Square s = make_square(std::clamp(file_of(ksq), FILE_B, FILE_G),
+                           std::clamp(rank_of(ksq), RANK_2, RANK_7));
     kingRing[Us] = attacks_bb<KING>(s) | s;
 
     kingAttackersCount[Them] = popcount(kingRing[Us] & pe->pawn_attacks(Them));
@@ -692,8 +694,8 @@ namespace {
             Square blockSq = s + Up;
 
             // Adjust bonus based on the king's proximity
-            bonus += make_score(0, (  (king_proximity(Them, blockSq) * 19) / 4
-                                     - king_proximity(Us,   blockSq) *  2) * w);
+            bonus += make_score(0, (  king_proximity(Them, blockSq) * 19 / 4
+                                    - king_proximity(Us,   blockSq) *  2) * w);
 
             // If blockSq is not the queening square then consider also a second push
             if (r != RANK_7)
@@ -737,7 +739,7 @@ namespace {
 
 
   // Evaluation::space() computes a space evaluation for a given side, aiming to improve game
-  // play in the opening. It is based on the number of safe squares on the 4 central files
+  // play in the opening. It is based on the number of safe squares on the four central files
   // on ranks 2 to 4. Completely safe squares behind a friendly pawn are counted twice.
   // Finally, the space bonus is multiplied by a weight which decreases according to occupancy.
 
@@ -810,7 +812,7 @@ namespace {
     // Now apply the bonus: note that we find the attacking side by extracting the
     // sign of the midgame or endgame values, and that we carefully cap the bonus
     // so that the midgame and endgame scores do not change sign after the bonus.
-    int u = ((mg > 0) - (mg < 0)) * Utility::clamp(complexity + 50, -abs(mg), 0);
+    int u = ((mg > 0) - (mg < 0)) * std::clamp(complexity + 50, -abs(mg), 0);
     int v = ((eg > 0) - (eg < 0)) * std::max(complexity, -abs(eg));
 
     mg += u;
@@ -935,9 +937,6 @@ make_v:
     // Side to move point of view
     v = (pos.side_to_move() == WHITE ? v : -v) + Tempo;
 
-    // Damp down the evaluation linearly when shuffling
-    v = v * (100 - pos.rule50_count()) / 100;
-
     return v;
   }
 
@@ -954,14 +953,21 @@ Value Eval::evaluate(const Position& pos) {
   }
 #endif
 
-  if (Eval::useNNUE)
-  {
-      Value v = eg_value(pos.psq_score());
-      // Take NNUE eval only on balanced positions
-      if (abs(v) < NNUEThreshold + 20 * pos.count<PAWN>())
-         return NNUE::evaluate(pos) + Tempo;
-  }
-  return Evaluation<NO_TRACE>(pos).value();
+  bool classical = !Eval::useNNUE
+                ||  abs(eg_value(pos.psq_score())) * 16 > NNUEThreshold1 * (16 + pos.rule50_count());
+  Value v = classical ? Evaluation<NO_TRACE>(pos).value()
+                      : NNUE::evaluate(pos) * 5 / 4 + Tempo;
+
+  if (classical && Eval::useNNUE && abs(v) * 16 < NNUEThreshold2 * (16 + pos.rule50_count()))
+      v = NNUE::evaluate(pos) * 5 / 4 + Tempo;
+
+  // Damp down the evaluation linearly when shuffling
+  v = v * (100 - pos.rule50_count()) / 100;
+
+  // Guarantee evaluation does not hit the tablebase range
+  v = std::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1);
+
+  return v;
 }
 
 /// trace() is like evaluate(), but instead of returning a value, it returns
@@ -979,42 +985,46 @@ std::string Eval::trace(const Position& pos) {
 
   Value v;
 
-  if (Eval::useNNUE)
-  {
-      v = NNUE::evaluate(pos);
-  }
-  else
-  {
-      std::memset(scores, 0, sizeof(scores));
+  std::memset(scores, 0, sizeof(scores));
 
-      pos.this_thread()->contempt = SCORE_ZERO; // Reset any dynamic contempt
+  pos.this_thread()->contempt = SCORE_ZERO; // Reset any dynamic contempt
 
-      v = Evaluation<TRACE>(pos).value();
+  v = Evaluation<TRACE>(pos).value();
 
-      ss << std::showpoint << std::noshowpos << std::fixed << std::setprecision(2)
-         << "     Term    |    White    |    Black    |    Total   \n"
-         << "             |   MG    EG  |   MG    EG  |   MG    EG \n"
-         << " ------------+-------------+-------------+------------\n"
-         << "    Material | " << Term(MATERIAL)
-         << "   Imbalance | " << Term(IMBALANCE)
-         << "       Pawns | " << Term(PAWN)
-         << "     Knights | " << Term(KNIGHT)
-         << "     Bishops | " << Term(BISHOP)
-         << "       Rooks | " << Term(ROOK)
-         << "      Queens | " << Term(QUEEN)
-         << "    Mobility | " << Term(MOBILITY)
-         << " King safety | " << Term(KING)
-         << "     Threats | " << Term(THREAT)
-         << "      Passed | " << Term(PASSED)
-         << "       Space | " << Term(SPACE)
-         << "    Winnable | " << Term(WINNABLE)
-         << " ------------+-------------+-------------+------------\n"
-         << "       Total | " << Term(TOTAL);
-  }
+  ss << std::showpoint << std::noshowpos << std::fixed << std::setprecision(2)
+     << "     Term    |    White    |    Black    |    Total   \n"
+     << "             |   MG    EG  |   MG    EG  |   MG    EG \n"
+     << " ------------+-------------+-------------+------------\n"
+     << "    Material | " << Term(MATERIAL)
+     << "   Imbalance | " << Term(IMBALANCE)
+     << "       Pawns | " << Term(PAWN)
+     << "     Knights | " << Term(KNIGHT)
+     << "     Bishops | " << Term(BISHOP)
+     << "       Rooks | " << Term(ROOK)
+     << "      Queens | " << Term(QUEEN)
+     << "    Mobility | " << Term(MOBILITY)
+     << " King safety | " << Term(KING)
+     << "     Threats | " << Term(THREAT)
+     << "      Passed | " << Term(PASSED)
+     << "       Space | " << Term(SPACE)
+     << "    Winnable | " << Term(WINNABLE)
+     << " ------------+-------------+-------------+------------\n"
+     << "       Total | " << Term(TOTAL);
 
   v = pos.side_to_move() == WHITE ? v : -v;
 
-  ss << "\nFinal evaluation: " << to_cp(v) << " (white side)\n";
+  ss << "\nClassical evaluation: " << to_cp(v) << " (white side)\n";
+
+  if (Eval::useNNUE)
+  {
+      v = NNUE::evaluate(pos);
+      v = pos.side_to_move() == WHITE ? v : -v;
+      ss << "\nNNUE evaluation:      " << to_cp(v) << " (white side)\n";
+  }
+
+  v = evaluate(pos);
+  v = pos.side_to_move() == WHITE ? v : -v;
+  ss << "\nFinal evaluation:     " << to_cp(v) << " (white side)\n";
 
   return ss.str();
 }
diff --git a/src/material.cpp b/src/material.cpp
index 0ef9926f..870a5e11 100644
--- a/src/material.cpp
+++ b/src/material.cpp
@@ -130,7 +130,7 @@ Entry* probe(const Position& pos) {
 
   Value npm_w = pos.non_pawn_material(WHITE);
   Value npm_b = pos.non_pawn_material(BLACK);
-  Value npm   = Utility::clamp(npm_w + npm_b, EndgameLimit, MidgameLimit);
+  Value npm   = std::clamp(npm_w + npm_b, EndgameLimit, MidgameLimit);
 
   // Map total non-pawn material into [PHASE_ENDGAME, PHASE_MIDGAME]
   e->gamePhase = Phase(((npm - EndgameLimit) * PHASE_MIDGAME) / (MidgameLimit - EndgameLimit));
diff --git a/src/misc.cpp b/src/misc.cpp
index 725450c2..851280fe 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -51,6 +51,11 @@ typedef bool(*fun3_t)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY);
 #include <sys/mman.h>
 #endif
 
+#if defined(__APPLE__) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32))
+#define POSIXALIGNEDALLOC
+#include <stdlib.h>
+#endif
+
 #include "misc.h"
 #include "thread.h"
 
@@ -214,26 +219,33 @@ const std::string compiler_info() {
 
   compiler += "\nCompilation settings include: ";
   compiler += (Is64Bit ? " 64bit" : " 32bit");
+  #if defined(USE_VNNI)
+    compiler += " VNNI";
+  #endif
   #if defined(USE_AVX512)
     compiler += " AVX512";
   #endif
+  compiler += (HasPext ? " BMI2" : "");
   #if defined(USE_AVX2)
     compiler += " AVX2";
   #endif
-  #if defined(USE_SSE42)
-    compiler += " SSE42";
-  #endif
   #if defined(USE_SSE41)
     compiler += " SSE41";
   #endif
   #if defined(USE_SSSE3)
     compiler += " SSSE3";
   #endif
-  #if defined(USE_SSE3)
-    compiler += " SSE3";
+  #if defined(USE_SSE2)
+    compiler += " SSE2";
   #endif
-    compiler += (HasPext ? " BMI2" : "");
-    compiler += (HasPopCnt ? " POPCNT" : "");
+  compiler += (HasPopCnt ? " POPCNT" : "");
+  #if defined(USE_MMX)
+    compiler += " MMX";
+  #endif
+  #if defined(USE_NEON)
+    compiler += " NEON";
+  #endif
+
   #if !defined(NDEBUG)
     compiler += " DEBUG";
   #endif
@@ -316,14 +328,17 @@ void prefetch(void* addr) {
 
 #endif
 
-/// Wrappers for systems where the c++17 implementation doesn't guarantee the availability of aligned_alloc.
-/// Memory allocated with std_aligned_alloc must be freed with std_aligned_free.
-///
+
+/// std_aligned_alloc() is our wrapper for systems where the c++17 implementation
+/// does not guarantee the availability of aligned_alloc(). Memory allocated with
+/// std_aligned_alloc() must be freed with std_aligned_free().
 
 void* std_aligned_alloc(size_t alignment, size_t size) {
-#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32))
-  return aligned_alloc(alignment, size);
-#elif (defined(_WIN32) || (defined(__APPLE__) && !defined(_LIBCPP_HAS_C11_FEATURES)))
+
+#if defined(POSIXALIGNEDALLOC)
+  void *mem;
+  return posix_memalign(&mem, alignment, size) ? nullptr : mem;
+#elif defined(_WIN32)
   return _mm_malloc(size, alignment);
 #else
   return std::aligned_alloc(alignment, size);
@@ -331,16 +346,17 @@ void* std_aligned_alloc(size_t alignment, size_t size) {
 }
 
 void std_aligned_free(void* ptr) {
-#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32))
+
+#if defined(POSIXALIGNEDALLOC)
   free(ptr);
-#elif (defined(_WIN32) || (defined(__APPLE__) && !defined(_LIBCPP_HAS_C11_FEATURES)))
+#elif defined(_WIN32)
   _mm_free(ptr);
 #else
   free(ptr);
 #endif
 }
 
-/// aligned_ttmem_alloc() will return suitably aligned memory, and if possible use large pages.
+/// aligned_ttmem_alloc() will return suitably aligned memory, if possible using large pages.
 /// The returned pointer is the aligned one, while the mem argument is the one that needs
 /// to be passed to free. With c++17 some of this functionality could be simplified.
 
@@ -352,7 +368,9 @@ void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
   size_t size = ((allocSize + alignment - 1) / alignment) * alignment; // multiple of alignment
   if (posix_memalign(&mem, alignment, size))
      mem = nullptr;
+#if defined(MADV_HUGEPAGE)
   madvise(mem, allocSize, MADV_HUGEPAGE);
+#endif
   return mem;
 }
 
diff --git a/src/misc.h b/src/misc.h
index ecef028f..19bb008c 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -67,14 +67,6 @@ std::ostream& operator<<(std::ostream&, SyncCout);
 #define sync_cout std::cout << IO_LOCK
 #define sync_endl std::endl << IO_UNLOCK
 
-namespace Utility {
-
-/// Clamp a value between lo and hi. Available in c++17.
-template<class T> constexpr const T& clamp(const T& v, const T& lo, const T& hi) {
-  return v < lo ? lo : v > hi ? hi : v;
-}
-
-}
 
 /// xorshift64star Pseudo-Random Number Generator
 /// This class is based on original code written and dedicated
diff --git a/src/movegen.cpp b/src/movegen.cpp
index d74df4c3..3340f65c 100644
--- a/src/movegen.cpp
+++ b/src/movegen.cpp
@@ -248,7 +248,7 @@ namespace {
             *moveList++ = make_move(ksq, pop_lsb(&b));
 
         if ((Type != CAPTURES) && pos.can_castle(Us & ANY_CASTLING))
-            for(CastlingRights cr : { Us & KING_SIDE, Us & QUEEN_SIDE } )
+            for (CastlingRights cr : { Us & KING_SIDE, Us & QUEEN_SIDE } )
                 if (!pos.castling_impeded(cr) && pos.can_castle(cr))
                     *moveList++ = make<CASTLING>(ksq, pos.castling_rook_square(cr));
     }
diff --git a/src/movepick.cpp b/src/movepick.cpp
index 96a44449..153d323e 100644
--- a/src/movepick.cpp
+++ b/src/movepick.cpp
@@ -182,7 +182,7 @@ top:
           --endMoves;
 
       ++stage;
-      /* fallthrough */
+      [[fallthrough]];
 
   case REFUTATION:
       if (select<Next>([&](){ return    *cur != MOVE_NONE
@@ -190,7 +190,7 @@ top:
                                     &&  pos.pseudo_legal(*cur); }))
           return *(cur - 1);
       ++stage;
-      /* fallthrough */
+      [[fallthrough]];
 
   case QUIET_INIT:
       if (!skipQuiets)
@@ -203,7 +203,7 @@ top:
       }
 
       ++stage;
-      /* fallthrough */
+      [[fallthrough]];
 
   case QUIET:
       if (   !skipQuiets
@@ -217,7 +217,7 @@ top:
       endMoves = endBadCaptures;
 
       ++stage;
-      /* fallthrough */
+      [[fallthrough]];
 
   case BAD_CAPTURE:
       return select<Next>([](){ return true; });
@@ -228,7 +228,7 @@ top:
 
       score<EVASIONS>();
       ++stage;
-      /* fallthrough */
+      [[fallthrough]];
 
   case EVASION:
       return select<Best>([](){ return true; });
@@ -246,14 +246,14 @@ top:
           return MOVE_NONE;
 
       ++stage;
-      /* fallthrough */
+      [[fallthrough]];
 
   case QCHECK_INIT:
       cur = moves;
       endMoves = generate<QUIET_CHECKS>(pos, cur);
 
       ++stage;
-      /* fallthrough */
+      [[fallthrough]];
 
   case QCHECK:
       return select<Next>([](){ return true; });
diff --git a/src/movepick.h b/src/movepick.h
index f080935a..4c0ad551 100644
--- a/src/movepick.h
+++ b/src/movepick.h
@@ -86,9 +86,9 @@ enum StatsType { NoCaptures, Captures };
 /// the move's from and to squares, see www.chessprogramming.org/Butterfly_Boards
 typedef Stats<int16_t, 10692, COLOR_NB, int(SQUARE_NB) * int(SQUARE_NB)> ButterflyHistory;
 
-/// At higher depths LowPlyHistory records successful quiet moves near the root and quiet
-/// moves which are/were in the PV (ttPv)
-/// It is cleared with each new search and filled during iterative deepening
+/// At higher depths LowPlyHistory records successful quiet moves near the root
+/// and quiet moves which are/were in the PV (ttPv). It is cleared with each new
+/// search and filled during iterative deepening.
 constexpr int MAX_LPH = 4;
 typedef Stats<int16_t, 10692, MAX_LPH, int(SQUARE_NB) * int(SQUARE_NB)> LowPlyHistory;
 
diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index a28a4573..a2845c96 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -29,30 +29,29 @@
 
 #include "evaluate_nnue.h"
 
-ExtPieceSquare kpp_board_index[PIECE_NB] = {
- // convention: W - us, B - them
- // viewed from other side, W and B are reversed
-    { PS_NONE,     PS_NONE     },
-    { PS_W_PAWN,   PS_B_PAWN   },
-    { PS_W_KNIGHT, PS_B_KNIGHT },
-    { PS_W_BISHOP, PS_B_BISHOP },
-    { PS_W_ROOK,   PS_B_ROOK   },
-    { PS_W_QUEEN,  PS_B_QUEEN  },
-    { PS_W_KING,   PS_B_KING   },
-    { PS_NONE,     PS_NONE     },
-    { PS_NONE,     PS_NONE     },
-    { PS_B_PAWN,   PS_W_PAWN   },
-    { PS_B_KNIGHT, PS_W_KNIGHT },
-    { PS_B_BISHOP, PS_W_BISHOP },
-    { PS_B_ROOK,   PS_W_ROOK   },
-    { PS_B_QUEEN,  PS_W_QUEEN  },
-    { PS_B_KING,   PS_W_KING   },
-    { PS_NONE,     PS_NONE     }
-};
-
-
 namespace Eval::NNUE {
 
+  uint32_t kpp_board_index[PIECE_NB][COLOR_NB] = {
+   // convention: W - us, B - them
+   // viewed from other side, W and B are reversed
+      { PS_NONE,     PS_NONE     },
+      { PS_W_PAWN,   PS_B_PAWN   },
+      { PS_W_KNIGHT, PS_B_KNIGHT },
+      { PS_W_BISHOP, PS_B_BISHOP },
+      { PS_W_ROOK,   PS_B_ROOK   },
+      { PS_W_QUEEN,  PS_B_QUEEN  },
+      { PS_W_KING,   PS_B_KING   },
+      { PS_NONE,     PS_NONE     },
+      { PS_NONE,     PS_NONE     },
+      { PS_B_PAWN,   PS_W_PAWN   },
+      { PS_B_KNIGHT, PS_W_KNIGHT },
+      { PS_B_BISHOP, PS_W_BISHOP },
+      { PS_B_ROOK,   PS_W_ROOK   },
+      { PS_B_QUEEN,  PS_W_QUEEN  },
+      { PS_B_KING,   PS_W_KING   },
+      { PS_NONE,     PS_NONE     }
+  };
+
   // Input feature converter
   AlignedPtr<FeatureTransformer> feature_transformer;
 
@@ -86,7 +85,7 @@ namespace Eval::NNUE {
   bool ReadParameters(std::istream& stream, const AlignedPtr<T>& pointer) {
 
     std::uint32_t header;
-    stream.read(reinterpret_cast<char*>(&header), sizeof(header));
+    header = read_little_endian<std::uint32_t>(stream);
     if (!stream || header != T::GetHashValue()) return false;
     return pointer->ReadParameters(stream);
   }
@@ -109,13 +108,13 @@ namespace Eval::NNUE {
   }
 
   // Read network header
-  bool ReadHeader(std::istream& stream,
-    std::uint32_t* hash_value, std::string* architecture) {
-
+  bool ReadHeader(std::istream& stream, std::uint32_t* hash_value, std::string* architecture)
+  {
     std::uint32_t version, size;
-    stream.read(reinterpret_cast<char*>(&version), sizeof(version));
-    stream.read(reinterpret_cast<char*>(hash_value), sizeof(*hash_value));
-    stream.read(reinterpret_cast<char*>(&size), sizeof(size));
+
+    version     = read_little_endian<std::uint32_t>(stream);
+    *hash_value = read_little_endian<std::uint32_t>(stream);
+    size        = read_little_endian<std::uint32_t>(stream);
     if (!stream || version != kVersion) return false;
     architecture->resize(size);
     stream.read(&(*architecture)[0], size);
@@ -202,10 +201,7 @@ namespace Eval::NNUE {
 
   // Evaluation function. Perform differential calculation.
   Value evaluate(const Position& pos) {
-    Value v = ComputeScore(pos, false);
-    v = Utility::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1);
-
-    return v;
+    return ComputeScore(pos, false);
   }
 
   // Evaluation function. Perform full calculation.
diff --git a/src/nnue/features/feature_set.h b/src/nnue/features/feature_set.h
index ec34a486..b933d2d9 100644
--- a/src/nnue/features/feature_set.h
+++ b/src/nnue/features/feature_set.h
@@ -106,8 +106,7 @@ namespace Eval::NNUE::Features {
         reset[perspective] = false;
         switch (trigger) {
           case TriggerEvent::kFriendKingMoved:
-            reset[perspective] =
-                dp.pieceId[0] == PIECE_ID_KING + perspective;
+            reset[perspective] = dp.piece[0] == make_piece(perspective, KING);
             break;
           default:
             assert(false);
diff --git a/src/nnue/features/half_kp.cpp b/src/nnue/features/half_kp.cpp
index 628add6e..88e384a3 100644
--- a/src/nnue/features/half_kp.cpp
+++ b/src/nnue/features/half_kp.cpp
@@ -23,25 +23,17 @@
 
 namespace Eval::NNUE::Features {
 
-  // Find the index of the feature quantity from the king position and PieceSquare
-  template <Side AssociatedKing>
-  inline IndexType HalfKP<AssociatedKing>::MakeIndex(Square sq_k, PieceSquare p) {
-    return static_cast<IndexType>(PS_END) * static_cast<IndexType>(sq_k) + p;
+  // Orient a square according to perspective (rotates by 180 degrees for black)
+  inline Square orient(Color perspective, Square s) {
+    return Square(int(s) ^ (bool(perspective) * 63));
   }
 
-  // Get pieces information
+  // Find the index of the feature quantity from the king position, piece and square
   template <Side AssociatedKing>
-  inline void HalfKP<AssociatedKing>::GetPieces(
-      const Position& pos, Color perspective,
-      PieceSquare** pieces, Square* sq_target_k) {
+  inline IndexType HalfKP<AssociatedKing>::MakeIndex(
+      Color perspective, Square s, Piece pc, Square ksq) {
 
-    *pieces = (perspective == BLACK) ?
-        pos.eval_list()->piece_list_fb() :
-        pos.eval_list()->piece_list_fw();
-    const PieceId target = (AssociatedKing == Side::kFriend) ?
-        static_cast<PieceId>(PIECE_ID_KING + perspective) :
-        static_cast<PieceId>(PIECE_ID_KING + ~perspective);
-    *sq_target_k = static_cast<Square>(((*pieces)[target] - PS_W_KING) % SQUARE_NB);
+    return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective] + PS_END * ksq);
   }
 
   // Get a list of indices for active features
@@ -49,16 +41,11 @@ namespace Eval::NNUE::Features {
   void HalfKP<AssociatedKing>::AppendActiveIndices(
       const Position& pos, Color perspective, IndexList* active) {
 
-    // Do nothing if array size is small to avoid compiler warning
-    if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
-
-    PieceSquare* pieces;
-    Square sq_target_k;
-    GetPieces(pos, perspective, &pieces, &sq_target_k);
-    for (PieceId i = PIECE_ID_ZERO; i < PIECE_ID_KING; ++i) {
-      if (pieces[i] != PS_NONE) {
-        active->push_back(MakeIndex(sq_target_k, pieces[i]));
-      }
+    Square ksq = orient(perspective, pos.square<KING>(perspective));
+    Bitboard bb = pos.pieces() & ~pos.pieces(KING);
+    while (bb) {
+      Square s = pop_lsb(&bb);
+      active->push_back(MakeIndex(perspective, s, pos.piece_on(s), ksq));
     }
   }
 
@@ -68,22 +55,15 @@ namespace Eval::NNUE::Features {
       const Position& pos, Color perspective,
       IndexList* removed, IndexList* added) {
 
-    PieceSquare* pieces;
-    Square sq_target_k;
-    GetPieces(pos, perspective, &pieces, &sq_target_k);
+    Square ksq = orient(perspective, pos.square<KING>(perspective));
     const auto& dp = pos.state()->dirtyPiece;
     for (int i = 0; i < dp.dirty_num; ++i) {
-      if (dp.pieceId[i] >= PIECE_ID_KING) continue;
-      const auto old_p = static_cast<PieceSquare>(
-          dp.old_piece[i].from[perspective]);
-      if (old_p != PS_NONE) {
-        removed->push_back(MakeIndex(sq_target_k, old_p));
-      }
-      const auto new_p = static_cast<PieceSquare>(
-          dp.new_piece[i].from[perspective]);
-      if (new_p != PS_NONE) {
-        added->push_back(MakeIndex(sq_target_k, new_p));
-      }
+      Piece pc = dp.piece[i];
+      if (type_of(pc) == KING) continue;
+      if (dp.from[i] != SQ_NONE)
+        removed->push_back(MakeIndex(perspective, dp.from[i], pc, ksq));
+      if (dp.to[i] != SQ_NONE)
+        added->push_back(MakeIndex(perspective, dp.to[i], pc, ksq));
     }
   }
 
diff --git a/src/nnue/features/half_kp.h b/src/nnue/features/half_kp.h
index 99842eea..ee6a8df3 100644
--- a/src/nnue/features/half_kp.h
+++ b/src/nnue/features/half_kp.h
@@ -41,7 +41,7 @@ namespace Eval::NNUE::Features {
     static constexpr IndexType kDimensions =
         static_cast<IndexType>(SQUARE_NB) * static_cast<IndexType>(PS_END);
     // Maximum number of simultaneously active features
-    static constexpr IndexType kMaxActiveDimensions = PIECE_ID_KING;
+    static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
     // Trigger for full calculation instead of difference calculation
     static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kFriendKingMoved;
 
@@ -53,13 +53,9 @@ namespace Eval::NNUE::Features {
     static void AppendChangedIndices(const Position& pos, Color perspective,
                                      IndexList* removed, IndexList* added);
 
-    // Index of a feature for a given king position and another piece on some square
-    static IndexType MakeIndex(Square sq_k, PieceSquare p);
-
    private:
-    // Get pieces information
-    static void GetPieces(const Position& pos, Color perspective,
-                          PieceSquare** pieces, Square* sq_target_k);
+    // Index of a feature for a given king position and another piece on some square
+    static IndexType MakeIndex(Color perspective, Square s, Piece pc, Square sq_k);
   };
 
 }  // namespace Eval::NNUE::Features
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index 7336be52..f24578a8 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -70,11 +70,10 @@ namespace Eval::NNUE::Layers {
    // Read network parameters
     bool ReadParameters(std::istream& stream) {
       if (!previous_layer_.ReadParameters(stream)) return false;
-      stream.read(reinterpret_cast<char*>(biases_),
-                  kOutputDimensions * sizeof(BiasType));
-      stream.read(reinterpret_cast<char*>(weights_),
-                  kOutputDimensions * kPaddedInputDimensions *
-                  sizeof(WeightType));
+      for (std::size_t i = 0; i < kOutputDimensions; ++i)
+        biases_[i] = read_little_endian<BiasType>(stream);
+      for (std::size_t i = 0; i < kOutputDimensions * kPaddedInputDimensions; ++i)
+        weights_[i] = read_little_endian<WeightType>(stream);
       return !stream.fail();
     }
 
@@ -98,19 +97,32 @@ namespace Eval::NNUE::Layers {
 
   #if defined(USE_AVX512)
       constexpr IndexType kNumChunks = kPaddedInputDimensions / (kSimdWidth * 2);
-      const __m512i kOnes = _mm512_set1_epi16(1);
       const auto input_vector = reinterpret_cast<const __m512i*>(input);
+  #if !defined(USE_VNNI)
+      const __m512i kOnes = _mm512_set1_epi16(1);
+  #endif
 
   #elif defined(USE_AVX2)
       constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
-      const __m256i kOnes = _mm256_set1_epi16(1);
       const auto input_vector = reinterpret_cast<const __m256i*>(input);
+  #if !defined(USE_VNNI)
+      const __m256i kOnes = _mm256_set1_epi16(1);
+  #endif
 
-  #elif defined(USE_SSSE3)
+  #elif defined(USE_SSE2)
       constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+  #ifndef USE_SSSE3
+      const __m128i kZeros = _mm_setzero_si128();
+  #else
       const __m128i kOnes = _mm_set1_epi16(1);
+  #endif
       const auto input_vector = reinterpret_cast<const __m128i*>(input);
 
+  #elif defined(USE_MMX)
+      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+      const __m64 kZeros = _mm_setzero_si64();
+      const auto input_vector = reinterpret_cast<const __m64*>(input);
+
   #elif defined(USE_NEON)
       constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
       const auto input_vector = reinterpret_cast<const int8x8_t*>(input);
@@ -123,60 +135,115 @@ namespace Eval::NNUE::Layers {
         __m512i sum = _mm512_setzero_si512();
         const auto row = reinterpret_cast<const __m512i*>(&weights_[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
-            __m512i product = _mm512_maddubs_epi16(
-              _mm512_load_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
+  #if defined(USE_VNNI)
+            sum = _mm512_dpbusd_epi32(sum, _mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
+  #else
+            __m512i product = _mm512_maddubs_epi16(_mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
             product = _mm512_madd_epi16(product, kOnes);
             sum = _mm512_add_epi32(sum, product);
+  #endif
         }
-        output[i] = _mm512_reduce_add_epi32(sum) + biases_[i];
 
         // Note: Changing kMaxSimdWidth from 32 to 64 breaks loading existing networks.
         // As a result kPaddedInputDimensions may not be an even multiple of 64(512bit)
         // and we have to do one more 256bit chunk.
         if (kPaddedInputDimensions != kNumChunks * kSimdWidth * 2)
         {
-            const auto iv_256  = reinterpret_cast<const __m256i*>(input);
-            const auto row_256 = reinterpret_cast<const __m256i*>(&weights_[offset]);
-            int j = kNumChunks * 2;
-
-            __m256i sum256 = _mm256_maddubs_epi16(
-              _mm256_load_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
-            sum256 = _mm256_madd_epi16(sum256, _mm256_set1_epi16(1));
-            sum256 = _mm256_hadd_epi32(sum256, sum256);
-            sum256 = _mm256_hadd_epi32(sum256, sum256);
-            const __m128i lo = _mm256_extracti128_si256(sum256, 0);
-            const __m128i hi = _mm256_extracti128_si256(sum256, 1);
-            output[i] += _mm_cvtsi128_si32(lo) + _mm_cvtsi128_si32(hi);
+            const auto iv256  = reinterpret_cast<const __m256i*>(&input_vector[kNumChunks]);
+            const auto row256 = reinterpret_cast<const __m256i*>(&row[kNumChunks]);
+  #if defined(USE_VNNI)
+            __m256i product256 = _mm256_dpbusd_epi32(
+                _mm512_castsi512_si256(sum), _mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
+            sum = _mm512_inserti32x8(sum, product256, 0);
+  #else
+            __m256i product256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
+            sum = _mm512_add_epi32(sum, _mm512_cvtepi16_epi32(product256));
+  #endif
         }
+        output[i] = _mm512_reduce_add_epi32(sum) + biases_[i];
 
   #elif defined(USE_AVX2)
         __m256i sum = _mm256_setzero_si256();
         const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m256i product = _mm256_maddubs_epi16(
-            _mm256_load_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
+  #if defined(USE_VNNI)
+          sum = _mm256_dpbusd_epi32(sum, _mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
+  #else
+          __m256i product = _mm256_maddubs_epi16(_mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
           product = _mm256_madd_epi16(product, kOnes);
           sum = _mm256_add_epi32(sum, product);
+  #endif
         }
-        sum = _mm256_hadd_epi32(sum, sum);
-        sum = _mm256_hadd_epi32(sum, sum);
-        const __m128i lo = _mm256_extracti128_si256(sum, 0);
-        const __m128i hi = _mm256_extracti128_si256(sum, 1);
-        output[i] = _mm_cvtsi128_si32(lo) + _mm_cvtsi128_si32(hi) + biases_[i];
+        __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
+        sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));
+        sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB));
+        output[i] = _mm_cvtsi128_si32(sum128) + biases_[i];
 
   #elif defined(USE_SSSE3)
-        __m128i sum = _mm_cvtsi32_si128(biases_[i]);
+        __m128i sum = _mm_setzero_si128();
         const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m128i product = _mm_maddubs_epi16(
-              _mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j]));
+        for (int j = 0; j < (int)kNumChunks - 1; j += 2) {
+          __m128i product0 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j]));
+          product0 = _mm_madd_epi16(product0, kOnes);
+          sum = _mm_add_epi32(sum, product0);
+          __m128i product1 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j+1]), _mm_load_si128(&row[j+1]));
+          product1 = _mm_madd_epi16(product1, kOnes);
+          sum = _mm_add_epi32(sum, product1);
+        }
+        if (kNumChunks & 0x1) {
+          __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[kNumChunks-1]), _mm_load_si128(&row[kNumChunks-1]));
           product = _mm_madd_epi16(product, kOnes);
           sum = _mm_add_epi32(sum, product);
         }
-        sum = _mm_hadd_epi32(sum, sum);
-        sum = _mm_hadd_epi32(sum, sum);
+        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC
+        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
+        output[i] = _mm_cvtsi128_si32(sum) + biases_[i];
+
+  #elif defined(USE_SSE2)
+        __m128i sum_lo = _mm_cvtsi32_si128(biases_[i]);
+        __m128i sum_hi = kZeros;
+        const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m128i row_j = _mm_load_si128(&row[j]);
+          __m128i input_j = _mm_load_si128(&input_vector[j]);
+          __m128i row_signs = _mm_cmpgt_epi8(kZeros, row_j);
+          __m128i extended_row_lo = _mm_unpacklo_epi8(row_j, row_signs);
+          __m128i extended_row_hi = _mm_unpackhi_epi8(row_j, row_signs);
+          __m128i extended_input_lo = _mm_unpacklo_epi8(input_j, kZeros);
+          __m128i extended_input_hi = _mm_unpackhi_epi8(input_j, kZeros);
+          __m128i product_lo = _mm_madd_epi16(extended_row_lo, extended_input_lo);
+          __m128i product_hi = _mm_madd_epi16(extended_row_hi, extended_input_hi);
+          sum_lo = _mm_add_epi32(sum_lo, product_lo);
+          sum_hi = _mm_add_epi32(sum_hi, product_hi);
+        }
+        __m128i sum = _mm_add_epi32(sum_lo, sum_hi);
+        __m128i sum_high_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2));
+        sum = _mm_add_epi32(sum, sum_high_64);
+        __m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2));
+        sum = _mm_add_epi32(sum, sum_second_32);
         output[i] = _mm_cvtsi128_si32(sum);
 
+  #elif defined(USE_MMX)
+        __m64 sum_lo = _mm_cvtsi32_si64(biases_[i]);
+        __m64 sum_hi = kZeros;
+        const auto row = reinterpret_cast<const __m64*>(&weights_[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m64 row_j = row[j];
+          __m64 input_j = input_vector[j];
+          __m64 row_signs = _mm_cmpgt_pi8(kZeros, row_j);
+          __m64 extended_row_lo = _mm_unpacklo_pi8(row_j, row_signs);
+          __m64 extended_row_hi = _mm_unpackhi_pi8(row_j, row_signs);
+          __m64 extended_input_lo = _mm_unpacklo_pi8(input_j, kZeros);
+          __m64 extended_input_hi = _mm_unpackhi_pi8(input_j, kZeros);
+          __m64 product_lo = _mm_madd_pi16(extended_row_lo, extended_input_lo);
+          __m64 product_hi = _mm_madd_pi16(extended_row_hi, extended_input_hi);
+          sum_lo = _mm_add_pi32(sum_lo, product_lo);
+          sum_hi = _mm_add_pi32(sum_hi, product_hi);
+        }
+        __m64 sum = _mm_add_pi32(sum_lo, sum_hi);
+        sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum));
+        output[i] = _mm_cvtsi64_si32(sum);
+
   #elif defined(USE_NEON)
         int32x4_t sum = {biases_[i]};
         const auto row = reinterpret_cast<const int8x8_t*>(&weights_[offset]);
@@ -196,6 +263,9 @@ namespace Eval::NNUE::Layers {
   #endif
 
       }
+  #if defined(USE_MMX)
+      _mm_empty();
+  #endif
       return output;
     }
 
diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h
index 9b5a5f5f..d923986e 100644
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -86,18 +86,17 @@ namespace Eval::NNUE::Layers {
       const auto out = reinterpret_cast<__m256i*>(output);
       for (IndexType i = 0; i < kNumChunks; ++i) {
         const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
-          _mm256_load_si256(&in[i * 4 + 0]),
-          _mm256_load_si256(&in[i * 4 + 1])), kWeightScaleBits);
+            _mm256_loadA_si256(&in[i * 4 + 0]),
+            _mm256_loadA_si256(&in[i * 4 + 1])), kWeightScaleBits);
         const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
-          _mm256_load_si256(&in[i * 4 + 2]),
-          _mm256_load_si256(&in[i * 4 + 3])), kWeightScaleBits);
-        _mm256_store_si256(
-            &out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
+            _mm256_loadA_si256(&in[i * 4 + 2]),
+            _mm256_loadA_si256(&in[i * 4 + 3])), kWeightScaleBits);
+        _mm256_storeA_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
             _mm256_packs_epi16(words0, words1), kZero), kOffsets));
       }
       constexpr IndexType kStart = kNumChunks * kSimdWidth;
 
-  #elif defined(USE_SSSE3)
+  #elif defined(USE_SSE2)
       constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
 
   #ifdef USE_SSE41
@@ -128,6 +127,24 @@ namespace Eval::NNUE::Layers {
       }
       constexpr IndexType kStart = kNumChunks * kSimdWidth;
 
+  #elif defined(USE_MMX)
+      constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
+      const __m64 k0x80s = _mm_set1_pi8(-128);
+      const auto in = reinterpret_cast<const __m64*>(input);
+      const auto out = reinterpret_cast<__m64*>(output);
+      for (IndexType i = 0; i < kNumChunks; ++i) {
+        const __m64 words0 = _mm_srai_pi16(
+            _mm_packs_pi32(in[i * 4 + 0], in[i * 4 + 1]),
+            kWeightScaleBits);
+        const __m64 words1 = _mm_srai_pi16(
+            _mm_packs_pi32(in[i * 4 + 2], in[i * 4 + 3]),
+            kWeightScaleBits);
+        const __m64 packedbytes = _mm_packs_pi16(words0, words1);
+        out[i] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
+      }
+      _mm_empty();
+      constexpr IndexType kStart = kNumChunks * kSimdWidth;
+
   #elif defined(USE_NEON)
       constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2);
       const int8x8_t kZero = {0};
diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h
index 2a354a3c..69dfaad2 100644
--- a/src/nnue/nnue_accumulator.h
+++ b/src/nnue/nnue_accumulator.h
@@ -26,7 +26,7 @@
 namespace Eval::NNUE {
 
   // Class that holds the result of affine transformation of input features
-  struct alignas(32) Accumulator {
+  struct alignas(kCacheLineSize) Accumulator {
     std::int16_t
         accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];
     Value score;
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index 36fda7d7..d7ffa21a 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -21,6 +21,9 @@
 #ifndef NNUE_COMMON_H_INCLUDED
 #define NNUE_COMMON_H_INCLUDED
 
+#include <cstring>
+#include <iostream>
+
 #if defined(USE_AVX2)
 #include <immintrin.h>
 
@@ -33,10 +36,36 @@
 #elif defined(USE_SSE2)
 #include <emmintrin.h>
 
+#elif defined(USE_MMX)
+#include <mmintrin.h>
+
 #elif defined(USE_NEON)
 #include <arm_neon.h>
 #endif
 
+// HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256() (and likewise for the
+//       store and 512-bit variants). Otherwise a binary compiled with older g++ (< 9) on
+//       Windows crashes because the output memory is not aligned even though alignas is specified.
+#if defined(USE_AVX2)
+#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32) && !defined(__clang__)
+#define _mm256_loadA_si256  _mm256_loadu_si256
+#define _mm256_storeA_si256 _mm256_storeu_si256
+#else
+#define _mm256_loadA_si256  _mm256_load_si256
+#define _mm256_storeA_si256 _mm256_store_si256
+#endif
+#endif
+
+#if defined(USE_AVX512)
+#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32) && !defined(__clang__)
+#define _mm512_loadA_si512   _mm512_loadu_si512
+#define _mm512_storeA_si512  _mm512_storeu_si512
+#else
+#define _mm512_loadA_si512   _mm512_load_si512
+#define _mm512_storeA_si512  _mm512_store_si512
+#endif
+#endif
+
 namespace Eval::NNUE {
 
   // Version of the evaluation file
@@ -56,12 +85,36 @@ namespace Eval::NNUE {
   #elif defined(USE_SSE2)
   constexpr std::size_t kSimdWidth = 16;
 
+  #elif defined(USE_MMX)
+  constexpr std::size_t kSimdWidth = 8;
+
   #elif defined(USE_NEON)
   constexpr std::size_t kSimdWidth = 16;
   #endif
 
   constexpr std::size_t kMaxSimdWidth = 32;
 
+  // unique number for each piece type on each square
+  enum {
+    PS_NONE     =  0,
+    PS_W_PAWN   =  1,
+    PS_B_PAWN   =  1 * SQUARE_NB + 1,
+    PS_W_KNIGHT =  2 * SQUARE_NB + 1,
+    PS_B_KNIGHT =  3 * SQUARE_NB + 1,
+    PS_W_BISHOP =  4 * SQUARE_NB + 1,
+    PS_B_BISHOP =  5 * SQUARE_NB + 1,
+    PS_W_ROOK   =  6 * SQUARE_NB + 1,
+    PS_B_ROOK   =  7 * SQUARE_NB + 1,
+    PS_W_QUEEN  =  8 * SQUARE_NB + 1,
+    PS_B_QUEEN  =  9 * SQUARE_NB + 1,
+    PS_W_KING   = 10 * SQUARE_NB + 1,
+    PS_END      = PS_W_KING, // pieces without kings (pawns included)
+    PS_B_KING   = 11 * SQUARE_NB + 1,
+    PS_END2     = 12 * SQUARE_NB + 1
+  };
+
+  extern uint32_t kpp_board_index[PIECE_NB][COLOR_NB];
+
   // Type of input feature after conversion
   using TransformedFeatureType = std::uint8_t;
   using IndexType = std::uint32_t;
@@ -73,7 +126,25 @@ namespace Eval::NNUE {
   // Round n up to be a multiple of base
   template <typename IntType>
   constexpr IntType CeilToMultiple(IntType n, IntType base) {
-    return (n + base - 1) / base * base;
+      return (n + base - 1) / base * base;
+  }
+
+  // read_little_endian() is our utility to read an integer (signed or unsigned, any size)
+  // from a stream in little-endian order. The value is assembled byte by byte with shifts,
+  // so the result is correct whatever the byte ordering of the machine running the code.
+  template <typename IntType>
+  inline IntType read_little_endian(std::istream& stream) {
+      // Zero-initialized: after a short/failed read we fold zeros, not indeterminate bytes.
+      std::uint8_t u[sizeof(IntType)] = {};
+      typename std::make_unsigned<IntType>::type v = 0;
+      IntType result;
+
+      stream.read(reinterpret_cast<char*>(u), sizeof(IntType));
+      for (std::size_t i = 0; i < sizeof(IntType); ++i)
+          v = (v << 8) | u[sizeof(IntType) - i - 1];  // most significant byte first
+
+      std::memcpy(&result, &v, sizeof(IntType));  // reinterpret the bits as IntType
+      return result;
   }
 
 }  // namespace Eval::NNUE
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 29e6db6e..e1bc2ab8 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -62,10 +62,10 @@ namespace Eval::NNUE {
 
     // Read network parameters
     bool ReadParameters(std::istream& stream) {
-      stream.read(reinterpret_cast<char*>(biases_),
-                  kHalfDimensions * sizeof(BiasType));
-      stream.read(reinterpret_cast<char*>(weights_),
-                  kHalfDimensions * kInputDimensions * sizeof(WeightType));
+      for (std::size_t i = 0; i < kHalfDimensions; ++i)
+        biases_[i] = read_little_endian<BiasType>(stream);
+      for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i)
+        weights_[i] = read_little_endian<WeightType>(stream);
       return !stream.fail();
     }
 
@@ -104,7 +104,7 @@ namespace Eval::NNUE {
       constexpr int kControl = 0b11011000;
       const __m256i kZero = _mm256_setzero_si256();
 
-  #elif defined(USE_SSSE3)
+  #elif defined(USE_SSE2)
       constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
 
   #ifdef USE_SSE41
@@ -113,6 +113,10 @@ namespace Eval::NNUE {
       const __m128i k0x80s = _mm_set1_epi8(-128);
   #endif
 
+  #elif defined(USE_MMX)
+      constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+      const __m64 k0x80s = _mm_set1_pi8(-128);
+
   #elif defined(USE_NEON)
       constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
       const int8x8_t kZero = {0};
@@ -125,17 +129,15 @@ namespace Eval::NNUE {
   #if defined(USE_AVX2)
         auto out = reinterpret_cast<__m256i*>(&output[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m256i sum0 =
-            _mm256_load_si256(&reinterpret_cast<const __m256i*>(
-              accumulation[perspectives[p]][0])[j * 2 + 0]);
-          __m256i sum1 =
-            _mm256_load_si256(&reinterpret_cast<const __m256i*>(
-              accumulation[perspectives[p]][0])[j * 2 + 1]);
-          _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
+          __m256i sum0 = _mm256_loadA_si256(
+              &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
+          __m256i sum1 = _mm256_loadA_si256(
+            &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
+          _mm256_storeA_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
               _mm256_packs_epi16(sum0, sum1), kZero), kControl));
         }
 
-  #elif defined(USE_SSSE3)
+  #elif defined(USE_SSE2)
         auto out = reinterpret_cast<__m128i*>(&output[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
           __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
@@ -155,6 +157,17 @@ namespace Eval::NNUE {
           );
         }
 
+  #elif defined(USE_MMX)
+        auto out = reinterpret_cast<__m64*>(&output[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m64 sum0 = *(&reinterpret_cast<const __m64*>(
+              accumulation[perspectives[p]][0])[j * 2 + 0]);
+          __m64 sum1 = *(&reinterpret_cast<const __m64*>(
+              accumulation[perspectives[p]][0])[j * 2 + 1]);
+          const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
+          out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
+        }
+
   #elif defined(USE_NEON)
         const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
@@ -172,6 +185,9 @@ namespace Eval::NNUE {
   #endif
 
       }
+  #if defined(USE_MMX)
+      _mm_empty();
+  #endif
     }
 
    private:
@@ -187,23 +203,37 @@ namespace Eval::NNUE {
                    kHalfDimensions * sizeof(BiasType));
         for (const auto index : active_indices[perspective]) {
           const IndexType offset = kHalfDimensions * index;
+  #if defined(USE_AVX512)
+          auto accumulation = reinterpret_cast<__m512i*>(
+              &accumulator.accumulation[perspective][i][0]);
+          auto column = reinterpret_cast<const __m512i*>(&weights_[offset]);
+          constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+          for (IndexType j = 0; j < kNumChunks; ++j)
+            _mm512_storeA_si512(&accumulation[j], _mm512_add_epi16(_mm512_loadA_si512(&accumulation[j]), column[j]));
 
-  #if defined(USE_AVX2)
+  #elif defined(USE_AVX2)
           auto accumulation = reinterpret_cast<__m256i*>(
               &accumulator.accumulation[perspective][i][0]);
           auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
           constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j) {
-            accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]);
-          }
+          for (IndexType j = 0; j < kNumChunks; ++j)
+            _mm256_storeA_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadA_si256(&accumulation[j]), column[j]));
 
   #elif defined(USE_SSE2)
           auto accumulation = reinterpret_cast<__m128i*>(
               &accumulator.accumulation[perspective][i][0]);
           auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
           constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j) {
+          for (IndexType j = 0; j < kNumChunks; ++j)
             accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
+
+  #elif defined(USE_MMX)
+          auto accumulation = reinterpret_cast<__m64*>(
+              &accumulator.accumulation[perspective][i][0]);
+          auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
+          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+          for (IndexType j = 0; j < kNumChunks; ++j) {
+            accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
           }
 
   #elif defined(USE_NEON)
@@ -211,18 +241,19 @@ namespace Eval::NNUE {
               &accumulator.accumulation[perspective][i][0]);
           auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
           constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j) {
+          for (IndexType j = 0; j < kNumChunks; ++j)
             accumulation[j] = vaddq_s16(accumulation[j], column[j]);
-          }
 
   #else
-          for (IndexType j = 0; j < kHalfDimensions; ++j) {
+          for (IndexType j = 0; j < kHalfDimensions; ++j)
             accumulator.accumulation[perspective][i][j] += weights_[offset + j];
-          }
   #endif
 
         }
       }
+  #if defined(USE_MMX)
+      _mm_empty();
+  #endif
 
       accumulator.computed_accumulation = true;
       accumulator.computed_score = false;
@@ -249,6 +280,11 @@ namespace Eval::NNUE {
         auto accumulation = reinterpret_cast<__m128i*>(
             &accumulator.accumulation[perspective][i][0]);
 
+  #elif defined(USE_MMX)
+        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+        auto accumulation = reinterpret_cast<__m64*>(
+            &accumulator.accumulation[perspective][i][0]);
+
   #elif defined(USE_NEON)
         constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
         auto accumulation = reinterpret_cast<int16x8_t*>(
@@ -278,6 +314,12 @@ namespace Eval::NNUE {
               accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]);
             }
 
+  #elif defined(USE_MMX)
+            auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {
+              accumulation[j] = _mm_sub_pi16(accumulation[j], column[j]);
+            }
+
   #elif defined(USE_NEON)
             auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
             for (IndexType j = 0; j < kNumChunks; ++j) {
@@ -309,6 +351,12 @@ namespace Eval::NNUE {
               accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
             }
 
+  #elif defined(USE_MMX)
+            auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {
+              accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
+            }
+
   #elif defined(USE_NEON)
             auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
             for (IndexType j = 0; j < kNumChunks; ++j) {
@@ -325,6 +373,9 @@ namespace Eval::NNUE {
           }
         }
       }
+  #if defined(USE_MMX)
+      _mm_empty();
+  #endif
 
       accumulator.computed_accumulation = true;
       accumulator.computed_score = false;
diff --git a/src/pawns.cpp b/src/pawns.cpp
index 868d0c8e..af0f6618 100644
--- a/src/pawns.cpp
+++ b/src/pawns.cpp
@@ -219,7 +219,7 @@ Score Entry::evaluate_shelter(const Position& pos, Square ksq) const {
 
   Score bonus = make_score(5, 5);
 
-  File center = Utility::clamp(file_of(ksq), FILE_B, FILE_G);
+  File center = std::clamp(file_of(ksq), FILE_B, FILE_G);
   for (File f = File(center - 1); f <= File(center + 1); ++f)
   {
       b = ourPawns & file_bb(f);
diff --git a/src/position.cpp b/src/position.cpp
index 46e5d78b..fe89b753 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -198,9 +198,6 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th
   std::fill_n(&pieceList[0][0], sizeof(pieceList) / sizeof(Square), SQ_NONE);
   st = si;
 
-  // Each piece on board gets a unique ID used to track the piece later
-  PieceId piece_id, next_piece_id = PIECE_ID_ZERO;
-
   ss >> std::noskipws;
 
   // 1. Piece placement
@@ -212,21 +209,8 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th
       else if (token == '/')
           sq += 2 * SOUTH;
 
-      else if ((idx = PieceToChar.find(token)) != string::npos)
-      {
-          auto pc = Piece(idx);
-          put_piece(pc, sq);
-
-          if (Eval::useNNUE)
-          {
-              // Kings get a fixed ID, other pieces get ID in order of placement
-              piece_id =
-                (idx == W_KING) ? PIECE_ID_WKING :
-                (idx == B_KING) ? PIECE_ID_BKING :
-                next_piece_id++;
-              evalList.put_piece(piece_id, sq, pc);
-          }
-
+      else if ((idx = PieceToChar.find(token)) != string::npos) {
+          put_piece(Piece(idx), sq);
           ++sq;
       }
   }
@@ -721,8 +705,6 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
   // Used by NNUE
   st->accumulator.computed_accumulation = false;
   st->accumulator.computed_score = false;
-  PieceId dp0 = PIECE_ID_NONE;
-  PieceId dp1 = PIECE_ID_NONE;
   auto& dp = st->dirtyPiece;
   dp.dirty_num = 1;
 
@@ -775,12 +757,10 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
 
       if (Eval::useNNUE)
       {
-          dp.dirty_num = 2; // 2 pieces moved
-          dp1 = piece_id_on(capsq);
-          dp.pieceId[1] = dp1;
-          dp.old_piece[1] = evalList.piece_with_id(dp1);
-          evalList.put_piece(dp1, capsq, NO_PIECE);
-          dp.new_piece[1] = evalList.piece_with_id(dp1);
+          dp.dirty_num = 2;  // 1 piece moved, 1 piece captured
+          dp.piece[1] = captured;
+          dp.from[1] = capsq;
+          dp.to[1] = SQ_NONE;
       }
 
       // Update board and piece lists
@@ -821,11 +801,9 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
   {
       if (Eval::useNNUE)
       {
-          dp0 = piece_id_on(from);
-          dp.pieceId[0] = dp0;
-          dp.old_piece[0] = evalList.piece_with_id(dp0);
-          evalList.put_piece(dp0, to, pc);
-          dp.new_piece[0] = evalList.piece_with_id(dp0);
+          dp.piece[0] = pc;
+          dp.from[0] = from;
+          dp.to[0] = to;
       }
 
       move_piece(from, to);
@@ -854,9 +832,12 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
 
           if (Eval::useNNUE)
           {
-              dp0 = piece_id_on(to);
-              evalList.put_piece(dp0, to, promotion);
-              dp.new_piece[0] = evalList.piece_with_id(dp0);
+              // Promoting pawn to SQ_NONE, promoted piece from SQ_NONE
+              dp.to[0] = SQ_NONE;
+              dp.piece[dp.dirty_num] = promotion;
+              dp.from[dp.dirty_num] = SQ_NONE;
+              dp.to[dp.dirty_num] = to;
+              dp.dirty_num++;
           }
 
           // Update hash keys
@@ -950,12 +931,6 @@ void Position::undo_move(Move m) {
   {
       move_piece(to, from); // Put the piece back at the source square
 
-      if (Eval::useNNUE)
-      {
-          PieceId dp0 = st->dirtyPiece.pieceId[0];
-          evalList.put_piece(dp0, from, pc);
-      }
-
       if (st->capturedPiece)
       {
           Square capsq = to;
@@ -972,14 +947,6 @@ void Position::undo_move(Move m) {
           }
 
           put_piece(st->capturedPiece, capsq); // Restore the captured piece
-
-          if (Eval::useNNUE)
-          {
-              PieceId dp1 = st->dirtyPiece.pieceId[1];
-              assert(evalList.piece_with_id(dp1).from[WHITE] == PS_NONE);
-              assert(evalList.piece_with_id(dp1).from[BLACK] == PS_NONE);
-              evalList.put_piece(dp1, capsq, st->capturedPiece);
-          }
       }
   }
 
@@ -1001,32 +968,16 @@ void Position::do_castling(Color us, Square from, Square& to, Square& rfrom, Squ
   rto = relative_square(us, kingSide ? SQ_F1 : SQ_D1);
   to = relative_square(us, kingSide ? SQ_G1 : SQ_C1);
 
-  if (Eval::useNNUE)
+  if (Do && Eval::useNNUE)
   {
-      PieceId dp0, dp1;
       auto& dp = st->dirtyPiece;
-      dp.dirty_num = 2; // 2 pieces moved
-
-      if (Do)
-      {
-          dp0 = piece_id_on(from);
-          dp1 = piece_id_on(rfrom);
-          dp.pieceId[0] = dp0;
-          dp.old_piece[0] = evalList.piece_with_id(dp0);
-          evalList.put_piece(dp0, to, make_piece(us, KING));
-          dp.new_piece[0] = evalList.piece_with_id(dp0);
-          dp.pieceId[1] = dp1;
-          dp.old_piece[1] = evalList.piece_with_id(dp1);
-          evalList.put_piece(dp1, rto, make_piece(us, ROOK));
-          dp.new_piece[1] = evalList.piece_with_id(dp1);
-      }
-      else
-      {
-          dp0 = piece_id_on(to);
-          dp1 = piece_id_on(rto);
-          evalList.put_piece(dp0, from, make_piece(us, KING));
-          evalList.put_piece(dp1, rfrom, make_piece(us, ROOK));
-      }
+      dp.piece[0] = make_piece(us, KING);
+      dp.from[0] = from;
+      dp.to[0] = to;
+      dp.piece[1] = make_piece(us, ROOK);
+      dp.from[1] = rfrom;
+      dp.to[1] = rto;
+      dp.dirty_num = 2;
   }
 
   // Remove both pieces first since squares could overlap in Chess960
@@ -1145,8 +1096,8 @@ bool Position::see_ge(Move m, Value threshold) const {
 
       // Don't allow pinned pieces to attack (except the king) as long as
       // there are pinners on their original square.
-      if (st->pinners[~stm] & occupied)
-          stmAttackers &= ~st->blockersForKing[stm];
+      if (pinners(~stm) & occupied)
+          stmAttackers &= ~blockers_for_king(stm);
 
       if (!stmAttackers)
           break;
diff --git a/src/position.h b/src/position.h
index b5dbaf59..e3f758e0 100644
--- a/src/position.h
+++ b/src/position.h
@@ -116,6 +116,7 @@ public:
   Bitboard checkers() const;
   Bitboard blockers_for_king(Color c) const;
   Bitboard check_squares(PieceType pt) const;
+  Bitboard pinners(Color c) const;
   bool is_discovery_check_on_king(Color c, Move m) const;
 
   // Attacks to/from a given square
@@ -173,7 +174,6 @@ public:
 
   // Used by NNUE
   StateInfo* state() const;
-  const EvalList* eval_list() const;
 
 #if defined(EVAL_LEARN)
   // --sfenization helper
@@ -208,9 +208,6 @@ private:
   template<bool Do>
   void do_castling(Color us, Square from, Square& to, Square& rfrom, Square& rto);
 
-  // ID of a piece on a given square
-  PieceId piece_id_on(Square sq) const;
-
   // Data members
   Piece board[SQUARE_NB];
   Bitboard byTypeBB[PIECE_TYPE_NB];
@@ -227,9 +224,6 @@ private:
   Thread* thisThread;
   StateInfo* st;
   bool chess960;
-
-  // List of pieces used in NNUE evaluation function
-  EvalList evalList;
 };
 
 namespace PSQT {
@@ -332,6 +326,10 @@ inline Bitboard Position::blockers_for_king(Color c) const {
   return st->blockersForKing[c];
 }
 
+inline Bitboard Position::pinners(Color c) const {
+  return st->pinners[c];
+}
+
 inline Bitboard Position::check_squares(PieceType pt) const {
   return st->checkSquares[pt];
 }
@@ -469,20 +467,4 @@ inline StateInfo* Position::state() const {
   return st;
 }
 
-inline const EvalList* Position::eval_list() const {
-
-  return &evalList;
-}
-
-inline PieceId Position::piece_id_on(Square sq) const
-{
-
-  assert(piece_on(sq) != NO_PIECE);
-
-  PieceId pid = evalList.piece_id_list[sq];
-  assert(is_ok(pid));
-
-  return pid;
-}
-
 #endif // #ifndef POSITION_H_INCLUDED
diff --git a/src/search.cpp b/src/search.cpp
index b7561a96..2d848bcd 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -63,9 +63,9 @@ namespace {
   constexpr uint64_t TtHitAverageResolution = 1024;
 
   // Razor and futility margins
-  constexpr int RazorMargin = 527;
+  constexpr int RazorMargin = 510;
   Value futility_margin(Depth d, bool improving) {
-    return Value(227 * (d - improving));
+    return Value(223 * (d - improving));
   }
 
   bool training;
@@ -75,7 +75,7 @@ namespace {
 
   Depth reduction(bool i, Depth d, int mn) {
     int r = Reductions[d] * Reductions[mn];
-    return (r + 570) / 1024 + (!i && r > 1018);
+    return (r + 509) / 1024 + (!i && r > 894);
   }
 
   constexpr int futility_move_count(bool improving, Depth depth) {
@@ -84,7 +84,7 @@ namespace {
 
   // History and stats update bonus, based on depth
   int stat_bonus(Depth d) {
-    return d > 15 ? 27 : 17 * d * d + 133 * d - 134;
+    return d > 13 ? 29 : 17 * d * d + 134 * d - 134;
   }
 
   // Add a small random component to draw evaluations to avoid 3fold-blindness
@@ -194,7 +194,7 @@ namespace {
 void Search::init() {
 
   for (int i = 1; i < MAX_MOVES; ++i)
-      Reductions[i] = int((24.8 + std::log(Threads.size())) * std::log(i));
+      Reductions[i] = int((22.0 + std::log(Threads.size())) * std::log(i));
 
   training = Options["Training"];
 }
@@ -339,7 +339,7 @@ void Thread::search() {
   // for match (TC 60+0.6) results spanning a wide range of k values.
   PRNG rng(now());
   double floatLevel = Options["UCI_LimitStrength"] ?
-                      Utility::clamp(std::pow((Options["UCI_Elo"] - 1346.6) / 143.4, 1 / 0.806), 0.0, 20.0) :
+                      std::clamp(std::pow((Options["UCI_Elo"] - 1346.6) / 143.4, 1 / 0.806), 0.0, 20.0) :
                         double(Options["Skill Level"]);
   int intLevel = int(floatLevel) +
                  ((floatLevel - int(floatLevel)) * 1024 > rng.rand<unsigned>() % 1024  ? 1 : 0);
@@ -407,12 +407,12 @@ void Thread::search() {
           if (rootDepth >= 4)
           {
               Value prev = rootMoves[pvIdx].previousScore;
-              delta = Value(19);
+              delta = Value(17);
               alpha = std::max(prev - delta,-VALUE_INFINITE);
               beta  = std::min(prev + delta, VALUE_INFINITE);
 
               // Adjust contempt based on root move's previousScore (dynamic contempt)
-              int dct = ct + (110 - ct / 2) * prev / (abs(prev) + 140);
+              int dct = ct + (105 - ct / 2) * prev / (abs(prev) + 149);
 
               contempt = (us == WHITE ?  make_score(dct, dct / 2)
                                       : -make_score(dct, dct / 2));
@@ -510,13 +510,13 @@ void Thread::search() {
           && !Threads.stop
           && !mainThread->stopOnPonderhit)
       {
-          double fallingEval = (296 + 6 * (mainThread->bestPreviousScore - bestValue)
-                                    + 6 * (mainThread->iterValue[iterIdx] - bestValue)) / 725.0;
-          fallingEval = Utility::clamp(fallingEval, 0.5, 1.5);
+          double fallingEval = (318 + 6 * (mainThread->bestPreviousScore - bestValue)
+                                    + 6 * (mainThread->iterValue[iterIdx] - bestValue)) / 825.0;
+          fallingEval = std::clamp(fallingEval, 0.5, 1.5);
 
           // If the bestMove is stable over several iterations, reduce time accordingly
-          timeReduction = lastBestMoveDepth + 10 < completedDepth ? 1.92 : 0.95;
-          double reduction = (1.47 + mainThread->previousTimeReduction) / (2.22 * timeReduction);
+          timeReduction = lastBestMoveDepth + 9 < completedDepth ? 1.92 : 0.95;
+          double reduction = (1.47 + mainThread->previousTimeReduction) / (2.32 * timeReduction);
 
           // Use part of the gained time from a previous stable move for the current move
           for (Thread* th : Threads)
@@ -541,7 +541,7 @@ void Thread::search() {
           }
           else if (   Threads.increaseDepth
                    && !mainThread->ponder
-                   && Time.elapsed() > totalTime * 0.56)
+                   && Time.elapsed() > totalTime * 0.58)
                    Threads.increaseDepth = false;
           else
                    Threads.increaseDepth = true;
@@ -600,7 +600,7 @@ namespace {
     Key posKey;
     Move ttMove, move, excludedMove, bestMove;
     Depth extension, newDepth;
-    Value bestValue, value, ttValue, eval, maxValue, probcutBeta;
+    Value bestValue, value, ttValue, eval, maxValue, probCutBeta;
     bool ttHit, ttPv, formerPv, givesCheck, improving, didLMR, priorCapture;
     bool captureOrPromotion, doFullDepthSearch, moveCountPruning,
          ttCapture, singularQuietLMR;
@@ -798,11 +798,7 @@ namespace {
     else
     {
         if ((ss-1)->currentMove != MOVE_NULL)
-        {
-            int bonus = -(ss-1)->statScore / 512;
-
-            ss->staticEval = eval = evaluate(pos) + bonus;
-        }
+            ss->staticEval = eval = evaluate(pos);
         else
             ss->staticEval = eval = -(ss-1)->staticEval + 2 * Tempo;
 
@@ -815,8 +811,9 @@ namespace {
         &&  eval <= alpha - RazorMargin)
         return qsearch<NT>(pos, ss, alpha, beta);
 
-    improving =  (ss-2)->staticEval == VALUE_NONE ? (ss->staticEval > (ss-4)->staticEval
-              || (ss-4)->staticEval == VALUE_NONE) : ss->staticEval > (ss-2)->staticEval;
+    improving =  (ss-2)->staticEval == VALUE_NONE
+               ? ss->staticEval > (ss-4)->staticEval || (ss-4)->staticEval == VALUE_NONE
+               : ss->staticEval > (ss-2)->staticEval;
 
     // Step 8. Futility pruning: child node (~50 Elo)
     if (   !PvNode
@@ -828,10 +825,10 @@ namespace {
     // Step 9. Null move search with verification search (~40 Elo)
     if (   !PvNode
         && (ss-1)->currentMove != MOVE_NULL
-        && (ss-1)->statScore < 23824
+        && (ss-1)->statScore < 22977
         &&  eval >= beta
         &&  eval >= ss->staticEval
-        &&  ss->staticEval >= beta - 28 * depth - 28 * improving + 94 * ttPv + 200
+        &&  ss->staticEval >= beta - 30 * depth - 28 * improving + 84 * ttPv + 182
         && !excludedMove
         &&  pos.non_pawn_material(us)
         && (ss->ply >= thisThread->nmpMinPly || us != thisThread->nmpColor))
@@ -839,7 +836,7 @@ namespace {
         assert(eval - beta >= 0);
 
         // Null move dynamic reduction based on depth and value
-        Depth R = (737 + 77 * depth) / 246 + std::min(int(eval - beta) / 192, 3);
+        Depth R = (817 + 71 * depth) / 213 + std::min(int(eval - beta) / 192, 3);
 
         ss->currentMove = MOVE_NULL;
         ss->continuationHistory = &thisThread->continuationHistory[0][0][NO_PIECE][0];
@@ -875,7 +872,7 @@ namespace {
         }
     }
 
-    probcutBeta = beta + 176 - 49 * improving;
+    probCutBeta = beta + 176 - 49 * improving;
 
     // Step 10. ProbCut (~10 Elo)
     // If we have a good enough capture and a reduced search returns a value
@@ -883,21 +880,27 @@ namespace {
     if (   !PvNode
         &&  depth > 4
         &&  abs(beta) < VALUE_TB_WIN_IN_MAX_PLY
+        // If the value from the transposition table is lower than probCutBeta, don't attempt
+        // probCut. Here and in the later transposition table checks the cutoff depth is set
+        // to depth - 3, because the probCut search runs at depth - 4 but a move is made before
+        // it, so the effective depth is depth - 3.
         && !(   ttHit
              && tte->depth() >= depth - 3
              && ttValue != VALUE_NONE
-             && ttValue < probcutBeta))
+             && ttValue < probCutBeta))
     {
+        // If ttMove is a capture or promotion and the transposition table value is at least
+        // probCutBeta, return the probCut cutoff without running the actual probCut search.
         if (   ttHit
             && tte->depth() >= depth - 3
             && ttValue != VALUE_NONE
-            && ttValue >= probcutBeta
+            && ttValue >= probCutBeta
             && ttMove
             && pos.capture_or_promotion(ttMove))
-            return probcutBeta;
+            return probCutBeta;
 
-        assert(probcutBeta < VALUE_INFINITE);
-        MovePicker mp(pos, ttMove, probcutBeta - ss->staticEval, &captureHistory);
+        assert(probCutBeta < VALUE_INFINITE);
+        MovePicker mp(pos, ttMove, probCutBeta - ss->staticEval, &captureHistory);
         int probCutCount = 0;
 
         while (   (move = mp.next_move()) != MOVE_NONE
@@ -919,16 +922,17 @@ namespace {
                 pos.do_move(move, st);
 
                 // Perform a preliminary qsearch to verify that the move holds
-                value = -qsearch<NonPV>(pos, ss+1, -probcutBeta, -probcutBeta+1);
+                value = -qsearch<NonPV>(pos, ss+1, -probCutBeta, -probCutBeta+1);
 
                 // If the qsearch held, perform the regular search
-                if (value >= probcutBeta)
-                    value = -search<NonPV>(pos, ss+1, -probcutBeta, -probcutBeta+1, depth - 4, !cutNode);
+                if (value >= probCutBeta)
+                    value = -search<NonPV>(pos, ss+1, -probCutBeta, -probCutBeta+1, depth - 4, !cutNode);
 
                 pos.undo_move(move);
 
-                if (value >= probcutBeta)
+                if (value >= probCutBeta)
                 {
+                    // If the transposition table has no entry of equal or greater depth, write the probCut data into it
                     if ( !(ttHit
                        && tte->depth() >= depth - 3
                        && ttValue != VALUE_NONE))
@@ -940,16 +944,6 @@ namespace {
             }
     }
 
-    // Step 11. Internal iterative deepening (~1 Elo)
-    if (depth >= 7 && !ttMove)
-    {
-        search<NT>(pos, ss, alpha, beta, depth - 7, cutNode);
-
-        tte = TT.probe(posKey, ttHit);
-        ttValue = ttHit ? value_from_tt(tte->value(), ss->ply, pos.rule50_count()) : VALUE_NONE;
-        ttMove = ttHit ? tte->move() : MOVE_NONE;
-    }
-
 moves_loop: // When in check, search starts from here
 
     const PieceToHistory* contHist[] = { (ss-1)->continuationHistory, (ss-2)->continuationHistory,
@@ -973,7 +967,7 @@ moves_loop: // When in check, search starts from here
     // Mark this node as being searched
     ThreadHolding th(thisThread, posKey, ss->ply);
 
-    // Step 12. Loop through all pseudo-legal moves until no moves remain
+    // Step 11. Loop through all pseudo-legal moves until no moves remain
     // or a beta cutoff occurs.
     while ((move = mp.next_move(moveCountPruning)) != MOVE_NONE)
     {
@@ -1015,7 +1009,7 @@ moves_loop: // When in check, search starts from here
       // Calculate new depth for this move
       newDepth = depth - 1;
 
-      // Step 13. Pruning at shallow depth (~200 Elo)
+      // Step 12. Pruning at shallow depth (~200 Elo)
       if (  !rootNode
           && !(training && PvNode)
           && pos.non_pawn_material(us)
@@ -1037,17 +1031,17 @@ moves_loop: // When in check, search starts from here
                   continue;
 
               // Futility pruning: parent node (~5 Elo)
-              if (   lmrDepth < 8
+              if (   lmrDepth < 7
                   && !ss->inCheck
-                  && ss->staticEval + 284 + 188 * lmrDepth <= alpha
+                  && ss->staticEval + 283 + 170 * lmrDepth <= alpha
                   &&  (*contHist[0])[movedPiece][to_sq(move)]
                     + (*contHist[1])[movedPiece][to_sq(move)]
                     + (*contHist[3])[movedPiece][to_sq(move)]
-                    + (*contHist[5])[movedPiece][to_sq(move)] / 2 < 28388)
+                    + (*contHist[5])[movedPiece][to_sq(move)] / 2 < 27376)
                   continue;
 
               // Prune moves with negative SEE (~20 Elo)
-              if (!pos.see_ge(move, Value(-(29 - std::min(lmrDepth, 17)) * lmrDepth * lmrDepth)))
+              if (!pos.see_ge(move, Value(-(29 - std::min(lmrDepth, 18)) * lmrDepth * lmrDepth)))
                   continue;
           }
           else
@@ -1064,17 +1058,17 @@ moves_loop: // When in check, search starts from here
                   && !(PvNode && abs(bestValue) < 2)
                   && PieceValue[MG][type_of(movedPiece)] >= PieceValue[MG][type_of(pos.piece_on(to_sq(move)))]
                   && !ss->inCheck
-                  && ss->staticEval + 178 + 261 * lmrDepth
+                  && ss->staticEval + 169 + 244 * lmrDepth
                      + PieceValue[MG][type_of(pos.piece_on(to_sq(move)))] <= alpha)
                   continue;
 
               // See based pruning
-              if (!pos.see_ge(move, Value(-202) * depth)) // (~25 Elo)
+              if (!pos.see_ge(move, Value(-221) * depth)) // (~25 Elo)
                   continue;
           }
       }
 
-      // Step 14. Extensions (~75 Elo)
+      // Step 13. Extensions (~75 Elo)
 
       // Singular extension search (~70 Elo). If all moves but one fail low on a
       // search of (alpha-s, beta-s), and just one fails high on (alpha, beta),
@@ -1128,19 +1122,14 @@ moves_loop: // When in check, search starts from here
                && (pos.is_discovery_check_on_king(~us, move) || pos.see_ge(move)))
           extension = 1;
 
-      // Passed pawn extension
-      else if (   move == ss->killers[0]
-               && pos.advanced_pawn_push(move)
-               && pos.pawn_passed(us, to_sq(move)))
-          extension = 1;
-
       // Last captures extension
       else if (   PieceValue[EG][pos.captured_piece()] > PawnValueEg
                && pos.non_pawn_material() <= 2 * RookValueMg)
           extension = 1;
 
       // Castling extension
-      if (type_of(move) == CASTLING)
+      if (   type_of(move) == CASTLING
+          && popcount(pos.pieces(us) & ~pos.pieces(PAWN) & (to_sq(move) & KingSide ? KingSide : QueenSide)) <= 2)
           extension = 1;
 
       // Late irreversible move extension
@@ -1162,10 +1151,10 @@ moves_loop: // When in check, search starts from here
                                                                 [movedPiece]
                                                                 [to_sq(move)];
 
-      // Step 15. Make the move
+      // Step 14. Make the move
       pos.do_move(move, st, givesCheck);
 
-      // Step 16. Reduced depth search (LMR, ~200 Elo). If the move fails high it will be
+      // Step 15. Reduced depth search (LMR, ~200 Elo). If the move fails high it will be
       // re-searched at full depth.
       if (    depth >= 3
           &&  moveCount > 1 + 2 * rootNode + 2 * (PvNode && abs(bestValue) < 2)
@@ -1174,7 +1163,7 @@ moves_loop: // When in check, search starts from here
               || moveCountPruning
               || ss->staticEval + PieceValue[EG][pos.captured_piece()] <= alpha
               || cutNode
-              || thisThread->ttHitAverage < 415 * TtHitAverageResolution * TtHitAverageWindow / 1024))
+              || thisThread->ttHitAverage < 427 * TtHitAverageResolution * TtHitAverageWindow / 1024))
       {
           Depth r = reduction(improving, depth, moveCount);
 
@@ -1186,7 +1175,7 @@ moves_loop: // When in check, search starts from here
               r--;
 
           // Decrease reduction if the ttHit running average is large
-          if (thisThread->ttHitAverage > 473 * TtHitAverageResolution * TtHitAverageWindow / 1024)
+          if (thisThread->ttHitAverage > 509 * TtHitAverageResolution * TtHitAverageWindow / 1024)
               r--;
 
           // Reduction if other threads are searching this position
@@ -1229,17 +1218,17 @@ moves_loop: // When in check, search starts from here
                              + (*contHist[0])[movedPiece][to_sq(move)]
                              + (*contHist[1])[movedPiece][to_sq(move)]
                              + (*contHist[3])[movedPiece][to_sq(move)]
-                             - 4826;
+                             - 5287;
 
               // Decrease/increase reduction by comparing opponent's stat score (~10 Elo)
-              if (ss->statScore >= -100 && (ss-1)->statScore < -112)
+              if (ss->statScore >= -106 && (ss-1)->statScore < -104)
                   r--;
 
-              else if ((ss-1)->statScore >= -125 && ss->statScore < -138)
+              else if ((ss-1)->statScore >= -119 && ss->statScore < -140)
                   r++;
 
               // Decrease/increase reduction for moves with a good/bad history (~30 Elo)
-              r -= ss->statScore / 14615;
+              r -= ss->statScore / 14884;
           }
           else
           {
@@ -1249,11 +1238,11 @@ moves_loop: // When in check, search starts from here
 
             // Unless giving check, this capture is likely bad
             if (   !givesCheck
-                && ss->staticEval + PieceValue[EG][pos.captured_piece()] + 211 * depth <= alpha)
+                && ss->staticEval + PieceValue[EG][pos.captured_piece()] + 213 * depth <= alpha)
                 r++;
           }
 
-          Depth d = Utility::clamp(newDepth - r, 1, newDepth);
+          Depth d = std::clamp(newDepth - r, 1, newDepth);
 
           value = -search<NonPV>(pos, ss+1, -(alpha+1), -alpha, d, true);
 
@@ -1268,7 +1257,7 @@ moves_loop: // When in check, search starts from here
           didLMR = false;
       }
 
-      // Step 17. Full depth search when LMR is skipped or fails high
+      // Step 16. Full depth search when LMR is skipped or fails high
       if (doFullDepthSearch)
       {
           value = -search<NonPV>(pos, ss+1, -(alpha+1), -alpha, newDepth, !cutNode);
@@ -1296,12 +1285,12 @@ moves_loop: // When in check, search starts from here
           value = -search<PV>(pos, ss+1, -beta, -alpha, newDepth, false);
       }
 
-      // Step 18. Undo move
+      // Step 17. Undo move
       pos.undo_move(move);
 
       assert(value > -VALUE_INFINITE && value < VALUE_INFINITE);
 
-      // Step 19. Check for a new best move
+      // Step 18. Check for a new best move
       // Finished searching the move. If a stop occurred, the return value of
       // the search cannot be trusted, and we return immediately without
       // updating best move, PV and TT.
@@ -1378,7 +1367,7 @@ moves_loop: // When in check, search starts from here
         return VALUE_DRAW;
     */
 
-    // Step 20. Check for mate and stalemate
+    // Step 19. Check for mate and stalemate
     // All legal moves have been searched and if there are no legal moves, it
     // must be a mate or a stalemate. If we are in a singular extension search then
     // return a fail low score.
@@ -1511,7 +1500,7 @@ moves_loop: // When in check, search starts from here
         if (PvNode && bestValue > alpha)
             alpha = bestValue;
 
-        futilityBase = bestValue + 141;
+        futilityBase = bestValue + 145;
     }
 
     const PieceToHistory* contHist[] = { (ss-1)->continuationHistory, (ss-2)->continuationHistory,
@@ -1545,6 +1534,10 @@ moves_loop: // When in check, search starts from here
       {
           assert(type_of(move) != ENPASSANT); // Due to !pos.advanced_pawn_push
 
+          // moveCount pruning
+          if (moveCount > 2)
+              continue;
+
           futilityValue = futilityBase + PieceValue[EG][pos.piece_on(to_sq(move))];
 
           if (futilityValue <= alpha)
@@ -1586,6 +1579,12 @@ moves_loop: // When in check, search starts from here
                                                                 [pos.moved_piece(move)]
                                                                 [to_sq(move)];
 
+      if (  !captureOrPromotion
+          && moveCount >= abs(depth) + 1
+          && (*contHist[0])[pos.moved_piece(move)][to_sq(move)] < CounterMovePruneThreshold
+          && (*contHist[1])[pos.moved_piece(move)][to_sq(move)] < CounterMovePruneThreshold)
+          continue;
+
       // Make and search the move
       pos.do_move(move, st, givesCheck);
       value = -qsearch<NT>(pos, ss+1, -beta, -alpha, depth - 1);
@@ -1768,7 +1767,7 @@ moves_loop: // When in check, search starts from here
     }
 
     if (depth > 11 && ss->ply < MAX_LPH)
-        thisThread->lowPlyHistory[ss->ply][from_to(move)] << stat_bonus(depth - 6);
+        thisThread->lowPlyHistory[ss->ply][from_to(move)] << stat_bonus(depth - 7);
   }
 
   // When playing with strength handicap, choose best move among a set of RootMoves
diff --git a/src/thread_win32_osx.h b/src/thread_win32_osx.h
index c4b55a48..75ef5d9a 100644
--- a/src/thread_win32_osx.h
+++ b/src/thread_win32_osx.h
@@ -27,7 +27,7 @@
 /// The implementation calls pthread_create() with the stack size parameter
 /// equal to the linux 8MB default, on platforms that support it.
 
-#if defined(__APPLE__) || defined(__MINGW32__) || defined(__MINGW64__)
+#if defined(__APPLE__) || defined(__MINGW32__) || defined(__MINGW64__) || defined(USE_PTHREADS)
 
 #include <pthread.h>
 
diff --git a/src/timeman.cpp b/src/timeman.cpp
index df4ba9b2..6d9c95ef 100644
--- a/src/timeman.cpp
+++ b/src/timeman.cpp
@@ -38,9 +38,9 @@ void TimeManagement::init(Search::LimitsType& limits, Color us, int ply) {
   TimePoint slowMover       = TimePoint(Options["Slow Mover"]);
   TimePoint npmsec          = TimePoint(Options["nodestime"]);
 
-  // opt_scale is a percentage of available time to use for the current move.
-  // max_scale is a multiplier applied to optimumTime.
-  double opt_scale, max_scale;
+  // optScale is a percentage of available time to use for the current move.
+  // maxScale is a multiplier applied to optimumTime.
+  double optScale, maxScale;
 
   // If we have to play in 'nodes as time' mode, then convert from time
   // to nodes, and use resulting values in time management formulas.
@@ -75,22 +75,22 @@ void TimeManagement::init(Search::LimitsType& limits, Color us, int ply) {
   // game time for the current move, so also cap to 20% of available game time.
   if (limits.movestogo == 0)
   {
-      opt_scale = std::min(0.008 + std::pow(ply + 3.0, 0.5) / 250.0,
+      optScale = std::min(0.008 + std::pow(ply + 3.0, 0.5) / 250.0,
                            0.2 * limits.time[us] / double(timeLeft));
-      max_scale = std::min(7.0, 4.0 + ply / 12.0);
+      maxScale = std::min(7.0, 4.0 + ply / 12.0);
   }
 
   // x moves in y seconds (+ z increment)
   else
   {
-      opt_scale = std::min((0.8 + ply / 128.0) / mtg,
+      optScale = std::min((0.8 + ply / 128.0) / mtg,
                             0.8 * limits.time[us] / double(timeLeft));
-      max_scale = std::min(6.3, 1.5 + 0.11 * mtg);
+      maxScale = std::min(6.3, 1.5 + 0.11 * mtg);
   }
 
   // Never use more than 80% of the available time for this move
-  optimumTime = TimePoint(opt_scale * timeLeft);
-  maximumTime = TimePoint(std::min(0.8 * limits.time[us] - moveOverhead, max_scale * optimumTime));
+  optimumTime = TimePoint(optScale * timeLeft);
+  maximumTime = TimePoint(std::min(0.8 * limits.time[us] - moveOverhead, maxScale * optimumTime));
 
   if (Options["Ponder"])
       optimumTime += optimumTime / 4;
diff --git a/src/tt.cpp b/src/tt.cpp
index d494c27d..60a3a5f1 100644
--- a/src/tt.cpp
+++ b/src/tt.cpp
@@ -37,18 +37,19 @@ void TTEntry::save(Key k, Value v, bool pv, Bound b, Depth d, Move m, Value ev)
   if (m || (uint16_t)k != key16)
       move16 = (uint16_t)m;
 
-  // Overwrite less valuable entries
-  if ((uint16_t)k != key16
-      || d - DEPTH_OFFSET > depth8 - 4
-      || b == BOUND_EXACT)
+  // Overwrite less valuable entries (cheapest checks first)
+  if (b == BOUND_EXACT
+      || (uint16_t)k != key16
+      || d - DEPTH_OFFSET > depth8 - 4)
   {
-      assert(d >= DEPTH_OFFSET);
+      assert(d > DEPTH_OFFSET);
+      assert(d < 256 + DEPTH_OFFSET);
 
       key16     = (uint16_t)k;
+      depth8    = (uint8_t)(d - DEPTH_OFFSET);
+      genBound8 = (uint8_t)(TT.generation8 | uint8_t(pv) << 2 | b);
       value16   = (int16_t)v;
       eval16    = (int16_t)ev;
-      genBound8 = (uint8_t)(TT.generation8 | uint8_t(pv) << 2 | b);
-      depth8    = (uint8_t)(d - DEPTH_OFFSET);
   }
 }
 
@@ -119,11 +120,11 @@ TTEntry* TranspositionTable::probe(const Key key, bool& found) const {
   const uint16_t key16 = (uint16_t)key;  // Use the low 16 bits as key inside the cluster
 
   for (int i = 0; i < ClusterSize; ++i)
-      if (!tte[i].key16 || tte[i].key16 == key16)
+      if (tte[i].key16 == key16 || !tte[i].depth8)
       {
           tte[i].genBound8 = uint8_t(generation8 | (tte[i].genBound8 & 0x7)); // Refresh
 
-          return found = (bool)tte[i].key16, &tte[i];
+          return found = (bool)tte[i].depth8, &tte[i];
       }
 
   // Find an entry to be replaced according to the replacement strategy
@@ -149,7 +150,7 @@ int TranspositionTable::hashfull() const {
   int cnt = 0;
   for (int i = 0; i < 1000; ++i)
       for (int j = 0; j < ClusterSize; ++j)
-          cnt += (table[i].entry[j].genBound8 & 0xF8) == generation8;
+          cnt += table[i].entry[j].depth8 && (table[i].entry[j].genBound8 & 0xF8) == generation8;
 
   return cnt / ClusterSize;
 }
diff --git a/src/tt.h b/src/tt.h
index c177ca52..fdfd6769 100644
--- a/src/tt.h
+++ b/src/tt.h
@@ -25,13 +25,13 @@
 /// TTEntry struct is the 10 bytes transposition table entry, defined as below:
 ///
 /// key        16 bit
-/// move       16 bit
-/// value      16 bit
-/// eval value 16 bit
+/// depth       8 bit
 /// generation  5 bit
 /// pv node     1 bit
 /// bound type  2 bit
-/// depth       8 bit
+/// move       16 bit
+/// value      16 bit
+/// eval value 16 bit
 
 struct TTEntry {
 
@@ -47,11 +47,11 @@ private:
   friend class TranspositionTable;
 
   uint16_t key16;
+  uint8_t  depth8;
+  uint8_t  genBound8;
   uint16_t move16;
   int16_t  value16;
   int16_t  eval16;
-  uint8_t  genBound8;
-  uint8_t  depth8;
 };
 
 
diff --git a/src/types.h b/src/types.h
index ce4c2dbb..d34781e5 100644
--- a/src/types.h
+++ b/src/types.h
@@ -203,22 +203,6 @@ enum Piece {
   PIECE_NB = 16
 };
 
-// An ID used to track the pieces. Max. 32 pieces on board.
-enum PieceId {
-  PIECE_ID_ZERO   = 0,
-  PIECE_ID_KING   = 30,
-  PIECE_ID_WKING  = 30,
-  PIECE_ID_BKING  = 31,
-  PIECE_ID_NONE   = 32
-};
-
-inline PieceId operator++(PieceId& d, int) {
-
-  PieceId x = d;
-  d = PieceId(int(d) + 1);
-  return x;
-}
-
 constexpr Value PieceValue[PHASE_NB][PIECE_NB] = {
   { VALUE_ZERO, PawnValueMg, KnightValueMg, BishopValueMg, RookValueMg, QueenValueMg, VALUE_ZERO, VALUE_ZERO,
     VALUE_ZERO, PawnValueMg, KnightValueMg, BishopValueMg, RookValueMg, QueenValueMg, VALUE_ZERO, VALUE_ZERO },
@@ -234,7 +218,8 @@ enum : int {
   DEPTH_QS_RECAPTURES = -5,
 
   DEPTH_NONE   = -6,
-  DEPTH_OFFSET = DEPTH_NONE
+
+  DEPTH_OFFSET = -7 // value used only for TT entry occupancy check
 };
 
 enum Square : int {
@@ -272,118 +257,20 @@ enum Rank : int {
   RANK_1, RANK_2, RANK_3, RANK_4, RANK_5, RANK_6, RANK_7, RANK_8, RANK_NB
 };
 
-// unique number for each piece type on each square
-enum PieceSquare : uint32_t {
-  PS_NONE     =  0,
-  PS_W_PAWN   =  1,
-  PS_B_PAWN   =  1 * SQUARE_NB + 1,
-  PS_W_KNIGHT =  2 * SQUARE_NB + 1,
-  PS_B_KNIGHT =  3 * SQUARE_NB + 1,
-  PS_W_BISHOP =  4 * SQUARE_NB + 1,
-  PS_B_BISHOP =  5 * SQUARE_NB + 1,
-  PS_W_ROOK   =  6 * SQUARE_NB + 1,
-  PS_B_ROOK   =  7 * SQUARE_NB + 1,
-  PS_W_QUEEN  =  8 * SQUARE_NB + 1,
-  PS_B_QUEEN  =  9 * SQUARE_NB + 1,
-  PS_W_KING   = 10 * SQUARE_NB + 1,
-  PS_END      = PS_W_KING, // pieces without kings (pawns included)
-  PS_B_KING   = 11 * SQUARE_NB + 1,
-  PS_END2     = 12 * SQUARE_NB + 1,
-
-  PS_NOT_INIT = PS_END2 + 1,
-};
-
-struct ExtPieceSquare {
-  PieceSquare from[COLOR_NB];
-};
-
-// Array for finding the PieceSquare corresponding to the piece on the board
-extern ExtPieceSquare kpp_board_index[PIECE_NB];
-
-constexpr bool is_ok(PieceId pid);
-constexpr Square rotate180(Square sq);
-
-class Position;
-
-// Structure holding which tracked piece (PieceId) is where (PieceSquare)
-class EvalList {
-
-public:
-  // Max. number of pieces without kings is 30 but must be a multiple of 4 in AVX2
-  static const int MAX_LENGTH = 32;
-
-  // Array that holds the piece id for the pieces on the board
-  PieceId piece_id_list[SQUARE_NB];
-
-  // List of pieces, separate from White and Black POV
-  PieceSquare* piece_list_fw() const { return const_cast<PieceSquare*>(pieceListFw); }
-  PieceSquare* piece_list_fb() const { return const_cast<PieceSquare*>(pieceListFb); }
-
-  // Place the piece pc with piece_id on the square sq on the board
-  void put_piece(PieceId piece_id, Square sq, Piece pc)
-  {
-      assert(is_ok(piece_id));
-      if (pc != NO_PIECE)
-      {
-          pieceListFw[piece_id] = PieceSquare(kpp_board_index[pc].from[WHITE] + sq);
-          pieceListFb[piece_id] = PieceSquare(kpp_board_index[pc].from[BLACK] + rotate180(sq));
-          piece_id_list[sq] = piece_id;
-      }
-      else
-      {
-          pieceListFw[piece_id] = PS_NONE;
-          pieceListFb[piece_id] = PS_NONE;
-          piece_id_list[sq] = piece_id;
-      }
-  }
-
-  // Convert the specified piece_id piece to ExtPieceSquare type and return it
-  ExtPieceSquare piece_with_id(PieceId piece_id) const
-  {
-      ExtPieceSquare eps;
-      eps.from[WHITE] = pieceListFw[piece_id];
-      eps.from[BLACK] = pieceListFb[piece_id];
-      return eps;
-  }
-
-  // Initialize the pieceList.
-  // Set the value of unused pieces to PieceSquare::PS_NONE in case you want to deal with dropped pieces.
-  // A normal evaluation function can be used as an evaluation function for missing frames.
-  // piece_no_list is initialized with PieceId::PIECE_ID_NONE to facilitate debugging.
-  void clear()
-  {
-
-      for (auto& p : pieceListFw)
-          p = PieceSquare::PS_NONE;
-
-      for (auto& p : pieceListFb)
-          p = PieceSquare::PS_NONE;
-
-      for (auto& v : piece_id_list)
-          v = PieceId::PIECE_ID_NONE;
-  }
-
-  // Check whether the pieceListFw[] held internally is a correct BonaPiece.
-  // Note: For debugging. slow.
-  bool is_valid(const Position& pos);
-
-private:
-  PieceSquare pieceListFw[MAX_LENGTH];
-  PieceSquare pieceListFb[MAX_LENGTH];
-};
-
-// For differential evaluation of pieces that changed since last turn
+// Keep track of what a move changes on the board (used by NNUE)
 struct DirtyPiece {
 
   // Number of changed pieces
   int dirty_num;
 
-  // The ids of changed pieces, max. 2 pieces can change in one move
-  PieceId pieceId[2];
+  // At most 3 pieces can change in one move. A promotion with capture moves
+  // both the pawn and the captured piece to SQ_NONE, and moves the promoted
+  // piece from SQ_NONE to the capture square.
+  Piece piece[3];
 
-  // What changed from the piece with that piece number
-  ExtPieceSquare old_piece[2];
-  ExtPieceSquare new_piece[2];
+  // From and to squares, which may be SQ_NONE
+  Square from[3];
+  Square to[3];
 };
 
 /// Score enum stores a middlegame and an endgame value in a single integer (enum).
@@ -433,8 +320,6 @@ ENABLE_FULL_OPERATORS_ON(Value)
 ENABLE_FULL_OPERATORS_ON(Direction)
 
 ENABLE_INCR_OPERATORS_ON(Piece)
-ENABLE_INCR_OPERATORS_ON(PieceSquare)
-ENABLE_INCR_OPERATORS_ON(PieceId)
 ENABLE_INCR_OPERATORS_ON(PieceType)
 ENABLE_INCR_OPERATORS_ON(Square)
 ENABLE_INCR_OPERATORS_ON(File)
@@ -523,10 +408,6 @@ inline Color color_of(Piece pc) {
   return Color(pc >> 3);
 }
 
-constexpr bool is_ok(PieceId pid) {
-  return pid < PIECE_ID_NONE;
-}
-
 constexpr bool is_ok(Square s) {
   return s >= SQ_A1 && s <= SQ_H8;
 }
@@ -563,11 +444,6 @@ constexpr Square to_sq(Move m) {
   return Square(m & 0x3F);
 }
 
-// Return relative square when turning the board 180 degrees
-constexpr Square rotate180(Square sq) {
-  return (Square)(sq ^ 0x3F);
-}
-
 constexpr int from_to(Move m) {
  return m & 0xFFF;
 }
diff --git a/src/uci.cpp b/src/uci.cpp
index 00941040..d6745d19 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -260,7 +260,7 @@ double UCI::win_rate_model_double(double v, int ply) {
    double b = (((bs[0] * m + bs[1]) * m + bs[2]) * m) + bs[3];
 
    // Transform eval to centipawns with limited range
-   double x = Utility::clamp(double(100 * v) / PawnValueEg, -1000.0, 1000.0);
+   double x = std::clamp(double(100 * v) / PawnValueEg, -1000.0, 1000.0);
 
    // Return win rate in per mille
    return 1000.0 / (1 + std::exp((a - x) / b));
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index ef40fe82..519160cf 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -79,8 +79,10 @@ void init(OptionsMap& o) {
   o["SyzygyProbeDepth"]      << Option(1, 1, 100);
   o["Syzygy50MoveRule"]      << Option(true);
   o["SyzygyProbeLimit"]      << Option(7, 0, 7);
-  o["Use NNUE"]              << Option(false, on_use_NNUE);
-  o["EvalFile"]              << Option("nn-9931db908a9b.nnue", on_eval_file);
+  o["Use NNUE"]              << Option(true, on_use_NNUE);
+  // The default must follow the format nn-[SHA256 first 12 digits].nnue
+  // for the build process (profile-build and fishtest) to work.
+  o["EvalFile"]              << Option("nn-82215d0fd0df.nnue", on_eval_file);
 #ifdef EVAL_NNUE
   // When the evaluation function is loaded at the ucinewgame timing, it is necessary to convert the new evaluation function.
   // I want to hit the test eval convert command, but there is no new evaluation function
diff --git a/tests/instrumented.sh b/tests/instrumented.sh
index ae6d5c4b..03ded74a 100755
--- a/tests/instrumented.sh
+++ b/tests/instrumented.sh
@@ -70,7 +70,7 @@ for args in "eval" \
             "go depth 10" \
             "go movetime 1000" \
             "go wtime 8000 btime 8000 winc 500 binc 500" \
-            "bench 128 $threads 10 default depth"
+            "bench 128 $threads 8 default depth"
 do
 
    echo "$prefix $exeprefix ./stockfish $args $postfix"
@@ -80,7 +80,7 @@ done
 
 # more general testing, following an uci protocol exchange
 cat << EOF > game.exp
- set timeout 10
+ set timeout 240
  spawn $exeprefix ./stockfish
 
  send "uci\n"
@@ -98,7 +98,7 @@ cat << EOF > game.exp
  expect "bestmove"
 
  send "position fen 5rk1/1K4p1/8/8/3B4/8/8/8 b - - 0 1\n"
- send "go depth 30\n"
+ send "go depth 20\n"
  expect "bestmove"
 
  send "quit\n"
@@ -121,7 +121,7 @@ cat << EOF > syzygy.exp
  send "uci\n"
  send "setoption name SyzygyPath value ../tests/syzygy/\n"
  expect "info string Found 35 tablebases" {} timeout {exit 1}
- send "bench 128 1 10 default depth\n"
+ send "bench 128 1 8 default depth\n"
  send "quit\n"
  expect eof