diff --git a/.travis.yml b/.travis.yml index d563a1e1..092c7f53 100644 --- a/.travis.yml +++ b/.travis.yml @@ -43,26 +43,47 @@ before_script: - cd src script: + # Download net + - make net + # Obtain bench reference from git log - git log HEAD | grep "\b[Bb]ench[ :]\+[0-9]\{7\}" | head -n 1 | sed "s/[^0-9]*\([0-9]*\).*/\1/g" > git_sig - export benchref=$(cat git_sig) - echo "Reference bench:" $benchref - # # Compiler version string - $COMPILER -v - # + # test help target + - make help + # Verify bench number against various builds - export CXXFLAGS="-Werror -D_GLIBCXX_DEBUG" - - make clean && make -j2 ARCH=x86-64 optimize=no debug=yes build && ../tests/signature.sh $benchref + - make clean && make -j2 ARCH=x86-64-modern optimize=no debug=yes build && ../tests/signature.sh $benchref + - export CXXFLAGS="-Werror" + - make clean && make -j2 ARCH=x86-64-modern build && ../tests/signature.sh $benchref + - make clean && make -j2 ARCH=x86-64-ssse3 build && ../tests/signature.sh $benchref + - make clean && make -j2 ARCH=x86-64-sse3-popcnt build && ../tests/signature.sh $benchref + - make clean && make -j2 ARCH=x86-64 build && ../tests/signature.sh $benchref + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-64 build && ../tests/signature.sh $benchref; fi - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 optimize=no debug=yes build && ../tests/signature.sh $benchref; fi + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-sse41-popcnt build && ../tests/signature.sh $benchref; fi + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-sse2 build && ../tests/signature.sh $benchref; fi - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 build && ../tests/signature.sh $benchref; fi + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-32 build && ../tests/signature.sh $benchref; fi + # workaround: exclude a custom version of llvm+clang, which doesn't find llvm-profdata on ubuntu + - if [[ "$TRAVIS_OS_NAME" != "linux" || "$COMP" == "gcc" ]]; then make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref; fi + + # compile only for some more advanced architectures (might not run in travis) + - make clean && make -j2 ARCH=x86-64-avx2 build + - make clean && make -j2 ARCH=x86-64-bmi2 build + - make clean && make -j2 ARCH=x86-64-avx512 build + - make clean && make -j2 ARCH=x86-64-vnni512 build + - make clean && make -j2 ARCH=x86-64-vnni256 build # # Check perft and reproducible search - - export CXXFLAGS="-Werror" - - make clean && make -j2 ARCH=x86-64 build + - make clean && make -j2 ARCH=x86-64-modern build - ../tests/perft.sh - ../tests/reprosearch.sh @@ -70,11 +91,11 @@ script: # Valgrind # - export CXXFLAGS="-O1 -fno-inline" - - if [ -x "$(command -v valgrind )" ]; then make clean && make -j2 ARCH=x86-64 debug=yes optimize=no build > /dev/null && ../tests/instrumented.sh --valgrind; fi + - if [ -x "$(command -v valgrind )" ]; then make clean && make -j2 ARCH=x86-64-modern debug=yes optimize=no build > /dev/null && ../tests/instrumented.sh --valgrind; fi - if [ -x "$(command -v valgrind )" ]; then ../tests/instrumented.sh --valgrind-thread; fi # # Sanitizer # - - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64 sanitize=undefined optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-undefined; fi - - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64 sanitize=thread optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-thread; fi + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-undefined; fi + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64-modern sanitize=thread optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-thread; fi diff --git a/AUTHORS b/AUTHORS index 21ef3e50..c96f870a 100644 --- a/AUTHORS +++ b/AUTHORS @@ -53,11 +53,13 @@ Ernesto Gatti Linmiao Xu (linrock) Fabian Beuke (madnight) Fabian Fichter (ianfab) +Fanael Linithien (Fanael) fanon Fauzi Akram Dabat (FauziAkram) Felix Wittmann gamander Gary Heckman (gheckman) +George Sobala (gsobala) gguliash Gian-Carlo Pascutto (gcp) Gontran Lemaire (gonlem) @@ -126,6 +128,7 @@ Niklas Fiekas (niklasf) Nikolay Kostov (NikolayIT) Nguyen Pham (nguyenpham) Norman Schmidt (FireFather) +notruck Ondrej Mosnáček (WOnder93) Oskar Werkelin Ahlin Pablo Vazquez diff --git a/appveyor.yml b/appveyor.yml index d356ba2f..a3732a23 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -61,6 +61,20 @@ before_build: build_script: - cmake --build . --config %CONFIGURATION% -- /verbosity:minimal + - ps: | + # Download default NNUE net from fishtest + $nnuenet = Get-Content -Path src\ucioption.cpp | Select-String -CaseSensitive -Pattern "Option" | Select-String -CaseSensitive -Pattern "nn-[a-z0-9]{12}.nnue" + $dummy = $nnuenet -match "(?nn-[a-z0-9]{12}.nnue)" + $nnuenet = $Matches.nnuenet + Write-Host "Default net:" $nnuenet + $nnuedownloadurl = "https://tests.stockfishchess.org/api/nn/$nnuenet" + $nnuefilepath = "src\${env:CONFIGURATION}\$nnuenet" + if (Test-Path -Path $nnuefilepath) { + Write-Host "Already available." + } else { + Write-Host "Downloading $nnuedownloadurl to $nnuefilepath" + Invoke-WebRequest -Uri $nnuedownloadurl -OutFile $nnuefilepath + } before_test: - cd src/%CONFIGURATION% diff --git a/src/Makefile b/src/Makefile index a8736a15..75e39557 100644 --- a/src/Makefile +++ b/src/Makefile @@ -82,14 +82,16 @@ endif # bits = 64/32 --- -DIS_64BIT --- 64-/32-bit operating system # prefetch = yes/no --- -DUSE_PREFETCH --- Use prefetch asm-instruction # popcnt = yes/no --- -DUSE_POPCNT --- Use popcnt asm-instruction +# pext = yes/no --- -DUSE_PEXT --- Use pext x86_64 asm-instruction # sse = yes/no --- -msse --- Use Intel Streaming SIMD Extensions -# sse3 = yes/no --- -msse3 --- Use Intel Streaming SIMD Extensions 3 +# mmx = yes/no --- -mmmx --- Use Intel MMX instructions +# sse2 = yes/no --- -msse2 --- Use Intel Streaming SIMD Extensions 2 # ssse3 = yes/no --- -mssse3 --- Use Intel Supplemental Streaming SIMD Extensions 3 # sse41 = yes/no --- -msse4.1 --- Use Intel Streaming SIMD Extensions 4.1 -# sse42 = yes/no --- -msse4.2 --- Use Intel Streaming SIMD Extensions 4.2 # avx2 = yes/no --- -mavx2 --- Use Intel Advanced Vector Extensions 2 -# pext = yes/no --- -DUSE_PEXT --- Use pext x86_64 asm-instruction # avx512 = yes/no --- -mavx512bw --- Use Intel Advanced Vector Extensions 512 +# vnni256 = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 256 +# vnni512 = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 512 # neon = yes/no --- -DUSE_NEON --- Use ARM SIMD architecture # # Note that Makefile is space sensitive, so when adding new architectures @@ -97,152 +99,184 @@ endif # at the end of the line for flag values. ### 2.1. General and architecture defaults + +# explicitly check for the list of supported architectures (as listed with make help), +# the user can override with `make ARCH=x86-32-vnni256 SUPPORTED_ARCH=true` +ifeq ($(ARCH),$(filter $(ARCH),x86-64-vnni512 x86-64-vnni256 x86-64-avx512 x86-64-bmi2 x86-64-avx2 \ + x86-64-sse41-popcnt x86-64-modern x86-64-ssse3 x86-64-sse3-popcnt \ + x86-64 x86-32-sse41-popcnt x86-32-sse2 x86-32 ppc-64 ppc-32 \ + armv7 armv7-neon armv8 apple-silicon general-64 general-32)) + SUPPORTED_ARCH=true +else + SUPPORTED_ARCH=false +endif + optimize = yes debug = no sanitize = no bits = 64 prefetch = no popcnt = no +pext = no sse = no -sse3 = no +mmx = no +sse2 = no ssse3 = no sse41 = no -sse42 = no avx2 = no -pext = no avx512 = no +vnni256 = no +vnni512 = no neon = no ARCH = x86-64-modern +STRIP = strip ### 2.2 Architecture specific + +ifeq ($(findstring x86,$(ARCH)),x86) + +# x86-32/64 + +ifeq ($(findstring x86-32,$(ARCH)),x86-32) + arch = i386 + bits = 32 + sse = yes + mmx = yes +else + arch = x86_64 + sse = yes + sse2 = yes +endif + +ifeq ($(findstring -sse,$(ARCH)),-sse) + sse = yes +endif + +ifeq ($(findstring -popcnt,$(ARCH)),-popcnt) + popcnt = yes +endif + +ifeq ($(findstring -mmx,$(ARCH)),-mmx) + mmx = yes +endif + +ifeq ($(findstring -sse2,$(ARCH)),-sse2) + sse = yes + sse2 = yes +endif + +ifeq ($(findstring -ssse3,$(ARCH)),-ssse3) + sse = yes + sse2 = yes + ssse3 = yes +endif + +ifeq ($(findstring -sse41,$(ARCH)),-sse41) + sse = yes + sse2 = yes + ssse3 = yes + sse41 = yes +endif + +ifeq ($(findstring -modern,$(ARCH)),-modern) + popcnt = yes + sse = yes + sse2 = yes + ssse3 = yes + sse41 = yes +endif + +ifeq ($(findstring -avx2,$(ARCH)),-avx2) + popcnt = yes + sse = yes + sse2 = yes + ssse3 = yes + sse41 = yes + avx2 = yes +endif + +ifeq ($(findstring -bmi2,$(ARCH)),-bmi2) + popcnt = yes + sse = yes + sse2 = yes + ssse3 = yes + sse41 = yes + avx2 = yes + pext = yes +endif + +ifeq ($(findstring -avx512,$(ARCH)),-avx512) + popcnt = yes + sse = yes + sse2 = yes + ssse3 = yes + sse41 = yes + avx2 = yes + pext = yes + avx512 = yes +endif + +ifeq ($(findstring -vnni256,$(ARCH)),-vnni256) + popcnt = yes + sse = yes + sse2 = yes + ssse3 = yes + sse41 = yes + avx2 = yes + pext = yes + vnni256 = yes +endif + +ifeq ($(findstring -vnni512,$(ARCH)),-vnni512) + popcnt = yes + sse = yes + sse2 = yes + ssse3 = yes + sse41 = yes + avx2 = yes + pext = yes + avx512 = yes + vnni512 = yes +endif + +ifeq ($(sse),yes) + prefetch = yes +endif + +# 64-bit pext is not available on x86-32 +ifeq ($(bits),32) + pext = no +endif + +else + +# all other architectures + ifeq ($(ARCH),general-32) arch = any bits = 32 endif -ifeq ($(ARCH),x86-32-old) - arch = i386 - bits = 32 -endif - -ifeq ($(ARCH),x86-32) - arch = i386 - bits = 32 - prefetch = yes - sse = yes -endif - ifeq ($(ARCH),general-64) arch = any endif -ifeq ($(ARCH),x86-64) - arch = x86_64 - prefetch = yes - sse = yes -endif - -ifeq ($(ARCH),x86-64-sse3) - arch = x86_64 - prefetch = yes - sse = yes - sse3 = yes -endif - -ifeq ($(ARCH),x86-64-sse3-popcnt) - arch = x86_64 - prefetch = yes - sse = yes - sse3 = yes - popcnt = yes -endif - -ifeq ($(ARCH),x86-64-ssse3) - arch = x86_64 - prefetch = yes - sse = yes - sse3 = yes - ssse3 = yes -endif - -ifeq ($(ARCH),x86-64-sse41) - arch = x86_64 - prefetch = yes - popcnt = yes - sse = yes - sse3 = yes - ssse3 = yes - sse41 = yes -endif - -ifeq ($(ARCH),x86-64-modern) - arch = x86_64 - prefetch = yes - popcnt = yes - sse = yes - sse3 = yes - ssse3 = yes - sse41 = yes -endif - -ifeq ($(ARCH),x86-64-sse42) - arch = x86_64 - prefetch = yes - popcnt = yes - sse = yes - sse3 = yes - ssse3 = yes - sse41 = yes - sse42 = yes -endif - -ifeq ($(ARCH),x86-64-avx2) - arch = x86_64 - prefetch = yes - popcnt = yes - sse = yes - sse3 = yes - ssse3 = yes - sse41 = yes - sse42 = yes - avx2 = yes -endif - -ifeq ($(ARCH),x86-64-bmi2) - arch = x86_64 - prefetch = yes - popcnt = yes - sse = yes - sse3 = yes - ssse3 = yes - sse41 = yes - sse42 = yes - avx2 = yes - pext = yes -endif - -ifeq ($(ARCH),x86-64-avx512) - arch = x86_64 - prefetch = yes - popcnt = yes - sse = yes - sse3 = yes - ssse3 = yes - sse41 = yes - sse42 = yes - avx2 = yes - pext = yes - avx512 = yes -endif - ifeq ($(ARCH),armv7) arch = armv7 prefetch = yes bits = 32 endif +ifeq ($(ARCH),armv7-neon) + arch = armv7 + prefetch = yes + popcnt = yes + neon = yes + bits = 32 +endif + ifeq ($(ARCH),armv8) - arch = armv8-a + arch = armv8 prefetch = yes popcnt = yes neon = yes @@ -266,6 +300,8 @@ ifeq ($(ARCH),ppc-64) prefetch = yes endif +endif + ### ========================================================================== ### Section 3. Low-level Configuration ### ========================================================================== @@ -284,7 +320,7 @@ ifeq ($(COMP),gcc) CXX=g++ CXXFLAGS += -pedantic -Wextra -Wshadow - ifeq ($(ARCH),$(filter $(ARCH),armv7 armv8)) + ifeq ($(arch),$(filter $(arch),armv7 armv8)) ifeq ($(OS),Android) CXXFLAGS += -m$(bits) LDFLAGS += -m$(bits) @@ -294,12 +330,13 @@ ifeq ($(COMP),gcc) LDFLAGS += -m$(bits) endif + ifeq ($(arch),$(filter $(arch),armv7)) + LDFLAGS += -latomic + endif + ifneq ($(KERNEL),Darwin) LDFLAGS += -Wl,--no-as-needed endif - - gccversion = $(shell $(CXX) --version) - gccisclang = $(findstring clang,$(gccversion)) endif ifeq ($(COMP),mingw) @@ -344,7 +381,7 @@ ifeq ($(COMP),clang) endif endif - ifeq ($(ARCH),$(filter $(ARCH),armv7 armv8)) + ifeq ($(arch),$(filter $(arch),armv7 armv8)) ifeq ($(OS),Android) CXXFLAGS += -m$(bits) LDFLAGS += -m$(bits) @@ -371,6 +408,26 @@ endif ifeq ($(KERNEL),Darwin) CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.14 LDFLAGS += -arch $(arch) -mmacosx-version-min=10.14 + XCRUN = xcrun +endif + +# To cross-compile for Android, NDK version r21 or later is recommended. +# In earlier NDK versions, you'll need to pass -fno-addrsig if using GNU binutils. +# Currently we don't know how to make PGO builds with the NDK yet. +ifeq ($(COMP),ndk) + CXXFLAGS += -stdlib=libc++ -fPIE + ifeq ($(arch),armv7) + comp=armv7a-linux-androideabi16-clang + CXX=armv7a-linux-androideabi16-clang++ + CXXFLAGS += -mthumb -march=armv7-a -mfloat-abi=softfp -mfpu=neon + STRIP=arm-linux-androideabi-strip + endif + ifeq ($(arch),armv8) + comp=aarch64-linux-android21-clang + CXX=aarch64-linux-android21-clang++ + STRIP=aarch64-linux-android-strip + endif + LDFLAGS += -static-libstdc++ -pie -lm -latomic endif ### Travis CI script uses COMPILER to overwrite CXX @@ -383,16 +440,29 @@ ifdef COMPCXX CXX=$(COMPCXX) endif +### Sometimes gcc is really clang +ifeq ($(COMP),gcc) + gccversion = $(shell $(CXX) --version) + gccisclang = $(findstring clang,$(gccversion)) + ifneq ($(gccisclang),) + profile_make = clang-profile-make + profile_use = clang-profile-use + endif +endif + ### On mingw use Windows threads, otherwise POSIX ifneq ($(comp),mingw) + CXXFLAGS += -DUSE_PTHREADS # On Android Bionic's C library comes with its own pthread implementation bundled in ifneq ($(OS),Android) # Haiku has pthreads in its libroot, so only link it in on other platforms ifneq ($(KERNEL),Haiku) + ifneq ($(COMP),ndk) LDFLAGS += -lpthread endif endif endif +endif ### 3.2.1 Debugging ifeq ($(debug),no) @@ -434,7 +504,6 @@ endif ifeq ($(prefetch),yes) ifeq ($(sse),yes) CXXFLAGS += -msse - DEPENDFLAGS += -msse endif else CXXFLAGS += -DNO_PREFETCH @@ -442,7 +511,7 @@ endif ### 3.6 popcnt ifeq ($(popcnt),yes) - ifeq ($(arch),$(filter $(arch),ppc64 armv8-a arm64)) + ifeq ($(arch),$(filter $(arch),ppc64 armv7 armv8 arm64)) CXXFLAGS += -DUSE_POPCNT else ifeq ($(comp),icc) CXXFLAGS += -msse3 -DUSE_POPCNT @@ -451,6 +520,7 @@ ifeq ($(popcnt),yes) endif endif + ifeq ($(avx2),yes) CXXFLAGS += -DUSE_AVX2 ifeq ($(comp),$(filter $(comp),gcc clang mingw)) @@ -461,14 +531,21 @@ endif ifeq ($(avx512),yes) CXXFLAGS += -DUSE_AVX512 ifeq ($(comp),$(filter $(comp),gcc clang mingw)) - CXXFLAGS += -mavx512bw + CXXFLAGS += -mavx512f -mavx512bw endif endif -ifeq ($(sse42),yes) - CXXFLAGS += -DUSE_SSE42 +ifeq ($(vnni256),yes) + CXXFLAGS += -DUSE_VNNI ifeq ($(comp),$(filter $(comp),gcc clang mingw)) - CXXFLAGS += -msse4.2 + CXXFLAGS += -mavx512f -mavx512bw -mavx512vnni -mavx512dq -mavx512vl -mprefer-vector-width=256 + endif +endif + +ifeq ($(vnni512),yes) + CXXFLAGS += -DUSE_VNNI + ifeq ($(comp),$(filter $(comp),gcc clang mingw)) + CXXFLAGS += -mavx512vnni -mavx512dq -mavx512vl endif endif @@ -486,19 +563,29 @@ ifeq ($(ssse3),yes) endif endif -ifeq ($(sse3),yes) - CXXFLAGS += -DUSE_SSE3 +ifeq ($(sse2),yes) + CXXFLAGS += -DUSE_SSE2 ifeq ($(comp),$(filter $(comp),gcc clang mingw)) - CXXFLAGS += -msse3 + CXXFLAGS += -msse2 + endif +endif + +ifeq ($(mmx),yes) + CXXFLAGS += -DUSE_MMX + ifeq ($(comp),$(filter $(comp),gcc clang mingw)) + CXXFLAGS += -mmmx endif endif ifeq ($(neon),yes) CXXFLAGS += -DUSE_NEON + ifeq ($(KERNEL),Linux) + ifneq ($(COMP),ndk) + ifneq ($(arch),armv8) + CXXFLAGS += -mfpu=neon + endif + endif endif - -ifeq ($(arch),x86_64) - CXXFLAGS += -DUSE_SSE2 endif ### 3.7 pext @@ -514,7 +601,10 @@ endif ### needs access to the optimization flags. ifeq ($(optimize),yes) ifeq ($(debug), no) - ifeq ($(comp),clang) + ifeq ($(COMP),ndk) + CXXFLAGS += -flto=thin + LDFLAGS += $(CXXFLAGS) + else ifeq ($(comp),clang) CXXFLAGS += -flto=thin LDFLAGS += $(CXXFLAGS) @@ -524,13 +614,18 @@ ifeq ($(debug), no) ifeq ($(gccisclang),) CXXFLAGS += -flto LDFLAGS += $(CXXFLAGS) -flto=jobserver + ifneq ($(findstring MINGW,$(KERNEL)),) + LDFLAGS += -save-temps + else ifneq ($(findstring MSYS,$(KERNEL)),) + LDFLAGS += -save-temps + endif else CXXFLAGS += -flto=thin LDFLAGS += $(CXXFLAGS) endif # To use LTO and static linking on windows, the tool chain requires a recent gcc: -# gcc version 10.1 in msys2 or TDM-GCC version 9.2 are know to work, older might not. +# gcc version 10.1 in msys2 or TDM-GCC version 9.2 are known to work, older might not. # So, only enable it for a cross from Linux by default. else ifeq ($(comp),mingw) ifeq ($(KERNEL),Linux) @@ -552,6 +647,7 @@ endif ### Section 4. Public Targets ### ========================================================================== + help: @echo "" @echo "To compile stockfish, type: " @@ -560,31 +656,34 @@ help: @echo "" @echo "Supported targets:" @echo "" + @echo "help > Display architecture details" @echo "build > Standard build" - @echo "profile-build > Standard build with PGO" + @echo "net > Download the default nnue net" + @echo "profile-build > Faster build (with profile-guided optimization)" @echo "strip > Strip executable" @echo "install > Install executable" @echo "clean > Clean up" - @echo "net > Download the default nnue net" @echo "" @echo "Supported archs:" @echo "" + @echo "x86-64-vnni512 > x86 64-bit with vnni support 512bit wide" + @echo "x86-64-vnni256 > x86 64-bit with vnni support 256bit wide" @echo "x86-64-avx512 > x86 64-bit with avx512 support" @echo "x86-64-bmi2 > x86 64-bit with bmi2 support" @echo "x86-64-avx2 > x86 64-bit with avx2 support" - @echo "x86-64-sse42 > x86 64-bit with sse42 support" - @echo "x86-64-modern > x86 64-bit with sse41 support (x86-64-sse41)" - @echo "x86-64-sse41 > x86 64-bit with sse41 support" + @echo "x86-64-sse41-popcnt > x86 64-bit with sse41 and popcnt support" + @echo "x86-64-modern > common modern CPU, currently x86-64-sse41-popcnt" @echo "x86-64-ssse3 > x86 64-bit with ssse3 support" @echo "x86-64-sse3-popcnt > x86 64-bit with sse3 and popcnt support" - @echo "x86-64-sse3 > x86 64-bit with sse3 support" - @echo "x86-64 > x86 64-bit generic" - @echo "x86-32 > x86 32-bit (also enables SSE)" - @echo "x86-32-old > x86 32-bit fall back for old hardware" + @echo "x86-64 > x86 64-bit generic (with sse2 support)" + @echo "x86-32-sse41-popcnt > x86 32-bit with sse41 and popcnt support" + @echo "x86-32-sse2 > x86 32-bit with sse2 support" + @echo "x86-32 > x86 32-bit generic (with mmx and sse support)" @echo "ppc-64 > PPC 64-bit" @echo "ppc-32 > PPC 32-bit" @echo "armv7 > ARMv7 32-bit" - @echo "armv8 > ARMv8 64-bit" + @echo "armv7-neon > ARMv7 32-bit with popcnt and neon" + @echo "armv8 > ARMv8 64-bit with popcnt and neon" @echo "apple-silicon > Apple silicon ARM64" @echo "general-64 > unspecified 64-bit" @echo "general-32 > unspecified 32-bit" @@ -595,20 +694,26 @@ help: @echo "mingw > Gnu compiler with MinGW under Windows" @echo "clang > LLVM Clang compiler" @echo "icc > Intel compiler" + @echo "ndk > Google NDK to cross-compile for Android" @echo "" @echo "Simple examples. If you don't know what to do, you likely want to run: " @echo "" - @echo "make -j build ARCH=x86-64 (This is for 64-bit systems)" - @echo "make -j build ARCH=x86-32 (This is for 32-bit systems)" + @echo "make -j build ARCH=x86-64 (A portable, slow compile for 64-bit systems)" + @echo "make -j build ARCH=x86-32 (A portable, slow compile for 32-bit systems)" @echo "" - @echo "Advanced examples, for experienced users: " + @echo "Advanced examples, for experienced users looking for performance: " @echo "" - @echo "make -j build ARCH=x86-64-modern COMP=clang" - @echo "make -j profile-build ARCH=x86-64-bmi2 COMP=gcc COMPCXX=g++-4.8" - @echo "" - @echo "The selected architecture $(ARCH) enables the following configuration: " + @echo "make help ARCH=x86-64-bmi2" + @echo "make -j profile-build ARCH=x86-64-bmi2 COMP=gcc COMPCXX=g++-9.0" + @echo "make -j build ARCH=x86-64-ssse3 COMP=clang" @echo "" + @echo "-------------------------------" +ifeq ($(SUPPORTED_ARCH), true) + @echo "The selected architecture $(ARCH) will enable the following configuration: " @$(MAKE) ARCH=$(ARCH) COMP=$(COMP) config-sanity +else + @echo "Specify a supported architecture with the ARCH option for more details" +endif .PHONY: help build profile-build strip install clean net objclean profileclean \ @@ -618,7 +723,7 @@ help: build: config-sanity $(MAKE) ARCH=$(ARCH) COMP=$(COMP) all -profile-build: config-sanity objclean profileclean +profile-build: net config-sanity objclean profileclean @echo "" @echo "Step 1/4. Building instrumented executable ..." $(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) @@ -634,7 +739,7 @@ profile-build: config-sanity objclean profileclean $(MAKE) ARCH=$(ARCH) COMP=$(COMP) profileclean strip: - strip $(EXE) + $(STRIP) $(EXE) install: -mkdir -p -m 755 $(BINDIR) @@ -649,17 +754,34 @@ net: $(eval nnuenet := $(shell grep EvalFile ucioption.cpp | grep Option | sed 's/.*\(nn-[a-z0-9]\{12\}.nnue\).*/\1/')) @echo "Default net: $(nnuenet)" $(eval nnuedownloadurl := https://tests.stockfishchess.org/api/nn/$(nnuenet)) - $(eval curl_or_wget := $(shell if hash curl 2>/dev/null; then echo "curl -sL"; elif hash wget 2>/dev/null; then echo "wget -qO-"; fi)) - @if test -f "$(nnuenet)"; then echo "Already available."; else echo "Downloading $(nnuedownloadurl)"; $(curl_or_wget) $(nnuedownloadurl) > $(nnuenet); fi + $(eval curl_or_wget := $(shell if hash curl 2>/dev/null; then echo "curl -skL"; elif hash wget 2>/dev/null; then echo "wget -qO-"; fi)) + @if test -f "$(nnuenet)"; then \ + echo "Already available."; \ + else \ + if [ "x$(curl_or_wget)" = "x" ]; then \ + echo "Automatic download failed: neither curl nor wget is installed. Install one of these tools or download the net manually"; exit 1; \ + else \ + echo "Downloading $(nnuedownloadurl)"; $(curl_or_wget) $(nnuedownloadurl) > $(nnuenet);\ + fi; \ + fi; + $(eval shasum_command := $(shell if hash shasum 2>/dev/null; then echo "shasum -a 256 "; elif hash sha256sum 2>/dev/null; then echo "sha256sum "; fi)) + @if [ "x$(shasum_command)" != "x" ]; then \ + if [ "$(nnuenet)" != "nn-"`$(shasum_command) $(nnuenet) | cut -c1-12`".nnue" ]; then \ + echo "Failed download or $(nnuenet) corrupted, please delete!"; exit 1; \ + fi \ + else \ + echo "shasum / sha256sum not found, skipping net validation"; \ + fi + # clean binaries and objects objclean: - @rm -f $(EXE) *.o ./syzygy/*.o ./learn/*.o ./extra/*.o ./eval/*.o ./nnue/*.o ./nnue/features/*.o + @rm -f $(EXE) *.o ./syzygy/*.o ./nnue/*.o ./nnue/features/*.o ./learn/*.o ./extra/*.o ./eval/*.o # clean auxiliary profiling files profileclean: @rm -rf profdir - @rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./learn/*.gcda ./extra/*.gcda ./eval/*.gcda ./nnue/*.gcda ./nnue/features/*.gcda + @rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./nnue/*.gcda ./nnue/features/*.gcda *.s ./learn/*.gcda ./extra/*.gcda ./eval/*.gcda @rm -f stockfish.profdata *.profraw default: @@ -683,14 +805,16 @@ config-sanity: @echo "os: '$(OS)'" @echo "prefetch: '$(prefetch)'" @echo "popcnt: '$(popcnt)'" + @echo "pext: '$(pext)'" @echo "sse: '$(sse)'" - @echo "sse3: '$(sse3)'" + @echo "mmx: '$(mmx)'" + @echo "sse2: '$(sse2)'" @echo "ssse3: '$(ssse3)'" @echo "sse41: '$(sse41)'" - @echo "sse42: '$(sse42)'" @echo "avx2: '$(avx2)'" - @echo "pext: '$(pext)'" @echo "avx512: '$(avx512)'" + @echo "vnni256: '$(vnni256)'" + @echo "vnni512: '$(vnni512)'" @echo "neon: '$(neon)'" @echo "" @echo "Flags:" @@ -703,22 +827,26 @@ config-sanity: @test "$(debug)" = "yes" || test "$(debug)" = "no" @test "$(sanitize)" = "undefined" || test "$(sanitize)" = "thread" || test "$(sanitize)" = "address" || test "$(sanitize)" = "no" @test "$(optimize)" = "yes" || test "$(optimize)" = "no" + @test "$(SUPPORTED_ARCH)" = "true" @test "$(arch)" = "any" || test "$(arch)" = "x86_64" || test "$(arch)" = "i386" || \ test "$(arch)" = "ppc64" || test "$(arch)" = "ppc" || \ - test "$(arch)" = "armv7" || test "$(arch)" = "armv8-a" || test "$(arch)" = "arm64" + test "$(arch)" = "armv7" || test "$(arch)" = "armv8" || test "$(arch)" = "arm64" @test "$(bits)" = "32" || test "$(bits)" = "64" @test "$(prefetch)" = "yes" || test "$(prefetch)" = "no" @test "$(popcnt)" = "yes" || test "$(popcnt)" = "no" + @test "$(pext)" = "yes" || test "$(pext)" = "no" @test "$(sse)" = "yes" || test "$(sse)" = "no" - @test "$(sse3)" = "yes" || test "$(sse3)" = "no" + @test "$(mmx)" = "yes" || test "$(mmx)" = "no" + @test "$(sse2)" = "yes" || test "$(sse2)" = "no" @test "$(ssse3)" = "yes" || test "$(ssse3)" = "no" @test "$(sse41)" = "yes" || test "$(sse41)" = "no" - @test "$(sse42)" = "yes" || test "$(sse42)" = "no" @test "$(avx2)" = "yes" || test "$(avx2)" = "no" - @test "$(pext)" = "yes" || test "$(pext)" = "no" @test "$(avx512)" = "yes" || test "$(avx512)" = "no" + @test "$(vnni256)" = "yes" || test "$(vnni256)" = "no" + @test "$(vnni512)" = "yes" || test "$(vnni512)" = "no" @test "$(neon)" = "yes" || test "$(neon)" = "no" - @test "$(comp)" = "gcc" || test "$(comp)" = "icc" || test "$(comp)" = "mingw" || test "$(comp)" = "clang" + @test "$(comp)" = "gcc" || test "$(comp)" = "icc" || test "$(comp)" = "mingw" || test "$(comp)" = "clang" \ + || test "$(comp)" = "armv7a-linux-androideabi16-clang" || test "$(comp)" = "aarch64-linux-android21-clang" $(EXE): $(OBJS) +$(CXX) -o $@ $(OBJS) $(LDFLAGS) @@ -730,7 +858,7 @@ clang-profile-make: all clang-profile-use: - llvm-profdata merge -output=stockfish.profdata *.profraw + $(XCRUN) llvm-profdata merge -output=stockfish.profdata *.profraw $(MAKE) ARCH=$(ARCH) COMP=$(COMP) \ EXTRACXXFLAGS='-fprofile-instr-use=stockfish.profdata' \ EXTRALDFLAGS='-fprofile-use ' \ diff --git a/src/benchmark.cpp b/src/benchmark.cpp index 6041d642..806e9840 100644 --- a/src/benchmark.cpp +++ b/src/benchmark.cpp @@ -95,8 +95,9 @@ const vector Defaults = { /// setup_bench() builds a list of UCI commands to be run by bench. There /// are five parameters: TT size in MB, number of search threads that /// should be used, the limit value spent for each position, a file name -/// where to look for positions in FEN format and the type of the limit: -/// depth, perft, nodes and movetime (in millisecs). +/// where to look for positions in FEN format, the type of the limit: +/// depth, perft, nodes and movetime (in millisecs), and evaluation type +/// mixed (default), classical, NNUE. /// /// bench -> search default positions up to depth 13 /// bench 64 1 15 -> search default positions up to depth 15 (TT = 64MB) @@ -115,6 +116,7 @@ vector setup_bench(const Position& current, istream& is) { string limit = (is >> token) ? token : "13"; string fenFile = (is >> token) ? token : "default"; string limitType = (is >> token) ? token : "depth"; + string evalType = (is >> token) ? token : "mixed"; go = limitType == "eval" ? "eval" : "go " + limitType + " " + limit; @@ -146,13 +148,20 @@ vector setup_bench(const Position& current, istream& is) { list.emplace_back("setoption name Hash value " + ttSize); list.emplace_back("ucinewgame"); + size_t posCounter = 0; + for (const string& fen : fens) if (fen.find("setoption") != string::npos) list.emplace_back(fen); else { + if (evalType == "classical" || (evalType == "mixed" && posCounter % 2 == 0)) + list.emplace_back("setoption name Use NNUE value false"); + else if (evalType == "NNUE" || (evalType == "mixed" && posCounter % 2 != 0)) + list.emplace_back("setoption name Use NNUE value true"); list.emplace_back("position fen " + fen); list.emplace_back(go); + ++posCounter; } return list; diff --git a/src/bitboard.cpp b/src/bitboard.cpp index f531010c..80206b58 100644 --- a/src/bitboard.cpp +++ b/src/bitboard.cpp @@ -39,6 +39,16 @@ namespace { Bitboard BishopTable[0x1480]; // To store bishop attacks void init_magics(PieceType pt, Bitboard table[], Magic magics[]); + +} + + +/// safe_destination() returns the bitboard of target square for the given step +/// from the given square. If the step is off the board, returns empty bitboard. + +inline Bitboard safe_destination(Square s, int step) { + Square to = Square(s + step); + return is_ok(to) && distance(s, to) <= 2 ? square_bb(to) : Bitboard(0); } @@ -110,7 +120,7 @@ namespace { Direction RookDirections[4] = {NORTH, SOUTH, EAST, WEST}; Direction BishopDirections[4] = {NORTH_EAST, SOUTH_EAST, SOUTH_WEST, NORTH_WEST}; - for(Direction d : (pt == ROOK ? RookDirections : BishopDirections)) + for (Direction d : (pt == ROOK ? RookDirections : BishopDirections)) { Square s = sq; while(safe_destination(s, d) && !(occupied & s)) diff --git a/src/bitboard.h b/src/bitboard.h index a899d879..29d8f66d 100644 --- a/src/bitboard.h +++ b/src/bitboard.h @@ -279,16 +279,6 @@ inline int edge_distance(File f) { return std::min(f, File(FILE_H - f)); } inline int edge_distance(Rank r) { return std::min(r, Rank(RANK_8 - r)); } -/// safe_destination() returns the bitboard of target square for the given step -/// from the given square. If the step is off the board, returns empty bitboard. - -inline Bitboard safe_destination(Square s, int step) -{ - Square to = Square(s + step); - return is_ok(to) && distance(s, to) <= 2 ? square_bb(to) : Bitboard(0); -} - - /// attacks_bb(Square) returns the pseudo attacks of the give piece type /// assuming an empty board. diff --git a/src/evaluate.cpp b/src/evaluate.cpp index 4ba89675..5cbf821d 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -61,10 +61,11 @@ namespace Eval { UCI::OptionsMap defaults; UCI::init(defaults); - std::cerr << "NNUE evaluation used, but the network file " << eval_file << " was not loaded successfully. " - << "These network evaluation parameters must be available, and compatible with this version of the code. " - << "The UCI option EvalFile might need to specify the full path, including the directory/folder name, to the file. " - << "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/"+std::string(defaults["EvalFile"]) << std::endl; + sync_cout << "info string ERROR: NNUE evaluation used, but the network file " << eval_file << " was not loaded successfully." << sync_endl; + sync_cout << "info string ERROR: The UCI option EvalFile might need to specify the full path, including the directory/folder name, to the file." << sync_endl; + sync_cout << "info string ERROR: The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/"+std::string(defaults["EvalFile"]) << sync_endl; + sync_cout << "info string ERROR: If the UCI option Use NNUE is set to true, network evaluation parameters compatible with the program must be available." << sync_endl; + sync_cout << "info string ERROR: The engine will be terminated now." << sync_endl; std::exit(EXIT_FAILURE); } @@ -122,7 +123,8 @@ namespace { constexpr Value LazyThreshold1 = Value(1400); constexpr Value LazyThreshold2 = Value(1300); constexpr Value SpaceThreshold = Value(12222); - constexpr Value NNUEThreshold = Value(460); + constexpr Value NNUEThreshold1 = Value(550); + constexpr Value NNUEThreshold2 = Value(150); // KingAttackWeights[PieceType] contains king attack weights by piece type constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 81, 52, 44, 10 }; @@ -294,8 +296,8 @@ namespace { attackedBy2[Us] = dblAttackByPawn | (attackedBy[Us][KING] & attackedBy[Us][PAWN]); // Init our king safety tables - Square s = make_square(Utility::clamp(file_of(ksq), FILE_B, FILE_G), - Utility::clamp(rank_of(ksq), RANK_2, RANK_7)); + Square s = make_square(std::clamp(file_of(ksq), FILE_B, FILE_G), + std::clamp(rank_of(ksq), RANK_2, RANK_7)); kingRing[Us] = attacks_bb(s) | s; kingAttackersCount[Them] = popcount(kingRing[Us] & pe->pawn_attacks(Them)); @@ -692,8 +694,8 @@ namespace { Square blockSq = s + Up; // Adjust bonus based on the king's proximity - bonus += make_score(0, ( (king_proximity(Them, blockSq) * 19) / 4 - - king_proximity(Us, blockSq) * 2) * w); + bonus += make_score(0, ( king_proximity(Them, blockSq) * 19 / 4 + - king_proximity(Us, blockSq) * 2) * w); // If blockSq is not the queening square then consider also a second push if (r != RANK_7) @@ -737,7 +739,7 @@ namespace { // Evaluation::space() computes a space evaluation for a given side, aiming to improve game - // play in the opening. It is based on the number of safe squares on the 4 central files + // play in the opening. It is based on the number of safe squares on the four central files // on ranks 2 to 4. Completely safe squares behind a friendly pawn are counted twice. // Finally, the space bonus is multiplied by a weight which decreases according to occupancy. @@ -810,7 +812,7 @@ namespace { // Now apply the bonus: note that we find the attacking side by extracting the // sign of the midgame or endgame values, and that we carefully cap the bonus // so that the midgame and endgame scores do not change sign after the bonus. - int u = ((mg > 0) - (mg < 0)) * Utility::clamp(complexity + 50, -abs(mg), 0); + int u = ((mg > 0) - (mg < 0)) * std::clamp(complexity + 50, -abs(mg), 0); int v = ((eg > 0) - (eg < 0)) * std::max(complexity, -abs(eg)); mg += u; @@ -935,9 +937,6 @@ make_v: // Side to move point of view v = (pos.side_to_move() == WHITE ? v : -v) + Tempo; - // Damp down the evaluation linearly when shuffling - v = v * (100 - pos.rule50_count()) / 100; - return v; } @@ -954,14 +953,21 @@ Value Eval::evaluate(const Position& pos) { } #endif - if (Eval::useNNUE) - { - Value v = eg_value(pos.psq_score()); - // Take NNUE eval only on balanced positions - if (abs(v) < NNUEThreshold + 20 * pos.count()) - return NNUE::evaluate(pos) + Tempo; - } - return Evaluation(pos).value(); + bool classical = !Eval::useNNUE + || abs(eg_value(pos.psq_score())) * 16 > NNUEThreshold1 * (16 + pos.rule50_count()); + Value v = classical ? Evaluation(pos).value() + : NNUE::evaluate(pos) * 5 / 4 + Tempo; + + if (classical && Eval::useNNUE && abs(v) * 16 < NNUEThreshold2 * (16 + pos.rule50_count())) + v = NNUE::evaluate(pos) * 5 / 4 + Tempo; + + // Damp down the evaluation linearly when shuffling + v = v * (100 - pos.rule50_count()) / 100; + + // Guarantee evaluation does not hit the tablebase range + v = std::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1); + + return v; } /// trace() is like evaluate(), but instead of returning a value, it returns @@ -979,42 +985,46 @@ std::string Eval::trace(const Position& pos) { Value v; - if (Eval::useNNUE) - { - v = NNUE::evaluate(pos); - } - else - { - std::memset(scores, 0, sizeof(scores)); + std::memset(scores, 0, sizeof(scores)); - pos.this_thread()->contempt = SCORE_ZERO; // Reset any dynamic contempt + pos.this_thread()->contempt = SCORE_ZERO; // Reset any dynamic contempt - v = Evaluation(pos).value(); + v = Evaluation(pos).value(); - ss << std::showpoint << std::noshowpos << std::fixed << std::setprecision(2) - << " Term | White | Black | Total \n" - << " | MG EG | MG EG | MG EG \n" - << " ------------+-------------+-------------+------------\n" - << " Material | " << Term(MATERIAL) - << " Imbalance | " << Term(IMBALANCE) - << " Pawns | " << Term(PAWN) - << " Knights | " << Term(KNIGHT) - << " Bishops | " << Term(BISHOP) - << " Rooks | " << Term(ROOK) - << " Queens | " << Term(QUEEN) - << " Mobility | " << Term(MOBILITY) - << " King safety | " << Term(KING) - << " Threats | " << Term(THREAT) - << " Passed | " << Term(PASSED) - << " Space | " << Term(SPACE) - << " Winnable | " << Term(WINNABLE) - << " ------------+-------------+-------------+------------\n" - << " Total | " << Term(TOTAL); - } + ss << std::showpoint << std::noshowpos << std::fixed << std::setprecision(2) + << " Term | White | Black | Total \n" + << " | MG EG | MG EG | MG EG \n" + << " ------------+-------------+-------------+------------\n" + << " Material | " << Term(MATERIAL) + << " Imbalance | " << Term(IMBALANCE) + << " Pawns | " << Term(PAWN) + << " Knights | " << Term(KNIGHT) + << " Bishops | " << Term(BISHOP) + << " Rooks | " << Term(ROOK) + << " Queens | " << Term(QUEEN) + << " Mobility | " << Term(MOBILITY) + << " King safety | " << Term(KING) + << " Threats | " << Term(THREAT) + << " Passed | " << Term(PASSED) + << " Space | " << Term(SPACE) + << " Winnable | " << Term(WINNABLE) + << " ------------+-------------+-------------+------------\n" + << " Total | " << Term(TOTAL); v = pos.side_to_move() == WHITE ? v : -v; - ss << "\nFinal evaluation: " << to_cp(v) << " (white side)\n"; + ss << "\nClassical evaluation: " << to_cp(v) << " (white side)\n"; + + if (Eval::useNNUE) + { + v = NNUE::evaluate(pos); + v = pos.side_to_move() == WHITE ? v : -v; + ss << "\nNNUE evaluation: " << to_cp(v) << " (white side)\n"; + } + + v = evaluate(pos); + v = pos.side_to_move() == WHITE ? v : -v; + ss << "\nFinal evaluation: " << to_cp(v) << " (white side)\n"; return ss.str(); } diff --git a/src/material.cpp b/src/material.cpp index 0ef9926f..870a5e11 100644 --- a/src/material.cpp +++ b/src/material.cpp @@ -130,7 +130,7 @@ Entry* probe(const Position& pos) { Value npm_w = pos.non_pawn_material(WHITE); Value npm_b = pos.non_pawn_material(BLACK); - Value npm = Utility::clamp(npm_w + npm_b, EndgameLimit, MidgameLimit); + Value npm = std::clamp(npm_w + npm_b, EndgameLimit, MidgameLimit); // Map total non-pawn material into [PHASE_ENDGAME, PHASE_MIDGAME] e->gamePhase = Phase(((npm - EndgameLimit) * PHASE_MIDGAME) / (MidgameLimit - EndgameLimit)); diff --git a/src/misc.cpp b/src/misc.cpp index 725450c2..851280fe 100644 --- a/src/misc.cpp +++ b/src/misc.cpp @@ -51,6 +51,11 @@ typedef bool(*fun3_t)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY); #include #endif +#if defined(__APPLE__) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32)) +#define POSIXALIGNEDALLOC +#include +#endif + #include "misc.h" #include "thread.h" @@ -214,26 +219,33 @@ const std::string compiler_info() { compiler += "\nCompilation settings include: "; compiler += (Is64Bit ? " 64bit" : " 32bit"); + #if defined(USE_VNNI) + compiler += " VNNI"; + #endif #if defined(USE_AVX512) compiler += " AVX512"; #endif + compiler += (HasPext ? " BMI2" : ""); #if defined(USE_AVX2) compiler += " AVX2"; #endif - #if defined(USE_SSE42) - compiler += " SSE42"; - #endif #if defined(USE_SSE41) compiler += " SSE41"; #endif #if defined(USE_SSSE3) compiler += " SSSE3"; #endif - #if defined(USE_SSE3) - compiler += " SSE3"; + #if defined(USE_SSE2) + compiler += " SSE2"; #endif - compiler += (HasPext ? " BMI2" : ""); - compiler += (HasPopCnt ? " POPCNT" : ""); + compiler += (HasPopCnt ? " POPCNT" : ""); + #if defined(USE_MMX) + compiler += " MMX"; + #endif + #if defined(USE_NEON) + compiler += " NEON"; + #endif + #if !defined(NDEBUG) compiler += " DEBUG"; #endif @@ -316,14 +328,17 @@ void prefetch(void* addr) { #endif -/// Wrappers for systems where the c++17 implementation doesn't guarantee the availability of aligned_alloc. -/// Memory allocated with std_aligned_alloc must be freed with std_aligned_free. -/// + +/// std_aligned_alloc() is our wrapper for systems where the c++17 implementation +/// does not guarantee the availability of aligned_alloc(). Memory allocated with +/// std_aligned_alloc() must be freed with std_aligned_free(). void* std_aligned_alloc(size_t alignment, size_t size) { -#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32)) - return aligned_alloc(alignment, size); -#elif (defined(_WIN32) || (defined(__APPLE__) && !defined(_LIBCPP_HAS_C11_FEATURES))) + +#if defined(POSIXALIGNEDALLOC) + void *mem; + return posix_memalign(&mem, alignment, size) ? nullptr : mem; +#elif defined(_WIN32) return _mm_malloc(size, alignment); #else return std::aligned_alloc(alignment, size); @@ -331,16 +346,17 @@ void* std_aligned_alloc(size_t alignment, size_t size) { } void std_aligned_free(void* ptr) { -#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32)) + +#if defined(POSIXALIGNEDALLOC) free(ptr); -#elif (defined(_WIN32) || (defined(__APPLE__) && !defined(_LIBCPP_HAS_C11_FEATURES))) +#elif defined(_WIN32) _mm_free(ptr); #else free(ptr); #endif } -/// aligned_ttmem_alloc() will return suitably aligned memory, and if possible use large pages. +/// aligned_ttmem_alloc() will return suitably aligned memory, if possible using large pages. /// The returned pointer is the aligned one, while the mem argument is the one that needs /// to be passed to free. With c++17 some of this functionality could be simplified. @@ -352,7 +368,9 @@ void* aligned_ttmem_alloc(size_t allocSize, void*& mem) { size_t size = ((allocSize + alignment - 1) / alignment) * alignment; // multiple of alignment if (posix_memalign(&mem, alignment, size)) mem = nullptr; +#if defined(MADV_HUGEPAGE) madvise(mem, allocSize, MADV_HUGEPAGE); +#endif return mem; } diff --git a/src/misc.h b/src/misc.h index ecef028f..19bb008c 100644 --- a/src/misc.h +++ b/src/misc.h @@ -67,14 +67,6 @@ std::ostream& operator<<(std::ostream&, SyncCout); #define sync_cout std::cout << IO_LOCK #define sync_endl std::endl << IO_UNLOCK -namespace Utility { - -/// Clamp a value between lo and hi. Available in c++17. -template constexpr const T& clamp(const T& v, const T& lo, const T& hi) { - return v < lo ? lo : v > hi ? hi : v; -} - -} /// xorshift64star Pseudo-Random Number Generator /// This class is based on original code written and dedicated diff --git a/src/movegen.cpp b/src/movegen.cpp index d74df4c3..3340f65c 100644 --- a/src/movegen.cpp +++ b/src/movegen.cpp @@ -248,7 +248,7 @@ namespace { *moveList++ = make_move(ksq, pop_lsb(&b)); if ((Type != CAPTURES) && pos.can_castle(Us & ANY_CASTLING)) - for(CastlingRights cr : { Us & KING_SIDE, Us & QUEEN_SIDE } ) + for (CastlingRights cr : { Us & KING_SIDE, Us & QUEEN_SIDE } ) if (!pos.castling_impeded(cr) && pos.can_castle(cr)) *moveList++ = make(ksq, pos.castling_rook_square(cr)); } diff --git a/src/movepick.cpp b/src/movepick.cpp index 96a44449..153d323e 100644 --- a/src/movepick.cpp +++ b/src/movepick.cpp @@ -182,7 +182,7 @@ top: --endMoves; ++stage; - /* fallthrough */ + [[fallthrough]]; case REFUTATION: if (select([&](){ return *cur != MOVE_NONE @@ -190,7 +190,7 @@ top: && pos.pseudo_legal(*cur); })) return *(cur - 1); ++stage; - /* fallthrough */ + [[fallthrough]]; case QUIET_INIT: if (!skipQuiets) @@ -203,7 +203,7 @@ top: } ++stage; - /* fallthrough */ + [[fallthrough]]; case QUIET: if ( !skipQuiets @@ -217,7 +217,7 @@ top: endMoves = endBadCaptures; ++stage; - /* fallthrough */ + [[fallthrough]]; case BAD_CAPTURE: return select([](){ return true; }); @@ -228,7 +228,7 @@ top: score(); ++stage; - /* fallthrough */ + [[fallthrough]]; case EVASION: return select([](){ return true; }); @@ -246,14 +246,14 @@ top: return MOVE_NONE; ++stage; - /* fallthrough */ + [[fallthrough]]; case QCHECK_INIT: cur = moves; endMoves = generate(pos, cur); ++stage; - /* fallthrough */ + [[fallthrough]]; case QCHECK: return select([](){ return true; }); diff --git a/src/movepick.h b/src/movepick.h index f080935a..4c0ad551 100644 --- a/src/movepick.h +++ b/src/movepick.h @@ -86,9 +86,9 @@ enum StatsType { NoCaptures, Captures }; /// the move's from and to squares, see www.chessprogramming.org/Butterfly_Boards typedef Stats ButterflyHistory; -/// At higher depths LowPlyHistory records successful quiet moves near the root and quiet -/// moves which are/were in the PV (ttPv) -/// It is cleared with each new search and filled during iterative deepening +/// At higher depths LowPlyHistory records successful quiet moves near the root +/// and quiet moves which are/were in the PV (ttPv). It is cleared with each new +/// search and filled during iterative deepening. constexpr int MAX_LPH = 4; typedef Stats LowPlyHistory; diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp index a28a4573..a2845c96 100644 --- a/src/nnue/evaluate_nnue.cpp +++ b/src/nnue/evaluate_nnue.cpp @@ -29,30 +29,29 @@ #include "evaluate_nnue.h" -ExtPieceSquare kpp_board_index[PIECE_NB] = { - // convention: W - us, B - them - // viewed from other side, W and B are reversed - { PS_NONE, PS_NONE }, - { PS_W_PAWN, PS_B_PAWN }, - { PS_W_KNIGHT, PS_B_KNIGHT }, - { PS_W_BISHOP, PS_B_BISHOP }, - { PS_W_ROOK, PS_B_ROOK }, - { PS_W_QUEEN, PS_B_QUEEN }, - { PS_W_KING, PS_B_KING }, - { PS_NONE, PS_NONE }, - { PS_NONE, PS_NONE }, - { PS_B_PAWN, PS_W_PAWN }, - { PS_B_KNIGHT, PS_W_KNIGHT }, - { PS_B_BISHOP, PS_W_BISHOP }, - { PS_B_ROOK, PS_W_ROOK }, - { PS_B_QUEEN, PS_W_QUEEN }, - { PS_B_KING, PS_W_KING }, - { PS_NONE, PS_NONE } -}; - - namespace Eval::NNUE { + uint32_t kpp_board_index[PIECE_NB][COLOR_NB] = { + // convention: W - us, B - them + // viewed from other side, W and B are reversed + { PS_NONE, PS_NONE }, + { PS_W_PAWN, PS_B_PAWN }, + { PS_W_KNIGHT, PS_B_KNIGHT }, + { PS_W_BISHOP, PS_B_BISHOP }, + { PS_W_ROOK, PS_B_ROOK }, + { PS_W_QUEEN, PS_B_QUEEN }, + { PS_W_KING, PS_B_KING }, + { PS_NONE, PS_NONE }, + { PS_NONE, PS_NONE }, + { PS_B_PAWN, PS_W_PAWN }, + { PS_B_KNIGHT, PS_W_KNIGHT }, + { PS_B_BISHOP, PS_W_BISHOP }, + { PS_B_ROOK, PS_W_ROOK }, + { PS_B_QUEEN, PS_W_QUEEN }, + { PS_B_KING, PS_W_KING }, + { PS_NONE, PS_NONE } + }; + // Input feature converter AlignedPtr feature_transformer; @@ -86,7 +85,7 @@ namespace Eval::NNUE { bool ReadParameters(std::istream& stream, const AlignedPtr& pointer) { std::uint32_t header; - stream.read(reinterpret_cast(&header), sizeof(header)); + header = read_little_endian(stream); if (!stream || header != T::GetHashValue()) return false; return pointer->ReadParameters(stream); } @@ -109,13 +108,13 @@ namespace Eval::NNUE { } // Read network header - bool ReadHeader(std::istream& stream, - std::uint32_t* hash_value, std::string* architecture) { - + bool ReadHeader(std::istream& stream, std::uint32_t* hash_value, std::string* architecture) + { std::uint32_t version, size; - stream.read(reinterpret_cast(&version), sizeof(version)); - stream.read(reinterpret_cast(hash_value), sizeof(*hash_value)); - stream.read(reinterpret_cast(&size), sizeof(size)); + + version = read_little_endian(stream); + *hash_value = read_little_endian(stream); + size = read_little_endian(stream); if (!stream || version != kVersion) return false; architecture->resize(size); stream.read(&(*architecture)[0], size); @@ -202,10 +201,7 @@ namespace Eval::NNUE { // Evaluation function. Perform differential calculation. Value evaluate(const Position& pos) { - Value v = ComputeScore(pos, false); - v = Utility::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1); - - return v; + return ComputeScore(pos, false); } // Evaluation function. Perform full calculation. diff --git a/src/nnue/features/feature_set.h b/src/nnue/features/feature_set.h index ec34a486..b933d2d9 100644 --- a/src/nnue/features/feature_set.h +++ b/src/nnue/features/feature_set.h @@ -106,8 +106,7 @@ namespace Eval::NNUE::Features { reset[perspective] = false; switch (trigger) { case TriggerEvent::kFriendKingMoved: - reset[perspective] = - dp.pieceId[0] == PIECE_ID_KING + perspective; + reset[perspective] = dp.piece[0] == make_piece(perspective, KING); break; default: assert(false); diff --git a/src/nnue/features/half_kp.cpp b/src/nnue/features/half_kp.cpp index 628add6e..88e384a3 100644 --- a/src/nnue/features/half_kp.cpp +++ b/src/nnue/features/half_kp.cpp @@ -23,25 +23,17 @@ namespace Eval::NNUE::Features { - // Find the index of the feature quantity from the king position and PieceSquare - template - inline IndexType HalfKP::MakeIndex(Square sq_k, PieceSquare p) { - return static_cast(PS_END) * static_cast(sq_k) + p; + // Orient a square according to perspective (rotates by 180 for black) + inline Square orient(Color perspective, Square s) { + return Square(int(s) ^ (bool(perspective) * 63)); } - // Get pieces information + // Find the index of the feature quantity from the king position and PieceSquare template - inline void HalfKP::GetPieces( - const Position& pos, Color perspective, - PieceSquare** pieces, Square* sq_target_k) { + inline IndexType HalfKP::MakeIndex( + Color perspective, Square s, Piece pc, Square ksq) { - *pieces = (perspective == BLACK) ? - pos.eval_list()->piece_list_fb() : - pos.eval_list()->piece_list_fw(); - const PieceId target = (AssociatedKing == Side::kFriend) ? - static_cast(PIECE_ID_KING + perspective) : - static_cast(PIECE_ID_KING + ~perspective); - *sq_target_k = static_cast(((*pieces)[target] - PS_W_KING) % SQUARE_NB); + return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective] + PS_END * ksq); } // Get a list of indices for active features @@ -49,16 +41,11 @@ namespace Eval::NNUE::Features { void HalfKP::AppendActiveIndices( const Position& pos, Color perspective, IndexList* active) { - // Do nothing if array size is small to avoid compiler warning - if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return; - - PieceSquare* pieces; - Square sq_target_k; - GetPieces(pos, perspective, &pieces, &sq_target_k); - for (PieceId i = PIECE_ID_ZERO; i < PIECE_ID_KING; ++i) { - if (pieces[i] != PS_NONE) { - active->push_back(MakeIndex(sq_target_k, pieces[i])); - } + Square ksq = orient(perspective, pos.square(perspective)); + Bitboard bb = pos.pieces() & ~pos.pieces(KING); + while (bb) { + Square s = pop_lsb(&bb); + active->push_back(MakeIndex(perspective, s, pos.piece_on(s), ksq)); } } @@ -68,22 +55,15 @@ namespace Eval::NNUE::Features { const Position& pos, Color perspective, IndexList* removed, IndexList* added) { - PieceSquare* pieces; - Square sq_target_k; - GetPieces(pos, perspective, &pieces, &sq_target_k); + Square ksq = orient(perspective, pos.square(perspective)); const auto& dp = pos.state()->dirtyPiece; for (int i = 0; i < dp.dirty_num; ++i) { - if (dp.pieceId[i] >= PIECE_ID_KING) continue; - const auto old_p = static_cast( - dp.old_piece[i].from[perspective]); - if (old_p != PS_NONE) { - removed->push_back(MakeIndex(sq_target_k, old_p)); - } - const auto new_p = static_cast( - dp.new_piece[i].from[perspective]); - if (new_p != PS_NONE) { - added->push_back(MakeIndex(sq_target_k, new_p)); - } + Piece pc = dp.piece[i]; + if (type_of(pc) == KING) continue; + if (dp.from[i] != SQ_NONE) + removed->push_back(MakeIndex(perspective, dp.from[i], pc, ksq)); + if (dp.to[i] != SQ_NONE) + added->push_back(MakeIndex(perspective, dp.to[i], pc, ksq)); } } diff --git a/src/nnue/features/half_kp.h b/src/nnue/features/half_kp.h index 99842eea..ee6a8df3 100644 --- a/src/nnue/features/half_kp.h +++ b/src/nnue/features/half_kp.h @@ -41,7 +41,7 @@ namespace Eval::NNUE::Features { static constexpr IndexType kDimensions = static_cast(SQUARE_NB) * static_cast(PS_END); // Maximum number of simultaneously active features - static constexpr IndexType kMaxActiveDimensions = PIECE_ID_KING; + static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count // Trigger for full calculation instead of difference calculation static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kFriendKingMoved; @@ -53,13 +53,9 @@ namespace Eval::NNUE::Features { static void AppendChangedIndices(const Position& pos, Color perspective, IndexList* removed, IndexList* added); - // Index of a feature for a given king position and another piece on some square - static IndexType MakeIndex(Square sq_k, PieceSquare p); - private: - // Get pieces information - static void GetPieces(const Position& pos, Color perspective, - PieceSquare** pieces, Square* sq_target_k); + // Index of a feature for a given king position and another piece on some square + static IndexType MakeIndex(Color perspective, Square s, Piece pc, Square sq_k); }; } // namespace Eval::NNUE::Features diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index 7336be52..f24578a8 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -70,11 +70,10 @@ namespace Eval::NNUE::Layers { // Read network parameters bool ReadParameters(std::istream& stream) { if (!previous_layer_.ReadParameters(stream)) return false; - stream.read(reinterpret_cast(biases_), - kOutputDimensions * sizeof(BiasType)); - stream.read(reinterpret_cast(weights_), - kOutputDimensions * kPaddedInputDimensions * - sizeof(WeightType)); + for (std::size_t i = 0; i < kOutputDimensions; ++i) + biases_[i] = read_little_endian(stream); + for (std::size_t i = 0; i < kOutputDimensions * kPaddedInputDimensions; ++i) + weights_[i] = read_little_endian(stream); return !stream.fail(); } @@ -98,19 +97,32 @@ namespace Eval::NNUE::Layers { #if defined(USE_AVX512) constexpr IndexType kNumChunks = kPaddedInputDimensions / (kSimdWidth * 2); - const __m512i kOnes = _mm512_set1_epi16(1); const auto input_vector = reinterpret_cast(input); + #if !defined(USE_VNNI) + const __m512i kOnes = _mm512_set1_epi16(1); + #endif #elif defined(USE_AVX2) constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; - const __m256i kOnes = _mm256_set1_epi16(1); const auto input_vector = reinterpret_cast(input); + #if !defined(USE_VNNI) + const __m256i kOnes = _mm256_set1_epi16(1); + #endif - #elif defined(USE_SSSE3) + #elif defined(USE_SSE2) constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; + #ifndef USE_SSSE3 + const __m128i kZeros = _mm_setzero_si128(); + #else const __m128i kOnes = _mm_set1_epi16(1); + #endif const auto input_vector = reinterpret_cast(input); + #elif defined(USE_MMX) + constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; + const __m64 kZeros = _mm_setzero_si64(); + const auto input_vector = reinterpret_cast(input); + #elif defined(USE_NEON) constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; const auto input_vector = reinterpret_cast(input); @@ -123,60 +135,115 @@ namespace Eval::NNUE::Layers { __m512i sum = _mm512_setzero_si512(); const auto row = reinterpret_cast(&weights_[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { - __m512i product = _mm512_maddubs_epi16( - _mm512_load_si512(&input_vector[j]), _mm512_load_si512(&row[j])); + #if defined(USE_VNNI) + sum = _mm512_dpbusd_epi32(sum, _mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j])); + #else + __m512i product = _mm512_maddubs_epi16(_mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j])); product = _mm512_madd_epi16(product, kOnes); sum = _mm512_add_epi32(sum, product); + #endif } - output[i] = _mm512_reduce_add_epi32(sum) + biases_[i]; // Note: Changing kMaxSimdWidth from 32 to 64 breaks loading existing networks. // As a result kPaddedInputDimensions may not be an even multiple of 64(512bit) // and we have to do one more 256bit chunk. if (kPaddedInputDimensions != kNumChunks * kSimdWidth * 2) { - const auto iv_256 = reinterpret_cast(input); - const auto row_256 = reinterpret_cast(&weights_[offset]); - int j = kNumChunks * 2; - - __m256i sum256 = _mm256_maddubs_epi16( - _mm256_load_si256(&iv_256[j]), _mm256_load_si256(&row_256[j])); - sum256 = _mm256_madd_epi16(sum256, _mm256_set1_epi16(1)); - sum256 = _mm256_hadd_epi32(sum256, sum256); - sum256 = _mm256_hadd_epi32(sum256, sum256); - const __m128i lo = _mm256_extracti128_si256(sum256, 0); - const __m128i hi = _mm256_extracti128_si256(sum256, 1); - output[i] += _mm_cvtsi128_si32(lo) + _mm_cvtsi128_si32(hi); + const auto iv256 = reinterpret_cast(&input_vector[kNumChunks]); + const auto row256 = reinterpret_cast(&row[kNumChunks]); + #if defined(USE_VNNI) + __m256i product256 = _mm256_dpbusd_epi32( + _mm512_castsi512_si256(sum), _mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0])); + sum = _mm512_inserti32x8(sum, product256, 0); + #else + __m256i product256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0])); + sum = _mm512_add_epi32(sum, _mm512_cvtepi16_epi32(product256)); + #endif } + output[i] = _mm512_reduce_add_epi32(sum) + biases_[i]; #elif defined(USE_AVX2) __m256i sum = _mm256_setzero_si256(); const auto row = reinterpret_cast(&weights_[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { - __m256i product = _mm256_maddubs_epi16( - _mm256_load_si256(&input_vector[j]), _mm256_load_si256(&row[j])); + #if defined(USE_VNNI) + sum = _mm256_dpbusd_epi32(sum, _mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j])); + #else + __m256i product = _mm256_maddubs_epi16(_mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j])); product = _mm256_madd_epi16(product, kOnes); sum = _mm256_add_epi32(sum, product); + #endif } - sum = _mm256_hadd_epi32(sum, sum); - sum = _mm256_hadd_epi32(sum, sum); - const __m128i lo = _mm256_extracti128_si256(sum, 0); - const __m128i hi = _mm256_extracti128_si256(sum, 1); - output[i] = _mm_cvtsi128_si32(lo) + _mm_cvtsi128_si32(hi) + biases_[i]; + __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1)); + sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC)); + sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB)); + output[i] = _mm_cvtsi128_si32(sum128) + biases_[i]; #elif defined(USE_SSSE3) - __m128i sum = _mm_cvtsi32_si128(biases_[i]); + __m128i sum = _mm_setzero_si128(); const auto row = reinterpret_cast(&weights_[offset]); - for (IndexType j = 0; j < kNumChunks; ++j) { - __m128i product = _mm_maddubs_epi16( - _mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j])); + for (int j = 0; j < (int)kNumChunks - 1; j += 2) { + __m128i product0 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j])); + product0 = _mm_madd_epi16(product0, kOnes); + sum = _mm_add_epi32(sum, product0); + __m128i product1 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j+1]), _mm_load_si128(&row[j+1])); + product1 = _mm_madd_epi16(product1, kOnes); + sum = _mm_add_epi32(sum, product1); + } + if (kNumChunks & 0x1) { + __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[kNumChunks-1]), _mm_load_si128(&row[kNumChunks-1])); product = _mm_madd_epi16(product, kOnes); sum = _mm_add_epi32(sum, product); } - sum = _mm_hadd_epi32(sum, sum); - sum = _mm_hadd_epi32(sum, sum); + sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC + sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB + output[i] = _mm_cvtsi128_si32(sum) + biases_[i]; + + #elif defined(USE_SSE2) + __m128i sum_lo = _mm_cvtsi32_si128(biases_[i]); + __m128i sum_hi = kZeros; + const auto row = reinterpret_cast(&weights_[offset]); + for (IndexType j = 0; j < kNumChunks; ++j) { + __m128i row_j = _mm_load_si128(&row[j]); + __m128i input_j = _mm_load_si128(&input_vector[j]); + __m128i row_signs = _mm_cmpgt_epi8(kZeros, row_j); + __m128i extended_row_lo = _mm_unpacklo_epi8(row_j, row_signs); + __m128i extended_row_hi = _mm_unpackhi_epi8(row_j, row_signs); + __m128i extended_input_lo = _mm_unpacklo_epi8(input_j, kZeros); + __m128i extended_input_hi = _mm_unpackhi_epi8(input_j, kZeros); + __m128i product_lo = _mm_madd_epi16(extended_row_lo, extended_input_lo); + __m128i product_hi = _mm_madd_epi16(extended_row_hi, extended_input_hi); + sum_lo = _mm_add_epi32(sum_lo, product_lo); + sum_hi = _mm_add_epi32(sum_hi, product_hi); + } + __m128i sum = _mm_add_epi32(sum_lo, sum_hi); + __m128i sum_high_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2)); + sum = _mm_add_epi32(sum, sum_high_64); + __m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2)); + sum = _mm_add_epi32(sum, sum_second_32); output[i] = _mm_cvtsi128_si32(sum); + #elif defined(USE_MMX) + __m64 sum_lo = _mm_cvtsi32_si64(biases_[i]); + __m64 sum_hi = kZeros; + const auto row = reinterpret_cast(&weights_[offset]); + for (IndexType j = 0; j < kNumChunks; ++j) { + __m64 row_j = row[j]; + __m64 input_j = input_vector[j]; + __m64 row_signs = _mm_cmpgt_pi8(kZeros, row_j); + __m64 extended_row_lo = _mm_unpacklo_pi8(row_j, row_signs); + __m64 extended_row_hi = _mm_unpackhi_pi8(row_j, row_signs); + __m64 extended_input_lo = _mm_unpacklo_pi8(input_j, kZeros); + __m64 extended_input_hi = _mm_unpackhi_pi8(input_j, kZeros); + __m64 product_lo = _mm_madd_pi16(extended_row_lo, extended_input_lo); + __m64 product_hi = _mm_madd_pi16(extended_row_hi, extended_input_hi); + sum_lo = _mm_add_pi32(sum_lo, product_lo); + sum_hi = _mm_add_pi32(sum_hi, product_hi); + } + __m64 sum = _mm_add_pi32(sum_lo, sum_hi); + sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum)); + output[i] = _mm_cvtsi64_si32(sum); + #elif defined(USE_NEON) int32x4_t sum = {biases_[i]}; const auto row = reinterpret_cast(&weights_[offset]); @@ -196,6 +263,9 @@ namespace Eval::NNUE::Layers { #endif } + #if defined(USE_MMX) + _mm_empty(); + #endif return output; } diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h index 9b5a5f5f..d923986e 100644 --- a/src/nnue/layers/clipped_relu.h +++ b/src/nnue/layers/clipped_relu.h @@ -86,18 +86,17 @@ namespace Eval::NNUE::Layers { const auto out = reinterpret_cast<__m256i*>(output); for (IndexType i = 0; i < kNumChunks; ++i) { const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32( - _mm256_load_si256(&in[i * 4 + 0]), - _mm256_load_si256(&in[i * 4 + 1])), kWeightScaleBits); + _mm256_loadA_si256(&in[i * 4 + 0]), + _mm256_loadA_si256(&in[i * 4 + 1])), kWeightScaleBits); const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32( - _mm256_load_si256(&in[i * 4 + 2]), - _mm256_load_si256(&in[i * 4 + 3])), kWeightScaleBits); - _mm256_store_si256( - &out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8( + _mm256_loadA_si256(&in[i * 4 + 2]), + _mm256_loadA_si256(&in[i * 4 + 3])), kWeightScaleBits); + _mm256_storeA_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8( _mm256_packs_epi16(words0, words1), kZero), kOffsets)); } constexpr IndexType kStart = kNumChunks * kSimdWidth; - #elif defined(USE_SSSE3) + #elif defined(USE_SSE2) constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth; #ifdef USE_SSE41 @@ -128,6 +127,24 @@ namespace Eval::NNUE::Layers { } constexpr IndexType kStart = kNumChunks * kSimdWidth; + #elif defined(USE_MMX) + constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth; + const __m64 k0x80s = _mm_set1_pi8(-128); + const auto in = reinterpret_cast(input); + const auto out = reinterpret_cast<__m64*>(output); + for (IndexType i = 0; i < kNumChunks; ++i) { + const __m64 words0 = _mm_srai_pi16( + _mm_packs_pi32(in[i * 4 + 0], in[i * 4 + 1]), + kWeightScaleBits); + const __m64 words1 = _mm_srai_pi16( + _mm_packs_pi32(in[i * 4 + 2], in[i * 4 + 3]), + kWeightScaleBits); + const __m64 packedbytes = _mm_packs_pi16(words0, words1); + out[i] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s); + } + _mm_empty(); + constexpr IndexType kStart = kNumChunks * kSimdWidth; + #elif defined(USE_NEON) constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2); const int8x8_t kZero = {0}; diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h index 2a354a3c..69dfaad2 100644 --- a/src/nnue/nnue_accumulator.h +++ b/src/nnue/nnue_accumulator.h @@ -26,7 +26,7 @@ namespace Eval::NNUE { // Class that holds the result of affine transformation of input features - struct alignas(32) Accumulator { + struct alignas(kCacheLineSize) Accumulator { std::int16_t accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions]; Value score; diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h index 36fda7d7..d7ffa21a 100644 --- a/src/nnue/nnue_common.h +++ b/src/nnue/nnue_common.h @@ -21,6 +21,9 @@ #ifndef NNUE_COMMON_H_INCLUDED #define NNUE_COMMON_H_INCLUDED +#include +#include + #if defined(USE_AVX2) #include @@ -33,10 +36,36 @@ #elif defined(USE_SSE2) #include +#elif defined(USE_MMX) +#include + #elif defined(USE_NEON) #include #endif +// HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Otherwise a binary +// compiled with older g++ crashes because the output memory is not aligned +// even though alignas is specified. +#if defined(USE_AVX2) +#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32) && !defined(__clang__) +#define _mm256_loadA_si256 _mm256_loadu_si256 +#define _mm256_storeA_si256 _mm256_storeu_si256 +#else +#define _mm256_loadA_si256 _mm256_load_si256 +#define _mm256_storeA_si256 _mm256_store_si256 +#endif +#endif + +#if defined(USE_AVX512) +#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32) && !defined(__clang__) +#define _mm512_loadA_si512 _mm512_loadu_si512 +#define _mm512_storeA_si512 _mm512_storeu_si512 +#else +#define _mm512_loadA_si512 _mm512_load_si512 +#define _mm512_storeA_si512 _mm512_store_si512 +#endif +#endif + namespace Eval::NNUE { // Version of the evaluation file @@ -56,12 +85,36 @@ namespace Eval::NNUE { #elif defined(USE_SSE2) constexpr std::size_t kSimdWidth = 16; + #elif defined(USE_MMX) + constexpr std::size_t kSimdWidth = 8; + #elif defined(USE_NEON) constexpr std::size_t kSimdWidth = 16; #endif constexpr std::size_t kMaxSimdWidth = 32; + // unique number for each piece type on each square + enum { + PS_NONE = 0, + PS_W_PAWN = 1, + PS_B_PAWN = 1 * SQUARE_NB + 1, + PS_W_KNIGHT = 2 * SQUARE_NB + 1, + PS_B_KNIGHT = 3 * SQUARE_NB + 1, + PS_W_BISHOP = 4 * SQUARE_NB + 1, + PS_B_BISHOP = 5 * SQUARE_NB + 1, + PS_W_ROOK = 6 * SQUARE_NB + 1, + PS_B_ROOK = 7 * SQUARE_NB + 1, + PS_W_QUEEN = 8 * SQUARE_NB + 1, + PS_B_QUEEN = 9 * SQUARE_NB + 1, + PS_W_KING = 10 * SQUARE_NB + 1, + PS_END = PS_W_KING, // pieces without kings (pawns included) + PS_B_KING = 11 * SQUARE_NB + 1, + PS_END2 = 12 * SQUARE_NB + 1 + }; + + extern uint32_t kpp_board_index[PIECE_NB][COLOR_NB]; + // Type of input feature after conversion using TransformedFeatureType = std::uint8_t; using IndexType = std::uint32_t; @@ -73,7 +126,25 @@ namespace Eval::NNUE { // Round n up to be a multiple of base template constexpr IntType CeilToMultiple(IntType n, IntType base) { - return (n + base - 1) / base * base; + return (n + base - 1) / base * base; + } + + // read_little_endian() is our utility to read an integer (signed or unsigned, any size) + // from a stream in little-endian order. We swap the byte order after the read if + // necessary to return a result with the byte ordering of the compiling machine. + template + inline IntType read_little_endian(std::istream& stream) { + + IntType result; + std::uint8_t u[sizeof(IntType)]; + typename std::make_unsigned::type v = 0; + + stream.read(reinterpret_cast(u), sizeof(IntType)); + for (std::size_t i = 0; i < sizeof(IntType); ++i) + v = (v << 8) | u[sizeof(IntType) - i - 1]; + + std::memcpy(&result, &v, sizeof(IntType)); + return result; } } // namespace Eval::NNUE diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 29e6db6e..e1bc2ab8 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -62,10 +62,10 @@ namespace Eval::NNUE { // Read network parameters bool ReadParameters(std::istream& stream) { - stream.read(reinterpret_cast(biases_), - kHalfDimensions * sizeof(BiasType)); - stream.read(reinterpret_cast(weights_), - kHalfDimensions * kInputDimensions * sizeof(WeightType)); + for (std::size_t i = 0; i < kHalfDimensions; ++i) + biases_[i] = read_little_endian(stream); + for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i) + weights_[i] = read_little_endian(stream); return !stream.fail(); } @@ -104,7 +104,7 @@ namespace Eval::NNUE { constexpr int kControl = 0b11011000; const __m256i kZero = _mm256_setzero_si256(); - #elif defined(USE_SSSE3) + #elif defined(USE_SSE2) constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth; #ifdef USE_SSE41 @@ -113,6 +113,10 @@ namespace Eval::NNUE { const __m128i k0x80s = _mm_set1_epi8(-128); #endif + #elif defined(USE_MMX) + constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth; + const __m64 k0x80s = _mm_set1_pi8(-128); + #elif defined(USE_NEON) constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); const int8x8_t kZero = {0}; @@ -125,17 +129,15 @@ namespace Eval::NNUE { #if defined(USE_AVX2) auto out = reinterpret_cast<__m256i*>(&output[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { - __m256i sum0 = - _mm256_load_si256(&reinterpret_cast( - accumulation[perspectives[p]][0])[j * 2 + 0]); - __m256i sum1 = - _mm256_load_si256(&reinterpret_cast( - accumulation[perspectives[p]][0])[j * 2 + 1]); - _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8( + __m256i sum0 = _mm256_loadA_si256( + &reinterpret_cast(accumulation[perspectives[p]][0])[j * 2 + 0]); + __m256i sum1 = _mm256_loadA_si256( + &reinterpret_cast(accumulation[perspectives[p]][0])[j * 2 + 1]); + _mm256_storeA_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8( _mm256_packs_epi16(sum0, sum1), kZero), kControl)); } - #elif defined(USE_SSSE3) + #elif defined(USE_SSE2) auto out = reinterpret_cast<__m128i*>(&output[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { __m128i sum0 = _mm_load_si128(&reinterpret_cast( @@ -155,6 +157,17 @@ namespace Eval::NNUE { ); } + #elif defined(USE_MMX) + auto out = reinterpret_cast<__m64*>(&output[offset]); + for (IndexType j = 0; j < kNumChunks; ++j) { + __m64 sum0 = *(&reinterpret_cast( + accumulation[perspectives[p]][0])[j * 2 + 0]); + __m64 sum1 = *(&reinterpret_cast( + accumulation[perspectives[p]][0])[j * 2 + 1]); + const __m64 packedbytes = _mm_packs_pi16(sum0, sum1); + out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s); + } + #elif defined(USE_NEON) const auto out = reinterpret_cast(&output[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { @@ -172,6 +185,9 @@ namespace Eval::NNUE { #endif } + #if defined(USE_MMX) + _mm_empty(); + #endif } private: @@ -187,23 +203,37 @@ namespace Eval::NNUE { kHalfDimensions * sizeof(BiasType)); for (const auto index : active_indices[perspective]) { const IndexType offset = kHalfDimensions * index; + #if defined(USE_AVX512) + auto accumulation = reinterpret_cast<__m512i*>( + &accumulator.accumulation[perspective][i][0]); + auto column = reinterpret_cast(&weights_[offset]); + constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth; + for (IndexType j = 0; j < kNumChunks; ++j) + _mm512_storeA_si512(&accumulation[j], _mm512_add_epi16(_mm512_loadA_si512(&accumulation[j]), column[j])); - #if defined(USE_AVX2) + #elif defined(USE_AVX2) auto accumulation = reinterpret_cast<__m256i*>( &accumulator.accumulation[perspective][i][0]); auto column = reinterpret_cast(&weights_[offset]); constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); - for (IndexType j = 0; j < kNumChunks; ++j) { - accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]); - } + for (IndexType j = 0; j < kNumChunks; ++j) + _mm256_storeA_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadA_si256(&accumulation[j]), column[j])); #elif defined(USE_SSE2) auto accumulation = reinterpret_cast<__m128i*>( &accumulator.accumulation[perspective][i][0]); auto column = reinterpret_cast(&weights_[offset]); constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); - for (IndexType j = 0; j < kNumChunks; ++j) { + for (IndexType j = 0; j < kNumChunks; ++j) accumulation[j] = _mm_add_epi16(accumulation[j], column[j]); + + #elif defined(USE_MMX) + auto accumulation = reinterpret_cast<__m64*>( + &accumulator.accumulation[perspective][i][0]); + auto column = reinterpret_cast(&weights_[offset]); + constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); + for (IndexType j = 0; j < kNumChunks; ++j) { + accumulation[j] = _mm_add_pi16(accumulation[j], column[j]); } #elif defined(USE_NEON) @@ -211,18 +241,19 @@ namespace Eval::NNUE { &accumulator.accumulation[perspective][i][0]); auto column = reinterpret_cast(&weights_[offset]); constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); - for (IndexType j = 0; j < kNumChunks; ++j) { + for (IndexType j = 0; j < kNumChunks; ++j) accumulation[j] = vaddq_s16(accumulation[j], column[j]); - } #else - for (IndexType j = 0; j < kHalfDimensions; ++j) { + for (IndexType j = 0; j < kHalfDimensions; ++j) accumulator.accumulation[perspective][i][j] += weights_[offset + j]; - } #endif } } + #if defined(USE_MMX) + _mm_empty(); + #endif accumulator.computed_accumulation = true; accumulator.computed_score = false; @@ -249,6 +280,11 @@ namespace Eval::NNUE { auto accumulation = reinterpret_cast<__m128i*>( &accumulator.accumulation[perspective][i][0]); + #elif defined(USE_MMX) + constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); + auto accumulation = reinterpret_cast<__m64*>( + &accumulator.accumulation[perspective][i][0]); + #elif defined(USE_NEON) constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); auto accumulation = reinterpret_cast( @@ -278,6 +314,12 @@ namespace Eval::NNUE { accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]); } + #elif defined(USE_MMX) + auto column = reinterpret_cast(&weights_[offset]); + for (IndexType j = 0; j < kNumChunks; ++j) { + accumulation[j] = _mm_sub_pi16(accumulation[j], column[j]); + } + #elif defined(USE_NEON) auto column = reinterpret_cast(&weights_[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { @@ -309,6 +351,12 @@ namespace Eval::NNUE { accumulation[j] = _mm_add_epi16(accumulation[j], column[j]); } + #elif defined(USE_MMX) + auto column = reinterpret_cast(&weights_[offset]); + for (IndexType j = 0; j < kNumChunks; ++j) { + accumulation[j] = _mm_add_pi16(accumulation[j], column[j]); + } + #elif defined(USE_NEON) auto column = reinterpret_cast(&weights_[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { @@ -325,6 +373,9 @@ namespace Eval::NNUE { } } } + #if defined(USE_MMX) + _mm_empty(); + #endif accumulator.computed_accumulation = true; accumulator.computed_score = false; diff --git a/src/pawns.cpp b/src/pawns.cpp index 868d0c8e..af0f6618 100644 --- a/src/pawns.cpp +++ b/src/pawns.cpp @@ -219,7 +219,7 @@ Score Entry::evaluate_shelter(const Position& pos, Square ksq) const { Score bonus = make_score(5, 5); - File center = Utility::clamp(file_of(ksq), FILE_B, FILE_G); + File center = std::clamp(file_of(ksq), FILE_B, FILE_G); for (File f = File(center - 1); f <= File(center + 1); ++f) { b = ourPawns & file_bb(f); diff --git a/src/position.cpp b/src/position.cpp index 46e5d78b..fe89b753 100644 --- a/src/position.cpp +++ b/src/position.cpp @@ -198,9 +198,6 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th std::fill_n(&pieceList[0][0], sizeof(pieceList) / sizeof(Square), SQ_NONE); st = si; - // Each piece on board gets a unique ID used to track the piece later - PieceId piece_id, next_piece_id = PIECE_ID_ZERO; - ss >> std::noskipws; // 1. Piece placement @@ -212,21 +209,8 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th else if (token == '/') sq += 2 * SOUTH; - else if ((idx = PieceToChar.find(token)) != string::npos) - { - auto pc = Piece(idx); - put_piece(pc, sq); - - if (Eval::useNNUE) - { - // Kings get a fixed ID, other pieces get ID in order of placement - piece_id = - (idx == W_KING) ? PIECE_ID_WKING : - (idx == B_KING) ? PIECE_ID_BKING : - next_piece_id++; - evalList.put_piece(piece_id, sq, pc); - } - + else if ((idx = PieceToChar.find(token)) != string::npos) { + put_piece(Piece(idx), sq); ++sq; } } @@ -721,8 +705,6 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) { // Used by NNUE st->accumulator.computed_accumulation = false; st->accumulator.computed_score = false; - PieceId dp0 = PIECE_ID_NONE; - PieceId dp1 = PIECE_ID_NONE; auto& dp = st->dirtyPiece; dp.dirty_num = 1; @@ -775,12 +757,10 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) { if (Eval::useNNUE) { - dp.dirty_num = 2; // 2 pieces moved - dp1 = piece_id_on(capsq); - dp.pieceId[1] = dp1; - dp.old_piece[1] = evalList.piece_with_id(dp1); - evalList.put_piece(dp1, capsq, NO_PIECE); - dp.new_piece[1] = evalList.piece_with_id(dp1); + dp.dirty_num = 2; // 1 piece moved, 1 piece captured + dp.piece[1] = captured; + dp.from[1] = capsq; + dp.to[1] = SQ_NONE; } // Update board and piece lists @@ -821,11 +801,9 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) { { if (Eval::useNNUE) { - dp0 = piece_id_on(from); - dp.pieceId[0] = dp0; - dp.old_piece[0] = evalList.piece_with_id(dp0); - evalList.put_piece(dp0, to, pc); - dp.new_piece[0] = evalList.piece_with_id(dp0); + dp.piece[0] = pc; + dp.from[0] = from; + dp.to[0] = to; } move_piece(from, to); @@ -854,9 +832,12 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) { if (Eval::useNNUE) { - dp0 = piece_id_on(to); - evalList.put_piece(dp0, to, promotion); - dp.new_piece[0] = evalList.piece_with_id(dp0); + // Promoting pawn to SQ_NONE, promoted piece from SQ_NONE + dp.to[0] = SQ_NONE; + dp.piece[dp.dirty_num] = promotion; + dp.from[dp.dirty_num] = SQ_NONE; + dp.to[dp.dirty_num] = to; + dp.dirty_num++; } // Update hash keys @@ -950,12 +931,6 @@ void Position::undo_move(Move m) { { move_piece(to, from); // Put the piece back at the source square - if (Eval::useNNUE) - { - PieceId dp0 = st->dirtyPiece.pieceId[0]; - evalList.put_piece(dp0, from, pc); - } - if (st->capturedPiece) { Square capsq = to; @@ -972,14 +947,6 @@ void Position::undo_move(Move m) { } put_piece(st->capturedPiece, capsq); // Restore the captured piece - - if (Eval::useNNUE) - { - PieceId dp1 = st->dirtyPiece.pieceId[1]; - assert(evalList.piece_with_id(dp1).from[WHITE] == PS_NONE); - assert(evalList.piece_with_id(dp1).from[BLACK] == PS_NONE); - evalList.put_piece(dp1, capsq, st->capturedPiece); - } } } @@ -1001,32 +968,16 @@ void Position::do_castling(Color us, Square from, Square& to, Square& rfrom, Squ rto = relative_square(us, kingSide ? SQ_F1 : SQ_D1); to = relative_square(us, kingSide ? SQ_G1 : SQ_C1); - if (Eval::useNNUE) + if (Do && Eval::useNNUE) { - PieceId dp0, dp1; auto& dp = st->dirtyPiece; - dp.dirty_num = 2; // 2 pieces moved - - if (Do) - { - dp0 = piece_id_on(from); - dp1 = piece_id_on(rfrom); - dp.pieceId[0] = dp0; - dp.old_piece[0] = evalList.piece_with_id(dp0); - evalList.put_piece(dp0, to, make_piece(us, KING)); - dp.new_piece[0] = evalList.piece_with_id(dp0); - dp.pieceId[1] = dp1; - dp.old_piece[1] = evalList.piece_with_id(dp1); - evalList.put_piece(dp1, rto, make_piece(us, ROOK)); - dp.new_piece[1] = evalList.piece_with_id(dp1); - } - else - { - dp0 = piece_id_on(to); - dp1 = piece_id_on(rto); - evalList.put_piece(dp0, from, make_piece(us, KING)); - evalList.put_piece(dp1, rfrom, make_piece(us, ROOK)); - } + dp.piece[0] = make_piece(us, KING); + dp.from[0] = from; + dp.to[0] = to; + dp.piece[1] = make_piece(us, ROOK); + dp.from[1] = rfrom; + dp.to[1] = rto; + dp.dirty_num = 2; } // Remove both pieces first since squares could overlap in Chess960 @@ -1145,8 +1096,8 @@ bool Position::see_ge(Move m, Value threshold) const { // Don't allow pinned pieces to attack (except the king) as long as // there are pinners on their original square. - if (st->pinners[~stm] & occupied) - stmAttackers &= ~st->blockersForKing[stm]; + if (pinners(~stm) & occupied) + stmAttackers &= ~blockers_for_king(stm); if (!stmAttackers) break; diff --git a/src/position.h b/src/position.h index b5dbaf59..e3f758e0 100644 --- a/src/position.h +++ b/src/position.h @@ -116,6 +116,7 @@ public: Bitboard checkers() const; Bitboard blockers_for_king(Color c) const; Bitboard check_squares(PieceType pt) const; + Bitboard pinners(Color c) const; bool is_discovery_check_on_king(Color c, Move m) const; // Attacks to/from a given square @@ -173,7 +174,6 @@ public: // Used by NNUE StateInfo* state() const; - const EvalList* eval_list() const; #if defined(EVAL_LEARN) // --sfenization helper @@ -208,9 +208,6 @@ private: template void do_castling(Color us, Square from, Square& to, Square& rfrom, Square& rto); - // ID of a piece on a given square - PieceId piece_id_on(Square sq) const; - // Data members Piece board[SQUARE_NB]; Bitboard byTypeBB[PIECE_TYPE_NB]; @@ -227,9 +224,6 @@ private: Thread* thisThread; StateInfo* st; bool chess960; - - // List of pieces used in NNUE evaluation function - EvalList evalList; }; namespace PSQT { @@ -332,6 +326,10 @@ inline Bitboard Position::blockers_for_king(Color c) const { return st->blockersForKing[c]; } +inline Bitboard Position::pinners(Color c) const { + return st->pinners[c]; +} + inline Bitboard Position::check_squares(PieceType pt) const { return st->checkSquares[pt]; } @@ -469,20 +467,4 @@ inline StateInfo* Position::state() const { return st; } -inline const EvalList* Position::eval_list() const { - - return &evalList; -} - -inline PieceId Position::piece_id_on(Square sq) const -{ - - assert(piece_on(sq) != NO_PIECE); - - PieceId pid = evalList.piece_id_list[sq]; - assert(is_ok(pid)); - - return pid; -} - #endif // #ifndef POSITION_H_INCLUDED diff --git a/src/search.cpp b/src/search.cpp index b7561a96..2d848bcd 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -63,9 +63,9 @@ namespace { constexpr uint64_t TtHitAverageResolution = 1024; // Razor and futility margins - constexpr int RazorMargin = 527; + constexpr int RazorMargin = 510; Value futility_margin(Depth d, bool improving) { - return Value(227 * (d - improving)); + return Value(223 * (d - improving)); } bool training; @@ -75,7 +75,7 @@ namespace { Depth reduction(bool i, Depth d, int mn) { int r = Reductions[d] * Reductions[mn]; - return (r + 570) / 1024 + (!i && r > 1018); + return (r + 509) / 1024 + (!i && r > 894); } constexpr int futility_move_count(bool improving, Depth depth) { @@ -84,7 +84,7 @@ namespace { // History and stats update bonus, based on depth int stat_bonus(Depth d) { - return d > 15 ? 27 : 17 * d * d + 133 * d - 134; + return d > 13 ? 29 : 17 * d * d + 134 * d - 134; } // Add a small random component to draw evaluations to avoid 3fold-blindness @@ -194,7 +194,7 @@ namespace { void Search::init() { for (int i = 1; i < MAX_MOVES; ++i) - Reductions[i] = int((24.8 + std::log(Threads.size())) * std::log(i)); + Reductions[i] = int((22.0 + std::log(Threads.size())) * std::log(i)); training = Options["Training"]; } @@ -339,7 +339,7 @@ void Thread::search() { // for match (TC 60+0.6) results spanning a wide range of k values. PRNG rng(now()); double floatLevel = Options["UCI_LimitStrength"] ? - Utility::clamp(std::pow((Options["UCI_Elo"] - 1346.6) / 143.4, 1 / 0.806), 0.0, 20.0) : + std::clamp(std::pow((Options["UCI_Elo"] - 1346.6) / 143.4, 1 / 0.806), 0.0, 20.0) : double(Options["Skill Level"]); int intLevel = int(floatLevel) + ((floatLevel - int(floatLevel)) * 1024 > rng.rand() % 1024 ? 1 : 0); @@ -407,12 +407,12 @@ void Thread::search() { if (rootDepth >= 4) { Value prev = rootMoves[pvIdx].previousScore; - delta = Value(19); + delta = Value(17); alpha = std::max(prev - delta,-VALUE_INFINITE); beta = std::min(prev + delta, VALUE_INFINITE); // Adjust contempt based on root move's previousScore (dynamic contempt) - int dct = ct + (110 - ct / 2) * prev / (abs(prev) + 140); + int dct = ct + (105 - ct / 2) * prev / (abs(prev) + 149); contempt = (us == WHITE ? make_score(dct, dct / 2) : -make_score(dct, dct / 2)); @@ -510,13 +510,13 @@ void Thread::search() { && !Threads.stop && !mainThread->stopOnPonderhit) { - double fallingEval = (296 + 6 * (mainThread->bestPreviousScore - bestValue) - + 6 * (mainThread->iterValue[iterIdx] - bestValue)) / 725.0; - fallingEval = Utility::clamp(fallingEval, 0.5, 1.5); + double fallingEval = (318 + 6 * (mainThread->bestPreviousScore - bestValue) + + 6 * (mainThread->iterValue[iterIdx] - bestValue)) / 825.0; + fallingEval = std::clamp(fallingEval, 0.5, 1.5); // If the bestMove is stable over several iterations, reduce time accordingly - timeReduction = lastBestMoveDepth + 10 < completedDepth ? 1.92 : 0.95; - double reduction = (1.47 + mainThread->previousTimeReduction) / (2.22 * timeReduction); + timeReduction = lastBestMoveDepth + 9 < completedDepth ? 1.92 : 0.95; + double reduction = (1.47 + mainThread->previousTimeReduction) / (2.32 * timeReduction); // Use part of the gained time from a previous stable move for the current move for (Thread* th : Threads) @@ -541,7 +541,7 @@ void Thread::search() { } else if ( Threads.increaseDepth && !mainThread->ponder - && Time.elapsed() > totalTime * 0.56) + && Time.elapsed() > totalTime * 0.58) Threads.increaseDepth = false; else Threads.increaseDepth = true; @@ -600,7 +600,7 @@ namespace { Key posKey; Move ttMove, move, excludedMove, bestMove; Depth extension, newDepth; - Value bestValue, value, ttValue, eval, maxValue, probcutBeta; + Value bestValue, value, ttValue, eval, maxValue, probCutBeta; bool ttHit, ttPv, formerPv, givesCheck, improving, didLMR, priorCapture; bool captureOrPromotion, doFullDepthSearch, moveCountPruning, ttCapture, singularQuietLMR; @@ -798,11 +798,7 @@ namespace { else { if ((ss-1)->currentMove != MOVE_NULL) - { - int bonus = -(ss-1)->statScore / 512; - - ss->staticEval = eval = evaluate(pos) + bonus; - } + ss->staticEval = eval = evaluate(pos); else ss->staticEval = eval = -(ss-1)->staticEval + 2 * Tempo; @@ -815,8 +811,9 @@ namespace { && eval <= alpha - RazorMargin) return qsearch(pos, ss, alpha, beta); - improving = (ss-2)->staticEval == VALUE_NONE ? (ss->staticEval > (ss-4)->staticEval - || (ss-4)->staticEval == VALUE_NONE) : ss->staticEval > (ss-2)->staticEval; + improving = (ss-2)->staticEval == VALUE_NONE + ? ss->staticEval > (ss-4)->staticEval || (ss-4)->staticEval == VALUE_NONE + : ss->staticEval > (ss-2)->staticEval; // Step 8. Futility pruning: child node (~50 Elo) if ( !PvNode @@ -828,10 +825,10 @@ namespace { // Step 9. Null move search with verification search (~40 Elo) if ( !PvNode && (ss-1)->currentMove != MOVE_NULL - && (ss-1)->statScore < 23824 + && (ss-1)->statScore < 22977 && eval >= beta && eval >= ss->staticEval - && ss->staticEval >= beta - 28 * depth - 28 * improving + 94 * ttPv + 200 + && ss->staticEval >= beta - 30 * depth - 28 * improving + 84 * ttPv + 182 && !excludedMove && pos.non_pawn_material(us) && (ss->ply >= thisThread->nmpMinPly || us != thisThread->nmpColor)) @@ -839,7 +836,7 @@ namespace { assert(eval - beta >= 0); // Null move dynamic reduction based on depth and value - Depth R = (737 + 77 * depth) / 246 + std::min(int(eval - beta) / 192, 3); + Depth R = (817 + 71 * depth) / 213 + std::min(int(eval - beta) / 192, 3); ss->currentMove = MOVE_NULL; ss->continuationHistory = &thisThread->continuationHistory[0][0][NO_PIECE][0]; @@ -875,7 +872,7 @@ namespace { } } - probcutBeta = beta + 176 - 49 * improving; + probCutBeta = beta + 176 - 49 * improving; // Step 10. ProbCut (~10 Elo) // If we have a good enough capture and a reduced search returns a value @@ -883,21 +880,27 @@ namespace { if ( !PvNode && depth > 4 && abs(beta) < VALUE_TB_WIN_IN_MAX_PLY + // if value from transposition table is lower than probCutBeta, don't attempt probCut + // there and in further interactions with transposition table cutoff depth is set to depth - 3 + // because probCut search has depth set to depth - 4 but we also do a move before it + // so effective depth is equal to depth - 3 && !( ttHit && tte->depth() >= depth - 3 && ttValue != VALUE_NONE - && ttValue < probcutBeta)) + && ttValue < probCutBeta)) { + // if ttMove is a capture and value from transposition table is good enough produce probCut + // cutoff without digging into actual probCut search if ( ttHit && tte->depth() >= depth - 3 && ttValue != VALUE_NONE - && ttValue >= probcutBeta + && ttValue >= probCutBeta && ttMove && pos.capture_or_promotion(ttMove)) - return probcutBeta; + return probCutBeta; - assert(probcutBeta < VALUE_INFINITE); - MovePicker mp(pos, ttMove, probcutBeta - ss->staticEval, &captureHistory); + assert(probCutBeta < VALUE_INFINITE); + MovePicker mp(pos, ttMove, probCutBeta - ss->staticEval, &captureHistory); int probCutCount = 0; while ( (move = mp.next_move()) != MOVE_NONE @@ -919,16 +922,17 @@ namespace { pos.do_move(move, st); // Perform a preliminary qsearch to verify that the move holds - value = -qsearch(pos, ss+1, -probcutBeta, -probcutBeta+1); + value = -qsearch(pos, ss+1, -probCutBeta, -probCutBeta+1); // If the qsearch held, perform the regular search - if (value >= probcutBeta) - value = -search(pos, ss+1, -probcutBeta, -probcutBeta+1, depth - 4, !cutNode); + if (value >= probCutBeta) + value = -search(pos, ss+1, -probCutBeta, -probCutBeta+1, depth - 4, !cutNode); pos.undo_move(move); - if (value >= probcutBeta) + if (value >= probCutBeta) { + // if transposition table doesn't have equal or more deep info write probCut data into it if ( !(ttHit && tte->depth() >= depth - 3 && ttValue != VALUE_NONE)) @@ -940,16 +944,6 @@ namespace { } } - // Step 11. Internal iterative deepening (~1 Elo) - if (depth >= 7 && !ttMove) - { - search(pos, ss, alpha, beta, depth - 7, cutNode); - - tte = TT.probe(posKey, ttHit); - ttValue = ttHit ? value_from_tt(tte->value(), ss->ply, pos.rule50_count()) : VALUE_NONE; - ttMove = ttHit ? tte->move() : MOVE_NONE; - } - moves_loop: // When in check, search starts from here const PieceToHistory* contHist[] = { (ss-1)->continuationHistory, (ss-2)->continuationHistory, @@ -973,7 +967,7 @@ moves_loop: // When in check, search starts from here // Mark this node as being searched ThreadHolding th(thisThread, posKey, ss->ply); - // Step 12. Loop through all pseudo-legal moves until no moves remain + // Step 11. Loop through all pseudo-legal moves until no moves remain // or a beta cutoff occurs. while ((move = mp.next_move(moveCountPruning)) != MOVE_NONE) { @@ -1015,7 +1009,7 @@ moves_loop: // When in check, search starts from here // Calculate new depth for this move newDepth = depth - 1; - // Step 13. Pruning at shallow depth (~200 Elo) + // Step 12. Pruning at shallow depth (~200 Elo) if ( !rootNode && !(training && PvNode) && pos.non_pawn_material(us) @@ -1037,17 +1031,17 @@ moves_loop: // When in check, search starts from here continue; // Futility pruning: parent node (~5 Elo) - if ( lmrDepth < 8 + if ( lmrDepth < 7 && !ss->inCheck - && ss->staticEval + 284 + 188 * lmrDepth <= alpha + && ss->staticEval + 283 + 170 * lmrDepth <= alpha && (*contHist[0])[movedPiece][to_sq(move)] + (*contHist[1])[movedPiece][to_sq(move)] + (*contHist[3])[movedPiece][to_sq(move)] - + (*contHist[5])[movedPiece][to_sq(move)] / 2 < 28388) + + (*contHist[5])[movedPiece][to_sq(move)] / 2 < 27376) continue; // Prune moves with negative SEE (~20 Elo) - if (!pos.see_ge(move, Value(-(29 - std::min(lmrDepth, 17)) * lmrDepth * lmrDepth))) + if (!pos.see_ge(move, Value(-(29 - std::min(lmrDepth, 18)) * lmrDepth * lmrDepth))) continue; } else @@ -1064,17 +1058,17 @@ moves_loop: // When in check, search starts from here && !(PvNode && abs(bestValue) < 2) && PieceValue[MG][type_of(movedPiece)] >= PieceValue[MG][type_of(pos.piece_on(to_sq(move)))] && !ss->inCheck - && ss->staticEval + 178 + 261 * lmrDepth + && ss->staticEval + 169 + 244 * lmrDepth + PieceValue[MG][type_of(pos.piece_on(to_sq(move)))] <= alpha) continue; // See based pruning - if (!pos.see_ge(move, Value(-202) * depth)) // (~25 Elo) + if (!pos.see_ge(move, Value(-221) * depth)) // (~25 Elo) continue; } } - // Step 14. Extensions (~75 Elo) + // Step 13. Extensions (~75 Elo) // Singular extension search (~70 Elo). If all moves but one fail low on a // search of (alpha-s, beta-s), and just one fails high on (alpha, beta), @@ -1128,19 +1122,14 @@ moves_loop: // When in check, search starts from here && (pos.is_discovery_check_on_king(~us, move) || pos.see_ge(move))) extension = 1; - // Passed pawn extension - else if ( move == ss->killers[0] - && pos.advanced_pawn_push(move) - && pos.pawn_passed(us, to_sq(move))) - extension = 1; - // Last captures extension else if ( PieceValue[EG][pos.captured_piece()] > PawnValueEg && pos.non_pawn_material() <= 2 * RookValueMg) extension = 1; // Castling extension - if (type_of(move) == CASTLING) + if ( type_of(move) == CASTLING + && popcount(pos.pieces(us) & ~pos.pieces(PAWN) & (to_sq(move) & KingSide ? KingSide : QueenSide)) <= 2) extension = 1; // Late irreversible move extension @@ -1162,10 +1151,10 @@ moves_loop: // When in check, search starts from here [movedPiece] [to_sq(move)]; - // Step 15. Make the move + // Step 14. Make the move pos.do_move(move, st, givesCheck); - // Step 16. Reduced depth search (LMR, ~200 Elo). If the move fails high it will be + // Step 15. Reduced depth search (LMR, ~200 Elo). If the move fails high it will be // re-searched at full depth. if ( depth >= 3 && moveCount > 1 + 2 * rootNode + 2 * (PvNode && abs(bestValue) < 2) @@ -1174,7 +1163,7 @@ moves_loop: // When in check, search starts from here || moveCountPruning || ss->staticEval + PieceValue[EG][pos.captured_piece()] <= alpha || cutNode - || thisThread->ttHitAverage < 415 * TtHitAverageResolution * TtHitAverageWindow / 1024)) + || thisThread->ttHitAverage < 427 * TtHitAverageResolution * TtHitAverageWindow / 1024)) { Depth r = reduction(improving, depth, moveCount); @@ -1186,7 +1175,7 @@ moves_loop: // When in check, search starts from here r--; // Decrease reduction if the ttHit running average is large - if (thisThread->ttHitAverage > 473 * TtHitAverageResolution * TtHitAverageWindow / 1024) + if (thisThread->ttHitAverage > 509 * TtHitAverageResolution * TtHitAverageWindow / 1024) r--; // Reduction if other threads are searching this position @@ -1229,17 +1218,17 @@ moves_loop: // When in check, search starts from here + (*contHist[0])[movedPiece][to_sq(move)] + (*contHist[1])[movedPiece][to_sq(move)] + (*contHist[3])[movedPiece][to_sq(move)] - - 4826; + - 5287; // Decrease/increase reduction by comparing opponent's stat score (~10 Elo) - if (ss->statScore >= -100 && (ss-1)->statScore < -112) + if (ss->statScore >= -106 && (ss-1)->statScore < -104) r--; - else if ((ss-1)->statScore >= -125 && ss->statScore < -138) + else if ((ss-1)->statScore >= -119 && ss->statScore < -140) r++; // Decrease/increase reduction for moves with a good/bad history (~30 Elo) - r -= ss->statScore / 14615; + r -= ss->statScore / 14884; } else { @@ -1249,11 +1238,11 @@ moves_loop: // When in check, search starts from here // Unless giving check, this capture is likely bad if ( !givesCheck - && ss->staticEval + PieceValue[EG][pos.captured_piece()] + 211 * depth <= alpha) + && ss->staticEval + PieceValue[EG][pos.captured_piece()] + 213 * depth <= alpha) r++; } - Depth d = Utility::clamp(newDepth - r, 1, newDepth); + Depth d = std::clamp(newDepth - r, 1, newDepth); value = -search(pos, ss+1, -(alpha+1), -alpha, d, true); @@ -1268,7 +1257,7 @@ moves_loop: // When in check, search starts from here didLMR = false; } - // Step 17. Full depth search when LMR is skipped or fails high + // Step 16. Full depth search when LMR is skipped or fails high if (doFullDepthSearch) { value = -search(pos, ss+1, -(alpha+1), -alpha, newDepth, !cutNode); @@ -1296,12 +1285,12 @@ moves_loop: // When in check, search starts from here value = -search(pos, ss+1, -beta, -alpha, newDepth, false); } - // Step 18. Undo move + // Step 17. Undo move pos.undo_move(move); assert(value > -VALUE_INFINITE && value < VALUE_INFINITE); - // Step 19. Check for a new best move + // Step 18. Check for a new best move // Finished searching the move. If a stop occurred, the return value of // the search cannot be trusted, and we return immediately without // updating best move, PV and TT. @@ -1378,7 +1367,7 @@ moves_loop: // When in check, search starts from here return VALUE_DRAW; */ - // Step 20. Check for mate and stalemate + // Step 19. Check for mate and stalemate // All legal moves have been searched and if there are no legal moves, it // must be a mate or a stalemate. If we are in a singular extension search then // return a fail low score. @@ -1511,7 +1500,7 @@ moves_loop: // When in check, search starts from here if (PvNode && bestValue > alpha) alpha = bestValue; - futilityBase = bestValue + 141; + futilityBase = bestValue + 145; } const PieceToHistory* contHist[] = { (ss-1)->continuationHistory, (ss-2)->continuationHistory, @@ -1545,6 +1534,10 @@ moves_loop: // When in check, search starts from here { assert(type_of(move) != ENPASSANT); // Due to !pos.advanced_pawn_push + // moveCount pruning + if (moveCount > 2) + continue; + futilityValue = futilityBase + PieceValue[EG][pos.piece_on(to_sq(move))]; if (futilityValue <= alpha) @@ -1586,6 +1579,12 @@ moves_loop: // When in check, search starts from here [pos.moved_piece(move)] [to_sq(move)]; + if ( !captureOrPromotion + && moveCount >= abs(depth) + 1 + && (*contHist[0])[pos.moved_piece(move)][to_sq(move)] < CounterMovePruneThreshold + && (*contHist[1])[pos.moved_piece(move)][to_sq(move)] < CounterMovePruneThreshold) + continue; + // Make and search the move pos.do_move(move, st, givesCheck); value = -qsearch(pos, ss+1, -beta, -alpha, depth - 1); @@ -1768,7 +1767,7 @@ moves_loop: // When in check, search starts from here } if (depth > 11 && ss->ply < MAX_LPH) - thisThread->lowPlyHistory[ss->ply][from_to(move)] << stat_bonus(depth - 6); + thisThread->lowPlyHistory[ss->ply][from_to(move)] << stat_bonus(depth - 7); } // When playing with strength handicap, choose best move among a set of RootMoves diff --git a/src/thread_win32_osx.h b/src/thread_win32_osx.h index c4b55a48..75ef5d9a 100644 --- a/src/thread_win32_osx.h +++ b/src/thread_win32_osx.h @@ -27,7 +27,7 @@ /// The implementation calls pthread_create() with the stack size parameter /// equal to the linux 8MB default, on platforms that support it. -#if defined(__APPLE__) || defined(__MINGW32__) || defined(__MINGW64__) +#if defined(__APPLE__) || defined(__MINGW32__) || defined(__MINGW64__) || defined(USE_PTHREADS) #include diff --git a/src/timeman.cpp b/src/timeman.cpp index df4ba9b2..6d9c95ef 100644 --- a/src/timeman.cpp +++ b/src/timeman.cpp @@ -38,9 +38,9 @@ void TimeManagement::init(Search::LimitsType& limits, Color us, int ply) { TimePoint slowMover = TimePoint(Options["Slow Mover"]); TimePoint npmsec = TimePoint(Options["nodestime"]); - // opt_scale is a percentage of available time to use for the current move. - // max_scale is a multiplier applied to optimumTime. - double opt_scale, max_scale; + // optScale is a percentage of available time to use for the current move. + // maxScale is a multiplier applied to optimumTime. + double optScale, maxScale; // If we have to play in 'nodes as time' mode, then convert from time // to nodes, and use resulting values in time management formulas. @@ -75,22 +75,22 @@ void TimeManagement::init(Search::LimitsType& limits, Color us, int ply) { // game time for the current move, so also cap to 20% of available game time. if (limits.movestogo == 0) { - opt_scale = std::min(0.008 + std::pow(ply + 3.0, 0.5) / 250.0, + optScale = std::min(0.008 + std::pow(ply + 3.0, 0.5) / 250.0, 0.2 * limits.time[us] / double(timeLeft)); - max_scale = std::min(7.0, 4.0 + ply / 12.0); + maxScale = std::min(7.0, 4.0 + ply / 12.0); } // x moves in y seconds (+ z increment) else { - opt_scale = std::min((0.8 + ply / 128.0) / mtg, + optScale = std::min((0.8 + ply / 128.0) / mtg, 0.8 * limits.time[us] / double(timeLeft)); - max_scale = std::min(6.3, 1.5 + 0.11 * mtg); + maxScale = std::min(6.3, 1.5 + 0.11 * mtg); } // Never use more than 80% of the available time for this move - optimumTime = TimePoint(opt_scale * timeLeft); - maximumTime = TimePoint(std::min(0.8 * limits.time[us] - moveOverhead, max_scale * optimumTime)); + optimumTime = TimePoint(optScale * timeLeft); + maximumTime = TimePoint(std::min(0.8 * limits.time[us] - moveOverhead, maxScale * optimumTime)); if (Options["Ponder"]) optimumTime += optimumTime / 4; diff --git a/src/tt.cpp b/src/tt.cpp index d494c27d..60a3a5f1 100644 --- a/src/tt.cpp +++ b/src/tt.cpp @@ -37,18 +37,19 @@ void TTEntry::save(Key k, Value v, bool pv, Bound b, Depth d, Move m, Value ev) if (m || (uint16_t)k != key16) move16 = (uint16_t)m; - // Overwrite less valuable entries - if ((uint16_t)k != key16 - || d - DEPTH_OFFSET > depth8 - 4 - || b == BOUND_EXACT) + // Overwrite less valuable entries (cheapest checks first) + if (b == BOUND_EXACT + || (uint16_t)k != key16 + || d - DEPTH_OFFSET > depth8 - 4) { - assert(d >= DEPTH_OFFSET); + assert(d > DEPTH_OFFSET); + assert(d < 256 + DEPTH_OFFSET); key16 = (uint16_t)k; + depth8 = (uint8_t)(d - DEPTH_OFFSET); + genBound8 = (uint8_t)(TT.generation8 | uint8_t(pv) << 2 | b); value16 = (int16_t)v; eval16 = (int16_t)ev; - genBound8 = (uint8_t)(TT.generation8 | uint8_t(pv) << 2 | b); - depth8 = (uint8_t)(d - DEPTH_OFFSET); } } @@ -119,11 +120,11 @@ TTEntry* TranspositionTable::probe(const Key key, bool& found) const { const uint16_t key16 = (uint16_t)key; // Use the low 16 bits as key inside the cluster for (int i = 0; i < ClusterSize; ++i) - if (!tte[i].key16 || tte[i].key16 == key16) + if (tte[i].key16 == key16 || !tte[i].depth8) { tte[i].genBound8 = uint8_t(generation8 | (tte[i].genBound8 & 0x7)); // Refresh - return found = (bool)tte[i].key16, &tte[i]; + return found = (bool)tte[i].depth8, &tte[i]; } // Find an entry to be replaced according to the replacement strategy @@ -149,7 +150,7 @@ int TranspositionTable::hashfull() const { int cnt = 0; for (int i = 0; i < 1000; ++i) for (int j = 0; j < ClusterSize; ++j) - cnt += (table[i].entry[j].genBound8 & 0xF8) == generation8; + cnt += table[i].entry[j].depth8 && (table[i].entry[j].genBound8 & 0xF8) == generation8; return cnt / ClusterSize; } diff --git a/src/tt.h b/src/tt.h index c177ca52..fdfd6769 100644 --- a/src/tt.h +++ b/src/tt.h @@ -25,13 +25,13 @@ /// TTEntry struct is the 10 bytes transposition table entry, defined as below: /// /// key 16 bit -/// move 16 bit -/// value 16 bit -/// eval value 16 bit +/// depth 8 bit /// generation 5 bit /// pv node 1 bit /// bound type 2 bit -/// depth 8 bit +/// move 16 bit +/// value 16 bit +/// eval value 16 bit struct TTEntry { @@ -47,11 +47,11 @@ private: friend class TranspositionTable; uint16_t key16; + uint8_t depth8; + uint8_t genBound8; uint16_t move16; int16_t value16; int16_t eval16; - uint8_t genBound8; - uint8_t depth8; }; diff --git a/src/types.h b/src/types.h index ce4c2dbb..d34781e5 100644 --- a/src/types.h +++ b/src/types.h @@ -203,22 +203,6 @@ enum Piece { PIECE_NB = 16 }; -// An ID used to track the pieces. Max. 32 pieces on board. -enum PieceId { - PIECE_ID_ZERO = 0, - PIECE_ID_KING = 30, - PIECE_ID_WKING = 30, - PIECE_ID_BKING = 31, - PIECE_ID_NONE = 32 -}; - -inline PieceId operator++(PieceId& d, int) { - - PieceId x = d; - d = PieceId(int(d) + 1); - return x; -} - constexpr Value PieceValue[PHASE_NB][PIECE_NB] = { { VALUE_ZERO, PawnValueMg, KnightValueMg, BishopValueMg, RookValueMg, QueenValueMg, VALUE_ZERO, VALUE_ZERO, VALUE_ZERO, PawnValueMg, KnightValueMg, BishopValueMg, RookValueMg, QueenValueMg, VALUE_ZERO, VALUE_ZERO }, @@ -234,7 +218,8 @@ enum : int { DEPTH_QS_RECAPTURES = -5, DEPTH_NONE = -6, - DEPTH_OFFSET = DEPTH_NONE + + DEPTH_OFFSET = -7 // value used only for TT entry occupancy check }; enum Square : int { @@ -272,118 +257,20 @@ enum Rank : int { RANK_1, RANK_2, RANK_3, RANK_4, RANK_5, RANK_6, RANK_7, RANK_8, RANK_NB }; -// unique number for each piece type on each square -enum PieceSquare : uint32_t { - PS_NONE = 0, - PS_W_PAWN = 1, - PS_B_PAWN = 1 * SQUARE_NB + 1, - PS_W_KNIGHT = 2 * SQUARE_NB + 1, - PS_B_KNIGHT = 3 * SQUARE_NB + 1, - PS_W_BISHOP = 4 * SQUARE_NB + 1, - PS_B_BISHOP = 5 * SQUARE_NB + 1, - PS_W_ROOK = 6 * SQUARE_NB + 1, - PS_B_ROOK = 7 * SQUARE_NB + 1, - PS_W_QUEEN = 8 * SQUARE_NB + 1, - PS_B_QUEEN = 9 * SQUARE_NB + 1, - PS_W_KING = 10 * SQUARE_NB + 1, - PS_END = PS_W_KING, // pieces without kings (pawns included) - PS_B_KING = 11 * SQUARE_NB + 1, - PS_END2 = 12 * SQUARE_NB + 1, - - PS_NOT_INIT = PS_END2 + 1, -}; - -struct ExtPieceSquare { - PieceSquare from[COLOR_NB]; -}; - -// Array for finding the PieceSquare corresponding to the piece on the board -extern ExtPieceSquare kpp_board_index[PIECE_NB]; - -constexpr bool is_ok(PieceId pid); -constexpr Square rotate180(Square sq); - -class Position; - -// Structure holding which tracked piece (PieceId) is where (PieceSquare) -class EvalList { - -public: - // Max. number of pieces without kings is 30 but must be a multiple of 4 in AVX2 - static const int MAX_LENGTH = 32; - - // Array that holds the piece id for the pieces on the board - PieceId piece_id_list[SQUARE_NB]; - - // List of pieces, separate from White and Black POV - PieceSquare* piece_list_fw() const { return const_cast(pieceListFw); } - PieceSquare* piece_list_fb() const { return const_cast(pieceListFb); } - - // Place the piece pc with piece_id on the square sq on the board - void put_piece(PieceId piece_id, Square sq, Piece pc) - { - assert(is_ok(piece_id)); - if (pc != NO_PIECE) - { - pieceListFw[piece_id] = PieceSquare(kpp_board_index[pc].from[WHITE] + sq); - pieceListFb[piece_id] = PieceSquare(kpp_board_index[pc].from[BLACK] + rotate180(sq)); - piece_id_list[sq] = piece_id; - } - else - { - pieceListFw[piece_id] = PS_NONE; - pieceListFb[piece_id] = PS_NONE; - piece_id_list[sq] = piece_id; - } - } - - // Convert the specified piece_id piece to ExtPieceSquare type and return it - ExtPieceSquare piece_with_id(PieceId piece_id) const - { - ExtPieceSquare eps; - eps.from[WHITE] = pieceListFw[piece_id]; - eps.from[BLACK] = pieceListFb[piece_id]; - return eps; - } - - // Initialize the pieceList. - // Set the value of unused pieces to PieceSquare::PS_NONE in case you want to deal with dropped pieces. - // A normal evaluation function can be used as an evaluation function for missing frames. - // piece_no_list is initialized with PieceId::PIECE_ID_NONE to facilitate debugging. - void clear() - { - - for (auto& p : pieceListFw) - p = PieceSquare::PS_NONE; - - for (auto& p : pieceListFb) - p = PieceSquare::PS_NONE; - - for (auto& v : piece_id_list) - v = PieceId::PIECE_ID_NONE; - } - - // Check whether the pieceListFw[] held internally is a correct BonaPiece. - // Note: For debugging. slow. - bool is_valid(const Position& pos); - -private: - PieceSquare pieceListFw[MAX_LENGTH]; - PieceSquare pieceListFb[MAX_LENGTH]; -}; - -// For differential evaluation of pieces that changed since last turn +// Keep track of what a move changes on the board (used by NNUE) struct DirtyPiece { // Number of changed pieces int dirty_num; - // The ids of changed pieces, max. 2 pieces can change in one move - PieceId pieceId[2]; + // Max 3 pieces can change in one move. A promotion with capture moves + // both the pawn and the captured piece to SQ_NONE and the piece promoted + // to from SQ_NONE to the capture square. + Piece piece[3]; - // What changed from the piece with that piece number - ExtPieceSquare old_piece[2]; - ExtPieceSquare new_piece[2]; + // From and to squares, which may be SQ_NONE + Square from[3]; + Square to[3]; }; /// Score enum stores a middlegame and an endgame value in a single integer (enum). @@ -433,8 +320,6 @@ ENABLE_FULL_OPERATORS_ON(Value) ENABLE_FULL_OPERATORS_ON(Direction) ENABLE_INCR_OPERATORS_ON(Piece) -ENABLE_INCR_OPERATORS_ON(PieceSquare) -ENABLE_INCR_OPERATORS_ON(PieceId) ENABLE_INCR_OPERATORS_ON(PieceType) ENABLE_INCR_OPERATORS_ON(Square) ENABLE_INCR_OPERATORS_ON(File) @@ -523,10 +408,6 @@ inline Color color_of(Piece pc) { return Color(pc >> 3); } -constexpr bool is_ok(PieceId pid) { - return pid < PIECE_ID_NONE; -} - constexpr bool is_ok(Square s) { return s >= SQ_A1 && s <= SQ_H8; } @@ -563,11 +444,6 @@ constexpr Square to_sq(Move m) { return Square(m & 0x3F); } -// Return relative square when turning the board 180 degrees -constexpr Square rotate180(Square sq) { - return (Square)(sq ^ 0x3F); -} - constexpr int from_to(Move m) { return m & 0xFFF; } diff --git a/src/uci.cpp b/src/uci.cpp index 00941040..d6745d19 100644 --- a/src/uci.cpp +++ b/src/uci.cpp @@ -260,7 +260,7 @@ double UCI::win_rate_model_double(double v, int ply) { double b = (((bs[0] * m + bs[1]) * m + bs[2]) * m) + bs[3]; // Transform eval to centipawns with limited range - double x = Utility::clamp(double(100 * v) / PawnValueEg, -1000.0, 1000.0); + double x = std::clamp(double(100 * v) / PawnValueEg, -1000.0, 1000.0); // Return win rate in per mille return 1000.0 / (1 + std::exp((a - x) / b)); diff --git a/src/ucioption.cpp b/src/ucioption.cpp index ef40fe82..519160cf 100644 --- a/src/ucioption.cpp +++ b/src/ucioption.cpp @@ -79,8 +79,10 @@ void init(OptionsMap& o) { o["SyzygyProbeDepth"] << Option(1, 1, 100); o["Syzygy50MoveRule"] << Option(true); o["SyzygyProbeLimit"] << Option(7, 0, 7); - o["Use NNUE"] << Option(false, on_use_NNUE); - o["EvalFile"] << Option("nn-9931db908a9b.nnue", on_eval_file); + o["Use NNUE"] << Option(true, on_use_NNUE); + // The default must follow the format nn-[SHA256 first 12 digits].nnue + // for the build process (profile-build and fishtest) to work. + o["EvalFile"] << Option("nn-82215d0fd0df.nnue", on_eval_file); #ifdef EVAL_NNUE // When the evaluation function is loaded at the ucinewgame timing, it is necessary to convert the new evaluation function. // I want to hit the test eval convert command, but there is no new evaluation function diff --git a/tests/instrumented.sh b/tests/instrumented.sh index ae6d5c4b..03ded74a 100755 --- a/tests/instrumented.sh +++ b/tests/instrumented.sh @@ -70,7 +70,7 @@ for args in "eval" \ "go depth 10" \ "go movetime 1000" \ "go wtime 8000 btime 8000 winc 500 binc 500" \ - "bench 128 $threads 10 default depth" + "bench 128 $threads 8 default depth" do echo "$prefix $exeprefix ./stockfish $args $postfix" @@ -80,7 +80,7 @@ done # more general testing, following an uci protocol exchange cat << EOF > game.exp - set timeout 10 + set timeout 240 spawn $exeprefix ./stockfish send "uci\n" @@ -98,7 +98,7 @@ cat << EOF > game.exp expect "bestmove" send "position fen 5rk1/1K4p1/8/8/3B4/8/8/8 b - - 0 1\n" - send "go depth 30\n" + send "go depth 20\n" expect "bestmove" send "quit\n" @@ -121,7 +121,7 @@ cat << EOF > syzygy.exp send "uci\n" send "setoption name SyzygyPath value ../tests/syzygy/\n" expect "info string Found 35 tablebases" {} timeout {exit 1} - send "bench 128 1 10 default depth\n" + send "bench 128 1 8 default depth\n" send "quit\n" expect eof