mirror of
https://github.com/HChaZZY/Stockfish.git
synced 2025-12-25 11:36:51 +08:00
Support VNNI on 256bit vectors
due to downclocking on current chips (tested up to cascade lake) supporting avx512 and vnni512, it is better to use avx2 or vnni256 in multithreaded (in particular hyperthreaded) engine use. In single threaded use, the picture is different. gcc compilation for vnni256 requires a toolchain for gcc >= 9. closes https://github.com/official-stockfish/Stockfish/pull/3038 No functional change
This commit is contained in:
committed by
Joost VandeVondele
parent
e453f09f06
commit
701b2427bd
39
src/Makefile
39
src/Makefile
@@ -75,7 +75,8 @@ endif
|
||||
# sse41 = yes/no --- -msse4.1 --- Use Intel Streaming SIMD Extensions 4.1
|
||||
# avx2 = yes/no --- -mavx2 --- Use Intel Advanced Vector Extensions 2
|
||||
# avx512 = yes/no --- -mavx512bw --- Use Intel Advanced Vector Extensions 512
|
||||
# vnni = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 512
|
||||
# vnni256 = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 256
|
||||
# vnni512 = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 512
|
||||
# neon = yes/no --- -DUSE_NEON --- Use ARM SIMD architecture
|
||||
#
|
||||
# Note that Makefile is space sensitive, so when adding new architectures
|
||||
@@ -102,7 +103,8 @@ ssse3 = no
|
||||
sse41 = no
|
||||
avx2 = no
|
||||
avx512 = no
|
||||
vnni = no
|
||||
vnni256 = no
|
||||
vnni512 = no
|
||||
neon = no
|
||||
ARCH = x86-64-modern
|
||||
STRIP = strip
|
||||
@@ -192,7 +194,18 @@ ifeq ($(findstring -avx512,$(ARCH)),-avx512)
|
||||
avx512 = yes
|
||||
endif
|
||||
|
||||
ifeq ($(findstring -vnni,$(ARCH)),-vnni)
|
||||
ifeq ($(findstring -vnni256,$(ARCH)),-vnni256)
|
||||
popcnt = yes
|
||||
sse = yes
|
||||
sse2 = yes
|
||||
ssse3 = yes
|
||||
sse41 = yes
|
||||
avx2 = yes
|
||||
pext = yes
|
||||
vnni256 = yes
|
||||
endif
|
||||
|
||||
ifeq ($(findstring -vnni512,$(ARCH)),-vnni512)
|
||||
popcnt = yes
|
||||
sse = yes
|
||||
sse2 = yes
|
||||
@@ -201,7 +214,7 @@ ifeq ($(findstring -vnni,$(ARCH)),-vnni)
|
||||
avx2 = yes
|
||||
pext = yes
|
||||
avx512 = yes
|
||||
vnni = yes
|
||||
vnni512 = yes
|
||||
endif
|
||||
|
||||
ifeq ($(sse),yes)
|
||||
@@ -500,7 +513,14 @@ ifeq ($(avx512),yes)
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(vnni),yes)
|
||||
ifeq ($(vnni256),yes)
|
||||
CXXFLAGS += -DUSE_VNNI
|
||||
ifeq ($(comp),$(filter $(comp),gcc clang mingw))
|
||||
CXXFLAGS += -mavx512vnni -mavx512dq -mavx512vl -mprefer-vector-width=256
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(vnni512),yes)
|
||||
CXXFLAGS += -DUSE_VNNI
|
||||
ifeq ($(comp),$(filter $(comp),gcc clang mingw))
|
||||
CXXFLAGS += -mavx512vnni -mavx512dq -mavx512vl
|
||||
@@ -623,7 +643,8 @@ help:
|
||||
@echo ""
|
||||
@echo "Supported archs:"
|
||||
@echo ""
|
||||
@echo "x86-64-vnni > x86 64-bit with vnni support"
|
||||
@echo "x86-64-vnni512 > x86 64-bit with vnni support 512bit wide"
|
||||
@echo "x86-64-vnni256 > x86 64-bit with vnni support 256bit wide"
|
||||
@echo "x86-64-avx512 > x86 64-bit with avx512 support"
|
||||
@echo "x86-64-bmi2 > x86 64-bit with bmi2 support"
|
||||
@echo "x86-64-avx2 > x86 64-bit with avx2 support"
|
||||
@@ -767,7 +788,8 @@ config-sanity:
|
||||
@echo "sse41: '$(sse41)'"
|
||||
@echo "avx2: '$(avx2)'"
|
||||
@echo "avx512: '$(avx512)'"
|
||||
@echo "vnni: '$(vnni)'"
|
||||
@echo "vnni256: '$(vnni256)'"
|
||||
@echo "vnni512: '$(vnni512)'"
|
||||
@echo "neon: '$(neon)'"
|
||||
@echo ""
|
||||
@echo "Flags:"
|
||||
@@ -794,7 +816,8 @@ config-sanity:
|
||||
@test "$(sse41)" = "yes" || test "$(sse41)" = "no"
|
||||
@test "$(avx2)" = "yes" || test "$(avx2)" = "no"
|
||||
@test "$(avx512)" = "yes" || test "$(avx512)" = "no"
|
||||
@test "$(vnni)" = "yes" || test "$(vnni)" = "no"
|
||||
@test "$(vnni256)" = "yes" || test "$(vnni256)" = "no"
|
||||
@test "$(vnni512)" = "yes" || test "$(vnni512)" = "no"
|
||||
@test "$(neon)" = "yes" || test "$(neon)" = "no"
|
||||
@test "$(comp)" = "gcc" || test "$(comp)" = "icc" || test "$(comp)" = "mingw" || test "$(comp)" = "clang" \
|
||||
|| test "$(comp)" = "armv7a-linux-androideabi16-clang" || test "$(comp)" = "aarch64-linux-android21-clang"
|
||||
|
||||
@@ -85,8 +85,10 @@ namespace Eval::NNUE::Layers {
|
||||
|
||||
#elif defined(USE_AVX2)
|
||||
constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
|
||||
const __m256i kOnes = _mm256_set1_epi16(1);
|
||||
const auto input_vector = reinterpret_cast<const __m256i*>(input);
|
||||
#if !defined(USE_VNNI)
|
||||
const __m256i kOnes = _mm256_set1_epi16(1);
|
||||
#endif
|
||||
|
||||
#elif defined(USE_SSE2)
|
||||
constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
|
||||
@@ -145,9 +147,13 @@ namespace Eval::NNUE::Layers {
|
||||
__m256i sum = _mm256_setzero_si256();
|
||||
const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
|
||||
for (IndexType j = 0; j < kNumChunks; ++j) {
|
||||
#if defined(USE_VNNI)
|
||||
sum = _mm256_dpbusd_epi32(sum, _mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
|
||||
#else
|
||||
__m256i product = _mm256_maddubs_epi16(_mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
|
||||
product = _mm256_madd_epi16(product, kOnes);
|
||||
sum = _mm256_add_epi32(sum, product);
|
||||
#endif
|
||||
}
|
||||
__m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
|
||||
sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));
|
||||
|
||||
Reference in New Issue
Block a user