diff --git a/src/Makefile b/src/Makefile index a9333a22..3cf97873 100644 --- a/src/Makefile +++ b/src/Makefile @@ -128,6 +128,7 @@ avx512 = no vnni256 = no vnni512 = no neon = no +arm_version = 0 STRIP = strip ### 2.2 Architecture specific @@ -275,6 +276,7 @@ ifeq ($(ARCH),armv7) arch = armv7 prefetch = yes bits = 32 + arm_version = 7 endif ifeq ($(ARCH),armv7-neon) @@ -283,6 +285,7 @@ ifeq ($(ARCH),armv7-neon) popcnt = yes neon = yes bits = 32 + arm_version = 7 endif ifeq ($(ARCH),armv8) @@ -290,6 +293,7 @@ ifeq ($(ARCH),armv8) prefetch = yes popcnt = yes neon = yes + arm_version = 8 endif ifeq ($(ARCH),apple-silicon) @@ -297,6 +301,7 @@ ifeq ($(ARCH),apple-silicon) prefetch = yes popcnt = yes neon = yes + arm_version = 8 endif ifeq ($(ARCH),ppc-32) @@ -614,7 +619,7 @@ ifeq ($(mmx),yes) endif ifeq ($(neon),yes) - CXXFLAGS += -DUSE_NEON + CXXFLAGS += -DUSE_NEON=$(arm_version) ifeq ($(KERNEL),Linux) ifneq ($(COMP),ndk) ifneq ($(arch),armv8) @@ -863,6 +868,7 @@ config-sanity: net @echo "vnni256: '$(vnni256)'" @echo "vnni512: '$(vnni512)'" @echo "neon: '$(neon)'" + @echo "arm_version: '$(arm_version)'" @echo "" @echo "Flags:" @echo "CXX: $(CXX)" diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index b2871278..11038d69 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -75,8 +75,7 @@ namespace Stockfish::Eval::NNUE::Layers { const auto inputVector = reinterpret_cast(input); # elif defined(USE_NEON) - static_assert(PaddedInputDimensions % 16 == 0); - constexpr IndexType NumChunks = PaddedInputDimensions / 16; + constexpr IndexType NumChunks = (InputDimensions + 15) / 16; const auto inputVector = reinterpret_cast(input); # endif @@ -181,6 +180,9 @@ namespace Stockfish::Eval::NNUE::Layers { #elif defined (USE_SSSE3) static constexpr const IndexType InputSimdWidth = 16; static constexpr const IndexType MaxNumOutputRegs = 8; +#elif defined (USE_NEON) + static constexpr const IndexType InputSimdWidth = 8; + static constexpr const IndexType MaxNumOutputRegs = 8; #else // The fallback implementation will not have permuted weights. // We define these to avoid a lot of ifdefs later. @@ -270,52 +272,64 @@ namespace Stockfish::Eval::NNUE::Layers { OutputType* output = reinterpret_cast(buffer); #if defined (USE_AVX512) - using vec_t = __m512i; - #define vec_setzero _mm512_setzero_si512 - #define vec_set_32 _mm512_set1_epi32 - #define vec_add_dpbusd_32 Simd::m512_add_dpbusd_epi32 + using acc_vec_t = __m512i; + using bias_vec_t = __m128i; + using weight_vec_t = __m512i; + using in_vec_t = __m512i; + #define vec_zero _mm512_setzero_si512() #define vec_add_dpbusd_32x2 Simd::m512_add_dpbusd_epi32x2 #define vec_hadd Simd::m512_hadd #define vec_haddx4 Simd::m512_haddx4 #elif defined (USE_AVX2) - using vec_t = __m256i; - #define vec_setzero _mm256_setzero_si256 - #define vec_set_32 _mm256_set1_epi32 - #define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32 + using acc_vec_t = __m256i; + using bias_vec_t = __m128i; + using weight_vec_t = __m256i; + using in_vec_t = __m256i; + #define vec_zero _mm256_setzero_si256() #define vec_add_dpbusd_32x2 Simd::m256_add_dpbusd_epi32x2 #define vec_hadd Simd::m256_hadd #define vec_haddx4 Simd::m256_haddx4 #elif defined (USE_SSSE3) - using vec_t = __m128i; - #define vec_setzero _mm_setzero_si128 - #define vec_set_32 _mm_set1_epi32 - #define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32 + using acc_vec_t = __m128i; + using bias_vec_t = __m128i; + using weight_vec_t = __m128i; + using in_vec_t = __m128i; + #define vec_zero _mm_setzero_si128() #define vec_add_dpbusd_32x2 Simd::m128_add_dpbusd_epi32x2 #define vec_hadd Simd::m128_hadd #define vec_haddx4 Simd::m128_haddx4 +#elif defined (USE_NEON) + using acc_vec_t = int32x4_t; + using bias_vec_t = int32x4_t; + using weight_vec_t = int8x8_t; + using in_vec_t = int8x8_t; + #define vec_zero {0} + #define vec_add_dpbusd_32x2 Simd::neon_m128_add_dpbusd_epi32x2 + #define vec_hadd Simd::neon_m128_hadd + #define vec_haddx4 Simd::neon_m128_haddx4 #endif -#if defined (USE_SSSE3) - const vec_t* invec = reinterpret_cast(input); +#if defined (USE_SSSE3) || defined (USE_NEON) + const in_vec_t* invec = reinterpret_cast(input); // Perform accumulation to registers for each big block for (IndexType bigBlock = 0; bigBlock < NumBigBlocks; ++bigBlock) { - vec_t acc[NumOutputRegs] = { vec_setzero() }; + acc_vec_t acc[NumOutputRegs] = { vec_zero }; // Each big block has NumOutputRegs small blocks in each "row", one per register. // We process two small blocks at a time to save on one addition without VNNI. for (IndexType smallBlock = 0; smallBlock < NumSmallBlocksPerOutput; smallBlock += 2) { - const vec_t* weightvec = - reinterpret_cast( + const weight_vec_t* weightvec = + reinterpret_cast( weights + bigBlock * BigBlockSize + smallBlock * SmallBlockSize * NumOutputRegs); - const vec_t in0 = invec[smallBlock + 0]; - const vec_t in1 = invec[smallBlock + 1]; + const in_vec_t in0 = invec[smallBlock + 0]; + const in_vec_t in1 = invec[smallBlock + 1]; for (IndexType k = 0; k < NumOutputRegs; ++k) vec_add_dpbusd_32x2(acc[k], in0, weightvec[k], in1, weightvec[k + NumOutputRegs]); @@ -324,8 +338,8 @@ namespace Stockfish::Eval::NNUE::Layers { // Horizontally add all accumulators. if constexpr (NumOutputRegs % 4 == 0) { - __m128i* outputvec = reinterpret_cast<__m128i*>(output); - const __m128i* biasvec = reinterpret_cast(biases); + bias_vec_t* outputvec = reinterpret_cast(output); + const bias_vec_t* biasvec = reinterpret_cast(biases); for (IndexType k = 0; k < NumOutputRegs; k += 4) { @@ -343,9 +357,7 @@ namespace Stockfish::Eval::NNUE::Layers { } } -# undef vec_setzero -# undef vec_set_32 -# undef vec_add_dpbusd_32 +# undef vec_zero # undef vec_add_dpbusd_32x2 # undef vec_hadd # undef vec_haddx4 diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 0297b323..4f6a174a 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -336,10 +336,17 @@ namespace Stockfish::Eval::NNUE { { const IndexType offset = HalfDimensions * p; const auto out = reinterpret_cast(&output[offset]); - for (IndexType j = 0; j < NumChunks; ++j) + + constexpr IndexType UnrollFactor = 16; + static_assert(UnrollFactor % UnrollFactor == 0); + for (IndexType j = 0; j < NumChunks; j += UnrollFactor) { - int16x8_t sum = reinterpret_cast(accumulation[perspectives[p]])[j]; - out[j] = vmax_s8(vqmovn_s16(sum), Zero); + int16x8_t sums[UnrollFactor]; + for (IndexType i = 0; i < UnrollFactor; ++i) + sums[i] = reinterpret_cast(accumulation[perspectives[p]])[j+i]; + + for (IndexType i = 0; i < UnrollFactor; ++i) + out[j+i] = vmax_s8(vqmovn_s16(sums[i]), Zero); } } return psqt; diff --git a/src/simd.h b/src/simd.h index 1ac98067..ffa54d96 100644 --- a/src/simd.h +++ b/src/simd.h @@ -343,6 +343,45 @@ namespace Stockfish::Simd { #endif +#if defined (USE_NEON) + + [[maybe_unused]] static int neon_m128_reduce_add_epi32(int32x4_t s) { +# if USE_NEON >= 8 + return vaddvq_s32(s); +# else + return s[0] + s[1] + s[2] + s[3]; +# endif + } + + [[maybe_unused]] static int neon_m128_hadd(int32x4_t sum, int bias) { + return neon_m128_reduce_add_epi32(sum) + bias; + } + + [[maybe_unused]] static int32x4_t neon_m128_haddx4( + int32x4_t sum0, int32x4_t sum1, int32x4_t sum2, int32x4_t sum3, + int32x4_t bias) { + + int32x4_t hsums { + neon_m128_reduce_add_epi32(sum0), + neon_m128_reduce_add_epi32(sum1), + neon_m128_reduce_add_epi32(sum2), + neon_m128_reduce_add_epi32(sum3) + }; + return vaddq_s32(hsums, bias); + } + + [[maybe_unused]] static void neon_m128_add_dpbusd_epi32x2( + int32x4_t& acc, + int8x8_t a0, int8x8_t b0, + int8x8_t a1, int8x8_t b1) { + + int16x8_t product = vmull_s8(a0, b0); + product = vmlal_s8(product, a1, b1); + acc = vpadalq_s16(acc, product); + } + +#endif + } #endif // STOCKFISH_SIMD_H_INCLUDED