Optimize FT activation and affine transform for NEON.

This patch optimizes the NEON implementation in two ways.

    1. The activation layer after the feature transformer is rewritten so that the
       compiler can see through the dependencies and unroll the loop. On its own this
       is a small but positive improvement, and other architectures could benefit from
       the same treatment in the future. It is not an algorithmic change. (A sketch of
       the pattern follows below.)

    2. The affine transform for large matrices (the first layer after the FT) on NEON
       now uses the same optimized code path as >=SSSE3. This makes the memory accesses
       more sequential and makes better use of the available registers, which allows
       for code with longer dependency chains.
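For illustration, the shape of the activation rewrite in portable C++ (a minimal
sketch of the pattern only, not the actual Stockfish code; the real version in the
diff below uses NEON intrinsics and the same UnrollFactor name):

    #include <algorithm>
    #include <cstdint>

    // Before: one chunk per iteration; each load feeds the math immediately,
    // so the compiler has little freedom to overlap iterations.
    void activate_simple(const int16_t* in, int8_t* out, int n) {
        for (int j = 0; j < n; ++j)
            out[j] = static_cast<int8_t>(std::clamp<int16_t>(in[j], 0, 127));
    }

    // After: load a whole batch first, then do the math on the batch. The two
    // short loops have no cross-iteration dependencies, so the compiler can
    // unroll them and schedule the loads well ahead of the math.
    // Assumes n is a multiple of UnrollFactor, as the real code asserts.
    void activate_unrolled(const int16_t* in, int8_t* out, int n) {
        constexpr int UnrollFactor = 16;
        for (int j = 0; j < n; j += UnrollFactor) {
            int16_t batch[UnrollFactor];
            for (int i = 0; i < UnrollFactor; ++i)
                batch[i] = in[j + i];
            for (int i = 0; i < UnrollFactor; ++i)
                out[j + i] = static_cast<int8_t>(std::clamp<int16_t>(batch[i], 0, 127));
        }
    }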

Benchmarks from Redshift#161, profile-build with apple clang

george@Georges-MacBook-Air nets % ./stockfish-b82d93 bench 2>&1 | tail -4 (current master)
===========================
Total time (ms) : 2167
Nodes searched  : 4667742
Nodes/second    : 2154011
george@Georges-MacBook-Air nets % ./stockfish-7377b8 bench 2>&1 | tail -4 (this patch)
===========================
Total time (ms) : 1842
Nodes searched  : 4667742
Nodes/second    : 2534061

This is a solid 18% improvement overall (2534061 / 2154011 ≈ 1.18); the improvement is larger in an NNUE-only bench than in the mixed one shown here.

An improvement of around 5% is also observed on armv7-neon (Raspberry Pi and older phones).

No changes for architectures other than NEON.

closes https://github.com/official-stockfish/Stockfish/pull/3837

No functional changes.
Author:    Tomasz Sobczyk
Date:      2021-12-02 12:29:11 +01:00
Committer: Joost VandeVondele
Parent:    b82d93ece4
Commit:    4766dfc395

4 changed files with 94 additions and 30 deletions

src/Makefile

@@ -128,6 +128,7 @@ avx512 = no
 vnni256 = no
 vnni512 = no
 neon = no
+arm_version = 0
 STRIP = strip

 ### 2.2 Architecture specific
@@ -275,6 +276,7 @@ ifeq ($(ARCH),armv7)
 arch = armv7
 prefetch = yes
 bits = 32
+arm_version = 7
 endif

 ifeq ($(ARCH),armv7-neon)
@@ -283,6 +285,7 @@ ifeq ($(ARCH),armv7-neon)
 popcnt = yes
 neon = yes
 bits = 32
+arm_version = 7
 endif

 ifeq ($(ARCH),armv8)
@@ -290,6 +293,7 @@ ifeq ($(ARCH),armv8)
 prefetch = yes
 popcnt = yes
 neon = yes
+arm_version = 8
 endif

 ifeq ($(ARCH),apple-silicon)
@@ -297,6 +301,7 @@ ifeq ($(ARCH),apple-silicon)
 prefetch = yes
 popcnt = yes
 neon = yes
+arm_version = 8
 endif

 ifeq ($(ARCH),ppc-32)
@@ -614,7 +619,7 @@ ifeq ($(mmx),yes)
 endif

 ifeq ($(neon),yes)
-CXXFLAGS += -DUSE_NEON
+CXXFLAGS += -DUSE_NEON=$(arm_version)
 ifeq ($(KERNEL),Linux)
 ifneq ($(COMP),ndk)
 ifneq ($(arch),armv8)
@@ -863,6 +868,7 @@ config-sanity: net
 	@echo "vnni256: '$(vnni256)'"
 	@echo "vnni512: '$(vnni512)'"
 	@echo "neon: '$(neon)'"
+	@echo "arm_version: '$(arm_version)'"
 	@echo ""
 	@echo "Flags:"
 	@echo "CXX: $(CXX)"

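The arm_version value is now baked into the USE_NEON macro, so C++ code can
distinguish AArch64 (USE_NEON >= 8) from 32-bit NEON at compile time, which
simd.h below relies on. A minimal sketch of that dispatch (assuming the same
macro convention; this is not code from the patch):

    #include <arm_neon.h>

    // Horizontal add of a 4-lane vector: AArch64 has a single instruction
    // for it; armv7-neon must extract and add the lanes one by one.
    static int horizontal_add(int32x4_t v) {
    #if USE_NEON >= 8
        return vaddvq_s32(v);
    #else
        return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1)
             + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
    #endif
    }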
src/nnue/layers/affine_transform.h

@@ -75,8 +75,7 @@ namespace Stockfish::Eval::NNUE::Layers {
       const auto inputVector = reinterpret_cast<const __m64*>(input);
 # elif defined(USE_NEON)
-      static_assert(PaddedInputDimensions % 16 == 0);
-      constexpr IndexType NumChunks = PaddedInputDimensions / 16;
+      constexpr IndexType NumChunks = (InputDimensions + 15) / 16;
       const auto inputVector = reinterpret_cast<const int8x8_t*>(input);
 # endif
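The chunk count is now derived from the unpadded input size, rounded up to
whole 16-byte chunks, so the loop no longer iterates over chunks that are
entirely padding (the padding is zero-filled, so skipping it cannot change the
result). A worked example with hypothetical dimensions:

    // Hypothetical layer with 40 real inputs, padded to a multiple of 32:
    constexpr IndexType InputDimensions       = 40;
    constexpr IndexType PaddedInputDimensions = 64;  // next multiple of 32

    constexpr IndexType OldNumChunks = PaddedInputDimensions / 16;   // 4
    constexpr IndexType NewNumChunks = (InputDimensions + 15) / 16;  // 3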
@@ -181,6 +180,9 @@ namespace Stockfish::Eval::NNUE::Layers {
 #elif defined (USE_SSSE3)
       static constexpr const IndexType InputSimdWidth = 16;
       static constexpr const IndexType MaxNumOutputRegs = 8;
+#elif defined (USE_NEON)
+      static constexpr const IndexType InputSimdWidth = 8;
+      static constexpr const IndexType MaxNumOutputRegs = 8;
 #else
       // The fallback implementation will not have permuted weights.
       // We define these to avoid a lot of ifdefs later.
@@ -270,52 +272,64 @@ namespace Stockfish::Eval::NNUE::Layers {
       OutputType* output = reinterpret_cast<OutputType*>(buffer);

 #if defined (USE_AVX512)
-      using vec_t = __m512i;
-      #define vec_setzero _mm512_setzero_si512
-      #define vec_set_32 _mm512_set1_epi32
-      #define vec_add_dpbusd_32 Simd::m512_add_dpbusd_epi32
+      using acc_vec_t = __m512i;
+      using bias_vec_t = __m128i;
+      using weight_vec_t = __m512i;
+      using in_vec_t = __m512i;
+      #define vec_zero _mm512_setzero_si512()
       #define vec_add_dpbusd_32x2 Simd::m512_add_dpbusd_epi32x2
       #define vec_hadd Simd::m512_hadd
       #define vec_haddx4 Simd::m512_haddx4
 #elif defined (USE_AVX2)
-      using vec_t = __m256i;
-      #define vec_setzero _mm256_setzero_si256
-      #define vec_set_32 _mm256_set1_epi32
-      #define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32
+      using acc_vec_t = __m256i;
+      using bias_vec_t = __m128i;
+      using weight_vec_t = __m256i;
+      using in_vec_t = __m256i;
+      #define vec_zero _mm256_setzero_si256()
       #define vec_add_dpbusd_32x2 Simd::m256_add_dpbusd_epi32x2
       #define vec_hadd Simd::m256_hadd
       #define vec_haddx4 Simd::m256_haddx4
 #elif defined (USE_SSSE3)
-      using vec_t = __m128i;
-      #define vec_setzero _mm_setzero_si128
-      #define vec_set_32 _mm_set1_epi32
-      #define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32
+      using acc_vec_t = __m128i;
+      using bias_vec_t = __m128i;
+      using weight_vec_t = __m128i;
+      using in_vec_t = __m128i;
+      #define vec_zero _mm_setzero_si128()
       #define vec_add_dpbusd_32x2 Simd::m128_add_dpbusd_epi32x2
       #define vec_hadd Simd::m128_hadd
       #define vec_haddx4 Simd::m128_haddx4
+#elif defined (USE_NEON)
+      using acc_vec_t = int32x4_t;
+      using bias_vec_t = int32x4_t;
+      using weight_vec_t = int8x8_t;
+      using in_vec_t = int8x8_t;
+      #define vec_zero {0}
+      #define vec_add_dpbusd_32x2 Simd::neon_m128_add_dpbusd_epi32x2
+      #define vec_hadd Simd::neon_m128_hadd
+      #define vec_haddx4 Simd::neon_m128_haddx4
 #endif

-#if defined (USE_SSSE3)
-      const vec_t* invec = reinterpret_cast<const vec_t*>(input);
+#if defined (USE_SSSE3) || defined (USE_NEON)
+      const in_vec_t* invec = reinterpret_cast<const in_vec_t*>(input);

       // Perform accumulation to registers for each big block
       for (IndexType bigBlock = 0; bigBlock < NumBigBlocks; ++bigBlock)
       {
-          vec_t acc[NumOutputRegs] = { vec_setzero() };
+          acc_vec_t acc[NumOutputRegs] = { vec_zero };

           // Each big block has NumOutputRegs small blocks in each "row", one per register.
           // We process two small blocks at a time to save on one addition without VNNI.
           for (IndexType smallBlock = 0; smallBlock < NumSmallBlocksPerOutput; smallBlock += 2)
           {
-              const vec_t* weightvec =
-                  reinterpret_cast<const vec_t*>(
+              const weight_vec_t* weightvec =
+                  reinterpret_cast<const weight_vec_t*>(
                       weights
                       + bigBlock * BigBlockSize
                       + smallBlock * SmallBlockSize * NumOutputRegs);

-              const vec_t in0 = invec[smallBlock + 0];
-              const vec_t in1 = invec[smallBlock + 1];
+              const in_vec_t in0 = invec[smallBlock + 0];
+              const in_vec_t in1 = invec[smallBlock + 1];

               for (IndexType k = 0; k < NumOutputRegs; ++k)
                   vec_add_dpbusd_32x2(acc[k], in0, weightvec[k], in1, weightvec[k + NumOutputRegs]);
@@ -324,8 +338,8 @@ namespace Stockfish::Eval::NNUE::Layers {
       // Horizontally add all accumulators.
       if constexpr (NumOutputRegs % 4 == 0)
       {
-          __m128i* outputvec = reinterpret_cast<__m128i*>(output);
-          const __m128i* biasvec = reinterpret_cast<const __m128i*>(biases);
+          bias_vec_t* outputvec = reinterpret_cast<bias_vec_t*>(output);
+          const bias_vec_t* biasvec = reinterpret_cast<const bias_vec_t*>(biases);

           for (IndexType k = 0; k < NumOutputRegs; k += 4)
           {
@@ -343,9 +357,7 @@ namespace Stockfish::Eval::NNUE::Layers {
           }
       }

-# undef vec_setzero
-# undef vec_set_32
-# undef vec_add_dpbusd_32
+# undef vec_zero
 # undef vec_add_dpbusd_32x2
 # undef vec_hadd
 # undef vec_haddx4
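To see how the NEON type choices fit the shared loop: each vec_add_dpbusd_32x2
call consumes two 8-byte input vectors and two 8-byte weight vectors,
accumulating into four int32 lanes. A self-contained toy version of one such
step (a sketch built from the same intrinsics, not code from the patch):

    #include <arm_neon.h>
    #include <cstdint>

    // Toy 16-element dot product from the building blocks the shared code
    // path uses. Inputs are post-activation values in [0, 127], so the
    // int16 intermediate (two products per lane) cannot overflow.
    int dot16(const int8_t* x, const int8_t* w) {
        int32x4_t acc = vdupq_n_s32(0);                 // vec_zero
        int8x8_t in0 = vld1_s8(x), in1 = vld1_s8(x + 8);
        int8x8_t w0  = vld1_s8(w), w1  = vld1_s8(w + 8);

        // Body of Simd::neon_m128_add_dpbusd_epi32x2:
        int16x8_t product = vmull_s8(in0, w0);          // 8 widening products
        product = vmlal_s8(product, in1, w1);           // + 8 more
        acc = vpadalq_s16(acc, product);                // pairwise into int32

        // Horizontal sum, as Simd::neon_m128_hadd does (with a zero bias):
        return vgetq_lane_s32(acc, 0) + vgetq_lane_s32(acc, 1)
             + vgetq_lane_s32(acc, 2) + vgetq_lane_s32(acc, 3);
    }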

src/nnue/nnue_feature_transformer.h

@@ -336,10 +336,17 @@ namespace Stockfish::Eval::NNUE {
       {
           const IndexType offset = HalfDimensions * p;
           const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
-          for (IndexType j = 0; j < NumChunks; ++j)
+
+          constexpr IndexType UnrollFactor = 16;
+          static_assert(NumChunks % UnrollFactor == 0);
+          for (IndexType j = 0; j < NumChunks; j += UnrollFactor)
           {
-              int16x8_t sum = reinterpret_cast<const int16x8_t*>(accumulation[perspectives[p]])[j];
-              out[j] = vmax_s8(vqmovn_s16(sum), Zero);
+              int16x8_t sums[UnrollFactor];
+              for (IndexType i = 0; i < UnrollFactor; ++i)
+                  sums[i] = reinterpret_cast<const int16x8_t*>(accumulation[perspectives[p]])[j + i];
+
+              for (IndexType i = 0; i < UnrollFactor; ++i)
+                  out[j + i] = vmax_s8(vqmovn_s16(sums[i]), Zero);
           }
       }

       return psqt;
src/simd.h

@@ -343,6 +343,45 @@ namespace Stockfish::Simd {

 #endif

+#if defined (USE_NEON)
+
+    [[maybe_unused]] static int neon_m128_reduce_add_epi32(int32x4_t s) {
+# if USE_NEON >= 8
+      return vaddvq_s32(s);
+# else
+      return s[0] + s[1] + s[2] + s[3];
+# endif
+    }
+
+    [[maybe_unused]] static int neon_m128_hadd(int32x4_t sum, int bias) {
+      return neon_m128_reduce_add_epi32(sum) + bias;
+    }
+
+    [[maybe_unused]] static int32x4_t neon_m128_haddx4(
+        int32x4_t sum0, int32x4_t sum1, int32x4_t sum2, int32x4_t sum3,
+        int32x4_t bias) {
+
+      int32x4_t hsums {
+        neon_m128_reduce_add_epi32(sum0),
+        neon_m128_reduce_add_epi32(sum1),
+        neon_m128_reduce_add_epi32(sum2),
+        neon_m128_reduce_add_epi32(sum3)
+      };
+
+      return vaddq_s32(hsums, bias);
+    }
+
+    [[maybe_unused]] static void neon_m128_add_dpbusd_epi32x2(
+        int32x4_t& acc,
+        int8x8_t a0, int8x8_t b0,
+        int8x8_t a1, int8x8_t b1) {
+
+      int16x8_t product = vmull_s8(a0, b0);
+      product = vmlal_s8(product, a1, b1);
+      acc = vpadalq_s16(acc, product);
+    }
+
+#endif
 }

 #endif // STOCKFISH_SIMD_H_INCLUDED
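For readers checking the arithmetic, a scalar model of what
neon_m128_add_dpbusd_epi32x2 computes (a verification sketch, not part of the
patch): vmull_s8/vmlal_s8 form the eight int16 sums a0[i]*b0[i] + a1[i]*b1[i],
and vpadalq_s16 folds adjacent pairs into the four int32 accumulator lanes.

    #include <cstdint>

    // Scalar equivalent: lane k of the accumulator gains the products of
    // byte pairs 2k and 2k+1 from both (a0, b0) and (a1, b1).
    void add_dpbusd_epi32x2_scalar(int32_t acc[4],
                                   const int8_t a0[8], const int8_t b0[8],
                                   const int8_t a1[8], const int8_t b1[8]) {
        for (int k = 0; k < 4; ++k)
            for (int i = 2 * k; i <= 2 * k + 1; ++i)
                acc[k] += a0[i] * b0[i] + a1[i] * b1[i];
    }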