Optimize find_nnz() using AVX512

About a 1% speedup for ARCH x86-64-avx512 and x86-64-vnni512.

Note:  This could be optimized further if we wanted to add an ARCH
supporting VBMI2 which is even more modern than VNNI.

https://en.wikichip.org/wiki/x86/avx512_vbmi2

closes https://github.com/official-stockfish/Stockfish/pull/6139

No functional change
This commit is contained in:
mstembera
2025-06-24 15:09:51 -07:00
committed by Disservin
parent ea85a54fef
commit ce7254b5ea
2 changed files with 36 additions and 2 deletions

View File

@@ -701,7 +701,7 @@ endif
ifeq ($(avx512),yes) ifeq ($(avx512),yes)
CXXFLAGS += -DUSE_AVX512 CXXFLAGS += -DUSE_AVX512
ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) ifeq ($(comp),$(filter $(comp),gcc clang mingw icx))
CXXFLAGS += -mavx512f -mavx512bw CXXFLAGS += -mavx512f -mavx512bw -mavx512dq -mavx512vl
endif endif
endif endif

View File

@@ -68,9 +68,42 @@ alignas(CacheLineSize) static constexpr struct OffsetIndices {
} Lookup; } Lookup;
#if defined(__GNUC__) || defined(__clang__)
#define RESTRICT __restrict__
#elif defined(_MSC_VER)
#define RESTRICT __restrict
#else
#define RESTRICT
#endif
// Find indices of nonzero numbers in an int32_t array // Find indices of nonzero numbers in an int32_t array
template<const IndexType InputDimensions> template<const IndexType InputDimensions>
void find_nnz(const std::int32_t* input, std::uint16_t* out, IndexType& count_out) { void find_nnz(const std::int32_t* RESTRICT input,
std::uint16_t* RESTRICT out,
IndexType& count_out) {
#ifdef USE_AVX512
constexpr IndexType SimdWidth = 16; // 512 bits / 32 bits
constexpr IndexType NumChunks = InputDimensions / SimdWidth;
const __m512i increment = _mm512_set1_epi32(SimdWidth);
__m512i base = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
IndexType count = 0;
for (IndexType i = 0; i < NumChunks; ++i)
{
const __m512i inputV = _mm512_load_si512(input + i * SimdWidth);
// Get a bitmask and gather non zero indices
const __mmask16 nnzMask = _mm512_test_epi32_mask(inputV, inputV);
const __m512i nnzV = _mm512_maskz_compress_epi32(nnzMask, base);
_mm512_mask_cvtepi32_storeu_epi16(out + count, 0xFFFF, nnzV);
count += popcount(nnzMask);
base = _mm512_add_epi32(base, increment);
}
count_out = count;
#else
using namespace SIMD; using namespace SIMD;
constexpr IndexType InputSimdWidth = sizeof(vec_uint_t) / sizeof(std::int32_t); constexpr IndexType InputSimdWidth = sizeof(vec_uint_t) / sizeof(std::int32_t);
@@ -104,6 +137,7 @@ void find_nnz(const std::int32_t* input, std::uint16_t* out, IndexType& count_ou
} }
} }
count_out = count; count_out = count;
#endif
} }
#endif #endif