mirror of
https://github.com/HChaZZY/Stockfish.git
synced 2025-12-06 10:53:50 +08:00
Optimize find_nnz() using AVX512
About a 1% speedup for ARCH x86-64-avx512 and x86-64-vnni512.

Note: This could be optimized further if we wanted to add an ARCH supporting VBMI2, which is even more modern than VNNI: https://en.wikichip.org/wiki/x86/avx512_vbmi2

closes https://github.com/official-stockfish/Stockfish/pull/6139

No functional change
This commit is contained in:
@@ -701,7 +701,7 @@ endif
|
|||||||
ifeq ($(avx512),yes)
|
ifeq ($(avx512),yes)
|
||||||
CXXFLAGS += -DUSE_AVX512
|
CXXFLAGS += -DUSE_AVX512
|
||||||
ifeq ($(comp),$(filter $(comp),gcc clang mingw icx))
|
ifeq ($(comp),$(filter $(comp),gcc clang mingw icx))
|
||||||
CXXFLAGS += -mavx512f -mavx512bw
|
CXXFLAGS += -mavx512f -mavx512bw -mavx512dq -mavx512vl
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|||||||
@@ -68,9 +68,42 @@ alignas(CacheLineSize) static constexpr struct OffsetIndices {
|
|||||||
|
|
||||||
} Lookup;
|
} Lookup;
|
||||||
|
|
||||||
|
#if defined(__GNUC__) || defined(__clang__)
|
||||||
|
#define RESTRICT __restrict__
|
||||||
|
#elif defined(_MSC_VER)
|
||||||
|
#define RESTRICT __restrict
|
||||||
|
#else
|
||||||
|
#define RESTRICT
|
||||||
|
#endif
|
||||||
|
|
||||||
// Find indices of nonzero numbers in an int32_t array
|
// Find indices of nonzero numbers in an int32_t array
|
||||||
template<const IndexType InputDimensions>
|
template<const IndexType InputDimensions>
|
||||||
void find_nnz(const std::int32_t* input, std::uint16_t* out, IndexType& count_out) {
|
void find_nnz(const std::int32_t* RESTRICT input,
|
||||||
|
std::uint16_t* RESTRICT out,
|
||||||
|
IndexType& count_out) {
|
||||||
|
|
||||||
|
#ifdef USE_AVX512
|
||||||
|
constexpr IndexType SimdWidth = 16; // 512 bits / 32 bits
|
||||||
|
constexpr IndexType NumChunks = InputDimensions / SimdWidth;
|
||||||
|
const __m512i increment = _mm512_set1_epi32(SimdWidth);
|
||||||
|
__m512i base = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||||
|
|
||||||
|
IndexType count = 0;
|
||||||
|
for (IndexType i = 0; i < NumChunks; ++i)
|
||||||
|
{
|
||||||
|
const __m512i inputV = _mm512_load_si512(input + i * SimdWidth);
|
||||||
|
|
||||||
|
// Get a bitmask and gather non zero indices
|
||||||
|
const __mmask16 nnzMask = _mm512_test_epi32_mask(inputV, inputV);
|
||||||
|
const __m512i nnzV = _mm512_maskz_compress_epi32(nnzMask, base);
|
||||||
|
_mm512_mask_cvtepi32_storeu_epi16(out + count, 0xFFFF, nnzV);
|
||||||
|
count += popcount(nnzMask);
|
||||||
|
base = _mm512_add_epi32(base, increment);
|
||||||
|
}
|
||||||
|
count_out = count;
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
using namespace SIMD;
|
using namespace SIMD;
|
||||||
|
|
||||||
constexpr IndexType InputSimdWidth = sizeof(vec_uint_t) / sizeof(std::int32_t);
|
constexpr IndexType InputSimdWidth = sizeof(vec_uint_t) / sizeof(std::int32_t);
|
||||||
@@ -104,6 +137,7 @@ void find_nnz(const std::int32_t* input, std::uint16_t* out, IndexType& count_ou
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
count_out = count;
|
count_out = count;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
Reference in New Issue
Block a user