From e7367cef0f5e7ccef36836ba072f165f5147e4f6 Mon Sep 17 00:00:00 2001
From: Shawn Xu
Date: Sun, 12 Jan 2025 19:38:02 -0800
Subject: [PATCH] Clean up pack reordering

closes https://github.com/official-stockfish/Stockfish/pull/5802

no functional change
---
 src/nnue/nnue_feature_transformer.h | 125 ++++++++++++++++------------
 1 file changed, 73 insertions(+), 52 deletions(-)

diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 7a37cda8..4f0ce6cf 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -26,6 +26,7 @@
 #include <cstdint>
 #include <cstring>
 #include <iosfwd>
+#include <iterator>
 #include <utility>
 
 #include "../position.h"
@@ -145,6 +146,46 @@ using psqt_vec_t = int32x4_t;
 
 #endif
 
+// Returns the inverse of a permutation
+template<std::size_t Len>
+constexpr std::array<std::size_t, Len>
+invert_permutation(const std::array<std::size_t, Len>& order) {
+    std::array<std::size_t, Len> inverse{};
+    for (std::size_t i = 0; i < order.size(); i++)
+        inverse[order[i]] = i;
+    return inverse;
+}
+
+// Divide a byte region of size TotalSize into blocks of size
+// BlockSize, and permute the blocks by a given order
+template<std::size_t BlockSize, typename T, std::size_t N, std::size_t OrderSize>
+void permute(T (&data)[N], const std::array<std::size_t, OrderSize>& order) {
+    constexpr std::size_t TotalSize = N * sizeof(T);
+
+    static_assert(TotalSize % (BlockSize * OrderSize) == 0,
+                  "BlockSize * OrderSize must perfectly divide TotalSize");
+
+    constexpr std::size_t ProcessChunkSize = BlockSize * OrderSize;
+
+    std::array<std::byte, ProcessChunkSize> buffer{};
+
+    std::byte* const bytes = reinterpret_cast<std::byte*>(data);
+
+    for (std::size_t i = 0; i < TotalSize; i += ProcessChunkSize)
+    {
+        std::byte* const values = &bytes[i];
+
+        for (std::size_t j = 0; j < OrderSize; j++)
+        {
+            auto* const buffer_chunk = &buffer[j * BlockSize];
+            auto* const value_chunk  = &values[order[j] * BlockSize];
+
+            std::copy(value_chunk, value_chunk + BlockSize, buffer_chunk);
+        }
+
+        std::copy(std::begin(buffer), std::end(buffer), values);
+    }
+}
 
 // Compute optimal SIMD register count for feature transformer accumulation.
 template
@@ -223,62 +264,42 @@ class FeatureTransformer {
     // Size of forward propagation buffer
     static constexpr std::size_t BufferSize = OutputDimensions * sizeof(OutputType);
 
+    // Store the order by which 128-bit blocks of 1024-bit data must
+    // be permuted so that calling packus on adjacent vectors of 16-bit
+    // integers loaded from the data results in the pre-permutation order
+    static constexpr auto PackusEpi16Order = []() -> std::array<std::size_t, 8> {
+#if defined(USE_AVX512)
+        // _mm512_packus_epi16 after permutation:
+        // |   0   |   2   |   4   |   6   | // Vector 0
+        // |   1   |   3   |   5   |   7   | // Vector 1
+        // | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | // Packed Result
+        return {0, 2, 4, 6, 1, 3, 5, 7};
+#elif defined(USE_AVX2)
+        // _mm256_packus_epi16 after permutation:
+        // |   0   |   2   |  |   4   |   6   | // Vector 0, 2
+        // |   1   |   3   |  |   5   |   7   | // Vector 1, 3
+        // | 0 | 1 | 2 | 3 |  | 4 | 5 | 6 | 7 | // Packed Result
+        return {0, 2, 1, 3, 4, 6, 5, 7};
+#else
+        return {0, 1, 2, 3, 4, 5, 6, 7};
+#endif
+    }();
+
+    static constexpr auto InversePackusEpi16Order = invert_permutation(PackusEpi16Order);
+
     // Hash value embedded in the evaluation file
     static constexpr std::uint32_t get_hash_value() {
         return FeatureSet::HashValue ^ (OutputDimensions * 2);
     }
 
-    static constexpr void order_packs([[maybe_unused]] uint64_t* v) {
-#if defined(USE_AVX512)  // _mm512_packs_epi16 ordering
-        uint64_t tmp0 = v[2], tmp1 = v[3];
-        v[2] = v[8], v[3] = v[9];
-        v[8] = v[4], v[9] = v[5];
-        v[4] = tmp0, v[5] = tmp1;
-        tmp0 = v[6], tmp1 = v[7];
-        v[6] = v[10], v[7] = v[11];
-        v[10] = v[12], v[11] = v[13];
-        v[12] = tmp0, v[13] = tmp1;
-#elif defined(USE_AVX2)  // _mm256_packs_epi16 ordering
-        std::swap(v[2], v[4]);
-        std::swap(v[3], v[5]);
-#endif
+    void permute_weights() {
+        permute<16>(biases, PackusEpi16Order);
+        permute<16>(weights, PackusEpi16Order);
     }
 
-    static constexpr void inverse_order_packs([[maybe_unused]] uint64_t* v) {
-#if defined(USE_AVX512)  // Inverse _mm512_packs_epi16 ordering
-        uint64_t tmp0 = v[2], tmp1 = v[3];
-        v[2] = v[4], v[3] = v[5];
-        v[4] = v[8], v[5] = v[9];
-        v[8] = tmp0, v[9] = tmp1;
-        tmp0 = v[6], tmp1 = v[7];
-        v[6] = v[12], v[7] = v[13];
-        v[12] = v[10], v[13] = v[11];
-        v[10] = tmp0, v[11] = tmp1;
-#elif defined(USE_AVX2)  // Inverse _mm256_packs_epi16 ordering
-        std::swap(v[2], v[4]);
-        std::swap(v[3], v[5]);
-#endif
-    }
-
-    void permute_weights([[maybe_unused]] void (*order_fn)(uint64_t*)) {
-#if defined(USE_AVX2)
-    #if defined(USE_AVX512)
-        constexpr IndexType di = 16;
-    #else
-        constexpr IndexType di = 8;
-    #endif
-        uint64_t* b = reinterpret_cast<uint64_t*>(&biases[0]);
-        for (IndexType i = 0; i < HalfDimensions * sizeof(BiasType) / sizeof(uint64_t); i += di)
-            order_fn(&b[i]);
-
-        for (IndexType j = 0; j < InputDimensions; ++j)
-        {
-            uint64_t* w = reinterpret_cast<uint64_t*>(&weights[j * HalfDimensions]);
-            for (IndexType i = 0; i < HalfDimensions * sizeof(WeightType) / sizeof(uint64_t);
-                 i += di)
-                order_fn(&w[i]);
-        }
-#endif
+    void unpermute_weights() {
+        permute<16>(biases, InversePackusEpi16Order);
+        permute<16>(weights, InversePackusEpi16Order);
     }
 
     inline void scale_weights(bool read) {
@@ -300,7 +321,7 @@ class FeatureTransformer {
         read_leb_128<WeightType>(stream, weights, HalfDimensions * InputDimensions);
         read_leb_128<PSQTWeightType>(stream, psqtWeights, PSQTBuckets * InputDimensions);
 
-        permute_weights(inverse_order_packs);
+        permute_weights();
         scale_weights(true);
         return !stream.fail();
     }
@@ -308,14 +329,14 @@ class FeatureTransformer {
 
     // Write network parameters
    bool write_parameters(std::ostream& stream) {
-        permute_weights(order_packs);
+        unpermute_weights();
         scale_weights(false);
 
         write_leb_128<BiasType>(stream, biases, HalfDimensions);
         write_leb_128<WeightType>(stream, weights, HalfDimensions * InputDimensions);
         write_leb_128<PSQTWeightType>(stream, psqtWeights, PSQTBuckets * InputDimensions);
 
-        permute_weights(inverse_order_packs);
+        permute_weights();
         scale_weights(true);
         return !stream.fail();
     }
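
A quick way to sanity-check the new helpers is a standalone round trip: permuting a 1024-bit row with one of the orders and then with its inverse must restore the original bytes, which is exactly the relationship read_parameters() and write_parameters() now rely on. The sketch below is not part of the patch: invert_permutation() and permute() are copied (lightly condensed) from the hunks above, while main(), the 64-element int16_t row, and the plain C++17 build are assumptions for illustration only.

#include <algorithm>
#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <numeric>

// Copied (condensed) from the patch: inverse of a permutation.
template<std::size_t Len>
constexpr std::array<std::size_t, Len>
invert_permutation(const std::array<std::size_t, Len>& order) {
    std::array<std::size_t, Len> inverse{};
    for (std::size_t i = 0; i < order.size(); i++)
        inverse[order[i]] = i;
    return inverse;
}

// Copied (condensed) from the patch: permute BlockSize-byte blocks by order.
template<std::size_t BlockSize, typename T, std::size_t N, std::size_t OrderSize>
void permute(T (&data)[N], const std::array<std::size_t, OrderSize>& order) {
    constexpr std::size_t TotalSize        = N * sizeof(T);
    constexpr std::size_t ProcessChunkSize = BlockSize * OrderSize;
    static_assert(TotalSize % ProcessChunkSize == 0);

    std::array<std::byte, ProcessChunkSize> buffer{};
    std::byte* const bytes = reinterpret_cast<std::byte*>(data);

    for (std::size_t i = 0; i < TotalSize; i += ProcessChunkSize)
    {
        std::byte* const values = &bytes[i];
        for (std::size_t j = 0; j < OrderSize; j++)
            std::copy(&values[order[j] * BlockSize],
                      &values[order[j] * BlockSize] + BlockSize, &buffer[j * BlockSize]);
        std::copy(std::begin(buffer), std::end(buffer), values);
    }
}

int main() {
    // One 1024-bit (128-byte) row of int16_t, i.e. 8 blocks of 16 bytes each,
    // mirroring how permute<16> is applied to biases and weights.
    std::int16_t row[64];
    std::iota(std::begin(row), std::end(row), std::int16_t(0));
    std::int16_t expected[64];
    std::copy(std::begin(row), std::end(row), std::begin(expected));

    constexpr std::array<std::size_t, 8> avx512_order{0, 2, 4, 6, 1, 3, 5, 7};
    constexpr auto                       inverse = invert_permutation(avx512_order);

    permute<16>(row, avx512_order);  // forward, as permute_weights() does
    permute<16>(row, inverse);       // backward, as unpermute_weights() does

    for (std::size_t i = 0; i < 64; i++)
        assert(row[i] == expected[i]);
    return 0;
}

Because invert_permutation() is constexpr, InversePackusEpi16Order in the patch is computed at compile time, so unpermute_weights() costs no more at runtime than permute_weights().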
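The specific orders {0, 2, 4, 6, 1, 3, 5, 7} (AVX-512) and {0, 2, 1, 3, 4, 6, 5, 7} (AVX2) fall out of how the pack instructions interleave 128-bit lanes, as the ASCII diagrams in the patch describe. The sketch below is not part of the patch and never calls the real intrinsics: it models that interleaving on block indices only (the interleaving tables are an assumption read off the diagrams) and checks that pre-permuting storage with PackusEpi16Order makes the packed result come out as blocks 0 through 7 in order.

#include <array>
#include <cassert>
#include <cstddef>

using Blocks = std::array<std::size_t, 8>;

// new[j] = old[order[j]], matching how permute() in the patch moves blocks.
Blocks apply(const Blocks& blocks, const Blocks& order) {
    Blocks out{};
    for (std::size_t j = 0; j < 8; j++)
        out[j] = blocks[order[j]];
    return out;
}

int main() {
    const Blocks identity{0, 1, 2, 3, 4, 5, 6, 7};

    // AVX2 model: two packs, each interleaving the 128-bit lanes of its two
    // 256-bit inputs as [a.lo, b.lo, a.hi, b.hi].
    const Blocks s2 = apply(identity, {0, 2, 1, 3, 4, 6, 5, 7});
    const Blocks packed_avx2{s2[0], s2[2], s2[1], s2[3], s2[4], s2[6], s2[5], s2[7]};

    // AVX-512 model: one pack, interleaving the four 128-bit lanes of its two
    // 512-bit inputs as [a.l0, b.l0, a.l1, b.l1, a.l2, b.l2, a.l3, b.l3].
    const Blocks s5 = apply(identity, {0, 2, 4, 6, 1, 3, 5, 7});
    const Blocks packed_avx512{s5[0], s5[4], s5[1], s5[5], s5[2], s5[6], s5[3], s5[7]};

    for (std::size_t i = 0; i < 8; i++)
    {
        assert(packed_avx2[i] == i);
        assert(packed_avx512[i] == i);
    }
    return 0;
}

Note that the AVX2 order is its own inverse while the AVX-512 order is not, which is why the patch derives InversePackusEpi16Order with invert_permutation() instead of hard-coding it.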