add clang-format
This introduces clang-format to enforce a consistent code style for Stockfish. A documented, consistent style across the code makes contributing easier for new developers and makes larger changes to the codebase easier to carry out.

To facilitate formatting, this PR includes a Makefile target (`make format`) that formats the code; it requires clang-format (currently version 17) to be installed locally. Installing clang-format is straightforward on most operating systems and distros (e.g. via https://apt.llvm.org/, `brew install clang-format`, etc.), as it is part of a widely used suite of tools and compilers (llvm/clang).

Additionally, a CI action verifies whether the code requires formatting and comments on the PR as needed. Initially, correct formatting is not required; it will be applied by maintainers as part of the merge or in later commits, though it is of course encouraged.

fixes https://github.com/official-stockfish/Stockfish/issues/3608
closes https://github.com/official-stockfish/Stockfish/pull/4790

Co-Authored-By: Joost VandeVondele <Joost.VandeVondele@gmail.com>
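For local use the workflow typically reduces to a single invocation of the tool. A minimal sketch of such a target follows; the recipe and file list here are illustrative assumptions, not the PR's actual Makefile code:

# Illustrative only; the real "format" target ships with the PR.
format:
	clang-format -i -style=file $(wildcard *.cpp *.h)   # needs clang-format 17 on PATH; reads .clang-format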
committed by Joost VandeVondele
parent 8366ec48ae
commit 2d0237db3f
affine_transform.h
@@ -42,95 +42,102 @@ namespace Stockfish::Eval::NNUE::Layers {

// Fallback implementation for older/other architectures.
// Requires the input to be padded to at least 16 values.
#if !defined(USE_SSSE3)
template<IndexType InputDimensions, IndexType PaddedInputDimensions, IndexType OutputDimensions>
static void affine_transform_non_ssse3(std::int32_t*       output,
                                       const std::int8_t*  weights,
                                       const std::int32_t* biases,
                                       const std::uint8_t* input) {
#if defined(USE_SSE2) || defined(USE_NEON_DOTPROD) || defined(USE_NEON)
#if defined(USE_SSE2)
    // At least a multiple of 16, with SSE2.
    constexpr IndexType NumChunks   = ceil_to_multiple<IndexType>(InputDimensions, 16) / 16;
    const __m128i       Zeros       = _mm_setzero_si128();
    const auto          inputVector = reinterpret_cast<const __m128i*>(input);

#elif defined(USE_NEON_DOTPROD)
    constexpr IndexType NumChunks   = ceil_to_multiple<IndexType>(InputDimensions, 16) / 16;
    const auto          inputVector = reinterpret_cast<const int8x16_t*>(input);

#elif defined(USE_NEON)
    constexpr IndexType NumChunks   = ceil_to_multiple<IndexType>(InputDimensions, 16) / 16;
    const auto          inputVector = reinterpret_cast<const int8x8_t*>(input);
#endif

    for (IndexType i = 0; i < OutputDimensions; ++i)
    {
        const IndexType offset = i * PaddedInputDimensions;

#if defined(USE_SSE2)
        __m128i    sumLo = _mm_cvtsi32_si128(biases[i]);
        __m128i    sumHi = Zeros;
        const auto row   = reinterpret_cast<const __m128i*>(&weights[offset]);
        for (IndexType j = 0; j < NumChunks; ++j)
        {
            __m128i row_j           = _mm_load_si128(&row[j]);
            __m128i input_j         = _mm_load_si128(&inputVector[j]);
            __m128i extendedRowLo   = _mm_srai_epi16(_mm_unpacklo_epi8(row_j, row_j), 8);
            __m128i extendedRowHi   = _mm_srai_epi16(_mm_unpackhi_epi8(row_j, row_j), 8);
            __m128i extendedInputLo = _mm_unpacklo_epi8(input_j, Zeros);
            __m128i extendedInputHi = _mm_unpackhi_epi8(input_j, Zeros);
            __m128i productLo       = _mm_madd_epi16(extendedRowLo, extendedInputLo);
            __m128i productHi       = _mm_madd_epi16(extendedRowHi, extendedInputHi);
            sumLo                   = _mm_add_epi32(sumLo, productLo);
            sumHi                   = _mm_add_epi32(sumHi, productHi);
        }
        __m128i sum           = _mm_add_epi32(sumLo, sumHi);
        __m128i sumHigh_64    = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2));
        sum                   = _mm_add_epi32(sum, sumHigh_64);
        __m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2));
        sum                   = _mm_add_epi32(sum, sum_second_32);
        output[i]             = _mm_cvtsi128_si32(sum);

#elif defined(USE_NEON_DOTPROD)
        int32x4_t  sum = {biases[i]};
        const auto row = reinterpret_cast<const int8x16_t*>(&weights[offset]);
        for (IndexType j = 0; j < NumChunks; ++j)
        {
            sum = vdotq_s32(sum, inputVector[j], row[j]);
        }
        output[i] = vaddvq_s32(sum);

#elif defined(USE_NEON)
        int32x4_t  sum = {biases[i]};
        const auto row = reinterpret_cast<const int8x8_t*>(&weights[offset]);
        for (IndexType j = 0; j < NumChunks; ++j)
        {
            int16x8_t product = vmull_s8(inputVector[j * 2], row[j * 2]);
            product           = vmlal_s8(product, inputVector[j * 2 + 1], row[j * 2 + 1]);
            sum               = vpadalq_s16(sum, product);
        }
        output[i] = sum[0] + sum[1] + sum[2] + sum[3];
#endif
    }
#else
    std::memcpy(output, biases, sizeof(std::int32_t) * OutputDimensions);

    // Traverse weights in transpose order to take advantage of input sparsity
    for (IndexType i = 0; i < InputDimensions; ++i)
        if (input[i])
        {
            const std::int8_t* w  = &weights[i];
            const int          in = input[i];
            for (IndexType j = 0; j < OutputDimensions; ++j)
                output[j] += w[j * PaddedInputDimensions] * in;
        }
#endif
}
#endif

template<IndexType InDims, IndexType OutDims>
class AffineTransform {
   public:
    // Input/output type
    using InputType  = std::uint8_t;
    using OutputType = std::int32_t;

    // Number of input/output dimensions
    static constexpr IndexType InputDimensions  = InDims;
    static constexpr IndexType OutputDimensions = OutDims;

    static constexpr IndexType PaddedInputDimensions =
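For orientation while reading the hunk above: affine_transform_non_ssse3 computes, for each output neuron, a bias plus the dot product of one weight row with the input vector (int8 weights, uint8 activations). A plain scalar equivalent, a sketch for illustration only and not part of the commit (it ignores the 16-value padding requirement):

#include <cstdint>

// Scalar sketch of what affine_transform_non_ssse3 computes.
template<unsigned InputDimensions, unsigned PaddedInputDimensions, unsigned OutputDimensions>
void affine_transform_scalar(std::int32_t* output, const std::int8_t* weights,
                             const std::int32_t* biases, const std::uint8_t* input) {
    for (unsigned i = 0; i < OutputDimensions; ++i)
    {
        std::int32_t       sum = biases[i];
        const std::int8_t* row = &weights[i * PaddedInputDimensions];
        for (unsigned j = 0; j < InputDimensions; ++j)
            sum += int(row[j]) * int(input[j]);  // int8 weight times uint8 activation
        output[i] = sum;
    }
}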
@@ -142,175 +149,168 @@ namespace Stockfish::Eval::NNUE::Layers {

    // Hash value embedded in the evaluation file
    static constexpr std::uint32_t get_hash_value(std::uint32_t prevHash) {
        std::uint32_t hashValue = 0xCC03DAE4u;
        hashValue += OutputDimensions;
        hashValue ^= prevHash >> 1;
        hashValue ^= prevHash << 31;
        return hashValue;
    }

    static constexpr IndexType get_weight_index_scrambled(IndexType i) {
        return (i / 4) % (PaddedInputDimensions / 4) * OutputDimensions * 4
             + i / PaddedInputDimensions * 4 + i % 4;
    }

    static constexpr IndexType get_weight_index(IndexType i) {
#if defined(USE_SSSE3)
        return get_weight_index_scrambled(i);
#else
        return i;
#endif
    }

    // Read network parameters
    bool read_parameters(std::istream& stream) {
        read_little_endian<BiasType>(stream, biases, OutputDimensions);
        for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
            weights[get_weight_index(i)] = read_little_endian<WeightType>(stream);

        return !stream.fail();
    }

    // Write network parameters
    bool write_parameters(std::ostream& stream) const {
        write_little_endian<BiasType>(stream, biases, OutputDimensions);

        for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
            write_little_endian<WeightType>(stream, weights[get_weight_index(i)]);

        return !stream.fail();
    }
    // Forward propagation
    void propagate(const InputType* input, OutputType* output) const {

#if defined(USE_SSSE3)

        if constexpr (OutputDimensions > 1)
        {

#if defined(USE_AVX512)
            using vec_t = __m512i;
#define vec_setzero _mm512_setzero_si512
#define vec_set_32 _mm512_set1_epi32
#define vec_add_dpbusd_32 Simd::m512_add_dpbusd_epi32
#define vec_add_dpbusd_32x2 Simd::m512_add_dpbusd_epi32x2
#define vec_hadd Simd::m512_hadd
#elif defined(USE_AVX2)
            using vec_t = __m256i;
#define vec_setzero _mm256_setzero_si256
#define vec_set_32 _mm256_set1_epi32
#define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32
#define vec_add_dpbusd_32x2 Simd::m256_add_dpbusd_epi32x2
#define vec_hadd Simd::m256_hadd
#elif defined(USE_SSSE3)
            using vec_t = __m128i;
#define vec_setzero _mm_setzero_si128
#define vec_set_32 _mm_set1_epi32
#define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32
#define vec_add_dpbusd_32x2 Simd::m128_add_dpbusd_epi32x2
#define vec_hadd Simd::m128_hadd
#endif

            static constexpr IndexType OutputSimdWidth = sizeof(vec_t) / sizeof(OutputType);

            static_assert(OutputDimensions % OutputSimdWidth == 0);

            constexpr IndexType NumChunks = ceil_to_multiple<IndexType>(InputDimensions, 8) / 4;
            constexpr IndexType NumRegs   = OutputDimensions / OutputSimdWidth;

            const auto   input32 = reinterpret_cast<const std::int32_t*>(input);
            const vec_t* biasvec = reinterpret_cast<const vec_t*>(biases);
            vec_t        acc[NumRegs];
            for (IndexType k = 0; k < NumRegs; ++k)
                acc[k] = biasvec[k];

            for (IndexType i = 0; i < NumChunks; i += 2)
            {
                const vec_t in0 = vec_set_32(input32[i + 0]);
                const vec_t in1 = vec_set_32(input32[i + 1]);
                const auto  col0 =
                  reinterpret_cast<const vec_t*>(&weights[(i + 0) * OutputDimensions * 4]);
                const auto col1 =
                  reinterpret_cast<const vec_t*>(&weights[(i + 1) * OutputDimensions * 4]);
                for (IndexType k = 0; k < NumRegs; ++k)
                    vec_add_dpbusd_32x2(acc[k], in0, col0[k], in1, col1[k]);
            }

            vec_t* outptr = reinterpret_cast<vec_t*>(output);
            for (IndexType k = 0; k < NumRegs; ++k)
                outptr[k] = acc[k];

#undef vec_setzero
#undef vec_set_32
#undef vec_add_dpbusd_32
#undef vec_add_dpbusd_32x2
#undef vec_hadd
        }
        else if constexpr (OutputDimensions == 1)
        {

// We cannot use AVX512 for the last layer because there's only 32 inputs
// and the buffer is not padded to 64 elements.
#if defined(USE_AVX2)
            using vec_t = __m256i;
#define vec_setzero _mm256_setzero_si256
#define vec_set_32 _mm256_set1_epi32
#define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32
#define vec_add_dpbusd_32x2 Simd::m256_add_dpbusd_epi32x2
#define vec_hadd Simd::m256_hadd
#elif defined(USE_SSSE3)
            using vec_t = __m128i;
#define vec_setzero _mm_setzero_si128
#define vec_set_32 _mm_set1_epi32
#define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32
#define vec_add_dpbusd_32x2 Simd::m128_add_dpbusd_epi32x2
#define vec_hadd Simd::m128_hadd
#endif

            const auto inputVector = reinterpret_cast<const vec_t*>(input);

            static constexpr IndexType InputSimdWidth = sizeof(vec_t) / sizeof(InputType);

            static_assert(PaddedInputDimensions % InputSimdWidth == 0);

            constexpr IndexType NumChunks = PaddedInputDimensions / InputSimdWidth;
            vec_t               sum0      = vec_setzero();
            const auto          row0      = reinterpret_cast<const vec_t*>(&weights[0]);

            for (int j = 0; j < int(NumChunks); ++j)
            {
                const vec_t in = inputVector[j];
                vec_add_dpbusd_32(sum0, in, row0[j]);
            }
            output[0] = vec_hadd(sum0, biases[0]);

#undef vec_setzero
#undef vec_set_32
#undef vec_add_dpbusd_32
#undef vec_add_dpbusd_32x2
#undef vec_hadd
        }
#else
        // Use old implementation for the other architectures.
        affine_transform_non_ssse3<InputDimensions, PaddedInputDimensions, OutputDimensions>(
          output, weights, biases, input);
#endif
    }

   private:
    using BiasType   = OutputType;
    using WeightType = std::int8_t;

    alignas(CacheLineSize) BiasType biases[OutputDimensions];
    alignas(CacheLineSize) WeightType weights[OutputDimensions * PaddedInputDimensions];
};

}  // namespace Stockfish::Eval::NNUE::Layers

#endif  // #ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
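A note on get_weight_index_scrambled() above: it interleaves the row-major weight matrix in 4-byte groups so that propagate() can broadcast one 32-bit input block and load the matching weights for all outputs contiguously. A minimal standalone demo of the mapping (the dimensions below are arbitrary illustrative values, not the network's):

#include <cstdio>

// Toy dimensions, chosen only to keep the output short.
constexpr unsigned PaddedInputDimensions = 8;
constexpr unsigned OutputDimensions      = 2;

// Same formula as get_weight_index_scrambled() above.
constexpr unsigned scrambled(unsigned i) {
    return (i / 4) % (PaddedInputDimensions / 4) * OutputDimensions * 4
         + i / PaddedInputDimensions * 4 + i % 4;
}

int main() {
    // Maps row-major weight index i to its storage slot: the 4-byte weight groups
    // of all output rows are interleaved.
    for (unsigned i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
        std::printf("%2u -> %2u\n", i, scrambled(i));
}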
affine_transform_sparse_input.h
@@ -38,104 +38,110 @@

namespace Stockfish::Eval::NNUE::Layers {

#if (USE_SSSE3 | (USE_NEON >= 8))
alignas(CacheLineSize) static inline const
  std::array<std::array<std::uint16_t, 8>, 256> lookup_indices = []() {
      std::array<std::array<std::uint16_t, 8>, 256> v{};
      for (unsigned i = 0; i < 256; ++i)
      {
          std::uint64_t j = i, k = 0;
          while (j)
              v[i][k++] = pop_lsb(j);
      }
      return v;
  }();

// Find indices of nonzero numbers in an int32_t array
template<const IndexType InputDimensions>
void find_nnz(const std::int32_t* input, std::uint16_t* out, IndexType& count_out) {
#if defined(USE_SSSE3)
#if defined(USE_AVX512)
    using vec_t = __m512i;
#define vec_nnz(a) _mm512_cmpgt_epi32_mask(a, _mm512_setzero_si512())
#elif defined(USE_AVX2)
    using vec_t = __m256i;
#if defined(USE_VNNI) && !defined(USE_AVXVNNI)
#define vec_nnz(a) _mm256_cmpgt_epi32_mask(a, _mm256_setzero_si256())
#else
#define vec_nnz(a) \
    _mm256_movemask_ps( \
      _mm256_castsi256_ps(_mm256_cmpgt_epi32(a, _mm256_setzero_si256())))
#endif
#elif defined(USE_SSSE3)
    using vec_t = __m128i;
#define vec_nnz(a) \
    _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpgt_epi32(a, _mm_setzero_si128())))
#endif
    using vec128_t = __m128i;
#define vec128_zero _mm_setzero_si128()
#define vec128_set_16(a) _mm_set1_epi16(a)
#define vec128_load(a) _mm_load_si128(a)
#define vec128_storeu(a, b) _mm_storeu_si128(a, b)
#define vec128_add(a, b) _mm_add_epi16(a, b)
#elif defined(USE_NEON)
    using vec_t = uint32x4_t;
    static const std::uint32_t Mask[4] = {1, 2, 4, 8};
#define vec_nnz(a) vaddvq_u32(vandq_u32(vtstq_u32(a, a), vld1q_u32(Mask)))
    using vec128_t = uint16x8_t;
#define vec128_zero vdupq_n_u16(0)
#define vec128_set_16(a) vdupq_n_u16(a)
#define vec128_load(a) vld1q_u16(reinterpret_cast<const std::uint16_t*>(a))
#define vec128_storeu(a, b) vst1q_u16(reinterpret_cast<std::uint16_t*>(a), b)
#define vec128_add(a, b) vaddq_u16(a, b)
#endif
    constexpr IndexType InputSimdWidth = sizeof(vec_t) / sizeof(std::int32_t);
    // Inputs are processed InputSimdWidth at a time and outputs are processed 8 at a time
    // so we process in chunks of max(InputSimdWidth, 8)
    constexpr IndexType ChunkSize       = std::max<IndexType>(InputSimdWidth, 8);
    constexpr IndexType NumChunks       = InputDimensions / ChunkSize;
    constexpr IndexType InputsPerChunk  = ChunkSize / InputSimdWidth;
    constexpr IndexType OutputsPerChunk = ChunkSize / 8;

    const auto     inputVector = reinterpret_cast<const vec_t*>(input);
    IndexType      count       = 0;
    vec128_t       base        = vec128_zero;
    const vec128_t increment   = vec128_set_16(8);
    for (IndexType i = 0; i < NumChunks; ++i)
    {
        // bitmask of nonzero values in this chunk
        unsigned nnz = 0;
        for (IndexType j = 0; j < InputsPerChunk; ++j)
        {
            const vec_t inputChunk = inputVector[i * InputsPerChunk + j];
            nnz |= unsigned(vec_nnz(inputChunk)) << (j * InputSimdWidth);
        }
        for (IndexType j = 0; j < OutputsPerChunk; ++j)
        {
            const auto lookup = (nnz >> (j * 8)) & 0xFF;
            const auto offsets =
              vec128_load(reinterpret_cast<const vec128_t*>(&lookup_indices[lookup]));
            vec128_storeu(reinterpret_cast<vec128_t*>(out + count), vec128_add(base, offsets));
            count += popcount(lookup);
            base = vec128_add(base, increment);
        }
    }
    count_out = count;
}
#undef vec_nnz
#undef vec128_zero
#undef vec128_set_16
#undef vec128_load
#undef vec128_storeu
#undef vec128_add
#endif

// Sparse input implementation
template<IndexType InDims, IndexType OutDims>
class AffineTransformSparseInput {
   public:
    // Input/output type
    using InputType  = std::uint8_t;
    using OutputType = std::int32_t;

    // Number of input/output dimensions
    static constexpr IndexType InputDimensions  = InDims;
    static constexpr IndexType OutputDimensions = OutDims;

    static_assert(OutputDimensions % 16 == 0,
                  "Only implemented for OutputDimensions divisible by 16.");

    static constexpr IndexType PaddedInputDimensions =
      ceil_to_multiple<IndexType>(InputDimensions, MaxSimdWidth);
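find_nnz() above is a vectorised version of a simple idea: write out the index of every nonzero 32-bit block and count them; the lookup_indices table just maps each 8-bit nonzero mask to the positions of its set bits so eight candidates can be emitted per step. A scalar sketch of the same contract (illustrative only, not the committed code):

#include <cstdint>

// Scalar sketch of what find_nnz() computes.
template<unsigned InputDimensions>
void find_nnz_scalar(const std::int32_t* input, std::uint16_t* out, unsigned& count_out) {
    unsigned count = 0;
    for (unsigned i = 0; i < InputDimensions; ++i)
        if (input[i])                        // record the index of every nonzero value
            out[count++] = std::uint16_t(i);
    count_out = count;
}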
@@ -152,127 +158,121 @@ namespace Stockfish::Eval::NNUE::Layers {

    // Hash value embedded in the evaluation file
    static constexpr std::uint32_t get_hash_value(std::uint32_t prevHash) {
        std::uint32_t hashValue = 0xCC03DAE4u;
        hashValue += OutputDimensions;
        hashValue ^= prevHash >> 1;
        hashValue ^= prevHash << 31;
        return hashValue;
    }

    static constexpr IndexType get_weight_index_scrambled(IndexType i) {
        return (i / ChunkSize) % (PaddedInputDimensions / ChunkSize) * OutputDimensions * ChunkSize
             + i / PaddedInputDimensions * ChunkSize + i % ChunkSize;
    }

    static constexpr IndexType get_weight_index(IndexType i) {
#if (USE_SSSE3 | (USE_NEON >= 8))
        return get_weight_index_scrambled(i);
#else
        return i;
#endif
    }

    // Read network parameters
    bool read_parameters(std::istream& stream) {
        read_little_endian<BiasType>(stream, biases, OutputDimensions);
        for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
            weights[get_weight_index(i)] = read_little_endian<WeightType>(stream);

        return !stream.fail();
    }

    // Write network parameters
    bool write_parameters(std::ostream& stream) const {
        write_little_endian<BiasType>(stream, biases, OutputDimensions);

        for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
            write_little_endian<WeightType>(stream, weights[get_weight_index(i)]);

        return !stream.fail();
    }
    // Forward propagation
    void propagate(const InputType* input, OutputType* output) const {

#if (USE_SSSE3 | (USE_NEON >= 8))
#if defined(USE_AVX512)
        using invec_t  = __m512i;
        using outvec_t = __m512i;
#define vec_set_32 _mm512_set1_epi32
#define vec_add_dpbusd_32 Simd::m512_add_dpbusd_epi32
#elif defined(USE_AVX2)
        using invec_t  = __m256i;
        using outvec_t = __m256i;
#define vec_set_32 _mm256_set1_epi32
#define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32
#elif defined(USE_SSSE3)
        using invec_t  = __m128i;
        using outvec_t = __m128i;
#define vec_set_32 _mm_set1_epi32
#define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32
#elif defined(USE_NEON_DOTPROD)
        using invec_t  = int8x16_t;
        using outvec_t = int32x4_t;
#define vec_set_32(a) vreinterpretq_s8_u32(vdupq_n_u32(a))
#define vec_add_dpbusd_32 Simd::dotprod_m128_add_dpbusd_epi32
#elif defined(USE_NEON)
        using invec_t  = int8x16_t;
        using outvec_t = int32x4_t;
#define vec_set_32(a) vreinterpretq_s8_u32(vdupq_n_u32(a))
#define vec_add_dpbusd_32 Simd::neon_m128_add_dpbusd_epi32
#endif
        static constexpr IndexType OutputSimdWidth = sizeof(outvec_t) / sizeof(OutputType);

        constexpr IndexType NumChunks = ceil_to_multiple<IndexType>(InputDimensions, 8) / ChunkSize;
        constexpr IndexType NumRegs   = OutputDimensions / OutputSimdWidth;
        std::uint16_t       nnz[NumChunks];
        IndexType           count;

        const auto input32 = reinterpret_cast<const std::int32_t*>(input);

        // Find indices of nonzero 32bit blocks
        find_nnz<NumChunks>(input32, nnz, count);

        const outvec_t* biasvec = reinterpret_cast<const outvec_t*>(biases);
        outvec_t        acc[NumRegs];
        for (IndexType k = 0; k < NumRegs; ++k)
            acc[k] = biasvec[k];

        for (IndexType j = 0; j < count; ++j)
        {
            const auto    i  = nnz[j];
            const invec_t in = vec_set_32(input32[i]);
            const auto    col =
              reinterpret_cast<const invec_t*>(&weights[i * OutputDimensions * ChunkSize]);
            for (IndexType k = 0; k < NumRegs; ++k)
                vec_add_dpbusd_32(acc[k], in, col[k]);
        }

        outvec_t* outptr = reinterpret_cast<outvec_t*>(output);
        for (IndexType k = 0; k < NumRegs; ++k)
            outptr[k] = acc[k];
#undef vec_set_32
#undef vec_add_dpbusd_32
#else
        // Use dense implementation for the other architectures.
        affine_transform_non_ssse3<InputDimensions, PaddedInputDimensions, OutputDimensions>(
          output, weights, biases, input);
#endif
    }

   private:
    using BiasType   = OutputType;
    using WeightType = std::int8_t;

    alignas(CacheLineSize) BiasType biases[OutputDimensions];
    alignas(CacheLineSize) WeightType weights[OutputDimensions * PaddedInputDimensions];
};

}  // namespace Stockfish::Eval::NNUE::Layers

#endif  // #ifndef NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED
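Putting the two pieces together: the sparse layer touches only the weight columns whose 4-byte input block is nonzero. A scalar analogue of propagate() follows; it is a sketch that deliberately ignores the scrambled weight layout and SIMD registers and assumes a plain row-major weight matrix (an assumption for clarity only):

#include <cstdint>

// Illustrative scalar analogue of the sparse propagate() above.
void sparse_affine_scalar(std::int32_t* output, const std::int8_t* weights,
                          const std::int32_t* biases, const std::uint8_t* input,
                          unsigned inputDims, unsigned outputDims) {
    for (unsigned k = 0; k < outputDims; ++k)
        output[k] = biases[k];

    for (unsigned i = 0; i < inputDims / 4; ++i)   // walk 32-bit input blocks
    {
        const std::uint8_t* in = &input[i * 4];
        if (!(in[0] | in[1] | in[2] | in[3]))
            continue;                              // zero block: skip its weight columns
        for (unsigned k = 0; k < outputDims; ++k)
            for (unsigned b = 0; b < 4; ++b)
                output[k] += int(weights[k * inputDims + i * 4 + b]) * int(in[b]);
    }
}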
clipped_relu.h
@@ -29,136 +29,140 @@

namespace Stockfish::Eval::NNUE::Layers {

// Clipped ReLU
template<IndexType InDims>
class ClippedReLU {
   public:
    // Input/output type
    using InputType  = std::int32_t;
    using OutputType = std::uint8_t;

    // Number of input/output dimensions
    static constexpr IndexType InputDimensions  = InDims;
    static constexpr IndexType OutputDimensions = InputDimensions;
    static constexpr IndexType PaddedOutputDimensions =
      ceil_to_multiple<IndexType>(OutputDimensions, 32);

    using OutputBuffer = OutputType[PaddedOutputDimensions];

    // Hash value embedded in the evaluation file
    static constexpr std::uint32_t get_hash_value(std::uint32_t prevHash) {
        std::uint32_t hashValue = 0x538D24C7u;
        hashValue += prevHash;
        return hashValue;
    }

    // Read network parameters
    bool read_parameters(std::istream&) { return true; }

    // Write network parameters
    bool write_parameters(std::ostream&) const { return true; }

    // Forward propagation
    void propagate(const InputType* input, OutputType* output) const {

#if defined(USE_AVX2)
        if constexpr (InputDimensions % SimdWidth == 0)
        {
            constexpr IndexType NumChunks = InputDimensions / SimdWidth;
            const __m256i       Zero      = _mm256_setzero_si256();
            const __m256i       Offsets   = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
            const auto          in        = reinterpret_cast<const __m256i*>(input);
            const auto          out       = reinterpret_cast<__m256i*>(output);
            for (IndexType i = 0; i < NumChunks; ++i)
            {
                const __m256i words0 =
                  _mm256_srai_epi16(_mm256_packs_epi32(_mm256_load_si256(&in[i * 4 + 0]),
                                                       _mm256_load_si256(&in[i * 4 + 1])),
                                    WeightScaleBits);
                const __m256i words1 =
                  _mm256_srai_epi16(_mm256_packs_epi32(_mm256_load_si256(&in[i * 4 + 2]),
                                                       _mm256_load_si256(&in[i * 4 + 3])),
                                    WeightScaleBits);
                _mm256_store_si256(
                  &out[i], _mm256_permutevar8x32_epi32(
                             _mm256_max_epi8(_mm256_packs_epi16(words0, words1), Zero), Offsets));
            }
        }
        else
        {
            constexpr IndexType NumChunks = InputDimensions / (SimdWidth / 2);
            const __m128i       Zero      = _mm_setzero_si128();
            const auto          in        = reinterpret_cast<const __m128i*>(input);
            const auto          out       = reinterpret_cast<__m128i*>(output);
            for (IndexType i = 0; i < NumChunks; ++i)
            {
                const __m128i words0 = _mm_srai_epi16(
                  _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 0]), _mm_load_si128(&in[i * 4 + 1])),
                  WeightScaleBits);
                const __m128i words1 = _mm_srai_epi16(
                  _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 2]), _mm_load_si128(&in[i * 4 + 3])),
                  WeightScaleBits);
                const __m128i packedbytes = _mm_packs_epi16(words0, words1);
                _mm_store_si128(&out[i], _mm_max_epi8(packedbytes, Zero));
            }
        }
        constexpr IndexType Start =
          InputDimensions % SimdWidth == 0
            ? InputDimensions / SimdWidth * SimdWidth
            : InputDimensions / (SimdWidth / 2) * (SimdWidth / 2);

#elif defined(USE_SSE2)
        constexpr IndexType NumChunks = InputDimensions / SimdWidth;

#ifdef USE_SSE41
        const __m128i Zero = _mm_setzero_si128();
#else
        const __m128i k0x80s = _mm_set1_epi8(-128);
#endif

        const auto in  = reinterpret_cast<const __m128i*>(input);
        const auto out = reinterpret_cast<__m128i*>(output);
        for (IndexType i = 0; i < NumChunks; ++i)
        {
            const __m128i words0 = _mm_srai_epi16(
              _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 0]), _mm_load_si128(&in[i * 4 + 1])),
              WeightScaleBits);
            const __m128i words1 = _mm_srai_epi16(
              _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 2]), _mm_load_si128(&in[i * 4 + 3])),
              WeightScaleBits);
            const __m128i packedbytes = _mm_packs_epi16(words0, words1);
            _mm_store_si128(&out[i],

#ifdef USE_SSE41
                            _mm_max_epi8(packedbytes, Zero)
#else
                            _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
#endif

            );
        }
        constexpr IndexType Start = NumChunks * SimdWidth;

#elif defined(USE_NEON)
        constexpr IndexType NumChunks = InputDimensions / (SimdWidth / 2);
        const int8x8_t      Zero      = {0};
        const auto          in        = reinterpret_cast<const int32x4_t*>(input);
        const auto          out       = reinterpret_cast<int8x8_t*>(output);
        for (IndexType i = 0; i < NumChunks; ++i)
        {
            int16x8_t  shifted;
            const auto pack = reinterpret_cast<int16x4_t*>(&shifted);
            pack[0]         = vqshrn_n_s32(in[i * 2 + 0], WeightScaleBits);
            pack[1]         = vqshrn_n_s32(in[i * 2 + 1], WeightScaleBits);
            out[i]          = vmax_s8(vqmovn_s16(shifted), Zero);
        }
        constexpr IndexType Start = NumChunks * (SimdWidth / 2);
#else
        constexpr IndexType Start = 0;
#endif

        for (IndexType i = Start; i < InputDimensions; ++i)
        {
            output[i] = static_cast<OutputType>(std::clamp(input[i] >> WeightScaleBits, 0, 127));
        }
    }
};

}  // namespace Stockfish::Eval::NNUE::Layers

#endif  // NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
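All the SIMD branches above implement the same elementwise operation as the trailing scalar loop. Reduced to one self-contained function (weightScaleBits = 6 is an assumed default for WeightScaleBits, used here only to make the sketch standalone):

#include <algorithm>
#include <cstdint>

// Scalar form of the Clipped ReLU layer: shift, then clamp to [0, 127].
inline std::uint8_t clipped_relu(std::int32_t x, int weightScaleBits = 6) {
    return static_cast<std::uint8_t>(std::clamp(x >> weightScaleBits, 0, 127));
}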
simd.h
@@ -20,30 +20,30 @@
#define STOCKFISH_SIMD_H_INCLUDED

#if defined(USE_AVX2)
#include <immintrin.h>

#elif defined(USE_SSE41)
#include <smmintrin.h>

#elif defined(USE_SSSE3)
#include <tmmintrin.h>

#elif defined(USE_SSE2)
#include <emmintrin.h>

#elif defined(USE_NEON)
#include <arm_neon.h>
#endif

namespace Stockfish::Simd {

#if defined(USE_AVX512)

[[maybe_unused]] static int m512_hadd(__m512i sum, int bias) {
    return _mm512_reduce_add_epi32(sum) + bias;
}

/*
   Parameters:
   sum0 = [zmm0.i128[0], zmm0.i128[1], zmm0.i128[2], zmm0.i128[3]]
   sum1 = [zmm1.i128[0], zmm1.i128[1], zmm1.i128[2], zmm1.i128[3]]
@@ -58,186 +58,164 @@ namespace Stockfish::Simd {
|
||||
reduce_add_epi32(zmm0.i128[3]), reduce_add_epi32(zmm1.i128[3]), reduce_add_epi32(zmm2.i128[3]), reduce_add_epi32(zmm3.i128[3])
|
||||
]
|
||||
*/
|
||||
[[maybe_unused]] static __m512i m512_hadd128x16_interleave(
|
||||
__m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3) {
|
||||
[[maybe_unused]] static __m512i
|
||||
m512_hadd128x16_interleave(__m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3) {
|
||||
|
||||
__m512i sum01a = _mm512_unpacklo_epi32(sum0, sum1);
|
||||
__m512i sum01b = _mm512_unpackhi_epi32(sum0, sum1);
|
||||
__m512i sum01a = _mm512_unpacklo_epi32(sum0, sum1);
|
||||
__m512i sum01b = _mm512_unpackhi_epi32(sum0, sum1);
|
||||
|
||||
__m512i sum23a = _mm512_unpacklo_epi32(sum2, sum3);
|
||||
__m512i sum23b = _mm512_unpackhi_epi32(sum2, sum3);
|
||||
__m512i sum23a = _mm512_unpacklo_epi32(sum2, sum3);
|
||||
__m512i sum23b = _mm512_unpackhi_epi32(sum2, sum3);
|
||||
|
||||
__m512i sum01 = _mm512_add_epi32(sum01a, sum01b);
|
||||
__m512i sum23 = _mm512_add_epi32(sum23a, sum23b);
|
||||
__m512i sum01 = _mm512_add_epi32(sum01a, sum01b);
|
||||
__m512i sum23 = _mm512_add_epi32(sum23a, sum23b);
|
||||
|
||||
__m512i sum0123a = _mm512_unpacklo_epi64(sum01, sum23);
|
||||
__m512i sum0123b = _mm512_unpackhi_epi64(sum01, sum23);
|
||||
__m512i sum0123a = _mm512_unpacklo_epi64(sum01, sum23);
|
||||
__m512i sum0123b = _mm512_unpackhi_epi64(sum01, sum23);
|
||||
|
||||
return _mm512_add_epi32(sum0123a, sum0123b);
|
||||
}
|
||||
return _mm512_add_epi32(sum0123a, sum0123b);
|
||||
}
|
||||
|
||||
[[maybe_unused]] static void m512_add_dpbusd_epi32(
|
||||
__m512i& acc,
|
||||
__m512i a,
|
||||
__m512i b) {
|
||||
[[maybe_unused]] static void m512_add_dpbusd_epi32(__m512i& acc, __m512i a, __m512i b) {
|
||||
|
||||
# if defined (USE_VNNI)
|
||||
acc = _mm512_dpbusd_epi32(acc, a, b);
|
||||
# else
|
||||
__m512i product0 = _mm512_maddubs_epi16(a, b);
|
||||
product0 = _mm512_madd_epi16(product0, _mm512_set1_epi16(1));
|
||||
acc = _mm512_add_epi32(acc, product0);
|
||||
# endif
|
||||
}
|
||||
#if defined(USE_VNNI)
|
||||
acc = _mm512_dpbusd_epi32(acc, a, b);
|
||||
#else
|
||||
__m512i product0 = _mm512_maddubs_epi16(a, b);
|
||||
product0 = _mm512_madd_epi16(product0, _mm512_set1_epi16(1));
|
||||
acc = _mm512_add_epi32(acc, product0);
|
||||
#endif
|
||||
}
|
||||
|
||||
[[maybe_unused]] static void m512_add_dpbusd_epi32x2(
|
||||
__m512i& acc,
|
||||
__m512i a0, __m512i b0,
|
||||
__m512i a1, __m512i b1) {
|
||||
[[maybe_unused]] static void
|
||||
m512_add_dpbusd_epi32x2(__m512i& acc, __m512i a0, __m512i b0, __m512i a1, __m512i b1) {
|
||||
|
||||
# if defined (USE_VNNI)
|
||||
acc = _mm512_dpbusd_epi32(acc, a0, b0);
|
||||
acc = _mm512_dpbusd_epi32(acc, a1, b1);
|
||||
# else
|
||||
__m512i product0 = _mm512_maddubs_epi16(a0, b0);
|
||||
__m512i product1 = _mm512_maddubs_epi16(a1, b1);
|
||||
product0 = _mm512_madd_epi16(product0, _mm512_set1_epi16(1));
|
||||
product1 = _mm512_madd_epi16(product1, _mm512_set1_epi16(1));
|
||||
acc = _mm512_add_epi32(acc, _mm512_add_epi32(product0, product1));
|
||||
# endif
|
||||
}
|
||||
#if defined(USE_VNNI)
|
||||
acc = _mm512_dpbusd_epi32(acc, a0, b0);
|
||||
acc = _mm512_dpbusd_epi32(acc, a1, b1);
|
||||
#else
|
||||
__m512i product0 = _mm512_maddubs_epi16(a0, b0);
|
||||
__m512i product1 = _mm512_maddubs_epi16(a1, b1);
|
||||
product0 = _mm512_madd_epi16(product0, _mm512_set1_epi16(1));
|
||||
product1 = _mm512_madd_epi16(product1, _mm512_set1_epi16(1));
|
||||
acc = _mm512_add_epi32(acc, _mm512_add_epi32(product0, product1));
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined (USE_AVX2)
|
||||
#if defined(USE_AVX2)
|
||||
|
||||
[[maybe_unused]] static int m256_hadd(__m256i sum, int bias) {
|
||||
__m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
|
||||
sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));
|
||||
sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB));
|
||||
return _mm_cvtsi128_si32(sum128) + bias;
|
||||
}
|
||||
[[maybe_unused]] static int m256_hadd(__m256i sum, int bias) {
|
||||
__m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
|
||||
sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));
|
||||
sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB));
|
||||
return _mm_cvtsi128_si32(sum128) + bias;
|
||||
}
|
||||
|
||||
[[maybe_unused]] static void m256_add_dpbusd_epi32(
|
||||
__m256i& acc,
|
||||
__m256i a,
|
||||
__m256i b) {
|
||||
[[maybe_unused]] static void m256_add_dpbusd_epi32(__m256i& acc, __m256i a, __m256i b) {
|
||||
|
||||
# if defined (USE_VNNI)
|
||||
acc = _mm256_dpbusd_epi32(acc, a, b);
|
||||
# else
|
||||
__m256i product0 = _mm256_maddubs_epi16(a, b);
|
||||
product0 = _mm256_madd_epi16(product0, _mm256_set1_epi16(1));
|
||||
acc = _mm256_add_epi32(acc, product0);
|
||||
# endif
|
||||
}
|
||||
#if defined(USE_VNNI)
|
||||
acc = _mm256_dpbusd_epi32(acc, a, b);
|
||||
#else
|
||||
__m256i product0 = _mm256_maddubs_epi16(a, b);
|
||||
product0 = _mm256_madd_epi16(product0, _mm256_set1_epi16(1));
|
||||
acc = _mm256_add_epi32(acc, product0);
|
||||
#endif
|
||||
}
|
||||
|
||||
[[maybe_unused]] static void m256_add_dpbusd_epi32x2(
|
||||
__m256i& acc,
|
||||
__m256i a0, __m256i b0,
|
||||
__m256i a1, __m256i b1) {
|
||||
[[maybe_unused]] static void
|
||||
m256_add_dpbusd_epi32x2(__m256i& acc, __m256i a0, __m256i b0, __m256i a1, __m256i b1) {
|
||||
|
||||
# if defined (USE_VNNI)
|
||||
acc = _mm256_dpbusd_epi32(acc, a0, b0);
|
||||
acc = _mm256_dpbusd_epi32(acc, a1, b1);
|
||||
# else
|
||||
__m256i product0 = _mm256_maddubs_epi16(a0, b0);
|
||||
__m256i product1 = _mm256_maddubs_epi16(a1, b1);
|
||||
product0 = _mm256_madd_epi16(product0, _mm256_set1_epi16(1));
|
||||
product1 = _mm256_madd_epi16(product1, _mm256_set1_epi16(1));
|
||||
acc = _mm256_add_epi32(acc, _mm256_add_epi32(product0, product1));
|
||||
# endif
|
||||
}
|
||||
#if defined(USE_VNNI)
|
||||
acc = _mm256_dpbusd_epi32(acc, a0, b0);
|
||||
acc = _mm256_dpbusd_epi32(acc, a1, b1);
|
||||
#else
|
||||
__m256i product0 = _mm256_maddubs_epi16(a0, b0);
|
||||
__m256i product1 = _mm256_maddubs_epi16(a1, b1);
|
||||
product0 = _mm256_madd_epi16(product0, _mm256_set1_epi16(1));
|
||||
product1 = _mm256_madd_epi16(product1, _mm256_set1_epi16(1));
|
||||
acc = _mm256_add_epi32(acc, _mm256_add_epi32(product0, product1));
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined (USE_SSSE3)
|
||||
#if defined(USE_SSSE3)
|
||||
|
||||
[[maybe_unused]] static int m128_hadd(__m128i sum, int bias) {
|
||||
sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC
|
||||
sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
|
||||
return _mm_cvtsi128_si32(sum) + bias;
|
||||
}
|
||||
[[maybe_unused]] static int m128_hadd(__m128i sum, int bias) {
|
||||
sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC
|
||||
sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
|
||||
return _mm_cvtsi128_si32(sum) + bias;
|
||||
}
|
||||
|
||||
[[maybe_unused]] static void m128_add_dpbusd_epi32(
|
||||
__m128i& acc,
|
||||
__m128i a,
|
||||
__m128i b) {
|
||||
[[maybe_unused]] static void m128_add_dpbusd_epi32(__m128i& acc, __m128i a, __m128i b) {
|
||||
|
||||
__m128i product0 = _mm_maddubs_epi16(a, b);
|
||||
product0 = _mm_madd_epi16(product0, _mm_set1_epi16(1));
|
||||
acc = _mm_add_epi32(acc, product0);
|
||||
}
|
||||
__m128i product0 = _mm_maddubs_epi16(a, b);
|
||||
product0 = _mm_madd_epi16(product0, _mm_set1_epi16(1));
|
||||
acc = _mm_add_epi32(acc, product0);
|
||||
}

[[maybe_unused]] static void m128_add_dpbusd_epi32x2(
    __m128i& acc,
    __m128i a0, __m128i b0,
    __m128i a1, __m128i b1) {
[[maybe_unused]] static void
m128_add_dpbusd_epi32x2(__m128i& acc, __m128i a0, __m128i b0, __m128i a1, __m128i b1) {
    __m128i product0 = _mm_maddubs_epi16(a0, b0);
    __m128i product1 = _mm_maddubs_epi16(a1, b1);
    product0 = _mm_madd_epi16(product0, _mm_set1_epi16(1));
    product1 = _mm_madd_epi16(product1, _mm_set1_epi16(1));
    acc = _mm_add_epi32(acc, _mm_add_epi32(product0, product1));
}
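
A hedged usage sketch (variable names and bounds are illustrative, not from this file) of how these helpers combine in an affine-transform inner loop:

    __m128i acc = _mm_setzero_si128();
    for (IndexType j = 0; j < PaddedInputDimensions; j += 16) {
        const __m128i in = _mm_load_si128(reinterpret_cast<const __m128i*>(input + j));
        const __m128i w  = _mm_load_si128(reinterpret_cast<const __m128i*>(row + j));
        m128_add_dpbusd_epi32(acc, in, w);
    }
    const std::int32_t result = m128_hadd(acc, bias);  // reduce four lanes, add bias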

#endif

#if defined (USE_NEON_DOTPROD)
#if defined(USE_NEON_DOTPROD)

[[maybe_unused]] static void dotprod_m128_add_dpbusd_epi32x2(
    int32x4_t& acc,
    int8x16_t a0, int8x16_t b0,
    int8x16_t a1, int8x16_t b1) {
[[maybe_unused]] static void dotprod_m128_add_dpbusd_epi32x2(
    int32x4_t& acc, int8x16_t a0, int8x16_t b0, int8x16_t a1, int8x16_t b1) {
    acc = vdotq_s32(acc, a0, b0);
    acc = vdotq_s32(acc, a1, b1);
}
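
vdotq_s32 computes four independent 4-byte dot products per call. A scalar view of one lane (sketch; on NEON both operands are signed int8, unlike the unsigned-by-signed x86 dpbusd):

    // For each 32-bit lane i of acc:
    //   acc[i] += a[4*i+0]*b[4*i+0] + a[4*i+1]*b[4*i+1]
    //           + a[4*i+2]*b[4*i+2] + a[4*i+3]*b[4*i+3];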

[[maybe_unused]] static void dotprod_m128_add_dpbusd_epi32(
    int32x4_t& acc,
    int8x16_t a, int8x16_t b) {
[[maybe_unused]] static void
dotprod_m128_add_dpbusd_epi32(int32x4_t& acc, int8x16_t a, int8x16_t b) {
    acc = vdotq_s32(acc, a, b);
}
#endif

#if defined (USE_NEON)
#if defined(USE_NEON)

[[maybe_unused]] static int neon_m128_reduce_add_epi32(int32x4_t s) {
# if USE_NEON >= 8
    return vaddvq_s32(s);
# else
    return s[0] + s[1] + s[2] + s[3];
# endif
}
[[maybe_unused]] static int neon_m128_reduce_add_epi32(int32x4_t s) {
#if USE_NEON >= 8
    return vaddvq_s32(s);
#else
    return s[0] + s[1] + s[2] + s[3];
#endif
}

[[maybe_unused]] static int neon_m128_hadd(int32x4_t sum, int bias) {
    return neon_m128_reduce_add_epi32(sum) + bias;
}

[[maybe_unused]] static void neon_m128_add_dpbusd_epi32x2(
    int32x4_t& acc,
    int8x8_t a0, int8x8_t b0,
    int8x8_t a1, int8x8_t b1) {
[[maybe_unused]] static void
neon_m128_add_dpbusd_epi32x2(int32x4_t& acc, int8x8_t a0, int8x8_t b0, int8x8_t a1, int8x8_t b1) {
    int16x8_t product = vmull_s8(a0, b0);
    product = vmlal_s8(product, a1, b1);
    acc = vpadalq_s16(acc, product);
}
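
Without the dot-product extension, accumulation goes through a widening chain: vmull_s8 multiplies two int8x8 vectors into an int16x8, vmlal_s8 accumulates a second product into it, and vpadalq_s16 pairwise-adds adjacent int16 lanes into the int32x4 accumulator. One lane in scalar form (sketch):

    //   acc[i] += a0[2i]*b0[2i]     + a1[2i]*b1[2i]
    //           + a0[2i+1]*b0[2i+1] + a1[2i+1]*b1[2i+1];

Assuming one operand stays in the clipped 0..127 activation range, the int16 intermediate cannot overflow: 2 * 127 * 128 = 32512 < 32767.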

#endif

#if USE_NEON >= 8
[[maybe_unused]] static void neon_m128_add_dpbusd_epi32(
    int32x4_t& acc,
    int8x16_t a, int8x16_t b) {
[[maybe_unused]] static void neon_m128_add_dpbusd_epi32(int32x4_t& acc, int8x16_t a, int8x16_t b) {
    int16x8_t product0 = vmull_s8(vget_low_s8(a), vget_low_s8(b));
    int16x8_t product1 = vmull_high_s8(a, b);
    int16x8_t sum = vpaddq_s16(product0, product1);
    acc = vpadalq_s16(acc, sum);
}
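
On ARMv8 without dot-product support, the 16-byte form splits the vectors: vmull_s8 and vmull_high_s8 produce the low and high eight int16 products, vpaddq_s16 pairwise-adds within each half and concatenates, and vpadalq_s16 folds the result into the accumulator. The resulting lane grouping (sketch):

    //   acc[0] += p[0]+p[1]+p[2]+p[3];   acc[1] += p[4]+p[5]+p[6]+p[7];
    //   acc[2] += q[0]+q[1]+q[2]+q[3];   acc[3] += q[4]+q[5]+q[6]+q[7];
    //   where p[k] = a[k]*b[k] and q[k] = a[8+k]*b[8+k]

This differs from the four-consecutive-bytes-per-lane layout of the dotprod variants, which is harmless because callers reduce across all four lanes via neon_m128_hadd anyway (an observation, not stated in the patch).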
#endif
}

#endif // STOCKFISH_SIMD_H_INCLUDED

@@ -29,80 +29,75 @@

namespace Stockfish::Eval::NNUE::Layers {

// Clipped ReLU
template <IndexType InDims>
template<IndexType InDims>
class SqrClippedReLU {
public:
    // Input/output type
    using InputType = std::int32_t;
    using OutputType = std::uint8_t;

    // Number of input/output dimensions
    static constexpr IndexType InputDimensions = InDims;
    static constexpr IndexType OutputDimensions = InputDimensions;
    static constexpr IndexType PaddedOutputDimensions =
        ceil_to_multiple<IndexType>(OutputDimensions, 32);

    using OutputBuffer = OutputType[PaddedOutputDimensions];

    // Hash value embedded in the evaluation file
    static constexpr std::uint32_t get_hash_value(std::uint32_t prevHash) {
        std::uint32_t hashValue = 0x538D24C7u;
        hashValue += prevHash;
        return hashValue;
    }

    // Read network parameters
    bool read_parameters(std::istream&) {
        return true;
    }
    bool read_parameters(std::istream&) { return true; }

    // Write network parameters
    bool write_parameters(std::ostream&) const {
        return true;
    }
    bool write_parameters(std::ostream&) const { return true; }

    // Forward propagation
    void propagate(
        const InputType* input, OutputType* output) const {
    void propagate(const InputType* input, OutputType* output) const {

#if defined(USE_SSE2)
        constexpr IndexType NumChunks = InputDimensions / 16;

        static_assert(WeightScaleBits == 6);
        const auto in = reinterpret_cast<const __m128i*>(input);
        const auto out = reinterpret_cast<__m128i*>(output);
        for (IndexType i = 0; i < NumChunks; ++i) {
            __m128i words0 = _mm_packs_epi32(
                _mm_load_si128(&in[i * 4 + 0]),
                _mm_load_si128(&in[i * 4 + 1]));
            __m128i words1 = _mm_packs_epi32(
                _mm_load_si128(&in[i * 4 + 2]),
                _mm_load_si128(&in[i * 4 + 3]));
        for (IndexType i = 0; i < NumChunks; ++i)
        {
            __m128i words0 =
              _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 0]), _mm_load_si128(&in[i * 4 + 1]));
            __m128i words1 =
              _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 2]), _mm_load_si128(&in[i * 4 + 3]));

            // We shift by WeightScaleBits * 2 = 12 and divide by 128
            // which is an additional shift-right of 7, meaning 19 in total.
            // MulHi strips the lower 16 bits so we need to shift out 3 more to match.
            words0 = _mm_srli_epi16(_mm_mulhi_epi16(words0, words0), 3);
            words1 = _mm_srli_epi16(_mm_mulhi_epi16(words1, words1), 3);

            _mm_store_si128(&out[i], _mm_packs_epi16(words0, words1));
        }
        constexpr IndexType Start = NumChunks * 16;

#else
        constexpr IndexType Start = 0;
#endif

        for (IndexType i = Start; i < InputDimensions; ++i) {
            output[i] = static_cast<OutputType>(
                // Really should be /127 but we need to make it fast so we right shift
                // by an extra 7 bits instead. Needs to be accounted for in the trainer.
                std::min(127ll, ((long long)input[i] * input[i]) >> (2 * WeightScaleBits + 7)));
        }
        for (IndexType i = Start; i < InputDimensions; ++i)
        {
            output[i] = static_cast<OutputType>(
              // Really should be /127 but we need to make it fast so we right shift
              // by an extra 7 bits instead. Needs to be accounted for in the trainer.
              std::min(127ll, ((long long) input[i] * input[i]) >> (2 * WeightScaleBits + 7)));
        }
    }
};
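
A worked check of the scaling (a sketch; WeightScaleBits == 6 per the static_assert above, so the scalar divisor is 2^(2*6 + 7) = 2^19 = 524288):

    //   input[i] == 8160:  8160 * 8160 = 66'585'600;  >> 19  ->  127
    //   input[i] ==  724:   724 *  724 =     524'176;  >> 19  ->    0

The SSE2 path reaches the same 19-bit shift differently: _mm_mulhi_epi16 keeps only the high 16 bits of each squared word (an implicit >> 16), the explicit _mm_srli_epi16 by 3 completes the 19, and the final _mm_packs_epi16 saturates to the int8 range, standing in for the scalar std::min(127ll, ...).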
} // namespace Stockfish::Eval::NNUE::Layers

#endif // NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED