From 1c8495b54b7b5c52d33492f458b829f18fe61460 Mon Sep 17 00:00:00 2001 From: Tomasz Sobczyk Date: Wed, 25 Nov 2020 20:37:38 +0100 Subject: [PATCH] Remove handwritten saxpy because compilers optimize the second loop anyway. --- src/extra/stockfish_blas.cpp | 45 +------------------ .../trainer/trainer_feature_transformer.h | 6 +-- 2 files changed, 5 insertions(+), 46 deletions(-) diff --git a/src/extra/stockfish_blas.cpp b/src/extra/stockfish_blas.cpp index 109a4b44..2bf28b8f 100644 --- a/src/extra/stockfish_blas.cpp +++ b/src/extra/stockfish_blas.cpp @@ -178,53 +178,11 @@ namespace Blas { ) { -#if defined (USE_SSE2) - - const __m128 alpha4 = _mm_set1_ps(alpha); - - int i = 0; - for(; i < N - 15; i += 16) - { - __m128 x0 = _mm_loadu_ps(X + i + 0); - __m128 x1 = _mm_loadu_ps(X + i + 4); - __m128 x2 = _mm_loadu_ps(X + i + 8); - __m128 x3 = _mm_loadu_ps(X + i + 12); - - __m128 y0 = _mm_loadu_ps(Y + i + 0); - __m128 y1 = _mm_loadu_ps(Y + i + 4); - __m128 y2 = _mm_loadu_ps(Y + i + 8); - __m128 y3 = _mm_loadu_ps(Y + i + 12); - - x0 = _mm_mul_ps(x0, alpha4); - x1 = _mm_mul_ps(x1, alpha4); - x2 = _mm_mul_ps(x2, alpha4); - x3 = _mm_mul_ps(x3, alpha4); - - x0 = _mm_add_ps(x0, y0); - x1 = _mm_add_ps(x1, y1); - x2 = _mm_add_ps(x2, y2); - x3 = _mm_add_ps(x3, y3); - - _mm_storeu_ps(Y + i + 0, x0); - _mm_storeu_ps(Y + i + 4, x1); - _mm_storeu_ps(Y + i + 8, x2); - _mm_storeu_ps(Y + i + 12, x3); - } - - for(; i < N; ++i) - { - Y[i] += X[i] * alpha; - } - -#else - for(int i = 0; i < N; ++i) { Y[i] += X[i] * alpha; } -#endif - } void saxpy( @@ -564,7 +522,8 @@ namespace Blas { const __m128 alpha4 = _mm_set1_ps(alpha); const __m128 beta4 = _mm_set1_ps(beta); - for (int m = 0; m < M - 1; m += 2) + int m = 0; + for (; m < M - 1; m += 2) { int n = 0; for (; n < N - 3; n += 4) diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h index 9686002f..78729064 100644 --- a/src/nnue/trainer/trainer_feature_transformer.h +++ 
b/src/nnue/trainer/trainer_feature_transformer.h @@ -164,7 +164,7 @@ namespace Eval::NNUE { const IndexType weights_offset = kHalfDimensions * feature.get_index(); Blas::saxpy( kHalfDimensions, (float)feature.get_count(), - &weights_[weights_offset], 1, &output_[output_offset], 1 + &weights_[weights_offset], &output_[output_offset] ); } @@ -497,8 +497,8 @@ namespace Eval::NNUE { Blas::saxpy( kHalfDimensions, -scale, - &gradients_[output_offset], 1, - &weights_[weights_offset], 1 + &gradients_[output_offset], + &weights_[weights_offset] ); #endif