From 35ab8254b70f62a4e0138c475fad0c77dcc0af2d Mon Sep 17 00:00:00 2001 From: mckx00 Date: Sun, 13 Sep 2020 19:28:32 -0700 Subject: [PATCH 1/8] Simplify StatSCore Initialization No need to initialize StatScore at rootNode. Current Logic is redundant because at subsequent levels the grandchildren statScore is initialized to zero. closes https://github.com/official-stockfish/Stockfish/pull/3122 Non functional change. --- src/search.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/search.cpp b/src/search.cpp index 4aeadc28..07c491b6 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -654,9 +654,7 @@ namespace { // starts with statScore = 0. Later grandchildren start with the last calculated // statScore of the previous grandchild. This influences the reduction rules in // LMR which are based on the statScore of parent position. - if (rootNode) - (ss+4)->statScore = 0; - else + if (!rootNode) (ss+2)->statScore = 0; // Step 4. Transposition table lookup. We don't want the score of a partial From 7135678f71b7f6ee32e92b8dbef2b16b403d8ea9 Mon Sep 17 00:00:00 2001 From: Sergio Vieri Date: Mon, 14 Sep 2020 17:24:05 +0800 Subject: [PATCH 2/8] Update default net to nn-03744f8d56d8.nnue Equivalent to 20200914-1520 closes https://github.com/official-stockfish/Stockfish/pull/3123 Bench: 4222126 --- src/evaluate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/evaluate.h b/src/evaluate.h index 3da6a9fe..c723bd8f 100644 --- a/src/evaluate.h +++ b/src/evaluate.h @@ -38,7 +38,7 @@ namespace Eval { // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue // for the build process (profile-build and fishtest) to work. Do not change the // name of the macro, as it is used in the Makefile. - #define EvalFileDefaultName "nn-308d71810dff.nnue" + #define EvalFileDefaultName "nn-03744f8d56d8.nnue" namespace NNUE { From 5f426d8667feda65eaf1eca699f629d31e170d43 Mon Sep 17 00:00:00 2001 From: xoto10 Date: Thu, 10 Sep 2020 21:10:57 +0100 Subject: [PATCH 3/8] Use 2 * bestMoveChanges. NNUE appears to provide a more stable eval than the classic eval, so the time use dependencies on bestMoveChanges, fallingEval, etc may need to change to make the best use of available time. This change doubles the effect of totBestMoveChanges when giving more time because the choice of best move is unstable. STC: LLR: 2.94 (-2.94,2.94) {-0.25,1.25} Total: 101928 W: 11995 L: 11698 D: 78235 Elo +0.78 Ptnml(0-2): 592, 8707, 32103, 8936, 626 https://tests.stockfishchess.org/tests/view/5f538a462d02727c56b36cec LTC: LLR: 2.94 (-2.94,2.94) {0.25,1.25} Total: 186392 W: 10383 L: 9877 D: 166132 Elo +0.81 Ptnml(0-2): 207, 8370, 75539, 8870, 210 https://tests.stockfishchess.org/tests/view/5f54a9712d02727c56b36d5a closes https://github.com/official-stockfish/Stockfish/pull/3119 Bench 4222126 --- src/search.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search.cpp b/src/search.cpp index 07c491b6..c7d2efd4 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -520,7 +520,7 @@ void Thread::search() { totBestMoveChanges += th->bestMoveChanges; th->bestMoveChanges = 0; } - double bestMoveInstability = 1 + totBestMoveChanges / Threads.size(); + double bestMoveInstability = 1 + 2 * totBestMoveChanges / Threads.size(); double totalTime = rootMoves.size() == 1 ? 0 : Time.optimum() * fallingEval * reduction * bestMoveInstability; From d86663af141f1256bfc32ab95891e944d84e8755 Mon Sep 17 00:00:00 2001 From: syzygy1 <3028851+syzygy1@users.noreply.github.com> Date: Sun, 13 Sep 2020 20:16:52 +0200 Subject: [PATCH 4/8] Improve NDK section in Makefile This PR sets the "comp" variable simply to "clang", which seems to be more consistent and allows a small simplification. The PR also moves the section that sets "profile_make" and "profile_use" to after the NDK section, which ensures that these variables are now set correctly for NDK/clang. closes https://github.com/official-stockfish/Stockfish/pull/3121 No functional change --- src/Makefile | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/src/Makefile b/src/Makefile index 340b3008..54868b39 100644 --- a/src/Makefile +++ b/src/Makefile @@ -381,19 +381,6 @@ ifeq ($(COMP),clang) endif endif -ifeq ($(comp),icc) - profile_make = icc-profile-make - profile_use = icc-profile-use -else -ifeq ($(comp),clang) - profile_make = clang-profile-make - profile_use = clang-profile-use -else - profile_make = gcc-profile-make - profile_use = gcc-profile-use -endif -endif - ifeq ($(KERNEL),Darwin) CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.14 LDFLAGS += -arch $(arch) -mmacosx-version-min=10.14 @@ -405,20 +392,30 @@ endif # Currently we don't know how to make PGO builds with the NDK yet. ifeq ($(COMP),ndk) CXXFLAGS += -stdlib=libc++ -fPIE + comp=clang ifeq ($(arch),armv7) - comp=armv7a-linux-androideabi16-clang CXX=armv7a-linux-androideabi16-clang++ CXXFLAGS += -mthumb -march=armv7-a -mfloat-abi=softfp -mfpu=neon STRIP=arm-linux-androideabi-strip endif ifeq ($(arch),armv8) - comp=aarch64-linux-android21-clang CXX=aarch64-linux-android21-clang++ STRIP=aarch64-linux-android-strip endif LDFLAGS += -static-libstdc++ -pie -lm -latomic endif +ifeq ($(comp),icc) + profile_make = icc-profile-make + profile_use = icc-profile-use +else ifeq ($(comp),clang) + profile_make = clang-profile-make + profile_use = clang-profile-use +else + profile_make = gcc-profile-make + profile_use = gcc-profile-use +endif + ### Travis CI script uses COMPILER to overwrite CXX ifdef COMPILER COMPCXX=$(COMPILER) @@ -590,10 +587,7 @@ endif ### needs access to the optimization flags. ifeq ($(optimize),yes) ifeq ($(debug), no) - ifeq ($(COMP),ndk) - CXXFLAGS += -flto=thin - LDFLAGS += $(CXXFLAGS) - else ifeq ($(comp),clang) + ifeq ($(comp),clang) CXXFLAGS += -flto=thin ifneq ($(findstring MINGW,$(KERNEL)),) CXXFLAGS += -fuse-ld=lld From df43805953b241f95c246ff3e96aece76b518590 Mon Sep 17 00:00:00 2001 From: GoldenRare Date: Thu, 10 Sep 2020 00:24:40 -0400 Subject: [PATCH 5/8] Added FEN string to bench output fixes https://github.com/official-stockfish/Stockfish/pull/3117 closes https://github.com/official-stockfish/Stockfish/pull/3118 No functional change --- AUTHORS | 1 + src/uci.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/AUTHORS b/AUTHORS index c00ab657..198dfa5a 100644 --- a/AUTHORS +++ b/AUTHORS @@ -63,6 +63,7 @@ Gary Heckman (gheckman) George Sobala (gsobala) gguliash Gian-Carlo Pascutto (gcp) +Deshawn Mohan-Smith (GoldenRare) Gontran Lemaire (gonlem) Goodkov Vasiliy Aleksandrovich (goodkov) Gregor Cramer diff --git a/src/uci.cpp b/src/uci.cpp index bc0ee0a0..3f3cc458 100644 --- a/src/uci.cpp +++ b/src/uci.cpp @@ -170,7 +170,7 @@ namespace { if (token == "go" || token == "eval") { - cerr << "\nPosition: " << cnt++ << '/' << num << endl; + cerr << "\nPosition: " << cnt++ << '/' << num << " (" << pos.fen() << ")" << endl; if (token == "go") { go(pos, is, states); From 0ca93c5b94b820a41e2850ede084096120128a28 Mon Sep 17 00:00:00 2001 From: Unai Corzo Date: Wed, 16 Sep 2020 19:14:32 +0200 Subject: [PATCH 6/8] Remove castling extension STC https://tests.stockfishchess.org/tests/view/5f5fa5348fbc1c8a3f476eca LLR: 2.94 (-2.94,2.94) {-1.25,0.25} Total: 38520 W: 4713 L: 4610 D: 29197 Ptnml(0-2): 233, 3486, 11734, 3559, 248 LTC https://tests.stockfishchess.org/tests/view/5f62166a912c15f19854b806 LLR: 2.93 (-2.94,2.94) {-0.75,0.25} Total: 48024 W: 2673 L: 2600 D: 42751 Ptnml(0-2): 64, 2247, 19316, 2322, 63 closes https://github.com/official-stockfish/Stockfish/pull/3128 bench: 3818400 --- src/search.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/search.cpp b/src/search.cpp index c7d2efd4..17cd0a73 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -1127,11 +1127,6 @@ moves_loop: // When in check, search starts from here && pos.non_pawn_material() <= 2 * RookValueMg) extension = 1; - // Castling extension - if ( type_of(move) == CASTLING - && popcount(pos.pieces(us) & ~pos.pieces(PAWN) & (to_sq(move) & KingSide ? KingSide : QueenSide)) <= 2) - extension = 1; - // Late irreversible move extension if ( move == ttMove && pos.rule50_count() > 80 From 64a63464d7bc72a3aac33aa680cd2b2b240ff903 Mon Sep 17 00:00:00 2001 From: Unai Corzo Date: Wed, 16 Sep 2020 20:42:38 +0200 Subject: [PATCH 7/8] Simplify futility pruning for captures STC https://tests.stockfishchess.org/tests/view/5f61f0e4b91f2ec371e429c2 LLR: 2.94 (-2.94,2.94) {-1.25,0.25} Total: 75512 W: 8747 L: 8704 D: 58061 Ptnml(0-2): 440, 6589, 23683, 6576, 468 LTC https://tests.stockfishchess.org/tests/view/5f6215d3912c15f19854b801 LLR: 2.95 (-2.94,2.94) {-0.75,0.25} Total: 92912 W: 5030 L: 4992 D: 82890 Ptnml(0-2): 88, 4363, 37532, 4369, 104 closes https://github.com/official-stockfish/Stockfish/pull/3129 bench: 3856086 --- src/search.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/search.cpp b/src/search.cpp index 17cd0a73..9c5fb58b 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -1056,7 +1056,6 @@ moves_loop: // When in check, search starts from here if ( !givesCheck && lmrDepth < 6 && !(PvNode && abs(bestValue) < 2) - && PieceValue[MG][type_of(movedPiece)] >= PieceValue[MG][type_of(pos.piece_on(to_sq(move)))] && !ss->inCheck && ss->staticEval + 169 + 244 * lmrDepth + PieceValue[MG][type_of(pos.piece_on(to_sq(move)))] <= alpha) From 8b8a510fd6a1a17b39b2d4b166f60ac7be0dab23 Mon Sep 17 00:00:00 2001 From: syzygy1 <3028851+syzygy1@users.noreply.github.com> Date: Wed, 16 Sep 2020 17:39:11 +0200 Subject: [PATCH 8/8] Use tiling to speed up accumulator refreshes and updates Perform the update and refresh operations tile by tile in a local array of vectors. By selecting the array size carefully, we achieve that the compiler keeps the whole array in vector registers. Idea and original implementation by @sf-x. STC: https://tests.stockfishchess.org/tests/view/5f623eec912c15f19854b855 LLR: 2.94 (-2.94,2.94) {-0.25,1.25} Total: 4872 W: 623 L: 477 D: 3772 Ptnml(0-2): 14, 350, 1585, 450, 37 LTC: https://tests.stockfishchess.org/tests/view/5f62434e912c15f19854b860 LLR: 2.94 (-2.94,2.94) {0.25,1.25} Total: 25808 W: 1565 L: 1401 D: 22842 Ptnml(0-2): 23, 1186, 10332, 1330, 33 closes https://github.com/official-stockfish/Stockfish/pull/3130 No functional change --- src/nnue/nnue_feature_transformer.h | 233 +++++++++++++++------------- 1 file changed, 125 insertions(+), 108 deletions(-) diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 2b6259c3..e71ee60d 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -29,6 +29,56 @@ namespace Eval::NNUE { + // If vector instructions are enabled, we update and refresh the + // accumulator tile by tile such that each tile fits in the CPU's + // vector registers. + #define TILING + + #ifdef USE_AVX512 + typedef __m512i vec_t; + #define vec_load(a) _mm512_loadA_si512(a) + #define vec_store(a,b) _mm512_storeA_si512(a,b) + #define vec_add_16(a,b) _mm512_add_epi16(a,b) + #define vec_sub_16(a,b) _mm512_sub_epi16(a,b) + static constexpr IndexType kNumRegs = 8; // only 8 are needed + + #elif USE_AVX2 + typedef __m256i vec_t; + #define vec_load(a) _mm256_loadA_si256(a) + #define vec_store(a,b) _mm256_storeA_si256(a,b) + #define vec_add_16(a,b) _mm256_add_epi16(a,b) + #define vec_sub_16(a,b) _mm256_sub_epi16(a,b) + static constexpr IndexType kNumRegs = 16; + + #elif USE_SSE2 + typedef __m128i vec_t; + #define vec_load(a) (*(a)) + #define vec_store(a,b) *(a)=(b) + #define vec_add_16(a,b) _mm_add_epi16(a,b) + #define vec_sub_16(a,b) _mm_sub_epi16(a,b) + static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8; + + #elif USE_MMX + typedef __m64 vec_t; + #define vec_load(a) (*(a)) + #define vec_store(a,b) *(a)=(b) + #define vec_add_16(a,b) _mm_add_pi16(a,b) + #define vec_sub_16(a,b) _mm_sub_pi16(a,b) + static constexpr IndexType kNumRegs = 8; + + #elif USE_NEON + typedef int16x8_t vec_t; + #define vec_load(a) (*(a)) + #define vec_store(a,b) *(a)=(b) + #define vec_add_16(a,b) vaddq_s16(a,b) + #define vec_sub_16(a,b) vsubq_s16(a,b) + static constexpr IndexType kNumRegs = 16; + + #else + #undef TILING + + #endif + // Input feature converter class FeatureTransformer { @@ -36,6 +86,11 @@ namespace Eval::NNUE { // Number of output dimensions for one side static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions; + #ifdef TILING + static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2; + static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions"); + #endif + public: // Output type using OutputType = TransformedFeatureType; @@ -189,57 +244,41 @@ namespace Eval::NNUE { RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i], active_indices); for (Color perspective : { WHITE, BLACK }) { + #ifdef TILING + for (unsigned j = 0; j < kHalfDimensions / kTileHeight; ++j) { + auto biasesTile = reinterpret_cast( + &biases_[j * kTileHeight]); + auto accTile = reinterpret_cast( + &accumulator.accumulation[perspective][i][j * kTileHeight]); + vec_t acc[kNumRegs]; + + for (unsigned k = 0; k < kNumRegs; ++k) + acc[k] = biasesTile[k]; + + for (const auto index : active_indices[perspective]) { + const IndexType offset = kHalfDimensions * index + j * kTileHeight; + auto column = reinterpret_cast(&weights_[offset]); + + for (unsigned k = 0; k < kNumRegs; ++k) + acc[k] = vec_add_16(acc[k], column[k]); + } + + for (unsigned k = 0; k < kNumRegs; k++) + vec_store(&accTile[k], acc[k]); + } + #else std::memcpy(accumulator.accumulation[perspective][i], biases_, - kHalfDimensions * sizeof(BiasType)); + kHalfDimensions * sizeof(BiasType)); + for (const auto index : active_indices[perspective]) { const IndexType offset = kHalfDimensions * index; - #if defined(USE_AVX512) - auto accumulation = reinterpret_cast<__m512i*>( - &accumulator.accumulation[perspective][i][0]); - auto column = reinterpret_cast(&weights_[offset]); - constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth; - for (IndexType j = 0; j < kNumChunks; ++j) - _mm512_storeA_si512(&accumulation[j], _mm512_add_epi16(_mm512_loadA_si512(&accumulation[j]), column[j])); - #elif defined(USE_AVX2) - auto accumulation = reinterpret_cast<__m256i*>( - &accumulator.accumulation[perspective][i][0]); - auto column = reinterpret_cast(&weights_[offset]); - constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); - for (IndexType j = 0; j < kNumChunks; ++j) - _mm256_storeA_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadA_si256(&accumulation[j]), column[j])); - - #elif defined(USE_SSE2) - auto accumulation = reinterpret_cast<__m128i*>( - &accumulator.accumulation[perspective][i][0]); - auto column = reinterpret_cast(&weights_[offset]); - constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); - for (IndexType j = 0; j < kNumChunks; ++j) - accumulation[j] = _mm_add_epi16(accumulation[j], column[j]); - - #elif defined(USE_MMX) - auto accumulation = reinterpret_cast<__m64*>( - &accumulator.accumulation[perspective][i][0]); - auto column = reinterpret_cast(&weights_[offset]); - constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); - for (IndexType j = 0; j < kNumChunks; ++j) - accumulation[j] = _mm_add_pi16(accumulation[j], column[j]); - - #elif defined(USE_NEON) - auto accumulation = reinterpret_cast( - &accumulator.accumulation[perspective][i][0]); - auto column = reinterpret_cast(&weights_[offset]); - constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); - for (IndexType j = 0; j < kNumChunks; ++j) - accumulation[j] = vaddq_s16(accumulation[j], column[j]); - - #else for (IndexType j = 0; j < kHalfDimensions; ++j) accumulator.accumulation[perspective][i][j] += weights_[offset + j]; - #endif - } + #endif } + #if defined(USE_MMX) _mm_empty(); #endif @@ -257,29 +296,55 @@ namespace Eval::NNUE { bool reset[2]; RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i], removed_indices, added_indices, reset); - for (Color perspective : { WHITE, BLACK }) { - #if defined(USE_AVX2) - constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); - auto accumulation = reinterpret_cast<__m256i*>( - &accumulator.accumulation[perspective][i][0]); + #ifdef TILING + for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) { + for (Color perspective : { WHITE, BLACK }) { + auto accTile = reinterpret_cast( + &accumulator.accumulation[perspective][i][j * kTileHeight]); + vec_t acc[kNumRegs]; - #elif defined(USE_SSE2) - constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); - auto accumulation = reinterpret_cast<__m128i*>( - &accumulator.accumulation[perspective][i][0]); + if (reset[perspective]) { + auto biasesTile = reinterpret_cast( + &biases_[j * kTileHeight]); + for (unsigned k = 0; k < kNumRegs; ++k) + acc[k] = biasesTile[k]; + } else { + auto prevAccTile = reinterpret_cast( + &prev_accumulator.accumulation[perspective][i][j * kTileHeight]); + for (IndexType k = 0; k < kNumRegs; ++k) + acc[k] = vec_load(&prevAccTile[k]); - #elif defined(USE_MMX) - constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); - auto accumulation = reinterpret_cast<__m64*>( - &accumulator.accumulation[perspective][i][0]); + // Difference calculation for the deactivated features + for (const auto index : removed_indices[perspective]) { + const IndexType offset = kHalfDimensions * index + j * kTileHeight; + auto column = reinterpret_cast(&weights_[offset]); - #elif defined(USE_NEON) - constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); - auto accumulation = reinterpret_cast( - &accumulator.accumulation[perspective][i][0]); + for (IndexType k = 0; k < kNumRegs; ++k) + acc[k] = vec_sub_16(acc[k], column[k]); + } + } + { // Difference calculation for the activated features + for (const auto index : added_indices[perspective]) { + const IndexType offset = kHalfDimensions * index + j * kTileHeight; + auto column = reinterpret_cast(&weights_[offset]); + + for (IndexType k = 0; k < kNumRegs; ++k) + acc[k] = vec_add_16(acc[k], column[k]); + } + } + + for (IndexType k = 0; k < kNumRegs; ++k) + vec_store(&accTile[k], acc[k]); + } + } + #if defined(USE_MMX) + _mm_empty(); #endif + #else + for (Color perspective : { WHITE, BLACK }) { + if (reset[perspective]) { std::memcpy(accumulator.accumulation[perspective][i], biases_, kHalfDimensions * sizeof(BiasType)); @@ -291,67 +356,19 @@ namespace Eval::NNUE { for (const auto index : removed_indices[perspective]) { const IndexType offset = kHalfDimensions * index; - #if defined(USE_AVX2) - auto column = reinterpret_cast(&weights_[offset]); - for (IndexType j = 0; j < kNumChunks; ++j) - accumulation[j] = _mm256_sub_epi16(accumulation[j], column[j]); - - #elif defined(USE_SSE2) - auto column = reinterpret_cast(&weights_[offset]); - for (IndexType j = 0; j < kNumChunks; ++j) - accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]); - - #elif defined(USE_MMX) - auto column = reinterpret_cast(&weights_[offset]); - for (IndexType j = 0; j < kNumChunks; ++j) - accumulation[j] = _mm_sub_pi16(accumulation[j], column[j]); - - #elif defined(USE_NEON) - auto column = reinterpret_cast(&weights_[offset]); - for (IndexType j = 0; j < kNumChunks; ++j) - accumulation[j] = vsubq_s16(accumulation[j], column[j]); - - #else for (IndexType j = 0; j < kHalfDimensions; ++j) accumulator.accumulation[perspective][i][j] -= weights_[offset + j]; - #endif - } } { // Difference calculation for the activated features for (const auto index : added_indices[perspective]) { const IndexType offset = kHalfDimensions * index; - #if defined(USE_AVX2) - auto column = reinterpret_cast(&weights_[offset]); - for (IndexType j = 0; j < kNumChunks; ++j) - accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]); - - #elif defined(USE_SSE2) - auto column = reinterpret_cast(&weights_[offset]); - for (IndexType j = 0; j < kNumChunks; ++j) - accumulation[j] = _mm_add_epi16(accumulation[j], column[j]); - - #elif defined(USE_MMX) - auto column = reinterpret_cast(&weights_[offset]); - for (IndexType j = 0; j < kNumChunks; ++j) - accumulation[j] = _mm_add_pi16(accumulation[j], column[j]); - - #elif defined(USE_NEON) - auto column = reinterpret_cast(&weights_[offset]); - for (IndexType j = 0; j < kNumChunks; ++j) - accumulation[j] = vaddq_s16(accumulation[j], column[j]); - - #else for (IndexType j = 0; j < kHalfDimensions; ++j) accumulator.accumulation[perspective][i][j] += weights_[offset + j]; - #endif - } } } - #if defined(USE_MMX) - _mm_empty(); #endif accumulator.computed_accumulation = true;