Refactor accumulator storage/updates

Passed Non-regression STC:
LLR: 2.93 (-2.94,2.94) <-1.75,0.25>
Total: 115840 W: 29983 L: 29854 D: 56003
Ptnml(0-2): 338, 12990, 31149, 13091, 352
https://tests.stockfishchess.org/tests/view/67d0a044166a3e8781d84223

closes https://github.com/official-stockfish/Stockfish/pull/5927

No functional change
Author: Shawn Xu, 2025-03-09 19:33:30 -07:00
Committed by: Disservin
Parent: 66aee01bb1
Commit: fc0e0a44d4
17 changed files with 813 additions and 527 deletions
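
In outline, the refactor moves the NNUE accumulators out of StateInfo into a per-thread
Eval::NNUE::AccumulatorStack that the search drives explicitly. A minimal sketch of the
intended call pattern, using only names that appear in the hunks below (a fragment, not a
complete function):

    Eval::NNUE::AccumulatorStack accumulatorStack;              // owned by Search::Worker
    accumulatorStack.reset(rootPos, networks[numaAccessToken], refreshTable);  // once per search

    DirtyPiece dp = pos.do_move(move, st, givesCheck, &tt);     // do_move() now returns the DirtyPiece
    accumulatorStack.push(dp);                                   // recorded on the stack, not in StateInfo
    // ... search the child node; evaluation reads accumulatorStack.latest() ...
    pos.undo_move(move);
    accumulatorStack.pop();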


@@ -55,7 +55,8 @@ PGOBENCH = $(WINE_PATH) ./$(EXE) bench
SRCS = benchmark.cpp bitboard.cpp evaluate.cpp main.cpp \
misc.cpp movegen.cpp movepick.cpp position.cpp \
search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \
nnue/nnue_misc.cpp nnue/features/half_ka_v2_hm.cpp nnue/network.cpp engine.cpp score.cpp memory.cpp
nnue/nnue_accumulator.cpp nnue/nnue_misc.cpp nnue/features/half_ka_v2_hm.cpp nnue/network.cpp \
engine.cpp score.cpp memory.cpp
HEADERS = benchmark.h bitboard.h evaluate.h misc.h movegen.h movepick.h history.h \
nnue/nnue_misc.h nnue/features/half_ka_v2_hm.h nnue/layers/affine_transform.h \


@@ -54,21 +54,22 @@ bool Eval::use_smallnet(const Position& pos) {
// of the position from the point of view of the side to move.
Value Eval::evaluate(const Eval::NNUE::Networks& networks,
const Position& pos,
Eval::NNUE::AccumulatorStack& accumulators,
Eval::NNUE::AccumulatorCaches& caches,
int optimism) {
assert(!pos.checkers());
bool smallNet = use_smallnet(pos);
auto [psqt, positional] = smallNet ? networks.small.evaluate(pos, &caches.small)
: networks.big.evaluate(pos, &caches.big);
auto [psqt, positional] = smallNet ? networks.small.evaluate(pos, accumulators, &caches.small)
: networks.big.evaluate(pos, accumulators, &caches.big);
Value nnue = (125 * psqt + 131 * positional) / 128;
// Re-evaluate the position when higher eval accuracy is worth the time spent
if (smallNet && (std::abs(nnue) < 236))
{
std::tie(psqt, positional) = networks.big.evaluate(pos, &caches.big);
std::tie(psqt, positional) = networks.big.evaluate(pos, accumulators, &caches.big);
nnue = (125 * psqt + 131 * positional) / 128;
smallNet = false;
}
@@ -99,7 +100,10 @@ std::string Eval::trace(Position& pos, const Eval::NNUE::Networks& networks) {
if (pos.checkers())
return "Final evaluation: none (in check)";
auto caches = std::make_unique<Eval::NNUE::AccumulatorCaches>(networks);
Eval::NNUE::AccumulatorStack accumulators;
auto caches = std::make_unique<Eval::NNUE::AccumulatorCaches>(networks);
accumulators.reset(pos, networks, *caches);
std::stringstream ss;
ss << std::showpoint << std::noshowpos << std::fixed << std::setprecision(2);
@@ -107,12 +111,12 @@ std::string Eval::trace(Position& pos, const Eval::NNUE::Networks& networks) {
ss << std::showpoint << std::showpos << std::fixed << std::setprecision(2) << std::setw(15);
auto [psqt, positional] = networks.big.evaluate(pos, &caches->big);
auto [psqt, positional] = networks.big.evaluate(pos, accumulators, &caches->big);
Value v = psqt + positional;
v = pos.side_to_move() == WHITE ? v : -v;
ss << "NNUE evaluation " << 0.01 * UCIEngine::to_cp(v, pos) << " (white side)\n";
v = evaluate(networks, pos, *caches, VALUE_ZERO);
v = evaluate(networks, pos, accumulators, *caches, VALUE_ZERO);
v = pos.side_to_move() == WHITE ? v : -v;
ss << "Final evaluation " << 0.01 * UCIEngine::to_cp(v, pos) << " (white side)";
ss << " [with scaled NNUE, ...]";
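
For callers outside the search, as in Eval::trace() above, the new Eval::evaluate()
signature means the accumulator stack and caches are created and reset explicitly before
evaluating. A condensed sketch of that pattern, taken from the trace() hunk:

    Eval::NNUE::AccumulatorStack accumulators;
    auto caches = std::make_unique<Eval::NNUE::AccumulatorCaches>(networks);
    accumulators.reset(pos, networks, *caches);   // refresh the root entry for both networks

    Value v = Eval::evaluate(networks, pos, accumulators, *caches, VALUE_ZERO);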


@@ -39,6 +39,7 @@ namespace Eval {
namespace NNUE {
struct Networks;
struct AccumulatorCaches;
class AccumulatorStack;
}
std::string trace(Position& pos, const Eval::NNUE::Networks& networks);
@@ -47,6 +48,7 @@ int simple_eval(const Position& pos, Color c);
bool use_smallnet(const Position& pos);
Value evaluate(const NNUE::Networks& networks,
const Position& pos,
Eval::NNUE::AccumulatorStack& accumulators,
Eval::NNUE::AccumulatorCaches& caches,
int optimism);
} // namespace Eval


@@ -77,8 +77,8 @@ template void HalfKAv2_hm::append_changed_indices<BLACK>(Square ksq,
IndexList& removed,
IndexList& added);
bool HalfKAv2_hm::requires_refresh(const StateInfo* st, Color perspective) {
return st->dirtyPiece.piece[0] == make_piece(perspective, KING);
bool HalfKAv2_hm::requires_refresh(const DirtyPiece& dirtyPiece, Color perspective) {
return dirtyPiece.piece[0] == make_piece(perspective, KING);
}
} // namespace Stockfish::Eval::NNUE::Features
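
With the new signature the caller passes the recorded change directly, for example the
DirtyPiece held by an AccumulatorState. Illustrative check (the state variable here is
hypothetical, shown only for the shape of the call):

    // A full refresh is needed for a perspective exactly when that side's king moved,
    // i.e. the first entry of the DirtyPiece is that side's king.
    bool refreshWhite = HalfKAv2_hm::requires_refresh(state.dirtyPiece, WHITE);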


@@ -28,7 +28,6 @@
#include "../nnue_common.h"
namespace Stockfish {
struct StateInfo;
class Position;
}
@@ -135,9 +134,9 @@ class HalfKAv2_hm {
static void
append_changed_indices(Square ksq, const DirtyPiece& dp, IndexList& removed, IndexList& added);
// Returns whether the change stored in this StateInfo means
// Returns whether the change stored in this DirtyPiece means
// that a full accumulator refresh is required.
static bool requires_refresh(const StateInfo* st, Color perspective);
static bool requires_refresh(const DirtyPiece& dirtyPiece, Color perspective);
};
} // namespace Stockfish::Eval::NNUE::Features


@@ -210,6 +210,7 @@ bool Network<Arch, Transformer>::save(const std::optional<std::string>& filename
template<typename Arch, typename Transformer>
NetworkOutput
Network<Arch, Transformer>::evaluate(const Position& pos,
AccumulatorStack& accumulatorStack,
AccumulatorCaches::Cache<FTDimensions>* cache) const {
// We manually align the arrays on the stack because with gcc < 9.3
// overaligning stack variables with alignas() doesn't work correctly.
@@ -229,8 +230,9 @@ Network<Arch, Transformer>::evaluate(const Position& pos
ASSERT_ALIGNED(transformedFeatures, alignment);
const int bucket = (pos.count<ALL_PIECES>() - 1) / 4;
const auto psqt = featureTransformer->transform(pos, cache, transformedFeatures, bucket);
const int bucket = (pos.count<ALL_PIECES>() - 1) / 4;
const auto psqt =
featureTransformer->transform(pos, accumulatorStack, cache, transformedFeatures, bucket);
const auto positional = network[bucket].propagate(transformedFeatures);
return {static_cast<Value>(psqt / OutputScale), static_cast<Value>(positional / OutputScale)};
}
@@ -280,6 +282,7 @@ void Network<Arch, Transformer>::verify(std::string
template<typename Arch, typename Transformer>
NnueEvalTrace
Network<Arch, Transformer>::trace_evaluate(const Position& pos,
AccumulatorStack& accumulatorStack,
AccumulatorCaches::Cache<FTDimensions>* cache) const {
// We manually align the arrays on the stack because with gcc < 9.3
// overaligning stack variables with alignas() doesn't work correctly.
@@ -303,7 +306,7 @@ Network<Arch, Transformer>::trace_evaluate(const Position&
for (IndexType bucket = 0; bucket < LayerStacks; ++bucket)
{
const auto materialist =
featureTransformer->transform(pos, cache, transformedFeatures, bucket);
featureTransformer->transform(pos, accumulatorStack, cache, transformedFeatures, bucket);
const auto positional = network[bucket].propagate(transformedFeatures);
t.psqt[bucket] = static_cast<Value>(materialist / OutputScale);
@@ -447,14 +450,14 @@ bool Network<Arch, Transformer>::write_parameters(std::ostream& stream,
return bool(stream);
}
// Explicit template instantiation
// Explicit template instantiations
template class Network<
NetworkArchitecture<TransformedFeatureDimensionsBig, L2Big, L3Big>,
FeatureTransformer<TransformedFeatureDimensionsBig, &StateInfo::accumulatorBig>>;
FeatureTransformer<TransformedFeatureDimensionsBig, &AccumulatorState::accumulatorBig>>;
template class Network<
NetworkArchitecture<TransformedFeatureDimensionsSmall, L2Small, L3Small>,
FeatureTransformer<TransformedFeatureDimensionsSmall, &StateInfo::accumulatorSmall>>;
FeatureTransformer<TransformedFeatureDimensionsSmall, &AccumulatorState::accumulatorSmall>>;
} // namespace Stockfish::Eval::NNUE


@@ -29,13 +29,16 @@
#include <utility>
#include "../memory.h"
#include "../position.h"
#include "../types.h"
#include "nnue_accumulator.h"
#include "nnue_architecture.h"
#include "nnue_feature_transformer.h"
#include "nnue_misc.h"
namespace Stockfish {
class Position;
}
namespace Stockfish::Eval::NNUE {
enum class EmbeddedNNUEType {
@@ -64,11 +67,13 @@ class Network {
bool save(const std::optional<std::string>& filename) const;
NetworkOutput evaluate(const Position& pos,
AccumulatorStack& accumulatorStack,
AccumulatorCaches::Cache<FTDimensions>* cache) const;
void verify(std::string evalfilePath, const std::function<void(std::string_view)>&) const;
NnueEvalTrace trace_evaluate(const Position& pos,
AccumulatorStack& accumulatorStack,
AccumulatorCaches::Cache<FTDimensions>* cache) const;
private:
@@ -100,16 +105,18 @@ class Network {
template<IndexType Size>
friend struct AccumulatorCaches::Cache;
friend class AccumulatorStack;
};
// Definitions of the network types
using SmallFeatureTransformer =
FeatureTransformer<TransformedFeatureDimensionsSmall, &StateInfo::accumulatorSmall>;
FeatureTransformer<TransformedFeatureDimensionsSmall, &AccumulatorState::accumulatorSmall>;
using SmallNetworkArchitecture =
NetworkArchitecture<TransformedFeatureDimensionsSmall, L2Small, L3Small>;
using BigFeatureTransformer =
FeatureTransformer<TransformedFeatureDimensionsBig, &StateInfo::accumulatorBig>;
FeatureTransformer<TransformedFeatureDimensionsBig, &AccumulatorState::accumulatorBig>;
using BigNetworkArchitecture = NetworkArchitecture<TransformedFeatureDimensionsBig, L2Big, L3Big>;
using NetworkBig = Network<BigNetworkArchitecture, BigFeatureTransformer>;


@@ -0,0 +1,601 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2025 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Stockfish is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "nnue_accumulator.h"
#include <cassert>
#include <initializer_list>
#include <memory>
#include "../bitboard.h"
#include "../position.h"
#include "../types.h"
#include "nnue_architecture.h"
#include "network.h"
#include "nnue_common.h"
#include "nnue_feature_transformer.h"
namespace Stockfish::Eval::NNUE {
namespace {
template<Color Perspective,
IncUpdateDirection Direction = FORWARD,
IndexType TransformedFeatureDimensions,
Accumulator<TransformedFeatureDimensions> AccumulatorState::*accPtr>
void update_accumulator_incremental(
const FeatureTransformer<TransformedFeatureDimensions, accPtr>& featureTransformer,
const Square ksq,
AccumulatorState& target_state,
const AccumulatorState& computed);
template<Color Perspective, IndexType Dimensions, Accumulator<Dimensions> AccumulatorState::*accPtr>
void update_accumulator_refresh_cache(
const FeatureTransformer<Dimensions, accPtr>& featureTransformer,
const Position& pos,
AccumulatorState& accumulatorState,
AccumulatorCaches::Cache<Dimensions>& cache);
}
void AccumulatorState::reset(const DirtyPiece& dp) noexcept {
dirtyPiece = dp;
accumulatorBig.computed.fill(false);
accumulatorSmall.computed.fill(false);
}
const AccumulatorState& AccumulatorStack::latest() const noexcept {
return m_accumulators[m_current_idx - 1];
}
AccumulatorState& AccumulatorStack::mut_latest() noexcept {
return m_accumulators[m_current_idx - 1];
}
void AccumulatorStack::reset(const Position& rootPos,
const Networks& networks,
AccumulatorCaches& caches) noexcept {
m_current_idx = 1;
update_accumulator_refresh_cache<WHITE, TransformedFeatureDimensionsBig,
&AccumulatorState::accumulatorBig>(
*networks.big.featureTransformer, rootPos, m_accumulators[0], caches.big);
update_accumulator_refresh_cache<BLACK, TransformedFeatureDimensionsBig,
&AccumulatorState::accumulatorBig>(
*networks.big.featureTransformer, rootPos, m_accumulators[0], caches.big);
update_accumulator_refresh_cache<WHITE, TransformedFeatureDimensionsSmall,
&AccumulatorState::accumulatorSmall>(
*networks.small.featureTransformer, rootPos, m_accumulators[0], caches.small);
update_accumulator_refresh_cache<BLACK, TransformedFeatureDimensionsSmall,
&AccumulatorState::accumulatorSmall>(
*networks.small.featureTransformer, rootPos, m_accumulators[0], caches.small);
}
void AccumulatorStack::push(const DirtyPiece& dirtyPiece) noexcept {
assert(m_current_idx + 1 < m_accumulators.size());
m_accumulators[m_current_idx].reset(dirtyPiece);
m_current_idx++;
}
void AccumulatorStack::pop() noexcept {
assert(m_current_idx > 1);
m_current_idx--;
}
template<IndexType Dimensions, Accumulator<Dimensions> AccumulatorState::*accPtr>
void AccumulatorStack::evaluate(const Position& pos,
const FeatureTransformer<Dimensions, accPtr>& featureTransformer,
AccumulatorCaches::Cache<Dimensions>& cache) noexcept {
evaluate_side<WHITE>(pos, featureTransformer, cache);
evaluate_side<BLACK>(pos, featureTransformer, cache);
}
template<Color Perspective, IndexType Dimensions, Accumulator<Dimensions> AccumulatorState::*accPtr>
void AccumulatorStack::evaluate_side(
const Position& pos,
const FeatureTransformer<Dimensions, accPtr>& featureTransformer,
AccumulatorCaches::Cache<Dimensions>& cache) noexcept {
const auto last_usable_accum = find_last_usable_accumulator<Perspective, Dimensions, accPtr>();
if ((m_accumulators[last_usable_accum].*accPtr).computed[Perspective])
forward_update_incremental<Perspective>(pos, featureTransformer, last_usable_accum);
else
{
update_accumulator_refresh_cache<Perspective>(featureTransformer, pos, mut_latest(), cache);
backward_update_incremental<Perspective>(pos, featureTransformer, last_usable_accum);
}
}
// Find the earliest usable accumulator: either an already computed accumulator or the
// accumulator state just before a change that requires a full refresh.
template<Color Perspective, IndexType Dimensions, Accumulator<Dimensions> AccumulatorState::*accPtr>
std::size_t AccumulatorStack::find_last_usable_accumulator() const noexcept {
for (std::size_t curr_idx = m_current_idx - 1; curr_idx > 0; curr_idx--)
{
if ((m_accumulators[curr_idx].*accPtr).computed[Perspective])
return curr_idx;
if (FeatureSet::requires_refresh(m_accumulators[curr_idx].dirtyPiece, Perspective))
return curr_idx;
}
return 0;
}
template<Color Perspective, IndexType Dimensions, Accumulator<Dimensions> AccumulatorState::*accPtr>
void AccumulatorStack::forward_update_incremental(
const Position& pos,
const FeatureTransformer<Dimensions, accPtr>& featureTransformer,
const std::size_t begin) noexcept {
assert(begin < m_accumulators.size());
assert((m_accumulators[begin].*accPtr).computed[Perspective]);
const Square ksq = pos.square<KING>(Perspective);
for (std::size_t next = begin + 1; next < m_current_idx; next++)
update_accumulator_incremental<Perspective>(featureTransformer, ksq, m_accumulators[next],
m_accumulators[next - 1]);
assert((latest().*accPtr).computed[Perspective]);
}
template<Color Perspective, IndexType Dimensions, Accumulator<Dimensions> AccumulatorState::*accPtr>
void AccumulatorStack::backward_update_incremental(
const Position& pos,
const FeatureTransformer<Dimensions, accPtr>& featureTransformer,
const std::size_t end) noexcept {
assert(end < m_accumulators.size());
assert(end < m_current_idx);
assert((latest().*accPtr).computed[Perspective]);
const Square ksq = pos.square<KING>(Perspective);
for (std::size_t next = m_current_idx - 2; next >= end; next--)
update_accumulator_incremental<Perspective, BACKWARDS>(
featureTransformer, ksq, m_accumulators[next], m_accumulators[next + 1]);
assert((m_accumulators[end].*accPtr).computed[Perspective]);
}
// Explicit template instantiations
template void
AccumulatorStack::evaluate<TransformedFeatureDimensionsBig, &AccumulatorState::accumulatorBig>(
const Position& pos,
const FeatureTransformer<TransformedFeatureDimensionsBig, &AccumulatorState::accumulatorBig>&
featureTransformer,
AccumulatorCaches::Cache<TransformedFeatureDimensionsBig>& cache) noexcept;
template void
AccumulatorStack::evaluate<TransformedFeatureDimensionsSmall, &AccumulatorState::accumulatorSmall>(
const Position& pos,
const FeatureTransformer<TransformedFeatureDimensionsSmall, &AccumulatorState::accumulatorSmall>&
featureTransformer,
AccumulatorCaches::Cache<TransformedFeatureDimensionsSmall>& cache) noexcept;
namespace {
template<Color Perspective,
IncUpdateDirection Direction,
IndexType TransformedFeatureDimensions,
Accumulator<TransformedFeatureDimensions> AccumulatorState::*accPtr>
void update_accumulator_incremental(
const FeatureTransformer<TransformedFeatureDimensions, accPtr>& featureTransformer,
const Square ksq,
AccumulatorState& target_state,
const AccumulatorState& computed) {
[[maybe_unused]] constexpr bool Forward = Direction == FORWARD;
[[maybe_unused]] constexpr bool Backwards = Direction == BACKWARDS;
assert(Forward != Backwards);
assert((computed.*accPtr).computed[Perspective]);
assert(!(target_state.*accPtr).computed[Perspective]);
// The size must be enough to contain the largest possible update.
// That might depend on the feature set and generally relies on the
// feature set's update cost calculation to be correct and never allow
// updates with more added/removed features than MaxActiveDimensions.
// In this case, the maximum size of both feature addition and removal
// is 2, since we are incrementally updating one move at a time.
FeatureSet::IndexList removed, added;
if constexpr (Forward)
FeatureSet::append_changed_indices<Perspective>(ksq, target_state.dirtyPiece, removed,
added);
else
FeatureSet::append_changed_indices<Perspective>(ksq, computed.dirtyPiece, added, removed);
if (removed.size() == 0 && added.size() == 0)
{
std::memcpy((target_state.*accPtr).accumulation[Perspective],
(computed.*accPtr).accumulation[Perspective],
TransformedFeatureDimensions * sizeof(BiasType));
std::memcpy((target_state.*accPtr).psqtAccumulation[Perspective],
(computed.*accPtr).psqtAccumulation[Perspective],
PSQTBuckets * sizeof(PSQTWeightType));
}
else
{
assert(added.size() == 1 || added.size() == 2);
assert(removed.size() == 1 || removed.size() == 2);
if (Forward)
assert(added.size() <= removed.size());
else
assert(removed.size() <= added.size());
#ifdef VECTOR
auto* accIn =
reinterpret_cast<const vec_t*>(&(computed.*accPtr).accumulation[Perspective][0]);
auto* accOut =
reinterpret_cast<vec_t*>(&(target_state.*accPtr).accumulation[Perspective][0]);
const IndexType offsetA0 = TransformedFeatureDimensions * added[0];
auto* columnA0 = reinterpret_cast<const vec_t*>(&featureTransformer.weights[offsetA0]);
const IndexType offsetR0 = TransformedFeatureDimensions * removed[0];
auto* columnR0 = reinterpret_cast<const vec_t*>(&featureTransformer.weights[offsetR0]);
if ((Forward && removed.size() == 1) || (Backwards && added.size() == 1))
{
assert(added.size() == 1 && removed.size() == 1);
for (IndexType i = 0;
i < TransformedFeatureDimensions * sizeof(WeightType) / sizeof(vec_t); ++i)
accOut[i] = vec_add_16(vec_sub_16(accIn[i], columnR0[i]), columnA0[i]);
}
else if (Forward && added.size() == 1)
{
assert(removed.size() == 2);
const IndexType offsetR1 = TransformedFeatureDimensions * removed[1];
auto* columnR1 = reinterpret_cast<const vec_t*>(&featureTransformer.weights[offsetR1]);
for (IndexType i = 0;
i < TransformedFeatureDimensions * sizeof(WeightType) / sizeof(vec_t); ++i)
accOut[i] = vec_sub_16(vec_add_16(accIn[i], columnA0[i]),
vec_add_16(columnR0[i], columnR1[i]));
}
else if (Backwards && removed.size() == 1)
{
assert(added.size() == 2);
const IndexType offsetA1 = TransformedFeatureDimensions * added[1];
auto* columnA1 = reinterpret_cast<const vec_t*>(&featureTransformer.weights[offsetA1]);
for (IndexType i = 0;
i < TransformedFeatureDimensions * sizeof(WeightType) / sizeof(vec_t); ++i)
accOut[i] = vec_add_16(vec_add_16(accIn[i], columnA0[i]),
vec_sub_16(columnA1[i], columnR0[i]));
}
else
{
assert(added.size() == 2 && removed.size() == 2);
const IndexType offsetA1 = TransformedFeatureDimensions * added[1];
auto* columnA1 = reinterpret_cast<const vec_t*>(&featureTransformer.weights[offsetA1]);
const IndexType offsetR1 = TransformedFeatureDimensions * removed[1];
auto* columnR1 = reinterpret_cast<const vec_t*>(&featureTransformer.weights[offsetR1]);
for (IndexType i = 0;
i < TransformedFeatureDimensions * sizeof(WeightType) / sizeof(vec_t); ++i)
accOut[i] = vec_add_16(accIn[i], vec_sub_16(vec_add_16(columnA0[i], columnA1[i]),
vec_add_16(columnR0[i], columnR1[i])));
}
auto* accPsqtIn =
reinterpret_cast<const psqt_vec_t*>(&(computed.*accPtr).psqtAccumulation[Perspective][0]);
auto* accPsqtOut =
reinterpret_cast<psqt_vec_t*>(&(target_state.*accPtr).psqtAccumulation[Perspective][0]);
const IndexType offsetPsqtA0 = PSQTBuckets * added[0];
auto* columnPsqtA0 =
reinterpret_cast<const psqt_vec_t*>(&featureTransformer.psqtWeights[offsetPsqtA0]);
const IndexType offsetPsqtR0 = PSQTBuckets * removed[0];
auto* columnPsqtR0 =
reinterpret_cast<const psqt_vec_t*>(&featureTransformer.psqtWeights[offsetPsqtR0]);
if ((Forward && removed.size() == 1)
|| (Backwards && added.size() == 1)) // added.size() == removed.size() == 1
{
for (std::size_t i = 0; i < PSQTBuckets * sizeof(PSQTWeightType) / sizeof(psqt_vec_t);
++i)
accPsqtOut[i] =
vec_add_psqt_32(vec_sub_psqt_32(accPsqtIn[i], columnPsqtR0[i]), columnPsqtA0[i]);
}
else if (Forward && added.size() == 1)
{
const IndexType offsetPsqtR1 = PSQTBuckets * removed[1];
auto* columnPsqtR1 =
reinterpret_cast<const psqt_vec_t*>(&featureTransformer.psqtWeights[offsetPsqtR1]);
for (std::size_t i = 0; i < PSQTBuckets * sizeof(PSQTWeightType) / sizeof(psqt_vec_t);
++i)
accPsqtOut[i] = vec_sub_psqt_32(vec_add_psqt_32(accPsqtIn[i], columnPsqtA0[i]),
vec_add_psqt_32(columnPsqtR0[i], columnPsqtR1[i]));
}
else if (Backwards && removed.size() == 1)
{
const IndexType offsetPsqtA1 = PSQTBuckets * added[1];
auto* columnPsqtA1 =
reinterpret_cast<const psqt_vec_t*>(&featureTransformer.psqtWeights[offsetPsqtA1]);
for (std::size_t i = 0; i < PSQTBuckets * sizeof(PSQTWeightType) / sizeof(psqt_vec_t);
++i)
accPsqtOut[i] = vec_add_psqt_32(vec_add_psqt_32(accPsqtIn[i], columnPsqtA0[i]),
vec_sub_psqt_32(columnPsqtA1[i], columnPsqtR0[i]));
}
else
{
const IndexType offsetPsqtA1 = PSQTBuckets * added[1];
auto* columnPsqtA1 =
reinterpret_cast<const psqt_vec_t*>(&featureTransformer.psqtWeights[offsetPsqtA1]);
const IndexType offsetPsqtR1 = PSQTBuckets * removed[1];
auto* columnPsqtR1 =
reinterpret_cast<const psqt_vec_t*>(&featureTransformer.psqtWeights[offsetPsqtR1]);
for (std::size_t i = 0; i < PSQTBuckets * sizeof(PSQTWeightType) / sizeof(psqt_vec_t);
++i)
accPsqtOut[i] = vec_add_psqt_32(
accPsqtIn[i], vec_sub_psqt_32(vec_add_psqt_32(columnPsqtA0[i], columnPsqtA1[i]),
vec_add_psqt_32(columnPsqtR0[i], columnPsqtR1[i])));
}
#else
std::memcpy((target_state.*accPtr).accumulation[Perspective],
(computed.*accPtr).accumulation[Perspective],
TransformedFeatureDimensions * sizeof(BiasType));
std::memcpy((target_state.*accPtr).psqtAccumulation[Perspective],
(computed.*accPtr).psqtAccumulation[Perspective],
PSQTBuckets * sizeof(PSQTWeightType));
// Difference calculation for the deactivated features
for (const auto index : removed)
{
const IndexType offset = TransformedFeatureDimensions * index;
for (IndexType i = 0; i < TransformedFeatureDimensions; ++i)
(target_state.*accPtr).accumulation[Perspective][i] -=
featureTransformer.weights[offset + i];
for (std::size_t i = 0; i < PSQTBuckets; ++i)
(target_state.*accPtr).psqtAccumulation[Perspective][i] -=
featureTransformer.psqtWeights[index * PSQTBuckets + i];
}
// Difference calculation for the activated features
for (const auto index : added)
{
const IndexType offset = TransformedFeatureDimensions * index;
for (IndexType i = 0; i < TransformedFeatureDimensions; ++i)
(target_state.*accPtr).accumulation[Perspective][i] +=
featureTransformer.weights[offset + i];
for (std::size_t i = 0; i < PSQTBuckets; ++i)
(target_state.*accPtr).psqtAccumulation[Perspective][i] +=
featureTransformer.psqtWeights[index * PSQTBuckets + i];
}
#endif
}
(target_state.*accPtr).computed[Perspective] = true;
}
template<Color Perspective, IndexType Dimensions, Accumulator<Dimensions> AccumulatorState::*accPtr>
void update_accumulator_refresh_cache(
const FeatureTransformer<Dimensions, accPtr>& featureTransformer,
const Position& pos,
AccumulatorState& accumulatorState,
AccumulatorCaches::Cache<Dimensions>& cache) {
using Tiling [[maybe_unused]] = SIMDTiling<Dimensions, Dimensions>;
const Square ksq = pos.square<KING>(Perspective);
auto& entry = cache[ksq][Perspective];
FeatureSet::IndexList removed, added;
for (Color c : {WHITE, BLACK})
{
for (PieceType pt = PAWN; pt <= KING; ++pt)
{
const Piece piece = make_piece(c, pt);
const Bitboard oldBB = entry.byColorBB[c] & entry.byTypeBB[pt];
const Bitboard newBB = pos.pieces(c, pt);
Bitboard toRemove = oldBB & ~newBB;
Bitboard toAdd = newBB & ~oldBB;
while (toRemove)
{
Square sq = pop_lsb(toRemove);
removed.push_back(FeatureSet::make_index<Perspective>(sq, piece, ksq));
}
while (toAdd)
{
Square sq = pop_lsb(toAdd);
added.push_back(FeatureSet::make_index<Perspective>(sq, piece, ksq));
}
}
}
auto& accumulator = accumulatorState.*accPtr;
accumulator.computed[Perspective] = true;
#ifdef VECTOR
const bool combineLast3 =
std::abs((int) removed.size() - (int) added.size()) == 1 && removed.size() + added.size() > 2;
vec_t acc[Tiling::NumRegs];
psqt_vec_t psqt[Tiling::NumPsqtRegs];
for (IndexType j = 0; j < Dimensions / Tiling::TileHeight; ++j)
{
auto* accTile =
reinterpret_cast<vec_t*>(&accumulator.accumulation[Perspective][j * Tiling::TileHeight]);
auto* entryTile = reinterpret_cast<vec_t*>(&entry.accumulation[j * Tiling::TileHeight]);
for (IndexType k = 0; k < Tiling::NumRegs; ++k)
acc[k] = entryTile[k];
std::size_t i = 0;
for (; i < std::min(removed.size(), added.size()) - combineLast3; ++i)
{
IndexType indexR = removed[i];
const IndexType offsetR = Dimensions * indexR + j * Tiling::TileHeight;
auto* columnR = reinterpret_cast<const vec_t*>(&featureTransformer.weights[offsetR]);
IndexType indexA = added[i];
const IndexType offsetA = Dimensions * indexA + j * Tiling::TileHeight;
auto* columnA = reinterpret_cast<const vec_t*>(&featureTransformer.weights[offsetA]);
for (IndexType k = 0; k < Tiling::NumRegs; ++k)
acc[k] = vec_add_16(acc[k], vec_sub_16(columnA[k], columnR[k]));
}
if (combineLast3)
{
IndexType indexR = removed[i];
const IndexType offsetR = Dimensions * indexR + j * Tiling::TileHeight;
auto* columnR = reinterpret_cast<const vec_t*>(&featureTransformer.weights[offsetR]);
IndexType indexA = added[i];
const IndexType offsetA = Dimensions * indexA + j * Tiling::TileHeight;
auto* columnA = reinterpret_cast<const vec_t*>(&featureTransformer.weights[offsetA]);
if (removed.size() > added.size())
{
IndexType indexR2 = removed[i + 1];
const IndexType offsetR2 = Dimensions * indexR2 + j * Tiling::TileHeight;
auto* columnR2 =
reinterpret_cast<const vec_t*>(&featureTransformer.weights[offsetR2]);
for (IndexType k = 0; k < Tiling::NumRegs; ++k)
acc[k] = vec_sub_16(vec_add_16(acc[k], columnA[k]),
vec_add_16(columnR[k], columnR2[k]));
}
else
{
IndexType indexA2 = added[i + 1];
const IndexType offsetA2 = Dimensions * indexA2 + j * Tiling::TileHeight;
auto* columnA2 =
reinterpret_cast<const vec_t*>(&featureTransformer.weights[offsetA2]);
for (IndexType k = 0; k < Tiling::NumRegs; ++k)
acc[k] = vec_add_16(vec_sub_16(acc[k], columnR[k]),
vec_add_16(columnA[k], columnA2[k]));
}
}
else
{
for (; i < removed.size(); ++i)
{
IndexType index = removed[i];
const IndexType offset = Dimensions * index + j * Tiling::TileHeight;
auto* column = reinterpret_cast<const vec_t*>(&featureTransformer.weights[offset]);
for (IndexType k = 0; k < Tiling::NumRegs; ++k)
acc[k] = vec_sub_16(acc[k], column[k]);
}
for (; i < added.size(); ++i)
{
IndexType index = added[i];
const IndexType offset = Dimensions * index + j * Tiling::TileHeight;
auto* column = reinterpret_cast<const vec_t*>(&featureTransformer.weights[offset]);
for (IndexType k = 0; k < Tiling::NumRegs; ++k)
acc[k] = vec_add_16(acc[k], column[k]);
}
}
for (IndexType k = 0; k < Tiling::NumRegs; k++)
vec_store(&entryTile[k], acc[k]);
for (IndexType k = 0; k < Tiling::NumRegs; k++)
vec_store(&accTile[k], acc[k]);
}
for (IndexType j = 0; j < PSQTBuckets / Tiling::PsqtTileHeight; ++j)
{
auto* accTilePsqt = reinterpret_cast<psqt_vec_t*>(
&accumulator.psqtAccumulation[Perspective][j * Tiling::PsqtTileHeight]);
auto* entryTilePsqt =
reinterpret_cast<psqt_vec_t*>(&entry.psqtAccumulation[j * Tiling::PsqtTileHeight]);
for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k)
psqt[k] = entryTilePsqt[k];
for (std::size_t i = 0; i < removed.size(); ++i)
{
IndexType index = removed[i];
const IndexType offset = PSQTBuckets * index + j * Tiling::PsqtTileHeight;
auto* columnPsqt =
reinterpret_cast<const psqt_vec_t*>(&featureTransformer.psqtWeights[offset]);
for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k)
psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]);
}
for (std::size_t i = 0; i < added.size(); ++i)
{
IndexType index = added[i];
const IndexType offset = PSQTBuckets * index + j * Tiling::PsqtTileHeight;
auto* columnPsqt =
reinterpret_cast<const psqt_vec_t*>(&featureTransformer.psqtWeights[offset]);
for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k)
psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]);
}
for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k)
vec_store_psqt(&entryTilePsqt[k], psqt[k]);
for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k)
vec_store_psqt(&accTilePsqt[k], psqt[k]);
}
#else
for (const auto index : removed)
{
const IndexType offset = Dimensions * index;
for (IndexType j = 0; j < Dimensions; ++j)
entry.accumulation[j] -= featureTransformer.weights[offset + j];
for (std::size_t k = 0; k < PSQTBuckets; ++k)
entry.psqtAccumulation[k] -= featureTransformer.psqtWeights[index * PSQTBuckets + k];
}
for (const auto index : added)
{
const IndexType offset = Dimensions * index;
for (IndexType j = 0; j < Dimensions; ++j)
entry.accumulation[j] += featureTransformer.weights[offset + j];
for (std::size_t k = 0; k < PSQTBuckets; ++k)
entry.psqtAccumulation[k] += featureTransformer.psqtWeights[index * PSQTBuckets + k];
}
// The accumulator of the refresh entry has been updated.
// Now copy its content to the actual accumulator we were refreshing.
std::memcpy(accumulator.accumulation[Perspective], entry.accumulation,
sizeof(BiasType) * Dimensions);
std::memcpy(accumulator.psqtAccumulation[Perspective], entry.psqtAccumulation,
sizeof(int32_t) * PSQTBuckets);
#endif
for (Color c : {WHITE, BLACK})
entry.byColorBB[c] = pos.pieces(c);
for (PieceType pt = PAWN; pt <= KING; ++pt)
entry.byTypeBB[pt] = pos.pieces(pt);
}
}
}
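
The core of the new logic is evaluate_side(): for each perspective it either replays the
recorded DirtyPieces forward from the last computed accumulator, or, when the scan runs
into a king move first, refreshes the newest entry from the king-bucket cache and fills
the intermediate entries backwards. A small worked example of the two paths (a sketch,
not output of the code):

    stack index:   0      1      2      3      4 (latest)
    computed:      yes    yes    no     no     no

    No king move recorded at entries 2..4:
        find_last_usable_accumulator() returns 1 (computed), so the FORWARD pass
        updates entries 2, 3 and 4 from their DirtyPieces.

    King move recorded at entry 3:
        the scan stops at 3 (requires_refresh), which is not computed, so entry 4 is
        rebuilt from the cache and the BACKWARDS pass then fills entry 3 for later reuse.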


@@ -21,23 +21,43 @@
#ifndef NNUE_ACCUMULATOR_H_INCLUDED
#define NNUE_ACCUMULATOR_H_INCLUDED
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>
#include "../types.h"
#include "nnue_architecture.h"
#include "nnue_common.h"
namespace Stockfish {
class Position;
}
namespace Stockfish::Eval::NNUE {
using BiasType = std::int16_t;
using PSQTWeightType = std::int32_t;
using IndexType = std::uint32_t;
struct Networks;
template<IndexType Size>
struct alignas(CacheLineSize) Accumulator;
struct AccumulatorState;
template<IndexType TransformedFeatureDimensions,
Accumulator<TransformedFeatureDimensions> AccumulatorState::*accPtr>
class FeatureTransformer;
// Class that holds the result of affine transformation of input features
template<IndexType Size>
struct alignas(CacheLineSize) Accumulator {
std::int16_t accumulation[COLOR_NB][Size];
std::int32_t psqtAccumulation[COLOR_NB][PSQTBuckets];
bool computed[COLOR_NB];
std::int16_t accumulation[COLOR_NB][Size];
std::int32_t psqtAccumulation[COLOR_NB][PSQTBuckets];
std::array<bool, COLOR_NB> computed;
};
@@ -95,6 +115,69 @@ struct AccumulatorCaches {
Cache<TransformedFeatureDimensionsSmall> small;
};
struct AccumulatorState {
Accumulator<TransformedFeatureDimensionsBig> accumulatorBig;
Accumulator<TransformedFeatureDimensionsSmall> accumulatorSmall;
DirtyPiece dirtyPiece;
void reset(const DirtyPiece& dp) noexcept;
};
class AccumulatorStack {
public:
AccumulatorStack() :
m_accumulators(MAX_PLY + 1),
m_current_idx{} {}
[[nodiscard]] const AccumulatorState& latest() const noexcept;
void
reset(const Position& rootPos, const Networks& networks, AccumulatorCaches& caches) noexcept;
void push(const DirtyPiece& dirtyPiece) noexcept;
void pop() noexcept;
template<IndexType Dimensions, Accumulator<Dimensions> AccumulatorState::*accPtr>
void evaluate(const Position& pos,
const FeatureTransformer<Dimensions, accPtr>& featureTransformer,
AccumulatorCaches::Cache<Dimensions>& cache) noexcept;
private:
[[nodiscard]] AccumulatorState& mut_latest() noexcept;
template<Color Perspective,
IndexType Dimensions,
Accumulator<Dimensions> AccumulatorState::*accPtr>
void evaluate_side(const Position& pos,
const FeatureTransformer<Dimensions, accPtr>& featureTransformer,
AccumulatorCaches::Cache<Dimensions>& cache) noexcept;
template<Color Perspective,
IndexType Dimensions,
Accumulator<Dimensions> AccumulatorState::*accPtr>
[[nodiscard]] std::size_t find_last_usable_accumulator() const noexcept;
template<Color Perspective,
IndexType Dimensions,
Accumulator<Dimensions> AccumulatorState::*accPtr>
void
forward_update_incremental(const Position& pos,
const FeatureTransformer<Dimensions, accPtr>& featureTransformer,
const std::size_t begin) noexcept;
template<Color Perspective,
IndexType Dimensions,
Accumulator<Dimensions> AccumulatorState::*accPtr>
void
backward_update_incremental(const Position& pos,
const FeatureTransformer<Dimensions, accPtr>& featureTransformer,
const std::size_t end) noexcept;
std::vector<AccumulatorState> m_accumulators;
std::size_t m_current_idx;
};
} // namespace Stockfish::Eval::NNUE
#endif // NNUE_ACCUMULATOR_H_INCLUDED


@@ -279,6 +279,11 @@ inline void write_leb_128(std::ostream& stream, const IntType* values, std::size
flush();
}
enum IncUpdateDirection {
FORWARD,
BACKWARDS
};
} // namespace Stockfish::Eval::NNUE
#endif // #ifndef NNUE_COMMON_H_INCLUDED


@@ -22,12 +22,9 @@
#define NNUE_FEATURE_TRANSFORMER_H_INCLUDED
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <iosfwd>
#include <type_traits>
#include <utility>
#include "../position.h"
#include "../types.h"
@@ -41,11 +38,6 @@ using BiasType = std::int16_t;
using WeightType = std::int16_t;
using PSQTWeightType = std::int32_t;
enum IncUpdateDirection {
FORWARD,
BACKWARDS
};
// If vector instructions are enabled, we update and refresh the
// accumulator tile by tile such that each tile fits in the CPU's
// vector registers.
@@ -249,15 +241,12 @@ class SIMDTiling {
// Input feature converter
template<IndexType TransformedFeatureDimensions,
Accumulator<TransformedFeatureDimensions> StateInfo::*accPtr>
Accumulator<TransformedFeatureDimensions> AccumulatorState::*accPtr>
class FeatureTransformer {
// Number of output dimensions for one side
static constexpr IndexType HalfDimensions = TransformedFeatureDimensions;
private:
using Tiling = SIMDTiling<TransformedFeatureDimensions, HalfDimensions>;
public:
// Output type
using OutputType = TransformedFeatureType;
@@ -348,19 +337,21 @@ class FeatureTransformer {
// Convert input features
std::int32_t transform(const Position& pos,
AccumulatorStack& accumulatorStack,
AccumulatorCaches::Cache<HalfDimensions>* cache,
OutputType* output,
int bucket) const {
update_accumulator<WHITE>(pos, cache);
update_accumulator<BLACK>(pos, cache);
accumulatorStack.evaluate(pos, *this, *cache);
const auto& accumulatorState = accumulatorStack.latest();
const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
const auto& psqtAccumulation = (pos.state()->*accPtr).psqtAccumulation;
const auto& psqtAccumulation = (accumulatorState.*accPtr).psqtAccumulation;
const auto psqt =
(psqtAccumulation[perspectives[0]][bucket] - psqtAccumulation[perspectives[1]][bucket])
/ 2;
const auto& accumulation = (pos.state()->*accPtr).accumulation;
const auto& accumulation = (accumulatorState.*accPtr).accumulation;
for (IndexType p = 0; p < 2; ++p)
{
@@ -473,432 +464,6 @@ class FeatureTransformer {
return psqt;
} // end of function transform()
private:
// Given a computed accumulator, computes the accumulator of another position.
template<Color Perspective, IncUpdateDirection Direction = FORWARD>
void update_accumulator_incremental(const Square ksq,
StateInfo* target_state,
const StateInfo* computed) const {
[[maybe_unused]] constexpr bool Forward = Direction == FORWARD;
[[maybe_unused]] constexpr bool Backwards = Direction == BACKWARDS;
assert((computed->*accPtr).computed[Perspective]);
StateInfo* next = Forward ? computed->next : computed->previous;
assert(next != nullptr);
assert(!(next->*accPtr).computed[Perspective]);
// The size must be enough to contain the largest possible update.
// That might depend on the feature set and generally relies on the
// feature set's update cost calculation to be correct and never allow
// updates with more added/removed features than MaxActiveDimensions.
// In this case, the maximum size of both feature addition and removal
// is 2, since we are incrementally updating one move at a time.
FeatureSet::IndexList removed, added;
if constexpr (Forward)
FeatureSet::append_changed_indices<Perspective>(ksq, next->dirtyPiece, removed, added);
else
FeatureSet::append_changed_indices<Perspective>(ksq, computed->dirtyPiece, added,
removed);
if (removed.size() == 0 && added.size() == 0)
{
std::memcpy((next->*accPtr).accumulation[Perspective],
(computed->*accPtr).accumulation[Perspective],
HalfDimensions * sizeof(BiasType));
std::memcpy((next->*accPtr).psqtAccumulation[Perspective],
(computed->*accPtr).psqtAccumulation[Perspective],
PSQTBuckets * sizeof(PSQTWeightType));
}
else
{
assert(added.size() == 1 || added.size() == 2);
assert(removed.size() == 1 || removed.size() == 2);
if (Forward)
assert(added.size() <= removed.size());
else
assert(removed.size() <= added.size());
#ifdef VECTOR
auto* accIn =
reinterpret_cast<const vec_t*>(&(computed->*accPtr).accumulation[Perspective][0]);
auto* accOut = reinterpret_cast<vec_t*>(&(next->*accPtr).accumulation[Perspective][0]);
const IndexType offsetA0 = HalfDimensions * added[0];
auto* columnA0 = reinterpret_cast<const vec_t*>(&weights[offsetA0]);
const IndexType offsetR0 = HalfDimensions * removed[0];
auto* columnR0 = reinterpret_cast<const vec_t*>(&weights[offsetR0]);
if ((Forward && removed.size() == 1) || (Backwards && added.size() == 1))
{
assert(added.size() == 1 && removed.size() == 1);
for (IndexType i = 0; i < HalfDimensions * sizeof(WeightType) / sizeof(vec_t); ++i)
accOut[i] = vec_add_16(vec_sub_16(accIn[i], columnR0[i]), columnA0[i]);
}
else if (Forward && added.size() == 1)
{
assert(removed.size() == 2);
const IndexType offsetR1 = HalfDimensions * removed[1];
auto* columnR1 = reinterpret_cast<const vec_t*>(&weights[offsetR1]);
for (IndexType i = 0; i < HalfDimensions * sizeof(WeightType) / sizeof(vec_t); ++i)
accOut[i] = vec_sub_16(vec_add_16(accIn[i], columnA0[i]),
vec_add_16(columnR0[i], columnR1[i]));
}
else if (Backwards && removed.size() == 1)
{
assert(added.size() == 2);
const IndexType offsetA1 = HalfDimensions * added[1];
auto* columnA1 = reinterpret_cast<const vec_t*>(&weights[offsetA1]);
for (IndexType i = 0; i < HalfDimensions * sizeof(WeightType) / sizeof(vec_t); ++i)
accOut[i] = vec_add_16(vec_add_16(accIn[i], columnA0[i]),
vec_sub_16(columnA1[i], columnR0[i]));
}
else
{
assert(added.size() == 2 && removed.size() == 2);
const IndexType offsetA1 = HalfDimensions * added[1];
auto* columnA1 = reinterpret_cast<const vec_t*>(&weights[offsetA1]);
const IndexType offsetR1 = HalfDimensions * removed[1];
auto* columnR1 = reinterpret_cast<const vec_t*>(&weights[offsetR1]);
for (IndexType i = 0; i < HalfDimensions * sizeof(WeightType) / sizeof(vec_t); ++i)
accOut[i] =
vec_add_16(accIn[i], vec_sub_16(vec_add_16(columnA0[i], columnA1[i]),
vec_add_16(columnR0[i], columnR1[i])));
}
auto* accPsqtIn = reinterpret_cast<const psqt_vec_t*>(
&(computed->*accPtr).psqtAccumulation[Perspective][0]);
auto* accPsqtOut =
reinterpret_cast<psqt_vec_t*>(&(next->*accPtr).psqtAccumulation[Perspective][0]);
const IndexType offsetPsqtA0 = PSQTBuckets * added[0];
auto* columnPsqtA0 = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offsetPsqtA0]);
const IndexType offsetPsqtR0 = PSQTBuckets * removed[0];
auto* columnPsqtR0 = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offsetPsqtR0]);
if ((Forward && removed.size() == 1)
|| (Backwards && added.size() == 1)) // added.size() == removed.size() == 1
{
for (std::size_t i = 0;
i < PSQTBuckets * sizeof(PSQTWeightType) / sizeof(psqt_vec_t); ++i)
accPsqtOut[i] = vec_add_psqt_32(vec_sub_psqt_32(accPsqtIn[i], columnPsqtR0[i]),
columnPsqtA0[i]);
}
else if (Forward && added.size() == 1)
{
const IndexType offsetPsqtR1 = PSQTBuckets * removed[1];
auto* columnPsqtR1 =
reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offsetPsqtR1]);
for (std::size_t i = 0;
i < PSQTBuckets * sizeof(PSQTWeightType) / sizeof(psqt_vec_t); ++i)
accPsqtOut[i] =
vec_sub_psqt_32(vec_add_psqt_32(accPsqtIn[i], columnPsqtA0[i]),
vec_add_psqt_32(columnPsqtR0[i], columnPsqtR1[i]));
}
else if (Backwards && removed.size() == 1)
{
const IndexType offsetPsqtA1 = PSQTBuckets * added[1];
auto* columnPsqtA1 =
reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offsetPsqtA1]);
for (std::size_t i = 0;
i < PSQTBuckets * sizeof(PSQTWeightType) / sizeof(psqt_vec_t); ++i)
accPsqtOut[i] =
vec_add_psqt_32(vec_add_psqt_32(accPsqtIn[i], columnPsqtA0[i]),
vec_sub_psqt_32(columnPsqtA1[i], columnPsqtR0[i]));
}
else
{
const IndexType offsetPsqtA1 = PSQTBuckets * added[1];
auto* columnPsqtA1 =
reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offsetPsqtA1]);
const IndexType offsetPsqtR1 = PSQTBuckets * removed[1];
auto* columnPsqtR1 =
reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offsetPsqtR1]);
for (std::size_t i = 0;
i < PSQTBuckets * sizeof(PSQTWeightType) / sizeof(psqt_vec_t); ++i)
accPsqtOut[i] = vec_add_psqt_32(
accPsqtIn[i],
vec_sub_psqt_32(vec_add_psqt_32(columnPsqtA0[i], columnPsqtA1[i]),
vec_add_psqt_32(columnPsqtR0[i], columnPsqtR1[i])));
}
#else
std::memcpy((next->*accPtr).accumulation[Perspective],
(computed->*accPtr).accumulation[Perspective],
HalfDimensions * sizeof(BiasType));
std::memcpy((next->*accPtr).psqtAccumulation[Perspective],
(computed->*accPtr).psqtAccumulation[Perspective],
PSQTBuckets * sizeof(PSQTWeightType));
// Difference calculation for the deactivated features
for (const auto index : removed)
{
const IndexType offset = HalfDimensions * index;
for (IndexType i = 0; i < HalfDimensions; ++i)
(next->*accPtr).accumulation[Perspective][i] -= weights[offset + i];
for (std::size_t i = 0; i < PSQTBuckets; ++i)
(next->*accPtr).psqtAccumulation[Perspective][i] -=
psqtWeights[index * PSQTBuckets + i];
}
// Difference calculation for the activated features
for (const auto index : added)
{
const IndexType offset = HalfDimensions * index;
for (IndexType i = 0; i < HalfDimensions; ++i)
(next->*accPtr).accumulation[Perspective][i] += weights[offset + i];
for (std::size_t i = 0; i < PSQTBuckets; ++i)
(next->*accPtr).psqtAccumulation[Perspective][i] +=
psqtWeights[index * PSQTBuckets + i];
}
#endif
}
(next->*accPtr).computed[Perspective] = true;
if (next != target_state)
update_accumulator_incremental<Perspective, Direction>(ksq, target_state, next);
}
template<Color Perspective>
void update_accumulator_refresh_cache(const Position& pos,
AccumulatorCaches::Cache<HalfDimensions>* cache) const {
assert(cache != nullptr);
Square ksq = pos.square<KING>(Perspective);
auto& entry = (*cache)[ksq][Perspective];
FeatureSet::IndexList removed, added;
for (Color c : {WHITE, BLACK})
{
for (PieceType pt = PAWN; pt <= KING; ++pt)
{
const Piece piece = make_piece(c, pt);
const Bitboard oldBB = entry.byColorBB[c] & entry.byTypeBB[pt];
const Bitboard newBB = pos.pieces(c, pt);
Bitboard toRemove = oldBB & ~newBB;
Bitboard toAdd = newBB & ~oldBB;
while (toRemove)
{
Square sq = pop_lsb(toRemove);
removed.push_back(FeatureSet::make_index<Perspective>(sq, piece, ksq));
}
while (toAdd)
{
Square sq = pop_lsb(toAdd);
added.push_back(FeatureSet::make_index<Perspective>(sq, piece, ksq));
}
}
}
auto& accumulator = pos.state()->*accPtr;
accumulator.computed[Perspective] = true;
#ifdef VECTOR
const bool combineLast3 = std::abs((int) removed.size() - (int) added.size()) == 1
&& removed.size() + added.size() > 2;
vec_t acc[Tiling::NumRegs];
psqt_vec_t psqt[Tiling::NumPsqtRegs];
for (IndexType j = 0; j < HalfDimensions / Tiling::TileHeight; ++j)
{
auto* accTile = reinterpret_cast<vec_t*>(
&accumulator.accumulation[Perspective][j * Tiling::TileHeight]);
auto* entryTile = reinterpret_cast<vec_t*>(&entry.accumulation[j * Tiling::TileHeight]);
for (IndexType k = 0; k < Tiling::NumRegs; ++k)
acc[k] = entryTile[k];
std::size_t i = 0;
for (; i < std::min(removed.size(), added.size()) - combineLast3; ++i)
{
IndexType indexR = removed[i];
const IndexType offsetR = HalfDimensions * indexR + j * Tiling::TileHeight;
auto* columnR = reinterpret_cast<const vec_t*>(&weights[offsetR]);
IndexType indexA = added[i];
const IndexType offsetA = HalfDimensions * indexA + j * Tiling::TileHeight;
auto* columnA = reinterpret_cast<const vec_t*>(&weights[offsetA]);
for (IndexType k = 0; k < Tiling::NumRegs; ++k)
acc[k] = vec_add_16(acc[k], vec_sub_16(columnA[k], columnR[k]));
}
if (combineLast3)
{
IndexType indexR = removed[i];
const IndexType offsetR = HalfDimensions * indexR + j * Tiling::TileHeight;
auto* columnR = reinterpret_cast<const vec_t*>(&weights[offsetR]);
IndexType indexA = added[i];
const IndexType offsetA = HalfDimensions * indexA + j * Tiling::TileHeight;
auto* columnA = reinterpret_cast<const vec_t*>(&weights[offsetA]);
if (removed.size() > added.size())
{
IndexType indexR2 = removed[i + 1];
const IndexType offsetR2 = HalfDimensions * indexR2 + j * Tiling::TileHeight;
auto* columnR2 = reinterpret_cast<const vec_t*>(&weights[offsetR2]);
for (IndexType k = 0; k < Tiling::NumRegs; ++k)
acc[k] = vec_sub_16(vec_add_16(acc[k], columnA[k]),
vec_add_16(columnR[k], columnR2[k]));
}
else
{
IndexType indexA2 = added[i + 1];
const IndexType offsetA2 = HalfDimensions * indexA2 + j * Tiling::TileHeight;
auto* columnA2 = reinterpret_cast<const vec_t*>(&weights[offsetA2]);
for (IndexType k = 0; k < Tiling::NumRegs; ++k)
acc[k] = vec_add_16(vec_sub_16(acc[k], columnR[k]),
vec_add_16(columnA[k], columnA2[k]));
}
}
else
{
for (; i < removed.size(); ++i)
{
IndexType index = removed[i];
const IndexType offset = HalfDimensions * index + j * Tiling::TileHeight;
auto* column = reinterpret_cast<const vec_t*>(&weights[offset]);
for (IndexType k = 0; k < Tiling::NumRegs; ++k)
acc[k] = vec_sub_16(acc[k], column[k]);
}
for (; i < added.size(); ++i)
{
IndexType index = added[i];
const IndexType offset = HalfDimensions * index + j * Tiling::TileHeight;
auto* column = reinterpret_cast<const vec_t*>(&weights[offset]);
for (IndexType k = 0; k < Tiling::NumRegs; ++k)
acc[k] = vec_add_16(acc[k], column[k]);
}
}
for (IndexType k = 0; k < Tiling::NumRegs; k++)
vec_store(&entryTile[k], acc[k]);
for (IndexType k = 0; k < Tiling::NumRegs; k++)
vec_store(&accTile[k], acc[k]);
}
for (IndexType j = 0; j < PSQTBuckets / Tiling::PsqtTileHeight; ++j)
{
auto* accTilePsqt = reinterpret_cast<psqt_vec_t*>(
&accumulator.psqtAccumulation[Perspective][j * Tiling::PsqtTileHeight]);
auto* entryTilePsqt =
reinterpret_cast<psqt_vec_t*>(&entry.psqtAccumulation[j * Tiling::PsqtTileHeight]);
for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k)
psqt[k] = entryTilePsqt[k];
for (std::size_t i = 0; i < removed.size(); ++i)
{
IndexType index = removed[i];
const IndexType offset = PSQTBuckets * index + j * Tiling::PsqtTileHeight;
auto* columnPsqt = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);
for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k)
psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]);
}
for (std::size_t i = 0; i < added.size(); ++i)
{
IndexType index = added[i];
const IndexType offset = PSQTBuckets * index + j * Tiling::PsqtTileHeight;
auto* columnPsqt = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);
for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k)
psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]);
}
for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k)
vec_store_psqt(&entryTilePsqt[k], psqt[k]);
for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k)
vec_store_psqt(&accTilePsqt[k], psqt[k]);
}
#else
for (const auto index : removed)
{
const IndexType offset = HalfDimensions * index;
for (IndexType j = 0; j < HalfDimensions; ++j)
entry.accumulation[j] -= weights[offset + j];
for (std::size_t k = 0; k < PSQTBuckets; ++k)
entry.psqtAccumulation[k] -= psqtWeights[index * PSQTBuckets + k];
}
for (const auto index : added)
{
const IndexType offset = HalfDimensions * index;
for (IndexType j = 0; j < HalfDimensions; ++j)
entry.accumulation[j] += weights[offset + j];
for (std::size_t k = 0; k < PSQTBuckets; ++k)
entry.psqtAccumulation[k] += psqtWeights[index * PSQTBuckets + k];
}
// The accumulator of the refresh entry has been updated.
// Now copy its content to the actual accumulator we were refreshing.
std::memcpy(accumulator.accumulation[Perspective], entry.accumulation,
sizeof(BiasType) * HalfDimensions);
std::memcpy(accumulator.psqtAccumulation[Perspective], entry.psqtAccumulation,
sizeof(int32_t) * PSQTBuckets);
#endif
for (Color c : {WHITE, BLACK})
entry.byColorBB[c] = pos.pieces(c);
for (PieceType pt = PAWN; pt <= KING; ++pt)
entry.byTypeBB[pt] = pos.pieces(pt);
}
template<Color Perspective>
void update_accumulator(const Position& pos,
AccumulatorCaches::Cache<HalfDimensions>* cache) const {
StateInfo* st = pos.state();
if ((st->*accPtr).computed[Perspective])
return; // nothing to do
// Look for a usable already computed accumulator of an earlier position.
// Always try to do an incremental update as most accumulators will be reusable.
do
{
if (FeatureSet::requires_refresh(st, Perspective) || !st->previous
|| st->previous->next != st)
{
// compute accumulator from scratch for this position
update_accumulator_refresh_cache<Perspective>(pos, cache);
if (st != pos.state())
// when computing an accumulator from scratch we can use it to
// efficiently compute the accumulator backwards, until we get to a king
// move. We expect that we will need these accumulators later anyway, so
// computing them now will save some work.
update_accumulator_incremental<Perspective, BACKWARDS>(
pos.square<KING>(Perspective), st, pos.state());
return;
}
st = st->previous;
} while (!(st->*accPtr).computed[Perspective]);
// Start from the oldest computed accumulator, update all the
// accumulators up to the current position.
update_accumulator_incremental<Perspective>(pos.square<KING>(Perspective), pos.state(), st);
}
template<IndexType Size>
friend struct AccumulatorCaches::Cache;
alignas(CacheLineSize) BiasType biases[HalfDimensions];
alignas(CacheLineSize) WeightType weights[HalfDimensions * InputDimensions];
alignas(CacheLineSize) PSQTWeightType psqtWeights[InputDimensions * PSQTBuckets];
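
Inside transform() the data flow is now: ask the stack to bring its newest entry up to
date, then read that entry instead of pos.state(). Condensed from the hunk above (not
the literal code):

    accumulatorStack.evaluate(pos, *this, *cache);        // ensure latest() is computed for both colours
    const auto& accumulatorState = accumulatorStack.latest();

    const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
    const auto& psqtAccumulation = (accumulatorState.*accPtr).psqtAccumulation;
    const auto psqt = (psqtAccumulation[perspectives[0]][bucket]
                     - psqtAccumulation[perspectives[1]][bucket]) / 2;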


@@ -120,9 +120,12 @@ trace(Position& pos, const Eval::NNUE::Networks& networks, Eval::NNUE::Accumulat
format_cp_compact(value, &board[y + 2][x + 2], pos);
};
AccumulatorStack accumulators;
accumulators.reset(pos, networks, caches);
// We estimate the value of each piece by doing a differential evaluation from
// the current base eval, simulating the removal of the piece from its square.
auto [psqt, positional] = networks.big.evaluate(pos, &caches.big);
auto [psqt, positional] = networks.big.evaluate(pos, accumulators, &caches.big);
Value base = psqt + positional;
base = pos.side_to_move() == WHITE ? base : -base;
@@ -135,18 +138,15 @@ trace(Position& pos, const Eval::NNUE::Networks& networks, Eval::NNUE::Accumulat
if (pc != NO_PIECE && type_of(pc) != KING)
{
auto st = pos.state();
pos.remove_piece(sq);
st->accumulatorBig.computed[WHITE] = st->accumulatorBig.computed[BLACK] = false;
std::tie(psqt, positional) = networks.big.evaluate(pos, &caches.big);
accumulators.reset(pos, networks, caches);
std::tie(psqt, positional) = networks.big.evaluate(pos, accumulators, &caches.big);
Value eval = psqt + positional;
eval = pos.side_to_move() == WHITE ? eval : -eval;
v = base - eval;
pos.put_piece(pc, sq);
st->accumulatorBig.computed[WHITE] = st->accumulatorBig.computed[BLACK] = false;
}
writeSquare(f, r, pc, v);
@@ -157,7 +157,8 @@ trace(Position& pos, const Eval::NNUE::Networks& networks, Eval::NNUE::Accumulat
ss << board[row] << '\n';
ss << '\n';
auto t = networks.big.trace_evaluate(pos, &caches.big);
accumulators.reset(pos, networks, caches);
auto t = networks.big.trace_evaluate(pos, accumulators, &caches.big);
ss << " NNUE network contributions "
<< (pos.side_to_move() == WHITE ? "(White to move)" : "(Black to move)") << std::endl


@@ -34,7 +34,6 @@ template<bool Root>
uint64_t perft(Position& pos, Depth depth) {
StateInfo st;
ASSERT_ALIGNED(&st, Eval::NNUE::CacheLineSize);
uint64_t cnt, nodes = 0;
const bool leaf = (depth == 2);


@@ -34,7 +34,6 @@
#include "bitboard.h"
#include "misc.h"
#include "movegen.h"
#include "nnue/nnue_common.h"
#include "syzygy/tbprobe.h"
#include "tt.h"
#include "uci.h"
@@ -83,7 +82,6 @@ std::ostream& operator<<(std::ostream& os, const Position& pos) {
if (int(Tablebases::MaxCardinality) >= popcount(pos.pieces()) && !pos.can_castle(ANY_CASTLING))
{
StateInfo st;
ASSERT_ALIGNED(&st, Eval::NNUE::CacheLineSize);
Position p;
p.set(pos.fen(), pos.is_chess960(), &st);
@@ -685,10 +683,10 @@ bool Position::gives_check(Move m) const {
// moves should be filtered out before this function is called.
// If a pointer to the TT table is passed, the entry for the new position
// will be prefetched
void Position::do_move(Move m,
StateInfo& newSt,
bool givesCheck,
const TranspositionTable* tt = nullptr) {
DirtyPiece Position::do_move(Move m,
StateInfo& newSt,
bool givesCheck,
const TranspositionTable* tt = nullptr) {
assert(m.is_ok());
assert(&newSt != st);
@@ -709,11 +707,7 @@ void Position::do_move(Move m,
++st->rule50;
++st->pliesFromNull;
// Used by NNUE
st->accumulatorBig.computed[WHITE] = st->accumulatorBig.computed[BLACK] =
st->accumulatorSmall.computed[WHITE] = st->accumulatorSmall.computed[BLACK] = false;
auto& dp = st->dirtyPiece;
DirtyPiece dp;
dp.dirty_num = 1;
Color us = sideToMove;
@@ -733,7 +727,7 @@ void Position::do_move(Move m,
assert(captured == make_piece(us, ROOK));
Square rfrom, rto;
do_castling<true>(us, from, to, rfrom, rto);
do_castling<true>(us, from, to, rfrom, rto, &dp);
k ^= Zobrist::psq[captured][rfrom] ^ Zobrist::psq[captured][rto];
st->nonPawnKey[us] ^= Zobrist::psq[captured][rfrom] ^ Zobrist::psq[captured][rto];
@@ -906,6 +900,8 @@ void Position::do_move(Move m,
}
assert(pos_is_ok());
return dp;
}
@@ -975,23 +971,25 @@ void Position::undo_move(Move m) {
// Helper used to do/undo a castling move. This is a bit
// tricky in Chess960 where from/to squares can overlap.
template<bool Do>
void Position::do_castling(Color us, Square from, Square& to, Square& rfrom, Square& rto) {
void Position::do_castling(
Color us, Square from, Square& to, Square& rfrom, Square& rto, DirtyPiece* const dp) {
bool kingSide = to > from;
rfrom = to; // Castling is encoded as "king captures friendly rook"
rto = relative_square(us, kingSide ? SQ_F1 : SQ_D1);
to = relative_square(us, kingSide ? SQ_G1 : SQ_C1);
assert(!Do || dp);
if (Do)
{
auto& dp = st->dirtyPiece;
dp.piece[0] = make_piece(us, KING);
dp.from[0] = from;
dp.to[0] = to;
dp.piece[1] = make_piece(us, ROOK);
dp.from[1] = rfrom;
dp.to[1] = rto;
dp.dirty_num = 2;
dp->piece[0] = make_piece(us, KING);
dp->from[0] = from;
dp->to[0] = to;
dp->piece[1] = make_piece(us, ROOK);
dp->from[1] = rfrom;
dp->to[1] = rto;
dp->dirty_num = 2;
}
// Remove both pieces first since squares could overlap in Chess960
@@ -1011,7 +1009,7 @@ void Position::do_null_move(StateInfo& newSt, const TranspositionTable& tt) {
assert(!checkers());
assert(&newSt != st);
std::memcpy(&newSt, st, offsetof(StateInfo, accumulatorBig));
std::memcpy(&newSt, st, sizeof(StateInfo));
newSt.previous = st;
st->next = &newSt;
@@ -1026,11 +1024,6 @@ void Position::do_null_move(StateInfo& newSt, const TranspositionTable& tt) {
st->key ^= Zobrist::side;
prefetch(tt.first_entry(key()));
st->dirtyPiece.dirty_num = 0;
st->dirtyPiece.piece[0] = NO_PIECE; // Avoid checks in UpdateAccumulator()
st->accumulatorBig.computed[WHITE] = st->accumulatorBig.computed[BLACK] =
st->accumulatorSmall.computed[WHITE] = st->accumulatorSmall.computed[BLACK] = false;
st->pliesFromNull = 0;
sideToMove = ~sideToMove;
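
The net effect on Position is that do_move() now builds the DirtyPiece in a local and
returns it by value, with do_castling<true>() writing both movers into the caller's
struct. Illustration for a white king-side castle (squares follow from the do_castling
hunk above; the tt and accumulatorStack objects stand in for the caller's own):

    StateInfo st;
    DirtyPiece dp = pos.do_move(m, st, pos.gives_check(m), &tt);
    // dp.dirty_num == 2
    // dp.piece[0] == W_KING, dp.from[0] == SQ_E1, dp.to[0] == SQ_G1
    // dp.piece[1] == W_ROOK, dp.from[1] == SQ_H1, dp.to[1] == SQ_F1
    accumulatorStack.push(dp);   // the stack, not StateInfo, carries the change to NNUE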


@@ -26,8 +26,6 @@
#include <string>
#include "bitboard.h"
#include "nnue/nnue_accumulator.h"
#include "nnue/nnue_architecture.h"
#include "types.h"
namespace Stockfish {
@@ -61,11 +59,6 @@ struct StateInfo {
Bitboard checkSquares[PIECE_TYPE_NB];
Piece capturedPiece;
int repetition;
// Used by NNUE
DirtyPiece dirtyPiece;
Eval::NNUE::Accumulator<Eval::NNUE::TransformedFeatureDimensionsBig> accumulatorBig;
Eval::NNUE::Accumulator<Eval::NNUE::TransformedFeatureDimensionsSmall> accumulatorSmall;
};
@@ -140,11 +133,11 @@ class Position {
Piece captured_piece() const;
// Doing and undoing moves
void do_move(Move m, StateInfo& newSt, const TranspositionTable* tt);
void do_move(Move m, StateInfo& newSt, bool givesCheck, const TranspositionTable* tt);
void undo_move(Move m);
void do_null_move(StateInfo& newSt, const TranspositionTable& tt);
void undo_null_move();
void do_move(Move m, StateInfo& newSt, const TranspositionTable* tt);
DirtyPiece do_move(Move m, StateInfo& newSt, bool givesCheck, const TranspositionTable* tt);
void undo_move(Move m);
void do_null_move(StateInfo& newSt, const TranspositionTable& tt);
void undo_null_move();
// Static Exchange Evaluation
bool see_ge(Move m, int threshold = 0) const;
@@ -187,7 +180,12 @@ class Position {
// Other helpers
void move_piece(Square from, Square to);
template<bool Do>
void do_castling(Color us, Square from, Square& to, Square& rfrom, Square& rto);
void do_castling(Color us,
Square from,
Square& to,
Square& rfrom,
Square& rto,
DirtyPiece* const dp = nullptr);
template<bool AfterMove>
Key adjust_key50(Key k) const;


@@ -41,7 +41,6 @@
#include "movepick.h"
#include "nnue/network.h"
#include "nnue/nnue_accumulator.h"
#include "nnue/nnue_common.h"
#include "position.h"
#include "syzygy/tbprobe.h"
#include "thread.h"
@@ -197,6 +196,8 @@ void Search::Worker::ensure_network_replicated() {
void Search::Worker::start_searching() {
accumulatorStack.reset(rootPos, networks[numaAccessToken], refreshTable);
// Non-main threads go directly to iterative_deepening()
if (!is_mainthread())
{
@@ -552,6 +553,26 @@ void Search::Worker::iterative_deepening() {
skill.best ? skill.best : skill.pick_best(rootMoves, multiPV)));
}
void Search::Worker::do_move(Position& pos, const Move move, StateInfo& st) {
do_move(pos, move, st, pos.gives_check(move));
}
void Search::Worker::do_move(Position& pos, const Move move, StateInfo& st, const bool givesCheck) {
DirtyPiece dp = pos.do_move(move, st, givesCheck, &tt);
accumulatorStack.push(dp);
}
void Search::Worker::do_null_move(Position& pos, StateInfo& st) { pos.do_null_move(st, tt); }
void Search::Worker::undo_move(Position& pos, const Move move) {
pos.undo_move(move);
accumulatorStack.pop();
}
void Search::Worker::undo_null_move(Position& pos) { pos.undo_null_move(); }
// Reset histories, usually before a new game
void Search::Worker::clear() {
mainHistory.fill(66);
@@ -614,7 +635,6 @@ Value Search::Worker::search(
Move pv[MAX_PLY + 1];
StateInfo st;
ASSERT_ALIGNED(&st, Eval::NNUE::CacheLineSize);
Key posKey;
Move move, excludedMove, bestMove;
@@ -859,11 +879,11 @@ Value Search::Worker::search(
ss->continuationHistory = &thisThread->continuationHistory[0][0][NO_PIECE][0];
ss->continuationCorrectionHistory = &thisThread->continuationCorrectionHistory[NO_PIECE][0];
pos.do_null_move(st, tt);
do_null_move(pos, st);
Value nullValue = -search<NonPV>(pos, ss + 1, -beta, -beta + 1, depth - R, false);
pos.undo_null_move();
undo_null_move(pos);
// Do not return unproven mate or TB scores
if (nullValue >= beta && !is_win(nullValue))
@@ -925,7 +945,7 @@ Value Search::Worker::search(
movedPiece = pos.moved_piece(move);
pos.do_move(move, st, &tt);
do_move(pos, move, st);
thisThread->nodes.fetch_add(1, std::memory_order_relaxed);
ss->currentMove = move;
@@ -943,7 +963,7 @@ Value Search::Worker::search(
value = -search<NonPV>(pos, ss + 1, -probCutBeta, -probCutBeta + 1, probCutDepth,
!cutNode);
pos.undo_move(move);
undo_move(pos, move);
if (value >= probCutBeta)
{
@@ -1165,7 +1185,7 @@ moves_loop: // When in check, search starts here
}
// Step 16. Make the move
pos.do_move(move, st, givesCheck, &tt);
do_move(pos, move, st, givesCheck);
thisThread->nodes.fetch_add(1, std::memory_order_relaxed);
// Add extension to new depth
@@ -1290,7 +1310,7 @@ moves_loop: // When in check, search starts here
}
// Step 19. Undo move
pos.undo_move(move);
undo_move(pos, move);
assert(value > -VALUE_INFINITE && value < VALUE_INFINITE);
@@ -1510,7 +1530,6 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta)
Move pv[MAX_PLY + 1];
StateInfo st;
ASSERT_ALIGNED(&st, Eval::NNUE::CacheLineSize);
Key posKey;
Move move, bestMove;
@@ -1674,7 +1693,7 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta)
// Step 7. Make and search the move
Piece movedPiece = pos.moved_piece(move);
pos.do_move(move, st, givesCheck, &tt);
do_move(pos, move, st, givesCheck);
thisThread->nodes.fetch_add(1, std::memory_order_relaxed);
// Update the current move
@@ -1685,7 +1704,7 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta)
&thisThread->continuationCorrectionHistory[movedPiece][move.to_sq()];
value = -qsearch<nodeType>(pos, ss + 1, -beta, -alpha);
pos.undo_move(move);
undo_move(pos, move);
assert(value > -VALUE_INFINITE && value < VALUE_INFINITE);
@@ -1752,7 +1771,7 @@ TimePoint Search::Worker::elapsed() const {
TimePoint Search::Worker::elapsed_time() const { return main_manager()->tm.elapsed_time(); }
Value Search::Worker::evaluate(const Position& pos) {
return Eval::evaluate(networks[numaAccessToken], pos, refreshTable,
return Eval::evaluate(networks[numaAccessToken], pos, accumulatorStack, refreshTable,
optimism[pos.side_to_move()]);
}
@@ -2178,7 +2197,6 @@ void SearchManager::pv(Search::Worker& worker,
bool RootMove::extract_ponder_from_tt(const TranspositionTable& tt, Position& pos) {
StateInfo st;
ASSERT_ALIGNED(&st, Eval::NNUE::CacheLineSize);
assert(pv.size() == 1);
if (pv[0] == Move::none())


@@ -295,6 +295,12 @@ class Worker {
private:
void iterative_deepening();
void do_move(Position& pos, const Move move, StateInfo& st);
void do_move(Position& pos, const Move move, StateInfo& st, const bool givesCheck);
void do_null_move(Position& pos, StateInfo& st);
void undo_move(Position& pos, const Move move);
void undo_null_move(Position& pos);
// This is the main search function, for both PV and non-PV nodes
template<NodeType nodeType>
Value search(Position& pos, Stack* ss, Value alpha, Value beta, Depth depth, bool cutNode);
@@ -347,6 +353,7 @@ class Worker {
const LazyNumaReplicated<Eval::NNUE::Networks>& networks;
// Used by NNUE
Eval::NNUE::AccumulatorStack accumulatorStack;
Eval::NNUE::AccumulatorCaches refreshTable;
friend class Stockfish::ThreadPool;