diff --git a/src/Makefile b/src/Makefile
index 59fb90c5..e871f267 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -368,8 +368,8 @@ endif
 endif

 ifeq ($(KERNEL),Darwin)
-	CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.15
-	LDFLAGS += -arch $(arch) -mmacosx-version-min=10.15
+	CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.14
+	LDFLAGS += -arch $(arch) -mmacosx-version-min=10.14
 endif

 ### Travis CI script uses COMPILER to overwrite CXX
diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 61824b3a..9b3b58c3 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -51,9 +51,13 @@ namespace Eval {
     std::string eval_file = std::string(Options["EvalFile"]);
     if (useNNUE && eval_file_loaded != eval_file)
     {
-        std::cerr << "Use of NNUE evaluation, but the file " << eval_file << " was not loaded successfully. "
-                  << "These network evaluation parameters must be available, compatible with this version of the code. "
-                  << "The UCI option EvalFile might need to specify the full path, including the directory/folder name, to the file." << std::endl;
+        UCI::OptionsMap defaults;
+        UCI::init(defaults);
+
+        std::cerr << "NNUE evaluation used, but the network file " << eval_file << " was not loaded successfully. "
+                  << "These network evaluation parameters must be available, and compatible with this version of the code. "
+                  << "The UCI option EvalFile might need to specify the full path, including the directory/folder name, to the file. "
+                  << "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/"+std::string(defaults["EvalFile"]) << std::endl;
         std::exit(EXIT_FAILURE);
     }

@@ -111,7 +115,7 @@ namespace {
   constexpr Value LazyThreshold1 = Value(1400);
   constexpr Value LazyThreshold2 = Value(1300);
   constexpr Value SpaceThreshold = Value(12222);
-  constexpr Value NNUEThreshold  = Value(520);
+  constexpr Value NNUEThreshold  = Value(460);

   // KingAttackWeights[PieceType] contains king attack weights by piece type
   constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 81, 52, 44, 10 };
@@ -942,7 +946,7 @@ Value Eval::evaluate(const Position& pos) {
   {
      Value v = eg_value(pos.psq_score());
      // Take NNUE eval only on balanced positions
-     if (abs(v) < NNUEThreshold)
+     if (abs(v) < NNUEThreshold + 20 * pos.count<PAWN>())
         return NNUE::evaluate(pos) + Tempo;
   }
   return Evaluation(pos).value();
diff --git a/src/misc.cpp b/src/misc.cpp
index 0cb98e17..725450c2 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -321,9 +321,9 @@ void prefetch(void* addr) {
 ///

 void* std_aligned_alloc(size_t alignment, size_t size) {
-#if defined(__APPLE__)
+#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32))
   return aligned_alloc(alignment, size);
-#elif defined(_WIN32)
+#elif (defined(_WIN32) || (defined(__APPLE__) && !defined(_LIBCPP_HAS_C11_FEATURES)))
   return _mm_malloc(size, alignment);
 #else
   return std::aligned_alloc(alignment, size);
@@ -331,9 +331,9 @@ void* std_aligned_alloc(size_t alignment, size_t size) {
 }

 void std_aligned_free(void* ptr) {
-#if defined(__APPLE__)
+#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32))
   free(ptr);
-#elif defined(_WIN32)
+#elif (defined(_WIN32) || (defined(__APPLE__) && !defined(_LIBCPP_HAS_C11_FEATURES)))
   _mm_free(ptr);
 #else
   free(ptr);
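Note on the Makefile and misc.cpp hunks above: lowering the deployment target to macOS 10.14 means libc++ may not expose C11 aligned_alloc() (guarded by _LIBCPP_HAS_C11_FEATURES), so allocation falls back to _mm_malloc(). The invariant the patch preserves is that each aligned allocator is paired with its own deallocator. A minimal sketch of that pairing, with hypothetical names (my_aligned_alloc/my_aligned_free are illustrative, not part of the patch):

    #include <cstdlib>
    #include <xmmintrin.h>

    // Sketch: one aligned allocator per platform, always released
    // through the matching deallocator.
    void* my_aligned_alloc(std::size_t alignment, std::size_t size) {
    #if defined(_WIN32)
        return _mm_malloc(size, alignment);           // pair with _mm_free()
    #else
        // Note: std::aligned_alloc requires size to be a multiple of alignment.
        return std::aligned_alloc(alignment, size);   // pair with std::free()
    #endif
    }

    void my_aligned_free(void* ptr) {
    #if defined(_WIN32)
        _mm_free(ptr);
    #else
        std::free(ptr);
    #endif
    }

Mixing families (e.g. releasing _mm_malloc() memory with free()) is undefined behaviour, which is why std_aligned_free() mirrors the #if cascade of std_aligned_alloc() exactly.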
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index 057de8e1..7336be52 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -123,13 +123,8 @@ namespace Eval::NNUE::Layers {
         __m512i sum = _mm512_setzero_si512();
         const auto row = reinterpret_cast<const __m512i*>(&weights_[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
-
-            #if defined(__MINGW32__) || defined(__MINGW64__)
-            __m512i product = _mm512_maddubs_epi16(_mm512_loadu_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
-            #else
-            __m512i product = _mm512_maddubs_epi16(_mm512_load_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
-            #endif
-
+            __m512i product = _mm512_maddubs_epi16(
+                _mm512_load_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
             product = _mm512_madd_epi16(product, kOnes);
             sum = _mm512_add_epi32(sum, product);
         }
@@ -144,12 +139,8 @@ namespace Eval::NNUE::Layers {

         const auto row_256 = reinterpret_cast<const __m256i*>(&weights_[offset]);
         int j = kNumChunks * 2;
-        #if defined(__MINGW32__) || defined(__MINGW64__) // See HACK comment below in AVX2.
-        __m256i sum256 = _mm256_maddubs_epi16(_mm256_loadu_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
-        #else
-        __m256i sum256 = _mm256_maddubs_epi16(_mm256_load_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
-        #endif
-
+        __m256i sum256 = _mm256_maddubs_epi16(
+            _mm256_load_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
         sum256 = _mm256_madd_epi16(sum256, _mm256_set1_epi16(1));
         sum256 = _mm256_hadd_epi32(sum256, sum256);
         sum256 = _mm256_hadd_epi32(sum256, sum256);
@@ -163,17 +154,7 @@ namespace Eval::NNUE::Layers {
         const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
           __m256i product = _mm256_maddubs_epi16(
-
-              #if defined(__MINGW32__) || defined(__MINGW64__)
-              // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary
-              // compiled with g++ in MSYS2 crashes here because the output memory is not aligned
-              // even though alignas is specified.
-              _mm256_loadu_si256
-              #else
-              _mm256_load_si256
-              #endif
-
-              (&input_vector[j]), _mm256_load_si256(&row[j]));
+              _mm256_load_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
           product = _mm256_madd_epi16(product, kOnes);
           sum = _mm256_add_epi32(sum, product);
         }
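The branches removed above existed because _mm256_load_si256() requires its operand to be 32-byte aligned (it typically faults otherwise), and g++ under MSYS2 did not reliably honour alignas on these buffers. Once the buffers come from std_aligned_alloc(), the aligned form is safe on every platform. A self-contained illustration of the aligned/unaligned distinction (not code from the patch), assuming AVX2:

    #include <immintrin.h>
    #include <cstdint>

    alignas(32) static std::int8_t buf[32];  // compiler-guaranteed 32-byte alignment

    __m256i load_demo() {
        // Aligned load: valid only because buf is 32-byte aligned.
        __m256i a = _mm256_load_si256(reinterpret_cast<const __m256i*>(buf));
        // Unaligned load: tolerates any address, historically a bit slower.
        __m256i b = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(buf));
        return _mm256_add_epi8(a, b);
    }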
diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h
index 822e60b0..9b5a5f5f 100644
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -86,50 +86,13 @@ namespace Eval::NNUE::Layers {
       const auto out = reinterpret_cast<__m256i*>(output);
       for (IndexType i = 0; i < kNumChunks; ++i) {
         const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
-
-            #if defined(__MINGW32__) || defined(__MINGW64__)
-            // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary
-            // compiled with g++ in MSYS2 crashes here because the output memory is not aligned
-            // even though alignas is specified.
-            _mm256_loadu_si256
-            #else
-            _mm256_load_si256
-            #endif
-
-            (&in[i * 4 + 0]),
-
-            #if defined(__MINGW32__) || defined(__MINGW64__)
-            _mm256_loadu_si256
-            #else
-            _mm256_load_si256
-            #endif
-
-            (&in[i * 4 + 1])), kWeightScaleBits);
+            _mm256_load_si256(&in[i * 4 + 0]),
+            _mm256_load_si256(&in[i * 4 + 1])), kWeightScaleBits);
         const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
-
-            #if defined(__MINGW32__) || defined(__MINGW64__)
-            _mm256_loadu_si256
-            #else
-            _mm256_load_si256
-            #endif
-
-            (&in[i * 4 + 2]),
-
-            #if defined(__MINGW32__) || defined(__MINGW64__)
-            _mm256_loadu_si256
-            #else
-            _mm256_load_si256
-            #endif
-
-            (&in[i * 4 + 3])), kWeightScaleBits);
-
-        #if defined(__MINGW32__) || defined(__MINGW64__)
-        _mm256_storeu_si256
-        #else
-        _mm256_store_si256
-        #endif
-
-        (&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
+            _mm256_load_si256(&in[i * 4 + 2]),
+            _mm256_load_si256(&in[i * 4 + 3])), kWeightScaleBits);
+        _mm256_store_si256(
+            &out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
             _mm256_packs_epi16(words0, words1), kZero), kOffsets));
       }
       constexpr IndexType kStart = kNumChunks * kSimdWidth;
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index a9fcf434..29e6db6e 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -126,36 +126,12 @@ namespace Eval::NNUE {
       auto out = reinterpret_cast<__m256i*>(&output[offset]);
       for (IndexType j = 0; j < kNumChunks; ++j) {
         __m256i sum0 =
-
-        #if defined(__MINGW32__) || defined(__MINGW64__)
-            // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary
-            // compiled with g++ in MSYS2 crashes here because the output memory is not aligned
-            // even though alignas is specified.
-            _mm256_loadu_si256
-        #else
-            _mm256_load_si256
-        #endif
-
-        (&reinterpret_cast<const __m256i*>(
+            _mm256_load_si256(&reinterpret_cast<const __m256i*>(
             accumulation[perspectives[p]][0])[j * 2 + 0]);
         __m256i sum1 =
-
-        #if defined(__MINGW32__) || defined(__MINGW64__)
-            _mm256_loadu_si256
-        #else
-            _mm256_load_si256
-        #endif
-
-        (&reinterpret_cast<const __m256i*>(
+            _mm256_load_si256(&reinterpret_cast<const __m256i*>(
             accumulation[perspectives[p]][0])[j * 2 + 1]);
-
-        #if defined(__MINGW32__) || defined(__MINGW64__)
-        _mm256_storeu_si256
-        #else
-        _mm256_store_si256
-        #endif
-
-        (&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
+        _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
             _mm256_packs_epi16(sum0, sum1), kZero), kControl));
       }
@@ -218,11 +194,7 @@ namespace Eval::NNUE {
       auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
       constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
       for (IndexType j = 0; j < kNumChunks; ++j) {
-        #if defined(__MINGW32__) || defined(__MINGW64__)
-        _mm256_storeu_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadu_si256(&accumulation[j]), column[j]));
-        #else
         accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]);
-        #endif
       }

 #elif defined(USE_SSE2)
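For readers following the SIMD in these hunks: _mm256_maddubs_epi16() multiplies unsigned 8-bit activations by signed 8-bit weights and adds adjacent products into 16-bit lanes, and a follow-up _mm256_madd_epi16() against a vector of ones widens those into 32-bit partial sums. A standalone sketch of one such chunk (illustrative, not the Stockfish code), assuming AVX2:

    #include <immintrin.h>
    #include <cstdint>

    // One AVX2 chunk of an int8 dot product: 32 activations times
    // 32 weights, reduced to eight 32-bit partial sums.
    __m256i dot_chunk(const std::uint8_t* input, const std::int8_t* weights) {
        const __m256i ones = _mm256_set1_epi16(1);
        __m256i in = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(input));
        __m256i w  = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(weights));
        __m256i p  = _mm256_maddubs_epi16(in, w);  // u8*s8 -> paired s16 sums
        return _mm256_madd_epi16(p, ones);         // s16 pairs -> s32 sums
    }

The pack-and-permute sequence in clipped_relu.h is the inverse step: it narrows 32-bit sums back to clamped int8 outputs for the next layer.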
diff --git a/src/thread.cpp b/src/thread.cpp
index 44aea14e..1aa66a81 100644
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -204,21 +204,18 @@ void ThreadPool::start_thinking(Position& pos, StateListPtr& states,

   // We use Position::set() to set root position across threads. But there are
   // some StateInfo fields (previous, pliesFromNull, capturedPiece) that cannot
-  // be deduced from a fen string, so set() clears them and to not lose the info
-  // we need to backup and later restore setupStates->back(). Note that setupStates
-  // is shared by threads but is accessed in read-only mode.
-  StateInfo tmp = setupStates->back();
-
+  // be deduced from a fen string, so set() clears them and they are set from
+  // setupStates->back() later. The rootState is per thread, earlier states are shared
+  // since they are read-only.
   for (Thread* th : *this)
   {
       th->nodes = th->tbHits = th->nmpMinPly = th->bestMoveChanges = 0;
       th->rootDepth = th->completedDepth = 0;
       th->rootMoves = rootMoves;
-      th->rootPos.set(pos.fen(), pos.is_chess960(), &setupStates->back(), th);
+      th->rootPos.set(pos.fen(), pos.is_chess960(), &th->rootState, th);
+      th->rootState = setupStates->back();
   }

-  setupStates->back() = tmp;
-
   main()->start_searching();
 }
diff --git a/src/thread.h b/src/thread.h
index 46da1e34..042bc2e9 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -65,6 +65,7 @@ public:
   std::atomic<uint64_t> nodes, tbHits, bestMoveChanges;

   Position rootPos;
+  StateInfo rootState;
   Search::RootMoves rootMoves;
   Depth rootDepth, completedDepth;
   CounterMoveHistory counterMoves;
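The thread.cpp/thread.h change replaces a backup-and-restore of the shared setupStates->back() with a per-thread rootState copy, so each thread's root StateInfo lives in its own Thread object and no thread ever writes to shared state. A minimal sketch of the same pattern with generic names (State and run_workers are hypothetical, not the Stockfish API):

    #include <thread>
    #include <vector>

    struct State { int pliesFromNull = 0; };

    // Each worker receives a private copy of the template state and may
    // mutate it freely; the shared original is only ever read.
    void run_workers(const State& shared, int n) {
        std::vector<std::thread> pool;
        for (int i = 0; i < n; ++i)
            pool.emplace_back([copy = shared]() mutable {
                ++copy.pliesFromNull;   // private mutation, no data race
            });
        for (auto& t : pool)
            t.join();
    }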