diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h index 78729064..fa0859ed 100644 --- a/src/nnue/trainer/trainer_feature_transformer.h +++ b/src/nnue/trainer/trainer_feature_transformer.h @@ -138,6 +138,7 @@ namespace Eval::NNUE { for (IndexType b = offset; b < offset + count; ++b) { const IndexType batch_offset = kOutputDimensions * b; + for (IndexType c = 0; c < 2; ++c) { const IndexType output_offset = batch_offset + kHalfDimensions * c; @@ -459,10 +460,16 @@ namespace Eval::NNUE { for (IndexType b = 0; b < batch_->size(); ++b) { const IndexType batch_offset = kOutputDimensions * b; + for (IndexType c = 0; c < 2; ++c) { const IndexType output_offset = batch_offset + kHalfDimensions * c; for (const auto& feature : (*batch_)[b].training_features[c]) { const IndexType feature_index = feature.get_index(); + const IndexType weights_offset = + kHalfDimensions * feature_index; +#if defined (USE_SSE2) + _mm_prefetch(reinterpret_cast<const char*>(&weights_[weights_offset]), _MM_HINT_T2); +#endif // We assign each bucket a continuous range of bits at least // of cache line size to prevent false sharing. @@ -479,9 +486,6 @@ namespace Eval::NNUE { // (even a different cache line) observed_features.set(feature_index); - const IndexType weights_offset = - kHalfDimensions * feature_index; - const auto scale = static_cast<LearnFloatType>( effective_learning_rate / feature.get_count());