Merge branch 'master' into trainer

2025-12-25 03:26:24 +08:00 · 2020-09-09 17:04:57 +08:00
parent 675d336ebb 4206a1edd0
commit 84ba591118
17 changed files with 553 additions and 836 deletions
--- a/src/Makefile
+++ b/src/Makefile
@@ -915,7 +915,7 @@ icc-profile-use:

 learn: config-sanity
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
-	EXTRACXXFLAGS=' -DEVAL_LEARN -DNNUE_EMBEDDING_OFF -DENABLE_TEST_CMD -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	EXTRACXXFLAGS=' -DEVAL_LEARN -DNNUE_EMBEDDING_OFF -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
 	EXTRALDFLAGS=' $(BLASLDFLAGS) -fopenmp  ' \
 	all

@@ -923,7 +923,7 @@ profile-learn: config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 1/4. Building instrumented executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) \
-	LEARNCXXFLAGS=' -DEVAL_LEARN -DNNUE_EMBEDDING_OFF -DENABLE_TEST_CMD -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	LEARNCXXFLAGS=' -DEVAL_LEARN -DNNUE_EMBEDDING_OFF -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
 	LEARNLDFLAGS='  $(BLASLDFLAGS) -fopenmp '
 	@echo ""
 	@echo "Step 2/4. Running benchmark for pgo-build ..."
@@ -932,7 +932,7 @@ profile-learn: config-sanity objclean profileclean
 	@echo "Step 3/4. Building optimized executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) objclean
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_use) \
-	LEARNCXXFLAGS=' -DEVAL_LEARN -DNNUE_EMBEDDING_OFF -DENABLE_TEST_CMD -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	LEARNCXXFLAGS=' -DEVAL_LEARN -DNNUE_EMBEDDING_OFF -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
 	LEARNLDFLAGS=' $(BLASLDFLAGS) -fopenmp '
 	@echo ""
 	@echo "Step 4/4. Deleting profile data ..."
--- a/src/eval/evaluate_common.h
+++ b/src/eval/evaluate_common.h
@@ -1,62 +1,25 @@
 #ifndef _EVALUATE_COMMON_H_
 #define _EVALUATE_COMMON_H_

-#include <functional>
+#if defined(EVAL_LEARN)

-#include "../position.h"
+// Common declarations shared by the evaluation functions when learning is enabled.
+#include <string>

 namespace Eval
 {
-
-#if defined(USE_EVAL_HASH)
-	// prefetch function
-	void prefetch_evalhash(const Key key);
-#endif
-
-	// An operator that applies the function f to each parameter of the evaluation function.
-	// Used for parameter analysis etc.
-	// type indicates the survey target.
-	// type = -1 :KK,KKP,KPP all
-	// type = 0: KK only
-	// type = 1: KKP only
-	// type = 2: KPP only
-	void foreach_eval_param(std::function<void(int32_t, int32_t)>f, int type = -1);
-
 	// --------------------------
 	// for learning
 	// --------------------------

-#if defined(EVAL_LEARN)
-	// Initialize the gradient array during learning
-	// Pass the learning rate as an argument. If 0.0, the default value is used.
-	// The epoch of update_weights() gradually changes from eta to eta2 until eta_epoch.
-	// After eta2_epoch, gradually change from eta2 to eta3.
-	void init_grad(double eta1, uint64_t eta_epoch, double eta2, uint64_t eta2_epoch, double eta3);
-
-	// Add the gradient difference value to the gradient array for all features that appear in the current phase.
-	void add_grad(Position& pos, Color rootColor, double delt_grad);
-
-	// Do SGD or AdaGrad or something based on the current gradient.
-	// epoch: Generation counter (starting from 0)
-	void update_weights(uint64_t epoch);
-
 	// Save the evaluation function parameters to a file.
 	// You can specify the extension added to the end of the file.
 	void save_eval(std::string suffix);

 	// Get the current eta.
 	double get_eta();
-
-	// --learning related commands
-
-	// A function that normalizes KK. Note that it is not completely equivalent to the original evaluation function.
-	// By making the values of kkp and kpp as close to zero as possible, the value of the feature factor (which is zero) that did not appear during learning
-	// The idea of ensuring it is valid.
-	void regularize_kk();
-
-#endif
-
-
 }

-#endif // _EVALUATE_KPPT_COMMON_H_
+#endif // defined(EVAL_LEARN)
+
+#endif // _EVALUATE_COMMON_H_
--- a/src/extra/sfen_packer.cpp
+++ b/src/extra/sfen_packer.cpp
@@ -218,7 +218,7 @@ struct SfenPacker
    PieceType pr = type_of(pc);
    auto c = huffman_table[pr];
    stream.write_n_bit(c.code, c.bits);
- 
+
    if (pc == NO_PIECE)
      return;

@@ -249,7 +249,7 @@ struct SfenPacker

    // first and second flag
    Color c = (Color)stream.read_one_bit();
-    
+
    return make_piece(c, pr);
  }
 };
@@ -266,7 +266,10 @@ int Position::set_from_packed_sfen(const PackedSfen& sfen , StateInfo * si, Thre
 {
 	SfenPacker packer;
 	auto& stream = packer.stream;
-	stream.set_data((uint8_t*)&sfen);
+
+	// TODO: separate streams for writing and reading. Here we actually have to
+	// const_cast, which is not safe in the long run.
+	stream.set_data(const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(&sfen)));

 	std::memset(this, 0, sizeof(Position));
 	std::memset(si, 0, sizeof(StateInfo));
--- a/src/learn/convert.cpp
+++ b/src/learn/convert.cpp
@@ -25,19 +25,7 @@
 #include <chrono>
 #include <random>
 #include <regex>
-
-#if defined (_OPENMP)
-#include <omp.h>
-#endif
-
-#if defined(_MSC_VER)
-// The C++ filesystem cannot be used unless it is C++17 or later or MSVC.
-// I tried to use windows.h, but with g++ of msys2 I can not get the files in the folder well.
-// Use dirent.h because there is no help for it.
 #include <filesystem>
-#elif defined(__GNUC__)
-#include <dirent.h>
-#endif

 using namespace std;

--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -1,46 +1,32 @@
 #if defined(EVAL_LEARN)

 #include "../eval/evaluate_common.h"
-
-#include "learn.h"
-#include "multi_think.h"
 #include "../misc.h"
-#include "../thread.h"
+#include "../nnue/evaluate_nnue_learner.h"
 #include "../position.h"
+#include "../syzygy/tbprobe.h"
+#include "../thread.h"
 #include "../tt.h"
 #include "../uci.h"
-#include "../syzygy/tbprobe.h"
+#include "learn.h"
+#include "multi_think.h"

 #include <chrono>
-#include <random>
-#include <regex>
-#include <sstream>
-#include <fstream>
-#include <unordered_set>
-#include <iomanip>
-#include <list>
+#include <climits>
 #include <cmath>
 #include <cstring>
-#include <memory>
-#include <limits>
-#include <optional>
-
-#if defined (_OPENMP)
-#include <omp.h>
-#endif
-
-#if defined(_MSC_VER)
-// std::filesystem doesn't work on GCC even though it claims to support C++17.
 #include <filesystem>
-#elif defined(__GNUC__)
-#include <dirent.h>
-#endif
-
-#if defined(EVAL_LEARN)
-#include "../nnue/evaluate_nnue_learner.h"
-#include <climits>
+#include <fstream>
+#include <iomanip>
+#include <limits>
+#include <list>
+#include <memory>
+#include <optional>
+#include <random>
+#include <regex>
 #include <shared_mutex>
-#endif
+#include <sstream>
+#include <unordered_set>

 using namespace std;

@@ -80,7 +66,7 @@ namespace Learner
            file_worker_thread.join();
            output_file_stream.close();

-#if !defined(DNDEBUG)
+#if defined(_DEBUG)
            {
                // All buffers should be empty since file_worker_thread
                // should have written everything before exiting.
@@ -176,7 +162,7 @@ namespace Learner
                        output_file_stream.write(reinterpret_cast<const char*>(buf->data()), sizeof(PackedSfenValue) * buf->size());

                        sfen_write_count += buf->size();
-#if 1
+
                        // Add the processed number here, and if it exceeds save_every,
                        // change the file name and reset this counter.
                        sfen_write_count_current_file += buf->size();
@@ -196,7 +182,7 @@ namespace Learner
                            output_file_stream.open(new_filename, ios::out | ios::binary | ios::app);
                            cout << endl << "output sfen file = " << new_filename << endl;
                        }
-#endif
+
                        // Output '.' every time when writing a game record.
                        std::cout << ".";

@@ -522,10 +508,6 @@ namespace Learner
        {
            // Write out one sfen.
            sfen_writer.write(thread_id, *it);
-#if 0
-            pos.set_from_packed_sfen(it->sfen);
-            cout << pos << "Win : " << it->is_win << " , " << it->score << endl;
-#endif
        }

        return quit;
@@ -717,6 +699,7 @@ namespace Learner
                    flush_psv(result.value());
                    break;
                }
+
                {
                    auto [search_value, search_pv] = search(pos, depth, 1, nodes);

@@ -744,19 +727,6 @@ namespace Learner
                    // Save the move score for adjudication.
                    move_hist_scores.push_back(search_value);

-#if 0
-                    dbg_hit_on(search_value == leaf_value);
-                    // gensfen depth 3 eval_limit 32000
-                    // Total 217749 Hits 203579 hit rate (%) 93.490
-                    // gensfen depth 6 eval_limit 32000
-                    // Total 78407 Hits 69190 hit rate (%) 88.245
-                    // gensfen depth 6 eval_limit 3000
-                    // Total 53879 Hits 43713 hit rate (%) 81.132
-
-                    // Problems such as pruning with moves in the substitution table.
-                    // This is a little uncomfortable as a teacher...
-#endif
-
                    // If depth 0, pv is not obtained, so search again at depth 2.
                    if (search_depth_min <= 0)
                    {
@@ -892,12 +862,6 @@ namespace Learner

        string token;

-        // When hit to eval hash, as a evaluation value near the initial stage, if a hash collision occurs and a large value is written
-        // When eval_limit is set small, eval_limit will be exceeded every time in the initial phase, and phase generation will not proceed.
-        // Therefore, eval hash needs to be disabled.
-        // After that, when the hash of the eval hash collides, the evaluation value of a strange value is used, and it may be unpleasant to use it for the teacher.
-        bool use_eval_hash = false;
-
        // Save to file in this unit.
        // File names are serialized like file_1.bin, file_2.bin.
        uint64_t save_every = UINT64_MAX;
@@ -946,8 +910,6 @@ namespace Learner
                is >> write_minply;
            else if (token == "write_maxply")
                is >> write_maxply;
-            else if (token == "use_eval_hash")
-                is >> use_eval_hash;
            else if (token == "save_every")
                is >> save_every;
            else if (token == "random_file_name")
@@ -1017,7 +979,6 @@ namespace Learner
            << "  write_minply           = " << write_minply << endl
            << "  write_maxply           = " << write_maxply << endl
            << "  output_file_name       = " << output_file_name << endl
-            << "  use_eval_hash          = " << use_eval_hash << endl
            << "  save_every             = " << save_every << endl
            << "  random_file_name       = " << random_file_name << endl
            << "  write_out_draw_game_in_training_data_generation = " << write_out_draw_game_in_training_data_generation << endl
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -5,84 +5,6 @@

 #include <vector>

-// =====================
-// Settings for learning
-// =====================
-
-// If you select one of the following, the details after that will be automatically selected.
-// If you don't select any of them, you need to set the subsequent details one by one.
-
-// Learning setting by elmo method. This is the default setting.
-// To make a standard squeeze diaphragm, specify "lambda 1" with the learn command.
-#define LEARN_ELMO_METHOD
-
-
-// ----------------------
-// update formula
-// ----------------------
-
-// Ada Grad. Recommended because it is stable.
-// #define ADA_GRAD_UPDATE
-
-// SGD looking only at the sign of the gradient. It requires less memory, but the accuracy is...
-// #define SGD_UPDATE
-
-// ----------------------
-// Settings for learning
-// ----------------------
-
-// mini-batch size.
-// Calculate the gradient by combining this number of phases.
-// If you make it smaller, the number of update_weights() will increase and the convergence will be faster. The gradient is incorrect.
-// If you increase it, the number of update_weights() decreases, so the convergence will be slow. The slope will come out accurately.
-// I don't think you need to change this value in most cases.
-
-#define LEARN_MINI_BATCH_SIZE (1000 * 1000 * 1)
-
-// The number of phases to read from the file at one time. After reading this much, shuffle.
-// It is better to have a certain size, but this number x 40 bytes x 3 times as much memory is consumed. 400MB*3 is consumed in the 10M phase.
-// Must be a multiple of THREAD_BUFFER_SIZE(=10000).
-
-#define LEARN_SFEN_READ_SIZE (1000 * 1000 * 10)
-
-// Saving interval of evaluation function at learning. Save each time you learn this number of phases.
-// Needless to say, the longer the saving interval, the shorter the learning time.
-// Folder name is incremented for each save like 0/, 1/, 2/...
-// By default, once every 1 billion phases.
-#define LEARN_EVAL_SAVE_INTERVAL (1000000000ULL)
-
-
-// ----------------------
-// Select the objective function
-// ----------------------
-
-// The objective function is the sum of squares of the difference in winning percentage
-// See learner.cpp for more information.
-
-// #define LOSS_FUNCTION_IS_WINNING_PERCENTAGE
-
-// Objective function is cross entropy
-// See learner.cpp for more information.
-// So-called ordinary "rag cloth squeezer"
-// #define LOSS_FUNCTION_IS_CROSS_ENTOROPY
-
-// A version in which the objective function is cross entropy, but the win rate function is not passed
-// #define LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE
-
-// elmo (WCSC27) method
-// #define LOSS_FUNCTION_IS_ELMO_METHOD
-
-// ※ Other things may be added.
-
-
-// ----------------------
-// debug settings for learning
-// ----------------------
-
-// Reduce the output of rmse during learning to 1 for this number of times.
-// rmse calculation is done in one thread, so it takes some time, so reducing the output is effective.
-#define LEARN_RMSE_OUTPUT_INTERVAL 1
-
 // ----------------------
 // Floating point for learning
 // ----------------------
@@ -109,28 +31,7 @@ typedef float LearnFloatType;
 // Learning with the method of elmo (WCSC27)
 // ----------------------

-#if defined( LEARN_ELMO_METHOD )
-#define LOSS_FUNCTION_IS_ELMO_METHOD
-#define ADA_GRAD_UPDATE
-#endif
-
-// Character string according to update formula. (Output for debugging.)
-// Implemented various update expressions, but concluded that AdaGrad is the best in terms of speed and memory.
-#if defined(ADA_GRAD_UPDATE)
-#define LEARN_UPDATE "AdaGrad"
-#elif defined(SGD_UPDATE)
-#define LEARN_UPDATE "SGD"
-#endif
-
-#if defined(LOSS_FUNCTION_IS_WINNING_PERCENTAGE)
-#define LOSS_FUNCTION "WINNING_PERCENTAGE"
-#elif defined(LOSS_FUNCTION_IS_CROSS_ENTOROPY)
-#define LOSS_FUNCTION "CROSS_ENTOROPY"
-#elif defined(LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE)
-#define LOSS_FUNCTION "CROSS_ENTOROPY_FOR_VALUE"
-#elif defined(LOSS_FUNCTION_IS_ELMO_METHOD)
 #define LOSS_FUNCTION "ELMO_METHOD(WCSC27)"
-#endif

 // ----------------------
 // Definition of struct used in Learner
@@ -139,6 +40,34 @@ typedef float LearnFloatType;

 namespace Learner
 {
+	// ----------------------
+	// Settings for learning
+	// ----------------------
+
+	// mini-batch size.
+	// Calculate the gradient by combining this number of phases.
+	// If you make it smaller, update_weights() is called more often and convergence is faster, but the gradient is less accurate.
+	// If you increase it, update_weights() is called less often, so convergence is slower, but the gradient is more accurate.
+	// I don't think you need to change this value in most cases.
+
+	constexpr std::size_t LEARN_MINI_BATCH_SIZE = 1000 * 1000 * 1;
+
+	// The number of phases to read from the file at one time. After reading this much, shuffle.
+	// It is better to have a certain size, but this number x 40 bytes x 3 times as much memory is consumed. 400MB*3 is consumed in the 10M phase.
+	// Must be a multiple of THREAD_BUFFER_SIZE(=10000).
+
+	constexpr std::size_t LEARN_SFEN_READ_SIZE = 1000 * 1000 * 10;
+
+	// Saving interval of evaluation function at learning. Save each time you learn this number of phases.
+	// Needless to say, the longer the saving interval, the shorter the learning time.
+	// Folder name is incremented for each save like 0/, 1/, 2/...
+	// By default, once every 1 billion phases.
+	constexpr std::size_t LEARN_EVAL_SAVE_INTERVAL = 1000000000ULL;
+
+	// Output rmse only once every this many times during learning.
+	// The rmse calculation runs in a single thread and takes some time, so printing it less often is effective.
+	constexpr std::size_t LEARN_RMSE_OUTPUT_INTERVAL = 1;
+
 	//Structure in which PackedSfen and evaluation value are integrated
 	// If you write different contents for each option, it will be a problem when reusing the teacher game
 	// For the time being, write all the following members regardless of the options.
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
--- a/src/learn/learning_tools.cpp
+++ b/src/learn/learning_tools.cpp
@@ -2,9 +2,6 @@

 #if defined (EVAL_LEARN)

-#if defined(_OPENMP)
-#include <omp.h>
-#endif
 #include "../misc.h"

 using namespace Eval;
--- a/src/learn/learning_tools.h
+++ b/src/learn/learning_tools.h
@@ -4,7 +4,11 @@
 // A set of machine learning tools related to the weight array used for machine learning of evaluation functions

 #include "learn.h"
+
 #if defined (EVAL_LEARN)
+
+#include "../misc.h"  // PRNG , my_insertion_sort
+
 #include <array>
 #include <cmath>	// std::sqrt()

@@ -24,14 +28,6 @@ namespace EvalLearningTools
 		// cumulative value of one mini-batch gradient
 		LearnFloatType g = LearnFloatType(0);

-		// When ADA_GRAD_UPDATE. LearnFloatType == float,
-		// total 4*2 + 4*2 + 1*2 = 18 bytes
-		// It suffices to secure a Weight array that is 4.5 times the size of the evaluation function parameter of 1GB.
-		// However, sizeof(Weight)==20 code is generated if the structure alignment is in 4-byte units, so
-		// Specify pragma pack(2).
-
-		// For SGD_UPDATE, this structure is reduced by 10 bytes to 8 bytes.
-
 		// Learning rate η(eta) such as AdaGrad.
 		// It is assumed that eta1,2,3,eta1_epoch,eta2_epoch have been set by the time updateFV() is called.
 		// The epoch of update_weights() gradually changes from eta1 to eta2 until eta1_epoch.
@@ -71,96 +67,6 @@ namespace EvalLearningTools

 		template <typename T> void updateFV(T& v) { updateFV(v, 1.0); }

-#if defined (ADA_GRAD_UPDATE)
-
-		// Since the maximum value that can be accurately calculated with float is INT16_MAX*256-1
-		// Keep the small value as a marker.
-		const LearnFloatType V0_NOT_INIT = (INT16_MAX * 128);
-
-		// What holds v internally. The previous implementation kept a fixed decimal with only a fractional part to save memory,
-		// Since it is doubtful in accuracy and the visibility is bad, it was abolished.
-		LearnFloatType v0 = LearnFloatType(V0_NOT_INIT);
-
-		// AdaGrad g2
-		LearnFloatType g2 = LearnFloatType(0);
-
-		// update with AdaGrad
-		// When executing this function, the value of g and the member do not change
-		// Guaranteed by the caller. It does not have to be an atomic operation.
-		// k is a coefficient for eta. 1.0 is usually sufficient. If you want to lower eta for your turn item, set this to 1/8.0 etc.
-		template <typename T>
-		void updateFV(T& v,double k)
-		{
-			// AdaGrad update formula
-			// Gradient vector is g, vector to be updated is v, η(eta) is a constant,
-			//     g2 = g2 + g^2
-			//     v = v - ηg/sqrt(g2)
-
-			constexpr double epsilon = 0.000001;
-
-			if (g == LearnFloatType(0))
-				return;
-
-			g2 += g * g;
-
-			// If v0 is V0_NOT_INIT, it means that the value is not initialized with the value of KK/KKP/KPP array,
-			// In this case, read the value of v from the one passed in the argument.
-			double V = (v0 == V0_NOT_INIT) ? v : v0;
-
-			V -= k * eta * (double)g / sqrt((double)g2 + epsilon);
-
-			// Limit the value of V to be within the range of types.
-			// By the way, windows.h defines the min and max macros, so to avoid it,
-			// Here, it is enclosed in parentheses so that it is not treated as a function-like macro.
-			V = (std::min)((double)(std::numeric_limits<T>::max)() , V);
-			V = (std::max)((double)(std::numeric_limits<T>::min)() , V);
-
-			v0 = (LearnFloatType)V;
-			v = (T)round(V);
-
-			// Clear g because one update of mini-batch for this element is over
-			// g[i] = 0;
-			// → There is a problem of dimension reduction, so this will be done by the caller.
-		}
-
-#elif defined(SGD_UPDATE)
-
-		// See only the sign of the gradient Update with SGD
-		// When executing this function, the value of g and the member do not change
-		// Guaranteed by the caller. It does not have to be an atomic operation.
-		template <typename T>
-		void updateFV(T & v , double k)
-		{
-			if (g == 0)
-				return;
-
-			// See only the sign of g and update.
-			// If g <0, add v a little.
-			// If g> 0, subtract v slightly.
-
-			// Since we only add integers, no decimal part is required.
-
-			// It's a good idea to move around 0-5.
-			// It is better to have a Gaussian distribution, so generate a 5-bit random number (each bit has a 1/2 probability of 1),
-			// Pop_count() it. At this time, it has a binomial distribution.
-			//int16_t diff = (int16_t)POPCNT32((u32)prng.rand(31));
-			// → If I do this with 80 threads, this AsyncPRNG::rand() locks, so I slowed down. This implementation is not good.
-			int16_t diff = 1;
-
-			double V = v;
-			if (g > 0.0)
-				V-= diff;
-			else
-				V+= diff;
-
-			V = (std::min)((double)(std::numeric_limits<T>::max)(), V);
-			V = (std::max)((double)(std::numeric_limits<T>::min)(), V);
-
-			v = (T)V;
-		}
-
-#endif
-
 		// grad setting
 		template <typename T> void set_grad(const T& g_) { g = g_; }

--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -685,18 +685,27 @@ void* aligned_malloc(size_t size, size_t align)
    return p;
 }

+std::uint64_t get_file_size(std::fstream& fs)
+{ // Returns the distance between the beginning and the end of fs; the read position is put back afterwards.
+    auto pos = fs.tellg(); // Remember the read position the stream has on entry.
+    // Measure the stream by seeking to both ends and subtracting the reported offsets.
+    fs.seekg(0, fstream::end);
+    const uint64_t eofPos = (uint64_t)fs.tellg();
+    fs.clear(); // Otherwise, the next seek may fail.
+    fs.seekg(0, fstream::beg);
+    const uint64_t begPos = (uint64_t)fs.tellg();
+    fs.seekg(pos); // Seek back to the position saved on entry.
+
+    return eofPos - begPos;
+}
+
 int read_file_to_memory(std::string filename, std::function<void* (uint64_t)> callback_func)
 {
    fstream fs(filename, ios::in | ios::binary);
    if (fs.fail())
        return 1;

-    fs.seekg(0, fstream::end);
-    uint64_t eofPos = (uint64_t)fs.tellg();
-    fs.clear(); // Otherwise the next seek may fail.
-    fs.seekg(0, fstream::beg);
-    uint64_t begPos = (uint64_t)fs.tellg();
-    uint64_t file_size = eofPos - begPos;
+    const uint64_t file_size = get_file_size(fs);
    //std::cout << "filename = " << filename << " , file_size = " << file_size << endl;

    // I know the file size, so call callback_func to get a buffer for this,
--- a/src/misc.h
+++ b/src/misc.h
@@ -26,6 +26,8 @@
 #include <ostream>
 #include <string>
 #include <vector>
+#include <utility>
+#include <cmath>

 #include "types.h"

@@ -153,6 +155,7 @@ std::string now_string();
 // Also, if the buffer cannot be allocated in the callback function or if the file size is different from the expected file size,
 // Return nullptr. At this time, read_file_to_memory() interrupts reading and returns with an error.

+std::uint64_t get_file_size(std::fstream& fs);
 int read_file_to_memory(std::string filename, std::function<void* (uint64_t)> callback_func);
 int write_memory_to_file(std::string filename, void* ptr, uint64_t size);

@@ -197,20 +200,38 @@ inline std::ostream& operator<<(std::ostream& os, AsyncPRNG& prng)

 // Mathematical function used for progress calculation and learning
 namespace Math {
-	// Sigmoid function
-	// = 1.0 / (1.0 + std::exp(-x))
-	double sigmoid(double x);
+    inline double sigmoid(double x)
+    {
+        return 1.0 / (1.0 + std::exp(-x));
+    }

-	// Differentiation of sigmoid function
-	// = sigmoid(x) * (1.0-sigmoid(x))
-	double dsigmoid(double x);
+    inline double dsigmoid(double x)
+    {
+        // Derivative of the sigmoid function.
+        // With f(x) = 1 / (1 + exp(-x)),
+        // the first derivative is
+        // f'(x) = df/dx = f(x) * (1 - f(x)),
+        // which is the expression returned below.
+
+        return sigmoid(x) * (1.0 - sigmoid(x));
+    }

 	// Clip v so that it fits between [lo,hi].
 	// * In Stockfish, this function is written in bitboard.h.
 	template<class T> constexpr const T& clamp(const T& v, const T& lo, const T& hi) {
 		return v < lo ? lo : v > hi ? hi : v;
 	}
+}

+namespace Algo {
+    // Fisher-Yates shuffle: reorders the elements of buf in place using indices drawn from prng.
+    template <typename Rng, typename T>
+    void shuffle(std::vector<T>& buf, Rng&& prng)
+    {
+        const auto size = buf.size();
+        for (uint64_t i = 0; i < size; ++i) // NOTE(review): assumes prng.rand(n) returns a value in [0, n) - confirm.
+            std::swap(buf[i], buf[prng.rand(size - i) + i]); // Swap slot i with the slot prng.rand(size - i) places after it.
+    }
 }

 // --------------------
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -165,6 +165,7 @@ namespace Eval::NNUE {
  bool load_eval(std::string streamName, std::istream& stream) {

    Initialize();
+
    if (Options["SkipLoadingEval"])
    {
      std::cout << "info string SkipLoadingEval set to true, Net not loaded!" << std::endl;
--- a/src/nnue/features/castling_right.cpp
+++ b/src/nnue/features/castling_right.cpp
@@ -26,7 +26,7 @@ namespace Eval {
            & ((castling_rights >> 2) & 3);
        }

-        for (unsigned int i = 0; i <kDimensions; ++i) {
+        for (Eval::NNUE::IndexType i = 0; i < kDimensions; ++i) {
          if (relative_castling_rights & (i << 1)) {
            active->push_back(i);
          }
@@ -36,7 +36,7 @@ namespace Eval {
      // Get a list of indices whose values have changed from the previous one in the feature quantity
      void CastlingRight::AppendChangedIndices(
        const Position& pos, Color perspective,
-        IndexList* removed, IndexList* added) {
+        IndexList* removed, IndexList* /* added */) {

        int previous_castling_rights = pos.state()->previous->castlingRights;
        int current_castling_rights = pos.state()->castlingRights;
@@ -54,7 +54,7 @@ namespace Eval {
            & ((current_castling_rights >> 2) & 3);
        }

-        for (unsigned int i = 0; i < kDimensions; ++i) {
+        for (Eval::NNUE::IndexType i = 0; i < kDimensions; ++i) {
          if ((relative_previous_castling_rights & (i << 1)) &&
            (relative_current_castling_rights & (i << 1)) == 0) {
            removed->push_back(i);
--- a/src/nnue/features/enpassant.cpp
+++ b/src/nnue/features/enpassant.cpp
@@ -30,8 +30,8 @@ namespace Eval {

      // Get a list of indices whose values ??have changed from the previous one in the feature quantity
      void EnPassant::AppendChangedIndices(
-        const Position& pos, Color perspective,
-        IndexList* removed, IndexList* added) {
+        const Position& /* pos */, Color /* perspective */,
+        IndexList* /* removed */, IndexList* /* added */) {
        // Not implemented.
        assert(false);
      }
--- a/src/nnue/nnue_test_command.cpp
+++ b/src/nnue/nnue_test_command.cpp
@@ -1,7 +1,5 @@
 // USI extended command for NNUE evaluation function

-#if defined(ENABLE_TEST_CMD)
-
 #include "../thread.h"
 #include "../uci.h"
 #include "evaluate_nnue.h"
@@ -36,12 +34,12 @@ void TestFeatures(Position& pos) {
  std::vector<std::uint64_t> num_resets(kRefreshTriggers.size());
  constexpr IndexType kUnknown = -1;
  std::vector<IndexType> trigger_map(RawFeatures::kDimensions, kUnknown);
-  auto make_index_sets = [&](const Position& pos) {
+  auto make_index_sets = [&](const Position& position) {
    std::vector<std::vector<std::set<IndexType>>> index_sets(
        kRefreshTriggers.size(), std::vector<std::set<IndexType>>(2));
    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
      Features::IndexList active_indices[2];
-      RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
+      RawFeatures::AppendActiveIndices(position, kRefreshTriggers[i],
                                       active_indices);
      for (const auto perspective : Colors) {
        for (const auto index : active_indices[perspective]) {
@@ -55,11 +53,11 @@ void TestFeatures(Position& pos) {
    }
    return index_sets;
  };
-  auto update_index_sets = [&](const Position& pos, auto* index_sets) {
+  auto update_index_sets = [&](const Position& position, auto* index_sets) {
    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
      Features::IndexList removed_indices[2], added_indices[2];
      bool reset[2];
-      RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
+      RawFeatures::AppendChangedIndices(position, kRefreshTriggers[i],
                                        removed_indices, added_indices, reset);
      for (const auto perspective : Colors) {
        if (reset[perspective]) {
@@ -197,5 +195,3 @@ void TestCommand(Position& pos, std::istream& stream) {
 }  // namespace NNUE

 }  // namespace Eval
-
-#endif  // defined(ENABLE_TEST_CMD)
--- a/src/nnue/nnue_test_command.h
+++ b/src/nnue/nnue_test_command.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_TEST_COMMAND_H_
 #define _NNUE_TEST_COMMAND_H_

-#if defined(ENABLE_TEST_CMD)
-
 namespace Eval {

 namespace NNUE {
@@ -16,6 +14,4 @@ void TestCommand(Position& pos, std::istream& stream);

 }  // namespace Eval

-#endif  // defined(ENABLE_TEST_CMD)
-
 #endif
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -24,17 +24,14 @@

 #include "evaluate.h"
 #include "movegen.h"
+#include "nnue/nnue_test_command.h"
 #include "position.h"
 #include "search.h"
+#include "syzygy/tbprobe.h"
 #include "thread.h"
 #include "timeman.h"
 #include "tt.h"
 #include "uci.h"
-#include "syzygy/tbprobe.h"
-
-#if defined(ENABLE_TEST_CMD)
-#include "nnue/nnue_test_command.h"
-#endif

 using namespace std;

@@ -53,7 +50,6 @@ namespace Learner
  // Learning from the generated game record
  void learn(Position& pos, istringstream& is);

-
  // A pair of reader and evaluation value. Returned by Learner::search(),Learner::qsearch().
  typedef std::pair<Value, std::vector<Move> > ValueAndPV;

@@ -63,7 +59,6 @@ namespace Learner
 }
 #endif

-#if defined(ENABLE_TEST_CMD)
 void test_cmd(Position& pos, istringstream& is)
 {
    // Initialize as it may be searched.
@@ -74,7 +69,6 @@ void test_cmd(Position& pos, istringstream& is)

    if (param == "nnue") Eval::NNUE::TestCommand(pos, is);
 }
-#endif

 namespace {

@@ -365,10 +359,8 @@ void UCI::loop(int argc, char* argv[]) {

 #endif

-#if defined(ENABLE_TEST_CMD)
      // test command
      else if (token == "test") test_cmd(pos, is);
-#endif
      else
          sync_cout << "Unknown command: " << cmd << sync_endl;