mirror of
https://github.com/HChaZZY/Stockfish.git
synced 2025-12-25 11:36:51 +08:00
Remove whole file shuffling as it does not change learning behaviour, only works for bin, and is considered harmful for binpack.
This commit is contained in:
@@ -904,252 +904,6 @@ namespace Learner
|
||||
return false;
|
||||
}
|
||||
|
||||
// Shuffle_files(), shuffle_files_quick() subcontracting, writing part.
|
||||
// output_file_name: Name of the file to write
|
||||
// prng: random number generator
|
||||
// sfen_file_streams: fstream of each teacher phase file
|
||||
// sfen_count_in_file: The number of teacher positions present in each file.
|
||||
void shuffle_write(
|
||||
const string& output_file_name,
|
||||
PRNG& prng,
|
||||
vector<fstream>& sfen_file_streams,
|
||||
vector<uint64_t>& sfen_count_in_file)
|
||||
{
|
||||
uint64_t total_sfen_count = 0;
|
||||
for (auto c : sfen_count_in_file)
|
||||
total_sfen_count += c;
|
||||
|
||||
// number of exported phases
|
||||
uint64_t write_sfen_count = 0;
|
||||
|
||||
// Output the progress on the screen for each phase.
|
||||
const uint64_t buffer_size = 10000000;
|
||||
|
||||
auto print_status = [&]()
|
||||
{
|
||||
// Output progress every 10M phase or when all writing is completed
|
||||
if (((write_sfen_count % buffer_size) == 0) ||
|
||||
(write_sfen_count == total_sfen_count))
|
||||
{
|
||||
cout << write_sfen_count << " / " << total_sfen_count << endl;
|
||||
}
|
||||
};
|
||||
|
||||
cout << endl << "write : " << output_file_name << endl;
|
||||
|
||||
fstream fs(output_file_name, ios::out | ios::binary);
|
||||
|
||||
// total teacher positions
|
||||
uint64_t sfen_count_left = total_sfen_count;
|
||||
|
||||
while (sfen_count_left != 0)
|
||||
{
|
||||
auto r = prng.rand(sfen_count_left);
|
||||
|
||||
// Aspects stored in fs[0] file ... Aspects stored in fs[1] file ...
|
||||
//Think of it as a series like, and determine in which file r is pointing.
|
||||
// The contents of the file are shuffled, so you can take the next element from that file.
|
||||
// Each file has a_count[x] phases, so this process can be written as follows.
|
||||
|
||||
uint64_t i = 0;
|
||||
while (sfen_count_in_file[i] <= r)
|
||||
r -= sfen_count_in_file[i++];
|
||||
|
||||
// This confirms n. Before you forget it, reduce the remaining number.
|
||||
|
||||
--sfen_count_in_file[i];
|
||||
--sfen_count_left;
|
||||
|
||||
PackedSfenValue psv;
|
||||
// It's better to read and write all at once until the performance is not so good...
|
||||
if (sfen_file_streams[i].read((char*)&psv, sizeof(PackedSfenValue)))
|
||||
{
|
||||
fs.write((char*)&psv, sizeof(PackedSfenValue));
|
||||
++write_sfen_count;
|
||||
print_status();
|
||||
}
|
||||
}
|
||||
|
||||
print_status();
|
||||
fs.close();
|
||||
|
||||
cout << "done!" << endl;
|
||||
}
|
||||
|
||||
// Subcontracting the teacher shuffle "learn shuffle" command.
|
||||
// output_file_name: name of the output file where the shuffled teacher positions will be written
|
||||
void shuffle_files(const vector<string>& filenames, const string& output_file_name, uint64_t buffer_size, const std::string& seed)
|
||||
{
|
||||
// The destination folder is
|
||||
// tmp/ for temporary writing
|
||||
|
||||
// Temporary file is written to tmp/ folder for each buffer_size phase.
|
||||
// For example, if buffer_size = 20M, you need a buffer of 20M*40bytes = 800MB.
|
||||
// In a PC with a small memory, it would be better to reduce this.
|
||||
// However, if the number of files increases too much,
|
||||
// it will not be possible to open at the same time due to OS restrictions.
|
||||
// There should have been a limit of 512 per process on Windows, so you can open here as 500,
|
||||
// The current setting is 500 files x 20M = 10G = 10 billion phases.
|
||||
|
||||
PSVector buf(buffer_size);
|
||||
|
||||
// ↑ buffer, a marker that indicates how much you have used
|
||||
uint64_t buf_write_marker = 0;
|
||||
|
||||
// File name to write (incremental counter because it is a serial number)
|
||||
uint64_t write_file_count = 0;
|
||||
|
||||
// random number to shuffle
|
||||
// Do not use std::random_device(). Because it always the same integers on MinGW.
|
||||
PRNG prng(seed);
|
||||
|
||||
// generate the name of the temporary file
|
||||
auto make_filename = [](uint64_t i)
|
||||
{
|
||||
return "tmp/" + to_string(i) + ".bin";
|
||||
};
|
||||
|
||||
// Exported files in tmp/ folder, number of teacher positions stored in each
|
||||
vector<uint64_t> a_count;
|
||||
|
||||
auto write_buffer = [&](uint64_t size)
|
||||
{
|
||||
Algo::shuffle(buf, prng);
|
||||
|
||||
// write to a file
|
||||
fstream fs;
|
||||
fs.open(make_filename(write_file_count++), ios::out | ios::binary);
|
||||
fs.write(reinterpret_cast<char*>(buf.data()), size * sizeof(PackedSfenValue));
|
||||
fs.close();
|
||||
a_count.push_back(size);
|
||||
|
||||
buf_write_marker = 0;
|
||||
cout << ".";
|
||||
};
|
||||
|
||||
std::filesystem::create_directory("tmp");
|
||||
|
||||
// Shuffle and export as a 10M phase shredded file.
|
||||
for (auto filename : filenames)
|
||||
{
|
||||
fstream fs(filename, ios::in | ios::binary);
|
||||
cout << endl << "open file = " << filename;
|
||||
while (fs.read(reinterpret_cast<char*>(&buf[buf_write_marker]), sizeof(PackedSfenValue)))
|
||||
if (++buf_write_marker == buffer_size)
|
||||
write_buffer(buffer_size);
|
||||
|
||||
// Read in units of sizeof(PackedSfenValue),
|
||||
// Ignore the last remaining fraction. (Fails in fs.read, so exit while)
|
||||
// (The remaining fraction seems to be half-finished data
|
||||
// that was created because it was stopped halfway during teacher generation.)
|
||||
}
|
||||
|
||||
if (buf_write_marker != 0)
|
||||
write_buffer(buf_write_marker);
|
||||
|
||||
// Only shuffled files have been written write_file_count.
|
||||
// As a second pass, if you open all of them at the same time,
|
||||
// select one at random and load one phase at a time
|
||||
// Now you have shuffled.
|
||||
|
||||
// Original file for shirt full + tmp file + file to write
|
||||
// requires 3 times the storage capacity of the original file.
|
||||
// 1 billion SSD is not enough for shuffling because it is 400GB for 10 billion phases.
|
||||
// If you want to delete (or delete by hand) the
|
||||
// original file at this point after writing to tmp,
|
||||
// The storage capacity is about twice that of the original file.
|
||||
// So, maybe we should have an option to delete the original file.
|
||||
|
||||
// Files are opened at the same time. It is highly possible that this will exceed FOPEN_MAX.
|
||||
// In that case, rather than adjusting buffer_size to reduce the number of files.
|
||||
|
||||
vector<fstream> afs;
|
||||
for (uint64_t i = 0; i < write_file_count; ++i)
|
||||
afs.emplace_back(fstream(make_filename(i), ios::in | ios::binary));
|
||||
|
||||
// Throw to the subcontract function and end.
|
||||
shuffle_write(output_file_name, prng, afs, a_count);
|
||||
}
|
||||
|
||||
// Subcontracting the teacher shuffle "learn shuffleq" command.
|
||||
// This is written in 1 pass.
|
||||
// output_file_name: name of the output file where the shuffled teacher positions will be written
|
||||
void shuffle_files_quick(const vector<string>& filenames, const string& output_file_name, const std::string& seed)
|
||||
{
|
||||
// random number to shuffle
|
||||
// Do not use std::random_device(). Because it always the same integers on MinGW.
|
||||
PRNG prng(seed);
|
||||
|
||||
// number of files
|
||||
const size_t file_count = filenames.size();
|
||||
|
||||
// Number of teacher positions stored in each file in filenames
|
||||
vector<uint64_t> sfen_count_in_file(file_count);
|
||||
|
||||
// Count the number of teacher aspects in each file.
|
||||
vector<fstream> sfen_file_streams(file_count);
|
||||
|
||||
for (size_t i = 0; i < file_count; ++i)
|
||||
{
|
||||
auto filename = filenames[i];
|
||||
auto& fs = sfen_file_streams[i];
|
||||
|
||||
fs.open(filename, ios::in | ios::binary);
|
||||
const uint64_t file_size = get_file_size(fs);
|
||||
const uint64_t sfen_count = file_size / sizeof(PackedSfenValue);
|
||||
sfen_count_in_file[i] = sfen_count;
|
||||
|
||||
// Output the number of sfen stored in each file.
|
||||
cout << filename << " = " << sfen_count << " sfens." << endl;
|
||||
}
|
||||
|
||||
// Since we know the file size of each file,
|
||||
// open them all at once (already open),
|
||||
// Select one at a time and load one phase at a time
|
||||
// Now you have shuffled.
|
||||
|
||||
// Throw to the subcontract function and end.
|
||||
shuffle_write(output_file_name, prng, sfen_file_streams, sfen_count_in_file);
|
||||
}
|
||||
|
||||
// Subcontracting the teacher shuffle "learn shufflem" command.
|
||||
// Read the whole memory and write it out with the specified file name.
|
||||
void shuffle_files_on_memory(const vector<string>& filenames, const string output_file_name, const std::string& seed)
|
||||
{
|
||||
PSVector buf;
|
||||
|
||||
for (auto filename : filenames)
|
||||
{
|
||||
std::cout << "read : " << filename << std::endl;
|
||||
read_file_to_memory(filename, [&buf](uint64_t size) {
|
||||
assert((size % sizeof(PackedSfenValue)) == 0);
|
||||
// Expand the buffer and read after the last end.
|
||||
uint64_t last = buf.size();
|
||||
buf.resize(last + size / sizeof(PackedSfenValue));
|
||||
return (void*)&buf[last];
|
||||
});
|
||||
}
|
||||
|
||||
// shuffle from buf[0] to buf[size-1]
|
||||
// Do not use std::random_device(). Because it always the same integers on MinGW.
|
||||
PRNG prng(seed);
|
||||
uint64_t size = (uint64_t)buf.size();
|
||||
std::cout << "shuffle buf.size() = " << size << std::endl;
|
||||
|
||||
Algo::shuffle(buf, prng);
|
||||
|
||||
std::cout << "write : " << output_file_name << endl;
|
||||
|
||||
// If the file to be written exceeds 2GB, it cannot be
|
||||
// written in one shot with fstream::write, so use wrapper.
|
||||
write_memory_to_file(
|
||||
output_file_name,
|
||||
(void*)&buf[0],
|
||||
sizeof(PackedSfenValue) * buf.size());
|
||||
|
||||
std::cout << "..shuffle_on_memory done." << std::endl;
|
||||
}
|
||||
|
||||
static void set_learning_search_limits()
|
||||
{
|
||||
// About Search::Limits
|
||||
@@ -1192,13 +946,6 @@ namespace Learner
|
||||
// --- Function that only shuffles the teacher aspect
|
||||
|
||||
// normal shuffle
|
||||
bool shuffle_normal = false;
|
||||
uint64_t buffer_size = 20000000;
|
||||
// fast shuffling assuming each file is shuffled
|
||||
bool shuffle_quick = false;
|
||||
// A function to read the entire file in memory and shuffle it.
|
||||
// (Requires file size memory)
|
||||
bool shuffle_on_memory = false;
|
||||
// Conversion of packed sfen. In plain, it consists of sfen(string),
|
||||
// evaluation value (integer), move (eg 7g7f, string), result (loss-1, win 1, draw 0)
|
||||
bool use_convert_plain = false;
|
||||
@@ -1318,13 +1065,6 @@ namespace Learner
|
||||
|
||||
else if (option == "reduction_gameply") is >> reduction_gameply;
|
||||
|
||||
// shuffle related
|
||||
else if (option == "shuffle") shuffle_normal = true;
|
||||
else if (option == "buffer_size") is >> buffer_size;
|
||||
else if (option == "shuffleq") shuffle_quick = true;
|
||||
else if (option == "shufflem") shuffle_on_memory = true;
|
||||
else if (option == "output_file_name") is >> output_file_name;
|
||||
|
||||
else if (option == "eval_limit") is >> eval_limit;
|
||||
else if (option == "save_only_once") save_only_once = true;
|
||||
else if (option == "no_shuffle") no_shuffle = true;
|
||||
@@ -1404,29 +1144,6 @@ namespace Learner
|
||||
cout << "base dir : " << base_dir << endl;
|
||||
cout << "target dir : " << target_dir << endl;
|
||||
|
||||
// shuffle mode
|
||||
if (shuffle_normal)
|
||||
{
|
||||
cout << "buffer_size : " << buffer_size << endl;
|
||||
cout << "shuffle mode.." << endl;
|
||||
shuffle_files(filenames, output_file_name, buffer_size, seed);
|
||||
return;
|
||||
}
|
||||
|
||||
if (shuffle_quick)
|
||||
{
|
||||
cout << "quick shuffle mode.." << endl;
|
||||
shuffle_files_quick(filenames, output_file_name, seed);
|
||||
return;
|
||||
}
|
||||
|
||||
if (shuffle_on_memory)
|
||||
{
|
||||
cout << "shuffle on memory.." << endl;
|
||||
shuffle_files_on_memory(filenames, output_file_name, seed);
|
||||
return;
|
||||
}
|
||||
|
||||
if (use_convert_plain)
|
||||
{
|
||||
Eval::NNUE::init();
|
||||
|
||||
Reference in New Issue
Block a user