Remove whole file shuffling as it does not change learning behaviour, only works for bin, and is considered harmful for binpack.

This commit is contained in:
Tomasz Sobczyk
2020-10-18 14:29:12 +02:00
committed by nodchip
parent 7b4a769cca
commit 9564a52523

View File

@@ -904,252 +904,6 @@ namespace Learner
return false;
}
// Writing stage shared by shuffle_files() and shuffle_files_quick().
//
// Merges the records of several input streams into a single output file.
// For every output record one input stream is picked at random, weighted by
// the number of records that stream still has left, and the next record of
// that stream is copied to the output.
//
// output_file_name   : name of the file to write
// prng               : random number generator used to pick the source stream
// sfen_file_streams  : one input stream per teacher position file
// sfen_count_in_file : number of teacher positions remaining in each stream;
//                      the entries are decremented in place while writing
//
// If the output file cannot be opened an error is printed and nothing is
// consumed from the input streams.
void shuffle_write(
    const string& output_file_name,
    PRNG& prng,
    vector<fstream>& sfen_file_streams,
    vector<uint64_t>& sfen_count_in_file)
{
    uint64_t total_sfen_count = 0;
    for (const auto c : sfen_count_in_file)
        total_sfen_count += c;

    // Number of positions written to the output so far.
    uint64_t write_sfen_count = 0;

    // Progress is printed every `report_interval` positions and once
    // everything has been written.
    const uint64_t report_interval = 10000000;
    auto print_status = [&]()
    {
        if (((write_sfen_count % report_interval) == 0) ||
            (write_sfen_count == total_sfen_count))
        {
            cout << write_sfen_count << " / " << total_sfen_count << endl;
        }
    };

    cout << endl << "write : " << output_file_name << endl;

    fstream fs(output_file_name, ios::out | ios::binary);
    if (!fs)
    {
        // Without this check the loop below would consume every input
        // record, write nothing, and still report success.
        cout << "Error: cannot open " << output_file_name << " for writing." << endl;
        return;
    }

    // Number of positions that still have to be drawn from the inputs.
    uint64_t sfen_count_left = total_sfen_count;
    while (sfen_count_left != 0)
    {
        // NOTE(review): assumes prng.rand(n) returns a value in [0, n);
        // the scan below indexes out of range otherwise.
        auto r = prng.rand(sfen_count_left);

        // Treat the remaining records of all streams as one concatenated
        // sequence and find the stream that position r falls into.
        uint64_t i = 0;
        while (sfen_count_in_file[i] <= r)
            r -= sfen_count_in_file[i++];

        // Stream i is chosen; account for the record before reading it.
        --sfen_count_in_file[i];
        --sfen_count_left;

        // Records are copied one at a time; batching reads and writes would
        // be faster but is not done here.
        PackedSfenValue psv;
        if (sfen_file_streams[i].read(reinterpret_cast<char*>(&psv), sizeof(PackedSfenValue)))
        {
            fs.write(reinterpret_cast<const char*>(&psv), sizeof(PackedSfenValue));
            ++write_sfen_count;
            print_status();
        }
    }

    print_status();
    fs.close();

    // Every loop iteration either wrote a record or hit a failed read, so
    // the difference is exactly the number of records that were skipped.
    if (write_sfen_count != total_sfen_count)
    {
        cout << "Warning: " << (total_sfen_count - write_sfen_count)
             << " positions could not be read and were skipped." << endl;
    }

    if (fs.fail())
    {
        cout << "Error: failed to write " << output_file_name << endl;
        return;
    }

    cout << "done!" << endl;
}
// Subcontracting the teacher shuffle "learn shuffle" command.
// output_file_name: name of the output file where the shuffled teacher positions will be written
void shuffle_files(const vector<string>& filenames, const string& output_file_name, uint64_t buffer_size, const std::string& seed)
{
// The destination folder is
// tmp/ for temporary writing
// Temporary file is written to tmp/ folder for each buffer_size phase.
// For example, if buffer_size = 20M, you need a buffer of 20M*40bytes = 800MB.
// In a PC with a small memory, it would be better to reduce this.
// However, if the number of files increases too much,
// it will not be possible to open at the same time due to OS restrictions.
// There should have been a limit of 512 per process on Windows, so you can open here as 500,
// The current setting is 500 files x 20M = 10G = 10 billion phases.
PSVector buf(buffer_size);
// ↑ buffer, a marker that indicates how much you have used
uint64_t buf_write_marker = 0;
// File name to write (incremental counter because it is a serial number)
uint64_t write_file_count = 0;
// random number to shuffle
// Do not use std::random_device(). Because it always the same integers on MinGW.
PRNG prng(seed);
// generate the name of the temporary file
auto make_filename = [](uint64_t i)
{
return "tmp/" + to_string(i) + ".bin";
};
// Exported files in tmp/ folder, number of teacher positions stored in each
vector<uint64_t> a_count;
auto write_buffer = [&](uint64_t size)
{
Algo::shuffle(buf, prng);
// write to a file
fstream fs;
fs.open(make_filename(write_file_count++), ios::out | ios::binary);
fs.write(reinterpret_cast<char*>(buf.data()), size * sizeof(PackedSfenValue));
fs.close();
a_count.push_back(size);
buf_write_marker = 0;
cout << ".";
};
std::filesystem::create_directory("tmp");
// Shuffle and export as a 10M phase shredded file.
for (auto filename : filenames)
{
fstream fs(filename, ios::in | ios::binary);
cout << endl << "open file = " << filename;
while (fs.read(reinterpret_cast<char*>(&buf[buf_write_marker]), sizeof(PackedSfenValue)))
if (++buf_write_marker == buffer_size)
write_buffer(buffer_size);
// Read in units of sizeof(PackedSfenValue),
// Ignore the last remaining fraction. (Fails in fs.read, so exit while)
// (The remaining fraction seems to be half-finished data
// that was created because it was stopped halfway during teacher generation.)
}
if (buf_write_marker != 0)
write_buffer(buf_write_marker);
// Only shuffled files have been written write_file_count.
// As a second pass, if you open all of them at the same time,
// select one at random and load one phase at a time
// Now you have shuffled.
// Original file for shirt full + tmp file + file to write
// requires 3 times the storage capacity of the original file.
// 1 billion SSD is not enough for shuffling because it is 400GB for 10 billion phases.
// If you want to delete (or delete by hand) the
// original file at this point after writing to tmp,
// The storage capacity is about twice that of the original file.
// So, maybe we should have an option to delete the original file.
// Files are opened at the same time. It is highly possible that this will exceed FOPEN_MAX.
// In that case, rather than adjusting buffer_size to reduce the number of files.
vector<fstream> afs;
for (uint64_t i = 0; i < write_file_count; ++i)
afs.emplace_back(fstream(make_filename(i), ios::in | ios::binary));
// Throw to the subcontract function and end.
shuffle_write(output_file_name, prng, afs, a_count);
}
// Subcontracting the teacher shuffle "learn shuffleq" command.
// This is written in 1 pass.
// output_file_name: name of the output file where the shuffled teacher positions will be written
void shuffle_files_quick(const vector<string>& filenames, const string& output_file_name, const std::string& seed)
{
// random number to shuffle
// Do not use std::random_device(). Because it always the same integers on MinGW.
PRNG prng(seed);
// number of files
const size_t file_count = filenames.size();
// Number of teacher positions stored in each file in filenames
vector<uint64_t> sfen_count_in_file(file_count);
// Count the number of teacher aspects in each file.
vector<fstream> sfen_file_streams(file_count);
for (size_t i = 0; i < file_count; ++i)
{
auto filename = filenames[i];
auto& fs = sfen_file_streams[i];
fs.open(filename, ios::in | ios::binary);
const uint64_t file_size = get_file_size(fs);
const uint64_t sfen_count = file_size / sizeof(PackedSfenValue);
sfen_count_in_file[i] = sfen_count;
// Output the number of sfen stored in each file.
cout << filename << " = " << sfen_count << " sfens." << endl;
}
// Since we know the file size of each file,
// open them all at once (already open),
// Select one at a time and load one phase at a time
// Now you have shuffled.
// Throw to the subcontract function and end.
shuffle_write(output_file_name, prng, sfen_file_streams, sfen_count_in_file);
}
// Implementation of the teacher shuffle command "learn shufflem".
//
// Loads every input file completely into memory, shuffles all records at once
// and writes the result to the specified file. Requires enough memory to hold
// the contents of all input files.
//
// filenames        : teacher position files to read
// output_file_name : name of the file the shuffled positions are written to
// seed             : seed for the random number generator
void shuffle_files_on_memory(const vector<string>& filenames, const string output_file_name, const std::string& seed)
{
    PSVector buf;

    for (const auto& filename : filenames)
    {
        std::cout << "read : " << filename << std::endl;
        read_file_to_memory(filename, [&buf](uint64_t size) {
            // NOTE(review): only checked in debug builds. Presumably `size`
            // is the file size in bytes and that many bytes are stored at the
            // returned address; confirm how a trailing partial record is
            // handled when asserts are disabled.
            assert((size % sizeof(PackedSfenValue)) == 0);

            // Grow the buffer and hand back the address just past the old end.
            const uint64_t last = buf.size();
            buf.resize(last + size / sizeof(PackedSfenValue));

            // data() + last stays valid when no records were added, whereas
            // indexing buf[last] would be out of bounds in that case.
            return static_cast<void*>(buf.data() + last);
        });
    }

    // Shuffle all of buf.
    // Do not use std::random_device(): it always yields the same integers on MinGW.
    PRNG prng(seed);
    const auto size = static_cast<uint64_t>(buf.size());
    std::cout << "shuffle buf.size() = " << size << std::endl;

    Algo::shuffle(buf, prng);

    std::cout << "write : " << output_file_name << endl;

    // A file larger than 2GB cannot be written with a single fstream::write
    // call, so the wrapper is used. data() is used instead of &buf[0] so an
    // empty buffer does not index out of bounds.
    write_memory_to_file(
        output_file_name,
        static_cast<void*>(buf.data()),
        sizeof(PackedSfenValue) * buf.size());

    std::cout << "..shuffle_on_memory done." << std::endl;
}
static void set_learning_search_limits()
{
// About Search::Limits
@@ -1192,13 +946,6 @@ namespace Learner
// --- Function that only shuffles the teacher aspect
// normal shuffle
bool shuffle_normal = false;
uint64_t buffer_size = 20000000;
// fast shuffling assuming each file is shuffled
bool shuffle_quick = false;
// A function to read the entire file in memory and shuffle it.
// (Requires file size memory)
bool shuffle_on_memory = false;
// Conversion of packed sfen. The plain format consists of the sfen (string),
// evaluation value (integer), move (e.g. 7g7f, string) and result (loss -1, win 1, draw 0).
bool use_convert_plain = false;
@@ -1318,13 +1065,6 @@ namespace Learner
else if (option == "reduction_gameply") is >> reduction_gameply;
// shuffle related
else if (option == "shuffle") shuffle_normal = true;
else if (option == "buffer_size") is >> buffer_size;
else if (option == "shuffleq") shuffle_quick = true;
else if (option == "shufflem") shuffle_on_memory = true;
else if (option == "output_file_name") is >> output_file_name;
else if (option == "eval_limit") is >> eval_limit;
else if (option == "save_only_once") save_only_once = true;
else if (option == "no_shuffle") no_shuffle = true;
@@ -1404,29 +1144,6 @@ namespace Learner
cout << "base dir : " << base_dir << endl;
cout << "target dir : " << target_dir << endl;
// shuffle mode
if (shuffle_normal)
{
cout << "buffer_size : " << buffer_size << endl;
cout << "shuffle mode.." << endl;
shuffle_files(filenames, output_file_name, buffer_size, seed);
return;
}
if (shuffle_quick)
{
cout << "quick shuffle mode.." << endl;
shuffle_files_quick(filenames, output_file_name, seed);
return;
}
if (shuffle_on_memory)
{
cout << "shuffle on memory.." << endl;
shuffle_files_on_memory(filenames, output_file_name, seed);
return;
}
if (use_convert_plain)
{
Eval::NNUE::init();