From 9564a52523b6001ea4d0e34fa17b8835c4a7b116 Mon Sep 17 00:00:00 2001 From: Tomasz Sobczyk Date: Sun, 18 Oct 2020 14:29:12 +0200 Subject: [PATCH] Remove whole file shuffling as it does not change learning behaviour, only works for bin, and is considered harmful for binpack. --- src/learn/learn.cpp | 283 -------------------------------------------- 1 file changed, 283 deletions(-) diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp index b945e06c..2cab54b7 100644 --- a/src/learn/learn.cpp +++ b/src/learn/learn.cpp @@ -904,252 +904,6 @@ namespace Learner return false; } - // Shuffle_files(), shuffle_files_quick() subcontracting, writing part. - // output_file_name: Name of the file to write - // prng: random number generator - // sfen_file_streams: fstream of each teacher phase file - // sfen_count_in_file: The number of teacher positions present in each file. - void shuffle_write( - const string& output_file_name, - PRNG& prng, - vector& sfen_file_streams, - vector& sfen_count_in_file) - { - uint64_t total_sfen_count = 0; - for (auto c : sfen_count_in_file) - total_sfen_count += c; - - // number of exported phases - uint64_t write_sfen_count = 0; - - // Output the progress on the screen for each phase. - const uint64_t buffer_size = 10000000; - - auto print_status = [&]() - { - // Output progress every 10M phase or when all writing is completed - if (((write_sfen_count % buffer_size) == 0) || - (write_sfen_count == total_sfen_count)) - { - cout << write_sfen_count << " / " << total_sfen_count << endl; - } - }; - - cout << endl << "write : " << output_file_name << endl; - - fstream fs(output_file_name, ios::out | ios::binary); - - // total teacher positions - uint64_t sfen_count_left = total_sfen_count; - - while (sfen_count_left != 0) - { - auto r = prng.rand(sfen_count_left); - - // Aspects stored in fs[0] file ... Aspects stored in fs[1] file ... - //Think of it as a series like, and determine in which file r is pointing. - // The contents of the file are shuffled, so you can take the next element from that file. - // Each file has a_count[x] phases, so this process can be written as follows. - - uint64_t i = 0; - while (sfen_count_in_file[i] <= r) - r -= sfen_count_in_file[i++]; - - // This confirms n. Before you forget it, reduce the remaining number. - - --sfen_count_in_file[i]; - --sfen_count_left; - - PackedSfenValue psv; - // It's better to read and write all at once until the performance is not so good... - if (sfen_file_streams[i].read((char*)&psv, sizeof(PackedSfenValue))) - { - fs.write((char*)&psv, sizeof(PackedSfenValue)); - ++write_sfen_count; - print_status(); - } - } - - print_status(); - fs.close(); - - cout << "done!" << endl; - } - - // Subcontracting the teacher shuffle "learn shuffle" command. - // output_file_name: name of the output file where the shuffled teacher positions will be written - void shuffle_files(const vector& filenames, const string& output_file_name, uint64_t buffer_size, const std::string& seed) - { - // The destination folder is - // tmp/ for temporary writing - - // Temporary file is written to tmp/ folder for each buffer_size phase. - // For example, if buffer_size = 20M, you need a buffer of 20M*40bytes = 800MB. - // In a PC with a small memory, it would be better to reduce this. - // However, if the number of files increases too much, - // it will not be possible to open at the same time due to OS restrictions. - // There should have been a limit of 512 per process on Windows, so you can open here as 500, - // The current setting is 500 files x 20M = 10G = 10 billion phases. - - PSVector buf(buffer_size); - - // ↑ buffer, a marker that indicates how much you have used - uint64_t buf_write_marker = 0; - - // File name to write (incremental counter because it is a serial number) - uint64_t write_file_count = 0; - - // random number to shuffle - // Do not use std::random_device(). Because it always the same integers on MinGW. - PRNG prng(seed); - - // generate the name of the temporary file - auto make_filename = [](uint64_t i) - { - return "tmp/" + to_string(i) + ".bin"; - }; - - // Exported files in tmp/ folder, number of teacher positions stored in each - vector a_count; - - auto write_buffer = [&](uint64_t size) - { - Algo::shuffle(buf, prng); - - // write to a file - fstream fs; - fs.open(make_filename(write_file_count++), ios::out | ios::binary); - fs.write(reinterpret_cast(buf.data()), size * sizeof(PackedSfenValue)); - fs.close(); - a_count.push_back(size); - - buf_write_marker = 0; - cout << "."; - }; - - std::filesystem::create_directory("tmp"); - - // Shuffle and export as a 10M phase shredded file. - for (auto filename : filenames) - { - fstream fs(filename, ios::in | ios::binary); - cout << endl << "open file = " << filename; - while (fs.read(reinterpret_cast(&buf[buf_write_marker]), sizeof(PackedSfenValue))) - if (++buf_write_marker == buffer_size) - write_buffer(buffer_size); - - // Read in units of sizeof(PackedSfenValue), - // Ignore the last remaining fraction. (Fails in fs.read, so exit while) - // (The remaining fraction seems to be half-finished data - // that was created because it was stopped halfway during teacher generation.) - } - - if (buf_write_marker != 0) - write_buffer(buf_write_marker); - - // Only shuffled files have been written write_file_count. - // As a second pass, if you open all of them at the same time, - // select one at random and load one phase at a time - // Now you have shuffled. - - // Original file for shirt full + tmp file + file to write - // requires 3 times the storage capacity of the original file. - // 1 billion SSD is not enough for shuffling because it is 400GB for 10 billion phases. - // If you want to delete (or delete by hand) the - // original file at this point after writing to tmp, - // The storage capacity is about twice that of the original file. - // So, maybe we should have an option to delete the original file. - - // Files are opened at the same time. It is highly possible that this will exceed FOPEN_MAX. - // In that case, rather than adjusting buffer_size to reduce the number of files. - - vector afs; - for (uint64_t i = 0; i < write_file_count; ++i) - afs.emplace_back(fstream(make_filename(i), ios::in | ios::binary)); - - // Throw to the subcontract function and end. - shuffle_write(output_file_name, prng, afs, a_count); - } - - // Subcontracting the teacher shuffle "learn shuffleq" command. - // This is written in 1 pass. - // output_file_name: name of the output file where the shuffled teacher positions will be written - void shuffle_files_quick(const vector& filenames, const string& output_file_name, const std::string& seed) - { - // random number to shuffle - // Do not use std::random_device(). Because it always the same integers on MinGW. - PRNG prng(seed); - - // number of files - const size_t file_count = filenames.size(); - - // Number of teacher positions stored in each file in filenames - vector sfen_count_in_file(file_count); - - // Count the number of teacher aspects in each file. - vector sfen_file_streams(file_count); - - for (size_t i = 0; i < file_count; ++i) - { - auto filename = filenames[i]; - auto& fs = sfen_file_streams[i]; - - fs.open(filename, ios::in | ios::binary); - const uint64_t file_size = get_file_size(fs); - const uint64_t sfen_count = file_size / sizeof(PackedSfenValue); - sfen_count_in_file[i] = sfen_count; - - // Output the number of sfen stored in each file. - cout << filename << " = " << sfen_count << " sfens." << endl; - } - - // Since we know the file size of each file, - // open them all at once (already open), - // Select one at a time and load one phase at a time - // Now you have shuffled. - - // Throw to the subcontract function and end. - shuffle_write(output_file_name, prng, sfen_file_streams, sfen_count_in_file); - } - - // Subcontracting the teacher shuffle "learn shufflem" command. - // Read the whole memory and write it out with the specified file name. - void shuffle_files_on_memory(const vector& filenames, const string output_file_name, const std::string& seed) - { - PSVector buf; - - for (auto filename : filenames) - { - std::cout << "read : " << filename << std::endl; - read_file_to_memory(filename, [&buf](uint64_t size) { - assert((size % sizeof(PackedSfenValue)) == 0); - // Expand the buffer and read after the last end. - uint64_t last = buf.size(); - buf.resize(last + size / sizeof(PackedSfenValue)); - return (void*)&buf[last]; - }); - } - - // shuffle from buf[0] to buf[size-1] - // Do not use std::random_device(). Because it always the same integers on MinGW. - PRNG prng(seed); - uint64_t size = (uint64_t)buf.size(); - std::cout << "shuffle buf.size() = " << size << std::endl; - - Algo::shuffle(buf, prng); - - std::cout << "write : " << output_file_name << endl; - - // If the file to be written exceeds 2GB, it cannot be - // written in one shot with fstream::write, so use wrapper. - write_memory_to_file( - output_file_name, - (void*)&buf[0], - sizeof(PackedSfenValue) * buf.size()); - - std::cout << "..shuffle_on_memory done." << std::endl; - } - static void set_learning_search_limits() { // About Search::Limits @@ -1192,13 +946,6 @@ namespace Learner // --- Function that only shuffles the teacher aspect // normal shuffle - bool shuffle_normal = false; - uint64_t buffer_size = 20000000; - // fast shuffling assuming each file is shuffled - bool shuffle_quick = false; - // A function to read the entire file in memory and shuffle it. - // (Requires file size memory) - bool shuffle_on_memory = false; // Conversion of packed sfen. In plain, it consists of sfen(string), // evaluation value (integer), move (eg 7g7f, string), result (loss-1, win 1, draw 0) bool use_convert_plain = false; @@ -1318,13 +1065,6 @@ namespace Learner else if (option == "reduction_gameply") is >> reduction_gameply; - // shuffle related - else if (option == "shuffle") shuffle_normal = true; - else if (option == "buffer_size") is >> buffer_size; - else if (option == "shuffleq") shuffle_quick = true; - else if (option == "shufflem") shuffle_on_memory = true; - else if (option == "output_file_name") is >> output_file_name; - else if (option == "eval_limit") is >> eval_limit; else if (option == "save_only_once") save_only_once = true; else if (option == "no_shuffle") no_shuffle = true; @@ -1404,29 +1144,6 @@ namespace Learner cout << "base dir : " << base_dir << endl; cout << "target dir : " << target_dir << endl; - // shuffle mode - if (shuffle_normal) - { - cout << "buffer_size : " << buffer_size << endl; - cout << "shuffle mode.." << endl; - shuffle_files(filenames, output_file_name, buffer_size, seed); - return; - } - - if (shuffle_quick) - { - cout << "quick shuffle mode.." << endl; - shuffle_files_quick(filenames, output_file_name, seed); - return; - } - - if (shuffle_on_memory) - { - cout << "shuffle on memory.." << endl; - shuffle_files_on_memory(filenames, output_file_name, seed); - return; - } - if (use_convert_plain) { Eval::NNUE::init();