From bf187c46c82d08ca8cef48a3e6e52ca64ee607e5 Mon Sep 17 00:00:00 2001 From: Joost VandeVondele Date: Wed, 26 May 2021 15:12:08 +0200 Subject: [PATCH] Add a tool to interleave binpacks This tool will take N binpacks as input to produce 1 binpack as output. The input binpacks are read in random order, with a probability related to their size, but each input file is read sequentially. The output is thus an appropriately shuffled binpack. The tool is much faster than cat'ing the files together followed by a shuffle. It assumes that the input binpacks themselves have no particular internal ordering. --- script/interleave_binpacks.py | 86 +++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 script/interleave_binpacks.py diff --git a/script/interleave_binpacks.py b/script/interleave_binpacks.py new file mode 100644 index 00000000..02888b33 --- /dev/null +++ b/script/interleave_binpacks.py @@ -0,0 +1,86 @@ +import struct +import sys +import os +import random +from pathlib import Path + + +def copy_next_chunk(in_file, out_file): + chunk_header = in_file.read(8) + assert chunk_header[0:4] == b"BINP" + size = struct.unpack(" 0: + where = random.randrange(total_remaining) + i = 0 + while where >= in_files_remaining[i]: + where -= in_files_remaining[i] + i += 1 + size = copy_next_chunk(in_files[i], out_file) + in_files_remaining[i] -= size + total_remaining -= size + total_size += size + mib = total_size // 1024 // 1024 + if mib // 100 != prev_mib // 100: + print("Copied {} MiB".format(mib)) + prev_mib = mib + + out_file.close() + for in_file in in_files: + in_file.close() + + print("Merged {} bytes".format(total_size)) + + +main()