-
Notifications
You must be signed in to change notification settings - Fork 0
/
split_into_batches
executable file
·77 lines (61 loc) · 1.89 KB
/
split_into_batches
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/usr/bin/env python3
"""
Copy or move files into batches of fixed size (apart from the last one).
"""
import sys
from pathlib import Path
from shutil import move
import subprocess
DEFAULT_PREFIX = '00_batch_'
def sublists(size):
    """Yield the entries of the current directory in sorted chunks.

    Every direct child of the current working directory (files and
    directories alike, as matched by ``Path('.').glob('*')``) is listed
    once, sorted, and then handed out in consecutive slices.

    Args:
        size: maximum number of entries per chunk; must be at least 1.

    Yields:
        Lists of ``Path`` objects. Each list holds ``size`` entries, except
        possibly the last one, which holds the remainder.

    Raises:
        ValueError: if ``size`` is smaller than 1. Because this is a
            generator, the check runs when iteration starts, not when the
            function is called.
    """
    if size < 1:
        raise ValueError(f'batch size must be at least 1, got {size!r}')
    # The listing is taken once, before the first chunk is yielded, so
    # anything created in the directory while the caller consumes the
    # generator is not picked up.
    all_files = sorted(Path('.').glob('*'))
    for start in range(0, len(all_files), size):
        yield all_files[start:start + size]
def copy(src, dst):
    """Copy ``src`` (file or directory tree) to ``dst`` using ``cp -r``.

    Delegating to ``cp`` is simpler than fiddling with ``copytree``.

    Args:
        src: path of the file or directory to copy, as a string.
        dst: destination path, as a string.

    Raises:
        subprocess.CalledProcessError: if ``cp`` exits with a non-zero
            status (e.g. the source does not exist).
    """
    # '--' ends option parsing so a name starting with '-' is treated as a
    # path; check=True turns a failed copy into an exception instead of
    # letting it pass unnoticed.
    subprocess.run(['cp', '-r', '--', src, dst], check=True)
def main_generic(size, action, prefix=DEFAULT_PREFIX):
    """Distribute the current directory's entries over numbered batch folders.

    For each chunk produced by ``sublists(size)``, a new folder is created
    in the current directory. Its name is ``prefix`` followed by the chunk
    index formatted with the ``0=7_d`` format spec. ``action`` is then
    called for every entry of the chunk.

    Args:
        size: maximum number of entries per batch, forwarded to ``sublists``.
        action: callable invoked as ``action(src, dst)`` with both paths
            given as strings, e.g. ``move`` or ``copy``.
        prefix: leading part of every batch folder name.
    """
    here = Path('.')
    for index, files_in_batch in enumerate(sublists(size)):
        folder_name = prefix + "{0:0=7_d}".format(index)
        batch_path = here / folder_name
        # mkdir() is called without exist_ok, so an already existing batch
        # folder raises FileExistsError and stops the run.
        batch_path.mkdir()
        for f in files_in_batch:
            print(f'{action.__name__} on {f} to {batch_path}')
            action(str(f), str(batch_path))
def main_move(size):
    """Batch the current directory's entries by moving them (``shutil.move``).

    Args:
        size: maximum number of entries per batch.
    """
    main_generic(size, action=move)
def main_copy(size):
    """Batch the current directory's entries by copying them (``copy``).

    Args:
        size: maximum number of entries per batch.
    """
    main_generic(size, action=copy)
def unbatch(prefix=DEFAULT_PREFIX):
    """Undo a batching: move the content of every batch folder back here.

    Every directory of the current working directory whose name starts with
    ``prefix`` is emptied and removed. Its entries end up directly in the
    current directory.

    Args:
        prefix: leading part of the batch folder names to look for.

    Raises:
        shutil.Error: propagated from ``move`` when an entry with the same
            name already exists at the destination.
    """
    import tempfile  # local import: only this function needs it

    p = Path('.')
    # Snapshot the batch folders before the directory is modified, and skip
    # plain files that merely share the prefix (iterdir() would fail on them).
    batch_folders = sorted(d for d in p.glob(prefix + '*') if d.is_dir())
    # Entries are parked in a staging folder first, to prevent unbatching
    # recursively. mkdtemp picks a name that does not exist yet, so a
    # leftover staging folder from an interrupted run cannot make this fail.
    unbatching_folder = Path(
        tempfile.mkdtemp(prefix='zzzzzz_unbatching_', dir=str(p))
    )
    for batch_folder in batch_folders:
        # list(): do not move entries out of a folder while still listing it.
        for f in list(batch_folder.iterdir()):
            move(str(f), str(unbatching_folder))
        batch_folder.rmdir()
    for f in list(unbatching_folder.iterdir()):
        move(str(f), str(p))
    unbatching_folder.rmdir()
if __name__ == '__main__':
    # Command line:
    #   <script> m SIZE   move the entries into batches of at most SIZE
    #   <script> c SIZE   copy the entries into batches of at most SIZE
    #   <script> u        undo a batching
    # Arguments are validated explicitly (not with assert, which is
    # stripped under ``python -O``); any misuse exits with a usage message.
    usage = (
        f'usage: {sys.argv[0]} m SIZE | c SIZE | u\n'
        '  m SIZE  move files into batches of at most SIZE entries\n'
        '  c SIZE  copy files into batches of at most SIZE entries\n'
        '  u       move the content of the batch folders back'
    )
    args = sys.argv[1:]
    if not args:
        sys.exit(usage)
    mode = args[0]
    if mode in ('m', 'c'):
        if len(args) != 2:
            sys.exit(usage)
        try:
            # arg: size max of batches
            size = int(args[1])
        except ValueError:
            sys.exit(f'invalid batch size: {args[1]!r}\n{usage}')
        if mode == 'm':
            main_move(size)
        else:
            main_copy(size)
    elif mode == 'u':
        if len(args) != 1:
            sys.exit(usage)
        unbatch()
    else:
        sys.exit(f'unknown mode: {mode!r}\n{usage}')