File size: 507 Bytes
f5feb4c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 |
from sys import argv
def _split_into_shards(lines, min_lines):
    """Group *lines* into shards, cutting only at blank lines.

    A cut is made at a blank line once at least *min_lines* lines have
    accumulated since the previous cut; that blank line is kept as the last
    line of the shard it closes. Everything after the final cut becomes the
    last shard.

    Args:
        lines: The input text as a list of lines (no trailing newlines).
        min_lines: Minimum number of lines before a blank line may end a shard.

    Returns:
        A list of shards, each a list of lines.
    """
    shards = []
    start = 0
    for i, line in enumerate(lines):
        if line == '' and i - start >= min_lines:
            shards.append(lines[start:i + 1])
            start = i + 1
    # Compare against len(lines), not len(lines) - 1: the stricter test
    # silently dropped a remainder that was exactly one line long.
    if start < len(lines):
        shards.append(lines[start:])
    return shards


def main(args):
    """Split a text file into numbered shard files.

    Args:
        args: Command-line style list: ``[prog, filename, num_line,
            output_dir]``. ``num_line`` is parsed with ``int``.

    Writes each shard to ``{output_dir}/{index:06}.txt``, terminated by a
    newline. ``output_dir`` is not created here; it must already exist.

    Raises:
        SystemExit: If fewer than three arguments follow the program name.
        ValueError: If ``num_line`` is not an integer.
    """
    if len(args) < 4:
        raise SystemExit('usage: <script> FILENAME NUM_LINE OUTPUT_DIR')
    filename = args[1]
    num_line = int(args[2])  # parsed once instead of on every loop iteration
    output_dir = args[3]

    # Context manager so the input handle is closed deterministically.
    with open(filename) as src:
        text = src.read().strip()

    # ''.split('\n') would give [''], i.e. one bogus empty line; treat an
    # empty (or whitespace-only) input as having no lines at all.
    lines = text.split('\n') if text else []

    for i, doc in enumerate(_split_into_shards(lines, num_line)):
        with open(f'{output_dir}/{i:06}.txt', 'w') as f:
            f.write('\n'.join(doc) + '\n')


if __name__ == '__main__':
    main(argv)
|