Fairseq
English
File size: 507 Bytes
f5feb4c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
from sys import argv

def split_into_shards(lines, num_line):
    """Group *lines* into shards that end on a blank line.

    A shard is closed at the first empty line (``''``) found once at least
    *num_line* lines have accumulated since the previous boundary. The
    closing blank line is kept as the last element of its shard. Any lines
    left after the final boundary form one last shard.

    Args:
        lines: List of text lines without trailing newlines.
        num_line: Minimum number of lines that must precede the blank line
            that closes a shard.

    Returns:
        A list of shards, each a list of lines. Empty input yields ``[]``.
    """
    shards = []
    ckpt = 0  # index of the first line not yet assigned to a shard
    for i, line in enumerate(lines):
        if line == '' and (i - ckpt) >= num_line:
            shards.append(lines[ckpt:i + 1])
            ckpt = i + 1

    # Compare against len(lines), not len(lines) - 1: a remainder made of
    # a single line must still be written out rather than silently dropped.
    if ckpt < len(lines):
        shards.append(lines[ckpt:])
    return shards


def main(args=None):
    """Split a text file into numbered shard files.

    Args:
        args: Optional sequence ``[filename, num_line, output_dir]``. When
            omitted, the values are taken from the command line
            (``argv[1]``, ``argv[2]``, ``argv[3]``).

    Raises:
        IndexError: If fewer than three arguments are supplied.
        ValueError: If ``num_line`` is not an integer literal.
    """
    if args is None:
        args = argv[1:]
    filename = args[0]
    num_line = int(args[1])  # converted once instead of on every line
    output_dir = args[2]

    # Use a context manager so the input handle is closed promptly.
    with open(filename) as f:
        text = f.read().strip()

    # ''.split('\n') returns [''], which would otherwise become a shard
    # containing a single blank line for an empty input file.
    lines = text.split('\n') if text else []

    for i, doc in enumerate(split_into_shards(lines, num_line)):
        with open(f'{output_dir}/{i:06}.txt', 'w') as f:
            print('\n'.join(doc), file=f, end='\n')


if __name__ == '__main__':
    main()