PyTorch
ssl-aasist
custom_code
File size: 1,290 Bytes
010952f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import sys
import argparse
from tqdm import tqdm
from build_emov_translation_manifests import dedup, remove_under_k


def tag_km_path(path, tag):
    """Insert ``tag`` in front of the last ".km" found in ``path``.

    Args:
        path: output path, expected to contain a ".km" extension.
        tag: text spliced in before the extension, e.g. "-k3" or "-deduped".

    Returns:
        The tagged path, or ``path`` unchanged when it contains no ".km".
    """
    # Only the last occurrence is rewritten: a plain str.replace would also
    # rewrite a ".km" that appears in a directory name and send the output
    # to a different (probably missing) directory.
    head, ext, tail = path.rpartition(".km")
    if not ext:
        return path
    return f"{head}{tag}{ext}{tail}"


def main():
    """
    this is a standalone script to process a km file
    specifically, to dedup or remove tokens that repeat less
    than k times in a row

    The processed lines are written next to the input (or to --output),
    with "-k<N>" and/or "-deduped" inserted before the ".km" extension.
    """
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("km", type=str, help="path to km file")
    parser.add_argument("--dedup", action='store_true')
    parser.add_argument("--remove-under-k", type=int, default=0)
    parser.add_argument("--output", default=None)
    args = parser.parse_args()

    if not args.dedup and args.remove_under_k == 0:
        print("nothing to do! quitting...")
        sys.exit(0)

    # Read everything up front: tqdm gets a total for its progress bar, and
    # the input is fully consumed before any output file is opened.
    with open(args.km, "r") as km_file:
        km = km_file.readlines()

    out = []
    for line in tqdm(km):
        # Order matters: short runs are removed first, then the line is deduped.
        if args.remove_under_k > 0:
            line = remove_under_k(line, args.remove_under_k)
        if args.dedup:
            line = dedup(line)
        out.append(line)

    # NOTE: if the chosen path contains no ".km" it is used as-is, so without
    # --output the input file itself is overwritten.
    path = args.km if args.output is None else args.output
    if args.remove_under_k > 0:
        path = tag_km_path(path, f"-k{args.remove_under_k}")
    if args.dedup:
        path = tag_km_path(path, "-deduped")

    # The context manager guarantees the data is flushed and the handle closed
    # before success is reported.
    with open(path, "w") as out_file:
        out_file.writelines(out)
    print(f"written to {path}")


if __name__ == "__main__":
    main()