PyTorch
ssl-aasist
custom_code
ash56's picture
Add files using upload-large-folder tool
010952f verified
raw
history blame
1.29 kB
import argparse
import os
import sys

from tqdm import tqdm

from build_emov_translation_manifests import dedup, remove_under_k
def _add_suffix(path, tag):
    """Return *path* with *tag* inserted into its file name.

    The tag goes right before the last ``.km`` in the file name
    (``dir/a.km`` -> ``dir/a<tag>.km``).  When the file name has no ``.km``,
    the tag goes before the final extension instead, so the result always
    differs from *path* and the input file can never be overwritten by
    accident.  Directory components are left untouched.
    """
    directory, filename = os.path.split(path)
    stem, marker, rest = filename.rpartition(".km")
    if marker:
        filename = f"{stem}{tag}.km{rest}"
    else:
        root, ext = os.path.splitext(filename)
        filename = f"{root}{tag}{ext}"
    return os.path.join(directory, filename)


def main(argv=None):
    """Process a km file line by line and write the result to a new file.

    This is a standalone script: depending on the flags it removes tokens
    that repeat fewer than k times in a row (``--remove-under-k``) and/or
    dedups tokens (``--dedup``).  When both are requested, removal runs
    before dedup on each line.

    The output file name is derived from ``--output`` (or from the input
    path when ``--output`` is not given) by inserting ``-k<K>`` and/or
    ``-deduped`` before the ``.km`` extension.

    Args:
        argv: Optional list of command-line arguments; defaults to
            ``sys.argv[1:]`` when ``None``.
    """
    parser = argparse.ArgumentParser(
        description="Dedup a km file and/or remove tokens that repeat "
                    "less than k times in a row."
    )
    parser.add_argument("km", type=str, help="path to km file")
    parser.add_argument("--dedup", action="store_true")
    parser.add_argument("--remove-under-k", type=int, default=0)
    parser.add_argument("--output", default=None)
    args = parser.parse_args(argv)

    # A non-positive k disables removal; without --dedup there is then
    # nothing to do, and we must not rewrite the input file in place.
    if not args.dedup and args.remove_under_k <= 0:
        print("nothing to do! quitting...")
        sys.exit(0)

    with open(args.km, "r") as km_file:
        km = km_file.readlines()

    out = []
    for line in tqdm(km):
        if args.remove_under_k > 0:
            line = remove_under_k(line, args.remove_under_k)
        if args.dedup:
            line = dedup(line)
        out.append(line)

    path = args.km if args.output is None else args.output
    if args.remove_under_k > 0:
        path = _add_suffix(path, f"-k{args.remove_under_k}")
    if args.dedup:
        path = _add_suffix(path, "-deduped")

    with open(path, "w") as out_file:
        out_file.writelines(out)
    print(f"written to {path}")


if __name__ == "__main__":
    main()