import sys import argparse from tqdm import tqdm from build_emov_translation_manifests import dedup, remove_under_k if __name__ == "__main__": """ this is a standalone script to process a km file specifically, to dedup or remove tokens that repeat less than k times in a row """ parser = argparse.ArgumentParser(description="") parser.add_argument("km", type=str, help="path to km file") parser.add_argument("--dedup", action='store_true') parser.add_argument("--remove-under-k", type=int, default=0) parser.add_argument("--output", default=None) args = parser.parse_args() if not args.dedup and args.remove_under_k == 0: print("nothing to do! quitting...") sys.exit(0) km = open(args.km, "r").readlines() out = [] for line in tqdm(km): if args.remove_under_k > 0: line = remove_under_k(line, args.remove_under_k) if args.dedup: line = dedup(line) out.append(line) path = args.km if args.output is None else args.output if args.remove_under_k > 0: path = path.replace(".km", f"-k{args.remove_under_k}.km") if args.dedup: path = path.replace(".km", f"-deduped.km") open(path, "w").writelines(out) print(f"written to {path}")