r"""Print a JSON-lines manifest pairing audio files with their unit codes.

Reads a fairseq-style tsv file (first line: root directory of the audio
files; every following line: ``<relative wav path>\t<number of frames>``)
and a km file holding one line of codes per audio file, in the same order.
For every pair, one JSON object is printed to stdout with the keys
``"audio"``, ``"duration"`` (seconds) and the name given by ``--km_type``.

usage:
python create_hifigan_manifest.py \
    --tsv /checkpoint/felixkreuk/datasets/vctk/manifests/vctk_16khz/valid.tsv \
    --km /checkpoint/felixkreuk/datasets/vctk/manifests/vctk_16khz/hubert_km_100/valid.km \
    --km_type hubert \
    > /checkpoint/felixkreuk/datasets/vctk/manifests/vctk_16khz/hubert_km_100/hifigan_valid_manifest.txt
"""
import argparse
import json

import torchaudio


def _read_lines(path):
    """Return every line of the text file at *path*, closing the file afterwards."""
    with open(path, "r") as handle:
        return handle.readlines()


def main():
    """Parse the command line and print one JSON manifest entry per audio file.

    Raises:
        AssertionError: if the tsv and km files describe a different number of
            audio files, or if a frame count listed in the tsv differs from
            the frame count reported by ``torchaudio.info``.
        ValueError: if a tsv entry does not consist of exactly two
            tab-separated fields, or its frame count is not an integer.
    """
    parser = argparse.ArgumentParser(description="example: python create_hifigan_manifest.py --tsv /checkpoint/felixkreuk/datasets/vctk/splits/vctk_16khz/train.tsv --km /checkpoint/felixkreuk/experiments/hubert/hubert_feats/vctk_16khz_km_100/train.km --km_type hubert_100km > ~/tmp/tmp_mani.txt")
    parser.add_argument("--tsv", required=True, help="path to fairseq tsv file")
    parser.add_argument("--km", required=True, help="path to a km file generated by HuBERT clustering")
    parser.add_argument("--km_type", required=True, help="name of the codes in the output json (for example: 'cpc_100km')")
    args = parser.parse_args()

    # Context-managed reads so both file handles are closed deterministically.
    km_lines = _read_lines(args.km)
    tsv_lines = _read_lines(args.tsv)

    # The checks below raise AssertionError explicitly instead of using the
    # `assert` statement: the statement is removed under `python -O`, which
    # would silently disable the validation. The exception type is kept so
    # existing callers/wrappers see the same failure.
    if len(km_lines) != len(tsv_lines) - 1:
        raise AssertionError("tsv and km files are not of the same length!")

    # The first tsv line is the root directory; the rest are the entries.
    wav_root = tsv_lines[0].strip()
    tsv_entries = tsv_lines[1:]

    # Entries start on line 2 of the tsv file (line 1 is the root directory).
    for line_no, (tsv_line, km_line) in enumerate(zip(tsv_entries, km_lines), start=2):
        tsv_line, km_line = tsv_line.strip(), km_line.strip()

        fields = tsv_line.split("\t")
        if len(fields) != 2:
            raise ValueError(
                f"malformed tsv line {line_no}: expected 2 tab-separated "
                f"fields, got {len(fields)}: {tsv_line!r}"
            )
        wav_basename, wav_num_frames = fields

        # Plain concatenation (not os.path.join) keeps the emitted "audio"
        # value identical to what earlier versions of this script produced.
        wav_path = wav_root + "/" + wav_basename
        wav_info = torchaudio.info(wav_path)
        if int(wav_num_frames) != wav_info.num_frames:
            raise AssertionError("tsv duration and actual duration don't match!")

        wav_duration = wav_info.num_frames / wav_info.sample_rate
        manifest_line = {"audio": wav_path, "duration": wav_duration, args.km_type: km_line}
        print(json.dumps(manifest_line))


if __name__ == "__main__":
    main()