import os, sys, re, json
import argparse
#import whisper_timestamped as wt
from pdb import set_trace as b
from pprint import pprint as pp
from profanity_check import predict, predict_prob
from pydub import AudioSegment
from pydub.playback import play
from subprocess import Popen, PIPE


def parse_args():
    """Parse command-line arguments."""
    parser = argparse.ArgumentParser(
        description=('Tool to mute profanities in a song (source separation -> speech recognition -> '
                     'profanity detection -> mask profanities -> re-mix)'),
        usage=('see or run as local web app with streamlit: ')
    )
    parser.add_argument(
        '-i', '--input',
        default=None,
        nargs='?',
        #required=True,
        help=("path to an input mp3 or wav file")
    )
    parser.add_argument(
        '-m', '--model',
        default='small',
        nargs='?',
        help=("model used by whisper for speech recognition: tiny, small (default), medium or large")
    )
    parser.add_argument(
        '-p', '--play',
        default=False,
        action='store_true',
        help=("play output audio at the end")
    )
    parser.add_argument(
        '-v', '--verbose',
        default=False,
        action='store_true',
        help=("print transcribed text and detected profanities to screen")
    )
    return parser.parse_args()


def main(args, input_file=None, model_size=None, verbose=False, play_output=False):
    """Run the full pipeline: source separation -> transcription -> profanity detection -> masking -> re-mix."""
    if not input_file:
        input_file = args.input
    if not model_size:
        model_size = args.model
    if not verbose:
        verbose = args.verbose
    if not play_output:
        play_output = args.play

    # exit if input file not found
    if not os.path.isfile(input_file):
        print('Error: --input file not found')
        sys.exit()
    print(f'\nProcessing input file: {input_file}')

    # split audio into vocals + accompaniment
    print('Running source separation')
    stems_dir = source_separation(input_file)
    vocal_stem = os.path.join(stems_dir, 'vocals.wav')
    instr_stem = os.path.join(stems_dir, 'no_vocals.wav')
    print(f'Vocal stem written to: {vocal_stem}')

    # speech recognition (audio -> text with word-level timestamps)
    print('Transcribe vocal stem into text with word-level timestamps')
    #cmd = f'whisper_timestamped --task transcribe --model {model_size} {vocal_stem}'
    #stdout, stderr = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True, executable='/bin/bash').communicate()
    #text = json.loads('\n'.join(stdout.decode('utf8').split('\n')[1:]))
    import whisper_timestamped as wt
    audio = wt.load_audio(vocal_stem)
    model = wt.load_model(model_size, device='cpu')
    text = wt.transcribe(model, audio, language='en')
    if verbose:
        print('\nTranscribed text:')
        print(text['text'] + '\n')

    # check for profanities in the transcribed text
    print('Run profanity detection on text')
    profanities = profanity_detection(text)
    if not profanities:
        print(f'No profanities found in {input_file} - exiting')
        sys.exit()
    if verbose:
        print('profanities found in text:')
        pp(profanities)

    # mask profanities in the vocal stem
    print('Mask profanities in vocal stem')
    vocals = mask_profanities(vocal_stem, profanities)

    # re-mix instrumentals with the masked vocals
    print('Merge instrumentals stem and masked vocals stem')
    mix = AudioSegment.from_wav(instr_stem).overlay(vocals)

    # write mix to file
    outpath = input_file.replace('.mp3', '_masked.mp3').replace('.wav', '_masked.wav')
    if input_file.endswith('.wav'):
        mix.export(outpath, format="wav")
    elif input_file.endswith('.mp3'):
        mix.export(outpath, format="mp3")
    print(f'Mixed file written to: {outpath}')

    # play output
    if play_output:
        print('\nPlaying output...')
        play(mix)

    return outpath


def source_separation(inpath):
    """Execute shell command to run demucs and pipe stdout/stderr back to python."""
    cmd = f'demucs --two-stems=vocals --jobs 8 "{inpath}"'
    stdout, stderr = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True, executable='/bin/bash').communicate()
    stdout = stdout.decode('utf8')
    # exit if demucs errored out (demucs logs progress to stderr, so only bail on actual errors)
    if stderr:
        stderr = stderr.decode('utf-8').lower()
        if 'error' in stderr or 'not exist' in stderr:
            print(stderr.split('\n')[0])
            sys.exit()

    # parse stems directory path from stdout and return it if successful
    stems_dir = ''.join(re.findall('/.*', stdout)).replace('.mp3', '').replace('.wav', '').replace('samples/', '')
    if not os.path.isdir(stems_dir):
        print(f'Error: output stem directory "{stems_dir}" not found')
        sys.exit()
    return stems_dir


def profanity_detection(text):
    """Return the list of transcribed words (with timestamps) flagged as profanities."""
    # detect profanities in text
    profs = []
    for segment in text['segments']:
        for word in segment['words']:
            #if word['confidence'] < .25:
            #    print(word)
            word_text = word['text'].replace('.', '').replace(',', '').lower()
            # skip known false positives
            if word_text in ['cancer', 'hell', 'junk', 'die', 'lame', 'freak', 'freaky', 'white', 'stink',
                             'shut', 'spit', 'mouth', 'orders', 'eat', 'clouds']:
                continue
            # assume anything returned by whisper with more than one * is a profanity, e.g. n***a
            if '**' in word_text:
                profs.append(word)
                continue
            # catch known false negatives that the classifier misses
            if word_text in ['bitchy', 'puss']:
                profs.append(word)
                continue
            # run profanity detection - returns 1 (True) or 0 (False)
            if predict([word['text']])[0]:
                profs.append(word)
    return profs


def mask_profanities(vocal_stem, profanities):
    """Attenuate the vocal stem over each detected profanity's start/end timestamps."""
    # load vocal stem and mask profanities
    vocals = AudioSegment.from_wav(vocal_stem)
    for prof in profanities:
        mask = vocals[prof['start']*1000:prof['end']*1000]  # pydub works in milliseconds
        mask -= 50  # reduce level by 50 dB (enough to effectively mute it)
        #mask = mask.silent(len(mask))
        #mask = mask.fade_in(100).fade_out(100)  # it prepends/appends fades so we end up with a longer mask
        start = vocals[:prof['start']*1000]
        end = vocals[prof['end']*1000:]
        #print(f"masking {prof['text']} from {prof['start']} to {prof['end']}")
        vocals = start + mask + end
    return vocals


if __name__ == "__main__":
    args = parse_args()
    if len(sys.argv) > 1:
        # command-line mode
        main(args)
    else:
        # no CLI arguments: run as a Streamlit web app
        import streamlit as st
        st.title('Saylss')
        model = st.selectbox('Choose model size:', ('tiny', 'small', 'medium', 'large'), index=1)
        uploaded_file = st.file_uploader("Choose input track", type=[".mp3", ".wav"], accept_multiple_files=False)
        if uploaded_file is not None:
            # display input audio
            #st.text('Play input track:')
            audio_bytes_input = uploaded_file.read()
            st.audio(audio_bytes_input, format='audio/wav')
            # run the pipeline
            with st.spinner('Processing input audio...'):
                outpath = main(args, input_file=os.path.join('audio/samples', uploaded_file.name), model_size=model)
            # display output audio
            #st.text('Play output track:')
            st.text('\nOutput:')
            with open(outpath, 'rb') as audio_file:
                audio_bytes = audio_file.read()
            st.audio(audio_bytes, format='audio/wav')
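

# Example usage (a sketch; the script filename "saylss.py" and the sample path are
# assumptions for illustration, not part of the source):
#
#   # CLI mode: process a local file with the default "small" whisper model
#   python saylss.py --input audio/samples/song.mp3 --verbose --play
#
#   # Web app mode: launch the Streamlit UI (no CLI arguments)
#   streamlit run saylss.py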