import os
import time
import numpy as np
import tempfile
from scipy.io import wavfile
from pytube import YouTube
import gradio as gr
from moviepy.editor import AudioFileClip
from inference import EnsembleDemucsMDXMusicSeparationModel, predict_with_model
import torch
import librosa
import librosa.display
import matplotlib.pyplot as plt


def download_youtube_video_as_wav(youtube_url):
    output_dir = "downloads"
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, "temp.mp4")

    try:
        yt = YouTube(youtube_url)
        yt.streams.filter(only_audio=True).first().download(filename=output_file)
        print("Download completed successfully.")
    except Exception as e:
        print(f"An error occurred while downloading the video: {e}")
        return None

    # Convert the downloaded mp4 audio to wav
    wav_file = os.path.join(output_dir, "mixture.wav")
    clip = AudioFileClip(output_file)
    clip.write_audiofile(wav_file)
    return wav_file
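
# A hedged usage sketch for the downloader above (the URL is only a
# placeholder, not a real video):
#
#   wav_path = download_youtube_video_as_wav("https://www.youtube.com/watch?v=...")
#   if wav_path is not None:
#       print(f"Mixture saved to {wav_path}")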


def check_file_readiness(filepath):
    # Poll the file size; once it has stayed the same for five consecutive
    # checks (roughly 2.5 seconds at 0.5 s per poll), assume the writer has
    # finished and the file is ready.
    num_same_size_checks = 0
    last_size = -1
    while num_same_size_checks < 5:
        current_size = os.path.getsize(filepath)
        if current_size == last_size:
            num_same_size_checks += 1
        else:
            num_same_size_checks = 0
        last_size = current_size
        time.sleep(0.5)
    return True
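
# Usage sketch, assuming "results/mixture_vocals.wav" is a path that another
# process may still be writing (hypothetical path): the call blocks until the
# size is stable, then returns True.
#
#   if os.path.exists("results/mixture_vocals.wav"):
#       check_file_readiness("results/mixture_vocals.wav")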


def generate_spectrogram(audio_file_path):
    y, sr = librosa.load(audio_file_path)
    plt.figure(figsize=(10, 4))
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
    librosa.display.specshow(librosa.power_to_db(S, ref=np.max),
                             y_axis='mel', fmax=8000, x_axis='time')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Mel spectrogram')
    plt.tight_layout()
    image_path = tempfile.mktemp('.png')
    plt.savefig(image_path)
    plt.close()
    return image_path
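
# Minimal sketch of the spectrogram helper, assuming a local file
# "downloads/mixture.wav" exists (hypothetical path):
#
#   png_path = generate_spectrogram("downloads/mixture.wav")
#   print(f"Spectrogram written to {png_path}")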


def generate_spectrograms(audio_files):
    output_spectrograms = []
    for audio_file in audio_files:
        output_spectrograms.append(generate_spectrogram(audio_file))
    return tuple(output_spectrograms)


def separate_music_file_wrapper(input_string, use_cpu, use_single_onnx, large_overlap, small_overlap, chunk_size, use_large_gpu):
    input_files = []

    # Accept either a YouTube URL or a local directory of .wav files
    if input_string.startswith("https://www.youtube.com") or input_string.startswith("https://youtu.be"):
        output_file = download_youtube_video_as_wav(input_string)
        if output_file is not None:
            input_files.append(output_file)
    elif os.path.isdir(input_string):
        input_directory = input_string
        input_files = [os.path.join(input_directory, f) for f in os.listdir(input_directory) if f.endswith('.wav')]
    else:
        raise ValueError("Invalid input! Please provide a valid YouTube link or a directory path.")

    # Validate overlap values
    if not (0 <= large_overlap <= 1) or not (0 <= small_overlap <= 1):
        raise ValueError("Overlap values must be between 0 and 1.")

    # Validate chunk size
    if chunk_size <= 0:
        raise ValueError("Chunk size must be greater than 0.")

    options = {
        'input_audio': input_files,
        'output_folder': 'results',
        'cpu': use_cpu,
        'single_onnx': use_single_onnx,
        'overlap_large': large_overlap,
        'overlap_small': small_overlap,
        'chunk_size': chunk_size,
        'large_gpu': use_large_gpu,
    }

    predict_with_model(options)

    # Clear GPU cache after inference
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    # Collect the expected stem paths. Note that if several input files were
    # processed, each pass overwrites the dict, so only the last file's stems
    # are returned (the UI exposes a single set of six outputs).
    output_files = {}
    for f in input_files:
        audio_file_name = os.path.splitext(os.path.basename(f))[0]
        output_files["vocals"] = os.path.join(options['output_folder'], audio_file_name + "_vocals.wav")
        output_files["instrumental"] = os.path.join(options['output_folder'], audio_file_name + "_instrum.wav")
        output_files["instrumental2"] = os.path.join(options['output_folder'], audio_file_name + "_instrum2.wav")  # Second instrumental output
        output_files["bass"] = os.path.join(options['output_folder'], audio_file_name + "_bass.wav")
        output_files["drums"] = os.path.join(options['output_folder'], audio_file_name + "_drums.wav")
        output_files["other"] = os.path.join(options['output_folder'], audio_file_name + "_other.wav")

    # Check the readiness of the files; substitute a second of silence for any
    # missing stem so the UI still receives six playable outputs
    output_files_ready = []
    for k, v in output_files.items():
        if os.path.exists(v) and check_file_readiness(v):
            output_files_ready.append(v)
        else:
            empty_data = np.zeros((44100, 2))  # 2 channels, 1 second of silence at 44100 Hz
            empty_file = tempfile.mktemp('.wav')
            wavfile.write(empty_file, 44100, empty_data.astype(np.int16))  # Write as 16-bit PCM for broad compatibility
            output_files_ready.append(empty_file)

    # Generate spectrograms right after separating the audio
    output_spectrograms = generate_spectrograms(output_files_ready)
    print(len(output_files_ready))   # should print 6
    print(len(output_spectrograms))  # should print 6

    return tuple(output_files_ready) + output_spectrograms

description = """
# ZFTurbo Web-UI
Web-UI by [Ma5onic](https://github.com/Ma5onic)
## Options:
- **Use CPU Only:** Select this if you do not have enough GPU memory. Processing will be slower.
- **Use Single ONNX:** Select this to use a single ONNX model. It will decrease quality a little bit but can help with GPU memory usage.
- **Large Overlap:** The overlap for large chunks. Adjust as needed.
- **Small Overlap:** The overlap for small chunks. Adjust as needed.
- **Chunk Size:** The size of chunks to be processed at a time. Reduce this if facing memory issues.
- **Use Fast Large GPU Version:** Select this to use the older, faster method, which requires more than 11 GB of GPU memory.
"""

theme = gr.themes.Base(
    primary_hue="cyan",
    secondary_hue="cyan",
)

with gr.Blocks(theme=theme) as demo:
    gr.Markdown(description)

    # Inputs
    input_string = gr.Text(label="YouTube Link/URL")
    use_cpu = gr.Checkbox(label="Use CPU Only", value=True)
    use_single_onnx = gr.Checkbox(label="Use Single ONNX", value=False)
    large_overlap = gr.Number(label="Large Overlap", value=0.6)
    small_overlap = gr.Number(label="Small Overlap", value=0.5)
    chunk_size = gr.Number(label="Chunk Size", value=1000000)
    use_large_gpu = gr.Checkbox(label="Use Fast Large GPU Version", value=False)
    process_button = gr.Button("Process Audio")

    # Outputs: one audio player and one spectrogram image per stem
    vocals = gr.Audio(label="Vocals")
    vocals_spectrogram = gr.Image(label="Vocals Spectrogram")
    instrumental = gr.Audio(label="Instrumental")
    instrumental_spectrogram = gr.Image(label="Instrumental Spectrogram")
    instrumental2 = gr.Audio(label="Instrumental 2")
    instrumental2_spectrogram = gr.Image(label="Instrumental 2 Spectrogram")
    bass = gr.Audio(label="Bass")
    bass_spectrogram = gr.Image(label="Bass Spectrogram")
    drums = gr.Audio(label="Drums")
    drums_spectrogram = gr.Image(label="Drums Spectrogram")
    other = gr.Audio(label="Other")
    other_spectrogram = gr.Image(label="Other Spectrogram")

    process_button.click(
        separate_music_file_wrapper,
        inputs=[input_string, use_cpu, use_single_onnx, large_overlap, small_overlap, chunk_size, use_large_gpu],
        outputs=[vocals, instrumental, instrumental2, bass, drums, other, vocals_spectrogram, instrumental_spectrogram, instrumental2_spectrogram, bass_spectrogram, drums_spectrogram, other_spectrogram],
    )

demo.queue().launch(debug=True, share=False)