import os
import time
import numpy as np
import tempfile
from scipy.io import wavfile
from pytube import YouTube
import gradio as gr
from moviepy.editor import AudioFileClip
from inference import EnsembleDemucsMDXMusicSeparationModel, predict_with_model
import torch
import librosa
import librosa.display
import matplotlib.pyplot as plt


def download_youtube_video_as_wav(youtube_url):
    output_dir = "downloads"
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, "temp.mp4")

    try:
        yt = YouTube(youtube_url)
        yt.streams.filter(only_audio=True).first().download(filename=output_file)
        print("Download completed successfully.")
    except Exception as e:
        print(f"An error occurred while downloading the video: {e}")
        return None

    # Convert the downloaded mp4 audio to wav
    wav_file = os.path.join(output_dir, "mixture.wav")
    clip = AudioFileClip(output_file)
    clip.write_audiofile(wav_file)
    return wav_file
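
# A hedged usage sketch for the downloader above (the URL is only a
# placeholder, not a real video):
#
#   wav_path = download_youtube_video_as_wav("https://www.youtube.com/watch?v=...")
#   if wav_path is not None:
#       print(f"Mixture saved to {wav_path}")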


def check_file_readiness(filepath):
    # Poll the file size; once it has stayed the same for five consecutive
    # checks (roughly 2.5 seconds at 0.5 s per poll), assume the writer has
    # finished and the file is ready.
    num_same_size_checks = 0
    last_size = -1
    while num_same_size_checks < 5:
        current_size = os.path.getsize(filepath)
        if current_size == last_size:
            num_same_size_checks += 1
        else:
            num_same_size_checks = 0
        last_size = current_size
        time.sleep(0.5)
    return True
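
# Usage sketch, assuming "results/mixture_vocals.wav" is a path that another
# process may still be writing (hypothetical path): the call blocks until the
# size is stable, then returns True.
#
#   if os.path.exists("results/mixture_vocals.wav"):
#       check_file_readiness("results/mixture_vocals.wav")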


def generate_spectrogram(audio_file_path):
    y, sr = librosa.load(audio_file_path)
    plt.figure(figsize=(10, 4))
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
    librosa.display.specshow(librosa.power_to_db(S, ref=np.max),
                             y_axis='mel', fmax=8000, x_axis='time')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Mel spectrogram')
    plt.tight_layout()
    image_path = tempfile.mktemp('.png')
    plt.savefig(image_path)
    plt.close()
    return image_path
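
# Minimal sketch of the spectrogram helper, assuming a local file
# "downloads/mixture.wav" exists (hypothetical path):
#
#   png_path = generate_spectrogram("downloads/mixture.wav")
#   print(f"Spectrogram written to {png_path}")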


def generate_spectrograms(audio_files):
    output_spectrograms = []
    for audio_file in audio_files:
        output_spectrograms.append(generate_spectrogram(audio_file))
    return tuple(output_spectrograms)


def separate_music_file_wrapper(input_string, use_cpu, use_single_onnx, large_overlap, small_overlap, chunk_size, use_large_gpu):
    input_files = []

    # Accept either a YouTube URL or a local directory of .wav files
    if input_string.startswith("https://www.youtube.com") or input_string.startswith("https://youtu.be"):
        output_file = download_youtube_video_as_wav(input_string)
        if output_file is not None:
            input_files.append(output_file)
    elif os.path.isdir(input_string):
        input_directory = input_string
        input_files = [os.path.join(input_directory, f) for f in os.listdir(input_directory) if f.endswith('.wav')]
    else:
        raise ValueError("Invalid input! Please provide a valid YouTube link or a directory path.")

    # Validate overlap values
    if not (0 <= large_overlap <= 1) or not (0 <= small_overlap <= 1):
        raise ValueError("Overlap values must be between 0 and 1.")

    # Validate chunk size
    if chunk_size <= 0:
        raise ValueError("Chunk size must be greater than 0.")

    options = {
        'input_audio': input_files,
        'output_folder': 'results',
        'cpu': use_cpu,
        'single_onnx': use_single_onnx,
        'overlap_large': large_overlap,
        'overlap_small': small_overlap,
        'chunk_size': chunk_size,
        'large_gpu': use_large_gpu,
    }

    predict_with_model(options)

    # Clear GPU cache after inference
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    # Collect the expected stem paths. Note that if several input files were
    # processed, each pass overwrites the dict, so only the last file's stems
    # are returned (the UI exposes a single set of six outputs).
    output_files = {}
    for f in input_files:
        audio_file_name = os.path.splitext(os.path.basename(f))[0]
        output_files["vocals"] = os.path.join(options['output_folder'], audio_file_name + "_vocals.wav")
        output_files["instrumental"] = os.path.join(options['output_folder'], audio_file_name + "_instrum.wav")
        output_files["instrumental2"] = os.path.join(options['output_folder'], audio_file_name + "_instrum2.wav")  # Second instrumental output
        output_files["bass"] = os.path.join(options['output_folder'], audio_file_name + "_bass.wav")
        output_files["drums"] = os.path.join(options['output_folder'], audio_file_name + "_drums.wav")
        output_files["other"] = os.path.join(options['output_folder'], audio_file_name + "_other.wav")

    # Check the readiness of the files; substitute a second of silence for any
    # missing stem so the UI still receives six playable outputs
    output_files_ready = []
    for k, v in output_files.items():
        if os.path.exists(v) and check_file_readiness(v):
            output_files_ready.append(v)
        else:
            empty_data = np.zeros((44100, 2))  # 2 channels, 1 second of silence at 44100 Hz
            empty_file = tempfile.mktemp('.wav')
            wavfile.write(empty_file, 44100, empty_data.astype(np.int16))  # Write as 16-bit PCM for broad compatibility
            output_files_ready.append(empty_file)

    # Generate spectrograms right after separating the audio
    output_spectrograms = generate_spectrograms(output_files_ready)
    print(len(output_files_ready))   # should print 6
    print(len(output_spectrograms))  # should print 6

    return tuple(output_files_ready) + output_spectrograms

description = """
# ZFTurbo Web-UI
Web-UI by [Ma5onic](https://github.com/Ma5onic)
## Options:
- **Use CPU Only:** Select this if you do not have enough GPU memory. Processing will be slower.
- **Use Single ONNX:** Select this to use a single ONNX model. It will decrease quality a little bit but can help with GPU memory usage.
- **Large Overlap:** The overlap for large chunks. Adjust as needed.
- **Small Overlap:** The overlap for small chunks. Adjust as needed.
- **Chunk Size:** The size of chunks to be processed at a time. Reduce this if facing memory issues.
- **Use Fast Large GPU Version:** Select this to use the older, faster method, which requires more than 11 GB of GPU memory.
"""

theme = gr.themes.Base(
    primary_hue="cyan",
    secondary_hue="cyan",
)

with gr.Blocks(theme=theme) as demo:
    gr.Markdown(description)

    # Inputs
    input_string = gr.Text(label="YouTube Link/URL")
    use_cpu = gr.Checkbox(label="Use CPU Only", value=True)
    use_single_onnx = gr.Checkbox(label="Use Single ONNX", value=False)
    large_overlap = gr.Number(label="Large Overlap", value=0.6)
    small_overlap = gr.Number(label="Small Overlap", value=0.5)
    chunk_size = gr.Number(label="Chunk Size", value=1000000)
    use_large_gpu = gr.Checkbox(label="Use Fast Large GPU Version", value=False)
    process_button = gr.Button("Process Audio")

    # Outputs: one audio player and one spectrogram image per stem
    vocals = gr.Audio(label="Vocals")
    vocals_spectrogram = gr.Image(label="Vocals Spectrogram")
    instrumental = gr.Audio(label="Instrumental")
    instrumental_spectrogram = gr.Image(label="Instrumental Spectrogram")
    instrumental2 = gr.Audio(label="Instrumental 2")
    instrumental2_spectrogram = gr.Image(label="Instrumental 2 Spectrogram")
    bass = gr.Audio(label="Bass")
    bass_spectrogram = gr.Image(label="Bass Spectrogram")
    drums = gr.Audio(label="Drums")
    drums_spectrogram = gr.Image(label="Drums Spectrogram")
    other = gr.Audio(label="Other")
    other_spectrogram = gr.Image(label="Other Spectrogram")

    process_button.click(
        separate_music_file_wrapper,
        inputs=[input_string, use_cpu, use_single_onnx, large_overlap, small_overlap, chunk_size, use_large_gpu],
        outputs=[vocals, instrumental, instrumental2, bass, drums, other, vocals_spectrogram, instrumental_spectrogram, instrumental2_spectrogram, bass_spectrogram, drums_spectrogram, other_spectrogram],
    )

demo.queue().launch(debug=True, share=False)