|
import os
import tempfile
import time

import gradio as gr
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import torch
from moviepy.editor import AudioFileClip
from pytube import YouTube
from scipy.io import wavfile

from inference import EnsembleDemucsMDXMusicSeparationModel, predict_with_model


def download_youtube_video_as_wav(youtube_url):
    """Download the audio stream of a YouTube video and convert it to WAV."""
    output_dir = "downloads"
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, "temp.mp4")

    try:
        yt = YouTube(youtube_url)
        yt.streams.filter(only_audio=True).first().download(output_path=output_dir, filename="temp.mp4")
        print("Download completed successfully.")
    except Exception as e:
        print(f"An error occurred while downloading the video: {e}")
        return None

    # Convert the downloaded stream to WAV for the separation model.
    wav_file = os.path.join(output_dir, "mixture.wav")
    clip = AudioFileClip(output_file)
    clip.write_audiofile(wav_file)
    clip.close()

    return wav_file
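
# Example (hypothetical URL): download_youtube_video_as_wav("https://youtu.be/...")
# returns "downloads/mixture.wav" on success, or None if the download fails.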
|
|
|
|
|
def check_file_readiness(filepath):
    """Block until the file's size has been stable for five consecutive checks.

    This guards against reading a stem that the separation model is still
    writing; with 0.5 s between checks, a finished file is declared ready
    after roughly 2.5 s.
    """
    num_same_size_checks = 0
    last_size = -1
    while num_same_size_checks < 5:
        current_size = os.path.getsize(filepath)
        if current_size == last_size:
            num_same_size_checks += 1
        else:
            num_same_size_checks = 0
        last_size = current_size
        time.sleep(0.5)
    return True
|
|
|
|
|
def generate_spectrogram(audio_file_path):
    """Render a mel spectrogram of an audio file and return the PNG path."""
    y, sr = librosa.load(audio_file_path)
    plt.figure(figsize=(10, 4))
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
    librosa.display.specshow(librosa.power_to_db(S, ref=np.max),
                             y_axis='mel', fmax=8000, x_axis='time')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Mel spectrogram')
    plt.tight_layout()
    # NamedTemporaryFile(delete=False) replaces the deprecated, race-prone tempfile.mktemp.
    with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
        image_path = tmp.name
    plt.savefig(image_path)
    plt.close()
    return image_path
|
|
|
|
|
def generate_spectrograms(audio_files):
    """Generate one spectrogram image per audio file."""
    output_spectrograms = []
    for audio_file in audio_files:
        output_spectrograms.append(generate_spectrogram(audio_file))
    return tuple(output_spectrograms)
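
# Example (hypothetical paths):
#   generate_spectrograms(["results/song_vocals.wav", "results/song_drums.wav"])
# returns a tuple of two PNG paths, in the same order as the inputs.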
|
|
|
|
|
def separate_music_file_wrapper(input_string, use_cpu, use_single_onnx, large_overlap, small_overlap, chunk_size, use_large_gpu):
    input_files = []

    if input_string.startswith("https://www.youtube.com") or input_string.startswith("https://youtu.be"):
        output_file = download_youtube_video_as_wav(input_string)
        if output_file is not None:
            input_files.append(output_file)
    elif os.path.isdir(input_string):
        input_directory = input_string
        input_files = [os.path.join(input_directory, f) for f in os.listdir(input_directory) if f.endswith('.wav')]
    else:
        raise ValueError("Invalid input! Please provide a valid YouTube link or a directory path.")

    if not (0 <= large_overlap <= 1) or not (0 <= small_overlap <= 1):
        raise ValueError("Overlap values must be between 0 and 1.")

    if chunk_size <= 0:
        raise ValueError("Chunk size must be greater than 0.")

    options = {
        'input_audio': input_files,
        'output_folder': 'results',
        'cpu': use_cpu,
        'single_onnx': use_single_onnx,
        'overlap_large': large_overlap,
        'overlap_small': small_overlap,
        'chunk_size': chunk_size,
        'large_gpu': use_large_gpu,
    }

    predict_with_model(options)

    # Release any GPU memory held by the model before post-processing.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Collect the six expected stem paths. Note that if several input files
    # were processed, each pass overwrites the dictionary keys, so only the
    # last file's stems are returned to the UI.
    output_files = {}
    for f in input_files:
        audio_file_name = os.path.splitext(os.path.basename(f))[0]
        output_files["vocals"] = os.path.join(options['output_folder'], audio_file_name + "_vocals.wav")
        output_files["instrumental"] = os.path.join(options['output_folder'], audio_file_name + "_instrum.wav")
        output_files["instrumental2"] = os.path.join(options['output_folder'], audio_file_name + "_instrum2.wav")
        output_files["bass"] = os.path.join(options['output_folder'], audio_file_name + "_bass.wav")
        output_files["drums"] = os.path.join(options['output_folder'], audio_file_name + "_drums.wav")
        output_files["other"] = os.path.join(options['output_folder'], audio_file_name + "_other.wav")

    # Wait for each stem to finish writing; substitute one second of stereo
    # silence for any stem that never appears so the UI still gets six files.
    output_files_ready = []
    for v in output_files.values():
        if os.path.exists(v) and check_file_readiness(v):
            output_files_ready.append(v)
        else:
            empty_data = np.zeros((44100, 2), dtype=np.int16)
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
                empty_file = tmp.name
            wavfile.write(empty_file, 44100, empty_data)
            output_files_ready.append(empty_file)

    output_spectrograms = generate_spectrograms(output_files_ready)

    return tuple(output_files_ready) + output_spectrograms
|
|
|
|
|
description = """ |
|
# ZFTurbo Web-UI |
|
Web-UI by [Ma5onic](https://github.com/Ma5onic) |
|
## Options: |
|
- **Use CPU Only:** Select this if you have not enough GPU memory. It will be slower. |
|
- **Use Single ONNX:** Select this to use a single ONNX model. It will decrease quality a little bit but can help with GPU memory usage. |
|
- **Large Overlap:** The overlap for large chunks. Adjust as needed. |
|
- **Small Overlap:** The overlap for small chunks. Adjust as needed. |
|
- **Chunk Size:** The size of chunks to be processed at a time. Reduce this if facing memory issues. |
|
- **Use Fast Large GPU Version:** Select this to use the old fast method that requires > 11 GB of GPU memory. It will work faster. |
|
""" |
|
theme = gr.themes.Base(
    primary_hue="cyan",
    secondary_hue="cyan",
)
|
|
|
with gr.Blocks(theme=theme) as demo:
    gr.Markdown(description)
    input_string = gr.Text(label="YouTube Link/URL")
    use_cpu = gr.Checkbox(label="Use CPU Only", value=True)
    use_single_onnx = gr.Checkbox(label="Use Single ONNX", value=False)
    large_overlap = gr.Number(label="Large Overlap", value=0.6)
    small_overlap = gr.Number(label="Small Overlap", value=0.5)
    chunk_size = gr.Number(label="Chunk Size", value=1000000)
    use_large_gpu = gr.Checkbox(label="Use Fast Large GPU Version", value=False)
    process_button = gr.Button("Process Audio")

    # One audio player and one spectrogram image per separated stem.
    vocals = gr.Audio(label="Vocals")
    vocals_spectrogram = gr.Image(label="Vocals Spectrogram")
    instrumental = gr.Audio(label="Instrumental")
    instrumental_spectrogram = gr.Image(label="Instrumental Spectrogram")
    instrumental2 = gr.Audio(label="Instrumental 2")
    instrumental2_spectrogram = gr.Image(label="Instrumental 2 Spectrogram")
    bass = gr.Audio(label="Bass")
    bass_spectrogram = gr.Image(label="Bass Spectrogram")
    drums = gr.Audio(label="Drums")
    drums_spectrogram = gr.Image(label="Drums Spectrogram")
    other = gr.Audio(label="Other")
    other_spectrogram = gr.Image(label="Other Spectrogram")

    process_button.click(
        separate_music_file_wrapper,
        inputs=[input_string, use_cpu, use_single_onnx, large_overlap, small_overlap, chunk_size, use_large_gpu],
        outputs=[vocals, instrumental, instrumental2, bass, drums, other, vocals_spectrogram, instrumental_spectrogram, instrumental2_spectrogram, bass_spectrogram, drums_spectrogram, other_spectrogram],
    )

demo.queue().launch(debug=True, share=False)
|
|