BoldActionMan committed on
Commit 941f600 · 1 Parent(s): db31449

Added app.py

Files changed (1)
  1. app.py +223 -0
app.py ADDED
@@ -0,0 +1,223 @@
+ import gradio as gr
+ import os
+ import torch
+ from openvoice import se_extractor
+ from openvoice.api import ToneColorConverter
+ import whisper
+ from moviepy.editor import VideoFileClip
+ from pydub import AudioSegment
+ from df.enhance import enhance, init_df, load_audio, save_audio
+ import translators as ts
+ from melo.api import TTS
+ from concurrent.futures import ThreadPoolExecutor
+ import ffmpeg
+
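+ # Pipeline overview: extract and denoise the audio track of the uploaded video,
+ # transcribe it with Whisper, translate each segment, re-synthesize the speech
+ # with MeloTTS, clone the original speaker's tone color with OpenVoice, and
+ # reassemble the video with retimed segments and burned-in subtitles.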
+ def process_video(video_file, language_choice):
+     if video_file is None or language_choice is None:
+         return None
+
+     # Initialize paths and devices
+     ckpt_converter = 'checkpoints_v2/converter'
+     device = "cuda:0" if torch.cuda.is_available() else "cpu"
+     output_dir = 'outputs_v2'
+     os.makedirs(output_dir, exist_ok=True)
+
+     tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
+     tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
+
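+     # Assumes the OpenVoice V2 converter checkpoint and config have already been
+     # downloaded to checkpoints_v2/; nothing in this script fetches them.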
+     # Extract the reference audio from the uploaded video
+     reference_video = VideoFileClip(video_file)
+     reference_audio = os.path.join(output_dir, "reference_audio.wav")
+     reference_video.audio.write_audiofile(reference_audio)
+     audio = AudioSegment.from_file(reference_audio)
+     resampled_audio = audio.set_frame_rate(48000)
+     resampled_audio.export(reference_audio, format="wav")
+
+     # Enhance the audio
+     model, df_state, _ = init_df()
+     audio, _ = load_audio(reference_audio, sr=df_state.sr())
+     enhanced = enhance(model, df_state, audio)
+     save_audio(reference_audio, enhanced, df_state.sr())
+     reference_speaker = reference_audio  # This is the voice you want to clone
+     target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, vad=False)
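+     # DeepFilterNet expects 48 kHz input, which is why the audio was resampled
+     # above; target_se is the speaker embedding the tone converter maps onto.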
+
+     src_path = os.path.join(output_dir, "tmp.wav")
+
+     # Speed is adjustable
+     speed = 1.0
+
+     # Transcribe the original audio with timestamps
+     sttmodel = whisper.load_model("base")
+     sttresult = sttmodel.transcribe(reference_speaker, verbose=True)
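+     # "base" is Whisper's small multilingual checkpoint; it also detects the
+     # source language (printed below). A larger model such as "medium" would be
+     # more accurate but slower.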
+
+     # Print the original transcription and detected language
+     print(sttresult["text"])
+     print(sttresult["language"])
+
+     # Get the segments with start and end times
+     segments = sttresult['segments']
+
+     # Map the chosen translation language to its MeloTTS model
+     match language_choice:
+         case 'en':
+             language = 'EN_NEWEST'
+         case 'es':
+             language = 'ES'
+         case 'fr':
+             language = 'FR'
+         case 'zh':
+             language = 'ZH'
+         case 'ja':
+             language = 'JP'
+         case 'ko':
+             language = 'KR'
+         case _:
+             language = 'EN_NEWEST'
+
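+     # match/case requires Python 3.10+. MeloTTS only ships voices for the
+     # languages above; any other dropdown choice falls back to the English model,
+     # even though the text itself is still translated into the selected language.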
+     # Translate the transcription segment by segment
+     def translate_segment(segment):
+         return segment["start"], segment["end"], ts.translate_text(
+             query_text=segment["text"], translator="google", to_language=language_choice)
+
+     # Batch translation to reduce memory load
+     batch_size = 2
+     translation_segments = []
+     for i in range(0, len(segments), batch_size):
+         batch = segments[i:i + batch_size]
+         with ThreadPoolExecutor(max_workers=5) as executor:
+             batch_translations = list(executor.map(translate_segment, batch))
+         translation_segments.extend(batch_translations)
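+     # Note: with batch_size=2, executor.map only ever has two items in flight,
+     # so max_workers=5 is effectively capped at 2; the small batches keep
+     # concurrent requests to the translation backend modest.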
+
+     # Generate the translated audio for each segment
+     tts_model = TTS(language=language, device=device)
+     speaker_ids = tts_model.hps.data.spk2id
+
+     def generate_segment_audio(segment, speaker_id):
+         start, end, translated_text = segment
+         segment_path = os.path.join(output_dir, f'segment_{start}_{end}.wav')
+         tts_model.tts_to_file(translated_text, speaker_id, segment_path, speed=speed)
+         return segment_path, start, end, translated_text
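+     # spk2id maps base speaker names to ids; the loop below produces one fully
+     # dubbed video per base speaker, and the function returns the last one.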
+
+     for speaker_key in speaker_ids.keys():
+         speaker_id = speaker_ids[speaker_key]
+         speaker_key = speaker_key.lower().replace('_', '-')
+
+         source_se = torch.load(f'checkpoints_v2/base_speakers/ses/{speaker_key}.pth', map_location=device)
+
+         segment_files = []
+         subtitle_entries = []
+         for segment in translation_segments:
+             segment_file, start, end, translated_text = generate_segment_audio(segment, speaker_id)
+
+             # Run the tone color converter
+             encode_message = "@MyShell"
+             tone_color_converter.convert(
+                 audio_src_path=segment_file,
+                 src_se=source_se,
+                 tgt_se=target_se,
+                 output_path=segment_file,
+                 message=encode_message)
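+             # The converter maps the base speaker's embedding (source_se) onto the
+             # uploaded speaker's (target_se), overwriting the segment in place; the
+             # message string is embedded as OpenVoice's audio watermark.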
+
+             segment_files.append((segment_file, start, end, translated_text))
+
+         # Combine the audio segments
+         combined_audio = AudioSegment.empty()
+         video_segments = []
+         previous_end = 0
+         subtitle_counter = 1
+         for segment_file, start, end, translated_text in segment_files:
+             segment_audio = AudioSegment.from_file(segment_file)
+             combined_audio += segment_audio
+
+             # Duration of the generated audio segment in seconds
+             audio_duration = len(segment_audio) / 1000.0
+
+             # Add the subtitle entry for this segment
+             subtitle_entries.append((subtitle_counter, previous_end, previous_end + audio_duration, translated_text))
+             subtitle_counter += 1
+
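+             # Retiming example: if the source span (end - start) is 4 s but the
+             # dubbed audio runs 5 s, the factor below is 4/5 = 0.8, and 'PTS / 0.8'
+             # stretches the clip to 5 s so picture and dubbed speech stay aligned.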
+             # Get the corresponding video segment and adjust its speed to match the audio duration
+             video_segment = (
+                 ffmpeg
+                 .input(reference_video.filename, ss=start, to=end)
+                 .filter('setpts', f'PTS / {(end - start) / audio_duration}')
+             )
+             video_segments.append((video_segment, ffmpeg.input(segment_file)))
+             previous_end += audio_duration
+
+         save_path = os.path.join(output_dir, f'output_v2_{speaker_key}.wav')
+         combined_audio.export(save_path, format="wav")
+
+         # Combine video and audio segments using ffmpeg
+         video_and_audio_files = [item for sublist in video_segments for item in sublist]
+         joined = (
+             ffmpeg
+             .concat(*video_and_audio_files, v=1, a=1)
+             .node
+         )
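+         # concat with v=1, a=1 takes one video and one audio stream per segment,
+         # in order, which is exactly what the flattened pair list supplies.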
+
+         final_video_path = os.path.join(output_dir, f'final_video_{speaker_key}.mp4')
+         try:
+             (
+                 ffmpeg
+                 .output(joined[0], joined[1], final_video_path, vcodec='libx264', acodec='aac')
+                 .run(overwrite_output=True)
+             )
+         except ffmpeg.Error as e:
+             print('ffmpeg error:', e)
+             if e.stderr:
+                 print(e.stderr.decode('utf-8'))
+
+         print(f"Final video without subtitles saved to: {final_video_path}")
+
+         # Generate subtitles file in SRT format
+         srt_path = os.path.join(output_dir, 'subtitles.srt')
+         with open(srt_path, 'w', encoding='utf-8') as srt_file:
+             for entry in subtitle_entries:
+                 index, start, end, text = entry
+                 start_hours, start_remainder = divmod(int(start), 3600)
+                 start_minutes, start_seconds = divmod(start_remainder, 60)
+                 start_milliseconds = int((start * 1000) % 1000)
+
+                 end_hours, end_remainder = divmod(int(end), 3600)
+                 end_minutes, end_seconds = divmod(end_remainder, 60)
+                 end_milliseconds = int((end * 1000) % 1000)
+
+                 srt_file.write(f"{index}\n")
+                 srt_file.write(f"{start_hours:02}:{start_minutes:02}:{start_seconds:02},{start_milliseconds:03} --> "
+                                f"{end_hours:02}:{end_minutes:02}:{end_seconds:02},{end_milliseconds:03}\n")
+                 srt_file.write(f"{text}\n\n")
+
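+         # Timestamp example: start = 3725.5 -> divmod(3725, 3600) == (1, 125),
+         # divmod(125, 60) == (2, 5), 3725500 % 1000 == 500, i.e. "01:02:05,500".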
+         # Add subtitles to the video
+         final_video_with_subs_path = os.path.join(output_dir, f'final_video_with_subs_{speaker_key}.mp4')
+         try:
+             (
+                 ffmpeg
+                 .input(final_video_path)
+                 .output(final_video_with_subs_path, vf=f"subtitles={srt_path}")
+                 .run(overwrite_output=True)
+             )
+         except ffmpeg.Error as e:
+             print('ffmpeg error:', e)
+             if e.stderr:
+                 print(e.stderr.decode('utf-8'))
+
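+         # The subtitles filter needs an ffmpeg build with libass; this pass
+         # re-encodes the video to burn the captions in.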
+         print(f"Final video with subtitles saved to: {final_video_with_subs_path}")
+
+     return final_video_with_subs_path
+
+
+ # Define Gradio interface
+ def gradio_interface(video_file, language_choice):
+     return process_video(video_file, language_choice)
+
+ language_choices = ts.get_languages("google")["en"]
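+ # Assumption: ts.get_languages("google") maps each source language to the target
+ # codes Google Translate supports, so ["en"] gives the codes shown in the dropdown.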
+
+ gr.Interface(
+     fn=gradio_interface,
+     inputs=[
+         gr.Video(label="Upload Video"),
+         gr.Dropdown(choices=language_choices, label="Choose Language for Translation")
+     ],
+     outputs=gr.Video(label="Translated Video"),
+     title="Video Translation and Voice Cloning",
+     description="Upload a video, choose a language to translate the audio, and download the processed video with translated audio."
+ ).launch()
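+ # launch() serves the app locally; launch(share=True) would also create a
+ # temporary public Gradio link.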