Amr-h committed
Commit 0ca3a79 · 1 Parent(s): d38e095

remove youtube

Files changed (2):
  1. app.py +77 -92
  2. audio_extractor.py +73 -298
app.py CHANGED
@@ -2,12 +2,10 @@ import streamlit as st
 import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
-from plotly.subplots import make_subplots
 import time
 import os
 from pathlib import Path
 import tempfile
-import shutil

 # Import your existing modules
 try:
@@ -19,7 +17,7 @@ except ImportError as e:

 # Page configuration
 st.set_page_config(
-    page_title="🎀 Accent Analyzer",
+    page_title="🎀 English Accent Analyzer",
     page_icon="🎀",
     layout="wide",
     initial_sidebar_state="expanded"
@@ -35,12 +33,6 @@ st.markdown("""
         font-weight: bold;
         margin-bottom: 2rem;
     }
-    .metric-container {
-        background-color: #f0f2f6;
-        padding: 1rem;
-        border-radius: 0.5rem;
-        margin: 0.5rem 0;
-    }
     .success-box {
         background-color: #d4edda;
         border: 1px solid #c3e6cb;
@@ -74,8 +66,6 @@ def initialize_session_state():
         st.session_state.analysis_results = None
     if 'processing' not in st.session_state:
         st.session_state.processing = False
-    if 'uploaded_file_path' not in st.session_state:
-        st.session_state.uploaded_file_path = None

 def save_uploaded_file(uploaded_file):
     """Save uploaded file to temporary directory"""
@@ -90,31 +80,31 @@ def save_uploaded_file(uploaded_file):
         return None

 def create_confidence_chart(chunk_results):
-    """Create confidence score chart for chunks"""
+    """Create confidence score chart for 1-minute chunks"""
     if not chunk_results:
         return None

     chunk_data = []
-    for result in chunk_results:
+    for i, result in enumerate(chunk_results):
         chunk_data.append({
-            'Chunk': result['chunk'],
+            'Minute': f"Min {i+1}",
             'Confidence': result['confidence'],
             'Accent': result['accent'],
-            'Is Confident': '✓ Confident' if result['is_confident'] else '✗ Low Confidence'
+            'Is Confident': '✓ High Confidence' if result['is_confident'] else '✗ Low Confidence'
         })

     df = pd.DataFrame(chunk_data)

     fig = px.bar(df,
-                 x='Chunk',
+                 x='Minute',
                  y='Confidence',
                  color='Is Confident',
                  hover_data=['Accent'],
-                 title='Confidence Scores by Chunk',
-                 color_discrete_map={'✓ Confident': '#28a745', '✗ Low Confidence': '#dc3545'})
+                 title='Confidence Scores by Minute',
+                 color_discrete_map={'✓ High Confidence': '#28a745', '✗ Low Confidence': '#dc3545'})

     fig.update_layout(
-        xaxis_title="Chunk Number",
+        xaxis_title="Time Segment",
         yaxis_title="Confidence Score",
         showlegend=True,
         height=400
@@ -156,30 +146,30 @@ def display_results(results):

     with col1:
         st.metric(
-            label="🎯 Confidence Score",
-            value=f"{results['confidence_score']:.3f}",
-            delta=f"{results['confidence_percentage']}"
+            label="🎯 Overall Confidence",
+            value=f"{results['confidence_score']:.1%}",
+            help="Overall confidence in the prediction"
         )

     with col2:
         st.metric(
-            label="📊 Chunks Processed",
-            value=f"{results['processed_chunks_count']}/{results['available_chunks_count']}",
-            delta="Early stopped" if results.get('early_stopped', False) else "Complete"
+            label="📊 Minutes Analyzed",
+            value=f"{results['processed_chunks_count']} min",
+            delta=f"of {results.get('duration_minutes', 0):.1f} min total"
         )

     with col3:
         st.metric(
-            label="✅ Confident Predictions",
+            label="✅ High Confidence Segments",
             value=results['confident_chunks_count'],
-            delta=f"{(results['confident_chunks_count']/results['processed_chunks_count']*100):.1f}%"
+            delta=f"{(results['confident_chunks_count']/results['processed_chunks_count']*100):.0f}%" if results['processed_chunks_count'] > 0 else "0%"
         )

     with col4:
         st.metric(
             label="⏱️ Processing Time",
             value=f"{results['processing_time']:.1f}s",
-            delta=f"{results.get('duration_minutes', 0):.1f}min video"
+            help="Time taken to analyze the audio"
         )

     # Detailed Analysis
@@ -198,49 +188,45 @@ def display_results(results):
     with chart_col2:
         confident_chart = create_accent_distribution_chart(
             results['confident_accent_counts'],
-            "Confident Predictions Distribution"
+            "High Confidence Predictions"
         )
         if confident_chart:
             st.plotly_chart(confident_chart, use_container_width=True)

-    # All predictions distribution
-    if results['all_accent_counts'] != results['confident_accent_counts']:
-        st.subheader("📊 All Predictions (Including Low Confidence)")
-        all_chart = create_accent_distribution_chart(
-            results['all_accent_counts'],
-            "All Predictions Distribution"
-        )
-        if all_chart:
-            st.plotly_chart(all_chart, use_container_width=True)
-
-    # Detailed chunk results table
-    with st.expander("🔍 View Detailed Chunk Results"):
-        chunk_df = pd.DataFrame(results['chunk_results'])
-        st.dataframe(chunk_df, use_container_width=True)
+    # Detailed results table
+    with st.expander("🔍 View Minute-by-Minute Results"):
+        if results['chunk_results']:
+            chunk_df = pd.DataFrame(results['chunk_results'])
+            chunk_df.index = [f"Minute {i+1}" for i in range(len(chunk_df))]
+            st.dataframe(chunk_df, use_container_width=True)

     # Summary statistics
     with st.expander("📋 Summary Statistics"):
         col1, col2 = st.columns(2)

         with col1:
-            st.write("**Confident Predictions:**")
-            for accent, count in results['confident_accent_counts'].items():
-                percentage = (count / results['confident_chunks_count']) * 100
-                st.write(f"• {accent}: {count} chunks ({percentage:.1f}%)")
+            st.write("**High Confidence Predictions:**")
+            if results['confident_accent_counts']:
+                for accent, count in results['confident_accent_counts'].items():
+                    percentage = (count / results['confident_chunks_count']) * 100
+                    st.write(f"• {accent}: {count} segments ({percentage:.1f}%)")
+            else:
+                st.write("No high confidence predictions")

         with col2:
             st.write("**All Predictions:**")
-            for accent, count in results['all_accent_counts'].items():
-                percentage = (count / results['processed_chunks_count']) * 100
-                st.write(f"• {accent}: {count} chunks ({percentage:.1f}%)")
+            if results['all_accent_counts']:
+                for accent, count in results['all_accent_counts'].items():
+                    percentage = (count / results['processed_chunks_count']) * 100
+                    st.write(f"• {accent}: {count} segments ({percentage:.1f}%)")

 def main():
     """Main Streamlit application"""
     initialize_session_state()

     # Header
-    st.markdown('<h1 class="main-header">🎀 Accent Analyzer</h1>', unsafe_allow_html=True)
-    st.markdown("Analyze accents from video files, URLs, or audio sources using advanced AI models.")
+    st.markdown('<h1 class="main-header">🎀 English Accent Analyzer</h1>', unsafe_allow_html=True)
+    st.markdown("Analyze English accents from video files, Loom videos, or direct media URLs. Audio is processed in 1-minute segments for detailed analysis.")

     # Sidebar configuration
     st.sidebar.header("⚙️ Configuration")
@@ -251,13 +237,7 @@ def main():
         max_value=0.9,
         value=0.6,
         step=0.05,
-        help="Only predictions above this threshold are considered confident"
-    )
-
-    early_stopping = st.sidebar.checkbox(
-        "Enable Early Stopping",
-        value=True,
-        help="Stop processing when 3 consecutive confident predictions agree"
+        help="Only predictions above this threshold are considered high confidence"
     )

     # Input section
@@ -265,30 +245,30 @@ def main():

     input_method = st.radio(
         "Choose input method:",
-        ["URL (YouTube, Loom, etc.)", "Upload File"],
+        ["URL (Loom or Direct Link)", "Upload File"],
         horizontal=True
     )

     source = None

-    if input_method == "URL (YouTube, Loom, etc.)":
+    if input_method == "URL (Loom or Direct Link)":
         source = st.text_input(
             "Enter video URL:",
-            placeholder="https://www.youtube.com/watch?v=...",
-            help="Supports YouTube, Loom, and direct media URLs"
+            placeholder="https://www.loom.com/share/...",
+            help="Supports Loom videos and direct media URLs"
         )

         # URL examples
         with st.expander("🔗 Supported URL Examples"):
-            st.write("• YouTube: `https://www.youtube.com/watch?v=VIDEO_ID`")
-            st.write("• YouTube Shorts: `https://www.youtube.com/shorts/VIDEO_ID`")
-            st.write("• Loom: `https://www.loom.com/share/VIDEO_ID`")
-            st.write("• Direct media files: `https://example.com/video.mp4`")
+            st.write("• **Loom:** `https://www.loom.com/share/VIDEO_ID`")
+            st.write("• **Direct MP4:** `https://example.com/video.mp4`")
+            st.write("• **Direct audio:** `https://example.com/audio.mp3`")
+            st.markdown('<div class="info-box">📝 <strong>Note:</strong> YouTube URLs are not supported to avoid authentication issues in deployment.</div>', unsafe_allow_html=True)

     else:  # Upload File
         uploaded_file = st.file_uploader(
             "Choose a video or audio file",
-            type=['mp4', 'webm', 'avi', 'mov', 'mkv', 'm4v', '3gp', 'mp3', 'wav', 'm4a', 'aac', 'ogg', 'flac'],
+            type=['mp4', 'webm', 'avi', 'mov', 'mkv', 'm4v', 'mp3', 'wav', 'm4a', 'aac', 'ogg', 'flac'],
             help="Upload video or audio files for accent analysis"
         )

@@ -296,16 +276,17 @@ def main():
             # Save uploaded file
             with st.spinner("Saving uploaded file..."):
                 source = save_uploaded_file(uploaded_file)
-                st.session_state.uploaded_file_path = source

             if source:
                 st.success(f"✅ File uploaded: {uploaded_file.name}")
+                file_size = len(uploaded_file.getbuffer()) / 1024 / 1024
+                st.info(f"📊 File size: {file_size:.1f}MB")
             else:
                 st.error("❌ Failed to save uploaded file")

     # Analysis button
     analyze_button = st.button(
-        "🚀 Start Analysis",
+        "🚀 Start Accent Analysis",
         type="primary",
         disabled=not source or st.session_state.processing,
         use_container_width=True
@@ -321,15 +302,15 @@ def main():

         try:
             status_text.text("🎡 Extracting audio...")
-            progress_bar.progress(20)
+            progress_bar.progress(25)

-            status_text.text("🧠 Loading AI model...")
-            progress_bar.progress(40)
+            status_text.text("🧩 Creating 1-minute segments...")
+            progress_bar.progress(50)

-            status_text.text("🔍 Analyzing accent...")
-            progress_bar.progress(60)
+            status_text.text("🧠 Analyzing accent patterns...")
+            progress_bar.progress(75)

-            # Run analysis
+            # Run analysis with the confidence threshold
             results = analyze_video_accent(source, confidence_threshold=confidence_threshold)

             progress_bar.progress(100)
@@ -353,37 +334,41 @@ def main():

     # Display results
     if st.session_state.analysis_results:
-        st.header("📊 Results")
+        st.header("📊 Analysis Results")
         display_results(st.session_state.analysis_results)

     # Information section
     with st.expander("ℹ️ About This Tool"):
         st.markdown("""
-        **Accent Analyzer** uses advanced machine learning models to identify accents from speech in videos and audio files.
+        **English Accent Analyzer** uses advanced machine learning models to identify English accents from speech.

-        **Features:**
-        - Supports multiple input sources (URLs, file uploads)
-        - Smart chunking for efficient processing
-        - Confidence-based predictions
-        - Early stopping for faster results
-        - Detailed analysis with visualizations
+        **Key Features:**
+        - 🎯 **1-minute segments:** Audio is processed in 1-minute chunks for detailed analysis
+        - 🎀 **Accent detection:** Identifies British, American, Australian, and other English accents
+        - 📊 **Confidence scoring:** Provides reliability scores for each prediction
+        - 🔗 **Multiple sources:** Supports Loom videos, direct URLs, and file uploads

         **Supported Formats:**
-        - **Video:** MP4, WebM, AVI, MOV, MKV, M4V, 3GP
+        - **Video:** MP4, WebM, AVI, MOV, MKV, M4V
         - **Audio:** MP3, WAV, M4A, AAC, OGG, FLAC
-        - **URLs:** YouTube, Loom, direct media links
+        - **URLs:** Loom videos, direct media links

         **How it works:**
-        1. Audio is extracted from the source
-        2. Audio is chunked into smaller segments
-        3. Each chunk is analyzed for accent features
-        4. Results are aggregated with confidence scoring
-        5. Final prediction is made based on confident predictions
+        1. Audio is extracted from your source
+        2. Audio is split into 1-minute segments
+        3. Each segment is analyzed for accent characteristics
+        4. Results are combined with confidence weighting
+        5. Final accent prediction is provided
+
+        **Best Results:**
+        - Use clear speech audio
+        - Longer videos provide more accurate results
+        - Multiple speakers may affect accuracy
         """)

     # Footer
     st.markdown("---")
-    st.markdown("Made with ❤️ using Streamlit and SpeechBrain")
+    st.markdown("🚀 **Deployment Ready:** Optimized for Hugging Face Spaces deployment")

 if __name__ == "__main__":
     main()
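For context when reading the app.py hunks above: display_results() and create_confidence_chart() only consume a results dictionary produced by analyze_video_accent(), which is imported from the existing analysis module and is not touched by this commit. Below is a minimal sketch of a payload compatible with the updated UI, built only from the keys the new code reads; all values are illustrative, not real output.

# Illustrative payload only -- the real dict comes from analyze_video_accent(),
# which this commit does not modify; values here are made up.
example_results = {
    "confidence_score": 0.72,            # "Overall Confidence" metric (formatted with :.1%)
    "processed_chunks_count": 3,         # "Minutes Analyzed" metric
    "confident_chunks_count": 2,         # "High Confidence Segments" metric
    "processing_time": 41.8,             # seconds, shown in the "Processing Time" metric
    "duration_minutes": 3.2,             # read via results.get('duration_minutes', 0)
    "confident_accent_counts": {"British": 2},
    "all_accent_counts": {"British": 2, "American": 1},
    "chunk_results": [                   # one entry per 1-minute segment
        {"accent": "British", "confidence": 0.81, "is_confident": True},
        {"accent": "British", "confidence": 0.77, "is_confident": True},
        {"accent": "American", "confidence": 0.41, "is_confident": False},
    ],
}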
audio_extractor.py CHANGED
@@ -4,7 +4,6 @@ import tempfile
4
  import warnings
5
  import time
6
  import shutil
7
- import random
8
  import requests
9
  from urllib.parse import urlparse, unquote
10
  from pathlib import Path
@@ -30,24 +29,14 @@ def suppress_stdout_stderr():
30
  sys.stdout = old_stdout
31
  sys.stderr = old_stderr
32
 
33
- class RobustAudioExtractor:
34
  def __init__(self):
35
- self.supported_video_formats = ['.mp4', '.webm', '.avi', '.mov', '.mkv', '.m4v', '.3gp']
36
  self.supported_audio_formats = ['.mp3', '.wav', '.m4a', '.aac', '.ogg', '.flac']
37
- self.user_agents = [
38
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
39
- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
40
- 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
41
- ]
42
 
43
  def extract_audio_from_source(self, source):
44
- """
45
- Extract audio from various sources:
46
- - File path (uploaded file)
47
- - Direct media URL (MP4, etc.)
48
- - Loom URL
49
- - Other video hosting URLs
50
- """
51
  start_time = time.time()
52
 
53
  # Check if source is a file path
@@ -65,9 +54,7 @@ class RobustAudioExtractor:
65
  print(f"πŸŽ₯ Processing Loom URL: {source}")
66
  return self._extract_from_loom(source, start_time)
67
 
68
- # Try with yt-dlp for other platforms (with robust error handling)
69
- print(f"🌐 Processing URL with yt-dlp: {source}")
70
- return self._extract_with_ytdlp_robust(source, start_time)
71
 
72
  def _is_file_path(self, source):
73
  """Check if source is a local file path"""
@@ -95,14 +82,13 @@ class RobustAudioExtractor:
95
  try:
96
  file_ext = Path(file_path).suffix.lower()
97
 
98
- # If it's already an audio file, just return it
99
  if file_ext in self.supported_audio_formats:
100
  if file_ext == '.wav':
101
  end_time = time.time()
102
  print(f"[⏱️] Audio file processing took {end_time - start_time:.2f} seconds.")
103
  return file_path
104
  else:
105
- # Convert to WAV
106
  return self._convert_to_wav(file_path, start_time)
107
 
108
  # If it's a video file, extract audio
@@ -121,38 +107,30 @@ class RobustAudioExtractor:
121
 
122
  try:
123
  headers = {
124
- 'User-Agent': random.choice(self.user_agents),
125
  'Accept': '*/*',
126
  'Accept-Language': 'en-US,en;q=0.9',
127
- 'Accept-Encoding': 'gzip, deflate, br',
128
  'Connection': 'keep-alive',
129
- 'Upgrade-Insecure-Requests': '1',
130
  }
131
 
132
- response = requests.get(url, headers=headers, stream=True, timeout=30)
133
  response.raise_for_status()
134
 
135
- # Determine file extension
136
- content_type = response.headers.get('content-type', '').lower()
137
- if 'video' in content_type:
138
- if 'mp4' in content_type:
 
 
 
 
 
 
139
  ext = '.mp4'
140
- elif 'webm' in content_type:
141
- ext = '.webm'
142
- else:
143
- ext = '.mp4' # default
144
- elif 'audio' in content_type:
145
- if 'mpeg' in content_type or 'mp3' in content_type:
146
  ext = '.mp3'
147
- elif 'wav' in content_type:
148
- ext = '.wav'
149
  else:
150
- ext = '.mp3' # default
151
- else:
152
- # Try to get from URL
153
- parsed_url = urlparse(url)
154
- url_ext = Path(parsed_url.path).suffix.lower()
155
- ext = url_ext if url_ext in self.supported_video_formats + self.supported_audio_formats else '.mp4'
156
 
157
  downloaded_file = os.path.join(temp_dir, f'downloaded{ext}')
158
 
@@ -179,163 +157,55 @@ class RobustAudioExtractor:
179
  shutil.rmtree(temp_dir, ignore_errors=True)
180
  raise Exception(f"Failed to download direct media: {str(e)}")
181
 
182
-
183
- def extract_audio_from_loom(url):
184
- """Simple Loom audio extractor using yt-dlp"""
185
  temp_dir = tempfile.mkdtemp()
186
- ydl_opts = {
187
- 'format': 'bestaudio/best',
188
- 'postprocessors': [{
189
- 'key': 'FFmpegExtractAudio',
190
- 'preferredcodec': 'wav',
191
- 'preferredquality': '192',
192
- }],
193
- 'outtmpl': os.path.join(temp_dir, 'loom_audio.%(ext)s'),
194
- 'quiet': True,
195
- 'no_warnings': True,
196
- 'noplaylist': True,
197
- }
198
-
199
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
200
- ydl.download([url])
201
-
202
- for f in os.listdir(temp_dir):
203
- if f.endswith('.wav'):
204
- return os.path.join(temp_dir, f)
205
-
206
- raise Exception("Audio file not found in output.")
207
-
208
- def _extract_with_ytdlp_robust(self, url, start_time):
209
- """Robust yt-dlp extraction with multiple strategies"""
210
- strategies = [
211
- self._ytdlp_strategy_basic,
212
- self._ytdlp_strategy_with_headers,
213
- self._ytdlp_strategy_low_quality,
214
- self._ytdlp_strategy_audio_only,
215
- ]
216
-
217
- for i, strategy in enumerate(strategies):
218
- try:
219
- print(f"Trying yt-dlp strategy {i+1}...")
220
- result = strategy(url, start_time)
221
- if result:
222
- return result
223
- time.sleep(random.uniform(1, 3))
224
- except Exception as e:
225
- print(f"yt-dlp strategy {i+1} failed: {str(e)}")
226
- continue
227
 
228
- raise Exception("Failed to extract audio with all yt-dlp strategies")
229
-
230
- def _ytdlp_strategy_basic(self, url, start_time):
231
- """Basic yt-dlp strategy"""
232
- temp_dir = tempfile.mkdtemp()
233
- ydl_opts = {
234
- 'format': 'bestaudio[abr<=64]/worst',
235
- 'postprocessors': [{
236
- 'key': 'FFmpegExtractAudio',
237
- 'preferredcodec': 'wav',
238
- 'preferredquality': '192',
239
- }],
240
- 'outtmpl': os.path.join(temp_dir, 'audio.%(ext)s'),
241
- 'quiet': True,
242
- 'no_warnings': True,
243
- 'noplaylist': True,
244
- }
245
-
246
- with suppress_stdout_stderr():
247
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
248
- ydl.download([url])
249
-
250
- return self._find_audio_file(temp_dir, start_time)
251
-
252
- def _ytdlp_strategy_with_headers(self, url, start_time):
253
- """yt-dlp with browser-like headers"""
254
- temp_dir = tempfile.mkdtemp()
255
- ydl_opts = {
256
- 'format': 'bestaudio[abr<=64]/worst',
257
- 'postprocessors': [{
258
- 'key': 'FFmpegExtractAudio',
259
- 'preferredcodec': 'wav',
260
- 'preferredquality': '192',
261
- }],
262
- 'outtmpl': os.path.join(temp_dir, 'audio.%(ext)s'),
263
- 'quiet': True,
264
- 'no_warnings': True,
265
- 'noplaylist': True,
266
- 'http_headers': {
267
- 'User-Agent': random.choice(self.user_agents),
268
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
269
- 'Accept-Language': 'en-US,en;q=0.9',
270
- 'Accept-Encoding': 'gzip, deflate',
271
- 'Connection': 'keep-alive',
272
- },
273
- 'sleep_interval': 1,
274
- 'max_sleep_interval': 3,
275
- }
276
-
277
- with suppress_stdout_stderr():
278
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
279
- ydl.download([url])
280
-
281
- return self._find_audio_file(temp_dir, start_time)
282
-
283
- def _ytdlp_strategy_low_quality(self, url, start_time):
284
- """yt-dlp with lowest quality to avoid detection"""
285
- temp_dir = tempfile.mkdtemp()
286
- ydl_opts = {
287
- 'format': 'worstaudio/worst',
288
- 'postprocessors': [{
289
- 'key': 'FFmpegExtractAudio',
290
- 'preferredcodec': 'wav',
291
- 'preferredquality': '128',
292
- }],
293
- 'outtmpl': os.path.join(temp_dir, 'audio.%(ext)s'),
294
- 'quiet': True,
295
- 'no_warnings': True,
296
- 'noplaylist': True,
297
- 'sleep_interval': 2,
298
- 'max_sleep_interval': 5,
299
- }
300
-
301
- with suppress_stdout_stderr():
302
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
303
- ydl.download([url])
304
 
305
- return self._find_audio_file(temp_dir, start_time)
 
 
306
 
307
- def _ytdlp_strategy_audio_only(self, url, start_time):
308
- """yt-dlp targeting audio-only streams"""
309
- temp_dir = tempfile.mkdtemp()
310
- ydl_opts = {
311
- 'format': 'bestaudio',
312
- 'outtmpl': os.path.join(temp_dir, 'audio.%(ext)s'),
313
- 'postprocessors': [{
314
- 'key': 'FFmpegExtractAudio',
315
- 'preferredcodec': 'wav',
316
- 'preferredquality': '192',
317
- }],
318
- 'prefer_ffmpeg': True,
319
- 'ignoreerrors': True,
320
- 'quiet': True,
321
- 'no_warnings': True,
322
- }
323
-
324
- with suppress_stdout_stderr():
325
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
326
- ydl.download([url])
327
 
328
- return self._find_audio_file(temp_dir, start_time)
 
 
 
 
 
329
 
330
  def _extract_audio_from_video_file(self, video_file, start_time):
331
- """Extract audio from video file using FFmpeg"""
332
  temp_dir = tempfile.mkdtemp()
333
  output_audio = os.path.join(temp_dir, 'extracted_audio.wav')
334
 
335
  try:
 
336
  import subprocess
337
 
338
- # Use FFmpeg to extract audio
339
  cmd = [
340
  'ffmpeg', '-i', video_file,
341
  '-vn', # no video
@@ -353,16 +223,14 @@ class RobustAudioExtractor:
353
  print(f"[⏱️] Audio extraction from video took {end_time - start_time:.2f} seconds.")
354
  return output_audio
355
  else:
356
- raise Exception(f"FFmpeg failed: {result.stderr}")
357
 
358
- except FileNotFoundError:
359
- # Fallback to torchaudio if FFmpeg not available
360
  return self._convert_to_wav(video_file, start_time)
361
- except Exception as e:
362
- raise Exception(f"Failed to extract audio from video: {str(e)}")
363
 
364
  def _convert_to_wav(self, audio_file, start_time):
365
- """Convert audio file to WAV format"""
366
  try:
367
  waveform, sample_rate = torchaudio.load(audio_file)
368
 
@@ -386,65 +254,9 @@ class RobustAudioExtractor:
386
  except Exception as e:
387
  raise Exception(f"Failed to convert audio to WAV: {str(e)}")
388
 
389
- def _find_audio_file(self, directory, start_time):
390
- """Find the extracted audio file"""
391
- audio_extensions = ['.wav', '.mp3', '.m4a', '.ogg', '.aac']
392
-
393
- for file in os.listdir(directory):
394
- if any(file.lower().endswith(ext) for ext in audio_extensions):
395
- audio_path = os.path.join(directory, file)
396
-
397
- # Convert to WAV if not already
398
- if not file.lower().endswith('.wav'):
399
- return self._convert_to_wav(audio_path, start_time)
400
-
401
- end_time = time.time()
402
- print(f"[⏱️] Audio extraction took {end_time - start_time:.2f} seconds.")
403
- return audio_path
404
-
405
- raise Exception("No audio file found after extraction")
406
-
407
- # Update the main function to use the new extractor
408
- def extract_audio_from_video_url(video_source):
409
- """
410
- Main function that handles all types of video sources:
411
- - File paths (uploaded files)
412
- - Direct media URLs
413
- - Loom URLs
414
- - Other video platform URLs
415
- """
416
- extractor = RobustAudioExtractor()
417
- return extractor.extract_audio_from_source(video_source)
418
-
419
- # Keep the existing chunking functions unchanged
420
- def smart_chunk_audio(waveform, sample_rate, duration_minutes):
421
- """Smart chunking based on video duration"""
422
- total_duration = waveform.size(1) / sample_rate
423
- print(f"πŸ“ Video duration: {total_duration/60:.1f} minutes")
424
-
425
- if duration_minutes <= 1:
426
- # Short videos: smaller chunks, process all
427
- chunk_length_sec = 10
428
- return chunk_audio_all(waveform, sample_rate, chunk_length_sec)
429
-
430
- elif duration_minutes <= 5:
431
- # Medium videos: normal chunks, skip some randomly
432
- chunk_length_sec = 20
433
- all_chunks = chunk_audio_all(waveform, sample_rate, chunk_length_sec)
434
- # Keep 70% of chunks randomly
435
- keep_ratio = 0.7
436
- num_keep = max(1, int(len(all_chunks) * keep_ratio))
437
- selected_chunks = random.sample(all_chunks, num_keep)
438
- print(f"πŸ“¦ Selected {len(selected_chunks)} out of {len(all_chunks)} chunks")
439
- return selected_chunks
440
-
441
- else:
442
- # Long videos: strategic sampling from beginning, middle, end
443
- chunk_length_sec = 25
444
- return chunk_audio_strategic(waveform, sample_rate, chunk_length_sec)
445
-
446
- def chunk_audio_all(waveform, sample_rate, chunk_length_sec=20):
447
- """Create all chunks from audio"""
448
  chunk_samples = chunk_length_sec * sample_rate
449
  total_samples = waveform.size(1)
450
  chunks = []
@@ -452,56 +264,19 @@ def chunk_audio_all(waveform, sample_rate, chunk_length_sec=20):
452
  for start in range(0, total_samples, chunk_samples):
453
  end = min(start + chunk_samples, total_samples)
454
  chunk = waveform[:, start:end]
455
- if chunk.size(1) > sample_rate * 3: # ignore very short chunks (3 sec minimum)
456
- chunks.append(chunk)
457
- return chunks
458
-
459
- def chunk_audio_strategic(waveform, sample_rate, chunk_length_sec=25):
460
- """Strategic chunking for long videos - sample from beginning, middle, end"""
461
- total_samples = waveform.size(1)
462
- chunk_samples = chunk_length_sec * sample_rate
463
-
464
- chunks = []
465
-
466
- # Beginning: 2-3 chunks
467
- beginning_chunks = min(3, total_samples // chunk_samples)
468
- for i in range(beginning_chunks):
469
- start = i * chunk_samples
470
- end = min(start + chunk_samples, total_samples)
471
- chunk = waveform[:, start:end]
472
- if chunk.size(1) > sample_rate * 3:
473
  chunks.append(chunk)
474
 
475
- # Middle: 2-3 chunks
476
- middle_start = total_samples // 2 - chunk_samples
477
- middle_chunks = min(3, 2)
478
- for i in range(middle_chunks):
479
- start = middle_start + (i * chunk_samples)
480
- end = min(start + chunk_samples, total_samples)
481
- if start >= 0 and start < total_samples:
482
- chunk = waveform[:, start:end]
483
- if chunk.size(1) > sample_rate * 3:
484
- chunks.append(chunk)
485
-
486
- # End: 2-3 chunks
487
- end_start = total_samples - (3 * chunk_samples)
488
- end_chunks = min(3, 3)
489
- for i in range(end_chunks):
490
- start = max(0, end_start + (i * chunk_samples))
491
- end = min(start + chunk_samples, total_samples)
492
- if start < total_samples:
493
- chunk = waveform[:, start:end]
494
- if chunk.size(1) > sample_rate * 3:
495
- chunks.append(chunk)
496
-
497
- print(f"πŸ“¦ Strategic sampling: {len(chunks)} chunks from long video")
498
  return chunks
499
 
500
  def prepare_audio(video_source):
501
- """Main function to extract and prepare audio chunks"""
502
  try:
503
  print(f"🎡 Extracting audio from source...")
504
- audio_path = extract_audio_from_video_url(video_source)
 
505
  print(f"βœ… Audio extracted to: {audio_path}")
506
 
507
  print(f"🎯 Loading and preparing audio...")
@@ -520,14 +295,14 @@ def prepare_audio(video_source):
520
  end = time.time()
521
  print(f"[⏱️] Audio preparation took {end - start:.2f} seconds.")
522
 
523
- # Calculate duration and apply smart chunking
524
  duration_minutes = waveform.size(1) / sample_rate / 60
525
 
526
- print(f"🧩 Smart chunking based on duration...")
527
  start = time.time()
528
- chunks = smart_chunk_audio(waveform, sample_rate, duration_minutes)
529
  end = time.time()
530
- print(f"[⏱️] Smart chunking took {end - start:.2f} seconds. Total chunks: {len(chunks)}")
531
 
532
  return {
533
  "success": True,
 
4
  import warnings
5
  import time
6
  import shutil
 
7
  import requests
8
  from urllib.parse import urlparse, unquote
9
  from pathlib import Path
 
29
  sys.stdout = old_stdout
30
  sys.stderr = old_stderr
31
 
32
+ class SimpleAudioExtractor:
33
  def __init__(self):
34
+ self.supported_video_formats = ['.mp4', '.webm', '.avi', '.mov', '.mkv', '.m4v']
35
  self.supported_audio_formats = ['.mp3', '.wav', '.m4a', '.aac', '.ogg', '.flac']
36
+ self.user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
 
 
 
 
37
 
38
  def extract_audio_from_source(self, source):
39
+ """Extract audio from file path, direct media URL, or Loom URL"""
 
 
 
 
 
 
40
  start_time = time.time()
41
 
42
  # Check if source is a file path
 
54
  print(f"πŸŽ₯ Processing Loom URL: {source}")
55
  return self._extract_from_loom(source, start_time)
56
 
57
+ raise Exception("Unsupported URL format. Please use Loom URLs or direct media links.")
 
 
58
 
59
  def _is_file_path(self, source):
60
  """Check if source is a local file path"""
 
82
  try:
83
  file_ext = Path(file_path).suffix.lower()
84
 
85
+ # If it's already an audio file, convert to WAV if needed
86
  if file_ext in self.supported_audio_formats:
87
  if file_ext == '.wav':
88
  end_time = time.time()
89
  print(f"[⏱️] Audio file processing took {end_time - start_time:.2f} seconds.")
90
  return file_path
91
  else:
 
92
  return self._convert_to_wav(file_path, start_time)
93
 
94
  # If it's a video file, extract audio
 
107
 
108
  try:
109
  headers = {
110
+ 'User-Agent': self.user_agent,
111
  'Accept': '*/*',
112
  'Accept-Language': 'en-US,en;q=0.9',
 
113
  'Connection': 'keep-alive',
 
114
  }
115
 
116
+ response = requests.get(url, headers=headers, stream=True, timeout=60)
117
  response.raise_for_status()
118
 
119
+ # Determine file extension from URL or content type
120
+ parsed_url = urlparse(url)
121
+ url_ext = Path(parsed_url.path).suffix.lower()
122
+
123
+ if url_ext in self.supported_video_formats + self.supported_audio_formats:
124
+ ext = url_ext
125
+ else:
126
+ # Try to get from content type
127
+ content_type = response.headers.get('content-type', '').lower()
128
+ if 'video' in content_type:
129
  ext = '.mp4'
130
+ elif 'audio' in content_type:
 
 
 
 
 
131
  ext = '.mp3'
 
 
132
  else:
133
+ ext = '.mp4' # default
 
 
 
 
 
134
 
135
  downloaded_file = os.path.join(temp_dir, f'downloaded{ext}')
136
 
 
157
  shutil.rmtree(temp_dir, ignore_errors=True)
158
  raise Exception(f"Failed to download direct media: {str(e)}")
159
 
160
+ def _extract_from_loom(self, url, start_time):
161
+ """Extract audio from Loom URL using yt-dlp"""
 
162
  temp_dir = tempfile.mkdtemp()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
 
164
+ try:
165
+ ydl_opts = {
166
+ 'format': 'bestaudio/best',
167
+ 'postprocessors': [{
168
+ 'key': 'FFmpegExtractAudio',
169
+ 'preferredcodec': 'wav',
170
+ 'preferredquality': '192',
171
+ }],
172
+ 'outtmpl': os.path.join(temp_dir, 'loom_audio.%(ext)s'),
173
+ 'quiet': True,
174
+ 'no_warnings': True,
175
+ 'noplaylist': True,
176
+ 'http_headers': {
177
+ 'User-Agent': self.user_agent,
178
+ },
179
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
 
181
+ with suppress_stdout_stderr():
182
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
183
+ ydl.download([url])
184
 
185
+ # Find the extracted audio file
186
+ for file in os.listdir(temp_dir):
187
+ if file.endswith('.wav'):
188
+ audio_path = os.path.join(temp_dir, file)
189
+ end_time = time.time()
190
+ print(f"[⏱️] Loom audio extraction took {end_time - start_time:.2f} seconds.")
191
+ return audio_path
 
 
 
 
 
 
 
 
 
 
 
 
 
192
 
193
+ raise Exception("Audio file not found after Loom extraction")
194
+
195
+ except Exception as e:
196
+ if os.path.exists(temp_dir):
197
+ shutil.rmtree(temp_dir, ignore_errors=True)
198
+ raise Exception(f"Failed to extract from Loom: {str(e)}")
199
 
200
  def _extract_audio_from_video_file(self, video_file, start_time):
201
+ """Extract audio from video file using FFmpeg or torchaudio"""
202
  temp_dir = tempfile.mkdtemp()
203
  output_audio = os.path.join(temp_dir, 'extracted_audio.wav')
204
 
205
  try:
206
+ # Try FFmpeg first
207
  import subprocess
208
 
 
209
  cmd = [
210
  'ffmpeg', '-i', video_file,
211
  '-vn', # no video
 
223
  print(f"[⏱️] Audio extraction from video took {end_time - start_time:.2f} seconds.")
224
  return output_audio
225
  else:
226
+ raise Exception("FFmpeg failed, trying torchaudio...")
227
 
228
+ except (FileNotFoundError, Exception):
229
+ # Fallback to torchaudio
230
  return self._convert_to_wav(video_file, start_time)
 
 
231
 
232
  def _convert_to_wav(self, audio_file, start_time):
233
+ """Convert audio file to WAV format using torchaudio"""
234
  try:
235
  waveform, sample_rate = torchaudio.load(audio_file)
236
 
 
254
  except Exception as e:
255
  raise Exception(f"Failed to convert audio to WAV: {str(e)}")
256
 
257
+ def chunk_audio_1min(waveform, sample_rate):
258
+ """Create 1-minute chunks from audio"""
259
+ chunk_length_sec = 60 # 1 minute chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
  chunk_samples = chunk_length_sec * sample_rate
261
  total_samples = waveform.size(1)
262
  chunks = []
 
264
  for start in range(0, total_samples, chunk_samples):
265
  end = min(start + chunk_samples, total_samples)
266
  chunk = waveform[:, start:end]
267
+ # Only include chunks that are at least 10 seconds long
268
+ if chunk.size(1) > sample_rate * 10:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  chunks.append(chunk)
270
 
271
+ print(f"πŸ“¦ Created {len(chunks)} 1-minute chunks")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
  return chunks
273
 
274
  def prepare_audio(video_source):
275
+ """Main function to extract and prepare 1-minute audio chunks"""
276
  try:
277
  print(f"🎡 Extracting audio from source...")
278
+ extractor = SimpleAudioExtractor()
279
+ audio_path = extractor.extract_audio_from_source(video_source)
280
  print(f"βœ… Audio extracted to: {audio_path}")
281
 
282
  print(f"🎯 Loading and preparing audio...")
 
295
  end = time.time()
296
  print(f"[⏱️] Audio preparation took {end - start:.2f} seconds.")
297
 
298
+ # Calculate duration and create 1-minute chunks
299
  duration_minutes = waveform.size(1) / sample_rate / 60
300
 
301
+ print(f"🧩 Creating 1-minute chunks...")
302
  start = time.time()
303
+ chunks = chunk_audio_1min(waveform, sample_rate)
304
  end = time.time()
305
+ print(f"[⏱️] Chunking took {end - start:.2f} seconds. Total chunks: {len(chunks)}")
306
 
307
  return {
308
  "success": True,