Amr-h commited on
Commit
70f7db6
Β·
0 Parent(s):
.gitignore ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ *.egg-info/
23
+ .installed.cfg
24
+ *.egg
25
+
26
+ # PyInstaller
27
+ # Usually these files are written by a python script from a template
28
+ *.manifest
29
+ *.spec
30
+
31
+ # Installer logs
32
+ pip-log.txt
33
+ pip-delete-this-directory.txt
34
+
35
+ # Unit test / coverage reports
36
+ htmlcov/
37
+ .tox/
38
+ .nox/
39
+ .coverage
40
+ .coverage.*
41
+ .cache
42
+ nosetests.xml
43
+ coverage.xml
44
+ *.cover
45
+ *.py,cover
46
+ .hypothesis/
47
+ .pytest_cache/
48
+
49
+ # Jupyter Notebook checkpoints
50
+ .ipynb_checkpoints/
51
+
52
+ # IPython history
53
+ profile_default/
54
+ ipython_config.py
55
+
56
+ # PyTorch save files
57
+ *.pt
58
+ *.pth
59
+ *.ckpt
60
+
61
+ # Torchaudio temp files or cache (if any)
62
+ *.wav
63
+
64
+ # Hugging Face Hub cache and tokens
65
+ cache/
66
+ hf_home/
67
+ *.json
68
+ *.token
69
+
70
+ # VS Code settings
71
+ .vscode/
72
+
73
+ # IDE settings
74
+ .idea/
75
+ *.iml
76
+
77
+ # macOS files
78
+ .DS_Store
79
+
80
+ # Logs and temp files
81
+ *.log
82
+ *.tmp
83
+ *.temp
84
+
85
+ # Temp directories used in your code
86
+ tmp/
87
+ temp/
88
+ temp_dir/
89
+ tmp_dir/
90
+
91
+ # yt-dlp downloads (temporary audio files)
92
+ *.m4a
93
+ *.mp3
94
+ *.webm
95
+ *.mkv
96
+
97
+ # dotenv environment variables file
98
+ .env
99
+
100
+ # Python virtual environments
101
+ venv/
102
+ env/
103
+ .venv/
104
+ .env/
105
+
106
+ # Misc
107
+ *.bak
108
+ *.swp
README.md ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: AI Accent Analyzer
3
+ emoji: 🎀
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: streamlit
7
+ sdk_version: 1.28.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ # 🎀 AI Accent Analyzer
14
+
15
+ Analyze accents from YouTube videos using advanced AI models with confidence-based filtering.
16
+
17
+ ## Features
18
+ - πŸŽ₯ YouTube video support (including Shorts)
19
+ - 🧠 SpeechBrain AI model for accent detection
20
+ - πŸ“Š Confidence-based filtering (configurable threshold)
21
+ - ⚑ Early stopping mechanism
22
+ - πŸ“ˆ Interactive visualizations
23
+ - πŸ“₯ Export results (CSV/JSON)
24
+
25
+ ## How to Use
26
+ 1. Paste a YouTube video URL
27
+ 2. Adjust confidence threshold if needed
28
+ 3. Click "Analyze Accent"
29
+ 4. View detailed results and visualizations
30
+
31
+ Built with Streamlit, SpeechBrain, and Plotly.
analyze.txt ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ============================================================
2
+ 🎡 Extracting and preparing audio...
3
+ [⏱️] Audio extraction took 11.33 seconds.
4
+ βœ… Audio prepared in 11.56s | Duration: 1.8 minutes
5
+ 🧠 Loading model...
6
+ βœ… Model loaded in 1.63s
7
+
8
+ ============================================================
9
+ πŸ“Š CHUNK SIZE ANALYSIS RESULTS
10
+ ============================================================
11
+
12
+ 🧩 Testing 10-second chunks...
13
+ CategoricalEncoder.expect_len was never called: assuming category count of 16 to be correct! Sanity check your encoder using `.expect_len`. Ensure that downstream code also uses the correct size. If you are sure this does not apply to you, use `.ignore_len`.
14
+ πŸ“¦ Chunks created: 11
15
+ ⏱️ Chunking time: 0.001s
16
+ 🧠 Prediction time: 23.477s
17
+ πŸ”„ Total processing: 23.478s
18
+ ⚑ Processing rate: 0.5 chunks/sec
19
+ πŸ“ˆ Avg confidence: 0.666
20
+ 🎯 Most common: indian (5 times)
21
+ πŸ“Š Confidence range: 0.592 - 0.797
22
+
23
+ 🧩 Testing 15-second chunks...
24
+ πŸ“¦ Chunks created: 8
25
+ ⏱️ Chunking time: 0.000s
26
+ 🧠 Prediction time: 25.102s
27
+ πŸ”„ Total processing: 25.102s
28
+ ⚑ Processing rate: 0.3 chunks/sec
29
+ πŸ“ˆ Avg confidence: 0.681
30
+ 🎯 Most common: england (4 times)
31
+ πŸ“Š Confidence range: 0.602 - 0.849
32
+
33
+ 🧩 Testing 20-second chunks...
34
+ πŸ“¦ Chunks created: 6
35
+ ⏱️ Chunking time: 0.000s
36
+ 🧠 Prediction time: 26.239s
37
+ πŸ”„ Total processing: 26.239s
38
+ ⚑ Processing rate: 0.2 chunks/sec
39
+ πŸ“ˆ Avg confidence: 0.671
40
+ 🎯 Most common: england (4 times)
41
+ πŸ“Š Confidence range: 0.603 - 0.733
42
+
43
+ 🧩 Testing 30-second chunks...
44
+ πŸ“¦ Chunks created: 4
45
+ ⏱️ Chunking time: 0.000s
46
+ 🧠 Prediction time: 28.015s
47
+ πŸ”„ Total processing: 28.015s
48
+ ⚑ Processing rate: 0.1 chunks/sec
49
+ πŸ“ˆ Avg confidence: 0.659
50
+ 🎯 Most common: england (2 times)
51
+ πŸ“Š Confidence range: 0.559 - 0.714
52
+
53
+ 🧩 Testing 60-second chunks...
54
+ πŸ“¦ Chunks created: 2
55
+ ⏱️ Chunking time: 0.000s
56
+ 🧠 Prediction time: 25.356s
57
+ πŸ”„ Total processing: 25.356s
58
+ ⚑ Processing rate: 0.1 chunks/sec
59
+ πŸ“ˆ Avg confidence: 0.714
60
+ 🎯 Most common: indian (2 times)
61
+ πŸ“Š Confidence range: 0.667 - 0.760
62
+
63
+ ================================================================================
64
+ πŸ“ˆ PERFORMANCE COMPARISON SUMMARY
65
+ ================================================================================
66
+ Size Chunks Total Time Rate Avg Conf Consistency Winner
67
+ --------------------------------------------------------------------------------
68
+ 10 11 23.478 0.5 0.666 0.90 indian
69
+ 15 8 25.102 0.3 0.681 0.88 england
70
+ 20 6 26.239 0.2 0.671 0.93 england
71
+ 30 4 28.015 0.1 0.659 0.89 england
72
+ 60 2 25.356 0.1 0.714 0.91 indian
73
+
74
+ ============================================================
75
+ πŸ† RECOMMENDATIONS
76
+ ============================================================
77
+ ⚑ Fastest processing: 10s chunks (23.48s total)
78
+ 🎯 Highest accuracy: 60s chunks (0.714 avg confidence)
79
+ πŸ“Š Most consistent: 20s chunks (0.926 consistency)
80
+ βš–οΈ Best balance: 60s chunks (score: 42.8)
app.py ADDED
@@ -0,0 +1,519 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import plotly.express as px
4
+ import plotly.graph_objects as go
5
+ from plotly.subplots import make_subplots
6
+ import time
7
+ import re
8
+ from datetime import datetime
9
+ import numpy as np
10
+ from dialect_predector import analyze_video_accent
11
+
12
+ # Import your accent analysis function
13
+ # from your_accent_module import analyze_video_accent
14
+
15
# Page configuration
st.set_page_config(
    page_title="🎤 AI Accent Analyzer",
    page_icon="🎤",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS for beautiful styling.
# Injected once at script start; the class names below are referenced by the
# raw-HTML st.markdown(...) snippets rendered throughout the app.
st.markdown("""
<style>
    .main-header {
        background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
        padding: 2rem;
        border-radius: 10px;
        color: white;
        text-align: center;
        margin-bottom: 2rem;
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
    }

    .metric-card {
        background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
        padding: 1.5rem;
        border-radius: 10px;
        box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
        margin: 0.5rem 0;
        border-left: 4px solid #667eea;
    }

    .analysis-section {
        background: white;
        padding: 1.5rem;
        border-radius: 10px;
        box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1);
        margin: 1rem 0;
        border: 1px solid #e0e6ed;
    }

    .accent-tag {
        display: inline-block;
        padding: 0.3rem 0.8rem;
        margin: 0.2rem;
        border-radius: 20px;
        font-weight: bold;
        font-size: 0.9rem;
    }

    .accent-primary {
        background: linear-gradient(45deg, #667eea, #764ba2);
        color: white;
    }

    .accent-secondary {
        background: linear-gradient(45deg, #ffecd2, #fcb69f);
        color: #333;
    }

    .processing-animation {
        display: flex;
        justify-content: center;
        align-items: center;
        padding: 2rem;
    }

    .confidence-bar {
        background: linear-gradient(90deg, #ff6b6b, #feca57, #48cae4, #06ffa5);
        height: 20px;
        border-radius: 10px;
        margin: 0.5rem 0;
    }

    .chunk-result {
        background: #f8f9fa;
        border-left: 4px solid #28a745;
        padding: 0.8rem;
        margin: 0.3rem 0;
        border-radius: 5px;
    }

    .chunk-result.low-confidence {
        border-left-color: #ffc107;
    }

    .sidebar-info {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        padding: 1rem;
        border-radius: 10px;
        margin-bottom: 1rem;
    }
</style>
""", unsafe_allow_html=True)
108
+
109
def validate_url(url):
    """Return True when *url* matches one of the recognized YouTube URL shapes.

    Accepts youtube.com / youtu.be / youtube-nocookie.com links, with or
    without scheme and "www.". Matching is anchored at the start of the string.
    """
    youtube_patterns = (
        r'(https?://)?(www\.)?(youtube|youtu|youtube-nocookie)\.(com|be)/',
        r'(https?://)?(www\.)?youtube\.com/shorts/',
        r'(https?://)?(www\.)?youtu\.be/',
    )
    return any(re.match(pattern, url) for pattern in youtube_patterns)
121
+
122
def create_confidence_gauge(confidence):
    """Render *confidence* (a 0-1 float) as a plotly gauge scaled to 0-100.

    The delta reference is 70 and a red threshold line sits at 90.
    """
    percent = confidence * 100
    gauge_spec = {
        'axis': {'range': [None, 100]},
        'bar': {'color': "darkblue"},
        'steps': [
            {'range': [0, 50], 'color': "lightgray"},
            {'range': [50, 80], 'color': "yellow"},
            {'range': [80, 100], 'color': "green"},
        ],
        'threshold': {
            'line': {'color': "red", 'width': 4},
            'thickness': 0.75,
            'value': 90,
        },
    }
    indicator = go.Indicator(
        mode="gauge+number+delta",
        value=percent,
        domain={'x': [0, 1], 'y': [0, 1]},
        title={'text': "Confidence Score"},
        delta={'reference': 70},
        gauge=gauge_spec,
    )
    fig = go.Figure(indicator)
    fig.update_layout(height=300, margin=dict(l=20, r=20, t=40, b=20))
    return fig
148
+
149
def create_accent_distribution_chart(accent_counts, title="Accent Distribution"):
    """Build a pie chart from an {accent: count} mapping.

    Returns None when the mapping is empty so callers can skip rendering.
    """
    if not accent_counts:
        return None

    labels = list(accent_counts.keys())
    values = list(accent_counts.values())

    fig = px.pie(
        values=values,
        names=labels,
        title=title,
        color_discrete_sequence=px.colors.qualitative.Set3,
    )
    fig.update_traces(
        textposition='inside',
        textinfo='percent+label',
        hovertemplate='<b>%{label}</b><br>Count: %{value}<br>Percentage: %{percent}<extra></extra>',
    )
    fig.update_layout(
        height=400,
        margin=dict(l=20, r=20, t=40, b=20),
        font=dict(size=12),
    )
    return fig
177
+
178
def create_chunk_confidence_chart(chunk_results):
    """Plot per-chunk confidence as a line chart, colored by predicted accent.

    Expects a list of dicts with 'chunk', 'confidence', 'accent' and
    'is_confident' keys; returns None when the list is empty.
    """
    if not chunk_results:
        return None

    frame = pd.DataFrame(chunk_results)
    fig = px.line(
        frame,
        x='chunk',
        y='confidence',
        title='Confidence Score Across Audio Chunks',
        markers=True,
        color='accent',
        hover_data=['accent', 'is_confident'],
    )

    # Visualize the 60% cut-off used when filtering predictions
    fig.add_hline(
        y=0.6,
        line_dash="dash",
        line_color="red",
        annotation_text="Confidence Threshold (60%)",
    )

    fig.update_layout(
        height=400,
        xaxis_title="Chunk Number",
        yaxis_title="Confidence Score",
        margin=dict(l=20, r=20, t=40, b=20),
    )
    return fig
206
+
207
def create_detailed_analysis(result):
    """Render the "Detailed Analysis" section for a successful result dict.

    Reads 'predicted_accent', 'confidence_percentage', 'confidence_score',
    'processed_chunks_count', 'available_chunks_count' and 'processing_time'
    directly (KeyError if missing); all other keys are read defensively with
    .get(). Renders nothing for a missing or failed result.
    """
    if not result or not result.get("success"):
        return

    st.markdown('<div class="analysis-section">', unsafe_allow_html=True)
    st.markdown("## 📊 Detailed Analysis")

    # Key metrics: four styled cards in one row
    col1, col2, col3, col4 = st.columns(4)

    with col1:
        st.markdown('<div class="metric-card">', unsafe_allow_html=True)
        st.metric(
            "🎯 Final Accent",
            result['predicted_accent'],
            f"{result['confidence_percentage']}"
        )
        st.markdown('</div>', unsafe_allow_html=True)

    with col2:
        st.markdown('<div class="metric-card">', unsafe_allow_html=True)
        st.metric(
            "📦 Chunks Processed",
            f"{result['processed_chunks_count']}/{result['available_chunks_count']}",
            f"Confident: {result.get('confident_chunks_count', 0)}"
        )
        st.markdown('</div>', unsafe_allow_html=True)

    with col3:
        st.markdown('<div class="metric-card">', unsafe_allow_html=True)
        st.metric(
            "⏱️ Processing Time",
            f"{result['processing_time']:.1f}s",
            # Delta shows audio length only when duration is known and non-zero
            f"Audio: {result.get('duration_minutes', 0):.1f}min" if result.get('duration_minutes') else ""
        )
        st.markdown('</div>', unsafe_allow_html=True)

    with col4:
        st.markdown('<div class="metric-card">', unsafe_allow_html=True)
        early_stopped_text = "Yes ⚡" if result.get('early_stopped') else "No 🔄"
        st.metric(
            "🛑 Early Stopped",
            early_stopped_text,
            f"Threshold: {result.get('confidence_threshold', 0.6)*100:.0f}%"
        )
        st.markdown('</div>', unsafe_allow_html=True)

    st.markdown('</div>', unsafe_allow_html=True)

    # Charts section: gauge + confident-pie on the left, per-chunk line +
    # all-predictions pie on the right
    col1, col2 = st.columns(2)

    with col1:
        # Confidence gauge
        gauge_fig = create_confidence_gauge(result['confidence_score'])
        st.plotly_chart(gauge_fig, use_container_width=True)

        # Accent distribution (confident predictions only)
        if result.get('confident_accent_counts'):
            pie_fig = create_accent_distribution_chart(
                result['confident_accent_counts'],
                "Confident Predictions Distribution"
            )
            if pie_fig:
                st.plotly_chart(pie_fig, use_container_width=True)

    with col2:
        # Chunk confidence over time
        if result.get('chunk_results'):
            confidence_fig = create_chunk_confidence_chart(result['chunk_results'])
            if confidence_fig:
                st.plotly_chart(confidence_fig, use_container_width=True)

        # All predictions distribution — only worth a chart when more than
        # one accent was ever predicted
        if result.get('all_accent_counts') and len(result['all_accent_counts']) > 1:
            all_pie_fig = create_accent_distribution_chart(
                result['all_accent_counts'],
                "All Predictions Distribution"
            )
            if all_pie_fig:
                st.plotly_chart(all_pie_fig, use_container_width=True)
289
+
290
def display_chunk_details(chunk_results, confidence_threshold=0.6):
    """Render summary stats and an expandable per-chunk breakdown.

    A chunk counts as confident when its 'is_confident' flag is set, falling
    back to confidence > confidence_threshold when the flag is absent.
    Renders nothing when chunk_results is empty.
    """
    if not chunk_results:
        return

    st.markdown("### 🔍 Chunk-by-Chunk Analysis")

    # Summary statistics
    confident_chunks = [r for r in chunk_results if r.get('is_confident', r['confidence'] > confidence_threshold)]

    col1, col2, col3 = st.columns(3)
    with col1:
        st.info(f"**Total Chunks:** {len(chunk_results)}")
    with col2:
        st.success(f"**Confident Chunks:** {len(confident_chunks)}")
    with col3:
        confidence_rate = len(confident_chunks) / len(chunk_results) * 100 if chunk_results else 0
        st.warning(f"**Confidence Rate:** {confidence_rate:.1f}%")

    # Detailed results, one styled card per chunk
    with st.expander("📋 View Detailed Chunk Results", expanded=False):
        for i, result in enumerate(chunk_results):
            confidence = result['confidence']
            is_confident = result.get('is_confident', confidence > confidence_threshold)

            confidence_emoji = "✅" if is_confident else "⚠️"
            # low-confidence CSS class switches the card's border color
            confidence_class = "" if is_confident else "low-confidence"

            st.markdown(f"""
            <div class="chunk-result {confidence_class}">
                <strong>Chunk {result['chunk']}</strong> {confidence_emoji}<br>
                <strong>Accent:</strong> {result['accent']}<br>
                <strong>Confidence:</strong> {confidence:.3f} ({confidence*100:.1f}%)<br>
                <strong>Status:</strong> {'Confident' if is_confident else 'Low Confidence'}
            </div>
            """, unsafe_allow_html=True)
326
+
327
def main():
    """Top-level Streamlit page: input form, analysis trigger and results UI."""
    # Header banner
    st.markdown("""
    <div class="main-header">
        <h1>🎤 AI Accent Analyzer</h1>
        <p>Analyze accents from YouTube videos using advanced AI models</p>
    </div>
    """, unsafe_allow_html=True)

    # Sidebar: tunable parameters and a short how-it-works guide
    with st.sidebar:
        st.markdown("""
        <div class="sidebar-info">
            <h3>🔧 Configuration</h3>
            <p>Adjust analysis parameters</p>
        </div>
        """, unsafe_allow_html=True)

        confidence_threshold = st.slider(
            "🎯 Confidence Threshold",
            min_value=0.1,
            max_value=0.9,
            value=0.6,
            step=0.05,
            help="Only predictions above this confidence level are considered reliable"
        )

        # NOTE(review): this value is never passed to analyze_video_accent
        # below — confirm whether the analysis function should accept it.
        early_stopping_threshold = st.slider(
            "⚡ Early Stopping Threshold",
            min_value=2,
            max_value=10,
            value=3,
            help="Stop processing after this many consecutive confident predictions"
        )

        st.markdown("---")

        st.markdown("""
        ### 📋 Supported Formats
        - YouTube videos
        - YouTube Shorts
        - YouTube Music
        - Youtu.be links

        ### ⚙️ How it works
        1. **Audio Extraction**: Extracts audio from video
        2. **Chunking**: Splits audio into manageable segments
        3. **AI Analysis**: Uses SpeechBrain model for accent detection
        4. **Confidence Filtering**: Only considers high-confidence predictions
        5. **Results**: Provides detailed analysis and visualization
        """)

    # Main interface
    st.markdown("## 🔗 Enter Video URL")

    # URL input with example buttons alongside
    col1, col2 = st.columns([3, 1])

    with col1:
        video_url = st.text_input(
            "YouTube Video URL",
            placeholder="https://www.youtube.com/watch?v=example or https://youtu.be/example",
            help="Paste any YouTube video URL here"
        )

    with col2:
        st.markdown("**Quick Examples:**")
        example_urls = [
            "https://www.youtube.com/shorts/mxMzNp3RfpA",
            "https://youtu.be/dQw4w9WgXcQ",
            "https://www.youtube.com/watch?v=example"
        ]

        # Clicking an example stores it in session state and reruns the
        # script so it can be picked up on the next pass.
        for i, url in enumerate(example_urls):
            if st.button(f"Example {i+1}", key=f"example_{i}"):
                st.session_state.example_url = url
                st.rerun()

    # Use example URL if selected (consumed so it only applies once)
    if hasattr(st.session_state, 'example_url'):
        video_url = st.session_state.example_url
        delattr(st.session_state, 'example_url')

    # URL validation — st.stop() halts the script run on invalid input
    if video_url:
        if validate_url(video_url):
            st.success("✅ Valid YouTube URL detected!")
        else:
            st.error("❌ Please enter a valid YouTube URL")
            st.stop()

    # Analysis button (disabled until a URL is present)
    if st.button("🚀 Analyze Accent", type="primary", disabled=not video_url):
        if not video_url:
            st.warning("Please enter a video URL first!")
            return

        # Progress tracking widgets
        progress_bar = st.progress(0)
        status_text = st.empty()

        try:
            # NOTE(review): the staged percentages and sleeps below are
            # cosmetic — they do not reflect actual pipeline progress.
            status_text.text("🔄 Initializing analysis...")
            progress_bar.progress(10)
            time.sleep(1)

            status_text.text("🎵 Extracting audio from video...")
            progress_bar.progress(30)
            time.sleep(1)

            status_text.text("🧠 Loading AI model...")
            progress_bar.progress(50)
            time.sleep(1)

            status_text.text("🔍 Analyzing accent patterns...")
            progress_bar.progress(80)

            # Run the real accent-analysis pipeline
            result = analyze_video_accent(video_url, confidence_threshold)

            progress_bar.progress(100)
            status_text.text("✅ Analysis complete!")
            time.sleep(0.5)

            # Clear progress indicators
            progress_bar.empty()
            status_text.empty()

            # Display results
            if result["success"]:
                st.success("🎉 Analysis completed successfully!")

                # Main result highlight
                st.markdown(f"""
                <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
                            color: white; padding: 2rem; border-radius: 15px; text-align: center; margin: 2rem 0;">
                    <h2>🎤 Detected Accent: {result['predicted_accent']}</h2>
                    <h3>📊 Confidence: {result['confidence_percentage']}</h3>
                </div>
                """, unsafe_allow_html=True)

                # Detailed analysis section (metrics + charts)
                create_detailed_analysis(result)

                # Per-chunk breakdown
                if result.get('chunk_results'):
                    display_chunk_details(result['chunk_results'], confidence_threshold)

                # Raw data download
                with st.expander("📥 Download Results", expanded=False):
                    # Convert per-chunk results to a DataFrame for CSV export
                    if result.get('chunk_results'):
                        df = pd.DataFrame(result['chunk_results'])
                        csv = df.to_csv(index=False)
                        st.download_button(
                            label="📊 Download Chunk Results (CSV)",
                            data=csv,
                            file_name=f"accent_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
                            mime="text/csv"
                        )

                    # JSON download (default=str stringifies values json
                    # cannot serialize natively)
                    import json
                    json_str = json.dumps(result, indent=2, default=str)
                    st.download_button(
                        label="📋 Download Full Results (JSON)",
                        data=json_str,
                        file_name=f"accent_analysis_full_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
                        mime="application/json"
                    )
            else:
                st.error(f"❌ Analysis failed: {result.get('error', 'Unknown error')}")

        except Exception as e:
            # Clean up progress UI before surfacing the error
            progress_bar.empty()
            status_text.empty()
            st.error(f"❌ An error occurred during analysis: {str(e)}")

    # Footer
    st.markdown("---")
    st.markdown("""
    <div style="text-align: center; color: #666; margin-top: 2rem;">
        <p>🎤 AI Accent Analyzer | Built with Streamlit & SpeechBrain</p>
        <p>Analyze accents from YouTube videos with confidence-based filtering</p>
    </div>
    """, unsafe_allow_html=True)
517
+
518
+ if __name__ == "__main__":
519
+ main()
audio_extractor.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import tempfile
4
+ import warnings
5
+ import time
6
+ import shutil
7
+ import random
8
+
9
+ import torch
10
+ import torchaudio
11
+ import yt_dlp
12
+ from contextlib import contextmanager
13
+
14
+ warnings.filterwarnings("ignore")
15
+ os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'
16
+
17
@contextmanager
def suppress_stdout_stderr():
    """Temporarily redirect stdout and stderr to os.devnull.

    Both streams are restored even if the wrapped code raises. Used to keep
    yt-dlp's console chatter out of the app's output.
    """
    saved_out, saved_err = sys.stdout, sys.stderr
    with open(os.devnull, "w") as sink:
        sys.stdout = sink
        sys.stderr = sink
        try:
            yield
        finally:
            sys.stdout, sys.stderr = saved_out, saved_err
29
+
30
def extract_audio_from_video_url(video_url):
    """Download *video_url* with yt-dlp and return the path to a WAV file.

    The WAV is written into a fresh mkdtemp() directory; the caller owns the
    returned file (and its directory) and is responsible for removing it.

    Raises:
        Exception: if no WAV file was produced (directory is cleaned up first).

    Fix over original: the temporary directory was leaked whenever the
    download failed or produced no WAV; it is now removed on every failure
    path before the exception propagates.
    """
    start_time = time.time()
    temp_dir = tempfile.mkdtemp()
    ydl_opts = {
        'format': 'bestaudio[abr<=64]',  # low-bitrate audio is enough for accent detection
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
        'outtmpl': os.path.join(temp_dir, 'audio.%(ext)s'),
        'quiet': True,
        'no_warnings': True,
        'noplaylist': True,
    }

    try:
        # yt-dlp still prints through warnings/logging; silence it entirely
        with suppress_stdout_stderr():
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([video_url])

        for file in os.listdir(temp_dir):
            if file.endswith('.wav'):
                print(f"[⏱️] Audio extraction took {time.time() - start_time:.2f} seconds.")
                return os.path.join(temp_dir, file)
    except Exception:
        # Download/postprocessing failed: don't leak the temp directory
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise

    # Download "succeeded" but no WAV appeared: clean up and report
    shutil.rmtree(temp_dir, ignore_errors=True)
    raise Exception("Failed to extract audio in WAV format")
56
+
57
+
58
+
59
def smart_chunk_audio(waveform, sample_rate, duration_minutes):
    """Choose a chunking strategy based on the audio's length.

    <= 1 min: 10 s chunks, keep all.
    <= 5 min: 20 s chunks, keep a random 70% (at least one).
    longer:   25 s chunks sampled strategically from start/middle/end.
    """
    total_duration = waveform.size(1) / sample_rate
    print(f"📏 Video duration: {total_duration/60:.1f} minutes")

    if duration_minutes <= 1:
        # Short clip: small chunks, process everything
        return chunk_audio_all(waveform, sample_rate, 10)

    if duration_minutes <= 5:
        # Medium clip: normal chunks, randomly drop ~30% to save time
        candidates = chunk_audio_all(waveform, sample_rate, 20)
        num_keep = max(1, int(len(candidates) * 0.7))
        selected_chunks = random.sample(candidates, num_keep)
        print(f"📦 Selected {len(selected_chunks)} out of {len(candidates)} chunks")
        return selected_chunks

    # Long clip: sample representative chunks from beginning, middle, end
    return chunk_audio_strategic(waveform, sample_rate, 25)
84
+
85
def chunk_audio_all(waveform, sample_rate, chunk_length_sec=20):
    """Split the waveform into consecutive fixed-length chunks.

    The final fragment is kept only if it is longer than 3 seconds; shorter
    leftovers are discarded.
    """
    step = chunk_length_sec * sample_rate
    total = waveform.size(1)
    min_samples = sample_rate * 3  # drop fragments of 3 s or less
    return [
        waveform[:, start:min(start + step, total)]
        for start in range(0, total, step)
        if min(start + step, total) - start > min_samples
    ]
97
+
98
def chunk_audio_strategic(waveform, sample_rate, chunk_length_sec=25):
    """Strategic chunking for long videos: sample beginning, middle and end.

    Takes up to 3 chunks from the start, 2 around the midpoint, and 3
    covering the final stretch; fragments of 3 seconds or less are dropped.

    Fixes over original: the no-op `min(3, 2)` / `min(3, 3)` leftovers are
    replaced with the constants they always evaluated to, and the triplicated
    slice-and-append code is factored into one helper.

    NOTE(review): for shorter inputs the three regions can overlap, so the
    same audio span may appear more than once — confirm this is acceptable
    downstream before deduplicating.
    """
    total_samples = waveform.size(1)
    chunk_samples = chunk_length_sec * sample_rate
    min_samples = sample_rate * 3  # discard fragments of 3 s or less
    chunks = []

    def _take(start):
        # Append the chunk starting at *start* if it is long enough.
        end = min(start + chunk_samples, total_samples)
        chunk = waveform[:, start:end]
        if chunk.size(1) > min_samples:
            chunks.append(chunk)

    # Beginning: up to 3 full chunks from the front
    for i in range(min(3, total_samples // chunk_samples)):
        _take(i * chunk_samples)

    # Middle: 2 chunks centered around the midpoint (skipped when the
    # computed start falls outside the signal)
    middle_start = total_samples // 2 - chunk_samples
    for i in range(2):
        start = middle_start + i * chunk_samples
        if 0 <= start < total_samples:
            _take(start)

    # End: 3 chunks covering the final stretch (clamped to the signal start)
    end_start = total_samples - 3 * chunk_samples
    for i in range(3):
        start = max(0, end_start + i * chunk_samples)
        if start < total_samples:
            _take(start)

    print(f"📦 Strategic sampling: {len(chunks)} chunks from long video")
    return chunks
138
+
139
def prepare_audio(video_url):
    """Extract audio from *video_url* and split it into model-ready chunks.

    Pipeline: download WAV via yt-dlp, load with torchaudio, resample to
    16 kHz mono, then smart-chunk by duration.

    Returns:
        dict: on success {"success": True, "chunks", "audio_path",
        "duration_minutes", "total_chunks"}; on failure {"success": False,
        "error", "chunks": [], "audio_path": None}. Never raises.

    Fixes over original: a mojibake-corrupted emoji in one status message is
    replaced with readable text, and placeholder-free f-strings are plain
    strings.
    """
    try:
        print("🎵 Extracting audio from video...")
        audio_path = extract_audio_from_video_url(video_url)
        print(f"✅ Audio extracted to: {audio_path}")

        print("🎧 Loading and preparing audio...")
        start = time.time()
        waveform, sample_rate = torchaudio.load(audio_path)

        # Resample to the 16 kHz the accent model expects
        if sample_rate != 16000:
            waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
            sample_rate = 16000

        # Downmix multi-channel audio to mono
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        print(f"[⏱️] Audio preparation took {time.time() - start:.2f} seconds.")

        # Calculate duration and apply duration-aware chunking
        duration_minutes = waveform.size(1) / sample_rate / 60

        print("🧩 Smart chunking based on duration...")
        start = time.time()
        chunks = smart_chunk_audio(waveform, sample_rate, duration_minutes)
        print(f"[⏱️] Smart chunking took {time.time() - start:.2f} seconds. Total chunks: {len(chunks)}")

        # NOTE(review): audio_path lives in a mkdtemp() directory that is
        # never removed — consider cleaning it up once chunks are extracted.
        return {
            "success": True,
            "chunks": chunks,
            "audio_path": audio_path,
            "duration_minutes": duration_minutes,
            "total_chunks": len(chunks)
        }

    except Exception as e:
        # Deliberate catch-all: callers branch on the "success" flag instead
        # of handling exceptions.
        print(f"❌ Error in audio preparation: {str(e)}")
        return {
            "success": False,
            "error": str(e),
            "chunks": [],
            "audio_path": None
        }
chunck_time.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import warnings
4
+ import time
5
+ import statistics
6
+ from collections import Counter
7
+
8
+ import torch
9
+ import torchaudio
10
+ from speechbrain.inference.classifiers import EncoderClassifier
11
+
12
+ from audio_extractor import extract_audio_from_video_url
13
+
14
+ warnings.filterwarnings("ignore")
15
+ os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'
16
+
17
def create_chunks_by_size(waveform, sample_rate, chunk_length_sec):
    """Split a waveform into fixed-length chunks.

    Args:
        waveform: tensor of shape [channels, samples].
        sample_rate: samples per second of *waveform*.
        chunk_length_sec: target chunk length in seconds (int or float).

    Returns:
        list of tensors, each [channels, <= chunk_length_sec * sample_rate];
        trailing fragments of 2 seconds or less are discarded.
    """
    # int() so fractional chunk lengths (e.g. 7.5 s) work with range().
    chunk_samples = int(chunk_length_sec * sample_rate)
    total_samples = waveform.size(1)
    chunks = []
    min_samples = sample_rate * 2  # drop fragments of <= 2 seconds

    for start in range(0, total_samples, chunk_samples):
        end = min(start + chunk_samples, total_samples)
        chunk = waveform[:, start:end]
        if chunk.size(1) > min_samples:
            chunks.append(chunk)
    return chunks
29
+
30
def predict_chunks_timing(chunks, classifier):
    """Classify all chunks as one padded batch and measure prediction time.

    Args:
        chunks: list of tensors shaped [1, T] (variable T).
        classifier: object exposing ``classify_batch(batch)`` returning
            ``(out_prob, score, index, text_lab)`` (SpeechBrain convention).

    Returns:
        tuple (results, prediction_time) where results is a list of
        ``{"accent", "confidence"}`` dicts, one per input chunk.
    """
    if not chunks:
        return [], 0.0

    start_time = time.time()

    # Right-pad every chunk to the longest length so they can be batched.
    max_len = max(chunk.size(1) for chunk in chunks)
    padded_chunks = [torch.nn.functional.pad(chunk, (0, max_len - chunk.size(1))) for chunk in chunks]
    # Each chunk is [1, T]; concatenating on dim 0 yields [num_chunks, max_len].
    # (The original unsqueeze(1)/squeeze(1) pair was a no-op and is removed.)
    batch = torch.cat(padded_chunks, dim=0)

    out_prob, score, index, text_lab = classifier.classify_batch(batch)

    prediction_time = time.time() - start_time

    results = [
        {
            "accent": text_lab[i],
            "confidence": score[i].item(),
        }
        for i in range(len(chunks))
    ]

    return results, prediction_time
56
+
57
def analyze_chunk_size_performance(video_url, chunk_sizes=(10, 15, 20, 30, 60)):
    """Benchmark accent classification across several chunk lengths.

    Extracts and prepares the audio once, loads the model once, then for
    each size in *chunk_sizes* measures chunking time, prediction time and
    confidence statistics, and prints a comparison plus recommendations.

    Args:
        video_url: URL of the video to analyze.
        chunk_sizes: iterable of chunk lengths in seconds to benchmark.
            (Tuple default avoids the shared-mutable-default pitfall.)

    Returns:
        list of per-chunk-size result dicts (timings, confidence stats,
        most common accent, throughput, and a computed ``balance_score``).
    """
    print("πŸ” Starting Chunk Size Performance Analysis")
    print("=" * 60)

    # Extract and prepare audio once
    print("🎡 Extracting and preparing audio...")
    audio_start = time.time()

    audio_path = extract_audio_from_video_url(video_url)
    waveform, sample_rate = torchaudio.load(audio_path)

    # Resample to the 16 kHz rate the model expects.
    if sample_rate != 16000:
        waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
        sample_rate = 16000

    # Downmix to mono.
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # # Apply VAD
    # waveform = simple_vad(waveform, sample_rate)

    audio_end = time.time()
    audio_prep_time = audio_end - audio_start

    duration_minutes = waveform.size(1) / sample_rate / 60
    print(f"βœ… Audio prepared in {audio_prep_time:.2f}s | Duration: {duration_minutes:.1f} minutes")

    # Load model once so per-size timings measure only chunking + inference.
    print("🧠 Loading model...")
    model_start = time.time()
    classifier = EncoderClassifier.from_hparams(source="Jzuluaga/accent-id-commonaccent_ecapa")
    model_end = time.time()
    model_load_time = model_end - model_start
    print(f"βœ… Model loaded in {model_load_time:.2f}s")

    print("\n" + "=" * 60)
    print("πŸ“Š CHUNK SIZE ANALYSIS RESULTS")
    print("=" * 60)

    results = []

    for chunk_size in chunk_sizes:
        print(f"\n🧩 Testing {chunk_size}-second chunks...")

        # Create chunks
        chunk_start = time.time()
        chunks = create_chunks_by_size(waveform, sample_rate, chunk_size)
        chunk_end = time.time()
        chunking_time = chunk_end - chunk_start

        if not chunks:
            print(f"❌ No valid chunks created for {chunk_size}s size")
            continue

        # Predict
        predictions, prediction_time = predict_chunks_timing(chunks, classifier)

        # Confidence statistics for this chunk size.
        confidences = [p["confidence"] for p in predictions]
        accents = [p["accent"] for p in predictions]

        avg_confidence = statistics.mean(confidences) if confidences else 0
        max_confidence = max(confidences) if confidences else 0
        min_confidence = min(confidences) if confidences else 0
        std_confidence = statistics.stdev(confidences) if len(confidences) > 1 else 0

        # Most common accent
        accent_counts = Counter(accents)
        most_common_accent = accent_counts.most_common(1)[0] if accent_counts else ("Unknown", 0)

        # Processing rates (guarded against division by zero).
        total_processing_time = chunking_time + prediction_time
        chunks_per_second = len(chunks) / total_processing_time if total_processing_time > 0 else 0
        seconds_per_chunk = total_processing_time / len(chunks) if len(chunks) > 0 else 0

        result = {
            "chunk_size": chunk_size,
            "num_chunks": len(chunks),
            "chunking_time": chunking_time,
            "prediction_time": prediction_time,
            "total_time": total_processing_time,
            "avg_confidence": avg_confidence,
            "max_confidence": max_confidence,
            "min_confidence": min_confidence,
            "std_confidence": std_confidence,
            "most_common_accent": most_common_accent[0],
            "accent_occurrence": most_common_accent[1],
            "chunks_per_second": chunks_per_second,
            "seconds_per_chunk": seconds_per_chunk,
            # 1 - coefficient of variation: higher means steadier confidences.
            "confidence_consistency": 1 - (std_confidence / avg_confidence) if avg_confidence > 0 else 0
        }

        results.append(result)

        # Print results for this chunk size
        print(f"   πŸ“¦ Chunks created: {len(chunks)}")
        print(f"   ⏱️ Chunking time: {chunking_time:.3f}s")
        print(f"   🧠 Prediction time: {prediction_time:.3f}s")
        print(f"   πŸ”„ Total processing: {total_processing_time:.3f}s")
        print(f"   ⚑ Processing rate: {chunks_per_second:.1f} chunks/sec")
        print(f"   πŸ“ˆ Avg confidence: {avg_confidence:.3f}")
        print(f"   🎯 Most common: {most_common_accent[0]} ({most_common_accent[1]} times)")
        print(f"   πŸ“Š Confidence range: {min_confidence:.3f} - {max_confidence:.3f}")

    # Print summary comparison
    print("\n" + "=" * 80)
    print("πŸ“ˆ PERFORMANCE COMPARISON SUMMARY")
    print("=" * 80)

    if results:
        print(f"{'Size':<6} {'Chunks':<8} {'Total Time':<12} {'Rate':<12} {'Avg Conf':<10} {'Consistency':<12} {'Winner'}")
        print("-" * 80)

        for r in results:
            consistency = f"{r['confidence_consistency']:.2f}"
            print(f"{r['chunk_size']:<6} {r['num_chunks']:<8} {r['total_time']:<12.3f} {r['chunks_per_second']:<12.1f} {r['avg_confidence']:<10.3f} {consistency:<12} {r['most_common_accent']}")

    # Recommendations
    print("\n" + "=" * 60)
    print("πŸ† RECOMMENDATIONS")
    print("=" * 60)

    if results:
        # Fastest end-to-end processing.
        fastest = min(results, key=lambda x: x['total_time'])
        print(f"⚑ Fastest processing: {fastest['chunk_size']}s chunks ({fastest['total_time']:.2f}s total)")

        # Highest average confidence (proxy for accuracy).
        most_accurate = max(results, key=lambda x: x['avg_confidence'])
        print(f"🎯 Highest accuracy: {most_accurate['chunk_size']}s chunks ({most_accurate['avg_confidence']:.3f} avg confidence)")

        # Steadiest confidences across chunks.
        most_consistent = max(results, key=lambda x: x['confidence_consistency'])
        print(f"πŸ“Š Most consistent: {most_consistent['chunk_size']}s chunks ({most_consistent['confidence_consistency']:.3f} consistency)")

        # Weighted speed/accuracy trade-off score (40% speed, 60% confidence).
        for r in results:
            r['balance_score'] = (r['chunks_per_second'] * 0.4) + (r['avg_confidence'] * 100 * 0.6)

        best_balance = max(results, key=lambda x: x['balance_score'])
        print(f"βš–οΈ Best balance: {best_balance['chunk_size']}s chunks (score: {best_balance['balance_score']:.1f})")

    return results
201
+
202
def quick_test_multiple_videos(video_urls, chunk_sizes=(10, 15, 20, 30)):
    """Run the chunk-size benchmark over several videos and average results.

    Args:
        video_urls: iterable of video URLs to benchmark.
        chunk_sizes: chunk lengths in seconds to test per video.
            (Tuple default avoids the shared-mutable-default pitfall.)

    Returns:
        list of averaged per-chunk-size dicts (time, rate, confidence,
        consistency, sample_count).
    """
    print("πŸ” MULTI-VIDEO CHUNK SIZE ANALYSIS")
    print("=" * 60)

    all_results = {size: [] for size in chunk_sizes}

    for i, video_url in enumerate(video_urls, 1):
        print(f"\nπŸ“Ή Testing Video {i}/{len(video_urls)}")
        try:
            video_results = analyze_chunk_size_performance(video_url, chunk_sizes)
            for result in video_results:
                all_results[result['chunk_size']].append(result)
        except Exception as e:
            # Best-effort: one bad URL must not abort the whole sweep.
            print(f"❌ Error with video {i}: {str(e)}")
            continue

    # Calculate averages across all successfully analyzed videos.
    print("\n" + "=" * 60)
    print("πŸ“Š AVERAGE PERFORMANCE ACROSS ALL VIDEOS")
    print("=" * 60)

    avg_results = []
    for chunk_size in chunk_sizes:
        if all_results[chunk_size]:
            results = all_results[chunk_size]
            avg_result = {
                'chunk_size': chunk_size,
                'avg_total_time': statistics.mean([r['total_time'] for r in results]),
                'avg_chunks_per_sec': statistics.mean([r['chunks_per_second'] for r in results]),
                'avg_confidence': statistics.mean([r['avg_confidence'] for r in results]),
                'avg_consistency': statistics.mean([r['confidence_consistency'] for r in results]),
                'sample_count': len(results)
            }
            avg_results.append(avg_result)

    if avg_results:
        print(f"{'Size':<6} {'Samples':<8} {'Avg Time':<10} {'Avg Rate':<10} {'Avg Conf':<10} {'Consistency'}")
        print("-" * 60)
        for r in avg_results:
            print(f"{r['chunk_size']:<6} {r['sample_count']:<8} {r['avg_total_time']:<10.2f} {r['avg_chunks_per_sec']:<10.1f} {r['avg_confidence']:<10.3f} {r['avg_consistency']:.3f}")

    return avg_results
245
+
246
if __name__ == "__main__":
    # Script entry point: benchmark chunk sizes on a single hard-coded video.
    # Test with single video
    video_url = "https://www.youtube.com/watch?v=-JTq1BFBwmo&list=PLDN4rrl48XKpZkf03iYFl-O29szjTrs_O&index=2"

    print("πŸš€ Starting Single Video Analysis...")
    # Runs extraction, model load, and the full per-chunk-size comparison.
    results = analyze_chunk_size_performance(video_url)

    # Uncomment below to test multiple videos
    # print("\n" + "="*60)
    # print("πŸš€ Starting Multi-Video Analysis...")
    # video_urls = [
    #     "https://www.youtube.com/watch?v=VIDEO1",
    #     "https://www.youtube.com/watch?v=VIDEO2",
    #     # Add more video URLs here
    # ]
    # multi_results = quick_test_multiple_videos(video_urls)
dialect_predector.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import warnings
4
+ import time
5
+ from collections import Counter
6
+
7
+ import torch
8
+ from speechbrain.inference.classifiers import EncoderClassifier
9
+
10
+ from audio_extractor import prepare_audio
11
+
12
+ warnings.filterwarnings("ignore")
13
+ os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'
14
+
15
def predict_accent_from_chunks(chunks, classifier, early_stopping_threshold=3, confidence_threshold=0.6):
    """Predict accents for chunks iteratively with early stopping based on confident predictions only.

    Processes chunks one at a time (not batched) so the loop can stop as soon
    as *early_stopping_threshold* consecutive CONFIDENT predictions agree on
    the same accent. Low-confidence predictions reset the streak.

    Args:
        chunks: list of tensors shaped [T] or [1, T]; other shapes are skipped.
        classifier: object exposing ``classify_batch`` returning
            ``(out_prob, score, index, text_lab)``.
        early_stopping_threshold: consecutive confident agreements needed to stop.
        confidence_threshold: minimum confidence for a prediction to count.

    Returns:
        list of per-chunk dicts with keys ``chunk_index_original``, ``accent``,
        ``confidence``, ``class_index``, ``is_confident`` and sequential ``chunk``.
    """
    print(f"\nπŸ“¦ Running prediction for up to {len(chunks)} chunks with early stopping (threshold={early_stopping_threshold}, confidence>{confidence_threshold*100}%)...")
    iterative_start_time = time.time()

    results = []
    # Early-stopping state: current streak length and the accent it tracks.
    consecutive_dialect_count = 0
    last_dialect = None

    processed_chunks_count_in_func = 0 # Renamed to avoid clash if this func is nested

    for i, chunk_tensor in enumerate(chunks):
        processed_chunks_count_in_func += 1

        # Normalize chunk shape to [1, T]; skip anything else.
        current_chunk_for_batch = chunk_tensor
        if current_chunk_for_batch.ndim == 1:
            current_chunk_for_batch = current_chunk_for_batch.unsqueeze(0) # Shape: [1, T]
        elif not (current_chunk_for_batch.ndim == 2 and current_chunk_for_batch.shape[0] == 1):
            print(f"Warning: Chunk {i+1} has unexpected shape {current_chunk_for_batch.shape}. Required [T] or [1,T]. Skipping.")
            continue

        # Perform prediction for the single chunk
        out_prob, score, index, text_lab = classifier.classify_batch(current_chunk_for_batch)

        accent = text_lab[0] # Batch of 1
        confidence = score[0].item()
        class_idx = index[0].item()

        # Determine if prediction is confident enough
        is_confident = confidence > confidence_threshold
        confidence_indicator = "βœ“" if is_confident else "βœ—"

        print(f"Chunk {i+1}/{len(chunks)}: {accent} | Confidence: {confidence:.2f} {confidence_indicator}")

        current_result = {
            "chunk_index_original": i + 1,
            "accent": accent,
            "confidence": confidence,
            "class_index": class_idx,
            "is_confident": is_confident
        }
        results.append(current_result)

        # Only consider confident predictions for early stopping
        if is_confident:
            if accent == last_dialect:
                consecutive_dialect_count += 1
            else:
                # New confident accent starts a fresh streak of length 1.
                last_dialect = accent
                consecutive_dialect_count = 1

            if consecutive_dialect_count >= early_stopping_threshold:
                print(f"\n⚠️ Early stopping triggered after processing chunk {i+1}: "
                      f"{early_stopping_threshold} consecutive confident chunks predicted '{last_dialect}'.")
                break
        else:
            # Reset consecutive count if prediction is not confident
            consecutive_dialect_count = 0
            last_dialect = None

    iterative_end_time = time.time()
    num_actually_processed = len(results)
    confident_predictions = sum(1 for r in results if r["is_confident"])
    print(f"[⏱️] Prediction for {num_actually_processed} out of {len(chunks)} available chunks took {iterative_end_time - iterative_start_time:.2f} seconds.")
    print(f"[πŸ“Š] {confident_predictions}/{num_actually_processed} predictions were confident (>{confidence_threshold*100}%).")

    # Add sequential "chunk" number for processed chunks
    for idx, res_item in enumerate(results):
        res_item["chunk"] = idx + 1

    return results
86
+
87
def get_final_verdict(chunk_results, confidence_threshold=0.6):
    """Pick the overall accent from per-chunk predictions.

    Predictions with confidence above *confidence_threshold* are preferred;
    if none qualify, all predictions are used as a fallback. The winner is
    the accent with the largest summed confidence; its reported confidence
    is the mean confidence over the chunks that predicted it.

    Returns:
        tuple (final_accent, final_confidence, confident_accent_counts,
        all_accent_counts); (None, 0.0, {}, {}) when *chunk_results* is empty.
    """
    if not chunk_results:
        return None, 0.0, {}, {}

    # Prefer high-confidence predictions; fall back to everything if none.
    trusted = [r for r in chunk_results if r["confidence"] > confidence_threshold]
    if not trusted:
        print(f"\n⚠️ No confident predictions found (confidence > {confidence_threshold*100}%). Using all predictions as fallback.")
        trusted = chunk_results

    accent_counts = Counter(r["accent"] for r in trusted)
    all_accent_counts = Counter(r["accent"] for r in chunk_results)

    # Summed confidence per accent decides the winner.
    accent_confidence_sum = {}
    for r in trusted:
        accent_confidence_sum[r["accent"]] = accent_confidence_sum.get(r["accent"], 0.0) + r["confidence"]

    final_accent = max(accent_confidence_sum, key=accent_confidence_sum.get)
    final_confidence = accent_confidence_sum[final_accent] / accent_counts[final_accent]

    print(f"\nπŸ“Š Accent Analysis (based on {len(trusted)} confident predictions out of {len(chunk_results)} total):")
    print(f"   Confident predictions (confidence > {confidence_threshold*100}%):")
    for accent, count in accent_counts.items():
        total_conf = accent_confidence_sum[accent]
        avg_conf = total_conf / count
        print(f"   {accent}: {count} chunks, total confidence: {total_conf:.2f}, avg confidence: {avg_conf:.2f}")

    print(f"   All predictions (including low confidence):")
    for accent, count in all_accent_counts.items():
        print(f"   {accent}: {count} chunks")

    return final_accent, final_confidence, accent_counts, all_accent_counts
131
+
132
+
133
def analyze_video_accent(video_url, confidence_threshold=0.6):
    """Full pipeline: extract audio, classify chunks, and return a verdict.

    Orchestrates ``prepare_audio`` -> model load ->
    ``predict_accent_from_chunks`` (with early stopping) -> ``get_final_verdict``.

    Args:
        video_url: URL of the video to analyze.
        confidence_threshold: minimum per-chunk confidence for a prediction
            to count toward the verdict.

    Returns:
        dict: always contains ``success``; on success also the predicted
        accent, confidence stats, chunk counts, per-chunk results, timing,
        and whether early stopping occurred; on failure an ``error`` message.
    """
    total_start = time.time()

    try:
        audio_result = prepare_audio(video_url)

        # Audio extraction/preparation failed — propagate the error.
        if not audio_result["success"]:
            return {
                "success": False, "error": audio_result["error"], "predicted_accent": "Error",
                "confidence_score": 0.0, "confidence_percentage": "0.0%", "video_url": video_url,
                "processing_time": time.time() - total_start
            }

        chunks = audio_result["chunks"]
        available_chunks_count = len(chunks)

        if not chunks:
            return {
                "success": False, "error": "No valid audio chunks found", "predicted_accent": "Error",
                "confidence_score": 0.0, "confidence_percentage": "0.0%", "video_url": video_url,
                "available_chunks_count": 0, "processed_chunks_count": 0,
                "processing_time": time.time() - total_start
            }

        print(f"🧠 Loading accent classification model...")
        load_model_start = time.time()
        classifier = EncoderClassifier.from_hparams(source="Jzuluaga/accent-id-commonaccent_ecapa")
        load_model_end = time.time()
        print(f"[⏱️] Model loading took {load_model_end - load_model_start:.2f} seconds.")

        # May process fewer than all chunks due to early stopping.
        chunk_results = predict_accent_from_chunks(chunks, classifier, confidence_threshold=confidence_threshold)
        processed_chunks_count = len(chunk_results)

        final_accent, final_confidence, confident_accent_counts, all_accent_counts = get_final_verdict(chunk_results, confidence_threshold)

        if final_accent is None:
            return {
                "success": False, "error": "Could not determine accent (no chunks processed or no consensus)",
                "predicted_accent": "Unknown", "confidence_score": 0.0, "confidence_percentage": "0.0%",
                "video_url": video_url, "available_chunks_count": available_chunks_count,
                "processed_chunks_count": processed_chunks_count, "chunk_results": chunk_results,
                "processing_time": time.time() - total_start
            }

        # Calculate statistics
        confident_chunks = [r for r in chunk_results if r["confidence"] > confidence_threshold]
        confident_chunks_count = len(confident_chunks)

        # Mean confidence over every processed chunk (guarded against /0).
        avg_conf_processed_chunks = 0.0
        if processed_chunks_count > 0:
            avg_conf_processed_chunks = sum(r["confidence"] for r in chunk_results) / processed_chunks_count

        # Mean confidence over only the confident chunks.
        avg_conf_confident_chunks = 0.0
        if confident_chunks_count > 0:
            avg_conf_confident_chunks = sum(r["confidence"] for r in confident_chunks) / confident_chunks_count

        total_end = time.time()
        total_processing_time = total_end - total_start
        print(f"\n[⏱️] πŸ” Total pipeline time: {total_processing_time:.2f} seconds.")

        winning_chunks_for_final_accent = confident_accent_counts.get(final_accent, 0)
        # Fewer processed than available means the prediction loop broke early.
        early_stopped = processed_chunks_count < available_chunks_count

        print(f"\nβœ… Final Verdict: {final_accent}")
        print(f"πŸ“ˆ Final Confidence (for '{final_accent}'): {final_confidence:.2f}")
        print(f"🎯 Based on {winning_chunks_for_final_accent} confident occurrences out of {confident_chunks_count} confident chunks.")
        print(f"   ({confident_chunks_count}/{processed_chunks_count} chunks were confident, threshold: {confidence_threshold*100}%)")
        if early_stopped:
            print(f"   (Early stopping occurred. {available_chunks_count} chunks were available in total).")
        print(f"πŸ“Š Average Confidence Across All Processed Chunks: {avg_conf_processed_chunks:.2f}")
        print(f"πŸ“Š Average Confidence Across Confident Chunks: {avg_conf_confident_chunks:.2f}")

        return {
            "success": True,
            "predicted_accent": final_accent,
            "confidence_score": final_confidence,
            "confidence_percentage": f"{final_confidence * 100:.1f}%",
            "confidence_threshold": confidence_threshold,
            "average_confidence_processed_chunks": avg_conf_processed_chunks,
            "average_confidence_confident_chunks": avg_conf_confident_chunks,
            "confident_accent_counts": dict(confident_accent_counts),
            "all_accent_counts": dict(all_accent_counts),
            "processed_chunks_count": processed_chunks_count,
            "confident_chunks_count": confident_chunks_count,
            "available_chunks_count": available_chunks_count,
            "winning_chunks_for_final_accent": winning_chunks_for_final_accent,
            "audio_file": audio_result.get("audio_path"),
            "video_url": video_url,
            "duration_minutes": audio_result.get("duration_minutes"),
            "chunk_results": chunk_results,
            "processing_time": total_processing_time,
            "early_stopped": early_stopped
        }

    except Exception as e:
        total_end = time.time()
        processing_time_before_error = total_end - total_start
        print(f"❌ Error: {str(e)}")
        print(f"[⏱️] Total time before error: {processing_time_before_error:.2f} seconds.")

        return {
            "success": False, "error": str(e), "predicted_accent": "Error",
            "confidence_score": 0.0, "confidence_percentage": "0.0%", "video_url": video_url,
            "processing_time": processing_time_before_error
        }
239
+
240
if __name__ == "__main__":
    # Demo entry point: run the full accent-analysis pipeline on one video.
    video_url = "https://www.youtube.com/shorts/sWUvKMC2450"
    result = analyze_video_accent(video_url, confidence_threshold=0.6)

    if result["success"]:
        print(f"\n🎀 Final Predicted Accent: {result['predicted_accent']}")
        print(f"πŸ”’ Confidence Score: {result['confidence_score']:.4f}")
        print(f"πŸ“Š Confidence Percentage: {result['confidence_percentage']}")
        print(f"🎯 Based on {result['confident_chunks_count']} confident chunks out of {result['processed_chunks_count']} total")
    else:
        print(f"❌ Error: {result['error']}")
    # processing_time is present on both success and failure paths.
    print(f"⏱️ Processing Time: {result.get('processing_time', 0):.2f} seconds")
pretrained_models/accent-id-commonaccent_ecapa/hyperparams.yaml ADDED
@@ -0,0 +1 @@
 
 
1
+ C:/Users/Amr/.cache/huggingface/hub/models--Jzuluaga--accent-id-commonaccent_ecapa/snapshots/14bebf44b7e7a34204d0acc2c897935945fb5c51/hyperparams.yaml
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ yt_dlp==2025.5.22
2
+ speechbrain==1.0.3
3
+ torch==2.7.0+cpu
4
+ torchaudio==2.7.0+cpu
5
+ requests==2.32.3
6
+ ipywidgets==8.1.5
7
+ IPython==7.34.0
8
+ ffmpeg-python==0.2.0
9
+ validators==0.35.0
10
+ streamlit==1.45.1
11
+ plotly==6.1.2
12
+ pandas==2.2.3
13
+ numpy==2.2.6