Commit 1e5834c · Parent: af73830 · commit

Files changed:
- app.py (+39, -3)
- audio_index.py (+153, -0)
- requirements.txt (+1, -1)
- search.py (+23, -0)
app.py
CHANGED

```diff
@@ -1,7 +1,43 @@
 import gradio as gr
+from huggingface_hub import hf_hub_download
+from audio_index import AudioEmbeddingSystem
+from search import search
+import pandas as pd
 
-…
-…
+db_file = hf_hub_download(
+    repo_id="freddyaboulton/common-voice-english-audio", filename="audio_db.sqlite"
+)
+index_file = hf_hub_download(
+    repo_id="freddyaboulton/common-voice-english-audio", filename="audio_faiss.index"
+)
 
-…
+audio_embedding_system = AudioEmbeddingSystem(db_path=db_file, index_path=index_file)
+
+
+def audio_search(audio_tuple):
+    sample_rate, array = audio_tuple
+    # Cap the query at the first 10 seconds of audio
+    array = array[: int(sample_rate * 10)]
+    rows = audio_embedding_system.search((sample_rate, array))
+    orig_rows = search(rows)
+    for row in rows:
+        path = row["path"]
+        for orig_row in orig_rows:
+            if orig_row["path"] == path:
+                row["sentence"] = orig_row["sentence"]
+                # datasets-server returns audio cells as a list of sources
+                src = orig_row["audio"][0]["src"]
+                row["audio"] = f'<audio src="{src}" controls />'
+    return pd.DataFrame(rows).sort_values(by="distance", ascending=True)
+
+
+demo = gr.Interface(
+    fn=audio_search,
+    inputs=gr.Audio(
+        label="Record or upload a clip of your voice", sources=["upload", "microphone"]
+    ),
+    outputs=gr.Dataframe(
+        headers=["path", "audio", "sentence", "distance", "vector_id"],
+        datatype=["str", "markdown", "str", "number", "str"],
+    ),
+)
 demo.launch()
```
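For reference, this is the shape of a record as `audio_search` assembles it: `AudioEmbeddingSystem.search` supplies `path`, `distance`, and `vector_id`, and the datasets-server lookup adds the transcript plus a playable `<audio>` tag. All values below are made up:

```python
# Illustrative only: one enriched row, as built inside audio_search
row = {
    "path": "common_voice_en_12345.mp3",  # hypothetical Common Voice clip name
    "distance": 0.42,                     # L2 distance reported by FAISS
    "vector_id": 17,                      # position of the match in the index
    "sentence": "The quick brown fox jumps over the lazy dog.",
    "audio": '<audio src="https://.../audio.mp3" controls />',  # markdown column
}
```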
audio_index.py
ADDED

@@ -0,0 +1,153 @@

```python
import os
import numpy as np
import faiss
import sqlite3
import torch
import librosa

import nemo.collections.asr as nemo_asr

speaker_model = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained(
    "rimelabs/rimecaster"
)
speaker_model.freeze()


def get_embedding(row: dict) -> torch.Tensor | None:
    # Ensure audio is mono
    if row["audio"]["array"].ndim > 1:
        audio_array = librosa.to_mono(
            row["audio"]["array"].T
        )  # Transpose if shape is (samples, channels)
    else:
        audio_array = row["audio"]["array"]

    # Resample for embedding (keep original for upload)
    try:
        audio_resampled = librosa.resample(
            audio_array, orig_sr=row["audio"]["sampling_rate"], target_sr=16_000
        )
    except Exception as e:
        print(f"Error resampling audio: {e}. Skipping embedding for row.")
        return None  # Return None if resampling fails

    audio_length = audio_resampled.shape[0]
    device = speaker_model.device
    audio_resampled = np.array([audio_resampled])  # Add batch dim for model
    audio_signal, audio_signal_len = (
        torch.tensor(audio_resampled, device=device, dtype=torch.float32),
        torch.tensor([audio_length], device=device),
    )
    _, emb = speaker_model.forward(
        input_signal=audio_signal, input_signal_length=audio_signal_len
    )
    del audio_signal, audio_signal_len, audio_resampled  # Clean up resampled audio
    return emb.detach().cpu()  # Return the tensor


def get_embedding_from_array(sample_rate: int, audio_array: np.ndarray):
    row = {"audio": {"array": audio_array, "sampling_rate": sample_rate}}
    return get_embedding(row)


class AudioEmbeddingSystem:
    def __init__(
        self, db_path="audio_db.sqlite", index_path="audio_faiss.index", vector_dim=768
    ):
        """
        Initialize the audio embedding system.

        Args:
            db_path: Path to SQLite database
            index_path: Path to save FAISS index
            vector_dim: Dimension of embedding vectors
        """
        self.db_path = db_path
        self.index_path = index_path
        self.vector_dim = vector_dim

        self._init_db()

        if os.path.exists(index_path):
            self.index = faiss.read_index(index_path)
        else:
            self.index = faiss.IndexFlatL2(vector_dim)

    def _init_db(self):
        """Initialize SQLite database with required tables"""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()

        cursor.execute("""
            CREATE TABLE IF NOT EXISTS audio_files (
                id INTEGER PRIMARY KEY,
                file_path TEXT UNIQUE,
                vector_id INTEGER
            )
        """)
        conn.commit()
        conn.close()

    def extract_embedding(self, row: dict):
        """Extract embedding from an audio row"""
        return get_embedding(row)

    def add_audio(self, row):
        """Add audio file to the database and index"""
        embedding = self.extract_embedding(row)
        if embedding is None:
            return None  # Resampling failed; nothing was indexed

        # get_embedding returns a torch.Tensor; convert before handing to FAISS
        embedding_normalized = embedding.numpy().reshape(1, -1).astype(np.float32)

        current_index_size = self.index.ntotal
        self.index.add(embedding_normalized)

        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        cursor.execute(
            "INSERT INTO audio_files (file_path, vector_id) VALUES (?, ?)",
            (row["path"], current_index_size),
        )
        conn.commit()
        conn.close()

        faiss.write_index(self.index, self.index_path)

        return current_index_size

    def search(self, row: dict | tuple, top_k=5):
        """
        Search for similar audio files.

        Accepts either a dataset-style row dict ({"audio": {...}}) or a
        (sample_rate, audio_array) tuple as produced by gr.Audio.
        """
        if isinstance(row, dict):
            query_embedding = self.extract_embedding(row)
        else:
            sample_rate, audio_array = row
            query_embedding = get_embedding_from_array(sample_rate, audio_array)
        if query_embedding is None:
            return []

        query_embedding = query_embedding.numpy().reshape(1, -1).astype(np.float32)

        distances, indices = self.index.search(query_embedding, top_k)

        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()

        results = []
        for i, idx in enumerate(indices[0]):
            cursor.execute(
                "SELECT file_path FROM audio_files WHERE vector_id = ?",
                (int(idx),),
            )
            db_row = cursor.fetchone()
            if db_row:
                results.append(
                    {
                        # keyed "path" so app.py can match against search.py rows
                        "path": db_row[0],
                        "distance": float(distances[0][i]),
                        "vector_id": int(idx),
                    }
                )

        conn.close()
        return results
```
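The Space downloads `audio_db.sqlite` and `audio_faiss.index` ready-made from the Hub, so this commit never shows the indexing pass. A minimal sketch of how the artifacts could be produced offline, assuming the Common Voice dataset named in search.py and the `datasets` streaming API (error handling elided):

```python
# Hypothetical offline indexing script (not part of this commit)
from datasets import load_dataset

from audio_index import AudioEmbeddingSystem

system = AudioEmbeddingSystem(vector_dim=768)

ds = load_dataset(
    "mozilla-foundation/common_voice_17_0", "en", split="validation", streaming=True
)
for row in ds:
    # Each row carries "path" plus "audio": {"array", "sampling_rate"},
    # the exact shape get_embedding expects
    system.add_audio(row)

# The resulting audio_db.sqlite / audio_faiss.index would then be uploaded
# to freddyaboulton/common-voice-english-audio for app.py to download.
```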
requirements.txt
CHANGED

```diff
@@ -1,2 +1,2 @@
 faiss-cpu
-nemo_toolkit[
+nemo_toolkit[all]
```
search.py
ADDED

@@ -0,0 +1,23 @@

```python
import requests
import os

headers = {"Authorization": f"Bearer {os.getenv('HF_TOKEN')}"}

dataset = "mozilla-foundation/common_voice_17_0"
config = "en"
split = "validation"


def search(rows: list[dict]):
    file_paths_to_find = [row["path"] for row in rows]

    # Note: a path containing a single quote would break this clause
    paths_in_clause = ", ".join([f"'{path}'" for path in file_paths_to_find])
    where_clause = f'"path" IN ({paths_in_clause})'

    # Pass the where clause via params so requests URL-encodes it
    api_url = "https://datasets-server.huggingface.co/filter"
    params = {
        "dataset": dataset,
        "config": config,
        "split": split,
        "where": where_clause,
        "offset": 0,
    }

    response = requests.get(api_url, params=params, headers=headers)
    response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
    data = response.json()

    # The API wraps each record as {"row_idx": ..., "row": {...}, ...};
    # unwrap so callers can read "path", "sentence", etc. directly
    return [item["row"] for item in data.get("rows", [])]
```
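For context, a sketch of the `/filter` payload this code unwraps. Field values are invented, and the list-of-sources shape of the `audio` cell is an assumption about the datasets-server response rather than something this commit pins down:

```python
# Illustrative /filter response (values invented; extra top-level keys elided)
example_response = {
    "rows": [
        {
            "row_idx": 101,
            "row": {
                "path": "common_voice_en_12345.mp3",
                "sentence": "The quick brown fox jumps over the lazy dog.",
                # assumed: audio cells arrive as a list of playable sources
                "audio": [
                    {
                        "src": "https://datasets-server.huggingface.co/.../audio.mp3",
                        "type": "audio/mpeg",
                    }
                ],
            },
            "truncated_cells": [],
        }
    ],
}
```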