# Source: Hugging Face Space "voice-match", file app.py
# Last commit: "Add Prompts" (e1ef382) by freddyaboulton
# File size at capture: 2.82 kB
import gradio as gr
from huggingface_hub import hf_hub_download
from audio_index import AudioEmbeddingSystem
from search import search, get_prompt
import pandas as pd
import numpy as np
# Hugging Face dataset repository that hosts both prebuilt search artifacts.
_ARTIFACT_REPO = "freddyaboulton/common-voice-english-audio"


def _fetch_artifact(filename):
    """Download *filename* from the artifact dataset repo.

    Returns whatever ``hf_hub_download`` returns for that file; the result is
    handed to ``AudioEmbeddingSystem`` as a path below.
    """
    return hf_hub_download(
        repo_id=_ARTIFACT_REPO,
        filename=filename,
        repo_type="dataset",
    )


# Both files are fetched at import time, database first, then the index.
db_file = _fetch_artifact("audio_db_full.sqlite")
index_file = _fetch_artifact("audio_faiss_full.index")

# Single shared search backend used by every request handled by audio_search.
audio_embedding_system = AudioEmbeddingSystem(db_path=db_file, index_path=index_file)
def audio_search(audio_tuple, prompt: str):
    """Find the indexed voices closest to the submitted clip.

    Args:
        audio_tuple: ``(sample_rate, samples)`` pair coming from the
            ``gr.Audio`` input, where ``samples`` is a NumPy array, or
            ``None`` when no clip was recorded or uploaded.
        prompt: Current text of the prompt box. Not used by the search; it is
            accepted because the interface passes both inputs to this handler.

    Returns:
        ``gr.skip()`` when no audio was supplied. Otherwise a DataFrame with
        the columns ``path``, ``audio``, ``sentence`` and ``distance``, sorted
        by ascending ``distance``. Hits without a matching dataset row get
        empty ``audio`` / ``sentence`` cells instead of raising.
    """
    # Local import so the block brings its own stdlib dependency into scope.
    from html import escape

    if audio_tuple is None:
        return gr.skip()

    sample_rate, array = audio_tuple

    # Scale signed-integer PCM by its own full-scale value. For int16 this is
    # exactly the previous ``/ 32768.0``; wider formats such as int32 are now
    # normalised the same way instead of being passed through unscaled.
    if np.issubdtype(array.dtype, np.signedinteger):
        full_scale = -float(np.iinfo(array.dtype).min)
        array = array.astype(np.float32) / full_scale

    rows = audio_embedding_system.search((sample_rate, array))

    # Index the dataset rows by path once rather than rescanning them for
    # every hit. If a path occurs more than once the last entry wins, which
    # matches the result of the former nested loop.
    dataset_rows = {item["row"]["path"]: item["row"] for item in search(rows)}

    for row in rows:
        match = dataset_rows.get(row["path"])
        if match is None:
            # No dataset row for this hit: keep the table columns populated so
            # building the result frame cannot fail on a missing key.
            row.setdefault("sentence", "")
            row.setdefault("audio", "")
            continue
        row["sentence"] = match["sentence"]
        # Quote and escape the URL so characters such as '&', spaces or
        # quotes cannot break out of the src attribute.
        src = escape(match["audio"][0]["src"], quote=True)
        # Kept as a one-element list, the shape the table has always received.
        row["audio"] = ['<audio src="' + src + '" controls />']

    # Passing ``columns`` selects and orders the fields and also yields an
    # empty, correctly-shaped frame when the search returns no hits.
    columns = ["path", "audio", "sentence", "distance"]
    return pd.DataFrame(rows, columns=columns).sort_values(
        by="distance", ascending=True
    )
# Prompt box. Its initial text is a single get_prompt() call made while this
# module is being imported.
sample_text = gr.Textbox(
    value=get_prompt(),
    info="Hit Enter to get a prompt from the common voice dataset",
    label="Prompt",
)

# Voice clip input: accepts a microphone recording or an uploaded file.
voice_clip = gr.Audio(
    sources=["microphone", "upload"],
    label="Record or upload a clip of your voice",
)

# Result table. The column order matches the DataFrame audio_search returns;
# the "audio" column is declared as HTML so the <audio> tag is rendered.
match_table = gr.Dataframe(
    datatype=["str", "html", "str", "number"],
    headers=["path", "audio", "sentence", "distance"],
    show_label=False,
)

# audio_search receives the inputs in list order: (voice_clip, sample_text).
iface = gr.Interface(
    fn=audio_search,
    inputs=[voice_clip, sample_text],
    outputs=match_table,
)
# Static page copy, kept out of the layout so the Blocks body reads as pure
# structure. These are plain strings: nothing is interpolated into them, so
# no f-prefix is needed.
_TITLE_HTML = """
<h1 style='text-align: center; display: flex; align-items: center; justify-content: center;'>
<img src="/gradio_api/file=Karaoke_Huggy.png" alt="Voice Match" style="height: 100px; margin-right: 10px"> Voice Match
</h1>
"""

_POWERED_BY_HTML = """
<h2 style='text-align: center'>
Powered by <a href="https://huggingface.co/rimelabs/rimecaster">RimeCaster</a>
</h2>
"""

_INTRO_MARKDOWN = """
<div style='text-align: center'>
Record or upload an English clip of your voice and we'll find the most similar voices in the <a href="https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0">Common Voice dataset</a>.
</div>
"""

with gr.Blocks() as demo:
    gr.HTML(_TITLE_HTML)
    gr.HTML(_POWERED_BY_HTML)
    gr.Markdown(_INTRO_MARKDOWN)
    # Place the prebuilt search interface below the page header.
    iface.render()
    # Submitting the prompt box (pressing Enter) replaces its text with the
    # value returned by get_prompt(). Registered inside the Blocks context,
    # where Gradio event listeners have to be attached.
    sample_text.submit(fn=get_prompt, inputs=None, outputs=sample_text)

# The title image is referenced via /gradio_api/file=Karaoke_Huggy.png, so the
# same file is allow-listed for serving here.
demo.launch(allowed_paths=["Karaoke_Huggy.png"])