import os | |
import torch | |
from speechbrain.pretrained import EncoderClassifier | |
# Identifier of the pretrained speaker-encoder checkpoint handed to SpeechBrain.
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"

# Use the GPU when PyTorch reports one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Build the encoder once at module level so create_speaker_embedding can
# reuse it. `savedir` places the model files under /tmp/<model name>.
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    savedir=os.path.join("/tmp", spk_model_name),
    run_opts={"device": device},
)
def create_speaker_embedding(waveform):
    """Compute a normalized speaker embedding for one audio waveform.

    The waveform is converted to a tensor, encoded with the module-level
    ``speaker_model``, L2-normalized along dimension 2, stripped of all
    size-1 dimensions, and returned as a NumPy array on the CPU.

    Args:
        waveform: Audio samples in any form accepted by ``torch.tensor``
            (presumably a 1-D array of floats for a single utterance --
            TODO confirm against the caller).

    Returns:
        The embedding as a NumPy array. For a single utterance this is
        presumably a 1-D vector -- verify against the model's output shape.
    """
    samples = torch.tensor(waveform)
    # Inference only: skip autograd bookkeeping while encoding.
    with torch.no_grad():
        encoded = speaker_model.encode_batch(samples)
        unit_norm = torch.nn.functional.normalize(encoded, dim=2)
        return unit_norm.squeeze().cpu().numpy()
It's important to note that the `speechbrain/spkrec-xvect-voxceleb` model was trained on English speech from the VoxCeleb dataset, whereas the training examples in this guide are in Dutch.