import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr
from snac import SNAC


def redistribute_codes(row):
    """
    Convert a sequence of generated token codes into an audio waveform using SNAC.
    Each group of 7 tokens encodes one SNAC frame: one code for the coarse layer,
    two for the middle layer, and four for the fine layer.
    """
    # Trim to a multiple of 7 so only complete frames are decoded
    row_length = row.size(0)
    new_length = (row_length // 7) * 7
    trimmed_row = row[:new_length]
    # Shift token ids back into SNAC's code space (audio codes start at id 128266)
    code_list = [t - 128266 for t in trimmed_row]

    layer_1, layer_2, layer_3 = [], [], []
    for i in range(len(code_list) // 7):  # trimmed length is a multiple of 7
        layer_1.append(code_list[7 * i][None])
        layer_2.append(code_list[7 * i + 1][None] - 4096)
        layer_3.append(code_list[7 * i + 2][None] - (2 * 4096))
        layer_3.append(code_list[7 * i + 3][None] - (3 * 4096))
        layer_2.append(code_list[7 * i + 4][None] - (4 * 4096))
        layer_3.append(code_list[7 * i + 5][None] - (5 * 4096))
        layer_3.append(code_list[7 * i + 6][None] - (6 * 4096))

    with torch.no_grad():
        codes = [
            torch.concat(layer_1),
            torch.concat(layer_2),
            torch.concat(layer_3)
        ]
        # Clamp stray negative codes to 0 and add a batch dimension for decoding
        for i in range(len(codes)):
            codes[i][codes[i] < 0] = 0
            codes[i] = codes[i][None]
        audio_hat = snac_model.decode(codes)
    return audio_hat.cpu()[0, 0]


# Load the SNAC vocoder (shared by all voices)
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to("cuda")

# Load all the single-speaker language models
model_repos = {
    "Luna": 'prithivMLmods/Llama-3B-Mono-Luna',
    "Ceylia": 'prithivMLmods/Llama-3B-Mono-Ceylia',
    "Cooper": 'prithivMLmods/Llama-3B-Mono-Cooper',
    "Jim": 'prithivMLmods/Llama-3B-Mono-Jim',
}
models = {
    name: {
        "tokenizer": AutoTokenizer.from_pretrained(repo),
        "model": AutoModelForCausalLM.from_pretrained(repo, torch_dtype=torch.bfloat16).cuda(),
    }
    for name, repo in model_repos.items()
}


def generate_audio(text, temperature, top_p, max_new_tokens, model_name):
    """
    Generate speech audio for the given text using the selected model.
    Returns a (sample_rate, waveform) tuple for the Gradio Audio component.
    """
    # Retrieve the chosen tokenizer and model
    chosen = models[model_name]
    tokenizer = chosen["tokenizer"]
    model = chosen["model"]

    prompt = f'<|begin_of_text|>{text}<|eot_id|>'
    input_ids = tokenizer(prompt, add_special_tokens=False, return_tensors='pt').to('cuda')

    with torch.no_grad():
        generated_ids = model.generate(
            **input_ids,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=1.1,
            num_return_sequences=1,
            eos_token_id=128258,
        )
    # Keep only the newly generated tokens, then decode them to a waveform
    row = generated_ids[0, input_ids['input_ids'].shape[1]:]
    y_tensor = redistribute_codes(row)
    y_np = y_tensor.detach().cpu().numpy()
    return (24000, y_np)


# Example texts with emotion tokens. The specific tag names here are an
# assumption (Orpheus-style tags); adjust them if these checkpoints use a
# different set.
example_texts = [
    ["Hi, my name is Alex. <chuckle> It's a wonderful day! I love coding."],
    ["I woke up feeling sleepy. <yawn> I need coffee! But I'm ready to work."],
    ["Oh no, I forgot my keys! Maybe I'll try again later. <sigh>"],
    ["This is amazing! Really, it's fantastic. <laugh>"]
]
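# --- Optional smoke test (an illustrative sketch, not part of the app itself) ---
# Runs one synthesis outside the UI and saves it to disk. Assumes the `soundfile`
# package is installed (`pip install soundfile`); uncomment to use.
# import soundfile as sf
# sr, wav = generate_audio("Hello there!", temperature=0.9, top_p=0.8,
#                          max_new_tokens=1200, model_name="Luna")
# sf.write("sample.wav", wav, sr)  # wav is a float32 numpy array at 24 kHz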
"] ] # Gradio Interface with gr.Blocks() as demo: # Sidebar for model selection with gr.Sidebar(): gr.Markdown("# Choose Model") model_choice = gr.Dropdown(choices=list(models.keys()), value="Luna", label="Model") gr.Markdown("# Single Speaker Audio Generation") gr.Markdown("Generate speech audio using one of the single-speaker models. Use the examples below to see how emotion tokens like ``, ``, ``, etc. can be incorporated.") with gr.Row(): text_input = gr.Textbox(lines=4, label="Input Text") # Examples with emotion tokens gr.Examples( examples=example_texts, inputs=text_input, label="Emotion Examples", cache_examples=False ) with gr.Row(): temp_slider = gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=0.9, label="Temperature") top_p_slider = gr.Slider(minimum=0.1, maximum=1.0, step=0.05, value=0.8, label="Top-p") tokens_slider = gr.Slider(minimum=100, maximum=3500, step=50, value=1200, label="Max New Tokens") output_audio = gr.Audio(type="numpy", label="Generated Audio") generate_button = gr.Button("Generate Audio") # Pass the selected model name along with other parameters generate_button.click( fn=generate_audio, inputs=[text_input, temp_slider, top_p_slider, tokens_slider, model_choice], outputs=output_audio ) if __name__ == "__main__": demo.launch()