import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr
from snac import SNAC
def redistribute_codes(row):
    """
    Convert a sequence of generated token codes into an audio waveform using SNAC.
    The tokens are grouped into frames of 7, with each frame distributed across
    SNAC's three codebook layers.
    """
    row_length = row.size(0)
    new_length = (row_length // 7) * 7  # trim to a whole number of 7-token frames
    trimmed_row = row[:new_length]
    code_list = [t - 128266 for t in trimmed_row]  # shift token IDs into code space
    layer_1, layer_2, layer_3 = [], [], []
    # Frame layout: position 0 -> layer 1; positions 1 and 4 -> layer 2;
    # positions 2, 3, 5 and 6 -> layer 3. Each position is offset by a
    # multiple of the 4096-entry codebook size.
    for i in range(len(code_list) // 7):
        layer_1.append(code_list[7 * i][None])
        layer_2.append(code_list[7 * i + 1][None] - 4096)
        layer_3.append(code_list[7 * i + 2][None] - (2 * 4096))
        layer_3.append(code_list[7 * i + 3][None] - (3 * 4096))
        layer_2.append(code_list[7 * i + 4][None] - (4 * 4096))
        layer_3.append(code_list[7 * i + 5][None] - (5 * 4096))
        layer_3.append(code_list[7 * i + 6][None] - (6 * 4096))
    with torch.no_grad():
        codes = [
            torch.concat(layer_1),
            torch.concat(layer_2),
            torch.concat(layer_3),
        ]
        for i in range(len(codes)):
            codes[i][codes[i] < 0] = 0  # clamp any out-of-range (negative) codes
            codes[i] = codes[i][None]   # add a batch dimension
        audio_hat = snac_model.decode(codes)
    return audio_hat.cpu()[0, 0]
# Load the SNAC model (shared by all)
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to("cuda")
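# Note: this Space assumes a CUDA GPU is available. A minimal fallback sketch
# for other environments (hypothetical, not used here):
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to(device)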
# Load all the single-speaker language models
models = {
    "Luna": {
        "tokenizer": AutoTokenizer.from_pretrained('prithivMLmods/Llama-3B-Mono-Luna'),
        "model": AutoModelForCausalLM.from_pretrained('prithivMLmods/Llama-3B-Mono-Luna', torch_dtype=torch.bfloat16).cuda()
    },
    "Ceylia": {
        "tokenizer": AutoTokenizer.from_pretrained('prithivMLmods/Llama-3B-Mono-Ceylia'),
        "model": AutoModelForCausalLM.from_pretrained('prithivMLmods/Llama-3B-Mono-Ceylia', torch_dtype=torch.bfloat16).cuda()
    },
    "Cooper": {
        "tokenizer": AutoTokenizer.from_pretrained('prithivMLmods/Llama-3B-Mono-Cooper'),
        "model": AutoModelForCausalLM.from_pretrained('prithivMLmods/Llama-3B-Mono-Cooper', torch_dtype=torch.bfloat16).cuda()
    },
    "Jim": {
        "tokenizer": AutoTokenizer.from_pretrained('prithivMLmods/Llama-3B-Mono-Jim'),
        "model": AutoModelForCausalLM.from_pretrained('prithivMLmods/Llama-3B-Mono-Jim', torch_dtype=torch.bfloat16).cuda()
    },
}
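# All four models stay resident on the GPU (a 3B-parameter model in bf16 is
# roughly 6 GB of weights, so ~24 GB total). A minimal sketch of on-demand
# loading for memory-constrained setups (hypothetical, not used by this app):
#   def load_voice(repo_id):
#       return {
#           "tokenizer": AutoTokenizer.from_pretrained(repo_id),
#           "model": AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=torch.bfloat16).cuda(),
#       }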
def generate_audio(text, temperature, top_p, max_new_tokens, model_name):
    """
    Generate speech audio from the input text using the selected single-speaker model.
    """
    # Retrieve the chosen tokenizer and model
    chosen = models[model_name]
    tokenizer = chosen["tokenizer"]
    model = chosen["model"]
    # The <custom_token_*> markers wrap the text prompt and cue the model to
    # emit audio tokens (IDs 128266 and above; see redistribute_codes)
    prompt = f'<custom_token_3><|begin_of_text|>{text}<|eot_id|><custom_token_4><custom_token_5><custom_token_1>'
    input_ids = tokenizer(prompt, add_special_tokens=False, return_tensors='pt').to('cuda')
    with torch.no_grad():
        generated_ids = model.generate(
            **input_ids,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=1.1,
            num_return_sequences=1,
            eos_token_id=128258,  # treated as the end-of-speech token
        )
    # Strip the prompt tokens, keeping only the newly generated audio codes
    row = generated_ids[0, input_ids['input_ids'].shape[1]:]
    y_tensor = redistribute_codes(row)
    y_np = y_tensor.detach().cpu().numpy()
    # 24 kHz sample rate, matching the snac_24khz decoder
    return (24000, y_np)
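# Example of a direct (non-UI) call, for illustration only:
#   sr, audio = generate_audio("Hello there! <laugh>", 0.9, 0.8, 1200, "Luna")
#   # sr is 24000 to match the snac_24khz decoder; audio is a float NumPy array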
# Example texts with emotion tokens
example_texts = [
    ["Hi, my name is Alex. <laugh> It's a wonderful day! <chuckle> I love coding."],
    ["I woke up feeling sleepy. <yawn> I need coffee! <sniffle> But I'm ready to work."],
    ["Oh no, I forgot my keys! <groan> <uhm> Maybe I'll try again later. <sigh>"],
    ["This is amazing! <gasp> Really, it's fantastic. <giggles>"]
]
# Gradio Interface
with gr.Blocks() as demo:
    # Sidebar for model selection
    with gr.Sidebar():
        gr.Markdown("# Choose Model")
        model_choice = gr.Dropdown(choices=list(models.keys()), value="Luna", label="Model")

    gr.Markdown("# Single Speaker Audio Generation")
    gr.Markdown("Generate speech audio using one of the single-speaker models. Use the examples below to see how emotion tokens like `<laugh>`, `<chuckle>`, `<sigh>`, etc. can be incorporated.")
    with gr.Row():
        text_input = gr.Textbox(lines=4, label="Input Text")
    # Examples with emotion tokens
    gr.Examples(
        examples=example_texts,
        inputs=text_input,
        label="Emotion Examples",
        cache_examples=False
    )
    with gr.Row():
        temp_slider = gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=0.9, label="Temperature")
        top_p_slider = gr.Slider(minimum=0.1, maximum=1.0, step=0.05, value=0.8, label="Top-p")
        tokens_slider = gr.Slider(minimum=100, maximum=3500, step=50, value=1200, label="Max New Tokens")
    output_audio = gr.Audio(type="numpy", label="Generated Audio")
    generate_button = gr.Button("Generate Audio")

    # Pass the selected model name along with other parameters
    generate_button.click(
        fn=generate_audio,
        inputs=[text_input, temp_slider, top_p_slider, tokens_slider, model_choice],
        outputs=output_audio,
    )

if __name__ == "__main__":
    demo.launch()