import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Load model from Hugging Face Hub
MODEL_ID = "large-traversaal/Alif-1.0-8B-Instruct"
MODEL_FILE = "model-Q8_0.gguf"
model_path_file = hf_hub_download(MODEL_ID, filename=MODEL_FILE)
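# hf_hub_download caches the file in the local Hugging Face cache, so restarts
# of the Space reuse the downloaded weights instead of fetching them again.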

# Initialize the llama.cpp model
llama = Llama(
    model_path=model_path_file,
    n_gpu_layers=40,  # Layers offloaded to the GPU; adjust based on VRAM
    n_threads=8,      # Match CPU core count
    n_batch=512,      # Prompt-processing batch size; tune for VRAM usage
    n_ctx=4096,       # Context window size
    verbose=True      # Enable debug logging
)
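# Note: n_gpu_layers only takes effect when llama-cpp-python is built with GPU
# support (e.g. CUDA or Metal); on a CPU-only build all layers run on the CPU.
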
CONTEXT_LENGTH = 4096
COLOR = "blue"
EMOJI = "💬"
DESCRIPTION = "Urdu AI Chatbot powered by Llama.cpp"

# Generate a streamed response for the Gradio ChatInterface
def generate_response(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
    # Use the system prompt from the UI if provided, otherwise a default Urdu instruction
    instruction = system_prompt or "You are an Urdu Chatbot. Write an appropriate response for the given instruction:"
    chat_prompt = f"{instruction} {message} Response:"
    # Wire the UI sampling controls through to llama.cpp
    # (sliders may deliver floats, so cast the integer-valued ones)
    response = llama(
        chat_prompt,
        max_tokens=int(max_new_tokens),
        temperature=temperature,
        top_k=int(top_k),
        top_p=top_p,
        repeat_penalty=repetition_penalty,
        stop=["Q:", "\n"],
        echo=False,
        stream=True,
    )
    # Accumulate streamed chunks, yielding the growing text so the UI updates live
    text = ""
    for chunk in response:
        content = chunk["choices"][0]["text"]
        if content:
            text += content
            yield text
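
# Usage sketch (outside Gradio), with illustrative values matching the slider
# defaults below; history is accepted for ChatInterface compatibility but unused:
#   for partial in generate_response("سلام", [], "", 0.6, 1024, 40, 1.1, 0.95):
#       print(partial)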

# Create the Gradio interface
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(label="Urdu Chatbot", likeable=True, render=False)
    chat = gr.ChatInterface(
        generate_response,
        chatbot=chatbot,
        title=f"{EMOJI} Alif-1.0 Chatbot",
        description=DESCRIPTION,
        examples=[
            ["شہر کراچی کے بارے میں بتاؤ"],  # "Tell me about the city of Karachi"
            ["قابل تجدید توانائی کیا ہے؟"],  # "What is renewable energy?"
            ["پاکستان کی تاریخ کے بارے میں بتائیں۔"],  # "Tell me about the history of Pakistan"
        ],
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            gr.Textbox("", label="System prompt", render=False),
            gr.Slider(0, 1, 0.6, label="Temperature", render=False),
            gr.Slider(128, CONTEXT_LENGTH, 1024, label="Max new tokens", render=False),
            gr.Slider(1, 80, 40, step=1, label="Top K sampling", render=False),
            gr.Slider(0, 2, 1.1, label="Repetition penalty", render=False),
            gr.Slider(0, 1, 0.95, label="Top P sampling", render=False),
        ],
        theme=gr.themes.Soft(primary_hue=COLOR),
    )

demo.queue(max_size=20).launch(share=True)
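
# -----------------------------------------------------------------------------
# Earlier prototype kept below for reference: a plain gr.Interface version of
# the same streaming chat, without the ChatInterface parameter controls.
# -----------------------------------------------------------------------------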
# import llama_cpp
# from llama_cpp import Llama
# # import llama_cpp.llama_tokenizer
# import gradio as gr
# from huggingface_hub import hf_hub_download
# model_name = "large-traversaal/Alif-1.0-8B-Instruct"
# model_file = "model-Q8_0.gguf"
# model_path_file = hf_hub_download(model_name, filename=model_file)
# llama = Llama(
#     model_path=model_path_file,
#     n_gpu_layers=40,  # Adjust based on VRAM
#     n_threads=8,      # Match CPU cores
#     n_batch=512,      # Optimize for better VRAM usage
#     n_ctx=4096,       # Context window size
#     verbose=True      # Enable debug logging
# )
# chat_prompt = """You are an Urdu Chatbot. Write an appropriate response for the given instruction: {inp} Response:"""
# # Function to generate text with streaming output
# def chat_with_ai(prompt):
#     query = chat_prompt.format(inp=prompt)
#     # response = llama(prompt, max_tokens=1024, stop=stop_tokens, echo=False, stream=True)
#     response = llama(query, max_tokens=256, stop=["Q:", "\n"], echo=False, stream=True)  # Enable streaming
#     text = ""
#     for chunk in response:
#         content = chunk["choices"][0]["text"]
#         if content:
#             text += content
#             yield text
# # Gradio UI setup
# demo = gr.Interface(
#     fn=chat_with_ai,  # Streaming generator function
#     inputs="text",    # User input
#     outputs="text",   # Model response
#     title="Streaming Alif-1.0-8B-Instruct Chatbot 🚀",
#     description="Enter a prompt and get a streamed response."
# )
# # Launch the Gradio app
# demo.launch(share=True)