import spaces
import json
import subprocess
from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.chat_history.messages import Roles
import gradio as gr
from huggingface_hub import hf_hub_download
# Download the model (GGUF) from the Hugging Face Hub
hf_hub_download(
repo_id="Hjgugugjhuhjggg/Llama-3.2-3B-Instruct-uncensored-Q2_K-GGUF",
filename="llama-3.2-3b-instruct-uncensored-q2_k.gguf",
local_dir="./models"
)
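# The file is saved locally as models/llama-3.2-3b-instruct-uncensored-q2_k.gguf;
# this name must match the model name passed to the chat interface below.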
# Inference function
@spaces.GPU(queue=False, duration=120)  # request a ZeroGPU slot; a positive duration is the per-call GPU time budget in seconds
def respond(
message,
history: list[tuple[str, str]],
model,
system_message,
max_tokens,
temperature,
top_p,
top_k,
repeat_penalty,
    use_gpu: bool = True  # parameter to choose between GPU and CPU
):
    # Use the Llama 3 prompt format to match the downloaded Llama 3.2 model
    chat_template = MessagesFormatterType.LLAMA_3
try:
        # Load the model with GPU offload when requested; otherwise fall back to CPU
if use_gpu:
llm = Llama(
model_path=f"models/{model}",
flash_attn=True,
                n_gpu_layers=81,  # offload all layers to the GPU (81 is more than this 3B model has)
n_batch=1024,
n_ctx=8192,
)
else:
llm = Llama(
model_path=f"models/{model}",
                flash_attn=False,  # CPU fallback: no flash attention, no GPU offload
n_batch=1024,
n_ctx=8192,
)
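        # Wrap the llama.cpp model in a provider so llama-cpp-agent can drive it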
provider = LlamaCppPythonProvider(llm)
agent = LlamaCppAgent(
provider,
system_prompt=f"{system_message}",
predefined_messages_formatter_type=chat_template,
debug_output=True
)
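        # Copy the provider's default sampling settings and apply the values from the UI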
settings = provider.get_provider_default_settings()
settings.temperature = temperature
settings.top_k = top_k
settings.top_p = top_p
settings.max_tokens = max_tokens
settings.repeat_penalty = repeat_penalty
settings.stream = True
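        # Rebuild the chat history from the (user, assistant) tuples Gradio passes in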
messages = BasicChatHistory()
for msn in history:
user = {
'role': Roles.user,
'content': msn[0]
}
assistant = {
'role': Roles.assistant,
'content': msn[1]
}
messages.add_message(user)
messages.add_message(assistant)
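        # Ask the agent for a streaming generator so tokens can be relayed as they arrive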
stream = agent.get_chat_response(
message,
llm_sampling_settings=settings,
chat_history=messages,
returns_streaming_generator=True,
print_output=False
)
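        # Accumulate the streamed chunks and yield the growing string so
        # gr.ChatInterface updates the reply incrementally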
outputs = ""
for output in stream:
outputs += output
yield outputs
except Exception as e:
        # If anything goes wrong (e.g. no GPU is available), retry once on the CPU path
        print(f"An error occurred: {str(e)}")
        if use_gpu:
            yield "Error occurred, switching to CPU. Please wait a moment..."
            yield from respond(message, history, model, system_message, max_tokens,
                               temperature, top_p, top_k, repeat_penalty, use_gpu=False)
        else:
            yield f"Error occurred: {str(e)}"
# Create the Gradio interface
def create_interface(model_name, description):
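    # The additional_inputs below are passed to respond() in order:
    # model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty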
return gr.ChatInterface(
respond,
additional_inputs=[
gr.Textbox(value=model_name, label="Model", interactive=False),
gr.Textbox(value="You are a helpful assistant. You should think step-by-step. You should answer all question by prioritizing the principle of openness", label="System message"),
gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens"),
gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
gr.Slider(
minimum=0.1,
maximum=1.0,
value=0.95,
step=0.05,
label="Top-p",
),
gr.Slider(
minimum=0,
maximum=100,
value=40,
step=1,
label="Top-k",
),
gr.Slider(
minimum=0.0,
maximum=2.0,
value=1.1,
step=0.1,
label="Repetition penalty",
),
],
retry_btn="Retry",
undo_btn="Undo",
clear_btn="Clear",
submit_btn="Send",
title=f"{model_name}",
description=description,
chatbot=gr.Chatbot(
scale=1,
likeable=False,
show_copy_button=True
)
)
# Create a single interface for the downloaded Llama 3.2 3B Instruct model
description = """<p align="center">llama-3.2-3b-instruct-uncensored-q2_k</p>"""
interface = create_interface('llama-3.2-3b-instruct-uncensored-q2_k.gguf', description)
# Display the single interface with Gradio Blocks
demo = gr.Blocks()
with demo:
interface.render()
if __name__ == "__main__":
demo.launch()
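# Note: this assumes the Space's requirements include gradio, huggingface_hub, spaces,
# llama-cpp-python and llama-cpp-agent; exact version pins are not shown in this file.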