ruslanmv committed
Commit 490ab38 · verified · 1 Parent(s): 417925c

Update app.py

Files changed (1)
  app.py +24 -11
app.py CHANGED
@@ -1,21 +1,32 @@
+import os
 import gradio as gr
 from huggingface_hub import InferenceClient
 from transformers import AutoTokenizer  # Import the tokenizer
 from langchain.memory import ConversationBufferMemory
 from langchain.schema import HumanMessage, AIMessage
 
+# Load HF token from environment variables.
+HF_TOKEN = os.getenv("HF_TOKEN")
+if not HF_TOKEN:
+    raise ValueError("HF_TOKEN environment variable not set")
+
 # Use the appropriate tokenizer for your model.
-tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
+tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-32B")
+
+# Instantiate the client with the new inference mechanism.
+client = InferenceClient(
+    provider="novita",
+    api_key=HF_TOKEN
+)
 
-# Define a maximum context length (tokens). Check your model's documentation!
-MAX_CONTEXT_LENGTH = 4096  # Example: Adjust this based on your model!
+# Define a maximum context length (tokens). Adjust this based on your model's requirements.
+MAX_CONTEXT_LENGTH = 4096
 
-# Read the default prompt from a file
+# Read the default prompt from a file.
 with open("prompt.txt", "r") as file:
     nvc_prompt_template = file.read()
 
-# Initialize LangChain Conversation Memory
+# Initialize LangChain Conversation Memory.
 memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
 
 def count_tokens(text: str) -> int:
@@ -38,7 +49,7 @@ def truncate_memory(memory, system_message: str, max_length: int):
     system_tokens = count_tokens(system_message)
     current_length = system_tokens
 
-    # Iterate backwards through the memory (newest to oldest)
+    # Iterate backwards through the memory (newest to oldest).
     for msg in reversed(memory.chat_memory.messages):
         tokens = count_tokens(msg.content)
         if current_length + tokens <= max_length:
@@ -52,7 +63,7 @@ def truncate_memory(memory, system_message: str, max_length: int):
 
 def respond(
     message,
-    history: list[tuple[str, str]],  # Required by Gradio but we now use LangChain memory
+    history: list[tuple[str, str]],  # Required by Gradio but we now use LangChain memory.
     system_message,
     max_tokens,
     temperature,
@@ -83,13 +94,15 @@
 
     response = ""
     try:
-        for chunk in client.chat_completion(
-            messages,
+        stream = client.chat.completions.create(
+            model="deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
+            messages=messages,
             max_tokens=max_tokens,
             stream=True,
             temperature=temperature,
             top_p=top_p,
-        ):
+        )
+        for chunk in stream:
             token = chunk.choices[0].delta.content
             response += token
             yield response
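
For reference, a minimal standalone sketch of how the updated streaming call can be exercised outside the Gradio app is shown below. It assumes a recent huggingface_hub release with provider routing (roughly 0.28 or newer) and an HF_TOKEN exported in the environment; the model id and client call follow the commit, while the example messages and generation settings are placeholders.

import os

from huggingface_hub import InferenceClient

# Same client setup as app.py: route requests through the "novita" provider,
# authenticating with the Hugging Face token from the environment.
client = InferenceClient(provider="novita", api_key=os.environ["HF_TOKEN"])

# Stream a chat completion from the DeepSeek distill used in the commit.
stream = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
    messages=[
        {"role": "system", "content": "You are a concise assistant."},  # placeholder system prompt
        {"role": "user", "content": "Say hello in one sentence."},      # placeholder user message
    ],
    max_tokens=64,  # placeholder generation settings
    temperature=0.7,
    top_p=0.95,
    stream=True,
)

response = ""
for chunk in stream:
    token = chunk.choices[0].delta.content
    if token:  # the final streamed chunk may carry no content, so guard before appending
        response += token
        print(token, end="", flush=True)

The guard on token is worth noting: streamed deltas can be None on the last chunk, so appending unconditionally, as the loop in app.py does, can raise a TypeError mid-stream.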