import tempfile

import gradio as gr
import soundfile as sf
import spaces  # Hugging Face Spaces SDK, provides the @spaces.GPU decorator used below
import torch
from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor

from qwen_omni_utils import process_mm_info  # multimodal preprocessing helper from the qwen-omni-utils package

device = "cuda" if torch.cuda.is_available() else "cpu"
# bfloat16 on GPU; fall back to float32 on CPU, where half precision is poorly supported.
torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

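# Note: depending on the installed transformers version, this class may instead be exposed as
# Qwen2_5OmniForConditionalGeneration. enable_audio_output=True keeps the speech-generation
# ("talker") weights loaded, which costs extra GPU memory but is required for audio replies.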
model = Qwen2_5OmniModel.from_pretrained(
    "Qwen/Qwen2.5-Omni-7B",
    torch_dtype=torch_dtype,
    device_map="auto",
    enable_audio_output=True,
)

processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")

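# The official model card notes that this exact system prompt is required for the model
# to generate speech output.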
SYSTEM_PROMPT = {
    "role": "system",
    "content": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."
}

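# Speaker voices shipped with the 7B checkpoint, mapped from display label to speaker id.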
VOICE_OPTIONS = {
    "Chelsie (Female)": "Chelsie",
    "Ethan (Male)": "Ethan"
}

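# On ZeroGPU Spaces, @spaces.GPU attaches a GPU for the duration of each call; outside of
# Spaces the decorator is effectively a no-op.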
@spaces.GPU
def process_input(image, audio, video, text, chat_history, voice_type, enable_audio_output):
    user_input = {
        "text": text,
        "image": image,
        "audio": audio,
        "video": video
    }

    conversation = [SYSTEM_PROMPT]

    # Rebuild the conversation from the Gradio chat history. Gradio passes history
    # entries as [user, bot] lists, so accept lists as well as tuples.
    if isinstance(chat_history, list):
        for item in chat_history:
            if isinstance(item, (list, tuple)) and len(item) == 2:
                user_msg, bot_msg = item
                if bot_msg is None:
                    # Pending turn echoed by the submit pre-step; it is re-added below.
                    continue
                conversation.append({"role": "user", "content": user_input_to_content(user_msg)})
                conversation.append({"role": "assistant", "content": bot_msg})
    else:
        chat_history = []

    conversation.append({"role": "user", "content": user_input_to_content(user_input)})

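    # process_mm_info (from qwen_omni_utils) walks the conversation and loads the referenced
    # audio/image/video files; use_audio_in_video=True also uses the audio track of any video.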
    # Keep the rendered prompt in its own variable so the user's `text` argument
    # (used for the chat display below) is not overwritten.
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
    audios, images, videos = process_mm_info(conversation, use_audio_in_video=True)

    inputs = processor(
        text=prompt,
        audios=audios,
        images=images,
        videos=videos,
        return_tensors="pt",
        padding=True
    )
    inputs = inputs.to(model.device).to(model.dtype)

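    # With return_audio=True, generate returns both the text token ids and a waveform tensor.
    # Qwen2.5-Omni's speech output is 24 kHz, hence samplerate=24000 when writing the wav below.
    # Depending on the transformers version, the voice-selection argument may be named
    # `speaker` rather than `spk`.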
    if enable_audio_output:
        voice_type_value = VOICE_OPTIONS.get(voice_type, "Chelsie")
        text_ids, audio = model.generate(
            **inputs,
            use_audio_in_video=True,
            return_audio=True,
            spk=voice_type_value
        )

        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            sf.write(
                tmp_file.name,
                audio.reshape(-1).detach().cpu().numpy(),
                samplerate=24000,
            )
            audio_path = tmp_file.name
    else:
        text_ids = model.generate(
            **inputs,
            use_audio_in_video=True,
            return_audio=False
        )
        audio_path = None

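    # Note: the returned ids typically include the prompt tokens as well, so the decoded
    # string may contain the full conversation, not just the new reply.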
    text_response = processor.batch_decode(
        text_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]
    text_response = text_response.strip()

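    # Build a compact label for the user turn shown in the Chatbot.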
    user_message_for_display = str(text) if text is not None else ""
    if image is not None:
        user_message_for_display = (user_message_for_display or "Image uploaded") + " [Image]"
    if audio is not None:
        user_message_for_display = (user_message_for_display or "Audio uploaded") + " [Audio]"
    if video is not None:
        user_message_for_display = (user_message_for_display or "Video uploaded") + " [Video]"

    if not user_message_for_display.strip():
        user_message_for_display = "Multimodal input"

    if not isinstance(chat_history, list):
        chat_history = []
    # Replace the pending (user, None) turn echoed by the submit pre-step, if present.
    if chat_history and isinstance(chat_history[-1], (list, tuple)) and chat_history[-1][1] is None:
        chat_history = chat_history[:-1]
    chat_history.append((user_message_for_display, text_response))

    if enable_audio_output and audio_path:
        return chat_history, text_response, audio_path
    else:
        return chat_history, text_response, None

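# Convert a user turn into the content format expected by apply_chat_template:
# plain strings pass through, while dicts become a list of typed segments
# such as {"type": "image", "image": <path>}.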
def user_input_to_content(user_input):
    if isinstance(user_input, str):
        return user_input
    elif isinstance(user_input, dict):
        content = []
        if "text" in user_input and user_input["text"]:
            content.append({"type": "text", "text": user_input["text"]})
        if "image" in user_input and user_input["image"]:
            content.append({"type": "image", "image": user_input["image"]})
        if "audio" in user_input and user_input["audio"]:
            content.append({"type": "audio", "audio": user_input["audio"]})
        if "video" in user_input and user_input["video"]:
            content.append({"type": "video", "video": user_input["video"]})
        return content
    return user_input

def create_demo():
    with gr.Blocks(title="Qwen2.5-Omni ChatBot", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Qwen2.5-Omni Multimodal ChatBot")
        gr.Markdown("Experience the omni-modal capabilities of Qwen2.5-Omni through text, images, audio, and video interactions.")

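        # Hidden placeholders let the text-only submit path reuse process_input's
        # (image, audio, video, ...) signature.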
        placeholder_image = gr.Image(type="filepath", visible=False)
        placeholder_audio = gr.Audio(type="filepath", visible=False)
        placeholder_video = gr.Video(visible=False)

        with gr.Row():
            with gr.Column(scale=3):
                chatbot = gr.Chatbot(height=600)
                with gr.Accordion("Advanced Options", open=False):
                    voice_type = gr.Dropdown(
                        choices=list(VOICE_OPTIONS.keys()),
                        value="Chelsie (Female)",
                        label="Voice Type"
                    )
                    enable_audio_output = gr.Checkbox(
                        value=True,
                        label="Enable Audio Output"
                    )

                with gr.Tabs():
                    with gr.TabItem("Text Input"):
                        text_input = gr.Textbox(
                            placeholder="Type your message here...",
                            label="Text Input"
                        )
                        text_submit = gr.Button("Send Text")

                    with gr.TabItem("Multimodal Input"):
                        with gr.Row():
                            image_input = gr.Image(
                                type="filepath",
                                label="Upload Image"
                            )
                            audio_input = gr.Audio(
                                type="filepath",
                                label="Upload Audio"
                            )
                        with gr.Row():
                            video_input = gr.Video(
                                label="Upload Video"
                            )
                            additional_text = gr.Textbox(
                                placeholder="Additional text message...",
                                label="Additional Text"
                            )
                        multimodal_submit = gr.Button("Send Multimodal Input")

                clear_button = gr.Button("Clear Chat")

            with gr.Column(scale=1):
                gr.Markdown("## Model Capabilities")
                gr.Markdown("""
                **Qwen2.5-Omni can:**
                - Process and understand text
                - Analyze images and answer questions about them
                - Transcribe and understand audio
                - Analyze video content (with or without audio)
                - Generate natural speech responses
                """)

                gr.Markdown("### Example Prompts")
                gr.Examples(
                    examples=[
                        ["Describe what you see in this image", "image"],
                        ["What is being said in this audio clip?", "audio"],
                        ["What's happening in this video?", "video"],
                        ["Explain Artificial Intelligence in simple terms", "text"],
                        ["Generate a short story about a robot learning to play AlphaGo", "text"]
                    ],
                    inputs=[text_input, gr.Textbox(visible=False)],
                    label="Text Examples"
                )

                audio_output = gr.Audio(
                    label="Model Speech Output",
                    visible=True,
                    autoplay=True
                )
                text_output = gr.Textbox(
                    label="Model Text Response",
                    interactive=False
                )

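        # Two-step submit: first echo the user's turn into the Chatbot (queue=False so it
        # shows up immediately), then run the model and replace the pending turn with the
        # full exchange.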
        text_submit.click(
            fn=lambda text, history: (history if isinstance(history, list) else []) + [(str(text) if text is not None else "", None)],
            inputs=[text_input, chatbot],
            outputs=[chatbot],
            queue=False
        ).then(
            fn=process_input,
            inputs=[placeholder_image, placeholder_audio, placeholder_video, text_input,
                    chatbot, voice_type, enable_audio_output],
            outputs=[chatbot, text_output, audio_output]
        )

        def prepare_multimodal_input(image, audio, video, text, history):
            display_message = str(text) if text is not None else ""
            if image is not None:
                display_message = (display_message + " " if display_message.strip() else "") + "[Image]"
            if audio is not None:
                display_message = (display_message + " " if display_message.strip() else "") + "[Audio]"
            if video is not None:
                display_message = (display_message + " " if display_message.strip() else "") + "[Video]"

            if not display_message.strip():
                display_message = "Multimodal content"

            # Echo the turn as a pending (user, None) pair; process_input fills in the reply.
            history = history if isinstance(history, list) else []
            return history + [(display_message, None)]

        multimodal_submit.click(
            fn=prepare_multimodal_input,
            inputs=[image_input, audio_input, video_input, additional_text, chatbot],
            outputs=[chatbot],
            queue=False
        ).then(
            fn=process_input,
            inputs=[image_input, audio_input, video_input, additional_text,
                    chatbot, voice_type, enable_audio_output],
            outputs=[chatbot, text_output, audio_output]
        )

        def clear_chat():
            return [], None, None

        clear_button.click(
            fn=clear_chat,
            outputs=[chatbot, text_output, audio_output]
        )

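        # Returning a component instance from the callback updates its properties
        # (here, visibility) in Gradio 4.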
        def toggle_audio_output(enable_audio):
            return gr.Audio(visible=enable_audio)

        enable_audio_output.change(
            fn=toggle_audio_output,
            inputs=enable_audio_output,
            outputs=audio_output
        )

    return demo

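# 0.0.0.0:7860 is the conventional binding for Hugging Face Spaces and Docker deployments.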
if __name__ == "__main__":
    demo = create_demo()
    demo.launch(server_name="0.0.0.0", server_port=7860)