tommytracx committed on
Commit e3a0e6a · verified · 1 Parent(s): 46bcb17

Upload 4 files

Files changed (4):
  1. README (1).md +12 -0
  2. app (2).py +141 -0
  3. gitattributes (1) +35 -0
  4. requirements.txt +11 -0
README (1).md ADDED
@@ -0,0 +1,12 @@
+ ---
+ title: Gabi Multimodal Demo
+ emoji: 🤖
+ colorFrom: purple
+ colorTo: purple
+ sdk: gradio
+ sdk_version: 5.19.0
+ app_file: app.py
+ pinned: false
+ license: apache-2.0
+ short_description: Gabi powered with Phi4 MultiModal
+ ---
app (2).py ADDED
@@ -0,0 +1,141 @@
+ import gradio as gr
+ from PIL import Image
+ import torch
+ import soundfile as sf
+ from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
+ from urllib.request import urlopen
+ import spaces
+
+ # Define model path
+ model_path = "microsoft/Phi-4-multimodal-instruct"
+
+ # Load model and processor
+ processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+ model = AutoModelForCausalLM.from_pretrained(
+     model_path,
+     device_map="auto",
+     torch_dtype="auto",
+     trust_remote_code=True,
+     _attn_implementation="eager",
+ )
+
+ # Define prompt structure
+ user_prompt = '<|user|>'
+ assistant_prompt = '<|assistant|>'
+ prompt_suffix = '<|end|>'
+
+ # Define inference function
+ @spaces.GPU
+ def process_input(input_type, file, question):
+     if not file or not question:
+         return "Please upload a file and provide a question for Gabi."
+
+     # Prepare the prompt
+     if input_type == "Image":
+         prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
+         # Open image from uploaded file
+         image = Image.open(file)
+         inputs = processor(text=prompt, images=image, return_tensors='pt').to(model.device)
+     elif input_type == "Audio":
+         prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
+         # Read audio from uploaded file
+         audio, samplerate = sf.read(file)
+         inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(model.device)
+     else:
+         return "Invalid input type selected."
+
+     # Generate response
+     with torch.no_grad():
+         generate_ids = model.generate(
+             **inputs,
+             max_new_tokens=200,
+             num_logits_to_keep=0,
+         )
+     generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+     response = processor.batch_decode(
+         generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+     )[0]
+
+     return response
+
+ # Gradio interface
+ with gr.Blocks(
+     title="Demo of how GABI could use a multimodal model",
+     theme=gr.themes.Soft(
+         primary_hue="blue",
+         secondary_hue="gray",
+         radius_size="lg",
+     ),
+ ) as demo:
+
+     # Insert Simli widget
+     gr.HTML(
+         """
+         <simli-widget
+             token="gAAAAABoEN7c6Z4ZuimkCDa7PmB5OgiOqepELAtSQYwUliuC1Zdw6LOPejI0g1XpnDWchiwNCDFDPMd80TVY2NXjnEx2zvnv3FUSXfT4C0dsJT8QTXAklaXyxtGSZD4sG53AFxo1jSzjQWXPnQHVfIU_ISxQqenWluJrCIL1jmEMZehyj3Hx4xpnJ3lOZs3LX4YPPxbUR_CEtIMcp7roc083OVvDJO1Ycxew9KJmiBLqFbiT6hBQUjLi3BLTcEZtl8HxV_YKaKCqZNP9dt73H4a5QTQ5UvypJK2JlQiCWeH6t8LfpON66Hr-aDuZOhTiKbzhNF27jlPHJh6uXyF_rUSRvaOArQJL0S9_x3PCTCi-HBOs9VcSBCe7ICCQFMdQrF1rk7EiGQhjrJeD57rrxZXw6SeOBQjK8-a8JEeS6Fzd7ORNiWXeSEtT46TbVq03X0e44E7hZY90sSwERr2DIeCA7CM5eeHXf_iU_NCl0OwCLgF2Yd6TFQgtT-bPmEnyye5oH-GvZ52U"
+             agentid="ff60ad9c-1afd-4b76-86a0-f94bf6e7b3b2"
+             position="right"
+             customimage="https://i.postimg.cc/K8PPT4GD/temp-Imagerldp-BZ.avif"
+             customtext="FaceTime GABI"
+         ></simli-widget>
+         <script src="https://app.simli.com/simli-widget/index.js" async type="text/javascript"></script>
+         """
+     )
+
+     gr.Markdown(
+         """
+         # This Space uses Phi-4 as the LLM for the Gabi Multimodal Demo
+         Try uploading an **image** or **audio** file, ask Gabi a question, and get a response!
+         We want to leverage this to give GABI the ability to interact with and understand various types of content.
+         """
+     )
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             input_type = gr.Radio(
+                 choices=["Image", "Audio"],
+                 label="Select Input Type",
+                 value="Image",
+             )
+             file_input = gr.File(
+                 label="Upload Your File",
+                 file_types=["image", "audio"],
+             )
+             question_input = gr.Textbox(
+                 label="Your Question",
+                 placeholder="e.g., 'Gabi, what is shown in this image?' or 'Gabi, transcribe this audio.'",
+                 lines=2,
+             )
+             submit_btn = gr.Button("Submit", variant="primary")
+
+         with gr.Column(scale=2):
+             output_text = gr.Textbox(
+                 label="Gabi's Response",
+                 placeholder="Gabi's response will appear here...",
+                 lines=10,
+                 interactive=False,
+             )
+
+     # Example section
+     with gr.Accordion("Examples", open=False):
+         gr.Markdown("Try these examples:")
+         gr.Examples(
+             examples=[
+                 ["Image", "https://www.ilankelman.org/stopsigns/australia.jpg", "Gabi, what is shown in this image?"],
+                 ["Audio", "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac", "Gabi, transcribe the audio to text."],
+             ],
+             inputs=[input_type, file_input, question_input],
+             outputs=output_text,
+             fn=process_input,
+             cache_examples=False,
+         )
+
+     # Connect the submit button
+     submit_btn.click(
+         fn=process_input,
+         inputs=[input_type, file_input, question_input],
+         outputs=output_text,
+     )
+
+ # Launch the demo
+ demo.launch()
gitattributes (1) ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ gradio
+ spaces
+ requests
+ torch
+ pillow
+ soundfile
+ transformers
+ torchvision
+ scipy
+ peft
+ backoff