Haoyu Lu committed
Commit 2648bb4 · 1 Parent(s): e101549

Add application file

.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
+ *.ico filter=lfs diff=lfs merge=lfs -text
+ *.ttc filter=lfs diff=lfs merge=lfs -text
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,5 @@
+ *.log
+
+ __pycache__
+ *.pyc
+ *.pyo
README.md CHANGED
@@ -1,12 +1,28 @@
  ---
- title: Kimi VL A3B Thinking
+ title: Kimi-VL-A3B-Thinking
- emoji: 🏢
+ emoji: 🤔
- colorFrom: pink
+ colorFrom: green
- colorTo: red
+ colorTo: blue
  sdk: gradio
- sdk_version: 5.25.1
+ sdk_version: 5.24.0
  app_file: app.py
  pinned: false
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+
+ ## Citation
+
+ ```
+ @misc{kimiteam2025kimivltechnicalreport,
+ title={{Kimi-VL} Technical Report},
+ author={Kimi Team and Angang Du and Bohong Yin and Bowei Xing and Bowen Qu and Bowen Wang and Cheng Chen and Chenlin Zhang and Chenzhuang Du and Chu Wei and Congcong Wang and Dehao Zhang and Dikang Du and Dongliang Wang and Enming Yuan and Enzhe Lu and Fang Li and Flood Sung and Guangda Wei and Guokun Lai and Han Zhu and Hao Ding and Hao Hu and Hao Yang and Hao Zhang and Haoning Wu and Haotian Yao and Haoyu Lu and Heng Wang and Hongcheng Gao and Huabin Zheng and Jiaming Li and Jianlin Su and Jianzhou Wang and Jiaqi Deng and Jiezhong Qiu and Jin Xie and Jinhong Wang and Jingyuan Liu and Junjie Yan and Kun Ouyang and Liang Chen and Lin Sui and Longhui Yu and Mengfan Dong and Mengnan Dong and Nuo Xu and Pengyu Cheng and Qizheng Gu and Runjie Zhou and Shaowei Liu and Sihan Cao and Tao Yu and Tianhui Song and Tongtong Bai and Wei Song and Weiran He and Weixiao Huang and Weixin Xu and Xiaokun Yuan and Xingcheng Yao and Xingzhe Wu and Xinxing Zu and Xinyu Zhou and Xinyuan Wang and Y. Charles and Yan Zhong and Yang Li and Yangyang Hu and Yanru Chen and Yejie Wang and Yibo Liu and Yibo Miao and Yidao Qin and Yimin Chen and Yiping Bao and Yiqin Wang and Yongsheng Kang and Yuanxin Liu and Yulun Du and Yuxin Wu and Yuzhi Wang and Yuzi Yan and Zaida Zhou and Zhaowei Li and Zhejun Jiang and Zheng Zhang and Zhilin Yang and Zhiqi Huang and Zihao Huang and Zijia Zhao and Ziwei Chen},
+ year={2025},
+ eprint={2504.07491},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV},
+ url={https://arxiv.org/abs/2504.07491},
+ }
+ ```
+
app.py ADDED
@@ -0,0 +1,344 @@
1
+ import argparse
2
+ import gradio as gr
3
+ import os
4
+ from PIL import Image
5
+ import torch
6
+
7
+ from kimi_vl.serve.frontend import reload_javascript
8
+ from kimi_vl.serve.utils import (
9
+ configure_logger,
10
+ pil_to_base64,
11
+ parse_ref_bbox,
12
+ strip_stop_words,
13
+ is_variable_assigned,
14
+ )
15
+ from kimi_vl.serve.gradio_utils import (
16
+ cancel_outputing,
17
+ delete_last_conversation,
18
+ reset_state,
19
+ reset_textbox,
20
+ transfer_input,
21
+ wrap_gen_fn,
22
+ )
23
+ from kimi_vl.serve.chat_utils import (
24
+ generate_prompt_with_history,
25
+ convert_conversation_to_prompts,
26
+ to_gradio_chatbot,
27
+ to_gradio_history,
28
+ )
29
+ from kimi_vl.serve.inference import kimi_vl_generate, load_model
30
+ from kimi_vl.serve.examples import get_examples
31
+
32
+ TITLE = """<h1 align="left" style="min-width:200px; margin-top:0;">Chat with Kimi-VL-A3B-Thinking🤔 </h1>"""
33
+ DESCRIPTION_TOP = """<a href="https://github.com/MoonshotAI/Kimi-VL" target="_blank">Kimi-VL-A3B-Thinking</a> is a multi-modal LLM that can understand text and images, and generate text with a thinking process. For the non-thinking version, please try [Kimi-VL-A3B](https://huggingface.co/spaces/moonshotai/Kimi-VL-A3B)."""
34
+ DESCRIPTION = """"""
35
+ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
36
+ DEPLOY_MODELS = dict()
37
+ logger = configure_logger()
38
+
39
+
40
+ def parse_args():
41
+ parser = argparse.ArgumentParser()
42
+ parser.add_argument("--model", type=str, default="Kimi-VL-A3B-Thinking")
43
+ parser.add_argument(
44
+ "--local-path",
45
+ type=str,
46
+ default="",
47
+ help="huggingface ckpt, optional",
48
+ )
49
+ parser.add_argument("--ip", type=str, default="0.0.0.0")
50
+ parser.add_argument("--port", type=int, default=7860)
51
+ return parser.parse_args()
52
+
53
+
54
+ def fetch_model(model_name: str):
55
+ global args, DEPLOY_MODELS
56
+
57
+ if args.local_path:
58
+ model_path = args.local_path
59
+ else:
60
+ model_path = f"moonshotai/{args.model}"
61
+
62
+ if model_name in DEPLOY_MODELS:
63
+ model_info = DEPLOY_MODELS[model_name]
64
+ print(f"{model_name} has been loaded.")
65
+ else:
66
+ print(f"{model_name} is loading...")
67
+ DEPLOY_MODELS[model_name] = load_model(model_path)
68
+ print(f"Load {model_name} successfully...")
69
+ model_info = DEPLOY_MODELS[model_name]
70
+
71
+ return model_info
72
+
73
+
74
+ def preview_images(files) -> list[str]:
75
+ if files is None:
76
+ return []
77
+
78
+ image_paths = []
79
+ for file in files:
80
+ image_paths.append(file.name)
81
+ return image_paths
82
+
83
+
84
+ def get_prompt(conversation) -> str:
85
+ """
86
+ Get the prompt for the conversation.
87
+ """
88
+ system_prompt = conversation.system_template.format(system_message=conversation.system_message)
89
+ return system_prompt
90
+
91
+
92
+ @wrap_gen_fn
93
+ def predict(
94
+ text,
95
+ images,
96
+ chatbot,
97
+ history,
98
+ top_p,
99
+ temperature,
100
+ max_length_tokens,
101
+ max_context_length_tokens,
102
+ chunk_size: int = 512,
103
+ ):
104
+ """
105
+ Predict the response for the input text and images.
106
+ Args:
107
+ text (str): The input text.
108
+ images (list[PIL.Image.Image]): The input images.
109
+ chatbot (list): The chatbot.
110
+ history (list): The history.
111
+ top_p (float): The top-p value.
112
+ temperature (float): The temperature value.
113
+ repetition_penalty (float): The repetition penalty value.
114
+ max_length_tokens (int): The max length tokens.
115
+ max_context_length_tokens (int): The max context length tokens.
116
+ chunk_size (int): The chunk size.
117
+ """
118
+ print("running the prediction function")
119
+ try:
120
+ model, processor = fetch_model(args.model)
121
+
122
+ if text == "":
123
+ yield chatbot, history, "Empty context."
124
+ return
125
+ except KeyError:
126
+ yield [[text, "No Model Found"]], [], "No Model Found"
127
+ return
128
+
129
+ if images is None:
130
+ images = []
131
+
132
+ # load images
133
+ pil_images = []
134
+ for img_or_file in images:
135
+ try:
136
+ # load as pil image
137
+ if isinstance(img_or_file, Image.Image):
138
+ pil_images.append(img_or_file)
139
+ else:
140
+ image = Image.open(img_or_file.name).convert("RGB")
141
+ pil_images.append(image)
142
+ except Exception as e:
143
+ print(f"Error loading image: {e}")
144
+
145
+ # generate prompt
146
+ conversation = generate_prompt_with_history(
147
+ text,
148
+ pil_images,
149
+ history,
150
+ processor,
151
+ max_length=max_context_length_tokens,
152
+ )
153
+ all_conv, last_image = convert_conversation_to_prompts(conversation)
154
+ stop_words = conversation.stop_str
155
+ gradio_chatbot_output = to_gradio_chatbot(conversation)
156
+
157
+ full_response = ""
158
+ with torch.no_grad():
159
+ for x in kimi_vl_generate(
160
+ conversations=all_conv,
161
+ model=model,
162
+ processor=processor,
163
+ stop_words=stop_words,
164
+ max_length=max_length_tokens,
165
+ temperature=temperature,
166
+ top_p=top_p,
167
+ ):
168
+ full_response += x
169
+ response = strip_stop_words(full_response, stop_words)
170
+ conversation.update_last_message(response)
171
+ gradio_chatbot_output[-1][1] = response
172
+
173
+ yield gradio_chatbot_output, to_gradio_history(conversation), "Generating..."
174
+
175
+ if last_image is not None:
176
+ vg_image = parse_ref_bbox(response, last_image)
177
+ if vg_image is not None:
178
+ vg_base64 = pil_to_base64(vg_image, "vg", max_size=800, min_size=400)
179
+ gradio_chatbot_output[-1][1] += vg_base64
180
+ yield gradio_chatbot_output, to_gradio_history(conversation), "Generating..."
181
+
182
+ logger.info("flushed result to gradio")
183
+ torch.cuda.empty_cache()
184
+
185
+ if is_variable_assigned("x"):
186
+ print(
187
+ f"temperature: {temperature}, "
188
+ f"top_p: {top_p}, "
189
+ f"max_length_tokens: {max_length_tokens}"
190
+ )
191
+
192
+ yield gradio_chatbot_output, to_gradio_history(conversation), "Generate: Success"
193
+
194
+
195
+ def retry(
196
+ text,
197
+ images,
198
+ chatbot,
199
+ history,
200
+ top_p,
201
+ temperature,
202
+ max_length_tokens,
203
+ max_context_length_tokens,
204
+ chunk_size: int = 512,
205
+ ):
206
+ """
207
+ Retry the response for the input text and images.
208
+ """
209
+ if len(history) == 0:
210
+ yield (chatbot, history, "Empty context")
211
+ return
212
+
213
+ chatbot.pop()
214
+ history.pop()
215
+ text = history.pop()[-1]
216
+ if type(text) is tuple:
217
+ text, _ = text
218
+
219
+ yield from predict(
220
+ text,
221
+ images,
222
+ chatbot,
223
+ history,
224
+ top_p,
225
+ temperature,
226
+ max_length_tokens,
227
+ max_context_length_tokens,
228
+ chunk_size,
229
+ )
230
+
231
+
232
+ def build_demo(args: argparse.Namespace) -> gr.Blocks:
233
+ with gr.Blocks(theme=gr.themes.Soft(), delete_cache=(1800, 1800)) as demo:
234
+ history = gr.State([])
235
+ input_text = gr.State()
236
+ input_images = gr.State()
237
+
238
+ with gr.Row():
239
+ gr.HTML(TITLE)
240
+ status_display = gr.Markdown("Success", elem_id="status_display")
241
+ gr.Markdown(DESCRIPTION_TOP)
242
+
243
+ with gr.Row(equal_height=True):
244
+ with gr.Column(scale=4):
245
+ with gr.Row():
246
+ chatbot = gr.Chatbot(
247
+ elem_id="Kimi-VL-A3B-Thinking-chatbot",
248
+ show_share_button=True,
249
+ bubble_full_width=False,
250
+ height=600,
251
+ )
252
+ with gr.Row():
253
+ with gr.Column(scale=4):
254
+ text_box = gr.Textbox(show_label=False, placeholder="Enter text", container=False)
255
+ with gr.Column(min_width=70):
256
+ submit_btn = gr.Button("Send")
257
+ with gr.Column(min_width=70):
258
+ cancel_btn = gr.Button("Stop")
259
+ with gr.Row():
260
+ empty_btn = gr.Button("🧹 New Conversation")
261
+ retry_btn = gr.Button("🔄 Regenerate")
262
+ del_last_btn = gr.Button("🗑️ Remove Last Turn")
263
+
264
+ with gr.Column():
265
+ # note: at most 2 images can be uploaded at once
266
+ gr.Markdown("Note: you can upload no more than 2 images once")
267
+ upload_images = gr.Files(file_types=["image"], show_label=True)
268
+ gallery = gr.Gallery(columns=[3], height="200px", show_label=True)
269
+ upload_images.change(preview_images, inputs=upload_images, outputs=gallery)
270
+ # Parameter Setting tab for controlling the generation parameters
271
+ with gr.Tab(label="Parameter Setting"):
272
+ top_p = gr.Slider(minimum=0.0, maximum=1.0, value=1.0, step=0.05, interactive=True, label="Top-p")
273
+ temperature = gr.Slider(
274
+ minimum=0, maximum=1.0, value=0.6, step=0.1, interactive=True, label="Temperature"
275
+ )
276
+ max_length_tokens = gr.Slider(
277
+ minimum=512, maximum=8192, value=2048, step=64, interactive=True, label="Max Length Tokens"
278
+ )
279
+ max_context_length_tokens = gr.Slider(
280
+ minimum=512, maximum=8192, value=2048, step=64, interactive=True, label="Max Context Length Tokens"
281
+ )
282
+
283
+ show_images = gr.HTML(visible=False)
284
+
285
+ gr.Examples(
286
+ examples=get_examples(ROOT_DIR),
287
+ inputs=[upload_images, show_images, text_box],
288
+ )
289
+ gr.Markdown()
290
+
291
+ input_widgets = [
292
+ input_text,
293
+ input_images,
294
+ chatbot,
295
+ history,
296
+ top_p,
297
+ temperature,
298
+ max_length_tokens,
299
+ max_context_length_tokens,
300
+ ]
301
+ output_widgets = [chatbot, history, status_display]
302
+
303
+ transfer_input_args = dict(
304
+ fn=transfer_input,
305
+ inputs=[text_box, upload_images],
306
+ outputs=[input_text, input_images, text_box, upload_images, submit_btn],
307
+ show_progress=True,
308
+ )
309
+
310
+ predict_args = dict(fn=predict, inputs=input_widgets, outputs=output_widgets, show_progress=True)
311
+ retry_args = dict(fn=retry, inputs=input_widgets, outputs=output_widgets, show_progress=True)
312
+ reset_args = dict(fn=reset_textbox, inputs=[], outputs=[text_box, status_display])
313
+
314
+ predict_events = [
315
+ text_box.submit(**transfer_input_args).then(**predict_args),
316
+ submit_btn.click(**transfer_input_args).then(**predict_args),
317
+ ]
318
+
319
+ empty_btn.click(reset_state, outputs=output_widgets, show_progress=True)
320
+ empty_btn.click(**reset_args)
321
+ retry_btn.click(**retry_args)
322
+ del_last_btn.click(delete_last_conversation, [chatbot, history], output_widgets, show_progress=True)
323
+ cancel_btn.click(cancel_outputing, [], [status_display], cancels=predict_events)
324
+
325
+ demo.title = "Kimi-VL-A3B-Thinking Chatbot"
326
+ return demo
327
+
328
+
329
+ def main(args: argparse.Namespace):
330
+ demo = build_demo(args)
331
+ reload_javascript()
332
+
333
+ # concurrency_count=CONCURRENT_COUNT, max_size=MAX_EVENTS
334
+ favicon_path = os.path.join("kimi_vl/serve/assets/favicon.ico")
335
+ demo.queue().launch(
336
+ favicon_path=favicon_path,
337
+ server_name=args.ip,
338
+ server_port=args.port,
339
+ )
340
+
341
+
342
+ if __name__ == "__main__":
343
+ args = parse_args()
344
+ main(args)
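
For readers who want to exercise the same inference path without the Gradio UI, here is a minimal sketch built only from the helpers added in this commit (`load_model` and `kimi_vl_generate` from `kimi_vl/serve/inference.py`, with the message layout that `convert_conversation_to_prompts` produces in `app.py`). The checkpoint name, sampling values, and image path are illustrative assumptions, not part of the app itself.

```python
# Hypothetical standalone usage sketch; not one of the committed files.
from PIL import Image

from kimi_vl.serve.inference import load_model, kimi_vl_generate

# load_model returns (model, processor), as used by fetch_model() in app.py.
model, processor = load_model("moonshotai/Kimi-VL-A3B-Thinking")

# Same message shape that convert_conversation_to_prompts() builds:
# a user turn carrying text plus PIL images, followed by an empty assistant turn.
conversation = [
    {"role": "user", "content": "Where am I?", "images": [Image.open("images/demo1.jpeg")]},
    {"role": "assistant", "content": ""},
]

response = ""
for chunk in kimi_vl_generate(
    conversations=conversation,
    model=model,
    processor=processor,
    stop_words=["<|im_end|>"],  # stop_str of the "kimi-vl" template in chat_utils.py
    max_length=2048,
    temperature=0.6,
    top_p=1.0,
):
    response += chunk  # streamed text chunks, exactly as predict() consumes them

print(response)
```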
images/demo1.jpeg ADDED

Git LFS Details

  • SHA256: 8fc81bcaf75321eb871827fb0cad556cc5d3fe304864516c3dbace377fb82b64
  • Pointer size: 132 Bytes
  • Size of remote file: 5.31 MB
images/demo2.jpeg ADDED

Git LFS Details

  • SHA256: fddde8fc86f53fce4625f8defb54640c7fb885f1049eb0fb631d6234ac0e994b
  • Pointer size: 131 Bytes
  • Size of remote file: 450 kB
images/demo3.jpeg ADDED

Git LFS Details

  • SHA256: 9a3833fb7fc115cb7f74454296023e548e2eee32642ccbcee3baa7ad9b561097
  • Pointer size: 130 Bytes
  • Size of remote file: 20.3 kB
images/demo4.jpeg ADDED

Git LFS Details

  • SHA256: 2761a3226f9cd4d894e822c6dc98a4a418a89c4f82e1cc00a57d960fb66fc51f
  • Pointer size: 131 Bytes
  • Size of remote file: 223 kB
images/demo5.jpeg ADDED

Git LFS Details

  • SHA256: 927541679993f7bd2bcd344c04d648bed64ba1a97a4473a16eab1647fa190e8d
  • Pointer size: 131 Bytes
  • Size of remote file: 264 kB
kimi_vl/__init__.py ADDED
File without changes
kimi_vl/serve/__init__.py ADDED
File without changes
kimi_vl/serve/assets/Kelpy-Codos.js ADDED
@@ -0,0 +1,100 @@
1
+ /**
2
+ * Copyright (c) 2023-2024 DeepSeek.
3
+ *
4
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
5
+ * this software and associated documentation files (the "Software"), to deal in
6
+ * the Software without restriction, including without limitation the rights to
7
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
8
+ * the Software, and to permit persons to whom the Software is furnished to do so,
9
+ * subject to the following conditions:
10
+ *
11
+ * The above copyright notice and this permission notice shall be included in all
12
+ * copies or substantial portions of the Software.
13
+ *
14
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
16
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
17
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
18
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
19
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
20
+ */
21
+
22
+ // ==UserScript==
23
+ // @name Kelpy Codos
24
+ // @namespace https://github.com/Keldos-Li/Kelpy-Codos
25
+ // @version 1.0.5
26
+ // @author Keldos; https://keldos.me/
27
+ // @description Add copy button to PRE tags before CODE tag, for Chuanhu ChatGPT especially.
28
+ // Based on Chuanhu ChatGPT version: ac04408 (2023-3-22)
29
+ // @license GPL-3.0
30
+ // @grant none
31
+ // ==/UserScript==
32
+
33
+ (function () {
34
+ "use strict";
35
+
36
+ function addCopyButton(pre) {
37
+ var code = pre.querySelector("code");
38
+ if (!code) {
39
+ return; // if no <code> element is found, do not add the button
40
+ }
41
+ var firstChild = code.firstChild;
42
+ if (!firstChild) {
43
+ return; // if the <code> element has no child nodes, do not add the button
44
+ }
45
+ var button = document.createElement("button");
46
+ button.textContent = "\uD83D\uDCCE"; // use the 📎 symbol as the text of the "copy" button
47
+ button.style.position = "relative";
48
+ button.style.float = "right";
49
+ button.style.fontSize = "1em"; // optional: adjust the button size
50
+ button.style.background = "none"; // optional: remove the background color
51
+ button.style.border = "none"; // optional: remove the border
52
+ button.style.cursor = "pointer"; // optional: show a pointer cursor
53
+ button.addEventListener("click", function () {
54
+ var range = document.createRange();
55
+ range.selectNodeContents(code);
56
+ range.setStartBefore(firstChild); // set the range to start before the first child node
57
+ var selection = window.getSelection();
58
+ selection.removeAllRanges();
59
+ selection.addRange(range);
60
+
61
+ try {
62
+ var success = document.execCommand("copy");
63
+ if (success) {
64
+ button.textContent = "\u2714";
65
+ setTimeout(function () {
66
+ button.textContent = "\uD83D\uDCCE"; // restore the button to "copy"
67
+ }, 2000);
68
+ } else {
69
+ button.textContent = "\u2716";
70
+ }
71
+ } catch (e) {
72
+ console.error(e);
73
+ button.textContent = "\u2716";
74
+ }
75
+
76
+ selection.removeAllRanges();
77
+ });
78
+ code.insertBefore(button, firstChild); // insert the button before the first child element
79
+ }
80
+
81
+ function handleNewElements(mutationsList, observer) {
82
+ for (var mutation of mutationsList) {
83
+ if (mutation.type === "childList") {
84
+ for (var node of mutation.addedNodes) {
85
+ if (node.nodeName === "PRE") {
86
+ addCopyButton(node);
87
+ }
88
+ }
89
+ }
90
+ }
91
+ }
92
+
93
+ var observer = new MutationObserver(handleNewElements);
94
+ observer.observe(document.documentElement, {
95
+ childList: true,
96
+ subtree: true,
97
+ });
98
+
99
+ document.querySelectorAll("pre").forEach(addCopyButton);
100
+ })();
kimi_vl/serve/assets/avatar.png ADDED

Git LFS Details

  • SHA256: 3395211efab793b89a4e579d90bd606b0eb435e2566aedf54bec585e436a8e71
  • Pointer size: 130 Bytes
  • Size of remote file: 62.1 kB
kimi_vl/serve/assets/custom.css ADDED
@@ -0,0 +1,355 @@
1
+ /**
2
+ * Copyright (c) 2023-2024 DeepSeek.
3
+ *
4
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
5
+ * this software and associated documentation files (the "Software"), to deal in
6
+ * the Software without restriction, including without limitation the rights to
7
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
8
+ * the Software, and to permit persons to whom the Software is furnished to do so,
9
+ * subject to the following conditions:
10
+ *
11
+ * The above copyright notice and this permission notice shall be included in all
12
+ * copies or substantial portions of the Software.
13
+ *
14
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
16
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
17
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
18
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
19
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
20
+ */
21
+
22
+ :root {
23
+ --chatbot-color-light: #f3f3f3;
24
+ --chatbot-color-dark: #121111;
25
+ }
26
+
27
+ /* status_display */
28
+ #status_display {
29
+ display: flex;
30
+ min-height: 2.5em;
31
+ align-items: flex-end;
32
+ justify-content: flex-end;
33
+ }
34
+ #status_display p {
35
+ font-size: 0.85em;
36
+ font-family: monospace;
37
+ color: var(--body-text-color-subdued);
38
+ }
39
+
40
+ /* usage_display */
41
+ #usage_display {
42
+ height: 1em;
43
+ }
44
+ #usage_display p {
45
+ padding: 0 1em;
46
+ font-size: 0.85em;
47
+ font-family: monospace;
48
+ color: var(--body-text-color-subdued);
49
+ }
50
+ /* list */
51
+ ol:not(.options),
52
+ ul:not(.options) {
53
+ padding-inline-start: 2em !important;
54
+ }
55
+
56
+ /* Thank @Keldos-Li for fixing it */
57
+ /* Light mode (default) */
58
+ #deepseek_chatbot {
59
+ background-color: var(--chatbot-color-light) !important;
60
+ color: #000000 !important;
61
+ }
62
+ [data-testid="bot"] {
63
+ background-color: #ffffff !important;
64
+ }
65
+ [data-testid="user"] {
66
+ background-color: #95ec69 !important;
67
+ }
68
+
69
+ /* Dark mode */
70
+ .dark #deepseek_chatbot {
71
+ background-color: var(--chatbot-color-dark) !important;
72
+ color: #ffffff !important;
73
+ }
74
+ .dark [data-testid="bot"] {
75
+ background-color: #2c2c2c !important;
76
+ }
77
+ .dark [data-testid="user"] {
78
+ background-color: #26b561 !important;
79
+ }
80
+
81
+ #deepseek_chatbot {
82
+ height: 100%;
83
+ min-height: 800px;
84
+ flex-grow: 1;
85
+ overflow: auto;
86
+ }
87
+
88
+ [class*="message"] {
89
+ border-radius: var(--radius-xl) !important;
90
+ border: none;
91
+ padding: var(--spacing-xl) !important;
92
+ font-size: var(--text-md) !important;
93
+ line-height: var(--line-md) !important;
94
+ min-height: calc(var(--text-md) * var(--line-md) + 2 * var(--spacing-xl));
95
+ min-width: calc(var(--text-md) * var(--line-md) + 2 * var(--spacing-xl));
96
+ }
97
+ [data-testid="bot"] {
98
+ max-width: 85%;
99
+ border-bottom-left-radius: 0 !important;
100
+ }
101
+ [data-testid="user"] {
102
+ max-width: 85%;
103
+ width: auto !important;
104
+ border-bottom-right-radius: 0 !important;
105
+ }
106
+ /* Table */
107
+ table {
108
+ margin: 1em 0;
109
+ border-collapse: collapse;
110
+ empty-cells: show;
111
+ }
112
+ td,
113
+ th {
114
+ border: 1.2px solid var(--border-color-primary) !important;
115
+ padding: 0.2em;
116
+ }
117
+ thead {
118
+ background-color: rgba(175, 184, 193, 0.2);
119
+ }
120
+ thead th {
121
+ padding: 0.5em 0.2em;
122
+ }
123
+ /* Inline code */
124
+ #deepseek_chatbot code {
125
+ display: inline;
126
+ white-space: break-spaces;
127
+ border-radius: 6px;
128
+ margin: 0 2px 0 2px;
129
+ padding: 0.2em 0.4em 0.1em 0.4em;
130
+ background-color: rgba(175, 184, 193, 0.2);
131
+ }
132
+ /* Code block */
133
+ #deepseek_chatbot pre code {
134
+ display: block;
135
+ overflow: auto;
136
+ white-space: pre;
137
+ background-color: #1c1d1e !important;
138
+ border-radius: 10px;
139
+ padding: 1.4em 1.2em 0em 1.4em;
140
+ margin: 1.2em 2em 1.2em 0.5em;
141
+ color: #fdf8f8;
142
+ box-shadow: 6px 6px 16px hsla(0, 0%, 0%, 0.2);
143
+ }
144
+ /* Hightlight */
145
+ #deepseek_chatbot .highlight {
146
+ background-color: transparent;
147
+ }
148
+ #deepseek_chatbot .highlight .hll {
149
+ background-color: #49483e;
150
+ }
151
+ #deepseek_chatbot .highlight .c {
152
+ color: #75715e;
153
+ } /* Comment */
154
+ #deepseek_chatbot .highlight .err {
155
+ color: #960050;
156
+ background-color: #1e0010;
157
+ } /* Error */
158
+ #deepseek_chatbot .highlight .k {
159
+ color: #66d9ef;
160
+ } /* Keyword */
161
+ #deepseek_chatbot .highlight .l {
162
+ color: #ae81ff;
163
+ } /* Literal */
164
+ #deepseek_chatbot .highlight .n {
165
+ color: #f8f8f2;
166
+ } /* Name */
167
+ #deepseek_chatbot .highlight .o {
168
+ color: #f92672;
169
+ } /* Operator */
170
+ #deepseek_chatbot .highlight .p {
171
+ color: #f8f8f2;
172
+ } /* Punctuation */
173
+ #deepseek_chatbot .highlight .ch {
174
+ color: #75715e;
175
+ } /* Comment.Hashbang */
176
+ #deepseek_chatbot .highlight .cm {
177
+ color: #75715e;
178
+ } /* Comment.Multiline */
179
+ #deepseek_chatbot .highlight .cp {
180
+ color: #75715e;
181
+ } /* Comment.Preproc */
182
+ #deepseek_chatbot .highlight .cpf {
183
+ color: #75715e;
184
+ } /* Comment.PreprocFile */
185
+ #deepseek_chatbot .highlight .c1 {
186
+ color: #75715e;
187
+ } /* Comment.Single */
188
+ #deepseek_chatbot .highlight .cs {
189
+ color: #75715e;
190
+ } /* Comment.Special */
191
+ #deepseek_chatbot .highlight .gd {
192
+ color: #f92672;
193
+ } /* Generic.Deleted */
194
+ #deepseek_chatbot .highlight .ge {
195
+ font-style: italic;
196
+ } /* Generic.Emph */
197
+ #deepseek_chatbot .highlight .gi {
198
+ color: #a6e22e;
199
+ } /* Generic.Inserted */
200
+ #deepseek_chatbot .highlight .gs {
201
+ font-weight: bold;
202
+ } /* Generic.Strong */
203
+ #deepseek_chatbot .highlight .gu {
204
+ color: #75715e;
205
+ } /* Generic.Subheading */
206
+ #deepseek_chatbot .highlight .kc {
207
+ color: #66d9ef;
208
+ } /* Keyword.Constant */
209
+ #deepseek_chatbot .highlight .kd {
210
+ color: #66d9ef;
211
+ } /* Keyword.Declaration */
212
+ #deepseek_chatbot .highlight .kn {
213
+ color: #f92672;
214
+ } /* Keyword.Namespace */
215
+ #deepseek_chatbot .highlight .kp {
216
+ color: #66d9ef;
217
+ } /* Keyword.Pseudo */
218
+ #deepseek_chatbot .highlight .kr {
219
+ color: #66d9ef;
220
+ } /* Keyword.Reserved */
221
+ #deepseek_chatbot .highlight .kt {
222
+ color: #66d9ef;
223
+ } /* Keyword.Type */
224
+ #deepseek_chatbot .highlight .ld {
225
+ color: #e6db74;
226
+ } /* Literal.Date */
227
+ #deepseek_chatbot .highlight .m {
228
+ color: #ae81ff;
229
+ } /* Literal.Number */
230
+ #deepseek_chatbot .highlight .s {
231
+ color: #e6db74;
232
+ } /* Literal.String */
233
+ #deepseek_chatbot .highlight .na {
234
+ color: #a6e22e;
235
+ } /* Name.Attribute */
236
+ #deepseek_chatbot .highlight .nb {
237
+ color: #f8f8f2;
238
+ } /* Name.Builtin */
239
+ #deepseek_chatbot .highlight .nc {
240
+ color: #a6e22e;
241
+ } /* Name.Class */
242
+ #deepseek_chatbot .highlight .no {
243
+ color: #66d9ef;
244
+ } /* Name.Constant */
245
+ #deepseek_chatbot .highlight .nd {
246
+ color: #a6e22e;
247
+ } /* Name.Decorator */
248
+ #deepseek_chatbot .highlight .ni {
249
+ color: #f8f8f2;
250
+ } /* Name.Entity */
251
+ #deepseek_chatbot .highlight .ne {
252
+ color: #a6e22e;
253
+ } /* Name.Exception */
254
+ #deepseek_chatbot .highlight .nf {
255
+ color: #a6e22e;
256
+ } /* Name.Function */
257
+ #deepseek_chatbot .highlight .nl {
258
+ color: #f8f8f2;
259
+ } /* Name.Label */
260
+ #deepseek_chatbot .highlight .nn {
261
+ color: #f8f8f2;
262
+ } /* Name.Namespace */
263
+ #deepseek_chatbot .highlight .nx {
264
+ color: #a6e22e;
265
+ } /* Name.Other */
266
+ #deepseek_chatbot .highlight .py {
267
+ color: #f8f8f2;
268
+ } /* Name.Property */
269
+ #deepseek_chatbot .highlight .nt {
270
+ color: #f92672;
271
+ } /* Name.Tag */
272
+ #deepseek_chatbot .highlight .nv {
273
+ color: #f8f8f2;
274
+ } /* Name.Variable */
275
+ #deepseek_chatbot .highlight .ow {
276
+ color: #f92672;
277
+ } /* Operator.Word */
278
+ #deepseek_chatbot .highlight .w {
279
+ color: #f8f8f2;
280
+ } /* Text.Whitespace */
281
+ #deepseek_chatbot .highlight .mb {
282
+ color: #ae81ff;
283
+ } /* Literal.Number.Bin */
284
+ #deepseek_chatbot .highlight .mf {
285
+ color: #ae81ff;
286
+ } /* Literal.Number.Float */
287
+ #deepseek_chatbot .highlight .mh {
288
+ color: #ae81ff;
289
+ } /* Literal.Number.Hex */
290
+ #deepseek_chatbot .highlight .mi {
291
+ color: #ae81ff;
292
+ } /* Literal.Number.Integer */
293
+ #deepseek_chatbot .highlight .mo {
294
+ color: #ae81ff;
295
+ } /* Literal.Number.Oct */
296
+ #deepseek_chatbot .highlight .sa {
297
+ color: #e6db74;
298
+ } /* Literal.String.Affix */
299
+ #deepseek_chatbot .highlight .sb {
300
+ color: #e6db74;
301
+ } /* Literal.String.Backtick */
302
+ #deepseek_chatbot .highlight .sc {
303
+ color: #e6db74;
304
+ } /* Literal.String.Char */
305
+ #deepseek_chatbot .highlight .dl {
306
+ color: #e6db74;
307
+ } /* Literal.String.Delimiter */
308
+ #deepseek_chatbot .highlight .sd {
309
+ color: #e6db74;
310
+ } /* Literal.String.Doc */
311
+ #deepseek_chatbot .highlight .s2 {
312
+ color: #e6db74;
313
+ } /* Literal.String.Double */
314
+ #deepseek_chatbot .highlight .se {
315
+ color: #ae81ff;
316
+ } /* Literal.String.Escape */
317
+ #deepseek_chatbot .highlight .sh {
318
+ color: #e6db74;
319
+ } /* Literal.String.Heredoc */
320
+ #deepseek_chatbot .highlight .si {
321
+ color: #e6db74;
322
+ } /* Literal.String.Interpol */
323
+ #deepseek_chatbot .highlight .sx {
324
+ color: #e6db74;
325
+ } /* Literal.String.Other */
326
+ #deepseek_chatbot .highlight .sr {
327
+ color: #e6db74;
328
+ } /* Literal.String.Regex */
329
+ #deepseek_chatbot .highlight .s1 {
330
+ color: #e6db74;
331
+ } /* Literal.String.Single */
332
+ #deepseek_chatbot .highlight .ss {
333
+ color: #e6db74;
334
+ } /* Literal.String.Symbol */
335
+ #deepseek_chatbot .highlight .bp {
336
+ color: #f8f8f2;
337
+ } /* Name.Builtin.Pseudo */
338
+ #deepseek_chatbot .highlight .fm {
339
+ color: #a6e22e;
340
+ } /* Name.Function.Magic */
341
+ #deepseek_chatbot .highlight .vc {
342
+ color: #f8f8f2;
343
+ } /* Name.Variable.Class */
344
+ #deepseek_chatbot .highlight .vg {
345
+ color: #f8f8f2;
346
+ } /* Name.Variable.Global */
347
+ #deepseek_chatbot .highlight .vi {
348
+ color: #f8f8f2;
349
+ } /* Name.Variable.Instance */
350
+ #deepseek_chatbot .highlight .vm {
351
+ color: #f8f8f2;
352
+ } /* Name.Variable.Magic */
353
+ #deepseek_chatbot .highlight .il {
354
+ color: #ae81ff;
355
+ } /* Literal.Number.Integer.Long */
kimi_vl/serve/assets/custom.js ADDED
@@ -0,0 +1,22 @@
1
+ /**
2
+ * Copyright (c) 2023-2024 DeepSeek.
3
+ *
4
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
5
+ * this software and associated documentation files (the "Software"), to deal in
6
+ * the Software without restriction, including without limitation the rights to
7
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
8
+ * the Software, and to permit persons to whom the Software is furnished to do so,
9
+ * subject to the following conditions:
10
+ *
11
+ * The above copyright notice and this permission notice shall be included in all
12
+ * copies or substantial portions of the Software.
13
+ *
14
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
16
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
17
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
18
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
19
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
20
+ */
21
+
22
+ // custom javascript here
kimi_vl/serve/assets/favicon.ico ADDED

Git LFS Details

  • SHA256: 28dab71bd4190f41c7de510615e91afcba52ad7ce6826fbf86b213205be62b45
  • Pointer size: 130 Bytes
  • Size of remote file: 15.4 kB
kimi_vl/serve/assets/simsun.ttc ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ff7d69bfa6588d3fdedbddbe3a29ac11f0c50236723ee72a9ea49ec3e2553f5d
+ size 15323200
kimi_vl/serve/chat_utils.py ADDED
@@ -0,0 +1,379 @@
1
+ """
2
+ From https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
3
+ """
4
+
5
+ import dataclasses
6
+ import logging
7
+ import copy
8
+ from enum import IntEnum, auto
9
+ from typing import Dict, List
10
+ import base64
11
+
12
+ import gradio as gr
13
+ import torch
14
+
15
+ from .utils import pil_to_base64
16
+
17
+ IMAGE_TOKEN = "<image>"
18
+ logger = logging.getLogger("gradio_logger")
19
+
20
+
21
+ class SeparatorStyle(IntEnum):
22
+ """Separator styles."""
23
+
24
+ PLAIN = auto()
25
+ ALIGNMENT = auto()
26
+ KIMI_VL = auto()
27
+
28
+
29
+ @dataclasses.dataclass
30
+ class Conversation:
31
+ """A class that manages prompt templates and keeps all conversation history."""
32
+
33
+ # The name of this template
34
+ name: str
35
+ # The template of the system prompt
36
+ system_template: str = "{system_message}"
37
+ # The system message
38
+ system_message: str = ""
39
+ # The names of two roles
40
+ roles: List[str] = (("USER", "ASSISTANT"),)
41
+ # All messages. Each item is (role, message).
42
+ messages: List[List[str]] = ()
43
+ # The number of few shot examples
44
+ offset: int = 0
45
+ # The separator style and configurations
46
+ sep_style: SeparatorStyle = SeparatorStyle.PLAIN
47
+ sep: str = "\n"
48
+ sep2: str = None
49
+ # Stop criteria (the default one is EOS token)
50
+ stop_str: str = None
51
+ # Stops generation if meeting any token in this list
52
+ stop_token_ids: List[int] = None
53
+
54
+ def get_prompt(self) -> str:
55
+ """Get the prompt for generation."""
56
+ system_prompt = self.system_template.format(system_message=self.system_message)
57
+ if self.sep_style == SeparatorStyle.PLAIN:
58
+ seps = [self.sep, self.sep2]
59
+ ret = ""
60
+ for i, (role, message) in enumerate(self.messages):
61
+ if message:
62
+ if type(message) is tuple:
63
+ message = message[0]
64
+ if i % 2 == 0:
65
+ ret += message + seps[i % 2]
66
+ else:
67
+ ret += message + seps[i % 2]
68
+ else:
69
+ ret += ""
70
+ return ret
71
+ elif self.sep_style == SeparatorStyle.ALIGNMENT:
72
+ seps = [self.sep, self.sep2]
73
+ ret = ""
74
+ for i, (role, message) in enumerate(self.messages):
75
+ if message:
76
+ if type(message) is tuple:
77
+ message, _, _ = message
78
+ if i % 2 == 0:
79
+ ret += '<image>\n' + seps[i % 2]
80
+ else:
81
+ ret += message + seps[i % 2]
82
+ else:
83
+ ret += ""
84
+ return ret
85
+ elif self.sep_style == SeparatorStyle.KIMI_VL:
86
+ seps = [self.sep, self.sep2]
87
+ if system_prompt == "" or system_prompt is None:
88
+ ret = ""
89
+ else:
90
+ ret = system_prompt + seps[0]
91
+ for i, (role, message) in enumerate(self.messages):
92
+ if message:
93
+ if type(message) is tuple:
94
+ message = message[0]
95
+
96
+ if role == "user":
97
+ ret += message + self.sep
98
+ else:
99
+ if self.sep2 is not None:
100
+ ret += message + self.sep2
101
+ else:
102
+ ret += message
103
+ else:
104
+ ret = ret
105
+ return ret
106
+ else:
107
+ raise ValueError(f"Invalid style: {self.sep_style}")
108
+
109
+ def set_system_message(self, system_message: str):
110
+ """Set the system message."""
111
+ self.system_message = system_message
112
+
113
+ def append_message(self, role: str, message: str):
114
+ """Append a new message."""
115
+ self.messages.append([role, message])
116
+
117
+ def update_last_message(self, message: str):
118
+ """Update the last output.
119
+
120
+ The last message is typically set to be None when constructing the prompt,
121
+ so we need to update it in-place after getting the response from a model.
122
+ """
123
+ self.messages[-1][1] = message
124
+
125
+ def reset_message(self):
126
+ """Reset a new message."""
127
+ self.messages = []
128
+
129
+ def to_gradio_chatbot(self):
130
+ """Convert the conversation to gradio chatbot format."""
131
+ ret = []
132
+ for i, (role, msg) in enumerate(self.messages[self.offset :]):
133
+ if i % 2 == 0:
134
+ ret.append([msg, None])
135
+ else:
136
+ ret[-1][-1] = msg
137
+ return ret
138
+
139
+ def to_openai_api_messages(self):
140
+ """Convert the conversation to OpenAI chat completion format."""
141
+ system_prompt = self.system_template.format(system_message=self.system_message)
142
+ ret = [{"role": "system", "content": system_prompt}]
143
+
144
+ for i, (_, msg) in enumerate(self.messages[self.offset :]):
145
+ if i % 2 == 0:
146
+ ret.append({"role": "user", "content": msg})
147
+ else:
148
+ if msg is not None:
149
+ ret.append({"role": "assistant", "content": msg})
150
+ return ret
151
+
152
+ def copy(self):
153
+ return Conversation(
154
+ name=self.name,
155
+ system_template=self.system_template,
156
+ system_message=self.system_message,
157
+ roles=self.roles,
158
+ messages=[[x, y] for x, y in self.messages],
159
+ offset=self.offset,
160
+ sep_style=self.sep_style,
161
+ sep=self.sep,
162
+ sep2=self.sep2,
163
+ stop_str=self.stop_str,
164
+ stop_token_ids=self.stop_token_ids,
165
+ )
166
+
167
+ def dict(self):
168
+ return {
169
+ "template_name": self.name,
170
+ "system_message": self.system_message,
171
+ "roles": self.roles,
172
+ "messages": self.messages,
173
+ "offset": self.offset,
174
+ }
175
+
176
+
177
+ # A global registry for all conversation templates
178
+ conv_templates: Dict[str, Conversation] = {}
179
+
180
+
181
+ def register_conv_template(template: Conversation, override: bool = False):
182
+ """Register a new conversation template."""
183
+ if not override:
184
+ assert template.name not in conv_templates, f"{template.name} has been registered."
185
+
186
+ conv_templates[template.name] = template
187
+
188
+
189
+ def get_conv_template(name: str) -> Conversation:
190
+ """Get a conversation template."""
191
+ return conv_templates[name].copy()
192
+
193
+
194
+ register_conv_template(
195
+ Conversation(
196
+ name="plain",
197
+ system_template="",
198
+ system_message="",
199
+ roles=("", ""),
200
+ messages=(),
201
+ offset=0,
202
+ sep_style=SeparatorStyle.PLAIN,
203
+ sep="",
204
+ sep2="",
205
+ stop_token_ids=[100001],
206
+ stop_str=['</s>'],
207
+ )
208
+ )
209
+
210
+
211
+ register_conv_template(
212
+ Conversation(
213
+ name="alignment",
214
+ system_template="",
215
+ system_message="",
216
+ roles=("", ""),
217
+ messages=(),
218
+ offset=0,
219
+ sep_style=SeparatorStyle.ALIGNMENT,
220
+ sep="",
221
+ sep2="",
222
+ stop_token_ids=[100001],
223
+ stop_str=['</s>'],
224
+ )
225
+ )
226
+
227
+ register_conv_template(
228
+ Conversation(
229
+ name="kimi-vl",
230
+ system_template="{system_message}",
231
+ system_message="You are a helpful assistant",
232
+ roles=("user", "assistant"),
233
+ messages=(),
234
+ offset=0,
235
+ sep_style=SeparatorStyle.KIMI_VL,
236
+ sep="<|im_end|>",
237
+ sep2=None,
238
+ stop_token_ids=None,
239
+ stop_str=["<|im_end|>"],
240
+ )
241
+ )
242
+
243
+
244
+ def new_chat_template(sft_format: str = "kimi-vl"):
245
+ return get_conv_template(sft_format)
246
+
247
+
248
+ def get_prompt(conv: Conversation) -> str:
249
+ """Get the prompt for generation."""
250
+ return conv.get_prompt()
251
+
252
+
253
+ def generate_prompt_with_history(text, images, history, processor, max_length=2048):
254
+ """
255
+ Generate a prompt with the chat history.
256
+
257
+ Args:
258
+ text (str): The text prompt.
259
+ images (list[PIL.Image.Image]): The image prompt.
260
+ history (list): List of previous conversation messages.
261
+ processor (KimiVLProcessor): The chat processor used for encoding the prompt.
262
+ max_length (int): The maximum length of the prompt.
263
+ """
264
+ global IMAGE_TOKEN
265
+
266
+ user_role_ind = 0
267
+ bot_role_ind = 1
268
+
269
+ # Initialize conversation
270
+ conversation = new_chat_template(sft_format="kimi-vl")
271
+
272
+ if history:
273
+ conversation.messages = history
274
+
275
+ if images is not None and len(images) > 0:
276
+ # num_image_tags = text.count(IMAGE_TOKEN)
277
+ # num_images = len(images)
278
+ # if num_images > num_image_tags:
279
+ # pad_image_tags = num_images - num_image_tags
280
+ # image_tokens = "\n".join([IMAGE_TOKEN] * pad_image_tags)
281
+
282
+ # # append the <image> in a new line after the text prompt
283
+ # text = image_tokens + "\n" + text
284
+ # elif num_images < num_image_tags:
285
+ # remove_image_tags = num_image_tags - num_images
286
+ # text = text.replace(IMAGE_TOKEN, "", remove_image_tags)
287
+
288
+ print(f"prompt = {text}, len(images) = {len(images)}")
289
+ text = (text, images)
290
+
291
+ conversation.append_message(conversation.roles[user_role_ind], text)
292
+ conversation.append_message(conversation.roles[bot_role_ind], "")
293
+
294
+ # Create a copy of the conversation to avoid history truncation in the UI
295
+ conversation_copy = conversation.copy()
296
+ logger.info("=" * 80)
297
+ logger.info(get_prompt(conversation))
298
+
299
+ rounds = len(conversation.messages) // 2
300
+
301
+ for _ in range(rounds):
302
+ current_prompt = get_prompt(conversation)
303
+ assert isinstance(current_prompt, str) and len(current_prompt) > 0, f"current_prompt = {current_prompt}"
304
+ if torch.tensor(processor.tokenizer.encode(current_prompt)).size(-1) <= max_length:
305
+ return conversation_copy
306
+
307
+ if len(conversation.messages) % 2 != 0:
308
+ gr.Error("The messages between user and assistant are not paired.")
309
+ return
310
+
311
+ try:
312
+ for _ in range(2): # pop out two messages in a row
313
+ conversation.messages.pop(0)
314
+ except IndexError:
315
+ gr.Error("Input text processing failed, unable to respond in this round.")
316
+ return None
317
+
318
+ gr.Error("Prompt could not be generated within max_length limit.")
319
+ return None
320
+
321
+
322
+ def convert_conversation_to_prompts(conversation: Conversation):
323
+ """
324
+ Convert the conversation to prompts.
325
+ """
326
+ conv_prompts = []
327
+ last_image = None
328
+
329
+ messages = conversation.messages
330
+ for i in range(0, len(messages), 2):
331
+ if isinstance(messages[i][1], tuple):
332
+ text, images = messages[i][1]
333
+ last_image = images[-1]
334
+ else:
335
+ text, images = messages[i][1], []
336
+
337
+ prompt = {"role": messages[i][0], "content": text, "images": images}
338
+ response = {"role": messages[i + 1][0], "content": messages[i + 1][1]}
339
+ conv_prompts.extend([prompt, response])
340
+
341
+ return conv_prompts, last_image
342
+
343
+
344
+ def to_gradio_chatbot(conversation: Conversation) -> list:
345
+ """Convert the conversation to gradio chatbot format."""
346
+ ret = []
347
+ for i, (_, msg) in enumerate(conversation.messages[conversation.offset :]):
348
+ if i % 2 == 0:
349
+ if type(msg) is tuple:
350
+ msg, images = copy.deepcopy(msg)
351
+
352
+ if isinstance(images, list):
353
+ img_str = ""
354
+ for j, image in enumerate(images):
355
+ if isinstance(image, str):
356
+ with open(image, "rb") as f:
357
+ data = f.read()
358
+ img_b64_str = base64.b64encode(data).decode()
359
+ image_str = (
360
+ f'<img src="data:image/png;base64,{img_b64_str}" '
361
+ f'alt="user upload image" style="max-width: 300px; height: auto;" />'
362
+ )
363
+ else:
364
+ image_str = pil_to_base64(image, f"user upload image_{j}", max_size=800, min_size=400)
365
+
366
+ img_str += image_str
367
+ msg = img_str + msg
368
+ else:
369
+ pass
370
+
371
+ ret.append([msg, None])
372
+ else:
373
+ ret[-1][-1] = msg
374
+ return ret
375
+
376
+
377
+ def to_gradio_history(conversation: Conversation):
378
+ """Convert the conversation to gradio history format."""
379
+ return conversation.messages[conversation.offset :]
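
As a quick illustration of the template machinery defined above, the sketch below builds a one-turn prompt with the registered "kimi-vl" template; the printed string is what `get_prompt()` should return given the separators declared in this file. It is an orientation sketch, not an additional committed file.

```python
# Hypothetical illustration of the Conversation template registered above.
from kimi_vl.serve.chat_utils import get_conv_template

conv = get_conv_template("kimi-vl")          # returns a copy of the registered template
conv.append_message("user", "Where am I?")   # roles are ("user", "assistant")
conv.append_message("assistant", "")         # empty slot to be filled by the model

# With SeparatorStyle.KIMI_VL, the system message and each user turn are terminated
# by sep="<|im_end|>"; assistant turns are appended as-is and empty messages are skipped.
print(conv.get_prompt())
# -> "You are a helpful assistant<|im_end|>Where am I?<|im_end|>"
```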
kimi_vl/serve/examples.py ADDED
@@ -0,0 +1,54 @@
+ import os
+ import io
+ import base64
+ from PIL import Image
+
+ EXAMPLES_LIST = [
+     [
+         ["images/demo1.jpeg"],
+         "Where am I?",
+     ],
+     [
+         ["images/demo2.jpeg"],
+         "Based on the abstract above, write a concise and elegant Twitter post that highlights key points and figures without sounding overly promotional. Use English, include emojis and hashtags.",
+     ],
+     [
+         ["images/demo3.jpeg"],
+         "If you are free, what would you most like to do?"
+     ],
+     # multi-frame example
+     [
+         ["images/demo4.jpeg", "images/demo5.jpeg"],
+         "Please infer step by step who this manuscript belongs to and what it records."
+     ],
+ ]
+
+
+ def display_example(image_list, root_dir: str = None):
+     images_html = ""
+     for _, img_path in enumerate(image_list):
+         if root_dir is not None:
+             img_path = os.path.join(root_dir, img_path)
+
+         image = Image.open(img_path)
+         buffered = io.BytesIO()
+         image.save(buffered, format="PNG", quality=100)
+         img_b64_str = base64.b64encode(buffered.getvalue()).decode()
+         img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="{img_path}" style="height:80px; margin-right: 10px;" />'
+         images_html += img_str
+
+     result_html = f"""
+     <div style="display: flex; align-items: center; margin-bottom: 10px;">
+         <div style="flex: 1; margin-right: 10px;">{images_html}</div>
+     </div>
+     """
+
+     return result_html
+
+
+ def get_examples(root_dir: str = None):
+     examples = []
+     for images, texts in EXAMPLES_LIST:
+         examples.append([images, display_example(images, root_dir), texts])
+
+     return examples
kimi_vl/serve/frontend.py ADDED
@@ -0,0 +1,134 @@
1
+ import logging
2
+ import os
3
+ from typing import List, Tuple
4
+
5
+ import gradio as gr
6
+
7
+ from kimi_vl.serve.utils import convert_asis, convert_mdtext, detect_converted_mark
8
+
9
+ ROOT_PATH = os.path.dirname(os.path.abspath(__file__))
10
+
11
+
12
+ small_and_beautiful_theme = gr.themes.Soft(
13
+ primary_hue=gr.themes.Color(
14
+ c50="#EBFAF2",
15
+ c100="#CFF3E1",
16
+ c200="#A8EAC8",
17
+ c300="#77DEA9",
18
+ c400="#3FD086",
19
+ c500="#02C160",
20
+ c600="#06AE56",
21
+ c700="#05974E",
22
+ c800="#057F45",
23
+ c900="#04673D",
24
+ c950="#2E5541",
25
+ name="small_and_beautiful",
26
+ ),
27
+ secondary_hue=gr.themes.Color(
28
+ c50="#576b95",
29
+ c100="#576b95",
30
+ c200="#576b95",
31
+ c300="#576b95",
32
+ c400="#576b95",
33
+ c500="#576b95",
34
+ c600="#576b95",
35
+ c700="#576b95",
36
+ c800="#576b95",
37
+ c900="#576b95",
38
+ c950="#576b95",
39
+ ),
40
+ neutral_hue=gr.themes.Color(
41
+ name="gray",
42
+ c50="#f6f7f8",
43
+ # c100="#f3f4f6",
44
+ c100="#F2F2F2",
45
+ c200="#e5e7eb",
46
+ c300="#d1d5db",
47
+ c400="#B2B2B2",
48
+ c500="#808080",
49
+ c600="#636363",
50
+ c700="#515151",
51
+ c800="#393939",
52
+ # c900="#272727",
53
+ c900="#2B2B2B",
54
+ c950="#171717",
55
+ ),
56
+ radius_size=gr.themes.sizes.radius_sm,
57
+ ).set(
58
+ # button_primary_background_fill="*primary_500",
59
+ button_primary_background_fill_dark="*primary_600",
60
+ # button_primary_background_fill_hover="*primary_400",
61
+ # button_primary_border_color="*primary_500",
62
+ button_primary_border_color_dark="*primary_600",
63
+ button_primary_text_color="white",
64
+ button_primary_text_color_dark="white",
65
+ button_secondary_background_fill="*neutral_100",
66
+ button_secondary_background_fill_hover="*neutral_50",
67
+ button_secondary_background_fill_dark="*neutral_900",
68
+ button_secondary_text_color="*neutral_800",
69
+ button_secondary_text_color_dark="white",
70
+ # background_fill_primary="#F7F7F7",
71
+ # background_fill_primary_dark="#1F1F1F",
72
+ # block_title_text_color="*primary_500",
73
+ block_title_background_fill_dark="*primary_900",
74
+ block_label_background_fill_dark="*primary_900",
75
+ input_background_fill="#F6F6F6",
76
+ # chatbot_code_background_color_dark="*neutral_950",
77
+ )
78
+
79
+
80
+ def compact_text_chunks(self, prompt, text_chunks: List[str]) -> List[str]:
81
+ logging.debug("Compacting text chunks...🚀🚀🚀")
82
+ combined_str = [c.strip() for c in text_chunks if c.strip()]
83
+ combined_str = [f"[{index+1}] {c}" for index, c in enumerate(combined_str)]
84
+ combined_str = "\n\n".join(combined_str)
85
+ # resplit based on self.max_chunk_overlap
86
+ text_splitter = self.get_text_splitter_given_prompt(prompt, 1, padding=1)
87
+ return text_splitter.split_text(combined_str)
88
+
89
+
90
+ def postprocess(y: List[Tuple[str | None, str | None]]) -> List[Tuple[str | None, str | None]]:
91
+ """
92
+ Parameters:
93
+ y: List of tuples representing the message and response pairs. Each message and response should be a string, which may be in Markdown format.
94
+ Returns:
95
+ List of tuples representing the message and response. Each message and response will be a string of HTML.
96
+ """
97
+ if y is None or y == []:
98
+ return []
99
+ temp = []
100
+ for x in y:
101
+ user, bot = x
102
+ if not detect_converted_mark(user):
103
+ user = convert_asis(user)
104
+ if not detect_converted_mark(bot):
105
+ bot = convert_mdtext(bot)
106
+ temp.append((user, bot))
107
+ return temp
108
+
109
+
110
+ custom_js_path = os.path.join(ROOT_PATH, "assets/custom.js")
111
+ kelpy_codos_path = os.path.join(ROOT_PATH, "assets/Kelpy-Codos.js")
112
+
113
+ with (
114
+ open(custom_js_path, "r", encoding="utf-8") as f,
115
+ open(kelpy_codos_path, "r", encoding="utf-8") as f2,
116
+ ):
117
+ customJS = f.read()
118
+ kelpyCodos = f2.read()
119
+
120
+
121
+ def reload_javascript():
122
+ print("Reloading javascript...")
123
+ js = f"<script>{customJS}</script><script>{kelpyCodos}</script>"
124
+
125
+ def template_response(*args, **kwargs):
126
+ res = GradioTemplateResponseOriginal(*args, **kwargs)
127
+ res.body = res.body.replace(b"</html>", f"{js}</html>".encode("utf8"))
128
+ res.init_headers()
129
+ return res
130
+
131
+ gr.routes.templates.TemplateResponse = template_response
132
+
133
+
134
+ GradioTemplateResponseOriginal = gr.routes.templates.TemplateResponse
kimi_vl/serve/gradio_utils.py ADDED
@@ -0,0 +1,93 @@
1
+ """
2
+ Gradio utils for the Kimi-VL application.
3
+ """
4
+
5
+ import functools
6
+ from typing import Callable
7
+ import traceback
8
+
9
+ import gradio as gr
10
+
11
+
12
+ IMAGE_TOKEN = "<image>"
13
+
14
+
15
+ def transfer_input(input_text, input_images):
16
+ """
17
+ Transfer the input text and images to the input text and images.
18
+ """
19
+ return (input_text, input_images, gr.update(value=""), gr.update(value=None), gr.Button(visible=True))
20
+
21
+
22
+ def delete_last_conversation(chatbot, history):
23
+ """
24
+ Delete the last conversation from the chatbot and history.
25
+
26
+ Args:
27
+ chatbot (list): The chatbot list.
28
+ history (list): The history list.
29
+ """
30
+ if len(history) % 2 != 0:
31
+ gr.Error("history length is not even")
32
+ return (
33
+ chatbot,
34
+ history,
35
+ "Delete Done",
36
+ )
37
+
38
+ if len(chatbot) > 0:
39
+ chatbot.pop()
40
+
41
+ if len(history) > 0 and len(history) % 2 == 0:
42
+ history.pop()
43
+ history.pop()
44
+
45
+ return (
46
+ chatbot,
47
+ history,
48
+ "Delete Done",
49
+ )
50
+
51
+
52
+ def reset_state():
53
+ return [], [], None, "Reset Done"
54
+
55
+
56
+ def reset_textbox():
57
+ return gr.update(value=""), ""
58
+
59
+
60
+ def cancel_outputing():
61
+ return "Stop Done"
62
+
63
+
64
+ class State:
65
+ interrupted = False
66
+
67
+ def interrupt(self):
68
+ self.interrupted = True
69
+
70
+ def recover(self):
71
+ self.interrupted = False
72
+
73
+
74
+ shared_state = State()
75
+
76
+
77
+ def wrap_gen_fn(gen_fn: Callable):
78
+ """
79
+ Wrap the generator function to handle errors.
80
+ """
81
+
82
+ @functools.wraps(gen_fn)
83
+ def wrapped_gen_fn(prompt, *args, **kwargs):
84
+ try:
85
+ yield from gen_fn(prompt, *args, **kwargs)
86
+ except gr.Error as g_err:
87
+ traceback.print_exc()
88
+ raise g_err
89
+ except Exception as e:
90
+ traceback.print_exc()
91
+ raise gr.Error(f"Failed to generate text: {e}") from e
92
+
93
+ return wrapped_gen_fn
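
For context, a small sketch of how the `wrap_gen_fn` decorator above is meant to be used (the same pattern `app.py` applies to `predict`); the toy generator below is purely illustrative and not part of this commit.

```python
# Hypothetical usage sketch for wrap_gen_fn; the generator is a toy example.
from kimi_vl.serve.gradio_utils import wrap_gen_fn

@wrap_gen_fn
def stream_words(prompt):
    # Any unexpected exception raised while streaming is re-raised as gr.Error,
    # so the Gradio frontend shows a readable message instead of a stack trace.
    for word in prompt.split():
        yield word

for piece in stream_words("streamed through the wrapped generator"):
    print(piece)
```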
kimi_vl/serve/inference.py ADDED
@@ -0,0 +1,223 @@
1
+ import logging
2
+ import re
3
+ from threading import Thread
4
+ from typing import List, Optional
5
+
6
+ import torch
7
+ from transformers import (
8
+ AutoModelForCausalLM,
9
+ AutoProcessor,
10
+ AutoConfig,
11
+ StoppingCriteria,
12
+ StoppingCriteriaList,
13
+ TextIteratorStreamer,
14
+ )
15
+
16
+ from .chat_utils import Conversation, get_conv_template
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ def load_model(model_path: str = "moonshotai/Kimi-VL-A3B-Thinking"):
22
+ # hotfix the model to use flash attention 2
23
+ config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
24
+ config._attn_implementation = "flash_attention_2"
25
+ config.vision_config._attn_implementation = "flash_attention_2"
26
+ config.text_config._attn_implementation = "flash_attention_2"
27
+ print("Successfully set the attn_implementation to flash_attention_2")
28
+
29
+ model = AutoModelForCausalLM.from_pretrained(
30
+ model_path,
31
+ config=config,
32
+ torch_dtype="auto",
33
+ device_map="auto",
34
+ trust_remote_code=True,
35
+ )
36
+ processor = AutoProcessor.from_pretrained(model_path, config=config, trust_remote_code=True)
37
+
38
+ return model, processor
39
+
40
+
41
+ class StoppingCriteriaSub(StoppingCriteria):
42
+ def __init__(self, stops=[], encounters=1):
43
+ super().__init__()
44
+ self.stops = [stop.to("cuda") for stop in stops]
45
+
46
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs):
47
+ for stop in self.stops:
48
+ if input_ids.shape[-1] < len(stop):
49
+ continue
50
+ if torch.all((stop == input_ids[0][-len(stop) :])).item():
51
+ return True
52
+
53
+ return False
54
+
55
+
56
+ def format_messages(
57
+ conversations: list[Conversation],
58
+ system_prompt: Optional[str] = "",
59
+ sft_format: Optional[str] = "kimi-vl",
60
+ ):
61
+ """
62
+ Format the conversations to the input format of the model.
63
+ """
64
+ converstion = get_conv_template(sft_format)
65
+ converstion.set_system_message(system_prompt)
66
+ for message in conversations:
67
+ converstion.append_message(message["role"], message["content"])
68
+ return converstion
69
+
70
+
71
+ def preprocess(
72
+ messages: list[dict],
73
+ processor,
74
+ sft_format: Optional[str] = "kimi-vl",
75
+ ):
76
+ """
77
+ Build messages from the conversations and images.
78
+ """
79
+ # get images from conversations
80
+ results = []
81
+ images = []
82
+
83
+ # get texts from conversations
84
+ conversation = get_conv_template(sft_format)
85
+ # only use the last 3 messages
86
+ latest_messages = messages[-3:]
87
+ for mid, message in enumerate(latest_messages):
88
+ if message["role"] == converstion.roles[0] or message["role"] == "user":
89
+ record = {
90
+ "role": message["role"],
91
+ "content": [],
92
+ }
93
+ if "images" in message:
94
+ per_round_images = message["images"]
95
+ if len(per_round_images) > 2:
96
+ per_round_images = per_round_images[-2:]
97
+ print(f"Only use the last 2 images in the {mid}-th round")
98
+
99
+ images.extend(per_round_images)
100
+ for image in per_round_images:
101
+ record["content"].append(
102
+ {
103
+ "type": "image",
104
+ "image": image,
105
+ }
106
+ )
107
+ if 'content' in message:
108
+ record["content"].append(
109
+ {
110
+ "type": "text",
111
+ "text": str(message["content"]).strip(),
112
+ }
113
+ )
114
+ results.append(record)
115
+ elif message["role"] == converstion.roles[1] or message["role"] == "assistant":
116
+ formatted_answer = message["content"].strip()
117
+ # Example assistant reply from the thinking model (translated from Chinese): ◁think▷The user said "Hello", a simple greeting that usually opens a conversation. They may just be greeting politely, or they may have a concrete need such as asking about my capabilities or wanting help. Since no further information was given, I should stay open while inviting them to say more.
118
+ # My reply should be friendly and open-ended, neither overly formal nor cold, and should not assume a specific need.◁/think▷Hello! Nice to meet you. Is there anything I can help you with?
119
+ # delete all the text between ◁think▷ and ◁/think▷ so only the final answer is kept in the chat history
120
+ # FIXME: this is a hack to remove the thinking text
121
+ # formatted_answer = re.sub(r"◁think▷.*◁/think▷", "", formatted_answer)
122
+ think_end_token = '◁/think▷'
123
+ formatted_answer = formatted_answer.split(think_end_token)[-1]
124
+ results.append(
125
+ {
126
+ "role": message["role"],
127
+ "content": [
128
+ {
129
+ "type": "text",
130
+ "text": formatted_answer,
131
+ }
132
+ ],
133
+ }
134
+ )
135
+ assert (
136
+ formatted_answer.count(processor.image_token) == 0
137
+ ), f"there should be no {processor.image_token} in the assistant's reply, but got {messages}"
138
+ conversation.append_message(conversation.roles[1], formatted_answer)
139
+
140
+ text = processor.apply_chat_template(results, add_generation_prompt=True)
141
+ print(f"raw text = {text}")
142
+ if len(images) == 0:
143
+ images = None
144
+
145
+ inputs = processor(
146
+ images=images,
147
+ text=[text],
148
+ return_tensors="pt",
149
+ padding=True,
150
+ truncation=True,
151
+ )
152
+ return inputs
153
+
154
+
155
+ @torch.inference_mode()
156
+ def kimi_vl_generate(
157
+ model: torch.nn.Module,
158
+ processor: AutoProcessor,
159
+ conversations: list[Conversation],
160
+ stop_words: list,
161
+ max_length: int = 256,
162
+ temperature: float = 1.0,
163
+ top_p: float = 1.0,
164
+ chunk_size: int = -1,
165
+ ):
166
+ # convert conversation to inputs
167
+ print(f"conversations = {conversations}")
168
+ inputs = preprocess(conversations, processor=processor)
169
+ inputs = inputs.to(model.device)
170
+
171
+ return generate(
172
+ model,
173
+ processor,
174
+ inputs,
175
+ max_gen_len=max_length,
176
+ temperature=temperature,
177
+ top_p=top_p,
178
+ stop_words=stop_words,
179
+ chunk_size=chunk_size,
180
+ )
181
+
182
+
183
+ @torch.inference_mode()
184
+ def generate(
185
+ model,
186
+ processor,
187
+ inputs,
188
+ max_gen_len: int = 256,
189
+ temperature: float = 0,
190
+ top_p: float = 0.95,
191
+ stop_words: List[str] = [],
192
+ chunk_size: int = -1,
193
+ ):
194
+ """Stream the text output from the multimodality model with prompt and image inputs."""
195
+ tokenizer = processor.tokenizer
196
+ stop_words_ids = [torch.tensor(tokenizer.encode(stop_word)) for stop_word in stop_words]
197
+ stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
198
+ streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
199
+
200
+ kwargs = dict(
201
+ **inputs,
202
+ max_new_tokens=max_gen_len,
203
+ do_sample=True,
204
+ use_cache=True,
205
+ streamer=streamer,
206
+ stopping_criteria=stopping_criteria,
207
+ )
208
+
209
+ if temperature > 0:
210
+ kwargs.update(
211
+ {
212
+ "do_sample": True,
213
+ "top_p": top_p,
214
+ "temperature": temperature,
215
+ }
216
+ )
217
+ else:
218
+ kwargs["do_sample"] = False
219
+
220
+ thread = Thread(target=model.generate, kwargs=kwargs)
221
+ thread.start()
222
+
223
+ yield from streamer
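Together, `load_model`, `preprocess`, and `kimi_vl_generate` form the serving path: the conversation is trimmed to the last three messages (and at most two images per round), templated by the processor, and decoded on a background thread while `TextIteratorStreamer` yields partial text. A rough end-to-end sketch, assuming a CUDA GPU with flash-attention installed; the example conversation and the stop token are illustrative, not taken from this commit:

```python
from PIL import Image

model, processor = load_model("moonshotai/Kimi-VL-A3B-Thinking")

conversation = [
    {
        "role": "user",
        "content": "Describe this image in one sentence.",
        "images": [Image.open("demo.jpeg")],  # hypothetical local image
    },
]

for partial_text in kimi_vl_generate(
    model=model,
    processor=processor,
    conversations=conversation,
    stop_words=["<|im_end|>"],  # assumed stop token; check the model's chat template
    max_length=512,
    temperature=0.6,
    top_p=0.95,
):
    print(partial_text, end="", flush=True)
```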
kimi_vl/serve/utils.py ADDED
@@ -0,0 +1,290 @@
1
+ from __future__ import annotations
2
+
3
+ import html
4
+ import logging
5
+ import io
6
+ import os
7
+ import re
8
+ import base64
9
+ import time
10
+ from PIL import Image, ImageDraw, ImageFont
11
+
12
+ import mdtex2html
13
+ from markdown import markdown
14
+ from pygments import highlight
15
+ from pygments.formatters import HtmlFormatter
16
+ from pygments.lexers import ClassNotFound, get_lexer_by_name, guess_lexer
17
+
18
+
19
+ ALREADY_CONVERTED_MARK = "<!-- ALREADY CONVERTED BY PARSER. -->"
20
+ BOX2COLOR = {
21
+ 0: (255, 0, 0),
22
+ 1: (0, 255, 0),
23
+ 2: (0, 0, 255),
24
+ }
25
+ MAX_IMAGE_SIZE = 1024
26
+ MIN_IMAGE_SIZE = 1024
27
+ logger = logging.getLogger("gradio_logger")
28
+
29
+
30
+ def configure_logger(log_dir: str = "logs"):
31
+ logger = logging.getLogger("gradio_logger")
32
+ logger.setLevel(logging.DEBUG)
33
+
34
+ timestr = time.strftime("%Y%m%d-%H%M%S")
35
+ os.makedirs(log_dir, exist_ok=True)
36
+ file_handler = logging.FileHandler(f"{log_dir}/{timestr}_gradio_log.log")
37
+ console_handler = logging.StreamHandler()
38
+
39
+ formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
40
+ console_handler.setFormatter(formatter)
41
+ file_handler.setFormatter(formatter)
42
+
43
+ console_handler.setLevel(logging.INFO)
44
+ file_handler.setLevel(logging.INFO)
45
+
46
+ logger.addHandler(console_handler)
47
+ logger.addHandler(file_handler)
48
+
49
+ return logger
50
+
51
+
52
+ def strip_stop_words(x, stop_words):
53
+ for w in stop_words:
54
+ if w in x:
55
+ return x[: x.index(w)].strip()
56
+ return x.strip()
57
+
58
+
59
+ def format_output(history, text, x):
60
+ updated_history = history + [[text, x]]
61
+ a = [[y[0], convert_to_markdown(y[1])] for y in updated_history]
62
+ return a, updated_history
63
+
64
+
65
+ def markdown_to_html_with_syntax_highlight(md_str): # deprecated
66
+ def replacer(match):
67
+ lang = match.group(1) or "text"
68
+ code = match.group(2)
69
+
70
+ try:
71
+ lexer = get_lexer_by_name(lang, stripall=True)
72
+ except ValueError:
73
+ lexer = get_lexer_by_name("text", stripall=True)
74
+
75
+ formatter = HtmlFormatter()
76
+ highlighted_code = highlight(code, lexer, formatter)
77
+
78
+ return f'<pre><code class="{lang}">{highlighted_code}</code></pre>'
79
+
80
+ code_block_pattern = r"```(\w+)?\n([\s\S]+?)\n```"
81
+ md_str = re.sub(code_block_pattern, replacer, md_str, flags=re.MULTILINE)
82
+
83
+ html_str = markdown(md_str)
84
+ return html_str
85
+
86
+
87
+ def normalize_markdown(md_text: str) -> str: # deprecated
88
+ lines = md_text.split("\n")
89
+ normalized_lines = []
90
+ inside_list = False
91
+
92
+ for i, line in enumerate(lines):
93
+ if re.match(r"^(\d+\.|-|\*|\+)\s", line.strip()):
94
+ if not inside_list and i > 0 and lines[i - 1].strip() != "":
95
+ normalized_lines.append("")
96
+ inside_list = True
97
+ normalized_lines.append(line)
98
+ elif inside_list and line.strip() == "":
99
+ if i < len(lines) - 1 and not re.match(r"^(\d+\.|-|\*|\+)\s", lines[i + 1].strip()):
100
+ normalized_lines.append(line)
101
+ continue
102
+ else:
103
+ inside_list = False
104
+ normalized_lines.append(line)
105
+
106
+ return "\n".join(normalized_lines)
107
+
108
+
109
+ def convert_mdtext(md_text):
110
+ code_block_pattern = re.compile(r"```(.*?)(?:```|$)", re.DOTALL)
111
+ inline_code_pattern = re.compile(r"`(.*?)`", re.DOTALL)
112
+ code_blocks = code_block_pattern.findall(md_text)
113
+ non_code_parts = code_block_pattern.split(md_text)[::2]
114
+
115
+ result = []
116
+ for non_code, code in zip(non_code_parts, code_blocks + [""]):
117
+ if non_code.strip():
118
+ non_code = normalize_markdown(non_code)
119
+ if inline_code_pattern.search(non_code):
120
+ result.append(markdown(non_code, extensions=["tables"]))
121
+ else:
122
+ result.append(mdtex2html.convert(non_code, extensions=["tables"]))
123
+ if code.strip():
124
+ code = f"\n```{code}\n\n```"
125
+ code = markdown_to_html_with_syntax_highlight(code)
126
+ result.append(code)
127
+ result = "".join(result)
128
+ result += ALREADY_CONVERTED_MARK
129
+ return result
130
+
131
+
132
+ def convert_asis(userinput):
133
+ return f'<p style="white-space:pre-wrap;">{html.escape(userinput)}</p>{ALREADY_CONVERTED_MARK}'
134
+
135
+
136
+ def is_stop_word_or_prefix(s: str, stop_words: list) -> bool:
137
+ return any(s.endswith(stop_word) for stop_word in stop_words)
138
+
139
+
140
+ def detect_converted_mark(userinput):
141
+ return bool(userinput.endswith(ALREADY_CONVERTED_MARK))
142
+
143
+
144
+ def detect_language(code):
145
+ first_line = "" if code.startswith("\n") else code.strip().split("\n", 1)[0]
146
+ language = first_line.lower() if first_line else ""
147
+ code_without_language = code[len(first_line) :].lstrip() if first_line else code
148
+ return language, code_without_language
149
+
150
+
151
+ def convert_to_markdown(text):
152
+ text = text.replace("$", "&#36;")
153
+ text = text.replace("\r\n", "\n")
154
+
155
+ def replace_leading_tabs_and_spaces(line):
156
+ new_line = []
157
+
158
+ for char in line:
159
+ if char == "\t":
160
+ new_line.append("&#9;")
161
+ elif char == " ":
162
+ new_line.append("&nbsp;")
163
+ else:
164
+ break
165
+ return "".join(new_line) + line[len(new_line) :]
166
+
167
+ markdown_text = ""
168
+ lines = text.split("\n")
169
+ in_code_block = False
170
+
171
+ for line in lines:
172
+ if in_code_block is False and line.startswith("```"):
173
+ in_code_block = True
174
+ markdown_text += f"{line}\n"
175
+ elif in_code_block is True and line.startswith("```"):
176
+ in_code_block = False
177
+ markdown_text += f"{line}\n"
178
+ elif in_code_block:
179
+ markdown_text += f"{line}\n"
180
+ else:
181
+ line = replace_leading_tabs_and_spaces(line)
182
+ line = re.sub(r"^(#)", r"\\\1", line)
183
+ markdown_text += f"{line} \n"
184
+
185
+ return markdown_text
186
+
187
+
188
+ def add_language_tag(text):
189
+ def detect_language(code_block):
190
+ try:
191
+ lexer = guess_lexer(code_block)
192
+ return lexer.name.lower()
193
+ except ClassNotFound:
194
+ return ""
195
+
196
+ code_block_pattern = re.compile(r"(```)(\w*\n[^`]+```)", re.MULTILINE)
197
+
198
+ def replacement(match):
199
+ code_block = match.group(2)
200
+ if match.group(2).startswith("\n"):
201
+ language = detect_language(code_block)
202
+ return f"```{language}{code_block}```" if language else f"```\n{code_block}```"
203
+ else:
204
+ return match.group(1) + code_block + "```"
205
+
206
+ text2 = code_block_pattern.sub(replacement, text)
207
+ return text2
208
+
209
+
210
+ def is_variable_assigned(var_name: str) -> bool:
211
+ return var_name in locals()
212
+
213
+
214
+ def pil_to_base64(
215
+ image: Image.Image,
216
+ alt: str = "user upload image",
217
+ resize: bool = True,
218
+ max_size: int = MAX_IMAGE_SIZE,
219
+ min_size: int = MIN_IMAGE_SIZE,
220
+ format: str = "JPEG",
221
+ quality: int = 95,
222
+ ) -> str:
223
+ """
224
+ Convert a PIL image to a base64 string.
225
+ """
226
+
227
+ if resize:
228
+ max_hw, min_hw = max(image.size), min(image.size)
229
+ aspect_ratio = max_hw / min_hw
230
+ shortest_edge = int(min(max_size / aspect_ratio, min_size, min_hw))
231
+ longest_edge = int(shortest_edge * aspect_ratio)
232
+ W, H = image.size
233
+ if H > W:
234
+ H, W = longest_edge, shortest_edge
235
+ else:
236
+ H, W = shortest_edge, longest_edge
237
+ image = image.resize((W, H))
238
+
239
+ buffered = io.BytesIO()
240
+ image.save(buffered, format=format, quality=quality)
241
+ img_b64_str = base64.b64encode(buffered.getvalue()).decode()
242
+ img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="{alt}" />'
243
+
244
+ return img_str
245
+
246
+
247
+ def parse_ref_bbox(response, image: Image.Image):
248
+ try:
249
+ image = image.copy()
250
+ image_h, image_w = image.size
251
+ draw = ImageDraw.Draw(image)
252
+
253
+ ref = re.findall(r'<\|ref\|>.*?<\|/ref\|>', response)
254
+ bbox = re.findall(r'<\|det\|>.*?<\|/det\|>', response)
255
+ assert len(ref) == len(bbox)
256
+
257
+ if len(ref) == 0:
258
+ return None
259
+
260
+ boxes, labels = [], []
261
+ for box, label in zip(bbox, ref):
262
+ box = box.replace('<|det|>', '').replace('<|/det|>', '')
263
+ label = label.replace('<|ref|>', '').replace('<|/ref|>', '')
264
+ box = box[1:-1]
265
+ for onebox in re.findall(r'\[.*?\]', box):
266
+ boxes.append(eval(onebox))
267
+ labels.append(label)
268
+
269
+ for indice, (box, label) in enumerate(zip(boxes, labels)):
270
+ box = (
271
+ int(box[0] / 999 * image_h),
272
+ int(box[1] / 999 * image_w),
273
+ int(box[2] / 999 * image_h),
274
+ int(box[3] / 999 * image_w),
275
+ )
276
+
277
+ box_color = BOX2COLOR[indice % len(BOX2COLOR.keys())]
278
+ box_width = 3
279
+ draw.rectangle(box, outline=box_color, width=box_width)
280
+
281
+ text_x = box[0]
282
+ text_y = box[1] - 20
283
+ text_color = box_color
284
+ font = ImageFont.truetype("kimi_vl/serve/assets/simsun.ttc", size=20)
285
+ draw.text((text_x, text_y), label, font=font, fill=text_color)
286
+
287
+ return image
288
+ except Exception as e:
289
+ logger.error(f"Error parsing reference bounding boxes: {e}")
290
+ return None
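Most of the utilities above are Markdown/HTML glue for the chat UI; the two that touch model output directly are `parse_ref_bbox`, which expects grounding spans in the `<|ref|>label<|/ref|><|det|>[[x1, y1, x2, y2]]<|/det|>` style with coordinates normalized to 0-999, and `pil_to_base64`, which inlines an image into the chat HTML. A small illustrative call; the response string below is made up, and the drawing step silently returns None if the bundled font is missing:

```python
from PIL import Image

image = Image.new("RGB", (640, 480), "white")
response = "<|ref|>red box<|/ref|><|det|>[[100, 120, 500, 400]]<|/det|>"

annotated = parse_ref_bbox(response, image)  # copy of the image with boxes drawn, or None on failure
if annotated is not None:
    html_tag = pil_to_base64(annotated, alt="annotated image", resize=False)
    print(html_tag[:60], "...")
```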
pyproject.toml ADDED
@@ -0,0 +1,29 @@
1
+ [project]
2
+ name = "kimi_vl"
3
+ version = "1.0.0"
4
+ description = "Kimi-VL"
5
+ license = {file = "LICENSE-CODE"}
6
+ readme = "README.md"
7
+ requires-python = ">=3.8"
8
+ dependencies = [
9
+ "torch==2.5.0",
10
+ "transformers==4.50.0",
11
+ "accelerate",
12
+ "tiktoken",
13
+ "blobfile",
14
+ "sentencepiece",
15
+ ]
16
+
17
+ [project.optional-dependencies]
18
+ gradio = [
19
+ "gradio==3.48.0",
20
+ "gradio-client==0.6.1",
21
+ "mdtex2html==1.3.0",
22
+ "pypinyin==0.50.0",
23
+ ]
24
+
25
+ # Linter tools:
26
+ [tool.black]
27
+ line-length = 120
28
+ skip-string-normalization = true
29
+ target-version = ["py310"]
requirements.txt ADDED
@@ -0,0 +1,21 @@
1
+ torch==2.5.0
2
+ torchvision==0.20.0
3
+ transformers==4.51.1
4
+ accelerate
5
+ sentencepiece
6
+ attrdict
7
+ einops
8
+ tiktoken
9
+ blobfile
10
+ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
11
+
12
+ # for gradio demo
13
+ gradio
14
+ gradio-client
15
+ mdtex2html
16
+ pypinyin
17
+ tqdm
18
+ colorama
19
+ Pygments
20
+ markdown
21
+ SentencePiece
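After installing the pinned requirements, a quick Python sanity check can confirm that the GPU build of flash-attention and the expected libraries import cleanly (the printed versions depend on the resolved environment; the wheel above targets CUDA 12, torch 2.5, and Python 3.10):

```python
import torch
import transformers

print("torch:", torch.__version__, "| cuda available:", torch.cuda.is_available())
print("transformers:", transformers.__version__)

try:
    import flash_attn
    print("flash_attn:", flash_attn.__version__)
except ImportError as exc:
    print("flash_attn not importable:", exc)
```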