陈益民 committed
Commit d9b3bd5 · 1 Parent(s): eec1d8a

version 1.0.0
.flake8 ADDED
@@ -0,0 +1,5 @@
1
+ [flake8]
2
+ max-line-length = 120
3
+ max-complexity = 20
4
+ # Ignore E501 line length error. Those that cannot be fixed by yapf automatically are usually not worth fixing.
5
+ ignore = E731, W504, W503, E501, E741, E203, E402, F824
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.png filter=lfs diff=lfs merge=lfs -text
37
+ *.ico filter=lfs diff=lfs merge=lfs -text
38
+ *.ttc filter=lfs diff=lfs merge=lfs -text
39
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *.log
2
+
3
+ __pycache__
4
+ *.pyc
5
+ *.pyo
app.py ADDED
@@ -0,0 +1,344 @@
1
+ import argparse
2
+ import gradio as gr
3
+ import os
4
+ from PIL import Image
5
+ import torch
6
+
7
+ from kimi_vl.serve.frontend import reload_javascript
8
+ from kimi_vl.serve.utils import (
9
+ configure_logger,
10
+ pil_to_base64,
11
+ parse_ref_bbox,
12
+ strip_stop_words,
13
+ is_variable_assigned,
14
+ )
15
+ from kimi_vl.serve.gradio_utils import (
16
+ cancel_outputing,
17
+ delete_last_conversation,
18
+ reset_state,
19
+ reset_textbox,
20
+ transfer_input,
21
+ wrap_gen_fn,
22
+ )
23
+ from kimi_vl.serve.chat_utils import (
24
+ generate_prompt_with_history,
25
+ convert_conversation_to_prompts,
26
+ to_gradio_chatbot,
27
+ to_gradio_history,
28
+ )
29
+ from kimi_vl.serve.inference import kimi_vl_generate, load_model
30
+ from kimi_vl.serve.examples import get_examples
31
+
32
+ TITLE = """<h1 align="left" style="min-width:200px; margin-top:0;">Chat with Kimi-VL-A3B-Thinking </h1>"""
33
+ DESCRIPTION_TOP = """<a href="https://github.com/MoonshotAI/Kimi-VL" target="_blank">Kimi-VL</a> is a multi-modal LLM that understands images and text and generates text."""
34
+ DESCRIPTION = """"""
35
+ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
36
+ DEPLOY_MODELS = dict()
37
+ logger = configure_logger()
38
+
39
+
40
+ def parse_args():
41
+ parser = argparse.ArgumentParser()
42
+ parser.add_argument("--model", type=str, default="Kimi-VL-A3B-Thinking")
43
+ parser.add_argument(
44
+ "--local-path",
45
+ type=str,
46
+ default="",
47
+ help="huggingface ckpt, optional",
48
+ )
49
+ parser.add_argument("--ip", type=str, default="0.0.0.0")
50
+ parser.add_argument("--port", type=int, default=7860)
51
+ return parser.parse_args()
52
+
53
+
54
+ def fetch_model(model_name: str):
55
+ global args, DEPLOY_MODELS
56
+
57
+ if args.local_path:
58
+ model_path = args.local_path
59
+ else:
60
+ model_path = f"moonshotai/{args.model}"
61
+
62
+ if model_name in DEPLOY_MODELS:
63
+ model_info = DEPLOY_MODELS[model_name]
64
+ print(f"{model_name} has been loaded.")
65
+ else:
66
+ print(f"{model_name} is loading...")
67
+ DEPLOY_MODELS[model_name] = load_model(model_path)
68
+ print(f"Loaded {model_name} successfully.")
69
+ model_info = DEPLOY_MODELS[model_name]
70
+
71
+ return model_info
72
+
73
+
74
+ def preview_images(files) -> list[str]:
75
+ if files is None:
76
+ return []
77
+
78
+ image_paths = []
79
+ for file in files:
80
+ image_paths.append(file.name)
81
+ return image_paths
82
+
83
+
84
+ def get_prompt(conversation) -> str:
85
+ """
86
+ Get the prompt for the conversation.
87
+ """
88
+ system_prompt = conversation.system_template.format(system_message=conversation.system_message)
89
+ return system_prompt
90
+
91
+
92
+ @wrap_gen_fn
93
+ def predict(
94
+ text,
95
+ images,
96
+ chatbot,
97
+ history,
98
+ top_p,
99
+ temperature,
100
+ max_length_tokens,
101
+ max_context_length_tokens,
102
+ chunk_size: int = 512,
103
+ ):
104
+ """
105
+ Predict the response for the input text and images.
106
+ Args:
107
+ text (str): The input text.
108
+ images (list[PIL.Image.Image]): The input images.
109
+ chatbot (list): The chatbot.
110
+ history (list): The history.
111
+ top_p (float): The top-p value.
112
+ temperature (float): The temperature value.
113
+ repetition_penalty (float): The repetition penalty value.
114
+ max_length_tokens (int): The max length tokens.
115
+ max_context_length_tokens (int): The max context length tokens.
116
+ chunk_size (int): The chunk size.
117
+ """
118
+ print("running the prediction function")
119
+ try:
120
+ model, processor = fetch_model(args.model)
121
+
122
+ if text == "":
123
+ yield chatbot, history, "Empty context."
124
+ return
125
+ except KeyError:
126
+ yield [[text, "No Model Found"]], [], "No Model Found"
127
+ return
128
+
129
+ if images is None:
130
+ images = []
131
+
132
+ # load images
133
+ pil_images = []
134
+ for img_or_file in images:
135
+ try:
136
+ # load as pil image
137
+ if isinstance(img_or_file, Image.Image):
138
+ pil_images.append(img_or_file)
139
+ else:
140
+ image = Image.open(img_or_file.name).convert("RGB")
141
+ pil_images.append(image)
142
+ except Exception as e:
143
+ print(f"Error loading image: {e}")
144
+
145
+ # generate prompt
146
+ conversation = generate_prompt_with_history(
147
+ text,
148
+ pil_images,
149
+ history,
150
+ processor,
151
+ max_length=max_context_length_tokens,
152
+ )
153
+ all_conv, last_image = convert_conversation_to_prompts(conversation)
154
+ stop_words = conversation.stop_str
155
+ gradio_chatbot_output = to_gradio_chatbot(conversation)
156
+
157
+ full_response = ""
158
+ with torch.no_grad():
159
+ for x in kimi_vl_generate(
160
+ conversations=all_conv,
161
+ model=model,
162
+ processor=processor,
163
+ stop_words=stop_words,
164
+ max_length=max_length_tokens,
165
+ temperature=temperature,
166
+ top_p=top_p,
167
+ ):
168
+ full_response += x
169
+ response = strip_stop_words(full_response, stop_words)
170
+ conversation.update_last_message(response)
171
+ gradio_chatbot_output[-1][1] = response
172
+
173
+ yield gradio_chatbot_output, to_gradio_history(conversation), "Generating..."
174
+
175
+ if last_image is not None:
176
+ vg_image = parse_ref_bbox(response, last_image)
177
+ if vg_image is not None:
178
+ vg_base64 = pil_to_base64(vg_image, "vg", max_size=800, min_size=400)
179
+ gradio_chatbot_output[-1][1] += vg_base64
180
+ yield gradio_chatbot_output, to_gradio_history(conversation), "Generating..."
181
+
182
+ logger.info("flushed result to gradio")
183
+ torch.cuda.empty_cache()
184
+
185
+ if is_variable_assigned("x"):
186
+ print(
187
+ f"temperature: {temperature}, "
188
+ f"top_p: {top_p}, "
189
+ f"max_length_tokens: {max_length_tokens}"
190
+ )
191
+
192
+ yield gradio_chatbot_output, to_gradio_history(conversation), "Generate: Success"
193
+
194
+
195
+ def retry(
196
+ text,
197
+ images,
198
+ chatbot,
199
+ history,
200
+ top_p,
201
+ temperature,
202
+ max_length_tokens,
203
+ max_context_length_tokens,
204
+ chunk_size: int = 512,
205
+ ):
206
+ """
207
+ Retry the response for the input text and images.
208
+ """
209
+ if len(history) == 0:
210
+ yield (chatbot, history, "Empty context")
211
+ return
212
+
213
+ chatbot.pop()
214
+ history.pop()
215
+ text = history.pop()[-1]
216
+ if type(text) is tuple:
217
+ text, _ = text
218
+
219
+ yield from predict(
220
+ text,
221
+ images,
222
+ chatbot,
223
+ history,
224
+ top_p,
225
+ temperature,
226
+ max_length_tokens,
227
+ max_context_length_tokens,
228
+ chunk_size,
229
+ )
230
+
231
+
232
+ def build_demo(args: argparse.Namespace) -> gr.Blocks:
233
+ with gr.Blocks(theme=gr.themes.Soft(), delete_cache=(1800, 1800)) as demo:
234
+ history = gr.State([])
235
+ input_text = gr.State()
236
+ input_images = gr.State()
237
+
238
+ with gr.Row():
239
+ gr.HTML(TITLE)
240
+ status_display = gr.Markdown("Success", elem_id="status_display")
241
+ gr.Markdown(DESCRIPTION_TOP)
242
+
243
+ with gr.Row(equal_height=True):
244
+ with gr.Column(scale=4):
245
+ with gr.Row():
246
+ chatbot = gr.Chatbot(
247
+ elem_id="Kimi-VL-A3B-Thinking-chatbot",
248
+ show_share_button=True,
249
+ bubble_full_width=False,
250
+ height=600,
251
+ )
252
+ with gr.Row():
253
+ with gr.Column(scale=4):
254
+ text_box = gr.Textbox(show_label=False, placeholder="Enter text", container=False)
255
+ with gr.Column(min_width=70):
256
+ submit_btn = gr.Button("Send")
257
+ with gr.Column(min_width=70):
258
+ cancel_btn = gr.Button("Stop")
259
+ with gr.Row():
260
+ empty_btn = gr.Button("🧹 New Conversation")
261
+ retry_btn = gr.Button("🔄 Regenerate")
262
+ del_last_btn = gr.Button("🗑️ Remove Last Turn")
263
+
264
+ with gr.Column():
265
+ # note: at most 2 images can be uploaded per message
266
+ gr.Markdown("Note: you can upload at most 2 images at once")
267
+ upload_images = gr.Files(file_types=["image"], show_label=True)
268
+ gallery = gr.Gallery(columns=[3], height="200px", show_label=True)
269
+ upload_images.change(preview_images, inputs=upload_images, outputs=gallery)
270
+ # Parameter Setting Tab for control the generation parameters
271
+ with gr.Tab(label="Parameter Setting"):
272
+ top_p = gr.Slider(minimum=0, maximum=1.0, value=1.0, step=0.05, interactive=True, label="Top-p")
273
+ temperature = gr.Slider(
274
+ minimum=0, maximum=1.0, value=0.6, step=0.1, interactive=True, label="Temperature"
275
+ )
276
+ max_length_tokens = gr.Slider(
277
+ minimum=512, maximum=8192, value=2048, step=64, interactive=True, label="Max Length Tokens"
278
+ )
279
+ max_context_length_tokens = gr.Slider(
280
+ minimum=512, maximum=8192, value=2048, step=64, interactive=True, label="Max Context Length Tokens"
281
+ )
282
+
283
+ show_images = gr.HTML(visible=False)
284
+
285
+ gr.Examples(
286
+ examples=get_examples(ROOT_DIR),
287
+ inputs=[upload_images, show_images, text_box],
288
+ )
289
+ gr.Markdown()
290
+
291
+ input_widgets = [
292
+ input_text,
293
+ input_images,
294
+ chatbot,
295
+ history,
296
+ top_p,
297
+ temperature,
298
+ max_length_tokens,
299
+ max_context_length_tokens,
300
+ ]
301
+ output_widgets = [chatbot, history, status_display]
302
+
303
+ transfer_input_args = dict(
304
+ fn=transfer_input,
305
+ inputs=[text_box, upload_images],
306
+ outputs=[input_text, input_images, text_box, upload_images, submit_btn],
307
+ show_progress=True,
308
+ )
309
+
310
+ predict_args = dict(fn=predict, inputs=input_widgets, outputs=output_widgets, show_progress=True)
311
+ retry_args = dict(fn=retry, inputs=input_widgets, outputs=output_widgets, show_progress=True)
312
+ reset_args = dict(fn=reset_textbox, inputs=[], outputs=[text_box, status_display])
313
+
314
+ predict_events = [
315
+ text_box.submit(**transfer_input_args).then(**predict_args),
316
+ submit_btn.click(**transfer_input_args).then(**predict_args),
317
+ ]
318
+
319
+ empty_btn.click(reset_state, outputs=output_widgets, show_progress=True)
320
+ empty_btn.click(**reset_args)
321
+ retry_btn.click(**retry_args)
322
+ del_last_btn.click(delete_last_conversation, [chatbot, history], output_widgets, show_progress=True)
323
+ cancel_btn.click(cancel_outputing, [], [status_display], cancels=predict_events)
324
+
325
+ demo.title = "Kimi-VL-A3B-Thinking Chatbot"
326
+ return demo
327
+
328
+
329
+ def main(args: argparse.Namespace):
330
+ demo = build_demo(args)
331
+ reload_javascript()
332
+
333
+ # concurrency_count=CONCURRENT_COUNT, max_size=MAX_EVENTS
334
+ favicon_path = os.path.join(ROOT_DIR, "kimi_vl/serve/assets/favicon.ico")
335
+ demo.queue().launch(
336
+ favicon_path=favicon_path,
337
+ server_name=args.ip,
338
+ server_port=args.port,
339
+ )
340
+
341
+
342
+ if __name__ == "__main__":
343
+ args = parse_args()
344
+ main(args)
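
For reference, the demo can be launched locally with the defaults defined in parse_args above, assuming the kimi_vl dependencies and model weights are available:

    python app.py --model Kimi-VL-A3B-Thinking --ip 0.0.0.0 --port 7860

A local checkpoint can be used instead of the Hugging Face model id by passing --local-path.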
images/demo1.jpeg ADDED

Git LFS Details

  • SHA256: 8fc81bcaf75321eb871827fb0cad556cc5d3fe304864516c3dbace377fb82b64
  • Pointer size: 132 Bytes
  • Size of remote file: 5.31 MB
images/demo2.jpeg ADDED

Git LFS Details

  • SHA256: fddde8fc86f53fce4625f8defb54640c7fb885f1049eb0fb631d6234ac0e994b
  • Pointer size: 131 Bytes
  • Size of remote file: 450 kB
images/demo3.jpeg ADDED

Git LFS Details

  • SHA256: 9a3833fb7fc115cb7f74454296023e548e2eee32642ccbcee3baa7ad9b561097
  • Pointer size: 130 Bytes
  • Size of remote file: 20.3 kB
images/demo4.jpeg ADDED

Git LFS Details

  • SHA256: 2761a3226f9cd4d894e822c6dc98a4a418a89c4f82e1cc00a57d960fb66fc51f
  • Pointer size: 131 Bytes
  • Size of remote file: 223 kB
images/demo5.jpeg ADDED

Git LFS Details

  • SHA256: 927541679993f7bd2bcd344c04d648bed64ba1a97a4473a16eab1647fa190e8d
  • Pointer size: 131 Bytes
  • Size of remote file: 264 kB
kimi_vl/__init__.py ADDED
File without changes
kimi_vl/serve/__init__.py ADDED
File without changes
kimi_vl/serve/assets/Kelpy-Codos.js ADDED
@@ -0,0 +1,100 @@
1
+ /**
2
+ * Copyright (c) 2023-2024 DeepSeek.
3
+ *
4
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
5
+ * this software and associated documentation files (the "Software"), to deal in
6
+ * the Software without restriction, including without limitation the rights to
7
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
8
+ * the Software, and to permit persons to whom the Software is furnished to do so,
9
+ * subject to the following conditions:
10
+ *
11
+ * The above copyright notice and this permission notice shall be included in all
12
+ * copies or substantial portions of the Software.
13
+ *
14
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
16
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
17
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
18
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
19
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
20
+ */
21
+
22
+ // ==UserScript==
23
+ // @name Kelpy Codos
24
+ // @namespace https://github.com/Keldos-Li/Kelpy-Codos
25
+ // @version 1.0.5
26
+ // @author Keldos; https://keldos.me/
27
+ // @description Add copy button to PRE tags before CODE tag, for Chuanhu ChatGPT especially.
28
+ // Based on Chuanhu ChatGPT version: ac04408 (2023-3-22)
29
+ // @license GPL-3.0
30
+ // @grant none
31
+ // ==/UserScript==
32
+
33
+ (function () {
34
+ "use strict";
35
+
36
+ function addCopyButton(pre) {
37
+ var code = pre.querySelector("code");
38
+ if (!code) {
39
+ return; // if there is no <code> element, do not add a button
40
+ }
41
+ var firstChild = code.firstChild;
42
+ if (!firstChild) {
43
+ return; // if the <code> element has no child nodes, do not add a button
44
+ }
45
+ var button = document.createElement("button");
46
+ button.textContent = "\uD83D\uDCCE"; // use the 📎 symbol as the copy-button label
47
+ button.style.position = "relative";
48
+ button.style.float = "right";
49
+ button.style.fontSize = "1em"; // optional: adjust the button size
50
+ button.style.background = "none"; // optional: remove the background color
51
+ button.style.border = "none"; // optional: remove the border
52
+ button.style.cursor = "pointer"; // optional: show a pointer cursor
53
+ button.addEventListener("click", function () {
54
+ var range = document.createRange();
55
+ range.selectNodeContents(code);
56
+ range.setStartBefore(firstChild); // set the range to start before the first child node
57
+ var selection = window.getSelection();
58
+ selection.removeAllRanges();
59
+ selection.addRange(range);
60
+
61
+ try {
62
+ var success = document.execCommand("copy");
63
+ if (success) {
64
+ button.textContent = "\u2714";
65
+ setTimeout(function () {
66
+ button.textContent = "\uD83D\uDCCE"; // restore the copy icon
67
+ }, 2000);
68
+ } else {
69
+ button.textContent = "\u2716";
70
+ }
71
+ } catch (e) {
72
+ console.error(e);
73
+ button.textContent = "\u2716";
74
+ }
75
+
76
+ selection.removeAllRanges();
77
+ });
78
+ code.insertBefore(button, firstChild); // insert the button before the first child element
79
+ }
80
+
81
+ function handleNewElements(mutationsList, observer) {
82
+ for (var mutation of mutationsList) {
83
+ if (mutation.type === "childList") {
84
+ for (var node of mutation.addedNodes) {
85
+ if (node.nodeName === "PRE") {
86
+ addCopyButton(node);
87
+ }
88
+ }
89
+ }
90
+ }
91
+ }
92
+
93
+ var observer = new MutationObserver(handleNewElements);
94
+ observer.observe(document.documentElement, {
95
+ childList: true,
96
+ subtree: true,
97
+ });
98
+
99
+ document.querySelectorAll("pre").forEach(addCopyButton);
100
+ })();
kimi_vl/serve/assets/avatar.png ADDED

Git LFS Details

  • SHA256: 3395211efab793b89a4e579d90bd606b0eb435e2566aedf54bec585e436a8e71
  • Pointer size: 130 Bytes
  • Size of remote file: 62.1 kB
kimi_vl/serve/assets/custom.css ADDED
@@ -0,0 +1,355 @@
1
+ /**
2
+ * Copyright (c) 2023-2024 DeepSeek.
3
+ *
4
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
5
+ * this software and associated documentation files (the "Software"), to deal in
6
+ * the Software without restriction, including without limitation the rights to
7
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
8
+ * the Software, and to permit persons to whom the Software is furnished to do so,
9
+ * subject to the following conditions:
10
+ *
11
+ * The above copyright notice and this permission notice shall be included in all
12
+ * copies or substantial portions of the Software.
13
+ *
14
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
16
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
17
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
18
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
19
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
20
+ */
21
+
22
+ :root {
23
+ --chatbot-color-light: #f3f3f3;
24
+ --chatbot-color-dark: #121111;
25
+ }
26
+
27
+ /* status_display */
28
+ #status_display {
29
+ display: flex;
30
+ min-height: 2.5em;
31
+ align-items: flex-end;
32
+ justify-content: flex-end;
33
+ }
34
+ #status_display p {
35
+ font-size: 0.85em;
36
+ font-family: monospace;
37
+ color: var(--body-text-color-subdued);
38
+ }
39
+
40
+ /* usage_display */
41
+ #usage_display {
42
+ height: 1em;
43
+ }
44
+ #usage_display p {
45
+ padding: 0 1em;
46
+ font-size: 0.85em;
47
+ font-family: monospace;
48
+ color: var(--body-text-color-subdued);
49
+ }
50
+ /* list */
51
+ ol:not(.options),
52
+ ul:not(.options) {
53
+ padding-inline-start: 2em !important;
54
+ }
55
+
56
+ /* Thank @Keldos-Li for fixing it */
57
+ /* Light mode (default) */
58
+ #deepseek_chatbot {
59
+ background-color: var(--chatbot-color-light) !important;
60
+ color: #000000 !important;
61
+ }
62
+ [data-testid="bot"] {
63
+ background-color: #ffffff !important;
64
+ }
65
+ [data-testid="user"] {
66
+ background-color: #95ec69 !important;
67
+ }
68
+
69
+ /* Dark mode */
70
+ .dark #deepseek_chatbot {
71
+ background-color: var(--chatbot-color-dark) !important;
72
+ color: #ffffff !important;
73
+ }
74
+ .dark [data-testid="bot"] {
75
+ background-color: #2c2c2c !important;
76
+ }
77
+ .dark [data-testid="user"] {
78
+ background-color: #26b561 !important;
79
+ }
80
+
81
+ #deepseek_chatbot {
82
+ height: 100%;
83
+ min-height: 800px;
84
+ flex-grow: 1;
85
+ overflow: auto;
86
+ }
87
+
88
+ [class*="message"] {
89
+ border-radius: var(--radius-xl) !important;
90
+ border: none;
91
+ padding: var(--spacing-xl) !important;
92
+ font-size: var(--text-md) !important;
93
+ line-height: var(--line-md) !important;
94
+ min-height: calc(var(--text-md) * var(--line-md) + 2 * var(--spacing-xl));
95
+ min-width: calc(var(--text-md) * var(--line-md) + 2 * var(--spacing-xl));
96
+ }
97
+ [data-testid="bot"] {
98
+ max-width: 85%;
99
+ border-bottom-left-radius: 0 !important;
100
+ }
101
+ [data-testid="user"] {
102
+ max-width: 85%;
103
+ width: auto !important;
104
+ border-bottom-right-radius: 0 !important;
105
+ }
106
+ /* Table */
107
+ table {
108
+ margin: 1em 0;
109
+ border-collapse: collapse;
110
+ empty-cells: show;
111
+ }
112
+ td,
113
+ th {
114
+ border: 1.2px solid var(--border-color-primary) !important;
115
+ padding: 0.2em;
116
+ }
117
+ thead {
118
+ background-color: rgba(175, 184, 193, 0.2);
119
+ }
120
+ thead th {
121
+ padding: 0.5em 0.2em;
122
+ }
123
+ /* Inline code */
124
+ #deepseek_chatbot code {
125
+ display: inline;
126
+ white-space: break-spaces;
127
+ border-radius: 6px;
128
+ margin: 0 2px 0 2px;
129
+ padding: 0.2em 0.4em 0.1em 0.4em;
130
+ background-color: rgba(175, 184, 193, 0.2);
131
+ }
132
+ /* Code block */
133
+ #deepseek_chatbot pre code {
134
+ display: block;
135
+ overflow: auto;
136
+ white-space: pre;
137
+ background-color: #1c1d1e !important;
138
+ border-radius: 10px;
139
+ padding: 1.4em 1.2em 0em 1.4em;
140
+ margin: 1.2em 2em 1.2em 0.5em;
141
+ color: #fdf8f8;
142
+ box-shadow: 6px 6px 16px hsla(0, 0%, 0%, 0.2);
143
+ }
144
+ /* Highlight */
145
+ #deepseek_chatbot .highlight {
146
+ background-color: transparent;
147
+ }
148
+ #deepseek_chatbot .highlight .hll {
149
+ background-color: #49483e;
150
+ }
151
+ #deepseek_chatbot .highlight .c {
152
+ color: #75715e;
153
+ } /* Comment */
154
+ #deepseek_chatbot .highlight .err {
155
+ color: #960050;
156
+ background-color: #1e0010;
157
+ } /* Error */
158
+ #deepseek_chatbot .highlight .k {
159
+ color: #66d9ef;
160
+ } /* Keyword */
161
+ #deepseek_chatbot .highlight .l {
162
+ color: #ae81ff;
163
+ } /* Literal */
164
+ #deepseek_chatbot .highlight .n {
165
+ color: #f8f8f2;
166
+ } /* Name */
167
+ #deepseek_chatbot .highlight .o {
168
+ color: #f92672;
169
+ } /* Operator */
170
+ #deepseek_chatbot .highlight .p {
171
+ color: #f8f8f2;
172
+ } /* Punctuation */
173
+ #deepseek_chatbot .highlight .ch {
174
+ color: #75715e;
175
+ } /* Comment.Hashbang */
176
+ #deepseek_chatbot .highlight .cm {
177
+ color: #75715e;
178
+ } /* Comment.Multiline */
179
+ #deepseek_chatbot .highlight .cp {
180
+ color: #75715e;
181
+ } /* Comment.Preproc */
182
+ #deepseek_chatbot .highlight .cpf {
183
+ color: #75715e;
184
+ } /* Comment.PreprocFile */
185
+ #deepseek_chatbot .highlight .c1 {
186
+ color: #75715e;
187
+ } /* Comment.Single */
188
+ #deepseek_chatbot .highlight .cs {
189
+ color: #75715e;
190
+ } /* Comment.Special */
191
+ #deepseek_chatbot .highlight .gd {
192
+ color: #f92672;
193
+ } /* Generic.Deleted */
194
+ #deepseek_chatbot .highlight .ge {
195
+ font-style: italic;
196
+ } /* Generic.Emph */
197
+ #deepseek_chatbot .highlight .gi {
198
+ color: #a6e22e;
199
+ } /* Generic.Inserted */
200
+ #deepseek_chatbot .highlight .gs {
201
+ font-weight: bold;
202
+ } /* Generic.Strong */
203
+ #deepseek_chatbot .highlight .gu {
204
+ color: #75715e;
205
+ } /* Generic.Subheading */
206
+ #deepseek_chatbot .highlight .kc {
207
+ color: #66d9ef;
208
+ } /* Keyword.Constant */
209
+ #deepseek_chatbot .highlight .kd {
210
+ color: #66d9ef;
211
+ } /* Keyword.Declaration */
212
+ #deepseek_chatbot .highlight .kn {
213
+ color: #f92672;
214
+ } /* Keyword.Namespace */
215
+ #deepseek_chatbot .highlight .kp {
216
+ color: #66d9ef;
217
+ } /* Keyword.Pseudo */
218
+ #deepseek_chatbot .highlight .kr {
219
+ color: #66d9ef;
220
+ } /* Keyword.Reserved */
221
+ #deepseek_chatbot .highlight .kt {
222
+ color: #66d9ef;
223
+ } /* Keyword.Type */
224
+ #deepseek_chatbot .highlight .ld {
225
+ color: #e6db74;
226
+ } /* Literal.Date */
227
+ #deepseek_chatbot .highlight .m {
228
+ color: #ae81ff;
229
+ } /* Literal.Number */
230
+ #deepseek_chatbot .highlight .s {
231
+ color: #e6db74;
232
+ } /* Literal.String */
233
+ #deepseek_chatbot .highlight .na {
234
+ color: #a6e22e;
235
+ } /* Name.Attribute */
236
+ #deepseek_chatbot .highlight .nb {
237
+ color: #f8f8f2;
238
+ } /* Name.Builtin */
239
+ #deepseek_chatbot .highlight .nc {
240
+ color: #a6e22e;
241
+ } /* Name.Class */
242
+ #deepseek_chatbot .highlight .no {
243
+ color: #66d9ef;
244
+ } /* Name.Constant */
245
+ #deepseek_chatbot .highlight .nd {
246
+ color: #a6e22e;
247
+ } /* Name.Decorator */
248
+ #deepseek_chatbot .highlight .ni {
249
+ color: #f8f8f2;
250
+ } /* Name.Entity */
251
+ #deepseek_chatbot .highlight .ne {
252
+ color: #a6e22e;
253
+ } /* Name.Exception */
254
+ #deepseek_chatbot .highlight .nf {
255
+ color: #a6e22e;
256
+ } /* Name.Function */
257
+ #deepseek_chatbot .highlight .nl {
258
+ color: #f8f8f2;
259
+ } /* Name.Label */
260
+ #deepseek_chatbot .highlight .nn {
261
+ color: #f8f8f2;
262
+ } /* Name.Namespace */
263
+ #deepseek_chatbot .highlight .nx {
264
+ color: #a6e22e;
265
+ } /* Name.Other */
266
+ #deepseek_chatbot .highlight .py {
267
+ color: #f8f8f2;
268
+ } /* Name.Property */
269
+ #deepseek_chatbot .highlight .nt {
270
+ color: #f92672;
271
+ } /* Name.Tag */
272
+ #deepseek_chatbot .highlight .nv {
273
+ color: #f8f8f2;
274
+ } /* Name.Variable */
275
+ #deepseek_chatbot .highlight .ow {
276
+ color: #f92672;
277
+ } /* Operator.Word */
278
+ #deepseek_chatbot .highlight .w {
279
+ color: #f8f8f2;
280
+ } /* Text.Whitespace */
281
+ #deepseek_chatbot .highlight .mb {
282
+ color: #ae81ff;
283
+ } /* Literal.Number.Bin */
284
+ #deepseek_chatbot .highlight .mf {
285
+ color: #ae81ff;
286
+ } /* Literal.Number.Float */
287
+ #deepseek_chatbot .highlight .mh {
288
+ color: #ae81ff;
289
+ } /* Literal.Number.Hex */
290
+ #deepseek_chatbot .highlight .mi {
291
+ color: #ae81ff;
292
+ } /* Literal.Number.Integer */
293
+ #deepseek_chatbot .highlight .mo {
294
+ color: #ae81ff;
295
+ } /* Literal.Number.Oct */
296
+ #deepseek_chatbot .highlight .sa {
297
+ color: #e6db74;
298
+ } /* Literal.String.Affix */
299
+ #deepseek_chatbot .highlight .sb {
300
+ color: #e6db74;
301
+ } /* Literal.String.Backtick */
302
+ #deepseek_chatbot .highlight .sc {
303
+ color: #e6db74;
304
+ } /* Literal.String.Char */
305
+ #deepseek_chatbot .highlight .dl {
306
+ color: #e6db74;
307
+ } /* Literal.String.Delimiter */
308
+ #deepseek_chatbot .highlight .sd {
309
+ color: #e6db74;
310
+ } /* Literal.String.Doc */
311
+ #deepseek_chatbot .highlight .s2 {
312
+ color: #e6db74;
313
+ } /* Literal.String.Double */
314
+ #deepseek_chatbot .highlight .se {
315
+ color: #ae81ff;
316
+ } /* Literal.String.Escape */
317
+ #deepseek_chatbot .highlight .sh {
318
+ color: #e6db74;
319
+ } /* Literal.String.Heredoc */
320
+ #deepseek_chatbot .highlight .si {
321
+ color: #e6db74;
322
+ } /* Literal.String.Interpol */
323
+ #deepseek_chatbot .highlight .sx {
324
+ color: #e6db74;
325
+ } /* Literal.String.Other */
326
+ #deepseek_chatbot .highlight .sr {
327
+ color: #e6db74;
328
+ } /* Literal.String.Regex */
329
+ #deepseek_chatbot .highlight .s1 {
330
+ color: #e6db74;
331
+ } /* Literal.String.Single */
332
+ #deepseek_chatbot .highlight .ss {
333
+ color: #e6db74;
334
+ } /* Literal.String.Symbol */
335
+ #deepseek_chatbot .highlight .bp {
336
+ color: #f8f8f2;
337
+ } /* Name.Builtin.Pseudo */
338
+ #deepseek_chatbot .highlight .fm {
339
+ color: #a6e22e;
340
+ } /* Name.Function.Magic */
341
+ #deepseek_chatbot .highlight .vc {
342
+ color: #f8f8f2;
343
+ } /* Name.Variable.Class */
344
+ #deepseek_chatbot .highlight .vg {
345
+ color: #f8f8f2;
346
+ } /* Name.Variable.Global */
347
+ #deepseek_chatbot .highlight .vi {
348
+ color: #f8f8f2;
349
+ } /* Name.Variable.Instance */
350
+ #deepseek_chatbot .highlight .vm {
351
+ color: #f8f8f2;
352
+ } /* Name.Variable.Magic */
353
+ #deepseek_chatbot .highlight .il {
354
+ color: #ae81ff;
355
+ } /* Literal.Number.Integer.Long */
kimi_vl/serve/assets/custom.js ADDED
@@ -0,0 +1,22 @@
1
+ /**
2
+ * Copyright (c) 2023-2024 DeepSeek.
3
+ *
4
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
5
+ * this software and associated documentation files (the "Software"), to deal in
6
+ * the Software without restriction, including without limitation the rights to
7
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
8
+ * the Software, and to permit persons to whom the Software is furnished to do so,
9
+ * subject to the following conditions:
10
+ *
11
+ * The above copyright notice and this permission notice shall be included in all
12
+ * copies or substantial portions of the Software.
13
+ *
14
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
16
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
17
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
18
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
19
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
20
+ */
21
+
22
+ // custom javascript here
kimi_vl/serve/assets/favicon.ico ADDED

Git LFS Details

  • SHA256: 28dab71bd4190f41c7de510615e91afcba52ad7ce6826fbf86b213205be62b45
  • Pointer size: 130 Bytes
  • Size of remote file: 15.4 kB
kimi_vl/serve/assets/simsun.ttc ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff7d69bfa6588d3fdedbddbe3a29ac11f0c50236723ee72a9ea49ec3e2553f5d
3
+ size 15323200
kimi_vl/serve/chat_utils.py ADDED
@@ -0,0 +1,379 @@
1
+ """
2
+ From https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
3
+ """
4
+
5
+ import dataclasses
6
+ import logging
7
+ import copy
8
+ from enum import IntEnum, auto
9
+ from typing import Dict, List
10
+ import base64
11
+
12
+ import gradio as gr
13
+ import torch
14
+
15
+ from .utils import pil_to_base64
16
+
17
+ IMAGE_TOKEN = "<image>"
18
+ logger = logging.getLogger("gradio_logger")
19
+
20
+
21
+ class SeparatorStyle(IntEnum):
22
+ """Separator styles."""
23
+
24
+ PLAIN = auto()
25
+ ALIGNMENT = auto()
26
+ KIMI_VL = auto()
27
+
28
+
29
+ @dataclasses.dataclass
30
+ class Conversation:
31
+ """A class that manages prompt templates and keeps all conversation history."""
32
+
33
+ # The name of this template
34
+ name: str
35
+ # The template of the system prompt
36
+ system_template: str = "{system_message}"
37
+ # The system message
38
+ system_message: str = ""
39
+ # The names of two roles
40
+ roles: List[str] = (("USER", "ASSISTANT"),)
41
+ # All messages. Each item is (role, message).
42
+ messages: List[List[str]] = ()
43
+ # The number of few shot examples
44
+ offset: int = 0
45
+ # The separator style and configurations
46
+ sep_style: SeparatorStyle = SeparatorStyle.PLAIN
47
+ sep: str = "\n"
48
+ sep2: str = None
49
+ # Stop criteria (the default one is EOS token)
50
+ stop_str: str = None
51
+ # Stops generation if meeting any token in this list
52
+ stop_token_ids: List[int] = None
53
+
54
+ def get_prompt(self) -> str:
55
+ """Get the prompt for generation."""
56
+ system_prompt = self.system_template.format(system_message=self.system_message)
57
+ if self.sep_style == SeparatorStyle.PLAIN:
58
+ seps = [self.sep, self.sep2]
59
+ ret = ""
60
+ for i, (role, message) in enumerate(self.messages):
61
+ if message:
62
+ if type(message) is tuple:
63
+ message = message[0]
64
+ if i % 2 == 0:
65
+ ret += message + seps[i % 2]
66
+ else:
67
+ ret += message + seps[i % 2]
68
+ else:
69
+ ret += ""
70
+ return ret
71
+ elif self.sep_style == SeparatorStyle.ALIGNMENT:
72
+ seps = [self.sep, self.sep2]
73
+ ret = ""
74
+ for i, (role, message) in enumerate(self.messages):
75
+ if message:
76
+ if type(message) is tuple:
77
+ message, _, _ = message
78
+ if i % 2 == 0:
79
+ ret += '<image>\n' + seps[i % 2]
80
+ else:
81
+ ret += message + seps[i % 2]
82
+ else:
83
+ ret += ""
84
+ return ret
85
+ elif self.sep_style == SeparatorStyle.KIMI_VL:
86
+ seps = [self.sep, self.sep2]
87
+ if system_prompt == "" or system_prompt is None:
88
+ ret = ""
89
+ else:
90
+ ret = system_prompt + seps[0]
91
+ for i, (role, message) in enumerate(self.messages):
92
+ if message:
93
+ if type(message) is tuple:
94
+ message = message[0]
95
+
96
+ if role == "user":
97
+ ret += message + self.sep
98
+ else:
99
+ if self.sep2 is not None:
100
+ ret += message + self.sep2
101
+ else:
102
+ ret += message
103
+ else:
104
+ ret = ret
105
+ return ret
106
+ else:
107
+ raise ValueError(f"Invalid style: {self.sep_style}")
108
+
109
+ def set_system_message(self, system_message: str):
110
+ """Set the system message."""
111
+ self.system_message = system_message
112
+
113
+ def append_message(self, role: str, message: str):
114
+ """Append a new message."""
115
+ self.messages.append([role, message])
116
+
117
+ def update_last_message(self, message: str):
118
+ """Update the last output.
119
+
120
+ The last message is typically set to be None when constructing the prompt,
121
+ so we need to update it in-place after getting the response from a model.
122
+ """
123
+ self.messages[-1][1] = message
124
+
125
+ def reset_message(self):
126
+ """Reset a new message."""
127
+ self.messages = []
128
+
129
+ def to_gradio_chatbot(self):
130
+ """Convert the conversation to gradio chatbot format."""
131
+ ret = []
132
+ for i, (role, msg) in enumerate(self.messages[self.offset :]):
133
+ if i % 2 == 0:
134
+ ret.append([msg, None])
135
+ else:
136
+ ret[-1][-1] = msg
137
+ return ret
138
+
139
+ def to_openai_api_messages(self):
140
+ """Convert the conversation to OpenAI chat completion format."""
141
+ system_prompt = self.system_template.format(system_message=self.system_message)
142
+ ret = [{"role": "system", "content": system_prompt}]
143
+
144
+ for i, (_, msg) in enumerate(self.messages[self.offset :]):
145
+ if i % 2 == 0:
146
+ ret.append({"role": "user", "content": msg})
147
+ else:
148
+ if msg is not None:
149
+ ret.append({"role": "assistant", "content": msg})
150
+ return ret
151
+
152
+ def copy(self):
153
+ return Conversation(
154
+ name=self.name,
155
+ system_template=self.system_template,
156
+ system_message=self.system_message,
157
+ roles=self.roles,
158
+ messages=[[x, y] for x, y in self.messages],
159
+ offset=self.offset,
160
+ sep_style=self.sep_style,
161
+ sep=self.sep,
162
+ sep2=self.sep2,
163
+ stop_str=self.stop_str,
164
+ stop_token_ids=self.stop_token_ids,
165
+ )
166
+
167
+ def dict(self):
168
+ return {
169
+ "template_name": self.name,
170
+ "system_message": self.system_message,
171
+ "roles": self.roles,
172
+ "messages": self.messages,
173
+ "offset": self.offset,
174
+ }
175
+
176
+
177
+ # A global registry for all conversation templates
178
+ conv_templates: Dict[str, Conversation] = {}
179
+
180
+
181
+ def register_conv_template(template: Conversation, override: bool = False):
182
+ """Register a new conversation template."""
183
+ if not override:
184
+ assert template.name not in conv_templates, f"{template.name} has been registered."
185
+
186
+ conv_templates[template.name] = template
187
+
188
+
189
+ def get_conv_template(name: str) -> Conversation:
190
+ """Get a conversation template."""
191
+ return conv_templates[name].copy()
192
+
193
+
194
+ register_conv_template(
195
+ Conversation(
196
+ name="plain",
197
+ system_template="",
198
+ system_message="",
199
+ roles=("", ""),
200
+ messages=(),
201
+ offset=0,
202
+ sep_style=SeparatorStyle.PLAIN,
203
+ sep="",
204
+ sep2="",
205
+ stop_token_ids=[100001],
206
+ stop_str=['</s>'],
207
+ )
208
+ )
209
+
210
+
211
+ register_conv_template(
212
+ Conversation(
213
+ name="alignment",
214
+ system_template="",
215
+ system_message="",
216
+ roles=("", ""),
217
+ messages=(),
218
+ offset=0,
219
+ sep_style=SeparatorStyle.ALIGNMENT,
220
+ sep="",
221
+ sep2="",
222
+ stop_token_ids=[100001],
223
+ stop_str=['</s>'],
224
+ )
225
+ )
226
+
227
+ register_conv_template(
228
+ Conversation(
229
+ name="kimi-vl",
230
+ system_template="{system_message}",
231
+ system_message="You are a helpful assistant",
232
+ roles=("user", "assistant"),
233
+ messages=(),
234
+ offset=0,
235
+ sep_style=SeparatorStyle.KIMI_VL,
236
+ sep="<|im_end|>",
237
+ sep2=None,
238
+ stop_token_ids=None,
239
+ stop_str=["<|im_end|>"],
240
+ )
241
+ )
242
+
243
+
244
+ def new_chat_template(sft_format: str = "kimi-vl"):
245
+ return get_conv_template(sft_format)
246
+
247
+
248
+ def get_prompt(conv: Conversation) -> str:
249
+ """Get the prompt for generation."""
250
+ return conv.get_prompt()
251
+
252
+
253
+ def generate_prompt_with_history(text, images, history, processor, max_length=2048):
254
+ """
255
+ Generate a prompt with the chat history.
256
+
257
+ Args:
258
+ text (str): The text prompt.
259
+ images (list[PIL.Image.Image]): The image prompt.
260
+ history (list): List of previous conversation messages.
261
+ processor (KimiVLProcessor): The chat processor used for encoding the prompt.
262
+ max_length (int): The maximum length of the prompt.
263
+ """
264
+ global IMAGE_TOKEN
265
+
266
+ user_role_ind = 0
267
+ bot_role_ind = 1
268
+
269
+ # Initialize conversation
270
+ conversation = new_chat_template(sft_format="kimi-vl")
271
+
272
+ if history:
273
+ conversation.messages = history
274
+
275
+ if images is not None and len(images) > 0:
276
+ # num_image_tags = text.count(IMAGE_TOKEN)
277
+ # num_images = len(images)
278
+ # if num_images > num_image_tags:
279
+ # pad_image_tags = num_images - num_image_tags
280
+ # image_tokens = "\n".join([IMAGE_TOKEN] * pad_image_tags)
281
+
282
+ # # append the <image> in a new line after the text prompt
283
+ # text = image_tokens + "\n" + text
284
+ # elif num_images < num_image_tags:
285
+ # remove_image_tags = num_image_tags - num_images
286
+ # text = text.replace(IMAGE_TOKEN, "", remove_image_tags)
287
+
288
+ print(f"prompt = {text}, len(images) = {len(images)}")
289
+ text = (text, images)
290
+
291
+ conversation.append_message(conversation.roles[user_role_ind], text)
292
+ conversation.append_message(conversation.roles[bot_role_ind], "")
293
+
294
+ # Create a copy of the conversation to avoid history truncation in the UI
295
+ conversation_copy = conversation.copy()
296
+ logger.info("=" * 80)
297
+ logger.info(get_prompt(conversation))
298
+
299
+ rounds = len(conversation.messages) // 2
300
+
301
+ for _ in range(rounds):
302
+ current_prompt = get_prompt(conversation)
303
+ assert isinstance(current_prompt, str) and len(current_prompt) > 0, f"current_prompt = {current_prompt}"
304
+ if torch.tensor(processor.tokenizer.encode(current_prompt)).size(-1) <= max_length:
305
+ return conversation_copy
306
+
307
+ if len(conversation.messages) % 2 != 0:
308
+ gr.Error("The messages between user and assistant are not paired.")
309
+ return
310
+
311
+ try:
312
+ for _ in range(2): # pop out two messages in a row
313
+ conversation.messages.pop(0)
314
+ except IndexError:
315
+ gr.Error("Input text processing failed, unable to respond in this round.")
316
+ return None
317
+
318
+ gr.Error("Prompt could not be generated within max_length limit.")
319
+ return None
320
+
321
+
322
+ def convert_conversation_to_prompts(conversation: Conversation):
323
+ """
324
+ Convert the conversation to prompts.
325
+ """
326
+ conv_prompts = []
327
+ last_image = None
328
+
329
+ messages = conversation.messages
330
+ for i in range(0, len(messages), 2):
331
+ if isinstance(messages[i][1], tuple):
332
+ text, images = messages[i][1]
333
+ last_image = images[-1]
334
+ else:
335
+ text, images = messages[i][1], []
336
+
337
+ prompt = {"role": messages[i][0], "content": text, "images": images}
338
+ response = {"role": messages[i + 1][0], "content": messages[i + 1][1]}
339
+ conv_prompts.extend([prompt, response])
340
+
341
+ return conv_prompts, last_image
342
+
343
+
344
+ def to_gradio_chatbot(conversation: Conversation) -> list:
345
+ """Convert the conversation to gradio chatbot format."""
346
+ ret = []
347
+ for i, (_, msg) in enumerate(conversation.messages[conversation.offset :]):
348
+ if i % 2 == 0:
349
+ if type(msg) is tuple:
350
+ msg, images = copy.deepcopy(msg)
351
+
352
+ if isinstance(images, list):
353
+ img_str = ""
354
+ for j, image in enumerate(images):
355
+ if isinstance(image, str):
356
+ with open(image, "rb") as f:
357
+ data = f.read()
358
+ img_b64_str = base64.b64encode(data).decode()
359
+ image_str = (
360
+ f'<img src="data:image/png;base64,{img_b64_str}" '
361
+ f'alt="user upload image" style="max-width: 300px; height: auto;" />'
362
+ )
363
+ else:
364
+ image_str = pil_to_base64(image, f"user upload image_{j}", max_size=800, min_size=400)
365
+
366
+ img_str += image_str
367
+ msg = img_str + msg
368
+ else:
369
+ pass
370
+
371
+ ret.append([msg, None])
372
+ else:
373
+ ret[-1][-1] = msg
374
+ return ret
375
+
376
+
377
+ def to_gradio_history(conversation: Conversation):
378
+ """Convert the conversation to gradio history format."""
379
+ return conversation.messages[conversation.offset :]
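
A minimal sketch of how the conversation templates above are used (the user message here is invented for illustration; names come from chat_utils.py):

    from kimi_vl.serve.chat_utils import get_conv_template

    conv = get_conv_template("kimi-vl")
    conv.append_message("user", "Describe this image.")
    conv.append_message("assistant", "")
    # With sep="<|im_end|>" and the default system message, get_prompt() returns roughly:
    # "You are a helpful assistant<|im_end|>Describe this image.<|im_end|>"
    print(conv.get_prompt())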
kimi_vl/serve/examples.py ADDED
@@ -0,0 +1,54 @@
1
+ import os
2
+ import io
3
+ import base64
4
+ from PIL import Image
5
+
6
+ EXAMPLES_LIST = [
7
+ [
8
+ ["images/demo1.jpeg"],
9
+ "Where am I?",
10
+ ],
11
+ [
12
+ ["images/demo2.jpeg"],
13
+ "Based on the abstract above, write a concise and elegant Twitter post that highlights key points and figures without sounding overly promotional. Use English, include emojis and hashtags.",
14
+ ],
15
+ [
16
+ ["images/demo3.jpeg"],
17
+ "If you are free, what would you most like to do?"
18
+ ],
19
+ # multi-frame example
20
+ [
21
+ ["images/demo4.jpeg", "images/demo5.jpeg"],
22
+ "Please infer step by step who this manuscript belongs to and what it records."
23
+ ],
24
+ ]
25
+
26
+
27
+ def display_example(image_list, root_dir: str = None):
28
+ images_html = ""
29
+ for _, img_path in enumerate(image_list):
30
+ if root_dir is not None:
31
+ img_path = os.path.join(root_dir, img_path)
32
+
33
+ image = Image.open(img_path)
34
+ buffered = io.BytesIO()
35
+ image.save(buffered, format="PNG", quality=100)
36
+ img_b64_str = base64.b64encode(buffered.getvalue()).decode()
37
+ img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="{img_path}" style="height:80px; margin-right: 10px;" />'
38
+ images_html += img_str
39
+
40
+ result_html = f"""
41
+ <div style="display: flex; align-items: center; margin-bottom: 10px;">
42
+ <div style="flex: 1; margin-right: 10px;">{images_html}</div>
43
+ </div>
44
+ """
45
+
46
+ return result_html
47
+
48
+
49
+ def get_examples(root_dir: str = None):
50
+ examples = []
51
+ for images, texts in EXAMPLES_LIST:
52
+ examples.append([images, display_example(images, root_dir), texts])
53
+
54
+ return examples
kimi_vl/serve/frontend.py ADDED
@@ -0,0 +1,134 @@
1
+ import logging
2
+ import os
3
+ from typing import List, Tuple
4
+
5
+ import gradio as gr
6
+
7
+ from kimi_vl.serve.utils import convert_asis, convert_mdtext, detect_converted_mark
8
+
9
+ ROOT_PATH = os.path.dirname(os.path.abspath(__file__))
10
+
11
+
12
+ small_and_beautiful_theme = gr.themes.Soft(
13
+ primary_hue=gr.themes.Color(
14
+ c50="#EBFAF2",
15
+ c100="#CFF3E1",
16
+ c200="#A8EAC8",
17
+ c300="#77DEA9",
18
+ c400="#3FD086",
19
+ c500="#02C160",
20
+ c600="#06AE56",
21
+ c700="#05974E",
22
+ c800="#057F45",
23
+ c900="#04673D",
24
+ c950="#2E5541",
25
+ name="small_and_beautiful",
26
+ ),
27
+ secondary_hue=gr.themes.Color(
28
+ c50="#576b95",
29
+ c100="#576b95",
30
+ c200="#576b95",
31
+ c300="#576b95",
32
+ c400="#576b95",
33
+ c500="#576b95",
34
+ c600="#576b95",
35
+ c700="#576b95",
36
+ c800="#576b95",
37
+ c900="#576b95",
38
+ c950="#576b95",
39
+ ),
40
+ neutral_hue=gr.themes.Color(
41
+ name="gray",
42
+ c50="#f6f7f8",
43
+ # c100="#f3f4f6",
44
+ c100="#F2F2F2",
45
+ c200="#e5e7eb",
46
+ c300="#d1d5db",
47
+ c400="#B2B2B2",
48
+ c500="#808080",
49
+ c600="#636363",
50
+ c700="#515151",
51
+ c800="#393939",
52
+ # c900="#272727",
53
+ c900="#2B2B2B",
54
+ c950="#171717",
55
+ ),
56
+ radius_size=gr.themes.sizes.radius_sm,
57
+ ).set(
58
+ # button_primary_background_fill="*primary_500",
59
+ button_primary_background_fill_dark="*primary_600",
60
+ # button_primary_background_fill_hover="*primary_400",
61
+ # button_primary_border_color="*primary_500",
62
+ button_primary_border_color_dark="*primary_600",
63
+ button_primary_text_color="white",
64
+ button_primary_text_color_dark="white",
65
+ button_secondary_background_fill="*neutral_100",
66
+ button_secondary_background_fill_hover="*neutral_50",
67
+ button_secondary_background_fill_dark="*neutral_900",
68
+ button_secondary_text_color="*neutral_800",
69
+ button_secondary_text_color_dark="white",
70
+ # background_fill_primary="#F7F7F7",
71
+ # background_fill_primary_dark="#1F1F1F",
72
+ # block_title_text_color="*primary_500",
73
+ block_title_background_fill_dark="*primary_900",
74
+ block_label_background_fill_dark="*primary_900",
75
+ input_background_fill="#F6F6F6",
76
+ # chatbot_code_background_color_dark="*neutral_950",
77
+ )
78
+
79
+
80
+ def compact_text_chunks(self, prompt, text_chunks: List[str]) -> List[str]:
81
+ logging.debug("Compacting text chunks...🚀🚀🚀")
82
+ combined_str = [c.strip() for c in text_chunks if c.strip()]
83
+ combined_str = [f"[{index+1}] {c}" for index, c in enumerate(combined_str)]
84
+ combined_str = "\n\n".join(combined_str)
85
+ # resplit based on self.max_chunk_overlap
86
+ text_splitter = self.get_text_splitter_given_prompt(prompt, 1, padding=1)
87
+ return text_splitter.split_text(combined_str)
88
+
89
+
90
+ def postprocess(y: List[Tuple[str | None, str | None]]) -> List[Tuple[str | None, str | None]]:
91
+ """
92
+ Parameters:
93
+ y: List of tuples representing the message and response pairs. Each message and response should be a string, which may be in Markdown format.
94
+ Returns:
95
+ List of tuples representing the message and response. Each message and response will be a string of HTML.
96
+ """
97
+ if y is None or y == []:
98
+ return []
99
+ temp = []
100
+ for x in y:
101
+ user, bot = x
102
+ if not detect_converted_mark(user):
103
+ user = convert_asis(user)
104
+ if not detect_converted_mark(bot):
105
+ bot = convert_mdtext(bot)
106
+ temp.append((user, bot))
107
+ return temp
108
+
109
+
110
+ custom_js_path = os.path.join(ROOT_PATH, "assets/custom.js")
111
+ kelpy_codos_path = os.path.join(ROOT_PATH, "assets/Kelpy-Codos.js")
112
+
113
+ with (
114
+ open(custom_js_path, "r", encoding="utf-8") as f,
115
+ open(kelpy_codos_path, "r", encoding="utf-8") as f2,
116
+ ):
117
+ customJS = f.read()
118
+ kelpyCodos = f2.read()
119
+
120
+
121
+ def reload_javascript():
122
+ print("Reloading javascript...")
123
+ js = f"<script>{customJS}</script><script>{kelpyCodos}</script>"
124
+
125
+ def template_response(*args, **kwargs):
126
+ res = GradioTemplateResponseOriginal(*args, **kwargs)
127
+ res.body = res.body.replace(b"</html>", f"{js}</html>".encode("utf8"))
128
+ res.init_headers()
129
+ return res
130
+
131
+ gr.routes.templates.TemplateResponse = template_response
132
+
133
+
134
+ GradioTemplateResponseOriginal = gr.routes.templates.TemplateResponse
kimi_vl/serve/gradio_utils.py ADDED
@@ -0,0 +1,93 @@
1
+ """
2
+ Gradio utils for the Kimi-VL application.
3
+ """
4
+
5
+ import functools
6
+ from typing import Callable
7
+ import traceback
8
+
9
+ import gradio as gr
10
+
11
+
12
+ IMAGE_TOKEN = "<image>"
13
+
14
+
15
+ def transfer_input(input_text, input_images):
16
+ """
17
+ Transfer the input text and images to the input text and images.
18
+ """
19
+ return (input_text, input_images, gr.update(value=""), gr.update(value=None), gr.Button(visible=True))
20
+
21
+
22
+ def delete_last_conversation(chatbot, history):
23
+ """
24
+ Delete the last conversation from the chatbot and history.
25
+
26
+ Args:
27
+ chatbot (list): The chatbot list.
28
+ history (list): The history list.
29
+ """
30
+ if len(history) % 2 != 0:
31
+ gr.Error("history length is not even")
32
+ return (
33
+ chatbot,
34
+ history,
35
+ "Delete Done",
36
+ )
37
+
38
+ if len(chatbot) > 0:
39
+ chatbot.pop()
40
+
41
+ if len(history) > 0 and len(history) % 2 == 0:
42
+ history.pop()
43
+ history.pop()
44
+
45
+ return (
46
+ chatbot,
47
+ history,
48
+ "Delete Done",
49
+ )
50
+
51
+
52
+ def reset_state():
53
+ return [], [], None, "Reset Done"
54
+
55
+
56
+ def reset_textbox():
57
+ return gr.update(value=""), ""
58
+
59
+
60
+ def cancel_outputing():
61
+ return "Stop Done"
62
+
63
+
64
+ class State:
65
+ interrupted = False
66
+
67
+ def interrupt(self):
68
+ self.interrupted = True
69
+
70
+ def recover(self):
71
+ self.interrupted = False
72
+
73
+
74
+ shared_state = State()
75
+
76
+
77
+ def wrap_gen_fn(gen_fn: Callable):
78
+ """
79
+ Wrap the generator function to handle errors.
80
+ """
81
+
82
+ @functools.wraps(gen_fn)
83
+ def wrapped_gen_fn(prompt, *args, **kwargs):
84
+ try:
85
+ yield from gen_fn(prompt, *args, **kwargs)
86
+ except gr.Error as g_err:
87
+ traceback.print_exc()
88
+ raise g_err
89
+ except Exception as e:
90
+ traceback.print_exc()
91
+ raise gr.Error(f"Failed to generate text: {e}") from e
92
+
93
+ return wrapped_gen_fn
kimi_vl/serve/inference.py ADDED
@@ -0,0 +1,223 @@
1
+ import logging
2
+ import re
3
+ from threading import Thread
4
+ from typing import List, Optional
5
+
6
+ import torch
7
+ from transformers import (
8
+ AutoModelForCausalLM,
9
+ AutoProcessor,
10
+ AutoConfig,
11
+ StoppingCriteria,
12
+ StoppingCriteriaList,
13
+ TextIteratorStreamer,
14
+ )
15
+
16
+ from .chat_utils import Conversation, get_conv_template
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ def load_model(model_path: str = "moonshotai/Kimi-VL-A3B-Thinking"):
22
+ # hotfix the model to use flash attention 2
23
+ config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
24
+ config._attn_implementation = "flash_attention_2"
25
+ config.vision_config._attn_implementation = "flash_attention_2"
26
+ config.text_config._attn_implementation = "flash_attention_2"
27
+ print("Successfully set the attn_implementation to flash_attention_2")
28
+
29
+ model = AutoModelForCausalLM.from_pretrained(
30
+ model_path,
31
+ config=config,
32
+ torch_dtype="auto",
33
+ device_map="auto",
34
+ trust_remote_code=True,
35
+ )
36
+ processor = AutoProcessor.from_pretrained(model_path, config=config, trust_remote_code=True)
37
+
38
+ return model, processor
39
+
40
+
41
+ class StoppingCriteriaSub(StoppingCriteria):
42
+ def __init__(self, stops=[], encounters=1):
43
+ super().__init__()
44
+ self.stops = [stop.to("cuda") for stop in stops]
45
+
46
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs):
47
+ for stop in self.stops:
48
+ if input_ids.shape[-1] < len(stop):
49
+ continue
50
+ if torch.all((stop == input_ids[0][-len(stop) :])).item():
51
+ return True
52
+
53
+ return False
54
+
55
+
56
+ def format_messages(
57
+ conversations: list[Conversation],
58
+ system_prompt: Optional[str] = "",
59
+ sft_format: Optional[str] = "kimi-vl",
60
+ ):
61
+ """
62
+ Format the conversations to the input format of the model.
63
+ """
64
+ conversation = get_conv_template(sft_format)
65
+ conversation.set_system_message(system_prompt)
66
+ for message in conversations:
67
+ conversation.append_message(message["role"], message["content"])
68
+ return conversation
69
+
70
+
71
+ def preprocess(
72
+ messages: list[dict],
73
+ processor,
74
+ sft_format: Optional[str] = "kimi-vl",
75
+ ):
76
+ """
77
+ Build messages from the conversations and images.
78
+ """
79
+ # get images from conversations
80
+ results = []
81
+ images = []
82
+
83
+ # get texts from conversations
84
+ conversation = get_conv_template(sft_format)
85
+ # only use the last 3 round of messages
86
+ latest_messages = messages[-3:]
87
+ for mid, message in enumerate(latest_messages):
88
+ if message["role"] == conversation.roles[0] or message["role"] == "user":
89
+ record = {
90
+ "role": message["role"],
91
+ "content": [],
92
+ }
93
+ if "images" in message:
94
+ per_round_images = message["images"]
95
+ if len(per_round_images) > 2:
96
+ per_round_images = per_round_images[-2:]
97
+ print(f"Only use the last 2 images in the {mid}-th round")
98
+
99
+ images.extend(per_round_images)
100
+ for image in per_round_images:
101
+ record["content"].append(
102
+ {
103
+ "type": "image",
104
+ "image": image,
105
+ }
106
+ )
107
+ if 'content' in message:
108
+ record["content"].append(
109
+ {
110
+ "type": "text",
111
+ "text": str(message["content"]).strip(),
112
+ }
113
+ )
114
+ results.append(record)
115
+ elif message["role"] == conversation.roles[1] or message["role"] == "assistant":
116
+ formatted_answer = message["content"].strip()
117
+ # Example raw reply: "◁think▷The user said 'hello', a simple greeting that usually opens a conversation. I need to judge the intent: it may just be a polite opener, or the user may have a more specific need such as asking about my capabilities. With no further information, I should stay open and invite them to elaborate.
118
+ # My reply should be friendly and open rather than formal or distant, and should not assume a specific need.◁/think▷Hello! Nice to meet you. Is there anything I can help you with?"
119
+ # delete all the texts between ◁think▷ and ◁/think▷
120
+ # FIXME: this is a hack to remove the thinking texts
121
+ # formatted_answer = re.sub(r"◁think▷.*◁/think▷", "", formatted_answer)
122
+ think_end_token = '◁/think▷'
123
+ formatted_answer = formatted_answer.split(think_end_token)[-1]
124
+ results.append(
125
+ {
126
+ "role": message["role"],
127
+ "content": [
128
+ {
129
+ "type": "text",
130
+ "text": formatted_answer,
131
+ }
132
+ ],
133
+ }
134
+ )
135
+ assert (
136
+ formatted_answer.count(processor.image_token) == 0
137
+ ), f"there should be no {processor.image_token} in the assistant's reply, but got {messages}"
138
+ conversation.append_message(conversation.roles[1], formatted_answer)
139
+
140
+ text = processor.apply_chat_template(results, add_generation_prompt=True)
141
+ print(f"raw text = {text}")
142
+ if len(images) == 0:
143
+ images = None
144
+
145
+ inputs = processor(
146
+ images=images,
147
+ text=[text],
148
+ return_tensors="pt",
149
+ padding=True,
150
+ truncation=True,
151
+ )
152
+ return inputs
153
+
154
+
155
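+ # Entry point used by the gradio demo (imported as kimi_vl.serve.inference.kimi_vl_generate):
+ # it converts the chat history into processor inputs and streams the decoded text back to the caller.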
+ @torch.inference_mode()
156
+ def kimi_vl_generate(
157
+ model: torch.nn.Module,
158
+ processor: AutoProcessor,
159
+ conversations: list[Conversation],
160
+ stop_words: list,
161
+ max_length: int = 256,
162
+ temperature: float = 1.0,
163
+ top_p: float = 1.0,
164
+ chunk_size: int = -1,
165
+ ):
166
+ # convert conversation to inputs
167
+ print(f"conversations = {conversations}")
168
+ inputs = preprocess(conversations, processor=processor)
169
+ inputs = inputs.to(model.device)
170
+
171
+ return generate(
172
+ model,
173
+ processor,
174
+ inputs,
175
+ max_gen_len=max_length,
176
+ temperature=temperature,
177
+ top_p=top_p,
178
+ stop_words=stop_words,
179
+ chunk_size=chunk_size,
180
+ )
181
+
182
+
183
+ @torch.inference_mode()
184
+ def generate(
185
+ model,
186
+ processor,
187
+ inputs,
188
+ max_gen_len: int = 256,
189
+ temperature: float = 0,
190
+ top_p: float = 0.95,
191
+ stop_words: List[str] = [],
192
+ chunk_size: int = -1,
193
+ ):
194
+ """Stream the text output from the multimodality model with prompt and image inputs."""
195
+ tokenizer = processor.tokenizer
196
+ stop_words_ids = [torch.tensor(tokenizer.encode(stop_word)) for stop_word in stop_words]
197
+ stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
198
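+ # skip_prompt=True so only newly generated tokens, not the prompt, are streamed back.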
+ streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
199
+
200
+ kwargs = dict(
201
+ **inputs,
202
+ max_new_tokens=max_gen_len,
203
+ do_sample=True,
204
+ use_cache=True,
205
+ streamer=streamer,
206
+ stopping_criteria=stopping_criteria,
207
+ )
208
+
209
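+ # Temperature > 0 enables nucleus sampling; temperature == 0 falls back to greedy decoding.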
+ if temperature > 0:
210
+ kwargs.update(
211
+ {
212
+ "do_sample": True,
213
+ "top_p": top_p,
214
+ "temperature": temperature,
215
+ }
216
+ )
217
+ else:
218
+ kwargs["do_sample"] = False
219
+
220
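+ # Run model.generate in a background thread so tokens can be yielded from the streamer as they arrive.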
+ thread = Thread(target=model.generate, kwargs=kwargs)
221
+ thread.start()
222
+
223
+ yield from streamer
kimi_vl/serve/utils.py ADDED
@@ -0,0 +1,290 @@
1
+ from __future__ import annotations
2
+
3
+ import html
+ import inspect
4
+ import logging
5
+ import io
6
+ import os
7
+ import re
8
+ import base64
9
+ import time
10
+ from PIL import Image, ImageDraw, ImageFont
11
+
12
+ import mdtex2html
13
+ from markdown import markdown
14
+ from pygments import highlight
15
+ from pygments.formatters import HtmlFormatter
16
+ from pygments.lexers import ClassNotFound, get_lexer_by_name, guess_lexer
17
+
18
+
19
+ ALREADY_CONVERTED_MARK = "<!-- ALREADY CONVERTED BY PARSER. -->"
20
+ BOX2COLOR = {
21
+ 0: (255, 0, 0),
22
+ 1: (0, 255, 0),
23
+ 2: (0, 0, 255),
24
+ }
25
+ MAX_IMAGE_SIZE = 1024
26
+ MIN_IMAGE_SIZE = 1024
27
+ logger = logging.getLogger("gradio_logger")
28
+
29
+
30
+ def configure_logger(log_dir: str = "logs"):
31
+ logger = logging.getLogger("gradio_logger")
32
+ logger.setLevel(logging.DEBUG)
33
+
34
+ timestr = time.strftime("%Y%m%d-%H%M%S")
35
+ os.makedirs(log_dir, exist_ok=True)
36
+ file_handler = logging.FileHandler(f"{log_dir}/{timestr}_gradio_log.log")
37
+ console_handler = logging.StreamHandler()
38
+
39
+ formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
40
+ console_handler.setFormatter(formatter)
41
+ file_handler.setFormatter(formatter)
42
+
43
+ console_handler.setLevel(logging.INFO)
44
+ file_handler.setLevel(logging.INFO)
45
+
46
+ logger.addHandler(console_handler)
47
+ logger.addHandler(file_handler)
48
+
49
+ return logger
50
+
51
+
52
+ def strip_stop_words(x, stop_words):
53
+ for w in stop_words:
54
+ if w in x:
55
+ return x[: x.index(w)].strip()
56
+ return x.strip()
57
+
58
+
59
+ def format_output(history, text, x):
60
+ updated_history = history + [[text, x]]
61
+ a = [[y[0], convert_to_markdown(y[1])] for y in updated_history]
62
+ return a, updated_history
63
+
64
+
65
+ def markdown_to_html_with_syntax_highlight(md_str): # deprecated
66
+ def replacer(match):
67
+ lang = match.group(1) or "text"
68
+ code = match.group(2)
69
+
70
+ try:
71
+ lexer = get_lexer_by_name(lang, stripall=True)
72
+ except ValueError:
73
+ lexer = get_lexer_by_name("text", stripall=True)
74
+
75
+ formatter = HtmlFormatter()
76
+ highlighted_code = highlight(code, lexer, formatter)
77
+
78
+ return f'<pre><code class="{lang}">{highlighted_code}</code></pre>'
79
+
80
+ code_block_pattern = r"```(\w+)?\n([\s\S]+?)\n```"
81
+ md_str = re.sub(code_block_pattern, replacer, md_str, flags=re.MULTILINE)
82
+
83
+ html_str = markdown(md_str)
84
+ return html_str
85
+
86
+
87
+ def normalize_markdown(md_text: str) -> str: # deprecated
88
+ lines = md_text.split("\n")
89
+ normalized_lines = []
90
+ inside_list = False
91
+
92
+ for i, line in enumerate(lines):
93
+ if re.match(r"^(\d+\.|-|\*|\+)\s", line.strip()):
94
+ if not inside_list and i > 0 and lines[i - 1].strip() != "":
95
+ normalized_lines.append("")
96
+ inside_list = True
97
+ normalized_lines.append(line)
98
+ elif inside_list and line.strip() == "":
99
+ if i < len(lines) - 1 and not re.match(r"^(\d+\.|-|\*|\+)\s", lines[i + 1].strip()):
100
+ normalized_lines.append(line)
101
+ continue
102
+ else:
103
+ inside_list = False
104
+ normalized_lines.append(line)
105
+
106
+ return "\n".join(normalized_lines)
107
+
108
+
109
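+ # Convert a markdown string to HTML, handling fenced code blocks and prose separately and
+ # appending ALREADY_CONVERTED_MARK so the result is not converted a second time.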
+ def convert_mdtext(md_text):
110
+ code_block_pattern = re.compile(r"```(.*?)(?:```|$)", re.DOTALL)
111
+ inline_code_pattern = re.compile(r"`(.*?)`", re.DOTALL)
112
+ code_blocks = code_block_pattern.findall(md_text)
113
+ non_code_parts = code_block_pattern.split(md_text)[::2]
114
+
115
+ result = []
116
+ for non_code, code in zip(non_code_parts, code_blocks + [""]):
117
+ if non_code.strip():
118
+ non_code = normalize_markdown(non_code)
119
+ if inline_code_pattern.search(non_code):
120
+ result.append(markdown(non_code, extensions=["tables"]))
121
+ else:
122
+ result.append(mdtex2html.convert(non_code, extensions=["tables"]))
123
+ if code.strip():
124
+ code = f"\n```{code}\n\n```"
125
+ code = markdown_to_html_with_syntax_highlight(code)
126
+ result.append(code)
127
+ result = "".join(result)
128
+ result += ALREADY_CONVERTED_MARK
129
+ return result
130
+
131
+
132
+ def convert_asis(userinput):
133
+ return f'<p style="white-space:pre-wrap;">{html.escape(userinput)}</p>{ALREADY_CONVERTED_MARK}'
134
+
135
+
136
+ def is_stop_word_or_prefix(s: str, stop_words: list) -> bool:
137
+ return any(s.endswith(stop_word) for stop_word in stop_words)
138
+
139
+
140
+ def detect_converted_mark(userinput):
141
+ return bool(userinput.endswith(ALREADY_CONVERTED_MARK))
142
+
143
+
144
+ def detect_language(code):
145
+ first_line = "" if code.startswith("\n") else code.strip().split("\n", 1)[0]
146
+ language = first_line.lower() if first_line else ""
147
+ code_without_language = code[len(first_line) :].lstrip() if first_line else code
148
+ return language, code_without_language
149
+
150
+
151
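+ # Escape chatbot text: protect "$", leading tabs/spaces and "#" headings outside of fenced
+ # code blocks, and add trailing spaces so single newlines render as line breaks.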
+ def convert_to_markdown(text):
152
+ text = text.replace("$", "&#36;")
153
+ text = text.replace("\r\n", "\n")
154
+
155
+ def replace_leading_tabs_and_spaces(line):
156
+ new_line = []
157
+
158
+ for char in line:
159
+ if char == "\t":
160
+ new_line.append("&#9;")
161
+ elif char == " ":
162
+ new_line.append("&nbsp;")
163
+ else:
164
+ break
165
+ return "".join(new_line) + line[len(new_line) :]
166
+
167
+ markdown_text = ""
168
+ lines = text.split("\n")
169
+ in_code_block = False
170
+
171
+ for line in lines:
172
+ if in_code_block is False and line.startswith("```"):
173
+ in_code_block = True
174
+ markdown_text += f"{line}\n"
175
+ elif in_code_block is True and line.startswith("```"):
176
+ in_code_block = False
177
+ markdown_text += f"{line}\n"
178
+ elif in_code_block:
179
+ markdown_text += f"{line}\n"
180
+ else:
181
+ line = replace_leading_tabs_and_spaces(line)
182
+ line = re.sub(r"^(#)", r"\\\1", line)
183
+ markdown_text += f"{line} \n"
184
+
185
+ return markdown_text
186
+
187
+
188
+ def add_language_tag(text):
189
+ def detect_language(code_block):
190
+ try:
191
+ lexer = guess_lexer(code_block)
192
+ return lexer.name.lower()
193
+ except ClassNotFound:
194
+ return ""
195
+
196
+ code_block_pattern = re.compile(r"(```)(\w*\n[^`]+```)", re.MULTILINE)
197
+
198
+ def replacement(match):
199
+ code_block = match.group(2)
200
+ if match.group(2).startswith("\n"):
201
+ language = detect_language(code_block)
202
+ return f"```{language}{code_block}```" if language else f"```\n{code_block}```"
203
+ else:
204
+ return match.group(1) + code_block + "```"
205
+
206
+ text2 = code_block_pattern.sub(replacement, text)
207
+ return text2
208
+
209
+
210
+ def is_variable_assigned(var_name: str) -> bool:
211
+ # NOTE: locals() inside this function would only ever contain `var_name`, so check the caller's frame instead.
+ frame = inspect.currentframe().f_back
+ return var_name in frame.f_locals or var_name in frame.f_globals
212
+
213
+
214
+ def pil_to_base64(
215
+ image: Image.Image,
216
+ alt: str = "user upload image",
217
+ resize: bool = True,
218
+ max_size: int = MAX_IMAGE_SIZE,
219
+ min_size: int = MIN_IMAGE_SIZE,
220
+ format: str = "JPEG",
221
+ quality: int = 95,
222
+ ) -> str:
223
+ """
224
+ Convert a PIL image to a base64 string.
225
+ """
226
+
227
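+ # Optionally downscale (never upscale) so the longer side is at most max_size and the
+ # shorter side at most min_size, preserving the aspect ratio.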
+ if resize:
228
+ max_hw, min_hw = max(image.size), min(image.size)
229
+ aspect_ratio = max_hw / min_hw
230
+ shortest_edge = int(min(max_size / aspect_ratio, min_size, min_hw))
231
+ longest_edge = int(shortest_edge * aspect_ratio)
232
+ W, H = image.size
233
+ if H > W:
234
+ H, W = longest_edge, shortest_edge
235
+ else:
236
+ H, W = shortest_edge, longest_edge
237
+ image = image.resize((W, H))
238
+
239
+ buffered = io.BytesIO()
240
+ image.save(buffered, format=format, quality=quality)
241
+ img_b64_str = base64.b64encode(buffered.getvalue()).decode()
242
+ img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="{alt}" />'
243
+
244
+ return img_str
245
+
246
+
247
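+ # Parse grounding output of the form <|ref|>label<|/ref|><|det|>[[x1, y1, x2, y2], ...]<|/det|>
+ # and draw the referenced boxes (with labels) onto a copy of the input image.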
+ def parse_ref_bbox(response, image: Image.Image):
248
+ try:
249
+ image = image.copy()
250
+ image_w, image_h = image.size  # PIL's Image.size is (width, height)
251
+ draw = ImageDraw.Draw(image)
252
+
253
+ ref = re.findall(r'<\|ref\|>.*?<\|/ref\|>', response)
254
+ bbox = re.findall(r'<\|det\|>.*?<\|/det\|>', response)
255
+ assert len(ref) == len(bbox)
256
+
257
+ if len(ref) == 0:
258
+ return None
259
+
260
+ boxes, labels = [], []
261
+ for box, label in zip(bbox, ref):
262
+ box = box.replace('<|det|>', '').replace('<|/det|>', '')
263
+ label = label.replace('<|ref|>', '').replace('<|/ref|>', '')
264
+ box = box[1:-1]
265
+ for onebox in re.findall(r'\[.*?\]', box):
266
+ boxes.append(eval(onebox))
267
+ labels.append(label)
268
+
269
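+ # Box coordinates are normalized to [0, 999]; rescale them to pixel coordinates before drawing.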
+ for indice, (box, label) in enumerate(zip(boxes, labels)):
270
+ box = (
271
+ int(box[0] / 999 * image_w),
272
+ int(box[1] / 999 * image_h),
273
+ int(box[2] / 999 * image_w),
274
+ int(box[3] / 999 * image_h),
275
+ )
276
+
277
+ box_color = BOX2COLOR[indice % len(BOX2COLOR.keys())]
278
+ box_width = 3
279
+ draw.rectangle(box, outline=box_color, width=box_width)
280
+
281
+ text_x = box[0]
282
+ text_y = box[1] - 20
283
+ text_color = box_color
284
+ font = ImageFont.truetype("kimi_vl/serve/assets/simsun.ttc", size=20)
285
+ draw.text((text_x, text_y), label, font=font, fill=text_color)
286
+
287
+ return image
288
+ except Exception as e:
289
+ logger.error(f"Error parsing reference bounding boxes: {e}")
290
+ return None
pyproject.toml ADDED
@@ -0,0 +1,29 @@
1
+ [project]
2
+ name = "kimi_vl"
3
+ version = "1.0.0"
4
+ description = "Kimi-VL"
5
+ license = {file = "LICENSE-CODE"}
6
+ readme = "README.md"
7
+ requires-python = ">=3.8"
8
+ dependencies = [
9
+ "torch==2.5.0",
10
+ "transformers==4.50.0",
11
+ "accelerate",
12
+ "tiktoken",
13
+ "blobfile",
14
+ "sentencepiece",
15
+ ]
16
+
17
+ [project.optional-dependencies]
18
+ gradio = [
19
+ "gradio==3.48.0",
20
+ "gradio-client==0.6.1",
21
+ "mdtex2html==1.3.0",
22
+ "pypinyin==0.50.0",
23
+ ]
24
+
25
+ # Linter tools:
26
+ [tool.black]
27
+ line-length = 120
28
+ skip-string-normalization = true
29
+ target-version = ["py310"]
requirements.txt ADDED
@@ -0,0 +1,21 @@
1
+ torch==2.5.0
2
+ torchvision==0.20.0
3
+ transformers==4.51.1
4
+ accelerate
5
+ sentencepiece
6
+ attrdict
7
+ einops
8
+ tiktoken
9
+ blobfile
10
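+ # prebuilt flash-attn 2.7.4.post1 wheel for CUDA 12 / torch 2.5 / Python 3.10 (linux x86_64)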
+ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
11
+
12
+ # for gradio demo
13
+ gradio
14
+ gradio-client
15
+ mdtex2html
16
+ pypinyin
17
+ tqdm
18
+ colorama
19
+ Pygments
20
+ markdown