Haoyu Lu committed
Commit 2648bb4 · 1 Parent(s): e101549

Add application file

.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
+ *.ico filter=lfs diff=lfs merge=lfs -text
+ *.ttc filter=lfs diff=lfs merge=lfs -text
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,5 @@
+ *.log
+
+ __pycache__
+ *.pyc
+ *.pyo
README.md CHANGED
@@ -1,12 +1,28 @@
  ---
- title: Kimi VL A3B Thinking
+ title: Kimi-VL-A3B-Thinking
- emoji: 🏢
+ emoji: 🤔
- colorFrom: pink
+ colorFrom: green
- colorTo: red
+ colorTo: blue
  sdk: gradio
- sdk_version: 5.25.1
+ sdk_version: 5.24.0
  app_file: app.py
  pinned: false
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+
+ ## Citation
+
+ ```
+ @misc{kimiteam2025kimivltechnicalreport,
+ title={{Kimi-VL} Technical Report},
+ author={Kimi Team and Angang Du and Bohong Yin and Bowei Xing and Bowen Qu and Bowen Wang and Cheng Chen and Chenlin Zhang and Chenzhuang Du and Chu Wei and Congcong Wang and Dehao Zhang and Dikang Du and Dongliang Wang and Enming Yuan and Enzhe Lu and Fang Li and Flood Sung and Guangda Wei and Guokun Lai and Han Zhu and Hao Ding and Hao Hu and Hao Yang and Hao Zhang and Haoning Wu and Haotian Yao and Haoyu Lu and Heng Wang and Hongcheng Gao and Huabin Zheng and Jiaming Li and Jianlin Su and Jianzhou Wang and Jiaqi Deng and Jiezhong Qiu and Jin Xie and Jinhong Wang and Jingyuan Liu and Junjie Yan and Kun Ouyang and Liang Chen and Lin Sui and Longhui Yu and Mengfan Dong and Mengnan Dong and Nuo Xu and Pengyu Cheng and Qizheng Gu and Runjie Zhou and Shaowei Liu and Sihan Cao and Tao Yu and Tianhui Song and Tongtong Bai and Wei Song and Weiran He and Weixiao Huang and Weixin Xu and Xiaokun Yuan and Xingcheng Yao and Xingzhe Wu and Xinxing Zu and Xinyu Zhou and Xinyuan Wang and Y. Charles and Yan Zhong and Yang Li and Yangyang Hu and Yanru Chen and Yejie Wang and Yibo Liu and Yibo Miao and Yidao Qin and Yimin Chen and Yiping Bao and Yiqin Wang and Yongsheng Kang and Yuanxin Liu and Yulun Du and Yuxin Wu and Yuzhi Wang and Yuzi Yan and Zaida Zhou and Zhaowei Li and Zhejun Jiang and Zheng Zhang and Zhilin Yang and Zhiqi Huang and Zihao Huang and Zijia Zhao and Ziwei Chen},
+ year={2025},
+ eprint={2504.07491},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV},
+ url={https://arxiv.org/abs/2504.07491},
+ }
+ ```
+
app.py ADDED
@@ -0,0 +1,344 @@
1
+ import argparse
2
+ import gradio as gr
3
+ import os
4
+ from PIL import Image
5
+ import torch
6
+
7
+ from kimi_vl.serve.frontend import reload_javascript
8
+ from kimi_vl.serve.utils import (
9
+ configure_logger,
10
+ pil_to_base64,
11
+ parse_ref_bbox,
12
+ strip_stop_words,
13
+ is_variable_assigned,
14
+ )
15
+ from kimi_vl.serve.gradio_utils import (
16
+ cancel_outputing,
17
+ delete_last_conversation,
18
+ reset_state,
19
+ reset_textbox,
20
+ transfer_input,
21
+ wrap_gen_fn,
22
+ )
23
+ from kimi_vl.serve.chat_utils import (
24
+ generate_prompt_with_history,
25
+ convert_conversation_to_prompts,
26
+ to_gradio_chatbot,
27
+ to_gradio_history,
28
+ )
29
+ from kimi_vl.serve.inference import kimi_vl_generate, load_model
30
+ from kimi_vl.serve.examples import get_examples
31
+
32
+ TITLE = """<h1 align="left" style="min-width:200px; margin-top:0;">Chat with Kimi-VL-A3B-Thinking🤔 </h1>"""
33
+ DESCRIPTION_TOP = """<a href="https://github.com/MoonshotAI/Kimi-VL" target="_blank">Kimi-VL-A3B-Thinking</a> is a multi-modal LLM that can understand text and images, and generate text with a thinking process. For the non-thinking version, please try [Kimi-VL-A3B](https://huggingface.co/spaces/moonshotai/Kimi-VL-A3B)."""
34
+ DESCRIPTION = """"""
35
+ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
36
+ DEPLOY_MODELS = dict()
37
+ logger = configure_logger()
38
+
39
+
40
+ def parse_args():
41
+ parser = argparse.ArgumentParser()
42
+ parser.add_argument("--model", type=str, default="Kimi-VL-A3B-Thinking")
43
+ parser.add_argument(
44
+ "--local-path",
45
+ type=str,
46
+ default="",
47
+ help="huggingface ckpt, optional",
48
+ )
49
+ parser.add_argument("--ip", type=str, default="0.0.0.0")
50
+ parser.add_argument("--port", type=int, default=7860)
51
+ return parser.parse_args()
52
+
53
+
54
+ def fetch_model(model_name: str):
55
+ global args, DEPLOY_MODELS
56
+
57
+ if args.local_path:
58
+ model_path = args.local_path
59
+ else:
60
+ model_path = f"moonshotai/{args.model}"
61
+
62
+ if model_name in DEPLOY_MODELS:
63
+ model_info = DEPLOY_MODELS[model_name]
64
+ print(f"{model_name} has been loaded.")
65
+ else:
66
+ print(f"{model_name} is loading...")
67
+ DEPLOY_MODELS[model_name] = load_model(model_path)
68
+ print(f"Load {model_name} successfully...")
69
+ model_info = DEPLOY_MODELS[model_name]
70
+
71
+ return model_info
72
+
73
+
74
+ def preview_images(files) -> list[str]:
75
+ if files is None:
76
+ return []
77
+
78
+ image_paths = []
79
+ for file in files:
80
+ image_paths.append(file.name)
81
+ return image_paths
82
+
83
+
84
+ def get_prompt(conversation) -> str:
85
+ """
86
+ Get the prompt for the conversation.
87
+ """
88
+ system_prompt = conversation.system_template.format(system_message=conversation.system_message)
89
+ return system_prompt
90
+
91
+
92
+ @wrap_gen_fn
93
+ def predict(
94
+ text,
95
+ images,
96
+ chatbot,
97
+ history,
98
+ top_p,
99
+ temperature,
100
+ max_length_tokens,
101
+ max_context_length_tokens,
102
+ chunk_size: int = 512,
103
+ ):
104
+ """
105
+ Predict the response for the input text and images.
106
+ Args:
107
+ text (str): The input text.
108
+ images (list[PIL.Image.Image]): The input images.
109
+ chatbot (list): The chatbot.
110
+ history (list): The history.
111
+ top_p (float): The top-p value.
112
+ temperature (float): The temperature value.
113
+ repetition_penalty (float): The repetition penalty value.
114
+ max_length_tokens (int): The max length tokens.
115
+ max_context_length_tokens (int): The max context length tokens.
116
+ chunk_size (int): The chunk size.
117
+ """
118
+ print("running the prediction function")
119
+ try:
120
+ model, processor = fetch_model(args.model)
121
+
122
+ if text == "":
123
+ yield chatbot, history, "Empty context."
124
+ return
125
+ except KeyError:
126
+ yield [[text, "No Model Found"]], [], "No Model Found"
127
+ return
128
+
129
+ if images is None:
130
+ images = []
131
+
132
+ # load images
133
+ pil_images = []
134
+ for img_or_file in images:
135
+ try:
136
+ # load as pil image
137
+ if isinstance(img_or_file, Image.Image):
138
+ pil_images.append(img_or_file)
139
+ else:
140
+ image = Image.open(img_or_file.name).convert("RGB")
141
+ pil_images.append(image)
142
+ except Exception as e:
143
+ print(f"Error loading image: {e}")
144
+
145
+ # generate prompt
146
+ conversation = generate_prompt_with_history(
147
+ text,
148
+ pil_images,
149
+ history,
150
+ processor,
151
+ max_length=max_context_length_tokens,
152
+ )
153
+ all_conv, last_image = convert_conversation_to_prompts(conversation)
154
+ stop_words = conversation.stop_str
155
+ gradio_chatbot_output = to_gradio_chatbot(conversation)
156
+
157
+ full_response = ""
158
+ with torch.no_grad():
159
+ for x in kimi_vl_generate(
160
+ conversations=all_conv,
161
+ model=model,
162
+ processor=processor,
163
+ stop_words=stop_words,
164
+ max_length=max_length_tokens,
165
+ temperature=temperature,
166
+ top_p=top_p,
167
+ ):
168
+ full_response += x
169
+ response = strip_stop_words(full_response, stop_words)
170
+ conversation.update_last_message(response)
171
+ gradio_chatbot_output[-1][1] = response
172
+
173
+ yield gradio_chatbot_output, to_gradio_history(conversation), "Generating..."
174
+
175
+ if last_image is not None:
176
+ vg_image = parse_ref_bbox(response, last_image)
177
+ if vg_image is not None:
178
+ vg_base64 = pil_to_base64(vg_image, "vg", max_size=800, min_size=400)
179
+ gradio_chatbot_output[-1][1] += vg_base64
180
+ yield gradio_chatbot_output, to_gradio_history(conversation), "Generating..."
181
+
182
+ logger.info("flushed result to gradio")
183
+ torch.cuda.empty_cache()
184
+
185
+ if is_variable_assigned("x"):
186
+ print(
187
+ f"temperature: {temperature}, "
188
+ f"top_p: {top_p}, "
189
+ f"max_length_tokens: {max_length_tokens}"
190
+ )
191
+
192
+ yield gradio_chatbot_output, to_gradio_history(conversation), "Generate: Success"
193
+
194
+
195
+ def retry(
196
+ text,
197
+ images,
198
+ chatbot,
199
+ history,
200
+ top_p,
201
+ temperature,
202
+ max_length_tokens,
203
+ max_context_length_tokens,
204
+ chunk_size: int = 512,
205
+ ):
206
+ """
207
+ Retry the response for the input text and images.
208
+ """
209
+ if len(history) == 0:
210
+ yield (chatbot, history, "Empty context")
211
+ return
212
+
213
+ chatbot.pop()
214
+ history.pop()
215
+ text = history.pop()[-1]
216
+ if type(text) is tuple:
217
+ text, _ = text
218
+
219
+ yield from predict(
220
+ text,
221
+ images,
222
+ chatbot,
223
+ history,
224
+ top_p,
225
+ temperature,
226
+ max_length_tokens,
227
+ max_context_length_tokens,
228
+ chunk_size,
229
+ )
230
+
231
+
232
+ def build_demo(args: argparse.Namespace) -> gr.Blocks:
233
+ with gr.Blocks(theme=gr.themes.Soft(), delete_cache=(1800, 1800)) as demo:
234
+ history = gr.State([])
235
+ input_text = gr.State()
236
+ input_images = gr.State()
237
+
238
+ with gr.Row():
239
+ gr.HTML(TITLE)
240
+ status_display = gr.Markdown("Success", elem_id="status_display")
241
+ gr.Markdown(DESCRIPTION_TOP)
242
+
243
+ with gr.Row(equal_height=True):
244
+ with gr.Column(scale=4):
245
+ with gr.Row():
246
+ chatbot = gr.Chatbot(
247
+ elem_id="Kimi-VL-A3B-Thinking-chatbot",
248
+ show_share_button=True,
249
+ bubble_full_width=False,
250
+ height=600,
251
+ )
252
+ with gr.Row():
253
+ with gr.Column(scale=4):
254
+ text_box = gr.Textbox(show_label=False, placeholder="Enter text", container=False)
255
+ with gr.Column(min_width=70):
256
+ submit_btn = gr.Button("Send")
257
+ with gr.Column(min_width=70):
258
+ cancel_btn = gr.Button("Stop")
259
+ with gr.Row():
260
+ empty_btn = gr.Button("🧹 New Conversation")
261
+ retry_btn = gr.Button("🔄 Regenerate")
262
+ del_last_btn = gr.Button("🗑️ Remove Last Turn")
263
+
264
+ with gr.Column():
265
+ # note: at most 2 images can be uploaded at once
266
+ gr.Markdown("Note: you can upload no more than 2 images once")
267
+ upload_images = gr.Files(file_types=["image"], show_label=True)
268
+ gallery = gr.Gallery(columns=[3], height="200px", show_label=True)
269
+ upload_images.change(preview_images, inputs=upload_images, outputs=gallery)
270
+ # Parameter Setting tab for controlling the generation parameters
271
+ with gr.Tab(label="Parameter Setting"):
272
+ top_p = gr.Slider(minimum=0.0, maximum=1.0, value=1.0, step=0.05, interactive=True, label="Top-p")
273
+ temperature = gr.Slider(
274
+ minimum=0, maximum=1.0, value=0.6, step=0.1, interactive=True, label="Temperature"
275
+ )
276
+ max_length_tokens = gr.Slider(
277
+ minimum=512, maximum=8192, value=2048, step=64, interactive=True, label="Max Length Tokens"
278
+ )
279
+ max_context_length_tokens = gr.Slider(
280
+ minimum=512, maximum=8192, value=2048, step=64, interactive=True, label="Max Context Length Tokens"
281
+ )
282
+
283
+ show_images = gr.HTML(visible=False)
284
+
285
+ gr.Examples(
286
+ examples=get_examples(ROOT_DIR),
287
+ inputs=[upload_images, show_images, text_box],
288
+ )
289
+ gr.Markdown()
290
+
291
+ input_widgets = [
292
+ input_text,
293
+ input_images,
294
+ chatbot,
295
+ history,
296
+ top_p,
297
+ temperature,
298
+ max_length_tokens,
299
+ max_context_length_tokens,
300
+ ]
301
+ output_widgets = [chatbot, history, status_display]
302
+
303
+ transfer_input_args = dict(
304
+ fn=transfer_input,
305
+ inputs=[text_box, upload_images],
306
+ outputs=[input_text, input_images, text_box, upload_images, submit_btn],
307
+ show_progress=True,
308
+ )
309
+
310
+ predict_args = dict(fn=predict, inputs=input_widgets, outputs=output_widgets, show_progress=True)
311
+ retry_args = dict(fn=retry, inputs=input_widgets, outputs=output_widgets, show_progress=True)
312
+ reset_args = dict(fn=reset_textbox, inputs=[], outputs=[text_box, status_display])
313
+
314
+ predict_events = [
315
+ text_box.submit(**transfer_input_args).then(**predict_args),
316
+ submit_btn.click(**transfer_input_args).then(**predict_args),
317
+ ]
318
+
319
+ empty_btn.click(reset_state, outputs=output_widgets, show_progress=True)
320
+ empty_btn.click(**reset_args)
321
+ retry_btn.click(**retry_args)
322
+ del_last_btn.click(delete_last_conversation, [chatbot, history], output_widgets, show_progress=True)
323
+ cancel_btn.click(cancel_outputing, [], [status_display], cancels=predict_events)
324
+
325
+ demo.title = "Kimi-VL-A3B-Thinking Chatbot"
326
+ return demo
327
+
328
+
329
+ def main(args: argparse.Namespace):
330
+ demo = build_demo(args)
331
+ reload_javascript()
332
+
333
+ # concurrency_count=CONCURRENT_COUNT, max_size=MAX_EVENTS
334
+ favicon_path = os.path.join("kimi_vl/serve/assets/favicon.ico")
335
+ demo.queue().launch(
336
+ favicon_path=favicon_path,
337
+ server_name=args.ip,
338
+ server_port=args.port,
339
+ )
340
+
341
+
342
+ if __name__ == "__main__":
343
+ args = parse_args()
344
+ main(args)
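
For readers who want to exercise the same inference path without the Gradio UI, here is a minimal sketch built only from the helpers added in this commit (`load_model` and `kimi_vl_generate` from `kimi_vl/serve/inference.py`, with the message layout that `convert_conversation_to_prompts` produces in `app.py`). The checkpoint name, sampling values, and image path are illustrative assumptions, not part of the app itself.

```python
# Hypothetical standalone usage sketch; not one of the committed files.
from PIL import Image

from kimi_vl.serve.inference import load_model, kimi_vl_generate

# load_model returns (model, processor), as used by fetch_model() in app.py.
model, processor = load_model("moonshotai/Kimi-VL-A3B-Thinking")

# Same message shape that convert_conversation_to_prompts() builds:
# a user turn carrying text plus PIL images, followed by an empty assistant turn.
conversation = [
    {"role": "user", "content": "Where am I?", "images": [Image.open("images/demo1.jpeg")]},
    {"role": "assistant", "content": ""},
]

response = ""
for chunk in kimi_vl_generate(
    conversations=conversation,
    model=model,
    processor=processor,
    stop_words=["<|im_end|>"],  # stop_str of the "kimi-vl" template in chat_utils.py
    max_length=2048,
    temperature=0.6,
    top_p=1.0,
):
    response += chunk  # streamed text chunks, exactly as predict() consumes them

print(response)
```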
images/demo1.jpeg ADDED

Git LFS Details

  • SHA256: 8fc81bcaf75321eb871827fb0cad556cc5d3fe304864516c3dbace377fb82b64
  • Pointer size: 132 Bytes
  • Size of remote file: 5.31 MB
images/demo2.jpeg ADDED

Git LFS Details

  • SHA256: fddde8fc86f53fce4625f8defb54640c7fb885f1049eb0fb631d6234ac0e994b
  • Pointer size: 131 Bytes
  • Size of remote file: 450 kB
images/demo3.jpeg ADDED

Git LFS Details

  • SHA256: 9a3833fb7fc115cb7f74454296023e548e2eee32642ccbcee3baa7ad9b561097
  • Pointer size: 130 Bytes
  • Size of remote file: 20.3 kB
images/demo4.jpeg ADDED

Git LFS Details

  • SHA256: 2761a3226f9cd4d894e822c6dc98a4a418a89c4f82e1cc00a57d960fb66fc51f
  • Pointer size: 131 Bytes
  • Size of remote file: 223 kB
images/demo5.jpeg ADDED

Git LFS Details

  • SHA256: 927541679993f7bd2bcd344c04d648bed64ba1a97a4473a16eab1647fa190e8d
  • Pointer size: 131 Bytes
  • Size of remote file: 264 kB
kimi_vl/__init__.py ADDED
File without changes
kimi_vl/serve/__init__.py ADDED
File without changes
kimi_vl/serve/assets/Kelpy-Codos.js ADDED
@@ -0,0 +1,100 @@
1
+ /**
2
+ * Copyright (c) 2023-2024 DeepSeek.
3
+ *
4
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
5
+ * this software and associated documentation files (the "Software"), to deal in
6
+ * the Software without restriction, including without limitation the rights to
7
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
8
+ * the Software, and to permit persons to whom the Software is furnished to do so,
9
+ * subject to the following conditions:
10
+ *
11
+ * The above copyright notice and this permission notice shall be included in all
12
+ * copies or substantial portions of the Software.
13
+ *
14
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
16
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
17
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
18
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
19
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
20
+ */
21
+
22
+ // ==UserScript==
23
+ // @name Kelpy Codos
24
+ // @namespace https://github.com/Keldos-Li/Kelpy-Codos
25
+ // @version 1.0.5
26
+ // @author Keldos; https://keldos.me/
27
+ // @description Add copy button to PRE tags before CODE tag, for Chuanhu ChatGPT especially.
28
+ // Based on Chuanhu ChatGPT version: ac04408 (2023-3-22)
29
+ // @license GPL-3.0
30
+ // @grant none
31
+ // ==/UserScript==
32
+
33
+ (function () {
34
+ "use strict";
35
+
36
+ function addCopyButton(pre) {
37
+ var code = pre.querySelector("code");
38
+ if (!code) {
39
+ return; // if no <code> element is found, do not add the button
40
+ }
41
+ var firstChild = code.firstChild;
42
+ if (!firstChild) {
43
+ return; // if the <code> element has no child nodes, do not add the button
44
+ }
45
+ var button = document.createElement("button");
46
+ button.textContent = "\uD83D\uDCCE"; // use the 📎 symbol as the text of the "copy" button
47
+ button.style.position = "relative";
48
+ button.style.float = "right";
49
+ button.style.fontSize = "1em"; // optional: adjust the button size
50
+ button.style.background = "none"; // optional: remove the background color
51
+ button.style.border = "none"; // optional: remove the border
52
+ button.style.cursor = "pointer"; // optional: show a pointer cursor
53
+ button.addEventListener("click", function () {
54
+ var range = document.createRange();
55
+ range.selectNodeContents(code);
56
+ range.setStartBefore(firstChild); // set the range to start before the first child node
57
+ var selection = window.getSelection();
58
+ selection.removeAllRanges();
59
+ selection.addRange(range);
60
+
61
+ try {
62
+ var success = document.execCommand("copy");
63
+ if (success) {
64
+ button.textContent = "\u2714";
65
+ setTimeout(function () {
66
+ button.textContent = "\uD83D\uDCCE"; // restore the button to "copy"
67
+ }, 2000);
68
+ } else {
69
+ button.textContent = "\u2716";
70
+ }
71
+ } catch (e) {
72
+ console.error(e);
73
+ button.textContent = "\u2716";
74
+ }
75
+
76
+ selection.removeAllRanges();
77
+ });
78
+ code.insertBefore(button, firstChild); // insert the button before the first child element
79
+ }
80
+
81
+ function handleNewElements(mutationsList, observer) {
82
+ for (var mutation of mutationsList) {
83
+ if (mutation.type === "childList") {
84
+ for (var node of mutation.addedNodes) {
85
+ if (node.nodeName === "PRE") {
86
+ addCopyButton(node);
87
+ }
88
+ }
89
+ }
90
+ }
91
+ }
92
+
93
+ var observer = new MutationObserver(handleNewElements);
94
+ observer.observe(document.documentElement, {
95
+ childList: true,
96
+ subtree: true,
97
+ });
98
+
99
+ document.querySelectorAll("pre").forEach(addCopyButton);
100
+ })();
kimi_vl/serve/assets/avatar.png ADDED

Git LFS Details

  • SHA256: 3395211efab793b89a4e579d90bd606b0eb435e2566aedf54bec585e436a8e71
  • Pointer size: 130 Bytes
  • Size of remote file: 62.1 kB
kimi_vl/serve/assets/custom.css ADDED
@@ -0,0 +1,355 @@
1
+ /**
2
+ * Copyright (c) 2023-2024 DeepSeek.
3
+ *
4
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
5
+ * this software and associated documentation files (the "Software"), to deal in
6
+ * the Software without restriction, including without limitation the rights to
7
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
8
+ * the Software, and to permit persons to whom the Software is furnished to do so,
9
+ * subject to the following conditions:
10
+ *
11
+ * The above copyright notice and this permission notice shall be included in all
12
+ * copies or substantial portions of the Software.
13
+ *
14
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
16
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
17
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
18
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
19
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
20
+ */
21
+
22
+ :root {
23
+ --chatbot-color-light: #f3f3f3;
24
+ --chatbot-color-dark: #121111;
25
+ }
26
+
27
+ /* status_display */
28
+ #status_display {
29
+ display: flex;
30
+ min-height: 2.5em;
31
+ align-items: flex-end;
32
+ justify-content: flex-end;
33
+ }
34
+ #status_display p {
35
+ font-size: 0.85em;
36
+ font-family: monospace;
37
+ color: var(--body-text-color-subdued);
38
+ }
39
+
40
+ /* usage_display */
41
+ #usage_display {
42
+ height: 1em;
43
+ }
44
+ #usage_display p {
45
+ padding: 0 1em;
46
+ font-size: 0.85em;
47
+ font-family: monospace;
48
+ color: var(--body-text-color-subdued);
49
+ }
50
+ /* list */
51
+ ol:not(.options),
52
+ ul:not(.options) {
53
+ padding-inline-start: 2em !important;
54
+ }
55
+
56
+ /* Thank @Keldos-Li for fixing it */
57
+ /* Light mode (default) */
58
+ #deepseek_chatbot {
59
+ background-color: var(--chatbot-color-light) !important;
60
+ color: #000000 !important;
61
+ }
62
+ [data-testid="bot"] {
63
+ background-color: #ffffff !important;
64
+ }
65
+ [data-testid="user"] {
66
+ background-color: #95ec69 !important;
67
+ }
68
+
69
+ /* Dark mode */
70
+ .dark #deepseek_chatbot {
71
+ background-color: var(--chatbot-color-dark) !important;
72
+ color: #ffffff !important;
73
+ }
74
+ .dark [data-testid="bot"] {
75
+ background-color: #2c2c2c !important;
76
+ }
77
+ .dark [data-testid="user"] {
78
+ background-color: #26b561 !important;
79
+ }
80
+
81
+ #deepseek_chatbot {
82
+ height: 100%;
83
+ min-height: 800px;
84
+ flex-grow: 1;
85
+ overflow: auto;
86
+ }
87
+
88
+ [class*="message"] {
89
+ border-radius: var(--radius-xl) !important;
90
+ border: none;
91
+ padding: var(--spacing-xl) !important;
92
+ font-size: var(--text-md) !important;
93
+ line-height: var(--line-md) !important;
94
+ min-height: calc(var(--text-md) * var(--line-md) + 2 * var(--spacing-xl));
95
+ min-width: calc(var(--text-md) * var(--line-md) + 2 * var(--spacing-xl));
96
+ }
97
+ [data-testid="bot"] {
98
+ max-width: 85%;
99
+ border-bottom-left-radius: 0 !important;
100
+ }
101
+ [data-testid="user"] {
102
+ max-width: 85%;
103
+ width: auto !important;
104
+ border-bottom-right-radius: 0 !important;
105
+ }
106
+ /* Table */
107
+ table {
108
+ margin: 1em 0;
109
+ border-collapse: collapse;
110
+ empty-cells: show;
111
+ }
112
+ td,
113
+ th {
114
+ border: 1.2px solid var(--border-color-primary) !important;
115
+ padding: 0.2em;
116
+ }
117
+ thead {
118
+ background-color: rgba(175, 184, 193, 0.2);
119
+ }
120
+ thead th {
121
+ padding: 0.5em 0.2em;
122
+ }
123
+ /* Inline code */
124
+ #deepseek_chatbot code {
125
+ display: inline;
126
+ white-space: break-spaces;
127
+ border-radius: 6px;
128
+ margin: 0 2px 0 2px;
129
+ padding: 0.2em 0.4em 0.1em 0.4em;
130
+ background-color: rgba(175, 184, 193, 0.2);
131
+ }
132
+ /* Code block */
133
+ #deepseek_chatbot pre code {
134
+ display: block;
135
+ overflow: auto;
136
+ white-space: pre;
137
+ background-color: #1c1d1e !important;
138
+ border-radius: 10px;
139
+ padding: 1.4em 1.2em 0em 1.4em;
140
+ margin: 1.2em 2em 1.2em 0.5em;
141
+ color: #fdf8f8;
142
+ box-shadow: 6px 6px 16px hsla(0, 0%, 0%, 0.2);
143
+ }
144
+ /* Hightlight */
145
+ #deepseek_chatbot .highlight {
146
+ background-color: transparent;
147
+ }
148
+ #deepseek_chatbot .highlight .hll {
149
+ background-color: #49483e;
150
+ }
151
+ #deepseek_chatbot .highlight .c {
152
+ color: #75715e;
153
+ } /* Comment */
154
+ #deepseek_chatbot .highlight .err {
155
+ color: #960050;
156
+ background-color: #1e0010;
157
+ } /* Error */
158
+ #deepseek_chatbot .highlight .k {
159
+ color: #66d9ef;
160
+ } /* Keyword */
161
+ #deepseek_chatbot .highlight .l {
162
+ color: #ae81ff;
163
+ } /* Literal */
164
+ #deepseek_chatbot .highlight .n {
165
+ color: #f8f8f2;
166
+ } /* Name */
167
+ #deepseek_chatbot .highlight .o {
168
+ color: #f92672;
169
+ } /* Operator */
170
+ #deepseek_chatbot .highlight .p {
171
+ color: #f8f8f2;
172
+ } /* Punctuation */
173
+ #deepseek_chatbot .highlight .ch {
174
+ color: #75715e;
175
+ } /* Comment.Hashbang */
176
+ #deepseek_chatbot .highlight .cm {
177
+ color: #75715e;
178
+ } /* Comment.Multiline */
179
+ #deepseek_chatbot .highlight .cp {
180
+ color: #75715e;
181
+ } /* Comment.Preproc */
182
+ #deepseek_chatbot .highlight .cpf {
183
+ color: #75715e;
184
+ } /* Comment.PreprocFile */
185
+ #deepseek_chatbot .highlight .c1 {
186
+ color: #75715e;
187
+ } /* Comment.Single */
188
+ #deepseek_chatbot .highlight .cs {
189
+ color: #75715e;
190
+ } /* Comment.Special */
191
+ #deepseek_chatbot .highlight .gd {
192
+ color: #f92672;
193
+ } /* Generic.Deleted */
194
+ #deepseek_chatbot .highlight .ge {
195
+ font-style: italic;
196
+ } /* Generic.Emph */
197
+ #deepseek_chatbot .highlight .gi {
198
+ color: #a6e22e;
199
+ } /* Generic.Inserted */
200
+ #deepseek_chatbot .highlight .gs {
201
+ font-weight: bold;
202
+ } /* Generic.Strong */
203
+ #deepseek_chatbot .highlight .gu {
204
+ color: #75715e;
205
+ } /* Generic.Subheading */
206
+ #deepseek_chatbot .highlight .kc {
207
+ color: #66d9ef;
208
+ } /* Keyword.Constant */
209
+ #deepseek_chatbot .highlight .kd {
210
+ color: #66d9ef;
211
+ } /* Keyword.Declaration */
212
+ #deepseek_chatbot .highlight .kn {
213
+ color: #f92672;
214
+ } /* Keyword.Namespace */
215
+ #deepseek_chatbot .highlight .kp {
216
+ color: #66d9ef;
217
+ } /* Keyword.Pseudo */
218
+ #deepseek_chatbot .highlight .kr {
219
+ color: #66d9ef;
220
+ } /* Keyword.Reserved */
221
+ #deepseek_chatbot .highlight .kt {
222
+ color: #66d9ef;
223
+ } /* Keyword.Type */
224
+ #deepseek_chatbot .highlight .ld {
225
+ color: #e6db74;
226
+ } /* Literal.Date */
227
+ #deepseek_chatbot .highlight .m {
228
+ color: #ae81ff;
229
+ } /* Literal.Number */
230
+ #deepseek_chatbot .highlight .s {
231
+ color: #e6db74;
232
+ } /* Literal.String */
233
+ #deepseek_chatbot .highlight .na {
234
+ color: #a6e22e;
235
+ } /* Name.Attribute */
236
+ #deepseek_chatbot .highlight .nb {
237
+ color: #f8f8f2;
238
+ } /* Name.Builtin */
239
+ #deepseek_chatbot .highlight .nc {
240
+ color: #a6e22e;
241
+ } /* Name.Class */
242
+ #deepseek_chatbot .highlight .no {
243
+ color: #66d9ef;
244
+ } /* Name.Constant */
245
+ #deepseek_chatbot .highlight .nd {
246
+ color: #a6e22e;
247
+ } /* Name.Decorator */
248
+ #deepseek_chatbot .highlight .ni {
249
+ color: #f8f8f2;
250
+ } /* Name.Entity */
251
+ #deepseek_chatbot .highlight .ne {
252
+ color: #a6e22e;
253
+ } /* Name.Exception */
254
+ #deepseek_chatbot .highlight .nf {
255
+ color: #a6e22e;
256
+ } /* Name.Function */
257
+ #deepseek_chatbot .highlight .nl {
258
+ color: #f8f8f2;
259
+ } /* Name.Label */
260
+ #deepseek_chatbot .highlight .nn {
261
+ color: #f8f8f2;
262
+ } /* Name.Namespace */
263
+ #deepseek_chatbot .highlight .nx {
264
+ color: #a6e22e;
265
+ } /* Name.Other */
266
+ #deepseek_chatbot .highlight .py {
267
+ color: #f8f8f2;
268
+ } /* Name.Property */
269
+ #deepseek_chatbot .highlight .nt {
270
+ color: #f92672;
271
+ } /* Name.Tag */
272
+ #deepseek_chatbot .highlight .nv {
273
+ color: #f8f8f2;
274
+ } /* Name.Variable */
275
+ #deepseek_chatbot .highlight .ow {
276
+ color: #f92672;
277
+ } /* Operator.Word */
278
+ #deepseek_chatbot .highlight .w {
279
+ color: #f8f8f2;
280
+ } /* Text.Whitespace */
281
+ #deepseek_chatbot .highlight .mb {
282
+ color: #ae81ff;
283
+ } /* Literal.Number.Bin */
284
+ #deepseek_chatbot .highlight .mf {
285
+ color: #ae81ff;
286
+ } /* Literal.Number.Float */
287
+ #deepseek_chatbot .highlight .mh {
288
+ color: #ae81ff;
289
+ } /* Literal.Number.Hex */
290
+ #deepseek_chatbot .highlight .mi {
291
+ color: #ae81ff;
292
+ } /* Literal.Number.Integer */
293
+ #deepseek_chatbot .highlight .mo {
294
+ color: #ae81ff;
295
+ } /* Literal.Number.Oct */
296
+ #deepseek_chatbot .highlight .sa {
297
+ color: #e6db74;
298
+ } /* Literal.String.Affix */
299
+ #deepseek_chatbot .highlight .sb {
300
+ color: #e6db74;
301
+ } /* Literal.String.Backtick */
302
+ #deepseek_chatbot .highlight .sc {
303
+ color: #e6db74;
304
+ } /* Literal.String.Char */
305
+ #deepseek_chatbot .highlight .dl {
306
+ color: #e6db74;
307
+ } /* Literal.String.Delimiter */
308
+ #deepseek_chatbot .highlight .sd {
309
+ color: #e6db74;
310
+ } /* Literal.String.Doc */
311
+ #deepseek_chatbot .highlight .s2 {
312
+ color: #e6db74;
313
+ } /* Literal.String.Double */
314
+ #deepseek_chatbot .highlight .se {
315
+ color: #ae81ff;
316
+ } /* Literal.String.Escape */
317
+ #deepseek_chatbot .highlight .sh {
318
+ color: #e6db74;
319
+ } /* Literal.String.Heredoc */
320
+ #deepseek_chatbot .highlight .si {
321
+ color: #e6db74;
322
+ } /* Literal.String.Interpol */
323
+ #deepseek_chatbot .highlight .sx {
324
+ color: #e6db74;
325
+ } /* Literal.String.Other */
326
+ #deepseek_chatbot .highlight .sr {
327
+ color: #e6db74;
328
+ } /* Literal.String.Regex */
329
+ #deepseek_chatbot .highlight .s1 {
330
+ color: #e6db74;
331
+ } /* Literal.String.Single */
332
+ #deepseek_chatbot .highlight .ss {
333
+ color: #e6db74;
334
+ } /* Literal.String.Symbol */
335
+ #deepseek_chatbot .highlight .bp {
336
+ color: #f8f8f2;
337
+ } /* Name.Builtin.Pseudo */
338
+ #deepseek_chatbot .highlight .fm {
339
+ color: #a6e22e;
340
+ } /* Name.Function.Magic */
341
+ #deepseek_chatbot .highlight .vc {
342
+ color: #f8f8f2;
343
+ } /* Name.Variable.Class */
344
+ #deepseek_chatbot .highlight .vg {
345
+ color: #f8f8f2;
346
+ } /* Name.Variable.Global */
347
+ #deepseek_chatbot .highlight .vi {
348
+ color: #f8f8f2;
349
+ } /* Name.Variable.Instance */
350
+ #deepseek_chatbot .highlight .vm {
351
+ color: #f8f8f2;
352
+ } /* Name.Variable.Magic */
353
+ #deepseek_chatbot .highlight .il {
354
+ color: #ae81ff;
355
+ } /* Literal.Number.Integer.Long */
kimi_vl/serve/assets/custom.js ADDED
@@ -0,0 +1,22 @@
1
+ /**
2
+ * Copyright (c) 2023-2024 DeepSeek.
3
+ *
4
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
5
+ * this software and associated documentation files (the "Software"), to deal in
6
+ * the Software without restriction, including without limitation the rights to
7
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
8
+ * the Software, and to permit persons to whom the Software is furnished to do so,
9
+ * subject to the following conditions:
10
+ *
11
+ * The above copyright notice and this permission notice shall be included in all
12
+ * copies or substantial portions of the Software.
13
+ *
14
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
16
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
17
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
18
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
19
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
20
+ */
21
+
22
+ // custom javascript here
kimi_vl/serve/assets/favicon.ico ADDED

Git LFS Details

  • SHA256: 28dab71bd4190f41c7de510615e91afcba52ad7ce6826fbf86b213205be62b45
  • Pointer size: 130 Bytes
  • Size of remote file: 15.4 kB
kimi_vl/serve/assets/simsun.ttc ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ff7d69bfa6588d3fdedbddbe3a29ac11f0c50236723ee72a9ea49ec3e2553f5d
+ size 15323200
kimi_vl/serve/chat_utils.py ADDED
@@ -0,0 +1,379 @@
1
+ """
2
+ From https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
3
+ """
4
+
5
+ import dataclasses
6
+ import logging
7
+ import copy
8
+ from enum import IntEnum, auto
9
+ from typing import Dict, List
10
+ import base64
11
+
12
+ import gradio as gr
13
+ import torch
14
+
15
+ from .utils import pil_to_base64
16
+
17
+ IMAGE_TOKEN = "<image>"
18
+ logger = logging.getLogger("gradio_logger")
19
+
20
+
21
+ class SeparatorStyle(IntEnum):
22
+ """Separator styles."""
23
+
24
+ PLAIN = auto()
25
+ ALIGNMENT = auto()
26
+ KIMI_VL = auto()
27
+
28
+
29
+ @dataclasses.dataclass
30
+ class Conversation:
31
+ """A class that manages prompt templates and keeps all conversation history."""
32
+
33
+ # The name of this template
34
+ name: str
35
+ # The template of the system prompt
36
+ system_template: str = "{system_message}"
37
+ # The system message
38
+ system_message: str = ""
39
+ # The names of two roles
40
+ roles: List[str] = (("USER", "ASSISTANT"),)
41
+ # All messages. Each item is (role, message).
42
+ messages: List[List[str]] = ()
43
+ # The number of few shot examples
44
+ offset: int = 0
45
+ # The separator style and configurations
46
+ sep_style: SeparatorStyle = SeparatorStyle.PLAIN
47
+ sep: str = "\n"
48
+ sep2: str = None
49
+ # Stop criteria (the default one is EOS token)
50
+ stop_str: str = None
51
+ # Stops generation if meeting any token in this list
52
+ stop_token_ids: List[int] = None
53
+
54
+ def get_prompt(self) -> str:
55
+ """Get the prompt for generation."""
56
+ system_prompt = self.system_template.format(system_message=self.system_message)
57
+ if self.sep_style == SeparatorStyle.PLAIN:
58
+ seps = [self.sep, self.sep2]
59
+ ret = ""
60
+ for i, (role, message) in enumerate(self.messages):
61
+ if message:
62
+ if type(message) is tuple:
63
+ message = message[0]
64
+ if i % 2 == 0:
65
+ ret += message + seps[i % 2]
66
+ else:
67
+ ret += message + seps[i % 2]
68
+ else:
69
+ ret += ""
70
+ return ret
71
+ elif self.sep_style == SeparatorStyle.ALIGNMENT:
72
+ seps = [self.sep, self.sep2]
73
+ ret = ""
74
+ for i, (role, message) in enumerate(self.messages):
75
+ if message:
76
+ if type(message) is tuple:
77
+ message, _, _ = message
78
+ if i % 2 == 0:
79
+ ret += '<image>\n' + seps[i % 2]
80
+ else:
81
+ ret += message + seps[i % 2]
82
+ else:
83
+ ret += ""
84
+ return ret
85
+ elif self.sep_style == SeparatorStyle.KIMI_VL:
86
+ seps = [self.sep, self.sep2]
87
+ if system_prompt == "" or system_prompt is None:
88
+ ret = ""
89
+ else:
90
+ ret = system_prompt + seps[0]
91
+ for i, (role, message) in enumerate(self.messages):
92
+ if message:
93
+ if type(message) is tuple:
94
+ message = message[0]
95
+
96
+ if role == "user":
97
+ ret += message + self.sep
98
+ else:
99
+ if self.sep2 is not None:
100
+ ret += message + self.sep2
101
+ else:
102
+ ret += message
103
+ else:
104
+ ret = ret
105
+ return ret
106
+ else:
107
+ raise ValueError(f"Invalid style: {self.sep_style}")
108
+
109
+ def set_system_message(self, system_message: str):
110
+ """Set the system message."""
111
+ self.system_message = system_message
112
+
113
+ def append_message(self, role: str, message: str):
114
+ """Append a new message."""
115
+ self.messages.append([role, message])
116
+
117
+ def update_last_message(self, message: str):
118
+ """Update the last output.
119
+
120
+ The last message is typically set to be None when constructing the prompt,
121
+ so we need to update it in-place after getting the response from a model.
122
+ """
123
+ self.messages[-1][1] = message
124
+
125
+ def reset_message(self):
126
+ """Reset a new message."""
127
+ self.messages = []
128
+
129
+ def to_gradio_chatbot(self):
130
+ """Convert the conversation to gradio chatbot format."""
131
+ ret = []
132
+ for i, (role, msg) in enumerate(self.messages[self.offset :]):
133
+ if i % 2 == 0:
134
+ ret.append([msg, None])
135
+ else:
136
+ ret[-1][-1] = msg
137
+ return ret
138
+
139
+ def to_openai_api_messages(self):
140
+ """Convert the conversation to OpenAI chat completion format."""
141
+ system_prompt = self.system_template.format(system_message=self.system_message)
142
+ ret = [{"role": "system", "content": system_prompt}]
143
+
144
+ for i, (_, msg) in enumerate(self.messages[self.offset :]):
145
+ if i % 2 == 0:
146
+ ret.append({"role": "user", "content": msg})
147
+ else:
148
+ if msg is not None:
149
+ ret.append({"role": "assistant", "content": msg})
150
+ return ret
151
+
152
+ def copy(self):
153
+ return Conversation(
154
+ name=self.name,
155
+ system_template=self.system_template,
156
+ system_message=self.system_message,
157
+ roles=self.roles,
158
+ messages=[[x, y] for x, y in self.messages],
159
+ offset=self.offset,
160
+ sep_style=self.sep_style,
161
+ sep=self.sep,
162
+ sep2=self.sep2,
163
+ stop_str=self.stop_str,
164
+ stop_token_ids=self.stop_token_ids,
165
+ )
166
+
167
+ def dict(self):
168
+ return {
169
+ "template_name": self.name,
170
+ "system_message": self.system_message,
171
+ "roles": self.roles,
172
+ "messages": self.messages,
173
+ "offset": self.offset,
174
+ }
175
+
176
+
177
+ # A global registry for all conversation templates
178
+ conv_templates: Dict[str, Conversation] = {}
179
+
180
+
181
+ def register_conv_template(template: Conversation, override: bool = False):
182
+ """Register a new conversation template."""
183
+ if not override:
184
+ assert template.name not in conv_templates, f"{template.name} has been registered."
185
+
186
+ conv_templates[template.name] = template
187
+
188
+
189
+ def get_conv_template(name: str) -> Conversation:
190
+ """Get a conversation template."""
191
+ return conv_templates[name].copy()
192
+
193
+
194
+ register_conv_template(
195
+ Conversation(
196
+ name="plain",
197
+ system_template="",
198
+ system_message="",
199
+ roles=("", ""),
200
+ messages=(),
201
+ offset=0,
202
+ sep_style=SeparatorStyle.PLAIN,
203
+ sep="",
204
+ sep2="",
205
+ stop_token_ids=[100001],
206
+ stop_str=['</s>'],
207
+ )
208
+ )
209
+
210
+
211
+ register_conv_template(
212
+ Conversation(
213
+ name="alignment",
214
+ system_template="",
215
+ system_message="",
216
+ roles=("", ""),
217
+ messages=(),
218
+ offset=0,
219
+ sep_style=SeparatorStyle.ALIGNMENT,
220
+ sep="",
221
+ sep2="",
222
+ stop_token_ids=[100001],
223
+ stop_str=['</s>'],
224
+ )
225
+ )
226
+
227
+ register_conv_template(
228
+ Conversation(
229
+ name="kimi-vl",
230
+ system_template="{system_message}",
231
+ system_message="You are a helpful assistant",
232
+ roles=("user", "assistant"),
233
+ messages=(),
234
+ offset=0,
235
+ sep_style=SeparatorStyle.KIMI_VL,
236
+ sep="<|im_end|>",
237
+ sep2=None,
238
+ stop_token_ids=None,
239
+ stop_str=["<|im_end|>"],
240
+ )
241
+ )
242
+
243
+
244
+ def new_chat_template(sft_format: str = "kimi-vl"):
245
+ return get_conv_template(sft_format)
246
+
247
+
248
+ def get_prompt(conv: Conversation) -> str:
249
+ """Get the prompt for generation."""
250
+ return conv.get_prompt()
251
+
252
+
253
+ def generate_prompt_with_history(text, images, history, processor, max_length=2048):
254
+ """
255
+ Generate a prompt with the chat history.
256
+
257
+ Args:
258
+ text (str): The text prompt.
259
+ images (list[PIL.Image.Image]): The image prompt.
260
+ history (list): List of previous conversation messages.
261
+ processor (KimiVLProcessor): The chat processor used for encoding the prompt.
262
+ max_length (int): The maximum length of the prompt.
263
+ """
264
+ global IMAGE_TOKEN
265
+
266
+ user_role_ind = 0
267
+ bot_role_ind = 1
268
+
269
+ # Initialize conversation
270
+ conversation = new_chat_template(sft_format="kimi-vl")
271
+
272
+ if history:
273
+ conversation.messages = history
274
+
275
+ if images is not None and len(images) > 0:
276
+ # num_image_tags = text.count(IMAGE_TOKEN)
277
+ # num_images = len(images)
278
+ # if num_images > num_image_tags:
279
+ # pad_image_tags = num_images - num_image_tags
280
+ # image_tokens = "\n".join([IMAGE_TOKEN] * pad_image_tags)
281
+
282
+ # # append the <image> in a new line after the text prompt
283
+ # text = image_tokens + "\n" + text
284
+ # elif num_images < num_image_tags:
285
+ # remove_image_tags = num_image_tags - num_images
286
+ # text = text.replace(IMAGE_TOKEN, "", remove_image_tags)
287
+
288
+ print(f"prompt = {text}, len(images) = {len(images)}")
289
+ text = (text, images)
290
+
291
+ conversation.append_message(conversation.roles[user_role_ind], text)
292
+ conversation.append_message(conversation.roles[bot_role_ind], "")
293
+
294
+ # Create a copy of the conversation to avoid history truncation in the UI
295
+ conversation_copy = conversation.copy()
296
+ logger.info("=" * 80)
297
+ logger.info(get_prompt(conversation))
298
+
299
+ rounds = len(conversation.messages) // 2
300
+
301
+ for _ in range(rounds):
302
+ current_prompt = get_prompt(conversation)
303
+ assert isinstance(current_prompt, str) and len(current_prompt) > 0, f"current_prompt = {current_prompt}"
304
+ if torch.tensor(processor.tokenizer.encode(current_prompt)).size(-1) <= max_length:
305
+ return conversation_copy
306
+
307
+ if len(conversation.messages) % 2 != 0:
308
+ gr.Error("The messages between user and assistant are not paired.")
309
+ return
310
+
311
+ try:
312
+ for _ in range(2): # pop out two messages in a row
313
+ conversation.messages.pop(0)
314
+ except IndexError:
315
+ gr.Error("Input text processing failed, unable to respond in this round.")
316
+ return None
317
+
318
+ gr.Error("Prompt could not be generated within max_length limit.")
319
+ return None
320
+
321
+
322
+ def convert_conversation_to_prompts(conversation: Conversation):
323
+ """
324
+ Convert the conversation to prompts.
325
+ """
326
+ conv_prompts = []
327
+ last_image = None
328
+
329
+ messages = conversation.messages
330
+ for i in range(0, len(messages), 2):
331
+ if isinstance(messages[i][1], tuple):
332
+ text, images = messages[i][1]
333
+ last_image = images[-1]
334
+ else:
335
+ text, images = messages[i][1], []
336
+
337
+ prompt = {"role": messages[i][0], "content": text, "images": images}
338
+ response = {"role": messages[i + 1][0], "content": messages[i + 1][1]}
339
+ conv_prompts.extend([prompt, response])
340
+
341
+ return conv_prompts, last_image
342
+
343
+
344
+ def to_gradio_chatbot(conversation: Conversation) -> list:
345
+ """Convert the conversation to gradio chatbot format."""
346
+ ret = []
347
+ for i, (_, msg) in enumerate(conversation.messages[conversation.offset :]):
348
+ if i % 2 == 0:
349
+ if type(msg) is tuple:
350
+ msg, images = copy.deepcopy(msg)
351
+
352
+ if isinstance(images, list):
353
+ img_str = ""
354
+ for j, image in enumerate(images):
355
+ if isinstance(image, str):
356
+ with open(image, "rb") as f:
357
+ data = f.read()
358
+ img_b64_str = base64.b64encode(data).decode()
359
+ image_str = (
360
+ f'<img src="data:image/png;base64,{img_b64_str}" '
361
+ f'alt="user upload image" style="max-width: 300px; height: auto;" />'
362
+ )
363
+ else:
364
+ image_str = pil_to_base64(image, f"user upload image_{j}", max_size=800, min_size=400)
365
+
366
+ img_str += image_str
367
+ msg = img_str + msg
368
+ else:
369
+ pass
370
+
371
+ ret.append([msg, None])
372
+ else:
373
+ ret[-1][-1] = msg
374
+ return ret
375
+
376
+
377
+ def to_gradio_history(conversation: Conversation):
378
+ """Convert the conversation to gradio history format."""
379
+ return conversation.messages[conversation.offset :]
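
As a quick illustration of the template machinery defined above, the sketch below builds a one-turn prompt with the registered "kimi-vl" template; the printed string is what `get_prompt()` should return given the separators declared in this file. It is an orientation sketch, not an additional committed file.

```python
# Hypothetical illustration of the Conversation template registered above.
from kimi_vl.serve.chat_utils import get_conv_template

conv = get_conv_template("kimi-vl")          # returns a copy of the registered template
conv.append_message("user", "Where am I?")   # roles are ("user", "assistant")
conv.append_message("assistant", "")         # empty slot to be filled by the model

# With SeparatorStyle.KIMI_VL, the system message and each user turn are terminated
# by sep="<|im_end|>"; assistant turns are appended as-is and empty messages are skipped.
print(conv.get_prompt())
# -> "You are a helpful assistant<|im_end|>Where am I?<|im_end|>"
```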
kimi_vl/serve/examples.py ADDED
@@ -0,0 +1,54 @@
+ import os
+ import io
+ import base64
+ from PIL import Image
+
+ EXAMPLES_LIST = [
+     [
+         ["images/demo1.jpeg"],
+         "Where am I?",
+     ],
+     [
+         ["images/demo2.jpeg"],
+         "Based on the abstract above, write a concise and elegant Twitter post that highlights key points and figures without sounding overly promotional. Use English, include emojis and hashtags.",
+     ],
+     [
+         ["images/demo3.jpeg"],
+         "If you are free, what would you most like to do?"
+     ],
+     # multi-frame example
+     [
+         ["images/demo4.jpeg", "images/demo5.jpeg"],
+         "Please infer step by step who this manuscript belongs to and what it records."
+     ],
+ ]
+
+
+ def display_example(image_list, root_dir: str = None):
+     images_html = ""
+     for _, img_path in enumerate(image_list):
+         if root_dir is not None:
+             img_path = os.path.join(root_dir, img_path)
+
+         image = Image.open(img_path)
+         buffered = io.BytesIO()
+         image.save(buffered, format="PNG", quality=100)
+         img_b64_str = base64.b64encode(buffered.getvalue()).decode()
+         img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="{img_path}" style="height:80px; margin-right: 10px;" />'
+         images_html += img_str
+
+     result_html = f"""
+     <div style="display: flex; align-items: center; margin-bottom: 10px;">
+         <div style="flex: 1; margin-right: 10px;">{images_html}</div>
+     </div>
+     """
+
+     return result_html
+
+
+ def get_examples(root_dir: str = None):
+     examples = []
+     for images, texts in EXAMPLES_LIST:
+         examples.append([images, display_example(images, root_dir), texts])
+
+     return examples
kimi_vl/serve/frontend.py ADDED
@@ -0,0 +1,134 @@
1
+ import logging
2
+ import os
3
+ from typing import List, Tuple
4
+
5
+ import gradio as gr
6
+
7
+ from kimi_vl.serve.utils import convert_asis, convert_mdtext, detect_converted_mark
8
+
9
+ ROOT_PATH = os.path.dirname(os.path.abspath(__file__))
10
+
11
+
12
+ small_and_beautiful_theme = gr.themes.Soft(
13
+ primary_hue=gr.themes.Color(
14
+ c50="#EBFAF2",
15
+ c100="#CFF3E1",
16
+ c200="#A8EAC8",
17
+ c300="#77DEA9",
18
+ c400="#3FD086",
19
+ c500="#02C160",
20
+ c600="#06AE56",
21
+ c700="#05974E",
22
+ c800="#057F45",
23
+ c900="#04673D",
24
+ c950="#2E5541",
25
+ name="small_and_beautiful",
26
+ ),
27
+ secondary_hue=gr.themes.Color(
28
+ c50="#576b95",
29
+ c100="#576b95",
30
+ c200="#576b95",
31
+ c300="#576b95",
32
+ c400="#576b95",
33
+ c500="#576b95",
34
+ c600="#576b95",
35
+ c700="#576b95",
36
+ c800="#576b95",
37
+ c900="#576b95",
38
+ c950="#576b95",
39
+ ),
40
+ neutral_hue=gr.themes.Color(
41
+ name="gray",
42
+ c50="#f6f7f8",
43
+ # c100="#f3f4f6",
44
+ c100="#F2F2F2",
45
+ c200="#e5e7eb",
46
+ c300="#d1d5db",
47
+ c400="#B2B2B2",
48
+ c500="#808080",
49
+ c600="#636363",
50
+ c700="#515151",
51
+ c800="#393939",
52
+ # c900="#272727",
53
+ c900="#2B2B2B",
54
+ c950="#171717",
55
+ ),
56
+ radius_size=gr.themes.sizes.radius_sm,
57
+ ).set(
58
+ # button_primary_background_fill="*primary_500",
59
+ button_primary_background_fill_dark="*primary_600",
60
+ # button_primary_background_fill_hover="*primary_400",
61
+ # button_primary_border_color="*primary_500",
62
+ button_primary_border_color_dark="*primary_600",
63
+ button_primary_text_color="white",
64
+ button_primary_text_color_dark="white",
65
+ button_secondary_background_fill="*neutral_100",
66
+ button_secondary_background_fill_hover="*neutral_50",
67
+ button_secondary_background_fill_dark="*neutral_900",
68
+ button_secondary_text_color="*neutral_800",
69
+ button_secondary_text_color_dark="white",
70
+ # background_fill_primary="#F7F7F7",
71
+ # background_fill_primary_dark="#1F1F1F",
72
+ # block_title_text_color="*primary_500",
73
+ block_title_background_fill_dark="*primary_900",
74
+ block_label_background_fill_dark="*primary_900",
75
+ input_background_fill="#F6F6F6",
76
+ # chatbot_code_background_color_dark="*neutral_950",
77
+ )
78
+
79
+
80
+ def compact_text_chunks(self, prompt, text_chunks: List[str]) -> List[str]:
81
+ logging.debug("Compacting text chunks...🚀🚀🚀")
82
+ combined_str = [c.strip() for c in text_chunks if c.strip()]
83
+ combined_str = [f"[{index+1}] {c}" for index, c in enumerate(combined_str)]
84
+ combined_str = "\n\n".join(combined_str)
85
+ # resplit based on self.max_chunk_overlap
86
+ text_splitter = self.get_text_splitter_given_prompt(prompt, 1, padding=1)
87
+ return text_splitter.split_text(combined_str)
88
+
89
+
90
+ def postprocess(y: List[Tuple[str | None, str | None]]) -> List[Tuple[str | None, str | None]]:
91
+ """
92
+ Parameters:
93
+ y: List of tuples representing the message and response pairs. Each message and response should be a string, which may be in Markdown format.
94
+ Returns:
95
+ List of tuples representing the message and response. Each message and response will be a string of HTML.
96
+ """
97
+ if y is None or y == []:
98
+ return []
99
+ temp = []
100
+ for x in y:
101
+ user, bot = x
102
+ if not detect_converted_mark(user):
103
+ user = convert_asis(user)
104
+ if not detect_converted_mark(bot):
105
+ bot = convert_mdtext(bot)
106
+ temp.append((user, bot))
107
+ return temp
108
+
109
+
110
+ custom_js_path = os.path.join(ROOT_PATH, "assets/custom.js")
111
+ kelpy_codos_path = os.path.join(ROOT_PATH, "assets/Kelpy-Codos.js")
112
+
113
+ with (
114
+ open(custom_js_path, "r", encoding="utf-8") as f,
115
+ open(kelpy_codos_path, "r", encoding="utf-8") as f2,
116
+ ):
117
+ customJS = f.read()
118
+ kelpyCodos = f2.read()
119
+
120
+
121
+ def reload_javascript():
122
+ print("Reloading javascript...")
123
+ js = f"<script>{customJS}</script><script>{kelpyCodos}</script>"
124
+
125
+ def template_response(*args, **kwargs):
126
+ res = GradioTemplateResponseOriginal(*args, **kwargs)
127
+ res.body = res.body.replace(b"</html>", f"{js}</html>".encode("utf8"))
128
+ res.init_headers()
129
+ return res
130
+
131
+ gr.routes.templates.TemplateResponse = template_response
132
+
133
+
134
+ GradioTemplateResponseOriginal = gr.routes.templates.TemplateResponse
kimi_vl/serve/gradio_utils.py ADDED
@@ -0,0 +1,93 @@
1
+ """
2
+ Gradio utils for the Kimi-VL application.
3
+ """
4
+
5
+ import functools
6
+ from typing import Callable
7
+ import traceback
8
+
9
+ import gradio as gr
10
+
11
+
12
+ IMAGE_TOKEN = "<image>"
13
+
14
+
15
+ def transfer_input(input_text, input_images):
16
+ """
17
+ Transfer the input text and images to the input text and images.
18
+ """
19
+ return (input_text, input_images, gr.update(value=""), gr.update(value=None), gr.Button(visible=True))
20
+
21
+
22
+ def delete_last_conversation(chatbot, history):
23
+ """
24
+ Delete the last conversation from the chatbot and history.
25
+
26
+ Args:
27
+ chatbot (list): The chatbot list.
28
+ history (list): The history list.
29
+ """
30
+ if len(history) % 2 != 0:
31
+ gr.Error("history length is not even")
32
+ return (
33
+ chatbot,
34
+ history,
35
+ "Delete Done",
36
+ )
37
+
38
+ if len(chatbot) > 0:
39
+ chatbot.pop()
40
+
41
+ if len(history) > 0 and len(history) % 2 == 0:
42
+ history.pop()
43
+ history.pop()
44
+
45
+ return (
46
+ chatbot,
47
+ history,
48
+ "Delete Done",
49
+ )
50
+
51
+
52
+ def reset_state():
53
+ return [], [], None, "Reset Done"
54
+
55
+
56
+ def reset_textbox():
57
+ return gr.update(value=""), ""
58
+
59
+
60
+ def cancel_outputing():
61
+ return "Stop Done"
62
+
63
+
64
+ class State:
65
+ interrupted = False
66
+
67
+ def interrupt(self):
68
+ self.interrupted = True
69
+
70
+ def recover(self):
71
+ self.interrupted = False
72
+
73
+
74
+ shared_state = State()
75
+
76
+
77
+ def wrap_gen_fn(gen_fn: Callable):
78
+ """
79
+ Wrap the generator function to handle errors.
80
+ """
81
+
82
+ @functools.wraps(gen_fn)
83
+ def wrapped_gen_fn(prompt, *args, **kwargs):
84
+ try:
85
+ yield from gen_fn(prompt, *args, **kwargs)
86
+ except gr.Error as g_err:
87
+ traceback.print_exc()
88
+ raise g_err
89
+ except Exception as e:
90
+ traceback.print_exc()
91
+ raise gr.Error(f"Failed to generate text: {e}") from e
92
+
93
+ return wrapped_gen_fn
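
For context, a small sketch of how the `wrap_gen_fn` decorator above is meant to be used (the same pattern `app.py` applies to `predict`); the toy generator below is purely illustrative and not part of this commit.

```python
# Hypothetical usage sketch for wrap_gen_fn; the generator is a toy example.
from kimi_vl.serve.gradio_utils import wrap_gen_fn

@wrap_gen_fn
def stream_words(prompt):
    # Any unexpected exception raised while streaming is re-raised as gr.Error,
    # so the Gradio frontend shows a readable message instead of a stack trace.
    for word in prompt.split():
        yield word

for piece in stream_words("streamed through the wrapped generator"):
    print(piece)
```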
kimi_vl/serve/inference.py ADDED
@@ -0,0 +1,223 @@
1
+ import logging
2
+ import re
3
+ from threading import Thread
4
+ from typing import List, Optional
5
+
6
+ import torch
7
+ from transformers import (
8
+ AutoModelForCausalLM,
9
+ AutoProcessor,
10
+ AutoConfig,
11
+ StoppingCriteria,
12
+ StoppingCriteriaList,
13
+ TextIteratorStreamer,
14
+ )
15
+
16
+ from .chat_utils import Conversation, get_conv_template
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ def load_model(model_path: str = "moonshotai/Kimi-VL-A3B-Thinking"):
22
+ # hotfix the model to use flash attention 2
23
+ config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
24
+ config._attn_implementation = "flash_attention_2"
25
+ config.vision_config._attn_implementation = "flash_attention_2"
26
+ config.text_config._attn_implementation = "flash_attention_2"
27
+ print("Successfully set the attn_implementation to flash_attention_2")
28
+
29
+ model = AutoModelForCausalLM.from_pretrained(
30
+ model_path,
31
+ config=config,
32
+ torch_dtype="auto",
33
+ device_map="auto",
34
+ trust_remote_code=True,
35
+ )
36
+ processor = AutoProcessor.from_pretrained(model_path, config=config, trust_remote_code=True)
37
+
38
+ return model, processor
39
+
40
+
41
+ class StoppingCriteriaSub(StoppingCriteria):
42
+ def __init__(self, stops=[], encounters=1):
43
+ super().__init__()
44
+ self.stops = [stop.to("cuda") for stop in stops]
45
+
46
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs):
47
+ for stop in self.stops:
48
+ if input_ids.shape[-1] < len(stop):
49
+ continue
50
+ if torch.all((stop == input_ids[0][-len(stop) :])).item():
51
+ return True
52
+
53
+ return False
54
+
55
+
56
+ def format_messages(
57
+ conversations: list[Conversation],
58
+ system_prompt: Optional[str] = "",
59
+ sft_format: Optional[str] = "kimi-vl",
60
+ ):
61
+ """
62
+ Format the conversations to the input format of the model.
63
+ """
64
+ converstion = get_conv_template(sft_format)
65
+ converstion.set_system_message(system_prompt)
66
+ for message in conversations:
67
+ converstion.append_message(message["role"], message["content"])
68
+ return converstion
69
+
70
+
71
+ def preprocess(
72
+ messages: list[dict],
73
+ processor,
74
+ sft_format: Optional[str] = "kimi-vl",
75
+ ):
76
+ """
77
+ Build messages from the conversations and images.
78
+ """
79
+ # get images from conversations
80
+ results = []
81
+ images = []
82
+
83
+ # get texts from conversations
84
+ conversation = get_conv_template(sft_format)
85
+ # only use the last 3 messages
86
+ latest_messages = messages[-3:]
87
+ for mid, message in enumerate(latest_messages):
88
+ if message["role"] == converstion.roles[0] or message["role"] == "user":
89
+ record = {
90
+ "role": message["role"],
91
+ "content": [],
92
+ }
93
+ if "images" in message:
94
+ per_round_images = message["images"]
95
+ if len(per_round_images) > 2:
96
+ per_round_images = per_round_images[-2:]
97
+ print(f"Only use the last 2 images in the {mid}-th round")
98
+
99
+ images.extend(per_round_images)
100
+ for image in per_round_images:
101
+ record["content"].append(
102
+ {
103
+ "type": "image",
104
+ "image": image,
105
+ }
106
+ )
107
+ if 'content' in message:
108
+ record["content"].append(
109
+ {
110
+ "type": "text",
111
+ "text": str(message["content"]).strip(),
112
+ }
113
+ )
114
+ results.append(record)
115
+ elif message["role"] == converstion.roles[1] or message["role"] == "assistant":
116
+ formatted_answer = message["content"].strip()
117
+ # Example assistant reply from the thinking model (translated from Chinese): ◁think▷The user said "Hello", a simple greeting that usually opens a conversation. They may just be greeting politely, or they may have a concrete need such as asking about my capabilities or wanting help. Since no further information was given, I should stay open while inviting them to say more.
118
+ # My reply should be friendly and open-ended, neither overly formal nor cold, and should not assume a specific need.◁/think▷Hello! Nice to meet you. Is there anything I can help you with?
119
+ # delete all the text between ◁think▷ and ◁/think▷ so only the final answer is kept in the chat history
120
+ # FIXME: this is a hack to remove the thinking text
121
+ # formatted_answer = re.sub(r"◁think▷.*◁/think▷", "", formatted_answer)
122
+ think_end_token = '◁/think▷'
123
+ formatted_answer = formatted_answer.split(think_end_token)[-1]
124
+ results.append(
125
+ {
126
+ "role": message["role"],
127
+ "content": [
128
+ {
129
+ "type": "text",
130
+ "text": formatted_answer,
131
+ }
132
+ ],
133
+ }
134
+ )
135
+ assert (
136
+ formatted_answer.count(processor.image_token) == 0
137
+ ), f"there should be no {processor.image_token} in the assistant's reply, but got {messages}"
138
+ conversation.append_message(conversation.roles[1], formatted_answer)
139
+
140
+ text = processor.apply_chat_template(results, add_generation_prompt=True)
141
+ print(f"raw text = {text}")
142
+ if len(images) == 0:
143
+ images = None
144
+
145
+ inputs = processor(
146
+ images=images,
147
+ text=[text],
148
+ return_tensors="pt",
149
+ padding=True,
150
+ truncation=True,
151
+ )
152
+ return inputs
153
+
154
+
155
+ @torch.inference_mode()
156
+ def kimi_vl_generate(
157
+ model: torch.nn.Module,
158
+ processor: AutoProcessor,
159
+ conversations: list[Conversation],
160
+ stop_words: list,
161
+ max_length: int = 256,
162
+ temperature: float = 1.0,
163
+ top_p: float = 1.0,
164
+ chunk_size: int = -1,
165
+ ):
166
+ # convert conversation to inputs
167
+ print(f"conversations = {conversations}")
168
+ inputs = preprocess(conversations, processor=processor)
169
+ inputs = inputs.to(model.device)
170
+
171
+ return generate(
172
+ model,
173
+ processor,
174
+ inputs,
175
+ max_gen_len=max_length,
176
+ temperature=temperature,
177
+ top_p=top_p,
178
+ stop_words=stop_words,
179
+ chunk_size=chunk_size,
180
+ )
181
+
182
+
183
+ @torch.inference_mode()
184
+ def generate(
185
+ model,
186
+ processor,
187
+ inputs,
188
+ max_gen_len: int = 256,
189
+ temperature: float = 0,
190
+ top_p: float = 0.95,
191
+ stop_words: List[str] = [],
192
+ chunk_size: int = -1,
193
+ ):
194
+ """Stream the text output from the multimodality model with prompt and image inputs."""
195
+ tokenizer = processor.tokenizer
196
+ stop_words_ids = [torch.tensor(tokenizer.encode(stop_word)) for stop_word in stop_words]
197
+ stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
198
+ streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
199
+
200
+ kwargs = dict(
201
+ **inputs,
202
+ max_new_tokens=max_gen_len,
203
+ do_sample=True,
204
+ use_cache=True,
205
+ streamer=streamer,
206
+ stopping_criteria=stopping_criteria,
207
+ )
208
+
209
+ if temperature > 0:
210
+ kwargs.update(
211
+ {
212
+ "do_sample": True,
213
+ "top_p": top_p,
214
+ "temperature": temperature,
215
+ }
216
+ )
217
+ else:
218
+ kwargs["do_sample"] = False
219
+
220
+ thread = Thread(target=model.generate, kwargs=kwargs)
221
+ thread.start()
222
+
223
+ yield from streamer
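Together, `load_model`, `preprocess`, and `kimi_vl_generate` form the serving path: the conversation is trimmed to the last three messages (and at most two images per round), templated by the processor, and decoded on a background thread while `TextIteratorStreamer` yields partial text. A rough end-to-end sketch, assuming a CUDA GPU with flash-attention installed; the example conversation and the stop token are illustrative, not taken from this commit:

```python
from PIL import Image

model, processor = load_model("moonshotai/Kimi-VL-A3B-Thinking")

conversation = [
    {
        "role": "user",
        "content": "Describe this image in one sentence.",
        "images": [Image.open("demo.jpeg")],  # hypothetical local image
    },
]

for partial_text in kimi_vl_generate(
    model=model,
    processor=processor,
    conversations=conversation,
    stop_words=["<|im_end|>"],  # assumed stop token; check the model's chat template
    max_length=512,
    temperature=0.6,
    top_p=0.95,
):
    print(partial_text, end="", flush=True)
```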
kimi_vl/serve/utils.py ADDED
@@ -0,0 +1,290 @@
1
+ from __future__ import annotations
2
+
3
+ import html
4
+ import logging
5
+ import io
6
+ import os
7
+ import re
8
+ import base64
9
+ import time
10
+ from PIL import Image, ImageDraw, ImageFont
11
+
12
+ import mdtex2html
13
+ from markdown import markdown
14
+ from pygments import highlight
15
+ from pygments.formatters import HtmlFormatter
16
+ from pygments.lexers import ClassNotFound, get_lexer_by_name, guess_lexer
17
+
18
+
19
+ ALREADY_CONVERTED_MARK = "<!-- ALREADY CONVERTED BY PARSER. -->"
20
+ BOX2COLOR = {
21
+ 0: (255, 0, 0),
22
+ 1: (0, 255, 0),
23
+ 2: (0, 0, 255),
24
+ }
25
+ MAX_IMAGE_SIZE = 1024
26
+ MIN_IMAGE_SIZE = 1024
27
+ logger = logging.getLogger("gradio_logger")
28
+
29
+
30
+ def configure_logger(log_dir: str = "logs"):
31
+ logger = logging.getLogger("gradio_logger")
32
+ logger.setLevel(logging.DEBUG)
33
+
34
+ timestr = time.strftime("%Y%m%d-%H%M%S")
35
+ os.makedirs(log_dir, exist_ok=True)
36
+ file_handler = logging.FileHandler(f"{log_dir}/{timestr}_gradio_log.log")
37
+ console_handler = logging.StreamHandler()
38
+
39
+ formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
40
+ console_handler.setFormatter(formatter)
41
+ file_handler.setFormatter(formatter)
42
+
43
+ console_handler.setLevel(logging.INFO)
44
+ file_handler.setLevel(logging.INFO)
45
+
46
+ logger.addHandler(console_handler)
47
+ logger.addHandler(file_handler)
48
+
49
+ return logger
50
+
51
+
52
+ def strip_stop_words(x, stop_words):
53
+ for w in stop_words:
54
+ if w in x:
55
+ return x[: x.index(w)].strip()
56
+ return x.strip()
57
+
58
+
59
+ def format_output(history, text, x):
60
+ updated_history = history + [[text, x]]
61
+ a = [[y[0], convert_to_markdown(y[1])] for y in updated_history]
62
+ return a, updated_history
63
+
64
+
65
+ def markdown_to_html_with_syntax_highlight(md_str): # deprecated
66
+ def replacer(match):
67
+ lang = match.group(1) or "text"
68
+ code = match.group(2)
69
+
70
+ try:
71
+ lexer = get_lexer_by_name(lang, stripall=True)
72
+ except ValueError:
73
+ lexer = get_lexer_by_name("text", stripall=True)
74
+
75
+ formatter = HtmlFormatter()
76
+ highlighted_code = highlight(code, lexer, formatter)
77
+
78
+ return f'<pre><code class="{lang}">{highlighted_code}</code></pre>'
79
+
80
+ code_block_pattern = r"```(\w+)?\n([\s\S]+?)\n```"
81
+ md_str = re.sub(code_block_pattern, replacer, md_str, flags=re.MULTILINE)
82
+
83
+ html_str = markdown(md_str)
84
+ return html_str
85
+
86
+
87
+ def normalize_markdown(md_text: str) -> str: # deprecated
88
+ lines = md_text.split("\n")
89
+ normalized_lines = []
90
+ inside_list = False
91
+
92
+ for i, line in enumerate(lines):
93
+ if re.match(r"^(\d+\.|-|\*|\+)\s", line.strip()):
94
+ if not inside_list and i > 0 and lines[i - 1].strip() != "":
95
+ normalized_lines.append("")
96
+ inside_list = True
97
+ normalized_lines.append(line)
98
+ elif inside_list and line.strip() == "":
99
+ if i < len(lines) - 1 and not re.match(r"^(\d+\.|-|\*|\+)\s", lines[i + 1].strip()):
100
+ normalized_lines.append(line)
101
+ continue
102
+ else:
103
+ inside_list = False
104
+ normalized_lines.append(line)
105
+
106
+ return "\n".join(normalized_lines)
107
+
108
+
109
+ def convert_mdtext(md_text):
110
+ code_block_pattern = re.compile(r"```(.*?)(?:```|$)", re.DOTALL)
111
+ inline_code_pattern = re.compile(r"`(.*?)`", re.DOTALL)
112
+ code_blocks = code_block_pattern.findall(md_text)
113
+ non_code_parts = code_block_pattern.split(md_text)[::2]
114
+
115
+ result = []
116
+ for non_code, code in zip(non_code_parts, code_blocks + [""]):
117
+ if non_code.strip():
118
+ non_code = normalize_markdown(non_code)
119
+ if inline_code_pattern.search(non_code):
120
+ result.append(markdown(non_code, extensions=["tables"]))
121
+ else:
122
+ result.append(mdtex2html.convert(non_code, extensions=["tables"]))
123
+ if code.strip():
124
+ code = f"\n```{code}\n\n```"
125
+ code = markdown_to_html_with_syntax_highlight(code)
126
+ result.append(code)
127
+ result = "".join(result)
128
+ result += ALREADY_CONVERTED_MARK
129
+ return result
130
+
131
+
132
+ def convert_asis(userinput):
133
+ return f'<p style="white-space:pre-wrap;">{html.escape(userinput)}</p>{ALREADY_CONVERTED_MARK}'
134
+
135
+
136
+ def is_stop_word_or_prefix(s: str, stop_words: list) -> bool:
137
+ return any(s.endswith(stop_word) for stop_word in stop_words)
138
+
139
+
140
+ def detect_converted_mark(userinput):
141
+ return bool(userinput.endswith(ALREADY_CONVERTED_MARK))
142
+
143
+
144
+ def detect_language(code):
145
+ first_line = "" if code.startswith("\n") else code.strip().split("\n", 1)[0]
146
+ language = first_line.lower() if first_line else ""
147
+ code_without_language = code[len(first_line) :].lstrip() if first_line else code
148
+ return language, code_without_language
149
+
150
+
151
+ def convert_to_markdown(text):
152
+ text = text.replace("$", "&#36;")
153
+ text = text.replace("\r\n", "\n")
154
+
155
+ def replace_leading_tabs_and_spaces(line):
156
+ new_line = []
157
+
158
+ for char in line:
159
+ if char == "\t":
160
+ new_line.append("&#9;")
161
+ elif char == " ":
162
+ new_line.append("&nbsp;")
163
+ else:
164
+ break
165
+ return "".join(new_line) + line[len(new_line) :]
166
+
167
+ markdown_text = ""
168
+ lines = text.split("\n")
169
+ in_code_block = False
170
+
171
+ for line in lines:
172
+ if in_code_block is False and line.startswith("```"):
173
+ in_code_block = True
174
+ markdown_text += f"{line}\n"
175
+ elif in_code_block is True and line.startswith("```"):
176
+ in_code_block = False
177
+ markdown_text += f"{line}\n"
178
+ elif in_code_block:
179
+ markdown_text += f"{line}\n"
180
+ else:
181
+ line = replace_leading_tabs_and_spaces(line)
182
+ line = re.sub(r"^(#)", r"\\\1", line)
183
+ markdown_text += f"{line} \n"
184
+
185
+ return markdown_text
186
+
187
+
188
+ def add_language_tag(text):
189
+ def detect_language(code_block):
190
+ try:
191
+ lexer = guess_lexer(code_block)
192
+ return lexer.name.lower()
193
+ except ClassNotFound:
194
+ return ""
195
+
196
+ code_block_pattern = re.compile(r"(```)(\w*\n[^`]+```)", re.MULTILINE)
197
+
198
+ def replacement(match):
199
+ code_block = match.group(2)
200
+ if match.group(2).startswith("\n"):
201
+ language = detect_language(code_block)
202
+ return f"```{language}{code_block}```" if language else f"```\n{code_block}```"
203
+ else:
204
+ return match.group(1) + code_block + "```"
205
+
206
+ text2 = code_block_pattern.sub(replacement, text)
207
+ return text2
208
+
209
+
210
+ def is_variable_assigned(var_name: str) -> bool:
211
+ return var_name in locals()
212
+
213
+
214
+ def pil_to_base64(
215
+ image: Image.Image,
216
+ alt: str = "user upload image",
217
+ resize: bool = True,
218
+ max_size: int = MAX_IMAGE_SIZE,
219
+ min_size: int = MIN_IMAGE_SIZE,
220
+ format: str = "JPEG",
221
+ quality: int = 95,
222
+ ) -> str:
223
+ """
224
+ Convert a PIL image to a base64 string.
225
+ """
226
+
227
+ if resize:
228
+ max_hw, min_hw = max(image.size), min(image.size)
229
+ aspect_ratio = max_hw / min_hw
230
+ shortest_edge = int(min(max_size / aspect_ratio, min_size, min_hw))
231
+ longest_edge = int(shortest_edge * aspect_ratio)
232
+ W, H = image.size
233
+ if H > W:
234
+ H, W = longest_edge, shortest_edge
235
+ else:
236
+ H, W = shortest_edge, longest_edge
237
+ image = image.resize((W, H))
238
+
239
+ buffered = io.BytesIO()
240
+ image.save(buffered, format=format, quality=quality)
241
+ img_b64_str = base64.b64encode(buffered.getvalue()).decode()
242
+ img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="{alt}" />'
243
+
244
+ return img_str
245
+
246
+
247
+ def parse_ref_bbox(response, image: Image.Image):
248
+ try:
249
+ image = image.copy()
250
+ image_h, image_w = image.size
251
+ draw = ImageDraw.Draw(image)
252
+
253
+ ref = re.findall(r'<\|ref\|>.*?<\|/ref\|>', response)
254
+ bbox = re.findall(r'<\|det\|>.*?<\|/det\|>', response)
255
+ assert len(ref) == len(bbox)
256
+
257
+ if len(ref) == 0:
258
+ return None
259
+
260
+ boxes, labels = [], []
261
+ for box, label in zip(bbox, ref):
262
+ box = box.replace('<|det|>', '').replace('<|/det|>', '')
263
+ label = label.replace('<|ref|>', '').replace('<|/ref|>', '')
264
+ box = box[1:-1]
265
+ for onebox in re.findall(r'\[.*?\]', box):
266
+ boxes.append(eval(onebox))
267
+ labels.append(label)
268
+
269
+ for indice, (box, label) in enumerate(zip(boxes, labels)):
270
+ box = (
271
+ int(box[0] / 999 * image_h),
272
+ int(box[1] / 999 * image_w),
273
+ int(box[2] / 999 * image_h),
274
+ int(box[3] / 999 * image_w),
275
+ )
276
+
277
+ box_color = BOX2COLOR[indice % len(BOX2COLOR.keys())]
278
+ box_width = 3
279
+ draw.rectangle(box, outline=box_color, width=box_width)
280
+
281
+ text_x = box[0]
282
+ text_y = box[1] - 20
283
+ text_color = box_color
284
+ font = ImageFont.truetype("kimi_vl/serve/assets/simsun.ttc", size=20)
285
+ draw.text((text_x, text_y), label, font=font, fill=text_color)
286
+
287
+ return image
288
+ except Exception as e:
289
+ logger.error(f"Error parsing reference bounding boxes: {e}")
290
+ return None
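Most of the utilities above are Markdown/HTML glue for the chat UI; the two that touch model output directly are `parse_ref_bbox`, which expects grounding spans in the `<|ref|>label<|/ref|><|det|>[[x1, y1, x2, y2]]<|/det|>` style with coordinates normalized to 0-999, and `pil_to_base64`, which inlines an image into the chat HTML. A small illustrative call; the response string below is made up, and the drawing step silently returns None if the bundled font is missing:

```python
from PIL import Image

image = Image.new("RGB", (640, 480), "white")
response = "<|ref|>red box<|/ref|><|det|>[[100, 120, 500, 400]]<|/det|>"

annotated = parse_ref_bbox(response, image)  # copy of the image with boxes drawn, or None on failure
if annotated is not None:
    html_tag = pil_to_base64(annotated, alt="annotated image", resize=False)
    print(html_tag[:60], "...")
```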
pyproject.toml ADDED
@@ -0,0 +1,29 @@
1
+ [project]
2
+ name = "kimi_vl"
3
+ version = "1.0.0"
4
+ description = "Kimi-VL"
5
+ license = {file = "LICENSE-CODE"}
6
+ readme = "README.md"
7
+ requires-python = ">=3.8"
8
+ dependencies = [
9
+ "torch==2.5.0",
10
+ "transformers==4.50.0",
11
+ "accelerate",
12
+ "tiktoken",
13
+ "blobfile",
14
+ "sentencepiece",
15
+ ]
16
+
17
+ [project.optional-dependencies]
18
+ gradio = [
19
+ "gradio==3.48.0",
20
+ "gradio-client==0.6.1",
21
+ "mdtex2html==1.3.0",
22
+ "pypinyin==0.50.0",
23
+ ]
24
+
25
+ # Linter tools:
26
+ [tool.black]
27
+ line-length = 120
28
+ skip-string-normalization = true
29
+ target-version = ["py310"]
requirements.txt ADDED
@@ -0,0 +1,21 @@
1
+ torch==2.5.0
2
+ torchvision==0.20.0
3
+ transformers==4.51.1
4
+ accelerate
5
+ sentencepiece
6
+ attrdict
7
+ einops
8
+ tiktoken
9
+ blobfile
10
+ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
11
+
12
+ # for gradio demo
13
+ gradio
14
+ gradio-client
15
+ mdtex2html
16
+ pypinyin
17
+ tqdm
18
+ colorama
19
+ Pygments
20
+ markdown
21
+ SentencePiece
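After installing the pinned requirements, a quick Python sanity check can confirm that the GPU build of flash-attention and the expected libraries import cleanly (the printed versions depend on the resolved environment; the wheel above targets CUDA 12, torch 2.5, and Python 3.10):

```python
import torch
import transformers

print("torch:", torch.__version__, "| cuda available:", torch.cuda.is_available())
print("transformers:", transformers.__version__)

try:
    import flash_attn
    print("flash_attn:", flash_attn.__version__)
except ImportError as exc:
    print("flash_attn not importable:", exc)
```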