Spaces:

FL33TW00D-HF
/

throughput-calculator

Running

App Files Files Community

FL33TW00D committed on Apr 8

Commit

dc80200

unverified ·

1 Parent(s): 1c9123b

chore: init

Browse files

Files changed (11) hide show

README.md +54 -14
pyproject.toml +12 -0
src/__init__.py +0 -0
src/__pycache__/__init__.cpython-310.pyc +0 -0
src/__pycache__/__init__.cpython-313.pyc +0 -0
src/__pycache__/app.cpython-310.pyc +0 -0
src/__pycache__/throughput_utils.cpython-310.pyc +0 -0
src/__pycache__/throughput_utils.cpython-313.pyc +0 -0
src/app.py +252 -0
src/throughput_utils.py +148 -0
uv.lock +0 -0

README.md CHANGED Viewed

@@ -1,14 +1,54 @@
----
-title: Throughput Calculator
-emoji: 🐠
-colorFrom: gray
-colorTo: indigo
-sdk: gradio
-sdk_version: 5.23.3
-app_file: app.py
-pinned: false
-license: mit
-short_description: Calculate the estimated throughput of on-device LLMs 🚀
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# On-Device LLM Throughput Calculator
+A Gradio web application that helps visualize LLM throughput on memory-bandwidth-constrained devices.
+## Overview
+This tool calculates and visualizes the theoretical throughput (tokens per second) that can be achieved by a Large Language Model (LLM) running on devices with memory bandwidth constraints. It supports different attention mechanisms:
+- Grouped Query Attention (GQA)
+- Multi-Query Attention (MQA)
+- Multi-head Latent Attention (MLA)
+It also visualizes how sliding window attention impacts throughput at different context lengths.
+## Features
+- Customize device specifications (memory bandwidth)
+- Configure model parameters (size, layers, heads)
+- Compare different attention mechanisms
+- Visualize performance across different context lengths
+- Sliding window attention support
+## Usage
+1. Configure your device details (name, memory bandwidth)
+2. Set model parameters (number of parameters, layer count, etc.)
+3. Choose which attention mechanism configurations to compare
+4. Generate a visualization of expected throughput
+## Installation
+```bash
+pip install "gradio>=4.0.0" "numpy>=1.24.0" "matplotlib>=3.7.0" "seaborn>=0.12.0"
+```
+## Running Locally
+```bash
+cd src
+python app.py
+```
+## Theory
+The calculations are based on memory bandwidth bottlenecks as described in the [JAX ML Scaling Book](https://jax-ml.github.io/scaling-book/inference/#theoretical-estimates-for-llm-latency-and-throughput).
+The basic formula for tokens per second:
+```
+tokens_per_second = (batch_size * memory_bandwidth) / (batch_size * total_kv_size + parameter_size)
+```
+## License
+MIT

pyproject.toml ADDED Viewed

	@@ -0,0 +1,12 @@

+[project]
+name = "throughput-calculator"
+version = "0.1.0"
+description = "Calculate the estimated throughput of on-device LLMs"
+readme = "README.md"
+requires-python = ">=3.10.6"
+dependencies = [
+    "gradio>=4.0.0",
+    "numpy>=1.24.0",
+    "matplotlib>=3.7.0",
+    "seaborn>=0.12.0",
+]

src/__init__.py ADDED Viewed

File without changes

src/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (178 Bytes). View file

src/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (182 Bytes). View file

src/__pycache__/app.cpython-310.pyc ADDED Viewed

Binary file (7.07 kB). View file

src/__pycache__/throughput_utils.cpython-310.pyc ADDED Viewed

Binary file (4.47 kB). View file

src/__pycache__/throughput_utils.cpython-313.pyc ADDED Viewed

Binary file (6.68 kB). View file

src/app.py ADDED Viewed

	@@ -0,0 +1,252 @@

+import gradio as gr
+from enum import Enum
+from throughput_utils import create_throughput_plot
+class AttentionType(Enum):  # attention kind of a layer; same members as the enum in throughput_utils, not referenced in the visible part of this module
+    LOCAL = 0  # sliding-window layer
+    GLOBAL = 1  # full-context layer
+class PhoneBandwidth(Enum):  # memory bandwidth per iPhone generation; member names feed the "iPhone Model" dropdown
+    Sixteen = 60  # presumably GB/s: create_throughput_plot scales the value by 1e9 and labels it "GB/s"
+    Fifteen = 51.2
+    Fourteen = 34.1
+custom_css = """
+#plot-container {
+    border-radius: 10px;
+    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1), 0 1px 3px rgba(0, 0, 0, 0.08);
+    padding: 1rem;
+    background-color: white;
+    height: 100%;
+    margin-bottom: 1.5rem;
+}
+#generate-button {
+    background-color: #2563eb;
+    color: white;
+    border-radius: 8px;
+    font-weight: bold;
+    padding: 10px 20px;
+    box-shadow: 0 4px 6px rgba(37, 99, 235, 0.1);
+    transition: all 0.2s ease;
+    width: 100%;
+    max-width: 400px;
+    margin: 0 auto;
+    font-size: 16px;
+}
+#generate-button:hover {
+    background-color: #1d4ed8;
+    box-shadow: 0 6px 8px rgba(37, 99, 235, 0.2);
+    transform: translateY(-2px);
+}
+.gradio-container {
+    background-color: #f5f7fa;
+}
+/* Custom styles for sliders containers */
+.sliders-container {
+    border: 1px solid rgba(0, 0, 0, 0.1);
+    border-radius: 8px;
+    padding: 1rem;
+    margin-top: 0.5rem;
+    background-color: rgba(255, 255, 255, 0.8);
+}
+#error-status {
+    color: #b91c1c;
+    background-color: #fee2e2;
+    border-radius: 8px;
+    padding: 0.75rem;
+    margin-top: 0.5rem;
+    border: 1px solid #f87171;
+    font-weight: 500;
+}
+"""
+with gr.Blocks(css=custom_css) as demo:
+    gqa_sliders = []
+    mla_sliders = []
+    with gr.Column():
+        gr.Markdown(
+            """# 📊 On-Device LLM Throughput Calculator
+This tool estimates the throughput (tokens per second) of Large Language Models on devices with memory bandwidth constraints.
+It visualizes how different attention mechanisms (GQA, MLA) and context lengths affect throughput.
+"""
+        )
+        with gr.Row():
+            plot_output = gr.Image(label="Throughput Plot", type="pil", elem_id="plot-container")
+        # Add status element to display validation errors
+        status_output = gr.Markdown(visible=False, elem_id="error-status")
+        with gr.Row():
+            plot_button = gr.Button("Generate Throughput Plot", size="lg", elem_id="generate-button", variant="primary")
+        with gr.Row():
+            with gr.Column(scale=1):
+                with gr.Group():
+                    gr.Markdown("### Device Configuration")
+                    model_name = gr.Textbox(label="Model Name", value="TinyLLM")
+                    iphone_model = gr.Dropdown(
+                        label="iPhone Model",
+                        choices=[e.name for e in PhoneBandwidth],
+                        value=PhoneBandwidth.Sixteen.name,
+                        interactive=True
+                    )
+                with gr.Group():
+                    gr.Markdown("### Attention Configurations to Plot")
+                    gr.Markdown("#### GQA Head Configurations")
+                    gr.Markdown("*Note: GQA head count must be less than or equal to the total number of heads*")
+                    with gr.Column(elem_classes="sliders-container"):
+                        gqa_slider1 = gr.Slider(minimum=1, maximum=32, step=2, value=4,
+                                               label="GQA Head Count #1")
+                        gqa_slider2 = gr.Slider(minimum=1, maximum=32, step=2, value=8,
+                                               label="GQA Head Count #2")
+                        gqa_sliders.extend([gqa_slider1, gqa_slider2])
+                    gr.Markdown("#### MLA Compressed Dimensions")
+                    gr.Markdown("*Note: MLA dimension must be less than or equal to d_model*")
+                    with gr.Column(elem_classes="sliders-container"):
+                        mla_slider1 = gr.Slider(minimum=64, maximum=1024, step=64, value=256,
+                                              label="MLA Dimension #1")
+                        mla_slider2 = gr.Slider(minimum=64, maximum=1024, step=64, value=512,
+                                              label="MLA Dimension #2")
+                        mla_sliders.extend([mla_slider1, mla_slider2])
+            with gr.Column(scale=1):
+                with gr.Group():
+                    gr.Markdown("### Model Configuration")
+                    num_parameters = gr.Number(label="Parameters (Billions)", value=3)
+                    parameter_size = gr.Slider(minimum=1, maximum=16.0, step=1.0,  label="Parameter Size (bits per param)", value=5)
+                    kv_parameter_size = gr.Slider(minimum=0.25, maximum=4.0, step=0.25,
+                                                 label="KV Cache Size (bytes per value)", value=2.0)
+                    num_layers = gr.Number(label="Number of Layers", value=36)
+                    num_heads = gr.Number(label="Number of Heads", value=16,
+                                        info="GQA head counts must be less than or equal to this value")
+                    d_model = gr.Number(label="D Model", value=2048,
+                                       info="MLA dimensions must be less than or equal to this value")
+                with gr.Group():
+                    gr.Markdown("### Context Configuration")
+                    ctx_length = gr.Slider(minimum=1024, maximum=131072, step=1024,
+                                          label="Max Context Length", value=65536)
+                    local_layers = gr.Number(label="Local Attention Layers", value=0)
+                    global_layers = gr.Number(label="Global Attention Layers", value=1)
+                    swa_size = gr.Slider(minimum=1024, maximum=32768, step=1024,
+                                       label="Sliding Window Size", value=4096)
+        gr.Markdown(
+            """
+            For more information, see [JAX ML Scaling Book](https://jax-ml.github.io/scaling-book/inference/#theoretical-estimates-for-llm-latency-and-throughput).
+            """
+        )
+    def generate_throughput_plot(
+        model_name, iphone_model, num_parameters, parameter_size,
+        kv_parameter_size, num_layers, num_heads, d_model, ctx_length,
+        local_layers, global_layers, swa_size, gqa_1, gqa_2, mla_1, mla_2
+    ):
+        memory_bandwidth = PhoneBandwidth[iphone_model].value
+        if "iPhone" not in model_name:
+            model_name = f"iPhone {iphone_model}: {model_name}"
+        try:
+            # Validate GQA head counts must be less than total attention heads
+            for gqa_heads, label in [(gqa_1, "GQA Head Count #1"), (gqa_2, "GQA Head Count #2")]:
+                if gqa_heads > num_heads:
+                    raise ValueError(f"{label} ({gqa_heads}) cannot be greater than the total number of attention heads ({num_heads})")
+            # Validate MLA compressed dimensions must be less than d_model
+            for mla_dim, label in [(mla_1, "MLA Dimension #1"), (mla_2, "MLA Dimension #2")]:
+                if mla_dim > d_model:
+                    raise ValueError(f"{label} ({mla_dim}) cannot be greater than the model dimension (d_model = {d_model})")
+            plot_img = create_throughput_plot(
+                model_name,
+                memory_bandwidth,
+                num_parameters,
+                parameter_size,
+                kv_parameter_size,
+                num_layers,
+                num_heads,
+                d_model,
+                ctx_length,
+                local_layers,
+                global_layers,
+                swa_size,
+                [gqa_1, gqa_2],
+                [mla_1, mla_2],
+            )
+            # Hide error message, show plot
+            return [
+                gr.update(value=plot_img),
+                gr.update(visible=False, value="")
+            ]
+        except Exception as e:
+            err_string = f"Error generating plot: {str(e)}"
+            print(err_string)
+            # Show error message, clear plot
+            return [
+                gr.update(value=None),
+                gr.update(visible=True, value=f"⚠️ {err_string}")
+            ]
+    # Function to update GQA sliders based on number of heads
+    def update_gqa_sliders(heads_value):
+        if not heads_value or heads_value < 1:
+            heads_value = 1
+        return [gr.update(maximum=heads_value, value=min(slider.value, heads_value)) for slider in gqa_sliders]
+    # Function to update MLA sliders based on d_model
+    def update_mla_sliders(d_model_value):
+        if not d_model_value or d_model_value < 64:
+            d_model_value = 64
+        return [gr.update(maximum=d_model_value, value=min(slider.value, d_model_value)) for slider in mla_sliders]
+    # Add event handlers to update sliders when model configuration changes
+    num_heads.change(
+        update_gqa_sliders,
+        inputs=[num_heads],
+        outputs=gqa_sliders
+    )
+    d_model.change(
+        update_mla_sliders,
+        inputs=[d_model],
+        outputs=mla_sliders
+    )
+    plot_button.click(  # run the calculation when the button is clicked
+        generate_throughput_plot,
+        inputs=[  # order must match generate_throughput_plot's positional parameters
+            model_name,
+            iphone_model,
+            num_parameters,
+            parameter_size,
+            kv_parameter_size,
+            num_layers,
+            num_heads,
+            d_model,
+            ctx_length,
+            local_layers,
+            global_layers,
+            swa_size,
+            *gqa_sliders,  # received as gqa_1, gqa_2
+            *mla_sliders,  # received as mla_1, mla_2
+        ],
+        outputs=[plot_output, status_output]  # the handler returns one update for each, in this order
+    )
+if __name__ == "__main__":  # launch only when executed as a script, not on import
+    demo.launch()

src/throughput_utils.py ADDED Viewed

	@@ -0,0 +1,148 @@

+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+from matplotlib.ticker import ScalarFormatter
+from enum import Enum
+import io
+class AttentionType(Enum):  # attention kind assigned to a layer by the sliding-window pattern
+    LOCAL = 0  # compute_tps caps this layer's cached tokens at swa_size
+    GLOBAL = 1  # compute_tps counts the full context length for this layer
+def gqa_kv_per_layer_per_token(n_kv_heads, d_head, kv_parameter_size):
+    return  2 * kv_parameter_size * n_kv_heads * d_head
+def mla_kv_per_layer_per_token(d_compressed, kv_parameter_size):
+    return kv_parameter_size * d_compressed
+def tokens_per_second(batch_size, bandwidth, total_kv_size, param_size):
+    return (batch_size * bandwidth) / (batch_size * total_kv_size + param_size)
+def compute_tps(kv_per_layer_per_token, seq_len, batch_size, total_param_size,
+                num_layers, swa_pattern, swa_size, bandwidth):
+    tps_values = []
+    for ctx_len in seq_len:
+        total_kv_size = 0
+        for l in range(num_layers):
+            if swa_pattern[l % len(swa_pattern)] == AttentionType.LOCAL:
+                total_kv_size += kv_per_layer_per_token * min(ctx_len, swa_size)
+            else:
+                total_kv_size += kv_per_layer_per_token * ctx_len
+        tps = tokens_per_second(batch_size, bandwidth, total_kv_size, total_param_size)
+        tps_values.append(tps)
+    return tps_values
+def create_throughput_plot(
+    model_name,
+    memory_bandwidth,
+    num_parameters,
+    parameter_size,
+    kv_parameter_size,
+    num_layers,
+    num_heads,
+    d_model,
+    ctx_length,
+    local_layers,
+    global_layers,
+    swa_size,
+    gqa_heads,
+    mla_d_compressed,
+):
+    memory_bandwidth = float(memory_bandwidth) * 1_000_000_000
+    num_parameters = float(num_parameters) * 1_000_000_000
+    d_head = d_model // num_heads
+    total_param_size = num_parameters * (parameter_size / 8.0)
+    swa_pattern = ([AttentionType.LOCAL] * local_layers +
+                  [AttentionType.GLOBAL] * global_layers)
+    if len(swa_pattern) == 0:
+        swa_pattern = [AttentionType.GLOBAL]
+    sns.set_theme(style="whitegrid", context="paper")
+    palette = sns.color_palette("viridis", len(gqa_heads) + len(mla_d_compressed))
+    plt.figure(figsize=(14, 8), dpi=300)
+    seq_len = np.logspace(2, 5, 100).astype(int)
+    batch_size = 1
+    tps_values = []
+    gqa_count = len(gqa_heads)
+    for i, n_kv_head in enumerate(gqa_heads):
+        n_kv_head = int(n_kv_head)
+        kv_per_token = gqa_kv_per_layer_per_token(n_kv_head, d_head, kv_parameter_size)
+        gqa_tps_values = compute_tps(kv_per_token, seq_len, batch_size, total_param_size,
+                                num_layers, swa_pattern, swa_size, memory_bandwidth)
+        tps_values.extend(gqa_tps_values)
+        plt.plot(seq_len, gqa_tps_values, label=f"GQA: {n_kv_head} heads", color=palette[i],
+                linewidth=3.5, alpha=0.85)
+    plt.axvline(x=ctx_length, color='red', linestyle='--', alpha=0.8, linewidth=2.5,
+                label=f"Max Context Length ({ctx_length:,})")
+    local_count = swa_pattern.count(AttentionType.LOCAL)
+    global_count = swa_pattern.count(AttentionType.GLOBAL)
+    if local_count > 0:
+        plt.axvline(x=swa_size, color='blue', linestyle='--', alpha=0.8, linewidth=2.5,
+                    label=f"Sliding Window Limit ({swa_size:,})")
+    for i, d_comp in enumerate(mla_d_compressed):
+        d_comp = int(d_comp)
+        kv_per_token = mla_kv_per_layer_per_token(d_comp, kv_parameter_size)
+        mla_tps_values = compute_tps(kv_per_token, seq_len, batch_size, total_param_size,
+                                num_layers, swa_pattern, swa_size, memory_bandwidth)
+        tps_values.extend(mla_tps_values)
+        plt.plot(seq_len, mla_tps_values, label=f"MLA: dc = {d_comp}",
+                color=palette[i + gqa_count], linewidth=3.5, alpha=0.85)
+    plt.xscale('log')
+    if all(np.isfinite(tps_values)):
+        min_tps = min(tps_values)
+        max_tps = max(tps_values)
+        y_min = max(0, min_tps * 0.9)
+        y_max = max_tps * 1.1
+        plt.ylim(y_min, y_max)
+    else:
+        plt.ylim(15, 40)
+    plt.gca().xaxis.set_major_formatter(ScalarFormatter())
+    plt.gca().yaxis.set_major_formatter(ScalarFormatter())
+    ax = plt.gca()
+    ax.spines['top'].set_visible(False)
+    ax.spines['right'].set_visible(False)
+    ax.spines['left'].set_linewidth(1.5)
+    ax.spines['bottom'].set_linewidth(1.5)
+    attn_label = "Global" if local_count == 0 else f"SWA {local_count}:{global_count}"
+    device_name = model_name.split(':')[0] if ':' in model_name else model_name
+    plt.annotate(f"{device_name}\nBandwidth: {memory_bandwidth/1e9:.1f} GB/s\nParameter Size: {parameter_size:.1f} bits\nAttention Kind: {attn_label}",
+                 xy=(0.8, 0.97),
+                 xycoords='axes fraction',
+                 bbox=dict(boxstyle="round,pad=0.4", facecolor="white", alpha=0.9, edgecolor='darkgray'),
+                 va='top',
+                 fontsize=11)
+    plt.xlabel('Context Length (tokens)', fontsize=14, fontweight='bold')
+    plt.ylabel('Tokens per Second', fontsize=14, fontweight='bold')
+    plt.tick_params(axis='both', which='major', labelsize=12)
+    model_title = model_name.split(':')[1] if ':' in model_name else model_name
+    plt.title(f"{model_title}: Tokens Per Second vs. Sequence Length", fontsize=18,
+              fontweight='bold', pad=20)
+    plt.legend(title="Configuration", frameon=True, framealpha=0.95, fontsize=12, title_fontsize=14)
+    plt.grid(True, alpha=0.5)
+    buf = io.BytesIO()
+    plt.savefig(buf, format='png')
+    plt.close()
+    buf.seek(0)
+    from PIL import Image
+    img = Image.open(buf)
+    return img

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff