Commit 6e06b7a
Parent(s): 8bcba7b

Add enhanced KV cache calculator with GQA/MHA detection and fp4 support

- Added support for GQA vs MHA detection and display
- Implemented fp4 data type support (MXFP4)
- Enhanced model configuration display with calculation formulas
- Set Qwen3-30B-A3B as default model
- Added proper attribution to gaunernst's original implementation
- Optimized interface for iframe embedding in blogs
- README.md +27 -5
- app.py +125 -0
- requirements.txt +1 -0
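
The fp4 option mentioned above follows an MXFP4-style estimate (4-bit values plus a shared scale per 32-element block). As a rough sketch of what each data-type choice costs per cached element — the hidden size of 2048 used here is an assumed value, included only to illustrate the fp8 per-token scale overhead:

```python
# Illustrative bytes-per-element for each KV cache dtype option.
# hidden_size is an assumed value (2048), used only for the fp8 scale estimate.
hidden_size = 2048
bytes_per_elem = {
    "fp16/bf16": 2.0,                 # 16-bit values, no scale overhead
    "fp8": 1 + 2 / hidden_size,       # 8-bit values + a 2-byte per-token scale spread over hidden_size elements (the app's assumption)
    "fp4": 0.5 + 2 / 32,              # 4-bit values + a 2-byte scale per 32-element block, as in the app's MXFP4 estimate
}
for name, nbytes in bytes_per_elem.items():
    print(f"{name}: {nbytes:.4f} bytes/element")
# fp16/bf16: 2.0000, fp8: 1.0010, fp4: 0.5625
```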
README.md
CHANGED
@@ -1,13 +1,35 @@
 ---
 title: LLM KV Cache Calculator
-emoji:
-colorFrom:
-colorTo:
+emoji: 💻
+colorFrom: blue
+colorTo: purple
 sdk: gradio
 sdk_version: 5.45.0
 app_file: app.py
 pinned: false
-short_description: KV cache
+short_description: Calculate KV cache memory requirements for transformer models with support for MHA, GQA, and MLA attention mechanisms
 ---
 
-
+# KV Cache Calculator
+
+Calculate KV cache memory requirements for transformer models.
+
+## Credits
+
+This implementation is derived from and builds upon the excellent work by [gaunernst](https://huggingface.co/spaces/gaunernst/kv-cache-calculator). Special thanks for the original implementation!
+
+## Features
+
+- **Multi-attention support**: MHA (Multi-Head Attention), GQA (Grouped Query Attention), and MLA (Multi-head Latent Attention)
+- **Multiple data types**: fp16/bf16, fp8, and fp4 quantization
+- **Real-time calculation**: Instant memory requirement estimates
+- **Model analysis**: Detailed breakdown of model configuration
+- **Universal compatibility**: Works with any HuggingFace transformer model
+
+## Usage
+
+1. Enter your model ID (e.g., "Qwen/Qwen3-30B-A3B")
+2. Set context length and number of users
+3. Choose data type precision
+4. Add HuggingFace token if needed for gated models
+5. Click calculate to get memory requirements
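
For a sense of scale, here is a minimal sketch of the GQA-path arithmetic the calculator applies; the layer count, KV-head count, and head dimension below are illustrative assumptions, not values quoted from any specific model's config:

```python
# KV cache size for a hypothetical GQA model, fp16/bf16, one user.
num_layers, num_kv_heads, head_dim = 48, 4, 128        # assumed values for illustration
ctx_len, num_users, bytes_per_elem = 128_000, 1, 2     # fp16/bf16 -> 2 bytes per element

elems_per_token = num_layers * num_kv_heads * head_dim * 2   # 2 = key + value
size_gb = elems_per_token * ctx_len * num_users * bytes_per_elem / 1e9
print(f"{size_gb:.2f} GB")   # 12.58 GB for these assumed values
```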
app.py
ADDED
@@ -0,0 +1,125 @@
+import gradio as gr
+from transformers import AutoConfig
+
+# Credits: This implementation is derived from and builds upon the excellent work by gaunernst
+# Original implementation: https://huggingface.co/spaces/gaunernst/kv-cache-calculator
+
+
+def calculate(name: str, ctx_len: int, num_users: int, dtype: str, hf_token: str):
+    hf_token = hf_token.strip()
+    try:
+        cfg = AutoConfig.from_pretrained(
+            name,
+            trust_remote_code=True,
+            token=hf_token or None,
+        )
+    except Exception as e:
+        raise gr.Error(e)
+
+    use_mla = cfg.architectures[0].startswith(("DeepseekV2", "DeepseekV3"))
+
+    if hasattr(cfg, "text_config"):
+        cfg = cfg.text_config
+
+    num_layers = cfg.num_hidden_layers
+
+    # Determine attention mechanism type
+    num_attention_heads = cfg.num_attention_heads
+    num_kv_heads = getattr(cfg, "num_key_value_heads", num_attention_heads)
+
+    if use_mla:
+        attention_type = "MLA"
+    elif num_kv_heads == num_attention_heads:
+        attention_type = "MHA"
+    else:
+        attention_type = "GQA"
+
+    model_config = [
+        ["num_layers", num_layers],
+        ["max_ctx_len", cfg.max_position_embeddings],
+        ["attention_type", attention_type],
+        ["num_attention_heads", num_attention_heads],
+        ["num_kv_heads", num_kv_heads],
+    ]
+    if ctx_len > cfg.max_position_embeddings:
+        gr.Warning(
+            "Requested context length is larger than the max value supported by the model"
+        )
+
+    # Calculate KV cache elements per token based on attention mechanism
+    if use_mla:
+        kv_lora_rank = cfg.kv_lora_rank
+        qk_rope_head_dim = cfg.qk_rope_head_dim
+        nelems_per_token = num_layers * (kv_lora_rank + qk_rope_head_dim)
+
+        model_config.append(["kv_lora_rank", kv_lora_rank])
+        model_config.append(["qk_rope_head_dim", qk_rope_head_dim])
+        model_config.append(["calc_formula", f"{num_layers} * ({kv_lora_rank} + {qk_rope_head_dim})"])
+
+    else:
+        head_dim = getattr(cfg, "head_dim", cfg.hidden_size // num_attention_heads)
+        nelems_per_token = num_layers * num_kv_heads * head_dim * 2  # 2 for key and value
+
+        model_config.append(["head_dim", head_dim])
+        if attention_type == "GQA":
+            kv_ratio = num_attention_heads // num_kv_heads
+            model_config.append(["gqa_ratio", f"{kv_ratio}:1"])
+        model_config.append(["calc_formula", f"{num_layers} * {num_kv_heads} * {head_dim} * 2"])
+
+    if dtype == "fp16/bf16":
+        nbytes_per_elem = 2
+    elif dtype == "fp8":
+        nbytes_per_elem = 1 + 2 / cfg.hidden_size  # assume per-token scaling
+    elif dtype == "fp4":
+        nbytes_per_elem = 0.5 + 2 / 32  # 4-bit weights + scaling factor every 32 elements (MXFP4)
+
+    kv_cache_size = nelems_per_token * ctx_len * num_users * nbytes_per_elem / 1e9
+    return kv_cache_size, model_config
+
+
+# Minimal description for iframe embedding
+DESCRIPTION = (
+    "Calculate KV cache memory requirements for transformer models. "
+    "Supports MHA, GQA, and MLA attention mechanisms with fp16/bf16, fp8, and fp4 data types."
+)
+
+demo = gr.Interface(
+    title="KV Cache Calculator",
+    description=DESCRIPTION,
+    fn=calculate,
+    inputs=[
+        gr.Textbox(label="Model ID", value="Qwen/Qwen3-30B-A3B", placeholder="e.g., Qwen/Qwen3-30B-A3B"),
+        gr.Number(label="Context Length", value=128_000, minimum=1),
+        gr.Number(label="Number of Users", value=1, minimum=1),
+        gr.Dropdown(label="KV Cache Data Type", choices=["fp16/bf16", "fp8", "fp4"], value="fp16/bf16"),
+        gr.Textbox(label="HuggingFace Token (optional)", type="password", placeholder="For gated models"),
+    ],
+    outputs=[
+        gr.Number(label="KV Cache Size (GB)", precision=2),
+        gr.Dataframe(
+            label="Model Configuration",
+            headers=["Parameter", "Value"],
+            datatype=["str", "str"],
+            wrap=True
+        ),
+    ],
+    theme=gr.themes.Soft(),
+    css="""
+    .gradio-container {
+        max-width: 800px !important;
+        margin: 0 auto !important;
+    }
+    """,
+    analytics_enabled=False,
+)
+
+if __name__ == "__main__":
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        show_error=True,
+        # Enable embedding in iframes
+        allowed_paths=[],
+        app_kwargs={"docs_url": None, "redoc_url": None}
+    )
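
The function can also be exercised outside the Gradio UI; a minimal sketch, assuming this app.py is importable from the working directory and the requested model is publicly accessible:

```python
# Hypothetical local check: import the Space's calculate() and print its output.
from app import calculate

size_gb, config_rows = calculate(
    name="Qwen/Qwen3-30B-A3B",
    ctx_len=32_000,
    num_users=1,
    dtype="fp16/bf16",
    hf_token="",
)
print(f"KV cache: {size_gb:.2f} GB")
for param, value in config_rows:
    print(f"{param}: {value}")
```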
requirements.txt
ADDED
@@ -0,0 +1 @@
+transformers