---
# config.yaml
# Execution settings: RNG seed, device/dtype selection, and compile/profiling
# toggles. `auto` is presumably resolved by the consuming code — confirm.
runtime:
  seed: 7
  device: auto
  dtype: auto
  torch_compile: false
  profile_cuda: true
# Model selection and generation settings.
model:
  name: distilgpt2
  # NOTE(review): `main` is a moving ref; pin a commit hash if runs must be
  # reproducible — confirm how the loader uses this value.
  revision: main
  trust_remote_code: false
  max_new_tokens: 24
  use_attention_outputs: true
  attention_implementation: eager
  # Same value as `name` above; a larger model is listed separately for the
  # full benchmark.
  smoke_model_name: distilgpt2
  full_benchmark_model_name: Qwen/Qwen2.5-0.5B-Instruct
# Workload shapes used to drive the runs.
workloads:
  long_context_targets: [2048, 4096, 8192]
  synthetic_prompt_tokens: 1536
  # 8 steps x 128 tokens = 1024 tokens, assuming one chunk is consumed per
  # step — TODO confirm.
  streaming_chunk_tokens: 128
  streaming_steps: 8
  multi_turn_turns: 6
  benchmark_prompts_path: benchmark/data/real_prompts.json
# Benchmark matrix: which cache modes to compare and how often to run them.
benchmark:
  modes: [no_cache, standard_cache, kvcachex]
  repetitions: 1
  warmup_runs: 1
  teacher_forced_accuracy: true
  greedy_similarity: true
# Cache compression settings. Nesting matters here: `enabled` must live under
# this section, otherwise it collides with the `enabled` key of `eviction`.
compression:
  enabled: true
  method: quantization
  quantization_bits: 8
  asymmetric_kv_quantization: false
  key_quantization_scheme: affine_per_channel
  value_quantization_scheme: affine_per_token
  materialization_cache_enabled: false
  # NOTE(review): the two sizes below presumably only apply when
  # materialization_cache_enabled is true — confirm.
  materialization_chunk_cache_size: 128
  materialization_prefix_cache_size: 16
  segmented_prefix_enabled: true
  segment_tail_tokens: 96
  segment_flush_tokens: 48
  # NOTE(review): low-rank / clustering knobs are presumably ignored while
  # `method` is `quantization` — confirm against the consumer.
  low_rank_ratio: 0.5
  cluster_ratio: 0.45
  cluster_prefix_fraction: 0.7
  min_seq_for_low_rank: 256
  min_seq_for_clustering: 256
# Token eviction settings. Nesting matters here: `enabled` must live under
# this section, otherwise it collides with the `enabled` key of `compression`.
eviction:
  enabled: true
  target_keep_ratio: 0.7
  attention_threshold: 0.002
  recent_tokens_to_keep: 128
  pin_first_tokens: 16
  min_tokens_to_keep: 128
  prefill_observation_window: 64
  prefill_head_aware_eviction: true
  dynamic_decode_pruning: true
  decode_prune_margin: 64
  # The two head weights sum to 1.0 (0.65 + 0.35).
  head_score_weight: 0.65
  head_support_weight: 0.35
  # NOTE(review): the `.pkl` extension suggests a pickle file; if it is loaded
  # with pickle, only trusted files should be used — confirm the loader.
  semantic_model_path: models/token_importance_model.pkl
  train_if_missing: true
  # The three scoring weights sum to 1.0 (0.45 + 0.2 + 0.35).
  attention_weight: 0.45
  recency_weight: 0.2
  semantic_weight: 0.35
# Cache-size scheduling settings.
scheduler:
  adaptive_window: true
  # NOTE(review): base and max are both 384, so there is no headroom above the
  # base size; presumably the window can only shrink toward
  # min_cache_tokens — confirm this is intended.
  base_cache_tokens: 384
  max_cache_tokens: 384
  min_cache_tokens: 128
  pressure_latency_ms: 40.0
  pressure_memory_utilization: 0.8
  attention_probe_interval: 12
  compression_interval: 1
  grow_factor: 1.2
  shrink_factor: 0.8
# Monitoring settings: attention decay factor and warning thresholds.
monitor:
  attention_decay: 0.92
  accuracy_warning_nll: 2.5
  token_agreement_warning: 0.8
# Edge-device simulation settings.
edge:
  simulate_edge: true
  # 4294967296 bytes = 4 GiB (4 * 1024^3).
  max_memory_bytes: 4294967296
# Output artifact paths (all relative; presumably resolved against the
# project root or working directory — confirm).
outputs:
  metrics_csv: results/metrics.csv
  experiment_logs: results/experiment_logs.json
  bottleneck_report: analysis/bottleneck_report.md
  failure_report: analysis/failure_cases.md
  dashboard_html: dashboard/kvcachex_dashboard.html