
Commit e0dbec0

Authored Mar 13, 2025
llama : refactor llama_context, llama_kv_cache, llm_build_context (ggml-org#12181)
* llama : refactor llama_context, llama_kv_cache, llm_build_context
* graph : don't mutate the KV cache during defrag
* context : reduce virtuals + remove test function
* context : move interface implementation to source file + factory
* graph : move KV cache build functions to llama_context impl
* graph : remove model reference from build_pooling
* graph : remove llama_model reference
* kv_cache : provide rope factors
* graph : rework inputs to use only unique_ptr, remove attn input abstraction
* context : remove llama_context_i abstraction
* context : clean-up
* graph : clean-up
* llama : remove redundant keywords (struct, enum)
* model : adapt gemma3
* graph : restore same attention ops as on master
* llama : remove TODO + fix indent
1 parent 2048b59 commit e0dbec0

46 files changed (+13785 −12072 lines)
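The two diffs shown below cover only 2 of the 46 changed files, but they capture the caller-visible side of the refactor: the per-context KV cache helpers were renamed from llama_kv_cache_* to llama_kv_self_*. A summary of the renames, based only on the call sites changed in these diffs:

// old name (before this commit)          new name (this commit)
// llama_kv_cache_can_shift(ctx)      ->  llama_kv_self_can_shift(ctx)
// llama_kv_cache_clear(ctx)          ->  llama_kv_self_clear(ctx)
// llama_kv_cache_seq_rm (ctx, ...)   ->  llama_kv_self_seq_rm (ctx, ...)
// llama_kv_cache_seq_add(ctx, ...)   ->  llama_kv_self_seq_add(ctx, ...)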
 

‎common/common.cpp

+3 −3
@@ -955,8 +955,8 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }
 
-    if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
-        LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
+    if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) {
+        LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
         params.ctx_shift = false;
     }
 
@@ -1060,7 +1060,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         if (llama_model_has_decoder(model)) {
             llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
         }
-        llama_kv_cache_clear(lctx);
+        llama_kv_self_clear(lctx);
         llama_synchronize(lctx);
         llama_perf_context_reset(lctx);
     }
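A minimal sketch of how a caller might adapt to the renamed API, mirroring the two updated call sites in common_init_from_params above. It assumes an already created llama_context * lctx; the helper names configure_ctx_shift and finish_warmup are illustrative and not part of llama.cpp:

#include <cstdio>
#include "llama.h"

// Assumption: `lctx` is a valid llama_context created elsewhere.
// `want_ctx_shift` plays the role of params.ctx_shift in the diff above.
static bool configure_ctx_shift(llama_context * lctx, bool want_ctx_shift) {
    // after this commit the capability is queried per context, not per model
    if (want_ctx_shift && !llama_kv_self_can_shift(lctx)) {
        fprintf(stderr, "KV cache shifting is not supported for this context, disabling it\n");
        return false;
    }
    return want_ctx_shift;
}

static void finish_warmup(llama_context * lctx) {
    // discard whatever the warmup decode left in the KV cache,
    // wait for pending work, and reset the performance counters
    llama_kv_self_clear(lctx);
    llama_synchronize(lctx);
    llama_perf_context_reset(lctx);
}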

‎common/speculative.cpp

+4 −4
@@ -173,7 +173,7 @@ llama_tokens common_speculative_gen_draft(
     result.reserve(params.n_draft);
 
     if (reuse_n == 0) {
-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);
 
         prompt.clear();
     } else {
@@ -192,14 +192,14 @@
     }
 
     if (reuse_i > 0) {
-        llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i);
-        llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
+        llama_kv_self_seq_rm (ctx, 0, 0, reuse_i);
+        llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
 
         prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
     }
 
     if (reuse_n < (int) prompt.size()) {
-        llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1);
+        llama_kv_self_seq_rm (ctx, 0, reuse_n, -1);
 
         prompt.erase(prompt.begin() + reuse_n, prompt.end());
     }
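The speculative.cpp change uses the renamed sequence operations to keep a reused prefix of the draft context in place. A rough sketch of that pattern, lifted from common_speculative_gen_draft above and wrapped in a free function for illustration (the helper name keep_reused_prefix and its exact signature are assumptions, not library API):

#include <vector>
#include "llama.h"

// Keep tokens [reuse_i, reuse_i + reuse_n) of `prompt` in the KV cache of
// sequence 0 and shift them so the reused span starts at position 0.
static void keep_reused_prefix(llama_context * ctx, std::vector<llama_token> & prompt,
                               int reuse_i, int reuse_n) {
    if (reuse_n == 0) {
        // nothing to reuse: start the draft context from scratch
        llama_kv_self_clear(ctx);
        prompt.clear();
        return;
    }

    if (reuse_i > 0) {
        // drop cells before the reused span, then shift the span back to position 0
        llama_kv_self_seq_rm (ctx, 0, 0, reuse_i);
        llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i);

        prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
    }

    if (reuse_n < (int) prompt.size()) {
        // drop cells (and cached tokens) past the reused span
        llama_kv_self_seq_rm(ctx, 0, reuse_n, -1);

        prompt.erase(prompt.begin() + reuse_n, prompt.end());
    }
}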
