Update API to match latest llama.cpp version #1991

Open · wants to merge 3 commits into base: main
29 changes: 25 additions & 4 deletions llama_cpp/_internals.py
@@ -13,6 +13,11 @@
from dataclasses import dataclass, field
from contextlib import ExitStack

try:
    from warnings import deprecated
except ImportError:
    from ._utils import deprecated

import numpy as np
import numpy.typing as npt

@@ -276,21 +281,37 @@ def n_ctx(self) -> int:
    def pooling_type(self) -> int:
        return llama_cpp.llama_pooling_type(self.ctx)

    @deprecated("Use kv_self_clear")
    def kv_cache_clear(self):
        llama_cpp.llama_kv_cache_clear(self.ctx)
        self.kv_self_clear()

    @deprecated("Use kv_self_seq_rm")
    def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int):
        llama_cpp.llama_kv_cache_seq_rm(self.ctx, seq_id, p0, p1)
        self.kv_self_seq_rm(seq_id, p0, p1)

    @deprecated("Use kv_self_seq_cp")
    def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int):
        llama_cpp.llama_kv_cache_seq_cp(self.ctx, seq_id_src, seq_id_dst, p0, p1)
        self.kv_self_seq_cp(seq_id_src, seq_id_dst, p0, p1)

    @deprecated("Use kv_self_seq_keep")
    def kv_cache_seq_keep(self, seq_id: int):
        llama_cpp.llama_kv_cache_seq_keep(self.ctx, seq_id)
        self.kv_self_seq_keep(seq_id)

    def kv_cache_seq_shift(self, seq_id: int, p0: int, p1: int, shift: int):
        llama_cpp.llama_kv_cache_seq_add(self.ctx, seq_id, p0, p1, shift)

    def kv_self_clear(self):
        llama_cpp.llama_kv_self_clear(self.ctx)

    def kv_self_seq_rm(self, seq_id: int, p0: int, p1: int):
        llama_cpp.llama_kv_self_seq_rm(self.ctx, seq_id, p0, p1)

    def kv_self_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int):
        llama_cpp.llama_kv_self_seq_cp(self.ctx, seq_id_src, seq_id_dst, p0, p1)

    def kv_self_seq_keep(self, seq_id: int):
        llama_cpp.llama_kv_self_seq_keep(self.ctx, seq_id)

    def get_state_size(self) -> int:
        return llama_cpp.llama_get_state_size(self.ctx)

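For orientation, here is a minimal usage sketch of the renamed internal wrappers. It assumes `ctx` is an already-constructed instance of the internal context wrapper in `llama_cpp._internals` (construction is not shown in this diff), so treat it as an illustration rather than part of the change:

```python
import warnings

def reset_kv(ctx) -> None:
    """Illustrative helper: clear the KV cache through the renamed wrappers."""
    # The old spelling still works, but now emits a DeprecationWarning
    # ("Use kv_self_clear") before forwarding to the new method.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", DeprecationWarning)
        ctx.kv_cache_clear()

    # Preferred spellings after this PR; they call the llama_kv_self_* C
    # functions directly.
    ctx.kv_self_clear()
    ctx.kv_self_seq_keep(0)       # drop every sequence except sequence 0
    ctx.kv_self_seq_rm(0, 0, -1)  # then remove sequence 0 over [0, inf)
```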
16 changes: 16 additions & 0 deletions llama_cpp/_utils.py
@@ -1,5 +1,7 @@
import os
import sys
import warnings
import functools

from typing import Any, Dict

@@ -76,3 +78,17 @@ class Singleton(object, metaclass=MetaSingleton):

    def __init__(self):
        super(Singleton, self).__init__()


def deprecated(reason):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            warnings.warn(
                f"Call to deprecated function {func.__name__} ({reason}).",
                category=DeprecationWarning,
                stacklevel=2,
            )
            return func(*args, **kwargs)
        return wrapper
    return decorator
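As a quick behavioural check of the shim above (on Python 3.13+ the stdlib `warnings.deprecated` would be picked up instead by the `try`/`except` import in `_internals.py`), a self-contained sketch with a hypothetical `old_fn`:

```python
import warnings

from llama_cpp._utils import deprecated  # the shim added above

@deprecated("Use new_fn")
def old_fn(x: int) -> int:
    return x + 1

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    assert old_fn(1) == 2  # still calls through to the wrapped function

assert any(issubclass(w.category, DeprecationWarning) for w in caught)
assert old_fn.__name__ == "old_fn"  # functools.wraps preserves metadata
```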
125 changes: 92 additions & 33 deletions llama_cpp/llama_cpp.py
@@ -165,6 +165,10 @@
# llama_sampler_p = NewType("llama_sampler_p", int)
# llama_sampler_p_ctypes = ctypes.c_void_p

# struct llama_kv_cache;
llama_kv_cache_p = NewType("llama_kv_cache_p", int)
llama_kv_cache_p_ctypes = ctypes.c_void_p

# typedef int32_t llama_pos;
llama_pos = ctypes.c_int32
# typedef int32_t llama_token;
@@ -259,7 +263,9 @@
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28
LLAMA_VOCAB_PRE_TYPE_GPT4O = 29

LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32

# // note: these values should be synchronized with ggml_rope
# // TODO: maybe move this enum to ggml.h (ggml_rope_type)
@@ -630,10 +636,29 @@ class llama_model_kv_override(ctypes.Structure):
    value: Union[int, float, bool, bytes]



# struct llama_model_tensor_buft_override {
#     const char * pattern;
#     ggml_backend_buffer_type_t buft;
# };
class llama_model_tensor_buft_override(ctypes.Structure):
_fields_ = [
("pattern", ctypes.c_char_p),
("buft", ctypes.c_void_p)
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wasn't sure if c_void_p is the correct type to use here, so feel free to change it if there's a better alternative.

]


llama_model_tensor_buft_override_p = ctypes.POINTER(llama_model_tensor_buft_override)


# struct llama_model_params {
# // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
# ggml_backend_dev_t * devices;

# // NULL-terminated list of buffer types to use for tensors that match a pattern
# const struct llama_model_tensor_buft_override * tensor_buft_overrides;

# int32_t n_gpu_layers; // number of layers to store in VRAM
# enum llama_split_mode split_mode; // how to split the model across multiple GPUs

@@ -695,6 +720,7 @@ class llama_model_params(ctypes.Structure):

_fields_ = [
("devices", ctypes.c_void_p), # NOTE: unnused
("llama_model_tensor_buft_override", llama_model_tensor_buft_override_p),
("n_gpu_layers", ctypes.c_int32),
("split_mode", ctypes.c_int),
("main_gpu", ctypes.c_int32),
@@ -1316,6 +1342,10 @@ def llama_n_vocab(model: llama_vocab_p, /) -> int:
def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]:
...

# LLAMA_API struct llama_kv_cache * llama_get_kv_self ( struct llama_context * ctx);
@ctypes_function("llama_get_kv_self", [llama_context_p_ctypes], llama_model_p_ctypes)
def llama_get_kv_self(ctx: llama_context_p, /) -> Optional[llama_kv_cache_p]:
...

# LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
@ctypes_function("llama_pooling_type", [llama_context_p_ctypes], ctypes.c_int)
@@ -1810,7 +1840,19 @@ def llama_kv_cache_view_update(ctx: llama_context_p, view: CtypesPointerOrRef[ll

# // Returns the number of tokens in the KV cache (slow, use only for debug)
# // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
# LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
# LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);
@ctypes_function(
"llama_kv_self_n_tokens", [llama_context_p_ctypes], ctypes.c_int32
)
def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int:
"""Returns the number of tokens in the KV cache (slow, use only for debug)
If a KV cell has multiple sequences assigned to it, it will be counted multiple times
"""
...

# // Returns the number of tokens in the KV cache (slow, use only for debug)
# // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
# DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx), "use llama_kv_self_n_tokens instead");
@ctypes_function(
"llama_get_kv_cache_token_count", [llama_context_p_ctypes], ctypes.c_int32
)
@@ -1832,10 +1874,10 @@ def llama_get_kv_cache_used_cells(ctx: llama_context_p, /) -> int:


# // Clear the KV cache - both cell info is erased and KV data is zeroed
# LLAMA_API void llama_kv_cache_clear(
# LLAMA_API void llama_kv_self_clear(
# struct llama_context * ctx);
@ctypes_function("llama_kv_cache_clear", [llama_context_p_ctypes], None)
def llama_kv_cache_clear(ctx: llama_context_p, /):
@ctypes_function("llama_kv_self_clear", [llama_context_p_ctypes], None)
def llama_kv_self_clear(ctx: llama_context_p, /):
"""Clear the KV cache"""
...

@@ -1845,13 +1887,13 @@ def llama_kv_cache_clear(ctx: llama_context_p, /):
# // seq_id < 0 : match any sequence
# // p0 < 0 : [0, p1]
# // p1 < 0 : [p0, inf)
# LLAMA_API bool llama_kv_cache_seq_rm(
# LLAMA_API bool llama_kv_self_seq_rm(
# struct llama_context * ctx,
# llama_seq_id seq_id,
# llama_pos p0,
# llama_pos p1);
@ctypes_function(
"llama_kv_cache_seq_rm",
"llama_kv_self_seq_rm",
[
llama_context_p_ctypes,
llama_seq_id,
@@ -1860,7 +1902,7 @@ def llama_kv_cache_clear(ctx: llama_context_p, /):
],
ctypes.c_bool,
)
def llama_kv_cache_seq_rm(
def llama_kv_self_seq_rm(
ctx: llama_context_p,
seq_id: Union[llama_seq_id, int],
p0: Union[llama_pos, int],
@@ -1881,14 +1923,14 @@ def llama_kv_cache_seq_rm(
# // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
# // p0 < 0 : [0, p1]
# // p1 < 0 : [p0, inf)
# LLAMA_API void llama_kv_cache_seq_cp(
# LLAMA_API void llama_kv_self_seq_cp(
# struct llama_context * ctx,
# llama_seq_id seq_id_src,
# llama_seq_id seq_id_dst,
# llama_pos p0,
# llama_pos p1);
@ctypes_function(
"llama_kv_cache_seq_cp",
"llama_kv_self_seq_cp",
[
llama_context_p_ctypes,
llama_seq_id,
@@ -1898,7 +1940,7 @@ def llama_kv_cache_seq_rm(
],
None,
)
def llama_kv_cache_seq_cp(
def llama_kv_self_seq_cp(
ctx: llama_context_p,
seq_id_src: Union[llama_seq_id, int],
seq_id_dst: Union[llama_seq_id, int],
@@ -1914,31 +1956,31 @@ def llama_kv_cache_seq_cp(


# // Removes all tokens that do not belong to the specified sequence
# LLAMA_API void llama_kv_cache_seq_keep(
# LLAMA_API void llama_kv_self_seq_keep(
# struct llama_context * ctx,
# llama_seq_id seq_id);
@ctypes_function(
"llama_kv_cache_seq_keep", [llama_context_p_ctypes, llama_seq_id], None
"llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None
)
def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /):
def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /):
"""Removes all tokens that do not belong to the specified sequence"""
...


# // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
# // If the KV cache is RoPEd, the KV data is updated accordingly:
# // - lazily on next llama_decode()
# // - explicitly with llama_kv_cache_update()
# // - explicitly with llama_kv_self_update()
# // p0 < 0 : [0, p1]
# // p1 < 0 : [p0, inf)
# LLAMA_API void llama_kv_cache_seq_add(
# LLAMA_API void llama_kv_self_seq_add(
# struct llama_context * ctx,
# llama_seq_id seq_id,
# llama_pos p0,
# llama_pos p1,
# llama_pos delta);
@ctypes_function(
"llama_kv_cache_seq_add",
"llama_kv_self_seq_add",
[
llama_context_p_ctypes,
llama_seq_id,
@@ -1948,7 +1990,7 @@ def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, in
],
None,
)
def llama_kv_cache_seq_add(
def llama_kv_self_seq_add(
ctx: llama_context_p,
seq_id: Union[llama_seq_id, int],
p0: Union[llama_pos, int],
@@ -1959,7 +2001,7 @@ def llama_kv_cache_seq_add(
"""Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
If the KV cache is RoPEd, the KV data is updated accordingly:
- lazily on next llama_decode()
- explicitly with llama_kv_cache_update()
- explicitly with llama_kv_self_update()
p0 < 0 : [0, p1]
p1 < 0 : [p0, inf)"""
...
@@ -1969,14 +2011,14 @@ def llama_kv_cache_seq_add(
# // If the KV cache is RoPEd, the KV data is updated accordingly
# // p0 < 0 : [0, p1]
# // p1 < 0 : [p0, inf)
# LLAMA_API void llama_kv_cache_seq_div(
# LLAMA_API void llama_kv_self_seq_div(
# struct llama_context * ctx,
# llama_seq_id seq_id,
# llama_pos p0,
# llama_pos p1,
# int d);
@ctypes_function(
"llama_kv_cache_seq_div",
"llama_kv_self_seq_div",
[
llama_context_p_ctypes,
llama_seq_id,
@@ -1986,7 +2028,7 @@ def llama_kv_cache_seq_add(
],
None,
)
def llama_kv_cache_seq_div(
def llama_kv_self_seq_div(
ctx: llama_context_p,
seq_id: Union[llama_seq_id, int],
p0: Union[llama_pos, int],
@@ -2004,29 +2046,29 @@ def llama_kv_cache_seq_div(
# // Defragment the KV cache
# // This will be applied:
# // - lazily on next llama_decode()
# // - explicitly with llama_kv_cache_update()
# LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
@ctypes_function("llama_kv_cache_defrag", [llama_context_p_ctypes], None)
def llama_kv_cache_defrag(ctx: llama_context_p, /):
# // - explicitly with llama_kv_self_update()
# LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx);
@ctypes_function("llama_kv_self_defrag", [llama_context_p_ctypes], None)
def llama_kv_self_defrag(ctx: llama_context_p, /):
"""Defragment the KV cache
This will be applied:
- lazily on next llama_decode()
- explicitly with llama_kv_cache_update()"""
- explicitly with llama_kv_self_update()"""
...


# // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
# LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
@ctypes_function("llama_kv_cache_update", [llama_context_p_ctypes], None)
def llama_kv_cache_update(ctx: llama_context_p, /):
# LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
@ctypes_function("llama_kv_self_update", [llama_context_p_ctypes], None)
def llama_kv_self_update(ctx: llama_context_p, /):
"""Apply the KV cache updates (such as K-shifts, defragmentation, etc.)"""
...


# // Check if the context supports KV cache shifting
# LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx);
@ctypes_function("llama_kv_cache_can_shift", [llama_context_p_ctypes], ctypes.c_bool)
def llama_kv_cache_can_shift(ctx: llama_context_p, /) -> bool:
# LLAMA_API bool llama_kv_self_can_shift(struct llama_context * ctx);
@ctypes_function("llama_kv_self_can_shift", [llama_context_p_ctypes], ctypes.c_bool)
def llama_kv_self_can_shift(ctx: llama_context_p, /) -> bool:
"""Check if the context supports KV cache shifting"""
...
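Putting the renamed KV-cache bindings together, a hedged ctypes-level sketch; the `ctx` handle is assumed to come from the usual context-creation path, which is not part of this diff:

```python
import llama_cpp.llama_cpp as llama_cpp

def tidy_kv_cache(ctx: llama_cpp.llama_context_p, shift: int = 0) -> int:
    """Illustrative only: exercise the renamed llama_kv_self_* bindings."""
    # Optionally add a relative position shift to sequence 0, if supported.
    if shift and llama_cpp.llama_kv_self_can_shift(ctx):       # was llama_kv_cache_can_shift
        llama_cpp.llama_kv_self_seq_add(ctx, 0, 0, -1, shift)  # was llama_kv_cache_seq_add

    # Schedule a defragmentation and apply pending updates (K-shifts, defrag, ...).
    llama_cpp.llama_kv_self_defrag(ctx)  # was llama_kv_cache_defrag
    llama_cpp.llama_kv_self_update(ctx)  # was llama_kv_cache_update

    # Slow, debug-only token count.
    return llama_cpp.llama_kv_self_n_tokens(ctx)  # was llama_get_kv_cache_token_count
```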

@@ -2547,6 +2589,16 @@ def llama_set_causal_attn(ctx: llama_context_p, causal_attn: bool, /):
...


# // Set whether the model is in warmup mode or not
# // If true, all model tensors are activated during llama_decode() to load and cache their weights.
# LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup);
@ctypes_function("llama_set_warmup", [llama_context_p_ctypes, ctypes.c_bool], None)
def llama_set_warmup(ctx: llama_context_p, warmup: bool, /):
"""Set whether to use causal attention or not
If set to true, the model will only attend to the past tokens"""
...


# // Set abort callback
# LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
@ctypes_function(
@@ -3701,6 +3753,13 @@ def llama_sampler_init_mirostat_v2(
...





# /// @details Intializes a GBNF grammar, see grammars/README.md for details.
# /// @param vocab The vocabulary that this grammar will be used with.
# /// @param grammar_str The production rules for the grammar, encoded as a string. Returns an empty grammar if empty. Returns NULL if parsing of grammar_str fails.
# /// @param grammar_root The name of the start symbol for the grammar.
# LLAMA_API struct llama_sampler * llama_sampler_init_grammar(
# const struct llama_vocab * vocab,
# const char * grammar_str,
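Finally, a hedged sketch of how the new tensor buffer-type override plumbing might be wired up from Python. The tensor-name pattern is made up for illustration, and `buft` is left NULL because obtaining a real `ggml_backend_buffer_type_t` handle is outside the scope of this diff:

```python
import ctypes
import llama_cpp.llama_cpp as llama_cpp

# Build a NULL-terminated override list; the zero-initialized last element
# (pattern == NULL) acts as the terminator, as described in the header comment.
overrides = (llama_cpp.llama_model_tensor_buft_override * 2)()
overrides[0].pattern = b"blk\\..*\\.ffn_.*"  # hypothetical tensor-name regex
overrides[0].buft = None                     # placeholder for a ggml_backend_buffer_type_t

params = llama_cpp.llama_model_default_params()
params.tensor_buft_overrides = ctypes.cast(
    overrides, llama_cpp.llama_model_tensor_buft_override_p
)
# Keep `overrides` alive for as long as `params` (and any model loaded from
# it) is in use; ctypes does not retain the array automatically here.
```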
2 changes: 1 addition & 1 deletion vendor/llama.cpp