Update API to match latest llama.cpp version #1991

Open · wants to merge 3 commits into base: main
29 changes: 25 additions & 4 deletions llama_cpp/_internals.py
@@ -13,6 +13,11 @@
from dataclasses import dataclass, field
from contextlib import ExitStack

try:
    from warnings import deprecated
except ImportError:
    from ._utils import deprecated

import numpy as np
import numpy.typing as npt

@@ -276,21 +281,37 @@ def n_ctx(self) -> int:
    def pooling_type(self) -> int:
        return llama_cpp.llama_pooling_type(self.ctx)

    @deprecated("Use kv_self_clear")
    def kv_cache_clear(self):
        llama_cpp.llama_kv_cache_clear(self.ctx)
        self.kv_self_clear()

    @deprecated("Use kv_self_seq_rm")
    def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int):
        llama_cpp.llama_kv_cache_seq_rm(self.ctx, seq_id, p0, p1)
        self.kv_self_seq_rm(seq_id, p0, p1)

    @deprecated("Use kv_self_seq_cp")
    def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int):
        llama_cpp.llama_kv_cache_seq_cp(self.ctx, seq_id_src, seq_id_dst, p0, p1)
        self.kv_self_seq_cp(seq_id_src, seq_id_dst, p0, p1)

    @deprecated("Use kv_self_seq_keep")
    def kv_cache_seq_keep(self, seq_id: int):
        llama_cpp.llama_kv_cache_seq_keep(self.ctx, seq_id)
        self.kv_self_seq_keep(seq_id)

    def kv_cache_seq_shift(self, seq_id: int, p0: int, p1: int, shift: int):
        llama_cpp.llama_kv_cache_seq_add(self.ctx, seq_id, p0, p1, shift)

    def kv_self_clear(self):
        llama_cpp.llama_kv_self_clear(self.ctx)

    def kv_self_seq_rm(self, seq_id: int, p0: int, p1: int):
        llama_cpp.llama_kv_self_seq_rm(self.ctx, seq_id, p0, p1)

    def kv_self_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int):
        llama_cpp.llama_kv_self_seq_cp(self.ctx, seq_id_src, seq_id_dst, p0, p1)

    def kv_self_seq_keep(self, seq_id: int):
        llama_cpp.llama_kv_self_seq_keep(self.ctx, seq_id)

    def get_state_size(self) -> int:
        return llama_cpp.llama_get_state_size(self.ctx)

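For orientation, here is a minimal usage sketch of the renamed internal wrappers. It assumes `ctx` is an already-constructed instance of the internal context wrapper in `llama_cpp._internals` (construction is not shown in this diff), so treat it as an illustration rather than part of the change:

```python
import warnings

def reset_kv(ctx) -> None:
    """Illustrative helper: clear the KV cache through the renamed wrappers."""
    # The old spelling still works, but now emits a DeprecationWarning
    # ("Use kv_self_clear") before forwarding to the new method.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", DeprecationWarning)
        ctx.kv_cache_clear()

    # Preferred spellings after this PR; they call the llama_kv_self_* C
    # functions directly.
    ctx.kv_self_clear()
    ctx.kv_self_seq_keep(0)       # drop every sequence except sequence 0
    ctx.kv_self_seq_rm(0, 0, -1)  # then remove sequence 0 over [0, inf)
```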
16 changes: 16 additions & 0 deletions llama_cpp/_utils.py
@@ -1,5 +1,7 @@
import os
import sys
import warnings
import functools

from typing import Any, Dict

@@ -76,3 +78,17 @@ class Singleton(object, metaclass=MetaSingleton):

    def __init__(self):
        super(Singleton, self).__init__()


def deprecated(reason):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            warnings.warn(
                f"Call to deprecated function {func.__name__} ({reason}).",
                category=DeprecationWarning,
                stacklevel=2,
            )
            return func(*args, **kwargs)
        return wrapper
    return decorator
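As a quick behavioural check of the shim above (on Python 3.13+ the stdlib `warnings.deprecated` would be picked up instead by the `try`/`except` import in `_internals.py`), a self-contained sketch with a hypothetical `old_fn`:

```python
import warnings

from llama_cpp._utils import deprecated  # the shim added above

@deprecated("Use new_fn")
def old_fn(x: int) -> int:
    return x + 1

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    assert old_fn(1) == 2  # still calls through to the wrapped function

assert any(issubclass(w.category, DeprecationWarning) for w in caught)
assert old_fn.__name__ == "old_fn"  # functools.wraps preserves metadata
```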
125 changes: 92 additions & 33 deletions llama_cpp/llama_cpp.py
@@ -165,6 +165,10 @@
# llama_sampler_p = NewType("llama_sampler_p", int)
# llama_sampler_p_ctypes = ctypes.c_void_p

# struct llama_kv_cache;
llama_kv_cache_p = NewType("llama_kv_cache_p", int)
llama_kv_cache_p_ctypes = ctypes.c_void_p

# typedef int32_t llama_pos;
llama_pos = ctypes.c_int32
# typedef int32_t llama_token;
@@ -259,7 +263,9 @@
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28
LLAMA_VOCAB_PRE_TYPE_GPT4O = 29

LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32

# // note: these values should be synchronized with ggml_rope
# // TODO: maybe move this enum to ggml.h (ggml_rope_type)
@@ -630,10 +636,29 @@ class llama_model_kv_override(ctypes.Structure):
    value: Union[int, float, bool, bytes]



# struct llama_model_tensor_buft_override {
#     const char * pattern;
#     ggml_backend_buffer_type_t buft;
# };
class llama_model_tensor_buft_override(ctypes.Structure):
_fields_ = [
("pattern", ctypes.c_char_p),
("buft", ctypes.c_void_p)
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wasn't sure if c_void_p is the correct type to use here, so feel free to change it if there's a better alternative.

]


llama_model_tensor_buft_override_p = ctypes.POINTER(llama_model_tensor_buft_override)


# struct llama_model_params {
# // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
# ggml_backend_dev_t * devices;

# // NULL-terminated list of buffer types to use for tensors that match a pattern
# const struct llama_model_tensor_buft_override * tensor_buft_overrides;

# int32_t n_gpu_layers; // number of layers to store in VRAM
# enum llama_split_mode split_mode; // how to split the model across multiple GPUs

@@ -695,6 +720,7 @@ class llama_model_params(ctypes.Structure):

_fields_ = [
("devices", ctypes.c_void_p), # NOTE: unnused
("llama_model_tensor_buft_override", llama_model_tensor_buft_override_p),
("n_gpu_layers", ctypes.c_int32),
("split_mode", ctypes.c_int),
("main_gpu", ctypes.c_int32),
@@ -1316,6 +1342,10 @@ def llama_n_vocab(model: llama_vocab_p, /) -> int:
def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]:
...

# LLAMA_API struct llama_kv_cache * llama_get_kv_self ( struct llama_context * ctx);
@ctypes_function("llama_get_kv_self", [llama_context_p_ctypes], llama_model_p_ctypes)
def llama_get_kv_self(ctx: llama_context_p, /) -> Optional[llama_kv_cache_p]:
...

# LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
@ctypes_function("llama_pooling_type", [llama_context_p_ctypes], ctypes.c_int)
@@ -1810,7 +1840,19 @@ def llama_kv_cache_view_update(ctx: llama_context_p, view: CtypesPointerOrRef[ll

# // Returns the number of tokens in the KV cache (slow, use only for debug)
# // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
# LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
# LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);
@ctypes_function(
"llama_kv_self_n_tokens", [llama_context_p_ctypes], ctypes.c_int32
)
def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int:
"""Returns the number of tokens in the KV cache (slow, use only for debug)
If a KV cell has multiple sequences assigned to it, it will be counted multiple times
"""
...

# // Returns the number of tokens in the KV cache (slow, use only for debug)
# // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
# DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx), "use llama_kv_self_n_tokens instead");
@ctypes_function(
"llama_get_kv_cache_token_count", [llama_context_p_ctypes], ctypes.c_int32
)
@@ -1832,10 +1874,10 @@ def llama_get_kv_cache_used_cells(ctx: llama_context_p, /) -> int:


# // Clear the KV cache - both cell info is erased and KV data is zeroed
# LLAMA_API void llama_kv_cache_clear(
# LLAMA_API void llama_kv_self_clear(
# struct llama_context * ctx);
@ctypes_function("llama_kv_cache_clear", [llama_context_p_ctypes], None)
def llama_kv_cache_clear(ctx: llama_context_p, /):
@ctypes_function("llama_kv_self_clear", [llama_context_p_ctypes], None)
def llama_kv_self_clear(ctx: llama_context_p, /):
"""Clear the KV cache"""
...

@@ -1845,13 +1887,13 @@ def llama_kv_cache_clear(ctx: llama_context_p, /):
# // seq_id < 0 : match any sequence
# // p0 < 0 : [0, p1]
# // p1 < 0 : [p0, inf)
# LLAMA_API bool llama_kv_cache_seq_rm(
# LLAMA_API bool llama_kv_self_seq_rm(
# struct llama_context * ctx,
# llama_seq_id seq_id,
# llama_pos p0,
# llama_pos p1);
@ctypes_function(
"llama_kv_cache_seq_rm",
"llama_kv_self_seq_rm",
[
llama_context_p_ctypes,
llama_seq_id,
@@ -1860,7 +1902,7 @@ def llama_kv_cache_clear(ctx: llama_context_p, /):
],
ctypes.c_bool,
)
def llama_kv_cache_seq_rm(
def llama_kv_self_seq_rm(
ctx: llama_context_p,
seq_id: Union[llama_seq_id, int],
p0: Union[llama_pos, int],
@@ -1881,14 +1923,14 @@ def llama_kv_cache_seq_rm(
# // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
# // p0 < 0 : [0, p1]
# // p1 < 0 : [p0, inf)
# LLAMA_API void llama_kv_cache_seq_cp(
# LLAMA_API void llama_kv_self_seq_cp(
# struct llama_context * ctx,
# llama_seq_id seq_id_src,
# llama_seq_id seq_id_dst,
# llama_pos p0,
# llama_pos p1);
@ctypes_function(
"llama_kv_cache_seq_cp",
"llama_kv_self_seq_cp",
[
llama_context_p_ctypes,
llama_seq_id,
@@ -1898,7 +1940,7 @@ def llama_kv_cache_seq_rm(
],
None,
)
def llama_kv_cache_seq_cp(
def llama_kv_self_seq_cp(
ctx: llama_context_p,
seq_id_src: Union[llama_seq_id, int],
seq_id_dst: Union[llama_seq_id, int],
@@ -1914,31 +1956,31 @@ def llama_kv_cache_seq_cp(


# // Removes all tokens that do not belong to the specified sequence
# LLAMA_API void llama_kv_cache_seq_keep(
# LLAMA_API void llama_kv_self_seq_keep(
# struct llama_context * ctx,
# llama_seq_id seq_id);
@ctypes_function(
"llama_kv_cache_seq_keep", [llama_context_p_ctypes, llama_seq_id], None
"llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None
)
def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /):
def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /):
"""Removes all tokens that do not belong to the specified sequence"""
...


# // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
# // If the KV cache is RoPEd, the KV data is updated accordingly:
# // - lazily on next llama_decode()
# // - explicitly with llama_kv_cache_update()
# // - explicitly with llama_kv_self_update()
# // p0 < 0 : [0, p1]
# // p1 < 0 : [p0, inf)
# LLAMA_API void llama_kv_cache_seq_add(
# LLAMA_API void llama_kv_self_seq_add(
# struct llama_context * ctx,
# llama_seq_id seq_id,
# llama_pos p0,
# llama_pos p1,
# llama_pos delta);
@ctypes_function(
"llama_kv_cache_seq_add",
"llama_kv_self_seq_add",
[
llama_context_p_ctypes,
llama_seq_id,
@@ -1948,7 +1990,7 @@ def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, in
],
None,
)
def llama_kv_cache_seq_add(
def llama_kv_self_seq_add(
ctx: llama_context_p,
seq_id: Union[llama_seq_id, int],
p0: Union[llama_pos, int],
@@ -1959,7 +2001,7 @@ def llama_kv_cache_seq_add(
"""Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
If the KV cache is RoPEd, the KV data is updated accordingly:
- lazily on next llama_decode()
- explicitly with llama_kv_cache_update()
- explicitly with llama_kv_self_update()
p0 < 0 : [0, p1]
p1 < 0 : [p0, inf)"""
...
@@ -1969,14 +2011,14 @@ def llama_kv_cache_seq_add(
# // If the KV cache is RoPEd, the KV data is updated accordingly
# // p0 < 0 : [0, p1]
# // p1 < 0 : [p0, inf)
# LLAMA_API void llama_kv_cache_seq_div(
# LLAMA_API void llama_kv_self_seq_div(
# struct llama_context * ctx,
# llama_seq_id seq_id,
# llama_pos p0,
# llama_pos p1,
# int d);
@ctypes_function(
"llama_kv_cache_seq_div",
"llama_kv_self_seq_div",
[
llama_context_p_ctypes,
llama_seq_id,
@@ -1986,7 +2028,7 @@ def llama_kv_cache_seq_add(
],
None,
)
def llama_kv_cache_seq_div(
def llama_kv_self_seq_div(
ctx: llama_context_p,
seq_id: Union[llama_seq_id, int],
p0: Union[llama_pos, int],
@@ -2004,29 +2046,29 @@ def llama_kv_cache_seq_div(
# // Defragment the KV cache
# // This will be applied:
# // - lazily on next llama_decode()
# // - explicitly with llama_kv_cache_update()
# LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
@ctypes_function("llama_kv_cache_defrag", [llama_context_p_ctypes], None)
def llama_kv_cache_defrag(ctx: llama_context_p, /):
# // - explicitly with llama_kv_self_update()
# LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx);
@ctypes_function("llama_kv_self_defrag", [llama_context_p_ctypes], None)
def llama_kv_self_defrag(ctx: llama_context_p, /):
"""Defragment the KV cache
This will be applied:
- lazily on next llama_decode()
- explicitly with llama_kv_cache_update()"""
- explicitly with llama_kv_self_update()"""
...


# // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
# LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
@ctypes_function("llama_kv_cache_update", [llama_context_p_ctypes], None)
def llama_kv_cache_update(ctx: llama_context_p, /):
# LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
@ctypes_function("llama_kv_self_update", [llama_context_p_ctypes], None)
def llama_kv_self_update(ctx: llama_context_p, /):
"""Apply the KV cache updates (such as K-shifts, defragmentation, etc.)"""
...


# // Check if the context supports KV cache shifting
# LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx);
@ctypes_function("llama_kv_cache_can_shift", [llama_context_p_ctypes], ctypes.c_bool)
def llama_kv_cache_can_shift(ctx: llama_context_p, /) -> bool:
# LLAMA_API bool llama_kv_self_can_shift(struct llama_context * ctx);
@ctypes_function("llama_kv_self_can_shift", [llama_context_p_ctypes], ctypes.c_bool)
def llama_kv_self_can_shift(ctx: llama_context_p, /) -> bool:
"""Check if the context supports KV cache shifting"""
...
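Putting the renamed KV-cache bindings together, a hedged ctypes-level sketch; the `ctx` handle is assumed to come from the usual context-creation path, which is not part of this diff:

```python
import llama_cpp.llama_cpp as llama_cpp

def tidy_kv_cache(ctx: llama_cpp.llama_context_p, shift: int = 0) -> int:
    """Illustrative only: exercise the renamed llama_kv_self_* bindings."""
    # Optionally add a relative position shift to sequence 0, if supported.
    if shift and llama_cpp.llama_kv_self_can_shift(ctx):       # was llama_kv_cache_can_shift
        llama_cpp.llama_kv_self_seq_add(ctx, 0, 0, -1, shift)  # was llama_kv_cache_seq_add

    # Schedule a defragmentation and apply pending updates (K-shifts, defrag, ...).
    llama_cpp.llama_kv_self_defrag(ctx)  # was llama_kv_cache_defrag
    llama_cpp.llama_kv_self_update(ctx)  # was llama_kv_cache_update

    # Slow, debug-only token count.
    return llama_cpp.llama_kv_self_n_tokens(ctx)  # was llama_get_kv_cache_token_count
```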

@@ -2547,6 +2589,16 @@ def llama_set_causal_attn(ctx: llama_context_p, causal_attn: bool, /):
...


# // Set whether the model is in warmup mode or not
# // If true, all model tensors are activated during llama_decode() to load and cache their weights.
# LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup);
@ctypes_function("llama_set_warmup", [llama_context_p_ctypes, ctypes.c_bool], None)
def llama_set_warmup(ctx: llama_context_p, warmup: bool, /):
"""Set whether to use causal attention or not
If set to true, the model will only attend to the past tokens"""
...


# // Set abort callback
# LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
@ctypes_function(
@@ -3701,6 +3753,13 @@ def llama_sampler_init_mirostat_v2(
...





# /// @details Intializes a GBNF grammar, see grammars/README.md for details.
# /// @param vocab The vocabulary that this grammar will be used with.
# /// @param grammar_str The production rules for the grammar, encoded as a string. Returns an empty grammar if empty. Returns NULL if parsing of grammar_str fails.
# /// @param grammar_root The name of the start symbol for the grammar.
# LLAMA_API struct llama_sampler * llama_sampler_init_grammar(
# const struct llama_vocab * vocab,
# const char * grammar_str,
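Finally, a hedged sketch of how the new tensor buffer-type override plumbing might be wired up from Python. The tensor-name pattern is made up for illustration, and `buft` is left NULL because obtaining a real `ggml_backend_buffer_type_t` handle is outside the scope of this diff:

```python
import ctypes
import llama_cpp.llama_cpp as llama_cpp

# Build a NULL-terminated override list; the zero-initialized last element
# (pattern == NULL) acts as the terminator, as described in the header comment.
overrides = (llama_cpp.llama_model_tensor_buft_override * 2)()
overrides[0].pattern = b"blk\\..*\\.ffn_.*"  # hypothetical tensor-name regex
overrides[0].buft = None                     # placeholder for a ggml_backend_buffer_type_t

params = llama_cpp.llama_model_default_params()
params.tensor_buft_overrides = ctypes.cast(
    overrides, llama_cpp.llama_model_tensor_buft_override_p
)
# Keep `overrides` alive for as long as `params` (and any model loaded from
# it) is in use; ctypes does not retain the array automatically here.
```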
2 changes: 1 addition & 1 deletion vendor/llama.cpp