
Commit cdf5976: Update llama.cpp
Parent: 7a536e8

File tree (4 files changed, +16 / -5 lines):

  llama_cpp/llama.py
  llama_cpp/llama_cpp.py
  llama_cpp/server/app.py
  vendor/llama.cpp

llama_cpp/llama.py

Lines changed: 4 additions & 0 deletions
@@ -83,6 +83,7 @@ def __init__(
         # NOTE: These parameters are likely to change in the future.
         n_ctx: int = 512,
         n_parts: int = -1,
+        n_gpu_layers: int = 0,
         seed: int = 1337,
         f16_kv: bool = True,
         logits_all: bool = False,
@@ -129,6 +130,7 @@ def __init__(
         self.params = llama_cpp.llama_context_default_params()
         self.params.n_ctx = n_ctx
         self.params.n_parts = n_parts
+        self.params.n_gpu_layers = n_gpu_layers
         self.params.seed = seed
         self.params.f16_kv = f16_kv
         self.params.logits_all = logits_all
@@ -1081,6 +1083,7 @@ def __getstate__(self):
             model_path=self.model_path,
             n_ctx=self.params.n_ctx,
             n_parts=self.params.n_parts,
+            n_gpu_layers=self.params.n_gpu_layers,
             seed=self.params.seed,
             f16_kv=self.params.f16_kv,
             logits_all=self.params.logits_all,
@@ -1100,6 +1103,7 @@ def __setstate__(self, state):
             model_path=state["model_path"],
             n_ctx=state["n_ctx"],
             n_parts=state["n_parts"],
+            n_gpu_layers=state["n_gpu_layers"],
             seed=state["seed"],
             f16_kv=state["f16_kv"],
             logits_all=state["logits_all"],
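
With this change, the high-level Llama constructor accepts an n_gpu_layers keyword and carries it through pickling. A minimal usage sketch (the model path is a hypothetical placeholder, and offloading only has an effect when the bundled llama.cpp was built with GPU support):

    from llama_cpp import Llama

    # Ask for the first 20 transformer layers to be kept in VRAM;
    # the remaining layers stay on the CPU.
    llm = Llama(model_path="./models/7B/ggml-model.bin", n_gpu_layers=20)

    # The value is stored on the underlying context params and is included in
    # __getstate__/__setstate__, so a pickled instance keeps its offload setting.
    print(llm.params.n_gpu_layers)  # -> 20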

llama_cpp/llama_cpp.py

Lines changed: 5 additions & 4 deletions
@@ -68,7 +68,7 @@ def _load_shared_library(lib_base_name: str):
 _lib = _load_shared_library(_lib_base_name)

 # C types
-LLAMA_FILE_VERSION = c_int(1)
+LLAMA_FILE_VERSION = c_int(2)
 LLAMA_FILE_MAGIC = b"ggjt"
 LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml"
 LLAMA_SESSION_MAGIC = b"ggsn"
@@ -109,6 +109,7 @@ class llama_context_params(Structure):
     _fields_ = [
         ("n_ctx", c_int),  # text context
         ("n_parts", c_int),  # -1 for default
+        ("n_gpu_layers", c_int),  # number of layers to store in VRAM
         ("seed", c_int),  # RNG seed, 0 for random
         ("f16_kv", c_bool),  # use fp16 for KV cache
         (
@@ -135,7 +136,7 @@ class llama_context_params(Structure):
 LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(
     4
 )  # tok_embeddings.weight and output.weight are F16
-LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5)  # except 1d tensors
+# LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5)  # except 1d tensors
 # LLAMA_FTYPE_MOSTYL_Q4_3 = c_int(6)  # except 1d tensors
 LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7)  # except 1d tensors
 LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8)  # except 1d tensors
@@ -259,9 +260,9 @@ def llama_get_state_size(ctx: llama_context_p) -> c_size_t:
 # Destination needs to have allocated enough memory.
 # Returns the number of bytes copied
 def llama_copy_state_data(
-    ctx: llama_context_p, dest  # type: Array[c_uint8]
+    ctx: llama_context_p, dst  # type: Array[c_uint8]
 ) -> int:
-    return _lib.llama_copy_state_data(ctx, dest)
+    return _lib.llama_copy_state_data(ctx, dst)


 _lib.llama_copy_state_data.argtypes = [llama_context_p, c_uint8_p]
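
Because llama_context_params is a ctypes Structure, the new ("n_gpu_layers", c_int) entry must sit in _fields_ at the same position as in the C struct, which is why it is inserted between n_parts and seed. A low-level sketch of setting it directly through the bindings (model path hypothetical; assuming the llama_init_from_file / llama_free wrappers defined elsewhere in this module):

    from llama_cpp import llama_cpp

    # Start from the library defaults and override the new field.
    params = llama_cpp.llama_context_default_params()
    params.n_gpu_layers = 20  # number of layers to store in VRAM

    ctx = llama_cpp.llama_init_from_file(b"./models/7B/ggml-model.bin", params)
    # ... use the context ...
    llama_cpp.llama_free(ctx)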

llama_cpp/server/app.py

Lines changed: 6 additions & 0 deletions
@@ -17,6 +17,11 @@ class Settings(BaseSettings):
         description="The path to the model to use for generating completions."
     )
     n_ctx: int = Field(default=2048, ge=1, description="The context size.")
+    n_gpu_layers: int = Field(
+        default=0,
+        ge=0,
+        description="The number of layers to put on the GPU. The rest will be on the CPU.",
+    )
     n_batch: int = Field(
         default=512, ge=1, description="The batch size to use per eval."
     )
@@ -80,6 +85,7 @@ def create_app(settings: Optional[Settings] = None):
     global llama
     llama = llama_cpp.Llama(
         model_path=settings.model,
+        n_gpu_layers=settings.n_gpu_layers,
         f16_kv=settings.f16_kv,
         use_mlock=settings.use_mlock,
         use_mmap=settings.use_mmap,
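
On the server side, the new setting flows straight from Settings into the Llama constructor. A sketch of launching the server with offload enabled from Python (uvicorn is assumed as the ASGI server; the model path is a placeholder). Because Settings is a pydantic BaseSettings, the same value can typically also be supplied through an environment variable such as N_GPU_LAYERS:

    import uvicorn
    from llama_cpp.server.app import Settings, create_app

    settings = Settings(model="./models/7B/ggml-model.bin", n_gpu_layers=20)
    app = create_app(settings=settings)

    uvicorn.run(app, host="localhost", port=8000)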

vendor/llama.cpp (git submodule pointer updated)
