
Commit f74b90e

Fix streaming hang on last token when cache is on.
1 parent 5be8354 commit f74b90e


llama_cpp/llama.py

Lines changed: 9 additions & 5 deletions
@@ -848,11 +848,6 @@ def _create_completion(
                 finish_reason = "length"
                 break
 
-        if self.cache:
-            if self.verbose:
-                print("Llama._create_completion: cache save", file=sys.stderr)
-            self.cache[prompt_tokens + completion_tokens] = self.save_state()
-
         if self.verbose:
             llama_cpp.llama_print_timings(self.ctx)
 
@@ -941,8 +936,17 @@ def _create_completion(
                     }
                 ],
             }
+            if self.cache:
+                if self.verbose:
+                    print("Llama._create_completion: cache save", file=sys.stderr)
+                self.cache[prompt_tokens + completion_tokens] = self.save_state()
             return
 
+        if self.cache:
+            if self.verbose:
+                print("Llama._create_completion: cache save", file=sys.stderr)
+            self.cache[prompt_tokens + completion_tokens] = self.save_state()
+
         text_str = text.decode("utf-8", errors="ignore")
 
         if echo:
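The change moves the cache save out of the shared post-loop location and into each return path: in streaming mode the state is now saved after the final chunk has been yielded (just before the early return), and in non-streaming mode just before the text is decoded. Below is a minimal sketch of the scenario this affects, assuming the LlamaCache / set_cache API available around this release; the model path and prompt are placeholders, not part of the commit.

# Minimal sketch of streaming with the cache enabled (the case this commit fixes).
# Assumes the LlamaCache / set_cache API of this era; model path and prompt are placeholders.
from llama_cpp import Llama, LlamaCache

llm = Llama(model_path="./models/ggml-model.bin")  # placeholder model path
llm.set_cache(LlamaCache())  # enables self.cache in _create_completion

# Before this commit, save_state() ran right after the token loop, before the
# final streamed chunk was yielded, so the stream appeared to hang on the last
# token. After the commit, the save happens after the last chunk is yielded.
for chunk in llm.create_completion(
    "Q: Name the planets in the solar system. A:",
    max_tokens=32,
    stream=True,
):
    print(chunk["choices"][0]["text"], end="", flush=True)
print()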
