Commit 16b5b27

Merge branch 'main' into notebook-fixes
2 parents 853d9b1 + 5528c2c commit 16b5b27

File tree

9 files changed: +2227 −52 lines changed

docs/examples/streaming.ipynb

Lines changed: 1230 additions & 0 deletions
Large diffs are not rendered by default.

guardrails/llm_providers.py

Lines changed: 31 additions & 5 deletions
@@ -1,4 +1,4 @@
-from typing import Any, Awaitable, Callable, Dict, List, Optional, cast
+from typing import Any, Awaitable, Callable, Dict, Iterable, List, Optional, cast
 
 from pydantic import BaseModel
 
@@ -166,6 +166,7 @@ def _invoke_llm(
             api_key = kwargs.pop("api_key")
         else:
             api_key = None
+
         client = OpenAIClient(api_key=api_key)
         return client.create_chat_completion(
             model=model,
@@ -256,9 +257,34 @@ def _invoke_llm(self, *args, **kwargs) -> LLMResponse:
             )
         ```
         """
-        return LLMResponse(
-            output=self.llm_api(*args, **kwargs),
-        )
+        # Get the response from the callable.
+        # The LLM response should either be a
+        # string or a generator object of strings.
+        llm_response = self.llm_api(*args, **kwargs)
+
+        # Check if the stream kwarg was passed in.
+        if kwargs.get("stream", None) in [None, False]:
+            # If stream is not defined or is set to False,
+            # return the default behavior.
+            # Strongly type the response as a string.
+            llm_response = cast(str, llm_response)
+            return LLMResponse(
+                output=llm_response,
+            )
+        else:
+            # If stream is defined and set to True,
+            # the callable returns a generator object.
+            complete_output = ""
+
+            # Strongly type the response as an iterable of strings.
+            llm_response = cast(Iterable[str], llm_response)
+            for response in llm_response:
+                complete_output += response
+
+            # Return the LLMResponse.
+            return LLMResponse(
+                output=complete_output,
+            )
 
 
 def get_llm_ask(llm_api: Callable, *args, **kwargs) -> PromptCallableBase:
@@ -405,6 +431,7 @@ async def invoke_llm(
             api_key = kwargs.pop("api_key")
         else:
             api_key = None
+
         aclient = AsyncOpenAIClient(api_key=api_key)
         return await aclient.create_chat_completion(
             model=model,
@@ -481,7 +508,6 @@ async def invoke_llm(self, *args, **kwargs) -> LLMResponse:
 def get_async_llm_ask(
     llm_api: Callable[[Any], Awaitable[Any]], *args, **kwargs
 ) -> AsyncPromptCallableBase:
-
     # these only work with openai v0 (None otherwise)
     if llm_api == get_static_openai_acreate_func():
         return AsyncOpenAICallable(*args, **kwargs)
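
Taken together, the llm_providers.py changes let an arbitrary user-supplied LLM callable stream its output: when stream=True is passed, the callable is expected to return a generator of string chunks, which _invoke_llm now concatenates into a single LLMResponse. Below is a minimal standalone sketch of that behavior; invoke_arbitrary_callable and fake_streaming_llm are illustrative stand-ins for this example, not part of the diff.

from typing import Callable, Iterable, Union, cast

def invoke_arbitrary_callable(llm_api: Callable, *args, **kwargs) -> str:
    # Mirrors the merged _invoke_llm logic: the callable may return either
    # a plain string or, when stream=True, a generator of string chunks.
    llm_response: Union[str, Iterable[str]] = llm_api(*args, **kwargs)

    if kwargs.get("stream", None) in [None, False]:
        # Non-streaming: the callable returned the full output as one string.
        return cast(str, llm_response)

    # Streaming: accumulate the chunks into one complete output string.
    complete_output = ""
    for chunk in cast(Iterable[str], llm_response):
        complete_output += chunk
    return complete_output

def fake_streaming_llm(prompt: str, stream: bool = False):
    # Hypothetical stand-in for a user-supplied LLM callable.
    chunks = ["Hello", ", ", "world", "!"]
    return (c for c in chunks) if stream else "".join(chunks)

print(invoke_arbitrary_callable(fake_streaming_llm, "Hi", stream=False))  # Hello, world!
print(invoke_arbitrary_callable(fake_streaming_llm, "Hi", stream=True))   # Hello, world!

Either way the caller receives the complete output; the streaming branch simply rebuilds it from the emitted chunks.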
Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
+from typing import Dict, List
+
+import tiktoken
+
+
+def num_tokens_from_string(text: str, model_name: str) -> int:
+    """Returns the number of tokens in a text string.
+
+    Supported for OpenAI models only. This is a helper function
+    that is required when OpenAI's `stream` parameter is set to `True`,
+    because OpenAI does not return the number of tokens in that case.
+    Requires the `tiktoken` package to be installed.
+
+    Args:
+        text (str): The text string to count the number of tokens in.
+        model_name (str): The name of the OpenAI model to use.
+
+    Returns:
+        num_tokens (int): The number of tokens in the text string.
+    """
+    encoding = tiktoken.encoding_for_model(model_name)
+    num_tokens = len(encoding.encode(text))
+    return num_tokens
+
+
+def num_tokens_from_messages(
+    messages: List[Dict[str, str]], model: str = "gpt-3.5-turbo-0613"
+) -> int:
+    """Return the number of tokens used by a list of messages."""
+    try:
+        encoding = tiktoken.encoding_for_model(model)
+    except KeyError:
+        print("Warning: model not found. Using cl100k_base encoding.")
+        encoding = tiktoken.get_encoding("cl100k_base")
+    if model in {
+        "gpt-3.5-turbo-0613",
+        "gpt-3.5-turbo-16k-0613",
+        "gpt-4-0314",
+        "gpt-4-32k-0314",
+        "gpt-4-0613",
+        "gpt-4-32k-0613",
+    }:
+        tokens_per_message = 3
+        tokens_per_name = 1
+    elif model == "gpt-3.5-turbo-0301":
+        tokens_per_message = (
+            4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
+        )
+        tokens_per_name = -1  # if there's a name, the role is omitted
+    elif "gpt-3.5-turbo" in model:
+        print(
+            """Warning: gpt-3.5-turbo may update over time.
+            Returning num tokens assuming gpt-3.5-turbo-0613."""
+        )
+        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613")
+    elif "gpt-4" in model:
+        print(
+            """Warning: gpt-4 may update over time.
+            Returning num tokens assuming gpt-4-0613."""
+        )
+        return num_tokens_from_messages(messages, model="gpt-4-0613")
+    else:
+        raise NotImplementedError(
+            f"""num_tokens_from_messages() is not implemented for model {model}.
+            See https://github.com/openai/openai-python/blob/main/chatml.md for
+            information on how messages are converted to tokens."""
+        )
+
+    num_tokens = 0
+    for message in messages:
+        num_tokens += tokens_per_message
+        for key, value in message.items():
+            num_tokens += len(encoding.encode(value))
+            if key == "name":
+                num_tokens += tokens_per_name
+
+    # every reply is primed with <|start|>assistant<|message|>
+    num_tokens += 3
+    return num_tokens
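
A short usage sketch of the two new helpers above, assuming they are in scope (the module path for this new file is not shown in the rendered diff) and that tiktoken is installed; the prompt and messages are illustrative. This mirrors the docstring's motivation: with stream=True, OpenAI does not report token usage, so counts are recomputed locally.

prompt = "Tell me something interesting about token counting."

# Count tokens in a bare string for a given OpenAI model.
print(num_tokens_from_string(prompt, model_name="gpt-3.5-turbo"))

# Count tokens for a chat-style message list, including per-message overhead.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt},
]
print(num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613"))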
