
feat(containers): experimentation with hugging face models #84


Open · wants to merge 11 commits into `main`
1 change: 1 addition & 0 deletions README.md
@@ -71,6 +71,7 @@ Table of Contents:
| **[Python S3 upload](containers/python-s3-upload/README.md)** <br/> A Python + Flask HTTP server that receives file uploads and writes them to S3. | Python | [Terraform] |
| **[Terraform NGINX hello world](containers/terraform-nginx-hello-world/README.md)** <br/> A minimal example running the base NGINX image in a serverless container deployed with Terraform. | N/A | [Terraform] |
| **[Triggers with Terraform](containers/terraform-triggers/README.md)** <br/> Configuring two SQS triggers, used to trigger two containers, one public, one private. | N/A | [Terraform] |
| **[Inference with Hugging Face Models](containers/hugging-face-inference/README.md)** <br/> An experiment deploying and benchmarking lightweight Hugging Face models in Serverless Containers. | N/A | [Terraform] |

### ⚙️ Jobs

23 changes: 23 additions & 0 deletions containers/hugging-face-inference/Dockerfile
@@ -0,0 +1,23 @@
FROM python:3.12-slim-bookworm

ARG MODEL_DOWNLOAD_SOURCE

RUN apt-get update && apt-get install -y wget

WORKDIR /app

RUN pip install --upgrade pip

COPY requirements.txt .

RUN pip install -r requirements.txt

RUN pip install llama-cpp-python==0.2.62 \
    --no-cache-dir \
    --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu

> **Contributor:** nit: you can also include it in `requirements.txt` directly:
>
>     flask==3.0.5
>     ...
>     --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
>     llama-cpp-python==0.2.62

RUN wget $MODEL_DOWNLOAD_SOURCE

COPY main.py .

CMD ["uvicorn", "main:app", "--proxy-headers", "--host", "0.0.0.0", "--port", "80"]
33 changes: 33 additions & 0 deletions containers/hugging-face-inference/README.md
@@ -0,0 +1,33 @@
# Hugging Face Models

> **Contributor:** issue(docs): this README has a lot of good technical detail, but no high-level explanation of what the example does. We need to explain what the example does, what SCW resources it uses, and link to Hugging Face and the models used (and any interesting Python libraries we use too).

### Deploy models in Serverless Containers
> **Contributor:** issue(structure): our examples should all use the standard README format included in the top-level of the repo.


- Export the following environment variables:

```bash
export SCW_ACCESS_KEY="access-key" SCW_SECRET_KEY="secret-key" SCW_PROJECT_ID="project-id" REGION="fr-par"
```

> **Contributor:** suggestion(tfvars): make these Terraform variables and give `region` a default of `fr-par`.
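A minimal sketch of that suggestion, assuming a `variables.tf` alongside the other Terraform files (the project already reads `var.region` in `providers.tf` and `images.tf`):

```hcl
# Sketch only: declare region as a Terraform variable with a default,
# so it no longer has to be exported in the shell.
variable "region" {
  type    = string
  default = "fr-par"
}
```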

- Add or remove Hugging Face models (with the `.gguf` extension) in the `terraform/hf-models.json` file.

- Run the script to deploy multiple Hugging Face models using Terraform workspaces:

```bash
cd terraform && bash terraform.sh -a
```

> **Contributor:** issue(docs): can you add a section on how to call one of the inference endpoints?
>
> If you add the endpoints as a Terraform output, you can write a command that you can copy-paste using `terraform output`. See the other examples on how to do this.
>
> There should be a command to call the "hello" endpoint to check the containers are working, then ideally a command for how to get an inference result.
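A sketch of how an endpoint output could look (the output name is an assumption), for example in `container.tf`:

```hcl
# Expose the deployed container endpoint so it can be copy-pasted or scripted against.
output "endpoint" {
  value = "https://${scaleway_container.inference-hugging-face.domain_name}"
}
```

The "hello" route could then be checked with something like `curl $(terraform output -raw endpoint)`, and an inference request sent with `curl -X POST $(terraform output -raw endpoint) -H "Content-Type: application/json" -d '{"content": "Hello"}'`, matching the `Message` model in `main.py`.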

### Benchmark models

Check in the console that your models were deployed, copy the container endpoints into the `terraform/hf-models.json` file, then run the following command:

```bash
python benchmark-models.py
```

This generates a box plot to analyze response time per model family, and a `.csv` file containing the textual responses of each model.

### Destroy Terraform resources for all models

```bash
bash terraform.sh -d
```
46 changes: 46 additions & 0 deletions containers/hugging-face-inference/main.py
@@ -0,0 +1,46 @@
import os

from fastapi import FastAPI
from llama_cpp import Llama
from pydantic import BaseModel


class Message(BaseModel):
content: str


MODEL_FILE_NAME = os.environ["MODEL_FILE_NAME"]

app = FastAPI()

print("loading model starts", flush=True)
> **Contributor:** nit: an alternative to adding `flush=True` to every print statement is to pass `PYTHONUNBUFFERED=1` as an env var (or even set it in the Dockerfile) to avoid buffering stdout.

llm = Llama(model_path=MODEL_FILE_NAME)

print("loading model successfully ends", flush=True)


@app.get("/")
def hello():
"""Get info of inference server"""

return {
"message": "Hello, this is the inference server! Serving model {model_name}".format(
model_name=MODEL_FILE_NAME
)
}


@app.post("/")
def infer(message: Message):
"""Post a message and receive a response from inference server"""

print("inference endpoint is called", flush=True)

output = llm(prompt=message.content, max_tokens=200)

print("output is successfully inferred", flush=True)

print(output, flush=True)

return output
2 changes: 2 additions & 0 deletions containers/hugging-face-inference/requirements.txt
@@ -0,0 +1,2 @@
fastapi==0.104.1
uvicorn==0.24.0.post1
102 changes: 102 additions & 0 deletions containers/hugging-face-inference/terraform/benchmark-models.py
@@ -0,0 +1,102 @@
import csv
import json

import matplotlib.pyplot as plt
import pandas
import requests


class Benchmark:
_model_families = ["llama", "mistral", "phi"]
_endpoints = {}

def __init__(
self, models_file: str, benchmark_file: str, results_figure: str, message: str
) -> None:
self.models_file = models_file
self.benchmark_file = benchmark_file
self.message = message
self.results_figure = results_figure

def get_container_endpoints_from_json_file(self) -> None:
if self.models_file == "":
raise Exception("file name is empty")

with open(self.models_file, "r") as models_file:
json_data = json.load(models_file)

for family in self._model_families:
self._endpoints[family] = []
for model in json_data[family]:
self._endpoints[family].append(
{"model": model["file"], "endpoint": model["ctn_endpoint"]}
)

def analyze_results(self) -> None:
benchmark_results = pandas.read_csv(self.benchmark_file)
benchmark_results.boxplot(column="Total Response Time", by="Family").plot()
plt.ylabel("Total Response Time in seconds")
plt.savefig(self.results_figure)

def benchmark_models(self, num_samples: int) -> None:
self.get_container_endpoints_from_json_file()

fields = ["Model", "Family", "Total Response Time", "Response Message"]
benchmark_data = []

for family in self._model_families:
for endpoint in self._endpoints[family]:
if endpoint["endpoint"] == "":
raise Exception("model endpoint is empty")

for _ in range(num_samples):
try:
print(
"Calling model {model} on endpoint {endpoint} with message {message}".format(
> **Contributor:** issue(syntax): use f-strings by default.

model=endpoint["model"],
endpoint=endpoint["endpoint"],
message=self.message,
)
)

rsp = requests.post(
endpoint["endpoint"], json={"message": self.message}
)

response_text = rsp.json()["choices"][0]["text"]

print(
"The model {model} responded with: {response_text}".format(
model=endpoint["model"], response_text=response_text
)
)

benchmark_data.append(
[
endpoint["model"],
family,
rsp.elapsed.total_seconds(),
response_text,
]
)
except Exception as exc:
    print(f"request to {endpoint['endpoint']} failed: {exc}")

with open(self.benchmark_file, "w") as results_file:
wrt = csv.writer(results_file)
wrt.writerow(fields)
wrt.writerows(benchmark_data)

self.analyze_results()


if __name__ == "__main__":

benchmark = Benchmark(
models_file="hf-models.json",
benchmark_file="benchmark-results.csv",
results_figure="results-plot.png",
message="What the difference between an elephant and an ant?",
)

benchmark.benchmark_models(num_samples=50)
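On the f-string comment above, a minimal sketch of what the logging call in `benchmark_models()` could look like (the values below are illustrative):

```python
# f-string equivalent of the .format() call flagged above
endpoint = {"model": "llama-2-7b.Q2_K.gguf", "endpoint": "https://example-endpoint"}
message = "What is the difference between an elephant and an ant?"

print(f"Calling model {endpoint['model']} on endpoint {endpoint['endpoint']} with message {message}")
```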
20 changes: 20 additions & 0 deletions containers/hugging-face-inference/terraform/container.tf
@@ -0,0 +1,20 @@
resource "scaleway_container_namespace" "main" {
name = "ifr-${lower(replace(var.hf_model_file_name, "/[.]|[_]/", "-"))}-${random_string.random_suffix.result}"
description = "Inference using Hugging Face models"
}

resource "scaleway_container" "inference-hugging-face" {
name = "inference"
description = "Inference serving API using a Hugging Face model"
namespace_id = scaleway_container_namespace.main.id
registry_image = docker_image.inference.name
environment_variables = {
"MODEL_FILE_NAME" = var.hf_model_file_name
}
port = 80
cpu_limit = 2240
memory_limit = 4096
min_scale = 1
max_scale = 1
deploy = true
}
46 changes: 46 additions & 0 deletions containers/hugging-face-inference/terraform/hf-models.json
@@ -0,0 +1,46 @@
{
"llama" : [
{
"file": "llama-2-7b.Q2_K.gguf",
"source" : "https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q2_K.gguf",
"size_gb": "2.83",
"ctn_endpoint": "paste container endpoint here"
> **Contributor:** issue(improvement): we can template this file with Terraform as part of the deployment. You can see an example of templating container URLs elsewhere in this repo, where we template a shell script.
},
{
"file": "llama-2-7b.Q3_K_L.gguf",
"source" : "https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q3_K_L.gguf",
"size_gb": "3.6",
"ctn_endpoint": "paste container endpoint here"
}
],

"mistral" : [
{
"file": "mistral-7b-instruct-v0.2.Q2_K.gguf",
"source" : "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q2_K.gguf",
"size_gb": "3.08",
"ctn_endpoint": "paste container endpoint here"
},
{
"file": "mistral-7b-instruct-v0.2.Q3_K_L.gguf",
"source" : "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q3_K_L.gguf",
"size_gb": "3.82",
"ctn_endpoint": "paste container endpoint here"
}
],

"phi" : [
{
"file": "phi-2.Q2_K.gguf",
"source" : "https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/phi-2.Q2_K.gguf",
"size_gb": "1.17",
"ctn_endpoint": "paste container endpoint here"
},
{
"file": "phi-2.Q5_K_M.gguf",
"source" : "https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/phi-2.Q5_K_M.gguf",
"size_gb": "2.07",
"ctn_endpoint": "paste container endpoint here"
}
]
}
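On the templating suggestion above, a rough sketch of how the endpoints could be filled in by Terraform instead of pasted by hand (this assumes the `hashicorp/local` provider and a hypothetical `hf-models.json.tftpl` template; it is not the example's actual code):

```hcl
# Sketch only: render hf-models.json from a template, injecting the deployed
# container endpoint so it no longer has to be copied from the console.
resource "local_file" "hf_models" {
  filename = "${path.module}/hf-models.json"
  content = templatefile("${path.module}/hf-models.json.tftpl", {
    ctn_endpoint = scaleway_container.inference-hugging-face.domain_name
  })
}
```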
20 changes: 20 additions & 0 deletions containers/hugging-face-inference/terraform/images.tf
@@ -0,0 +1,20 @@
resource "scaleway_registry_namespace" "main" {
name = "ifr-${lower(replace(var.hf_model_file_name, "/[.]|[_]/", "-"))}-${random_string.random_suffix.result}"
region = var.region
project_id = var.project_id
}

resource "docker_image" "inference" {
name = "${scaleway_registry_namespace.main.endpoint}/inference-with-huggingface:${var.image_version}"
build {
context = "${path.cwd}/../"
no_cache = true
build_args = {
MODEL_DOWNLOAD_SOURCE : var.hf_model_download_source
}
}

provisioner "local-exec" {
command = "docker push ${docker_image.inference.name}"
}
}
16 changes: 16 additions & 0 deletions containers/hugging-face-inference/terraform/providers.tf
@@ -0,0 +1,16 @@
provider "scaleway" {
region = var.region
access_key = var.access_key
secret_key = var.secret_key
project_id = var.project_id
> **Contributor** (on lines +2 to +5): nit: IMO this is unnecessary; the default behavior of the provider is to use your config file or the environment to get its configuration, so I would leave it blank.

}

provider "docker" {
host = "unix:///var/run/docker.sock"

registry_auth {
address = scaleway_registry_namespace.main.endpoint
username = "nologin"
password = var.secret_key
}
}
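Following the provider nit above, a minimal sketch of a leaner provider block, assuming the standard `SCW_*` environment variables (e.g. `SCW_ACCESS_KEY`, `SCW_SECRET_KEY`, `SCW_DEFAULT_PROJECT_ID`) or a `~/.config/scw/config.yaml` are available:

```hcl
# Credentials and project come from the environment or the Scaleway config file.
provider "scaleway" {
  region = var.region
}
```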
56 changes: 56 additions & 0 deletions containers/hugging-face-inference/terraform/terraform.sh
@@ -0,0 +1,56 @@
#!/bin/bash

> **Contributor:** issue(scripting): These kinds of commands should be managed in a Makefile.

set -e

# Common environment variables
export TF_VAR_access_key=${SCW_ACCESS_KEY} \
TF_VAR_secret_key=${SCW_SECRET_KEY} \
TF_VAR_project_id=${SCW_PROJECT_ID}

# Associative list of models to deploy using json data
declare -A hf_models
eval "$(jq -r '.[]|.[]|"hf_models[\(.file)]=\(.source)"' hf-models.json)"

# Login to docker Scaleway's registry
docker login "rg.$REGION.scw.cloud" -u nologin --password-stdin <<< "$SCW_SECRET_KEY"
> **Contributor:** suggestion(simplify): we don't need to log in to the registry every time; this can be a one-off step at the start (and listed in the README).

# Initialize, plan, and deploy each model in a Terraform workspace
apply() {
terraform init
for model_file_name in "${!hf_models[@]}";
> **Contributor:** question(terraform): Instead of using bash for-loops here, can we use `for_each` in the Terraform files to iterate over the list of models in a variable?

do
terraform workspace select -or-create $model_file_name
export TF_VAR_hf_model_file_name=$model_file_name \
TF_VAR_hf_model_download_source=${hf_models[$model_file_name]}
terraform plan
terraform apply -auto-approve
done
}

# Destroy resources of each Terraform workspace
destroy() {
for model_file_name in "${!hf_models[@]}";
do
terraform workspace select $model_file_name
export TF_VAR_hf_model_file_name=$model_file_name \
TF_VAR_hf_model_download_source=${hf_models[$model_file_name]}
terraform destroy -auto-approve
done
}

# Script actions
while getopts "ad" option; do
case $option in
a)
echo "deploying models"
apply
;;
d)
echo "destroying models"
destroy
;;
*)
echo "flag is not provided"
exit 1
esac
done
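On the `for_each` question above, a rough sketch of what iterating over `hf-models.json` directly in Terraform could look like (resource layout and names are illustrative, not the example's actual code):

```hcl
# Sketch only: build a map of model file name => download URL from the JSON
# file, then create one container per model with for_each instead of driving
# one workspace per model from bash.
locals {
  hf_models = merge([
    for family, models in jsondecode(file("${path.module}/hf-models.json")) : {
      for model in models : model.file => model.source
    }
  ]...)
}

resource "scaleway_container" "inference" {
  for_each     = local.hf_models
  name         = "inference-${substr(sha1(each.key), 0, 6)}"
  namespace_id = scaleway_container_namespace.main.id
  # each.key is the model file name, each.value its download source
  environment_variables = {
    "MODEL_FILE_NAME" = each.key
  }
  port = 80
}
```

The image build (which bakes the model into the image via `MODEL_DOWNLOAD_SOURCE`) would need the same `for_each` treatment so each container gets its own image.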
5 changes: 5 additions & 0 deletions containers/hugging-face-inference/terraform/utils.tf
@@ -0,0 +1,5 @@
resource "random_string" "random_suffix" {
length = 3
upper = false
special = false
}