From 836abf18dc4dd957a8c3571da46edf737c0e4443 Mon Sep 17 00:00:00 2001
From: rsnm2
Date: Sun, 9 Jul 2023 17:52:40 +0000
Subject: [PATCH 1/2] added mlserver example

---
 examples/mlserver/README.md                  | 75 +++++++++++++++++++
 .../model-settings.json                      | 10 +++
 .../text-classification-model/models.py     | 19 +++++
 examples/mlserver/requirements.txt           |  2 +
 4 files changed, 106 insertions(+)
 create mode 100644 examples/mlserver/README.md
 create mode 100644 examples/mlserver/models/text-classification-model/model-settings.json
 create mode 100644 examples/mlserver/models/text-classification-model/models.py
 create mode 100644 examples/mlserver/requirements.txt

diff --git a/examples/mlserver/README.md b/examples/mlserver/README.md
new file mode 100644
index 0000000000..216fc9f58f
--- /dev/null
+++ b/examples/mlserver/README.md
@@ -0,0 +1,75 @@
+# **Step 1: Installation**
+
+Install DeepSparse and MLServer:
+
+```bash
+pip install -r requirements.txt
+```
+
+# **Step 2: Write a Custom Runtime**
+
+We need to write a [Custom Inference Runtime](https://mlserver.readthedocs.io/en/stable/user-guide/custom.html) to use DeepSparse within MLServer.
+
+### Implement `load()` and `predict()`
+
+First, we implement the `load()` and `predict()` methods in `models/text-classification-model/models.py`. Note that your implementation of `load()` and `predict()` will vary with the task you choose.
+
+Here's an example for text classification:
+```python
+from mlserver import MLModel
+from mlserver.codecs import decode_args
+from typing import List
+from deepsparse import Pipeline
+
+class DeepSparseRuntime(MLModel):
+    async def load(self) -> bool:
+        # compiles the pipeline
+        self._pipeline = Pipeline.create(
+            task=self._settings.parameters.task,                        # from model-settings.json
+            model_path=self._settings.parameters.model_path,            # from model-settings.json
+            batch_size=self._settings.parameters.batch_size,            # from model-settings.json
+            sequence_length=self._settings.parameters.sequence_length,  # from model-settings.json
+        )
+        return True
+
+    @decode_args
+    async def predict(self, sequences: List[str]) -> List[str]:
+        # runs the inference
+        prediction = self._pipeline(sequences=sequences)
+        return prediction.labels
+```
+
+### Create `model-settings.json`
+
+Second, we create a `model-settings` file. In this file, we specify the location of the custom runtime implementation as well as the
+parameters of the DeepSparse inference session.
+
+```json
+{
+    "name": "text-classification-model",
+    "implementation": "models.DeepSparseRuntime",
+    "parameters": {
+        "task": "text-classification",
+        "model_path": "zoo:nlp/sentiment_analysis/obert-base/pytorch/huggingface/sst2/pruned90_quant-none",
+        "batch_size": 1,
+        "sequence_length": 128
+    }
+}
+```
+
+Each key under `parameters` is exposed to the runtime as an attribute of `self._settings.parameters`, which is how `load()` above reads its configuration.
+
+# **Step 3: Launch MLServer**
+
+Launch the server with the CLI:
+
+```bash
+mlserver start ./models/text-classification-model/
+```
+
+# **Step 4: Send Inference Requests**
+
+Now, an inference endpoint is exposed at `http://localhost:8080/v2/models/text-classification-model/infer`. `client.py` is a sample script that sends requests to this endpoint.
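+
+You can also query the endpoint directly, e.g. with `curl`. This is a minimal sketch, assuming the server is up on MLServer's default HTTP port (8080, as in the URL above); the request body mirrors the one that `client.py` builds:
+
+```bash
+curl -X POST http://localhost:8080/v2/models/text-classification-model/infer \
+  -H "Content-Type: application/json" \
+  -d '{"inputs": [{"name": "sequences", "shape": [1], "datatype": "BYTES", "data": ["I love using DeepSparse on CPUs"]}]}'
+```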
+
+Then run the client:
+```bash
+python3 client.py
+```
\ No newline at end of file
diff --git a/examples/mlserver/models/text-classification-model/model-settings.json b/examples/mlserver/models/text-classification-model/model-settings.json
new file mode 100644
index 0000000000..39b798f134
--- /dev/null
+++ b/examples/mlserver/models/text-classification-model/model-settings.json
@@ -0,0 +1,10 @@
+{
+    "name": "text-classification-model",
+    "implementation": "models.DeepSparseRuntime",
+    "parameters": {
+        "task": "text-classification",
+        "model_path": "zoo:nlp/sentiment_analysis/obert-base/pytorch/huggingface/sst2/pruned90_quant-none",
+        "batch_size": 1,
+        "sequence_length": 128
+    }
+}
\ No newline at end of file
diff --git a/examples/mlserver/models/text-classification-model/models.py b/examples/mlserver/models/text-classification-model/models.py
new file mode 100644
index 0000000000..9007d2cc13
--- /dev/null
+++ b/examples/mlserver/models/text-classification-model/models.py
@@ -0,0 +1,19 @@
+from mlserver import MLModel
+from mlserver.codecs import decode_args
+from typing import List
+from deepsparse import Pipeline
+
+class DeepSparseRuntime(MLModel):
+    async def load(self) -> bool:
+        # compiles the pipeline, configured by model-settings.json
+        self._pipeline = Pipeline.create(
+            task=self._settings.parameters.task,
+            model_path=self._settings.parameters.model_path,
+            batch_size=self._settings.parameters.batch_size,
+            sequence_length=self._settings.parameters.sequence_length,
+        )
+        return True
+
+    @decode_args
+    async def predict(self, sequences: List[str]) -> List[str]:
+        # runs the inference, returning one label per input sequence
+        prediction = self._pipeline(sequences=sequences)
+        return prediction.labels
\ No newline at end of file
diff --git a/examples/mlserver/requirements.txt b/examples/mlserver/requirements.txt
new file mode 100644
index 0000000000..f460faaf38
--- /dev/null
+++ b/examples/mlserver/requirements.txt
@@ -0,0 +1,2 @@
+mlserver
+deepsparse[transformers]
\ No newline at end of file

From ab75bfe0e92df770306223f1478bc5d735336a82 Mon Sep 17 00:00:00 2001
From: rsnm2
Date: Sun, 9 Jul 2023 17:56:00 +0000
Subject: [PATCH 2/2] added client.py

---
 examples/mlserver/README.md |  2 +-
 examples/mlserver/client.py | 27 +++++++++++++++++++++++++++
 2 files changed, 28 insertions(+), 1 deletion(-)
 create mode 100644 examples/mlserver/client.py

diff --git a/examples/mlserver/README.md b/examples/mlserver/README.md
index 216fc9f58f..738111e7fe 100644
--- a/examples/mlserver/README.md
+++ b/examples/mlserver/README.md
@@ -41,7 +41,7 @@ class DeepSparseRuntime(MLModel):
 
 ### Create `model-settings.json`
 
-Second, we create a `model-settings` file. In this file, we specify the location of the custom runtime implementation as well as the
+Second, we create a config at `models/text-classification-model/model-settings.json`. In this file, we specify the location of the custom runtime implementation as well as the
 parameters of the DeepSparse inference session.
 
 ```json
diff --git a/examples/mlserver/client.py b/examples/mlserver/client.py
new file mode 100644
index 0000000000..248c2fea1d
--- /dev/null
+++ b/examples/mlserver/client.py
@@ -0,0 +1,27 @@
+import threading
+
+import requests
+
+NUM_THREADS = 2
+URL = "http://localhost:8080/v2/models/text-classification-model/infer"
+sentences = ["I hate using GPUs for inference", "I love using DeepSparse on CPUs"] * 100
+
+def tfunc(text):
+    # build a V2 inference protocol request with a single BYTES input
+    inference_request = {
+        "inputs": [
+            {
+                "name": "sequences",
+                "shape": [1],
+                "datatype": "BYTES",
+                "data": [text],
+            },
+        ]
+    }
+    resp = requests.post(URL, json=inference_request).json()
+    for output in resp["outputs"]:
+        print(output["data"])
+
+
+# send NUM_THREADS requests concurrently
+threads = [threading.Thread(target=tfunc, args=(sentence,)) for sentence in sentences[:NUM_THREADS]]
+for thread in threads:
+    thread.start()
+for thread in threads:
+    thread.join()
\ No newline at end of file