From 836abf18dc4dd957a8c3571da46edf737c0e4443 Mon Sep 17 00:00:00 2001
From: rsnm2
Date: Sun, 9 Jul 2023 17:52:40 +0000
Subject: [PATCH 1/2] added mlserver example

---
 examples/mlserver/README.md                  | 75 +++++++++++++++++++
 .../model-settings.json                      | 10 +++
 .../text-classification-model/models.py     | 19 +++++
 examples/mlserver/requirements.txt           |  2 +
 4 files changed, 106 insertions(+)
 create mode 100644 examples/mlserver/README.md
 create mode 100644 examples/mlserver/models/text-classification-model/model-settings.json
 create mode 100644 examples/mlserver/models/text-classification-model/models.py
 create mode 100644 examples/mlserver/requirements.txt

diff --git a/examples/mlserver/README.md b/examples/mlserver/README.md
new file mode 100644
index 0000000000..216fc9f58f
--- /dev/null
+++ b/examples/mlserver/README.md
@@ -0,0 +1,75 @@
+# **Step 1: Installation**
+
+Install DeepSparse and MLServer:
+
+```bash
+pip install -r requirements.txt
+```
+
+# **Step 2: Write a Custom Runtime**
+
+We need to write a [Custom Inference Runtime](https://mlserver.readthedocs.io/en/stable/user-guide/custom.html) to use DeepSparse within MLServer.
+
+### Implement `load()` and `predict()`
+
+First, we implement the `load()` and `predict()` methods in `models/text-classification-model/models.py`. Note that your implementation of `load()` and `predict()` will vary with the task you choose.
+
+Here's an example for text classification:
+```python
+from mlserver import MLModel
+from mlserver.codecs import decode_args
+from typing import List
+from deepsparse import Pipeline
+
+class DeepSparseRuntime(MLModel):
+    async def load(self) -> bool:
+        # compiles the pipeline
+        self._pipeline = Pipeline.create(
+            task=self._settings.parameters.task,                        # from model-settings.json
+            model_path=self._settings.parameters.model_path,            # from model-settings.json
+            batch_size=self._settings.parameters.batch_size,            # from model-settings.json
+            sequence_length=self._settings.parameters.sequence_length,  # from model-settings.json
+        )
+        return True
+
+    @decode_args
+    async def predict(self, sequences: List[str]) -> List[str]:
+        # runs the inference
+        prediction = self._pipeline(sequences=sequences)
+        return prediction.labels
+```
+
+### Create `model-settings.json`
+
+Second, we create a `model-settings` file. In this file, we specify the location of the custom runtime implementation as well as the
+parameters of the DeepSparse inference session.
+
+```json
+{
+    "name": "text-classification-model",
+    "implementation": "models.DeepSparseRuntime",
+    "parameters": {
+        "task": "text-classification",
+        "model_path": "zoo:nlp/sentiment_analysis/obert-base/pytorch/huggingface/sst2/pruned90_quant-none",
+        "batch_size": 1,
+        "sequence_length": 128
+    }
+}
+```
+
+Each key under `parameters` is exposed to the runtime as an attribute of `self._settings.parameters`, which is how `load()` above reads its configuration.
+
+# **Step 3: Launch MLServer**
+
+Launch the server with the CLI:
+
+```bash
+mlserver start ./models/text-classification-model/
+```
+
+# **Step 4: Send Inference Requests**
+
+Now, an inference endpoint is exposed at `http://localhost:8080/v2/models/text-classification-model/infer`. `client.py` is a sample script that sends requests to this endpoint.
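+
+You can also query the endpoint directly, e.g. with `curl`. This is a minimal sketch, assuming the server is up on MLServer's default HTTP port (8080, as in the URL above); the request body mirrors the one that `client.py` builds:
+
+```bash
+curl -X POST http://localhost:8080/v2/models/text-classification-model/infer \
+  -H "Content-Type: application/json" \
+  -d '{"inputs": [{"name": "sequences", "shape": [1], "datatype": "BYTES", "data": ["I love using DeepSparse on CPUs"]}]}'
+```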
+
+Then run the client:
+```bash
+python3 client.py
+```
\ No newline at end of file
diff --git a/examples/mlserver/models/text-classification-model/model-settings.json b/examples/mlserver/models/text-classification-model/model-settings.json
new file mode 100644
index 0000000000..39b798f134
--- /dev/null
+++ b/examples/mlserver/models/text-classification-model/model-settings.json
@@ -0,0 +1,10 @@
+{
+    "name": "text-classification-model",
+    "implementation": "models.DeepSparseRuntime",
+    "parameters": {
+        "task": "text-classification",
+        "model_path": "zoo:nlp/sentiment_analysis/obert-base/pytorch/huggingface/sst2/pruned90_quant-none",
+        "batch_size": 1,
+        "sequence_length": 128
+    }
+}
\ No newline at end of file
diff --git a/examples/mlserver/models/text-classification-model/models.py b/examples/mlserver/models/text-classification-model/models.py
new file mode 100644
index 0000000000..9007d2cc13
--- /dev/null
+++ b/examples/mlserver/models/text-classification-model/models.py
@@ -0,0 +1,19 @@
+from mlserver import MLModel
+from mlserver.codecs import decode_args
+from typing import List
+from deepsparse import Pipeline
+
+class DeepSparseRuntime(MLModel):
+    async def load(self) -> bool:
+        # compiles the pipeline, configured by model-settings.json
+        self._pipeline = Pipeline.create(
+            task=self._settings.parameters.task,
+            model_path=self._settings.parameters.model_path,
+            batch_size=self._settings.parameters.batch_size,
+            sequence_length=self._settings.parameters.sequence_length,
+        )
+        return True
+
+    @decode_args
+    async def predict(self, sequences: List[str]) -> List[str]:
+        # runs the inference, returning one label per input sequence
+        prediction = self._pipeline(sequences=sequences)
+        return prediction.labels
\ No newline at end of file
diff --git a/examples/mlserver/requirements.txt b/examples/mlserver/requirements.txt
new file mode 100644
index 0000000000..f460faaf38
--- /dev/null
+++ b/examples/mlserver/requirements.txt
@@ -0,0 +1,2 @@
+mlserver
+deepsparse[transformers]
\ No newline at end of file

From ab75bfe0e92df770306223f1478bc5d735336a82 Mon Sep 17 00:00:00 2001
From: rsnm2
Date: Sun, 9 Jul 2023 17:56:00 +0000
Subject: [PATCH 2/2] added client.py

---
 examples/mlserver/README.md |  2 +-
 examples/mlserver/client.py | 27 +++++++++++++++++++++++++++
 2 files changed, 28 insertions(+), 1 deletion(-)
 create mode 100644 examples/mlserver/client.py

diff --git a/examples/mlserver/README.md b/examples/mlserver/README.md
index 216fc9f58f..738111e7fe 100644
--- a/examples/mlserver/README.md
+++ b/examples/mlserver/README.md
@@ -41,7 +41,7 @@ class DeepSparseRuntime(MLModel):
 
 ### Create `model-settings.json`
 
-Second, we create a `model-settings` file. In this file, we specify the location of the custom runtime implementation as well as the
+Second, we create a config at `models/text-classification-model/model-settings.json`. In this file, we specify the location of the custom runtime implementation as well as the
 parameters of the DeepSparse inference session.
 
 ```json
diff --git a/examples/mlserver/client.py b/examples/mlserver/client.py
new file mode 100644
index 0000000000..248c2fea1d
--- /dev/null
+++ b/examples/mlserver/client.py
@@ -0,0 +1,27 @@
+import threading
+
+import requests
+
+NUM_THREADS = 2
+URL = "http://localhost:8080/v2/models/text-classification-model/infer"
+sentences = ["I hate using GPUs for inference", "I love using DeepSparse on CPUs"] * 100
+
+def tfunc(text):
+    # build a V2 inference protocol request with a single BYTES input
+    inference_request = {
+        "inputs": [
+            {
+                "name": "sequences",
+                "shape": [1],
+                "datatype": "BYTES",
+                "data": [text],
+            },
+        ]
+    }
+    resp = requests.post(URL, json=inference_request).json()
+    for output in resp["outputs"]:
+        print(output["data"])
+
+
+# send NUM_THREADS requests concurrently
+threads = [threading.Thread(target=tfunc, args=(sentence,)) for sentence in sentences[:NUM_THREADS]]
+for thread in threads:
+    thread.start()
+for thread in threads:
+    thread.join()
\ No newline at end of file