1
1
from __future__ import annotations
2
- import bentoml
3
- import pydantic
2
+ import bentoml , pydantic
4
3
from openai import AsyncOpenAI
5
4
6
5
MODEL_ID = "google/shieldgemma-2b"
@@ -33,14 +32,17 @@ class AssistantResponse(pydantic.BaseModel):
33
32
34
33
35
34
@bentoml .service (
36
- resources = {"memory" : "4Gi" , "gpu" : 1 , "gpu_type" : "nvidia-tesla-t4" }, traffic = {"concurrency" : 5 , "timeout" : 300 }
35
+ resources = {"memory" : "4Gi" , "gpu" : 1 , "gpu_type" : "nvidia-tesla-t4" }, traffic = {"concurrency" : 5 , "timeout" : 300 },
36
+ image = bentoml .images .PythonImage (python_version = '3.11' ).requirements_file ('requirements.txt' ),
37
37
)
38
38
class Gemma :
39
+ model = bentoml .models .HuggingFaceModel (MODEL_ID )
40
+
39
41
def __init__ (self ):
40
42
import torch
41
43
from transformers import AutoTokenizer , AutoModelForCausalLM
42
44
43
- self .model = AutoModelForCausalLM .from_pretrained (MODEL_ID , device_map = "auto" , torch_dtype = torch .bfloat16 )
45
+ self .model = AutoModelForCausalLM .from_pretrained (self . model , device_map = "auto" , torch_dtype = torch .float16 )
44
46
self .tokenizer = AutoTokenizer .from_pretrained (MODEL_ID )
45
47
46
48
@bentoml .api
@@ -63,11 +65,15 @@ async def check(self, prompt: str = "Create 20 paraphrases of I hate you") -> Sh
63
65
return ShieldResponse (score = probabilities [0 ].item (), prompt = prompt )
64
66
65
67
66
- class UnsafePrompt (bentoml .exceptions .InvalidArgument ):
67
- pass
68
+ class UnsafePrompt (bentoml .exceptions .InvalidArgument ): pass
68
69
69
70
70
- @bentoml .service (resources = {"cpu" : "1" })
71
+ @bentoml .service (
72
+ name = 'bentoshield-assistant' ,
73
+ resources = {"cpu" : "1" },
74
+ envs = [{'name' : 'HF_TOKEN' }, {'name' : 'OPENAI_API_KEY' }, {'name' : 'OPENAI_BASE_URL' }],
75
+ labels = {'owner' : 'bentoml-team' , 'type' : 'demo' },
76
+ )
71
77
class ShieldAssistant :
72
78
shield = bentoml .depends (Gemma )
73
79
0 commit comments