
Commit ed98ec8

Use glm 4 to build an OpenAI-compatible service
1 parent 3f79b54 commit ed98ec8

File tree

3 files changed: +141 -0 lines changed

inference/README.md

Lines changed: 21 additions & 0 deletions
@@ -102,6 +102,27 @@ python vllm_cli_demo.py # LLM Such as GLM-4-9B-0414
vllm serve THUDM/GLM-4-9B-0414 --tensor_parallel_size 2
```

### Use glm-4 to build an OpenAI-compatible service

Start the server:

```shell
python glm4_server.py THUDM/GLM-4-9B-0414
```

Client request:

```shell
curl -X POST http://127.0.0.1:8000/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{
        "messages": [
          {"role": "user", "content": "Who are you?"}
        ]
      }'
```
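
Since this is a plain HTTP endpoint, the same request can also be issued from Python. A minimal sketch (not part of this commit), assuming the server above is listening on 127.0.0.1:8000 and the third-party `requests` package is installed:

```python
# Hypothetical client snippet: post the same chat request from Python and
# read the assistant reply from choices[0].message.content.
import requests

resp = requests.post(
    "http://127.0.0.1:8000/v1/chat/completions",
    json={"messages": [{"role": "user", "content": "Who are you?"}]},
    timeout=300,  # generation is synchronous on the server, so allow a generous timeout
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```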

### Use glm-4v to build an OpenAI-compatible service

Start the server:

inference/README_zh.md

Lines changed: 21 additions & 0 deletions
@@ -102,6 +102,27 @@ python vllm_cli_demo.py # LLM Such as GLM-4-9B-0414
vllm serve THUDM/GLM-4-9B-0414 --tensor_parallel_size 2
```

### Use glm-4 to build an OpenAI service

Start the server:

```shell
python glm4_server.py THUDM/GLM-4-9B-0414
```

Client request:

```shell
curl -X POST http://127.0.0.1:8000/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{
        "messages": [
          {"role": "user", "content": "Who are you?"}
        ]
      }'
```

### Use glm-4v to build an OpenAI service

Start the server:

inference/glm4_server.py

Lines changed: 99 additions & 0 deletions
@@ -0,0 +1,99 @@
import sys
from datetime import datetime
from threading import Thread
from typing import List, Literal, Optional

import torch
import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

app = FastAPI()


# Minimal OpenAI-style request/response schemas.
class MessageInput(BaseModel):
    role: Literal["user", "assistant", "system"]
    content: str
    name: Optional[str] = None


class MessageOutput(BaseModel):
    role: Literal["assistant"]
    content: Optional[str] = None
    name: Optional[str] = None


class Choice(BaseModel):
    message: MessageOutput


class Request(BaseModel):
    messages: List[MessageInput]
    temperature: Optional[float] = 0.8
    top_p: Optional[float] = 0.8
    max_tokens: Optional[int] = 128000
    repetition_penalty: Optional[float] = 1.0


class Response(BaseModel):
    model: str
    choices: List[Choice]


@app.post("/v1/chat/completions", response_model=Response)
async def create_chat_completion(request: Request):
    global model, tokenizer

    print(datetime.now())
    print("\033[91m--received_request\033[0m", request)

    # Build model inputs from the chat history using the model's chat template.
    messages = [message.model_dump() for message in request.messages]
    model_inputs = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
    ).to(model.device)

    # Generate in a background thread and collect tokens from the streamer.
    streamer = TextIteratorStreamer(tokenizer=tokenizer, timeout=60, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = {
        "input_ids": model_inputs["input_ids"],
        "attention_mask": model_inputs["attention_mask"],
        "streamer": streamer,
        "max_new_tokens": request.max_tokens,
        "do_sample": True,
        "top_p": request.top_p,
        "temperature": request.temperature if request.temperature > 0 else 0.8,
        "repetition_penalty": request.repetition_penalty,
        "eos_token_id": model.config.eos_token_id,
    }
    thread = Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()

    result = ""
    for new_token in streamer:
        result += new_token
    print(datetime.now())
    print("\033[91m--generated_text\033[0m", result)

    message = MessageOutput(
        role="assistant",
        content=result,
    )
    choice = Choice(
        message=message,
    )
    response = Response(model=sys.argv[1].split("/")[-1].lower(), choices=[choice])
    return response


torch.cuda.empty_cache()

if __name__ == "__main__":
    MODEL_PATH = sys.argv[1]

    # Load the tokenizer and model from the path given on the command line.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    ).eval()

    uvicorn.run(app, host="0.0.0.0", port=8000, workers=1)
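
Besides `messages`, the `Request` model above accepts optional `temperature`, `top_p`, `max_tokens`, and `repetition_penalty` fields, falling back to the defaults shown in the code when they are omitted. A hedged example of overriding them from a client (again assuming a local server and the third-party `requests` package):

```python
# Hypothetical request that sets the optional sampling parameters defined by
# the Request model; per the Response model, the server returns
# {"model": ..., "choices": [{"message": {"role": "assistant", "content": ..., "name": null}}]}.
import requests

payload = {
    "messages": [{"role": "user", "content": "Who are you?"}],
    "temperature": 0.6,
    "top_p": 0.9,
    "max_tokens": 256,
    "repetition_penalty": 1.1,
}
resp = requests.post("http://127.0.0.1:8000/v1/chat/completions", json=payload, timeout=300)
print(resp.json()["choices"][0]["message"]["content"])
```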
