
Commit 27b8eeb

Merge pull request GoogleCloudPlatform#3644 from justinsb/llm_completion_method
codebot: add support for completion method
2 parents: 384834c + 11db6f9

6 files changed: +315 -23 lines

dev/tools/controllerbuilder/pkg/commands/exportcsv/prompt.go (+2 -1)

@@ -139,11 +139,12 @@ func RunPrompt(ctx context.Context, o *PromptOptions) error {
     }
 
     dataPoint := dataPoints[0]
+    dataPoint.Output = ""
 
     log.Info("built data point", "dataPoint", dataPoint)
 
     out := &bytes.Buffer{}
-    if err := x.RunGemini(ctx, dataPoint, out); err != nil {
+    if err := x.InferOutput_WithCompletion(ctx, dataPoint, out); err != nil {
        return fmt.Errorf("running LLM inference: %w", err)
     }
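
The call site now clears dataPoint.Output before inference, so the model must produce the output rather than being shown the expected answer, and it swaps the chat-based RunGemini for a completion-based helper. InferOutput_WithCompletion itself is not part of this diff; the following is a minimal sketch of what such a helper could look like on top of the new llm.Client.GenerateCompletion API, where the receiver type, the llmClient field, and the buildPrompt helper are assumptions, not the actual implementation:

// Hypothetical sketch; the real InferOutput_WithCompletion lives outside this diff.
func (x *Executor) InferOutput_WithCompletion(ctx context.Context, dataPoint *DataPoint, out io.Writer) error {
    // Render the data point (with Output cleared) into a single prompt string.
    prompt := buildPrompt(dataPoint) // hypothetical prompt-rendering helper

    response, err := x.llmClient.GenerateCompletion(ctx, &llm.CompletionRequest{Prompt: prompt})
    if err != nil {
        return fmt.Errorf("generating completion: %w", err)
    }

    // Write the raw completion text to the supplied writer.
    _, err = io.WriteString(out, response.Response())
    return err
}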

dev/tools/controllerbuilder/pkg/llm/gemini.go (+7 -1)

@@ -39,19 +39,25 @@ func BuildGeminiClient(ctx context.Context) (Client, error) {
 
     return &GeminiClient{
        client: client,
+       model:  "gemini-2.0-pro-exp-02-05",
     }, nil
 }
 
 type GeminiClient struct {
     client *genai.Client
+    model  string
 }
 
 func (c *GeminiClient) Close() error {
     return c.client.Close()
 }
 
+func (c *GeminiClient) GenerateCompletion(ctx context.Context, request *CompletionRequest) (CompletionResponse, error) {
+    return nil, fmt.Errorf("GeminiClient::GenerateCompletion not implemented")
+}
+
 func (c *GeminiClient) StartChat(systemPrompt string) Chat {
-    model := c.client.GenerativeModel("gemini-2.0-flash-exp")
+    model := c.client.GenerativeModel(c.model)
     // model := c.client.GenerativeModel("gemini-1.5-pro-002")
 
     // Some values that are recommended by aistudio
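
Hoisting the model name into a struct field set at construction replaces the hard-coded "gemini-2.0-flash-exp" in StartChat, and the stubbed GenerateCompletion keeps GeminiClient satisfying the widened Client interface. Because BuildGeminiClient already returns the Client interface type, the compiler checks this implicitly; an explicit compile-time assertion, a pattern this PR uses elsewhere (for example on OllamaCompletionResponse), would make the intent visible at the type itself:

// Compile-time check that GeminiClient implements the widened Client
// interface; the build fails if GenerateCompletion (or any method) is missing.
var _ Client = &GeminiClient{}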

dev/tools/controllerbuilder/pkg/llm/interfaces.go (+11)

@@ -22,6 +22,8 @@ import (
 type Client interface {
     io.Closer
     StartChat(systemPrompt string) Chat
+
+    GenerateCompletion(ctx context.Context, req *CompletionRequest) (CompletionResponse, error)
 }
 
 type Chat interface {
@@ -44,3 +46,12 @@ type Part interface {
     AsText() (string, bool)
     AsFunctionCalls() ([]FunctionCall, bool)
 }
+
+type CompletionRequest struct {
+    Prompt string
+}
+
+type CompletionResponse interface {
+    Response() string
+    UsageMetadata() any
+}
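
CompletionRequest is a plain struct, leaving room to add knobs such as temperature later, while CompletionResponse is an interface so each backend can expose its own raw response through UsageMetadata. A minimal caller-side sketch, assuming the Ollama backend (the only fully implemented one in this PR) and an illustrative prompt:

ctx := context.Background()

client, err := BuildOllamaClient(ctx) // VertexAI also implements this; Gemini is stubbed
if err != nil {
    klog.Fatalf("building client: %v", err)
}
defer client.Close()

resp, err := client.GenerateCompletion(ctx, &CompletionRequest{
    Prompt: "Write a one-line description of a Kubernetes controller.",
})
if err != nil {
    klog.Fatalf("generating completion: %v", err)
}
fmt.Println(resp.Response())              // the completion text
fmt.Printf("%+v\n", resp.UsageMetadata()) // backend-specific stats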

dev/tools/controllerbuilder/pkg/llm/ollama.go (+133 -7)

@@ -41,29 +41,32 @@ func BuildOllamaClient(ctx context.Context) (*OllamaClient, error) {
     }
     klog.Infof("using ollama with base url %v", baseURL.String())
 
+    model := os.Getenv("OLLAMA_MODEL")
+    if model == "" {
+       klog.Fatalf("OLLAMA_MODEL not set")
+    }
+
     return &OllamaClient{
        baseURL:    baseURL,
        httpClient: http.DefaultClient,
+       model:      model,
     }, nil
 }
 
 type OllamaClient struct {
     baseURL    *url.URL
     httpClient *http.Client
+    model      string
 }
 
 func (c *OllamaClient) Close() error {
     return nil
 }
 
 func (c *OllamaClient) StartChat(systemPrompt string) Chat {
-    session := &chatRequest{}
-
-    model := os.Getenv("OLLAMA_MODEL")
-    if model == "" {
-       klog.Fatalf("OLLAMA_MODEL not set")
+    session := &chatRequest{
+       Model: c.model,
     }
-    session.Model = model
 
     // HACK: Setting the system prompt seems to really mess up some ollama models
     // session.Messages = append(session.Messages, chatMessage{
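
Reading OLLAMA_MODEL once in BuildOllamaClient makes a missing variable fail at construction time instead of on the first StartChat call, and the chat and completion paths now share the same model field. Configuration stays environment-driven; illustratively (the model tag is an example, not a project default):

os.Setenv("OLLAMA_MODEL", "llama3.1:8b") // normally set in the shell, shown here for completeness

client, err := BuildOllamaClient(ctx)
if err != nil {
    klog.Fatalf("building ollama client: %v", err)
}
defer client.Close()
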
@@ -114,6 +117,51 @@ type chatResponse struct {
     EvalDuration int64 `json:"eval_duration"`
 }
 
+type completionRequest struct {
+    // model: (required) the model name
+    Model string `json:"model,omitempty"`
+    // prompt: the prompt to generate a response for
+    Prompt string `json:"prompt,omitempty"`
+
+    // suffix: the text after the model response
+
+    // images: (optional) a list of base64-encoded images (for multimodal models such as llava)
+
+    // format: the format to return a response in. Format can be json or a JSON schema
+
+    // options: additional model parameters listed in the documentation for the Modelfile such as temperature
+    Options map[string]any `json:"options,omitempty"`
+
+    // system: system message to (overrides what is defined in the Modelfile)
+
+    // template: the prompt template to use (overrides what is defined in the Modelfile)
+
+    // stream: if false the response will be returned as a single response object, rather than a stream of objects
+    Stream *bool `json:"stream,omitempty"`
+
+    // raw: if true no formatting will be applied to the prompt. You may choose to use the raw parameter if you are specifying a full templated prompt in your request to the API
+
+    // keep_alive: controls how long the model will stay loaded into memory following the request (default: 5m)
+
+    // context (deprecated): the context parameter returned from a previous request to /generate, this can be used to keep a short conversational memory
+}
+
+type completionResponse struct {
+    Model     string `json:"model"`
+    CreatedAt string `json:"created_at"`
+    Response  string `json:"response"`
+    Done      bool   `json:"done"`
+
+    // "context": [1, 2, 3],
+
+    TotalDuration      int64 `json:"total_duration"`
+    LoadDuration       int64 `json:"load_duration"`
+    PromptEvalCount    int64 `json:"prompt_eval_count"`
+    PromptEvalDuration int64 `json:"prompt_eval_duration"`
+    EvalCount          int64 `json:"eval_count"`
+    EvalDuration       int64 `json:"eval_duration"`
+}
+
 type chatMessage struct {
     // role: the role of the message, either system, user, assistant, or tool
     Role string `json:"role,omitempty"`
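
These two types mirror the request and response fields documented for Ollama's /api/generate endpoint, keeping only the fields this client uses and preserving the rest of the documentation as commented placeholders. What a marshalled request looks like on the wire (values illustrative; note omitempty drops unset fields):

// In-package sketch: completionRequest is unexported.
stream := false
req := &completionRequest{
    Model:   "llama3.1:8b",
    Prompt:  "Say hello.",
    Options: map[string]any{"num_ctx": 128 * 1024},
    Stream:  &stream,
}
body, _ := json.Marshal(req)
fmt.Println(string(body))
// {"model":"llama3.1:8b","prompt":"Say hello.","options":{"num_ctx":131072},"stream":false}
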
@@ -196,7 +244,9 @@ func (c *OllamaChat) SendMessage(ctx context.Context, parts ...string) (Response, error) {
           Role:    "user",
           Content: part,
        })
+       klog.Infof("sending user:\n%v", part)
     }
+
     ollamaResponse, err := c.client.doChat(ctx, c.session)
     if err != nil {
        return nil, err
@@ -213,6 +263,68 @@ func (c *OllamaChat) SendMessage(ctx context.Context, parts ...string) (Response, error) {
     return response, nil
 }
 
+func (c *OllamaClient) GenerateCompletion(ctx context.Context, request *CompletionRequest) (CompletionResponse, error) {
+    ollamaRequest := &completionRequest{
+       Model:  c.model,
+       Prompt: request.Prompt,
+       Options: map[string]any{
+          "num_ctx": 128 * 1024,
+       },
+    }
+
+    ollamaResponse, err := c.doCompletion(ctx, ollamaRequest)
+    if err != nil {
+       return nil, err
+    }
+
+    if ollamaResponse.Response == "" {
+       return nil, fmt.Errorf("no response returned from ollama")
+    }
+
+    response := &OllamaCompletionResponse{ollamaResponse: ollamaResponse}
+    return response, nil
+}
+
+func (c *OllamaClient) doCompletion(ctx context.Context, req *completionRequest) (*completionResponse, error) {
+    stream := false
+    req.Stream = &stream
+
+    body, err := json.Marshal(req)
+    if err != nil {
+       return nil, fmt.Errorf("building json body: %w", err)
+    }
+    u := c.baseURL.JoinPath("api", "generate")
+    klog.V(2).Infof("sending POST request to %v: %v", u.String(), string(body))
+    httpRequest, err := http.NewRequestWithContext(ctx, "POST", u.String(), bytes.NewReader(body))
+    if err != nil {
+       return nil, fmt.Errorf("building http request: %w", err)
+    }
+    httpRequest.Header.Set("Content-Type", "application/json")
+
+    httpResponse, err := c.httpClient.Do(httpRequest)
+    if err != nil {
+       return nil, fmt.Errorf("performing http request: %w", err)
+    }
+    defer httpResponse.Body.Close()
+
+    b, err := io.ReadAll(httpResponse.Body)
+    if err != nil {
+       return nil, fmt.Errorf("reading response body: %w", err)
+    }
+
+    klog.Infof("response is: %v", string(b))
+
+    if httpResponse.StatusCode != 200 {
+       return nil, fmt.Errorf("unexpected http status: %q with response %q", httpResponse.Status, string(b))
+    }
+
+    completionResponse := &completionResponse{}
+    if err := json.Unmarshal(b, completionResponse); err != nil {
+       return nil, fmt.Errorf("unmarshalling json response: %w", err)
+    }
+    return completionResponse, nil
+}
+
 func (c *OllamaClient) doChat(ctx context.Context, req *chatRequest) (*chatResponse, error) {
     stream := false
     req.Stream = &stream
@@ -222,7 +334,7 @@ func (c *OllamaClient) doChat(ctx context.Context, req *chatRequest) (*chatResponse, error) {
        return nil, fmt.Errorf("building json body: %w", err)
     }
     u := c.baseURL.JoinPath("api", "chat")
-    klog.Infof("sending POST request to %v: %v", u.String(), string(body))
+    klog.V(2).Infof("sending POST request to %v: %v", u.String(), string(body))
     httpRequest, err := http.NewRequestWithContext(ctx, "POST", u.String(), bytes.NewReader(body))
     if err != nil {
        return nil, fmt.Errorf("building http request: %w", err)
@@ -323,3 +435,17 @@ func (p *OllamaPart) AsFunctionCalls() ([]FunctionCall, bool) {
     }
     return functionCalls, true
 }
+
+type OllamaCompletionResponse struct {
+    ollamaResponse *completionResponse
+}
+
+var _ CompletionResponse = &OllamaCompletionResponse{}
+
+func (r *OllamaCompletionResponse) Response() string {
+    return r.ollamaResponse.Response
+}
+
+func (r *OllamaCompletionResponse) UsageMetadata() any {
+    return r.ollamaResponse
+}
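
doCompletion mirrors doChat almost line for line, posting to /api/generate instead of /api/chat; the two could plausibly share a helper later. UsageMetadata hands back the entire raw completionResponse; since that type is unexported, callers outside the llm package can only format it (e.g. with %+v), while in-package code can read the counters directly. A sketch of deriving throughput, assuming in-package access (per the Ollama API, durations are in nanoseconds):

resp, err := client.GenerateCompletion(ctx, &CompletionRequest{Prompt: prompt})
if err != nil {
    return err
}
if raw, ok := resp.UsageMetadata().(*completionResponse); ok && raw.EvalDuration > 0 {
    // eval_count tokens generated over eval_duration nanoseconds
    tokensPerSec := float64(raw.EvalCount) / (float64(raw.EvalDuration) / 1e9)
    klog.Infof("generated %d tokens (%.1f tokens/sec)", raw.EvalCount, tokensPerSec)
}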

dev/tools/controllerbuilder/pkg/llm/vertexai.go (+61 -7)

@@ -60,24 +60,24 @@ func BuildVertexAIClient(ctx context.Context) (*VertexAIClient, error) {
     if err != nil {
        return nil, fmt.Errorf("building vertexai client: %w", err)
     }
-    return &VertexAIClient{client: client}, nil
+    model := "gemini-2.0-pro-exp-02-05"
+    return &VertexAIClient{
+       client: client,
+       model:  model,
+    }, nil
 }
 
 type VertexAIClient struct {
     client *genai.Client
+    model  string
 }
 
 func (c *VertexAIClient) Close() error {
     return c.client.Close()
 }
 
 func (c *VertexAIClient) StartChat(systemPrompt string) Chat {
-    // model := c.client.GenerativeModel("vertexai-1.5-flash")
-    // model := c.client.GenerativeModel("vertexai-exp-1206")
-    // model := c.client.GenerativeModel("gemini-2.0-flash-exp")
-    model := c.client.GenerativeModel("gemini-2.0-pro-exp-02-05")
-    // model := c.client.GenerativeModel("gemma-2-27b-it")
-    // model := c.client.GenerativeModel("gemini-1.5-pro-002")
+    model := c.client.GenerativeModel(c.model)
 
     // Some values that are recommended by aistudio
     model.SetTemperature(1)
@@ -173,6 +173,45 @@ func toVertexAISchema(schema *Schema) (*genai.Schema, error) {
     // })
     // }
 
+func (c *VertexAIClient) GenerateCompletion(ctx context.Context, request *CompletionRequest) (CompletionResponse, error) {
+    log := klog.FromContext(ctx)
+
+    model := c.client.GenerativeModel(c.model)
+
+    var vertexaiParts []genai.Part
+
+    vertexaiParts = append(vertexaiParts, genai.Text(request.Prompt))
+
+    log.Info("sending GenerateContent request to vertexai", "parts", vertexaiParts)
+    vertexaiResponse, err := model.GenerateContent(ctx, vertexaiParts...)
+    if err != nil {
+       return nil, err
+    }
+
+    if len(vertexaiResponse.Candidates) > 1 {
+       klog.Infof("only considering first candidate")
+       for i := 1; i < len(vertexaiResponse.Candidates); i++ {
+          candidate := vertexaiResponse.Candidates[i]
+          klog.Infof("ignoring candidate: %q", candidate.Content)
+       }
+    }
+    var response strings.Builder
+    candidate := vertexaiResponse.Candidates[0]
+    for _, part := range candidate.Content.Parts {
+       switch part := part.(type) {
+       case genai.Text:
+          if response.Len() != 0 {
+             response.WriteString("\n")
+          }
+          response.WriteString(string(part))
+       default:
+          return nil, fmt.Errorf("unexpected type of content part: %T", part)
+       }
+    }
+
+    return &VertexAICompletionResponse{vertexaiResponse: vertexaiResponse, text: response.String()}, nil
+}
+
 func (c *VertexAIChat) SendMessage(ctx context.Context, parts ...string) (Response, error) {
     log := klog.FromContext(ctx)
     var vertexaiParts []genai.Part
@@ -256,3 +295,18 @@ func (p *VertexAIPart) AsFunctionCalls() ([]FunctionCall, bool) {
     }
     return nil, false
 }
+
+type VertexAICompletionResponse struct {
+    vertexaiResponse *genai.GenerateContentResponse
+    text             string
+}
+
+var _ CompletionResponse = &VertexAICompletionResponse{}
+
+func (r *VertexAICompletionResponse) Response() string {
+    return r.text
+}
+
+func (r *VertexAICompletionResponse) UsageMetadata() any {
+    return r.vertexaiResponse.UsageMetadata
+}
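
VertexAI gets the one fully functional GenerateCompletion in this PR: it wraps the prompt as a single genai.Text part, keeps only the first candidate, and joins that candidate's text parts with newlines. One sharp edge worth noting: candidate := vertexaiResponse.Candidates[0] is unguarded, so a response with zero candidates (for example, one entirely blocked by safety filters) would panic. A guard along these lines, not present in the commit, would turn that into an error:

// Hypothetical addition before indexing Candidates[0]:
if len(vertexaiResponse.Candidates) == 0 {
    return nil, fmt.Errorf("no candidates returned from vertexai")
}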
