Skip to content

Commit 392c27b

Browse files
authored
Add eval command (#54)
2 parents 5445685 + f70f858 commit 392c27b

File tree

13 files changed

+1434
-40
lines changed

13 files changed

+1434
-40
lines changed

cmd/eval/builtins.go

Lines changed: 386 additions & 0 deletions
Large diffs are not rendered by default.

cmd/eval/eval.go

Lines changed: 441 additions & 0 deletions
Large diffs are not rendered by default.

cmd/eval/eval_test.go

Lines changed: 301 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,301 @@
1+
package eval
2+
3+
import (
4+
"bytes"
5+
"context"
6+
"os"
7+
"path/filepath"
8+
"testing"
9+
10+
"github.com/github/gh-models/internal/azuremodels"
11+
"github.com/github/gh-models/internal/sse"
12+
"github.com/github/gh-models/pkg/command"
13+
"github.com/github/gh-models/pkg/prompt"
14+
"github.com/stretchr/testify/require"
15+
)
16+
17+
// TestEval groups the unit and integration subtests for the eval command:
// prompt-file loading, message templating, the string and plugin evaluators,
// command construction, and two end-to-end runs against a mock client.
func TestEval(t *testing.T) {
18+
// Writes a prompt YAML document to a temp file and checks that
// prompt.LoadFromFile returns the expected name, description, model,
// model parameters, and the right number of test cases, messages and evaluators.
t.Run("loads and parses evaluation prompt file", func(t *testing.T) {
19+
const yamlBody = `
20+
name: Test Evaluation
21+
description: A test evaluation
22+
model: openai/gpt-4o
23+
modelParameters:
24+
temperature: 0.5
25+
maxTokens: 100
26+
testData:
27+
- input: "hello"
28+
expected: "hello world"
29+
- input: "goodbye"
30+
expected: "goodbye world"
31+
messages:
32+
- role: system
33+
content: You are a helpful assistant.
34+
- role: user
35+
content: "Please respond to: {{input}}"
36+
evaluators:
37+
- name: contains-world
38+
string:
39+
contains: "world"
40+
- name: similarity-check
41+
uses: github/similarity
42+
`
43+
44+
// The fixture is written into t.TempDir() so it is loaded from a real file path.
tmpDir := t.TempDir()
45+
promptFile := filepath.Join(tmpDir, "test.prompt.yml")
46+
err := os.WriteFile(promptFile, []byte(yamlBody), 0644)
47+
require.NoError(t, err)
48+
49+
evalFile, err := prompt.LoadFromFile(promptFile)
50+
require.NoError(t, err)
51+
require.Equal(t, "Test Evaluation", evalFile.Name)
52+
require.Equal(t, "A test evaluation", evalFile.Description)
53+
require.Equal(t, "openai/gpt-4o", evalFile.Model)
54+
// Temperature and MaxTokens are dereferenced, so they are pointer fields on ModelParameters.
require.Equal(t, 0.5, *evalFile.ModelParameters.Temperature)
55+
require.Equal(t, 100, *evalFile.ModelParameters.MaxTokens)
56+
require.Len(t, evalFile.TestData, 2)
57+
require.Len(t, evalFile.Messages, 2)
58+
require.Len(t, evalFile.Evaluators, 2)
59+
})
60+
61+
// Checks that templateMessages substitutes the {{input}} and {{expected}}
// placeholders with the test case values and leaves placeholder-free
// messages unchanged.
t.Run("templates messages correctly", func(t *testing.T) {
62+
evalFile := &prompt.File{
63+
Messages: []prompt.Message{
64+
{Role: "system", Content: "You are helpful."},
65+
{Role: "user", Content: "Process {{input}} and return {{expected}}"},
66+
},
67+
}
68+
69+
handler := &evalCommandHandler{evalFile: evalFile}
70+
testCase := map[string]interface{}{
71+
"input": "hello",
72+
"expected": "world",
73+
}
74+
75+
messages, err := handler.templateMessages(testCase)
76+
require.NoError(t, err)
77+
require.Len(t, messages, 2)
78+
require.Equal(t, "You are helpful.", *messages[0].Content)
79+
require.Equal(t, "Process hello and return world", *messages[1].Content)
80+
})
81+
82+
// Table-driven check of runStringEvaluator for the Contains, Equals,
// StartsWith and EndsWith criteria.
t.Run("string evaluator works correctly", func(t *testing.T) {
83+
handler := &evalCommandHandler{}
84+
85+
tests := []struct {
86+
name string
87+
evaluator prompt.StringEvaluator
88+
response string
89+
expected bool
90+
}{
91+
{
92+
name: "contains match",
93+
evaluator: prompt.StringEvaluator{Contains: "world"},
94+
response: "hello world",
95+
expected: true,
96+
},
97+
{
98+
name: "contains no match",
99+
evaluator: prompt.StringEvaluator{Contains: "world"},
100+
response: "hello there",
101+
expected: false,
102+
},
103+
{
104+
name: "equals match",
105+
evaluator: prompt.StringEvaluator{Equals: "exact"},
106+
response: "exact",
107+
expected: true,
108+
},
109+
{
110+
name: "equals no match",
111+
evaluator: prompt.StringEvaluator{Equals: "exact"},
112+
response: "not exact",
113+
expected: false,
114+
},
115+
{
116+
name: "starts with match",
117+
evaluator: prompt.StringEvaluator{StartsWith: "hello"},
118+
response: "hello world",
119+
expected: true,
120+
},
121+
{
122+
name: "ends with match",
123+
evaluator: prompt.StringEvaluator{EndsWith: "world"},
124+
response: "hello world",
125+
expected: true,
126+
},
127+
}
128+
129+
// Each table entry runs as its own named subtest.
for _, tt := range tests {
130+
t.Run(tt.name, func(t *testing.T) {
131+
result, err := handler.runStringEvaluator("test", tt.evaluator, tt.response)
132+
require.NoError(t, err)
133+
require.Equal(t, tt.expected, result.Passed)
134+
// The score is expected to be binary: 1.0 on a pass, 0.0 on a failure.
if tt.expected {
135+
require.Equal(t, 1.0, result.Score)
136+
} else {
137+
require.Equal(t, 0.0, result.Score)
138+
}
139+
})
140+
}
141+
})
142+
143+
// Runs the "github/similarity" plugin evaluator against a mock client
// whose streamed reply is "4", and expects a passing result with score 0.75.
t.Run("plugin evaluator works with github/similarity", func(t *testing.T) {
144+
out := new(bytes.Buffer)
145+
client := azuremodels.NewMockClient()
146+
// The same buffer is passed for both of the first two arguments
// (presumably the stdout and stderr writers; confirm against command.NewConfig).
cfg := command.NewConfig(out, out, client, true, 100)
147+
148+
// Mock a response that returns "4" for the LLM evaluator
149+
client.MockGetChatCompletionStream = func(ctx context.Context, req azuremodels.ChatCompletionOptions) (*azuremodels.ChatCompletionResponse, error) {
150+
reader := sse.NewMockEventReader([]azuremodels.ChatCompletion{
151+
{
152+
Choices: []azuremodels.ChatChoice{
153+
{
154+
Message: &azuremodels.ChatChoiceMessage{
155+
// Content is a *string; the inline closure yields a pointer to the literal.
Content: func() *string { s := "4"; return &s }(),
156+
},
157+
},
158+
},
159+
},
160+
})
161+
return &azuremodels.ChatCompletionResponse{Reader: reader}, nil
162+
}
163+
164+
handler := &evalCommandHandler{
165+
cfg: cfg,
166+
client: client,
167+
}
168+
testCase := map[string]interface{}{
169+
"input": "test question",
170+
"expected": "test answer",
171+
}
172+
173+
result, err := handler.runPluginEvaluator(context.Background(), "similarity", "github/similarity", testCase, "test response")
174+
require.NoError(t, err)
175+
require.Equal(t, "similarity", result.EvaluatorName)
176+
require.Equal(t, 0.75, result.Score) // Score for choice "4"
177+
require.True(t, result.Passed)
178+
})
179+
180+
// Checks the metadata of the command returned by NewEvalCommand:
// its Use string and the start of its Short description.
t.Run("command creation works", func(t *testing.T) {
181+
out := new(bytes.Buffer)
182+
client := azuremodels.NewMockClient()
183+
cfg := command.NewConfig(out, out, client, true, 100)
184+
185+
cmd := NewEvalCommand(cfg)
186+
require.Equal(t, "eval", cmd.Use)
187+
require.Contains(t, cmd.Short, "Evaluate prompts")
188+
})
189+
190+
// End-to-end run: executes the eval command on a prompt file with a mock
// client whose reply satisfies the "contains" evaluator, then checks the
// captured output for the evaluation name, progress text and "PASSED".
t.Run("integration test with mock client", func(t *testing.T) {
191+
const yamlBody = `
192+
name: Mock Test
193+
description: Test with mock client
194+
model: openai/test-model
195+
testData:
196+
- input: "test input"
197+
expected: "test response"
198+
messages:
199+
- role: user
200+
content: "{{input}}"
201+
evaluators:
202+
- name: contains-test
203+
string:
204+
contains: "test"
205+
`
206+
207+
tmpDir := t.TempDir()
208+
promptFile := filepath.Join(tmpDir, "test.prompt.yml")
209+
err := os.WriteFile(promptFile, []byte(yamlBody), 0644)
210+
require.NoError(t, err)
211+
212+
client := azuremodels.NewMockClient()
213+
214+
// Mock a simple response
215+
client.MockGetChatCompletionStream = func(ctx context.Context, req azuremodels.ChatCompletionOptions) (*azuremodels.ChatCompletionResponse, error) {
216+
// Create a mock reader that returns "test response"
217+
reader := sse.NewMockEventReader([]azuremodels.ChatCompletion{
218+
{
219+
Choices: []azuremodels.ChatChoice{
220+
{
221+
Message: &azuremodels.ChatChoiceMessage{
222+
Content: func() *string { s := "test response"; return &s }(),
223+
},
224+
},
225+
},
226+
},
227+
})
228+
return &azuremodels.ChatCompletionResponse{Reader: reader}, nil
229+
}
230+
231+
out := new(bytes.Buffer)
232+
cfg := command.NewConfig(out, out, client, true, 100)
233+
234+
// The prompt file path is supplied as the command's only positional argument.
cmd := NewEvalCommand(cfg)
235+
cmd.SetArgs([]string{promptFile})
236+
237+
err = cmd.Execute()
238+
require.NoError(t, err)
239+
240+
output := out.String()
241+
require.Contains(t, output, "Mock Test")
242+
require.Contains(t, output, "Running test case")
243+
require.Contains(t, output, "PASSED")
244+
})
245+
246+
// End-to-end run where the mock reply does not contain the text the
// evaluator looks for; the output must report "FAILED" and include the
// model's actual response.
t.Run("logs model response when test fails", func(t *testing.T) {
247+
const yamlBody = `
248+
name: Failing Test
249+
description: Test that fails to check model response logging
250+
model: openai/test-model
251+
testData:
252+
- input: "test input"
253+
expected: "expected but not returned"
254+
messages:
255+
- role: user
256+
content: "{{input}}"
257+
evaluators:
258+
- name: contains-nonexistent
259+
string:
260+
contains: "nonexistent text"
261+
`
262+
263+
tmpDir := t.TempDir()
264+
promptFile := filepath.Join(tmpDir, "test.prompt.yml")
265+
err := os.WriteFile(promptFile, []byte(yamlBody), 0644)
266+
require.NoError(t, err)
267+
268+
client := azuremodels.NewMockClient()
269+
270+
// Mock a response that will fail the evaluator
271+
client.MockGetChatCompletionStream = func(ctx context.Context, req azuremodels.ChatCompletionOptions) (*azuremodels.ChatCompletionResponse, error) {
272+
reader := sse.NewMockEventReader([]azuremodels.ChatCompletion{
273+
{
274+
Choices: []azuremodels.ChatChoice{
275+
{
276+
Message: &azuremodels.ChatChoiceMessage{
277+
Content: func() *string { s := "actual model response"; return &s }(),
278+
},
279+
},
280+
},
281+
},
282+
})
283+
return &azuremodels.ChatCompletionResponse{Reader: reader}, nil
284+
}
285+
286+
out := new(bytes.Buffer)
287+
cfg := command.NewConfig(out, out, client, true, 100)
288+
289+
cmd := NewEvalCommand(cfg)
290+
cmd.SetArgs([]string{promptFile})
291+
292+
err = cmd.Execute()
293+
// A failing evaluator is expected to surface in the output, not as a command error.
require.NoError(t, err)
294+
295+
output := out.String()
296+
require.Contains(t, output, "Failing Test")
297+
require.Contains(t, output, "Running test case")
298+
require.Contains(t, output, "FAILED")
299+
require.Contains(t, output, "Model Response: actual model response")
300+
})
301+
}

cmd/root.go

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@ import (
88
"github.com/MakeNowJust/heredoc"
99
"github.com/cli/go-gh/v2/pkg/auth"
1010
"github.com/cli/go-gh/v2/pkg/term"
11+
"github.com/github/gh-models/cmd/eval"
1112
"github.com/github/gh-models/cmd/list"
1213
"github.com/github/gh-models/cmd/run"
1314
"github.com/github/gh-models/cmd/view"
@@ -54,6 +55,7 @@ func NewRootCommand() *cobra.Command {
5455

5556
cfg := command.NewConfigWithTerminal(terminal, client)
5657

58+
cmd.AddCommand(eval.NewEvalCommand(cfg))
5759
cmd.AddCommand(list.NewListCommand(cfg))
5860
cmd.AddCommand(run.NewRunCommand(cfg))
5961
cmd.AddCommand(view.NewViewCommand(cfg))

cmd/root_test.go

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@ func TestRoot(t *testing.T) {
1919
require.NoError(t, err)
2020
output := buf.String()
2121
require.Regexp(t, regexp.MustCompile(`Usage:\n\s+gh models \[command\]`), output)
22+
require.Regexp(t, regexp.MustCompile(`eval\s+Evaluate prompts using test data and evaluators`), output)
2223
require.Regexp(t, regexp.MustCompile(`list\s+List available models`), output)
2324
require.Regexp(t, regexp.MustCompile(`run\s+Run inference with the specified model`), output)
2425
require.Regexp(t, regexp.MustCompile(`view\s+View details about a model`), output)

0 commit comments

Comments
 (0)