@@ -3,6 +3,7 @@ package eval
 
 import (
 	"context"
+	"encoding/json"
 	"errors"
 	"fmt"
 	"strings"
@@ -15,9 +16,22 @@ import (
 	"github.com/spf13/cobra"
 )
 
-// EvaluationPromptFile represents the structure of a prompt.yml file for evaluation
-// It extends the base prompt.File with evaluation-specific fields
-type EvaluationPromptFile = prompt.File
+// EvaluationSummary represents the overall evaluation summary
+type EvaluationSummary struct {
+	Name        string       `json:"name"`
+	Description string       `json:"description"`
+	Model       string       `json:"model"`
+	TestResults []TestResult `json:"testResults"`
+	Summary     Summary      `json:"summary"`
+}
+
+// Summary represents the evaluation summary statistics
+type Summary struct {
+	TotalTests  int     `json:"totalTests"`
+	PassedTests int     `json:"passedTests"`
+	FailedTests int     `json:"failedTests"`
+	PassRate    float64 `json:"passRate"`
+}
 
 // TestResult represents the result of running a test case
 type TestResult struct {
@@ -61,12 +75,23 @@ func NewEvalCommand(cfg *command.Config) *cobra.Command {
 			  - name: contains-hello
 			    string:
 			      contains: "hello"
+
+			By default, results are displayed in a human-readable format. Use the --json flag
+			to output structured JSON data for programmatic use or integration with CI/CD pipelines.
+
+			See https://docs.github.com/github-models/use-github-models/storing-prompts-in-github-repositories#supported-file-format for more information.
 		`),
 		Example: "gh models eval my_prompt.prompt.yml",
 		Args:    cobra.ExactArgs(1),
 		RunE: func(cmd *cobra.Command, args []string) error {
 			promptFilePath := args[0]
 
+			// Get the json flag
+			jsonOutput, err := cmd.Flags().GetBool("json")
+			if err != nil {
+				return err
+			}
+
 			// Load the evaluation prompt file
 			evalFile, err := loadEvaluationPromptFile(promptFilePath)
 			if err != nil {
@@ -75,25 +100,28 @@ func NewEvalCommand(cfg *command.Config) *cobra.Command {
 
 			// Run evaluation
 			handler := &evalCommandHandler{
-				cfg:      cfg,
-				client:   cfg.Client,
-				evalFile: evalFile,
+				cfg:        cfg,
+				client:     cfg.Client,
+				evalFile:   evalFile,
+				jsonOutput: jsonOutput,
 			}
 
 			return handler.runEvaluation(cmd.Context())
 		},
 	}
 
+	cmd.Flags().Bool("json", false, "Output results in JSON format")
 	return cmd
 }
 
 type evalCommandHandler struct {
-	cfg      *command.Config
-	client   azuremodels.Client
-	evalFile *EvaluationPromptFile
+	cfg        *command.Config
+	client     azuremodels.Client
+	evalFile   *prompt.File
+	jsonOutput bool
 }
 
-func loadEvaluationPromptFile(filePath string) (*EvaluationPromptFile, error) {
+func loadEvaluationPromptFile(filePath string) (*prompt.File, error) {
 	evalFile, err := prompt.LoadFromFile(filePath)
 	if err != nil {
 		return nil, fmt.Errorf("failed to load prompt file: %w", err)
@@ -103,23 +131,31 @@ func loadEvaluationPromptFile(filePath string) (*EvaluationPromptFile, error) {
 }
 
 func (h *evalCommandHandler) runEvaluation(ctx context.Context) error {
-	h.cfg.WriteToOut(fmt.Sprintf("Running evaluation: %s\n", h.evalFile.Name))
-	h.cfg.WriteToOut(fmt.Sprintf("Description: %s\n", h.evalFile.Description))
-	h.cfg.WriteToOut(fmt.Sprintf("Model: %s\n", h.evalFile.Model))
-	h.cfg.WriteToOut(fmt.Sprintf("Test cases: %d\n", len(h.evalFile.TestData)))
-	h.cfg.WriteToOut("\n")
+	// Print header info only for human-readable output
+	if !h.jsonOutput {
+		h.cfg.WriteToOut(fmt.Sprintf("Running evaluation: %s\n", h.evalFile.Name))
+		h.cfg.WriteToOut(fmt.Sprintf("Description: %s\n", h.evalFile.Description))
+		h.cfg.WriteToOut(fmt.Sprintf("Model: %s\n", h.evalFile.Model))
+		h.cfg.WriteToOut(fmt.Sprintf("Test cases: %d\n", len(h.evalFile.TestData)))
+		h.cfg.WriteToOut("\n")
+	}
 
+	var testResults []TestResult
 	passedTests := 0
 	totalTests := len(h.evalFile.TestData)
 
 	for i, testCase := range h.evalFile.TestData {
-		h.cfg.WriteToOut(fmt.Sprintf("Running test case %d/%d...\n", i+1, totalTests))
+		if !h.jsonOutput {
+			h.cfg.WriteToOut(fmt.Sprintf("Running test case %d/%d...\n", i+1, totalTests))
+		}
 
 		result, err := h.runTestCase(ctx, testCase)
 		if err != nil {
 			return fmt.Errorf("test case %d failed: %w", i+1, err)
 		}
 
+		testResults = append(testResults, result)
+
 		// Check if all evaluators passed
 		testPassed := true
 		for _, evalResult := range result.EvaluationResults {
@@ -131,48 +167,91 @@ func (h *evalCommandHandler) runEvaluation(ctx context.Context) error {
 
 		if testPassed {
 			passedTests++
-			h.cfg.WriteToOut(" ✓ PASSED\n")
-		} else {
-			h.cfg.WriteToOut(" ✗ FAILED\n")
-			// Show the first 100 characters of the model response when test fails
-			preview := result.ModelResponse
-			if len(preview) > 100 {
-				preview = preview[:100] + "..."
-			}
-			h.cfg.WriteToOut(fmt.Sprintf(" Model Response: %s\n", preview))
 		}
 
-		// Show evaluation details
-		for _, evalResult := range result.EvaluationResults {
-			status := "✓"
-			if !evalResult.Passed {
-				status = "✗"
-			}
-			h.cfg.WriteToOut(fmt.Sprintf(" %s %s (score: %.2f)\n",
-				status, evalResult.EvaluatorName, evalResult.Score))
-			if evalResult.Details != "" {
-				h.cfg.WriteToOut(fmt.Sprintf(" %s\n", evalResult.Details))
-			}
+		if !h.jsonOutput {
+			h.printTestResult(result, testPassed)
 		}
-		h.cfg.WriteToOut("\n")
 	}
 
+	// Calculate pass rate
+	passRate := 100.0
+	if totalTests > 0 {
+		passRate = float64(passedTests) / float64(totalTests) * 100
+	}
+
+	if h.jsonOutput {
+		// Output JSON format
+		summary := EvaluationSummary{
+			Name:        h.evalFile.Name,
+			Description: h.evalFile.Description,
+			Model:       h.evalFile.Model,
+			TestResults: testResults,
+			Summary: Summary{
+				TotalTests:  totalTests,
+				PassedTests: passedTests,
+				FailedTests: totalTests - passedTests,
+				PassRate:    passRate,
+			},
+		}
+
+		jsonData, err := json.MarshalIndent(summary, "", "  ")
+		if err != nil {
+			return fmt.Errorf("failed to marshal JSON: %w", err)
+		}
+
+		h.cfg.WriteToOut(string(jsonData) + "\n")
+	} else {
+		// Output human-readable format summary
+		h.printSummary(passedTests, totalTests, passRate)
+	}
+
+	return nil
+}
+
+func (h *evalCommandHandler) printTestResult(result TestResult, testPassed bool) {
+	if testPassed {
+		h.cfg.WriteToOut(" ✓ PASSED\n")
+	} else {
+		h.cfg.WriteToOut(" ✗ FAILED\n")
+		// Show the first 100 characters of the model response when test fails
+		preview := result.ModelResponse
+		if len(preview) > 100 {
+			preview = preview[:100] + "..."
+		}
+		h.cfg.WriteToOut(fmt.Sprintf(" Model Response: %s\n", preview))
+	}
+
+	// Show evaluation details
+	for _, evalResult := range result.EvaluationResults {
+		status := "✓"
+		if !evalResult.Passed {
+			status = "✗"
+		}
+		h.cfg.WriteToOut(fmt.Sprintf(" %s %s (score: %.2f)\n",
+			status, evalResult.EvaluatorName, evalResult.Score))
+		if evalResult.Details != "" {
+			h.cfg.WriteToOut(fmt.Sprintf(" %s\n", evalResult.Details))
+		}
+	}
+	h.cfg.WriteToOut("\n")
+}
+
+func (h *evalCommandHandler) printSummary(passedTests, totalTests int, passRate float64) {
 	// Summary
 	h.cfg.WriteToOut("Evaluation Summary:\n")
 	if totalTests == 0 {
-		h.cfg.WriteToOut("Passed: 0/0 (0.0%)\n")
+		h.cfg.WriteToOut("Passed: 0/0 (0.00%)\n")
 	} else {
-		h.cfg.WriteToOut(fmt.Sprintf("Passed: %d/%d (%.1f%%)\n",
-			passedTests, totalTests, float64(passedTests)/float64(totalTests)*100))
+		h.cfg.WriteToOut(fmt.Sprintf("Passed: %d/%d (%.2f%%)\n",
+			passedTests, totalTests, passRate))
 	}
 
 	if passedTests == totalTests {
 		h.cfg.WriteToOut("🎉 All tests passed!\n")
 	} else {
 		h.cfg.WriteToOut("❌ Some tests failed.\n")
 	}
-
-	return nil
 }
 
 func (h *evalCommandHandler) runTestCase(ctx context.Context, testCase map[string]interface{}) (TestResult, error) {
@@ -210,16 +289,9 @@ func (h *evalCommandHandler) templateMessages(testCase map[string]interface{}) (
 			return nil, fmt.Errorf("failed to template message content: %w", err)
 		}
 
-		var role azuremodels.ChatMessageRole
-		switch strings.ToLower(msg.Role) {
-		case "system":
-			role = azuremodels.ChatMessageRoleSystem
-		case "user":
-			role = azuremodels.ChatMessageRoleUser
-		case "assistant":
-			role = azuremodels.ChatMessageRoleAssistant
-		default:
-			return nil, fmt.Errorf("unknown message role: %s", msg.Role)
+		role, err := prompt.GetAzureChatMessageRole(msg.Role)
+		if err != nil {
+			return nil, err
 		}
 
 		messages = append(messages, azuremodels.ChatMessage{
@@ -236,22 +308,7 @@ func (h *evalCommandHandler) templateString(templateStr string, data map[string]
 }
 
 func (h *evalCommandHandler) callModel(ctx context.Context, messages []azuremodels.ChatMessage) (string, error) {
-	req := azuremodels.ChatCompletionOptions{
-		Messages: messages,
-		Model:    h.evalFile.Model,
-		Stream:   false,
-	}
-
-	// Apply model parameters
-	if h.evalFile.ModelParameters.MaxTokens != nil {
-		req.MaxTokens = h.evalFile.ModelParameters.MaxTokens
-	}
-	if h.evalFile.ModelParameters.Temperature != nil {
-		req.Temperature = h.evalFile.ModelParameters.Temperature
-	}
-	if h.evalFile.ModelParameters.TopP != nil {
-		req.TopP = h.evalFile.ModelParameters.TopP
-	}
+	req := h.evalFile.BuildChatCompletionOptions(messages)
 
 	resp, err := h.client.GetChatCompletionStream(ctx, req)
 	if err != nil {
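
For reference, a minimal sketch of how the --json output added in this diff could be consumed in a CI step. The field names mirror the EvaluationSummary and Summary structs above; the file name evalsummary.go and the exact invocation are illustrative assumptions, not part of this commit.

// evalsummary.go — hypothetical consumer of `gh models eval --json` output.
// Usage sketch:
//
//	gh models eval my_prompt.prompt.yml --json | go run ./evalsummary.go
package main

import (
	"encoding/json"
	"fmt"
	"os"
)

// summary mirrors the Summary struct defined in the diff above.
type summary struct {
	TotalTests  int     `json:"totalTests"`
	PassedTests int     `json:"passedTests"`
	FailedTests int     `json:"failedTests"`
	PassRate    float64 `json:"passRate"`
}

// evaluationSummary mirrors (a subset of) the EvaluationSummary struct above.
type evaluationSummary struct {
	Name    string  `json:"name"`
	Model   string  `json:"model"`
	Summary summary `json:"summary"`
}

func main() {
	// Decode the JSON summary from stdin.
	var result evaluationSummary
	if err := json.NewDecoder(os.Stdin).Decode(&result); err != nil {
		fmt.Fprintln(os.Stderr, "failed to decode eval output:", err)
		os.Exit(2)
	}

	fmt.Printf("%s (%s): %d/%d passed (%.2f%%)\n",
		result.Name, result.Model,
		result.Summary.PassedTests, result.Summary.TotalTests,
		result.Summary.PassRate)

	if result.Summary.FailedTests > 0 {
		os.Exit(1) // non-zero exit fails the CI job when any test failed
	}
}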