Skip to content

Commit ffed560

Browse files
Add advanced health check configuration to config file
Signed-off-by: Carlos Eduardo Arango Gutierrez <[email protected]>
1 parent 0cebf7a commit ffed560

File tree

6 files changed

+1168
-41
lines changed

6 files changed

+1168
-41
lines changed

api/config/v1/config.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ type Config struct {
3737
Resources Resources `json:"resources,omitempty" yaml:"resources,omitempty"`
3838
Sharing Sharing `json:"sharing,omitempty" yaml:"sharing,omitempty"`
3939
Imex Imex `json:"imex,omitempty" yaml:"imex,omitempty"`
40+
Health *Health `json:"health,omitempty" yaml:"health,omitempty"`
4041
}
4142

4243
// NewConfig builds out a Config struct from a config file (or command line flags).
@@ -77,6 +78,16 @@ func NewConfig(c *cli.Context, flags []cli.Flag) (*Config, error) {
7778
config.Sharing.MPS.FailRequestsGreaterThanOne = true
7879
}
7980

81+
// Initialize health configuration with defaults if not specified
82+
if config.Health == nil {
83+
config.Health = DefaultHealth()
84+
}
85+
86+
// Validate health configuration
87+
if err := config.Health.Validate(); err != nil {
88+
return nil, fmt.Errorf("invalid health configuration: %v", err)
89+
}
90+
8091
return config, nil
8192
}
8293

api/config/v1/health.go

Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
/*
2+
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package v1
18+
19+
import (
20+
"encoding/json"
21+
"fmt"
22+
"strings"
23+
)
24+
25+
// Health defines the configuration for device health checks
26+
type Health struct {
27+
// Disabled indicates whether health checks are disabled entirely
28+
Disabled bool `json:"disabled,omitempty" yaml:"disabled,omitempty"`
29+
// EventTypes specifies which NVML event types to monitor
30+
EventTypes []string `json:"eventTypes,omitempty" yaml:"eventTypes,omitempty"`
31+
// IgnoredXIDs lists XIDs that should be ignored (non-fatal)
32+
IgnoredXIDs []uint64 `json:"ignoredXIDs,omitempty" yaml:"ignoredXIDs,omitempty"`
33+
// CriticalXIDs specifies which XIDs are considered critical
34+
// Can be "all" or a list of specific XIDs
35+
CriticalXIDs *CriticalXIDsType `json:"criticalXIDs,omitempty" yaml:"criticalXIDs,omitempty"`
36+
}
37+
38+
// CriticalXIDsType models a setting that is either the keyword "all" or an
// explicit list of XIDs.
type CriticalXIDsType struct {
	// All is true when every XID is to be treated as critical.
	All bool
	// XIDs holds the individual XIDs to treat as critical.
	XIDs []uint64
}
45+
46+
// UnmarshalJSON implements custom JSON unmarshaling for CriticalXIDsType
47+
func (c *CriticalXIDsType) UnmarshalJSON(data []byte) error {
48+
// Try to unmarshal as string first
49+
var str string
50+
if err := json.Unmarshal(data, &str); err == nil {
51+
if strings.ToLower(str) == "all" {
52+
c.All = true
53+
c.XIDs = nil
54+
return nil
55+
}
56+
return fmt.Errorf("invalid string value for criticalXIDs: %s", str)
57+
}
58+
59+
// Try to unmarshal as array of numbers
60+
var xids []uint64
61+
if err := json.Unmarshal(data, &xids); err == nil {
62+
c.All = false
63+
c.XIDs = xids
64+
return nil
65+
}
66+
67+
return fmt.Errorf("criticalXIDs must be either \"all\" or an array of numbers")
68+
}
69+
70+
// MarshalJSON implements custom JSON marshaling for CriticalXIDsType
71+
func (c CriticalXIDsType) MarshalJSON() ([]byte, error) {
72+
if c.All {
73+
return json.Marshal("all")
74+
}
75+
return json.Marshal(c.XIDs)
76+
}
77+
78+
// UnmarshalYAML implements custom YAML unmarshaling for CriticalXIDsType
79+
func (c *CriticalXIDsType) UnmarshalYAML(unmarshal func(interface{}) error) error {
80+
// Try to unmarshal as string first
81+
var str string
82+
if err := unmarshal(&str); err == nil {
83+
if strings.ToLower(str) == "all" {
84+
c.All = true
85+
c.XIDs = nil
86+
return nil
87+
}
88+
return fmt.Errorf("invalid string value for criticalXIDs: %s", str)
89+
}
90+
91+
// Try to unmarshal as array of numbers
92+
var xids []uint64
93+
if err := unmarshal(&xids); err == nil {
94+
c.All = false
95+
c.XIDs = xids
96+
return nil
97+
}
98+
99+
return fmt.Errorf("criticalXIDs must be either \"all\" or an array of numbers")
100+
}
101+
102+
// MarshalYAML implements custom YAML marshaling for CriticalXIDsType
103+
func (c CriticalXIDsType) MarshalYAML() (interface{}, error) {
104+
if c.All {
105+
return "all", nil
106+
}
107+
return c.XIDs, nil
108+
}
109+
110+
// DefaultHealth returns the default health configuration for standard deployments
111+
func DefaultHealth() *Health {
112+
return &Health{
113+
Disabled: false,
114+
EventTypes: []string{
115+
"EventTypeXidCriticalError",
116+
"EventTypeDoubleBitEccError",
117+
"EventTypeSingleBitEccError",
118+
},
119+
IgnoredXIDs: []uint64{
120+
13, // Graphics Engine Exception
121+
31, // GPU memory page fault
122+
43, // GPU stopped processing
123+
45, // Preemptive cleanup, due to previous errors
124+
68, // Video processor exception
125+
109, // Context Switch Timeout Error
126+
},
127+
CriticalXIDs: &CriticalXIDsType{
128+
All: true,
129+
},
130+
}
131+
}
132+
133+
// IsCritical checks if a given XID should be treated as critical
134+
func (h *Health) IsCritical(xid uint64) bool {
135+
// If health checks are disabled, nothing is critical
136+
if h.Disabled {
137+
return false
138+
}
139+
140+
// Check if XID is in ignored list
141+
for _, ignoredXID := range h.IgnoredXIDs {
142+
if xid == ignoredXID {
143+
return false
144+
}
145+
}
146+
147+
// If no critical XIDs specified, default to all
148+
if h.CriticalXIDs == nil {
149+
return true
150+
}
151+
152+
// If all XIDs are critical (except ignored ones)
153+
if h.CriticalXIDs.All {
154+
return true
155+
}
156+
157+
// Check if XID is in critical list
158+
for _, criticalXID := range h.CriticalXIDs.XIDs {
159+
if xid == criticalXID {
160+
return true
161+
}
162+
}
163+
164+
return false
165+
}
166+
167+
// Validate checks if the health configuration is valid
168+
func (h *Health) Validate() error {
169+
if h == nil {
170+
return nil
171+
}
172+
173+
// Validate event types
174+
validEventTypes := map[string]bool{
175+
"EventTypeXidCriticalError": true,
176+
"EventTypeDoubleBitEccError": true,
177+
"EventTypeSingleBitEccError": true,
178+
}
179+
180+
for _, eventType := range h.EventTypes {
181+
if !validEventTypes[eventType] {
182+
return fmt.Errorf("invalid event type: %s", eventType)
183+
}
184+
}
185+
186+
// Check for XID conflicts
187+
if h.CriticalXIDs != nil && !h.CriticalXIDs.All && len(h.CriticalXIDs.XIDs) > 0 {
188+
ignoredMap := make(map[uint64]bool)
189+
for _, xid := range h.IgnoredXIDs {
190+
ignoredMap[xid] = true
191+
}
192+
193+
for _, xid := range h.CriticalXIDs.XIDs {
194+
if ignoredMap[xid] {
195+
return fmt.Errorf("XID %d is in both ignored and critical lists", xid)
196+
}
197+
}
198+
}
199+
200+
return nil
201+
}
202+
203+
// Merge applies values from another Health config, overriding only non-nil/non-empty values
204+
func (h *Health) Merge(other *Health) {
205+
if other == nil {
206+
return
207+
}
208+
209+
if other.Disabled {
210+
h.Disabled = other.Disabled
211+
}
212+
213+
if len(other.EventTypes) > 0 {
214+
h.EventTypes = other.EventTypes
215+
}
216+
217+
if len(other.IgnoredXIDs) > 0 {
218+
h.IgnoredXIDs = other.IgnoredXIDs
219+
}
220+
221+
if other.CriticalXIDs != nil {
222+
h.CriticalXIDs = other.CriticalXIDs
223+
}
224+
}

0 commit comments

Comments
 (0)