|  | 
|  | 1 | +/* | 
|  | 2 | + * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved. | 
|  | 3 | + * | 
|  | 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); | 
|  | 5 | + * you may not use this file except in compliance with the License. | 
|  | 6 | + * You may obtain a copy of the License at | 
|  | 7 | + * | 
|  | 8 | + *     http://www.apache.org/licenses/LICENSE-2.0 | 
|  | 9 | + * | 
|  | 10 | + * Unless required by applicable law or agreed to in writing, software | 
|  | 11 | + * distributed under the License is distributed on an "AS IS" BASIS, | 
|  | 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
|  | 13 | + * See the License for the specific language governing permissions and | 
|  | 14 | + * limitations under the License. | 
|  | 15 | + */ | 
|  | 16 | + | 
|  | 17 | +package v1 | 
|  | 18 | + | 
|  | 19 | +import ( | 
|  | 20 | +	"encoding/json" | 
|  | 21 | +	"fmt" | 
|  | 22 | +	"strings" | 
|  | 23 | +) | 
|  | 24 | + | 
|  | 25 | +// Health defines the configuration for device health checks | 
|  | 26 | +type Health struct { | 
|  | 27 | +	// Disabled indicates whether health checks are disabled entirely | 
|  | 28 | +	Disabled bool `json:"disabled,omitempty" yaml:"disabled,omitempty"` | 
|  | 29 | +	// EventTypes specifies which NVML event types to monitor | 
|  | 30 | +	EventTypes []string `json:"eventTypes,omitempty" yaml:"eventTypes,omitempty"` | 
|  | 31 | +	// IgnoredXIDs lists XIDs that should be ignored (non-fatal) | 
|  | 32 | +	IgnoredXIDs []uint64 `json:"ignoredXIDs,omitempty" yaml:"ignoredXIDs,omitempty"` | 
|  | 33 | +	// CriticalXIDs specifies which XIDs are considered critical | 
|  | 34 | +	// Can be "all" or a list of specific XIDs | 
|  | 35 | +	CriticalXIDs *CriticalXIDsType `json:"criticalXIDs,omitempty" yaml:"criticalXIDs,omitempty"` | 
|  | 36 | +} | 
|  | 37 | + | 
// CriticalXIDsType models the criticalXIDs setting, which is either the
// keyword "all" or an explicit list of XIDs.
type CriticalXIDsType struct {
	// All is true when every XID is to be considered critical.
	All bool
	// XIDs holds the individual XIDs to treat as critical.
	XIDs []uint64
}
|  | 45 | + | 
|  | 46 | +// UnmarshalJSON implements custom JSON unmarshaling for CriticalXIDsType | 
|  | 47 | +func (c *CriticalXIDsType) UnmarshalJSON(data []byte) error { | 
|  | 48 | +	// Try to unmarshal as string first | 
|  | 49 | +	var str string | 
|  | 50 | +	if err := json.Unmarshal(data, &str); err == nil { | 
|  | 51 | +		if strings.ToLower(str) == "all" { | 
|  | 52 | +			c.All = true | 
|  | 53 | +			c.XIDs = nil | 
|  | 54 | +			return nil | 
|  | 55 | +		} | 
|  | 56 | +		return fmt.Errorf("invalid string value for criticalXIDs: %s", str) | 
|  | 57 | +	} | 
|  | 58 | + | 
|  | 59 | +	// Try to unmarshal as array of numbers | 
|  | 60 | +	var xids []uint64 | 
|  | 61 | +	if err := json.Unmarshal(data, &xids); err == nil { | 
|  | 62 | +		c.All = false | 
|  | 63 | +		c.XIDs = xids | 
|  | 64 | +		return nil | 
|  | 65 | +	} | 
|  | 66 | + | 
|  | 67 | +	return fmt.Errorf("criticalXIDs must be either \"all\" or an array of numbers") | 
|  | 68 | +} | 
|  | 69 | + | 
|  | 70 | +// MarshalJSON implements custom JSON marshaling for CriticalXIDsType | 
|  | 71 | +func (c CriticalXIDsType) MarshalJSON() ([]byte, error) { | 
|  | 72 | +	if c.All { | 
|  | 73 | +		return json.Marshal("all") | 
|  | 74 | +	} | 
|  | 75 | +	return json.Marshal(c.XIDs) | 
|  | 76 | +} | 
|  | 77 | + | 
|  | 78 | +// UnmarshalYAML implements custom YAML unmarshaling for CriticalXIDsType | 
|  | 79 | +func (c *CriticalXIDsType) UnmarshalYAML(unmarshal func(interface{}) error) error { | 
|  | 80 | +	// Try to unmarshal as string first | 
|  | 81 | +	var str string | 
|  | 82 | +	if err := unmarshal(&str); err == nil { | 
|  | 83 | +		if strings.ToLower(str) == "all" { | 
|  | 84 | +			c.All = true | 
|  | 85 | +			c.XIDs = nil | 
|  | 86 | +			return nil | 
|  | 87 | +		} | 
|  | 88 | +		return fmt.Errorf("invalid string value for criticalXIDs: %s", str) | 
|  | 89 | +	} | 
|  | 90 | + | 
|  | 91 | +	// Try to unmarshal as array of numbers | 
|  | 92 | +	var xids []uint64 | 
|  | 93 | +	if err := unmarshal(&xids); err == nil { | 
|  | 94 | +		c.All = false | 
|  | 95 | +		c.XIDs = xids | 
|  | 96 | +		return nil | 
|  | 97 | +	} | 
|  | 98 | + | 
|  | 99 | +	return fmt.Errorf("criticalXIDs must be either \"all\" or an array of numbers") | 
|  | 100 | +} | 
|  | 101 | + | 
|  | 102 | +// MarshalYAML implements custom YAML marshaling for CriticalXIDsType | 
|  | 103 | +func (c CriticalXIDsType) MarshalYAML() (interface{}, error) { | 
|  | 104 | +	if c.All { | 
|  | 105 | +		return "all", nil | 
|  | 106 | +	} | 
|  | 107 | +	return c.XIDs, nil | 
|  | 108 | +} | 
|  | 109 | + | 
|  | 110 | +// DefaultHealth returns the default health configuration for standard deployments | 
|  | 111 | +func DefaultHealth() *Health { | 
|  | 112 | +	return &Health{ | 
|  | 113 | +		Disabled: false, | 
|  | 114 | +		EventTypes: []string{ | 
|  | 115 | +			"EventTypeXidCriticalError", | 
|  | 116 | +			"EventTypeDoubleBitEccError", | 
|  | 117 | +			"EventTypeSingleBitEccError", | 
|  | 118 | +		}, | 
|  | 119 | +		IgnoredXIDs: []uint64{ | 
|  | 120 | +			13,  // Graphics Engine Exception | 
|  | 121 | +			31,  // GPU memory page fault | 
|  | 122 | +			43,  // GPU stopped processing | 
|  | 123 | +			45,  // Preemptive cleanup, due to previous errors | 
|  | 124 | +			68,  // Video processor exception | 
|  | 125 | +			109, // Context Switch Timeout Error | 
|  | 126 | +		}, | 
|  | 127 | +		CriticalXIDs: &CriticalXIDsType{ | 
|  | 128 | +			All: true, | 
|  | 129 | +		}, | 
|  | 130 | +	} | 
|  | 131 | +} | 
|  | 132 | + | 
|  | 133 | +// IsCritical checks if a given XID should be treated as critical | 
|  | 134 | +func (h *Health) IsCritical(xid uint64) bool { | 
|  | 135 | +	// If health checks are disabled, nothing is critical | 
|  | 136 | +	if h.Disabled { | 
|  | 137 | +		return false | 
|  | 138 | +	} | 
|  | 139 | + | 
|  | 140 | +	// Check if XID is in ignored list | 
|  | 141 | +	for _, ignoredXID := range h.IgnoredXIDs { | 
|  | 142 | +		if xid == ignoredXID { | 
|  | 143 | +			return false | 
|  | 144 | +		} | 
|  | 145 | +	} | 
|  | 146 | + | 
|  | 147 | +	// If no critical XIDs specified, default to all | 
|  | 148 | +	if h.CriticalXIDs == nil { | 
|  | 149 | +		return true | 
|  | 150 | +	} | 
|  | 151 | + | 
|  | 152 | +	// If all XIDs are critical (except ignored ones) | 
|  | 153 | +	if h.CriticalXIDs.All { | 
|  | 154 | +		return true | 
|  | 155 | +	} | 
|  | 156 | + | 
|  | 157 | +	// Check if XID is in critical list | 
|  | 158 | +	for _, criticalXID := range h.CriticalXIDs.XIDs { | 
|  | 159 | +		if xid == criticalXID { | 
|  | 160 | +			return true | 
|  | 161 | +		} | 
|  | 162 | +	} | 
|  | 163 | + | 
|  | 164 | +	return false | 
|  | 165 | +} | 
|  | 166 | + | 
|  | 167 | +// Validate checks if the health configuration is valid | 
|  | 168 | +func (h *Health) Validate() error { | 
|  | 169 | +	if h == nil { | 
|  | 170 | +		return nil | 
|  | 171 | +	} | 
|  | 172 | + | 
|  | 173 | +	// Validate event types | 
|  | 174 | +	validEventTypes := map[string]bool{ | 
|  | 175 | +		"EventTypeXidCriticalError":  true, | 
|  | 176 | +		"EventTypeDoubleBitEccError": true, | 
|  | 177 | +		"EventTypeSingleBitEccError": true, | 
|  | 178 | +	} | 
|  | 179 | + | 
|  | 180 | +	for _, eventType := range h.EventTypes { | 
|  | 181 | +		if !validEventTypes[eventType] { | 
|  | 182 | +			return fmt.Errorf("invalid event type: %s", eventType) | 
|  | 183 | +		} | 
|  | 184 | +	} | 
|  | 185 | + | 
|  | 186 | +	// Check for XID conflicts | 
|  | 187 | +	if h.CriticalXIDs != nil && !h.CriticalXIDs.All && len(h.CriticalXIDs.XIDs) > 0 { | 
|  | 188 | +		ignoredMap := make(map[uint64]bool) | 
|  | 189 | +		for _, xid := range h.IgnoredXIDs { | 
|  | 190 | +			ignoredMap[xid] = true | 
|  | 191 | +		} | 
|  | 192 | + | 
|  | 193 | +		for _, xid := range h.CriticalXIDs.XIDs { | 
|  | 194 | +			if ignoredMap[xid] { | 
|  | 195 | +				return fmt.Errorf("XID %d is in both ignored and critical lists", xid) | 
|  | 196 | +			} | 
|  | 197 | +		} | 
|  | 198 | +	} | 
|  | 199 | + | 
|  | 200 | +	return nil | 
|  | 201 | +} | 
|  | 202 | + | 
|  | 203 | +// Merge applies values from another Health config, overriding only non-nil/non-empty values | 
|  | 204 | +func (h *Health) Merge(other *Health) { | 
|  | 205 | +	if other == nil { | 
|  | 206 | +		return | 
|  | 207 | +	} | 
|  | 208 | + | 
|  | 209 | +	if other.Disabled { | 
|  | 210 | +		h.Disabled = other.Disabled | 
|  | 211 | +	} | 
|  | 212 | + | 
|  | 213 | +	if len(other.EventTypes) > 0 { | 
|  | 214 | +		h.EventTypes = other.EventTypes | 
|  | 215 | +	} | 
|  | 216 | + | 
|  | 217 | +	if len(other.IgnoredXIDs) > 0 { | 
|  | 218 | +		h.IgnoredXIDs = other.IgnoredXIDs | 
|  | 219 | +	} | 
|  | 220 | + | 
|  | 221 | +	if other.CriticalXIDs != nil { | 
|  | 222 | +		h.CriticalXIDs = other.CriticalXIDs | 
|  | 223 | +	} | 
|  | 224 | +} | 
0 commit comments