Skip to content

S.M.A.R.T support #614

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions beszel/internal/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ type Agent struct {
sensorsWhitelist map[string]struct{} // List of sensors to monitor
systemInfo system.Info // Host system info
gpuManager *GPUManager // Manages GPU data
smartManager *SmartManager // Manages SMART data
}

func NewAgent() *Agent {
Expand Down Expand Up @@ -83,6 +84,12 @@ func NewAgent() *Agent {
agent.gpuManager = gm
}

if sm, err := NewSmartManager(); err != nil {
slog.Debug("SMART", "err", err)
} else {
agent.smartManager = sm
}

// if debugging, print stats
if agent.debug {
slog.Debug("Stats", "data", agent.gatherStats())
Expand Down
288 changes: 288 additions & 0 deletions beszel/internal/agent/smart.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,288 @@
package agent

import (
"beszel/internal/entities/smart"
"beszel/internal/entities/system"
"context"
"encoding/json"
"fmt"
"os/exec"
"reflect"
"sync"
"time"

"golang.org/x/exp/slog"
)

// SmartManager manages data collection for SMART devices.
// Start repeatedly scans for devices and collects their data; GetCurrentData
// hands out a copy of what has been collected.
type SmartManager struct {
// SmartDataMap holds the most recently parsed SMART data, keyed by the device
// name found in the smartctl output (e.g. /dev/sda).
SmartDataMap map[string]*system.SmartData
// SmartDevices is the device list rebuilt by parseScan from `smartctl --scan -j` output.
SmartDevices []*DeviceInfo
// mutex is held by GetCurrentData and the parse* methods while they access the fields above.
mutex sync.Mutex
}

// scanOutput is the decoding target for the JSON printed by `smartctl --scan -j`.
// Only the "devices" array is read; any other key in the document is ignored
// by encoding/json.
type scanOutput struct {
	// Devices has one entry per element of the "devices" array.
	Devices []struct {
		Name     string `json:"name"`      // decoded from "name"
		Type     string `json:"type"`      // decoded from "type"
		InfoName string `json:"info_name"` // decoded from "info_name"
		Protocol string `json:"protocol"`  // decoded from "protocol"
	} `json:"devices"`
}

// DeviceInfo describes a single device entry from a smartctl scan.
// parseScan fills one DeviceInfo per scanned device; CollectSmart passes Name
// to smartctl and uses Type to pick the parser.
type DeviceInfo struct {
	Name     string `json:"name"`      // device name, serialized as "name"
	Type     string `json:"type"`      // device type, serialized as "type"
	InfoName string `json:"info_name"` // serialized as "info_name"
	Protocol string `json:"protocol"`  // serialized as "protocol"
}

// errNoValidSmartData is returned by ScanDevices and CollectSmart when the
// smartctl output could not be turned into usable SMART data.
var errNoValidSmartData = fmt.Errorf("no valid SMART data found")

// Start runs the collection loop. It blocks until collection fails, so it is
// meant to run in its own goroutine.
//
// Each cycle rescans the devices with ScanDevices, collects SMART data for
// every device in SmartDevices with CollectSmart, then pauses before the next
// cycle. A failed scan is logged and the cycle carries on with whatever device
// list the manager currently holds. The first CollectSmart failure is logged
// and ends the loop for good.
func (sm *SmartManager) Start() {
	// Pause between cycles so smartctl is not respawned back-to-back.
	// NOTE(review): the interval is a judgment call; tune it to how fresh the
	// SMART data needs to be.
	const pollInterval = time.Minute

	// Publish the map under the lock: GetCurrentData reads it from another
	// goroutine while holding the same mutex.
	sm.mutex.Lock()
	sm.SmartDataMap = make(map[string]*system.SmartData)
	sm.mutex.Unlock()

	for {
		if err := sm.ScanDevices(); err != nil {
			slog.Debug("SMART scan failed", "err", err)
		}
		// TODO: add retry logic
		for _, deviceInfo := range sm.SmartDevices {
			if err := sm.CollectSmart(deviceInfo); err != nil {
				slog.Warn("smartctl failed, stopping", "err", err)
				return
			}
		}
		time.Sleep(pollInterval)
	}
}

// GetCurrentData returns a snapshot of the SMART data collected so far.
// The returned map is freshly allocated and stores a copy of each SmartData
// value under the same key it has in SmartDataMap. The copy is shallow: only
// the struct value itself is duplicated.
func (sm *SmartManager) GetCurrentData() map[string]system.SmartData {
	sm.mutex.Lock()
	defer sm.mutex.Unlock()

	snapshot := make(map[string]system.SmartData, len(sm.SmartDataMap))
	for name, data := range sm.SmartDataMap {
		snapshot[name] = *data
	}
	return snapshot
}

// ScanDevices runs `smartctl --scan -j` with a 10 second timeout and hands the
// output to parseScan, which rebuilds the device list.
//
// It returns the error from running the command, errNoValidSmartData when
// parseScan rejects the output, and nil otherwise.
func (sm *SmartManager) ScanDevices() error {
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	output, err := exec.CommandContext(ctx, "smartctl", "--scan", "-j").Output()
	if err != nil {
		return err
	}

	if sm.parseScan(output) {
		return nil
	}
	return errNoValidSmartData
}

// CollectSmart runs `smartctl --all -j <device>` with a 10 second timeout and
// feeds the output to the parser matching deviceInfo.Type ("scsi" or "nvme").
//
// smartctl's exit status is a bitmask. Bits 0 and 1 mean the command line did
// not parse or the device could not be opened; the higher bits only flag
// findings about the disk (failing status, attributes past threshold, logged
// errors) and are set while a complete JSON report is still printed. A
// non-zero exit is therefore treated as a failure only when one of the two low
// bits is set, the process did not exit normally, or nothing was printed.
//
// It returns the command error in those failure cases, errNoValidSmartData
// when the device type has no parser or the parser rejects the output, and nil
// otherwise.
func (sm *SmartManager) CollectSmart(deviceInfo *DeviceInfo) error {
	// exit status bits 0-1: the invocation itself failed, output is not a device report
	const fatalExitBits = 0x03

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	cmd := exec.CommandContext(ctx, "smartctl", "--all", "-j", deviceInfo.Name)
	output, err := cmd.Output()
	if err != nil {
		// Output returns *exec.ExitError when the process ran but exited non-zero.
		exitErr, ok := err.(*exec.ExitError)
		if !ok {
			return err
		}
		// ExitCode is -1 when the process was killed (e.g. by the timeout).
		code := exitErr.ExitCode()
		if code < 0 || code&fatalExitBits != 0 || len(output) == 0 {
			return err
		}
	}

	var hasValidData bool
	switch deviceInfo.Type {
	case "scsi":
		hasValidData = sm.parseSmartForScsi(output)
	case "nvme":
		hasValidData = sm.parseSmartForNvme(output)
	}

	if !hasValidData {
		return errNoValidSmartData
	}
	return nil
}

// parseScan decodes the output of `smartctl --scan -j`, replaces SmartDevices
// with the scanned devices and drops every SmartDataMap entry whose key is not
// among the scanned device names.
//
// It returns false when the output is not valid JSON; in that case
// SmartDevices is left empty and SmartDataMap is untouched. It returns true
// otherwise, including when the scan lists no devices.
func (sm *SmartManager) parseScan(output []byte) bool {
	sm.mutex.Lock()
	defer sm.mutex.Unlock()

	// reset up front so a failed parse leaves an empty device list
	sm.SmartDevices = make([]*DeviceInfo, 0)

	scan := &scanOutput{}
	if err := json.Unmarshal(output, scan); err != nil {
		fmt.Printf("Failed to parse JSON: %v\n", err)
		return false
	}

	devices := make([]*DeviceInfo, 0, len(scan.Devices))
	scanned := make(map[string]struct{}, len(scan.Devices))
	for _, device := range scan.Devices {
		devices = append(devices, &DeviceInfo{
			Name:     device.Name,
			Type:     device.Type,
			InfoName: device.InfoName,
			Protocol: device.Protocol,
		})
		scanned[device.Name] = struct{}{}
	}
	sm.SmartDevices = devices

	// remove devices that are not in the scan
	for key := range sm.SmartDataMap {
		if _, ok := scanned[key]; !ok {
			delete(sm.SmartDataMap, key)
		}
	}

	return true
}

// parseSmartForScsi decodes the output of `smartctl --all -j /dev/sdX` and
// stores the result in SmartDataMap under the device name found in the output,
// creating the entry if needed and overwriting its fields otherwise.
//
// It returns false when the output is not valid JSON and true once the entry
// has been updated.
func (sm *SmartManager) parseSmartForScsi(output []byte) bool {
	data := &smart.SmartInfoForSata{}

	// Pass the pointer itself, not its address: with &data a JSON `null`
	// payload would set data to nil and the field reads below would panic.
	if err := json.Unmarshal(output, data); err != nil {
		return false
	}

	sm.mutex.Lock()
	defer sm.mutex.Unlock()

	// get device name (e.g. /dev/sda)
	deviceName := data.Device.Name

	// if device does not exist in SmartDataMap, initialize it
	smartData, ok := sm.SmartDataMap[deviceName]
	if !ok {
		smartData = &system.SmartData{}
		sm.SmartDataMap[deviceName] = smartData
	}

	// update SmartData
	smartData.ModelFamily = data.ModelFamily
	smartData.ModelName = data.ModelName
	smartData.SerialNumber = data.SerialNumber
	smartData.FirmwareVersion = data.FirmwareVersion
	smartData.Capacity = data.UserCapacity.Bytes
	if data.SmartStatus.Passed {
		smartData.SmartStatus = "PASSED"
	} else {
		smartData.SmartStatus = "FAILED"
	}
	smartData.DiskName = deviceName
	smartData.DiskType = data.Device.Type

	// rebuild SmartAttributes from the ATA attribute table
	smartData.Attributes = make([]*system.SmartAttribute, 0, len(data.AtaSmartAttributes.Table))
	for _, attr := range data.AtaSmartAttributes.Table {
		smartData.Attributes = append(smartData.Attributes, &system.SmartAttribute{
			Id:         attr.ID,
			Name:       attr.Name,
			Value:      attr.Value,
			Worst:      attr.Worst,
			Threshold:  attr.Thresh,
			RawValue:   attr.Raw.Value,
			RawString:  attr.Raw.String,
			Flags:      attr.Flags.String,
			WhenFailed: attr.WhenFailed,
		})
	}
	smartData.Temperature = data.Temperature.Current

	return true
}

// parseSmartForNvme decodes the output of `smartctl --all -j /dev/nvmeX` and
// stores the result in SmartDataMap under the device name found in the output,
// creating the entry if needed and overwriting its fields otherwise.
//
// It returns false when the output is not valid JSON and true once the entry
// has been updated.
func (sm *SmartManager) parseSmartForNvme(output []byte) bool {
	data := &smart.SmartInfoForNvme{}

	// Pass the pointer itself, not its address: with &data a JSON `null`
	// payload would set data to nil and the field reads below would panic.
	if err := json.Unmarshal(output, data); err != nil {
		return false
	}

	sm.mutex.Lock()
	defer sm.mutex.Unlock()

	// get device name (e.g. /dev/nvme0)
	deviceName := data.Device.Name

	// if device does not exist in SmartDataMap, initialize it
	smartData, ok := sm.SmartDataMap[deviceName]
	if !ok {
		smartData = &system.SmartData{}
		sm.SmartDataMap[deviceName] = smartData
	}

	// update SmartData
	smartData.ModelName = data.ModelName
	smartData.SerialNumber = data.SerialNumber
	smartData.FirmwareVersion = data.FirmwareVersion
	smartData.Capacity = data.UserCapacity.Bytes
	if data.SmartStatus.Passed {
		smartData.SmartStatus = "PASSED"
	} else {
		smartData.SmartStatus = "FAILED"
	}
	smartData.DiskName = deviceName
	smartData.DiskType = data.Device.Type

	v := reflect.ValueOf(data.NVMeSmartHealthInformationLog)
	t := v.Type()
	smartData.Attributes = make([]*system.SmartAttribute, 0, v.NumField())

	// nvme attributes do not follow the same format as ata attributes,
	// so we iterate over the health log's fields and turn each one into a
	// SmartAttribute named after the Go field
	for i := 0; i < v.NumField(); i++ {
		// drop non int values: only fields whose type is exactly int are kept
		n, isInt := v.Field(i).Interface().(int)
		if !isInt {
			continue
		}
		smartData.Attributes = append(smartData.Attributes, &system.SmartAttribute{
			Name:  t.Field(i).Name,
			Value: n,
		})
	}
	smartData.Temperature = data.NVMeSmartHealthInformationLog.Temperature

	return true
}

// detectSmartctl reports whether a smartctl executable can be located via
// exec.LookPath. It returns nil when one is found and a fixed
// "install smartctl" error otherwise; the lookup error itself is discarded.
func (sm *SmartManager) detectSmartctl() error {
	_, lookupErr := exec.LookPath("smartctl")
	if lookupErr != nil {
		return fmt.Errorf("no smartctl found - install smartctl")
	}
	return nil
}

// NewSmartManager creates a SmartManager and launches its collection loop
// (Start) in a background goroutine.
//
// When smartctl cannot be found it returns a nil manager together with the
// error from detectSmartctl, and no goroutine is started.
func NewSmartManager() (*SmartManager, error) {
	sm := &SmartManager{}
	if err := sm.detectSmartctl(); err != nil {
		return nil, err
	}

	go sm.Start()

	return sm, nil
}
11 changes: 11 additions & 0 deletions beszel/internal/agent/system.go
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,17 @@ func (a *Agent) getSystemStats() system.Stats {
}
}
}
if a.smartManager != nil {
if smartData := a.smartManager.GetCurrentData(); len(smartData) > 0 {
systemStats.SmartData = smartData
if systemStats.Temperatures == nil {
systemStats.Temperatures = make(map[string]float64, len(a.smartManager.SmartDataMap))
}
for key, value := range a.smartManager.SmartDataMap {
systemStats.Temperatures[key] = float64(value.Temperature)
}
}
}

// update base system info
a.systemInfo.Cpu = systemStats.Cpu
Expand Down
Loading