Skip to content

Commit 76b6d4d

Browse files
committed
[no-relnote] Add functions to create gpu device nodes
Signed-off-by: Evan Lezar <[email protected]>
1 parent 4523b2e commit 76b6d4d

File tree

3 files changed

+145
-0
lines changed

3 files changed

+145
-0
lines changed

internal/system/nvdevices/control-device-nodes.go

+23
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,13 @@
1717
package nvdevices
1818

1919
import (
20+
"errors"
2021
"fmt"
2122
"path/filepath"
2223
"strings"
2324

2425
"github.com/NVIDIA/nvidia-container-toolkit/internal/info/proc/devices"
26+
"github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps"
2527
)
2628

2729
// A controlDeviceNode represents an NVIDIA device node for control or meta devices.
@@ -43,6 +45,27 @@ func (m *Interface) CreateNVIDIAControlDevices() error {
4345
return nil
4446
}
4547

48+
// CreateNVIDIACapsControlDeviceNodes creates the nvidia-caps control device nodes at the configured devRoot.
49+
func (m *Interface) CreateNVIDIACapsControlDeviceNodes() error {
50+
capsMajor, exists := m.Get("nvidia-caps")
51+
if !exists {
52+
return nil
53+
}
54+
55+
var errs error
56+
for _, migCap := range []nvcaps.MigCap{"config", "monitor"} {
57+
migMinor, exists := m.migCaps[migCap]
58+
if !exists {
59+
continue
60+
}
61+
deviceNodePath := migMinor.DevicePath()
62+
if err := m.createDeviceNode(deviceNodePath, int(capsMajor), int(migMinor)); err != nil {
63+
errs = errors.Join(errs, fmt.Errorf("failed to create nvidia-caps device node %v: %w", deviceNodePath, err))
64+
}
65+
}
66+
return errs
67+
}
68+
4669
// createControlDeviceNode creates the specified NVIDIA device node at the configured devRoot.
4770
func (m *Interface) createControlDeviceNode(node controlDeviceNode) error {
4871
if !strings.HasPrefix(string(node), "nvidia") {

internal/system/nvdevices/devices.go

+46
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,14 @@ import (
2020
"errors"
2121
"fmt"
2222
"path/filepath"
23+
"strconv"
24+
"strings"
25+
26+
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
2327

2428
"github.com/NVIDIA/nvidia-container-toolkit/internal/info/proc/devices"
2529
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
30+
"github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps"
2631
)
2732

2833
var errInvalidDeviceNode = errors.New("invalid device node")
@@ -37,6 +42,8 @@ type Interface struct {
3742
// devRoot is the root directory where device nodes are expected to exist.
3843
devRoot string
3944

45+
migCaps nvcaps.MigCaps
46+
4047
mknoder
4148
}
4249

@@ -61,6 +68,14 @@ func New(opts ...Option) (*Interface, error) {
6168
i.Devices = devices
6269
}
6370

71+
if i.migCaps == nil {
72+
migCaps, err := nvcaps.NewMigCaps()
73+
if err != nil {
74+
return nil, fmt.Errorf("failed to load MIG caps: %w", err)
75+
}
76+
i.migCaps = migCaps
77+
}
78+
6479
if i.dryRun {
6580
i.mknoder = &mknodLogger{i.logger}
6681
} else {
@@ -69,6 +84,37 @@ func New(opts ...Option) (*Interface, error) {
6984
return i, nil
7085
}
7186

87+
// CreateDeviceNodes creates the device nodes for a device with the specified identifier.
88+
// A list of created device nodes are returned and an error.
89+
func (m *Interface) CreateDeviceNodes(id device.Identifier) error {
90+
switch {
91+
case id.IsGpuIndex():
92+
index, err := strconv.Atoi(string(id))
93+
if err != nil {
94+
return fmt.Errorf("invalid GPU index: %v", id)
95+
}
96+
return m.createGPUDeviceNode(index)
97+
case id.IsMigIndex():
98+
indices := strings.Split(string(id), ":")
99+
if len(indices) != 2 {
100+
return fmt.Errorf("invalid MIG index %v", id)
101+
}
102+
gpuIndex, err := strconv.Atoi(indices[0])
103+
if err != nil {
104+
return fmt.Errorf("invalid parent index %v: %w", indices[0], err)
105+
}
106+
if err := m.createGPUDeviceNode(gpuIndex); err != nil {
107+
return fmt.Errorf("failed to create parent device node: %w", err)
108+
}
109+
110+
return m.createMigDeviceNodes(gpuIndex)
111+
case id.IsGpuUUID(), id.IsMigUUID(), id == "all":
112+
return m.createAllGPUDeviceNodes()
113+
default:
114+
return fmt.Errorf("invalid device identifier: %v", id)
115+
}
116+
}
117+
72118
// createDeviceNode creates the specified device node with the required major and minor numbers.
73119
// If a devRoot is configured, this is prepended to the path.
74120
func (m *Interface) createDeviceNode(path string, major int, minor int) error {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
/**
2+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
**/
16+
17+
package nvdevices
18+
19+
import (
20+
"errors"
21+
"fmt"
22+
"path/filepath"
23+
24+
"github.com/NVIDIA/go-nvlib/pkg/nvpci"
25+
26+
"github.com/NVIDIA/nvidia-container-toolkit/internal/info/proc/devices"
27+
)
28+
29+
func (m *Interface) createGPUDeviceNode(gpuIndex int) error {
30+
major, exists := m.Get(devices.NVIDIAGPU)
31+
if !exists {
32+
return fmt.Errorf("failed to determine device major; nvidia kernel module may not be loaded")
33+
}
34+
35+
deviceNodePath := fmt.Sprintf("/dev/nvidia%d", gpuIndex)
36+
if err := m.createDeviceNode(deviceNodePath, int(major), gpuIndex); err != nil {
37+
return fmt.Errorf("failed to create device node %v: %w", deviceNodePath, err)
38+
}
39+
return nil
40+
}
41+
42+
func (m *Interface) createMigDeviceNodes(gpuIndex int) error {
43+
capsMajor, exists := m.Get("nvidia-caps")
44+
if !exists {
45+
return nil
46+
}
47+
var errs error
48+
for _, capsDeviceMinor := range m.migCaps.FilterForGPU(gpuIndex) {
49+
capDevicePath := capsDeviceMinor.DevicePath()
50+
err := m.createDeviceNode(capDevicePath, int(capsMajor), int(capsDeviceMinor))
51+
errs = errors.Join(errs, fmt.Errorf("failed to create %v: %w", capDevicePath, err))
52+
}
53+
return errs
54+
}
55+
56+
func (m *Interface) createAllGPUDeviceNodes() error {
57+
gpus, err := nvpci.New(
58+
nvpci.WithPCIDevicesRoot(filepath.Join(m.devRoot, nvpci.PCIDevicesRoot)),
59+
nvpci.WithLogger(m.logger),
60+
).GetGPUs()
61+
if err != nil {
62+
return fmt.Errorf("failed to get GPU information from PCI: %w", err)
63+
}
64+
65+
count := len(gpus)
66+
if count == 0 {
67+
return nil
68+
}
69+
70+
var errs error
71+
for gpuIndex := 0; gpuIndex < count; gpuIndex++ {
72+
errs = errors.Join(errs, m.createGPUDeviceNode(gpuIndex))
73+
errs = errors.Join(errs, m.createMigDeviceNodes(gpuIndex))
74+
}
75+
return errs
76+
}

0 commit comments

Comments
 (0)