Skip to content

Commit 874b657

Browse files
authored
Merge pull request #1684 from sunya-ch/model-server
feat: add machine spec generator/reader for model weight request
2 parents 112a810 + d8a6c14 commit 874b657

File tree

114 files changed

+12445
-21
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

114 files changed

+12445
-21
lines changed

cmd/exporter/exporter.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ type AppConfig struct {
6464
ApiserverEnabled bool
6565
RedfishCredFilePath string
6666
ExposeEstimatedIdlePower bool
67+
MachineSpecFilePath string
6768
DisablePowerMeter bool
6869
}
6970

@@ -80,6 +81,7 @@ func newAppConfig() *AppConfig {
8081
flag.BoolVar(&_config.ApiserverEnabled, "apiserver", true, "if apiserver is disabled, we collect pod information from kubelet")
8182
flag.StringVar(&_config.RedfishCredFilePath, "redfish-cred-file-path", "", "path to the redfish credential file")
8283
flag.BoolVar(&_config.ExposeEstimatedIdlePower, "expose-estimated-idle-power", false, "estimated idle power is meaningful only if Kepler is running on bare-metal or when there is only one virtual machine on the node")
84+
flag.StringVar(&_config.MachineSpecFilePath, "machine-spec", "", "path to the machine spec file in json format")
8385
flag.BoolVar(&_config.DisablePowerMeter, "disable-power-meter", false, "whether manually disable power meter read and forcefully apply the estimator for node powers")
8486

8587
return _config
@@ -134,6 +136,10 @@ func main() {
134136
config.SetRedfishCredFilePath(appConfig.RedfishCredFilePath)
135137
}
136138

139+
if appConfig.MachineSpecFilePath != "" {
140+
config.SetMachineSpecFilePath(appConfig.MachineSpecFilePath)
141+
}
142+
137143
config.LogConfigs()
138144

139145
components.InitPowerImpl()

cmd/validator/validator.go

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ import (
3232
"github.com/sustainable-computing-io/kepler/pkg/sensors/components"
3333
"github.com/sustainable-computing-io/kepler/pkg/sensors/components/source"
3434
"github.com/sustainable-computing-io/kepler/pkg/sensors/platform"
35+
"github.com/sustainable-computing-io/kepler/pkg/utils"
3536
)
3637

3738
const (
@@ -171,11 +172,6 @@ func getX86Architecture() (string, error) {
171172
return uarch, err
172173
}
173174

174-
func isFileExists(path string) bool {
175-
_, err := os.Stat(path)
176-
return !os.IsNotExist(err)
177-
}
178-
179175
func main() {
180176
// init stuffs
181177
flag.Parse()
@@ -199,7 +195,7 @@ func main() {
199195
platform.InitPowerImpl()
200196

201197
csvFilePath := filepath.Join(resultDirPath, "power.csv")
202-
if !isFileExists(csvFilePath) {
198+
if !utils.IsFileExists(csvFilePath) {
203199
columnHeaders := []string{"Pkg", "Core", "Uncore", "Dram"}
204200
csvFile, e := os.Create(csvFilePath)
205201
if e != nil {

go.mod

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ require (
2020
github.com/pkg/errors v0.9.1
2121
github.com/prometheus/client_golang v1.20.2
2222
github.com/prometheus/prometheus v0.54.0
23+
github.com/shirou/gopsutil v3.21.11+incompatible
2324
github.com/sirupsen/logrus v1.9.3
2425
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56
2526
golang.org/x/sys v0.24.0
@@ -72,6 +73,9 @@ require (
7273
github.com/prometheus/procfs v0.15.1 // indirect
7374
github.com/spf13/pflag v1.0.5 // indirect
7475
github.com/stretchr/testify v1.9.0 // indirect
76+
github.com/tklauser/go-sysconf v0.3.14 // indirect
77+
github.com/tklauser/numcpus v0.8.0 // indirect
78+
github.com/yusufpapurcu/wmi v1.2.4 // indirect
7579
golang.org/x/crypto v0.26.0 // indirect
7680
golang.org/x/net v0.28.0 // indirect
7781
golang.org/x/oauth2 v0.21.0 // indirect

go.sum

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,8 @@ github.com/prometheus/prometheus v0.54.0 h1:6+VmEkohHcofl3W5LyRlhw1Lfm575w/aX6ZF
145145
github.com/prometheus/prometheus v0.54.0/go.mod h1:xlLByHhk2g3ycakQGrMaU8K7OySZx98BzeCR99991NY=
146146
github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M=
147147
github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA=
148+
github.com/shirou/gopsutil v3.21.11+incompatible h1:+1+c1VGhc88SSonWP6foOcLhvnKlUeu/erjjvaPEYiI=
149+
github.com/shirou/gopsutil v3.21.11+incompatible/go.mod h1:5b4v6he4MtMOwMlS0TUMTu2PcXUg8+E1lC7eC3UO/RA=
148150
github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=
149151
github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
150152
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
@@ -154,8 +156,14 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV
154156
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
155157
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
156158
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
159+
github.com/tklauser/go-sysconf v0.3.14 h1:g5vzr9iPFFz24v2KZXs/pvpvh8/V9Fw6vQK5ZZb78yU=
160+
github.com/tklauser/go-sysconf v0.3.14/go.mod h1:1ym4lWMLUOhuBOPGtRcJm7tEGX4SCYNEEEtghGG/8uY=
161+
github.com/tklauser/numcpus v0.8.0 h1:Mx4Wwe/FjZLeQsK/6kt2EOepwwSl7SmJrK5bV/dXYgY=
162+
github.com/tklauser/numcpus v0.8.0/go.mod h1:ZJZlAY+dmR4eut8epnzf0u/VwodKmryxR8txiloSqBE=
157163
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
158164
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
165+
github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0=
166+
github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0=
159167
go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE=
160168
go.uber.org/atomic v1.11.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0=
161169
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=

hack/build-manifest.sh

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ declare PROMETHEUS_DEPLOY=false
5050
declare HIGH_GRANULARITY=false
5151
declare DCGM_DEPLOY=false
5252
declare HABANA_DEPLOY=false
53+
declare MACHINE_SPEC_DEPLOY=false
5354

5455
ensure_all_tools() {
5556
header "Ensuring all tools are installed"
@@ -203,6 +204,16 @@ deploy_habana() {
203204
uncomment_patch habana "${MANIFESTS_OUT_DIR}"/exporter/kustomization.yaml
204205
ok "Habana deployment configured"
205206
}
207+
deploy_machine_spec() {
208+
header "Machine Spec Deployment"
209+
$MACHINE_SPEC_DEPLOY || {
210+
skip "skipping machine spec deployment"
211+
return 0
212+
}
213+
uncomment machine_spec_configmap "${MANIFESTS_OUT_DIR}"/exporter/kustomization.yaml
214+
uncomment_patch machine-spec "${MANIFESTS_OUT_DIR}"/exporter/kustomization.yaml
215+
ok "Machine spec deployment configured"
216+
}
206217
build_manifest() {
207218
info "Building manifests ..."
208219
for deploy in $(declare -F | cut -f3 -d ' ' | grep 'deploy_'); do

manifests/k8s/config/exporter/kustomization.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ resources:
66
# - prometheus_common_service_monitor.yaml
77
# - prometheus_common_rules.yaml
88
# - prometheus_high_granularity_rules.yaml
9+
# uncomment this line for default machine spec configmap
10+
# - machine_spec_configmap.yaml
911

1012
patchesStrategicMerge: []
1113
# add this line to allow ci
@@ -26,6 +28,8 @@ patchesStrategicMerge: []
2628
# - ./patch/patch-dcgmi.yaml
2729
# add this line for habana patch
2830
# - ./patch/patch-habana.yaml
31+
# add this line for machine spec patch
32+
# - ./patch/patch-machine-spec.yaml
2933

3034
secretGenerator:
3135
- name: redfish
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
apiVersion: v1
2+
kind: ConfigMap
3+
metadata:
4+
name: kepler-machine-spec
5+
namespace: system
6+
data:
7+
m5.metal: |
8+
{"processor": "intel_xeon_platinum_8259cl", "cores": 96, "chips": 2, "memory": 377, "frequency": 3500}
9+
i3.metal: |
10+
{"processor": "intel_xeon_e5_2686v4", "cores": 72, "chips": 2, "memory": 503, "frequency": 3000}
11+
c5.metal: |
12+
{"processor": "intel_xeon_platinum_8275cl", "cores": 96, "chips": 2, "memory": 188, "frequency": 3900}
13+
r5.metal: |
14+
{"processor": "intel_xeon_platinum_8259cl", "cores": 96, "chips": 2, "memory": 755, "frequency": 3500}
15+
m5zn.metal: |
16+
{"processor": "intel_xeon_platinum_8252c", "cores": 48, "chips": 2, "memory": 188, "frequency": 4500}
17+
m7i.metal-24xl: |
18+
{"processor": "intel_xeon_platinum_8488c", "cores": 96, "chips": 1, "memory": 377, "frequency": 3800}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Strategic-merge patch for the kepler-exporter DaemonSet: mounts one entry of
# the kepler-machine-spec ConfigMap into the exporter container as
# /etc/kepler/models/machine/spec.json.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: kepler-exporter
  namespace: system
spec:
  template:
    spec:
      containers:
        - name: kepler-exporter
          volumeMounts:
            # The mount name must match the volume name declared under `volumes`.
            - name: machine-spec
              mountPath: /etc/kepler/models/machine
              readOnly: true
      volumes:
        # Previously named `config-models`, which left the `machine-spec`
        # volumeMount pointing at a volume this patch never defined.
        - name: machine-spec
          configMap:
            name: kepler-machine-spec
            items:
              - key: m5.metal # target machine; must be a key of the kepler-machine-spec ConfigMap
                path: spec.json

pkg/config/config.go

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,8 +107,10 @@ var (
107107
redfishSkipSSLVerify = getBoolConfig("REDFISH_SKIP_SSL_VERIFY", true)
108108

109109
////////////////////////////////////
110-
ModelServerEnable = getBoolConfig("MODEL_SERVER_ENABLE", false)
111-
ModelServerEndpoint = SetModelServerReqEndpoint()
110+
DefaultMachineSpecFilePath = "/etc/kepler/models/machine/spec.json"
111+
machineSpecFilePath string
112+
ModelServerEnable = getBoolConfig("MODEL_SERVER_ENABLE", false)
113+
ModelServerEndpoint = SetModelServerReqEndpoint()
112114
// for model config
113115
ModelConfigValues map[string]string
114116
// model_parameter_prefix
@@ -245,6 +247,22 @@ func GetMockACPIPowerPath() string {
245247
return MockACPIPowerPath
246248
}
247249

250+
func SetMachineSpecFilePath(specFilePath string) {
251+
machineSpecFilePath = specFilePath
252+
}
253+
254+
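// GetMachineSpec returns the machine spec read from the configured machine spec file,
// falling back to the default machine spec when no path is set or the file cannot be read.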
255+
func GetMachineSpec() *MachineSpec {
256+
if machineSpecFilePath != "" {
257+
if spec, err := readMachineSpec(machineSpecFilePath); err == nil {
258+
return spec
259+
} else {
260+
klog.Warningf("failed to read spec from %s: %v, use default machine spec", machineSpecFilePath, err)
261+
}
262+
}
263+
return getDefaultMachineSpec()
264+
}
265+
248266
// InitModelConfigMap initializes map of config from MODEL_CONFIG
249267
func InitModelConfigMap() {
250268
ModelConfigValues = GetModelConfigMap()

pkg/config/config_test.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ limitations under the License.
1717
package config
1818

1919
import (
20+
"encoding/json"
2021
"os"
2122
"runtime"
2223

@@ -56,6 +57,16 @@ func createTempFile(contents string) (filename string, reterr error) {
5657
return f.Name(), nil
5758
}
5859

60+
func (spec *MachineSpec) saveToFile(path string) error {
61+
file, err := os.Create(path)
62+
if err != nil {
63+
return err
64+
}
65+
defer file.Close()
66+
encoder := json.NewEncoder(file)
67+
return encoder.Encode(spec)
68+
}
69+
5970
var _ = Describe("Test Configuration", func() {
6071
It("Test cgroup version", func() {
6172
file, err := createTempFile("")
@@ -113,4 +124,17 @@ var _ = Describe("Test Configuration", func() {
113124
// no test
114125
}
115126
})
127+
It("Test machine spec generation and read", func() {
128+
tmpPath := "./test_spec"
129+
// generate spec
130+
spec := generateSpec()
131+
Expect(spec).NotTo(BeNil())
132+
err := spec.saveToFile(tmpPath)
133+
Expect(err).To(BeNil())
134+
readSpec, err := readMachineSpec(tmpPath)
135+
Expect(err).To(BeNil())
136+
Expect(*spec).To(BeEquivalentTo(*readSpec))
137+
err = os.Remove(tmpPath)
138+
Expect(err).To(BeNil())
139+
})
116140
})

0 commit comments

Comments
 (0)