forked from CCI-MOC/openshift-usage-scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathinvoice.py
More file actions
271 lines (235 loc) · 8.86 KB
/
invoice.py
File metadata and controls
271 lines (235 loc) · 8.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
import math
from dataclasses import dataclass, field
from collections import namedtuple
from typing import List, Tuple, Optional
from decimal import Decimal, ROUND_HALF_UP
import datetime
# GPU types
# Values compared against ``Pod.gpu_type`` when picking a service unit.
GPU_A100 = "NVIDIA-A100-40GB"
GPU_A100_SXM4 = "NVIDIA-A100-SXM4-40GB"
GPU_V100 = "Tesla-V100-PCIE-32GB"
GPU_H100 = "NVIDIA-H100-80GB-HBM3"
GPU_UNKNOWN_TYPE = "GPU_UNKNOWN_TYPE"
# GPU Resource - MIG Geometries
# Values compared against ``Pod.gpu_resource``.
# A100 Strategies
MIG_1G_5GB = "nvidia.com/mig-1g.5gb"
MIG_2G_10GB = "nvidia.com/mig-2g.10gb"
MIG_3G_20GB = "nvidia.com/mig-3g.20gb"
# Resource name matched when a pod's GPU type is looked up directly.
WHOLE_GPU = "nvidia.com/gpu"
# VM GPU Resources
# Each of these maps straight to one GPU service unit type in
# ``Pod.get_service_unit``, regardless of ``Pod.gpu_type``.
VM_GPU_H100 = "nvidia.com/H100_SXM5_80GB"
VM_GPU_A100_SXM4 = "nvidia.com/A100_SXM4_40GB"
VM_GPU_V100 = "nvidia.com/GV100GL_Tesla_V100"
# SU Types
# Service unit names; used as keys of ``su_definitions`` and of
# ``ProjectInvoce.su_hours``.
SU_CPU = "OpenShift CPU"
SU_A100_GPU = "OpenShift GPUA100"
SU_A100_SXM4_GPU = "OpenShift GPUA100SXM4"
SU_V100_GPU = "OpenShift GPUV100"
SU_H100_GPU = "OpenShift GPUH100"
SU_UNKNOWN_GPU = "OpenShift Unknown GPU"
SU_UNKNOWN_MIG_GPU = "OpenShift Unknown MIG GPU"
SU_UNKNOWN = "Openshift Unknown"
# Result of ``Pod.get_service_unit``: the SU type, how many SUs the pod
# consumes, and which resource ("CPU", "GPU" or "RAM") determined that count.
# NOTE(review): the third field name is misspelled ("determinig_resource");
# it is kept as-is because renaming it would change the tuple's interface.
ServiceUnit = namedtuple("ServiceUnit", ["su_type", "su_count", "determinig_resource"])
@dataclass
class Pod:
    """Resource requests and lifetime of a single pod, used for billing.

    ``start_time`` is an epoch timestamp in seconds and ``duration`` is a
    number of seconds; ``end_time`` is derived from the two.
    """

    pod_name: str
    namespace: str
    start_time: int
    duration: int
    cpu_request: Decimal
    gpu_request: Decimal
    # NOTE(review): compared against ``su_definitions[...]["RAM"] / 1024``, so
    # presumably expressed in GiB while the definition is in MiB - confirm.
    memory_request: Decimal
    # Both GPU fields may be None for pods that did not request a GPU.
    gpu_type: Optional[str]
    gpu_resource: Optional[str]
    node_hostname: str
    node_model: str

    def get_service_unit(self, su_definitions) -> "ServiceUnit":
        """
        Returns the type of service unit, the count, and the determining resource

        ``su_definitions`` maps an SU type name to a mapping holding the
        ``"vCPUs"``, ``"RAM"`` and ``"GPUs"`` entries of one service unit
        (each convertible with ``int``). The SU count is the largest of the
        CPU, GPU and memory ratios; GPU SU counts are rounded up to a whole
        number.
        """
        # pods that requested a specific GPU but weren't scheduled may report 0 GPU
        if self.gpu_resource is not None and self.gpu_request == 0:
            return ServiceUnit(SU_UNKNOWN_GPU, 0, "GPU")

        # pods in weird states
        if self.cpu_request == 0 or self.memory_request == 0:
            return ServiceUnit(SU_UNKNOWN, 0, "CPU")

        known_gpu_su = {
            GPU_A100: SU_A100_GPU,
            GPU_A100_SXM4: SU_A100_SXM4_GPU,
            GPU_V100: SU_V100_GPU,
            GPU_H100: SU_H100_GPU,
        }

        # Every MIG geometry is currently billed as an unknown MIG GPU.
        a100_sxm4_mig = {
            MIG_1G_5GB: SU_UNKNOWN_MIG_GPU,
            MIG_2G_10GB: SU_UNKNOWN_MIG_GPU,
            MIG_3G_20GB: SU_UNKNOWN_MIG_GPU,
        }

        if self.gpu_resource is None and self.gpu_request == 0:
            su_type = SU_CPU
        elif self.gpu_type is not None and self.gpu_resource == WHOLE_GPU:
            su_type = known_gpu_su.get(self.gpu_type, SU_UNKNOWN_GPU)
        elif self.gpu_resource == VM_GPU_A100_SXM4:
            su_type = SU_A100_SXM4_GPU
        elif self.gpu_resource == VM_GPU_H100:
            su_type = SU_H100_GPU
        elif self.gpu_resource == VM_GPU_V100:
            su_type = SU_V100_GPU
        elif self.gpu_type == GPU_A100_SXM4:  # for MIG GPU of type A100_SXM4
            su_type = a100_sxm4_mig.get(self.gpu_resource, SU_UNKNOWN_MIG_GPU)
        else:
            return ServiceUnit(SU_UNKNOWN_GPU, 0, "GPU")

        definition = su_definitions[su_type]
        cpu_multiplier = self.cpu_request / int(definition["vCPUs"])
        memory_multiplier = self.memory_request / int(int(definition["RAM"]) / 1024)

        gpus_per_su = int(definition["GPUs"])
        # SU types without GPUs never let the GPU request drive the count.
        gpu_multiplier = self.gpu_request / gpus_per_su if gpus_per_su != 0 else 0

        su_count = max(cpu_multiplier, gpu_multiplier, memory_multiplier)

        # no fractional SUs for GPU SUs
        if su_type != SU_CPU:
            su_count = math.ceil(su_count)

        # Ties are resolved in the order GPU, then CPU, then RAM.
        if gpu_multiplier >= cpu_multiplier and gpu_multiplier >= memory_multiplier:
            determining_resource = "GPU"
        elif cpu_multiplier >= gpu_multiplier and cpu_multiplier >= memory_multiplier:
            determining_resource = "CPU"
        else:
            determining_resource = "RAM"

        return ServiceUnit(su_type, su_count, determining_resource)

    def get_runtime(
        self,
        ignore_times: Optional[
            List[Tuple[datetime.datetime, datetime.datetime]]
        ] = None,
    ) -> Decimal:
        """Return runtime eligible for billing in hours

        ``ignore_times`` is an optional list of ``(start, end)`` datetimes
        whose overlap with the pod's lifetime is not billed. Windows that
        overlap each other are merged first, so a given second is deducted at
        most once.
        """
        if not ignore_times:
            return Decimal(self.duration) / 3600

        # Clip every window to the pod's lifetime and drop the ones that miss it.
        clipped_windows = []
        for ignore_start_date, ignore_end_date in ignore_times:
            window_start = max(self.start_time, int(ignore_start_date.timestamp()))
            window_end = min(self.end_time, int(ignore_end_date.timestamp()))
            if window_end > window_start:
                clipped_windows.append((window_start, window_end))

        # Sweep the windows in start order, counting only the not-yet-covered part.
        ignored_seconds = 0
        covered_until = self.start_time
        for window_start, window_end in sorted(clipped_windows):
            window_start = max(window_start, covered_until)
            if window_end > window_start:
                ignored_seconds += window_end - window_start
                covered_until = window_end

        return Decimal(max(0, self.duration - ignored_seconds)) / 3600

    @property
    def end_time(self) -> int:
        """Epoch timestamp (seconds) at which the pod stopped: start + duration."""
        return self.start_time + self.duration

    def generate_pod_row(self, ignore_times, su_definitions):
        """
        This returns a row to represent pod data.
        It converts the epoch_time stamps to datetime timestamps so it's more readable.
        Additionally, some metrics are rounded for readability.
        """
        su_type, su_count, determining_resource = self.get_service_unit(su_definitions)

        timestamp_format = "%Y-%m-%dT%H:%M:%S"
        # timezone.utc is the same object as datetime.UTC but exists before 3.11.
        start_time = datetime.datetime.fromtimestamp(
            self.start_time, datetime.timezone.utc
        ).strftime(timestamp_format)
        end_time = datetime.datetime.fromtimestamp(
            self.end_time, datetime.timezone.utc
        ).strftime(timestamp_format)

        four_places = Decimal(".0001")
        memory_request = self.memory_request.quantize(
            four_places, rounding=ROUND_HALF_UP
        )
        runtime = self.get_runtime(ignore_times).quantize(
            four_places, rounding=ROUND_HALF_UP
        )

        return [
            self.namespace,
            start_time,
            end_time,
            runtime,
            self.pod_name,
            self.cpu_request,
            self.gpu_request,
            self.gpu_type,
            self.gpu_resource,
            self.node_hostname,
            self.node_model,
            memory_request,
            determining_resource,
            su_type,
            su_count,
        ]
@dataclass
class Rates:
    """Per-SU-hour prices, one for each billable service unit type."""

    # Price applied to CPU service units.
    cpu: Decimal
    # Prices applied to the GPU service units, one per GPU model.
    gpu_a100: Decimal
    gpu_a100sxm4: Decimal
    gpu_v100: Decimal
    gpu_h100: Decimal
@dataclass
class ReportMetadata:
    """Report-wide values that are repeated on every invoice row."""

    report_month: str
    cluster_name: str
    # Start and end of the reporting period.
    report_start_time: datetime.datetime
    report_end_time: datetime.datetime
    # When the report itself was produced.
    generated_at: datetime.datetime
@dataclass
class ProjectInvoce:
    """Accumulates the SU hours of one project and renders its invoice rows."""

    project: str
    project_id: str
    rates: Rates
    su_definitions: dict
    # Optional (start, end) windows handed to ``Pod.get_runtime`` as non-billable.
    ignore_hours: Optional[List[Tuple[datetime.datetime, datetime.datetime]]] = None
    # Running total of SU hours per SU type; every type starts at zero.
    su_hours: dict = field(
        default_factory=lambda: dict.fromkeys(
            (
                SU_CPU,
                SU_A100_GPU,
                SU_A100_SXM4_GPU,
                SU_V100_GPU,
                SU_H100_GPU,
                SU_UNKNOWN_GPU,
                SU_UNKNOWN_MIG_GPU,
                SU_UNKNOWN,
            ),
            0,
        )
    )

    def add_pod(self, pod: Pod) -> None:
        """Add one pod's SU hours (SU count x billable hours) to the totals."""
        unit_type, unit_count, _ = pod.get_service_unit(self.su_definitions)
        billable_hours = pod.get_runtime(self.ignore_hours)
        self.su_hours[unit_type] += unit_count * billable_hours

    def get_rate(self, su_type) -> Decimal:
        """Return the rate for ``su_type``, or ``Decimal(0)`` for unpriced types."""
        rate_attribute = {
            SU_CPU: "cpu",
            SU_A100_GPU: "gpu_a100",
            SU_A100_SXM4_GPU: "gpu_a100sxm4",
            SU_V100_GPU: "gpu_v100",
            SU_H100_GPU: "gpu_h100",
        }.get(su_type)
        if rate_attribute is None:
            return Decimal(0)
        return getattr(self.rates, rate_attribute)

    def generate_invoice_rows(self, metadata: ReportMetadata) -> List[str]:
        """Build one invoice row (a list of column values) per SU type in use.

        SU types with no accumulated hours are skipped. Hours are rounded up
        to a whole number before the cost is computed, and the cost is
        rounded half-up to two decimal places.
        """
        invoice_rows = []
        for su_type, accumulated_hours in self.su_hours.items():
            if not (accumulated_hours > 0):
                continue

            billed_hours = math.ceil(accumulated_hours)
            rate = self.get_rate(su_type)
            cost = (rate * billed_hours).quantize(
                Decimal(".01"), rounding=ROUND_HALF_UP
            )
            invoice_rows.append(
                [
                    metadata.report_month,
                    metadata.report_start_time.isoformat(timespec="seconds"),
                    metadata.report_end_time.isoformat(timespec="seconds"),
                    self.project,
                    self.project_id,
                    "",  # pi
                    metadata.cluster_name,
                    "",  # invoice_email
                    "",  # invoice_address
                    "",  # institution
                    "",  # institution_specific_code
                    billed_hours,
                    su_type,
                    rate,
                    cost,
                    metadata.generated_at.isoformat(timespec="seconds"),
                ]
            )
        return invoice_rows