openshift-usage-scripts/openshift_metrics/invoice.py at 49c6c0e9470b68cbd265a554e2854833f26ec50d · naved001/openshift-usage-scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
import math
from dataclasses import dataclass, field
from collections import namedtuple
from typing import List, Tuple, Optional
from decimal import Decimal, ROUND_HALF_UP
import datetime

# GPU types
# Values compared against ``Pod.gpu_type`` when a pod requests a whole GPU
# (see ``Pod.get_service_unit``).
GPU_A100 = "NVIDIA-A100-40GB"
GPU_A100_SXM4 = "NVIDIA-A100-SXM4-40GB"
GPU_V100 = "Tesla-V100-PCIE-32GB"
GPU_H100 = "NVIDIA-H100-80GB-HBM3"
GPU_UNKNOWN_TYPE = "GPU_UNKNOWN_TYPE"

# GPU Resource - MIG Geometries
# A100 Strategies
# Values compared against ``Pod.gpu_resource``; WHOLE_GPU is the non-MIG
# resource name.
MIG_1G_5GB = "nvidia.com/mig-1g.5gb"
MIG_2G_10GB = "nvidia.com/mig-2g.10gb"
MIG_3G_20GB = "nvidia.com/mig-3g.20gb"
WHOLE_GPU = "nvidia.com/gpu"

# VM GPU Resources
# Resource names that map directly to a GPU service unit regardless of
# ``Pod.gpu_type`` (see ``Pod.get_service_unit``).
VM_GPU_H100 = "nvidia.com/H100_SXM5_80GB"
VM_GPU_A100_SXM4 = "nvidia.com/A100_SXM4_40GB"
VM_GPU_V100 = "nvidia.com/GV100GL_Tesla_V100"

# SU Types
# Service-unit names: used as keys of the ``su_definitions`` mapping, as keys
# of ``ProjectInvoce.su_hours`` and emitted verbatim in invoice rows.
SU_CPU = "OpenShift CPU"
SU_A100_GPU = "OpenShift GPUA100"
SU_A100_SXM4_GPU = "OpenShift GPUA100SXM4"
SU_V100_GPU = "OpenShift GPUV100"
SU_H100_GPU = "OpenShift GPUH100"
SU_UNKNOWN_GPU = "OpenShift Unknown GPU"
SU_UNKNOWN_MIG_GPU = "OpenShift Unknown MIG GPU"
SU_UNKNOWN = "Openshift Unknown"

# Result of ``Pod.get_service_unit``: the SU type, how many SUs the pod
# consumes, and which resource ("CPU", "GPU" or "RAM") determined that count.
# NOTE(review): the third field is spelled "determinig_resource" (sic). It is
# part of the tuple's public field names, so renaming it would affect any
# caller that uses attribute access - confirm before changing.
ServiceUnit = namedtuple("ServiceUnit", ["su_type", "su_count", "determinig_resource"])


@dataclass
class Pod:
    """Object that represents a pod

    ``start_time`` is treated as an epoch timestamp in seconds and
    ``duration`` as a number of seconds (they are fed to
    ``datetime.fromtimestamp`` and divided by 3600 to obtain hours).
    ``gpu_type`` and ``gpu_resource`` may be ``None`` for pods without a GPU.
    NOTE(review): ``memory_request`` is presumably expressed in GiB, since the
    service-unit "RAM" value is divided by 1024 before being compared with
    it - confirm against the caller.
    """

    pod_name: str
    namespace: str
    start_time: int
    duration: int
    cpu_request: Decimal
    gpu_request: Decimal
    memory_request: Decimal
    gpu_type: Optional[str]
    gpu_resource: Optional[str]
    node_hostname: str
    node_model: str

    def get_service_unit(self, su_definitions) -> "ServiceUnit":
        """
        Returns the type of service unit, the count, and the determining resource

        ``su_definitions`` is indexed as ``su_definitions[su_type][key]`` with
        the keys "vCPUs", "RAM" and "GPUs"; every value is converted with
        ``int()`` before use.

        The count is the largest of the CPU, GPU and memory ratios between
        what the pod requested and what one service unit provides. It is
        rounded up to a whole number for every SU type except SU_CPU.
        Pods that cannot be classified yield a count of 0.
        """
        # pods that requested a specific GPU but weren't scheduled may report 0 GPU
        if self.gpu_resource is not None and self.gpu_request == 0:
            return ServiceUnit(SU_UNKNOWN_GPU, 0, "GPU")

        # pods in weird states
        if self.cpu_request == 0 or self.memory_request == 0:
            return ServiceUnit(SU_UNKNOWN, 0, "CPU")

        # whole-GPU requests are classified by the GPU model of the node
        known_gpu_su = {
            GPU_A100: SU_A100_GPU,
            GPU_A100_SXM4: SU_A100_SXM4_GPU,
            GPU_V100: SU_V100_GPU,
            GPU_H100: SU_H100_GPU,
        }

        # VM GPU resources identify the SU type on their own
        vm_gpu_su = {
            VM_GPU_A100_SXM4: SU_A100_SXM4_GPU,
            VM_GPU_H100: SU_H100_GPU,
            VM_GPU_V100: SU_V100_GPU,
        }

        # every listed MIG geometry currently maps to the unknown MIG SU
        a100_sxm4_mig = {
            MIG_1G_5GB: SU_UNKNOWN_MIG_GPU,
            MIG_2G_10GB: SU_UNKNOWN_MIG_GPU,
            MIG_3G_20GB: SU_UNKNOWN_MIG_GPU,
        }

        if self.gpu_resource is None and self.gpu_request == 0:
            su_type = SU_CPU
        elif self.gpu_type is not None and self.gpu_resource == WHOLE_GPU:
            su_type = known_gpu_su.get(self.gpu_type, SU_UNKNOWN_GPU)
        elif self.gpu_resource in vm_gpu_su:
            su_type = vm_gpu_su[self.gpu_resource]
        elif self.gpu_type == GPU_A100_SXM4:  # for MIG GPU of type A100_SXM4
            su_type = a100_sxm4_mig.get(self.gpu_resource, SU_UNKNOWN_MIG_GPU)
        else:
            return ServiceUnit(SU_UNKNOWN_GPU, 0, "GPU")

        su_definition = su_definitions[su_type]
        cpu_multiplier = self.cpu_request / int(su_definition["vCPUs"])
        # the SU "RAM" value is scaled down by 1024 before the comparison
        memory_multiplier = self.memory_request / int(
            (int(su_definition["RAM"]) / 1024)
        )
        gpus_per_su = int(su_definition["GPUs"])
        if gpus_per_su != 0:
            gpu_multiplier = self.gpu_request / gpus_per_su
        else:
            # SU types without GPUs never have GPU as the limiting resource
            gpu_multiplier = 0

        su_count = max(cpu_multiplier, gpu_multiplier, memory_multiplier)

        # no fractional SUs for GPU SUs
        if su_type != SU_CPU:
            su_count = math.ceil(su_count)

        # ties are resolved in the order GPU, CPU, RAM
        if gpu_multiplier >= cpu_multiplier and gpu_multiplier >= memory_multiplier:
            determining_resource = "GPU"
        elif cpu_multiplier >= gpu_multiplier and cpu_multiplier >= memory_multiplier:
            determining_resource = "CPU"
        else:
            determining_resource = "RAM"

        return ServiceUnit(su_type, su_count, determining_resource)

    def get_runtime(
        self,
        ignore_times: Optional[
            List[Tuple[datetime.datetime, datetime.datetime]]
        ] = None,
    ) -> Decimal:
        """Return runtime eligible for billing in hours

        ``ignore_times`` is an optional list of ``(start, end)`` datetimes.
        Any part of the pod's lifetime that falls inside one of those windows
        is not billed. Windows may overlap each other; time covered by more
        than one window is only deducted once.
        """

        total_runtime = self.duration

        if ignore_times:
            # Clip every window to the pod's lifetime and sort by start so the
            # union of the windows can be measured in a single pass.
            clipped_windows = sorted(
                (
                    max(self.start_time, int(ignore_start_date.timestamp())),
                    min(self.end_time, int(ignore_end_date.timestamp())),
                )
                for ignore_start_date, ignore_end_date in ignore_times
            )

            ignored_seconds = 0
            covered_until = self.start_time
            for window_start, window_end in clipped_windows:
                # skip the part already deducted by an earlier window
                window_start = max(window_start, covered_until)
                if window_end > window_start:
                    ignored_seconds += window_end - window_start
                    covered_until = window_end

            total_runtime = max(0, total_runtime - ignored_seconds)

        return Decimal(total_runtime) / 3600

    @property
    def end_time(self) -> int:
        """Epoch timestamp at which the pod interval ends (start + duration)."""
        return self.start_time + self.duration

    def generate_pod_row(self, ignore_times, su_definitions):
        """
        This returns a row to represent pod data.
        It converts the epoch_time stamps to datetime timestamps so it's more readable.
        Additionally, some metrics are rounded for readability.

        ``ignore_times`` is forwarded to ``get_runtime`` and ``su_definitions``
        to ``get_service_unit``. Memory request and runtime are rounded to
        four decimal places (half up).
        """
        su_type, su_count, determining_resource = self.get_service_unit(su_definitions)

        timestamp_format = "%Y-%m-%dT%H:%M:%S"
        # datetime.timezone.utc is the object datetime.UTC aliases, and it is
        # also available on Python versions older than 3.11
        utc = datetime.timezone.utc
        start_time = datetime.datetime.fromtimestamp(self.start_time, utc).strftime(
            timestamp_format
        )
        end_time = datetime.datetime.fromtimestamp(self.end_time, utc).strftime(
            timestamp_format
        )

        four_places = Decimal(".0001")
        memory_request = self.memory_request.quantize(
            four_places, rounding=ROUND_HALF_UP
        )
        runtime = self.get_runtime(ignore_times).quantize(
            four_places, rounding=ROUND_HALF_UP
        )
        return [
            self.namespace,
            start_time,
            end_time,
            runtime,
            self.pod_name,
            self.cpu_request,
            self.gpu_request,
            self.gpu_type,
            self.gpu_resource,
            self.node_hostname,
            self.node_model,
            memory_request,
            determining_resource,
            su_type,
            su_count,
        ]


@dataclass
class Rates:
    """Rate charged for each billable service-unit type.

    One field per SU type that ``ProjectInvoce.get_rate`` knows how to price.
    """

    # rate for the CPU service unit
    cpu: Decimal
    # rates for the GPU service units, one per GPU model
    gpu_a100: Decimal
    gpu_a100sxm4: Decimal
    gpu_v100: Decimal
    gpu_h100: Decimal


@dataclass
class ReportMetadata:
    """Report-wide values repeated on every invoice row.

    ``ProjectInvoce.generate_invoice_rows`` writes the month and cluster name
    as-is and renders the three datetimes with ``isoformat``.
    """

    report_month: str
    cluster_name: str
    # reporting window covered by the invoice
    report_start_time: datetime.datetime
    report_end_time: datetime.datetime
    # moment the report was produced
    generated_at: datetime.datetime


@dataclass
class ProjectInvoce:
    """Represents the invoicing data for a project.

    Pods are fed in through ``add_pod``, which accumulates SU-hours per
    service-unit type in ``su_hours``. ``generate_invoice_rows`` then turns
    every SU type with a positive total into one invoice row.
    """

    project: str
    project_id: str
    rates: Rates
    su_definitions: dict
    # windows passed to ``Pod.get_runtime`` as time that must not be billed
    ignore_hours: Optional[List[Tuple[datetime.datetime, datetime.datetime]]] = None
    # running total of SU-hours per SU type; insertion order is row order
    su_hours: dict = field(
        default_factory=lambda: {
            SU_CPU: 0,
            SU_A100_GPU: 0,
            SU_A100_SXM4_GPU: 0,
            SU_V100_GPU: 0,
            SU_H100_GPU: 0,
            SU_UNKNOWN_GPU: 0,
            SU_UNKNOWN_MIG_GPU: 0,
            SU_UNKNOWN: 0,
        }
    )

    def add_pod(self, pod: Pod) -> None:
        """Aggregate a pods data

        Adds ``su_count * billable hours`` of the pod to the total of the
        pod's service-unit type.
        """
        su_type, su_count, _ = pod.get_service_unit(self.su_definitions)
        duration_in_hours = pod.get_runtime(self.ignore_hours)
        self.su_hours[su_type] += su_count * duration_in_hours

    def get_rate(self, su_type) -> Decimal:
        """Return the rate for ``su_type``; SU types without a rate cost 0."""
        rate_attribute_by_su = {
            SU_CPU: "cpu",
            SU_A100_GPU: "gpu_a100",
            SU_A100_SXM4_GPU: "gpu_a100sxm4",
            SU_V100_GPU: "gpu_v100",
            SU_H100_GPU: "gpu_h100",
        }
        rate_attribute = rate_attribute_by_su.get(su_type)
        if rate_attribute is None:
            # unknown SU types are reported but never charged
            return Decimal(0)
        return getattr(self.rates, rate_attribute)

    def generate_invoice_rows(self, metadata: ReportMetadata) -> List[list]:
        """Build one invoice row per SU type that accrued any hours.

        Hours are rounded up to the next whole hour before pricing and the
        cost is rounded to two decimal places (half up). Each row is a list
        of column values; columns this class has no data for are emitted as
        empty strings.
        """
        rows = []
        for su_type, accrued_hours in self.su_hours.items():
            if not accrued_hours > 0:
                continue

            # partial hours are billed as a full hour
            billable_hours = math.ceil(accrued_hours)
            rate = self.get_rate(su_type)
            cost = (rate * billable_hours).quantize(
                Decimal(".01"), rounding=ROUND_HALF_UP
            )
            rows.append(
                [
                    metadata.report_month,
                    metadata.report_start_time.isoformat(timespec="seconds"),
                    metadata.report_end_time.isoformat(timespec="seconds"),
                    self.project,
                    self.project_id,
                    "",  # pi
                    metadata.cluster_name,
                    "",  # invoice_email
                    "",  # invoice_address
                    "",  # institution
                    "",  # institution_specific_code
                    billable_hours,
                    su_type,
                    rate,
                    cost,
                    metadata.generated_at.isoformat(timespec="seconds"),
                ]
            )
        return rows