Skip to content

Commit 6fbbd6c

Browse files
trentm and legendecas authored
feat(host-metrics): add 'metricGroups' configuration option to limit which metrics are collected (#3149)
Collection of some host-metrics metrics can be costly. A Metrics View can be used to drop metrics, but the cost of collecting them in the first place is still incurred. This adds a config option to select which groups of metrics should be collected. Co-authored-by: Chengzhong Wu <[email protected]>
1 parent 835b997 commit 6fbbd6c

File tree

4 files changed

+256
-106
lines changed

4 files changed

+256
-106
lines changed

packages/host-metrics/README.md

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,12 @@ const hostMetrics = new HostMetrics({ meterProvider });
4040
hostMetrics.start();
4141
```
4242

43+
## Configuration
44+
45+
| Option | Type | Description |
46+
| -------------- | ---------- | ----------- |
47+
| `metricGroups` | `string[]` | Optionally specify zero or more groups of metrics to collect. This package can collect many metrics. They are grouped by metric name prefix (see the "Semantic Conventions" section below). If this option is specified, only metrics from the named groups will be collected; if it is not specified, metrics from all groups are collected. For example, `metricGroups: ['process.cpu', 'process.memory']` will limit collection to just the 3 metrics in those two groups. |
48+
4349
## Semantic Conventions
4450

4551
This package uses Semantic Conventions [Version 1.25.0](https://github.com/open-telemetry/semantic-conventions/tree/v1.25.0/docs/system).
@@ -48,18 +54,25 @@ Ref: [opentelemetry-js/issues/4235](https://github.com/open-telemetry/openteleme
4854

4955
Metrics collected:
5056

51-
| Metric | Short Description |
52-
| --------------------------- | --------------------------------------------------------- |
53-
| `system.cpu.time` | Seconds each logical CPU spent on each mode |
54-
| `system.cpu.utilization` | CPU usage time (0-1) |
55-
| `system.memory.usage` | Reports memory in use by state |
56-
| `system.memory.utilization` | Memory usage (0-1) |
57-
| `system.network.dropped` | Count of packets that are dropped |
58-
| `system.network.errors` | Count of network errors detected |
59-
| `system.network.io` | Network flow direction |
60-
| `process.cpu.time` | Total CPU seconds |
61-
| `process.cpu.utilization` | Difference in process.cpu.time since the last measurement |
62-
| `process.memory.usage` | The amount of physical memory in use |
57+
| Metric | Short Description |
58+
| ----------------------------- | --------------------------------------------------------- |
59+
| **Group `system.cpu`** | |
60+
| `system.cpu.time` | Seconds each logical CPU spent on each mode |
61+
| `system.cpu.utilization` | CPU usage time (0-1) |
62+
| **Group `system.memory`** | |
63+
| `system.memory.usage` | Reports memory in use by state |
64+
| `system.memory.utilization` | Memory usage (0-1) |
65+
| **Group `system.network`** | |
66+
| `system.network.dropped` | Count of packets that are dropped |
67+
| `system.network.errors` | Count of network errors detected |
68+
| `system.network.io` | Network flow direction |
69+
| **Group `process.cpu`** | |
70+
| `process.cpu.time` | Total CPU seconds |
71+
| `process.cpu.utilization` | Difference in process.cpu.time since the last measurement |
72+
| **Group `process.memory`** | |
73+
| `process.memory.usage` | The amount of physical memory in use |
74+
75+
Note: the "Group" names are groupings used by the `metricGroups` configuration option.
6376

6477
Attributes collected:
6578

packages/host-metrics/src/BaseMetrics.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
* limitations under the License.
1515
*/
1616

17-
import { Meter, MeterProvider, diag, metrics } from '@opentelemetry/api';
17+
import { Meter, MeterProvider, metrics } from '@opentelemetry/api';
1818

1919
/** @knipignore */
2020
import { PACKAGE_NAME, PACKAGE_VERSION } from './version';
@@ -27,6 +27,7 @@ export interface MetricsCollectorConfig {
2727
meterProvider?: MeterProvider;
2828
// Name of component
2929
name?: string;
30+
metricGroups?: string[];
3031
}
3132

3233
const DEFAULT_NAME = PACKAGE_NAME;
@@ -35,16 +36,17 @@ const DEFAULT_NAME = PACKAGE_NAME;
3536
* Base Class for metrics
3637
*/
3738
export abstract class BaseMetrics {
38-
protected _logger = diag;
3939
protected _meter: Meter;
4040
private _name: string;
41+
protected _metricGroups: Array<string> | undefined;
4142

4243
constructor(config?: MetricsCollectorConfig) {
4344
// Do not use `??` operator to allow falling back to default when the
4445
// specified name is an empty string.
4546
this._name = config?.name || DEFAULT_NAME;
4647
const meterProvider = config?.meterProvider ?? metrics.getMeterProvider();
4748
this._meter = meterProvider.getMeter(this._name, PACKAGE_VERSION);
49+
this._metricGroups = config?.metricGroups;
4850
}
4951

5052
/**

packages/host-metrics/src/metric.ts

Lines changed: 114 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -205,102 +205,130 @@ export class HostMetrics extends BaseMetrics {
205205
* Creates metrics
206206
*/
207207
protected _createMetrics(): void {
208-
this._cpuTime = this._meter.createObservableCounter(
209-
METRIC_SYSTEM_CPU_TIME,
210-
{
211-
description: 'Cpu time in seconds',
212-
unit: 's',
213-
}
214-
);
215-
this._cpuUtilization = this._meter.createObservableGauge(
216-
METRIC_SYSTEM_CPU_UTILIZATION,
217-
{
218-
description: 'Cpu usage time 0-1',
219-
}
220-
);
208+
const observables = [];
221209

222-
this._memoryUsage = this._meter.createObservableGauge(
223-
METRIC_SYSTEM_MEMORY_USAGE,
224-
{
225-
description: 'Memory usage in bytes',
226-
}
227-
);
228-
this._memoryUtilization = this._meter.createObservableGauge(
229-
METRIC_SYSTEM_MEMORY_UTILIZATION,
230-
{
231-
description: 'Memory usage 0-1',
232-
}
233-
);
210+
const systemCpuGroupEnabled =
211+
!this._metricGroups || this._metricGroups.includes('system.cpu');
212+
const systemMemoryGroupEnabled =
213+
!this._metricGroups || this._metricGroups.includes('system.memory');
214+
const systemNetworkGroupEnabled =
215+
!this._metricGroups || this._metricGroups.includes('system.network');
216+
const processCpuGroupEnabled =
217+
!this._metricGroups || this._metricGroups.includes('process.cpu');
218+
const processMemoryGroupEnabled =
219+
!this._metricGroups || this._metricGroups.includes('process.memory');
234220

235-
this._networkDropped = this._meter.createObservableCounter(
236-
// There is no semconv pkg export for this in v1.37.0 because
237-
// https://github.com/open-telemetry/semantic-conventions/issues/2828.
238-
// TODO: update to `METRIC_SYSTEM_NETWORK_PACKET_DROPPED` (breaking change)
239-
'system.network.dropped',
240-
{
241-
description: 'Network dropped packets',
242-
}
243-
);
244-
this._networkErrors = this._meter.createObservableCounter(
245-
METRIC_SYSTEM_NETWORK_ERRORS,
246-
{
247-
description: 'Network errors counter',
248-
}
249-
);
250-
this._networkIo = this._meter.createObservableCounter(
251-
METRIC_SYSTEM_NETWORK_IO,
252-
{
253-
description: 'Network transmit and received bytes',
254-
}
255-
);
221+
if (systemCpuGroupEnabled) {
222+
this._cpuTime = this._meter.createObservableCounter(
223+
METRIC_SYSTEM_CPU_TIME,
224+
{
225+
description: 'Cpu time in seconds',
226+
unit: 's',
227+
}
228+
);
229+
observables.push(this._cpuTime);
230+
this._cpuUtilization = this._meter.createObservableGauge(
231+
METRIC_SYSTEM_CPU_UTILIZATION,
232+
{
233+
description: 'Cpu usage time 0-1',
234+
}
235+
);
236+
observables.push(this._cpuUtilization);
237+
}
256238

257-
this._processCpuTime = this._meter.createObservableCounter(
258-
METRIC_PROCESS_CPU_TIME,
259-
{
260-
description: 'Process Cpu time in seconds',
261-
unit: 's',
262-
}
263-
);
264-
this._processCpuUtilization = this._meter.createObservableGauge(
265-
METRIC_PROCESS_CPU_UTILIZATION,
266-
{
267-
description: 'Process Cpu usage time 0-1',
268-
}
269-
);
270-
this._processMemoryUsage = this._meter.createObservableGauge(
271-
METRIC_PROCESS_MEMORY_USAGE,
272-
{
273-
description: 'Process Memory usage in bytes',
274-
}
275-
);
239+
if (systemMemoryGroupEnabled) {
240+
this._memoryUsage = this._meter.createObservableGauge(
241+
METRIC_SYSTEM_MEMORY_USAGE,
242+
{
243+
description: 'Memory usage in bytes',
244+
}
245+
);
246+
observables.push(this._memoryUsage);
247+
this._memoryUtilization = this._meter.createObservableGauge(
248+
METRIC_SYSTEM_MEMORY_UTILIZATION,
249+
{
250+
description: 'Memory usage 0-1',
251+
}
252+
);
253+
observables.push(this._memoryUtilization);
254+
}
276255

277-
this._meter.addBatchObservableCallback(
278-
async observableResult => {
279-
const cpuUsages = getCpuUsageData();
280-
const memoryUsages = getMemoryData();
281-
const processCpuUsages = getProcessCpuUsageData();
282-
const processMemoryUsages = getProcessMemoryData();
283-
const networkData = await getNetworkData();
256+
if (systemNetworkGroupEnabled) {
257+
this._networkDropped = this._meter.createObservableCounter(
258+
// There is no semconv pkg export for this in v1.37.0 because
259+
// https://github.com/open-telemetry/semantic-conventions/issues/2828.
260+
// TODO: update to `METRIC_SYSTEM_NETWORK_PACKET_DROPPED` (breaking change)
261+
'system.network.dropped',
262+
{
263+
description: 'Network dropped packets',
264+
}
265+
);
266+
observables.push(this._networkDropped);
267+
this._networkErrors = this._meter.createObservableCounter(
268+
METRIC_SYSTEM_NETWORK_ERRORS,
269+
{
270+
description: 'Network errors counter',
271+
}
272+
);
273+
observables.push(this._networkErrors);
274+
this._networkIo = this._meter.createObservableCounter(
275+
METRIC_SYSTEM_NETWORK_IO,
276+
{
277+
description: 'Network transmit and received bytes',
278+
}
279+
);
280+
observables.push(this._networkIo);
281+
}
282+
283+
if (processCpuGroupEnabled) {
284+
this._processCpuTime = this._meter.createObservableCounter(
285+
METRIC_PROCESS_CPU_TIME,
286+
{
287+
description: 'Process Cpu time in seconds',
288+
unit: 's',
289+
}
290+
);
291+
observables.push(this._processCpuTime);
292+
this._processCpuUtilization = this._meter.createObservableGauge(
293+
METRIC_PROCESS_CPU_UTILIZATION,
294+
{
295+
description: 'Process Cpu usage time 0-1',
296+
}
297+
);
298+
observables.push(this._processCpuUtilization);
299+
}
300+
if (processMemoryGroupEnabled) {
301+
this._processMemoryUsage = this._meter.createObservableGauge(
302+
METRIC_PROCESS_MEMORY_USAGE,
303+
{
304+
description: 'Process Memory usage in bytes',
305+
}
306+
);
307+
observables.push(this._processMemoryUsage);
308+
}
284309

310+
this._meter.addBatchObservableCallback(async observableResult => {
311+
if (systemCpuGroupEnabled) {
312+
const cpuUsages = getCpuUsageData();
285313
this._batchUpdateCpuUsages(observableResult, cpuUsages);
314+
}
315+
if (systemMemoryGroupEnabled) {
316+
const memoryUsages = getMemoryData();
286317
this._batchUpdateMemUsages(observableResult, memoryUsages);
318+
}
319+
if (processCpuGroupEnabled) {
320+
const processCpuUsages = getProcessCpuUsageData();
287321
this._batchUpdateProcessCpuUsages(observableResult, processCpuUsages);
322+
}
323+
if (processMemoryGroupEnabled) {
324+
const processMemoryUsages = getProcessMemoryData();
288325
this._batchUpdateProcessMemUsage(observableResult, processMemoryUsages);
326+
}
327+
if (systemNetworkGroupEnabled) {
328+
const networkData = await getNetworkData();
289329
this._batchUpdateNetworkData(observableResult, networkData);
290-
},
291-
[
292-
this._cpuTime,
293-
this._cpuUtilization,
294-
this._memoryUsage,
295-
this._memoryUtilization,
296-
this._processCpuTime,
297-
this._processCpuUtilization,
298-
this._processMemoryUsage,
299-
this._networkDropped,
300-
this._networkErrors,
301-
this._networkIo,
302-
]
303-
);
330+
}
331+
}, observables);
304332
}
305333

306334
/**

0 commit comments

Comments
 (0)