-
Notifications
You must be signed in to change notification settings - Fork 1.5k
/
Copy pathdisk.py
572 lines (472 loc) · 24.3 KB
/
disk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
# (C) Datadog, Inc. 2018-present
# All rights reserved
# Licensed under a 3-clause BSD style license (see LICENSE)
from __future__ import division
import os
import platform
import re
from xml.etree import ElementTree as ET
import psutil
from datadog_checks.base import AgentCheck, ConfigurationError, is_affirmative
from datadog_checks.base.utils.platform import Platform
from datadog_checks.base.utils.subprocess_output import SubprocessOutputEmptyError, get_subprocess_output
from datadog_checks.base.utils.timeout import TimeoutException, timeout
if platform.system() == 'Windows':
import win32wnet
# See: https://github.com/DataDog/integrations-core/pull/1109#discussion_r167133580
IGNORE_CASE = re.I
def _base_device_name(device):
return device.strip('\\').lower()
else:
IGNORE_CASE = 0
def _base_device_name(device):
return os.path.basename(device)
class Disk(AgentCheck):
    """Collects metrics about the machine's disks.

    Emits per-partition space/inode usage gauges (``system.disk.*`` and
    ``system.fs.inodes.*``), per-disk I/O latency counters, and an optional
    ``disk.read_write`` service check. Only a single configured instance
    is supported.
    """

    # Metric name templates, filled with 'total', 'used', 'free', etc.
    METRIC_DISK = 'system.disk.{}'
    METRIC_INODE = 'system.fs.inodes.{}'

    def __init__(self, name, init_config, instances):
        # This check is built around a single instance's configuration.
        if instances is not None and len(instances) > 1:
            raise ConfigurationError('Disk check only supports one configured instance.')
        super(Disk, self).__init__(name, init_config, instances)

        instance = instances[0]
        # Tag metrics by mount point instead of device name when requested.
        self._use_mount = is_affirmative(instance.get('use_mount', False))
        # When true, partitions with no device (e.g. some network mounts) are
        # still evaluated against the mount point filters instead of skipped.
        self._all_partitions = is_affirmative(instance.get('all_partitions', False))
        # New option names take precedence over their deprecated
        # *_whitelist/*_blacklist counterparts.
        self._file_system_include = instance.get('file_system_include', []) or instance.get('file_system_whitelist', [])
        self._file_system_exclude = instance.get('file_system_exclude', []) or instance.get('file_system_blacklist', [])
        # FIXME (8.X): Exclude special file systems by default
        self._include_all_devices = is_affirmative(instance.get('include_all_devices', True))
        self._device_include = instance.get('device_include', []) or instance.get('device_whitelist', [])
        self._device_exclude = instance.get('device_exclude', []) or instance.get('device_blacklist', [])
        self._mount_point_include = instance.get('mount_point_include', []) or instance.get('mount_point_whitelist', [])
        self._mount_point_exclude = instance.get('mount_point_exclude', []) or instance.get('mount_point_blacklist', [])
        self._tag_by_filesystem = is_affirmative(instance.get('tag_by_filesystem', False))
        self._tag_by_label = is_affirmative(instance.get('tag_by_label', True))
        # Mapping of regex string -> comma-separated tags; compiled below.
        self._device_tag_re = instance.get('device_tag_re', {})
        self._custom_tags = instance.get('tags', [])
        self._service_check_rw = is_affirmative(instance.get('service_check_rw', False))
        # Configured in MiB; converted to bytes here so it can be compared
        # directly against psutil's disk_usage().total.
        self._min_disk_size = instance.get('min_disk_size', 0) * 1024 * 1024
        self._blkid_cache_file = instance.get('blkid_cache_file')
        self._use_lsblk = is_affirmative(instance.get('use_lsblk', False))
        # Per-call timeout (seconds) applied to disk_usage()/statvfs() calls.
        self._timeout = instance.get('timeout', 5)
        self._compile_pattern_filters(instance)
        self._compile_tag_re()
        self._blkid_label_re = re.compile('LABEL=\"(.*?)\"', re.I)
        self._lowercase_device_tag = is_affirmative(instance.get('lowercase_device_tag', False))

        # lsblk and the blkid cache are mutually exclusive label sources.
        if self._use_lsblk and self._blkid_cache_file:
            raise ConfigurationError("Only one of 'use_lsblk' and 'blkid_cache_file' can be set at the same time.")

        if platform.system() == 'Windows':
            self._manual_mounts = instance.get('create_mounts', [])
            self._create_manual_mounts()

        # Warn once, at init, about deprecated init_config option names.
        deprecations_init_conf = {
            'file_system_global_blacklist': 'file_system_global_exclude',
            'device_global_blacklist': 'device_global_exclude',
            'mount_point_global_blacklist': 'mount_point_global_exclude',
        }
        for old_name, new_name in deprecations_init_conf.items():
            if init_config.get(old_name):
                self.warning(
                    '`%s` is deprecated and will be removed in a future release. Please use `%s` instead.',
                    old_name,
                    new_name,
                )

        # Same warning treatment for deprecated instance-level option names.
        deprecations_instance = {
            'file_system_whitelist': 'file_system_include',
            'file_system_blacklist': 'file_system_exclude',
            'device_whitelist': 'device_include',
            'device_blacklist': 'device_exclude',
            'mount_point_whitelist': 'mount_point_include',
            'mount_point_blacklist': 'mount_point_exclude',
            'excluded_filesystems': 'file_system_exclude',
            'excluded_disks': 'device_exclude',
            'excluded_disk_re': 'device_exclude',
            'excluded_mountpoint_re': 'mount_point_exclude',
        }
        for old_name, new_name in deprecations_instance.items():
            if instance.get(old_name):
                self.warning(
                    '`%s` is deprecated and will be removed in a future release. Please use `%s` instead.',
                    old_name,
                    new_name,
                )

        # Map of device name -> list of label tags; refreshed on Linux in check().
        self.devices_label = {}

    def check(self, _):
        """Get disk space/inode stats for every non-excluded partition."""
        if self._tag_by_label and Platform.is_linux():
            self.devices_label = self._get_devices_label()
        for part in psutil.disk_partitions(all=self._include_all_devices):
            self.log.debug('Checking device %s', part.device)
            # we check all exclude conditions
            if self.exclude_disk(part):
                self.log.debug('Excluding device %s', part.device)
                continue

            # Get disk metrics here to be able to exclude on total usage
            try:
                disk_usage = timeout(self._timeout)(psutil.disk_usage)(part.mountpoint)
            except TimeoutException:
                self.log.warning(
                    u'Timeout after %d seconds while retrieving the disk usage of `%s` mountpoint. '
                    u'You might want to change the timeout length in the settings.',
                    self._timeout,
                    part.mountpoint,
                )
                continue
            except Exception as e:
                self.log.debug(
                    u'Unable to get disk metrics for %s: %s. '
                    u'You can exclude this mountpoint in the settings if it is invalid.',
                    part.mountpoint,
                    e,
                )
                continue

            # Exclude disks with size less than min_disk_size
            if disk_usage.total <= self._min_disk_size:
                self.log.debug('Excluding device %s with total disk size %s', part.device, disk_usage.total)
                # Info-level only for devices with real capacity, so
                # zero-sized pseudo devices don't spam the log.
                if disk_usage.total > 0:
                    self.log.info('Excluding device %s with total disk size %s', part.device, disk_usage.total)
                continue

            self.log.debug('Passed: %s', part.device)

            tags = self._get_tags(part)
            for metric_name, metric_value in self._collect_part_metrics(part, disk_usage).items():
                self.gauge(metric_name, metric_value, tags=tags)

            # Add in a disk read write or read only check
            if self._service_check_rw:
                rwro = {'rw', 'ro'} & set(part.opts.split(','))
                # Exactly one of 'rw'/'ro' in the mount options -> definitive status.
                if len(rwro) == 1:
                    self.service_check(
                        'disk.read_write', AgentCheck.OK if rwro.pop() == 'rw' else AgentCheck.CRITICAL, tags=tags
                    )
                else:
                    self.service_check('disk.read_write', AgentCheck.UNKNOWN, tags=tags)

        self.collect_latency_metrics()

    def _get_tags(self, part):
        """Build the tag list for one partition (filesystem, custom, device tags, labels)."""
        device_name = part.mountpoint if self._use_mount else part.device
        tags = [part.fstype, 'filesystem:{}'.format(part.fstype)] if self._tag_by_filesystem else []
        tags.extend(self._custom_tags)

        # apply device-specific tags
        device_specific_tags = self._get_device_specific_tags(device_name)
        tags.extend(device_specific_tags)

        # apply device labels as tags (from blkid or lsblk).
        # we want to use the real device name and not the device_name (which can be the mountpoint)
        if self.devices_label.get(part.device):
            tags.extend(self.devices_label.get(part.device))

        # legacy check names c: vs psutil name C:\\
        if Platform.is_win32():
            device_name = device_name.strip('\\').lower()

        tags.append('device:{}'.format(device_name.lower() if self._lowercase_device_tag else device_name))
        tags.append('device_name:{}'.format(_base_device_name(part.device)))
        return tags

    def exclude_disk(self, part):
        """Return True if this psutil partition should be skipped entirely."""
        # skip cd-rom drives with no disk in it; they may raise
        # ENOENT, pop-up a Windows GUI error for a non-ready
        # partition or just hang;
        # and all the other excluded disks
        skip_win = Platform.is_win32() and ('cdrom' in part.opts or part.fstype == '')
        return skip_win or self._exclude_disk(part.device, part.fstype, part.mountpoint)

    def _exclude_disk(self, device, file_system, mount_point):
        """
        Return True for disks we don't want or that match regex in the config file
        """
        if not device or device == 'none':
            device = None

            # Allow no device if `all_partitions` is true so we can evaluate mount points
            if not self._all_partitions:
                return True

        # Hack for NFS secure mounts
        # Secure mounts might look like this: '/mypath (deleted)', we should
        # ignore all the bits not part of the mount point name. Take also into
        # account a space might be in the mount point.
        mount_point = mount_point.rsplit(' ', 1)[0]

        return self._partition_excluded(device, file_system, mount_point) or not self._partition_included(
            device, file_system, mount_point
        )

    def _partition_included(self, device, file_system, mount_point):
        # A partition must pass all three include filters to be kept.
        return (
            self._file_system_included(file_system)
            and self._device_included(device)
            and self._mount_point_included(mount_point)
        )

    def _partition_excluded(self, device, file_system, mount_point):
        # Any single exclude filter match is enough to drop the partition.
        return (
            self._file_system_excluded(file_system)
            or self._device_excluded(device)
            or self._mount_point_excluded(mount_point)
        )

    def _file_system_included(self, file_system):
        # None means no include patterns configured -> everything is included.
        if self._file_system_include is None:
            return True

        return not not self._file_system_include.match(file_system)

    def _file_system_excluded(self, file_system):
        # None means no exclude patterns configured -> nothing is excluded.
        if self._file_system_exclude is None:
            return False

        return not not self._file_system_exclude.match(file_system)

    def _device_included(self, device):
        # A missing device cannot be filtered by device patterns.
        if not device or self._device_include is None:
            return True

        return not not self._device_include.match(device)

    def _device_excluded(self, device):
        # A missing device cannot be filtered by device patterns.
        if not device or self._device_exclude is None:
            return False

        return not not self._device_exclude.match(device)

    def _mount_point_included(self, mount_point):
        if self._mount_point_include is None:
            return True

        return not not self._mount_point_include.match(mount_point)

    def _mount_point_excluded(self, mount_point):
        if self._mount_point_exclude is None:
            return False

        return not not self._mount_point_exclude.match(mount_point)

    def _collect_part_metrics(self, part, usage):
        """Return a dict of disk-usage metric name -> value for one partition."""
        metrics = {}

        for name in ['total', 'used', 'free']:
            # For legacy reasons, the standard unit is kB
            metrics[self.METRIC_DISK.format(name)] = getattr(usage, name) / 1024

        metrics[self.METRIC_DISK.format('utilized')] = usage.percent
        # TODO: deprecate in favor of `utilized` metric
        metrics[self.METRIC_DISK.format('in_use')] = usage.percent / 100

        if Platform.is_unix():
            metrics.update(self._collect_inodes_metrics(part.mountpoint))

        return metrics

    def _collect_inodes_metrics(self, mountpoint):
        """Return inode usage metrics for *mountpoint* via statvfs (Unix only)."""
        metrics = {}
        # we need to timeout this, too.
        try:
            inodes = timeout(self._timeout)(os.statvfs)(mountpoint)
        except TimeoutException:
            self.log.warning(
                u'Timeout after %d seconds while retrieving the disk usage of `%s` mountpoint. '
                u'You might want to change the timeout length in the settings.',
                self._timeout,
                mountpoint,
            )
            return metrics
        except Exception as e:
            self.log.debug(
                u'Unable to get disk metrics for %s: %s. '
                u'You can exclude this mountpoint in the settings if it is invalid.',
                mountpoint,
                e,
            )
            return metrics

        # Guard against division by zero for filesystems reporting no inodes.
        if inodes.f_files != 0:
            total = inodes.f_files
            free = inodes.f_ffree

            metrics[self.METRIC_INODE.format('total')] = total
            metrics[self.METRIC_INODE.format('free')] = free

            used = total - free
            metrics[self.METRIC_INODE.format('used')] = used
            metrics[self.METRIC_INODE.format('utilized')] = (used / total) * 100
            # TODO: deprecate in favor of `utilized` metric
            metrics[self.METRIC_INODE.format('in_use')] = used / total

        return metrics

    def collect_latency_metrics(self):
        """Emit per-disk read/write time counters from psutil I/O counters."""
        for disk_name, disk in psutil.disk_io_counters(perdisk=True).items():
            self.log.debug('IO Counters: %s -> %s', disk_name, disk)
            try:
                metric_tags = [] if self._custom_tags is None else self._custom_tags[:]

                device_specific_tags = self._get_device_specific_tags(disk_name)
                metric_tags.extend(device_specific_tags)

                metric_tags.append('device:{}'.format(disk_name.lower() if self._lowercase_device_tag else disk_name))
                metric_tags.append('device_name:{}'.format(_base_device_name(disk_name)))
                if self.devices_label.get(disk_name):
                    metric_tags.extend(self.devices_label.get(disk_name))
                self.monotonic_count(self.METRIC_DISK.format('read_time'), disk.read_time, tags=metric_tags)
                self.monotonic_count(self.METRIC_DISK.format('write_time'), disk.write_time, tags=metric_tags)
                # FIXME: 8.x, metrics kept for backwards compatibility but are incorrect: the value is not a percentage
                # See: https://github.com/DataDog/integrations-core/pull/7323#issuecomment-756427024
                self.rate(self.METRIC_DISK.format('read_time_pct'), disk.read_time * 100 / 1000, tags=metric_tags)
                self.rate(self.METRIC_DISK.format('write_time_pct'), disk.write_time * 100 / 1000, tags=metric_tags)
            except AttributeError as e:
                # Some OS don't return read_time/write_time fields
                # http://psutil.readthedocs.io/en/latest/#psutil.disk_io_counters
                self.log.debug('Latency metrics not collected for %s: %s', disk_name, e)

    def _compile_pattern_filters(self, instance):
        """Merge global/default/legacy exclude patterns and compile all filters."""
        file_system_exclude_extras = self.init_config.get(
            'file_system_global_exclude',
            self.init_config.get('file_system_global_blacklist', self.get_default_file_system_exclude()),
        )
        device_exclude_extras = self.init_config.get(
            'device_global_exclude', self.init_config.get('device_global_blacklist', self.get_default_device_exclude())
        )
        mount_point_exclude_extras = self.init_config.get(
            'mount_point_global_exclude',
            self.init_config.get('mount_point_global_blacklist', self.get_default_mount_mount_exclude()),
        )

        # Legacy options held exact names; anchor them with `$` to keep the
        # old full-match semantics under regex matching.
        if 'excluded_filesystems' in instance:
            file_system_exclude_extras.extend(
                '{}$'.format(pattern) for pattern in instance['excluded_filesystems'] if pattern
            )

        if 'excluded_disks' in instance:
            device_exclude_extras.extend('{}$'.format(pattern) for pattern in instance['excluded_disks'] if pattern)

        if 'excluded_disk_re' in instance:
            device_exclude_extras.append(instance['excluded_disk_re'])

        if 'excluded_mountpoint_re' in instance:
            mount_point_exclude_extras.append(instance['excluded_mountpoint_re'])

        # Any without valid patterns will become None
        self._file_system_include = self._compile_valid_patterns(self._file_system_include, casing=re.I)
        self._file_system_exclude = self._compile_valid_patterns(
            self._file_system_exclude, casing=re.I, extra_patterns=file_system_exclude_extras
        )
        self._device_include = self._compile_valid_patterns(self._device_include)
        self._device_exclude = self._compile_valid_patterns(self._device_exclude, extra_patterns=device_exclude_extras)
        self._mount_point_include = self._compile_valid_patterns(self._mount_point_include)
        self._mount_point_exclude = self._compile_valid_patterns(
            self._mount_point_exclude, extra_patterns=mount_point_exclude_extras
        )

    def _compile_valid_patterns(self, patterns, casing=IGNORE_CASE, extra_patterns=None):
        """Compile *patterns* (str or iterable) into one alternation regex.

        Invalid patterns are logged and skipped; returns None (implicitly)
        when no valid pattern remains.
        """
        valid_patterns = []

        if isinstance(patterns, str):
            patterns = [patterns]
        else:
            patterns = list(patterns)

        if extra_patterns:
            for extra_pattern in extra_patterns:
                if extra_pattern not in patterns:
                    patterns.append(extra_pattern)

        for pattern in patterns:
            # Ignore empty patterns as they match everything
            if not pattern:
                continue

            try:
                re.compile(pattern, casing)
            except Exception:
                self.log.warning('%s is not a valid regular expression and will be ignored', pattern)
            else:
                valid_patterns.append(pattern)

        if valid_patterns:
            return re.compile('|'.join(valid_patterns), casing)

    def _compile_tag_re(self):
        """
        Compile regex strings from device_tag_re option and return list of compiled regex/tag pairs
        """
        device_tag_list = []
        for regex_str, tags in self._device_tag_re.items():
            try:
                device_tag_list.append([re.compile(regex_str, IGNORE_CASE), [t.strip() for t in tags.split(',')]])
            except TypeError:
                self.log.warning('%s is not a valid regular expression and will be ignored', regex_str)
        self._device_tag_re = device_tag_list

    def _get_devices_label(self):
        """
        Get every label to create tags and returns a map of device name to label:value
        """
        if self._use_lsblk:
            return self._get_devices_label_from_lsblk()
        elif not self._blkid_cache_file:
            return self._get_devices_label_from_blkid()
        return self._get_devices_label_from_blkid_cache()

    def _get_devices_label_from_lsblk(self):
        """
        Get device labels using the `lsblk` command. Returns a map of device name to label:value
        """
        devices_labels = {}
        try:
            # Use raw output mode (space-separated fields encoded in UTF-8).
            # We want to be compatible with lsblk version 2.19 since
            # it is the last version supported by CentOS 6 and SUSE 11.
            lsblk_out, _, _ = get_subprocess_output(["lsblk", "--noheadings", "--raw", "--output=NAME,LABEL"], self.log)

            for line in lsblk_out.splitlines():
                device, _, label = line.partition(' ')
                if label:
                    # Line sample (device "/dev/sda1" with label " MY LABEL")
                    # sda1 MY LABEL
                    devices_labels["/dev/" + device] = ['label:{}'.format(label), 'device_label:{}'.format(label)]

        except SubprocessOutputEmptyError:
            self.log.debug("Couldn't use lsblk to have device labels")

        return devices_labels

    def _get_devices_label_from_blkid(self):
        """Get device labels by running `blkid`. Returns a map of device name to label tags."""
        devices_label = {}
        try:
            blkid_out, _, _ = get_subprocess_output(['blkid'], self.log)
            all_devices = [l.split(':', 1) for l in blkid_out.splitlines()]

            for d in all_devices:
                # Line sample
                # /dev/sda1: LABEL="MYLABEL" UUID="5eea373d-db36-4ce2-8c71-12ce544e8559" TYPE="ext4"
                labels = self._blkid_label_re.findall(d[1])
                if labels:
                    devices_label[d[0]] = ['label:{}'.format(labels[0]), 'device_label:{}'.format(labels[0])]

        except SubprocessOutputEmptyError:
            self.log.debug("Couldn't use blkid to have device labels")

        return devices_label

    def _get_devices_label_from_blkid_cache(self):
        """Parse the configured blkid cache file (XML lines) into a device -> label tags map."""
        devices_label = {}
        try:
            with open(self._blkid_cache_file, 'r') as blkid_cache_file_handler:
                blkid_cache_data = blkid_cache_file_handler.readlines()
        except IOError as e:
            self.log.warning("Couldn't read the blkid cache file %s: %s", self._blkid_cache_file, e)
            return devices_label

        # Line sample
        # <device DEVNO="0x0801" LABEL="MYLABEL" UUID="..." TYPE="ext4">/dev/sda1</device>
        for line in blkid_cache_data:
            try:
                root = ET.fromstring(line)
                device = root.text
                label = root.attrib.get('LABEL')
                if label and device:
                    devices_label[device] = ['label:{}'.format(label), 'device_label:{}'.format(label)]
            except ET.ParseError as e:
                self.log.warning(
                    'Failed to parse line %s because of %s - skipping the line (some labels might be missing)', line, e
                )

        return devices_label

    def _get_device_specific_tags(self, device_name):
        """Return extra tags for *device_name* from the compiled device_tag_re pairs."""
        device_specific_tags = []

        # apply device/mountpoint specific tags
        for regex, device_tags in self._device_tag_re:
            if regex.match(device_name):
                device_specific_tags.extend(device_tags)
        return device_specific_tags

    def _create_manual_mounts(self):
        """
        on Windows, in order to collect statistics on remote (SMB/NFS) drives, the drive must be mounted
        as the agent user in the agent context, otherwise the agent can't 'see' the drive. If so configured,
        attempt to mount desired drives
        """
        if not self._manual_mounts:
            self.log.debug("No manual mounts")
        else:
            self.log.debug("Attempting to create %d mounts: ", len(self._manual_mounts))
            for manual_mount in self._manual_mounts:
                remote_machine = manual_mount.get('host')
                share = manual_mount.get('share')
                uname = manual_mount.get('user')
                pword = manual_mount.get('password')
                mtype = manual_mount.get('type')
                mountpoint = manual_mount.get('mountpoint')
                nr = win32wnet.NETRESOURCE()

                if not remote_machine or not share:
                    self.log.error("Invalid configuration. Drive mount requires remote machine and share point")
                    continue

                if mtype and mtype.lower() == "nfs":
                    nr.lpRemoteName = r"{}:{}".format(remote_machine, share)
                    self.log.debug("Attempting NFS mount: %s", nr.lpRemoteName)
                else:
                    nr.lpRemoteName = r"\\{}\{}".format(remote_machine, share).rstrip('\\')
                    self.log.debug("Attempting SMB mount: %s", nr.lpRemoteName)

                nr.dwType = 0
                nr.lpLocalName = mountpoint

                try:
                    win32wnet.WNetAddConnection2(nr, pword, uname, 0)
                    self.log.debug("Successfully mounted %s as %s", mountpoint, nr.lpRemoteName)
                except Exception as e:
                    self.log.error("Failed to mount %s %s", nr.lpRemoteName, str(e))
                    pass

    @staticmethod
    def get_default_file_system_exclude():
        # Default filesystem types excluded everywhere.
        return [
            # CDROM
            'iso9660$',
            # tracefs
            'tracefs$',
        ]

    @staticmethod
    def get_default_device_exclude():
        # No devices are excluded by default.
        return []

    @staticmethod
    def get_default_mount_mount_exclude():
        # Default mount point exclusions (security-sensitive pseudo mounts).
        return [
            # https://github.com/DataDog/datadog-agent/issues/1961
            # https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-1049
            '(/host)?/proc/sys/fs/binfmt_misc$'
        ]