From a6bf661be37b1654934ac52acc61d53fcec61edc Mon Sep 17 00:00:00 2001 From: Alexander Krutikov Date: Mon, 18 Jan 2016 19:51:09 +0300 Subject: [PATCH 1/6] added support of RAID to smart-stats.py. Closes OpenTSDB/tcollector#232 --- collectors/0/smart-stats.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/collectors/0/smart-stats.py b/collectors/0/smart-stats.py index 747b9e64..a7ebcbf3 100755 --- a/collectors/0/smart-stats.py +++ b/collectors/0/smart-stats.py @@ -134,7 +134,7 @@ def is_3ware_driver_broken(drives): for i in reversed(xrange(len(drives))): drive = drives[i] signal.alarm(COMMAND_TIMEOUT) - smart_ctl = subprocess.Popen(SMART_CTL + " -i /dev/" + drive, + smart_ctl = subprocess.Popen(SMART_CTL + " -i " + drive.split("#")[0], shell=True, stdout=subprocess.PIPE) smart_output = smart_ctl.communicate()[0] if "supports SMART and is Disabled" in smart_output: @@ -191,7 +191,11 @@ def main(): """main loop for SMART collector""" # Get the list of block devices. 
- drives = [dev[5:] for dev in glob.glob("/dev/[hs]d[a-z]")] + scan_open = subprocess.Popen(SMART_CTL + " --scan-open", + shell=True, stdout=subprocess.PIPE) + drives = scan_open.communicate()[0] + drives = drives.split('\n') + drives = [drive for drive in drives if drive] # Exit gracefully if no block devices found if not drives: sys.exit(13) @@ -199,14 +203,13 @@ def main(): # to make sure we are done with smartctl in COMMAND_TIMEOUT seconds signal.signal(signal.SIGALRM, alarm_handler) - if smart_is_broken(drives): sys.exit(13) while True: for drive in drives: signal.alarm(COMMAND_TIMEOUT) - smart_ctl = subprocess.Popen(SMART_CTL + " -i -A /dev/" + drive, + smart_ctl = subprocess.Popen(SMART_CTL + " -i -A " + drive.split('#')[0], shell=True, stdout=subprocess.PIPE) smart_output = smart_ctl.communicate()[0] signal.alarm(0) @@ -215,7 +218,11 @@ def main(): sys.exit(13) else: print >>sys.stderr, "Command exited with: %d" % smart_ctl.returncode - process_output(drive, smart_output) + drive_s = drive.split() + if "megaraid" in drive_s[2]: + process_output(drive_s[5][1:len(drive_s[5])-1], smart_output) + else: + process_output(drive_s[0][5:], smart_output) sys.stdout.flush() time.sleep(SLEEP_BETWEEN_POLLS) From e2bed26e3d5bec79cd4ae1a3c0663478d46c0b66 Mon Sep 17 00:00:00 2001 From: Alexander Krutikov Date: Mon, 11 Apr 2016 16:35:30 +0300 Subject: [PATCH 2/6] Normalized values were added to monitoring. By setting "NUMERIC" variable to 1 or 0 smart values can be represented by numbers or real names. 
--- collectors/0/smart-stats.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/collectors/0/smart-stats.py b/collectors/0/smart-stats.py index a7ebcbf3..e94f0555 100755 --- a/collectors/0/smart-stats.py +++ b/collectors/0/smart-stats.py @@ -29,8 +29,9 @@ BROKEN_DRIVER_VERSIONS = ("1.1-5",) SMART_CTL = "smartctl" -SLEEP_BETWEEN_POLLS = 60 -COMMAND_TIMEOUT = 10 +NUMERIC = 1 +SLEEP_BETWEEN_POLLS = 300 +COMMAND_TIMEOUT = 60 # Common smart attributes, add more to this list if you start seeing # numbers instead of attribute names in TSD results. @@ -163,10 +164,19 @@ def process_output(drive, smart_output): if len(fields) < 2: continue field = fields[0] - if len(fields) > 2 and field in ATTRIBUTE_MAP: - metric = ATTRIBUTE_MAP[field] + if len(fields) > 2: + metric_raw = "_raw" + metric_normalized = "_normalized" + if NUMERIC == 0 and field in ATTRIBUTE_MAP: + metric_raw = ATTRIBUTE_MAP[field] + metric_raw + metric_normalized = ATTRIBUTE_MAP[field] + metric_normalized + else: + metric_raw = "smart_" + field + metric_raw + metric_normalized = "smart_" + field + metric_normalized value = fields[9].split()[0] - print ("smart.%s %d %s disk=%s" % (metric, ts, value, drive)) + print ("smart.%s %d %s disk=%s" % (metric_raw, ts, value, drive)) + value = fields[4] + print ("smart.%s %d %s disk=%s" % (metric_normalized, ts, value, drive)) if is_seagate and metric in ("seek_error_rate", "raw_read_error_rate"): # It appears that some Seagate drives (and possibly some Western # Digital ones too) use the first 16 bits to store error counts, From 402909f83467249be22e4531ba3729b4ced6a887 Mon Sep 17 00:00:00 2001 From: Alexander Krutikov Date: Mon, 11 Apr 2016 17:08:54 +0300 Subject: [PATCH 3/6] Little fix. 
--- collectors/0/smart_stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collectors/0/smart_stats.py b/collectors/0/smart_stats.py index ffb88041..e9054923 100755 --- a/collectors/0/smart_stats.py +++ b/collectors/0/smart_stats.py @@ -233,7 +233,7 @@ def main(): while True: for drive in drives: signal.alarm(COMMAND_TIMEOUT) - smart_ctl = subprocess.Popen(SMART_CTL + " -i -A /dev/" + drive, + smart_ctl = subprocess.Popen(SMART_CTL + " -i -A " + drive.split('#')[0], shell=True, stdout=subprocess.PIPE) smart_output = smart_ctl.communicate()[0] signal.alarm(0) From 63e1e2cd6b4b2b36f5396c9ffb6b0589657937c3 Mon Sep 17 00:00:00 2001 From: Alexander Krutikov Date: Mon, 11 Apr 2016 17:53:56 +0300 Subject: [PATCH 4/6] Use both approaches together (smartctl --scan-open and /dev globbing) to find working disk drives. --- collectors/0/smart_stats.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/collectors/0/smart_stats.py b/collectors/0/smart_stats.py index e9054923..9cc0d935 100755 --- a/collectors/0/smart_stats.py +++ b/collectors/0/smart_stats.py @@ -218,7 +218,12 @@ def main(): shell=True, stdout=subprocess.PIPE) drives = scan_open.communicate()[0] drives = drives.split('\n') - drives = [drive for drive in drives if drive] + drives = [dev for dev in drives if dev] + drives2 = [dev for dev in glob.glob("/dev/[hs]d[a-z]") if not any(dev in drive for drive in drives)] + if drives2: + drives.extend(drives2) + if not drives: + drives = [dev for dev in glob.glob("/dev/da[0-9]")+glob.glob("/dev/da[0-9][0-9]")] # Exit gracefully if no block devices found if not drives: sys.exit(13) @@ -243,7 +248,7 @@ def main(): else: print >>sys.stderr, "Command exited with: %d" % smart_ctl.returncode drive_s = drive.split() - if "megaraid" in drive_s[2]: + if len(drive_s) > 2 and "megaraid" in drive_s[2]: process_output(drive_s[5][1:len(drive_s[5])-1], smart_output) else: process_output(drive_s[0][5:], smart_output) From 
91ebd139eb12bb9b9b95fa79bcf83c1bba557f53 Mon Sep 17 00:00:00 2001 From: Alexander Krutikov Date: Mon, 11 Apr 2016 18:30:44 +0300 Subject: [PATCH 5/6] Build fix. --- collectors/0/smart_stats.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/collectors/0/smart_stats.py b/collectors/0/smart_stats.py index 9cc0d935..47fd0f9f 100755 --- a/collectors/0/smart_stats.py +++ b/collectors/0/smart_stats.py @@ -29,7 +29,7 @@ except ImportError: smart_stats_conf = None -DEFAULT_COLLECTION_INTERVAL=120 +DEFAULT_COLLECTION_INTERVAL=300 TWCLI = "/usr/sbin/tw_cli" ARCCONF = "/usr/local/bin/arcconf" @@ -185,17 +185,17 @@ def process_output(drive, smart_output): print ("smart.%s %d %s disk=%s" % (metric_raw, ts, value, drive)) value = fields[4] print ("smart.%s %d %s disk=%s" % (metric_normalized, ts, value, drive)) - if is_seagate and metric in ("seek_error_rate", "raw_read_error_rate"): + if is_seagate and metric_raw in ("seek_error_rate", "raw_read_error_rate"): # It appears that some Seagate drives (and possibly some Western # Digital ones too) use the first 16 bits to store error counts, # and the low 32 bits to store operation counts, out of these 48 # bit values. So try to be helpful and extract these here. value = int(value) print ("smart.%s %d %d disk=%s" - % (metric.replace("error_rate", "count"), ts, + % (metric_raw.replace("error_rate", "count"), ts, value & 0xFFFFFFFF, drive)) print ("smart.%s %d %d disk=%s" - % (metric.replace("error_rate", "errors"), ts, + % (metric_raw.replace("error_rate", "errors"), ts, (value & 0xFFFF00000000) >> 32, drive)) elif line.startswith("ID#"): data_marker = True From 5b89e3ff5472209462ef14eac4a0d45e51f75fc5 Mon Sep 17 00:00:00 2001 From: Alexander Krutikov Date: Wed, 13 Apr 2016 09:55:57 +0300 Subject: [PATCH 6/6] Ignores duplicates by checking serial number, uses glob.glob("/dev/[hs]d[a-z]{1,2}") first to find disks. 
--- collectors/0/smart_stats.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/collectors/0/smart_stats.py b/collectors/0/smart_stats.py index 47fd0f9f..d74f419f 100755 --- a/collectors/0/smart_stats.py +++ b/collectors/0/smart_stats.py @@ -17,6 +17,7 @@ import glob import os +import re import signal import subprocess import sys @@ -214,12 +215,13 @@ def main(): collection_interval=config['collection_interval'] # Get the list of block devices. + drives = [dev for dev in glob.glob("/dev/[hs]d[a-z]{1,2}")] scan_open = subprocess.Popen(SMART_CTL + " --scan-open", shell=True, stdout=subprocess.PIPE) - drives = scan_open.communicate()[0] - drives = drives.split('\n') - drives = [dev for dev in drives if dev] - drives2 = [dev for dev in glob.glob("/dev/[hs]d[a-z]") if not any(dev in drive for drive in drives)] + drives2 = scan_open.communicate()[0] + drives2 = drives2.split('\n') + drives2 = [dev for dev in drives2 if dev] + drives2 = [dev for dev in drives2 if not any(dev in drive for drive in drives)] if drives2: drives.extend(drives2) if not drives: @@ -231,16 +233,22 @@ def main(): # to make sure we are done with smartctl in COMMAND_TIMEOUT seconds signal.signal(signal.SIGALRM, alarm_handler) + + reg = re.compile("Serial Number:\s+[A-Z0-9-]+", re.M) if smart_is_broken(drives): sys.exit(13) - + serials = [] while True: for drive in drives: signal.alarm(COMMAND_TIMEOUT) smart_ctl = subprocess.Popen(SMART_CTL + " -i -A " + drive.split('#')[0], shell=True, stdout=subprocess.PIPE) smart_output = smart_ctl.communicate()[0] + search = reg.search(smart_output) + serial = "" + if search: + serial = search.group()[18:] signal.alarm(0) if smart_ctl.returncode != 0: if smart_ctl.returncode == 127: @@ -248,11 +256,12 @@ def main(): else: print >>sys.stderr, "Command exited with: %d" % smart_ctl.returncode drive_s = drive.split() - if len(drive_s) > 2 and "megaraid" in drive_s[2]: - process_output(drive_s[5][1:len(drive_s[5])-1], 
smart_output) - else: - process_output(drive_s[0][5:], smart_output) - + if serial and serial not in serials: + serials.append(serial) + if len(drive_s) > 2 and "megaraid" in drive_s[2]: + process_output(drive_s[5][1:len(drive_s[5])-1], smart_output) + else: + process_output(drive_s[0][5:], smart_output) sys.stdout.flush() time.sleep(collection_interval)