Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 46 additions & 6 deletions bin/check_megacli
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

# Example script using pymegacli which is suitable for invocation by nagios

from __future__ import print_function
import argparse
import os
import sys
Expand Down Expand Up @@ -31,11 +32,39 @@ def main():
action='append',
help='Subsystems to check. If not passed, defaults to all'
)
parser.add_argument(
'--led',
action='store_true',
help='Use libstoragemgmt (megaraid plugin) to toggle the fault led on unhealthy disks'
)
parser.set_defaults(led=False)
args = parser.parse_args()

if os.geteuid() != 0:
parser.error('Must run as root!')

if args.led:
try:
import lsm
try:
import lsm.plugin.megaraid
HAVE_LSM_MEGARAID = True
except:
HAVE_LSM_MEGARAID = False
print(
"--led specified but libstoragemgmt megaraid plugin was not found, install like so on Fedora/CentOS... :\n"
"\tyum install libstoragemgmt-megaraid-plugin",
file=sys.stderr
)
except ImportError:
HAVE_LSM_MEGARAID = False
print(
"--led specified but libstoragemgmt module was not found, install like so on Fedora/CentOS... :\n"
"\tyum install libstoragemgmt libstoragemgmt-megaraid-plugin",
file=sys.stderr
)


checks = {}
for check in CHECKS:
checks[check] = (not args.check or check in args.check)
Expand All @@ -52,12 +81,14 @@ def main():
def check_component(component):
    """Record the health of ``component`` in ``messages`` and return its status.

    A healthy component adds one "<identifier> is healthy" entry to
    ``messages[OK]`` and yields ``OK``. An unhealthy component adds one
    "<identifier> <message>" entry to ``messages[CRITICAL]`` for every item
    in ``component.health_messages`` and yields ``CRITICAL``.
    """
    if not component.healthy:
        # CRITICAL is returned even if health_messages turns out to be empty.
        for problem in component.health_messages:
            entry = '%s %s' % (component.identifier, problem)
            messages[CRITICAL].append(entry)
        return CRITICAL

    messages[OK].append('%s is healthy' % component.identifier)
    return OK

controllers = list(connection.controllers)
if not controllers:
Expand All @@ -66,10 +97,19 @@ def main():
for controller in controllers:
if checks['PD']:
for disk in controller.PDs:
check_component(disk)
disk_health = check_component(disk)
if args.led and HAVE_LSM_MEGARAID:
try:
if CRITICAL == disk_health:
lsm.LocalDisk.fault_led_on(disk.devnode)
else:
lsm.LocalDisk.fault_led_off(disk.devnode)
except:
print('Couldn\'t manipulate enclosure LED', file=sys.stderr)

if checks['LD']:
for logical_device in controller.LDs:
check_component(disk)
check_component(logical_device)
if 'WriteBack' not in logical_device['Current Cache Policy']:
messages[WARNING].append('%s has cache policy %s, which does not include WriteBack' % (
logical_device.identifier,
Expand All @@ -82,16 +122,16 @@ def main():
messages[WARNING].append('%s is in learn cycle' % bbu.identifier)

if messages[CRITICAL]:
print 'CRITICAL: %s' % '; '.join(messages[CRITICAL])
print('CRITICAL: %s' % '; '.join(messages[CRITICAL]))
return CRITICAL
elif messages[WARNING]:
print 'WARNING: %s' % '; '.join(messages[WARNING])
print('WARNING: %s' % '; '.join(messages[WARNING]))
return WARNING
elif messages[UNKNOWN]:
print 'UNKNOWN: %s' % '; '.join(messages[UNKNOWN])
print('UNKNOWN: %s' % '; '.join(messages[UNKNOWN]))
return UNKNOWN
else:
print 'OK: %s' % '; '.join(messages[OK])
print('OK: %s' % '; '.join(messages[OK]))
return OK


Expand Down
24 changes: 21 additions & 3 deletions pymegacli/components.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import pipes
import subprocess
import re
import os
import glob

from .parser import BlockParser
from .parser import bail_on
Expand Down Expand Up @@ -131,20 +133,29 @@ class Disk(Component):
'Predictive Failure Count',
)
ERROR_BOOL_KEYS = ('Drive has flagged a S.M.A.R.T alert', )
REQUIRED_FIELDS = ('Enclosure Device ID', 'Slot Number')
REQUIRED_FIELDS = ('Enclosure Device ID', 'Slot Number', 'WWN')

PARSER = BlockParser(rules=[
once_per_block(colon_field('Enclosure Device ID', int_or_na)),
rule(colon_field('Slot Number', int)),
rule(colon_field('WWN', str)),
rule(colon_field('Other Error Count', int)),
rule(colon_field('Predictive Failure Count', int)),
rule(colon_field('Media Error Count', int)),
rule(colon_field('Drive has flagged a S.M.A.R.T alert', yesnobool)),
], default_constructor=colon_field(None, str))

def __init__(self, enclosure_id, slot_number, parent, props=None):
def __init__(self, enclosure_id, slot_number, wwn, parent, props=None):
self.enclosure_id = enclosure_id
self.slot_number = slot_number
self.wwn = wwn.lower()
disk_by_id_path_glob = "/dev/disk/by-id/wwn-0x%s?" % self.wwn[:-1]
disk_by_id_path_glob_results = glob.glob(disk_by_id_path_glob)
if len(disk_by_id_path_glob_results) == 1:
self.linux_disk_by_id = disk_by_id_path_glob_results[0]
else:
self.linux_disk_by_id = None

self.thresholds = dict(
(k, 0)
for k
Expand All @@ -157,7 +168,14 @@ def set_threshold(self, key, value):

@property
def identifier(self):
    """Return a human-readable label for this physical drive.

    The label contains the enclosure ID, the slot number, the drive's WWN
    and the resolved device node. ``self.devnode`` may be ``None``, in
    which case it is rendered as the text ``None``.
    """
    # Single return only: a preceding short-form return used to make this
    # WWN/device-node form unreachable.
    return 'PhysDrv [%d:%d] WWN: %s Dev: %s' % (
        self.enclosure_id,
        self.slot_number,
        self.wwn,
        self.devnode,
    )

@property
def devnode(self):
    """Return the resolved path behind ``self.linux_disk_by_id``.

    Returns ``None`` when ``self.linux_disk_by_id`` is unset or empty;
    otherwise returns ``os.path.realpath`` of that path.
    """
    by_id_link = self.linux_disk_by_id
    if not by_id_link:
        return None
    return os.path.realpath(by_id_link)

@property
def health_status(self):
Expand Down