Skip to content

Commit

Permalink
mgr/devicehealth: add helpers to life_expectancy_response()
Browse files Browse the repository at this point in the history
- if mark_out_threshold is met we write to log.warn instead of raising a
  health warning.
- check that OSD is 'in' before calling mark_out().
- raise a health warning in case OSD is marked 'out' but still has PGs
  attached to it.
- cast the thresholds' default values to strings.
- add SCSI multipath support to health warning message.
- change health warning message.

Signed-off-by: Yaarit Hatuka [email protected]
  • Loading branch information
Yaarit Hatuka committed Jun 12, 2018
1 parent 5a44d4f commit 32c80f5
Showing 1 changed file with 98 additions and 0 deletions.
98 changes: 98 additions & 0 deletions src/pybind/mgr/devicehealth/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ class Module(MgrModule):
'scrape_frequency': str(86400),
'retention_period': str(86400*14),
'pool_name': 'device_health_metrics',
'mark_out_threshold': str(86400*14),
'warn_threshold': str(86400*14*2),
'self_heal': True,
}
active = DEFAULTS['active']
scrape_frequency = DEFAULTS['scrape_frequency']
Expand Down Expand Up @@ -251,6 +254,101 @@ def show_device_metrics(self, devid, sample):
pass
return (0, json.dumps(res, indent=4), '')

def life_expectancy_response(self):
    """Act on devices whose predicted life expectancy is running out.

    Every device reported by ``self.get("devices")`` that carries a
    ``life_expectancy_min`` timestamp is compared against two thresholds:

    * within ``mark_out_threshold``: when ``self_heal`` is enabled, the
      device's OSDs that are still 'in' are marked out, and a health
      warning is recorded for every already-'out' OSD that still reports
      PGs.  The device warning itself goes to the log only.
    * otherwise within ``warn_threshold``: a health warning is recorded
      for the device.

    The collected warnings are published under the ``MGR_DEVICE_HEALTH``
    health check; an empty dict is published when there are none, which
    clears the check.

    Returns:
        The tuple ``(0, "", "")``.
    """
    mark_out_threshold_td = timedelta(seconds=int(self.mark_out_threshold))
    warn_threshold_td = timedelta(seconds=int(self.warn_threshold))
    health_warnings = []
    devs = self.get("devices")
    # Evaluate every device against the same instant.
    now = datetime.now()
    for dev in devs['devices']:
        if 'life_expectancy_min' not in dev:
            continue
        # life_expectancy_(min/max) is in the format of:
        # '%Y-%m-%d %H:%M:%S.%f', e.g.:
        # '2019-01-20 21:12:12.000000'
        life_expectancy_min = datetime.strptime(
            dev['life_expectancy_min'], '%Y-%m-%d %H:%M:%S.%f')
        time_left = life_expectancy_min - now
        if time_left <= mark_out_threshold_td:
            if self.self_heal:
                # dev['daemons'] looks like ["osd.0", "osd.1", "osd.2"].
                # The key may be missing, and only 'osd.'-prefixed names
                # carry an OSD id after the prefix, so anything else is
                # ignored rather than being sliced into a bogus id.
                osd_ids = [daemon[4:]
                           for daemon in (dev.get('daemons') or [])
                           if daemon.startswith('osd.')]
                osds_in = []
                osds_out = []
                for _id in osd_ids:
                    if self.is_osd_in(_id):
                        osds_in.append(_id)
                    else:
                        osds_out.append(_id)
                if osds_in:
                    self.mark_out(osds_in)
                # OSD might be marked 'out' (which means it has no
                # data), however PGs are still attached to it.
                for _id in osds_out:
                    num_pgs = self.get_osd_num_pgs(_id)
                    if num_pgs > 0:
                        health_warnings.append('osd.%s is marked out, '
                                               'but still has %s PG(s)'
                                               ' attached' %
                                               (_id, num_pgs))
                # TODO: set_primary_affinity
            # Logger.warn() is a deprecated alias of Logger.warning().
            self.log.warning(self.create_warning_message(dev))
        elif time_left <= warn_threshold_td:
            health_warnings.append(self.create_warning_message(dev))
    if health_warnings:
        self.set_health_checks({
            'MGR_DEVICE_HEALTH': {
                'severity': 'warning',
                'summary': 'Imminent failure anticipated for device(s)',
                'detail': health_warnings
            }
        })
    else:
        self.set_health_checks({})  # clearing health checks
    return (0, "", "")

def is_osd_in(self, osd_id):
    """Report whether the given OSD is flagged 'in' in the OSD map.

    Args:
        osd_id: OSD id as an int or string; ids are compared by their
            string form.

    Returns:
        True when the matching entry's ``in`` field is truthy; False
        when it is falsy or when no entry matches ``osd_id``.
    """
    osdmap = self.get("osd_map")
    assert osdmap is not None
    wanted = str(osd_id)
    for osd in osdmap['osds']:
        if wanted == str(osd['osd']):
            return bool(osd['in'])
    # An OSD that is absent from the map is reported as not 'in', instead
    # of falling off the end and returning None.
    return False

def get_osd_num_pgs(self, osd_id):
    """Look up the PG count reported for one OSD.

    Args:
        osd_id: OSD id as an int or string; ids are compared by their
            string form.

    Returns:
        The ``num_pgs`` value of the first matching entry in the
        ``osd_stats`` report, or -1 when no entry matches.
    """
    report = self.get('osd_stats')
    assert report is not None
    matching_counts = (
        entry['num_pgs']
        for entry in report['osd_stats']
        if str(osd_id) == str(entry['osd'])
    )
    # First match wins; -1 signals "no such OSD in the stats".
    return next(matching_counts, -1)

def create_warning_message(self, dev):
    """Build the one-line warning text for a device.

    Args:
        dev: device dict.  ``devid``, ``location`` (a list of dicts with
            ``host`` and ``dev`` keys) and ``life_expectancy_min`` are
            required; ``daemons`` and ``life_expectancy_max`` are
            optional and fall back to 'none' and 'unknown'.

    Returns:
        A string naming the device, its location(s), the affected OSDs
        and the life expectancy range.
    """
    # A device can appear in more than one location in case of SCSI
    # multipath.  Build a real list: on Python 3 map() returns an
    # iterator, which '%s' would render as '<map object at 0x...>'.
    device_locations = [loc['host'] + ':' + loc['dev']
                        for loc in dev['location']]
    # TODO: by default, dev['life_expectancy_max'] == '0.000000',
    #       so dev.get('life_expectancy_max', 'unknown')
    #       below should be altered.
    return ('%s at %s;'
            ' Affected OSDs: %s;'
            ' Life expectancy: between %s and %s'
            % (dev['devid'],
               device_locations,
               dev.get('daemons', 'none'),
               dev['life_expectancy_min'],
               dev.get('life_expectancy_max', 'unknown')))

def mark_out(self, osd_ids):
    """Send an 'osd out' command for the given OSDs to the monitors.

    The call waits for the command result.  A non-zero return code is
    logged as a warning; nothing is raised and nothing is returned.

    Args:
        osd_ids: OSD ids, passed as the ``ids`` argument of the command.
    """
    self.log.info('Marking out OSDs: %s', osd_ids)
    result = CommandResult('')
    self.send_command(result, 'mon', '', json.dumps({
        'prefix': 'osd out',
        'format': 'json',
        'ids': osd_ids,
    }), '')
    r, outb, outs = result.wait()
    if r != 0:
        # Logger.warn() is a deprecated alias of Logger.warning().
        self.log.warning(
            'Could not mark OSD %s out. r: [%s], outb: [%s], outs: [%s]',
            osd_ids, r, outb, outs)

def extract_smart_features(self, raw):
# FIXME: extract and normalize raw smartctl --json output and
# generate a dict of the fields we care about.
Expand Down

0 comments on commit 32c80f5

Please sign in to comment.