From 4c0bd7010ad4d17e37ce3e22f5be56e0c402da29 Mon Sep 17 00:00:00 2001 From: Kotresh HR Date: Thu, 21 Mar 2024 08:49:28 +0530 Subject: [PATCH] syslog: Add capability to ignore kernel failures Adds capability to ignore kernel failures to jobs. This is done by adding 'syslog' dict to config dictionary which holds the 'ignorelist' of kernel failures. Also removes old kernel failures from the exclude list. Fixes: https://tracker.ceph.com/issues/50150 Signed-off-by: Kotresh HR --- teuthology/run.py | 2 +- teuthology/task/internal/syslog.py | 57 ++++++------------------------ 2 files changed, 12 insertions(+), 47 deletions(-) diff --git a/teuthology/run.py b/teuthology/run.py index e065495cff..d2926d70c6 100644 --- a/teuthology/run.py +++ b/teuthology/run.py @@ -224,7 +224,7 @@ def get_initial_tasks(lock, config, machine_type): {'internal.archive': None}, {'internal.coredump': None}, {'internal.sudo': None}, - {'internal.syslog': None}, + {'internal.syslog': config.get('syslog', {})}, ]) init_tasks.append({'internal.timer': None}) diff --git a/teuthology/task/internal/syslog.py b/teuthology/task/internal/syslog.py index 64032a8e7c..82c63db00d 100644 --- a/teuthology/task/internal/syslog.py +++ b/teuthology/task/internal/syslog.py @@ -102,56 +102,21 @@ def syslog(ctx, config): # flush the file fully. oh well. 
log.info('Checking logs for errors...') + exclude_errors = config.get('ignorelist', []) + log.info('Exclude error list: {0}'.format(exclude_errors)) for rem in cluster.remotes.keys(): log.debug('Checking %s', rem.name) - stdout = rem.sh( - [ + args = [ 'egrep', '--binary-files=text', - '\\bBUG\\b|\\bINFO\\b|\\bDEADLOCK\\b', + '\\bBUG\\b|\\bINFO\\b|\\bDEADLOCK\\b|\\bOops\\b|\\bWARNING\\b|\\bKASAN\\b', run.Raw(f'{archive_dir}/syslog/kern.log'), - run.Raw('|'), - 'grep', '-v', 'task .* blocked for more than .* seconds', - run.Raw('|'), - 'grep', '-v', 'lockdep is turned off', - run.Raw('|'), - 'grep', '-v', 'trying to register non-static key', - run.Raw('|'), - 'grep', '-v', 'DEBUG: fsize', # xfs_fsr - run.Raw('|'), - 'grep', '-v', 'CRON', # ignore cron noise - run.Raw('|'), - 'grep', '-v', 'BUG: bad unlock balance detected', # #6097 - run.Raw('|'), - 'grep', '-v', 'inconsistent lock state', # FIXME see #2523 - run.Raw('|'), - 'grep', '-v', '*** DEADLOCK ***', # part of lockdep output - run.Raw('|'), - 'grep', '-v', - # FIXME see #2590 and #147 - 'INFO: possible irq lock inversion dependency detected', - run.Raw('|'), - 'grep', '-v', - 'INFO: NMI handler (perf_event_nmi_handler) took too long to run', # noqa - run.Raw('|'), - 'grep', '-v', 'INFO: recovery required on readonly', - run.Raw('|'), - 'grep', '-v', 'ceph-create-keys: INFO', - run.Raw('|'), - 'grep', '-v', 'INFO:ceph-create-keys', - run.Raw('|'), - 'grep', '-v', 'Loaded datasource DataSourceOpenStack', - run.Raw('|'), - 'grep', '-v', 'container-storage-setup: INFO: Volume group backing root filesystem could not be determined', # noqa - run.Raw('|'), - 'egrep', '-v', '\\bsalt-master\\b|\\bsalt-minion\\b|\\bsalt-api\\b', - run.Raw('|'), - 'grep', '-v', 'ceph-crash', - run.Raw('|'), - 'egrep', '-v', '\\btcmu-runner\\b.*\\bINFO\\b', - run.Raw('|'), - 'head', '-n', '1', - ], - ) + ] + for exclude in exclude_errors: + args.extend([run.Raw('|'), 'egrep', '-v', exclude]) + args.extend([ + run.Raw('|'), 'head', '-n', 
'1', + ]) + stdout = rem.sh(args) if stdout != '': log.error('Error in syslog on %s: %s', rem.name, stdout) set_status(ctx.summary, 'fail')