Skip to content

Commit

Permalink
Merge pull request #6 from GreenAlgorithms/5-ignore-specific-exit-codes
Browse files Browse the repository at this point in the history
Add some custom exit codes as successful
  • Loading branch information
Llannelongue authored Jan 27, 2023
2 parents a5e35a6 + 76e487d commit b259e44
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 9 deletions.
8 changes: 7 additions & 1 deletion GreenAlgorithms_global.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,10 +393,16 @@ def main(args, cluster_info, fParams):
help='Only report on jobs launched from the current location.')
parser.add_argument('--userCWD', type=str, help=argparse.SUPPRESS)
parser.add_argument('--filterJobIDs', type=str,
help='Comma seperated list of Job IDs you want to filter on.',
help='Comma separated list of Job IDs you want to filter on.',
default='all')
parser.add_argument('--filterAccount', type=str,
help='Only consider jobs charged under this account')
parser.add_argument('--customSuccessStates', type=str, default='',
help="Comma-separated list of job states. By default, only jobs that exit with status CD or \
COMPLETED are considered succesful (PENDING, RUNNING and REQUEUD are ignored). \
Jobs with states listed here will be considered successful as well (best to list both \
2-letter and full-length codes. Full list of job states: \
https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES")
parser.add_argument('--reportBug', action='store_true', help='In case of a bug, this flag logs jobs informations so that we can fix it. \
Note that this will write out some basic information about your jobs, such as runtime, number of cores and memory usage.')
parser.add_argument('--reportBugHere', action='store_true',
Expand Down
33 changes: 25 additions & 8 deletions GreenAlgorithms_workloadManager.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,18 +183,31 @@ def calc_coreHoursCharged(self, x,
return x.WallclockTimeX * x.NGPUS_ / np.timedelta64(1, 'h')


def clean_State(self, x, customSuccessStates_list):
    '''
    Standardise the job's state, coding it as an integer.

    State codes reference: https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES

    :param x: [str] "State" field from sacct output
    :param customSuccessStates_list: [list of str] extra state codes to also treat as
        successful (e.g. ['TO', 'TIMEOUT']); best to include both the 2-letter and the
        full-length form of each code.
    :return: [int] one of {-2, -1, 0, 1}:
        1  : successful job (CD / COMPLETED)
        -1 : custom success state; deliberately lower than 1 so that when aggregating
             by jobID the whole job keeps this flag (otherwise e.g. a cancelled
             sub-step with StateX=0 could take over)
        0  : any other (unsuccessful) state
        -2 : pending/running/requeued; the lowest value so these jobs are always
             removed, regardless of --customSuccessStates
    '''
    success_codes = ['CD', 'COMPLETED']
    running_codes = ['PD', 'PENDING', 'R', 'RUNNING', 'RQ', 'REQUEUED']

    if x in success_codes:
        codeState = 1
    elif x in customSuccessStates_list:
        # Lower value than 1 so that, when aggregating by jobID, the whole job
        # keeps this flag rather than being overridden by a StateX=0 sub-step.
        codeState = -1
    else:
        codeState = 0

    # Running/queued states override everything: if any sub-process is still
    # running, the job is ignored regardless of --customSuccessStates.
    if x in running_codes:
        codeState = -2

    return codeState

def get_parent_jobID(self, x):
'''
Expand Down Expand Up @@ -322,7 +335,8 @@ def clean_logs_df(self):
self.logs_df['WorkingDir_'] = self.logs_df.WorkDir

### State
self.logs_df['StateX'] = self.logs_df.State.apply(self.clean_State)
customSuccessStates_list = self.args.customSuccessStates.split(',')
self.logs_df['StateX'] = self.logs_df.State.apply(self.clean_State, customSuccessStates_list=customSuccessStates_list)

### Pull jobID
self.logs_df['single_jobID'] = self.logs_df.JobID.apply(lambda x: x.split('.')[0])
Expand Down Expand Up @@ -353,7 +367,10 @@ def clean_logs_df(self):
})

### Remove jobs that are still running or currently queued
self.df_agg = self.df_agg_0.loc[self.df_agg_0.StateX != -1]
self.df_agg = self.df_agg_0.loc[self.df_agg_0.StateX != -2]

### Turn StateX==-2 into 1
self.df_agg.loc[self.df_agg.StateX == -1, 'StateX'] = 1

### Replace UsedMem_=-1 with memory requested (for when MaxRSS=NaN)
self.df_agg['UsedMem2_'] = self.df_agg.apply(self.cleam_UsedMem, axis=1)
Expand Down

0 comments on commit b259e44

Please sign in to comment.