Skip to content

Commit

Permalink
Merge pull request flux-framework#5656 from chu11/issue4186_flux_jobs…
Browse files Browse the repository at this point in the history
…_filter_nodes

job-list: support "hostlist" constraint to allow jobs to be filtered by nodes
  • Loading branch information
mergify[bot] authored and trws committed Jun 14, 2024
2 parents 67fc412 + d381bf3 commit a22fc0d
Show file tree
Hide file tree
Showing 9 changed files with 648 additions and 24 deletions.
1 change: 1 addition & 0 deletions src/cmd/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ fluxcmd_ldadd = \
LDADD = $(fluxcmd_ldadd)

EXTRA_DIST = \
flux-python \
builtin-cmds-list.sh
CLEANFILES = \
builtin-cmds.c \
Expand Down
1 change: 1 addition & 0 deletions src/modules/job-list/job_data.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ void job_destroy (void *data)
int save_errno = errno;
free (job->ranks);
free (job->nodelist);
hostlist_destroy (job->nodelist_hl);
json_decref (job->annotations);
grudgeset_destroy (job->dependencies);
json_decref (job->jobspec);
Expand Down
2 changes: 2 additions & 0 deletions src/modules/job-list/job_data.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include <flux/core.h>
#include <jansson.h>

#include "src/common/libhostlist/hostlist.h"
#include "src/common/libutil/grudgeset.h"
#include "src/common/libczmqcontainers/czmq_containers.h"

Expand Down Expand Up @@ -54,6 +55,7 @@ struct job {
int nnodes;
char *ranks;
char *nodelist;
struct hostlist *nodelist_hl; /* cache of nodelist in hl form */
double expiration;
int wait_status;
bool success;
Expand Down
113 changes: 113 additions & 0 deletions src/modules/job-list/match.c
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ typedef enum {
MATCH_LESS_THAN = 4,
} match_comparison_t;

/* Floor for match_ctx max_hostlist: match_ctx_create() raises
 * max_hostlist to this value when the instance size is smaller, so a
 * "hostlist" constraint may always name at least this many hosts.
 */
#define MIN_MATCH_HOSTLIST 1024

struct timestamp_value {
double t_value;
match_timestamp_type_t t_type;
Expand Down Expand Up @@ -428,6 +430,91 @@ static struct list_constraint *create_results_constraint (struct match_ctx *mctx
errp);
}

/* Match callback for a "hostlist" constraint.
 *
 * Returns 1 if any host in the constraint's hostlist is found in the
 * job's nodelist, 0 if none is found (including when the job has no
 * nodelist or the nodelist cannot be decoded), and -1 if
 * inc_check_comparison() reports an error.
 */
static int match_hostlist (struct list_constraint *c,
                           const struct job *job,
                           unsigned int *comparisons,
                           flux_error_t *errp)
{
    struct hostlist *wanted = zlistx_first (c->values);
    const char *name;

    /* a job that never ran may have no nodelist at all */
    if (!job->nodelist)
        return 0;

    /* Decode the nodelist on first use and cache the result on the job.
     * The cast drops const so that the cache field can be written.
     */
    if (!job->nodelist_hl) {
        struct job *writable = (struct job *)job;
        writable->nodelist_hl = hostlist_decode (job->nodelist);
        if (!writable->nodelist_hl)
            return 0;
    }

    for (name = hostlist_first (wanted);
         name != NULL;
         name = hostlist_next (wanted)) {
        /* account for each host lookup before performing it */
        if (inc_check_comparison (c->mctx, comparisons, errp) < 0)
            return -1;
        if (hostlist_find (job->nodelist_hl, name) >= 0)
            return 1;
    }
    return 0;
}

/* Item destructor in the form used with zlistx_set_destructor():
 * destroys the hostlist that *item points to and clears the slot.
 */
static void wrap_hostlist_destroy (void **item)
{
    if (!item)
        return;
    hostlist_destroy (*item);
    *item = NULL;
}

/* Build a "hostlist" constraint from a JSON array of host strings.
 *
 * Each entry of 'values' must be a JSON string accepted by
 * hostlist_append().  All entries are merged into one hostlist, which is
 * then appended to c->values.  The constraint is rejected when the merged
 * hostlist holds more than mctx->max_hostlist hosts.
 *
 * Returns the new constraint on success.  On failure returns NULL; an
 * error message is written to 'errp' by this function (or is left to
 * list_constraint_new() when the constraint itself cannot be created).
 */
static struct list_constraint *create_hostlist_constraint (
    struct match_ctx *mctx,
    json_t *values,
    flux_error_t *errp)
{
    struct list_constraint *c;
    struct hostlist *hl = NULL;
    json_t *entry;
    size_t index;

    if (!(c = list_constraint_new (mctx,
                                   match_hostlist,
                                   wrap_hostlist_destroy,
                                   errp)))
        return NULL;
    /* Create a single hostlist if user specifies multiple nodes or
     * RFC29 hostlist range */
    if (!(hl = hostlist_create ())) {
        errprintf (errp, "failed to create hostlist structure");
        goto error;
    }
    json_array_foreach (values, index, entry) {
        if (!json_is_string (entry)) {
            errprintf (errp, "host value must be a string");
            goto error;
        }
        if (hostlist_append (hl, json_string_value (entry)) <= 0) {
            errprintf (errp, "host value not in valid Hostlist format");
            goto error;
        }
    }
    if (hostlist_count (hl) > mctx->max_hostlist) {
        errprintf (errp, "too many hosts specified");
        goto error;
    }
    if (!zlistx_add_end (c->values, hl)) {
        /* hl was not added to the list; the error path below is the
         * single place that releases it (destroying it here as well
         * would free it twice).
         */
        errprintf (errp, "failed to append hostlist structure");
        goto error;
    }
    return c;
error:
    /* Every jump to this label happens before hl has been handed to
     * c->values, so hl is destroyed exactly once here.
     */
    hostlist_destroy (hl);
    list_constraint_destroy (c);
    return NULL;
}

static int match_timestamp (struct list_constraint *c,
const struct job *job,
unsigned int *comparisons,
Expand Down Expand Up @@ -665,6 +752,8 @@ struct list_constraint *list_constraint_create (struct match_ctx *mctx,
return create_states_constraint (mctx, values, errp);
else if (streq (op, "results"))
return create_results_constraint (mctx, values, errp);
else if (streq (op, "hostlist"))
return create_hostlist_constraint (mctx, values, errp);
else if (streq (op, "t_submit")
|| streq (op, "t_depend")
|| streq (op, "t_run")
Expand Down Expand Up @@ -743,6 +832,30 @@ struct match_ctx *match_ctx_create (flux_t *h)
goto error;
}

if (flux_get_size (mctx->h, &mctx->max_hostlist) < 0) {
flux_log_error (h, "failed to get instance size");
goto error;
}

/* Notes:
*
* We do not want a hostlist constraint match to DoS this module.
* So we want to configure a "max" amount of hosts that can exist
* within a hostlist constraint.
*
* Under normal operating conditions, the number of brokers should
* represent the most likely maximum. But there are some corner
* cases. For example, the instance gets reconfigured to be
* smaller, which is not an uncommon thing to do towards a
* cluster's end of life, when hardware is beginning to die.
*
* So we configure the following compromise. If the number of
* brokers is below our defined minimum MIN_MATCH_HOSTLIST, we'll
* allow max_hostlist to be increased to this number.
*/
if (mctx->max_hostlist < MIN_MATCH_HOSTLIST)
mctx->max_hostlist = MIN_MATCH_HOSTLIST;

return mctx;

error:
Expand Down
1 change: 1 addition & 0 deletions src/modules/job-list/match.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
/* Context shared by the job-list constraint matching code; created by
 * match_ctx_create().
 */
struct match_ctx {
/* flux handle passed to match_ctx_create() */
flux_t *h;
/* NOTE(review): presumably the cap on comparisons enforced through
 * inc_check_comparison() -- its use is not visible here, confirm */
uint64_t max_comparisons;
/* maximum number of hosts accepted in a "hostlist" constraint; set from
 * the instance size and raised to MIN_MATCH_HOSTLIST when smaller */
uint32_t max_hostlist;
};

struct match_ctx *match_ctx_create (flux_t *h);
Expand Down
3 changes: 2 additions & 1 deletion src/modules/job-list/state_match.c
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,8 @@ struct state_constraint *state_constraint_create (json_t *constraint, flux_error
}
if (streq (op, "userid")
|| streq (op, "name")
|| streq (op, "queue"))
|| streq (op, "queue")
|| streq (op, "hostlist"))
return state_constraint_new (match_maybe, NULL, errp);
else if (streq (op, "results"))
return state_constraint_new (match_result, NULL, errp);
Expand Down
Loading

0 comments on commit a22fc0d

Please sign in to comment.