Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

job-list: support "hostlist" constraint to allow jobs to be filtered by nodes #5656

Merged
merged 6 commits into from
Jun 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/modules/job-list/job_data.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ void job_destroy (void *data)
int save_errno = errno;
free (job->ranks);
free (job->nodelist);
hostlist_destroy (job->nodelist_hl);
json_decref (job->annotations);
grudgeset_destroy (job->dependencies);
json_decref (job->jobspec);
Expand Down
2 changes: 2 additions & 0 deletions src/modules/job-list/job_data.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include <flux/core.h>
#include <jansson.h>

#include "src/common/libhostlist/hostlist.h"
#include "src/common/libutil/grudgeset.h"
#include "src/common/libczmqcontainers/czmq_containers.h"

Expand Down Expand Up @@ -54,6 +55,7 @@ struct job {
int nnodes;
char *ranks;
char *nodelist;
struct hostlist *nodelist_hl; /* cache of nodelist in hl form */
double expiration;
int wait_status;
bool success;
Expand Down
113 changes: 113 additions & 0 deletions src/modules/job-list/match.c
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@
MATCH_LESS_THAN = 4,
} match_comparison_t;

#define MIN_MATCH_HOSTLIST 1024

struct timestamp_value {
double t_value;
match_timestamp_type_t t_type;
Expand Down Expand Up @@ -428,6 +430,91 @@
errp);
}

static int match_hostlist (struct list_constraint *c,
const struct job *job,
unsigned int *comparisons,
flux_error_t *errp)
{
struct hostlist *hl = zlistx_first (c->values);
const char *host;

/* nodelist may not exist if job never ran */
if (!job->nodelist)
return 0;
if (!job->nodelist_hl) {
/* hack to remove const */
struct job *jobtmp = (struct job *)job;
if (!(jobtmp->nodelist_hl = hostlist_decode (job->nodelist)))
return 0;
}
host = hostlist_first (hl);
while (host) {
if (inc_check_comparison (c->mctx, comparisons, errp) < 0)
return -1;

Check warning on line 453 in src/modules/job-list/match.c

View check run for this annotation

Codecov / codecov/patch

src/modules/job-list/match.c#L453

Added line #L453 was not covered by tests
if (hostlist_find (job->nodelist_hl, host) >= 0)
return 1;
host = hostlist_next (hl);
}
return 0;
}

/* zlistx_set_destructor */
static void wrap_hostlist_destroy (void **item)
{
if (item) {
struct hostlist *hl = *item;
hostlist_destroy (hl);
(*item) = NULL;
}
}

static struct list_constraint *create_hostlist_constraint (
struct match_ctx *mctx,
json_t *values,
flux_error_t *errp)
{
struct list_constraint *c;
struct hostlist *hl = NULL;
json_t *entry;
size_t index;

if (!(c = list_constraint_new (mctx,
match_hostlist,
wrap_hostlist_destroy,
errp)))
return NULL;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

errp->text not set here.

Copy link
Member Author

@chu11 chu11 Apr 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think this is ok, errp is set in the call to list_constraint_new()

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh yeah, but looks like it is missing in the EINVAL path.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, nevermind, I think I was looking at the wrong function in the diff.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nevermind, I think I was looking at the wrong function in the diff.

/* Create a single hostlist if user specifies multiple nodes or
* RFC29 hostlist range */
if (!(hl = hostlist_create ())) {
errprintf (errp, "failed to create hostlist structure");
goto error;

Check warning on line 490 in src/modules/job-list/match.c

View check run for this annotation

Codecov / codecov/patch

src/modules/job-list/match.c#L489-L490

Added lines #L489 - L490 were not covered by tests
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

errp->text not set here.

}
json_array_foreach (values, index, entry) {
if (!json_is_string (entry)) {
errprintf (errp, "host value must be a string");
goto error;

Check warning on line 495 in src/modules/job-list/match.c

View check run for this annotation

Codecov / codecov/patch

src/modules/job-list/match.c#L494-L495

Added lines #L494 - L495 were not covered by tests
}
if (hostlist_append (hl, json_string_value (entry)) <= 0) {
errprintf (errp, "host value not in valid Hostlist format");
goto error;

Check warning on line 499 in src/modules/job-list/match.c

View check run for this annotation

Codecov / codecov/patch

src/modules/job-list/match.c#L498-L499

Added lines #L498 - L499 were not covered by tests
}
}
if (hostlist_count (hl) > mctx->max_hostlist) {
errprintf (errp, "too many hosts specified");
goto error;
}
if (!zlistx_add_end (c->values, hl)) {
errprintf (errp, "failed to append hostlist structure");
hostlist_destroy (hl);
goto error;

Check warning on line 509 in src/modules/job-list/match.c

View check run for this annotation

Codecov / codecov/patch

src/modules/job-list/match.c#L507-L509

Added lines #L507 - L509 were not covered by tests
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

errp->text not set here.

}
return c;
error:
hostlist_destroy (hl);
list_constraint_destroy (c);
return NULL;
}

static int match_timestamp (struct list_constraint *c,
const struct job *job,
unsigned int *comparisons,
Expand Down Expand Up @@ -665,6 +752,8 @@
return create_states_constraint (mctx, values, errp);
else if (streq (op, "results"))
return create_results_constraint (mctx, values, errp);
else if (streq (op, "hostlist"))
return create_hostlist_constraint (mctx, values, errp);
else if (streq (op, "t_submit")
|| streq (op, "t_depend")
|| streq (op, "t_run")
Expand Down Expand Up @@ -743,6 +832,30 @@
goto error;
}

if (flux_get_size (mctx->h, &mctx->max_hostlist) < 0) {
flux_log_error (h, "failed to get instance size");
goto error;

Check warning on line 837 in src/modules/job-list/match.c

View check run for this annotation

Codecov / codecov/patch

src/modules/job-list/match.c#L836-L837

Added lines #L836 - L837 were not covered by tests
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

errp->text not set here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think you mean I forgot a flux_log in this case, but correct!!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, heh. Yeah I must have been blindly checking goto error without errprintf() 🤦

}

/* Notes:
*
* We do not want a hostlist constraint match to DoS this module.
* So we want to configure a "max" amount of hosts that can exist
* within a hostlist constraint.
*
* Under normal operating conditions, the number of brokers should
* represent the most likely maximum. But there are some corner
* cases. For example, the instance gets reconfigured to be
* smaller, which is not an uncommon thing to do towards a
* cluster's end of life and hardware is beginning to die.
*
* So we configure the following compromise. If the number of
* brokers is below our defined minimum MIN_MATCH_HOSTLIST, we'll
* allow max_hostlist to be increased to this number.
*/
if (mctx->max_hostlist < MIN_MATCH_HOSTLIST)
mctx->max_hostlist = MIN_MATCH_HOSTLIST;

return mctx;

error:
Expand Down
1 change: 1 addition & 0 deletions src/modules/job-list/match.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
struct match_ctx {
flux_t *h;
uint64_t max_comparisons;
uint32_t max_hostlist;
};

struct match_ctx *match_ctx_create (flux_t *h);
Expand Down
3 changes: 2 additions & 1 deletion src/modules/job-list/state_match.c
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,8 @@ struct state_constraint *state_constraint_create (json_t *constraint, flux_error
}
if (streq (op, "userid")
|| streq (op, "name")
|| streq (op, "queue"))
|| streq (op, "queue")
|| streq (op, "hostlist"))
return state_constraint_new (match_maybe, NULL, errp);
else if (streq (op, "results"))
return state_constraint_new (match_result, NULL, errp);
Expand Down
Loading
Loading