From e31b66a5af41fb69356076ac3dc7feccb90f4200 Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Tue, 26 Dec 2023 08:40:19 -0800 Subject: [PATCH 1/6] job-list: add nodelist hostlist to job_data Problem: In the near future it'd be convenient to do calculations on the job nodelist, but it often needs to be in a hostlist data structure for processing. We'd like to avoid converting the nodelist to hostlist struct over and over again. Add a field into the job_data struct to hold a cached version of the nodelist in a hostlist struct. --- src/modules/job-list/job_data.c | 1 + src/modules/job-list/job_data.h | 2 ++ 2 files changed, 3 insertions(+) diff --git a/src/modules/job-list/job_data.c b/src/modules/job-list/job_data.c index 818e3b7aa571..a48f7bdca52a 100644 --- a/src/modules/job-list/job_data.c +++ b/src/modules/job-list/job_data.c @@ -35,6 +35,7 @@ void job_destroy (void *data) int save_errno = errno; free (job->ranks); free (job->nodelist); + hostlist_destroy (job->nodelist_hl); json_decref (job->annotations); grudgeset_destroy (job->dependencies); json_decref (job->jobspec); diff --git a/src/modules/job-list/job_data.h b/src/modules/job-list/job_data.h index eb5b525a66f2..512f796ff1e4 100644 --- a/src/modules/job-list/job_data.h +++ b/src/modules/job-list/job_data.h @@ -14,6 +14,7 @@ #include #include +#include "src/common/libhostlist/hostlist.h" #include "src/common/libutil/grudgeset.h" #include "src/common/libczmqcontainers/czmq_containers.h" @@ -54,6 +55,7 @@ struct job { int nnodes; char *ranks; char *nodelist; + struct hostlist *nodelist_hl; /* cache of nodelist in hl form */ double expiration; int wait_status; bool success; From a8baf01f2840295ec6f2c707e55bd2483cb19bff Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Tue, 26 Dec 2023 12:39:43 -0800 Subject: [PATCH 2/6] job-list: support hostlist constraint Problem: It would be convenient to filter jobs based on the nodes they ran on. 
Add a constraint operator "hostlist" to filter on nodes within the job nodelist. Multiple nodes can be specified. Hostlists represented in RFC29 format are acceptable for input to the constraint. Fixes #4186 --- src/modules/job-list/match.c | 113 +++++++++++++++++++++++++++++ src/modules/job-list/match.h | 1 + src/modules/job-list/state_match.c | 3 +- 3 files changed, 116 insertions(+), 1 deletion(-) diff --git a/src/modules/job-list/match.c b/src/modules/job-list/match.c index 500872c3da82..f60764bc6923 100644 --- a/src/modules/job-list/match.c +++ b/src/modules/job-list/match.c @@ -51,6 +51,8 @@ typedef enum { MATCH_LESS_THAN = 4, } match_comparison_t; +#define MIN_MATCH_HOSTLIST 1024 + struct timestamp_value { double t_value; match_timestamp_type_t t_type; @@ -428,6 +430,91 @@ static struct list_constraint *create_results_constraint (struct match_ctx *mctx errp); } +static int match_hostlist (struct list_constraint *c, + const struct job *job, + unsigned int *comparisons, + flux_error_t *errp) +{ + struct hostlist *hl = zlistx_first (c->values); + const char *host; + + /* nodelist may not exist if job never ran */ + if (!job->nodelist) + return 0; + if (!job->nodelist_hl) { + /* hack to remove const */ + struct job *jobtmp = (struct job *)job; + if (!(jobtmp->nodelist_hl = hostlist_decode (job->nodelist))) + return 0; + } + host = hostlist_first (hl); + while (host) { + if (inc_check_comparison (c->mctx, comparisons, errp) < 0) + return -1; + if (hostlist_find (job->nodelist_hl, host) >= 0) + return 1; + host = hostlist_next (hl); + } + return 0; +} + +/* zlistx_set_destructor */ +static void wrap_hostlist_destroy (void **item) +{ + if (item) { + struct hostlist *hl = *item; + hostlist_destroy (hl); + (*item) = NULL; + } +} + +static struct list_constraint *create_hostlist_constraint ( + struct match_ctx *mctx, + json_t *values, + flux_error_t *errp) +{ + struct list_constraint *c; + struct hostlist *hl = NULL; + json_t *entry; + size_t index; + + if (!(c = 
list_constraint_new (mctx,
+                                     match_hostlist,
+                                     wrap_hostlist_destroy,
+                                     errp)))
+        return NULL;
+    /* Create a single hostlist if user specifies multiple nodes or
+     * RFC29 hostlist range */
+    if (!(hl = hostlist_create ())) {
+        errprintf (errp, "failed to create hostlist structure");
+        goto error;
+    }
+    json_array_foreach (values, index, entry) {
+        if (!json_is_string (entry)) {
+            errprintf (errp, "host value must be a string");
+            goto error;
+        }
+        if (hostlist_append (hl, json_string_value (entry)) <= 0) {
+            errprintf (errp, "host value not in valid Hostlist format");
+            goto error;
+        }
+    }
+    if (hostlist_count (hl) > mctx->max_hostlist) {
+        errprintf (errp, "too many hosts specified");
+        goto error;
+    }
+    if (!zlistx_add_end (c->values, hl)) {
+        errprintf (errp, "failed to append hostlist structure");
+        /* N.B. hl is freed at the error label; freeing here too would double free */
+        goto error;
+    }
+    return c;
+ error:
+    hostlist_destroy (hl);
+    list_constraint_destroy (c);
+    return NULL;
+}
+
 static int match_timestamp (struct list_constraint *c,
                             const struct job *job,
                             unsigned int *comparisons,
@@ -665,6 +752,8 @@ struct list_constraint *list_constraint_create (struct match_ctx *mctx,
         return create_states_constraint (mctx, values, errp);
     else if (streq (op, "results"))
         return create_results_constraint (mctx, values, errp);
+    else if (streq (op, "hostlist"))
+        return create_hostlist_constraint (mctx, values, errp);
     else if (streq (op, "t_submit")
              || streq (op, "t_depend")
              || streq (op, "t_run")
@@ -743,6 +832,30 @@ struct match_ctx *match_ctx_create (flux_t *h)
         goto error;
     }
 
+    if (flux_get_size (mctx->h, &mctx->max_hostlist) < 0) {
+        flux_log_error (h, "failed to get instance size");
+        goto error;
+    }
+
+    /* Notes:
+     *
+     * We do not want a hostlist constraint match to DoS this module.
+     * So we want to configure a "max" amount of hosts that can exist
+     * within a hostlist constraint.
+     *
+     * Under normal operating conditions, the number of brokers should
+     * represent the most likely maximum. 
But there are some corner + * cases. For example, the instance gets reconfigured to be + * smaller, which is not an uncommon thing to do towards a + * cluster's end of life and hardware is beginning to die. + * + * So we configure the following compromise. If the number of + * brokers is below our defined minimum MIN_MATCH_HOSTLIST, we'll + * allow max_hostlist to be increased to this number. + */ + if (mctx->max_hostlist < MIN_MATCH_HOSTLIST) + mctx->max_hostlist = MIN_MATCH_HOSTLIST; + return mctx; error: diff --git a/src/modules/job-list/match.h b/src/modules/job-list/match.h index 195f6f8929e2..5589a671fdbc 100644 --- a/src/modules/job-list/match.h +++ b/src/modules/job-list/match.h @@ -23,6 +23,7 @@ struct match_ctx { flux_t *h; uint64_t max_comparisons; + uint32_t max_hostlist; }; struct match_ctx *match_ctx_create (flux_t *h); diff --git a/src/modules/job-list/state_match.c b/src/modules/job-list/state_match.c index ed3b7b922bed..75dc8a851bf2 100644 --- a/src/modules/job-list/state_match.c +++ b/src/modules/job-list/state_match.c @@ -366,7 +366,8 @@ struct state_constraint *state_constraint_create (json_t *constraint, flux_error } if (streq (op, "userid") || streq (op, "name") - || streq (op, "queue")) + || streq (op, "queue") + || streq (op, "hostlist")) return state_constraint_new (match_maybe, NULL, errp); else if (streq (op, "results")) return state_constraint_new (match_result, NULL, errp); From f1de8d0b6e5bedebac19565df41f0ea8f2aaa61d Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Tue, 26 Dec 2023 21:58:35 -0800 Subject: [PATCH 3/6] job-list/test: add hostlist constraint unit tests Problem: There are no unit tests for the new 'hostlist' constraint operator. Add tests in match and state_match tests. 
--- src/modules/job-list/test/match.c | 394 +++++++++++++++++++++++- src/modules/job-list/test/state_match.c | 1 + 2 files changed, 392 insertions(+), 3 deletions(-) diff --git a/src/modules/job-list/test/match.c b/src/modules/job-list/test/match.c index 1badb8a5fffd..6c2f0307fecd 100644 --- a/src/modules/job-list/test/match.c +++ b/src/modules/job-list/test/match.c @@ -22,7 +22,9 @@ /* normally created by job-list "main code" and passed to job_match(). * we create a global one here and initialize it manually. */ -struct match_ctx mctx = { .h = NULL, .max_comparisons = 0 }; +struct match_ctx mctx = { .h = NULL, + .max_hostlist = 1024, + .max_comparisons = 0 }; static void list_constraint_create_corner_case (const char *str, const char *fmt, @@ -100,6 +102,7 @@ static void test_corner_case (void) static struct job *setup_job (uint32_t userid, const char *name, const char *queue, + const char *nodelist, flux_job_state_t state, flux_job_result_t result, double t_submit, @@ -117,6 +120,11 @@ static struct job *setup_job (uint32_t userid, job->name = name; if (queue) job->queue = queue; + if (nodelist) { + /* N.B. 
internally is not const, so strdup it */ + if (!(job->nodelist = strdup (nodelist))) + BAIL_OUT ("failed to strdup nodelist"); + } job->state = state; if (state) { /* Assume all jobs run, we don't skip any states, so add bitmask @@ -160,7 +168,17 @@ static struct list_constraint *create_list_constraint (const char *constraint) static void test_basic_special_cases (void) { - struct job *job = setup_job (0, NULL, NULL, 0, 0, 0.0, 0.0, 0.0, 0.0, 0.0); + struct job *job = setup_job (0, + NULL, + NULL, + NULL, + 0, + 0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0); struct list_constraint *c; flux_error_t error; int rv; @@ -244,6 +262,7 @@ static void test_basic_userid (void) flux_error_t error; int rv; job = setup_job (tests->userid, + NULL, NULL, NULL, 0, @@ -333,6 +352,7 @@ static void test_basic_name (void) job = setup_job (0, tests->name, NULL, + NULL, 0, 0, 0.0, @@ -420,6 +440,7 @@ static void test_basic_queue (void) job = setup_job (0, NULL, tests->queue, + NULL, 0, 0, 0.0, @@ -509,6 +530,7 @@ static void test_basic_states (void) flux_error_t error; int rv; job = setup_job (0, + NULL, NULL, NULL, tests->state, @@ -603,6 +625,7 @@ static void test_basic_results (void) flux_error_t error; int rv; job = setup_job (0, + NULL, NULL, NULL, tests->state, @@ -627,6 +650,133 @@ static void test_basic_results (void) } } +static void test_corner_case_hostlist (void) +{ + struct list_constraint *c; + flux_error_t error; + json_error_t jerror; + json_t *jc = NULL; + + /* hostrange exceeds maximum allowed */ + + if (!(jc = json_loads ("{ \"hostlist\": [ \"foo[1-5000]\" ] }", 0, &jerror))) + BAIL_OUT ("json constraint invalid: %s", jerror.text); + + c = list_constraint_create (&mctx, jc, &error); + ok (c == NULL, + "list_constraint_create fail hostlist with excess hosts: %s", + error.text); + + json_decref (jc); +} + +struct basic_hostlist_test { + const char *nodelist; + bool expected; + bool end; /* nodelist can be NULL */ +}; + +struct basic_hostlist_constraint_test { + const char 
*constraint; + struct basic_hostlist_test tests[9]; +} basic_hostlist_tests[] = { + { + "{ \"hostlist\": [ ] }", + { + /* N.B. nodelist can potentially be NULL */ + { NULL, false, false, }, + { NULL, false, true, }, + }, + }, + { + "{ \"hostlist\": [ \"foo1\" ] }", + { + /* N.B. host can potentially be NULL */ + { NULL, false, false, }, + { "foo1", true, false, }, + { "foo2", false, false, }, + { "foo[1-2]", true, false, }, + { "foo[2-3]", false, false, }, + { NULL, false, true, }, + }, + }, + { + "{ \"hostlist\": [ \"foo[1-2]\" ] }", + { + /* N.B. host can potentially be NULL */ + { NULL, false, false, }, + { "foo1", true, false, }, + { "foo2", true, false, }, + { "foo[1-2]", true, false, }, + { "foo[2-3]", true, false, }, + { "foo[3-4]", false, false, }, + { NULL, false, true, }, + }, + }, + { + "{ \"hostlist\": [ \"foo1\", \"foo2\", \"foo3\" ] }", + { + /* N.B. host can potentially be NULL */ + { NULL, false, false, }, + { "foo1", true, false, }, + { "foo2", true, false, }, + { "foo[1-2]", true, false, }, + { "foo[2-3]", true, false, }, + { "foo[3-4]", true, false, }, + { "foo4", false, false, }, + { "foo[4-5]", false, false, }, + { NULL, false, true, }, + }, + }, + { + NULL, + { + { NULL, false, true, }, + }, + }, +}; + +static void test_basic_hostlist (void) +{ + struct basic_hostlist_constraint_test *ctests = basic_hostlist_tests; + int index = 0; + + while (ctests->constraint) { + struct basic_hostlist_test *tests = ctests->tests; + struct list_constraint *c; + int index2 = 0; + + c = create_list_constraint (ctests->constraint); + while (!tests->end) { + struct job *job; + flux_error_t error; + int rv; + job = setup_job (0, + NULL, + NULL, + tests->nodelist, + 0, + 0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0); + rv = job_match (job, c, &error); + ok (rv == tests->expected, + "basic host job match test #%d/#%d", + index, index2); + job_destroy (job); + index2++; + tests++; + } + + index++; + list_constraint_destroy (c); + ctests++; + } +} + struct 
basic_timestamp_test { flux_job_state_t state; int submit_version; @@ -903,6 +1053,7 @@ static void test_basic_timestamp (void) flux_error_t error; int rv; job = setup_job (0, + NULL, NULL, NULL, tests->state, @@ -1112,6 +1263,7 @@ static void test_basic_conditionals (void) job = setup_job (tests->userid, tests->name, NULL, + NULL, 0, 0, 0.0, @@ -1139,8 +1291,10 @@ struct realworld_test { uint32_t userid; const char *name; const char *queue; + const char *nodelist; flux_job_state_t state; flux_job_result_t result; + double t_run; double t_inactive; int expected; }; @@ -1162,26 +1316,32 @@ struct realworld_constraint_test { 42, "foo", "batch", + NULL, FLUX_JOB_STATE_DEPEND, 0, 0.0, + 0.0, true, }, { 42, "foo", "batch", + NULL, FLUX_JOB_STATE_RUN, 0, 0.0, + 0.0, true, }, { 42, "foo", "batch", + NULL, FLUX_JOB_STATE_INACTIVE, FLUX_JOB_RESULT_COMPLETED, + 0.0, 2000.0, true, }, @@ -1189,8 +1349,10 @@ struct realworld_constraint_test { 43, "foo", "batch", + NULL, FLUX_JOB_STATE_INACTIVE, FLUX_JOB_RESULT_COMPLETED, + 0.0, 2000.0, false, }, @@ -1198,9 +1360,11 @@ struct realworld_constraint_test { 0, NULL, NULL, + NULL, 0, 0, 0.0, + 0.0, false }, }, @@ -1218,8 +1382,10 @@ struct realworld_constraint_test { 42, "foo", "batch", + NULL, FLUX_JOB_STATE_INACTIVE, FLUX_JOB_RESULT_FAILED, + 0.0, 2000.0, true, }, @@ -1227,8 +1393,10 @@ struct realworld_constraint_test { 42, "foo", "batch", + NULL, FLUX_JOB_STATE_INACTIVE, FLUX_JOB_RESULT_CANCELED, + 0.0, 2000.0, true, }, @@ -1236,8 +1404,10 @@ struct realworld_constraint_test { 42, "foo", "batch", + NULL, FLUX_JOB_STATE_INACTIVE, FLUX_JOB_RESULT_TIMEOUT, + 0.0, 2000.0, true, }, @@ -1245,8 +1415,10 @@ struct realworld_constraint_test { 43, "foo", "batch", + NULL, FLUX_JOB_STATE_INACTIVE, FLUX_JOB_RESULT_FAILED, + 0.0, 2000.0, false, }, @@ -1254,27 +1426,33 @@ struct realworld_constraint_test { 42, "foo", "batch", + NULL, FLUX_JOB_STATE_DEPEND, 0, 0.0, + 0.0, false, }, { 42, "foo", "batch", + NULL, FLUX_JOB_STATE_RUN, 0, 0.0, + 0.0, 
false, }, { 0, NULL, NULL, + NULL, 0, 0, 0.0, + 0.0, false }, }, @@ -1293,44 +1471,54 @@ struct realworld_constraint_test { 42, "foo", "batch", + NULL, FLUX_JOB_STATE_DEPEND, 0, 0.0, + 0.0, true, }, { 42, "foo", "debug", + NULL, FLUX_JOB_STATE_DEPEND, 0, 0.0, + 0.0, true, }, { 42, "foo", "debug", + NULL, FLUX_JOB_STATE_RUN, 0, 0.0, + 0.0, true, }, { 43, "foo", "batch", + NULL, FLUX_JOB_STATE_DEPEND, 0, 0.0, + 0.0, false, }, { 42, "foo", "batch", + NULL, FLUX_JOB_STATE_INACTIVE, FLUX_JOB_RESULT_COMPLETED, + 0.0, 2000.0, false, }, @@ -1338,18 +1526,22 @@ struct realworld_constraint_test { 42, "foo", "gpu", + NULL, FLUX_JOB_STATE_DEPEND, 0, 0.0, + 0.0, false, }, { 0, NULL, NULL, + NULL, 0, 0, 0.0, + 0.0, false }, }, @@ -1369,53 +1561,65 @@ struct realworld_constraint_test { 42, "foo", "batch", + NULL, FLUX_JOB_STATE_RUN, 0, 0.0, + 0.0, true, }, { 42, "foo", "batch", + NULL, FLUX_JOB_STATE_CLEANUP, 0, 0.0, + 0.0, true, }, { 43, "foo", "batch", + NULL, FLUX_JOB_STATE_RUN, 0, 0.0, + 0.0, false, }, { 42, "foo", "debug", + NULL, FLUX_JOB_STATE_RUN, 0, 0.0, + 0.0, false, }, { 42, "bar", "batch", + NULL, FLUX_JOB_STATE_RUN, 0, 0.0, + 0.0, false, }, { 42, "foo", "batch", + NULL, FLUX_JOB_STATE_INACTIVE, FLUX_JOB_RESULT_COMPLETED, + 0.0, 2000.0, false, }, @@ -1423,9 +1627,11 @@ struct realworld_constraint_test { 0, NULL, NULL, + NULL, 0, 0, 0.0, + 0.0, false }, }, @@ -1443,26 +1649,32 @@ struct realworld_constraint_test { 42, "foo", "batch", + NULL, FLUX_JOB_STATE_SCHED, 0, 0.0, + 0.0, false, }, { 42, "foo", "batch", + NULL, FLUX_JOB_STATE_RUN, 0, 0.0, + 0.0, false, }, { 42, "foo", "batch", + NULL, FLUX_JOB_STATE_INACTIVE, FLUX_JOB_RESULT_COMPLETED, + 0.0, 100.0, false, }, @@ -1470,8 +1682,10 @@ struct realworld_constraint_test { 42, "foo", "batch", + NULL, FLUX_JOB_STATE_INACTIVE, FLUX_JOB_RESULT_COMPLETED, + 0.0, 1000.0, true, }, @@ -1479,9 +1693,178 @@ struct realworld_constraint_test { 0, NULL, NULL, + NULL, 0, 0, 0.0, + 0.0, + false + }, + }, + }, + { + /* jobs for a user 
that ran on specific hostlist */ + "{ \"and\": \ + [ \ + { \"userid\": [ 42 ] }, \ + { \"hostlist\": [ \"node1\", \"node2\" ] } \ + ] \ + }", + { + { + 42, + "foo", + "batch", + "node[1-3]", + FLUX_JOB_STATE_INACTIVE, + FLUX_JOB_RESULT_COMPLETED, + 0.0, + 0.0, + true, + }, + { + 43, + "foo", + "batch", + "node[1-3]", + FLUX_JOB_STATE_INACTIVE, + FLUX_JOB_RESULT_COMPLETED, + 0.0, + 0.0, + false, + }, + { + 42, + "foo", + "batch", + "node[2-4]", + FLUX_JOB_STATE_INACTIVE, + FLUX_JOB_RESULT_COMPLETED, + 0.0, + 0.0, + true, + }, + { + 42, + "foo", + "batch", + "node[3-4]", + FLUX_JOB_STATE_INACTIVE, + FLUX_JOB_RESULT_COMPLETED, + 0.0, + 0.0, + false, + }, + { + 0, + NULL, + NULL, + NULL, + 0, + 0, + 0.0, + 0.0, + false + }, + }, + }, + { + /* jobs that ran on specific hostlist during a time period + */ + "{ \"and\": \ + [ \ + { \"hostlist\": [ \"node1\", \"node2\" ] }, \ + { \"t_run\": [ \">=500.0\" ] }, \ + { \"t_inactive\": [ \"<=5000.0\" ] } \ + ] \ + }", + { + { + 42, + "foo", + "batch", + "node[1-3]", + FLUX_JOB_STATE_RUN, + 0, + 1000.0, + 0.0, + false, + }, + { + 42, + "foo", + "batch", + "node[1-3]", + FLUX_JOB_STATE_INACTIVE, + FLUX_JOB_RESULT_COMPLETED, + 1000.0, + 2000.0, + true, + }, + { + 42, + "foo", + "batch", + "node[2-3]", + FLUX_JOB_STATE_RUN, + 0, + 1000.0, + 0.0, + false, + }, + { + 42, + "foo", + "batch", + "node[2-3]", + FLUX_JOB_STATE_INACTIVE, + FLUX_JOB_RESULT_COMPLETED, + 1000.0, + 2000.0, + true, + }, + { + 42, + "foo", + "batch", + "node[2-3]", + FLUX_JOB_STATE_INACTIVE, + FLUX_JOB_RESULT_COMPLETED, + 1000.0, + 6000.0, + false, + }, + { + 42, + "foo", + "batch", + "node[3-4]", + FLUX_JOB_STATE_RUN, + 0, + 1000.0, + 0.0, + false, + }, + { + 42, + "foo", + "batch", + "node[3-4]", + FLUX_JOB_STATE_INACTIVE, + FLUX_JOB_RESULT_COMPLETED, + 1000.0, + 2000.0, + false, + }, + { + 0, + NULL, + NULL, + NULL, + 0, + 0, + 0.0, + 0.0, false }, }, @@ -1493,9 +1876,11 @@ struct realworld_constraint_test { 0, NULL, NULL, + NULL, 0, 0, 0.0, + 0.0, false }, }, 
@@ -1520,11 +1905,12 @@ static void test_realworld (void) job = setup_job (tests->userid, tests->name, tests->queue, + tests->nodelist, tests->state, tests->result, 0.0, 0.0, - 0.0, + tests->t_run, 0.0, tests->t_inactive); rv = job_match (job, c, &error); @@ -1553,6 +1939,8 @@ int main (int argc, char *argv[]) test_basic_queue (); test_basic_states (); test_basic_results (); + test_corner_case_hostlist (); + test_basic_hostlist (); test_basic_timestamp (); test_basic_conditionals (); test_realworld (); diff --git a/src/modules/job-list/test/state_match.c b/src/modules/job-list/test/state_match.c index f1fb5a0daf8d..0265634a6de6 100644 --- a/src/modules/job-list/test/state_match.c +++ b/src/modules/job-list/test/state_match.c @@ -1108,6 +1108,7 @@ struct state_match_constraint_test { { \"userid\": [ 42 ] }, \ { \"name\": [ \"foo\" ] }, \ { \"queue\": [ \"foo\" ] }, \ + { \"hostlist\": [ \"bar\" ] }, \ { \"states\": [ \"running\" ] }, \ { \"results\": [ \"completed\" ] }, \ { \"t_submit\": [ \">=500.0\" ] }, \ From 68165f83aea624c7b547f0fc56033e5323d03269 Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Wed, 3 Jan 2024 13:17:18 -0800 Subject: [PATCH 4/6] testsuite: convert to use flux submit Problem: In t2260-job-list.t, some test jobs are submitted using "flux job submit". This makes it inconvenient to set job requirements on those jobs, such as specific nodes the jobs should run on. Solution: Convert all uses of "flux job submit" to use "flux submit". --- t/t2260-job-list.t | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/t/t2260-job-list.t b/t/t2260-job-list.t index 429f424f4aa6..012b6c4e77df 100755 --- a/t/t2260-job-list.t +++ b/t/t2260-job-list.t @@ -53,17 +53,11 @@ test_expect_success 'create helper job submission script' ' # - alternate userid job listing test_expect_success 'submit jobs for job list testing' ' - # Create `hostname` and `sleep` jobspec - # N.B. 
Used w/ `flux job submit` for serial job submission - # for efficiency (vs serial `flux submit`. - # - flux submit --dry-run hostname >hostname.json && - flux submit --dry-run --time-limit=5m sleep 600 > sleeplong.json && # # submit jobs that will complete # for i in $(seq 0 3); do - flux job submit hostname.json >> inactiveids + flux submit hostname >> inactiveids fj_wait_event `tail -n 1 inactiveids` clean done && # @@ -117,20 +111,20 @@ test_expect_success 'submit jobs for job list testing' ' # Submit 8 sleep jobs to fill up resources # for i in $(seq 0 7); do - flux job submit sleeplong.json >> runningids + flux submit --time-limit=5m sleep 600 >> runningids done && tac runningids | flux job id > running.ids && # # Submit a set of jobs with misc urgencies # - id1=$(flux job submit -u20 hostname.json) && - id2=$(flux job submit hostname.json) && - id3=$(flux job submit -u31 hostname.json) && - id4=$(flux job submit -u0 hostname.json) && - id5=$(flux job submit -u20 hostname.json) && - id6=$(flux job submit hostname.json) && - id7=$(flux job submit -u31 hostname.json) && - id8=$(flux job submit -u0 hostname.json) && + id1=$(flux submit --urgency=20 hostname) && + id2=$(flux submit hostname) && + id3=$(flux submit --urgency=31 hostname) && + id4=$(flux submit --urgency=0 hostname) && + id5=$(flux submit --urgency=20 hostname) && + id6=$(flux submit hostname) && + id7=$(flux submit --urgency=31 hostname) && + id8=$(flux submit --urgency=0 hostname) && flux job id $id3 > pending.ids && flux job id $id7 >> pending.ids && flux job id $id1 >> pending.ids && @@ -2798,6 +2792,7 @@ test_expect_success 'reload job-ingest without validator' ' ' test_expect_success 'create illegal jobspec with empty command array' ' + flux submit --dry-run hostname > hostname.json && cat hostname.json | $jq ".tasks[0].command = []" > bad_jobspec.json ' From fc2aece042da3a34a65866339584237ffefef990 Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Tue, 2 Jan 2024 22:04:28 -0800 Subject: [PATCH 
5/6] testsuite: setup fake hostnames for job list tests Problem: In the near future we would like to filter jobs based on nodes in a job's nodelist. This would be an issue in the current test setup because test brokers all run under the same host and it is unknown which nodes jobs actually ran on. Solution: Setup fake hostnames for the test brokers in t2260-job-list.t and when necessary, run test jobs on specific hosts. --- t/t2260-job-list.t | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/t/t2260-job-list.t b/t/t2260-job-list.t index 012b6c4e77df..0129153d0e52 100755 --- a/t/t2260-job-list.t +++ b/t/t2260-job-list.t @@ -26,6 +26,15 @@ wait_jobid_state() { flux job list-ids --wait-state=$2 $1 > /dev/null } +test_expect_success 'setup specific fake hostnames' ' + flux R encode -r 0-3 -c 0-1 -H node[0-3] \ + | tr -d "\n" \ + | flux kvs put -r resource.R=- && + flux module unload sched-simple && + flux module reload resource noverify && + flux module load sched-simple +' + test_expect_success 'create helper job submission script' ' cat >sleepinf.sh <<-EOT && #!/bin/sh @@ -57,7 +66,7 @@ test_expect_success 'submit jobs for job list testing' ' # submit jobs that will complete # for i in $(seq 0 3); do - flux submit hostname >> inactiveids + flux submit --requires=host:node${i} hostname >> inactiveids fj_wait_event `tail -n 1 inactiveids` clean done && # @@ -68,7 +77,7 @@ test_expect_success 'submit jobs for job list testing' ' # Run a job that will fail, copy its JOBID to both inactive and # failed lists. # - ! jobid=`flux submit --wait nosuchcommand` && + ! jobid=`flux submit --requires=host:node0 --wait nosuchcommand` && echo $jobid >> inactiveids && flux job id $jobid > failedids && # @@ -78,7 +87,7 @@ test_expect_success 'submit jobs for job list testing' ' # N.B. sleepinf.sh and wait-event on job data to workaround # rare job startup race. 
See #5210 # - jobid=`flux submit ./sleepinf.sh` && + jobid=`flux submit --requires=host:node0 ./sleepinf.sh` && flux job wait-event -W -p guest.output $jobid data && flux job kill $jobid && fj_wait_event $jobid clean && @@ -92,7 +101,7 @@ test_expect_success 'submit jobs for job list testing' ' # N.B. sleepinf.sh and wait-event on job data to workaround # rare job startup race. See #5210 # - jobid=`flux submit ./sleepinf.sh` && + jobid=`flux submit --requires=host:node0 ./sleepinf.sh` && flux job wait-event -W -p guest.output $jobid data && flux job raise --type=myexception --severity=0 -m "myexception" $jobid && fj_wait_event $jobid clean && @@ -103,13 +112,16 @@ test_expect_success 'submit jobs for job list testing' ' # Run a job that will timeout, copy its JOBID to both inactive and # timeout lists. # - jobid=`flux submit --time-limit=0.5s sleep 30` && + jobid=`flux submit --requires=host:node0 --time-limit=0.5s sleep 30` && echo $jobid >> inactiveids && flux job id $jobid > timeout.ids && fj_wait_event ${jobid} clean && # # Submit 8 sleep jobs to fill up resources # + # N.B. no need to specify --requires:host, will be distributed + # evenly + # for i in $(seq 0 7); do flux submit --time-limit=5m sleep 600 >> runningids done && @@ -117,6 +129,8 @@ test_expect_success 'submit jobs for job list testing' ' # # Submit a set of jobs with misc urgencies # + # N.B. no need to specify --requires:host, these jobs wont run + # id1=$(flux submit --urgency=20 hostname) && id2=$(flux submit hostname) && id3=$(flux submit --urgency=31 hostname) && From d381bf3f0ef12f571b67cbb29f52495073c9679b Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Wed, 3 Jan 2024 22:03:29 -0800 Subject: [PATCH 6/6] testsuite: add hostlist constraint coverage Problem: There is no testing / coverage for the new hostlist constraint in the job-list module. Add some tests to t2260-job-list.t. 
--- t/t2260-job-list.t | 107 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) diff --git a/t/t2260-job-list.t b/t/t2260-job-list.t index 0129153d0e52..8fea127a0b13 100755 --- a/t/t2260-job-list.t +++ b/t/t2260-job-list.t @@ -616,6 +616,113 @@ test_expect_success 'flux job list all via t_depend (3)' ' test $(cat list_constraint_all3.out | wc -l) -eq ${numlines} ' +# +# nodelist / hostlist constraint filtering +# + +# N.B. all failed and timeout jobs we explicitly ran on node0, so +# tests below don't test against node0 to make things easier. + +# N.B. failed.ids are jobs that failed after running, thus will have +# had a node assigned. timeout.ids obviously ran on a node, but timed +# out. +test_expect_success 'flux job list all jobs that ran on any node (1)' ' + constraint="{ and: [ {hostlist:[\"node[0-3]\"]} ] }" && + $jq -j -c -n "{max_entries:1000, attrs:[], constraint:${constraint}}" \ + | $RPC job-list.list | $jq .jobs | $jq -c '.[]' | $jq .id > constraint_hostlist1.out && + numlines=$(cat completed.ids running.ids failed.ids timeout.ids | wc -l) && + test $(cat constraint_hostlist1.out | wc -l) -eq ${numlines} +' + +test_expect_success 'flux job list all jobs that ran on any node (2)' ' + constraint="{ and: [ {hostlist:[\"node[2-3]\", \"node1\", \"node0\"]} ] }" && + $jq -j -c -n "{max_entries:1000, attrs:[], constraint:${constraint}}" \ + | $RPC job-list.list | $jq .jobs | $jq -c '.[]' | $jq .id > constraint_hostlist2.out && + numlines=$(cat completed.ids running.ids failed.ids timeout.ids | wc -l) && + test $(cat constraint_hostlist2.out | wc -l) -eq ${numlines} +' + +# We evenly distributed non-bad jobs on nodes, so should be half of the jobs +test_expect_success 'flux job list all jobs that ran on nodes[1-2] (1)' ' + constraint="{ and: [ {hostlist:[\"node[1-2]\"]} ] }" && + $jq -j -c -n "{max_entries:1000, attrs:[], constraint:${constraint}}" \ + | $RPC job-list.list | $jq .jobs | $jq -c '.[]' | $jq .id > 
constraint_hostlist3.out && + numlines=$(expr $(cat completed.ids running.ids | wc -l) / 2) && + test $(cat constraint_hostlist3.out | wc -l) -eq ${numlines} +' + +# We evenly distributed non-bad jobs on nodes, so should be half of the jobs +test_expect_success 'flux job list all jobs that ran on nodes[1-2] (2)' ' + constraint="{ and: [ {hostlist:[\"node1\", \"node2\"]} ] }" && + $jq -j -c -n "{max_entries:1000, attrs:[], constraint:${constraint}}" \ + | $RPC job-list.list | $jq .jobs | $jq -c '.[]' | $jq .id > constraint_hostlist4.out && + numlines=$(expr $(cat completed.ids running.ids | wc -l) / 2) && + test $(cat constraint_hostlist4.out | wc -l) -eq ${numlines} +' + +# We evenly distributed non-bad jobs on nodes, so should be quarter of the jobs +test_expect_success 'flux job list all jobs that ran on node3' ' + constraint="{ and: [ {hostlist:[\"node3\"]} ] }" && + $jq -j -c -n "{max_entries:1000, attrs:[], constraint:${constraint}}" \ + | $RPC job-list.list | $jq .jobs | $jq -c '.[]' | $jq .id > constraint_hostlist5.out && + numlines=$(expr $(cat completed.ids running.ids | wc -l) / 4) && + test $(cat constraint_hostlist5.out | wc -l) -eq ${numlines} +' + +# We evenly distributed completed jobs on nodes, so should be quarter of the jobs +test_expect_success 'flux job list completed jobs that ran on node3' ' + state=`${JOB_CONV} strtostate INACTIVE` && + constraint="{ and: [ {hostlist:[\"node3\"]}, {states:[${state}]} ] }" && + $jq -j -c -n "{max_entries:1000, attrs:[], constraint:${constraint}}" \ + | $RPC job-list.list | $jq .jobs | $jq -c '.[]' | $jq .id > constraint_hostlist6.out && + numlines=$(expr $(cat completed.ids | wc -l) / 4) && + test $(cat constraint_hostlist6.out | wc -l) -eq ${numlines} +' + +# We evenly distributed running jobs on nodes, so should be quarter of the jobs +test_expect_success 'flux job list running jobs that ran on node3' ' + state=`${JOB_CONV} strtostate RUNNING` && + constraint="{ and: [ {hostlist:[\"node3\"]}, 
{states:[${state}]} ] }" && + $jq -j -c -n "{max_entries:1000, attrs:[], constraint:${constraint}}" \ + | $RPC job-list.list | $jq .jobs | $jq -c '.[]' | $jq .id > constraint_hostlist7.out && + numlines=$(expr $(cat running.ids | wc -l) / 4) && + test $(cat constraint_hostlist7.out | wc -l) -eq ${numlines} +' + +# We evenly distributed running jobs on nodes, so should be half of the jobs +# For this test, get start time of first running job +test_expect_success 'flux job list of running jobs that ran on node[1-2] after certain time (1)' ' + id=`tail -n1 running.ids` && + t_submit=`flux job list-ids ${id} | $jq .t_submit` && + constraint="{ and: [ {hostlist:[\"node[1-2]\"]}, {t_submit:[\">=${t_submit}\"]} ] }" && + $jq -j -c -n "{max_entries:1000, attrs:[], constraint:${constraint}}" \ + | $RPC job-list.list | $jq .jobs | $jq -c '.[]' | $jq .id > constraint_hostlist8.out && + numlines=$(expr $(cat running.ids | wc -l) / 2) && + test $(cat constraint_hostlist8.out | wc -l) -eq ${numlines} +' + +# We evenly distributed running jobs on nodes, so should be half of the jobs +# For this test, get last inactive time of completed jobs +test_expect_success 'flux job list of running jobs that ran on node[1-2] after certain time (2)' ' + id=`head -n1 completed.ids` && + t_inactive=`flux job list-ids ${id} | $jq .t_inactive` && + constraint="{ and: [ {hostlist:[\"node[1-2]\"]}, {t_submit:[\">${t_inactive}\"]} ] }" && + $jq -j -c -n "{max_entries:1000, attrs:[], constraint:${constraint}}" \ + | $RPC job-list.list | $jq .jobs | $jq -c '.[]' | $jq .id > constraint_hostlist9.out && + numlines=$(expr $(cat running.ids | wc -l) / 2) && + test $(cat constraint_hostlist9.out | wc -l) -eq ${numlines} +' + +# We evenly distributed completed & running jobs on nodes, so should be half of the jobs +test_expect_success 'flux job list of all jobs that ran on node[1-2] after certain time' ' + id=`head -n1 completed.ids` && + constraint="{ and: [ {hostlist:[\"node[1-2]\"]}, {t_submit:[\">5\"]} 
] }" &&
+	$jq -j -c -n "{max_entries:1000, attrs:[], constraint:${constraint}}" \
+	    | $RPC job-list.list | $jq .jobs | $jq -c '.[]' | $jq .id > constraint_hostlist10.out &&
+	numlines=$(expr $(cat completed.ids running.ids | wc -l) / 2) &&
+	test $(cat constraint_hostlist10.out | wc -l) -eq ${numlines}
+'
+
 #
 # legacy RPC tests
 #