Skip to content

Commit

Permalink
Merge pull request #6541 from grondo/issue#6418
Browse files Browse the repository at this point in the history
add `resource.rediscover` config key to force rediscovery of subinstance resources
  • Loading branch information
mergify[bot] authored Jan 7, 2025
2 parents e6b108f + 41dc3a3 commit 01b4d87
Show file tree
Hide file tree
Showing 9 changed files with 114 additions and 76 deletions.
4 changes: 4 additions & 0 deletions doc/man5/flux-config-resource.rst
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@ noverify
(optional) If true, disable the draining of nodes when there is a
discrepancy between configured resources and HWLOC-probed resources.

rediscover
(optional) If true, force rediscovery of resources using HWLOC, rather
then using the R and HWLOC XML from the enclosing instance.

Note that updates to the resource table are ignored until the next Flux
restart.

Expand Down
18 changes: 11 additions & 7 deletions src/modules/resource/inventory.c
Original file line number Diff line number Diff line change
Expand Up @@ -519,7 +519,8 @@ static int lookup_R_fallback (struct inventory *inv, flux_jobid_t id)
return rc;
}

static int start_resource_watch (struct inventory *inv, bool no_resource_watch)
static int start_resource_watch (struct inventory *inv,
struct resource_config *config)
{
flux_t *h = inv->ctx->h;
const char *jobid;
Expand All @@ -534,7 +535,7 @@ static int start_resource_watch (struct inventory *inv, bool no_resource_watch)
* simulate start under an older instance that does not support this
* RPC.
*/
if (no_resource_watch)
if (config->no_update_watch)
service = "job-info.update-watch-fake";

if (!(jobid = flux_attr_get (h, "jobid")))
Expand Down Expand Up @@ -594,7 +595,11 @@ static int start_resource_watch (struct inventory *inv, bool no_resource_watch)
}
flux_future_reset (f);
inv->R_watch_f = f;
if (R) { // R = NULL if no conversion possible (fall through to discovery)

/* If R == NULL (no conversion possible) or rediscover == true, then
* we will fall through to dynamic discovery.
*/
if (R && !config->rediscover) {
if (inventory_put (inv, R, "job-info") < 0)
goto done;
if (flux_future_then (f,
Expand Down Expand Up @@ -898,8 +903,7 @@ void inventory_destroy (struct inventory *inv)
}

struct inventory *inventory_create (struct resource_ctx *ctx,
json_t *conf_R,
bool no_update_watch)
struct resource_config *config)
{
struct inventory *inv;
json_t *R = NULL;
Expand All @@ -909,14 +913,14 @@ struct inventory *inventory_create (struct resource_ctx *ctx,
inv->ctx = ctx;
if (flux_msg_handler_addvec (ctx->h, htab, inv, &inv->handlers) < 0)
goto error;
if (conf_R && convert_R_conf (ctx->h, conf_R, &R) < 0)
if (config->R && convert_R_conf (ctx->h, config->R, &R) < 0)
goto error;
if (ctx->rank == 0) {
if (R && inventory_put (inv, R, "configuration") < 0)
goto error;
if (!inv->R && get_from_kvs (inv, "resource.R") < 0)
goto error;
if (!inv->R && start_resource_watch (inv, no_update_watch) < 0)
if (!inv->R && start_resource_watch (inv, config) < 0)
goto error;
}
else {
Expand Down
5 changes: 2 additions & 3 deletions src/modules/resource/inventory.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,11 @@
#define _FLUX_RESOURCE_INVENTORY_H

/* Create resource inventory.
* R is configured resource object, if any (ref taken).
* config->R is configured resource object, if any (ref taken).
* R is obtained from enclosing Flux instance or probed dynamically otherwise.
*/
struct inventory *inventory_create (struct resource_ctx *ctx,
json_t *R,
bool no_update_watch);
struct resource_config *config);
void inventory_destroy (struct inventory *inv);

/* Get resource object.
Expand Down
88 changes: 33 additions & 55 deletions src/modules/resource/resource.c
Original file line number Diff line number Diff line change
Expand Up @@ -57,14 +57,14 @@
* no-update-watch = false
* For testing purposes, simulate missing job-info.update-watch service
* in parent instance by sending to an invalid service name.
*
* rediscover = false
* Force rediscovery of local resources via hwloc. Do not fetch R or hwloc
* XML from the enclosing instance.
*/
static int parse_config (struct resource_ctx *ctx,
const flux_conf_t *conf,
const char **excludep,
json_t **R,
bool *noverifyp,
bool *norestrictp,
bool *no_update_watchp,
struct resource_config *rconfig,
flux_error_t *errp)
{
flux_error_t error;
Expand All @@ -74,20 +74,22 @@ static int parse_config (struct resource_ctx *ctx,
int noverify = 0;
int norestrict = 0;
int no_update_watch = 0;
int rediscover = 0;
json_t *o = NULL;
json_t *config = NULL;

if (flux_conf_unpack (conf,
&error,
"{s?{s?s s?s s?o s?s s?b s?b s?b !}}",
"{s?{s?s s?s s?o s?s s?b s?b s?b s?b !}}",
"resource",
"path", &path,
"scheduling", &scheduling_path,
"config", &config,
"exclude", &exclude,
"norestrict", &norestrict,
"noverify", &noverify,
"no-update-watch", &no_update_watch) < 0) {
"no-update-watch", &no_update_watch,
"rediscover", &rediscover) < 0) {
errprintf (errp,
"error parsing [resource] configuration: %s",
error.text);
Expand Down Expand Up @@ -143,16 +145,14 @@ static int parse_config (struct resource_ctx *ctx,
return -1;
}
}
if (excludep)
*excludep = exclude;
if (noverifyp)
*noverifyp = noverify ? true : false;
if (norestrictp)
*norestrictp = norestrict ? true : false;
if (no_update_watchp)
*no_update_watchp = no_update_watch ? true : false;
if (R)
*R = o;
if (rconfig) {
rconfig->exclude_idset = exclude;
rconfig->noverify = noverify ? true : false;
rconfig->norestrict = norestrict ? true : false;
rconfig->no_update_watch = no_update_watch ? true : false;
rconfig->rediscover = rediscover ? true : false;
rconfig->R = o;
}
else
json_decref (o);
return 0;
Expand All @@ -174,7 +174,7 @@ static void config_reload_cb (flux_t *h,

if (flux_conf_reload_decode (msg, &conf) < 0)
goto error;
if (parse_config (ctx, conf, NULL, NULL, NULL, NULL, NULL, &error) < 0) {
if (parse_config (ctx, conf, NULL, &error) < 0) {
errstr = error.text;
goto error;
}
Expand Down Expand Up @@ -305,22 +305,19 @@ static int reload_eventlog (flux_t *h, json_t **eventlog)
return -1;
}

int parse_args (flux_t *h,
int argc,
int parse_args (flux_t *h, int argc,
char **argv,
bool *monitor_force_up,
bool *noverify,
bool *no_update_watch)
struct resource_config *config)
{
int i;
for (i = 0; i < argc; i++) {
/* Test option to force all ranks to be marked online in the initial
* 'restart' event posted to resource.eventlog.
*/
if (streq (argv[i], "monitor-force-up"))
*monitor_force_up = true;
config->monitor_force_up = true;
else if (streq (argv[i], "noverify"))
*noverify = true;
config->noverify = true;
else {
flux_log (h, LOG_ERR, "unknown option: %s", argv[i]);
errno = EINVAL;
Expand All @@ -335,52 +332,31 @@ int mod_main (flux_t *h, int argc, char **argv)
{
struct resource_ctx *ctx;
flux_error_t error;
const char *exclude_idset;
json_t *eventlog = NULL;
bool monitor_force_up = false;
bool noverify = false;
bool norestrict = false;
bool no_update_watch = false;
json_t *R_from_config;
struct resource_config config = {0};

if (!(ctx = resource_ctx_create (h)))
goto error;
if (flux_get_size (h, &ctx->size) < 0)
goto error;
if (flux_get_rank (h, &ctx->rank) < 0)
goto error;
if (parse_config (ctx,
flux_get_conf (h),
&exclude_idset,
&R_from_config,
&noverify,
&norestrict,
&no_update_watch,
&error) < 0) {
if (parse_config (ctx, flux_get_conf (h), &config, &error) < 0) {
flux_log (h, LOG_ERR, "%s", error.text);
goto error;
}
if (parse_args (h,
argc,
argv,
&monitor_force_up,
&noverify,
&no_update_watch) < 0)
if (parse_args (h, argc, argv, &config) < 0)
goto error;
if (flux_attr_get (ctx->h, "broker.recovery-mode"))
noverify = true;
config.noverify = true;

/* Note: Order of creation of resource subsystems is important.
* Create inventory on all ranks first, since it is required by
* the exclude and drain subsystems on rank 0.
*/
if (!(ctx->inventory = inventory_create (ctx,
R_from_config,
no_update_watch)))
if (!(ctx->inventory = inventory_create (ctx, &config)))
goto error;
/* Done with R_from_config now, so free it.
*/
json_decref (R_from_config);

if (ctx->rank == 0) {
/* Create reslog and reload eventlog before initializing
* acquire, exclude, and drain subsystems, since these
Expand All @@ -403,19 +379,19 @@ int mod_main (flux_t *h, int argc, char **argv)
* the exclude idset to ensure drained ranks that are now
* excluded are ignored.
*/
if (!(ctx->exclude = exclude_create (ctx, exclude_idset)))
if (!(ctx->exclude = exclude_create (ctx, config.exclude_idset)))
goto error;
if (!(ctx->drain = drain_create (ctx, eventlog)))
goto error;
}
/* topology is initialized after exclude/drain etc since this
* rank may attempt to drain itself due to a topology mismatch.
*/
if (!(ctx->topology = topo_create (ctx, noverify, norestrict)))
if (!(ctx->topology = topo_create (ctx, &config)))
goto error;
if (!(ctx->monitor = monitor_create (ctx,
inventory_get_size (ctx->inventory),
monitor_force_up)))
config.monitor_force_up)))
goto error;
if (!(ctx->status = status_create (ctx)))
goto error;
Expand All @@ -427,10 +403,12 @@ int mod_main (flux_t *h, int argc, char **argv)
}
resource_ctx_destroy (ctx);
json_decref (eventlog);
json_decref (config.R);
return 0;
error:
resource_ctx_destroy (ctx);
ERRNO_SAFE_WRAP (json_decref, eventlog);
ERRNO_SAFE_WRAP (json_decref, config.R);
return -1;
}

Expand Down
10 changes: 10 additions & 0 deletions src/modules/resource/resource.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,16 @@
#ifndef _FLUX_RESOURCE_H
#define _FLUX_RESOURCE_H

struct resource_config {
json_t *R;
const char *exclude_idset;
bool rediscover;
bool noverify;
bool norestrict;
bool no_update_watch;
bool monitor_force_up;
};

struct resource_ctx {
flux_t *h;
flux_msg_handler_t **handlers;
Expand Down
19 changes: 10 additions & 9 deletions src/modules/resource/topo.c
Original file line number Diff line number Diff line change
Expand Up @@ -275,22 +275,24 @@ void topo_destroy (struct topo *topo)
}
}

static char *topo_get_local_xml (struct resource_ctx *ctx, bool no_restrict)
static char *topo_get_local_xml (struct resource_ctx *ctx,
struct resource_config *config)
{
flux_t *parent_h;
flux_future_t *f = NULL;
char *result = NULL;
const char *xml;

errno = 0;
if (!(parent_h = resource_parent_handle_open (ctx))
if (config->rediscover
|| !(parent_h = resource_parent_handle_open (ctx))
|| !(f = flux_rpc (parent_h,
"resource.topo-get",
NULL,
FLUX_NODEID_ANY,
0))
|| flux_rpc_get (f, &xml) < 0) {
rhwloc_flags_t flags = no_restrict ? RHWLOC_NO_RESTRICT : 0;
rhwloc_flags_t flags = config->norestrict ? RHWLOC_NO_RESTRICT : 0;
/* ENOENT just means there is no parent instance.
* No need for an error.
*/
Expand All @@ -305,8 +307,8 @@ static char *topo_get_local_xml (struct resource_ctx *ctx, bool no_restrict)
flux_log (ctx->h,
LOG_INFO,
"retrieved local hwloc XML from parent (norestrict=%s)",
no_restrict ? "true" : "false");
if (no_restrict) {
config->norestrict ? "true" : "false");
if (config->norestrict) {
result = strdup (xml);
goto out;
}
Expand All @@ -320,16 +322,15 @@ static char *topo_get_local_xml (struct resource_ctx *ctx, bool no_restrict)
}

struct topo *topo_create (struct resource_ctx *ctx,
bool no_verify,
bool no_restrict)
struct resource_config *config)
{
struct topo *topo;
json_t *R;

if (!(topo = calloc (1, sizeof (*topo))))
return NULL;
topo->ctx = ctx;
if (!(topo->xml = topo_get_local_xml (ctx, no_restrict))) {
if (!(topo->xml = topo_get_local_xml (ctx, config))) {
flux_log (ctx->h, LOG_ERR, "error loading hwloc topology");
goto error;
}
Expand All @@ -345,7 +346,7 @@ struct topo *topo_create (struct resource_ctx *ctx,

if (method && streq (method, "job-info"))
nodrain = true;
if (!no_verify && topo_verify (topo, R, nodrain) < 0)
if (!config->noverify && topo_verify (topo, R, nodrain) < 0)
goto error;
}
/* Reduce topo to rank 0 unconditionally in case it is needed.
Expand Down
3 changes: 1 addition & 2 deletions src/modules/resource/topo.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@
#define _FLUX_RESOURCE_TOPO_H

struct topo *topo_create (struct resource_ctx *ctx,
bool no_verify,
bool no_restrict);
struct resource_config *config);
void topo_destroy (struct topo *topo);


Expand Down
1 change: 1 addition & 0 deletions t/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ TESTSCRIPTS = \
t2313-resource-acquire.t \
t2314-resource-monitor.t \
t2315-resource-system.t \
t2316-resource-rediscover.t \
t2350-resource-list.t \
t2351-resource-status-input.t \
t2352-resource-cmd-config.t \
Expand Down
Loading

0 comments on commit 01b4d87

Please sign in to comment.