From 879a9c2da75bbd64d1ca3b0399d1361530da4d13 Mon Sep 17 00:00:00 2001
From: "Mark A. Grondona"
Date: Tue, 19 Dec 2023 13:06:36 -0800
Subject: [PATCH] resource: fetch local hwloc XML from parent if possible

Problem: The resource module always loads local topology via
hwloc_topology_load(3), an expensive operation.

When there is a parent, attempt to fetch the topology XML from the
parent resource module. This is not only more efficient (much more
so on some systems), but also ensures that the initializing
subinstance is using the same hwloc topology used by the parent when
a fake topology is loaded for testing purposes.

If the request takes longer than 100ms or fails, or there is no
parent instance detected, then the old behavior is preserved.

Fixes #5611
---
 src/modules/resource/topo.c | 68 +++++++++++++++++++++++++++++++++++--
 1 file changed, 65 insertions(+), 3 deletions(-)

diff --git a/src/modules/resource/topo.c b/src/modules/resource/topo.c
index 53e15eb004d1..93664ab307f5 100644
--- a/src/modules/resource/topo.c
+++ b/src/modules/resource/topo.c
@@ -275,19 +275,81 @@ void topo_destroy (struct topo *topo)
     }
 }
 
+/* Return the local hwloc topology XML, or NULL on failure.
+ * The XML is requested from the parent instance's resource module when
+ * there is one.  If that request fails, takes longer than 100ms, or
+ * returns no payload, the topology is loaded locally instead.
+ * Unless no_restrict is set, XML obtained from the parent is restricted
+ * to the current CPU binding.
+ */
+static char *topo_get_local_xml (struct resource_ctx *ctx, bool no_restrict)
+{
+    flux_t *parent_h;
+    flux_future_t *f = NULL;
+    char *result = NULL;
+    const char *xml = NULL;
+
+    errno = 0;
+    if (!(parent_h = resource_parent_handle_open (ctx))
+        || !(f = flux_rpc (parent_h,
+                           "resource.topo-get",
+                           NULL,
+                           FLUX_NODEID_ANY,
+                           0))
+        || flux_future_wait_for (f, 0.1) < 0
+        || flux_rpc_get (f, &xml) < 0) {
+        /* ENOENT just means there is no parent instance.
+         * No need for an error.
+         */
+        if (errno && errno != ENOENT)
+            flux_log (ctx->h,
+                      LOG_DEBUG,
+                      "resource.topo-get to parent failed: %s",
+                      strerror (errno));
+        xml = NULL;
+    }
+    else if (!xml) {
+        /* flux_rpc_get(3) succeeds with a NULL string when the response
+         * has no payload.  Treat that like a failed request rather than
+         * passing NULL to strdup(3) below.
+         */
+        flux_log (ctx->h,
+                  LOG_DEBUG,
+                  "resource.topo-get to parent returned no payload");
+    }
+    if (!xml) {
+        rhwloc_flags_t flags = no_restrict ? RHWLOC_NO_RESTRICT : 0;
+        result = rhwloc_local_topology_xml (flags);
+        goto out;
+    }
+    flux_log (ctx->h,
+              LOG_INFO,
+              "retrieved local hwloc XML from parent");
+    if (no_restrict) {
+        result = strdup (xml);
+        goto out;
+    }
+    /* restrict topology to current CPU binding
+     */
+    result = rhwloc_topology_xml_restrict (xml);
+out:
+    flux_future_destroy (f);
+    resource_parent_handle_close (ctx);
+    return result;
+}
+
 struct topo *topo_create (struct resource_ctx *ctx,
                           bool no_verify,
                           bool no_restrict)
 {
     struct topo *topo;
     json_t *R;
-    int flags = no_restrict ? RHWLOC_NO_RESTRICT : 0;
 
     if (!(topo = calloc (1, sizeof (*topo))))
         return NULL;
     topo->ctx = ctx;
-    if (!(topo->xml = rhwloc_local_topology_xml (flags))) {
-        flux_log_error (ctx->h, "error loading hwloc topology");
+    if (!(topo->xml = topo_get_local_xml (ctx, no_restrict))) {
+        flux_log (ctx->h, LOG_ERR, "error loading hwloc topology");
         goto error;
     }
     if (!(topo->r_local = rlist_from_hwloc (ctx->rank, topo->xml))) {