diff --git a/src/mca/plm/slurm/help-plm-slurm.txt b/src/mca/plm/slurm/help-plm-slurm.txt index 5f554484ff..2f71f2bb44 100644 --- a/src/mca/plm/slurm/help-plm-slurm.txt +++ b/src/mca/plm/slurm/help-plm-slurm.txt @@ -12,7 +12,7 @@ # All rights reserved. # Copyright (c) 2014-2020 Intel, Inc. All rights reserved. # Copyright (c) 2020 Cisco Systems, Inc. All rights reserved -# Copyright (c) 2022 Nanook Consulting. All rights reserved. +# Copyright (c) 2022-2024 Nanook Consulting All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -51,6 +51,7 @@ are running. Please consult with your system administrator about obtaining such support. +# [no-srun] The SLURM process starter for OpenMPI was unable to locate a usable "srun" command in its path. Please check your path @@ -80,3 +81,58 @@ process starter via the following MCA parameter: This will result in use of the ssh process starter. This will have no impact on your application, but will result in any accounting being done solely at the allocation level instead of per-job. +# +[custom-args-in-env] +The Slurm process starter for PRTE detected the presence of an MCA +parameter in the environment that assigns custom command line arguments +to the `srun` command used to start PRTE's daemons on remote nodes: + + Parameter value: %s + +This warning is provided to alert you (the user) to a perhaps +unintentional setting of command line arguments, or the unseen +overriding of your intended arguments by Slurm. + +Background: Starting with Slurm version 23.11, a command line argument +(`--external-launcher`) was added to `srun` to indicate that the +command was being initiated from within a third-party launcher (e.g., +`prte` or `prterun`). This allows Slurm to essentially freely modify +the `srun` command line while retaining a backward compatibility +capability when explicitly told to use it. 
Notably, the Slurm +environment does this by automatically setting the +PRTE_MCA_plm_slurm_args environment variable to pass in its own +command line arguments. This has the side effect of overriding most +user- or system-level settings. Note that arguments passed on the +PRTE command line will override any Slurm setting of the +PRTE_MCA_plm_slurm_args environment variable, but with potentially +undesirable side effects if newer versions of `srun` misinterpret or +fail to understand the user-specified arguments. + +If the setting of the MCA parameter was intentional, or if the +parameter looks acceptable to you, then please set the following +MCA parameter to disable this warning: + + Environment: PRTE_MCA_plm_slurm_disable_warning=true + Cmd line: --prtemca plm_slurm_disable_warning 1 + Default MCA param file: plm_slurm_disable_warning = true + +If you did not intentionally set the identified command line +arguments and do not wish them to be used, then set the +following MCA param to have them ignored: + + Environment: PRTE_MCA_plm_slurm_ignore_args=true + Cmd line: --prtemca plm_slurm_ignore_args 1 + Default MCA param file: plm_slurm_ignore_args = true + +Note that if you wish to provide custom `srun` command line +arguments and are finding them being overridden by Slurm, you +can ensure that your values are used by setting them with the +following param: + + Environment: PRTE_MCA_plm_slurm_force_args=foo + Cmd line: --prtemca plm_slurm_force_args foo + Default MCA param file: plm_slurm_force_args = foo + +Note that you may need to add the `--external-launcher` option +to your provided args to ensure that `srun` properly functions +if you are using a relatively recent release of Slurm. 
diff --git a/src/mca/plm/slurm/plm_slurm.h b/src/mca/plm/slurm/plm_slurm.h index 425d0acd89..d654a48e4d 100644 --- a/src/mca/plm/slurm/plm_slurm.h +++ b/src/mca/plm/slurm/plm_slurm.h @@ -13,7 +13,7 @@ * Copyright (c) 2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2022-2024 Nanook Consulting All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -33,6 +33,7 @@ BEGIN_C_DECLS struct prte_mca_plm_slurm_component_t { prte_plm_base_component_t super; + int custom_args_index; char *custom_args; bool slurm_warning_msg; }; diff --git a/src/mca/plm/slurm/plm_slurm_component.c b/src/mca/plm/slurm/plm_slurm_component.c index 7f6e20bb49..936386bbb6 100644 --- a/src/mca/plm/slurm/plm_slurm_component.c +++ b/src/mca/plm/slurm/plm_slurm_component.c @@ -16,7 +16,7 @@ * Copyright (c) 2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -84,17 +84,26 @@ prte_mca_plm_slurm_component_t prte_mca_plm_slurm_component = { here; will be initialized in plm_slurm_open() */ }; +static char *custom_args = NULL; +static char *force_args = NULL; + static int plm_slurm_register(void) { pmix_mca_base_component_t *comp = &prte_mca_plm_slurm_component.super; - prte_mca_plm_slurm_component.custom_args = NULL; - (void) pmix_mca_base_component_var_register(comp, "args", "Custom arguments to srun", + + prte_mca_plm_slurm_component.custom_args_index = + pmix_mca_base_component_var_register(comp, "args", "Custom arguments to srun", + PMIX_MCA_BASE_VAR_TYPE_STRING, + &custom_args); + + force_args = NULL; + (void) pmix_mca_base_component_var_register(comp, "force_args", "Mandatory custom arguments to srun", PMIX_MCA_BASE_VAR_TYPE_STRING, - &prte_mca_plm_slurm_component.custom_args); + &force_args); - prte_mca_plm_slurm_component.slurm_warning_msg = true; - (void) pmix_mca_base_component_var_register(comp, "warning", "Turn off warning message", + prte_mca_plm_slurm_component.slurm_warning_msg = false; + (void) pmix_mca_base_component_var_register(comp, "disable_warning", "Turn off warning message about custom args set in environment", PMIX_MCA_BASE_VAR_TYPE_BOOL, &prte_mca_plm_slurm_component.slurm_warning_msg); @@ -108,6 +117,9 @@ static int plm_slurm_open(void) static int prte_mca_plm_slurm_component_query(pmix_mca_base_module_t **module, int *priority) { + const pmix_mca_base_var_t *var; + pmix_status_t rc; + /* Are we running under a SLURM job? 
*/ if (NULL != getenv("SLURM_JOBID")) { @@ -117,6 +129,30 @@ static int prte_mca_plm_slurm_component_query(pmix_mca_base_module_t **module, i "%s plm:slurm: available for selection", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); + prte_mca_plm_slurm_component.custom_args = NULL; + + // if we are warning about externally set custom args, then + // check to see if that was done + if (!prte_mca_plm_slurm_component.slurm_warning_msg && + NULL == force_args) { + // check for custom args + rc = pmix_mca_base_var_get(prte_mca_plm_slurm_component.custom_args_index, &var); + if (PMIX_SUCCESS == rc) { + // the variable was set - see who set it + if (PMIX_MCA_BASE_VAR_SOURCE_ENV == var->mbv_source) { + // set in the environment - warn + pmix_show_help("help-plm-slurm.txt", "custom-args-in-env", true, + custom_args); + } + } + } + + if (NULL != force_args) { + prte_mca_plm_slurm_component.custom_args = force_args; + } else if (NULL != custom_args) { + prte_mca_plm_slurm_component.custom_args = custom_args; + } + *module = (pmix_mca_base_module_t *) &prte_plm_slurm_module; return PRTE_SUCCESS; }