Skip to content

Commit

Permalink
Merge pull request #1607 from giuseppe/exec-affinity
Browse files Browse the repository at this point in the history
add support for exec cpu affinity
  • Loading branch information
rhatdan authored Nov 26, 2024
2 parents 2b3faef + fd69065 commit 52ed588
Show file tree
Hide file tree
Showing 12 changed files with 254 additions and 156 deletions.
2 changes: 1 addition & 1 deletion libocispec
Submodule libocispec updated 2 files
+1 −1 image-spec
+1 −1 runtime-spec
15 changes: 0 additions & 15 deletions src/libcrun/cgroup-setup.c
Original file line number Diff line number Diff line change
Expand Up @@ -506,20 +506,5 @@ enter_cgroup (int cgroup_mode, pid_t pid, pid_t init_pid, const char *path,
if (UNLIKELY (ret < 0))
return ret;
}
/* Reset the inherited cpu affinity. Old kernels do that automatically, but
new kernels remember the affinity that was set before the cgroup move.
This is undesirable, because it inherits the systemd affinity when the container
should really move to the container space cpus.
The sched_setaffinity call will always return an error (EINVAL or ENODEV)
when used like this. This is expected and part of the backward compatibility.
See: https://issues.redhat.com/browse/OCPBUGS-15102 */
ret = sched_setaffinity (pid, 0, NULL);
if (LIKELY (ret < 0))
{
if (UNLIKELY (! ((errno == EINVAL) || (errno == ENODEV))))
return crun_make_error (err, errno, "failed to reset affinity");
}
return 0;
}
69 changes: 0 additions & 69 deletions src/libcrun/cgroup-systemd.c
Original file line number Diff line number Diff line change
Expand Up @@ -106,75 +106,6 @@ property_missing_p (char **missing_properties, const char *property)
return false;
}

/* Parse a CPU list such as "0-3,8,10-11" into a bitmask.

   STR is a comma separated sequence of decimal numbers or inclusive
   ranges written as "first-last".  A trailing comma is tolerated.

   On success *OUT receives the bitmask (bit N % CHAR_BIT of byte
   N / CHAR_BIT is set when N was listed) and *OUT_SIZE its length in
   bytes; the buffer is handed over to the caller.  For an empty STR,
   *OUT is NULL and *OUT_SIZE is 0.

   Returns 0 on success, otherwise the value produced by crun_make_error.  */
int
cpuset_string_to_bitmask (const char *str, char **out, size_t *out_size, libcrun_error_t *err)
{
  cleanup_free char *bits = NULL;
  size_t n_bytes = 0;
  const char *cur;
  char *end;

  for (cur = str; *cur != '\0';)
    {
      long long first, last, cpu;

      /* Every item must begin with a digit: this rejects signs and
         leading whitespace that strtoll would otherwise accept.  */
      if (! (*cur >= '0' && *cur <= '9'))
        goto bad_input;

      first = strtoll (cur, &end, 10);
      if (first < 0)
        goto bad_input;
      cur = end;

      /* A single number is a range of length one.  */
      last = first;
      if (*cur == '-')
        {
          cur++;

          if (! (*cur >= '0' && *cur <= '9'))
            goto bad_input;

          last = strtoll (cur, &end, 10);
          if (last < first)
            goto bad_input;

          cur = end;
        }

      /* Just set some limit. */
      if (last > (1 << 20))
        goto bad_input;

      /* Grow the mask so that bit LAST fits, zeroing the new tail.  */
      if ((long long) (n_bytes * CHAR_BIT) <= last)
        {
          size_t wanted = (last / CHAR_BIT) + 1;

          bits = xrealloc (bits, wanted);
          memset (bits + n_bytes, 0, wanted - n_bytes);
          n_bytes = wanted;
        }

      for (cpu = first; cpu <= last; cpu++)
        bits[cpu / CHAR_BIT] |= (1 << (cpu % CHAR_BIT));

      /* Only a comma or the end of the string may follow an item.  */
      if (*cur == '\0')
        break;
      if (*cur != ',')
        goto bad_input;
      cur++;
    }

  /* Hand the buffer over to the caller; clearing BITS keeps the
     cleanup attribute from releasing it.  */
  *out = bits;
  bits = NULL;
  *out_size = n_bytes;

  return 0;

bad_input:
  return crun_make_error (err, 0, "cannot parse input `%s`", str);
}

static void
get_systemd_scope_and_slice (const char *id, const char *cgroup_path, char **scope, char **slice)
{
Expand Down
2 changes: 0 additions & 2 deletions src/libcrun/cgroup-systemd.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@
#ifdef HAVE_SYSTEMD
extern int parse_sd_array (char *s, char **out, char **next, libcrun_error_t *err);

extern int cpuset_string_to_bitmask (const char *str, char **out, size_t *out_size, libcrun_error_t *err);

extern char *get_cgroup_scope_path (const char *cgroup_path, const char *scope);
#endif

Expand Down
63 changes: 0 additions & 63 deletions src/libcrun/cgroup.c
Original file line number Diff line number Diff line change
Expand Up @@ -100,37 +100,6 @@ find_delegate_cgroup (json_map_string_string *annotations)
return NULL;
}

/* Cleanup handler: P points to a pid_t.  When the stored pid is not
   negative, send SIGCONT to it, retrying when the call is interrupted.
   A negative pid means there is nothing to resume.  */
static inline void
cleanup_sig_contp (void *p)
{
  pid_t target = *(pid_t *) p;

  if (target >= 0)
    TEMP_FAILURE_RETRY (kill (target, SIGCONT));
}

/* Return true when RESOURCES contains any cpuset configuration: either
   the cpu section specifies cpus or mems, or one of the unified keys
   starts with "cpuset.".  A NULL RESOURCES yields false.  */
static bool
must_stop_proc (runtime_spec_schema_config_linux_resources *resources)
{
  size_t idx = 0;

  if (! resources)
    return false;

  if (resources->cpu != NULL)
    {
      if (resources->cpu->cpus || resources->cpu->mems)
        return true;
    }

  if (resources->unified == NULL)
    return false;

  /* Look for any cpuset.* key among the unified settings.  */
  while (idx < resources->unified->len)
    {
      if (has_prefix (resources->unified->keys[idx], "cpuset."))
        return true;
      idx++;
    }

  return false;
}

int
libcrun_cgroup_pause_unpause (struct libcrun_cgroup_status *status, const bool pause, libcrun_error_t *err)
{
Expand Down Expand Up @@ -284,7 +253,6 @@ libcrun_cgroup_preenter (struct libcrun_cgroup_args *args, int *dirfd, libcrun_e
int
libcrun_cgroup_enter (struct libcrun_cgroup_args *args, struct libcrun_cgroup_status **out, libcrun_error_t *err)
{
__attribute__ ((unused)) pid_t sigcont_cleanup __attribute__ ((cleanup (cleanup_sig_contp))) = -1;
/* status will be filled by the cgroup manager. */
cleanup_cgroup_status struct libcrun_cgroup_status *status = xmalloc0 (sizeof *status);
struct libcrun_cgroup_manager *cgroup_manager;
Expand All @@ -297,21 +265,6 @@ libcrun_cgroup_enter (struct libcrun_cgroup_args *args, struct libcrun_cgroup_st
if (UNLIKELY (cgroup_mode < 0))
return cgroup_mode;

/* If the cgroup configuration is limiting what CPUs/memory Nodes are available for the container,
then stop the container process during the cgroup configuration to avoid it being rescheduled on
a CPU that is not allowed. This extra step is required for setting up the sub cgroup with the
systemd driver. The alternative would be to temporarily setup the cpus/mems using d-bus.
*/
if (must_stop_proc (args->resources))
{
ret = TEMP_FAILURE_RETRY (kill (args->pid, SIGSTOP));
if (UNLIKELY (ret < 0))
return crun_make_error (err, errno, "cannot stop container process `%d` with SIGSTOP", args->pid);

/* Send SIGCONT as soon as the function exits. */
sigcont_cleanup = args->pid;
}

if (cgroup_mode == CGROUP_MODE_HYBRID)
{
/* We don't really support hybrid mode, so check that cgroups2 is not using any controller. */
Expand Down Expand Up @@ -381,22 +334,6 @@ libcrun_cgroup_enter (struct libcrun_cgroup_args *args, struct libcrun_cgroup_st
return ret;
}
}
/* Reset the inherited cpu affinity. Old kernels do that automatically, but
new kernels remember the affinity that was set before the cgroup move.
This is undesirable, because it inherits the systemd affinity when the container
should really move to the container space cpus.
The sched_setaffinity call will always return an error (EINVAL or ENODEV)
when used like this. This is expected and part of the backward compatibility.
See: https://issues.redhat.com/browse/OCPBUGS-15102 */
ret = sched_setaffinity (args->pid, 0, NULL);
if (LIKELY (ret < 0))
{
if (UNLIKELY (! ((errno == EINVAL) || (errno == ENODEV))))
return crun_make_error (err, errno, "failed to reset affinity");
}

success:
*out = status;
status = NULL;
Expand Down
12 changes: 8 additions & 4 deletions src/libcrun/container.c
Original file line number Diff line number Diff line change
Expand Up @@ -642,6 +642,10 @@ initialize_security (runtime_spec_schema_config_schema_process *proc, libcrun_er
{
int ret;

ret = libcrun_init_caps (err);
if (UNLIKELY (ret < 0))
return ret;

if (UNLIKELY (proc == NULL))
return 0;

Expand All @@ -656,10 +660,6 @@ initialize_security (runtime_spec_schema_config_schema_process *proc, libcrun_er
if (UNLIKELY (ret < 0))
return ret;

ret = libcrun_init_caps (err);
if (UNLIKELY (ret < 0))
return ret;

return 0;
}

Expand Down Expand Up @@ -2590,6 +2590,10 @@ libcrun_container_run_internal (libcrun_container_t *container, libcrun_context_
if (UNLIKELY (ret < 0))
goto fail;

ret = libcrun_reset_cpu_affinity_mask (pid, err);
if (UNLIKELY (ret < 0))
goto fail;

ret = libcrun_set_io_priority (pid, def->process, err);
if (UNLIKELY (ret < 0))
goto fail;
Expand Down
35 changes: 34 additions & 1 deletion src/libcrun/linux.c
Original file line number Diff line number Diff line change
Expand Up @@ -4738,6 +4738,15 @@ handle_pidfd_receiver (pid_t pid, libcrun_container_t *container, libcrun_error_
return send_fd_to_socket (client_fd, pidfd, err);
}

/* Return true when PROCESS has an exec_cpu_affinity section whose
   initial or final value is a non-empty string.  */
static bool
has_exec_cpu_affinity (runtime_spec_schema_config_schema_process *process)
{
  if (process != NULL && process->exec_cpu_affinity != NULL)
    {
      if (! is_empty_string (process->exec_cpu_affinity->initial))
        return true;
      if (! is_empty_string (process->exec_cpu_affinity->final))
        return true;
    }

  return false;
}

pid_t
libcrun_run_linux_container (libcrun_container_t *container, container_entrypoint_t entrypoint, void *args,
int *sync_socket_out, struct libcrun_dirfd_s *cgroup_dirfd, libcrun_error_t *err)
Expand Down Expand Up @@ -5053,6 +5062,20 @@ join_process_parent_helper (libcrun_context_t *context,
if (UNLIKELY (ret < 0))
return crun_make_error (err, errno, "waitpid for exec child pid");

if (process && process->exec_cpu_affinity)
{
ret = libcrun_set_cpu_affinity_from_string (pid, process->exec_cpu_affinity->initial, err);
if (UNLIKELY (ret < 0))
return ret;
}

if (! has_exec_cpu_affinity (process))
{
ret = libcrun_reset_cpu_affinity_mask (pid, err);
if (UNLIKELY (ret < 0))
return ret;
}

if (need_move_to_cgroup)
{
if (sub_cgroup)
Expand Down Expand Up @@ -5080,6 +5103,13 @@ join_process_parent_helper (libcrun_context_t *context,
return ret;
}

if (process && process->exec_cpu_affinity)
{
ret = libcrun_set_cpu_affinity_from_string (pid, process->exec_cpu_affinity->final, err);
if (UNLIKELY (ret < 0))
return ret;
}

ret = libcrun_apply_intelrdt (context->id, container, pid, LIBCRUN_INTELRDT_MOVE, err);
if (UNLIKELY (ret < 0))
return ret;
Expand Down Expand Up @@ -5307,7 +5337,10 @@ libcrun_join_process (libcrun_context_t *context,

memset (&clone3_args, 0, sizeof (clone3_args));
clone3_args.exit_signal = SIGCHLD;
if (cgroup_dirfd < 0)

/* Do not join the cgroup immediately if an initial CPU affinity mask is specified, so that
the process can set the cpu affinity before joining the target cgroup. */
if (cgroup_dirfd < 0 || (process->exec_cpu_affinity && process->exec_cpu_affinity->initial))
need_move_to_cgroup = true;
else
{
Expand Down
63 changes: 63 additions & 0 deletions src/libcrun/scheduler.c
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,30 @@ syscall_sched_setattr (pid_t pid, struct sched_attr_s *attr, unsigned int flags)
#endif
}

/* Ask the kernel to drop the CPU affinity inherited by PID.

   Returns 0 when the call succeeds or fails with EINVAL/ENODEV;
   any other failure is reported through ERR.  */
int
libcrun_reset_cpu_affinity_mask (pid_t pid, libcrun_error_t *err)
{
  /* Reset the inherited cpu affinity. Old kernels do that automatically, but
     new kernels remember the affinity that was set before the cgroup move.
     This is undesirable, because it inherits the systemd affinity when the container
     should really move to the container space cpus.
     The sched_setaffinity call will always return an error (EINVAL or ENODEV)
     when used like this. This is expected and part of the backward compatibility.
     See: https://issues.redhat.com/browse/OCPBUGS-15102 */
  if (LIKELY (sched_setaffinity (pid, 0, NULL) < 0))
    {
      /* Only errors other than the two expected ones are reported.  */
      if (UNLIKELY (errno != EINVAL && errno != ENODEV))
        return crun_make_error (err, errno, "failed to reset affinity");
    }

  return 0;
}

int
libcrun_set_scheduler (pid_t pid, runtime_spec_schema_config_schema_process *process, libcrun_error_t *err)
{
Expand Down Expand Up @@ -162,3 +186,42 @@ libcrun_set_scheduler (pid_t pid, runtime_spec_schema_config_schema_process *pro

return 0;
}

/* Set the CPU affinity of PID from a CPU list string.

   STR is parsed with cpuset_string_to_bitmask; each bit set in the
   resulting bitmask is copied into a dynamically sized cpu_set_t that
   is then passed to sched_setaffinity.  An empty STR is a no-op.

   Returns 0 on success.  A parse failure returns the parser's error
   value; a sched_setaffinity failure is reported through ERR with the
   errno of that call.  */
int
libcrun_set_cpu_affinity_from_string (pid_t pid, const char *str, libcrun_error_t *err)
{
  cleanup_free char *bitmask = NULL;
  int ret, saved_errno;
  size_t bitmask_size;
  cpu_set_t *cpuset;
  size_t alloc_size;
  size_t n_cpus;
  size_t i;

  if (is_empty_string (str))
    return 0;

  ret = cpuset_string_to_bitmask (str, &bitmask, &bitmask_size, err);
  if (UNLIKELY (ret < 0))
    return ret;

  n_cpus = bitmask_size * CHAR_BIT;

  /* CPU_ALLOC takes the number of CPUs, while the *_S macros and
     sched_setaffinity take the size in bytes given by CPU_ALLOC_SIZE.
     Passing the byte size to CPU_ALLOC would under-allocate the set
     as soon as the mask covers more than one word of CPUs.  */
  alloc_size = CPU_ALLOC_SIZE (n_cpus);

  cpuset = CPU_ALLOC (n_cpus);
  if (UNLIKELY (cpuset == NULL))
    OOM ();

  CPU_ZERO_S (alloc_size, cpuset);

  for (i = 0; i < n_cpus; i++)
    {
      if (bitmask[i / CHAR_BIT] & (1 << (i % CHAR_BIT)))
        CPU_SET_S (i, alloc_size, cpuset);
    }

  ret = sched_setaffinity (pid, alloc_size, cpuset);
  /* Save errno before CPU_FREE, which may overwrite it.  */
  saved_errno = errno;
  CPU_FREE (cpuset);
  if (UNLIKELY (ret < 0))
    return crun_make_error (err, saved_errno, "sched_setaffinity");
  return 0;
}
4 changes: 4 additions & 0 deletions src/libcrun/scheduler.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,8 @@

int libcrun_set_scheduler (pid_t pid, runtime_spec_schema_config_schema_process *process, libcrun_error_t *err);

int libcrun_set_cpu_affinity_from_string (pid_t pid, const char *str, libcrun_error_t *err);

int libcrun_reset_cpu_affinity_mask (pid_t pid, libcrun_error_t *err);

#endif
Loading

0 comments on commit 52ed588

Please sign in to comment.