Merge branch 'free-htab-element-out-of-bucket-lock'
Hou Tao says:

====================
The patch set continues the previous work [1] to move all freeing of
htab elements out of the bucket lock. One motivation for the patch set
is the locking problem reported by Sebastian [2]: freeing a bpf_timer
under PREEMPT_RT may acquire a spin-lock (namely softirq_expiry_lock).
However, the freeing procedure for an htab element already holds a
raw spin-lock (namely the bucket lock), so it triggers the warning
"BUG: scheduling while atomic", as demonstrated by the selftests patch.
Another motivation is to reduce the locked scope of the bucket lock.
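
To illustrate the constraint, here is a minimal, hypothetical sketch of
the invalid nesting described above (the lock names merely stand in for
the bucket lock and softirq_expiry_lock; this is not the actual kernel
code path). On PREEMPT_RT, spinlock_t is a sleeping lock, so acquiring
one inside a raw_spinlock_t critical section schedules while atomic:

#include <linux/spinlock.h>

/* Hypothetical stand-ins: bucket_lock for the raw htab bucket lock,
 * expiry_lock for hrtimer's softirq_expiry_lock (a normal spinlock_t).
 */
static DEFINE_RAW_SPINLOCK(bucket_lock);
static DEFINE_SPINLOCK(expiry_lock);

static void broken_free_path(void)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&bucket_lock, flags);
	/* Freeing an element that embeds a bpf_timer may have to wait for
	 * a running timer, which takes expiry_lock. On PREEMPT_RT this
	 * spinlock_t is an rt_mutex, so the nesting below triggers
	 * "BUG: scheduling while atomic".
	 */
	spin_lock(&expiry_lock);
	spin_unlock(&expiry_lock);
	raw_spin_unlock_irqrestore(&bucket_lock, flags);
}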

However, the patch set doesn't move all freeing of htab elements out of
the bucket lock; it still keeps the freeing of special fields in the
pre-allocated hash map under the protection of the bucket lock in
htab_map_update_elem(). The patch set is structured as follows:

* Patch #1 moves the element freeing out of the bucket lock for
  htab_lru_map_delete_node(). However, the freeing is still in the
  locked scope of the LRU raw spin-lock (the common pattern is sketched
  below this list).
* Patches #2~#3 move the element freeing out of the bucket lock for
  __htab_map_lookup_and_delete_elem().
* Patch #4 cancels the bpf_timer in two steps to fix the locking
  problem in htab_map_update_elem() for PREEMPT_RT.
* Patch #5 adds a selftest for the locking problem.
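
The common pattern behind patches #1~#3 can be sketched as follows
(identifiers mirror kernel/bpf/hashtab.c, but this is a simplified
illustration, not the literal diff): unlink the element while the raw
bucket lock is held, and only free it, including special fields such as
bpf_timer, after the lock has been dropped.

/* Simplified sketch of the deferred-free pattern; not the literal diff. */
static long delete_elem_sketch(struct bpf_htab *htab, struct bucket *b,
			       struct htab_elem *l, u32 hash)
{
	unsigned long flags;
	int ret;

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		return ret;

	/* Only unlink while the raw bucket lock is held. */
	hlist_nulls_del_rcu(&l->hash_node);

	htab_unlock_bucket(htab, b, hash, flags);

	/* The actual free (bpf_timer, kptrs, ...) happens outside the raw
	 * lock, so on PREEMPT_RT it may take sleeping locks safely.
	 */
	free_htab_elem(htab, l);
	return 0;
}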

Please see individual patches for more details. Comments are always
welcome.
---

v3:
 * patch #1: update the commit message to state that the freeing of
   special fields is still in the locked scope of the LRU raw spin-lock
 * patch #4: cancel the bpf_timer in two steps only for PREEMPT_RT
   (suggested by Alexei)

v2: https://lore.kernel.org/bpf/[email protected]
  * cancel the bpf_timer in two steps instead of breaking the reuse and
    refill of per-cpu ->extra_elems into two steps

v1: https://lore.kernel.org/bpf/[email protected]

[1]: https://lore.kernel.org/bpf/[email protected]
[2]: https://lore.kernel.org/bpf/[email protected]
====================

Link: https://patch.msgid.link/[email protected]
Signed-off-by: Alexei Starovoitov <[email protected]>
Alexei Starovoitov committed Jan 20, 2025
2 parents 01f3ce5 + 0a5d2ef commit d10cafc
Showing 4 changed files with 284 additions and 30 deletions.
60 changes: 32 additions & 28 deletions kernel/bpf/hashtab.c
@@ -824,13 +824,14 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
 	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
 		if (l == tgt_l) {
 			hlist_nulls_del_rcu(&l->hash_node);
-			check_and_free_fields(htab, l);
 			bpf_map_dec_elem_count(&htab->map);
 			break;
 		}
 
 	htab_unlock_bucket(htab, b, tgt_l->hash, flags);
 
+	if (l == tgt_l)
+		check_and_free_fields(htab, l);
 	return l == tgt_l;
 }
 

@@ -1634,41 +1635,44 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
 	l = lookup_elem_raw(head, hash, key, key_size);
 	if (!l) {
 		ret = -ENOENT;
-	} else {
-		if (is_percpu) {
-			u32 roundup_value_size = round_up(map->value_size, 8);
-			void __percpu *pptr;
-			int off = 0, cpu;
+		goto out_unlock;
+	}
 
-			pptr = htab_elem_get_ptr(l, key_size);
-			for_each_possible_cpu(cpu) {
-				copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu));
-				check_and_init_map_value(&htab->map, value + off);
-				off += roundup_value_size;
-			}
-		} else {
-			u32 roundup_key_size = round_up(map->key_size, 8);
+	if (is_percpu) {
+		u32 roundup_value_size = round_up(map->value_size, 8);
+		void __percpu *pptr;
+		int off = 0, cpu;
 
-			if (flags & BPF_F_LOCK)
-				copy_map_value_locked(map, value, l->key +
-						      roundup_key_size,
-						      true);
-			else
-				copy_map_value(map, value, l->key +
-					       roundup_key_size);
-			/* Zeroing special fields in the temp buffer */
-			check_and_init_map_value(map, value);
-		}
+		pptr = htab_elem_get_ptr(l, key_size);
+		for_each_possible_cpu(cpu) {
+			copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu));
+			check_and_init_map_value(&htab->map, value + off);
+			off += roundup_value_size;
+		}
+	} else {
+		u32 roundup_key_size = round_up(map->key_size, 8);
 
-		hlist_nulls_del_rcu(&l->hash_node);
-		if (!is_lru_map)
-			free_htab_elem(htab, l);
+		if (flags & BPF_F_LOCK)
+			copy_map_value_locked(map, value, l->key +
+					      roundup_key_size,
+					      true);
+		else
+			copy_map_value(map, value, l->key +
+				       roundup_key_size);
+		/* Zeroing special fields in the temp buffer */
+		check_and_init_map_value(map, value);
 	}
+	hlist_nulls_del_rcu(&l->hash_node);
 
+out_unlock:
 	htab_unlock_bucket(htab, b, hash, bflags);
 
-	if (is_lru_map && l)
-		htab_lru_push_free(htab, l);
+	if (l) {
+		if (is_lru_map)
+			htab_lru_push_free(htab, l);
+		else
+			free_htab_elem(htab, l);
+	}
 
 	return ret;
 }
18 changes: 16 additions & 2 deletions kernel/bpf/helpers.c
@@ -1593,10 +1593,24 @@ void bpf_timer_cancel_and_free(void *val)
 	 * To avoid these issues, punt to workqueue context when we are in a
 	 * timer callback.
 	 */
-	if (this_cpu_read(hrtimer_running))
+	if (this_cpu_read(hrtimer_running)) {
 		queue_work(system_unbound_wq, &t->cb.delete_work);
-	else
+		return;
+	}
+
+	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+		/* If the timer is running on other CPU, also use a kworker to
+		 * wait for the completion of the timer instead of trying to
+		 * acquire a sleepable lock in hrtimer_cancel() to wait for its
+		 * completion.
+		 */
+		if (hrtimer_try_to_cancel(&t->timer) >= 0)
+			kfree_rcu(t, cb.rcu);
+		else
+			queue_work(system_unbound_wq, &t->cb.delete_work);
+	} else {
 		bpf_timer_delete_work(&t->cb.delete_work);
+	}
 }
 
 /* This function is called by map_delete/update_elem for individual element and
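For context on the PREEMPT_RT branch above: hrtimer_try_to_cancel()
returns 0 if the timer was not queued, 1 if it was queued and has been
cancelled, and -1 if its callback is currently running and cannot be
stopped. The snippet below (not part of the patch) merely restates that
decision; only the -1 case needs the kworker, because waiting for the
running callback via hrtimer_cancel() may sleep:

	/* Restatement of the PREEMPT_RT branch above, assuming the same 't'. */
	switch (hrtimer_try_to_cancel(&t->timer)) {
	case 0:		/* timer was not queued */
	case 1:		/* timer was queued and has been cancelled */
		kfree_rcu(t, cb.rcu);
		break;
	case -1:	/* callback is running; defer to a kworker that can sleep */
		queue_work(system_unbound_wq, &t->cb.delete_work);
		break;
	}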
165 changes: 165 additions & 0 deletions tools/testing/selftests/bpf/prog_tests/free_timer.c
@@ -0,0 +1,165 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (C) 2025. Huawei Technologies Co., Ltd */
#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>
#include <test_progs.h>

#include "free_timer.skel.h"

struct run_ctx {
	struct bpf_program *start_prog;
	struct bpf_program *overwrite_prog;
	pthread_barrier_t notify;
	int loop;
	bool start;
	bool stop;
};

static void start_threads(struct run_ctx *ctx)
{
	ctx->start = true;
}

static void stop_threads(struct run_ctx *ctx)
{
	ctx->stop = true;
	/* Guarantee the order between ->stop and ->start */
	__atomic_store_n(&ctx->start, true, __ATOMIC_RELEASE);
}

static int wait_for_start(struct run_ctx *ctx)
{
	while (!__atomic_load_n(&ctx->start, __ATOMIC_ACQUIRE))
		usleep(10);

	return ctx->stop;
}

static void *overwrite_timer_fn(void *arg)
{
	struct run_ctx *ctx = arg;
	int loop, fd, err;
	cpu_set_t cpuset;
	long ret = 0;

	/* Pin on CPU 0 */
	CPU_ZERO(&cpuset);
	CPU_SET(0, &cpuset);
	pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset);

	/* Is the thread being stopped ? */
	err = wait_for_start(ctx);
	if (err)
		return NULL;

	fd = bpf_program__fd(ctx->overwrite_prog);
	loop = ctx->loop;
	while (loop-- > 0) {
		LIBBPF_OPTS(bpf_test_run_opts, opts);

		/* Wait for start thread to complete */
		pthread_barrier_wait(&ctx->notify);

		/* Overwrite timers */
		err = bpf_prog_test_run_opts(fd, &opts);
		if (err)
			ret |= 1;
		else if (opts.retval)
			ret |= 2;

		/* Notify start thread to start timers */
		pthread_barrier_wait(&ctx->notify);
	}

	return (void *)ret;
}

static void *start_timer_fn(void *arg)
{
	struct run_ctx *ctx = arg;
	int loop, fd, err;
	cpu_set_t cpuset;
	long ret = 0;

	/* Pin on CPU 1 */
	CPU_ZERO(&cpuset);
	CPU_SET(1, &cpuset);
	pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset);

	/* Is the thread being stopped ? */
	err = wait_for_start(ctx);
	if (err)
		return NULL;

	fd = bpf_program__fd(ctx->start_prog);
	loop = ctx->loop;
	while (loop-- > 0) {
		LIBBPF_OPTS(bpf_test_run_opts, opts);

		/* Run the prog to start timer */
		err = bpf_prog_test_run_opts(fd, &opts);
		if (err)
			ret |= 4;
		else if (opts.retval)
			ret |= 8;

		/* Notify overwrite thread to do overwrite */
		pthread_barrier_wait(&ctx->notify);

		/* Wait for overwrite thread to complete */
		pthread_barrier_wait(&ctx->notify);
	}

	return (void *)ret;
}

void test_free_timer(void)
{
	struct free_timer *skel;
	struct bpf_program *prog;
	struct run_ctx ctx;
	pthread_t tid[2];
	void *ret;
	int err;

	skel = free_timer__open_and_load();
	if (!ASSERT_OK_PTR(skel, "open_load"))
		return;

	memset(&ctx, 0, sizeof(ctx));

	prog = bpf_object__find_program_by_name(skel->obj, "start_timer");
	if (!ASSERT_OK_PTR(prog, "find start prog"))
		goto out;
	ctx.start_prog = prog;

	prog = bpf_object__find_program_by_name(skel->obj, "overwrite_timer");
	if (!ASSERT_OK_PTR(prog, "find overwrite prog"))
		goto out;
	ctx.overwrite_prog = prog;

	pthread_barrier_init(&ctx.notify, NULL, 2);
	ctx.loop = 10;

	err = pthread_create(&tid[0], NULL, start_timer_fn, &ctx);
	if (!ASSERT_OK(err, "create start_timer"))
		goto out;

	err = pthread_create(&tid[1], NULL, overwrite_timer_fn, &ctx);
	if (!ASSERT_OK(err, "create overwrite_timer")) {
		stop_threads(&ctx);
		goto out;
	}

	start_threads(&ctx);

	ret = NULL;
	err = pthread_join(tid[0], &ret);
	ASSERT_EQ(err | (long)ret, 0, "start_timer");
	ret = NULL;
	err = pthread_join(tid[1], &ret);
	ASSERT_EQ(err | (long)ret, 0, "overwrite_timer");
out:
	free_timer__destroy(skel);
}
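
Assuming the usual BPF selftests flow, this test should be runnable on
its own from tools/testing/selftests/bpf with something like
"./test_progs -t free_timer"; the underlying bug is only expected to
produce a splat on a PREEMPT_RT kernel.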
71 changes: 71 additions & 0 deletions tools/testing/selftests/bpf/progs/free_timer.c
@@ -0,0 +1,71 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (C) 2025. Huawei Technologies Co., Ltd */
#include <linux/bpf.h>
#include <time.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_helpers.h>

#define MAX_ENTRIES 8

struct map_value {
	struct bpf_timer timer;
};

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, int);
	__type(value, struct map_value);
	__uint(max_entries, MAX_ENTRIES);
} map SEC(".maps");

static int timer_cb(void *map, void *key, struct map_value *value)
{
	volatile int sum = 0;
	int i;

	bpf_for(i, 0, 1024 * 1024) sum += i;

	return 0;
}

static int start_cb(int key)
{
	struct map_value *value;

	value = bpf_map_lookup_elem(&map, (void *)&key);
	if (!value)
		return 0;

	bpf_timer_init(&value->timer, &map, CLOCK_MONOTONIC);
	bpf_timer_set_callback(&value->timer, timer_cb);
	/* Hope 100us will be enough to wake-up and run the overwrite thread */
	bpf_timer_start(&value->timer, 100000, BPF_F_TIMER_CPU_PIN);

	return 0;
}

static int overwrite_cb(int key)
{
	struct map_value zero = {};

	/* Free the timer which may run on other CPU */
	bpf_map_update_elem(&map, (void *)&key, &zero, BPF_ANY);

	return 0;
}

SEC("syscall")
int BPF_PROG(start_timer)
{
	bpf_loop(MAX_ENTRIES, start_cb, NULL, 0);
	return 0;
}

SEC("syscall")
int BPF_PROG(overwrite_timer)
{
	bpf_loop(MAX_ENTRIES, overwrite_cb, NULL, 0);
	return 0;
}

char _license[] SEC("license") = "GPL";
