Merge branch 'BPF open-coded iterators'
Andrii Nakryiko says:

====================

Add support for open-coded (aka inline) iterators in the BPF world. This is the
next evolution of gradually allowing more powerful and less restrictive looping
and iteration capabilities in BPF programs.

We set up a framework for implementing all kinds of iterators (e.g., cgroup,
task, file, etc.), but this patch set only implements the numbers iterator,
which is used to implement the ergonomic bpf_for() for-like construct
(see patches #4-#5). We also add bpf_for_each(), a generic foreach-like
construct that will work with any kind of open-coded iterator implementation,
as long as it sticks to the bpf_iter_<type>_{new,next,destroy}() naming
pattern (which we now enforce on the kernel side).
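
To make the intended usage concrete, here is a minimal BPF-side sketch (not
part of this patch set's diff), assuming the bpf_for() macro from bpf_misc.h
and the bpf_iter_num_* kfuncs added below; the explicit new/next/destroy
sequence is roughly what the macro expands to:

	struct bpf_iter_num it;
	int i, *v, sum = 0;

	/* ergonomic form provided by the bpf_for() macro */
	bpf_for(i, 0, 100)
		sum += i;

	/* roughly equivalent explicit form that the verifier tracks */
	bpf_iter_num_new(&it, 0, 100);
	while ((v = bpf_iter_num_next(&it)))
		sum += *v;
	bpf_iter_num_destroy(&it);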

Patch #1 is preparatory refactoring that provides an easier way to check for
special kfunc calls. Patch #2 adds iterator kfunc registration and validation
logic, which is mostly independent from the rest of the open-coded iterator
logic, so it is separated out for easier reviewing.

The meat of the verifier-side logic is in patch #3. Patch #4 implements the
numbers iterator. I kept them separate to have a clean reference for how to
integrate new iterator types (now even simpler to do than in v1 of this patch
set). Patch #5 adds bpf_for(), bpf_for_each(), and bpf_repeat() macros to
bpf_misc.h, and also adds yet another pyperf test variant, now with a bpf_for()
loop. Patch #6 adds verification tests based on the numbers iterator (the only
one available right now). Patch #7 tests the actual runtime behavior of the
numbers iterator.

Finally, with the changes in v2, it's possible and trivial to implement custom
iterators completely in kernel modules, which we showcase and test by adding to
bpf_testmod a simple iterator that returns the same number a given number of
times. Patch #8 is where all this happens and is tested.
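
As a sketch of the module-side pattern (hypothetical iterator name; the actual
bpf_testmod code in patch #8 differs in detail), a custom iterator needs only a
properly named state struct plus new/next/destroy kfuncs registered with the
KF_ITER_* flags:

	/* opaque state; its size must be a multiple of 8, per kernel-side checks */
	struct bpf_iter_scalar_seq {
		__u64 __opaque[2];
	} __attribute__((aligned(8)));

	__bpf_kfunc int bpf_iter_scalar_seq_new(struct bpf_iter_scalar_seq *it, s64 value, int cnt);
	__bpf_kfunc s64 *bpf_iter_scalar_seq_next(struct bpf_iter_scalar_seq *it);
	__bpf_kfunc void bpf_iter_scalar_seq_destroy(struct bpf_iter_scalar_seq *it);

	BTF_SET8_START(module_iter_ids)
	BTF_ID_FLAGS(func, bpf_iter_scalar_seq_new, KF_ITER_NEW)
	BTF_ID_FLAGS(func, bpf_iter_scalar_seq_next, KF_ITER_NEXT | KF_RET_NULL)
	BTF_ID_FLAGS(func, bpf_iter_scalar_seq_destroy, KF_ITER_DESTROY)
	BTF_SET8_END(module_iter_ids)
	/* the set is then registered via register_btf_kfunc_id_set() as usual */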

Most of the relevant details are in the corresponding commit messages or code
comments.

v4->v5:
  - fixed a missed inner for() loop in is_iter_reg_valid_uninit and a wrong
    return false (kernel test robot);
  - typo fixes and comment/commit description improvements throughout the
    patch set;
v3->v4:
  - remove unused variable from is_iter_reg_valid_init (kernel test robot);
v2->v3:
  - remove special kfunc leftovers for bpf_iter_num_{new,next,destroy};
  - add iters/testmod_seq* to DENYLIST.s390x, as s390x doesn't support kfuncs
    in modules yet (CI);
v1->v2:
  - rebased on latest, dropping previously landed preparatory patches;
  - each iterator type now has its own `struct bpf_iter_<type>`, which allows
    each iterator implementation to use exactly as much stack space as
    necessary, avoiding runtime allocations (Alexei);
  - reworked how iterator kfuncs are defined; no verifier changes are required
    when adding a new iterator type;
  - added a bpf_testmod-based iterator implementation;
  - addressed the rest of the feedback: comments, commit message adjustments,
    etc.

Cc: Tejun Heo <[email protected]>
====================

Signed-off-by: Alexei Starovoitov <[email protected]>
Alexei Starovoitov committed Mar 9, 2023
2 parents ed69e06 + 7e86a8c commit 23e403b
Showing 25 changed files with 2,790 additions and 55 deletions.
8 changes: 6 additions & 2 deletions include/linux/bpf.h
@@ -1617,8 +1617,12 @@ struct bpf_array {
#define BPF_COMPLEXITY_LIMIT_INSNS 1000000 /* yes. 1M insns */
#define MAX_TAIL_CALL_CNT 33

/* Maximum number of loops for bpf_loop and bpf_iter_num.
 * It's enum to expose it (and thus make it discoverable) through BTF.
 */
enum {
	BPF_MAX_LOOPS = 8 * 1024 * 1024,
};

#define BPF_F_ACCESS_MASK	(BPF_F_RDONLY | \
				 BPF_F_RDONLY_PROG | \
25 changes: 25 additions & 0 deletions include/linux/bpf_verifier.h
@@ -59,6 +59,14 @@ struct bpf_active_lock {
	u32 id;
};

#define ITER_PREFIX "bpf_iter_"

enum bpf_iter_state {
	BPF_ITER_STATE_INVALID, /* for non-first slot */
	BPF_ITER_STATE_ACTIVE,
	BPF_ITER_STATE_DRAINED,
};

struct bpf_reg_state {
	/* Ordering of fields matters. See states_equal() */
	enum bpf_reg_type type;
@@ -103,6 +111,18 @@ struct bpf_reg_state {
			bool first_slot;
		} dynptr;

		/* For bpf_iter stack slots */
		struct {
			/* BTF container and BTF type ID describing
			 * struct bpf_iter_<type> of an iterator state
			 */
			struct btf *btf;
			u32 btf_id;
			/* packing following two fields to fit iter state into 16 bytes */
			enum bpf_iter_state state:2;
			int depth:30;
		} iter;

		/* Max size from any of the above. */
		struct {
			unsigned long raw1;
@@ -141,6 +161,8 @@ struct bpf_reg_state {
	 * same reference to the socket, to determine proper reference freeing.
	 * For stack slots that are dynptrs, this is used to track references to
	 * the dynptr to determine proper reference freeing.
	 * Similarly to dynptrs, we use ID to track "belonging" of a reference
	 * to a specific instance of bpf_iter.
	 */
	u32 id;
	/* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned
@@ -211,9 +233,11 @@ enum bpf_stack_slot_type {
	 * is stored in bpf_stack_state->spilled_ptr.dynptr.type
	 */
	STACK_DYNPTR,
	STACK_ITER,
};

#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */

#define BPF_DYNPTR_SIZE sizeof(struct bpf_dynptr_kern)
#define BPF_DYNPTR_NR_SLOTS (BPF_DYNPTR_SIZE / BPF_REG_SIZE)

@@ -448,6 +472,7 @@ struct bpf_insn_aux_data {
	bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */
	bool zext_dst; /* this insn zero extends dst reg */
	bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */
	bool is_iter_next; /* bpf_iter_<type>_next() kfunc call */
	u8 alu_state; /* used in combination with alu_limit */

	/* below fields are initialized once */
4 changes: 4 additions & 0 deletions include/linux/btf.h
@@ -71,6 +71,10 @@
#define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */
#define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */
#define KF_RCU (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */
/* only one of KF_ITER_{NEW,NEXT,DESTROY} could be specified per kfunc */
#define KF_ITER_NEW (1 << 8) /* kfunc implements BPF iter constructor */
#define KF_ITER_NEXT (1 << 9) /* kfunc implements BPF iter next method */
#define KF_ITER_DESTROY (1 << 10) /* kfunc implements BPF iter destructor */

/*
* Tag marking a kernel function as a kfunc. This is meant to minimize the
8 changes: 8 additions & 0 deletions include/uapi/linux/bpf.h
@@ -7112,4 +7112,12 @@ enum {
	BPF_F_TIMER_ABS = (1ULL << 0),
};

/* BPF numbers iterator state */
struct bpf_iter_num {
	/* opaque iterator state; having __u64 here allows to preserve correct
	 * alignment requirements in vmlinux.h, generated from BTF
	 */
	__u64 __opaque[1];
} __attribute__((aligned(8)));

#endif /* _UAPI__LINUX_BPF_H__ */
70 changes: 70 additions & 0 deletions kernel/bpf/bpf_iter.c
@@ -776,3 +776,73 @@ const struct bpf_func_proto bpf_loop_proto = {
	.arg3_type	= ARG_PTR_TO_STACK_OR_NULL,
	.arg4_type	= ARG_ANYTHING,
};

struct bpf_iter_num_kern {
	int cur; /* current value, inclusive */
	int end; /* final value, exclusive */
} __aligned(8);

__diag_push();
__diag_ignore_all("-Wmissing-prototypes",
"Global functions as their definitions will be in vmlinux BTF");

__bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end)
{
	struct bpf_iter_num_kern *s = (void *)it;

	BUILD_BUG_ON(sizeof(struct bpf_iter_num_kern) != sizeof(struct bpf_iter_num));
	BUILD_BUG_ON(__alignof__(struct bpf_iter_num_kern) != __alignof__(struct bpf_iter_num));

	BTF_TYPE_EMIT(struct bpf_iter_num);

	/* start == end is legit, it's an empty range and we'll just get NULL
	 * on first (and any subsequent) bpf_iter_num_next() call
	 */
	if (start > end) {
		s->cur = s->end = 0;
		return -EINVAL;
	}

	/* avoid overflows, e.g., if start == INT_MIN and end == INT_MAX */
	if ((s64)end - (s64)start > BPF_MAX_LOOPS) {
		s->cur = s->end = 0;
		return -E2BIG;
	}

	/* user will call bpf_iter_num_next() first,
	 * which will set s->cur to exactly start value;
	 * underflow shouldn't matter
	 */
	s->cur = start - 1;
	s->end = end;

	return 0;
}

__bpf_kfunc int *bpf_iter_num_next(struct bpf_iter_num *it)
{
	struct bpf_iter_num_kern *s = (void *)it;

	/* check failed initialization or if we are done (same behavior);
	 * need to be careful about overflow, so convert to s64 for checks,
	 * e.g., if s->cur == s->end == INT_MAX, we can't just do
	 * s->cur + 1 >= s->end
	 */
	if ((s64)(s->cur + 1) >= s->end) {
		s->cur = s->end = 0;
		return NULL;
	}

	s->cur++;

	return &s->cur;
}

__bpf_kfunc void bpf_iter_num_destroy(struct bpf_iter_num *it)
{
	struct bpf_iter_num_kern *s = (void *)it;

	s->cur = s->end = 0;
}

__diag_pop();
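
As an illustrative walk-through (not part of the diff) of the semantics above,
iterating the range [2, 4):

	struct bpf_iter_num it;
	int *v;

	bpf_iter_num_new(&it, 2, 4);	/* sets cur = 1, end = 4, returns 0 */
	v = bpf_iter_num_next(&it);	/* cur becomes 2, returns &cur */
	v = bpf_iter_num_next(&it);	/* cur becomes 3, returns &cur */
	v = bpf_iter_num_next(&it);	/* 3 + 1 >= 4: state zeroed, returns NULL */
	bpf_iter_num_destroy(&it);	/* zeroes state; safe on drained/failed iter */
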
112 changes: 111 additions & 1 deletion kernel/bpf/btf.c
@@ -7596,6 +7596,108 @@ BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE)
BTF_TRACING_TYPE_xxx
#undef BTF_TRACING_TYPE

static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name,
				 const struct btf_type *func, u32 func_flags)
{
	u32 flags = func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY);
	const char *name, *sfx, *iter_name;
	const struct btf_param *arg;
	const struct btf_type *t;
	char exp_name[128];
	u32 nr_args;

	/* exactly one of KF_ITER_{NEW,NEXT,DESTROY} can be set */
	if (!flags || (flags & (flags - 1)))
		return -EINVAL;

	/* any BPF iter kfunc should have `struct bpf_iter_<type> *` first arg */
	nr_args = btf_type_vlen(func);
	if (nr_args < 1)
		return -EINVAL;

	arg = &btf_params(func)[0];
	t = btf_type_skip_modifiers(btf, arg->type, NULL);
	if (!t || !btf_type_is_ptr(t))
		return -EINVAL;
	t = btf_type_skip_modifiers(btf, t->type, NULL);
	if (!t || !__btf_type_is_struct(t))
		return -EINVAL;

	name = btf_name_by_offset(btf, t->name_off);
	if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1))
		return -EINVAL;

	/* sizeof(struct bpf_iter_<type>) should be a multiple of 8 to
	 * fit nicely in stack slots
	 */
	if (t->size == 0 || (t->size % 8))
		return -EINVAL;

	/* validate bpf_iter_<type>_{new,next,destroy}(struct bpf_iter_<type> *)
	 * naming pattern
	 */
	iter_name = name + sizeof(ITER_PREFIX) - 1;
	if (flags & KF_ITER_NEW)
		sfx = "new";
	else if (flags & KF_ITER_NEXT)
		sfx = "next";
	else /* (flags & KF_ITER_DESTROY) */
		sfx = "destroy";

	snprintf(exp_name, sizeof(exp_name), "bpf_iter_%s_%s", iter_name, sfx);
	if (strcmp(func_name, exp_name))
		return -EINVAL;

	/* only iter constructor should have extra arguments */
	if (!(flags & KF_ITER_NEW) && nr_args != 1)
		return -EINVAL;

	if (flags & KF_ITER_NEXT) {
		/* bpf_iter_<type>_next() should return pointer */
		t = btf_type_skip_modifiers(btf, func->type, NULL);
		if (!t || !btf_type_is_ptr(t))
			return -EINVAL;
	}

	if (flags & KF_ITER_DESTROY) {
		/* bpf_iter_<type>_destroy() should return void */
		t = btf_type_by_id(btf, func->type);
		if (!t || !btf_type_is_void(t))
			return -EINVAL;
	}

	return 0;
}
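
To make the checks above concrete, here are illustrative prototypes (for a
hypothetical bpf_iter_foo type, not part of this diff) that would pass
validation; deviating from the naming pattern, the first argument, or the
required return types makes registration fail with -EINVAL:

	struct bpf_iter_foo { __u64 __opaque[1]; } __attribute__((aligned(8)));

	int  bpf_iter_foo_new(struct bpf_iter_foo *it, int cfg);	/* KF_ITER_NEW: extra args allowed */
	int *bpf_iter_foo_next(struct bpf_iter_foo *it);		/* KF_ITER_NEXT: must return a pointer */
	void bpf_iter_foo_destroy(struct bpf_iter_foo *it);		/* KF_ITER_DESTROY: must return void */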

static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags)
{
	const struct btf_type *func;
	const char *func_name;
	int err;

	/* any kfunc should be FUNC -> FUNC_PROTO */
	func = btf_type_by_id(btf, func_id);
	if (!func || !btf_type_is_func(func))
		return -EINVAL;

	/* sanity check kfunc name */
	func_name = btf_name_by_offset(btf, func->name_off);
	if (!func_name || !func_name[0])
		return -EINVAL;

	func = btf_type_by_id(btf, func->type);
	if (!func || !btf_type_is_func_proto(func))
		return -EINVAL;

	if (func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY)) {
		err = btf_check_iter_kfuncs(btf, func_name, func, func_flags);
		if (err)
			return err;
	}

	return 0;
}

/* Kernel Function (kfunc) BTF ID set registration API */

static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
@@ -7772,7 +7874,7 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
				       const struct btf_kfunc_id_set *kset)
{
	struct btf *btf;
	int ret, i;

	btf = btf_get_module_btf(kset->owner);
	if (!btf) {
@@ -7789,7 +7891,15 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
	if (IS_ERR(btf))
		return PTR_ERR(btf);

	for (i = 0; i < kset->set->cnt; i++) {
		ret = btf_check_kfunc_protos(btf, kset->set->pairs[i].id,
					     kset->set->pairs[i].flags);
		if (ret)
			goto err_out;
	}

	ret = btf_populate_kfunc_set(btf, hook, kset->set);
err_out:
	btf_put(btf);
	return ret;
}
3 changes: 3 additions & 0 deletions kernel/bpf/helpers.c
@@ -2411,6 +2411,9 @@ BTF_ID_FLAGS(func, bpf_rcu_read_lock)
BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
BTF_SET8_END(common_btf_ids)

static const struct btf_kfunc_id_set common_kfunc_set = {