@@ -1379,10 +1379,17 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
u8 *prog = *pprog;
int cnt = 0;
- if (emit_call(&prog, __bpf_prog_enter, prog))
- return -EINVAL;
- /* remember prog start time returned by __bpf_prog_enter */
- emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
+ if (p->aux->sleepable) {
+ if (emit_call(&prog, __bpf_prog_enter_sleepable, prog))
+ return -EINVAL;
+ /* remember srcu idx returned by __bpf_prog_enter_sleepable */
+ emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
+ } else {
+ if (emit_call(&prog, __bpf_prog_enter, prog))
+ return -EINVAL;
+ /* remember prog start time returned by __bpf_prog_enter */
+ emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
+ }
/* arg1: lea rdi, [rbp - stack_size] */
EMIT4(0x48, 0x8D, 0x7D, -stack_size);
@@ -1402,13 +1409,20 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
if (mod_ret)
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
- /* arg1: mov rdi, progs[i] */
- emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32,
- (u32) (long) p);
- /* arg2: mov rsi, rbx <- start time in nsec */
- emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
- if (emit_call(&prog, __bpf_prog_exit, prog))
- return -EINVAL;
+ if (p->aux->sleepable) {
+ /* arg1: mov rdi, rbx <- srcu idx */
+ emit_mov_reg(&prog, true, BPF_REG_1, BPF_REG_6);
+ if (emit_call(&prog, __bpf_prog_exit_sleepable, prog))
+ return -EINVAL;
+ } else {
+ /* arg1: mov rdi, progs[i] */
+ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32,
+ (u32) (long) p);
+ /* arg2: mov rsi, rbx <- start time in nsec */
+ emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
+ if (emit_call(&prog, __bpf_prog_exit, prog))
+ return -EINVAL;
+ }
*pprog = prog;
return 0;
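For readability, the instruction sequence emitted above for one sleepable program p is roughly equivalent to the following C sketch (illustrative only: args_on_stack stands in for the [rbp - stack_size] argument area, and the mod_ret handling is elided):

	u64 idx, ret;

	/* srcu_read_lock(&bpf_srcu); the returned idx is kept in rbx (BPF_REG_6) */
	idx = __bpf_prog_enter_sleepable();
	/* call the JITed program, or the interpreter via p->insnsi */
	ret = p->bpf_func(args_on_stack, p->insnsi);
	/* srcu_read_unlock(&bpf_srcu, idx) */
	__bpf_prog_exit_sleepable(idx);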
@@ -36,6 +36,7 @@ struct seq_operations;
extern struct idr btf_idr;
extern spinlock_t btf_idr_lock;
+extern struct srcu_struct bpf_srcu;
/* map is generic key/value storage optionally accesible by eBPF programs */
struct bpf_map_ops {
@@ -476,6 +477,8 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end,
/* these two functions are called from generated trampoline */
u64 notrace __bpf_prog_enter(void);
void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start);
+u64 notrace __bpf_prog_enter_sleepable(void);
+void notrace __bpf_prog_exit_sleepable(u64 idx);
struct bpf_ksym {
unsigned long start;
@@ -668,6 +671,7 @@ struct bpf_prog_aux {
bool offload_requested;
bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */
bool func_proto_unreliable;
+ bool sleepable;
enum bpf_tramp_prog_type trampoline_prog_type;
struct bpf_trampoline *trampoline;
struct hlist_node tramp_hlist;
@@ -329,6 +329,14 @@ enum bpf_link_type {
/* The verifier internal test flag. Behavior is undefined */
#define BPF_F_TEST_STATE_FREQ (1U << 3)
+/* If BPF_F_SLEEPABLE is used in BPF_PROG_LOAD command, the verifier will
+ * restrict map and helper usage for such programs. Sleepable BPF programs can
+ * only be attached to hooks where kernel execution context allows sleeping.
+ * Such programs are allowed to use helpers that may sleep, such as
+ * bpf_copy_from_user().
+ */
+#define BPF_F_SLEEPABLE (1U << 4)
+
/* When BPF ldimm64's insn[0].src_reg != 0 then this can have
* two extensions:
*
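For illustration, a minimal userspace sketch of passing the new flag to BPF_PROG_LOAD through the bpf(2) syscall; load_sleepable_prog() is a hypothetical helper, the program body and the BTF lookup that produces attach_btf_id are elided, and the updated uapi header above is assumed:

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int load_sleepable_prog(const struct bpf_insn *insns, __u32 insn_cnt,
			       __u32 attach_btf_id)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type = BPF_PROG_TYPE_TRACING;
	attr.expected_attach_type = BPF_TRACE_FENTRY;
	attr.attach_btf_id = attach_btf_id;	/* resolved from vmlinux BTF by the caller */
	attr.insns = (__u64)(unsigned long)insns;
	attr.insn_cnt = insn_cnt;
	attr.license = (__u64)(unsigned long)"GPL";
	attr.prog_flags = BPF_F_SLEEPABLE;	/* request the sleepable verifier mode */

	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}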
@@ -393,6 +393,11 @@ static void array_map_free(struct bpf_map *map)
*/
synchronize_rcu();
+ /* arrays could have been used by both sleepable and non-sleepable bpf
+ * progs. Make sure to wait for both prog types to finish executing.
+ */
+ synchronize_srcu(&bpf_srcu);
+
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
bpf_array_free_percpu(array);
@@ -577,8 +577,8 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
struct htab_elem *l;
u32 hash, key_size;
- /* Must be called with rcu_read_lock. */
- WARN_ON_ONCE(!rcu_read_lock_held());
+ /* Must be called with rcu_read_lock or srcu_read_lock. */
+ WARN_ON_ONCE(!rcu_read_lock_held() && !srcu_read_lock_held(&bpf_srcu));
key_size = map->key_size;
@@ -935,7 +935,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !srcu_read_lock_held(&bpf_srcu));
key_size = map->key_size;
@@ -1026,7 +1026,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !srcu_read_lock_held(&bpf_srcu));
key_size = map->key_size;
@@ -1214,7 +1214,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
u32 hash, key_size;
int ret = -ENOENT;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !srcu_read_lock_held(&bpf_srcu));
key_size = map->key_size;
@@ -1246,7 +1246,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
u32 hash, key_size;
int ret = -ENOENT;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !srcu_read_lock_held(&bpf_srcu));
key_size = map->key_size;
@@ -1297,6 +1297,13 @@ static void htab_map_free(struct bpf_map *map)
*/
synchronize_rcu();
+ /* preallocated hash map could have been used by both sleepable and
+ * non-sleepable bpf progs. Make sure to wait for both prog types
+ * to finish executing.
+ */
+ if (htab_is_prealloc(htab))
+ synchronize_srcu(&bpf_srcu);
+
/* some of free_htab_elem() callbacks for elements of this map may
* not have executed. Wait for them.
*/
@@ -1725,10 +1725,14 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
btf_put(prog->aux->btf);
bpf_prog_free_linfo(prog);
- if (deferred)
- call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
- else
+ if (deferred) {
+ if (prog->aux->sleepable)
+ call_srcu(&bpf_srcu, &prog->aux->rcu, __bpf_prog_put_rcu);
+ else
+ call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
+ } else {
__bpf_prog_put_rcu(&prog->aux->rcu);
+ }
}
static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
@@ -2093,6 +2097,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
BPF_F_ANY_ALIGNMENT |
BPF_F_TEST_STATE_FREQ |
+ BPF_F_SLEEPABLE |
BPF_F_TEST_RND_HI32))
return -EINVAL;
@@ -2148,6 +2153,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
}
prog->aux->offload_requested = !!attr->prog_ifindex;
+ prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
err = security_bpf_prog_alloc(prog->aux);
if (err)
@@ -205,13 +205,21 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs)
flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
+ /* the same trampoline can hold both sleepable and non-sleepable progs.
+ * synchronize_srcu() is cheap when sleepable progs are not running, so
+ * just call it here to make sure all sleepable progs finish executing.
+ * Then call synchronize_rcu_tasks() to make sure that the rest of the
+ * generated trampoline assembly finishes as well before updating the
+ * trampoline.
+ */
+ synchronize_srcu(&bpf_srcu);
+
/* Though the second half of trampoline page is unused a task could be
* preempted in the middle of the first half of trampoline and two
* updates to trampoline would change the code from underneath the
* preempted task. Hence wait for tasks to voluntarily schedule or go
* to userspace.
*/
-
synchronize_rcu_tasks();
err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2,
@@ -344,6 +352,11 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT])))
goto out;
bpf_image_ksym_del(&tr->ksym);
+ /* This code is reached only after all bpf progs (both sleepable and
+ * non-sleepable) have gone through
+ * bpf_prog_put()->call_rcu()/call_srcu()->bpf_prog_free_deferred().
+ * Hence there is no need for another synchronize_srcu(&bpf_srcu) here.
+ */
/* wait for tasks to get out of trampoline before freeing it */
synchronize_rcu_tasks();
bpf_jit_free_exec(tr->image);
@@ -394,6 +407,23 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
rcu_read_unlock();
}
+/* When the bpf_srcu lock is held, some sleepable bpf program is running.
+ * Such programs can use bpf arrays and preallocated hash maps. On free, these
+ * map types wait for sleepable programs to complete via
+ * synchronize_srcu(&bpf_srcu).
+ */
+struct srcu_struct bpf_srcu;
+
+u64 notrace __bpf_prog_enter_sleepable(void)
+{
+ return srcu_read_lock(&bpf_srcu);
+}
+
+void notrace __bpf_prog_exit_sleepable(u64 idx)
+{
+ srcu_read_unlock(&bpf_srcu, idx);
+}
+
int __weak
arch_prepare_bpf_trampoline(void *image, void *image_end,
const struct btf_func_model *m, u32 flags,
@@ -409,6 +439,7 @@ static int __init init_trampolines(void)
for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++)
INIT_HLIST_HEAD(&trampoline_table[i]);
+ init_srcu_struct(&bpf_srcu);
return 0;
}
late_initcall(init_trampolines);
@@ -8922,6 +8922,23 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
return -EINVAL;
}
+ if (prog->aux->sleepable)
+ switch (map->map_type) {
+ case BPF_MAP_TYPE_HASH:
+ case BPF_MAP_TYPE_LRU_HASH:
+ case BPF_MAP_TYPE_ARRAY:
+ if (!is_preallocated_map(map)) {
+ verbose(env,
+ "Sleepable programs can only use preallocated hash maps\n");
+ return -EINVAL;
+ }
+ break;
+ default:
+ verbose(env,
+ "Sleepable programs can only use array and hash maps\n");
+ return -EINVAL;
+ }
+
return 0;
}
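In practice this means a sleepable program may use a hash map only when it is preallocated, i.e. created without BPF_F_NO_PREALLOC (the default). A hypothetical raw-syscall sketch of creating such a map:

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int create_prealloc_hash(void)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_HASH;
	attr.key_size = sizeof(__u32);
	attr.value_size = sizeof(__u64);
	attr.max_entries = 1024;
	attr.map_flags = 0;	/* no BPF_F_NO_PREALLOC, so elements are preallocated */

	return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}

The same map created with attr.map_flags = BPF_F_NO_PREALLOC would be rejected by the check above when used from a sleepable program.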
@@ -10532,6 +10549,22 @@ static int check_attach_modify_return(struct bpf_prog *prog, unsigned long addr)
return -EINVAL;
}
+/* Blacklist of non-sleepable kernel functions that bpf_lsm or fmod_ret
+ * progs would otherwise be allowed to attach to.
+ */
+static int check_sleepable_blacklist(unsigned long addr)
+{
+#ifdef CONFIG_BPF_LSM
+ if (addr == (long)bpf_lsm_task_free)
+ return -EINVAL;
+#endif
+#ifdef CONFIG_SECURITY
+ if (addr == (long)security_task_free)
+ return -EINVAL;
+#endif
+ return 0;
+}
+
static int check_attach_btf_id(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog;
@@ -10549,6 +10582,12 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
long addr;
u64 key;
+ if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING &&
+ prog->type != BPF_PROG_TYPE_LSM) {
+ verbose(env, "Only fentry/fexit/fmod_ret and lsm programs can be sleepable\n");
+ return -EINVAL;
+ }
+
if (prog->type == BPF_PROG_TYPE_STRUCT_OPS)
return check_struct_ops_btf_id(env);
@@ -10762,8 +10801,29 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
if (ret)
verbose(env, "%s() is not modifiable\n",
prog->aux->attach_func_name);
+ } else if (prog->aux->sleepable) {
+ switch (prog->type) {
+ case BPF_PROG_TYPE_TRACING:
+ /* fentry/fexit progs can be sleepable only if they are
+ * attached to ALLOW_ERROR_INJECTION or security_*() funcs.
+ */
+ ret = check_attach_modify_return(prog, addr);
+ if (!ret)
+ ret = check_sleepable_blacklist(addr);
+ break;
+ case BPF_PROG_TYPE_LSM:
+ /* LSM progs are already checked to attach only to bpf_lsm_*() funcs,
+ * which are sleepable as well.
+ */
+ ret = check_sleepable_blacklist(addr);
+ break;
+ default:
+ break;
+ }
+ if (ret)
+ verbose(env, "%s is not sleepable\n",
+ prog->aux->attach_func_name);
}
-
if (ret)
goto out;
tr->func.addr = (void *)addr;
@@ -329,6 +329,14 @@ enum bpf_link_type {
/* The verifier internal test flag. Behavior is undefined */
#define BPF_F_TEST_STATE_FREQ (1U << 3)
+/* If BPF_F_SLEEPABLE is used in BPF_PROG_LOAD command, the verifier will
+ * restrict map and helper usage for such programs. Sleepable BPF programs can
+ * only be attached to hooks where kernel execution context allows sleeping.
+ * Such programs are allowed to use helpers that may sleep, such as
+ * bpf_copy_from_user().
+ */
+#define BPF_F_SLEEPABLE (1U << 4)
+
/* When BPF ldimm64's insn[0].src_reg != 0 then this can have
* two extensions:
*