
[v6,bpf-next,1/4] bpf: introduce task_vma bpf_iter

Message ID 20210212014232.414643-2-songliubraving@fb.com
State New
Series: introduce bpf_iter for task_vma

Commit Message

Song Liu Feb. 12, 2021, 1:42 a.m. UTC
Introduce task_vma bpf_iter to print memory information of a process. It
can be used to print customized information similar to /proc/<pid>/maps.

Currently, /proc/<pid>/maps and /proc/<pid>/smaps provide information
about the vmas of a process. However, this information is not flexible
enough to cover all use cases. For example, if a vma covers a mix of
2MB pages and 4kB pages (x86_64), there is no easy way to tell which
address ranges are backed by 2MB pages. task_vma solves this problem by
enabling the user to generate customized information based on the vma
(and vma->vm_mm, vma->vm_file, etc.).
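
For illustration only, a task_vma program that emits /proc/<pid>/maps
style lines could look roughly like the sketch below. This is not part
of the patch: the program and file names are made up, the VM_* values
are copied from include/linux/mm.h, and struct bpf_iter__task_vma is
assumed to come from a vmlinux.h generated against a kernel with this
patch applied. The real selftest is added in patch 4 of this series.

/* bpf_iter_task_vma_example.bpf.c - illustrative sketch only */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

/* vm_flags bits, copied from include/linux/mm.h */
#define VM_READ		0x00000001
#define VM_WRITE	0x00000002
#define VM_EXEC		0x00000004
#define VM_MAYSHARE	0x00000080

/* tgid to dump, filled in by user space before load */
const volatile int target_pid = 0;

SEC("iter/task_vma")
int dump_vmas(struct bpf_iter__task_vma *ctx)
{
	struct vm_area_struct *vma = ctx->vma;
	struct seq_file *seq = ctx->meta->seq;
	struct task_struct *task = ctx->task;
	char perm[] = "----";

	/* task and vma are PTR_TO_BTF_ID_OR_NULL, check before use */
	if (!task || !vma)
		return 0;
	if (task->tgid != target_pid)
		return 0;

	perm[0] = (vma->vm_flags & VM_READ) ? 'r' : '-';
	perm[1] = (vma->vm_flags & VM_WRITE) ? 'w' : '-';
	perm[2] = (vma->vm_flags & VM_EXEC) ? 'x' : '-';
	perm[3] = (vma->vm_flags & VM_MAYSHARE) ? 's' : 'p';

	BPF_SEQ_PRINTF(seq, "%08llx-%08llx %s\n",
		       vma->vm_start, vma->vm_end, perm);
	return 0;
}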

To access the vma safely from the BPF program, the task_vma iterator
holds the target task's mmap_lock while calling the BPF program. If the
mmap_lock is contended, task_vma drops the mmap_lock between iterations
to unblock the writer(s). This lock-contention avoidance mechanism is
similar to the one used in show_smaps_rollup().
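
On the user-space side, the iterator is attached and then read like a
file; each read() picks up where the previous one stopped, with the
mmap_lock dropped in between. The sketch below is illustrative (it
assumes the made-up skeleton from the example above; error handling is
trimmed for brevity):

/* user-space side, using libbpf and a generated skeleton */
#include <stdio.h>
#include <unistd.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include "bpf_iter_task_vma_example.skel.h"

int main(void)
{
	struct bpf_iter_task_vma_example *skel;
	struct bpf_link *link;
	char buf[4096];
	ssize_t n;
	int iter_fd;

	skel = bpf_iter_task_vma_example__open();
	if (!skel)
		return 1;
	skel->rodata->target_pid = getpid();	/* dump our own vmas */
	if (bpf_iter_task_vma_example__load(skel))
		return 1;

	link = bpf_program__attach_iter(skel->progs.dump_vmas, NULL);
	iter_fd = bpf_iter_create(bpf_link__fd(link));

	/* each read() may cover only part of the vmas */
	while ((n = read(iter_fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);

	close(iter_fd);
	bpf_link__destroy(link);
	bpf_iter_task_vma_example__destroy(skel);
	return 0;
}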

Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 kernel/bpf/task_iter.c | 267 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 266 insertions(+), 1 deletion(-)

Comments

Alexei Starovoitov Feb. 12, 2021, 4:47 a.m. UTC | #1
On Thu, Feb 11, 2021 at 05:42:29PM -0800, Song Liu wrote:
> +static int __task_vma_seq_show(struct seq_file *seq, bool in_stop)
> +{
> +	struct bpf_iter_seq_task_vma_info *info = seq->private;
> +	struct bpf_iter__task_vma ctx;
> +	struct bpf_iter_meta meta;
> +	struct bpf_prog *prog;
> +
> +	meta.seq = seq;
> +	prog = bpf_iter_get_info(&meta, in_stop);
> +	if (!prog)
> +		return 0;
> +
> +	ctx.meta = &meta;
> +	ctx.task = info->task;
> +	ctx.vma = info->vma;
> +	return bpf_iter_run_prog(prog, &ctx);
> +}

That part doesn't match patch 2.
If it needs to call a sleepable prog, it should call it differently,
since bpf_iter_run_prog() does rcu_read_lock().
Try adding the following:
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 845b2168e006..06f65b18bc54 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1159,6 +1159,8 @@ BPF_CALL_3(bpf_d_path, struct path *, path, char *, buf, u32, sz)
        long len;
        char *p;

+       might_sleep();
and you will see a splat from patch 4's task_vma test.
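
For reference, bpf_iter_run_prog() looks roughly like this (trimmed
from kernel/bpf/bpf_iter.c, not part of this patch), which is why a
sleepable program cannot simply be called through it:

int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
{
	int ret;

	rcu_read_lock();
	migrate_disable();
	ret = BPF_PROG_RUN(prog, ctx);
	migrate_enable();
	rcu_read_unlock();

	/* seq_ops->show() expects 0 or a negative error */
	return ret == 0 ? 0 : -EAGAIN;
}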

But this might_sleep() above is not correct. d_path is not about
sleepability; it does its locking under rcu_read_lock().
bpf_d_path_allowed() is confusing, since it checks for LSM sleepable
hooks, but it's not about sleeping. Those hooks are safe because they
don't hold any vfs locks. With bpf iterators the situation is the same:
they are all called from bpf_seq_read(), which runs in user context,
and there are no vfs locks to worry about.
So it's safe to enable all iterators to access bpf_d_path(). There is
no need to introduce sleepable iterators.
The upcoming bpf_for_each_map_elem is kind of an iterator too, and its
map-elem callback shouldn't be able to call bpf_d_path(), but it won't
have expected_attach_type == BPF_TRACE_ITER, so bpf_for_each_map_elem
will be fine the way the last patches were done.
So just drop 'prog->aux->sleepable' from patch 2 and delete patch 3.
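
One way to follow that suggestion (a sketch against
kernel/trace/bpf_trace.c, not the actual follow-up patch) is to let
bpf_d_path_allowed() accept all BPF_TRACE_ITER programs in addition to
the existing allowlist and LSM sleepable-hook checks:

static bool bpf_d_path_allowed(const struct bpf_prog *prog)
{
	/* bpf_iter programs run from bpf_seq_read() in user context
	 * with no vfs locks held, so bpf_d_path() is safe for them.
	 */
	if (prog->type == BPF_PROG_TYPE_TRACING &&
	    prog->expected_attach_type == BPF_TRACE_ITER)
		return true;

	if (prog->type == BPF_PROG_TYPE_LSM)
		return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id);

	return btf_id_set_contains(&btf_allowlist_d_path,
				   prog->aux->attach_btf_id);
}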

Patch

diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 175b7b42bfc46..b68cb5d6d6ebc 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -286,9 +286,248 @@  static const struct seq_operations task_file_seq_ops = {
 	.show	= task_file_seq_show,
 };
 
+struct bpf_iter_seq_task_vma_info {
+	/* The first field must be struct bpf_iter_seq_task_common.
+	 * this is assumed by {init, fini}_seq_pidns() callback functions.
+	 */
+	struct bpf_iter_seq_task_common common;
+	struct task_struct *task;
+	struct vm_area_struct *vma;
+	u32 tid;
+	unsigned long prev_vm_start;
+	unsigned long prev_vm_end;
+};
+
+enum bpf_task_vma_iter_find_op {
+	task_vma_iter_first_vma,   /* use mm->mmap */
+	task_vma_iter_next_vma,    /* use curr_vma->vm_next */
+	task_vma_iter_find_vma,    /* use find_vma() to find next vma */
+};
+
+static struct vm_area_struct *
+task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
+{
+	struct pid_namespace *ns = info->common.ns;
+	enum bpf_task_vma_iter_find_op op;
+	struct vm_area_struct *curr_vma;
+	struct task_struct *curr_task;
+	u32 curr_tid = info->tid;
+
+	/* If this function returns a non-NULL vma, it holds a reference to
+	 * the task_struct, and holds read lock on vma->mm->mmap_lock.
+	 * If this function returns NULL, it does not hold any reference or
+	 * lock.
+	 */
+	if (info->task) {
+		curr_task = info->task;
+		curr_vma = info->vma;
+		/* In case of lock contention, drop mmap_lock to unblock
+		 * the writer.
+		 *
+		 * After relock, call find(mm, prev_vm_end - 1) to find
+		 * new vma to process.
+		 *
+		 *   +------+------+-----------+
+		 *   | VMA1 | VMA2 | VMA3      |
+		 *   +------+------+-----------+
+		 *   |      |      |           |
+		 *  4k     8k     16k         400k
+		 *
+		 * For example, curr_vma == VMA2. Before unlock, we set
+		 *
+		 *    prev_vm_start = 8k
+		 *    prev_vm_end   = 16k
+		 *
+		 * There are a few cases:
+		 *
+		 * 1) VMA2 is freed, but VMA3 exists.
+		 *
+		 *    find_vma() will return VMA3, just process VMA3.
+		 *
+		 * 2) VMA2 still exists.
+		 *
+		 *    find_vma() will return VMA2, process VMA2->next.
+		 *
+		 * 3) no more vma in this mm.
+		 *
+		 *    Process the next task.
+		 *
+		 * 4) find_vma() returns a different vma, VMA2'.
+		 *
+		 *    4.1) If VMA2 covers same range as VMA2', skip VMA2',
+		 *         because we already covered the range;
+		 *    4.2) VMA2 and VMA2' covers different ranges, process
+		 *         VMA2'.
+		 */
+		if (mmap_lock_is_contended(curr_task->mm)) {
+			info->prev_vm_start = curr_vma->vm_start;
+			info->prev_vm_end = curr_vma->vm_end;
+			op = task_vma_iter_find_vma;
+			mmap_read_unlock(curr_task->mm);
+			if (mmap_read_lock_killable(curr_task->mm))
+				goto finish;
+		} else {
+			op = task_vma_iter_next_vma;
+		}
+	} else {
+again:
+		curr_task = task_seq_get_next(ns, &curr_tid, true);
+		if (!curr_task) {
+			info->tid = curr_tid + 1;
+			goto finish;
+		}
+
+		if (curr_tid != info->tid) {
+			info->tid = curr_tid;
+			/* new task, process the first vma */
+			op = task_vma_iter_first_vma;
+		} else {
+			/* Found the same tid, which means the user space
+			 * finished data in previous buffer and read more.
+			 * We dropped mmap_lock before returning to user
+			 * space, so it is necessary to use find_vma() to
+			 * find the next vma to process.
+			 */
+			op = task_vma_iter_find_vma;
+		}
+
+		if (!curr_task->mm)
+			goto next_task;
+
+		if (mmap_read_lock_killable(curr_task->mm))
+			goto finish;
+	}
+
+	switch (op) {
+	case task_vma_iter_first_vma:
+		curr_vma = curr_task->mm->mmap;
+		break;
+	case task_vma_iter_next_vma:
+		curr_vma = curr_vma->vm_next;
+		break;
+	case task_vma_iter_find_vma:
+		/* We dropped mmap_lock so it is necessary to use find_vma
+		 * to find the next vma. This is similar to the  mechanism
+		 * in show_smaps_rollup().
+		 */
+		curr_vma = find_vma(curr_task->mm, info->prev_vm_end - 1);
+		/* case 1) and 4.2) above just use curr_vma */
+
+		/* check for case 2) or case 4.1) above */
+		if (curr_vma &&
+		    curr_vma->vm_start == info->prev_vm_start &&
+		    curr_vma->vm_end == info->prev_vm_end)
+			curr_vma = curr_vma->vm_next;
+		break;
+	}
+	if (!curr_vma) {
+		/* case 3) above, or case 2) 4.1) with vma->next == NULL */
+		mmap_read_unlock(curr_task->mm);
+		goto next_task;
+	}
+	info->task = curr_task;
+	info->vma = curr_vma;
+	return curr_vma;
+
+next_task:
+	put_task_struct(curr_task);
+	info->task = NULL;
+	curr_tid++;
+	goto again;
+
+finish:
+	if (curr_task)
+		put_task_struct(curr_task);
+	info->task = NULL;
+	info->vma = NULL;
+	return NULL;
+}
+
+static void *task_vma_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct bpf_iter_seq_task_vma_info *info = seq->private;
+	struct vm_area_struct *vma;
+
+	vma = task_vma_seq_get_next(info);
+	if (vma && *pos == 0)
+		++*pos;
+
+	return vma;
+}
+
+static void *task_vma_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct bpf_iter_seq_task_vma_info *info = seq->private;
+
+	++*pos;
+	return task_vma_seq_get_next(info);
+}
+
+struct bpf_iter__task_vma {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct task_struct *, task);
+	__bpf_md_ptr(struct vm_area_struct *, vma);
+};
+
+DEFINE_BPF_ITER_FUNC(task_vma, struct bpf_iter_meta *meta,
+		     struct task_struct *task, struct vm_area_struct *vma)
+
+static int __task_vma_seq_show(struct seq_file *seq, bool in_stop)
+{
+	struct bpf_iter_seq_task_vma_info *info = seq->private;
+	struct bpf_iter__task_vma ctx;
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, in_stop);
+	if (!prog)
+		return 0;
+
+	ctx.meta = &meta;
+	ctx.task = info->task;
+	ctx.vma = info->vma;
+	return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int task_vma_seq_show(struct seq_file *seq, void *v)
+{
+	return __task_vma_seq_show(seq, false);
+}
+
+static void task_vma_seq_stop(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_seq_task_vma_info *info = seq->private;
+
+	if (!v) {
+		(void)__task_vma_seq_show(seq, true);
+	} else {
+		/* info->vma has not been seen by the BPF program. If the
+		 * user space reads more, task_vma_seq_get_next should
+		 * return this vma again. Set prev_vm_start to ~0UL,
+		 * so that we don't skip the vma returned by the next
+		 * find_vma() (case task_vma_iter_find_vma in
+		 * task_vma_seq_get_next()).
+		 */
+		info->prev_vm_start = ~0UL;
+		info->prev_vm_end = info->vma->vm_end;
+		mmap_read_unlock(info->task->mm);
+		put_task_struct(info->task);
+		info->task = NULL;
+	}
+}
+
+static const struct seq_operations task_vma_seq_ops = {
+	.start	= task_vma_seq_start,
+	.next	= task_vma_seq_next,
+	.stop	= task_vma_seq_stop,
+	.show	= task_vma_seq_show,
+};
+
 BTF_ID_LIST(btf_task_file_ids)
 BTF_ID(struct, task_struct)
 BTF_ID(struct, file)
+BTF_ID(struct, vm_area_struct)
 
 static const struct bpf_iter_seq_info task_seq_info = {
 	.seq_ops		= &task_seq_ops,
@@ -328,6 +567,26 @@  static struct bpf_iter_reg task_file_reg_info = {
 	.seq_info		= &task_file_seq_info,
 };
 
+static const struct bpf_iter_seq_info task_vma_seq_info = {
+	.seq_ops		= &task_vma_seq_ops,
+	.init_seq_private	= init_seq_pidns,
+	.fini_seq_private	= fini_seq_pidns,
+	.seq_priv_size		= sizeof(struct bpf_iter_seq_task_vma_info),
+};
+
+static struct bpf_iter_reg task_vma_reg_info = {
+	.target			= "task_vma",
+	.feature		= BPF_ITER_RESCHED,
+	.ctx_arg_info_size	= 2,
+	.ctx_arg_info		= {
+		{ offsetof(struct bpf_iter__task_vma, task),
+		  PTR_TO_BTF_ID_OR_NULL },
+		{ offsetof(struct bpf_iter__task_vma, vma),
+		  PTR_TO_BTF_ID_OR_NULL },
+	},
+	.seq_info		= &task_vma_seq_info,
+};
+
 static int __init task_iter_init(void)
 {
 	int ret;
@@ -339,6 +598,12 @@  static int __init task_iter_init(void)
 
 	task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0];
 	task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1];
-	return bpf_iter_reg_target(&task_file_reg_info);
+	ret =  bpf_iter_reg_target(&task_file_reg_info);
+	if (ret)
+		return ret;
+
+	task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0];
+	task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[2];
+	return bpf_iter_reg_target(&task_vma_reg_info);
 }
 late_initcall(task_iter_init);