mbox series

[v4,bpf-next,0/4] introduce bpf_iter for task_vma

Message ID 20210204205002.4075937-1-songliubraving@fb.com
Headers show
Series introduce bpf_iter for task_vma | expand

Message

Song Liu Feb. 4, 2021, 8:49 p.m. UTC
This set introduces bpf_iter for task_vma, which can be used to generate
information similar to /proc/pid/maps. Patch 4/4 adds an example that
mimics /proc/pid/maps.

Currently, /proc/<pid>/maps and /proc/<pid>/smaps provide information about
the vmas of a process. However, this information is not flexible enough to
cover all use cases. For example, if a vma covers a mix of 2MB pages and 4kB
pages (x86_64), there is no easy way to tell which address ranges are
backed by 2MB pages. task_vma solves the problem by enabling the user to
generate customized information based on the vma (and vma->vm_mm,
vma->vm_file, etc.).

Changes v3 => v4:
  1. Avoid skipping vma by assigning invalid prev_vm_start in
     task_vma_seq_stop(). (Yonghong)
  2. Move the "again" label in task_vma_seq_get_next() to save a check. (Yonghong)

Changes v2 => v3:
  1. Rewrite 1/4 so that we hold mmap_lock while calling BPF program. This
     enables the BPF program to access the real vma with BTF. (Alexei)
  2. Fix the logic when the control is returned to user space. (Yonghong)
  3. Revise commit log and cover letter. (Yonghong)

Changes v1 => v2:
  1. Small fixes in task_iter.c and the selftests. (Yonghong)

Song Liu (4):
  bpf: introduce task_vma bpf_iter
  bpf: allow bpf_d_path in sleepable bpf_iter program
  libbpf: introduce section "iter.s/" for sleepable bpf_iter program
  selftests/bpf: add test for bpf_iter_task_vma

 kernel/bpf/task_iter.c                        | 215 +++++++++++++++++-
 kernel/trace/bpf_trace.c                      |   5 +
 tools/lib/bpf/libbpf.c                        |   5 +
 .../selftests/bpf/prog_tests/bpf_iter.c       | 114 +++++++++-
 tools/testing/selftests/bpf/progs/bpf_iter.h  |   8 +
 .../selftests/bpf/progs/bpf_iter_task_vma.c   |  58 +++++
 6 files changed, 394 insertions(+), 11 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c

--
2.24.1

Comments

Yonghong Song Feb. 8, 2021, 4:30 a.m. UTC | #1
On 2/4/21 12:49 PM, Song Liu wrote:
> Introduce task_vma bpf_iter to print memory information of a process. It

> can be used to print customized information similar to /proc/<pid>/maps.

> 

> Current /proc/<pid>/maps and /proc/<pid>/smaps provide information of

> vma's of a process. However, these information are not flexible enough to

> cover all use cases. For example, if a vma cover mixed 2MB pages and 4kB

> pages (x86_64), there is no easy way to tell which address ranges are

> backed by 2MB pages. task_vma solves the problem by enabling the user to

> generate customize information based on the vma (and vma->vm_mm,

> vma->vm_file, etc.).

> 

> To access the vma safely in the BPF program, task_vma iterator holds

> target mmap_lock while calling the BPF program. If the mmap_lock is

> contended, task_vma unlocks mmap_lock between iterations to unblock the

> writer(s). This lock contention avoidance mechanism is similar to the one

> used in show_smaps_rollup().

> 

> Signed-off-by: Song Liu <songliubraving@fb.com>

> ---

>   kernel/bpf/task_iter.c | 215 ++++++++++++++++++++++++++++++++++++++++-

>   1 file changed, 214 insertions(+), 1 deletion(-)

> 

> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c

> index 175b7b42bfc46..31e63b6c3d718 100644

> --- a/kernel/bpf/task_iter.c

> +++ b/kernel/bpf/task_iter.c

> @@ -286,9 +286,196 @@ static const struct seq_operations task_file_seq_ops = {

>   	.show	= task_file_seq_show,

>   };

>   

> +struct bpf_iter_seq_task_vma_info {

> +	/* The first field must be struct bpf_iter_seq_task_common.

> +	 * this is assumed by {init, fini}_seq_pidns() callback functions.

> +	 */

> +	struct bpf_iter_seq_task_common common;

> +	struct task_struct *task;

> +	struct vm_area_struct *vma;

> +	u32 tid;

> +	unsigned long prev_vm_start;

> +	unsigned long prev_vm_end;

> +};

> +

> +enum bpf_task_vma_iter_find_op {

> +	task_vma_iter_first_vma,   /* use mm->mmap */

> +	task_vma_iter_next_vma,    /* use curr_vma->vm_next */

> +	task_vma_iter_find_vma,    /* use find_vma() to find next vma */

> +};

> +

> +static struct vm_area_struct *

> +task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)

> +{

> +	struct pid_namespace *ns = info->common.ns;

> +	enum bpf_task_vma_iter_find_op op;

> +	struct vm_area_struct *curr_vma;

> +	struct task_struct *curr_task;

> +	u32 curr_tid = info->tid;

> +

> +	/* If this function returns a non-NULL vma, it holds a reference to

> +	 * the task_struct, and holds read lock on vma->mm->mmap_lock.

> +	 * If this function returns NULL, it does not hold any reference or

> +	 * lock.

> +	 */

> +	if (info->task) {

> +		curr_task = info->task;

> +		curr_vma = info->vma;

> +		/* In case of lock contention, drop mmap_lock to unblock

> +		 * the writer.

> +		 */

> +		if (mmap_lock_is_contended(curr_task->mm)) {

> +			info->prev_vm_start = curr_vma->vm_start;

> +			info->prev_vm_end = curr_vma->vm_end;

> +			op = task_vma_iter_find_vma;

> +			mmap_read_unlock(curr_task->mm);

> +			if (mmap_read_lock_killable(curr_task->mm))

> +				goto finish;

> +		} else {

> +			op = task_vma_iter_next_vma;

> +		}

> +	} else {

> +again:

> +		curr_task = task_seq_get_next(ns, &curr_tid, true);

> +		if (!curr_task) {

> +			info->tid = curr_tid + 1;

> +			goto finish;

> +		}

> +

> +		if (curr_tid != info->tid) {

> +			info->tid = curr_tid;

> +			op = task_vma_iter_first_vma;

> +		} else {

> +			op = task_vma_iter_find_vma;

> +		}

> +

> +		if (!curr_task->mm)

> +			goto next_task;

> +

> +		if (mmap_read_lock_killable(curr_task->mm))

> +			goto finish;


We hold a reference to curr_task here. Going to "finish" returns without
calling put_task_struct(), so the reference is never released.

> +	}

> +

> +	switch (op) {

> +	case task_vma_iter_first_vma:

> +		curr_vma = curr_task->mm->mmap;

> +		break;

> +	case task_vma_iter_next_vma:

> +		curr_vma = curr_vma->vm_next;

> +		break;

> +	case task_vma_iter_find_vma:

> +		/* We dropped mmap_lock so it is necessary to use find_vma

> +		 * to find the next vma. This is similar to the  mechanism

> +		 * in show_smaps_rollup().

> +		 */

> +		curr_vma = find_vma(curr_task->mm, info->prev_vm_end - 1);

> +

> +		if (curr_vma && (curr_vma->vm_start == info->prev_vm_start))

> +			curr_vma = curr_vma->vm_next;

> +		break;

> +	}

> +	if (!curr_vma) {

> +		mmap_read_unlock(curr_task->mm);

> +		goto next_task;

> +	}

> +	info->task = curr_task;

> +	info->vma = curr_vma;

> +	return curr_vma;

> +

> +next_task:

> +	put_task_struct(curr_task);

> +	info->task = NULL;

> +	curr_tid++;

> +	goto again;

> +

> +finish:

> +	info->task = NULL;

> +	info->vma = NULL;

> +	return NULL;

> +}

> +

[...]