| Message ID | 20201117034108.1186569-7-guro@fb.com |
|---|---|
| State | New |
| Series | [bpf-next,v6,01/34] mm: memcontrol: use helpers to read page's memcg data |
On 11/17/20 4:40 AM, Roman Gushchin wrote:
> In the absolute majority of cases, if a process is making a kernel
> allocation, its memory cgroup is getting charged.
>
> Bpf maps can be updated from an interrupt context, and in such a case
> there is no process which can be charged. It makes the memory
> accounting of bpf maps non-trivial.
>
> Fortunately, after commit 4127c6504f25 ("mm: kmem: enable kernel
> memcg accounting from interrupt contexts") and b87d8cefe43c
> ("mm, memcg: rework remote charging API to support nesting")
> it's finally possible.
>
> To do it, a pointer to the memory cgroup of the process which created
> the map is saved, and this cgroup is charged for all allocations made
> from an interrupt context.
>
> Allocations made from a process context will be accounted in the usual way.
>
> Signed-off-by: Roman Gushchin <guro@fb.com>
> Acked-by: Song Liu <songliubraving@fb.com>
[...]
> +#ifdef CONFIG_MEMCG_KMEM
> +static __always_inline int __bpf_map_update_elem(struct bpf_map *map, void *key,
> +						 void *value, u64 flags)
> +{
> +	struct mem_cgroup *old_memcg;
> +	bool in_interrupt;
> +	int ret;
> +
> +	/*
> +	 * If an update from an interrupt context results in a memory allocation,
> +	 * the memory cgroup to charge can't be determined from the context
> +	 * of the current task. Instead, we charge the memory cgroup which
> +	 * contained the process that created the map.
> +	 */
> +	in_interrupt = in_interrupt();
> +	if (in_interrupt)
> +		old_memcg = set_active_memcg(map->memcg);
> +
> +	ret = map->ops->map_update_elem(map, key, value, flags);
> +
> +	if (in_interrupt)
> +		set_active_memcg(old_memcg);
> +
> +	return ret;

Hmm, this approach won't work; see also commit 09772d92cd5a ("bpf: avoid
retpoline for lookup/update/delete calls on maps"), which removes the indirect
call, so __bpf_map_update_elem() and therefore set_active_memcg() is not
invoked for the vast majority of cases.
[ remainder of the patch quoted without further comments; see the full diff below ]
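(For context: commit 09772d92cd5a teaches the verifier to rewrite calls to the
bpf_map_update_elem() helper into direct calls to the map's
ops->map_update_elem() for JITed programs, which is why the BPF_CALL_4()
wrapper is skipped. Below is a paraphrased sketch of that rewrite; the real
code lives in fixup_bpf_calls() in kernel/bpf/verifier.c and handles more
helpers and checks.)

	/* Paraphrased sketch of the rewrite from commit 09772d92cd5a. */
	if (insn->imm == BPF_FUNC_map_update_elem) {
		/* Patch the helper call into a direct call to the map's
		 * update callback. bpf_map_update_elem() -- and therefore
		 * anything added to it, such as set_active_memcg() -- is
		 * never executed on this path.
		 */
		insn->imm = BPF_CAST_CALL(ops->map_update_elem) -
			    __bpf_call_base;
		continue;
	}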
On Wed, Nov 18, 2020 at 01:06:17AM +0100, Daniel Borkmann wrote:
> On 11/17/20 4:40 AM, Roman Gushchin wrote:
> [...]
> Hmm, this approach won't work; see also commit 09772d92cd5a ("bpf: avoid
> retpoline for lookup/update/delete calls on maps"), which removes the indirect
> call, so __bpf_map_update_elem() and therefore set_active_memcg() is not
> invoked for the vast majority of cases.

I see. Well, the first option is to move these calls into map-specific update
functions, but the list is relatively long:

  nsim_map_update_elem()
  cgroup_storage_update_elem()
  htab_map_update_elem()
  htab_percpu_map_update_elem()
  dev_map_update_elem()
  dev_map_hash_update_elem()
  trie_update_elem()
  cpu_map_update_elem()
  bpf_pid_task_storage_update_elem()
  bpf_fd_inode_storage_update_elem()
  bpf_fd_sk_storage_update_elem()
  sock_map_update_elem()
  xsk_map_update_elem()

Alternatively, we can set the active memcg for the whole duration of bpf
execution. It's simpler, but will add some overhead. Maybe we can somehow
mark programs calling into update helpers and skip all others?

What do you think?

Thanks!
On Tue, Nov 17, 2020 at 04:46:34PM -0800, Roman Gushchin wrote:
> [...]
> Alternatively, we can set the active memcg for the whole duration of bpf
> execution. It's simpler, but will add some overhead. Maybe we can somehow
> mark programs calling into update helpers and skip all others?

Actually, this is problematic if a program updates several maps, because in
theory they can belong to different cgroups. So it seems that the first
option is the way to go. Do you agree?
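(For illustration, "option one" would mean repeating the set/restore pattern
in each of the thirteen update functions listed above. A minimal sketch for
the hash table case follows; the inner helper __htab_map_update_elem() is
hypothetical, and this is not code from the posted series.)

	static int htab_map_update_elem(struct bpf_map *map, void *key,
					void *value, u64 map_flags)
	{
		struct mem_cgroup *old_memcg = NULL;
		bool in_irq = in_interrupt();
		int ret;

		/* In interrupt context there is no usable task context, so
		 * charge the cgroup of the process that created the map.
		 */
		if (in_irq)
			old_memcg = set_active_memcg(map->memcg);

		/* hypothetical inner helper holding the real update logic */
		ret = __htab_map_update_elem(map, key, value, map_flags);

		if (in_irq)
			set_active_memcg(old_memcg);

		return ret;
	}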
On Tue, Nov 17, 2020 at 5:07 PM Roman Gushchin <guro@fb.com> wrote:
> [...]
> Actually, this is problematic if a program updates several maps, because in
> theory they can belong to different cgroups. So it seems that the first
> option is the way to go. Do you agree?

Maybe instead of kmalloc_node(), which is used by most of the map updates,
introduce a bpf_map_kmalloc_node() that takes a map pointer as an argument?
And do set_memcg inside?
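(A minimal sketch of the helper Alexei is proposing; it did not exist at the
time of this thread, so treat the exact name and signature as illustrative
rather than a settled API.)

	/* Illustrative sketch only. The map carries the memcg saved at
	 * creation time, so the allocation is charged to the map's cgroup
	 * no matter which task -- or interrupt -- triggers it.
	 */
	void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size,
				   gfp_t flags, int node)
	{
		struct mem_cgroup *old_memcg;
		void *ptr;

		old_memcg = set_active_memcg(map->memcg);
		ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
		set_active_memcg(old_memcg);

		return ptr;
	}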
On Tue, Nov 17, 2020 at 05:11:00PM -0800, Alexei Starovoitov wrote:
> [...]
> Maybe instead of kmalloc_node(), which is used by most of the map updates,
> introduce a bpf_map_kmalloc_node() that takes a map pointer as an argument?
> And do set_memcg inside?

I suspect it's not only kmalloc_node(), but if there will be 2-3 allocation
helpers, it sounds like a good idea to me! I'll try and get back with v7 soon.

Thanks!
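(As a usage illustration, converting a call site would then be a one-line
change. The snippet below is a sketch in the spirit of kernel/bpf/hashtab.c,
not code from the posted series.)

	/* Before: charged to whichever memcg happens to be active, if any. */
	l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
			     htab->map.numa_node);

	/* After: always charged to the cgroup of the map's creator. */
	l_new = bpf_map_kmalloc_node(&htab->map, htab->elem_size,
				     GFP_ATOMIC | __GFP_NOWARN,
				     htab->map.numa_node);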
On 11/18/20 2:28 AM, Roman Gushchin wrote:
> On Tue, Nov 17, 2020 at 05:11:00PM -0800, Alexei Starovoitov wrote:
>> Maybe instead of kmalloc_node(), which is used by most of the map updates,
>> introduce a bpf_map_kmalloc_node() that takes a map pointer as an argument?
>> And do set_memcg inside?
>
> I suspect it's not only kmalloc_node(), but if there will be 2-3 allocation
> helpers, it sounds like a good idea to me! I'll try and get back with v7 soon.

Could this be baked into the kmalloc*() API itself, given we also need to pass
in __GFP_ACCOUNT everywhere? That is, a new API with an additional argument
where we pass the memcg pointer to tell it directly where to account the
allocation, instead of the extra set_active_memcg() set/restore dance via the
BPF wrapper. There seems to be not much BPF-specific about it, and if it's
more generic it could also be used by other subsystems.

Thanks,
Daniel
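(What Daniel is sketching would be a hypothetical allocator variant along
these lines. No such API exists in the kernel; the name kmalloc_node_memcg()
below is invented purely for illustration.)

	/* Hypothetical API, invented for illustration -- not a real kernel
	 * function. The memcg to charge is passed explicitly instead of
	 * being plumbed through set_active_memcg().
	 */
	static inline void *kmalloc_node_memcg(size_t size, gfp_t flags,
					       int node,
					       struct mem_cgroup *memcg)
	{
		struct mem_cgroup *old_memcg;
		void *ptr;

		old_memcg = set_active_memcg(memcg);
		ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
		set_active_memcg(old_memcg);

		return ptr;
	}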
On Wed, Nov 18, 2020 at 11:22:55AM +0100, Daniel Borkmann wrote:
> [...]
> Could this be baked into the kmalloc*() API itself, given we also need to pass
> in __GFP_ACCOUNT everywhere? That is, a new API with an additional argument
> where we pass the memcg pointer to tell it directly where to account the
> allocation, instead of the extra set_active_memcg() set/restore dance via the
> BPF wrapper. There seems to be not much BPF-specific about it, and if it's
> more generic it could also be used by other subsystems.

Actually, BPF is the first example of kernel memory accounting from an
interrupt context. There are a few examples where we do indirect charging
(charging an arbitrary memory cgroup, not the current one), but not so many.
And usually it's easier to wrap everything into set_active_memcg() than to
pass a memcg argument into every function which can do a memory allocation.

Also, with !CONFIG_MEMCG or !CONFIG_MEMCG_KMEM it's easy to compile out
set_active_memcg() and everything memcg-related, but not easy to remove a
memcg argument from many functions all over the code.

In this particular case the only reason why it's not easy to wrap everything
into a set_active_memcg() pair is the function call "inlining", which is very
bpf-specific. So as of now I'd go with what Alexei suggested.

Thanks!
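(For reference, the set/restore primitive being discussed looks roughly like
the sketch below after the two commits cited in the changelog, 4127c6504f25
and b87d8cefe43c. This is paraphrased from include/linux/sched/mm.h around
v5.10 and may differ in other kernel versions; the !CONFIG_MEMCG stub shows
why the pattern compiles out so easily.)

	#ifdef CONFIG_MEMCG
	static inline struct mem_cgroup *set_active_memcg(struct mem_cgroup *memcg)
	{
		struct mem_cgroup *old;

		if (in_interrupt()) {
			/* per-cpu slot: interrupts have no meaningful
			 * current task to charge
			 */
			old = this_cpu_read(int_active_memcg);
			this_cpu_write(int_active_memcg, memcg);
		} else {
			old = current->active_memcg;
			current->active_memcg = memcg;
		}

		return old;
	}
	#else
	static inline struct mem_cgroup *set_active_memcg(struct mem_cgroup *memcg)
	{
		return NULL;
	}
	#endif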
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 581b2a2e78eb..1d6e7b125877 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -37,6 +37,7 @@ struct bpf_iter_aux_info;
 struct bpf_local_storage;
 struct bpf_local_storage_map;
 struct kobject;
+struct mem_cgroup;
 
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
@@ -161,6 +162,9 @@ struct bpf_map {
 	u32 btf_value_type_id;
 	struct btf *btf;
 	struct bpf_map_memory memory;
+#ifdef CONFIG_MEMCG_KMEM
+	struct mem_cgroup *memcg;
+#endif
 	char name[BPF_OBJ_NAME_LEN];
 	u32 btf_vmlinux_value_type_id;
 	bool bypass_spec_v1;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 25520f5eeaf6..b6327cbe7e41 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -14,6 +14,7 @@
 #include <linux/jiffies.h>
 #include <linux/pid_namespace.h>
 #include <linux/proc_ns.h>
+#include <linux/sched/mm.h>
 
 #include "../../lib/kstrtox.h"
 
@@ -41,11 +42,45 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto = {
 	.arg2_type	= ARG_PTR_TO_MAP_KEY,
 };
 
+#ifdef CONFIG_MEMCG_KMEM
+static __always_inline int __bpf_map_update_elem(struct bpf_map *map, void *key,
+						 void *value, u64 flags)
+{
+	struct mem_cgroup *old_memcg;
+	bool in_interrupt;
+	int ret;
+
+	/*
+	 * If an update from an interrupt context results in a memory allocation,
+	 * the memory cgroup to charge can't be determined from the context
+	 * of the current task. Instead, we charge the memory cgroup which
+	 * contained the process that created the map.
+	 */
+	in_interrupt = in_interrupt();
+	if (in_interrupt)
+		old_memcg = set_active_memcg(map->memcg);
+
+	ret = map->ops->map_update_elem(map, key, value, flags);
+
+	if (in_interrupt)
+		set_active_memcg(old_memcg);
+
+	return ret;
+}
+#else
+static __always_inline int __bpf_map_update_elem(struct bpf_map *map, void *key,
+						 void *value, u64 flags)
+{
+	return map->ops->map_update_elem(map, key, value, flags);
+}
+#endif
+
 BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key,
 	   void *, value, u64, flags)
 {
 	WARN_ON_ONCE(!rcu_read_lock_held());
-	return map->ops->map_update_elem(map, key, value, flags);
+
+	return __bpf_map_update_elem(map, key, value, flags);
 }
 
 const struct bpf_func_proto bpf_map_update_elem_proto = {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f3fe9f53f93c..2d77fc2496da 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -31,6 +31,7 @@
 #include <linux/poll.h>
 #include <linux/bpf-netns.h>
 #include <linux/rcupdate_trace.h>
+#include <linux/memcontrol.h>
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
 			  (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
@@ -456,6 +457,27 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
 	__release(&map_idr_lock);
 }
 
+#ifdef CONFIG_MEMCG_KMEM
+static void bpf_map_save_memcg(struct bpf_map *map)
+{
+	map->memcg = get_mem_cgroup_from_mm(current->mm);
+}
+
+static void bpf_map_release_memcg(struct bpf_map *map)
+{
+	mem_cgroup_put(map->memcg);
+}
+
+#else
+static void bpf_map_save_memcg(struct bpf_map *map)
+{
+}
+
+static void bpf_map_release_memcg(struct bpf_map *map)
+{
+}
+#endif
+
 /* called from workqueue */
 static void bpf_map_free_deferred(struct work_struct *work)
 {
@@ -464,6 +486,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
 
 	bpf_map_charge_move(&mem, &map->memory);
 	security_bpf_map_free(map);
+	bpf_map_release_memcg(map);
 	/* implementation dependent freeing */
 	map->ops->map_free(map);
 	bpf_map_charge_finish(&mem);
@@ -875,6 +898,8 @@ static int map_create(union bpf_attr *attr)
 	if (err)
 		goto free_map_sec;
 
+	bpf_map_save_memcg(map);
+
 	err = bpf_map_new_fd(map, f_flags);
 	if (err < 0) {
 		/* failed to allocate fd.