
[RFC,bpf-next] bpf: change syscall_nr type to int in struct syscall_tp_t

Message ID 20231012114550.152846-1-asavkov@redhat.com
State Superseded
Series: [RFC,bpf-next] bpf: change syscall_nr type to int in struct syscall_tp_t

Commit Message

Artem Savkov Oct. 12, 2023, 11:45 a.m. UTC
The linux-rt-devel tree contains a patch (b1773eac3f29c ("sched: Add support
for lazy preemption")) that adds an extra member to struct trace_entry.
This causes the offset of the args field in struct trace_event_raw_sys_enter
to differ from the one in struct syscall_trace_enter:

struct trace_event_raw_sys_enter {
        struct trace_entry         ent;                  /*     0    12 */

        /* XXX last struct has 3 bytes of padding */
        /* XXX 4 bytes hole, try to pack */

        long int                   id;                   /*    16     8 */
        long unsigned int          args[6];              /*    24    48 */
        /* --- cacheline 1 boundary (64 bytes) was 8 bytes ago --- */
        char                       __data[];             /*    72     0 */

        /* size: 72, cachelines: 2, members: 4 */
        /* sum members: 68, holes: 1, sum holes: 4 */
        /* paddings: 1, sum paddings: 3 */
        /* last cacheline: 8 bytes */
};

struct syscall_trace_enter {
        struct trace_entry         ent;                  /*     0    12 */

        /* XXX last struct has 3 bytes of padding */

        int                        nr;                   /*    12     4 */
        long unsigned int          args[];               /*    16     0 */

        /* size: 16, cachelines: 1, members: 3 */
        /* paddings: 1, sum paddings: 3 */
        /* last cacheline: 16 bytes */
};
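
The root cause is the growth of struct trace_entry itself. A sketch of
the layout implied by the pahole output above (the preempt_lazy_count
name comes from the linux-rt-devel commit; placing it after pid is an
assumption, but it matches the 3 bytes of trailing padding reported):

struct trace_entry {
        unsigned short type;                 /* offset  0, size 2 */
        unsigned char  flags;                /* offset  2, size 1 */
        unsigned char  preempt_count;        /* offset  3, size 1 */
        int            pid;                  /* offset  4, size 4 */
        unsigned char  preempt_lazy_count;   /* offset  8, size 1 */
        /* 3 bytes of trailing padding */
};                                           /* size 12, up from 8 */

With ent now 12 bytes, the 8-byte-aligned id in trace_event_raw_sys_enter
moves to offset 16 (leaving a 4-byte hole) and args[] to offset 24, while
the 4-byte nr in syscall_trace_enter keeps args[] at offset 16.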

This, in turn, causes perf_event_set_bpf_prog() to fail while running the
bpf test_profiler testcase, because max_ctx_offset is calculated based on
the former struct, while off is based on the latter:

  10488         if (is_tracepoint || is_syscall_tp) {
  10489                 int off = trace_event_get_offsets(event->tp_event);
  10490
  10491                 if (prog->aux->max_ctx_offset > off)
  10492                         return -EACCES;
  10493         }
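
For concreteness, some back-of-the-envelope arithmetic based on the
layouts above (assuming the testcase's furthest context access on the
sys_enter_kill tracepoint is args[1], and that kill registers two
syscall arguments):

  off            = offsetof(struct syscall_trace_enter, args)
                   + 2 * sizeof(unsigned long)     /* 16 + 16 = 32 */
  max_ctx_offset = offsetof(struct trace_event_raw_sys_enter, args)
                   + 2 * sizeof(unsigned long)     /* 24 + 16 = 40 */

40 > 32, so the check above returns -EACCES.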

What the BPF program actually gets is a pointer to struct syscall_tp_t,
defined in kernel/trace/trace_syscalls.c. This patch fixes the problem by
aligning struct syscall_tp_t with struct syscall_trace_(enter|exit) and
changing the tests to use these structs to dereference the context.
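
With the change, the layout handed to BPF mirrors struct
syscall_trace_enter regardless of the size of struct trace_entry
(sketch of the enter side as introduced by the patch below):

struct syscall_tp_t {
        struct trace_entry ent;     /* 12 bytes on -rt, 8 otherwise */
        int syscall_nr;             /* same type and offset as nr   */
        unsigned long args[SYSCALL_DEFINE_MAXARGS];
} __aligned(8);

so syscall_nr sits immediately after ent and args[] land at the same
offset in both structs.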

Signed-off-by: Artem Savkov <asavkov@redhat.com>
---
 kernel/trace/trace_syscalls.c                    | 4 ++--
 tools/testing/selftests/bpf/progs/profiler.inc.h | 2 +-
 tools/testing/selftests/bpf/progs/test_vmlinux.c | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

Comments

Steven Rostedt Oct. 12, 2023, 1:44 p.m. UTC | #1
On Thu, 12 Oct 2023 13:45:50 +0200
Artem Savkov <asavkov@redhat.com> wrote:

> The linux-rt-devel tree contains a patch (b1773eac3f29c ("sched: Add
> support for lazy preemption")) that adds an extra member to struct
> trace_entry. [...]
> 
> Signed-off-by: Artem Savkov <asavkov@redhat.com>

Thanks for doing a proper fix.

Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>

-- Steve
Andrii Nakryiko Oct. 12, 2023, 11:32 p.m. UTC | #2
On Thu, Oct 12, 2023 at 6:43 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Thu, 12 Oct 2023 13:45:50 +0200
> Artem Savkov <asavkov@redhat.com> wrote:
>
> > The linux-rt-devel tree contains a patch (b1773eac3f29c ("sched: Add
> > support for lazy preemption")) that adds an extra member to struct
> > trace_entry. [...]
> >
> > Signed-off-by: Artem Savkov <asavkov@redhat.com>
>

I think these changes make sense regardless. Can you please resend the
patch without the RFC tag so that our CI can run tests on it?

> Thanks for doing a proper fix.
>
> Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>

But looking at [0] and briefly reading some of the discussions you,
Steven, had, I'm wondering if it would be best to avoid increasing
struct trace_entry altogether. It seems like preempt_count is actually
a 4-bit field in trace context, so it doesn't seem like we really need
to allocate an entire byte for both preempt_count and
preempt_lazy_count. Why can't we just combine them and not waste 8
extra bytes for each trace event in the ring buffer?

  [0] https://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git/commit/?id=b1773eac3f29cbdcdfd16e0339f1a164066e9f71
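
Roughly something like this (an untested sketch just to illustrate the
idea, not an actual patch):

struct trace_entry {
        unsigned short type;
        unsigned char  flags;
        unsigned char  preempt_count:4;        /* existing counter */
        unsigned char  preempt_lazy_count:4;   /* shares the byte  */
        int            pid;
};      /* stays 8 bytes, so tracepoint layouts are unchanged */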

>
> -- Steve
Rod Webster Oct. 13, 2023, 3:13 a.m. UTC | #3
For the novice with the RT kernel: could somebody tell me which kernel
this bug was first introduced in, and which kernel we need to install
to get the fix?

This could be the issue we have been experiencing in the LinuxCNC
community with excessive RT network latency (mostly with Realtek
NICs). I had flagged lazy preemption as a possible cause of this
issue. I thought lazy preemption was added circa kernel 5.9, as it
affected Debian Bullseye on kernel 5.10, which coincided with when we
first observed the problem.

Thanks in anticipation.

Rod Webster


On Thu, 12 Oct 2023 at 21:46, Artem Savkov <asavkov@redhat.com> wrote:
>
> The linux-rt-devel tree contains a patch (b1773eac3f29c ("sched: Add
> support for lazy preemption")) that adds an extra member to struct
> trace_entry. [...]
>
> Signed-off-by: Artem Savkov <asavkov@redhat.com>
> ---
> [...]

Patch

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index de753403cdafb..9c581d6da843a 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -556,7 +556,7 @@  static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *re
 {
 	struct syscall_tp_t {
 		struct trace_entry ent;
-		unsigned long syscall_nr;
+		int syscall_nr;
 		unsigned long args[SYSCALL_DEFINE_MAXARGS];
 	} __aligned(8) param;
 	int i;
@@ -661,7 +661,7 @@  static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *reg
 {
 	struct syscall_tp_t {
 		struct trace_entry ent;
-		unsigned long syscall_nr;
+		int syscall_nr;
 		unsigned long ret;
 	} __aligned(8) param;
 
diff --git a/tools/testing/selftests/bpf/progs/profiler.inc.h b/tools/testing/selftests/bpf/progs/profiler.inc.h
index f799d87e87002..897061930cb76 100644
--- a/tools/testing/selftests/bpf/progs/profiler.inc.h
+++ b/tools/testing/selftests/bpf/progs/profiler.inc.h
@@ -609,7 +609,7 @@  ssize_t BPF_KPROBE(kprobe__proc_sys_write,
 }
 
 SEC("tracepoint/syscalls/sys_enter_kill")
-int tracepoint__syscalls__sys_enter_kill(struct trace_event_raw_sys_enter* ctx)
+int tracepoint__syscalls__sys_enter_kill(struct syscall_trace_enter* ctx)
 {
 	struct bpf_func_stats_ctx stats_ctx;
 
diff --git a/tools/testing/selftests/bpf/progs/test_vmlinux.c b/tools/testing/selftests/bpf/progs/test_vmlinux.c
index 4b8e37f7fd06c..78b23934d9f8f 100644
--- a/tools/testing/selftests/bpf/progs/test_vmlinux.c
+++ b/tools/testing/selftests/bpf/progs/test_vmlinux.c
@@ -16,12 +16,12 @@  bool kprobe_called = false;
 bool fentry_called = false;
 
 SEC("tp/syscalls/sys_enter_nanosleep")
-int handle__tp(struct trace_event_raw_sys_enter *args)
+int handle__tp(struct syscall_trace_enter *args)
 {
 	struct __kernel_timespec *ts;
 	long tv_nsec;
 
-	if (args->id != __NR_nanosleep)
+	if (args->nr != __NR_nanosleep)
 		return 0;
 
 	ts = (void *)args->args[0];