[08/18] arm64: convert raw syscall invocation to C

Message ID 20180514094640.27569-9-mark.rutland@arm.com
State Superseded
Headers show
Series
  • arm64: invoke syscalls with pt_regs
Related show

Commit Message

Mark Rutland May 14, 2018, 9:46 a.m.
As a first step towards invoking syscalls with a pt_regs argument,
convert the raw syscall invocation logic to C. We end up with a bit more
register shuffling, but the unified invocation logic means we can unify
the tracing paths, too.

This only converts the invocation of the syscall. The rest of the
syscall triage and tracing is left in assembly for now, and will be
converted in subsequent patches.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>

Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/kernel/Makefile  |  3 ++-
 arch/arm64/kernel/entry.S   | 36 ++++++++++--------------------------
 arch/arm64/kernel/syscall.c | 29 +++++++++++++++++++++++++++++
 3 files changed, 41 insertions(+), 27 deletions(-)
 create mode 100644 arch/arm64/kernel/syscall.c

-- 
2.11.0

Comments

Dave Martin May 14, 2018, 11:07 a.m. | #1
On Mon, May 14, 2018 at 10:46:30AM +0100, Mark Rutland wrote:
> As a first step towards invoking syscalls with a pt_regs argument,

> convert the raw syscall invocation logic to C. We end up with a bit more

> register shuffling, but the unified invocation logic means we can unify

> the tracing paths, too.

> 

> This only converts the invocation of the syscall. The rest of the

> syscall triage and tracing is left in assembly for now, and will be

> converted in subsequent patches.

> 

> Signed-off-by: Mark Rutland <mark.rutland@arm.com>

> Cc: Catalin Marinas <catalin.marinas@arm.com>

> Cc: Will Deacon <will.deacon@arm.com>

> ---

>  arch/arm64/kernel/Makefile  |  3 ++-

>  arch/arm64/kernel/entry.S   | 36 ++++++++++--------------------------

>  arch/arm64/kernel/syscall.c | 29 +++++++++++++++++++++++++++++

>  3 files changed, 41 insertions(+), 27 deletions(-)

>  create mode 100644 arch/arm64/kernel/syscall.c

> 

> diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile

> index bf825f38d206..c22e8ace5ea3 100644

> --- a/arch/arm64/kernel/Makefile

> +++ b/arch/arm64/kernel/Makefile

> @@ -18,7 +18,8 @@ arm64-obj-y		:= debug-monitors.o entry.o irq.o fpsimd.o		\

>  			   hyp-stub.o psci.o cpu_ops.o insn.o	\

>  			   return_address.o cpuinfo.o cpu_errata.o		\

>  			   cpufeature.o alternative.o cacheinfo.o		\

> -			   smp.o smp_spin_table.o topology.o smccc-call.o

> +			   smp.o smp_spin_table.o topology.o smccc-call.o	\

> +			   syscall.o

>  

>  extra-$(CONFIG_EFI)			:= efi-entry.o

>  

> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S

> index 08ea3cbfb08f..d6e057500eaf 100644

> --- a/arch/arm64/kernel/entry.S

> +++ b/arch/arm64/kernel/entry.S

> @@ -873,7 +873,6 @@ ENDPROC(el0_error)

>   */

>  ret_fast_syscall:

>  	disable_daif

> -	str	x0, [sp, #S_X0]			// returned x0

>  	ldr	x1, [tsk, #TSK_TI_FLAGS]	// re-check for syscall tracing

>  	and	x2, x1, #_TIF_SYSCALL_WORK

>  	cbnz	x2, ret_fast_syscall_trace

> @@ -946,15 +945,11 @@ el0_svc_naked:					// compat entry point

>  

>  	tst	x16, #_TIF_SYSCALL_WORK		// check for syscall hooks

>  	b.ne	__sys_trace

> -	cmp     wscno, wsc_nr			// check upper syscall limit

> -	b.hs	ni_sys

> -	mask_nospec64 xscno, xsc_nr, x19	// enforce bounds for syscall number

> -	ldr	x16, [stbl, xscno, lsl #3]	// address in the syscall table

> -	blr	x16				// call sys_* routine

> -	b	ret_fast_syscall

> -ni_sys:

>  	mov	x0, sp

> -	bl	do_ni_syscall

> +	mov	w1, wscno

> +	mov	w2, wsc_nr

> +	mov	x3, stbl

> +	bl	invoke_syscall

>  	b	ret_fast_syscall

>  ENDPROC(el0_svc)

>  

> @@ -971,29 +966,18 @@ __sys_trace:

>  	bl	syscall_trace_enter

>  	cmp	w0, #NO_SYSCALL			// skip the syscall?

>  	b.eq	__sys_trace_return_skipped

> -	mov	wscno, w0			// syscall number (possibly new)

> -	mov	x1, sp				// pointer to regs

> -	cmp	wscno, wsc_nr			// check upper syscall limit

> -	b.hs	__ni_sys_trace

> -	ldp	x0, x1, [sp]			// restore the syscall args

> -	ldp	x2, x3, [sp, #S_X2]

> -	ldp	x4, x5, [sp, #S_X4]

> -	ldp	x6, x7, [sp, #S_X6]

> -	ldr	x16, [stbl, xscno, lsl #3]	// address in the syscall table

> -	blr	x16				// call sys_* routine

>  

> -__sys_trace_return:

> -	str	x0, [sp, #S_X0]			// save returned x0

> +	mov	x0, sp

> +	mov	w1, wscno

> +	mov w2, wsc_nr

> +	mov	x3, stbl

> +	bl	invoke_syscall

> +

>  __sys_trace_return_skipped:

>  	mov	x0, sp

>  	bl	syscall_trace_exit

>  	b	ret_to_user

>  

> -__ni_sys_trace:

> -	mov	x0, sp

> -	bl	do_ni_syscall

> -	b	__sys_trace_return

> -


Can you explain why ni_syscall is special here, why __sys_trace_return
existed, and why its disappearance doesn't break anything?

Not saying there's a bug, just that I'm a little confuse -- I see no
real reason for ni_syscall being special, and this may be a good
opportunity to decruft it.  (See also comments below.)

>  	.popsection				// .entry.text

>  

>  #ifdef CONFIG_UNMAP_KERNEL_AT_EL0

> diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c

> new file mode 100644

> index 000000000000..58d7569f47df

> --- /dev/null

> +++ b/arch/arm64/kernel/syscall.c

> @@ -0,0 +1,29 @@

> +// SPDX-License-Identifier: GPL-2.0

> +

> +#include <linux/nospec.h>

> +#include <linux/ptrace.h>

> +

> +long do_ni_syscall(struct pt_regs *regs);

> +

> +typedef long (*syscall_fn_t)(unsigned long, unsigned long,

> +			     unsigned long, unsigned long,

> +			     unsigned long, unsigned long);

> +

> +static void __invoke_syscall(struct pt_regs *regs, syscall_fn_t syscall_fn)

> +{

> +	regs->regs[0] = syscall_fn(regs->regs[0], regs->regs[1],

> +				   regs->regs[2], regs->regs[3],

> +				   regs->regs[4], regs->regs[5]);

> +}

> +

> +asmlinkage void invoke_syscall(struct pt_regs *regs, int scno, int sc_nr,

> +			       syscall_fn_t syscall_table[])

> +{

> +	if (scno < sc_nr) {


What if (int)scno < 0?  Should those args both by unsigned ints?

"sc_nr" sounds too much like "syscall number" to me.  Might
"syscall_table_size" might be clearer?  Similarly, we could have
"stbl_size" or similar in the asm.  This is purely cosmetic,
though.

> +		syscall_fn_t syscall_fn;

> +		syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)];

> +		__invoke_syscall(regs, syscall_fn);

> +	} else {

> +		regs->regs[0] = do_ni_syscall(regs);


Can we make __invoke_syscall() the universal syscall wrapper, and give
do_ni_syscall() the same interface as any other syscall body?

Then you could factor this as

static syscall_fn_t syscall_fn(syscall_fn_t const syscall_table[],
				(unsigned) int scno, (unsigned) int sc_nr)
{
	if (sc_no >= sc_nr)
		return sys_ni_syscall;

	return syscall_table[array_index_nospec(scno, sc_nr)];
}

...
	__invoke_syscall(regs, syscall_fn(syscall_table, scno, sc_nr);



This is cosmetic too, of course.

do_ni_syscall() should be given a pt_regs-based wrapper like all the
rest.

This is still cosmetic, but reduces unnecessary special-case-ness
for ni_syscall.

Cheers
---Dave
Mark Rutland May 14, 2018, 11:41 a.m. | #2
On Mon, May 14, 2018 at 12:07:18PM +0100, Dave Martin wrote:
> On Mon, May 14, 2018 at 10:46:30AM +0100, Mark Rutland wrote:

> > As a first step towards invoking syscalls with a pt_regs argument,

> > convert the raw syscall invocation logic to C. We end up with a bit more

> > register shuffling, but the unified invocation logic means we can unify

> > the tracing paths, too.

> > 

> > This only converts the invocation of the syscall. The rest of the

> > syscall triage and tracing is left in assembly for now, and will be

> > converted in subsequent patches.

> > 

> > Signed-off-by: Mark Rutland <mark.rutland@arm.com>

> > Cc: Catalin Marinas <catalin.marinas@arm.com>

> > Cc: Will Deacon <will.deacon@arm.com>

> > ---

> >  arch/arm64/kernel/Makefile  |  3 ++-

> >  arch/arm64/kernel/entry.S   | 36 ++++++++++--------------------------

> >  arch/arm64/kernel/syscall.c | 29 +++++++++++++++++++++++++++++

> >  3 files changed, 41 insertions(+), 27 deletions(-)

> >  create mode 100644 arch/arm64/kernel/syscall.c

> > 

> > diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile

> > index bf825f38d206..c22e8ace5ea3 100644

> > --- a/arch/arm64/kernel/Makefile

> > +++ b/arch/arm64/kernel/Makefile

> > @@ -18,7 +18,8 @@ arm64-obj-y		:= debug-monitors.o entry.o irq.o fpsimd.o		\

> >  			   hyp-stub.o psci.o cpu_ops.o insn.o	\

> >  			   return_address.o cpuinfo.o cpu_errata.o		\

> >  			   cpufeature.o alternative.o cacheinfo.o		\

> > -			   smp.o smp_spin_table.o topology.o smccc-call.o

> > +			   smp.o smp_spin_table.o topology.o smccc-call.o	\

> > +			   syscall.o

> >  

> >  extra-$(CONFIG_EFI)			:= efi-entry.o

> >  

> > diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S

> > index 08ea3cbfb08f..d6e057500eaf 100644

> > --- a/arch/arm64/kernel/entry.S

> > +++ b/arch/arm64/kernel/entry.S

> > @@ -873,7 +873,6 @@ ENDPROC(el0_error)

> >   */

> >  ret_fast_syscall:

> >  	disable_daif

> > -	str	x0, [sp, #S_X0]			// returned x0

> >  	ldr	x1, [tsk, #TSK_TI_FLAGS]	// re-check for syscall tracing

> >  	and	x2, x1, #_TIF_SYSCALL_WORK

> >  	cbnz	x2, ret_fast_syscall_trace

> > @@ -946,15 +945,11 @@ el0_svc_naked:					// compat entry point

> >  

> >  	tst	x16, #_TIF_SYSCALL_WORK		// check for syscall hooks

> >  	b.ne	__sys_trace

> > -	cmp     wscno, wsc_nr			// check upper syscall limit

> > -	b.hs	ni_sys

> > -	mask_nospec64 xscno, xsc_nr, x19	// enforce bounds for syscall number

> > -	ldr	x16, [stbl, xscno, lsl #3]	// address in the syscall table

> > -	blr	x16				// call sys_* routine

> > -	b	ret_fast_syscall

> > -ni_sys:

> >  	mov	x0, sp

> > -	bl	do_ni_syscall

> > +	mov	w1, wscno

> > +	mov	w2, wsc_nr

> > +	mov	x3, stbl

> > +	bl	invoke_syscall

> >  	b	ret_fast_syscall

> >  ENDPROC(el0_svc)

> >  

> > @@ -971,29 +966,18 @@ __sys_trace:

> >  	bl	syscall_trace_enter

> >  	cmp	w0, #NO_SYSCALL			// skip the syscall?

> >  	b.eq	__sys_trace_return_skipped

> > -	mov	wscno, w0			// syscall number (possibly new)

> > -	mov	x1, sp				// pointer to regs

> > -	cmp	wscno, wsc_nr			// check upper syscall limit

> > -	b.hs	__ni_sys_trace

> > -	ldp	x0, x1, [sp]			// restore the syscall args

> > -	ldp	x2, x3, [sp, #S_X2]

> > -	ldp	x4, x5, [sp, #S_X4]

> > -	ldp	x6, x7, [sp, #S_X6]

> > -	ldr	x16, [stbl, xscno, lsl #3]	// address in the syscall table

> > -	blr	x16				// call sys_* routine

> >  

> > -__sys_trace_return:

> > -	str	x0, [sp, #S_X0]			// save returned x0

> > +	mov	x0, sp

> > +	mov	w1, wscno

> > +	mov w2, wsc_nr

> > +	mov	x3, stbl

> > +	bl	invoke_syscall

> > +

> >  __sys_trace_return_skipped:

> >  	mov	x0, sp

> >  	bl	syscall_trace_exit

> >  	b	ret_to_user

> >  

> > -__ni_sys_trace:

> > -	mov	x0, sp

> > -	bl	do_ni_syscall

> > -	b	__sys_trace_return

> > -

> 

> Can you explain why ni_syscall is special here, 


This is for out-of-range syscall numbers, instances of ni_syscall in the
syscall table are handled by the regular path. When the syscall number
is out-of-range, we can't index the syscall table, and have to call
ni_sys directly.

The c invoke_syscall() wrapper handles that case internally so that we
don't have to open-code it everywhere.

> why __sys_trace_return existed, 


The __sys_trace_return label existed so that the special __ni_sys_trace
path could return into a common tracing return path.

> and why its disappearance doesn't break anything?


Now that invoke_syscall() handles out-of-range syscall numbers, and we
can remove the __ni_sys_trace path, nothing branches to
__sys_trace_return.

Only the label has been removed, not the usual return path.

> Not saying there's a bug, just that I'm a little confuse -- I see no

> real reason for ni_syscall being special, and this may be a good

> opportunity to decruft it.  (See also comments below.)


Hopefully the above clarifies things?

I've updated the commit message with a description.

[...]

> > +asmlinkage void invoke_syscall(struct pt_regs *regs, int scno, int sc_nr,

> > +			       syscall_fn_t syscall_table[])

> > +{

> > +	if (scno < sc_nr) {

> 

> What if (int)scno < 0?  Should those args both by unsigned ints?


Yes, they should -- I've fixed that up locally.

That is a *very* good point, thanks!

> "sc_nr" sounds too much like "syscall number" to me.  Might

> "syscall_table_size" might be clearer?  Similarly, we could have

> "stbl_size" or similar in the asm.  This is purely cosmetic,

> though.


I'd tried to stick to the naming used in assembly to keep the conversion
clearer for those familiar with the asm.

I agree the names aren't great.

> > +		syscall_fn_t syscall_fn;

> > +		syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)];

> > +		__invoke_syscall(regs, syscall_fn);

> > +	} else {

> > +		regs->regs[0] = do_ni_syscall(regs);

> 

> Can we make __invoke_syscall() the universal syscall wrapper, and give

> do_ni_syscall() the same interface as any other syscall body?


Not at this point in time, since the prototype (in core code) differs.

I agree that would be nicer, but there are a number of complications;
more details below.

> Then you could factor this as

> 

> static syscall_fn_t syscall_fn(syscall_fn_t const syscall_table[],

> 				(unsigned) int scno, (unsigned) int sc_nr)

> {

> 	if (sc_no >= sc_nr)

> 		return sys_ni_syscall;

> 

> 	return syscall_table[array_index_nospec(scno, sc_nr)];

> }

> 

> ...

> 	__invoke_syscall(regs, syscall_fn(syscall_table, scno, sc_nr);

> 

> 

> 

> This is cosmetic too, of course.

> 

> do_ni_syscall() should be given a pt_regs-based wrapper like all the

> rest.


I agree it would be nicer if it had a wrapper that took a pt_regs, even
if it does nothing with it.

We can't use SYSCALL_DEFINE0() due to the fault injection muck, we'd
need a ksys_ni_syscall() for our traps.c logic, and adding this
uniformly would involve some arch-specific rework for x86, too, so I
decided it was not worth the effort.

Thanks,
Mark.
Dave Martin May 14, 2018, 12:53 p.m. | #3
On Mon, May 14, 2018 at 12:41:10PM +0100, Mark Rutland wrote:
> On Mon, May 14, 2018 at 12:07:18PM +0100, Dave Martin wrote:

> > On Mon, May 14, 2018 at 10:46:30AM +0100, Mark Rutland wrote:

> > > As a first step towards invoking syscalls with a pt_regs argument,

> > > convert the raw syscall invocation logic to C. We end up with a bit more

> > > register shuffling, but the unified invocation logic means we can unify

> > > the tracing paths, too.

> > > 

> > > This only converts the invocation of the syscall. The rest of the

> > > syscall triage and tracing is left in assembly for now, and will be

> > > converted in subsequent patches.

> > > 

> > > Signed-off-by: Mark Rutland <mark.rutland@arm.com>

> > > Cc: Catalin Marinas <catalin.marinas@arm.com>

> > > Cc: Will Deacon <will.deacon@arm.com>

> > > ---

> > >  arch/arm64/kernel/Makefile  |  3 ++-

> > >  arch/arm64/kernel/entry.S   | 36 ++++++++++--------------------------

> > >  arch/arm64/kernel/syscall.c | 29 +++++++++++++++++++++++++++++

> > >  3 files changed, 41 insertions(+), 27 deletions(-)

> > >  create mode 100644 arch/arm64/kernel/syscall.c

> > > 

> > > diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile

> > > index bf825f38d206..c22e8ace5ea3 100644

> > > --- a/arch/arm64/kernel/Makefile

> > > +++ b/arch/arm64/kernel/Makefile

> > > @@ -18,7 +18,8 @@ arm64-obj-y		:= debug-monitors.o entry.o irq.o fpsimd.o		\

> > >  			   hyp-stub.o psci.o cpu_ops.o insn.o	\

> > >  			   return_address.o cpuinfo.o cpu_errata.o		\

> > >  			   cpufeature.o alternative.o cacheinfo.o		\

> > > -			   smp.o smp_spin_table.o topology.o smccc-call.o

> > > +			   smp.o smp_spin_table.o topology.o smccc-call.o	\

> > > +			   syscall.o

> > >  

> > >  extra-$(CONFIG_EFI)			:= efi-entry.o

> > >  

> > > diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S

> > > index 08ea3cbfb08f..d6e057500eaf 100644

> > > --- a/arch/arm64/kernel/entry.S

> > > +++ b/arch/arm64/kernel/entry.S

> > > @@ -873,7 +873,6 @@ ENDPROC(el0_error)

> > >   */

> > >  ret_fast_syscall:

> > >  	disable_daif

> > > -	str	x0, [sp, #S_X0]			// returned x0

> > >  	ldr	x1, [tsk, #TSK_TI_FLAGS]	// re-check for syscall tracing

> > >  	and	x2, x1, #_TIF_SYSCALL_WORK

> > >  	cbnz	x2, ret_fast_syscall_trace

> > > @@ -946,15 +945,11 @@ el0_svc_naked:					// compat entry point

> > >  

> > >  	tst	x16, #_TIF_SYSCALL_WORK		// check for syscall hooks

> > >  	b.ne	__sys_trace

> > > -	cmp     wscno, wsc_nr			// check upper syscall limit

> > > -	b.hs	ni_sys

> > > -	mask_nospec64 xscno, xsc_nr, x19	// enforce bounds for syscall number

> > > -	ldr	x16, [stbl, xscno, lsl #3]	// address in the syscall table

> > > -	blr	x16				// call sys_* routine

> > > -	b	ret_fast_syscall

> > > -ni_sys:

> > >  	mov	x0, sp

> > > -	bl	do_ni_syscall

> > > +	mov	w1, wscno

> > > +	mov	w2, wsc_nr

> > > +	mov	x3, stbl

> > > +	bl	invoke_syscall

> > >  	b	ret_fast_syscall

> > >  ENDPROC(el0_svc)

> > >  

> > > @@ -971,29 +966,18 @@ __sys_trace:

> > >  	bl	syscall_trace_enter

> > >  	cmp	w0, #NO_SYSCALL			// skip the syscall?

> > >  	b.eq	__sys_trace_return_skipped

> > > -	mov	wscno, w0			// syscall number (possibly new)

> > > -	mov	x1, sp				// pointer to regs

> > > -	cmp	wscno, wsc_nr			// check upper syscall limit

> > > -	b.hs	__ni_sys_trace

> > > -	ldp	x0, x1, [sp]			// restore the syscall args

> > > -	ldp	x2, x3, [sp, #S_X2]

> > > -	ldp	x4, x5, [sp, #S_X4]

> > > -	ldp	x6, x7, [sp, #S_X6]

> > > -	ldr	x16, [stbl, xscno, lsl #3]	// address in the syscall table

> > > -	blr	x16				// call sys_* routine

> > >  

> > > -__sys_trace_return:

> > > -	str	x0, [sp, #S_X0]			// save returned x0

> > > +	mov	x0, sp

> > > +	mov	w1, wscno

> > > +	mov w2, wsc_nr

> > > +	mov	x3, stbl

> > > +	bl	invoke_syscall

> > > +

> > >  __sys_trace_return_skipped:

> > >  	mov	x0, sp

> > >  	bl	syscall_trace_exit

> > >  	b	ret_to_user

> > >  

> > > -__ni_sys_trace:

> > > -	mov	x0, sp

> > > -	bl	do_ni_syscall

> > > -	b	__sys_trace_return

> > > -

> > 

> > Can you explain why ni_syscall is special here, 

> 

> This is for out-of-range syscall numbers, instances of ni_syscall in the

> syscall table are handled by the regular path. When the syscall number

> is out-of-range, we can't index the syscall table, and have to call

> ni_sys directly.

> 

> The c invoke_syscall() wrapper handles that case internally so that we

> don't have to open-code it everywhere.

> 

> > why __sys_trace_return existed, 

> 

> The __sys_trace_return label existed so that the special __ni_sys_trace

> path could return into a common tracing return path.

> 

> > and why its disappearance doesn't break anything?

> 

> Now that invoke_syscall() handles out-of-range syscall numbers, and we

> can remove the __ni_sys_trace path, nothing branches to

> __sys_trace_return.

> 

> Only the label has been removed, not the usual return path.


OK, I think I understand.  I think the name "__sys_trace_return" was
confusing me, as if this was something special that only relates to the
ni_syscall case.  If it was only ever intended as the merge point for
those two paths, then I can see that merging the paths for real enables
us to get rid of it.

> > Not saying there's a bug, just that I'm a little confuse -- I see no

> > real reason for ni_syscall being special, and this may be a good

> > opportunity to decruft it.  (See also comments below.)

> 

> Hopefully the above clarifies things?


Yes, I think so.

> I've updated the commit message with a description.

> 

> [...]

> 

> > > +asmlinkage void invoke_syscall(struct pt_regs *regs, int scno, int sc_nr,

> > > +			       syscall_fn_t syscall_table[])

> > > +{

> > > +	if (scno < sc_nr) {

> > 

> > What if (int)scno < 0?  Should those args both by unsigned ints?

> 

> Yes, they should -- I've fixed that up locally.

> 

> That is a *very* good point, thanks!

> 

> > "sc_nr" sounds too much like "syscall number" to me.  Might

> > "syscall_table_size" might be clearer?  Similarly, we could have

> > "stbl_size" or similar in the asm.  This is purely cosmetic,

> > though.

> 

> I'd tried to stick to the naming used in assembly to keep the conversion

> clearer for those familiar with the asm.

> 

> I agree the names aren't great.


Not a big deal.  If you feel you would like to rename them though, I
won't argue with it ;)

> > > +		syscall_fn_t syscall_fn;

> > > +		syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)];

> > > +		__invoke_syscall(regs, syscall_fn);

> > > +	} else {

> > > +		regs->regs[0] = do_ni_syscall(regs);

> > 

> > Can we make __invoke_syscall() the universal syscall wrapper, and give

> > do_ni_syscall() the same interface as any other syscall body?

> 

> Not at this point in time, since the prototype (in core code) differs.

> 

> I agree that would be nicer, but there are a number of complications;

> more details below.

> 

> > Then you could factor this as

> > 

> > static syscall_fn_t syscall_fn(syscall_fn_t const syscall_table[],

> > 				(unsigned) int scno, (unsigned) int sc_nr)

> > {

> > 	if (sc_no >= sc_nr)

> > 		return sys_ni_syscall;

> > 

> > 	return syscall_table[array_index_nospec(scno, sc_nr)];

> > }

> > 

> > ...

> > 	__invoke_syscall(regs, syscall_fn(syscall_table, scno, sc_nr);

> > 

> > 

> > 

> > This is cosmetic too, of course.

> > 

> > do_ni_syscall() should be given a pt_regs-based wrapper like all the

> > rest.

> 

> I agree it would be nicer if it had a wrapper that took a pt_regs, even

> if it does nothing with it.

> 

> We can't use SYSCALL_DEFINE0() due to the fault injection muck, we'd

> need a ksys_ni_syscall() for our traps.c logic, and adding this

> uniformly would involve some arch-specific rework for x86, too, so I

> decided it was not worth the effort.


Does allowing error injection for ni_syscall actually matter?  Error
injection is an ABI break by itself.  It would arguably be a bit
strange though.

Cheers
---Dave
Dominik Brodowski May 14, 2018, 6 p.m. | #4
> +static void __invoke_syscall(struct pt_regs *regs, syscall_fn_t syscall_fn)

> +{

> +	regs->regs[0] = syscall_fn(regs->regs[0], regs->regs[1],

> +				   regs->regs[2], regs->regs[3],

> +				   regs->regs[4], regs->regs[5]);

> +}


Any specific reason to have this in a separate function? This seems to be
called only from one instance, namely

> +asmlinkage void invoke_syscall(struct pt_regs *regs, int scno, int sc_nr,

> +			       syscall_fn_t syscall_table[])

> +{

> +	if (scno < sc_nr) {

> +		syscall_fn_t syscall_fn;

> +		syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)];

> +		__invoke_syscall(regs, syscall_fn);

> +	} else {

> +		regs->regs[0] = do_ni_syscall(regs);

> +	}

> +}


and result in a one-liner in the last patch of the series.

Thanks,
	Dominik
Mark Rutland May 15, 2018, 8:18 a.m. | #5
On Mon, May 14, 2018 at 08:00:29PM +0200, Dominik Brodowski wrote:
> > +static void __invoke_syscall(struct pt_regs *regs, syscall_fn_t syscall_fn)

> > +{

> > +	regs->regs[0] = syscall_fn(regs->regs[0], regs->regs[1],

> > +				   regs->regs[2], regs->regs[3],

> > +				   regs->regs[4], regs->regs[5]);

> > +}

> 

> Any specific reason to have this in a separate function? This seems to be

> called only from one instance, namely


I wanted to keep the raw syscall invocation logically separate from the syscall
table lookup and ni_syscall fallback, so that it was easier to verify in isolation.

I don't think it's a big deal either way, though.

Thanks,
Mark.
Mark Rutland May 15, 2018, 8:22 a.m. | #6
On Mon, May 14, 2018 at 10:24:45PM +0200, Dominik Brodowski wrote:
> On Mon, May 14, 2018 at 12:41:10PM +0100, Mark Rutland wrote:

> > I agree it would be nicer if it had a wrapper that took a pt_regs, even

> > if it does nothing with it.

> > 

> > We can't use SYSCALL_DEFINE0() due to the fault injection muck, we'd

> > need a ksys_ni_syscall() for our traps.c logic, and adding this

> > uniformly would involve some arch-specific rework for x86, too, so I

> > decided it was not worth the effort.

> 

> Couldn't you just open-code the "return -ENOSYS;" in traps.c?


I guess so. I was just worried that debug logic might be added to the generic
ni_syscall() in future, and wanted to avoid potential divergence.

> Error injection has no reasonable stable ABI/API expectations, so that's not

> a show-stopper either.


If people are happy with using SYSCALL_DEFINE0() for ni_syscall, I'm happy to
do that -- it's just that we'll need a fixup for x86 as that will change the
symbol name.

Thanks,
Mark.
Dominik Brodowski May 15, 2018, 10:01 a.m. | #7
On Tue, May 15, 2018 at 09:22:23AM +0100, Mark Rutland wrote:
> On Mon, May 14, 2018 at 10:24:45PM +0200, Dominik Brodowski wrote:

> > On Mon, May 14, 2018 at 12:41:10PM +0100, Mark Rutland wrote:

> > > I agree it would be nicer if it had a wrapper that took a pt_regs, even

> > > if it does nothing with it.

> > > 

> > > We can't use SYSCALL_DEFINE0() due to the fault injection muck, we'd

> > > need a ksys_ni_syscall() for our traps.c logic, and adding this

> > > uniformly would involve some arch-specific rework for x86, too, so I

> > > decided it was not worth the effort.

> > 

> > Couldn't you just open-code the "return -ENOSYS;" in traps.c?

> 

> I guess so. I was just worried that debug logic might be added to the generic

> ni_syscall() in future, and wanted to avoid potential divergence.

> 

> > Error injection has no reasonable stable ABI/API expectations, so that's not

> > a show-stopper either.

> 

> If people are happy with using SYSCALL_DEFINE0() for ni_syscall, I'm happy to

> do that -- it's just that we'll need a fixup for x86 as that will change the

> symbol name.


For me, it's less about using SYSCALL_DEFINE0() for ni_syscall, but more
about keeping the syscall invokation easy. Therefore, we do pass a pointer
struct pt_regs to sys_ni_syscall() on x86, even though it does not expect
it.

	/* this is a lie, but it does not hurt as sys_ni_syscall just returns -EINVAL */
	extern asmlinkage long sys_ni_syscall(const struct pt_regs *);

Thanks,
	Dominik
Mark Rutland May 15, 2018, 10:13 a.m. | #8
On Tue, May 15, 2018 at 12:01:58PM +0200, Dominik Brodowski wrote:
> On Tue, May 15, 2018 at 09:22:23AM +0100, Mark Rutland wrote:

> > On Mon, May 14, 2018 at 10:24:45PM +0200, Dominik Brodowski wrote:

> > > On Mon, May 14, 2018 at 12:41:10PM +0100, Mark Rutland wrote:

> > > > I agree it would be nicer if it had a wrapper that took a pt_regs, even

> > > > if it does nothing with it.

> > > > 

> > > > We can't use SYSCALL_DEFINE0() due to the fault injection muck, we'd

> > > > need a ksys_ni_syscall() for our traps.c logic, and adding this

> > > > uniformly would involve some arch-specific rework for x86, too, so I

> > > > decided it was not worth the effort.

> > > 

> > > Couldn't you just open-code the "return -ENOSYS;" in traps.c?

> > 

> > I guess so. I was just worried that debug logic might be added to the generic

> > ni_syscall() in future, and wanted to avoid potential divergence.

> > 

> > > Error injection has no reasonable stable ABI/API expectations, so that's not

> > > a show-stopper either.

> > 

> > If people are happy with using SYSCALL_DEFINE0() for ni_syscall, I'm happy to

> > do that -- it's just that we'll need a fixup for x86 as that will change the

> > symbol name.

> 

> For me, it's less about using SYSCALL_DEFINE0() for ni_syscall, but more

> about keeping the syscall invokation easy. Therefore, we do pass a pointer

> struct pt_regs to sys_ni_syscall() on x86, even though it does not expect

> it.

> 

> 	/* this is a lie, but it does not hurt as sys_ni_syscall just returns -EINVAL */

> 	extern asmlinkage long sys_ni_syscall(const struct pt_regs *);


Oh, sure, we do the same on arm64 in this series.

Having a pt_regs wrapper for it (e.g. using SYSCALL_DEFINE0()) would
allow us to avoid that lie (which might be best for CFI stuff), would
allow us to avoid some name mangling on arm64, and would seemingly
confuse people less.

Thanks,
Mark.

Patch

diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index bf825f38d206..c22e8ace5ea3 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -18,7 +18,8 @@  arm64-obj-y		:= debug-monitors.o entry.o irq.o fpsimd.o		\
 			   hyp-stub.o psci.o cpu_ops.o insn.o	\
 			   return_address.o cpuinfo.o cpu_errata.o		\
 			   cpufeature.o alternative.o cacheinfo.o		\
-			   smp.o smp_spin_table.o topology.o smccc-call.o
+			   smp.o smp_spin_table.o topology.o smccc-call.o	\
+			   syscall.o
 
 extra-$(CONFIG_EFI)			:= efi-entry.o
 
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 08ea3cbfb08f..d6e057500eaf 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -873,7 +873,6 @@  ENDPROC(el0_error)
  */
 ret_fast_syscall:
 	disable_daif
-	str	x0, [sp, #S_X0]			// returned x0
 	ldr	x1, [tsk, #TSK_TI_FLAGS]	// re-check for syscall tracing
 	and	x2, x1, #_TIF_SYSCALL_WORK
 	cbnz	x2, ret_fast_syscall_trace
@@ -946,15 +945,11 @@  el0_svc_naked:					// compat entry point
 
 	tst	x16, #_TIF_SYSCALL_WORK		// check for syscall hooks
 	b.ne	__sys_trace
-	cmp     wscno, wsc_nr			// check upper syscall limit
-	b.hs	ni_sys
-	mask_nospec64 xscno, xsc_nr, x19	// enforce bounds for syscall number
-	ldr	x16, [stbl, xscno, lsl #3]	// address in the syscall table
-	blr	x16				// call sys_* routine
-	b	ret_fast_syscall
-ni_sys:
 	mov	x0, sp
-	bl	do_ni_syscall
+	mov	w1, wscno
+	mov	w2, wsc_nr
+	mov	x3, stbl
+	bl	invoke_syscall
 	b	ret_fast_syscall
 ENDPROC(el0_svc)
 
@@ -971,29 +966,18 @@  __sys_trace:
 	bl	syscall_trace_enter
 	cmp	w0, #NO_SYSCALL			// skip the syscall?
 	b.eq	__sys_trace_return_skipped
-	mov	wscno, w0			// syscall number (possibly new)
-	mov	x1, sp				// pointer to regs
-	cmp	wscno, wsc_nr			// check upper syscall limit
-	b.hs	__ni_sys_trace
-	ldp	x0, x1, [sp]			// restore the syscall args
-	ldp	x2, x3, [sp, #S_X2]
-	ldp	x4, x5, [sp, #S_X4]
-	ldp	x6, x7, [sp, #S_X6]
-	ldr	x16, [stbl, xscno, lsl #3]	// address in the syscall table
-	blr	x16				// call sys_* routine
 
-__sys_trace_return:
-	str	x0, [sp, #S_X0]			// save returned x0
+	mov	x0, sp
+	mov	w1, wscno
+	mov w2, wsc_nr
+	mov	x3, stbl
+	bl	invoke_syscall
+
 __sys_trace_return_skipped:
 	mov	x0, sp
 	bl	syscall_trace_exit
 	b	ret_to_user
 
-__ni_sys_trace:
-	mov	x0, sp
-	bl	do_ni_syscall
-	b	__sys_trace_return
-
 	.popsection				// .entry.text
 
 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c
new file mode 100644
index 000000000000..58d7569f47df
--- /dev/null
+++ b/arch/arm64/kernel/syscall.c
@@ -0,0 +1,29 @@ 
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/nospec.h>
+#include <linux/ptrace.h>
+
+long do_ni_syscall(struct pt_regs *regs);
+
+typedef long (*syscall_fn_t)(unsigned long, unsigned long,
+			     unsigned long, unsigned long,
+			     unsigned long, unsigned long);
+
+static void __invoke_syscall(struct pt_regs *regs, syscall_fn_t syscall_fn)
+{
+	regs->regs[0] = syscall_fn(regs->regs[0], regs->regs[1],
+				   regs->regs[2], regs->regs[3],
+				   regs->regs[4], regs->regs[5]);
+}
+
+asmlinkage void invoke_syscall(struct pt_regs *regs, int scno, int sc_nr,
+			       syscall_fn_t syscall_table[])
+{
+	if (scno < sc_nr) {
+		syscall_fn_t syscall_fn;
+		syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)];
+		__invoke_syscall(regs, syscall_fn);
+	} else {
+		regs->regs[0] = do_ni_syscall(regs);
+	}
+}