diff mbox series

[v2,009/108] linux-user: Set up infrastructure for table-izing syscalls

Message ID 20180610030220.3777-10-richard.henderson@linaro.org
State New
Headers show
Series linux-user: Split do_syscall | expand

Commit Message

Richard Henderson June 10, 2018, 3 a.m. UTC
At the same time, split out set_robust_list and get_robust_list.
Put them together, along with their block comment, at the top
of syscall_table.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

---
 linux-user/syscall.c | 87 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 66 insertions(+), 21 deletions(-)

-- 
2.17.1

Comments

Peter Maydell June 10, 2018, 12:32 p.m. UTC | #1
On 10 June 2018 at 04:00, Richard Henderson
<richard.henderson@linaro.org> wrote:
> At the same time, split out set_robust_list and get_robust_list.

> Put them together, along with their block comment, at the top

> of syscall_table.

>

> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

> ---

>  linux-user/syscall.c | 87 +++++++++++++++++++++++++++++++++-----------

>  1 file changed, 66 insertions(+), 21 deletions(-)

>

> diff --git a/linux-user/syscall.c b/linux-user/syscall.c

> index 46f123ee13..8678e749ee 100644

> --- a/linux-user/syscall.c

> +++ b/linux-user/syscall.c

> @@ -7947,6 +7947,17 @@ static int host_to_target_cpu_mask(const unsigned long *host_mask,

>      return 0;

>  }

>

> +typedef abi_long impl_fn(void *cpu_env, unsigned num, abi_long arg1,

> +                         abi_long arg2, abi_long arg3, abi_long arg4,

> +                         abi_long arg5, abi_long arg6, abi_long arg7,

> +                         abi_long arg8);

> +

> +#define IMPL(NAME) \

> +static abi_long impl_##NAME(void *cpu_env, unsigned num, abi_long arg1,   \

> +                            abi_long arg2, abi_long arg3, abi_long arg4,  \

> +                            abi_long arg5, abi_long arg6, abi_long arg7,  \

> +                            abi_long arg8)

> +

>  /* This is an internal helper for do_syscall so that it is easier

>   * to have a single return point, so that actions, such as logging

>   * of syscall results, can be performed.

> @@ -11740,23 +11751,6 @@ static abi_long do_syscall1(void *cpu_env, unsigned num, abi_long arg1,

>          return get_errno(safe_tgkill((int)arg1, (int)arg2,

>                           target_to_host_signal(arg3)));

>

> -#ifdef TARGET_NR_set_robust_list

> -    case TARGET_NR_set_robust_list:

> -    case TARGET_NR_get_robust_list:

> -        /* The ABI for supporting robust futexes has userspace pass

> -         * the kernel a pointer to a linked list which is updated by

> -         * userspace after the syscall; the list is walked by the kernel

> -         * when the thread exits. Since the linked list in QEMU guest

> -         * memory isn't a valid linked list for the host and we have

> -         * no way to reliably intercept the thread-death event, we can't

> -         * support these. Silently return ENOSYS so that guest userspace

> -         * falls back to a non-robust futex implementation (which should

> -         * be OK except in the corner case of the guest crashing while

> -         * holding a mutex that is shared with another process via

> -         * shared memory).

> -         */

> -        return -TARGET_ENOSYS;

> -#endif

>

>  #if defined(TARGET_NR_utimensat)

>      case TARGET_NR_utimensat:

> @@ -12412,6 +12406,54 @@ static abi_long do_syscall1(void *cpu_env, unsigned num, abi_long arg1,

>      return ret;

>  }

>

> +/* The default action for a syscall not listed in syscall_table is to

> + * log the missing syscall.  If a syscall is intentionally emulated as

> + * not present, then list it with impl_enosys as the implementation,

> + * which will avoid the logging.

> + */

> +IMPL(enosys)

> +{

> +    return -TARGET_ENOSYS;

> +}

> +

> +/* For a given syscall number, return a function implementing it.

> + * Do this via switch statement instead of table because some targets

> + * do not begin at 0 and others have a large split in the middle of

> + * the numbers.  The compiler should be able to produce a dense table.

> + */

> +static impl_fn *syscall_table(unsigned num)

> +{

> +#define SYSCALL_WITH(X, Y)    case TARGET_NR_##X: return impl_##Y

> +#define SYSCALL(X)            SYSCALL_WITH(X, X)

> +

> +    switch (num) {

> +        /* The ABI for supporting robust futexes has userspace pass

> +         * the kernel a pointer to a linked list which is updated by

> +         * userspace after the syscall; the list is walked by the kernel

> +         * when the thread exits. Since the linked list in QEMU guest

> +         * memory isn't a valid linked list for the host and we have

> +         * no way to reliably intercept the thread-death event, we can't

> +         * support these. Silently return ENOSYS so that guest userspace

> +         * falls back to a non-robust futex implementation (which should

> +         * be OK except in the corner case of the guest crashing while

> +         * holding a mutex that is shared with another process via

> +         * shared memory).

> +         */

> +        SYSCALL_WITH(get_robust_list, enosys);

> +        SYSCALL_WITH(set_robust_list, enosys);

> +

> +        /*

> +         * Other syscalls listed in collation order, with '_' ignored.

> +         */

> +    }


I was expecting this to be a table lookup, something like
   return syscalls[num].impl;

where the other entries in the syscalls[num] structs would be
for instance the strace data we currently have in strace.list.

thanks
-- PMM
Peter Maydell June 10, 2018, 12:39 p.m. UTC | #2
On 10 June 2018 at 13:32, Peter Maydell <peter.maydell@linaro.org> wrote:
> On 10 June 2018 at 04:00, Richard Henderson

> <richard.henderson@linaro.org> wrote:

>> At the same time, split out set_robust_list and get_robust_list.

>> Put them together, along with their block comment, at the top

>> of syscall_table.

>>

>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>


>> +/* For a given syscall number, return a function implementing it.

>> + * Do this via switch statement instead of table because some targets

>> + * do not begin at 0 and others have a large split in the middle of

>> + * the numbers.  The compiler should be able to produce a dense table.

>> + */


> I was expecting this to be a table lookup, something like

>    return syscalls[num].impl;

>

> where the other entries in the syscalls[num] structs would be

> for instance the strace data we currently have in strace.list.


Ah, I see the comment covers this. I'd still rather we had all
the information related to a syscall in one place, though -- this
way we end up with the ifdefs and so on which determine whether
a syscall is implemented having to be duplicated:
 (a) in the implementation
 (b) in this switch code
 (c) in the handling of strace
It would be cleaner to have a single
#if something
static foo_impl(..) { ... }
static syscall_impl foo = {
    .name = "foo",
    .impl = foo_impl,
    .strace_stuff = ...,
};
register_syscall(foo);
#endif

Hash table?

thanks
-- PMM
Richard Henderson June 10, 2018, 7:03 p.m. UTC | #3
On 06/10/2018 02:39 AM, Peter Maydell wrote:
> It would be cleaner to have a single

> #if something

> static foo_impl(..) { ... }

> static syscall_impl foo = {

>     .name = "foo",

>     .impl = foo_impl,

>     .strace_stuff = ...,

> };

> register_syscall(foo);

> #endif

> 

> Hash table?


It would be cleaner to have everything in one place, yes.  I just replied along
those lines elsewhere in the thread.

Indeed, now that I'm thinking along the lines of strace, I'm thinking that
there should be a separate argument extraction step that would be shared by
both strace and syscall implementation.

I'm not a fan of register_syscall(foo) and the startup costs that implies.

The set of syscalls that we support is fixed at compile time.  We should have a
compile-time generation step that builds everything that is required.  Whether
this is C emitting C, or python emitting C, I do not yet have an opinion.


r~
diff mbox series

Patch

diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 46f123ee13..8678e749ee 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -7947,6 +7947,17 @@  static int host_to_target_cpu_mask(const unsigned long *host_mask,
     return 0;
 }
 
+typedef abi_long impl_fn(void *cpu_env, unsigned num, abi_long arg1,
+                         abi_long arg2, abi_long arg3, abi_long arg4,
+                         abi_long arg5, abi_long arg6, abi_long arg7,
+                         abi_long arg8);
+
+#define IMPL(NAME) \
+static abi_long impl_##NAME(void *cpu_env, unsigned num, abi_long arg1,   \
+                            abi_long arg2, abi_long arg3, abi_long arg4,  \
+                            abi_long arg5, abi_long arg6, abi_long arg7,  \
+                            abi_long arg8)
+
 /* This is an internal helper for do_syscall so that it is easier
  * to have a single return point, so that actions, such as logging
  * of syscall results, can be performed.
@@ -11740,23 +11751,6 @@  static abi_long do_syscall1(void *cpu_env, unsigned num, abi_long arg1,
         return get_errno(safe_tgkill((int)arg1, (int)arg2,
                          target_to_host_signal(arg3)));
 
-#ifdef TARGET_NR_set_robust_list
-    case TARGET_NR_set_robust_list:
-    case TARGET_NR_get_robust_list:
-        /* The ABI for supporting robust futexes has userspace pass
-         * the kernel a pointer to a linked list which is updated by
-         * userspace after the syscall; the list is walked by the kernel
-         * when the thread exits. Since the linked list in QEMU guest
-         * memory isn't a valid linked list for the host and we have
-         * no way to reliably intercept the thread-death event, we can't
-         * support these. Silently return ENOSYS so that guest userspace
-         * falls back to a non-robust futex implementation (which should
-         * be OK except in the corner case of the guest crashing while
-         * holding a mutex that is shared with another process via
-         * shared memory).
-         */
-        return -TARGET_ENOSYS;
-#endif
 
 #if defined(TARGET_NR_utimensat)
     case TARGET_NR_utimensat:
@@ -12412,6 +12406,54 @@  static abi_long do_syscall1(void *cpu_env, unsigned num, abi_long arg1,
     return ret;
 }
 
+/* The default action for a syscall not listed in syscall_table is to
+ * log the missing syscall.  If a syscall is intentionally emulated as
+ * not present, then list it with impl_enosys as the implementation,
+ * which will avoid the logging.
+ */
+IMPL(enosys)
+{
+    return -TARGET_ENOSYS;
+}
+
+/* For a given syscall number, return a function implementing it.
+ * Do this via switch statement instead of table because some targets
+ * do not begin at 0 and others have a large split in the middle of
+ * the numbers.  The compiler should be able to produce a dense table.
+ */
+static impl_fn *syscall_table(unsigned num)
+{
+#define SYSCALL_WITH(X, Y)    case TARGET_NR_##X: return impl_##Y
+#define SYSCALL(X)            SYSCALL_WITH(X, X)
+
+    switch (num) {
+        /* The ABI for supporting robust futexes has userspace pass
+         * the kernel a pointer to a linked list which is updated by
+         * userspace after the syscall; the list is walked by the kernel
+         * when the thread exits. Since the linked list in QEMU guest
+         * memory isn't a valid linked list for the host and we have
+         * no way to reliably intercept the thread-death event, we can't
+         * support these. Silently return ENOSYS so that guest userspace
+         * falls back to a non-robust futex implementation (which should
+         * be OK except in the corner case of the guest crashing while
+         * holding a mutex that is shared with another process via
+         * shared memory).
+         */
+        SYSCALL_WITH(get_robust_list, enosys);
+        SYSCALL_WITH(set_robust_list, enosys);
+
+        /*
+         * Other syscalls listed in collation order, with '_' ignored.
+         */
+    }
+
+#undef SYSCALL
+#undef SYSCALL_WITH
+
+    /* After do_syscall1 is fully split, this will be impl_enosys.  */
+    return do_syscall1;
+}
+
 abi_long do_syscall(void *cpu_env, unsigned num, abi_long arg1,
                     abi_long arg2, abi_long arg3, abi_long arg4,
                     abi_long arg5, abi_long arg6, abi_long arg7,
@@ -12419,6 +12461,7 @@  abi_long do_syscall(void *cpu_env, unsigned num, abi_long arg1,
 {
     CPUState *cpu = ENV_GET_CPU(cpu_env);
     abi_long ret;
+    impl_fn *fn;
 
 #ifdef DEBUG_ERESTARTSYS
     /* Debug-only code for exercising the syscall-restart code paths
@@ -12437,14 +12480,16 @@  abi_long do_syscall(void *cpu_env, unsigned num, abi_long arg1,
     trace_guest_user_syscall(cpu, num, arg1, arg2, arg3, arg4,
                              arg5, arg6, arg7, arg8);
 
+    fn = syscall_table(num);
+
     if (unlikely(do_strace)) {
         print_syscall(num, arg1, arg2, arg3, arg4, arg5, arg6);
-        ret = do_syscall1(cpu_env, num, arg1, arg2, arg3, arg4,
-                          arg5, arg6, arg7, arg8);
+        ret = fn(cpu_env, num, arg1, arg2, arg3, arg4,
+                 arg5, arg6, arg7, arg8);
         print_syscall_ret(num, ret);
     } else {
-        ret = do_syscall1(cpu_env, num, arg1, arg2, arg3, arg4,
-                          arg5, arg6, arg7, arg8);
+        ret = fn(cpu_env, num, arg1, arg2, arg3, arg4,
+                 arg5, arg6, arg7, arg8);
     }
 
     trace_guest_user_syscall_ret(cpu, num, ret);