diff mbox series

[v2,2/2] rusage: allow 64-bit times ru_utime/ru_stime

Message ID 20180420120605.1612248-2-arnd@arndb.de
State New
Headers show
Series [v2,1/2] y2038: rusage: Use __kernel_old_timeval for process times | expand

Commit Message

Arnd Bergmann April 20, 2018, 12:05 p.m. UTC
As part of the y2038 system call work, the getrusage/wait4/waitid system
calls came up, since they pass time data to user space in 'timeval'
format as required by POSIX. This means the existing kernel data structure
is no longer compatible with the one from user space once we have a C
library exposing 64-bit time_t, requiring the library to convert the
kernel structure into its own structure.

This patch moves that conversion into the kernel itself, providing a
set of system calls that can directly be used to implement the libc
getrusage/wait4/waitid functions as we have traditionally done.

There are two advantages to this:

- The new path becomes the native case, avoiding the conversion
  overhead for future 32-bit C libraries.  At least glibc will still
  have to implement a conversion logic as a fallback in order to run
  new applications on older kernels, but that does not have to be used
  on new kernels.
- The range for the ru_utime/ru_stime is no longer limited to
  a 31-bit second counter (about 68 years). That limit may theoretically
  be hit on large SMP systems with a single process running for an
  extended time, e.g. 256 concurrent threads running for more than
  97 days.  Note that there is no overflow in 2038, as all the times
  are relative to the start of a process.

The downside of this is obviously the added complexity of having three
additional system call entry points plus their respective compat handlers,
and updated syscall tables on each architecture (not included in this
patch). Overall, I think this is *not* worth it, but I feel it's important
to show how it can be done and what the cost is.

There are probably some minor improvements that can be implemented on top,
as well as bugs that I introduce.  When reviewing this patch, let's for
now focus instead on the question whether we want it at all or not.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>

---
 arch/alpha/kernel/osf_sys.c   |   2 +-
 include/linux/compat.h        |  26 ++++++++-
 include/linux/resource.h      |   4 +-
 include/linux/sched/task.h    |   4 +-
 include/linux/syscalls.h      |   8 +++
 include/uapi/linux/resource.h |  29 ++++++++++
 kernel/compat.c               |  30 ++++++++++-
 kernel/exit.c                 | 120 ++++++++++++++++++++++++++++++++++++++----
 kernel/sys.c                  |  74 +++++++++++++++++++++++---
 9 files changed, 275 insertions(+), 22 deletions(-)

-- 
2.9.0

Comments

Ingo Molnar June 21, 2018, 3:49 p.m. UTC | #1
(belated reply)

* Arnd Bergmann <arnd@arndb.de> wrote:

> +int put_compat_rusage_time64(const struct __kernel_rusage *r,

> +			     struct compat_rusage_time64 __user *ru)

> +{

> +	struct compat_rusage_time64 r32;

> +	memset(&r32, 0, sizeof(r32));

> +	r32.ru_utime.tv_sec = r->ru_utime.tv_sec;

> +	r32.ru_utime.tv_usec = r->ru_utime.tv_usec;

> +	r32.ru_stime.tv_sec = r->ru_stime.tv_sec;

> +	r32.ru_stime.tv_usec = r->ru_stime.tv_usec;

> +	r32.ru_maxrss = r->ru_maxrss;

> +	r32.ru_ixrss = r->ru_ixrss;

> +	r32.ru_idrss = r->ru_idrss;

> +	r32.ru_isrss = r->ru_isrss;

> +	r32.ru_minflt = r->ru_minflt;

> +	r32.ru_majflt = r->ru_majflt;

> +	r32.ru_nswap = r->ru_nswap;

> +	r32.ru_inblock = r->ru_inblock;

> +	r32.ru_oublock = r->ru_oublock;

> +	r32.ru_msgsnd = r->ru_msgsnd;

> +	r32.ru_msgrcv = r->ru_msgrcv;

> +	r32.ru_nsignals = r->ru_nsignals;

> +	r32.ru_nvcsw = r->ru_nvcsw;

> +	r32.ru_nivcsw = r->ru_nivcsw;


Could you please vertically align the right side of the initialization as well? 
Much easier to check at a glance.

> +	user_access_begin();

> +	unsafe_put_user(signo, &infop->si_signo, Efault);

> +	unsafe_put_user(0, &infop->si_errno, Efault);

> +	unsafe_put_user(info.cause, &infop->si_code, Efault);

> +	unsafe_put_user(info.pid, &infop->si_pid, Efault);

> +	unsafe_put_user(info.uid, &infop->si_uid, Efault);

> +	unsafe_put_user(info.status, &infop->si_status, Efault);

> +	user_access_end();


This too would look nicer the following way:

> +	user_access_begin();

> +	unsafe_put_user(signo,		&infop->si_signo,	Efault);

> +	unsafe_put_user(0,		&infop->si_errno,	Efault);

> +	unsafe_put_user(info.cause,	&infop->si_code,	Efault);

> +	unsafe_put_user(info.pid,	&infop->si_pid,		Efault);

> +	unsafe_put_user(info.uid,	&infop->si_uid,		Efault);

> +	unsafe_put_user(info.status,	&infop->si_status,	Efault);

> +	user_access_end();


Which tabulated form made me notice the info.cause / si_code asymmetry - and a 
brief check of the source shows that it's correct. No way would I have noticed it 
in the jumbled up form above, so I think aligning such mass-initializations makes 
sense.

> +	memset(&r, 0, sizeof(r));

> +	r.ru_utime.tv_sec = rk->ru_utime.tv_sec;

> +	r.ru_utime.tv_usec = rk->ru_utime.tv_usec;

> +	r.ru_stime.tv_sec = rk->ru_stime.tv_sec;

> +	r.ru_stime.tv_usec = rk->ru_stime.tv_usec;

> +	r.ru_maxrss = rk->ru_maxrss;

> +	r.ru_ixrss = rk->ru_ixrss;

> +	r.ru_idrss = rk->ru_idrss;

> +	r.ru_isrss = rk->ru_isrss;

> +	r.ru_minflt = rk->ru_minflt;

> +	r.ru_majflt = rk->ru_majflt;

> +	r.ru_nswap = rk->ru_nswap;

> +	r.ru_inblock = rk->ru_inblock;

> +	r.ru_oublock = rk->ru_oublock;

> +	r.ru_msgsnd = rk->ru_msgsnd;

> +	r.ru_msgrcv = rk->ru_msgrcv;

> +	r.ru_nsignals = rk->ru_nsignals;

> +	r.ru_nvcsw = rk->ru_nvcsw;

> +	r.ru_nivcsw = rk->ru_nivcsw;


Ditto.

Thanks,

	Ingo
Arnd Bergmann June 21, 2018, 4:01 p.m. UTC | #2
On Thu, Jun 21, 2018 at 5:49 PM, Ingo Molnar <mingo@kernel.org> wrote:
> * Arnd Bergmann <arnd@arndb.de> wrote:

>

>> +int put_compat_rusage_time64(const struct __kernel_rusage *r,

>> +                          struct compat_rusage_time64 __user *ru)

>> +{

>> +     struct compat_rusage_time64 r32;

>> +     memset(&r32, 0, sizeof(r32));

>> +     r32.ru_utime.tv_sec = r->ru_utime.tv_sec;

>> +     r32.ru_utime.tv_usec = r->ru_utime.tv_usec;

>> +     r32.ru_stime.tv_sec = r->ru_stime.tv_sec;

>> +     r32.ru_stime.tv_usec = r->ru_stime.tv_usec;

>> +     r32.ru_maxrss = r->ru_maxrss;

>> +     r32.ru_ixrss = r->ru_ixrss;

>> +     r32.ru_idrss = r->ru_idrss;

>> +     r32.ru_isrss = r->ru_isrss;

>> +     r32.ru_minflt = r->ru_minflt;

>> +     r32.ru_majflt = r->ru_majflt;

>> +     r32.ru_nswap = r->ru_nswap;

>> +     r32.ru_inblock = r->ru_inblock;

>> +     r32.ru_oublock = r->ru_oublock;

>> +     r32.ru_msgsnd = r->ru_msgsnd;

>> +     r32.ru_msgrcv = r->ru_msgrcv;

>> +     r32.ru_nsignals = r->ru_nsignals;

>> +     r32.ru_nvcsw = r->ru_nvcsw;

>> +     r32.ru_nivcsw = r->ru_nivcsw;

>

> Could you please vertically align the right side of the initialization as well?

> Much easier to check at a glance.

...

> Which tabulated form made me notice the info.cause / si_code asymmetry - and a

> brief check of the source shows that it's correct. No way would I have noticed it

> in the jumbled up form above, so I think aligning such mass-initializations makes

> sense.


Sure, no problem. Do you have an opinion on the question I raised in the
first patch [1], i.e. whether we actually want this to be done this way in the
kernel, or one of the other approaches I described there?

Thanks for taking a look here already!

      Arnd

[1] https://patchwork.kernel.org/patch/10352507/
Ingo Molnar June 21, 2018, 4:11 p.m. UTC | #3
* Arnd Bergmann <arnd@arndb.de> wrote:

> Sure, no problem. Do you have an opinion on the question I raised in the

> first patch [1], i.e. whether we actually want this to be done this way in the

> kernel, or one of the other approaches I described there?


So this looks like the most forward looking variant:

> a) deprecate the wait4() and getrusage() system calls, and create

>    a set of kernel interfaces based around a newly defined structure that

>    could solve multiple problems at once, e.g. provide more fine-grained

>    timestamps. The C library could then implement the posix interfaces

>    on top of the new system calls.


... but given the pretty long propagation time of new ABIs, is this a good 
solution? What would the limitations/trade-offs be on old-ABI systems?

Thanks,

	Ingo
Arnd Bergmann June 21, 2018, 4:25 p.m. UTC | #4
On Thu, Jun 21, 2018 at 6:11 PM, Ingo Molnar <mingo@kernel.org> wrote:
>

> * Arnd Bergmann <arnd@arndb.de> wrote:

>

>> Sure, no problem. Do you have an opinion on the question I raised in the

>> first patch [1], i.e. whether we actually want this to be done this way in the

>> kernel, or one of the other approaches I described there?

>

> So this looks like the most forward looking variant:

>

>> a) deprecate the wait4() and getrusage() system calls, and create

>>    a set of kernel interfaces based around a newly defined structure that

>>    could solve multiple problems at once, e.g. provide more fine-grained

>>    timestamps. The C library could then implement the posix interfaces

>>    on top of the new system calls.

>

> ... but given the pretty long propagation time of new ABIs, is this a good

> solution? What would the limitations/trade-offs be on old-ABI systems?


The main purpose of this is to enable consistent 64-bit time_t interfaces in
user space, and for most users this would not change anything as the
existing glibc (both 64-bit and 32-bit) can continue calling the same
interfaces as before.

For those users that are interested in 64-bit time_t on 32-bit binaries,
the first step would be to change the glibc implementation to emulate
the existing interfaces with the new time_t on top of the new syscall
rather than the old (unsafe) syscall. Those users already require both
a new kernel and a new glibc version.

If the new kernel interfaces offer a real benefit, others could start
using them directly as soon as they have updated the libc version.
Note that glibc has not updated their kernel headers version for
a long time, they still allow building with linux-3.2 header files.
However, the glibc maintainers have indicated that they would
probably update that requirement to the then-latest version
when adding support to the new 64-bit time_t syscalls, so this
would become available immediately to all users of the new glibc
version.

However, the other question that has to be asked then is whether
there is anything wrong with wait4()/waitid() and getrusuage() that
we want to change beyond the time value passing. We have
answered a similar question with 'yes' for stat(), which has led
to the introduction of statx(), but so far I expect all other syscalls
to start compatible. See [1] for my current list. In that list, I have
currently marked waitid() and getrusuage() as not to be addressed,
i.e. my approach c) of applying only the first of the two patches but
not the second.

       Arnd

[1] https://docs.google.com/spreadsheets/d/1HCYwHXxs48TsTb6IGUduNjQnmfRvMPzCN6T_0YiQwis/edit#gid=0
Ingo Molnar June 22, 2018, 2:16 a.m. UTC | #5
* Arnd Bergmann <arnd@arndb.de> wrote:

> However, the other question that has to be asked then is whether

> there is anything wrong with wait4()/waitid() and getrusuage() that

> we want to change beyond the time value passing. We have

> answered a similar question with 'yes' for stat(), which has led

> to the introduction of statx(),


So we are thinking about adding wait5() in essence, right?

One thing we might want to look into whether the wait4() and waitid() ABIs could 
be 'merged', by making wait4() essentially a natural special case of waitid().

This would mean that the only new system call we'd have to add is waitid2() in 
essence, which would solve both the rusage layout problem and would offer a 
unified ABI.

If that makes sense (it might not!!), then I'd also modernize waitid2() by making 
it attribute structure based, have a length field and make the ABI extensible from 
now on going forward without having to introduce a new syscall variant every time 
we come up with something new...

I.e. how the perf syscall does ABI extensions: we've had dozens of ABI extensions, 
some of them pretty complex, and not a single time did we have to modify glibc and 
tooling was able to adapt quickly yet in a both backwards and forwards compatible 
fashion.

Another, simpler example is the new sys_sched_setattr() syscall, that too is using 
the perf_copy_attr() ABI method, via sched_copy_attr(). (With a minor 
compatibility quirk of SCHED_ATTR_SIZE_VER0 that a new wait ABI wouldn't have to 
do - i.e. it could be made even simpler.)

This way we only have:

SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, unsigned int, flags)

But even 'pid' and 'flags' could have been part of the attribute, i.e. one we pick 
up an attribute structure from user-space we can have really low argument count 
system calls. This also concentrates all the compat concerns into handling the 
attribute structure properly - no weird per-arch artifacts and quirks with 4-5-6 
system call arguments.

Thanks,

	Ingo
Eric W. Biederman June 22, 2018, 5:45 p.m. UTC | #6
Ingo Molnar <mingo@kernel.org> writes:

> * Arnd Bergmann <arnd@arndb.de> wrote:

>

>> However, the other question that has to be asked then is whether

>> there is anything wrong with wait4()/waitid() and getrusuage() that

>> we want to change beyond the time value passing. We have

>> answered a similar question with 'yes' for stat(), which has led

>> to the introduction of statx(),

>

> So we are thinking about adding wait5() in essence, right?


> One thing we might want to look into whether the wait4() and waitid() ABIs could 

> be 'merged', by making wait4() essentially a natural special case of

> waitid().


Essentially waitid(2) not waitid(3) has already seen this merger.
In that there is nothing to wait for that you can not already
expression with waitid.  status vs siginfo is a little different
but the information is encoded in both.

And waitid(2) optionally returns a struct rusage.

> This would mean that the only new system call we'd have to add is waitid2() in 

> essence, which would solve both the rusage layout problem and would offer a 

> unified ABI.

>

> If that makes sense (it might not!!), then I'd also modernize waitid2() by making 

> it attribute structure based, have a length field and make the ABI extensible from 

> now on going forward without having to introduce a new syscall variant every time 

> we come up with something new...


The only part where something is not parameterized in waitid is with the
return of rusage.

What to wait for takes an explicit type parameter.
What is being returned in siginfo returns an si_code to describe how
to decode it.

If it weren't for the zombie being gone after waitid returns I don't
think it would make any sense to combine getrusage and waitid together
at all.

> I.e. how the perf syscall does ABI extensions: we've had dozens of ABI extensions, 

> some of them pretty complex, and not a single time did we have to modify glibc and 

> tooling was able to adapt quickly yet in a both backwards and forwards compatible 

> fashion.

>

> Another, simpler example is the new sys_sched_setattr() syscall, that too is using 

> the perf_copy_attr() ABI method, via sched_copy_attr(). (With a minor 

> compatibility quirk of SCHED_ATTR_SIZE_VER0 that a new wait ABI wouldn't have to 

> do - i.e. it could be made even simpler.)

>

> This way we only have:

>

> SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, unsigned int, flags)

>

> But even 'pid' and 'flags' could have been part of the attribute, i.e. one we pick 

> up an attribute structure from user-space we can have really low argument count 

> system calls. This also concentrates all the compat concerns into handling the 

> attribute structure properly - no weird per-arch artifacts and quirks with 4-5-6 

> system call arguments.


The trouble with attributes is that means you can't filter your system
call arguments with seccomp.  Which most of the time is a pretty big
downside.


From what I have seen the only truly interesting case for extending
waitid is something file descriptor based so the parent/child
relationship is not necessary to wait for a process to terminate.

As for getrusage.  If a sane union of the rusage limits and cgroups or
something like cgroups could be devised.  That would be ideal.  Of
course except for the memory cgroups the similarity to the resource
usage measurments and limits really isn't there.  So I don't know if
merging them would be a real possibility.

So I suspect the simplest thing to do would be to set a flag in the
idtype member of waitid that says give me rusage64 and then we would
be done.  Alternately we could use the low bits of the resource usage
pointer.  Assuming we don't want to introduce another syscall that is.
I really don't see much incremental extensibility potential in the wait
or rusage interface right now.

Eric
Ingo Molnar June 24, 2018, 7:12 a.m. UTC | #7
* Eric W. Biederman <ebiederm@xmission.com> wrote:

> The trouble with attributes is that means you can't filter your system

> call arguments with seccomp. [...]


There's nothing keeping seccomp from securely fetching those arguments and 
extending filtering to them as well ...

Allowing that would make sense for a lot of other system calls as well.

Thanks,

	Ingo
Eric W. Biederman June 25, 2018, 1:26 a.m. UTC | #8
Ingo Molnar <mingo@kernel.org> writes:

> * Eric W. Biederman <ebiederm@xmission.com> wrote:

>

>> The trouble with attributes is that means you can't filter your system

>> call arguments with seccomp. [...]

>

> There's nothing keeping seccomp from securely fetching those arguments and 

> extending filtering to them as well ...

>

> Allowing that would make sense for a lot of other system calls as

> well.


Possibly.  The challenge is that if the fetch for the kernel to use
those arguments is different from the fetch of seccomp to test those
arguments you have a time of test vs time of use race.

Given the location of the seccomp hook at the kernel user space border
there is no easy way for seccomp to share the fetch with the system
call itself.

So I don't see how seccomp could perform the fetch securely.

Eric
Ingo Molnar June 25, 2018, 9:14 a.m. UTC | #9
* Eric W. Biederman <ebiederm@xmission.com> wrote:

> Ingo Molnar <mingo@kernel.org> writes:

> 

> > * Eric W. Biederman <ebiederm@xmission.com> wrote:

> >

> >> The trouble with attributes is that means you can't filter your system

> >> call arguments with seccomp. [...]

> >

> > There's nothing keeping seccomp from securely fetching those arguments and 

> > extending filtering to them as well ...

> >

> > Allowing that would make sense for a lot of other system calls as

> > well.

> 

> Possibly.  The challenge is that if the fetch for the kernel to use

> those arguments is different from the fetch of seccomp to test those

> arguments you have a time of test vs time of use race.


Those fetched values should obviously then be used to call permitted system calls.

> Given the location of the seccomp hook at the kernel user space border

> there is no easy way for seccomp to share the fetch with the system

> call itself.

> 

> So I don't see how seccomp could perform the fetch securely.


Looks like more of a seccomp mis-design/mis-implementation than some fundamental 
problem.

Mis-designed security features should not hinder system call design.

Thanks,

	Ingo
Arnd Bergmann June 25, 2018, 11:42 a.m. UTC | #10
On Fri, Jun 22, 2018 at 7:45 PM, Eric W. Biederman
<ebiederm@xmission.com> wrote:
> Ingo Molnar <mingo@kernel.org> writes:

>

> So I suspect the simplest thing to do would be to set a flag in the

> idtype member of waitid that says give me rusage64 and then we would

> be done.


It would have to be a flag in both the 'idtype' field of waitid(), and also
'who' field of getrusage(), which unfortunately uses a separate set of
flags. Not hard to do, but still a bit more complexity.

> Alternately we could use the low bits of the resource usage

> pointer.  Assuming we don't want to introduce another syscall that is.

> I really don't see much incremental extensibility potential in the wait

> or rusage interface right now.


This is also my conclusion after looking at how various other
operating systems implement getrusage() and wait4() today.
It seems that this is one of the most stable APIs, everyone uses
exactly the same structure layout (Linux/x32 being one exception,
they have the 64-bit Linux compatible layout using __s64 instead of
long members).

For the other ~20 system calls we introduce for y2038, the general
idea has been to stay mostly compatible with the source level interface,
just using a new syscall number. statx() is a notable exception here,
with clock_adjtime() and getitimer()/setitimer() still being undecided.

If we don't do an extensible layout or any other new fields, there
are still the open questions about whether any types should change:

- changing everything to 64-bit would allow sharing the kernel
  code between compat and native
- changing only __old_kernel_timeval to new 64-bit timeval would
  be the simplest user space change (only the syscall number
  changes with sizeof(time_t)), avoiding an extra copy thorough
  the user space stack.
- changing timeval to (64-bit) timespec would seem the most
  sensible update, since it avoids the silly nanosecond-to-
  microsecond conversion in the kernel (glibc would still need
  to do it for compatibility). This is what I'm considering for
  getitimer/setitimer,  too.

         Arnd
Eric W. Biederman June 25, 2018, 4:21 p.m. UTC | #11
Ingo Molnar <mingo@kernel.org> writes:

> * Eric W. Biederman <ebiederm@xmission.com> wrote:

>

>> Ingo Molnar <mingo@kernel.org> writes:

>> 

>> > * Eric W. Biederman <ebiederm@xmission.com> wrote:

>> >

>> >> The trouble with attributes is that means you can't filter your system

>> >> call arguments with seccomp. [...]

>> >

>> > There's nothing keeping seccomp from securely fetching those arguments and 

>> > extending filtering to them as well ...

>> >

>> > Allowing that would make sense for a lot of other system calls as

>> > well.

>> 

>> Possibly.  The challenge is that if the fetch for the kernel to use

>> those arguments is different from the fetch of seccomp to test those

>> arguments you have a time of test vs time of use race.

>

> Those fetched values should obviously then be used to call permitted

> system calls.


Agreed.  To my knowledge no one has figured out how to make that work
yet.  For the most part it has been unnecessary.

>> Given the location of the seccomp hook at the kernel user space border

>> there is no easy way for seccomp to share the fetch with the system

>> call itself.

>> 

>> So I don't see how seccomp could perform the fetch securely.

>

> Looks like more of a seccomp mis-design/mis-implementation than some fundamental 

> problem.


Frankly.  Given that there are some very good solutions in other
operating systems, I think the misdesign is in unix/linux not providing
a good answer to what to do when you need more than 6 arguments to a
system call.

> Mis-designed security features should not hinder system call design.


I certainly agree that seccomp should not be the sole reason for not
doing something.  However there are lots of reasons to avoid
extensibility in general.

Excess extensibility has been the cause of more than one security issue.

Lots of flexibility comes at the price of lots of conditional execution
which tends to explode the test matrix of possibilities to test, with
the result that some combinations are never thought about or tested
because they don't make sense to combine.  Then someone with
mischievious intent see that combination and thinks what happens when I
do this.

Further that conditional execution can frequently be the cause of slow
code as well.

So while there are many nice features of tagged values.  I don't think
they are a general solution.  The lack of seccomp support (today) is
just one downside among many.


I do think it would be nice to have a general pattern for those
system calls that require extensibility.  My gut feel says something
like the L4 pseudo registers (to give a maxium request size)
combined with something like netlink encoding would make a very
nice template for making fast and flexible system calls.

Eric
diff mbox series

Patch

diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index cad03ee445b3..aecdb48257b5 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -1185,7 +1185,7 @@  SYSCALL_DEFINE4(osf_wait4, pid_t, pid, int __user *, ustatus, int, options,
 {
 	unsigned int status = 0;
 	struct rusage32 r32;
-	struct rusage r;
+	struct __kernel_rusage r;
 	long err = kernel_wait4(pid, &status, options, &r);
 	if (err <= 0)
 		return err;
diff --git a/include/linux/compat.h b/include/linux/compat.h
index b73e2616a409..2ef30d314c48 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -105,7 +105,7 @@  typedef __compat_uid32_t	compat_uid_t;
 typedef __compat_gid32_t	compat_gid_t;
 
 struct compat_sel_arg_struct;
-struct rusage;
+struct __kernel_rusage;
 
 struct compat_itimerspec {
 	struct compat_timespec it_interval;
@@ -321,9 +321,31 @@  struct compat_rusage {
 	compat_long_t	ru_nivcsw;
 };
 
-extern int put_compat_rusage(const struct rusage *,
+struct compat_rusage_time64 {
+	struct __kernel_rusage_timeval ru_utime;
+	struct __kernel_rusage_timeval ru_stime;
+	compat_long_t	ru_maxrss;
+	compat_long_t	ru_ixrss;
+	compat_long_t	ru_idrss;
+	compat_long_t	ru_isrss;
+	compat_long_t	ru_minflt;
+	compat_long_t	ru_majflt;
+	compat_long_t	ru_nswap;
+	compat_long_t	ru_inblock;
+	compat_long_t	ru_oublock;
+	compat_long_t	ru_msgsnd;
+	compat_long_t	ru_msgrcv;
+	compat_long_t	ru_nsignals;
+	compat_long_t	ru_nvcsw;
+	compat_long_t	ru_nivcsw;
+};
+
+extern int put_compat_rusage(const struct __kernel_rusage *,
 			     struct compat_rusage __user *);
 
+extern int put_compat_rusage_time64(const struct __kernel_rusage *,
+				    struct compat_rusage_time64 __user *);
+
 struct compat_siginfo;
 
 struct compat_dirent {
diff --git a/include/linux/resource.h b/include/linux/resource.h
index bdf491cbcab7..8cebf90e76b7 100644
--- a/include/linux/resource.h
+++ b/include/linux/resource.h
@@ -7,8 +7,10 @@ 
 
 struct task_struct;
 
-void getrusage(struct task_struct *p, int who, struct rusage *ru);
+void getrusage(struct task_struct *p, int who, struct __kernel_rusage *ru);
 int do_prlimit(struct task_struct *tsk, unsigned int resource,
 		struct rlimit *new_rlim, struct rlimit *old_rlim);
 
+int put_rusage(const struct __kernel_rusage *rk, struct rusage __user *ru);
+
 #endif
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 5be31eb7b266..cc54ae5e6010 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -10,7 +10,7 @@ 
 #include <linux/sched.h>
 
 struct task_struct;
-struct rusage;
+struct __kernel_rusage;
 union thread_union;
 
 /*
@@ -75,7 +75,7 @@  extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *,
 extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
 struct task_struct *fork_idle(int);
 extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
-extern long kernel_wait4(pid_t, int *, int, struct rusage *);
+extern long kernel_wait4(pid_t, int *, int, struct __kernel_rusage *);
 
 extern void free_task(struct task_struct *tsk);
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a756ab42894f..084360078f29 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -37,6 +37,7 @@  struct pollfd;
 struct rlimit;
 struct rlimit64;
 struct rusage;
+struct __kernel_rusage;
 struct sched_param;
 struct sched_attr;
 struct sel_arg_struct;
@@ -522,6 +523,10 @@  asmlinkage long sys_waitid(int which, pid_t pid,
 			   struct siginfo __user *infop,
 			   int options, struct rusage __user *ru);
 
+asmlinkage long sys_waitid_time64(int which, pid_t pid,
+			   struct siginfo __user *infop,
+			   int options, struct __kernel_rusage __user *ru);
+
 /* kernel/fork.c */
 asmlinkage long sys_set_tid_address(int __user *tidptr);
 asmlinkage long sys_unshare(unsigned long unshare_flags);
@@ -656,6 +661,7 @@  asmlinkage long sys_getrlimit(unsigned int resource,
 asmlinkage long sys_setrlimit(unsigned int resource,
 				struct rlimit __user *rlim);
 asmlinkage long sys_getrusage(int who, struct rusage __user *ru);
+asmlinkage long sys_getrusage_time64(int who, struct __kernel_rusage __user *ru);
 asmlinkage long sys_umask(int mask);
 asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 			unsigned long arg4, unsigned long arg5);
@@ -821,6 +827,8 @@  asmlinkage long sys_recvmmsg(int fd, struct mmsghdr __user *msg,
 
 asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr,
 				int options, struct rusage __user *ru);
+asmlinkage long sys_wait4_time64(pid_t pid, int __user *stat_addr,
+				int options, struct __kernel_rusage __user *ru);
 asmlinkage long sys_prlimit64(pid_t pid, unsigned int resource,
 				const struct rlimit64 __user *new_rlim,
 				struct rlimit64 __user *old_rlim);
diff --git a/include/uapi/linux/resource.h b/include/uapi/linux/resource.h
index 611d3745c70a..a822e716e122 100644
--- a/include/uapi/linux/resource.h
+++ b/include/uapi/linux/resource.h
@@ -50,6 +50,35 @@  struct	rusage {
 	__kernel_long_t	ru_nivcsw;	/* involuntary " */
 };
 
+/*
+ * __kernel_rusage is the binary that we expect 32-bit C libraries
+ * to provide for their 'struct rusage' after migrating to a 64-bit
+ * time_t.
+ */
+struct __kernel_rusage_timeval {
+	__s64 tv_sec;
+	__s64 tv_usec;
+};
+
+struct	__kernel_rusage {
+	struct __kernel_rusage_timeval	ru_utime; /* user time used */
+	struct __kernel_rusage_timeval	ru_stime; /* system time used */
+	__kernel_long_t	ru_maxrss;	/* maximum resident set size */
+	__kernel_long_t	ru_ixrss;	/* integral shared memory size */
+	__kernel_long_t	ru_idrss;	/* integral unshared data size */
+	__kernel_long_t	ru_isrss;	/* integral unshared stack size */
+	__kernel_long_t	ru_minflt;	/* page reclaims */
+	__kernel_long_t	ru_majflt;	/* page faults */
+	__kernel_long_t	ru_nswap;	/* swaps */
+	__kernel_long_t	ru_inblock;	/* block input operations */
+	__kernel_long_t	ru_oublock;	/* block output operations */
+	__kernel_long_t	ru_msgsnd;	/* messages sent */
+	__kernel_long_t	ru_msgrcv;	/* messages received */
+	__kernel_long_t	ru_nsignals;	/* signals received */
+	__kernel_long_t	ru_nvcsw;	/* voluntary context switches */
+	__kernel_long_t	ru_nivcsw;	/* involuntary " */
+};
+
 struct rlimit {
 	__kernel_ulong_t	rlim_cur;
 	__kernel_ulong_t	rlim_max;
diff --git a/kernel/compat.c b/kernel/compat.c
index 51a081b46832..e3cb7c14558a 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -234,7 +234,7 @@  COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how,
 
 #endif
 
-int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru)
+int put_compat_rusage(const struct __kernel_rusage *r, struct compat_rusage __user *ru)
 {
 	struct compat_rusage r32;
 	memset(&r32, 0, sizeof(r32));
@@ -261,6 +261,34 @@  int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru)
 	return 0;
 }
 
+int put_compat_rusage_time64(const struct __kernel_rusage *r,
+			     struct compat_rusage_time64 __user *ru)
+{
+	struct compat_rusage_time64 r32;
+	memset(&r32, 0, sizeof(r32));
+	r32.ru_utime.tv_sec = r->ru_utime.tv_sec;
+	r32.ru_utime.tv_usec = r->ru_utime.tv_usec;
+	r32.ru_stime.tv_sec = r->ru_stime.tv_sec;
+	r32.ru_stime.tv_usec = r->ru_stime.tv_usec;
+	r32.ru_maxrss = r->ru_maxrss;
+	r32.ru_ixrss = r->ru_ixrss;
+	r32.ru_idrss = r->ru_idrss;
+	r32.ru_isrss = r->ru_isrss;
+	r32.ru_minflt = r->ru_minflt;
+	r32.ru_majflt = r->ru_majflt;
+	r32.ru_nswap = r->ru_nswap;
+	r32.ru_inblock = r->ru_inblock;
+	r32.ru_oublock = r->ru_oublock;
+	r32.ru_msgsnd = r->ru_msgsnd;
+	r32.ru_msgrcv = r->ru_msgrcv;
+	r32.ru_nsignals = r->ru_nsignals;
+	r32.ru_nvcsw = r->ru_nvcsw;
+	r32.ru_nivcsw = r->ru_nivcsw;
+	if (copy_to_user(ru, &r32, sizeof(r32)))
+		return -EFAULT;
+	return 0;
+}
+
 static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr,
 				    unsigned len, struct cpumask *new_mask)
 {
diff --git a/kernel/exit.c b/kernel/exit.c
index c3c7ac560114..5088c671ea74 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -995,7 +995,7 @@  struct wait_opts {
 
 	struct waitid_info	*wo_info;
 	int			wo_stat;
-	struct rusage		*wo_rusage;
+	struct __kernel_rusage	*wo_rusage;
 
 	wait_queue_entry_t		child_wait;
 	int			notask_error;
@@ -1548,7 +1548,7 @@  static long do_wait(struct wait_opts *wo)
 }
 
 static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
-			  int options, struct rusage *ru)
+			  int options, struct __kernel_rusage *ru)
 {
 	struct wait_opts wo;
 	struct pid *pid = NULL;
@@ -1596,7 +1596,7 @@  static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
 SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 		infop, int, options, struct rusage __user *, ru)
 {
-	struct rusage r;
+	struct __kernel_rusage r;
 	struct waitid_info info = {.status = 0};
 	long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
 	int signo = 0;
@@ -1604,7 +1604,41 @@  SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 	if (err > 0) {
 		signo = SIGCHLD;
 		err = 0;
-		if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
+		if (ru && put_rusage(&r, ru))
+			return -EFAULT;
+	}
+	if (!infop)
+		return err;
+
+	if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop)))
+		return -EFAULT;
+
+	user_access_begin();
+	unsafe_put_user(signo, &infop->si_signo, Efault);
+	unsafe_put_user(0, &infop->si_errno, Efault);
+	unsafe_put_user(info.cause, &infop->si_code, Efault);
+	unsafe_put_user(info.pid, &infop->si_pid, Efault);
+	unsafe_put_user(info.uid, &infop->si_uid, Efault);
+	unsafe_put_user(info.status, &infop->si_status, Efault);
+	user_access_end();
+	return err;
+Efault:
+	user_access_end();
+	return -EFAULT;
+}
+
+SYSCALL_DEFINE5(waitid_time64, int, which, pid_t, upid, struct siginfo __user *,
+		infop, int, options, struct __kernel_rusage __user *, ru)
+{
+	struct __kernel_rusage r;
+	struct waitid_info info = {.status = 0};
+	long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
+	int signo = 0;
+
+	if (err > 0) {
+		signo = SIGCHLD;
+		err = 0;
+		if (ru && copy_to_user(ru, &r, sizeof(struct __kernel_rusage)))
 			return -EFAULT;
 	}
 	if (!infop)
@@ -1628,7 +1662,7 @@  SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 }
 
 long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
-		  struct rusage *ru)
+		  struct __kernel_rusage *ru)
 {
 	struct wait_opts wo;
 	struct pid *pid = NULL;
@@ -1673,11 +1707,24 @@  long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
 SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
 		int, options, struct rusage __user *, ru)
 {
-	struct rusage r;
+	struct __kernel_rusage r;
 	long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL);
 
 	if (err > 0) {
-		if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
+		if (ru && put_rusage(&r, ru))
+			return -EFAULT;
+	}
+	return err;
+}
+
+SYSCALL_DEFINE4(wait4_time64, pid_t, upid, int __user *, stat_addr,
+		int, options, struct __kernel_rusage __user *, ru)
+{
+	struct __kernel_rusage r;
+	long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL);
+
+	if (err > 0) {
+		if (ru && copy_to_user(ru, &r, sizeof(struct __kernel_rusage)))
 			return -EFAULT;
 	}
 	return err;
@@ -1703,7 +1750,7 @@  COMPAT_SYSCALL_DEFINE4(wait4,
 	int, options,
 	struct compat_rusage __user *, ru)
 {
-	struct rusage r;
+	struct __kernel_rusage r;
 	long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL);
 	if (err > 0) {
 		if (ru && put_compat_rusage(&r, ru))
@@ -1712,12 +1759,27 @@  COMPAT_SYSCALL_DEFINE4(wait4,
 	return err;
 }
 
+COMPAT_SYSCALL_DEFINE4(wait4_time64,
+	compat_pid_t, pid,
+	compat_uint_t __user *, stat_addr,
+	int, options,
+	struct compat_rusage_time64 __user *, ru)
+{
+	struct __kernel_rusage r;
+	long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL);
+	if (err > 0) {
+		if (ru && put_compat_rusage_time64(&r, ru))
+			return -EFAULT;
+	}
+	return err;
+}
+
 COMPAT_SYSCALL_DEFINE5(waitid,
 		int, which, compat_pid_t, pid,
 		struct compat_siginfo __user *, infop, int, options,
 		struct compat_rusage __user *, uru)
 {
-	struct rusage ru;
+	struct __kernel_rusage ru;
 	struct waitid_info info = {.status = 0};
 	long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL);
 	int signo = 0;
@@ -1754,6 +1816,46 @@  COMPAT_SYSCALL_DEFINE5(waitid,
 	user_access_end();
 	return -EFAULT;
 }
+
+COMPAT_SYSCALL_DEFINE5(waitid_time64,
+		int, which, compat_pid_t, pid,
+		struct compat_siginfo __user *, infop, int, options,
+		struct compat_rusage_time64 __user *, uru)
+{
+	struct __kernel_rusage ru;
+	struct waitid_info info = {.status = 0};
+	long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL);
+	int signo = 0;
+	if (err > 0) {
+		signo = SIGCHLD;
+		err = 0;
+		if (uru) {
+			/* kernel_waitid() overwrites everything in ru */
+			err = put_compat_rusage_time64(&ru, uru);
+			if (err)
+				return -EFAULT;
+		}
+	}
+
+	if (!infop)
+		return err;
+
+	if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop)))
+		return -EFAULT;
+
+	user_access_begin();
+	unsafe_put_user(signo, &infop->si_signo, Efault);
+	unsafe_put_user(0, &infop->si_errno, Efault);
+	unsafe_put_user(info.cause, &infop->si_code, Efault);
+	unsafe_put_user(info.pid, &infop->si_pid, Efault);
+	unsafe_put_user(info.uid, &infop->si_uid, Efault);
+	unsafe_put_user(info.status, &infop->si_status, Efault);
+	user_access_end();
+	return err;
+Efault:
+	user_access_end();
+	return -EFAULT;
+}
 #endif
 
 __weak void abort(void)
diff --git a/kernel/sys.c b/kernel/sys.c
index 1de538f622e8..5b5f2dc19e79 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1699,7 +1699,7 @@  SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
  *
  */
 
-static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r)
+static void accumulate_thread_rusage(struct task_struct *t, struct __kernel_rusage *r)
 {
 	r->ru_nvcsw += t->nvcsw;
 	r->ru_nivcsw += t->nivcsw;
@@ -1709,12 +1709,13 @@  static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r)
 	r->ru_oublock += task_io_get_oublock(t);
 }
 
-void getrusage(struct task_struct *p, int who, struct rusage *r)
+void getrusage(struct task_struct *p, int who, struct __kernel_rusage *r)
 {
 	struct task_struct *t;
 	unsigned long flags;
 	u64 tgutime, tgstime, utime, stime;
 	unsigned long maxrss = 0;
+	struct timespec64 ts;
 
 	memset((char *)r, 0, sizeof (*r));
 	utime = stime = 0;
@@ -1769,8 +1770,12 @@  void getrusage(struct task_struct *p, int who, struct rusage *r)
 	unlock_task_sighand(p, &flags);
 
 out:
-	r->ru_utime = ns_to_kernel_old_timeval(utime);
-	r->ru_stime = ns_to_kernel_old_timeval(stime);
+	ts = ns_to_timespec64(utime);
+	r->ru_utime.tv_sec = ts.tv_sec;
+	r->ru_utime.tv_usec = ts.tv_nsec / NSEC_PER_USEC;
+	ts = ns_to_timespec64(stime);
+	r->ru_stime.tv_sec = ts.tv_sec;
+	r->ru_stime.tv_usec = ts.tv_nsec / NSEC_PER_USEC;
 
 	if (who != RUSAGE_CHILDREN) {
 		struct mm_struct *mm = get_task_mm(p);
@@ -1783,10 +1788,54 @@  void getrusage(struct task_struct *p, int who, struct rusage *r)
 	r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
 }
 
-SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
+int put_rusage(const struct __kernel_rusage *rk, struct rusage __user *ru)
 {
 	struct rusage r;
 
+	if (IS_ENABLED(CONFIG_64BIT))
+		return copy_to_user(ru, &rk, sizeof(rk)) ? -EFAULT : 0;
+
+	memset(&r, 0, sizeof(r));
+	r.ru_utime.tv_sec = rk->ru_utime.tv_sec;
+	r.ru_utime.tv_usec = rk->ru_utime.tv_usec;
+	r.ru_stime.tv_sec = rk->ru_stime.tv_sec;
+	r.ru_stime.tv_usec = rk->ru_stime.tv_usec;
+	r.ru_maxrss = rk->ru_maxrss;
+	r.ru_ixrss = rk->ru_ixrss;
+	r.ru_idrss = rk->ru_idrss;
+	r.ru_isrss = rk->ru_isrss;
+	r.ru_minflt = rk->ru_minflt;
+	r.ru_majflt = rk->ru_majflt;
+	r.ru_nswap = rk->ru_nswap;
+	r.ru_inblock = rk->ru_inblock;
+	r.ru_oublock = rk->ru_oublock;
+	r.ru_msgsnd = rk->ru_msgsnd;
+	r.ru_msgrcv = rk->ru_msgrcv;
+	r.ru_nsignals = rk->ru_nsignals;
+	r.ru_nvcsw = rk->ru_nvcsw;
+	r.ru_nivcsw = rk->ru_nivcsw;
+	if (copy_to_user(ru, &r, sizeof(r)))
+		return -EFAULT;
+	return 0;
+}
+
+SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
+{
+	struct __kernel_rusage r;
+
+	if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
+	    who != RUSAGE_THREAD)
+		return -EINVAL;
+
+	getrusage(current, who, &r);
+	return put_rusage(&r, ru);
+}
+
+#ifndef CONFIG_64BIT
+SYSCALL_DEFINE2(getrusage_time64, int, who, struct __kernel_rusage __user *, ru)
+{
+	struct __kernel_rusage r;
+
 	if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
 	    who != RUSAGE_THREAD)
 		return -EINVAL;
@@ -1794,11 +1843,12 @@  SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
 	getrusage(current, who, &r);
 	return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
 }
+#endif
 
 #ifdef CONFIG_COMPAT
 COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru)
 {
-	struct rusage r;
+	struct __kernel_rusage r;
 
 	if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
 	    who != RUSAGE_THREAD)
@@ -1807,6 +1857,18 @@  COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru)
 	getrusage(current, who, &r);
 	return put_compat_rusage(&r, ru);
 }
+
+COMPAT_SYSCALL_DEFINE2(getrusage_time64, int, who, struct compat_rusage_time64 __user *, ru)
+{
+	struct __kernel_rusage r;
+
+	if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
+	    who != RUSAGE_THREAD)
+		return -EINVAL;
+
+	getrusage(current, who, &r);
+	return put_compat_rusage_time64(&r, ru);
+}
 #endif
 
 SYSCALL_DEFINE1(umask, int, mask)