diff mbox series

[02/11] mm: call import_iovec() instead of rw_copy_check_uvector() in process_vm_rw()

Message ID 20200921143434.707844-3-hch@lst.de
State New
Headers show
Series None | expand

Commit Message

Christoph Hellwig Sept. 21, 2020, 2:34 p.m. UTC
From: David Laight <David.Laight@ACULAB.COM>

This is the only direct call of rw_copy_check_uvector().  Removing it
will allow rw_copy_check_uvector() to be inlined into import_iovec(),
while only paying a minor price by setting up an otherwise unused
iov_iter in the process_vm_readv/process_vm_writev syscalls that aren't
in a super hot path.

Signed-off-by: David Laight <david.laight@aculab.com>
[hch: expanded the commit log, pass CHECK_IOVEC_ONLY instead of 0 for the
      compat case, handle CHECK_IOVEC_ONLY in iov_iter_init]
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 lib/iov_iter.c         |  2 +-
 mm/process_vm_access.c | 33 ++++++++++++++++++---------------
 2 files changed, 19 insertions(+), 16 deletions(-)

Comments

Matthew Wilcox Sept. 21, 2020, 2:48 p.m. UTC | #1
On Mon, Sep 21, 2020 at 04:34:25PM +0200, Christoph Hellwig wrote:
>  {
> -	WARN_ON(direction & ~(READ | WRITE));
> +	WARN_ON(direction & ~(READ | WRITE | CHECK_IOVEC_ONLY));

This is now a no-op because:

include/linux/fs.h:#define CHECK_IOVEC_ONLY -1

I'd suggest we renumber it to 2?

(READ is 0, WRITE is 1.  This WARN_ON should probably be
	WARN_ON(direction > CHECK_IOVEC_ONLY)
Al Viro Sept. 21, 2020, 3:02 p.m. UTC | #2
On Mon, Sep 21, 2020 at 04:34:25PM +0200, Christoph Hellwig wrote:
> From: David Laight <David.Laight@ACULAB.COM>
> 
> This is the only direct call of rw_copy_check_uvector().  Removing it
> will allow rw_copy_check_uvector() to be inlined into import_iovec(),
> while only paying a minor price by setting up an otherwise unused
> iov_iter in the process_vm_readv/process_vm_writev syscalls that aren't
> in a super hot path.

> @@ -443,7 +443,7 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
>  			const struct iovec *iov, unsigned long nr_segs,
>  			size_t count)
>  {
> -	WARN_ON(direction & ~(READ | WRITE));
> +	WARN_ON(direction & ~(READ | WRITE | CHECK_IOVEC_ONLY));
>  	direction &= READ | WRITE;

Ugh...

> -	rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV,
> -				   iovstack_r, &iov_r);
> +	rc = import_iovec(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV, &iov_r,
> +			  &iter_r);
>  	if (rc <= 0)
>  		goto free_iovecs;
>  
> -	rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write);
> +	rc = process_vm_rw_core(pid, &iter_l, iter_r.iov, iter_r.nr_segs,
> +				flags, vm_write);

... and ugh^2, since now you are not only setting a meaningless iov_iter,
you are creating a new place that pokes directly into struct iov_iter
guts.

Sure, moving rw_copy_check_uvector() over to lib/iov_iter.c makes sense.
But I would rather split the access_ok()-related checks out of that thing
and bury CHECK_IOVEC_ONLY.

Step 1: move the damn thing to lib/iov_iter.c (same as you do, but without
making it static)

Step 2: split it in two:

ssize_t rw_copy_check_uvector(const struct iovec __user * uvector,
                              unsigned long nr_segs, unsigned long fast_segs,
                              struct iovec *fast_pointer,
                              struct iovec **ret_pointer)
{
	unsigned long seg;
	ssize_t ret;
	struct iovec *iov = fast_pointer;

	*ret_pointer = fast_pointer;
	/*
	 * SuS says "The readv() function *may* fail if the iovcnt argument
	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
	 * traditionally returned zero for zero segments, so...
	 */
	if (nr_segs == 0)
		return 0;

	/*
	 * First get the "struct iovec" from user memory and
	 * verify all the pointers
	 */
	if (nr_segs > UIO_MAXIOV)
		return -EINVAL;

	if (nr_segs > fast_segs) {
		iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
		if (!iov)
			return -ENOMEM;
		*ret_pointer = iov;
	}
	if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector)))
		return -EFAULT;

	/*
	 * According to the Single Unix Specification we should return EINVAL
	 * if an element length is < 0 when cast to ssize_t or if the
	 * total length would overflow the ssize_t return value of the
	 * system call.
	 *
	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
	 * overflow case.
	 */
	ret = 0;
	for (seg = 0; seg < nr_segs; seg++) {
		void __user *buf = iov[seg].iov_base;
		ssize_t len = (ssize_t)iov[seg].iov_len;

		/* see if we we're about to use an invalid len or if
		 * it's about to overflow ssize_t */
		if (len < 0)
			return -EINVAL;
		if (len > MAX_RW_COUNT - ret) {
			len = MAX_RW_COUNT - ret;
			iov[seg].iov_len = len;
		}
		ret += len;
	}
	return ret;
}

/*
 *  This is merely an early sanity check; we do _not_ rely upon
 *  it when we get to the actual memory accesses.
 */
static bool check_iovecs(const struct iovec *iov, int nr_segs)
{
        for (seg = 0; seg < nr_segs; seg++) {
                void __user *buf = iov[seg].iov_base;
                ssize_t len = (ssize_t)iov[seg].iov_len;

                if (unlikely(!access_ok(buf, len)))
                        return false;
        }
	return true;
}

ssize_t import_iovec(int type, const struct iovec __user * uvector,
                 unsigned nr_segs, unsigned fast_segs,
                 struct iovec **iov, struct iov_iter *i)
{
	struct iovec *p;
	ssize_t n;

	n = rw_copy_check_uvector(uvector, nr_segs, fast_segs, *iov, &p);
	if (n > 0 && !check_iovecs(p, nr_segs))
		n = -EFAULT;
	if (n < 0) {
		if (p != *iov)
			kfree(p);
		*iov = NULL;
		return n;
	}
	iov_iter_init(i, type, p, nr_segs, n);
	*iov = p == *iov ? NULL : p;
	return n;
}

kill CHECK_IOVEC_ONLY and use rw_copy_check_uvector() without the type
argument in mm/process_vm_access.c

Saner that way, IMO...
David Laight Sept. 21, 2020, 3:21 p.m. UTC | #3
From: Al Viro
> Sent: 21 September 2020 16:02
> 
> On Mon, Sep 21, 2020 at 04:34:25PM +0200, Christoph Hellwig wrote:
> > From: David Laight <David.Laight@ACULAB.COM>
> >
> > This is the only direct call of rw_copy_check_uvector().  Removing it
> > will allow rw_copy_check_uvector() to be inlined into import_iovec(),
> > while only paying a minor price by setting up an otherwise unused
> > iov_iter in the process_vm_readv/process_vm_writev syscalls that aren't
> > in a super hot path.
> 
> > @@ -443,7 +443,7 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
> >  			const struct iovec *iov, unsigned long nr_segs,
> >  			size_t count)
> >  {
> > -	WARN_ON(direction & ~(READ | WRITE));
> > +	WARN_ON(direction & ~(READ | WRITE | CHECK_IOVEC_ONLY));
> >  	direction &= READ | WRITE;
> 
> Ugh...
> 
> > -	rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV,
> > -				   iovstack_r, &iov_r);
> > +	rc = import_iovec(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV, &iov_r,
> > +			  &iter_r);
> >  	if (rc <= 0)
> >  		goto free_iovecs;
> >
> > -	rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write);
> > +	rc = process_vm_rw_core(pid, &iter_l, iter_r.iov, iter_r.nr_segs,
> > +				flags, vm_write);
> 
> ... and ugh^2, since now you are not only setting a meaningless iov_iter,
> you are creating a new place that pokes directly into struct iov_iter
> guts.
> 
> Sure, moving rw_copy_check_uvector() over to lib/iov_iter.c makes sense.
> But I would rather split the access_ok()-related checks out of that thing
> and bury CHECK_IOVEC_ONLY.
> 
> Step 1: move the damn thing to lib/iov_iter.c (same as you do, but without
> making it static)
> 
> Step 2: split it in two:
> 
> ssize_t rw_copy_check_uvector(const struct iovec __user * uvector,
>                               unsigned long nr_segs, unsigned long fast_segs,
>                               struct iovec *fast_pointer,
>                               struct iovec **ret_pointer)
> {
> 	unsigned long seg;
...
> 	ret = 0;
> 	for (seg = 0; seg < nr_segs; seg++) {
> 		void __user *buf = iov[seg].iov_base;
> 		ssize_t len = (ssize_t)iov[seg].iov_len;
> 
> 		/* see if we we're about to use an invalid len or if
> 		 * it's about to overflow ssize_t */
> 		if (len < 0)
> 			return -EINVAL;
> 		if (len > MAX_RW_COUNT - ret) {
> 			len = MAX_RW_COUNT - ret;
> 			iov[seg].iov_len = len;
> 		}
> 		ret += len;
> 	}
> 	return ret;
> }
> 
> /*
>  *  This is merely an early sanity check; we do _not_ rely upon
>  *  it when we get to the actual memory accesses.
>  */
> static bool check_iovecs(const struct iovec *iov, int nr_segs)
> {
>         for (seg = 0; seg < nr_segs; seg++) {
>                 void __user *buf = iov[seg].iov_base;
>                 ssize_t len = (ssize_t)iov[seg].iov_len;
> 
>                 if (unlikely(!access_ok(buf, len)))
>                         return false;
>         }
> 	return true;
> }

You really don't want to be looping through the array twice.
In fact you don't really want to be doing all those tests at all.
This code makes a significant fraction of the not-insignificant
difference between the 'costs' of send() and sendmsg().

I think the 'length' check can be optimised to do something like:
	for (...) {
		ssize_t len = (ssize_t)iov[seg].iov_len;
		ret += len;
		len_hi += (unsigned long)len >> 20;
	}
	if (len_hi) {
		/* Something potentially odd in the lengths.
		 * Might just be a very long fragment.
		 * Check the individial values. */
		Add the exiting loop here.
	}

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Al Viro Sept. 21, 2020, 3:29 p.m. UTC | #4
On Mon, Sep 21, 2020 at 03:21:35PM +0000, David Laight wrote:

> You really don't want to be looping through the array twice.

Profiles, please.

> I think the 'length' check can be optimised to do something like:
> 	for (...) {
> 		ssize_t len = (ssize_t)iov[seg].iov_len;
> 		ret += len;
> 		len_hi += (unsigned long)len >> 20;
> 	}
> 	if (len_hi) {
> 		/* Something potentially odd in the lengths.
> 		 * Might just be a very long fragment.
> 		 * Check the individial values. */
> 		Add the exiting loop here.
> 	}

Far too ugly to live.
David Laight Sept. 21, 2020, 3:44 p.m. UTC | #5
From: Al Viro
> Sent: 21 September 2020 16:30
> 
> On Mon, Sep 21, 2020 at 03:21:35PM +0000, David Laight wrote:
> 
> > You really don't want to be looping through the array twice.
> 
> Profiles, please.

I did some profiling of send() v sendmsg() much earlier in the year.
I can't remember the exact details but the extra cost of sendmsg()
is far more than you might expect.
(I was timing sending fully built IPv4 UDP packets using a raw socket.)

About half the difference does away if you change the
copy_from_user() to __copy_from_user() when reading the struct msghdr
and iov[] from userspace (user copy hardening is expensive).

The rest is just code path, my gut feeling is that a lot of that
is in import_iovec().

Remember semdmsg() is likely to be called with an iov count of 1.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Christoph Hellwig Sept. 21, 2020, 4:12 p.m. UTC | #6
On Mon, Sep 21, 2020 at 04:29:37PM +0100, Al Viro wrote:
> On Mon, Sep 21, 2020 at 03:21:35PM +0000, David Laight wrote:

> 

> > You really don't want to be looping through the array twice.

> 

> Profiles, please.


Given that the iov array should be cache hot I'd be surprised to
see a huge difference.
Al Viro Sept. 21, 2020, 4:27 p.m. UTC | #7
On Mon, Sep 21, 2020 at 03:44:00PM +0000, David Laight wrote:
> From: Al Viro

> > Sent: 21 September 2020 16:30

> > 

> > On Mon, Sep 21, 2020 at 03:21:35PM +0000, David Laight wrote:

> > 

> > > You really don't want to be looping through the array twice.

> > 

> > Profiles, please.

> 

> I did some profiling of send() v sendmsg() much earlier in the year.

> I can't remember the exact details but the extra cost of sendmsg()

> is far more than you might expect.

> (I was timing sending fully built IPv4 UDP packets using a raw socket.)

> 

> About half the difference does away if you change the

> copy_from_user() to __copy_from_user() when reading the struct msghdr

> and iov[] from userspace (user copy hardening is expensive).


Wha...?  So the difference is within 4 times the overhead of the
hardening checks done for one call of copy_from_user()?

> The rest is just code path, my gut feeling is that a lot of that

> is in import_iovec().

> 

> Remember semdmsg() is likely to be called with an iov count of 1.


... which makes that loop unlikely to be noticable in the entire
mess, whether you pass it once or twice.  IOW, unless you can show
profiles where that loop is sufficiently hot or if you can show
the timings change from splitting it in two, I'll remain very
sceptical about that assertion.
diff mbox series

Patch

diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 5e40786c8f1232..db54588406dfae 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -443,7 +443,7 @@  void iov_iter_init(struct iov_iter *i, unsigned int direction,
 			const struct iovec *iov, unsigned long nr_segs,
 			size_t count)
 {
-	WARN_ON(direction & ~(READ | WRITE));
+	WARN_ON(direction & ~(READ | WRITE | CHECK_IOVEC_ONLY));
 	direction &= READ | WRITE;
 
 	/* It will get better.  Eventually... */
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 29c052099affdc..40cd502c337534 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -264,7 +264,7 @@  static ssize_t process_vm_rw(pid_t pid,
 	struct iovec iovstack_r[UIO_FASTIOV];
 	struct iovec *iov_l = iovstack_l;
 	struct iovec *iov_r = iovstack_r;
-	struct iov_iter iter;
+	struct iov_iter iter_l, iter_r;
 	ssize_t rc;
 	int dir = vm_write ? WRITE : READ;
 
@@ -272,23 +272,25 @@  static ssize_t process_vm_rw(pid_t pid,
 		return -EINVAL;
 
 	/* Check iovecs */
-	rc = import_iovec(dir, lvec, liovcnt, UIO_FASTIOV, &iov_l, &iter);
+	rc = import_iovec(dir, lvec, liovcnt, UIO_FASTIOV, &iov_l, &iter_l);
 	if (rc < 0)
 		return rc;
-	if (!iov_iter_count(&iter))
+	if (!iov_iter_count(&iter_l))
 		goto free_iovecs;
 
-	rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV,
-				   iovstack_r, &iov_r);
+	rc = import_iovec(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV, &iov_r,
+			  &iter_r);
 	if (rc <= 0)
 		goto free_iovecs;
 
-	rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write);
+	rc = process_vm_rw_core(pid, &iter_l, iter_r.iov, iter_r.nr_segs,
+				flags, vm_write);
 
 free_iovecs:
 	if (iov_r != iovstack_r)
 		kfree(iov_r);
-	kfree(iov_l);
+	if (iov_l != iovstack_l)
+		kfree(iov_l);
 
 	return rc;
 }
@@ -322,30 +324,31 @@  compat_process_vm_rw(compat_pid_t pid,
 	struct iovec iovstack_r[UIO_FASTIOV];
 	struct iovec *iov_l = iovstack_l;
 	struct iovec *iov_r = iovstack_r;
-	struct iov_iter iter;
+	struct iov_iter iter_l, iter_r;
 	ssize_t rc = -EFAULT;
 	int dir = vm_write ? WRITE : READ;
 
 	if (flags != 0)
 		return -EINVAL;
 
-	rc = compat_import_iovec(dir, lvec, liovcnt, UIO_FASTIOV, &iov_l, &iter);
+	rc = compat_import_iovec(dir, lvec, liovcnt, UIO_FASTIOV, &iov_l, &iter_l);
 	if (rc < 0)
 		return rc;
-	if (!iov_iter_count(&iter))
+	if (!iov_iter_count(&iter_l))
 		goto free_iovecs;
-	rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt,
-					  UIO_FASTIOV, iovstack_r,
-					  &iov_r);
+	rc = compat_import_iovec(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV,
+				 &iov_r, &iter_r);
 	if (rc <= 0)
 		goto free_iovecs;
 
-	rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write);
+	rc = process_vm_rw_core(pid, &iter_l, iter_r.iov, iter_r.nr_segs,
+				flags, vm_write);
 
 free_iovecs:
 	if (iov_r != iovstack_r)
 		kfree(iov_r);
-	kfree(iov_l);
+	if (iov_l != iovstack_l)
+		kfree(iov_l);
 	return rc;
 }