diff mbox series

[v2] skb_expand_head() adjust skb->truesize incorrectly

Message ID 860513d5-fd02-832b-1c4c-ea2b17477d76@virtuozzo.com
State New
Headers show
Series [v2] skb_expand_head() adjust skb->truesize incorrectly | expand

Commit Message

Vasily Averin Aug. 29, 2021, 12:59 p.m. UTC
Christoph Paasch reports [1] about incorrect skb->truesize
after skb_expand_head() call in ip6_xmit.
This may happen because of two reasons:
- skb_set_owner_w() for newly cloned skb is called too early,
before pskb_expand_head() where truesize is adjusted for (!skb-sk) case.
- pskb_expand_head() does not adjust truesize in (skb->sk) case.
In this case sk->sk_wmem_alloc should be adjusted too.

[1] https://lkml.org/lkml/2021/8/20/1082

Reported-by: Christoph Paasch <christoph.paasch@gmail.com>
Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
---
v2: based on patch version from Eric Dumazet,
    added __pskb_expand_head() function, which can be forced
    to adjust skb->truesize and sk->sk_wmem_alloc.
---
 net/core/skbuff.c | 43 +++++++++++++++++++++++++++++--------------
 1 file changed, 29 insertions(+), 14 deletions(-)

Comments

Vasily Averin Aug. 30, 2021, 5:52 a.m. UTC | #1
1) I forgot to specify that the patch is intended fro net-next git
2) I forgot to ad Alexey Kuznetsov in cc. I resend the patch to him 
  in a separate letter and received his consent.
3) I forgot to set Fixed mark
Fixes: f1260ff15a71 ("skbuff: introduce skb_expand_head()")

Thank you,
	Vasily Averin

On 8/29/21 3:59 PM, Vasily Averin wrote:
> Christoph Paasch reports [1] about incorrect skb->truesize

> after skb_expand_head() call in ip6_xmit.

> This may happen because of two reasons:

> - skb_set_owner_w() for newly cloned skb is called too early,

> before pskb_expand_head() where truesize is adjusted for (!skb-sk) case.

> - pskb_expand_head() does not adjust truesize in (skb->sk) case.

> In this case sk->sk_wmem_alloc should be adjusted too.

> 

> [1] https://lkml.org/lkml/2021/8/20/1082

> 

> Reported-by: Christoph Paasch <christoph.paasch@gmail.com>

> Signed-off-by: Vasily Averin <vvs@virtuozzo.com>

> ---

> v2: based on patch version from Eric Dumazet,

>     added __pskb_expand_head() function, which can be forced

>     to adjust skb->truesize and sk->sk_wmem_alloc.

> ---

>  net/core/skbuff.c | 43 +++++++++++++++++++++++++++++--------------

>  1 file changed, 29 insertions(+), 14 deletions(-)

> 

> diff --git a/net/core/skbuff.c b/net/core/skbuff.c

> index f931176..4691023 100644

> --- a/net/core/skbuff.c

> +++ b/net/core/skbuff.c

> @@ -1681,10 +1681,10 @@ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,

>   *	reloaded after call to this function.

>   */

>  

> -int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,

> -		     gfp_t gfp_mask)

> +static int __pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,

> +			      gfp_t gfp_mask, bool update_truesize)

>  {

> -	int i, osize = skb_end_offset(skb);

> +	int delta, i, osize = skb_end_offset(skb);

>  	int size = osize + nhead + ntail;

>  	long off;

>  	u8 *data;

> @@ -1756,9 +1756,13 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,

>  	 * For the moment, we really care of rx path, or

>  	 * when skb is orphaned (not attached to a socket).

>  	 */

> -	if (!skb->sk || skb->destructor == sock_edemux)

> -		skb->truesize += size - osize;

> -

> +	delta = size - osize;

> +	if (!skb->sk || skb->destructor == sock_edemux) {

> +		skb->truesize += delta;

> +	} else if (update_truesize) {

> +		refcount_add(delta, &skb->sk->sk_wmem_alloc);

> +		skb->truesize += delta;

> +	}

>  	return 0;

>  

>  nofrags:

> @@ -1766,6 +1770,12 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,

>  nodata:

>  	return -ENOMEM;

>  }

> +

> +int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,

> +		     gfp_t gfp_mask)

> +{

> +	return __pskb_expand_head(skb, nhead, ntail, gfp_mask, false);

> +}

>  EXPORT_SYMBOL(pskb_expand_head);

>  

>  /* Make private copy of skb with writable head and some headroom */

> @@ -1804,28 +1814,33 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)

>  struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)

>  {

>  	int delta = headroom - skb_headroom(skb);

> +	struct sk_buff *oskb = NULL;

>  

>  	if (WARN_ONCE(delta <= 0,

>  		      "%s is expecting an increase in the headroom", __func__))

>  		return skb;

>  

> +	delta = SKB_DATA_ALIGN(delta);

>  	/* pskb_expand_head() might crash, if skb is shared */

>  	if (skb_shared(skb)) {

>  		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);

>  

> -		if (likely(nskb)) {

> -			if (skb->sk)

> -				skb_set_owner_w(nskb, skb->sk);

> -			consume_skb(skb);

> -		} else {

> +		if (unlikely(!nskb)) {

>  			kfree_skb(skb);

> +			return NULL;

>  		}

> +		oskb = skb;

>  		skb = nskb;

>  	}

> -	if (skb &&

> -	    pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {

> +	if (__pskb_expand_head(skb, delta, 0, GFP_ATOMIC, true)) {

>  		kfree_skb(skb);

> -		skb = NULL;

> +		kfree_skb(oskb);

> +		return NULL;

> +	}

> +	if (oskb) {

> +		if (oskb->sk)

> +			skb_set_owner_w(skb, oskb->sk);

> +		consume_skb(oskb);

>  	}

>  	return skb;

>  }

>
Eric Dumazet Aug. 30, 2021, 4:01 p.m. UTC | #2
On 8/29/21 5:59 AM, Vasily Averin wrote:
> Christoph Paasch reports [1] about incorrect skb->truesize

> after skb_expand_head() call in ip6_xmit.

> This may happen because of two reasons:

> - skb_set_owner_w() for newly cloned skb is called too early,

> before pskb_expand_head() where truesize is adjusted for (!skb-sk) case.

> - pskb_expand_head() does not adjust truesize in (skb->sk) case.

> In this case sk->sk_wmem_alloc should be adjusted too.

> 

> [1] https://lkml.org/lkml/2021/8/20/1082

> 

> Reported-by: Christoph Paasch <christoph.paasch@gmail.com>

> Signed-off-by: Vasily Averin <vvs@virtuozzo.com>

> ---

> v2: based on patch version from Eric Dumazet,

>     added __pskb_expand_head() function, which can be forced

>     to adjust skb->truesize and sk->sk_wmem_alloc.

> ---

>  net/core/skbuff.c | 43 +++++++++++++++++++++++++++++--------------

>  1 file changed, 29 insertions(+), 14 deletions(-)

> 

> diff --git a/net/core/skbuff.c b/net/core/skbuff.c

> index f931176..4691023 100644

> --- a/net/core/skbuff.c

> +++ b/net/core/skbuff.c

> @@ -1681,10 +1681,10 @@ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,

>   *	reloaded after call to this function.

>   */

>  

> -int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,

> -		     gfp_t gfp_mask)

> +static int __pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,

> +			      gfp_t gfp_mask, bool update_truesize)

>  {

> -	int i, osize = skb_end_offset(skb);

> +	int delta, i, osize = skb_end_offset(skb);

>  	int size = osize + nhead + ntail;

>  	long off;

>  	u8 *data;

> @@ -1756,9 +1756,13 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,

>  	 * For the moment, we really care of rx path, or

>  	 * when skb is orphaned (not attached to a socket).

>  	 */

> -	if (!skb->sk || skb->destructor == sock_edemux)

> -		skb->truesize += size - osize;

> -

> +	delta = size - osize;

> +	if (!skb->sk || skb->destructor == sock_edemux) {

> +		skb->truesize += delta;

> +	} else if (update_truesize) {


Unfortunately we can not always do this sk_wmem_alloc change here.

Some skb have skb->sk set, but the 'reference on socket' is not through sk_wmem_alloc

It seems you need a helper to make sure skb->destructor is one of
the destructors that use skb->truesize and sk->sk_wmem_alloc

For instance, skb_orphan_partial() could have been used.



> +		refcount_add(delta, &skb->sk->sk_wmem_alloc);

> +		skb->truesize += delta;

> +	}

>  	return 0;

>  

>  nofrags:

> @@ -1766,6 +1770,12 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,

>  nodata:

>  	return -ENOMEM;

>  }

> +

> +int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,

> +		     gfp_t gfp_mask)

> +{

> +	return __pskb_expand_head(skb, nhead, ntail, gfp_mask, false);

> +}

>  EXPORT_SYMBOL(pskb_expand_head);

>  

>  /* Make private copy of skb with writable head and some headroom */

> @@ -1804,28 +1814,33 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)

>  struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)

>  {

>  	int delta = headroom - skb_headroom(skb);

> +	struct sk_buff *oskb = NULL;

>  

>  	if (WARN_ONCE(delta <= 0,

>  		      "%s is expecting an increase in the headroom", __func__))

>  		return skb;

>  

> +	delta = SKB_DATA_ALIGN(delta);

>  	/* pskb_expand_head() might crash, if skb is shared */

>  	if (skb_shared(skb)) {

>  		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);

>  

> -		if (likely(nskb)) {

> -			if (skb->sk)

> -				skb_set_owner_w(nskb, skb->sk);

> -			consume_skb(skb);

> -		} else {

> +		if (unlikely(!nskb)) {

>  			kfree_skb(skb);

> +			return NULL;

>  		}

> +		oskb = skb;

>  		skb = nskb;

>  	}

> -	if (skb &&

> -	    pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {

> +	if (__pskb_expand_head(skb, delta, 0, GFP_ATOMIC, true)) {

>  		kfree_skb(skb);

> -		skb = NULL;

> +		kfree_skb(oskb);

> +		return NULL;

> +	}

> +	if (oskb) {

> +		if (oskb->sk)

> +			skb_set_owner_w(skb, oskb->sk);

> +		consume_skb(oskb);

>  	}

>  	return skb;

>  }

>
Vasily Averin Aug. 30, 2021, 6:09 p.m. UTC | #3
On 8/30/21 7:01 PM, Eric Dumazet wrote:
> On 8/29/21 5:59 AM, Vasily Averin wrote:

>> Christoph Paasch reports [1] about incorrect skb->truesize

>> after skb_expand_head() call in ip6_xmit.

>> This may happen because of two reasons:

>> - skb_set_owner_w() for newly cloned skb is called too early,

>> before pskb_expand_head() where truesize is adjusted for (!skb-sk) case.

>> - pskb_expand_head() does not adjust truesize in (skb->sk) case.

>> In this case sk->sk_wmem_alloc should be adjusted too.

>>

>> [1] https://lkml.org/lkml/2021/8/20/1082

>> @@ -1756,9 +1756,13 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,

>>  	 * For the moment, we really care of rx path, or

>>  	 * when skb is orphaned (not attached to a socket).

>>  	 */

>> -	if (!skb->sk || skb->destructor == sock_edemux)

>> -		skb->truesize += size - osize;

>> -

>> +	delta = size - osize;

>> +	if (!skb->sk || skb->destructor == sock_edemux) {

>> +		skb->truesize += delta;

>> +	} else if (update_truesize) {

> 

> Unfortunately we can not always do this sk_wmem_alloc change here.

> 

> Some skb have skb->sk set, but the 'reference on socket' is not through sk_wmem_alloc


Could you please provide some example?
In past in all handeled cases we have cloned original skb and then unconditionally assigned skb sock_wfree destructor.
Do you want to say that it worked correctly somehow?

I expected if we set sock_wfree, we have guarantee that old skb adjusted sk_wmem_alloc.
Am I wrong?
Could you please point on such case?

> It seems you need a helper to make sure skb->destructor is one of

> the destructors that use skb->truesize and sk->sk_wmem_alloc

> 

> For instance, skb_orphan_partial() could have been used.


Thank you, will investigate.
	Vasily Averin
Vasily Averin Aug. 30, 2021, 6:37 p.m. UTC | #4
On 8/30/21 9:09 PM, Vasily Averin wrote:
> On 8/30/21 7:01 PM, Eric Dumazet wrote:

>> On 8/29/21 5:59 AM, Vasily Averin wrote:

>>> Christoph Paasch reports [1] about incorrect skb->truesize

>>> after skb_expand_head() call in ip6_xmit.

>>> This may happen because of two reasons:

>>> - skb_set_owner_w() for newly cloned skb is called too early,

>>> before pskb_expand_head() where truesize is adjusted for (!skb-sk) case.

>>> - pskb_expand_head() does not adjust truesize in (skb->sk) case.

>>> In this case sk->sk_wmem_alloc should be adjusted too.

>>>

>>> [1] https://lkml.org/lkml/2021/8/20/1082

>>> @@ -1756,9 +1756,13 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,

>>>  	 * For the moment, we really care of rx path, or

>>>  	 * when skb is orphaned (not attached to a socket).

>>>  	 */

>>> -	if (!skb->sk || skb->destructor == sock_edemux)

>>> -		skb->truesize += size - osize;

>>> -

>>> +	delta = size - osize;

>>> +	if (!skb->sk || skb->destructor == sock_edemux) {

>>> +		skb->truesize += delta;

>>> +	} else if (update_truesize) {

>>

>> Unfortunately we can not always do this sk_wmem_alloc change here.

>>

>> Some skb have skb->sk set, but the 'reference on socket' is not through sk_wmem_alloc

> 

> Could you please provide some example?

> In past in all handeled cases we have cloned original skb and then unconditionally assigned skb sock_wfree destructor.

> Do you want to say that it worked correctly somehow?

> 

> I expected if we set sock_wfree, we have guarantee that old skb adjusted sk_wmem_alloc.

> Am I wrong?

> Could you please point on such case?


However if it is true -- it is not enough to adjust sk_wmem_alloc for proper destructors,
because another destructors may require to do something else.
In this case I can check destructor first and clone skb before pskb_expand_head() call,
like it was happen before.

>> It seems you need a helper to make sure skb->destructor is one of

>> the destructors that use skb->truesize and sk->sk_wmem_alloc

>>

>> For instance, skb_orphan_partial() could have been used.

> 

> Thank you, will investigate.

> 	Vasily Averin

>
Eric Dumazet Aug. 30, 2021, 7:58 p.m. UTC | #5
On 8/30/21 11:09 AM, Vasily Averin wrote:
> On 8/30/21 7:01 PM, Eric Dumazet wrote:

>> On 8/29/21 5:59 AM, Vasily Averin wrote:

>>> Christoph Paasch reports [1] about incorrect skb->truesize

>>> after skb_expand_head() call in ip6_xmit.

>>> This may happen because of two reasons:

>>> - skb_set_owner_w() for newly cloned skb is called too early,

>>> before pskb_expand_head() where truesize is adjusted for (!skb-sk) case.

>>> - pskb_expand_head() does not adjust truesize in (skb->sk) case.

>>> In this case sk->sk_wmem_alloc should be adjusted too.

>>>

>>> [1] https://lkml.org/lkml/2021/8/20/1082

>>> @@ -1756,9 +1756,13 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,

>>>  	 * For the moment, we really care of rx path, or

>>>  	 * when skb is orphaned (not attached to a socket).

>>>  	 */

>>> -	if (!skb->sk || skb->destructor == sock_edemux)

>>> -		skb->truesize += size - osize;

>>> -

>>> +	delta = size - osize;

>>> +	if (!skb->sk || skb->destructor == sock_edemux) {

>>> +		skb->truesize += delta;

>>> +	} else if (update_truesize) {

>>

>> Unfortunately we can not always do this sk_wmem_alloc change here.

>>

>> Some skb have skb->sk set, but the 'reference on socket' is not through sk_wmem_alloc

> 

> Could you please provide some example?

> In past in all handeled cases we have cloned original skb and then unconditionally assigned skb sock_wfree destructor.


In the past we ignored old value of skb->destructor,
since the clone got a NULL destructor.

In your patch you assumes it is sock_wfree, or other destructors changing sk_wmem_alloc


You need to make sure skb->destructor is one of the known destructors which 
will basically remove skb->truesize from sk->sk_wmem_alloc.

This will also make sure skb->sk is a 'full socket'

If not, you should not change sk->sk_wmem_alloc

> Do you want to say that it worked correctly somehow?


I am simply saying your patch adds a wrong assumption.

> 

> I expected if we set sock_wfree, we have guarantee that old skb adjusted sk_wmem_alloc.

> Am I wrong?

> Could you please point on such case?

> 

>> It seems you need a helper to make sure skb->destructor is one of

>> the destructors that use skb->truesize and sk->sk_wmem_alloc

>>

>> For instance, skb_orphan_partial() could have been used.

> 

> Thank you, will investigate.

> 	Vasily Averin

>
Eric Dumazet Aug. 31, 2021, 7:38 p.m. UTC | #6
On 8/31/21 7:34 AM, Vasily Averin wrote:
> RFC because it have an extra changes:

> new is_skb_wmem() helper can be called

>  - either before pskb_expand_head(), to create skb clones 

>     for skb with destructors that does not change sk->sk_wmem_alloc

>  - or after pskb_expand_head(), to change owner in skb_set_owner_w()

> 

> In current patch I've added both these ways,

> we need to keep one of them.

> ---

> Christoph Paasch reports [1] about incorrect skb->truesize

> after skb_expand_head() call in ip6_xmit.

> This may happen because of two reasons:

> - skb_set_owner_w() for newly cloned skb is called too early,

> before pskb_expand_head() where truesize is adjusted for (!skb-sk) case.

> - pskb_expand_head() does not adjust truesize in (skb->sk) case.

> In this case sk->sk_wmem_alloc should be adjusted too.

> 

> [1] https://lkml.org/lkml/2021/8/20/1082

> 

> Reported-by: Christoph Paasch <christoph.paasch@gmail.com>

> Signed-off-by: Vasily Averin <vvs@virtuozzo.com>

> ---

> v3: removed __pskb_expand_head(),

>     added is_skb_wmem() helper for skb with wmem-compatible destructors

>     there are 2 ways to use it:

>      - before pskb_expand_head(), to create skb clones

>      - after pskb_expand_head(), to change owner on extended skb.

> v2: based on patch version from Eric Dumazet,

>     added __pskb_expand_head() function, which can be forced

>     to adjust skb->truesize and sk->sk_wmem_alloc.

> ---

>  include/net/sock.h |  1 +

>  net/core/skbuff.c  | 39 ++++++++++++++++++++++++++++-----------

>  net/core/sock.c    |  8 ++++++++

>  3 files changed, 37 insertions(+), 11 deletions(-)

> 

> diff --git a/include/net/sock.h b/include/net/sock.h

> index 95b2577..173d58c 100644

> --- a/include/net/sock.h

> +++ b/include/net/sock.h

> @@ -1695,6 +1695,7 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,

>  			     gfp_t priority);

>  void __sock_wfree(struct sk_buff *skb);

>  void sock_wfree(struct sk_buff *skb);

> +bool is_skb_wmem(const struct sk_buff *skb);

>  struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,

>  			     gfp_t priority);

>  void skb_orphan_partial(struct sk_buff *skb);

> diff --git a/net/core/skbuff.c b/net/core/skbuff.c

> index f931176..3ce33f2 100644

> --- a/net/core/skbuff.c

> +++ b/net/core/skbuff.c

> @@ -1804,30 +1804,47 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)

>  struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)

>  {

>  	int delta = headroom - skb_headroom(skb);

> +	int osize = skb_end_offset(skb);

> +	struct sk_buff *oskb = NULL;

> +	struct sock *sk = skb->sk;

>  

>  	if (WARN_ONCE(delta <= 0,

>  		      "%s is expecting an increase in the headroom", __func__))

>  		return skb;

>  

> -	/* pskb_expand_head() might crash, if skb is shared */

> -	if (skb_shared(skb)) {

> +	delta = SKB_DATA_ALIGN(delta);

> +	/* pskb_expand_head() might crash, if skb is shared.

> +	 * Also we should clone skb if its destructor does

> +	 * not adjust skb->truesize and sk->sk_wmem_alloc

> + 	 */

> +	if (skb_shared(skb) ||

> +	    (sk && (!sk_fullsock(sk) || !is_skb_wmem(skb)))) {


is_skb_wmem() is only possibly true for full sockets by definition.

So the (sk_fullsock(sk) && is_skb_wmem(skb)) can be reduced to is_skb_wmem(skb)

>  		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);

>  

> -		if (likely(nskb)) {

> -			if (skb->sk)

> -				skb_set_owner_w(nskb, skb->sk);

> -			consume_skb(skb);

> -		} else {

> +		if (unlikely(!nskb)) {

>  			kfree_skb(skb);

> +			return NULL;

>  		}

> +		oskb = skb;

>  		skb = nskb;

>  	}

> -	if (skb &&

> -	    pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {

> +	if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC)) {

>  		kfree_skb(skb);

> -		skb = NULL;

> +		kfree_skb(oskb);

> +		return NULL;

>  	}

> -	return skb;

> +	if (oskb) {

> +		if (sk)

> +			skb_set_owner_w(skb, sk);


Broken for non full sockets.
Calling skb_set_owner_w(skb, sk) for them is a bug.
> +		consume_skb(oskb);

> +	} else if (sk) {

> +		delta = osize - skb_end_offset(skb);

> +		if (!is_skb_wmem(skb))

> +			skb_set_owner_w(skb, sk);


This would be broken for non full sockets.
Calling skb_set_owner_w(skb, sk) for them is a bug.

> +		skb->truesize += delta;

> +		if (sk_fullsock(sk))

> +			refcount_add(delta, &sk->sk_wmem_alloc);



> +	}	return skb;

>  }

>  EXPORT_SYMBOL(skb_expand_head);

>  

> diff --git a/net/core/sock.c b/net/core/sock.c

> index 950f1e7..0315dcb 100644

> --- a/net/core/sock.c

> +++ b/net/core/sock.c

> @@ -2227,6 +2227,14 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)

>  }

>  EXPORT_SYMBOL(skb_set_owner_w);

>  

> +bool is_skb_wmem(const struct sk_buff *skb)

> +{

> +	return (skb->destructor == sock_wfree ||

> +		skb->destructor == __sock_wfree ||

> +		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));


No need for return (EXPRESSION);

You can simply : return EXPRESSION;

ie
   return skb->destructor == sock_wfree ||
          skb->destructor == __sock_wfree ||
          (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree);


> +}

> +EXPORT_SYMBOL(is_skb_wmem);

> +

>  static bool can_skb_orphan_partial(const struct sk_buff *skb)

>  {

>  #ifdef CONFIG_TLS_DEVICE

>
Vasily Averin Sept. 1, 2021, 6:20 a.m. UTC | #7
On 8/31/21 10:38 PM, Eric Dumazet wrote:
> On 8/31/21 7:34 AM, Vasily Averin wrote:

>> RFC because it have an extra changes:

>> new is_skb_wmem() helper can be called

>>  - either before pskb_expand_head(), to create skb clones 

>>     for skb with destructors that does not change sk->sk_wmem_alloc

>>  - or after pskb_expand_head(), to change owner in skb_set_owner_w()

>>

>> In current patch I've added both these ways,

>> we need to keep one of them.


If nobody object I vote for 2nd way:

>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c

>> index f931176..3ce33f2 100644

>> --- a/net/core/skbuff.c

>> +++ b/net/core/skbuff.c

>> @@ -1804,30 +1804,47 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)

>>  struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)

>>  {

... skipped ...
>> -	return skb;

>> +	if (oskb) {

>> +		if (sk)

>> +			skb_set_owner_w(skb, sk);

> 

> Broken for non full sockets.

> Calling skb_set_owner_w(skb, sk) for them is a bug.


I think you're wrong here.
It is 100% equivalent of old code, 
skb_set_owner_w() handles sk_fullsock(sk) inside and does not adjust sk->sk_wmem_alloc.
Please explain if I'm wrong.

>> +		consume_skb(oskb);

>> +	} else if (sk) {

>> +		delta = osize - skb_end_offset(skb);

>> +		if (!is_skb_wmem(skb))

>> +			skb_set_owner_w(skb, sk);

> 

> This would be broken for non full sockets.

> Calling skb_set_owner_w(skb, sk) for them is a bug.

See my comment above.

>> +		skb->truesize += delta;

>> +		if (sk_fullsock(sk))

>> +			refcount_add(delta, &sk->sk_wmem_alloc);

> 

> 

>> +	}	return skb;

Strange line, will fix it.

>>  }

>>  EXPORT_SYMBOL(skb_expand_head);
Christoph Paasch Sept. 1, 2021, 4:58 p.m. UTC | #8
Hello,

On Wed, Sep 1, 2021 at 1:12 AM Vasily Averin <vvs@virtuozzo.com> wrote:
>

> Christoph Paasch reports [1] about incorrect skb->truesize

> after skb_expand_head() call in ip6_xmit.

> This may happen because of two reasons:

> - skb_set_owner_w() for newly cloned skb is called too early,

> before pskb_expand_head() where truesize is adjusted for (!skb-sk) case.

> - pskb_expand_head() does not adjust truesize in (skb->sk) case.

> In this case sk->sk_wmem_alloc should be adjusted too.

>

> [1] https://lkml.org/lkml/2021/8/20/1082

>

> Fixes: f1260ff15a71 ("skbuff: introduce skb_expand_head()")

> Reported-by: Christoph Paasch <christoph.paasch@gmail.com>

> Signed-off-by: Vasily Averin <vvs@virtuozzo.com>

> ---

> v4: decided to use is_skb_wmem() after pskb_expand_head() call

>     fixed 'return (EXPRESSION);' in os_skb_wmem according to Eric Dumazet

> v3: removed __pskb_expand_head(),

>     added is_skb_wmem() helper for skb with wmem-compatible destructors

>     there are 2 ways to use it:

>      - before pskb_expand_head(), to create skb clones

>      - after successfull pskb_expand_head() to change owner on extended skb.

> v2: based on patch version from Eric Dumazet,

>     added __pskb_expand_head() function, which can be forced

>     to adjust skb->truesize and sk->sk_wmem_alloc.

> ---

>  include/net/sock.h |  1 +

>  net/core/skbuff.c  | 35 ++++++++++++++++++++++++++---------

>  net/core/sock.c    |  8 ++++++++

>  3 files changed, 35 insertions(+), 9 deletions(-)


this introduces more issues with the syzkaller reproducer that I
shared earlier (see below for the output).

I don't have time at the moment to dig into these though - so just
sharing this as an FYI for now.

syzkaller login: [   12.768064] cgroup: Unknown subsys name 'perf_event'
[   12.769831] cgroup: Unknown subsys name 'net_cls'
[   13.587819] ------------[ cut here ]------------
[   13.588943] refcount_t: saturated; leaking memory.
[   13.590166] WARNING: CPU: 1 PID: 1658 at lib/refcount.c:22
refcount_warn_saturate+0xce/0x1f0
[   13.591909] Modules linked in:
[   13.592595] CPU: 1 PID: 1658 Comm: syz-executor Not tainted
5.14.0ea78abdd8ff18baaea3211eabdd6a2a88169cfd6 #134
[   13.594455] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
[   13.596640] RIP: 0010:refcount_warn_saturate+0xce/0x1f0
[   13.597651] Code: 1d 32 63 11 02 31 ff 89 de e8 1e 26 79 ff 84 db
75 d8 e8 b5 1e 79 ff 48 c7 c7 80 48 32 83 c6 05 12 63 11 02 01 e8 2f
39
[   13.601049] RSP: 0018:ffffc9000091f2a8 EFLAGS: 00010286
[   13.602077] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
[   13.603477] RDX: ffff888100fa2880 RSI: ffffffff8121e533 RDI: fffff52000123e47
[   13.604758] RBP: ffff88810b88013c R08: 0000000000000001 R09: 0000000000000000
[   13.606110] R10: ffffffff814135db R11: 0000000000000000 R12: ffff88810b880000
[   13.607421] R13: 00000000fffffe03 R14: ffff8881094c97c0 R15: ffff88810b88013c
[   13.608874] FS:  00007f8ad457d700(0000) GS:ffff88811b480000(0000)
knlGS:0000000000000000
[   13.610515] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   13.611671] CR2: 0000000000000000 CR3: 00000001045d8000 CR4: 00000000000006e0
[   13.613017] Call Trace:
[   13.613521]  skb_expand_head+0x35a/0x470
[   13.614331]  ip6_xmit+0x105f/0x1560
[   13.615038]  ? ip6_forward+0x22b0/0x22b0
[   13.616011]  ? ip6_dst_check+0x227/0x540
[   13.616773]  ? rt6_check_expired+0x250/0x250
[   13.617657]  ? __sk_dst_check+0xfb/0x200
[   13.618424]  ? inet6_csk_route_socket+0x59e/0x980
[   13.619377]  ? inet6_csk_addr2sockaddr+0x2a0/0x2a0
[   13.620399]  ? stack_trace_consume_entry+0x160/0x160
[   13.621530]  inet6_csk_xmit+0x2b3/0x430
[   13.622290]  ? kasan_save_stack+0x32/0x40
[   13.623133]  ? kasan_save_stack+0x1b/0x40
[   13.623939]  ? inet6_csk_route_socket+0x980/0x980
[   13.624802]  ? entry_SYSCALL_64_after_hwframe+0x44/0xae
[   13.625786]  ? csum_ipv6_magic+0x26/0x70
[   13.626653]  ? inet6_csk_route_socket+0x980/0x980
[   13.627480]  __tcp_transmit_skb+0x186e/0x35d0
[   13.628358]  ? __tcp_select_window+0xa50/0xa50
[   13.629153]  ? __sanitizer_cov_trace_cmp4+0x1c/0x70
[   13.630130]  ? kasan_unpoison+0x23/0x50
[   13.630872]  ? __build_skb_around+0x241/0x300
[   13.631667]  ? __sanitizer_cov_trace_const_cmp4+0x1c/0x70
[   13.632785]  ? __alloc_skb+0x180/0x360
[   13.633545]  __tcp_send_ack.part.0+0x3da/0x650
[   13.634333]  tcp_send_ack+0x7d/0xa0
[   13.635031]  __tcp_ack_snd_check+0x156/0x8c0
[   13.635957]  tcp_rcv_established+0x1733/0x1d30
[   13.636889]  ? tcp_data_queue+0x4af0/0x4af0
[   13.637753]  tcp_v6_do_rcv+0x438/0x1380
[   13.638523]  __release_sock+0x1ad/0x310
[   13.639306]  release_sock+0x54/0x1a0
[   13.640029]  ? tcp_sendmsg_locked+0x2ee0/0x2ee0
[   13.640953]  tcp_sendmsg+0x36/0x40
[   13.641710]  inet6_sendmsg+0xb5/0x140
[   13.642359]  ? inet6_ioctl+0x2a0/0x2a0
[   13.643092]  ____sys_sendmsg+0x3b5/0x970
[   13.643834]  ? sock_release+0x1b0/0x1b0
[   13.644593]  ? __ia32_sys_recvmmsg+0x290/0x290
[   13.645505]  ? futex_wait_setup+0x2e0/0x2e0
[   13.646350]  ___sys_sendmsg+0xff/0x170
[   13.647084]  ? hash_futex+0x12/0x1f0
[   13.647870]  ? sendmsg_copy_msghdr+0x160/0x160
[   13.648691]  ? asm_exc_page_fault+0x1e/0x30
[   13.649475]  ? __sanitizer_cov_trace_const_cmp1+0x22/0x80
[   13.650523]  ? __fget_files+0x1c2/0x2a0
[   13.651245]  ? __fget_light+0xea/0x270
[   13.652027]  ? sockfd_lookup_light+0xc3/0x170
[   13.652845]  __sys_sendmmsg+0x192/0x440
[   13.653622]  ? __ia32_sys_sendmsg+0xb0/0xb0
[   13.654365]  ? vfs_fileattr_set+0xb80/0xb80
[   13.655085]  ? __sanitizer_cov_trace_const_cmp4+0x1c/0x70
[   13.656175]  ? alloc_file_pseudo+0x1/0x250
[   13.657026]  ? sock_ioctl+0x1bb/0x670
[   13.657861]  ? __do_sys_futex+0xe7/0x3d0
[   13.658697]  ? __do_sys_futex+0xe7/0x3d0
[   13.659379]  ? __do_sys_futex+0xf0/0x3d0
[   13.660090]  ? __restore_fpregs_from_fpstate+0xa9/0xf0
[   13.661212]  ? fpregs_mark_activate+0x130/0x130
[   13.662078]  ? do_futex+0x1be0/0x1be0
[   13.662846]  __x64_sys_sendmmsg+0x98/0x100
[   13.663706]  ? syscall_exit_to_user_mode+0x1d/0x40
[   13.664698]  do_syscall_64+0x3b/0x90
[   13.665450]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[   13.666564] RIP: 0033:0x7f8ad3e8c469
[   13.667204] Code: 00 f3 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40
00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24
08
[   13.670776] RSP: 002b:00007f8ad457cda8 EFLAGS: 00000246 ORIG_RAX:
0000000000000133
[   13.672208] RAX: ffffffffffffffda RBX: 0000000000000133 RCX: 00007f8ad3e8c469
[   13.673598] RDX: 0000000000000003 RSI: 00000000200008c0 RDI: 0000000000000003
[   13.674946] RBP: 0000000000000133 R08: 0000000000000000 R09: 0000000000000000
[   13.676397] R10: 0000000040044040 R11: 0000000000000246 R12: 000000000069bf8c
[   13.677876] R13: 00007ffe38506fef R14: 00007f8ad455d000 R15: 0000000000000003
[   13.679129] ---[ end trace 55e20198e13af26c ]---
[   13.680043] ------------[ cut here ]------------
[   13.681049] refcount_t: underflow; use-after-free.
[   13.682005] WARNING: CPU: 1 PID: 1658 at lib/refcount.c:28
refcount_warn_saturate+0x103/0x1f0
[   13.683658] Modules linked in:
[   13.684246] CPU: 1 PID: 1658 Comm: syz-executor Tainted: G        W
        5.14.0ea78abdd8ff18baaea3211eabdd6a2a88169cfd6 #134
[   13.686321] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
[   13.688388] RIP: 0010:refcount_warn_saturate+0x103/0x1f0
[   13.689502] Code: 1d fb 62 11 02 31 ff 89 de e8 e9 25 79 ff 84 db
75 a3 e8 80 1e 79 ff 48 c7 c7 80 49 32 83 c6 05 db 62 11 02 01 e8 fa
34
[   13.692805] RSP: 0018:ffffc9000091eff8 EFLAGS: 00010286
[   13.693756] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
[   13.695193] RDX: ffff888100fa2880 RSI: ffffffff8121e533 RDI: fffff52000123df1
[   13.696696] RBP: ffff88810b88013c R08: 0000000000000001 R09: 0000000000000000
[   13.697982] R10: ffffffff814135db R11: 0000000000000000 R12: ffff88810b88013c
[   13.699291] R13: 00000000fffffe02 R14: ffff8881011a4c00 R15: ffff8881094c97c0
[   13.700576] FS:  00007f8ad457d700(0000) GS:ffff88811b480000(0000)
knlGS:0000000000000000
[   13.702031] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   13.703134] CR2: 0000000000000000 CR3: 00000001045d8000 CR4: 00000000000006e0
[   13.704525] Call Trace:
[   13.704973]  __sock_wfree+0xec/0x110
[   13.705737]  ? sock_wfree+0x240/0x240
[   13.706406]  loopback_xmit+0x126/0x4b0
[   13.707278]  ? refcount_warn_saturate+0xce/0x1f0
[   13.708208]  dev_hard_start_xmit+0x16c/0x5c0
[   13.709116]  __dev_queue_xmit+0x1679/0x2970
[   13.709912]  ? netdev_core_pick_tx+0x2d0/0x2d0
[   13.710758]  ? __sanitizer_cov_trace_const_cmp8+0x1d/0x70
[   13.711846]  ? report_bug+0x38/0x210
[   13.712656]  ? handle_bug+0x3c/0x60
[   13.713395]  ? exc_invalid_op+0x14/0x40
[   13.714119]  ip6_finish_output2+0xb52/0x14c0
[   13.715029]  ip6_output+0x572/0x9e0
[   13.715761]  ? ip6_fragment+0x1f40/0x1f40
[   13.716478]  ip6_xmit+0xc6f/0x1560
[   13.717083]  ? ip6_forward+0x22b0/0x22b0
[   13.717895]  ? ip6_dst_check+0x227/0x540
[   13.718689]  ? rt6_check_expired+0x250/0x250
[   13.719620]  ? __sk_dst_check+0xfb/0x200
[   13.720427]  ? inet6_csk_route_socket+0x59e/0x980
[   13.721408]  ? inet6_csk_addr2sockaddr+0x2a0/0x2a0
[   13.722286]  ? stack_trace_consume_entry+0x160/0x160
[   13.723186]  inet6_csk_xmit+0x2b3/0x430
[   13.723873]  ? kasan_save_stack+0x32/0x40
[   13.724682]  ? kasan_save_stack+0x1b/0x40
[   13.725422]  ? inet6_csk_route_socket+0x980/0x980
[   13.726398]  ? entry_SYSCALL_64_after_hwframe+0x44/0xae
[   13.727478]  ? csum_ipv6_magic+0x26/0x70
[   13.728288]  ? inet6_csk_route_socket+0x980/0x980
[   13.729267]  __tcp_transmit_skb+0x186e/0x35d0
[   13.730048]  ? __tcp_select_window+0xa50/0xa50
[   13.730952]  ? __sanitizer_cov_trace_cmp4+0x1c/0x70
[   13.732007]  ? kasan_unpoison+0x23/0x50
[   13.732740]  ? __build_skb_around+0x241/0x300
[   13.733605]  ? __sanitizer_cov_trace_const_cmp4+0x1c/0x70
[   13.734749]  ? __alloc_skb+0x180/0x360
[   13.735506]  __tcp_send_ack.part.0+0x3da/0x650
[   13.736377]  tcp_send_ack+0x7d/0xa0
[   13.737015]  __tcp_ack_snd_check+0x156/0x8c0
[   13.737758]  tcp_rcv_established+0x1733/0x1d30
[   13.738679]  ? tcp_data_queue+0x4af0/0x4af0
[   13.739417]  tcp_v6_do_rcv+0x438/0x1380
[   13.740166]  __release_sock+0x1ad/0x310
[   13.740874]  release_sock+0x54/0x1a0
[   13.741527]  ? tcp_sendmsg_locked+0x2ee0/0x2ee0
[   13.742394]  tcp_sendmsg+0x36/0x40
[   13.743037]  inet6_sendmsg+0xb5/0x140
[   13.743752]  ? inet6_ioctl+0x2a0/0x2a0
[   13.744511]  ____sys_sendmsg+0x3b5/0x970
[   13.745325]  ? sock_release+0x1b0/0x1b0
[   13.746031]  ? __ia32_sys_recvmmsg+0x290/0x290
[   13.746914]  ? futex_wait_setup+0x2e0/0x2e0
[   13.747749]  ___sys_sendmsg+0xff/0x170
[   13.748393]  ? hash_futex+0x12/0x1f0
[   13.749036]  ? sendmsg_copy_msghdr+0x160/0x160
[   13.749972]  ? asm_exc_page_fault+0x1e/0x30
[   13.750870]  ? __sanitizer_cov_trace_const_cmp1+0x22/0x80
[   13.751974]  ? __fget_files+0x1c2/0x2a0
[   13.752659]  ? __fget_light+0xea/0x270
[   13.753514]  ? sockfd_lookup_light+0xc3/0x170
[   13.754296]  __sys_sendmmsg+0x192/0x440
[   13.755102]  ? __ia32_sys_sendmsg+0xb0/0xb0
[   13.755917]  ? vfs_fileattr_set+0xb80/0xb80
[   13.756692]  ? __sanitizer_cov_trace_const_cmp4+0x1c/0x70
[   13.757790]  ? alloc_file_pseudo+0x1/0x250
[   13.758675]  ? sock_ioctl+0x1bb/0x670
[   13.759341]  ? __do_sys_futex+0xe7/0x3d0
[   13.760040]  ? __do_sys_futex+0xe7/0x3d0
[   13.760762]  ? __do_sys_futex+0xf0/0x3d0
[   13.761585]  ? __restore_fpregs_from_fpstate+0xa9/0xf0
[   13.762511]  ? fpregs_mark_activate+0x130/0x130
[   13.763382]  ? do_futex+0x1be0/0x1be0
[   13.764044]  __x64_sys_sendmmsg+0x98/0x100
[   13.764831]  ? syscall_exit_to_user_mode+0x1d/0x40
[   13.765814]  do_syscall_64+0x3b/0x90
[   13.766607]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[   13.767467] RIP: 0033:0x7f8ad3e8c469
[   13.768206] Code: 00 f3 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40
00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24
08
[   13.771618] RSP: 002b:00007f8ad457cda8 EFLAGS: 00000246 ORIG_RAX:
0000000000000133
[   13.773054] RAX: ffffffffffffffda RBX: 0000000000000133 RCX: 00007f8ad3e8c469
[   13.774260] RDX: 0000000000000003 RSI: 00000000200008c0 RDI: 0000000000000003
[   13.775586] RBP: 0000000000000133 R08: 0000000000000000 R09: 0000000000000000
[   13.776909] R10: 0000000040044040 R11: 0000000000000246 R12: 000000000069bf8c
[   13.778390] R13: 00007ffe38506fef R14: 00007f8ad455d000 R15: 0000000000000003
[   13.779752] ---[ end trace 55e20198e13af26d ]---
[   13.780935] ------------[ cut here ]------------
[   13.781986] WARNING: CPU: 0 PID: 1658 at net/core/skbuff.c:5429
skb_try_coalesce+0x1019/0x12c0
[   13.783740] Modules linked in:
[   13.784398] CPU: 0 PID: 1658 Comm: syz-executor Tainted: G        W
        5.14.0ea78abdd8ff18baaea3211eabdd6a2a88169cfd6 #134
[   13.786692] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
[   13.788958] RIP: 0010:skb_try_coalesce+0x1019/0x12c0
[   13.789930] Code: 24 20 bf 01 00 00 00 8b 40 20 44 0f b7 f0 44 89
f6 e8 0b 2b cf fe 41 83 ee 01 0f 85 01 f3 ff ff e9 42 f6 ff ff e8 67
2c
[   13.793371] RSP: 0018:ffffc9000091f530 EFLAGS: 00010293
[   13.794316] RAX: 0000000000000000 RBX: 0000000000000c00 RCX: 0000000000000000
[   13.795688] RDX: ffff888100fa2880 RSI: ffffffff826767a9 RDI: 0000000000000003
[   13.797093] RBP: ffff888109496de0 R08: 0000000000000c00 R09: 0000000000000000
[   13.798381] R10: ffffffff82676122 R11: 0000000000000000 R12: ffff888100efc0e0
[   13.799766] R13: ffff8881046baac0 R14: 0000000000001000 R15: ffff888100efc156
[   13.801052] FS:  00007f8ad457d700(0000) GS:ffff88811b400000(0000)
knlGS:0000000000000000
[   13.802463] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   13.803603] CR2: 00007f83d8028000 CR3: 00000001045d8000 CR4: 00000000000006f0
[   13.805079] Call Trace:
[   13.805622]  tcp_try_coalesce+0x312/0x870
[   13.806488]  ? tcp_ack_update_rtt+0xfc0/0xfc0
[   13.807406]  ? __sanitizer_cov_trace_const_cmp4+0x1c/0x70
[   13.808483]  ? tcp_try_rmem_schedule+0x99b/0x16e0
[   13.809296]  tcp_queue_rcv+0x73/0x670
[   13.810013]  tcp_data_queue+0x11e5/0x4af0
[   13.810844]  ? __sanitizer_cov_trace_const_cmp2+0x22/0x80
[   13.811890]  ? tcp_urg+0x108/0xb60
[   13.812536]  ? tcp_data_ready+0x550/0x550
[   13.813362]  ? tcp_enter_cwr+0x3f0/0x4d0
[   13.814148]  ? __sanitizer_cov_trace_cmp4+0x1c/0x70
[   13.815014]  ? ktime_get+0xf4/0x150
[   13.815637]  ? __sanitizer_cov_trace_const_cmp8+0x1d/0x70
[   13.816721]  tcp_rcv_established+0x83a/0x1d30
[   13.817541]  ? tcp_data_queue+0x4af0/0x4af0
[   13.818375]  tcp_v6_do_rcv+0x438/0x1380
[   13.819160]  __release_sock+0x1ad/0x310
[   13.819975]  release_sock+0x54/0x1a0
[   13.820745]  ? tcp_sendmsg_locked+0x2ee0/0x2ee0
[   13.821662]  tcp_sendmsg+0x36/0x40
[   13.822351]  inet6_sendmsg+0xb5/0x140
[   13.823115]  ? inet6_ioctl+0x2a0/0x2a0
[   13.823909]  ____sys_sendmsg+0x3b5/0x970
[   13.824720]  ? sock_release+0x1b0/0x1b0
[   13.825521]  ? __ia32_sys_recvmmsg+0x290/0x290
[   13.826441]  ? futex_wait_setup+0x2e0/0x2e0
[   13.827308]  ___sys_sendmsg+0xff/0x170
[   13.828112]  ? hash_futex+0x12/0x1f0
[   13.828853]  ? sendmsg_copy_msghdr+0x160/0x160
[   13.829804]  ? asm_exc_page_fault+0x1e/0x30
[   13.830660]  ? __sanitizer_cov_trace_const_cmp1+0x22/0x80
[   13.831760]  ? __fget_files+0x1c2/0x2a0
[   13.832576]  ? __fget_light+0xea/0x270
[   13.833349]  ? sockfd_lookup_light+0xc3/0x170
[   13.834289]  __sys_sendmmsg+0x192/0x440
[   13.835065]  ? __ia32_sys_sendmsg+0xb0/0xb0
[   13.835918]  ? vfs_fileattr_set+0xb80/0xb80
[   13.836823]  ? __sanitizer_cov_trace_const_cmp4+0x1c/0x70
[   13.837941]  ? alloc_file_pseudo+0x1/0x250
[   13.838810]  ? sock_ioctl+0x1bb/0x670
[   13.839550]  ? __do_sys_futex+0xe7/0x3d0
[   13.840369]  ? __do_sys_futex+0xe7/0x3d0
[   13.841205]  ? __do_sys_futex+0xf0/0x3d0
[   13.842022]  ? __restore_fpregs_from_fpstate+0xa9/0xf0
[   13.843115]  ? fpregs_mark_activate+0x130/0x130
[   13.844074]  ? do_futex+0x1be0/0x1be0
[   13.844868]  __x64_sys_sendmmsg+0x98/0x100
[   13.845725]  ? syscall_exit_to_user_mode+0x1d/0x40
[   13.846754]  do_syscall_64+0x3b/0x90
[   13.847472]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[   13.848550] RIP: 0033:0x7f8ad3e8c469
[   13.849289] Code: 00 f3 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40
00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24
08
[   13.852935] RSP: 002b:00007f8ad457cda8 EFLAGS: 00000246 ORIG_RAX:
0000000000000133
[   13.854476] RAX: ffffffffffffffda RBX: 0000000000000133 RCX: 00007f8ad3e8c469
[   13.855896] RDX: 0000000000000003 RSI: 00000000200008c0 RDI: 0000000000000003
[   13.857304] RBP: 0000000000000133 R08: 0000000000000000 R09: 0000000000000000
[   13.858756] R10: 0000000040044040 R11: 0000000000000246 R12: 000000000069bf8c
[   13.860168] R13: 00007ffe38506fef R14: 00007f8ad455d000 R15: 0000000000000003
[   13.861597] ---[ end trace 55e20198e13af26e ]---


>

> diff --git a/include/net/sock.h b/include/net/sock.h

> index 95b2577..173d58c 100644

> --- a/include/net/sock.h

> +++ b/include/net/sock.h

> @@ -1695,6 +1695,7 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,

>                              gfp_t priority);

>  void __sock_wfree(struct sk_buff *skb);

>  void sock_wfree(struct sk_buff *skb);

> +bool is_skb_wmem(const struct sk_buff *skb);

>  struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,

>                              gfp_t priority);

>  void skb_orphan_partial(struct sk_buff *skb);

> diff --git a/net/core/skbuff.c b/net/core/skbuff.c

> index f931176..09991cb 100644

> --- a/net/core/skbuff.c

> +++ b/net/core/skbuff.c

> @@ -1804,28 +1804,45 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)

>  struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)

>  {

>         int delta = headroom - skb_headroom(skb);

> +       int osize = skb_end_offset(skb);

> +       struct sk_buff *oskb = NULL;

> +       struct sock *sk = skb->sk;

>

>         if (WARN_ONCE(delta <= 0,

>                       "%s is expecting an increase in the headroom", __func__))

>                 return skb;

>

> -       /* pskb_expand_head() might crash, if skb is shared */

> +       delta = SKB_DATA_ALIGN(delta);

> +       /* pskb_expand_head() might crash, if skb is shared.

> +        * Also we should clone skb if its destructor does

> +        * not adjust skb->truesize and sk->sk_wmem_alloc

> +        */

>         if (skb_shared(skb)) {

>                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);

>

> -               if (likely(nskb)) {

> -                       if (skb->sk)

> -                               skb_set_owner_w(nskb, skb->sk);

> -                       consume_skb(skb);

> -               } else {

> +               if (unlikely(!nskb)) {

>                         kfree_skb(skb);

> +                       return NULL;

>                 }

> +               oskb = skb;

>                 skb = nskb;

>         }

> -       if (skb &&

> -           pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {

> +       if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC)) {

>                 kfree_skb(skb);

> -               skb = NULL;

> +               kfree_skb(oskb);

> +               return NULL;

> +       }

> +       if (oskb) {

> +               if (sk)

> +                       skb_set_owner_w(skb, sk);

> +               consume_skb(oskb);

> +       } else if (sk) {

> +               delta = osize - skb_end_offset(skb);

> +               if (!is_skb_wmem(skb))

> +                       skb_set_owner_w(skb, sk);

> +               skb->truesize += delta;

> +               if (sk_fullsock(sk))

> +                       refcount_add(delta, &sk->sk_wmem_alloc);

>         }

>         return skb;

>  }

> diff --git a/net/core/sock.c b/net/core/sock.c

> index 950f1e7..6cbda43 100644

> --- a/net/core/sock.c

> +++ b/net/core/sock.c

> @@ -2227,6 +2227,14 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)

>  }

>  EXPORT_SYMBOL(skb_set_owner_w);

>

> +bool is_skb_wmem(const struct sk_buff *skb)

> +{

> +       return skb->destructor == sock_wfree ||

> +              skb->destructor == __sock_wfree ||

> +              (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree);

> +}

> +EXPORT_SYMBOL(is_skb_wmem);

> +

>  static bool can_skb_orphan_partial(const struct sk_buff *skb)

>  {

>  #ifdef CONFIG_TLS_DEVICE

> --

> 1.8.3.1

>
Eric Dumazet Sept. 1, 2021, 7:17 p.m. UTC | #9
On 9/1/21 1:11 AM, Vasily Averin wrote:
> Christoph Paasch reports [1] about incorrect skb->truesize

> after skb_expand_head() call in ip6_xmit.

> This may happen because of two reasons:

> - skb_set_owner_w() for newly cloned skb is called too early,

> before pskb_expand_head() where truesize is adjusted for (!skb-sk) case.

> - pskb_expand_head() does not adjust truesize in (skb->sk) case.

> In this case sk->sk_wmem_alloc should be adjusted too.

> 

> [1] https://lkml.org/lkml/2021/8/20/1082

> 

> Fixes: f1260ff15a71 ("skbuff: introduce skb_expand_head()")

> Reported-by: Christoph Paasch <christoph.paasch@gmail.com>

> Signed-off-by: Vasily Averin <vvs@virtuozzo.com>

> ---

> v4: decided to use is_skb_wmem() after pskb_expand_head() call

>     fixed 'return (EXPRESSION);' in os_skb_wmem according to Eric Dumazet

> v3: removed __pskb_expand_head(),

>     added is_skb_wmem() helper for skb with wmem-compatible destructors

>     there are 2 ways to use it:

>      - before pskb_expand_head(), to create skb clones

>      - after successfull pskb_expand_head() to change owner on extended skb.

> v2: based on patch version from Eric Dumazet,

>     added __pskb_expand_head() function, which can be forced

>     to adjust skb->truesize and sk->sk_wmem_alloc.

> ---

>  include/net/sock.h |  1 +

>  net/core/skbuff.c  | 35 ++++++++++++++++++++++++++---------

>  net/core/sock.c    |  8 ++++++++

>  3 files changed, 35 insertions(+), 9 deletions(-)

> 

> diff --git a/include/net/sock.h b/include/net/sock.h

> index 95b2577..173d58c 100644

> --- a/include/net/sock.h

> +++ b/include/net/sock.h

> @@ -1695,6 +1695,7 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,

>  			     gfp_t priority);

>  void __sock_wfree(struct sk_buff *skb);

>  void sock_wfree(struct sk_buff *skb);

> +bool is_skb_wmem(const struct sk_buff *skb);

>  struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,

>  			     gfp_t priority);

>  void skb_orphan_partial(struct sk_buff *skb);

> diff --git a/net/core/skbuff.c b/net/core/skbuff.c

> index f931176..09991cb 100644

> --- a/net/core/skbuff.c

> +++ b/net/core/skbuff.c

> @@ -1804,28 +1804,45 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)

>  struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)

>  {

>  	int delta = headroom - skb_headroom(skb);

> +	int osize = skb_end_offset(skb);

> +	struct sk_buff *oskb = NULL;

> +	struct sock *sk = skb->sk;

>  

>  	if (WARN_ONCE(delta <= 0,

>  		      "%s is expecting an increase in the headroom", __func__))

>  		return skb;

>  

> -	/* pskb_expand_head() might crash, if skb is shared */

> +	delta = SKB_DATA_ALIGN(delta);

> +	/* pskb_expand_head() might crash, if skb is shared.

> +	 * Also we should clone skb if its destructor does

> +	 * not adjust skb->truesize and sk->sk_wmem_alloc

> + 	 */

>  	if (skb_shared(skb)) {

>  		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);

>  

> -		if (likely(nskb)) {

> -			if (skb->sk)

> -				skb_set_owner_w(nskb, skb->sk);

> -			consume_skb(skb);

> -		} else {

> +		if (unlikely(!nskb)) {

>  			kfree_skb(skb);

> +			return NULL;

>  		}

> +		oskb = skb;

>  		skb = nskb;

>  	}

> -	if (skb &&

> -	    pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {

> +	if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC)) {

>  		kfree_skb(skb);

> -		skb = NULL;

> +		kfree_skb(oskb);

> +		return NULL;

> +	}

> +	if (oskb) {

> +		if (sk)


if (is_skb_wmem(oskb))

Again, it is not valid to call skb_set_owner_w(skb, sk) on all kind of sockets.

> +			skb_set_owner_w(skb, sk);

> +		consume_skb(oskb);

> +	} else if (sk) {


&& (skb->destructor != sock_edemux)

(Because in this case , pskb_expand_head() already adjusted skb->truesize)

> +		delta = osize - skb_end_offset(skb);


> +		if (!is_skb_wmem(skb))

> +			skb_set_owner_w(skb, sk);


This is dangerous, even if a socket is there, its sk->sk_wmem_alloc could be zero.

We can not add skb->truesize to a refcount_t that already reached 0 (sk_free())


If is_skb_wmem() is false, you probably should do nothing, and leave
current destructor as it is.
(skb->truesize can be adjusted without issue)

> +		skb->truesize += delta;

> +		if (sk_fullsock(sk))


if (is_skb_wmem(skb))

> +			refcount_add(delta, &sk->sk_wmem_alloc);

>  	}

>  	return skb;

>  }

> diff --git a/net/core/sock.c b/net/core/sock.c

> index 950f1e7..6cbda43 100644

> --- a/net/core/sock.c

> +++ b/net/core/sock.c

> @@ -2227,6 +2227,14 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)

>  }

>  EXPORT_SYMBOL(skb_set_owner_w);

>  

> +bool is_skb_wmem(const struct sk_buff *skb)

> +{

> +	return skb->destructor == sock_wfree ||

> +	       skb->destructor == __sock_wfree ||

> +	       (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree);

> +}

> +EXPORT_SYMBOL(is_skb_wmem);

> +

>  static bool can_skb_orphan_partial(const struct sk_buff *skb)

>  {

>  #ifdef CONFIG_TLS_DEVICE

>
Vasily Averin Sept. 2, 2021, 3:59 a.m. UTC | #10
On 9/1/21 10:17 PM, Eric Dumazet wrote:
> 

> 

> On 9/1/21 1:11 AM, Vasily Averin wrote:

>> Christoph Paasch reports [1] about incorrect skb->truesize

>> after skb_expand_head() call in ip6_xmit.

>> This may happen because of two reasons:

>> - skb_set_owner_w() for newly cloned skb is called too early,

>> before pskb_expand_head() where truesize is adjusted for (!skb-sk) case.

>> - pskb_expand_head() does not adjust truesize in (skb->sk) case.

>> In this case sk->sk_wmem_alloc should be adjusted too.

>>

>> [1] https://lkml.org/lkml/2021/8/20/1082

>>

>> Fixes: f1260ff15a71 ("skbuff: introduce skb_expand_head()")

>> Reported-by: Christoph Paasch <christoph.paasch@gmail.com>

>> Signed-off-by: Vasily Averin <vvs@virtuozzo.com>

>> ---

>> v4: decided to use is_skb_wmem() after pskb_expand_head() call

>>     fixed 'return (EXPRESSION);' in os_skb_wmem according to Eric Dumazet

>> v3: removed __pskb_expand_head(),

>>     added is_skb_wmem() helper for skb with wmem-compatible destructors

>>     there are 2 ways to use it:

>>      - before pskb_expand_head(), to create skb clones

>>      - after successfull pskb_expand_head() to change owner on extended skb.

>> v2: based on patch version from Eric Dumazet,

>>     added __pskb_expand_head() function, which can be forced

>>     to adjust skb->truesize and sk->sk_wmem_alloc.

>> ---

>>  include/net/sock.h |  1 +

>>  net/core/skbuff.c  | 35 ++++++++++++++++++++++++++---------

>>  net/core/sock.c    |  8 ++++++++

>>  3 files changed, 35 insertions(+), 9 deletions(-)

>>

>> diff --git a/include/net/sock.h b/include/net/sock.h

>> index 95b2577..173d58c 100644

>> --- a/include/net/sock.h

>> +++ b/include/net/sock.h

>> @@ -1695,6 +1695,7 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,

>>  			     gfp_t priority);

>>  void __sock_wfree(struct sk_buff *skb);

>>  void sock_wfree(struct sk_buff *skb);

>> +bool is_skb_wmem(const struct sk_buff *skb);

>>  struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,

>>  			     gfp_t priority);

>>  void skb_orphan_partial(struct sk_buff *skb);

>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c

>> index f931176..09991cb 100644

>> --- a/net/core/skbuff.c

>> +++ b/net/core/skbuff.c

>> @@ -1804,28 +1804,45 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)

>>  struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)

>>  {

>>  	int delta = headroom - skb_headroom(skb);

>> +	int osize = skb_end_offset(skb);

>> +	struct sk_buff *oskb = NULL;

>> +	struct sock *sk = skb->sk;

>>  

>>  	if (WARN_ONCE(delta <= 0,

>>  		      "%s is expecting an increase in the headroom", __func__))

>>  		return skb;

>>  

>> -	/* pskb_expand_head() might crash, if skb is shared */

>> +	delta = SKB_DATA_ALIGN(delta);

>> +	/* pskb_expand_head() might crash, if skb is shared.

>> +	 * Also we should clone skb if its destructor does

>> +	 * not adjust skb->truesize and sk->sk_wmem_alloc

>> + 	 */

>>  	if (skb_shared(skb)) {

>>  		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);

>>  

>> -		if (likely(nskb)) {

>> -			if (skb->sk)

>> -				skb_set_owner_w(nskb, skb->sk);

>> -			consume_skb(skb);

>> -		} else {

>> +		if (unlikely(!nskb)) {

>>  			kfree_skb(skb);

>> +			return NULL;

>>  		}

>> +		oskb = skb;

>>  		skb = nskb;

>>  	}

>> -	if (skb &&

>> -	    pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {

>> +	if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC)) {

>>  		kfree_skb(skb);

>> -		skb = NULL;

>> +		kfree_skb(oskb);

>> +		return NULL;

>> +	}

>> +	if (oskb) {

>> +		if (sk)

> 

> if (is_skb_wmem(oskb))

> Again, it is not valid to call skb_set_owner_w(skb, sk) on all kind of sockets.


I'm disagree.

In this particular case we have new skb with skb->sk = NULL,
In this case skb_orphan() called inside skb_set_owner_w(() will do nothing,
we just properly set destructor to sock_wfree and adjust sk->sk_wmem_alloc,

It is 100% equivalent of code used with skb_realloc_headroom(),
and there was no claims on this.
Cristoph's reproducer do not use shared skb and to not check this path,
so it cannot be the reason of troubles in his experiments.

Old destructor (sock_edemux?) can be calleda bit later, for old skb, inside consume_skb().
It can decrement last refcount and can trigger sk_free(). However in this case
adjusted sk_wmem_alloc did not allow to free sk.

So I'm sure it is safe.

>> +			skb_set_owner_w(skb, sk);

>> +		consume_skb(oskb);

>> +	} else if (sk) {

> 

> && (skb->destructor != sock_edemux)

> (Because in this case , pskb_expand_head() already adjusted skb->truesize)


Agree, thank you, my fault, I've missed it.
I think it was the reason of the troubles in last Cristoph's experiment.

>> +		delta = osize - skb_end_offset(skb);

> 

>> +		if (!is_skb_wmem(skb))

>> +			skb_set_owner_w(skb, sk);

> 

> This is dangerous, even if a socket is there, its sk->sk_wmem_alloc could be zero.

> We can not add skb->truesize to a refcount_t that already reached 0 (sk_free())

> 

> If is_skb_wmem() is false, you probably should do nothing, and leave

> current destructor as it is.


I;m still not sure and think it is tricky too.
I've found few destructors called sock_wfree inside, they require sk_wmem_alloc adjustement.
sctp_wfree, unix_destruct_scm and tpacket_destruct_skb

In the same time another ones do not use sk_wmem_alloc and I do not know how to detect proper ones.
Potentially there are some 3rd party protocols out-of-tree, and I cannot list all of them here.

However I think I can use the same trick as one described above:
I can increase sk_wmem_alloc before skb_orphan(), so sk_free() called by old destuctor 
cannot call __sk_free() and release sk.

I hope this should work, 
otherwise we'll need to clone skb for !is_skb_wmem(skb) before pskb_expand_head() call.

Thank you,
	Vasily Averin
Eric Dumazet Sept. 2, 2021, 4:32 a.m. UTC | #11
On 9/1/21 8:59 PM, Vasily Averin wrote:
> On 9/1/21 10:17 PM, Eric Dumazet wrote:

>>

>>

>> On 9/1/21 1:11 AM, Vasily Averin wrote:

>>> Christoph Paasch reports [1] about incorrect skb->truesize

>>> after skb_expand_head() call in ip6_xmit.

>>> This may happen because of two reasons:

>>> - skb_set_owner_w() for newly cloned skb is called too early,

>>> before pskb_expand_head() where truesize is adjusted for (!skb-sk) case.

>>> - pskb_expand_head() does not adjust truesize in (skb->sk) case.

>>> In this case sk->sk_wmem_alloc should be adjusted too.

>>>

>>> [1] https://lkml.org/lkml/2021/8/20/1082

>>>

>>> Fixes: f1260ff15a71 ("skbuff: introduce skb_expand_head()")

>>> Reported-by: Christoph Paasch <christoph.paasch@gmail.com>

>>> Signed-off-by: Vasily Averin <vvs@virtuozzo.com>

>>> ---

>>> v4: decided to use is_skb_wmem() after pskb_expand_head() call

>>>     fixed 'return (EXPRESSION);' in os_skb_wmem according to Eric Dumazet

>>> v3: removed __pskb_expand_head(),

>>>     added is_skb_wmem() helper for skb with wmem-compatible destructors

>>>     there are 2 ways to use it:

>>>      - before pskb_expand_head(), to create skb clones

>>>      - after successfull pskb_expand_head() to change owner on extended skb.

>>> v2: based on patch version from Eric Dumazet,

>>>     added __pskb_expand_head() function, which can be forced

>>>     to adjust skb->truesize and sk->sk_wmem_alloc.

>>> ---

>>>  include/net/sock.h |  1 +

>>>  net/core/skbuff.c  | 35 ++++++++++++++++++++++++++---------

>>>  net/core/sock.c    |  8 ++++++++

>>>  3 files changed, 35 insertions(+), 9 deletions(-)

>>>

>>> diff --git a/include/net/sock.h b/include/net/sock.h

>>> index 95b2577..173d58c 100644

>>> --- a/include/net/sock.h

>>> +++ b/include/net/sock.h

>>> @@ -1695,6 +1695,7 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,

>>>  			     gfp_t priority);

>>>  void __sock_wfree(struct sk_buff *skb);

>>>  void sock_wfree(struct sk_buff *skb);

>>> +bool is_skb_wmem(const struct sk_buff *skb);

>>>  struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,

>>>  			     gfp_t priority);

>>>  void skb_orphan_partial(struct sk_buff *skb);

>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c

>>> index f931176..09991cb 100644

>>> --- a/net/core/skbuff.c

>>> +++ b/net/core/skbuff.c

>>> @@ -1804,28 +1804,45 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)

>>>  struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)

>>>  {

>>>  	int delta = headroom - skb_headroom(skb);

>>> +	int osize = skb_end_offset(skb);

>>> +	struct sk_buff *oskb = NULL;

>>> +	struct sock *sk = skb->sk;

>>>  

>>>  	if (WARN_ONCE(delta <= 0,

>>>  		      "%s is expecting an increase in the headroom", __func__))

>>>  		return skb;

>>>  

>>> -	/* pskb_expand_head() might crash, if skb is shared */

>>> +	delta = SKB_DATA_ALIGN(delta);

>>> +	/* pskb_expand_head() might crash, if skb is shared.

>>> +	 * Also we should clone skb if its destructor does

>>> +	 * not adjust skb->truesize and sk->sk_wmem_alloc

>>> + 	 */

>>>  	if (skb_shared(skb)) {

>>>  		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);

>>>  

>>> -		if (likely(nskb)) {

>>> -			if (skb->sk)

>>> -				skb_set_owner_w(nskb, skb->sk);

>>> -			consume_skb(skb);

>>> -		} else {

>>> +		if (unlikely(!nskb)) {

>>>  			kfree_skb(skb);

>>> +			return NULL;

>>>  		}

>>> +		oskb = skb;

>>>  		skb = nskb;

>>>  	}

>>> -	if (skb &&

>>> -	    pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {

>>> +	if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC)) {

>>>  		kfree_skb(skb);

>>> -		skb = NULL;

>>> +		kfree_skb(oskb);

>>> +		return NULL;

>>> +	}

>>> +	if (oskb) {

>>> +		if (sk)

>>

>> if (is_skb_wmem(oskb))

>> Again, it is not valid to call skb_set_owner_w(skb, sk) on all kind of sockets.

> 

> I'm disagree.


:/ :/ :/

> 

> In this particular case we have new skb with skb->sk = NULL,

> In this case skb_orphan() called inside skb_set_owner_w(() will do nothing,

> we just properly set destructor to sock_wfree and adjust sk->sk_wmem_alloc,

> 


We can not adjust sk_wmem_alloc if this is already 0

The only way you can guarantee this is :

to look at is_skb_wmem(oskb)

Because then you are certain that _at_ least this skb owns a reference on sk->sk_wmem_alloc

If another kind of destructor is held by oskb, then you can not assume this.

Otherwise we need a new refcount_add_if_not_zero() function, and make skb_set_owner_w()
more expensive for a very corner case.

> It is 100% equivalent of code used with skb_realloc_headroom(),

> and there was no claims on this.

> Cristoph's reproducer do not use shared skb and to not check this path,

> so it cannot be the reason of troubles in his experiments.

> 

> Old destructor (sock_edemux?) can be calleda bit later, for old skb, inside consume_skb().

> It can decrement last refcount and can trigger sk_free(). However in this case

> adjusted sk_wmem_alloc did not allow to free sk.

> 

> So I'm sure it is safe.


It is not safe.

> 

>>> +			skb_set_owner_w(skb, sk);

>>> +		consume_skb(oskb);

>>> +	} else if (sk) {

>>

>> && (skb->destructor != sock_edemux)

>> (Because in this case , pskb_expand_head() already adjusted skb->truesize)

> 

> Agree, thank you, my fault, I've missed it.

> I think it was the reason of the troubles in last Cristoph's experiment.

> 

>>> +		delta = osize - skb_end_offset(skb);

>>

>>> +		if (!is_skb_wmem(skb))

>>> +			skb_set_owner_w(skb, sk);

>>

>> This is dangerous, even if a socket is there, its sk->sk_wmem_alloc could be zero.

>> We can not add skb->truesize to a refcount_t that already reached 0 (sk_free())

>>

>> If is_skb_wmem() is false, you probably should do nothing, and leave

>> current destructor as it is.

> 

> I;m still not sure and think it is tricky too.




> I've found few destructors called sock_wfree inside, they require sk_wmem_alloc adjustement.

> sctp_wfree, unix_destruct_scm and tpacket_destruct_skb

> 

> In the same time another ones do not use sk_wmem_alloc and I do not know how to detect proper ones.

> Potentially there are some 3rd party protocols out-of-tree, and I cannot list all of them here.


I think you missed netem case, in particular
skb_orphan_partial() which I already pointed out.

You can setup a stack of virtual devices (tunnels),
with a qdisc on them, before ip6_xmit() is finally called...

Socket might have been closed already.

To test your patch, you could force a skb_orphan_partial() at the beginning
of skb_expand_head() (extending code coverage)

> 

> However I think I can use the same trick as one described above:

> I can increase sk_wmem_alloc before skb_orphan(), so sk_free() called by old destuctor 

> cannot call __sk_free() and release sk.



You can not change sk_wmem_alloc if this is already 0.

refcount_add() will trigger a warning (panic under KASAN)

> 

> I hope this should work, 

> otherwise we'll need to clone skb for !is_skb_wmem(skb) before pskb_expand_head() call.

> 

> Thank you,

> 	Vasily Averin

>
Eric Dumazet Sept. 2, 2021, 4:48 a.m. UTC | #12
On 9/1/21 9:32 PM, Eric Dumazet wrote:

> 

> I think you missed netem case, in particular

> skb_orphan_partial() which I already pointed out.

> 

> You can setup a stack of virtual devices (tunnels),

> with a qdisc on them, before ip6_xmit() is finally called...

> 

> Socket might have been closed already.

> 

> To test your patch, you could force a skb_orphan_partial() at the beginning

> of skb_expand_head() (extending code coverage)

> 


To clarify :

It is ok to 'downgrade' an skb->destructor having a ref on sk->sk_wmem_alloc to
something owning a ref on sk->refcnt.

But the opposite operation (ref on sk->sk_refcnt -->  ref on sk->sk_wmem_alloc) is not safe.
Vasily Averin Sept. 2, 2021, 7:13 a.m. UTC | #13
On 9/2/21 7:48 AM, Eric Dumazet wrote:
> On 9/1/21 9:32 PM, Eric Dumazet wrote:

>> I think you missed netem case, in particular

>> skb_orphan_partial() which I already pointed out.

>>

>> You can setup a stack of virtual devices (tunnels),

>> with a qdisc on them, before ip6_xmit() is finally called...

>>

>> Socket might have been closed already.

>>

>> To test your patch, you could force a skb_orphan_partial() at the beginning

>> of skb_expand_head() (extending code coverage)

> 

> To clarify :

> 

> It is ok to 'downgrade' an skb->destructor having a ref on sk->sk_wmem_alloc to

> something owning a ref on sk->refcnt.

> 

> But the opposite operation (ref on sk->sk_refcnt -->  ref on sk->sk_wmem_alloc) is not safe.


Could you please explain in more details, since I stil have a completely opposite point of view?

Every sk referenced in skb have sk_wmem_alloc > 9 
It is assigned to 1 in sk_alloc and decremented right before last __sk_free(),
inside  both sk_free() sock_wfree() and __sock_wfree()

So it is safe to adjust skb->sk->sk_wmem_alloc, 
because alive skb keeps reference to alive sk and last one keeps sk_wmem_alloc > 0

So any destructor used sk->sk_refcnt will already have sk_wmem_alloc > 0, 
because last sock_put() calls sk_free().

However now I'm not sure in reversed direction.
skb_set_owner_w() check !sk_fullsock(sk) and call sock_hold(sk);
If sk->sk_refcnt can be 0 here (i.e. after execution of old destructor inside skb_orphan) 
-- it can be trigger pointed problem:
"refcount_add() will trigger a warning (panic under KASAN)".

Could you please explain where I'm wrong?

Thank you,
	Vasily Averin
Vasily Averin Sept. 2, 2021, 7:33 a.m. UTC | #14
On 9/2/21 10:13 AM, Vasily Averin wrote:
> On 9/2/21 7:48 AM, Eric Dumazet wrote:

>> On 9/1/21 9:32 PM, Eric Dumazet wrote:

>>> I think you missed netem case, in particular

>>> skb_orphan_partial() which I already pointed out.

>>>

>>> You can setup a stack of virtual devices (tunnels),

>>> with a qdisc on them, before ip6_xmit() is finally called...

>>>

>>> Socket might have been closed already.

>>>

>>> To test your patch, you could force a skb_orphan_partial() at the beginning

>>> of skb_expand_head() (extending code coverage)

>>

>> To clarify :

>>

>> It is ok to 'downgrade' an skb->destructor having a ref on sk->sk_wmem_alloc to

>> something owning a ref on sk->refcnt.

>>

>> But the opposite operation (ref on sk->sk_refcnt -->  ref on sk->sk_wmem_alloc) is not safe.

> 

> Could you please explain in more details, since I stil have a completely opposite point of view?

> 

> Every sk referenced in skb have sk_wmem_alloc > 9 

> It is assigned to 1 in sk_alloc and decremented right before last __sk_free(),

> inside  both sk_free() sock_wfree() and __sock_wfree()

> 

> So it is safe to adjust skb->sk->sk_wmem_alloc, 

> because alive skb keeps reference to alive sk and last one keeps sk_wmem_alloc > 0

> 

> So any destructor used sk->sk_refcnt will already have sk_wmem_alloc > 0, 

> because last sock_put() calls sk_free().

> 

> However now I'm not sure in reversed direction.

> skb_set_owner_w() check !sk_fullsock(sk) and call sock_hold(sk);

> If sk->sk_refcnt can be 0 here (i.e. after execution of old destructor inside skb_orphan) 

> -- it can be trigger pointed problem:

> "refcount_add() will trigger a warning (panic under KASAN)".

> 

> Could you please explain where I'm wrong?


To clarify:
I'm agree it is unsafe  to call on alive skb:
skb_orphan(skb)
adjust(skb_>sk->sk_wmem_alloc)

becasue 2 reasone:
1) old destructor can decrease sk_vmem_alloc to zero and free sk
2) becasue old destructor if !sk_fullsock(sk) can call sock_out and release last sk->sk_refcnt reference.
  in this case sock_hold() will trigger warning.

1) can be handled, we can adjust(sk_wmem_alloc) before skb_orphan()
but I badly understand how to handle 2nd case.

Thank you,
	Vasily Averin
Vasily Averin Sept. 2, 2021, 8:31 a.m. UTC | #15
On 9/2/21 10:33 AM, Vasily Averin wrote:
> On 9/2/21 10:13 AM, Vasily Averin wrote:

>> On 9/2/21 7:48 AM, Eric Dumazet wrote:

>>> On 9/1/21 9:32 PM, Eric Dumazet wrote:

>>>> I think you missed netem case, in particular

>>>> skb_orphan_partial() which I already pointed out.

>>>>

>>>> You can setup a stack of virtual devices (tunnels),

>>>> with a qdisc on them, before ip6_xmit() is finally called...

>>>>

>>>> Socket might have been closed already.

>>>>

>>>> To test your patch, you could force a skb_orphan_partial() at the beginning

>>>> of skb_expand_head() (extending code coverage)

>>>

>>> To clarify :

>>>

>>> It is ok to 'downgrade' an skb->destructor having a ref on sk->sk_wmem_alloc to

>>> something owning a ref on sk->refcnt.

>>>

>>> But the opposite operation (ref on sk->sk_refcnt -->  ref on sk->sk_wmem_alloc) is not safe.

>>

>> Could you please explain in more details, since I stil have a completely opposite point of view?

>>

>> Every sk referenced in skb have sk_wmem_alloc > 9 

>> It is assigned to 1 in sk_alloc and decremented right before last __sk_free(),

>> inside  both sk_free() sock_wfree() and __sock_wfree()

>>

>> So it is safe to adjust skb->sk->sk_wmem_alloc, 

>> because alive skb keeps reference to alive sk and last one keeps sk_wmem_alloc > 0

>>

>> So any destructor used sk->sk_refcnt will already have sk_wmem_alloc > 0, 

>> because last sock_put() calls sk_free().

>>

>> However now I'm not sure in reversed direction.

>> skb_set_owner_w() check !sk_fullsock(sk) and call sock_hold(sk);

>> If sk->sk_refcnt can be 0 here (i.e. after execution of old destructor inside skb_orphan) 

>> -- it can be trigger pointed problem:

>> "refcount_add() will trigger a warning (panic under KASAN)".

>>

>> Could you please explain where I'm wrong?

> 

> To clarify:

> I'm agree it is unsafe  to call on alive skb:


I badly explained the problem in previous letter, let me repeat once again:

I'm told about this piece of code:
+	} else if (sk && skb->destructor != sock_edemux) {
+		delta = osize - skb_end_offset(skb);
+		if (!is_skb_wmem(skb))
+			skb_set_owner_w(skb, sk);
+		skb->truesize += delta;
+		if (sk_fullsock(sk))
+			refcount_add(delta, &sk->sk_wmem_alloc);
 	}

it is called on alive expanded skb and it is incorrect because 2 reasons:

a) if old destructor use ref on sk->sk_wmem_alloc
   It can decrease to 0 and release sk.
b) if old descriptor use ref on sk->refcnt and !sk_fullsock(sk)
    old decriptor can release last reference and release sk.

We can workaround release of sk by move of 
refcount_add(delta, &sk->sk_wmem_alloc) before skb_set_owner_w()

        } else if (sk && skb->destructor != sock_edemux) {
                delta = osize - skb_end_offset(skb);
                refcount_add(delta, &sk->sk_wmem_alloc);
                if (!is_skb_wmem(skb))
                        skb_set_owner_w(skb, sk);
                skb->truesize += delta;
#ifdef CONFIG_INET
                if (!sk_fullsock(sk))
                        refcount_dec(delta, &sk->sk_wmem_alloc);
#endif
        }

However it it does not resolve b) completely
 
oid skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
        skb_orphan(skb); <<< old destructor releases last sk->refcnt ...
        skb->sk = sk;
...
        if (unlikely(!sk_fullsock(sk))) {
                skb->destructor = sock_edemux;
                sock_hold(sk);   <<<< ...and it trigger wrining/panic 
                return;
        }       

Thank you,
	Vasily Averin
Christoph Paasch Sept. 2, 2021, 3:53 p.m. UTC | #16
On Thu, Sep 2, 2021 at 4:12 AM Vasily Averin <vvs@virtuozzo.com> wrote:
>

> Christoph Paasch reports [1] about incorrect skb->truesize

> after skb_expand_head() call in ip6_xmit.

> This may happen because of two reasons:

> - skb_set_owner_w() for newly cloned skb is called too early,

> before pskb_expand_head() where truesize is adjusted for (!skb-sk) case.

> - pskb_expand_head() does not adjust truesize in (skb->sk) case.

> In this case sk->sk_wmem_alloc should be adjusted too.

>

> [1] https://lkml.org/lkml/2021/8/20/1082

>

> Fixes: f1260ff15a71 ("skbuff: introduce skb_expand_head()")

> Reported-by: Christoph Paasch <christoph.paasch@gmail.com>

> Signed-off-by: Vasily Averin <vvs@virtuozzo.com>

> ---

> v5: fixed else condition, thanks to Eric

>     reworked update of expanded skb,

>     added corresponding comments

> v4: decided to use is_skb_wmem() after pskb_expand_head() call

>     fixed 'return (EXPRESSION);' in os_skb_wmem according to Eric Dumazet

> v3: removed __pskb_expand_head(),

>     added is_skb_wmem() helper for skb with wmem-compatible destructors

>     there are 2 ways to use it:

>      - before pskb_expand_head(), to create skb clones

>      - after successfull pskb_expand_head() to change owner on extended skb.

> v2: based on patch version from Eric Dumazet,

>     added __pskb_expand_head() function, which can be forced

>     to adjust skb->truesize and sk->sk_wmem_alloc.

> ---

>  include/net/sock.h |  1 +

>  net/core/skbuff.c  | 63 ++++++++++++++++++++++++++++++++++++++++++++++--------

>  net/core/sock.c    |  8 +++++++

>  3 files changed, 63 insertions(+), 9 deletions(-)


Still the same issues around refcount as I reported in my other email.

Did you try running the syzkaller reproducer on your setup?


Christoph

>

> diff --git a/include/net/sock.h b/include/net/sock.h

> index 95b2577..173d58c 100644

> --- a/include/net/sock.h

> +++ b/include/net/sock.h

> @@ -1695,6 +1695,7 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,

>                              gfp_t priority);

>  void __sock_wfree(struct sk_buff *skb);

>  void sock_wfree(struct sk_buff *skb);

> +bool is_skb_wmem(const struct sk_buff *skb);

>  struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,

>                              gfp_t priority);

>  void skb_orphan_partial(struct sk_buff *skb);

> diff --git a/net/core/skbuff.c b/net/core/skbuff.c

> index f931176..29bb92e7 100644

> --- a/net/core/skbuff.c

> +++ b/net/core/skbuff.c

> @@ -1804,28 +1804,73 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)

>  struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)

>  {

>         int delta = headroom - skb_headroom(skb);

> +       int osize = skb_end_offset(skb);

> +       struct sk_buff *oskb = NULL;

> +       struct sock *sk = skb->sk;

>

>         if (WARN_ONCE(delta <= 0,

>                       "%s is expecting an increase in the headroom", __func__))

>                 return skb;

>

> -       /* pskb_expand_head() might crash, if skb is shared */

> +       delta = SKB_DATA_ALIGN(delta);

> +       /* pskb_expand_head() might crash, if skb is shared.

> +        * Also we should clone skb if its destructor does

> +        * not adjust skb->truesize and sk->sk_wmem_alloc

> +        */

>         if (skb_shared(skb)) {

>                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);

>

> -               if (likely(nskb)) {

> -                       if (skb->sk)

> -                               skb_set_owner_w(nskb, skb->sk);

> -                       consume_skb(skb);

> -               } else {

> +               if (unlikely(!nskb)) {

>                         kfree_skb(skb);

> +                       return NULL;

>                 }

> +               oskb = skb;

>                 skb = nskb;

>         }

> -       if (skb &&

> -           pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {

> +       if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC)) {

>                 kfree_skb(skb);

> -               skb = NULL;

> +               kfree_skb(oskb);

> +               return NULL;

> +       }

> +       if (oskb) {

> +               if (sk)

> +                       skb_set_owner_w(skb, sk);

> +               consume_skb(oskb);

> +       } else if (sk && skb->destructor != sock_edemux) {

> +               bool ref, set_owner;

> +

> +               ref = false; set_owner = false;

> +               delta = osize - skb_end_offset(skb);

> +               /* skb_set_owner_w() calls current skb destructor.

> +                * It can decrease sk_wmem_alloc to 0 and release sk,

> +                * To prevnt it we increase sk_wmem_alloc earlier.

> +                * Another kind of destructors can release last sk_refcnt,

> +                * so it will be impossible to call sock_hold for !fullsock

> +                * Take extra sk_refcnt to prevent it.

> +                * Otherwise just increase truesize of expanded skb.

> +                */

> +               refcount_add(delta, &sk->sk_wmem_alloc);

> +               if (!is_skb_wmem(skb)) {

> +                       set_owner = true;

> +                       if (!sk_fullsock(sk) && IS_ENABLED(CONFIG_INET)) {

> +                               /* skb_set_owner_w can set sock_edemux */

> +                               ref = refcount_inc_not_zero(&sk->sk_refcnt);

> +                               if (!ref) {

> +                                       set_owner = false;

> +                                       WARN_ON(refcount_sub_and_test(delta, &sk->sk_wmem_alloc));

> +                               }

> +                       }

> +               }

> +               if (set_owner)

> +                       skb_set_owner_w(skb, sk);

> +#ifdef CONFIG_INET

> +               if (skb->destructor == sock_edemux) {

> +                       WARN_ON(refcount_sub_and_test(delta, &sk->sk_wmem_alloc));

> +                       if (ref)

> +                               WARN_ON(refcount_dec_and_test(&sk->sk_refcnt));

> +               }

> +#endif

> +               skb->truesize += delta;

>         }

>         return skb;

>  }

> diff --git a/net/core/sock.c b/net/core/sock.c

> index 950f1e7..6cbda43 100644

> --- a/net/core/sock.c

> +++ b/net/core/sock.c

> @@ -2227,6 +2227,14 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)

>  }

>  EXPORT_SYMBOL(skb_set_owner_w);

>

> +bool is_skb_wmem(const struct sk_buff *skb)

> +{

> +       return skb->destructor == sock_wfree ||

> +              skb->destructor == __sock_wfree ||

> +              (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree);

> +}

> +EXPORT_SYMBOL(is_skb_wmem);

> +

>  static bool can_skb_orphan_partial(const struct sk_buff *skb)

>  {

>  #ifdef CONFIG_TLS_DEVICE

> --

> 1.8.3.1

>
Vasily Averin Sept. 2, 2021, 4:32 p.m. UTC | #17
On 9/2/21 6:53 PM, Christoph Paasch wrote:
> On Thu, Sep 2, 2021 at 4:12 AM Vasily Averin <vvs@virtuozzo.com> wrote:

>>

>> Christoph Paasch reports [1] about incorrect skb->truesize

>> after skb_expand_head() call in ip6_xmit.

>> This may happen because of two reasons:

>> - skb_set_owner_w() for newly cloned skb is called too early,

>> before pskb_expand_head() where truesize is adjusted for (!skb-sk) case.

>> - pskb_expand_head() does not adjust truesize in (skb->sk) case.

>> In this case sk->sk_wmem_alloc should be adjusted too.

>>

>> [1] https://lkml.org/lkml/2021/8/20/1082

>>

>> Fixes: f1260ff15a71 ("skbuff: introduce skb_expand_head()")

>> Reported-by: Christoph Paasch <christoph.paasch@gmail.com>

>> Signed-off-by: Vasily Averin <vvs@virtuozzo.com>

>> ---

>> v5: fixed else condition, thanks to Eric

>>     reworked update of expanded skb,

>>     added corresponding comments

>> v4: decided to use is_skb_wmem() after pskb_expand_head() call

>>     fixed 'return (EXPRESSION);' in os_skb_wmem according to Eric Dumazet

>> v3: removed __pskb_expand_head(),

>>     added is_skb_wmem() helper for skb with wmem-compatible destructors

>>     there are 2 ways to use it:

>>      - before pskb_expand_head(), to create skb clones

>>      - after successfull pskb_expand_head() to change owner on extended skb.

>> v2: based on patch version from Eric Dumazet,

>>     added __pskb_expand_head() function, which can be forced

>>     to adjust skb->truesize and sk->sk_wmem_alloc.

>> ---

>>  include/net/sock.h |  1 +

>>  net/core/skbuff.c  | 63 ++++++++++++++++++++++++++++++++++++++++++++++--------

>>  net/core/sock.c    |  8 +++++++

>>  3 files changed, 63 insertions(+), 9 deletions(-)

> 

> Still the same issues around refcount as I reported in my other email.

> 

> Did you try running the syzkaller reproducer on your setup?


no, I do not have 

>> +       } else if (sk && skb->destructor != sock_edemux) {

>> +               bool ref, set_owner;

>> +

>> +               ref = false; set_owner = false;

>> +               delta = osize - skb_end_offset(skb);


error is here, should be instead
delta = skb_end_offset(skb) - osize;

>> +               /* skb_set_owner_w() calls current skb destructor.

>> +                * It can decrease sk_wmem_alloc to 0 and release sk,

>> +                * To prevnt it we increase sk_wmem_alloc earlier.

>> +                * Another kind of destructors can release last sk_refcnt,

>> +                * so it will be impossible to call sock_hold for !fullsock

>> +                * Take extra sk_refcnt to prevent it.

>> +                * Otherwise just increase truesize of expanded skb.

>> +                */

>> +               refcount_add(delta, &sk->sk_wmem_alloc);

>> +               if (!is_skb_wmem(skb)) {

>> +                       set_owner = true;

>> +                       if (!sk_fullsock(sk) && IS_ENABLED(CONFIG_INET)) {

>> +                               /* skb_set_owner_w can set sock_edemux */

>> +                               ref = refcount_inc_not_zero(&sk->sk_refcnt);

>> +                               if (!ref) {

>> +                                       set_owner = false;

>> +                                       WARN_ON(refcount_sub_and_test(delta, &sk->sk_wmem_alloc));

>> +                               }

>> +                       }

>> +               }

>> +               if (set_owner)

>> +                       skb_set_owner_w(skb, sk);

>> +#ifdef CONFIG_INET

>> +               if (skb->destructor == sock_edemux) {

>> +                       WARN_ON(refcount_sub_and_test(delta, &sk->sk_wmem_alloc));

>> +                       if (ref)

>> +                               WARN_ON(refcount_dec_and_test(&sk->sk_refcnt));

>> +               }

>> +#endif

>> +               skb->truesize += delta;

>>         }

>>         return skb;

>>  }

>> diff --git a/net/core/sock.c b/net/core/sock.c

>> index 950f1e7..6cbda43 100644

>> --- a/net/core/sock.c

>> +++ b/net/core/sock.c

>> @@ -2227,6 +2227,14 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)

>>  }

>>  EXPORT_SYMBOL(skb_set_owner_w);

>>

>> +bool is_skb_wmem(const struct sk_buff *skb)

>> +{

>> +       return skb->destructor == sock_wfree ||

>> +              skb->destructor == __sock_wfree ||

>> +              (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree);

>> +}

>> +EXPORT_SYMBOL(is_skb_wmem);

>> +

>>  static bool can_skb_orphan_partial(const struct sk_buff *skb)

>>  {

>>  #ifdef CONFIG_TLS_DEVICE

>> --

>> 1.8.3.1

>>
Vasily Averin Sept. 6, 2021, 6:03 p.m. UTC | #18
I've finally reproduced original issue by using reproducer from Christoph Paasch,
and was able locally validate this patch.

Thank you,
	Vasily Averin

On 9/6/21 9:01 PM, Vasily Averin wrote:
> Christoph Paasch reports [1] about incorrect skb->truesize

> after skb_expand_head() call in ip6_xmit.

> This may happen because of two reasons:

> - skb_set_owner_w() for newly cloned skb is called too early,

> before pskb_expand_head() where truesize is adjusted for (!skb-sk) case.

> - pskb_expand_head() does not adjust truesize in (skb->sk) case.

> In this case sk->sk_wmem_alloc should be adjusted too.

> 

> [1] https://lkml.org/lkml/2021/8/20/1082

> 

> Fixes: f1260ff15a71 ("skbuff: introduce skb_expand_head()")

> Fixes: 2d85a1b31dde ("ipv6: ip6_finish_output2: set sk into newly allocated nskb")

> Reported-by: Christoph Paasch <christoph.paasch@gmail.com>

> Signed-off-by: Vasily Averin <vvs@virtuozzo.com>

> ---

> v6: fixed delta,

>     improved comments

> v5: fixed else condition, thanks to Eric

>     reworked update of expanded skb,

>     added corresponding comments

> v4: decided to use is_skb_wmem() after pskb_expand_head() call

>     fixed 'return (EXPRESSION);' in os_skb_wmem according to Eric Dumazet

> v3: removed __pskb_expand_head(),

>     added is_skb_wmem() helper for skb with wmem-compatible destructors

>     there are 2 ways to use it:

>      - before pskb_expand_head(), to create skb clones

>      - after successfull pskb_expand_head() to change owner on extended skb.

> v2: based on patch version from Eric Dumazet,

>     added __pskb_expand_head() function, which can be forced

>     to adjust skb->truesize and sk->sk_wmem_alloc.

> ---

>  include/net/sock.h |  1 +

>  net/core/skbuff.c  | 60 ++++++++++++++++++++++++++++++++++++++++++++++--------

>  net/core/sock.c    |  8 ++++++++

>  3 files changed, 60 insertions(+), 9 deletions(-)

> 

> diff --git a/include/net/sock.h b/include/net/sock.h

> index 95b2577..173d58c 100644

> --- a/include/net/sock.h

> +++ b/include/net/sock.h

> @@ -1695,6 +1695,7 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,

>  			     gfp_t priority);

>  void __sock_wfree(struct sk_buff *skb);

>  void sock_wfree(struct sk_buff *skb);

> +bool is_skb_wmem(const struct sk_buff *skb);

>  struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,

>  			     gfp_t priority);

>  void skb_orphan_partial(struct sk_buff *skb);

> diff --git a/net/core/skbuff.c b/net/core/skbuff.c

> index f931176..e2a2aa31 100644

> --- a/net/core/skbuff.c

> +++ b/net/core/skbuff.c

> @@ -1804,28 +1804,70 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)

>  struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)

>  {

>  	int delta = headroom - skb_headroom(skb);

> +	int osize = skb_end_offset(skb);

> +	struct sk_buff *oskb = NULL;

> +	struct sock *sk = skb->sk;

>  

>  	if (WARN_ONCE(delta <= 0,

>  		      "%s is expecting an increase in the headroom", __func__))

>  		return skb;

>  

> -	/* pskb_expand_head() might crash, if skb is shared */

> +	delta = SKB_DATA_ALIGN(delta);

> +	/* pskb_expand_head() might crash, if skb is shared. */

>  	if (skb_shared(skb)) {

>  		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);

>  

> -		if (likely(nskb)) {

> -			if (skb->sk)

> -				skb_set_owner_w(nskb, skb->sk);

> -			consume_skb(skb);

> -		} else {

> +		if (unlikely(!nskb)) {

>  			kfree_skb(skb);

> +			return NULL;

>  		}

> +		oskb = skb;

>  		skb = nskb;

>  	}

> -	if (skb &&

> -	    pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {

> +	if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC)) {

>  		kfree_skb(skb);

> -		skb = NULL;

> +		kfree_skb(oskb);

> +		return NULL;

> +	}

> +	if (oskb) {

> +		if (sk)

> +			skb_set_owner_w(skb, sk);

> +		consume_skb(oskb);

> +	} else if (sk && skb->destructor != sock_edemux) {

> +		bool ref, set_owner;

> +

> +		ref = false; set_owner = false;

> +		delta = skb_end_offset(skb) - osize;

> +		/* skb_set_owner_w() calls current skb destructor.

> +		 * It can reduce sk_wmem_alloc to 0 and release sk,

> +		 * To prevnt this, we increase sk_wmem_alloc in advance.

> +		 * Some destructors might release the last sk_refcnt,

> +		 * so it won't be possible to call sock_hold for !fullsock

> +		 * We take an extra sk_refcnt to prevent this.

> +		 * In any case we increase truesize of expanded skb.

> +		 */

> +		refcount_add(delta, &sk->sk_wmem_alloc);

> +		if (!is_skb_wmem(skb)) {

> +			set_owner = true;

> +			if (!sk_fullsock(sk) && IS_ENABLED(CONFIG_INET)) {

> +				/* skb_set_owner_w can set sock_edemux */

> +				ref = refcount_inc_not_zero(&sk->sk_refcnt);

> +				if (!ref) {

> +					set_owner = false;

> +					WARN_ON(refcount_sub_and_test(delta, &sk->sk_wmem_alloc));

> +				}

> +			}

> +		}

> +		if (set_owner)

> +			skb_set_owner_w(skb, sk);

> +#ifdef CONFIG_INET

> +		if (skb->destructor == sock_edemux) {

> +			WARN_ON(refcount_sub_and_test(delta, &sk->sk_wmem_alloc));

> +			if (ref)

> +				WARN_ON(refcount_dec_and_test(&sk->sk_refcnt));

> +		}

> +#endif

> +		skb->truesize += delta;

>  	}

>  	return skb;

>  }

> diff --git a/net/core/sock.c b/net/core/sock.c

> index 950f1e7..6cbda43 100644

> --- a/net/core/sock.c

> +++ b/net/core/sock.c

> @@ -2227,6 +2227,14 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)

>  }

>  EXPORT_SYMBOL(skb_set_owner_w);

>  

> +bool is_skb_wmem(const struct sk_buff *skb)

> +{

> +	return skb->destructor == sock_wfree ||

> +	       skb->destructor == __sock_wfree ||

> +	       (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree);

> +}

> +EXPORT_SYMBOL(is_skb_wmem);

> +

>  static bool can_skb_orphan_partial(const struct sk_buff *skb)

>  {

>  #ifdef CONFIG_TLS_DEVICE

>
diff mbox series

Patch

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f931176..4691023 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1681,10 +1681,10 @@  struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
  *	reloaded after call to this function.
  */
 
-int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
-		     gfp_t gfp_mask)
+static int __pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
+			      gfp_t gfp_mask, bool update_truesize)
 {
-	int i, osize = skb_end_offset(skb);
+	int delta, i, osize = skb_end_offset(skb);
 	int size = osize + nhead + ntail;
 	long off;
 	u8 *data;
@@ -1756,9 +1756,13 @@  int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	 * For the moment, we really care of rx path, or
 	 * when skb is orphaned (not attached to a socket).
 	 */
-	if (!skb->sk || skb->destructor == sock_edemux)
-		skb->truesize += size - osize;
-
+	delta = size - osize;
+	if (!skb->sk || skb->destructor == sock_edemux) {
+		skb->truesize += delta;
+	} else if (update_truesize) {
+		refcount_add(delta, &skb->sk->sk_wmem_alloc);
+		skb->truesize += delta;
+	}
 	return 0;
 
 nofrags:
@@ -1766,6 +1770,12 @@  int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 nodata:
 	return -ENOMEM;
 }
+
+int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
+		     gfp_t gfp_mask)
+{
+	return __pskb_expand_head(skb, nhead, ntail, gfp_mask, false);
+}
 EXPORT_SYMBOL(pskb_expand_head);
 
 /* Make private copy of skb with writable head and some headroom */
@@ -1804,28 +1814,33 @@  struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
 struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
 {
 	int delta = headroom - skb_headroom(skb);
+	struct sk_buff *oskb = NULL;
 
 	if (WARN_ONCE(delta <= 0,
 		      "%s is expecting an increase in the headroom", __func__))
 		return skb;
 
+	delta = SKB_DATA_ALIGN(delta);
 	/* pskb_expand_head() might crash, if skb is shared */
 	if (skb_shared(skb)) {
 		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
 
-		if (likely(nskb)) {
-			if (skb->sk)
-				skb_set_owner_w(nskb, skb->sk);
-			consume_skb(skb);
-		} else {
+		if (unlikely(!nskb)) {
 			kfree_skb(skb);
+			return NULL;
 		}
+		oskb = skb;
 		skb = nskb;
 	}
-	if (skb &&
-	    pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
+	if (__pskb_expand_head(skb, delta, 0, GFP_ATOMIC, true)) {
 		kfree_skb(skb);
-		skb = NULL;
+		kfree_skb(oskb);
+		return NULL;
+	}
+	if (oskb) {
+		if (oskb->sk)
+			skb_set_owner_w(skb, oskb->sk);
+		consume_skb(oskb);
 	}
 	return skb;
 }