diff mbox series

[net-next,v3] virtio-net: page_to_skb() use build_skb when there's sufficient tailroom

Message ID 20210416091615.25198-1-xuanzhuo@linux.alibaba.com
State New
Headers show
Series [net-next,v3] virtio-net: page_to_skb() use build_skb when there's sufficient tailroom | expand

Commit Message

Xuan Zhuo April 16, 2021, 9:16 a.m. UTC
In page_to_skb(), if we have enough tailroom to save skb_shared_info, we
can use build_skb to create skb directly. No need to alloc for
additional space. And it can save a 'frags slot', which is very friendly
to GRO.

Here, if the payload of the received package is too small (less than
GOOD_COPY_LEN), we still choose to copy it directly to the space got by
napi_alloc_skb. So we can reuse these pages.

Testing Machine:
    The four queues of the network card are bound to the cpu1.

Test command:
    for ((i=0;i<5;++i)); do sockperf tp --ip 192.168.122.64 -m 1000 -t 150& done

The size of the udp package is 1000, so in the case of this patch, there
will always be enough tailroom to use build_skb. The sent udp packet
will be discarded because there is no port to receive it. The irqsoftd
of the machine is 100%, we observe the received quantity displayed by
sar -n DEV 1:

no build_skb:  956864.00 rxpck/s
build_skb:    1158465.00 rxpck/s

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Suggested-by: Jason Wang <jasowang@redhat.com>
---

v3: fix the truesize when headroom > 0

v2: conflict resolution

 drivers/net/virtio_net.c | 69 ++++++++++++++++++++++++++++------------
 1 file changed, 48 insertions(+), 21 deletions(-)

--
2.31.0

Comments

Jason Wang April 19, 2021, 3:10 a.m. UTC | #1
在 2021/4/16 下午5:16, Xuan Zhuo 写道:
> In page_to_skb(), if we have enough tailroom to save skb_shared_info, we

> can use build_skb to create skb directly. No need to alloc for

> additional space. And it can save a 'frags slot', which is very friendly

> to GRO.

>

> Here, if the payload of the received package is too small (less than

> GOOD_COPY_LEN), we still choose to copy it directly to the space got by

> napi_alloc_skb. So we can reuse these pages.

>

> Testing Machine:

>      The four queues of the network card are bound to the cpu1.

>

> Test command:

>      for ((i=0;i<5;++i)); do sockperf tp --ip 192.168.122.64 -m 1000 -t 150& done

>

> The size of the udp package is 1000, so in the case of this patch, there

> will always be enough tailroom to use build_skb. The sent udp packet

> will be discarded because there is no port to receive it. The irqsoftd

> of the machine is 100%, we observe the received quantity displayed by

> sar -n DEV 1:

>

> no build_skb:  956864.00 rxpck/s

> build_skb:    1158465.00 rxpck/s



I suggess to test the case of XDP_PASS in this case as well.


>

> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>

> Suggested-by: Jason Wang <jasowang@redhat.com>

> ---

>

> v3: fix the truesize when headroom > 0

>

> v2: conflict resolution

>

>   drivers/net/virtio_net.c | 69 ++++++++++++++++++++++++++++------------

>   1 file changed, 48 insertions(+), 21 deletions(-)

>

> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c

> index 101659cd4b87..8cd76037c724 100644

> --- a/drivers/net/virtio_net.c

> +++ b/drivers/net/virtio_net.c

> @@ -379,21 +379,17 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,

>   				   struct receive_queue *rq,

>   				   struct page *page, unsigned int offset,

>   				   unsigned int len, unsigned int truesize,

> -				   bool hdr_valid, unsigned int metasize)

> +				   bool hdr_valid, unsigned int metasize,

> +				   unsigned int headroom)

>   {

>   	struct sk_buff *skb;

>   	struct virtio_net_hdr_mrg_rxbuf *hdr;

>   	unsigned int copy, hdr_len, hdr_padded_len;

> -	char *p;

> +	int tailroom, shinfo_size;

> +	char *p, *hdr_p;

>

>   	p = page_address(page) + offset;

> -

> -	/* copy small packet so we can reuse these pages for small data */

> -	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);

> -	if (unlikely(!skb))

> -		return NULL;

> -

> -	hdr = skb_vnet_hdr(skb);

> +	hdr_p = p;

>

>   	hdr_len = vi->hdr_len;

>   	if (vi->mergeable_rx_bufs)

> @@ -401,14 +397,38 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,

>   	else

>   		hdr_padded_len = sizeof(struct padded_vnet_hdr);

>

> -	/* hdr_valid means no XDP, so we can copy the vnet header */

> -	if (hdr_valid)

> -		memcpy(hdr, p, hdr_len);

> +	/* If headroom is not 0, there is an offset between the beginning of the

> +	 * data and the allocated space, otherwise the data and the allocated

> +	 * space are aligned.

> +	 */

> +	if (headroom) {

> +		/* The actual allocated space size is PAGE_SIZE. */



I think the comment in receive_mergeable() is better:

                 /* Buffers with headroom use PAGE_SIZE as alloc size,
                  * see add_recvbuf_mergeable() + get_mergeable_buf_len()
                  */


> +		truesize = PAGE_SIZE;

> +		tailroom = truesize - len - offset;

> +	} else {

> +		tailroom = truesize - len;

> +	}

>

>   	len -= hdr_len;

>   	offset += hdr_padded_len;

>   	p += hdr_padded_len;

>

> +	shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

> +

> +	if (len > GOOD_COPY_LEN && tailroom >= shinfo_size) {



Any reason that you don't use build_skb() for len < GOOD_COPY_LEN?

Other looks good.

Thanks


> +		skb = build_skb(p, truesize);

> +		if (unlikely(!skb))

> +			return NULL;

> +

> +		skb_put(skb, len);

> +		goto ok;

> +	}

> +

> +	/* copy small packet so we can reuse these pages for small data */

> +	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);

> +	if (unlikely(!skb))

> +		return NULL;

> +

>   	/* Copy all frame if it fits skb->head, otherwise

>   	 * we let virtio_net_hdr_to_skb() and GRO pull headers as needed.

>   	 */

> @@ -418,11 +438,6 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,

>   		copy = ETH_HLEN + metasize;

>   	skb_put_data(skb, p, copy);

>

> -	if (metasize) {

> -		__skb_pull(skb, metasize);

> -		skb_metadata_set(skb, metasize);

> -	}

> -

>   	len -= copy;

>   	offset += copy;

>

> @@ -431,7 +446,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,

>   			skb_add_rx_frag(skb, 0, page, offset, len, truesize);

>   		else

>   			put_page(page);

> -		return skb;

> +		goto ok;

>   	}

>

>   	/*

> @@ -458,6 +473,18 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,

>   	if (page)

>   		give_pages(rq, page);

>

> +ok:

> +	/* hdr_valid means no XDP, so we can copy the vnet header */

> +	if (hdr_valid) {

> +		hdr = skb_vnet_hdr(skb);

> +		memcpy(hdr, hdr_p, hdr_len);

> +	}

> +

> +	if (metasize) {

> +		__skb_pull(skb, metasize);

> +		skb_metadata_set(skb, metasize);

> +	}

> +

>   	return skb;

>   }

>

> @@ -808,7 +835,7 @@ static struct sk_buff *receive_big(struct net_device *dev,

>   {

>   	struct page *page = buf;

>   	struct sk_buff *skb =

> -		page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, true, 0);

> +		page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, true, 0, 0);

>

>   	stats->bytes += len - vi->hdr_len;

>   	if (unlikely(!skb))

> @@ -922,7 +949,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,

>   				put_page(page);

>   				head_skb = page_to_skb(vi, rq, xdp_page, offset,

>   						       len, PAGE_SIZE, false,

> -						       metasize);

> +						       metasize, headroom);

>   				return head_skb;

>   			}

>   			break;

> @@ -980,7 +1007,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,

>   	}

>

>   	head_skb = page_to_skb(vi, rq, page, offset, len, truesize, !xdp_prog,

> -			       metasize);

> +			       metasize, headroom);

>   	curr_skb = head_skb;

>

>   	if (unlikely(!curr_skb))

> --

> 2.31.0

>
David Ahern April 19, 2021, 4:48 p.m. UTC | #2
On 4/16/21 2:16 AM, Xuan Zhuo wrote:
> In page_to_skb(), if we have enough tailroom to save skb_shared_info, we

> can use build_skb to create skb directly. No need to alloc for

> additional space. And it can save a 'frags slot', which is very friendly

> to GRO.

> 

> Here, if the payload of the received package is too small (less than

> GOOD_COPY_LEN), we still choose to copy it directly to the space got by

> napi_alloc_skb. So we can reuse these pages.

> 

> Testing Machine:

>     The four queues of the network card are bound to the cpu1.

> 

> Test command:

>     for ((i=0;i<5;++i)); do sockperf tp --ip 192.168.122.64 -m 1000 -t 150& done

> 

> The size of the udp package is 1000, so in the case of this patch, there

> will always be enough tailroom to use build_skb. The sent udp packet

> will be discarded because there is no port to receive it. The irqsoftd

> of the machine is 100%, we observe the received quantity displayed by

> sar -n DEV 1:

> 

> no build_skb:  956864.00 rxpck/s

> build_skb:    1158465.00 rxpck/s

> 


virtio_net is using napi_consume_skb, so napi_build_skb should show a
small increase from build_skb.
Mat Martineau April 19, 2021, 11:29 p.m. UTC | #3
On Fri, 16 Apr 2021, Xuan Zhuo wrote:

> In page_to_skb(), if we have enough tailroom to save skb_shared_info, we

> can use build_skb to create skb directly. No need to alloc for

> additional space. And it can save a 'frags slot', which is very friendly

> to GRO.

>

> Here, if the payload of the received package is too small (less than

> GOOD_COPY_LEN), we still choose to copy it directly to the space got by

> napi_alloc_skb. So we can reuse these pages.

>

> Testing Machine:

>    The four queues of the network card are bound to the cpu1.

>

> Test command:

>    for ((i=0;i<5;++i)); do sockperf tp --ip 192.168.122.64 -m 1000 -t 150& done

>

> The size of the udp package is 1000, so in the case of this patch, there

> will always be enough tailroom to use build_skb. The sent udp packet

> will be discarded because there is no port to receive it. The irqsoftd

> of the machine is 100%, we observe the received quantity displayed by

> sar -n DEV 1:

>

> no build_skb:  956864.00 rxpck/s

> build_skb:    1158465.00 rxpck/s

>

> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>

> Suggested-by: Jason Wang <jasowang@redhat.com>

> ---

>

> v3: fix the truesize when headroom > 0

>

> v2: conflict resolution

>

> drivers/net/virtio_net.c | 69 ++++++++++++++++++++++++++++------------

> 1 file changed, 48 insertions(+), 21 deletions(-)


Xuan,

I realize this has been merged to net-next already, but I'm getting a 
use-after-free with KASAN in page_to_skb() with this patch. Reverting this 
change fixes the UAF. I've included the KASAN dump below, and a couple of 
comments inline.


>

> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c

> index 101659cd4b87..8cd76037c724 100644

> --- a/drivers/net/virtio_net.c

> +++ b/drivers/net/virtio_net.c

> @@ -379,21 +379,17 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,

> 				   struct receive_queue *rq,

> 				   struct page *page, unsigned int offset,

> 				   unsigned int len, unsigned int truesize,

> -				   bool hdr_valid, unsigned int metasize)

> +				   bool hdr_valid, unsigned int metasize,

> +				   unsigned int headroom)

> {

> 	struct sk_buff *skb;

> 	struct virtio_net_hdr_mrg_rxbuf *hdr;

> 	unsigned int copy, hdr_len, hdr_padded_len;

> -	char *p;

> +	int tailroom, shinfo_size;

> +	char *p, *hdr_p;

>

> 	p = page_address(page) + offset;

> -

> -	/* copy small packet so we can reuse these pages for small data */

> -	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);

> -	if (unlikely(!skb))

> -		return NULL;

> -

> -	hdr = skb_vnet_hdr(skb);

> +	hdr_p = p;


hdr_p is assigned here, pointer to an address in the provided page...

>

> 	hdr_len = vi->hdr_len;

> 	if (vi->mergeable_rx_bufs)


(snip)

> @@ -431,7 +446,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,

> 			skb_add_rx_frag(skb, 0, page, offset, len, truesize);

> 		else

> 			put_page(page);


page is potentially released here...

> -		return skb;

> +		goto ok;

> 	}

>

> 	/*

> @@ -458,6 +473,18 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,

> 	if (page)

> 		give_pages(rq, page);

>

> +ok:

> +	/* hdr_valid means no XDP, so we can copy the vnet header */

> +	if (hdr_valid) {

> +		hdr = skb_vnet_hdr(skb);

> +		memcpy(hdr, hdr_p, hdr_len);


and hdr_p is dereferenced here.

I'm seeing this KASAN UAF at boot time in a kvm VM (Fedora 33 host and 
guest, if that helps):

[   61.202483] ==================================================================
[   61.204005] BUG: KASAN: use-after-free in page_to_skb+0x32a/0x4b0
[   61.205387] Read of size 12 at addr ffff888105bdf800 by task NetworkManager/579
[   61.207035] 
[   61.207408] CPU: 0 PID: 579 Comm: NetworkManager Not tainted 5.12.0-rc7+ #2
[   61.208715] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-1.fc33 04/01/2014
[   61.210257] Call Trace:
[   61.210730]  <IRQ>
[   61.211209]  dump_stack+0x93/0xc2
[   61.211996]  print_address_description.constprop.0+0x18/0x130
[   61.213310]  ? page_to_skb+0x32a/0x4b0
[   61.214318]  ? page_to_skb+0x32a/0x4b0
[   61.215085]  kasan_report.cold+0x7f/0x111
[   61.215966]  ? trace_hardirqs_off+0x10/0xe0
[   61.216823]  ? page_to_skb+0x32a/0x4b0
[   61.217809]  kasan_check_range+0xf9/0x1e0
[   61.217834]  memcpy+0x20/0x60
[   61.217848]  page_to_skb+0x32a/0x4b0
[   61.217888]  receive_buf+0x1434/0x2690
[   61.217926]  ? page_to_skb+0x4b0/0x4b0
[   61.217947]  ? find_held_lock+0x85/0xa0
[   61.217964]  ? lock_release+0x1d0/0x400
[   61.217974]  ? virtnet_poll+0x1d8/0x6b0
[   61.217983]  ? detach_buf_split+0x254/0x290
[   61.218008]  ? virtqueue_get_buf_ctx_split+0x145/0x1f0
[   61.218032]  virtnet_poll+0x2a8/0x6b0
[   61.218057]  ? receive_buf+0x2690/0x2690
[   61.218067]  ? lock_release+0x400/0x400
[   61.218119]  __napi_poll+0x57/0x2f0
[   61.229624]  net_rx_action+0x4dd/0x590
[   61.230453]  ? napi_threaded_poll+0x2b0/0x2b0
[   61.231379]  ? rcu_implicit_dynticks_qs+0x430/0x430
[   61.232429]  ? lock_is_held_type+0x98/0x110
[   61.233342]  __do_softirq+0xfd/0x59d
[   61.234131]  do_softirq+0x8a/0xb0
[   61.234896]  </IRQ>
[   61.235397]  ? virtnet_open+0x10a/0x2e0
[   61.236273]  __local_bh_enable_ip+0xb1/0xc0
[   61.237199]  virtnet_open+0x11b/0x2e0
[   61.237981]  __dev_open+0x1b7/0x2c0
[   61.238689]  ? dev_set_rx_mode+0x60/0x60
[   61.239499]  ? lockdep_hardirqs_on_prepare+0x12e/0x1f0
[   61.240523]  ? __local_bh_enable_ip+0x7b/0xc0
[   61.241399]  ? trace_hardirqs_on+0x1c/0x100
[   61.242248]  __dev_change_flags+0x2e6/0x370
[   61.243098]  ? dev_set_allmulti+0x10/0x10
[   61.243908]  ? lock_chain_count+0x20/0x20
[   61.244759]  dev_change_flags+0x55/0xb0
[   61.245595]  do_setlink+0xb52/0x1950
[   61.246385]  ? rtnl_getlink+0x560/0x560
[   61.247218]  ? mark_lock+0x101/0x19c0
[   61.248003]  ? lock_chain_count+0x20/0x20
[   61.248864]  ? lock_chain_count+0x20/0x20
[   61.249728]  ? lockdep_hardirqs_on_prepare+0x1f0/0x1f0
[   61.250821]  ? memset+0x20/0x40
[   61.251474]  ? __nla_validate_parse+0xac/0x12f0
[   61.252433]  ? nla_get_range_signed+0x1c0/0x1c0
[   61.253409]  ? __lock_acquire+0x85f/0x3090
[   61.254291]  __rtnl_newlink+0x85f/0xca0
[   61.255131]  ? rtnl_setlink+0x220/0x220
[   61.255988]  ? lock_is_held_type+0x98/0x110
[   61.256911]  ? find_held_lock+0x85/0xa0
[   61.257782]  ? __is_insn_slot_addr+0xa5/0x130
[   61.257794]  ? lock_downgrade+0x390/0x390
[   61.257802]  ? stack_access_ok+0x35/0x90
[   61.257818]  ? entry_SYSCALL_64_after_hwframe+0x44/0xae
[   61.257840]  ? __is_insn_slot_addr+0xc4/0x130
[   61.257859]  ? kernel_text_address+0xc8/0xf0
[   61.257876]  ? __kernel_text_address+0x9/0x30
[   61.257885]  ? unwind_get_return_address+0x2a/0x40
[   61.257893]  ? create_prof_cpu_mask+0x20/0x20
[   61.257903]  ? arch_stack_walk+0x99/0xf0
[   61.258007]  ? lock_release+0x1d0/0x400
[   61.258016]  ? fs_reclaim_release+0x56/0x90
[   61.258027]  ? lock_downgrade+0x390/0x390
[   61.258036]  ? find_held_lock+0x80/0xa0
[   61.258049]  ? lock_release+0x1d0/0x400
[   61.258059]  ? lock_is_held_type+0x98/0x110
[   61.258087]  rtnl_newlink+0x4b/0x70
[   61.258099]  rtnetlink_rcv_msg+0x22c/0x5e0
[   61.258116]  ? rtnetlink_put_metrics+0x2c0/0x2c0
[   61.258131]  ? lock_acquire+0x157/0x4d0
[   61.258140]  ? netlink_deliver_tap+0xa6/0x570
[   61.258154]  ? lock_release+0x400/0x400
[   61.258172]  netlink_rcv_skb+0xc4/0x1f0
[   61.258180]  ? rtnetlink_put_metrics+0x2c0/0x2c0
[   61.258193]  ? netlink_ack+0x4f0/0x4f0
[   61.258199]  ? netlink_deliver_tap+0x129/0x570
[   61.258234]  netlink_unicast+0x2d3/0x410
[   61.258248]  ? netlink_attachskb+0x400/0x400
[   61.258257]  ? _copy_from_iter_full+0xd8/0x360
[   61.258280]  netlink_sendmsg+0x394/0x670
[   61.258299]  ? netlink_unicast+0x410/0x410
[   61.258305]  ? iovec_from_user+0xa1/0x1d0
[   61.258327]  ? netlink_unicast+0x410/0x410
[   61.258340]  sock_sendmsg+0x91/0xa0
[   61.258353]  ____sys_sendmsg+0x3b7/0x400
[   61.258366]  ? kernel_sendmsg+0x30/0x30
[   61.258376]  ? __ia32_sys_recvmmsg+0x150/0x150
[   61.258390]  ? lockdep_hardirqs_on_prepare+0x1f0/0x1f0
[   61.258398]  ? stack_trace_save+0x8c/0xc0
[   61.258408]  ? stack_trace_consume_entry+0x80/0x80
[   61.258416]  ? __fput+0x1a9/0x3d0
[   61.258435]  ___sys_sendmsg+0xd3/0x130
[   61.258446]  ? sendmsg_copy_msghdr+0x110/0x110
[   61.258458]  ? find_held_lock+0x85/0xa0
[   61.258471]  ? lock_release+0x1d0/0x400
[   61.258479]  ? __fget_files+0x133/0x210
[   61.258490]  ? lock_downgrade+0x390/0x390
[   61.258508]  ? lockdep_hardirqs_on_prepare+0x1f0/0x1f0
[   61.258529]  ? __fget_files+0x152/0x210
[   61.258547]  ? __fget_light+0x66/0xf0
[   61.258568]  __sys_sendmsg+0xae/0x120
[   61.258578]  ? __sys_sendmsg_sock+0x10/0x10
[   61.258589]  ? lockdep_hardirqs_on_prepare+0x12e/0x1f0
[   61.258598]  ? call_rcu+0x414/0x670
[   61.258616]  ? mark_held_locks+0x25/0x90
[   61.258630]  ? lockdep_hardirqs_on_prepare+0x12e/0x1f0
[   61.258639]  ? syscall_enter_from_user_mode+0x1d/0x50
[   61.258647]  ? trace_hardirqs_on+0x1c/0x100
[   61.258662]  do_syscall_64+0x33/0x40
[   61.258670]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[   61.258679] RIP: 0033:0x7fb33db83ecd
[   61.258686] Code: 28 89 54 24 1c 48 89 74 24 10 89 7c 24 08 e8 4a ee ff ff 8b 54 24 1c 48 8b 74 24 10 41 89 c0 8b 7c 24 08 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 33 44 89 c7 48 89 44 24 08 e8 9e ee ff ff 48
[   61.258692] RSP: 002b:00007ffc85e90e30 EFLAGS: 00000293 ORIG_RAX: 000000000000002e
[   61.258702] RAX: ffffffffffffffda RBX: 000055f87a7aa030 RCX: 00007fb33db83ecd
[   61.258708] RDX: 0000000000000000 RSI: 00007ffc85e90e70 RDI: 000000000000000c
[   61.258713] RBP: 000000000000000c R08: 0000000000000000 R09: 0000000000000000
[   61.258717] R10: 0000000000000000 R11: 0000000000000293 R12: 0000000000000000
[   61.258722] R13: 00007ffc85e90fd0 R14: 00007ffc85e90fcc R15: 0000000000000000


--
Mat Martineau
Intel
Jason Wang April 20, 2021, 2:38 a.m. UTC | #4
在 2021/4/20 上午7:29, Mat Martineau 写道:
>

> On Fri, 16 Apr 2021, Xuan Zhuo wrote:

>

>> In page_to_skb(), if we have enough tailroom to save skb_shared_info, we

>> can use build_skb to create skb directly. No need to alloc for

>> additional space. And it can save a 'frags slot', which is very friendly

>> to GRO.

>>

>> Here, if the payload of the received package is too small (less than

>> GOOD_COPY_LEN), we still choose to copy it directly to the space got by

>> napi_alloc_skb. So we can reuse these pages.

>>

>> Testing Machine:

>>    The four queues of the network card are bound to the cpu1.

>>

>> Test command:

>>    for ((i=0;i<5;++i)); do sockperf tp --ip 192.168.122.64 -m 1000 -t 

>> 150& done

>>

>> The size of the udp package is 1000, so in the case of this patch, there

>> will always be enough tailroom to use build_skb. The sent udp packet

>> will be discarded because there is no port to receive it. The irqsoftd

>> of the machine is 100%, we observe the received quantity displayed by

>> sar -n DEV 1:

>>

>> no build_skb:  956864.00 rxpck/s

>> build_skb:    1158465.00 rxpck/s

>>

>> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>

>> Suggested-by: Jason Wang <jasowang@redhat.com>

>> ---

>>

>> v3: fix the truesize when headroom > 0

>>

>> v2: conflict resolution

>>

>> drivers/net/virtio_net.c | 69 ++++++++++++++++++++++++++++------------

>> 1 file changed, 48 insertions(+), 21 deletions(-)

>

> Xuan,

>

> I realize this has been merged to net-next already, but I'm getting a 

> use-after-free with KASAN in page_to_skb() with this patch. Reverting 

> this change fixes the UAF. I've included the KASAN dump below, and a 

> couple of comments inline.

>

>

>>

>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c

>> index 101659cd4b87..8cd76037c724 100644

>> --- a/drivers/net/virtio_net.c

>> +++ b/drivers/net/virtio_net.c

>> @@ -379,21 +379,17 @@ static struct sk_buff *page_to_skb(struct 

>> virtnet_info *vi,

>>                    struct receive_queue *rq,

>>                    struct page *page, unsigned int offset,

>>                    unsigned int len, unsigned int truesize,

>> -                   bool hdr_valid, unsigned int metasize)

>> +                   bool hdr_valid, unsigned int metasize,

>> +                   unsigned int headroom)

>> {

>>     struct sk_buff *skb;

>>     struct virtio_net_hdr_mrg_rxbuf *hdr;

>>     unsigned int copy, hdr_len, hdr_padded_len;

>> -    char *p;

>> +    int tailroom, shinfo_size;

>> +    char *p, *hdr_p;

>>

>>     p = page_address(page) + offset;

>> -

>> -    /* copy small packet so we can reuse these pages for small data */

>> -    skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);

>> -    if (unlikely(!skb))

>> -        return NULL;

>> -

>> -    hdr = skb_vnet_hdr(skb);

>> +    hdr_p = p;

>

> hdr_p is assigned here, pointer to an address in the provided page...

>

>>

>>     hdr_len = vi->hdr_len;

>>     if (vi->mergeable_rx_bufs)

>

> (snip)

>

>> @@ -431,7 +446,7 @@ static struct sk_buff *page_to_skb(struct 

>> virtnet_info *vi,

>>             skb_add_rx_frag(skb, 0, page, offset, len, truesize);

>>         else

>>             put_page(page);

>

> page is potentially released here...

>

>> -        return skb;

>> +        goto ok;

>>     }

>>

>>     /*

>> @@ -458,6 +473,18 @@ static struct sk_buff *page_to_skb(struct 

>> virtnet_info *vi,

>>     if (page)

>>         give_pages(rq, page);

>>

>> +ok:

>> +    /* hdr_valid means no XDP, so we can copy the vnet header */

>> +    if (hdr_valid) {

>> +        hdr = skb_vnet_hdr(skb);

>> +        memcpy(hdr, hdr_p, hdr_len);

>

> and hdr_p is dereferenced here.



Right, I tend to recover the way to copy hdr and set meta just after 
alloc/build_skb().

Thanks


>

> I'm seeing this KASAN UAF at boot time in a kvm VM (Fedora 33 host and 

> guest, if that helps):

>

> [   61.202483] 

> ==================================================================

> [   61.204005] BUG: KASAN: use-after-free in page_to_skb+0x32a/0x4b0

> [   61.205387] Read of size 12 at addr ffff888105bdf800 by task 

> NetworkManager/579

> [   61.207035] [   61.207408] CPU: 0 PID: 579 Comm: NetworkManager Not 

> tainted 5.12.0-rc7+ #2

> [   61.208715] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), 

> BIOS 1.14.0-1.fc33 04/01/2014

> [   61.210257] Call Trace:

> [   61.210730]  <IRQ>

> [   61.211209]  dump_stack+0x93/0xc2

> [   61.211996]  print_address_description.constprop.0+0x18/0x130

> [   61.213310]  ? page_to_skb+0x32a/0x4b0

> [   61.214318]  ? page_to_skb+0x32a/0x4b0

> [   61.215085]  kasan_report.cold+0x7f/0x111

> [   61.215966]  ? trace_hardirqs_off+0x10/0xe0

> [   61.216823]  ? page_to_skb+0x32a/0x4b0

> [   61.217809]  kasan_check_range+0xf9/0x1e0

> [   61.217834]  memcpy+0x20/0x60

> [   61.217848]  page_to_skb+0x32a/0x4b0

> [   61.217888]  receive_buf+0x1434/0x2690

> [   61.217926]  ? page_to_skb+0x4b0/0x4b0

> [   61.217947]  ? find_held_lock+0x85/0xa0

> [   61.217964]  ? lock_release+0x1d0/0x400

> [   61.217974]  ? virtnet_poll+0x1d8/0x6b0

> [   61.217983]  ? detach_buf_split+0x254/0x290

> [   61.218008]  ? virtqueue_get_buf_ctx_split+0x145/0x1f0

> [   61.218032]  virtnet_poll+0x2a8/0x6b0

> [   61.218057]  ? receive_buf+0x2690/0x2690

> [   61.218067]  ? lock_release+0x400/0x400

> [   61.218119]  __napi_poll+0x57/0x2f0

> [   61.229624]  net_rx_action+0x4dd/0x590

> [   61.230453]  ? napi_threaded_poll+0x2b0/0x2b0

> [   61.231379]  ? rcu_implicit_dynticks_qs+0x430/0x430

> [   61.232429]  ? lock_is_held_type+0x98/0x110

> [   61.233342]  __do_softirq+0xfd/0x59d

> [   61.234131]  do_softirq+0x8a/0xb0

> [   61.234896]  </IRQ>

> [   61.235397]  ? virtnet_open+0x10a/0x2e0

> [   61.236273]  __local_bh_enable_ip+0xb1/0xc0

> [   61.237199]  virtnet_open+0x11b/0x2e0

> [   61.237981]  __dev_open+0x1b7/0x2c0

> [   61.238689]  ? dev_set_rx_mode+0x60/0x60

> [   61.239499]  ? lockdep_hardirqs_on_prepare+0x12e/0x1f0

> [   61.240523]  ? __local_bh_enable_ip+0x7b/0xc0

> [   61.241399]  ? trace_hardirqs_on+0x1c/0x100

> [   61.242248]  __dev_change_flags+0x2e6/0x370

> [   61.243098]  ? dev_set_allmulti+0x10/0x10

> [   61.243908]  ? lock_chain_count+0x20/0x20

> [   61.244759]  dev_change_flags+0x55/0xb0

> [   61.245595]  do_setlink+0xb52/0x1950

> [   61.246385]  ? rtnl_getlink+0x560/0x560

> [   61.247218]  ? mark_lock+0x101/0x19c0

> [   61.248003]  ? lock_chain_count+0x20/0x20

> [   61.248864]  ? lock_chain_count+0x20/0x20

> [   61.249728]  ? lockdep_hardirqs_on_prepare+0x1f0/0x1f0

> [   61.250821]  ? memset+0x20/0x40

> [   61.251474]  ? __nla_validate_parse+0xac/0x12f0

> [   61.252433]  ? nla_get_range_signed+0x1c0/0x1c0

> [   61.253409]  ? __lock_acquire+0x85f/0x3090

> [   61.254291]  __rtnl_newlink+0x85f/0xca0

> [   61.255131]  ? rtnl_setlink+0x220/0x220

> [   61.255988]  ? lock_is_held_type+0x98/0x110

> [   61.256911]  ? find_held_lock+0x85/0xa0

> [   61.257782]  ? __is_insn_slot_addr+0xa5/0x130

> [   61.257794]  ? lock_downgrade+0x390/0x390

> [   61.257802]  ? stack_access_ok+0x35/0x90

> [   61.257818]  ? entry_SYSCALL_64_after_hwframe+0x44/0xae

> [   61.257840]  ? __is_insn_slot_addr+0xc4/0x130

> [   61.257859]  ? kernel_text_address+0xc8/0xf0

> [   61.257876]  ? __kernel_text_address+0x9/0x30

> [   61.257885]  ? unwind_get_return_address+0x2a/0x40

> [   61.257893]  ? create_prof_cpu_mask+0x20/0x20

> [   61.257903]  ? arch_stack_walk+0x99/0xf0

> [   61.258007]  ? lock_release+0x1d0/0x400

> [   61.258016]  ? fs_reclaim_release+0x56/0x90

> [   61.258027]  ? lock_downgrade+0x390/0x390

> [   61.258036]  ? find_held_lock+0x80/0xa0

> [   61.258049]  ? lock_release+0x1d0/0x400

> [   61.258059]  ? lock_is_held_type+0x98/0x110

> [   61.258087]  rtnl_newlink+0x4b/0x70

> [   61.258099]  rtnetlink_rcv_msg+0x22c/0x5e0

> [   61.258116]  ? rtnetlink_put_metrics+0x2c0/0x2c0

> [   61.258131]  ? lock_acquire+0x157/0x4d0

> [   61.258140]  ? netlink_deliver_tap+0xa6/0x570

> [   61.258154]  ? lock_release+0x400/0x400

> [   61.258172]  netlink_rcv_skb+0xc4/0x1f0

> [   61.258180]  ? rtnetlink_put_metrics+0x2c0/0x2c0

> [   61.258193]  ? netlink_ack+0x4f0/0x4f0

> [   61.258199]  ? netlink_deliver_tap+0x129/0x570

> [   61.258234]  netlink_unicast+0x2d3/0x410

> [   61.258248]  ? netlink_attachskb+0x400/0x400

> [   61.258257]  ? _copy_from_iter_full+0xd8/0x360

> [   61.258280]  netlink_sendmsg+0x394/0x670

> [   61.258299]  ? netlink_unicast+0x410/0x410

> [   61.258305]  ? iovec_from_user+0xa1/0x1d0

> [   61.258327]  ? netlink_unicast+0x410/0x410

> [   61.258340]  sock_sendmsg+0x91/0xa0

> [   61.258353]  ____sys_sendmsg+0x3b7/0x400

> [   61.258366]  ? kernel_sendmsg+0x30/0x30

> [   61.258376]  ? __ia32_sys_recvmmsg+0x150/0x150

> [   61.258390]  ? lockdep_hardirqs_on_prepare+0x1f0/0x1f0

> [   61.258398]  ? stack_trace_save+0x8c/0xc0

> [   61.258408]  ? stack_trace_consume_entry+0x80/0x80

> [   61.258416]  ? __fput+0x1a9/0x3d0

> [   61.258435]  ___sys_sendmsg+0xd3/0x130

> [   61.258446]  ? sendmsg_copy_msghdr+0x110/0x110

> [   61.258458]  ? find_held_lock+0x85/0xa0

> [   61.258471]  ? lock_release+0x1d0/0x400

> [   61.258479]  ? __fget_files+0x133/0x210

> [   61.258490]  ? lock_downgrade+0x390/0x390

> [   61.258508]  ? lockdep_hardirqs_on_prepare+0x1f0/0x1f0

> [   61.258529]  ? __fget_files+0x152/0x210

> [   61.258547]  ? __fget_light+0x66/0xf0

> [   61.258568]  __sys_sendmsg+0xae/0x120

> [   61.258578]  ? __sys_sendmsg_sock+0x10/0x10

> [   61.258589]  ? lockdep_hardirqs_on_prepare+0x12e/0x1f0

> [   61.258598]  ? call_rcu+0x414/0x670

> [   61.258616]  ? mark_held_locks+0x25/0x90

> [   61.258630]  ? lockdep_hardirqs_on_prepare+0x12e/0x1f0

> [   61.258639]  ? syscall_enter_from_user_mode+0x1d/0x50

> [   61.258647]  ? trace_hardirqs_on+0x1c/0x100

> [   61.258662]  do_syscall_64+0x33/0x40

> [   61.258670]  entry_SYSCALL_64_after_hwframe+0x44/0xae

> [   61.258679] RIP: 0033:0x7fb33db83ecd

> [   61.258686] Code: 28 89 54 24 1c 48 89 74 24 10 89 7c 24 08 e8 4a 

> ee ff ff 8b 54 24 1c 48 8b 74 24 10 41 89 c0 8b 7c 24 08 b8 2e 00 00 

> 00 0f 05 <48> 3d 00 f0 ff ff 77 33 44 89 c7 48 89 44 24 08 e8 9e ee ff 

> ff 48

> [   61.258692] RSP: 002b:00007ffc85e90e30 EFLAGS: 00000293 ORIG_RAX: 

> 000000000000002e

> [   61.258702] RAX: ffffffffffffffda RBX: 000055f87a7aa030 RCX: 

> 00007fb33db83ecd

> [   61.258708] RDX: 0000000000000000 RSI: 00007ffc85e90e70 RDI: 

> 000000000000000c

> [   61.258713] RBP: 000000000000000c R08: 0000000000000000 R09: 

> 0000000000000000

> [   61.258717] R10: 0000000000000000 R11: 0000000000000293 R12: 

> 0000000000000000

> [   61.258722] R13: 00007ffc85e90fd0 R14: 00007ffc85e90fcc R15: 

> 0000000000000000

>

>

> -- 

> Mat Martineau

> Intel

>
Jason Wang April 20, 2021, 2:41 a.m. UTC | #5
在 2021/4/20 上午10:38, Jason Wang 写道:
>>> :

>>> +    /* hdr_valid means no XDP, so we can copy the vnet header */

>>> +    if (hdr_valid) {

>>> +        hdr = skb_vnet_hdr(skb);

>>> +        memcpy(hdr, hdr_p, hdr_len);

>>

>> and hdr_p is dereferenced here.

>

>

> Right, I tend to recover the way to copy hdr and set meta just after 

> alloc/build_skb().

>

> Thanks 



Btw, since the patch modifies a critical path of virtio-net I suggest to 
test the following cases:

1) netperf TCP stream
2) netperf UDP with packet size from 64 to PAGE_SIZE
3) XDP_PASS with 1)
4) XDP_PASS with 2)
5) XDP metadata with 1)
6) XDP metadata with 2)

Thanks
Jason Wang April 20, 2021, 2:49 a.m. UTC | #6
在 2021/4/20 上午12:48, David Ahern 写道:
> On 4/16/21 2:16 AM, Xuan Zhuo wrote:

>> In page_to_skb(), if we have enough tailroom to save skb_shared_info, we

>> can use build_skb to create skb directly. No need to alloc for

>> additional space. And it can save a 'frags slot', which is very friendly

>> to GRO.

>>

>> Here, if the payload of the received package is too small (less than

>> GOOD_COPY_LEN), we still choose to copy it directly to the space got by

>> napi_alloc_skb. So we can reuse these pages.

>>

>> Testing Machine:

>>      The four queues of the network card are bound to the cpu1.

>>

>> Test command:

>>      for ((i=0;i<5;++i)); do sockperf tp --ip 192.168.122.64 -m 1000 -t 150& done

>>

>> The size of the udp package is 1000, so in the case of this patch, there

>> will always be enough tailroom to use build_skb. The sent udp packet

>> will be discarded because there is no port to receive it. The irqsoftd

>> of the machine is 100%, we observe the received quantity displayed by

>> sar -n DEV 1:

>>

>> no build_skb:  956864.00 rxpck/s

>> build_skb:    1158465.00 rxpck/s

>>

> virtio_net is using napi_consume_skb, so napi_build_skb should show a

> small increase from build_skb.

>


Yes and we probably need to do this in receive_small().

Thanks
Eric Dumazet April 20, 2021, 9:28 a.m. UTC | #7
On 4/16/21 11:16 AM, Xuan Zhuo wrote:
> In page_to_skb(), if we have enough tailroom to save skb_shared_info, we

> can use build_skb to create skb directly. No need to alloc for

> additional space. And it can save a 'frags slot', which is very friendly

> to GRO.

> 

> Here, if the payload of the received package is too small (less than

> GOOD_COPY_LEN), we still choose to copy it directly to the space got by

> napi_alloc_skb. So we can reuse these pages.

> 

> Testing Machine:

>     The four queues of the network card are bound to the cpu1.

> 

> Test command:

>     for ((i=0;i<5;++i)); do sockperf tp --ip 192.168.122.64 -m 1000 -t 150& done

> 

> The size of the udp package is 1000, so in the case of this patch, there

> will always be enough tailroom to use build_skb. The sent udp packet

> will be discarded because there is no port to receive it. The irqsoftd

> of the machine is 100%, we observe the received quantity displayed by

> sar -n DEV 1:

> 

> no build_skb:  956864.00 rxpck/s

> build_skb:    1158465.00 rxpck/s

> 

> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>

> Suggested-by: Jason Wang <jasowang@redhat.com>

> ---

> 

> v3: fix the truesize when headroom > 0

> 

> v2: conflict resolution

> 

>  drivers/net/virtio_net.c | 69 ++++++++++++++++++++++++++++------------

>  1 file changed, 48 insertions(+), 21 deletions(-)

> 

> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c

> index 101659cd4b87..8cd76037c724 100644

> --- a/drivers/net/virtio_net.c

> +++ b/drivers/net/virtio_net.c

> @@ -379,21 +379,17 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,

>  				   struct receive_queue *rq,

>  				   struct page *page, unsigned int offset,

>  				   unsigned int len, unsigned int truesize,

> -				   bool hdr_valid, unsigned int metasize)

> +				   bool hdr_valid, unsigned int metasize,

> +				   unsigned int headroom)

>  {

>  	struct sk_buff *skb;

>  	struct virtio_net_hdr_mrg_rxbuf *hdr;

>  	unsigned int copy, hdr_len, hdr_padded_len;

> -	char *p;

> +	int tailroom, shinfo_size;

> +	char *p, *hdr_p;

> 

>  	p = page_address(page) + offset;

> -

> -	/* copy small packet so we can reuse these pages for small data */

> -	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);

> -	if (unlikely(!skb))

> -		return NULL;

> -

> -	hdr = skb_vnet_hdr(skb);

> +	hdr_p = p;

> 

>  	hdr_len = vi->hdr_len;

>  	if (vi->mergeable_rx_bufs)

> @@ -401,14 +397,38 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,

>  	else

>  		hdr_padded_len = sizeof(struct padded_vnet_hdr);

> 

> -	/* hdr_valid means no XDP, so we can copy the vnet header */

> -	if (hdr_valid)

> -		memcpy(hdr, p, hdr_len);

> +	/* If headroom is not 0, there is an offset between the beginning of the

> +	 * data and the allocated space, otherwise the data and the allocated

> +	 * space are aligned.

> +	 */

> +	if (headroom) {

> +		/* The actual allocated space size is PAGE_SIZE. */

> +		truesize = PAGE_SIZE;

> +		tailroom = truesize - len - offset;

> +	} else {

> +		tailroom = truesize - len;

> +	}

> 

>  	len -= hdr_len;

>  	offset += hdr_padded_len;

>  	p += hdr_padded_len;

> 

> +	shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

> +

> +	if (len > GOOD_COPY_LEN && tailroom >= shinfo_size) {

> +		skb = build_skb(p, truesize);

> +		if (unlikely(!skb))

> +			return NULL;

> +

> +		skb_put(skb, len);

> +		goto ok;

> +	}

> +

> +	/* copy small packet so we can reuse these pages for small data */

> +	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);

> +	if (unlikely(!skb))

> +		return NULL;

> +

>  	/* Copy all frame if it fits skb->head, otherwise

>  	 * we let virtio_net_hdr_to_skb() and GRO pull headers as needed.

>  	 */

> @@ -418,11 +438,6 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,

>  		copy = ETH_HLEN + metasize;

>  	skb_put_data(skb, p, copy);

> 

> -	if (metasize) {

> -		__skb_pull(skb, metasize);

> -		skb_metadata_set(skb, metasize);

> -	}

> -

>  	len -= copy;

>  	offset += copy;

> 

> @@ -431,7 +446,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,

>  			skb_add_rx_frag(skb, 0, page, offset, len, truesize);

>  		else

>  			put_page(page);


Here the struct page has been freed..

> -		return skb;

> +		goto ok;

>  	}

> 

>  	/*

> @@ -458,6 +473,18 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,

>  	if (page)

>  		give_pages(rq, page);

> 

> +ok:

> +	/* hdr_valid means no XDP, so we can copy the vnet header */

> +	if (hdr_valid) {

> +		hdr = skb_vnet_hdr(skb);

> +		memcpy(hdr, hdr_p, hdr_len);


But here you are reading its content.

This is a use-after-free.

> +	}

> +


I will test something like :


diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 8cd76037c72481200ea3e8429e9fdfec005dad85..2e28c04aa6351d2b4016f7d277ce104c4970069d 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -385,6 +385,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
        struct sk_buff *skb;
        struct virtio_net_hdr_mrg_rxbuf *hdr;
        unsigned int copy, hdr_len, hdr_padded_len;
+       struct page *page_to_free = NULL;
        int tailroom, shinfo_size;
        char *p, *hdr_p;
 
@@ -445,7 +446,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
                if (len)
                        skb_add_rx_frag(skb, 0, page, offset, len, truesize);
                else
-                       put_page(page);
+                       page_to_free = page;
                goto ok;
        }
 
@@ -479,6 +480,8 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
                hdr = skb_vnet_hdr(skb);
                memcpy(hdr, hdr_p, hdr_len);
        }
+       if (page_to_free)
+               put_page(page_to_free);
 
        if (metasize) {
                __skb_pull(skb, metasize);
Ido Schimmel April 22, 2021, 11:13 a.m. UTC | #8
On Fri, Apr 16, 2021 at 05:16:15PM +0800, Xuan Zhuo wrote:
> In page_to_skb(), if we have enough tailroom to save skb_shared_info, we

> can use build_skb to create skb directly. No need to alloc for

> additional space. And it can save a 'frags slot', which is very friendly

> to GRO.

> 

> Here, if the payload of the received package is too small (less than

> GOOD_COPY_LEN), we still choose to copy it directly to the space got by

> napi_alloc_skb. So we can reuse these pages.

> 

> Testing Machine:

>     The four queues of the network card are bound to the cpu1.

> 

> Test command:

>     for ((i=0;i<5;++i)); do sockperf tp --ip 192.168.122.64 -m 1000 -t 150& done

> 

> The size of the udp package is 1000, so in the case of this patch, there

> will always be enough tailroom to use build_skb. The sent udp packet

> will be discarded because there is no port to receive it. The irqsoftd

> of the machine is 100%, we observe the received quantity displayed by

> sar -n DEV 1:

> 

> no build_skb:  956864.00 rxpck/s

> build_skb:    1158465.00 rxpck/s

> 

> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>

> Suggested-by: Jason Wang <jasowang@redhat.com>


We have VMs that use virtio_net for their management interface. After
this patch was applied we started seeing crashes when these VMs access
an NFS file system. I thought Eric's patches will fix it, but problem
persists even with his two patches:

af39c8f72301 virtio-net: fix use-after-free in page_to_skb()
f5d7872a8b8a virtio-net: restrict build_skb() use to some arches

Reverting all three patches makes the problem go away. A KASAN enabled
kernel emits the following (decoded) stack trace.

[1]
BUG: KASAN: use-after-free in skb_gro_receive (net/core/skbuff.c:4260)
Write of size 16 at addr ffff88811619fffc by task kworker/u9:0/534
CPU: 2 PID: 534 Comm: kworker/u9:0 Not tainted 5.12.0-rc7-custom-16372-gb150be05b806 #3382
Hardware name: QEMU MSN2700, BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Workqueue: xprtiod xs_stream_data_receive_workfn [sunrpc]
Call Trace:
 <IRQ>
dump_stack (lib/dump_stack.c:122)
print_address_description.constprop.0 (mm/kasan/report.c:233)
kasan_report.cold (mm/kasan/report.c:400 mm/kasan/report.c:416)
skb_gro_receive (net/core/skbuff.c:4260)
tcp_gro_receive (net/ipv4/tcp_offload.c:266 (discriminator 1))
tcp4_gro_receive (net/ipv4/tcp_offload.c:316)
inet_gro_receive (net/ipv4/af_inet.c:1545 (discriminator 2))
dev_gro_receive (net/core/dev.c:6075)
napi_gro_receive (net/core/dev.c:6168 net/core/dev.c:6198)
receive_buf (drivers/net/virtio_net.c:1151) virtio_net
virtnet_poll (drivers/net/virtio_net.c:1415 drivers/net/virtio_net.c:1519) virtio_net
__napi_poll (net/core/dev.c:6964)
net_rx_action (net/core/dev.c:7033 net/core/dev.c:7118)
__do_softirq (./arch/x86/include/asm/jump_label.h:25 ./include/linux/jump_label.h:200 ./include/trace/events/irq.h:142 kernel/softirq.c:346)
irq_exit_rcu (kernel/softirq.c:221 kernel/softirq.c:422 kernel/softirq.c:434)
common_interrupt (arch/x86/kernel/irq.c:240 (discriminator 14))
</IRQ>
asm_common_interrupt (./arch/x86/include/asm/idtentry.h:623)
RIP: 0010:read_hpet (arch/x86/kernel/hpet.c:823 arch/x86/kernel/hpet.c:793)
Code: 90 48 8b 05 a5 ef 6f 02 48 89 44 24 68 48 c1 e8 20 89 c2 3b 44 24 4c 74 d1 89 d0 e9 c7 fe ff ff e8 38 90 35 00 fb 8b 44 24 6c <e9> b8 fe ff ff 8b 54 24 6
All code
========
   0:	90                   	nop
   1:	48 8b 05 a5 ef 6f 02 	mov    0x26fefa5(%rip),%rax        # 0x26fefad
   8:	48 89 44 24 68       	mov    %rax,0x68(%rsp)
   d:	48 c1 e8 20          	shr    $0x20,%rax
  11:	89 c2                	mov    %eax,%edx
  13:	3b 44 24 4c          	cmp    0x4c(%rsp),%eax
  17:	74 d1                	je     0xffffffffffffffea
  19:	89 d0                	mov    %edx,%eax
  1b:	e9 c7 fe ff ff       	jmpq   0xfffffffffffffee7
  20:	e8 38 90 35 00       	callq  0x35905d
  25:	fb                   	sti
  26:	8b 44 24 6c          	mov    0x6c(%rsp),%eax
  2a:*	e9 b8 fe ff ff       	jmpq   0xfffffffffffffee7		<-- trapping instruction
  2f:	8b 54 24 06          	mov    0x6(%rsp),%edx

Code starting with the faulting instruction
===========================================
   0:	e9 b8 fe ff ff       	jmpq   0xfffffffffffffebd
   5:	8b 54 24 06          	mov    0x6(%rsp),%edx
c 89 d0 e9 ad fe ff ff e8 fe 8c 35 00 e9
RSP: 0018:ffffc900015a7a68 EFLAGS: 00000206
RAX: 000000004ad84e9a RBX: 1ffff920002b4f4e RCX: ffffffff888897f7
RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000000
RBP: 0000000000000200 R08: 0000000000000001 R09: ffffffff8c645737
R10: fffffbfff18c8ae6 R11: 0000000000000001 R12: dffffc0000000000
R13: 00000016f23c724e R14: ffff888154e24000 R15: ffff88810c2b2c00
ktime_get (kernel/time/timekeeping.c:290 (discriminator 4) kernel/time/timekeeping.c:386 (discriminator 4) kernel/time/timekeeping.c:829 (discriminator 4))
xprt_lookup_rqst (net/sunrpc/xprt.c:1049) sunrpc
xs_read_stream.constprop.0 (net/sunrpc/xprtsock.c:595 net/sunrpc/xprtsock.c:646) sunrpc
xs_stream_data_receive_workfn (net/sunrpc/xprtsock.c:712 net/sunrpc/xprtsock.c:732) sunrpc
process_one_work (./arch/x86/include/asm/jump_label.h:25 ./include/linux/jump_label.h:200 ./include/trace/events/workqueue.h:108 kernel/workqueue.c:2280)
worker_thread (./include/linux/list.h:282 kernel/workqueue.c:2422)
kthread (kernel/kthread.c:292)
ret_from_fork (arch/x86/entry/entry_64.S:300)

The buggy address belongs to the page:
page:000000000b3e5dba refcount:21 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x116198
head:000000000b3e5dba order:3 compound_mapcount:0 compound_pincount:0
flags: 0x200000000010000(head)
raw: 0200000000010000 dead000000000100 dead000000000122 0000000000000000
raw: 0000000000000000 0000000000000000 00000015ffffffff 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff88811619ff00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
 ffff88811619ff80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>ffff8881161a0000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff

                   ^
 ffff8881161a0080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
 ffff8881161a0100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
Ido Schimmel April 22, 2021, 12:55 p.m. UTC | #9
On Thu, Apr 22, 2021 at 08:12:31PM +0800, Xuan Zhuo wrote:
> Thank you very much for reporting this problem. Can you try this patch? Of

> course, it also includes two patches from eric.

> 

>  af39c8f72301 virtio-net: fix use-after-free in page_to_skb()

>  f5d7872a8b8a virtio-net: restrict build_skb() use to some arches


Applied your patch on top of net-next, looks good. Feel free to add:

Tested-by: Ido Schimmel <idosch@nvidia.com>


Thanks for the quick fix
diff mbox series

Patch

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 101659cd4b87..8cd76037c724 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -379,21 +379,17 @@  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 				   struct receive_queue *rq,
 				   struct page *page, unsigned int offset,
 				   unsigned int len, unsigned int truesize,
-				   bool hdr_valid, unsigned int metasize)
+				   bool hdr_valid, unsigned int metasize,
+				   unsigned int headroom)
 {
 	struct sk_buff *skb;
 	struct virtio_net_hdr_mrg_rxbuf *hdr;
 	unsigned int copy, hdr_len, hdr_padded_len;
-	char *p;
+	int tailroom, shinfo_size;
+	char *p, *hdr_p;

 	p = page_address(page) + offset;
-
-	/* copy small packet so we can reuse these pages for small data */
-	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
-	if (unlikely(!skb))
-		return NULL;
-
-	hdr = skb_vnet_hdr(skb);
+	hdr_p = p;

 	hdr_len = vi->hdr_len;
 	if (vi->mergeable_rx_bufs)
@@ -401,14 +397,38 @@  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 	else
 		hdr_padded_len = sizeof(struct padded_vnet_hdr);

-	/* hdr_valid means no XDP, so we can copy the vnet header */
-	if (hdr_valid)
-		memcpy(hdr, p, hdr_len);
+	/* If headroom is not 0, there is an offset between the beginning of the
+	 * data and the allocated space, otherwise the data and the allocated
+	 * space are aligned.
+	 */
+	if (headroom) {
+		/* The actual allocated space size is PAGE_SIZE. */
+		truesize = PAGE_SIZE;
+		tailroom = truesize - len - offset;
+	} else {
+		tailroom = truesize - len;
+	}

 	len -= hdr_len;
 	offset += hdr_padded_len;
 	p += hdr_padded_len;

+	shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+	if (len > GOOD_COPY_LEN && tailroom >= shinfo_size) {
+		skb = build_skb(p, truesize);
+		if (unlikely(!skb))
+			return NULL;
+
+		skb_put(skb, len);
+		goto ok;
+	}
+
+	/* copy small packet so we can reuse these pages for small data */
+	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
+	if (unlikely(!skb))
+		return NULL;
+
 	/* Copy all frame if it fits skb->head, otherwise
 	 * we let virtio_net_hdr_to_skb() and GRO pull headers as needed.
 	 */
@@ -418,11 +438,6 @@  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 		copy = ETH_HLEN + metasize;
 	skb_put_data(skb, p, copy);

-	if (metasize) {
-		__skb_pull(skb, metasize);
-		skb_metadata_set(skb, metasize);
-	}
-
 	len -= copy;
 	offset += copy;

@@ -431,7 +446,7 @@  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 			skb_add_rx_frag(skb, 0, page, offset, len, truesize);
 		else
 			put_page(page);
-		return skb;
+		goto ok;
 	}

 	/*
@@ -458,6 +473,18 @@  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 	if (page)
 		give_pages(rq, page);

+ok:
+	/* hdr_valid means no XDP, so we can copy the vnet header */
+	if (hdr_valid) {
+		hdr = skb_vnet_hdr(skb);
+		memcpy(hdr, hdr_p, hdr_len);
+	}
+
+	if (metasize) {
+		__skb_pull(skb, metasize);
+		skb_metadata_set(skb, metasize);
+	}
+
 	return skb;
 }

@@ -808,7 +835,7 @@  static struct sk_buff *receive_big(struct net_device *dev,
 {
 	struct page *page = buf;
 	struct sk_buff *skb =
-		page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, true, 0);
+		page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, true, 0, 0);

 	stats->bytes += len - vi->hdr_len;
 	if (unlikely(!skb))
@@ -922,7 +949,7 @@  static struct sk_buff *receive_mergeable(struct net_device *dev,
 				put_page(page);
 				head_skb = page_to_skb(vi, rq, xdp_page, offset,
 						       len, PAGE_SIZE, false,
-						       metasize);
+						       metasize, headroom);
 				return head_skb;
 			}
 			break;
@@ -980,7 +1007,7 @@  static struct sk_buff *receive_mergeable(struct net_device *dev,
 	}

 	head_skb = page_to_skb(vi, rq, page, offset, len, truesize, !xdp_prog,
-			       metasize);
+			       metasize, headroom);
 	curr_skb = head_skb;

 	if (unlikely(!curr_skb))