[RFC,V3,10/11] HV/Netvsc: Add Isolation VM support for netvsc driver

Message ID 20210530150628.2063957-11-ltykernel@gmail.com
State New
Series
  • x86/Hyper-V: Add Hyper-V Isolation VM support

Commit Message

Tianyu Lan May 30, 2021, 3:06 p.m.
From: Tianyu Lan <Tianyu.Lan@microsoft.com>

In an Isolation VM, all memory shared with the host must be marked
visible to the host via a hypercall. vmbus_establish_gpadl() already
does this for the netvsc rx/tx ring buffers, but the page buffers used
by vmbus_sendpacket_pagebuffer() still need to be handled. Use the DMA
API to map/unmap this memory when sending/receiving packets; the
Hyper-V DMA ops callback will use swiotlb to allocate a bounce buffer
and copy data from/to it.

Signed-off-by: Tianyu Lan <Tianyu.Lan@microsoft.com>
---
 drivers/net/hyperv/hyperv_net.h   |   6 ++
 drivers/net/hyperv/netvsc.c       | 125 ++++++++++++++++++++++++++++--
 drivers/net/hyperv/rndis_filter.c |   3 +
 include/linux/hyperv.h            |   5 ++
 4 files changed, 133 insertions(+), 6 deletions(-)
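
In outline, the send path added below brackets vmbus_sendpacket_pagebuffer()
with a map/unmap pair. A condensed sketch of the flow (using the helpers this
patch adds; error handling trimmed):

	/* In netvsc_send_pkt(): bounce the page buffers, then send. */
	ret = netvsc_dma_map(ndev_ctx->device_ctx, packet, pb);
	if (ret)
		return ret;

	ret = vmbus_sendpacket_pagebuffer(out_channel, pb,
					  packet->page_buf_cnt,
					  &nvmsg, sizeof(nvmsg), req_id);
	if (ret)
		netvsc_dma_unmap(ndev_ctx->device_ctx, packet);

	/* On success, the unmap happens at completion time instead, in
	 * netvsc_send_tx_complete() or rndis_filter_receive_response().
	 */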

Comments

Christoph Hellwig June 7, 2021, 6:50 a.m. | #1
On Sun, May 30, 2021 at 11:06:27AM -0400, Tianyu Lan wrote:
> +	if (hv_isolation_type_snp()) {
> +		pfns = kcalloc(buf_size / HV_HYP_PAGE_SIZE, sizeof(unsigned long),
> +			       GFP_KERNEL);
> +		for (i = 0; i < buf_size / HV_HYP_PAGE_SIZE; i++)
> +			pfns[i] = virt_to_hvpfn(net_device->recv_buf + i * HV_HYP_PAGE_SIZE) +
> +				(ms_hyperv.shared_gpa_boundary >> HV_HYP_PAGE_SHIFT);
> +
> +		vaddr = vmap_pfn(pfns, buf_size / HV_HYP_PAGE_SIZE, PAGE_KERNEL_IO);
> +		kfree(pfns);
> +		if (!vaddr)
> +			goto cleanup;
> +		net_device->recv_original_buf = net_device->recv_buf;
> +		net_device->recv_buf = vaddr;
> +	}

This probably wants a helper to make the thing more readable.  But who
came up with this fucked up communication protocol where the host needs
to map random pfns into a contiguous range?  Sometimes I really have to
wonder what crack the hyper-v people take when comparing this to the
relatively sane approach others take.

> +	for (i = 0; i < page_count; i++)
> +		dma_unmap_single(&hv_dev->device, packet->dma_range[i].dma,
> +				 packet->dma_range[i].mapping_size,
> +				 DMA_TO_DEVICE);
> +
> +	kfree(packet->dma_range);

Any reason this isn't simply using a struct scatterlist?

> +	for (i = 0; i < page_count; i++) {
> +		char *src = phys_to_virt((pb[i].pfn << HV_HYP_PAGE_SHIFT)
> +					 + pb[i].offset);
> +		u32 len = pb[i].len;
> +
> +		dma = dma_map_single(&hv_dev->device, src, len,
> +				     DMA_TO_DEVICE);

dma_map_single can only be used on page backed memory, and if this is
using page backed memory you wouldn't need to do these phys_to_virt
tricks.  Can someone explain the mess here in more detail?

>  	struct rndis_device *dev = nvdev->extension;
>  	struct rndis_request *request = NULL;
> +	struct hv_device *hv_dev = ((struct net_device_context *)
> +			netdev_priv(ndev))->device_ctx;

Why not use a net_device_context local variable instead of this cast
galore?
Tianyu Lan June 7, 2021, 3:21 p.m. | #2
On 6/7/2021 2:50 PM, Christoph Hellwig wrote:
> On Sun, May 30, 2021 at 11:06:27AM -0400, Tianyu Lan wrote:

>> +	if (hv_isolation_type_snp()) {
>> +		pfns = kcalloc(buf_size / HV_HYP_PAGE_SIZE, sizeof(unsigned long),
>> +			       GFP_KERNEL);
>> +		for (i = 0; i < buf_size / HV_HYP_PAGE_SIZE; i++)
>> +			pfns[i] = virt_to_hvpfn(net_device->recv_buf + i * HV_HYP_PAGE_SIZE) +
>> +				(ms_hyperv.shared_gpa_boundary >> HV_HYP_PAGE_SHIFT);
>> +
>> +		vaddr = vmap_pfn(pfns, buf_size / HV_HYP_PAGE_SIZE, PAGE_KERNEL_IO);
>> +		kfree(pfns);
>> +		if (!vaddr)
>> +			goto cleanup;
>> +		net_device->recv_original_buf = net_device->recv_buf;
>> +		net_device->recv_buf = vaddr;
>> +	}
> 
> This probably wants a helper to make the thing more readable.  But who
> came up with this fucked up communication protocol where the host needs
> to map random pfns into a contiguous range?  Sometimes I really have to
> wonder what crack the hyper-v people take when comparing this to the
> relatively sane approach others take.

Agree. Will add a helper function.
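
Something along these lines would factor out the code quoted above (an
untested sketch; the netvsc_remap_buf() name is made up here, and it adds
the kcalloc() failure check the inline version is missing):

	/* Remap a buffer through its alias above the shared gpa boundary
	 * (vTOM), where it is already visible to the host. Returns the
	 * new virtual address, or NULL on failure.
	 */
	static void *netvsc_remap_buf(void *buf, unsigned long size)
	{
		unsigned long *pfns;
		void *vaddr;
		int i;

		pfns = kcalloc(size / HV_HYP_PAGE_SIZE, sizeof(unsigned long),
			       GFP_KERNEL);
		if (!pfns)
			return NULL;

		for (i = 0; i < size / HV_HYP_PAGE_SIZE; i++)
			pfns[i] = virt_to_hvpfn(buf + i * HV_HYP_PAGE_SIZE) +
				(ms_hyperv.shared_gpa_boundary >> HV_HYP_PAGE_SHIFT);

		vaddr = vmap_pfn(pfns, size / HV_HYP_PAGE_SIZE, PAGE_KERNEL_IO);
		kfree(pfns);

		return vaddr;
	}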
> 
>> +	for (i = 0; i < page_count; i++)
>> +		dma_unmap_single(&hv_dev->device, packet->dma_range[i].dma,
>> +				 packet->dma_range[i].mapping_size,
>> +				 DMA_TO_DEVICE);
>> +
>> +	kfree(packet->dma_range);
> 
> Any reason this isn't simply using a struct scatterlist?

I will have a look. Thanks for the reminder about scatterlist.
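
For reference, a scatterlist-based variant might look roughly like this
(hypothetical sketch: it assumes PAGE_SIZE == HV_HYP_PAGE_SIZE so that
pfn_to_page() is valid on the Hyper-V PFNs, and the one-to-one writeback
into pb[] ignores that dma_map_sg() may coalesce entries):

	struct scatterlist sgl[MAX_PAGE_BUFFER_COUNT], *sg;
	int i, nents;

	sg_init_table(sgl, page_count);
	for (i = 0; i < page_count; i++)
		sg_set_page(&sgl[i], pfn_to_page(pb[i].pfn),
			    pb[i].len, pb[i].offset);

	nents = dma_map_sg(&hv_dev->device, sgl, page_count, DMA_TO_DEVICE);
	if (!nents)
		return -ENOMEM;

	/* Rewrite the protocol array with the bounce (DMA) addresses. */
	for_each_sg(sgl, sg, nents, i) {
		pb[i].pfn = sg_dma_address(sg) >> HV_HYP_PAGE_SHIFT;
		pb[i].offset = offset_in_hvpage(sg_dma_address(sg));
		pb[i].len = sg_dma_len(sg);
	}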

> 
>> +	for (i = 0; i < page_count; i++) {
>> +		char *src = phys_to_virt((pb[i].pfn << HV_HYP_PAGE_SHIFT)
>> +					 + pb[i].offset);
>> +		u32 len = pb[i].len;
>> +
>> +		dma = dma_map_single(&hv_dev->device, src, len,
>> +				     DMA_TO_DEVICE);
> 
> dma_map_single can only be used on page backed memory, and if this is
> using page backed memory you wouldn't need to do these phys_to_virt
> tricks.  Can someone explain the mess here in more detail?

Sorry, could you elaborate on the issue? The pages in the pb array are
not allocated by the DMA API; dma_map_single() is used here to map
these pages' addresses to bounce buffer physical addresses.

> 
>>   	struct rndis_device *dev = nvdev->extension;
>>   	struct rndis_request *request = NULL;
>> +	struct hv_device *hv_dev = ((struct net_device_context *)
>> +			netdev_priv(ndev))->device_ctx;
> 
> Why not use a net_device_context local variable instead of this cast
> galore?

OK. I will update.
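
I.e. the planned change, roughly:

	struct net_device_context *net_device_ctx = netdev_priv(ndev);
	struct hv_device *hv_dev = net_device_ctx->device_ctx;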


Thanks.
Christoph Hellwig June 14, 2021, 7:09 a.m. | #3
On Mon, Jun 07, 2021 at 11:21:20PM +0800, Tianyu Lan wrote:
>> dma_map_single can only be used on page backed memory, and if this is
>> using page backed memory you wouldn't need to do these phys_to_virt
>> tricks.  Can someone explain the mess here in more detail?
>
> Sorry, could you elaborate on the issue? The pages in the pb array are
> not allocated by the DMA API; dma_map_single() is used here to map
> these pages' addresses to bounce buffer physical addresses.

dma_map_single just calls dma_map_page using virt_to_page.  So this
can't work on addresses not in the kernel linear mapping.
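
(Simplified from include/linux/dma-mapping.h, dma_map_single() is
essentially:

	static inline dma_addr_t dma_map_single(struct device *dev, void *ptr,
						size_t size,
						enum dma_data_direction dir)
	{
		return dma_map_page(dev, virt_to_page(ptr),
				    offset_in_page(ptr), size, dir);
	}

so ptr must be a linear-mapping address for virt_to_page() to be valid.)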
Tianyu Lan June 14, 2021, 2:04 p.m. | #4
On 6/14/2021 3:09 PM, Christoph Hellwig wrote:
> On Mon, Jun 07, 2021 at 11:21:20PM +0800, Tianyu Lan wrote:

>>> dma_map_single can only be used on page backed memory, and if this is
>>> using page backed memory you wouldn't need to do these phys_to_virt
>>> tricks.  Can someone explain the mess here in more detail?
>>
>> Sorry, could you elaborate on the issue? The pages in the pb array are
>> not allocated by the DMA API; dma_map_single() is used here to map
>> these pages' addresses to bounce buffer physical addresses.
> 
> dma_map_single just calls dma_map_page using virt_to_page.  So this
> can't work on addresses not in the kernel linear mapping.

The pages in the hv_page_buffer array here are in the kernel linear
mapping. The packet sent to the host contains an array carrying the
transaction data. In an isolation VM, the data in these pages needs to
be copied to the bounce buffer, so dma_map_single() is called here to
map these data pages to the bounce buffer. The vmbus ring buffer, where
send/receive packets are copied to/from, has already been remapped to
the extra address space above the shared gpa boundary/vTOM when the
netvsc driver probes, so the DMA map functions are not called for the
ring buffer.
Christoph Hellwig June 14, 2021, 3:33 p.m. | #5
On Mon, Jun 14, 2021 at 10:04:06PM +0800, Tianyu Lan wrote:
> The pages in the hv_page_buffer array here are in the kernel linear
> mapping. The packet sent to the host contains an array carrying the
> transaction data. In an isolation VM, the data in these pages needs to
> be copied to the bounce buffer, so dma_map_single() is called here to
> map these data pages to the bounce buffer. The vmbus ring buffer, where
> send/receive packets are copied to/from, has already been remapped to
> the extra address space above the shared gpa boundary/vTOM when the
> netvsc driver probes, so the DMA map functions are not called for the
> ring buffer.

So why do we have all that PFN magic instead of using struct page or
the usual kernel I/O buffers that contain a page pointer?
Tianyu Lan June 15, 2021, 2:31 p.m. | #6
On 6/14/2021 11:33 PM, Christoph Hellwig wrote:
> On Mon, Jun 14, 2021 at 10:04:06PM +0800, Tianyu Lan wrote:
>> The pages in the hv_page_buffer array here are in the kernel linear
>> mapping. The packet sent to the host contains an array carrying the
>> transaction data. In an isolation VM, the data in these pages needs to
>> be copied to the bounce buffer, so dma_map_single() is called here to
>> map these data pages to the bounce buffer. The vmbus ring buffer, where
>> send/receive packets are copied to/from, has already been remapped to
>> the extra address space above the shared gpa boundary/vTOM when the
>> netvsc driver probes, so the DMA map functions are not called for the
>> ring buffer.
> 
> So why do we have all that PFN magic instead of using struct page or
> the usual kernel I/O buffers that contain a page pointer?
> 

These PFNs are originally part of the Hyper-V protocol data that is
sent to the host. The host accepts these GFNs and copies data from/to
guest memory. The translation from VA to PA is done by the caller that
populates the hv_page_buffer array. I will try calling the dma map
function before populating struct hv_page_buffer, which avoids the
redundant translation between VA and PA.
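
E.g. roughly, for a single linear buffer (a sketch of the planned rework
only):

	/* Map first; the returned DMA address already points at the bounce
	 * buffer, so the PFN array can be populated from it directly, with
	 * no phys_to_virt() round trip.
	 */
	dma_addr_t dma = dma_map_single(&hv_dev->device, skb->data, len,
					DMA_TO_DEVICE);

	if (dma_mapping_error(&hv_dev->device, dma))
		return -ENOMEM;

	pb[0].pfn = dma >> HV_HYP_PAGE_SHIFT;
	pb[0].offset = offset_in_hvpage(dma);
	pb[0].len = len;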

Patch

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index b11aa68b44ec..c2fbb9d4df2c 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -164,6 +164,7 @@  struct hv_netvsc_packet {
 	u32 total_bytes;
 	u32 send_buf_index;
 	u32 total_data_buflen;
+	struct hv_dma_range *dma_range;
 };
 
 #define NETVSC_HASH_KEYLEN 40
@@ -1074,6 +1075,7 @@  struct netvsc_device {
 
 	/* Receive buffer allocated by us but manages by NetVSP */
 	void *recv_buf;
+	void *recv_original_buf;
 	u32 recv_buf_size; /* allocated bytes */
 	u32 recv_buf_gpadl_handle;
 	u32 recv_section_cnt;
@@ -1082,6 +1084,8 @@  struct netvsc_device {
 
 	/* Send buffer allocated by us */
 	void *send_buf;
+	void *send_original_buf;
+	u32 send_buf_size;
 	u32 send_buf_gpadl_handle;
 	u32 send_section_cnt;
 	u32 send_section_size;
@@ -1729,4 +1733,6 @@  struct rndis_message {
 #define RETRY_US_HI	10000
 #define RETRY_MAX	2000	/* >10 sec */
 
+void netvsc_dma_unmap(struct hv_device *hv_dev,
+		      struct hv_netvsc_packet *packet);
 #endif /* _HYPERV_NET_H */
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 7bd935412853..a01740c6c6b8 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -153,8 +153,21 @@  static void free_netvsc_device(struct rcu_head *head)
 	int i;
 
 	kfree(nvdev->extension);
-	vfree(nvdev->recv_buf);
-	vfree(nvdev->send_buf);
+
+	if (nvdev->recv_original_buf) {
+		vunmap(nvdev->recv_buf);
+		vfree(nvdev->recv_original_buf);
+	} else {
+		vfree(nvdev->recv_buf);
+	}
+
+	if (nvdev->send_original_buf) {
+		vunmap(nvdev->send_buf);
+		vfree(nvdev->send_original_buf);
+	} else {
+		vfree(nvdev->send_buf);
+	}
+
 	kfree(nvdev->send_section_map);
 
 	for (i = 0; i < VRSS_CHANNEL_MAX; i++) {
@@ -338,8 +351,10 @@  static int netvsc_init_buf(struct hv_device *device,
 	struct net_device *ndev = hv_get_drvdata(device);
 	struct nvsp_message *init_packet;
 	unsigned int buf_size;
+	unsigned long *pfns;
 	size_t map_words;
 	int i, ret = 0;
+	void *vaddr;
 
 	/* Get receive buffer area. */
 	buf_size = device_info->recv_sections * device_info->recv_section_size;
@@ -375,6 +390,21 @@  static int netvsc_init_buf(struct hv_device *device,
 		goto cleanup;
 	}
 
+	if (hv_isolation_type_snp()) {
+		pfns = kcalloc(buf_size / HV_HYP_PAGE_SIZE, sizeof(unsigned long),
+			       GFP_KERNEL);
+		for (i = 0; i < buf_size / HV_HYP_PAGE_SIZE; i++)
+			pfns[i] = virt_to_hvpfn(net_device->recv_buf + i * HV_HYP_PAGE_SIZE) +
+				(ms_hyperv.shared_gpa_boundary >> HV_HYP_PAGE_SHIFT);
+
+		vaddr = vmap_pfn(pfns, buf_size / HV_HYP_PAGE_SIZE, PAGE_KERNEL_IO);
+		kfree(pfns);
+		if (!vaddr)
+			goto cleanup;
+		net_device->recv_original_buf = net_device->recv_buf;
+		net_device->recv_buf = vaddr;
+	}
+
 	/* Notify the NetVsp of the gpadl handle */
 	init_packet = &net_device->channel_init_pkt;
 	memset(init_packet, 0, sizeof(struct nvsp_message));
@@ -477,6 +507,23 @@  static int netvsc_init_buf(struct hv_device *device,
 		goto cleanup;
 	}
 
+	if (hv_isolation_type_snp()) {
+		pfns = kcalloc(buf_size / HV_HYP_PAGE_SIZE, sizeof(unsigned long),
+			       GFP_KERNEL);
+
+		for (i = 0; i < buf_size / HV_HYP_PAGE_SIZE; i++)
+			pfns[i] = virt_to_hvpfn(net_device->send_buf + i * HV_HYP_PAGE_SIZE)
+				+ (ms_hyperv.shared_gpa_boundary >> HV_HYP_PAGE_SHIFT);
+
+		vaddr = vmap_pfn(pfns, buf_size / HV_HYP_PAGE_SIZE, PAGE_KERNEL_IO);
+		kfree(pfns);
+		if (!vaddr)
+			goto cleanup;
+
+		net_device->send_original_buf = net_device->send_buf;
+		net_device->send_buf = vaddr;
+	}
+
 	/* Notify the NetVsp of the gpadl handle */
 	init_packet = &net_device->channel_init_pkt;
 	memset(init_packet, 0, sizeof(struct nvsp_message));
@@ -767,7 +814,7 @@  static void netvsc_send_tx_complete(struct net_device *ndev,
 
 	/* Notify the layer above us */
 	if (likely(skb)) {
-		const struct hv_netvsc_packet *packet
+		struct hv_netvsc_packet *packet
 			= (struct hv_netvsc_packet *)skb->cb;
 		u32 send_index = packet->send_buf_index;
 		struct netvsc_stats *tx_stats;
@@ -783,6 +830,7 @@  static void netvsc_send_tx_complete(struct net_device *ndev,
 		tx_stats->bytes += packet->total_bytes;
 		u64_stats_update_end(&tx_stats->syncp);
 
+		netvsc_dma_unmap(ndev_ctx->device_ctx, packet);
 		napi_consume_skb(skb, budget);
 	}
 
@@ -947,6 +995,63 @@  static void netvsc_copy_to_send_buf(struct netvsc_device *net_device,
 		memset(dest, 0, padding);
 }
 
+void netvsc_dma_unmap(struct hv_device *hv_dev,
+		      struct hv_netvsc_packet *packet)
+{
+	u32 page_count = packet->cp_partial ?
+		packet->page_buf_cnt - packet->rmsg_pgcnt :
+		packet->page_buf_cnt;
+	int i;
+
+	if (!packet->dma_range)
+		return;
+
+	for (i = 0; i < page_count; i++)
+		dma_unmap_single(&hv_dev->device, packet->dma_range[i].dma,
+				 packet->dma_range[i].mapping_size,
+				 DMA_TO_DEVICE);
+
+	kfree(packet->dma_range);
+}
+
+int netvsc_dma_map(struct hv_device *hv_dev,
+		   struct hv_netvsc_packet *packet,
+		   struct hv_page_buffer *pb)
+{
+	u32 page_count =  packet->cp_partial ?
+		packet->page_buf_cnt - packet->rmsg_pgcnt :
+		packet->page_buf_cnt;
+	dma_addr_t dma;
+	int i;
+
+	packet->dma_range = kcalloc(page_count,
+				    sizeof(*packet->dma_range),
+				    GFP_KERNEL);
+	if (!packet->dma_range)
+		return -ENOMEM;
+
+	for (i = 0; i < page_count; i++) {
+		char *src = phys_to_virt((pb[i].pfn << HV_HYP_PAGE_SHIFT)
+					 + pb[i].offset);
+		u32 len = pb[i].len;
+
+		dma = dma_map_single(&hv_dev->device, src, len,
+				     DMA_TO_DEVICE);
+		if (dma_mapping_error(&hv_dev->device, dma)) {
+			kfree(packet->dma_range);
+			return -ENOMEM;
+		}
+
+		packet->dma_range[i].dma = dma;
+		packet->dma_range[i].mapping_size = len;
+		pb[i].pfn = dma >> HV_HYP_PAGE_SHIFT;
+		pb[i].offset = offset_in_hvpage(dma);
+		pb[i].len = len;
+	}
+
+	return 0;
+}
+
 static inline int netvsc_send_pkt(
 	struct hv_device *device,
 	struct hv_netvsc_packet *packet,
@@ -987,14 +1092,22 @@  static inline int netvsc_send_pkt(
 
 	trace_nvsp_send_pkt(ndev, out_channel, rpkt);
 
+	packet->dma_range = NULL;
 	if (packet->page_buf_cnt) {
 		if (packet->cp_partial)
 			pb += packet->rmsg_pgcnt;
 
+		ret = netvsc_dma_map(ndev_ctx->device_ctx, packet, pb);
+		if (ret)
+			return ret;
+
 		ret = vmbus_sendpacket_pagebuffer(out_channel,
-						  pb, packet->page_buf_cnt,
-						  &nvmsg, sizeof(nvmsg),
-						  req_id);
+					  pb, packet->page_buf_cnt,
+					  &nvmsg, sizeof(nvmsg),
+					  req_id);
+
+		if (ret)
+			netvsc_dma_unmap(ndev_ctx->device_ctx, packet);
 	} else {
 		ret = vmbus_sendpacket(out_channel,
 				       &nvmsg, sizeof(nvmsg),
diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index 983bf362466a..448c1ee39246 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -293,6 +293,8 @@  static void rndis_filter_receive_response(struct net_device *ndev,
 	u32 *req_id = &resp->msg.init_complete.req_id;
 	struct rndis_device *dev = nvdev->extension;
 	struct rndis_request *request = NULL;
+	struct hv_device *hv_dev = ((struct net_device_context *)
+			netdev_priv(ndev))->device_ctx;
 	bool found = false;
 	unsigned long flags;
 
@@ -361,6 +363,7 @@  static void rndis_filter_receive_response(struct net_device *ndev,
 			}
 		}
 
+		netvsc_dma_unmap(hv_dev, &request->pkt);
 		complete(&request->wait_event);
 	} else {
 		netdev_err(ndev,
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index babbe19f57e2..90abff664495 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1616,6 +1616,11 @@  struct hyperv_service_callback {
 	void (*callback)(void *context);
 };
 
+struct hv_dma_range {
+	dma_addr_t dma;
+	u32 mapping_size;
+};
+
 #define MAX_SRV_VER	0x7ffffff
 extern bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, u8 *buf, u32 buflen,
 				const int *fw_version, int fw_vercnt,