diff mbox series

[v9,24/27] virt: gunyah: Add proxy-scheduled vCPUs

Message ID 20230120224627.4053418-25-quic_eberman@quicinc.com
State New
Headers show
Series Drivers for gunyah hypervisor | expand

Commit Message

Elliot Berman Jan. 20, 2023, 10:46 p.m. UTC
Gunyah allows host virtual machines to schedule guest virtual machines
and handle their MMIO accesses. vCPUs are presented to the host as a
Gunyah resource and represented to userspace as a Gunyah VM function.

Creating the vcpu VM function will create a file descriptor that:
 - can run an ioctl: GH_VCPU_RUN to schedule the guest vCPU until the
   next interrupt occurs on the host or when the guest vCPU can no
   longer be run.
 - can be mmap'd to share a gh_vcpu_run structure which can look up the
   reason why GH_VCPU_RUN returned and provide return values for MMIO
   access.

Co-developed-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
Signed-off-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
---
 Documentation/virt/gunyah/vm-manager.rst |  30 +-
 arch/arm64/gunyah/gunyah_hypercall.c     |  28 ++
 drivers/virt/gunyah/Kconfig              |  11 +
 drivers/virt/gunyah/Makefile             |   2 +
 drivers/virt/gunyah/gunyah_vcpu.c        | 358 +++++++++++++++++++++++
 drivers/virt/gunyah/vm_mgr.c             |  25 ++
 drivers/virt/gunyah/vm_mgr.h             |   1 +
 include/linux/gunyah.h                   |   7 +
 include/uapi/linux/gunyah.h              |  30 ++
 9 files changed, 490 insertions(+), 2 deletions(-)
 create mode 100644 drivers/virt/gunyah/gunyah_vcpu.c

Comments

Srinivas Kandagatla Feb. 7, 2023, 2:43 p.m. UTC | #1
On 20/01/2023 22:46, Elliot Berman wrote:
> Gunyah allows host virtual machines to schedule guest virtual machines
> and handle their MMIO accesses. vCPUs are presented to the host as a
> Gunyah resource and represented to userspace as a Gunyah VM function.
> 
> Creating the vcpu VM function will create a file descriptor that:
>   - can run an ioctl: GH_VCPU_RUN to schedule the guest vCPU until the
>     next interrupt occurs on the host or when the guest vCPU can no
>     longer be run.
>   - can be mmap'd to share a gh_vcpu_run structure which can look up the
>     reason why GH_VCPU_RUN returned and provide return values for MMIO
>     access.
> 
> Co-developed-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
> Signed-off-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
> Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
> ---
>   Documentation/virt/gunyah/vm-manager.rst |  30 +-
>   arch/arm64/gunyah/gunyah_hypercall.c     |  28 ++
>   drivers/virt/gunyah/Kconfig              |  11 +
>   drivers/virt/gunyah/Makefile             |   2 +
>   drivers/virt/gunyah/gunyah_vcpu.c        | 358 +++++++++++++++++++++++
>   drivers/virt/gunyah/vm_mgr.c             |  25 ++
>   drivers/virt/gunyah/vm_mgr.h             |   1 +
>   include/linux/gunyah.h                   |   7 +
>   include/uapi/linux/gunyah.h              |  30 ++
>   9 files changed, 490 insertions(+), 2 deletions(-)
>   create mode 100644 drivers/virt/gunyah/gunyah_vcpu.c
> 
...


> diff --git a/include/uapi/linux/gunyah.h b/include/uapi/linux/gunyah.h
> index ec8da6fde045..b4afb11f538a 100644
> --- a/include/uapi/linux/gunyah.h
> +++ b/include/uapi/linux/gunyah.h
> @@ -53,9 +53,14 @@ struct gh_vm_dtb_config {
>   #define GUNYAH_FUNCTION_NAME_SIZE		32
>   #define GUNYAH_FUNCTION_MAX_ARG_SIZE		1024
>   
> +struct gh_fn_vcpu_arg {
> +	__u32 vcpu_id;
> +};
> +
>   struct gh_vm_function {
>   	char name[GUNYAH_FUNCTION_NAME_SIZE];
>   	union {
> +		struct gh_device_vcpu_arg vcpu;
>   		char data[GUNYAH_FUNCTION_MAX_ARG_SIZE];
>   	};
>   };
> @@ -63,4 +68,29 @@ struct gh_vm_function {
>   #define GH_VM_ADD_FUNCTION	_IOW(GH_IOCTL_TYPE, 0x4, struct gh_vm_function)
>   #define GH_VM_REMOVE_FUNCTION	_IOW(GH_IOCTL_TYPE, 0x7, struct gh_vm_function)
>   
> +/* for GH_VCPU_RUN, returned by mmap(vcpu_fd, offset=0) */
> +struct gh_vcpu_run {
This looks unused; I don't see any reference to it.

> +	/* in */
> +	__u8 immediate_exit;
> +	__u8 padding1[7];
> +
> +	/* out */
> +#define GH_VM_EXIT_UNKNOWN            0
> +#define GH_VM_EXIT_MMIO               1
> +	__u32 exit_reason;

If this is meant to go in any of the ioctl arguments, then this suffers 
from the same padding issue.

> +
> +	union {
> +		/* GH_VM_EXIT_MMIO */
> +		struct {
> +			__u64 phys_addr;
> +			__u8  data[8];
> +			__u32 len;
> +			__u8  is_write;
> +		} mmio;
> +	};
> +};
> +
> +#define GH_VCPU_RUN		_IO(GH_IOCTL_TYPE, 0x5)
> +#define GH_VCPU_MMAP_SIZE	_IO(GH_IOCTL_TYPE, 0x6)
> +
>   #endif
Elliot Berman Feb. 8, 2023, 6:36 p.m. UTC | #2
On 2/7/2023 6:43 AM, Srinivas Kandagatla wrote:
> 
> 
> On 20/01/2023 22:46, Elliot Berman wrote:
>> Gunyah allows host virtual machines to schedule guest virtual machines
>> and handle their MMIO accesses. vCPUs are presented to the host as a
>> Gunyah resource and represented to userspace as a Gunyah VM function.
>>
>> Creating the vcpu VM function will create a file descriptor that:
>>   - can run an ioctl: GH_VCPU_RUN to schedule the guest vCPU until the
>>     next interrupt occurs on the host or when the guest vCPU can no
>>     longer be run.
>>   - can be mmap'd to share a gh_vcpu_run structure which can look up the
>>     reason why GH_VCPU_RUN returned and provide return values for MMIO
>>     access.
>>
>> Co-developed-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
>> Signed-off-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
>> Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
>> ---
>>   Documentation/virt/gunyah/vm-manager.rst |  30 +-
>>   arch/arm64/gunyah/gunyah_hypercall.c     |  28 ++
>>   drivers/virt/gunyah/Kconfig              |  11 +
>>   drivers/virt/gunyah/Makefile             |   2 +
>>   drivers/virt/gunyah/gunyah_vcpu.c        | 358 +++++++++++++++++++++++
>>   drivers/virt/gunyah/vm_mgr.c             |  25 ++
>>   drivers/virt/gunyah/vm_mgr.h             |   1 +
>>   include/linux/gunyah.h                   |   7 +
>>   include/uapi/linux/gunyah.h              |  30 ++
>>   9 files changed, 490 insertions(+), 2 deletions(-)
>>   create mode 100644 drivers/virt/gunyah/gunyah_vcpu.c
>>
> ...
> 
> 
>> diff --git a/include/uapi/linux/gunyah.h b/include/uapi/linux/gunyah.h
>> index ec8da6fde045..b4afb11f538a 100644
>> --- a/include/uapi/linux/gunyah.h
>> +++ b/include/uapi/linux/gunyah.h
>> @@ -53,9 +53,14 @@ struct gh_vm_dtb_config {
>>   #define GUNYAH_FUNCTION_NAME_SIZE        32
>>   #define GUNYAH_FUNCTION_MAX_ARG_SIZE        1024
>> +struct gh_fn_vcpu_arg {
>> +    __u32 vcpu_id;
>> +};
>> +
>>   struct gh_vm_function {
>>       char name[GUNYAH_FUNCTION_NAME_SIZE];
>>       union {
>> +        struct gh_device_vcpu_arg vcpu;
>>           char data[GUNYAH_FUNCTION_MAX_ARG_SIZE];
>>       };
>>   };
>> @@ -63,4 +68,29 @@ struct gh_vm_function {
>>   #define GH_VM_ADD_FUNCTION    _IOW(GH_IOCTL_TYPE, 0x4, struct 
>> gh_vm_function)
>>   #define GH_VM_REMOVE_FUNCTION    _IOW(GH_IOCTL_TYPE, 0x7, struct 
>> gh_vm_function)
>> +/* for GH_VCPU_RUN, returned by mmap(vcpu_fd, offset=0) */
>> +struct gh_vcpu_run {
> this looks unused, I dont see any reference to this.
> 

This structure gets mapped into userspace from the vCPU file descriptor, 
similar principle to KVM.

Thanks,
Elliot

>> +    /* in */
>> +    __u8 immediate_exit;
>> +    __u8 padding1[7];
>> +
>> +    /* out */
>> +#define GH_VM_EXIT_UNKNOWN            0
>> +#define GH_VM_EXIT_MMIO               1
>> +    __u32 exit_reason;
> 
> If this is ment to go in any of the ioctl arguments then this suffers 
> same issue of padding.
> 
>> +
>> +    union {
>> +        /* GH_VM_EXIT_MMIO */
>> +        struct {
>> +            __u64 phys_addr;
>> +            __u8  data[8];
>> +            __u32 len;
>> +            __u8  is_write;
>> +        } mmio;
>> +    };
>> +};
>> +
>> +#define GH_VCPU_RUN        _IO(GH_IOCTL_TYPE, 0x5)
>> +#define GH_VCPU_MMAP_SIZE    _IO(GH_IOCTL_TYPE, 0x6)
>> +
>>   #endif
Srivatsa Vaddagiri Feb. 9, 2023, 10:39 a.m. UTC | #3
* Elliot Berman <quic_eberman@quicinc.com> [2023-01-20 14:46:23]:

> +static void gh_handle_mmio_return(struct gunyah_vcpu *vcpu, u64 *state)
> +{
> +	if (!vcpu->vcpu_run->mmio.is_write)
> +		memcpy(&state[0], vcpu->vcpu_run->mmio.data, vcpu->vcpu_run->mmio.len);

Would be good to do a bound check on length of memcpy I think (in case 
vcpu_run_resp->state_data[1] is wrong for example).

> +
> +	vcpu->handle_mmio = false;
> +	vcpu->vcpu_run->exit_reason = GH_VM_EXIT_UNKNOWN;
> +}
> +

// snip

> +static int gh_vcpu_run(struct gunyah_vcpu *vcpu)
> +{
> +	struct gh_hypercall_vcpu_run_resp vcpu_run_resp;
> +	u64 state_data[3] = { 0 };
> +	int ret = 0;
> +
> +	ret = gh_vm_ensure_started(vcpu->ghvm);
> +	if (ret)
> +		return ret;

Can we move this to VM_START ioctl and avoid this check in fast path? In case VM
is not started, then I think gh_hypercall_vcpu_run() will fail which can catch
erroneous use of VCPU_RUN w/o a preceding VM_START. Alternately we could use a
flag in vcpu struct to check for this case (similar to test for vcpu->rsc
below).

// snip

> +			case GH_VCPU_STATE_EXPECTS_WAKEUP:
> +			case GH_VCPU_STATE_POWERED_OFF:
> +				ret = wait_for_completion_interruptible(&vcpu->ready);

I think we should end this wait in case immediate_exit is set as well.

> +static vm_fault_t gh_vcpu_fault(struct vm_fault *vmf)
> +{
> +	struct gunyah_vcpu *vcpu = vmf->vma->vm_file->private_data;
> +	struct page *page = NULL;
> +
> +	if (vmf->pgoff == 0)
> +		page = virt_to_page(vcpu->vcpu_run);
> +
> +	get_page(page);

We should avoid get_page in case page is NULL.

> +	vmf->page = page;
> +	return 0;
> +}
> +
> +static void gunyah_vcpu_unpopulate(struct gunyah_vm_resource_ticket *ticket,
> +				   struct gunyah_resource *ghrsc)
> +{
> +	struct gunyah_vcpu *vcpu = container_of(ticket, struct gunyah_vcpu, ticket);
> +
> +	vcpu->vcpu_run->immediate_exit = true;

We should poke the vcpu thread as well so that it can notice this.
Otherwise it can continue to be in gh_hypercall_vcpu_run() or
wait_for_completion_interruptible() for longer time to come.

> +	mutex_lock(&vcpu->run_lock);
> +	free_irq(vcpu->rsc->irq, vcpu);
> +	vcpu->rsc = NULL;
> +	mutex_unlock(&vcpu->run_lock);
> +}
> +
> +static long gunyah_vcpu_bind(struct gunyah_vm_function *f)
> +{
> +	struct gunyah_vcpu *vcpu;
> +	char name[MAX_VCPU_NAME];
> +	struct file *file;
> +	struct page *page;
> +	int fd;
> +	long r;
> +
> +	if (!gh_api_has_feature(GH_API_FEATURE_VCPU))

We should test for this feature before registering the function? What's
the point in registering a function otherwise if it can't do its work!
Srivatsa Vaddagiri Feb. 10, 2023, 6:54 a.m. UTC | #4
* Srivatsa Vaddagiri <quic_svaddagi@quicinc.com> [2023-02-09 16:10:06]:

> * Elliot Berman <quic_eberman@quicinc.com> [2023-01-20 14:46:23]:
> 
> > +static void gh_handle_mmio_return(struct gunyah_vcpu *vcpu, u64 *state)
> > +{
> > +	if (!vcpu->vcpu_run->mmio.is_write)
> > +		memcpy(&state[0], vcpu->vcpu_run->mmio.data, vcpu->vcpu_run->mmio.len);
> 
> Would be good to do a bound check on length of memcpy I think (in case 
> vcpu_run_resp->state_data[1] is wrong for example).

Sorry I meant to comment that for the other memcpy you had!

+               memcpy(vcpu->vcpu_run->mmio.data, &vcpu_run_resp->state_data[2],
+                       vcpu_run_resp->state_data[1]);
Elliot Berman Feb. 10, 2023, 5:09 p.m. UTC | #5
On 2/9/2023 10:54 PM, Srivatsa Vaddagiri wrote:
> * Srivatsa Vaddagiri <quic_svaddagi@quicinc.com> [2023-02-09 16:10:06]:
> 
>> * Elliot Berman <quic_eberman@quicinc.com> [2023-01-20 14:46:23]:
>>
>>> +static void gh_handle_mmio_return(struct gunyah_vcpu *vcpu, u64 *state)
>>> +{
>>> +	if (!vcpu->vcpu_run->mmio.is_write)
>>> +		memcpy(&state[0], vcpu->vcpu_run->mmio.data, vcpu->vcpu_run->mmio.len);
>>
>> Would be good to do a bound check on length of memcpy I think (in case
>> vcpu_run_resp->state_data[1] is wrong for example).

I think this is the right place to ensure right value is passed. Right 
now, vcpu_run->mmio.len is user accessible and easily provides ability 
for buffer overrun if unchecked. I'm now preserving the length Gunyah 
requested in internal vcpu structure. Userspace can write whatever they 
want to the mmio request, but it won't change the number of bytes that 
we copy from user.

> 
> Sorry I meant to comment that for the other memcpy you had!
> 
> +               memcpy(vcpu->vcpu_run->mmio.data, &vcpu_run_resp->state_data[2],
> +                       vcpu_run_resp->state_data[1]);
> 
> 

Per Srini and Alex, we can trust the values hypervisor sends, so I 
think we can skip this check.

https://lore.kernel.org/all/4a584563-1fb7-22fa-5e16-e0cf5e88b76b@linaro.org/

Thanks,
Elliot
diff mbox series

Patch

diff --git a/Documentation/virt/gunyah/vm-manager.rst b/Documentation/virt/gunyah/vm-manager.rst
index d929f8f20b3b..d11267d59802 100644
--- a/Documentation/virt/gunyah/vm-manager.rst
+++ b/Documentation/virt/gunyah/vm-manager.rst
@@ -5,8 +5,7 @@  Virtual Machine Manager
 =======================
 
 The Gunyah Virtual Machine Manager is a Linux driver to support launching
-virtual machines using Gunyah. It presently supports launching non-proxy
-scheduled Linux-like virtual machines.
+virtual machines using Gunyah.
 
 Except for some basic information about the location of initial binaries,
 most of the configuration about a Gunyah virtual machine is described in the
@@ -116,3 +115,30 @@  GH_VM_START
 ~~~~~~~~~~~
 
 This ioctl starts the VM.
+
+GH_VM_ADD_FUNCTION
+~~~~~~~~~~~~~~~~~~
+
+This ioctl registers a Gunyah VM function with the VM manager. The VM function
+is described with a `type` string and some arguments for that type. Typically,
+the function is added before the VM starts, but the function doesn't "operate"
+until the VM starts with GH_VM_START: e.g. vCPU ioctls will all return an error
+until the VM starts because the vCPUs don't exist until the VM is started. This
+allows the VMM to set up all the kernel functionality needed for the VM *before*
+the VM starts.
+
+The possible types are documented below:
+
+Type: "vcpu"
+^^^^^^^^^^^^
+
+::
+
+  struct gh_fn_vcpu_arg {
+	__u32 vcpu_id;
+  };
+
+The vcpu type will register with the VM Manager to expect to control
+vCPU number `vcpu_id`. It returns a file descriptor allowing interaction with
+the vCPU. See the Gunyah vCPU API description sections for interacting with
+the Gunyah vCPU file descriptors.
diff --git a/arch/arm64/gunyah/gunyah_hypercall.c b/arch/arm64/gunyah/gunyah_hypercall.c
index d93ad2c08479..17d87a130a3e 100644
--- a/arch/arm64/gunyah/gunyah_hypercall.c
+++ b/arch/arm64/gunyah/gunyah_hypercall.c
@@ -15,6 +15,7 @@  static const uint32_t gunyah_known_uuids[][4] = {
 #define GH_HYPERCALL_HYP_IDENTIFY		GH_HYPERCALL(0x0000)
 #define GH_HYPERCALL_MSGQ_SEND			GH_HYPERCALL(0x001B)
 #define GH_HYPERCALL_MSGQ_RECV			GH_HYPERCALL(0x001C)
+#define GH_HYPERCALL_VCPU_RUN			GH_HYPERCALL(0x0065)
 
 /**
  * gh_hypercall_get_uid() - Returns a UID when running under a Gunyah hypervisor
@@ -104,5 +105,32 @@  int gh_hypercall_msgq_recv(u64 capid, uintptr_t buff, size_t size, size_t *recv_
 }
 EXPORT_SYMBOL_GPL(gh_hypercall_msgq_recv);
 
+/*
+ * Issue the Gunyah VCPU_RUN hypercall for the vCPU identified by @capid.
+ * The three words of @resume_data are passed in a2-a4. When the call reports
+ * success (a0 == 0) the vCPU state and its three state words are copied from
+ * a1-a4 into @resp; on failure @resp is left untouched. Returns a0.
+ */
+int gh_hypercall_vcpu_run(u64 capid, u64 *resume_data, struct gh_hypercall_vcpu_run_resp *resp)
+{
+	struct arm_smccc_1_2_regs res;
+	struct arm_smccc_1_2_regs args = {
+		.a0 = GH_HYPERCALL_VCPU_RUN,
+		.a1 = capid,
+		.a2 = resume_data[0],
+		.a3 = resume_data[1],
+		.a4 = resume_data[2],
+		/* Implicitly zero per the C language; Gunyah requires 0, so spell it out */
+		.a5 = 0,
+	};
+
+	arm_smccc_1_2_hvc(&args, &res);
+
+	/* The state registers are only meaningful when the status word is zero */
+	if (!res.a0) {
+		resp->state = res.a1;
+		resp->state_data[0] = res.a2;
+		resp->state_data[1] = res.a3;
+		resp->state_data[2] = res.a4;
+	}
+
+	return res.a0;
+}
+EXPORT_SYMBOL_GPL(gh_hypercall_vcpu_run);
+
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("Gunyah Hypervisor Hypercalls");
diff --git a/drivers/virt/gunyah/Kconfig b/drivers/virt/gunyah/Kconfig
index de815189dab6..4c1c6110b50e 100644
--- a/drivers/virt/gunyah/Kconfig
+++ b/drivers/virt/gunyah/Kconfig
@@ -15,3 +15,14 @@  config GUNYAH
 
 config GUNYAH_PLATFORM_HOOKS
 	tristate
+
+config GUNYAH_VCPU
+	tristate "Runnable Gunyah vCPUs"
+	depends on GUNYAH
+	help
+	  Enable kernel support for host-scheduled vCPUs running under Gunyah.
+	  When selecting this option, userspace virtual machine managers (VMM)
+	  can schedule the guest VM's vCPUs instead of using Gunyah's scheduler.
+	  VMMs can also handle stage 2 faults of the vCPUs.
+
+	  Say Y/M here if unsure and you want to support Gunyah VMMs.
diff --git a/drivers/virt/gunyah/Makefile b/drivers/virt/gunyah/Makefile
index 6b8f84dbfe0d..2d1b604a7b03 100644
--- a/drivers/virt/gunyah/Makefile
+++ b/drivers/virt/gunyah/Makefile
@@ -5,3 +5,5 @@  obj-$(CONFIG_GUNYAH_PLATFORM_HOOKS) += gunyah_platform_hooks.o
 
 gunyah_rsc_mgr-y += rsc_mgr.o rsc_mgr_rpc.o vm_mgr.o vm_mgr_mm.o
 obj-$(CONFIG_GUNYAH) += gunyah_rsc_mgr.o
+
+obj-$(CONFIG_GUNYAH_VCPU) += gunyah_vcpu.o
diff --git a/drivers/virt/gunyah/gunyah_vcpu.c b/drivers/virt/gunyah/gunyah_vcpu.c
new file mode 100644
index 000000000000..5166e7d8638f
--- /dev/null
+++ b/drivers/virt/gunyah/gunyah_vcpu.c
@@ -0,0 +1,358 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2022-2023 Qualcomm Innovation Center, Inc. All rights reserved.
+ */
+
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/gunyah.h>
+#include <linux/gunyah_vm_mgr.h>
+#include <linux/interrupt.h>
+#include <linux/kref.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+
+#include "vm_mgr.h"
+
+#include <uapi/linux/gunyah.h>
+
+#define MAX_VCPU_NAME		20 /* gh-vcpu:u32_max+NUL */
+
+/**
+ * struct gunyah_vcpu - Per-vCPU state behind a vCPU file descriptor
+ * @rsc: Gunyah resource for this vCPU; set by gunyah_vcpu_populate() and
+ *       cleared by gunyah_vcpu_unpopulate(), both under @run_lock
+ * @ticket: Resource ticket registered with the VM in gunyah_vcpu_bind()
+ * @f: VM function this vCPU was bound from; NULL once the function is released
+ * @ghvm: VM this vCPU belongs to; a reference is taken in gunyah_vcpu_bind()
+ * @handle_mmio: true when the last GH_VCPU_RUN exited for MMIO and userspace's
+ *               reply still has to be handed back on the next run
+ * @vcpu_run: Page holding the struct gh_vcpu_run that gh_vcpu_fault() maps into
+ *            userspace
+ * @kref: Reference count; vcpu_release() runs when it drops to zero
+ * @ready: Completed by gh_vcpu_irq_handler(); gh_vcpu_run() sleeps on it
+ * @run_lock: Serializes gh_vcpu_run() and protects @rsc
+ */
+struct gunyah_vcpu {
+	struct gunyah_resource *rsc;
+	struct gunyah_vm_resource_ticket ticket;
+	struct gunyah_vm_function *f;
+	struct gunyah_vm *ghvm;
+
+	bool handle_mmio;
+	struct gh_vcpu_run *vcpu_run;
+
+	struct kref kref;
+	struct completion ready;
+	struct mutex run_lock;
+};
+
+/* VCPU is ready to run */
+#define GH_VCPU_STATE_READY		0
+/* VCPU is sleeping until an interrupt arrives */
+#define GH_VCPU_STATE_EXPECTS_WAKEUP	1
+/* VCPU is powered off */
+#define GH_VCPU_STATE_POWERED_OFF	2
+/* VCPU is blocked in EL2 for unspecified reason */
+#define GH_VCPU_STATE_BLOCKED		3
+/* VCPU has returned for MMIO READ */
+#define GH_VCPU_ADDRSPACE_VMMIO_READ	4
+/* VCPU has returned for MMIO WRITE */
+#define GH_VCPU_ADDRSPACE_VMMIO_WRITE	5
+
+/*
+ * kref release callback: runs once the last reference to the vCPU is dropped
+ * and frees everything the vCPU owns.
+ */
+static void vcpu_release(struct kref *kref)
+{
+	struct gunyah_vcpu *vcpu = container_of(kref, struct gunyah_vcpu, kref);
+
+	/*
+	 * The run page is allocated in gunyah_vcpu_bind() and has no other
+	 * owner; drop our reference here or it is leaked for every vCPU.
+	 */
+	free_page((unsigned long)vcpu->vcpu_run);
+	kfree(vcpu);
+}
+
+/*
+ * The hypervisor raises this interrupt once the vCPU may be scheduled again;
+ * signal vcpu->ready so a sleeping gh_vcpu_run() can resume.
+ */
+static irqreturn_t gh_vcpu_irq_handler(int irq, void *data)
+{
+	struct gunyah_vcpu *sleeper = data;
+
+	complete(&sleeper->ready);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Consume userspace's reply to the previous GH_VM_EXIT_MMIO exit.
+ * @vcpu: vCPU whose shared run page holds the reply
+ * @state: resume data for the next vcpu_run hypercall; for an MMIO read the
+ *         value supplied by userspace is copied into state[0]
+ *
+ * Clears the pending-MMIO flag and resets exit_reason afterwards.
+ */
+static void gh_handle_mmio_return(struct gunyah_vcpu *vcpu, u64 *state)
+{
+	struct gh_vcpu_run *run = vcpu->vcpu_run;
+	u32 len;
+
+	if (!run->mmio.is_write) {
+		/*
+		 * The run page is mapped into userspace, so mmio.len cannot be
+		 * trusted: read it once and never copy more than mmio.data
+		 * holds, which is also the size of the single u64 in state[0].
+		 */
+		len = READ_ONCE(run->mmio.len);
+		if (len > sizeof(run->mmio.data))
+			len = sizeof(run->mmio.data);
+		memcpy(&state[0], run->mmio.data, len);
+	}
+
+	vcpu->handle_mmio = false;
+	run->exit_reason = GH_VM_EXIT_UNKNOWN;
+}
+
+/*
+ * Process an MMIO exit reported by the vcpu_run hypercall.
+ * state_data[0] is the faulting address, state_data[1] the access length and,
+ * for writes, state_data[2] the value written.
+ *
+ * Returns true when a write was fully handled by gh_vm_mgr_mmio_write() and the
+ * vCPU can simply be resumed; returns false after filling in the shared run
+ * page so that userspace services the access.
+ */
+static bool gh_handle_mmio(struct gunyah_vcpu *vcpu,
+				struct gh_hypercall_vcpu_run_resp *vcpu_run_resp)
+{
+	struct gh_vcpu_run *run = vcpu->vcpu_run;
+	u64 addr = vcpu_run_resp->state_data[0];
+	u64 len = vcpu_run_resp->state_data[1];
+
+	if (vcpu_run_resp->state == GH_VCPU_ADDRSPACE_VMMIO_READ) {
+		run->mmio.is_write = 0;
+		run->exit_reason = GH_VM_EXIT_MMIO;
+	} else {
+		/* GH_VCPU_ADDRSPACE_VMMIO_WRITE: give the kernel the first chance */
+		if (!gh_vm_mgr_mmio_write(vcpu->f->ghvm, addr, len,
+					  vcpu_run_resp->state_data[2]))
+			return true;
+
+		run->mmio.is_write = 1;
+		memcpy(run->mmio.data, &vcpu_run_resp->state_data[2], len);
+	}
+
+	run->mmio.phys_addr = addr;
+	run->mmio.len = len;
+	run->exit_reason = GH_VM_EXIT_MMIO;
+
+	return false;
+}
+
+/**
+ * gh_vcpu_run() - Request Gunyah to begin scheduling this vCPU.
+ * @vcpu: The vCPU whose GH_VCPU_RUN ioctl is being serviced
+ *
+ * Repeats the vcpu_run hypercall until an error occurs, a signal is pending,
+ * userspace sets immediate_exit in the shared run page, or an MMIO access has
+ * to be completed by userspace.
+ *
+ * Return: the error from gh_vm_ensure_started() if the VM cannot be run,
+ * -ERESTARTSYS if the run lock could not be taken or a signal is pending on
+ * exit, -ENODEV if the vCPU has no resource or function attached, -EINTR if
+ * immediate_exit was set, otherwise the status of the last step of the loop.
+ */
+static int gh_vcpu_run(struct gunyah_vcpu *vcpu)
+{
+	struct gh_hypercall_vcpu_run_resp vcpu_run_resp;
+	u64 state_data[3] = { 0 };
+	int ret = 0;
+
+	ret = gh_vm_ensure_started(vcpu->ghvm);
+	if (ret)
+		return ret;
+
+	/* run_lock serializes runners and guards vcpu->rsc against (un)populate */
+	if (mutex_lock_interruptible(&vcpu->run_lock))
+		return -ERESTARTSYS;
+
+	if (!vcpu->rsc || !vcpu->f) {
+		ret = -ENODEV;
+		goto out;
+	}
+	/* Last exit reason was EXIT_MMIO. Userspace has filled in the data, now we need to tell
+	 * Gunyah about the response.
+	 */
+	if (vcpu->handle_mmio)
+		gh_handle_mmio_return(vcpu, state_data);
+
+	while (!ret && !signal_pending(current)) {
+		/* Re-checked on every iteration so userspace can force an early return */
+		if (vcpu->vcpu_run->immediate_exit) {
+			ret = -EINTR;
+			goto out;
+		}
+		ret = gh_hypercall_vcpu_run(vcpu->rsc->capid, state_data, &vcpu_run_resp);
+		if (ret == GH_ERROR_OK) {
+			switch (vcpu_run_resp.state) {
+			case GH_VCPU_STATE_READY:
+				/* Still runnable: only yield if the host scheduler asks for it */
+				if (need_resched())
+					schedule();
+				break;
+			case GH_VCPU_STATE_EXPECTS_WAKEUP:
+			case GH_VCPU_STATE_POWERED_OFF:
+				/* Sleep until gh_vcpu_irq_handler() completes vcpu->ready */
+				ret = wait_for_completion_interruptible(&vcpu->ready);
+				/* reinitialize completion before next VCPU_RUN. If we reinitialize
+				 * after the VCPU_RUN, interrupt may have already come before
+				 * we can re-initialize and then waiting for an interrupt that
+				 * was already handled.
+				 */
+				reinit_completion(&vcpu->ready);
+				break;
+			case GH_VCPU_STATE_BLOCKED:
+				schedule();
+				break;
+			case GH_VCPU_ADDRSPACE_VMMIO_READ:
+			case GH_VCPU_ADDRSPACE_VMMIO_WRITE:
+				/*
+				 * false means userspace has to service the access: remember
+				 * that its reply must be fed back on the next GH_VCPU_RUN.
+				 */
+				if (!gh_handle_mmio(vcpu, &vcpu_run_resp)) {
+					vcpu->handle_mmio = true;
+					goto out;
+				}
+				break;
+			default:
+				pr_warn_ratelimited("Unknown vCPU state: %llx\n",
+							vcpu_run_resp.state);
+				schedule();
+				break;
+			}
+		} else if (ret == GH_ERROR_RETRY) {
+			/* Transient failure: yield, then clear ret so the loop tries again */
+			schedule();
+			ret = 0;
+		} else
+			ret = gh_remap_error(ret);
+	}
+
+out:
+	mutex_unlock(&vcpu->run_lock);
+
+	/* A pending signal overrides whatever status was computed above */
+	if (signal_pending(current))
+		return -ERESTARTSYS;
+
+	return ret;
+}
+
+/*
+ * ioctl entry point of the vCPU file descriptor.
+ * GH_VCPU_RUN runs the vCPU, GH_VCPU_MMAP_SIZE reports the size of the
+ * mappable run area (one page); any other command yields -EINVAL.
+ */
+static long gh_vcpu_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	if (cmd == GH_VCPU_RUN)
+		return gh_vcpu_run(filp->private_data);
+
+	if (cmd == GH_VCPU_MMAP_SIZE)
+		return PAGE_SIZE;
+
+	return -EINVAL;
+}
+
+/*
+ * Called when the last reference to the vCPU file is dropped. Releases what
+ * gunyah_vcpu_bind() acquired on behalf of the file: the VM reference and the
+ * file's reference on the vCPU itself.
+ */
+static int gh_vcpu_release(struct inode *inode, struct file *filp)
+{
+	struct gunyah_vcpu *vcpu = filp->private_data;
+
+	/*
+	 * Balance the get_gunyah_vm() from gunyah_vcpu_bind(); nothing else
+	 * drops it. Do this before kref_put(), which may free the vcpu.
+	 */
+	put_gunyah_vm(vcpu->ghvm);
+	kref_put(&vcpu->kref, vcpu_release);
+	return 0;
+}
+
+/*
+ * Page fault handler for mappings of the vCPU file descriptor.
+ * Only page offset 0 is backed (by the shared gh_vcpu_run page); a fault at
+ * any other offset is answered with VM_FAULT_SIGBUS.
+ */
+static vm_fault_t gh_vcpu_fault(struct vm_fault *vmf)
+{
+	struct gunyah_vcpu *vcpu = vmf->vma->vm_file->private_data;
+	struct page *page;
+
+	/* Nothing exists beyond the first page; never hand NULL to get_page() */
+	if (vmf->pgoff != 0)
+		return VM_FAULT_SIGBUS;
+
+	page = virt_to_page(vcpu->vcpu_run);
+	get_page(page);
+	vmf->page = page;
+	return 0;
+}
+
+/* VMA operations installed by gh_vcpu_mmap(); pages are provided on fault */
+static const struct vm_operations_struct gh_vcpu_ops = {
+	.fault = gh_vcpu_fault,
+};
+
+/*
+ * mmap handler of the vCPU file descriptor: no pages are mapped up front, the
+ * VMA is only pointed at gh_vcpu_ops so gh_vcpu_fault() supplies them on demand.
+ */
+static int gh_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_ops = &gh_vcpu_ops;
+	return 0;
+}
+
+/* File operations of the anonymous inode created per vCPU in gunyah_vcpu_bind() */
+static const struct file_operations gh_vcpu_fops = {
+	.unlocked_ioctl = gh_vcpu_ioctl,
+	.release = gh_vcpu_release,
+	.llseek = noop_llseek,
+	.mmap = gh_vcpu_mmap,
+};
+
+/*
+ * Resource ticket callback: the vCPU resource described by @ticket is now
+ * available as @ghrsc. Records the resource, (re)initializes the wakeup
+ * completion and requests the vCPU's interrupt.
+ *
+ * Returns 0 on success or the error from request_irq().
+ */
+static int gunyah_vcpu_populate(struct gunyah_vm_resource_ticket *ticket,
+				struct gunyah_resource *ghrsc)
+{
+	struct gunyah_vcpu *vcpu = container_of(ticket, struct gunyah_vcpu, ticket);
+	int ret;
+
+	/* gh_vcpu_run() reads vcpu->rsc under run_lock */
+	mutex_lock(&vcpu->run_lock);
+	vcpu->rsc = ghrsc;
+
+	init_completion(&vcpu->ready);
+	mutex_unlock(&vcpu->run_lock);
+
+	ret = request_irq(vcpu->rsc->irq, gh_vcpu_irq_handler, IRQF_TRIGGER_RISING, "gh_vcpu",
+			vcpu);
+	if (ret)
+		/* Terminate the message with a newline so it is flushed as its own log line */
+		pr_warn("Failed to request vcpu irq %d: %d\n", vcpu->rsc->irq, ret);
+
+	return ret;
+}
+
+/*
+ * Resource ticket callback: the vCPU resource is going away. Forces any
+ * running gh_vcpu_run() out of its loop, then releases the interrupt and
+ * detaches the resource.
+ */
+static void gunyah_vcpu_unpopulate(struct gunyah_vm_resource_ticket *ticket,
+				   struct gunyah_resource *ghrsc)
+{
+	struct gunyah_vcpu *vcpu = container_of(ticket, struct gunyah_vcpu, ticket);
+
+	vcpu->vcpu_run->immediate_exit = true;
+	/*
+	 * A runner may be asleep on vcpu->ready while holding run_lock. Wake it
+	 * so it notices immediate_exit and drops the lock; otherwise the
+	 * mutex_lock() below waits until a signal or vCPU interrupt arrives.
+	 */
+	complete_all(&vcpu->ready);
+	mutex_lock(&vcpu->run_lock);
+	free_irq(vcpu->rsc->irq, vcpu);
+	vcpu->rsc = NULL;
+	mutex_unlock(&vcpu->run_lock);
+}
+
+/*
+ * Bind callback of the "vcpu" VM function.
+ * Allocates the vCPU and its shared run page, registers a resource ticket for
+ * vCPU number f->fn.vcpu.vcpu_id and creates the anonymous-inode file through
+ * which userspace runs the vCPU.
+ *
+ * Returns the new file descriptor on success or a negative errno on failure.
+ */
+static long gunyah_vcpu_bind(struct gunyah_vm_function *f)
+{
+	struct gunyah_vcpu *vcpu;
+	char name[MAX_VCPU_NAME];
+	struct file *file;
+	struct page *page;
+	int fd;
+	long r;
+
+	if (!gh_api_has_feature(GH_API_FEATURE_VCPU))
+		return -EOPNOTSUPP;
+
+	vcpu = kzalloc(sizeof(*vcpu), GFP_KERNEL);
+	if (!vcpu)
+		return -ENOMEM;
+
+	vcpu->f = f;
+	f->data = vcpu;
+	mutex_init(&vcpu->run_lock);
+	/* Initial reference belongs to the VM function; dropped in gunyah_vcpu_release() */
+	kref_init(&vcpu->kref);
+
+	/* Zeroed page that userspace mmap()s as struct gh_vcpu_run */
+	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+	if (!page) {
+		r = -ENOMEM;
+		goto err_destroy_vcpu;
+	}
+	vcpu->vcpu_run = page_address(page);
+
+	vcpu->ticket.resource_type = GUNYAH_RESOURCE_TYPE_VCPU;
+	vcpu->ticket.label = f->fn.vcpu.vcpu_id;
+	vcpu->ticket.owner = THIS_MODULE;
+	vcpu->ticket.populate = gunyah_vcpu_populate;
+	vcpu->ticket.unpopulate = gunyah_vcpu_unpopulate;
+
+	r = ghvm_add_resource_ticket(f->ghvm, &vcpu->ticket);
+	if (r)
+		goto err_destroy_page;
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0) {
+		r = fd;
+		goto err_remove_vcpu;
+	}
+
+	if (!get_gunyah_vm(f->ghvm)) {
+		r = -ENODEV;
+		goto err_put_fd;
+	}
+	vcpu->ghvm = f->ghvm;
+
+	/* Second reference belongs to the file; dropped in gh_vcpu_release() */
+	kref_get(&vcpu->kref);
+	/* The vCPU id is a __u32: print it unsigned so large ids don't show up negative */
+	snprintf(name, sizeof(name), "gh-vcpu:%u", vcpu->ticket.label);
+	file = anon_inode_getfile(name, &gh_vcpu_fops, vcpu, O_RDWR);
+	if (IS_ERR(file)) {
+		r = PTR_ERR(file);
+		goto err_put_ghvm;
+	}
+
+	fd_install(fd, file);
+
+	return fd;
+err_put_ghvm:
+	put_gunyah_vm(vcpu->ghvm);
+err_put_fd:
+	put_unused_fd(fd);
+err_remove_vcpu:
+	ghvm_remove_resource_ticket(f->ghvm, &vcpu->ticket);
+err_destroy_page:
+	free_page((unsigned long)vcpu->vcpu_run);
+err_destroy_vcpu:
+	kfree(vcpu);
+	return r;
+}
+
+/*
+ * Release callback of the "vcpu" VM function: withdraws the resource ticket,
+ * detaches the vCPU from the function and drops the function's reference.
+ * The vCPU itself lives on until the file descriptor is closed as well.
+ */
+static void gunyah_vcpu_release(struct gunyah_vm_function *f)
+{
+	struct gunyah_vcpu *vcpu = f->data;
+
+	ghvm_remove_resource_ticket(vcpu->f->ghvm, &vcpu->ticket);
+
+	/*
+	 * gh_vcpu_run() tests and dereferences vcpu->f while holding run_lock,
+	 * so clear it under the same lock to keep a concurrent runner from
+	 * seeing the pointer vanish between its check and its use. The lock is
+	 * held only around the store.
+	 */
+	mutex_lock(&vcpu->run_lock);
+	vcpu->f = NULL;
+	mutex_unlock(&vcpu->run_lock);
+
+	kref_put(&vcpu->kref, vcpu_release);
+}
+
+DECLARE_GUNYAH_VM_FUNCTION_INIT(vcpu, gunyah_vcpu_bind, gunyah_vcpu_release);
+MODULE_DESCRIPTION("Gunyah vCPU Driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/virt/gunyah/vm_mgr.c b/drivers/virt/gunyah/vm_mgr.c
index 1dfe354bcc29..9786de138f7c 100644
--- a/drivers/virt/gunyah/vm_mgr.c
+++ b/drivers/virt/gunyah/vm_mgr.c
@@ -431,6 +431,31 @@  static int gh_vm_start(struct gunyah_vm *ghvm)
 	return ret;
 }
 
+/*
+ * Make sure the VM is running before a vCPU is scheduled.
+ * A VM still in GH_RM_VM_STATUS_NO_STATE is started via gh_vm_start() and its
+ * status is then checked again.
+ *
+ * Returns 0 when the VM is in GH_RM_VM_STATUS_READY, -ENODEV when it is in any
+ * other state, or the error from taking the status lock or from gh_vm_start().
+ */
+int gh_vm_ensure_started(struct gunyah_vm *ghvm)
+{
+	int ret;
+
+	for (;;) {
+		ret = down_read_interruptible(&ghvm->status_lock);
+		if (ret)
+			return ret;
+
+		if (likely(ghvm->vm_status != GH_RM_VM_STATUS_NO_STATE))
+			break;
+
+		/* gh_vm_start() is called without the read lock held */
+		up_read(&ghvm->status_lock);
+		ret = gh_vm_start(ghvm);
+		if (ret)
+			return ret;
+	}
+
+	if (unlikely(ghvm->vm_status != GH_RM_VM_STATUS_READY))
+		ret = -ENODEV;
+
+	up_read(&ghvm->status_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(gh_vm_ensure_started);
+
 static void gh_vm_stop(struct gunyah_vm *ghvm)
 {
 	int ret;
diff --git a/drivers/virt/gunyah/vm_mgr.h b/drivers/virt/gunyah/vm_mgr.h
index eb17a2dda2a5..100c457dce14 100644
--- a/drivers/virt/gunyah/vm_mgr.h
+++ b/drivers/virt/gunyah/vm_mgr.h
@@ -64,6 +64,7 @@  struct gunyah_vm_memory_mapping *gh_vm_mem_mapping_find(struct gunyah_vm *ghvm,
 struct gunyah_vm_memory_mapping *gh_vm_mem_mapping_find_mapping(struct gunyah_vm *ghvm,
 								u64 gpa, u32 size);
 
+int gh_vm_ensure_started(struct gunyah_vm *ghvm);
 int gh_vm_mgr_mmio_write(struct gunyah_vm *ghvm, u64 addr, u32 len, u64 data);
 
 #endif
diff --git a/include/linux/gunyah.h b/include/linux/gunyah.h
index d745916cabcc..feeab01bff30 100644
--- a/include/linux/gunyah.h
+++ b/include/linux/gunyah.h
@@ -169,4 +169,11 @@  void gh_hypercall_hyp_identify(struct gh_hypercall_hyp_identify_resp *hyp_identi
 int gh_hypercall_msgq_send(u64 capid, size_t size, uintptr_t buff, int tx_flags, bool *ready);
 int gh_hypercall_msgq_recv(u64 capid, uintptr_t buff, size_t size, size_t *recv_size, bool *ready);
 
+/**
+ * struct gh_hypercall_vcpu_run_resp - Result of a successful vcpu_run hypercall
+ * @state: State the vCPU returned in (hypercall result register a1)
+ * @state_data: Three state-specific words (result registers a2-a4); for MMIO
+ *              exits the vCPU driver reads them as address, length and data
+ */
+struct gh_hypercall_vcpu_run_resp {
+	u64 state;
+	u64 state_data[3];
+};
+
+/* Runs the vCPU @capid; @resume_data must point to three u64 words */
+int gh_hypercall_vcpu_run(u64 capid, u64 *resume_data, struct gh_hypercall_vcpu_run_resp *resp);
+
 #endif
diff --git a/include/uapi/linux/gunyah.h b/include/uapi/linux/gunyah.h
index ec8da6fde045..b4afb11f538a 100644
--- a/include/uapi/linux/gunyah.h
+++ b/include/uapi/linux/gunyah.h
@@ -53,9 +53,14 @@  struct gh_vm_dtb_config {
 #define GUNYAH_FUNCTION_NAME_SIZE		32
 #define GUNYAH_FUNCTION_MAX_ARG_SIZE		1024
 
+/* Argument of the "vcpu" VM function: the number of the vCPU to control */
+struct gh_fn_vcpu_arg {
+	__u32 vcpu_id;
+};
+
 struct gh_vm_function {
 	char name[GUNYAH_FUNCTION_NAME_SIZE];
 	union {
+		/* Arguments of the "vcpu" function type (struct gh_fn_vcpu_arg) */
+		struct gh_fn_vcpu_arg vcpu;
 		char data[GUNYAH_FUNCTION_MAX_ARG_SIZE];
 	};
 };
@@ -63,4 +68,29 @@  struct gh_vm_function {
 #define GH_VM_ADD_FUNCTION	_IOW(GH_IOCTL_TYPE, 0x4, struct gh_vm_function)
 #define GH_VM_REMOVE_FUNCTION	_IOW(GH_IOCTL_TYPE, 0x7, struct gh_vm_function)
 
+/*
+ * for GH_VCPU_RUN, returned by mmap(vcpu_fd, offset=0)
+ *
+ * Shared between kernel and userspace, so every padding byte is spelled out:
+ * the layout must not depend on the alignment rules of the compiling ABI.
+ */
+struct gh_vcpu_run {
+	/* in: set non-zero to make GH_VCPU_RUN return instead of running the vCPU */
+	__u8 immediate_exit;
+	__u8 padding1[7];
+
+	/* out: why GH_VCPU_RUN returned */
+#define GH_VM_EXIT_UNKNOWN            0
+#define GH_VM_EXIT_MMIO               1
+	__u32 exit_reason;
+	/* keeps the union below 8-byte aligned on every ABI */
+	__u32 padding2;
+
+	union {
+		/* GH_VM_EXIT_MMIO */
+		struct {
+			__u64 phys_addr;
+			__u8  data[8];
+			__u32 len;
+			__u8  is_write;
+			/* explicit tail padding up to the 8-byte struct alignment */
+			__u8  padding[3];
+		} mmio;
+	};
+};
+
+#define GH_VCPU_RUN		_IO(GH_IOCTL_TYPE, 0x5)
+#define GH_VCPU_MMAP_SIZE	_IO(GH_IOCTL_TYPE, 0x6)
+
 #endif