diff mbox series

[3/4] intel_idle: Add support for using intel_idle in a VM guest using just hlt

Message ID 20230605154716.840930-4-arjan@linux.intel.com
State Accepted
Commit 2f3d08f074b02aa449de27238fda72496c789034
Headers show
Series Add support for running in VM guests to intel_idle | expand

Commit Message

Arjan van de Ven June 5, 2023, 3:47 p.m. UTC
From: Arjan van de Ven <arjan@linux.intel.com>

In a typical VM guest, the mwait instruction is not available, leaving only the
'hlt' instruction (which causes a VMEXIT to the host).

So for this common case, intel_idle will detect the lack of mwait and fail
to initialize, after which another idle method steps in and always just
uses hlt.

Other (non-common) cases exist; the table below shows the before/after for these:

+------------+--------------------------+-------------------------+
| Hypervisor | Idle method before patch | Idle method after patch |
| exposes    |                          |                         |
+============+==========================+=========================+
| nothing    | default_idle fallback    | intel_idle VM table     |
| (common)   | (straight "hlt")         |                         |
+------------+--------------------------+-------------------------+
| mwait      | intel_idle mwait table   | intel_idle mwait table  |
+------------+--------------------------+-------------------------+
| ACPI       | ACPI C1 state ("hlt")    | intel_idle VM table     |
+------------+--------------------------+-------------------------+

By providing the capability to do this in the intel_idle driver, we can
do better than the fallback or ACPI-table methods. While the current change
only matches the existing behavior, later patches in this series
will add new capabilities such as optimized TLB flushing.

To do this, a simplified version of the initialization function is created
for VM guests; it will be called if the CPU is recognized but mwait is not
supported and we are running in a VM guest.

One thing to note is that the max latency (and break-even point) of this C1
state is higher than that of the typical bare-metal C1 state, because hlt
causes a VMEXIT and the cost of VMEXIT + hypervisor overhead + VMENTER is
typically on the order of up to 5 microseconds, even if the hypervisor does
not actually go into a hardware power-saving state.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 drivers/idle/intel_idle.c | 125 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 124 insertions(+), 1 deletion(-)

Comments

Rafael J. Wysocki June 12, 2023, 6:05 p.m. UTC | #1
On Mon, Jun 5, 2023 at 5:47 PM <arjan@linux.intel.com> wrote:
>
> From: Arjan van de Ven <arjan@linux.intel.com>
>
> In a typical VM guest, the mwait instruction is not available, leaving only the
> 'hlt' instruction (which causes a VMEXIT to the host).
>
> So for this common case, intel_idle will detect the lack of mwait and fail
> to initialize, after which another idle method steps in and always just
> uses hlt.
>
> Other (non-common) cases exist; the table below shows the before/after for these:
>
> +------------+--------------------------+-------------------------+
> | Hypervisor | Idle method before patch | Idle method after patch |
> | exposes    |                          |                         |
> +============+==========================+=========================+
> | nothing    | default_idle fallback    | intel_idle VM table     |
> | (common)   | (straight "hlt")         |                         |
> +------------+--------------------------+-------------------------+
> | mwait      | intel_idle mwait table   | intel_idle mwait table  |
> +------------+--------------------------+-------------------------+
> | ACPI       | ACPI C1 state ("hlt")    | intel_idle VM table     |
> +------------+--------------------------+-------------------------+
>
> By providing the capability to do this in the intel_idle driver, we can
> do better than the fallback or ACPI-table methods. While the current change
> only matches the existing behavior, later patches in this series
> will add new capabilities such as optimized TLB flushing.
>
> To do this, a simplified version of the initialization function is created
> for VM guests; it will be called if the CPU is recognized but mwait is not
> supported and we are running in a VM guest.
>
> One thing to note is that the max latency (and break-even point) of this C1
> state is higher than that of the typical bare-metal C1 state, because hlt
> causes a VMEXIT and the cost of VMEXIT + hypervisor overhead + VMENTER is
> typically on the order of up to 5 microseconds, even if the hypervisor does
> not actually go into a hardware power-saving state.
>
> Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
> ---
>  drivers/idle/intel_idle.c | 125 +++++++++++++++++++++++++++++++++++++-
>  1 file changed, 124 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
> index 256c2d42e350..d2518cf36ab4 100644
> --- a/drivers/idle/intel_idle.c
> +++ b/drivers/idle/intel_idle.c
> @@ -199,6 +199,43 @@ static __cpuidle int intel_idle_xstate(struct cpuidle_device *dev,
>         return __intel_idle(dev, drv, index);
>  }
>
> +static __always_inline int __intel_idle_hlt(struct cpuidle_device *dev,
> +                                       struct cpuidle_driver *drv, int index)
> +{
> +       raw_safe_halt();
> +       raw_local_irq_disable();
> +       return index;
> +}
> +
> +/**
> + * intel_idle_hlt - Ask the processor to enter the given idle state using hlt.
> + * @dev: cpuidle device of the target CPU.
> + * @drv: cpuidle driver (assumed to point to intel_idle_driver).
> + * @index: Target idle state index.
> + *
> + * Use the HLT instruction to notify the processor that the CPU represented by
> + * @dev is idle and it can try to enter the idle state corresponding to @index.
> + *
> + * Must be called under local_irq_disable().
> + */
> +static __cpuidle int intel_idle_hlt(struct cpuidle_device *dev,
> +                               struct cpuidle_driver *drv, int index)
> +{
> +       return __intel_idle_hlt(dev, drv, index);
> +}
> +
> +static __cpuidle int intel_idle_hlt_irq_on(struct cpuidle_device *dev,
> +                                   struct cpuidle_driver *drv, int index)
> +{
> +       int ret;
> +
> +       raw_local_irq_enable();
> +       ret = __intel_idle_hlt(dev, drv, index);
> +       raw_local_irq_disable();
> +
> +       return ret;
> +}
> +
>  /**
>   * intel_idle_s2idle - Ask the processor to enter the given idle state.
>   * @dev: cpuidle device of the target CPU.
> @@ -1242,6 +1279,18 @@ static struct cpuidle_state snr_cstates[] __initdata = {
>                 .enter = NULL }
>  };
>
> +static struct cpuidle_state vmguest_cstates[] __initdata = {
> +       {
> +               .name = "C1",
> +               .desc = "HLT",
> +               .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_IRQ_ENABLE,
> +               .exit_latency = 5,
> +               .target_residency = 10,
> +               .enter = &intel_idle_hlt, },
> +       {
> +               .enter = NULL }
> +};
> +
>  static const struct idle_cpu idle_cpu_nehalem __initconst = {
>         .state_table = nehalem_cstates,
>         .auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE,
> @@ -1841,6 +1890,16 @@ static bool __init intel_idle_verify_cstate(unsigned int mwait_hint)
>
>  static void state_update_enter_method(struct cpuidle_state *state, int cstate)
>  {
> +       if (state->enter == intel_idle_hlt) {
> +               if (force_irq_on) {
> +                       pr_info("forced intel_idle_irq for state %d\n", cstate);
> +                       state->enter = intel_idle_hlt_irq_on;
> +               }
> +               return;
> +       }
> +       if (state->enter == intel_idle_hlt_irq_on)
> +               return; /* no update scenarios */
> +
>         if (state->flags & CPUIDLE_FLAG_INIT_XSTATE) {
>                 /*
>                  * Combining with XSTATE with IBRS or IRQ_ENABLE flags
> @@ -1874,6 +1933,29 @@ static void state_update_enter_method(struct cpuidle_state *state, int cstate)
>         }
>  }
>
> +/*
> + * For mwait based states, we want to verify the cpuid data to see if the state
> + * is actually supported by this specific CPU.
> + * For non-mwait based states, this check should be skipped.
> + */
> +static bool should_verify_mwait(struct cpuidle_state *state)
> +{
> +       if (state->enter == intel_idle_irq)
> +               return true;
> +       if (state->enter == intel_idle)
> +               return true;
> +       if (state->enter == intel_idle_ibrs)
> +               return true;
> +       if (state->enter == intel_idle_xstate)
> +               return true;

Since true is returned by default below, why are the above checks
necessary (or even useful for that matter)?

> +       if (state->enter == intel_idle_hlt)
> +               return false;
> +       if (state->enter == intel_idle_hlt_irq_on)
> +               return false;
> +
> +       return true;
> +}
> +
>  static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
>  {
>         int cstate;
> @@ -1922,7 +2004,7 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
>                 }
>
>                 mwait_hint = flg2MWAIT(cpuidle_state_table[cstate].flags);
> -               if (!intel_idle_verify_cstate(mwait_hint))
> +               if (should_verify_mwait(&cpuidle_state_table[cstate]) && !intel_idle_verify_cstate(mwait_hint))
>                         continue;
>
>                 /* Structure copy. */
> @@ -2056,6 +2138,45 @@ static void __init intel_idle_cpuidle_devices_uninit(void)
>                 cpuidle_unregister_device(per_cpu_ptr(intel_idle_cpuidle_devices, i));
>  }
>
> +static int __init intel_idle_vminit(const struct x86_cpu_id *id)
> +{
> +       int retval;
> +
> +       cpuidle_state_table = vmguest_cstates;
> +
> +       icpu = (const struct idle_cpu *)id->driver_data;
> +
> +       pr_debug("v" INTEL_IDLE_VERSION " model 0x%X\n",
> +                boot_cpu_data.x86_model);
> +
> +       intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device);
> +       if (!intel_idle_cpuidle_devices)
> +               return -ENOMEM;
> +
> +       intel_idle_cpuidle_driver_init(&intel_idle_driver);
> +
> +       retval = cpuidle_register_driver(&intel_idle_driver);
> +       if (retval) {
> +               struct cpuidle_driver *drv = cpuidle_get_driver();
> +               printk(KERN_DEBUG pr_fmt("intel_idle yielding to %s\n"),
> +                      drv ? drv->name : "none");
> +               goto init_driver_fail;
> +       }
> +
> +       retval = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "idle/intel:online",
> +                                  intel_idle_cpu_online, NULL);
> +       if (retval < 0)
> +               goto hp_setup_fail;
> +
> +       return 0;
> +hp_setup_fail:
> +       intel_idle_cpuidle_devices_uninit();
> +       cpuidle_unregister_driver(&intel_idle_driver);
> +init_driver_fail:
> +       free_percpu(intel_idle_cpuidle_devices);
> +       return retval;
> +}
> +
>  static int __init intel_idle_init(void)
>  {
>         const struct x86_cpu_id *id;
> @@ -2074,6 +2195,8 @@ static int __init intel_idle_init(void)
>         id = x86_match_cpu(intel_idle_ids);
>         if (id) {
>                 if (!boot_cpu_has(X86_FEATURE_MWAIT)) {
> +                       if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
> +                               return intel_idle_vminit(id);
>                         pr_debug("Please enable MWAIT in BIOS SETUP\n");
>                         return -ENODEV;
>                 }
> --
> 2.40.1
>
Arjan van de Ven June 12, 2023, 6:07 p.m. UTC | #2
On 6/12/2023 11:05 AM, Rafael J. Wysocki wrote:
> Since true is returned by default below, why are the above checks
> necessary (or even useful for that matter)?

They are really there for documentation purposes, to show that all cases
are handled... but if you hate them I don't mind them going away; I can
respin quickly or send another patch to remove them... up to you.
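
For illustration, a trimmed-down version along those lines would look
roughly like this (an untested sketch, not what was actually applied):

static bool should_verify_mwait(struct cpuidle_state *state)
{
	/* hlt based states carry no mwait hint to check against cpuid */
	if (state->enter == intel_idle_hlt)
		return false;
	if (state->enter == intel_idle_hlt_irq_on)
		return false;

	/* all remaining enter methods are mwait based */
	return true;
}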
Rafael J. Wysocki June 15, 2023, 5:46 p.m. UTC | #3
On Mon, Jun 5, 2023 at 5:47 PM <arjan@linux.intel.com> wrote:
>
> From: Arjan van de Ven <arjan@linux.intel.com>
>
> In a typical VM guest, the mwait instruction is not available, leaving only the
> 'hlt' instruction (which causes a VMEXIT to the host).
>
> So for this common case, intel_idle will detect the lack of mwait and fail
> to initialize, after which another idle method steps in and always just
> uses hlt.
>
> Other (non-common) cases exist; the table below shows the before/after for these:
>
> +------------+--------------------------+-------------------------+
> | Hypervisor | Idle method before patch | Idle method after patch |
> | exposes    |                          |                         |
> +============+==========================+=========================+
> | nothing    | default_idle fallback    | intel_idle VM table     |
> | (common)   | (straight "hlt")         |                         |
> +------------+--------------------------+-------------------------+
> | mwait      | intel_idle mwait table   | intel_idle mwait table  |
> +------------+--------------------------+-------------------------+
> | ACPI       | ACPI C1 state ("hlt")    | intel_idle VM table     |
> +------------+--------------------------+-------------------------+

However, this is only for the processors known to intel_idle.

On processors whose IDs are not in intel_idle_ids, intel_idle on virt
will still refuse to register, so the first column remains applicable.

IOW, for this change to take effect on any new processors, their IDs
need to be added to intel_idle_ids.

IMO it would be prudent to mention this here.  I can add a paragraph
to that effect to the changelog, or you can send me one if you prefer.
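
For example, enabling this on a new processor means adding a match entry
to intel_idle_ids; purely illustrative (NEW_MODEL and idle_cpu_newmodel
below are placeholders, not real symbols):

static const struct x86_cpu_id intel_idle_ids[] __initconst = {
	/* ... existing entries ... */
	X86_MATCH_INTEL_FAM6_MODEL(NEW_MODEL, &idle_cpu_newmodel),
	{}
};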

> By providing the capability to do this in the intel_idle driver, we can
> do better than the fallback or ACPI-table methods. While the current change
> only matches the existing behavior, later patches in this series
> will add new capabilities such as optimized TLB flushing.
>
> To do this, a simplified version of the initialization function is created
> for VM guests; it will be called if the CPU is recognized but mwait is not
> supported and we are running in a VM guest.
>
> One thing to note is that the max latency (and break-even point) of this C1
> state is higher than that of the typical bare-metal C1 state, because hlt
> causes a VMEXIT and the cost of VMEXIT + hypervisor overhead + VMENTER is
> typically on the order of up to 5 microseconds, even if the hypervisor does
> not actually go into a hardware power-saving state.
>
> Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
> ---
>  drivers/idle/intel_idle.c | 125 +++++++++++++++++++++++++++++++++++++-
>  1 file changed, 124 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
> index 256c2d42e350..d2518cf36ab4 100644
> --- a/drivers/idle/intel_idle.c
> +++ b/drivers/idle/intel_idle.c
> @@ -199,6 +199,43 @@ static __cpuidle int intel_idle_xstate(struct cpuidle_device *dev,
>         return __intel_idle(dev, drv, index);
>  }
>
> +static __always_inline int __intel_idle_hlt(struct cpuidle_device *dev,
> +                                       struct cpuidle_driver *drv, int index)
> +{
> +       raw_safe_halt();
> +       raw_local_irq_disable();
> +       return index;
> +}
> +
> +/**
> + * intel_idle_hlt - Ask the processor to enter the given idle state using hlt.
> + * @dev: cpuidle device of the target CPU.
> + * @drv: cpuidle driver (assumed to point to intel_idle_driver).
> + * @index: Target idle state index.
> + *
> + * Use the HLT instruction to notify the processor that the CPU represented by
> + * @dev is idle and it can try to enter the idle state corresponding to @index.
> + *
> + * Must be called under local_irq_disable().
> + */
> +static __cpuidle int intel_idle_hlt(struct cpuidle_device *dev,
> +                               struct cpuidle_driver *drv, int index)
> +{
> +       return __intel_idle_hlt(dev, drv, index);
> +}
> +
> +static __cpuidle int intel_idle_hlt_irq_on(struct cpuidle_device *dev,
> +                                   struct cpuidle_driver *drv, int index)
> +{
> +       int ret;
> +
> +       raw_local_irq_enable();
> +       ret = __intel_idle_hlt(dev, drv, index);
> +       raw_local_irq_disable();
> +
> +       return ret;
> +}
> +
>  /**
>   * intel_idle_s2idle - Ask the processor to enter the given idle state.
>   * @dev: cpuidle device of the target CPU.
> @@ -1242,6 +1279,18 @@ static struct cpuidle_state snr_cstates[] __initdata = {
>                 .enter = NULL }
>  };
>
> +static struct cpuidle_state vmguest_cstates[] __initdata = {
> +       {
> +               .name = "C1",
> +               .desc = "HLT",
> +               .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_IRQ_ENABLE,
> +               .exit_latency = 5,
> +               .target_residency = 10,
> +               .enter = &intel_idle_hlt, },
> +       {
> +               .enter = NULL }
> +};
> +
>  static const struct idle_cpu idle_cpu_nehalem __initconst = {
>         .state_table = nehalem_cstates,
>         .auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE,
> @@ -1841,6 +1890,16 @@ static bool __init intel_idle_verify_cstate(unsigned int mwait_hint)
>
>  static void state_update_enter_method(struct cpuidle_state *state, int cstate)
>  {
> +       if (state->enter == intel_idle_hlt) {
> +               if (force_irq_on) {
> +                       pr_info("forced intel_idle_irq for state %d\n", cstate);
> +                       state->enter = intel_idle_hlt_irq_on;
> +               }
> +               return;
> +       }
> +       if (state->enter == intel_idle_hlt_irq_on)
> +               return; /* no update scenarios */
> +
>         if (state->flags & CPUIDLE_FLAG_INIT_XSTATE) {
>                 /*
>                  * Combining with XSTATE with IBRS or IRQ_ENABLE flags
> @@ -1874,6 +1933,29 @@ static void state_update_enter_method(struct cpuidle_state *state, int cstate)
>         }
>  }
>
> +/*
> + * For mwait based states, we want to verify the cpuid data to see if the state
> + * is actually supported by this specific CPU.
> + * For non-mwait based states, this check should be skipped.
> + */
> +static bool should_verify_mwait(struct cpuidle_state *state)
> +{
> +       if (state->enter == intel_idle_irq)
> +               return true;
> +       if (state->enter == intel_idle)
> +               return true;
> +       if (state->enter == intel_idle_ibrs)
> +               return true;
> +       if (state->enter == intel_idle_xstate)
> +               return true;
> +       if (state->enter == intel_idle_hlt)
> +               return false;
> +       if (state->enter == intel_idle_hlt_irq_on)
> +               return false;
> +
> +       return true;
> +}
> +
>  static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
>  {
>         int cstate;
> @@ -1922,7 +2004,7 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
>                 }
>
>                 mwait_hint = flg2MWAIT(cpuidle_state_table[cstate].flags);
> -               if (!intel_idle_verify_cstate(mwait_hint))
> +               if (should_verify_mwait(&cpuidle_state_table[cstate]) && !intel_idle_verify_cstate(mwait_hint))
>                         continue;
>
>                 /* Structure copy. */
> @@ -2056,6 +2138,45 @@ static void __init intel_idle_cpuidle_devices_uninit(void)
>                 cpuidle_unregister_device(per_cpu_ptr(intel_idle_cpuidle_devices, i));
>  }
>
> +static int __init intel_idle_vminit(const struct x86_cpu_id *id)
> +{
> +       int retval;
> +
> +       cpuidle_state_table = vmguest_cstates;
> +
> +       icpu = (const struct idle_cpu *)id->driver_data;
> +
> +       pr_debug("v" INTEL_IDLE_VERSION " model 0x%X\n",
> +                boot_cpu_data.x86_model);
> +
> +       intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device);
> +       if (!intel_idle_cpuidle_devices)
> +               return -ENOMEM;
> +
> +       intel_idle_cpuidle_driver_init(&intel_idle_driver);
> +
> +       retval = cpuidle_register_driver(&intel_idle_driver);
> +       if (retval) {
> +               struct cpuidle_driver *drv = cpuidle_get_driver();
> +               printk(KERN_DEBUG pr_fmt("intel_idle yielding to %s\n"),
> +                      drv ? drv->name : "none");
> +               goto init_driver_fail;
> +       }
> +
> +       retval = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "idle/intel:online",
> +                                  intel_idle_cpu_online, NULL);
> +       if (retval < 0)
> +               goto hp_setup_fail;
> +
> +       return 0;
> +hp_setup_fail:
> +       intel_idle_cpuidle_devices_uninit();
> +       cpuidle_unregister_driver(&intel_idle_driver);
> +init_driver_fail:
> +       free_percpu(intel_idle_cpuidle_devices);
> +       return retval;
> +}
> +
>  static int __init intel_idle_init(void)
>  {
>         const struct x86_cpu_id *id;
> @@ -2074,6 +2195,8 @@ static int __init intel_idle_init(void)
>         id = x86_match_cpu(intel_idle_ids);
>         if (id) {
>                 if (!boot_cpu_has(X86_FEATURE_MWAIT)) {
> +                       if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
> +                               return intel_idle_vminit(id);
>                         pr_debug("Please enable MWAIT in BIOS SETUP\n");
>                         return -ENODEV;
>                 }
> --
> 2.40.1
>
Xiaoyao Li July 17, 2023, 8:34 a.m. UTC | #4
+ KVM maillist.

On 6/5/2023 11:47 PM, arjan@linux.intel.com wrote:
...
>   
> +static int __init intel_idle_vminit(const struct x86_cpu_id *id)
> +{
> +	int retval;
> +
> +	cpuidle_state_table = vmguest_cstates;
> +
> +	icpu = (const struct idle_cpu *)id->driver_data;
> +
> +	pr_debug("v" INTEL_IDLE_VERSION " model 0x%X\n",
> +		 boot_cpu_data.x86_model);
> +
> +	intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device);
> +	if (!intel_idle_cpuidle_devices)
> +		return -ENOMEM;
> +
> +	intel_idle_cpuidle_driver_init(&intel_idle_driver);
> +
> +	retval = cpuidle_register_driver(&intel_idle_driver);
> +	if (retval) {
> +		struct cpuidle_driver *drv = cpuidle_get_driver();
> +		printk(KERN_DEBUG pr_fmt("intel_idle yielding to %s\n"),
> +		       drv ? drv->name : "none");
> +		goto init_driver_fail;
> +	}
> +
> +	retval = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "idle/intel:online",
> +				   intel_idle_cpu_online, NULL);
> +	if (retval < 0)
> +		goto hp_setup_fail;
> +
> +	return 0;
> +hp_setup_fail:
> +	intel_idle_cpuidle_devices_uninit();
> +	cpuidle_unregister_driver(&intel_idle_driver);
> +init_driver_fail:
> +	free_percpu(intel_idle_cpuidle_devices);
> +	return retval;
> +}
> +
>   static int __init intel_idle_init(void)
>   {
>   	const struct x86_cpu_id *id;
> @@ -2074,6 +2195,8 @@ static int __init intel_idle_init(void)
>   	id = x86_match_cpu(intel_idle_ids);
>   	if (id) {
>   		if (!boot_cpu_has(X86_FEATURE_MWAIT)) {
> +			if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
> +				return intel_idle_vminit(id);

It leads to the MSR access error below on SPR.

[    4.158636] unchecked MSR access error: RDMSR from 0xe2 at rIP: 
0xffffffffbcaeebed (intel_idle_init_cstates_icpu.constprop.0+0x2dd/0x5a0)
[    4.174991] Call Trace:
[    4.179611]  <TASK>
[    4.183610]  ? ex_handler_msr+0x11e/0x150
[    4.190624]  ? fixup_exception+0x17e/0x3c0
[    4.197648]  ? gp_try_fixup_and_notify+0x1d/0xc0
[    4.205579]  ? exc_general_protection+0x1bb/0x410
[    4.213620]  ? asm_exc_general_protection+0x26/0x30
[    4.221624]  ? __pfx_intel_idle_init+0x10/0x10
[    4.228588]  ? intel_idle_init_cstates_icpu.constprop.0+0x2dd/0x5a0
[    4.238632]  ? __pfx_intel_idle_init+0x10/0x10
[    4.246632]  ? __pfx_intel_idle_init+0x10/0x10
[    4.253616]  intel_idle_vminit.isra.0+0xf5/0x1d0
[    4.261580]  ? __pfx_intel_idle_init+0x10/0x10
[    4.269670]  ? __pfx_intel_idle_init+0x10/0x10
[    4.274605]  do_one_initcall+0x50/0x230
[    4.279873]  do_initcalls+0xb3/0x130
[    4.286535]  kernel_init_freeable+0x255/0x310
[    4.293688]  ? __pfx_kernel_init+0x10/0x10
[    4.300630]  kernel_init+0x1a/0x1c0
[    4.305681]  ret_from_fork+0x29/0x50
[    4.312700]  </TASK>

On Intel SPR, the call site is

intel_idle_vminit()
   -> intel_idle_cpuidle_driver_init()
     -> intel_idle_init_cstates_icpu()
       -> spr_idle_state_table_update()
         -> rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr);

However, current KVM doesn't provide emulation for
MSR_PKG_CST_CONFIG_CONTROL, so accessing it leads to a #GP.

>   			pr_debug("Please enable MWAIT in BIOS SETUP\n");
>   			return -ENODEV;
>   		}
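
One way to avoid the fault would be to switch the read in
spr_idle_state_table_update() to the fault-tolerant rdmsrl_safe() and
bail out when the MSR is not emulated; this is only a sketch, the actual
fix may look different:

static void __init spr_idle_state_table_update(void)
{
	unsigned long long msr;

	/*
	 * rdmsrl_safe() returns nonzero instead of raising a #GP when
	 * the hypervisor does not emulate MSR_PKG_CST_CONFIG_CONTROL.
	 */
	if (rdmsrl_safe(MSR_PKG_CST_CONFIG_CONTROL, &msr))
		return;

	/* ... rest of the C1/C1E state fixup unchanged ... */
}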
Rafael J. Wysocki July 17, 2023, 12:58 p.m. UTC | #5
On Mon, Jul 17, 2023 at 10:34 AM Xiaoyao Li <xiaoyao.li@intel.com> wrote:
>
> + KVM maillist.
>
> On 6/5/2023 11:47 PM, arjan@linux.intel.com wrote:
> ...
> >
> > +static int __init intel_idle_vminit(const struct x86_cpu_id *id)
> > +{
> > +     int retval;
> > +
> > +     cpuidle_state_table = vmguest_cstates;
> > +
> > +     icpu = (const struct idle_cpu *)id->driver_data;
> > +
> > +     pr_debug("v" INTEL_IDLE_VERSION " model 0x%X\n",
> > +              boot_cpu_data.x86_model);
> > +
> > +     intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device);
> > +     if (!intel_idle_cpuidle_devices)
> > +             return -ENOMEM;
> > +
> > +     intel_idle_cpuidle_driver_init(&intel_idle_driver);
> > +
> > +     retval = cpuidle_register_driver(&intel_idle_driver);
> > +     if (retval) {
> > +             struct cpuidle_driver *drv = cpuidle_get_driver();
> > +             printk(KERN_DEBUG pr_fmt("intel_idle yielding to %s\n"),
> > +                    drv ? drv->name : "none");
> > +             goto init_driver_fail;
> > +     }
> > +
> > +     retval = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "idle/intel:online",
> > +                                intel_idle_cpu_online, NULL);
> > +     if (retval < 0)
> > +             goto hp_setup_fail;
> > +
> > +     return 0;
> > +hp_setup_fail:
> > +     intel_idle_cpuidle_devices_uninit();
> > +     cpuidle_unregister_driver(&intel_idle_driver);
> > +init_driver_fail:
> > +     free_percpu(intel_idle_cpuidle_devices);
> > +     return retval;
> > +}
> > +
> >   static int __init intel_idle_init(void)
> >   {
> >       const struct x86_cpu_id *id;
> > @@ -2074,6 +2195,8 @@ static int __init intel_idle_init(void)
> >       id = x86_match_cpu(intel_idle_ids);
> >       if (id) {
> >               if (!boot_cpu_has(X86_FEATURE_MWAIT)) {
> > +                     if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
> > +                             return intel_idle_vminit(id);
>
> It leads to the MSR access error below on SPR.
>
> [    4.158636] unchecked MSR access error: RDMSR from 0xe2 at rIP:
> 0xffffffffbcaeebed (intel_idle_init_cstates_icpu.constprop.0+0x2dd/0x5a0)
> [    4.174991] Call Trace:
> [    4.179611]  <TASK>
> [    4.183610]  ? ex_handler_msr+0x11e/0x150
> [    4.190624]  ? fixup_exception+0x17e/0x3c0
> [    4.197648]  ? gp_try_fixup_and_notify+0x1d/0xc0
> [    4.205579]  ? exc_general_protection+0x1bb/0x410
> [    4.213620]  ? asm_exc_general_protection+0x26/0x30
> [    4.221624]  ? __pfx_intel_idle_init+0x10/0x10
> [    4.228588]  ? intel_idle_init_cstates_icpu.constprop.0+0x2dd/0x5a0
> [    4.238632]  ? __pfx_intel_idle_init+0x10/0x10
> [    4.246632]  ? __pfx_intel_idle_init+0x10/0x10
> [    4.253616]  intel_idle_vminit.isra.0+0xf5/0x1d0
> [    4.261580]  ? __pfx_intel_idle_init+0x10/0x10
> [    4.269670]  ? __pfx_intel_idle_init+0x10/0x10
> [    4.274605]  do_one_initcall+0x50/0x230
> [    4.279873]  do_initcalls+0xb3/0x130
> [    4.286535]  kernel_init_freeable+0x255/0x310
> [    4.293688]  ? __pfx_kernel_init+0x10/0x10
> [    4.300630]  kernel_init+0x1a/0x1c0
> [    4.305681]  ret_from_fork+0x29/0x50
> [    4.312700]  </TASK>
>
> On Intel SPR, the call site is
>
> intel_idle_vminit()
>    -> intel_idle_cpuidle_driver_init()
>      -> intel_idle_init_cstates_icpu()
>        -> spr_idle_state_table_update()
>          -> rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr);
>
> However, current KVM doesn't provide emulation for
> MSR_PKG_CST_CONFIG_CONTROL, so accessing it leads to a #GP.
>
> >                       pr_debug("Please enable MWAIT in BIOS SETUP\n");
> >                       return -ENODEV;
> >               }

Well, I'm waiting for a fix from Arjan, thanks!
Arjan van de Ven July 17, 2023, 2:07 p.m. UTC | #6
> It leads to the MSR access error below on SPR.
yeah I have a fix for this but Peter instead wants to delete the whole thing...
... so I'm sort of stuck in two worlds. I'll send the fix today but Rafael will need to
choose if he wants to revert or not
Rafael J. Wysocki July 17, 2023, 2:51 p.m. UTC | #7
On Mon, Jul 17, 2023 at 4:10 PM Arjan van de Ven <arjan@linux.intel.com> wrote:
>
> > It leads to the MSR access error below on SPR.
> yeah I have a fix for this but Peter instead wants to delete the whole thing...
> ... so I'm sort of stuck in two worlds. I'll send the fix today but Rafael will need to
> choose if he wants to revert or not

I thought that you wanted to fix this rather than to revert, but the
latter is entirely fine with me too.

Given Peter's general objection to the changes, a revert would
likely be more appropriate until the controversy is resolved.
Arjan van de Ven July 17, 2023, 2:53 p.m. UTC | #8
On 7/17/2023 7:51 AM, Rafael J. Wysocki wrote:
> On Mon, Jul 17, 2023 at 4:10 PM Arjan van de Ven <arjan@linux.intel.com> wrote:
>>
>>> It leads to the MSR access error below on SPR.
>> yeah I have a fix for this but Peter instead wants to delete the whole thing...
>> ... so I'm sort of stuck in two worlds. I'll send the fix today but Rafael will need to
>> choose if he wants to revert or not
> 
> I thought that you wanted to fix this rather than to revert, but the
> latter is entirely fine with me too.
> 

I would rather fix it, of course, and will send the fixes out shortly (final test ongoing)
diff mbox series

Patch

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 256c2d42e350..d2518cf36ab4 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -199,6 +199,43 @@  static __cpuidle int intel_idle_xstate(struct cpuidle_device *dev,
 	return __intel_idle(dev, drv, index);
 }
 
+static __always_inline int __intel_idle_hlt(struct cpuidle_device *dev,
+					struct cpuidle_driver *drv, int index)
+{
+	raw_safe_halt();
+	raw_local_irq_disable();
+	return index;
+}
+
+/**
+ * intel_idle_hlt - Ask the processor to enter the given idle state using hlt.
+ * @dev: cpuidle device of the target CPU.
+ * @drv: cpuidle driver (assumed to point to intel_idle_driver).
+ * @index: Target idle state index.
+ *
+ * Use the HLT instruction to notify the processor that the CPU represented by
+ * @dev is idle and it can try to enter the idle state corresponding to @index.
+ *
+ * Must be called under local_irq_disable().
+ */
+static __cpuidle int intel_idle_hlt(struct cpuidle_device *dev,
+				struct cpuidle_driver *drv, int index)
+{
+	return __intel_idle_hlt(dev, drv, index);
+}
+
+static __cpuidle int intel_idle_hlt_irq_on(struct cpuidle_device *dev,
+                                   struct cpuidle_driver *drv, int index)
+{
+       int ret;
+
+       raw_local_irq_enable();
+       ret = __intel_idle_hlt(dev, drv, index);
+       raw_local_irq_disable();
+
+       return ret;
+}
+
 /**
  * intel_idle_s2idle - Ask the processor to enter the given idle state.
  * @dev: cpuidle device of the target CPU.
@@ -1242,6 +1279,18 @@  static struct cpuidle_state snr_cstates[] __initdata = {
 		.enter = NULL }
 };
 
+static struct cpuidle_state vmguest_cstates[] __initdata = {
+	{
+		.name = "C1",
+		.desc = "HLT",
+		.flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_IRQ_ENABLE,
+		.exit_latency = 5,
+		.target_residency = 10,
+		.enter = &intel_idle_hlt, },
+	{
+		.enter = NULL }
+};
+
 static const struct idle_cpu idle_cpu_nehalem __initconst = {
 	.state_table = nehalem_cstates,
 	.auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE,
@@ -1841,6 +1890,16 @@  static bool __init intel_idle_verify_cstate(unsigned int mwait_hint)
 
 static void state_update_enter_method(struct cpuidle_state *state, int cstate)
 {
+	if (state->enter == intel_idle_hlt) {
+		if (force_irq_on) {
+			pr_info("forced intel_idle_irq for state %d\n", cstate);
+			state->enter = intel_idle_hlt_irq_on;
+		}
+		return;
+	}
+	if (state->enter == intel_idle_hlt_irq_on)
+		return; /* no update scenarios */
+
 	if (state->flags & CPUIDLE_FLAG_INIT_XSTATE) {
 		/*
 		 * Combining with XSTATE with IBRS or IRQ_ENABLE flags
@@ -1874,6 +1933,29 @@  static void state_update_enter_method(struct cpuidle_state *state, int cstate)
 	}
 }
 
+/*
+ * For mwait based states, we want to verify the cpuid data to see if the state
+ * is actually supported by this specific CPU.
+ * For non-mwait based states, this check should be skipped.
+ */
+static bool should_verify_mwait(struct cpuidle_state *state)
+{
+	if (state->enter == intel_idle_irq)
+		return true;
+	if (state->enter == intel_idle)
+		return true;
+	if (state->enter == intel_idle_ibrs)
+		return true;
+	if (state->enter == intel_idle_xstate)
+		return true;
+	if (state->enter == intel_idle_hlt)
+		return false;
+	if (state->enter == intel_idle_hlt_irq_on)
+		return false;
+
+	return true;
+}
+
 static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
 {
 	int cstate;
@@ -1922,7 +2004,7 @@  static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
 		}
 
 		mwait_hint = flg2MWAIT(cpuidle_state_table[cstate].flags);
-		if (!intel_idle_verify_cstate(mwait_hint))
+		if (should_verify_mwait(&cpuidle_state_table[cstate]) && !intel_idle_verify_cstate(mwait_hint))
 			continue;
 
 		/* Structure copy. */
@@ -2056,6 +2138,45 @@  static void __init intel_idle_cpuidle_devices_uninit(void)
 		cpuidle_unregister_device(per_cpu_ptr(intel_idle_cpuidle_devices, i));
 }
 
+static int __init intel_idle_vminit(const struct x86_cpu_id *id)
+{
+	int retval;
+
+	cpuidle_state_table = vmguest_cstates;
+
+	icpu = (const struct idle_cpu *)id->driver_data;
+
+	pr_debug("v" INTEL_IDLE_VERSION " model 0x%X\n",
+		 boot_cpu_data.x86_model);
+
+	intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device);
+	if (!intel_idle_cpuidle_devices)
+		return -ENOMEM;
+
+	intel_idle_cpuidle_driver_init(&intel_idle_driver);
+
+	retval = cpuidle_register_driver(&intel_idle_driver);
+	if (retval) {
+		struct cpuidle_driver *drv = cpuidle_get_driver();
+		printk(KERN_DEBUG pr_fmt("intel_idle yielding to %s\n"),
+		       drv ? drv->name : "none");
+		goto init_driver_fail;
+	}
+
+	retval = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "idle/intel:online",
+				   intel_idle_cpu_online, NULL);
+	if (retval < 0)
+		goto hp_setup_fail;
+
+	return 0;
+hp_setup_fail:
+	intel_idle_cpuidle_devices_uninit();
+	cpuidle_unregister_driver(&intel_idle_driver);
+init_driver_fail:
+	free_percpu(intel_idle_cpuidle_devices);
+	return retval;
+}
+
 static int __init intel_idle_init(void)
 {
 	const struct x86_cpu_id *id;
@@ -2074,6 +2195,8 @@  static int __init intel_idle_init(void)
 	id = x86_match_cpu(intel_idle_ids);
 	if (id) {
 		if (!boot_cpu_has(X86_FEATURE_MWAIT)) {
+			if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+				return intel_idle_vminit(id);
 			pr_debug("Please enable MWAIT in BIOS SETUP\n");
 			return -ENODEV;
 		}