diff mbox series

KVM: x86: use TPAUSE to replace PAUSE in halt polling

Message ID 20220824091117.767363-1-dapeng1.mi@intel.com
State New
Headers show
Series KVM: x86: use TPAUSE to replace PAUSE in halt polling | expand

Commit Message

Mi, Dapeng1 Aug. 24, 2022, 9:11 a.m. UTC
TPAUSE is a new instruction on Intel processors which can instruct the
processor to enter a power/performance optimized state. Halt polling
uses the PAUSE instruction to wait until the vCPU is woken up. The
polling time could be long and cause extra power consumption in some cases.

Use TPAUSE to replace the PAUSE instruction in halt polling to get
a better power saving and performance.

Signed-off-by: Dapeng Mi <dapeng1.mi@intel.com>
---
 drivers/cpuidle/poll_state.c |  3 ++-
 include/linux/kvm_host.h     | 20 ++++++++++++++++++++
 virt/kvm/kvm_main.c          |  2 +-
 3 files changed, 23 insertions(+), 2 deletions(-)

Comments

David Laight Aug. 24, 2022, 2:08 p.m. UTC | #1
From: Dapeng Mi
> Sent: 24 August 2022 10:11
> 
> TPAUSE is a new instruction on Intel processors which can instruct
> processor enters a power/performance optimized state. Halt polling
> uses PAUSE instruction to wait vCPU is waked up. The polling time
> could be long and cause extra power consumption in some cases.
> 
> Use TPAUSE to replace the PAUSE instruction in halt polling to get
> a better power saving and performance.

What is the effect on wakeup latency?
Quite often that is far more important than a bit of power saving.

The automatic entry of sleep states is a PITA already.
Block 30 RT threads in cv_wait() and then do cv_broadcast().
Use ftrace to see just how long it takes the last thread
to wake up.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Sean Christopherson Aug. 24, 2022, 3:26 p.m. UTC | #2
On Wed, Aug 24, 2022, Dapeng Mi wrote:
> TPAUSE is a new instruction on Intel processors which can instruct
> processor enters a power/performance optimized state. Halt polling
> uses PAUSE instruction to wait vCPU is waked up. The polling time
> could be long and cause extra power consumption in some cases.
> 
> Use TPAUSE to replace the PAUSE instruction in halt polling to get
> a better power saving and performance.

Better power savings, yes.  Better performance?  Not necessarily.  Using TPAUSE
for  a "successful" halt poll is likely to yield _worse_ performance from the
vCPU's perspective due to the increased latency.

> Signed-off-by: Dapeng Mi <dapeng1.mi@intel.com>
> ---
>  drivers/cpuidle/poll_state.c |  3 ++-
>  include/linux/kvm_host.h     | 20 ++++++++++++++++++++
>  virt/kvm/kvm_main.c          |  2 +-
>  3 files changed, 23 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/cpuidle/poll_state.c b/drivers/cpuidle/poll_state.c
> index f7e83613ae94..51ec333cbf80 100644
> --- a/drivers/cpuidle/poll_state.c
> +++ b/drivers/cpuidle/poll_state.c
> @@ -7,6 +7,7 @@
>  #include <linux/sched.h>
>  #include <linux/sched/clock.h>
>  #include <linux/sched/idle.h>
> +#include <linux/kvm_host.h>
>  
>  #define POLL_IDLE_RELAX_COUNT	200
>  
> @@ -25,7 +26,7 @@ static int __cpuidle poll_idle(struct cpuidle_device *dev,
>  		limit = cpuidle_poll_time(drv, dev);
>  
>  		while (!need_resched()) {
> -			cpu_relax();
> +			kvm_cpu_poll_pause(limit);

poll_idle() absolutely should not be calling into KVM code.

>  			if (loop_count++ < POLL_IDLE_RELAX_COUNT)
>  				continue;
>  
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index f4519d3689e1..810e749949b7 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -35,6 +35,7 @@
>  #include <linux/interval_tree.h>
>  #include <linux/rbtree.h>
>  #include <linux/xarray.h>
> +#include <linux/delay.h>
>  #include <asm/signal.h>
>  
>  #include <linux/kvm.h>
> @@ -2247,6 +2248,25 @@ static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu)
>  }
>  #endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */
>  
> +/*
> + * This function is intended to replace the cpu_relax function in
> + * halt polling. If TPAUSE instruction is supported, use TPAUSE
> + * instead fo PAUSE to get better power saving and performance.
> + * Selecting 1 us is a compromise between scheduling latency and
> + * power saving time.
> + */
> +static inline void kvm_cpu_poll_pause(u64 timeout_ns)
> +{
> +#ifdef CONFIG_X86

This is not preferred the way to insert arch-specific behavior into common KVM code.
Assuming the goal is to avoid a function call, use an #ifndef here and then #define
the flag in x86's kvm_host.h, e.g.

#ifndef CONFIG_HAVE_KVM_ARCH_HALT_POLL_PAUSE
static inline kvm_cpu_halt_poll_pause(u64 timeout_ns)
{
	cpu_relax();
}
#endif

It's not obvious that we need to avoid a call here though, in which case a

  __weak void kvm_arch_cpu_halt_poll_pause(struct kvm *kvm)
  {

  }

with an x86 implementation will suffice.


> +	if (static_cpu_has(X86_FEATURE_WAITPKG) && timeout_ns > 1000)
> +		udelay(1);

This is far too arbitrary.  Wake events from other vCPU are not necessarily
accompanied by an IRQ, which means that delaying for 1us may really truly delay
for 1us before detecting a pending wake event.

If this is something we want to utilize in KVM, it should be controllable by
userspace, probably via module param, and likely off by default.

E.g. 

  unsigned int halt_poll_tpause_ns;

and then

  if (timeout_ns >= halt_poll_tpause_ns)
  	udelay(halt_poll_tpause_ns);

with halt_poll_tpause_ns zeroed out during setup if TPAUSE isn't supported.

I say "if", because I think this needs to come with performance numbers to show
the impact on guest latency so that KVM and its users can make an informed decision.
And if it's unlikely that anyone will ever want to enable TPAUSE for halt polling,
then it's not worth the extra complexity in KVM.

> +	else
> +		cpu_relax();
> +#else
> +	cpu_relax();
> +#endif
> +}
> +
>  /*
>   * This defines how many reserved entries we want to keep before we
>   * kick the vcpu to the userspace to avoid dirty ring full.  This
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 584a5bab3af3..4afa776d21bd 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -3510,7 +3510,7 @@ void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
>  			 */
>  			if (kvm_vcpu_check_block(vcpu) < 0)
>  				goto out;
> -			cpu_relax();
> +			kvm_cpu_poll_pause(vcpu->halt_poll_ns);

This is wrong, vcpu->halt_poll_ns is the total poll time, not the time remaining.
E.g. if the max poll time is 1001 ns, and KVM has already waited for 1000 ns, then
udelay(1) will cause KVM to wait for ~2000ns total.  There's always going to be
some amount of overrun, but overrun by a few ns is quite different than overrun
by a few thousand ns.

>  			poll_end = cur = ktime_get();
>  		} while (kvm_vcpu_can_poll(cur, stop));
>  	}
> -- 
> 2.34.1
>
Paolo Bonzini Aug. 24, 2022, 5:19 p.m. UTC | #3
On 8/24/22 17:26, Sean Christopherson wrote:
> I say "if", because I think this needs to come with performance numbers to show
> the impact on guest latency so that KVM and its users can make an informed decision.
> And if it's unlikely that anyone will ever want to enable TPAUSE for halt polling,
> then it's not worth the extra complexity in KVM.

Yeah, halt polling works around perhaps the biggest performance issue 
with VMs compared to bare metal (so much that it's even possible to move 
halt polling _inside_ the guest for extra performance).

I am ready to be proven wrong but I doubt TPAUSE will have a small 
effect, and if one wants the most power saving they should disable halt 
polling.  Perhaps KVM could do it automatically if the powersaving 
governor is in effect?

Paolo
Mi, Dapeng1 Aug. 25, 2022, 11:08 a.m. UTC | #4
> From: David Laight <David.Laight@ACULAB.COM>
> Sent: Wednesday, August 24, 2022 10:08 PM
> To: Mi, Dapeng1 <dapeng1.mi@intel.com>; rafael@kernel.org;
> daniel.lezcano@linaro.org; pbonzini@redhat.com
> Cc: linux-pm@vger.kernel.org; linux-kernel@vger.kernel.org;
> kvm@vger.kernel.org; zhenyuw@linux.intel.com
> Subject: RE: [PATCH] KVM: x86: use TPAUSE to replace PAUSE in halt polling
> 
> From: Dapeng Mi
> > Sent: 24 August 2022 10:11
> >
> > TPAUSE is a new instruction on Intel processors which can instruct
> > processor enters a power/performance optimized state. Halt polling
> > uses PAUSE instruction to wait vCPU is waked up. The polling time
> > could be long and cause extra power consumption in some cases.
> >
> > Use TPAUSE to replace the PAUSE instruction in halt polling to get a
> > better power saving and performance.
> 
> What is the effect on wakeup latency?
> Quite often that is far more important than a bit of power saving.

In theory, the increased wakeup latency should be less than 1us. I thought this latency impact should be minimal. I have run two scheduling related benchmarks, hackbench and schbench.  I didn't see this change would obviously impact the performance.

When running these two scheduling benchmarks on host, a FIO workload is running in a Linux VM simultaneously, FIO would trigger a large number of HLT VM-exit and then trigger haltpolling, then we can see how TPAUSE can impact the performance.

Here are the hackbench and schbench data on Intel ADL platform.

Hackbench 		base		TPAUSE		%delta
Group-1		0.056		0.052		7.14%
Group-4		0.165		0.164		0.61%
Group-8		0.313		0.309		1.28%
Group-16		0.834		0.842		-0.96%

Schbench - Latency percentiles (usec)		base 		TPAUSE	
./schbench -m 1
	50.0th					15		13		
	99.0th					221		203
./schbench -m 2
	50.0th					26		23
	99.0th					16368		16544
./schbench -m 4
	50.0th					56		60
	99.0th					33984		34112
 
Since the schbench benchmark is not so stable, but I can see the data is on a same level.

> The automatic entry of sleep states is a PITA already.
> Block 30 RT threads in cv_wait() and then do cv_broadcast().
> Use ftrace to see just how long it takes the last thread to wake up.

I think this test is similar to the hackbench and schbench tests; it should have a similar result.

Anyway, performance and power is a tradeoff, it depends on which side we think is more important.

> 
> 	David
> 
> -
> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1
> 1PT, UK Registration No: 1397386 (Wales)
Mi, Dapeng1 Aug. 25, 2022, 11:31 a.m. UTC | #5
> From: Sean Christopherson <seanjc@google.com>
> Sent: Wednesday, August 24, 2022 11:27 PM
> To: Mi, Dapeng1 <dapeng1.mi@intel.com>
> Cc: rafael@kernel.org; daniel.lezcano@linaro.org; pbonzini@redhat.com; linux-
> pm@vger.kernel.org; linux-kernel@vger.kernel.org; kvm@vger.kernel.org;
> zhenyuw@linux.intel.com
> Subject: Re: [PATCH] KVM: x86: use TPAUSE to replace PAUSE in halt polling
> 
> On Wed, Aug 24, 2022, Dapeng Mi wrote:
> > TPAUSE is a new instruction on Intel processors which can instruct
> > processor enters a power/performance optimized state. Halt polling
> > uses PAUSE instruction to wait vCPU is waked up. The polling time
> > could be long and cause extra power consumption in some cases.
> >
> > Use TPAUSE to replace the PAUSE instruction in halt polling to get a
> > better power saving and performance.
> 
> Better power savings, yes.  Better performance?  Not necessarily.  Using TPAUSE
> for  a "successful" halt poll is likely to yield _worse_ performance from the
> vCPU's perspective due to the increased latency.

The Intel SDM says the C0.2 state which TPAUSE instruction currently enters in kernel would "Improves performance of the other SMT thread(s) on the same core." For some multi-thread workload, it could improve the performance. But considering the increased latency, I'm not sure how large the performance can improve. But we don't see obvious performance downgrade at least in our tests.

> 
> > Signed-off-by: Dapeng Mi <dapeng1.mi@intel.com>
> > ---
> >  drivers/cpuidle/poll_state.c |  3 ++-
> >  include/linux/kvm_host.h     | 20 ++++++++++++++++++++
> >  virt/kvm/kvm_main.c          |  2 +-
> >  3 files changed, 23 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/cpuidle/poll_state.c
> > b/drivers/cpuidle/poll_state.c index f7e83613ae94..51ec333cbf80 100644
> > --- a/drivers/cpuidle/poll_state.c
> > +++ b/drivers/cpuidle/poll_state.c
> > @@ -7,6 +7,7 @@
> >  #include <linux/sched.h>
> >  #include <linux/sched/clock.h>
> >  #include <linux/sched/idle.h>
> > +#include <linux/kvm_host.h>
> >
> >  #define POLL_IDLE_RELAX_COUNT	200
> >
> > @@ -25,7 +26,7 @@ static int __cpuidle poll_idle(struct cpuidle_device *dev,
> >  		limit = cpuidle_poll_time(drv, dev);
> >
> >  		while (!need_resched()) {
> > -			cpu_relax();
> > +			kvm_cpu_poll_pause(limit);
> 
> poll_idle() absolutely should not be calling into KVM code.

Ok, so how should we handle this case? Duplicate a piece of code?

> 
> >  			if (loop_count++ < POLL_IDLE_RELAX_COUNT)
> >  				continue;
> >
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index
> > f4519d3689e1..810e749949b7 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -35,6 +35,7 @@
> >  #include <linux/interval_tree.h>
> >  #include <linux/rbtree.h>
> >  #include <linux/xarray.h>
> > +#include <linux/delay.h>
> >  #include <asm/signal.h>
> >
> >  #include <linux/kvm.h>
> > @@ -2247,6 +2248,25 @@ static inline void
> > kvm_handle_signal_exit(struct kvm_vcpu *vcpu)  }  #endif /*
> > CONFIG_KVM_XFER_TO_GUEST_WORK */
> >
> > +/*
> > + * This function is intended to replace the cpu_relax function in
> > + * halt polling. If TPAUSE instruction is supported, use TPAUSE
> > + * instead fo PAUSE to get better power saving and performance.
> > + * Selecting 1 us is a compromise between scheduling latency and
> > + * power saving time.
> > + */
> > +static inline void kvm_cpu_poll_pause(u64 timeout_ns) { #ifdef
> > +CONFIG_X86
> 
> This is not preferred the way to insert arch-specific behavior into common KVM
> code.
> Assuming the goal is to avoid a function call, use an #ifndef here and then
> #define the flag in x86's kvm_host.h, e.g.
> 
> #ifndef CONFIG_HAVE_KVM_ARCH_HALT_POLL_PAUSE
> static inline kvm_cpu_halt_poll_pause(u64 timeout_ns) {
> 	cpu_relax();
> }
> #endif
> 
> It's not obvious that we need to avoid a call here though, in which case a
> 
>   __weak void kvm_arch_cpu_halt_poll_pause(struct kvm *kvm)
>   {
> 
>   }
> 
> with an x86 implementation will suffice.

Sure. Thanks.

> 
> 
> > +	if (static_cpu_has(X86_FEATURE_WAITPKG) && timeout_ns > 1000)
> > +		udelay(1);
> 
> This is far too arbitrary.  Wake events from other vCPU are not necessarily
> accompanied by an IRQ, which means that delaying for 1us may really truly
> delay for 1us before detecting a pending wake event.
> 
> If this is something we want to utilize in KVM, it should be controllable by
> userspace, probably via module param, and likely off by default.
> 
> E.g.
> 
>   unsigned int halt_poll_tpause_ns;
> 
> and then
> 
>   if (timeout_ns >= halt_poll_tpause_ns)
>   	udelay(halt_poll_tpause_ns);
> 
> with halt_poll_tpause_ns zeroed out during setup if TPAUSE isn't supported.

I see. Thanks.

> 
> I say "if", because I think this needs to come with performance numbers to show
> the impact on guest latency so that KVM and its users can make an informed
> decision.
> And if it's unlikely that anyone will ever want to enable TPAUSE for halt polling,
> then it's not worth the extra complexity in KVM.

I ever run two scheduling related benchmarks, hackbench and schbench, I didn't see  there are obvious performance impact.

Here are the hackbench and schbench data on Intel ADL platform.

Hackbench 		base		TPAUSE		%delta
Group-1		0.056		0.052		7.14%
Group-4		0.165		0.164		0.61%
Group-8		0.313		0.309		1.28%
Group-16		0.834		0.842		-0.96%

Schbench - Latency percentiles (usec)		base 		TPAUSE	
./schbench -m 1
	50.0th					15		13		
	99.0th					221		203
./schbench -m 2
	50.0th					26		23
	99.0th					16368		16544
./schbench -m 4
	50.0th					56		60
	99.0th					33984		34112

Anyway, I would run more performance tests and see whether there are obvious performance impact.

> 
> > +	else
> > +		cpu_relax();
> > +#else
> > +	cpu_relax();
> > +#endif
> > +}
> > +
> >  /*
> >   * This defines how many reserved entries we want to keep before we
> >   * kick the vcpu to the userspace to avoid dirty ring full.  This
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index
> > 584a5bab3af3..4afa776d21bd 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -3510,7 +3510,7 @@ void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
> >  			 */
> >  			if (kvm_vcpu_check_block(vcpu) < 0)
> >  				goto out;
> > -			cpu_relax();
> > +			kvm_cpu_poll_pause(vcpu->halt_poll_ns);
> 
> This is wrong, vcpu->halt_poll_ns is the total poll time, not the time remaining.
> E.g. if the max poll time is 1001 ns, and KVM has already waited for 1000 ns,
> then
> udelay(1) will cause KVM to wait for ~2000ns total.  There's always going to be
> some amount of overrun, but overrun by a few ns is quite different than overrun
> by a few thousand ns.

Yes, actually I calculate the remaining time and pass it to TPAUSE in earlier change. But this would make the code become complex comparing "cpu_relax". I'm concerning the complex code may impact the performance, so just simply it.

> 
> >  			poll_end = cur = ktime_get();
> >  		} while (kvm_vcpu_can_poll(cur, stop));
> >  	}
> > --
> > 2.34.1
> >
Mi, Dapeng1 Aug. 25, 2022, 12:16 p.m. UTC | #6
> From: Paolo Bonzini <pbonzini@redhat.com>
> Sent: Thursday, August 25, 2022 1:19 AM
> To: Christopherson,, Sean <seanjc@google.com>; Mi, Dapeng1
> <dapeng1.mi@intel.com>
> Cc: rafael@kernel.org; daniel.lezcano@linaro.org; linux-pm@vger.kernel.org;
> linux-kernel@vger.kernel.org; kvm@vger.kernel.org; zhenyuw@linux.intel.com
> Subject: Re: [PATCH] KVM: x86: use TPAUSE to replace PAUSE in halt polling
> 
> On 8/24/22 17:26, Sean Christopherson wrote:
> > I say "if", because I think this needs to come with performance
> > numbers to show the impact on guest latency so that KVM and its users can
> make an informed decision.
> > And if it's unlikely that anyone will ever want to enable TPAUSE for
> > halt polling, then it's not worth the extra complexity in KVM.
> 
> Yeah, halt polling works around perhaps the biggest performance issue with VMs
> compared to bare metal (so much that it's even possible to move halt polling
> _inside_ the guest for extra performance).
> 
> I am ready to be proven wrong but I doubt TPAUSE will have a small effect, and
> if one wants the most power saving they should disable halt polling.  Perhaps
> KVM could do it automatically if the powersaving governor is in effect?

Paolo, 

In our tests, we see halt polling consumes too much CPU resources and power. For example, In video playback case,
The CPU utilization of halt polling is 17% and brings 7% extra power consumption comparing with disabling halt polling.

Halt polling seems to consume too much cpu resource and power than imagine, especially for Client platform, it make things worse.
Base on our observation, TPAUSE could improve 1% ~ 2% power saving. Disabling halt polling is another alternative method we are thinking.
Base on our tests, we don't see there are obvious performance downgrade even for FIO and netperf on Intel Alderlake platform. It looks 
the context switch latency could not be so large on the latest CPU. 

Yes, you are right, it could be a better method to make KVM enable/disable halt polling base on the CPU performance governor. 

> 
> Paolo
Paolo Bonzini Aug. 26, 2022, 10:14 a.m. UTC | #7
On 8/25/22 13:31, Mi, Dapeng1 wrote:
>> I say "if", because I think this needs to come with performance numbers to show
>> the impact on guest latency so that KVM and its users can make an informed
>> decision.
>> And if it's unlikely that anyone will ever want to enable TPAUSE for halt polling,
>> then it's not worth the extra complexity in KVM.
> I ever run two scheduling related benchmarks, hackbench and schbench, I didn't see  there are obvious performance impact.
> 
> Here are the hackbench and schbench data on Intel ADL platform.

Can you confirm (using debugfs for example) that halt polling is used 
while hackbench is running, and not used while it is not running?

In particular, I think you need to run the server and client on 
different VMs, for example using netperf's UDP_RR test.  With hackbench 
the ping-pong is simply between two tasks on the same CPU, and the 
hypervisor is not exercised at all.

Paolo
Mi, Dapeng1 Aug. 30, 2022, 5:56 a.m. UTC | #8
> From: Paolo Bonzini <pbonzini@redhat.com>
> Sent: Friday, August 26, 2022 6:14 PM
> To: Mi, Dapeng1 <dapeng1.mi@intel.com>; Christopherson,, Sean
> <seanjc@google.com>
> Cc: rafael@kernel.org; daniel.lezcano@linaro.org; linux-pm@vger.kernel.org;
> linux-kernel@vger.kernel.org; kvm@vger.kernel.org;
> zhenyuw@linux.intel.com
> Subject: Re: [PATCH] KVM: x86: use TPAUSE to replace PAUSE in halt polling
> 
> On 8/25/22 13:31, Mi, Dapeng1 wrote:
> >> I say "if", because I think this needs to come with performance
> >> numbers to show the impact on guest latency so that KVM and its users
> >> can make an informed decision.
> >> And if it's unlikely that anyone will ever want to enable TPAUSE for
> >> halt polling, then it's not worth the extra complexity in KVM.
> > I ever run two scheduling related benchmarks, hackbench and schbench, I
> didn't see  there are obvious performance impact.
> >
> > Here are the hackbench and schbench data on Intel ADL platform.
> 
> Can you confirm (using debugfs for example) that halt polling is used while
> hackbench is running, and not used while it is not running?

Sorry, I may not describe the test case clearly. The hackbench and schbench are run on Host 
rather than a VM. When the hackbench or schbench is run on Host, there is a FIO workload running
in a VM in the background and the FIO would trigger a large number of HLT VM-exits and eventually
invoke halt polling. 

In this test, I want to check whether potential polling time extending would increase the scheduling
latency on host. But it looks the impact for scheduling latency is quite minimal.

> 
> In particular, I think you need to run the server and client on different VMs,
> for example using netperf's UDP_RR test.  With hackbench the ping-pong is
> simply between two tasks on the same CPU, and the hypervisor is not
> exercised at all.
> 

Here are the netperf's UDP_RR test result between two VMs locate on two different physical machines.

Netperf 			Vanilla (Avg.)		TPAUSE (Avg.)		%Delta
UDP_RR (Trans. Rate/s)		503.8			503.9			0.02%

It looks there is no obvious difference with TPAUSE change on UDP RR test.


> Paolo
diff mbox series

Patch

diff --git a/drivers/cpuidle/poll_state.c b/drivers/cpuidle/poll_state.c
index f7e83613ae94..51ec333cbf80 100644
--- a/drivers/cpuidle/poll_state.c
+++ b/drivers/cpuidle/poll_state.c
@@ -7,6 +7,7 @@ 
 #include <linux/sched.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/idle.h>
+#include <linux/kvm_host.h>
 
 #define POLL_IDLE_RELAX_COUNT	200
 
@@ -25,7 +26,7 @@  static int __cpuidle poll_idle(struct cpuidle_device *dev,
 		limit = cpuidle_poll_time(drv, dev);
 
 		while (!need_resched()) {
-			cpu_relax();
+			kvm_cpu_poll_pause(limit);
 			if (loop_count++ < POLL_IDLE_RELAX_COUNT)
 				continue;
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index f4519d3689e1..810e749949b7 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -35,6 +35,7 @@ 
 #include <linux/interval_tree.h>
 #include <linux/rbtree.h>
 #include <linux/xarray.h>
+#include <linux/delay.h>
 #include <asm/signal.h>
 
 #include <linux/kvm.h>
@@ -2247,6 +2248,25 @@  static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu)
 }
 #endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */
 
+/*
+ * This function is intended to replace the cpu_relax function in
+ * halt polling. If TPAUSE instruction is supported, use TPAUSE
+ * instead of PAUSE to get better power saving and performance.
+ * Selecting 1 us is a compromise between scheduling latency and
+ * power saving time.
+ */
+static inline void kvm_cpu_poll_pause(u64 timeout_ns)
+{
+#ifdef CONFIG_X86
+	if (static_cpu_has(X86_FEATURE_WAITPKG) && timeout_ns > 1000)
+		udelay(1);
+	else
+		cpu_relax();
+#else
+	cpu_relax();
+#endif
+}
+
 /*
  * This defines how many reserved entries we want to keep before we
  * kick the vcpu to the userspace to avoid dirty ring full.  This
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 584a5bab3af3..4afa776d21bd 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3510,7 +3510,7 @@  void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
 			 */
 			if (kvm_vcpu_check_block(vcpu) < 0)
 				goto out;
-			cpu_relax();
+			kvm_cpu_poll_pause(vcpu->halt_poll_ns);
 			poll_end = cur = ktime_get();
 		} while (kvm_vcpu_can_poll(cur, stop));
 	}