[v2,08/18] arm64: KVM: add support to save/restore SPE profiling buffer controls

Message ID 20191220143025.33853-9-andrew.murray@arm.com
State New
Headers show
Series
  • [v2,01/18] dt-bindings: ARM SPE: highlight the need for PPI partitions on heterogeneous systems
Related show

Commit Message

Andrew Murray Dec. 20, 2019, 2:30 p.m.
From: Sudeep Holla <sudeep.holla@arm.com>


Currently since we don't support profiling using SPE in the guests,
we just save the PMSCR_EL1, flush the profiling buffers and disable
sampling. However in order to support simultaneous sampling both in
the host and guests, we need to save and reatore the complete SPE
profiling buffer controls' context.

Let's add the support for the same and keep it disabled for now.
We can enable it conditionally only if guests are allowed to use
SPE.

Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>

[ Clear PMBSR bit when saving state to prevent spurious interrupts ]
Signed-off-by: Andrew Murray <andrew.murray@arm.com>

---
 arch/arm64/kvm/hyp/debug-sr.c | 51 +++++++++++++++++++++++++++++------
 1 file changed, 43 insertions(+), 8 deletions(-)

-- 
2.21.0

Comments

Marc Zyngier Dec. 21, 2019, 1:57 p.m. | #1
On Fri, 20 Dec 2019 14:30:15 +0000
Andrew Murray <andrew.murray@arm.com> wrote:

> From: Sudeep Holla <sudeep.holla@arm.com>

> 

> Currently since we don't support profiling using SPE in the guests,

> we just save the PMSCR_EL1, flush the profiling buffers and disable

> sampling. However in order to support simultaneous sampling both in


Is the sampling actually simultaneous? I don't believe so (the whole
series would be much simpler if it was).

> the host and guests, we need to save and reatore the complete SPE


s/reatore/restore/

> profiling buffer controls' context.

> 

> Let's add the support for the same and keep it disabled for now.

> We can enable it conditionally only if guests are allowed to use

> SPE.

> 

> Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>

> [ Clear PMBSR bit when saving state to prevent spurious interrupts ]

> Signed-off-by: Andrew Murray <andrew.murray@arm.com>

> ---

>  arch/arm64/kvm/hyp/debug-sr.c | 51 +++++++++++++++++++++++++++++------

>  1 file changed, 43 insertions(+), 8 deletions(-)

> 

> diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c

> index 8a70a493345e..12429b212a3a 100644

> --- a/arch/arm64/kvm/hyp/debug-sr.c

> +++ b/arch/arm64/kvm/hyp/debug-sr.c

> @@ -85,7 +85,8 @@

>  	default:	write_debug(ptr[0], reg, 0);			\

>  	}

>  

> -static void __hyp_text __debug_save_spe_nvhe(struct kvm_cpu_context *ctxt)

> +static void __hyp_text

> +__debug_save_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)


nit: don't split lines like this if you can avoid it. You can put the
full_ctxt parameter on a separate line instead.

>  {

>  	u64 reg;

>  

> @@ -102,22 +103,46 @@ static void __hyp_text __debug_save_spe_nvhe(struct kvm_cpu_context *ctxt)

>  	if (reg & BIT(SYS_PMBIDR_EL1_P_SHIFT))

>  		return;

>  

> -	/* No; is the host actually using the thing? */

> -	reg = read_sysreg_s(SYS_PMBLIMITR_EL1);

> -	if (!(reg & BIT(SYS_PMBLIMITR_EL1_E_SHIFT)))

> +	/* Save the control register and disable data generation */

> +	ctxt->sys_regs[PMSCR_EL1] = read_sysreg_el1(SYS_PMSCR);

> +

> +	if (!ctxt->sys_regs[PMSCR_EL1])


Shouldn't you check the enable bits instead of relying on the whole
thing being zero?

>  		return;

>  

>  	/* Yes; save the control register and disable data generation */

> -	ctxt->sys_regs[PMSCR_EL1] = read_sysreg_el1(SYS_PMSCR);


You've already saved the control register...

>  	write_sysreg_el1(0, SYS_PMSCR);

>  	isb();

>  

>  	/* Now drain all buffered data to memory */

>  	psb_csync();

>  	dsb(nsh);

> +

> +	if (!full_ctxt)

> +		return;

> +

> +	ctxt->sys_regs[PMBLIMITR_EL1] = read_sysreg_s(SYS_PMBLIMITR_EL1);

> +	write_sysreg_s(0, SYS_PMBLIMITR_EL1);

> +

> +	/*

> +	 * As PMBSR is conditionally restored when returning to the host we

> +	 * must ensure the service bit is unset here to prevent a spurious

> +	 * host SPE interrupt from being raised.

> +	 */

> +	ctxt->sys_regs[PMBSR_EL1] = read_sysreg_s(SYS_PMBSR_EL1);

> +	write_sysreg_s(0, SYS_PMBSR_EL1);

> +

> +	isb();

> +

> +	ctxt->sys_regs[PMSICR_EL1] = read_sysreg_s(SYS_PMSICR_EL1);

> +	ctxt->sys_regs[PMSIRR_EL1] = read_sysreg_s(SYS_PMSIRR_EL1);

> +	ctxt->sys_regs[PMSFCR_EL1] = read_sysreg_s(SYS_PMSFCR_EL1);

> +	ctxt->sys_regs[PMSEVFR_EL1] = read_sysreg_s(SYS_PMSEVFR_EL1);

> +	ctxt->sys_regs[PMSLATFR_EL1] = read_sysreg_s(SYS_PMSLATFR_EL1);

> +	ctxt->sys_regs[PMBPTR_EL1] = read_sysreg_s(SYS_PMBPTR_EL1);

>  }

>  

> -static void __hyp_text __debug_restore_spe_nvhe(struct kvm_cpu_context *ctxt)

> +static void __hyp_text

> +__debug_restore_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)

>  {

>  	if (!ctxt->sys_regs[PMSCR_EL1])

>  		return;

> @@ -126,6 +151,16 @@ static void __hyp_text __debug_restore_spe_nvhe(struct kvm_cpu_context *ctxt)

>  	isb();

>  

>  	/* Re-enable data generation */

> +	if (full_ctxt) {

> +		write_sysreg_s(ctxt->sys_regs[PMBPTR_EL1], SYS_PMBPTR_EL1);

> +		write_sysreg_s(ctxt->sys_regs[PMBLIMITR_EL1], SYS_PMBLIMITR_EL1);

> +		write_sysreg_s(ctxt->sys_regs[PMSFCR_EL1], SYS_PMSFCR_EL1);

> +		write_sysreg_s(ctxt->sys_regs[PMSEVFR_EL1], SYS_PMSEVFR_EL1);

> +		write_sysreg_s(ctxt->sys_regs[PMSLATFR_EL1], SYS_PMSLATFR_EL1);

> +		write_sysreg_s(ctxt->sys_regs[PMSIRR_EL1], SYS_PMSIRR_EL1);

> +		write_sysreg_s(ctxt->sys_regs[PMSICR_EL1], SYS_PMSICR_EL1);

> +		write_sysreg_s(ctxt->sys_regs[PMBSR_EL1], SYS_PMBSR_EL1);

> +	}

>  	write_sysreg_el1(ctxt->sys_regs[PMSCR_EL1], SYS_PMSCR);

>  }

>  

> @@ -198,7 +233,7 @@ void __hyp_text __debug_restore_host_context(struct kvm_vcpu *vcpu)

>  	guest_ctxt = &vcpu->arch.ctxt;

>  

>  	if (!has_vhe())

> -		__debug_restore_spe_nvhe(host_ctxt);

> +		__debug_restore_spe_nvhe(host_ctxt, false);

>  

>  	if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))

>  		return;

> @@ -222,7 +257,7 @@ void __hyp_text __debug_save_host_context(struct kvm_vcpu *vcpu)

>  

>  	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);

>  	if (!has_vhe())

> -		__debug_save_spe_nvhe(host_ctxt);

> +		__debug_save_spe_nvhe(host_ctxt, false);

>  }

>  

>  void __hyp_text __debug_save_guest_context(struct kvm_vcpu *vcpu)


So all of this is for non-VHE. What happens in the VHE case?

	M.
-- 
Jazz is not dead. It just smells funny...
Andrew Murray Dec. 24, 2019, 10:49 a.m. | #2
On Sat, Dec 21, 2019 at 01:57:55PM +0000, Marc Zyngier wrote:
> On Fri, 20 Dec 2019 14:30:15 +0000

> Andrew Murray <andrew.murray@arm.com> wrote:

> 

> > From: Sudeep Holla <sudeep.holla@arm.com>

> > 

> > Currently since we don't support profiling using SPE in the guests,

> > we just save the PMSCR_EL1, flush the profiling buffers and disable

> > sampling. However in order to support simultaneous sampling both in

> 

> Is the sampling actually simultaneous? I don't believe so (the whole

> series would be much simpler if it was).


No, the SPE is used by either the guest or the host at any one time. I guess
the term simultaneous was used to refer to the illusion given to both guest
and host that they are able to use it whenever they like. I'll update
the commit message to drop the magic.
 

> 

> > the host and guests, we need to save and reatore the complete SPE

> 

> s/reatore/restore/


Noted.


> 

> > profiling buffer controls' context.

> > 

> > Let's add the support for the same and keep it disabled for now.

> > We can enable it conditionally only if guests are allowed to use

> > SPE.

> > 

> > Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>

> > [ Clear PMBSR bit when saving state to prevent spurious interrupts ]

> > Signed-off-by: Andrew Murray <andrew.murray@arm.com>

> > ---

> >  arch/arm64/kvm/hyp/debug-sr.c | 51 +++++++++++++++++++++++++++++------

> >  1 file changed, 43 insertions(+), 8 deletions(-)

> > 

> > diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c

> > index 8a70a493345e..12429b212a3a 100644

> > --- a/arch/arm64/kvm/hyp/debug-sr.c

> > +++ b/arch/arm64/kvm/hyp/debug-sr.c

> > @@ -85,7 +85,8 @@

> >  	default:	write_debug(ptr[0], reg, 0);			\

> >  	}

> >  

> > -static void __hyp_text __debug_save_spe_nvhe(struct kvm_cpu_context *ctxt)

> > +static void __hyp_text

> > +__debug_save_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)

> 

> nit: don't split lines like this if you can avoid it. You can put the

> full_ctxt parameter on a separate line instead.


Yes understood.


> 

> >  {

> >  	u64 reg;

> >  

> > @@ -102,22 +103,46 @@ static void __hyp_text __debug_save_spe_nvhe(struct kvm_cpu_context *ctxt)

> >  	if (reg & BIT(SYS_PMBIDR_EL1_P_SHIFT))

> >  		return;

> >  

> > -	/* No; is the host actually using the thing? */

> > -	reg = read_sysreg_s(SYS_PMBLIMITR_EL1);

> > -	if (!(reg & BIT(SYS_PMBLIMITR_EL1_E_SHIFT)))

> > +	/* Save the control register and disable data generation */

> > +	ctxt->sys_regs[PMSCR_EL1] = read_sysreg_el1(SYS_PMSCR);

> > +

> > +	if (!ctxt->sys_regs[PMSCR_EL1])

> 

> Shouldn't you check the enable bits instead of relying on the whole

> thing being zero?


Yes that would make more sense (E1SPE and E0SPE).

I feel that this check makes an assumption about the guest/host SPE
driver... What happens if the SPE driver writes to some SPE registers
but doesn't enable PMSCR? If the guest is also using SPE then those
writes will be lost; when the host returns and the SPE driver enables
SPE, it won't work.

With a quick look at the SPE driver I'm not sure this will happen, but
even so it makes me nervous relying on these assumptions. I wonder if
this risk is present in other devices?


> 

> >  		return;

> >  

> >  	/* Yes; save the control register and disable data generation */

> > -	ctxt->sys_regs[PMSCR_EL1] = read_sysreg_el1(SYS_PMSCR);

> 

> You've already saved the control register...


I'll remove that.


> 

> >  	write_sysreg_el1(0, SYS_PMSCR);

> >  	isb();

> >  

> >  	/* Now drain all buffered data to memory */

> >  	psb_csync();

> >  	dsb(nsh);

> > +

> > +	if (!full_ctxt)

> > +		return;

> > +

> > +	ctxt->sys_regs[PMBLIMITR_EL1] = read_sysreg_s(SYS_PMBLIMITR_EL1);

> > +	write_sysreg_s(0, SYS_PMBLIMITR_EL1);

> > +

> > +	/*

> > +	 * As PMBSR is conditionally restored when returning to the host we

> > +	 * must ensure the service bit is unset here to prevent a spurious

> > +	 * host SPE interrupt from being raised.

> > +	 */

> > +	ctxt->sys_regs[PMBSR_EL1] = read_sysreg_s(SYS_PMBSR_EL1);

> > +	write_sysreg_s(0, SYS_PMBSR_EL1);

> > +

> > +	isb();

> > +

> > +	ctxt->sys_regs[PMSICR_EL1] = read_sysreg_s(SYS_PMSICR_EL1);

> > +	ctxt->sys_regs[PMSIRR_EL1] = read_sysreg_s(SYS_PMSIRR_EL1);

> > +	ctxt->sys_regs[PMSFCR_EL1] = read_sysreg_s(SYS_PMSFCR_EL1);

> > +	ctxt->sys_regs[PMSEVFR_EL1] = read_sysreg_s(SYS_PMSEVFR_EL1);

> > +	ctxt->sys_regs[PMSLATFR_EL1] = read_sysreg_s(SYS_PMSLATFR_EL1);

> > +	ctxt->sys_regs[PMBPTR_EL1] = read_sysreg_s(SYS_PMBPTR_EL1);

> >  }

> >  

> > -static void __hyp_text __debug_restore_spe_nvhe(struct kvm_cpu_context *ctxt)

> > +static void __hyp_text

> > +__debug_restore_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)

> >  {

> >  	if (!ctxt->sys_regs[PMSCR_EL1])

> >  		return;

> > @@ -126,6 +151,16 @@ static void __hyp_text __debug_restore_spe_nvhe(struct kvm_cpu_context *ctxt)

> >  	isb();

> >  

> >  	/* Re-enable data generation */

> > +	if (full_ctxt) {

> > +		write_sysreg_s(ctxt->sys_regs[PMBPTR_EL1], SYS_PMBPTR_EL1);

> > +		write_sysreg_s(ctxt->sys_regs[PMBLIMITR_EL1], SYS_PMBLIMITR_EL1);

> > +		write_sysreg_s(ctxt->sys_regs[PMSFCR_EL1], SYS_PMSFCR_EL1);

> > +		write_sysreg_s(ctxt->sys_regs[PMSEVFR_EL1], SYS_PMSEVFR_EL1);

> > +		write_sysreg_s(ctxt->sys_regs[PMSLATFR_EL1], SYS_PMSLATFR_EL1);

> > +		write_sysreg_s(ctxt->sys_regs[PMSIRR_EL1], SYS_PMSIRR_EL1);

> > +		write_sysreg_s(ctxt->sys_regs[PMSICR_EL1], SYS_PMSICR_EL1);

> > +		write_sysreg_s(ctxt->sys_regs[PMBSR_EL1], SYS_PMBSR_EL1);

> > +	}

> >  	write_sysreg_el1(ctxt->sys_regs[PMSCR_EL1], SYS_PMSCR);

> >  }

> >  

> > @@ -198,7 +233,7 @@ void __hyp_text __debug_restore_host_context(struct kvm_vcpu *vcpu)

> >  	guest_ctxt = &vcpu->arch.ctxt;

> >  

> >  	if (!has_vhe())

> > -		__debug_restore_spe_nvhe(host_ctxt);

> > +		__debug_restore_spe_nvhe(host_ctxt, false);

> >  

> >  	if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))

> >  		return;

> > @@ -222,7 +257,7 @@ void __hyp_text __debug_save_host_context(struct kvm_vcpu *vcpu)

> >  

> >  	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);

> >  	if (!has_vhe())

> > -		__debug_save_spe_nvhe(host_ctxt);

> > +		__debug_save_spe_nvhe(host_ctxt, false);

> >  }

> >  

> >  void __hyp_text __debug_save_guest_context(struct kvm_vcpu *vcpu)

> 

> So all of this is for non-VHE. What happens in the VHE case?


By the end of the series this ends up in __debug_save_host_context which is
called for both VHE/nVHE - on the re-spin I'll make it not look so confusing.

Thanks,

Andrew Murray

> 

> 	M.

> -- 

> Jazz is not dead. It just smells funny...
Andrew Murray Dec. 24, 2019, 3:17 p.m. | #3
On Tue, Dec 24, 2019 at 10:49:30AM +0000, Andrew Murray wrote:
> On Sat, Dec 21, 2019 at 01:57:55PM +0000, Marc Zyngier wrote:

> > On Fri, 20 Dec 2019 14:30:15 +0000

> > Andrew Murray <andrew.murray@arm.com> wrote:

> > 

> > > From: Sudeep Holla <sudeep.holla@arm.com>

> > > 

> > > Currently since we don't support profiling using SPE in the guests,

> > > we just save the PMSCR_EL1, flush the profiling buffers and disable

> > > sampling. However in order to support simultaneous sampling both in

> > 

> > Is the sampling actually simultaneous? I don't believe so (the whole

> > series would be much simpler if it was).

> 

> No the SPE is used by either the guest or host at any one time. I guess

> the term simultaneous was used to refer to illusion given to both guest

> and host that they are able to use it whenever they like. I'll update

> the commit message to drop the magic.

>  

> 

> > 

> > > the host and guests, we need to save and reatore the complete SPE

> > 

> > s/reatore/restore/

> 

> Noted.

> 

> 

> > 

> > > profiling buffer controls' context.

> > > 

> > > Let's add the support for the same and keep it disabled for now.

> > > We can enable it conditionally only if guests are allowed to use

> > > SPE.

> > > 

> > > Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>

> > > [ Clear PMBSR bit when saving state to prevent spurious interrupts ]

> > > Signed-off-by: Andrew Murray <andrew.murray@arm.com>

> > > ---

> > >  arch/arm64/kvm/hyp/debug-sr.c | 51 +++++++++++++++++++++++++++++------

> > >  1 file changed, 43 insertions(+), 8 deletions(-)

> > > 

> > > diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c

> > > index 8a70a493345e..12429b212a3a 100644

> > > --- a/arch/arm64/kvm/hyp/debug-sr.c

> > > +++ b/arch/arm64/kvm/hyp/debug-sr.c

> > > @@ -85,7 +85,8 @@

> > >  	default:	write_debug(ptr[0], reg, 0);			\

> > >  	}

> > >  

> > > -static void __hyp_text __debug_save_spe_nvhe(struct kvm_cpu_context *ctxt)

> > > +static void __hyp_text

> > > +__debug_save_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)

> > 

> > nit: don't split lines like this if you can avoid it. You can put the

> > full_ctxt parameter on a separate line instead.

> 

> Yes understood.

> 

> 

> > 

> > >  {

> > >  	u64 reg;

> > >  

> > > @@ -102,22 +103,46 @@ static void __hyp_text __debug_save_spe_nvhe(struct kvm_cpu_context *ctxt)

> > >  	if (reg & BIT(SYS_PMBIDR_EL1_P_SHIFT))

> > >  		return;

> > >  

> > > -	/* No; is the host actually using the thing? */

> > > -	reg = read_sysreg_s(SYS_PMBLIMITR_EL1);

> > > -	if (!(reg & BIT(SYS_PMBLIMITR_EL1_E_SHIFT)))

> > > +	/* Save the control register and disable data generation */

> > > +	ctxt->sys_regs[PMSCR_EL1] = read_sysreg_el1(SYS_PMSCR);

> > > +

> > > +	if (!ctxt->sys_regs[PMSCR_EL1])

> > 

> > Shouldn't you check the enable bits instead of relying on the whole

> > thing being zero?

> 

> Yes that would make more sense (E1SPE and E0SPE).

> 

> I feel that this check makes an assumption about the guest/host SPE

> driver... What happens if the SPE driver writes to some SPE registers

> but doesn't enable PMSCR? If the guest is also using SPE then those

> writes will be lost, when the host returns and the SPE driver enables

> SPE it won't work.

> 

> With a quick look at the SPE driver I'm not sure this will happen, but

> even so it makes me nervous relying on these assumptions. I wonder if

> this risk is present in other devices?


In fact, this may be a good reason to trap the SPE registers - this would
allow you to conditionally save/restore based on a dirty bit. It would
also allow you to re-evaluate the SPE interrupt (for example when the guest
clears the status register) and thus potentially reduce any black hole.

Thanks,

Andrew Murray

> 

> 

> > 

> > >  		return;

> > >  

> > >  	/* Yes; save the control register and disable data generation */

> > > -	ctxt->sys_regs[PMSCR_EL1] = read_sysreg_el1(SYS_PMSCR);

> > 

> > You've already saved the control register...

> 

> I'll remove that.

> 

> 

> > 

> > >  	write_sysreg_el1(0, SYS_PMSCR);

> > >  	isb();

> > >  

> > >  	/* Now drain all buffered data to memory */

> > >  	psb_csync();

> > >  	dsb(nsh);

> > > +

> > > +	if (!full_ctxt)

> > > +		return;

> > > +

> > > +	ctxt->sys_regs[PMBLIMITR_EL1] = read_sysreg_s(SYS_PMBLIMITR_EL1);

> > > +	write_sysreg_s(0, SYS_PMBLIMITR_EL1);

> > > +

> > > +	/*

> > > +	 * As PMBSR is conditionally restored when returning to the host we

> > > +	 * must ensure the service bit is unset here to prevent a spurious

> > > +	 * host SPE interrupt from being raised.

> > > +	 */

> > > +	ctxt->sys_regs[PMBSR_EL1] = read_sysreg_s(SYS_PMBSR_EL1);

> > > +	write_sysreg_s(0, SYS_PMBSR_EL1);

> > > +

> > > +	isb();

> > > +

> > > +	ctxt->sys_regs[PMSICR_EL1] = read_sysreg_s(SYS_PMSICR_EL1);

> > > +	ctxt->sys_regs[PMSIRR_EL1] = read_sysreg_s(SYS_PMSIRR_EL1);

> > > +	ctxt->sys_regs[PMSFCR_EL1] = read_sysreg_s(SYS_PMSFCR_EL1);

> > > +	ctxt->sys_regs[PMSEVFR_EL1] = read_sysreg_s(SYS_PMSEVFR_EL1);

> > > +	ctxt->sys_regs[PMSLATFR_EL1] = read_sysreg_s(SYS_PMSLATFR_EL1);

> > > +	ctxt->sys_regs[PMBPTR_EL1] = read_sysreg_s(SYS_PMBPTR_EL1);

> > >  }

> > >  

> > > -static void __hyp_text __debug_restore_spe_nvhe(struct kvm_cpu_context *ctxt)

> > > +static void __hyp_text

> > > +__debug_restore_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)

> > >  {

> > >  	if (!ctxt->sys_regs[PMSCR_EL1])

> > >  		return;

> > > @@ -126,6 +151,16 @@ static void __hyp_text __debug_restore_spe_nvhe(struct kvm_cpu_context *ctxt)

> > >  	isb();

> > >  

> > >  	/* Re-enable data generation */

> > > +	if (full_ctxt) {

> > > +		write_sysreg_s(ctxt->sys_regs[PMBPTR_EL1], SYS_PMBPTR_EL1);

> > > +		write_sysreg_s(ctxt->sys_regs[PMBLIMITR_EL1], SYS_PMBLIMITR_EL1);

> > > +		write_sysreg_s(ctxt->sys_regs[PMSFCR_EL1], SYS_PMSFCR_EL1);

> > > +		write_sysreg_s(ctxt->sys_regs[PMSEVFR_EL1], SYS_PMSEVFR_EL1);

> > > +		write_sysreg_s(ctxt->sys_regs[PMSLATFR_EL1], SYS_PMSLATFR_EL1);

> > > +		write_sysreg_s(ctxt->sys_regs[PMSIRR_EL1], SYS_PMSIRR_EL1);

> > > +		write_sysreg_s(ctxt->sys_regs[PMSICR_EL1], SYS_PMSICR_EL1);

> > > +		write_sysreg_s(ctxt->sys_regs[PMBSR_EL1], SYS_PMBSR_EL1);

> > > +	}

> > >  	write_sysreg_el1(ctxt->sys_regs[PMSCR_EL1], SYS_PMSCR);

> > >  }

> > >  

> > > @@ -198,7 +233,7 @@ void __hyp_text __debug_restore_host_context(struct kvm_vcpu *vcpu)

> > >  	guest_ctxt = &vcpu->arch.ctxt;

> > >  

> > >  	if (!has_vhe())

> > > -		__debug_restore_spe_nvhe(host_ctxt);

> > > +		__debug_restore_spe_nvhe(host_ctxt, false);

> > >  

> > >  	if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))

> > >  		return;

> > > @@ -222,7 +257,7 @@ void __hyp_text __debug_save_host_context(struct kvm_vcpu *vcpu)

> > >  

> > >  	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);

> > >  	if (!has_vhe())

> > > -		__debug_save_spe_nvhe(host_ctxt);

> > > +		__debug_save_spe_nvhe(host_ctxt, false);

> > >  }

> > >  

> > >  void __hyp_text __debug_save_guest_context(struct kvm_vcpu *vcpu)

> > 

> > So all of this is for non-VHE. What happens in the VHE case?

> 

> By the end of the series this ends up in __debug_save_host_context which is

> called for both VHE/nVHE - on the re-spin I'll make it not look so confusing.

> 

> Thanks,

> 

> Andrew Murray

> 

> > 

> > 	M.

> > -- 

> > Jazz is not dead. It just smells funny...

> _______________________________________________

> kvmarm mailing list

> kvmarm@lists.cs.columbia.edu

> https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
Marc Zyngier Dec. 24, 2019, 3:48 p.m. | #4
On Tue, 24 Dec 2019 15:17:39 +0000,
Andrew Murray <andrew.murray@arm.com> wrote:
> 

> On Tue, Dec 24, 2019 at 10:49:30AM +0000, Andrew Murray wrote:

> > On Sat, Dec 21, 2019 at 01:57:55PM +0000, Marc Zyngier wrote:

> > > On Fri, 20 Dec 2019 14:30:15 +0000

> > > Andrew Murray <andrew.murray@arm.com> wrote:

> > > 

> > > > From: Sudeep Holla <sudeep.holla@arm.com>

> > > > 

> > > > Currently since we don't support profiling using SPE in the guests,

> > > > we just save the PMSCR_EL1, flush the profiling buffers and disable

> > > > sampling. However in order to support simultaneous sampling both in

> > > 

> > > Is the sampling actually simultaneous? I don't believe so (the whole

> > > series would be much simpler if it was).

> > 

> > No the SPE is used by either the guest or host at any one time. I guess

> > the term simultaneous was used to refer to illusion given to both guest

> > and host that they are able to use it whenever they like. I'll update

> > the commit message to drop the magic.

> >  

> > 

> > > 

> > > > the host and guests, we need to save and reatore the complete SPE

> > > 

> > > s/reatore/restore/

> > 

> > Noted.

> > 

> > 

> > > 

> > > > profiling buffer controls' context.

> > > > 

> > > > Let's add the support for the same and keep it disabled for now.

> > > > We can enable it conditionally only if guests are allowed to use

> > > > SPE.

> > > > 

> > > > Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>

> > > > [ Clear PMBSR bit when saving state to prevent spurious interrupts ]

> > > > Signed-off-by: Andrew Murray <andrew.murray@arm.com>

> > > > ---

> > > >  arch/arm64/kvm/hyp/debug-sr.c | 51 +++++++++++++++++++++++++++++------

> > > >  1 file changed, 43 insertions(+), 8 deletions(-)

> > > > 

> > > > diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c

> > > > index 8a70a493345e..12429b212a3a 100644

> > > > --- a/arch/arm64/kvm/hyp/debug-sr.c

> > > > +++ b/arch/arm64/kvm/hyp/debug-sr.c

> > > > @@ -85,7 +85,8 @@

> > > >  	default:	write_debug(ptr[0], reg, 0);			\

> > > >  	}

> > > >  

> > > > -static void __hyp_text __debug_save_spe_nvhe(struct kvm_cpu_context *ctxt)

> > > > +static void __hyp_text

> > > > +__debug_save_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)

> > > 

> > > nit: don't split lines like this if you can avoid it. You can put the

> > > full_ctxt parameter on a separate line instead.

> > 

> > Yes understood.

> > 

> > 

> > > 

> > > >  {

> > > >  	u64 reg;

> > > >  

> > > > @@ -102,22 +103,46 @@ static void __hyp_text __debug_save_spe_nvhe(struct kvm_cpu_context *ctxt)

> > > >  	if (reg & BIT(SYS_PMBIDR_EL1_P_SHIFT))

> > > >  		return;

> > > >  

> > > > -	/* No; is the host actually using the thing? */

> > > > -	reg = read_sysreg_s(SYS_PMBLIMITR_EL1);

> > > > -	if (!(reg & BIT(SYS_PMBLIMITR_EL1_E_SHIFT)))

> > > > +	/* Save the control register and disable data generation */

> > > > +	ctxt->sys_regs[PMSCR_EL1] = read_sysreg_el1(SYS_PMSCR);

> > > > +

> > > > +	if (!ctxt->sys_regs[PMSCR_EL1])

> > > 

> > > Shouldn't you check the enable bits instead of relying on the whole

> > > thing being zero?

> > 

> > Yes that would make more sense (E1SPE and E0SPE).

> > 

> > I feel that this check makes an assumption about the guest/host SPE

> > driver... What happens if the SPE driver writes to some SPE registers

> > but doesn't enable PMSCR? If the guest is also using SPE then those

> > writes will be lost, when the host returns and the SPE driver enables

> > SPE it won't work.

> >

> > With a quick look at the SPE driver I'm not sure this will happen, but

> > even so it makes me nervous relying on these assumptions. I wonder if

> > this risk is present in other devices?


As a rule of thumb, you should always save whatever you're about to
overwrite if the registers are not under exclusive control of KVM. No
exception.

So if the guest is willing to use SPE *and* it isn't enabled on
the host, these registers have to be saved on vcpu_load() and restored
on vcpu_put().

If SPE is enabled on the host, then trapping has to be enabled, and no
tracing occurs in the guest for this time slice.

> In fact, this may be a good reason to trap the SPE registers - this would

> allow you to conditionally save/restore based on a dirty bit. It would

> also allow you to re-evaluate the SPE interrupt (for example when the guest

> clears the status register) and thus potentially reduce any black hole.


I don't see what trapping buys you in the expected case (where the
guest is tracing and the host isn't). To clear PMBSR_EL1.S, you first
need to know that an interrupt has fired. So this brings you exactly
nothing in this particular case, and just adds overhead for everything
else. The whole point of the architecture is that in the non-contended
case, we can give SPE to the guest and mostly forget about it.

I strongly suggest that you start with the simplest possible,
non-broken implementation. It doesn't matter if the black holes last for
seconds for now. Once you have something that looks reasonable, we can
evaluate how to improve on it by throwing actual HW and workloads at
it.

	M.

-- 
Jazz is not dead, it just smells funny.

Patch

diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c
index 8a70a493345e..12429b212a3a 100644
--- a/arch/arm64/kvm/hyp/debug-sr.c
+++ b/arch/arm64/kvm/hyp/debug-sr.c
@@ -85,7 +85,8 @@ 
 	default:	write_debug(ptr[0], reg, 0);			\
 	}
 
-static void __hyp_text __debug_save_spe_nvhe(struct kvm_cpu_context *ctxt)
+static void __hyp_text
+__debug_save_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
 {
 	u64 reg;
 
@@ -102,22 +103,46 @@  static void __hyp_text __debug_save_spe_nvhe(struct kvm_cpu_context *ctxt)
 	if (reg & BIT(SYS_PMBIDR_EL1_P_SHIFT))
 		return;
 
-	/* No; is the host actually using the thing? */
-	reg = read_sysreg_s(SYS_PMBLIMITR_EL1);
-	if (!(reg & BIT(SYS_PMBLIMITR_EL1_E_SHIFT)))
+	/* Save the control register and disable data generation */
+	ctxt->sys_regs[PMSCR_EL1] = read_sysreg_el1(SYS_PMSCR);
+
+	if (!ctxt->sys_regs[PMSCR_EL1])
 		return;
 
 	/* Yes; save the control register and disable data generation */
-	ctxt->sys_regs[PMSCR_EL1] = read_sysreg_el1(SYS_PMSCR);
 	write_sysreg_el1(0, SYS_PMSCR);
 	isb();
 
 	/* Now drain all buffered data to memory */
 	psb_csync();
 	dsb(nsh);
+
+	if (!full_ctxt)
+		return;
+
+	ctxt->sys_regs[PMBLIMITR_EL1] = read_sysreg_s(SYS_PMBLIMITR_EL1);
+	write_sysreg_s(0, SYS_PMBLIMITR_EL1);
+
+	/*
+	 * As PMBSR is conditionally restored when returning to the host we
+	 * must ensure the service bit is unset here to prevent a spurious
+	 * host SPE interrupt from being raised.
+	 */
+	ctxt->sys_regs[PMBSR_EL1] = read_sysreg_s(SYS_PMBSR_EL1);
+	write_sysreg_s(0, SYS_PMBSR_EL1);
+
+	isb();
+
+	ctxt->sys_regs[PMSICR_EL1] = read_sysreg_s(SYS_PMSICR_EL1);
+	ctxt->sys_regs[PMSIRR_EL1] = read_sysreg_s(SYS_PMSIRR_EL1);
+	ctxt->sys_regs[PMSFCR_EL1] = read_sysreg_s(SYS_PMSFCR_EL1);
+	ctxt->sys_regs[PMSEVFR_EL1] = read_sysreg_s(SYS_PMSEVFR_EL1);
+	ctxt->sys_regs[PMSLATFR_EL1] = read_sysreg_s(SYS_PMSLATFR_EL1);
+	ctxt->sys_regs[PMBPTR_EL1] = read_sysreg_s(SYS_PMBPTR_EL1);
 }
 
-static void __hyp_text __debug_restore_spe_nvhe(struct kvm_cpu_context *ctxt)
+static void __hyp_text
+__debug_restore_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
 {
 	if (!ctxt->sys_regs[PMSCR_EL1])
 		return;
@@ -126,6 +151,16 @@  static void __hyp_text __debug_restore_spe_nvhe(struct kvm_cpu_context *ctxt)
 	isb();
 
 	/* Re-enable data generation */
+	if (full_ctxt) {
+		write_sysreg_s(ctxt->sys_regs[PMBPTR_EL1], SYS_PMBPTR_EL1);
+		write_sysreg_s(ctxt->sys_regs[PMBLIMITR_EL1], SYS_PMBLIMITR_EL1);
+		write_sysreg_s(ctxt->sys_regs[PMSFCR_EL1], SYS_PMSFCR_EL1);
+		write_sysreg_s(ctxt->sys_regs[PMSEVFR_EL1], SYS_PMSEVFR_EL1);
+		write_sysreg_s(ctxt->sys_regs[PMSLATFR_EL1], SYS_PMSLATFR_EL1);
+		write_sysreg_s(ctxt->sys_regs[PMSIRR_EL1], SYS_PMSIRR_EL1);
+		write_sysreg_s(ctxt->sys_regs[PMSICR_EL1], SYS_PMSICR_EL1);
+		write_sysreg_s(ctxt->sys_regs[PMBSR_EL1], SYS_PMBSR_EL1);
+	}
 	write_sysreg_el1(ctxt->sys_regs[PMSCR_EL1], SYS_PMSCR);
 }
 
@@ -198,7 +233,7 @@  void __hyp_text __debug_restore_host_context(struct kvm_vcpu *vcpu)
 	guest_ctxt = &vcpu->arch.ctxt;
 
 	if (!has_vhe())
-		__debug_restore_spe_nvhe(host_ctxt);
+		__debug_restore_spe_nvhe(host_ctxt, false);
 
 	if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
 		return;
@@ -222,7 +257,7 @@  void __hyp_text __debug_save_host_context(struct kvm_vcpu *vcpu)
 
 	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
 	if (!has_vhe())
-		__debug_save_spe_nvhe(host_ctxt);
+		__debug_save_spe_nvhe(host_ctxt, false);
 }
 
 void __hyp_text __debug_save_guest_context(struct kvm_vcpu *vcpu)