diff mbox series

[v9,13/39] KVM: arm64: Manage GCS registers for guests

Message ID 20240625-arm64-gcs-v9-13-0f634469b8f0@kernel.org
State New
Headers show
Series [v9,01/39] arm64/mm: Restructure arch_validate_flags() for extensibility | expand

Commit Message

Mark Brown June 25, 2024, 2:57 p.m. UTC
GCS introduces a number of system registers for EL1 and EL0, on systems
with GCS we need to context switch them and expose them to VMMs to allow
guests to use GCS, as well as describe their fine grained traps to
nested virtualisation.  Traps are already disabled.

Reviewed-by: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 arch/arm64/include/asm/kvm_host.h          | 14 +++++++++
 arch/arm64/include/asm/vncr_mapping.h      |  2 ++
 arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h | 48 +++++++++++++++++++++++-------
 arch/arm64/kvm/sys_regs.c                  | 25 +++++++++++++++-
 4 files changed, 78 insertions(+), 11 deletions(-)

Comments

Marc Zyngier July 10, 2024, 3:17 p.m. UTC | #1
On Tue, 25 Jun 2024 15:57:41 +0100,
Mark Brown <broonie@kernel.org> wrote:
> 
> GCS introduces a number of system registers for EL1 and EL0, on systems
> with GCS we need to context switch them and expose them to VMMs to allow
> guests to use GCS, as well as describe their fine grained traps to
> nested virtualisation.  Traps are already disabled.

I don't see anything related to FGTs at all.

> 
> Reviewed-by: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
> Signed-off-by: Mark Brown <broonie@kernel.org>
> ---
>  arch/arm64/include/asm/kvm_host.h          | 14 +++++++++
>  arch/arm64/include/asm/vncr_mapping.h      |  2 ++
>  arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h | 48 +++++++++++++++++++++++-------
>  arch/arm64/kvm/sys_regs.c                  | 25 +++++++++++++++-
>  4 files changed, 78 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 36b8e97bf49e..316fb412f355 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -411,6 +411,10 @@ enum vcpu_sysreg {
>  	GCR_EL1,	/* Tag Control Register */
>  	TFSRE0_EL1,	/* Tag Fault Status Register (EL0) */
>  
> +	/* Guarded Control Stack registers */
> +	GCSCRE0_EL1,	/* Guarded Control Stack Control (EL0) */
> +	GCSPR_EL0,	/* Guarded Control Stack Pointer (EL0) */
> +
>  	/* 32bit specific registers. */
>  	DACR32_EL2,	/* Domain Access Control Register */
>  	IFSR32_EL2,	/* Instruction Fault Status Register */
> @@ -481,6 +485,10 @@ enum vcpu_sysreg {
>  	VNCR(PIR_EL1),	 /* Permission Indirection Register 1 (EL1) */
>  	VNCR(PIRE0_EL1), /*  Permission Indirection Register 0 (EL1) */
>  
> +	/* Guarded Control Stack registers */
> +	VNCR(GCSPR_EL1),	/* Guarded Control Stack Pointer (EL1) */
> +	VNCR(GCSCR_EL1),	/* Guarded Control Stack Control (EL1) */
> +
>  	VNCR(HFGRTR_EL2),
>  	VNCR(HFGWTR_EL2),
>  	VNCR(HFGITR_EL2),
> @@ -1343,6 +1351,12 @@ static inline bool __vcpu_has_feature(const struct kvm_arch *ka, int feature)
>  
>  #define kvm_vcpu_initialized(v) vcpu_get_flag(vcpu, VCPU_INITIALIZED)
>  
> +static inline bool has_gcs(void)
> +{
> +	return IS_ENABLED(CONFIG_ARM64_GCS) &&
> +		cpus_have_final_cap(ARM64_HAS_GCS);
> +}

This is mostly useless, see below.

> +
>  int kvm_trng_call(struct kvm_vcpu *vcpu);
>  #ifdef CONFIG_KVM
>  extern phys_addr_t hyp_mem_base;
> diff --git a/arch/arm64/include/asm/vncr_mapping.h b/arch/arm64/include/asm/vncr_mapping.h
> index df2c47c55972..5e83e6f579fd 100644
> --- a/arch/arm64/include/asm/vncr_mapping.h
> +++ b/arch/arm64/include/asm/vncr_mapping.h
> @@ -88,6 +88,8 @@
>  #define VNCR_PMSIRR_EL1         0x840
>  #define VNCR_PMSLATFR_EL1       0x848
>  #define VNCR_TRFCR_EL1          0x880
> +#define VNCR_GCSPR_EL1		0x8C0
> +#define VNCR_GCSCR_EL1		0x8D0
>  #define VNCR_MPAM1_EL1          0x900
>  #define VNCR_MPAMHCR_EL2        0x930
>  #define VNCR_MPAMVPMV_EL2       0x938
> diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
> index 4be6a7fa0070..b20212d80e9b 100644
> --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
> +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
> @@ -16,6 +16,27 @@
>  #include <asm/kvm_hyp.h>
>  #include <asm/kvm_mmu.h>
>  
> +static inline struct kvm_vcpu *ctxt_to_vcpu(struct kvm_cpu_context *ctxt)
> +{
> +	struct kvm_vcpu *vcpu = ctxt->__hyp_running_vcpu;
> +
> +	if (!vcpu)
> +		vcpu = container_of(ctxt, struct kvm_vcpu, arch.ctxt);
> +
> +	return vcpu;
> +}
> +
> +static inline bool ctxt_has_gcs(struct kvm_cpu_context *ctxt)
> +{
> +	struct kvm_vcpu *vcpu;
> +
> +	if (!cpus_have_final_cap(ARM64_HAS_GCS))
> +		return false;
> +
> +	vcpu = ctxt_to_vcpu(ctxt);
> +	return kvm_has_feat(kern_hyp_va(vcpu->kvm), ID_AA64PFR1_EL1, GCS, IMP);
> +}
> +
>  static inline void __sysreg_save_common_state(struct kvm_cpu_context *ctxt)
>  {
>  	ctxt_sys_reg(ctxt, MDSCR_EL1)	= read_sysreg(mdscr_el1);
> @@ -25,16 +46,8 @@ static inline void __sysreg_save_user_state(struct kvm_cpu_context *ctxt)
>  {
>  	ctxt_sys_reg(ctxt, TPIDR_EL0)	= read_sysreg(tpidr_el0);
>  	ctxt_sys_reg(ctxt, TPIDRRO_EL0)	= read_sysreg(tpidrro_el0);
> -}
> -
> -static inline struct kvm_vcpu *ctxt_to_vcpu(struct kvm_cpu_context *ctxt)
> -{
> -	struct kvm_vcpu *vcpu = ctxt->__hyp_running_vcpu;
> -
> -	if (!vcpu)
> -		vcpu = container_of(ctxt, struct kvm_vcpu, arch.ctxt);
> -
> -	return vcpu;
> +	if (ctxt_has_gcs(ctxt))
> +		ctxt_sys_reg(ctxt, GCSPR_EL0) = read_sysreg_s(SYS_GCSPR_EL0);
>  }
>  
>  static inline bool ctxt_has_mte(struct kvm_cpu_context *ctxt)
> @@ -80,6 +93,12 @@ static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
>  	ctxt_sys_reg(ctxt, PAR_EL1)	= read_sysreg_par();
>  	ctxt_sys_reg(ctxt, TPIDR_EL1)	= read_sysreg(tpidr_el1);
>  
> +	if (ctxt_has_gcs(ctxt)) {

Since this is conditioned on S1PIE, it should only be evaluated
when PIE is enabled in the guest.

> +		ctxt_sys_reg(ctxt, GCSPR_EL1)	= read_sysreg_el1(SYS_GCSPR);
> +		ctxt_sys_reg(ctxt, GCSCR_EL1)	= read_sysreg_el1(SYS_GCSCR);
> +		ctxt_sys_reg(ctxt, GCSCRE0_EL1)	= read_sysreg_s(SYS_GCSCRE0_EL1);

Why is this part of the EL1 context? It clearly only matters to EL0
execution, so it could be switched in load/put on nVHE as well. And
actually, given that the whole thing is strictly for userspace, why do
we switch *anything* eagerly at all?

> +	}
> +
>  	if (ctxt_has_mte(ctxt)) {
>  		ctxt_sys_reg(ctxt, TFSR_EL1) = read_sysreg_el1(SYS_TFSR);
>  		ctxt_sys_reg(ctxt, TFSRE0_EL1) = read_sysreg_s(SYS_TFSRE0_EL1);
> @@ -113,6 +132,8 @@ static inline void __sysreg_restore_user_state(struct kvm_cpu_context *ctxt)
>  {
>  	write_sysreg(ctxt_sys_reg(ctxt, TPIDR_EL0),	tpidr_el0);
>  	write_sysreg(ctxt_sys_reg(ctxt, TPIDRRO_EL0),	tpidrro_el0);
> +	if (ctxt_has_gcs(ctxt))
> +		write_sysreg_s(ctxt_sys_reg(ctxt, GCSPR_EL0), SYS_GCSPR_EL0);
>  }
>  
>  static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
> @@ -156,6 +177,13 @@ static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
>  	write_sysreg(ctxt_sys_reg(ctxt, PAR_EL1),	par_el1);
>  	write_sysreg(ctxt_sys_reg(ctxt, TPIDR_EL1),	tpidr_el1);
>  
> +	if (ctxt_has_gcs(ctxt)) {
> +		write_sysreg_el1(ctxt_sys_reg(ctxt, GCSPR_EL1),	SYS_GCSPR);
> +		write_sysreg_el1(ctxt_sys_reg(ctxt, GCSCR_EL1),	SYS_GCSCR);
> +		write_sysreg_s(ctxt_sys_reg(ctxt, GCSCRE0_EL1),
> +			       SYS_GCSCRE0_EL1);
> +	}
> +
>  	if (ctxt_has_mte(ctxt)) {
>  		write_sysreg_el1(ctxt_sys_reg(ctxt, TFSR_EL1), SYS_TFSR);
>  		write_sysreg_s(ctxt_sys_reg(ctxt, TFSRE0_EL1), SYS_TFSRE0_EL1);
> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> index 22b45a15d068..cf068dcfbd49 100644
> --- a/arch/arm64/kvm/sys_regs.c
> +++ b/arch/arm64/kvm/sys_regs.c
> @@ -2015,6 +2015,23 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu,
>  	.visibility = mte_visibility,		\
>  }
>  
> +static unsigned int gcs_visibility(const struct kvm_vcpu *vcpu,
> +				   const struct sys_reg_desc *rd)
> +{
> +	if (has_gcs())
> +		return 0;

No. We've been here before.

> +
> +	return REG_HIDDEN;
> +}
> +
> +#define GCS_REG(name) {				\
> +	SYS_DESC(SYS_##name),			\
> +	.access = undef_access,			\
> +	.reset = reset_unknown,			\
> +	.reg = name,				\
> +	.visibility = gcs_visibility,		\
> +}
> +
>  static unsigned int el2_visibility(const struct kvm_vcpu *vcpu,
>  				   const struct sys_reg_desc *rd)
>  {
> @@ -2306,7 +2323,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
>  		   ID_AA64PFR0_EL1_GIC |
>  		   ID_AA64PFR0_EL1_AdvSIMD |
>  		   ID_AA64PFR0_EL1_FP), },
> -	ID_SANITISED(ID_AA64PFR1_EL1),
> +	ID_WRITABLE(ID_AA64PFR1_EL1, ~(ID_AA64PFR1_EL1_RES0 |
> +				       ID_AA64PFR1_EL1_BT)),

I don't know what you're trying to do here, but that's not right. If
you want to make this register writable, here's the shopping list:

https://lore.kernel.org/all/87ikxsi0v9.wl-maz@kernel.org/

>  	ID_UNALLOCATED(4,2),
>  	ID_UNALLOCATED(4,3),
>  	ID_WRITABLE(ID_AA64ZFR0_EL1, ~ID_AA64ZFR0_EL1_RES0),
> @@ -2390,6 +2408,10 @@ static const struct sys_reg_desc sys_reg_descs[] = {
>  	PTRAUTH_KEY(APDB),
>  	PTRAUTH_KEY(APGA),
>  
> +	GCS_REG(GCSCR_EL1),
> +	GCS_REG(GCSPR_EL1),
> +	GCS_REG(GCSCRE0_EL1),
> +
>  	{ SYS_DESC(SYS_SPSR_EL1), access_spsr},
>  	{ SYS_DESC(SYS_ELR_EL1), access_elr},
>  
> @@ -2476,6 +2498,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
>  	{ SYS_DESC(SYS_SMIDR_EL1), undef_access },
>  	{ SYS_DESC(SYS_CSSELR_EL1), access_csselr, reset_unknown, CSSELR_EL1 },
>  	{ SYS_DESC(SYS_CTR_EL0), access_ctr },
> +	GCS_REG(GCSPR_EL0),
>  	{ SYS_DESC(SYS_SVCR), undef_access },
>  
>  	{ PMU_SYS_REG(PMCR_EL0), .access = access_pmcr, .reset = reset_pmcr,

I don't see the vcpu's hcrx_el2 being updated to enable GCS. How does
it work then? I also don't see the FGU updates when GCS is disabled,
nor the corresponding FGT bits being marked as RES0.

	M.
Mark Brown July 10, 2024, 5:16 p.m. UTC | #2
On Wed, Jul 10, 2024 at 04:17:02PM +0100, Marc Zyngier wrote:
> Mark Brown <broonie@kernel.org> wrote:

> > +	if (ctxt_has_gcs(ctxt)) {

> Since this is conditioned on S1PIE, it should only be evaluated
> when PIE is enabled in the guest.

So make ctxt_has_gcs() embed a check of ctxt_has_s1pie()?

> > +		ctxt_sys_reg(ctxt, GCSPR_EL1)	= read_sysreg_el1(SYS_GCSPR);
> > +		ctxt_sys_reg(ctxt, GCSCR_EL1)	= read_sysreg_el1(SYS_GCSCR);
> > +		ctxt_sys_reg(ctxt, GCSCRE0_EL1)	= read_sysreg_s(SYS_GCSCRE0_EL1);

> Why is this part of the EL1 context? It clearly only matters to EL0
> execution, so it could be switched in load/put on nVHE as well. And
> actually, given that the whole thing is strictly for userspace, why do
> we switch *anything* eagerly at all?

GCS can also be used independently at EL1 (and EL2 for that matter),
it's not purely for userspace even though this series only implements
use in userspace.  GCSPR_EL1 and GCSCR_EL1 control the use of GCS at
EL1, not EL0, and the guest might be using GCS at EL1 even if the host
doesn't.

GCSCRE0_EL1 is for EL0 though, it ended up here mainly because it's an
_EL1 register and we are already context switching PIRE0_EL1 in the EL1
functions so it seemed consistent to follow the same approach for GCS.
The _el1 and _user save/restore functions are called from the same place
for both VHE and nVHE so the practical impact of the placement should be
minimal AFAICT.  Unlike PIRE0_EL1 GCSCRE0_EL1 only has an impact for
code running at EL0 so I can move it to the _user functions.

TBH I'm not following your comments about switching eagerly too here at
all, where would you expect to see the switching done?  You've said
something along these lines before which prompted me to send a patch to
only save the S1PIE registers if they'd been written to which you were
quite reasonably not happy with given the extra traps it would cause:

   https://lore.kernel.org/r/20240301-kvm-arm64-defer-regs-v1-1-401e3de92e97@kernel.org

but I'm at a loss as to how to make things less eager otherwise.

> > @@ -2306,7 +2323,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
> >  		   ID_AA64PFR0_EL1_GIC |
> >  		   ID_AA64PFR0_EL1_AdvSIMD |
> >  		   ID_AA64PFR0_EL1_FP), },
> > -	ID_SANITISED(ID_AA64PFR1_EL1),
> > +	ID_WRITABLE(ID_AA64PFR1_EL1, ~(ID_AA64PFR1_EL1_RES0 |
> > +				       ID_AA64PFR1_EL1_BT)),

> I don't know what you're trying to do here, but that's not right. If
> you want to make this register writable, here's the shopping list:

> https://lore.kernel.org/all/87ikxsi0v9.wl-maz@kernel.org/

Yes, trying to make things writable.  I do see we need to exclude more
bits there and I'm not clear why I excluded BTI, looks like I forgot to
add a TODO comment at some point and finish that off.  Sorry about that.

In the linked mail you say you want to see all fields explicitly
handled, could you be more direct about what such explicit handling
would look like?  I see a number of examples in the existing code like:

	ID_WRITABLE(ID_AA64ZFR0_EL1, ~ID_AA64ZFR0_EL1_RES0),

	ID_WRITABLE(ID_AA64ISAR0_EL1, ~ID_AA64ISAR0_EL1_RES0),
	ID_WRITABLE(ID_AA64ISAR1_EL1, ~(ID_AA64ISAR1_EL1_GPI |
					ID_AA64ISAR1_EL1_GPA |
					ID_AA64ISAR1_EL1_API |
					ID_AA64ISAR1_EL1_APA)),

which look to my eye very similar to the above, they do not visibly
explicitly enumerate every field in the registers and given that there's
a single mask specified it's not clear how that would look.  If
ID_WRITABLE() took separate read/write masks and combined them it'd be
more obvious but it's just not written that way.
Marc Zyngier July 10, 2024, 6:28 p.m. UTC | #3
On Wed, 10 Jul 2024 18:16:46 +0100,
Mark Brown <broonie@kernel.org> wrote:
> 
> [1  <text/plain; us-ascii (7bit)>]
> On Wed, Jul 10, 2024 at 04:17:02PM +0100, Marc Zyngier wrote:
> > Mark Brown <broonie@kernel.org> wrote:
> 
> > > +	if (ctxt_has_gcs(ctxt)) {
> 
> > > Since this is conditioned on S1PIE, it should only be evaluated
> > when PIE is enabled in the guest.
> 
> So make ctxt_has_gcs() embed a check of ctxt_has_s1pie()?

No. I mean nest the whole thing *under* the check for S1PIE.

> 
> > > +		ctxt_sys_reg(ctxt, GCSPR_EL1)	= read_sysreg_el1(SYS_GCSPR);
> > > +		ctxt_sys_reg(ctxt, GCSCR_EL1)	= read_sysreg_el1(SYS_GCSCR);
> > > +		ctxt_sys_reg(ctxt, GCSCRE0_EL1)	= read_sysreg_s(SYS_GCSCRE0_EL1);
> 
> > Why is this part of the EL1 context? It clearly only matters to EL0
> > execution, so it could be switched in load/put on nVHE as well. And
> > actually, given that the whole thing is strictly for userspace, why do
> > we switch *anything* eagerly at all?
> 
> GCS can also be used independently at EL1 (and EL2 for that matter),
> it's not purely for userspace even though this series only implements
> use in userspace.  GCSPR_EL1 and GCSCR_EL1 control the use of GCS at
> EL1, not EL0, and the guest might be using GCS at EL1 even if the host
> doesn't.
>
> GCSCRE0_EL1 is for EL0 though, it ended up here mainly because it's an
> _EL1 register and we are already context switching PIRE0_EL1 in the EL1
> functions so it seemed consistent to follow the same approach for GCS.
> The _el1 and _user save/restore functions are called from the same place
> for both VHE and nVHE so the practical impact of the placement should be
> minimal AFAICT.  Unlike PIRE0_EL1 GCSCRE0_EL1 only has an impact for
> code running at EL0 so I can move it to the _user functions.

Exactly. That's where it belongs, because we never execute EL0 while a
vcpu is loaded. On the contrary, we can make use of a uaccess helper
while a vcpu is loaded, and that makes a hell of a difference.

And it makes a difference because it would allow the loading of
EL0-specific context differently. We had this at some point, and it
was a reasonable optimisation that we lost. I'm keen on bringing it
back.

> 
> TBH I'm not following your comments about switching eagerly too here at
> all, where would you expect to see the switching done?  You've said
> something along these lines before which prompted me to send a patch to
> only save the S1PIE registers if they'd been written to which you were
> quite reasonably not happy with given the extra traps it would cause:
> 
>    https://lore.kernel.org/r/20240301-kvm-arm64-defer-regs-v1-1-401e3de92e97@kernel.org
> 
> but I'm at a loss as to how to make things less eager otherwise.
>
> > > @@ -2306,7 +2323,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
> > >  		   ID_AA64PFR0_EL1_GIC |
> > >  		   ID_AA64PFR0_EL1_AdvSIMD |
> > >  		   ID_AA64PFR0_EL1_FP), },
> > > -	ID_SANITISED(ID_AA64PFR1_EL1),
> > > +	ID_WRITABLE(ID_AA64PFR1_EL1, ~(ID_AA64PFR1_EL1_RES0 |
> > > +				       ID_AA64PFR1_EL1_BT)),
> 
> > I don't know what you're trying to do here, but that's not right. If
> > you want to make this register writable, here's the shopping list:
> 
> > https://lore.kernel.org/all/87ikxsi0v9.wl-maz@kernel.org/
> 
> Yes, trying to make things writable.  I do see we need to exclude more
> bits there and I'm not clear why I excluded BTI, looks like I forgot to
> add a TODO comment at some point and finish that off.  Sorry about that.
> 
> In the linked mail you say you want to see all fields explicitly
> handled, could you be more direct about what such explicit handling

This email enumerates, point after point, everything that needs to be
done. I really cannot be clearer or more direct. This email is the
clearest I can be, short of writing the code myself. And I have decided
not to do it for once, unless I really need to. And as it turns out, I
don't.

> would look like?  I see a number of examples in the existing code like:
> 
> 	ID_WRITABLE(ID_AA64ZFR0_EL1, ~ID_AA64ZFR0_EL1_RES0),

This is clear: Everything is writable, and there are no bits here that
are otherwise conditional or unsupported.

> 
> 	ID_WRITABLE(ID_AA64ISAR0_EL1, ~ID_AA64ISAR0_EL1_RES0),

Same thing.

> 	ID_WRITABLE(ID_AA64ISAR1_EL1, ~(ID_AA64ISAR1_EL1_GPI |
> 					ID_AA64ISAR1_EL1_GPA |
> 					ID_AA64ISAR1_EL1_API |
> 					ID_AA64ISAR1_EL1_APA)),

This one needs fixing because of LS64, and I have an in-progress
series for it.

> which look to my eye very similar to the above, they do not visibly
> explicitly enumerate every field in the registers and given that there's
> a single mask specified it's not clear how that would look.  If
> ID_WRITABLE() took separate read/write masks and combined them it'd be
> more obvious but it's just not written that way.

I don't really see what it would buy us, but never mind.

	M.
Mark Brown July 10, 2024, 10:05 p.m. UTC | #4
On Wed, Jul 10, 2024 at 07:28:09PM +0100, Marc Zyngier wrote:
> Mark Brown <broonie@kernel.org> wrote:
> > On Wed, Jul 10, 2024 at 04:17:02PM +0100, Marc Zyngier wrote:

> > > > +	if (ctxt_has_gcs(ctxt)) {

> > > > Since this is conditioned on S1PIE, it should only be evaluated
> > > when PIE is enabled in the guest.

> > So make ctxt_has_gcs() embed a check of ctxt_has_s1pie()?

> No. I mean nest the whole thing *under* the check for S1PIE.

OK, increasing the level of nesting.  Got it.  Does that just apply for
the EL1 registers given that there's no _user S1PIE registers so no
existing check there?

Should we also be doing a similar thing for features that depend on TCR2
- currently that's just PIE but it'll grow?  Probably only when we get
more features rather than now since we don't currently check if the
guest has TCR2, just the system.

> > GCSCRE0_EL1 is for EL0 though, it ended up here mainly because it's an
> > _EL1 register and we are already context switching PIRE0_EL1 in the EL1
> > functions so it seemed consistent to follow the same approach for GCS.
> > The _el1 and _user save/restore functions are called from the same place
> > for both VHE and nVHE so the practical impact of the placement should be
> > minimal AFAICT.  Unlike PIRE0_EL1 GCSCRE0_EL1 only has an impact for
> > code running at EL0 so I can move it to the _user functions.

> Exactly. That's where it belongs, because we never execute EL0 while a
> vcpu is loaded. On the contrary, we can make use of a uaccess helper
> while a vcpu is loaded, and that makes a hell of a difference.

OK, to be clear here "it" is GCSCRE0_EL1, not GCSPR_EL1 and GCSCR_EL1
which are for EL1?

> And it makes a difference because it would allow the loading of
> EL0-specific context differently. We had this at some point, and it
> was a reasonable optimisation that we lost. I'm keen on bringing it
> back.

Ah, that'd be good - not only for the optimisation but also since at
the minute it's a bit unclear why there are separate EL0/1 functions.

> > > you want to make this register writable, here's the shopping list:

> > > https://lore.kernel.org/all/87ikxsi0v9.wl-maz@kernel.org/

> > In the linked mail you say you want to see all fields explicitly
> > handled, could you be more direct about what such explicit handling

> This email enumerates, point after point, everything that needs to be
> done. I really cannot be clearer or more direct. This email is the
> clearest I can be, short of writing the code myself. And I have decided
> not to do it for once, unless I really need to. And as it turns out, I
> don't.

See below, to be clear the only bit I was querying here was:

| - you *must* handle *all* the fields described in that register. There
|   are 15 valid fields there, and I want to see all 15 fields being
|   explicitly dealt with.

TBH it'd probably be good to have that whole list in the kernel somewhere.

> > would look like?  I see a number of examples in the existing code like:

> > 	ID_WRITABLE(ID_AA64ZFR0_EL1, ~ID_AA64ZFR0_EL1_RES0),

> This is clear: Everything is writable, and there are no bits here that
> are otherwise conditional or unsupported.

Ah, I think I see.  I would not have interpreted this as making
everything explicit, to me this makes all the writeable fields writeable
implicitly through them just not being mentioned.  For everything to be
explicit I would expect to see a direct, visible reference in the code
to every single field rather than something like we have here where some
of the fields are not mentioned directly.  The end result is an explicit
value but that's true for any use of ID_WRITABLE().

If my understanding is correct then were I writing the bit I quoted
above I'd probably just drop the "explicitly" from that bullet point due
to the handling of simple writable fields with ID_WRITABLE(), the key
point being that every field needs to be handled with the other points
enumerating the specific options for how each field might be handled.
Does my understanding sound correct?

> > which look to my eye very similar to the above, they do not visibly
> > explicitly enumerate every field in the registers and given that there's
> > a single mask specified it's not clear how that would look.  If
> > ID_WRITABLE() took separate read/write masks and combined them it'd be
> > more obvious but it's just not written that way.

> I don't really see what it would buy us, but never mind.

That was me trying to reconcile my understanding of you asking to make
everything explicit with the code as it is.  I suppose the advantage
would be documentation.
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 36b8e97bf49e..316fb412f355 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -411,6 +411,10 @@  enum vcpu_sysreg {
 	GCR_EL1,	/* Tag Control Register */
 	TFSRE0_EL1,	/* Tag Fault Status Register (EL0) */
 
+	/* Guarded Control Stack registers */
+	GCSCRE0_EL1,	/* Guarded Control Stack Control (EL0) */
+	GCSPR_EL0,	/* Guarded Control Stack Pointer (EL0) */
+
 	/* 32bit specific registers. */
 	DACR32_EL2,	/* Domain Access Control Register */
 	IFSR32_EL2,	/* Instruction Fault Status Register */
@@ -481,6 +485,10 @@  enum vcpu_sysreg {
 	VNCR(PIR_EL1),	 /* Permission Indirection Register 1 (EL1) */
 	VNCR(PIRE0_EL1), /*  Permission Indirection Register 0 (EL1) */
 
+	/* Guarded Control Stack registers */
+	VNCR(GCSPR_EL1),	/* Guarded Control Stack Pointer (EL1) */
+	VNCR(GCSCR_EL1),	/* Guarded Control Stack Control (EL1) */
+
 	VNCR(HFGRTR_EL2),
 	VNCR(HFGWTR_EL2),
 	VNCR(HFGITR_EL2),
@@ -1343,6 +1351,12 @@  static inline bool __vcpu_has_feature(const struct kvm_arch *ka, int feature)
 
 #define kvm_vcpu_initialized(v) vcpu_get_flag(vcpu, VCPU_INITIALIZED)
 
+static inline bool has_gcs(void)
+{
+	return IS_ENABLED(CONFIG_ARM64_GCS) &&
+		cpus_have_final_cap(ARM64_HAS_GCS);
+}
+
 int kvm_trng_call(struct kvm_vcpu *vcpu);
 #ifdef CONFIG_KVM
 extern phys_addr_t hyp_mem_base;
diff --git a/arch/arm64/include/asm/vncr_mapping.h b/arch/arm64/include/asm/vncr_mapping.h
index df2c47c55972..5e83e6f579fd 100644
--- a/arch/arm64/include/asm/vncr_mapping.h
+++ b/arch/arm64/include/asm/vncr_mapping.h
@@ -88,6 +88,8 @@ 
 #define VNCR_PMSIRR_EL1         0x840
 #define VNCR_PMSLATFR_EL1       0x848
 #define VNCR_TRFCR_EL1          0x880
+#define VNCR_GCSPR_EL1		0x8C0
+#define VNCR_GCSCR_EL1		0x8D0
 #define VNCR_MPAM1_EL1          0x900
 #define VNCR_MPAMHCR_EL2        0x930
 #define VNCR_MPAMVPMV_EL2       0x938
diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
index 4be6a7fa0070..b20212d80e9b 100644
--- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
+++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
@@ -16,6 +16,27 @@ 
 #include <asm/kvm_hyp.h>
 #include <asm/kvm_mmu.h>
 
+static inline struct kvm_vcpu *ctxt_to_vcpu(struct kvm_cpu_context *ctxt)
+{
+	struct kvm_vcpu *vcpu = ctxt->__hyp_running_vcpu;
+
+	if (!vcpu)
+		vcpu = container_of(ctxt, struct kvm_vcpu, arch.ctxt);
+
+	return vcpu;
+}
+
+static inline bool ctxt_has_gcs(struct kvm_cpu_context *ctxt)
+{
+	struct kvm_vcpu *vcpu;
+
+	if (!cpus_have_final_cap(ARM64_HAS_GCS))
+		return false;
+
+	vcpu = ctxt_to_vcpu(ctxt);
+	return kvm_has_feat(kern_hyp_va(vcpu->kvm), ID_AA64PFR1_EL1, GCS, IMP);
+}
+
 static inline void __sysreg_save_common_state(struct kvm_cpu_context *ctxt)
 {
 	ctxt_sys_reg(ctxt, MDSCR_EL1)	= read_sysreg(mdscr_el1);
@@ -25,16 +46,8 @@  static inline void __sysreg_save_user_state(struct kvm_cpu_context *ctxt)
 {
 	ctxt_sys_reg(ctxt, TPIDR_EL0)	= read_sysreg(tpidr_el0);
 	ctxt_sys_reg(ctxt, TPIDRRO_EL0)	= read_sysreg(tpidrro_el0);
-}
-
-static inline struct kvm_vcpu *ctxt_to_vcpu(struct kvm_cpu_context *ctxt)
-{
-	struct kvm_vcpu *vcpu = ctxt->__hyp_running_vcpu;
-
-	if (!vcpu)
-		vcpu = container_of(ctxt, struct kvm_vcpu, arch.ctxt);
-
-	return vcpu;
+	if (ctxt_has_gcs(ctxt))
+		ctxt_sys_reg(ctxt, GCSPR_EL0) = read_sysreg_s(SYS_GCSPR_EL0);
 }
 
 static inline bool ctxt_has_mte(struct kvm_cpu_context *ctxt)
@@ -80,6 +93,12 @@  static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
 	ctxt_sys_reg(ctxt, PAR_EL1)	= read_sysreg_par();
 	ctxt_sys_reg(ctxt, TPIDR_EL1)	= read_sysreg(tpidr_el1);
 
+	if (ctxt_has_gcs(ctxt)) {
+		ctxt_sys_reg(ctxt, GCSPR_EL1)	= read_sysreg_el1(SYS_GCSPR);
+		ctxt_sys_reg(ctxt, GCSCR_EL1)	= read_sysreg_el1(SYS_GCSCR);
+		ctxt_sys_reg(ctxt, GCSCRE0_EL1)	= read_sysreg_s(SYS_GCSCRE0_EL1);
+	}
+
 	if (ctxt_has_mte(ctxt)) {
 		ctxt_sys_reg(ctxt, TFSR_EL1) = read_sysreg_el1(SYS_TFSR);
 		ctxt_sys_reg(ctxt, TFSRE0_EL1) = read_sysreg_s(SYS_TFSRE0_EL1);
@@ -113,6 +132,8 @@  static inline void __sysreg_restore_user_state(struct kvm_cpu_context *ctxt)
 {
 	write_sysreg(ctxt_sys_reg(ctxt, TPIDR_EL0),	tpidr_el0);
 	write_sysreg(ctxt_sys_reg(ctxt, TPIDRRO_EL0),	tpidrro_el0);
+	if (ctxt_has_gcs(ctxt))
+		write_sysreg_s(ctxt_sys_reg(ctxt, GCSPR_EL0), SYS_GCSPR_EL0);
 }
 
 static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
@@ -156,6 +177,13 @@  static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
 	write_sysreg(ctxt_sys_reg(ctxt, PAR_EL1),	par_el1);
 	write_sysreg(ctxt_sys_reg(ctxt, TPIDR_EL1),	tpidr_el1);
 
+	if (ctxt_has_gcs(ctxt)) {
+		write_sysreg_el1(ctxt_sys_reg(ctxt, GCSPR_EL1),	SYS_GCSPR);
+		write_sysreg_el1(ctxt_sys_reg(ctxt, GCSCR_EL1),	SYS_GCSCR);
+		write_sysreg_s(ctxt_sys_reg(ctxt, GCSCRE0_EL1),
+			       SYS_GCSCRE0_EL1);
+	}
+
 	if (ctxt_has_mte(ctxt)) {
 		write_sysreg_el1(ctxt_sys_reg(ctxt, TFSR_EL1), SYS_TFSR);
 		write_sysreg_s(ctxt_sys_reg(ctxt, TFSRE0_EL1), SYS_TFSRE0_EL1);
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 22b45a15d068..cf068dcfbd49 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -2015,6 +2015,23 @@  static unsigned int mte_visibility(const struct kvm_vcpu *vcpu,
 	.visibility = mte_visibility,		\
 }
 
+static unsigned int gcs_visibility(const struct kvm_vcpu *vcpu,
+				   const struct sys_reg_desc *rd)
+{
+	if (has_gcs())
+		return 0;
+
+	return REG_HIDDEN;
+}
+
+#define GCS_REG(name) {				\
+	SYS_DESC(SYS_##name),			\
+	.access = undef_access,			\
+	.reset = reset_unknown,			\
+	.reg = name,				\
+	.visibility = gcs_visibility,		\
+}
+
 static unsigned int el2_visibility(const struct kvm_vcpu *vcpu,
 				   const struct sys_reg_desc *rd)
 {
@@ -2306,7 +2323,8 @@  static const struct sys_reg_desc sys_reg_descs[] = {
 		   ID_AA64PFR0_EL1_GIC |
 		   ID_AA64PFR0_EL1_AdvSIMD |
 		   ID_AA64PFR0_EL1_FP), },
-	ID_SANITISED(ID_AA64PFR1_EL1),
+	ID_WRITABLE(ID_AA64PFR1_EL1, ~(ID_AA64PFR1_EL1_RES0 |
+				       ID_AA64PFR1_EL1_BT)),
 	ID_UNALLOCATED(4,2),
 	ID_UNALLOCATED(4,3),
 	ID_WRITABLE(ID_AA64ZFR0_EL1, ~ID_AA64ZFR0_EL1_RES0),
@@ -2390,6 +2408,10 @@  static const struct sys_reg_desc sys_reg_descs[] = {
 	PTRAUTH_KEY(APDB),
 	PTRAUTH_KEY(APGA),
 
+	GCS_REG(GCSCR_EL1),
+	GCS_REG(GCSPR_EL1),
+	GCS_REG(GCSCRE0_EL1),
+
 	{ SYS_DESC(SYS_SPSR_EL1), access_spsr},
 	{ SYS_DESC(SYS_ELR_EL1), access_elr},
 
@@ -2476,6 +2498,7 @@  static const struct sys_reg_desc sys_reg_descs[] = {
 	{ SYS_DESC(SYS_SMIDR_EL1), undef_access },
 	{ SYS_DESC(SYS_CSSELR_EL1), access_csselr, reset_unknown, CSSELR_EL1 },
 	{ SYS_DESC(SYS_CTR_EL0), access_ctr },
+	GCS_REG(GCSPR_EL0),
 	{ SYS_DESC(SYS_SVCR), undef_access },
 
 	{ PMU_SYS_REG(PMCR_EL0), .access = access_pmcr, .reset = reset_pmcr,