diff mbox

[v3] arm64: fix VTTBR_BADDR_MASK

Message ID 20140804143845.2851.85441.stgit@joelaarch64.amd.com
State New
Headers show

Commit Message

Joel Schopp Aug. 4, 2014, 2:38 p.m. UTC
The current VTTBR_BADDR_MASK only masks 39 bits, which is broken on current
systems.  Rather than just add a bit it seems like a good time to also set
things at run-time instead of compile time to accommodate more hardware.

This patch sets TCR_EL2.PS, VTCR_EL2.T0SZ and vttbr_baddr_mask at runtime,
not at compile time.

In ARMv8, the EL2 physical address size (TCR_EL2.PS) and stage2 input address
size (VTCR_EL2.T0SZ) cannot be determined at compile time since they
depend on hardware capability.

According to Table D4-23 and Table D4-25 in ARM DDI 0487A.b document,
vttbr_x is calculated using different hard-coded values with consideration
of T0SZ, granule size and the level of translation tables. Therefore,
vttbr_baddr_mask should be determined dynamically.

Changes since v2:
Rebased on https://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git next branch

Changes since v1:
Rebased fix on Jungseok Lee's patch https://lkml.org/lkml/2014/5/12/189 to
provide a better long-term fix.  Updated that patch to log an error instead of
silently failing on an unaligned vttbr.

Cc: Christoffer Dall <christoffer.dall@linaro.org>
Cc: Sungjinn Chung <sungjinn.chung@samsung.com>
Signed-off-by: Jungseok Lee <jays.lee@samsung.com>
Signed-off-by: Joel Schopp <joel.schopp@amd.com>
---
 arch/arm/kvm/arm.c               |   91 +++++++++++++++++++++++++++++++++++++-
 arch/arm64/include/asm/kvm_arm.h |   17 +------
 arch/arm64/kvm/hyp-init.S        |   20 ++++++--
 3 files changed, 106 insertions(+), 22 deletions(-)

Comments

Joel Schopp Aug. 4, 2014, 2:42 p.m. UTC | #1
Since this fixes a real problem and didn't make it into 3.16 it would be
good if this made it into 3.17.

-Joel

On 08/04/2014 09:38 AM, Joel Schopp wrote:
> The current VTTBR_BADDR_MASK only masks 39 bits, which is broken on current
> systems.  Rather than just add a bit it seems like a good time to also set
> things at run-time instead of compile time to accomodate more hardware.
>
> This patch sets TCR_EL2.PS, VTCR_EL2.T0SZ and vttbr_baddr_mask in runtime,
> not compile time.
>
> In ARMv8, EL2 physical address size (TCR_EL2.PS) and stage2 input address
> size (VTCR_EL2.T0SZE) cannot be determined in compile time since they
> depends on hardware capability.
>
> According to Table D4-23 and Table D4-25 in ARM DDI 0487A.b document,
> vttbr_x is calculated using different hard-coded values with consideration
> of T0SZ, granule size and the level of translation tables. Therefore,
> vttbr_baddr_mask should be determined dynamically.
>
> Changes since v2:
> Rebased on https://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git next branch
>
> Changes since v1:
> Rebased fix on Jungseok Lee's patch https://lkml.org/lkml/2014/5/12/189 to
> provide better long term fix.  Updated that patch to log error instead of
> silently fail on unaligned vttbr.
>
> Cc: Christoffer Dall <christoffer.dall@linaro.org>
> Cc: Sungjinn Chung <sungjinn.chung@samsung.com>
> Signed-off-by: Jungseok Lee <jays.lee@samsung.com>
> Signed-off-by: Joel Schopp <joel.schopp@amd.com>
> ---
>  arch/arm/kvm/arm.c               |   91 +++++++++++++++++++++++++++++++++++++-
>  arch/arm64/include/asm/kvm_arm.h |   17 +------
>  arch/arm64/kvm/hyp-init.S        |   20 ++++++--
>  3 files changed, 106 insertions(+), 22 deletions(-)
>
> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> index d7424ef..d7ca2f5 100644
> --- a/arch/arm/kvm/arm.c
> +++ b/arch/arm/kvm/arm.c
> @@ -37,6 +37,7 @@
>  #include <asm/mman.h>
>  #include <asm/tlbflush.h>
>  #include <asm/cacheflush.h>
> +#include <asm/cputype.h>
>  #include <asm/virt.h>
>  #include <asm/kvm_arm.h>
>  #include <asm/kvm_asm.h>
> @@ -61,6 +62,9 @@ static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
>  static u8 kvm_next_vmid;
>  static DEFINE_SPINLOCK(kvm_vmid_lock);
>  
> +/* VTTBR mask cannot be determined in complie time under ARMv8 */
> +static u64 vttbr_baddr_mask;
> +
>  static bool vgic_present;
>  
>  static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
> @@ -376,6 +380,75 @@ static bool need_new_vmid_gen(struct kvm *kvm)
>  }
>  
>  /**
> + * set_vttbr_baddr_mask - set mask value for vttbr base address
> + *
> + * In ARMv8, vttbr_baddr_mask cannot be determined in compile time since stage2
> + * input address size depends on hardware capability. Thus, it is needed to read
> + * ID_AA64MMFR0_EL1.PARange first and then set vttbr_baddr_mask with
> + * consideration of both granule size and the level of translation tables.
> + */
> +static int set_vttbr_baddr_mask(void)
> +{
> +#ifndef CONFIG_ARM64
> +	vttbr_baddr_mask = VTTBR_BADDR_MASK;
> +#else
> +	int pa_range, t0sz, vttbr_x;
> +
> +	pa_range = read_cpuid(ID_AA64MMFR0_EL1) & 0xf;
> +
> +	switch (pa_range) {
> +	case 0:
> +		t0sz = VTCR_EL2_T0SZ(32);
> +		break;
> +	case 1:
> +		t0sz = VTCR_EL2_T0SZ(36);
> +		break;
> +	case 2:
> +		t0sz = VTCR_EL2_T0SZ(40);
> +		break;
> +	case 3:
> +		t0sz = VTCR_EL2_T0SZ(42);
> +		break;
> +	case 4:
> +		t0sz = VTCR_EL2_T0SZ(44);
> +		break;
> +	default:
> +		t0sz = VTCR_EL2_T0SZ(48);
> +	}
> +
> +	/*
> +	 * See Table D4-23 and Table D4-25 in ARM DDI 0487A.b to figure out
> +	 * the origin of the hardcoded values, 38 and 37.
> +	 */
> +#ifdef CONFIG_ARM64_64K_PAGES
> +	/*
> +	 * 16 <= T0SZ <= 21 is valid under 3 level of translation tables
> +	 * 18 <= T0SZ <= 34 is valid under 2 level of translation tables
> +	 * 31 <= T0SZ <= 39 is valid under 1 level of transltaion tables
> +	 */
> +	if (t0sz <= 17) {
> +		kvm_err("Cannot support %d-bit address space\n", 64 - t0sz);
> +		return -EINVAL;
> +	}
> +	vttbr_x = 38 - t0sz;
> +#else
> +	/*
> +	 * 16 <= T0SZ <= 24 is valid under 4 level of translation tables
> +	 * 21 <= T0SZ <= 30 is valid under 3 level of translation tables
> +	 * 30 <= T0SZ <= 39 is valid under 2 level of translation tables
> +	 */
> +	if (t0sz <= 20) {
> +		kvm_err("Cannot support %d-bit address space\n", 64 - t0sz);
> +		return -EINVAL;
> +	}
> +	vttbr_x = 37 - t0sz;
> +#endif
> +	vttbr_baddr_mask = (((1LLU << (48 - vttbr_x)) - 1) << (vttbr_x - 1));
> +#endif
> +	return 0;
> +}
> +
> +/**
>   * update_vttbr - Update the VTTBR with a valid VMID before the guest runs
>   * @kvm	The guest that we are about to run
>   *
> @@ -429,8 +502,16 @@ static void update_vttbr(struct kvm *kvm)
>  	/* update vttbr to be used with the new vmid */
>  	pgd_phys = virt_to_phys(kvm->arch.pgd);
>  	vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK;
> -	kvm->arch.vttbr = pgd_phys & VTTBR_BADDR_MASK;
> -	kvm->arch.vttbr |= vmid;
> +
> +	/*
> +	 * If the VTTBR isn't aligned there is something wrong with the system
> +	 * or kernel.  It is better to just fail and not mask it. But no need
> +	 * to panic the host kernel with a BUG_ON(), instead just log the error.
> +	 */
> +	if (pgd_phys & ~vttbr_baddr_mask)
> +		kvm_err("VTTBR not aligned, expect guest to fail");
> +
> +	kvm->arch.vttbr = pgd_phys | vmid;
>  
>  	spin_unlock(&kvm_vmid_lock);
>  }
> @@ -1015,6 +1096,12 @@ int kvm_arch_init(void *opaque)
>  		}
>  	}
>  
> +	err = set_vttbr_baddr_mask();
> +	if (err) {
> +		kvm_err("Cannot set vttbr_baddr_mask\n");
> +		return -EINVAL;
> +	}
> +
>  	cpu_notifier_register_begin();
>  
>  	err = init_hyp_mode();
> diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
> index cc83520..ff4a4fa 100644
> --- a/arch/arm64/include/asm/kvm_arm.h
> +++ b/arch/arm64/include/asm/kvm_arm.h
> @@ -95,7 +95,6 @@
>  /* TCR_EL2 Registers bits */
>  #define TCR_EL2_TBI	(1 << 20)
>  #define TCR_EL2_PS	(7 << 16)
> -#define TCR_EL2_PS_40B	(2 << 16)
>  #define TCR_EL2_TG0	(1 << 14)
>  #define TCR_EL2_SH0	(3 << 12)
>  #define TCR_EL2_ORGN0	(3 << 10)
> @@ -104,8 +103,6 @@
>  #define TCR_EL2_MASK	(TCR_EL2_TG0 | TCR_EL2_SH0 | \
>  			 TCR_EL2_ORGN0 | TCR_EL2_IRGN0 | TCR_EL2_T0SZ)
>  
> -#define TCR_EL2_FLAGS	(TCR_EL2_PS_40B)
> -
>  /* VTCR_EL2 Registers bits */
>  #define VTCR_EL2_PS_MASK	(7 << 16)
>  #define VTCR_EL2_TG0_MASK	(1 << 14)
> @@ -120,36 +117,28 @@
>  #define VTCR_EL2_SL0_MASK	(3 << 6)
>  #define VTCR_EL2_SL0_LVL1	(1 << 6)
>  #define VTCR_EL2_T0SZ_MASK	0x3f
> -#define VTCR_EL2_T0SZ_40B	24
> +#define VTCR_EL2_T0SZ(bits)	(64 - (bits))
>  
>  #ifdef CONFIG_ARM64_64K_PAGES
>  /*
>   * Stage2 translation configuration:
> - * 40bits output (PS = 2)
> - * 40bits input  (T0SZ = 24)
>   * 64kB pages (TG0 = 1)
>   * 2 level page tables (SL = 1)
>   */
>  #define VTCR_EL2_FLAGS		(VTCR_EL2_TG0_64K | VTCR_EL2_SH0_INNER | \
>  				 VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
> -				 VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B)
> -#define VTTBR_X		(38 - VTCR_EL2_T0SZ_40B)
> +				 VTCR_EL2_SL0_LVL1)
>  #else
>  /*
>   * Stage2 translation configuration:
> - * 40bits output (PS = 2)
> - * 40bits input  (T0SZ = 24)
>   * 4kB pages (TG0 = 0)
>   * 3 level page tables (SL = 1)
>   */
>  #define VTCR_EL2_FLAGS		(VTCR_EL2_TG0_4K | VTCR_EL2_SH0_INNER | \
>  				 VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
> -				 VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B)
> -#define VTTBR_X		(37 - VTCR_EL2_T0SZ_40B)
> +				 VTCR_EL2_SL0_LVL1)
>  #endif
>  
> -#define VTTBR_BADDR_SHIFT (VTTBR_X - 1)
> -#define VTTBR_BADDR_MASK  (((1LLU << (40 - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT)
>  #define VTTBR_VMID_SHIFT  (48LLU)
>  #define VTTBR_VMID_MASK	  (0xffLLU << VTTBR_VMID_SHIFT)
>  
> diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S
> index d968796..c0f7634 100644
> --- a/arch/arm64/kvm/hyp-init.S
> +++ b/arch/arm64/kvm/hyp-init.S
> @@ -63,17 +63,21 @@ __do_hyp_init:
>  	mrs	x4, tcr_el1
>  	ldr	x5, =TCR_EL2_MASK
>  	and	x4, x4, x5
> -	ldr	x5, =TCR_EL2_FLAGS
> -	orr	x4, x4, x5
> -	msr	tcr_el2, x4
> -
> -	ldr	x4, =VTCR_EL2_FLAGS
>  	/*
>  	 * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS bits in
> -	 * VTCR_EL2.
> +	 * TCR_EL2 and both PS bits and T0SZ bits in VTCR_EL2.
>  	 */
>  	mrs	x5, ID_AA64MMFR0_EL1
>  	bfi	x4, x5, #16, #3
> +	msr	tcr_el2, x4
> +
> +	ldr	x4, =VTCR_EL2_FLAGS
> +	bfi	x4, x5, #16, #3
> +	and	x5, x5, #0xf
> +	adr	x6, t0sz
> +	add	x6, x6, x5, lsl #2
> +	ldr	w5, [x6]
> +	orr	x4, x4, x5
>  	msr	vtcr_el2, x4
>  
>  	mrs	x4, mair_el1
> @@ -109,6 +113,10 @@ target: /* We're now in the trampoline code, switch page tables */
>  
>  	/* Hello, World! */
>  	eret
> +
> +t0sz:
> +	.word	VTCR_EL2_T0SZ(32), VTCR_EL2_T0SZ(36), VTCR_EL2_T0SZ(40)
> +	.word	VTCR_EL2_T0SZ(42), VTCR_EL2_T0SZ(44), VTCR_EL2_T0SZ(48)
>  ENDPROC(__kvm_hyp_init)
>  
>  	.ltorg
>
Christoffer Dall Aug. 4, 2014, 3:18 p.m. UTC | #2
On Mon, Aug 04, 2014 at 09:42:46AM -0500, Joel Schopp wrote:
> Since this fixes a real problem and didn't make it into 3.16 it would be
> good if this made it into 3.17.
> 
It's too late for the merge window, we have to review and test as I told
you in a private e-mail.

We will review this and test it, and if it makes sense merge as a fix
for 3.17 before the release.

Please be a little patient given the number of patches on the list
currently.

Thanks,
-Christoffer
Christoffer Dall Aug. 8, 2014, 8:46 p.m. UTC | #3
On Mon, Aug 04, 2014 at 09:38:45AM -0500, Joel Schopp wrote:
> The current VTTBR_BADDR_MASK only masks 39 bits, which is broken on current
> systems.  Rather than just add a bit it seems like a good time to also set
> things at run-time instead of compile time to accomodate more hardware.
> 
> This patch sets TCR_EL2.PS, VTCR_EL2.T0SZ and vttbr_baddr_mask in runtime,
> not compile time.
> 
> In ARMv8, EL2 physical address size (TCR_EL2.PS) and stage2 input address
> size (VTCR_EL2.T0SZE) cannot be determined in compile time since they
> depends on hardware capability.

  depend

> 
> According to Table D4-23 and Table D4-25 in ARM DDI 0487A.b document,
> vttbr_x is calculated using different hard-coded values with consideration

                                        fixed values

> of T0SZ, granule size and the level of translation tables. Therefore,
> vttbr_baddr_mask should be determined dynamically.
> 
> Changes since v2:
> Rebased on https://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git next branch
> 
> Changes since v1:
> Rebased fix on Jungseok Lee's patch https://lkml.org/lkml/2014/5/12/189 to
> provide better long term fix.  Updated that patch to log error instead of
> silently fail on unaligned vttbr.
> 
> Cc: Christoffer Dall <christoffer.dall@linaro.org>
> Cc: Sungjinn Chung <sungjinn.chung@samsung.com>
> Signed-off-by: Jungseok Lee <jays.lee@samsung.com>
> Signed-off-by: Joel Schopp <joel.schopp@amd.com>
> ---
>  arch/arm/kvm/arm.c               |   91 +++++++++++++++++++++++++++++++++++++-
>  arch/arm64/include/asm/kvm_arm.h |   17 +------
>  arch/arm64/kvm/hyp-init.S        |   20 ++++++--
>  3 files changed, 106 insertions(+), 22 deletions(-)
> 
> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> index d7424ef..d7ca2f5 100644
> --- a/arch/arm/kvm/arm.c
> +++ b/arch/arm/kvm/arm.c
> @@ -37,6 +37,7 @@
>  #include <asm/mman.h>
>  #include <asm/tlbflush.h>
>  #include <asm/cacheflush.h>
> +#include <asm/cputype.h>
>  #include <asm/virt.h>
>  #include <asm/kvm_arm.h>
>  #include <asm/kvm_asm.h>
> @@ -61,6 +62,9 @@ static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
>  static u8 kvm_next_vmid;
>  static DEFINE_SPINLOCK(kvm_vmid_lock);
>  
> +/* VTTBR mask cannot be determined in complie time under ARMv8 */

I don't think we need this comment.

> +static u64 vttbr_baddr_mask;
> +
>  static bool vgic_present;
>  
>  static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
> @@ -376,6 +380,75 @@ static bool need_new_vmid_gen(struct kvm *kvm)
>  }
>  
>  /**
> + * set_vttbr_baddr_mask - set mask value for vttbr base address
> + *
> + * In ARMv8, vttbr_baddr_mask cannot be determined in compile time since stage2

the stage2 input address

> + * input address size depends on hardware capability. Thus, it is needed to read

Thus, we first need to read...

> + * ID_AA64MMFR0_EL1.PARange first and then set vttbr_baddr_mask with
> + * consideration of both granule size and the level of translation tables.

                            the granule ...
> + */
> +static int set_vttbr_baddr_mask(void)
> +{
> +#ifndef CONFIG_ARM64
> +	vttbr_baddr_mask = VTTBR_BADDR_MASK;
> +#else
> +	int pa_range, t0sz, vttbr_x;
> +
> +	pa_range = read_cpuid(ID_AA64MMFR0_EL1) & 0xf;
> +
> +	switch (pa_range) {
> +	case 0:
> +		t0sz = VTCR_EL2_T0SZ(32);
> +		break;
> +	case 1:
> +		t0sz = VTCR_EL2_T0SZ(36);
> +		break;
> +	case 2:
> +		t0sz = VTCR_EL2_T0SZ(40);
> +		break;
> +	case 3:
> +		t0sz = VTCR_EL2_T0SZ(42);
> +		break;
> +	case 4:
> +		t0sz = VTCR_EL2_T0SZ(44);
> +		break;
> +	default:
> +		t0sz = VTCR_EL2_T0SZ(48);
> +	}

the last case would be case 5 and the default case would be a BUG().

> +
> +	/*
> +	 * See Table D4-23 and Table D4-25 in ARM DDI 0487A.b to figure out
> +	 * the origin of the hardcoded values, 38 and 37.
> +	 */
> +#ifdef CONFIG_ARM64_64K_PAGES
> +	/*
> +	 * 16 <= T0SZ <= 21 is valid under 3 level of translation tables
> +	 * 18 <= T0SZ <= 34 is valid under 2 level of translation tables
> +	 * 31 <= T0SZ <= 39 is valid under 1 level of transltaion tables
> +	 */

so this scheme is with concatenated initial level stage-2 page tables.

But we only ever allocate the amount of pages for our pgd according to
what the host has, so I think this allocation needs to be locked down
more tight, because the host is always using the appropriate amount for
39 bits virtual addresses.

If you want to allow this full range, I think you need to change the
allocation logic and go through arch/arm/kvm/mmu.c and look for all uses
of pgd_offset() and friends and make sure we're doing the right thing.

> +	if (t0sz <= 17) {

I think this is more readable if you say (t0sz < 18) and don't you also
need to check the upper bound?:

(t0sz < 18 || t0sz > 34)

> +		kvm_err("Cannot support %d-bit address space\n", 64 - t0sz);
> +		return -EINVAL;
> +	}
> +	vttbr_x = 38 - t0sz;
> +#else
> +	/*
> +	 * 16 <= T0SZ <= 24 is valid under 4 level of translation tables
> +	 * 21 <= T0SZ <= 30 is valid under 3 level of translation tables

under the current scheme, shouldn't this be:
           21 <= T0SZ <= 33

but same comment as above applies.

> +	 * 30 <= T0SZ <= 39 is valid under 2 level of translation tables
> +	 */
> +	if (t0sz <= 20) {

same as above

> +		kvm_err("Cannot support %d-bit address space\n", 64 - t0sz);
> +		return -EINVAL;
> +	}
> +	vttbr_x = 37 - t0sz;
> +#endif
> +	vttbr_baddr_mask = (((1LLU << (48 - vttbr_x)) - 1) << (vttbr_x - 1));
> +#endif

This nested ifdef is really quite horrible.  Can you either factor these
out into some static inlines in arch/arm[64]/include/asm/kvm_mmu.h or
provide a per-architecture implementation in a .c file?

> +	return 0;
> +}
> +
> +/**
>   * update_vttbr - Update the VTTBR with a valid VMID before the guest runs
>   * @kvm	The guest that we are about to run
>   *
> @@ -429,8 +502,16 @@ static void update_vttbr(struct kvm *kvm)
>  	/* update vttbr to be used with the new vmid */
>  	pgd_phys = virt_to_phys(kvm->arch.pgd);
>  	vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK;
> -	kvm->arch.vttbr = pgd_phys & VTTBR_BADDR_MASK;
> -	kvm->arch.vttbr |= vmid;
> +
> +	/*
> +	 * If the VTTBR isn't aligned there is something wrong with the system
> +	 * or kernel.  It is better to just fail and not mask it. But no need
> +	 * to panic the host kernel with a BUG_ON(), instead just log the error.
> +	 */

These last two sentences are not very helpful, because they don't
describe the rationale for what you're doing, only what you are (and are
not) doing.

That said, I don't think this is doing the right thing.  I think you
want to refuse running the VM and avoid any stage-2 entried being
created if this is not the case (actually, we may want to check this
after set_vttbr_baddr_mask() or right aftert allocating the stage-2
pgd), because otherwise I think we may be overwriting memory not
belonging to us with concatenated page tables in a 42-bit 4KB system,
for example.

> +	if (pgd_phys & ~vttbr_baddr_mask)
> +		kvm_err("VTTBR not aligned, expect guest to fail");
> +
> +	kvm->arch.vttbr = pgd_phys | vmid;
>  
>  	spin_unlock(&kvm_vmid_lock);
>  }
> @@ -1015,6 +1096,12 @@ int kvm_arch_init(void *opaque)
>  		}
>  	}
>  
> +	err = set_vttbr_baddr_mask();
> +	if (err) {
> +		kvm_err("Cannot set vttbr_baddr_mask\n");
> +		return -EINVAL;
> +	}
> +
>  	cpu_notifier_register_begin();
>  
>  	err = init_hyp_mode();
> diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
> index cc83520..ff4a4fa 100644
> --- a/arch/arm64/include/asm/kvm_arm.h
> +++ b/arch/arm64/include/asm/kvm_arm.h
> @@ -95,7 +95,6 @@
>  /* TCR_EL2 Registers bits */
>  #define TCR_EL2_TBI	(1 << 20)
>  #define TCR_EL2_PS	(7 << 16)
> -#define TCR_EL2_PS_40B	(2 << 16)
>  #define TCR_EL2_TG0	(1 << 14)
>  #define TCR_EL2_SH0	(3 << 12)
>  #define TCR_EL2_ORGN0	(3 << 10)
> @@ -104,8 +103,6 @@
>  #define TCR_EL2_MASK	(TCR_EL2_TG0 | TCR_EL2_SH0 | \
>  			 TCR_EL2_ORGN0 | TCR_EL2_IRGN0 | TCR_EL2_T0SZ)
>  
> -#define TCR_EL2_FLAGS	(TCR_EL2_PS_40B)
> -
>  /* VTCR_EL2 Registers bits */
>  #define VTCR_EL2_PS_MASK	(7 << 16)
>  #define VTCR_EL2_TG0_MASK	(1 << 14)
> @@ -120,36 +117,28 @@
>  #define VTCR_EL2_SL0_MASK	(3 << 6)
>  #define VTCR_EL2_SL0_LVL1	(1 << 6)
>  #define VTCR_EL2_T0SZ_MASK	0x3f
> -#define VTCR_EL2_T0SZ_40B	24
> +#define VTCR_EL2_T0SZ(bits)	(64 - (bits))
>  
>  #ifdef CONFIG_ARM64_64K_PAGES
>  /*
>   * Stage2 translation configuration:
> - * 40bits output (PS = 2)
> - * 40bits input  (T0SZ = 24)
>   * 64kB pages (TG0 = 1)
>   * 2 level page tables (SL = 1)
>   */
>  #define VTCR_EL2_FLAGS		(VTCR_EL2_TG0_64K | VTCR_EL2_SH0_INNER | \
>  				 VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
> -				 VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B)
> -#define VTTBR_X		(38 - VTCR_EL2_T0SZ_40B)
> +				 VTCR_EL2_SL0_LVL1)
>  #else
>  /*
>   * Stage2 translation configuration:
> - * 40bits output (PS = 2)
> - * 40bits input  (T0SZ = 24)
>   * 4kB pages (TG0 = 0)
>   * 3 level page tables (SL = 1)
>   */
>  #define VTCR_EL2_FLAGS		(VTCR_EL2_TG0_4K | VTCR_EL2_SH0_INNER | \
>  				 VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
> -				 VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B)
> -#define VTTBR_X		(37 - VTCR_EL2_T0SZ_40B)
> +				 VTCR_EL2_SL0_LVL1)
>  #endif
>  
> -#define VTTBR_BADDR_SHIFT (VTTBR_X - 1)
> -#define VTTBR_BADDR_MASK  (((1LLU << (40 - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT)
>  #define VTTBR_VMID_SHIFT  (48LLU)
>  #define VTTBR_VMID_MASK	  (0xffLLU << VTTBR_VMID_SHIFT)
>  
> diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S
> index d968796..c0f7634 100644
> --- a/arch/arm64/kvm/hyp-init.S
> +++ b/arch/arm64/kvm/hyp-init.S
> @@ -63,17 +63,21 @@ __do_hyp_init:
>  	mrs	x4, tcr_el1
>  	ldr	x5, =TCR_EL2_MASK
>  	and	x4, x4, x5
> -	ldr	x5, =TCR_EL2_FLAGS
> -	orr	x4, x4, x5
> -	msr	tcr_el2, x4
> -
> -	ldr	x4, =VTCR_EL2_FLAGS
>  	/*
>  	 * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS bits in
> -	 * VTCR_EL2.
> +	 * TCR_EL2 and both PS bits and T0SZ bits in VTCR_EL2.
>  	 */
>  	mrs	x5, ID_AA64MMFR0_EL1
>  	bfi	x4, x5, #16, #3
> +	msr	tcr_el2, x4
> +
> +	ldr	x4, =VTCR_EL2_FLAGS
> +	bfi	x4, x5, #16, #3
> +	and	x5, x5, #0xf
> +	adr	x6, t0sz
> +	add	x6, x6, x5, lsl #2
> +	ldr	w5, [x6]
> +	orr	x4, x4, x5
>  	msr	vtcr_el2, x4
>  
>  	mrs	x4, mair_el1
> @@ -109,6 +113,10 @@ target: /* We're now in the trampoline code, switch page tables */
>  
>  	/* Hello, World! */
>  	eret
> +
> +t0sz:
> +	.word	VTCR_EL2_T0SZ(32), VTCR_EL2_T0SZ(36), VTCR_EL2_T0SZ(40)
> +	.word	VTCR_EL2_T0SZ(42), VTCR_EL2_T0SZ(44), VTCR_EL2_T0SZ(48)
>  ENDPROC(__kvm_hyp_init)
>  
>  	.ltorg
> 

Thanks for picking this up!

-Christoffer
Joel Schopp Aug. 11, 2014, 3:20 p.m. UTC | #4
Thanks for the detailed review.
> the last case would be case 5 and the default case would be a BUG().
I agree with the case, but rather than do a BUG() I'm going to print an
error and return -EINVAL.  Not worth stopping the host kernel just
because kvm is messed up when we can gracefully exit from it.
>
>> +
>> +	/*
>> +	 * See Table D4-23 and Table D4-25 in ARM DDI 0487A.b to figure out
>> +	 * the origin of the hardcoded values, 38 and 37.
>> +	 */
>> +#ifdef CONFIG_ARM64_64K_PAGES
>> +	/*
>> +	 * 16 <= T0SZ <= 21 is valid under 3 level of translation tables
>> +	 * 18 <= T0SZ <= 34 is valid under 2 level of translation tables
>> +	 * 31 <= T0SZ <= 39 is valid under 1 level of transltaion tables
>> +	 */
> so this scheme is with concatenated initial level stage-2 page tables.
>
> But we only ever allocate the amount of pages for our pgd according to
> what the host has, so I think this allocation needs to be locked down
> more tight, because the host is always using the appropriate amount for
> 39 bits virtual addresses.
I'll narrow the sanity check of the range.  I'll narrow it based on a 39
- 48 bit VA host range in anticipation of the 4 level 4k and 3 level 64k
host patches going in.

>> +		kvm_err("Cannot support %d-bit address space\n", 64 - t0sz);
>> +		return -EINVAL;
>> +	}
>> +	vttbr_x = 37 - t0sz;
>> +#endif
>> +	vttbr_baddr_mask = (((1LLU << (48 - vttbr_x)) - 1) << (vttbr_x - 1));
>> +#endif
> This nested ifdef is really quite horrible.  Can you either factor these
> out into some static inlines in arch/arm[64]/include/asm/kvm_mmu.h or
> provide a per-architecture implementation in a .c file?
I'll factor it out in the file to make it more readable and do away with
the nested ifdef.  My theory on putting things into .h files is to only
do it if there is actually another file that uses it.
>
>> +	return 0;
>> +}
>> +
>> +/**
>>   * update_vttbr - Update the VTTBR with a valid VMID before the guest runs
>>   * @kvm	The guest that we are about to run
>>   *
>> @@ -429,8 +502,16 @@ static void update_vttbr(struct kvm *kvm)
>>  	/* update vttbr to be used with the new vmid */
>>  	pgd_phys = virt_to_phys(kvm->arch.pgd);
>>  	vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK;
>> -	kvm->arch.vttbr = pgd_phys & VTTBR_BADDR_MASK;
>> -	kvm->arch.vttbr |= vmid;
>> +
>> +	/*
>> +	 * If the VTTBR isn't aligned there is something wrong with the system
>> +	 * or kernel.  It is better to just fail and not mask it. But no need
>> +	 * to panic the host kernel with a BUG_ON(), instead just log the error.
>> +	 */
> These last two sentences are not very helpful, because they don't
> describe the rationale for what you're doing, only what you are (and are
> not) doing.
I'll reword the comment.
>
> That said, I don't think this is doing the right thing.  I think you
> want to refuse running the VM and avoid any stage-2 entried being
> created if this is not the case (actually, we may want to check this
> after set_vttbr_baddr_mask() or right aftert allocating the stage-2
> pgd), because otherwise I think we may be overwriting memory not
> belonging to us with concatenated page tables in a 42-bit 4KB system,
> for example.
My experience here was that the hardware actually catches the error on
the first instruction load of the guest kernel and does a stage 2
translation abort.  However, to be extra safe we could just log the
error with the address of the vttbr and then zero out the pgd_phys part
of vttbr altogether, leaving only the vmid.  The guest would then die of
natural causes and we wouldn't have to worry about the outside
possibility of memory getting overwritten.

I don't like the option of just calling BUG() because stopping the host
kernel from running just because we can't run a guest seems a bit
extreme.  On the other hand adding a return code to update_vttbr and
checking it (even with unlikely) in the kvm_arch_vcpu_ioctl_run() loop
doesn't thrill me either just from a wasted cpu cycles point of view. 
>
>> +	if (pgd_phys & ~vttbr_baddr_mask)
>> +		kvm_err("VTTBR not aligned, expect guest to fail");
>> +
>> +	kvm->arch.vttbr = pgd_phys | vmid;
>>  
>>  	spin_unlock(&kvm_vmid_lock);
>>  }
>> @@ -1015,6 +1096,12 @@ int kvm_arch_init(void *opaque)
>>  		}
>>  	}
>>  
>> +	err = set_vttbr_baddr_mask();
>> +	if (err) {
>> +		kvm_err("Cannot set vttbr_baddr_mask\n");
>> +		return -EINVAL;
>> +	}
>> +
>>  	cpu_notifier_register_begin();
>>  
>>  	err = init_hyp_mode();
>> diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
>> index cc83520..ff4a4fa 100644
>> --- a/arch/arm64/include/asm/kvm_arm.h
>> +++ b/arch/arm64/include/asm/kvm_arm.h
>> @@ -95,7 +95,6 @@
>>  /* TCR_EL2 Registers bits */
>>  #define TCR_EL2_TBI	(1 << 20)
>>  #define TCR_EL2_PS	(7 << 16)
>> -#define TCR_EL2_PS_40B	(2 << 16)
>>  #define TCR_EL2_TG0	(1 << 14)
>>  #define TCR_EL2_SH0	(3 << 12)
>>  #define TCR_EL2_ORGN0	(3 << 10)
>> @@ -104,8 +103,6 @@
>>  #define TCR_EL2_MASK	(TCR_EL2_TG0 | TCR_EL2_SH0 | \
>>  			 TCR_EL2_ORGN0 | TCR_EL2_IRGN0 | TCR_EL2_T0SZ)
>>  
>> -#define TCR_EL2_FLAGS	(TCR_EL2_PS_40B)
>> -
>>  /* VTCR_EL2 Registers bits */
>>  #define VTCR_EL2_PS_MASK	(7 << 16)
>>  #define VTCR_EL2_TG0_MASK	(1 << 14)
>> @@ -120,36 +117,28 @@
>>  #define VTCR_EL2_SL0_MASK	(3 << 6)
>>  #define VTCR_EL2_SL0_LVL1	(1 << 6)
>>  #define VTCR_EL2_T0SZ_MASK	0x3f
>> -#define VTCR_EL2_T0SZ_40B	24
>> +#define VTCR_EL2_T0SZ(bits)	(64 - (bits))
>>  
>>  #ifdef CONFIG_ARM64_64K_PAGES
>>  /*
>>   * Stage2 translation configuration:
>> - * 40bits output (PS = 2)
>> - * 40bits input  (T0SZ = 24)
>>   * 64kB pages (TG0 = 1)
>>   * 2 level page tables (SL = 1)
>>   */
>>  #define VTCR_EL2_FLAGS		(VTCR_EL2_TG0_64K | VTCR_EL2_SH0_INNER | \
>>  				 VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
>> -				 VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B)
>> -#define VTTBR_X		(38 - VTCR_EL2_T0SZ_40B)
>> +				 VTCR_EL2_SL0_LVL1)
>>  #else
>>  /*
>>   * Stage2 translation configuration:
>> - * 40bits output (PS = 2)
>> - * 40bits input  (T0SZ = 24)
>>   * 4kB pages (TG0 = 0)
>>   * 3 level page tables (SL = 1)
>>   */
>>  #define VTCR_EL2_FLAGS		(VTCR_EL2_TG0_4K | VTCR_EL2_SH0_INNER | \
>>  				 VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
>> -				 VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B)
>> -#define VTTBR_X		(37 - VTCR_EL2_T0SZ_40B)
>> +				 VTCR_EL2_SL0_LVL1)
>>  #endif
>>  
>> -#define VTTBR_BADDR_SHIFT (VTTBR_X - 1)
>> -#define VTTBR_BADDR_MASK  (((1LLU << (40 - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT)
>>  #define VTTBR_VMID_SHIFT  (48LLU)
>>  #define VTTBR_VMID_MASK	  (0xffLLU << VTTBR_VMID_SHIFT)
>>  
>> diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S
>> index d968796..c0f7634 100644
>> --- a/arch/arm64/kvm/hyp-init.S
>> +++ b/arch/arm64/kvm/hyp-init.S
>> @@ -63,17 +63,21 @@ __do_hyp_init:
>>  	mrs	x4, tcr_el1
>>  	ldr	x5, =TCR_EL2_MASK
>>  	and	x4, x4, x5
>> -	ldr	x5, =TCR_EL2_FLAGS
>> -	orr	x4, x4, x5
>> -	msr	tcr_el2, x4
>> -
>> -	ldr	x4, =VTCR_EL2_FLAGS
>>  	/*
>>  	 * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS bits in
>> -	 * VTCR_EL2.
>> +	 * TCR_EL2 and both PS bits and T0SZ bits in VTCR_EL2.
>>  	 */
>>  	mrs	x5, ID_AA64MMFR0_EL1
>>  	bfi	x4, x5, #16, #3
>> +	msr	tcr_el2, x4
>> +
>> +	ldr	x4, =VTCR_EL2_FLAGS
>> +	bfi	x4, x5, #16, #3
>> +	and	x5, x5, #0xf
>> +	adr	x6, t0sz
>> +	add	x6, x6, x5, lsl #2
>> +	ldr	w5, [x6]
>> +	orr	x4, x4, x5
>>  	msr	vtcr_el2, x4
>>  
>>  	mrs	x4, mair_el1
>> @@ -109,6 +113,10 @@ target: /* We're now in the trampoline code, switch page tables */
>>  
>>  	/* Hello, World! */
>>  	eret
>> +
>> +t0sz:
>> +	.word	VTCR_EL2_T0SZ(32), VTCR_EL2_T0SZ(36), VTCR_EL2_T0SZ(40)
>> +	.word	VTCR_EL2_T0SZ(42), VTCR_EL2_T0SZ(44), VTCR_EL2_T0SZ(48)
>>  ENDPROC(__kvm_hyp_init)
>>  
>>  	.ltorg
>>
> Thanks for picking this up!
Thanks for the detailed review.  I'll try to turnaround v4 quickly.
> -Christoffer
Christoffer Dall Aug. 11, 2014, 7:19 p.m. UTC | #5
On Mon, Aug 11, 2014 at 10:20:41AM -0500, Joel Schopp wrote:
> Thanks for the detailed review.
> > the last case would be case 5 and the default case would be a BUG().
> I agree with the case, but rather than do a BUG() I'm going to print an
> error and return -EINVAL.  Not worth stopping the host kernel just
> because kvm is messed up when we can gracefully exit from it.

agreed

> >
> >> +
> >> +	/*
> >> +	 * See Table D4-23 and Table D4-25 in ARM DDI 0487A.b to figure out
> >> +	 * the origin of the hardcoded values, 38 and 37.
> >> +	 */
> >> +#ifdef CONFIG_ARM64_64K_PAGES
> >> +	/*
> >> +	 * 16 <= T0SZ <= 21 is valid under 3 level of translation tables
> >> +	 * 18 <= T0SZ <= 34 is valid under 2 level of translation tables
> >> +	 * 31 <= T0SZ <= 39 is valid under 1 level of transltaion tables
> >> +	 */
> > so this scheme is with concatenated initial level stage-2 page tables.
> >
> > But we only ever allocate the amount of pages for our pgd according to
> > what the host has, so I think this allocation needs to be locked down
> > more tight, because the host is always using the appropriate amount for
> > 39 bits virtual addresses.
> I'll narrow the sanity check of the range.  I'll narrow it based on a 39
> - 48 bit VA host range in anticipation of the 4 level 4k and 3 level 64k
> host patches going in.
> 
> >> +		kvm_err("Cannot support %d-bit address space\n", 64 - t0sz);
> >> +		return -EINVAL;
> >> +	}
> >> +	vttbr_x = 37 - t0sz;
> >> +#endif
> >> +	vttbr_baddr_mask = (((1LLU << (48 - vttbr_x)) - 1) << (vttbr_x - 1));
> >> +#endif
> > This nested ifdef is really quite horrible.  Can you either factor these
> > out into some static inlines in arch/arm[64]/include/asm/kvm_mmu.h or
> > provide a per-architecture implementation in a .c file?
> I'll factor it out in the file to make it more readable and do away with
> the nested ifdef.  My theory on putting things into .h files is to only
> do it if there is actually another file that uses it.

that's not really the design principle between the split of kvm/arm and
kvm/arm64 though, but I'll let you choose your preferred method when
writing up the patches.

> >
> >> +	return 0;
> >> +}
> >> +
> >> +/**
> >>   * update_vttbr - Update the VTTBR with a valid VMID before the guest runs
> >>   * @kvm	The guest that we are about to run
> >>   *
> >> @@ -429,8 +502,16 @@ static void update_vttbr(struct kvm *kvm)
> >>  	/* update vttbr to be used with the new vmid */
> >>  	pgd_phys = virt_to_phys(kvm->arch.pgd);
> >>  	vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK;
> >> -	kvm->arch.vttbr = pgd_phys & VTTBR_BADDR_MASK;
> >> -	kvm->arch.vttbr |= vmid;
> >> +
> >> +	/*
> >> +	 * If the VTTBR isn't aligned there is something wrong with the system
> >> +	 * or kernel.  It is better to just fail and not mask it. But no need
> >> +	 * to panic the host kernel with a BUG_ON(), instead just log the error.
> >> +	 */
> > These last two sentences are not very helpful, because they don't
> > describe the rationale for what you're doing, only what you are (and are
> > not) doing.
> I'll reword the comment.
> >
> > That said, I don't think this is doing the right thing.  I think you
> > want to refuse running the VM and avoid any stage-2 entried being
> > created if this is not the case (actually, we may want to check this
> > after set_vttbr_baddr_mask() or right aftert allocating the stage-2
> > pgd), because otherwise I think we may be overwriting memory not
> > belonging to us with concatenated page tables in a 42-bit 4KB system,
> > for example.
> My experience here was that the hardware actually catches the error on
> the first instruction load of the guest kernel and does a stage 2
> translation abort.  However, to be extra safe we could just log the
> error with the address of the vttbr and then zero out the pgd_phys part
> of vttbr altogether, leaving only the vmid.  The guest would then die of
> natural causes and we wouldn't have to worry about the outside
> possibility of memory getting overwritten.

uh, putting zero in the pgd_phys part will just point to random memory
if you happen to have memory based at address 0 though, right?

I think we should check when we allocate the pgd that it is indeed of
the right size and alignment, and if it isn't at this point, it truly is
a BUG() and your kernel is terribly busted.

> 
> I don't like the option of just calling BUG() because stopping the host
> kernel from running just because we can't run a guest seems a bit
> extreme.  On the other hand adding a return code to update_vttbr and
> checking it (even with unlikely) in the kvm_arch_vcpu_ioctl_run() loop
> doesn't thrill me either just from a wasted cpu cycles point of view. 

Agreed that we shouldn't call BUG on something that could happen.  We
only should if we really do have a BUG.  If we have a programming error
in setting our stage-2 page tables, then we're hosed, and I think
crashing the kernel is fine.

You could make it a VM_BUG_ON() if you don't like having the check
regularly, or create the error code return.

> >
> >> +	if (pgd_phys & ~vttbr_baddr_mask)
> >> +		kvm_err("VTTBR not aligned, expect guest to fail");
> >> +
> >> +	kvm->arch.vttbr = pgd_phys | vmid;
> >>  
> >>  	spin_unlock(&kvm_vmid_lock);
> >>  }
> >> @@ -1015,6 +1096,12 @@ int kvm_arch_init(void *opaque)
> >>  		}
> >>  	}
> >>  
> >> +	err = set_vttbr_baddr_mask();
> >> +	if (err) {
> >> +		kvm_err("Cannot set vttbr_baddr_mask\n");
> >> +		return -EINVAL;
> >> +	}
> >> +
> >>  	cpu_notifier_register_begin();
> >>  
> >>  	err = init_hyp_mode();
> >> diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
> >> index cc83520..ff4a4fa 100644
> >> --- a/arch/arm64/include/asm/kvm_arm.h
> >> +++ b/arch/arm64/include/asm/kvm_arm.h
> >> @@ -95,7 +95,6 @@
> >>  /* TCR_EL2 Registers bits */
> >>  #define TCR_EL2_TBI	(1 << 20)
> >>  #define TCR_EL2_PS	(7 << 16)
> >> -#define TCR_EL2_PS_40B	(2 << 16)
> >>  #define TCR_EL2_TG0	(1 << 14)
> >>  #define TCR_EL2_SH0	(3 << 12)
> >>  #define TCR_EL2_ORGN0	(3 << 10)
> >> @@ -104,8 +103,6 @@
> >>  #define TCR_EL2_MASK	(TCR_EL2_TG0 | TCR_EL2_SH0 | \
> >>  			 TCR_EL2_ORGN0 | TCR_EL2_IRGN0 | TCR_EL2_T0SZ)
> >>  
> >> -#define TCR_EL2_FLAGS	(TCR_EL2_PS_40B)
> >> -
> >>  /* VTCR_EL2 Registers bits */
> >>  #define VTCR_EL2_PS_MASK	(7 << 16)
> >>  #define VTCR_EL2_TG0_MASK	(1 << 14)
> >> @@ -120,36 +117,28 @@
> >>  #define VTCR_EL2_SL0_MASK	(3 << 6)
> >>  #define VTCR_EL2_SL0_LVL1	(1 << 6)
> >>  #define VTCR_EL2_T0SZ_MASK	0x3f
> >> -#define VTCR_EL2_T0SZ_40B	24
> >> +#define VTCR_EL2_T0SZ(bits)	(64 - (bits))
> >>  
> >>  #ifdef CONFIG_ARM64_64K_PAGES
> >>  /*
> >>   * Stage2 translation configuration:
> >> - * 40bits output (PS = 2)
> >> - * 40bits input  (T0SZ = 24)
> >>   * 64kB pages (TG0 = 1)
> >>   * 2 level page tables (SL = 1)
> >>   */
> >>  #define VTCR_EL2_FLAGS		(VTCR_EL2_TG0_64K | VTCR_EL2_SH0_INNER | \
> >>  				 VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
> >> -				 VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B)
> >> -#define VTTBR_X		(38 - VTCR_EL2_T0SZ_40B)
> >> +				 VTCR_EL2_SL0_LVL1)
> >>  #else
> >>  /*
> >>   * Stage2 translation configuration:
> >> - * 40bits output (PS = 2)
> >> - * 40bits input  (T0SZ = 24)
> >>   * 4kB pages (TG0 = 0)
> >>   * 3 level page tables (SL = 1)
> >>   */
> >>  #define VTCR_EL2_FLAGS		(VTCR_EL2_TG0_4K | VTCR_EL2_SH0_INNER | \
> >>  				 VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
> >> -				 VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B)
> >> -#define VTTBR_X		(37 - VTCR_EL2_T0SZ_40B)
> >> +				 VTCR_EL2_SL0_LVL1)
> >>  #endif
> >>  
> >> -#define VTTBR_BADDR_SHIFT (VTTBR_X - 1)
> >> -#define VTTBR_BADDR_MASK  (((1LLU << (40 - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT)
> >>  #define VTTBR_VMID_SHIFT  (48LLU)
> >>  #define VTTBR_VMID_MASK	  (0xffLLU << VTTBR_VMID_SHIFT)
> >>  
> >> diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S
> >> index d968796..c0f7634 100644
> >> --- a/arch/arm64/kvm/hyp-init.S
> >> +++ b/arch/arm64/kvm/hyp-init.S
> >> @@ -63,17 +63,21 @@ __do_hyp_init:
> >>  	mrs	x4, tcr_el1
> >>  	ldr	x5, =TCR_EL2_MASK
> >>  	and	x4, x4, x5
> >> -	ldr	x5, =TCR_EL2_FLAGS
> >> -	orr	x4, x4, x5
> >> -	msr	tcr_el2, x4
> >> -
> >> -	ldr	x4, =VTCR_EL2_FLAGS
> >>  	/*
> >>  	 * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS bits in
> >> -	 * VTCR_EL2.
> >> +	 * TCR_EL2 and both PS bits and T0SZ bits in VTCR_EL2.
> >>  	 */
> >>  	mrs	x5, ID_AA64MMFR0_EL1
> >>  	bfi	x4, x5, #16, #3
> >> +	msr	tcr_el2, x4
> >> +
> >> +	ldr	x4, =VTCR_EL2_FLAGS
> >> +	bfi	x4, x5, #16, #3
> >> +	and	x5, x5, #0xf
> >> +	adr	x6, t0sz
> >> +	add	x6, x6, x5, lsl #2
> >> +	ldr	w5, [x6]
> >> +	orr	x4, x4, x5
> >>  	msr	vtcr_el2, x4
> >>  
> >>  	mrs	x4, mair_el1
> >> @@ -109,6 +113,10 @@ target: /* We're now in the trampoline code, switch page tables */
> >>  
> >>  	/* Hello, World! */
> >>  	eret
> >> +
> >> +t0sz:
> >> +	.word	VTCR_EL2_T0SZ(32), VTCR_EL2_T0SZ(36), VTCR_EL2_T0SZ(40)
> >> +	.word	VTCR_EL2_T0SZ(42), VTCR_EL2_T0SZ(44), VTCR_EL2_T0SZ(48)
> >>  ENDPROC(__kvm_hyp_init)
> >>  
> >>  	.ltorg
> >>
> > Thanks for picking this up!
> Thanks for the detailed review.  I'll try to turnaround v4 quickly.

Cool!
-Christoffer
Joel Schopp Aug. 11, 2014, 8:21 p.m. UTC | #6
>>> That said, I don't think this is doing the right thing.  I think you
>>> want to refuse running the VM and avoid any stage-2 entried being
>>> created if this is not the case (actually, we may want to check this
>>> after set_vttbr_baddr_mask() or right aftert allocating the stage-2
>>> pgd), because otherwise I think we may be overwriting memory not
>>> belonging to us with concatenated page tables in a 42-bit 4KB system,
>>> for example.
>> My experience here was that the hardware actually catches the error on
>> the first instruction load of the guest kernel and does a stage 2
>> translation abort.  However, to be extra safe we could just log the
>> error with the address of the vttbr and then zero out the pgd_phys part
>> of vttbr altogether, leaving only the vmid.  The guest would then die of
>> natural causes and we wouldn't have to worry about the outside
>> possibility of memory getting overwritten.
> uh, putting zero in the pgd_phys part will just point to random memory
> if you happen to have memory based at address 0 though, right?
>
> I think we should check when we allocate the pgd that it is indeed of
> the right size and alignment, and if it isn't at this point, it truly is
> a BUG() and your kernel is terribly busted.
If I can't rely on 0 to be an invalid address I can't think of what I
could rely on to be invalid.  I'll just change this to BUG_ON(pgd_phys &
~vttbr_baddr_mask); and give up on my dream of the host kernel surviving
the bug.
diff mbox

Patch

diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index d7424ef..d7ca2f5 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -37,6 +37,7 @@ 
 #include <asm/mman.h>
 #include <asm/tlbflush.h>
 #include <asm/cacheflush.h>
+#include <asm/cputype.h>
 #include <asm/virt.h>
 #include <asm/kvm_arm.h>
 #include <asm/kvm_asm.h>
@@ -61,6 +62,9 @@  static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
 static u8 kvm_next_vmid;
 static DEFINE_SPINLOCK(kvm_vmid_lock);
 
+/* VTTBR mask cannot be determined in compile time under ARMv8 */
+static u64 vttbr_baddr_mask;
+
 static bool vgic_present;
 
 static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
@@ -376,6 +380,75 @@  static bool need_new_vmid_gen(struct kvm *kvm)
 }
 
 /**
+ * set_vttbr_baddr_mask - set mask value for vttbr base address
+ *
+ * In ARMv8, vttbr_baddr_mask cannot be determined in compile time since stage2
+ * input address size depends on hardware capability. Thus, it is needed to read
+ * ID_AA64MMFR0_EL1.PARange first and then set vttbr_baddr_mask with
+ * consideration of both granule size and the level of translation tables.
+ */
+static int set_vttbr_baddr_mask(void)
+{
+#ifndef CONFIG_ARM64
+	vttbr_baddr_mask = VTTBR_BADDR_MASK;
+#else
+	int pa_range, t0sz, vttbr_x;
+
+	pa_range = read_cpuid(ID_AA64MMFR0_EL1) & 0xf;
+
+	switch (pa_range) {
+	case 0:
+		t0sz = VTCR_EL2_T0SZ(32);
+		break;
+	case 1:
+		t0sz = VTCR_EL2_T0SZ(36);
+		break;
+	case 2:
+		t0sz = VTCR_EL2_T0SZ(40);
+		break;
+	case 3:
+		t0sz = VTCR_EL2_T0SZ(42);
+		break;
+	case 4:
+		t0sz = VTCR_EL2_T0SZ(44);
+		break;
+	default:
+		t0sz = VTCR_EL2_T0SZ(48);
+	}
+
+	/*
+	 * See Table D4-23 and Table D4-25 in ARM DDI 0487A.b to figure out
+	 * the origin of the hardcoded values, 38 and 37.
+	 */
+#ifdef CONFIG_ARM64_64K_PAGES
+	/*
+	 * 16 <= T0SZ <= 21 is valid under 3 level of translation tables
+	 * 18 <= T0SZ <= 34 is valid under 2 level of translation tables
+	 * 31 <= T0SZ <= 39 is valid under 1 level of translation tables
+	 */
+	if (t0sz <= 17) {
+		kvm_err("Cannot support %d-bit address space\n", 64 - t0sz);
+		return -EINVAL;
+	}
+	vttbr_x = 38 - t0sz;
+#else
+	/*
+	 * 16 <= T0SZ <= 24 is valid under 4 level of translation tables
+	 * 21 <= T0SZ <= 30 is valid under 3 level of translation tables
+	 * 30 <= T0SZ <= 39 is valid under 2 level of translation tables
+	 */
+	if (t0sz <= 20) {
+		kvm_err("Cannot support %d-bit address space\n", 64 - t0sz);
+		return -EINVAL;
+	}
+	vttbr_x = 37 - t0sz;
+#endif
+	vttbr_baddr_mask = (((1LLU << (48 - vttbr_x)) - 1) << (vttbr_x - 1));
+#endif
+	return 0;
+}
+
+/**
  * update_vttbr - Update the VTTBR with a valid VMID before the guest runs
  * @kvm	The guest that we are about to run
  *
@@ -429,8 +502,16 @@  static void update_vttbr(struct kvm *kvm)
 	/* update vttbr to be used with the new vmid */
 	pgd_phys = virt_to_phys(kvm->arch.pgd);
 	vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK;
-	kvm->arch.vttbr = pgd_phys & VTTBR_BADDR_MASK;
-	kvm->arch.vttbr |= vmid;
+
+	/*
+	 * If the VTTBR isn't aligned there is something wrong with the system
+	 * or kernel.  It is better to just fail and not mask it. But no need
+	 * to panic the host kernel with a BUG_ON(), instead just log the error.
+	 */
+	if (pgd_phys & ~vttbr_baddr_mask)
+		kvm_err("VTTBR not aligned, expect guest to fail");
+
+	kvm->arch.vttbr = pgd_phys | vmid;
 
 	spin_unlock(&kvm_vmid_lock);
 }
@@ -1015,6 +1096,12 @@  int kvm_arch_init(void *opaque)
 		}
 	}
 
+	err = set_vttbr_baddr_mask();
+	if (err) {
+		kvm_err("Cannot set vttbr_baddr_mask\n");
+		return -EINVAL;
+	}
+
 	cpu_notifier_register_begin();
 
 	err = init_hyp_mode();
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index cc83520..ff4a4fa 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -95,7 +95,6 @@ 
 /* TCR_EL2 Registers bits */
 #define TCR_EL2_TBI	(1 << 20)
 #define TCR_EL2_PS	(7 << 16)
-#define TCR_EL2_PS_40B	(2 << 16)
 #define TCR_EL2_TG0	(1 << 14)
 #define TCR_EL2_SH0	(3 << 12)
 #define TCR_EL2_ORGN0	(3 << 10)
@@ -104,8 +103,6 @@ 
 #define TCR_EL2_MASK	(TCR_EL2_TG0 | TCR_EL2_SH0 | \
 			 TCR_EL2_ORGN0 | TCR_EL2_IRGN0 | TCR_EL2_T0SZ)
 
-#define TCR_EL2_FLAGS	(TCR_EL2_PS_40B)
-
 /* VTCR_EL2 Registers bits */
 #define VTCR_EL2_PS_MASK	(7 << 16)
 #define VTCR_EL2_TG0_MASK	(1 << 14)
@@ -120,36 +117,28 @@ 
 #define VTCR_EL2_SL0_MASK	(3 << 6)
 #define VTCR_EL2_SL0_LVL1	(1 << 6)
 #define VTCR_EL2_T0SZ_MASK	0x3f
-#define VTCR_EL2_T0SZ_40B	24
+#define VTCR_EL2_T0SZ(bits)	(64 - (bits))
 
 #ifdef CONFIG_ARM64_64K_PAGES
 /*
  * Stage2 translation configuration:
- * 40bits output (PS = 2)
- * 40bits input  (T0SZ = 24)
  * 64kB pages (TG0 = 1)
  * 2 level page tables (SL = 1)
  */
 #define VTCR_EL2_FLAGS		(VTCR_EL2_TG0_64K | VTCR_EL2_SH0_INNER | \
 				 VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
-				 VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B)
-#define VTTBR_X		(38 - VTCR_EL2_T0SZ_40B)
+				 VTCR_EL2_SL0_LVL1)
 #else
 /*
  * Stage2 translation configuration:
- * 40bits output (PS = 2)
- * 40bits input  (T0SZ = 24)
  * 4kB pages (TG0 = 0)
  * 3 level page tables (SL = 1)
  */
 #define VTCR_EL2_FLAGS		(VTCR_EL2_TG0_4K | VTCR_EL2_SH0_INNER | \
 				 VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
-				 VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B)
-#define VTTBR_X		(37 - VTCR_EL2_T0SZ_40B)
+				 VTCR_EL2_SL0_LVL1)
 #endif
 
-#define VTTBR_BADDR_SHIFT (VTTBR_X - 1)
-#define VTTBR_BADDR_MASK  (((1LLU << (40 - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT)
 #define VTTBR_VMID_SHIFT  (48LLU)
 #define VTTBR_VMID_MASK	  (0xffLLU << VTTBR_VMID_SHIFT)
 
diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S
index d968796..c0f7634 100644
--- a/arch/arm64/kvm/hyp-init.S
+++ b/arch/arm64/kvm/hyp-init.S
@@ -63,17 +63,21 @@  __do_hyp_init:
 	mrs	x4, tcr_el1
 	ldr	x5, =TCR_EL2_MASK
 	and	x4, x4, x5
-	ldr	x5, =TCR_EL2_FLAGS
-	orr	x4, x4, x5
-	msr	tcr_el2, x4
-
-	ldr	x4, =VTCR_EL2_FLAGS
 	/*
 	 * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS bits in
-	 * VTCR_EL2.
+	 * TCR_EL2 and both PS bits and T0SZ bits in VTCR_EL2.
 	 */
 	mrs	x5, ID_AA64MMFR0_EL1
 	bfi	x4, x5, #16, #3
+	msr	tcr_el2, x4
+
+	ldr	x4, =VTCR_EL2_FLAGS
+	bfi	x4, x5, #16, #3
+	and	x5, x5, #0xf
+	adr	x6, t0sz
+	add	x6, x6, x5, lsl #2
+	ldr	w5, [x6]
+	orr	x4, x4, x5
 	msr	vtcr_el2, x4
 
 	mrs	x4, mair_el1
@@ -109,6 +113,10 @@  target: /* We're now in the trampoline code, switch page tables */
 
 	/* Hello, World! */
 	eret
+
+t0sz:
+	.word	VTCR_EL2_T0SZ(32), VTCR_EL2_T0SZ(36), VTCR_EL2_T0SZ(40)
+	.word	VTCR_EL2_T0SZ(42), VTCR_EL2_T0SZ(44), VTCR_EL2_T0SZ(48)
 ENDPROC(__kvm_hyp_init)
 
 	.ltorg