diff mbox series

[Xen-devel,for-4.12,6/8] xen/arm: Implement workaround for Cortex-A76 erratum 1165522

Message ID 20181128164939.8329-7-julien.grall@arm.com
State Superseded
Headers show
Series xen/arm: Workaround for Cortex-A76 erratum 1165522 | expand

Commit Message

Julien Grall Nov. 28, 2018, 4:49 p.m. UTC
Early version of Cortex-A76 can end-up with corrupt TLBs if they
speculate an AT instruction while the S1/S2 system registers are in an
inconsistent state.

This can happen during guest context switch and when invalidating the
TLBs for other than the current VMID.

The workaround implemented in Xen will:
    - Use an empty stage-2 with a reserved VMID while context switching
    between 2 guests
    - Use an empty stage-2 with the VMID where TLBs need to be flushed

Signed-off-by: Julien Grall <julien.grall@arm.com>
---
 docs/misc/arm/silicon-errata.txt |  1 +
 xen/arch/arm/cpuerrata.c         |  6 ++++
 xen/arch/arm/domain.c            |  8 +++--
 xen/arch/arm/p2m.c               | 78 ++++++++++++++++++++++++++++++++++++++--
 xen/include/asm-arm/cpufeature.h |  3 +-
 xen/include/asm-arm/processor.h  |  2 ++
 6 files changed, 93 insertions(+), 5 deletions(-)

Comments

Andrii Anisov Dec. 21, 2018, 3:47 p.m. UTC | #1
On 28.11.18 18:49, Julien Grall wrote:
> Early version of Cortex-A76 can end-up with corrupt TLBs if they
> speculate an AT instruction while the S1/S2 system registers are in an
> inconsistent state.
> 
> This can happen during guest context switch and when invalidating the
> TLBs for other than the current VMID.
> 
> The workaround implemented in Xen will:
>      - Use an empty stage-2 with a reserved VMID while context switching
>      between 2 guests
>      - Use an empty stage-2 with the VMID where TLBs need to be flushed
> 
> Signed-off-by: Julien Grall <julien.grall@arm.com>

Reviewed-by: Andrii Anisov <andrii_anisov@epam.com>
Stefano Stabellini Jan. 24, 2019, 12:52 a.m. UTC | #2
On Wed, 28 Nov 2018, Julien Grall wrote:
> Early version of Cortex-A76 can end-up with corrupt TLBs if they
> speculate an AT instruction while the S1/S2 system registers are in an
> inconsistent state.
> 
> This can happen during guest context switch and when invalidating the
> TLBs for other than the current VMID.
> 
> The workaround implemented in Xen will:
>     - Use an empty stage-2 with a reserved VMID while context switching
>     between 2 guests
>     - Use an empty stage-2 with the VMID where TLBs need to be flushed
>
> Signed-off-by: Julien Grall <julien.grall@arm.com>

Thank you for doing this and for setting up a testing environment. I
have a couple of questions on a couple of changes below.


> ---
>  docs/misc/arm/silicon-errata.txt |  1 +
>  xen/arch/arm/cpuerrata.c         |  6 ++++
>  xen/arch/arm/domain.c            |  8 +++--
>  xen/arch/arm/p2m.c               | 78 ++++++++++++++++++++++++++++++++++++++--
>  xen/include/asm-arm/cpufeature.h |  3 +-
>  xen/include/asm-arm/processor.h  |  2 ++
>  6 files changed, 93 insertions(+), 5 deletions(-)
> 
> diff --git a/docs/misc/arm/silicon-errata.txt b/docs/misc/arm/silicon-errata.txt
> index 906bf5fd48..6cd1366f15 100644
> --- a/docs/misc/arm/silicon-errata.txt
> +++ b/docs/misc/arm/silicon-errata.txt
> @@ -48,4 +48,5 @@ stable hypervisors.
>  | ARM            | Cortex-A57      | #852523         | N/A                     |
>  | ARM            | Cortex-A57      | #832075         | ARM64_ERRATUM_832075    |
>  | ARM            | Cortex-A57      | #834220         | ARM64_ERRATUM_834220    |
> +| ARM            | Cortex-A76      | #1165522        | N/A                     |
>  | ARM            | MMU-500         | #842869         | N/A                     |
> diff --git a/xen/arch/arm/cpuerrata.c b/xen/arch/arm/cpuerrata.c
> index adf88e7bdc..61c64b9816 100644
> --- a/xen/arch/arm/cpuerrata.c
> +++ b/xen/arch/arm/cpuerrata.c
> @@ -489,6 +489,12 @@ static const struct arm_cpu_capabilities arm_errata[] = {
>          .matches = has_ssbd_mitigation,
>      },
>  #endif
> +    {
> +        /* Cortex-A76 r0p0 - r2p0 */
> +        .desc = "ARM erratum 116522",
> +        .capability = ARM64_WORKAROUND_AT_SPECULATE,
> +        MIDR_RANGE(MIDR_CORTEX_A76, 0, 2 << MIDR_VARIANT_SHIFT),
> +    },
>      {},
>  };
>  
> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
> index 1d926dcb29..3180edd89d 100644
> --- a/xen/arch/arm/domain.c
> +++ b/xen/arch/arm/domain.c
> @@ -181,8 +181,6 @@ static void ctxt_switch_to(struct vcpu *n)
>      if ( is_idle_vcpu(n) )
>          return;
>  
> -    p2m_restore_state(n);
> -
>      vpidr = READ_SYSREG32(MIDR_EL1);
>      WRITE_SYSREG32(vpidr, VPIDR_EL2);
>      WRITE_SYSREG(n->arch.vmpidr, VMPIDR_EL2);
> @@ -235,6 +233,12 @@ static void ctxt_switch_to(struct vcpu *n)
>  #endif
>      isb();
>  
> +    /*
> +     * ARM64_WORKAROUND_AT_SPECULATE: The P2M should be restored after
> +     * the stage-1 MMU sysregs have been restored.
> +     */
> +    p2m_restore_state(n);
> +
>      /* Control Registers */
>      WRITE_SYSREG(n->arch.cpacr, CPACR_EL1);
>  
> diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
> index 844833c4c3..0facb66096 100644
> --- a/xen/arch/arm/p2m.c
> +++ b/xen/arch/arm/p2m.c
> @@ -15,6 +15,7 @@
>  #include <asm/event.h>
>  #include <asm/hardirq.h>
>  #include <asm/page.h>
> +#include <asm/alternative.h>
>  
>  #define MAX_VMID_8_BIT  (1UL << 8)
>  #define MAX_VMID_16_BIT (1UL << 16)
> @@ -47,6 +48,8 @@ static const paddr_t level_masks[] =
>  static const uint8_t level_orders[] =
>      { ZEROETH_ORDER, FIRST_ORDER, SECOND_ORDER, THIRD_ORDER };
>  
> +static mfn_t __read_mostly empty_root_mfn;
> +
>  static uint64_t generate_vttbr(uint16_t vmid, mfn_t root_mfn)
>  {
>      return (mfn_to_maddr(root_mfn) | ((uint64_t)vmid << 48));
> @@ -98,9 +101,25 @@ void dump_p2m_lookup(struct domain *d, paddr_t addr)
>                   P2M_ROOT_LEVEL, P2M_ROOT_PAGES);
>  }
>  
> +/*
> + * p2m_save_state and p2m_restore_state works in pair to workaround
                                           ^ work


> + * ARM64_WORKAROUND_AT_SPECULATE. p2m_save_state will set-up VTTBR to
> + * point to the empty page-tables to stop allocating TLB entries.
> + */
>  void p2m_save_state(struct vcpu *p)
>  {
>      p->arch.sctlr = READ_SYSREG(SCTLR_EL1);
> +
> +    if ( cpus_have_const_cap(ARM64_WORKAROUND_AT_SPECULATE) )
> +    {
> +        WRITE_SYSREG64(generate_vttbr(INVALID_VMID, empty_root_mfn), VTTBR_EL2);
> +        /*
> +         * Ensure VTTBR_EL2 is correctly synchronized so we can restore
> +         * the next vCPU context without worrying about AT instruction
> +         * speculation.
> +         */
> +        isb();
> +    }
>  }

OK


>  void p2m_restore_state(struct vcpu *n)
> @@ -111,10 +130,17 @@ void p2m_restore_state(struct vcpu *n)
>      if ( is_idle_vcpu(n) )
>          return;
>  
> -    WRITE_SYSREG64(p2m->vttbr, VTTBR_EL2);
>      WRITE_SYSREG(n->arch.sctlr, SCTLR_EL1);
>      WRITE_SYSREG(n->arch.hcr_el2, HCR_EL2);
>  
> +    /*
> +     * ARM64_WORKAROUND_AT_SPECULATE: VTTBR_EL2 should be restored after all
> +     * registers associated to EL1/EL0 translations regime have been
> +     * synchronized.
> +     */
> +    asm volatile(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_AT_SPECULATE));

Obviously you have done a lot more thinking about this than me, but
I don't fully understand the need for this barrier: this is not about
ARM64_WORKAROUND_AT_SPECULATE per se, right? Shouldn't the CPU be able
to figure out the right execution speculation path given that the
instructions ordering is correct? I guess it depends on the nature of
the hardware bug.


> +    WRITE_SYSREG64(p2m->vttbr, VTTBR_EL2);
> +
>      last_vcpu_ran = &p2m->last_vcpu_ran[smp_processor_id()];
>  
>      /*
> @@ -157,8 +183,23 @@ static void p2m_force_tlb_flush_sync(struct p2m_domain *p2m)
>      ovttbr = READ_SYSREG64(VTTBR_EL2);
>      if ( ovttbr != p2m->vttbr )
>      {
> +        uint64_t vttbr;
> +
>          local_irq_save(flags);
> -        WRITE_SYSREG64(p2m->vttbr, VTTBR_EL2);
> +
> +        /*
> +         * ARM64_WORKAROUND_AT_SPECULATE: We need to stop AT to allocate
> +         * TLBs entries because the context is partially modified. We
> +         * only need the VMID for flushing the TLBs, so we can generate
> +         * a new VTTBR with the VMID to flush and the empty root table.
> +         */
> +        if ( !cpus_have_const_cap(ARM64_WORKAROUND_AT_SPECULATE) )
> +            vttbr = p2m->vttbr;
> +        else
> +            vttbr = generate_vttbr(p2m->vmid, empty_root_mfn);
> +
> +        WRITE_SYSREG64(vttbr, VTTBR_EL2);

Good idea, any reasons not to use generate_vttbr(p2m->vmid,
empty_root_mfn) in the general case? There should be no downsides,
right?


>          /* Ensure VTTBR_EL2 is synchronized before flushing the TLBs */
>          isb();
>      }
> @@ -1504,6 +1545,23 @@ static uint32_t __read_mostly vtcr;
>  static void setup_virt_paging_one(void *data)
>  {
>      WRITE_SYSREG32(vtcr, VTCR_EL2);
> +
> +    /*
> +     * ARM64_WORKAROUND_AT_SPECULATE: We want to keep the TLBs free from
> +     * entries related to EL1/EL0 translation regime until a guest vCPU
> +     * is running. For that, we need to set-up VTTBR to point to an empty
> +     * page-table and turn on stage-2 translation.

I don't understand why this is needed: isn't the lack of HCR_VM (due to
your previous patch) supposed to be sufficient? How can there be
speculation without HCR_VM?

Even if speculation happens without HCR_EL2, why do we need to set it
now? Isn't setting empty_root_mfn enough?


>         The TLB entries
> +     * associated with EL1/EL0 translation regime will also be flushed in case
> +     * an AT instruction was speculated before hand.
> +     */
> +    if ( cpus_have_cap(ARM64_WORKAROUND_AT_SPECULATE) )
> +    {
> +        WRITE_SYSREG64(generate_vttbr(INVALID_VMID, empty_root_mfn), VTTBR_EL2);
> +        WRITE_SYSREG(READ_SYSREG(HCR_EL2) | HCR_VM, HCR_EL2);
> +        isb();
> +
> +        flush_tlb_all_local();
> +    }
>  }
>  
>  void __init setup_virt_paging(void)
> @@ -1587,6 +1645,22 @@ void __init setup_virt_paging(void)
>      /* It is not allowed to concatenate a level zero root */
>      BUG_ON( P2M_ROOT_LEVEL == 0 && P2M_ROOT_ORDER > 0 );
>      vtcr = val;
> +
> +    /*
> +     * ARM64_WORKAROUND_AT_SPECULATE requires to allocate root table
> +     * with all entries zeroed.
> +     */
> +    if ( cpus_have_cap(ARM64_WORKAROUND_AT_SPECULATE) )
> +    {
> +        struct page_info *root;
> +
> +        root = p2m_allocate_root();
> +        if ( !root )
> +            panic("Unable to allocate root table for ARM64_WORKAROUND_AT_SPECULATE\n");
> +
> +        empty_root_mfn = page_to_mfn(root);
> +    }

OK


>      setup_virt_paging_one(NULL);
>      smp_call_function(setup_virt_paging_one, NULL, 1);
>  }
> diff --git a/xen/include/asm-arm/cpufeature.h b/xen/include/asm-arm/cpufeature.h
> index 17de928467..c2c8f3417c 100644
> --- a/xen/include/asm-arm/cpufeature.h
> +++ b/xen/include/asm-arm/cpufeature.h
> @@ -45,8 +45,9 @@
>  #define ARM_HARDEN_BRANCH_PREDICTOR 7
>  #define ARM_SSBD 8
>  #define ARM_SMCCC_1_1 9
> +#define ARM64_WORKAROUND_AT_SPECULATE 10
>  
> -#define ARM_NCAPS           10
> +#define ARM_NCAPS           11
>  
>  #ifndef __ASSEMBLY__
>  
> diff --git a/xen/include/asm-arm/processor.h b/xen/include/asm-arm/processor.h
> index 72ddc42778..d03ec6e272 100644
> --- a/xen/include/asm-arm/processor.h
> +++ b/xen/include/asm-arm/processor.h
> @@ -52,6 +52,7 @@
>  #define ARM_CPU_PART_CORTEX_A72     0xD08
>  #define ARM_CPU_PART_CORTEX_A73     0xD09
>  #define ARM_CPU_PART_CORTEX_A75     0xD0A
> +#define ARM_CPU_PART_CORTEX_A76     0xD0B
>  
>  #define MIDR_CORTEX_A12 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A12)
>  #define MIDR_CORTEX_A17 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A17)
> @@ -61,6 +62,7 @@
>  #define MIDR_CORTEX_A72 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72)
>  #define MIDR_CORTEX_A73 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A73)
>  #define MIDR_CORTEX_A75 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A75)
> +#define MIDR_CORTEX_A76 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A76)
>  
>  /* MPIDR Multiprocessor Affinity Register */
>  #define _MPIDR_UP           (30)
> -- 
> 2.11.0
>
Julien Grall Jan. 24, 2019, 1:17 p.m. UTC | #3
(+ James)

Hi Stefano,

@James, please correct me if I am wrong below :).

On 24/01/2019 00:52, Stefano Stabellini wrote:
> On Wed, 28 Nov 2018, Julien Grall wrote:
>>   void p2m_restore_state(struct vcpu *n)
>> @@ -111,10 +130,17 @@ void p2m_restore_state(struct vcpu *n)
>>       if ( is_idle_vcpu(n) )
>>           return;
>>   
>> -    WRITE_SYSREG64(p2m->vttbr, VTTBR_EL2);
>>       WRITE_SYSREG(n->arch.sctlr, SCTLR_EL1);
>>       WRITE_SYSREG(n->arch.hcr_el2, HCR_EL2);
>>   
>> +    /*
>> +     * ARM64_WORKAROUND_AT_SPECULATE: VTTBR_EL2 should be restored after all
>> +     * registers associated to EL1/EL0 translations regime have been
>> +     * synchronized.
>> +     */
>> +    asm volatile(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_AT_SPECULATE));
> 
> Obviously you have done a lot more thinking about this than me, but
> I don't fully understand the need for this barrier: this is not about
> ARM64_WORKAROUND_AT_SPECULATE per se, right?

This particular isb() is about ARM64_WORKAROUND_AT_SPECULATE.

> Shouldn't the CPU be able
> to figure out the right execution speculation path given that the
> instructions ordering is correct?

What instructions ordering? Writing a system registers can be re-ordered and you 
need an isb() to ensure the full synchronization before executing at AT 
instruction or flushing the TLBs.

In hardware without the erratum, the registers associated with EL1 translation 
are out-of-context and the AT cannot speculate. The isb() added by patch #5 
ensure the context is synchronized so an AT afterwards would use a consistent 
context. Now...

> I guess it depends on the nature of
> the hardware bug.

... in the context of the errata, you have to imagine what can happen if an AT 
instruction is inserted (via speculation) between each instruction and what 
happen if the system registers are re-ordered.

The key of the erratum is VTTBR_EL2. This is what will stop a speculated AT 
instruction to allocate a TLBs entry because you are not allowed to cache a 
translation that will fault. Without the isb() here, the VTTBR_EL2 may be 
synchronized before the rest of the context, so a speculated AT instruction may 
use an inconsistent state and allocate a TLB entry with an unexpected 
translation against the guest.

So here, we want to ensure the rest of the context is synchronized before 
writing to VTTBR_EL2, hence the isb().

> 
> 
>> +    WRITE_SYSREG64(p2m->vttbr, VTTBR_EL2);
>> +
>>       last_vcpu_ran = &p2m->last_vcpu_ran[smp_processor_id()];
>>   
>>       /*
>> @@ -157,8 +183,23 @@ static void p2m_force_tlb_flush_sync(struct p2m_domain *p2m)
>>       ovttbr = READ_SYSREG64(VTTBR_EL2);
>>       if ( ovttbr != p2m->vttbr )
>>       {
>> +        uint64_t vttbr;
>> +
>>           local_irq_save(flags);
>> -        WRITE_SYSREG64(p2m->vttbr, VTTBR_EL2);
>> +
>> +        /*
>> +         * ARM64_WORKAROUND_AT_SPECULATE: We need to stop AT to allocate
>> +         * TLBs entries because the context is partially modified. We
>> +         * only need the VMID for flushing the TLBs, so we can generate
>> +         * a new VTTBR with the VMID to flush and the empty root table.
>> +         */
>> +        if ( !cpus_have_const_cap(ARM64_WORKAROUND_AT_SPECULATE) )
>> +            vttbr = p2m->vttbr;
>> +        else
>> +            vttbr = generate_vttbr(p2m->vmid, empty_root_mfn);
>> +
>> +        WRITE_SYSREG64(vttbr, VTTBR_EL2);
> 
> Good idea, any reasons not to use generate_vttbr(p2m->vmid,
> empty_root_mfn) in the general case? There should be no downsides,
> right?
An empty root means you need to have the root page-tables allocated. In the 
configuration we currently support, the could be either 4K or 8K.

Even if this seems small, this is a downside for platforms that don't require 
such trick.

> 
> 
>>           /* Ensure VTTBR_EL2 is synchronized before flushing the TLBs */
>>           isb();
>>       }
>> @@ -1504,6 +1545,23 @@ static uint32_t __read_mostly vtcr;
>>   static void setup_virt_paging_one(void *data)
>>   {
>>       WRITE_SYSREG32(vtcr, VTCR_EL2);
>> +
>> +    /*
>> +     * ARM64_WORKAROUND_AT_SPECULATE: We want to keep the TLBs free from
>> +     * entries related to EL1/EL0 translation regime until a guest vCPU
>> +     * is running. For that, we need to set-up VTTBR to point to an empty
>> +     * page-table and turn on stage-2 translation.
> 
> I don't understand why this is needed: isn't the lack of HCR_VM (due to
> your previous patch) supposed to be sufficient? How can there be
> speculation without HCR_VM?

HCR_EL2.VM unsets means the stage-2 will not be used for the EL1/EL0 translation 
regime. In the context of the erratum, the AT can still speculate except it will 
not take into account the stage-2. The dependencies on VMID stills applies when 
HCR_EL2.VM is unset, so from my understanding, the entry could get cached to 
whatever is VTTBR_EL2.VMID.

> 
> Even if speculation happens without HCR_EL2, why do we need to set it
> now? Isn't setting empty_root_mfn enough?

The main goal here is to have the TLBs in a known state after the CPU has been 
initialized. After the sequence below, we are sure that the TLBs don't contain 
entries associated to the EL1/EL0 regime and and a speculated AT instruction 
will not be able to allocate more.

Without HCR_EL2.VM set, the stage-2 page-table will not get used. So a 
speculated AT instruction could still allocate an entry in TLB. It is not a 
major issue as it would be against INVALID_VMID, yet it is not a very sane 
situation for the hypervisor.

Cheers,
Stefano Stabellini Jan. 25, 2019, 9:36 p.m. UTC | #4
On Thu, 24 Jan 2019, Julien Grall wrote:
> (+ James)
> 
> Hi Stefano,
> 
> @James, please correct me if I am wrong below :).
> 
> On 24/01/2019 00:52, Stefano Stabellini wrote:
> > On Wed, 28 Nov 2018, Julien Grall wrote:
> > >   void p2m_restore_state(struct vcpu *n)
> > > @@ -111,10 +130,17 @@ void p2m_restore_state(struct vcpu *n)
> > >       if ( is_idle_vcpu(n) )
> > >           return;
> > >   -    WRITE_SYSREG64(p2m->vttbr, VTTBR_EL2);
> > >       WRITE_SYSREG(n->arch.sctlr, SCTLR_EL1);
> > >       WRITE_SYSREG(n->arch.hcr_el2, HCR_EL2);
> > >   +    /*
> > > +     * ARM64_WORKAROUND_AT_SPECULATE: VTTBR_EL2 should be restored after
> > > all
> > > +     * registers associated to EL1/EL0 translations regime have been
> > > +     * synchronized.
> > > +     */
> > > +    asm volatile(ALTERNATIVE("nop", "isb",
> > > ARM64_WORKAROUND_AT_SPECULATE));
> > 
> > Obviously you have done a lot more thinking about this than me, but
> > I don't fully understand the need for this barrier: this is not about
> > ARM64_WORKAROUND_AT_SPECULATE per se, right?
> 
> This particular isb() is about ARM64_WORKAROUND_AT_SPECULATE.
> 
> > Shouldn't the CPU be able
> > to figure out the right execution speculation path given that the
> > instructions ordering is correct?
> 
> What instructions ordering? Writing a system registers can be re-ordered and
> you need an isb() to ensure the full synchronization before executing at AT
> instruction or flushing the TLBs.
> 
> In hardware without the erratum, the registers associated with EL1 translation
> are out-of-context and the AT cannot speculate. The isb() added by patch #5
> ensure the context is synchronized so an AT afterwards would use a consistent
> context. Now...
> 
> > I guess it depends on the nature of
> > the hardware bug.
> 
> ... in the context of the errata, you have to imagine what can happen if an AT
> instruction is inserted (via speculation) between each instruction and what
> happen if the system registers are re-ordered.
> 
> The key of the erratum is VTTBR_EL2. This is what will stop a speculated AT
> instruction to allocate a TLBs entry because you are not allowed to cache a
> translation that will fault. Without the isb() here, the VTTBR_EL2 may be
> synchronized before the rest of the context, so a speculated AT instruction
> may use an inconsistent state and allocate a TLB entry with an unexpected
> translation against the guest.
> 
> So here, we want to ensure the rest of the context is synchronized before
> writing to VTTBR_EL2, hence the isb().

OK. I understand the explanation, thank you.

I just thought that the CPU would be smart enough to only reorder system
registers writes when appropriate, especially when the CPU is also doing
speculation at the same time. Why would it speculate if it knows that it
is reordering sysreg writes that can badly affect the speculation
itself? Let me say that it doesn't sound like a "sane" behavior to me.
But if it behaves this way, it behaves this way...


> > 
> > > +    WRITE_SYSREG64(p2m->vttbr, VTTBR_EL2);
> > > +
> > >       last_vcpu_ran = &p2m->last_vcpu_ran[smp_processor_id()];
> > >         /*
> > > @@ -157,8 +183,23 @@ static void p2m_force_tlb_flush_sync(struct
> > > p2m_domain *p2m)
> > >       ovttbr = READ_SYSREG64(VTTBR_EL2);
> > >       if ( ovttbr != p2m->vttbr )
> > >       {
> > > +        uint64_t vttbr;
> > > +
> > >           local_irq_save(flags);
> > > -        WRITE_SYSREG64(p2m->vttbr, VTTBR_EL2);
> > > +
> > > +        /*
> > > +         * ARM64_WORKAROUND_AT_SPECULATE: We need to stop AT to allocate
> > > +         * TLBs entries because the context is partially modified. We
> > > +         * only need the VMID for flushing the TLBs, so we can generate
> > > +         * a new VTTBR with the VMID to flush and the empty root table.
> > > +         */
> > > +        if ( !cpus_have_const_cap(ARM64_WORKAROUND_AT_SPECULATE) )
> > > +            vttbr = p2m->vttbr;
> > > +        else
> > > +            vttbr = generate_vttbr(p2m->vmid, empty_root_mfn);
> > > +
> > > +        WRITE_SYSREG64(vttbr, VTTBR_EL2);
> > 
> > Good idea, any reasons not to use generate_vttbr(p2m->vmid,
> > empty_root_mfn) in the general case? There should be no downsides,
> > right?
> An empty root means you need to have the root page-tables allocated. In the
> configuration we currently support, the could be either 4K or 8K.
> 
> Even if this seems small, this is a downside for platforms that don't require
> such trick.

Good point, I was fooled by empty_root_mfn being available to all as a
variable. Let's not go down that route then.


> > 
> > >           /* Ensure VTTBR_EL2 is synchronized before flushing the TLBs */
> > >           isb();
> > >       }
> > > @@ -1504,6 +1545,23 @@ static uint32_t __read_mostly vtcr;
> > >   static void setup_virt_paging_one(void *data)
> > >   {
> > >       WRITE_SYSREG32(vtcr, VTCR_EL2);
> > > +
> > > +    /*
> > > +     * ARM64_WORKAROUND_AT_SPECULATE: We want to keep the TLBs free from
> > > +     * entries related to EL1/EL0 translation regime until a guest vCPU
> > > +     * is running. For that, we need to set-up VTTBR to point to an empty
> > > +     * page-table and turn on stage-2 translation.
> > 
> > I don't understand why this is needed: isn't the lack of HCR_VM (due to
> > your previous patch) supposed to be sufficient? How can there be
> > speculation without HCR_VM?
> 
> HCR_EL2.VM unsets means the stage-2 will not be used for the EL1/EL0
> translation regime. In the context of the erratum, the AT can still speculate
> except it will not take into account the stage-2. The dependencies on VMID
> stills applies when HCR_EL2.VM is unset, so from my understanding, the entry
> could get cached to whatever is VTTBR_EL2.VMID.

Damn! Even if at this point of the boot sequence there is no EL1 / EL0
at all? How can that speculation happen? Shouldn't the first EL1 / EL0
speculation occur after the first leave_hypervisor_tail?


> > Even if speculation happens without HCR_EL2, why do we need to set it
> > now? Isn't setting empty_root_mfn enough?
> 
> The main goal here is to have the TLBs in a known state after the CPU has been
> initialized. After the sequence below, we are sure that the TLBs don't contain
> entries associated to the EL1/EL0 regime and and a speculated AT instruction
> will not be able to allocate more.
> 
> Without HCR_EL2.VM set, the stage-2 page-table will not get used. So a
> speculated AT instruction could still allocate an entry in TLB. It is not a
> major issue as it would be against INVALID_VMID, yet it is not a very sane
> situation for the hypervisor.

I have a question on the tlb flush.  Do we need it because the tlb is
not guaranteed to be clean after boot?

Also, do we need a flush_tlb_all_local()? Would flush_tlb_local be
enough, maybe executed immediately before switching VTTBR_EL2? I guess
it depends on whether the speculation happens on the local VMID only.
If it only speculate on the local VMID, then flush_tlb_all_local()
should suffice?
Julien Grall Jan. 27, 2019, 9:55 a.m. UTC | #5
Hi,

On 1/25/19 9:36 PM, Stefano Stabellini wrote:
> On Thu, 24 Jan 2019, Julien Grall wrote:
>> @James, please correct me if I am wrong below :).
>>
>> On 24/01/2019 00:52, Stefano Stabellini wrote:
>>> On Wed, 28 Nov 2018, Julien Grall wrote:
>> ... in the context of the errata, you have to imagine what can happen if an AT
>> instruction is inserted (via speculation) between each instruction and what
>> happen if the system registers are re-ordered.
>>
>> The key of the erratum is VTTBR_EL2. This is what will stop a speculated AT
>> instruction to allocate a TLBs entry because you are not allowed to cache a
>> translation that will fault. Without the isb() here, the VTTBR_EL2 may be
>> synchronized before the rest of the context, so a speculated AT instruction
>> may use an inconsistent state and allocate a TLB entry with an unexpected
>> translation against the guest.
>>
>> So here, we want to ensure the rest of the context is synchronized before
>> writing to VTTBR_EL2, hence the isb().
> 
> OK. I understand the explanation, thank you.
> 
> I just thought that the CPU would be smart enough to only reorder system
> registers writes when appropriate, especially when the CPU is also doing
> speculation at the same time. Why would it speculate if it knows that it
> is reordering sysreg writes that can badly affect the speculation
> itself? Let me say that it doesn't sound like a "sane" behavior to me.
> But if it behaves this way, it behaves this way...

I hope you are aware we are speaking about an erratum here... Not what 
the Arm Arm allows.

Aside the erratum, a processor is allowed to do whatever it wants if it 
is within the Arm Arm. These registers are described as out-of-context 
and should not be used by speculation in EL2. If you want to use them in 
EL2, you need an isb() before any instruction in EL2 using them 
otherwise you may use an inconsistent context. This is giving enough 
freedom to the processor while the impact in the software is minimal.

[...]

>>>
>>>>            /* Ensure VTTBR_EL2 is synchronized before flushing the TLBs */
>>>>            isb();
>>>>        }
>>>> @@ -1504,6 +1545,23 @@ static uint32_t __read_mostly vtcr;
>>>>    static void setup_virt_paging_one(void *data)
>>>>    {
>>>>        WRITE_SYSREG32(vtcr, VTCR_EL2);
>>>> +
>>>> +    /*
>>>> +     * ARM64_WORKAROUND_AT_SPECULATE: We want to keep the TLBs free from
>>>> +     * entries related to EL1/EL0 translation regime until a guest vCPU
>>>> +     * is running. For that, we need to set-up VTTBR to point to an empty
>>>> +     * page-table and turn on stage-2 translation.
>>>
>>> I don't understand why this is needed: isn't the lack of HCR_VM (due to
>>> your previous patch) supposed to be sufficient? How can there be
>>> speculation without HCR_VM?
>>
>> HCR_EL2.VM unsets means the stage-2 will not be used for the EL1/EL0
>> translation regime. In the context of the erratum, the AT can still speculate
>> except it will not take into account the stage-2. The dependencies on VMID
>> stills applies when HCR_EL2.VM is unset, so from my understanding, the entry
>> could get cached to whatever is VTTBR_EL2.VMID.
> 
> Damn! Even if at this point of the boot sequence there is no EL1 / EL0
> at all? How can that speculation happen? Shouldn't the first EL1 / EL0
> speculation occur after the first leave_hypervisor_tail?

How do you know EL1 was not run before hand? Imagine we did a soft 
reboot or kexec Xen...

But the speculation in that context is may be because the processor 
noticed an AT instruction targeting EL1 in the stream.

> 
> 
>>> Even if speculation happens without HCR_EL2, why do we need to set it
>>> now? Isn't setting empty_root_mfn enough?
>>
>> The main goal here is to have the TLBs in a known state after the CPU has been
>> initialized. After the sequence below, we are sure that the TLBs don't contain
>> entries associated to the EL1/EL0 regime and and a speculated AT instruction
>> will not be able to allocate more.
>>
>> Without HCR_EL2.VM set, the stage-2 page-table will not get used. So a
>> speculated AT instruction could still allocate an entry in TLB. It is not a
>> major issue as it would be against INVALID_VMID, yet it is not a very sane
>> situation for the hypervisor.
> 
> I have a question on the tlb flush.  Do we need it because the tlb is
> not guaranteed to be clean after boot?

You don't know the state of the TLBs after boot.

> 
> Also, do we need a flush_tlb_all_local()? Would flush_tlb_local be
> enough, maybe executed immediately before switching VTTBR_EL2? I guess
> it depends on whether the speculation happens on the local VMID only.

Speculation can only happen using system registers. So only on the local 
VMID only.

> If it only speculate on the local VMID, then flush_tlb_all_local()
> should suffice?

We have two VMIDs in play here: whatever was the value in VTTBR_EL2.VMID 
before the function and INVALID_VMID. We would need to flush the former 
and this would require empty root trick because speculation can happen 
as soon as flush ended.

But then, you rely on Xen to only use a single VMID at boot. While this 
is the case today, I can't tell if it will be in the future.

So the flush_tlb_local is the safest.

Cheers,
Julien Grall Jan. 28, 2019, 11:11 a.m. UTC | #6
On 1/27/19 9:55 AM, Julien Grall wrote:
> Hi,
> 
> On 1/25/19 9:36 PM, Stefano Stabellini wrote:
>> On Thu, 24 Jan 2019, Julien Grall wrote:
>>> @James, please correct me if I am wrong below :).
>>>
>>> On 24/01/2019 00:52, Stefano Stabellini wrote:
>>>> On Wed, 28 Nov 2018, Julien Grall wrote:
>>> ... in the context of the errata, you have to imagine what can happen 
>>> if an AT
>>> instruction is inserted (via speculation) between each instruction 
>>> and what
>>> happen if the system registers are re-ordered.
>>>
>>> The key of the erratum is VTTBR_EL2. This is what will stop a 
>>> speculated AT
>>> instruction to allocate a TLBs entry because you are not allowed to 
>>> cache a
>>> translation that will fault. Without the isb() here, the VTTBR_EL2 
>>> may be
>>> synchronized before the rest of the context, so a speculated AT 
>>> instruction
>>> may use an inconsistent state and allocate a TLB entry with an 
>>> unexpected
>>> translation against the guest.
>>>
>>> So here, we want to ensure the rest of the context is synchronized 
>>> before
>>> writing to VTTBR_EL2, hence the isb().
>>
>> OK. I understand the explanation, thank you.
>>
>> I just thought that the CPU would be smart enough to only reorder system
>> registers writes when appropriate, especially when the CPU is also doing
>> speculation at the same time. Why would it speculate if it knows that it
>> is reordering sysreg writes that can badly affect the speculation
>> itself? Let me say that it doesn't sound like a "sane" behavior to me.
>> But if it behaves this way, it behaves this way...
> 
> I hope you are aware we are speaking about an erratum here... Not what 
> the Arm Arm allows.
> 
> Aside the erratum, a processor is allowed to do whatever it wants if it 
> is within the Arm Arm. These registers are described as out-of-context 
> and should not be used by speculation in EL2. If you want to use them in 
> EL2, you need an isb() before any instruction in EL2 using them 
> otherwise you may use an inconsistent context. This is giving enough 
> freedom to the processor while the impact in the software is minimal.
> 
> [...]
> 
>>>>
>>>>>            /* Ensure VTTBR_EL2 is synchronized before flushing the 
>>>>> TLBs */
>>>>>            isb();
>>>>>        }
>>>>> @@ -1504,6 +1545,23 @@ static uint32_t __read_mostly vtcr;
>>>>>    static void setup_virt_paging_one(void *data)
>>>>>    {
>>>>>        WRITE_SYSREG32(vtcr, VTCR_EL2);
>>>>> +
>>>>> +    /*
>>>>> +     * ARM64_WORKAROUND_AT_SPECULATE: We want to keep the TLBs 
>>>>> free from
>>>>> +     * entries related to EL1/EL0 translation regime until a guest 
>>>>> vCPU
>>>>> +     * is running. For that, we need to set-up VTTBR to point to 
>>>>> an empty
>>>>> +     * page-table and turn on stage-2 translation.
>>>>
>>>> I don't understand why this is needed: isn't the lack of HCR_VM (due to
>>>> your previous patch) supposed to be sufficient? How can there be
>>>> speculation without HCR_VM?
>>>
>>> HCR_EL2.VM unsets means the stage-2 will not be used for the EL1/EL0
>>> translation regime. In the context of the erratum, the AT can still 
>>> speculate
>>> except it will not take into account the stage-2. The dependencies on 
>>> VMID
>>> stills applies when HCR_EL2.VM is unset, so from my understanding, 
>>> the entry
>>> could get cached to whatever is VTTBR_EL2.VMID.
>>
>> Damn! Even if at this point of the boot sequence there is no EL1 / EL0
>> at all? How can that speculation happen? Shouldn't the first EL1 / EL0
>> speculation occur after the first leave_hypervisor_tail?
> 
> How do you know EL1 was not run before hand? Imagine we did a soft 
> reboot or kexec Xen...
> 
> But the speculation in that context is may be because the processor 
> noticed an AT instruction targeting EL1 in the stream.
> 
>>
>>
>>>> Even if speculation happens without HCR_EL2, why do we need to set it
>>>> now? Isn't setting empty_root_mfn enough?
>>>
>>> The main goal here is to have the TLBs in a known state after the CPU 
>>> has been
>>> initialized. After the sequence below, we are sure that the TLBs 
>>> don't contain
>>> entries associated to the EL1/EL0 regime and and a speculated AT 
>>> instruction
>>> will not be able to allocate more.
>>>
>>> Without HCR_EL2.VM set, the stage-2 page-table will not get used. So a
>>> speculated AT instruction could still allocate an entry in TLB. It is 
>>> not a
>>> major issue as it would be against INVALID_VMID, yet it is not a very 
>>> sane
>>> situation for the hypervisor.
>>
>> I have a question on the tlb flush.  Do we need it because the tlb is
>> not guaranteed to be clean after boot?
> 
> You don't know the state of the TLBs after boot.
> 
>>
>> Also, do we need a flush_tlb_all_local()? Would flush_tlb_local be
>> enough, maybe executed immediately before switching VTTBR_EL2? I guess
>> it depends on whether the speculation happens on the local VMID only.
> 
> Speculation can only happen using system registers. So only on the local 
> VMID only.
> 
>> If it only speculate on the local VMID, then flush_tlb_all_local()
>> should suffice?
> 
> We have two VMIDs in play here: whatever was the value in VTTBR_EL2.VMID 
> before the function and INVALID_VMID. We would need to flush the former 
> and this would require empty root trick because speculation can happen 
> as soon as flush ended.
> 
> But then, you rely on Xen to only use a single VMID at boot. While this 
> is the case today, I can't tell if it will be in the future.
> 
> So the flush_tlb_local is the safest.

Hmmm, I meant flush_tlb_all_local here.

Cheers,
Stefano Stabellini Jan. 29, 2019, 12:52 a.m. UTC | #7
On Mon, 28 Jan 2019, Julien Grall wrote:
> On 1/27/19 9:55 AM, Julien Grall wrote:

> > Hi,

> > 

> > On 1/25/19 9:36 PM, Stefano Stabellini wrote:

> > > On Thu, 24 Jan 2019, Julien Grall wrote:

> > > > @James, please correct me if I am wrong below :).

> > > > 

> > > > On 24/01/2019 00:52, Stefano Stabellini wrote:

> > > > > On Wed, 28 Nov 2018, Julien Grall wrote:

> > > > ... in the context of the errata, you have to imagine what can happen if

> > > > an AT

> > > > instruction is inserted (via speculation) between each instruction and

> > > > what

> > > > happen if the system registers are re-ordered.

> > > > 

> > > > The key of the erratum is VTTBR_EL2. This is what will stop a speculated

> > > > AT

> > > > instruction to allocate a TLBs entry because you are not allowed to

> > > > cache a

> > > > translation that will fault. Without the isb() here, the VTTBR_EL2 may

> > > > be

> > > > synchronized before the rest of the context, so a speculated AT

> > > > instruction

> > > > may use an inconsistent state and allocate a TLB entry with an

> > > > unexpected

> > > > translation against the guest.

> > > > 

> > > > So here, we want to ensure the rest of the context is synchronized

> > > > before

> > > > writing to VTTBR_EL2, hence the isb().

> > > 

> > > OK. I understand the explanation, thank you.

> > > 

> > > I just thought that the CPU would be smart enough to only reorder system

> > > registers writes when appropriate, especially when the CPU is also doing

> > > speculation at the same time. Why would it speculate if it knows that it

> > > is reordering sysreg writes that can badly affect the speculation

> > > itself? Let me say that it doesn't sound like a "sane" behavior to me.

> > > But if it behaves this way, it behaves this way...

> > 

> > I hope you are aware we are speaking about an erratum here... Not what the

> > Arm Arm allows.


I know -- we are talking about a specific CPU implementation. That is
why it seems strange to me that a CPU would reorder things that it
should know they cause trouble to speculation. Anyway, no point in
discussing hardware design choices at this stage.


> > Aside the erratum, a processor is allowed to do whatever it wants if it is

> > within the Arm Arm. These registers are described as out-of-context and

> > should not be used by speculation in EL2. If you want to use them in EL2,

> > you need an isb() before any instruction in EL2 using them otherwise you may

> > use an inconsistent context. This is giving enough freedom to the processor

> > while the impact in the software is minimal.

> > 

> > [...]

> > 

> > > > > 

> > > > > >            /* Ensure VTTBR_EL2 is synchronized before flushing the

> > > > > > TLBs */

> > > > > >            isb();

> > > > > >        }

> > > > > > @@ -1504,6 +1545,23 @@ static uint32_t __read_mostly vtcr;

> > > > > >    static void setup_virt_paging_one(void *data)

> > > > > >    {

> > > > > >        WRITE_SYSREG32(vtcr, VTCR_EL2);

> > > > > > +

> > > > > > +    /*

> > > > > > +     * ARM64_WORKAROUND_AT_SPECULATE: We want to keep the TLBs free

> > > > > > from

> > > > > > +     * entries related to EL1/EL0 translation regime until a guest

> > > > > > vCPU

> > > > > > +     * is running. For that, we need to set-up VTTBR to point to an

> > > > > > empty

> > > > > > +     * page-table and turn on stage-2 translation.

> > > > > 

> > > > > I don't understand why this is needed: isn't the lack of HCR_VM (due

> > > > > to

> > > > > your previous patch) supposed to be sufficient? How can there be

> > > > > speculation without HCR_VM?

> > > > 

> > > > HCR_EL2.VM unsets means the stage-2 will not be used for the EL1/EL0

> > > > translation regime. In the context of the erratum, the AT can still

> > > > speculate

> > > > except it will not take into account the stage-2. The dependencies on

> > > > VMID

> > > > stills applies when HCR_EL2.VM is unset, so from my understanding, the

> > > > entry

> > > > could get cached to whatever is VTTBR_EL2.VMID.

> > > 

> > > Damn! Even if at this point of the boot sequence there is no EL1 / EL0

> > > at all? How can that speculation happen? Shouldn't the first EL1 / EL0

> > > speculation occur after the first leave_hypervisor_tail?

> > 

> > How do you know EL1 was not run before hand? Imagine we did a soft reboot or

> > kexec Xen...

> > 

> > But the speculation in that context is may be because the processor noticed

> > an AT instruction targeting EL1 in the stream.


This seems to be extremely improbable, borderline impossible to me, but
I can imagine that we might want to be extra-paranoid to make sure all
potential issues are covered.


> > > > > Even if speculation happens without HCR_EL2, why do we need to set it

> > > > > now? Isn't setting empty_root_mfn enough?

> > > > 

> > > > The main goal here is to have the TLBs in a known state after the CPU

> > > > has been

> > > > initialized. After the sequence below, we are sure that the TLBs don't

> > > > contain

> > > > entries associated to the EL1/EL0 regime and and a speculated AT

> > > > instruction

> > > > will not be able to allocate more.

> > > > 

> > > > Without HCR_EL2.VM set, the stage-2 page-table will not get used. So a

> > > > speculated AT instruction could still allocate an entry in TLB. It is

> > > > not a

> > > > major issue as it would be against INVALID_VMID, yet it is not a very

> > > > sane

> > > > situation for the hypervisor.

> > > 

> > > I have a question on the tlb flush.  Do we need it because the tlb is

> > > not guaranteed to be clean after boot?

> > 

> > You don't know the state of the TLBs after boot.

> > 

> > > 

> > > Also, do we need a flush_tlb_all_local()? Would flush_tlb_local be

> > > enough, maybe executed immediately before switching VTTBR_EL2? I guess

> > > it depends on whether the speculation happens on the local VMID only.

> > 

> > Speculation can only happen using system registers. So only on the local

> > VMID only.

> > 

> > > If it only speculate on the local VMID, then flush_tlb_all_local()

> > > should suffice?

> > 

> > We have two VMIDs in play here: whatever was the value in VTTBR_EL2.VMID

> > before the function and INVALID_VMID. We would need to flush the former and

> > this would require empty root trick because speculation can happen as soon

> > as flush ended.

> > 

> > But then, you rely on Xen to only use a single VMID at boot. While this is

> > the case today, I can't tell if it will be in the future.

> > 

> > So the flush_tlb_local is the safest.

> 

> Hmmm, I meant flush_tlb_all_local here.


OK.


Overall, I think we could probably get away without a couple of changes
from this patch, but since it is basically impossible for me to prove
it, I'll give my Reviewed-by. I saw that you resent the series already.
I'll take care of committing it.

Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
Julien Grall Jan. 29, 2019, 10:37 a.m. UTC | #8
Hi,

On 29/01/2019 00:52, Stefano Stabellini wrote:
> On Mon, 28 Jan 2019, Julien Grall wrote:
>> On 1/27/19 9:55 AM, Julien Grall wrote:
>>> Hi,
>>>
>>> On 1/25/19 9:36 PM, Stefano Stabellini wrote:
>>>> On Thu, 24 Jan 2019, Julien Grall wrote:
>>>>> @James, please correct me if I am wrong below :).
>>>>>
>>>>> On 24/01/2019 00:52, Stefano Stabellini wrote:
>>>>>> On Wed, 28 Nov 2018, Julien Grall wrote:
>>>>> ... in the context of the errata, you have to imagine what can happen if
>>>>> an AT
>>>>> instruction is inserted (via speculation) between each instruction and
>>>>> what
>>>>> happen if the system registers are re-ordered.
>>>>>
>>>>> The key of the erratum is VTTBR_EL2. This is what will stop a speculated
>>>>> AT
>>>>> instruction to allocate a TLBs entry because you are not allowed to
>>>>> cache a
>>>>> translation that will fault. Without the isb() here, the VTTBR_EL2 may
>>>>> be
>>>>> synchronized before the rest of the context, so a speculated AT
>>>>> instruction
>>>>> may use an inconsistent state and allocate a TLB entry with an
>>>>> unexpected
>>>>> translation against the guest.
>>>>>
>>>>> So here, we want to ensure the rest of the context is synchronized
>>>>> before
>>>>> writing to VTTBR_EL2, hence the isb().
>>>>
>>>> OK. I understand the explanation, thank you.
>>>>
>>>> I just thought that the CPU would be smart enough to only reorder system
>>>> registers writes when appropriate, especially when the CPU is also doing
>>>> speculation at the same time. Why would it speculate if it knows that it
>>>> is reordering sysreg writes that can badly affect the speculation
>>>> itself? Let me say that it doesn't sound like a "sane" behavior to me.
>>>> But if it behaves this way, it behaves this way...
>>>
>>> I hope you are aware we are speaking about an erratum here... Not what the
>>> Arm Arm allows.
> 
> I know -- we are talking about a specific CPU implementation. That is
> why it seems strange to me that a CPU would reorder things that it
> should know they cause trouble to speculation. Anyway, no point in
> discussing hardware design choices at this stage.

I agree that speculation may not happen as I described in my previous e-mail. 
However, we have to assume that any behavior allowed by the Arm Arm can happen 
unless stated otherwise by the specific processor documentation.

This series is based on the Arm Arm and the erratum description provides in the 
Software Developer Errata Notice for the Cortex-A76 [1], both are available 
publicly.

Regarding re-ordering, the wording in the document does not provide any strong 
evidence the writes to system register cannot be re-ordered. The section 
"conditions" actually suggests the invert (i.e re-ordering is possible).

[...]

>>>>>>
>>>>>>>             /* Ensure VTTBR_EL2 is synchronized before flushing the
>>>>>>> TLBs */
>>>>>>>             isb();
>>>>>>>         }
>>>>>>> @@ -1504,6 +1545,23 @@ static uint32_t __read_mostly vtcr;
>>>>>>>     static void setup_virt_paging_one(void *data)
>>>>>>>     {
>>>>>>>         WRITE_SYSREG32(vtcr, VTCR_EL2);
>>>>>>> +
>>>>>>> +    /*
>>>>>>> +     * ARM64_WORKAROUND_AT_SPECULATE: We want to keep the TLBs free
>>>>>>> from
>>>>>>> +     * entries related to EL1/EL0 translation regime until a guest
>>>>>>> vCPU
>>>>>>> +     * is running. For that, we need to set-up VTTBR to point to an
>>>>>>> empty
>>>>>>> +     * page-table and turn on stage-2 translation.
>>>>>>
>>>>>> I don't understand why this is needed: isn't the lack of HCR_VM (due
>>>>>> to
>>>>>> your previous patch) supposed to be sufficient? How can there be
>>>>>> speculation without HCR_VM?
>>>>>
>>>>> HCR_EL2.VM unsets means the stage-2 will not be used for the EL1/EL0
>>>>> translation regime. In the context of the erratum, the AT can still
>>>>> speculate
>>>>> except it will not take into account the stage-2. The dependencies on
>>>>> VMID
>>>>> stills applies when HCR_EL2.VM is unset, so from my understanding, the
>>>>> entry
>>>>> could get cached to whatever is VTTBR_EL2.VMID.
>>>>
>>>> Damn! Even if at this point of the boot sequence there is no EL1 / EL0
>>>> at all? How can that speculation happen? Shouldn't the first EL1 / EL0
>>>> speculation occur after the first leave_hypervisor_tail?
>>>
>>> How do you know EL1 was not run before hand? Imagine we did a soft reboot or
>>> kexec Xen...
>>>
>>> But the speculation in that context is may be because the processor noticed
>>> an AT instruction targeting EL1 in the stream.
> 
> This seems to be extremely improbable, borderline impossible to me, but
> I can imagine that we might want to be extra-paranoid to make sure all
> potential issues are covered.

The "Workaround" section of the erratum contains the following wording:

"Note that a workaround is only required if the system software
contains an AT instruction as part of an executable page."

Xen contains AT instruction in an executable page, so speculation can happen and 
we don't know when.

[...]

> Overall, I think we could probably get away without a couple of changes
> from this patch, but since it is basically impossible for me to prove
> it, I'll give my Reviewed-by. I saw that you resent the series already.
> I'll take care of committing it.
> 
> Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>

Thank you for the review!

Cheers,

[1] 
https://silver.arm.com/download/Documentation/BX500-DA-10008-r0p0-02rel0/Arm_Cortex_A76_MP052_Software_Developer_Errata_Notice_v11.0.pdf
diff mbox series

Patch

diff --git a/docs/misc/arm/silicon-errata.txt b/docs/misc/arm/silicon-errata.txt
index 906bf5fd48..6cd1366f15 100644
--- a/docs/misc/arm/silicon-errata.txt
+++ b/docs/misc/arm/silicon-errata.txt
@@ -48,4 +48,5 @@  stable hypervisors.
 | ARM            | Cortex-A57      | #852523         | N/A                     |
 | ARM            | Cortex-A57      | #832075         | ARM64_ERRATUM_832075    |
 | ARM            | Cortex-A57      | #834220         | ARM64_ERRATUM_834220    |
+| ARM            | Cortex-A76      | #1165522        | N/A                     |
 | ARM            | MMU-500         | #842869         | N/A                     |
diff --git a/xen/arch/arm/cpuerrata.c b/xen/arch/arm/cpuerrata.c
index adf88e7bdc..61c64b9816 100644
--- a/xen/arch/arm/cpuerrata.c
+++ b/xen/arch/arm/cpuerrata.c
@@ -489,6 +489,12 @@  static const struct arm_cpu_capabilities arm_errata[] = {
         .matches = has_ssbd_mitigation,
     },
 #endif
+    {
+        /* Cortex-A76 r0p0 - r2p0 */
+        .desc = "ARM erratum 116522",
+        .capability = ARM64_WORKAROUND_AT_SPECULATE,
+        MIDR_RANGE(MIDR_CORTEX_A76, 0, 2 << MIDR_VARIANT_SHIFT),
+    },
     {},
 };
 
diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index 1d926dcb29..3180edd89d 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -181,8 +181,6 @@  static void ctxt_switch_to(struct vcpu *n)
     if ( is_idle_vcpu(n) )
         return;
 
-    p2m_restore_state(n);
-
     vpidr = READ_SYSREG32(MIDR_EL1);
     WRITE_SYSREG32(vpidr, VPIDR_EL2);
     WRITE_SYSREG(n->arch.vmpidr, VMPIDR_EL2);
@@ -235,6 +233,12 @@  static void ctxt_switch_to(struct vcpu *n)
 #endif
     isb();
 
+    /*
+     * ARM64_WORKAROUND_AT_SPECULATE: The P2M should be restored after
+     * the stage-1 MMU sysregs have been restored.
+     */
+    p2m_restore_state(n);
+
     /* Control Registers */
     WRITE_SYSREG(n->arch.cpacr, CPACR_EL1);
 
diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index 844833c4c3..0facb66096 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -15,6 +15,7 @@ 
 #include <asm/event.h>
 #include <asm/hardirq.h>
 #include <asm/page.h>
+#include <asm/alternative.h>
 
 #define MAX_VMID_8_BIT  (1UL << 8)
 #define MAX_VMID_16_BIT (1UL << 16)
@@ -47,6 +48,8 @@  static const paddr_t level_masks[] =
 static const uint8_t level_orders[] =
     { ZEROETH_ORDER, FIRST_ORDER, SECOND_ORDER, THIRD_ORDER };
 
+static mfn_t __read_mostly empty_root_mfn;
+
 static uint64_t generate_vttbr(uint16_t vmid, mfn_t root_mfn)
 {
     return (mfn_to_maddr(root_mfn) | ((uint64_t)vmid << 48));
@@ -98,9 +101,25 @@  void dump_p2m_lookup(struct domain *d, paddr_t addr)
                  P2M_ROOT_LEVEL, P2M_ROOT_PAGES);
 }
 
+/*
+ * p2m_save_state and p2m_restore_state works in pair to workaround
+ * ARM64_WORKAROUND_AT_SPECULATE. p2m_save_state will set-up VTTBR to
+ * point to the empty page-tables to stop allocating TLB entries.
+ */
 void p2m_save_state(struct vcpu *p)
 {
     p->arch.sctlr = READ_SYSREG(SCTLR_EL1);
+
+    if ( cpus_have_const_cap(ARM64_WORKAROUND_AT_SPECULATE) )
+    {
+        WRITE_SYSREG64(generate_vttbr(INVALID_VMID, empty_root_mfn), VTTBR_EL2);
+        /*
+         * Ensure VTTBR_EL2 is correctly synchronized so we can restore
+         * the next vCPU context without worrying about AT instruction
+         * speculation.
+         */
+        isb();
+    }
 }
 
 void p2m_restore_state(struct vcpu *n)
@@ -111,10 +130,17 @@  void p2m_restore_state(struct vcpu *n)
     if ( is_idle_vcpu(n) )
         return;
 
-    WRITE_SYSREG64(p2m->vttbr, VTTBR_EL2);
     WRITE_SYSREG(n->arch.sctlr, SCTLR_EL1);
     WRITE_SYSREG(n->arch.hcr_el2, HCR_EL2);
 
+    /*
+     * ARM64_WORKAROUND_AT_SPECULATE: VTTBR_EL2 should be restored after all
+     * registers associated to EL1/EL0 translations regime have been
+     * synchronized.
+     */
+    asm volatile(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_AT_SPECULATE));
+    WRITE_SYSREG64(p2m->vttbr, VTTBR_EL2);
+
     last_vcpu_ran = &p2m->last_vcpu_ran[smp_processor_id()];
 
     /*
@@ -157,8 +183,23 @@  static void p2m_force_tlb_flush_sync(struct p2m_domain *p2m)
     ovttbr = READ_SYSREG64(VTTBR_EL2);
     if ( ovttbr != p2m->vttbr )
     {
+        uint64_t vttbr;
+
         local_irq_save(flags);
-        WRITE_SYSREG64(p2m->vttbr, VTTBR_EL2);
+
+        /*
+         * ARM64_WORKAROUND_AT_SPECULATE: We need to stop AT to allocate
+         * TLBs entries because the context is partially modified. We
+         * only need the VMID for flushing the TLBs, so we can generate
+         * a new VTTBR with the VMID to flush and the empty root table.
+         */
+        if ( !cpus_have_const_cap(ARM64_WORKAROUND_AT_SPECULATE) )
+            vttbr = p2m->vttbr;
+        else
+            vttbr = generate_vttbr(p2m->vmid, empty_root_mfn);
+
+        WRITE_SYSREG64(vttbr, VTTBR_EL2);
+
         /* Ensure VTTBR_EL2 is synchronized before flushing the TLBs */
         isb();
     }
@@ -1504,6 +1545,23 @@  static uint32_t __read_mostly vtcr;
 static void setup_virt_paging_one(void *data)
 {
     WRITE_SYSREG32(vtcr, VTCR_EL2);
+
+    /*
+     * ARM64_WORKAROUND_AT_SPECULATE: We want to keep the TLBs free from
+     * entries related to EL1/EL0 translation regime until a guest vCPU
+     * is running. For that, we need to set-up VTTBR to point to an empty
+     * page-table and turn on stage-2 translation. The TLB entries
+     * associated with EL1/EL0 translation regime will also be flushed in case
+     * an AT instruction was speculated before hand.
+     */
+    if ( cpus_have_cap(ARM64_WORKAROUND_AT_SPECULATE) )
+    {
+        WRITE_SYSREG64(generate_vttbr(INVALID_VMID, empty_root_mfn), VTTBR_EL2);
+        WRITE_SYSREG(READ_SYSREG(HCR_EL2) | HCR_VM, HCR_EL2);
+        isb();
+
+        flush_tlb_all_local();
+    }
 }
 
 void __init setup_virt_paging(void)
@@ -1587,6 +1645,22 @@  void __init setup_virt_paging(void)
     /* It is not allowed to concatenate a level zero root */
     BUG_ON( P2M_ROOT_LEVEL == 0 && P2M_ROOT_ORDER > 0 );
     vtcr = val;
+
+    /*
+     * ARM64_WORKAROUND_AT_SPECULATE requires to allocate root table
+     * with all entries zeroed.
+     */
+    if ( cpus_have_cap(ARM64_WORKAROUND_AT_SPECULATE) )
+    {
+        struct page_info *root;
+
+        root = p2m_allocate_root();
+        if ( !root )
+            panic("Unable to allocate root table for ARM64_WORKAROUND_AT_SPECULATE\n");
+
+        empty_root_mfn = page_to_mfn(root);
+    }
+
     setup_virt_paging_one(NULL);
     smp_call_function(setup_virt_paging_one, NULL, 1);
 }
diff --git a/xen/include/asm-arm/cpufeature.h b/xen/include/asm-arm/cpufeature.h
index 17de928467..c2c8f3417c 100644
--- a/xen/include/asm-arm/cpufeature.h
+++ b/xen/include/asm-arm/cpufeature.h
@@ -45,8 +45,9 @@ 
 #define ARM_HARDEN_BRANCH_PREDICTOR 7
 #define ARM_SSBD 8
 #define ARM_SMCCC_1_1 9
+#define ARM64_WORKAROUND_AT_SPECULATE 10
 
-#define ARM_NCAPS           10
+#define ARM_NCAPS           11
 
 #ifndef __ASSEMBLY__
 
diff --git a/xen/include/asm-arm/processor.h b/xen/include/asm-arm/processor.h
index 72ddc42778..d03ec6e272 100644
--- a/xen/include/asm-arm/processor.h
+++ b/xen/include/asm-arm/processor.h
@@ -52,6 +52,7 @@ 
 #define ARM_CPU_PART_CORTEX_A72     0xD08
 #define ARM_CPU_PART_CORTEX_A73     0xD09
 #define ARM_CPU_PART_CORTEX_A75     0xD0A
+#define ARM_CPU_PART_CORTEX_A76     0xD0B
 
 #define MIDR_CORTEX_A12 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A12)
 #define MIDR_CORTEX_A17 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A17)
@@ -61,6 +62,7 @@ 
 #define MIDR_CORTEX_A72 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72)
 #define MIDR_CORTEX_A73 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A73)
 #define MIDR_CORTEX_A75 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A75)
+#define MIDR_CORTEX_A76 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A76)
 
 /* MPIDR Multiprocessor Affinity Register */
 #define _MPIDR_UP           (30)