[Xen-devel,v2,1/6] xen/arm: Park CPUs with a MIDR different from the boot CPU.

Message ID 1519077510-22405-1-git-send-email-sstabellini@kernel.org
State Superseded
Series [Xen-devel,v2,1/6] xen/arm: Park CPUs with a MIDR different from the boot CPU.

Commit Message

Stefano Stabellini Feb. 19, 2018, 9:58 p.m. UTC
From: Julien Grall <julien.grall@arm.com>

Xen does not properly support big.LITTLE platforms. All vCPUs of a guest
will always have the MIDR of the boot CPU (see arch_domain_create).
At best the guest may see unreliable performance (vCPU switching between
big and LITTLE), at worst the guest will become unreliable or insecure.

This is becoming more apparent with branch predictor hardening in Linux,
because the mitigations target a specific kind of CPU and may not work on
other CPUs.

For the time being, park any CPUs with a MIDR different from the boot
CPU. This will be revisited in the future once Xen gains understanding
of big.LITTLE (see the discussion in [1]).

[1] https://lists.xenproject.org/archives/html/xen-devel/2016-12/msg00826.html

Signed-off-by: Julien Grall <julien.grall@arm.com>
Reviewed-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
Acked-by: Jan Beulich <jbeulich@suse.com>
---
 docs/misc/xen-command-line.markdown | 10 ++++++++++
 xen/arch/arm/smpboot.c              | 26 ++++++++++++++++++++++++++
 2 files changed, 36 insertions(+)

Comments

Julien Grall Feb. 20, 2018, 10:39 a.m. UTC | #1
Hi Stefano,

On 19/02/18 21:58, Stefano Stabellini wrote:
> There can be processors of different kinds on a single system. Make
> processor a per_cpu variable pointing to the right processor type for
> each core.
> 
> Suggested-by: Julien Grall <julien.grall@arm.com>
> Signed-off-by: Stefano Stabellini <sstabellini@kernel.org>

Reviewed-by: Julien Grall <julien.grall@arm.com>

> 
> ---
> Changes in v2:
> - add patch
> ---
>   xen/arch/arm/processor.c | 8 ++++----
>   xen/arch/arm/smpboot.c   | 2 ++
>   2 files changed, 6 insertions(+), 4 deletions(-)
> 
> diff --git a/xen/arch/arm/processor.c b/xen/arch/arm/processor.c
> index 8c425ce..ce43850 100644
> --- a/xen/arch/arm/processor.c
> +++ b/xen/arch/arm/processor.c
> @@ -18,7 +18,7 @@
>    */
>   #include <asm/procinfo.h>
>   
> -static const struct processor *processor = NULL;
> +static DEFINE_PER_CPU(struct processor *, processor);
>   
>   void __init processor_setup(void)
>   {
> @@ -28,15 +28,15 @@ void __init processor_setup(void)
>       if ( !procinfo )
>           return;
>   
> -    processor = procinfo->processor;
> +    this_cpu(processor) = procinfo->processor;
>   }
>   
>   void processor_vcpu_initialise(struct vcpu *v)
>   {
> -    if ( !processor || !processor->vcpu_initialise )
> +    if ( !this_cpu(processor) || !this_cpu(processor)->vcpu_initialise )
>           return;
>   
> -    processor->vcpu_initialise(v);
> +    this_cpu(processor)->vcpu_initialise(v);
>   }
>   
>   /*
> diff --git a/xen/arch/arm/smpboot.c b/xen/arch/arm/smpboot.c
> index 7ea4e41..122c0b5 100644
> --- a/xen/arch/arm/smpboot.c
> +++ b/xen/arch/arm/smpboot.c
> @@ -32,6 +32,7 @@
>   #include <xen/console.h>
>   #include <asm/cpuerrata.h>
>   #include <asm/gic.h>
> +#include <asm/procinfo.h>
>   #include <asm/psci.h>
>   #include <asm/acpi.h>
>   
> @@ -300,6 +301,7 @@ void start_secondary(unsigned long boot_phys_offset,
>       set_processor_id(cpuid);
>   
>       identify_cpu(&current_cpu_data);
> +    processor_setup();
>   
>       init_traps();
>   
> 

Cheers,
Julien Grall Feb. 20, 2018, 10:56 a.m. UTC | #2
Hi Stefano,

On 19/02/18 21:58, Stefano Stabellini wrote:
> On big.LITTLE systems not all cores have the same ACTLR. Instead of
> reading ACTLR and setting v->arch.actlr in vcpu_initialise, do it later
> on the same pcpu where the vcpu will run.
> 
> This way, assuming that the vcpu has been created with the right pcpu
> affinity, the guest will be able to read the right ACTLR value, matching
> the one of the physical cpu.
> 
> Also move processor_vcpu_initialise(v) to continue_new_vcpu as it
> can modify v->arch.actlr.
> 
> Signed-off-by: Stefano Stabellini <sstabellini@kernel.org>
> 
> ---
> 
> Changes in v2:
> - move processor_vcpu_initialise to continue_new_vcpu
> - remove inaccurate sentence from commit message
> ---
>   xen/arch/arm/domain.c | 7 +++----
>   1 file changed, 3 insertions(+), 4 deletions(-)
> 
> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
> index a010443..fb51415 100644
> --- a/xen/arch/arm/domain.c
> +++ b/xen/arch/arm/domain.c
> @@ -314,6 +314,9 @@ static void schedule_tail(struct vcpu *prev)
>   
>   static void continue_new_vcpu(struct vcpu *prev)
>   {
> +    current->arch.actlr = READ_SYSREG32(ACTLR_EL1);

Hmmm, I just realised that ACTLR_EL1 has been extended to 64-bit for
AArch64 state in a recent version of the spec.

I don't think it is necessary to update it in this series, so:

Reviewed-by: Julien Grall <julien.grall@arm.com>

Cheers,


> +    processor_vcpu_initialise(current);
> +
>       schedule_tail(prev);
>   
>       if ( is_idle_vcpu(current) )
> @@ -540,12 +543,8 @@ int vcpu_initialise(struct vcpu *v)
>   
>       v->arch.vmpidr = MPIDR_SMP | vcpuid_to_vaffinity(v->vcpu_id);
>   
> -    v->arch.actlr = READ_SYSREG32(ACTLR_EL1);
> -
>       v->arch.hcr_el2 = get_default_hcr_flags();
>   
> -    processor_vcpu_initialise(v);
> -
>       if ( (rc = vcpu_vgic_init(v)) != 0 )
>           goto fail;
>   
> 

Cheers,
Julien Grall Feb. 20, 2018, 11:02 a.m. UTC | #3
Hi Stefano,

On 19/02/18 21:58, Stefano Stabellini wrote:
> On big.LITTLE systems not all cores have the same midr. Instead of
> storing only one vpidr per domain, make it per vcpu and initialize it to
> the value of the midr of the pcpu where the vcpu will run.
> 
> This way, assuming that the vcpu has been created with the right pcpu
> affinity, the guest will be able to read the right vpidr value, matching
> the one of the physical cpu.
> 
> Signed-off-by: Stefano Stabellini <sstabellini@kernel.org>
> 
> ---
> 
> - remove warning message
> - make vpidr per vcpu
> ---
>   xen/arch/arm/domain.c        | 6 ++----
>   xen/arch/arm/vcpreg.c        | 4 ++--
>   xen/include/asm-arm/domain.h | 6 +++---
>   3 files changed, 7 insertions(+), 9 deletions(-)
> 
> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
> index fb51415..41d5d25 100644
> --- a/xen/arch/arm/domain.c
> +++ b/xen/arch/arm/domain.c
> @@ -180,7 +180,7 @@ static void ctxt_switch_to(struct vcpu *n)
>   
>       p2m_restore_state(n);
>   
> -    WRITE_SYSREG32(n->domain->arch.vpidr, VPIDR_EL2);
> +    WRITE_SYSREG32(n->arch.vpidr, VPIDR_EL2);

Do we really need to store the vpidr in struct vcpu? It would be simpler
and more efficient (no memory access) to directly read MIDR_EL1 and copy
it to VPIDR_EL2.
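
A minimal sketch of that suggestion (assuming Xen's READ_SYSREG32 and
WRITE_SYSREG32 helpers; this is an illustration, not the committed code):

    /* In ctxt_switch_to(): mirror the physical MIDR into the
     * guest-visible VPIDR instead of caching it in struct vcpu. */
    WRITE_SYSREG32(READ_SYSREG32(MIDR_EL1), VPIDR_EL2);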

Cheers,
Julien Grall Feb. 20, 2018, 11:33 a.m. UTC | #4
Hi Stefano,

On 19/02/18 21:58, Stefano Stabellini wrote:
> On big.LITTLE systems the cacheline size might differ between big and
> LITTLE cores. Instead of reading the cacheline size once at boot,
> read it as needed from the system registers.
> 
> Suggested-by: Julien Grall <julien.grall@arm.com>
> Signed-off-by: Stefano Stabellini <sstabellini@kernel.org>
> ---
>   xen/arch/arm/arm32/head.S  |  9 +++++++--
>   xen/arch/arm/arm64/head.S  | 10 ++++++++--
>   xen/arch/arm/setup.c       | 17 -----------------
>   xen/include/asm-arm/page.h | 17 +++++++++++++++--
>   4 files changed, 30 insertions(+), 23 deletions(-)
> 
> diff --git a/xen/arch/arm/arm32/head.S b/xen/arch/arm/arm32/head.S
> index 43374e7..db470ad 100644
> --- a/xen/arch/arm/arm32/head.S
> +++ b/xen/arch/arm/arm32/head.S
> @@ -504,8 +504,13 @@ ENTRY(relocate_xen)
>           dsb        /* So the CPU issues all writes to the range */
>   
>           mov   r5, r4
> -        ldr   r6, =cacheline_bytes /* r6 := step */
> -        ldr   r6, [r6]
> +        mov   r6, #0

Comments in the code would be nice to know what you exactly do. Also in 
that case, it would make sense to have a macro as this may be useful in 
other places.

> +        mcr   CP32(r6, CSSELR_EL1)

Please use 32-bit naming in the 32-bit code.


> +        mrc   CP32(r6, CSSELR_EL1)

The size of the cache is read using CCSIDR_EL1. But it looks like the
way we get the cache line size in Xen is fragile.

We are retrieving the cache line size of Level 1 and assume this will be 
valid for all the other caches. Indeed cache maintenance ops may 
propagate to other caches depending on the target (Point of Coherency vs
Point of Unification).

Looking at the ARM ARM "Cache hierarchy abstraction for address-based 
operations" (D3-2061 DDI 0487C.a), CTR_EL0/CTR will holds the minimum 
line lenght values for the data caches. The value will be the most 
efficient address stride to use to apply a sequence of VA-based 
maintenance instructions to a range of VAs.

So it would be best and safer for Xen to use CTR/CTR_EL0.DminLine.
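
A hedged C sketch of that approach (the helper name and the use of
READ_SYSREG32 on CTR_EL0 are assumptions for illustration, not Xen's
actual implementation):

    /* CTR_EL0.DminLine (bits [19:16]) is log2 of the smallest data
     * cache line in the hierarchy, in 4-byte words. */
    static inline size_t dcache_line_bytes(void)
    {
        uint32_t ctr = READ_SYSREG32(CTR_EL0);

        return (size_t)4 << ((ctr >> 16) & 0xf);
    }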

> +        and   r6, r6, #0x7
> +        add   r6, r6, #4
> +        mov   r7, #1
> +        lsl   r6, r7, r6
>           mov   r7, r3
>   
>   1:      mcr   CP32(r7, DCCMVAC)
> diff --git a/xen/arch/arm/arm64/head.S b/xen/arch/arm/arm64/head.S
> index fa0ef70..edea300 100644
> --- a/xen/arch/arm/arm64/head.S
> +++ b/xen/arch/arm/arm64/head.S
> @@ -631,8 +631,14 @@ ENTRY(relocate_xen)
>           dsb   sy        /* So the CPU issues all writes to the range */
>   
>           mov   x9, x3
> -        ldr   x10, =cacheline_bytes /* x10 := step */
> -        ldr   x10, [x10]
> +
> +        mov   x10, #0
> +        msr   CSSELR_EL1, x10
> +        mrs   x10, CSSELR_EL1
> +        and   x10, x10, #0x7
> +        add   x10, x10, #4
> +        mov   x11, #1
> +        lsl   x10, x11, x10

Please use dcache_line_size macro (see cache.S).

>           mov   x11, x2
>   
>   1:      dc    cvac, x11
> diff --git a/xen/arch/arm/setup.c b/xen/arch/arm/setup.c
> index 032a6a8..4754c95 100644
> --- a/xen/arch/arm/setup.c
> +++ b/xen/arch/arm/setup.c
> @@ -680,21 +680,6 @@ static void __init setup_mm(unsigned long dtb_paddr, size_t dtb_size)
>   }
>   #endif
>   
> -size_t __read_mostly cacheline_bytes;
> -
> -/* Very early check of the CPU cache properties */
> -void __init setup_cache(void)
> -{
> -    uint32_t ccsid;
> -
> -    /* Read the cache size ID register for the level-0 data cache */
> -    WRITE_SYSREG32(0, CSSELR_EL1);
> -    ccsid = READ_SYSREG32(CCSIDR_EL1);
> -
> -    /* Low 3 bits are log2(cacheline size in words) - 2. */
> -    cacheline_bytes = 1U << (4 + (ccsid & 0x7));
> -}
> -
>   /* C entry point for boot CPU */
>   void __init start_xen(unsigned long boot_phys_offset,
>                         unsigned long fdt_paddr,
> @@ -708,8 +693,6 @@ void __init start_xen(unsigned long boot_phys_offset,
>       struct domain *dom0;
>       struct xen_arch_domainconfig config;
>   
> -    setup_cache();
> -
>       percpu_init_areas();
>       set_processor_id(0); /* needed early, for smp_processor_id() */
>   
> diff --git a/xen/include/asm-arm/page.h b/xen/include/asm-arm/page.h
> index d948250..791e22e 100644
> --- a/xen/include/asm-arm/page.h
> +++ b/xen/include/asm-arm/page.h
> @@ -133,8 +133,6 @@
>   
>   /* Architectural minimum cacheline size is 4 32-bit words. */
>   #define MIN_CACHELINE_BYTES 16
> -/* Actual cacheline size on the boot CPU. */
> -extern size_t cacheline_bytes;
>   
>   #define copy_page(dp, sp) memcpy(dp, sp, PAGE_SIZE)
>   
> @@ -142,9 +140,22 @@ extern size_t cacheline_bytes;
>    * if 'range' is large enough we might want to use model-specific
>    * full-cache flushes. */
>   
> +static inline size_t read_cacheline_size(void)
> +{
> +    uint32_t ccsid;
> +
> +    /* Read the cache size ID register for the level-0 data cache */
> +    WRITE_SYSREG32(0, CSSELR_EL1);
> +    ccsid = READ_SYSREG32(CCSIDR_EL1);
> +
> +    /* Low 3 bits are log2(cacheline size in words) - 2. */
> +    return (size_t) (1U << (4 + (ccsid & 0x7)));

See my remark above regarding the cacheline size.

> +}
> +
>   static inline int invalidate_dcache_va_range(const void *p, unsigned long size)
>   {
>       const void *end = p + size;
> +    size_t cacheline_bytes = read_cacheline_size();
>       size_t cacheline_mask = cacheline_bytes - 1;
>   
>       dsb(sy);           /* So the CPU issues all writes to the range */
> @@ -171,6 +182,7 @@ static inline int invalidate_dcache_va_range(const void *p, unsigned long size)
>   
>   static inline int clean_dcache_va_range(const void *p, unsigned long size)
>   {
> +    size_t cacheline_bytes = read_cacheline_size();
>       const void *end = p + size;
>       dsb(sy);           /* So the CPU issues all writes to the range */
>       p = (void *)((uintptr_t)p & ~(cacheline_bytes - 1));
> @@ -184,6 +196,7 @@ static inline int clean_dcache_va_range(const void *p, unsigned long size)
>   static inline int clean_and_invalidate_dcache_va_range
>       (const void *p, unsigned long size)
>   {
> +    size_t cacheline_bytes = read_cacheline_size();
>       const void *end = p + size;
>       dsb(sy);         /* So the CPU issues all writes to the range */
>       p = (void *)((uintptr_t)p & ~(cacheline_bytes - 1));
> 

Cheers,
Julien Grall Feb. 20, 2018, 11:38 a.m. UTC | #5
Hi Stefano,

On 19/02/18 21:58, Stefano Stabellini wrote:
> diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
> index 2184cb9..8997904 100644
> --- a/docs/misc/xen-command-line.markdown
> +++ b/docs/misc/xen-command-line.markdown
> @@ -1007,7 +1007,12 @@ Control Xens use of the APEI Hardware Error Source Table, should one be found.
>   
>   Say yes at your own risk if you want to enable heterogenous computing
>   (such as big.LITTLE). This may result to an unstable and insecure
> -platform. When the option is disabled (default), CPUs that are not
> +platform, unless you manually specify the cpu affinity of all domains so
> +that all vcpus are scheduled on the same class of pcpus (big or LITTLE
> +but not both). vcpu migration between big cores and LITTLE cores is not
> +supported. See docs/misc/arm/big.LITTLE.txt for more information.
> +
> +When the hmp-unsafe option is disabled (default), CPUs that are not
>   identical to the boot CPU will be parked and not used by Xen.
>   
>   ### hpetbroadcast
> diff --git a/xen/arch/arm/smpboot.c b/xen/arch/arm/smpboot.c
> index 122c0b5..d04b7c7 100644
> --- a/xen/arch/arm/smpboot.c
> +++ b/xen/arch/arm/smpboot.c
> @@ -266,7 +266,7 @@ void __init smp_init_cpus(void)
>   
>       if ( opt_hmp_unsafe )
>           warning_add("WARNING: HMP COMPUTING HAS BEEN ENABLED.\n"
> -                    "It has implications on the security and stability of the system.\n");
> +                    "It has implications on the security and stability of the system, unless the cpu affinity of all domains is specified.\n");

The warning message will not print nicely on serial. Please make sure
it is split at 72 characters.
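
One possible split (illustrative only, not necessarily the wording that
was eventually committed):

    warning_add("WARNING: HMP COMPUTING HAS BEEN ENABLED.\n"
                "It has implications on the security and stability of\n"
                "the system, unless the cpu affinity of all domains is\n"
                "specified.\n");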

>   }
>   
>   int __init
> @@ -308,13 +308,14 @@ void start_secondary(unsigned long boot_phys_offset,
>       /*
>        * Currently Xen assumes the platform has only one kind of CPUs.
>        * This assumption does not hold on big.LITTLE platform and may
> -     * result to instability and insecure platform. Better to park them
> -     * for now.
> +     * result to instability and insecure platform (unless cpu affinity
> +     * is manually specified for all domains). Better to park them for
> +     * now.
>        */
>       if ( !opt_hmp_unsafe &&
>            current_cpu_data.midr.bits != boot_cpu_data.midr.bits )
>       {
> -        printk(XENLOG_ERR "CPU%u MIDR (0x%x) does not match boot CPU MIDR (0x%x).\n",
> +        printk(XENLOG_ERR "CPU%u MIDR (0x%x) does not match boot CPU MIDR (0x%x), disable cpu (see big.LITTLE.txt under docs/).\n",

Same here.

>                  smp_processor_id(), current_cpu_data.midr.bits,
>                  boot_cpu_data.midr.bits);
>           stop_cpu();
> 

Cheers,
Stefano Stabellini Feb. 20, 2018, 4:40 p.m. UTC | #6
On Tue, 20 Feb 2018, Julien Grall wrote:
> Hi Stefano,
> 
> On 19/02/18 21:58, Stefano Stabellini wrote:
> > diff --git a/docs/misc/xen-command-line.markdown
> > b/docs/misc/xen-command-line.markdown
> > index 2184cb9..8997904 100644
> > --- a/docs/misc/xen-command-line.markdown
> > +++ b/docs/misc/xen-command-line.markdown
> > @@ -1007,7 +1007,12 @@ Control Xens use of the APEI Hardware Error Source
> > Table, should one be found.
> >     Say yes at your own risk if you want to enable heterogenous computing
> >   (such as big.LITTLE). This may result to an unstable and insecure
> > -platform. When the option is disabled (default), CPUs that are not
> > +platform, unless you manually specify the cpu affinity of all domains so
> > +that all vcpus are scheduled on the same class of pcpus (big or LITTLE
> > +but not both). vcpu migration between big cores and LITTLE cores is not
> > +supported. See docs/misc/arm/big.LITTLE.txt for more information.
> > +
> > +When the hmp-unsafe option is disabled (default), CPUs that are not
> >   identical to the boot CPU will be parked and not used by Xen.
> >     ### hpetbroadcast
> > diff --git a/xen/arch/arm/smpboot.c b/xen/arch/arm/smpboot.c
> > index 122c0b5..d04b7c7 100644
> > --- a/xen/arch/arm/smpboot.c
> > +++ b/xen/arch/arm/smpboot.c
> > @@ -266,7 +266,7 @@ void __init smp_init_cpus(void)
> >         if ( opt_hmp_unsafe )
> >           warning_add("WARNING: HMP COMPUTING HAS BEEN ENABLED.\n"
> > -                    "It has implications on the security and stability of
> > the system.\n");
> > +                    "It has implications on the security and stability of
> > the system, unless the cpu affinity of all domains is specified.\n");
> 
> The warning message will not print nicely on serial. Please make sure it is
> split at 72 characters.

OK


> >   }
> >     int __init
> > @@ -308,13 +308,14 @@ void start_secondary(unsigned long boot_phys_offset,
> >       /*
> >        * Currently Xen assumes the platform has only one kind of CPUs.
> >        * This assumption does not hold on big.LITTLE platform and may
> > -     * result to instability and insecure platform. Better to park them
> > -     * for now.
> > +     * result to instability and insecure platform (unless cpu affinity
> > +     * is manually specified for all domains). Better to park them for
> > +     * now.
> >        */
> >       if ( !opt_hmp_unsafe &&
> >            current_cpu_data.midr.bits != boot_cpu_data.midr.bits )
> >       {
> > -        printk(XENLOG_ERR "CPU%u MIDR (0x%x) does not match boot CPU MIDR
> > (0x%x).\n",
> > +        printk(XENLOG_ERR "CPU%u MIDR (0x%x) does not match boot CPU MIDR
> > (0x%x), disable cpu (see big.LITTLE.txt under docs/).\n",
> 
> Same here.
> 
> >                  smp_processor_id(), current_cpu_data.midr.bits,
> >                  boot_cpu_data.midr.bits);
> >           stop_cpu();
> > 
> 
> Cheers,
> 
> -- 
> Julien Grall
>
Stefano Stabellini Feb. 20, 2018, 8:47 p.m. UTC | #7
On Tue, 20 Feb 2018, Julien Grall wrote:
> Hi Stefano,
> 
> On 19/02/18 21:58, Stefano Stabellini wrote:
> > On big.LITTLE systems not all cores have the same midr. Instead of
> > storing only one vpidr per domain, make it per vcpu and initialize it to
> > the value of the midr of the pcpu where the vcpu will run.
> > 
> > This way, assuming that the vcpu has been created with the right pcpu
> > affinity, the guest will be able to read the right vpidr value, matching
> > the one of the physical cpu.
> > 
> > Signed-off-by: Stefano Stabellini <sstabellini@kernel.org>
> > 
> > ---
> > 
> > - remove warning message
> > - make vpidr per vcpu
> > ---
> >   xen/arch/arm/domain.c        | 6 ++----
> >   xen/arch/arm/vcpreg.c        | 4 ++--
> >   xen/include/asm-arm/domain.h | 6 +++---
> >   3 files changed, 7 insertions(+), 9 deletions(-)
> > 
> > diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
> > index fb51415..41d5d25 100644
> > --- a/xen/arch/arm/domain.c
> > +++ b/xen/arch/arm/domain.c
> > @@ -180,7 +180,7 @@ static void ctxt_switch_to(struct vcpu *n)
> >         p2m_restore_state(n);
> >   -    WRITE_SYSREG32(n->domain->arch.vpidr, VPIDR_EL2);
> > +    WRITE_SYSREG32(n->arch.vpidr, VPIDR_EL2);
> 
> Do we really need to store the vpidr in struct vcpu? It would be simpler and
> more efficient (no memory access) to directly read MIDR_EL1 and copy it to
> VPIDR_EL2.

I followed your suggestion to drop vpidr from struct vcpu and just read
MIDR_EL1 in ctxt_switch_to. In do_cp14_32 I replaced v->arch.vpidr with
current_cpu_data.midr.bits for simplicity.
Stefano Stabellini Feb. 20, 2018, 9:03 p.m. UTC | #8
On Tue, 20 Feb 2018, Julien Grall wrote:
> Hi Stefano,
> 
> On 19/02/18 21:58, Stefano Stabellini wrote:
> > On big.LITTLE systems the cacheline size might differ between big and
> > LITTLE cores. Instead of reading the cacheline size once at boot,
> > read it as needed from the system registers.
> > 
> > Suggested-by: Julien Grall <julien.grall@arm.com>
> > Signed-off-by: Stefano Stabellini <sstabellini@kernel.org>
> > ---
> >   xen/arch/arm/arm32/head.S  |  9 +++++++--
> >   xen/arch/arm/arm64/head.S  | 10 ++++++++--
> >   xen/arch/arm/setup.c       | 17 -----------------
> >   xen/include/asm-arm/page.h | 17 +++++++++++++++--
> >   4 files changed, 30 insertions(+), 23 deletions(-)
> > 
> > diff --git a/xen/arch/arm/arm32/head.S b/xen/arch/arm/arm32/head.S
> > index 43374e7..db470ad 100644
> > --- a/xen/arch/arm/arm32/head.S
> > +++ b/xen/arch/arm/arm32/head.S
> > @@ -504,8 +504,13 @@ ENTRY(relocate_xen)
> >           dsb        /* So the CPU issues all writes to the range */
> >             mov   r5, r4
> > -        ldr   r6, =cacheline_bytes /* r6 := step */
> > -        ldr   r6, [r6]
> > +        mov   r6, #0
> 
> Comments in the code would be nice to know what you exactly do. Also in that
> case, it would make sense to have a macro as this may be useful in other
> places.

OK. This is a 1:1 translation from setup_cache.
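
(Worked example of that formula, with a hypothetical register value: if
CCSIDR[2:0] = 2, a line holds 2^(2+2) = 16 words, i.e.
1 << (4 + 2) = 64 bytes, matching the C version in setup_cache.)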


> > +        mcr   CP32(r6, CSSELR_EL1)
> 
> Please use 32-bit naming in the 32-bit code.

I'll change.

 
> > +        mrc   CP32(r6, CSSELR_EL1)
> 
> The size of the cache is read using CCSIDR_EL1. But it looks like the way we
> get the cache line size in Xen is fragile.
> 
> We are retrieving the cache line size of Level 1 and assume this will be valid
> for all the other caches. Indeed cache maintenance ops may propagate to other
> caches depending on the target (Point of Coherency vs Point of Unification).
> 
> Looking at the ARM ARM "Cache hierarchy abstraction for address-based
> operations" (D3-2061 DDI 0487C.a), CTR_EL0/CTR will holds the minimum line
> lenght values for the data caches. The value will be the most efficient
> address stride to use to apply a sequence of VA-based maintenance instructions
> to a range of VAs.
> 
> So it would be best and safer for Xen to use CTR/CTR_EL0.DminLine.

This is insightful, thank you. Given that this patch is a backport
candidate, I would prefer to retain the same behavior we had before in
setup_cache. I can write a separate patch on top of this to make the
change to use CTR/CTR_EL0.DminLine. That way, we can make a separate
decision on each of them on whether we want to backport them (and
potentially revert them) or not. In other words: this patch as-is is
suboptimal but is of very little risk. Making changes to the way we
determine the cacheline size improves the patch but significantly
increases the risk factor associated with it.

Does it make sense?


> > +        and   r6, r6, #0x7
> > +        add   r6, r6, #4
> > +        mov   r7, #1
> > +        lsl   r6, r7, r6
> >           mov   r7, r3
> >     1:      mcr   CP32(r7, DCCMVAC)
> > diff --git a/xen/arch/arm/arm64/head.S b/xen/arch/arm/arm64/head.S
> > index fa0ef70..edea300 100644
> > --- a/xen/arch/arm/arm64/head.S
> > +++ b/xen/arch/arm/arm64/head.S
> > @@ -631,8 +631,14 @@ ENTRY(relocate_xen)
> >           dsb   sy        /* So the CPU issues all writes to the range */
> >             mov   x9, x3
> > -        ldr   x10, =cacheline_bytes /* x10 := step */
> > -        ldr   x10, [x10]
> > +
> > +        mov   x10, #0
> > +        msr   CSSELR_EL1, x10
> > +        mrs   x10, CSSELR_EL1
> > +        and   x10, x10, #0x7
> > +        add   x10, x10, #4
> > +        mov   x11, #1
> > +        lsl   x10, x11, x10
> 
> Please use dcache_line_size macro (see cache.S).

Similarly, I would prefer to retain the same old behavior here, and
fix it/improve it in a separate patch.


> >           mov   x11, x2
> >     1:      dc    cvac, x11
> > diff --git a/xen/arch/arm/setup.c b/xen/arch/arm/setup.c
> > index 032a6a8..4754c95 100644
> > --- a/xen/arch/arm/setup.c
> > +++ b/xen/arch/arm/setup.c
> > @@ -680,21 +680,6 @@ static void __init setup_mm(unsigned long dtb_paddr,
> > size_t dtb_size)
> >   }
> >   #endif
> >   -size_t __read_mostly cacheline_bytes;
> > -
> > -/* Very early check of the CPU cache properties */
> > -void __init setup_cache(void)
> > -{
> > -    uint32_t ccsid;
> > -
> > -    /* Read the cache size ID register for the level-0 data cache */
> > -    WRITE_SYSREG32(0, CSSELR_EL1);
> > -    ccsid = READ_SYSREG32(CCSIDR_EL1);
> > -
> > -    /* Low 3 bits are log2(cacheline size in words) - 2. */
> > -    cacheline_bytes = 1U << (4 + (ccsid & 0x7));
> > -}
> > -
> >   /* C entry point for boot CPU */
> >   void __init start_xen(unsigned long boot_phys_offset,
> >                         unsigned long fdt_paddr,
> > @@ -708,8 +693,6 @@ void __init start_xen(unsigned long boot_phys_offset,
> >       struct domain *dom0;
> >       struct xen_arch_domainconfig config;
> >   -    setup_cache();
> > -
> >       percpu_init_areas();
> >       set_processor_id(0); /* needed early, for smp_processor_id() */
> >   diff --git a/xen/include/asm-arm/page.h b/xen/include/asm-arm/page.h
> > index d948250..791e22e 100644
> > --- a/xen/include/asm-arm/page.h
> > +++ b/xen/include/asm-arm/page.h
> > @@ -133,8 +133,6 @@
> >     /* Architectural minimum cacheline size is 4 32-bit words. */
> >   #define MIN_CACHELINE_BYTES 16
> > -/* Actual cacheline size on the boot CPU. */
> > -extern size_t cacheline_bytes;
> >     #define copy_page(dp, sp) memcpy(dp, sp, PAGE_SIZE)
> >   @@ -142,9 +140,22 @@ extern size_t cacheline_bytes;
> >    * if 'range' is large enough we might want to use model-specific
> >    * full-cache flushes. */
> >   +static inline size_t read_cacheline_size(void)
> > +{
> > +    uint32_t ccsid;
> > +
> > +    /* Read the cache size ID register for the level-0 data cache */
> > +    WRITE_SYSREG32(0, CSSELR_EL1);
> > +    ccsid = READ_SYSREG32(CCSIDR_EL1);
> > +
> > +    /* Low 3 bits are log2(cacheline size in words) - 2. */
> > +    return (size_t) (1U << (4 + (ccsid & 0x7)));
> 
> See my remark above regarding the cacheline size.
> 
> > +}
> > +
> >   static inline int invalidate_dcache_va_range(const void *p, unsigned long
> > size)
> >   {
> >       const void *end = p + size;
> > +    size_t cacheline_bytes = read_cacheline_size();
> >       size_t cacheline_mask = cacheline_bytes - 1;
> >         dsb(sy);           /* So the CPU issues all writes to the range */
> > @@ -171,6 +182,7 @@ static inline int invalidate_dcache_va_range(const void
> > *p, unsigned long size)
> >     static inline int clean_dcache_va_range(const void *p, unsigned long
> > size)
> >   {
> > +    size_t cacheline_bytes = read_cacheline_size();
> >       const void *end = p + size;
> >       dsb(sy);           /* So the CPU issues all writes to the range */
> >       p = (void *)((uintptr_t)p & ~(cacheline_bytes - 1));
> > @@ -184,6 +196,7 @@ static inline int clean_dcache_va_range(const void *p,
> > unsigned long size)
> >   static inline int clean_and_invalidate_dcache_va_range
> >       (const void *p, unsigned long size)
> >   {
> > +    size_t cacheline_bytes = read_cacheline_size();
> >       const void *end = p + size;
> >       dsb(sy);         /* So the CPU issues all writes to the range */
> >       p = (void *)((uintptr_t)p & ~(cacheline_bytes - 1));
> > 
> 
> Cheers,
> 
> -- 
> Julien Grall
>
Julien Grall Feb. 20, 2018, 9:16 p.m. UTC | #9
Hi,

On 20/02/2018 21:03, Stefano Stabellini wrote:
> On Tue, 20 Feb 2018, Julien Grall wrote:
>> On 19/02/18 21:58, Stefano Stabellini wrote:
>>> +        mrc   CP32(r6, CSSELR_EL1)
>>
>> The size of the cache is read using CCSIDR_EL1. But it looks like the way we
>> get the cache line size in Xen is fragile.
>>
>> We are retrieving the cache line size of Level 1 and assume this will be valid
>> for all the other caches. Indeed cache maintenance ops may propagate to other
>> caches depending on the target (Point of Coherency vs Point of Unification).
>>
>> Looking at the ARM ARM "Cache hierarchy abstraction for address-based
>> operations" (D3-2061 DDI 0487C.a), CTR_EL0/CTR will holds the minimum line
>> lenght values for the data caches. The value will be the most efficient
>> address stride to use to apply a sequence of VA-based maintenance instructions
>> to a range of VAs.
>>
>> So it would be best and safer for Xen to use CTR/CTR_EL0.DminLine.
> 
> This is insightful, thank you. Given that this patch is a backport
> candidate, I would prefer to retain the same behavior we had before in
> setup_cache. I can write a separate patch on top of this to make the
> change to use CTR/CTR_EL0.DminLine. That way, we can make a separate
> decision on each of them on whether we want to backport them (and
> potentially revert them) or not. In other words: this patch as-is is
> suboptimal but is of very little risk. Making changes to the way we
> determine the cacheline size improves the patch but significantly
> increases the risk factor associated with it.
> 
> Does it make sense?

By this patch you mean big.LITTLE? If so, then I don't consider it as a 
potential backport. big.LITTLE has never been supported on Xen and hence 
should be considered as a new feature. What is backportable is the patch 
#1 that forbids big.LITTLE.

Regarding the cache line size, I didn't suggest the change because it is 
more efficient. I suggested the patch because the current code to find 
the cache line size is wrong. Imagine there is a cache in the hierarchy 
that has a smaller cache line than your L1 cache. Then you would not 
clean/invalidate correctly that cache.
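
For example (hypothetical line sizes; clean_dcache_line() is an
illustrative helper, not a real Xen function): if L1 reports 64-byte
lines but an outer cache uses 32-byte lines, cleaning by VA with the L1
stride skips half of the outer cache's lines:

    for ( p = start; p < end; p += 64 )  /* stride taken from L1 CCSIDR */
        clean_dcache_line(p);  /* outer lines at p + 32 are never touched */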

>>> +        and   r6, r6, #0x7
>>> +        add   r6, r6, #4
>>> +        mov   r7, #1
>>> +        lsl   r6, r7, r6
>>>            mov   r7, r3
>>>      1:      mcr   CP32(r7, DCCMVAC)
>>> diff --git a/xen/arch/arm/arm64/head.S b/xen/arch/arm/arm64/head.S
>>> index fa0ef70..edea300 100644
>>> --- a/xen/arch/arm/arm64/head.S
>>> +++ b/xen/arch/arm/arm64/head.S
>>> @@ -631,8 +631,14 @@ ENTRY(relocate_xen)
>>>            dsb   sy        /* So the CPU issues all writes to the range */
>>>              mov   x9, x3
>>> -        ldr   x10, =cacheline_bytes /* x10 := step */
>>> -        ldr   x10, [x10]
>>> +
>>> +        mov   x10, #0
>>> +        msr   CSSELR_EL1, x10
>>> +        mrs   x10, CSSELR_EL1
>>> +        and   x10, x10, #0x7
>>> +        add   x10, x10, #4
>>> +        mov   x11, #1
>>> +        lsl   x10, x11, x10
>>
>> Please use dcache_line_size macro (see cache.S).
> 
> Similarly, I would prefer to retain the same old behavior here, and
> fix it/improve it in a separate patch.

See above, you got the wrong end of the stick about the cache line size.

Cheers,
Julien Grall Feb. 20, 2018, 9:49 p.m. UTC | #10
On 20/02/2018 21:16, Julien Grall wrote:
> Hi,
> 
> On 20/02/2018 21:03, Stefano Stabellini wrote:
>> On Tue, 20 Feb 2018, Julien Grall wrote:
>>> On 19/02/18 21:58, Stefano Stabellini wrote:
>>>> +        mrc   CP32(r6, CSSELR_EL1)
>>>
>>> The size of the cache is read using CCSIDR_EL1. But it looks like the
>>> way we
>>> get the cache line size in Xen is fragile.
>>>
>>> We are retrieving the cache line size of Level 1 and assume this will 
>>> be valid
>>> for all the other caches. Indeed cache maintenance ops may propagate 
>>> to other
>>> caches depending on the target (Point of Coherency vs Point of
>>> Unification).
>>>
>>> Looking at the ARM ARM "Cache hierarchy abstraction for address-based
>>> operations" (D3-2061 DDI 0487C.a), CTR_EL0/CTR will holds the minimum 
>>> line
>>> lenght values for the data caches. The value will be the most efficient
>>> address stride to use to apply a sequence of VA-based maintenance 
>>> instructions
>>> to a range of VAs.
>>>
>>> So it would be best and safer for Xen to use CTR/CTR_EL0.DminLine.
>>
>> This is insightful, thank you. Given that this patch is a backport
>> candidate, I would prefer to retain the same behavior we had before in
>> setup_cache. I can write a separate patch on top of this to make the
>> change to use CTR/CTR_EL0.DminLine. That way, we can make a separate
>> decision on each of them on whether we want to backport them (and
>> potentially revert them) or not. In other words: this patch as-is is
>> suboptimal but is of very little risk. Making changes to the way we
>> determine the cacheline size improves the patch but significantly
>> increases the risk factor associated with it.
>>
>> Does it make sense?
> 
> By this patch you mean big.LITTLE? If so, then I don't consider it as a 
> potential backport. big.LITTLE has never been supported on Xen and hence 
> should be considered as a new feature. What is backportable is the patch 
> #1 that forbids big.LITTLE.
> 
> Regarding the cache line size, I didn't suggest the change because it is 
> more efficient. I suggested the patch because the current code to find 
> the cache line size is wrong. Imagine there is a cache in the hierarchy 
> that has a smaller cache line than your L1 cache. Then you would not 
> clean/invalidate correctly that cache.
> 
>>>> +        and   r6, r6, #0x7
>>>> +        add   r6, r6, #4
>>>> +        mov   r7, #1
>>>> +        lsl   r6, r7, r6
>>>>            mov   r7, r3
>>>>      1:      mcr   CP32(r7, DCCMVAC)
>>>> diff --git a/xen/arch/arm/arm64/head.S b/xen/arch/arm/arm64/head.S
>>>> index fa0ef70..edea300 100644
>>>> --- a/xen/arch/arm/arm64/head.S
>>>> +++ b/xen/arch/arm/arm64/head.S
>>>> @@ -631,8 +631,14 @@ ENTRY(relocate_xen)
>>>>            dsb   sy        /* So the CPU issues all writes to the 
>>>> range */
>>>>              mov   x9, x3
>>>> -        ldr   x10, =cacheline_bytes /* x10 := step */
>>>> -        ldr   x10, [x10]
>>>> +
>>>> +        mov   x10, #0
>>>> +        msr   CSSELR_EL1, x10
>>>> +        mrs   x10, CSSELR_EL1
>>>> +        and   x10, x10, #0x7
>>>> +        add   x10, x10, #4
>>>> +        mov   x11, #1
>>>> +        lsl   x10, x11, x10
>>>
>>> Please use dcache_line_size macro (see cache.S).
>>
>> Similarly, I would prefer to retain the same old behavior here, and
>> fix it/improve it in a separate patch.
> 
> See above, you got the wrong end of the stick about the cache line size.

You might want to look at the following patch in Linux:

commit f91e2c3bd427239c198351f44814dd39db91afe0
Author: Catalin Marinas <catalin.marinas@arm.com>
Date:   Tue Dec 7 16:52:04 2010 +0100

    ARM: 6527/1: Use CTR instead of CCSIDR for the D-cache line size on ARMv7

    The current implementation of the dcache_line_size macro reads the L1
    cache size from the CCSIDR register. This, however, is not guaranteed to
    be the smallest cache line in the cache hierarchy. The patch changes to
    the macro to use the more architecturally correct CTR register.

    Reported-by: Kevin Sapp <ksapp@quicinc.com>
    Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
    Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

Cheers,
Stefano Stabellini Feb. 20, 2018, 11:28 p.m. UTC | #11
On Tue, 20 Feb 2018, Julien Grall wrote:
> On 20/02/2018 21:16, Julien Grall wrote:
> > Hi,
> > 
> > On 20/02/2018 21:03, Stefano Stabellini wrote:
> >> On Tue, 20 Feb 2018, Julien Grall wrote:
> >>> On 19/02/18 21:58, Stefano Stabellini wrote:
> >>>> +        mrc   CP32(r6, CSSELR_EL1)
> >>>
> >>> The size of the cache is read using CCSIDR_EL1. But it looks like the
> >>> way we get the cache line size in Xen is fragile.
> >>>
> >>> We are retrieving the cache line size of Level 1 and assume this will
> >>> be valid for all the other caches. Indeed cache maintenance ops may
> >>> propagate to other caches depending on the target (Point of Coherency
> >>> vs Point of Unification).
> >>>
> >>> Looking at the ARM ARM "Cache hierarchy abstraction for address-based
> >>> operations" (D3-2061 DDI 0487C.a), CTR_EL0/CTR will hold the minimum
> >>> line length values for the data caches. The value will be the most
> >>> efficient address stride to use to apply a sequence of VA-based
> >>> maintenance instructions to a range of VAs.
> >>>
> >>> So it would be best and safer for Xen to use CTR/CTR_EL0.DminLine.
> >>
> >> This is insightful, thank you. Given that this patch is a backport
> >> candidate, I would prefer to retain the same behavior we had before in
> >> setup_cache. I can write a separate patch on top of this to make the
> >> change to use CTR/CTR_EL0.DminLine. That way, we can make a separate
> >> decision on each of them on whether we want to backport them (and
> >> potentially revert them) or not. In other words: this patch as-is is
> >> suboptimal but is of very little risk. Making changes to the way we
> >> determine the cacheline size improves the patch but significantly
> >> increases the risk factor associated with it.
> >>
> >> Does it make sense?
> > 
> > By this patch you mean big.LITTLE? If so, then I don't consider it as a
> > potential backport. big.LITTLE has never been supported on Xen and hence
> > should be considered as a new feature. What is backportable is the patch
> > #1 that forbids big.LITTLE.

Patch #1 ends up forcing people to use big cores only on many platforms,
which from what you wrote can be unsafe. I suggest we backport the whole
series, so that at least users can configure the system to use LITTLE
cores only, or a mix of the two. The big.LITTLE doc in particular is
certainly worth backporting but only makes sense with the rest of the
series.

On support statements: I noticed that big.LITTLE is actually missing from
SUPPORT.md.


> > Regarding the cache line size, I didn't suggest the change because it is
> > more efficient. I suggested the patch because the current code to find
> > the cache line size is wrong. Imagine there is a cache in the hierarchy
> > that has a smaller cache line than your L1 cache. Then you would not
> > clean/invalidate correctly that cache.


I didn't mean to imply that what you are suggesting is not important, or
less important than the purpose of this patch. I just meant to say that this
patch is about removing the cacheline_bytes variable; it is not about
fixing the way we read the cacheline size. I like to keep one patch
doing one thing only.

The fix you are suggesting is important; in fact, it is probably more
important than this series. I am OK writing a patch for it. It is just
that it is a separate issue, and should be fixed separately.

I'll have a look at it and propose it as a separate patch.


> >>>> +        and   r6, r6, #0x7
> >>>> +        add   r6, r6, #4
> >>>> +        mov   r7, #1
> >>>> +        lsl   r6, r7, r6
> >>>>            mov   r7, r3
> >>>>      1:      mcr   CP32(r7, DCCMVAC)
> >>>> diff --git a/xen/arch/arm/arm64/head.S b/xen/arch/arm/arm64/head.S
> >>>> index fa0ef70..edea300 100644
> >>>> --- a/xen/arch/arm/arm64/head.S
> >>>> +++ b/xen/arch/arm/arm64/head.S
> >>>> @@ -631,8 +631,14 @@ ENTRY(relocate_xen)
> >>>>            dsb   sy        /* So the CPU issues all writes to the range */
> >>>>              mov   x9, x3
> >>>> -        ldr   x10, =cacheline_bytes /* x10 := step */
> >>>> -        ldr   x10, [x10]
> >>>> +
> >>>> +        mov   x10, #0
> >>>> +        msr   CSSELR_EL1, x10
> >>>> +        mrs   x10, CSSELR_EL1
> >>>> +        and   x10, x10, #0x7
> >>>> +        add   x10, x10, #4
> >>>> +        mov   x11, #1
> >>>> +        lsl   x10, x11, x10
> >>>
> >>> Please use dcache_line_size macro (see cache.S).
> >>
> >> Similarly, I would prefer to retain the same old behavior here, and
> >> fix it/improve it in a separate patch.
> > 
> > See above, you got the wrong end of the stick about the cache line size.
> 
> You might want to look at the following patch in Linux:
> 
> commit f91e2c3bd427239c198351f44814dd39db91afe0
> Author: Catalin Marinas <catalin.marinas@arm.com>
> Date:   Tue Dec 7 16:52:04 2010 +0100
> 
>     ARM: 6527/1: Use CTR instead of CCSIDR for the D-cache line size on ARMv7
> 
>     The current implementation of the dcache_line_size macro reads the L1
>     cache size from the CCSIDR register. This, however, is not guaranteed to
>     be the smallest cache line in the cache hierarchy. The patch changes to
>     the macro to use the more architecturally correct CTR register.
> 
>     Reported-by: Kevin Sapp <ksapp@quicinc.com>
>     Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
>     Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>


Thank you for the pointer, I'll give it a look.
Julien Grall Feb. 21, 2018, 7:58 a.m. UTC | #12
On 20/02/2018 23:28, Stefano Stabellini wrote:
> On Tue, 20 Feb 2018, Julien Grall wrote:
>> On 20/02/2018 21:16, Julien Grall wrote:
>>> Hi,
>>>
>>> On 20/02/2018 21:03, Stefano Stabellini wrote:
>>>> On Tue, 20 Feb 2018, Julien Grall wrote:
>>>>> On 19/02/18 21:58, Stefano Stabellini wrote:
>>>>>> +        mrc   CP32(r6, CSSELR_EL1)
>>>>>
>>>>> The size of the cache is read using CCSIDR_EL1. But it looks like the
>>>>> way we
>>>>> get the cache line size in Xen is fragile.
>>>>>
>>>>> We are retrieving the cache line size of Level 1 and assume this will
>>>>> be valid
>>>>> for all the other caches. Indeed cache maintenance ops may propagate
>>>>> to other
>>>>> caches depending on the target (Point of Coherency vs Point of
>>>>> Unification).
>>>>>
>>>>> Looking at the ARM ARM "Cache hierarchy abstraction for address-based
>>>>> operations" (D3-2061 DDI 0487C.a), CTR_EL0/CTR will holds the minimum
>>>>> line
>>>>> lenght values for the data caches. The value will be the most efficient
>>>>> address stride to use to apply a sequence of VA-based maintenance
>>>>> instructions
>>>>> to a range of VAs.
>>>>>
>>>>> So it would be best and safer for Xen to use CTR/CTR_EL0.DminLine.
>>>>
>>>> This is insightful, thank you. Given that this patch is a backport
>>>> candidate, I would prefer to retain the same behavior we had before in
>>>> setup_cache. I can write a separate patch on top of this to make the
>>>> change to use CTR/CTR_EL0.DminLine. That way, we can make a separate
>>>> decision on each of them on whether we want to backport them (and
>>>> potentially revert them) or not. In other words: this patch as-is is
>>>> suboptimal but is of very little risk. Making changes to the way we
>>>> determine the cacheline size improves the patch but significantly
>>>> increases the risk factor associated with it.
>>>>
>>>> Does it make sense?
>>>
>>> By this patch you mean big.LITTLE? If so, then I don't consider it as a
>>> potential backport. big.LITTLE has never been supported on Xen and hence
>>> should be considered as a new feature. What is backportable is the patch
>>> #1 that forbids big.LITTLE.
> 
> Patch #1 ends up forcing people to use big cores only on many platforms,
> which from what you wrote can be unsafe. I suggest we backport the whole
> series, so that at least users can configure the system to use LITTLE
> cores only, or a mix of the two. The big.LITTLE doc in particular is
> certainly worth backporting but only makes sense with the rest of the
> series.

While patch #1 will restrict the number of CPUs, the others will
noticeably change the interface exposed to the guest. Now on big.LITTLE
cores, you expose a different MIDR, and potentially a different ACTLR.
This might not be a big deal, but we don't want to take the chance of
breaking existing setups.

Furthermore, this series is based on the assumption that all the cores 
have the same features. I already identified a few places in Xen, and you
fixed them in this series. But there are probably more (see all the usage of
boot_cpu and processor_id()).

I am OK with seeing this series in staging because it is a step towards
big.LITTLE in Xen. But I think it is best not to backport this series at
all and to keep the current situation on Xen 4.*.

> 
> On support statements: I noticed that big.LITTLE is actually missing from
> SUPPORT.md.
>>> Regarding the cache line size, I didn't suggest the change because it is
>>> more efficient. I suggested the patch because the current code to find
>>> the cache line size is wrong. Imagine there is a cache in the hierarchy
>>> that has a smaller cache line than your L1 cache. Then you would not
>>> clean/invalidate correctly that cache.
> 
> I didn't mean to imply that what you are suggesting is not important, or
> less important than the purpose of this patch. I just meant to say that this
> patch is about removing the cacheline_bytes variable; it is not about
> fixing the way we read the cacheline size. I like to keep one patch
> doing one thing only.

I wasn't asking to change how we get the cacheline size in this patch.
But I would rather fix the misery before it spreads a bit more. Moreover,
it is quite weird to have a dcache_line_size macro and not use it on
Arm64.

> 
> The fix you are suggesting is important; in fact, it is probably more
> important than this series. I am OK writing a patch for it. It is just
> that it is a separate issue, and should be fixed separately.
> 
> I'll have a look at it and propose it as a separate patch.
> 
> 
>>>>>> +        and   r6, r6, #0x7
>>>>>> +        add   r6, r6, #4
>>>>>> +        mov   r7, #1
>>>>>> +        lsl   r6, r7, r6
>>>>>>             mov   r7, r3
>>>>>>       1:      mcr   CP32(r7, DCCMVAC)
>>>>>> diff --git a/xen/arch/arm/arm64/head.S b/xen/arch/arm/arm64/head.S
>>>>>> index fa0ef70..edea300 100644
>>>>>> --- a/xen/arch/arm/arm64/head.S
>>>>>> +++ b/xen/arch/arm/arm64/head.S
>>>>>> @@ -631,8 +631,14 @@ ENTRY(relocate_xen)
>>>>>>             dsb   sy        /* So the CPU issues all writes to the
>>>>>> range */
>>>>>>               mov   x9, x3
>>>>>> -        ldr   x10, =cacheline_bytes /* x10 := step */
>>>>>> -        ldr   x10, [x10]
>>>>>> +
>>>>>> +        mov   x10, #0
>>>>>> +        msr   CSSELR_EL1, x10
>>>>>> +        mrs   x10, CSSELR_EL1
>>>>>> +        and   x10, x10, #0x7
>>>>>> +        add   x10, x10, #4
>>>>>> +        mov   x11, #1
>>>>>> +        lsl   x10, x11, x10
>>>>>
>>>>> Please use dcache_line_size macro (see cache.S).
>>>>
>>>> Similarly, I would prefer to retain the same old behavior here, and
>>>> fix it/improve it in a separate patch.
>>>
>>> See above, you got the wrong end of the stick about the cache line size.
>>
>> You might want to look at the following patch in Linux:
>>
>> commit f91e2c3bd427239c198351f44814dd39db91afe0
>> Author: Catalin Marinas <catalin.marinas@arm.com>
>> Date:   Tue Dec 7 16:52:04 2010 +0100
>>
>>      ARM: 6527/1: Use CTR instead of CCSIDR for the D-cache line size on ARMv7
>>
>>      The current implementation of the dcache_line_size macro reads the L1
>>      cache size from the CCSIDR register. This, however, is not guaranteed to
>>      be the smallest cache line in the cache hierarchy. The patch changes to
>>      the macro to use the more architecturally correct CTR register.
>>
>>      Reported-by: Kevin Sapp <ksapp@quicinc.com>
>>      Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
>>      Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
> 
> Thank you for the pointer, I'll give it a look.
>

Patch

diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
index 8317639..2184cb9 100644
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -1000,6 +1000,16 @@  supported only when compiled with XSM on x86.
 
 Control Xens use of the APEI Hardware Error Source Table, should one be found.
 
+### hmp-unsafe (arm)
+> `= <boolean>`
+
+> Default : `false`
+
+Say yes at your own risk if you want to enable heterogenous computing
+(such as big.LITTLE). This may result to an unstable and insecure
+platform. When the option is disabled (default), CPUs that are not
+identical to the boot CPU will be parked and not used by Xen.
+
 ### hpetbroadcast
 > `= <boolean>`
 
diff --git a/xen/arch/arm/smpboot.c b/xen/arch/arm/smpboot.c
index 1255185..7ea4e41 100644
--- a/xen/arch/arm/smpboot.c
+++ b/xen/arch/arm/smpboot.c
@@ -27,6 +27,7 @@ 
 #include <xen/smp.h>
 #include <xen/softirq.h>
 #include <xen/timer.h>
+#include <xen/warning.h>
 #include <xen/irq.h>
 #include <xen/console.h>
 #include <asm/cpuerrata.h>
@@ -69,6 +70,13 @@  DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_mask);
 /* representing HT and core siblings of each logical CPU */
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_mask);
 
+/*
+ * By default non-boot CPUs not identical to the boot CPU will be
+ * parked.
+ */
+static bool __read_mostly opt_hmp_unsafe = false;
+boolean_param("hmp-unsafe", opt_hmp_unsafe);
+
 static void setup_cpu_sibling_map(int cpu)
 {
     if ( !zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) ||
@@ -255,6 +263,9 @@  void __init smp_init_cpus(void)
     else
         acpi_smp_init_cpus();
 
+    if ( opt_hmp_unsafe )
+        warning_add("WARNING: HMP COMPUTING HAS BEEN ENABLED.\n"
+                    "It has implications on the security and stability of the system.\n");
 }
 
 int __init
@@ -292,6 +303,21 @@  void start_secondary(unsigned long boot_phys_offset,
 
     init_traps();
 
+    /*
+     * Currently Xen assumes the platform has only one kind of CPUs.
+     * This assumption does not hold on big.LITTLE platform and may
+     * result to instability and insecure platform. Better to park them
+     * for now.
+     */
+    if ( !opt_hmp_unsafe &&
+         current_cpu_data.midr.bits != boot_cpu_data.midr.bits )
+    {
+        printk(XENLOG_ERR "CPU%u MIDR (0x%x) does not match boot CPU MIDR (0x%x).\n",
+               smp_processor_id(), current_cpu_data.midr.bits,
+               boot_cpu_data.midr.bits);
+        stop_cpu();
+    }
+
     mmu_init_secondary_cpu();
 
     gic_init_secondary_cpu();
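
(Usage note, added for illustration; exact bootloader syntax varies: the
option is enabled by appending "hmp-unsafe=true" to Xen's command line,
after which secondary CPUs with a MIDR different from the boot CPU are
brought up instead of parked.)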