diff mbox

arm64: mm: take CWG into account in __inval_cache_range()

Message ID 20160419141321.GD8482@e104818-lin.cambridge.arm.com
State New
Headers show

Commit Message

Catalin Marinas April 19, 2016, 2:13 p.m. UTC
On Tue, Apr 19, 2016 at 03:08:27PM +0200, Ard Biesheuvel wrote:
> On 19 April 2016 at 14:56, Mark Rutland <mark.rutland@arm.com> wrote:

> > Is sharing a CWG broken by design, or is there some caveat I'm missing

> > that prevents/prohibits unrelated data from being dirtied?

> 

> I think sharing a CWG window is broken by design, now that I think of

> it. The invalidate is part of the dma_unmap() code path, which means

> the cleaning we do on the edges of the buffer may clobber data in

> memory written by the device, and not cleaning isn't an option either.


Indeed. It only works if the cache line is always clean (e.g. read or
speculatively loaded) but you can't guarantee what's accessing it. I
think we shouldn't even bother with the edges at all in __dma_inv_range.

The best we could do is to warn if ARCH_DMA_MINALIGN is smaller than CWG
(as Robin suggested, we could do this only if we have non-coherent DMA
masters via arch_setup_dma_ops()). Quick hack below:

-------------------------------8<-----------------------------------
-------------------------------8<-----------------------------------

This way we could also reduce L1_CACHE_SHIFT back to 6 as it shouldn't
cause any correctness issues (potentially performance but so far it
seems that increasing it made things worse on some SoCs).

The reason for cache_line_size() using ARCH_DMA_MINALIGN instead of
L1_CACHE_BYTES is that I can see it used in the sl*b code when
SLAB_HWCACHE_ALIGN is passed and on few occasions for DMA buffers.

-- 
Catalin

Comments

Ard Biesheuvel April 19, 2016, 2:48 p.m. UTC | #1
On 19 April 2016 at 16:13, Catalin Marinas <catalin.marinas@arm.com> wrote:
> On Tue, Apr 19, 2016 at 03:08:27PM +0200, Ard Biesheuvel wrote:

>> On 19 April 2016 at 14:56, Mark Rutland <mark.rutland@arm.com> wrote:

>> > Is sharing a CWG broken by design, or is there some caveat I'm missing

>> > that prevents/prohibits unrelated data from being dirtied?

>>

>> I think sharing a CWG window is broken by design, now that I think of

>> it. The invalidate is part of the dma_unmap() code path, which means

>> the cleaning we do on the edges of the buffer may clobber data in

>> memory written by the device, and not cleaning isn't an option either.

>

> Indeed. It only works if the cache line is always clean (e.g. read or

> speculatively loaded) but you can't guarantee what's accessing it. I

> think we shouldn't even bother with the edges at all in __dma_inv_range.

>


Agreed.

> The best we could do is to warn if ARCH_DMA_MINALIGN is smaller than CWG

> (as Robin suggested, we could do this only if we have non-coherent DMA

> masters via arch_setup_dma_ops()). Quick hack below:

>

> -------------------------------8<-----------------------------------

> diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h

> index 5082b30bc2c0..5967fcbb617a 100644

> --- a/arch/arm64/include/asm/cache.h

> +++ b/arch/arm64/include/asm/cache.h

> @@ -28,7 +28,7 @@

>   * cache before the transfer is done, causing old data to be seen by

>   * the CPU.

>   */

> -#define ARCH_DMA_MINALIGN      L1_CACHE_BYTES

> +#define ARCH_DMA_MINALIGN      128

>

>  #ifndef __ASSEMBLY__

>

> @@ -37,7 +37,7 @@

>  static inline int cache_line_size(void)

>  {

>         u32 cwg = cache_type_cwg();

> -       return cwg ? 4 << cwg : L1_CACHE_BYTES;

> +       return cwg ? 4 << cwg : ARCH_DMA_MINALIGN;


Unrelated, but this does not look right: if the CWG field is zero, we
should either assume 2 KB, or iterate over all the CCSIDR values and
take the maximum linesize.

>  }

>

>  #endif /* __ASSEMBLY__ */

> diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c

> index 943f5140e0f3..b1d4d1f5b0dd 100644

> --- a/arch/arm64/kernel/cpufeature.c

> +++ b/arch/arm64/kernel/cpufeature.c

> @@ -970,7 +970,6 @@ static void __init setup_feature_capabilities(void)

>  void __init setup_cpu_features(void)

>  {

>         u32 cwg;

> -       int cls;

>

>         /* Set the CPU feature capabilies */

>         setup_feature_capabilities();

> @@ -983,13 +982,9 @@ void __init setup_cpu_features(void)

>          * Check for sane CTR_EL0.CWG value.

>          */

>         cwg = cache_type_cwg();

> -       cls = cache_line_size();

>         if (!cwg)

> -               pr_warn("No Cache Writeback Granule information, assuming cache line size %d\n",

> -                       cls);

> -       if (L1_CACHE_BYTES < cls)

> -               pr_warn("L1_CACHE_BYTES smaller than the Cache Writeback Granule (%d < %d)\n",

> -                       L1_CACHE_BYTES, cls);

> +               pr_warn("No Cache Writeback Granule information, assuming %d\n",

> +                       ARCH_DMA_MINALIGN);

>  }

>

>  static bool __maybe_unused

> diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c

> index a6e757cbab77..08232faf38ad 100644

> --- a/arch/arm64/mm/dma-mapping.c

> +++ b/arch/arm64/mm/dma-mapping.c

> @@ -987,6 +987,11 @@ static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,

>  void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,

>                         struct iommu_ops *iommu, bool coherent)

>  {

> +       WARN_TAINT_ONCE(!coherent && ARCH_DMA_MINALIGN < cache_line_size(),

> +                       TAINT_CPU_OUT_OF_SPEC,

> +                       "ARCH_DMA_MINALIGN smaller than the Cache Writeback Granule (%d < %d)",

> +                       ARCH_DMA_MINALIGN, cache_line_size());

> +

>         if (!dev->archdata.dma_ops)

>                 dev->archdata.dma_ops = &swiotlb_dma_ops;

>

> -------------------------------8<-----------------------------------

>

> This way we could also reduce L1_CACHE_SHIFT back to 6 as it shouldn't

> cause any correctness issues (potentially performance but so far it

> seems that increasing it made things worse on some SoCs).

>

> The reason for cache_line_size() using ARCH_DMA_MINALIGN instead of

> L1_CACHE_BYTES is that I can see it used in the sl*b code when

> SLAB_HWCACHE_ALIGN is passed and on few occasions for DMA buffers.

>

> --

> Catalin
Catalin Marinas April 19, 2016, 3:32 p.m. UTC | #2
On Tue, Apr 19, 2016 at 04:48:32PM +0200, Ard Biesheuvel wrote:
> On 19 April 2016 at 16:13, Catalin Marinas <catalin.marinas@arm.com> wrote:

> > The best we could do is to warn if ARCH_DMA_MINALIGN is smaller than CWG

> > (as Robin suggested, we could do this only if we have non-coherent DMA

> > masters via arch_setup_dma_ops()). Quick hack below:

> >

> > -------------------------------8<-----------------------------------

> > diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h

> > index 5082b30bc2c0..5967fcbb617a 100644

> > --- a/arch/arm64/include/asm/cache.h

> > +++ b/arch/arm64/include/asm/cache.h

> > @@ -28,7 +28,7 @@

> >   * cache before the transfer is done, causing old data to be seen by

> >   * the CPU.

> >   */

> > -#define ARCH_DMA_MINALIGN      L1_CACHE_BYTES

> > +#define ARCH_DMA_MINALIGN      128

> >

> >  #ifndef __ASSEMBLY__

> >

> > @@ -37,7 +37,7 @@

> >  static inline int cache_line_size(void)

> >  {

> >         u32 cwg = cache_type_cwg();

> > -       return cwg ? 4 << cwg : L1_CACHE_BYTES;

> > +       return cwg ? 4 << cwg : ARCH_DMA_MINALIGN;

> 

> Unrelated, but this does not look right: if the CWG field is zero, we

> should either assume 2 KB, or iterate over all the CCSIDR values and

> take the maximum linesize.


It may be a better guess but even that is not always relevant since
CCSIDR may not present the real hardware information. It's only meant to
give enough information to be able to do cache maintenance by set/way
and we've seen CPU implementations where this has nothing to do with the
actual cache geometry.

So I don't think we can do anything more than just hard-coding and hope
that implementations where CWG is 0 (or higher than 128) are only
deployed in a fully coherent configuration.

-- 
Catalin
Ard Biesheuvel April 19, 2016, 3:38 p.m. UTC | #3
On 19 April 2016 at 17:32, Catalin Marinas <catalin.marinas@arm.com> wrote:
> On Tue, Apr 19, 2016 at 04:48:32PM +0200, Ard Biesheuvel wrote:

>> On 19 April 2016 at 16:13, Catalin Marinas <catalin.marinas@arm.com> wrote:

>> > The best we could do is to warn if ARCH_DMA_MINALIGN is smaller than CWG

>> > (as Robin suggested, we could do this only if we have non-coherent DMA

>> > masters via arch_setup_dma_ops()). Quick hack below:

>> >

>> > -------------------------------8<-----------------------------------

>> > diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h

>> > index 5082b30bc2c0..5967fcbb617a 100644

>> > --- a/arch/arm64/include/asm/cache.h

>> > +++ b/arch/arm64/include/asm/cache.h

>> > @@ -28,7 +28,7 @@

>> >   * cache before the transfer is done, causing old data to be seen by

>> >   * the CPU.

>> >   */

>> > -#define ARCH_DMA_MINALIGN      L1_CACHE_BYTES

>> > +#define ARCH_DMA_MINALIGN      128

>> >

>> >  #ifndef __ASSEMBLY__

>> >

>> > @@ -37,7 +37,7 @@

>> >  static inline int cache_line_size(void)

>> >  {

>> >         u32 cwg = cache_type_cwg();

>> > -       return cwg ? 4 << cwg : L1_CACHE_BYTES;

>> > +       return cwg ? 4 << cwg : ARCH_DMA_MINALIGN;

>>

>> Unrelated, but this does not look right: if the CWG field is zero, we

>> should either assume 2 KB, or iterate over all the CCSIDR values and

>> take the maximum linesize.

>

> It may be a better guess but even that is not always relevant since

> CCSIDR may not present the real hardware information. It's only meant to

> give enough information to be able to do cache maintenance by set/way

> and we've seen CPU implementations where this has nothing to do with the

> actual cache geometry.

>


I am aware of that discussion, but that was about inferring aliasing
properties from the way size, which combines the linesize and the
number of sets/ways, and the latter are apparently set to 1/1 in some
cases so that any set/way operation simply affects the entire cache.

However, the CCSIDR linesize field itself is mentioned in the
description of CWG in the ARM ARM, as a suitable source of obtaining
the maximum linesize in the system.

> So I don't think we can do anything more than just hard-coding and hope

> that implementations where CWG is 0 (or higher than 128) are only

> deployed in a fully coherent configuration.

>
Catalin Marinas April 19, 2016, 4:09 p.m. UTC | #4
On Tue, Apr 19, 2016 at 05:38:56PM +0200, Ard Biesheuvel wrote:
> On 19 April 2016 at 17:32, Catalin Marinas <catalin.marinas@arm.com> wrote:

> > On Tue, Apr 19, 2016 at 04:48:32PM +0200, Ard Biesheuvel wrote:

> >> On 19 April 2016 at 16:13, Catalin Marinas <catalin.marinas@arm.com> wrote:

> >> > The best we could do is to warn if ARCH_DMA_MINALIGN is smaller than CWG

> >> > (as Robin suggested, we could do this only if we have non-coherent DMA

> >> > masters via arch_setup_dma_ops()). Quick hack below:

> >> >

> >> > -------------------------------8<-----------------------------------

> >> > diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h

> >> > index 5082b30bc2c0..5967fcbb617a 100644

> >> > --- a/arch/arm64/include/asm/cache.h

> >> > +++ b/arch/arm64/include/asm/cache.h

> >> > @@ -28,7 +28,7 @@

> >> >   * cache before the transfer is done, causing old data to be seen by

> >> >   * the CPU.

> >> >   */

> >> > -#define ARCH_DMA_MINALIGN      L1_CACHE_BYTES

> >> > +#define ARCH_DMA_MINALIGN      128

> >> >

> >> >  #ifndef __ASSEMBLY__

> >> >

> >> > @@ -37,7 +37,7 @@

> >> >  static inline int cache_line_size(void)

> >> >  {

> >> >         u32 cwg = cache_type_cwg();

> >> > -       return cwg ? 4 << cwg : L1_CACHE_BYTES;

> >> > +       return cwg ? 4 << cwg : ARCH_DMA_MINALIGN;

> >>

> >> Unrelated, but this does not look right: if the CWG field is zero, we

> >> should either assume 2 KB, or iterate over all the CCSIDR values and

> >> take the maximum linesize.

> >

> > It may be a better guess but even that is not always relevant since

> > CCSIDR may not present the real hardware information. It's only meant to

> > give enough information to be able to do cache maintenance by set/way

> > and we've seen CPU implementations where this has nothing to do with the

> > actual cache geometry.

> 

> I am aware of that discussion, but that was about inferring aliasing

> properties from the way size, which combines the linesize and the

> number of sets/ways, and the latter are apparently set to 1/1 in some

> cases so that any set/way operation simply affects the entire cache.

> 

> However, the CCSIDR linesize field itself is mentioned in the

> description of CWG in the ARM ARM, as a suitable source of obtaining

> the maximum linesize in the system.


The ARM ARM confuses me. While CTR_EL0.CWG indeed talks about CCSIDR as
a fall back, the latter has a note:

  The parameters NumSets, Associativity, and LineSize in these registers
  define the architecturally visible parameters that are required for
  the cache maintenance by Set/Way instructions. They are not guaranteed
  to represent the actual microarchitectural features of a design. You
  cannot make any inference about the actual sizes of caches based on
  these parameters.

So it's not clear to me whether LineSize would give us the information
we need for cache maintenance by VA. But I'm not opposed to using this
on a CPU that doesn't have CWG (once we see one).

-- 
Catalin
diff mbox

Patch

diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
index 5082b30bc2c0..5967fcbb617a 100644
--- a/arch/arm64/include/asm/cache.h
+++ b/arch/arm64/include/asm/cache.h
@@ -28,7 +28,7 @@ 
  * cache before the transfer is done, causing old data to be seen by
  * the CPU.
  */
-#define ARCH_DMA_MINALIGN	L1_CACHE_BYTES
+#define ARCH_DMA_MINALIGN	128
 
 #ifndef __ASSEMBLY__
 
@@ -37,7 +37,7 @@ 
 static inline int cache_line_size(void)
 {
 	u32 cwg = cache_type_cwg();
-	return cwg ? 4 << cwg : L1_CACHE_BYTES;
+	return cwg ? 4 << cwg : ARCH_DMA_MINALIGN;
 }
 
 #endif	/* __ASSEMBLY__ */
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 943f5140e0f3..b1d4d1f5b0dd 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -970,7 +970,6 @@  static void __init setup_feature_capabilities(void)
 void __init setup_cpu_features(void)
 {
 	u32 cwg;
-	int cls;
 
 	/* Set the CPU feature capabilies */
 	setup_feature_capabilities();
@@ -983,13 +982,9 @@  void __init setup_cpu_features(void)
 	 * Check for sane CTR_EL0.CWG value.
 	 */
 	cwg = cache_type_cwg();
-	cls = cache_line_size();
 	if (!cwg)
-		pr_warn("No Cache Writeback Granule information, assuming cache line size %d\n",
-			cls);
-	if (L1_CACHE_BYTES < cls)
-		pr_warn("L1_CACHE_BYTES smaller than the Cache Writeback Granule (%d < %d)\n",
-			L1_CACHE_BYTES, cls);
+		pr_warn("No Cache Writeback Granule information, assuming %d\n",
+			ARCH_DMA_MINALIGN);
 }
 
 static bool __maybe_unused
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index a6e757cbab77..08232faf38ad 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -987,6 +987,11 @@  static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 			struct iommu_ops *iommu, bool coherent)
 {
+	WARN_TAINT_ONCE(!coherent && ARCH_DMA_MINALIGN < cache_line_size(),
+			TAINT_CPU_OUT_OF_SPEC,
+			"ARCH_DMA_MINALIGN smaller than the Cache Writeback Granule (%d < %d)",
+			ARCH_DMA_MINALIGN, cache_line_size());
+
 	if (!dev->archdata.dma_ops)
 		dev->archdata.dma_ops = &swiotlb_dma_ops;