
[v3,20/36] mach-snapdragon: carve out no-map regions

Message ID: 20240130-b4-qcom-common-target-v3-20-e523cbf9e556@linaro.org
State: New
Series: Qualcomm generic board support

Commit Message

Caleb Connolly Jan. 30, 2024, 2:05 p.m. UTC
On Qualcomm platforms, the TZ may already have certain memory regions
under protection by the time U-Boot starts. There is a rare case on some
platforms where the prefetcher might speculatively access one of these
regions resulting in a board crash (TZ traps and then resets the board).

We shouldn't be accessing these regions from within U-Boot anyway, so
let's mark them all with PTE_TYPE_FAULT to prevent any speculative
access and correctly trap in EL1 rather than EL3.

This is quite costly with caches off (takes ~2 seconds on SDM845 vs 35ms
with caches on). So to minimise the impact this is only enabled on
QCS404 for now (where the issue is known to occur).

In the future, we should try to find a more efficient way to handle
this, perhaps by turning on the MMU in stages.

Signed-off-by: Caleb Connolly <caleb.connolly@linaro.org>
---
 abort                            |  20 ++++++
 arch/arm/mach-snapdragon/board.c | 151 ++++++++++++++++++++++++++++++++-------
 2 files changed, 147 insertions(+), 24 deletions(-)

Comments

Caleb Connolly Jan. 30, 2024, 2:09 p.m. UTC | #1
On 30/01/2024 14:05, Caleb Connolly wrote:
> On Qualcomm platforms, the TZ may already have certain memory regions
> under protection by the time U-Boot starts. There is a rare case on some
> platforms where the prefetcher might speculatively access one of these
> regions resulting in a board crash (TZ traps and then resets the board).
> 
> We shouldn't be accessing these regions from within U-Boot anyway, so
> let's mark them all with PTE_TYPE_FAULT to prevent any speculative
> access and correctly trap in EL1 rather than EL3.
> 
> This is quite costly with caches off (takes ~2 seconds on SDM845 vs 35ms
> with caches on). So to minimise the impact this is only enabled on
> QCS404 for now (where the issue is known to occur).
> 
> In the future, we should try to find a more efficient way to handle
> this, perhaps by turning on the MMU in stages.
> 
> Signed-off-by: Caleb Connolly <caleb.connolly@linaro.org>
> ---
>  abort                            |  20 ++++++
>  arch/arm/mach-snapdragon/board.c | 151 ++++++++++++++++++++++++++++++++-------
>  2 files changed, 147 insertions(+), 24 deletions(-)
> 
> diff --git a/abort b/abort
> new file mode 100644
> index 000000000000..34c3e2f0596a

Looks like I got a bit excited with the `git add .` 😅 I'll fix this on
the next spin or while merging if a respin isn't needed.
[snip]
Sumit Garg Feb. 1, 2024, 12:07 p.m. UTC | #2
On Tue, 30 Jan 2024 at 19:35, Caleb Connolly <caleb.connolly@linaro.org> wrote:
>
> On Qualcomm platforms, the TZ may already have certain memory regions
> under protection by the time U-Boot starts. There is a rare case on some
> platforms where the prefetcher might speculatively access one of these
> regions resulting in a board crash (TZ traps and then resets the board).
>
> We shouldn't be accessing these regions from within U-Boot anyway, so
> let's mark them all with PTE_TYPE_FAULT to prevent any speculative
> access and correctly trap in EL1 rather than EL3.
>
> This is quite costly with caches off (takes ~2 seconds on SDM845 vs 35ms
> with caches on). So to minimise the impact this is only enabled on
> QCS404 for now (where the issue is known to occur).

It only took 65ms with caches off on QCS404 as per below print:

carveout time: 65ms

It's probably due to some missing bits, as pointed out below, to get it
working fast on SDM845 too.

[snip]
> +       /* Now set the right attributes for them. Often a lot of the regions are tightly packed together
> +        * so we can optimise the number of calls to mmu_change_region_attr() by combining adjacent
> +        * regions.
> +        */
> +       start = ALIGN_DOWN(res[0].start, SZ_2M);
> +       size = ALIGN(res[0].end, SZ_4K) - start;
> +       for (i = 1; i < count; i++) {

I suppose this loop fails to configure attributes for the last no-map
region, like the uefi_mem region on QCS404.
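
A sketch of one possible fix (reusing the loop's start/size locals)
would be to also flush the last coalesced region after the loop:

		/* The loop only flushes a region once the next one is
		 * found to be too far away, so the final region
		 * accumulated in start/size is never applied.
		 */
		debug("  0x%016llx - 0x%016llx FAULT\n",
		      start, start + size);
		mmu_change_region_attr(start, size, PTE_TYPE_FAULT);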

> +               /* We ideally want to 2M align everything for more efficient pagetables, but we must avoid
> +                * overwriting reserved memory regions which shouldn't be mapped as FAULT (like those with
> +                * compatible properties).
> +                * If within 2M of the previous region, bump the size to include this region. Otherwise
> +                * start a new region.
> +                */
> +               if (start + size < res[i].start - SZ_2M) {
> +                       debug("  0x%016llx - 0x%016llx FAULT\n",
> +                             start, start + size);
> +                       mmu_change_region_attr(start, size, PTE_TYPE_FAULT);

Here the size won't always be 2M aligned which is the case for SDM845.
I would suggest you do:

                            mmu_change_region_attr(start, ALIGN(size, SZ_2M), PTE_TYPE_FAULT);

-Sumit

[snip]
Caleb Connolly Feb. 1, 2024, 2:50 p.m. UTC | #3
On 01/02/2024 12:07, Sumit Garg wrote:
> On Tue, 30 Jan 2024 at 19:35, Caleb Connolly <caleb.connolly@linaro.org> wrote:
>>
>> On Qualcomm platforms, the TZ may already have certain memory regions
>> under protection by the time U-Boot starts. There is a rare case on some
>> platforms where the prefetcher might speculatively access one of these
>> regions resulting in a board crash (TZ traps and then resets the board).
>>
>> We shouldn't be accessing these regions from within U-Boot anyway, so
>> let's mark them all with PTE_TYPE_FAULT to prevent any speculative
>> access and correctly trap in EL1 rather than EL3.
>>
>> This is quite costly with caches off (takes ~2 seconds on SDM845 vs 35ms
>> with caches on). So to minimise the impact this is only enabled on
>> QCS404 for now (where the issue is known to occur).
> 
> It only took 65ms with caches off on QCS404 as per below print:
> 
> carveout time: 65ms
> 
> It's probably due to some missing bits, as pointed out below, to get it
> working fast on SDM845 too.

Ah I didn't consider that the difference might really just be the 2M vs
4K alignment.
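
(For reference: with a 4K granule, a 2M-aligned region can be mapped
with a single level-2 block entry, while the same 2M range at 4K
alignment needs a full level-3 table:

	512 entries * 8 bytes = 4 KiB of PTEs per 2M of carveout

all written with the dcache still off.)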

[snip]

>> +       /* Now set the right attributes for them. Often a lot of the regions are tightly packed together
>> +        * so we can optimise the number of calls to mmu_change_region_attr() by combining adjacent
>> +        * regions.
>> +        */
>> +       start = ALIGN_DOWN(res[0].start, SZ_2M);
>> +       size = ALIGN(res[0].end, SZ_4K) - start;
>> +       for (i = 1; i < count; i++) {
> 
> I suppose this loop fails to configure attributes for the last no-map
> region, like the uefi_mem region on QCS404.

Right.
> 
>> +               /* We ideally want to 2M align everything for more efficient pagetables, but we must avoid
>> +                * overwriting reserved memory regions which shouldn't be mapped as FAULT (like those with
>> +                * compatible properties).
>> +                * If within 2M of the previous region, bump the size to include this region. Otherwise
>> +                * start a new region.
>> +                */
>> +               if (start + size < res[i].start - SZ_2M) {
>> +                       debug("  0x%016llx - 0x%016llx FAULT\n",
>> +                             start, start + size);
>> +                       mmu_change_region_attr(start, size, PTE_TYPE_FAULT);
> 
> Here the size won't always be 2M aligned which is the case for SDM845.
> I would suggest you do:
> 
>                             mmu_change_region_attr(start, ALIGN(size, SZ_2M), PTE_TYPE_FAULT);

This isn't an option as I want to avoid unmapping reserved memory
regions which have a compatible property; on SDM845 for example this
includes the cmd-db and rmtfs regions. These are not 2M aligned and
might be directly adjacent to other regions, so doing an align here
could result in unmapping part of these regions.
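
For example, with made-up addresses:

	/* Hypothetical layout, for illustration only:
	 *   no-map carveout:            0x85c00000 - 0x85d01000
	 *   cmd-db (compatible, keep):  0x85d01000 - 0x85d02000
	 */
	start = ALIGN_DOWN(0x85c00000, SZ_2M);    /* 0x85c00000 */
	size = ALIGN(0x85d01000, SZ_4K) - start;  /* 0x101000 */
	/* ALIGN(size, SZ_2M) would round size up to 0x200000, so the
	 * FAULT mapping would extend to 0x85e00000 and unmap cmd-db
	 * along with it.
	 */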

This will be especially relevant for SM8250 where I'll be enabling
cmd-db and RPMh drivers in order to turn on the USB VBUS regulator on RB5.
[snip]
Sumit Garg Feb. 2, 2024, 6:03 a.m. UTC | #4
On Thu, 1 Feb 2024 at 20:20, Caleb Connolly <caleb.connolly@linaro.org> wrote:
[snip]
> >> +               if (start + size < res[i].start - SZ_2M) {
> >> +                       debug("  0x%016llx - 0x%016llx FAULT\n",
> >> +                             start, start + size);
> >> +                       mmu_change_region_attr(start, size, PTE_TYPE_FAULT);
> >
> > Here the size won't always be 2M aligned which is the case for SDM845.
> > I would suggest you do:
> >
> >                             mmu_change_region_attr(start, ALIGN(size, SZ_2M), PTE_TYPE_FAULT);
>
> This isn't an option as I want to avoid unmapping reserved memory
> regions which have a compatible property; on SDM845 for example this
> includes the cmd-db and rmtfs regions. These are not 2M aligned and
> might be directly adjacent to other regions, so doing an align here
> could result in unmapping part of these regions.

The regions with a compatible property and "no-map" should be mapped
by the corresponding drivers later on, once caches are enabled. The
default memory attributes or cache policy may not make sense for them,
e.g. a shared memory region with coprocessors may have to be marked
un-cached.
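
Something like this sketch in the driver's probe path (illustrative
only, not the actual cmd-db code; assume the region was already read
into a struct fdt_resource res):

	/* Caches are on by this point, so remapping just this carveout
	 * with the attributes the driver needs is cheap.
	 */
	mmu_change_region_attr(res.start, fdt_resource_size(&res),
			       PTE_BLOCK_MEMTYPE(MT_NORMAL_NC) |
			       PTE_BLOCK_NON_SHARE | PTE_TYPE_VALID);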

>
> This will be especially relevant for SM8250 where I'll be enabling
> cmd-db and RPMh drivers in order to turn on the USB VBUS regulator on RB5.

So the cmd-db and RPMh drivers should map corresponding reserved
memory appropriately as per their requirements via
mmu_change_region_attr(). Since we have the caches enabled at that
point, it won't be an expensive operation.

-Sumit

[snip]

Patch

diff --git a/abort b/abort
new file mode 100644
index 000000000000..34c3e2f0596a
--- /dev/null
+++ b/abort
@@ -0,0 +1,20 @@ 
+"Synchronous Abort" handler, esr 0x96000007, far 0x0
+elr: 0000000000005070 lr : 00000000000039a8 (reloc)
+elr: 000000017ff27070 lr : 000000017ff259a8
+x0 : 0000000000000000 x1 : 0000000000000000
+x2 : 000000017faeb328 x3 : 000000017faeb320
+x4 : 00000000000feae8 x5 : 00000000feae8000
+x6 : 0000000000000007 x7 : 0000000000000004
+x8 : 0000000000000000 x9 : 000000017eae7000
+x10: 0000000000000000 x11: 000000027c89ffff
+x12: 000000017fffffff x13: 0000000180000000
+x14: 0000000000518db0 x15: 000000017faeb0cc
+x16: 000000017ff2edac x17: 0000000000000000
+x18: 000000017fb02d50 x19: 000000017ffbd600
+x20: 000000017ffbd600 x21: 0000000000000000
+x22: 0000000000001f1f x23: 000000017faeb370
+x24: 000000017ffbd000 x25: 000000017ff94993
+x26: 0000000000000030 x27: 000000017ffbecf0
+x28: 0000000000000000 x29: 000000017faeb2a0
+
+Code: a94153f3 f94013f5 a8c47bfd d65f03c0 (b9400002) 
diff --git a/arch/arm/mach-snapdragon/board.c b/arch/arm/mach-snapdragon/board.c
index 9d8684a21fa1..ca300dc843a9 100644
--- a/arch/arm/mach-snapdragon/board.c
+++ b/arch/arm/mach-snapdragon/board.c
@@ -6,8 +6,9 @@ 
  * Author: Caleb Connolly <caleb.connolly@linaro.org>
  */
 
-#define LOG_DEBUG
+//#define LOG_DEBUG
 
+#include "time.h"
 #include <asm/armv8/mmu.h>
 #include <asm/gpio.h>
 #include <asm/io.h>
@@ -26,6 +27,7 @@ 
 #include <lmb.h>
 #include <malloc.h>
 #include <usb.h>
+#include <sort.h>
 
 DECLARE_GLOBAL_DATA_PTR;
 
@@ -297,7 +299,7 @@ int board_late_init(void)
 
 static void build_mem_map(void)
 {
-	int i;
+	int i, j;
 
 	/*
 	 * Ensure the peripheral block is sized to correctly cover the address range
@@ -313,39 +315,140 @@ static void build_mem_map(void)
 			 PTE_BLOCK_NON_SHARE |
 			 PTE_BLOCK_PXN | PTE_BLOCK_UXN;
 
-	debug("Configured memory map:\n");
-	debug("  0x%016llx - 0x%016llx: Peripheral block\n",
-	      mem_map[0].phys, mem_map[0].phys + mem_map[0].size);
-
-	/*
-	 * Now add memory map entries for each DRAM bank, ensuring we don't
-	 * overwrite the list terminator
-	 */
-	for (i = 0; i < ARRAY_SIZE(rbx_mem_map) - 2 && gd->bd->bi_dram[i].size; i++) {
-		if (i == ARRAY_SIZE(rbx_mem_map) - 1) {
-			log_warning("Too many DRAM banks!\n");
-			break;
-		}
-		mem_map[i + 1].phys = gd->bd->bi_dram[i].start;
-		mem_map[i + 1].virt = mem_map[i + 1].phys;
-		mem_map[i + 1].size = gd->bd->bi_dram[i].size;
-		mem_map[i + 1].attrs = PTE_BLOCK_MEMTYPE(MT_NORMAL) |
-				     PTE_BLOCK_INNER_SHARE;
-
-		debug("  0x%016llx - 0x%016llx: DDR bank %d\n",
-		      mem_map[i + 1].phys, mem_map[i + 1].phys + mem_map[i + 1].size, i);
+	for (i = 1, j = 0; i < ARRAY_SIZE(rbx_mem_map) - 1 && gd->bd->bi_dram[j].size; i++, j++) {
+		mem_map[i].phys = gd->bd->bi_dram[j].start;
+		mem_map[i].virt = mem_map[i].phys;
+		mem_map[i].size = gd->bd->bi_dram[j].size;
+		mem_map[i].attrs = PTE_BLOCK_MEMTYPE(MT_NORMAL) | \
+				   PTE_BLOCK_INNER_SHARE;
 	}
+
+	mem_map[i].phys = UINT64_MAX;
+	mem_map[i].size = 0;
+
+	debug("Configured memory map:\n");
+	for (i = 0; mem_map[i].size; i++)
+		debug("  0x%016llx - 0x%016llx: entry %d\n",
+		      mem_map[i].phys, mem_map[i].phys + mem_map[i].size, i);
 }
 
 u64 get_page_table_size(void)
 {
-	return SZ_64K;
+	return SZ_256K;
 }
 
+static int fdt_cmp_res(const void *v1, const void *v2)
+{
+	const struct fdt_resource *res1 = v1, *res2 = v2;
+
+	return res1->start - res2->start;
+}
+
+#define N_RESERVED_REGIONS 32
+
+/* Mark all no-map regions as PTE_TYPE_FAULT to prevent speculative access.
+ * On some platforms this is enough to trigger a security violation and trap
+ * to EL3.
+ */
+static void carve_out_reserved_memory(void)
+{
+	static struct fdt_resource res[N_RESERVED_REGIONS] = { 0 };
+	int parent, rmem, count, i = 0;
+	phys_addr_t start;
+	size_t size;
+
+	/* Some reserved nodes must be carved out, as the cache-prefetcher may otherwise
+	 * attempt to access them, causing a security exception.
+	 */
+	parent = fdt_path_offset(gd->fdt_blob, "/reserved-memory");
+	if (parent <= 0) {
+		log_err("No reserved memory regions found\n");
+		return;
+	}
+	count = fdtdec_get_child_count(gd->fdt_blob, parent);
+	if (count > N_RESERVED_REGIONS) {
+		log_err("Too many reserved memory regions!\n");
+		count = N_RESERVED_REGIONS;
+	}
+
+	/* Collect the reserved memory regions */
+	fdt_for_each_subnode(rmem, gd->fdt_blob, parent) {
+		if (!fdt_getprop(gd->fdt_blob, rmem, "no-map", NULL))
+			continue;
+		if (fdt_getprop(gd->fdt_blob, rmem, "compatible", NULL))
+			continue;
+		/* Skip regions that don't have a fixed address/size */
+		if (fdt_get_resource(gd->fdt_blob, rmem, "reg", 0, &res[i++]))
+			i--;
+	}
+
+	/* Sort the reserved memory regions by address */
+	count = i;
+	qsort(res, count, sizeof(struct fdt_resource), fdt_cmp_res);
+
+	/* Now set the right attributes for them. Often a lot of the regions are tightly packed together
+	 * so we can optimise the number of calls to mmu_change_region_attr() by combining adjacent
+	 * regions.
+	 */
+	start = ALIGN_DOWN(res[0].start, SZ_2M);
+	size = ALIGN(res[0].end, SZ_4K) - start;
+	for (i = 1; i < count; i++) {
+		/* We ideally want to 2M align everything for more efficient pagetables, but we must avoid
+		 * overwriting reserved memory regions which shouldn't be mapped as FAULT (like those with
+		 * compatible properties).
+		 * If within 2M of the previous region, bump the size to include this region. Otherwise
+		 * start a new region.
+		 */
+		if (start + size < res[i].start - SZ_2M) {
+			debug("  0x%016llx - 0x%016llx FAULT\n",
+			      start, start + size);
+			mmu_change_region_attr(start, size, PTE_TYPE_FAULT);
+			start = ALIGN_DOWN(res[i].start, SZ_2M);
+			size = ALIGN(res[i].end, SZ_4K) - start;
+		} else {
+			/* Bump size if this region is immediately after the previous one */
+			size = ALIGN(res[i].end, SZ_4K) - start;
+		}
+	}
+}
+
+/* This function open-codes setup_all_pgtables() so that we can
+ * insert additional mappings *before* turning on the MMU.
+ */
 void enable_caches(void)
 {
+	u64 tlb_addr = gd->arch.tlb_addr;
+	u64 tlb_size = gd->arch.tlb_size;
+	ulong carveout_start;
+
+	gd->arch.tlb_fillptr = tlb_addr;
+
 	build_mem_map();
 
 	icache_enable();
+
+	/* Create normal system page tables */
+	setup_pgtables();
+
+	/* Create emergency page tables */
+	gd->arch.tlb_size -= (uintptr_t)gd->arch.tlb_fillptr -
+			     (uintptr_t)gd->arch.tlb_addr;
+	gd->arch.tlb_addr = gd->arch.tlb_fillptr;
+	setup_pgtables();
+	gd->arch.tlb_emerg = gd->arch.tlb_addr;
+	gd->arch.tlb_addr = tlb_addr;
+	gd->arch.tlb_size = tlb_size;
+
+	/* Doing this has a noticeable impact on boot time, so until we can
+	 * find a more efficient solution, just enable the workaround for
+	 * the one board where it's necessary. Without this there seems to
+	 * be a speculative access to a region which is protected by the TZ.
+	 */
+	if (of_machine_is_compatible("qcom,qcs404")) {
+		carveout_start = get_timer(0);
+		carve_out_reserved_memory();
+		debug("carveout time: %lums\n", get_timer(carveout_start));
+	}
+
 	dcache_enable();
 }