diff mbox

[v5sub1,8/8] arm64: allow kernel Image to be loaded anywhere in physical memory

Message ID 1454324093-15998-9-git-send-email-ard.biesheuvel@linaro.org
State New
Headers show

Commit Message

Ard Biesheuvel Feb. 1, 2016, 10:54 a.m. UTC
This relaxes the kernel Image placement requirements, so that it
may be placed at any 2 MB aligned offset in physical memory.

This is accomplished by ignoring PHYS_OFFSET when installing
memblocks, and accounting for the apparent virtual offset of
the kernel Image. As a result, virtual address references
below PAGE_OFFSET are correctly mapped onto physical references
into the kernel Image regardless of where it sits in memory.

Note that limiting memory using mem= is no longer unambiguous after
this change, considering that the kernel may be at the top of physical
memory, and clipping from the bottom rather than the top will discard
any 32-bit DMA addressable memory first. To deal with this, the handling
of mem= is reimplemented to clip top down, but take special care not to
clip memory that covers the kernel image.

Since mem= should not be considered a production feature, a panic notifier
handler is installed that dumps the memory limit at panic time if one was
set.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

---
 Documentation/arm64/booting.txt         |  20 ++--
 arch/arm64/include/asm/boot.h           |   6 ++
 arch/arm64/include/asm/kernel-pgtable.h |  12 +++
 arch/arm64/include/asm/kvm_asm.h        |   2 +-
 arch/arm64/include/asm/memory.h         |  15 +--
 arch/arm64/kernel/head.S                |   6 +-
 arch/arm64/kernel/image.h               |  13 ++-
 arch/arm64/mm/init.c                    | 100 +++++++++++++++++++-
 arch/arm64/mm/mmu.c                     |   3 +
 9 files changed, 155 insertions(+), 22 deletions(-)

-- 
2.5.0


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

Comments

Mark Rutland Feb. 1, 2016, 2:50 p.m. UTC | #1
On Mon, Feb 01, 2016 at 11:54:53AM +0100, Ard Biesheuvel wrote:
> This relaxes the kernel Image placement requirements, so that it

> may be placed at any 2 MB aligned offset in physical memory.

> 

> This is accomplished by ignoring PHYS_OFFSET when installing

> memblocks, and accounting for the apparent virtual offset of

> the kernel Image. As a result, virtual address references

> below PAGE_OFFSET are correctly mapped onto physical references

> into the kernel Image regardless of where it sits in memory.

> 

> Note that limiting memory using mem= is not unambiguous anymore after

> this change, considering that the kernel may be at the top of physical

> memory, and clipping from the bottom rather than the top will discard

> any 32-bit DMA addressable memory first. To deal with this, the handling

> of mem= is reimplemented to clip top down, but take special care not to

> clip memory that covers the kernel image.

> 

> Since mem= should not be considered a production feature, a panic notifier

> handler is installed that dumps the memory limit at panic time if one was

> set.


Good idea!

It would be great if we could follow up with a sizes.h update for SZ_4G,
though that's only a nice-to-have, and in no way should block this.

Other than that, this looks good. Thanks for putting this together!

Reviewed-by: Mark Rutland <mark.rutland@arm.com>


For the Documentation/arm64 parts we'll need to ask Fu Wei to update the
zh_CN/ translation to match.

Mark.

> 

> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

> ---

>  Documentation/arm64/booting.txt         |  20 ++--

>  arch/arm64/include/asm/boot.h           |   6 ++

>  arch/arm64/include/asm/kernel-pgtable.h |  12 +++

>  arch/arm64/include/asm/kvm_asm.h        |   2 +-

>  arch/arm64/include/asm/memory.h         |  15 +--

>  arch/arm64/kernel/head.S                |   6 +-

>  arch/arm64/kernel/image.h               |  13 ++-

>  arch/arm64/mm/init.c                    | 100 +++++++++++++++++++-

>  arch/arm64/mm/mmu.c                     |   3 +

>  9 files changed, 155 insertions(+), 22 deletions(-)

> 

> diff --git a/Documentation/arm64/booting.txt b/Documentation/arm64/booting.txt

> index 701d39d3171a..56d6d8b796db 100644

> --- a/Documentation/arm64/booting.txt

> +++ b/Documentation/arm64/booting.txt

> @@ -109,7 +109,13 @@ Header notes:

>  			1 - 4K

>  			2 - 16K

>  			3 - 64K

> -  Bits 3-63:	Reserved.

> +  Bit 3:	Kernel physical placement

> +			0 - 2MB aligned base should be as close as possible

> +			    to the base of DRAM, since memory below it is not

> +			    accessible via the linear mapping

> +			1 - 2MB aligned base may be anywhere in physical

> +			    memory

> +  Bits 4-63:	Reserved.

>  

>  - When image_size is zero, a bootloader should attempt to keep as much

>    memory as possible free for use by the kernel immediately after the

> @@ -117,14 +123,14 @@ Header notes:

>    depending on selected features, and is effectively unbound.

>  

>  The Image must be placed text_offset bytes from a 2MB aligned base

> -address near the start of usable system RAM and called there. Memory

> -below that base address is currently unusable by Linux, and therefore it

> -is strongly recommended that this location is the start of system RAM.

> -The region between the 2 MB aligned base address and the start of the

> -image has no special significance to the kernel, and may be used for

> -other purposes.

> +address anywhere in usable system RAM and called there. The region

> +between the 2 MB aligned base address and the start of the image has no

> +special significance to the kernel, and may be used for other purposes.

>  At least image_size bytes from the start of the image must be free for

>  use by the kernel.

> +NOTE: versions prior to v4.6 cannot make use of memory below the

> +physical offset of the Image so it is recommended that the Image be

> +placed as close as possible to the start of system RAM.

>  

>  Any memory described to the kernel (even that below the start of the

>  image) which is not marked as reserved from the kernel (e.g., with a

> diff --git a/arch/arm64/include/asm/boot.h b/arch/arm64/include/asm/boot.h

> index 81151b67b26b..ebf2481889c3 100644

> --- a/arch/arm64/include/asm/boot.h

> +++ b/arch/arm64/include/asm/boot.h

> @@ -11,4 +11,10 @@

>  #define MIN_FDT_ALIGN		8

>  #define MAX_FDT_SIZE		SZ_2M

>  

> +/*

> + * arm64 requires the kernel image to be placed

> + * TEXT_OFFSET bytes beyond a 2 MB aligned base

> + */

> +#define MIN_KIMG_ALIGN		SZ_2M

> +

>  #endif

> diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h

> index a459714ee29e..5c6375d8528b 100644

> --- a/arch/arm64/include/asm/kernel-pgtable.h

> +++ b/arch/arm64/include/asm/kernel-pgtable.h

> @@ -79,5 +79,17 @@

>  #define SWAPPER_MM_MMUFLAGS	(PTE_ATTRINDX(MT_NORMAL) | SWAPPER_PTE_FLAGS)

>  #endif

>  

> +/*

> + * To make optimal use of block mappings when laying out the linear

> + * mapping, round down the base of physical memory to a size that can

> + * be mapped efficiently, i.e., either PUD_SIZE (4k granule) or PMD_SIZE

> + * (64k granule), or a multiple that can be mapped using contiguous bits

> + * in the page tables: 32 * PMD_SIZE (16k granule)

> + */

> +#ifdef CONFIG_ARM64_64K_PAGES

> +#define ARM64_MEMSTART_ALIGN	SZ_512M

> +#else

> +#define ARM64_MEMSTART_ALIGN	SZ_1G

> +#endif

>  

>  #endif	/* __ASM_KERNEL_PGTABLE_H */

> diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h

> index f5aee6e764e6..054ac25e7c2e 100644

> --- a/arch/arm64/include/asm/kvm_asm.h

> +++ b/arch/arm64/include/asm/kvm_asm.h

> @@ -26,7 +26,7 @@

>  #define KVM_ARM64_DEBUG_DIRTY_SHIFT	0

>  #define KVM_ARM64_DEBUG_DIRTY		(1 << KVM_ARM64_DEBUG_DIRTY_SHIFT)

>  

> -#define kvm_ksym_ref(sym)		((void *)&sym - KIMAGE_VADDR + PAGE_OFFSET)

> +#define kvm_ksym_ref(sym)		phys_to_virt((u64)&sym - kimage_voffset)

>  

>  #ifndef __ASSEMBLY__

>  struct kvm;

> diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h

> index 4388651d1f0d..61005e7dd6cb 100644

> --- a/arch/arm64/include/asm/memory.h

> +++ b/arch/arm64/include/asm/memory.h

> @@ -88,10 +88,10 @@

>  #define __virt_to_phys(x) ({						\

>  	phys_addr_t __x = (phys_addr_t)(x);				\

>  	__x >= PAGE_OFFSET ? (__x - PAGE_OFFSET + PHYS_OFFSET) :	\

> -			     (__x - KIMAGE_VADDR + PHYS_OFFSET); })

> +			     (__x - kimage_voffset); })

>  

>  #define __phys_to_virt(x)	((unsigned long)((x) - PHYS_OFFSET + PAGE_OFFSET))

> -#define __phys_to_kimg(x)	((unsigned long)((x) - PHYS_OFFSET + KIMAGE_VADDR))

> +#define __phys_to_kimg(x)	((unsigned long)((x) + kimage_voffset))

>  

>  /*

>   * Convert a page to/from a physical address

> @@ -127,13 +127,14 @@ extern phys_addr_t		memstart_addr;

>  /* PHYS_OFFSET - the physical address of the start of memory. */

>  #define PHYS_OFFSET		({ memstart_addr; })

>  

> +/* the offset between the kernel virtual and physical mappings */

> +extern u64			kimage_voffset;

> +

>  /*

> - * The maximum physical address that the linear direct mapping

> - * of system RAM can cover. (PAGE_OFFSET can be interpreted as

> - * a 2's complement signed quantity and negated to derive the

> - * maximum size of the linear mapping.)

> + * Allow all memory at the discovery stage. We will clip it later.

>   */

> -#define MAX_MEMBLOCK_ADDR	({ memstart_addr - PAGE_OFFSET - 1; })

> +#define MIN_MEMBLOCK_ADDR	0

> +#define MAX_MEMBLOCK_ADDR	U64_MAX

>  

>  /*

>   * PFNs are used to describe any physical page; this means

> diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S

> index 04d38a058b19..05b98289093e 100644

> --- a/arch/arm64/kernel/head.S

> +++ b/arch/arm64/kernel/head.S

> @@ -428,7 +428,11 @@ __mmap_switched:

>  	and	x4, x4, #~(THREAD_SIZE - 1)

>  	msr	sp_el0, x4			// Save thread_info

>  	str_l	x21, __fdt_pointer, x5		// Save FDT pointer

> -	str_l	x24, memstart_addr, x6		// Save PHYS_OFFSET

> +

> +	ldr	x4, =KIMAGE_VADDR		// Save the offset between

> +	sub	x4, x4, x24			// the kernel virtual and

> +	str_l	x4, kimage_voffset, x5		// physical mappings

> +

>  	mov	x29, #0

>  #ifdef CONFIG_KASAN

>  	bl	kasan_early_init

> diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h

> index 999633bd7294..c9c62cab25a4 100644

> --- a/arch/arm64/kernel/image.h

> +++ b/arch/arm64/kernel/image.h

> @@ -42,15 +42,18 @@

>  #endif

>  

>  #ifdef CONFIG_CPU_BIG_ENDIAN

> -#define __HEAD_FLAG_BE	1

> +#define __HEAD_FLAG_BE		1

>  #else

> -#define __HEAD_FLAG_BE	0

> +#define __HEAD_FLAG_BE		0

>  #endif

>  

> -#define __HEAD_FLAG_PAGE_SIZE ((PAGE_SHIFT - 10) / 2)

> +#define __HEAD_FLAG_PAGE_SIZE	((PAGE_SHIFT - 10) / 2)

>  

> -#define __HEAD_FLAGS	((__HEAD_FLAG_BE << 0) |	\

> -			 (__HEAD_FLAG_PAGE_SIZE << 1))

> +#define __HEAD_FLAG_PHYS_BASE	1

> +

> +#define __HEAD_FLAGS		((__HEAD_FLAG_BE << 0) |	\

> +				 (__HEAD_FLAG_PAGE_SIZE << 1) |	\

> +				 (__HEAD_FLAG_PHYS_BASE << 3))

>  

>  /*

>   * These will output as part of the Image header, which should be little-endian

> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c

> index 1d627cd8121c..e8e853a1024c 100644

> --- a/arch/arm64/mm/init.c

> +++ b/arch/arm64/mm/init.c

> @@ -35,8 +35,10 @@

>  #include <linux/efi.h>

>  #include <linux/swiotlb.h>

>  

> +#include <asm/boot.h>

>  #include <asm/fixmap.h>

>  #include <asm/kasan.h>

> +#include <asm/kernel-pgtable.h>

>  #include <asm/memory.h>

>  #include <asm/sections.h>

>  #include <asm/setup.h>

> @@ -158,9 +160,80 @@ static int __init early_mem(char *p)

>  }

>  early_param("mem", early_mem);

>  

> +/*

> + * clip_mem_range() - remove memblock memory between @min and @max until

> + *                    we meet the limit in 'memory_limit'.

> + */

> +static void __init clip_mem_range(u64 min, u64 max)

> +{

> +	u64 mem_size, to_remove;

> +	int i;

> +

> +again:

> +	mem_size = memblock_phys_mem_size();

> +	if (mem_size <= memory_limit || max <= min)

> +		return;

> +

> +	to_remove = mem_size - memory_limit;

> +

> +	for (i = memblock.memory.cnt - 1; i >= 0; i--) {

> +		struct memblock_region *r = memblock.memory.regions + i;

> +		u64 start = max(min, r->base);

> +		u64 end = min(max, r->base + r->size);

> +

> +		if (start >= max || end <= min)

> +			continue;

> +

> +		if (end > min) {

> +			u64 size = min(to_remove, end - max(start, min));

> +

> +			memblock_remove(end - size, size);

> +		} else {

> +			memblock_remove(start, min(max - start, to_remove));

> +		}

> +		goto again;

> +	}

> +}

> +

>  void __init arm64_memblock_init(void)

>  {

> -	memblock_enforce_memory_limit(memory_limit);

> +	const s64 linear_region_size = -(s64)PAGE_OFFSET;

> +

> +	/*

> +	 * Select a suitable value for the base of physical memory.

> +	 */

> +	memstart_addr = round_down(memblock_start_of_DRAM(),

> +				   ARM64_MEMSTART_ALIGN);

> +

> +	/*

> +	 * Remove the memory that we will not be able to cover with the

> +	 * linear mapping. Take care not to clip the kernel which may be

> +	 * high in memory.

> +	 */

> +	memblock_remove(max(memstart_addr + linear_region_size, __pa(_end)),

> +			ULLONG_MAX);

> +	if (memblock_end_of_DRAM() > linear_region_size)

> +		memblock_remove(0, memblock_end_of_DRAM() - linear_region_size);

> +

> +	if (memory_limit != (phys_addr_t)ULLONG_MAX) {

> +		u64 kbase = round_down(__pa(_text), MIN_KIMG_ALIGN);

> +		u64 kend = PAGE_ALIGN(__pa(_end));

> +		u64 const sz_4g = 0x100000000UL;

> +

> +		/*

> +		 * Clip memory in order of preference:

> +		 * - above the kernel and above 4 GB

> +		 * - between 4 GB and the start of the kernel (if the kernel

> +		 *   is loaded high in memory)

> +		 * - between the kernel and 4 GB (if the kernel is loaded

> +		 *   low in memory)

> +		 * - below 4 GB

> +		 */

> +		clip_mem_range(max(sz_4g, kend), ULLONG_MAX);

> +		clip_mem_range(sz_4g, kbase);

> +		clip_mem_range(kend, sz_4g);

> +		clip_mem_range(0, min(kbase, sz_4g));

> +	}

>  

>  	/*

>  	 * Register the kernel text, kernel data, initrd, and initial

> @@ -381,3 +454,28 @@ static int __init keepinitrd_setup(char *__unused)

>  

>  __setup("keepinitrd", keepinitrd_setup);

>  #endif

> +

> +/*

> + * Dump out memory limit information on panic.

> + */

> +static int dump_mem_limit(struct notifier_block *self, unsigned long v, void *p)

> +{

> +	if (memory_limit != (phys_addr_t)ULLONG_MAX) {

> +		pr_emerg("Memory Limit: %llu MB\n", memory_limit >> 20);

> +	} else {

> +		pr_emerg("Memory Limit: none\n");

> +	}

> +	return 0;

> +}

> +

> +static struct notifier_block mem_limit_notifier = {

> +	.notifier_call = dump_mem_limit,

> +};

> +

> +static int __init register_mem_limit_dumper(void)

> +{

> +	atomic_notifier_chain_register(&panic_notifier_list,

> +				       &mem_limit_notifier);

> +	return 0;

> +}

> +__initcall(register_mem_limit_dumper);

> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c

> index 4c4b15932963..8dda38378959 100644

> --- a/arch/arm64/mm/mmu.c

> +++ b/arch/arm64/mm/mmu.c

> @@ -46,6 +46,9 @@

>  

>  u64 idmap_t0sz = TCR_T0SZ(VA_BITS);

>  

> +u64 kimage_voffset __read_mostly;

> +EXPORT_SYMBOL(kimage_voffset);

> +

>  /*

>   * Empty_zero_page is a special page that is used for zero-initialized data

>   * and COW.

> -- 

> 2.5.0

> 


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
Catalin Marinas Feb. 1, 2016, 3:06 p.m. UTC | #2
On Mon, Feb 01, 2016 at 11:54:53AM +0100, Ard Biesheuvel wrote:
> Note that limiting memory using mem= is not unambiguous anymore after

> this change, considering that the kernel may be at the top of physical

> memory, and clipping from the bottom rather than the top will discard

> any 32-bit DMA addressable memory first. To deal with this, the handling

> of mem= is reimplemented to clip top down, but take special care not to

> clip memory that covers the kernel image.


I may have forgotten the reason - why do we need to avoid clipping the
memory that covers the kernel image? It's already mapped in the vmalloc
area, so we wouldn't need it in the linear map as well.

-- 
Catalin

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
Ard Biesheuvel Feb. 1, 2016, 3:13 p.m. UTC | #3
On 1 February 2016 at 16:06, Catalin Marinas <catalin.marinas@arm.com> wrote:
> On Mon, Feb 01, 2016 at 11:54:53AM +0100, Ard Biesheuvel wrote:

>> Note that limiting memory using mem= is not unambiguous anymore after

>> this change, considering that the kernel may be at the top of physical

>> memory, and clipping from the bottom rather than the top will discard

>> any 32-bit DMA addressable memory first. To deal with this, the handling

>> of mem= is reimplemented to clip top down, but take special care not to

>> clip memory that covers the kernel image.

>

> I may have forgotten the reason - why do we need to avoid clipping the

> memory that covers the kernel image? It's already mapped in the vmalloc

> area, so we wouldn't need it in the linear map as well.

>


Good question. Originally, I needed it for swapper_pg_dir, whose
pud/pmd/pte levels were accessed via __va() translations of the values
found in the higher-up table entries, but after Mark's patches, only
the top level pgd of swapper_pg_dir is still used. Similarly, for
idmap_pg_dir, we don't change any mappings at runtime so the same
applies there I think.

I will try dropping this, and see what happens.

-- 
Ard.

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
Ard Biesheuvel Feb. 1, 2016, 4:31 p.m. UTC | #4
On 1 February 2016 at 16:13, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> On 1 February 2016 at 16:06, Catalin Marinas <catalin.marinas@arm.com> wrote:

>> On Mon, Feb 01, 2016 at 11:54:53AM +0100, Ard Biesheuvel wrote:

>>> Note that limiting memory using mem= is not unambiguous anymore after

>>> this change, considering that the kernel may be at the top of physical

>>> memory, and clipping from the bottom rather than the top will discard

>>> any 32-bit DMA addressable memory first. To deal with this, the handling

>>> of mem= is reimplemented to clip top down, but take special care not to

>>> clip memory that covers the kernel image.

>>

>> I may have forgotten the reason - why do we need to avoid clipping the

>> memory that covers the kernel image? It's already mapped in the vmalloc

>> area, so we wouldn't need it in the linear map as well.

>>

>

> Good question. Originally, I needed it for swapper_pg_dir, whose

> pud/pmd/pte levels were accessed via __va() translations of the values

> found in the higher-up table entries, but after Mark's patches, only

> the top level pgd of swapper_pg_dir is still used. Similarly, for

> idmap_pg_dir, we don't change any mappings at runtime so the same

> applies there I think.

>

> I will try dropping this, and see what happens.

>


I have given this a spin, and it chokes on two things:
a) not all of the translation tables are accessible via the linear
mapping: the fixmap, due to its vicinity to PCI i/o and other populated
regions, will share its pud/pmd level tables with other users, like
ioremap, which traverses the translation tables in the ordinary way,
i.e., it expects that __va() applied to the phys address in the table
entry returns something that is mapped;
b) free_initmem() now calls __free_pages() on a region that we never
mapped or registered as available.

So it may be feasible with some hackery, but I wonder if it is worth
it to complicate the common case for implementing mem= more
efficiently.

-- 
Ard.

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
Catalin Marinas Feb. 1, 2016, 5:31 p.m. UTC | #5
On Mon, Feb 01, 2016 at 05:31:11PM +0100, Ard Biesheuvel wrote:
> On 1 February 2016 at 16:13, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:

> > On 1 February 2016 at 16:06, Catalin Marinas <catalin.marinas@arm.com> wrote:

> >> On Mon, Feb 01, 2016 at 11:54:53AM +0100, Ard Biesheuvel wrote:

> >>> Note that limiting memory using mem= is not unambiguous anymore after

> >>> this change, considering that the kernel may be at the top of physical

> >>> memory, and clipping from the bottom rather than the top will discard

> >>> any 32-bit DMA addressable memory first. To deal with this, the handling

> >>> of mem= is reimplemented to clip top down, but take special care not to

> >>> clip memory that covers the kernel image.

> >>

> >> I may have forgotten the reason - why do we need to avoid clipping the

> >> memory that covers the kernel image? It's already mapped in the vmalloc

> >> area, so we wouldn't need it in the linear map as well.

> >

> > Good question. Originally, I needed it for swapper_pg_dir, whose

> > pud/pmd/pte levels were accessed via __va() translations of the values

> > found in the higher-up table entries, but after Mark's patches, only

> > the top level pgd of swapper_pg_dir is still used. Similarly, for

> > idmap_pg_dir, we don't change any mappings at runtime so the same

> > applies there I think.

> >

> > I will try dropping this, and see what happens.

> 

> I have given this a spin, and this chokes on

> a) the fact that not all of the translation tables are accessible via

> the linear mapping: the fixmap, due to its vicinity to PCI i/o and

> other populated regions, will share its pud/pmd level tables with

> other users, like ioremap, which traverses the translation tables in

> the ordinary way, i.e., it expects that __va() applied on the phys

> address in the table entry returns something that is mapped


Ah, __va(__pa(x)) is not an identity function and I don't think it's
worth fixing it (the __pa() case is much simpler). But it also means
that we won't be able to remove the kernel image alias in the linear
mapping. It shouldn't be a problem for KASLR as long as we randomise
both kernel image PA and VA.

> b) free_initmem() now calls __free_pages() on a region that we never

> mapped or registered as available.

> 

> So it may be feasible with some hackery, but I wonder if it is worth

> it to complicate the common case for implementing mem= more

> efficiently.


I don't care about efficiency, I was hoping to avoid the additional
arm64-specific memory clipping but it seems that it could easily get
more complicated. So let's leave it as it is.

Consider this sub-series merged (I'll push it to -next around -rc3).

Thanks.

-- 
Catalin

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
Ard Biesheuvel Feb. 1, 2016, 5:57 p.m. UTC | #6
On 1 February 2016 at 18:31, Catalin Marinas <catalin.marinas@arm.com> wrote:
> On Mon, Feb 01, 2016 at 05:31:11PM +0100, Ard Biesheuvel wrote:

>> On 1 February 2016 at 16:13, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:

>> > On 1 February 2016 at 16:06, Catalin Marinas <catalin.marinas@arm.com> wrote:

>> >> On Mon, Feb 01, 2016 at 11:54:53AM +0100, Ard Biesheuvel wrote:

>> >>> Note that limiting memory using mem= is not unambiguous anymore after

>> >>> this change, considering that the kernel may be at the top of physical

>> >>> memory, and clipping from the bottom rather than the top will discard

>> >>> any 32-bit DMA addressable memory first. To deal with this, the handling

>> >>> of mem= is reimplemented to clip top down, but take special care not to

>> >>> clip memory that covers the kernel image.

>> >>

>> >> I may have forgotten the reason - why do we need to avoid clipping the

>> >> memory that covers the kernel image? It's already mapped in the vmalloc

>> >> area, so we wouldn't need it in the linear map as well.

>> >

>> > Good question. Originally, I needed it for swapper_pg_dir, whose

>> > pud/pmd/pte levels were accessed via __va() translations of the values

>> > found in the higher-up table entries, but after Mark's patches, only

>> > the top level pgd of swapper_pg_dir is still used. Similarly, for

>> > idmap_pg_dir, we don't change any mappings at runtime so the same

>> > applies there I think.

>> >

>> > I will try dropping this, and see what happens.

>>

>> I have given this a spin, and this chokes on

>> a) the fact that not all of the translation tables are accessible via

>> the linear mapping: the fixmap, due to its vicinity to PCI i/o and

>> other populated regions, will share its pud/pmd level tables with

>> other users, like ioremap, which traverses the translation tables in

>> the ordinary way, i.e., it expects that __va() applied on the phys

>> address in the table entry returns something that is mapped

>

> Ah, __va(__pa(x)) is not an identity function and I don't think it's

> worth fixing it (the __pa() case is much simpler). But it also means

> that we won't be able to remove the kernel image alias in the linear

> mapping. It shouldn't be a problem for KASLR as long as we randomise

> both kernel image PA and VA.

>


indeed.

>> b) free_initmem() now calls __free_pages() on a region that we never

>> mapped or registered as available.

>>

>> So it may be feasible with some hackery, but I wonder if it is worth

>> it to complicate the common case for implementing mem= more

>> efficiently.

>

> I don't care about efficiency, I was hoping to avoid the additional

> arm64-specific memory clipping but it seems that it could easily get

> more complicated. So let's leave as it is.

>


Alternatively, we could simply apply the memory limit as before, and
add back the [__init_begin, _end] interval right afterwards using
memblock_add().

> Consider this sub-series merged (I'll push it to -next around -rc3).

>

> Thanks.

>

> --

> Catalin


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
Catalin Marinas Feb. 1, 2016, 6:02 p.m. UTC | #7
On Mon, Feb 01, 2016 at 06:57:05PM +0100, Ard Biesheuvel wrote:
> On 1 February 2016 at 18:31, Catalin Marinas <catalin.marinas@arm.com> wrote:

> >> >> On Mon, Feb 01, 2016 at 11:54:53AM +0100, Ard Biesheuvel wrote:

> >> >>> Note that limiting memory using mem= is not unambiguous anymore after

> >> >>> this change, considering that the kernel may be at the top of physical

> >> >>> memory, and clipping from the bottom rather than the top will discard

> >> >>> any 32-bit DMA addressable memory first. To deal with this, the handling

> >> >>> of mem= is reimplemented to clip top down, but take special care not to

> >> >>> clip memory that covers the kernel image.

> >> >>

> >> >> I may have forgotten the reason - why do we need to avoid clipping the

> >> >> memory that covers the kernel image? It's already mapped in the vmalloc

> >> >> area, so we wouldn't need it in the linear map as well.

[...]
> > I don't care about efficiency, I was hoping to avoid the additional

> > arm64-specific memory clipping but it seems that it could easily get

> > more complicated. So let's leave as it is.

> 

> Alternatively, we could simply apply the memory limit as before, and

> add back the [__init_begin, _end] interval right afterwards using

> memblock_add()


If the code ends up simpler, yes, I'm fine with it.

-- 
Catalin

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
diff mbox

Patch

diff --git a/Documentation/arm64/booting.txt b/Documentation/arm64/booting.txt
index 701d39d3171a..56d6d8b796db 100644
--- a/Documentation/arm64/booting.txt
+++ b/Documentation/arm64/booting.txt
@@ -109,7 +109,13 @@  Header notes:
 			1 - 4K
 			2 - 16K
 			3 - 64K
-  Bits 3-63:	Reserved.
+  Bit 3:	Kernel physical placement
+			0 - 2MB aligned base should be as close as possible
+			    to the base of DRAM, since memory below it is not
+			    accessible via the linear mapping
+			1 - 2MB aligned base may be anywhere in physical
+			    memory
+  Bits 4-63:	Reserved.
 
 - When image_size is zero, a bootloader should attempt to keep as much
   memory as possible free for use by the kernel immediately after the
@@ -117,14 +123,14 @@  Header notes:
   depending on selected features, and is effectively unbound.
 
 The Image must be placed text_offset bytes from a 2MB aligned base
-address near the start of usable system RAM and called there. Memory
-below that base address is currently unusable by Linux, and therefore it
-is strongly recommended that this location is the start of system RAM.
-The region between the 2 MB aligned base address and the start of the
-image has no special significance to the kernel, and may be used for
-other purposes.
+address anywhere in usable system RAM and called there. The region
+between the 2 MB aligned base address and the start of the image has no
+special significance to the kernel, and may be used for other purposes.
 At least image_size bytes from the start of the image must be free for
 use by the kernel.
+NOTE: versions prior to v4.6 cannot make use of memory below the
+physical offset of the Image so it is recommended that the Image be
+placed as close as possible to the start of system RAM.
 
 Any memory described to the kernel (even that below the start of the
 image) which is not marked as reserved from the kernel (e.g., with a
diff --git a/arch/arm64/include/asm/boot.h b/arch/arm64/include/asm/boot.h
index 81151b67b26b..ebf2481889c3 100644
--- a/arch/arm64/include/asm/boot.h
+++ b/arch/arm64/include/asm/boot.h
@@ -11,4 +11,10 @@ 
 #define MIN_FDT_ALIGN		8
 #define MAX_FDT_SIZE		SZ_2M
 
+/*
+ * arm64 requires the kernel image to be placed
+ * TEXT_OFFSET bytes beyond a 2 MB aligned base
+ */
+#define MIN_KIMG_ALIGN		SZ_2M
+
 #endif
diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h
index a459714ee29e..5c6375d8528b 100644
--- a/arch/arm64/include/asm/kernel-pgtable.h
+++ b/arch/arm64/include/asm/kernel-pgtable.h
@@ -79,5 +79,17 @@ 
 #define SWAPPER_MM_MMUFLAGS	(PTE_ATTRINDX(MT_NORMAL) | SWAPPER_PTE_FLAGS)
 #endif
 
+/*
+ * To make optimal use of block mappings when laying out the linear
+ * mapping, round down the base of physical memory to a size that can
+ * be mapped efficiently, i.e., either PUD_SIZE (4k granule) or PMD_SIZE
+ * (64k granule), or a multiple that can be mapped using contiguous bits
+ * in the page tables: 32 * PMD_SIZE (16k granule)
+ */
+#ifdef CONFIG_ARM64_64K_PAGES
+#define ARM64_MEMSTART_ALIGN	SZ_512M
+#else
+#define ARM64_MEMSTART_ALIGN	SZ_1G
+#endif
 
 #endif	/* __ASM_KERNEL_PGTABLE_H */
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index f5aee6e764e6..054ac25e7c2e 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -26,7 +26,7 @@ 
 #define KVM_ARM64_DEBUG_DIRTY_SHIFT	0
 #define KVM_ARM64_DEBUG_DIRTY		(1 << KVM_ARM64_DEBUG_DIRTY_SHIFT)
 
-#define kvm_ksym_ref(sym)		((void *)&sym - KIMAGE_VADDR + PAGE_OFFSET)
+#define kvm_ksym_ref(sym)		phys_to_virt((u64)&sym - kimage_voffset)
 
 #ifndef __ASSEMBLY__
 struct kvm;
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 4388651d1f0d..61005e7dd6cb 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -88,10 +88,10 @@ 
 #define __virt_to_phys(x) ({						\
 	phys_addr_t __x = (phys_addr_t)(x);				\
 	__x >= PAGE_OFFSET ? (__x - PAGE_OFFSET + PHYS_OFFSET) :	\
-			     (__x - KIMAGE_VADDR + PHYS_OFFSET); })
+			     (__x - kimage_voffset); })
 
 #define __phys_to_virt(x)	((unsigned long)((x) - PHYS_OFFSET + PAGE_OFFSET))
-#define __phys_to_kimg(x)	((unsigned long)((x) - PHYS_OFFSET + KIMAGE_VADDR))
+#define __phys_to_kimg(x)	((unsigned long)((x) + kimage_voffset))
 
 /*
  * Convert a page to/from a physical address
@@ -127,13 +127,14 @@  extern phys_addr_t		memstart_addr;
 /* PHYS_OFFSET - the physical address of the start of memory. */
 #define PHYS_OFFSET		({ memstart_addr; })
 
+/* the offset between the kernel virtual and physical mappings */
+extern u64			kimage_voffset;
+
 /*
- * The maximum physical address that the linear direct mapping
- * of system RAM can cover. (PAGE_OFFSET can be interpreted as
- * a 2's complement signed quantity and negated to derive the
- * maximum size of the linear mapping.)
+ * Allow all memory at the discovery stage. We will clip it later.
  */
-#define MAX_MEMBLOCK_ADDR	({ memstart_addr - PAGE_OFFSET - 1; })
+#define MIN_MEMBLOCK_ADDR	0
+#define MAX_MEMBLOCK_ADDR	U64_MAX
 
 /*
  * PFNs are used to describe any physical page; this means
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 04d38a058b19..05b98289093e 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -428,7 +428,11 @@  __mmap_switched:
 	and	x4, x4, #~(THREAD_SIZE - 1)
 	msr	sp_el0, x4			// Save thread_info
 	str_l	x21, __fdt_pointer, x5		// Save FDT pointer
-	str_l	x24, memstart_addr, x6		// Save PHYS_OFFSET
+
+	ldr	x4, =KIMAGE_VADDR		// Save the offset between
+	sub	x4, x4, x24			// the kernel virtual and
+	str_l	x4, kimage_voffset, x5		// physical mappings
+
 	mov	x29, #0
 #ifdef CONFIG_KASAN
 	bl	kasan_early_init
diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h
index 999633bd7294..c9c62cab25a4 100644
--- a/arch/arm64/kernel/image.h
+++ b/arch/arm64/kernel/image.h
@@ -42,15 +42,18 @@ 
 #endif
 
 #ifdef CONFIG_CPU_BIG_ENDIAN
-#define __HEAD_FLAG_BE	1
+#define __HEAD_FLAG_BE		1
 #else
-#define __HEAD_FLAG_BE	0
+#define __HEAD_FLAG_BE		0
 #endif
 
-#define __HEAD_FLAG_PAGE_SIZE ((PAGE_SHIFT - 10) / 2)
+#define __HEAD_FLAG_PAGE_SIZE	((PAGE_SHIFT - 10) / 2)
 
-#define __HEAD_FLAGS	((__HEAD_FLAG_BE << 0) |	\
-			 (__HEAD_FLAG_PAGE_SIZE << 1))
+#define __HEAD_FLAG_PHYS_BASE	1
+
+#define __HEAD_FLAGS		((__HEAD_FLAG_BE << 0) |	\
+				 (__HEAD_FLAG_PAGE_SIZE << 1) |	\
+				 (__HEAD_FLAG_PHYS_BASE << 3))
 
 /*
  * These will output as part of the Image header, which should be little-endian
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 1d627cd8121c..e8e853a1024c 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -35,8 +35,10 @@ 
 #include <linux/efi.h>
 #include <linux/swiotlb.h>
 
+#include <asm/boot.h>
 #include <asm/fixmap.h>
 #include <asm/kasan.h>
+#include <asm/kernel-pgtable.h>
 #include <asm/memory.h>
 #include <asm/sections.h>
 #include <asm/setup.h>
@@ -158,9 +160,80 @@  static int __init early_mem(char *p)
 }
 early_param("mem", early_mem);
 
+/*
+ * clip_mem_range() - remove memblock memory inside [@min, @max), working
+ *                    from the top down, until the total memory size no
+ *                    longer exceeds 'memory_limit'.
+ */
+static void __init clip_mem_range(u64 min, u64 max)
+{
+	u64 mem_size, to_remove;
+	int i;
+
+again:
+	mem_size = memblock_phys_mem_size();
+	if (mem_size <= memory_limit || max <= min)
+		return;
+
+	to_remove = mem_size - memory_limit;
+
+	/* walk the regions from the highest index down */
+	for (i = memblock.memory.cnt - 1; i >= 0; i--) {
+		struct memblock_region *r = memblock.memory.regions + i;
+		u64 start = max(min, r->base);
+		u64 end = min(max, r->base + r->size);
+		u64 size;
+
+		/* skip regions that do not intersect [min, max) */
+		if (start >= max || end <= min)
+			continue;
+
+		/* trim the top of the overlap, then recompute what is left */
+		size = min(to_remove, end - start);
+		memblock_remove(end - size, size);
+		goto again;
+	}
+}
+
 void __init arm64_memblock_init(void)
 {
-	memblock_enforce_memory_limit(memory_limit);
+	const s64 linear_region_size = -(s64)PAGE_OFFSET;
+
+	/*
+	 * Select a suitable value for the base of physical memory.
+	 */
+	memstart_addr = round_down(memblock_start_of_DRAM(),
+				   ARM64_MEMSTART_ALIGN);
+
+	/*
+	 * Remove the memory that we will not be able to cover with the
+	 * linear mapping. Take care not to clip the kernel which may be
+	 * high in memory.
+	 */
+	memblock_remove(max(memstart_addr + linear_region_size, __pa(_end)),
+			ULLONG_MAX);
+	if (memblock_end_of_DRAM() > linear_region_size)
+		memblock_remove(0, memblock_end_of_DRAM() - linear_region_size);
+
+	if (memory_limit != (phys_addr_t)ULLONG_MAX) {
+		u64 kbase = round_down(__pa(_text), MIN_KIMG_ALIGN);
+		u64 kend = PAGE_ALIGN(__pa(_end));
+		u64 const sz_4g = 0x100000000UL;
+
+		/*
+		 * Clip memory in order of preference:
+		 * - above the kernel and above 4 GB
+		 * - between 4 GB and the start of the kernel (if the kernel
+		 *   is loaded high in memory)
+		 * - between the kernel and 4 GB (if the kernel is loaded
+		 *   low in memory)
+		 * - below 4 GB
+		 */
+		clip_mem_range(max(sz_4g, kend), ULLONG_MAX);
+		clip_mem_range(sz_4g, kbase);
+		clip_mem_range(kend, sz_4g);
+		clip_mem_range(0, min(kbase, sz_4g));
+	}
 
 	/*
 	 * Register the kernel text, kernel data, initrd, and initial
@@ -381,3 +454,28 @@  static int __init keepinitrd_setup(char *__unused)
 
 __setup("keepinitrd", keepinitrd_setup);
 #endif
+
+/*
+ * Panic notifier callback: print memory_limit in MB, or "none" when it
+ * still holds ULLONG_MAX. The arguments are unused; always returns 0.
+ */
+static int dump_mem_limit(struct notifier_block *self, unsigned long v, void *p)
+{
+	if (memory_limit == (phys_addr_t)ULLONG_MAX)
+		pr_emerg("Memory Limit: none\n");
+	else
+		pr_emerg("Memory Limit: %llu MB\n", memory_limit >> 20);
+	return 0;
+}
+
+static struct notifier_block mem_limit_notifier = {
+	.notifier_call = dump_mem_limit,
+};
+
+static int __init register_mem_limit_dumper(void)
+{
+	atomic_notifier_chain_register(&panic_notifier_list,
+				       &mem_limit_notifier);
+	return 0;
+}
+__initcall(register_mem_limit_dumper);
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 4c4b15932963..8dda38378959 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -46,6 +46,9 @@ 
 
 u64 idmap_t0sz = TCR_T0SZ(VA_BITS);
 
+u64 kimage_voffset __read_mostly;
+EXPORT_SYMBOL(kimage_voffset);
+
 /*
  * Empty_zero_page is a special page that is used for zero-initialized data
  * and COW.