[v9,07/11] arm64: kexec_file: add crash dump support

Message ID	20180425062629.29404-8-takahiro.akashi@linaro.org
State	New
Headers	show Delivered-To: patch@linaro.org Received-SPF: pass (google.com: best guess record for domain of linux-kernel-owner@vger.kernel.org designates 209.132.180.67 as permitted sender) client-ip=209.132.180.67; From: AKASHI Takahiro <takahiro.akashi@linaro.org> To: catalin.marinas@arm.com, will.deacon@arm.com, dhowells@redhat.com, vgoyal@redhat.com, herbert@gondor.apana.org.au, davem@davemloft.net, dyoung@redhat.com, bhe@redhat.com, arnd@arndb.de Cc: ard.biesheuvel@linaro.org, james.morse@arm.com, bhsharma@redhat.com, kexec@lists.infradead.org, linux-arm-kernel@lists.infradead.org, linux-kernel@vger.kernel.org, AKASHI Takahiro <takahiro.akashi@linaro.org> Subject: [PATCH v9 07/11] arm64: kexec_file: add crash dump support Date: Wed, 25 Apr 2018 15:26:25 +0900 Message-Id: <20180425062629.29404-8-takahiro.akashi@linaro.org> In-Reply-To: <20180425062629.29404-1-takahiro.akashi@linaro.org> References: <20180425062629.29404-1-takahiro.akashi@linaro.org> Sender: linux-kernel-owner@vger.kernel.org Precedence: bulk
Series	arm64: kexec: add kexec_file_load() support \| expand [v9,00/11] arm64: kexec: add kexec_file_load() support [v9,01/11] asm-generic: add kexec_file_load system call to unistd.h [v9,02/11] kexec_file: make kexec_image_post_load_cleanup_default() global [v9,03/11] arm64: kexec_file: invoke the kernel without purgatory [v9,04/11] arm64: kexec_file: allocate memory walking through memblock list [v9,05/11] arm64: kexec_file: load initrd and device-tree [v9,06/11] arm64: kexec_file: allow for loading Image-format kernel [v9,07/11] arm64: kexec_file: add crash dump support [v9,08/11] arm64: enable KEXEC_FILE config [v9,09/11] include: pe.h: remove message[] from mz header definition [v9,10/11] arm64: kexec_file: add kernel signature verification support [v9,11/11] arm64: kexec_file: add kaslr support

AKASHI Takahiro April 25, 2018, 6:26 a.m. UTC

Enabling crash dump (kdump) includes
* prepare contents of ELF header of a core dump file, /proc/vmcore,
  using crash_prepare_elf64_headers(), and
* add two device tree properties, "linux,usable-memory-range" and
  "linux,elfcorehdr", which represent repsectively a memory range
  to be used by crash dump kernel and the header's location

Signed-off-by: AKASHI Takahiro <takahiro.akashi@linaro.org>

Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/include/asm/kexec.h         |   4 +
 arch/arm64/kernel/kexec_image.c        |   9 +-
 arch/arm64/kernel/machine_kexec_file.c | 202 +++++++++++++++++++++++++
 3 files changed, 213 insertions(+), 2 deletions(-)

-- 
2.17.0

James Morse May 15, 2018, 5:11 p.m. UTC | #1

Hi Akashi,

On 25/04/18 07:26, AKASHI Takahiro wrote:
> Enabling crash dump (kdump) includes

> * prepare contents of ELF header of a core dump file, /proc/vmcore,

>   using crash_prepare_elf64_headers(), and

> * add two device tree properties, "linux,usable-memory-range" and

>   "linux,elfcorehdr", which represent repsectively a memory range


(Nit: respectively)


>   to be used by crash dump kernel and the header's location


>  arch/arm64/include/asm/kexec.h         |   4 +

>  arch/arm64/kernel/kexec_image.c        |   9 +-

>  arch/arm64/kernel/machine_kexec_file.c | 202 +++++++++++++++++++++++++


In this patch, machine_kexec_file.c gains its own private fdt array encoder.


> diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c

> index 37c0a9dc2e47..ec674f4d267c 100644

> --- a/arch/arm64/kernel/machine_kexec_file.c

> +++ b/arch/arm64/kernel/machine_kexec_file.c

> @@ -76,6 +81,78 @@ int arch_kexec_walk_mem(struct kexec_buf *kbuf,

>  	return ret;

>  }

>  

> +static int __init arch_kexec_file_init(void)

> +{

> +	/* Those values are used later on loading the kernel */

> +	__dt_root_addr_cells = dt_root_addr_cells;

> +	__dt_root_size_cells = dt_root_size_cells;

> +

> +	return 0;

> +}

> +late_initcall(arch_kexec_file_init);


If we need these is it worth taking them out of __initdata? I note they've been
'temporary' for quite a long time.


> +

> +#define FDT_ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

> +#define FDT_TAGALIGN(x)	(FDT_ALIGN((x), FDT_TAGSIZE))

> +

> +static int fdt_prop_len(const char *prop_name, int len)

> +{

> +	return (strlen(prop_name) + 1) +

> +		sizeof(struct fdt_property) +

> +		FDT_TAGALIGN(len);

> +}


This stuff should really be in libfdt.h. Those macros come from
libfdt_internal.h, so we're probably doing something wrong here.


> +static bool cells_size_fitted(unsigned long base, unsigned long size)

> +{

> +	/* if *_cells >= 2, cells can hold 64-bit values anyway */

> +	if ((__dt_root_addr_cells == 1) && (base >= (1ULL << 32)))

> +		return false;

> +

> +	if ((__dt_root_size_cells == 1) && (size >= (1ULL << 32)))

> +		return false;


Using '> U32_MAX' here may be more readable.


> +	return true;

> +}

> +

> +static void fill_property(void *buf, u64 val64, int cells)

> +{

> +	u32 val32;

> +

> +	if (cells == 1) {

> +		val32 = cpu_to_fdt32((u32)val64);

> +		memcpy(buf, &val32, sizeof(val32));

> +	} else {


> +		memset(buf, 0, cells * sizeof(u32) - sizeof(u64));

> +		buf += cells * sizeof(u32) - sizeof(u64);


Is this trying to clear the 'top' cells and shuffle the pointer to point at the
'bottom' 2? I'm pretty sure this isn't endian safe.

Do we really expect a system to have #address-cells > 2?


> +		val64 = cpu_to_fdt64(val64);

> +		memcpy(buf, &val64, sizeof(val64));

> +	}

> +}

> +

> +static int fdt_setprop_range(void *fdt, int nodeoffset, const char *name,

> +				unsigned long addr, unsigned long size)


(the device-tree spec describes a 'ranges' property, which had me confused. This
is encoding a prop-encoded-array)

> +{

> +	void *buf, *prop;

> +	size_t buf_size;

> +	int result;

> +

> +	buf_size = (__dt_root_addr_cells + __dt_root_size_cells) * sizeof(u32);

> +	prop = buf = vmalloc(buf_size);


virtual memory allocation for something less than PAGE_SIZE?


> +	if (!buf)

> +		return -ENOMEM;

> +

> +	fill_property(prop, addr, __dt_root_addr_cells);

> +	prop += __dt_root_addr_cells * sizeof(u32);

> +

> +	fill_property(prop, size, __dt_root_size_cells);

> +

> +	result = fdt_setprop(fdt, nodeoffset, name, buf, buf_size);

> +

> +	vfree(buf);

> +

> +	return result;

> +}


Doesn't this stuff belong in libfdt? I guess there is no 'add array element' API
because this is the first time we've wanted to create a node with more than
key=fixed-size-value.

I don't think this belongs in arch C code. Do we have a plan for getting libfdt
to support encoding prop-arrays? Can we put it somewhere anyone else duplicating
this will find it, until we can (re)move it?

I have no idea how that happens... it looks like the devicetree list is the
place to ask.


>  static int setup_dtb(struct kimage *image,

>  		unsigned long initrd_load_addr, unsigned long initrd_len,

>  		char *cmdline, unsigned long cmdline_len,

> @@ -88,10 +165,26 @@ static int setup_dtb(struct kimage *image,

>  	int range_len;

>  	int ret;

>  

> +	/* check ranges against root's #address-cells and #size-cells */

> +	if (image->type == KEXEC_TYPE_CRASH &&

> +		(!cells_size_fitted(image->arch.elf_load_addr,

> +				image->arch.elf_headers_sz) ||

> +		 !cells_size_fitted(crashk_res.start,

> +				crashk_res.end - crashk_res.start + 1))) {

> +		pr_err("Crash memory region doesn't fit into DT's root cell sizes.\n");

> +		ret = -EINVAL;

> +		goto out_err;

> +	}


To check I've understood this properly: This can happen if the firmware provided
a DTB with 32bit address/size cells, but at least some of the memory requires 64
bit address/size cells. This could only happen on a UEFI system where the
firmware-DTB doesn't describe memory. ACPI-only systems would have the EFIstub DT.


>  	/* duplicate dt blob */

>  	buf_size = fdt_totalsize(initial_boot_params);

>  	range_len = (__dt_root_addr_cells + __dt_root_size_cells) * sizeof(u32);

>  

> +	if (image->type == KEXEC_TYPE_CRASH)

> +		buf_size += fdt_prop_len("linux,elfcorehdr", range_len)

> +				+ fdt_prop_len("linux,usable-memory-range",

> +								range_len);

> +

>  	if (initrd_load_addr)

>  		buf_size += fdt_prop_len("linux,initrd-start", sizeof(u64))

>  				+ fdt_prop_len("linux,initrd-end", sizeof(u64));

> @@ -113,6 +206,23 @@ static int setup_dtb(struct kimage *image,

>  	if (nodeoffset < 0)

>  		goto out_err;

>  

> +	if (image->type == KEXEC_TYPE_CRASH) {

> +		/* add linux,elfcorehdr */

> +		ret = fdt_setprop_range(buf, nodeoffset, "linux,elfcorehdr",

> +				image->arch.elf_load_addr,

> +				image->arch.elf_headers_sz);

> +		if (ret)

> +			goto out_err;

> +

> +		/* add linux,usable-memory-range */

> +		ret = fdt_setprop_range(buf, nodeoffset,

> +				"linux,usable-memory-range",

> +				crashk_res.start,

> +				crashk_res.end - crashk_res.start + 1);


Don't you need to add "linux,usable-memory-range" to the buf_size estimate?


> +		if (ret)

> +			goto out_err;

> +	}


> @@ -148,17 +258,109 @@ static int setup_dtb(struct kimage *image,


> +static struct crash_mem *get_crash_memory_ranges(void)

> +{

> +	unsigned int nr_ranges;

> +	struct crash_mem *cmem;

> +

> +	nr_ranges = 1; /* for exclusion of crashkernel region */

> +	walk_system_ram_res(0, -1, &nr_ranges, get_nr_ranges_callback);

> +

> +	cmem = vmalloc(sizeof(struct crash_mem) +

> +			sizeof(struct crash_mem_range) * nr_ranges);

> +	if (!cmem)

> +		return NULL;

> +

> +	cmem->max_nr_ranges = nr_ranges;

> +	cmem->nr_ranges = 0;

> +	walk_system_ram_res(0, -1, cmem, add_mem_range_callback);

> +

> +	/* Exclude crashkernel region */

> +	if (crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end)) {

> +		vfree(cmem);

> +		return NULL;

> +	}

> +

> +	return cmem;

> +}


Could this function be included in prepare_elf_headers() so that the alloc() and
free() occur together.


> +static int prepare_elf_headers(void **addr, unsigned long *sz)

> +{

> +	struct crash_mem *cmem;

> +	int ret = 0;

> +

> +	cmem = get_crash_memory_ranges();

> +	if (!cmem)

> +		return -ENOMEM;

> +

> +	ret =  crash_prepare_elf64_headers(cmem, true, addr, sz);

> +

> +	vfree(cmem);


> +	return ret;

> +}


All this is moving memory-range information from core-code's
walk_system_ram_res() into core-code's struct crash_mem, and excluding
crashk_res, which again is accessible to the core code.

It looks like this is duplicated in arch/x86 and arch/arm64 because arm64
doesn't have a second 'crashk_low_res' region, and always wants elf64, instead
of when IS_ENABLED(CONFIG_X86_64).
If we can abstract just those two, more of this could be moved to core code
where powerpc can make use of it if they want to support kdump with
kexec_file_load().

But, it's getting late for cross-architecture dependencies, let's put that on the
for-later list. (assuming there isn't a powerpc-kdump series out there adding a
third copy of this)


Thanks,

James

James Morse May 16, 2018, 8:34 a.m. UTC | #2

Hi Akashi,

On 15/05/18 18:11, James Morse wrote:
> On 25/04/18 07:26, AKASHI Takahiro wrote:

>> Enabling crash dump (kdump) includes

>> * prepare contents of ELF header of a core dump file, /proc/vmcore,

>>   using crash_prepare_elf64_headers(), and

>> * add two device tree properties, "linux,usable-memory-range" and

>>   "linux,elfcorehdr", which represent repsectively a memory range

>>   to be used by crash dump kernel and the header's location


>> diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c

>> index 37c0a9dc2e47..ec674f4d267c 100644

>> --- a/arch/arm64/kernel/machine_kexec_file.c

>> +++ b/arch/arm64/kernel/machine_kexec_file.c

>> @@ -76,6 +81,78 @@ int arch_kexec_walk_mem(struct kexec_buf *kbuf,


>> +static void fill_property(void *buf, u64 val64, int cells)

>> +{

>> +	u32 val32;

>> +

>> +	if (cells == 1) {

>> +		val32 = cpu_to_fdt32((u32)val64);

>> +		memcpy(buf, &val32, sizeof(val32));

>> +	} else {

> 

>> +		memset(buf, 0, cells * sizeof(u32) - sizeof(u64));

>> +		buf += cells * sizeof(u32) - sizeof(u64);

> 

> Is this trying to clear the 'top' cells and shuffle the pointer to point at the

> 'bottom' 2? I'm pretty sure this isn't endian safe.


It came to me at 2am: this only works on big-endian, which is exactly what you
want as that is the DT format.


> Do we really expect a system to have #address-cells > 2?



Thanks,

James

James Morse May 16, 2018, 10:06 a.m. UTC | #3

Hi Akashi,

On 15/05/18 18:11, James Morse wrote:
> On 25/04/18 07:26, AKASHI Takahiro wrote:

>> Enabling crash dump (kdump) includes

>> * prepare contents of ELF header of a core dump file, /proc/vmcore,

>>   using crash_prepare_elf64_headers(), and

>> * add two device tree properties, "linux,usable-memory-range" and

>>   "linux,elfcorehdr", which represent repsectively a memory range

>>   to be used by crash dump kernel and the header's location


>> diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c

>> index 37c0a9dc2e47..ec674f4d267c 100644

>> --- a/arch/arm64/kernel/machine_kexec_file.c

>> +++ b/arch/arm64/kernel/machine_kexec_file.c


>> +static struct crash_mem *get_crash_memory_ranges(void)

>> +{

>> +	unsigned int nr_ranges;

>> +	struct crash_mem *cmem;

>> +

>> +	nr_ranges = 1; /* for exclusion of crashkernel region */

>> +	walk_system_ram_res(0, -1, &nr_ranges, get_nr_ranges_callback);

>> +

>> +	cmem = vmalloc(sizeof(struct crash_mem) +

>> +			sizeof(struct crash_mem_range) * nr_ranges);

>> +	if (!cmem)

>> +		return NULL;

>> +

>> +	cmem->max_nr_ranges = nr_ranges;

>> +	cmem->nr_ranges = 0;

>> +	walk_system_ram_res(0, -1, cmem, add_mem_range_callback);

>> +

>> +	/* Exclude crashkernel region */

>> +	if (crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end)) {

>> +		vfree(cmem);

>> +		return NULL;

>> +	}

>> +

>> +	return cmem;

>> +}

> 

> Could this function be included in prepare_elf_headers() so that the alloc() and

> free() occur together.

> 

> 

>> +static int prepare_elf_headers(void **addr, unsigned long *sz)

>> +{

>> +	struct crash_mem *cmem;

>> +	int ret = 0;

>> +

>> +	cmem = get_crash_memory_ranges();

>> +	if (!cmem)

>> +		return -ENOMEM;

>> +

>> +	ret =  crash_prepare_elf64_headers(cmem, true, addr, sz);

>> +

>> +	vfree(cmem);

> 

>> +	return ret;

>> +}

> 

> All this is moving memory-range information from core-code's

> walk_system_ram_res() into core-code's struct crash_mem, and excluding

> crashk_res, which again is accessible to the core code.

> 

> It looks like this is duplicated in arch/x86 and arch/arm64 because arm64

> doesn't have a second 'crashk_low_res' region, and always wants elf64, instead

> of when IS_ENABLED(CONFIG_X86_64).


Thinking about it some more: don't we want to walk memblock here, not
walk_system_ram_res()? What we want is a list of not-nomap regions that the
kernel may have been using, to form part of vmcore.
walk_system_ram_res() is becoming a murkier list of maybe-nomap, maybe-reserved.

I think we should walk the same list here as we do in patch 4.


Thanks,

James

AKASHI Takahiro May 18, 2018, 9:50 a.m. UTC | #4

On Wed, May 16, 2018 at 11:06:02AM +0100, James Morse wrote:
> Hi Akashi,

> 

> On 15/05/18 18:11, James Morse wrote:

> > On 25/04/18 07:26, AKASHI Takahiro wrote:

> >> Enabling crash dump (kdump) includes

> >> * prepare contents of ELF header of a core dump file, /proc/vmcore,

> >>   using crash_prepare_elf64_headers(), and

> >> * add two device tree properties, "linux,usable-memory-range" and

> >>   "linux,elfcorehdr", which represent repsectively a memory range

> >>   to be used by crash dump kernel and the header's location

> 

> >> diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c

> >> index 37c0a9dc2e47..ec674f4d267c 100644

> >> --- a/arch/arm64/kernel/machine_kexec_file.c

> >> +++ b/arch/arm64/kernel/machine_kexec_file.c

> 

> >> +static struct crash_mem *get_crash_memory_ranges(void)

> >> +{

> >> +	unsigned int nr_ranges;

> >> +	struct crash_mem *cmem;

> >> +

> >> +	nr_ranges = 1; /* for exclusion of crashkernel region */

> >> +	walk_system_ram_res(0, -1, &nr_ranges, get_nr_ranges_callback);

> >> +

> >> +	cmem = vmalloc(sizeof(struct crash_mem) +

> >> +			sizeof(struct crash_mem_range) * nr_ranges);

> >> +	if (!cmem)

> >> +		return NULL;

> >> +

> >> +	cmem->max_nr_ranges = nr_ranges;

> >> +	cmem->nr_ranges = 0;

> >> +	walk_system_ram_res(0, -1, cmem, add_mem_range_callback);

> >> +

> >> +	/* Exclude crashkernel region */

> >> +	if (crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end)) {

> >> +		vfree(cmem);

> >> +		return NULL;

> >> +	}

> >> +

> >> +	return cmem;

> >> +}

> > 

> > Could this function be included in prepare_elf_headers() so that the alloc() and

> > free() occur together.

> > 

> > 

> >> +static int prepare_elf_headers(void **addr, unsigned long *sz)

> >> +{

> >> +	struct crash_mem *cmem;

> >> +	int ret = 0;

> >> +

> >> +	cmem = get_crash_memory_ranges();

> >> +	if (!cmem)

> >> +		return -ENOMEM;

> >> +

> >> +	ret =  crash_prepare_elf64_headers(cmem, true, addr, sz);

> >> +

> >> +	vfree(cmem);

> > 

> >> +	return ret;

> >> +}

> > 

> > All this is moving memory-range information from core-code's

> > walk_system_ram_res() into core-code's struct crash_mem, and excluding

> > crashk_res, which again is accessible to the core code.

> > 

> > It looks like this is duplicated in arch/x86 and arch/arm64 because arm64

> > doesn't have a second 'crashk_low_res' region, and always wants elf64, instead

> > of when IS_ENABLED(CONFIG_X86_64).

> 

> Thinking about it some more: don't we want to walk memblock here, not

> walk_system_ram_res()? What we want is a list of not-nomap regions that the

> kernel may have been using, to form part of vmcore.

> walk_system_ram_res() is becoming a murkier list of maybe-nomap, maybe-reserved.

> 

> I think we should walk the same list here as we do in patch 4.


For consistency, yes.
I missed that.

-Takahiro AKASHI

> 

> 

> Thanks,

> 

> James

AKASHI Takahiro May 18, 2018, 9:58 a.m. UTC | #5

On Wed, May 16, 2018 at 09:34:41AM +0100, James Morse wrote:
> Hi Akashi,

> 

> On 15/05/18 18:11, James Morse wrote:

> > On 25/04/18 07:26, AKASHI Takahiro wrote:

> >> Enabling crash dump (kdump) includes

> >> * prepare contents of ELF header of a core dump file, /proc/vmcore,

> >>   using crash_prepare_elf64_headers(), and

> >> * add two device tree properties, "linux,usable-memory-range" and

> >>   "linux,elfcorehdr", which represent repsectively a memory range

> >>   to be used by crash dump kernel and the header's location

> 

> >> diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c

> >> index 37c0a9dc2e47..ec674f4d267c 100644

> >> --- a/arch/arm64/kernel/machine_kexec_file.c

> >> +++ b/arch/arm64/kernel/machine_kexec_file.c

> >> @@ -76,6 +81,78 @@ int arch_kexec_walk_mem(struct kexec_buf *kbuf,

> 

> >> +static void fill_property(void *buf, u64 val64, int cells)

> >> +{

> >> +	u32 val32;

> >> +

> >> +	if (cells == 1) {

> >> +		val32 = cpu_to_fdt32((u32)val64);

> >> +		memcpy(buf, &val32, sizeof(val32));

> >> +	} else {

> > 

> >> +		memset(buf, 0, cells * sizeof(u32) - sizeof(u64));

> >> +		buf += cells * sizeof(u32) - sizeof(u64);

> > 

> > Is this trying to clear the 'top' cells and shuffle the pointer to point at the

> > 'bottom' 2? I'm pretty sure this isn't endian safe.

> 

> It came to me at 2am: this only works on big-endian, which is exactly what you

> want as that is the DT format.


Oops, I was almost tricked as I haven't tested kexec on BE
for a long time :)

Thanks,
-Takahiro AKASHI

> 

> > Do we really expect a system to have #address-cells > 2?

> 

> 

> Thanks,

> 

> James

AKASHI Takahiro May 18, 2018, 10:39 a.m. UTC | #6

On Tue, May 15, 2018 at 06:11:15PM +0100, James Morse wrote:
> Hi Akashi,

> 

> On 25/04/18 07:26, AKASHI Takahiro wrote:

> > Enabling crash dump (kdump) includes

> > * prepare contents of ELF header of a core dump file, /proc/vmcore,

> >   using crash_prepare_elf64_headers(), and

> > * add two device tree properties, "linux,usable-memory-range" and

> >   "linux,elfcorehdr", which represent repsectively a memory range

> 

> (Nit: respectively)


Will fix.

> 

> >   to be used by crash dump kernel and the header's location

> 

> >  arch/arm64/include/asm/kexec.h         |   4 +

> >  arch/arm64/kernel/kexec_image.c        |   9 +-

> >  arch/arm64/kernel/machine_kexec_file.c | 202 +++++++++++++++++++++++++

> 

> In this patch, machine_kexec_file.c gains its own private fdt array encoder.


See below.

> 

> > diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c

> > index 37c0a9dc2e47..ec674f4d267c 100644

> > --- a/arch/arm64/kernel/machine_kexec_file.c

> > +++ b/arch/arm64/kernel/machine_kexec_file.c

> > @@ -76,6 +81,78 @@ int arch_kexec_walk_mem(struct kexec_buf *kbuf,

> >  	return ret;

> >  }

> >  

> > +static int __init arch_kexec_file_init(void)

> > +{

> > +	/* Those values are used later on loading the kernel */

> > +	__dt_root_addr_cells = dt_root_addr_cells;

> > +	__dt_root_size_cells = dt_root_size_cells;

> > +

> > +	return 0;

> > +}

> > +late_initcall(arch_kexec_file_init);

> 

> If we need these is it worth taking them out of __initdata? I note they've been

> 'temporary' for quite a long time.


I think I had some reason for not doing that, but I don't remember it now.
If there's no problem, I will take your suggestion.

> 

> > +

> > +#define FDT_ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

> > +#define FDT_TAGALIGN(x)	(FDT_ALIGN((x), FDT_TAGSIZE))

> > +

> > +static int fdt_prop_len(const char *prop_name, int len)

> > +{

> > +	return (strlen(prop_name) + 1) +

> > +		sizeof(struct fdt_property) +

> > +		FDT_TAGALIGN(len);

> > +}

> 

> This stuff should really be in libfdt.h  Those macros come from

> libfdt_internal.h, so we're probably doing something wrong here.

> 

> 

> > +static bool cells_size_fitted(unsigned long base, unsigned long size)

> > +{

> > +	/* if *_cells >= 2, cells can hold 64-bit values anyway */

> > +	if ((__dt_root_addr_cells == 1) && (base >= (1ULL << 32)))

> > +		return false;

> > +

> > +	if ((__dt_root_size_cells == 1) && (size >= (1ULL << 32)))

> > +		return false;

> 

> Using '> U32_MAX' here may be more readable.


OK

> 

> > +	return true;

> > +}

> > +

> > +static void fill_property(void *buf, u64 val64, int cells)

> > +{

> > +	u32 val32;

> > +

> > +	if (cells == 1) {

> > +		val32 = cpu_to_fdt32((u32)val64);

> > +		memcpy(buf, &val32, sizeof(val32));

> > +	} else {

> 

> > +		memset(buf, 0, cells * sizeof(u32) - sizeof(u64));

> > +		buf += cells * sizeof(u32) - sizeof(u64);

> 

> Is this trying to clear the 'top' cells and shuffle the pointer to point at the

> 'bottom' 2? I'm pretty sure this isn't endian safe.

> 

> Do we really expect a system to have #address-cells > 2?


I don't know, but just for safety.

> 

> > +		val64 = cpu_to_fdt64(val64);

> > +		memcpy(buf, &val64, sizeof(val64));

> > +	}

> > +}

> > +

> > +static int fdt_setprop_range(void *fdt, int nodeoffset, const char *name,

> > +				unsigned long addr, unsigned long size)

> 

> (the device-tree spec describes a 'ranges' property, which had me confused. This

> is encoding a prop-encoded-array)


Should we rename it to, say, fdt_setprop_reg()?


> > +{

> > +	void *buf, *prop;

> > +	size_t buf_size;

> > +	int result;

> > +

> > +	buf_size = (__dt_root_addr_cells + __dt_root_size_cells) * sizeof(u32);

> > +	prop = buf = vmalloc(buf_size);

> 

> virtual memory allocation for something less than PAGE_SIZE?


I've never cared about that. Let me think again.

> 

> > +	if (!buf)

> > +		return -ENOMEM;

> > +

> > +	fill_property(prop, addr, __dt_root_addr_cells);

> > +	prop += __dt_root_addr_cells * sizeof(u32);

> > +

> > +	fill_property(prop, size, __dt_root_size_cells);

> > +

> > +	result = fdt_setprop(fdt, nodeoffset, name, buf, buf_size);

> > +

> > +	vfree(buf);

> > +

> > +	return result;

> > +}

> 

> Doesn't this stuff belong in libfdt? I guess there is no 'add array element' api

> because this the first time we've wanted to create a node with more than

> key=fixed-size-value.

> 

> I don't think this belongs in arch C code. Do we have a plan for getting libfdt

> to support encoding prop-arrays? Can we put it somewhere anyone else duplicating

> this will find it, until we can (re)move it?


I will temporarily move all fdt-related stuff to a separate file, but

> I have no idea how that happens... it looks like the devicetree list is the

> place to ask.


should we always sync with the original dtc/libfdt repository?

> 

> >  static int setup_dtb(struct kimage *image,

> >  		unsigned long initrd_load_addr, unsigned long initrd_len,

> >  		char *cmdline, unsigned long cmdline_len,

> > @@ -88,10 +165,26 @@ static int setup_dtb(struct kimage *image,

> >  	int range_len;

> >  	int ret;

> >  

> > +	/* check ranges against root's #address-cells and #size-cells */

> > +	if (image->type == KEXEC_TYPE_CRASH &&

> > +		(!cells_size_fitted(image->arch.elf_load_addr,

> > +				image->arch.elf_headers_sz) ||

> > +		 !cells_size_fitted(crashk_res.start,

> > +				crashk_res.end - crashk_res.start + 1))) {

> > +		pr_err("Crash memory region doesn't fit into DT's root cell sizes.\n");

> > +		ret = -EINVAL;

> > +		goto out_err;

> > +	}

> 

> To check I've understood this properly: This can happen if the firmware provided

> a DTB with 32bit address/size cells, but at least some of the memory requires 64

> bit address/size cells. This could only happen on a UEFI system where the

> firmware-DTB doesn't describe memory. ACPI-only systems would have the EFIstub DT.


Probably, yes. I assumed the case where #address-cells and #size-cells
were just missing in fdt.

> 

> >  	/* duplicate dt blob */

> >  	buf_size = fdt_totalsize(initial_boot_params);

> >  	range_len = (__dt_root_addr_cells + __dt_root_size_cells) * sizeof(u32);

> >  

> > +	if (image->type == KEXEC_TYPE_CRASH)

> > +		buf_size += fdt_prop_len("linux,elfcorehdr", range_len)

> > +				+ fdt_prop_len("linux,usable-memory-range",

> > +								range_len);


                                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

> > +

> >  	if (initrd_load_addr)

> >  		buf_size += fdt_prop_len("linux,initrd-start", sizeof(u64))

> >  				+ fdt_prop_len("linux,initrd-end", sizeof(u64));

> > @@ -113,6 +206,23 @@ static int setup_dtb(struct kimage *image,

> >  	if (nodeoffset < 0)

> >  		goto out_err;

> >  

> > +	if (image->type == KEXEC_TYPE_CRASH) {

> > +		/* add linux,elfcorehdr */

> > +		ret = fdt_setprop_range(buf, nodeoffset, "linux,elfcorehdr",

> > +				image->arch.elf_load_addr,

> > +				image->arch.elf_headers_sz);

> > +		if (ret)

> > +			goto out_err;

> > +

> > +		/* add linux,usable-memory-range */

> > +		ret = fdt_setprop_range(buf, nodeoffset,

> > +				"linux,usable-memory-range",

> > +				crashk_res.start,

> > +				crashk_res.end - crashk_res.start + 1);

> 

> Don't you need to add "linux,usable-memory-range" to the buf_size estimate?


I think the code exists. See above.

> 

> > +		if (ret)

> > +			goto out_err;

> > +	}

> 

> > @@ -148,17 +258,109 @@ static int setup_dtb(struct kimage *image,

> 

> > +static struct crash_mem *get_crash_memory_ranges(void)

> > +{

> > +	unsigned int nr_ranges;

> > +	struct crash_mem *cmem;

> > +

> > +	nr_ranges = 1; /* for exclusion of crashkernel region */

> > +	walk_system_ram_res(0, -1, &nr_ranges, get_nr_ranges_callback);

> > +

> > +	cmem = vmalloc(sizeof(struct crash_mem) +

> > +			sizeof(struct crash_mem_range) * nr_ranges);

> > +	if (!cmem)

> > +		return NULL;

> > +

> > +	cmem->max_nr_ranges = nr_ranges;

> > +	cmem->nr_ranges = 0;

> > +	walk_system_ram_res(0, -1, cmem, add_mem_range_callback);

> > +

> > +	/* Exclude crashkernel region */

> > +	if (crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end)) {

> > +		vfree(cmem);

> > +		return NULL;

> > +	}

> > +

> > +	return cmem;

> > +}

> 

> Could this function be included in prepare_elf_headers() so that the alloc() and

> free() occur together.



Or should we aim for arm64 and x86 to have similar-looking code?

> 

> > +static int prepare_elf_headers(void **addr, unsigned long *sz)

> > +{

> > +	struct crash_mem *cmem;

> > +	int ret = 0;

> > +

> > +	cmem = get_crash_memory_ranges();

> > +	if (!cmem)

> > +		return -ENOMEM;

> > +

> > +	ret =  crash_prepare_elf64_headers(cmem, true, addr, sz);

> > +

> > +	vfree(cmem);

> 

> > +	return ret;

> > +}

> 

> All this is moving memory-range information from core-code's

> walk_system_ram_res() into core-code's struct crash_mem, and excluding

> crashk_res, which again is accessible to the core code.

> 

> It looks like this is duplicated in arch/x86 and arch/arm64 because arm64

> doesn't have a second 'crashk_low_res' region, and always wants elf64, instead

> of when IS_ENABLED(CONFIG_X86_64).

> If we can abstract just those two, more of this could be moved to core code

> where powerpc can make use of it if they want to support kdump with

> kexec_file_load().

> 

> But, its getting late for cross-architecture dependencies, lets put that on the

> for-later list. (assuming there isn't a powerpc-kdump series out there adding a

> third copy of this)


Sure. X86 code has so many exceptional lines in the code :)

Thanks,
-Takahiro AKASHI


> 

> Thanks,

> 

> James

James Morse May 18, 2018, 4 p.m. UTC | #7

Hi Akashi,

On 18/05/18 11:39, AKASHI Takahiro wrote:
> On Tue, May 15, 2018 at 06:11:15PM +0100, James Morse wrote:

>> On 25/04/18 07:26, AKASHI Takahiro wrote:

>>> Enabling crash dump (kdump) includes

>>> * prepare contents of ELF header of a core dump file, /proc/vmcore,

>>>   using crash_prepare_elf64_headers(), and

>>> * add two device tree properties, "linux,usable-memory-range" and

>>>   "linux,elfcorehdr", which represent repsectively a memory range


>>> diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c

>>> index 37c0a9dc2e47..ec674f4d267c 100644

>>> --- a/arch/arm64/kernel/machine_kexec_file.c

>>> +++ b/arch/arm64/kernel/machine_kexec_file.c


>>> +static void fill_property(void *buf, u64 val64, int cells)

>>> +{

>>> +	u32 val32;

>>> +

>>> +	if (cells == 1) {

>>> +		val32 = cpu_to_fdt32((u32)val64);

>>> +		memcpy(buf, &val32, sizeof(val32));

>>> +	} else {

>>

>>> +		memset(buf, 0, cells * sizeof(u32) - sizeof(u64));

>>> +		buf += cells * sizeof(u32) - sizeof(u64);

>>

>> Is this trying to clear the 'top' cells and shuffle the pointer to point at the

>> 'bottom' 2? I'm pretty sure this isn't endian safe.

>>

>> Do we really expect a system to have #address-cells > 2?

> 

> I don't know, but just for safety.


Okay, so this is aiming to be a cover-all-cases library function.


>>> +		val64 = cpu_to_fdt64(val64);

>>> +		memcpy(buf, &val64, sizeof(val64));

>>> +	}

>>> +}

>>> +

>>> +static int fdt_setprop_range(void *fdt, int nodeoffset, const char *name,

>>> +				unsigned long addr, unsigned long size)

>>

>> (the device-tree spec describes a 'ranges' property, which had me confused. This

>> is encoding a prop-encoded-array)

> 

> Should we rename it to, say, fdt_setprop_reg()?


Sure, but I'd really like this code to come from libfdt. I'm hoping for some
temporary workaround, let's see what the DT folk say.


>>> +	if (!buf)

>>> +		return -ENOMEM;

>>> +

>>> +	fill_property(prop, addr, __dt_root_addr_cells);

>>> +	prop += __dt_root_addr_cells * sizeof(u32);

>>> +

>>> +	fill_property(prop, size, __dt_root_size_cells);

>>> +

>>> +	result = fdt_setprop(fdt, nodeoffset, name, buf, buf_size);

>>> +

>>> +	vfree(buf);

>>> +

>>> +	return result;

>>> +}

>>

>> Doesn't this stuff belong in libfdt? I guess there is no 'add array element' api

>> because this is the first time we've wanted to create a node with more than

>> key=fixed-size-value.

>>

>> I don't think this belongs in arch C code. Do we have a plan for getting libfdt

>> to support encoding prop-arrays? Can we put it somewhere anyone else duplicating

>> this will find it, until we can (re)move it?

> 

> I will temporarily move all fdt-related stuff to a separate file, but

> 

>> I have no idea how that happens... it looks like the devicetree list is the

>> place to ask.

> 

> should we always sync with the original dtc/libfdt repository?


I thought so, libfdt is one of those external libraries that the kernel
consumes, like acpica. For acpica at least the rule is changes go upstream, then
get sync'd back.


>>>  static int setup_dtb(struct kimage *image,

>>>  		unsigned long initrd_load_addr, unsigned long initrd_len,

>>>  		char *cmdline, unsigned long cmdline_len,

>>> @@ -88,10 +165,26 @@ static int setup_dtb(struct kimage *image,

>>>  	int range_len;

>>>  	int ret;

>>>  

>>> +	/* check ranges against root's #address-cells and #size-cells */

>>> +	if (image->type == KEXEC_TYPE_CRASH &&

>>> +		(!cells_size_fitted(image->arch.elf_load_addr,

>>> +				image->arch.elf_headers_sz) ||

>>> +		 !cells_size_fitted(crashk_res.start,

>>> +				crashk_res.end - crashk_res.start + 1))) {

>>> +		pr_err("Crash memory region doesn't fit into DT's root cell sizes.\n");

>>> +		ret = -EINVAL;

>>> +		goto out_err;

>>> +	}

>>

>> To check I've understood this properly: This can happen if the firmware provided

>> a DTB with 32bit address/size cells, but at least some of the memory requires 64

>> bit address/size cells. This could only happen on a UEFI system where the

>> firmware-DTB doesn't describe memory. ACPI-only systems would have the EFIstub DT.

> 

> Probably, yes. I assumed the case where #address-cells and #size-cells

> were just missing in fdt.


Ah, that's another one. I just wanted to check we could boot on a system where
this can happen.


>>>  	/* duplicate dt blob */

>>>  	buf_size = fdt_totalsize(initial_boot_params);

>>>  	range_len = (__dt_root_addr_cells + __dt_root_size_cells) * sizeof(u32);

>>>  

>>> +	if (image->type == KEXEC_TYPE_CRASH)

>>> +		buf_size += fdt_prop_len("linux,elfcorehdr", range_len)

>>> +				+ fdt_prop_len("linux,usable-memory-range",

>>> +								range_len);


>                                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

[...]

>> Don't you need to add "linux,usable-memory-range" to the buf_size estimate?

> 

> I think the code exists. See above.


Sorry, turns out I can't read!


>>> +		if (ret)

>>> +			goto out_err;

>>> +	}

>>

>>> @@ -148,17 +258,109 @@ static int setup_dtb(struct kimage *image,

>>

>>> +static struct crash_mem *get_crash_memory_ranges(void)

>>> +{

>>> +	unsigned int nr_ranges;

>>> +	struct crash_mem *cmem;

>>> +

>>> +	nr_ranges = 1; /* for exclusion of crashkernel region */

>>> +	walk_system_ram_res(0, -1, &nr_ranges, get_nr_ranges_callback);

>>> +

>>> +	cmem = vmalloc(sizeof(struct crash_mem) +

>>> +			sizeof(struct crash_mem_range) * nr_ranges);

>>> +	if (!cmem)

>>> +		return NULL;

>>> +

>>> +	cmem->max_nr_ranges = nr_ranges;

>>> +	cmem->nr_ranges = 0;

>>> +	walk_system_ram_res(0, -1, cmem, add_mem_range_callback);

>>> +

>>> +	/* Exclude crashkernel region */

>>> +	if (crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end)) {

>>> +		vfree(cmem);

>>> +		return NULL;

>>> +	}

>>> +

>>> +	return cmem;

>>> +}

>>

>> Could this function be included in prepare_elf_headers() so that the alloc() and

>> free() occur together.

> 

> Or aiming that arm64 and x86 have similar-look code?


What's the advantage in things looking the same? If they are the same, it
probably shouldn't be in per-arch code. Otherwise it should be as simple as
possible, otherwise we can't spot the bugs/leaks.

But I think walking memblock here will remove all the 'looks the same' properties.


>>> +static int prepare_elf_headers(void **addr, unsigned long *sz)

>>> +{

>>> +	struct crash_mem *cmem;

>>> +	int ret = 0;

>>> +

>>> +	cmem = get_crash_memory_ranges();

>>> +	if (!cmem)

>>> +		return -ENOMEM;

>>> +

>>> +	ret =  crash_prepare_elf64_headers(cmem, true, addr, sz);

>>> +

>>> +	vfree(cmem);

>>

>>> +	return ret;

>>> +}

>>

>> All this is moving memory-range information from core-code's

>> walk_system_ram_res() into core-code's struct crash_mem, and excluding

>> crashk_res, which again is accessible to the core code.

>>

>> It looks like this is duplicated in arch/x86 and arch/arm64 because arm64

>> doesn't have a second 'crashk_low_res' region, and always wants elf64, instead

>> of when IS_ENABLED(CONFIG_X86_64).

>> If we can abstract just those two, more of this could be moved to core code

>> where powerpc can make use of it if they want to support kdump with

>> kexec_file_load().

>>

>> But it's getting late for cross-architecture dependencies, so let's put that on the

>> for-later list. (assuming there isn't a powerpc-kdump series out there adding a

>> third copy of this)

> 

> Sure. X86 code has so many exceptional lines in the code :)


They also pass the e820 'usable-memory' map on the cmdline...


Thanks,

James

AKASHI Takahiro May 21, 2018, 9:46 a.m. UTC | #8

James,

On Fri, May 18, 2018 at 05:00:55PM +0100, James Morse wrote:
> Hi Akashi,

> 

> On 18/05/18 11:39, AKASHI Takahiro wrote:

> > On Tue, May 15, 2018 at 06:11:15PM +0100, James Morse wrote:

> >> On 25/04/18 07:26, AKASHI Takahiro wrote:

> >>> Enabling crash dump (kdump) includes

> >>> * prepare contents of ELF header of a core dump file, /proc/vmcore,

> >>>   using crash_prepare_elf64_headers(), and

> >>> * add two device tree properties, "linux,usable-memory-range" and

> >>>   "linux,elfcorehdr", which represent respectively a memory range

> 

> >>> diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c

> >>> index 37c0a9dc2e47..ec674f4d267c 100644

> >>> --- a/arch/arm64/kernel/machine_kexec_file.c

> >>> +++ b/arch/arm64/kernel/machine_kexec_file.c

> 

> >>> +static void fill_property(void *buf, u64 val64, int cells)

> >>> +{

> >>> +	u32 val32;

> >>> +

> >>> +	if (cells == 1) {

> >>> +		val32 = cpu_to_fdt32((u32)val64);

> >>> +		memcpy(buf, &val32, sizeof(val32));

> >>> +	} else {

> >>

> >>> +		memset(buf, 0, cells * sizeof(u32) - sizeof(u64));

> >>> +		buf += cells * sizeof(u32) - sizeof(u64);

> >>

> >> Is this trying to clear the 'top' cells and shuffle the pointer to point at the

> >> 'bottom' 2? I'm pretty sure this isn't endian safe.

> >>

> >> Do we really expect a system to have #address-cells > 2?

> > 

> > I don't know, but just for safety.

> 

> Okay, so this is aiming to be a cover-all-cases library function.

> 

> 

> >>> +		val64 = cpu_to_fdt64(val64);

> >>> +		memcpy(buf, &val64, sizeof(val64));

> >>> +	}

> >>> +}

> >>> +

> >>> +static int fdt_setprop_range(void *fdt, int nodeoffset, const char *name,

> >>> +				unsigned long addr, unsigned long size)

> >>

> >> (the device-tree spec describes a 'ranges' property, which had me confused. This

> >> is encoding a prop-encoded-array)

> > 

> > Should we rename it to, say, fdt_setprop_reg()?

> 

> Sure, but I'd really like this code to come from libfdt. I'm hoping for some

> temporary workaround; let's see what the DT folk say.


OK, I will follow Rob's suggestion.

> >>> +	if (!buf)

> >>> +		return -ENOMEM;

> >>> +

> >>> +	fill_property(prop, addr, __dt_root_addr_cells);

> >>> +	prop += __dt_root_addr_cells * sizeof(u32);

> >>> +

> >>> +	fill_property(prop, size, __dt_root_size_cells);

> >>> +

> >>> +	result = fdt_setprop(fdt, nodeoffset, name, buf, buf_size);

> >>> +

> >>> +	vfree(buf);

> >>> +

> >>> +	return result;

> >>> +}

> >>

> >> Doesn't this stuff belong in libfdt? I guess there is no 'add array element' api

> >> because this is the first time we've wanted to create a node with more than

> >> key=fixed-size-value.

> >>

> >> I don't think this belongs in arch C code. Do we have a plan for getting libfdt

> >> to support encoding prop-arrays? Can we put it somewhere anyone else duplicating

> >> this will find it, until we can (re)move it?

> > 

> > I will temporarily move all fdt-related stuff to a separate file, but

> > 

> >> I have no idea how that happens... it looks like the devicetree list is the

> >> place to ask.

> > 

> > should we always sync with the original dtc/libfdt repository?

> 

> I thought so, libfdt is one of those external libraries that the kernel

> consumes, like acpica. For acpica at least the rule is changes go upstream, then

> get sync'd back.


Same as above.

> >>>  static int setup_dtb(struct kimage *image,

> >>>  		unsigned long initrd_load_addr, unsigned long initrd_len,

> >>>  		char *cmdline, unsigned long cmdline_len,

> >>> @@ -88,10 +165,26 @@ static int setup_dtb(struct kimage *image,

> >>>  	int range_len;

> >>>  	int ret;

> >>>  

> >>> +	/* check ranges against root's #address-cells and #size-cells */

> >>> +	if (image->type == KEXEC_TYPE_CRASH &&

> >>> +		(!cells_size_fitted(image->arch.elf_load_addr,

> >>> +				image->arch.elf_headers_sz) ||

> >>> +		 !cells_size_fitted(crashk_res.start,

> >>> +				crashk_res.end - crashk_res.start + 1))) {

> >>> +		pr_err("Crash memory region doesn't fit into DT's root cell sizes.\n");

> >>> +		ret = -EINVAL;

> >>> +		goto out_err;

> >>> +	}

> >>

> >> To check I've understood this properly: This can happen if the firmware provided

> >> a DTB with 32bit address/size cells, but at least some of the memory requires 64

> >> bit address/size cells. This could only happen on a UEFI system where the

> >> firmware-DTB doesn't describe memory. ACPI-only systems would have the EFIstub DT.

> > 

> > Probably, yes. I assumed the case where #address-cells and #size-cells

> > were just missing in fdt.

> 

> Ah, that's another one. I just wanted to check we could boot on a system where

> this can happen.

> 

> 

> >>>  	/* duplicate dt blob */

> >>>  	buf_size = fdt_totalsize(initial_boot_params);

> >>>  	range_len = (__dt_root_addr_cells + __dt_root_size_cells) * sizeof(u32);

> >>>  

> >>> +	if (image->type == KEXEC_TYPE_CRASH)

> >>> +		buf_size += fdt_prop_len("linux,elfcorehdr", range_len)

> >>> +				+ fdt_prop_len("linux,usable-memory-range",

> >>> +								range_len);

> 

> >                                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

> [...]

> 

> >> Don't you need to add "linux,usable-memory-range" to the buf_size estimate?

> > 

> > I think the code exists. See above.

> 

> Sorry, turns out I can't read!

> 

> 

> >>> +		if (ret)

> >>> +			goto out_err;

> >>> +	}

> >>

> >>> @@ -148,17 +258,109 @@ static int setup_dtb(struct kimage *image,

> >>

> >>> +static struct crash_mem *get_crash_memory_ranges(void)

> >>> +{

> >>> +	unsigned int nr_ranges;

> >>> +	struct crash_mem *cmem;

> >>> +

> >>> +	nr_ranges = 1; /* for exclusion of crashkernel region */

> >>> +	walk_system_ram_res(0, -1, &nr_ranges, get_nr_ranges_callback);

> >>> +

> >>> +	cmem = vmalloc(sizeof(struct crash_mem) +

> >>> +			sizeof(struct crash_mem_range) * nr_ranges);

> >>> +	if (!cmem)

> >>> +		return NULL;

> >>> +

> >>> +	cmem->max_nr_ranges = nr_ranges;

> >>> +	cmem->nr_ranges = 0;

> >>> +	walk_system_ram_res(0, -1, cmem, add_mem_range_callback);

> >>> +

> >>> +	/* Exclude crashkernel region */

> >>> +	if (crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end)) {

> >>> +		vfree(cmem);

> >>> +		return NULL;

> >>> +	}

> >>> +

> >>> +	return cmem;

> >>> +}

> >>

> >> Could this function be included in prepare_elf_headers() so that the alloc() and

> >> free() occur together.

> > 

> > Or aiming that arm64 and x86 have similar-look code?

> 

> What's the advantage in things looking the same? If they are the same, it

> probably shouldn't be in per-arch code. Otherwise it should be as simple as

> possible, otherwise we can't spot the bugs/leaks.

> 

> But I think walking memblock here will remove all the 'looks the same' properties.


OK, I will unfold the function in prepare_elf_headers().

> 

> >>> +static int prepare_elf_headers(void **addr, unsigned long *sz)

> >>> +{

> >>> +	struct crash_mem *cmem;

> >>> +	int ret = 0;

> >>> +

> >>> +	cmem = get_crash_memory_ranges();

> >>> +	if (!cmem)

> >>> +		return -ENOMEM;

> >>> +

> >>> +	ret =  crash_prepare_elf64_headers(cmem, true, addr, sz);

> >>> +

> >>> +	vfree(cmem);

> >>

> >>> +	return ret;

> >>> +}

> >>

> >> All this is moving memory-range information from core-code's

> >> walk_system_ram_res() into core-code's struct crash_mem, and excluding

> >> crashk_res, which again is accessible to the core code.

> >>

> >> It looks like this is duplicated in arch/x86 and arch/arm64 because arm64

> >> doesn't have a second 'crashk_low_res' region, and always wants elf64, instead

> >> of when IS_ENABLED(CONFIG_X86_64).

> >> If we can abstract just those two, more of this could be moved to core code

> >> where powerpc can make use of it if they want to support kdump with

> >> kexec_file_load().

> >>

> >> But it's getting late for cross-architecture dependencies, so let's put that on the

> >> for-later list. (assuming there isn't a powerpc-kdump series out there adding a

> >> third copy of this)

> > 

> > Sure. X86 code has so many exceptional lines in the code :)

> 

> They also pass the e820 'usable-memory' map on the cmdline...


Well, according to a past comment from Dave (RedHat), this type of kernel
parameter is an old style, and x86 now has a dedicated memory region
passed for this purpose.

Thanks,
-Takahiro AKASHI

> 

> Thanks,

> 

> James

AKASHI Takahiro May 21, 2018, 10:14 a.m. UTC | #9

Hi Rob,

On Fri, May 18, 2018 at 10:35:52AM -0500, Rob Herring wrote:
> On Tue, May 15, 2018 at 06:12:59PM +0100, James Morse wrote:

> > Hi guys,

> > 

> > (CC: +RobH, devicetree list)

> 

> Thanks.

> 

> > On 25/04/18 07:26, AKASHI Takahiro wrote:

> > > Enabling crash dump (kdump) includes

> > > * prepare contents of ELF header of a core dump file, /proc/vmcore,

> > >   using crash_prepare_elf64_headers(), and

> > > * add two device tree properties, "linux,usable-memory-range" and

> > >   "linux,elfcorehdr", which represent respectively a memory range

> > >   to be used by crash dump kernel and the header's location

> 

> BTW, I intend to move the existing parsing of these out of the arch code.

> Please don't add more DT handling to arch/ unless it is *really* arch 

> specific. I'd assume that the next arch to add kexec support will use 

> these bindings instead of the powerpc way.


So do you expect all the fdt-related stuff in my current implementation
for arm64 to be put into libfdt, or at least drivers/of, from the beginning?

I'm not sure how arch-specific the properties here are. For instance,
it is only arm64 that uses "linux,usable-memory-range" right now but
if some other arch follows, it is no longer arch-specific.
# I remember that you didn't like this property :)

> > kexec_file_load() on arm64 needs to be able to create a prop encoded array to

> > the FDT, but there doesn't appear to be a libfdt helper to do this.

> > 

> > Akashi's code below adds fdt_setprop_range() to the arch code, and duplicates

> > bits of libfdt_internal.h to do the work.

> > 

> > How should this be done? I'm assuming this is something we need a new API in

> > libfdt.h for. How do these come about, and is there an interim step we can use

> > until then?

> 

> Submit patches to upstream dtc and then we can pull it in. Ahead of that 

> you can add it to drivers/of/fdt.c (or maybe fdt_address.c because 

> that's really what this is dealing with).


OK, I'm going to try to follow your suggestion.

> libfdt has only recently gained the beginnings of address handling.

> 

> > 

> > Thanks!

> > 

> > James

> > 

> > > diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c

> > > index 37c0a9dc2e47..ec674f4d267c 100644

> > > --- a/arch/arm64/kernel/machine_kexec_file.c

> > > +++ b/arch/arm64/kernel/machine_kexec_file.c

> > > @@ -76,6 +81,78 @@ int arch_kexec_walk_mem(struct kexec_buf *kbuf,

> > >  	return ret;

> > >  }

> > >  

> > > +static int __init arch_kexec_file_init(void)

> > > +{

> > > +	/* Those values are used later on loading the kernel */

> > > +	__dt_root_addr_cells = dt_root_addr_cells;

> > > +	__dt_root_size_cells = dt_root_size_cells;

> 

> I intend to make dt_root_*_cells private, so don't add another user 

> outside of drivers/of/.


Once cells_size_fitted() moves to drivers/of, there will be no users.

> > > +

> > > +	return 0;

> > > +}

> > > +late_initcall(arch_kexec_file_init);

> > > +

> > > +#define FDT_ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

> > > +#define FDT_TAGALIGN(x)	(FDT_ALIGN((x), FDT_TAGSIZE))

> > > +

> > > +static int fdt_prop_len(const char *prop_name, int len)

> > > +{

> > > +	return (strlen(prop_name) + 1) +

> > > +		sizeof(struct fdt_property) +

> > > +		FDT_TAGALIGN(len);

> > > +}

> > > +

> > > +static bool cells_size_fitted(unsigned long base, unsigned long size)

> 

> I can't imagine this would happen. However, when this is moved to 

> drivers/of/ or dtc, these need to be u64 types to work on 32-bit.


OK.

> > > +	/* if *_cells >= 2, cells can hold 64-bit values anyway */

> > > +	if ((__dt_root_addr_cells == 1) && (base >= (1ULL << 32)))

> > > +		return false;

> > > +

> > > +	if ((__dt_root_size_cells == 1) && (size >= (1ULL << 32)))

> > > +		return false;

> > > +

> > > +	return true;

> > > +}

> > > +

> > > +static void fill_property(void *buf, u64 val64, int cells)

> > > +{

> > > +	u32 val32;

> 

> This should be a __be32 or fdt32 type. So should buf.


OK for val32, but buf is a local pointer address.

> > > +

> > > +	if (cells == 1) {

> > > +		val32 = cpu_to_fdt32((u32)val64);

> > > +		memcpy(buf, &val32, sizeof(val32));

> > > +	} else {

> > > +		memset(buf, 0, cells * sizeof(u32) - sizeof(u64));

> > > +		buf += cells * sizeof(u32) - sizeof(u64);

> > > +

> > > +		val64 = cpu_to_fdt64(val64);

> > > +		memcpy(buf, &val64, sizeof(val64));

> 

> Look how of_read_number() is implemented. You should be able to do 

> something similar here looping and avoiding the if/else.


Ah, excellent!

> > > +	}

> > > +}

> > > +

> > > +static int fdt_setprop_range(void *fdt, int nodeoffset, const char *name,

> > > +				unsigned long addr, unsigned long size)

> 

> A very generic sounding function, but really only works on addresses in 

> children of the root node.

> 

> > > +{

> > > +	void *buf, *prop;

> > > +	size_t buf_size;

> > > +	int result;

> > > +

> > > +	buf_size = (__dt_root_addr_cells + __dt_root_size_cells) * sizeof(u32);

> > > +	prop = buf = vmalloc(buf_size);

> 

> This can go on the stack instead (and would be required to work in

> libfdt).


Well, I can't agree with you here since, as far as I understand, there is
an ongoing effort to purge all variable-sized arrays on the local stack
from the kernel code.

Thank you for your review.
-Takahiro AKASHI

> > > +	if (!buf)

> > > +		return -ENOMEM;

> > > +

> > > +	fill_property(prop, addr, __dt_root_addr_cells);

> > > +	prop += __dt_root_addr_cells * sizeof(u32);

> > > +

> > > +	fill_property(prop, size, __dt_root_size_cells);

> > > +

> > > +	result = fdt_setprop(fdt, nodeoffset, name, buf, buf_size);

> > > +

> > > +	vfree(buf);

> > > +

> > > +	return result;

> > > +}

> > > +

> > >  static int setup_dtb(struct kimage *image,

> > >  		unsigned long initrd_load_addr, unsigned long initrd_len,

> > >  		char *cmdline, unsigned long cmdline_len,

> > > @@ -88,10 +165,26 @@ static int setup_dtb(struct kimage *image,

> > >  	int range_len;

> > >  	int ret;

> > >  

> > > +	/* check ranges against root's #address-cells and #size-cells */

> > > +	if (image->type == KEXEC_TYPE_CRASH &&

> > > +		(!cells_size_fitted(image->arch.elf_load_addr,

> > > +				image->arch.elf_headers_sz) ||

> > > +		 !cells_size_fitted(crashk_res.start,

> > > +				crashk_res.end - crashk_res.start + 1))) {

> > > +		pr_err("Crash memory region doesn't fit into DT's root cell sizes.\n");

> > > +		ret = -EINVAL;

> > > +		goto out_err;

> > > +	}

> > > +

> > >  	/* duplicate dt blob */

> > >  	buf_size = fdt_totalsize(initial_boot_params);

> > >  	range_len = (__dt_root_addr_cells + __dt_root_size_cells) * sizeof(u32);

> > >  

> > > +	if (image->type == KEXEC_TYPE_CRASH)

> > > +		buf_size += fdt_prop_len("linux,elfcorehdr", range_len)

> > > +				+ fdt_prop_len("linux,usable-memory-range",

> > > +								range_len);

> > > +

> > >  	if (initrd_load_addr)

> > >  		buf_size += fdt_prop_len("linux,initrd-start", sizeof(u64))

> > >  				+ fdt_prop_len("linux,initrd-end", sizeof(u64));

> > > @@ -113,6 +206,23 @@ static int setup_dtb(struct kimage *image,

> > >  	if (nodeoffset < 0)

> > >  		goto out_err;

> > >  

> > > +	if (image->type == KEXEC_TYPE_CRASH) {

> > > +		/* add linux,elfcorehdr */

> > > +		ret = fdt_setprop_range(buf, nodeoffset, "linux,elfcorehdr",

> > > +				image->arch.elf_load_addr,

> > > +				image->arch.elf_headers_sz);

> > > +		if (ret)

> > > +			goto out_err;

> > > +

> > > +		/* add linux,usable-memory-range */

> > > +		ret = fdt_setprop_range(buf, nodeoffset,

> > > +				"linux,usable-memory-range",

> > > +				crashk_res.start,

> > > +				crashk_res.end - crashk_res.start + 1);

> > > +		if (ret)

> > > +			goto out_err;

> > > +	}

> > > +

> > >  	/* add bootargs */

> > >  	if (cmdline) {

> > >  		ret = fdt_setprop(buf, nodeoffset, "bootargs",

> >

[v9,07/11] arm64: kexec_file: add crash dump support

Commit Message

Comments

Patch