
[v29,3/9] arm64: kdump: reserve memory for crash dump kernel

Message ID 20170113081617.GI20972@linaro.org
State New

Commit Message

AKASHI Takahiro Jan. 13, 2017, 8:16 a.m. UTC
Hi Mark,

On Thu, Jan 12, 2017 at 03:09:26PM +0000, Mark Rutland wrote:
> Hi,

> 

> As a general note, I must apologise for my minimal review of the series

> until this point. I'm very concerned with the way the DT parts are

> organised, and clearly I

> did not communicate my concerns and suggestions effectively in prior

> rounds of review.

> 

> On Wed, Dec 28, 2016 at 01:36:00PM +0900, AKASHI Takahiro wrote:

> > "crashkernel=" kernel parameter specifies the size (and optionally

> > the start address) of the system ram used by crash dump kernel.

> > reserve_crashkernel() will allocate and reserve the memory at the startup

> > of primary kernel.

> > 

> > This memory range will be exported to userspace via:

> > 	- an entry named "Crash kernel" in /proc/iomem, and

> > 	- "linux,crashkernel-base" and "linux,crashkernel-size" under

> > 	  /sys/firmware/devicetree/base/chosen

> 

> > +#ifdef CONFIG_KEXEC_CORE

> > +static unsigned long long crash_size, crash_base;

> > +static struct property crash_base_prop = {

> > +	.name = "linux,crashkernel-base",

> > +	.length = sizeof(u64),

> > +	.value = &crash_base

> > +};

> > +static struct property crash_size_prop = {

> > +	.name = "linux,crashkernel-size",

> > +	.length = sizeof(u64),

> > +	.value = &crash_size,

> > +};

> > +

> > +static int __init export_crashkernel(void)

> > +{

> > +	struct device_node *node;

> > +	int ret;

> > +

> > +	if (!crash_size)

> > +		return 0;

> > +

> > +	/* Add /chosen/linux,crashkernel-* properties */

> > +	node = of_find_node_by_path("/chosen");

> > +	if (!node)

> > +		return -ENOENT;

> > +

> > +	/*

> > +	 * There might be existing crash kernel properties, but we can't

> > +	 * be sure what's in them, so remove them.

> > +	 */

> > +	of_remove_property(node, of_find_property(node,

> > +				"linux,crashkernel-base", NULL));

> > +	of_remove_property(node, of_find_property(node,

> > +				"linux,crashkernel-size", NULL));

> > +

> > +	ret = of_add_property(node, &crash_base_prop);

> > +	if (ret)

> > +		goto ret_err;

> > +

> > +	ret = of_add_property(node, &crash_size_prop);

> > +	if (ret)

> > +		goto ret_err;

> > +

> > +	return 0;

> > +

> > +ret_err:

> > +	pr_warn("Exporting crashkernel region to device tree failed\n");

> > +	return ret;

> > +}

> > +late_initcall(export_crashkernel);

> 

> I very much do not like this.

> 

> I don't think we should be modifying the DT exposed to userspace in this

> manner, in the usual boot path, especially given that the kernel itself

> does not appear to be a consumer of this property. I do not think that

> it is right to use the DT exposed to userspace as a communication

> channel solely between the kernel and userspace.


As you mentioned in your comments against my patch#9, this property
originates from the PPC implementation.
I added it solely out of sympathy for DT-based architectures.

> So I think we should drop the above, and for arm64 have userspace

> consistently use /proc/iomem (or perhaps a new kexec-specific file) to

> determine the region reserved for the crash kernel, if it needs to know

> this.


As a matter of fact, my port of kexec-tools doesn't check this property
and dropping it won't cause any problem.
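
For reference, here is a minimal user-space sketch of the /proc/iomem
approach (illustrative only, not the actual kexec-tools code). The
reserved range appears as a top-level resource named "Crash kernel",
e.g. (addresses hypothetical):

  80000000-9fffffff : Crash kernel

and can be picked up along these lines:

#include <stdio.h>
#include <string.h>

static int get_crash_range(unsigned long long *start, unsigned long long *end)
{
	FILE *fp = fopen("/proc/iomem", "r");
	char line[128];

	if (!fp)
		return -1;

	while (fgets(line, sizeof(line), fp)) {
		/* each line looks like "<start>-<end> : <name>" */
		if (strstr(line, "Crash kernel") &&
		    sscanf(line, "%llx-%llx", start, end) == 2) {
			fclose(fp);
			return 0;
		}
	}

	fclose(fp);
	return -1;
}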

> I'll have further comments on this front in the binding patch.

> 

> > +/*

> > + * reserve_crashkernel() - reserves memory for crash kernel

> > + *

> > + * This function reserves memory area given in "crashkernel=" kernel command

> > + * line parameter. The memory reserved is used by dump capture kernel when

> > + * primary kernel is crashing.

> > + */

> > +static void __init reserve_crashkernel(void)

> > +{

> > +	int ret;

> > +

> > +	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),

> > +				&crash_size, &crash_base);

> > +	/* no crashkernel= or invalid value specified */

> > +	if (ret || !crash_size)

> > +		return;

> > +

> > +	if (crash_base == 0) {

> > +		/* Current arm64 boot protocol requires 2MB alignment */

> > +		crash_base = memblock_find_in_range(0, ARCH_LOW_ADDRESS_LIMIT,

> > +				crash_size, SZ_2M);

> > +		if (crash_base == 0) {

> > +			pr_warn("Unable to allocate crashkernel (size:%llx)\n",

> > +				crash_size);

> > +			return;

> > +		}

> > +	} else {

> > +		/* User specifies base address explicitly. */

> > +		if (!memblock_is_region_memory(crash_base, crash_size) ||

> > +			memblock_is_region_reserved(crash_base, crash_size)) {

> > +			pr_warn("crashkernel has wrong address or size\n");

> > +			return;

> > +		}

> > +

> > +		if (!IS_ALIGNED(crash_base, SZ_2M)) {

> > +			pr_warn("crashkernel base address is not 2MB aligned\n");

> > +			return;

> > +		}

> > +	}

> > +	memblock_reserve(crash_base, crash_size);

> 

> This will mean that the crash kernel will have a permanent alias in the linear

> map which is vulnerable to being clobbered. There could also be issues

> with mismatched attributes in future.


Good point, I had never thought of that beyond marking the memblock
region "reserved."

> We're probably ok for now, but in future we'll likely want to fix this

> up to remove the region (or mark it nomap), and only map it temporarily

> when loading things into the region.


Well, I found that the following commit is already in:
        commit 9b492cf58077
        Author: Xunlei Pang <xlpang@redhat.com>
        Date:   Mon May 23 16:24:10 2016 -0700

            kexec: introduce a protection mechanism for the crashkernel
            reserved memory

To make best use of this framework, I'd like to re-use set_memory_ro/rx()
instead of removing the region from linear mapping. But to do so,
we need to
* make memblock_isolate_range() global,
* allow set_memory_ro/rx() to be applied to regions in linear mapping
since set_memory_ro/rx() works only on page-level mappings.

What do you think?
(See my tentative solution below.)
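
For context, the framework added by that commit amounts to a pair of weak
hooks, roughly as follows (paraphrased from kernel/kexec_core.c, not
quoted verbatim):

void __weak arch_kexec_protect_crashkres(void)
{}

void __weak arch_kexec_unprotect_crashkres(void)
{}

The kexec core unprotects the reserved region before writing segments
during kexec_load() and protects it again once the crash image has been
loaded, so an architecture only needs to supply these two overrides.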

> > +

> > +	pr_info("Reserving %lldMB of memory at %lldMB for crashkernel\n",

> > +		crash_size >> 20, crash_base >> 20);

> > +

> > +	crashk_res.start = crash_base;

> > +	crashk_res.end = crash_base + crash_size - 1;

> > +}

> > +#else

> > +static void __init reserve_crashkernel(void)

> > +{

> > +	;

> 

> Nit: the ';' line can go.


OK

Thanks,
-Takahiro AKASHI

> > +}

> > +#endif /* CONFIG_KEXEC_CORE */

> > +

> >  /*

> >   * Return the maximum physical address for ZONE_DMA (DMA_BIT_MASK(32)). It

> >   * currently assumes that for memory starting above 4G, 32-bit devices will

> > @@ -331,6 +438,9 @@ void __init arm64_memblock_init(void)

> >  		arm64_dma_phys_limit = max_zone_dma_phys();

> >  	else

> >  		arm64_dma_phys_limit = PHYS_MASK + 1;

> > +

> > +	reserve_crashkernel();

> > +

> >  	dma_contiguous_reserve(arm64_dma_phys_limit);

> >  

> >  	memblock_allow_resize();

> > -- 

> > 2.11.0

> 

> Other than my comments regarding the DT usage above, this looks fine to

> me.

> 

> Thanks,

> Mark.



Comments

Mark Rutland Jan. 13, 2017, 11:39 a.m. UTC | #1
On Fri, Jan 13, 2017 at 05:16:18PM +0900, AKASHI Takahiro wrote:
> On Thu, Jan 12, 2017 at 03:09:26PM +0000, Mark Rutland wrote:

> > > +static int __init export_crashkernel(void)


> > > +	/* Add /chosen/linux,crashkernel-* properties */


> > > +	of_remove_property(node, of_find_property(node,

> > > +				"linux,crashkernel-base", NULL));

> > > +	of_remove_property(node, of_find_property(node,

> > > +				"linux,crashkernel-size", NULL));

> > > +

> > > +	ret = of_add_property(node, &crash_base_prop);

> > > +	if (ret)

> > > +		goto ret_err;

> > > +

> > > +	ret = of_add_property(node, &crash_size_prop);

> > > +	if (ret)

> > > +		goto ret_err;


> > I very much do not like this.

> > 

> > I don't think we should be modifying the DT exposed to userspace in this

> > manner, in the usual boot path, especially given that the kernel itself

> > does not appear to be a consumer of this property. I do not think that

> > it is right to use the DT exposed to userspace as a communication

> > channel solely between the kernel and userspace.

> 

> As you mentioned in your comments against my patch#9, this property

> originates from the PPC implementation.

> I added it solely out of sympathy for DT-based architectures.

>

> > So I think we should drop the above, and for arm64 have userspace

> > consistently use /proc/iomem (or perhaps a new kexec-specific file) to

> > determine the region reserved for the crash kernel, if it needs to know

> > this.

> 

> As a matter of fact, my port of kexec-tools doesn't check this property

> and dropping it won't cause any problem.


Ok. It sounds like we're both happy for this to go, then.

While it's unfortunate that architectures differ, I think we have
legitimate reasons to differ, and it's preferable to do so. We have a
different set of constraints (e.g. supporting EFI memory maps), and
following the PPC approach creates longer term issues for us, making it
harder to do the right thing consistently.

> > > +/*

> > > + * reserve_crashkernel() - reserves memory for crash kernel

> > > + *

> > > + * This function reserves memory area given in "crashkernel=" kernel command

> > > + * line parameter. The memory reserved is used by dump capture kernel when

> > > + * primary kernel is crashing.

> > > + */

> > > +static void __init reserve_crashkernel(void)


> > > +	memblock_reserve(crash_base, crash_size);

> > 

> > This will mean that the crash kernel will have a permanent alias in the linear

> > map which is vulnerable to being clobbered. There could also be issues

> > with mismatched attributes in future.

> 

> Good point, I had never thought of that beyond marking the memblock

> region "reserved."

> 

> > We're probably ok for now, but in future we'll likely want to fix this

> > up to remove the region (or mark it nomap), and only map it temporarily

> > when loading things into the region.

> 

> Well, I found that the following commit is already in:

>         commit 9b492cf58077

>         Author: Xunlei Pang <xlpang@redhat.com>

>         Date:   Mon May 23 16:24:10 2016 -0700

> 

>             kexec: introduce a protection mechanism for the crashkernel

>             reserved memory

> 

> To make best use of this framework, I'd like to re-use set_memory_ro/rx()

> instead of removing the region from linear mapping. But to do so,

> we need to

> * make memblock_isolate_range() global,

> * allow set_memory_ro/rx() to be applied to regions in linear mapping

> since set_memory_ro/rx() works only on page-level mappings.

> 

> What do you think?

> (See my tentative solution below.)


Great! I think it would be better to follow the approach of
mark_rodata_ro(), rather than opening up set_memory_*(), but otherwise,
it looks like it should work.

Either way, this still leaves us with an RO alias on crashed cores (and
potential cache attribute mismatches in future). Do we need to read from
the region later, or could we unmap it entirely?

Thanks,
Mark.

> ===8<===

> diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c

> index c0fc3d458195..bb21c0473b8e 100644

> --- a/arch/arm64/kernel/machine_kexec.c

> +++ b/arch/arm64/kernel/machine_kexec.c

> @@ -211,6 +211,44 @@ void machine_kexec(struct kimage *kimage)

>  	BUG(); /* Should never get here. */

>  }

>  

> +static int kexec_mark_range(unsigned long start, unsigned long end,

> +							bool protect)

> +{

> +	unsigned int nr_pages;

> +

> +	if (!end || start >= end)

> +		return 0;

> +

> +	nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;

> +

> +	if (protect)

> +		return set_memory_ro(__phys_to_virt(start), nr_pages);

> +	else

> +		return set_memory_rw(__phys_to_virt(start), nr_pages);

> +}

> +

> +static void kexec_mark_crashkres(bool protect)

> +{

> +	unsigned long control;

> +

> +	/* Don't touch the control code page used in crash_kexec().*/

> +	control = page_to_phys(kexec_crash_image->control_code_page);

> +	kexec_mark_range(crashk_res.start, control - 1, protect);

> +

> +	control += KEXEC_CONTROL_PAGE_SIZE;

> +	kexec_mark_range(control, crashk_res.end, protect);

> +}

> +

> +void arch_kexec_protect_crashkres(void)

> +{

> +	kexec_mark_crashkres(true);

> +}

> +

> +void arch_kexec_unprotect_crashkres(void)

> +{

> +	kexec_mark_crashkres(false);

> +}

> +

>  static void machine_kexec_mask_interrupts(void)

>  {

>  	unsigned int i;

> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c

> index 569ec3325bc8..764ec89c4f76 100644

> --- a/arch/arm64/mm/init.c

> +++ b/arch/arm64/mm/init.c

> @@ -90,6 +90,7 @@ early_param("initrd", early_initrd);

>  static void __init reserve_crashkernel(void)

>  {

>  	unsigned long long crash_size, crash_base;

> +	int start_rgn, end_rgn;

>  	int ret;

>  

>  	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),

> @@ -121,6 +122,9 @@ static void __init reserve_crashkernel(void)

>  		}

>  	}

>  	memblock_reserve(crash_base, crash_size);

> +	memblock_isolate_range(&memblock.memory, crash_base, crash_size,

> +			&start_rgn, &end_rgn);

> +

>  

>  	pr_info("Reserving %lldMB of memory at %lldMB for crashkernel\n",

>  		crash_size >> 20, crash_base >> 20);

> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c

> index 17243e43184e..0f60f19c287b 100644

> --- a/arch/arm64/mm/mmu.c

> +++ b/arch/arm64/mm/mmu.c

> @@ -22,6 +22,7 @@

>  #include <linux/kernel.h>

>  #include <linux/errno.h>

>  #include <linux/init.h>

> +#include <linux/kexec.h>

>  #include <linux/libfdt.h>

>  #include <linux/mman.h>

>  #include <linux/nodemask.h>

> @@ -362,6 +363,17 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end

>  	unsigned long kernel_start = __pa(_text);

>  	unsigned long kernel_end = __pa(__init_begin);

>  

> +#ifdef CONFIG_KEXEC_CORE

> +	if (crashk_res.end && start >= crashk_res.start &&

> +			end <= (crashk_res.end + 1)) {

> +		__create_pgd_mapping(pgd, start, __phys_to_virt(start),

> +				     end - start, PAGE_KERNEL,

> +				     early_pgtable_alloc,

> +				     true);

> +		return;

> +	}

> +#endif

> +

>  	/*

>  	 * Take care not to create a writable alias for the

>  	 * read-only text and rodata sections of the kernel image.

> ===>8===


AKASHI Takahiro Jan. 17, 2017, 8:20 a.m. UTC | #2
On Fri, Jan 13, 2017 at 11:39:15AM +0000, Mark Rutland wrote:
> On Fri, Jan 13, 2017 at 05:16:18PM +0900, AKASHI Takahiro wrote:

> > On Thu, Jan 12, 2017 at 03:09:26PM +0000, Mark Rutland wrote:

> > > > +static int __init export_crashkernel(void)

> 

> > > > +	/* Add /chosen/linux,crashkernel-* properties */

> 

> > > > +	of_remove_property(node, of_find_property(node,

> > > > +				"linux,crashkernel-base", NULL));

> > > > +	of_remove_property(node, of_find_property(node,

> > > > +				"linux,crashkernel-size", NULL));

> > > > +

> > > > +	ret = of_add_property(node, &crash_base_prop);

> > > > +	if (ret)

> > > > +		goto ret_err;

> > > > +

> > > > +	ret = of_add_property(node, &crash_size_prop);

> > > > +	if (ret)

> > > > +		goto ret_err;

> 

> > > I very much do not like this.

> > > 

> > > I don't think we should be modifying the DT exposed to userspace in this

> > > manner, in the usual boot path, especially given that the kernel itself

> > > does not appear to be a consumer of this property. I do not think that

> > > it is right to use the DT exposed to userspace as a communication

> > > channel solely between the kernel and userspace.

> > 

> > As you mentioned in your comments against my patch#9, this property

> > originates from the PPC implementation.

> > I added it solely out of sympathy for DT-based architectures.

> >

> > > So I think we should drop the above, and for arm64 have userspace

> > > consistently use /proc/iomem (or perhaps a new kexec-specific file) to

> > > determine the region reserved for the crash kernel, if it needs to know

> > > this.

> > 

> > As a matter of fact, my port of kexec-tools doesn't check this property

> > and dropping it won't cause any problem.

> 

> Ok. It sounds like we're both happy for this to go, then.

> 

> While it's unfortunate that architectures differ, I think we have

> legitimate reasons to differ, and it's preferable to do so. We have a

> different set of constraints (e.g. supporting EFI memory maps), and

> following the PPC approach creates longer term issues for us, making it

> harder to do the right thing consistently.

> 

> > > > +/*

> > > > + * reserve_crashkernel() - reserves memory for crash kernel

> > > > + *

> > > > + * This function reserves memory area given in "crashkernel=" kernel command

> > > > + * line parameter. The memory reserved is used by dump capture kernel when

> > > > + * primary kernel is crashing.

> > > > + */

> > > > +static void __init reserve_crashkernel(void)

> 

> > > > +	memblock_reserve(crash_base, crash_size);

> > > 

> > > This will mean that the crash kernel will have a permanent alias in the linear

> > > map which is vulnerable to being clobbered. There could also be issues

> > > with mismatched attributes in future.

> > 

> > Good point, I had never thought of that beyond marking the memblock

> > region "reserved."

> > 

> > > We're probably ok for now, but in future we'll likely want to fix this

> > > up to remove the region (or mark it nomap), and only map it temporarily

> > > when loading things into the region.

> > 

> > Well, I found that the following commit is already in:

> >         commit 9b492cf58077

> >         Author: Xunlei Pang <xlpang@redhat.com>

> >         Date:   Mon May 23 16:24:10 2016 -0700

> > 

> >             kexec: introduce a protection mechanism for the crashkernel

> >             reserved memory

> > 

> > To make best use of this framework, I'd like to re-use set_memory_ro/rx()

> > instead of removing the region from linear mapping. But to do so,

> > we need to

> > * make memblock_isolate_range() global,

> > * allow set_memory_ro/rx() to be applied to regions in linear mapping

> > since set_memory_ro/rx() works only on page-level mappings.

> > 

> > What do you think?

> > (See my tentative solution below.)

> 

> Great! I think it would be better to follow the approach of

> mark_rodata_ro(), rather than opening up set_memory_*(), but otherwise,

> it looks like it should work.


I'm not quite sure what the approach of mark_rodata_ro() means, but
I found that using create_mapping_late() may cause two problems:

1) it fails when PTE_CONT bits mismatch between an old and new mmu entry.
   This can happen, say, if the memory range for crash dump kernel
   starts in the middle of _contiguous_ pages.

2) The control code page, of one-page size, is still written out in
   machine_kexec() which is called at a crash, and this means that
   the range must be writable even after kexec_load(), but
   create_mapping_late() does not handle the case of changing attributes
   for a single page that lies within a _section_ mapping.
   We cannot create a single-page mapping for the control page since the
   address of that page is not determined at boot time.

As for (1), we need to call memblock_isolate_range() to make the region
an independent one.

> Either way, this still leaves us with an RO alias on crashed cores (and

> potential cache attribute mismatches in future). Do we need to read from

> the region later,


I believe not, but the region must be _writable_ as I mentioned in (2) above.
To avoid this issue, we have to move copying the control code page
to machine_kexec_prepare(), which is called during kexec_load(), when
the region is still writable anyway.
I want Geoff to affirm that this change is safe.

(See my second solution below.)

> or could we unmap it entirely?


given the change above, I think we can.
Is there any code to re-use especially for unmapping?

Thanks,
-Takahiro AKASHI

> Thanks,

> Mark.

===8<===
diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
index c0fc3d458195..80a52e9aaf73 100644
--- a/arch/arm64/kernel/machine_kexec.c
+++ b/arch/arm64/kernel/machine_kexec.c
@@ -26,8 +26,6 @@
 extern const unsigned char arm64_relocate_new_kernel[];
 extern const unsigned long arm64_relocate_new_kernel_size;
 
-static unsigned long kimage_start;
-
 /**
  * kexec_image_info - For debugging output.
  */
@@ -68,7 +66,7 @@ void machine_kexec_cleanup(struct kimage *kimage)
  */
 int machine_kexec_prepare(struct kimage *kimage)
 {
-	kimage_start = kimage->start;
+	void *reboot_code_buffer;
 
 	kexec_image_info(kimage);
 
@@ -77,6 +75,21 @@ int machine_kexec_prepare(struct kimage *kimage)
 		return -EBUSY;
 	}
 
+	reboot_code_buffer =
+			phys_to_virt(page_to_phys(kimage->control_code_page));
+
+	/*
+	 * Copy arm64_relocate_new_kernel to the reboot_code_buffer for use
+	 * after the kernel is shut down.
+	 */
+	memcpy(reboot_code_buffer, arm64_relocate_new_kernel,
+		arm64_relocate_new_kernel_size);
+
+	/* Flush the reboot_code_buffer in preparation for its execution. */
+	__flush_dcache_area(reboot_code_buffer, arm64_relocate_new_kernel_size);
+	flush_icache_range((uintptr_t)reboot_code_buffer,
+		arm64_relocate_new_kernel_size);
+
 	return 0;
 }
 
@@ -147,7 +160,6 @@ static void kexec_segment_flush(const struct kimage *kimage)
 void machine_kexec(struct kimage *kimage)
 {
 	phys_addr_t reboot_code_buffer_phys;
-	void *reboot_code_buffer;
 
 	/*
 	 * New cpus may have become stuck_in_kernel after we loaded the image.
@@ -156,7 +168,6 @@ void machine_kexec(struct kimage *kimage)
 			!WARN_ON(kimage == kexec_crash_image));
 
 	reboot_code_buffer_phys = page_to_phys(kimage->control_code_page);
-	reboot_code_buffer = phys_to_virt(reboot_code_buffer_phys);
 
 	kexec_image_info(kimage);
 
@@ -164,26 +175,12 @@ void machine_kexec(struct kimage *kimage)
 		kimage->control_code_page);
 	pr_debug("%s:%d: reboot_code_buffer_phys:  %pa\n", __func__, __LINE__,
 		&reboot_code_buffer_phys);
-	pr_debug("%s:%d: reboot_code_buffer:       %p\n", __func__, __LINE__,
-		reboot_code_buffer);
 	pr_debug("%s:%d: relocate_new_kernel:      %p\n", __func__, __LINE__,
 		arm64_relocate_new_kernel);
 	pr_debug("%s:%d: relocate_new_kernel_size: 0x%lx(%lu) bytes\n",
 		__func__, __LINE__, arm64_relocate_new_kernel_size,
 		arm64_relocate_new_kernel_size);
 
-	/*
-	 * Copy arm64_relocate_new_kernel to the reboot_code_buffer for use
-	 * after the kernel is shut down.
-	 */
-	memcpy(reboot_code_buffer, arm64_relocate_new_kernel,
-		arm64_relocate_new_kernel_size);
-
-	/* Flush the reboot_code_buffer in preparation for its execution. */
-	__flush_dcache_area(reboot_code_buffer, arm64_relocate_new_kernel_size);
-	flush_icache_range((uintptr_t)reboot_code_buffer,
-		arm64_relocate_new_kernel_size);
-
 	/* Flush the kimage list and its buffers. */
 	kexec_list_flush(kimage);
 
@@ -206,7 +203,7 @@ void machine_kexec(struct kimage *kimage)
 	 */
 
 	cpu_soft_restart(kimage != kexec_crash_image,
-		reboot_code_buffer_phys, kimage->head, kimage_start, 0);
+		reboot_code_buffer_phys, kimage->head, kimage->start, 0);
 
 	BUG(); /* Should never get here. */
 }
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 569ec3325bc8..e4cc170edc0c 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -90,6 +90,7 @@ early_param("initrd", early_initrd);
 static void __init reserve_crashkernel(void)
 {
 	unsigned long long crash_size, crash_base;
+	int start_rgn, end_rgn;
 	int ret;
 
 	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
@@ -120,6 +121,8 @@ static void __init reserve_crashkernel(void)
 			return;
 		}
 	}
+	memblock_isolate_range(&memblock.memory, crash_base, crash_size,
+			&start_rgn, &end_rgn);
 	memblock_reserve(crash_base, crash_size);
 
 	pr_info("Reserving %lldMB of memory at %lldMB for crashkernel\n",
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 17243e43184e..b7c75845407a 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -22,6 +22,8 @@
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/init.h>
+#include <linux/ioport.h>
+#include <linux/kexec.h>
 #include <linux/libfdt.h>
 #include <linux/mman.h>
 #include <linux/nodemask.h>
@@ -817,3 +819,27 @@ int pmd_clear_huge(pmd_t *pmd)
 	pmd_clear(pmd);
 	return 1;
 }
+
+#ifdef CONFIG_KEXEC_CORE
+void arch_kexec_protect_crashkres(void)
+{
+	flush_tlb_all();
+
+	create_mapping_late(crashk_res.start, __phys_to_virt(crashk_res.start),
+			    resource_size(&crashk_res), PAGE_KERNEL_RO);
+
+	/* flush the TLBs after updating live kernel mappings */
+	flush_tlb_all();
+}
+
+void arch_kexec_unprotect_crashkres(void)
+{
+	flush_tlb_all();
+
+	create_mapping_late(crashk_res.start, __phys_to_virt(crashk_res.start),
+			    resource_size(&crashk_res), PAGE_KERNEL);
+
+	/* flush the TLBs after updating live kernel mappings */
+	flush_tlb_all();
+}
+#endif

Mark Rutland Jan. 17, 2017, 11:54 a.m. UTC | #3
On Tue, Jan 17, 2017 at 05:20:44PM +0900, AKASHI Takahiro wrote:
> On Fri, Jan 13, 2017 at 11:39:15AM +0000, Mark Rutland wrote:

> > Great! I think it would be better to follow the approach of

> > mark_rodata_ro(), rather than opening up set_memory_*(), but otherwise,

> > it looks like it should work.

> 

> I'm not quite sure what the approach of mark_rodata_ro() means, but

> I found that using create_mapping_late() may cause two problems:

> 

> 1) it fails when PTE_CONT bits mismatch between an old and new mmu entry.

>    This can happen, say, if the memory range for crash dump kernel

>    starts in the middle of _contiguous_ pages.


That should only happen if we try to remap a segment different to what
we originally mapped.

I was intending that we'd explicitly map the reserved region separately
in the boot path, like we do for kernel segments in map_kernel(). We
would allow sections and/or CONT entries.

Then, in __map_memblock() we'd then skip that range as we do for the
linear map alias of the kernel image.

That way, we can later use create_mapping_late for that same region, and
it should handle sections and/or CONT entries in the exact same way as
it does for the kernel image segments in mark_rodata_ro().
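
Hypothetically, that could look something like the sketch below (names
taken from arch/arm64/mm/mmu.c of this era; this is an illustration of
the idea, not a tested patch). The region gets its own early mapping, so
no section/CONT entry ever straddles its edges:

static void __init map_crashkernel(pgd_t *pgd)
{
	if (!crashk_res.end)
		return;

	/* Map the region as its own segment, allowing blocks/CONT. */
	__create_pgd_mapping(pgd, crashk_res.start,
			     __phys_to_virt(crashk_res.start),
			     resource_size(&crashk_res), PAGE_KERNEL,
			     early_pgtable_alloc, false);
}

With __map_memblock() skipping the same range, a later
create_mapping_late() call over exactly that range can change attributes
without ever needing to split entries.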

> 2) The control code page, of one-page size, is still written out in

>    machine_kexec() which is called at a crash, and this means that

>    the range must be writable even after kexec_load(), but

>    create_mapping_late() does not handle the case of changing attributes

>    for a single page that lies within a _section_ mapping.

>    We cannot create a single-page mapping for the control page since the

>    address of that page is not determined at boot time.


That is a problem. I'm not sure I follow how set_memory_*() helps here
though?

> As for (1), we need to call memblock_isolate_range() to make the region

> an independent one.

> 

> > Either way, this still leaves us with an RO alias on crashed cores (and

> > potential cache attribute mismatches in future). Do we need to read from

> > the region later,

> 

> I believe not, but the region must be _writable_ as I mentioned in (2) above.

> To avoid this issue, we have to move copying the control code page

> to machine_kexec_prepare(), which is called during kexec_load(), when

> the region is still writable anyway.

> I want Geoff to affirm that this change is safe.

> 

> (See my second solution below.)


From a quick scan that looks ok.

> > or could we unmap it entirely?

> 

> given the change above, I think we can.


Great!

> Is there any code to re-use especially for unmapping?


I don't think we have much code useful for unmapping. We could re-use 
create_mapping_late for this, passing a set of prot bits that means the
entries are invalid (e.g. have a PAGE_KERNEL_INVALID).

We'd have to perform the TLB invalidation ourselves, but that shouldn't
be too painful.
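
As a minimal sketch, assuming a PAGE_KERNEL_INVALID-style pgprot existed
(no such constant does at this point; __pgprot(0) would be the nearest
real equivalent):

	create_mapping_late(crashk_res.start,
			    __phys_to_virt(crashk_res.start),
			    resource_size(&crashk_res),
			    __pgprot(0));	/* invalid entries */
	flush_tlb_all();			/* TLB invalidation by hand */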

Thanks,
Mark.

> ===8<===

> diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c

> index c0fc3d458195..80a52e9aaf73 100644

> --- a/arch/arm64/kernel/machine_kexec.c

> +++ b/arch/arm64/kernel/machine_kexec.c

> @@ -26,8 +26,6 @@

>  extern const unsigned char arm64_relocate_new_kernel[];

>  extern const unsigned long arm64_relocate_new_kernel_size;

>  

> -static unsigned long kimage_start;

> -

>  /**

>   * kexec_image_info - For debugging output.

>   */

> @@ -68,7 +66,7 @@ void machine_kexec_cleanup(struct kimage *kimage)

>   */

>  int machine_kexec_prepare(struct kimage *kimage)

>  {

> -	kimage_start = kimage->start;

> +	void *reboot_code_buffer;

>  

>  	kexec_image_info(kimage);

>  

> @@ -77,6 +75,21 @@ int machine_kexec_prepare(struct kimage *kimage)

>  		return -EBUSY;

>  	}

>  

> +	reboot_code_buffer =

> +			phys_to_virt(page_to_phys(kimage->control_code_page));

> +

> +	/*

> +	 * Copy arm64_relocate_new_kernel to the reboot_code_buffer for use

> +	 * after the kernel is shut down.

> +	 */

> +	memcpy(reboot_code_buffer, arm64_relocate_new_kernel,

> +		arm64_relocate_new_kernel_size);

> +

> +	/* Flush the reboot_code_buffer in preparation for its execution. */

> +	__flush_dcache_area(reboot_code_buffer, arm64_relocate_new_kernel_size);

> +	flush_icache_range((uintptr_t)reboot_code_buffer,

> +		arm64_relocate_new_kernel_size);

> +

>  	return 0;

>  }

>  

> @@ -147,7 +160,6 @@ static void kexec_segment_flush(const struct kimage *kimage)

>  void machine_kexec(struct kimage *kimage)

>  {

>  	phys_addr_t reboot_code_buffer_phys;

> -	void *reboot_code_buffer;

>  

>  	/*

>  	 * New cpus may have become stuck_in_kernel after we loaded the image.

> @@ -156,7 +168,6 @@ void machine_kexec(struct kimage *kimage)

>  			!WARN_ON(kimage == kexec_crash_image));

>  

>  	reboot_code_buffer_phys = page_to_phys(kimage->control_code_page);

> -	reboot_code_buffer = phys_to_virt(reboot_code_buffer_phys);

>  

>  	kexec_image_info(kimage);

>  

> @@ -164,26 +175,12 @@ void machine_kexec(struct kimage *kimage)

>  		kimage->control_code_page);

>  	pr_debug("%s:%d: reboot_code_buffer_phys:  %pa\n", __func__, __LINE__,

>  		&reboot_code_buffer_phys);

> -	pr_debug("%s:%d: reboot_code_buffer:       %p\n", __func__, __LINE__,

> -		reboot_code_buffer);

>  	pr_debug("%s:%d: relocate_new_kernel:      %p\n", __func__, __LINE__,

>  		arm64_relocate_new_kernel);

>  	pr_debug("%s:%d: relocate_new_kernel_size: 0x%lx(%lu) bytes\n",

>  		__func__, __LINE__, arm64_relocate_new_kernel_size,

>  		arm64_relocate_new_kernel_size);

>  

> -	/*

> -	 * Copy arm64_relocate_new_kernel to the reboot_code_buffer for use

> -	 * after the kernel is shut down.

> -	 */

> -	memcpy(reboot_code_buffer, arm64_relocate_new_kernel,

> -		arm64_relocate_new_kernel_size);

> -

> -	/* Flush the reboot_code_buffer in preparation for its execution. */

> -	__flush_dcache_area(reboot_code_buffer, arm64_relocate_new_kernel_size);

> -	flush_icache_range((uintptr_t)reboot_code_buffer,

> -		arm64_relocate_new_kernel_size);

> -

>  	/* Flush the kimage list and its buffers. */

>  	kexec_list_flush(kimage);

>  

> @@ -206,7 +203,7 @@ void machine_kexec(struct kimage *kimage)

>  	 */

>  

>  	cpu_soft_restart(kimage != kexec_crash_image,

> -		reboot_code_buffer_phys, kimage->head, kimage_start, 0);

> +		reboot_code_buffer_phys, kimage->head, kimage->start, 0);

>  

>  	BUG(); /* Should never get here. */

>  }

> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c

> index 569ec3325bc8..e4cc170edc0c 100644

> --- a/arch/arm64/mm/init.c

> +++ b/arch/arm64/mm/init.c

> @@ -90,6 +90,7 @@ early_param("initrd", early_initrd);

>  static void __init reserve_crashkernel(void)

>  {

>  	unsigned long long crash_size, crash_base;

> +	int start_rgn, end_rgn;

>  	int ret;

>  

>  	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),

> @@ -120,6 +121,8 @@ static void __init reserve_crashkernel(void)

>  			return;

>  		}

>  	}

> +	memblock_isolate_range(&memblock.memory, crash_base, crash_size,

> +			&start_rgn, &end_rgn);

>  	memblock_reserve(crash_base, crash_size);

>  

>  	pr_info("Reserving %lldMB of memory at %lldMB for crashkernel\n",

> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c

> index 17243e43184e..b7c75845407a 100644

> --- a/arch/arm64/mm/mmu.c

> +++ b/arch/arm64/mm/mmu.c

> @@ -22,6 +22,8 @@

>  #include <linux/kernel.h>

>  #include <linux/errno.h>

>  #include <linux/init.h>

> +#include <linux/ioport.h>

> +#include <linux/kexec.h>

>  #include <linux/libfdt.h>

>  #include <linux/mman.h>

>  #include <linux/nodemask.h>

> @@ -817,3 +819,27 @@ int pmd_clear_huge(pmd_t *pmd)

>  	pmd_clear(pmd);

>  	return 1;

>  }

> +

> +#ifdef CONFIG_KEXEC_CORE

> +void arch_kexec_protect_crashkres(void)

> +{

> +	flush_tlb_all();

> +

> +	create_mapping_late(crashk_res.start, __phys_to_virt(crashk_res.start),

> +			    resource_size(&crashk_res), PAGE_KERNEL_RO);

> +

> +	/* flush the TLBs after updating live kernel mappings */

> +	flush_tlb_all();

> +}

> +

> +void arch_kexec_unprotect_crashkres(void)

> +{

> +	flush_tlb_all();

> +

> +	create_mapping_late(crashk_res.start, __phys_to_virt(crashk_res.start),

> +			    resource_size(&crashk_res), PAGE_KERNEL);

> +

> +	/* flush the TLBs after updating live kernel mappings */

> +	flush_tlb_all();

> +}

> +#endif

> ===>8===


AKASHI Takahiro Jan. 19, 2017, 9:49 a.m. UTC | #4
On Tue, Jan 17, 2017 at 11:54:42AM +0000, Mark Rutland wrote:
> On Tue, Jan 17, 2017 at 05:20:44PM +0900, AKASHI Takahiro wrote:

> > On Fri, Jan 13, 2017 at 11:39:15AM +0000, Mark Rutland wrote:

> > > Great! I think it would be better to follow the approach of

> > > mark_rodata_ro(), rather than opening up set_memory_*(), but otherwise,

> > > it looks like it should work.

> > 

> > I'm not quite sure what the approach of mark_rodata_ro() means, but

> > I found that using create_mapping_late() may cause two problems:

> > 

> > 1) it fails when PTE_CONT bits mismatch between an old and new mmu entry.

> >    This can happen, say, if the memory range for crash dump kernel

> >    starts in the middle of _contiguous_ pages.

> 

> That should only happen if we try to remap a segment different to what

> we originally mapped.

> 

> I was intending that we'd explicitly map the reserved region separately

> in the boot path, like we do for kernel segments in map_kernel(). We

> would allow sections and/or CONT entries.

> 

> Then, in __map_memblock() we'd then skip that range as we do for the

> linear map alias of the kernel image.

> 

> That way, we can later use create_mapping_late for that same region, and

> it should handle sections and/or CONT entries in the exact same way as

> it does for the kernel image segments in mark_rodata_ro().


I see.
Which one do you prefer, yours above or my (second) solution?
Either way, they do almost the same thing in terms of mapping.

> > 2) The control code page, of one-page size, is still written out in

> >    machine_kexec() which is called at a crash, and this means that

> >    the range must be writable even after kexec_load(), but

> >    create_mapping_late() does not handle the case of changing attributes

> >    for a single page that lies within a _section_ mapping.

> >    We cannot create a single-page mapping for the control page since the

> >    address of that page is not determined at boot time.

> 

> That is a problem. I'm not sure I follow how set_memory_*() helps here

> though?

> 

> > As for (1), we need to call memblock_isolate_range() to make the region

> > an independent one.

> > 

> > > Either way, this still leaves us with an RO alias on crashed cores (and

> > > potential cache attribute mismatches in future). Do we need to read from

> > > the region later,

> > 

> > I believe not, but the region must be _writable_ as I mentioned in (2) above.

> > To avoid this issue, we have to move copying the control code page

> > to machine_kexec_prepare(), which is called during kexec_load(), when

> > the region is still writable anyway.

> > I want Geoff to affirm that this change is safe.

> > 

> > (See my second solution below.)

> 

> From a quick scan that looks ok.

> 

> > > or could we unmap it entirely?

> > 

> > given the change above, I think we can.


I'm now asking Geoff ...

> 

> Great!

>

> > Is there any code to re-use especially for unmapping?

> 

> I don't think we have much code useful for unmapping. We could re-use 

> create_mapping_late for this, passing a set of prot bits that means the

> entries are invalid (e.g. have a PAGE_KERNEL_INVALID).


Do you really think that we should totally invalidate mmu entries?
I guess that, given proper cache & TLB flush operations, RO attribute is
good enough for memory consistency, no?
(No one accesses the region, as I said, except when re-loading the
crash dump kernel.)

> We'd have to perform the TLB invalidation ourselves, but that shouldn't

> be too painful.


Do we need to invalidate TLBs not only before but also after changing
permission attributes, as mark_rodata_ro() does?

-Takahiro AKASHI

> Thanks,

> Mark.

> 

> > ===8<===

> > diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c

> > index c0fc3d458195..80a52e9aaf73 100644

> > --- a/arch/arm64/kernel/machine_kexec.c

> > +++ b/arch/arm64/kernel/machine_kexec.c

> > @@ -26,8 +26,6 @@

> >  extern const unsigned char arm64_relocate_new_kernel[];

> >  extern const unsigned long arm64_relocate_new_kernel_size;

> >  

> > -static unsigned long kimage_start;

> > -

> >  /**

> >   * kexec_image_info - For debugging output.

> >   */

> > @@ -68,7 +66,7 @@ void machine_kexec_cleanup(struct kimage *kimage)

> >   */

> >  int machine_kexec_prepare(struct kimage *kimage)

> >  {

> > -	kimage_start = kimage->start;

> > +	void *reboot_code_buffer;

> >  

> >  	kexec_image_info(kimage);

> >  

> > @@ -77,6 +75,21 @@ int machine_kexec_prepare(struct kimage *kimage)

> >  		return -EBUSY;

> >  	}

> >  

> > +	reboot_code_buffer =

> > +			phys_to_virt(page_to_phys(kimage->control_code_page));

> > +

> > +	/*

> > +	 * Copy arm64_relocate_new_kernel to the reboot_code_buffer for use

> > +	 * after the kernel is shut down.

> > +	 */

> > +	memcpy(reboot_code_buffer, arm64_relocate_new_kernel,

> > +		arm64_relocate_new_kernel_size);

> > +

> > +	/* Flush the reboot_code_buffer in preparation for its execution. */

> > +	__flush_dcache_area(reboot_code_buffer, arm64_relocate_new_kernel_size);

> > +	flush_icache_range((uintptr_t)reboot_code_buffer,

> > +		arm64_relocate_new_kernel_size);

> > +

> >  	return 0;

> >  }

> >  

> > @@ -147,7 +160,6 @@ static void kexec_segment_flush(const struct kimage *kimage)

> >  void machine_kexec(struct kimage *kimage)

> >  {

> >  	phys_addr_t reboot_code_buffer_phys;

> > -	void *reboot_code_buffer;

> >  

> >  	/*

> >  	 * New cpus may have become stuck_in_kernel after we loaded the image.

> > @@ -156,7 +168,6 @@ void machine_kexec(struct kimage *kimage)

> >  			!WARN_ON(kimage == kexec_crash_image));

> >  

> >  	reboot_code_buffer_phys = page_to_phys(kimage->control_code_page);

> > -	reboot_code_buffer = phys_to_virt(reboot_code_buffer_phys);

> >  

> >  	kexec_image_info(kimage);

> >  

> > @@ -164,26 +175,12 @@ void machine_kexec(struct kimage *kimage)

> >  		kimage->control_code_page);

> >  	pr_debug("%s:%d: reboot_code_buffer_phys:  %pa\n", __func__, __LINE__,

> >  		&reboot_code_buffer_phys);

> > -	pr_debug("%s:%d: reboot_code_buffer:       %p\n", __func__, __LINE__,

> > -		reboot_code_buffer);

> >  	pr_debug("%s:%d: relocate_new_kernel:      %p\n", __func__, __LINE__,

> >  		arm64_relocate_new_kernel);

> >  	pr_debug("%s:%d: relocate_new_kernel_size: 0x%lx(%lu) bytes\n",

> >  		__func__, __LINE__, arm64_relocate_new_kernel_size,

> >  		arm64_relocate_new_kernel_size);

> >  

> > -	/*

> > -	 * Copy arm64_relocate_new_kernel to the reboot_code_buffer for use

> > -	 * after the kernel is shut down.

> > -	 */

> > -	memcpy(reboot_code_buffer, arm64_relocate_new_kernel,

> > -		arm64_relocate_new_kernel_size);

> > -

> > -	/* Flush the reboot_code_buffer in preparation for its execution. */

> > -	__flush_dcache_area(reboot_code_buffer, arm64_relocate_new_kernel_size);

> > -	flush_icache_range((uintptr_t)reboot_code_buffer,

> > -		arm64_relocate_new_kernel_size);

> > -

> >  	/* Flush the kimage list and its buffers. */

> >  	kexec_list_flush(kimage);

> >  

> > @@ -206,7 +203,7 @@ void machine_kexec(struct kimage *kimage)

> >  	 */

> >  

> >  	cpu_soft_restart(kimage != kexec_crash_image,

> > -		reboot_code_buffer_phys, kimage->head, kimage_start, 0);

> > +		reboot_code_buffer_phys, kimage->head, kimage->start, 0);

> >  

> >  	BUG(); /* Should never get here. */

> >  }

> > diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c

> > index 569ec3325bc8..e4cc170edc0c 100644

> > --- a/arch/arm64/mm/init.c

> > +++ b/arch/arm64/mm/init.c

> > @@ -90,6 +90,7 @@ early_param("initrd", early_initrd);

> >  static void __init reserve_crashkernel(void)

> >  {

> >  	unsigned long long crash_size, crash_base;

> > +	int start_rgn, end_rgn;

> >  	int ret;

> >  

> >  	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),

> > @@ -120,6 +121,8 @@ static void __init reserve_crashkernel(void)

> >  			return;

> >  		}

> >  	}

> > +	memblock_isolate_range(&memblock.memory, crash_base, crash_size,

> > +			&start_rgn, &end_rgn);

> >  	memblock_reserve(crash_base, crash_size);

> >  

> >  	pr_info("Reserving %lldMB of memory at %lldMB for crashkernel\n",

> > diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c

> > index 17243e43184e..b7c75845407a 100644

> > --- a/arch/arm64/mm/mmu.c

> > +++ b/arch/arm64/mm/mmu.c

> > @@ -22,6 +22,8 @@

> >  #include <linux/kernel.h>

> >  #include <linux/errno.h>

> >  #include <linux/init.h>

> > +#include <linux/ioport.h>

> > +#include <linux/kexec.h>

> >  #include <linux/libfdt.h>

> >  #include <linux/mman.h>

> >  #include <linux/nodemask.h>

> > @@ -817,3 +819,27 @@ int pmd_clear_huge(pmd_t *pmd)

> >  	pmd_clear(pmd);

> >  	return 1;

> >  }

> > +

> > +#ifdef CONFIG_KEXEC_CORE

> > +void arch_kexec_protect_crashkres(void)

> > +{

> > +	flush_tlb_all();

> > +

> > +	create_mapping_late(crashk_res.start, __phys_to_virt(crashk_res.start),

> > +			    resource_size(&crashk_res), PAGE_KERNEL_RO);

> > +

> > +	/* flush the TLBs after updating live kernel mappings */

> > +	flush_tlb_all();

> > +}

> > +

> > +void arch_kexec_unprotect_crashkres(void)

> > +{

> > +	flush_tlb_all();

> > +

> > +	create_mapping_late(crashk_res.start, __phys_to_virt(crashk_res.start),

> > +			    resource_size(&crashk_res), PAGE_KERNEL);

> > +

> > +	/* flush the TLBs after updating live kernel mappings */

> > +	flush_tlb_all();

> > +}

> > +#endif

> > ===>8===


Mark Rutland Jan. 19, 2017, 11:28 a.m. UTC | #5
On Thu, Jan 19, 2017 at 06:49:42PM +0900, AKASHI Takahiro wrote:
> On Tue, Jan 17, 2017 at 11:54:42AM +0000, Mark Rutland wrote:

> > On Tue, Jan 17, 2017 at 05:20:44PM +0900, AKASHI Takahiro wrote:

> > > On Fri, Jan 13, 2017 at 11:39:15AM +0000, Mark Rutland wrote:

> > > > Great! I think it would be better to follow the approach of

> > > > mark_rodata_ro(), rather than opening up set_memory_*(), but otherwise,

> > > > it looks like it should work.

> > > 

> > > I'm not quite sure what the approach of mark_rodata_ro() means, but

> > > I found that using create_mapping_late() may cause two problems:

> > > 

> > > 1) it fails when PTE_CONT bits mismatch between an old and new mmu entry.

> > >    This can happen, say, if the memory range for crash dump kernel

> > >    starts in the middle of _contiguous_ pages.

> > 

> > That should only happen if we try to remap a segment different to what

> > we originally mapped.

> > 

> > I was intending that we'd explicitly map the reserved region separately

> > in the boot path, like we do for kernel segments in map_kernel(). We

> > would allow sections and/or CONT entries.

> > 

> > Then, in __map_memblock() we'd then skip that range as we do for the

> > linear map alias of the kernel image.

> > 

> > That way, we can later use create_mapping_late for that same region, and

> > it should handle sections and/or CONT entries in the exact same way as

> > it does for the kernel image segments in mark_rodata_ro().

> 

> I see.

> Which one do you prefer, yours above or my (second) solution?

> Either way, they do almost the same thing in terms of mapping.


While both should work, I'd prefer to match the existing map_kernel()
logic, (i.e. my suggestion above), for consistency.

> > I don't think we have much code useful for unmapping. We could re-use 

> > create_mapping_late for this, passing a set of prot bits that means the

> > entries are invalid (e.g. have a PAGE_KERNEL_INVALID).

> 

> Do you really think that we should totally invalidate mmu entries?

> I guess that, given proper cache & TLB flush operations, RO attribute is

> good enough for memory consistency, no?

> (No one accesses the region, as I said, except when re-loading the

> crash dump kernel.)


My worry is that the first kernel and kdump kernel may map (portions of)
the region with potentially conflicting memory attributes. So it would
be necessary to completely unmap the region.

You raise a good point that this would also mean we need to perform some
cache maintenance, which makes that a little more painful. We'd need a
sequence like:

* Unmap the region
* TLB invalidation
* Remap the region with non-cacheable attributes
* Cache maintenance
* Unmap the region
* TLB invalidation

> > We'd have to perform the TLB invalidation ourselves, but that shouldn't

> > be too painful.

> 

> Do we need to invalidate TLBs not only before but also after changing

> permission attributes, as mark_rodata_ro() does?


I believe we'd only have to perform the TLB invalidation after the
change of attributes.

Thanks,
Mark.

AKASHI Takahiro Jan. 23, 2017, 9:51 a.m. UTC | #6
Mark,

On Thu, Jan 19, 2017 at 11:28:50AM +0000, Mark Rutland wrote:
> On Thu, Jan 19, 2017 at 06:49:42PM +0900, AKASHI Takahiro wrote:

> > On Tue, Jan 17, 2017 at 11:54:42AM +0000, Mark Rutland wrote:

> > > On Tue, Jan 17, 2017 at 05:20:44PM +0900, AKASHI Takahiro wrote:

> > > > On Fri, Jan 13, 2017 at 11:39:15AM +0000, Mark Rutland wrote:

> > > > > Great! I think it would be better to follow the approach of

> > > > > mark_rodata_ro(), rather than opening up set_memory_*(), but otherwise,

> > > > > it looks like it should work.

> > > > 

> > > > I'm not quite sure what the approach of mark_rodata_ro() means, but

> > > > I found that using create_mapping_late() may cause two problems:

> > > > 

> > > > 1) it fails when PTE_CONT bits mismatch between an old and new mmu entry.

> > > >    This can happen, say, if the memory range for crash dump kernel

> > > >    starts in the mid of _continuous_ pages.

> > > 

> > > That should only happen if we try to remap a segment different to what

> > > we originally mapped.

> > > 

> > > I was intending that we'd explicitly map the reserved region separately

> > > in the boot path, like we do for kernel segments in map_kernel(). We

> > > would allow sections and/or CONT entries.

> > > 

> > > Then, in __map_memblock() we'd then skip that range as we do for the

> > > linear map alias of the kernel image.

> > > 

> > > That way, we can later use create_mapping_late for that same region, and

> > > it should handle sections and/or CONT entries in the exact same way as

> > > it does for the kernel image segments in mark_rodata_ro().

> > 

> > I see.

> > Which one do you prefer, yours above or my (second) solution?

> > Either way, they do almost the same thing in terms of mapping.

> 

> While both should work, I'd prefer to match the existing map_kernel()

> logic, (i.e. my suggestion above), for consistency.


OK

> > > I don't think we have much code useful for unmapping. We could re-use 

> > > create_mapping_late for this, passing a set of prot bits that means the

> > > entries are invalid (e.g. have a PAGE_KERNEL_INVALID).

> > 

> > Do you really think that we should totally invalidate mmu entries?

> > I guess that, given proper cache & TLB flush operations, RO attribute is

> > good enough for memory consistency, no?

> > (No one accesses the region, as I said, except when re-loading the

> > crash dump kernel.)

> 

> My worry is that the first kernel and kdump kernel may map (portions of)

> > the region with potentially conflicting memory attributes. So it would

> be necessary to completely unmap the region.


I think that this can happen only if the second kernel boots up,
leaving non-crashed cpus still running for some reason.

> You raise a good point that this would also mean we need to perform some

> cache maintenance, which makes that a little more painful. We'd need a

> sequence like:

> 

> * Unmap the region

> * TLB invalidation

> * Remap the region with non-cacheable attributes

> * Cache maintenance

> * Unmap the region

> * TLB invalidation


I don't get why we need to remap the region and do cache
maintenance here. Please elaborate a bit more?
My current implementation of arch_kexec_protect_crashkres() is:

        kexec_segment_flush(kexec_crash_image);
        create_mapping_late(crashk_res.start, ..., __pgprot(0));
                                                or PAGE_KERNEL_INVALID
        flush_tlb_all();

kexec_segment_flush() will eventually do dcache-flush for all the modified
data in crash dump kernel memory.

> > > We'd have to perform the TLB invalidation ourselves, but that shouldn't

> > > be too painful.

> > 

> > Do we need to invalidate TLBs not only before but also after changing

> > permission attributes, as mark_rodata_ro() does?

> 

> I believe we'd only have to perform the TLB invalidation after the

> change of attributes.


OK

Thanks,
-Takahiro AKASHI

> Thanks,

> Mark.


Mark Rutland Jan. 23, 2017, 10:23 a.m. UTC | #7
On Mon, Jan 23, 2017 at 06:51:46PM +0900, AKASHI Takahiro wrote:
> Mark,

> 

> On Thu, Jan 19, 2017 at 11:28:50AM +0000, Mark Rutland wrote:

> > On Thu, Jan 19, 2017 at 06:49:42PM +0900, AKASHI Takahiro wrote:

> > > On Tue, Jan 17, 2017 at 11:54:42AM +0000, Mark Rutland wrote:

> > > > On Tue, Jan 17, 2017 at 05:20:44PM +0900, AKASHI Takahiro wrote:

> > > > > On Fri, Jan 13, 2017 at 11:39:15AM +0000, Mark Rutland wrote:


> > > > I don't think we have much code useful for unmapping. We could re-use 

> > > > create_mapping_late for this, passing a set of prot bits that means the

> > > > entries are invalid (e.g. have a PAGE_KERNEL_INVALID).

> > > 

> > > Do you really think that we should totally invalidate mmu entries?

> > > I guess that, given proper cache & TLB flush operations, RO attribute is

> > > good enough for memory consistency, no?

> > > (No one accesses the region, as I said, except when re-loading the

> > > crash dump kernel.)

> > 

> > My worry is that the first kernel and kdump kernel may map (portions of)

> > > the region with potentially conflicting memory attributes. So it would

> > be necessary to completely unmap the region.

> 

> I think that this can happen only if the second kernel boots up,

> leaving non-crashed cpus still running for some reason.


Yes. I was considering a kdump case where a secondary was stuck in the
first kernel.

> > You raise a good point that this would also mean we need to perform some

> > cache maintenance, which makes that a little more painful. We'd need a

> > sequence like:

> > 

> > * Unmap the region

> > * TLB invalidation

> > * Remap the region with non-cacheable attributes

> > * Cache maintenance

> > * Unmap the region

> > * TLB invalidation

> 

> I don't get why we need to remap the region and do cache

> maintenance here. Please elaborate a bit more?


I think I was wrong, and we don't need to. Sorry about that.

My thought was that to ensure that there aren't stale lines with
differing attributes, we'd need to do a clean+invalidate while the
caches are guaranteed not to allocate anything further. Hence, we'd need
to use a non-cacheable mapping to perform the clean+invalidate.

However, I now think that so long as we unmap the range, this shouldn't
matter. The new kernel can perform the maintenance if it wishes to use
different attributes, similarly to what the first kernel must do per the
boot protocol.

> My current implementation of arch_kexec_protect_crashkres() is:

> 

>         kexec_segment_flush(kexec_crash_image);

>         create_mapping_late(crashk_res.start, ..., __pgprot(0));

>                                                 or PAGE_KERNEL_INVALID

>         flush_tlb_all();

> 

> kexec_segment_flush() will eventually do dcache-flush for all the modified

> data in crash dump kernel memory.


I now think this should be fine, per the above.

Thanks,
Mark.

AKASHI Takahiro Jan. 24, 2017, 7:55 a.m. UTC | #8
On Mon, Jan 23, 2017 at 10:23:15AM +0000, Mark Rutland wrote:
> On Mon, Jan 23, 2017 at 06:51:46PM +0900, AKASHI Takahiro wrote:

> > Mark,

> > 

> > On Thu, Jan 19, 2017 at 11:28:50AM +0000, Mark Rutland wrote:

> > > On Thu, Jan 19, 2017 at 06:49:42PM +0900, AKASHI Takahiro wrote:

> > > > On Tue, Jan 17, 2017 at 11:54:42AM +0000, Mark Rutland wrote:

> > > > > On Tue, Jan 17, 2017 at 05:20:44PM +0900, AKASHI Takahiro wrote:

> > > > > > On Fri, Jan 13, 2017 at 11:39:15AM +0000, Mark Rutland wrote:

> 

> > > > > I don't think we have much code useful for unmapping. We could re-use 

> > > > > create_mapping_late for this, passing a set of prot bits that means the

> > > > > entries are invalid (e.g. have a PAGE_KERNEL_INVALID).

> > > > 

> > > > Do you really think that we should totally invalidate mmu entries?

> > > > I guess that, given proper cache & TLB flush operations, RO attribute is

> > > > good enough for memory consistency, no?

> > > > (No one accesses the region, as I said, except when re-loading the

> > > > crash dump kernel.)

> > > 

> > > My worry is that the first kernel and kdump kernel may map (portions of)

> > > > the region with potentially conflicting memory attributes. So it would

> > > be necessary to completely unmap the region.

> > 

> > I think that this can happen only if the second kernel boots up,

> > leaving non-crashed cpus still running for some reason.

> 

> Yes. I was considering a kdump case where a secondary was stuck in the

> first kernel.

> 

> > > You raise a good point that this would also mean we need to perform some

> > > cache maintenance, which makes that a little more painful. We'd need a

> > > sequence like:

> > > 

> > > * Unmap the region

> > > * TLB invalidation

> > > * Remap the region with non-cacheable attributes

> > > * Cache maintenance

> > > * Unmap the region

> > > * TLB invalidation

> > 

> > I don't get why we need to remap the region and do cache

> > maintenance here. Please elaborate a bit more?

> 

> I think I was wrong, and we don't need to. Sorry about that.

> 

> My thought was that to ensure that there aren't stale lines with

> differing attributes, we'd need to do a clean+invalidate while the

> caches are guaranteed not to allocate anything further. Hence, we'd need

> to use a non-cacheable mapping to perform the clean+invalidate.

> 

> However, I now think that so long as we unmap the range, this shouldn't

> matter. The new kernel can perform the maintenance if it wishes to use

> different attributes, similarly to what the first kernel must do per the

> boot protocol.

> 

> > My current implementation of arch_kexec_protect_crashkres() is:

> > 

> >         kexec_segment_flush(kexec_crash_image);

> >         create_mapping_late(crashk_res.start, ..., __pgprot(0));

> >                                                 or PAGE_KERNEL_INVALID

> >         flush_tlb_all();

> > 

> > kexec_segment_flush() will eventually do dcache-flush for all the modified

> > data in crash dump kernel memory.

> 

> I now think this should be fine, per the above.


OK.
I think I can now see the light at the end of the tunnel :)

-Takahiro AKASHI

> Thanks,

> Mark.



Patch

===8<===
diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
index c0fc3d458195..bb21c0473b8e 100644
--- a/arch/arm64/kernel/machine_kexec.c
+++ b/arch/arm64/kernel/machine_kexec.c
@@ -211,6 +211,44 @@  void machine_kexec(struct kimage *kimage)
 	BUG(); /* Should never get here. */
 }
 
+static int kexec_mark_range(unsigned long start, unsigned long end,
+							bool protect)
+{
+	unsigned int nr_pages;
+
+	if (!end || start >= end)
+		return 0;
+
+	nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
+
+	if (protect)
+		return set_memory_ro(__phys_to_virt(start), nr_pages);
+	else
+		return set_memory_rw(__phys_to_virt(start), nr_pages);
+}
+
+static void kexec_mark_crashkres(bool protect)
+{
+	unsigned long control;
+
+	/* Don't touch the control code page used in crash_kexec().*/
+	control = page_to_phys(kexec_crash_image->control_code_page);
+	kexec_mark_range(crashk_res.start, control - 1, protect);
+
+	control += KEXEC_CONTROL_PAGE_SIZE;
+	kexec_mark_range(control, crashk_res.end, protect);
+}
+
+void arch_kexec_protect_crashkres(void)
+{
+	kexec_mark_crashkres(true);
+}
+
+void arch_kexec_unprotect_crashkres(void)
+{
+	kexec_mark_crashkres(false);
+}
+
 static void machine_kexec_mask_interrupts(void)
 {
 	unsigned int i;
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 569ec3325bc8..764ec89c4f76 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -90,6 +90,7 @@  early_param("initrd", early_initrd);
 static void __init reserve_crashkernel(void)
 {
 	unsigned long long crash_size, crash_base;
+	int start_rgn, end_rgn;
 	int ret;
 
 	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
@@ -121,6 +122,9 @@  static void __init reserve_crashkernel(void)
 		}
 	}
 	memblock_reserve(crash_base, crash_size);
+	memblock_isolate_range(&memblock.memory, crash_base, crash_size,
+			&start_rgn, &end_rgn);
+
 
 	pr_info("Reserving %lldMB of memory at %lldMB for crashkernel\n",
 		crash_size >> 20, crash_base >> 20);
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 17243e43184e..0f60f19c287b 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -22,6 +22,7 @@ 
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/init.h>
+#include <linux/kexec.h>
 #include <linux/libfdt.h>
 #include <linux/mman.h>
 #include <linux/nodemask.h>
@@ -362,6 +363,17 @@  static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end
 	unsigned long kernel_start = __pa(_text);
 	unsigned long kernel_end = __pa(__init_begin);
 
+#ifdef CONFIG_KEXEC_CORE
+	if (crashk_res.end && start >= crashk_res.start &&
+			end <= (crashk_res.end + 1)) {
+		__create_pgd_mapping(pgd, start, __phys_to_virt(start),
+				     end - start, PAGE_KERNEL,
+				     early_pgtable_alloc,
+				     true);
+		return;
+	}
+#endif
+
 	/*
 	 * Take care not to create a writable alias for the
 	 * read-only text and rodata sections of the kernel image.