Message ID | 20180619064424.6642-2-takahiro.akashi@linaro.org |
---|---|
State | Superseded |
Headers | show |
Series | arm64: kexec,kdump: fix boot failures on acpi-only system | expand |
On 06/19/2018 01:44 AM, AKASHI Takahiro wrote: > +static int __init reserve_memblock_reserved_regions(void) > +{ > + phys_addr_t start, end, roundup_end = 0; > + struct resource *mem, *res; > + u64 i; > + > + for_each_reserved_mem_region(i, &start, &end) { > + if (end <= roundup_end) > + continue; /* done already */ > + > + start = __pfn_to_phys(PFN_DOWN(start)); > + end = __pfn_to_phys(PFN_UP(end)) - 1; > + roundup_end = end; > + > + res = kzalloc(sizeof(*res), GFP_ATOMIC); > + if (WARN_ON(!res)) > + return -ENOMEM; > + res->start = start; > + res->end = end; > + res->name = "reserved"; > + res->flags = IORESOURCE_MEM; > + > + mem = request_resource_conflict(&iomem_resource, res); > + /* > + * We expected memblock_reserve() regions to conflict with > + * memory created by request_standard_resources(). > + */ > + if (WARN_ON_ONCE(!mem)) > + continue; > + kfree(res); Why is kfree() after the conditional continue? This is a memory leak. > + > + reserve_region_with_split(mem, start, end, "reserved"); > + } > + > + return 0; > +} > +arch_initcall(reserve_memblock_reserved_regions); > + > u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID }; > > void __init setup_arch(char **cmdline_p) >
Hi Dave, On 19/06/18 14:37, Dave Kleikamp wrote: > On 06/19/2018 01:44 AM, AKASHI Takahiro wrote: >> +static int __init reserve_memblock_reserved_regions(void) >> +{ >> + phys_addr_t start, end, roundup_end = 0; >> + struct resource *mem, *res; >> + u64 i; >> + >> + for_each_reserved_mem_region(i, &start, &end) { >> + if (end <= roundup_end) >> + continue; /* done already */ >> + >> + start = __pfn_to_phys(PFN_DOWN(start)); >> + end = __pfn_to_phys(PFN_UP(end)) - 1; >> + roundup_end = end; >> + >> + res = kzalloc(sizeof(*res), GFP_ATOMIC); >> + if (WARN_ON(!res)) >> + return -ENOMEM; >> + res->start = start; >> + res->end = end; >> + res->name = "reserved"; >> + res->flags = IORESOURCE_MEM; >> + >> + mem = request_resource_conflict(&iomem_resource, res); >> + /* >> + * We expected memblock_reserve() regions to conflict with >> + * memory created by request_standard_resources(). >> + */ >> + if (WARN_ON_ONCE(!mem)) >> + continue; >> + kfree(res); > > Why is kfree() after the conditional continue? This is a memory leak. request_resource_conflict() inserts res into the iomem_resource tree, or returns the conflict if there is already something at this address. We expect something at this address: a 'System RAM' section added by request_resource(). This code wants the conflicting 'System RAM' entry so that reserve_region_with_split() can fill in the gaps below it with 'reserved'. See the commit-message for an example. If there was no conflict, it means the memory map doesn't look like we expect, we can't pass NULL to reserve_region_with_split(), and we just inserted the 'reserved' at the top level. Freeing res at this point would be a use-after-free hanging from /proc/iomem. This code generates a WARN_ON_ONCE() and leaves the 'reserved' description where it is. Trying to cleanup here is pointless, we can remove the resource entry and free it ... but we still want to report it as reserved, which is what just happened, just not quite how we we wanted it. If you ever see this warning, it means some RAM stopped being nomap between request_resources() and reserve_memblock_reserved_regions(). I can't find any case where that ever happens. If all that makes sense: how can I improve the comment above the WARN_ON_ONCE() to make it clearer? Thanks, James >> + >> + reserve_region_with_split(mem, start, end, "reserved"); >> + } >> + >> + return 0; >> +} >> +arch_initcall(reserve_memblock_reserved_regions); >> + >> u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID }; >> >> void __init setup_arch(char **cmdline_p) >>
On 06/19/2018 10:00 AM, James Morse wrote: > Hi Dave, > > On 19/06/18 14:37, Dave Kleikamp wrote: >> On 06/19/2018 01:44 AM, AKASHI Takahiro wrote: >>> +static int __init reserve_memblock_reserved_regions(void) >>> +{ >>> + phys_addr_t start, end, roundup_end = 0; >>> + struct resource *mem, *res; >>> + u64 i; >>> + >>> + for_each_reserved_mem_region(i, &start, &end) { >>> + if (end <= roundup_end) >>> + continue; /* done already */ >>> + >>> + start = __pfn_to_phys(PFN_DOWN(start)); >>> + end = __pfn_to_phys(PFN_UP(end)) - 1; >>> + roundup_end = end; >>> + >>> + res = kzalloc(sizeof(*res), GFP_ATOMIC); >>> + if (WARN_ON(!res)) >>> + return -ENOMEM; >>> + res->start = start; >>> + res->end = end; >>> + res->name = "reserved"; >>> + res->flags = IORESOURCE_MEM; >>> + >>> + mem = request_resource_conflict(&iomem_resource, res); >>> + /* >>> + * We expected memblock_reserve() regions to conflict with >>> + * memory created by request_standard_resources(). >>> + */ >>> + if (WARN_ON_ONCE(!mem)) >>> + continue; >>> + kfree(res); >> >> Why is kfree() after the conditional continue? This is a memory leak. > > request_resource_conflict() inserts res into the iomem_resource tree, or returns > the conflict if there is already something at this address. > > We expect something at this address: a 'System RAM' section added by > request_resource(). This code wants the conflicting 'System RAM' entry so that > reserve_region_with_split() can fill in the gaps below it with 'reserved'. See > the commit-message for an example. > > If there was no conflict, it means the memory map doesn't look like we expect, > we can't pass NULL to reserve_region_with_split(), and we just inserted the > 'reserved' at the top level. Freeing res at this point would be a use-after-free > hanging from /proc/iomem. > This code generates a WARN_ON_ONCE() and leaves the 'reserved' description where > it is. Okay. I get it. > Trying to cleanup here is pointless, we can remove the resource entry and free > it ... but we still want to report it as reserved, which is what just happened, > just not quite how we we wanted it. > > If you ever see this warning, it means some RAM stopped being nomap between > request_resources() and reserve_memblock_reserved_regions(). I can't find any > case where that ever happens. > > > If all that makes sense: how can I improve the comment above the WARN_ON_ONCE() > to make it clearer? I guess something like you described above. /* * We expect a 'System RAM' section at this address. In the unexpected * case where one is not found, request_resource_conflict() will insert * res into the iomem_resource tree. */ Do you think this is clearer? > > > Thanks, > > James > > >>> + >>> + reserve_region_with_split(mem, start, end, "reserved"); >>> + } >>> + >>> + return 0; >>> +} >>> +arch_initcall(reserve_memblock_reserved_regions); >>> + >>> u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID }; >>> >>> void __init setup_arch(char **cmdline_p) >>> >
On Tue, Jun 19, 2018 at 10:22:46AM -0500, Dave Kleikamp wrote: > On 06/19/2018 10:00 AM, James Morse wrote: > > Hi Dave, > > > > On 19/06/18 14:37, Dave Kleikamp wrote: > >> On 06/19/2018 01:44 AM, AKASHI Takahiro wrote: > >>> +static int __init reserve_memblock_reserved_regions(void) > >>> +{ > >>> + phys_addr_t start, end, roundup_end = 0; > >>> + struct resource *mem, *res; > >>> + u64 i; > >>> + > >>> + for_each_reserved_mem_region(i, &start, &end) { > >>> + if (end <= roundup_end) > >>> + continue; /* done already */ > >>> + > >>> + start = __pfn_to_phys(PFN_DOWN(start)); > >>> + end = __pfn_to_phys(PFN_UP(end)) - 1; > >>> + roundup_end = end; > >>> + > >>> + res = kzalloc(sizeof(*res), GFP_ATOMIC); > >>> + if (WARN_ON(!res)) > >>> + return -ENOMEM; > >>> + res->start = start; > >>> + res->end = end; > >>> + res->name = "reserved"; > >>> + res->flags = IORESOURCE_MEM; > >>> + > >>> + mem = request_resource_conflict(&iomem_resource, res); > >>> + /* > >>> + * We expected memblock_reserve() regions to conflict with > >>> + * memory created by request_standard_resources(). > >>> + */ > >>> + if (WARN_ON_ONCE(!mem)) > >>> + continue; > >>> + kfree(res); > >> > >> Why is kfree() after the conditional continue? This is a memory leak. > > > > request_resource_conflict() inserts res into the iomem_resource tree, or returns > > the conflict if there is already something at this address. > > > > We expect something at this address: a 'System RAM' section added by > > request_resource(). This code wants the conflicting 'System RAM' entry so that > > reserve_region_with_split() can fill in the gaps below it with 'reserved'. See > > the commit-message for an example. > > > > If there was no conflict, it means the memory map doesn't look like we expect, > > we can't pass NULL to reserve_region_with_split(), and we just inserted the > > 'reserved' at the top level. Freeing res at this point would be a use-after-free > > hanging from /proc/iomem. > > This code generates a WARN_ON_ONCE() and leaves the 'reserved' description where > > it is. > > Okay. I get it. > > > Trying to cleanup here is pointless, we can remove the resource entry and free > > it ... but we still want to report it as reserved, which is what just happened, > > just not quite how we we wanted it. > > > > If you ever see this warning, it means some RAM stopped being nomap between > > request_resources() and reserve_memblock_reserved_regions(). I can't find any > > case where that ever happens. > > > > > > If all that makes sense: how can I improve the comment above the WARN_ON_ONCE() > > to make it clearer? > > I guess something like you described above. > > /* > * We expect a 'System RAM' section at this address. In the unexpected > * case where one is not found, request_resource_conflict() will insert > * res into the iomem_resource tree. > */ > > Do you think this is clearer? If this is the only change needed in my patchset, I'd like the maintainers to take care of it instead of my posting another version. Thanks, -Takahiro AKASHI > > > > > > Thanks, > > > > James > > > > > >>> + > >>> + reserve_region_with_split(mem, start, end, "reserved"); > >>> + } > >>> + > >>> + return 0; > >>> +} > >>> +arch_initcall(reserve_memblock_reserved_regions); > >>> + > >>> u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID }; > >>> > >>> void __init setup_arch(char **cmdline_p) > >>> > >
On Tue, Jul 3, 2018 at 12:17 PM, AKASHI Takahiro <takahiro.akashi@linaro.org> wrote: > On Tue, Jun 19, 2018 at 10:22:46AM -0500, Dave Kleikamp wrote: >> On 06/19/2018 10:00 AM, James Morse wrote: >> > Hi Dave, >> > >> > On 19/06/18 14:37, Dave Kleikamp wrote: >> >> On 06/19/2018 01:44 AM, AKASHI Takahiro wrote: >> >>> +static int __init reserve_memblock_reserved_regions(void) >> >>> +{ >> >>> + phys_addr_t start, end, roundup_end = 0; >> >>> + struct resource *mem, *res; >> >>> + u64 i; >> >>> + >> >>> + for_each_reserved_mem_region(i, &start, &end) { >> >>> + if (end <= roundup_end) >> >>> + continue; /* done already */ >> >>> + >> >>> + start = __pfn_to_phys(PFN_DOWN(start)); >> >>> + end = __pfn_to_phys(PFN_UP(end)) - 1; >> >>> + roundup_end = end; >> >>> + >> >>> + res = kzalloc(sizeof(*res), GFP_ATOMIC); >> >>> + if (WARN_ON(!res)) >> >>> + return -ENOMEM; >> >>> + res->start = start; >> >>> + res->end = end; >> >>> + res->name = "reserved"; >> >>> + res->flags = IORESOURCE_MEM; >> >>> + >> >>> + mem = request_resource_conflict(&iomem_resource, res); >> >>> + /* >> >>> + * We expected memblock_reserve() regions to conflict with >> >>> + * memory created by request_standard_resources(). >> >>> + */ >> >>> + if (WARN_ON_ONCE(!mem)) >> >>> + continue; >> >>> + kfree(res); >> >> >> >> Why is kfree() after the conditional continue? This is a memory leak. >> > >> > request_resource_conflict() inserts res into the iomem_resource tree, or returns >> > the conflict if there is already something at this address. >> > >> > We expect something at this address: a 'System RAM' section added by >> > request_resource(). This code wants the conflicting 'System RAM' entry so that >> > reserve_region_with_split() can fill in the gaps below it with 'reserved'. See >> > the commit-message for an example. >> > >> > If there was no conflict, it means the memory map doesn't look like we expect, >> > we can't pass NULL to reserve_region_with_split(), and we just inserted the >> > 'reserved' at the top level. Freeing res at this point would be a use-after-free >> > hanging from /proc/iomem. >> > This code generates a WARN_ON_ONCE() and leaves the 'reserved' description where >> > it is. >> >> Okay. I get it. >> >> > Trying to cleanup here is pointless, we can remove the resource entry and free >> > it ... but we still want to report it as reserved, which is what just happened, >> > just not quite how we we wanted it. >> > >> > If you ever see this warning, it means some RAM stopped being nomap between >> > request_resources() and reserve_memblock_reserved_regions(). I can't find any >> > case where that ever happens. >> > >> > >> > If all that makes sense: how can I improve the comment above the WARN_ON_ONCE() >> > to make it clearer? >> >> I guess something like you described above. >> >> /* >> * We expect a 'System RAM' section at this address. In the unexpected >> * case where one is not found, request_resource_conflict() will insert >> * res into the iomem_resource tree. >> */ >> >> Do you think this is clearer? > > If this is the only change needed in my patchset, I'd like the maintainers > to take care of it instead of my posting another version. > +1. crashkernel booting on arm64 machines which support boot via acpi tables is broken since a long time, so it would be great to get these into upstream asap. I think the comment can be addressed while picking up the patchset in the maintainer's tree. I am not sure whether it will go through the ARM tree (Will) or the EFI tree (Ard), but since this has a Tested-by (from myself) and Reviewed-by (from James), probably this can be pulled in to allow further tests via the maintainer's tree. Thanks, Bhupesh >> > >> > >> > Thanks, >> > >> > James >> > >> > >> >>> + >> >>> + reserve_region_with_split(mem, start, end, "reserved"); >> >>> + } >> >>> + >> >>> + return 0; >> >>> +} >> >>> +arch_initcall(reserve_memblock_reserved_regions); >> >>> + >> >>> u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID }; >> >>> >> >>> void __init setup_arch(char **cmdline_p) >> >>> >> >
On 07/03/2018 01:47 AM, AKASHI Takahiro wrote: > On Tue, Jun 19, 2018 at 10:22:46AM -0500, Dave Kleikamp wrote: >> On 06/19/2018 10:00 AM, James Morse wrote: >>> Hi Dave, >>> >>> On 19/06/18 14:37, Dave Kleikamp wrote: >>>> On 06/19/2018 01:44 AM, AKASHI Takahiro wrote: >>>>> +static int __init reserve_memblock_reserved_regions(void) >>>>> +{ >>>>> + phys_addr_t start, end, roundup_end = 0; >>>>> + struct resource *mem, *res; >>>>> + u64 i; >>>>> + >>>>> + for_each_reserved_mem_region(i, &start, &end) { >>>>> + if (end <= roundup_end) >>>>> + continue; /* done already */ >>>>> + >>>>> + start = __pfn_to_phys(PFN_DOWN(start)); >>>>> + end = __pfn_to_phys(PFN_UP(end)) - 1; >>>>> + roundup_end = end; >>>>> + >>>>> + res = kzalloc(sizeof(*res), GFP_ATOMIC); >>>>> + if (WARN_ON(!res)) >>>>> + return -ENOMEM; >>>>> + res->start = start; >>>>> + res->end = end; >>>>> + res->name = "reserved"; >>>>> + res->flags = IORESOURCE_MEM; >>>>> + >>>>> + mem = request_resource_conflict(&iomem_resource, res); >>>>> + /* >>>>> + * We expected memblock_reserve() regions to conflict with >>>>> + * memory created by request_standard_resources(). >>>>> + */ >>>>> + if (WARN_ON_ONCE(!mem)) >>>>> + continue; >>>>> + kfree(res); >>>> >>>> Why is kfree() after the conditional continue? This is a memory leak. >>> >>> request_resource_conflict() inserts res into the iomem_resource tree, or returns >>> the conflict if there is already something at this address. >>> >>> We expect something at this address: a 'System RAM' section added by >>> request_resource(). This code wants the conflicting 'System RAM' entry so that >>> reserve_region_with_split() can fill in the gaps below it with 'reserved'. See >>> the commit-message for an example. >>> >>> If there was no conflict, it means the memory map doesn't look like we expect, >>> we can't pass NULL to reserve_region_with_split(), and we just inserted the >>> 'reserved' at the top level. Freeing res at this point would be a use-after-free >>> hanging from /proc/iomem. >>> This code generates a WARN_ON_ONCE() and leaves the 'reserved' description where >>> it is. >> >> Okay. I get it. >> >>> Trying to cleanup here is pointless, we can remove the resource entry and free >>> it ... but we still want to report it as reserved, which is what just happened, >>> just not quite how we we wanted it. >>> >>> If you ever see this warning, it means some RAM stopped being nomap between >>> request_resources() and reserve_memblock_reserved_regions(). I can't find any >>> case where that ever happens. >>> >>> >>> If all that makes sense: how can I improve the comment above the WARN_ON_ONCE() >>> to make it clearer? >> >> I guess something like you described above. >> >> /* >> * We expect a 'System RAM' section at this address. In the unexpected >> * case where one is not found, request_resource_conflict() will insert >> * res into the iomem_resource tree. >> */ >> >> Do you think this is clearer? > > If this is the only change needed in my patchset, I'd like the maintainers > to take care of it instead of my posting another version. Agreed. The sooner this gets fixed, the better. Thanks, Dave > > Thanks, > -Takahiro AKASHI > >>> >>> >>> Thanks, >>> >>> James >>> >>> >>>>> + >>>>> + reserve_region_with_split(mem, start, end, "reserved"); >>>>> + } >>>>> + >>>>> + return 0; >>>>> +} >>>>> +arch_initcall(reserve_memblock_reserved_regions); >>>>> + >>>>> u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID }; >>>>> >>>>> void __init setup_arch(char **cmdline_p) >>>>> >>>
On 19 June 2018 at 08:44, AKASHI Takahiro <takahiro.akashi@linaro.org> wrote: > From: James Morse <james.morse@arm.com> > > There has been some confusion around what is necessary to prevent kexec > overwriting important memory regions. memblock: reserve, or nomap? > Only memblock nomap regions are reported via /proc/iomem, kexec's > user-space doesn't know about memblock_reserve()d regions. > > Until commit f56ab9a5b73ca ("efi/arm: Don't mark ACPI reclaim memory > as MEMBLOCK_NOMAP") the ACPI tables were nomap, now they are reserved > and thus possible for kexec to overwrite with the new kernel or initrd. > But this was always broken, as the UEFI memory map is also reserved > and not marked as nomap. > > Exporting both nomap and reserved memblock types is a nuisance as > they live in different memblock structures which we can't walk at > the same time. > > Take a second walk over memblock.reserved and add new 'reserved' > subnodes for the memblock_reserved() regions that aren't already > described by the existing code. (e.g. Kernel Code) > > We use reserve_region_with_split() to find the gaps in existing named > regions. This handles the gap between 'kernel code' and 'kernel data' > which is memblock_reserve()d, but already partially described by > request_standard_resources(). e.g.: > | 80000000-dfffffff : System RAM > | 80080000-80ffffff : Kernel code > | 81000000-8158ffff : reserved > | 81590000-8237efff : Kernel data > | a0000000-dfffffff : Crash kernel > | e00f0000-f949ffff : System RAM > > reserve_region_with_split needs kzalloc() which isn't available when > request_standard_resources() is called, use an initcall. > > Reported-by: Bhupesh Sharma <bhsharma@redhat.com> > Reported-by: Tyler Baicar <tbaicar@codeaurora.org> > Suggested-by: Akashi Takahiro <takahiro.akashi@linaro.org> > Signed-off-by: James Morse <james.morse@arm.com> > Fixes: d28f6df1305a ("arm64/kexec: Add core kexec support") > CC: Ard Biesheuvel <ard.biesheuvel@linaro.org> > CC: Mark Rutland <mark.rutland@arm.com> Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> > --- > arch/arm64/kernel/setup.c | 38 ++++++++++++++++++++++++++++++++++++++ > 1 file changed, 38 insertions(+) > > diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c > index 30ad2f085d1f..5b4fac434c84 100644 > --- a/arch/arm64/kernel/setup.c > +++ b/arch/arm64/kernel/setup.c > @@ -241,6 +241,44 @@ static void __init request_standard_resources(void) > } > } > > +static int __init reserve_memblock_reserved_regions(void) > +{ > + phys_addr_t start, end, roundup_end = 0; > + struct resource *mem, *res; > + u64 i; > + > + for_each_reserved_mem_region(i, &start, &end) { > + if (end <= roundup_end) > + continue; /* done already */ > + > + start = __pfn_to_phys(PFN_DOWN(start)); > + end = __pfn_to_phys(PFN_UP(end)) - 1; > + roundup_end = end; > + > + res = kzalloc(sizeof(*res), GFP_ATOMIC); > + if (WARN_ON(!res)) > + return -ENOMEM; > + res->start = start; > + res->end = end; > + res->name = "reserved"; > + res->flags = IORESOURCE_MEM; > + > + mem = request_resource_conflict(&iomem_resource, res); > + /* > + * We expected memblock_reserve() regions to conflict with > + * memory created by request_standard_resources(). > + */ > + if (WARN_ON_ONCE(!mem)) > + continue; > + kfree(res); > + > + reserve_region_with_split(mem, start, end, "reserved"); > + } > + > + return 0; > +} > +arch_initcall(reserve_memblock_reserved_regions); > + > u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID }; > > void __init setup_arch(char **cmdline_p) > -- > 2.17.0 >
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 30ad2f085d1f..5b4fac434c84 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -241,6 +241,44 @@ static void __init request_standard_resources(void) } } +static int __init reserve_memblock_reserved_regions(void) +{ + phys_addr_t start, end, roundup_end = 0; + struct resource *mem, *res; + u64 i; + + for_each_reserved_mem_region(i, &start, &end) { + if (end <= roundup_end) + continue; /* done already */ + + start = __pfn_to_phys(PFN_DOWN(start)); + end = __pfn_to_phys(PFN_UP(end)) - 1; + roundup_end = end; + + res = kzalloc(sizeof(*res), GFP_ATOMIC); + if (WARN_ON(!res)) + return -ENOMEM; + res->start = start; + res->end = end; + res->name = "reserved"; + res->flags = IORESOURCE_MEM; + + mem = request_resource_conflict(&iomem_resource, res); + /* + * We expected memblock_reserve() regions to conflict with + * memory created by request_standard_resources(). + */ + if (WARN_ON_ONCE(!mem)) + continue; + kfree(res); + + reserve_region_with_split(mem, start, end, "reserved"); + } + + return 0; +} +arch_initcall(reserve_memblock_reserved_regions); + u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID }; void __init setup_arch(char **cmdline_p)