diff mbox

[v3,11/13] arm64/efi: use plain memblock API for adding and removing reserved RAM

Message ID 1416315432-8534-12-git-send-email-ard.biesheuvel@linaro.org
State New
Headers show

Commit Message

Ard Biesheuvel Nov. 18, 2014, 12:57 p.m. UTC
The memblock API is agnostic of page size, so we can use it on both
4 KB and 64 KB granule kernels to install all UEFI memory regions
(EFI_MEMORY_WB) using memblock_add(), and the memblock layer will stitch
all unaligned regions together. Then we start punching holes in it for the
reserved regions, this time taking the native page size into account.

Finally, the reserved regions are memblock_remove()'d as well. This will
ensure that the regions are accessible via mmap(/dev/mem), even when
CONFIG_STRICT_DEVMEM is in effect.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/kernel/efi.c        | 41 +++++++++--------------------------------
 drivers/firmware/efi/virtmap.c | 15 +++++++++------
 2 files changed, 18 insertions(+), 38 deletions(-)

Comments

Mark Salter Nov. 20, 2014, 5:28 p.m. UTC | #1
On Tue, 2014-11-18 at 13:57 +0100, Ard Biesheuvel wrote:
> diff --git a/drivers/firmware/efi/virtmap.c
> b/drivers/firmware/efi/virtmap.c
> index 98735fb43581..4b6a5c31629f 100644
> --- a/drivers/firmware/efi/virtmap.c
> +++ b/drivers/firmware/efi/virtmap.c
> @@ -8,6 +8,7 @@
>   */
>  
>  #include <linux/efi.h>
> +#include <linux/memblock.h>
>  #include <linux/mm_types.h>
>  #include <linux/rbtree.h>
>  #include <linux/rwsem.h>
> @@ -97,8 +98,15 @@ void __init efi_virtmap_init(void)
>                 u64 paddr, npages, size;
>                 pgprot_t prot;
>  
> -               if (!efi_mem_is_usable_region(md))
> +               paddr = md->phys_addr;
> +               npages = md->num_pages;
> +               memrange_efi_to_native(&paddr, &npages);
> +               size = npages << PAGE_SHIFT;
> +
> +               if (!efi_mem_is_usable_region(md)) {
>                         efi_register_mem_resource(md);
> +                       memblock_remove(paddr, size);
> +               }

What exactly is the memblock_remove trying to accomplish? With it, I
get:

 Unable to handle kernel NULL pointer dereference at virtual address 00000008
 pgd = fffffe03d5890000
 [00000008] *pgd=00000043d5ae0003, *pud=00000043d5ae0003, *pmd=00000043d5ae0003, *pte=0000000000000000
 Internal error: Oops: 96000007 [#1] SMP
 Modules linked in:
 CPU: 0 PID: 107 Comm: modprobe Not tainted 3.18.0-rc5+ #8
 Hardware name: APM X-Gene Mustang board (DT)
 task: fffffe03d5770000 ti: fffffe03dd300000 task.ti: fffffe03dd300000
 PC is at mem_cgroup_begin_page_stat+0x40/0x110
 LR is at mem_cgroup_begin_page_stat+0x40/0x110
 pc : [<fffffe00001ec34c>] lr : [<fffffe00001ec34c>] pstate: 60000145
 sp : fffffe03dd303a80
 x29: fffffe03dd303a80 x28: 000000000000ffe0 
 x27: fffffdfef0f56bc0 x26: 0000000000000000 
 x25: 000003ff89b80000 x24: 0000000000000000 
 x23: fffffe03dd303ae8 x22: fffffe03dd303ae7 
 x21: 0000000000000000 x20: fffffe03d58a0420 
 x19: fffffdfef0004500 x18: 0000000000000000 
 x17: 000003ff89ba40dc x16: 000003ff89bbff90 
 x15: 000003ff89ba3b44 x14: 31313030343d6e66 
 x13: 7020303035343030 x12: 0000000000000006 
 x11: 0000000000000000 x10: 0000000000000186 
 x9 : 0000000000000187 x8 : 3030306665666466 
 x7 : fffffe0001041c18 x6 : 00000000000000ff 
 x5 : fffffe00004aaf20 x4 : fffffe00004b209c 
 x3 : 0000000000000000 x2 : 0000000000000000 
 x1 : fffffe0000f6a5d0 x0 : 0000000000000000 
 
 Process modprobe (pid: 107, stack limit = 0xfffffe03dd300058)
 Stack: (0xfffffe03dd303a80 to 0xfffffe03dd304000)
 3a80: dd303ac0 fffffe03 001c3e20 fffffe00 f0004500 fffffdfe d58a0420 fffffe03
 3aa0: f0004500 fffffdfe d58f4dc0 fffffe03 00000000 00000000 001b62d4 fffffe00
 3ac0: dd303af0 fffffe03 001b9984 fffffe00 01140f53 00200040 d58a0420 fffffe03
 3ae0: d5880000 fffffe03 d58a0420 fffffe03 dd303b40 fffffe03 001b9c4c fffffe00
 3b00: d58a0420 fffffe03 89b80000 000003ff f0f56bf0 fffffdfe 00000000 00000000
 ...
 3fa0: 89b907b7 000003ff 89bbf000 000003ff 89bc16e8 000003ff eb753490 000003ff
 3fc0: 89b91f9c 000003ff eb753490 000003ff 89b91fac 000003ff 20000000 00000000
 3fe0: 00000000 00000000 ffffffff ffffffff aaaaaaaa aaaaaaaa aaaaaaaa aaaaaaaa
 Call trace:
 [<fffffe00001ec34c>] mem_cgroup_begin_page_stat+0x40/0x110
 [<fffffe00001c3e1c>] page_add_file_rmap+0x24/0x80
 [<fffffe00001b9980>] do_set_pte+0x108/0x19c
 [<fffffe00001b9c48>] do_read_fault.isra.93+0x234/0x28c
 [<fffffe00001ba57c>] handle_mm_fault+0x488/0xd64
 [<fffffe00000a0ea0>] do_page_fault+0x224/0x358
 [<fffffe0000090128>] do_mem_abort+0x4c/0xb0
 Exception stack(0xfffffe03dd303e30 to 0xfffffe03dd303f50)
 3e20:                                     00000000 00000000 89bbf000 000003ff
 3e40: ffffffff ffffffff 89b91fac 000003ff 01043280 fffffe00 00000002 00000000
 3e60: dd303e70 fffffe03 008ce3a0 fffffe00 dd303e90 fffffe03 001c0400 fffffe00
 3e80: d5880000 fffffe03 000cbc60 fffffe00 eb753610 000003ff 00092fdc fffffe00
 3ea0: 00000000 00000000 89bbf000 000003ff ffffffff ffffffff 89ba4654 000003ff
 3ec0: 80000000 00000000 d5521480 fffffe03 89bc16e8 000003ff 89bc1a40 000003ff
 3ee0: 00000004 00000000 89b80000 000003ff fefefeff fefefefe 00000000 00000000
 3f00: 0000000f 00000000 89bc1028 000003ff 89ba7000 000003ff 0000079c 00000000
 3f20: 00400000 00000000 ffff0000 ffffffff 004038a0 00000000 00010000 00000000
 3f40: 89ba3c1c 000003ff 89ba3b44 000003ff
 Code: b9405835 35000275 aa1303e0 9400065c (f9400413)
Ard Biesheuvel Nov. 20, 2014, 5:38 p.m. UTC | #2
On 20 November 2014 18:28, Mark Salter <msalter@redhat.com> wrote:
> On Tue, 2014-11-18 at 13:57 +0100, Ard Biesheuvel wrote:
>> diff --git a/drivers/firmware/efi/virtmap.c
>> b/drivers/firmware/efi/virtmap.c
>> index 98735fb43581..4b6a5c31629f 100644
>> --- a/drivers/firmware/efi/virtmap.c
>> +++ b/drivers/firmware/efi/virtmap.c
>> @@ -8,6 +8,7 @@
>>   */
>>
>>  #include <linux/efi.h>
>> +#include <linux/memblock.h>
>>  #include <linux/mm_types.h>
>>  #include <linux/rbtree.h>
>>  #include <linux/rwsem.h>
>> @@ -97,8 +98,15 @@ void __init efi_virtmap_init(void)
>>                 u64 paddr, npages, size;
>>                 pgprot_t prot;
>>
>> -               if (!efi_mem_is_usable_region(md))
>> +               paddr = md->phys_addr;
>> +               npages = md->num_pages;
>> +               memrange_efi_to_native(&paddr, &npages);
>> +               size = npages << PAGE_SHIFT;
>> +
>> +               if (!efi_mem_is_usable_region(md)) {
>>                         efi_register_mem_resource(md);
>> +                       memblock_remove(paddr, size);
>> +               }
>
> What exactly is the memblock_remove trying to accomplish? With it, I
> get:
>

The idea was pfn_valid() will then return false for those pfns,
allowing us to distinguish them from memory that the kernel may be
using, primarily for /dev/mem filtering. But apparently it is causing
problems for you, so I will have to figure out if there is a better
way of addressing this.


>  Unable to handle kernel NULL pointer dereference at virtual address 00000008
>  pgd = fffffe03d5890000
>  [00000008] *pgd=00000043d5ae0003, *pud=00000043d5ae0003, *pmd=00000043d5ae0003, *pte=0000000000000000
>  Internal error: Oops: 96000007 [#1] SMP
>  Modules linked in:
>  CPU: 0 PID: 107 Comm: modprobe Not tainted 3.18.0-rc5+ #8
>  Hardware name: APM X-Gene Mustang board (DT)
>  task: fffffe03d5770000 ti: fffffe03dd300000 task.ti: fffffe03dd300000
>  PC is at mem_cgroup_begin_page_stat+0x40/0x110
>  LR is at mem_cgroup_begin_page_stat+0x40/0x110
>  pc : [<fffffe00001ec34c>] lr : [<fffffe00001ec34c>] pstate: 60000145
>  sp : fffffe03dd303a80
>  x29: fffffe03dd303a80 x28: 000000000000ffe0
>  x27: fffffdfef0f56bc0 x26: 0000000000000000
>  x25: 000003ff89b80000 x24: 0000000000000000
>  x23: fffffe03dd303ae8 x22: fffffe03dd303ae7
>  x21: 0000000000000000 x20: fffffe03d58a0420
>  x19: fffffdfef0004500 x18: 0000000000000000
>  x17: 000003ff89ba40dc x16: 000003ff89bbff90
>  x15: 000003ff89ba3b44 x14: 31313030343d6e66
>  x13: 7020303035343030 x12: 0000000000000006
>  x11: 0000000000000000 x10: 0000000000000186
>  x9 : 0000000000000187 x8 : 3030306665666466
>  x7 : fffffe0001041c18 x6 : 00000000000000ff
>  x5 : fffffe00004aaf20 x4 : fffffe00004b209c
>  x3 : 0000000000000000 x2 : 0000000000000000
>  x1 : fffffe0000f6a5d0 x0 : 0000000000000000
>
>  Process modprobe (pid: 107, stack limit = 0xfffffe03dd300058)
>  Stack: (0xfffffe03dd303a80 to 0xfffffe03dd304000)
>  3a80: dd303ac0 fffffe03 001c3e20 fffffe00 f0004500 fffffdfe d58a0420 fffffe03
>  3aa0: f0004500 fffffdfe d58f4dc0 fffffe03 00000000 00000000 001b62d4 fffffe00
>  3ac0: dd303af0 fffffe03 001b9984 fffffe00 01140f53 00200040 d58a0420 fffffe03
>  3ae0: d5880000 fffffe03 d58a0420 fffffe03 dd303b40 fffffe03 001b9c4c fffffe00
>  3b00: d58a0420 fffffe03 89b80000 000003ff f0f56bf0 fffffdfe 00000000 00000000
>  ...
>  3fa0: 89b907b7 000003ff 89bbf000 000003ff 89bc16e8 000003ff eb753490 000003ff
>  3fc0: 89b91f9c 000003ff eb753490 000003ff 89b91fac 000003ff 20000000 00000000
>  3fe0: 00000000 00000000 ffffffff ffffffff aaaaaaaa aaaaaaaa aaaaaaaa aaaaaaaa
>  Call trace:
>  [<fffffe00001ec34c>] mem_cgroup_begin_page_stat+0x40/0x110
>  [<fffffe00001c3e1c>] page_add_file_rmap+0x24/0x80
>  [<fffffe00001b9980>] do_set_pte+0x108/0x19c
>  [<fffffe00001b9c48>] do_read_fault.isra.93+0x234/0x28c
>  [<fffffe00001ba57c>] handle_mm_fault+0x488/0xd64
>  [<fffffe00000a0ea0>] do_page_fault+0x224/0x358
>  [<fffffe0000090128>] do_mem_abort+0x4c/0xb0
>  Exception stack(0xfffffe03dd303e30 to 0xfffffe03dd303f50)
>  3e20:                                     00000000 00000000 89bbf000 000003ff
>  3e40: ffffffff ffffffff 89b91fac 000003ff 01043280 fffffe00 00000002 00000000
>  3e60: dd303e70 fffffe03 008ce3a0 fffffe00 dd303e90 fffffe03 001c0400 fffffe00
>  3e80: d5880000 fffffe03 000cbc60 fffffe00 eb753610 000003ff 00092fdc fffffe00
>  3ea0: 00000000 00000000 89bbf000 000003ff ffffffff ffffffff 89ba4654 000003ff
>  3ec0: 80000000 00000000 d5521480 fffffe03 89bc16e8 000003ff 89bc1a40 000003ff
>  3ee0: 00000004 00000000 89b80000 000003ff fefefeff fefefefe 00000000 00000000
>  3f00: 0000000f 00000000 89bc1028 000003ff 89ba7000 000003ff 0000079c 00000000
>  3f20: 00400000 00000000 ffff0000 ffffffff 004038a0 00000000 00010000 00000000
>  3f40: 89ba3c1c 000003ff 89ba3b44 000003ff
>  Code: b9405835 35000275 aa1303e0 9400065c (f9400413)
>
>
Mark Salter Nov. 20, 2014, 5:54 p.m. UTC | #3
On Thu, 2014-11-20 at 18:38 +0100, Ard Biesheuvel wrote:
> On 20 November 2014 18:28, Mark Salter <msalter@redhat.com> wrote:
> > On Tue, 2014-11-18 at 13:57 +0100, Ard Biesheuvel wrote:
> >> diff --git a/drivers/firmware/efi/virtmap.c
> >> b/drivers/firmware/efi/virtmap.c
> >> index 98735fb43581..4b6a5c31629f 100644
> >> --- a/drivers/firmware/efi/virtmap.c
> >> +++ b/drivers/firmware/efi/virtmap.c
> >> @@ -8,6 +8,7 @@
> >>   */
> >>
> >>  #include <linux/efi.h>
> >> +#include <linux/memblock.h>
> >>  #include <linux/mm_types.h>
> >>  #include <linux/rbtree.h>
> >>  #include <linux/rwsem.h>
> >> @@ -97,8 +98,15 @@ void __init efi_virtmap_init(void)
> >>                 u64 paddr, npages, size;
> >>                 pgprot_t prot;
> >>
> >> -               if (!efi_mem_is_usable_region(md))
> >> +               paddr = md->phys_addr;
> >> +               npages = md->num_pages;
> >> +               memrange_efi_to_native(&paddr, &npages);
> >> +               size = npages << PAGE_SHIFT;
> >> +
> >> +               if (!efi_mem_is_usable_region(md)) {
> >>                         efi_register_mem_resource(md);
> >> +                       memblock_remove(paddr, size);
> >> +               }
> >
> > What exactly is the memblock_remove trying to accomplish? With it, I
> > get:
> >
> 
> The idea was pfn_valid() will then return false for those pfns,
> allowing us to distinguish them from memory that the kernel may be
> using, primarily for /dev/mem filtering. But apparently it is causing
> problems for you, so I will have to figure out if there is a better
> way of addressing this.
> 
Okay. Well I think that works for pfn_valid, but it is confusing the
mem_cgroup code. I think because you end up with multiple memory regions
after creating the holes. Earlier memory management code saw one memory
region. And because this comes after paging_init(), all of the memory
ends up in the kernel linear mapping which is something we were trying
to avoid.

> 
> >  Unable to handle kernel NULL pointer dereference at virtual address 00000008
> >  pgd = fffffe03d5890000
> >  [00000008] *pgd=00000043d5ae0003, *pud=00000043d5ae0003, *pmd=00000043d5ae0003, *pte=0000000000000000
> >  Internal error: Oops: 96000007 [#1] SMP
> >  Modules linked in:
> >  CPU: 0 PID: 107 Comm: modprobe Not tainted 3.18.0-rc5+ #8
> >  Hardware name: APM X-Gene Mustang board (DT)
> >  task: fffffe03d5770000 ti: fffffe03dd300000 task.ti: fffffe03dd300000
> >  PC is at mem_cgroup_begin_page_stat+0x40/0x110
> >  LR is at mem_cgroup_begin_page_stat+0x40/0x110
> >  pc : [<fffffe00001ec34c>] lr : [<fffffe00001ec34c>] pstate: 60000145
> >  sp : fffffe03dd303a80
> >  x29: fffffe03dd303a80 x28: 000000000000ffe0
> >  x27: fffffdfef0f56bc0 x26: 0000000000000000
> >  x25: 000003ff89b80000 x24: 0000000000000000
> >  x23: fffffe03dd303ae8 x22: fffffe03dd303ae7
> >  x21: 0000000000000000 x20: fffffe03d58a0420
> >  x19: fffffdfef0004500 x18: 0000000000000000
> >  x17: 000003ff89ba40dc x16: 000003ff89bbff90
> >  x15: 000003ff89ba3b44 x14: 31313030343d6e66
> >  x13: 7020303035343030 x12: 0000000000000006
> >  x11: 0000000000000000 x10: 0000000000000186
> >  x9 : 0000000000000187 x8 : 3030306665666466
> >  x7 : fffffe0001041c18 x6 : 00000000000000ff
> >  x5 : fffffe00004aaf20 x4 : fffffe00004b209c
> >  x3 : 0000000000000000 x2 : 0000000000000000
> >  x1 : fffffe0000f6a5d0 x0 : 0000000000000000
> >
> >  Process modprobe (pid: 107, stack limit = 0xfffffe03dd300058)
> >  Stack: (0xfffffe03dd303a80 to 0xfffffe03dd304000)
> >  3a80: dd303ac0 fffffe03 001c3e20 fffffe00 f0004500 fffffdfe d58a0420 fffffe03
> >  3aa0: f0004500 fffffdfe d58f4dc0 fffffe03 00000000 00000000 001b62d4 fffffe00
> >  3ac0: dd303af0 fffffe03 001b9984 fffffe00 01140f53 00200040 d58a0420 fffffe03
> >  3ae0: d5880000 fffffe03 d58a0420 fffffe03 dd303b40 fffffe03 001b9c4c fffffe00
> >  3b00: d58a0420 fffffe03 89b80000 000003ff f0f56bf0 fffffdfe 00000000 00000000
> >  ...
> >  3fa0: 89b907b7 000003ff 89bbf000 000003ff 89bc16e8 000003ff eb753490 000003ff
> >  3fc0: 89b91f9c 000003ff eb753490 000003ff 89b91fac 000003ff 20000000 00000000
> >  3fe0: 00000000 00000000 ffffffff ffffffff aaaaaaaa aaaaaaaa aaaaaaaa aaaaaaaa
> >  Call trace:
> >  [<fffffe00001ec34c>] mem_cgroup_begin_page_stat+0x40/0x110
> >  [<fffffe00001c3e1c>] page_add_file_rmap+0x24/0x80
> >  [<fffffe00001b9980>] do_set_pte+0x108/0x19c
> >  [<fffffe00001b9c48>] do_read_fault.isra.93+0x234/0x28c
> >  [<fffffe00001ba57c>] handle_mm_fault+0x488/0xd64
> >  [<fffffe00000a0ea0>] do_page_fault+0x224/0x358
> >  [<fffffe0000090128>] do_mem_abort+0x4c/0xb0
> >  Exception stack(0xfffffe03dd303e30 to 0xfffffe03dd303f50)
> >  3e20:                                     00000000 00000000 89bbf000 000003ff
> >  3e40: ffffffff ffffffff 89b91fac 000003ff 01043280 fffffe00 00000002 00000000
> >  3e60: dd303e70 fffffe03 008ce3a0 fffffe00 dd303e90 fffffe03 001c0400 fffffe00
> >  3e80: d5880000 fffffe03 000cbc60 fffffe00 eb753610 000003ff 00092fdc fffffe00
> >  3ea0: 00000000 00000000 89bbf000 000003ff ffffffff ffffffff 89ba4654 000003ff
> >  3ec0: 80000000 00000000 d5521480 fffffe03 89bc16e8 000003ff 89bc1a40 000003ff
> >  3ee0: 00000004 00000000 89b80000 000003ff fefefeff fefefefe 00000000 00000000
> >  3f00: 0000000f 00000000 89bc1028 000003ff 89ba7000 000003ff 0000079c 00000000
> >  3f20: 00400000 00000000 ffff0000 ffffffff 004038a0 00000000 00010000 00000000
> >  3f40: 89ba3c1c 000003ff 89ba3b44 000003ff
> >  Code: b9405835 35000275 aa1303e0 9400065c (f9400413)
> >
> >
Ard Biesheuvel Nov. 21, 2014, 12:07 p.m. UTC | #4
On 20 November 2014 18:54, Mark Salter <msalter@redhat.com> wrote:
> On Thu, 2014-11-20 at 18:38 +0100, Ard Biesheuvel wrote:
>> On 20 November 2014 18:28, Mark Salter <msalter@redhat.com> wrote:
>> > On Tue, 2014-11-18 at 13:57 +0100, Ard Biesheuvel wrote:
>> >> diff --git a/drivers/firmware/efi/virtmap.c
>> >> b/drivers/firmware/efi/virtmap.c
>> >> index 98735fb43581..4b6a5c31629f 100644
>> >> --- a/drivers/firmware/efi/virtmap.c
>> >> +++ b/drivers/firmware/efi/virtmap.c
>> >> @@ -8,6 +8,7 @@
>> >>   */
>> >>
>> >>  #include <linux/efi.h>
>> >> +#include <linux/memblock.h>
>> >>  #include <linux/mm_types.h>
>> >>  #include <linux/rbtree.h>
>> >>  #include <linux/rwsem.h>
>> >> @@ -97,8 +98,15 @@ void __init efi_virtmap_init(void)
>> >>                 u64 paddr, npages, size;
>> >>                 pgprot_t prot;
>> >>
>> >> -               if (!efi_mem_is_usable_region(md))
>> >> +               paddr = md->phys_addr;
>> >> +               npages = md->num_pages;
>> >> +               memrange_efi_to_native(&paddr, &npages);
>> >> +               size = npages << PAGE_SHIFT;
>> >> +
>> >> +               if (!efi_mem_is_usable_region(md)) {
>> >>                         efi_register_mem_resource(md);
>> >> +                       memblock_remove(paddr, size);
>> >> +               }
>> >
>> > What exactly is the memblock_remove trying to accomplish? With it, I
>> > get:
>> >
>>
>> The idea was pfn_valid() will then return false for those pfns,
>> allowing us to distinguish them from memory that the kernel may be
>> using, primarily for /dev/mem filtering. But apparently it is causing
>> problems for you, so I will have to figure out if there is a better
>> way of addressing this.
>>
> Okay. Well I think that works for pfn_valid, but it is confusing the
> mem_cgroup code. I think because you end up with multiple memory regions
> after creating the holes. Earlier memory management code saw one memory
> region. And because this comes after paging_init(), all of the memory
> ends up in the kernel linear mapping which is something we were trying
> to avoid.
>

I will drop the memblock_remove() then. I guess I could make the test
in devmem_is_allowed() do

if (!memblock_is_memory() || memblock_is_reserved())

instead of 'if (!pfn_valid())' so that reserved regions become
accessible with having to remove them.

Are you happy with the other change in this patch, i.e., using
memblock_add() directly so that we don't have to deal with the
rounding?
Mark Salter Nov. 21, 2014, 3:21 p.m. UTC | #5
On Fri, 2014-11-21 at 13:07 +0100, Ard Biesheuvel wrote:
> On 20 November 2014 18:54, Mark Salter <msalter@redhat.com> wrote:
> > On Thu, 2014-11-20 at 18:38 +0100, Ard Biesheuvel wrote:
> >> On 20 November 2014 18:28, Mark Salter <msalter@redhat.com> wrote:
> >> > On Tue, 2014-11-18 at 13:57 +0100, Ard Biesheuvel wrote:
> >> >> diff --git a/drivers/firmware/efi/virtmap.c
> >> >> b/drivers/firmware/efi/virtmap.c
> >> >> index 98735fb43581..4b6a5c31629f 100644
> >> >> --- a/drivers/firmware/efi/virtmap.c
> >> >> +++ b/drivers/firmware/efi/virtmap.c
> >> >> @@ -8,6 +8,7 @@
> >> >>   */
> >> >>
> >> >>  #include <linux/efi.h>
> >> >> +#include <linux/memblock.h>
> >> >>  #include <linux/mm_types.h>
> >> >>  #include <linux/rbtree.h>
> >> >>  #include <linux/rwsem.h>
> >> >> @@ -97,8 +98,15 @@ void __init efi_virtmap_init(void)
> >> >>                 u64 paddr, npages, size;
> >> >>                 pgprot_t prot;
> >> >>
> >> >> -               if (!efi_mem_is_usable_region(md))
> >> >> +               paddr = md->phys_addr;
> >> >> +               npages = md->num_pages;
> >> >> +               memrange_efi_to_native(&paddr, &npages);
> >> >> +               size = npages << PAGE_SHIFT;
> >> >> +
> >> >> +               if (!efi_mem_is_usable_region(md)) {
> >> >>                         efi_register_mem_resource(md);
> >> >> +                       memblock_remove(paddr, size);
> >> >> +               }
> >> >
> >> > What exactly is the memblock_remove trying to accomplish? With it, I
> >> > get:
> >> >
> >>
> >> The idea was pfn_valid() will then return false for those pfns,
> >> allowing us to distinguish them from memory that the kernel may be
> >> using, primarily for /dev/mem filtering. But apparently it is causing
> >> problems for you, so I will have to figure out if there is a better
> >> way of addressing this.
> >>
> > Okay. Well I think that works for pfn_valid, but it is confusing the
> > mem_cgroup code. I think because you end up with multiple memory regions
> > after creating the holes. Earlier memory management code saw one memory
> > region. And because this comes after paging_init(), all of the memory
> > ends up in the kernel linear mapping which is something we were trying
> > to avoid.
> >
> 
> I will drop the memblock_remove() then. I guess I could make the test
> in devmem_is_allowed() do
> 
> if (!memblock_is_memory() || memblock_is_reserved())
> 
> instead of 'if (!pfn_valid())' so that reserved regions become
> accessible with having to remove them.

Maybe we should add a new memblock to keep track of uefi tables.
The memblock_is_reserved() seems overly permissive to me.

> 
> Are you happy with the other change in this patch, i.e., using
> memblock_add() directly so that we don't have to deal with the
> rounding?
> 

Yeah, I think that's okay. Everything else seems to be working fine
in my testing.
Ard Biesheuvel Nov. 26, 2014, 4:59 p.m. UTC | #6
On 21 November 2014 at 16:21, Mark Salter <msalter@redhat.com> wrote:
> On Fri, 2014-11-21 at 13:07 +0100, Ard Biesheuvel wrote:
>> On 20 November 2014 18:54, Mark Salter <msalter@redhat.com> wrote:
>> > On Thu, 2014-11-20 at 18:38 +0100, Ard Biesheuvel wrote:
>> >> On 20 November 2014 18:28, Mark Salter <msalter@redhat.com> wrote:
>> >> > On Tue, 2014-11-18 at 13:57 +0100, Ard Biesheuvel wrote:
>> >> >> diff --git a/drivers/firmware/efi/virtmap.c
>> >> >> b/drivers/firmware/efi/virtmap.c
>> >> >> index 98735fb43581..4b6a5c31629f 100644
>> >> >> --- a/drivers/firmware/efi/virtmap.c
>> >> >> +++ b/drivers/firmware/efi/virtmap.c
>> >> >> @@ -8,6 +8,7 @@
>> >> >>   */
>> >> >>
>> >> >>  #include <linux/efi.h>
>> >> >> +#include <linux/memblock.h>
>> >> >>  #include <linux/mm_types.h>
>> >> >>  #include <linux/rbtree.h>
>> >> >>  #include <linux/rwsem.h>
>> >> >> @@ -97,8 +98,15 @@ void __init efi_virtmap_init(void)
>> >> >>                 u64 paddr, npages, size;
>> >> >>                 pgprot_t prot;
>> >> >>
>> >> >> -               if (!efi_mem_is_usable_region(md))
>> >> >> +               paddr = md->phys_addr;
>> >> >> +               npages = md->num_pages;
>> >> >> +               memrange_efi_to_native(&paddr, &npages);
>> >> >> +               size = npages << PAGE_SHIFT;
>> >> >> +
>> >> >> +               if (!efi_mem_is_usable_region(md)) {
>> >> >>                         efi_register_mem_resource(md);
>> >> >> +                       memblock_remove(paddr, size);
>> >> >> +               }
>> >> >
>> >> > What exactly is the memblock_remove trying to accomplish? With it, I
>> >> > get:
>> >> >
>> >>
>> >> The idea was pfn_valid() will then return false for those pfns,
>> >> allowing us to distinguish them from memory that the kernel may be
>> >> using, primarily for /dev/mem filtering. But apparently it is causing
>> >> problems for you, so I will have to figure out if there is a better
>> >> way of addressing this.
>> >>
>> > Okay. Well I think that works for pfn_valid, but it is confusing the
>> > mem_cgroup code. I think because you end up with multiple memory regions
>> > after creating the holes. Earlier memory management code saw one memory
>> > region. And because this comes after paging_init(), all of the memory
>> > ends up in the kernel linear mapping which is something we were trying
>> > to avoid.
>> >
>>
>> I will drop the memblock_remove() then. I guess I could make the test
>> in devmem_is_allowed() do
>>
>> if (!memblock_is_memory() || memblock_is_reserved())
>>
>> instead of 'if (!pfn_valid())' so that reserved regions become
>> accessible with having to remove them.
>
> Maybe we should add a new memblock to keep track of uefi tables.
> The memblock_is_reserved() seems overly permissive to me.
>

OK. So instead, I will split reserve_regions() into 2 passes:
- first pass adds all EFI_MEMORY_WB regions (unrounded)
- second pass one removes all reserved regions (rounded up to OS page size)

That way, the devmem stuff should still work as well.

QEMU example:

Processing EFI memory map:
  0x0000b711d000-0x0000b71dffff [Runtime Data       |RUN|  |  |  |
|WB|WT|WC|UC]
  0x0000bfb21000-0x0000bfb34fff [Runtime Code       |RUN|  |  |  |
|WB|WT|WC|UC]
  0x0000bfb35000-0x0000bfb66fff [Runtime Data       |RUN|  |  |  |
|WB|WT|WC|UC]
  0x0000bfb6a000-0x0000bfb6afff [Runtime Data       |RUN|  |  |  |
|WB|WT|WC|UC]
  0x0000bfb82000-0x0000bfb82fff [Runtime Data       |RUN|  |  |  |
|WB|WT|WC|UC]
  0x000004000000-0x000007ffffff [Memory Mapped I/O  |RUN|  |  |  |   |
 |  |  |UC]
  0x000009010000-0x000009010fff [Memory Mapped I/O  |RUN|  |  |  |   |
 |  |  |UC]
  0x000040000000-0x0000407affff [Loader Data        |   |  |  |  |
|WB|WT|WC|UC]
  0x0000b71e0000-0x0000bb77efff [Conventional Memory|   |  |  |  |
|WB|WT|WC|UC]
  0x0000bb77f000-0x0000bbc00fff [Boot Data          |   |  |  |  |
|WB|WT|WC|UC]
  0x0000bbc01000-0x0000bbdc2fff [Conventional Memory|   |  |  |  |
|WB|WT|WC|UC]
  0x0000bbdc3000-0x0000bc01ffff [Boot Data          |   |  |  |  |
|WB|WT|WC|UC]
  0x0000bc020000-0x0000bfa43fff [Conventional Memory|   |  |  |  |
|WB|WT|WC|UC]
  0x0000bfa44000-0x0000bfb20fff [Boot Code          |   |  |  |  |
|WB|WT|WC|UC]
  0x0000407b0000-0x00005fdfffff [Conventional Memory|   |  |  |  |
|WB|WT|WC|UC]
  0x00005fe00000-0x00005fe0ffff [Loader Data        |   |  |  |  |
|WB|WT|WC|UC]
  0x0000bfb67000-0x0000bfb69fff [Boot Data          |   |  |  |  |
|WB|WT|WC|UC]
  0x00005fe10000-0x0000b6337fff [Conventional Memory|   |  |  |  |
|WB|WT|WC|UC]
  0x0000bfb6b000-0x0000bfb7efff [Conventional Memory|   |  |  |  |
|WB|WT|WC|UC]
  0x0000bfb7f000-0x0000bfb81fff [Boot Data          |   |  |  |  |
|WB|WT|WC|UC]
  0x0000b6338000-0x0000b6338fff [Loader Data        |   |  |  |  |
|WB|WT|WC|UC]
  0x0000bfb83000-0x0000bfffffff [Boot Data          |   |  |  |  |
|WB|WT|WC|UC]
  0x0000b6339000-0x0000b6a4bfff [Loader Code        |   |  |  |  |
|WB|WT|WC|UC]
  0x0000b6a4c000-0x0000b711cfff [Boot Code          |   |  |  |  |
|WB|WT|WC|UC]

MEMBLOCK configuration:
 memory size = 0x7fef5000 reserved size = 0x1724000
 memory.cnt  = 0x5
 memory[0x0] [0x00000040000000-0x000000b711cfff], 0x7711d000 bytes flags: 0x0
 memory[0x1] [0x000000b71e0000-0x000000bfb20fff], 0x8941000 bytes flags: 0x0
 memory[0x2] [0x000000bfb67000-0x000000bfb69fff], 0x3000 bytes flags: 0x0
 memory[0x3] [0x000000bfb6b000-0x000000bfb81fff], 0x17000 bytes flags: 0x0
 memory[0x4] [0x000000bfb83000-0x000000bfffffff], 0x47d000 bytes flags: 0x0
 reserved.cnt  = 0x4
 reserved[0x0] [0x00000040080000-0x00000040792fff], 0x713000 bytes flags: 0x0
 reserved[0x1] [0x0000005fe00000-0x0000005fe0ffff], 0x10000 bytes flags: 0x0
 reserved[0x2] [0x000000b6338000-0x000000b6338fff], 0x1000 bytes flags: 0x0
 reserved[0x3] [0x000000be800000-0x000000bf7fffff], 0x1000000 bytes flags: 0x0
   memblock_free: [0x00000040792000-0x00000040792fff]
__create_mapping.isra.6+0x9c/0x2ac

>>
>> Are you happy with the other change in this patch, i.e., using
>> memblock_add() directly so that we don't have to deal with the
>> rounding?
>>
>
> Yeah, I think that's okay. Everything else seems to be working fine
> in my testing.
>

Cheers,
Ard.
diff mbox

Patch

diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
index 3009c22e2620..2c0a858f699e 100644
--- a/arch/arm64/kernel/efi.c
+++ b/arch/arm64/kernel/efi.c
@@ -105,24 +105,6 @@  out:
 	return retval;
 }
 
-/*
- * Return true for RAM regions we want to permanently reserve.
- */
-static __init int is_reserve_region(efi_memory_desc_t *md)
-{
-	switch (md->type) {
-	case EFI_LOADER_CODE:
-	case EFI_LOADER_DATA:
-	case EFI_BOOT_SERVICES_CODE:
-	case EFI_BOOT_SERVICES_DATA:
-	case EFI_CONVENTIONAL_MEMORY:
-		return 0;
-	default:
-		break;
-	}
-	return is_normal_ram(md);
-}
-
 static __init void reserve_regions(void)
 {
 	efi_memory_desc_t *md;
@@ -134,31 +116,26 @@  static __init void reserve_regions(void)
 	for_each_efi_memory_desc(&memmap, md) {
 		paddr = md->phys_addr;
 		npages = md->num_pages;
+		size = npages << EFI_PAGE_SHIFT;
 
 		if (uefi_debug) {
 			char buf[64];
 
-			pr_info("  0x%012llx-0x%012llx %s",
-				paddr, paddr + (npages << EFI_PAGE_SHIFT) - 1,
+			pr_info("  0x%012llx-0x%012llx %s\n",
+				paddr, paddr + size - 1,
 				efi_md_typeattr_format(buf, sizeof(buf), md));
 		}
 
-		memrange_efi_to_native(&paddr, &npages);
-		size = npages << PAGE_SHIFT;
+		if (is_normal_ram(md)) {
+			memblock_add(paddr, size);
 
-		if (is_normal_ram(md))
-			early_init_dt_add_memory_arch(paddr, size);
-
-		if (is_reserve_region(md)) {
-			memblock_reserve(paddr, size);
-			if (uefi_debug)
-				pr_cont("*");
+			if (!efi_mem_is_usable_region(md)) {
+				memrange_efi_to_native(&paddr, &npages);
+				memblock_reserve(paddr, npages << PAGE_SHIFT);
+			}
 		}
 
-		if (uefi_debug)
-			pr_cont("\n");
 	}
-
 	set_bit(EFI_MEMMAP, &efi.flags);
 }
 
diff --git a/drivers/firmware/efi/virtmap.c b/drivers/firmware/efi/virtmap.c
index 98735fb43581..4b6a5c31629f 100644
--- a/drivers/firmware/efi/virtmap.c
+++ b/drivers/firmware/efi/virtmap.c
@@ -8,6 +8,7 @@ 
  */
 
 #include <linux/efi.h>
+#include <linux/memblock.h>
 #include <linux/mm_types.h>
 #include <linux/rbtree.h>
 #include <linux/rwsem.h>
@@ -97,8 +98,15 @@  void __init efi_virtmap_init(void)
 		u64 paddr, npages, size;
 		pgprot_t prot;
 
-		if (!efi_mem_is_usable_region(md))
+		paddr = md->phys_addr;
+		npages = md->num_pages;
+		memrange_efi_to_native(&paddr, &npages);
+		size = npages << PAGE_SHIFT;
+
+		if (!efi_mem_is_usable_region(md)) {
 			efi_register_mem_resource(md);
+			memblock_remove(paddr, size);
+		}
 		if (!(md->attribute & EFI_MEMORY_RUNTIME))
 			continue;
 		if (WARN(md->virt_addr == 0,
@@ -106,11 +114,6 @@  void __init efi_virtmap_init(void)
 			 md->phys_addr))
 			return;
 
-		paddr = md->phys_addr;
-		npages = md->num_pages;
-		memrange_efi_to_native(&paddr, &npages);
-		size = npages << PAGE_SHIFT;
-
 		pr_info("  EFI remap 0x%012llx => %p\n",
 			md->phys_addr, (void *)md->virt_addr);