diff mbox series

riscv: Ensure BPF_JIT_REGION_START aligned with PMD size

Message ID 20210615004928.2d27d2ac@xhacker
State New
Headers show
Series riscv: Ensure BPF_JIT_REGION_START aligned with PMD size | expand

Commit Message

Jisheng Zhang June 14, 2021, 4:49 p.m. UTC
From: Jisheng Zhang <jszhang@kernel.org>

Andreas reported commit fc8504765ec5 ("riscv: bpf: Avoid breaking W^X")
breaks booting with one kind of config file, I reproduced a kernel panic
with the config:

[    0.138553] Unable to handle kernel paging request at virtual address ffffffff81201220
[    0.139159] Oops [#1]
[    0.139303] Modules linked in:
[    0.139601] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-rc5-default+ #1
[    0.139934] Hardware name: riscv-virtio,qemu (DT)
[    0.140193] epc : __memset+0xc4/0xfc
[    0.140416]  ra : skb_flow_dissector_init+0x1e/0x82
[    0.140609] epc : ffffffff8029806c ra : ffffffff8033be78 sp : ffffffe001647da0
[    0.140878]  gp : ffffffff81134b08 tp : ffffffe001654380 t0 : ffffffff81201158
[    0.141156]  t1 : 0000000000000002 t2 : 0000000000000154 s0 : ffffffe001647dd0
[    0.141424]  s1 : ffffffff80a43250 a0 : ffffffff81201220 a1 : 0000000000000000
[    0.141654]  a2 : 000000000000003c a3 : ffffffff81201258 a4 : 0000000000000064
[    0.141893]  a5 : ffffffff8029806c a6 : 0000000000000040 a7 : ffffffffffffffff
[    0.142126]  s2 : ffffffff81201220 s3 : 0000000000000009 s4 : ffffffff81135088
[    0.142353]  s5 : ffffffff81135038 s6 : ffffffff8080ce80 s7 : ffffffff80800438
[    0.142584]  s8 : ffffffff80bc6578 s9 : 0000000000000008 s10: ffffffff806000ac
[    0.142810]  s11: 0000000000000000 t3 : fffffffffffffffc t4 : 0000000000000000
[    0.143042]  t5 : 0000000000000155 t6 : 00000000000003ff
[    0.143220] status: 0000000000000120 badaddr: ffffffff81201220 cause: 000000000000000f
[    0.143560] [<ffffffff8029806c>] __memset+0xc4/0xfc
[    0.143859] [<ffffffff8061e984>] init_default_flow_dissectors+0x22/0x60
[    0.144092] [<ffffffff800010fc>] do_one_initcall+0x3e/0x168
[    0.144278] [<ffffffff80600df0>] kernel_init_freeable+0x1c8/0x224
[    0.144479] [<ffffffff804868a8>] kernel_init+0x12/0x110
[    0.144658] [<ffffffff800022de>] ret_from_exception+0x0/0xc
[    0.145124] ---[ end trace f1e9643daa46d591 ]---

After some investigation, I think I found the root cause: commit
2bfc6cd81bd ("move kernel mapping outside of linear mapping") moves
BPF JIT region after the kernel:

The &_end is unlikely aligned with PMD size, so the front bpf jit
region sits with part of kernel .data section in one PMD size mapping.
But kernel is mapped in PMD SIZE, when bpf_jit_binary_lock_ro() is
called to make the first bpf jit prog ROX, we will make part of kernel
.data section RO too, so when we write to, for example memset the
.data section, MMU will trigger a store page fault.

To fix the issue, we need to ensure the BPF JIT region is PMD size
aligned. This patch acchieve this goal by restoring the BPF JIT region
to original position, I.E the 128MB before kernel .text section.

Reported-by: Andreas Schwab <schwab@linux-m68k.org>
Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
---
 arch/riscv/include/asm/pgtable.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

Comments

Daniel Borkmann June 15, 2021, 12:29 p.m. UTC | #1
On 6/14/21 6:49 PM, Jisheng Zhang wrote:
> From: Jisheng Zhang <jszhang@kernel.org>

> 

> Andreas reported commit fc8504765ec5 ("riscv: bpf: Avoid breaking W^X")

> breaks booting with one kind of config file, I reproduced a kernel panic

> with the config:

> 

> [    0.138553] Unable to handle kernel paging request at virtual address ffffffff81201220

> [    0.139159] Oops [#1]

> [    0.139303] Modules linked in:

> [    0.139601] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-rc5-default+ #1

> [    0.139934] Hardware name: riscv-virtio,qemu (DT)

> [    0.140193] epc : __memset+0xc4/0xfc

> [    0.140416]  ra : skb_flow_dissector_init+0x1e/0x82

> [    0.140609] epc : ffffffff8029806c ra : ffffffff8033be78 sp : ffffffe001647da0

> [    0.140878]  gp : ffffffff81134b08 tp : ffffffe001654380 t0 : ffffffff81201158

> [    0.141156]  t1 : 0000000000000002 t2 : 0000000000000154 s0 : ffffffe001647dd0

> [    0.141424]  s1 : ffffffff80a43250 a0 : ffffffff81201220 a1 : 0000000000000000

> [    0.141654]  a2 : 000000000000003c a3 : ffffffff81201258 a4 : 0000000000000064

> [    0.141893]  a5 : ffffffff8029806c a6 : 0000000000000040 a7 : ffffffffffffffff

> [    0.142126]  s2 : ffffffff81201220 s3 : 0000000000000009 s4 : ffffffff81135088

> [    0.142353]  s5 : ffffffff81135038 s6 : ffffffff8080ce80 s7 : ffffffff80800438

> [    0.142584]  s8 : ffffffff80bc6578 s9 : 0000000000000008 s10: ffffffff806000ac

> [    0.142810]  s11: 0000000000000000 t3 : fffffffffffffffc t4 : 0000000000000000

> [    0.143042]  t5 : 0000000000000155 t6 : 00000000000003ff

> [    0.143220] status: 0000000000000120 badaddr: ffffffff81201220 cause: 000000000000000f

> [    0.143560] [<ffffffff8029806c>] __memset+0xc4/0xfc

> [    0.143859] [<ffffffff8061e984>] init_default_flow_dissectors+0x22/0x60

> [    0.144092] [<ffffffff800010fc>] do_one_initcall+0x3e/0x168

> [    0.144278] [<ffffffff80600df0>] kernel_init_freeable+0x1c8/0x224

> [    0.144479] [<ffffffff804868a8>] kernel_init+0x12/0x110

> [    0.144658] [<ffffffff800022de>] ret_from_exception+0x0/0xc

> [    0.145124] ---[ end trace f1e9643daa46d591 ]---

> 

> After some investigation, I think I found the root cause: commit

> 2bfc6cd81bd ("move kernel mapping outside of linear mapping") moves

> BPF JIT region after the kernel:

> 

> The &_end is unlikely aligned with PMD size, so the front bpf jit

> region sits with part of kernel .data section in one PMD size mapping.

> But kernel is mapped in PMD SIZE, when bpf_jit_binary_lock_ro() is

> called to make the first bpf jit prog ROX, we will make part of kernel

> .data section RO too, so when we write to, for example memset the

> .data section, MMU will trigger a store page fault.

> 

> To fix the issue, we need to ensure the BPF JIT region is PMD size

> aligned. This patch acchieve this goal by restoring the BPF JIT region

> to original position, I.E the 128MB before kernel .text section.

> 

> Reported-by: Andreas Schwab <schwab@linux-m68k.org>

> Signed-off-by: Jisheng Zhang <jszhang@kernel.org>

> ---

>   arch/riscv/include/asm/pgtable.h | 5 ++---

>   1 file changed, 2 insertions(+), 3 deletions(-)

> 

> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h

> index 9469f464e71a..380cd3a7e548 100644

> --- a/arch/riscv/include/asm/pgtable.h

> +++ b/arch/riscv/include/asm/pgtable.h

> @@ -30,9 +30,8 @@

>   

>   #define BPF_JIT_REGION_SIZE	(SZ_128M)

>   #ifdef CONFIG_64BIT

> -/* KASLR should leave at least 128MB for BPF after the kernel */

> -#define BPF_JIT_REGION_START	PFN_ALIGN((unsigned long)&_end)

> -#define BPF_JIT_REGION_END	(BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE)

> +#define BPF_JIT_REGION_START	(BPF_JIT_REGION_END - BPF_JIT_REGION_SIZE)

> +#define BPF_JIT_REGION_END	(MODULES_END)

>   #else

>   #define BPF_JIT_REGION_START	(PAGE_OFFSET - BPF_JIT_REGION_SIZE)

>   #define BPF_JIT_REGION_END	(VMALLOC_END)


I presume this fix will be routed via riscv tree?

Thanks,
Daniel
Alexandre Ghiti June 15, 2021, 6:54 p.m. UTC | #2
Hi Jisheng,

Le 14/06/2021 à 18:49, Jisheng Zhang a écrit :
> From: Jisheng Zhang <jszhang@kernel.org>

> 

> Andreas reported commit fc8504765ec5 ("riscv: bpf: Avoid breaking W^X")

> breaks booting with one kind of config file, I reproduced a kernel panic

> with the config:

> 

> [    0.138553] Unable to handle kernel paging request at virtual address ffffffff81201220

> [    0.139159] Oops [#1]

> [    0.139303] Modules linked in:

> [    0.139601] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-rc5-default+ #1

> [    0.139934] Hardware name: riscv-virtio,qemu (DT)

> [    0.140193] epc : __memset+0xc4/0xfc

> [    0.140416]  ra : skb_flow_dissector_init+0x1e/0x82

> [    0.140609] epc : ffffffff8029806c ra : ffffffff8033be78 sp : ffffffe001647da0

> [    0.140878]  gp : ffffffff81134b08 tp : ffffffe001654380 t0 : ffffffff81201158

> [    0.141156]  t1 : 0000000000000002 t2 : 0000000000000154 s0 : ffffffe001647dd0

> [    0.141424]  s1 : ffffffff80a43250 a0 : ffffffff81201220 a1 : 0000000000000000

> [    0.141654]  a2 : 000000000000003c a3 : ffffffff81201258 a4 : 0000000000000064

> [    0.141893]  a5 : ffffffff8029806c a6 : 0000000000000040 a7 : ffffffffffffffff

> [    0.142126]  s2 : ffffffff81201220 s3 : 0000000000000009 s4 : ffffffff81135088

> [    0.142353]  s5 : ffffffff81135038 s6 : ffffffff8080ce80 s7 : ffffffff80800438

> [    0.142584]  s8 : ffffffff80bc6578 s9 : 0000000000000008 s10: ffffffff806000ac

> [    0.142810]  s11: 0000000000000000 t3 : fffffffffffffffc t4 : 0000000000000000

> [    0.143042]  t5 : 0000000000000155 t6 : 00000000000003ff

> [    0.143220] status: 0000000000000120 badaddr: ffffffff81201220 cause: 000000000000000f

> [    0.143560] [<ffffffff8029806c>] __memset+0xc4/0xfc

> [    0.143859] [<ffffffff8061e984>] init_default_flow_dissectors+0x22/0x60

> [    0.144092] [<ffffffff800010fc>] do_one_initcall+0x3e/0x168

> [    0.144278] [<ffffffff80600df0>] kernel_init_freeable+0x1c8/0x224

> [    0.144479] [<ffffffff804868a8>] kernel_init+0x12/0x110

> [    0.144658] [<ffffffff800022de>] ret_from_exception+0x0/0xc

> [    0.145124] ---[ end trace f1e9643daa46d591 ]---

> 

> After some investigation, I think I found the root cause: commit

> 2bfc6cd81bd ("move kernel mapping outside of linear mapping") moves

> BPF JIT region after the kernel:

> 

> The &_end is unlikely aligned with PMD size, so the front bpf jit

> region sits with part of kernel .data section in one PMD size mapping.

> But kernel is mapped in PMD SIZE, when bpf_jit_binary_lock_ro() is

> called to make the first bpf jit prog ROX, we will make part of kernel

> .data section RO too, so when we write to, for example memset the

> .data section, MMU will trigger a store page fault.


Good catch, we make sure no physical allocation happens between _end and 
the next PMD aligned address, but I missed this one.

> 

> To fix the issue, we need to ensure the BPF JIT region is PMD size

> aligned. This patch acchieve this goal by restoring the BPF JIT region

> to original position, I.E the 128MB before kernel .text section.


But I disagree with your solution: I made sure modules and BPF programs 
get their own virtual regions to avoid worst case scenario where one 
could allocate all the space and leave nothing to the other (we are 
limited to +- 2GB offset). Why don't just align BPF_JIT_REGION_START to 
the next PMD aligned address?

Again, good catch, thanks,

Alex

> 

> Reported-by: Andreas Schwab <schwab@linux-m68k.org>

> Signed-off-by: Jisheng Zhang <jszhang@kernel.org>

> ---

>   arch/riscv/include/asm/pgtable.h | 5 ++---

>   1 file changed, 2 insertions(+), 3 deletions(-)

> 

> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h

> index 9469f464e71a..380cd3a7e548 100644

> --- a/arch/riscv/include/asm/pgtable.h

> +++ b/arch/riscv/include/asm/pgtable.h

> @@ -30,9 +30,8 @@

>   

>   #define BPF_JIT_REGION_SIZE	(SZ_128M)

>   #ifdef CONFIG_64BIT

> -/* KASLR should leave at least 128MB for BPF after the kernel */

> -#define BPF_JIT_REGION_START	PFN_ALIGN((unsigned long)&_end)

> -#define BPF_JIT_REGION_END	(BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE)

> +#define BPF_JIT_REGION_START	(BPF_JIT_REGION_END - BPF_JIT_REGION_SIZE)

> +#define BPF_JIT_REGION_END	(MODULES_END)

>   #else

>   #define BPF_JIT_REGION_START	(PAGE_OFFSET - BPF_JIT_REGION_SIZE)

>   #define BPF_JIT_REGION_END	(VMALLOC_END)

>
Jisheng Zhang June 16, 2021, 12:03 a.m. UTC | #3
On Tue, 15 Jun 2021 20:54:19 +0200
Alex Ghiti <alex@ghiti.fr> wrote:

> Hi Jisheng,


Hi Alex,

> 

> Le 14/06/2021 à 18:49, Jisheng Zhang a écrit :

> > From: Jisheng Zhang <jszhang@kernel.org>

> > 

> > Andreas reported commit fc8504765ec5 ("riscv: bpf: Avoid breaking W^X")

> > breaks booting with one kind of config file, I reproduced a kernel panic

> > with the config:

> > 

> > [    0.138553] Unable to handle kernel paging request at virtual address ffffffff81201220

> > [    0.139159] Oops [#1]

> > [    0.139303] Modules linked in:

> > [    0.139601] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-rc5-default+ #1

> > [    0.139934] Hardware name: riscv-virtio,qemu (DT)

> > [    0.140193] epc : __memset+0xc4/0xfc

> > [    0.140416]  ra : skb_flow_dissector_init+0x1e/0x82

> > [    0.140609] epc : ffffffff8029806c ra : ffffffff8033be78 sp : ffffffe001647da0

> > [    0.140878]  gp : ffffffff81134b08 tp : ffffffe001654380 t0 : ffffffff81201158

> > [    0.141156]  t1 : 0000000000000002 t2 : 0000000000000154 s0 : ffffffe001647dd0

> > [    0.141424]  s1 : ffffffff80a43250 a0 : ffffffff81201220 a1 : 0000000000000000

> > [    0.141654]  a2 : 000000000000003c a3 : ffffffff81201258 a4 : 0000000000000064

> > [    0.141893]  a5 : ffffffff8029806c a6 : 0000000000000040 a7 : ffffffffffffffff

> > [    0.142126]  s2 : ffffffff81201220 s3 : 0000000000000009 s4 : ffffffff81135088

> > [    0.142353]  s5 : ffffffff81135038 s6 : ffffffff8080ce80 s7 : ffffffff80800438

> > [    0.142584]  s8 : ffffffff80bc6578 s9 : 0000000000000008 s10: ffffffff806000ac

> > [    0.142810]  s11: 0000000000000000 t3 : fffffffffffffffc t4 : 0000000000000000

> > [    0.143042]  t5 : 0000000000000155 t6 : 00000000000003ff

> > [    0.143220] status: 0000000000000120 badaddr: ffffffff81201220 cause: 000000000000000f

> > [    0.143560] [<ffffffff8029806c>] __memset+0xc4/0xfc

> > [    0.143859] [<ffffffff8061e984>] init_default_flow_dissectors+0x22/0x60

> > [    0.144092] [<ffffffff800010fc>] do_one_initcall+0x3e/0x168

> > [    0.144278] [<ffffffff80600df0>] kernel_init_freeable+0x1c8/0x224

> > [    0.144479] [<ffffffff804868a8>] kernel_init+0x12/0x110

> > [    0.144658] [<ffffffff800022de>] ret_from_exception+0x0/0xc

> > [    0.145124] ---[ end trace f1e9643daa46d591 ]---

> > 

> > After some investigation, I think I found the root cause: commit

> > 2bfc6cd81bd ("move kernel mapping outside of linear mapping") moves

> > BPF JIT region after the kernel:

> > 

> > The &_end is unlikely aligned with PMD size, so the front bpf jit

> > region sits with part of kernel .data section in one PMD size mapping.

> > But kernel is mapped in PMD SIZE, when bpf_jit_binary_lock_ro() is

> > called to make the first bpf jit prog ROX, we will make part of kernel

> > .data section RO too, so when we write to, for example memset the

> > .data section, MMU will trigger a store page fault.  

> 

> Good catch, we make sure no physical allocation happens between _end and 

> the next PMD aligned address, but I missed this one.

> 

> > 

> > To fix the issue, we need to ensure the BPF JIT region is PMD size

> > aligned. This patch acchieve this goal by restoring the BPF JIT region

> > to original position, I.E the 128MB before kernel .text section.  

> 

> But I disagree with your solution: I made sure modules and BPF programs 

> get their own virtual regions to avoid worst case scenario where one 

> could allocate all the space and leave nothing to the other (we are 

> limited to +- 2GB offset). Why don't just align BPF_JIT_REGION_START to 

> the next PMD aligned address?


Originally, I planed to fix the issue by aligning BPF_JIT_REGION_START, but
IIRC, BPF experts are adding (or have added) "Calling kernel functions from BPF"
feature, there's a risk that BPF JIT region is beyond the 2GB of module region:

------
module
------
kernel
------
BPF_JIT

So I made this patch finally. In this patch, we let BPF JIT region sit
between module and kernel.

To address "make sure modules and BPF programs get their own virtual regions",
what about something as below (applied against this patch)?

diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 380cd3a7e548..da1158f10b09 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -31,7 +31,7 @@
 #define BPF_JIT_REGION_SIZE	(SZ_128M)
 #ifdef CONFIG_64BIT
 #define BPF_JIT_REGION_START	(BPF_JIT_REGION_END - BPF_JIT_REGION_SIZE)
-#define BPF_JIT_REGION_END	(MODULES_END)
+#define BPF_JIT_REGION_END	(PFN_ALIGN((unsigned long)&_start))
 #else
 #define BPF_JIT_REGION_START	(PAGE_OFFSET - BPF_JIT_REGION_SIZE)
 #define BPF_JIT_REGION_END	(VMALLOC_END)
@@ -40,7 +40,7 @@
 /* Modules always live before the kernel */
 #ifdef CONFIG_64BIT
 #define MODULES_VADDR	(PFN_ALIGN((unsigned long)&_end) - SZ_2G)
-#define MODULES_END	(PFN_ALIGN((unsigned long)&_start))
+#define MODULES_END	(BPF_JIT_REGION_END)
 #endif
 


> 

> Again, good catch, thanks,

> 

> Alex

> 

> > 

> > Reported-by: Andreas Schwab <schwab@linux-m68k.org>

> > Signed-off-by: Jisheng Zhang <jszhang@kernel.org>

> > ---

> >   arch/riscv/include/asm/pgtable.h | 5 ++---

> >   1 file changed, 2 insertions(+), 3 deletions(-)

> > 

> > diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h

> > index 9469f464e71a..380cd3a7e548 100644

> > --- a/arch/riscv/include/asm/pgtable.h

> > +++ b/arch/riscv/include/asm/pgtable.h

> > @@ -30,9 +30,8 @@

> >   

> >   #define BPF_JIT_REGION_SIZE	(SZ_128M)

> >   #ifdef CONFIG_64BIT

> > -/* KASLR should leave at least 128MB for BPF after the kernel */

> > -#define BPF_JIT_REGION_START	PFN_ALIGN((unsigned long)&_end)

> > -#define BPF_JIT_REGION_END	(BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE)

> > +#define BPF_JIT_REGION_START	(BPF_JIT_REGION_END - BPF_JIT_REGION_SIZE)

> > +#define BPF_JIT_REGION_END	(MODULES_END)

> >   #else

> >   #define BPF_JIT_REGION_START	(PAGE_OFFSET - BPF_JIT_REGION_SIZE)

> >   #define BPF_JIT_REGION_END	(VMALLOC_END)

> >
Alexandre Ghiti June 17, 2021, 7:23 a.m. UTC | #4
Le 16/06/2021 à 02:03, Jisheng Zhang a écrit :
> On Tue, 15 Jun 2021 20:54:19 +0200

> Alex Ghiti <alex@ghiti.fr> wrote:

> 

>> Hi Jisheng,

> 

> Hi Alex,

> 

>>

>> Le 14/06/2021 à 18:49, Jisheng Zhang a écrit :

>>> From: Jisheng Zhang <jszhang@kernel.org>

>>>

>>> Andreas reported commit fc8504765ec5 ("riscv: bpf: Avoid breaking W^X")

>>> breaks booting with one kind of config file, I reproduced a kernel panic

>>> with the config:

>>>

>>> [    0.138553] Unable to handle kernel paging request at virtual address ffffffff81201220

>>> [    0.139159] Oops [#1]

>>> [    0.139303] Modules linked in:

>>> [    0.139601] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-rc5-default+ #1

>>> [    0.139934] Hardware name: riscv-virtio,qemu (DT)

>>> [    0.140193] epc : __memset+0xc4/0xfc

>>> [    0.140416]  ra : skb_flow_dissector_init+0x1e/0x82

>>> [    0.140609] epc : ffffffff8029806c ra : ffffffff8033be78 sp : ffffffe001647da0

>>> [    0.140878]  gp : ffffffff81134b08 tp : ffffffe001654380 t0 : ffffffff81201158

>>> [    0.141156]  t1 : 0000000000000002 t2 : 0000000000000154 s0 : ffffffe001647dd0

>>> [    0.141424]  s1 : ffffffff80a43250 a0 : ffffffff81201220 a1 : 0000000000000000

>>> [    0.141654]  a2 : 000000000000003c a3 : ffffffff81201258 a4 : 0000000000000064

>>> [    0.141893]  a5 : ffffffff8029806c a6 : 0000000000000040 a7 : ffffffffffffffff

>>> [    0.142126]  s2 : ffffffff81201220 s3 : 0000000000000009 s4 : ffffffff81135088

>>> [    0.142353]  s5 : ffffffff81135038 s6 : ffffffff8080ce80 s7 : ffffffff80800438

>>> [    0.142584]  s8 : ffffffff80bc6578 s9 : 0000000000000008 s10: ffffffff806000ac

>>> [    0.142810]  s11: 0000000000000000 t3 : fffffffffffffffc t4 : 0000000000000000

>>> [    0.143042]  t5 : 0000000000000155 t6 : 00000000000003ff

>>> [    0.143220] status: 0000000000000120 badaddr: ffffffff81201220 cause: 000000000000000f

>>> [    0.143560] [<ffffffff8029806c>] __memset+0xc4/0xfc

>>> [    0.143859] [<ffffffff8061e984>] init_default_flow_dissectors+0x22/0x60

>>> [    0.144092] [<ffffffff800010fc>] do_one_initcall+0x3e/0x168

>>> [    0.144278] [<ffffffff80600df0>] kernel_init_freeable+0x1c8/0x224

>>> [    0.144479] [<ffffffff804868a8>] kernel_init+0x12/0x110

>>> [    0.144658] [<ffffffff800022de>] ret_from_exception+0x0/0xc

>>> [    0.145124] ---[ end trace f1e9643daa46d591 ]---

>>>

>>> After some investigation, I think I found the root cause: commit

>>> 2bfc6cd81bd ("move kernel mapping outside of linear mapping") moves

>>> BPF JIT region after the kernel:

>>>

>>> The &_end is unlikely aligned with PMD size, so the front bpf jit

>>> region sits with part of kernel .data section in one PMD size mapping.

>>> But kernel is mapped in PMD SIZE, when bpf_jit_binary_lock_ro() is

>>> called to make the first bpf jit prog ROX, we will make part of kernel

>>> .data section RO too, so when we write to, for example memset the

>>> .data section, MMU will trigger a store page fault.

>>

>> Good catch, we make sure no physical allocation happens between _end and

>> the next PMD aligned address, but I missed this one.

>>

>>>

>>> To fix the issue, we need to ensure the BPF JIT region is PMD size

>>> aligned. This patch acchieve this goal by restoring the BPF JIT region

>>> to original position, I.E the 128MB before kernel .text section.

>>

>> But I disagree with your solution: I made sure modules and BPF programs

>> get their own virtual regions to avoid worst case scenario where one

>> could allocate all the space and leave nothing to the other (we are

>> limited to +- 2GB offset). Why don't just align BPF_JIT_REGION_START to

>> the next PMD aligned address?

> 

> Originally, I planed to fix the issue by aligning BPF_JIT_REGION_START, but

> IIRC, BPF experts are adding (or have added) "Calling kernel functions from BPF"

> feature, there's a risk that BPF JIT region is beyond the 2GB of module region:

> 

> ------

> module

> ------

> kernel

> ------

> BPF_JIT

> 

> So I made this patch finally. In this patch, we let BPF JIT region sit

> between module and kernel.

> 


 From what I read in the lwn article, I'm not sure BPF programs can call 
module functions, can someone tell us if it is possible? Or planned?

> To address "make sure modules and BPF programs get their own virtual regions",

> what about something as below (applied against this patch)?

> 

> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h

> index 380cd3a7e548..da1158f10b09 100644

> --- a/arch/riscv/include/asm/pgtable.h

> +++ b/arch/riscv/include/asm/pgtable.h

> @@ -31,7 +31,7 @@

>   #define BPF_JIT_REGION_SIZE	(SZ_128M)

>   #ifdef CONFIG_64BIT

>   #define BPF_JIT_REGION_START	(BPF_JIT_REGION_END - BPF_JIT_REGION_SIZE)

> -#define BPF_JIT_REGION_END	(MODULES_END)

> +#define BPF_JIT_REGION_END	(PFN_ALIGN((unsigned long)&_start))

>   #else

>   #define BPF_JIT_REGION_START	(PAGE_OFFSET - BPF_JIT_REGION_SIZE)

>   #define BPF_JIT_REGION_END	(VMALLOC_END)

> @@ -40,7 +40,7 @@

>   /* Modules always live before the kernel */

>   #ifdef CONFIG_64BIT

>   #define MODULES_VADDR	(PFN_ALIGN((unsigned long)&_end) - SZ_2G)

> -#define MODULES_END	(PFN_ALIGN((unsigned long)&_start))

> +#define MODULES_END	(BPF_JIT_REGION_END)

>   #endif

>   

> 


In case it is possible, I would let the vmalloc allocator handle the 
case where modules steal room from BPF: I would then not implement the 
above but rather your first patch.

And do not forget to modify Documentation/riscv/vm-layout.rst 
accordingly and remove the comment "/* KASLR should leave at least 128MB 
for BPF after the kernel */"

Thanks,

Alex

> 

>>

>> Again, good catch, thanks,

>>

>> Alex

>>

>>>

>>> Reported-by: Andreas Schwab <schwab@linux-m68k.org>

>>> Signed-off-by: Jisheng Zhang <jszhang@kernel.org>

>>> ---

>>>    arch/riscv/include/asm/pgtable.h | 5 ++---

>>>    1 file changed, 2 insertions(+), 3 deletions(-)

>>>

>>> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h

>>> index 9469f464e71a..380cd3a7e548 100644

>>> --- a/arch/riscv/include/asm/pgtable.h

>>> +++ b/arch/riscv/include/asm/pgtable.h

>>> @@ -30,9 +30,8 @@

>>>    

>>>    #define BPF_JIT_REGION_SIZE	(SZ_128M)

>>>    #ifdef CONFIG_64BIT

>>> -/* KASLR should leave at least 128MB for BPF after the kernel */

>>> -#define BPF_JIT_REGION_START	PFN_ALIGN((unsigned long)&_end)

>>> -#define BPF_JIT_REGION_END	(BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE)

>>> +#define BPF_JIT_REGION_START	(BPF_JIT_REGION_END - BPF_JIT_REGION_SIZE)

>>> +#define BPF_JIT_REGION_END	(MODULES_END)

>>>    #else

>>>    #define BPF_JIT_REGION_START	(PAGE_OFFSET - BPF_JIT_REGION_SIZE)

>>>    #define BPF_JIT_REGION_END	(VMALLOC_END)

>>>    

> 

> 

> 

> _______________________________________________

> linux-riscv mailing list

> linux-riscv@lists.infradead.org

> http://lists.infradead.org/mailman/listinfo/linux-riscv

>
Palmer Dabbelt June 17, 2021, 7:30 a.m. UTC | #5
On Tue, 15 Jun 2021 17:03:28 PDT (-0700), jszhang3@mail.ustc.edu.cn wrote:
> On Tue, 15 Jun 2021 20:54:19 +0200

> Alex Ghiti <alex@ghiti.fr> wrote:

>

>> Hi Jisheng,

>

> Hi Alex,

>

>> 

>> Le 14/06/2021 à 18:49, Jisheng Zhang a écrit :

>> > From: Jisheng Zhang <jszhang@kernel.org>

>> > 

>> > Andreas reported commit fc8504765ec5 ("riscv: bpf: Avoid breaking W^X")

>> > breaks booting with one kind of config file, I reproduced a kernel panic

>> > with the config:

>> > 

>> > [    0.138553] Unable to handle kernel paging request at virtual address ffffffff81201220

>> > [    0.139159] Oops [#1]

>> > [    0.139303] Modules linked in:

>> > [    0.139601] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-rc5-default+ #1

>> > [    0.139934] Hardware name: riscv-virtio,qemu (DT)

>> > [    0.140193] epc : __memset+0xc4/0xfc

>> > [    0.140416]  ra : skb_flow_dissector_init+0x1e/0x82

>> > [    0.140609] epc : ffffffff8029806c ra : ffffffff8033be78 sp : ffffffe001647da0

>> > [    0.140878]  gp : ffffffff81134b08 tp : ffffffe001654380 t0 : ffffffff81201158

>> > [    0.141156]  t1 : 0000000000000002 t2 : 0000000000000154 s0 : ffffffe001647dd0

>> > [    0.141424]  s1 : ffffffff80a43250 a0 : ffffffff81201220 a1 : 0000000000000000

>> > [    0.141654]  a2 : 000000000000003c a3 : ffffffff81201258 a4 : 0000000000000064

>> > [    0.141893]  a5 : ffffffff8029806c a6 : 0000000000000040 a7 : ffffffffffffffff

>> > [    0.142126]  s2 : ffffffff81201220 s3 : 0000000000000009 s4 : ffffffff81135088

>> > [    0.142353]  s5 : ffffffff81135038 s6 : ffffffff8080ce80 s7 : ffffffff80800438

>> > [    0.142584]  s8 : ffffffff80bc6578 s9 : 0000000000000008 s10: ffffffff806000ac

>> > [    0.142810]  s11: 0000000000000000 t3 : fffffffffffffffc t4 : 0000000000000000

>> > [    0.143042]  t5 : 0000000000000155 t6 : 00000000000003ff

>> > [    0.143220] status: 0000000000000120 badaddr: ffffffff81201220 cause: 000000000000000f

>> > [    0.143560] [<ffffffff8029806c>] __memset+0xc4/0xfc

>> > [    0.143859] [<ffffffff8061e984>] init_default_flow_dissectors+0x22/0x60

>> > [    0.144092] [<ffffffff800010fc>] do_one_initcall+0x3e/0x168

>> > [    0.144278] [<ffffffff80600df0>] kernel_init_freeable+0x1c8/0x224

>> > [    0.144479] [<ffffffff804868a8>] kernel_init+0x12/0x110

>> > [    0.144658] [<ffffffff800022de>] ret_from_exception+0x0/0xc

>> > [    0.145124] ---[ end trace f1e9643daa46d591 ]---

>> > 

>> > After some investigation, I think I found the root cause: commit

>> > 2bfc6cd81bd ("move kernel mapping outside of linear mapping") moves

>> > BPF JIT region after the kernel:

>> > 

>> > The &_end is unlikely aligned with PMD size, so the front bpf jit

>> > region sits with part of kernel .data section in one PMD size mapping.

>> > But kernel is mapped in PMD SIZE, when bpf_jit_binary_lock_ro() is

>> > called to make the first bpf jit prog ROX, we will make part of kernel

>> > .data section RO too, so when we write to, for example memset the

>> > .data section, MMU will trigger a store page fault.  

>> 

>> Good catch, we make sure no physical allocation happens between _end and 

>> the next PMD aligned address, but I missed this one.

>> 

>> > 

>> > To fix the issue, we need to ensure the BPF JIT region is PMD size

>> > aligned. This patch acchieve this goal by restoring the BPF JIT region

>> > to original position, I.E the 128MB before kernel .text section.  

>> 

>> But I disagree with your solution: I made sure modules and BPF programs 

>> get their own virtual regions to avoid worst case scenario where one 

>> could allocate all the space and leave nothing to the other (we are 

>> limited to +- 2GB offset). Why don't just align BPF_JIT_REGION_START to 

>> the next PMD aligned address?

>

> Originally, I planed to fix the issue by aligning BPF_JIT_REGION_START, but

> IIRC, BPF experts are adding (or have added) "Calling kernel functions from BPF"

> feature, there's a risk that BPF JIT region is beyond the 2GB of module region:

>

> ------

> module

> ------

> kernel

> ------

> BPF_JIT

>

> So I made this patch finally. In this patch, we let BPF JIT region sit

> between module and kernel.

>

> To address "make sure modules and BPF programs get their own virtual regions",

> what about something as below (applied against this patch)?

>

> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h

> index 380cd3a7e548..da1158f10b09 100644

> --- a/arch/riscv/include/asm/pgtable.h

> +++ b/arch/riscv/include/asm/pgtable.h

> @@ -31,7 +31,7 @@

>  #define BPF_JIT_REGION_SIZE	(SZ_128M)

>  #ifdef CONFIG_64BIT

>  #define BPF_JIT_REGION_START	(BPF_JIT_REGION_END - BPF_JIT_REGION_SIZE)

> -#define BPF_JIT_REGION_END	(MODULES_END)

> +#define BPF_JIT_REGION_END	(PFN_ALIGN((unsigned long)&_start))

>  #else

>  #define BPF_JIT_REGION_START	(PAGE_OFFSET - BPF_JIT_REGION_SIZE)

>  #define BPF_JIT_REGION_END	(VMALLOC_END)

> @@ -40,7 +40,7 @@

>  /* Modules always live before the kernel */

>  #ifdef CONFIG_64BIT

>  #define MODULES_VADDR	(PFN_ALIGN((unsigned long)&_end) - SZ_2G)

> -#define MODULES_END	(PFN_ALIGN((unsigned long)&_start))

> +#define MODULES_END	(BPF_JIT_REGION_END)

>  #endif

>  

>

>

>> 

>> Again, good catch, thanks,

>> 

>> Alex

>> 

>> > 

>> > Reported-by: Andreas Schwab <schwab@linux-m68k.org>

>> > Signed-off-by: Jisheng Zhang <jszhang@kernel.org>

>> > ---

>> >   arch/riscv/include/asm/pgtable.h | 5 ++---

>> >   1 file changed, 2 insertions(+), 3 deletions(-)

>> > 

>> > diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h

>> > index 9469f464e71a..380cd3a7e548 100644

>> > --- a/arch/riscv/include/asm/pgtable.h

>> > +++ b/arch/riscv/include/asm/pgtable.h

>> > @@ -30,9 +30,8 @@

>> >   

>> >   #define BPF_JIT_REGION_SIZE	(SZ_128M)

>> >   #ifdef CONFIG_64BIT

>> > -/* KASLR should leave at least 128MB for BPF after the kernel */

>> > -#define BPF_JIT_REGION_START	PFN_ALIGN((unsigned long)&_end)

>> > -#define BPF_JIT_REGION_END	(BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE)

>> > +#define BPF_JIT_REGION_START	(BPF_JIT_REGION_END - BPF_JIT_REGION_SIZE)

>> > +#define BPF_JIT_REGION_END	(MODULES_END)

>> >   #else

>> >   #define BPF_JIT_REGION_START	(PAGE_OFFSET - BPF_JIT_REGION_SIZE)

>> >   #define BPF_JIT_REGION_END	(VMALLOC_END)

>> >   


This, when applied onto fixes, is breaking early boot on KASAN 
configurations for me.
Alexandre Ghiti June 17, 2021, 8:09 a.m. UTC | #6
Le 17/06/2021 à 09:30, Palmer Dabbelt a écrit :
> On Tue, 15 Jun 2021 17:03:28 PDT (-0700), jszhang3@mail.ustc.edu.cn wrote:

>> On Tue, 15 Jun 2021 20:54:19 +0200

>> Alex Ghiti <alex@ghiti.fr> wrote:

>>

>>> Hi Jisheng,

>>

>> Hi Alex,

>>

>>>

>>> Le 14/06/2021 à 18:49, Jisheng Zhang a écrit :

>>> > From: Jisheng Zhang <jszhang@kernel.org>

>>> > > Andreas reported commit fc8504765ec5 ("riscv: bpf: Avoid breaking 

>>> W^X")

>>> > breaks booting with one kind of config file, I reproduced a kernel 

>>> panic

>>> > with the config:

>>> > > [    0.138553] Unable to handle kernel paging request at virtual 

>>> address ffffffff81201220

>>> > [    0.139159] Oops [#1]

>>> > [    0.139303] Modules linked in:

>>> > [    0.139601] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 

>>> 5.13.0-rc5-default+ #1

>>> > [    0.139934] Hardware name: riscv-virtio,qemu (DT)

>>> > [    0.140193] epc : __memset+0xc4/0xfc

>>> > [    0.140416]  ra : skb_flow_dissector_init+0x1e/0x82

>>> > [    0.140609] epc : ffffffff8029806c ra : ffffffff8033be78 sp : 

>>> ffffffe001647da0

>>> > [    0.140878]  gp : ffffffff81134b08 tp : ffffffe001654380 t0 : 

>>> ffffffff81201158

>>> > [    0.141156]  t1 : 0000000000000002 t2 : 0000000000000154 s0 : 

>>> ffffffe001647dd0

>>> > [    0.141424]  s1 : ffffffff80a43250 a0 : ffffffff81201220 a1 : 

>>> 0000000000000000

>>> > [    0.141654]  a2 : 000000000000003c a3 : ffffffff81201258 a4 : 

>>> 0000000000000064

>>> > [    0.141893]  a5 : ffffffff8029806c a6 : 0000000000000040 a7 : 

>>> ffffffffffffffff

>>> > [    0.142126]  s2 : ffffffff81201220 s3 : 0000000000000009 s4 : 

>>> ffffffff81135088

>>> > [    0.142353]  s5 : ffffffff81135038 s6 : ffffffff8080ce80 s7 : 

>>> ffffffff80800438

>>> > [    0.142584]  s8 : ffffffff80bc6578 s9 : 0000000000000008 s10: 

>>> ffffffff806000ac

>>> > [    0.142810]  s11: 0000000000000000 t3 : fffffffffffffffc t4 : 

>>> 0000000000000000

>>> > [    0.143042]  t5 : 0000000000000155 t6 : 00000000000003ff

>>> > [    0.143220] status: 0000000000000120 badaddr: ffffffff81201220 

>>> cause: 000000000000000f

>>> > [    0.143560] [<ffffffff8029806c>] __memset+0xc4/0xfc

>>> > [    0.143859] [<ffffffff8061e984>] 

>>> init_default_flow_dissectors+0x22/0x60

>>> > [    0.144092] [<ffffffff800010fc>] do_one_initcall+0x3e/0x168

>>> > [    0.144278] [<ffffffff80600df0>] kernel_init_freeable+0x1c8/0x224

>>> > [    0.144479] [<ffffffff804868a8>] kernel_init+0x12/0x110

>>> > [    0.144658] [<ffffffff800022de>] ret_from_exception+0x0/0xc

>>> > [    0.145124] ---[ end trace f1e9643daa46d591 ]---

>>> > > After some investigation, I think I found the root cause: commit

>>> > 2bfc6cd81bd ("move kernel mapping outside of linear mapping") moves

>>> > BPF JIT region after the kernel:

>>> > > The &_end is unlikely aligned with PMD size, so the front bpf jit

>>> > region sits with part of kernel .data section in one PMD size mapping.

>>> > But kernel is mapped in PMD SIZE, when bpf_jit_binary_lock_ro() is

>>> > called to make the first bpf jit prog ROX, we will make part of kernel

>>> > .data section RO too, so when we write to, for example memset the

>>> > .data section, MMU will trigger a store page fault.

>>> Good catch, we make sure no physical allocation happens between _end 

>>> and the next PMD aligned address, but I missed this one.

>>>

>>> > > To fix the issue, we need to ensure the BPF JIT region is PMD size

>>> > aligned. This patch acchieve this goal by restoring the BPF JIT region

>>> > to original position, I.E the 128MB before kernel .text section.

>>> But I disagree with your solution: I made sure modules and BPF 

>>> programs get their own virtual regions to avoid worst case scenario 

>>> where one could allocate all the space and leave nothing to the other 

>>> (we are limited to +- 2GB offset). Why don't just align 

>>> BPF_JIT_REGION_START to the next PMD aligned address?

>>

>> Originally, I planed to fix the issue by aligning 

>> BPF_JIT_REGION_START, but

>> IIRC, BPF experts are adding (or have added) "Calling kernel functions 

>> from BPF"

>> feature, there's a risk that BPF JIT region is beyond the 2GB of 

>> module region:

>>

>> ------

>> module

>> ------

>> kernel

>> ------

>> BPF_JIT

>>

>> So I made this patch finally. In this patch, we let BPF JIT region sit

>> between module and kernel.

>>

>> To address "make sure modules and BPF programs get their own virtual 

>> regions",

>> what about something as below (applied against this patch)?

>>

>> diff --git a/arch/riscv/include/asm/pgtable.h 

>> b/arch/riscv/include/asm/pgtable.h

>> index 380cd3a7e548..da1158f10b09 100644

>> --- a/arch/riscv/include/asm/pgtable.h

>> +++ b/arch/riscv/include/asm/pgtable.h

>> @@ -31,7 +31,7 @@

>>  #define BPF_JIT_REGION_SIZE    (SZ_128M)

>>  #ifdef CONFIG_64BIT

>>  #define BPF_JIT_REGION_START    (BPF_JIT_REGION_END - 

>> BPF_JIT_REGION_SIZE)

>> -#define BPF_JIT_REGION_END    (MODULES_END)

>> +#define BPF_JIT_REGION_END    (PFN_ALIGN((unsigned long)&_start))

>>  #else

>>  #define BPF_JIT_REGION_START    (PAGE_OFFSET - BPF_JIT_REGION_SIZE)

>>  #define BPF_JIT_REGION_END    (VMALLOC_END)

>> @@ -40,7 +40,7 @@

>>  /* Modules always live before the kernel */

>>  #ifdef CONFIG_64BIT

>>  #define MODULES_VADDR    (PFN_ALIGN((unsigned long)&_end) - SZ_2G)

>> -#define MODULES_END    (PFN_ALIGN((unsigned long)&_start))

>> +#define MODULES_END    (BPF_JIT_REGION_END)

>>  #endif

>>

>>

>>

>>>

>>> Again, good catch, thanks,

>>>

>>> Alex

>>>

>>> > > Reported-by: Andreas Schwab <schwab@linux-m68k.org>

>>> > Signed-off-by: Jisheng Zhang <jszhang@kernel.org>

>>> > ---

>>> >   arch/riscv/include/asm/pgtable.h | 5 ++---

>>> >   1 file changed, 2 insertions(+), 3 deletions(-)

>>> > > diff --git a/arch/riscv/include/asm/pgtable.h 

>>> b/arch/riscv/include/asm/pgtable.h

>>> > index 9469f464e71a..380cd3a7e548 100644

>>> > --- a/arch/riscv/include/asm/pgtable.h

>>> > +++ b/arch/riscv/include/asm/pgtable.h

>>> > @@ -30,9 +30,8 @@

>>> > >   #define BPF_JIT_REGION_SIZE    (SZ_128M)

>>> >   #ifdef CONFIG_64BIT

>>> > -/* KASLR should leave at least 128MB for BPF after the kernel */

>>> > -#define BPF_JIT_REGION_START    PFN_ALIGN((unsigned long)&_end)

>>> > -#define BPF_JIT_REGION_END    (BPF_JIT_REGION_START + 

>>> BPF_JIT_REGION_SIZE)

>>> > +#define BPF_JIT_REGION_START    (BPF_JIT_REGION_END - 

>>> BPF_JIT_REGION_SIZE)

>>> > +#define BPF_JIT_REGION_END    (MODULES_END)

>>> >   #else

>>> >   #define BPF_JIT_REGION_START    (PAGE_OFFSET - BPF_JIT_REGION_SIZE)

>>> >   #define BPF_JIT_REGION_END    (VMALLOC_END)

>>> > 

> 

> This, when applied onto fixes, is breaking early boot on KASAN 

> configurations for me.


Not surprising, I took a shortcut when initializing KASAN for modules, 
kernel and BPF:

         kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),
                        kasan_mem_to_shadow((const void 
*)BPF_JIT_REGION_END));

The kernel is then not covered, I'm taking a look at how to fix that 
properly.

> 

> _______________________________________________

> linux-riscv mailing list

> linux-riscv@lists.infradead.org

> http://lists.infradead.org/mailman/listinfo/linux-riscv
Alexandre Ghiti June 17, 2021, 2:18 p.m. UTC | #7
Le 17/06/2021 à 10:09, Alex Ghiti a écrit :
> Le 17/06/2021 à 09:30, Palmer Dabbelt a écrit :

>> On Tue, 15 Jun 2021 17:03:28 PDT (-0700), jszhang3@mail.ustc.edu.cn 

>> wrote:

>>> On Tue, 15 Jun 2021 20:54:19 +0200

>>> Alex Ghiti <alex@ghiti.fr> wrote:

>>>

>>>> Hi Jisheng,

>>>

>>> Hi Alex,

>>>

>>>>

>>>> Le 14/06/2021 à 18:49, Jisheng Zhang a écrit :

>>>> > From: Jisheng Zhang <jszhang@kernel.org>

>>>> > > Andreas reported commit fc8504765ec5 ("riscv: bpf: Avoid 

>>>> breaking W^X")

>>>> > breaks booting with one kind of config file, I reproduced a kernel 

>>>> panic

>>>> > with the config:

>>>> > > [    0.138553] Unable to handle kernel paging request at virtual 

>>>> address ffffffff81201220

>>>> > [    0.139159] Oops [#1]

>>>> > [    0.139303] Modules linked in:

>>>> > [    0.139601] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 

>>>> 5.13.0-rc5-default+ #1

>>>> > [    0.139934] Hardware name: riscv-virtio,qemu (DT)

>>>> > [    0.140193] epc : __memset+0xc4/0xfc

>>>> > [    0.140416]  ra : skb_flow_dissector_init+0x1e/0x82

>>>> > [    0.140609] epc : ffffffff8029806c ra : ffffffff8033be78 sp : 

>>>> ffffffe001647da0

>>>> > [    0.140878]  gp : ffffffff81134b08 tp : ffffffe001654380 t0 : 

>>>> ffffffff81201158

>>>> > [    0.141156]  t1 : 0000000000000002 t2 : 0000000000000154 s0 : 

>>>> ffffffe001647dd0

>>>> > [    0.141424]  s1 : ffffffff80a43250 a0 : ffffffff81201220 a1 : 

>>>> 0000000000000000

>>>> > [    0.141654]  a2 : 000000000000003c a3 : ffffffff81201258 a4 : 

>>>> 0000000000000064

>>>> > [    0.141893]  a5 : ffffffff8029806c a6 : 0000000000000040 a7 : 

>>>> ffffffffffffffff

>>>> > [    0.142126]  s2 : ffffffff81201220 s3 : 0000000000000009 s4 : 

>>>> ffffffff81135088

>>>> > [    0.142353]  s5 : ffffffff81135038 s6 : ffffffff8080ce80 s7 : 

>>>> ffffffff80800438

>>>> > [    0.142584]  s8 : ffffffff80bc6578 s9 : 0000000000000008 s10: 

>>>> ffffffff806000ac

>>>> > [    0.142810]  s11: 0000000000000000 t3 : fffffffffffffffc t4 : 

>>>> 0000000000000000

>>>> > [    0.143042]  t5 : 0000000000000155 t6 : 00000000000003ff

>>>> > [    0.143220] status: 0000000000000120 badaddr: ffffffff81201220 

>>>> cause: 000000000000000f

>>>> > [    0.143560] [<ffffffff8029806c>] __memset+0xc4/0xfc

>>>> > [    0.143859] [<ffffffff8061e984>] 

>>>> init_default_flow_dissectors+0x22/0x60

>>>> > [    0.144092] [<ffffffff800010fc>] do_one_initcall+0x3e/0x168

>>>> > [    0.144278] [<ffffffff80600df0>] kernel_init_freeable+0x1c8/0x224

>>>> > [    0.144479] [<ffffffff804868a8>] kernel_init+0x12/0x110

>>>> > [    0.144658] [<ffffffff800022de>] ret_from_exception+0x0/0xc

>>>> > [    0.145124] ---[ end trace f1e9643daa46d591 ]---

>>>> > > After some investigation, I think I found the root cause: commit

>>>> > 2bfc6cd81bd ("move kernel mapping outside of linear mapping") moves

>>>> > BPF JIT region after the kernel:

>>>> > > The &_end is unlikely aligned with PMD size, so the front bpf jit

>>>> > region sits with part of kernel .data section in one PMD size 

>>>> mapping.

>>>> > But kernel is mapped in PMD SIZE, when bpf_jit_binary_lock_ro() is

>>>> > called to make the first bpf jit prog ROX, we will make part of 

>>>> kernel

>>>> > .data section RO too, so when we write to, for example memset the

>>>> > .data section, MMU will trigger a store page fault.

>>>> Good catch, we make sure no physical allocation happens between _end 

>>>> and the next PMD aligned address, but I missed this one.

>>>>

>>>> > > To fix the issue, we need to ensure the BPF JIT region is PMD size

>>>> > aligned. This patch acchieve this goal by restoring the BPF JIT 

>>>> region

>>>> > to original position, I.E the 128MB before kernel .text section.

>>>> But I disagree with your solution: I made sure modules and BPF 

>>>> programs get their own virtual regions to avoid worst case scenario 

>>>> where one could allocate all the space and leave nothing to the 

>>>> other (we are limited to +- 2GB offset). Why don't just align 

>>>> BPF_JIT_REGION_START to the next PMD aligned address?

>>>

>>> Originally, I planed to fix the issue by aligning 

>>> BPF_JIT_REGION_START, but

>>> IIRC, BPF experts are adding (or have added) "Calling kernel 

>>> functions from BPF"

>>> feature, there's a risk that BPF JIT region is beyond the 2GB of 

>>> module region:

>>>

>>> ------

>>> module

>>> ------

>>> kernel

>>> ------

>>> BPF_JIT

>>>

>>> So I made this patch finally. In this patch, we let BPF JIT region sit

>>> between module and kernel.

>>>

>>> To address "make sure modules and BPF programs get their own virtual 

>>> regions",

>>> what about something as below (applied against this patch)?

>>>

>>> diff --git a/arch/riscv/include/asm/pgtable.h 

>>> b/arch/riscv/include/asm/pgtable.h

>>> index 380cd3a7e548..da1158f10b09 100644

>>> --- a/arch/riscv/include/asm/pgtable.h

>>> +++ b/arch/riscv/include/asm/pgtable.h

>>> @@ -31,7 +31,7 @@

>>>  #define BPF_JIT_REGION_SIZE    (SZ_128M)

>>>  #ifdef CONFIG_64BIT

>>>  #define BPF_JIT_REGION_START    (BPF_JIT_REGION_END - 

>>> BPF_JIT_REGION_SIZE)

>>> -#define BPF_JIT_REGION_END    (MODULES_END)

>>> +#define BPF_JIT_REGION_END    (PFN_ALIGN((unsigned long)&_start))

>>>  #else

>>>  #define BPF_JIT_REGION_START    (PAGE_OFFSET - BPF_JIT_REGION_SIZE)

>>>  #define BPF_JIT_REGION_END    (VMALLOC_END)

>>> @@ -40,7 +40,7 @@

>>>  /* Modules always live before the kernel */

>>>  #ifdef CONFIG_64BIT

>>>  #define MODULES_VADDR    (PFN_ALIGN((unsigned long)&_end) - SZ_2G)

>>> -#define MODULES_END    (PFN_ALIGN((unsigned long)&_start))

>>> +#define MODULES_END    (BPF_JIT_REGION_END)

>>>  #endif

>>>

>>>

>>>

>>>>

>>>> Again, good catch, thanks,

>>>>

>>>> Alex

>>>>

>>>> > > Reported-by: Andreas Schwab <schwab@linux-m68k.org>

>>>> > Signed-off-by: Jisheng Zhang <jszhang@kernel.org>

>>>> > ---

>>>> >   arch/riscv/include/asm/pgtable.h | 5 ++---

>>>> >   1 file changed, 2 insertions(+), 3 deletions(-)

>>>> > > diff --git a/arch/riscv/include/asm/pgtable.h 

>>>> b/arch/riscv/include/asm/pgtable.h

>>>> > index 9469f464e71a..380cd3a7e548 100644

>>>> > --- a/arch/riscv/include/asm/pgtable.h

>>>> > +++ b/arch/riscv/include/asm/pgtable.h

>>>> > @@ -30,9 +30,8 @@

>>>> > >   #define BPF_JIT_REGION_SIZE    (SZ_128M)

>>>> >   #ifdef CONFIG_64BIT

>>>> > -/* KASLR should leave at least 128MB for BPF after the kernel */

>>>> > -#define BPF_JIT_REGION_START    PFN_ALIGN((unsigned long)&_end)

>>>> > -#define BPF_JIT_REGION_END    (BPF_JIT_REGION_START + 

>>>> BPF_JIT_REGION_SIZE)

>>>> > +#define BPF_JIT_REGION_START    (BPF_JIT_REGION_END - 

>>>> BPF_JIT_REGION_SIZE)

>>>> > +#define BPF_JIT_REGION_END    (MODULES_END)

>>>> >   #else

>>>> >   #define BPF_JIT_REGION_START    (PAGE_OFFSET - BPF_JIT_REGION_SIZE)

>>>> >   #define BPF_JIT_REGION_END    (VMALLOC_END)

>>>> > 

>>

>> This, when applied onto fixes, is breaking early boot on KASAN 

>> configurations for me.

> 

> Not surprising, I took a shortcut when initializing KASAN for modules, 

> kernel and BPF:

> 

>          kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),

>                         kasan_mem_to_shadow((const void 

> *)BPF_JIT_REGION_END));

> 

> The kernel is then not covered, I'm taking a look at how to fix that 

> properly.

>


The following based on "riscv: Introduce structure that group all 
variables regarding kernel mapping" fixes the issue:

diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c
index 9daacae93e33..2a45ea909e7f 100644
--- a/arch/riscv/mm/kasan_init.c
+++ b/arch/riscv/mm/kasan_init.c
@@ -199,9 +199,12 @@ void __init kasan_init(void)
                 kasan_populate(kasan_mem_to_shadow(start), 
kasan_mem_to_shadow(end));
         }

-       /* Populate kernel, BPF, modules mapping */
+       /* Populate BPF and modules mapping: modules mapping encompasses 
BPF mapping */
         kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),
-                      kasan_mem_to_shadow((const void 
*)BPF_JIT_REGION_END));
+                      kasan_mem_to_shadow((const void *)MODULES_END));
+       /* Populate kernel mapping */
+       kasan_populate(kasan_mem_to_shadow((const void 
*)kernel_map.virt_addr),
+                      kasan_mem_to_shadow((const void 
*)kernel_map.virt_addr + kernel_map.size));


Without the mentioned patch, replace kernel_map.virt_addr with 
kernel_virt_addr and kernel_map.size with load_sz. Note that load_sz was 
re-exposed in v6 of the patchset "Map the kernel with correct 
permissions the first time".

Alex

>>

>> _______________________________________________

>> linux-riscv mailing list

>> linux-riscv@lists.infradead.org

>> http://lists.infradead.org/mailman/listinfo/linux-riscv

> 

> _______________________________________________

> linux-riscv mailing list

> linux-riscv@lists.infradead.org

> http://lists.infradead.org/mailman/listinfo/linux-riscv
Jisheng Zhang June 17, 2021, 5:17 p.m. UTC | #8
On Thu, 17 Jun 2021 09:23:04 +0200
Alex Ghiti <alex@ghiti.fr> wrote:

> Le 16/06/2021 à 02:03, Jisheng Zhang a écrit :

> > On Tue, 15 Jun 2021 20:54:19 +0200

> > Alex Ghiti <alex@ghiti.fr> wrote:

> >   

> >> Hi Jisheng,  

> > 

> > Hi Alex,

> >   

> >>

> >> Le 14/06/2021 à 18:49, Jisheng Zhang a écrit :  

> >>> From: Jisheng Zhang <jszhang@kernel.org>

> >>>

> >>> Andreas reported commit fc8504765ec5 ("riscv: bpf: Avoid breaking W^X")

> >>> breaks booting with one kind of config file, I reproduced a kernel panic

> >>> with the config:

> >>>

> >>> [    0.138553] Unable to handle kernel paging request at virtual address ffffffff81201220

> >>> [    0.139159] Oops [#1]

> >>> [    0.139303] Modules linked in:

> >>> [    0.139601] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-rc5-default+ #1

> >>> [    0.139934] Hardware name: riscv-virtio,qemu (DT)

> >>> [    0.140193] epc : __memset+0xc4/0xfc

> >>> [    0.140416]  ra : skb_flow_dissector_init+0x1e/0x82

> >>> [    0.140609] epc : ffffffff8029806c ra : ffffffff8033be78 sp : ffffffe001647da0

> >>> [    0.140878]  gp : ffffffff81134b08 tp : ffffffe001654380 t0 : ffffffff81201158

> >>> [    0.141156]  t1 : 0000000000000002 t2 : 0000000000000154 s0 : ffffffe001647dd0

> >>> [    0.141424]  s1 : ffffffff80a43250 a0 : ffffffff81201220 a1 : 0000000000000000

> >>> [    0.141654]  a2 : 000000000000003c a3 : ffffffff81201258 a4 : 0000000000000064

> >>> [    0.141893]  a5 : ffffffff8029806c a6 : 0000000000000040 a7 : ffffffffffffffff

> >>> [    0.142126]  s2 : ffffffff81201220 s3 : 0000000000000009 s4 : ffffffff81135088

> >>> [    0.142353]  s5 : ffffffff81135038 s6 : ffffffff8080ce80 s7 : ffffffff80800438

> >>> [    0.142584]  s8 : ffffffff80bc6578 s9 : 0000000000000008 s10: ffffffff806000ac

> >>> [    0.142810]  s11: 0000000000000000 t3 : fffffffffffffffc t4 : 0000000000000000

> >>> [    0.143042]  t5 : 0000000000000155 t6 : 00000000000003ff

> >>> [    0.143220] status: 0000000000000120 badaddr: ffffffff81201220 cause: 000000000000000f

> >>> [    0.143560] [<ffffffff8029806c>] __memset+0xc4/0xfc

> >>> [    0.143859] [<ffffffff8061e984>] init_default_flow_dissectors+0x22/0x60

> >>> [    0.144092] [<ffffffff800010fc>] do_one_initcall+0x3e/0x168

> >>> [    0.144278] [<ffffffff80600df0>] kernel_init_freeable+0x1c8/0x224

> >>> [    0.144479] [<ffffffff804868a8>] kernel_init+0x12/0x110

> >>> [    0.144658] [<ffffffff800022de>] ret_from_exception+0x0/0xc

> >>> [    0.145124] ---[ end trace f1e9643daa46d591 ]---

> >>>

> >>> After some investigation, I think I found the root cause: commit

> >>> 2bfc6cd81bd ("move kernel mapping outside of linear mapping") moves

> >>> BPF JIT region after the kernel:

> >>>

> >>> The &_end is unlikely aligned with PMD size, so the front bpf jit

> >>> region sits with part of kernel .data section in one PMD size mapping.

> >>> But kernel is mapped in PMD SIZE, when bpf_jit_binary_lock_ro() is

> >>> called to make the first bpf jit prog ROX, we will make part of kernel

> >>> .data section RO too, so when we write to, for example memset the

> >>> .data section, MMU will trigger a store page fault.  

> >>

> >> Good catch, we make sure no physical allocation happens between _end and

> >> the next PMD aligned address, but I missed this one.

> >>  

> >>>

> >>> To fix the issue, we need to ensure the BPF JIT region is PMD size

> >>> aligned. This patch acchieve this goal by restoring the BPF JIT region

> >>> to original position, I.E the 128MB before kernel .text section.  

> >>

> >> But I disagree with your solution: I made sure modules and BPF programs

> >> get their own virtual regions to avoid worst case scenario where one

> >> could allocate all the space and leave nothing to the other (we are

> >> limited to +- 2GB offset). Why don't just align BPF_JIT_REGION_START to

> >> the next PMD aligned address?  

> > 

> > Originally, I planed to fix the issue by aligning BPF_JIT_REGION_START, but

> > IIRC, BPF experts are adding (or have added) "Calling kernel functions from BPF"

> > feature, there's a risk that BPF JIT region is beyond the 2GB of module region:

> > 

> > ------

> > module

> > ------

> > kernel

> > ------

> > BPF_JIT

> > 

> > So I made this patch finally. In this patch, we let BPF JIT region sit

> > between module and kernel.

> >   

> 

>  From what I read in the lwn article, I'm not sure BPF programs can call 

> module functions, can someone tell us if it is possible? Or planned?


What about module call BPF program? this case also wants the 2GB address limit.

> 

> > To address "make sure modules and BPF programs get their own virtual regions",

> > what about something as below (applied against this patch)?

> > 

> > diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h

> > index 380cd3a7e548..da1158f10b09 100644

> > --- a/arch/riscv/include/asm/pgtable.h

> > +++ b/arch/riscv/include/asm/pgtable.h

> > @@ -31,7 +31,7 @@

> >   #define BPF_JIT_REGION_SIZE	(SZ_128M)

> >   #ifdef CONFIG_64BIT

> >   #define BPF_JIT_REGION_START	(BPF_JIT_REGION_END - BPF_JIT_REGION_SIZE)

> > -#define BPF_JIT_REGION_END	(MODULES_END)

> > +#define BPF_JIT_REGION_END	(PFN_ALIGN((unsigned long)&_start))

> >   #else

> >   #define BPF_JIT_REGION_START	(PAGE_OFFSET - BPF_JIT_REGION_SIZE)

> >   #define BPF_JIT_REGION_END	(VMALLOC_END)

> > @@ -40,7 +40,7 @@

> >   /* Modules always live before the kernel */

> >   #ifdef CONFIG_64BIT

> >   #define MODULES_VADDR	(PFN_ALIGN((unsigned long)&_end) - SZ_2G)

> > -#define MODULES_END	(PFN_ALIGN((unsigned long)&_start))

> > +#define MODULES_END	(BPF_JIT_REGION_END)

> >   #endif

> >   

> >   

> 

> In case it is possible, I would let the vmalloc allocator handle the 

> case where modules steal room from BPF: I would then not implement the 

> above but rather your first patch.

> 

> And do not forget to modify Documentation/riscv/vm-layout.rst 

> accordingly and remove the comment "/* KASLR should leave at least 128MB 

> for BPF after the kernel */"


Thanks for the comments
Jisheng Zhang June 17, 2021, 5:27 p.m. UTC | #9
On Thu, 17 Jun 2021 16:18:54 +0200
Alex Ghiti <alex@ghiti.fr> wrote:

> Le 17/06/2021 à 10:09, Alex Ghiti a écrit :

> > Le 17/06/2021 à 09:30, Palmer Dabbelt a écrit :  

> >> On Tue, 15 Jun 2021 17:03:28 PDT (-0700), jszhang3@mail.ustc.edu.cn 

> >> wrote:  

> >>> On Tue, 15 Jun 2021 20:54:19 +0200

> >>> Alex Ghiti <alex@ghiti.fr> wrote:

> >>>  

> >>>> Hi Jisheng,  

> >>>

> >>> Hi Alex,

> >>>  

> >>>>

> >>>> Le 14/06/2021 à 18:49, Jisheng Zhang a écrit :  

> >>>> > From: Jisheng Zhang <jszhang@kernel.org>  

> >>>> > > Andreas reported commit fc8504765ec5 ("riscv: bpf: Avoid   

> >>>> breaking W^X")  

> >>>> > breaks booting with one kind of config file, I reproduced a kernel   

> >>>> panic  

> >>>> > with the config:  

> >>>> > > [    0.138553] Unable to handle kernel paging request at virtual   

> >>>> address ffffffff81201220  

> >>>> > [    0.139159] Oops [#1]

> >>>> > [    0.139303] Modules linked in:

> >>>> > [    0.139601] CPU: 0 PID: 1 Comm: swapper/0 Not tainted   

> >>>> 5.13.0-rc5-default+ #1  

> >>>> > [    0.139934] Hardware name: riscv-virtio,qemu (DT)

> >>>> > [    0.140193] epc : __memset+0xc4/0xfc

> >>>> > [    0.140416]  ra : skb_flow_dissector_init+0x1e/0x82

> >>>> > [    0.140609] epc : ffffffff8029806c ra : ffffffff8033be78 sp :   

> >>>> ffffffe001647da0  

> >>>> > [    0.140878]  gp : ffffffff81134b08 tp : ffffffe001654380 t0 :   

> >>>> ffffffff81201158  

> >>>> > [    0.141156]  t1 : 0000000000000002 t2 : 0000000000000154 s0 :   

> >>>> ffffffe001647dd0  

> >>>> > [    0.141424]  s1 : ffffffff80a43250 a0 : ffffffff81201220 a1 :   

> >>>> 0000000000000000  

> >>>> > [    0.141654]  a2 : 000000000000003c a3 : ffffffff81201258 a4 :   

> >>>> 0000000000000064  

> >>>> > [    0.141893]  a5 : ffffffff8029806c a6 : 0000000000000040 a7 :   

> >>>> ffffffffffffffff  

> >>>> > [    0.142126]  s2 : ffffffff81201220 s3 : 0000000000000009 s4 :   

> >>>> ffffffff81135088  

> >>>> > [    0.142353]  s5 : ffffffff81135038 s6 : ffffffff8080ce80 s7 :   

> >>>> ffffffff80800438  

> >>>> > [    0.142584]  s8 : ffffffff80bc6578 s9 : 0000000000000008 s10:   

> >>>> ffffffff806000ac  

> >>>> > [    0.142810]  s11: 0000000000000000 t3 : fffffffffffffffc t4 :   

> >>>> 0000000000000000  

> >>>> > [    0.143042]  t5 : 0000000000000155 t6 : 00000000000003ff

> >>>> > [    0.143220] status: 0000000000000120 badaddr: ffffffff81201220   

> >>>> cause: 000000000000000f  

> >>>> > [    0.143560] [<ffffffff8029806c>] __memset+0xc4/0xfc

> >>>> > [    0.143859] [<ffffffff8061e984>]   

> >>>> init_default_flow_dissectors+0x22/0x60  

> >>>> > [    0.144092] [<ffffffff800010fc>] do_one_initcall+0x3e/0x168

> >>>> > [    0.144278] [<ffffffff80600df0>] kernel_init_freeable+0x1c8/0x224

> >>>> > [    0.144479] [<ffffffff804868a8>] kernel_init+0x12/0x110

> >>>> > [    0.144658] [<ffffffff800022de>] ret_from_exception+0x0/0xc

> >>>> > [    0.145124] ---[ end trace f1e9643daa46d591 ]---  

> >>>> > > After some investigation, I think I found the root cause: commit  

> >>>> > 2bfc6cd81bd ("move kernel mapping outside of linear mapping") moves

> >>>> > BPF JIT region after the kernel:  

> >>>> > > The &_end is unlikely aligned with PMD size, so the front bpf jit  

> >>>> > region sits with part of kernel .data section in one PMD size   

> >>>> mapping.  

> >>>> > But kernel is mapped in PMD SIZE, when bpf_jit_binary_lock_ro() is

> >>>> > called to make the first bpf jit prog ROX, we will make part of   

> >>>> kernel  

> >>>> > .data section RO too, so when we write to, for example memset the

> >>>> > .data section, MMU will trigger a store page fault.  

> >>>> Good catch, we make sure no physical allocation happens between _end 

> >>>> and the next PMD aligned address, but I missed this one.

> >>>>  

> >>>> > > To fix the issue, we need to ensure the BPF JIT region is PMD size  

> >>>> > aligned. This patch acchieve this goal by restoring the BPF JIT   

> >>>> region  

> >>>> > to original position, I.E the 128MB before kernel .text section.  

> >>>> But I disagree with your solution: I made sure modules and BPF 

> >>>> programs get their own virtual regions to avoid worst case scenario 

> >>>> where one could allocate all the space and leave nothing to the 

> >>>> other (we are limited to +- 2GB offset). Why don't just align 

> >>>> BPF_JIT_REGION_START to the next PMD aligned address?  

> >>>

> >>> Originally, I planed to fix the issue by aligning 

> >>> BPF_JIT_REGION_START, but

> >>> IIRC, BPF experts are adding (or have added) "Calling kernel 

> >>> functions from BPF"

> >>> feature, there's a risk that BPF JIT region is beyond the 2GB of 

> >>> module region:

> >>>

> >>> ------

> >>> module

> >>> ------

> >>> kernel

> >>> ------

> >>> BPF_JIT

> >>>

> >>> So I made this patch finally. In this patch, we let BPF JIT region sit

> >>> between module and kernel.

> >>>

> >>> To address "make sure modules and BPF programs get their own virtual 

> >>> regions",

> >>> what about something as below (applied against this patch)?

> >>>

> >>> diff --git a/arch/riscv/include/asm/pgtable.h 

> >>> b/arch/riscv/include/asm/pgtable.h

> >>> index 380cd3a7e548..da1158f10b09 100644

> >>> --- a/arch/riscv/include/asm/pgtable.h

> >>> +++ b/arch/riscv/include/asm/pgtable.h

> >>> @@ -31,7 +31,7 @@

> >>>  #define BPF_JIT_REGION_SIZE    (SZ_128M)

> >>>  #ifdef CONFIG_64BIT

> >>>  #define BPF_JIT_REGION_START    (BPF_JIT_REGION_END - 

> >>> BPF_JIT_REGION_SIZE)

> >>> -#define BPF_JIT_REGION_END    (MODULES_END)

> >>> +#define BPF_JIT_REGION_END    (PFN_ALIGN((unsigned long)&_start))

> >>>  #else

> >>>  #define BPF_JIT_REGION_START    (PAGE_OFFSET - BPF_JIT_REGION_SIZE)

> >>>  #define BPF_JIT_REGION_END    (VMALLOC_END)

> >>> @@ -40,7 +40,7 @@

> >>>  /* Modules always live before the kernel */

> >>>  #ifdef CONFIG_64BIT

> >>>  #define MODULES_VADDR    (PFN_ALIGN((unsigned long)&_end) - SZ_2G)

> >>> -#define MODULES_END    (PFN_ALIGN((unsigned long)&_start))

> >>> +#define MODULES_END    (BPF_JIT_REGION_END)

> >>>  #endif

> >>>

> >>>

> >>>  

> >>>>

> >>>> Again, good catch, thanks,

> >>>>

> >>>> Alex

> >>>>  

> >>>> > > Reported-by: Andreas Schwab <schwab@linux-m68k.org>  

> >>>> > Signed-off-by: Jisheng Zhang <jszhang@kernel.org>

> >>>> > ---

> >>>> >   arch/riscv/include/asm/pgtable.h | 5 ++---

> >>>> >   1 file changed, 2 insertions(+), 3 deletions(-)  

> >>>> > > diff --git a/arch/riscv/include/asm/pgtable.h   

> >>>> b/arch/riscv/include/asm/pgtable.h  

> >>>> > index 9469f464e71a..380cd3a7e548 100644

> >>>> > --- a/arch/riscv/include/asm/pgtable.h

> >>>> > +++ b/arch/riscv/include/asm/pgtable.h

> >>>> > @@ -30,9 +30,8 @@  

> >>>> > >   #define BPF_JIT_REGION_SIZE    (SZ_128M)  

> >>>> >   #ifdef CONFIG_64BIT

> >>>> > -/* KASLR should leave at least 128MB for BPF after the kernel */

> >>>> > -#define BPF_JIT_REGION_START    PFN_ALIGN((unsigned long)&_end)

> >>>> > -#define BPF_JIT_REGION_END    (BPF_JIT_REGION_START +   

> >>>> BPF_JIT_REGION_SIZE)  

> >>>> > +#define BPF_JIT_REGION_START    (BPF_JIT_REGION_END -   

> >>>> BPF_JIT_REGION_SIZE)  

> >>>> > +#define BPF_JIT_REGION_END    (MODULES_END)

> >>>> >   #else

> >>>> >   #define BPF_JIT_REGION_START    (PAGE_OFFSET - BPF_JIT_REGION_SIZE)

> >>>> >   #define BPF_JIT_REGION_END    (VMALLOC_END)

> >>>> >   

> >>

> >> This, when applied onto fixes, is breaking early boot on KASAN 

> >> configurations for me.  


I can reproduce this issue.

> > 

> > Not surprising, I took a shortcut when initializing KASAN for modules, 

> > kernel and BPF:

> > 

> >          kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),

> >                         kasan_mem_to_shadow((const void 

> > *)BPF_JIT_REGION_END));

> > 

> > The kernel is then not covered, I'm taking a look at how to fix that 

> > properly.

> >  

> 

> The following based on "riscv: Introduce structure that group all 

> variables regarding kernel mapping" fixes the issue:

> 

> diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c

> index 9daacae93e33..2a45ea909e7f 100644

> --- a/arch/riscv/mm/kasan_init.c

> +++ b/arch/riscv/mm/kasan_init.c

> @@ -199,9 +199,12 @@ void __init kasan_init(void)

>                  kasan_populate(kasan_mem_to_shadow(start), 

> kasan_mem_to_shadow(end));

>          }

> 

> -       /* Populate kernel, BPF, modules mapping */

> +       /* Populate BPF and modules mapping: modules mapping encompasses 

> BPF mapping */

>          kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),

> -                      kasan_mem_to_shadow((const void 

> *)BPF_JIT_REGION_END));

> +                      kasan_mem_to_shadow((const void *)MODULES_END));

> +       /* Populate kernel mapping */

> +       kasan_populate(kasan_mem_to_shadow((const void 

> *)kernel_map.virt_addr),

> +                      kasan_mem_to_shadow((const void 

> *)kernel_map.virt_addr + kernel_map.size));

>

If this patch works, maybe we can still use one kasan_populate() to cover
kernel, bpf, and module:

        kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),
-                      kasan_mem_to_shadow((const void *)BPF_JIT_REGION_END));
+                      kasan_mem_to_shadow((const void *)MODULES_VADDR + SZ_2G));

However, both can't solve the early boot hang issue. I'm not sure what's missing.

I applied your patch on rc6 + solution below "replace kernel_map.virt_addr with kernel_virt_addr and
kernel_map.size with load_sz"


Thanks
 
> 

> Without the mentioned patch, replace kernel_map.virt_addr with 

> kernel_virt_addr and kernel_map.size with load_sz. Note that load_sz was 

> re-exposed in v6 of the patchset "Map the kernel with correct 

> permissions the first time".

>
Jisheng Zhang June 17, 2021, 5:46 p.m. UTC | #10
On Fri, 18 Jun 2021 01:27:31 +0800
Jisheng Zhang <jszhang3@mail.ustc.edu.cn> wrote:

> On Thu, 17 Jun 2021 16:18:54 +0200

> Alex Ghiti <alex@ghiti.fr> wrote:

> 

> > Le 17/06/2021 à 10:09, Alex Ghiti a écrit :  

> > > Le 17/06/2021 à 09:30, Palmer Dabbelt a écrit :    

> > >> On Tue, 15 Jun 2021 17:03:28 PDT (-0700), jszhang3@mail.ustc.edu.cn 

> > >> wrote:    

> > >>> On Tue, 15 Jun 2021 20:54:19 +0200

> > >>> Alex Ghiti <alex@ghiti.fr> wrote:

> > >>>    

> > >>>> Hi Jisheng,    

> > >>>

> > >>> Hi Alex,

> > >>>    

> > >>>>

> > >>>> Le 14/06/2021 à 18:49, Jisheng Zhang a écrit :    

> > >>>> > From: Jisheng Zhang <jszhang@kernel.org>    

> > >>>> > > Andreas reported commit fc8504765ec5 ("riscv: bpf: Avoid     

> > >>>> breaking W^X")    

> > >>>> > breaks booting with one kind of config file, I reproduced a kernel     

> > >>>> panic    

> > >>>> > with the config:    

> > >>>> > > [    0.138553] Unable to handle kernel paging request at virtual     

> > >>>> address ffffffff81201220    

> > >>>> > [    0.139159] Oops [#1]

> > >>>> > [    0.139303] Modules linked in:

> > >>>> > [    0.139601] CPU: 0 PID: 1 Comm: swapper/0 Not tainted     

> > >>>> 5.13.0-rc5-default+ #1    

> > >>>> > [    0.139934] Hardware name: riscv-virtio,qemu (DT)

> > >>>> > [    0.140193] epc : __memset+0xc4/0xfc

> > >>>> > [    0.140416]  ra : skb_flow_dissector_init+0x1e/0x82

> > >>>> > [    0.140609] epc : ffffffff8029806c ra : ffffffff8033be78 sp :     

> > >>>> ffffffe001647da0    

> > >>>> > [    0.140878]  gp : ffffffff81134b08 tp : ffffffe001654380 t0 :     

> > >>>> ffffffff81201158    

> > >>>> > [    0.141156]  t1 : 0000000000000002 t2 : 0000000000000154 s0 :     

> > >>>> ffffffe001647dd0    

> > >>>> > [    0.141424]  s1 : ffffffff80a43250 a0 : ffffffff81201220 a1 :     

> > >>>> 0000000000000000    

> > >>>> > [    0.141654]  a2 : 000000000000003c a3 : ffffffff81201258 a4 :     

> > >>>> 0000000000000064    

> > >>>> > [    0.141893]  a5 : ffffffff8029806c a6 : 0000000000000040 a7 :     

> > >>>> ffffffffffffffff    

> > >>>> > [    0.142126]  s2 : ffffffff81201220 s3 : 0000000000000009 s4 :     

> > >>>> ffffffff81135088    

> > >>>> > [    0.142353]  s5 : ffffffff81135038 s6 : ffffffff8080ce80 s7 :     

> > >>>> ffffffff80800438    

> > >>>> > [    0.142584]  s8 : ffffffff80bc6578 s9 : 0000000000000008 s10:     

> > >>>> ffffffff806000ac    

> > >>>> > [    0.142810]  s11: 0000000000000000 t3 : fffffffffffffffc t4 :     

> > >>>> 0000000000000000    

> > >>>> > [    0.143042]  t5 : 0000000000000155 t6 : 00000000000003ff

> > >>>> > [    0.143220] status: 0000000000000120 badaddr: ffffffff81201220     

> > >>>> cause: 000000000000000f    

> > >>>> > [    0.143560] [<ffffffff8029806c>] __memset+0xc4/0xfc

> > >>>> > [    0.143859] [<ffffffff8061e984>]     

> > >>>> init_default_flow_dissectors+0x22/0x60    

> > >>>> > [    0.144092] [<ffffffff800010fc>] do_one_initcall+0x3e/0x168

> > >>>> > [    0.144278] [<ffffffff80600df0>] kernel_init_freeable+0x1c8/0x224

> > >>>> > [    0.144479] [<ffffffff804868a8>] kernel_init+0x12/0x110

> > >>>> > [    0.144658] [<ffffffff800022de>] ret_from_exception+0x0/0xc

> > >>>> > [    0.145124] ---[ end trace f1e9643daa46d591 ]---    

> > >>>> > > After some investigation, I think I found the root cause: commit    

> > >>>> > 2bfc6cd81bd ("move kernel mapping outside of linear mapping") moves

> > >>>> > BPF JIT region after the kernel:    

> > >>>> > > The &_end is unlikely aligned with PMD size, so the front bpf jit    

> > >>>> > region sits with part of kernel .data section in one PMD size     

> > >>>> mapping.    

> > >>>> > But kernel is mapped in PMD SIZE, when bpf_jit_binary_lock_ro() is

> > >>>> > called to make the first bpf jit prog ROX, we will make part of     

> > >>>> kernel    

> > >>>> > .data section RO too, so when we write to, for example memset the

> > >>>> > .data section, MMU will trigger a store page fault.    

> > >>>> Good catch, we make sure no physical allocation happens between _end 

> > >>>> and the next PMD aligned address, but I missed this one.

> > >>>>    

> > >>>> > > To fix the issue, we need to ensure the BPF JIT region is PMD size    

> > >>>> > aligned. This patch acchieve this goal by restoring the BPF JIT     

> > >>>> region    

> > >>>> > to original position, I.E the 128MB before kernel .text section.    

> > >>>> But I disagree with your solution: I made sure modules and BPF 

> > >>>> programs get their own virtual regions to avoid worst case scenario 

> > >>>> where one could allocate all the space and leave nothing to the 

> > >>>> other (we are limited to +- 2GB offset). Why don't just align 

> > >>>> BPF_JIT_REGION_START to the next PMD aligned address?    

> > >>>

> > >>> Originally, I planed to fix the issue by aligning 

> > >>> BPF_JIT_REGION_START, but

> > >>> IIRC, BPF experts are adding (or have added) "Calling kernel 

> > >>> functions from BPF"

> > >>> feature, there's a risk that BPF JIT region is beyond the 2GB of 

> > >>> module region:

> > >>>

> > >>> ------

> > >>> module

> > >>> ------

> > >>> kernel

> > >>> ------

> > >>> BPF_JIT

> > >>>

> > >>> So I made this patch finally. In this patch, we let BPF JIT region sit

> > >>> between module and kernel.

> > >>>

> > >>> To address "make sure modules and BPF programs get their own virtual 

> > >>> regions",

> > >>> what about something as below (applied against this patch)?

> > >>>

> > >>> diff --git a/arch/riscv/include/asm/pgtable.h 

> > >>> b/arch/riscv/include/asm/pgtable.h

> > >>> index 380cd3a7e548..da1158f10b09 100644

> > >>> --- a/arch/riscv/include/asm/pgtable.h

> > >>> +++ b/arch/riscv/include/asm/pgtable.h

> > >>> @@ -31,7 +31,7 @@

> > >>>  #define BPF_JIT_REGION_SIZE    (SZ_128M)

> > >>>  #ifdef CONFIG_64BIT

> > >>>  #define BPF_JIT_REGION_START    (BPF_JIT_REGION_END - 

> > >>> BPF_JIT_REGION_SIZE)

> > >>> -#define BPF_JIT_REGION_END    (MODULES_END)

> > >>> +#define BPF_JIT_REGION_END    (PFN_ALIGN((unsigned long)&_start))

> > >>>  #else

> > >>>  #define BPF_JIT_REGION_START    (PAGE_OFFSET - BPF_JIT_REGION_SIZE)

> > >>>  #define BPF_JIT_REGION_END    (VMALLOC_END)

> > >>> @@ -40,7 +40,7 @@

> > >>>  /* Modules always live before the kernel */

> > >>>  #ifdef CONFIG_64BIT

> > >>>  #define MODULES_VADDR    (PFN_ALIGN((unsigned long)&_end) - SZ_2G)

> > >>> -#define MODULES_END    (PFN_ALIGN((unsigned long)&_start))

> > >>> +#define MODULES_END    (BPF_JIT_REGION_END)

> > >>>  #endif

> > >>>

> > >>>

> > >>>    

> > >>>>

> > >>>> Again, good catch, thanks,

> > >>>>

> > >>>> Alex

> > >>>>    

> > >>>> > > Reported-by: Andreas Schwab <schwab@linux-m68k.org>    

> > >>>> > Signed-off-by: Jisheng Zhang <jszhang@kernel.org>

> > >>>> > ---

> > >>>> >   arch/riscv/include/asm/pgtable.h | 5 ++---

> > >>>> >   1 file changed, 2 insertions(+), 3 deletions(-)    

> > >>>> > > diff --git a/arch/riscv/include/asm/pgtable.h     

> > >>>> b/arch/riscv/include/asm/pgtable.h    

> > >>>> > index 9469f464e71a..380cd3a7e548 100644

> > >>>> > --- a/arch/riscv/include/asm/pgtable.h

> > >>>> > +++ b/arch/riscv/include/asm/pgtable.h

> > >>>> > @@ -30,9 +30,8 @@    

> > >>>> > >   #define BPF_JIT_REGION_SIZE    (SZ_128M)    

> > >>>> >   #ifdef CONFIG_64BIT

> > >>>> > -/* KASLR should leave at least 128MB for BPF after the kernel */

> > >>>> > -#define BPF_JIT_REGION_START    PFN_ALIGN((unsigned long)&_end)

> > >>>> > -#define BPF_JIT_REGION_END    (BPF_JIT_REGION_START +     

> > >>>> BPF_JIT_REGION_SIZE)    

> > >>>> > +#define BPF_JIT_REGION_START    (BPF_JIT_REGION_END -     

> > >>>> BPF_JIT_REGION_SIZE)    

> > >>>> > +#define BPF_JIT_REGION_END    (MODULES_END)

> > >>>> >   #else

> > >>>> >   #define BPF_JIT_REGION_START    (PAGE_OFFSET - BPF_JIT_REGION_SIZE)

> > >>>> >   #define BPF_JIT_REGION_END    (VMALLOC_END)

> > >>>> >     

> > >>

> > >> This, when applied onto fixes, is breaking early boot on KASAN 

> > >> configurations for me.    

> 

> I can reproduce this issue.

> 

> > > 

> > > Not surprising, I took a shortcut when initializing KASAN for modules, 

> > > kernel and BPF:

> > > 

> > >          kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),

> > >                         kasan_mem_to_shadow((const void 

> > > *)BPF_JIT_REGION_END));

> > > 

> > > The kernel is then not covered, I'm taking a look at how to fix that 

> > > properly.

> > >    

> > 

> > The following based on "riscv: Introduce structure that group all 

> > variables regarding kernel mapping" fixes the issue:

> > 

> > diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c

> > index 9daacae93e33..2a45ea909e7f 100644

> > --- a/arch/riscv/mm/kasan_init.c

> > +++ b/arch/riscv/mm/kasan_init.c

> > @@ -199,9 +199,12 @@ void __init kasan_init(void)

> >                  kasan_populate(kasan_mem_to_shadow(start), 

> > kasan_mem_to_shadow(end));

> >          }

> > 

> > -       /* Populate kernel, BPF, modules mapping */

> > +       /* Populate BPF and modules mapping: modules mapping encompasses 

> > BPF mapping */

> >          kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),

> > -                      kasan_mem_to_shadow((const void 

> > *)BPF_JIT_REGION_END));

> > +                      kasan_mem_to_shadow((const void *)MODULES_END));

> > +       /* Populate kernel mapping */

> > +       kasan_populate(kasan_mem_to_shadow((const void 

> > *)kernel_map.virt_addr),

> > +                      kasan_mem_to_shadow((const void 

> > *)kernel_map.virt_addr + kernel_map.size));

> >  

> If this patch works, maybe we can still use one kasan_populate() to cover

> kernel, bpf, and module:

> 

>         kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),

> -                      kasan_mem_to_shadow((const void *)BPF_JIT_REGION_END));

> +                      kasan_mem_to_shadow((const void *)MODULES_VADDR + SZ_2G));

> 


I made a mistake. Below patch works:

        kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),
-                      kasan_mem_to_shadow((const void *)BPF_JIT_REGION_END));
+                      kasan_mem_to_shadow((const void *)(MODULES_VADDR + SZ_2G)));

> However, both can't solve the early boot hang issue. I'm not sure what's missing.

> 

> I applied your patch on rc6 + solution below "replace kernel_map.virt_addr with kernel_virt_addr and

> kernel_map.size with load_sz"

> 

> 

> Thanks

>  

> > 

> > Without the mentioned patch, replace kernel_map.virt_addr with 

> > kernel_virt_addr and kernel_map.size with load_sz. Note that load_sz was 

> > re-exposed in v6 of the patchset "Map the kernel with correct 

> > permissions the first time".

> >   

> 

> 

> _______________________________________________

> linux-riscv mailing list

> linux-riscv@lists.infradead.org

> http://lists.infradead.org/mailman/listinfo/linux-riscv
Jisheng Zhang June 17, 2021, 6:10 p.m. UTC | #11
On Fri, 18 Jun 2021 01:46:48 +0800
Jisheng Zhang <jszhang3@mail.ustc.edu.cn> wrote:

> On Fri, 18 Jun 2021 01:27:31 +0800

> Jisheng Zhang <jszhang3@mail.ustc.edu.cn> wrote:

> 

> > On Thu, 17 Jun 2021 16:18:54 +0200

> > Alex Ghiti <alex@ghiti.fr> wrote:

> >   

> > > Le 17/06/2021 à 10:09, Alex Ghiti a écrit :    

> > > > Le 17/06/2021 à 09:30, Palmer Dabbelt a écrit :      

> > > >> On Tue, 15 Jun 2021 17:03:28 PDT (-0700), jszhang3@mail.ustc.edu.cn 

> > > >> wrote:      

> > > >>> On Tue, 15 Jun 2021 20:54:19 +0200

> > > >>> Alex Ghiti <alex@ghiti.fr> wrote:

> > > >>>      

> > > >>>> Hi Jisheng,      

> > > >>>

> > > >>> Hi Alex,

> > > >>>      

> > > >>>>

> > > >>>> Le 14/06/2021 à 18:49, Jisheng Zhang a écrit :      

> > > >>>> > From: Jisheng Zhang <jszhang@kernel.org>      

> > > >>>> > > Andreas reported commit fc8504765ec5 ("riscv: bpf: Avoid       

> > > >>>> breaking W^X")      

> > > >>>> > breaks booting with one kind of config file, I reproduced a kernel       

> > > >>>> panic      

> > > >>>> > with the config:      

> > > >>>> > > [    0.138553] Unable to handle kernel paging request at virtual       

> > > >>>> address ffffffff81201220      

> > > >>>> > [    0.139159] Oops [#1]

> > > >>>> > [    0.139303] Modules linked in:

> > > >>>> > [    0.139601] CPU: 0 PID: 1 Comm: swapper/0 Not tainted       

> > > >>>> 5.13.0-rc5-default+ #1      

> > > >>>> > [    0.139934] Hardware name: riscv-virtio,qemu (DT)

> > > >>>> > [    0.140193] epc : __memset+0xc4/0xfc

> > > >>>> > [    0.140416]  ra : skb_flow_dissector_init+0x1e/0x82

> > > >>>> > [    0.140609] epc : ffffffff8029806c ra : ffffffff8033be78 sp :       

> > > >>>> ffffffe001647da0      

> > > >>>> > [    0.140878]  gp : ffffffff81134b08 tp : ffffffe001654380 t0 :       

> > > >>>> ffffffff81201158      

> > > >>>> > [    0.141156]  t1 : 0000000000000002 t2 : 0000000000000154 s0 :       

> > > >>>> ffffffe001647dd0      

> > > >>>> > [    0.141424]  s1 : ffffffff80a43250 a0 : ffffffff81201220 a1 :       

> > > >>>> 0000000000000000      

> > > >>>> > [    0.141654]  a2 : 000000000000003c a3 : ffffffff81201258 a4 :       

> > > >>>> 0000000000000064      

> > > >>>> > [    0.141893]  a5 : ffffffff8029806c a6 : 0000000000000040 a7 :       

> > > >>>> ffffffffffffffff      

> > > >>>> > [    0.142126]  s2 : ffffffff81201220 s3 : 0000000000000009 s4 :       

> > > >>>> ffffffff81135088      

> > > >>>> > [    0.142353]  s5 : ffffffff81135038 s6 : ffffffff8080ce80 s7 :       

> > > >>>> ffffffff80800438      

> > > >>>> > [    0.142584]  s8 : ffffffff80bc6578 s9 : 0000000000000008 s10:       

> > > >>>> ffffffff806000ac      

> > > >>>> > [    0.142810]  s11: 0000000000000000 t3 : fffffffffffffffc t4 :       

> > > >>>> 0000000000000000      

> > > >>>> > [    0.143042]  t5 : 0000000000000155 t6 : 00000000000003ff

> > > >>>> > [    0.143220] status: 0000000000000120 badaddr: ffffffff81201220       

> > > >>>> cause: 000000000000000f      

> > > >>>> > [    0.143560] [<ffffffff8029806c>] __memset+0xc4/0xfc

> > > >>>> > [    0.143859] [<ffffffff8061e984>]       

> > > >>>> init_default_flow_dissectors+0x22/0x60      

> > > >>>> > [    0.144092] [<ffffffff800010fc>] do_one_initcall+0x3e/0x168

> > > >>>> > [    0.144278] [<ffffffff80600df0>] kernel_init_freeable+0x1c8/0x224

> > > >>>> > [    0.144479] [<ffffffff804868a8>] kernel_init+0x12/0x110

> > > >>>> > [    0.144658] [<ffffffff800022de>] ret_from_exception+0x0/0xc

> > > >>>> > [    0.145124] ---[ end trace f1e9643daa46d591 ]---      

> > > >>>> > > After some investigation, I think I found the root cause: commit      

> > > >>>> > 2bfc6cd81bd ("move kernel mapping outside of linear mapping") moves

> > > >>>> > BPF JIT region after the kernel:      

> > > >>>> > > The &_end is unlikely aligned with PMD size, so the front bpf jit      

> > > >>>> > region sits with part of kernel .data section in one PMD size       

> > > >>>> mapping.      

> > > >>>> > But kernel is mapped in PMD SIZE, when bpf_jit_binary_lock_ro() is

> > > >>>> > called to make the first bpf jit prog ROX, we will make part of       

> > > >>>> kernel      

> > > >>>> > .data section RO too, so when we write to, for example memset the

> > > >>>> > .data section, MMU will trigger a store page fault.      

> > > >>>> Good catch, we make sure no physical allocation happens between _end 

> > > >>>> and the next PMD aligned address, but I missed this one.

> > > >>>>      

> > > >>>> > > To fix the issue, we need to ensure the BPF JIT region is PMD size      

> > > >>>> > aligned. This patch acchieve this goal by restoring the BPF JIT       

> > > >>>> region      

> > > >>>> > to original position, I.E the 128MB before kernel .text section.      

> > > >>>> But I disagree with your solution: I made sure modules and BPF 

> > > >>>> programs get their own virtual regions to avoid worst case scenario 

> > > >>>> where one could allocate all the space and leave nothing to the 

> > > >>>> other (we are limited to +- 2GB offset). Why don't just align 

> > > >>>> BPF_JIT_REGION_START to the next PMD aligned address?      

> > > >>>

> > > >>> Originally, I planed to fix the issue by aligning 

> > > >>> BPF_JIT_REGION_START, but

> > > >>> IIRC, BPF experts are adding (or have added) "Calling kernel 

> > > >>> functions from BPF"

> > > >>> feature, there's a risk that BPF JIT region is beyond the 2GB of 

> > > >>> module region:

> > > >>>

> > > >>> ------

> > > >>> module

> > > >>> ------

> > > >>> kernel

> > > >>> ------

> > > >>> BPF_JIT

> > > >>>

> > > >>> So I made this patch finally. In this patch, we let BPF JIT region sit

> > > >>> between module and kernel.

> > > >>>

> > > >>> To address "make sure modules and BPF programs get their own virtual 

> > > >>> regions",

> > > >>> what about something as below (applied against this patch)?

> > > >>>

> > > >>> diff --git a/arch/riscv/include/asm/pgtable.h 

> > > >>> b/arch/riscv/include/asm/pgtable.h

> > > >>> index 380cd3a7e548..da1158f10b09 100644

> > > >>> --- a/arch/riscv/include/asm/pgtable.h

> > > >>> +++ b/arch/riscv/include/asm/pgtable.h

> > > >>> @@ -31,7 +31,7 @@

> > > >>>  #define BPF_JIT_REGION_SIZE    (SZ_128M)

> > > >>>  #ifdef CONFIG_64BIT

> > > >>>  #define BPF_JIT_REGION_START    (BPF_JIT_REGION_END - 

> > > >>> BPF_JIT_REGION_SIZE)

> > > >>> -#define BPF_JIT_REGION_END    (MODULES_END)

> > > >>> +#define BPF_JIT_REGION_END    (PFN_ALIGN((unsigned long)&_start))

> > > >>>  #else

> > > >>>  #define BPF_JIT_REGION_START    (PAGE_OFFSET - BPF_JIT_REGION_SIZE)

> > > >>>  #define BPF_JIT_REGION_END    (VMALLOC_END)

> > > >>> @@ -40,7 +40,7 @@

> > > >>>  /* Modules always live before the kernel */

> > > >>>  #ifdef CONFIG_64BIT

> > > >>>  #define MODULES_VADDR    (PFN_ALIGN((unsigned long)&_end) - SZ_2G)

> > > >>> -#define MODULES_END    (PFN_ALIGN((unsigned long)&_start))

> > > >>> +#define MODULES_END    (BPF_JIT_REGION_END)

> > > >>>  #endif

> > > >>>

> > > >>>

> > > >>>      

> > > >>>>

> > > >>>> Again, good catch, thanks,

> > > >>>>

> > > >>>> Alex

> > > >>>>      

> > > >>>> > > Reported-by: Andreas Schwab <schwab@linux-m68k.org>      

> > > >>>> > Signed-off-by: Jisheng Zhang <jszhang@kernel.org>

> > > >>>> > ---

> > > >>>> >   arch/riscv/include/asm/pgtable.h | 5 ++---

> > > >>>> >   1 file changed, 2 insertions(+), 3 deletions(-)      

> > > >>>> > > diff --git a/arch/riscv/include/asm/pgtable.h       

> > > >>>> b/arch/riscv/include/asm/pgtable.h      

> > > >>>> > index 9469f464e71a..380cd3a7e548 100644

> > > >>>> > --- a/arch/riscv/include/asm/pgtable.h

> > > >>>> > +++ b/arch/riscv/include/asm/pgtable.h

> > > >>>> > @@ -30,9 +30,8 @@      

> > > >>>> > >   #define BPF_JIT_REGION_SIZE    (SZ_128M)      

> > > >>>> >   #ifdef CONFIG_64BIT

> > > >>>> > -/* KASLR should leave at least 128MB for BPF after the kernel */

> > > >>>> > -#define BPF_JIT_REGION_START    PFN_ALIGN((unsigned long)&_end)

> > > >>>> > -#define BPF_JIT_REGION_END    (BPF_JIT_REGION_START +       

> > > >>>> BPF_JIT_REGION_SIZE)      

> > > >>>> > +#define BPF_JIT_REGION_START    (BPF_JIT_REGION_END -       

> > > >>>> BPF_JIT_REGION_SIZE)      

> > > >>>> > +#define BPF_JIT_REGION_END    (MODULES_END)

> > > >>>> >   #else

> > > >>>> >   #define BPF_JIT_REGION_START    (PAGE_OFFSET - BPF_JIT_REGION_SIZE)

> > > >>>> >   #define BPF_JIT_REGION_END    (VMALLOC_END)

> > > >>>> >       

> > > >>

> > > >> This, when applied onto fixes, is breaking early boot on KASAN 

> > > >> configurations for me.      

> > 

> > I can reproduce this issue.

> >   

> > > > 

> > > > Not surprising, I took a shortcut when initializing KASAN for modules, 

> > > > kernel and BPF:

> > > > 

> > > >          kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),

> > > >                         kasan_mem_to_shadow((const void 

> > > > *)BPF_JIT_REGION_END));

> > > > 

> > > > The kernel is then not covered, I'm taking a look at how to fix that 

> > > > properly.

> > > >      

> > > 

> > > The following based on "riscv: Introduce structure that group all 

> > > variables regarding kernel mapping" fixes the issue:

> > > 

> > > diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c

> > > index 9daacae93e33..2a45ea909e7f 100644

> > > --- a/arch/riscv/mm/kasan_init.c

> > > +++ b/arch/riscv/mm/kasan_init.c

> > > @@ -199,9 +199,12 @@ void __init kasan_init(void)

> > >                  kasan_populate(kasan_mem_to_shadow(start), 

> > > kasan_mem_to_shadow(end));

> > >          }

> > > 

> > > -       /* Populate kernel, BPF, modules mapping */

> > > +       /* Populate BPF and modules mapping: modules mapping encompasses 

> > > BPF mapping */

> > >          kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),

> > > -                      kasan_mem_to_shadow((const void 

> > > *)BPF_JIT_REGION_END));

> > > +                      kasan_mem_to_shadow((const void *)MODULES_END));

> > > +       /* Populate kernel mapping */

> > > +       kasan_populate(kasan_mem_to_shadow((const void 

> > > *)kernel_map.virt_addr),

> > > +                      kasan_mem_to_shadow((const void 

> > > *)kernel_map.virt_addr + kernel_map.size));

> > >    

> > If this patch works, maybe we can still use one kasan_populate() to cover

> > kernel, bpf, and module:

> > 

> >         kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),

> > -                      kasan_mem_to_shadow((const void *)BPF_JIT_REGION_END));

> > +                      kasan_mem_to_shadow((const void *)MODULES_VADDR + SZ_2G));

> >   

> 

> I made a mistake. Below patch works:

> 

>         kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),

> -                      kasan_mem_to_shadow((const void *)BPF_JIT_REGION_END));

> +                      kasan_mem_to_shadow((const void *)(MODULES_VADDR + SZ_2G)));


This isn't the key. I knew the reason now. kasan_init() has local vars named
as _start and _end, then MODULES_VADDR is defined as:
#define MODULES_VADDR   (PFN_ALIGN((unsigned long)&_end) - SZ_2G)

So MODULES_VADDR isn't what we expected. To fix it, we must rename the local
vars

> 

> > However, both can't solve the early boot hang issue. I'm not sure what's missing.

> > 

> > I applied your patch on rc6 + solution below "replace kernel_map.virt_addr with kernel_virt_addr and

> > kernel_map.size with load_sz"

> > 

> > 

> > Thanks

> >    

> > > 

> > > Without the mentioned patch, replace kernel_map.virt_addr with 

> > > kernel_virt_addr and kernel_map.size with load_sz. Note that load_sz was 

> > > re-exposed in v6 of the patchset "Map the kernel with correct 

> > > permissions the first time".

> > >     

> > 

> > 

> > _______________________________________________

> > linux-riscv mailing list

> > linux-riscv@lists.infradead.org

> > http://lists.infradead.org/mailman/listinfo/linux-riscv  

>
Alexandre Ghiti June 18, 2021, 6:48 a.m. UTC | #12
Le 17/06/2021 à 20:15, Jisheng Zhang a écrit :
> Andreas reported commit fc8504765ec5 ("riscv: bpf: Avoid breaking W^X")

> breaks booting with one kind of defconfig, I reproduced a kernel panic

> with the defconfig:

> 

> [    0.138553] Unable to handle kernel paging request at virtual address ffffffff81201220

> [    0.139159] Oops [#1]

> [    0.139303] Modules linked in:

> [    0.139601] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-rc5-default+ #1

> [    0.139934] Hardware name: riscv-virtio,qemu (DT)

> [    0.140193] epc : __memset+0xc4/0xfc

> [    0.140416]  ra : skb_flow_dissector_init+0x1e/0x82

> [    0.140609] epc : ffffffff8029806c ra : ffffffff8033be78 sp : ffffffe001647da0

> [    0.140878]  gp : ffffffff81134b08 tp : ffffffe001654380 t0 : ffffffff81201158

> [    0.141156]  t1 : 0000000000000002 t2 : 0000000000000154 s0 : ffffffe001647dd0

> [    0.141424]  s1 : ffffffff80a43250 a0 : ffffffff81201220 a1 : 0000000000000000

> [    0.141654]  a2 : 000000000000003c a3 : ffffffff81201258 a4 : 0000000000000064

> [    0.141893]  a5 : ffffffff8029806c a6 : 0000000000000040 a7 : ffffffffffffffff

> [    0.142126]  s2 : ffffffff81201220 s3 : 0000000000000009 s4 : ffffffff81135088

> [    0.142353]  s5 : ffffffff81135038 s6 : ffffffff8080ce80 s7 : ffffffff80800438

> [    0.142584]  s8 : ffffffff80bc6578 s9 : 0000000000000008 s10: ffffffff806000ac

> [    0.142810]  s11: 0000000000000000 t3 : fffffffffffffffc t4 : 0000000000000000

> [    0.143042]  t5 : 0000000000000155 t6 : 00000000000003ff

> [    0.143220] status: 0000000000000120 badaddr: ffffffff81201220 cause: 000000000000000f

> [    0.143560] [<ffffffff8029806c>] __memset+0xc4/0xfc

> [    0.143859] [<ffffffff8061e984>] init_default_flow_dissectors+0x22/0x60

> [    0.144092] [<ffffffff800010fc>] do_one_initcall+0x3e/0x168

> [    0.144278] [<ffffffff80600df0>] kernel_init_freeable+0x1c8/0x224

> [    0.144479] [<ffffffff804868a8>] kernel_init+0x12/0x110

> [    0.144658] [<ffffffff800022de>] ret_from_exception+0x0/0xc

> [    0.145124] ---[ end trace f1e9643daa46d591 ]---

> 

> After some investigation, I think I found the root cause: commit

> 2bfc6cd81bd ("move kernel mapping outside of linear mapping") moves

> BPF JIT region after the kernel:

> 

> The &_end is unlikely aligned with PMD size, so the front bpf jit

> region sits with part of kernel .data section in one PMD size mapping.

> But kernel is mapped in PMD SIZE, when bpf_jit_binary_lock_ro() is

> called to make the first bpf jit prog ROX, we will make part of kernel

> .data section RO too, so when we write to, for example memset the

> .data section, MMU will trigger a store page fault.

> 

> To fix the issue, we need to ensure the BPF JIT region is PMD size

> aligned. This patch acchieve this goal by restoring the BPF JIT region

> to original position, I.E the 128MB before kernel .text section. The

> modification to kasan_init.c is inspired by Alexandre.

> 

> Reported-by: Andreas Schwab <schwab@linux-m68k.org>

> Signed-off-by: Jisheng Zhang <jszhang@kernel.org>

> ---

> 

> Since v1:

>   - Fix early boot hang when kasan is enabled

>   - Update Documentation/riscv/vm-layout.rst

> 

>   Documentation/riscv/vm-layout.rst |  4 ++--

>   arch/riscv/include/asm/pgtable.h  |  5 ++---

>   arch/riscv/mm/kasan_init.c        | 10 +++++-----

>   3 files changed, 9 insertions(+), 10 deletions(-)

> 

> diff --git a/Documentation/riscv/vm-layout.rst b/Documentation/riscv/vm-layout.rst

> index 329d32098af4..b7f98930d38d 100644

> --- a/Documentation/riscv/vm-layout.rst

> +++ b/Documentation/riscv/vm-layout.rst

> @@ -58,6 +58,6 @@ RISC-V Linux Kernel SV39

>                                                                 |

>     ____________________________________________________________|____________________________________________________________

>                       |            |                  |         |

> -   ffffffff00000000 |   -4    GB | ffffffff7fffffff |    2 GB | modules

> -   ffffffff80000000 |   -2    GB | ffffffffffffffff |    2 GB | kernel, BPF

> +   ffffffff00000000 |   -4    GB | ffffffff7fffffff |    2 GB | modules, BPF

> +   ffffffff80000000 |   -2    GB | ffffffffffffffff |    2 GB | kernel

>     __________________|____________|__________________|_________|____________________________________________________________

> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h

> index 9469f464e71a..380cd3a7e548 100644

> --- a/arch/riscv/include/asm/pgtable.h

> +++ b/arch/riscv/include/asm/pgtable.h

> @@ -30,9 +30,8 @@

>   

>   #define BPF_JIT_REGION_SIZE	(SZ_128M)

>   #ifdef CONFIG_64BIT

> -/* KASLR should leave at least 128MB for BPF after the kernel */

> -#define BPF_JIT_REGION_START	PFN_ALIGN((unsigned long)&_end)

> -#define BPF_JIT_REGION_END	(BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE)

> +#define BPF_JIT_REGION_START	(BPF_JIT_REGION_END - BPF_JIT_REGION_SIZE)

> +#define BPF_JIT_REGION_END	(MODULES_END)

>   #else

>   #define BPF_JIT_REGION_START	(PAGE_OFFSET - BPF_JIT_REGION_SIZE)

>   #define BPF_JIT_REGION_END	(VMALLOC_END)

> diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c

> index 9daacae93e33..d7189c8714a9 100644

> --- a/arch/riscv/mm/kasan_init.c

> +++ b/arch/riscv/mm/kasan_init.c

> @@ -169,7 +169,7 @@ static void __init kasan_shallow_populate(void *start, void *end)

>   

>   void __init kasan_init(void)

>   {

> -	phys_addr_t _start, _end;

> +	phys_addr_t p_start, p_end;



IMHO this fix deserves its own patch, it is not related to the issue you 
describe in the changelog and has been around for some time.

That's too bad BPF people did not answer my question regarding BPF <-> 
modules calls: I'll ask the question directly in kasan-dev mailing list 
and add you in cc.


>   	u64 i;

>   

>   	/*

> @@ -189,9 +189,9 @@ void __init kasan_init(void)

>   			(void *)kasan_mem_to_shadow((void *)VMALLOC_END));

>   

>   	/* Populate the linear mapping */

> -	for_each_mem_range(i, &_start, &_end) {

> -		void *start = (void *)__va(_start);

> -		void *end = (void *)__va(_end);

> +	for_each_mem_range(i, &p_start, &p_end) {

> +		void *start = (void *)__va(p_start);

> +		void *end = (void *)__va(p_end);

>   

>   		if (start >= end)

>   			break;

> @@ -201,7 +201,7 @@ void __init kasan_init(void)

>   

>   	/* Populate kernel, BPF, modules mapping */

>   	kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),

> -		       kasan_mem_to_shadow((const void *)BPF_JIT_REGION_END));

> +		       kasan_mem_to_shadow((const void *)MODULES_VADDR + SZ_2G));

>   

>   	for (i = 0; i < PTRS_PER_PTE; i++)

>   		set_pte(&kasan_early_shadow_pte[i],

>
diff mbox series

Patch

diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 9469f464e71a..380cd3a7e548 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -30,9 +30,8 @@ 
 
 #define BPF_JIT_REGION_SIZE	(SZ_128M)
 #ifdef CONFIG_64BIT
-/* KASLR should leave at least 128MB for BPF after the kernel */
-#define BPF_JIT_REGION_START	PFN_ALIGN((unsigned long)&_end)
-#define BPF_JIT_REGION_END	(BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE)
+#define BPF_JIT_REGION_START	(BPF_JIT_REGION_END - BPF_JIT_REGION_SIZE)
+#define BPF_JIT_REGION_END	(MODULES_END)
 #else
 #define BPF_JIT_REGION_START	(PAGE_OFFSET - BPF_JIT_REGION_SIZE)
 #define BPF_JIT_REGION_END	(VMALLOC_END)