Message ID: 20250504095230.2932860-28-ardb+git@google.com
State:      New
Series:     x86: strict separation of startup code
* Ard Biesheuvel <ardb+git@google.com> wrote:

> From: Ard Biesheuvel <ardb@kernel.org>
>
> On x86_64, the core kernel is entered in long mode, which implies that
> paging is enabled. This means that the CR4.LA57 control bit is
> guaranteed to be in sync with the number of paging levels used by the
> kernel, and there is no need to store this in a variable.
>
> There is also no need to use variables for storing the calculations of
> pgdir_shift and ptrs_per_p4d, as they are easily determined on the fly.
>
> This removes the need for two different sources of truth (i.e., early
> and late) for determining whether 5-level paging is in use: CR4.LA57
> always reflects the actual state, and never changes from the point of
> view of the 64-bit core kernel. It also removes the need for exposing
> the associated variables to the startup code. The only potential concern
> is the cost of CR4 accesses, which can be mitigated using alternatives
> patching based on feature detection.
>
> Note that even the decompressor does not manipulate any page tables
> before updating CR4.LA57, so it can also avoid the associated global
> variables entirely. However, as it does not implement alternatives
> patching, the associated ELF sections need to be discarded.
>
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> ---
>  arch/x86/boot/compressed/misc.h         |  4 --
>  arch/x86/boot/compressed/pgtable_64.c   | 12 ------
>  arch/x86/boot/compressed/vmlinux.lds.S  |  1 +
>  arch/x86/boot/startup/map_kernel.c      | 12 +-----
>  arch/x86/boot/startup/sme.c             |  9 ----
>  arch/x86/include/asm/pgtable_64_types.h | 43 ++++++++++----------
>  arch/x86/kernel/cpu/common.c            |  2 -
>  arch/x86/kernel/head64.c                | 11 -----
>  arch/x86/mm/kasan_init_64.c             |  3 --
>  9 files changed, 24 insertions(+), 73 deletions(-)

So this patch breaks the build & creates header dependency hell on
x86-64 allnoconfig:

  starship:~/tip> m kernel/pid.o
    DESCEND objtool
      CC      arch/x86/kernel/asm-offsets.s
    INSTALL libsubcmd_headers
  In file included from ./arch/x86/include/asm/pgtable_64_types.h:5,
                   from ./arch/x86/include/asm/pgtable_types.h:283,
                   from ./arch/x86/include/asm/processor.h:21,
                   from ./arch/x86/include/asm/cpufeature.h:5,
                   from ./arch/x86/include/asm/thread_info.h:59,
                   from ./include/linux/thread_info.h:60,
                   from ./include/linux/spinlock.h:60,
                   from ./include/linux/swait.h:7,
                   from ./include/linux/completion.h:12,
                   from ./include/linux/crypto.h:15,
                   from arch/x86/kernel/asm-offsets.c:9:
  ./arch/x86/include/asm/sparsemem.h:29:34: warning: "pgtable_l5_enabled" is not defined, evaluates to 0 [-Wundef]
     29 | # define MAX_PHYSMEM_BITS      (pgtable_l5_enabled() ? 52 : 46)
        |                                 ^~~~~~~~~~~~~~~~~~
  ./include/linux/page-flags-layout.h:31:26: note: in expansion of macro 'MAX_PHYSMEM_BITS'
     31 | #define SECTIONS_SHIFT  (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)

Plus I'm not sure I'm happy about this kind of complexity getting
embedded deep within low level MM primitives:

  static __always_inline __pure bool pgtable_l5_enabled(void)
  {
          unsigned long r;
          bool ret;

          if (!IS_ENABLED(CONFIG_X86_5LEVEL))
                  return false;

          asm(ALTERNATIVE_TERNARY(
                  "movq %%cr4, %[reg] \n\t btl %[la57], %k[reg]" CC_SET(c),
                  %P[feat], "stc", "clc")
                  : [reg] "=&r" (r), CC_OUT(c) (ret)
                  : [feat] "i" (X86_FEATURE_LA57),
                    [la57] "i" (X86_CR4_LA57_BIT)
                  : "cc");

          return ret;
  }

it's basically everywhere:

  arch/x86/include/asm/page_64_types.h:#define __VIRTUAL_MASK_SHIFT (pgtable_l5_enabled() ? 56 : 47)
  arch/x86/include/asm/paravirt.h:        if (pgtable_l5_enabled())       \
  arch/x86/include/asm/paravirt.h:        if (pgtable_l5_enabled())       \
  arch/x86/include/asm/pgalloc.h: if (!pgtable_l5_enabled())
  arch/x86/include/asm/pgalloc.h: if (!pgtable_l5_enabled())
  arch/x86/include/asm/pgalloc.h: if (pgtable_l5_enabled())
  arch/x86/include/asm/pgtable.h:#define pgd_clear(pgd)  (pgtable_l5_enabled() ? native_pgd_clear(pgd) : 0)
  arch/x86/include/asm/pgtable.h: if (!pgtable_l5_enabled())
  arch/x86/include/asm/pgtable.h: if (!pgtable_l5_enabled())
  arch/x86/include/asm/pgtable.h: if (!pgtable_l5_enabled())
  arch/x86/include/asm/pgtable.h: if (!pgtable_l5_enabled())
  arch/x86/include/asm/pgtable_32_types.h:#define pgtable_l5_enabled() 0
  arch/x86/include/asm/pgtable_64.h:      return !pgtable_l5_enabled();
  arch/x86/include/asm/pgtable_64.h:      if (pgtable_l5_enabled() ||
  arch/x86/include/asm/pgtable_64_types.h:static __always_inline __pure bool pgtable_l5_enabled(void)
  arch/x86/include/asm/pgtable_64_types.h:#define PGDIR_SHIFT (pgtable_l5_enabled() ? 48 : 39)
  arch/x86/include/asm/pgtable_64_types.h:#define PTRS_PER_P4D (pgtable_l5_enabled() ? 512 : 1)
  arch/x86/include/asm/pgtable_64_types.h:# define VMALLOC_SIZE_TB (pgtable_l5_enabled() ? VMALLOC_SIZE_TB_L5 : VMALLOC_SIZE_TB_L4)
  arch/x86/include/asm/sparsemem.h:# define MAX_PHYSMEM_BITS (pgtable_l5_enabled() ? 52 : 46)

Inlined approximately a gazillion times. (449 times on x86 defconfig.
Yes, I just counted it.)

And it's not even worth it, as it generates horrendous code:

     154:       0f 20 e0                mov    %cr4,%rax
     157:       0f ba e0 0c             bt     $0xc,%eax

... while CR4 access might be faster these days, it's certainly not as
fast as simple percpu access. Plus it clobbers a register (RAX in the
example above), which is unnecessary for a flag test.

Cannot pgtable_l5_enabled() be a single, simple percpu flag or so?

And yes, this creates another layer for these values - but thus
decouples low level MM from detection & implementation complexities,
which is a plus ...

Thanks,

	Ingo
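[ For illustration: a percpu-flag variant along the lines Ingo suggests
  might look like the sketch below. It is untested and the
  'x86_l5_enabled' variable name is hypothetical: ]

  /* Written once per CPU during early CPU setup, from CR4.LA57: */
  DECLARE_PER_CPU_READ_MOSTLY(bool, x86_l5_enabled);

  static __always_inline bool pgtable_l5_enabled(void)
  {
          if (!IS_ENABLED(CONFIG_X86_5LEVEL))
                  return false;

          /* A single percpu load: no CR4 access, no flags clobbered */
          return this_cpu_read(x86_l5_enabled);
  }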
On Sun, 4 May 2025 at 15:51, Ingo Molnar <mingo@kernel.org> wrote:
>
>
> * Ard Biesheuvel <ardb+git@google.com> wrote:
>
> > From: Ard Biesheuvel <ardb@kernel.org>
> >
> > On x86_64, the core kernel is entered in long mode, which implies that
> > paging is enabled. This means that the CR4.LA57 control bit is
> > guaranteed to be in sync with the number of paging levels used by the
> > kernel, and there is no need to store this in a variable.
> >
> > There is also no need to use variables for storing the calculations of
> > pgdir_shift and ptrs_per_p4d, as they are easily determined on the fly.
> >
> > This removes the need for two different sources of truth (i.e., early
> > and late) for determining whether 5-level paging is in use: CR4.LA57
> > always reflects the actual state, and never changes from the point of
> > view of the 64-bit core kernel. It also removes the need for exposing
> > the associated variables to the startup code. The only potential concern
> > is the cost of CR4 accesses, which can be mitigated using alternatives
> > patching based on feature detection.
> >
> > Note that even the decompressor does not manipulate any page tables
> > before updating CR4.LA57, so it can also avoid the associated global
> > variables entirely. However, as it does not implement alternatives
> > patching, the associated ELF sections need to be discarded.
> >
> > Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> > ---
> >  arch/x86/boot/compressed/misc.h         |  4 --
> >  arch/x86/boot/compressed/pgtable_64.c   | 12 ------
> >  arch/x86/boot/compressed/vmlinux.lds.S  |  1 +
> >  arch/x86/boot/startup/map_kernel.c      | 12 +-----
> >  arch/x86/boot/startup/sme.c             |  9 ----
> >  arch/x86/include/asm/pgtable_64_types.h | 43 ++++++++++----------
> >  arch/x86/kernel/cpu/common.c            |  2 -
> >  arch/x86/kernel/head64.c                | 11 -----
> >  arch/x86/mm/kasan_init_64.c             |  3 --
> >  9 files changed, 24 insertions(+), 73 deletions(-)
>
> So this patch breaks the build & creates header dependency hell on
> x86-64 allnoconfig:
>

Ugh

...

> Plus I'm not sure I'm happy about this kind of complexity getting
> embedded deep within low level MM primitives:
>
>   static __always_inline __pure bool pgtable_l5_enabled(void)
>   {
>           unsigned long r;
>           bool ret;
>
>           if (!IS_ENABLED(CONFIG_X86_5LEVEL))
>                   return false;
>
>           asm(ALTERNATIVE_TERNARY(
>                   "movq %%cr4, %[reg] \n\t btl %[la57], %k[reg]" CC_SET(c),
>                   %P[feat], "stc", "clc")
>                   : [reg] "=&r" (r), CC_OUT(c) (ret)
>                   : [feat] "i" (X86_FEATURE_LA57),
>                     [la57] "i" (X86_CR4_LA57_BIT)
>                   : "cc");
>
>           return ret;
>   }
> ...
>
> Inlined approximately a gazillion times. (449 times on x86 defconfig.
> Yes, I just counted it.)
>
> And it's not even worth it, as it generates horrendous code:
>
>      154:       0f 20 e0                mov    %cr4,%rax
>      157:       0f ba e0 0c             bt     $0xc,%eax
>
> ... while CR4 access might be faster these days, it's certainly not as
> fast as simple percpu access. Plus it clobbers a register (RAX in the
> example above), which is unnecessary for a flag test.
>

It's an alternative, so this will be patched into stc or clc. But it
will still clobber a register.

> Cannot pgtable_l5_enabled() be a single, simple percpu flag or so?
>

We can just drop this patch, and I'll work around it by adding another
couple of SYM_PIC_ALIAS()es for these variables.

> And yes, this creates another layer for these values - but thus
> decouples low level MM from detection & implementation complexities,
> which is a plus ...
>

If you prefer to retain the early vs late distinction, where the late
one is more efficient, we could replace the early variant with the CR4
access. But frankly, if we go down that road, I'd prefer to just share
these early variables with the startup code.
On Sun, 4 May 2025 at 06:51, Ingo Molnar <mingo@kernel.org> wrote:
>
> Cannot pgtable_l5_enabled() be a single, simple percpu flag or so?

Isn't this logically something that should just be a static key? For
some reason I thought we did that already, but looking around, that
was only in the critical user access routines (and got replaced by the
'runtime-const' stuff)

But I guess that what Ard wants to get rid of is the variable itself,
and for early boot using static keys sounds like a bad idea.

Honestly, looking at this, I think we should fix the *users* of
pgtable_l5_enabled().

Because I think there are two distinct classes:

 - the stuff in <asm/pgtable.h> is exposed as the generic page table
   accessor macros to "real" code, and should probably use a static key
   (ie things like pgd_clear() / pgd_none() and friends)

 - but in code like __kernel_physical_mapping_init() it feels to me
   like using the value in %cr4 actually makes sense

but that looks like a much bigger and fundamental patch than Ard's.

              Linus
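[ A minimal sketch of the static-key shape Linus describes for the
  <asm/pgtable.h> accessors -- 'l5_paging_key' is a hypothetical name,
  and as noted above this only works once jump labels are initialized,
  so it does not help the early boot users: ]

  DEFINE_STATIC_KEY_FALSE(l5_paging_key);

  static __always_inline bool pgtable_l5_enabled(void)
  {
          if (!IS_ENABLED(CONFIG_X86_5LEVEL))
                  return false;

          /* Patched to a NOP or JMP at runtime, no register clobber */
          return static_branch_likely(&l5_paging_key);
  }

  /* Flipped once during boot, after jump_label_init(): */
  if (native_read_cr4() & X86_CR4_LA57)
          static_branch_enable(&l5_paging_key);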
On Sun, 4 May 2025 at 16:58, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> On Sun, 4 May 2025 at 06:51, Ingo Molnar <mingo@kernel.org> wrote:
> >
> > Cannot pgtable_l5_enabled() be a single, simple percpu flag or so?
>
> Isn't this logically something that should just be a static key? For
> some reason I thought we did that already, but looking around, that
> was only in the critical user access routines (and got replaced by the
> 'runtime-const' stuff)
>

The ordinary, late version of pgtable_l5_enabled() is #define'd to
cpu_feature_enabled(), which is runtime patched in a slightly more
efficient manner than the alternative I'm proposing here. It is
conceptually the same as a static key, i.e., the asm goto() gets
patched into a JMP or a NOP.

Whether or not using runtime constants is worth the hassle for
pgdir_shift and ptrs_per_p4d is a different question; I'll keep them
as ordinary globals for now, and drop the conditional expressions.

> But I guess that what Ard wants to get rid of is the variable itself,
> and for early boot using static keys sounds like a bad idea.
>

I wanted to get rid of the variable, and then noticed the nasty
'#define USE_EARLY_PGTABLE_L5' thing, so I attempted to fix that as
well. (Having to keep track of which code may be called early, late or
both is how I ended up in this swamp to begin with.)

> Honestly, looking at this, I think we should fix the *users* of
> pgtable_l5_enabled().
>
> Because I think there are two distinct classes:
>
>  - the stuff in <asm/pgtable.h> is exposed as the generic page table
>    accessor macros to "real" code, and should probably use a static key
>    (ie things like pgd_clear() / pgd_none() and friends)
>
>  - but in code like __kernel_physical_mapping_init() it feels to me
>    like using the value in %cr4 actually makes sense
>
> but that looks like a much bigger and fundamental patch than Ard's.
>

I think we should just rely on cpu_feature_enabled(), which can in
fact be called early as long as the capability gets set. So instead of
setting __pgtable_l5_enabled, we should just call
'set_cpu_cap(&boot_cpu_data, X86_FEATURE_LA57)' so that even early
callers can use the ordinary pgtable_l5_enabled(), and we can drop the
early flavor.

(The decompressor will need its own version but it can just inspect
CR4.LA57 directly)
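[ As a hedged sketch, the approach Ard describes amounts to setting the
  capability bit from the early boot path so the existing
  cpu_feature_enabled()-based definition works for early callers too;
  the exact placement in the boot flow is not specified here: ]

  /* Early 64-bit init, before the first pgtable_l5_enabled() user: */
  if (native_read_cr4() & X86_CR4_LA57)
          set_cpu_cap(&boot_cpu_data, X86_FEATURE_LA57);

  /* ... which lets the ordinary, single definition serve everyone: */
  #define pgtable_l5_enabled() cpu_feature_enabled(X86_FEATURE_LA57)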
* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> Honestly, looking at this, I think we should fix the *users* of
> pgtable_l5_enabled().

Agreed.

> Because I think there are two distinct classes:
>
>  - the stuff in <asm/pgtable.h> is exposed as the generic page table
>    accessor macros to "real" code, and should probably use a static key
>    (ie things like pgd_clear() / pgd_none() and friends)
>
>  - but in code like __kernel_physical_mapping_init() it feels to me
>    like using the value in %cr4 actually makes sense

So out of curiosity I measured where exactly pgtable_l5_enabled() is
used - at least in terms of inlining frequency. Here's the histogram
of first-level pgtable_l5_enabled() inlining uses on x86-64-defconfig
(there's over 600 of them):

      1 ffffffff844cde60 stat__pgtable_l5_enabled____p4d_free_tlb()
      1 ffffffff844cde64 stat__pgtable_l5_enabled__pgd_populate_safe()
      1 ffffffff844cde80 stat__pgtable_l5_enabled__native_set_p4d()
      1 ffffffff844cde84 stat__pgtable_l5_enabled__mm_p4d_folded()
      2 ffffffff844cde8c stat__pgtable_l5_enabled__paravirt_pgd_clear()
      5 ffffffff844cde68 stat__pgtable_l5_enabled__pgd_populate()
     13 ffffffff844cde98 stat__pgtable_l5_enabled____VIRTUAL_MASK_SHIFT
     14 ffffffff844cde78 stat__pgtable_l5_enabled__pgd_present()
     16 ffffffff844cde70 stat__pgtable_l5_enabled__pgd_bad()
     16 ffffffff844cdea0 stat__pgtable_l5_enabled__other()
     20 ffffffff844cde88 stat__pgtable_l5_enabled__paravirt_set_pgd()
     29 ffffffff844cde5c stat__pgtable_l5_enabled__VMALLOC_SIZE_TB
     46 ffffffff844cde7c stat__pgtable_l5_enabled__PTRS_PER_P4D
     49 ffffffff844cde6c stat__pgtable_l5_enabled__pgd_none()
     60 ffffffff844cde74 stat__pgtable_l5_enabled__p4d_offset()
    156 ffffffff844cde9c stat__pgtable_l5_enabled__PGDIR_SHIFT
    179 ffffffff844cde94 stat__pgtable_l5_enabled__MAX_PHYSMEM_BITS
    ---
    609 TOTAL

Limitations:

 - 'Inlining frequency' is admittedly only an imperfect, statistical
   proxy for 'importance'.

 - This metric doesn't properly discount boot-time-only use either,
   although by a quick guesstimate it's less than 10% of all use: the
   startup code uses pgtable_l5_enabled() only 13 times. (But there's
   more boot-time use.)

Anyway, with these limitations in mind, we can see that the top 5
usecases cover about 80% of all uses:

 - MAX_PHYSMEM_BITS: (inlined 179 times)

     arch/x86/include/asm/sparsemem.h:# define MAX_PHYSMEM_BITS (pgtable_l5_enabled() ? 52 : 46)

   This could be implemented via a precomputed, constant percpu value
   (per_cpu__x86_MAX_PHYSMEM_BITS) of 52 vs. 46, eliminating not just
   the CR4 access, but also a branch, at the cost of a percpu memory
   access. (Which should still be a win on all microarchitectures IMO.)

   Alternatively, since this value is a semi-constant of 52 vs. 46, we
   could also, I suspect, ALTERNATIVES-patch MAX_PHYSMEM_BITS in as an
   immediate constant value? Any reason this shouldn't work:

     static inline unsigned int __MAX_PHYSMEM_BITS(void)
     {
             unsigned int bits;

             asm_inline (ALTERNATIVE("movl $46, %0", "movl $52, %0", X86_FEATURE_LA57) :"=g" (bits));

             return bits;
     }
     #define MAX_PHYSMEM_BITS __MAX_PHYSMEM_BITS()

   ... or something like that? This would result in the best code
   generation IMO, by far. (It would even make use of the
   zero-extension property of a 32-bit MOVL, further compressing the
   opcode to only 5 bytes or so.)

   We'd even create a secondary helper macro for this, something like:

     #define ALTERNATIVE_CONST_U32(__val1, __val2, __feature)	\
     ({								\
             u32 __val;						\
								\
             asm_inline (ALTERNATIVE("movl $" #__val1 ", %0", "movl $" #__val2 ", %0", __feature) :"=g" (__val)); \
								\
             __val;						\
     })

   ...

     #define MAX_PHYSMEM_BITS ALTERNATIVE_CONST_U32(46, 52, X86_FEATURE_LA57)

   (Or so. Totally untested.)

 - PGDIR_SHIFT: (inlined 156 times)

     arch/x86/include/asm/pgtable_64_types.h:#define PGDIR_SHIFT (pgtable_l5_enabled() ? 48 : 39)

   This too could be implemented via a precomputed constant percpu
   value (per_cpu__x86_PGDIR_SHIFT), eliminating a branch as well, or
   via an ALTERNATIVE() immediate operand constant.

 - p4d_offset(): (inlined 60 times)

     static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
     {
             if (!pgtable_l5_enabled())
                     return (p4d_t *)pgd;
             return (p4d_t *)pgd_page_vaddr(*pgd) + p4d_index(address);
     }

   Here pgtable_l5_enabled() is used as a binary flag.

 - pgd_none(): (inlined 49 times)

     static inline int pgd_none(pgd_t pgd)
     {
             if (!pgtable_l5_enabled())
                     return 0;
             return !native_pgd_val(pgd);
     }

   Binary flag use as well, although the compiler might eliminate the
   branch here and replace it with 'AND !native_pgd_val(pgd)', because
   native_pgd_val() is really simple. Or if the compiler doesn't do
   it, we could code it up as a (hopefully) branch-less return of:

     static inline int pgd_none(pgd_t pgd)
     {
             return pgtable_l5_enabled() & !native_pgd_val(pgd);
     }

   (If I got the conversion right.)

 - PTRS_PER_P4D: (inlined 46 times)

     arch/x86/include/asm/pgtable_64_types.h:#define PTRS_PER_P4D (pgtable_l5_enabled() ? 512 : 1)

   This too could be implemented via a precomputed constant percpu
   value (per_cpu__x86_PTRS_PER_P4D), eliminating a branch, or via an
   ALTERNATIVE() immediate constant.

> but that looks like a much bigger and fundamental patch than Ard's.

Definitely. Using various derived percpu values will also complicate
bootstrapping, with the usual complications of whether these values
are entirely in sync with the chosen paging mode the kernel uses.

Anyway, I'm not against Ard's simplification patch as a first step,
and any optimizations can be layered on top of that.

Thanks,

	Ingo
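[ For completeness, applying Ingo's (untested) ALTERNATIVE_CONST_U32()
  helper from the mail above to the two other binary-valued constants
  it discusses would presumably look like this -- the value order
  follows the helper: first operand without LA57, second with it: ]

  #define PGDIR_SHIFT   ALTERNATIVE_CONST_U32(39, 48, X86_FEATURE_LA57)
  #define PTRS_PER_P4D  ALTERNATIVE_CONST_U32(1, 512, X86_FEATURE_LA57)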
* Ingo Molnar <mingo@kernel.org> wrote:

> Anyway, with these limitations in mind, we can see that the top 5
> usecases cover about 80% of all uses:
>
>  - MAX_PHYSMEM_BITS: (inlined 179 times)
>
>      arch/x86/include/asm/sparsemem.h:# define MAX_PHYSMEM_BITS (pgtable_l5_enabled() ? 52 : 46)
>
>    This could be implemented via a precomputed, constant percpu value
>    (per_cpu__x86_MAX_PHYSMEM_BITS) of 52 vs. 46, eliminating not just
>    the CR4 access, but also a branch, at the cost of a percpu memory
>    access. (Which should still be a win on all microarchitectures IMO.)
>
>    Alternatively, since this value is a semi-constant of 52 vs. 46, we
>    could also, I suspect, ALTERNATIVES-patch MAX_PHYSMEM_BITS in as an
>    immediate constant value? Any reason this shouldn't work:
>
>      static inline unsigned int __MAX_PHYSMEM_BITS(void)
>      {
>              unsigned int bits;
>
>              asm_inline (ALTERNATIVE("movl $46, %0", "movl $52, %0", X86_FEATURE_LA57) :"=g" (bits));
>
>              return bits;
>      }
>      #define MAX_PHYSMEM_BITS __MAX_PHYSMEM_BITS()
>
>    ... or something like that? This would result in the best code
>    generation IMO, by far. (It would even make use of the
>    zero-extension property of a 32-bit MOVL, further compressing the
>    opcode to only 5 bytes or so.)
>
>    We'd even create a secondary helper macro for this, something like:
>
>      #define ALTERNATIVE_CONST_U32(__val1, __val2, __feature)	\
>      ({								\
>              u32 __val;					\
>								\
>              asm_inline (ALTERNATIVE("movl $" #__val1 ", %0", "movl $" #__val2 ", %0", __feature) :"=g" (__val)); \
>								\
>              __val;						\
>      })
>
>    ...
>
>      #define MAX_PHYSMEM_BITS ALTERNATIVE_CONST_U32(46, 52, X86_FEATURE_LA57)
>
>    (Or so. Totally untested.)

BTW., I keep comparing it to the CR4 access, which is a bit unfair,
since that's only one out of the 3 variants of ALTERNATIVE_TERNARY():

  static __always_inline __pure bool pgtable_l5_enabled(void)
  {
          unsigned long r;
          bool ret;

          if (!IS_ENABLED(CONFIG_X86_5LEVEL))
                  return false;

          asm(ALTERNATIVE_TERNARY(
                  "movq %%cr4, %[reg] \n\t btl %[la57], %k[reg]" CC_SET(c),
                  %P[feat], "stc", "clc")
                  : [reg] "=&r" (r), CC_OUT(c) (ret)
                  : [feat] "i" (X86_FEATURE_LA57),
                    [la57] "i" (X86_CR4_LA57_BIT)
                  : "cc");

          return ret;
  }

The STC and CLC variants will probably be the more common outcomes on
modern CPUs.

But I still think the ALTERNATIVE_CONST_U32() approach I outline above
generates superior code for the binary-values cases, which covers 3
out of the top 5 uses of pgtable_l5_enabled().

For non-constant branching uses of pgtable_l5_enabled() I suspect the
STC/CLC approach above is pretty good, although the 'cc' constraint
will clobber all flags I suspect, while ALTERNATIVE_CONST_U32()
doesn't? Ie. with ALTERNATIVE_CONST_U32() we just load the resulting
constant into a register, with no additional branches and with flags
undisturbed.

Also, is STC/CLC always just as fast as the testing of an immediate
(or a static branch), on CPUs we care about?

Thanks,

	Ingo
On Mon, 5 May 2025 at 14:07, Ingo Molnar <mingo@kernel.org> wrote:
>
> - MAX_PHYSMEM_BITS: (inlined 179 times)
>
>     arch/x86/include/asm/sparsemem.h:# define MAX_PHYSMEM_BITS (pgtable_l5_enabled() ? 52 : 46)
>
>   This could be implemented via a precomputed, constant percpu value
>   (per_cpu__x86_MAX_PHYSMEM_BITS) of 52 vs. 46, eliminating not just
>   the CR4 access, but also a branch, at the cost of a percpu memory
>   access. (Which should still be a win on all microarchitectures IMO.)

This is literally why I did the "runtime-const" stuff. Exactly for
simple constants that you don't want to load from percpu memory
because it's just annoying.

Now, we only have 64-bit constant values which is very wasteful, and
we could just do a signed byte constant if we cared.

(We also have a "shift 32-bit value right by a constant amount", which
actually does use a signed byte, but it's masked by 0x1f because
that's how 32-bit shifts work).

I doubt we care - I doubt any of this MAX_PHYSMEM_BITS use is actually
performance-critical. The runtime-const stuff would be trivial to use
here if we really want to.

> - PGDIR_SHIFT: (inlined 156 times)

Several of those are actually of the form

    #define PGDIR_SIZE (1UL << PGDIR_SHIFT)

so you artificially see PGDIR_SHIFT as the important part, even though
it's often a different constant entirely that just gets generated
using it.

> - p4d_offset(): (inlined 60 times)
>   Here pgtable_l5_enabled() is used as a binary flag.

static branch would probably work best, and as Ard says, just using
cpu_feature_enabled() would just fix it..

> - pgd_none(): (inlined 49 times)
>   Binary flag use as well, although the compiler might eliminate the
>   branch here and replace it with 'AND !native_pgd_val(pgd)'

This could easily be done as runtime-const. But again, I doubt it's
all that performance-critical.

> - PTRS_PER_P4D: (inlined 46 times)
>   This too could be implemented via a precomputed constant percpu
>   value (per_cpu__x86_PTRS_PER_P4D), eliminating a branch,
>   or via an ALTERNATIVE() immediate constant.

Again, we do have that, although the 64-bit constant is a bit
wasteful.

The reason runtime-const does a 64-bit constant is that the actual
performance-critical cases were for big constants (TASK_SIZE) and for
pointers (hash table pointers).

              Linus
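[ A hedged sketch of what reusing the existing runtime-const machinery
  for MAX_PHYSMEM_BITS might look like; 'x86_physmem_bits' is a
  hypothetical symbol, the matching RUNTIME_CONST() linker-script
  entry is omitted, and as noted above the 64-bit immediate is
  wasteful for such a small value: ]

  #include <asm/runtime-const.h>

  extern unsigned long x86_physmem_bits;        /* 46 or 52 */

  #define MAX_PHYSMEM_BITS      runtime_const_ptr(x86_physmem_bits)

  /* In arch setup code: patch all use sites once, during boot: */
  unsigned long x86_physmem_bits __ro_after_init = 46;

  static void __init init_physmem_bits(void)
  {
          if (pgtable_l5_enabled())
                  x86_physmem_bits = 52;
          runtime_const_init(ptr, x86_physmem_bits);
  }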
* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> > - PGDIR_SHIFT: (inlined 156 times)
>
> Several of those are actually of the form
>
>     #define PGDIR_SIZE (1UL << PGDIR_SHIFT)
>
> so you artificially see PGDIR_SHIFT as the important part, even though
> it's often a different constant entirely that just gets generated
> using it.

Yeah, I only examined the first level use. Here's the stats for
PGDIR_SIZE:

    31 ffffffff844cde5c <stat__pgtable_l5_enabled_PGDIR_SIZE>
   157 ffffffff844cdea0 <stat__pgtable_l5_enabled_PGDIR_SHIFT>  # total: includes PGDIR_SIZE

Ie. only about 19% of PGDIR_SHIFT use is PGDIR_SIZE.

> > - PTRS_PER_P4D: (inlined 46 times)
> >   This too could be implemented via a precomputed constant percpu
> >   value (per_cpu__x86_PTRS_PER_P4D), eliminating a branch,
> >   or via an ALTERNATIVE() immediate constant.
>
> Again, we do have that, although the 64-bit constant is a bit
> wasteful.

Adds 4 bytes to the size of the MOVQ. Not the end of the world, but I
suspect for x86-specific values that flag off a CPU-feature flag
(which is the case here) we can use the ALTERNATIVE_CONST_U32() trick
and have it all in a single place.

> The reason runtime-const does a 64-bit constant is that the actual
> performance-critical cases were for big constants (TASK_SIZE) and for
> pointers (hash table pointers).

I remembered that we had something in this area, but I grepped for
'alternative.*const' which found nothing, while runtime_const uses its
own simple text-patching method a.k.a. runtime_const_fixup(). :-)

Thanks,

	Ingo
On Mon, 5 May 2025 at 23:25, Ingo Molnar <mingo@kernel.org> wrote: > > > * Ingo Molnar <mingo@kernel.org> wrote: > > > Anyway, with these limitations in mind, we can see that the top 5 > > usecases cover about 80% of all uses: > > > > - MAX_PHYSMEM_BITS: (inlined 179 times) > > > > arch/x86/include/asm/sparsemem.h:# define MAX_PHYSMEM_BITS (pgtable_l5_enabled() ? 52 : 46) > > > > This could be implemented via a precomputed, constant percpu value > > (per_cpu__x86_MAX_PHYSMEM_BITS) of 52 vs. 46, eliminating not just > > the CR4 access, but also a branch, at the cost of a percpu memory > > access. (Which should still be a win on all microarchitectures IMO.) > > > > Alternatively, since this value is a semi-constant of 52 vs. 46, we > > could also, I suspect, ALTERNATIVES-patch MAX_PHYSMEM_BITS in as an > > immediate constant value? Any reason this shouldn't work: > > > > static inline unsigned int __MAX_PHYSMEM_BITS(void) > > { > > unsigned int bits; > > > > asm_inline (ALTERNATIVE("movl $46, %0", "movl $52, %0", X86_FEATURE_LA57) :"=g" (bits)); > > > > return bits; > > } > > #define MAX_PHYSMEM_BITS __MAX_PHYSMEM_BITS() > > > > ... or something like that? This would result in the best code > > generation IMO, by far. (It would even make use of the > > zero-extension property of a 32-bit MOVL, further compressing the > > opcode to only 5 bytes or so.) > > > > We'd even create a secondary helper macro for this, something like: > > > > #define ALTERNATIVES_CONST_U32(__val1, __val2, __feature) \ > > ({ \ > > u32 __val; \ > > \ > > asm_inline (ALTERNATIVE("movl $" #__val1 ", %0", "movl $" __val2 ", %0", __feature) :"=g" (__val)); \ > > \ > > __val; \ > > }) > > > > ... > > > > #define MAX_PHYSMEM_BITS ALTERNATIVE_CONST_U32(46, 52, X86_FEATURE_LA57) > > > > (Or so. Totally untested.) > > BTW., I keep comparing it to the CR4 access, which is a bit unfair, > since that's only one out of the 3 variants of ALTERNATIVE_TERNARY(): > > static __always_inline __pure bool pgtable_l5_enabled(void) > { > unsigned long r; > bool ret; > > if (!IS_ENABLED(CONFIG_X86_5LEVEL)) > return false; > > asm(ALTERNATIVE_TERNARY( > "movq %%cr4, %[reg] \n\t btl %[la57], %k[reg]" CC_SET(c), > %P[feat], "stc", "clc") > : [reg] "=&r" (r), CC_OUT(c) (ret) > : [feat] "i" (X86_FEATURE_LA57), > [la57] "i" (X86_CR4_LA57_BIT) > : "cc"); > > return ret; > } > > The STC and CLC variants will probably be the more common outcomes on > modern CPUs. > The CR4 access is only used before alternatives patching; after that, either the STC or the CLC is patched in. > But I still think the ALTERNATIVE_CONST_U32() approach I outline above > generates superior code for the binary-values cases, which covers 3 out > of the top 5 uses of pgtable_l5_enabled(). > > For non-constant branching uses of pgtable_l5_enabled() I suspect the > STC/CLC approach above is pretty good, although the 'cc' constraint > will clobber all flags I suspect, while ALTERNATIVE_CONST_U32() > doesn't? Ie. with ALTERNATIVE_CONST_U32() we just load the resulting > constant into a register, with no additional branches and with flags > undisturbed. > > Also, is STC/CLC always just as fast as the testing of an immediate (or > a static branch), on CPUs we care about? > The reasoning behind using the C flag rather than asm goto is precisely the fact that in many cases, pgtable_l5_enabled() is not used for control flow but for picking between constants, and this approach permits the compiler [in theory] to resolve it without a branch. 
But I didn't inspect the resulting codegen, and just patching in a MOV immediate is obviously more efficient than that. I could just use asm goto here, and implement something similar to cpu_feature_enabled(), but in a way that removes the need for USE_EARLY_PGTABLE_L5. Then, we could get rid of the __pgtable_l5_enabled variable in a separate patch, and base it on a test of CR4.LA57 in the early init path. Then, any optimizations related to constants defined in terms of (pgtable_l5_enabled() ? foo : bar) could be layered on top of that. Having runtime constants is the obvious choice, although I'm skeptical whether it's worth the hassle. I also haven't yet looked into what it would entail to share these runtime constants with the startup code.
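[ An untested sketch of the asm goto direction Ard describes, modeled
  on the static_cpu_has() pattern: before alternatives patching it
  falls through to an authoritative CR4.LA57 test, afterwards the site
  is patched into either a plain fall-through or an unconditional
  jump: ]

  static __always_inline bool pgtable_l5_enabled(void)
  {
          if (!IS_ENABLED(CONFIG_X86_5LEVEL))
                  return false;

          asm goto(ALTERNATIVE_TERNARY("jmp %l[l_slow]", %P[feat],
                                       "", "jmp %l[l_no]")
                   : : [feat] "i" (X86_FEATURE_LA57) : : l_slow, l_no);
          return true;
  l_slow:
          /* Before patching: CR4 is the single source of truth */
          return native_read_cr4() & X86_CR4_LA57;
  l_no:
          return false;
  }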
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index dd8d1a85f671..450d27d0f449 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -16,9 +16,6 @@
 
 #define __NO_FORTIFY
 
-/* cpu_feature_enabled() cannot be used this early */
-#define USE_EARLY_PGTABLE_L5
-
 /*
  * Boot stub deals with identity mappings, physical and virtual addresses are
  * the same, so override these defines.
@@ -181,7 +178,6 @@ static inline int count_immovable_mem_regions(void) { return 0; }
 #endif
 
 /* ident_map_64.c */
-extern unsigned int __pgtable_l5_enabled, pgdir_shift, ptrs_per_p4d;
 extern void kernel_add_identity_map(unsigned long start, unsigned long end);
 
 /* Used by PAGE_KERN* macros: */
diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
index 5a6c7a190e5b..591d28f2feb6 100644
--- a/arch/x86/boot/compressed/pgtable_64.c
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -10,13 +10,6 @@
 #define BIOS_START_MIN		0x20000U	/* 128K, less than this is insane */
 #define BIOS_START_MAX		0x9f000U	/* 640K, absolute maximum */
 
-#ifdef CONFIG_X86_5LEVEL
-/* __pgtable_l5_enabled needs to be in .data to avoid being cleared along with .bss */
-unsigned int __section(".data") __pgtable_l5_enabled;
-unsigned int __section(".data") pgdir_shift = 39;
-unsigned int __section(".data") ptrs_per_p4d = 1;
-#endif
-
 /* Buffer to preserve trampoline memory */
 static char trampoline_save[TRAMPOLINE_32BIT_SIZE];
 
@@ -127,11 +120,6 @@ asmlinkage void configure_5level_paging(struct boot_params *bp, void *pgtable)
 	    native_cpuid_eax(0) >= 7 &&
 	    (native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) {
 		l5_required = true;
-
-		/* Initialize variables for 5-level paging */
-		__pgtable_l5_enabled = 1;
-		pgdir_shift = 48;
-		ptrs_per_p4d = 512;
 	}
 
 	/*
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index 3b2bc61c9408..d3e786ff7dcb 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -81,6 +81,7 @@ SECTIONS
 		*(.dynamic) *(.dynsym) *(.dynstr) *(.dynbss)
 		*(.hash) *(.gnu.hash)
 		*(.note.*)
+		*(.altinstructions .altinstr_replacement)
 	}
 
 	.got.plt (INFO) : {
diff --git a/arch/x86/boot/startup/map_kernel.c b/arch/x86/boot/startup/map_kernel.c
index 099ae2559336..11f99d907f76 100644
--- a/arch/x86/boot/startup/map_kernel.c
+++ b/arch/x86/boot/startup/map_kernel.c
@@ -16,19 +16,9 @@ extern unsigned int next_early_pgt;
 
 static inline bool check_la57_support(void)
 {
-	if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+	if (!pgtable_l5_enabled())
 		return false;
 
-	/*
-	 * 5-level paging is detected and enabled at kernel decompression
-	 * stage. Only check if it has been enabled there.
-	 */
-	if (!(native_read_cr4() & X86_CR4_LA57))
-		return false;
-
-	__pgtable_l5_enabled = 1;
-	pgdir_shift = 48;
-	ptrs_per_p4d = 512;
 	page_offset_base = __PAGE_OFFSET_BASE_L5;
 	vmalloc_base = __VMALLOC_BASE_L5;
 	vmemmap_base = __VMEMMAP_BASE_L5;
diff --git a/arch/x86/boot/startup/sme.c b/arch/x86/boot/startup/sme.c
index 5738b31c8e60..ade5ad5060e9 100644
--- a/arch/x86/boot/startup/sme.c
+++ b/arch/x86/boot/startup/sme.c
@@ -25,15 +25,6 @@
 #undef CONFIG_PARAVIRT_XXL
 #undef CONFIG_PARAVIRT_SPINLOCKS
 
-/*
- * This code runs before CPU feature bits are set. By default, the
- * pgtable_l5_enabled() function uses bit X86_FEATURE_LA57 to determine if
- * 5-level paging is active, so that won't work here. USE_EARLY_PGTABLE_L5
- * is provided to handle this situation and, instead, use a variable that
- * has been set by the early boot code.
- */
-#define USE_EARLY_PGTABLE_L5
-
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/mem_encrypt.h>
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 5bb782d856f2..40dceb7d80f5 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -6,7 +6,10 @@
 #ifndef __ASSEMBLER__
 #include <linux/types.h>
+#include <asm/alternative.h>
+#include <asm/cpufeatures.h>
 #include <asm/kaslr.h>
+#include <asm/processor-flags.h>
 
 /*
  * These are used to make use of C type-checking..
@@ -21,28 +24,24 @@ typedef unsigned long	pgprotval_t;
 typedef struct { pteval_t pte; } pte_t;
 typedef struct { pmdval_t pmd; } pmd_t;
 
-extern unsigned int __pgtable_l5_enabled;
-
-#ifdef CONFIG_X86_5LEVEL
-#ifdef USE_EARLY_PGTABLE_L5
-/*
- * cpu_feature_enabled() is not available in early boot code.
- * Use variable instead.
- */
-static inline bool pgtable_l5_enabled(void)
+static __always_inline __pure bool pgtable_l5_enabled(void)
 {
-	return __pgtable_l5_enabled;
-}
-#else
-#define pgtable_l5_enabled() cpu_feature_enabled(X86_FEATURE_LA57)
-#endif /* USE_EARLY_PGTABLE_L5 */
+	unsigned long r;
+	bool ret;
 
-#else
-#define pgtable_l5_enabled() 0
-#endif /* CONFIG_X86_5LEVEL */
+	if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+		return false;
 
-extern unsigned int pgdir_shift;
-extern unsigned int ptrs_per_p4d;
+	asm(ALTERNATIVE_TERNARY(
+		"movq %%cr4, %[reg] \n\t btl %[la57], %k[reg]" CC_SET(c),
+		%P[feat], "stc", "clc")
+		: [reg] "=&r" (r), CC_OUT(c) (ret)
+		: [feat] "i" (X86_FEATURE_LA57),
+		  [la57] "i" (X86_CR4_LA57_BIT)
+		: "cc");
+
+	return ret;
+}
 
 #endif /* !__ASSEMBLER__ */
 
@@ -53,7 +52,7 @@
 /*
  * PGDIR_SHIFT determines what a top-level page table entry can map
  */
-#define PGDIR_SHIFT	pgdir_shift
+#define PGDIR_SHIFT	(pgtable_l5_enabled() ? 48 : 39)
 #define PTRS_PER_PGD	512
 
 /*
@@ -61,7 +60,7 @@
 #define P4D_SHIFT		39
 #define MAX_PTRS_PER_P4D	512
-#define PTRS_PER_P4D		ptrs_per_p4d
+#define PTRS_PER_P4D		(pgtable_l5_enabled() ? 512 : 1)
 #define P4D_SIZE		(_AC(1, UL) << P4D_SHIFT)
 #define P4D_MASK		(~(P4D_SIZE - 1))
 
@@ -76,6 +75,8 @@
 #define PTRS_PER_PGD		512
 #define MAX_PTRS_PER_P4D	1
 
+#define MAX_POSSIBLE_PHYSMEM_BITS	46
+
 #endif /* CONFIG_X86_5LEVEL */
 
 /*
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 12126adbc3a9..eb6a7f6e20c4 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1,6 +1,4 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* cpu_feature_enabled() cannot be used this early */
-#define USE_EARLY_PGTABLE_L5
 
 #include <linux/memblock.h>
 #include <linux/linkage.h>
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 29226f3ac064..3d49abb1bb3a 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -5,9 +5,6 @@
  * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
  */
 
-/* cpu_feature_enabled() cannot be used this early */
-#define USE_EARLY_PGTABLE_L5
-
 #include <linux/init.h>
 #include <linux/linkage.h>
 #include <linux/types.h>
@@ -50,14 +47,6 @@ extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
 unsigned int __initdata next_early_pgt;
 pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
 
-#ifdef CONFIG_X86_5LEVEL
-unsigned int __pgtable_l5_enabled __ro_after_init;
-unsigned int pgdir_shift __ro_after_init = 39;
-EXPORT_SYMBOL(pgdir_shift);
-unsigned int ptrs_per_p4d __ro_after_init = 1;
-EXPORT_SYMBOL(ptrs_per_p4d);
-#endif
-
 #ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT
 unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4;
 EXPORT_SYMBOL(page_offset_base);
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 0539efd0d216..7c4fafbd52cc 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -1,9 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #define pr_fmt(fmt) "kasan: " fmt
 
-/* cpu_feature_enabled() cannot be used this early */
-#define USE_EARLY_PGTABLE_L5
-
 #include <linux/memblock.h>
 #include <linux/kasan.h>
 #include <linux/kdebug.h>