@@ -15,7 +15,7 @@ KMSAN_SANITIZE := n
UBSAN_SANITIZE := n
KCOV_INSTRUMENT := n
-obj-$(CONFIG_X86_64) += gdt_idt.o
+obj-$(CONFIG_X86_64) += gdt_idt.o map_kernel.o
lib-$(CONFIG_X86_64) += la57toggle.o
lib-$(CONFIG_EFI_MIXED) += efi-mixed.o
new file mode 100644
@@ -0,0 +1,232 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/pgtable.h>
+
+#include <asm/sections.h>
+#include <asm/setup.h>
+#include <asm/sev.h>
+
+extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
+extern unsigned int next_early_pgt;
+
+#ifdef CONFIG_X86_5LEVEL
+unsigned int __pgtable_l5_enabled __ro_after_init;
+unsigned int pgdir_shift __ro_after_init = 39;
+EXPORT_SYMBOL(pgdir_shift);
+unsigned int ptrs_per_p4d __ro_after_init = 1;
+EXPORT_SYMBOL(ptrs_per_p4d);
+#endif
+
+#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT
+unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4;
+EXPORT_SYMBOL(page_offset_base);
+unsigned long vmalloc_base __ro_after_init = __VMALLOC_BASE_L4;
+EXPORT_SYMBOL(vmalloc_base);
+unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4;
+EXPORT_SYMBOL(vmemmap_base);
+#endif
+
+static inline bool check_la57_support(void)
+{
+ if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+ return false;
+
+ /*
+ * 5-level paging is detected and enabled at kernel decompression
+ * stage. Only check if it has been enabled there.
+ */
+ if (!(native_read_cr4() & X86_CR4_LA57))
+ return false;
+
+ __pgtable_l5_enabled = 1;
+ pgdir_shift = 48;
+ ptrs_per_p4d = 512;
+ page_offset_base = __PAGE_OFFSET_BASE_L5;
+ vmalloc_base = __VMALLOC_BASE_L5;
+ vmemmap_base = __VMEMMAP_BASE_L5;
+
+ return true;
+}
+
+static unsigned long sme_postprocess_startup(struct boot_params *bp,
+ pmdval_t *pmd,
+ unsigned long p2v_offset)
+{
+ unsigned long paddr, paddr_end;
+ int i;
+
+ /* Encrypt the kernel and related (if SME is active) */
+ sme_encrypt_kernel(bp);
+
+ /*
+ * Clear the memory encryption mask from the .bss..decrypted section.
+ * The bss section will be memset to zero later in the initialization so
+ * there is no need to zero it after changing the memory encryption
+ * attribute.
+ */
+ if (sme_get_me_mask()) {
+ paddr = (unsigned long)__start_bss_decrypted;
+ paddr_end = (unsigned long)__end_bss_decrypted;
+
+ for (; paddr < paddr_end; paddr += PMD_SIZE) {
+ /*
+ * On SNP, transition the page to shared in the RMP table so that
+ * it is consistent with the page table attribute change.
+ *
+ * __start_bss_decrypted has a virtual address in the high range
+ * mapping (kernel .text). PVALIDATE, by way of
+ * early_snp_set_memory_shared(), requires a valid virtual
+ * address but the kernel is currently running off of the identity
+ * mapping so use the PA to get a *currently* valid virtual address.
+ */
+ early_snp_set_memory_shared(paddr, paddr, PTRS_PER_PMD);
+
+ i = pmd_index(paddr - p2v_offset);
+ pmd[i] -= sme_get_me_mask();
+ }
+ }
+
+ /*
+ * Return the SME encryption mask (if SME is active) to be used as a
+ * modifier for the initial pgdir entry programmed into CR3.
+ */
+ return sme_get_me_mask();
+}
+
+unsigned long __init __startup_64(unsigned long p2v_offset,
+ struct boot_params *bp)
+{
+ pmd_t (*early_pgts)[PTRS_PER_PMD] = early_dynamic_pgts;
+ unsigned long physaddr = (unsigned long)_text;
+ unsigned long va_text, va_end;
+ unsigned long pgtable_flags;
+ unsigned long load_delta;
+ pgdval_t *pgd;
+ p4dval_t *p4d;
+ pudval_t *pud;
+ pmdval_t *pmd, pmd_entry;
+ bool la57;
+ int i;
+
+ la57 = check_la57_support();
+
+ /* Is the address too large? */
+ if (physaddr >> MAX_PHYSMEM_BITS)
+ for (;;);
+
+ /*
+ * Compute the delta between the address I am compiled to run at
+ * and the address I am actually running at.
+ */
+ phys_base = load_delta = __START_KERNEL_map + p2v_offset;
+
+ /* Is the address not 2M aligned? */
+ if (load_delta & ~PMD_MASK)
+ for (;;);
+
+ va_text = physaddr - p2v_offset;
+ va_end = (unsigned long)_end - p2v_offset;
+
+ /* Include the SME encryption mask in the fixup value */
+ load_delta += sme_get_me_mask();
+
+ /* Fixup the physical addresses in the page table */
+
+ pgd = &early_top_pgt[0].pgd;
+ pgd[pgd_index(__START_KERNEL_map)] += load_delta;
+
+ if (IS_ENABLED(CONFIG_X86_5LEVEL) && la57) {
+ p4d = (p4dval_t *)level4_kernel_pgt;
+ p4d[MAX_PTRS_PER_P4D - 1] += load_delta;
+
+ pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE;
+ }
+
+ level3_kernel_pgt[PTRS_PER_PUD - 2].pud += load_delta;
+ level3_kernel_pgt[PTRS_PER_PUD - 1].pud += load_delta;
+
+ for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--)
+ level2_fixmap_pgt[i].pmd += load_delta;
+
+ /*
+ * Set up the identity mapping for the switchover. These
+ * entries should *NOT* have the global bit set! This also
+ * creates a bunch of nonsense entries but that is fine --
+ * it avoids problems around wraparound.
+ */
+
+ pud = &early_pgts[0]->pmd;
+ pmd = &early_pgts[1]->pmd;
+ next_early_pgt = 2;
+
+ pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
+
+ if (la57) {
+ p4d = &early_pgts[next_early_pgt++]->pmd;
+
+ i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
+ pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
+ pgd[i + 1] = (pgdval_t)p4d + pgtable_flags;
+
+ i = physaddr >> P4D_SHIFT;
+ p4d[(i + 0) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
+ p4d[(i + 1) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
+ } else {
+ i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
+ pgd[i + 0] = (pgdval_t)pud + pgtable_flags;
+ pgd[i + 1] = (pgdval_t)pud + pgtable_flags;
+ }
+
+ i = physaddr >> PUD_SHIFT;
+ pud[(i + 0) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
+ pud[(i + 1) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
+
+ pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
+ /* Filter out unsupported __PAGE_KERNEL_* bits: */
+ pmd_entry &= __supported_pte_mask;
+ pmd_entry += sme_get_me_mask();
+ pmd_entry += physaddr;
+
+ for (i = 0; i < DIV_ROUND_UP(va_end - va_text, PMD_SIZE); i++) {
+ int idx = i + (physaddr >> PMD_SHIFT);
+
+ pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE;
+ }
+
+ /*
+ * Fixup the kernel text+data virtual addresses. Note that
+ * we might write invalid pmds, when the kernel is relocated
+ * cleanup_highmap() fixes this up along with the mappings
+ * beyond _end.
+ *
+ * Only the region occupied by the kernel image has so far
+ * been checked against the table of usable memory regions
+ * provided by the firmware, so invalidate pages outside that
+ * region. A page table entry that maps to a reserved area of
+ * memory would allow processor speculation into that area,
+ * and on some hardware (particularly the UV platform) even
+ * speculative access to some reserved areas is caught as an
+ * error, causing the BIOS to halt the system.
+ */
+
+ pmd = &level2_kernel_pgt[0].pmd;
+
+ /* invalidate pages before the kernel image */
+ for (i = 0; i < pmd_index(va_text); i++)
+ pmd[i] &= ~_PAGE_PRESENT;
+
+ /* fixup pages that are part of the kernel image */
+ for (; i <= pmd_index(va_end); i++)
+ if (pmd[i] & _PAGE_PRESENT)
+ pmd[i] += load_delta;
+
+ /* invalidate pages after the kernel image */
+ for (; i < PTRS_PER_PMD; i++)
+ pmd[i] &= ~_PAGE_PRESENT;
+
+ return sme_postprocess_startup(bp, pmd, p2v_offset);
+}
@@ -47,235 +47,9 @@
* Manage page tables very early on.
*/
extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
-static unsigned int __initdata next_early_pgt;
+unsigned int __initdata next_early_pgt;
pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
-#ifdef CONFIG_X86_5LEVEL
-unsigned int __pgtable_l5_enabled __ro_after_init;
-unsigned int pgdir_shift __ro_after_init = 39;
-EXPORT_SYMBOL(pgdir_shift);
-unsigned int ptrs_per_p4d __ro_after_init = 1;
-EXPORT_SYMBOL(ptrs_per_p4d);
-#endif
-
-#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT
-unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4;
-EXPORT_SYMBOL(page_offset_base);
-unsigned long vmalloc_base __ro_after_init = __VMALLOC_BASE_L4;
-EXPORT_SYMBOL(vmalloc_base);
-unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4;
-EXPORT_SYMBOL(vmemmap_base);
-#endif
-
-static inline bool check_la57_support(void)
-{
- if (!IS_ENABLED(CONFIG_X86_5LEVEL))
- return false;
-
- /*
- * 5-level paging is detected and enabled at kernel decompression
- * stage. Only check if it has been enabled there.
- */
- if (!(native_read_cr4() & X86_CR4_LA57))
- return false;
-
- RIP_REL_REF(__pgtable_l5_enabled) = 1;
- RIP_REL_REF(pgdir_shift) = 48;
- RIP_REL_REF(ptrs_per_p4d) = 512;
- RIP_REL_REF(page_offset_base) = __PAGE_OFFSET_BASE_L5;
- RIP_REL_REF(vmalloc_base) = __VMALLOC_BASE_L5;
- RIP_REL_REF(vmemmap_base) = __VMEMMAP_BASE_L5;
-
- return true;
-}
-
-static unsigned long __head sme_postprocess_startup(struct boot_params *bp,
- pmdval_t *pmd,
- unsigned long p2v_offset)
-{
- unsigned long paddr, paddr_end;
- int i;
-
- /* Encrypt the kernel and related (if SME is active) */
- sme_encrypt_kernel(bp);
-
- /*
- * Clear the memory encryption mask from the .bss..decrypted section.
- * The bss section will be memset to zero later in the initialization so
- * there is no need to zero it after changing the memory encryption
- * attribute.
- */
- if (sme_get_me_mask()) {
- paddr = (unsigned long)&RIP_REL_REF(__start_bss_decrypted);
- paddr_end = (unsigned long)&RIP_REL_REF(__end_bss_decrypted);
-
- for (; paddr < paddr_end; paddr += PMD_SIZE) {
- /*
- * On SNP, transition the page to shared in the RMP table so that
- * it is consistent with the page table attribute change.
- *
- * __start_bss_decrypted has a virtual address in the high range
- * mapping (kernel .text). PVALIDATE, by way of
- * early_snp_set_memory_shared(), requires a valid virtual
- * address but the kernel is currently running off of the identity
- * mapping so use the PA to get a *currently* valid virtual address.
- */
- early_snp_set_memory_shared(paddr, paddr, PTRS_PER_PMD);
-
- i = pmd_index(paddr - p2v_offset);
- pmd[i] -= sme_get_me_mask();
- }
- }
-
- /*
- * Return the SME encryption mask (if SME is active) to be used as a
- * modifier for the initial pgdir entry programmed into CR3.
- */
- return sme_get_me_mask();
-}
-
-/* Code in __startup_64() can be relocated during execution, but the compiler
- * doesn't have to generate PC-relative relocations when accessing globals from
- * that function. Clang actually does not generate them, which leads to
- * boot-time crashes. To work around this problem, every global pointer must
- * be accessed using RIP_REL_REF(). Kernel virtual addresses can be determined
- * by subtracting p2v_offset from the RIP-relative address.
- */
-unsigned long __head __startup_64(unsigned long p2v_offset,
- struct boot_params *bp)
-{
- pmd_t (*early_pgts)[PTRS_PER_PMD] = RIP_REL_REF(early_dynamic_pgts);
- unsigned long physaddr = (unsigned long)&RIP_REL_REF(_text);
- unsigned long va_text, va_end;
- unsigned long pgtable_flags;
- unsigned long load_delta;
- pgdval_t *pgd;
- p4dval_t *p4d;
- pudval_t *pud;
- pmdval_t *pmd, pmd_entry;
- bool la57;
- int i;
-
- la57 = check_la57_support();
-
- /* Is the address too large? */
- if (physaddr >> MAX_PHYSMEM_BITS)
- for (;;);
-
- /*
- * Compute the delta between the address I am compiled to run at
- * and the address I am actually running at.
- */
- load_delta = __START_KERNEL_map + p2v_offset;
- RIP_REL_REF(phys_base) = load_delta;
-
- /* Is the address not 2M aligned? */
- if (load_delta & ~PMD_MASK)
- for (;;);
-
- va_text = physaddr - p2v_offset;
- va_end = (unsigned long)&RIP_REL_REF(_end) - p2v_offset;
-
- /* Include the SME encryption mask in the fixup value */
- load_delta += sme_get_me_mask();
-
- /* Fixup the physical addresses in the page table */
-
- pgd = &RIP_REL_REF(early_top_pgt)->pgd;
- pgd[pgd_index(__START_KERNEL_map)] += load_delta;
-
- if (IS_ENABLED(CONFIG_X86_5LEVEL) && la57) {
- p4d = (p4dval_t *)&RIP_REL_REF(level4_kernel_pgt);
- p4d[MAX_PTRS_PER_P4D - 1] += load_delta;
-
- pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE;
- }
-
- RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 2].pud += load_delta;
- RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 1].pud += load_delta;
-
- for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--)
- RIP_REL_REF(level2_fixmap_pgt)[i].pmd += load_delta;
-
- /*
- * Set up the identity mapping for the switchover. These
- * entries should *NOT* have the global bit set! This also
- * creates a bunch of nonsense entries but that is fine --
- * it avoids problems around wraparound.
- */
-
- pud = &early_pgts[0]->pmd;
- pmd = &early_pgts[1]->pmd;
- RIP_REL_REF(next_early_pgt) = 2;
-
- pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
-
- if (la57) {
- p4d = &early_pgts[RIP_REL_REF(next_early_pgt)++]->pmd;
-
- i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
- pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
- pgd[i + 1] = (pgdval_t)p4d + pgtable_flags;
-
- i = physaddr >> P4D_SHIFT;
- p4d[(i + 0) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
- p4d[(i + 1) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
- } else {
- i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
- pgd[i + 0] = (pgdval_t)pud + pgtable_flags;
- pgd[i + 1] = (pgdval_t)pud + pgtable_flags;
- }
-
- i = physaddr >> PUD_SHIFT;
- pud[(i + 0) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
- pud[(i + 1) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
-
- pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
- /* Filter out unsupported __PAGE_KERNEL_* bits: */
- pmd_entry &= RIP_REL_REF(__supported_pte_mask);
- pmd_entry += sme_get_me_mask();
- pmd_entry += physaddr;
-
- for (i = 0; i < DIV_ROUND_UP(va_end - va_text, PMD_SIZE); i++) {
- int idx = i + (physaddr >> PMD_SHIFT);
-
- pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE;
- }
-
- /*
- * Fixup the kernel text+data virtual addresses. Note that
- * we might write invalid pmds, when the kernel is relocated
- * cleanup_highmap() fixes this up along with the mappings
- * beyond _end.
- *
- * Only the region occupied by the kernel image has so far
- * been checked against the table of usable memory regions
- * provided by the firmware, so invalidate pages outside that
- * region. A page table entry that maps to a reserved area of
- * memory would allow processor speculation into that area,
- * and on some hardware (particularly the UV platform) even
- * speculative access to some reserved areas is caught as an
- * error, causing the BIOS to halt the system.
- */
-
- pmd = &RIP_REL_REF(level2_kernel_pgt)->pmd;
-
- /* invalidate pages before the kernel image */
- for (i = 0; i < pmd_index(va_text); i++)
- pmd[i] &= ~_PAGE_PRESENT;
-
- /* fixup pages that are part of the kernel image */
- for (; i <= pmd_index(va_end); i++)
- if (pmd[i] & _PAGE_PRESENT)
- pmd[i] += load_delta;
-
- /* invalidate pages after the kernel image */
- for (; i < PTRS_PER_PMD; i++)
- pmd[i] &= ~_PAGE_PRESENT;
-
- return sme_postprocess_startup(bp, pmd, p2v_offset);
-}
-
/* Wipe all early page tables except for the kernel symbol map */
static void __init reset_early_page_tables(void)
{