diff mbox series

[v2] x86/kexec: Add EFI config table identity mapping for kexec kernel

Message ID 20230601072043.24439-1-ltao@redhat.com
State New
Headers show
Series [v2] x86/kexec: Add EFI config table identity mapping for kexec kernel | expand

Commit Message

Tao Liu June 1, 2023, 7:20 a.m. UTC
A kexec kernel bootup hang is observed on Intel Atom cpu due to unmapped
EFI config table.

Currently EFI system table is identity-mapped for the kexec kernel, but EFI
config table is not mapped explicitly:

    commit 6bbeb276b71f ("x86/kexec: Add the EFI system tables and ACPI
                          tables to the ident map")

Later in the following 2 commits, EFI config table will be accessed when
enabling sev at kernel startup. This may result in a page fault due to EFI
config table's unmapped address. Since the page fault occurs at an early
stage, it is unrecoverable and kernel hangs.

    commit ec1c66af3a30 ("x86/compressed/64: Detect/setup SEV/SME features
                          earlier during boot")
    commit c01fce9cef84 ("x86/compressed: Add SEV-SNP feature
                          detection/setup")

In addition, the issue doesn't appear on all systems, because the kexec
kernel uses Page Size Extension (PSE) for identity mapping. In most cases,
EFI config table can end up to be mapped into due to 1 GB page size.
However if nogbpages is set, or cpu doesn't support pdpe1gb feature
(e.g Intel Atom x6425RE cpu), EFI config table may not be mapped into
due to 2 MB page size, thus a page fault hang is more likely to happen.

This patch will make sure the EFI config table is always mapped.

Signed-off-by: Tao Liu <ltao@redhat.com>
---
Changes in v2:
- Rephrase the change log based on Baoquan's suggestion.
- Rename map_efi_sys_cfg_tab() to map_efi_tables().
- Link to v1: https://lore.kernel.org/kexec/20230525094914.23420-1-ltao@redhat.com/
---
 arch/x86/kernel/machine_kexec_64.c | 35 ++++++++++++++++++++++++++----
 1 file changed, 31 insertions(+), 4 deletions(-)

Comments

Tao Liu June 8, 2023, 7:13 a.m. UTC | #1
Hello maintainers,

Sorry to interrupt. Currently I'm holding a machine which can be used
to reproduce the original issue and test the patch. However I may need
to return the machine in a short time. So if any updating and testing
needed for patch v3, please let me know. Thanks in advance!

Thanks,
Tao Liu

On Thu, Jun 1, 2023 at 4:25 PM Tao Liu <ltao@redhat.com> wrote:
>
> Hi Baoquan,
>
> On Thu, Jun 1, 2023 at 4:13 PM Baoquan He <bhe@redhat.com> wrote:
> >
> > On 06/01/23 at 03:20pm, Tao Liu wrote:
> > > A kexec kernel bootup hang is observed on Intel Atom cpu due to unmapped
> > > EFI config table.
> > >
> > > Currently EFI system table is identity-mapped for the kexec kernel, but EFI
> > > config table is not mapped explicitly:
> > >
> > >     commit 6bbeb276b71f ("x86/kexec: Add the EFI system tables and ACPI
> > >                           tables to the ident map")
> > >
> > > Later in the following 2 commits, EFI config table will be accessed when
> > > enabling sev at kernel startup. This may result in a page fault due to EFI
> > > config table's unmapped address. Since the page fault occurs at an early
> > > stage, it is unrecoverable and kernel hangs.
> > >
> > >     commit ec1c66af3a30 ("x86/compressed/64: Detect/setup SEV/SME features
> > >                           earlier during boot")
> > >     commit c01fce9cef84 ("x86/compressed: Add SEV-SNP feature
> > >                           detection/setup")
> > >
> > > In addition, the issue doesn't appear on all systems, because the kexec
> > > kernel uses Page Size Extension (PSE) for identity mapping. In most cases,
> > > EFI config table can end up to be mapped into due to 1 GB page size.
> > > However if nogbpages is set, or cpu doesn't support pdpe1gb feature
> > > (e.g Intel Atom x6425RE cpu), EFI config table may not be mapped into
> > > due to 2 MB page size, thus a page fault hang is more likely to happen.
> > >
> > > This patch will make sure the EFI config table is always mapped.
> > >
> > > Signed-off-by: Tao Liu <ltao@redhat.com>
> > > ---
> > > Changes in v2:
> > > - Rephrase the change log based on Baoquan's suggestion.
> > > - Rename map_efi_sys_cfg_tab() to map_efi_tables().
> > > - Link to v1: https://lore.kernel.org/kexec/20230525094914.23420-1-ltao@redhat.com/
> > > ---
> > >  arch/x86/kernel/machine_kexec_64.c | 35 ++++++++++++++++++++++++++----
> > >  1 file changed, 31 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
> > > index 1a3e2c05a8a5..664aefa6e896 100644
> > > --- a/arch/x86/kernel/machine_kexec_64.c
> > > +++ b/arch/x86/kernel/machine_kexec_64.c
> > > @@ -28,6 +28,7 @@
> > >  #include <asm/setup.h>
> > >  #include <asm/set_memory.h>
> > >  #include <asm/cpu.h>
> > > +#include <asm/efi.h>
> > >
> > >  #ifdef CONFIG_ACPI
> > >  /*
> > > @@ -86,10 +87,12 @@ const struct kexec_file_ops * const kexec_file_loaders[] = {
> > >  #endif
> > >
> > >  static int
> > > -map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)
> > > +map_efi_tables(struct x86_mapping_info *info, pgd_t *level4p)
> > >  {
> > >  #ifdef CONFIG_EFI
> > >       unsigned long mstart, mend;
> > > +     void *kaddr;
> > > +     int ret;
> > >
> > >       if (!efi_enabled(EFI_BOOT))
> > >               return 0;
> > > @@ -105,6 +108,30 @@ map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)
> > >       if (!mstart)
> > >               return 0;
> > >
> > > +     ret = kernel_ident_mapping_init(info, level4p, mstart, mend);
> > > +     if (ret)
> > > +             return ret;
> > > +
> > > +     kaddr = memremap(mstart, mend - mstart, MEMREMAP_WB);
> > > +     if (!kaddr) {
> > > +             pr_err("Could not map UEFI system table\n");
> > > +             return -ENOMEM;
> > > +     }
> > > +
> > > +     mstart = efi_config_table;
> > > +
> > > +     if (efi_enabled(EFI_64BIT)) {
> > > +             efi_system_table_64_t *stbl = (efi_system_table_64_t *)kaddr;
> > > +
> > > +             mend = mstart + sizeof(efi_config_table_64_t) * stbl->nr_tables;
> > > +     } else {
> > > +             efi_system_table_32_t *stbl = (efi_system_table_32_t *)kaddr;
> > > +
> > > +             mend = mstart + sizeof(efi_config_table_32_t) * stbl->nr_tables;
> > > +     }
> > > +
> > > +     memunmap(kaddr);
> > > +
> > >       return kernel_ident_mapping_init(info, level4p, mstart, mend);
> > >  #endif
> > >       return 0;
> > > @@ -244,10 +271,10 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
> > >       }
> > >
> > >       /*
> > > -      * Prepare EFI systab and ACPI tables for kexec kernel since they are
> > > -      * not covered by pfn_mapped.
> > > +      * Prepare EFI systab, config table and ACPI tables for kexec kernel
> >
> > The code comment need be updated too?
> >
> >          * Prepare EFI tables and ACPI tables for kexec kernel since they are
> >          * not covered by pfn_mapped.
> >
> > Other than this nit, this patch looks good to me, thanks.
> >
>
> Thanks for the patch review! I'm OK with the comment update, but I
> prefer to leave it as it is. Since the comment provides more details:
> there are systab and config tables mapped instead of all efi tables.
>
> Thanks,
> Tao Liu
>
> > Acked-by: Baoquan He <bhe@redhat.com>
> >
> >
> > > +      * since they are not covered by pfn_mapped.
> > >        */
> > > -     result = map_efi_systab(&info, level4p);
> > > +     result = map_efi_tables(&info, level4p);
> > >       if (result)
> > >               return result;
> > >
> > > --
> > > 2.33.1
> > >
> >
Baoquan He June 16, 2023, 12:24 p.m. UTC | #2
Hi,

On 06/01/23 at 03:20pm, Tao Liu wrote:
> A kexec kernel bootup hang is observed on Intel Atom cpu due to unmapped
> EFI config table.

Ping!

The issue is observed on Lenovo ThinkEdge mini PC owning 'Intel
Atom(R) x6425RE' cpu, and reported by Lenovo engineer. On the machine,
kdump kernel switching will hang immediately w/o any prompt. Tao added
debugging info to finally position and find out the root cause.

Could you help check and consider accepting it or comment if there's
any further work Tao need do to correct or improve?

Thanks
Baoquan

> 
> Currently EFI system table is identity-mapped for the kexec kernel, but EFI
> config table is not mapped explicitly:
> 
>     commit 6bbeb276b71f ("x86/kexec: Add the EFI system tables and ACPI
>                           tables to the ident map")
> 
> Later in the following 2 commits, EFI config table will be accessed when
> enabling sev at kernel startup. This may result in a page fault due to EFI
> config table's unmapped address. Since the page fault occurs at an early
> stage, it is unrecoverable and kernel hangs.
> 
>     commit ec1c66af3a30 ("x86/compressed/64: Detect/setup SEV/SME features
>                           earlier during boot")
>     commit c01fce9cef84 ("x86/compressed: Add SEV-SNP feature
>                           detection/setup")
> 
> In addition, the issue doesn't appear on all systems, because the kexec
> kernel uses Page Size Extension (PSE) for identity mapping. In most cases,
> EFI config table can end up to be mapped into due to 1 GB page size.
> However if nogbpages is set, or cpu doesn't support pdpe1gb feature
> (e.g Intel Atom x6425RE cpu), EFI config table may not be mapped into
> due to 2 MB page size, thus a page fault hang is more likely to happen.
> 
> This patch will make sure the EFI config table is always mapped.
> 
> Signed-off-by: Tao Liu <ltao@redhat.com>
> ---
> Changes in v2:
> - Rephrase the change log based on Baoquan's suggestion.
> - Rename map_efi_sys_cfg_tab() to map_efi_tables().
> - Link to v1: https://lore.kernel.org/kexec/20230525094914.23420-1-ltao@redhat.com/
> ---
>  arch/x86/kernel/machine_kexec_64.c | 35 ++++++++++++++++++++++++++----
>  1 file changed, 31 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
> index 1a3e2c05a8a5..664aefa6e896 100644
> --- a/arch/x86/kernel/machine_kexec_64.c
> +++ b/arch/x86/kernel/machine_kexec_64.c
> @@ -28,6 +28,7 @@
>  #include <asm/setup.h>
>  #include <asm/set_memory.h>
>  #include <asm/cpu.h>
> +#include <asm/efi.h>
>  
>  #ifdef CONFIG_ACPI
>  /*
> @@ -86,10 +87,12 @@ const struct kexec_file_ops * const kexec_file_loaders[] = {
>  #endif
>  
>  static int
> -map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)
> +map_efi_tables(struct x86_mapping_info *info, pgd_t *level4p)
>  {
>  #ifdef CONFIG_EFI
>  	unsigned long mstart, mend;
> +	void *kaddr;
> +	int ret;
>  
>  	if (!efi_enabled(EFI_BOOT))
>  		return 0;
> @@ -105,6 +108,30 @@ map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)
>  	if (!mstart)
>  		return 0;
>  
> +	ret = kernel_ident_mapping_init(info, level4p, mstart, mend);
> +	if (ret)
> +		return ret;
> +
> +	kaddr = memremap(mstart, mend - mstart, MEMREMAP_WB);
> +	if (!kaddr) {
> +		pr_err("Could not map UEFI system table\n");
> +		return -ENOMEM;
> +	}
> +
> +	mstart = efi_config_table;
> +
> +	if (efi_enabled(EFI_64BIT)) {
> +		efi_system_table_64_t *stbl = (efi_system_table_64_t *)kaddr;
> +
> +		mend = mstart + sizeof(efi_config_table_64_t) * stbl->nr_tables;
> +	} else {
> +		efi_system_table_32_t *stbl = (efi_system_table_32_t *)kaddr;
> +
> +		mend = mstart + sizeof(efi_config_table_32_t) * stbl->nr_tables;
> +	}
> +
> +	memunmap(kaddr);
> +
>  	return kernel_ident_mapping_init(info, level4p, mstart, mend);
>  #endif
>  	return 0;
> @@ -244,10 +271,10 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
>  	}
>  
>  	/*
> -	 * Prepare EFI systab and ACPI tables for kexec kernel since they are
> -	 * not covered by pfn_mapped.
> +	 * Prepare EFI systab, config table and ACPI tables for kexec kernel
> +	 * since they are not covered by pfn_mapped.
>  	 */
> -	result = map_efi_systab(&info, level4p);
> +	result = map_efi_tables(&info, level4p);
>  	if (result)
>  		return result;
>  
> -- 
> 2.33.1
>
Borislav Petkov July 5, 2023, 5:33 p.m. UTC | #3
On Thu, Jun 01, 2023 at 03:20:44PM +0800, Tao Liu wrote:
> A kexec kernel bootup hang is observed on Intel Atom cpu due to unmapped

s/cpu/CPU/g

> EFI config table.
> 
> Currently EFI system table is identity-mapped for the kexec kernel, but EFI
> config table is not mapped explicitly:

Why does the EFI config table *need* to be mapped explicitly?

>     commit 6bbeb276b71f ("x86/kexec: Add the EFI system tables and ACPI
>                           tables to the ident map")
> 
> Later in the following 2 commits, EFI config table will be accessed when
> enabling sev at kernel startup.

What does SEV have to do with an Intel problem?

> This may result in a page fault due to EFI
> config table's unmapped address. Since the page fault occurs at an early
> stage, it is unrecoverable and kernel hangs.
> 
>     commit ec1c66af3a30 ("x86/compressed/64: Detect/setup SEV/SME features
>                           earlier during boot")
>     commit c01fce9cef84 ("x86/compressed: Add SEV-SNP feature
>                           detection/setup")
> 
> In addition, the issue doesn't appear on all systems, because the kexec
> kernel uses Page Size Extension (PSE) for identity mapping. In most cases,
> EFI config table can end up to be mapped into due to 1 GB page size.
> However if nogbpages is set, or cpu doesn't support pdpe1gb feature
> (e.g Intel Atom x6425RE cpu), EFI config table may not be mapped into
> due to 2 MB page size, thus a page fault hang is more likely to happen.

This doesn't answer my question above.

> This patch will make sure the EFI config table is always mapped.

Avoid having "This patch" or "This commit" in the commit message. It is
tautologically useless.

Also, do

$ git grep 'This patch' Documentation/process

for more details.


> 
> Signed-off-by: Tao Liu <ltao@redhat.com>
> ---
> Changes in v2:
> - Rephrase the change log based on Baoquan's suggestion.
> - Rename map_efi_sys_cfg_tab() to map_efi_tables().
> - Link to v1: https://lore.kernel.org/kexec/20230525094914.23420-1-ltao@redhat.com/
> ---
>  arch/x86/kernel/machine_kexec_64.c | 35 ++++++++++++++++++++++++++----
>  1 file changed, 31 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
> index 1a3e2c05a8a5..664aefa6e896 100644
> --- a/arch/x86/kernel/machine_kexec_64.c
> +++ b/arch/x86/kernel/machine_kexec_64.c
> @@ -28,6 +28,7 @@
>  #include <asm/setup.h>
>  #include <asm/set_memory.h>
>  #include <asm/cpu.h>
> +#include <asm/efi.h>
>  
>  #ifdef CONFIG_ACPI
>  /*
> @@ -86,10 +87,12 @@ const struct kexec_file_ops * const kexec_file_loaders[] = {
>  #endif
>  
>  static int
> -map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)
> +map_efi_tables(struct x86_mapping_info *info, pgd_t *level4p)
>  {
>  #ifdef CONFIG_EFI
>  	unsigned long mstart, mend;
> +	void *kaddr;
> +	int ret;
>  
>  	if (!efi_enabled(EFI_BOOT))
>  		return 0;
> @@ -105,6 +108,30 @@ map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)
>  	if (!mstart)
>  		return 0;
>  
> +	ret = kernel_ident_mapping_init(info, level4p, mstart, mend);
> +	if (ret)
> +		return ret;
> +
> +	kaddr = memremap(mstart, mend - mstart, MEMREMAP_WB);
> +	if (!kaddr) {
> +		pr_err("Could not map UEFI system table\n");
> +		return -ENOMEM;
> +	}
> +
> +	mstart = efi_config_table;

Yeah, about this, did you see efi_reuse_config() and the comment above
it especially?

Or is it that the EFI in that box wants the config table mapped 1:1 and
accesses it during boot/kexec?

In any case, this is all cloudy without a proper root cause.

Also, I'd like for Ard to have a look at this too.

Thx.
Tao Liu July 7, 2023, 3:38 a.m. UTC | #4
Hi Borislav,

Thanks for the patch review!

On Thu, Jul 6, 2023 at 1:34 AM Borislav Petkov <bp@alien8.de> wrote:
>
> On Thu, Jun 01, 2023 at 03:20:44PM +0800, Tao Liu wrote:
> > A kexec kernel bootup hang is observed on Intel Atom cpu due to unmapped
>
> s/cpu/CPU/g
>
> > EFI config table.
> >
> > Currently EFI system table is identity-mapped for the kexec kernel, but EFI
> > config table is not mapped explicitly:
>
> Why does the EFI config table *need* to be mapped explicitly?
>
> >     commit 6bbeb276b71f ("x86/kexec: Add the EFI system tables and ACPI
> >                           tables to the ident map")
> >
> > Later in the following 2 commits, EFI config table will be accessed when
> > enabling sev at kernel startup.
>
> What does SEV have to do with an Intel problem?

For the 2 questions above. The call stack is follows:

head_64.S:.Lon_kernel_cs(which is before extract_kernel) -> sev_enable
-> snp_init -> find_cc_blob -> find_cc_blob_efi. No matter what cpu,
with CONFIG_AMD_MEM_ENCRYPT enabled, all will enter sev_enable() and
go through these functions. I assume it is harmless for Intel cpu,
normally just exit if sev enable conditions not met. However the efi
config table will be iterated in find_cc_blob_efi(), in order to find
if there is EFI_CC_BLOB_GUID(Confidential Computing blob) in the
vendor table.

>
> > This may result in a page fault due to EFI
> > config table's unmapped address. Since the page fault occurs at an early
> > stage, it is unrecoverable and kernel hangs.
> >
> >     commit ec1c66af3a30 ("x86/compressed/64: Detect/setup SEV/SME features
> >                           earlier during boot")
> >     commit c01fce9cef84 ("x86/compressed: Add SEV-SNP feature
> >                           detection/setup")
> >
> > In addition, the issue doesn't appear on all systems, because the kexec
> > kernel uses Page Size Extension (PSE) for identity mapping. In most cases,
> > EFI config table can end up to be mapped into due to 1 GB page size.
> > However if nogbpages is set, or cpu doesn't support pdpe1gb feature
> > (e.g Intel Atom x6425RE cpu), EFI config table may not be mapped into
> > due to 2 MB page size, thus a page fault hang is more likely to happen.
>
> This doesn't answer my question above.

Currently the efi config table is not explicitly ident mapped for 2nd
kernel. In a few "unlucky" cases, 2nd kernel will page fault during
find_cc_blob_efi() and unrecoverable, but for most cases, it is
"lucky" with no problem because PSE and pdpe1gb can make config table
mapped into when ident mapping something else.

>
> > This patch will make sure the EFI config table is always mapped.
>
> Avoid having "This patch" or "This commit" in the commit message. It is
> tautologically useless.
>
> Also, do
>
> $ git grep 'This patch' Documentation/process
>
> for more details.

Thanks, I will get it improved in v3.

>
>
> >
> > Signed-off-by: Tao Liu <ltao@redhat.com>
> > ---
> > Changes in v2:
> > - Rephrase the change log based on Baoquan's suggestion.
> > - Rename map_efi_sys_cfg_tab() to map_efi_tables().
> > - Link to v1: https://lore.kernel.org/kexec/20230525094914.23420-1-ltao@redhat.com/
> > ---
> >  arch/x86/kernel/machine_kexec_64.c | 35 ++++++++++++++++++++++++++----
> >  1 file changed, 31 insertions(+), 4 deletions(-)
> >
> > diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
> > index 1a3e2c05a8a5..664aefa6e896 100644
> > --- a/arch/x86/kernel/machine_kexec_64.c
> > +++ b/arch/x86/kernel/machine_kexec_64.c
> > @@ -28,6 +28,7 @@
> >  #include <asm/setup.h>
> >  #include <asm/set_memory.h>
> >  #include <asm/cpu.h>
> > +#include <asm/efi.h>
> >
> >  #ifdef CONFIG_ACPI
> >  /*
> > @@ -86,10 +87,12 @@ const struct kexec_file_ops * const kexec_file_loaders[] = {
> >  #endif
> >
> >  static int
> > -map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)
> > +map_efi_tables(struct x86_mapping_info *info, pgd_t *level4p)
> >  {
> >  #ifdef CONFIG_EFI
> >       unsigned long mstart, mend;
> > +     void *kaddr;
> > +     int ret;
> >
> >       if (!efi_enabled(EFI_BOOT))
> >               return 0;
> > @@ -105,6 +108,30 @@ map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)
> >       if (!mstart)
> >               return 0;
> >
> > +     ret = kernel_ident_mapping_init(info, level4p, mstart, mend);
> > +     if (ret)
> > +             return ret;
> > +
> > +     kaddr = memremap(mstart, mend - mstart, MEMREMAP_WB);
> > +     if (!kaddr) {
> > +             pr_err("Could not map UEFI system table\n");
> > +             return -ENOMEM;
> > +     }
> > +
> > +     mstart = efi_config_table;
>
> Yeah, about this, did you see efi_reuse_config() and the comment above
> it especially?
>
> Or is it that the EFI in that box wants the config table mapped 1:1 and
> accesses it during boot/kexec?

The call stack shows the page fault issue is before the 2nd kernel
extract, which is earlier than what efi_reuse_config() does for kernel
init.

Thanks,
Tao Liu
>
> In any case, this is all cloudy without a proper root cause.
>
> Also, I'd like for Ard to have a look at this too.
>
> Thx.
>
> --
> Regards/Gruss,
>     Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette
>
Baoquan He July 7, 2023, 8:41 a.m. UTC | #5
On 07/07/23 at 10:22am, Joerg Roedel wrote:
> On Fri, Jul 07, 2023 at 12:23:59PM +0800, Baoquan He wrote:
> > I am wondering why we don't detect the cpu type and return early inside
> > sev_enable() if it's Intel cpu.
> > 
> > We can't rely on CONFIG_AMD_MEM_ENCRYPT to decide if the code need be
> > executed or not because we usually enable them all in distros.
> 
> Looking at the code in head_64.S, by the time sev_enable() runs the SEV
> bit should already be set in sev_status. Maybe use that to detect
> whether SEV is enabled and bail out early?

Makes sense to me. Thanks.
Borislav Petkov July 7, 2023, 8:57 a.m. UTC | #6
On Fri, Jul 07, 2023 at 10:22:56AM +0200, Joerg Roedel wrote:
> On Fri, Jul 07, 2023 at 12:23:59PM +0800, Baoquan He wrote:
> > I am wondering why we don't detect the cpu type and return early inside
> > sev_enable() if it's Intel cpu.
> > 
> > We can't rely on CONFIG_AMD_MEM_ENCRYPT to decide if the code need be
> > executed or not because we usually enable them all in distros.
> 
> Looking at the code in head_64.S, by the time sev_enable() runs the SEV
> bit should already be set in sev_status. Maybe use that to detect
> whether SEV is enabled and bail out early?

There was something about getting the CPUID page on SNP *before*
actually calling CPUID but this is not the first time we had trouble in
this area. This needs to be done differently.

Michael?
Michael Roth July 7, 2023, 3:25 p.m. UTC | #7
On Fri, Jul 07, 2023 at 10:57:12AM +0200, Borislav Petkov wrote:
> On Fri, Jul 07, 2023 at 10:22:56AM +0200, Joerg Roedel wrote:
> > On Fri, Jul 07, 2023 at 12:23:59PM +0800, Baoquan He wrote:
> > > I am wondering why we don't detect the cpu type and return early inside
> > > sev_enable() if it's Intel cpu.
> > > 
> > > We can't rely on CONFIG_AMD_MEM_ENCRYPT to decide if the code need be
> > > executed or not because we usually enable them all in distros.

If the SETUP_CC_BLOB config table entry isn't present, then none of that
code will run. I think the patch here addresses the main issue: kexec
has handed us a SETUP_EFI setup_data entry, with a pointer to a valid
config table, but we don't map it before accessing it. I should have
done a better job of checking for this, since there has been similar
cases exposed by the SEV checks, e.g.:

  commit b57feed2cc2622ae14b2fa62f19e973e5e0a60cf
  Author: Michael Roth <michael.roth@amd.com>
  Date:   Tue Jul 5 21:53:15 2022 -0500

      x86/compressed/64: Add identity mappings for setup_data entries

but I don't think there's anything inherently wrong with accessing the
EFI config table in early boot. SEV is earliest user now, but something
else can come along. We certainly have early access to EFI system table
and other bootparams fields like ACPI RDSP in this neck of the woods.

So regardless of how we decide to handle sev_enable(), I think this patch
should be applied, rather than allowing this to lurk in the shadows until
it claims its next victim.

> > 
> > Looking at the code in head_64.S, by the time sev_enable() runs the SEV
> > bit should already be set in sev_status. Maybe use that to detect
> > whether SEV is enabled and bail out early?

sev_enabled() is actually what sets sev_status global, but for startup32
path, the SEV_ENABLED bit is artificially set earlier to force the C-bit
check in startup32_check_sev_cbit():

  fef81c8626: x86/boot/compressed/64: Check SEV encryption in the 32-bit boot-path

> 
> There was something about getting the CPUID page on SNP *before*
> actually calling CPUID but this is not the first time we had trouble in
> this area. This needs to be done differently.
> 
> Michael?

The main issue is we don't know when it is safe to access the SEV_STATUS
MSR until we've checked for the CPUID bit that advertises SEV
capability, otherwise we can generate a #GP on machines that don't
support it.

With SNP, the most reliable way to access this CPUID bit is via the SNP
CPUID table entry corresponding to 0x8000001F, rather than the emulated
values provided by hypervisor, like we do with SEV-ES. So that's how
things are implemented currently.

We *could* instead revert back to using the hypervisor-provided CPUID
values for 0x8000001F, as with SEV-ES, which would allow us to check
SEV_STATUS MSR *prior* to SNP CPUID table setup, instead of afterward...

What's awkward about that approach is that we'd effectively be bypassing
all the validation that the SNP firmware does on the 0x8000001F leaf. In
general this seems backwards, however:

  a) we still have the option of re-checking the 0x8000001F leaf once
     the SNP CPUID table is setup to see if the hypervisor presented
     something different to the guest than what we were expecting
  b) in this *specific* case, we only need to check for SEV CPUID bit
     so we know it is safe to check SEV_STATUS MSR. A malicious
     hypervisor can:

       i) trick us into causing a crash via the MSR access: but that's okay,
          security isn't compromised. hypervisor's fault, not ours.
      ii) trick us into thinking SEV is enabled so hypervisor can
          intercept/emulated the MSR access: but that's okay, because
          when SEV is enabled the SEV_STATUS MSR can't be intercepted,
          and if SEV isn't actually enabled, the guest wouldn't get past
          attestation.
     iii) trick us into thinking SEV isn't enabled, which should be fine
          if we leave sev_status unset.

So, yes, we can avoid checking for CC blob by relying on the above
details and reversing the ordering of CPUID table setup and initial
SEV_STATUS MSR checks. But we'd want to similarly audit any changes to these
paths so that an eventual case doesn't pop up where a malicious hypervisor
can cause a certain check/configuration to be bypassed by toying with CPUID
values at this stage of boot, which is why we've so far tried to maintain
the current handling.

It would be unfortunate if we finally abandoned this path because of the
issue being hit here though. I think the patch posted here is the proper
resolution to the issue being hit, and I'm hoping at this point we've
identified all the similar cases where EFI/setup_data-related structures
were missing explicit mappings. But if we still think it's too much of a
liability to access the EFI config table outside of SEV-enabled guests,
then I can work on re-implementing things based on the above logic.

-Mike

> 
> -- 
> Regards/Gruss,
>     Boris.
> 
> https://people.kernel.org/tglx/notes-about-netiquette
Tom Lendacky July 7, 2023, 3:46 p.m. UTC | #8
On 7/7/23 03:22, Joerg Roedel wrote:
> On Fri, Jul 07, 2023 at 12:23:59PM +0800, Baoquan He wrote:
>> I am wondering why we don't detect the cpu type and return early inside
>> sev_enable() if it's Intel cpu.
>>
>> We can't rely on CONFIG_AMD_MEM_ENCRYPT to decide if the code need be
>> executed or not because we usually enable them all in distros.
> 
> Looking at the code in head_64.S, by the time sev_enable() runs the SEV
> bit should already be set in sev_status. Maybe use that to detect
> whether SEV is enabled and bail out early?

I think that is only if you enter on the 32-bit path. If invoked from EFI 
in 64-bit, efi64_stub_entry(), then I don't believe that sev_status will 
be set yet.

Before it can be determined if it is a non-AMD platform, the EFI config 
table has to be searched in order to find the CC blob table. Once that is 
found (or not found), then the checks for the platform are performed and 
sev_enable() will exit if not on an AMD platform.

I think it was an oversight to not add support for identity mapping the 
EFI config tables for kexec. Any features in the future that need to 
search for an EFI config table early like this will need the same.

Thanks,
Tom

> 
> Regards,
>
Borislav Petkov July 7, 2023, 5:12 p.m. UTC | #9
On Fri, Jul 07, 2023 at 10:25:15AM -0500, Michael Roth wrote:
> ...
> It would be unfortunate if we finally abandoned this path because of the
> issue being hit here though. I think the patch posted here is the proper
> resolution to the issue being hit, and I'm hoping at this point we've
> identified all the similar cases where EFI/setup_data-related structures
> were missing explicit mappings. But if we still think it's too much of a
> liability to access the EFI config table outside of SEV-enabled guests,
> then I can work on re-implementing things based on the above logic.

Replying here to Tom's note too...

So, I like the idea of rechecking CPUID. Yes, let's do the sev_status
check. As a result, we either fail the guest - no problem - or we boot
and we recheck. Thus, we don't run AMD code on !AMD machines, if the HV
is not a lying bastard.

Now, if we've gotten a valid setup_data SETUP_EFI entry with a valid
pointer to an EFI config table, then that should happen in the generic
path - initialize_identity_maps(), for example - like you've done in
b57feed2cc26 - not in the kexec code because kexec *happens* to need it.

We want to access the EFI config table? Sure, by all means, but make
that generic for all code.

Thx.
Borislav Petkov July 13, 2023, 10:04 a.m. UTC | #10
On Thu, Jun 01, 2023 at 03:20:44PM +0800, Tao Liu wrote:
>  arch/x86/kernel/machine_kexec_64.c | 35 ++++++++++++++++++++++++++----
>  1 file changed, 31 insertions(+), 4 deletions(-)

Ok, pls try this totally untested thing.

Thx.

---
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index 09dc8c187b3c..fefe27b2af85 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -404,13 +404,20 @@ void sev_enable(struct boot_params *bp)
 	if (bp)
 		bp->cc_blob_address = 0;
 
+	/* Check for the SME/SEV support leaf */
+	eax = 0x80000000;
+	ecx = 0;
+	native_cpuid(&eax, &ebx, &ecx, &edx);
+	if (eax < 0x8000001f)
+		return;
+
 	/*
 	 * Setup/preliminary detection of SNP. This will be sanity-checked
 	 * against CPUID/MSR values later.
 	 */
 	snp = snp_init(bp);
 
-	/* Check for the SME/SEV support leaf */
+	/* Recheck the SME/SEV support leaf */
 	eax = 0x80000000;
 	ecx = 0;
 	native_cpuid(&eax, &ebx, &ecx, &edx);
Ard Biesheuvel July 13, 2023, 10:17 a.m. UTC | #11
On Fri, 7 Jul 2023 at 19:12, Borislav Petkov <bp@alien8.de> wrote:
>
> On Fri, Jul 07, 2023 at 10:25:15AM -0500, Michael Roth wrote:
> > ...
> > It would be unfortunate if we finally abandoned this path because of the
> > issue being hit here though. I think the patch posted here is the proper
> > resolution to the issue being hit, and I'm hoping at this point we've
> > identified all the similar cases where EFI/setup_data-related structures
> > were missing explicit mappings. But if we still think it's too much of a
> > liability to access the EFI config table outside of SEV-enabled guests,
> > then I can work on re-implementing things based on the above logic.
>
> Replying here to Tom's note too...
>
> So, I like the idea of rechecking CPUID. Yes, let's do the sev_status
> check. As a result, we either fail the guest - no problem - or we boot
> and we recheck. Thus, we don't run AMD code on !AMD machines, if the HV
> is not a lying bastard.
>
> Now, if we've gotten a valid setup_data SETUP_EFI entry with a valid
> pointer to an EFI config table, then that should happen in the generic
> path - initialize_identity_maps(), for example - like you've done in
> b57feed2cc26 - not in the kexec code because kexec *happens* to need it.
>
> We want to access the EFI config table? Sure, by all means, but make
> that generic for all code.
>

OK, so in summary, what seems to be happening here is that the SEV
init code in the decompressor looks for the cc blob table before the
on-demand mapping code is up, which normally ensures that any RAM
address is accessible even if it hasn't been mapped explicitly.

This is why the fix happens to work: the code only maps the array of
(guid, phys_addr) tuples that describes the list of configuration
tables that have been provided by the firmware. The actual
configuration tables themselves could be anywhere in physical memory,
and without prior knowledge of a particular GUID value, there is no
way to know the size of the table, and so they cannot be mapped
upfront like this. However, the cc blob table does not exist on this
machine, and so whether the EFI config tables themselves are mapped or
not is irrelevant.

But it does mean the fix is incomplete, and certainly does not belong
in generic kexec code. If anything, we should be fixing the
decompressor code to defer the cc blob table check until after the
demand mapping code is up.

If this is problematic, we might instead disable SEV for kexec, and
rely on the fact that SEV firmware enters with a complete 1:1 map (as
we seem to be doing currently). If kexec for SEV is needed at some
point, we can re-enable it by having it provide a mapping for the
config table array and the cc blob table explicitly.
Tao Liu July 17, 2023, 1:53 p.m. UTC | #12
Hi Borislav,

On Thu, Jul 13, 2023 at 6:05 PM Borislav Petkov <bp@alien8.de> wrote:
>
> On Thu, Jun 01, 2023 at 03:20:44PM +0800, Tao Liu wrote:
> >  arch/x86/kernel/machine_kexec_64.c | 35 ++++++++++++++++++++++++++----
> >  1 file changed, 31 insertions(+), 4 deletions(-)
>
> Ok, pls try this totally untested thing.
>
> Thx.
>
> ---
> diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
> index 09dc8c187b3c..fefe27b2af85 100644
> --- a/arch/x86/boot/compressed/sev.c
> +++ b/arch/x86/boot/compressed/sev.c
> @@ -404,13 +404,20 @@ void sev_enable(struct boot_params *bp)
>         if (bp)
>                 bp->cc_blob_address = 0;
>
> +       /* Check for the SME/SEV support leaf */
> +       eax = 0x80000000;
> +       ecx = 0;
> +       native_cpuid(&eax, &ebx, &ecx, &edx);
> +       if (eax < 0x8000001f)
> +               return;
> +
>         /*
>          * Setup/preliminary detection of SNP. This will be sanity-checked
>          * against CPUID/MSR values later.
>          */
>         snp = snp_init(bp);
>
> -       /* Check for the SME/SEV support leaf */
> +       /* Recheck the SME/SEV support leaf */
>         eax = 0x80000000;
>         ecx = 0;
>         native_cpuid(&eax, &ebx, &ecx, &edx);
>
Thanks a lot for the patch above! Sorry for the late response. I have
compiled and tested it locally against 6.5.0-rc1, though it can pass
the early stage of kexec kernel bootup , however the kernel will panic
occasionally later. The test machine is the one with Intel Atom
x6425RE cpu which encountered the page fault issue of missing efi
config table.

...snip...
[   21.360763]  nvme0n1: p1 p2 p3
[   21.364207] igc 0000:03:00.0: PTM enabled, 4ns granularity
[   21.421097] pps pps1: new PPS source ptp1
[   21.425396] igc 0000:03:00.0 (unnamed net_device) (uninitialized): PHC added
[   21.457005] igc 0000:03:00.0: 4.000 Gb/s available PCIe bandwidth
(5.0 GT/s PCIe x1 link)
[   21.465210] igc 0000:03:00.0 eth1: MAC: ...snip...
[   21.473424] igc 0000:03:00.0 enp3s0: renamed from eth1
[   21.479446] BUG: kernel NULL pointer dereference, address: 0000000000000008
[   21.486405] #PF: supervisor read access in kernel mode
[   21.491519] mmc1: Failed to initialize a non-removable card
[   21.491538] #PF: error_code(0x0000) - not-present page
[   21.502229] PGD 0 P4D 0
[   21.504773] Oops: 0000 [#1] PREEMPT SMP NOPTI
[   21.509133] CPU: 3 PID: 402 Comm: systemd-udevd Not tainted 6.5.0-rc1+ #1
[   21.515905] Hardware name: ...snip...
[   21.522851] RIP: 0010:kernfs_dop_revalidate+0x2b/0x120
[   21.527995] Code: 1f 44 00 00 83 e6 40 0f 85 07 01 00 00 41 55 41
54 55 53 48 8b 47 30 48 89 fb 48 85 c0 0f 84 a2 00 00 00 48 8b a87
[   21.546680] RSP: 0018:ffffb656405abbf0 EFLAGS: 00010282
[   21.551898] RAX: ffff921c761a5d40 RBX: ffff921c76164000 RCX: 0000000000000001
[   21.559018] RDX: ffff921c76164038 RSI: 0000000000000000 RDI: ffff921c76164000
[   21.566137] RBP: 0000000000000000 R08: 0000000000000006 R09: ffff921c02272d80
[   21.573254] R10: ffff8d909b919a89 R11: 0000000000000000 R12: ffff921c02272d80
[   21.580367] R13: ffffb656405abca0 R14: 0000000000000000 R15: 0000000000000000
[   21.587489] FS:  00007f75796bf540(0000) GS:ffff922360580000(0000)
knlGS:0000000000000000
[   21.595557] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   21.601291] CR2: 0000000000000008 CR3: 000000011615c000 CR4: 0000000000350ee0
[   21.608405] Call Trace:
[   21.610864]  <TASK>
[   21.612983]  ? __die+0x1f/0x70
[   21.616053]  ? page_fault_oops+0x75/0x170
[   21.620062]  ? xa_load+0x87/0xe0
[   21.623299]  ? exc_page_fault+0x67/0x140
[   21.627225]  ? asm_exc_page_fault+0x22/0x30
[   21.631415]  ? kernfs_dop_revalidate+0x2b/0x120
[   21.635962]  lookup_fast+0x75/0xf0
[   21.639367]  walk_component+0x1f/0x150
[   21.643125]  path_lookupat+0x6a/0x1a0
[   21.646789]  filename_lookup+0xd0/0x1d0
[   21.650630]  ? __check_object_size.part.0+0x5e/0x130
[   21.655595]  ? __check_object_size.part.0+0x5e/0x130
[   21.660552]  vfs_statx+0x8c/0x160
[   21.663880]  vfs_fstatat+0x51/0x70
[   21.667286]  __do_sys_newfstatat+0x26/0x60
[   21.671388]  ? syscall_trace_enter.isra.0+0x9a/0x1a0
[   21.676349]  do_syscall_64+0x59/0x90
[   21.679926]  ? exit_to_user_mode_prepare+0xbb/0xd0
[   21.684719]  ? syscall_exit_to_user_mode+0x12/0x30
[   21.689507]  ? do_syscall_64+0x68/0x90
[   21.693262]  entry_SYSCALL_64_after_hwframe+0x6e/0xd8
[   21.698310] RIP: 0033:0x7f757a2ce12e
[   21.701893] Code: 48 89 f2 b9 00 01 00 00 48 89 fe bf 9c ff ff ff
e9 07 00 00 00 0f 1f 80 00 00 00 00 f3 0f 1e fa 41 89 ca b8 06 019
[   21.720570] RSP: 002b:00007ffc38e4ae98 EFLAGS: 00000202 ORIG_RAX:
0000000000000106
[   21.728123] RAX: ffffffffffffffda RBX: 00007ffc38e4b250 RCX: 00007f757a2ce12e
[   21.735242] RDX: 00007ffc38e4aef0 RSI: 00005566acdd2c20 RDI: 00000000ffffff9c
[   21.742363] RBP: 00007ffc38e4afc0 R08: 0000000000000000 R09: 0000000000000000
[   21.749484] R10: 0000000000000100 R11: 0000000000000202 R12: 0000000000000000
[   21.756600] R13: 00007ffc38e4b038 R14: 00005566acdd2c20 R15: 00007ffc38e4b250
[   21.763721]  </TASK>
[   21.765920] Modules linked in: i2c_algo_bit drm_buddy ttm intel_gtt
drm_display_helper drm_kms_helper nvme igc drm sdhci_pci crct10e
[   21.799743] CR2: 0000000000000008
[   21.803061] ---[ end trace 0000000000000000 ]---
[   21.807679] RIP: 0010:kernfs_dop_revalidate+0x2b/0x120
[   21.812819] Code: 1f 44 00 00 83 e6 40 0f 85 07 01 00 00 41 55 41
54 55 53 48 8b 47 30 48 89 fb 48 85 c0 0f 84 a2 00 00 00 48 8b a87
[   21.831505] RSP: 0018:ffffb656405abbf0 EFLAGS: 00010282
[   21.836724] RAX: ffff921c761a5d40 RBX: ffff921c76164000 RCX: 0000000000000001
[   21.843844] RDX: ffff921c76164038 RSI: 0000000000000000 RDI: ffff921c76164000
[   21.850963] RBP: 0000000000000000 R08: 0000000000000006 R09: ffff921c02272d80
[   21.858082] R10: ffff8d909b919a89 R11: 0000000000000000 R12: ffff921c02272d80
[   21.865201] R13: ffffb656405abca0 R14: 0000000000000000 R15: 0000000000000000
[   21.872321] FS:  00007f75796bf540(0000) GS:ffff922360580000(0000)
knlGS:0000000000000000
[   21.880390] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   21.886123] CR2: 0000000000000008 CR3: 000000011615c000 CR4: 0000000000350ee0
[   21.893233] Kernel panic - not syncing: Fatal exception
[   21.898486] Kernel Offset: 0x19e00000 from 0xffffffff81000000
(relocation range: 0xffffffff80000000-0xffffffffbfffffff)
[   21.909231] ---[ end Kernel panic - not syncing: Fatal exception ]---

The stack trace may not be the same all the time, I didn't dive deep
into the root cause, but it looks to me the patch will cause an
unknown issue. Also I tested the patch on kernel-5.14.0-318.el9, it
will encounter a similar issue, because the timing for every panic is
nearly the same.

...snip...
[   19.521883] nvme 0000:04:00.0: platform quirk: setting simple suspend
[   19.522030] nvme nvme0: pci function 0000:04:00.0
[   19.526540] pps pps0: new PPS source ptp0
[   19.526641] igc 0000:02:00.0 (unnamed net_device) (uninitialized): PHC added
[   19.550702] usbcore: registered new interface driver cdc_ncm
[   19.551424] igc 0000:02:00.0: 4.000 Gb/s available PCIe bandwidth
(5.0 GT/s PCIe x1 link)
[   19.551429] igc 0000:02:00.0 eth0: MAC: ...snip...
[   19.551660] igc 0000:03:00.0: PTM enabled, 4ns granularity
[   19.571901] BUG: unable to handle page fault for address: ffff90e3b676ac38
[   19.571906] #PF: supervisor read access in kernel mode
[   19.571907] #PF: error_code(0x0000) - not-present page
[   19.571910] PGD 3f401067 P4D 3f401067 PUD 3f40c067 PMD 176711063
[   19.571915] BAD
[   19.571917] Oops: 0000 [#1] PREEMPT SMP NOPTI
[   19.571920] CPU: 1 PID: 447 Comm: gzip Not tainted 5.14.0+ #2
[   19.571923] Hardware name: ...snip...
[   19.571924] RIP: 0010:vma_interval_tree_remove+0x16a/0x2c0
[   19.571933] Code: e0 fc 48 83 fa 03 76 43 48 8b 48 10 48 8b 70 08
48 8b 50 b0 4c 8b 40 40 48 2b 50 a8 48 c1 ea 0c 4a 8d 54 02 ff 489
[   19.571935] RSP: 0018:ffffb05d40a2bd20 EFLAGS: 00010282
[   19.571938] RAX: ffff90e3550f2d08 RBX: ffff90e34bcc46e0 RCX: ffff90e3b676ac20
[   19.571940] RDX: 0000000000000027 RSI: ffff90e355293df0 RDI: 0000000000000000
[   19.571941] RBP: ffff90e35528a3a0 R08: 0000000000000000 R09: 0000000000000000
[   19.571943] R10: ffffb05d40a2bd00 R11: 00007fffec14c000 R12: ffff90e35528a3f8
[   19.571944] R13: ffffb05d40a2bda0 R14: 0000000000000000 R15: ffff90e355293000
[   19.571946] FS:  0000000000000000(0000) GS:ffff90eaa0480000(0000)
knlGS:0000000000000000
[   19.571949] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   19.571951] CR2: ffff90e3b6711b50 CR3: 0000000113ed8000 CR4: 0000000000350ee0
[   19.571953] Call Trace:
[   19.571956]  <TASK>
[   19.571959]  unlink_file_vma+0x41/0x60
[   19.571963]  free_pgtables+0xa0/0xe0
[   19.571966]  exit_mmap+0xb3/0x1e0
[   19.571972]  mmput+0x58/0x140
[   19.571976]  exit_mm+0xb2/0x110
[   19.571980]  do_exit+0x210/0x4c0
[   19.571982]  do_group_exit+0x2d/0x90
[   19.571985]  __x64_sys_exit_group+0x14/0x20
[   19.571988]  do_syscall_64+0x59/0x90
[   19.571993]  ? exc_page_fault+0x64/0x140
[   19.571996]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
[   19.572000] RIP: 0033:0x7fac2abd3a4d
[   19.572003] Code: Unable to access opcode bytes at RIP 0x7fac2abd3a23.
[   19.572004] RSP: 002b:00007fffec0e5808 EFLAGS: 00000246 ORIG_RAX:
00000000000000e7
[   19.572006] RAX: ffffffffffffffda RBX: 00007fac2acb19e0 RCX: 00007fac2abd3a4d
[   19.572008] RDX: 00000000000000e7 RSI: ffffffffffffff80 RDI: 0000000000000000
[   19.572009] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000020
[   19.572011] R10: 00007fffec0e56b0 R11: 0000000000000246 R12: 00007fac2acb19e0
[   19.572012] R13: 00007fac2acb6f00 R14: 0000000000000001 R15: 00007fac2acb6ee8
[   19.572015]  </TASK>
[   19.572016] Modules linked in: cdc_ncm nvme cqhci cdc_ether usbnet
nvme_core sdhci uas nvme_common ghash_clmulni_intel usb_storage e
[   19.572040] CR2: ffff90e3b676ac38
[   19.572043] ---[ end trace 4abd4ff8e3f54f71 ]---
[   19.572044] RIP: 0010:vma_interval_tree_remove+0x16a/0x2c0
[   19.572048] Code: e0 fc 48 83 fa 03 76 43 48 8b 48 10 48 8b 70 08
48 8b 50 b0 4c 8b 40 40 48 2b 50 a8 48 c1 ea 0c 4a 8d 54 02 ff 489
[   19.572050] RSP: 0018:ffffb05d40a2bd20 EFLAGS: 00010282
[   19.572052] RAX: ffff90e3550f2d08 RBX: ffff90e34bcc46e0 RCX: ffff90e3b676ac20
[   19.572053] RDX: 0000000000000027 RSI: ffff90e355293df0 RDI: 0000000000000000
[   19.572054] RBP: ffff90e35528a3a0 R08: 0000000000000000 R09: 0000000000000000
[   19.572056] R10: ffffb05d40a2bd00 R11: 00007fffec14c000 R12: ffff90e35528a3f8
[   19.572057] R13: ffffb05d40a2bda0 R14: 0000000000000000 R15: ffff90e355293000
[   19.572058] FS:  0000000000000000(0000) GS:ffff90eaa0480000(0000)
knlGS:0000000000000000
[   19.572060] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   19.572062] CR2: ffff90e3b6711b50 CR3: 0000000113ed8000 CR4: 0000000000350ee0
[   19.572064] Kernel panic - not syncing: Fatal exception
[   19.572840] BUG: unable to handle page fault for address: ffff90e3b6767b48
[   19.971622] #PF: supervisor read access in kernel mode
[   19.971624] #PF: error_code(0x0000) - not-present page
[   19.971626] PGD 3f401067 P4D 3f401067 PUD 3f40c067 PMD 176711063
[   19.971631] BAD
[   19.971632] Oops: 0000 [#2] PREEMPT SMP NOPTI
[   19.971634] CPU: 0 PID: 445 Comm: systemd-vconsol Tainted: G      D
          -------  ---  5.14.0+ #2
[   19.971638] Hardware name: ...snip...
[   19.971639] RIP: 0010:__rb_insert_augmented+0x77/0x1b0
[   19.971645] Code: 89 63 10 48 89 5f 08 4d 85 e4 74 0b 48 89 d8 48
83 c8 01 49 89 04 24 48 8b 03 48 89 07 48 89 3b 48 83 f8 03 76 5d8
[   19.971648] RSP: 0018:ffffb05d408e7c50 EFLAGS: 00010282
[   19.971650] RAX: ffff90e3b6767b38 RBX: ffff90e3550f1798 RCX: 0000000000000019
[   19.971652] RDX: ffffffffad5180f0 RSI: ffff90e35527eb38 RDI: ffff90e35527eb38
[   19.971653] RBP: ffff90e35527eb38 R08: ffff90e35527eae0 R09: 00000000000000e8
[   19.971655] R10: 0000000008000075 R11: 0000000000000000 R12: 0000000000000000
[   19.971656] R13: ffff90e34bcc6310 R14: ffff90e35527e750 R15: ffff90e35527eae0
[   19.971658] FS:  00007fc4586d1380(0000) GS:ffff90eaa0400000(0000)
knlGS:0000000000000000
[   19.971660] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   19.971662] CR2: ffff90e3b6711b38 CR3: 0000000115054000 CR4: 0000000000350ef0
[   19.971664] Call Trace:
[   19.971666]  <TASK>
[   19.971667]  ? vmacache_find+0xb0/0xb0
[   19.971672]  dup_mmap+0x23d/0x520
[   19.971677]  dup_mm+0x60/0x100
[   19.971680]  copy_process+0xc09/0x1680
[   19.971684]  kernel_clone+0x98/0x3f0
[   19.971688]  __do_sys_clone+0x72/0xa0
[   19.971692]  do_syscall_64+0x59/0x90
[   19.971695]  ? syscall_exit_to_user_mode+0x12/0x30
[   19.971698]  ? do_syscall_64+0x68/0x90
[   19.971701]  ? syscall_exit_to_user_mode+0x12/0x30
[   19.971703]  ? do_syscall_64+0x68/0x90
[   19.971706]  ? do_syscall_64+0x68/0x90
[   19.971708]  ? exc_page_fault+0x64/0x140
[   19.971710]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
[   19.971714] RIP: 0033:0x7fc45930d9d7
[   19.971716] Code: 00 00 00 f3 0f 1e fa 64 48 8b 04 25 10 00 00 00
45 31 c0 31 d2 31 f6 bf 11 00 20 01 4c 8d 90 d0 02 00 00 b8 38 000
[   19.971718] RSP: 002b:00007ffca4fb03d8 EFLAGS: 00000246 ORIG_RAX:
0000000000000038
[   19.971721] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007fc45930d9d7
[   19.971723] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000001200011
[   19.971724] RBP: 0000000000000000 R08: 0000000000000000 R09: fffffffffffffe88
[   19.971725] R10: 00007fc4586d1650 R11: 0000000000000246 R12: 0000000000000003
[   19.971727] R13: 00007ffca4fb0590 R14: 0000000000000040 R15: 00007ffca4fb0510
[   19.971730]  </TASK>
[   19.971731] Modules linked in: cdc_ncm nvme cqhci cdc_ether usbnet
nvme_core sdhci uas nvme_common ghash_clmulni_intel usb_storage e
[   19.971749] CR2: ffff90e3b6767b48
[   19.971750] ---[ end trace 4abd4ff8e3f54f72 ]---
[   19.971751] BUG: unable to handle page fault for address: ffff90e3b674eef0
[   19.971752] RIP: 0010:vma_interval_tree_remove+0x16a/0x2c0
[   19.971753] #PF: supervisor read access in kernel mode
[   19.971755] #PF: error_code(0x0000) - not-present page
[   19.971756] Code: e0 fc 48 83 fa 03 76 43 48 8b 48 10 48 8b 70 08
48 8b 50 b0 4c 8b 40 40 48 2b 50 a8 48 c1 ea 0c 4a 8d 54 02 ff 489
[   19.971757] PGD 3f401067 P4D 3f401067
[   19.971758] RSP: 0018:ffffb05d40a2bd20 EFLAGS: 00010282
[   19.971759] PUD 3f40c067
[   19.971760]
[   19.971761] PMD 176711063
[   19.971761] RAX: ffff90e3550f2d08 RBX: ffff90e34bcc46e0 RCX: ffff90e3b676ac20
[   19.971763] BAD
[   19.971763] RDX: 0000000000000027 RSI: ffff90e355293df0 RDI: 0000000000000000
[   19.971764] RBP: ffff90e35528a3a0 R08: 0000000000000000 R09: 0000000000000000
[   19.971764] Oops: 0000 [#3] PREEMPT SMP NOPTI
[   19.971766] R10: ffffb05d40a2bd00 R11: 00007fffec14c000 R12: ffff90e35528a3f8
[   19.971768] R13: ffffb05d40a2bda0 R14: 0000000000000000 R15: ffff90e355293000
[   19.971767] CPU: 2 PID: 409 Comm: systemd-udevd Tainted: G      D
        -------  ---  5.14.0+ #2
[   19.971769] FS:  00007fc4586d1380(0000) GS:ffff90eaa0400000(0000)
knlGS:0000000000000000
[   19.971770] Hardware name: ...snip...
[   19.971772] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   19.971774] CR2: ffff90e3b6711b38 CR3: 0000000115054000 CR4: 0000000000350ef0
[   19.971772] RIP: 0010:inode_has_perm+0x33/0x50
[   19.971778] Code: 89 c9 f6 46 0d 02 75 37 48 63 05 08 99 04 01 48
8b 57 78 8b 7c 02 04 48 8b 46 38 48 85 c0 74 0a 48 63 15 f8 98 041
[   19.971780] RSP: 0018:ffffb05d4086fd08 EFLAGS: 00010282
[   19.971782] RAX: ffff90e3b674eed0 RBX: ffff90e3b674eed0 RCX: ffffb05d4086fd10
[   19.971784] RDX: 0000000000000010 RSI: ffff90e34be230b0 RDI: 0000000000000001
[   19.971786] RBP: ffffb05d4086fd58 R08: 0000000000000010 R09: ffffb05d4086fd10
[   19.971787] R10: 000000000000000c R11: 0000000000000000 R12: ffff90e34be230b0
[   19.971789] R13: ffff90e34357b0c0 R14: ffff90e354e47cc0 R15: 0000000000000000
[   19.971790] FS:  00007fd768238540(0000) GS:ffff90eaa0500000(0000)
knlGS:0000000000000000
[   19.971793] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   19.971795] CR2: ffff90e3b6711a70 CR3: 0000000115064000 CR4: 0000000000350ee0
[   19.971797] Call Trace:
[   19.971798]  <TASK>
[   19.971799]  selinux_inode_getattr+0x99/0xc0
[   19.971805]  security_inode_getattr+0x37/0x60
[   19.971808]  vfs_statx+0x9a/0x160
[   19.971813]  vfs_fstatat+0x51/0x70
[   19.971816]  __do_sys_newfstatat+0x26/0x60
[   19.971819]  ? __seccomp_filter+0x45/0x360
[   19.971823]  ? syscall_trace_enter.isra.0+0x9e/0x1d0
[   19.971828]  do_syscall_64+0x59/0x90
[   19.971831]  ? syscall_exit_to_user_mode+0x12/0x30
[   19.971833]  ? do_syscall_64+0x68/0x90
[   19.971836]  ? exit_to_user_mode_loop+0xb9/0x110
[   19.971839]  ? exit_to_user_mode_prepare+0xac/0xf0
[   19.971842]  ? syscall_exit_to_user_mode+0x12/0x30
[   19.971844]  ? do_syscall_64+0x68/0x90
[   19.971847]  ? do_syscall_64+0x68/0x90
[   19.971849]  ? sysvec_apic_timer_interrupt+0x3a/0x90
[   19.971852]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
[   19.971855] RIP: 0033:0x7fd768e4712e
[   19.971857] Code: 48 89 f2 b9 00 01 00 00 48 89 fe bf 9c ff ff ff
e9 07 00 00 00 0f 1f 80 00 00 00 00 f3 0f 1e fa 41 89 ca b8 06 019
[   19.971859] RSP: 002b:00007ffdb9f648a8 EFLAGS: 00000206 ORIG_RAX:
0000000000000106
[   19.971862] RAX: ffffffffffffffda RBX: 000000000000000c RCX: 00007fd768e4712e
[   19.971863] RDX: 00007ffdb9f649c0 RSI: 00007fd768ec0f35 RDI: 000000000000000c
[   19.971865] RBP: 00007ffdb9f64a90 R08: 000000000000ffff R09: 0000000000001001
[   19.971866] R10: 0000000000001000 R11: 0000000000000206 R12: 0000000000000007
[   19.971868] R13: 0000000000000000 R14: 0000000000000006 R15: 0000557c0e1ef200
[   19.971871]  </TASK>
[   19.971871] Modules linked in: cdc_ncm nvme cqhci cdc_ether usbnet
nvme_core sdhci uas nvme_common ghash_clmulni_intel usb_storage e
[   19.971887] CR2: ffff90e3b674eef0
[   19.971889] ---[ end trace 4abd4ff8e3f54f73 ]---
[   19.971889] BUG: unable to handle page fault for address: ffff90e3b677f400
[   19.971892] #PF: supervisor read access in kernel mode
[   19.971890] RIP: 0010:vma_interval_tree_remove+0x16a/0x2c0
[   19.971893] #PF: error_code(0x0000) - not-present page
[   19.971895] PGD 3f401067
[   19.971895] Code: e0 fc 48 83 fa 03 76 43 48 8b 48 10 48 8b 70 08
48 8b 50 b0 4c 8b 40 40 48 2b 50 a8 48 c1 ea 0c 4a 8d 54 02 ff 489
[   19.971897] P4D 3f401067
[   19.971897] RSP: 0018:ffffb05d40a2bd20 EFLAGS: 00010282
[   19.971898] PUD 3f40c067 PMD 176711063
[   19.971899]
[   19.971900] RAX: ffff90e3550f2d08 RBX: ffff90e34bcc46e0 RCX: ffff90e3b676ac20
[   19.971901] BAD
[   19.971902] RDX: 0000000000000027 RSI: ffff90e355293df0 RDI: 0000000000000000
[   19.971902] Oops: 0000 [#4] PREEMPT SMP NOPTI
[   19.971904] RBP: ffff90e35528a3a0 R08: 0000000000000000 R09: 0000000000000000
[   19.971905] R10: ffffb05d40a2bd00 R11: 00007fffec14c000 R12: ffff90e35528a3f8
[   19.971905] CPU: 3 PID: 37 Comm: ksoftirqd/3 Tainted: G      D
     -------  ---  5.14.0+ #2
[   19.971906] R13: ffffb05d40a2bda0 R14: 0000000000000000 R15: ffff90e355293000
[   19.971908] Hardware name: ...snip...
[   19.971908] FS:  00007fd768238540(0000) GS:ffff90eaa0500000(0000)
knlGS:0000000000000000
[   19.971911] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   19.971912] CR2: ffff90e3b6711a70 CR3: 0000000115064000 CR4: 0000000000350ee0
[   19.971910] RIP: 0010:rcu_segcblist_accelerate+0xed/0x130
[   19.971916] Code: 89 48 18 48 89 50 38 b8 01 00 00 00 c3 cc cc cc
cc 31 c0 c3 cc cc cc cc be 01 00 00 00 eb ae 0f 0b e9 2f ff ff ff4
[   19.971918] RSP: 0018:ffffb05d40233e38 EFLAGS: 00010097
[   19.971921] RAX: ffff90eaa05b1d58 RBX: ffff90eaa05b1cc0 RCX: ffff90e3b677f400
[   19.971923] RDX: 00000000000002c4 RSI: 00000000000002c4 RDI: ffff90eaa05b1d58
[   19.971924] RBP: ffffffffaf1e97c0 R08: 0000000000000002 R09: 000000004406f74b
[   19.971926] R10: 00000000ad580200 R11: ffffffffaee060c0 R12: 00000000000002c4
[   19.971927] R13: ffff90eaa05b1d58 R14: 0000000000000246 R15: 0000000000000008
[   19.971929] FS:  0000000000000000(0000) GS:ffff90eaa0580000(0000)
knlGS:0000000000000000
[   19.971932] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   19.971933] CR2: ffff90e3b6711bf8 CR3: 0000000102e74000 CR4: 0000000000350ee0
[   19.971935] Call Trace:
[   19.971937]  <TASK>
[   19.971937]  rcu_accelerate_cbs+0x56/0x80
[   19.971942]  rcu_core+0x319/0x460
[   19.971945]  ? sched_clock_cpu+0x9/0xb0
[   19.971949]  __do_softirq+0xc7/0x2b3
[   19.971954]  ? find_next_bit+0x10/0x10
[   19.971956]  run_ksoftirqd+0x1e/0x30
[   19.971959]  smpboot_thread_fn+0xd5/0x1d0
[   19.971962]  kthread+0xd7/0x100
[   19.971965]  ? kthread_complete_and_exit+0x20/0x20
[   19.971968]  ret_from_fork+0x1f/0x30
[   19.971973]  </TASK>
[   19.971974] Modules linked in: cdc_ncm nvme cqhci cdc_ether usbnet
nvme_core sdhci uas nvme_common ghash_clmulni_intel usb_storage e
[   19.971990] CR2: ffff90e3b677f400
[   19.971991] ---[ end trace 4abd4ff8e3f54f74 ]---
[   19.971992] RIP: 0010:vma_interval_tree_remove+0x16a/0x2c0
[   19.971996] Code: e0 fc 48 83 fa 03 76 43 48 8b 48 10 48 8b 70 08
48 8b 50 b0 4c 8b 40 40 48 2b 50 a8 48 c1 ea 0c 4a 8d 54 02 ff 489
[   19.971998] RSP: 0018:ffffb05d40a2bd20 EFLAGS: 00010282
[   19.972000] RAX: ffff90e3550f2d08 RBX: ffff90e34bcc46e0 RCX: ffff90e3b676ac20
[   19.972001] RDX: 0000000000000027 RSI: ffff90e355293df0 RDI: 0000000000000000
[   19.972003] RBP: ffff90e35528a3a0 R08: 0000000000000000 R09: 0000000000000000
[   19.972004] R10: ffffb05d40a2bd00 R11: 00007fffec14c000 R12: ffff90e35528a3f8
[   19.972005] R13: ffffb05d40a2bda0 R14: 0000000000000000 R15: ffff90e355293000
[   19.972007] FS:  0000000000000000(0000) GS:ffff90eaa0580000(0000)
knlGS:0000000000000000
[   19.972009] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   19.972010] CR2: ffff90e3b6711bf8 CR3: 0000000102e74000 CR4: 0000000000350ee0
[   22.110997] Shutting down cpus with NMI
[   22.114849] Kernel Offset: 0x2c200000 from 0xffffffff81000000
(relocation range: 0xffffffff80000000-0xffffffffbfffffff)
[   22.125603] ---[ end Kernel panic - not syncing: Fatal exception ]---

Thanks,
Tao Liu


>
> --
> Regards/Gruss,
>     Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette
>
Borislav Petkov July 17, 2023, 2:14 p.m. UTC | #13
On Mon, Jul 17, 2023 at 09:53:06PM +0800, Tao Liu wrote:
> ...snip...
> [   21.360763]  nvme0n1: p1 p2 p3
> [   21.364207] igc 0000:03:00.0: PTM enabled, 4ns granularity
> [   21.421097] pps pps1: new PPS source ptp1
> [   21.425396] igc 0000:03:00.0 (unnamed net_device) (uninitialized): PHC added
> [   21.457005] igc 0000:03:00.0: 4.000 Gb/s available PCIe bandwidth
> (5.0 GT/s PCIe x1 link)
> [   21.465210] igc 0000:03:00.0 eth1: MAC: ...snip...
> [   21.473424] igc 0000:03:00.0 enp3s0: renamed from eth1
> [   21.479446] BUG: kernel NULL pointer dereference, address: 0000000000000008
> [   21.486405] #PF: supervisor read access in kernel mode
> [   21.491519] mmc1: Failed to initialize a non-removable card
> [   21.491538] #PF: error_code(0x0000) - not-present page
> [   21.502229] PGD 0 P4D 0
> [   21.504773] Oops: 0000 [#1] PREEMPT SMP NOPTI
> [   21.509133] CPU: 3 PID: 402 Comm: systemd-udevd Not tainted 6.5.0-rc1+ #1
> [   21.515905] Hardware name: ...snip...
> [   21.522851] RIP: 0010:kernfs_dop_revalidate+0x2b/0x120

So something's weird here - my patch should not cause a null ptr deref
here.

> [   21.527995] Code: 1f 44 00 00 83 e6 40 0f 85 07 01 00 00 41 55 41
> 54 55 53 48 8b 47 30 48 89 fb 48 85 c0 0f 84 a2 00 00 00 48 8b a87

This looks weird too. There's no "<>" brackets denoting which byte it
was exactly where RIP pointed to when the NULL ptr happened.

Do

make fs/kernfs/dir.s

and upload dir.s and the dir.o file somewhere.

In any case, my patch shouldn't be causing this. At least I don't see
it.

I'm testing a better version of the patch and it should not cause this
thing even less.

> The stack trace may not be the same all the time, I didn't dive deep
> into the root cause, but it looks to me the patch will cause an
> unknown issue. Also I tested the patch on kernel-5.14.0-318.el9, it

This is the upstream kernel mailing list so those Frankenstein kernels
are all left to you.

Good luck. :-)
Tao Liu July 17, 2023, 2:24 p.m. UTC | #14
On Mon, Jul 17, 2023 at 10:14 PM Borislav Petkov <bp@alien8.de> wrote:
>
> On Mon, Jul 17, 2023 at 09:53:06PM +0800, Tao Liu wrote:
> > ...snip...
> > [   21.360763]  nvme0n1: p1 p2 p3
> > [   21.364207] igc 0000:03:00.0: PTM enabled, 4ns granularity
> > [   21.421097] pps pps1: new PPS source ptp1
> > [   21.425396] igc 0000:03:00.0 (unnamed net_device) (uninitialized): PHC added
> > [   21.457005] igc 0000:03:00.0: 4.000 Gb/s available PCIe bandwidth
> > (5.0 GT/s PCIe x1 link)
> > [   21.465210] igc 0000:03:00.0 eth1: MAC: ...snip...
> > [   21.473424] igc 0000:03:00.0 enp3s0: renamed from eth1
> > [   21.479446] BUG: kernel NULL pointer dereference, address: 0000000000000008
> > [   21.486405] #PF: supervisor read access in kernel mode
> > [   21.491519] mmc1: Failed to initialize a non-removable card
> > [   21.491538] #PF: error_code(0x0000) - not-present page
> > [   21.502229] PGD 0 P4D 0
> > [   21.504773] Oops: 0000 [#1] PREEMPT SMP NOPTI
> > [   21.509133] CPU: 3 PID: 402 Comm: systemd-udevd Not tainted 6.5.0-rc1+ #1
> > [   21.515905] Hardware name: ...snip...
> > [   21.522851] RIP: 0010:kernfs_dop_revalidate+0x2b/0x120
>
> So something's weird here - my patch should not cause a null ptr deref
> here.
>
> > [   21.527995] Code: 1f 44 00 00 83 e6 40 0f 85 07 01 00 00 41 55 41
> > 54 55 53 48 8b 47 30 48 89 fb 48 85 c0 0f 84 a2 00 00 00 48 8b a87
>
> This looks weird too. There's no "<>" brackets denoting which byte it
> was exactly where RIP pointed to when the NULL ptr happened.
>
> Do
>
> make fs/kernfs/dir.s
>
> and upload dir.s and the dir.o file somewhere.
>
> In any case, my patch shouldn't be causing this. At least I don't see
> it.
>
> I'm testing a better version of the patch and it should not cause this
> thing even less.
>
OK, thanks for the help. I will re-make, test and update the info.

> > The stack trace may not be the same all the time, I didn't dive deep
> > into the root cause, but it looks to me the patch will cause an
> > unknown issue. Also I tested the patch on kernel-5.14.0-318.el9, it
>
> This is the upstream kernel mailing list so those Frankenstein kernels
> are all left to you.
>
> Good luck. :-)
>
OK, thanks!

Thanks,
Tao Liu

> --
> Regards/Gruss,
>     Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette
>
Ard Biesheuvel July 17, 2023, 2:56 p.m. UTC | #15
On Mon, 17 Jul 2023 at 15:53, Tao Liu <ltao@redhat.com> wrote:
>
> Hi Borislav,
>
> On Thu, Jul 13, 2023 at 6:05 PM Borislav Petkov <bp@alien8.de> wrote:
> >
> > On Thu, Jun 01, 2023 at 03:20:44PM +0800, Tao Liu wrote:
> > >  arch/x86/kernel/machine_kexec_64.c | 35 ++++++++++++++++++++++++++----
> > >  1 file changed, 31 insertions(+), 4 deletions(-)
> >
> > Ok, pls try this totally untested thing.
> >
> > Thx.
> >
> > ---
> > diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
> > index 09dc8c187b3c..fefe27b2af85 100644
> > --- a/arch/x86/boot/compressed/sev.c
> > +++ b/arch/x86/boot/compressed/sev.c
> > @@ -404,13 +404,20 @@ void sev_enable(struct boot_params *bp)
> >         if (bp)
> >                 bp->cc_blob_address = 0;
> >
> > +       /* Check for the SME/SEV support leaf */
> > +       eax = 0x80000000;
> > +       ecx = 0;
> > +       native_cpuid(&eax, &ebx, &ecx, &edx);
> > +       if (eax < 0x8000001f)
> > +               return;
> > +
> >         /*
> >          * Setup/preliminary detection of SNP. This will be sanity-checked
> >          * against CPUID/MSR values later.
> >          */
> >         snp = snp_init(bp);
> >
> > -       /* Check for the SME/SEV support leaf */
> > +       /* Recheck the SME/SEV support leaf */
> >         eax = 0x80000000;
> >         ecx = 0;
> >         native_cpuid(&eax, &ebx, &ecx, &edx);
> >
> Thanks a lot for the patch above! Sorry for the late response. I have
> compiled and tested it locally against 6.5.0-rc1, though it can pass
> the early stage of kexec kernel bootup,

OK, so that proves that the cc_blob table access is the culprit here.
That still means that kexec on SEV is likely to explode in the exact
same way should anyone attempt that.


> however the kernel will panic
> occasionally later. The test machine is the one with Intel Atom
> x6425RE cpu which encountered the page fault issue of missing efi
> config table.
>

Agree with Boris that this seems entirely unrelated.

> ...snip...
> [   21.360763]  nvme0n1: p1 p2 p3
> [   21.364207] igc 0000:03:00.0: PTM enabled, 4ns granularity
> [   21.421097] pps pps1: new PPS source ptp1
> [   21.425396] igc 0000:03:00.0 (unnamed net_device) (uninitialized): PHC added
> [   21.457005] igc 0000:03:00.0: 4.000 Gb/s available PCIe bandwidth
> (5.0 GT/s PCIe x1 link)
> [   21.465210] igc 0000:03:00.0 eth1: MAC: ...snip...
> [   21.473424] igc 0000:03:00.0 enp3s0: renamed from eth1
> [   21.479446] BUG: kernel NULL pointer dereference, address: 0000000000000008
> [   21.486405] #PF: supervisor read access in kernel mode
> [   21.491519] mmc1: Failed to initialize a non-removable card
> [   21.491538] #PF: error_code(0x0000) - not-present page
> [   21.502229] PGD 0 P4D 0
> [   21.504773] Oops: 0000 [#1] PREEMPT SMP NOPTI
> [   21.509133] CPU: 3 PID: 402 Comm: systemd-udevd Not tainted 6.5.0-rc1+ #1
> [   21.515905] Hardware name: ...snip...


Why are you snipping the hardware name?
Tao Liu July 17, 2023, 3:02 p.m. UTC | #16
Hi Ard,

Thanks for your explanation!

On Thu, Jul 13, 2023 at 6:18 PM Ard Biesheuvel <ardb@kernel.org> wrote:
>
> On Fri, 7 Jul 2023 at 19:12, Borislav Petkov <bp@alien8.de> wrote:
> >
> > On Fri, Jul 07, 2023 at 10:25:15AM -0500, Michael Roth wrote:
> > > ...
> > > It would be unfortunate if we finally abandoned this path because of the
> > > issue being hit here though. I think the patch posted here is the proper
> > > resolution to the issue being hit, and I'm hoping at this point we've
> > > identified all the similar cases where EFI/setup_data-related structures
> > > were missing explicit mappings. But if we still think it's too much of a
> > > liability to access the EFI config table outside of SEV-enabled guests,
> > > then I can work on re-implementing things based on the above logic.
> >
> > Replying here to Tom's note too...
> >
> > So, I like the idea of rechecking CPUID. Yes, let's do the sev_status
> > check. As a result, we either fail the guest - no problem - or we boot
> > and we recheck. Thus, we don't run AMD code on !AMD machines, if the HV
> > is not a lying bastard.
> >
> > Now, if we've gotten a valid setup_data SETUP_EFI entry with a valid
> > pointer to an EFI config table, then that should happen in the generic
> > path - initialize_identity_maps(), for example - like you've done in
> > b57feed2cc26 - not in the kexec code because kexec *happens* to need it.
> >
> > We want to access the EFI config table? Sure, by all means, but make
> > that generic for all code.
> >
>
> OK, so in summary, what seems to be happening here is that the SEV
> init code in the decompressor looks for the cc blob table before the
> on-demand mapping code is up, which normally ensures that any RAM
> address is accessible even if it hasn't been mapped explicitly.
>
Yes it is exactly the case.

> This is why the fix happens to work: the code only maps the array of
> (guid, phys_addr) tuples that describes the list of configuration
> tables that have been provided by the firmware. The actual
> configuration tables themselves could be anywhere in physical memory,
> and without prior knowledge of a particular GUID value, there is no
> way to know the size of the table, and so they cannot be mapped

Should we loop map each element of the config table one at a time? We
read a GUID value, then we map it with phys_addr, phys_addr +
sizeof(struct), then we read-map the next one, in this way we may not
need to know the size of the table?

> upfront like this. However, the cc blob table does not exist on this
> machine, and so whether the EFI config tables themselves are mapped or
> not is irrelevant.

Currently we don't need all the data of the config table, only part of
it. So only  (guid, phys_addr) tuple arrays are mapped into. Won't it
be better if we map config table on demand if we need further data
later?

>
> But it does mean the fix is incomplete, and certainly does not belong
> in generic kexec code. If anything, we should be fixing the
> decompressor code to defer the cc blob table check until after the
> demand mapping code is up.

Yes, if we can defer the cc blob access, the issue can be resolved. I
don't know if it is doable from AMD's view...
>
> If this is problematic, we might instead disable SEV for kexec, and
> rely on the fact that SEV firmware enters with a complete 1:1 map (as

We still need sev support for kexec. If we disable sev during kexec, I
suppose kdump may be broken on a sev guest, maybe?

Thanks,
Tao Liu

> we seem to be doing currently). If kexec for SEV is needed at some
> point, we can re-enable it by having it provide a mapping for the
> config table array and the cc blob table explicitly.
>
Tao Liu July 17, 2023, 3:11 p.m. UTC | #17
On Mon, Jul 17, 2023 at 10:57 PM Ard Biesheuvel <ardb@kernel.org> wrote:
>
> On Mon, 17 Jul 2023 at 15:53, Tao Liu <ltao@redhat.com> wrote:
> >
> > Hi Borislav,
> >
> > On Thu, Jul 13, 2023 at 6:05 PM Borislav Petkov <bp@alien8.de> wrote:
> > >
> > > On Thu, Jun 01, 2023 at 03:20:44PM +0800, Tao Liu wrote:
> > > >  arch/x86/kernel/machine_kexec_64.c | 35 ++++++++++++++++++++++++++----
> > > >  1 file changed, 31 insertions(+), 4 deletions(-)
> > >
> > > Ok, pls try this totally untested thing.
> > >
> > > Thx.
> > >
> > > ---
> > > diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
> > > index 09dc8c187b3c..fefe27b2af85 100644
> > > --- a/arch/x86/boot/compressed/sev.c
> > > +++ b/arch/x86/boot/compressed/sev.c
> > > @@ -404,13 +404,20 @@ void sev_enable(struct boot_params *bp)
> > >         if (bp)
> > >                 bp->cc_blob_address = 0;
> > >
> > > +       /* Check for the SME/SEV support leaf */
> > > +       eax = 0x80000000;
> > > +       ecx = 0;
> > > +       native_cpuid(&eax, &ebx, &ecx, &edx);
> > > +       if (eax < 0x8000001f)
> > > +               return;
> > > +
> > >         /*
> > >          * Setup/preliminary detection of SNP. This will be sanity-checked
> > >          * against CPUID/MSR values later.
> > >          */
> > >         snp = snp_init(bp);
> > >
> > > -       /* Check for the SME/SEV support leaf */
> > > +       /* Recheck the SME/SEV support leaf */
> > >         eax = 0x80000000;
> > >         ecx = 0;
> > >         native_cpuid(&eax, &ebx, &ecx, &edx);
> > >
> > Thanks a lot for the patch above! Sorry for the late response. I have
> > compiled and tested it locally against 6.5.0-rc1, though it can pass
> > the early stage of kexec kernel bootup,
>
> OK, so that proves that the cc_blob table access is the culprit here.
> That still means that kexec on SEV is likely to explode in the exact
> same way should anyone attempt that.
>
>
> > however the kernel will panic
> > occasionally later. The test machine is the one with Intel Atom
> > x6425RE cpu which encountered the page fault issue of missing efi
> > config table.
> >
>
> Agree with Boris that this seems entirely unrelated.

Agree, I will have a retest based on Boris's suggestions.

>
> > ...snip...
> > [   21.360763]  nvme0n1: p1 p2 p3
> > [   21.364207] igc 0000:03:00.0: PTM enabled, 4ns granularity
> > [   21.421097] pps pps1: new PPS source ptp1
> > [   21.425396] igc 0000:03:00.0 (unnamed net_device) (uninitialized): PHC added
> > [   21.457005] igc 0000:03:00.0: 4.000 Gb/s available PCIe bandwidth
> > (5.0 GT/s PCIe x1 link)
> > [   21.465210] igc 0000:03:00.0 eth1: MAC: ...snip...
> > [   21.473424] igc 0000:03:00.0 enp3s0: renamed from eth1
> > [   21.479446] BUG: kernel NULL pointer dereference, address: 0000000000000008
> > [   21.486405] #PF: supervisor read access in kernel mode
> > [   21.491519] mmc1: Failed to initialize a non-removable card
> > [   21.491538] #PF: error_code(0x0000) - not-present page
> > [   21.502229] PGD 0 P4D 0
> > [   21.504773] Oops: 0000 [#1] PREEMPT SMP NOPTI
> > [   21.509133] CPU: 3 PID: 402 Comm: systemd-udevd Not tainted 6.5.0-rc1+ #1
> > [   21.515905] Hardware name: ...snip...
>
>
> Why are you snipping the hardware name?

Sorry for the inconvenience here... The machine is borrowed from our
partner, which may not be officially released to the market. I haven't
discussed the legal issue with them. In addition, I think the stack
trace is more useful, so I snipped the hardware name. Sorry about
that...

>
Tao Liu July 27, 2023, 11:03 a.m. UTC | #18
Hi Borislav,

Sorry for the late response. I spent some time retesting your patch
against 6.5.0-rc1 and 6.5.0-rc3, and it is OK. So

Reported-and-tested-by: Tao Liu <ltao@redhat.com>

And will we use this patch as a workaround or will we wait for a
better solution as proposed by Michael?

On Mon, Jul 17, 2023 at 10:14 PM Borislav Petkov <bp@alien8.de> wrote:
>
> On Mon, Jul 17, 2023 at 09:53:06PM +0800, Tao Liu wrote:
> > ...snip...
> > [   21.360763]  nvme0n1: p1 p2 p3
> > [   21.364207] igc 0000:03:00.0: PTM enabled, 4ns granularity
> > [   21.421097] pps pps1: new PPS source ptp1
> > [   21.425396] igc 0000:03:00.0 (unnamed net_device) (uninitialized): PHC added
> > [   21.457005] igc 0000:03:00.0: 4.000 Gb/s available PCIe bandwidth
> > (5.0 GT/s PCIe x1 link)
> > [   21.465210] igc 0000:03:00.0 eth1: MAC: ...snip...
> > [   21.473424] igc 0000:03:00.0 enp3s0: renamed from eth1
> > [   21.479446] BUG: kernel NULL pointer dereference, address: 0000000000000008
> > [   21.486405] #PF: supervisor read access in kernel mode
> > [   21.491519] mmc1: Failed to initialize a non-removable card
> > [   21.491538] #PF: error_code(0x0000) - not-present page
> > [   21.502229] PGD 0 P4D 0
> > [   21.504773] Oops: 0000 [#1] PREEMPT SMP NOPTI
> > [   21.509133] CPU: 3 PID: 402 Comm: systemd-udevd Not tainted 6.5.0-rc1+ #1
> > [   21.515905] Hardware name: ...snip...
> > [   21.522851] RIP: 0010:kernfs_dop_revalidate+0x2b/0x120
>
> So something's weird here - my patch should not cause a null ptr deref
> here.

The random kernel panic I encountered is irrelevant to this patch, and
I'm pretty sure it is caused by some driver, highly suspicious it is
the graphic card driver i915.ko. When I apply this patch, 1)
disconnect the monitor(using serial port to login) and keep i915, or
2) connect the monitor but remove i915, the kexec kernel will not
fail. Though i915 has provided a pci shutdown function, maybe some bug
triggered and caused memory overwrite after kexec. Anyway, it is
another issue.

Thanks,
Tao Liu

>
> > [   21.527995] Code: 1f 44 00 00 83 e6 40 0f 85 07 01 00 00 41 55 41
> > 54 55 53 48 8b 47 30 48 89 fb 48 85 c0 0f 84 a2 00 00 00 48 8b a87
>
> This looks weird too. There's no "<>" brackets denoting which byte it
> was exactly where RIP pointed to when the NULL ptr happened.
>
> Do
>
> make fs/kernfs/dir.s
>
> and upload dir.s and the dir.o file somewhere.
>
> In any case, my patch shouldn't be causing this. At least I don't see
> it.
>
> I'm testing a better version of the patch and it should not cause this
> thing even less.
>
> > The stack trace may not be the same all the time, I didn't dive deep
> > into the root cause, but it looks to me the patch will cause an
> > unknown issue. Also I tested the patch on kernel-5.14.0-318.el9, it
>
> This is the upstream kernel mailing list so those Frankenstein kernels
> are all left to you.
>
> Good luck. :-)
>
> --
> Regards/Gruss,
>     Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette
>
Tao Liu July 27, 2023, 11:11 a.m. UTC | #19
Hi Ard,

On Mon, Jul 17, 2023 at 11:11 PM Tao Liu <ltao@redhat.com> wrote:
>
> On Mon, Jul 17, 2023 at 10:57 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> >
> > On Mon, 17 Jul 2023 at 15:53, Tao Liu <ltao@redhat.com> wrote:
> > >
> > > Hi Borislav,
> > >
> > > On Thu, Jul 13, 2023 at 6:05 PM Borislav Petkov <bp@alien8.de> wrote:
> > > >
> > > > On Thu, Jun 01, 2023 at 03:20:44PM +0800, Tao Liu wrote:
> > > > >  arch/x86/kernel/machine_kexec_64.c | 35 ++++++++++++++++++++++++++----
> > > > >  1 file changed, 31 insertions(+), 4 deletions(-)
> > > >
> > > > Ok, pls try this totally untested thing.
> > > >
> > > > Thx.
> > > >
> > > > ---
> > > > diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
> > > > index 09dc8c187b3c..fefe27b2af85 100644
> > > > --- a/arch/x86/boot/compressed/sev.c
> > > > +++ b/arch/x86/boot/compressed/sev.c
> > > > @@ -404,13 +404,20 @@ void sev_enable(struct boot_params *bp)
> > > >         if (bp)
> > > >                 bp->cc_blob_address = 0;
> > > >
> > > > +       /* Check for the SME/SEV support leaf */
> > > > +       eax = 0x80000000;
> > > > +       ecx = 0;
> > > > +       native_cpuid(&eax, &ebx, &ecx, &edx);
> > > > +       if (eax < 0x8000001f)
> > > > +               return;
> > > > +
> > > >         /*
> > > >          * Setup/preliminary detection of SNP. This will be sanity-checked
> > > >          * against CPUID/MSR values later.
> > > >          */
> > > >         snp = snp_init(bp);
> > > >
> > > > -       /* Check for the SME/SEV support leaf */
> > > > +       /* Recheck the SME/SEV support leaf */
> > > >         eax = 0x80000000;
> > > >         ecx = 0;
> > > >         native_cpuid(&eax, &ebx, &ecx, &edx);
> > > >
> > > Thanks a lot for the patch above! Sorry for the late response. I have
> > > compiled and tested it locally against 6.5.0-rc1, though it can pass
> > > the early stage of kexec kernel bootup,
> >
> > OK, so that proves that the cc_blob table access is the culprit here.
> > That still means that kexec on SEV is likely to explode in the exact
> > same way should anyone attempt that.
> >
> >
> > > however the kernel will panic
> > > occasionally later. The test machine is the one with Intel Atom
> > > x6425RE cpu which encountered the page fault issue of missing efi
> > > config table.
> > >
> >
> > Agree with Boris that this seems entirely unrelated.
>
> Agree, I will have a retest based on Boris's suggestions.
>
> >
> > > ...snip...
> > > [   21.360763]  nvme0n1: p1 p2 p3
> > > [   21.364207] igc 0000:03:00.0: PTM enabled, 4ns granularity
> > > [   21.421097] pps pps1: new PPS source ptp1
> > > [   21.425396] igc 0000:03:00.0 (unnamed net_device) (uninitialized): PHC added
> > > [   21.457005] igc 0000:03:00.0: 4.000 Gb/s available PCIe bandwidth
> > > (5.0 GT/s PCIe x1 link)
> > > [   21.465210] igc 0000:03:00.0 eth1: MAC: ...snip...
> > > [   21.473424] igc 0000:03:00.0 enp3s0: renamed from eth1
> > > [   21.479446] BUG: kernel NULL pointer dereference, address: 0000000000000008
> > > [   21.486405] #PF: supervisor read access in kernel mode
> > > [   21.491519] mmc1: Failed to initialize a non-removable card
> > > [   21.491538] #PF: error_code(0x0000) - not-present page
> > > [   21.502229] PGD 0 P4D 0
> > > [   21.504773] Oops: 0000 [#1] PREEMPT SMP NOPTI
> > > [   21.509133] CPU: 3 PID: 402 Comm: systemd-udevd Not tainted 6.5.0-rc1+ #1
> > > [   21.515905] Hardware name: ...snip...
> >
> >
> > Why are you snipping the hardware name?
>

Our partner said it is OK to discuss in public, so the hardware is:
Hardware name: LENOVO 11KL0FVT06/3334, BIOS M4XKT14A 05/17/2023

The machine is Lenovo ThinkEdge SE10.

Thanks,
Tao Liu

> Sorry for the inconvenience here... The machine is borrowed from our
> partner, which may not be officially released to the market. I haven't
> discussed the legal issue with them. In addition, I think the stack
> trace is more useful, so I snipped the hardware name. Sorry about
> that...
>
> >
Borislav Petkov July 28, 2023, 4:55 p.m. UTC | #20
On Thu, Jul 27, 2023 at 07:03:26PM +0800, Tao Liu wrote:
> Hi Borislav,
> 
> Sorry for the late response. I spent some time retesting your patch
> against 6.5.0-rc1 and 6.5.0-rc3, and it is OK. So
> 
> Reported-and-tested-by: Tao Liu <ltao@redhat.com>
> 
> And will we use this patch as a workaround or will we wait for a
> better solution as proposed by Michael?

First of all, please do not top-post.

And yes, here's a better one. I'd appreciate it you testing it.

Thx.

---
 arch/x86/boot/compressed/idt_64.c |  5 ++++-
 arch/x86/boot/compressed/sev.c    | 37 +++++++++++++++++++++++++++++--
 2 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/arch/x86/boot/compressed/idt_64.c b/arch/x86/boot/compressed/idt_64.c
index 6debb816e83d..0f03ac12e2a6 100644
--- a/arch/x86/boot/compressed/idt_64.c
+++ b/arch/x86/boot/compressed/idt_64.c
@@ -63,7 +63,10 @@ void load_stage2_idt(void)
 	set_idt_entry(X86_TRAP_PF, boot_page_fault);
 
 #ifdef CONFIG_AMD_MEM_ENCRYPT
-	set_idt_entry(X86_TRAP_VC, boot_stage2_vc);
+	if (sev_status & BIT(1))
+		set_idt_entry(X86_TRAP_VC, boot_stage2_vc);
+	else
+		set_idt_entry(X86_TRAP_VC, NULL);
 #endif
 
 	load_boot_idt(&boot_idt_desc);
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index 09dc8c187b3c..c3e343bd4760 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -404,13 +404,46 @@ void sev_enable(struct boot_params *bp)
 	if (bp)
 		bp->cc_blob_address = 0;
 
+	/*
+	 * Do an initial SEV capability check before snp_init() which
+	 * loads the CPUID page and the same checks afterwards are done
+	 * without the hypervisor and are trustworthy.
+	 *
+	 * If the HV fakes SEV support, the guest will crash'n'burn
+	 * which is good enough.
+	 */
+
+	/* Check for the SME/SEV support leaf */
+	eax = 0x80000000;
+	ecx = 0;
+	native_cpuid(&eax, &ebx, &ecx, &edx);
+	if (eax < 0x8000001f)
+		return;
+
+	/*
+	 * Check for the SME/SEV feature:
+	 *   CPUID Fn8000_001F[EAX]
+	 *   - Bit 0 - Secure Memory Encryption support
+	 *   - Bit 1 - Secure Encrypted Virtualization support
+	 *   CPUID Fn8000_001F[EBX]
+	 *   - Bits 5:0 - Pagetable bit position used to indicate encryption
+	 */
+	eax = 0x8000001f;
+	ecx = 0;
+	native_cpuid(&eax, &ebx, &ecx, &edx);
+	/* Check whether SEV is supported */
+	if (!(eax & BIT(1)))
+		return;
+
 	/*
 	 * Setup/preliminary detection of SNP. This will be sanity-checked
 	 * against CPUID/MSR values later.
 	 */
 	snp = snp_init(bp);
 
-	/* Check for the SME/SEV support leaf */
+	/* Now repeat the checks with the SNP CPUID table. */
+
+	/* Recheck the SME/SEV support leaf */
 	eax = 0x80000000;
 	ecx = 0;
 	native_cpuid(&eax, &ebx, &ecx, &edx);
@@ -418,7 +451,7 @@ void sev_enable(struct boot_params *bp)
 		return;
 
 	/*
-	 * Check for the SME/SEV feature:
+	 * Recheck for the SME/SEV feature:
 	 *   CPUID Fn8000_001F[EAX]
 	 *   - Bit 0 - Secure Memory Encryption support
 	 *   - Bit 1 - Secure Encrypted Virtualization support
Tao Liu Aug. 2, 2023, 8:22 a.m. UTC | #21
Hi Borislav,

On Sat, Jul 29, 2023 at 12:56 AM Borislav Petkov <bp@alien8.de> wrote:
>
> On Thu, Jul 27, 2023 at 07:03:26PM +0800, Tao Liu wrote:
> > Hi Borislav,
> >
> > Sorry for the late response. I spent some time retesting your patch
> > against 6.5.0-rc1 and 6.5.0-rc3, and it is OK. So
> >
> > Reported-and-tested-by: Tao Liu <ltao@redhat.com>
> >
> > And will we use this patch as a workaround or will we wait for a
> > better solution as proposed by Michael?
>
> First of all, please do not top-post.
>

OK, thanks for the reminder.

> And yes, here's a better one. I'd appreciate it you testing it.
>

Thanks for the patch! I have tested it on the lenovo machine in the
past few days, no issue found, so the patch tests OK.

Thanks,
Tao Liu

> Thx.
>
> ---
>  arch/x86/boot/compressed/idt_64.c |  5 ++++-
>  arch/x86/boot/compressed/sev.c    | 37 +++++++++++++++++++++++++++++--
>  2 files changed, 39 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/boot/compressed/idt_64.c b/arch/x86/boot/compressed/idt_64.c
> index 6debb816e83d..0f03ac12e2a6 100644
> --- a/arch/x86/boot/compressed/idt_64.c
> +++ b/arch/x86/boot/compressed/idt_64.c
> @@ -63,7 +63,10 @@ void load_stage2_idt(void)
>         set_idt_entry(X86_TRAP_PF, boot_page_fault);
>
>  #ifdef CONFIG_AMD_MEM_ENCRYPT
> -       set_idt_entry(X86_TRAP_VC, boot_stage2_vc);
> +       if (sev_status & BIT(1))
> +               set_idt_entry(X86_TRAP_VC, boot_stage2_vc);
> +       else
> +               set_idt_entry(X86_TRAP_VC, NULL);
>  #endif
>
>         load_boot_idt(&boot_idt_desc);
> diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
> index 09dc8c187b3c..c3e343bd4760 100644
> --- a/arch/x86/boot/compressed/sev.c
> +++ b/arch/x86/boot/compressed/sev.c
> @@ -404,13 +404,46 @@ void sev_enable(struct boot_params *bp)
>         if (bp)
>                 bp->cc_blob_address = 0;
>
> +       /*
> +        * Do an initial SEV capability check before snp_init() which
> +        * loads the CPUID page and the same checks afterwards are done
> +        * without the hypervisor and are trustworthy.
> +        *
> +        * If the HV fakes SEV support, the guest will crash'n'burn
> +        * which is good enough.
> +        */
> +
> +       /* Check for the SME/SEV support leaf */
> +       eax = 0x80000000;
> +       ecx = 0;
> +       native_cpuid(&eax, &ebx, &ecx, &edx);
> +       if (eax < 0x8000001f)
> +               return;
> +
> +       /*
> +        * Check for the SME/SEV feature:
> +        *   CPUID Fn8000_001F[EAX]
> +        *   - Bit 0 - Secure Memory Encryption support
> +        *   - Bit 1 - Secure Encrypted Virtualization support
> +        *   CPUID Fn8000_001F[EBX]
> +        *   - Bits 5:0 - Pagetable bit position used to indicate encryption
> +        */
> +       eax = 0x8000001f;
> +       ecx = 0;
> +       native_cpuid(&eax, &ebx, &ecx, &edx);
> +       /* Check whether SEV is supported */
> +       if (!(eax & BIT(1)))
> +               return;
> +
>         /*
>          * Setup/preliminary detection of SNP. This will be sanity-checked
>          * against CPUID/MSR values later.
>          */
>         snp = snp_init(bp);
>
> -       /* Check for the SME/SEV support leaf */
> +       /* Now repeat the checks with the SNP CPUID table. */
> +
> +       /* Recheck the SME/SEV support leaf */
>         eax = 0x80000000;
>         ecx = 0;
>         native_cpuid(&eax, &ebx, &ecx, &edx);
> @@ -418,7 +451,7 @@ void sev_enable(struct boot_params *bp)
>                 return;
>
>         /*
> -        * Check for the SME/SEV feature:
> +        * Recheck for the SME/SEV feature:
>          *   CPUID Fn8000_001F[EAX]
>          *   - Bit 0 - Secure Memory Encryption support
>          *   - Bit 1 - Secure Encrypted Virtualization support
> --
> 2.41.0
>
> --
> Regards/Gruss,
>     Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette
>
Borislav Petkov Aug. 2, 2023, 9:39 a.m. UTC | #22
On Wed, Aug 02, 2023 at 04:22:54PM +0800, Tao Liu wrote:
> Thanks for the patch! I have tested it on the lenovo machine in the
> past few days, no issue found, so the patch tests OK.

Thanks for testing!

Mike, Tom, the below ok this way?

---
From: "Borislav Petkov (AMD)" <bp@alien8.de>
Date: Sun, 16 Jul 2023 20:22:20 +0200
Subject: [PATCH] x86/sev: Do not try to parse for the CC blob on non-AMD
 hardware

Tao Liu reported a boot hang on an Intel Atom machine due to an unmapped
EFI config table. The reason being that the CC blob which contains the
CPUID page for AMD SNP guests is parsed for before even checking
whether the machine runs on AMD hardware.

Usually that's not a problem on !AMD hw - it simply won't find the CC
blob's GUID and return. However, if any parts of the config table
pointers array is not mapped, the kernel will #PF very early in the
decompressor stage without any opportunity to recover.

Therefore, do a superficial CPUID check before poking for the CC blob.
This will fix the current issue on real hardware. It would also work as
a guest on a non-lying hypervisor.

For the lying hypervisor, the check is done again, *after* parsing the
CC blob as the real CPUID page will be present then.

Clear the #VC handler in case SEV-{ES,SNP} hasn't been detected, as
a precaution.

Fixes: c01fce9cef84 ("x86/compressed: Add SEV-SNP feature detection/setup")
Reported-by: Tao Liu <ltao@redhat.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Tested-by: Tao Liu <ltao@redhat.com>
Cc: <stable@kernel.org>
Link: https://lore.kernel.org/r/20230601072043.24439-1-ltao@redhat.com
---
 arch/x86/boot/compressed/idt_64.c |  9 +++++++-
 arch/x86/boot/compressed/sev.c    | 37 +++++++++++++++++++++++++++++--
 2 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/arch/x86/boot/compressed/idt_64.c b/arch/x86/boot/compressed/idt_64.c
index 6debb816e83d..3cdf94b41456 100644
--- a/arch/x86/boot/compressed/idt_64.c
+++ b/arch/x86/boot/compressed/idt_64.c
@@ -63,7 +63,14 @@ void load_stage2_idt(void)
 	set_idt_entry(X86_TRAP_PF, boot_page_fault);
 
 #ifdef CONFIG_AMD_MEM_ENCRYPT
-	set_idt_entry(X86_TRAP_VC, boot_stage2_vc);
+	/*
+	 * Clear the second stage #VC handler in case guest types
+	 * needing #VC have not been detected.
+	 */
+	if (sev_status & BIT(1))
+		set_idt_entry(X86_TRAP_VC, boot_stage2_vc);
+	else
+		set_idt_entry(X86_TRAP_VC, NULL);
 #endif
 
 	load_boot_idt(&boot_idt_desc);
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index 09dc8c187b3c..c3e343bd4760 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -404,13 +404,46 @@ void sev_enable(struct boot_params *bp)
 	if (bp)
 		bp->cc_blob_address = 0;
 
+	/*
+	 * Do an initial SEV capability check before snp_init() which
+	 * loads the CPUID page and the same checks afterwards are done
+	 * without the hypervisor and are trustworthy.
+	 *
+	 * If the HV fakes SEV support, the guest will crash'n'burn
+	 * which is good enough.
+	 */
+
+	/* Check for the SME/SEV support leaf */
+	eax = 0x80000000;
+	ecx = 0;
+	native_cpuid(&eax, &ebx, &ecx, &edx);
+	if (eax < 0x8000001f)
+		return;
+
+	/*
+	 * Check for the SME/SEV feature:
+	 *   CPUID Fn8000_001F[EAX]
+	 *   - Bit 0 - Secure Memory Encryption support
+	 *   - Bit 1 - Secure Encrypted Virtualization support
+	 *   CPUID Fn8000_001F[EBX]
+	 *   - Bits 5:0 - Pagetable bit position used to indicate encryption
+	 */
+	eax = 0x8000001f;
+	ecx = 0;
+	native_cpuid(&eax, &ebx, &ecx, &edx);
+	/* Check whether SEV is supported */
+	if (!(eax & BIT(1)))
+		return;
+
 	/*
 	 * Setup/preliminary detection of SNP. This will be sanity-checked
 	 * against CPUID/MSR values later.
 	 */
 	snp = snp_init(bp);
 
-	/* Check for the SME/SEV support leaf */
+	/* Now repeat the checks with the SNP CPUID table. */
+
+	/* Recheck the SME/SEV support leaf */
 	eax = 0x80000000;
 	ecx = 0;
 	native_cpuid(&eax, &ebx, &ecx, &edx);
@@ -418,7 +451,7 @@ void sev_enable(struct boot_params *bp)
 		return;
 
 	/*
-	 * Check for the SME/SEV feature:
+	 * Recheck for the SME/SEV feature:
 	 *   CPUID Fn8000_001F[EAX]
 	 *   - Bit 0 - Secure Memory Encryption support
 	 *   - Bit 1 - Secure Encrypted Virtualization support
Tom Lendacky Aug. 2, 2023, 1:40 p.m. UTC | #23
On 8/2/23 04:39, Borislav Petkov wrote:
> On Wed, Aug 02, 2023 at 04:22:54PM +0800, Tao Liu wrote:
>> Thanks for the patch! I have tested it on the lenovo machine in the
>> past few days, no issue found, so the patch tests OK.
> 
> Thanks for testing!
> 
> Mike, Tom, the below ok this way?

Short of figuring out how to map page accesses earlier through the 
boot_page_fault IDT routine, this seems reasonable.

Acked-by: Tom Lendacky <thomas.lendacky@amd.com>

> 
> ---
> From: "Borislav Petkov (AMD)" <bp@alien8.de>
> Date: Sun, 16 Jul 2023 20:22:20 +0200
> Subject: [PATCH] x86/sev: Do not try to parse for the CC blob on non-AMD
>   hardware
> 
> Tao Liu reported a boot hang on an Intel Atom machine due to an unmapped
> EFI config table. The reason being that the CC blob which contains the
> CPUID page for AMD SNP guests is parsed for before even checking
> whether the machine runs on AMD hardware.
> 
> Usually that's not a problem on !AMD hw - it simply won't find the CC
> blob's GUID and return. However, if any parts of the config table
> pointers array is not mapped, the kernel will #PF very early in the
> decompressor stage without any opportunity to recover.
> 
> Therefore, do a superficial CPUID check before poking for the CC blob.
> This will fix the current issue on real hardware. It would also work as
> a guest on a non-lying hypervisor.
> 
> For the lying hypervisor, the check is done again, *after* parsing the
> CC blob as the real CPUID page will be present then.
> 
> Clear the #VC handler in case SEV-{ES,SNP} hasn't been detected, as
> a precaution.
> 
> Fixes: c01fce9cef84 ("x86/compressed: Add SEV-SNP feature detection/setup")
> Reported-by: Tao Liu <ltao@redhat.com>
> Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
> Tested-by: Tao Liu <ltao@redhat.com>
> Cc: <stable@kernel.org>
> Link: https://lore.kernel.org/r/20230601072043.24439-1-ltao@redhat.com
> ---
>   arch/x86/boot/compressed/idt_64.c |  9 +++++++-
>   arch/x86/boot/compressed/sev.c    | 37 +++++++++++++++++++++++++++++--
>   2 files changed, 43 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/boot/compressed/idt_64.c b/arch/x86/boot/compressed/idt_64.c
> index 6debb816e83d..3cdf94b41456 100644
> --- a/arch/x86/boot/compressed/idt_64.c
> +++ b/arch/x86/boot/compressed/idt_64.c
> @@ -63,7 +63,14 @@ void load_stage2_idt(void)
>   	set_idt_entry(X86_TRAP_PF, boot_page_fault);
>   
>   #ifdef CONFIG_AMD_MEM_ENCRYPT
> -	set_idt_entry(X86_TRAP_VC, boot_stage2_vc);
> +	/*
> +	 * Clear the second stage #VC handler in case guest types
> +	 * needing #VC have not been detected.
> +	 */
> +	if (sev_status & BIT(1))
> +		set_idt_entry(X86_TRAP_VC, boot_stage2_vc);
> +	else
> +		set_idt_entry(X86_TRAP_VC, NULL);
>   #endif
>   
>   	load_boot_idt(&boot_idt_desc);
> diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
> index 09dc8c187b3c..c3e343bd4760 100644
> --- a/arch/x86/boot/compressed/sev.c
> +++ b/arch/x86/boot/compressed/sev.c
> @@ -404,13 +404,46 @@ void sev_enable(struct boot_params *bp)
>   	if (bp)
>   		bp->cc_blob_address = 0;
>   
> +	/*
> +	 * Do an initial SEV capability check before snp_init() which
> +	 * loads the CPUID page and the same checks afterwards are done
> +	 * without the hypervisor and are trustworthy.
> +	 *
> +	 * If the HV fakes SEV support, the guest will crash'n'burn
> +	 * which is good enough.
> +	 */
> +
> +	/* Check for the SME/SEV support leaf */
> +	eax = 0x80000000;
> +	ecx = 0;
> +	native_cpuid(&eax, &ebx, &ecx, &edx);
> +	if (eax < 0x8000001f)
> +		return;
> +
> +	/*
> +	 * Check for the SME/SEV feature:
> +	 *   CPUID Fn8000_001F[EAX]
> +	 *   - Bit 0 - Secure Memory Encryption support
> +	 *   - Bit 1 - Secure Encrypted Virtualization support
> +	 *   CPUID Fn8000_001F[EBX]
> +	 *   - Bits 5:0 - Pagetable bit position used to indicate encryption
> +	 */
> +	eax = 0x8000001f;
> +	ecx = 0;
> +	native_cpuid(&eax, &ebx, &ecx, &edx);
> +	/* Check whether SEV is supported */
> +	if (!(eax & BIT(1)))
> +		return;
> +
>   	/*
>   	 * Setup/preliminary detection of SNP. This will be sanity-checked
>   	 * against CPUID/MSR values later.
>   	 */
>   	snp = snp_init(bp);
>   
> -	/* Check for the SME/SEV support leaf */
> +	/* Now repeat the checks with the SNP CPUID table. */
> +
> +	/* Recheck the SME/SEV support leaf */
>   	eax = 0x80000000;
>   	ecx = 0;
>   	native_cpuid(&eax, &ebx, &ecx, &edx);
> @@ -418,7 +451,7 @@ void sev_enable(struct boot_params *bp)
>   		return;
>   
>   	/*
> -	 * Check for the SME/SEV feature:
> +	 * Recheck for the SME/SEV feature:
>   	 *   CPUID Fn8000_001F[EAX]
>   	 *   - Bit 0 - Secure Memory Encryption support
>   	 *   - Bit 1 - Secure Encrypted Virtualization support
Borislav Petkov Aug. 2, 2023, 1:58 p.m. UTC | #24
On Wed, Aug 02, 2023 at 08:40:36AM -0500, Tom Lendacky wrote:
> Short of figuring out how to map page accesses earlier through the
> boot_page_fault IDT routine

And you want to do that because?
Ard Biesheuvel Aug. 2, 2023, 2:55 p.m. UTC | #25
On Wed, 2 Aug 2023 at 15:59, Borislav Petkov <bp@alien8.de> wrote:
>
> On Wed, Aug 02, 2023 at 08:40:36AM -0500, Tom Lendacky wrote:
> > Short of figuring out how to map page accesses earlier through the
> > boot_page_fault IDT routine
>
> And you want to do that because?
>

... because now, entering via startup_32 is broken, given that it only
maps the kernel image itself and relies on the #PF handling for
everything else it accesses, including firmware tables.

AFAICT this also means that entering via startup_32 is broken entirely
for any configuration that enables the cc blob config table check,
regardless of the platform.
Borislav Petkov Aug. 2, 2023, 3:51 p.m. UTC | #26
On Wed, Aug 02, 2023 at 04:55:27PM +0200, Ard Biesheuvel wrote:
> ... because now, entering via startup_32 is broken, given that it only
> maps the kernel image itself and relies on the #PF handling for
> everything else it accesses, including firmware tables.
> 
> AFAICT this also means that entering via startup_32 is broken entirely
> for any configuration that enables the cc blob config table check,
> regardless of the platform.

Lemme brain-dump what Tom and I just talked on IRC.

That startup_32 entry path for SNP guests was used with old grubs which
used to enter through there and not anymore, reportedly. Which means,
that must've worked at some point but Joerg would know. CCed.

Newer grubs enter through the 64-bit entry point and thus are fine
- otherwise we would be seeing explosions left and right.

So dependent on what we wanna do, if we kill the 32-bit path, we can
kill the 32-bit C-bit verif code. But that's for later and an item on my
TODO list.

Thx.
Ard Biesheuvel Aug. 3, 2023, 11:11 a.m. UTC | #27
On Wed, 2 Aug 2023 at 17:52, Borislav Petkov <bp@alien8.de> wrote:
>
> On Wed, Aug 02, 2023 at 04:55:27PM +0200, Ard Biesheuvel wrote:
> > ... because now, entering via startup_32 is broken, given that it only
> > maps the kernel image itself and relies on the #PF handling for
> > everything else it accesses, including firmware tables.
> >
> > AFAICT this also means that entering via startup_32 is broken entirely
> > for any configuration that enables the cc blob config table check,
> > regardless of the platform.
>
> Lemme brain-dump what Tom and I just talked on IRC.
>
> That startup_32 entry path for SNP guests was used with old grubs which
> used to enter through there and not anymore, reportedly. Which means,
> that must've worked at some point but Joerg would know. CCed.
>

Sadly, not only 'old' grubs - GRUB mainline only recently added
support for booting Linux/x86 via the EFI stub (because I wrote the
code for them), but it will still fall back to the previous mode for
kernels that are built without EFI stub support, or which are older
than ~v5.8 (because their EFI stub does not implement the generic EFI
initrd loading mechanism)

This fallback still appears to enter via startup_32, even when GRUB
itself runs in long mode in the context of EFI.

> Newer grubs enter through the 64-bit entry point and thus are fine
> - otherwise we would be seeing explosions left and right.
>

Yeah. what seems to be saving our ass here is that startup_32 maps the
first 1G of physical address space 4 times, and x86_64 EFI usually
puts firmware tables below 4G. This means the cc blob check doesn't
fault, but it may dereference bogus memory traversing the config table
array looking for the cc blob GUID. However, the system table field
holding the size of the array may also appear as bogus so this may
still break in weird ways.

> So dependent on what we wanna do, if we kill the 32-bit path, we can
> kill the 32-bit C-bit verif code. But that's for later and an item on my
> TODO list.
>

I don't think we can kill it yet, but it would be nice if we could
avoid the need to support SNP boot when entering that way.
Ard Biesheuvel Aug. 3, 2023, 2:27 p.m. UTC | #28
On Thu, 3 Aug 2023 at 13:11, Ard Biesheuvel <ardb@kernel.org> wrote:
>
> On Wed, 2 Aug 2023 at 17:52, Borislav Petkov <bp@alien8.de> wrote:
> >
> > On Wed, Aug 02, 2023 at 04:55:27PM +0200, Ard Biesheuvel wrote:
> > > ... because now, entering via startup_32 is broken, given that it only
> > > maps the kernel image itself and relies on the #PF handling for
> > > everything else it accesses, including firmware tables.
> > >
> > > AFAICT this also means that entering via startup_32 is broken entirely
> > > for any configuration that enables the cc blob config table check,
> > > regardless of the platform.
> >
> > Lemme brain-dump what Tom and I just talked on IRC.
> >
> > That startup_32 entry path for SNP guests was used with old grubs which
> > used to enter through there and not anymore, reportedly. Which means,
> > that must've worked at some point but Joerg would know. CCed.
> >
>
> Sadly, not only 'old' grubs - GRUB mainline only recently added
> support for booting Linux/x86 via the EFI stub (because I wrote the
> code for them), but it will still fall back to the previous mode for
> kernels that are built without EFI stub support, or which are older
> than ~v5.8 (because their EFI stub does not implement the generic EFI
> initrd loading mechanism)
>
> This fallback still appears to enter via startup_32, even when GRUB
> itself runs in long mode in the context of EFI.
>
> > Newer grubs enter through the 64-bit entry point and thus are fine
> > - otherwise we would be seeing explosions left and right.
> >
>
> Yeah. what seems to be saving our ass here is that startup_32 maps the
> first 1G of physical address space 4 times, and x86_64 EFI usually
> puts firmware tables below 4G. This means the cc blob check doesn't
> fault, but it may dereference bogus memory traversing the config table
> array looking for the cc blob GUID. However, the system table field
> holding the size of the array may also appear as bogus so this may
> still break in weird ways.
>
> > So dependent on what we wanna do, if we kill the 32-bit path, we can
> > kill the 32-bit C-bit verif code. But that's for later and an item on my
> > TODO list.
> >
>
> I don't think we can kill it yet, but it would be nice if we could
> avoid the need to support SNP boot when entering that way.

https://lists.gnu.org/archive/html/grub-devel/2023-08/msg00005.html

Coming to your distro any decade now!
Borislav Petkov Aug. 5, 2023, 9:17 a.m. UTC | #29
On Thu, Aug 03, 2023 at 01:11:54PM +0200, Ard Biesheuvel wrote:
> Sadly, not only 'old' grubs - GRUB mainline only recently added
> support for booting Linux/x86 via the EFI stub (because I wrote the
> code for them),

haha.

> but it will still fall back to the previous mode for kernels that are
> built without EFI stub support, or which are older than ~v5.8 (because
> their EFI stub does not implement the generic EFI initrd loading
> mechanism)

The thing is, those SNP kernels pretty much use the EFI boot mechanism.
I mean, don't take my word for it as I run SNP guests only from time to
time but that's what everyone uses AFAIK.

> Yeah. what seems to be saving our ass here is that startup_32 maps the
> first 1G of physical address space 4 times, and x86_64 EFI usually
> puts firmware tables below 4G. This means the cc blob check doesn't
> fault, but it may dereference bogus memory traversing the config table
> array looking for the cc blob GUID. However, the system table field
> holding the size of the array may also appear as bogus so this may
> still break in weird ways.

Oh fun.

> I don't think we can kill it yet, but it would be nice if we could
> avoid the need to support SNP boot when entering that way.

That's what I meant - not boot SNP guests through the 32-bit entry path.

Thx.
Borislav Petkov Aug. 5, 2023, 9:19 a.m. UTC | #30
On Thu, Aug 03, 2023 at 04:27:41PM +0200, Ard Biesheuvel wrote:
> https://lists.gnu.org/archive/html/grub-devel/2023-08/msg00005.html
> 
> Coming to your distro any decade now!

Cool. The less 32-bit crap we have to deal with, the better.

Thx.
Ard Biesheuvel Aug. 6, 2023, 9 a.m. UTC | #31
On Sat, 5 Aug 2023 at 11:18, Borislav Petkov <bp@alien8.de> wrote:
>
> On Thu, Aug 03, 2023 at 01:11:54PM +0200, Ard Biesheuvel wrote:
> > Sadly, not only 'old' grubs - GRUB mainline only recently added
> > support for booting Linux/x86 via the EFI stub (because I wrote the
> > code for them),
>
> haha.
>
> > but it will still fall back to the previous mode for kernels that are
> > built without EFI stub support, or which are older than ~v5.8 (because
> > their EFI stub does not implement the generic EFI initrd loading
> > mechanism)
>
> The thing is, those SNP kernels pretty much use the EFI boot mechanism.
> I mean, don't take my word for it as I run SNP guests only from time to
> time but that's what everyone uses AFAIK.
>
> > Yeah. what seems to be saving our ass here is that startup_32 maps the
> > first 1G of physical address space 4 times, and x86_64 EFI usually
> > puts firmware tables below 4G. This means the cc blob check doesn't
> > fault, but it may dereference bogus memory traversing the config table
> > array looking for the cc blob GUID. However, the system table field
> > holding the size of the array may also appear as bogus so this may
> > still break in weird ways.
>
> Oh fun.
>

This is not actually true, I misread the code.

The initial mapping is 1:1 for the lower 4G of system memory, so
anything that lives there is accessible before the demand paging stuff
is up and running.

IOW, your change should be sufficient to fix this even when entering
via the 32-bit entry point.
diff mbox series

Patch

diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 1a3e2c05a8a5..664aefa6e896 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -28,6 +28,7 @@ 
 #include <asm/setup.h>
 #include <asm/set_memory.h>
 #include <asm/cpu.h>
+#include <asm/efi.h>
 
 #ifdef CONFIG_ACPI
 /*
@@ -86,10 +87,12 @@  const struct kexec_file_ops * const kexec_file_loaders[] = {
 #endif
 
 static int
-map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)
+map_efi_tables(struct x86_mapping_info *info, pgd_t *level4p)
 {
 #ifdef CONFIG_EFI
 	unsigned long mstart, mend;
+	void *kaddr;
+	int ret;
 
 	if (!efi_enabled(EFI_BOOT))
 		return 0;
@@ -105,6 +108,30 @@  map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)
 	if (!mstart)
 		return 0;
 
+	ret = kernel_ident_mapping_init(info, level4p, mstart, mend);
+	if (ret)
+		return ret;
+
+	kaddr = memremap(mstart, mend - mstart, MEMREMAP_WB);
+	if (!kaddr) {
+		pr_err("Could not map UEFI system table\n");
+		return -ENOMEM;
+	}
+
+	mstart = efi_config_table;
+
+	if (efi_enabled(EFI_64BIT)) {
+		efi_system_table_64_t *stbl = (efi_system_table_64_t *)kaddr;
+
+		mend = mstart + sizeof(efi_config_table_64_t) * stbl->nr_tables;
+	} else {
+		efi_system_table_32_t *stbl = (efi_system_table_32_t *)kaddr;
+
+		mend = mstart + sizeof(efi_config_table_32_t) * stbl->nr_tables;
+	}
+
+	memunmap(kaddr);
+
 	return kernel_ident_mapping_init(info, level4p, mstart, mend);
 #endif
 	return 0;
@@ -244,10 +271,10 @@  static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
 	}
 
 	/*
-	 * Prepare EFI systab and ACPI tables for kexec kernel since they are
-	 * not covered by pfn_mapped.
+	 * Prepare EFI systab, config table and ACPI tables for kexec kernel
+	 * since they are not covered by pfn_mapped.
 	 */
-	result = map_efi_systab(&info, level4p);
+	result = map_efi_tables(&info, level4p);
 	if (result)
 		return result;