[v10,11/50] x86/sev: Add helper functions for RMPUPDATE and PSMASH instructions

Message ID 20231016132819.1002933-12-michael.roth@amd.com
State New
Series Add AMD Secure Nested Paging (SEV-SNP) Hypervisor Support

Commit Message

Michael Roth Oct. 16, 2023, 1:27 p.m. UTC
From: Brijesh Singh <brijesh.singh@amd.com>

The RMPUPDATE instruction writes a new RMP entry in the RMP Table. The
hypervisor will use the instruction to add pages to the RMP table. See
APM3 for details on the instruction operations.

The PSMASH instruction expands a 2MB RMP entry into a corresponding set
of contiguous 4KB-Page RMP entries. The hypervisor will use this
instruction to adjust the RMP entry without invalidating the previous
RMP entry.

Add the following external interface API functions:

psmash():
  Used to smash a 2MB aligned page into 4K pages while preserving the
  Validated bit in the RMP.

rmp_make_private():
  Used to assign a page to guest using the RMPUPDATE instruction.

rmp_make_shared():
  Used to transition a page to hypervisor/shared state using the
  RMPUPDATE instruction.
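
For illustration, a minimal sketch of how a caller might drive these
helpers when assigning a 2MB page to a guest and later smashing it
back to 4K pages (the PFN/GPA/ASID values are placeholders, not taken
from real KVM code):

	/* Sketch only: placeholder values, no real KVM plumbing. */
	static int snp_rmp_example(void)
	{
		u64 pfn = 0x123400;	/* 2MB-aligned host PFN (placeholder) */
		u64 gpa = 0x200000;	/* 2MB-aligned guest physical address */
		int ret;

		/* Assign the whole 2MB page to the guest (ASID 1). */
		ret = rmp_make_private(pfn, gpa, PG_LEVEL_2M, 1, false);
		if (ret)
			return ret;

		/* Split into 512 4K entries, preserving the Validated bit. */
		ret = psmash(pfn);
		if (ret)
			return ret;

		/* Return the first 4K page to hypervisor/shared state. */
		return rmp_make_shared(pfn, PG_LEVEL_4K);
	}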

Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
[mdr: add RMPUPDATE retry logic for transient FAIL_OVERLAP errors]
Signed-off-by: Michael Roth <michael.roth@amd.com>
---
 arch/x86/include/asm/sev-common.h | 14 +++++
 arch/x86/include/asm/sev-host.h   | 10 ++++
 arch/x86/virt/svm/sev.c           | 89 +++++++++++++++++++++++++++++++
 3 files changed, 113 insertions(+)

Comments

Borislav Petkov Nov. 21, 2023, 4:21 p.m. UTC | #1
On Mon, Oct 16, 2023 at 08:27:40AM -0500, Michael Roth wrote:
> From: Brijesh Singh <brijesh.singh@amd.com>
> 
> The RMPUPDATE instruction writes a new RMP entry in the RMP Table. The
> hypervisor will use the instruction to add pages to the RMP table. See
> APM3 for details on the instruction operations.
> 
> The PSMASH instruction expands a 2MB RMP entry into a corresponding set
> of contiguous 4KB-Page RMP entries. The hypervisor will use this

s/-Page//

> instruction to adjust the RMP entry without invalidating the previous
> RMP entry.

"... without invalidating it."

This below is useless text in the commit message - that should be all
visible from the patch itself.

> Add the following external interface API functions:
> 
> psmash(): Used to smash a 2MB aligned page into 4K pages while
> preserving the Validated bit in the RMP.
> 
> rmp_make_private(): Used to assign a page to guest using the RMPUPDATE
> instruction.
> 
> rmp_make_shared(): Used to transition a page to hypervisor/shared
> state using the RMPUPDATE instruction.

> 
> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>

Since Brijesh is the author, first comes his SOB, then Ashish's and then
yours.

> [mdr: add RMPUPDATE retry logic for transient FAIL_OVERLAP errors]
> Signed-off-by: Michael Roth <michael.roth@amd.com>
> ---
>  arch/x86/include/asm/sev-common.h | 14 +++++
>  arch/x86/include/asm/sev-host.h   | 10 ++++
>  arch/x86/virt/svm/sev.c           | 89 +++++++++++++++++++++++++++++++
>  3 files changed, 113 insertions(+)
> 
> diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
> index 1e6fb93d8ab0..93ec8c12c91d 100644
> --- a/arch/x86/include/asm/sev-common.h
> +++ b/arch/x86/include/asm/sev-common.h
> @@ -173,8 +173,22 @@ struct snp_psc_desc {
>  #define GHCB_ERR_INVALID_INPUT		5
>  #define GHCB_ERR_INVALID_EVENT		6
>  
> +/* RMPUPDATE detected 4K page and 2MB page overlap. */
> +#define RMPUPDATE_FAIL_OVERLAP		4
> +
>  /* RMP page size */
>  #define RMP_PG_SIZE_4K			0
> +#define RMP_PG_SIZE_2M			1

RMP_PG_LEVEL_2M

>  #define RMP_TO_X86_PG_LEVEL(level)	(((level) == RMP_PG_SIZE_4K) ? PG_LEVEL_4K : PG_LEVEL_2M)
> +#define X86_TO_RMP_PG_LEVEL(level)	(((level) == PG_LEVEL_4K) ? RMP_PG_SIZE_4K : RMP_PG_SIZE_2M)
> +
> +struct rmp_state {
> +	u64 gpa;
> +	u8 assigned;
> +	u8 pagesize;
> +	u8 immutable;
> +	u8 rsvd;
> +	u32 asid;
> +} __packed;
>  
>  #endif
> diff --git a/arch/x86/include/asm/sev-host.h b/arch/x86/include/asm/sev-host.h
> index bb06c57f2909..1df989411334 100644
> --- a/arch/x86/include/asm/sev-host.h
> +++ b/arch/x86/include/asm/sev-host.h
> @@ -16,9 +16,19 @@
>  #ifdef CONFIG_KVM_AMD_SEV
>  int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level);
>  void sev_dump_hva_rmpentry(unsigned long address);
> +int psmash(u64 pfn);
> +int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, int asid, bool immutable);
> +int rmp_make_shared(u64 pfn, enum pg_level level);
>  #else
>  static inline int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level) { return -ENXIO; }
>  static inline void sev_dump_hva_rmpentry(unsigned long address) {}
> +static inline int psmash(u64 pfn) { return -ENXIO; }
> +static inline int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, int asid,
> +				   bool immutable)
> +{
> +	return -ENXIO;
> +}
> +static inline int rmp_make_shared(u64 pfn, enum pg_level level) { return -ENXIO; }
>  #endif
>  
>  #endif
> diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c
> index cac3e311c38f..24a695af13a5 100644
> --- a/arch/x86/virt/svm/sev.c
> +++ b/arch/x86/virt/svm/sev.c
> @@ -367,3 +367,92 @@ void sev_dump_hva_rmpentry(unsigned long hva)
>  	sev_dump_rmpentry(pte_pfn(*pte));
>  }
>  EXPORT_SYMBOL_GPL(sev_dump_hva_rmpentry);
> +
> +/*
> + * PSMASH a 2MB aligned page into 4K pages in the RMP table while preserving the
> + * Validated bit.
> + */
> +int psmash(u64 pfn)
> +{
> +	unsigned long paddr = pfn << PAGE_SHIFT;
> +	int ret;
> +
> +	pr_debug("%s: PFN: 0x%llx\n", __func__, pfn);

Left over?

> +
> +	if (!pfn_valid(pfn))
> +		return -EINVAL;
> +
> +	if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
> +		return -ENXIO;

That needs to happen first in the function.

> +
> +	/* Binutils version 2.36 supports the PSMASH mnemonic. */
> +	asm volatile(".byte 0xF3, 0x0F, 0x01, 0xFF"
> +		      : "=a"(ret)
> +		      : "a"(paddr)

Add an empty space between the " and the (

> +		      : "memory", "cc");
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(psmash);
> +
> +static int rmpupdate(u64 pfn, struct rmp_state *val)

rmp_state *state

so that it is clear what this is.

> +{
> +	unsigned long paddr = pfn << PAGE_SHIFT;
> +	int ret, level, npages;
> +	int attempts = 0;
> +
> +	if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
> +		return -ENXIO;
> +
> +	do {
> +		/* Binutils version 2.36 supports the RMPUPDATE mnemonic. */
> +		asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFE"
> +			     : "=a"(ret)
> +			     : "a"(paddr), "c"((unsigned long)val)

Add an empty space between the " and the (

> +			     : "memory", "cc");
> +
> +		attempts++;
> +	} while (ret == RMPUPDATE_FAIL_OVERLAP);

What's the logic here? Loop as long as it says "overlap"?

How "transient" is that overlapping condition?

What's the upper limit of that loop?

This loop should check a generously chosen upper limit of attempts and
then break if that limit is reached.

> +	if (ret) {
> +		pr_err("RMPUPDATE failed after %d attempts, ret: %d, pfn: %llx, npages: %d, level: %d\n",
> +		       attempts, ret, pfn, npages, level);

You're dumping here uninitialized stack variables npages and level.
Looks like leftover from some prior version of this function.

> +		sev_dump_rmpentry(pfn);
> +		dump_stack();

This is going to become real noisy on a huge machine with a lot of SNP
guests.

> +		return -EFAULT;
> +	}
> +
> +	return 0;
> +}
> +
> +/*
> + * Assign a page to guest using the RMPUPDATE instruction.
> + */

One-line comment works too.

/* Assign ... */

> +int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, int asid, bool immutable)
> +{
> +	struct rmp_state val;
> +
> +	memset(&val, 0, sizeof(val));
> +	val.assigned = 1;
> +	val.asid = asid;
> +	val.immutable = immutable;
> +	val.gpa = gpa;
> +	val.pagesize = X86_TO_RMP_PG_LEVEL(level);
> +
> +	return rmpupdate(pfn, &val);
> +}
> +EXPORT_SYMBOL_GPL(rmp_make_private);
> +
> +/*
> + * Transition a page to hypervisor/shared state using the RMPUPDATE instruction.
> + */

Ditto.

> +int rmp_make_shared(u64 pfn, enum pg_level level)
> +{
> +	struct rmp_state val;
> +
> +	memset(&val, 0, sizeof(val));
> +	val.pagesize = X86_TO_RMP_PG_LEVEL(level);
> +
> +	return rmpupdate(pfn, &val);
> +}
> +EXPORT_SYMBOL_GPL(rmp_make_shared);
> -- 

Thx.

Michael Roth Dec. 19, 2023, 4:20 p.m. UTC | #2
On Tue, Nov 21, 2023 at 05:21:49PM +0100, Borislav Petkov wrote:
> On Mon, Oct 16, 2023 at 08:27:40AM -0500, Michael Roth wrote:
> > +static int rmpupdate(u64 pfn, struct rmp_state *val)
> 
> rmp_state *state
> 
> so that it is clear what this is.
> 
> > +{
> > +	unsigned long paddr = pfn << PAGE_SHIFT;
> > +	int ret, level, npages;
> > +	int attempts = 0;
> > +
> > +	if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
> > +		return -ENXIO;
> > +
> > +	do {
> > +		/* Binutils version 2.36 supports the RMPUPDATE mnemonic. */
> > +		asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFE"
> > +			     : "=a"(ret)
> > +			     : "a"(paddr), "c"((unsigned long)val)
> 
> Add an empty space between the " and the (
> 
> > +			     : "memory", "cc");
> > +
> > +		attempts++;
> > +	} while (ret == RMPUPDATE_FAIL_OVERLAP);
> 
> What's the logic here? Loop as long as it says "overlap"?
> 
> How "transient" is that overlapping condition?
> 
> What's the upper limit of that loop?
> 
> This loop should check a generously chosen upper limit of attempts and
> then break if that limit is reached.

We've raised similar questions to David Kaplan and discussed this to a
fair degree.

The transient condition here is due to firmware locking the 2MB-aligned
RMP entry for the range to handle atomic updates. There is no upper
bound on retries or the amount of time spent, but the condition is
always transient: multiple hypervisor implementations now depend on
that behavior, and any deviation from it would constitute a firmware
regression.

A good torture test for this path is lots of 4K-only guests doing
concurrent boots/shutdowns in a tight loop. In week-long runs the
longest delay seen was on the order of 100ns, but there is no real
correlation between time spent and number of retries: sometimes a
100ns delay involves only 1 retry, while much shorter delays sometimes
involve hundreds of retries. It all depends on what firmware is doing,
so there's no way to infer a safe retry limit from that data.
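
To make the trade-off concrete, a capped variant of the loop would
look roughly like this (the limit is an arbitrary illustrative value,
not something derived from the firmware spec):

	/* Illustrative only: the cap value is arbitrary. */
	#define RMPUPDATE_MAX_ATTEMPTS	10000

	do {
		/* Binutils version 2.36 supports the RMPUPDATE mnemonic. */
		asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFE"
			     : "=a" (ret)
			     : "a" (paddr), "c" ((unsigned long)val)
			     : "memory", "cc");
	} while (ret == RMPUPDATE_FAIL_OVERLAP &&
		 ++attempts < RMPUPDATE_MAX_ATTEMPTS);

But given the lack of correlation between retry count and elapsed
time, whatever limit we pick would be arbitrary, and a cap that fires
under heavy load would turn a transient condition into a spurious
guest failure.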

All that said, there are unfortunately other conditions that can
trigger non-transient RMPUPDATE_FAIL_OVERLAP failures, and those will
result in an infinite loop. They are the result of host misbehavior,
however, like trying to set up 2MB private RMP entries when there are
already private 4K entries in the range. Ideally these would be
separate error codes, but even if that were changed in firmware we'd
still need code to support older firmware that doesn't disambiguate,
so I'm not sure this situation can be improved much.
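
As a concrete (hypothetical) example of that misbehavior, where
pfn_2m/gpa_2m are the containing 2MB-aligned pfn/gpa:

	rmp_make_private(pfn, gpa, PG_LEVEL_4K, asid, false);
	/*
	 * The 2MB entry overlaps a private 4K entry, so this spins
	 * on RMPUPDATE_FAIL_OVERLAP indefinitely:
	 */
	rmp_make_private(pfn_2m, gpa_2m, PG_LEVEL_2M, asid, false);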

> 
> > +	if (ret) {
> > +		pr_err("RMPUPDATE failed after %d attempts, ret: %d, pfn: %llx, npages: %d, level: %d\n",
> > +		       attempts, ret, pfn, npages, level);
> 
> You're dumping here uninitialized stack variables npages and level.
> Looks like leftover from some prior version of this function.

Yah, I'll clean this up. I think logging the attempts probably doesn't
have much use anymore either.

> 
> > +		sev_dump_rmpentry(pfn);
> > +		dump_stack();
> 
> This is going to become real noisy on a huge machine with a lot of SNP
> guests.

Since the transient case will eventually resolve to ret==0, we will only
get here on a kernel oops sort of condition where a stack dump seems
appropriate. rmpupdate() shouldn't error during normal operation, and if
it ever does it will likely be a fatal situation where those stack dumps
will be useful.

Thanks,

Mike

> 
> Thx.
> 
> -- 
> Regards/Gruss,
>     Boris.
> 
> https://people.kernel.org/tglx/notes-about-netiquette
>

Patch

diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
index 1e6fb93d8ab0..93ec8c12c91d 100644
--- a/arch/x86/include/asm/sev-common.h
+++ b/arch/x86/include/asm/sev-common.h
@@ -173,8 +173,22 @@ struct snp_psc_desc {
 #define GHCB_ERR_INVALID_INPUT		5
 #define GHCB_ERR_INVALID_EVENT		6
 
+/* RMPUPDATE detected 4K page and 2MB page overlap. */
+#define RMPUPDATE_FAIL_OVERLAP		4
+
 /* RMP page size */
 #define RMP_PG_SIZE_4K			0
+#define RMP_PG_SIZE_2M			1
 #define RMP_TO_X86_PG_LEVEL(level)	(((level) == RMP_PG_SIZE_4K) ? PG_LEVEL_4K : PG_LEVEL_2M)
+#define X86_TO_RMP_PG_LEVEL(level)	(((level) == PG_LEVEL_4K) ? RMP_PG_SIZE_4K : RMP_PG_SIZE_2M)
+
+struct rmp_state {
+	u64 gpa;
+	u8 assigned;
+	u8 pagesize;
+	u8 immutable;
+	u8 rsvd;
+	u32 asid;
+} __packed;
 
 #endif
diff --git a/arch/x86/include/asm/sev-host.h b/arch/x86/include/asm/sev-host.h
index bb06c57f2909..1df989411334 100644
--- a/arch/x86/include/asm/sev-host.h
+++ b/arch/x86/include/asm/sev-host.h
@@ -16,9 +16,19 @@
 #ifdef CONFIG_KVM_AMD_SEV
 int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level);
 void sev_dump_hva_rmpentry(unsigned long address);
+int psmash(u64 pfn);
+int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, int asid, bool immutable);
+int rmp_make_shared(u64 pfn, enum pg_level level);
 #else
 static inline int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level) { return -ENXIO; }
 static inline void sev_dump_hva_rmpentry(unsigned long address) {}
+static inline int psmash(u64 pfn) { return -ENXIO; }
+static inline int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, int asid,
+				   bool immutable)
+{
+	return -ENXIO;
+}
+static inline int rmp_make_shared(u64 pfn, enum pg_level level) { return -ENXIO; }
 #endif
 
 #endif
diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c
index cac3e311c38f..24a695af13a5 100644
--- a/arch/x86/virt/svm/sev.c
+++ b/arch/x86/virt/svm/sev.c
@@ -367,3 +367,92 @@ void sev_dump_hva_rmpentry(unsigned long hva)
 	sev_dump_rmpentry(pte_pfn(*pte));
 }
 EXPORT_SYMBOL_GPL(sev_dump_hva_rmpentry);
+
+/*
+ * PSMASH a 2MB aligned page into 4K pages in the RMP table while preserving the
+ * Validated bit.
+ */
+int psmash(u64 pfn)
+{
+	unsigned long paddr = pfn << PAGE_SHIFT;
+	int ret;
+
+	pr_debug("%s: PFN: 0x%llx\n", __func__, pfn);
+
+	if (!pfn_valid(pfn))
+		return -EINVAL;
+
+	if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
+		return -ENXIO;
+
+	/* Binutils version 2.36 supports the PSMASH mnemonic. */
+	asm volatile(".byte 0xF3, 0x0F, 0x01, 0xFF"
+		      : "=a"(ret)
+		      : "a"(paddr)
+		      : "memory", "cc");
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(psmash);
+
+static int rmpupdate(u64 pfn, struct rmp_state *val)
+{
+	unsigned long paddr = pfn << PAGE_SHIFT;
+	int ret, level, npages;
+	int attempts = 0;
+
+	if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
+		return -ENXIO;
+
+	do {
+		/* Binutils version 2.36 supports the RMPUPDATE mnemonic. */
+		asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFE"
+			     : "=a"(ret)
+			     : "a"(paddr), "c"((unsigned long)val)
+			     : "memory", "cc");
+
+		attempts++;
+	} while (ret == RMPUPDATE_FAIL_OVERLAP);
+
+	if (ret) {
+		pr_err("RMPUPDATE failed after %d attempts, ret: %d, pfn: %llx, npages: %d, level: %d\n",
+		       attempts, ret, pfn, npages, level);
+		sev_dump_rmpentry(pfn);
+		dump_stack();
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
+/*
+ * Assign a page to guest using the RMPUPDATE instruction.
+ */
+int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, int asid, bool immutable)
+{
+	struct rmp_state val;
+
+	memset(&val, 0, sizeof(val));
+	val.assigned = 1;
+	val.asid = asid;
+	val.immutable = immutable;
+	val.gpa = gpa;
+	val.pagesize = X86_TO_RMP_PG_LEVEL(level);
+
+	return rmpupdate(pfn, &val);
+}
+EXPORT_SYMBOL_GPL(rmp_make_private);
+
+/*
+ * Transition a page to hypervisor/shared state using the RMPUPDATE instruction.
+ */
+int rmp_make_shared(u64 pfn, enum pg_level level)
+{
+	struct rmp_state val;
+
+	memset(&val, 0, sizeof(val));
+	val.pagesize = X86_TO_RMP_PG_LEVEL(level);
+
+	return rmpupdate(pfn, &val);
+}
+EXPORT_SYMBOL_GPL(rmp_make_shared);