[PATCHv4,6/8] x86/mm: Provide helpers for unaccepted memory

Message ID 20220405234343.74045-7-kirill.shutemov@linux.intel.com
State New
Series mm, x86/cc: Implement support for unaccepted memory

Commit Message

Kirill A. Shutemov April 5, 2022, 11:43 p.m. UTC
Core-mm requires a few helpers to support unaccepted memory:

 - accept_memory() checks the range of addresses against the bitmap and
   accepts memory if needed.

 - memory_is_unaccepted() checks whether anything within the range
   requires acceptance.

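To illustrate the intended usage from core-mm (a hypothetical call
site, for illustration only; the real call sites come later in the
series):

	/*
	 * Hypothetical core-mm caller: accept the physical range backing
	 * a freshly allocated page before handing it out.
	 */
	static void example_prep_new_page(struct page *page, unsigned int order)
	{
		phys_addr_t start = page_to_phys(page);
		phys_addr_t end = start + (PAGE_SIZE << order);

		if (memory_is_unaccepted(start, end))
			accept_memory(start, end);
	}
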
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 arch/x86/include/asm/page.h              |  5 +++
 arch/x86/include/asm/unaccepted_memory.h |  1 +
 arch/x86/mm/Makefile                     |  2 +
 arch/x86/mm/unaccepted_memory.c          | 53 ++++++++++++++++++++++++
 4 files changed, 61 insertions(+)
 create mode 100644 arch/x86/mm/unaccepted_memory.c

Comments

Dave Hansen April 8, 2022, 6:15 p.m. UTC | #1
On 4/5/22 16:43, Kirill A. Shutemov wrote:
> Core-mm requires a few helpers to support unaccepted memory:
> 
>  - accept_memory() checks the range of addresses against the bitmap and
>    accepts memory if needed.
> 
>  - memory_is_unaccepted() checks whether anything within the range
>    requires acceptance.
> 
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>  arch/x86/include/asm/page.h              |  5 +++
>  arch/x86/include/asm/unaccepted_memory.h |  1 +
>  arch/x86/mm/Makefile                     |  2 +
>  arch/x86/mm/unaccepted_memory.c          | 53 ++++++++++++++++++++++++
>  4 files changed, 61 insertions(+)
>  create mode 100644 arch/x86/mm/unaccepted_memory.c
> 
> diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
> index 9cc82f305f4b..9ae0064f97e5 100644
> --- a/arch/x86/include/asm/page.h
> +++ b/arch/x86/include/asm/page.h
> @@ -19,6 +19,11 @@
>  struct page;
>  
>  #include <linux/range.h>
> +
> +#ifdef CONFIG_UNACCEPTED_MEMORY
> +#include <asm/unaccepted_memory.h>
> +#endif

It's a lot nicer to just do the #ifdefs inside the header.  Is there a
specific reason to do it this way?
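
For reference, the usual pattern looks something like this (untested
sketch; the #else stubs are my guesses, not from the patch):

	/* arch/x86/include/asm/unaccepted_memory.h */
	#ifdef CONFIG_UNACCEPTED_MEMORY
	void accept_memory(phys_addr_t start, phys_addr_t end);
	bool memory_is_unaccepted(phys_addr_t start, phys_addr_t end);
	#else
	static inline void accept_memory(phys_addr_t start, phys_addr_t end) { }
	static inline bool memory_is_unaccepted(phys_addr_t start,
						phys_addr_t end)
	{
		return false;
	}
	#endif

That way page.h (and everyone else) can include the header without
wrapping the #include in an #ifdef.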

>  extern struct range pfn_mapped[];
>  extern int nr_pfn_mapped;
>  
> diff --git a/arch/x86/include/asm/unaccepted_memory.h b/arch/x86/include/asm/unaccepted_memory.h
> index f1f835d3cd78..a8d12ef1bda8 100644
> --- a/arch/x86/include/asm/unaccepted_memory.h
> +++ b/arch/x86/include/asm/unaccepted_memory.h
> @@ -10,5 +10,6 @@ struct boot_params;
>  void mark_unaccepted(struct boot_params *params, u64 start, u64 num);
>  
>  void accept_memory(phys_addr_t start, phys_addr_t end);
> +bool memory_is_unaccepted(phys_addr_t start, phys_addr_t end);
>  
>  #endif
> diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
> index fe3d3061fc11..e327f83e6bbf 100644
> --- a/arch/x86/mm/Makefile
> +++ b/arch/x86/mm/Makefile
> @@ -60,3 +60,5 @@ obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt_amd.o
>  
>  obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt_identity.o
>  obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt_boot.o
> +
> +obj-$(CONFIG_UNACCEPTED_MEMORY)	+= unaccepted_memory.o
> diff --git a/arch/x86/mm/unaccepted_memory.c b/arch/x86/mm/unaccepted_memory.c
> new file mode 100644
> index 000000000000..3588a7cb954c
> --- /dev/null
> +++ b/arch/x86/mm/unaccepted_memory.c
> @@ -0,0 +1,53 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +#include <linux/memblock.h>
> +#include <linux/mm.h>
> +#include <linux/pfn.h>
> +#include <linux/spinlock.h>
> +
> +#include <asm/io.h>
> +#include <asm/setup.h>
> +#include <asm/unaccepted_memory.h>
> +
> +static DEFINE_SPINLOCK(unaccepted_memory_lock);

We need some documentation on what the lock does, either here or in the
changelog.
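
Something as simple as this would do (a suggested comment, not text
from the patch):

	/*
	 * unaccepted_memory_lock protects the unaccepted memory bitmap:
	 * checking bits and clearing them after acceptance must be
	 * serialized so a range is never accepted twice or reported as
	 * accepted while the actual acceptance is still in flight.
	 */
	static DEFINE_SPINLOCK(unaccepted_memory_lock);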

> +void accept_memory(phys_addr_t start, phys_addr_t end)
> +{
> +	unsigned long *unaccepted_memory;
> +	unsigned long flags;
> +	unsigned int rs, re;
> +
> +	if (!boot_params.unaccepted_memory)
> +		return;
> +
> +	unaccepted_memory = __va(boot_params.unaccepted_memory);
> +	rs = start / PMD_SIZE;
> +
> +	spin_lock_irqsave(&unaccepted_memory_lock, flags);
> +	for_each_set_bitrange_from(rs, re, unaccepted_memory,
> +				   DIV_ROUND_UP(end, PMD_SIZE)) {
> +		/* Platform-specific memory-acceptance call goes here */
> +		panic("Cannot accept memory");
> +		bitmap_clear(unaccepted_memory, rs, re - rs);
> +	}
> +	spin_unlock_irqrestore(&unaccepted_memory_lock, flags);
> +}

That panic() is making me nervous.  Is this bisect-safe?  Is it safe
because there are no callers of this function yet?

> +bool memory_is_unaccepted(phys_addr_t start, phys_addr_t end)
> +{
> +	unsigned long *unaccepted_memory = __va(boot_params.unaccepted_memory);
> +	unsigned long flags;
> +	bool ret = false;
> +
> +	spin_lock_irqsave(&unaccepted_memory_lock, flags);
> +	while (start < end) {
> +		if (test_bit(start / PMD_SIZE, unaccepted_memory)) {
> +			ret = true;
> +			break;
> +		}
> +
> +		start += PMD_SIZE;
> +	}
> +	spin_unlock_irqrestore(&unaccepted_memory_lock, flags);
> +
> +	return ret;
> +}
Dave Hansen April 8, 2022, 7:21 p.m. UTC | #2
On 4/5/22 16:43, Kirill A. Shutemov wrote:
> +void accept_memory(phys_addr_t start, phys_addr_t end)
> +{
> +	unsigned long *unaccepted_memory;
> +	unsigned long flags;
> +	unsigned int rs, re;
> +
> +	if (!boot_params.unaccepted_memory)
> +		return;
> +
> +	unaccepted_memory = __va(boot_params.unaccepted_memory);
> +	rs = start / PMD_SIZE;
> +
> +	spin_lock_irqsave(&unaccepted_memory_lock, flags);
> +	for_each_set_bitrange_from(rs, re, unaccepted_memory,
> +				   DIV_ROUND_UP(end, PMD_SIZE)) {
> +		/* Platform-specific memory-acceptance call goes here */
> +		panic("Cannot accept memory");
> +		bitmap_clear(unaccepted_memory, rs, re - rs);
> +	}
> +	spin_unlock_irqrestore(&unaccepted_memory_lock, flags);
> +}

Just to reiterate: this is a global spinlock.  It's disabling
interrupts.  "Platform-specific memory-acceptance call" will soon become:

	tdx_accept_memory(rs * PMD_SIZE, re * PMD_SIZE);

which is a page-by-page __tdx_module_call():

> +	for (i = 0; i < (end - start) / PAGE_SIZE; i++) {
> +		if (__tdx_module_call(TDACCEPTPAGE, start + i * PAGE_SIZE,
> +				      0, 0, 0, NULL)) {
> +			error("Cannot accept memory: page accept failed\n");
> +		}
> +	}

Each __tdx_module_call() involves a privilege transition that also
surely includes things like changing CR3.  It can't be cheap.  It also
is presumably touching the memory and probably flushing it out of the
CPU caches.  It's also unbounded:

	spin_lock_irqsave(&unaccepted_memory_lock, flags);
	for (i = 0; i < (end - start) / PAGE_SIZE; i++)
		// thousands?  tens-of-thousands of cycles??
	spin_unlock_irqrestore(&unaccepted_memory_lock, flags);

How far apart can end and start be?  It's at *least* 2MB in the page
allocator, which is on the order of a millisecond.  Are we sure there
aren't any callers that want to do this at a gigabyte granularity?  That
would hold the global lock and disable interrupts on the order of a second.
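
Back-of-the-envelope, assuming ~2us per TDACCEPTPAGE call (an assumed
figure, not a measurement):

	2MB range:  2MB / 4KB =     512 calls * ~2us = ~1ms   IRQs off
	1GB range:  1GB / 4KB = 262,144 calls * ~2us = ~0.5s  IRQs off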

Do we want to bound the time that the lock can be held?  Or, should we
just let the lockup detectors tell us that we're being naughty?
Kirill A. Shutemov April 13, 2022, 4:08 p.m. UTC | #3
On Fri, Apr 08, 2022 at 12:21:19PM -0700, Dave Hansen wrote:
> On 4/5/22 16:43, Kirill A. Shutemov wrote:
> > +void accept_memory(phys_addr_t start, phys_addr_t end)
> > +{
> > +	unsigned long *unaccepted_memory;
> > +	unsigned long flags;
> > +	unsigned int rs, re;
> > +
> > +	if (!boot_params.unaccepted_memory)
> > +		return;
> > +
> > +	unaccepted_memory = __va(boot_params.unaccepted_memory);
> > +	rs = start / PMD_SIZE;
> > +
> > +	spin_lock_irqsave(&unaccepted_memory_lock, flags);
> > +	for_each_set_bitrange_from(rs, re, unaccepted_memory,
> > +				   DIV_ROUND_UP(end, PMD_SIZE)) {
> > +		/* Platform-specific memory-acceptance call goes here */
> > +		panic("Cannot accept memory");
> > +		bitmap_clear(unaccepted_memory, rs, re - rs);
> > +	}
> > +	spin_unlock_irqrestore(&unaccepted_memory_lock, flags);
> > +}
> 
> Just to reiterate: this is a global spinlock.  It's disabling
> interrupts.  "Platform-specific memory-acceptance call" will soon become:
> 
> 	tdx_accept_memory(rs * PMD_SIZE, re * PMD_SIZE);
> 
> which is a page-by-page __tdx_module_call():
> 
> > +	for (i = 0; i < (end - start) / PAGE_SIZE; i++) {
> > +		if (__tdx_module_call(TDACCEPTPAGE, start + i * PAGE_SIZE,
> > +				      0, 0, 0, NULL)) {
> > +			error("Cannot accept memory: page accept failed\n");
> > +		}
> > +	}
> 
> Each __tdx_module_call() involves a privilege transition that also
> surely includes things like changing CR3.  It can't be cheap.  It also
> is presumably touching the memory and probably flushing it out of the
> CPU caches.  It's also unbounded:
> 
> 	spin_lock_irqsave(&unaccepted_memory_lock, flags);
> 	for (i = 0; i < (end - start) / PAGE_SIZE; i++)
> 		// thousands?  tens-of-thousands of cycles??
> 	spin_unlock_irqrestore(&unaccepted_memory_lock, flags);
> 
> How far apart can end and start be?  It's at *least* 2MB in the page
> allocator, which is on the order of a millisecond.  Are we sure there
> aren't any callers that want to do this at a gigabyte granularity?  That
> would hold the global lock and disable interrupts on the order of a second.

This codepath only gets invoked for orders below MAX_ORDER, i.e. at
most 4M on x86-64.

> Do we want to bound the time that the lock can be held?  Or, should we
> just let the lockup detectors tell us that we're being naughty?

The host can always DoS the guest, so yes, this can lead to lockups.
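
For reference, if bounding were wanted, acceptance could be chunked
from the caller side so the lock is dropped between chunks
(hypothetical sketch, not part of this series; the chunk size is a
made-up tunable):

	#define ACCEPT_CHUNK	(32UL * PMD_SIZE)	/* 64MB per lock hold */

	static void accept_memory_chunked(phys_addr_t start, phys_addr_t end)
	{
		while (start < end) {
			phys_addr_t chunk_end;

			chunk_end = min_t(phys_addr_t, start + ACCEPT_CHUNK, end);
			/* accept_memory() takes and drops the lock per chunk */
			accept_memory(start, chunk_end);
			start = chunk_end;
		}
	}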

Patch

diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 9cc82f305f4b..9ae0064f97e5 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -19,6 +19,11 @@ 
 struct page;
 
 #include <linux/range.h>
+
+#ifdef CONFIG_UNACCEPTED_MEMORY
+#include <asm/unaccepted_memory.h>
+#endif
+
 extern struct range pfn_mapped[];
 extern int nr_pfn_mapped;
 
diff --git a/arch/x86/include/asm/unaccepted_memory.h b/arch/x86/include/asm/unaccepted_memory.h
index f1f835d3cd78..a8d12ef1bda8 100644
--- a/arch/x86/include/asm/unaccepted_memory.h
+++ b/arch/x86/include/asm/unaccepted_memory.h
@@ -10,5 +10,6 @@  struct boot_params;
 void mark_unaccepted(struct boot_params *params, u64 start, u64 num);
 
 void accept_memory(phys_addr_t start, phys_addr_t end);
+bool memory_is_unaccepted(phys_addr_t start, phys_addr_t end);
 
 #endif
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index fe3d3061fc11..e327f83e6bbf 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -60,3 +60,5 @@  obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt_amd.o
 
 obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt_identity.o
 obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt_boot.o
+
+obj-$(CONFIG_UNACCEPTED_MEMORY)	+= unaccepted_memory.o
diff --git a/arch/x86/mm/unaccepted_memory.c b/arch/x86/mm/unaccepted_memory.c
new file mode 100644
index 000000000000..3588a7cb954c
--- /dev/null
+++ b/arch/x86/mm/unaccepted_memory.c
@@ -0,0 +1,53 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/memblock.h>
+#include <linux/mm.h>
+#include <linux/pfn.h>
+#include <linux/spinlock.h>
+
+#include <asm/io.h>
+#include <asm/setup.h>
+#include <asm/unaccepted_memory.h>
+
+static DEFINE_SPINLOCK(unaccepted_memory_lock);
+
+void accept_memory(phys_addr_t start, phys_addr_t end)
+{
+	unsigned long *unaccepted_memory;
+	unsigned long flags;
+	unsigned int rs, re;
+
+	if (!boot_params.unaccepted_memory)
+		return;
+
+	unaccepted_memory = __va(boot_params.unaccepted_memory);
+	rs = start / PMD_SIZE;
+
+	spin_lock_irqsave(&unaccepted_memory_lock, flags);
+	for_each_set_bitrange_from(rs, re, unaccepted_memory,
+				   DIV_ROUND_UP(end, PMD_SIZE)) {
+		/* Platform-specific memory-acceptance call goes here */
+		panic("Cannot accept memory");
+		bitmap_clear(unaccepted_memory, rs, re - rs);
+	}
+	spin_unlock_irqrestore(&unaccepted_memory_lock, flags);
+}
+
+bool memory_is_unaccepted(phys_addr_t start, phys_addr_t end)
+{
+	unsigned long *unaccepted_memory = __va(boot_params.unaccepted_memory);
+	unsigned long flags;
+	bool ret = false;
+
+	spin_lock_irqsave(&unaccepted_memory_lock, flags);
+	while (start < end) {
+		if (test_bit(start / PMD_SIZE, unaccepted_memory)) {
+			ret = true;
+			break;
+		}
+
+		start += PMD_SIZE;
+	}
+	spin_unlock_irqrestore(&unaccepted_memory_lock, flags);
+
+	return ret;
+}