diff mbox series

[Part2,v6,14/49] crypto: ccp: Handle the legacy TMR allocation when SNP is enabled

Message ID 3a51840f6a80c87b39632dc728dbd9b5dd444cd7.1655761627.git.ashish.kalra@amd.com
State New
Headers show
Series Add AMD Secure Nested Paging (SEV-SNP) | expand

Commit Message

Ashish Kalra June 20, 2022, 11:05 p.m. UTC
From: Brijesh Singh <brijesh.singh@amd.com>

The behavior and requirement for the SEV-legacy command is altered when
the SNP firmware is in the INIT state. See SEV-SNP firmware specification
for more details.

Allocate the Trusted Memory Region (TMR) as a 2mb sized/aligned region
when SNP is enabled to satify new requirements for the SNP. Continue
allocating a 1mb region for !SNP configuration.

While at it, provide API that can be used by others to allocate a page
that can be used by the firmware. The immediate user for this API will
be the KVM driver. The KVM driver to need to allocate a firmware context
page during the guest creation. The context page need to be updated
by the firmware. See the SEV-SNP specification for further details.

Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
---
 drivers/crypto/ccp/sev-dev.c | 173 +++++++++++++++++++++++++++++++++--
 include/linux/psp-sev.h      |  11 +++
 2 files changed, 178 insertions(+), 6 deletions(-)

Comments

Peter Gonda June 24, 2022, 2:19 p.m. UTC | #1
On Tue, Jun 21, 2022 at 2:17 PM Kalra, Ashish <Ashish.Kalra@amd.com> wrote:
>
> [Public]
>
> Hello Peter,
>
> >> +static int snp_reclaim_pages(unsigned long pfn, unsigned int npages,
> >> +bool locked) {
> >> +       struct sev_data_snp_page_reclaim data;
> >> +       int ret, err, i, n = 0;
> >> +
> >> +       for (i = 0; i < npages; i++) {
>
> >What about setting |n| here too, also the other increments.
>
> >for (i = 0, n = 0; i < npages; i++, n++, pfn++)
>
> Yes that is simpler.
>
> >> +               memset(&data, 0, sizeof(data));
> >> +               data.paddr = pfn << PAGE_SHIFT;
> >> +
> >> +               if (locked)
> >> +                       ret = __sev_do_cmd_locked(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err);
> >> +               else
> >> +                       ret = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM,
> >> + &data, &err);
>
> > Can we change `sev_cmd_mutex` to some sort of nesting lock type? That could clean up this if (locked) code.
>
> > +static inline int rmp_make_firmware(unsigned long pfn, int level) {
> > +       return rmp_make_private(pfn, 0, level, 0, true); }
> > +
> > +static int snp_set_rmp_state(unsigned long paddr, unsigned int npages, bool to_fw, bool locked,
> > +                            bool need_reclaim)
>
> >This function can do a lot and when I read the call sites its hard to see what its doing since we have a combination of arguments which tell us what behavior is happening, some of which are not valid (ex: to_fw == true and need_reclaim == true is an >invalid argument combination).
>
> to_fw is used to make a firmware page and need_reclaim is for freeing the firmware page, so they are going to be mutually exclusive.
>
> I actually can connect with it quite logically with the callers :
> snp_alloc_firmware_pages will call with to_fw = true and need_reclaim = false
> and snp_free_firmware_pages will do the opposite, to_fw = false and need_reclaim = true.
>
> That seems straightforward to look at.

This might be a preference thing but I find it not straightforward.
When I am reading through unmap_firmware_writeable() and I see

  /* Transition the pre-allocated buffer to the firmware state. */
  if (snp_set_rmp_state(__pa(map->host), npages, true, true, false))
   return -EFAULT;

I don't actually know what snp_set_rmp_state() is doing unless I go
look at the definition and see what all those booleans mean. This is
unlike the rmp_make_shared() and rmp_make_private() functions, each of
which tells me a lot more about what the function will do just from
the name.


>
> >Also this for loop over |npages| is duplicated from snp_reclaim_pages(). One improvement here is that on the current
> >snp_reclaim_pages() if we fail to reclaim a page we assume we cannot reclaim the next pages, this may cause us to snp_leak_pages() more pages than we actually need too.
>
> Yes that is true.
>
> >What about something like this?
>
> >static snp_leak_page(u64 pfn, enum pg_level level) {
> >   memory_failure(pfn, 0);
> >   dump_rmpentry(pfn);
> >}
>
> >static int snp_reclaim_page(u64 pfn, enum pg_level level) {
> >  int ret;
> >  struct sev_data_snp_page_reclaim data;
>
> >  ret = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err);
> >  if (ret)
> >    goto cleanup;
>
> >  ret = rmp_make_shared(pfn, level);
> >  if (ret)
> >    goto cleanup;
>
> > return 0;
>
> >cleanup:
> >    snp_leak_page(pfn, level)
> >}
>
> >typedef int (*rmp_state_change_func) (u64 pfn, enum pg_level level);
>
> >static int snp_set_rmp_state(unsigned long paddr, unsigned int npages, rmp_state_change_func state_change, rmp_state_change_func cleanup) {
> >  struct sev_data_snp_page_reclaim data;
> >  int ret, err, i, n = 0;
>
> >  for (i = 0, n = 0; i < npages; i++, n++, pfn++) {
> >    ret = state_change(pfn, PG_LEVEL_4K)
> >    if (ret)
> >      goto cleanup;
> >  }
>
> >  return 0;
>
> > cleanup:
> >  for (; i>= 0; i--, n--, pfn--) {
> >    cleanup(pfn, PG_LEVEL_4K);
> >  }
>
> >  return ret;
> >}
>
> >Then inside of __snp_alloc_firmware_pages():
>
> >snp_set_rmp_state(paddr, npages, rmp_make_firmware, snp_reclaim_page);
>
> >And inside of __snp_free_firmware_pages():
>
> >snp_set_rmp_state(paddr, npages, snp_reclaim_page, snp_leak_page);
>
> >Just a suggestion feel free to ignore. The readability comment could be addressed much less invasively by just making separate functions for each valid combination of arguments here. Like snp_set_rmp_fw_state(), snp_set_rmp_shared_state(),
> >snp_set_rmp_release_state() or something.
>
> >> +static struct page *__snp_alloc_firmware_pages(gfp_t gfp_mask, int
> >> +order, bool locked) {
> >> +       unsigned long npages = 1ul << order, paddr;
> >> +       struct sev_device *sev;
> >> +       struct page *page;
> >> +
> >> +       if (!psp_master || !psp_master->sev_data)
> >> +               return NULL;
> >> +
> >> +       page = alloc_pages(gfp_mask, order);
> >> +       if (!page)
> >> +               return NULL;
> >> +
> >> +       /* If SEV-SNP is initialized then add the page in RMP table. */
> >> +       sev = psp_master->sev_data;
> >> +       if (!sev->snp_inited)
> >> +               return page;
> >> +
> >> +       paddr = __pa((unsigned long)page_address(page));
> >> +       if (snp_set_rmp_state(paddr, npages, true, locked, false))
> >> +               return NULL;
>
> >So what about the case where snp_set_rmp_state() fails but we were able to reclaim all the pages? Should we be able to signal that to callers so that we could free |page| here? But given this is an error path already maybe we can optimize this in a >follow up series.
>
> Yes, we should actually tie in to snp_reclaim_pages() success or failure here in the case we were able to successfully unroll some or all of the firmware state change.
>
> > +
> > +       return page;
> > +}
> > +
> > +void *snp_alloc_firmware_page(gfp_t gfp_mask) {
> > +       struct page *page;
> > +
> > +       page = __snp_alloc_firmware_pages(gfp_mask, 0, false);
> > +
> > +       return page ? page_address(page) : NULL; }
> > +EXPORT_SYMBOL_GPL(snp_alloc_firmware_page);
> > +
> > +static void __snp_free_firmware_pages(struct page *page, int order,
> > +bool locked) {
> > +       unsigned long paddr, npages = 1ul << order;
> > +
> > +       if (!page)
> > +               return;
> > +
> > +       paddr = __pa((unsigned long)page_address(page));
> > +       if (snp_set_rmp_state(paddr, npages, false, locked, true))
> > +               return;
>
> > Here we may be able to free some of |page| depending how where inside of snp_set_rmp_state() we failed. But again given this is an error path already maybe we can optimize this in a follow up series.
>
> Yes, we probably should be able to free some of the page(s) depending on how many page(s) got reclaimed in snp_set_rmp_state().
> But these reclamation failures may not be very common, so any failure is indicative of a bigger issue, it might be the case when there is a single page reclamation error it might happen with all the subsequent
> pages and so follow a simple recovery procedure, then handling a more complex recovery for a chunk of pages being reclaimed and another chunk not.
>
> Thanks,
> Ashish
>
>
>
Jarkko Sakkinen Aug. 2, 2022, 11:17 a.m. UTC | #2
On Mon, Jun 20, 2022 at 11:05:01PM +0000, Ashish Kalra wrote:
> From: Brijesh Singh <brijesh.singh@amd.com>
> 
> The behavior and requirement for the SEV-legacy command is altered when
> the SNP firmware is in the INIT state. See SEV-SNP firmware specification
> for more details.
> 
> Allocate the Trusted Memory Region (TMR) as a 2mb sized/aligned region
> when SNP is enabled to satify new requirements for the SNP. Continue
> allocating a 1mb region for !SNP configuration.
> 
> While at it, provide API that can be used by others to allocate a page
> that can be used by the firmware. The immediate user for this API will
> be the KVM driver. The KVM driver to need to allocate a firmware context
> page during the guest creation. The context page need to be updated
> by the firmware. See the SEV-SNP specification for further details.
> 
> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
> ---
>  drivers/crypto/ccp/sev-dev.c | 173 +++++++++++++++++++++++++++++++++--
>  include/linux/psp-sev.h      |  11 +++
>  2 files changed, 178 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
> index 35d76333e120..0dbd99f29b25 100644
> --- a/drivers/crypto/ccp/sev-dev.c
> +++ b/drivers/crypto/ccp/sev-dev.c
> @@ -79,6 +79,14 @@ static void *sev_es_tmr;
>  #define NV_LENGTH (32 * 1024)
>  static void *sev_init_ex_buffer;
>  
> +/* When SEV-SNP is enabled the TMR needs to be 2MB aligned and 2MB size. */
> +#define SEV_SNP_ES_TMR_SIZE	(2 * 1024 * 1024)
> +
> +static size_t sev_es_tmr_size = SEV_ES_TMR_SIZE;
> +
> +static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret);
> +static int sev_do_cmd(int cmd, void *data, int *psp_ret);
> +
>  static inline bool sev_version_greater_or_equal(u8 maj, u8 min)
>  {
>  	struct sev_device *sev = psp_master->sev_data;
> @@ -177,11 +185,161 @@ static int sev_cmd_buffer_len(int cmd)
>  	return 0;
>  }
>  
> +static void snp_leak_pages(unsigned long pfn, unsigned int npages)
> +{
> +	WARN(1, "psc failed, pfn 0x%lx pages %d (leaking)\n", pfn, npages);
> +	while (npages--) {
> +		memory_failure(pfn, 0);
> +		dump_rmpentry(pfn);
> +		pfn++;
> +	}
> +}
> +
> +static int snp_reclaim_pages(unsigned long pfn, unsigned int npages, bool locked)
> +{
> +	struct sev_data_snp_page_reclaim data;
> +	int ret, err, i, n = 0;
> +
> +	for (i = 0; i < npages; i++) {
> +		memset(&data, 0, sizeof(data));
> +		data.paddr = pfn << PAGE_SHIFT;
> +
> +		if (locked)
> +			ret = __sev_do_cmd_locked(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err);
> +		else
> +			ret = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err);
> +		if (ret)
> +			goto cleanup;
> +
> +		ret = rmp_make_shared(pfn, PG_LEVEL_4K);
> +		if (ret)
> +			goto cleanup;
> +
> +		pfn++;
> +		n++;
> +	}
> +
> +	return 0;
> +
> +cleanup:
> +	/*
> +	 * If failed to reclaim the page then page is no longer safe to
> +	 * be released, leak it.
> +	 */
> +	snp_leak_pages(pfn, npages - n);
> +	return ret;
> +}
> +
> +static inline int rmp_make_firmware(unsigned long pfn, int level)
> +{
> +	return rmp_make_private(pfn, 0, level, 0, true);
> +}
> +
> +static int snp_set_rmp_state(unsigned long paddr, unsigned int npages, bool to_fw, bool locked,
> +			     bool need_reclaim)
> +{
> +	unsigned long pfn = __sme_clr(paddr) >> PAGE_SHIFT; /* Cbit maybe set in the paddr */
> +	int rc, n = 0, i;
> +
> +	for (i = 0; i < npages; i++) {
> +		if (to_fw)
> +			rc = rmp_make_firmware(pfn, PG_LEVEL_4K);
> +		else
> +			rc = need_reclaim ? snp_reclaim_pages(pfn, 1, locked) :
> +					    rmp_make_shared(pfn, PG_LEVEL_4K);
> +		if (rc)
> +			goto cleanup;
> +
> +		pfn++;
> +		n++;
> +	}
> +
> +	return 0;
> +
> +cleanup:
> +	/* Try unrolling the firmware state changes */
> +	if (to_fw) {
> +		/*
> +		 * Reclaim the pages which were already changed to the
> +		 * firmware state.
> +		 */
> +		snp_reclaim_pages(paddr >> PAGE_SHIFT, n, locked);
> +
> +		return rc;
> +	}
> +
> +	/*
> +	 * If failed to change the page state to shared, then its not safe
> +	 * to release the page back to the system, leak it.
> +	 */
> +	snp_leak_pages(pfn, npages - n);
> +
> +	return rc;
> +}
> +
> +static struct page *__snp_alloc_firmware_pages(gfp_t gfp_mask, int order, bool locked)
> +{
> +	unsigned long npages = 1ul << order, paddr;
> +	struct sev_device *sev;
> +	struct page *page;
> +
> +	if (!psp_master || !psp_master->sev_data)
> +		return NULL;
> +
> +	page = alloc_pages(gfp_mask, order);
> +	if (!page)
> +		return NULL;
> +
> +	/* If SEV-SNP is initialized then add the page in RMP table. */
> +	sev = psp_master->sev_data;
> +	if (!sev->snp_inited)
> +		return page;
> +
> +	paddr = __pa((unsigned long)page_address(page));
> +	if (snp_set_rmp_state(paddr, npages, true, locked, false))
> +		return NULL;
> +
> +	return page;
> +}
> +
> +void *snp_alloc_firmware_page(gfp_t gfp_mask)
> +{
> +	struct page *page;
> +
> +	page = __snp_alloc_firmware_pages(gfp_mask, 0, false);

Could be just

        struct page *page  == __snp_alloc_firmware_pages(gfp_mask, 0, false);

> +
> +	return page ? page_address(page) : NULL;
> +}
> +EXPORT_SYMBOL_GPL(snp_alloc_firmware_page);

Undocumented API

Why don't you just export __snp_alloc_firmware_pages() and declare these
trivial wrappers as "static inline" inside psp-sev.h?

> +
> +static void __snp_free_firmware_pages(struct page *page, int order, bool locked)
> +{
> +	unsigned long paddr, npages = 1ul << order;
> +
> +	if (!page)
> +		return;

Silently ignored NULL pointer.

> +
> +	paddr = __pa((unsigned long)page_address(page));
> +	if (snp_set_rmp_state(paddr, npages, false, locked, true))
> +		return;
> +
> +	__free_pages(page, order);
> +}
> +
> +void snp_free_firmware_page(void *addr)
> +{
> +	if (!addr)
> +		return;

Why silently ignore a NULL pointer? At minimum, pr_warn() would be
appropriate.

> +
> +	__snp_free_firmware_pages(virt_to_page(addr), 0, false);
> +}
> +EXPORT_SYMBOL(snp_free_firmware_page);

Ditto, same comments as for allocation part.

> +
>  static void *sev_fw_alloc(unsigned long len)
>  {
>  	struct page *page;
>  
> -	page = alloc_pages(GFP_KERNEL, get_order(len));
> +	page = __snp_alloc_firmware_pages(GFP_KERNEL, get_order(len), false);
>  	if (!page)
>  		return NULL;
>  
> @@ -393,7 +551,7 @@ static int __sev_init_locked(int *error)
>  		data.tmr_address = __pa(sev_es_tmr);
>  
>  		data.flags |= SEV_INIT_FLAGS_SEV_ES;
> -		data.tmr_len = SEV_ES_TMR_SIZE;
> +		data.tmr_len = sev_es_tmr_size;
>  	}
>  
>  	return __sev_do_cmd_locked(SEV_CMD_INIT, &data, error);
> @@ -421,7 +579,7 @@ static int __sev_init_ex_locked(int *error)
>  		data.tmr_address = __pa(sev_es_tmr);
>  
>  		data.flags |= SEV_INIT_FLAGS_SEV_ES;
> -		data.tmr_len = SEV_ES_TMR_SIZE;
> +		data.tmr_len = sev_es_tmr_size;
>  	}
>  
>  	return __sev_do_cmd_locked(SEV_CMD_INIT_EX, &data, error);
> @@ -818,6 +976,8 @@ static int __sev_snp_init_locked(int *error)
>  	sev->snp_inited = true;
>  	dev_dbg(sev->dev, "SEV-SNP firmware initialized\n");
>  
> +	sev_es_tmr_size = SEV_SNP_ES_TMR_SIZE;
> +
>  	return rc;
>  }
>  
> @@ -1341,8 +1501,9 @@ static void sev_firmware_shutdown(struct sev_device *sev)
>  		/* The TMR area was encrypted, flush it from the cache */
>  		wbinvd_on_all_cpus();
>  
> -		free_pages((unsigned long)sev_es_tmr,
> -			   get_order(SEV_ES_TMR_SIZE));
> +		__snp_free_firmware_pages(virt_to_page(sev_es_tmr),
> +					  get_order(sev_es_tmr_size),
> +					  false);
>  		sev_es_tmr = NULL;
>  	}
>  
> @@ -1430,7 +1591,7 @@ void sev_pci_init(void)
>  	}
>  
>  	/* Obtain the TMR memory area for SEV-ES use */
> -	sev_es_tmr = sev_fw_alloc(SEV_ES_TMR_SIZE);
> +	sev_es_tmr = sev_fw_alloc(sev_es_tmr_size);
>  	if (!sev_es_tmr)
>  		dev_warn(sev->dev,
>  			 "SEV: TMR allocation failed, SEV-ES support unavailable\n");
> diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h
> index 9f921d221b75..a3bb792bb842 100644
> --- a/include/linux/psp-sev.h
> +++ b/include/linux/psp-sev.h
> @@ -12,6 +12,8 @@
>  #ifndef __PSP_SEV_H__
>  #define __PSP_SEV_H__
>  
> +#include <linux/sev.h>
> +
>  #include <uapi/linux/psp-sev.h>
>  
>  #ifdef CONFIG_X86
> @@ -940,6 +942,8 @@ int snp_guest_page_reclaim(struct sev_data_snp_page_reclaim *data, int *error);
>  int snp_guest_dbg_decrypt(struct sev_data_snp_dbg *data, int *error);
>  
>  void *psp_copy_user_blob(u64 uaddr, u32 len);
> +void *snp_alloc_firmware_page(gfp_t mask);
> +void snp_free_firmware_page(void *addr);
>  
>  #else	/* !CONFIG_CRYPTO_DEV_SP_PSP */
>  
> @@ -981,6 +985,13 @@ static inline int snp_guest_dbg_decrypt(struct sev_data_snp_dbg *data, int *erro
>  	return -ENODEV;
>  }
>  
> +static inline void *snp_alloc_firmware_page(gfp_t mask)
> +{
> +	return NULL;
> +}
> +
> +static inline void snp_free_firmware_page(void *addr) { }
> +
>  #endif	/* CONFIG_CRYPTO_DEV_SP_PSP */
>  
>  #endif	/* __PSP_SEV_H__ */
> -- 
> 2.25.1
> 

BR, Jarkko
Jarkko Sakkinen Aug. 2, 2022, 12:17 p.m. UTC | #3
On Tue, Jun 21, 2022 at 08:17:15PM +0000, Kalra, Ashish wrote:
> [Public]
> 
> Hello Peter,
> 
> >> +static int snp_reclaim_pages(unsigned long pfn, unsigned int npages, 
> >> +bool locked) {
> >> +       struct sev_data_snp_page_reclaim data;
> >> +       int ret, err, i, n = 0;
> >> +
> >> +       for (i = 0; i < npages; i++) {
> 
> >What about setting |n| here too, also the other increments.
> 
> >for (i = 0, n = 0; i < npages; i++, n++, pfn++)
> 
> Yes that is simpler.
> 
> >> +               memset(&data, 0, sizeof(data));
> >> +               data.paddr = pfn << PAGE_SHIFT;
> >> +
> >> +               if (locked)
> >> +                       ret = __sev_do_cmd_locked(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err);
> >> +               else
> >> +                       ret = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, 
> >> + &data, &err);
> 
> > Can we change `sev_cmd_mutex` to some sort of nesting lock type? That could clean up this if (locked) code.
> 
> > +static inline int rmp_make_firmware(unsigned long pfn, int level) {
> > +       return rmp_make_private(pfn, 0, level, 0, true); }
> > +
> > +static int snp_set_rmp_state(unsigned long paddr, unsigned int npages, bool to_fw, bool locked,
> > +                            bool need_reclaim)
> 
> >This function can do a lot and when I read the call sites its hard to see what its doing since we have a combination of arguments which tell us what behavior is happening, some of which are not valid (ex: to_fw == true and need_reclaim == true is an >invalid argument combination).
> 
> to_fw is used to make a firmware page and need_reclaim is for freeing the firmware page, so they are going to be mutually exclusive. 
> 
> I actually can connect with it quite logically with the callers :
> snp_alloc_firmware_pages will call with to_fw = true and need_reclaim = false
> and snp_free_firmware_pages will do the opposite, to_fw = false and need_reclaim = true.
> 
> That seems straightforward to look at.
> 
> >Also this for loop over |npages| is duplicated from snp_reclaim_pages(). One improvement here is that on the current
> >snp_reclaim_pages() if we fail to reclaim a page we assume we cannot reclaim the next pages, this may cause us to snp_leak_pages() more pages than we actually need too.
> 
> Yes that is true.
> 
> >What about something like this?
> 
> >static snp_leak_page(u64 pfn, enum pg_level level) {
> >   memory_failure(pfn, 0);
> >   dump_rmpentry(pfn);
> >}
> 
> >static int snp_reclaim_page(u64 pfn, enum pg_level level) {
> >  int ret;
> >  struct sev_data_snp_page_reclaim data;
> 
> >  ret = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err);
> >  if (ret)
> >    goto cleanup;
> 
> >  ret = rmp_make_shared(pfn, level);
> >  if (ret)
> >    goto cleanup;
> 
> > return 0;
> 
> >cleanup:
> >    snp_leak_page(pfn, level)
> >}
> 
> >typedef int (*rmp_state_change_func) (u64 pfn, enum pg_level level);
> 
> >static int snp_set_rmp_state(unsigned long paddr, unsigned int npages, rmp_state_change_func state_change, rmp_state_change_func cleanup) {
> >  struct sev_data_snp_page_reclaim data;
> >  int ret, err, i, n = 0;
> 
> >  for (i = 0, n = 0; i < npages; i++, n++, pfn++) {
> >    ret = state_change(pfn, PG_LEVEL_4K)
> >    if (ret)
> >      goto cleanup;
> >  }
> 
> >  return 0;
> 
> > cleanup:
> >  for (; i>= 0; i--, n--, pfn--) {
> >    cleanup(pfn, PG_LEVEL_4K);
> >  }
> 
> >  return ret;
> >}
> 
> >Then inside of __snp_alloc_firmware_pages():
> 
> >snp_set_rmp_state(paddr, npages, rmp_make_firmware, snp_reclaim_page);
> 
> >And inside of __snp_free_firmware_pages():
> 
> >snp_set_rmp_state(paddr, npages, snp_reclaim_page, snp_leak_page);
> 
> >Just a suggestion feel free to ignore. The readability comment could be addressed much less invasively by just making separate functions for each valid combination of arguments here. Like snp_set_rmp_fw_state(), snp_set_rmp_shared_state(),
> >snp_set_rmp_release_state() or something.
> 
> >> +static struct page *__snp_alloc_firmware_pages(gfp_t gfp_mask, int 
> >> +order, bool locked) {
> >> +       unsigned long npages = 1ul << order, paddr;
> >> +       struct sev_device *sev;
> >> +       struct page *page;
> >> +
> >> +       if (!psp_master || !psp_master->sev_data)
> >> +               return NULL;
> >> +
> >> +       page = alloc_pages(gfp_mask, order);
> >> +       if (!page)
> >> +               return NULL;
> >> +
> >> +       /* If SEV-SNP is initialized then add the page in RMP table. */
> >> +       sev = psp_master->sev_data;
> >> +       if (!sev->snp_inited)
> >> +               return page;
> >> +
> >> +       paddr = __pa((unsigned long)page_address(page));
> >> +       if (snp_set_rmp_state(paddr, npages, true, locked, false))
> >> +               return NULL;
> 
> >So what about the case where snp_set_rmp_state() fails but we were able to reclaim all the pages? Should we be able to signal that to callers so that we could free |page| here? But given this is an error path already maybe we can optimize this in a >follow up series.
> 
> Yes, we should actually tie in to snp_reclaim_pages() success or failure here in the case we were able to successfully unroll some or all of the firmware state change.
> 
> > +
> > +       return page;
> > +}
> > +
> > +void *snp_alloc_firmware_page(gfp_t gfp_mask) {
> > +       struct page *page;
> > +
> > +       page = __snp_alloc_firmware_pages(gfp_mask, 0, false);
> > +
> > +       return page ? page_address(page) : NULL; } 
> > +EXPORT_SYMBOL_GPL(snp_alloc_firmware_page);
> > +
> > +static void __snp_free_firmware_pages(struct page *page, int order, 
> > +bool locked) {
> > +       unsigned long paddr, npages = 1ul << order;
> > +
> > +       if (!page)
> > +               return;
> > +
> > +       paddr = __pa((unsigned long)page_address(page));
> > +       if (snp_set_rmp_state(paddr, npages, false, locked, true))
> > +               return;
> 
> > Here we may be able to free some of |page| depending how where inside of snp_set_rmp_state() we failed. But again given this is an error path already maybe we can optimize this in a follow up series.
> 
> Yes, we probably should be able to free some of the page(s) depending on how many page(s) got reclaimed in snp_set_rmp_state().
> But these reclamation failures may not be very common, so any failure is indicative of a bigger issue, it might be the case when there is a single page reclamation error it might happen with all the subsequent
> pages and so follow a simple recovery procedure, then handling a more complex recovery for a chunk of pages being reclaimed and another chunk not. 

Silent ignore is stil a bad idea. I.e. at minimum would
make sense to print a warning to klog.

> 
> Thanks,
> Ashish

BR, Jarkko
Borislav Petkov Oct. 13, 2022, 3:15 p.m. UTC | #4
On Mon, Jun 20, 2022 at 11:05:01PM +0000, Ashish Kalra wrote:
> +static void snp_leak_pages(unsigned long pfn, unsigned int npages)

That function name looks wrong.

> +{
> +	WARN(1, "psc failed, pfn 0x%lx pages %d (leaking)\n", pfn, npages);
> +	while (npages--) {
> +		memory_failure(pfn, 0);
		^^^^^^^^^^^^^^^^^^^^^^

Why?

 * This function is called by the low level machine check code
 * of an architecture when it detects hardware memory corruption
 * of a page. It tries its best to recover, which includes
 * dropping pages, killing processes etc.

I don't think you wanna do that.

It looks like you want to prevent the page from being used again but not
mark it as PG_hwpoison and whatnot. PG_reserved perhaps?

> +		dump_rmpentry(pfn);
> +		pfn++;
> +	}
> +}
> +
> +static int snp_reclaim_pages(unsigned long pfn, unsigned int npages, bool locked)
> +{
> +	struct sev_data_snp_page_reclaim data;
> +	int ret, err, i, n = 0;
> +
> +	for (i = 0; i < npages; i++) {
> +		memset(&data, 0, sizeof(data));
> +		data.paddr = pfn << PAGE_SHIFT;

Oh wow, this is just silly. A struct for a single u64. Just use a

	u64 paddr;

directly. But we had this topic already...

> +
> +		if (locked)

Ew, that's never a good design - conditional locking.

> +			ret = __sev_do_cmd_locked(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err);
> +		else
> +			ret = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err);

<---- newline here.

> +		if (ret)
> +			goto cleanup;
> +
> +		ret = rmp_make_shared(pfn, PG_LEVEL_4K);
> +		if (ret)
> +			goto cleanup;
> +
> +		pfn++;
> +		n++;
> +	}
> +
> +	return 0;
> +
> +cleanup:
> +	/*
> +	 * If failed to reclaim the page then page is no longer safe to
> +	 * be released, leak it.
> +	 */
> +	snp_leak_pages(pfn, npages - n);

So this looks real weird: we go and reclaim pages, we hit an error
during reclaiming a page X somewhere in-between and then we go and mark
the *remaining* pages as not to be used?!

Why?

Why not only that *one* page which failed and then we continue with the
rest?!

> +	return ret;
> +}
> +
> +static inline int rmp_make_firmware(unsigned long pfn, int level)
> +{
> +	return rmp_make_private(pfn, 0, level, 0, true);
> +}

That's a silly wrapper used only once. Just do at the callsite:

	/* Mark this page as belonging to firmware */
	rc = rmp_make_private(pfn, 0, level, 0, true);

> +
> +static int snp_set_rmp_state(unsigned long paddr, unsigned int npages, bool to_fw, bool locked,
> +			     bool need_reclaim)

Tangential to the above, this is just nuts with those bool arguments.
Just look at the callsites: do you understand what they do?

	snp_set_rmp_state(paddr, npages, true, locked, false);

what does that do? You need to go up to the definition of the function,
count the arguments and see what that "true" arg stands for.

What you should do instead is, have separate helpers which do only one
thing:

	rmp_mark_pages_firmware();
	rmp_mark_pages_shared();
	rmp_mark_pages_...

and then have the *callers* issue snp_reclaim_pages() when needed. So you'd have

	rmp_mark_pages_firmware();
	rmp_mark_pages_shared()

and __snp_free_firmware_pages() would do

	rmp_mark_pages_shared();
	snp_reclaim_pages();

and so on.

And then if you need locking, the callers can decide which sev_do_cmd
variant to issue.

And then if you have common code fragments which you can unify into a
bigger helper function, *then* you can do that.

Instead of multiplexing it this way. Which makes it really hard to
follow what the code does.


> +	unsigned long pfn = __sme_clr(paddr) >> PAGE_SHIFT; /* Cbit maybe set in the paddr */

No side comments pls.

> +	int rc, n = 0, i;
> +
> +	for (i = 0; i < npages; i++) {
> +		if (to_fw)
> +			rc = rmp_make_firmware(pfn, PG_LEVEL_4K);
> +		else
> +			rc = need_reclaim ? snp_reclaim_pages(pfn, 1, locked) :
> +					    rmp_make_shared(pfn, PG_LEVEL_4K);
> +		if (rc)
> +			goto cleanup;
> +
> +		pfn++;
> +		n++;
> +	}
> +
> +	return 0;
> +
> +cleanup:
> +	/* Try unrolling the firmware state changes */
> +	if (to_fw) {
> +		/*
> +		 * Reclaim the pages which were already changed to the
> +		 * firmware state.
> +		 */
> +		snp_reclaim_pages(paddr >> PAGE_SHIFT, n, locked);
> +
> +		return rc;
> +	}
> +
> +	/*
> +	 * If failed to change the page state to shared, then its not safe
> +	 * to release the page back to the system, leak it.
> +	 */
> +	snp_leak_pages(pfn, npages - n);
> +
> +	return rc;
> +}

...

> +void snp_free_firmware_page(void *addr)
> +{
> +	if (!addr)
> +		return;
> +
> +	__snp_free_firmware_pages(virt_to_page(addr), 0, false);
> +}
> +EXPORT_SYMBOL(snp_free_firmware_page);

EXPORT_SYMBOL_GPL() ofc.
Ashish Kalra Oct. 14, 2022, 8 p.m. UTC | #5
Hello Boris,

On 10/13/2022 10:15 AM, Borislav Petkov wrote:
> On Mon, Jun 20, 2022 at 11:05:01PM +0000, Ashish Kalra wrote:
>> +static void snp_leak_pages(unsigned long pfn, unsigned int npages)
> 
> That function name looks wrong.
> 
>> +{
>> +	WARN(1, "psc failed, pfn 0x%lx pages %d (leaking)\n", pfn, npages);
>> +	while (npages--) {
>> +		memory_failure(pfn, 0);
> 		^^^^^^^^^^^^^^^^^^^^^^
> 
> Why?

The page was in FW state and we couldn't transition it back to HV/Shared
state, any access to this page can cause RMP #PF.

> 
>   * This function is called by the low level machine check code
>   * of an architecture when it detects hardware memory corruption
>   * of a page. It tries its best to recover, which includes
>   * dropping pages, killing processes etc.
> 
> I don't think you wanna do that.
> 
> It looks like you want to prevent the page from being used again but not
> mark it as PG_hwpoison and whatnot. PG_reserved perhaps?

  * PG_reserved is set for special pages. The "struct page" of such a
  * page should in general not be touched (e.g. set dirty) except by its
  * owner.

If it is "still" accessed/touched then it can cause RMP #PF.
On the other hand,

  * PG_hwpoison... Accessing is
  * not safe since it may cause another machine check. Don't touch!

That sounds exactly the state we want these page(s) to be in ?

Another possibility is PG_error.

> 
>> +		dump_rmpentry(pfn);
>> +		pfn++;
>> +	}
>> +}
>> +
>> +static int snp_reclaim_pages(unsigned long pfn, unsigned int npages, bool locked)
>> +{
>> +	struct sev_data_snp_page_reclaim data;
>> +	int ret, err, i, n = 0;
>> +
>> +	for (i = 0; i < npages; i++) {
>> +		memset(&data, 0, sizeof(data));
>> +		data.paddr = pfn << PAGE_SHIFT;
> 
> Oh wow, this is just silly. A struct for a single u64. Just use a
> 
> 	u64 paddr;
Ok.
> 
> directly. But we had this topic already...
> 
>> +
>> +		if (locked)
> 
> Ew, that's never a good design - conditional locking.

There is a previous suggestion to change `sev_cmd_mutex` to some sort of 
nesting lock type to clean up this if (locked) code, though AFAIK, there 
is no support for nesting lock types.

> 
>> +			ret = __sev_do_cmd_locked(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err);
>> +		else
>> +			ret = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err);
> 
> <---- newline here.
> 
>> +		if (ret)
>> +			goto cleanup;
>> +
>> +		ret = rmp_make_shared(pfn, PG_LEVEL_4K);
>> +		if (ret)
>> +			goto cleanup;
>> +
>> +		pfn++;
>> +		n++;
>> +	}
>> +
>> +	return 0;
>> +
>> +cleanup:
>> +	/*
>> +	 * If failed to reclaim the page then page is no longer safe to
>> +	 * be released, leak it.
>> +	 */
>> +	snp_leak_pages(pfn, npages - n);
> 
> So this looks real weird: we go and reclaim pages, we hit an error
> during reclaiming a page X somewhere in-between and then we go and mark
> the *remaining* pages as not to be used?!
> 
> Why?
> 
> Why not only that *one* page which failed and then we continue with the
> rest?!

I agree and will change to this approach.

> 
>> +	return ret;
>> +}
>> +
>> +static inline int rmp_make_firmware(unsigned long pfn, int level)
>> +{
>> +	return rmp_make_private(pfn, 0, level, 0, true);
>> +}
> 
> That's a silly wrapper used only once. Just do at the callsite:
> 
> 	/* Mark this page as belonging to firmware */
> 	rc = rmp_make_private(pfn, 0, level, 0, true);
> 
Ok.

>> +
>> +static int snp_set_rmp_state(unsigned long paddr, unsigned int npages, bool to_fw, bool locked,
>> +			     bool need_reclaim)
> 
> Tangential to the above, this is just nuts with those bool arguments.
> Just look at the callsites: do you understand what they do?
> 
> 	snp_set_rmp_state(paddr, npages, true, locked, false);
> 
> what does that do? You need to go up to the definition of the function,
> count the arguments and see what that "true" arg stands for.

I totally agree, this is simply unreadable.

And this has been mentioned previously too ...
This function can do a lot and when I read the call sites its hard to
see what its doing since we have a combination of arguments which tell
us what behavior is happening ...

> 
> What you should do instead is, have separate helpers which do only one
> thing:
> 
> 	rmp_mark_pages_firmware();
> 	rmp_mark_pages_shared();
> 	rmp_mark_pages_...
> 
> and then have the *callers* issue snp_reclaim_pages() when needed. So you'd have
> 
> 	rmp_mark_pages_firmware();
> 	rmp_mark_pages_shared()
> 
> and __snp_free_firmware_pages() would do
> 
> 	rmp_mark_pages_shared();
> 	snp_reclaim_pages();
> 
Actually, this only needs to call snp_reclaim_pages().

> and so on.
> 
> And then if you need locking, the callers can decide which sev_do_cmd
> variant to issue.
> 
> And then if you have common code fragments which you can unify into a
> bigger helper function, *then* you can do that.
> 
> Instead of multiplexing it this way. Which makes it really hard to
> follow what the code does.
> 

Sure i will do this cleanup.

> 
>> +	unsigned long pfn = __sme_clr(paddr) >> PAGE_SHIFT; /* Cbit maybe set in the paddr */
> 
> No side comments pls.
> 
>> +	int rc, n = 0, i;
>> +
>> +	for (i = 0; i < npages; i++) {
>> +		if (to_fw)
>> +			rc = rmp_make_firmware(pfn, PG_LEVEL_4K);
>> +		else
>> +			rc = need_reclaim ? snp_reclaim_pages(pfn, 1, locked) :
>> +					    rmp_make_shared(pfn, PG_LEVEL_4K);
>> +		if (rc)
>> +			goto cleanup;
>> +
>> +		pfn++;
>> +		n++;
>> +	}
>> +
>> +	return 0;
>> +
>> +cleanup:
>> +	/* Try unrolling the firmware state changes */
>> +	if (to_fw) {
>> +		/*
>> +		 * Reclaim the pages which were already changed to the
>> +		 * firmware state.
>> +		 */
>> +		snp_reclaim_pages(paddr >> PAGE_SHIFT, n, locked);
>> +
>> +		return rc;
>> +	}
>> +
>> +	/*
>> +	 * If failed to change the page state to shared, then its not safe
>> +	 * to release the page back to the system, leak it.
>> +	 */
>> +	snp_leak_pages(pfn, npages - n);
>> +
>> +	return rc;
>> +}
> 
> ...
> 
>> +void snp_free_firmware_page(void *addr)
>> +{
>> +	if (!addr)
>> +		return;
>> +
>> +	__snp_free_firmware_pages(virt_to_page(addr), 0, false);
>> +}
>> +EXPORT_SYMBOL(snp_free_firmware_page);
> 
> EXPORT_SYMBOL_GPL() ofc.
> 

Thanks,
Ashish
Borislav Petkov Oct. 25, 2022, 10:25 a.m. UTC | #6
On Fri, Oct 14, 2022 at 03:00:09PM -0500, Kalra, Ashish wrote:
> If it is "still" accessed/touched then it can cause RMP #PF.
> On the other hand,
> 
>  * PG_hwpoison... Accessing is
>  * not safe since it may cause another machine check. Don't touch!
> 
> That sounds exactly the state we want these page(s) to be in ?
> 
> Another possibility is PG_error.

Something like this:

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index e66f7aa3191d..baffa9c0dc30 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -186,6 +186,7 @@ enum pageflags {
 	 * THP.
 	 */
 	PG_has_hwpoisoned = PG_error,
+	PG_offlimits = PG_hwpoison,
 #endif
 
 	/* non-lru isolated movable page */

and SNP will have to depend on CONFIG_MEMORY_FAILURE.

But I'd let mm folks correct me here on the details.
Ashish Kalra Oct. 31, 2022, 8:10 p.m. UTC | #7
Just to add here, writing to any of these pages from the Host
will trigger a RMP #PF which will cause the RMP page fault handler
to send a SIGBUS to the current process, as this page is not owned
by Host.

So calling memory_failure() is proactively doing the same,
marking the page as poisoned and probably also killing the
current process.

Thanks,
Ashish

On 10/25/2022 5:25 AM, Borislav Petkov wrote:
> On Fri, Oct 14, 2022 at 03:00:09PM -0500, Kalra, Ashish wrote:
>> If it is "still" accessed/touched then it can cause RMP #PF.
>> On the other hand,
>>
>>   * PG_hwpoison... Accessing is
>>   * not safe since it may cause another machine check. Don't touch!
>>
>> That sounds exactly the state we want these page(s) to be in ?
>>
>> Another possibility is PG_error.
> 
> Something like this:
> 
> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> index e66f7aa3191d..baffa9c0dc30 100644
> --- a/include/linux/page-flags.h
> +++ b/include/linux/page-flags.h
> @@ -186,6 +186,7 @@ enum pageflags {
>   	 * THP.
>   	 */
>   	PG_has_hwpoisoned = PG_error,
> +	PG_offlimits = PG_hwpoison,
>   #endif
>   
>   	/* non-lru isolated movable page */
> 
> and SNP will have to depend on CONFIG_MEMORY_FAILURE.
> 
> But I'd let mm folks correct me here on the details.
>
Borislav Petkov Oct. 31, 2022, 9:15 p.m. UTC | #8
On Mon, Oct 31, 2022 at 03:10:16PM -0500, Kalra, Ashish wrote:
> Just to add here, writing to any of these pages from the Host
> will trigger a RMP #PF which will cause the RMP page fault handler
> to send a SIGBUS to the current process, as this page is not owned
> by Host.

And kill the host process?

So this is another "policy" which sounds iffy. If we kill the process,
we should at least say why. Are we doing that currently?

> So calling memory_failure() is proactively doing the same, marking the
> page as poisoned and probably also killing the current process.

But the page is not suffering a memory failure - it cannot be reclaimed
for whatever reason. Btw, how can that reclaim failure ever happen? Any
real scenarios?

Anyway, memory failure just happens to fit what you wanna do but you
can't just reuse that - that's hacky. What is the problem with writing
your own function which does that?

Also, btw, please do not top-post.

Thx.
Ashish Kalra Oct. 31, 2022, 9:58 p.m. UTC | #9
Hello Boris,

On 10/31/2022 4:15 PM, Borislav Petkov wrote:
> On Mon, Oct 31, 2022 at 03:10:16PM -0500, Kalra, Ashish wrote:
>> Just to add here, writing to any of these pages from the Host
>> will trigger a RMP #PF which will cause the RMP page fault handler
>> to send a SIGBUS to the current process, as this page is not owned
>> by Host.
> 
> And kill the host process?
> 
> So this is another "policy" which sounds iffy. If we kill the process,
> we should at least say why. Are we doing that currently?

Yes, pasted below is the latest host RMP #PF handler, with new and 
additional comments added and there is a relevant comment added here for 
this behavior:

static int handle_user_rmp_page_fault(struct pt_regs *regs, unsigned 
long error_code,unsigned long address)
{
...
...

     /*
      * If its a guest private page, then the fault cannot be resolved.
      * Send a SIGBUS to terminate the process.
      *
      * As documented in APM vol3 pseudo-code for RMPUPDATE, when the
      * 2M range is covered by a valid (Assigned=1) 2M entry, the middle
      * 511 4k entries also have Assigned=1. This means that if there is
      * an access to a page which happens to lie within an Assigned 2M
      * entry, the 4k RMP entry will also have Assigned=1. Therefore, the
      * kernel should see that the page is not a valid page and the fault
      * cannot be resolved.
      */
      if (snp_lookup_rmpentry(pfn, &rmp_level)) {
             do_sigbus(regs, error_code, address, VM_FAULT_SIGBUS);
             return RMP_PF_RETRY;
      }
...
...

I believe that we already had an off-list discussion on the same, 
copying David Kaplan's reply on the same below:

So what I think you want to do is:
1. Compute the pfn for the 4kb page you're trying to access (as your 
code below does) 2. Read that RMP entry -- If it is assigned then kill 
the process 3. Otherwise, check the level from the host page table.  If 
level=PG_LEVEL_4K then somebody else may have already smashed this page, 
so just retry the instruction 4. If level=PG_LEVEL_2M/1G, then the host 
needs to split their page.

This is the current algorithm being followed by the host RMP #PF handler.

> 
>> So calling memory_failure() is proactively doing the same, marking the
>> page as poisoned and probably also killing the current process.
> 
> But the page is not suffering a memory failure - it cannot be reclaimed
> for whatever reason. Btw, how can that reclaim failure ever happen? Any
> real scenarios?

The scenarios here are either SNP FW failure (SNP_PAGE_RECLAIM command) 
in transitioning the page back to HV state and/or RMPUPDATE instruction 
failure to transition the page back to hypervisor/shared state.

> 
> Anyway, memory failure just happens to fit what you wanna do but you
> can't just reuse that - that's hacky. What is the problem with writing
> your own function which does that?
> 

Ok.

Will look at adding our own recovery function for the same, but that 
will again mark the pages as poisoned, right ?

Still waiting for some/more feedback from mm folks on the same.

Thanks,
Ashish

> Also, btw, please do not top-post.
> 
> Thx.
>
Borislav Petkov Nov. 2, 2022, 11:22 a.m. UTC | #10
On Mon, Oct 31, 2022 at 04:58:38PM -0500, Kalra, Ashish wrote:
>      if (snp_lookup_rmpentry(pfn, &rmp_level)) {
>             do_sigbus(regs, error_code, address, VM_FAULT_SIGBUS);
>             return RMP_PF_RETRY;

Does this issue some halfway understandable error message why the
process got killed?

> Will look at adding our own recovery function for the same, but that will
> again mark the pages as poisoned, right ?

Well, not poisoned but PG_offlimits or whatever the mm folks agree upon.
Semantically, it'll be handled the same way, ofc.

> Still waiting for some/more feedback from mm folks on the same.

Just send the patch and they'll give it.

Thx.
Ashish Kalra Nov. 14, 2022, 11:36 p.m. UTC | #11
Hello Boris,

On 11/2/2022 6:22 AM, Borislav Petkov wrote:
> On Mon, Oct 31, 2022 at 04:58:38PM -0500, Kalra, Ashish wrote:
>>       if (snp_lookup_rmpentry(pfn, &rmp_level)) {
>>              do_sigbus(regs, error_code, address, VM_FAULT_SIGBUS);
>>              return RMP_PF_RETRY;
> 
> Does this issue some halfway understandable error message why the
> process got killed?
> 
>> Will look at adding our own recovery function for the same, but that will
>> again mark the pages as poisoned, right ?
> 
> Well, not poisoned but PG_offlimits or whatever the mm folks agree upon.
> Semantically, it'll be handled the same way, ofc.

Added a new PG_offlimits flag and a simple corresponding handler for it.

But there is still added complexity of handling hugepages as part of
reclamation failures (both HugeTLB and transparent hugepages) and that
means calling more static functions in mm/memory_failure.c

There is probably a more appropriate handler in mm/memory-failure.c:

soft_offline_page() - this will mark the page as HWPoisoned and also has
handling for hugepages. And we can avoid adding a new page flag too.

soft_offline_page - Soft offline a page.
Soft offline a page, by migration or invalidation, without killing anything.

So, this looks like a good option to call
soft_offline_page() instead of memory_failure() in case of
failure to transition the page back to HV/shared state via 
SNP_RECLAIM_CMD and/or RMPUPDATE instruction.

Thanks,
Ashish

> 
>> Still waiting for some/more feedback from mm folks on the same.
> 
> Just send the patch and they'll give it.
> 
> Thx.
>
Borislav Petkov Nov. 15, 2022, 2:26 p.m. UTC | #12
On Mon, Nov 14, 2022 at 05:36:29PM -0600, Kalra, Ashish wrote:
> But there is still added complexity of handling hugepages as part of
> reclamation failures (both HugeTLB and transparent hugepages) and that

Why?

You want to offline pfns of 4K pages. What hugepages?

> means calling more static functions in mm/memory_failure.c
> 
> There is probably a more appropriate handler in mm/memory-failure.c:
> 
> soft_offline_page() - this will mark the page as HWPoisoned and also has
> handling for hugepages. And we can avoid adding a new page flag too.

So if some other code wants to dump the amount of all hwpoisoned pages,
it'll dump those too.

Don't you see what is wrong with this picture?

And btw, reusing the hwpoison flag

	PG_offlimits = PG_hwpoison

like previously suggested doesn't help here either.

IOW, I really don't like this lumping of semantics together. ;-\
Vlastimil Babka Nov. 15, 2022, 3:14 p.m. UTC | #13
Cc'ing memory failure folks, the beinning of this subthread is here:

https://lore.kernel.org/all/3a51840f6a80c87b39632dc728dbd9b5dd444cd7.1655761627.git.ashish.kalra@amd.com/

On 11/15/22 00:36, Kalra, Ashish wrote:
> Hello Boris,
> 
> On 11/2/2022 6:22 AM, Borislav Petkov wrote:
>> On Mon, Oct 31, 2022 at 04:58:38PM -0500, Kalra, Ashish wrote:
>>>       if (snp_lookup_rmpentry(pfn, &rmp_level)) {
>>>              do_sigbus(regs, error_code, address, VM_FAULT_SIGBUS);
>>>              return RMP_PF_RETRY;
>>
>> Does this issue some halfway understandable error message why the
>> process got killed?
>>
>>> Will look at adding our own recovery function for the same, but that will
>>> again mark the pages as poisoned, right ?
>>
>> Well, not poisoned but PG_offlimits or whatever the mm folks agree upon.
>> Semantically, it'll be handled the same way, ofc.
> 
> Added a new PG_offlimits flag and a simple corresponding handler for it.

One thing is, there's not enough page flags to be adding more (except
aliases for existing) for cases that can avoid it, but as Boris says, if
using alias to PG_hwpoison it depends what will become confused with the
actual hwpoison.

> But there is still added complexity of handling hugepages as part of
> reclamation failures (both HugeTLB and transparent hugepages) and that
> means calling more static functions in mm/memory_failure.c
> 
> There is probably a more appropriate handler in mm/memory-failure.c:
> 
> soft_offline_page() - this will mark the page as HWPoisoned and also has
> handling for hugepages. And we can avoid adding a new page flag too.
> 
> soft_offline_page - Soft offline a page.
> Soft offline a page, by migration or invalidation, without killing anything.
> 
> So, this looks like a good option to call
> soft_offline_page() instead of memory_failure() in case of
> failure to transition the page back to HV/shared state via SNP_RECLAIM_CMD
> and/or RMPUPDATE instruction.

So it's a bit unclear to me what exact situation we are handling here. The
original patch here seems to me to be just leaking back pages that are
unsafe for further use. soft_offline_page() seems to fit that scenario of a
graceful leak before something is irrepairably corrupt and we page fault on it.
But then in the thread you discus PF handling and killing. So what is the
case here? If we detect this need to call snp_leak_pages() does it mean:

a) nobody that could page fault at them (the guest?) is running anymore, we
are tearing it down, we just can't reuse the pages further on the host
- seem like soft_offline_page() could work, but maybe we could just put the
pages on some leaked lists without special page? The only thing that should
matter is not to free the pages to the page allocator so they would be
reused by something else.

b) something can stil page fault at them (what?) - AFAIU can't be resolved
without killing something, memory_failure() might limit the damage

> Thanks,
> Ashish
> 
>>
>>> Still waiting for some/more feedback from mm folks on the same.
>>
>> Just send the patch and they'll give it.
>>
>> Thx.
>>
Borislav Petkov Nov. 15, 2022, 3:22 p.m. UTC | #14
On Tue, Nov 15, 2022 at 04:14:42PM +0100, Vlastimil Babka wrote:
>  but maybe we could just put the pages on some leaked lists without
> special page? The only thing that should matter is not to free the
> pages to the page allocator so they would be reused by something else.

As said on IRC, I like this a *lot*. This perfectly represents what
those leaked pages are: leaked, cannot be used and lost. Certainly not
hwpoisoned.

Yeah, that's much better.

Thx.
Borislav Petkov Nov. 15, 2022, 4:27 p.m. UTC | #15
And,

as dhansen connected the dots, this should be the exact same protection
scenario as UPM:

https://lore.kernel.org/all/20221025151344.3784230-1-chao.p.peng@linux.intel.com

so you should be able to mark them inaccessible the same way and you
won't need any poisoning dance.

And Michael has patches so you probably should talk to him...

Thx.
Ashish Kalra Nov. 15, 2022, 5:24 p.m. UTC | #16
Hello Vlastimil,

On 11/15/2022 9:14 AM, Vlastimil Babka wrote:
> Cc'ing memory failure folks, the beinning of this subthread is here:
> 
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fall%2F3a51840f6a80c87b39632dc728dbd9b5dd444cd7.1655761627.git.ashish.kalra%40amd.com%2F&amp;data=05%7C01%7Cashish.kalra%40amd.com%7C944b59f239c541a52ac808dac71c2089%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C638041220947600149%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=do9zzyMlAErkKx5rguqnL2GoG4lhsWHDI74zgwLWaZU%3D&amp;reserved=0
> 
> On 11/15/22 00:36, Kalra, Ashish wrote:
>> Hello Boris,
>>
>> On 11/2/2022 6:22 AM, Borislav Petkov wrote:
>>> On Mon, Oct 31, 2022 at 04:58:38PM -0500, Kalra, Ashish wrote:
>>>>        if (snp_lookup_rmpentry(pfn, &rmp_level)) {
>>>>               do_sigbus(regs, error_code, address, VM_FAULT_SIGBUS);
>>>>               return RMP_PF_RETRY;
>>>
>>> Does this issue some halfway understandable error message why the
>>> process got killed?
>>>
>>>> Will look at adding our own recovery function for the same, but that will
>>>> again mark the pages as poisoned, right ?
>>>
>>> Well, not poisoned but PG_offlimits or whatever the mm folks agree upon.
>>> Semantically, it'll be handled the same way, ofc.
>>
>> Added a new PG_offlimits flag and a simple corresponding handler for it.
> 
> One thing is, there's not enough page flags to be adding more (except
> aliases for existing) for cases that can avoid it, but as Boris says, if
> using alias to PG_hwpoison it depends what will become confused with the
> actual hwpoison.
> 
>> But there is still added complexity of handling hugepages as part of
>> reclamation failures (both HugeTLB and transparent hugepages) and that
>> means calling more static functions in mm/memory_failure.c
>>
>> There is probably a more appropriate handler in mm/memory-failure.c:
>>
>> soft_offline_page() - this will mark the page as HWPoisoned and also has
>> handling for hugepages. And we can avoid adding a new page flag too.
>>
>> soft_offline_page - Soft offline a page.
>> Soft offline a page, by migration or invalidation, without killing anything.
>>
>> So, this looks like a good option to call
>> soft_offline_page() instead of memory_failure() in case of
>> failure to transition the page back to HV/shared state via SNP_RECLAIM_CMD
>> and/or RMPUPDATE instruction.
> 
> So it's a bit unclear to me what exact situation we are handling here. The
> original patch here seems to me to be just leaking back pages that are
> unsafe for further use. soft_offline_page() seems to fit that scenario of a
> graceful leak before something is irrepairably corrupt and we page fault on it.
> But then in the thread you discus PF handling and killing. So what is the
> case here? If we detect this need to call snp_leak_pages() does it mean:
> 
> a) nobody that could page fault at them (the guest?) is running anymore, we
> are tearing it down, we just can't reuse the pages further on the host

The host can page fault on them, if anything on the host tries to write 
to these pages. Host reads will return garbage data.

> - seem like soft_offline_page() could work, but maybe we could just put the
> pages on some leaked lists without special page? The only thing that should
> matter is not to free the pages to the page allocator so they would be
> reused by something else.
> 
> b) something can stil page fault at them (what?) - AFAIU can't be resolved
> without killing something, memory_failure() might limit the damage

As i mentioned above, host writes will cause RMP violation page fault.

Thanks,
Ashish

>>
>>>
>>>> Still waiting for some/more feedback from mm folks on the same.
>>>
>>> Just send the patch and they'll give it.
>>>
>>> Thx.
>>>
>
Ashish Kalra Nov. 15, 2022, 6:15 p.m. UTC | #17
On 11/15/2022 11:24 AM, Kalra, Ashish wrote:
> Hello Vlastimil,
> 
> On 11/15/2022 9:14 AM, Vlastimil Babka wrote:
>> Cc'ing memory failure folks, the beinning of this subthread is here:
>>
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fall%2F3a51840f6a80c87b39632dc728dbd9b5dd444cd7.1655761627.git.ashish.kalra%40amd.com%2F&amp;data=05%7C01%7Cashish.kalra%40amd.com%7C944b59f239c541a52ac808dac71c2089%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C638041220947600149%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=do9zzyMlAErkKx5rguqnL2GoG4lhsWHDI74zgwLWaZU%3D&amp;reserved=0 
>>
>>
>> On 11/15/22 00:36, Kalra, Ashish wrote:
>>> Hello Boris,
>>>
>>> On 11/2/2022 6:22 AM, Borislav Petkov wrote:
>>>> On Mon, Oct 31, 2022 at 04:58:38PM -0500, Kalra, Ashish wrote:
>>>>>        if (snp_lookup_rmpentry(pfn, &rmp_level)) {
>>>>>               do_sigbus(regs, error_code, address, VM_FAULT_SIGBUS);
>>>>>               return RMP_PF_RETRY;
>>>>
>>>> Does this issue some halfway understandable error message why the
>>>> process got killed?
>>>>
>>>>> Will look at adding our own recovery function for the same, but 
>>>>> that will
>>>>> again mark the pages as poisoned, right ?
>>>>
>>>> Well, not poisoned but PG_offlimits or whatever the mm folks agree 
>>>> upon.
>>>> Semantically, it'll be handled the same way, ofc.
>>>
>>> Added a new PG_offlimits flag and a simple corresponding handler for it.
>>
>> One thing is, there's not enough page flags to be adding more (except
>> aliases for existing) for cases that can avoid it, but as Boris says, if
>> using alias to PG_hwpoison it depends what will become confused with the
>> actual hwpoison.
>>
>>> But there is still added complexity of handling hugepages as part of
>>> reclamation failures (both HugeTLB and transparent hugepages) and that
>>> means calling more static functions in mm/memory_failure.c
>>>
>>> There is probably a more appropriate handler in mm/memory-failure.c:
>>>
>>> soft_offline_page() - this will mark the page as HWPoisoned and also has
>>> handling for hugepages. And we can avoid adding a new page flag too.
>>>
>>> soft_offline_page - Soft offline a page.
>>> Soft offline a page, by migration or invalidation, without killing 
>>> anything.
>>>
>>> So, this looks like a good option to call
>>> soft_offline_page() instead of memory_failure() in case of
>>> failure to transition the page back to HV/shared state via 
>>> SNP_RECLAIM_CMD
>>> and/or RMPUPDATE instruction.
>>
>> So it's a bit unclear to me what exact situation we are handling here. 
>> The
>> original patch here seems to me to be just leaking back pages that are
>> unsafe for further use. soft_offline_page() seems to fit that scenario 
>> of a
>> graceful leak before something is irrepairably corrupt and we page 
>> fault on it.
>> But then in the thread you discus PF handling and killing. So what is the
>> case here? If we detect this need to call snp_leak_pages() does it mean:
>>
>> a) nobody that could page fault at them (the guest?) is running 
>> anymore, we
>> are tearing it down, we just can't reuse the pages further on the host
> 
> The host can page fault on them, if anything on the host tries to write 
> to these pages. Host reads will return garbage data.
> 
>> - seem like soft_offline_page() could work, but maybe we could just 
>> put the
>> pages on some leaked lists without special page? The only thing that 
>> should
>> matter is not to free the pages to the page allocator so they would be
>> reused by something else.
>>
>> b) something can stil page fault at them (what?) - AFAIU can't be 
>> resolved
>> without killing something, memory_failure() might limit the damage
> 
> As i mentioned above, host writes will cause RMP violation page fault.
>

And to add here, if its a guest private page, then the above fault 
cannot be resolved, so the faulting process is terminated.

Thanks,
Ashish

> 
>>>
>>>>
>>>>> Still waiting for some/more feedback from mm folks on the same.
>>>>
>>>> Just send the patch and they'll give it.
>>>>
>>>> Thx.
>>>>
>>
Ashish Kalra Nov. 15, 2022, 10:44 p.m. UTC | #18
Hello Boris,

On 11/15/2022 10:27 AM, Borislav Petkov wrote:
> And,
> 
> as dhansen connected the dots, this should be the exact same protection
> scenario as UPM:
> 
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fall%2F20221025151344.3784230-1-chao.p.peng%40linux.intel.com&amp;data=05%7C01%7Cashish.kalra%40amd.com%7Cbfecf32a51eb499b526d08dac726491e%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C638041264625164355%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=3RqOC3b9qn0%2B2IRsTZURBmhAVtOn7rARR6fOMOsrFpE%3D&amp;reserved=0
> 
> so you should be able to mark them inaccessible the same way and you
> won't need any poisoning dance.

With UPM, the guest pages are all still freed back to the host after 
guest shutdown, so it's not clear how this would help with handling of 
leaked pages, for e.g, the host can still access these pages once the 
guest is shutdown and it will cause the RMP violation #PF at that point.

Additionally, our use case is of host allocated firmware pages as part 
of the crypto driver (to be passed to SNP firmware api calls
and then re-transitioned back to host state on return) so these are not 
guest private pages in the true sense and they need to be
handled differently in case there is a failure in reclaiming them.

Can you elaborate on what you have in mind ?

Thanks,
Ashish

> 
> And Michael has patches so you probably should talk to him...
> 
> Thx.
>
HORIGUCHI NAOYA(堀口 直也) Nov. 16, 2022, 5:19 a.m. UTC | #19
On Tue, Nov 15, 2022 at 04:14:42PM +0100, Vlastimil Babka wrote:
> Cc'ing memory failure folks, the beinning of this subthread is here:
> 
> https://lore.kernel.org/all/3a51840f6a80c87b39632dc728dbd9b5dd444cd7.1655761627.git.ashish.kalra@amd.com/
> 
> On 11/15/22 00:36, Kalra, Ashish wrote:
> > Hello Boris,
> > 
> > On 11/2/2022 6:22 AM, Borislav Petkov wrote:
> >> On Mon, Oct 31, 2022 at 04:58:38PM -0500, Kalra, Ashish wrote:
> >>>       if (snp_lookup_rmpentry(pfn, &rmp_level)) {
> >>>              do_sigbus(regs, error_code, address, VM_FAULT_SIGBUS);
> >>>              return RMP_PF_RETRY;
> >>
> >> Does this issue some halfway understandable error message why the
> >> process got killed?
> >>
> >>> Will look at adding our own recovery function for the same, but that will
> >>> again mark the pages as poisoned, right ?
> >>
> >> Well, not poisoned but PG_offlimits or whatever the mm folks agree upon.
> >> Semantically, it'll be handled the same way, ofc.
> > 
> > Added a new PG_offlimits flag and a simple corresponding handler for it.
> 
> One thing is, there's not enough page flags to be adding more (except
> aliases for existing) for cases that can avoid it, but as Boris says, if
> using alias to PG_hwpoison it depends what will become confused with the
> actual hwpoison.

I agree with this. Just defining PG_offlimits as an alias of PG_hwpoison
could break current hwpoison workload.  So if you finally decide to go
forward in this direction, you may as well have some indicator to
distinguish the new kind of leaked pages from hwpoisoned pages.

I don't remember exact thread, but I've read someone writing about similar
kind of suggestion of using memory_failure() to make pages inaccessible in
non-memory error usecase.  I feel that it could be possible to generalize
memory_failure() as general-purpose page offlining (by renaming it with
hard_offline_page() and making memory_failure() one of the user of it).

Thanks,
Naoya Horiguchi
Vlastimil Babka Nov. 16, 2022, 9:08 a.m. UTC | #20
On 11/15/22 19:15, Kalra, Ashish wrote:
> 
> On 11/15/2022 11:24 AM, Kalra, Ashish wrote:
>> Hello Vlastimil,
>>
>> On 11/15/2022 9:14 AM, Vlastimil Babka wrote:
>>> Cc'ing memory failure folks, the beinning of this subthread is here:
>>>
>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fall%2F3a51840f6a80c87b39632dc728dbd9b5dd444cd7.1655761627.git.ashish.kalra%40amd.com%2F&amp;data=05%7C01%7Cashish.kalra%40amd.com%7C944b59f239c541a52ac808dac71c2089%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C638041220947600149%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=do9zzyMlAErkKx5rguqnL2GoG4lhsWHDI74zgwLWaZU%3D&amp;reserved=0
>>>
>>> On 11/15/22 00:36, Kalra, Ashish wrote:
>>>> Hello Boris,
>>>>
>>>> On 11/2/2022 6:22 AM, Borislav Petkov wrote:
>>>>> On Mon, Oct 31, 2022 at 04:58:38PM -0500, Kalra, Ashish wrote:
>>>>>>        if (snp_lookup_rmpentry(pfn, &rmp_level)) {
>>>>>>               do_sigbus(regs, error_code, address, VM_FAULT_SIGBUS);
>>>>>>               return RMP_PF_RETRY;
>>>>>
>>>>> Does this issue some halfway understandable error message why the
>>>>> process got killed?
>>>>>
>>>>>> Will look at adding our own recovery function for the same, but that will
>>>>>> again mark the pages as poisoned, right ?
>>>>>
>>>>> Well, not poisoned but PG_offlimits or whatever the mm folks agree upon.
>>>>> Semantically, it'll be handled the same way, ofc.
>>>>
>>>> Added a new PG_offlimits flag and a simple corresponding handler for it.
>>>
>>> One thing is, there's not enough page flags to be adding more (except
>>> aliases for existing) for cases that can avoid it, but as Boris says, if
>>> using alias to PG_hwpoison it depends what will become confused with the
>>> actual hwpoison.
>>>
>>>> But there is still added complexity of handling hugepages as part of
>>>> reclamation failures (both HugeTLB and transparent hugepages) and that
>>>> means calling more static functions in mm/memory_failure.c
>>>>
>>>> There is probably a more appropriate handler in mm/memory-failure.c:
>>>>
>>>> soft_offline_page() - this will mark the page as HWPoisoned and also has
>>>> handling for hugepages. And we can avoid adding a new page flag too.
>>>>
>>>> soft_offline_page - Soft offline a page.
>>>> Soft offline a page, by migration or invalidation, without killing
>>>> anything.
>>>>
>>>> So, this looks like a good option to call
>>>> soft_offline_page() instead of memory_failure() in case of
>>>> failure to transition the page back to HV/shared state via SNP_RECLAIM_CMD
>>>> and/or RMPUPDATE instruction.
>>>
>>> So it's a bit unclear to me what exact situation we are handling here. The
>>> original patch here seems to me to be just leaking back pages that are
>>> unsafe for further use. soft_offline_page() seems to fit that scenario of a
>>> graceful leak before something is irrepairably corrupt and we page fault
>>> on it.
>>> But then in the thread you discus PF handling and killing. So what is the
>>> case here? If we detect this need to call snp_leak_pages() does it mean:
>>>
>>> a) nobody that could page fault at them (the guest?) is running anymore, we
>>> are tearing it down, we just can't reuse the pages further on the host
>>
>> The host can page fault on them, if anything on the host tries to write to
>> these pages. Host reads will return garbage data.
>>
>>> - seem like soft_offline_page() could work, but maybe we could just put the
>>> pages on some leaked lists without special page? The only thing that should
>>> matter is not to free the pages to the page allocator so they would be
>>> reused by something else.
>>>
>>> b) something can stil page fault at them (what?) - AFAIU can't be resolved
>>> without killing something, memory_failure() might limit the damage
>>
>> As i mentioned above, host writes will cause RMP violation page fault.
>>
> 
> And to add here, if its a guest private page, then the above fault cannot be
> resolved, so the faulting process is terminated.

BTW would this not be mostly resolved as part of rebasing to UPM?
- host will not have these pages mapped in the first place (both kernel
directmap and qemu userspace)
- guest will have them mapped, but I assume that the conversion from private
to shared (that might fail?) can only happen after guest's mappings are
invalidated in the first place?

> Thanks,
> Ashish
> 
>>
>>>>
>>>>>
>>>>>> Still waiting for some/more feedback from mm folks on the same.
>>>>>
>>>>> Just send the patch and they'll give it.
>>>>>
>>>>> Thx.
>>>>>
>>>
Ashish Kalra Nov. 16, 2022, 10:19 a.m. UTC | #21
On 11/16/2022 3:08 AM, Vlastimil Babka wrote:
> On 11/15/22 19:15, Kalra, Ashish wrote:
>>
>> On 11/15/2022 11:24 AM, Kalra, Ashish wrote:
>>> Hello Vlastimil,
>>>
>>> On 11/15/2022 9:14 AM, Vlastimil Babka wrote:
>>>> Cc'ing memory failure folks, the beinning of this subthread is here:
>>>>
>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fall%2F3a51840f6a80c87b39632dc728dbd9b5dd444cd7.1655761627.git.ashish.kalra%40amd.com%2F&amp;data=05%7C01%7Cashish.kalra%40amd.com%7C38f8b76238014c67049308dac7b213a5%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C638041865033588985%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=DDm7tZhUJLy%2BMzS1SXUnsBmBQnAI5dqR6tWZhCKRMEI%3D&amp;reserved=0
>>>>
>>>> On 11/15/22 00:36, Kalra, Ashish wrote:
>>>>> Hello Boris,
>>>>>
>>>>> On 11/2/2022 6:22 AM, Borislav Petkov wrote:
>>>>>> On Mon, Oct 31, 2022 at 04:58:38PM -0500, Kalra, Ashish wrote:
>>>>>>>         if (snp_lookup_rmpentry(pfn, &rmp_level)) {
>>>>>>>                do_sigbus(regs, error_code, address, VM_FAULT_SIGBUS);
>>>>>>>                return RMP_PF_RETRY;
>>>>>>
>>>>>> Does this issue some halfway understandable error message why the
>>>>>> process got killed?
>>>>>>
>>>>>>> Will look at adding our own recovery function for the same, but that will
>>>>>>> again mark the pages as poisoned, right ?
>>>>>>
>>>>>> Well, not poisoned but PG_offlimits or whatever the mm folks agree upon.
>>>>>> Semantically, it'll be handled the same way, ofc.
>>>>>
>>>>> Added a new PG_offlimits flag and a simple corresponding handler for it.
>>>>
>>>> One thing is, there's not enough page flags to be adding more (except
>>>> aliases for existing) for cases that can avoid it, but as Boris says, if
>>>> using alias to PG_hwpoison it depends what will become confused with the
>>>> actual hwpoison.
>>>>
>>>>> But there is still added complexity of handling hugepages as part of
>>>>> reclamation failures (both HugeTLB and transparent hugepages) and that
>>>>> means calling more static functions in mm/memory_failure.c
>>>>>
>>>>> There is probably a more appropriate handler in mm/memory-failure.c:
>>>>>
>>>>> soft_offline_page() - this will mark the page as HWPoisoned and also has
>>>>> handling for hugepages. And we can avoid adding a new page flag too.
>>>>>
>>>>> soft_offline_page - Soft offline a page.
>>>>> Soft offline a page, by migration or invalidation, without killing
>>>>> anything.
>>>>>
>>>>> So, this looks like a good option to call
>>>>> soft_offline_page() instead of memory_failure() in case of
>>>>> failure to transition the page back to HV/shared state via SNP_RECLAIM_CMD
>>>>> and/or RMPUPDATE instruction.
>>>>
>>>> So it's a bit unclear to me what exact situation we are handling here. The
>>>> original patch here seems to me to be just leaking back pages that are
>>>> unsafe for further use. soft_offline_page() seems to fit that scenario of a
>>>> graceful leak before something is irrepairably corrupt and we page fault
>>>> on it.
>>>> But then in the thread you discus PF handling and killing. So what is the
>>>> case here? If we detect this need to call snp_leak_pages() does it mean:
>>>>
>>>> a) nobody that could page fault at them (the guest?) is running anymore, we
>>>> are tearing it down, we just can't reuse the pages further on the host
>>>
>>> The host can page fault on them, if anything on the host tries to write to
>>> these pages. Host reads will return garbage data.
>>>
>>>> - seem like soft_offline_page() could work, but maybe we could just put the
>>>> pages on some leaked lists without special page? The only thing that should
>>>> matter is not to free the pages to the page allocator so they would be
>>>> reused by something else.
>>>>
>>>> b) something can stil page fault at them (what?) - AFAIU can't be resolved
>>>> without killing something, memory_failure() might limit the damage
>>>
>>> As i mentioned above, host writes will cause RMP violation page fault.
>>>
>>
>> And to add here, if its a guest private page, then the above fault cannot be
>> resolved, so the faulting process is terminated.
> 
> BTW would this not be mostly resolved as part of rebasing to UPM?
> - host will not have these pages mapped in the first place (both kernel
> directmap and qemu userspace)
> - guest will have them mapped, but I assume that the conversion from private
> to shared (that might fail?) can only happen after guest's mappings are
> invalidated in the first place?
> 

Yes, that will be true for guest private pages. But then there are host 
allocated pages for firmware use which will remain in firmware page 
state or reclaim state if they can't be transitioned back to HV/shared 
state once the firmware releases them back to the host and accessing 
them at this point can potentially cause RMP violation #PF.

Again i don't think this is going to happen regularly or frequently so 
it will be a rare error case where the page reclamation, i.e., the 
transition back to HV/shared state fails and now these pages are no 
longer safe to be used.

Referring back to your thoughts about putting these pages on some leaked
pages list, do any such leaked pages list exist currently ?

Thanks,
Ashish

>>>
>>>>>
>>>>>>
>>>>>>> Still waiting for some/more feedback from mm folks on the same.
>>>>>>
>>>>>> Just send the patch and they'll give it.
>>>>>>
>>>>>> Thx.
>>>>>>
>>>>
>
Vlastimil Babka Nov. 16, 2022, 10:25 a.m. UTC | #22
On 11/16/22 11:19, Kalra, Ashish wrote:
> On 11/16/2022 3:08 AM, Vlastimil Babka wrote:
>> On 11/15/22 19:15, Kalra, Ashish wrote:
>>>
>>> On 11/15/2022 11:24 AM, Kalra, Ashish wrote:
>>>> Hello Vlastimil,
>>>>
>>>> On 11/15/2022 9:14 AM, Vlastimil Babka wrote:
>>>>> Cc'ing memory failure folks, the beinning of this subthread is here:
>>>>>
>>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fall%2F3a51840f6a80c87b39632dc728dbd9b5dd444cd7.1655761627.git.ashish.kalra%40amd.com%2F&amp;data=05%7C01%7Cashish.kalra%40amd.com%7C38f8b76238014c67049308dac7b213a5%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C638041865033588985%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=DDm7tZhUJLy%2BMzS1SXUnsBmBQnAI5dqR6tWZhCKRMEI%3D&amp;reserved=0
>>>>>
>>>>> On 11/15/22 00:36, Kalra, Ashish wrote:
>>>>>> Hello Boris,
>>>>>>
>>>>>> On 11/2/2022 6:22 AM, Borislav Petkov wrote:
>>>>>>> On Mon, Oct 31, 2022 at 04:58:38PM -0500, Kalra, Ashish wrote:
>>>>>>>>         if (snp_lookup_rmpentry(pfn, &rmp_level)) {
>>>>>>>>                do_sigbus(regs, error_code, address, VM_FAULT_SIGBUS);
>>>>>>>>                return RMP_PF_RETRY;
>>>>>>>
>>>>>>> Does this issue some halfway understandable error message why the
>>>>>>> process got killed?
>>>>>>>
>>>>>>>> Will look at adding our own recovery function for the same, but that
>>>>>>>> will
>>>>>>>> again mark the pages as poisoned, right ?
>>>>>>>
>>>>>>> Well, not poisoned but PG_offlimits or whatever the mm folks agree upon.
>>>>>>> Semantically, it'll be handled the same way, ofc.
>>>>>>
>>>>>> Added a new PG_offlimits flag and a simple corresponding handler for it.
>>>>>
>>>>> One thing is, there's not enough page flags to be adding more (except
>>>>> aliases for existing) for cases that can avoid it, but as Boris says, if
>>>>> using alias to PG_hwpoison it depends what will become confused with the
>>>>> actual hwpoison.
>>>>>
>>>>>> But there is still added complexity of handling hugepages as part of
>>>>>> reclamation failures (both HugeTLB and transparent hugepages) and that
>>>>>> means calling more static functions in mm/memory_failure.c
>>>>>>
>>>>>> There is probably a more appropriate handler in mm/memory-failure.c:
>>>>>>
>>>>>> soft_offline_page() - this will mark the page as HWPoisoned and also has
>>>>>> handling for hugepages. And we can avoid adding a new page flag too.
>>>>>>
>>>>>> soft_offline_page - Soft offline a page.
>>>>>> Soft offline a page, by migration or invalidation, without killing
>>>>>> anything.
>>>>>>
>>>>>> So, this looks like a good option to call
>>>>>> soft_offline_page() instead of memory_failure() in case of
>>>>>> failure to transition the page back to HV/shared state via
>>>>>> SNP_RECLAIM_CMD
>>>>>> and/or RMPUPDATE instruction.
>>>>>
>>>>> So it's a bit unclear to me what exact situation we are handling here. The
>>>>> original patch here seems to me to be just leaking back pages that are
>>>>> unsafe for further use. soft_offline_page() seems to fit that scenario
>>>>> of a
>>>>> graceful leak before something is irrepairably corrupt and we page fault
>>>>> on it.
>>>>> But then in the thread you discus PF handling and killing. So what is the
>>>>> case here? If we detect this need to call snp_leak_pages() does it mean:
>>>>>
>>>>> a) nobody that could page fault at them (the guest?) is running
>>>>> anymore, we
>>>>> are tearing it down, we just can't reuse the pages further on the host
>>>>
>>>> The host can page fault on them, if anything on the host tries to write to
>>>> these pages. Host reads will return garbage data.
>>>>
>>>>> - seem like soft_offline_page() could work, but maybe we could just put
>>>>> the
>>>>> pages on some leaked lists without special page? The only thing that
>>>>> should
>>>>> matter is not to free the pages to the page allocator so they would be
>>>>> reused by something else.
>>>>>
>>>>> b) something can stil page fault at them (what?) - AFAIU can't be resolved
>>>>> without killing something, memory_failure() might limit the damage
>>>>
>>>> As i mentioned above, host writes will cause RMP violation page fault.
>>>>
>>>
>>> And to add here, if its a guest private page, then the above fault cannot be
>>> resolved, so the faulting process is terminated.
>>
>> BTW would this not be mostly resolved as part of rebasing to UPM?
>> - host will not have these pages mapped in the first place (both kernel
>> directmap and qemu userspace)
>> - guest will have them mapped, but I assume that the conversion from private
>> to shared (that might fail?) can only happen after guest's mappings are
>> invalidated in the first place?
>>
> 
> Yes, that will be true for guest private pages. But then there are host
> allocated pages for firmware use which will remain in firmware page state or
> reclaim state if they can't be transitioned back to HV/shared state once the
> firmware releases them back to the host and accessing them at this point can
> potentially cause RMP violation #PF.
> 
> Again i don't think this is going to happen regularly or frequently so it
> will be a rare error case where the page reclamation, i.e., the transition
> back to HV/shared state fails and now these pages are no longer safe to be
> used.
> 
> Referring back to your thoughts about putting these pages on some leaked
> pages list, do any such leaked pages list exist currently ?

Not AFAIK, you could just create a list_head somewhere appropriate (some snp
state structure?) and put the pages there, maybe with a counter exposed in
debugs. The point would be mostly that if something goes so wrong it would
be leaking substantial amounts of memory, we can at least recognize the
cause (but I suppose the dmesg will be also full of messages) and e.g. find
the pages in a crash dump.

> Thanks,
> Ashish
> 
>>>>
>>>>>>
>>>>>>>
>>>>>>>> Still waiting for some/more feedback from mm folks on the same.
>>>>>>>
>>>>>>> Just send the patch and they'll give it.
>>>>>>>
>>>>>>> Thx.
>>>>>>>
>>>>>
>>
Ashish Kalra Nov. 16, 2022, 10:28 a.m. UTC | #23
On 11/15/2022 11:19 PM, HORIGUCHI NAOYA(堀口 直也) wrote:
> On Tue, Nov 15, 2022 at 04:14:42PM +0100, Vlastimil Babka wrote:
>> Cc'ing memory failure folks, the beinning of this subthread is here:
>>
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fall%2F3a51840f6a80c87b39632dc728dbd9b5dd444cd7.1655761627.git.ashish.kalra%40amd.com%2F&amp;data=05%7C01%7Cashish.kalra%40amd.com%7C7b2d39d6e2504a8f923608dac792224b%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C638041727879125176%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=KBJLKhPQP23vmvY%2FNnbjZs8wTJs%2FrF%2BiU54Sdc4Ldx4%3D&amp;reserved=0
>>
>> On 11/15/22 00:36, Kalra, Ashish wrote:
>>> Hello Boris,
>>>
>>> On 11/2/2022 6:22 AM, Borislav Petkov wrote:
>>>> On Mon, Oct 31, 2022 at 04:58:38PM -0500, Kalra, Ashish wrote:
>>>>>        if (snp_lookup_rmpentry(pfn, &rmp_level)) {
>>>>>               do_sigbus(regs, error_code, address, VM_FAULT_SIGBUS);
>>>>>               return RMP_PF_RETRY;
>>>>
>>>> Does this issue some halfway understandable error message why the
>>>> process got killed?
>>>>
>>>>> Will look at adding our own recovery function for the same, but that will
>>>>> again mark the pages as poisoned, right ?
>>>>
>>>> Well, not poisoned but PG_offlimits or whatever the mm folks agree upon.
>>>> Semantically, it'll be handled the same way, ofc.
>>>
>>> Added a new PG_offlimits flag and a simple corresponding handler for it.
>>
>> One thing is, there's not enough page flags to be adding more (except
>> aliases for existing) for cases that can avoid it, but as Boris says, if
>> using alias to PG_hwpoison it depends what will become confused with the
>> actual hwpoison.
> 
> I agree with this. Just defining PG_offlimits as an alias of PG_hwpoison
> could break current hwpoison workload.  So if you finally decide to go
> forward in this direction, you may as well have some indicator to
> distinguish the new kind of leaked pages from hwpoisoned pages.
> 
> I don't remember exact thread, but I've read someone writing about similar
> kind of suggestion of using memory_failure() to make pages inaccessible in
> non-memory error usecase.  I feel that it could be possible to generalize
> memory_failure() as general-purpose page offlining (by renaming it with

But, doesn't memory_failure() also mark the pages as PG_hwpoison, and 
then using it for these leaked pages will again cause confusion with 
actual hwpoison ?

Thanks,
Ashish

> hard_offline_page() and making memory_failure() one of the user of it).
> 
> Thanks,
> Naoya Horiguchi
>
Ashish Kalra Nov. 16, 2022, 6:01 p.m. UTC | #24
On 11/16/2022 4:25 AM, Vlastimil Babka wrote:
> On 11/16/22 11:19, Kalra, Ashish wrote:
>> On 11/16/2022 3:08 AM, Vlastimil Babka wrote:
>>> On 11/15/22 19:15, Kalra, Ashish wrote:
>>>>
>>>> On 11/15/2022 11:24 AM, Kalra, Ashish wrote:
>>>>> Hello Vlastimil,
>>>>>
>>>>> On 11/15/2022 9:14 AM, Vlastimil Babka wrote:
>>>>>> Cc'ing memory failure folks, the beinning of this subthread is here:
>>>>>>
>>>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fall%2F3a51840f6a80c87b39632dc728dbd9b5dd444cd7.1655761627.git.ashish.kalra%40amd.com%2F&amp;data=05%7C01%7Cashish.kalra%40amd.com%7C174b7caaf99a473194cd08dac7bcebf3%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C638041911481429347%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=CFkAXNQqangvCqhnwDyIUJUkfiUrX67OpKDTtLGj6PU%3D&amp;reserved=0
>>>>>>
>>>>>> On 11/15/22 00:36, Kalra, Ashish wrote:
>>>>>>> Hello Boris,
>>>>>>>
>>>>>>> On 11/2/2022 6:22 AM, Borislav Petkov wrote:
>>>>>>>> On Mon, Oct 31, 2022 at 04:58:38PM -0500, Kalra, Ashish wrote:
>>>>>>>>>          if (snp_lookup_rmpentry(pfn, &rmp_level)) {
>>>>>>>>>                 do_sigbus(regs, error_code, address, VM_FAULT_SIGBUS);
>>>>>>>>>                 return RMP_PF_RETRY;
>>>>>>>>
>>>>>>>> Does this issue some halfway understandable error message why the
>>>>>>>> process got killed?
>>>>>>>>
>>>>>>>>> Will look at adding our own recovery function for the same, but that
>>>>>>>>> will
>>>>>>>>> again mark the pages as poisoned, right ?
>>>>>>>>
>>>>>>>> Well, not poisoned but PG_offlimits or whatever the mm folks agree upon.
>>>>>>>> Semantically, it'll be handled the same way, ofc.
>>>>>>>
>>>>>>> Added a new PG_offlimits flag and a simple corresponding handler for it.
>>>>>>
>>>>>> One thing is, there's not enough page flags to be adding more (except
>>>>>> aliases for existing) for cases that can avoid it, but as Boris says, if
>>>>>> using alias to PG_hwpoison it depends what will become confused with the
>>>>>> actual hwpoison.
>>>>>>
>>>>>>> But there is still added complexity of handling hugepages as part of
>>>>>>> reclamation failures (both HugeTLB and transparent hugepages) and that
>>>>>>> means calling more static functions in mm/memory_failure.c
>>>>>>>
>>>>>>> There is probably a more appropriate handler in mm/memory-failure.c:
>>>>>>>
>>>>>>> soft_offline_page() - this will mark the page as HWPoisoned and also has
>>>>>>> handling for hugepages. And we can avoid adding a new page flag too.
>>>>>>>
>>>>>>> soft_offline_page - Soft offline a page.
>>>>>>> Soft offline a page, by migration or invalidation, without killing
>>>>>>> anything.
>>>>>>>
>>>>>>> So, this looks like a good option to call
>>>>>>> soft_offline_page() instead of memory_failure() in case of
>>>>>>> failure to transition the page back to HV/shared state via
>>>>>>> SNP_RECLAIM_CMD
>>>>>>> and/or RMPUPDATE instruction.
>>>>>>
>>>>>> So it's a bit unclear to me what exact situation we are handling here. The
>>>>>> original patch here seems to me to be just leaking back pages that are
>>>>>> unsafe for further use. soft_offline_page() seems to fit that scenario
>>>>>> of a
>>>>>> graceful leak before something is irrepairably corrupt and we page fault
>>>>>> on it.
>>>>>> But then in the thread you discus PF handling and killing. So what is the
>>>>>> case here? If we detect this need to call snp_leak_pages() does it mean:
>>>>>>
>>>>>> a) nobody that could page fault at them (the guest?) is running
>>>>>> anymore, we
>>>>>> are tearing it down, we just can't reuse the pages further on the host
>>>>>
>>>>> The host can page fault on them, if anything on the host tries to write to
>>>>> these pages. Host reads will return garbage data.
>>>>>
>>>>>> - seem like soft_offline_page() could work, but maybe we could just put
>>>>>> the
>>>>>> pages on some leaked lists without special page? The only thing that
>>>>>> should
>>>>>> matter is not to free the pages to the page allocator so they would be
>>>>>> reused by something else.
>>>>>>
>>>>>> b) something can stil page fault at them (what?) - AFAIU can't be resolved
>>>>>> without killing something, memory_failure() might limit the damage
>>>>>
>>>>> As i mentioned above, host writes will cause RMP violation page fault.
>>>>>
>>>>
>>>> And to add here, if its a guest private page, then the above fault cannot be
>>>> resolved, so the faulting process is terminated.
>>>
>>> BTW would this not be mostly resolved as part of rebasing to UPM?
>>> - host will not have these pages mapped in the first place (both kernel
>>> directmap and qemu userspace)
>>> - guest will have them mapped, but I assume that the conversion from private
>>> to shared (that might fail?) can only happen after guest's mappings are
>>> invalidated in the first place?
>>>
>>
>> Yes, that will be true for guest private pages. But then there are host
>> allocated pages for firmware use which will remain in firmware page state or
>> reclaim state if they can't be transitioned back to HV/shared state once the
>> firmware releases them back to the host and accessing them at this point can
>> potentially cause RMP violation #PF.
>>
>> Again i don't think this is going to happen regularly or frequently so it
>> will be a rare error case where the page reclamation, i.e., the transition
>> back to HV/shared state fails and now these pages are no longer safe to be
>> used.
>>
>> Referring back to your thoughts about putting these pages on some leaked
>> pages list, do any such leaked pages list exist currently ?
> 
> Not AFAIK, you could just create a list_head somewhere appropriate (some snp
> state structure?) and put the pages there, maybe with a counter exposed in
> debugs. The point would be mostly that if something goes so wrong it would
> be leaking substantial amounts of memory, we can at least recognize the
> cause (but I suppose the dmesg will be also full of messages) and e.g. find
> the pages in a crash dump.
> 

Ok, so i will work on implementing this leaked pages list and put it on 
a sev/snp associated structure.

Also to add here, we will actually get a not-present #PF instead of the 
RMP violation #PF on writing to these leaked pages, as these pages would 
have been removed from the kernel direct map.

Thanks,
Ashish

>>
>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>>> Still waiting for some/more feedback from mm folks on the same.
>>>>>>>>
>>>>>>>> Just send the patch and they'll give it.
>>>>>>>>
>>>>>>>> Thx.
>>>>>>>>
>>>>>>
>>>
>
Dave Hansen Nov. 16, 2022, 6:32 p.m. UTC | #25
On 11/16/22 02:25, Vlastimil Babka wrote:
>> Referring back to your thoughts about putting these pages on some leaked
>> pages list, do any such leaked pages list exist currently ?
> Not AFAIK, you could just create a list_head somewhere appropriate (some snp
> state structure?) and put the pages there, maybe with a counter exposed in
> debugs. The point would be mostly that if something goes so wrong it would
> be leaking substantial amounts of memory, we can at least recognize the
> cause (but I suppose the dmesg will be also full of messages) and e.g. find
> the pages in a crash dump.

It might also be worth looking through the places that check
PageHWPoison() and making sure that none of them are poking into the
page contents.

It's also the kind of thing that adding some CONFIG_DEBUG_VM checks
might help with.  For instance, nobody should ever be kmap*()'ing a
private page.  The same might even go for pin_user_pages().
Borislav Petkov Nov. 16, 2022, 6:33 p.m. UTC | #26
On Wed, Nov 16, 2022 at 12:01:11PM -0600, Kalra, Ashish wrote:
> Ok, so i will work on implementing this leaked pages list and put it on a
> sev/snp associated structure.

See __sgx_sanitize_pages() and the poison list there, for an example.

> Also to add here, we will actually get a not-present #PF instead of the RMP
> violation #PF on writing to these leaked pages, as these pages would have
> been removed from the kernel direct map.

So if you do the list and still have the kernel raise a RMP fault for
those pages, traversing that list in the RMP handler to check whether
the page is there on it, should be a lot faster operation than doing the
#PF thing and removing them from the direct map.

And sorry for misleading you about UPM - we were thinking wrong
yesterday.

Thx.
Ashish Kalra Nov. 16, 2022, 6:53 p.m. UTC | #27
On 11/16/2022 12:33 PM, Borislav Petkov wrote:
> On Wed, Nov 16, 2022 at 12:01:11PM -0600, Kalra, Ashish wrote:
>> Ok, so i will work on implementing this leaked pages list and put it on a
>> sev/snp associated structure.
> 
> See __sgx_sanitize_pages() and the poison list there, for an example.
> 
>> Also to add here, we will actually get a not-present #PF instead of the RMP
>> violation #PF on writing to these leaked pages, as these pages would have
>> been removed from the kernel direct map.
> 
> So if you do the list and still have the kernel raise a RMP fault for
> those pages, traversing that list in the RMP handler to check whether
> the page is there on it, should be a lot faster operation than doing the
> #PF thing and removing them from the direct map.
> 

Actually, these host allocated pages would have already been removed 
from the kernel direct map, when they were transitioned to the firmware 
state. So actually the not-present #PF fault will happen on any 
read/write access to these leaked pages instead of the RMP violation #PF 
(not-present #PF has higher priority than RMP violation #PF).

If these pages cannot be reclaimed, they are unsafe to use and cannot be 
added back to the kernel direct map.

Thanks,
Ashish

> And sorry for misleading you about UPM - we were thinking wrong
> yesterday.
> 
> Thx.
>
Borislav Petkov Nov. 16, 2022, 7:09 p.m. UTC | #28
On Wed, Nov 16, 2022 at 12:53:36PM -0600, Kalra, Ashish wrote:
> Actually, these host allocated pages would have already been removed from
> the kernel direct map,

And, as I said above, it would be a lot easier to handle any potential
faults resulting from the host touching them by having it raise a *RMP*
fault instead of normal *PF* fault where the latter code is a crazy mess.
Ashish Kalra Nov. 16, 2022, 7:23 p.m. UTC | #29
On 11/16/2022 1:09 PM, Borislav Petkov wrote:
> On Wed, Nov 16, 2022 at 12:53:36PM -0600, Kalra, Ashish wrote:
>> Actually, these host allocated pages would have already been removed from
>> the kernel direct map,
> 
> And, as I said above, it would be a lot easier to handle any potential
> faults resulting from the host touching them by having it raise a *RMP*
> fault instead of normal *PF* fault where the latter code is a crazy mess.

Just to reiterate here, we won't be getting a *RMP* fault but will 
instead get a normal (not-present) #PF fault when the host touches these 
pages.

Sorry for any confusion about the fault signaled, earlier i mentioned we 
will get a RMP violation #PF, but actually as these pages are also 
removed from the kernel direct-map, therefore, we will get the 
not-present #PF and not the RMP #PF (core will check and signal 
not-present #PF before it performs the RMP checks).

Thanks,
Ashish
HORIGUCHI NAOYA(堀口 直也) Nov. 16, 2022, 11:41 p.m. UTC | #30
On Wed, Nov 16, 2022 at 04:28:11AM -0600, Kalra, Ashish wrote:
> On 11/15/2022 11:19 PM, HORIGUCHI NAOYA(堀口 直也) wrote:
> > On Tue, Nov 15, 2022 at 04:14:42PM +0100, Vlastimil Babka wrote:
> > > Cc'ing memory failure folks, the beinning of this subthread is here:
> > > 
> > > https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fall%2F3a51840f6a80c87b39632dc728dbd9b5dd444cd7.1655761627.git.ashish.kalra%40amd.com%2F&amp;data=05%7C01%7Cashish.kalra%40amd.com%7C7b2d39d6e2504a8f923608dac792224b%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C638041727879125176%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=KBJLKhPQP23vmvY%2FNnbjZs8wTJs%2FrF%2BiU54Sdc4Ldx4%3D&amp;reserved=0
> > > 
> > > On 11/15/22 00:36, Kalra, Ashish wrote:
> > > > Hello Boris,
> > > > 
> > > > On 11/2/2022 6:22 AM, Borislav Petkov wrote:
> > > > > On Mon, Oct 31, 2022 at 04:58:38PM -0500, Kalra, Ashish wrote:
> > > > > >        if (snp_lookup_rmpentry(pfn, &rmp_level)) {
> > > > > >               do_sigbus(regs, error_code, address, VM_FAULT_SIGBUS);
> > > > > >               return RMP_PF_RETRY;
> > > > > 
> > > > > Does this issue some halfway understandable error message why the
> > > > > process got killed?
> > > > > 
> > > > > > Will look at adding our own recovery function for the same, but that will
> > > > > > again mark the pages as poisoned, right ?
> > > > > 
> > > > > Well, not poisoned but PG_offlimits or whatever the mm folks agree upon.
> > > > > Semantically, it'll be handled the same way, ofc.
> > > > 
> > > > Added a new PG_offlimits flag and a simple corresponding handler for it.
> > > 
> > > One thing is, there's not enough page flags to be adding more (except
> > > aliases for existing) for cases that can avoid it, but as Boris says, if
> > > using alias to PG_hwpoison it depends what will become confused with the
> > > actual hwpoison.
> > 
> > I agree with this. Just defining PG_offlimits as an alias of PG_hwpoison
> > could break current hwpoison workload.  So if you finally decide to go
> > forward in this direction, you may as well have some indicator to
> > distinguish the new kind of leaked pages from hwpoisoned pages.
> > 
> > I don't remember exact thread, but I've read someone writing about similar
> > kind of suggestion of using memory_failure() to make pages inaccessible in
> > non-memory error usecase.  I feel that it could be possible to generalize
> > memory_failure() as general-purpose page offlining (by renaming it with
> 
> But, doesn't memory_failure() also mark the pages as PG_hwpoison, and then
> using it for these leaked pages will again cause confusion with actual
> hwpoison ?

Yes, so we might need modification of memory_failure code for this approach
like renaming PG_hwpoison to more generic one (although some possible names
like PageOffline and PageIsolated are already used) and/or somehow showing
"which kind of leaked pages" info.

Thanks,
Naoya Horiguchi
Ashish Kalra Nov. 17, 2022, 8:56 p.m. UTC | #31
Hello Boris,

>>
>>> +        if (ret)
>>> +            goto cleanup;
>>> +
>>> +        ret = rmp_make_shared(pfn, PG_LEVEL_4K);
>>> +        if (ret)
>>> +            goto cleanup;
>>> +
>>> +        pfn++;
>>> +        n++;
>>> +    }
>>> +
>>> +    return 0;
>>> +
>>> +cleanup:
>>> +    /*
>>> +     * If failed to reclaim the page then page is no longer safe to
>>> +     * be released, leak it.
>>> +     */
>>> +    snp_leak_pages(pfn, npages - n);
>>
>> So this looks real weird: we go and reclaim pages, we hit an error
>> during reclaiming a page X somewhere in-between and then we go and mark
>> the *remaining* pages as not to be used?!
>>
>> Why?
>>
>> Why not only that *one* page which failed and then we continue with the
>> rest?!
> 

I had a re-look at this while fixing the memory_failure() handling and 
realized that we can't do a *partial* recovery here if we fail to 
reclaim a single page, i.e., if we hit an error during reclaiming a 
single page we need to mark the remaining pages as not usable.

This is because this page could be a part of larger buffer which would 
have been transitioned to firmware state and need to be restored back in 
*full* to HV/shared state, any access to a partially transitioned buffer 
will still cause failures, basically the callers won't be able to do any 
kind of a recovery/access on a partially restored/reclaimed buffer and 
now potentially fragmented buffer, which anyway means failure due to 
data loss on non reclaimed page(s).

So we need to be able to reclaim all the pages or none.

Also this failure won't be happening regularly/frequently, it is a 
*rare* error case and if there is a reclamation failure on a single 
page, there is a high probability that there will be reclamation 
failures on subsequent pages.

Thanks,
Ashish
Borislav Petkov Nov. 20, 2022, 9:34 p.m. UTC | #32
On Thu, Nov 17, 2022 at 02:56:47PM -0600, Kalra, Ashish wrote:
> So we need to be able to reclaim all the pages or none.

/me goes and looks at SNP_PAGE_RECLAIM's retvals:

- INVALID_PLATFORM_STATE - platform is not in INIT state. That's
certainly not a reason to leak pages.

- INVALID_ADDRESS - PAGE_PADDR is not a valid system physical address.
That's botched command buffer but not a broken page so no reason to leak
them either.

- INVALID_PAGE_STATE - the page is neither of those types: metadata,
firmware, pre-guest nor pre-swap. So if you issue page reclaim on the
wrong range of pages that looks again like a user error but no need to
leak pages.

- INVALID_PAGE_SIZE - a size mismatch. Still sounds to me like a user
error of sev-guest instead of anything wrong deeper in the FW or HW.

So in all those, if you end up supplying the wrong range of addresses,
you most certainly will end up leaking the wrong pages.

So it sounds to me like you wanna say: "Error reclaiming range, check
your driver" instead of punishing any innocent pages.

Now, if the retval from the fw were FIRMWARE_INTERNAL_ERROR or so, then
sure, by all means. But not for the above. All the error conditions
above sound like the kernel has supplied the wrong range/botched command
buffer to the firmware so there's no need to leak pages.

Thx.
Borislav Petkov Nov. 22, 2022, 10:44 a.m. UTC | #33
On Tue, Nov 22, 2022 at 04:32:18AM -0600, Kalra, Ashish wrote:
> Please note that in both cases, these non-reclaimed pages cannot be
> freed/returned back to the page allocator.

You keep repeating "these pages". Which pages?

What if you specify the wrong, innocent pages because the kernel is
wrong?
Borislav Petkov Nov. 23, 2022, 11:40 a.m. UTC | #34
On Tue, Nov 22, 2022 at 05:44:47AM -0600, Kalra, Ashish wrote:
> It is important to note that if invalid address/len are supplied, the
> failure will happen at the initial stage itself of transitioning these pages
> to firmware state.

/me goes and checks out your v6 tree based on 5.18.

Lemme choose one:

static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
	...

        inpages = sev_pin_memory(kvm, params.uaddr, params.len, &npages, 1);

	...

        for (i = 0; i < npages; i++) {
                pfn = page_to_pfn(inpages[i]);

		...

                ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, &data, error);
                if (ret) {
                        /*
                         * If the command failed then need to reclaim the page.
                         */
                        snp_page_reclaim(pfn);

and here it would leak the pages if it cannot reclaim them.

Now how did you get those?

Through params.uaddr and params.len which come from userspace:

        if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
                return -EFAULT;


Now, think about it, can userspace be trusted?

Exactly.

Yeah, yeah, I see it does is_hva_registered() but userspace can just as
well supply the wrong region which fits.

> In such a case the kernel panic is justifiable,

So userspace can supply whatever it wants and you'd panic?

You surely don't mean that.

> but again if incorrect addresses are supplied, the failure will happen
> at the initial stage of transitioning these pages to firmware state
> and there is no need to reclaim.

See above.

> Or, otherwise dump a warning and let the pages not be freed/returned
> back to the page allocator.
>
> It is either innocent pages or kernel panic or an innocent host
> process crash (these are the choices to make).

No, it is make the kernel as resilient as possible. Which means, no
panic, add the pages to a not-to-be-used-anymore list and scream loudly
with warning messages when it must leak pages so that people can fix the
issue.

Ok?
diff mbox series

Patch

diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
index 35d76333e120..0dbd99f29b25 100644
--- a/drivers/crypto/ccp/sev-dev.c
+++ b/drivers/crypto/ccp/sev-dev.c
@@ -79,6 +79,14 @@  static void *sev_es_tmr;
 #define NV_LENGTH (32 * 1024)
 static void *sev_init_ex_buffer;
 
+/* When SEV-SNP is enabled the TMR needs to be 2MB aligned and 2MB size. */
+#define SEV_SNP_ES_TMR_SIZE	(2 * 1024 * 1024)
+
+static size_t sev_es_tmr_size = SEV_ES_TMR_SIZE;
+
+static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret);
+static int sev_do_cmd(int cmd, void *data, int *psp_ret);
+
 static inline bool sev_version_greater_or_equal(u8 maj, u8 min)
 {
 	struct sev_device *sev = psp_master->sev_data;
@@ -177,11 +185,161 @@  static int sev_cmd_buffer_len(int cmd)
 	return 0;
 }
 
+static void snp_leak_pages(unsigned long pfn, unsigned int npages)
+{
+	WARN(1, "psc failed, pfn 0x%lx pages %d (leaking)\n", pfn, npages);
+	while (npages--) {
+		memory_failure(pfn, 0);
+		dump_rmpentry(pfn);
+		pfn++;
+	}
+}
+
+static int snp_reclaim_pages(unsigned long pfn, unsigned int npages, bool locked)
+{
+	struct sev_data_snp_page_reclaim data;
+	int ret, err, i, n = 0;
+
+	for (i = 0; i < npages; i++) {
+		memset(&data, 0, sizeof(data));
+		data.paddr = pfn << PAGE_SHIFT;
+
+		if (locked)
+			ret = __sev_do_cmd_locked(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err);
+		else
+			ret = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err);
+		if (ret)
+			goto cleanup;
+
+		ret = rmp_make_shared(pfn, PG_LEVEL_4K);
+		if (ret)
+			goto cleanup;
+
+		pfn++;
+		n++;
+	}
+
+	return 0;
+
+cleanup:
+	/*
+	 * If failed to reclaim the page then page is no longer safe to
+	 * be released, leak it.
+	 */
+	snp_leak_pages(pfn, npages - n);
+	return ret;
+}
+
+static inline int rmp_make_firmware(unsigned long pfn, int level)
+{
+	return rmp_make_private(pfn, 0, level, 0, true);
+}
+
+static int snp_set_rmp_state(unsigned long paddr, unsigned int npages, bool to_fw, bool locked,
+			     bool need_reclaim)
+{
+	unsigned long pfn = __sme_clr(paddr) >> PAGE_SHIFT; /* Cbit maybe set in the paddr */
+	int rc, n = 0, i;
+
+	for (i = 0; i < npages; i++) {
+		if (to_fw)
+			rc = rmp_make_firmware(pfn, PG_LEVEL_4K);
+		else
+			rc = need_reclaim ? snp_reclaim_pages(pfn, 1, locked) :
+					    rmp_make_shared(pfn, PG_LEVEL_4K);
+		if (rc)
+			goto cleanup;
+
+		pfn++;
+		n++;
+	}
+
+	return 0;
+
+cleanup:
+	/* Try unrolling the firmware state changes */
+	if (to_fw) {
+		/*
+		 * Reclaim the pages which were already changed to the
+		 * firmware state.
+		 */
+		snp_reclaim_pages(paddr >> PAGE_SHIFT, n, locked);
+
+		return rc;
+	}
+
+	/*
+	 * If failed to change the page state to shared, then its not safe
+	 * to release the page back to the system, leak it.
+	 */
+	snp_leak_pages(pfn, npages - n);
+
+	return rc;
+}
+
+static struct page *__snp_alloc_firmware_pages(gfp_t gfp_mask, int order, bool locked)
+{
+	unsigned long npages = 1ul << order, paddr;
+	struct sev_device *sev;
+	struct page *page;
+
+	if (!psp_master || !psp_master->sev_data)
+		return NULL;
+
+	page = alloc_pages(gfp_mask, order);
+	if (!page)
+		return NULL;
+
+	/* If SEV-SNP is initialized then add the page in RMP table. */
+	sev = psp_master->sev_data;
+	if (!sev->snp_inited)
+		return page;
+
+	paddr = __pa((unsigned long)page_address(page));
+	if (snp_set_rmp_state(paddr, npages, true, locked, false))
+		return NULL;
+
+	return page;
+}
+
+void *snp_alloc_firmware_page(gfp_t gfp_mask)
+{
+	struct page *page;
+
+	page = __snp_alloc_firmware_pages(gfp_mask, 0, false);
+
+	return page ? page_address(page) : NULL;
+}
+EXPORT_SYMBOL_GPL(snp_alloc_firmware_page);
+
+static void __snp_free_firmware_pages(struct page *page, int order, bool locked)
+{
+	unsigned long paddr, npages = 1ul << order;
+
+	if (!page)
+		return;
+
+	paddr = __pa((unsigned long)page_address(page));
+	if (snp_set_rmp_state(paddr, npages, false, locked, true))
+		return;
+
+	__free_pages(page, order);
+}
+
+void snp_free_firmware_page(void *addr)
+{
+	if (!addr)
+		return;
+
+	__snp_free_firmware_pages(virt_to_page(addr), 0, false);
+}
+EXPORT_SYMBOL(snp_free_firmware_page);
+
 static void *sev_fw_alloc(unsigned long len)
 {
 	struct page *page;
 
-	page = alloc_pages(GFP_KERNEL, get_order(len));
+	page = __snp_alloc_firmware_pages(GFP_KERNEL, get_order(len), false);
 	if (!page)
 		return NULL;
 
@@ -393,7 +551,7 @@  static int __sev_init_locked(int *error)
 		data.tmr_address = __pa(sev_es_tmr);
 
 		data.flags |= SEV_INIT_FLAGS_SEV_ES;
-		data.tmr_len = SEV_ES_TMR_SIZE;
+		data.tmr_len = sev_es_tmr_size;
 	}
 
 	return __sev_do_cmd_locked(SEV_CMD_INIT, &data, error);
@@ -421,7 +579,7 @@  static int __sev_init_ex_locked(int *error)
 		data.tmr_address = __pa(sev_es_tmr);
 
 		data.flags |= SEV_INIT_FLAGS_SEV_ES;
-		data.tmr_len = SEV_ES_TMR_SIZE;
+		data.tmr_len = sev_es_tmr_size;
 	}
 
 	return __sev_do_cmd_locked(SEV_CMD_INIT_EX, &data, error);
@@ -818,6 +976,8 @@  static int __sev_snp_init_locked(int *error)
 	sev->snp_inited = true;
 	dev_dbg(sev->dev, "SEV-SNP firmware initialized\n");
 
+	sev_es_tmr_size = SEV_SNP_ES_TMR_SIZE;
+
 	return rc;
 }
 
@@ -1341,8 +1501,9 @@  static void sev_firmware_shutdown(struct sev_device *sev)
 		/* The TMR area was encrypted, flush it from the cache */
 		wbinvd_on_all_cpus();
 
-		free_pages((unsigned long)sev_es_tmr,
-			   get_order(SEV_ES_TMR_SIZE));
+		__snp_free_firmware_pages(virt_to_page(sev_es_tmr),
+					  get_order(sev_es_tmr_size),
+					  false);
 		sev_es_tmr = NULL;
 	}
 
@@ -1430,7 +1591,7 @@  void sev_pci_init(void)
 	}
 
 	/* Obtain the TMR memory area for SEV-ES use */
-	sev_es_tmr = sev_fw_alloc(SEV_ES_TMR_SIZE);
+	sev_es_tmr = sev_fw_alloc(sev_es_tmr_size);
 	if (!sev_es_tmr)
 		dev_warn(sev->dev,
 			 "SEV: TMR allocation failed, SEV-ES support unavailable\n");
diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h
index 9f921d221b75..a3bb792bb842 100644
--- a/include/linux/psp-sev.h
+++ b/include/linux/psp-sev.h
@@ -12,6 +12,8 @@ 
 #ifndef __PSP_SEV_H__
 #define __PSP_SEV_H__
 
+#include <linux/sev.h>
+
 #include <uapi/linux/psp-sev.h>
 
 #ifdef CONFIG_X86
@@ -940,6 +942,8 @@  int snp_guest_page_reclaim(struct sev_data_snp_page_reclaim *data, int *error);
 int snp_guest_dbg_decrypt(struct sev_data_snp_dbg *data, int *error);
 
 void *psp_copy_user_blob(u64 uaddr, u32 len);
+void *snp_alloc_firmware_page(gfp_t mask);
+void snp_free_firmware_page(void *addr);
 
 #else	/* !CONFIG_CRYPTO_DEV_SP_PSP */
 
@@ -981,6 +985,13 @@  static inline int snp_guest_dbg_decrypt(struct sev_data_snp_dbg *data, int *erro
 	return -ENODEV;
 }
 
+static inline void *snp_alloc_firmware_page(gfp_t mask)
+{
+	return NULL;
+}
+
+static inline void snp_free_firmware_page(void *addr) { }
+
 #endif	/* CONFIG_CRYPTO_DEV_SP_PSP */
 
 #endif	/* __PSP_SEV_H__ */