diff mbox

[Xen-devel,2/6] tools: arm: allocate superpages to guests.

Message ID 1402394278-9850-2-git-send-email-ian.campbell@citrix.com
State New
Headers show

Commit Message

Ian Campbell June 10, 2014, 9:57 a.m. UTC
Tries to allocate as many large (1G or 2M) pages as it can to the guest and tries
to align to the next larger size on each attempt so that we can attempt to
allocate even larger pages on the next iteration (this is currently only
exercised via a compile time debug option, which is left in place under a #if
0).

Since ARM page tables are consistent at each level there is a common helper
function which tries to allocate a levels worth of pages. The exception to this
consistency is level 0 which does not support table mappings (0.5TB
superpages!).

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
---
 tools/libxc/xc_dom_arm.c |  103 +++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 92 insertions(+), 11 deletions(-)

Comments

Julien Grall June 10, 2014, 11:23 a.m. UTC | #1
Hi Ian,

On 06/10/2014 10:57 AM, Ian Campbell wrote:
> Tries to allocate as many large (1G or 2M) pages as it can to the guest and tries
> to align to the next larger size on each attempt so that we can attempt to
> allocate even larger pages on the next iteration (this is currently only
> exercised via a compile time debug option, which is left in place under a #if
> 0).
> 
> Since ARM page tables are consistent at each level there is a common helper
> function which tries to allocate a levels worth of pages. The exception to this
> consistency is level 0 which does not support table mappings (0.5TB
> superpages!).
> 
> Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
> ---
>  tools/libxc/xc_dom_arm.c |  103 +++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 92 insertions(+), 11 deletions(-)
> 
> diff --git a/tools/libxc/xc_dom_arm.c b/tools/libxc/xc_dom_arm.c
> index 75f8363..da68ec3 100644
> --- a/tools/libxc/xc_dom_arm.c
> +++ b/tools/libxc/xc_dom_arm.c
> @@ -30,6 +30,13 @@
>  #define CONSOLE_PFN_OFFSET 0
>  #define XENSTORE_PFN_OFFSET 1
>  
> +#define LPAE_SHIFT 9
> +
> +#define PFN_4K_SHIFT  (0)
> +#define PFN_2M_SHIFT  (PFN_4K_SHIFT+LPAE_SHIFT)
> +#define PFN_1G_SHIFT  (PFN_2M_SHIFT+LPAE_SHIFT)
> +#define PFN_512G_SHIFT (PFN_1G_SHIFT+LPAE_SHIFT)
> +
>  /* get guest IO ABI protocol */
>  const char *xc_domain_get_native_protocol(xc_interface *xch,
>                                            uint32_t domid)
> @@ -249,11 +256,57 @@ static int set_mode(xc_interface *xch, domid_t domid, char *guest_type)
>      return rc;
>  }
>  
> +#define min_t(type,x,y) \
> +        ({ type __x = (x); type __y = (y); __x < __y ? __x: __y; })

min_t is also defined in xc_dom_decompress_unsafe_xz.c. It might be
interesting to define it in a common header (such as xc_private.h).

> +/*  >0: success, *nr_pfns set to number actually populated
> + *   0: didn't try with this pfn shift (e.g. misaligned base etc)
> + *  <0: ERROR
> + */
> +static int populate_one_size(struct xc_dom_image *dom, int pfn_shift,
> +                             xen_pfn_t base_pfn, xen_pfn_t *nr_pfns)
> +{
> +    uint64_t mask = ((uint64_t)1<<(pfn_shift))-1;
> +    uint64_t next_mask = ((uint64_t)1<<(LPAE_SHIFT))-1;
> +    int nr, i, count = min_t(int, 1024*1024, *nr_pfns >> pfn_shift);

Stupid question, where does come from the 1024*1024?

> +    xen_pfn_t extents[count];

If I follow the count definition, it's possible to allocate 1024*1024
xen_pfn_t (about 8MB) on the stack.

[..]

>  static int populate_guest_memory(struct xc_dom_image *dom,
>                                   xen_pfn_t base_pfn, xen_pfn_t nr_pfns)
>  {
> -    int rc;
> +    int rc = 0;
>      xen_pfn_t allocsz, pfn;
> +    int debug_iters = 0;
>  
>      DOMPRINTF("%s: populating RAM @ %016"PRIx64"-%016"PRIx64" (%"PRId64"MB)",
>                __FUNCTION__,
> @@ -261,21 +314,49 @@ static int populate_guest_memory(struct xc_dom_image *dom,
>                (uint64_t)(base_pfn + nr_pfns) << XC_PAGE_SHIFT,
>                (uint64_t)nr_pfns >> (20-XC_PAGE_SHIFT));
>  
> -    for ( pfn = 0; pfn < nr_pfns; pfn++ )
> -        dom->p2m_host[pfn] = base_pfn + pfn;
> -
> -    for ( pfn = rc = allocsz = 0; (pfn < nr_pfns) && !rc; pfn += allocsz )
> +    for ( pfn = 0; pfn < nr_pfns; pfn += allocsz )
>      {
>          allocsz = nr_pfns - pfn;
> -        if ( allocsz > 1024*1024 )
> -            allocsz = 1024*1024;
>  
> -        rc = xc_domain_populate_physmap_exact(
> -            dom->xch, dom->guest_domid, allocsz,
> -            0, 0, &dom->p2m_host[pfn]);
> +        if ( debug_iters++ > 4 )
> +            abort();

IIRC, abort doesn't give any stack trace. I'm not sure if you intended
to keep it in your patch. If so, I would add an error message.

Regards,
Ian Campbell June 10, 2014, 3:16 p.m. UTC | #2
On Tue, 2014-06-10 at 12:23 +0100, Julien Grall wrote:
> Hi Ian,
> 
> On 06/10/2014 10:57 AM, Ian Campbell wrote:
> > Tries to allocate as many large (1G or 2M) pages as it can to the guest and tries
> > to align to the next larger size on each attempt so that we can attempt to
> > allocate even larger pages on the next iteration (this is currently only
> > exercised via a compile time debug option, which is left in place under a #if
> > 0).
> > 
> > Since ARM page tables are consistent at each level there is a common helper
> > function which tries to allocate a levels worth of pages. The exception to this
> > consistency is level 0 which does not support table mappings (0.5TB
> > superpages!).
> > 
> > Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
> > ---
> >  tools/libxc/xc_dom_arm.c |  103 +++++++++++++++++++++++++++++++++++++++++-----
> >  1 file changed, 92 insertions(+), 11 deletions(-)
> > 
> > diff --git a/tools/libxc/xc_dom_arm.c b/tools/libxc/xc_dom_arm.c
> > index 75f8363..da68ec3 100644
> > --- a/tools/libxc/xc_dom_arm.c
> > +++ b/tools/libxc/xc_dom_arm.c
> > @@ -30,6 +30,13 @@
> >  #define CONSOLE_PFN_OFFSET 0
> >  #define XENSTORE_PFN_OFFSET 1
> >  
> > +#define LPAE_SHIFT 9
> > +
> > +#define PFN_4K_SHIFT  (0)
> > +#define PFN_2M_SHIFT  (PFN_4K_SHIFT+LPAE_SHIFT)
> > +#define PFN_1G_SHIFT  (PFN_2M_SHIFT+LPAE_SHIFT)
> > +#define PFN_512G_SHIFT (PFN_1G_SHIFT+LPAE_SHIFT)
> > +
> >  /* get guest IO ABI protocol */
> >  const char *xc_domain_get_native_protocol(xc_interface *xch,
> >                                            uint32_t domid)
> > @@ -249,11 +256,57 @@ static int set_mode(xc_interface *xch, domid_t domid, char *guest_type)
> >      return rc;
> >  }
> >  
> > +#define min_t(type,x,y) \
> > +        ({ type __x = (x); type __y = (y); __x < __y ? __x: __y; })
> 
> min_t is also defined in xc_dom_decompress_unsafe_xz.c. It might be
> interesting to define it in a common header (such as xc_private.h).

Yes, good idea. I'll add a precursor patch.
> 
> > +/*  >0: success, *nr_pfns set to number actually populated
> > + *   0: didn't try with this pfn shift (e.g. misaligned base etc)
> > + *  <0: ERROR
> > + */
> > +static int populate_one_size(struct xc_dom_image *dom, int pfn_shift,
> > +                             xen_pfn_t base_pfn, xen_pfn_t *nr_pfns)
> > +{
> > +    uint64_t mask = ((uint64_t)1<<(pfn_shift))-1;
> > +    uint64_t next_mask = ((uint64_t)1<<(LPAE_SHIFT))-1;
> > +    int nr, i, count = min_t(int, 1024*1024, *nr_pfns >> pfn_shift);
> 
> Stupid question, where does come from the 1024*1024?

It was in the original as a backstop on allocsz. It would correspond to
4GB worth of 4K page I suppose

> > +    xen_pfn_t extents[count];
> 
> If I follow the count definition, it's possible to allocate 1024*1024
> xen_pfn_t (about 8MB) on the stack.

userspace stack isn't all that precious but 8MB does seem a little
excessive. Previously this was using the already allocated host p2m so
it wasn't an issue, but that doesn't work for allocations of page >4K.

The tradeoff is that smaller batches mean it will take longer.

I don't think it would be unreasonable to reduce this to be e.g. 1GB
worth of entries (256*1024) or 2MB of stack.

> 
> [..]
> 
> >  static int populate_guest_memory(struct xc_dom_image *dom,
> >                                   xen_pfn_t base_pfn, xen_pfn_t nr_pfns)
> >  {
> > -    int rc;
> > +    int rc = 0;
> >      xen_pfn_t allocsz, pfn;
> > +    int debug_iters = 0;
> >  
> >      DOMPRINTF("%s: populating RAM @ %016"PRIx64"-%016"PRIx64" (%"PRId64"MB)",
> >                __FUNCTION__,
> > @@ -261,21 +314,49 @@ static int populate_guest_memory(struct xc_dom_image *dom,
> >                (uint64_t)(base_pfn + nr_pfns) << XC_PAGE_SHIFT,
> >                (uint64_t)nr_pfns >> (20-XC_PAGE_SHIFT));
> >  
> > -    for ( pfn = 0; pfn < nr_pfns; pfn++ )
> > -        dom->p2m_host[pfn] = base_pfn + pfn;
> > -
> > -    for ( pfn = rc = allocsz = 0; (pfn < nr_pfns) && !rc; pfn += allocsz )
> > +    for ( pfn = 0; pfn < nr_pfns; pfn += allocsz )
> >      {
> >          allocsz = nr_pfns - pfn;
> > -        if ( allocsz > 1024*1024 )
> > -            allocsz = 1024*1024;
> >  
> > -        rc = xc_domain_populate_physmap_exact(
> > -            dom->xch, dom->guest_domid, allocsz,
> > -            0, 0, &dom->p2m_host[pfn]);
> > +        if ( debug_iters++ > 4 )
> > +            abort();
> 
> IIRC, abort doesn't give any stack trace. I'm not sure if you intended
> to keep it in your patch. If so, I would add an error message.

I can't remember what I was doing here, but it is just debug so I'll
nuke it.

Ian.
Julien Grall June 11, 2014, 11:50 a.m. UTC | #3
On 06/10/2014 04:16 PM, Ian Campbell wrote:
>>> +/*  >0: success, *nr_pfns set to number actually populated
>>> + *   0: didn't try with this pfn shift (e.g. misaligned base etc)
>>> + *  <0: ERROR
>>> + */
>>> +static int populate_one_size(struct xc_dom_image *dom, int pfn_shift,
>>> +                             xen_pfn_t base_pfn, xen_pfn_t *nr_pfns)
>>> +{
>>> +    uint64_t mask = ((uint64_t)1<<(pfn_shift))-1;
>>> +    uint64_t next_mask = ((uint64_t)1<<(LPAE_SHIFT))-1;
>>> +    int nr, i, count = min_t(int, 1024*1024, *nr_pfns >> pfn_shift);
>>
>> Stupid question, where does come from the 1024*1024?
> 
> It was in the original as a backstop on allocsz. It would correspond to
> 4GB worth of 4K page I suppose

Ah ok. I didn't pay attention about it.

> 
>>> +    xen_pfn_t extents[count];
>>
>> If I follow the count definition, it's possible to allocate 1024*1024
>> xen_pfn_t (about 8MB) on the stack.
> 
> userspace stack isn't all that precious but 8MB does seem a little
> excessive. Previously this was using the already allocated host p2m so
> it wasn't an issue, but that doesn't work for allocations of page >4K.
> 
> The tradeoff is that smaller batches mean it will take longer.
> 
> I don't think it would be unreasonable to reduce this to be e.g. 1GB
> worth of entries (256*1024) or 2MB of stack.

It seems the default stack size is 8MB, I'm wondering if we can have
some use case where this limit is smaller.

Is there any issue to allocate this variable with malloc?

Regards,
Ian Campbell June 11, 2014, 11:55 a.m. UTC | #4
On Wed, 2014-06-11 at 12:50 +0100, Julien Grall wrote:
> On 06/10/2014 04:16 PM, Ian Campbell wrote:
> >>> +/*  >0: success, *nr_pfns set to number actually populated
> >>> + *   0: didn't try with this pfn shift (e.g. misaligned base etc)
> >>> + *  <0: ERROR
> >>> + */
> >>> +static int populate_one_size(struct xc_dom_image *dom, int pfn_shift,
> >>> +                             xen_pfn_t base_pfn, xen_pfn_t *nr_pfns)
> >>> +{
> >>> +    uint64_t mask = ((uint64_t)1<<(pfn_shift))-1;
> >>> +    uint64_t next_mask = ((uint64_t)1<<(LPAE_SHIFT))-1;
> >>> +    int nr, i, count = min_t(int, 1024*1024, *nr_pfns >> pfn_shift);
> >>
> >> Stupid question, where does come from the 1024*1024?
> > 
> > It was in the original as a backstop on allocsz. It would correspond to
> > 4GB worth of 4K page I suppose
> 
> Ah ok. I didn't pay attention about it.
> 
> > 
> >>> +    xen_pfn_t extents[count];
> >>
> >> If I follow the count definition, it's possible to allocate 1024*1024
> >> xen_pfn_t (about 8MB) on the stack.
> > 
> > userspace stack isn't all that precious but 8MB does seem a little
> > excessive. Previously this was using the already allocated host p2m so
> > it wasn't an issue, but that doesn't work for allocations of page >4K.
> > 
> > The tradeoff is that smaller batches mean it will take longer.
> > 
> > I don't think it would be unreasonable to reduce this to be e.g. 1GB
> > worth of entries (256*1024) or 2MB of stack.
> 
> It seems the default stack size is 8MB, I'm wondering if we can have
> some use case where this limit is smaller.

I think we effectively assume it is larger than any amount we would
practically use.

> Is there any issue to allocate this variable with malloc?

Just more book keeping code really.

Ian.
Julien Grall June 11, 2014, 12:16 p.m. UTC | #5
On 06/11/2014 12:55 PM, Ian Campbell wrote:
> On Wed, 2014-06-11 at 12:50 +0100, Julien Grall wrote:
>> On 06/10/2014 04:16 PM, Ian Campbell wrote:
>>>>> +/*  >0: success, *nr_pfns set to number actually populated
>>>>> + *   0: didn't try with this pfn shift (e.g. misaligned base etc)
>>>>> + *  <0: ERROR
>>>>> + */
>>>>> +static int populate_one_size(struct xc_dom_image *dom, int pfn_shift,
>>>>> +                             xen_pfn_t base_pfn, xen_pfn_t *nr_pfns)
>>>>> +{
>>>>> +    uint64_t mask = ((uint64_t)1<<(pfn_shift))-1;
>>>>> +    uint64_t next_mask = ((uint64_t)1<<(LPAE_SHIFT))-1;
>>>>> +    int nr, i, count = min_t(int, 1024*1024, *nr_pfns >> pfn_shift);
>>>>
>>>> Stupid question, where does come from the 1024*1024?
>>>
>>> It was in the original as a backstop on allocsz. It would correspond to
>>> 4GB worth of 4K page I suppose
>>
>> Ah ok. I didn't pay attention about it.
>>
>>>
>>>>> +    xen_pfn_t extents[count];
>>>>
>>>> If I follow the count definition, it's possible to allocate 1024*1024
>>>> xen_pfn_t (about 8MB) on the stack.
>>>
>>> userspace stack isn't all that precious but 8MB does seem a little
>>> excessive. Previously this was using the already allocated host p2m so
>>> it wasn't an issue, but that doesn't work for allocations of page >4K.
>>>
>>> The tradeoff is that smaller batches mean it will take longer.
>>>
>>> I don't think it would be unreasonable to reduce this to be e.g. 1GB
>>> worth of entries (256*1024) or 2MB of stack.
>>
>> It seems the default stack size is 8MB, I'm wondering if we can have
>> some use case where this limit is smaller.
> 
> I think we effectively assume it is larger than any amount we would
> practically use.
> 
>> Is there any issue to allocate this variable with malloc?
> 
> Just more book keeping code really.

Looking to the code, it doesn't seem too difficult to add malloc and
handle error.

I would prefer to use the malloc rather the stack and therefore assuming
that's stack a quite big, even though this variable should never be too
big. It wouldn't be mad to have an OS using a 1M stack (or even less).

Anyway it seems that x86 libxc is using the same trick for superpage...

Regards,
diff mbox

Patch

diff --git a/tools/libxc/xc_dom_arm.c b/tools/libxc/xc_dom_arm.c
index 75f8363..da68ec3 100644
--- a/tools/libxc/xc_dom_arm.c
+++ b/tools/libxc/xc_dom_arm.c
@@ -30,6 +30,13 @@ 
 #define CONSOLE_PFN_OFFSET 0
 #define XENSTORE_PFN_OFFSET 1
 
+#define LPAE_SHIFT 9
+
+#define PFN_4K_SHIFT  (0)
+#define PFN_2M_SHIFT  (PFN_4K_SHIFT+LPAE_SHIFT)
+#define PFN_1G_SHIFT  (PFN_2M_SHIFT+LPAE_SHIFT)
+#define PFN_512G_SHIFT (PFN_1G_SHIFT+LPAE_SHIFT)
+
 /* get guest IO ABI protocol */
 const char *xc_domain_get_native_protocol(xc_interface *xch,
                                           uint32_t domid)
@@ -249,11 +256,57 @@  static int set_mode(xc_interface *xch, domid_t domid, char *guest_type)
     return rc;
 }
 
+#define min_t(type,x,y) \
+        ({ type __x = (x); type __y = (y); __x < __y ? __x: __y; })
+
+/*  >0: success, *nr_pfns set to number actually populated
+ *   0: didn't try with this pfn shift (e.g. misaligned base etc)
+ *  <0: ERROR
+ */
+static int populate_one_size(struct xc_dom_image *dom, int pfn_shift,
+                             xen_pfn_t base_pfn, xen_pfn_t *nr_pfns)
+{
+    uint64_t mask = ((uint64_t)1<<(pfn_shift))-1;
+    uint64_t next_mask = ((uint64_t)1<<(LPAE_SHIFT))-1;
+    int nr, i, count = min_t(int, 1024*1024, *nr_pfns >> pfn_shift);
+    xen_pfn_t extents[count];
+
+    /* No level zero super pages with current hardware */
+    if ( pfn_shift == PFN_512G_SHIFT )
+        return 0;
+
+    /* Nothing to allocate */
+    if ( !count )
+        return 0;
+
+    /* base is misaligned for this level */
+    if ( mask & base_pfn )
+        return 0;
+
+    /* align to the end of a super page at this level */
+    if ( count & next_mask )
+        count &= next_mask;
+
+    for ( i = 0 ; i < count ; i ++ )
+        extents[i] = base_pfn + (i<<pfn_shift);
+
+    nr = xc_domain_populate_physmap(dom->xch, dom->guest_domid, count,
+                                    pfn_shift, 0, extents);
+    if ( nr <= 0 ) return nr;
+    DOMPRINTF("%s: populated %#x/%#x entries with shift %d",
+              __FUNCTION__, nr, count, pfn_shift);
+
+    *nr_pfns = nr << pfn_shift;
+
+    return 1;
+}
+
 static int populate_guest_memory(struct xc_dom_image *dom,
                                  xen_pfn_t base_pfn, xen_pfn_t nr_pfns)
 {
-    int rc;
+    int rc = 0;
     xen_pfn_t allocsz, pfn;
+    int debug_iters = 0;
 
     DOMPRINTF("%s: populating RAM @ %016"PRIx64"-%016"PRIx64" (%"PRId64"MB)",
               __FUNCTION__,
@@ -261,21 +314,49 @@  static int populate_guest_memory(struct xc_dom_image *dom,
               (uint64_t)(base_pfn + nr_pfns) << XC_PAGE_SHIFT,
               (uint64_t)nr_pfns >> (20-XC_PAGE_SHIFT));
 
-    for ( pfn = 0; pfn < nr_pfns; pfn++ )
-        dom->p2m_host[pfn] = base_pfn + pfn;
-
-    for ( pfn = rc = allocsz = 0; (pfn < nr_pfns) && !rc; pfn += allocsz )
+    for ( pfn = 0; pfn < nr_pfns; pfn += allocsz )
     {
         allocsz = nr_pfns - pfn;
-        if ( allocsz > 1024*1024 )
-            allocsz = 1024*1024;
 
-        rc = xc_domain_populate_physmap_exact(
-            dom->xch, dom->guest_domid, allocsz,
-            0, 0, &dom->p2m_host[pfn]);
+        if ( debug_iters++ > 4 )
+            abort();
+#if 0 /* Enable this to exercise/debug the code which tries to realign
+       * to a superpage boundary, by misaligning at the start. */
+        if ( pfn == 0 )
+        {
+            allocsz = 1;
+            rc = populate_one_size(dom, PFN_4K_SHIFT, base_pfn + pfn, &allocsz);
+            if (rc < 0) break;
+            if (rc > 0) continue;
+            /* Failed to allocate a single page? */
+            break;
+        }
+#endif
+
+        rc = populate_one_size(dom, PFN_512G_SHIFT, base_pfn + pfn, &allocsz);
+        if (rc < 0) break;
+        if (rc > 0) continue;
+
+        rc = populate_one_size(dom, PFN_1G_SHIFT, base_pfn + pfn, &allocsz);
+        if (rc < 0) break;
+        if (rc > 0) continue;
+
+        rc = populate_one_size(dom, PFN_2M_SHIFT, base_pfn + pfn, &allocsz);
+        if (rc < 0) break;
+        if (rc > 0) continue;
+
+        rc = populate_one_size(dom, PFN_4K_SHIFT, base_pfn + pfn, &allocsz);
+        if (rc < 0) break;
+
+        assert(rc > 0); /* Must have tried to allocate some 4k pages! */
     }
 
-    return rc;
+    DOMPRINTF("%s: rc=%d", __FUNCTION__, rc);
+
+    for ( pfn = 0; pfn < nr_pfns; pfn++ )
+        dom->p2m_host[pfn] = base_pfn + pfn;
+
+    return rc < 0 ? rc : 0;
 }
 
 int arch_setup_meminit(struct xc_dom_image *dom)