diff mbox series

[v1,1/5] virtio-mem: Probe THP size to determine default block size

Message ID 20200923113900.72718-2-david@redhat.com
State New
Headers show
Series virtio-mem: block size and address-assignment optimizations | expand

Commit Message

David Hildenbrand Sept. 23, 2020, 11:38 a.m. UTC
Let's allow a minimum block size of 1 MiB in all configurations. Use
a default block size based on the THP size, and warn if something
smaller is configured by the user.

VIRTIO_MEM only supports Linux (depends on LINUX), so we can probe the
THP size unconditionally.

For now we only support virtio-mem on x86-64 - there isn't a user-visiable
change (x86-64 only supports 2 MiB THP on the PMD level) - the default
was, and will be 2 MiB.

If we ever have THP on the PUD level (e.g., 1 GiB THP on x86-64), we
expect to have a trigger to explicitly opt-in for the new THP granularity.

Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Wei Yang <richardw.yang@linux.intel.com>
Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
---
 hw/virtio/virtio-mem.c | 82 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 78 insertions(+), 4 deletions(-)

Comments

Pankaj Gupta Sept. 25, 2020, 1:46 p.m. UTC | #1
> Let's allow a minimum block size of 1 MiB in all configurations. Use
> a default block size based on the THP size, and warn if something
> smaller is configured by the user.
>
> VIRTIO_MEM only supports Linux (depends on LINUX), so we can probe the
> THP size unconditionally.
>
> For now we only support virtio-mem on x86-64 - there isn't a user-visiable
> change (x86-64 only supports 2 MiB THP on the PMD level) - the default
> was, and will be 2 MiB.
>
> If we ever have THP on the PUD level (e.g., 1 GiB THP on x86-64), we
> expect to have a trigger to explicitly opt-in for the new THP granularity.
>
> Cc: "Michael S. Tsirkin" <mst@redhat.com>
> Cc: Wei Yang <richardw.yang@linux.intel.com>
> Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
> Cc: Igor Mammedov <imammedo@redhat.com>
> Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>  hw/virtio/virtio-mem.c | 82 +++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 78 insertions(+), 4 deletions(-)
>
> diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
> index 8fbec77ccc..58098686ee 100644
> --- a/hw/virtio/virtio-mem.c
> +++ b/hw/virtio/virtio-mem.c
> @@ -33,10 +33,70 @@
>  #include "trace.h"
>
>  /*
> - * Use QEMU_VMALLOC_ALIGN, so no THP will have to be split when unplugging
> - * memory (e.g., 2MB on x86_64).
> + * Let's not allow blocks smaller than 1 MiB, for example, to keep the tracking
> + * bitmap small.
>   */
> -#define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)QEMU_VMALLOC_ALIGN)
> +#define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)(1 * MiB))
> +
> +/*
> + * We want to have a reasonable default block size such that
> + * 1. We avoid splitting THPs when unplugging memory, which degrades
> + *    performance.
> + * 2. We avoid placing THPs for plugged blocks that also cover unplugged
> + *    blocks.
> + *
> + * The actual THP size might differ between Linux kernels, so we try to probe
> + * it. In the future (if we ever run into issues regarding 2.), we might want
> + * to disable THP in case we fail to properly probe the THP size, or if the
> + * block size is configured smaller than the THP size.
> + */
> +static uint32_t default_block_size;
> +
> +#define HPAGE_PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
> +static uint32_t virtio_mem_default_block_size(void)
> +{
> +    gchar *content = NULL;
> +    const char *endptr;
> +    uint64_t tmp;
> +
> +    if (default_block_size) {
> +        return default_block_size;
> +    }
> +
> +    /*
> +     * Try to probe the actual THP size, fallback to (sane but eventually
> +     * incorrect) default sizes.
> +     */
> +    if (g_file_get_contents(HPAGE_PMD_SIZE_PATH, &content, NULL, NULL) &&
> +        !qemu_strtou64(content, &endptr, 0, &tmp) &&
> +        (!endptr || *endptr == '\n')) {
> +        /*
> +         * Sanity-check the value, if it's too big (e.g., aarch64 with 64k base
> +         * pages) or weird, fallback to something smaller.
> +         */
> +        if (!tmp || !is_power_of_2(tmp) || tmp > 16 * MiB) {
> +            warn_report("Detected a THP size of %" PRIx64
> +                        " MiB, falling back to 1 MiB.", tmp / MiB);
> +            default_block_size = 1 * MiB;

Probably use macro "VIRTIO_MEM_MIN_BLOCK_SIZE"
> +        } else {
> +            default_block_size = tmp;
> +        }
> +    } else {
> +#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || \
> +    defined(__powerpc64__)
> +        default_block_size = 2 * MiB;
> +#else
> +        /* fallback to 1 MiB (e.g., the THP size on s390x) */
> +        default_block_size = 1 * MiB;
> +#endif

Maybe we can declare this macro near to "VIRTIO_MEM_MIN_BLOCK_SIZE
((uint32_t)(1 * MiB))"
or club into one, just a thought.

> +        warn_report("Could not detect THP size, falling back to %" PRIx64
> +                    "  MiB.", default_block_size / MiB);
> +    }
> +
> +    g_free(content);
> +    return default_block_size;
> +}
> +
>  /*
>   * Size the usable region bigger than the requested size if possible. Esp.
>   * Linux guests will only add (aligned) memory blocks in case they fully
> @@ -437,6 +497,15 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp)
>      rb = vmem->memdev->mr.ram_block;
>      page_size = qemu_ram_pagesize(rb);
>
> +    /*
> +     * If the block size wasn't configured by the user, use a sane default. This
> +     * allows using hugetlbfs backends with a pagesize bigger than the detected
> +     * THP size without manual intervention/configuration.
> +     */
> +    if (!vmem->block_size) {
> +        vmem->block_size = MAX(page_size, virtio_mem_default_block_size());
> +    }
> +
>      if (vmem->block_size < page_size) {
>          error_setg(errp, "'%s' property has to be at least the page size (0x%"
>                     PRIx64 ")", VIRTIO_MEM_BLOCK_SIZE_PROP, page_size);
> @@ -760,6 +829,12 @@ static void virtio_mem_set_block_size(Object *obj, Visitor *v, const char *name,
>          error_setg(errp, "'%s' property has to be a power of two", name);
>          return;
>      }
> +
> +    if (value < virtio_mem_default_block_size()) {
> +        warn_report("'%s' property is smaller than the default block size "
> +                    "(detected THP size) of %" PRIx64 " MiB", name,
> +                    virtio_mem_default_block_size() / MiB);
> +    }
>      vmem->block_size = value;
>  }
>
> @@ -810,7 +885,6 @@ static void virtio_mem_instance_init(Object *obj)
>  {
>      VirtIOMEM *vmem = VIRTIO_MEM(obj);
>
> -    vmem->block_size = VIRTIO_MEM_MIN_BLOCK_SIZE;
>      notifier_list_init(&vmem->size_change_notifiers);
>      vmem->precopy_notifier.notify = virtio_mem_precopy_notify;
>
> --
> 2.26.2
>
David Hildenbrand Sept. 28, 2020, 8:58 a.m. UTC | #2
On 25.09.20 15:46, Pankaj Gupta wrote:
>> Let's allow a minimum block size of 1 MiB in all configurations. Use
>> a default block size based on the THP size, and warn if something
>> smaller is configured by the user.
>>
>> VIRTIO_MEM only supports Linux (depends on LINUX), so we can probe the
>> THP size unconditionally.
>>
>> For now we only support virtio-mem on x86-64 - there isn't a user-visiable
>> change (x86-64 only supports 2 MiB THP on the PMD level) - the default
>> was, and will be 2 MiB.
>>
>> If we ever have THP on the PUD level (e.g., 1 GiB THP on x86-64), we
>> expect to have a trigger to explicitly opt-in for the new THP granularity.
>>
>> Cc: "Michael S. Tsirkin" <mst@redhat.com>
>> Cc: Wei Yang <richardw.yang@linux.intel.com>
>> Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
>> Cc: Igor Mammedov <imammedo@redhat.com>
>> Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
>> Signed-off-by: David Hildenbrand <david@redhat.com>
>> ---
>>  hw/virtio/virtio-mem.c | 82 +++++++++++++++++++++++++++++++++++++++---
>>  1 file changed, 78 insertions(+), 4 deletions(-)
>>
>> diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
>> index 8fbec77ccc..58098686ee 100644
>> --- a/hw/virtio/virtio-mem.c
>> +++ b/hw/virtio/virtio-mem.c
>> @@ -33,10 +33,70 @@
>>  #include "trace.h"
>>
>>  /*
>> - * Use QEMU_VMALLOC_ALIGN, so no THP will have to be split when unplugging
>> - * memory (e.g., 2MB on x86_64).
>> + * Let's not allow blocks smaller than 1 MiB, for example, to keep the tracking
>> + * bitmap small.
>>   */
>> -#define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)QEMU_VMALLOC_ALIGN)
>> +#define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)(1 * MiB))
>> +
>> +/*
>> + * We want to have a reasonable default block size such that
>> + * 1. We avoid splitting THPs when unplugging memory, which degrades
>> + *    performance.
>> + * 2. We avoid placing THPs for plugged blocks that also cover unplugged
>> + *    blocks.
>> + *
>> + * The actual THP size might differ between Linux kernels, so we try to probe
>> + * it. In the future (if we ever run into issues regarding 2.), we might want
>> + * to disable THP in case we fail to properly probe the THP size, or if the
>> + * block size is configured smaller than the THP size.
>> + */
>> +static uint32_t default_block_size;
>> +
>> +#define HPAGE_PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
>> +static uint32_t virtio_mem_default_block_size(void)
>> +{
>> +    gchar *content = NULL;
>> +    const char *endptr;
>> +    uint64_t tmp;
>> +
>> +    if (default_block_size) {
>> +        return default_block_size;
>> +    }
>> +
>> +    /*
>> +     * Try to probe the actual THP size, fallback to (sane but eventually
>> +     * incorrect) default sizes.
>> +     */
>> +    if (g_file_get_contents(HPAGE_PMD_SIZE_PATH, &content, NULL, NULL) &&
>> +        !qemu_strtou64(content, &endptr, 0, &tmp) &&
>> +        (!endptr || *endptr == '\n')) {
>> +        /*
>> +         * Sanity-check the value, if it's too big (e.g., aarch64 with 64k base
>> +         * pages) or weird, fallback to something smaller.
>> +         */
>> +        if (!tmp || !is_power_of_2(tmp) || tmp > 16 * MiB) {
>> +            warn_report("Detected a THP size of %" PRIx64
>> +                        " MiB, falling back to 1 MiB.", tmp / MiB);
>> +            default_block_size = 1 * MiB;
> 
> Probably use macro "VIRTIO_MEM_MIN_BLOCK_SIZE"

Indeed.

>> +        } else {
>> +            default_block_size = tmp;
>> +        }
>> +    } else {
>> +#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || \
>> +    defined(__powerpc64__)
>> +        default_block_size = 2 * MiB;
>> +#else
>> +        /* fallback to 1 MiB (e.g., the THP size on s390x) */
>> +        default_block_size = 1 * MiB;
>> +#endif
> 
> Maybe we can declare this macro near to "VIRTIO_MEM_MIN_BLOCK_SIZE
> ((uint32_t)(1 * MiB))"
> or club into one, just a thought.

I decided to not use VIRTIO_MEM_MIN_BLOCK_SIZE here to not accidentally
mess up the s390x THP size when ever wanting to decrease
VIRTIO_MEM_MIN_BLOCK_SIZE. But as we have a comment here, people should
know whats happening when ever changing VIRTIO_MEM_MIN_BLOCK_SIZE.

Thanks!
Pankaj Gupta Sept. 28, 2020, 9:31 a.m. UTC | #3
> >> Let's allow a minimum block size of 1 MiB in all configurations. Use
> >> a default block size based on the THP size, and warn if something
> >> smaller is configured by the user.
> >>
> >> VIRTIO_MEM only supports Linux (depends on LINUX), so we can probe the
> >> THP size unconditionally.
> >>
> >> For now we only support virtio-mem on x86-64 - there isn't a user-visiable
> >> change (x86-64 only supports 2 MiB THP on the PMD level) - the default
> >> was, and will be 2 MiB.
> >>
> >> If we ever have THP on the PUD level (e.g., 1 GiB THP on x86-64), we
> >> expect to have a trigger to explicitly opt-in for the new THP granularity.
> >>
> >> Cc: "Michael S. Tsirkin" <mst@redhat.com>
> >> Cc: Wei Yang <richardw.yang@linux.intel.com>
> >> Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
> >> Cc: Igor Mammedov <imammedo@redhat.com>
> >> Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
> >> Signed-off-by: David Hildenbrand <david@redhat.com>
> >> ---
> >>  hw/virtio/virtio-mem.c | 82 +++++++++++++++++++++++++++++++++++++++---
> >>  1 file changed, 78 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
> >> index 8fbec77ccc..58098686ee 100644
> >> --- a/hw/virtio/virtio-mem.c
> >> +++ b/hw/virtio/virtio-mem.c
> >> @@ -33,10 +33,70 @@
> >>  #include "trace.h"
> >>
> >>  /*
> >> - * Use QEMU_VMALLOC_ALIGN, so no THP will have to be split when unplugging
> >> - * memory (e.g., 2MB on x86_64).
> >> + * Let's not allow blocks smaller than 1 MiB, for example, to keep the tracking
> >> + * bitmap small.
> >>   */
> >> -#define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)QEMU_VMALLOC_ALIGN)
> >> +#define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)(1 * MiB))
> >> +
> >> +/*
> >> + * We want to have a reasonable default block size such that
> >> + * 1. We avoid splitting THPs when unplugging memory, which degrades
> >> + *    performance.
> >> + * 2. We avoid placing THPs for plugged blocks that also cover unplugged
> >> + *    blocks.
> >> + *
> >> + * The actual THP size might differ between Linux kernels, so we try to probe
> >> + * it. In the future (if we ever run into issues regarding 2.), we might want
> >> + * to disable THP in case we fail to properly probe the THP size, or if the
> >> + * block size is configured smaller than the THP size.
> >> + */
> >> +static uint32_t default_block_size;
> >> +
> >> +#define HPAGE_PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
> >> +static uint32_t virtio_mem_default_block_size(void)
> >> +{
> >> +    gchar *content = NULL;
> >> +    const char *endptr;
> >> +    uint64_t tmp;
> >> +
> >> +    if (default_block_size) {
> >> +        return default_block_size;
> >> +    }
> >> +
> >> +    /*
> >> +     * Try to probe the actual THP size, fallback to (sane but eventually
> >> +     * incorrect) default sizes.
> >> +     */
> >> +    if (g_file_get_contents(HPAGE_PMD_SIZE_PATH, &content, NULL, NULL) &&
> >> +        !qemu_strtou64(content, &endptr, 0, &tmp) &&
> >> +        (!endptr || *endptr == '\n')) {
> >> +        /*
> >> +         * Sanity-check the value, if it's too big (e.g., aarch64 with 64k base
> >> +         * pages) or weird, fallback to something smaller.
> >> +         */
> >> +        if (!tmp || !is_power_of_2(tmp) || tmp > 16 * MiB) {
> >> +            warn_report("Detected a THP size of %" PRIx64
> >> +                        " MiB, falling back to 1 MiB.", tmp / MiB);
> >> +            default_block_size = 1 * MiB;
> >
> > Probably use macro "VIRTIO_MEM_MIN_BLOCK_SIZE"
>
> Indeed.
>
> >> +        } else {
> >> +            default_block_size = tmp;
> >> +        }
> >> +    } else {
> >> +#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || \
> >> +    defined(__powerpc64__)
> >> +        default_block_size = 2 * MiB;
> >> +#else
> >> +        /* fallback to 1 MiB (e.g., the THP size on s390x) */
> >> +        default_block_size = 1 * MiB;
> >> +#endif
> >
> > Maybe we can declare this macro near to "VIRTIO_MEM_MIN_BLOCK_SIZE
> > ((uint32_t)(1 * MiB))"
> > or club into one, just a thought.
>
> I decided to not use VIRTIO_MEM_MIN_BLOCK_SIZE here to not accidentally
> mess up the s390x THP size when ever wanting to decrease
> VIRTIO_MEM_MIN_BLOCK_SIZE. But as we have a comment here, people should
> know whats happening when ever changing VIRTIO_MEM_MIN_BLOCK_SIZE.

Thanks for answering. Makes sense.

Overall the patch looks good to me.
Reviewed-by: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
David Hildenbrand Sept. 28, 2020, 9:36 a.m. UTC | #4
On 28.09.20 11:31, Pankaj Gupta wrote:
>>>> Let's allow a minimum block size of 1 MiB in all configurations. Use

>>>> a default block size based on the THP size, and warn if something

>>>> smaller is configured by the user.

>>>>

>>>> VIRTIO_MEM only supports Linux (depends on LINUX), so we can probe the

>>>> THP size unconditionally.

>>>>

>>>> For now we only support virtio-mem on x86-64 - there isn't a user-visiable

>>>> change (x86-64 only supports 2 MiB THP on the PMD level) - the default

>>>> was, and will be 2 MiB.

>>>>

>>>> If we ever have THP on the PUD level (e.g., 1 GiB THP on x86-64), we

>>>> expect to have a trigger to explicitly opt-in for the new THP granularity.

>>>>

>>>> Cc: "Michael S. Tsirkin" <mst@redhat.com>

>>>> Cc: Wei Yang <richardw.yang@linux.intel.com>

>>>> Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>

>>>> Cc: Igor Mammedov <imammedo@redhat.com>

>>>> Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>

>>>> Signed-off-by: David Hildenbrand <david@redhat.com>

>>>> ---

>>>>  hw/virtio/virtio-mem.c | 82 +++++++++++++++++++++++++++++++++++++++---

>>>>  1 file changed, 78 insertions(+), 4 deletions(-)

>>>>

>>>> diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c

>>>> index 8fbec77ccc..58098686ee 100644

>>>> --- a/hw/virtio/virtio-mem.c

>>>> +++ b/hw/virtio/virtio-mem.c

>>>> @@ -33,10 +33,70 @@

>>>>  #include "trace.h"

>>>>

>>>>  /*

>>>> - * Use QEMU_VMALLOC_ALIGN, so no THP will have to be split when unplugging

>>>> - * memory (e.g., 2MB on x86_64).

>>>> + * Let's not allow blocks smaller than 1 MiB, for example, to keep the tracking

>>>> + * bitmap small.

>>>>   */

>>>> -#define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)QEMU_VMALLOC_ALIGN)

>>>> +#define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)(1 * MiB))

>>>> +

>>>> +/*

>>>> + * We want to have a reasonable default block size such that

>>>> + * 1. We avoid splitting THPs when unplugging memory, which degrades

>>>> + *    performance.

>>>> + * 2. We avoid placing THPs for plugged blocks that also cover unplugged

>>>> + *    blocks.

>>>> + *

>>>> + * The actual THP size might differ between Linux kernels, so we try to probe

>>>> + * it. In the future (if we ever run into issues regarding 2.), we might want

>>>> + * to disable THP in case we fail to properly probe the THP size, or if the

>>>> + * block size is configured smaller than the THP size.

>>>> + */

>>>> +static uint32_t default_block_size;

>>>> +

>>>> +#define HPAGE_PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"

>>>> +static uint32_t virtio_mem_default_block_size(void)

>>>> +{

>>>> +    gchar *content = NULL;

>>>> +    const char *endptr;

>>>> +    uint64_t tmp;

>>>> +

>>>> +    if (default_block_size) {

>>>> +        return default_block_size;

>>>> +    }

>>>> +

>>>> +    /*

>>>> +     * Try to probe the actual THP size, fallback to (sane but eventually

>>>> +     * incorrect) default sizes.

>>>> +     */

>>>> +    if (g_file_get_contents(HPAGE_PMD_SIZE_PATH, &content, NULL, NULL) &&

>>>> +        !qemu_strtou64(content, &endptr, 0, &tmp) &&

>>>> +        (!endptr || *endptr == '\n')) {

>>>> +        /*

>>>> +         * Sanity-check the value, if it's too big (e.g., aarch64 with 64k base

>>>> +         * pages) or weird, fallback to something smaller.

>>>> +         */

>>>> +        if (!tmp || !is_power_of_2(tmp) || tmp > 16 * MiB) {

>>>> +            warn_report("Detected a THP size of %" PRIx64

>>>> +                        " MiB, falling back to 1 MiB.", tmp / MiB);

>>>> +            default_block_size = 1 * MiB;

>>>

>>> Probably use macro "VIRTIO_MEM_MIN_BLOCK_SIZE"

>>

>> Indeed.

>>

>>>> +        } else {

>>>> +            default_block_size = tmp;

>>>> +        }

>>>> +    } else {

>>>> +#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || \

>>>> +    defined(__powerpc64__)

>>>> +        default_block_size = 2 * MiB;

>>>> +#else

>>>> +        /* fallback to 1 MiB (e.g., the THP size on s390x) */

>>>> +        default_block_size = 1 * MiB;

>>>> +#endif

>>>

>>> Maybe we can declare this macro near to "VIRTIO_MEM_MIN_BLOCK_SIZE

>>> ((uint32_t)(1 * MiB))"

>>> or club into one, just a thought.

>>

>> I decided to not use VIRTIO_MEM_MIN_BLOCK_SIZE here to not accidentally

>> mess up the s390x THP size when ever wanting to decrease

>> VIRTIO_MEM_MIN_BLOCK_SIZE. But as we have a comment here, people should

>> know whats happening when ever changing VIRTIO_MEM_MIN_BLOCK_SIZE.

> 

> Thanks for answering. Makes sense.

> 

> Overall the patch looks good to me.

> Reviewed-by: Pankaj Gupta <pankaj.gupta.linux@gmail.com>

> 


I'll do some tweaks to the patch to make something weird (but possible)
be handled correctly (and make it overall look nicer):

It's possible to have a THP size that's bigger than a hugetlbfs page
size. In that case, we want to use the hugetlbfs page size instead (and
don't want to warn).

(I'll drop the RB for now, because it would be good if you gave it
another look - thanks!)

-- 
Thanks,

David / dhildenb
diff mbox series

Patch

diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
index 8fbec77ccc..58098686ee 100644
--- a/hw/virtio/virtio-mem.c
+++ b/hw/virtio/virtio-mem.c
@@ -33,10 +33,70 @@ 
 #include "trace.h"
 
 /*
- * Use QEMU_VMALLOC_ALIGN, so no THP will have to be split when unplugging
- * memory (e.g., 2MB on x86_64).
+ * Let's not allow blocks smaller than 1 MiB, for example, to keep the tracking
+ * bitmap small.
  */
-#define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)QEMU_VMALLOC_ALIGN)
+#define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)(1 * MiB))
+
+/*
+ * We want to have a reasonable default block size such that
+ * 1. We avoid splitting THPs when unplugging memory, which degrades
+ *    performance.
+ * 2. We avoid placing THPs for plugged blocks that also cover unplugged
+ *    blocks.
+ *
+ * The actual THP size might differ between Linux kernels, so we try to probe
+ * it. In the future (if we ever run into issues regarding 2.), we might want
+ * to disable THP in case we fail to properly probe the THP size, or if the
+ * block size is configured smaller than the THP size.
+ */
+static uint32_t default_block_size;
+
+#define HPAGE_PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
+static uint32_t virtio_mem_default_block_size(void)
+{
+    gchar *content = NULL;
+    const char *endptr;
+    uint64_t tmp;
+
+    if (default_block_size) {
+        return default_block_size;
+    }
+
+    /*
+     * Try to probe the actual THP size, fallback to (sane but eventually
+     * incorrect) default sizes.
+     */
+    if (g_file_get_contents(HPAGE_PMD_SIZE_PATH, &content, NULL, NULL) &&
+        !qemu_strtou64(content, &endptr, 0, &tmp) &&
+        (!endptr || *endptr == '\n')) {
+        /*
+         * Sanity-check the value, if it's too big (e.g., aarch64 with 64k base
+         * pages) or weird, fallback to something smaller.
+         */
+        if (!tmp || !is_power_of_2(tmp) || tmp > 16 * MiB) {
+            warn_report("Detected a THP size of %" PRIx64
+                        " MiB, falling back to 1 MiB.", tmp / MiB);
+            default_block_size = 1 * MiB;
+        } else {
+            default_block_size = tmp;
+        }
+    } else {
+#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || \
+    defined(__powerpc64__)
+        default_block_size = 2 * MiB;
+#else
+        /* fallback to 1 MiB (e.g., the THP size on s390x) */
+        default_block_size = 1 * MiB;
+#endif
+        warn_report("Could not detect THP size, falling back to %" PRIx64
+                    "  MiB.", default_block_size / MiB);
+    }
+
+    g_free(content);
+    return default_block_size;
+}
+
 /*
  * Size the usable region bigger than the requested size if possible. Esp.
  * Linux guests will only add (aligned) memory blocks in case they fully
@@ -437,6 +497,15 @@  static void virtio_mem_device_realize(DeviceState *dev, Error **errp)
     rb = vmem->memdev->mr.ram_block;
     page_size = qemu_ram_pagesize(rb);
 
+    /*
+     * If the block size wasn't configured by the user, use a sane default. This
+     * allows using hugetlbfs backends with a pagesize bigger than the detected
+     * THP size without manual intervention/configuration.
+     */
+    if (!vmem->block_size) {
+        vmem->block_size = MAX(page_size, virtio_mem_default_block_size());
+    }
+
     if (vmem->block_size < page_size) {
         error_setg(errp, "'%s' property has to be at least the page size (0x%"
                    PRIx64 ")", VIRTIO_MEM_BLOCK_SIZE_PROP, page_size);
@@ -760,6 +829,12 @@  static void virtio_mem_set_block_size(Object *obj, Visitor *v, const char *name,
         error_setg(errp, "'%s' property has to be a power of two", name);
         return;
     }
+
+    if (value < virtio_mem_default_block_size()) {
+        warn_report("'%s' property is smaller than the default block size "
+                    "(detected THP size) of %" PRIx64 " MiB", name,
+                    virtio_mem_default_block_size() / MiB);
+    }
     vmem->block_size = value;
 }
 
@@ -810,7 +885,6 @@  static void virtio_mem_instance_init(Object *obj)
 {
     VirtIOMEM *vmem = VIRTIO_MEM(obj);
 
-    vmem->block_size = VIRTIO_MEM_MIN_BLOCK_SIZE;
     notifier_list_init(&vmem->size_change_notifiers);
     vmem->precopy_notifier.notify = virtio_mem_precopy_notify;