Message ID | 20200928114909.20791-1-david@redhat.com |
---|---|
Headers | show |
Series | virtio-mem: block size and address-assignment optimizations | expand |
> Let's allow a minimum block size of 1 MiB in all configurations. Select > the default block size based on > - The page size of the memory backend. > - The THP size if the memory backend size corresponds to the real hsot s/hsot/host > page size. > - The global minimum of 1 MiB. > and warn if something smaller is configured by the user. > > VIRTIO_MEM only supports Linux (depends on LINUX), so we can probe the > THP size unconditionally. > > For now we only support virtio-mem on x86-64 - there isn't a user-visiable s/visiable/visible > change (x86-64 only supports 2 MiB THP on the PMD level) - the default > was, and will be 2 MiB. > > If we ever have THP on the PUD level (e.g., 1 GiB THP on x86-64), we > expect to have a trigger to explicitly opt-in for the new THP granularity. > > Cc: "Michael S. Tsirkin" <mst@redhat.com> > Cc: Wei Yang <richardw.yang@linux.intel.com> > Cc: Dr. David Alan Gilbert <dgilbert@redhat.com> > Cc: Igor Mammedov <imammedo@redhat.com> > Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com> > Signed-off-by: David Hildenbrand <david@redhat.com> > --- > hw/virtio/virtio-mem.c | 105 +++++++++++++++++++++++++++++++++++++++-- > 1 file changed, 101 insertions(+), 4 deletions(-) > > diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c > index 8fbec77ccc..9b1461cf9d 100644 > --- a/hw/virtio/virtio-mem.c > +++ b/hw/virtio/virtio-mem.c > @@ -33,10 +33,83 @@ > #include "trace.h" > > /* > - * Use QEMU_VMALLOC_ALIGN, so no THP will have to be split when unplugging > - * memory (e.g., 2MB on x86_64). > + * Let's not allow blocks smaller than 1 MiB, for example, to keep the tracking > + * bitmap small. > */ > -#define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)QEMU_VMALLOC_ALIGN) > +#define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)(1 * MiB)) > + > +#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || \ > + defined(__powerpc64__) > +#define VIRTIO_MEM_DEFAULT_THP_SIZE ((uint32_t)(2 * MiB)) > +#else > + /* fallback to 1 MiB (e.g., the THP size on s390x) */ > +#define VIRTIO_MEM_DEFAULT_THP_SIZE VIRTIO_MEM_MIN_BLOCK_SIZE > +#endif > + > +/* > + * We want to have a reasonable default block size such that > + * 1. We avoid splitting THPs when unplugging memory, which degrades > + * performance. > + * 2. We avoid placing THPs for plugged blocks that also cover unplugged > + * blocks. > + * > + * The actual THP size might differ between Linux kernels, so we try to probe > + * it. In the future (if we ever run into issues regarding 2.), we might want > + * to disable THP in case we fail to properly probe the THP size, or if the > + * block size is configured smaller than the THP size. > + */ > +static uint32_t thp_size; > + > +#define HPAGE_PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" > +static uint32_t virtio_mem_thp_size(void) > +{ > + gchar *content = NULL; > + const char *endptr; > + uint64_t tmp; > + > + if (thp_size) { > + return thp_size; > + } > + > + /* > + * Try to probe the actual THP size, fallback to (sane but eventually > + * incorrect) default sizes. > + */ > + if (g_file_get_contents(HPAGE_PMD_SIZE_PATH, &content, NULL, NULL) && > + !qemu_strtou64(content, &endptr, 0, &tmp) && > + (!endptr || *endptr == '\n')) { > + /* > + * Sanity-check the value, if it's too big (e.g., aarch64 with 64k base > + * pages) or weird, fallback to something smaller. > + */ > + if (!tmp || !is_power_of_2(tmp) || tmp > 16 * MiB) { > + warn_report("Read unsupported THP size: %" PRIx64, tmp); > + } else { > + thp_size = tmp; > + } > + } > + > + if (!thp_size) { > + thp_size = VIRTIO_MEM_DEFAULT_THP_SIZE; > + warn_report("Could not detect THP size, falling back to %" PRIx64 > + " MiB.", thp_size / MiB); > + } > + > + g_free(content); > + return thp_size; > +} > + > +static uint64_t virtio_mem_default_block_size(RAMBlock *rb) > +{ > + const uint64_t page_size = qemu_ram_pagesize(rb); > + > + /* We can have hugetlbfs with a page size smaller than the THP size. */ > + if (page_size == qemu_real_host_page_size) { > + return MAX(page_size, virtio_mem_thp_size()); > + } This condition is special, can think of hugetlbfs smaller in size than THP size configured. > + return MAX(page_size, VIRTIO_MEM_MIN_BLOCK_SIZE); Do we still need this? Or we can have only one return for both the cases? Probably, I am missing something here. > +} > + > /* > * Size the usable region bigger than the requested size if possible. Esp. > * Linux guests will only add (aligned) memory blocks in case they fully > @@ -437,10 +510,23 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp) > rb = vmem->memdev->mr.ram_block; > page_size = qemu_ram_pagesize(rb); > > + /* > + * If the block size wasn't configured by the user, use a sane default. This > + * allows using hugetlbfs backends of any page size without manual > + * intervention. > + */ > + if (!vmem->block_size) { > + vmem->block_size = virtio_mem_default_block_size(rb); > + } > + > if (vmem->block_size < page_size) { > error_setg(errp, "'%s' property has to be at least the page size (0x%" > PRIx64 ")", VIRTIO_MEM_BLOCK_SIZE_PROP, page_size); > return; > + } else if (vmem->block_size < virtio_mem_default_block_size(rb)) { > + warn_report("'%s' property is smaller than the default block size (%" > + PRIx64 " MiB)", VIRTIO_MEM_BLOCK_SIZE_PROP, > + virtio_mem_default_block_size(rb) / MiB); > } else if (!QEMU_IS_ALIGNED(vmem->requested_size, vmem->block_size)) { > error_setg(errp, "'%s' property has to be multiples of '%s' (0x%" PRIx64 > ")", VIRTIO_MEM_REQUESTED_SIZE_PROP, > @@ -731,6 +817,18 @@ static void virtio_mem_get_block_size(Object *obj, Visitor *v, const char *name, > const VirtIOMEM *vmem = VIRTIO_MEM(obj); > uint64_t value = vmem->block_size; > > + /* > + * If not configured by the user (and we're not realized yet), use the > + * default block size we would use with the current memory backend. > + */ > + if (!value) { > + if (vmem->memdev && memory_region_is_ram(&vmem->memdev->mr)) { > + value = virtio_mem_default_block_size(vmem->memdev->mr.ram_block); > + } else { > + value = virtio_mem_thp_size(); > + } > + } > + > visit_type_size(v, name, &value, errp); > } > > @@ -810,7 +908,6 @@ static void virtio_mem_instance_init(Object *obj) > { > VirtIOMEM *vmem = VIRTIO_MEM(obj); > > - vmem->block_size = VIRTIO_MEM_MIN_BLOCK_SIZE; > notifier_list_init(&vmem->size_change_notifiers); > vmem->precopy_notifier.notify = virtio_mem_precopy_notify;
On 29.09.20 15:54, Pankaj Gupta wrote: >> Let's allow a minimum block size of 1 MiB in all configurations. Select >> the default block size based on >> - The page size of the memory backend. >> - The THP size if the memory backend size corresponds to the real hsot > > s/hsot/host thanks! >> page size. >> - The global minimum of 1 MiB. >> and warn if something smaller is configured by the user. >> >> VIRTIO_MEM only supports Linux (depends on LINUX), so we can probe the >> THP size unconditionally. >> >> For now we only support virtio-mem on x86-64 - there isn't a user-visiable > > s/visiable/visible thanks! >> change (x86-64 only supports 2 MiB THP on the PMD level) - the default >> was, and will be 2 MiB. >> >> If we ever have THP on the PUD level (e.g., 1 GiB THP on x86-64), we >> expect to have a trigger to explicitly opt-in for the new THP granularity. >> >> Cc: "Michael S. Tsirkin" <mst@redhat.com> >> Cc: Wei Yang <richardw.yang@linux.intel.com> >> Cc: Dr. David Alan Gilbert <dgilbert@redhat.com> >> Cc: Igor Mammedov <imammedo@redhat.com> >> Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com> >> Signed-off-by: David Hildenbrand <david@redhat.com> >> --- >> hw/virtio/virtio-mem.c | 105 +++++++++++++++++++++++++++++++++++++++-- >> 1 file changed, 101 insertions(+), 4 deletions(-) >> >> diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c >> index 8fbec77ccc..9b1461cf9d 100644 >> --- a/hw/virtio/virtio-mem.c >> +++ b/hw/virtio/virtio-mem.c >> @@ -33,10 +33,83 @@ >> #include "trace.h" >> >> /* >> - * Use QEMU_VMALLOC_ALIGN, so no THP will have to be split when unplugging >> - * memory (e.g., 2MB on x86_64). >> + * Let's not allow blocks smaller than 1 MiB, for example, to keep the tracking >> + * bitmap small. >> */ >> -#define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)QEMU_VMALLOC_ALIGN) >> +#define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)(1 * MiB)) >> + >> +#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || \ >> + defined(__powerpc64__) >> +#define VIRTIO_MEM_DEFAULT_THP_SIZE ((uint32_t)(2 * MiB)) >> +#else >> + /* fallback to 1 MiB (e.g., the THP size on s390x) */ >> +#define VIRTIO_MEM_DEFAULT_THP_SIZE VIRTIO_MEM_MIN_BLOCK_SIZE >> +#endif >> + >> +/* >> + * We want to have a reasonable default block size such that >> + * 1. We avoid splitting THPs when unplugging memory, which degrades >> + * performance. >> + * 2. We avoid placing THPs for plugged blocks that also cover unplugged >> + * blocks. >> + * >> + * The actual THP size might differ between Linux kernels, so we try to probe >> + * it. In the future (if we ever run into issues regarding 2.), we might want >> + * to disable THP in case we fail to properly probe the THP size, or if the >> + * block size is configured smaller than the THP size. >> + */ >> +static uint32_t thp_size; >> + >> +#define HPAGE_PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" >> +static uint32_t virtio_mem_thp_size(void) >> +{ >> + gchar *content = NULL; >> + const char *endptr; >> + uint64_t tmp; >> + >> + if (thp_size) { >> + return thp_size; >> + } >> + >> + /* >> + * Try to probe the actual THP size, fallback to (sane but eventually >> + * incorrect) default sizes. >> + */ >> + if (g_file_get_contents(HPAGE_PMD_SIZE_PATH, &content, NULL, NULL) && >> + !qemu_strtou64(content, &endptr, 0, &tmp) && >> + (!endptr || *endptr == '\n')) { >> + /* >> + * Sanity-check the value, if it's too big (e.g., aarch64 with 64k base >> + * pages) or weird, fallback to something smaller. >> + */ >> + if (!tmp || !is_power_of_2(tmp) || tmp > 16 * MiB) { >> + warn_report("Read unsupported THP size: %" PRIx64, tmp); >> + } else { >> + thp_size = tmp; >> + } >> + } >> + >> + if (!thp_size) { >> + thp_size = VIRTIO_MEM_DEFAULT_THP_SIZE; >> + warn_report("Could not detect THP size, falling back to %" PRIx64 >> + " MiB.", thp_size / MiB); >> + } >> + >> + g_free(content); >> + return thp_size; >> +} >> + >> +static uint64_t virtio_mem_default_block_size(RAMBlock *rb) >> +{ >> + const uint64_t page_size = qemu_ram_pagesize(rb); >> + >> + /* We can have hugetlbfs with a page size smaller than the THP size. */ >> + if (page_size == qemu_real_host_page_size) { >> + return MAX(page_size, virtio_mem_thp_size()); >> + } > > This condition is special, can think of hugetlbfs smaller in size than THP size > configured. Yeah, there are weird architectures, most prominently arm64: https://www.kernel.org/doc/html/latest/arm64/hugetlbpage.html Assume you're on 64K base pages with a probed 512MB THP size (currently). You could have hugetlbfs with 2MB page size via "CONT PTE" bits. >> + return MAX(page_size, VIRTIO_MEM_MIN_BLOCK_SIZE); > > Do we still need this? Or we can have only one return for both the cases? > Probably, I am missing something here. We still need it. Assume somebody would have 64K hugetlbfs on arm64 (with 4k base pages), we want to make sure we use at least 1MB blocks. Thanks!
> >> Let's allow a minimum block size of 1 MiB in all configurations. Select > >> the default block size based on > >> - The page size of the memory backend. > >> - The THP size if the memory backend size corresponds to the real hsot > > > > s/hsot/host > > thanks! > > >> page size. > >> - The global minimum of 1 MiB. > >> and warn if something smaller is configured by the user. > >> > >> VIRTIO_MEM only supports Linux (depends on LINUX), so we can probe the > >> THP size unconditionally. > >> > >> For now we only support virtio-mem on x86-64 - there isn't a user-visiable > > > > s/visiable/visible > > thanks! > > >> change (x86-64 only supports 2 MiB THP on the PMD level) - the default > >> was, and will be 2 MiB. > >> > >> If we ever have THP on the PUD level (e.g., 1 GiB THP on x86-64), we > >> expect to have a trigger to explicitly opt-in for the new THP granularity. > >> > >> Cc: "Michael S. Tsirkin" <mst@redhat.com> > >> Cc: Wei Yang <richardw.yang@linux.intel.com> > >> Cc: Dr. David Alan Gilbert <dgilbert@redhat.com> > >> Cc: Igor Mammedov <imammedo@redhat.com> > >> Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com> > >> Signed-off-by: David Hildenbrand <david@redhat.com> > >> --- > >> hw/virtio/virtio-mem.c | 105 +++++++++++++++++++++++++++++++++++++++-- > >> 1 file changed, 101 insertions(+), 4 deletions(-) > >> > >> diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c > >> index 8fbec77ccc..9b1461cf9d 100644 > >> --- a/hw/virtio/virtio-mem.c > >> +++ b/hw/virtio/virtio-mem.c > >> @@ -33,10 +33,83 @@ > >> #include "trace.h" > >> > >> /* > >> - * Use QEMU_VMALLOC_ALIGN, so no THP will have to be split when unplugging > >> - * memory (e.g., 2MB on x86_64). > >> + * Let's not allow blocks smaller than 1 MiB, for example, to keep the tracking > >> + * bitmap small. > >> */ > >> -#define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)QEMU_VMALLOC_ALIGN) > >> +#define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)(1 * MiB)) > >> + > >> +#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || \ > >> + defined(__powerpc64__) > >> +#define VIRTIO_MEM_DEFAULT_THP_SIZE ((uint32_t)(2 * MiB)) > >> +#else > >> + /* fallback to 1 MiB (e.g., the THP size on s390x) */ > >> +#define VIRTIO_MEM_DEFAULT_THP_SIZE VIRTIO_MEM_MIN_BLOCK_SIZE > >> +#endif > >> + > >> +/* > >> + * We want to have a reasonable default block size such that > >> + * 1. We avoid splitting THPs when unplugging memory, which degrades > >> + * performance. > >> + * 2. We avoid placing THPs for plugged blocks that also cover unplugged > >> + * blocks. > >> + * > >> + * The actual THP size might differ between Linux kernels, so we try to probe > >> + * it. In the future (if we ever run into issues regarding 2.), we might want > >> + * to disable THP in case we fail to properly probe the THP size, or if the > >> + * block size is configured smaller than the THP size. > >> + */ > >> +static uint32_t thp_size; > >> + > >> +#define HPAGE_PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" > >> +static uint32_t virtio_mem_thp_size(void) > >> +{ > >> + gchar *content = NULL; > >> + const char *endptr; > >> + uint64_t tmp; > >> + > >> + if (thp_size) { > >> + return thp_size; > >> + } > >> + > >> + /* > >> + * Try to probe the actual THP size, fallback to (sane but eventually > >> + * incorrect) default sizes. > >> + */ > >> + if (g_file_get_contents(HPAGE_PMD_SIZE_PATH, &content, NULL, NULL) && > >> + !qemu_strtou64(content, &endptr, 0, &tmp) && > >> + (!endptr || *endptr == '\n')) { > >> + /* > >> + * Sanity-check the value, if it's too big (e.g., aarch64 with 64k base > >> + * pages) or weird, fallback to something smaller. > >> + */ > >> + if (!tmp || !is_power_of_2(tmp) || tmp > 16 * MiB) { > >> + warn_report("Read unsupported THP size: %" PRIx64, tmp); > >> + } else { > >> + thp_size = tmp; > >> + } > >> + } > >> + > >> + if (!thp_size) { > >> + thp_size = VIRTIO_MEM_DEFAULT_THP_SIZE; > >> + warn_report("Could not detect THP size, falling back to %" PRIx64 > >> + " MiB.", thp_size / MiB); > >> + } > >> + > >> + g_free(content); > >> + return thp_size; > >> +} > >> + > >> +static uint64_t virtio_mem_default_block_size(RAMBlock *rb) > >> +{ > >> + const uint64_t page_size = qemu_ram_pagesize(rb); > >> + > >> + /* We can have hugetlbfs with a page size smaller than the THP size. */ > >> + if (page_size == qemu_real_host_page_size) { > >> + return MAX(page_size, virtio_mem_thp_size()); > >> + } > > > > This condition is special, can think of hugetlbfs smaller in size than THP size > > configured. > > Yeah, there are weird architectures, most prominently arm64: > > https://www.kernel.org/doc/html/latest/arm64/hugetlbpage.html > > Assume you're on 64K base pages with a probed 512MB THP size > (currently). You could have hugetlbfs with 2MB page size via "CONT PTE" > bits. Ok. I understand now. Thanks for explaining! > > >> + return MAX(page_size, VIRTIO_MEM_MIN_BLOCK_SIZE); > > > > Do we still need this? Or we can have only one return for both the cases? > > Probably, I am missing something here. > > We still need it. Assume somebody would have 64K hugetlbfs on arm64 > (with 4k base pages), we want to make sure we use at least 1MB blocks. ok. got it. Looks good to me. Reviewed-by: Pankaj Gupta <pankaj.gupta.linux@gmail.com>