[2/9] mm/page_alloc: Add a bulk page allocator

Message ID 20210325114228.27719-3-mgorman@techsingularity.net
State New
Series Introduce a bulk order-0 page allocator with two in-tree users

Commit Message

Mel Gorman March 25, 2021, 11:42 a.m. UTC
This patch adds a new page allocator interface via alloc_pages_bulk,
and __alloc_pages_bulk_nodemask. A caller requests a number of pages
to be allocated and added to a list.

The API is not guaranteed to return the requested number of pages and
may fail if the preferred allocation zone has limited free memory, the
cpuset changes during the allocation or page debugging decides to fail
an allocation. It's up to the caller to request more pages in batch
if necessary.
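
For example, a caller that needs an exact number of pages could loop on the
return value roughly like this (a caller-side sketch only, not part of this
patch; fill_page_list() is a hypothetical helper and assumes the return value
counts the pages added by this call, as the implementation below does):

	/*
	 * Hypothetical helper: retry alloc_pages_bulk() until 'want' pages
	 * are on the list, or give up when nothing at all is returned.
	 */
	static int fill_page_list(gfp_t gfp, unsigned long want,
				  struct list_head *list)
	{
		unsigned long have = 0;

		while (have < want) {
			unsigned long got;

			got = alloc_pages_bulk(gfp, want - have, list);
			if (!got)
				return -ENOMEM;	/* caller picks a fallback */
			have += got;
		}

		return 0;
	}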

Note that this implementation is not very efficient and could be improved
but it would require refactoring. The intent is to make it available early
to determine what semantics are required by different callers. Once the
full semantics are nailed down, it can be refactored.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/gfp.h |  11 +++++
 mm/page_alloc.c     | 118 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 129 insertions(+)

Comments

Matthew Wilcox March 25, 2021, 12:05 p.m. UTC | #1
On Thu, Mar 25, 2021 at 11:42:21AM +0000, Mel Gorman wrote:
> +int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
> +				nodemask_t *nodemask, int nr_pages,
> +				struct list_head *list);
> +
> +/* Bulk allocate order-0 pages */
> +static inline unsigned long
> +alloc_pages_bulk(gfp_t gfp, unsigned long nr_pages, struct list_head *list)
> +{
> +	return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list);

Discrepancy in the two return types here.  Suspect they should both
be 'unsigned int' so there's no question about "can it return an errno".
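
Spelled out, that suggestion would make both declarations return unsigned int,
along these lines (a sketch of the suggestion only, not the committed fix):

	unsigned int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
					nodemask_t *nodemask, int nr_pages,
					struct list_head *list);

	/* Bulk allocate order-0 pages */
	static inline unsigned int
	alloc_pages_bulk(gfp_t gfp, unsigned long nr_pages, struct list_head *list)
	{
		return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list);
	}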

>  
> +/*

If you could make that "/**" instead ...
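
That is, the block comment above __alloc_pages_bulk() would become a kernel-doc
comment, roughly as follows (an illustrative sketch reusing the patch's own
text; kernel-doc also prefers a "Return:" section for the return value):

	/**
	 * __alloc_pages_bulk - Allocate a number of order-0 pages to a list
	 * @gfp: GFP flags for the allocation
	 * @preferred_nid: The preferred NUMA node ID to allocate from
	 * @nodemask: Set of nodes to allocate from, may be NULL
	 * @nr_pages: The number of pages desired on the list
	 * @page_list: List to store the allocated pages
	 *
	 * This is a batched version of the page allocator that attempts to
	 * allocate nr_pages quickly and add them to a list.
	 *
	 * Return: the number of pages on the list.
	 */
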
Vlastimil Babka April 12, 2021, 10:21 a.m. UTC | #2
On 3/25/21 12:42 PM, Mel Gorman wrote:
> This patch adds a new page allocator interface via alloc_pages_bulk,
> and __alloc_pages_bulk_nodemask. A caller requests a number of pages
> to be allocated and added to a list.
> 
> The API is not guaranteed to return the requested number of pages and
> may fail if the preferred allocation zone has limited free memory, the
> cpuset changes during the allocation or page debugging decides to fail
> an allocation. It's up to the caller to request more pages in batch
> if necessary.
> 
> Note that this implementation is not very efficient and could be improved
> but it would require refactoring. The intent is to make it available early
> to determine what semantics are required by different callers. Once the
> full semantics are nailed down, it can be refactored.
> 
> Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
> Acked-by: Vlastimil Babka <vbabka@suse.cz>
> ---
>  include/linux/gfp.h |  11 +++++
>  mm/page_alloc.c     | 118 ++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 129 insertions(+)
> 
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index 0a88f84b08f4..4a304fd39916 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -518,6 +518,17 @@ static inline int arch_make_page_accessible(struct page *page)
>  struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
>  		nodemask_t *nodemask);
>  
> +int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
> +				nodemask_t *nodemask, int nr_pages,
> +				struct list_head *list);
> +
> +/* Bulk allocate order-0 pages */
> +static inline unsigned long
> +alloc_pages_bulk(gfp_t gfp, unsigned long nr_pages, struct list_head *list)
> +{
> +	return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list);
> +}
> +
>  /*
>   * Allocate pages, preferring the node given as nid. The node must be valid and
>   * online. For more general interface, see alloc_pages_node().
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 8a3e13277e22..eb547470a7e4 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -4965,6 +4965,124 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
>  	return true;
>  }
>  
> +/*
> + * __alloc_pages_bulk - Allocate a number of order-0 pages to a list
> + * @gfp: GFP flags for the allocation
> + * @preferred_nid: The preferred NUMA node ID to allocate from
> + * @nodemask: Set of nodes to allocate from, may be NULL
> + * @nr_pages: The number of pages desired on the list
> + * @page_list: List to store the allocated pages
> + *
> + * This is a batched version of the page allocator that attempts to
> + * allocate nr_pages quickly and add them to a list.
> + *
> + * Returns the number of pages on the list.
> + */
> +int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
> +			nodemask_t *nodemask, int nr_pages,
> +			struct list_head *page_list)
> +{
> +	struct page *page;
> +	unsigned long flags;
> +	struct zone *zone;
> +	struct zoneref *z;
> +	struct per_cpu_pages *pcp;
> +	struct list_head *pcp_list;
> +	struct alloc_context ac;
> +	gfp_t alloc_gfp;
> +	unsigned int alloc_flags;

Was going to complain that this is not set to ALLOC_WMARK_LOW. Must be faster
next time...
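
(The change being hinted at would presumably be just the initializer, e.g.

	unsigned int alloc_flags = ALLOC_WMARK_LOW;

sketched here for context rather than quoted from a follow-up patch.)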

> +	int allocated = 0;
> +
> +	if (WARN_ON_ONCE(nr_pages <= 0))
> +		return 0;
> +
> +	/* Use the single page allocator for one page. */
> +	if (nr_pages == 1)
> +		goto failed;
> +
> +	/* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */

I don't understand this comment. Only alloc_flags_nofragment() sets this flag
and we don't use it here?

> +	gfp &= gfp_allowed_mask;
> +	alloc_gfp = gfp;
> +	if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags))
> +		return 0;
> +	gfp = alloc_gfp;
> +
> +	/* Find an allowed local zone that meets the high watermark. */

Should it say "low watermark"?

Vlastimil
Mel Gorman April 12, 2021, 10:59 a.m. UTC | #3
On Mon, Apr 12, 2021 at 12:21:42PM +0200, Vlastimil Babka wrote:
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index 8a3e13277e22..eb547470a7e4 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -4965,6 +4965,124 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
> >  	return true;
> >  }
> >  
> > +/*
> > + * __alloc_pages_bulk - Allocate a number of order-0 pages to a list
> > + * @gfp: GFP flags for the allocation
> > + * @preferred_nid: The preferred NUMA node ID to allocate from
> > + * @nodemask: Set of nodes to allocate from, may be NULL
> > + * @nr_pages: The number of pages desired on the list
> > + * @page_list: List to store the allocated pages
> > + *
> > + * This is a batched version of the page allocator that attempts to
> > + * allocate nr_pages quickly and add them to a list.
> > + *
> > + * Returns the number of pages on the list.
> > + */
> > +int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
> > +			nodemask_t *nodemask, int nr_pages,
> > +			struct list_head *page_list)
> > +{
> > +	struct page *page;
> > +	unsigned long flags;
> > +	struct zone *zone;
> > +	struct zoneref *z;
> > +	struct per_cpu_pages *pcp;
> > +	struct list_head *pcp_list;
> > +	struct alloc_context ac;
> > +	gfp_t alloc_gfp;
> > +	unsigned int alloc_flags;
> 
> Was going to complain that this is not set to ALLOC_WMARK_LOW. Must be faster
> next time...
> 

Good that you caught it anyway!

> > +	int allocated = 0;
> > +
> > +	if (WARN_ON_ONCE(nr_pages <= 0))
> > +		return 0;
> > +
> > +	/* Use the single page allocator for one page. */
> > +	if (nr_pages == 1)
> > +		goto failed;
> > +
> > +	/* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */
> 
> I don't understand this comment. Only alloc_flags_nofragment() sets this flag
> and we don't use it here?
> 

It's there as a reminder that there are non-obvious consequences
to ALLOC_NOFRAGMENT that may affect the bulk allocation success
rate. __rmqueue_fallback will only select pageblock_order pages and if that
fails, we fall into the slow path that allocates a single page. I didn't
deal with it because it was not obvious that it's even relevant but I bet
in 6 months time, I'll forget that ALLOC_NOFRAGMENT may affect success
rates without the comment. I'm waiting for a bug that can trivially trigger
a case with a meaningful workload where the success rate is poor enough to
affect latency before adding complexity. Ideally by then, the allocation
paths would be unified a bit better.

> > +	gfp &= gfp_allowed_mask;
> > +	alloc_gfp = gfp;
> > +	if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags))
> > +		return 0;
> > +	gfp = alloc_gfp;
> > +
> > +	/* Find an allowed local zone that meets the high watermark. */
> 
> Should it say "low watermark"?
> 

Yeah, that's leftover from an earlier prototype :(

-- 
Mel Gorman
SUSE Labs
Mel Gorman April 12, 2021, 11:19 a.m. UTC | #4
On Mon, Apr 12, 2021 at 11:59:38AM +0100, Mel Gorman wrote:
> > I don't understand this comment. Only alloc_flags_nofragment() sets this flag
> > and we don't use it here?
> > 
> 
> It's there as a reminder that there are non-obvious consequences
> to ALLOC_NOFRAGMENT that may affect the bulk allocation success
> rate. __rmqueue_fallback will only select pageblock_order pages and if that
> fails, we fall into the slow path that allocates a single page. I didn't
> deal with it because it was not obvious that it's even relevant but I bet
> in 6 months time, I'll forget that ALLOC_NOFRAGMENT may affect success
> rates without the comment. I'm waiting for a bug that can trivially trigger
> a case with a meaningful workload where the success rate is poor enough to
> affect latency before adding complexity. Ideally by then, the allocation
> paths would be unified a bit better.
> 

So this needs better clarification. ALLOC_NOFRAGMENT is not a
problem at the moment but at one point during development, it was a
non-obvious potential problem. If the paths are unified, ALLOC_NOFRAGMENT
*potentially* becomes a problem depending on how it's done and it needs
careful consideration. For example, it could be partly unified by moving
the alloc_flags_nofragment() call into prepare_alloc_pages because in
__alloc_pages, it always happens and it looks like an obvious partial
unification. Hence the comment "May set ALLOC_NOFRAGMENT" because I wanted
a reminder in case I "fixed" this in 6 months time and forgot the downside.
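
Concretely, the partial unification described above would amount to moving a
line like the following out of __alloc_pages() and into prepare_alloc_pages()
(an illustrative sketch only, not a proposed patch):

	/*
	 * Hypothetical: if this call moved into prepare_alloc_pages(), the
	 * bulk allocator would inherit ALLOC_NOFRAGMENT as a side effect.
	 */
	*alloc_flags |= alloc_flags_nofragment(ac->preferred_zoneref->zone, gfp_mask);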

-- 
Mel Gorman
SUSE Labs

Patch

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 0a88f84b08f4..4a304fd39916 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -518,6 +518,17 @@  static inline int arch_make_page_accessible(struct page *page)
 struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
 		nodemask_t *nodemask);
 
+int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
+				nodemask_t *nodemask, int nr_pages,
+				struct list_head *list);
+
+/* Bulk allocate order-0 pages */
+static inline unsigned long
+alloc_pages_bulk(gfp_t gfp, unsigned long nr_pages, struct list_head *list)
+{
+	return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list);
+}
+
 /*
  * Allocate pages, preferring the node given as nid. The node must be valid and
  * online. For more general interface, see alloc_pages_node().
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8a3e13277e22..eb547470a7e4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4965,6 +4965,124 @@  static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
 	return true;
 }
 
+/*
+ * __alloc_pages_bulk - Allocate a number of order-0 pages to a list
+ * @gfp: GFP flags for the allocation
+ * @preferred_nid: The preferred NUMA node ID to allocate from
+ * @nodemask: Set of nodes to allocate from, may be NULL
+ * @nr_pages: The number of pages desired on the list
+ * @page_list: List to store the allocated pages
+ *
+ * This is a batched version of the page allocator that attempts to
+ * allocate nr_pages quickly and add them to a list.
+ *
+ * Returns the number of pages on the list.
+ */
+int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
+			nodemask_t *nodemask, int nr_pages,
+			struct list_head *page_list)
+{
+	struct page *page;
+	unsigned long flags;
+	struct zone *zone;
+	struct zoneref *z;
+	struct per_cpu_pages *pcp;
+	struct list_head *pcp_list;
+	struct alloc_context ac;
+	gfp_t alloc_gfp;
+	unsigned int alloc_flags;
+	int allocated = 0;
+
+	if (WARN_ON_ONCE(nr_pages <= 0))
+		return 0;
+
+	/* Use the single page allocator for one page. */
+	if (nr_pages == 1)
+		goto failed;
+
+	/* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */
+	gfp &= gfp_allowed_mask;
+	alloc_gfp = gfp;
+	if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags))
+		return 0;
+	gfp = alloc_gfp;
+
+	/* Find an allowed local zone that meets the high watermark. */
+	for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) {
+		unsigned long mark;
+
+		if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) &&
+		    !__cpuset_zone_allowed(zone, gfp)) {
+			continue;
+		}
+
+		if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone &&
+		    zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) {
+			goto failed;
+		}
+
+		mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
+		if (zone_watermark_fast(zone, 0,  mark,
+				zonelist_zone_idx(ac.preferred_zoneref),
+				alloc_flags, gfp)) {
+			break;
+		}
+	}
+
+	/*
+	 * If there are no allowed local zones that meets the watermarks then
+	 * try to allocate a single page and reclaim if necessary.
+	 */
+	if (!zone)
+		goto failed;
+
+	/* Attempt the batch allocation */
+	local_irq_save(flags);
+	pcp = &this_cpu_ptr(zone->pageset)->pcp;
+	pcp_list = &pcp->lists[ac.migratetype];
+
+	while (allocated < nr_pages) {
+		page = __rmqueue_pcplist(zone, ac.migratetype, alloc_flags,
+								pcp, pcp_list);
+		if (!page) {
+			/* Try and get at least one page */
+			if (!allocated)
+				goto failed_irq;
+			break;
+		}
+
+		/*
+		 * Ideally this would be batched but the best way to do
+		 * that cheaply is to first convert zone_statistics to
+		 * be inaccurate per-cpu counter like vm_events to avoid
+		 * a RMW cycle then do the accounting with IRQs enabled.
+		 */
+		__count_zid_vm_events(PGALLOC, zone_idx(zone), 1);
+		zone_statistics(ac.preferred_zoneref->zone, zone);
+
+		prep_new_page(page, 0, gfp, 0);
+		list_add(&page->lru, page_list);
+		allocated++;
+	}
+
+	local_irq_restore(flags);
+
+	return allocated;
+
+failed_irq:
+	local_irq_restore(flags);
+
+failed:
+	page = __alloc_pages(gfp, 0, preferred_nid, nodemask);
+	if (page) {
+		list_add(&page->lru, page_list);
+		allocated = 1;
+	}
+
+	return allocated;
+}
+EXPORT_SYMBOL_GPL(__alloc_pages_bulk);
+
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */