
[02/11] mm/page_alloc: Convert per-cpu list protection to local_lock

Message ID: 20210407202423.16022-3-mgorman@techsingularity.net
State: Superseded
Series: Use local_lock for pcp protection and reduce stat overhead

Commit Message

Mel Gorman April 7, 2021, 8:24 p.m. UTC
There is a lack of clarity of what exactly local_irq_save/local_irq_restore
protects in page_alloc.c. It conflates the protection of per-cpu page
allocation structures with per-cpu vmstat deltas.

This patch protects the PCP structure using local_lock which for most
configurations is identical to IRQ enabling/disabling.  The scope of the
lock is still wider than it should be but this is decreased later.

[lkp@intel.com: Make pagesets static]
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
 include/linux/mmzone.h |  2 ++
 mm/page_alloc.c        | 50 +++++++++++++++++++++++++++++-------------
 2 files changed, 37 insertions(+), 15 deletions(-)

Comments

Peter Zijlstra April 8, 2021, 10:52 a.m. UTC | #1
On Wed, Apr 07, 2021 at 09:24:14PM +0100, Mel Gorman wrote:
> There is a lack of clarity of what exactly local_irq_save/local_irq_restore
> protects in page_alloc.c . It conflates the protection of per-cpu page
> allocation structures with per-cpu vmstat deltas.
> 
> This patch protects the PCP structure using local_lock which for most
> configurations is identical to IRQ enabling/disabling.  The scope of the
> lock is still wider than it should be but this is decreased laster.

> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index a4393ac27336..106da8fbc72a 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h

> @@ -337,6 +338,7 @@ enum zone_watermarks {
>  #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
>  #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
>  
> +/* Fields and list protected by pagesets local_lock in page_alloc.c */
>  struct per_cpu_pages {
>  	int count;		/* number of pages in the list */
>  	int high;		/* high watermark, emptying needed */

> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index a68bacddcae0..e9e60d1a85d4 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -112,6 +112,13 @@ typedef int __bitwise fpi_t;
>  static DEFINE_MUTEX(pcp_batch_high_lock);
>  #define MIN_PERCPU_PAGELIST_FRACTION	(8)
>  
> +struct pagesets {
> +	local_lock_t lock;
> +};
> +static DEFINE_PER_CPU(struct pagesets, pagesets) = {
> +	.lock = INIT_LOCAL_LOCK(lock),
> +};

So why isn't the local_lock_t in struct per_cpu_pages ? That seems to be
the actual object that is protected by it and is already per-cpu.

Is that because you want to avoid the duplication across zones? Is that
worth the effort?
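
For context, embedding the lock as suggested would look roughly like this; a
sketch only, with the field layout abridged from the struct per_cpu_pages hunk
quoted above, not code from the posted series:

	struct per_cpu_pages {
		local_lock_t lock;	/* would protect the fields below */
		int count;		/* number of pages in the list */
		int high;		/* high watermark, emptying needed */
		/* ... batch size and the per-migratetype lists ... */
	};
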
Mel Gorman April 8, 2021, 5:42 p.m. UTC | #2
On Thu, Apr 08, 2021 at 12:52:07PM +0200, Peter Zijlstra wrote:
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index a68bacddcae0..e9e60d1a85d4 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -112,6 +112,13 @@ typedef int __bitwise fpi_t;
> >  static DEFINE_MUTEX(pcp_batch_high_lock);
> >  #define MIN_PERCPU_PAGELIST_FRACTION	(8)
> >  
> > +struct pagesets {
> > +	local_lock_t lock;
> > +};
> > +static DEFINE_PER_CPU(struct pagesets, pagesets) = {
> > +	.lock = INIT_LOCAL_LOCK(lock),
> > +};
> 
> So why isn't the local_lock_t in struct per_cpu_pages ? That seems to be
> the actual object that is protected by it and is already per-cpu.
> 
> Is that because you want to avoid the duplication across zones? Is that
> worth the effort?

When I wrote the patch, the problem was that zone_pcp_reset freed the
per_cpu_pages structure and it was "protected" by local_irq_save(). If
that was converted to local_lock_irq then the structure containing the
lock is freed before it is released which is obviously bad.

Much later when trying to make the allocator RT-safe in general, I realised
that locking was broken and fixed it in patch 3 of this series. With that,
the local_lock could potentially be embedded within per_cpu_pages safely
at the end of this series.
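
To illustrate the ordering hazard described above, a minimal sketch of what a
naive conversion of zone_pcp_reset() would have looked like with the lock
embedded in the structure it frees; this is a hypothetical fragment, not code
from the series:

	/* Hypothetical embedded-lock conversion of zone_pcp_reset() */
	struct per_cpu_pages *pcp = this_cpu_ptr(zone->per_cpu_pageset);
	unsigned long flags;

	local_lock_irqsave(&pcp->lock, flags);
	free_percpu(zone->per_cpu_pageset);		/* frees the memory that holds the lock */
	zone->per_cpu_pageset = &boot_pageset;
	local_unlock_irqrestore(&pcp->lock, flags);	/* unlock now touches freed memory */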

-- 
Mel Gorman
SUSE Labs
Peter Zijlstra April 9, 2021, 6:39 a.m. UTC | #3
On Thu, Apr 08, 2021 at 06:42:44PM +0100, Mel Gorman wrote:
> On Thu, Apr 08, 2021 at 12:52:07PM +0200, Peter Zijlstra wrote:
> > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > > index a68bacddcae0..e9e60d1a85d4 100644
> > > --- a/mm/page_alloc.c
> > > +++ b/mm/page_alloc.c
> > > @@ -112,6 +112,13 @@ typedef int __bitwise fpi_t;
> > >  static DEFINE_MUTEX(pcp_batch_high_lock);
> > >  #define MIN_PERCPU_PAGELIST_FRACTION	(8)
> > >  
> > > +struct pagesets {
> > > +	local_lock_t lock;
> > > +};
> > > +static DEFINE_PER_CPU(struct pagesets, pagesets) = {
> > > +	.lock = INIT_LOCAL_LOCK(lock),
> > > +};
> > 
> > So why isn't the local_lock_t in struct per_cpu_pages ? That seems to be
> > the actual object that is protected by it and is already per-cpu.
> > 
> > Is that because you want to avoid the duplication across zones? Is that
> > worth the effort?
> 
> When I wrote the patch, the problem was that zone_pcp_reset freed the
> per_cpu_pages structure and it was "protected" by local_irq_save(). If
> that was converted to local_lock_irq then the structure containing the
> lock is freed before it is released which is obviously bad.
> 
> Much later when trying to make the allocator RT-safe in general, I realised
> that locking was broken and fixed it in patch 3 of this series. With that,
> the local_lock could potentially be embedded within per_cpu_pages safely
> at the end of this series.

Fair enough; I was just wondering why the obvious solution wasn't chosen
and neither changelog nor comment explain, so I had to ask :-)
Mel Gorman April 9, 2021, 7:59 a.m. UTC | #4
On Fri, Apr 09, 2021 at 08:39:45AM +0200, Peter Zijlstra wrote:
> On Thu, Apr 08, 2021 at 06:42:44PM +0100, Mel Gorman wrote:
> > On Thu, Apr 08, 2021 at 12:52:07PM +0200, Peter Zijlstra wrote:
> > > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > > > index a68bacddcae0..e9e60d1a85d4 100644
> > > > --- a/mm/page_alloc.c
> > > > +++ b/mm/page_alloc.c
> > > > @@ -112,6 +112,13 @@ typedef int __bitwise fpi_t;
> > > >  static DEFINE_MUTEX(pcp_batch_high_lock);
> > > >  #define MIN_PERCPU_PAGELIST_FRACTION	(8)
> > > >  
> > > > +struct pagesets {
> > > > +	local_lock_t lock;
> > > > +};
> > > > +static DEFINE_PER_CPU(struct pagesets, pagesets) = {
> > > > +	.lock = INIT_LOCAL_LOCK(lock),
> > > > +};
> > > 
> > > So why isn't the local_lock_t in struct per_cpu_pages ? That seems to be
> > > the actual object that is protected by it and is already per-cpu.
> > > 
> > > Is that because you want to avoid the duplication across zones? Is that
> > > worth the effort?
> > 
> > When I wrote the patch, the problem was that zone_pcp_reset freed the
> > per_cpu_pages structure and it was "protected" by local_irq_save(). If
> > that was converted to local_lock_irq then the structure containing the
> > lock is freed before it is released which is obviously bad.
> > 
> > Much later when trying to make the allocator RT-safe in general, I realised
> > that locking was broken and fixed it in patch 3 of this series. With that,
> > the local_lock could potentially be embedded within per_cpu_pages safely
> > at the end of this series.
> 
> Fair enough; I was just wondering why the obvious solution wasn't chosen
> and neither changelog nor comment explain, so I had to ask :-)

It's a fair question and it was my first approach before I hit problems.
Thinking again this morning, I remembered that another problem I hit was
patterns like this

        local_lock_irqsave(&pagesets.lock, flags);
        pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);

turning into

	cpu = get_cpu();
        pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
        local_lock_irqsave(&pcp->lock, flags);

That has its own problems if zone->lock was acquired within the
local_lock_irqsave section (Section "spinlock_t and rwlock_t" in
Documentation/locking/locktypes.rst) so it has to turn into

	migrate_disable();
	pcp = this_cpu_ptr(zone->per_cpu_pageset);
        local_lock_irqsave(&pcp->lock, flags);

I did not want to start adding migrate_disable() in multiple places like
this because I'm guessing that new users of migrate_disable() need strong
justification and adding such code in page_alloc.c might cause cargo-cult
copy&paste in the future. Maybe it could be addressed with a helper like
this_cpu_local_lock or this_cpu_local_lock_irq but that means in some
cases that the PCP structure is looked up twice with patterns like this one

        local_lock_irqsave(&pagesets.lock, flags);
        free_unref_page_commit(page, pfn, migratetype);
        local_unlock_irqrestore(&pagesets.lock, flags);

To get around multiple lookups the helper becomes something that disables
migration, looks up the PCP structure, locks it and returns it with
pcp then passed around as appropriate. Not sure what I would call that
helper :P

In the end I just gave up and kept it simple as there is no benefit to
!PREEMPT_RT which just disables IRQs. Maybe it'll be worth considering when
PREEMPT_RT is upstream and can be enabled. The series was functionally
tested on the PREEMPT_RT tree by reverting the page_alloc.c patch and
applying this series and all of its prerequisites on top.

-- 
Mel Gorman
SUSE Labs
Peter Zijlstra April 9, 2021, 8:24 a.m. UTC | #5
On Fri, Apr 09, 2021 at 08:59:39AM +0100, Mel Gorman wrote:
> In the end I just gave up and kept it simple as there is no benefit to
> !PREEMPT_RT which just disables IRQs. Maybe it'll be worth considering when
> PREEMPT_RT is upstream and can be enabled. The series was functionally
> tested on the PREEMPT_RT tree by reverting the page_alloc.c patch and
> applies this series and all of its prerequisites on top.

Right, I see the problem. Fair enough; perhaps amend the changelog to
include some of that so that we can 'remember' in a few months why the
code is 'funneh'.
Mel Gorman April 9, 2021, 1:32 p.m. UTC | #6
On Fri, Apr 09, 2021 at 10:24:24AM +0200, Peter Zijlstra wrote:
> On Fri, Apr 09, 2021 at 08:59:39AM +0100, Mel Gorman wrote:
> > In the end I just gave up and kept it simple as there is no benefit to
> > !PREEMPT_RT which just disables IRQs. Maybe it'll be worth considering when
> > PREEMPT_RT is upstream and can be enabled. The series was functionally
> > tested on the PREEMPT_RT tree by reverting the page_alloc.c patch and
> > applies this series and all of its prerequisites on top.
> 
> Right, I see the problem. Fair enough; perhaps ammend the changelog to
> include some of that so that we can 'remember' in a few months why the
> code is 'funneh'.

I updated the changelog and also added a comment above the
declaration. That said, there are some curious users already.
fs/squashfs/decompressor_multi_percpu.c looks like it always uses the
local_lock in CPU 0's per-cpu structure instead of stabilising a per-cpu
pointer. drivers/block/zram/zcomp.c appears to do the same although for
at least one of the zcomp_stream_get() callers, the CPU is pinned for
other reasons (bit spin lock held). I think it happens to work anyway
but it's weird and I'm not a fan.

Anyway, the new version is below.

-- 
[PATCH] mm/page_alloc: Convert per-cpu list protection to local_lock

There is a lack of clarity of what exactly local_irq_save/local_irq_restore
protects in page_alloc.c. It conflates the protection of per-cpu page
allocation structures with per-cpu vmstat deltas.

This patch protects the PCP structure using local_lock which for most
configurations is identical to IRQ enabling/disabling.  The scope of the
lock is still wider than it should be but this is decreased later.

local_lock is declared statically instead of placing it within a structure
and this is deliberate. Placing it in the zone offers limited benefit and
confuses what the lock is protecting -- struct per_cpu_pages. However,
putting it in per_cpu_pages is problematic because the task is not guaranteed
to be pinned to the CPU yet so looking up a per-cpu structure is unsafe.

[lkp@intel.com: Make pagesets static]
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>

---
 include/linux/mmzone.h |  2 ++
 mm/page_alloc.c        | 67 +++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 54 insertions(+), 15 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a4393ac27336..106da8fbc72a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -20,6 +20,7 @@
 #include <linux/atomic.h>
 #include <linux/mm_types.h>
 #include <linux/page-flags.h>
+#include <linux/local_lock.h>
 #include <asm/page.h>
 
 /* Free memory management - zoned buddy allocator.  */
@@ -337,6 +338,7 @@ enum zone_watermarks {
 #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
 #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
 
+/* Fields and list protected by pagesets local_lock in page_alloc.c */
 struct per_cpu_pages {
 	int count;		/* number of pages in the list */
 	int high;		/* high watermark, emptying needed */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3bc4da4cbf9c..04644c3dd187 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -112,6 +112,30 @@ typedef int __bitwise fpi_t;
 static DEFINE_MUTEX(pcp_batch_high_lock);
 #define MIN_PERCPU_PAGELIST_FRACTION	(8)
 
+/*
+ * Protects the per_cpu_pages structures.
+ *
+ * This lock is not placed in struct per_cpu_pages because the task acquiring
+ * the lock is not guaranteed to be pinned to the CPU yet due to
+ * preempt/migrate/IRQs disabled or holding a spinlock. The pattern to acquire
+ * the lock would become
+ *
+ *   migrate_disable();
+ *   pcp = this_cpu_ptr(zone->per_cpu_pageset);
+ *   local_lock_irqsave(&pcp->lock, flags);
+ *
+ * While a helper would avoid code duplication, there is no inherent advantage
+ * and migrate_disable itself is undesirable (see include/linux/preempt.h).
+ * Similarly, putting the lock in the zone offers no particular benefit but
+ * confuses what the lock is protecting.
+ */
+struct pagesets {
+	local_lock_t lock;
+};
+static DEFINE_PER_CPU(struct pagesets, pagesets) = {
+	.lock = INIT_LOCAL_LOCK(lock),
+};
+
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DEFINE_PER_CPU(int, numa_node);
 EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -1421,6 +1445,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 		} while (--count && --batch_free && !list_empty(list));
 	}
 
+	/*
+	 * local_lock_irq held so equivalent to spin_lock_irqsave for
+	 * both PREEMPT_RT and non-PREEMPT_RT configurations.
+	 */
 	spin_lock(&zone->lock);
 	isolated_pageblocks = has_isolate_pageblock(zone);
 
@@ -1541,6 +1569,11 @@ static void __free_pages_ok(struct page *page, unsigned int order,
 		return;
 
 	migratetype = get_pfnblock_migratetype(page, pfn);
+
+	/*
+	 * TODO FIX: Disable IRQs before acquiring IRQ-safe zone->lock
+	 * and protect vmstat updates.
+	 */
 	local_irq_save(flags);
 	__count_vm_events(PGFREE, 1 << order);
 	free_one_page(page_zone(page), page, pfn, order, migratetype,
@@ -2910,6 +2943,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 {
 	int i, allocated = 0;
 
+	/*
+	 * local_lock_irq held so equivalent to spin_lock_irqsave for
+	 * both PREEMPT_RT and non-PREEMPT_RT configurations.
+	 */
 	spin_lock(&zone->lock);
 	for (i = 0; i < count; ++i) {
 		struct page *page = __rmqueue(zone, order, migratetype,
@@ -2962,12 +2999,12 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 	unsigned long flags;
 	int to_drain, batch;
 
-	local_irq_save(flags);
+	local_lock_irqsave(&pagesets.lock, flags);
 	batch = READ_ONCE(pcp->batch);
 	to_drain = min(pcp->count, batch);
 	if (to_drain > 0)
 		free_pcppages_bulk(zone, to_drain, pcp);
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&pagesets.lock, flags);
 }
 #endif
 
@@ -2983,13 +3020,13 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
 	unsigned long flags;
 	struct per_cpu_pages *pcp;
 
-	local_irq_save(flags);
+	local_lock_irqsave(&pagesets.lock, flags);
 
 	pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
 	if (pcp->count)
 		free_pcppages_bulk(zone, pcp->count, pcp);
 
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&pagesets.lock, flags);
 }
 
 /*
@@ -3252,9 +3289,9 @@ void free_unref_page(struct page *page)
 	if (!free_unref_page_prepare(page, pfn))
 		return;
 
-	local_irq_save(flags);
+	local_lock_irqsave(&pagesets.lock, flags);
 	free_unref_page_commit(page, pfn);
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&pagesets.lock, flags);
 }
 
 /*
@@ -3274,7 +3311,7 @@ void free_unref_page_list(struct list_head *list)
 		set_page_private(page, pfn);
 	}
 
-	local_irq_save(flags);
+	local_lock_irqsave(&pagesets.lock, flags);
 	list_for_each_entry_safe(page, next, list, lru) {
 		unsigned long pfn = page_private(page);
 
@@ -3287,12 +3324,12 @@ void free_unref_page_list(struct list_head *list)
 		 * a large list of pages to free.
 		 */
 		if (++batch_count == SWAP_CLUSTER_MAX) {
-			local_irq_restore(flags);
+			local_unlock_irqrestore(&pagesets.lock, flags);
 			batch_count = 0;
-			local_irq_save(flags);
+			local_lock_irqsave(&pagesets.lock, flags);
 		}
 	}
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&pagesets.lock, flags);
 }
 
 /*
@@ -3449,7 +3486,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 	struct page *page;
 	unsigned long flags;
 
-	local_irq_save(flags);
+	local_lock_irqsave(&pagesets.lock, flags);
 	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	list = &pcp->lists[migratetype];
 	page = __rmqueue_pcplist(zone,  migratetype, alloc_flags, pcp, list);
@@ -3457,7 +3494,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
 		zone_statistics(preferred_zone, zone);
 	}
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&pagesets.lock, flags);
 	return page;
 }
 
@@ -5052,7 +5089,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 		goto failed;
 
 	/* Attempt the batch allocation */
-	local_irq_save(flags);
+	local_lock_irqsave(&pagesets.lock, flags);
 	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	pcp_list = &pcp->lists[ac.migratetype];
 
@@ -5090,12 +5127,12 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 		nr_populated++;
 	}
 
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&pagesets.lock, flags);
 
 	return nr_populated;
 
 failed_irq:
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&pagesets.lock, flags);
 
 failed:
 	page = __alloc_pages(gfp, 0, preferred_nid, nodemask);
Peter Zijlstra April 9, 2021, 6:55 p.m. UTC | #7
On Fri, Apr 09, 2021 at 02:32:56PM +0100, Mel Gorman wrote:
> That said, there are some curious users already.
> fs/squashfs/decompressor_multi_percpu.c looks like it always uses the
> local_lock in CPU 0's per-cpu structure instead of stabilising a per-cpu
> pointer. 

I'm not sure how you read that.

You're talking about this:

  local_lock(&msblk->stream->lock);

right? Note that msblk->stream is a per-cpu pointer, so
&msblk->stream->lock is that same per-cpu pointer with an offset on.

The whole thing relies on:

	&per_cpu_ptr(msblk->stream, cpu)->lock == per_cpu_ptr(&msblk->stream->lock, cpu)

Which is true because the lhs:

	(local_lock_t *)((msblk->stream + per_cpu_offset(cpu)) + offsetof(struct squashfs_stream, lock))

and the rhs:

	(local_lock_t *)((msblk->stream + offsetof(struct squashfs_stream, lock)) + per_cpu_offset(cpu))

are identical, because addition is associative.
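
The same identity written out as a fragment, for illustration; a sketch in
which the struct layout beyond the lock member is abridged and the WARN_ON is
purely illustrative:

	struct squashfs_stream {
		void *stream;		/* per-CPU decompressor state */
		local_lock_t lock;
	};

	/* msblk->stream is a struct squashfs_stream __percpu * */
	local_lock_t *a = &per_cpu_ptr(msblk->stream, cpu)->lock;
	local_lock_t *b = per_cpu_ptr(&msblk->stream->lock, cpu);

	/* never triggers: same base plus the same two offsets, added in a different order */
	WARN_ON(a != b);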

> drivers/block/zram/zcomp.c appears to do the same although for
> at least one of the zcomp_stream_get() callers, the CPU is pinned for
> other reasons (bit spin lock held). I think it happens to work anyway
> but it's weird and I'm not a fan.

Same thing.
Mel Gorman April 12, 2021, 11:56 a.m. UTC | #8
On Fri, Apr 09, 2021 at 08:55:39PM +0200, Peter Zijlstra wrote:
> On Fri, Apr 09, 2021 at 02:32:56PM +0100, Mel Gorman wrote:
> > That said, there are some curious users already.
> > fs/squashfs/decompressor_multi_percpu.c looks like it always uses the
> > local_lock in CPU 0's per-cpu structure instead of stabilising a per-cpu
> > pointer. 
> 
> I'm not sure how you read that.
> 
> You're talking about this:
> 
>   local_lock(&msblk->stream->lock);
> 
> right? Note that msblk->stream is a per-cpu pointer, so
> &msblk->stream->lock is that same per-cpu pointer with an offset on.
> 
> The whole think relies on:
> 
> 	&per_cpu_ptr(msblk->stream, cpu)->lock == per_cpu_ptr(&msblk->stream->lock, cpu)
> 
> Which is true because the lhs:
> 
> 	(local_lock_t *)((msblk->stream + per_cpu_offset(cpu)) + offsetof(struct squashfs_stream, lock))
> 
> and the rhs:
> 
> 	(local_lock_t *)((msblk->stream + offsetof(struct squashfs_stream, lock)) + per_cpu_offset(cpu))
> 
> are identical, because addition is associative.

Ok, I think I see and understand now. I didn't follow far enough down
into the macro magic and missed this observation, so thanks for your
patience. The page allocator still incurs a double lookup of the per
cpu offsets but it should work for both the current local_lock_irq
implementation and the one in preempt-rt because the task will be pinned
to the CPU by either preempt_disable, migrate_disable or IRQ disable
depending on the local_lock implementation and kernel configuration.

I'll update the changelog and comment accordingly. I'll decide later
whether to leave it or move the location of the lock at the end of the
series. If the patch is added, it'll either incur the double lookup (not
that expensive, might be optimised by the compiler) or come up with a
helper that takes the lock and returns the per-cpu structure. The double
lookup probably makes more sense initially because there are multiple
potential users of a helper that says "pin to CPU, lookup, lock and return
a per-cpu structure" for both IRQ-safe and IRQ-unsafe variants with the
associated expansion of the local_lock API. It might be better to introduce
such a helper with multiple users converted at the same time and there are
other local_lock users in preempt-rt that could do with upstreaming first.

> > drivers/block/zram/zcomp.c appears to do the same although for
> > at least one of the zcomp_stream_get() callers, the CPU is pinned for
> > other reasons (bit spin lock held). I think it happens to work anyway
> > but it's weird and I'm not a fan.
> 
> Same thing.

Yep.

-- 
Mel Gorman
SUSE Labs
Thomas Gleixner April 12, 2021, 9:47 p.m. UTC | #9
On Mon, Apr 12 2021 at 12:56, Mel Gorman wrote:
> On Fri, Apr 09, 2021 at 08:55:39PM +0200, Peter Zijlstra wrote:
> I'll update the changelog and comment accordingly. I'll decide later
> whether to leave it or move the location of the lock at the end of the
> series. If the patch is added, it'll either incur the double lookup (not
> that expensive, might be optimised by the compiler) or come up with a
> helper that takes the lock and returns the per-cpu structure. The double
> lookup probably makes more sense initially because there are multiple
> potential users of a helper that says "pin to CPU, lookup, lock and return
> a per-cpu structure" for both IRQ-safe and IRQ-unsafe variants with the
> associated expansion of the local_lock API. It might be better to introduce
> such a helper with multiple users converted at the same time and there are
> other local_lock users in preempt-rt that could do with upstreaming first.

We had such helpers in RT a while ago but it turned into a helper
explosion pretty fast. But that was one of the early versions of local
locks which could not be embedded into a per CPU data structure due to
raisins (my stupidity).

But with the more thought out approach of today we can have (+/- the
obligatory naming bikeshedding):

--- a/include/linux/local_lock.h
+++ b/include/linux/local_lock.h
@@ -51,4 +51,35 @@
 #define local_unlock_irqrestore(lock, flags)			\
 	__local_unlock_irqrestore(lock, flags)
 
+/**
+ * local_lock_get_cpu_ptr - Acquire a per CPU local lock and return
+ *			    a pointer to the per CPU data which
+ *			    contains the local lock.
+ * @pcp:	Per CPU data structure
+ * @lock:	The local lock member of @pcp
+ */
+#define local_lock_get_cpu_ptr(pcp, lock)			\
+	__local_lock_get_cpu_ptr(pcp, typeof(*(pcp)), lock)
+
+/**
+ * local_lock_irq_get_cpu_ptr - Acquire a per CPU local lock, disable
+ *				interrupts and return a pointer to the
+ *				per CPU data which contains the local lock.
+ * @pcp:	Per CPU data structure
+ * @lock:	The local lock member of @pcp
+ */
+#define local_lock_irq_get_cpu_ptr(pcp, lock)			\
+	__local_lock_irq_get_cpu_ptr(pcp, typeof(*(pcp)), lock)
+
+/**
+ * local_lock_irqsave_get_cpu_ptr - Acquire a per CPU local lock, save and
+ *				    disable interrupts and return a pointer to
+ *				    the CPU data which contains the local lock.
+ * @pcp:	Per CPU data structure
+ * @lock:	The local lock member of @pcp
+ * @flags:	Storage for interrupt flags
+ */
+#define local_lock_irqsave_get_cpu_ptr(pcp, lock, flags)	\
+	__local_lock_irqsave_get_cpu_ptr(pcp, typeof(*(pcp)), lock, flags)
+
 #endif
--- a/include/linux/local_lock_internal.h
+++ b/include/linux/local_lock_internal.h
@@ -91,3 +91,33 @@ static inline void local_lock_release(lo
 		local_lock_release(this_cpu_ptr(lock));		\
 		local_irq_restore(flags);			\
 	} while (0)
+
+#define __local_lock_get_cpu_ptr(pcp, type, lock)		\
+	({							\
+		type *__pcp;					\
+								\
+		preempt_disable();				\
+		__pcp = this_cpu_ptr(pcp);			\
+		local_lock_acquire(&__pcp->lock);		\
+		__pcp;						\
+	})
+
+#define __local_lock_irq_get_cpu_ptr(pcp, type, lock)		\
+	({							\
+		type *__pcp;					\
+								\
+		local_irq_disable();				\
+		__pcp = this_cpu_ptr(pcp);			\
+		local_lock_acquire(&__pcp->lock);		\
+		__pcp;						\
+	})
+
+#define __local_lock_irqsave_get_cpu_ptr(pcp, type, lock, flags)\
+	({							\
+		type *__pcp;					\
+								\
+		local_irq_save(flags);				\
+		__pcp = this_cpu_ptr(pcp);			\
+		local_lock_acquire(&__pcp->lock);		\
+		__pcp;						\
+	})


and RT will then change that to:

--- a/include/linux/local_lock_internal.h
+++ b/include/linux/local_lock_internal.h
@@ -96,7 +96,7 @@ static inline void local_lock_release(lo
 	({							\
 		type *__pcp;					\
 								\
-		preempt_disable();				\
+		ll_preempt_disable();				\
 		__pcp = this_cpu_ptr(pcp);			\
 		local_lock_acquire(&__pcp->lock);		\
 		__pcp;						\
@@ -106,7 +106,7 @@ static inline void local_lock_release(lo
 	({							\
 		type *__pcp;					\
 								\
-		local_irq_disable();				\
+		ll_local_irq_disable();				\
 		__pcp = this_cpu_ptr(pcp);			\
 		local_lock_acquire(&__pcp->lock);		\
 		__pcp;						\
@@ -116,7 +116,7 @@ static inline void local_lock_release(lo
 	({							\
 		type *__pcp;					\
 								\
-		local_irq_save(flags);				\
+		ll_local_irq_save(flags);			\
 		__pcp = this_cpu_ptr(pcp);			\
 		local_lock_acquire(&__pcp->lock);		\
 		__pcp;						\


where ll_xxx is defined as xxx for non-RT and on RT all of them
get mapped to migrate_disable().

Thoughts?

Thanks,

        tglx
Mel Gorman April 13, 2021, 4:52 p.m. UTC | #10
On Mon, Apr 12, 2021 at 11:47:00PM +0200, Thomas Gleixner wrote:
> On Mon, Apr 12 2021 at 12:56, Mel Gorman wrote:
> > On Fri, Apr 09, 2021 at 08:55:39PM +0200, Peter Zijlstra wrote:
> > I'll update the changelog and comment accordingly. I'll decide later
> > whether to leave it or move the location of the lock at the end of the
> > series. If the patch is added, it'll either incur the double lookup (not
> > that expensive, might be optimised by the compiler) or come up with a
> > helper that takes the lock and returns the per-cpu structure. The double
> > lookup probably makes more sense initially because there are multiple
> > potential users of a helper that says "pin to CPU, lookup, lock and return
> > a per-cpu structure" for both IRQ-safe and IRQ-unsafe variants with the
> > associated expansion of the local_lock API. It might be better to introduce
> > such a helper with multiple users converted at the same time and there are
> > other local_lock users in preempt-rt that could do with upstreaming first.
> 
> We had such helpers in RT a while ago but it turned into an helper
> explosion pretty fast. But that was one of the early versions of local
> locks which could not be embedded into a per CPU data structure due to
> raisins (my stupidity).
> 
> But with the more thought out approach of today we can have (+/- the
> obligatory naming bikeshedding):
> 
> <SNIP>

I don't have strong opinions on the name -- it's long but it's clear.
The overhead of local_lock_get_cpu_ptr has similar weight to get_cpu_ptr
in terms of the cost of preempt_disable. The helper also means that new
users of a local_lock embedded within a per-cpu structure do not have to
figure out if it's safe from scratch.

If the page allocator embeds local_lock within struct per_cpu_pages then
the conversion to the helper is at the end of the mail. The messiest part
is free_unref_page_commit and that is a mess because free_unref_page_list
has to check if a new lock is required in case a list of pages is from
different zones.

> <SNIP>
>
> and RT will then change that to:
> 
> --- a/include/linux/local_lock_internal.h
> +++ b/include/linux/local_lock_internal.h
> @@ -96,7 +96,7 @@ static inline void local_lock_release(lo
>  	({							\
>  		type *__pcp;					\
>  								\
> -		preempt_disable();				\
> +		ll_preempt_disable();				\
>  		__pcp = this_cpu_ptr(pcp);			\
>  		local_lock_acquire(&__pcp->lock);		\
>  		__pcp;						\
> @@ -106,7 +106,7 @@ static inline void local_lock_release(lo
>  	({							\
>  		type *__pcp;					\
>  								\
> -		local_irq_disable();				\
> +		ll_local_irq_disable();				\
>  		__pcp = this_cpu_ptr(pcp);			\
>  		local_lock_acquire(&__pcp->lock);		\
>  		__pcp;						\
> @@ -116,7 +116,7 @@ static inline void local_lock_release(lo
>  	({							\
>  		type *__pcp;					\
>  								\
> -		local_irq_save(flags);				\
> +		ll_local_irq_save(flags);			\
>  		__pcp = this_cpu_ptr(pcp);			\
>  		local_lock_acquire(&__pcp->lock);		\
>  		__pcp;						\
> 
> where ll_xxx is defined as xxx for non-RT and on RT all of them
> get mapped to migrate_disable().
> 
> Thoughts?

I think that works. I created the obvious definitions of ll_* and rebased
on top of preempt-rt to see. I'll see if it boots :P
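
The "obvious definitions" are assumed to be along these lines; a sketch only,
not a posted patch, with the RT mappings following Thomas's description above:

	#ifndef CONFIG_PREEMPT_RT
	#define ll_preempt_disable()		preempt_disable()
	#define ll_local_irq_disable()		local_irq_disable()
	#define ll_local_irq_save(flags)	local_irq_save(flags)
	#else
	/* On PREEMPT_RT, pinning the task is enough; IRQs and preemption stay enabled */
	#define ll_preempt_disable()		migrate_disable()
	#define ll_local_irq_disable()		migrate_disable()
	#define ll_local_irq_save(flags)	\
		do { typecheck(unsigned long, flags); flags = 0; migrate_disable(); } while (0)
	#endif

The matching release side would presumably need ll_ counterparts as well.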

Page allocator conversion to helper looks like

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d9d7f6d68243..2948a5502589 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3008,9 +3008,7 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
 	unsigned long flags;
 	struct per_cpu_pages *pcp;
 
-	local_lock_irqsave(&zone->per_cpu_pageset->lock, flags);
-
-	pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+	pcp = local_lock_irqsave_get_cpu_ptr(zone->per_cpu_pageset, lock, flags);
 	if (pcp->count)
 		free_pcppages_bulk(zone, pcp->count, pcp);
 
@@ -3235,12 +3233,10 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
 }
 
 static void free_unref_page_commit(struct page *page, struct zone *zone,
-				   unsigned long pfn, int migratetype)
+				   struct per_cpu_pages *pcp, unsigned long pfn,
+				   int migratetype)
 {
-	struct per_cpu_pages *pcp;
-
 	__count_vm_event(PGFREE);
-	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	list_add(&page->lru, &pcp->lists[migratetype]);
 	pcp->count++;
 	if (pcp->count >= READ_ONCE(pcp->high))
@@ -3252,6 +3248,7 @@ static void free_unref_page_commit(struct page *page, struct zone *zone,
  */
 void free_unref_page(struct page *page)
 {
+	struct per_cpu_pages *pcp;
 	struct zone *zone;
 	unsigned long flags;
 	unsigned long pfn = page_to_pfn(page);
@@ -3277,8 +3274,8 @@ void free_unref_page(struct page *page)
 	}
 
 	zone = page_zone(page);
-	local_lock_irqsave(&zone->per_cpu_pageset->lock, flags);
-	free_unref_page_commit(page, zone, pfn, migratetype);
+	pcp = local_lock_irqsave_get_cpu_ptr(zone->per_cpu_pageset, lock, flags);
+	free_unref_page_commit(page, zone, pcp, pfn, migratetype);
 	local_unlock_irqrestore(&zone->per_cpu_pageset->lock, flags);
 }
 
@@ -3287,6 +3284,7 @@ void free_unref_page(struct page *page)
  */
 void free_unref_page_list(struct list_head *list)
 {
+	struct per_cpu_pages *pcp;
 	struct zone *locked_zone;
 	struct page *page, *next;
 	unsigned long flags, pfn;
@@ -3320,7 +3318,7 @@ void free_unref_page_list(struct list_head *list)
 	/* Acquire the lock required for the first page. */
 	page = list_first_entry(list, struct page, lru);
 	locked_zone = page_zone(page);
-	local_lock_irqsave(&locked_zone->per_cpu_pageset->lock, flags);
+	pcp = local_lock_irqsave_get_cpu_ptr(locked_zone->per_cpu_pageset, lock, flags);
 
 	list_for_each_entry_safe(page, next, list, lru) {
 		struct zone *zone = page_zone(page);
@@ -3342,12 +3340,12 @@ void free_unref_page_list(struct list_head *list)
 #if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_DEBUG_LOCK_ALLOC)
 		if (locked_zone != zone) {
 			local_unlock_irqrestore(&locked_zone->per_cpu_pageset->lock, flags);
-			local_lock_irqsave(&zone->per_cpu_pageset->lock, flags);
+			pcp = local_lock_irqsave_get_cpu_ptr(zone->per_cpu_pageset, lock, flags);
 			locked_zone = zone;
 		}
 #endif
 
-		free_unref_page_commit(page, zone, pfn, migratetype);
+		free_unref_page_commit(page, zone, pcp, pfn, migratetype);
 
 		/*
 		 * Guard against excessive IRQ disabled times when we get
@@ -3517,8 +3515,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 	struct page *page;
 	unsigned long flags;
 
-	local_lock_irqsave(&zone->per_cpu_pageset->lock, flags);
-	pcp = this_cpu_ptr(zone->per_cpu_pageset);
+	pcp = local_lock_irqsave_get_cpu_ptr(zone->per_cpu_pageset, lock, flags);
 	list = &pcp->lists[migratetype];
 	page = __rmqueue_pcplist(zone,  migratetype, alloc_flags, pcp, list);
 	local_unlock_irqrestore(&zone->per_cpu_pageset->lock, flags);



-- 
Mel Gorman
SUSE Labs

Patch

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a4393ac27336..106da8fbc72a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -20,6 +20,7 @@ 
 #include <linux/atomic.h>
 #include <linux/mm_types.h>
 #include <linux/page-flags.h>
+#include <linux/local_lock.h>
 #include <asm/page.h>
 
 /* Free memory management - zoned buddy allocator.  */
@@ -337,6 +338,7 @@  enum zone_watermarks {
 #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
 #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
 
+/* Fields and list protected by pagesets local_lock in page_alloc.c */
 struct per_cpu_pages {
 	int count;		/* number of pages in the list */
 	int high;		/* high watermark, emptying needed */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a68bacddcae0..e9e60d1a85d4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -112,6 +112,13 @@  typedef int __bitwise fpi_t;
 static DEFINE_MUTEX(pcp_batch_high_lock);
 #define MIN_PERCPU_PAGELIST_FRACTION	(8)
 
+struct pagesets {
+	local_lock_t lock;
+};
+static DEFINE_PER_CPU(struct pagesets, pagesets) = {
+	.lock = INIT_LOCAL_LOCK(lock),
+};
+
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DEFINE_PER_CPU(int, numa_node);
 EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -1421,6 +1428,10 @@  static void free_pcppages_bulk(struct zone *zone, int count,
 		} while (--count && --batch_free && !list_empty(list));
 	}
 
+	/*
+	 * local_lock_irq held so equivalent to spin_lock_irqsave for
+	 * both PREEMPT_RT and non-PREEMPT_RT configurations.
+	 */
 	spin_lock(&zone->lock);
 	isolated_pageblocks = has_isolate_pageblock(zone);
 
@@ -1541,6 +1552,11 @@  static void __free_pages_ok(struct page *page, unsigned int order,
 		return;
 
 	migratetype = get_pfnblock_migratetype(page, pfn);
+
+	/*
+	 * TODO FIX: Disable IRQs before acquiring IRQ-safe zone->lock
+	 * and protect vmstat updates.
+	 */
 	local_irq_save(flags);
 	__count_vm_events(PGFREE, 1 << order);
 	free_one_page(page_zone(page), page, pfn, order, migratetype,
@@ -2910,6 +2926,10 @@  static int rmqueue_bulk(struct zone *zone, unsigned int order,
 {
 	int i, allocated = 0;
 
+	/*
+	 * local_lock_irq held so equivalent to spin_lock_irqsave for
+	 * both PREEMPT_RT and non-PREEMPT_RT configurations.
+	 */
 	spin_lock(&zone->lock);
 	for (i = 0; i < count; ++i) {
 		struct page *page = __rmqueue(zone, order, migratetype,
@@ -2962,12 +2982,12 @@  void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 	unsigned long flags;
 	int to_drain, batch;
 
-	local_irq_save(flags);
+	local_lock_irqsave(&pagesets.lock, flags);
 	batch = READ_ONCE(pcp->batch);
 	to_drain = min(pcp->count, batch);
 	if (to_drain > 0)
 		free_pcppages_bulk(zone, to_drain, pcp);
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&pagesets.lock, flags);
 }
 #endif
 
@@ -2983,13 +3003,13 @@  static void drain_pages_zone(unsigned int cpu, struct zone *zone)
 	unsigned long flags;
 	struct per_cpu_pages *pcp;
 
-	local_irq_save(flags);
+	local_lock_irqsave(&pagesets.lock, flags);
 
 	pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
 	if (pcp->count)
 		free_pcppages_bulk(zone, pcp->count, pcp);
 
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&pagesets.lock, flags);
 }
 
 /*
@@ -3252,9 +3272,9 @@  void free_unref_page(struct page *page)
 	if (!free_unref_page_prepare(page, pfn))
 		return;
 
-	local_irq_save(flags);
+	local_lock_irqsave(&pagesets.lock, flags);
 	free_unref_page_commit(page, pfn);
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&pagesets.lock, flags);
 }
 
 /*
@@ -3274,7 +3294,7 @@  void free_unref_page_list(struct list_head *list)
 		set_page_private(page, pfn);
 	}
 
-	local_irq_save(flags);
+	local_lock_irqsave(&pagesets.lock, flags);
 	list_for_each_entry_safe(page, next, list, lru) {
 		unsigned long pfn = page_private(page);
 
@@ -3287,12 +3307,12 @@  void free_unref_page_list(struct list_head *list)
 		 * a large list of pages to free.
 		 */
 		if (++batch_count == SWAP_CLUSTER_MAX) {
-			local_irq_restore(flags);
+			local_unlock_irqrestore(&pagesets.lock, flags);
 			batch_count = 0;
-			local_irq_save(flags);
+			local_lock_irqsave(&pagesets.lock, flags);
 		}
 	}
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&pagesets.lock, flags);
 }
 
 /*
@@ -3449,7 +3469,7 @@  static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 	struct page *page;
 	unsigned long flags;
 
-	local_irq_save(flags);
+	local_lock_irqsave(&pagesets.lock, flags);
 	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	list = &pcp->lists[migratetype];
 	page = __rmqueue_pcplist(zone,  migratetype, alloc_flags, pcp, list);
@@ -3457,7 +3477,7 @@  static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
 		zone_statistics(preferred_zone, zone);
 	}
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&pagesets.lock, flags);
 	return page;
 }
 
@@ -5052,7 +5072,7 @@  unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 		goto failed;
 
 	/* Attempt the batch allocation */
-	local_irq_save(flags);
+	local_lock_irqsave(&pagesets.lock, flags);
 	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	pcp_list = &pcp->lists[ac.migratetype];
 
@@ -5090,12 +5110,12 @@  unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 		nr_populated++;
 	}
 
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&pagesets.lock, flags);
 
 	return nr_populated;
 
 failed_irq:
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&pagesets.lock, flags);
 
 failed:
 	page = __alloc_pages(gfp, 0, preferred_nid, nodemask);