arm64: numa: rightsize the distance array

Message ID 20200708113825.1429671-1-Jonathan.Cameron@huawei.com
State New
Headers show
Series
  • arm64: numa: rightsize the distance array
Related show

Commit Message

Jonathan Cameron July 8, 2020, 11:38 a.m.
Unfortunately we are currently calling numa_alloc_distance well before we call
setup_node_to_cpu_mask_map means that nr_node_ids is set to MAX_NUMNODES.
This wastes a bit of memory and is confusing to the reader.

Note we could just decide to hardcode it as MAX_NUMNODES but if so we should
do so explicitly.

Looking at what x86 does, they do a walk of nodes_parsed and locally
establish the maximum node count seen.  We can't actually do that where we
were previously calling it in numa_init because nodes_parsed isn't set up
either yet.  So let us take a leaf entirely out of x86's book and make
the true assumption that nodes_parsed will definitely be set up before
we try to put a real value in this array.  Hence just do it on demand.

In order to avoid trying and failing to allocate the array multiple times
we do the same thing as x86 and set numa_distance = 1. This requires a
few small modifications elsewhere.

Worth noting, that with one exception (which it appears can be removed [1])
the x86 and arm numa distance code is now identical.  Worth factoring it
out to some common location?

[1] https://lkml.kernel.org/r/20170406124459.dwn5zhpr2xqg3lqm@node.shutemov.name

Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>

---
arch/arm64/mm/numa.c | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

-- 
2.19.1

Comments

Jonathan Cameron July 24, 2020, 11:01 a.m. | #1
On Wed, 8 Jul 2020 19:38:25 +0800
Jonathan Cameron <Jonathan.Cameron@huawei.com> wrote:

> Unfortunately we are currently calling numa_alloc_distance well before we call

> setup_node_to_cpu_mask_map means that nr_node_ids is set to MAX_NUMNODES.

> This wastes a bit of memory and is confusing to the reader.

> 

> Note we could just decide to hardcode it as MAX_NUMNODES but if so we should

> do so explicitly.

> 

> Looking at what x86 does, they do a walk of nodes_parsed and locally

> establish the maximum node count seen.  We can't actually do that where we

> were previously calling it in numa_init because nodes_parsed isn't set up

> either yet.  So let us take a leaf entirely out of x86's book and make

> the true assumption that nodes_parsed will definitely be set up before

> we try to put a real value in this array.  Hence just do it on demand.

> 

> In order to avoid trying and failing to allocate the array multiple times

> we do the same thing as x86 and set numa_distance = 1. This requires a

> few small modifications elsewhere.

> 

> Worth noting, that with one exception (which it appears can be removed [1])

> the x86 and arm numa distance code is now identical.  Worth factoring it

> out to some common location?

> 

> [1] https://lkml.kernel.org/r/20170406124459.dwn5zhpr2xqg3lqm@node.shutemov.name

> 

> Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>


Polite nudge.  Anyone?  No particular urgency on this one but I'm thinking
of taking a stab at factoring out this code into a common location for arm64
and x86 and this change needs to proceed that.

Thanks,

Jonathan


> ---

> arch/arm64/mm/numa.c | 35 ++++++++++++++++++-----------------

>  1 file changed, 18 insertions(+), 17 deletions(-)

> 

> diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c

> index aafcee3e3f7e..a2f549ef0a36 100644

> --- a/arch/arm64/mm/numa.c

> +++ b/arch/arm64/mm/numa.c

> @@ -255,13 +255,11 @@ void __init numa_free_distance(void)

>  {

>  	size_t size;

>  

> -	if (!numa_distance)

> -		return;

> -

>  	size = numa_distance_cnt * numa_distance_cnt *

>  		sizeof(numa_distance[0]);

> -

> -	memblock_free(__pa(numa_distance), size);

> +	/* numa_distance could be 1LU marking allocation failure, test cnt */

> +	if (numa_distance_cnt)

> +		memblock_free(__pa(numa_distance), size);

>  	numa_distance_cnt = 0;

>  	numa_distance = NULL;

>  }

> @@ -271,20 +269,29 @@ void __init numa_free_distance(void)

>   */

>  static int __init numa_alloc_distance(void)

>  {

> +	nodemask_t nodes_parsed;

>  	size_t size;

> +	int i, j, cnt = 0;

>  	u64 phys;

> -	int i, j;

>  

> -	size = nr_node_ids * nr_node_ids * sizeof(numa_distance[0]);

> +	/* size the new table and allocate it */

> +	nodes_parsed = numa_nodes_parsed;

> +	for_each_node_mask(i, nodes_parsed)

> +		cnt = i;

> +	cnt++;

> +	size = cnt * cnt * sizeof(numa_distance[0]);

>  	phys = memblock_find_in_range(0, PFN_PHYS(max_pfn),

>  				      size, PAGE_SIZE);

> -	if (WARN_ON(!phys))

> +	if (!phys) {

> +		pr_warn("Warning: can't allocate distance table!\n");

> +		/* don't retry until explicitly reset */

> +		numa_distance = (void *)1LU;

>  		return -ENOMEM;

> -

> +	}

>  	memblock_reserve(phys, size);

>  

>  	numa_distance = __va(phys);

> -	numa_distance_cnt = nr_node_ids;

> +	numa_distance_cnt = cnt;

>  

>  	/* fill with the default distances */

>  	for (i = 0; i < numa_distance_cnt; i++)

> @@ -311,10 +318,8 @@ static int __init numa_alloc_distance(void)

>   */

>  void __init numa_set_distance(int from, int to, int distance)

>  {

> -	if (!numa_distance) {

> -		pr_warn_once("Warning: distance table not allocated yet\n");

> +	if (!numa_distance && numa_alloc_distance() < 0)

>  		return;

> -	}

>  

>  	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||

>  			from < 0 || to < 0) {

> @@ -384,10 +389,6 @@ static int __init numa_init(int (*init_func)(void))

>  	nodes_clear(node_possible_map);

>  	nodes_clear(node_online_map);

>  

> -	ret = numa_alloc_distance();

> -	if (ret < 0)

> -		return ret;

> -

>  	ret = init_func();

>  	if (ret < 0)

>  		goto out_free_distance;
Song Bao Hua (Barry Song) July 24, 2020, 11:45 a.m. | #2
> -----Original Message-----

> From: Jonathan Cameron

> Sent: Wednesday, July 8, 2020 11:38 PM

> To: linux-acpi@vger.kernel.org; linux-arm-kernel@lists.infradead.org

> Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>; Guohanjun (Hanjun Guo)

> <guohanjun@huawei.com>; Sudeep Holla <sudeep.holla@arm.com>;

> Linuxarm <linuxarm@huawei.com>; Tejun Heo <tj@kernel.org>; Song Bao Hua

> (Barry Song) <song.bao.hua@hisilicon.com>; Dan Williams

> <dan.j.williams@intel.com>; Jonathan Cameron

> <jonathan.cameron@huawei.com>

> Subject: [PATCH] arm64: numa: rightsize the distance array

> 

> Unfortunately we are currently calling numa_alloc_distance well before we call

> setup_node_to_cpu_mask_map means that nr_node_ids is set to

> MAX_NUMNODES.

> This wastes a bit of memory and is confusing to the reader.

> 

> Note we could just decide to hardcode it as MAX_NUMNODES but if so we

> should

> do so explicitly.

> 

> Looking at what x86 does, they do a walk of nodes_parsed and locally

> establish the maximum node count seen.  We can't actually do that where we

> were previously calling it in numa_init because nodes_parsed isn't set up

> either yet.  So let us take a leaf entirely out of x86's book and make

> the true assumption that nodes_parsed will definitely be set up before

> we try to put a real value in this array.  Hence just do it on demand.

> 

> In order to avoid trying and failing to allocate the array multiple times

> we do the same thing as x86 and set numa_distance = 1. This requires a

> few small modifications elsewhere.

> 

> Worth noting, that with one exception (which it appears can be removed [1])

> the x86 and arm numa distance code is now identical.  Worth factoring it

> out to some common location?

> 

> [1]

> https://lkml.kernel.org/r/20170406124459.dwn5zhpr2xqg3lqm@node.shute

> mov.name

> 

> Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>


Looks sensible to me.
Reviewed-by: Barry Song <song.bao.hua@hisilicon.com>



> ---

> arch/arm64/mm/numa.c | 35 ++++++++++++++++++-----------------

>  1 file changed, 18 insertions(+), 17 deletions(-)

> 

> diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c

> index aafcee3e3f7e..a2f549ef0a36 100644

> --- a/arch/arm64/mm/numa.c

> +++ b/arch/arm64/mm/numa.c

> @@ -255,13 +255,11 @@ void __init numa_free_distance(void)

>  {

>  	size_t size;

> 

> -	if (!numa_distance)

> -		return;

> -

>  	size = numa_distance_cnt * numa_distance_cnt *

>  		sizeof(numa_distance[0]);

> -

> -	memblock_free(__pa(numa_distance), size);

> +	/* numa_distance could be 1LU marking allocation failure, test cnt */

> +	if (numa_distance_cnt)

> +		memblock_free(__pa(numa_distance), size);

>  	numa_distance_cnt = 0;

>  	numa_distance = NULL;

>  }

> @@ -271,20 +269,29 @@ void __init numa_free_distance(void)

>   */

>  static int __init numa_alloc_distance(void)

>  {

> +	nodemask_t nodes_parsed;

>  	size_t size;

> +	int i, j, cnt = 0;

>  	u64 phys;

> -	int i, j;

> 

> -	size = nr_node_ids * nr_node_ids * sizeof(numa_distance[0]);

> +	/* size the new table and allocate it */

> +	nodes_parsed = numa_nodes_parsed;

> +	for_each_node_mask(i, nodes_parsed)

> +		cnt = i;

> +	cnt++;

> +	size = cnt * cnt * sizeof(numa_distance[0]);

>  	phys = memblock_find_in_range(0, PFN_PHYS(max_pfn),

>  				      size, PAGE_SIZE);

> -	if (WARN_ON(!phys))

> +	if (!phys) {

> +		pr_warn("Warning: can't allocate distance table!\n");

> +		/* don't retry until explicitly reset */

> +		numa_distance = (void *)1LU;

>  		return -ENOMEM;

> -

> +	}

>  	memblock_reserve(phys, size);

> 

>  	numa_distance = __va(phys);

> -	numa_distance_cnt = nr_node_ids;

> +	numa_distance_cnt = cnt;

> 

>  	/* fill with the default distances */

>  	for (i = 0; i < numa_distance_cnt; i++)

> @@ -311,10 +318,8 @@ static int __init numa_alloc_distance(void)

>   */

>  void __init numa_set_distance(int from, int to, int distance)

>  {

> -	if (!numa_distance) {

> -		pr_warn_once("Warning: distance table not allocated yet\n");

> +	if (!numa_distance && numa_alloc_distance() < 0)

>  		return;

> -	}

> 

>  	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||

>  			from < 0 || to < 0) {

> @@ -384,10 +389,6 @@ static int __init numa_init(int (*init_func)(void))

>  	nodes_clear(node_possible_map);

>  	nodes_clear(node_online_map);

> 

> -	ret = numa_alloc_distance();

> -	if (ret < 0)

> -		return ret;

> -

>  	ret = init_func();

>  	if (ret < 0)

>  		goto out_free_distance;

> --

> 2.19.1


Thanks
Barry
Jonathan Cameron Aug. 18, 2020, 5:13 p.m. | #3
On Fri, 24 Jul 2020 12:01:57 +0100
Jonathan Cameron <Jonathan.Cameron@Huawei.com> wrote:

> On Wed, 8 Jul 2020 19:38:25 +0800

> Jonathan Cameron <Jonathan.Cameron@huawei.com> wrote:

> 

> > Unfortunately we are currently calling numa_alloc_distance well before we call

> > setup_node_to_cpu_mask_map means that nr_node_ids is set to MAX_NUMNODES.

> > This wastes a bit of memory and is confusing to the reader.

> > 

> > Note we could just decide to hardcode it as MAX_NUMNODES but if so we should

> > do so explicitly.

> > 

> > Looking at what x86 does, they do a walk of nodes_parsed and locally

> > establish the maximum node count seen.  We can't actually do that where we

> > were previously calling it in numa_init because nodes_parsed isn't set up

> > either yet.  So let us take a leaf entirely out of x86's book and make

> > the true assumption that nodes_parsed will definitely be set up before

> > we try to put a real value in this array.  Hence just do it on demand.

> > 

> > In order to avoid trying and failing to allocate the array multiple times

> > we do the same thing as x86 and set numa_distance = 1. This requires a

> > few small modifications elsewhere.

> > 

> > Worth noting, that with one exception (which it appears can be removed [1])

> > the x86 and arm numa distance code is now identical.  Worth factoring it

> > out to some common location?

> > 

> > [1] https://lkml.kernel.org/r/20170406124459.dwn5zhpr2xqg3lqm@node.shutemov.name

> > 

> > Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>  

> 

> Polite nudge.  Anyone?  No particular urgency on this one but I'm thinking

> of taking a stab at factoring out this code into a common location for arm64

> and x86 and this change needs to proceed that.


This still applies cleanly post merge window, so still looking for some review!

Jonathan

> 

> Thanks,

> 

> Jonathan

> 

> 

> > ---

> > arch/arm64/mm/numa.c | 35 ++++++++++++++++++-----------------

> >  1 file changed, 18 insertions(+), 17 deletions(-)

> > 

> > diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c

> > index aafcee3e3f7e..a2f549ef0a36 100644

> > --- a/arch/arm64/mm/numa.c

> > +++ b/arch/arm64/mm/numa.c

> > @@ -255,13 +255,11 @@ void __init numa_free_distance(void)

> >  {

> >  	size_t size;

> >  

> > -	if (!numa_distance)

> > -		return;

> > -

> >  	size = numa_distance_cnt * numa_distance_cnt *

> >  		sizeof(numa_distance[0]);

> > -

> > -	memblock_free(__pa(numa_distance), size);

> > +	/* numa_distance could be 1LU marking allocation failure, test cnt */

> > +	if (numa_distance_cnt)

> > +		memblock_free(__pa(numa_distance), size);

> >  	numa_distance_cnt = 0;

> >  	numa_distance = NULL;

> >  }

> > @@ -271,20 +269,29 @@ void __init numa_free_distance(void)

> >   */

> >  static int __init numa_alloc_distance(void)

> >  {

> > +	nodemask_t nodes_parsed;

> >  	size_t size;

> > +	int i, j, cnt = 0;

> >  	u64 phys;

> > -	int i, j;

> >  

> > -	size = nr_node_ids * nr_node_ids * sizeof(numa_distance[0]);

> > +	/* size the new table and allocate it */

> > +	nodes_parsed = numa_nodes_parsed;

> > +	for_each_node_mask(i, nodes_parsed)

> > +		cnt = i;

> > +	cnt++;

> > +	size = cnt * cnt * sizeof(numa_distance[0]);

> >  	phys = memblock_find_in_range(0, PFN_PHYS(max_pfn),

> >  				      size, PAGE_SIZE);

> > -	if (WARN_ON(!phys))

> > +	if (!phys) {

> > +		pr_warn("Warning: can't allocate distance table!\n");

> > +		/* don't retry until explicitly reset */

> > +		numa_distance = (void *)1LU;

> >  		return -ENOMEM;

> > -

> > +	}

> >  	memblock_reserve(phys, size);

> >  

> >  	numa_distance = __va(phys);

> > -	numa_distance_cnt = nr_node_ids;

> > +	numa_distance_cnt = cnt;

> >  

> >  	/* fill with the default distances */

> >  	for (i = 0; i < numa_distance_cnt; i++)

> > @@ -311,10 +318,8 @@ static int __init numa_alloc_distance(void)

> >   */

> >  void __init numa_set_distance(int from, int to, int distance)

> >  {

> > -	if (!numa_distance) {

> > -		pr_warn_once("Warning: distance table not allocated yet\n");

> > +	if (!numa_distance && numa_alloc_distance() < 0)

> >  		return;

> > -	}

> >  

> >  	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||

> >  			from < 0 || to < 0) {

> > @@ -384,10 +389,6 @@ static int __init numa_init(int (*init_func)(void))

> >  	nodes_clear(node_possible_map);

> >  	nodes_clear(node_online_map);

> >  

> > -	ret = numa_alloc_distance();

> > -	if (ret < 0)

> > -		return ret;

> > -

> >  	ret = init_func();

> >  	if (ret < 0)

> >  		goto out_free_distance;  

> 

> 

> _______________________________________________

> Linuxarm mailing list

> Linuxarm@huawei.com

> http://hulk.huawei.com/mailman/listinfo/linuxarm
Anshuman Khandual Aug. 19, 2020, 5:01 a.m. | #4
Hello Jonathan,

On 07/08/2020 05:08 PM, Jonathan Cameron wrote:
> Unfortunately we are currently calling numa_alloc_distance well before we call

> setup_node_to_cpu_mask_map means that nr_node_ids is set to MAX_NUMNODES.

> This wastes a bit of memory and is confusing to the reader.


With defconfig where CONFIG_NODES_SHIFT = 2 i.e MAX_NUMNODES = 4, the total
numa_distance size is 16 bytes (individual entries here are just u8). Even
with MAX_NUMNODES = 256, numa_distance is going to be just 64K. Hence there
might not be much space to be saved that would need optimizing this path.
Please correct me if I have missed something.

> 

> Note we could just decide to hardcode it as MAX_NUMNODES but if so we should

> do so explicitly.


nr_node_ids = MAX_NUMNODES which is set from mm/page_alloc.c, yes asserting
with an WARN_ON() that it is indeed MAX_NUMNODES would make sense.

> 

> Looking at what x86 does, they do a walk of nodes_parsed and locally

> establish the maximum node count seen.  We can't actually do that where we

> were previously calling it in numa_init because nodes_parsed isn't set up

> either yet.  So let us take a leaf entirely out of x86's book and make

> the true assumption that nodes_parsed will definitely be set up before

> we try to put a real value in this array.  Hence just do it on demand.


So it is replacing one assumption i.e nr_node_ids = MAX_NUMNODES with another
i.e nodes_parsed has been initialized, while trying to populate an entry.

> 

> In order to avoid trying and failing to allocate the array multiple times

> we do the same thing as x86 and set numa_distance = 1. This requires a

> few small modifications elsewhere.


Where ? Dont see numa_distance being set as 1.

> 

> Worth noting, that with one exception (which it appears can be removed [1])

> the x86 and arm numa distance code is now identical.  Worth factoring it

> out to some common location?

> 

> [1] https://lkml.kernel.org/r/20170406124459.dwn5zhpr2xqg3lqm@node.shutemov.name

> 

> Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>

> ---

> arch/arm64/mm/numa.c | 35 ++++++++++++++++++-----------------

>  1 file changed, 18 insertions(+), 17 deletions(-)

> 

> diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c

> index aafcee3e3f7e..a2f549ef0a36 100644

> --- a/arch/arm64/mm/numa.c

> +++ b/arch/arm64/mm/numa.c

> @@ -255,13 +255,11 @@ void __init numa_free_distance(void)

>  {

>  	size_t size;

>  

> -	if (!numa_distance)

> -		return;

> -

>  	size = numa_distance_cnt * numa_distance_cnt *

>  		sizeof(numa_distance[0]);

> -

> -	memblock_free(__pa(numa_distance), size);

> +	/* numa_distance could be 1LU marking allocation failure, test cnt */

> +	if (numa_distance_cnt)

> +		memblock_free(__pa(numa_distance), size);

>  	numa_distance_cnt = 0;

>  	numa_distance = NULL;

>  }

> @@ -271,20 +269,29 @@ void __init numa_free_distance(void)

>   */

>  static int __init numa_alloc_distance(void)

>  {

> +	nodemask_t nodes_parsed;

>  	size_t size;

> +	int i, j, cnt = 0;

>  	u64 phys;

> -	int i, j;

>  

> -	size = nr_node_ids * nr_node_ids * sizeof(numa_distance[0]);

> +	/* size the new table and allocate it */

> +	nodes_parsed = numa_nodes_parsed;

> +	for_each_node_mask(i, nodes_parsed)

> +		cnt = i;


There is no nodemask related helper to fetch the highest bit set ?

> +	cnt++;

> +	size = cnt * cnt * sizeof(numa_distance[0]);

>  	phys = memblock_find_in_range(0, PFN_PHYS(max_pfn),

>  				      size, PAGE_SIZE);

> -	if (WARN_ON(!phys))

> +	if (!phys) {

> +		pr_warn("Warning: can't allocate distance table!\n");

> +		/* don't retry until explicitly reset */

> +		numa_distance = (void *)1LU;

>  		return -ENOMEM;

> -

> +	}

>  	memblock_reserve(phys, size);

>  

>  	numa_distance = __va(phys);

> -	numa_distance_cnt = nr_node_ids;

> +	numa_distance_cnt = cnt;

>  

>  	/* fill with the default distances */

>  	for (i = 0; i < numa_distance_cnt; i++)

> @@ -311,10 +318,8 @@ static int __init numa_alloc_distance(void)

>   */

>  void __init numa_set_distance(int from, int to, int distance)

>  {

> -	if (!numa_distance) {

> -		pr_warn_once("Warning: distance table not allocated yet\n");

> +	if (!numa_distance && numa_alloc_distance() < 0)

>  		return;

> -	}

>  

>  	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||

>  			from < 0 || to < 0) {

> @@ -384,10 +389,6 @@ static int __init numa_init(int (*init_func)(void))

>  	nodes_clear(node_possible_map);

>  	nodes_clear(node_online_map);

>  

> -	ret = numa_alloc_distance();

> -	if (ret < 0)

> -		return ret;

> -

>  	ret = init_func();

>  	if (ret < 0)

>  		goto out_free_distance;

> 


What is the primary objective here ? Reduce memory for numa_distance[]
or unifying arm64's numa_init() with that of x86's ?
Jonathan Cameron Aug. 19, 2020, 9:41 a.m. | #5
On Wed, 19 Aug 2020 10:31:18 +0530
Anshuman Khandual <anshuman.khandual@arm.com> wrote:

> Hello Jonathan,


Hi Anshuman,

> 

> On 07/08/2020 05:08 PM, Jonathan Cameron wrote:

> > Unfortunately we are currently calling numa_alloc_distance well before we call

> > setup_node_to_cpu_mask_map means that nr_node_ids is set to MAX_NUMNODES.

> > This wastes a bit of memory and is confusing to the reader.  

> 

> With defconfig where CONFIG_NODES_SHIFT = 2 i.e MAX_NUMNODES = 4, the total

> numa_distance size is 16 bytes (individual entries here are just u8). 


That's another issue to tidy up.  The current defconfig isn't big enough to
support existing platforms (we are shipping 8 node systems),
let alone next generation.  Bit of an open question on how far to push it up
though. Perhaps 64?  Given CXL we are probably going to soon see systems with
a lot more nodes.  I'll send a patch for that and we can argue the exact
value then.

> Even

> with MAX_NUMNODES = 256, numa_distance is going to be just 64K. Hence there

> might not be much space to be saved that would need optimizing this path.

> Please correct me if I have missed something.


Agreed.  The saving is small.  This was mostly about the code being misleading.
It gives the impression of doing dynamic sizing but doesn't actually do so as it
is using nr_node_ids before that value has been updated.  I would prefer
to make the code 'obviously' correct.  The x86 path is one approach, simply
hard coding the size as a constant is another.

The secondary advantage would be to make it easy to unify this code across
architectures.  Kind of inevitable riscv will be here soon and it would be nice
not to have a third slight variation on the code.  I'd like to explore such
possible unification but it wasn't the main aim of this patch.

> 

> > 

> > Note we could just decide to hardcode it as MAX_NUMNODES but if so we should

> > do so explicitly.  

> 

> nr_node_ids = MAX_NUMNODES which is set from mm/page_alloc.c, yes asserting

> with an WARN_ON() that it is indeed MAX_NUMNODES would make sense.


OK, a WARN_ON would at least make it apparent this is a constant, not a dynamic
value as a reader of the code might assume (I did).

Could we just go further and use MAX_NUMNODES directly and ignore nr_node_ids
for this purpose?

> 

> > 

> > Looking at what x86 does, they do a walk of nodes_parsed and locally

> > establish the maximum node count seen.  We can't actually do that where we

> > were previously calling it in numa_init because nodes_parsed isn't set up

> > either yet.  So let us take a leaf entirely out of x86's book and make

> > the true assumption that nodes_parsed will definitely be set up before

> > we try to put a real value in this array.  Hence just do it on demand.  

> 

> So it is replacing one assumption i.e nr_node_ids = MAX_NUMNODES with another

> i.e nodes_parsed has been initialized, while trying to populate an entry.


Yes.  If we make it clearly a fixed value, then I'm happy with that solution
to this particular problem.

If not, I would argue that the use nodes_parsed makes it very much obvious that
we are assuming it was initialized.  (Though on that argument we would have
assumed nr_node_ids had it's final value in the existing code and it doesn't,
I come back to my main argument being to make the code 'obviously' correct).

> 

> > 

> > In order to avoid trying and failing to allocate the array multiple times

> > we do the same thing as x86 and set numa_distance = 1. This requires a

> > few small modifications elsewhere.  

> 

> Where ? Dont see numa_distance being set as 1.


See below, I have highlighted the line.

> 

> > 

> > Worth noting, that with one exception (which it appears can be removed [1])

> > the x86 and arm numa distance code is now identical.  Worth factoring it

> > out to some common location?

> > 

> > [1] https://lkml.kernel.org/r/20170406124459.dwn5zhpr2xqg3lqm@node.shutemov.name

> > 

> > Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>

> > ---

> > arch/arm64/mm/numa.c | 35 ++++++++++++++++++-----------------

> >  1 file changed, 18 insertions(+), 17 deletions(-)

> > 

> > diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c

> > index aafcee3e3f7e..a2f549ef0a36 100644

> > --- a/arch/arm64/mm/numa.c

> > +++ b/arch/arm64/mm/numa.c

> > @@ -255,13 +255,11 @@ void __init numa_free_distance(void)

> >  {

> >  	size_t size;

> >  

> > -	if (!numa_distance)

> > -		return;

> > -

> >  	size = numa_distance_cnt * numa_distance_cnt *

> >  		sizeof(numa_distance[0]);

> > -

> > -	memblock_free(__pa(numa_distance), size);

> > +	/* numa_distance could be 1LU marking allocation failure, test cnt */

> > +	if (numa_distance_cnt)

> > +		memblock_free(__pa(numa_distance), size);

> >  	numa_distance_cnt = 0;

> >  	numa_distance = NULL;

> >  }

> > @@ -271,20 +269,29 @@ void __init numa_free_distance(void)

> >   */

> >  static int __init numa_alloc_distance(void)

> >  {

> > +	nodemask_t nodes_parsed;

> >  	size_t size;

> > +	int i, j, cnt = 0;

> >  	u64 phys;

> > -	int i, j;

> >  

> > -	size = nr_node_ids * nr_node_ids * sizeof(numa_distance[0]);

> > +	/* size the new table and allocate it */

> > +	nodes_parsed = numa_nodes_parsed;

> > +	for_each_node_mask(i, nodes_parsed)

> > +		cnt = i;  

> 

> There is no nodemask related helper to fetch the highest bit set ?


Not that I can find.

There is nodes_weight(), but I think we can currently end up with holes,
and it definitely isn't obvious from the local code that we can't.


> 

> > +	cnt++;

> > +	size = cnt * cnt * sizeof(numa_distance[0]);

> >  	phys = memblock_find_in_range(0, PFN_PHYS(max_pfn),

> >  				      size, PAGE_SIZE);

> > -	if (WARN_ON(!phys))

> > +	if (!phys) {

> > +		pr_warn("Warning: can't allocate distance table!\n");

> > +		/* don't retry until explicitly reset */

> > +		numa_distance = (void *)1LU;


Here is where we set numa_distance to 1 in the error path.

> >  		return -ENOMEM;

> > -

> > +	}

> >  	memblock_reserve(phys, size);

> >  

> >  	numa_distance = __va(phys);

> > -	numa_distance_cnt = nr_node_ids;

> > +	numa_distance_cnt = cnt;

> >  

> >  	/* fill with the default distances */

> >  	for (i = 0; i < numa_distance_cnt; i++)

> > @@ -311,10 +318,8 @@ static int __init numa_alloc_distance(void)

> >   */

> >  void __init numa_set_distance(int from, int to, int distance)

> >  {

> > -	if (!numa_distance) {

> > -		pr_warn_once("Warning: distance table not allocated yet\n");

> > +	if (!numa_distance && numa_alloc_distance() < 0)

> >  		return;

> > -	}

> >  

> >  	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||

> >  			from < 0 || to < 0) {

> > @@ -384,10 +389,6 @@ static int __init numa_init(int (*init_func)(void))

> >  	nodes_clear(node_possible_map);

> >  	nodes_clear(node_online_map);

> >  

> > -	ret = numa_alloc_distance();

> > -	if (ret < 0)

> > -		return ret;

> > -

> >  	ret = init_func();

> >  	if (ret < 0)

> >  		goto out_free_distance;

> >   

> 

> What is the primary objective here ? Reduce memory for numa_distance[]

> or unifying arm64's numa_init() with that of x86's ?


As mentioned above. The objective when I originally looked at this was
to make the code do what it appeared to do.  The approach chosen was about
the added benefit of unifying this part with x86 + riscv etc.
Always good to not reinvent the wheel.

Jonathan
Anshuman Khandual Aug. 20, 2020, 3:09 a.m. | #6
On 08/19/2020 03:11 PM, Jonathan Cameron wrote:
> On Wed, 19 Aug 2020 10:31:18 +0530

> Anshuman Khandual <anshuman.khandual@arm.com> wrote:

> 

>> Hello Jonathan,

> 

> Hi Anshuman,

> 

>>

>> On 07/08/2020 05:08 PM, Jonathan Cameron wrote:

>>> Unfortunately we are currently calling numa_alloc_distance well before we call

>>> setup_node_to_cpu_mask_map means that nr_node_ids is set to MAX_NUMNODES.

>>> This wastes a bit of memory and is confusing to the reader.  

>>

>> With defconfig where CONFIG_NODES_SHIFT = 2 i.e MAX_NUMNODES = 4, the total

>> numa_distance size is 16 bytes (individual entries here are just u8). 

> 

> That's another issue to tidy up.  The current defconfig isn't big enough to

> support existing platforms (we are shipping 8 node systems),

> let alone next generation.  Bit of an open question on how far to push it up

> though. Perhaps 64?  Given CXL we are probably going to soon see systems with

> a lot more nodes.  I'll send a patch for that and we can argue the exact

> value then.


I would expect atleast it matches up with the max nodes possible on currently
available systems.

> 

>> Even

>> with MAX_NUMNODES = 256, numa_distance is going to be just 64K. Hence there

>> might not be much space to be saved that would need optimizing this path.

>> Please correct me if I have missed something.

> 

> Agreed.  The saving is small.  This was mostly about the code being misleading.

> It gives the impression of doing dynamic sizing but doesn't actually do so as it

> is using nr_node_ids before that value has been updated.  I would prefer

> to make the code 'obviously' correct.  The x86 path is one approach, simply

> hard coding the size as a constant is another.

> 

> The secondary advantage would be to make it easy to unify this code across

> architectures.  Kind of inevitable riscv will be here soon and it would be nice

> not to have a third slight variation on the code.  I'd like to explore such

> possible unification but it wasn't the main aim of this patch.


There is already a RISC-V proposal in this regard which is trying to create
a generic NUMA framework selectable with GENERIC_ARCH_NUMA. If the ultimate
goal is unify the NUMA init path, we should try and see if that can also
accommodate X86.

https://patchwork.kernel.org/project/linux-arm-kernel/list/?series=332897

> 

>>

>>>

>>> Note we could just decide to hardcode it as MAX_NUMNODES but if so we should

>>> do so explicitly.  

>>

>> nr_node_ids = MAX_NUMNODES which is set from mm/page_alloc.c, yes asserting

>> with an WARN_ON() that it is indeed MAX_NUMNODES would make sense.

> 

> OK, a WARN_ON would at least make it apparent this is a constant, not a dynamic

> value as a reader of the code might assume (I did).

> 

> Could we just go further and use MAX_NUMNODES directly and ignore nr_node_ids

> for this purpose?


IMHO this actually sounds better and cleaner. But the first preference should
be to unify all these as GENERIC_ARCH_NUMA.

> 

>>

>>>

>>> Looking at what x86 does, they do a walk of nodes_parsed and locally

>>> establish the maximum node count seen.  We can't actually do that where we

>>> were previously calling it in numa_init because nodes_parsed isn't set up

>>> either yet.  So let us take a leaf entirely out of x86's book and make

>>> the true assumption that nodes_parsed will definitely be set up before

>>> we try to put a real value in this array.  Hence just do it on demand.  

>>

>> So it is replacing one assumption i.e nr_node_ids = MAX_NUMNODES with another

>> i.e nodes_parsed has been initialized, while trying to populate an entry.

> 

> Yes.  If we make it clearly a fixed value, then I'm happy with that solution

> to this particular problem.

> 

> If not, I would argue that the use nodes_parsed makes it very much obvious that

> we are assuming it was initialized.  (Though on that argument we would have

> assumed nr_node_ids had it's final value in the existing code and it doesn't,

> I come back to my main argument being to make the code 'obviously' correct).

> 

>>

>>>

>>> In order to avoid trying and failing to allocate the array multiple times

>>> we do the same thing as x86 and set numa_distance = 1. This requires a

>>> few small modifications elsewhere.  

>>

>> Where ? Dont see numa_distance being set as 1.

> 

> See below, I have highlighted the line.

> 

>>

>>>

>>> Worth noting, that with one exception (which it appears can be removed [1])

>>> the x86 and arm numa distance code is now identical.  Worth factoring it

>>> out to some common location?

>>>

>>> [1] https://lkml.kernel.org/r/20170406124459.dwn5zhpr2xqg3lqm@node.shutemov.name

>>>

>>> Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>

>>> ---

>>> arch/arm64/mm/numa.c | 35 ++++++++++++++++++-----------------

>>>  1 file changed, 18 insertions(+), 17 deletions(-)

>>>

>>> diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c

>>> index aafcee3e3f7e..a2f549ef0a36 100644

>>> --- a/arch/arm64/mm/numa.c

>>> +++ b/arch/arm64/mm/numa.c

>>> @@ -255,13 +255,11 @@ void __init numa_free_distance(void)

>>>  {

>>>  	size_t size;

>>>  

>>> -	if (!numa_distance)

>>> -		return;

>>> -

>>>  	size = numa_distance_cnt * numa_distance_cnt *

>>>  		sizeof(numa_distance[0]);

>>> -

>>> -	memblock_free(__pa(numa_distance), size);

>>> +	/* numa_distance could be 1LU marking allocation failure, test cnt */

>>> +	if (numa_distance_cnt)

>>> +		memblock_free(__pa(numa_distance), size);

>>>  	numa_distance_cnt = 0;

>>>  	numa_distance = NULL;

>>>  }

>>> @@ -271,20 +269,29 @@ void __init numa_free_distance(void)

>>>   */

>>>  static int __init numa_alloc_distance(void)

>>>  {

>>> +	nodemask_t nodes_parsed;

>>>  	size_t size;

>>> +	int i, j, cnt = 0;

>>>  	u64 phys;

>>> -	int i, j;

>>>  

>>> -	size = nr_node_ids * nr_node_ids * sizeof(numa_distance[0]);

>>> +	/* size the new table and allocate it */

>>> +	nodes_parsed = numa_nodes_parsed;

>>> +	for_each_node_mask(i, nodes_parsed)

>>> +		cnt = i;  

>>

>> There is no nodemask related helper to fetch the highest bit set ?

> 

> Not that I can find.

> 

> There is nodes_weight(), but I think we can currently end up with holes,

> and it definitely isn't obvious from the local code that we can't.


Right, the node mask can have holes, so nodes_weight() would not imply
maximum possible node number. Possibly a new helper nodes_max() should
do but not sure if there are any existing instances which could use
such a helper.  

> 

> 

>>

>>> +	cnt++;

>>> +	size = cnt * cnt * sizeof(numa_distance[0]);

>>>  	phys = memblock_find_in_range(0, PFN_PHYS(max_pfn),

>>>  				      size, PAGE_SIZE);

>>> -	if (WARN_ON(!phys))

>>> +	if (!phys) {

>>> +		pr_warn("Warning: can't allocate distance table!\n");

>>> +		/* don't retry until explicitly reset */

>>> +		numa_distance = (void *)1LU;

> 

> Here is where we set numa_distance to 1 in the error path.


Got it.

> 

>>>  		return -ENOMEM;

>>> -

>>> +	}

>>>  	memblock_reserve(phys, size);

>>>  

>>>  	numa_distance = __va(phys);

>>> -	numa_distance_cnt = nr_node_ids;

>>> +	numa_distance_cnt = cnt;

>>>  

>>>  	/* fill with the default distances */

>>>  	for (i = 0; i < numa_distance_cnt; i++)

>>> @@ -311,10 +318,8 @@ static int __init numa_alloc_distance(void)

>>>   */

>>>  void __init numa_set_distance(int from, int to, int distance)

>>>  {

>>> -	if (!numa_distance) {

>>> -		pr_warn_once("Warning: distance table not allocated yet\n");

>>> +	if (!numa_distance && numa_alloc_distance() < 0)

>>>  		return;

>>> -	}

>>>  

>>>  	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||

>>>  			from < 0 || to < 0) {

>>> @@ -384,10 +389,6 @@ static int __init numa_init(int (*init_func)(void))

>>>  	nodes_clear(node_possible_map);

>>>  	nodes_clear(node_online_map);

>>>  

>>> -	ret = numa_alloc_distance();

>>> -	if (ret < 0)

>>> -		return ret;

>>> -

>>>  	ret = init_func();

>>>  	if (ret < 0)

>>>  		goto out_free_distance;

>>>   

>>

>> What is the primary objective here ? Reduce memory for numa_distance[]

>> or unifying arm64's numa_init() with that of x86's ?

> 

> As mentioned above. The objective when I originally looked at this was

> to make the code do what it appeared to do.  The approach chosen was about

> the added benefit of unifying this part with x86 + riscv etc.

> Always good to not reinvent the wheel.


Agreed.
Jonathan Cameron Aug. 20, 2020, 8:45 a.m. | #7
On Thu, 20 Aug 2020 08:39:14 +0530
Anshuman Khandual <anshuman.khandual@arm.com> wrote:

> On 08/19/2020 03:11 PM, Jonathan Cameron wrote:

> > On Wed, 19 Aug 2020 10:31:18 +0530

> > Anshuman Khandual <anshuman.khandual@arm.com> wrote:

> >   

> >> Hello Jonathan,  

> > 

> > Hi Anshuman,

> >   

> >>

> >> On 07/08/2020 05:08 PM, Jonathan Cameron wrote:  

> >>> Unfortunately we are currently calling numa_alloc_distance well before we call

> >>> setup_node_to_cpu_mask_map means that nr_node_ids is set to MAX_NUMNODES.

> >>> This wastes a bit of memory and is confusing to the reader.    

> >>

> >> With defconfig where CONFIG_NODES_SHIFT = 2 i.e MAX_NUMNODES = 4, the total

> >> numa_distance size is 16 bytes (individual entries here are just u8).   

> > 

> > That's another issue to tidy up.  The current defconfig isn't big enough to

> > support existing platforms (we are shipping 8 node systems),

> > let alone next generation.  Bit of an open question on how far to push it up

> > though. Perhaps 64?  Given CXL we are probably going to soon see systems with

> > a lot more nodes.  I'll send a patch for that and we can argue the exact

> > value then.  

> 

> I would expect atleast it matches up with the max nodes possible on currently

> available systems.

> 

> >   

> >> Even

> >> with MAX_NUMNODES = 256, numa_distance is going to be just 64K. Hence there

> >> might not be much space to be saved that would need optimizing this path.

> >> Please correct me if I have missed something.  

> > 

> > Agreed.  The saving is small.  This was mostly about the code being misleading.

> > It gives the impression of doing dynamic sizing but doesn't actually do so as it

> > is using nr_node_ids before that value has been updated.  I would prefer

> > to make the code 'obviously' correct.  The x86 path is one approach, simply

> > hard coding the size as a constant is another.

> > 

> > The secondary advantage would be to make it easy to unify this code across

> > architectures.  Kind of inevitable riscv will be here soon and it would be nice

> > not to have a third slight variation on the code.  I'd like to explore such

> > possible unification but it wasn't the main aim of this patch.  

> 

> There is already a RISC-V proposal in this regard which is trying to create

> a generic NUMA framework selectable with GENERIC_ARCH_NUMA. If the ultimate

> goal is unify the NUMA init path, we should try and see if that can also

> accommodate X86.

> 

> https://patchwork.kernel.org/project/linux-arm-kernel/list/?series=332897


Agreed.

It may be the case that we need to do that in a number of steps but it is definitely
a good place to aim for.

Perhaps we can ignore ia64 however :)

Jonathan

> 

> >   

> >>  

> >>>

> >>> Note we could just decide to hardcode it as MAX_NUMNODES but if so we should

> >>> do so explicitly.    

> >>

> >> nr_node_ids = MAX_NUMNODES which is set from mm/page_alloc.c, yes asserting

> >> with an WARN_ON() that it is indeed MAX_NUMNODES would make sense.  

> > 

> > OK, a WARN_ON would at least make it apparent this is a constant, not a dynamic

> > value as a reader of the code might assume (I did).

> > 

> > Could we just go further and use MAX_NUMNODES directly and ignore nr_node_ids

> > for this purpose?  

> 

> IMHO this actually sounds better and cleaner. But the first preference should

> be to unify all these as GENERIC_ARCH_NUMA.

> 

> >   

> >>  

> >>>

> >>> Looking at what x86 does, they do a walk of nodes_parsed and locally

> >>> establish the maximum node count seen.  We can't actually do that where we

> >>> were previously calling it in numa_init because nodes_parsed isn't set up

> >>> either yet.  So let us take a leaf entirely out of x86's book and make

> >>> the true assumption that nodes_parsed will definitely be set up before

> >>> we try to put a real value in this array.  Hence just do it on demand.    

> >>

> >> So it is replacing one assumption i.e nr_node_ids = MAX_NUMNODES with another

> >> i.e nodes_parsed has been initialized, while trying to populate an entry.  

> > 

> > Yes.  If we make it clearly a fixed value, then I'm happy with that solution

> > to this particular problem.

> > 

> > If not, I would argue that the use nodes_parsed makes it very much obvious that

> > we are assuming it was initialized.  (Though on that argument we would have

> > assumed nr_node_ids had it's final value in the existing code and it doesn't,

> > I come back to my main argument being to make the code 'obviously' correct).

> >   

> >>  

> >>>

> >>> In order to avoid trying and failing to allocate the array multiple times

> >>> we do the same thing as x86 and set numa_distance = 1. This requires a

> >>> few small modifications elsewhere.    

> >>

> >> Where ? Dont see numa_distance being set as 1.  

> > 

> > See below, I have highlighted the line.

> >   

> >>  

> >>>

> >>> Worth noting, that with one exception (which it appears can be removed [1])

> >>> the x86 and arm numa distance code is now identical.  Worth factoring it

> >>> out to some common location?

> >>>

> >>> [1] https://lkml.kernel.org/r/20170406124459.dwn5zhpr2xqg3lqm@node.shutemov.name

> >>>

> >>> Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>

> >>> ---

> >>> arch/arm64/mm/numa.c | 35 ++++++++++++++++++-----------------

> >>>  1 file changed, 18 insertions(+), 17 deletions(-)

> >>>

> >>> diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c

> >>> index aafcee3e3f7e..a2f549ef0a36 100644

> >>> --- a/arch/arm64/mm/numa.c

> >>> +++ b/arch/arm64/mm/numa.c

> >>> @@ -255,13 +255,11 @@ void __init numa_free_distance(void)

> >>>  {

> >>>  	size_t size;

> >>>  

> >>> -	if (!numa_distance)

> >>> -		return;

> >>> -

> >>>  	size = numa_distance_cnt * numa_distance_cnt *

> >>>  		sizeof(numa_distance[0]);

> >>> -

> >>> -	memblock_free(__pa(numa_distance), size);

> >>> +	/* numa_distance could be 1LU marking allocation failure, test cnt */

> >>> +	if (numa_distance_cnt)

> >>> +		memblock_free(__pa(numa_distance), size);

> >>>  	numa_distance_cnt = 0;

> >>>  	numa_distance = NULL;

> >>>  }

> >>> @@ -271,20 +269,29 @@ void __init numa_free_distance(void)

> >>>   */

> >>>  static int __init numa_alloc_distance(void)

> >>>  {

> >>> +	nodemask_t nodes_parsed;

> >>>  	size_t size;

> >>> +	int i, j, cnt = 0;

> >>>  	u64 phys;

> >>> -	int i, j;

> >>>  

> >>> -	size = nr_node_ids * nr_node_ids * sizeof(numa_distance[0]);

> >>> +	/* size the new table and allocate it */

> >>> +	nodes_parsed = numa_nodes_parsed;

> >>> +	for_each_node_mask(i, nodes_parsed)

> >>> +		cnt = i;    

> >>

> >> There is no nodemask related helper to fetch the highest bit set ?  

> > 

> > Not that I can find.

> > 

> > There is nodes_weight(), but I think we can currently end up with holes,

> > and it definitely isn't obvious from the local code that we can't.  

> 

> Right, the node mask can have holes, so nodes_weight() would not imply

> maximum possible node number. Possibly a new helper nodes_max() should

> do but not sure if there are any existing instances which could use

> such a helper.  

> 

> > 

> >   

> >>  

> >>> +	cnt++;

> >>> +	size = cnt * cnt * sizeof(numa_distance[0]);

> >>>  	phys = memblock_find_in_range(0, PFN_PHYS(max_pfn),

> >>>  				      size, PAGE_SIZE);

> >>> -	if (WARN_ON(!phys))

> >>> +	if (!phys) {

> >>> +		pr_warn("Warning: can't allocate distance table!\n");

> >>> +		/* don't retry until explicitly reset */

> >>> +		numa_distance = (void *)1LU;  

> > 

> > Here is where we set numa_distance to 1 in the error path.  

> 

> Got it.

> 

> >   

> >>>  		return -ENOMEM;

> >>> -

> >>> +	}

> >>>  	memblock_reserve(phys, size);

> >>>  

> >>>  	numa_distance = __va(phys);

> >>> -	numa_distance_cnt = nr_node_ids;

> >>> +	numa_distance_cnt = cnt;

> >>>  

> >>>  	/* fill with the default distances */

> >>>  	for (i = 0; i < numa_distance_cnt; i++)

> >>> @@ -311,10 +318,8 @@ static int __init numa_alloc_distance(void)

> >>>   */

> >>>  void __init numa_set_distance(int from, int to, int distance)

> >>>  {

> >>> -	if (!numa_distance) {

> >>> -		pr_warn_once("Warning: distance table not allocated yet\n");

> >>> +	if (!numa_distance && numa_alloc_distance() < 0)

> >>>  		return;

> >>> -	}

> >>>  

> >>>  	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||

> >>>  			from < 0 || to < 0) {

> >>> @@ -384,10 +389,6 @@ static int __init numa_init(int (*init_func)(void))

> >>>  	nodes_clear(node_possible_map);

> >>>  	nodes_clear(node_online_map);

> >>>  

> >>> -	ret = numa_alloc_distance();

> >>> -	if (ret < 0)

> >>> -		return ret;

> >>> -

> >>>  	ret = init_func();

> >>>  	if (ret < 0)

> >>>  		goto out_free_distance;

> >>>     

> >>

> >> What is the primary objective here ? Reduce memory for numa_distance[]

> >> or unifying arm64's numa_init() with that of x86's ?  

> > 

> > As mentioned above. The objective when I originally looked at this was

> > to make the code do what it appeared to do.  The approach chosen was about

> > the added benefit of unifying this part with x86 + riscv etc.

> > Always good to not reinvent the wheel.  

> 

> Agreed.

Patch

diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c
index aafcee3e3f7e..a2f549ef0a36 100644
--- a/arch/arm64/mm/numa.c
+++ b/arch/arm64/mm/numa.c
@@ -255,13 +255,11 @@  void __init numa_free_distance(void)
 {
 	size_t size;
 
-	if (!numa_distance)
-		return;
-
 	size = numa_distance_cnt * numa_distance_cnt *
 		sizeof(numa_distance[0]);
-
-	memblock_free(__pa(numa_distance), size);
+	/* numa_distance could be 1LU marking allocation failure, test cnt */
+	if (numa_distance_cnt)
+		memblock_free(__pa(numa_distance), size);
 	numa_distance_cnt = 0;
 	numa_distance = NULL;
 }
@@ -271,20 +269,29 @@  void __init numa_free_distance(void)
  */
 static int __init numa_alloc_distance(void)
 {
+	nodemask_t nodes_parsed;
 	size_t size;
+	int i, j, cnt = 0;
 	u64 phys;
-	int i, j;
 
-	size = nr_node_ids * nr_node_ids * sizeof(numa_distance[0]);
+	/* size the new table and allocate it */
+	nodes_parsed = numa_nodes_parsed;
+	for_each_node_mask(i, nodes_parsed)
+		cnt = i;
+	cnt++;
+	size = cnt * cnt * sizeof(numa_distance[0]);
 	phys = memblock_find_in_range(0, PFN_PHYS(max_pfn),
 				      size, PAGE_SIZE);
-	if (WARN_ON(!phys))
+	if (!phys) {
+		pr_warn("Warning: can't allocate distance table!\n");
+		/* don't retry until explicitly reset */
+		numa_distance = (void *)1LU;
 		return -ENOMEM;
-
+	}
 	memblock_reserve(phys, size);
 
 	numa_distance = __va(phys);
-	numa_distance_cnt = nr_node_ids;
+	numa_distance_cnt = cnt;
 
 	/* fill with the default distances */
 	for (i = 0; i < numa_distance_cnt; i++)
@@ -311,10 +318,8 @@  static int __init numa_alloc_distance(void)
  */
 void __init numa_set_distance(int from, int to, int distance)
 {
-	if (!numa_distance) {
-		pr_warn_once("Warning: distance table not allocated yet\n");
+	if (!numa_distance && numa_alloc_distance() < 0)
 		return;
-	}
 
 	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
 			from < 0 || to < 0) {
@@ -384,10 +389,6 @@  static int __init numa_init(int (*init_func)(void))
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
 
-	ret = numa_alloc_distance();
-	if (ret < 0)
-		return ret;
-
 	ret = init_func();
 	if (ret < 0)
 		goto out_free_distance;