
[v2,1/2] selftests/resctrl: Adjust effective L3 cache size with SNC enabled

Message ID fe9295c6be677d187b1607185e23993dbfe74761.1715769576.git.maciej.wieczor-retman@intel.com
State New
Series selftests/resctrl: SNC kernel support discovery

Commit Message

Maciej Wieczor-Retman May 15, 2024, 11:18 a.m. UTC
Sub-NUMA Cluster divides CPUs sharing an L3 cache into separate NUMA
nodes. Systems may support splitting into either two or four nodes.

When SNC mode is enabled the effective amount of L3 cache available
for allocation is divided by the number of nodes per L3.

Detect which SNC mode is active by comparing the number of CPUs
that share a cache with CPU0, with the number of CPUs on node0.

Signed-off-by: Tony Luck <tony.luck@intel.com>
Co-developed-by: Maciej Wieczor-Retman <maciej.wieczor-retman@intel.com>
Signed-off-by: Maciej Wieczor-Retman <maciej.wieczor-retman@intel.com>
---
 tools/testing/selftests/resctrl/resctrl.h   |  3 ++
 tools/testing/selftests/resctrl/resctrlfs.c | 59 +++++++++++++++++++++
 2 files changed, 62 insertions(+)

Comments

Reinette Chatre May 30, 2024, 11:07 p.m. UTC | #1
Hi Maciej,

Regarding shortlog: L3 cache size should no longer be adjusted when
SNC is enabled. You mention that the tests are passing when running
with this adjustment ... I think that this may be because the test
now just runs on a smaller portion of the cache?

On 5/15/24 4:18 AM, Maciej Wieczor-Retman wrote:
> Sub-NUMA Cluster divides CPUs sharing an L3 cache into separate NUMA
> nodes. Systems may support splitting into either two or four nodes.

fyi ... from the most recent kernel submission, 2, 3, or 4 nodes
are possible:
https://lore.kernel.org/lkml/20240528222006.58283-20-tony.luck@intel.com/

> 
> When SNC mode is enabled the effective amount of L3 cache available
> for allocation is divided by the number of nodes per L3.

This was a mistake in the original implementation and is no longer done.

> 
> Detect which SNC mode is active by comparing the number of CPUs
> that share a cache with CPU0, with the number of CPUs on node0.
> 
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> Co-developed-by: Maciej Wieczor-Retman <maciej.wieczor-retman@intel.com>
> Signed-off-by: Maciej Wieczor-Retman <maciej.wieczor-retman@intel.com>
> ---
>   tools/testing/selftests/resctrl/resctrl.h   |  3 ++
>   tools/testing/selftests/resctrl/resctrlfs.c | 59 +++++++++++++++++++++
>   2 files changed, 62 insertions(+)
> 
> diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h
> index 00d51fa7531c..3dd5d6779786 100644
> --- a/tools/testing/selftests/resctrl/resctrl.h
> +++ b/tools/testing/selftests/resctrl/resctrl.h
> @@ -11,6 +11,7 @@
>   #include <signal.h>
>   #include <dirent.h>
>   #include <stdbool.h>
> +#include <ctype.h>
>   #include <sys/stat.h>
>   #include <sys/ioctl.h>
>   #include <sys/mount.h>
> @@ -49,6 +50,7 @@
>   		umount_resctrlfs();		\
>   		exit(EXIT_FAILURE);		\
>   	} while (0)
> +#define MAX_SNC		4
>   
>   /*
>    * user_params:		User supplied parameters
> @@ -131,6 +133,7 @@ extern pid_t bm_pid, ppid;
>   
>   extern char llc_occup_path[1024];
>   
> +int snc_ways(void);
>   int get_vendor(void);
>   bool check_resctrlfs_support(void);
>   int filter_dmesg(void);
> diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c
> index 1cade75176eb..e4d3624a8817 100644
> --- a/tools/testing/selftests/resctrl/resctrlfs.c
> +++ b/tools/testing/selftests/resctrl/resctrlfs.c
> @@ -156,6 +156,63 @@ int get_domain_id(const char *resource, int cpu_no, int *domain_id)
>   	return 0;
>   }
>   
> +/*
> + * Count number of CPUs in a /sys bit map
> + */
> +static unsigned int count_sys_bitmap_bits(char *name)
> +{
> +	FILE *fp = fopen(name, "r");
> +	int count = 0, c;
> +
> +	if (!fp)
> +		return 0;
> +
> +	while ((c = fgetc(fp)) != EOF) {
> +		if (!isxdigit(c))
> +			continue;
> +		switch (c) {
> +		case 'f':
> +			count++;
> +		case '7': case 'b': case 'd': case 'e':
> +			count++;
> +		case '3': case '5': case '6': case '9': case 'a': case 'c':
> +			count++;
> +		case '1': case '2': case '4': case '8':
> +			count++;
> +		}
> +	}
> +	fclose(fp);
> +
> +	return count;
> +}
> +
> +/*
> + * Detect SNC by comparing #CPUs in node0 with #CPUs sharing LLC with CPU0.
> + * If some CPUs are offline the numbers may not be exact multiples of each
> + * other. Any offline CPUs on node0 will be also gone from shared_cpu_map of
> + * CPU0 but offline CPUs from other nodes will only make the cache_cpus value
> + * lower. Still try to get the ratio right by preventing the second possibility.
> + */
> +int snc_ways(void)

"ways" have a specific meaning in cache terminology. Perhaps rather something
like "snc_nodes_per_cache()" or even copy the kernel's (which is still WIP though)
snc_nodes_per_l3_cache()

> +{
> +	int node_cpus, cache_cpus, i;
> +
> +	node_cpus = count_sys_bitmap_bits("/sys/devices/system/node/node0/cpumap");
> +	cache_cpus = count_sys_bitmap_bits("/sys/devices/system/cpu/cpu0/cache/index3/shared_cpu_map");
> +
> +	if (!node_cpus || !cache_cpus) {
> +		fprintf(stderr, "Warning could not determine Sub-NUMA Cluster mode\n");

The tests just use "ksft_print_msg()" for error messages. The "Warning could ..."
is somewhat unexpected, perhaps just "Could not determine ..." or "Warning: Could not ..."?
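
For example, something like:

	ksft_print_msg("Could not determine Sub-NUMA Cluster mode\n");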

> +		return 1;
> +	}
> +
> +	for (i = 1; i <= MAX_SNC ; i++) {
> +		if (i * node_cpus >= cache_cpus)
> +			return i;
> +	}

This is not obvious to me. From the function comments this seems to address the
scenarios when CPUs from other nodes are offline. It is not clear to me how
this loop addresses this. For example, let's say there are four SNC nodes
associated with a cache and only the node0 CPUs are online. The above would
detect this as "1", not "4", if I read this right?

I wonder if it may not be easier to just follow what the kernel does
(in the new version).
User space can learn the number of online and present CPUs from
/sys/devices/system/cpu/online and /sys/devices/system/cpu/present
respectively. A simple string compare of the contents can be used to
determine if they are identical and a warning can be printed if they are not.
With a warning when accurate detection cannot be done the simple
check will do.
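
A minimal (untested) sketch of that check, assuming <stdio.h>, <stdbool.h>
and <string.h> are included, could look like:

	/*
	 * SNC detection is only reliable when no CPUs are offline, that is,
	 * when the "online" and "present" CPU lists read back identically.
	 */
	static bool snc_detection_reliable(void)
	{
		char online[256], present[256];
		FILE *fp;

		fp = fopen("/sys/devices/system/cpu/online", "r");
		if (!fp || !fgets(online, sizeof(online), fp)) {
			if (fp)
				fclose(fp);
			return false;
		}
		fclose(fp);

		fp = fopen("/sys/devices/system/cpu/present", "r");
		if (!fp || !fgets(present, sizeof(present), fp)) {
			if (fp)
				fclose(fp);
			return false;
		}
		fclose(fp);

		return strcmp(online, present) == 0;
	}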

Could you please add an informational message indicating how many SNC nodes
were indeed detected?

> +
> +	return 1;
> +}
> +
>   /*
>    * get_cache_size - Get cache size for a specified CPU
>    * @cpu_no:	CPU number
> @@ -211,6 +268,8 @@ int get_cache_size(int cpu_no, const char *cache_type, unsigned long *cache_size
>   			break;
>   	}
>   
> +	if (cache_num == 3)
> +		*cache_size /= snc_ways();
>   	return 0;
>   }
>   

I think this can be dropped.

Reinette
Luck, Tony May 30, 2024, 11:46 p.m. UTC | #2
>> When SNC mode is enabled the effective amount of L3 cache available
>> for allocation is divided by the number of nodes per L3.
>
> This was a mistake in the original implementation and is no longer done.

My original kernel code adjusted the value reported in the "size" file in resctrl.
That's no longer done because the effective size depends on how applications
are allocating and using memory. Since the kernel can't know that, it
seemed best to just report the total size of the cache.

But I think the resctrl tests still need to take this into account when running
llc_occupancy tests.

E.g. on a 2-way SNC system with a 100MB L3 cache a test that allocates
memory from its local SNC node (default behavior without using libnuma)
will only see 50 MB llc_occupancy with a fully populated L3 mask in the
schemata file.

-Tony
Reinette Chatre May 31, 2024, 4:17 p.m. UTC | #3
Hi Tony and Maciej,

On 5/30/24 5:34 PM, Luck, Tony wrote:
>>>>> When SNC mode is enabled the effective amount of L3 cache available
>>>>> for allocation is divided by the number of nodes per L3.
>>>>
>>>> This was a mistake in the original implementation and is no longer done.
>>>
>>> My original kernel code adjusted the value reported in the "size" file in resctrl.
>>> That's no longer done because the effective size depends on how applications
>>> are allocating and using memory. Since the kernel can't know that, it
>>> seemed best to just report the total size of the cache.
>>>
>>> But I think the resctrl tests still need to take this into account when running
>>> llc_occupancy tests.
>>>
>>> E.g. on a 2-way SNC system with a 100MB L3 cache a test that allocates
>>> memory from its local SNC node (default behavior without using libnuma)
>>> will only see 50 MB llc_occupancy with a fully populated L3 mask in the
>>> schemata file.
>>
>> This seems to contradict the "Cache and memory bandwidth allocation features
>> continue to operate at the scope of the L3 cache." statement from [1]?
> 
> I'll clean that up. MBA isn't affected. But cache allocation is affected in that
> the amount of cache represented by each bit in the masks in the schemata
> file is reduced by a factor equal to SNC nodes per L3 cache.

Thanks Tony. I trust that this is what Maciej intended since the change
is specifically named "Adjust _effective_ L3 cache size". I'd like to
recommend that your comments be added before the change to
get_cache_size() ...

	/*
	 * The amount of cache represented by each bit in the masks
	 * in the schemata file is reduced by a factor equal to SNC
	 * nodes per L3 cache.
	 * E.g. on a SNC-2 system with a 100MB L3 cache a test that
	 * allocates memory from its local SNC node (default behavior
	 * without using libnuma) will only see 50 MB llc_occupancy
	 * with a fully populated L3 mask in the schemata file.
	 */

Reinette
Reinette Chatre June 25, 2024, 4:28 p.m. UTC | #4
Hi Maciej,

On 6/25/24 4:04 AM, Maciej Wieczor-Retman wrote:
> Hello,
> sorry it took me so long to get back to this. I prepared the next version with
> your comments applied and Tony's replies taken into account.

Thank you very much for sticking with this.

> 
> I wanted to briefly discuss this before posting:
> 
> On 2024-05-30 at 16:07:29 -0700, Reinette Chatre wrote:
>> On 5/15/24 4:18 AM, Maciej Wieczor-Retman wrote:
>>> +		return 1;
>>> +	}
>>> +
>>> +	for (i = 1; i <= MAX_SNC ; i++) {
>>> +		if (i * node_cpus >= cache_cpus)
>>> +			return i;
>>> +	}
>>
>> This is not obvious to me. From the function comments this seems to address the
>> scenarios when CPUs from other nodes are offline. It is not clear to me how
>> this loop addresses this. For example, let's say there are four SNC nodes
>> associated with a cache and only the node0 CPUs are online. The above would
>> detect this as "1", not "4", if I read this right?
>>
>> I wonder if it may not be easier to just follow what the kernel does
>> (in the new version).
>> User space can learn the number of online and present CPUs from
>> /sys/devices/system/cpu/online and /sys/devices/system/cpu/present
>> respectively. A simple string compare of the contents can be used to
>> determine if they are identical and a warning can be printed if they are not.
>> With a warning when accurate detection cannot be done the simple
>> check will do.
>>
>> Could you please add an informational message indicating how many SNC nodes
>> were indeed detected?
> 
> Should the information "how many SNC nodes are detected?" get printed every time
> (by which I mean at the end of the CMT and MBM tests), or only when the error
> "SNC enabled but kernel doesn't support it" happens? Of course, in the first case,
> if there is only 1 node detected, nothing would be printed to avoid noise.

I agree that it is not needed to print something about SNC if it is disabled.
hmmm ... so SNC impacts every test, but it is only detected by default during the CAT
and CMT tests, with MBA and MBM "detection" only triggered if the test fails?

What if the "SNC detection" is moved to be within run_single_test() but instead of
repeating the detection from scratch every time it rather works like get_vendor()
where the full detection is only done on first attempt? run_single_test() can detect if
SNC is enabled and (if number of SNC nodes > 1) print an informational message
that is inherited by all tests.
Any test that needs to know the number of SNC nodes can continue to use the
same function used for detection (that only does actual detection once).

What do you think?

Reinette
Reinette Chatre June 26, 2024, 4:46 p.m. UTC | #5
Hi Maciej,

On 6/26/24 12:09 AM, Maciej Wieczor-Retman wrote:
> Hello!
> 
> On 2024-06-25 at 09:28:55 -0700, Reinette Chatre wrote:
>> Hi Maciej,
>>
>> On 6/25/24 4:04 AM, Maciej Wieczor-Retman wrote:
>>> Hello,
>>> sorry it took me so long to get back to this. I prepared the next version with
>>> your comments applied and Tony's replies taken into account.
>>
>> Thank you very much for sticking with this.
>>
>>>
>>> I wanted to briefly discuss this before posting:
>>>
>>> On 2024-05-30 at 16:07:29 -0700, Reinette Chatre wrote:
>>>> On 5/15/24 4:18 AM, Maciej Wieczor-Retman wrote:
>>>>> +		return 1;
>>>>> +	}
>>>>> +
>>>>> +	for (i = 1; i <= MAX_SNC ; i++) {
>>>>> +		if (i * node_cpus >= cache_cpus)
>>>>> +			return i;
>>>>> +	}
>>>>
>>>> This is not obvious to me. From the function comments this seems to address the
>>>> scenarios when CPUs from other nodes are offline. It is not clear to me how
>>>> this loop addresses this. For example, let's say there are four SNC nodes
>>>> associated with a cache and only the node0 CPUs are online. The above would
>>>> detect this as "1", not "4", if I read this right?
>>>>
>>>> I wonder if it may not be easier to just follow what the kernel does
>>>> (in the new version).
>>>> User space can learn the number of online and present CPUs from
>>>> /sys/devices/system/cpu/online and /sys/devices/system/cpu/present
>>>> respectively. A simple string compare of the contents can be used to
>>>> determine if they are identical and a warning can be printed if they are not.
>>>> With a warning when accurate detection cannot be done the simple
>>>> check will do.
>>>>
>>>> Could you please add an informational message indicating how many SNC nodes
>>>> were indeed detected?
>>>
>>> Should the information "how many SNC nodes are detected?" get printed every time
>>> (by which I mean at the end of the CMT and MBM tests), or only when the error
>>> "SNC enabled but kernel doesn't support it" happens? Of course, in the first case,
>>> if there is only 1 node detected, nothing would be printed to avoid noise.
>>
>> I agree that it is not needed to print something about SNC if it is disabled.
>> hmmm ... so SNC impacts every test, but it is only detected by default during the CAT
>> and CMT tests, with MBA and MBM "detection" only triggered if the test fails?
> 
> Yes, snc_ways() ran before starting CAT and CMT to adjust the cache size variable,
> and then after CAT, CMT, MBM, and MBA if the return value indicated failure.
> 
>>
>> What if the "SNC detection" is moved to be within run_single_test() but instead of
>> repeating the detection from scratch every time it rather works like get_vendor()
>> where the full detection is only done on first attempt? run_single_test() can detect if
>> SNC is enabled and (if number of SNC nodes > 1) print an informational message
>> that is inherited by all tests.
>> Any test that needs to know the number of SNC nodes can continue to use the
>> same function used for detection (that only does actual detection once).
>>
>> What do you think?
> 
> I think running the detection once at the start and then reusing the results is
> a good idea. You're proposing adding a value (global or passed through all the
> tests) that would get initialized on the first run_single_test()?

I was thinking about a solution similar to get_vendor() that uses a static local
variable. A global variable could work also.
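
Something along these lines, with snc_detect() standing in for the actual
detection logic:

	int snc_nodes_per_l3_cache(void)
	{
		static int snc_nodes;

		/* Only do the real detection on the first call. */
		if (!snc_nodes)
			snc_nodes = snc_detect();

		return snc_nodes;
	}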

> And then the SNC status (if enabled), plus a warning if the detection could be wrong
> (because of the online/present CPUs ratio), would be printed before the test runs?

I do not think this was part of previous tests, but yes, this is a concern where
a warning would be helpful.

> On the warning placement, I think it should be moved out of being printed only on
> failure. I did some experiments using "chcpu" to enable/disable cores and then
> ran the selftests. They didn't have any problems succeeding even though SNC
> detection detected a different mode every time (I added a printf() around the line

I am not surprised here since there has not been much tuning of the CAT test.

> where the cache size is modified, to show what SNC mode is detected). While I
> understand these tests shouldn't fail since they just use a different portion of
> the cache, I think the user should be informed that the test is not really
> NUMA-aware if the detection was wrong:

Seems like there are two warnings to consider:
(a) SNC detection may be wrong.
(b) If SNC is enabled and the kernel does not support SNC, then the tests may fail.

For (a) I think that it is possible to know when SNC detection may be wrong. A test
similar to the kernel test that compares the "online" and "present" CPUs [1] can
be used. The /sys/devices/system/cpu/online and /sys/devices/system/cpu/present
files are available for this. A simpler way may be to just print a warning if
/sys/devices/system/cpu/offline is not empty and set the number of SNC nodes
to 1. Instead, a new "snc_unreliable" global can be set that can be used
to print additional information during test failure.
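
As a rough sketch of that simpler variant, with "snc_unreliable" as the new
global:

	int snc_unreliable;

	static void check_snc_reliability(void)
	{
		char buf[64];
		FILE *fp = fopen("/sys/devices/system/cpu/offline", "r");

		if (!fp)
			return;
		/* An empty "offline" list is expected to read back as a bare newline. */
		if (fgets(buf, sizeof(buf), fp) && buf[0] != '\n') {
			ksft_print_msg("SNC detection may be unreliable: some CPUs are offline\n");
			snc_unreliable = 1;
		}
		fclose(fp);
	}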

I do think that it is fair to print all the SNC details during detection, but
I am concerned that those messages will be out of sight during test failures,
and I thus do think it is useful to have extra information during test
failure.

Reinette

[1] https://lore.kernel.org/lkml/20240621223859.43471-18-tony.luck@intel.com/

Patch

diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h
index 00d51fa7531c..3dd5d6779786 100644
--- a/tools/testing/selftests/resctrl/resctrl.h
+++ b/tools/testing/selftests/resctrl/resctrl.h
@@ -11,6 +11,7 @@ 
 #include <signal.h>
 #include <dirent.h>
 #include <stdbool.h>
+#include <ctype.h>
 #include <sys/stat.h>
 #include <sys/ioctl.h>
 #include <sys/mount.h>
@@ -49,6 +50,7 @@ 
 		umount_resctrlfs();		\
 		exit(EXIT_FAILURE);		\
 	} while (0)
+#define MAX_SNC		4
 
 /*
  * user_params:		User supplied parameters
@@ -131,6 +133,7 @@  extern pid_t bm_pid, ppid;
 
 extern char llc_occup_path[1024];
 
+int snc_ways(void);
 int get_vendor(void);
 bool check_resctrlfs_support(void);
 int filter_dmesg(void);
diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c
index 1cade75176eb..e4d3624a8817 100644
--- a/tools/testing/selftests/resctrl/resctrlfs.c
+++ b/tools/testing/selftests/resctrl/resctrlfs.c
@@ -156,6 +156,63 @@  int get_domain_id(const char *resource, int cpu_no, int *domain_id)
 	return 0;
 }
 
+/*
+ * Count number of CPUs in a /sys bit map
+ */
+static unsigned int count_sys_bitmap_bits(char *name)
+{
+	FILE *fp = fopen(name, "r");
+	int count = 0, c;
+
+	if (!fp)
+		return 0;
+
+	while ((c = fgetc(fp)) != EOF) {
+		if (!isxdigit(c))
+			continue;
+		switch (c) {
+		case 'f':
+			count++;
+		case '7': case 'b': case 'd': case 'e':
+			count++;
+		case '3': case '5': case '6': case '9': case 'a': case 'c':
+			count++;
+		case '1': case '2': case '4': case '8':
+			count++;
+		}
+	}
+	fclose(fp);
+
+	return count;
+}
+
+/*
+ * Detect SNC by comparing #CPUs in node0 with #CPUs sharing LLC with CPU0.
+ * If some CPUs are offline the numbers may not be exact multiples of each
+ * other. Any offline CPUs on node0 will be also gone from shared_cpu_map of
+ * CPU0 but offline CPUs from other nodes will only make the cache_cpus value
+ * lower. Still try to get the ratio right by preventing the second possibility.
+ */
+int snc_ways(void)
+{
+	int node_cpus, cache_cpus, i;
+
+	node_cpus = count_sys_bitmap_bits("/sys/devices/system/node/node0/cpumap");
+	cache_cpus = count_sys_bitmap_bits("/sys/devices/system/cpu/cpu0/cache/index3/shared_cpu_map");
+
+	if (!node_cpus || !cache_cpus) {
+		fprintf(stderr, "Warning could not determine Sub-NUMA Cluster mode\n");
+		return 1;
+	}
+
+	for (i = 1; i <= MAX_SNC ; i++) {
+		if (i * node_cpus >= cache_cpus)
+			return i;
+	}
+
+	return 1;
+}
+
 /*
  * get_cache_size - Get cache size for a specified CPU
  * @cpu_no:	CPU number
@@ -211,6 +268,8 @@  int get_cache_size(int cpu_no, const char *cache_type, unsigned long *cache_size
 			break;
 	}
 
+	if (cache_num == 3)
+		*cache_size /= snc_ways();
 	return 0;
 }