diff mbox series

[v2,4/4] selftests/resctrl: Add non-contiguous CBMs CAT test

Message ID 10c3afd7f62c63db31a3d4af86529144a5d7bbf9.1702392177.git.maciej.wieczor-retman@intel.com
State New
Headers show
Series selftests/resctrl: Add non-contiguous CBMs in Intel CAT selftest | expand

Commit Message

Maciej Wieczor-Retman Dec. 12, 2023, 2:52 p.m. UTC
Add tests for both L2 and L3 CAT to verify the return values
generated by writing non-contiguous CBMs don't contradict the
reported non-contiguous support information.

Use a logical XOR to confirm return value of write_schemata() and
non-contiguous CBMs support information match.

Signed-off-by: Maciej Wieczor-Retman <maciej.wieczor-retman@intel.com>
---
Changelog v2:
- Redo the patch message. (Ilpo)
- Tidy up __cpuid_count calls. (Ilpo)
- Remove redundant AND in noncont_mask calculations (Ilpo)
- Fix bit_center offset.
- Add newline before function return. (Ilpo)
- Group non-contiguous tests with CAT tests. (Ilpo)
- Use a helper for reading sparse_masks file. (Ilpo)
- Make get_cache_level() available in other source files. (Ilpo)

 tools/testing/selftests/resctrl/cat_test.c    | 75 +++++++++++++++++++
 tools/testing/selftests/resctrl/resctrl.h     |  3 +
 .../testing/selftests/resctrl/resctrl_tests.c |  2 +
 tools/testing/selftests/resctrl/resctrlfs.c   |  2 +-
 4 files changed, 81 insertions(+), 1 deletion(-)

Comments

Reinette Chatre Jan. 8, 2024, 10:42 p.m. UTC | #1
Hi Maciej,

On 12/12/2023 6:52 AM, Maciej Wieczor-Retman wrote:
> Add tests for both L2 and L3 CAT to verify the return values
> generated by writing non-contiguous CBMs don't contradict the
> reported non-contiguous support information.
> 
> Use a logical XOR to confirm return value of write_schemata() and
> non-contiguous CBMs support information match.
> 
> Signed-off-by: Maciej Wieczor-Retman <maciej.wieczor-retman@intel.com>
> ---
> Changelog v2:
> - Redo the patch message. (Ilpo)
> - Tidy up __cpuid_count calls. (Ilpo)
> - Remove redundant AND in noncont_mask calculations (Ilpo)
> - Fix bit_center offset.
> - Add newline before function return. (Ilpo)
> - Group non-contiguous tests with CAT tests. (Ilpo)
> - Use a helper for reading sparse_masks file. (Ilpo)
> - Make get_cache_level() available in other source files. (Ilpo)
> 
>  tools/testing/selftests/resctrl/cat_test.c    | 75 +++++++++++++++++++
>  tools/testing/selftests/resctrl/resctrl.h     |  3 +
>  .../testing/selftests/resctrl/resctrl_tests.c |  2 +
>  tools/testing/selftests/resctrl/resctrlfs.c   |  2 +-
>  4 files changed, 81 insertions(+), 1 deletion(-)
> 
> diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c
> index 7dc7206b3b99..ecf553a89aae 100644
> --- a/tools/testing/selftests/resctrl/cat_test.c
> +++ b/tools/testing/selftests/resctrl/cat_test.c
> @@ -292,6 +292,65 @@ static int cat_run_test(const struct resctrl_test *test, const struct user_param
>  	return ret;
>  }
>  
> +static int noncont_cat_run_test(const struct resctrl_test *test,
> +				const struct user_params *uparams)
> +{
> +	unsigned long full_cache_mask, cont_mask, noncont_mask;
> +	unsigned int eax, ebx, ecx, edx, ret;
> +	int level, bit_center, sparse_masks;
> +	char schemata[64];
> +
> +	/* Check to compare sparse_masks content to cpuid output. */

"cpuid" -> "CPUID" (to note it is an instruction)

> +	sparse_masks = read_info_res_file(test->resource, "sparse_masks");
> +	if (sparse_masks < 0)
> +		return sparse_masks;
> +
> +	level = get_cache_level(test->resource);
> +	if (level < 0)
> +		return -EINVAL;
> +	__cpuid_count(0x10, 4 - level, eax, ebx, ecx, edx);

Please do not invent relationships. Please replace the "4 - level" with
specific index used that depends on particular cache. The cache level
may not even be needed, just use the resource to determine the correct
index.

> +
> +	if (sparse_masks != ((ecx >> 3) & 1))
> +		return -1;

Can a message be displayed to support the debugging this test failure?

> +
> +	/* Write checks initialization. */
> +	ret = get_full_cbm(test->resource, &full_cache_mask);
> +	if (ret < 0)
> +		return ret;

I assume this test failure relies on the error message from get_bit_mask()
that is called via get_full_cbm()?

> +	bit_center = count_bits(full_cache_mask) / 2;
> +	cont_mask = full_cache_mask >> bit_center;
> +
> +	/* Contiguous mask write check. */
> +	snprintf(schemata, sizeof(schemata), "%lx", cont_mask);
> +	ret = write_schemata("", schemata, uparams->cpu, test->resource);
> +	if (ret)
> +		return ret;

How will user know what failed? I am seeing this single test exercise a few scenarios
and it is not obvious to me if the issue will be clear if this test,
noncont_cat_run_test(), fails.

> +
> +	/*
> +	 * Non-contiguous mask write check. CBM has a 0xf hole approximately in the middle.
> +	 * Output is compared with support information to catch any edge case errors.
> +	 */
> +	noncont_mask = ~(0xf << (bit_center - 2)) & full_cache_mask;
> +	snprintf(schemata, sizeof(schemata), "%lx", noncont_mask);
> +	ret = write_schemata("", schemata, uparams->cpu, test->resource);
> +	if (ret && sparse_masks)
> +		ksft_print_msg("Non-contiguous CBMs supported but write failed\n");
> +	else if (ret && !sparse_masks)
> +		ksft_print_msg("Non-contiguous CBMs not supported and write failed as expected\n");
> +	else if (!ret && !sparse_masks)
> +		ksft_print_msg("Non-contiguous CBMs not supported but write succeeded\n");

Can these messages be made more specific with a "write" changed to "write of
non-contiguous CBM"

> +
> +	return !ret == !sparse_masks;

Please return negative on error. Ilpo just did a big cleanup to address this.

> +}
> +
> +static bool noncont_cat_feature_check(const struct resctrl_test *test)
> +{
> +	if (!resctrl_resource_exists(test->resource))
> +		return false;
> +
> +	return resctrl_cache_feature_exists(test->resource, "sparse_masks");
> +}
> +
>  struct resctrl_test l3_cat_test = {
>  	.name = "L3_CAT",
>  	.group = "CAT",
> @@ -299,3 +358,19 @@ struct resctrl_test l3_cat_test = {
>  	.feature_check = test_resource_feature_check,
>  	.run_test = cat_run_test,
>  };
> +
> +struct resctrl_test l3_noncont_cat_test = {
> +	.name = "L3_NONCONT_CAT",
> +	.group = "CAT",
> +	.resource = "L3",
> +	.feature_check = noncont_cat_feature_check,
> +	.run_test = noncont_cat_run_test,
> +};
> +
> +struct resctrl_test l2_noncont_cat_test = {
> +	.name = "L2_NONCONT_CAT",
> +	.group = "CAT",
> +	.resource = "L2",
> +	.feature_check = noncont_cat_feature_check,
> +	.run_test = noncont_cat_run_test,
> +};
> diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h
> index 74041a35d4ba..7b7a48d1fddd 100644
> --- a/tools/testing/selftests/resctrl/resctrl.h
> +++ b/tools/testing/selftests/resctrl/resctrl.h
> @@ -165,6 +165,7 @@ unsigned int count_contiguous_bits(unsigned long val, unsigned int *start);
>  int get_full_cbm(const char *cache_type, unsigned long *mask);
>  int get_mask_no_shareable(const char *cache_type, unsigned long *mask);
>  int get_cache_size(int cpu_no, const char *cache_type, unsigned long *cache_size);
> +int get_cache_level(const char *cache_type);
>  int read_info_res_file(const char *resource, const char *filename);
>  void ctrlc_handler(int signum, siginfo_t *info, void *ptr);
>  int signal_handler_register(void);
> @@ -201,5 +202,7 @@ extern struct resctrl_test mbm_test;
>  extern struct resctrl_test mba_test;
>  extern struct resctrl_test cmt_test;
>  extern struct resctrl_test l3_cat_test;
> +extern struct resctrl_test l3_noncont_cat_test;
> +extern struct resctrl_test l2_noncont_cat_test;
>  
>  #endif /* RESCTRL_H */
> diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c
> index 3044179ee6e9..f3dc1b9696e7 100644
> --- a/tools/testing/selftests/resctrl/resctrl_tests.c
> +++ b/tools/testing/selftests/resctrl/resctrl_tests.c
> @@ -19,6 +19,8 @@ static struct resctrl_test *resctrl_tests[] = {
>  	&mba_test,
>  	&cmt_test,
>  	&l3_cat_test,
> +	&l3_noncont_cat_test,
> +	&l2_noncont_cat_test,
>  };
>  
>  static int detect_vendor(void)
> diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c
> index 8546421f0940..8bd30973fec3 100644
> --- a/tools/testing/selftests/resctrl/resctrlfs.c
> +++ b/tools/testing/selftests/resctrl/resctrlfs.c
> @@ -100,7 +100,7 @@ int umount_resctrlfs(void)
>   *
>   * Return: cache level as integer or -1 if @cache_type is invalid.
>   */
> -static int get_cache_level(const char *cache_type)
> +int get_cache_level(const char *cache_type)
>  {
>  	if (!strcmp(cache_type, "L3"))
>  		return 3;


Reinette
Ilpo Järvinen Jan. 9, 2024, 9:13 a.m. UTC | #2
On Mon, 8 Jan 2024, Reinette Chatre wrote:

> Hi Maciej,
> 
> On 12/12/2023 6:52 AM, Maciej Wieczor-Retman wrote:
> > Add tests for both L2 and L3 CAT to verify the return values
> > generated by writing non-contiguous CBMs don't contradict the
> > reported non-contiguous support information.
> > 
> > Use a logical XOR to confirm return value of write_schemata() and
> > non-contiguous CBMs support information match.
> > 
> > Signed-off-by: Maciej Wieczor-Retman <maciej.wieczor-retman@intel.com>
> > ---
> > Changelog v2:
> > - Redo the patch message. (Ilpo)
> > - Tidy up __cpuid_count calls. (Ilpo)
> > - Remove redundant AND in noncont_mask calculations (Ilpo)
> > - Fix bit_center offset.
> > - Add newline before function return. (Ilpo)
> > - Group non-contiguous tests with CAT tests. (Ilpo)
> > - Use a helper for reading sparse_masks file. (Ilpo)
> > - Make get_cache_level() available in other source files. (Ilpo)
> > 
> >  tools/testing/selftests/resctrl/cat_test.c    | 75 +++++++++++++++++++
> >  tools/testing/selftests/resctrl/resctrl.h     |  3 +
> >  .../testing/selftests/resctrl/resctrl_tests.c |  2 +
> >  tools/testing/selftests/resctrl/resctrlfs.c   |  2 +-
> >  4 files changed, 81 insertions(+), 1 deletion(-)
> > 
> > diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c
> > index 7dc7206b3b99..ecf553a89aae 100644
> > --- a/tools/testing/selftests/resctrl/cat_test.c
> > +++ b/tools/testing/selftests/resctrl/cat_test.c
> > @@ -292,6 +292,65 @@ static int cat_run_test(const struct resctrl_test *test, const struct user_param
> >  	return ret;
> >  }
> >  
> > +static int noncont_cat_run_test(const struct resctrl_test *test,
> > +				const struct user_params *uparams)
> > +{
> > +	unsigned long full_cache_mask, cont_mask, noncont_mask;
> > +	unsigned int eax, ebx, ecx, edx, ret;
> > +	int level, bit_center, sparse_masks;
> > +	char schemata[64];
> > +
> > +	/* Check to compare sparse_masks content to cpuid output. */
> 
> "cpuid" -> "CPUID" (to note it is an instruction)
> 
> > +	sparse_masks = read_info_res_file(test->resource, "sparse_masks");
> > +	if (sparse_masks < 0)
> > +		return sparse_masks;
> > +
> > +	level = get_cache_level(test->resource);
> > +	if (level < 0)
> > +		return -EINVAL;
> > +	__cpuid_count(0x10, 4 - level, eax, ebx, ecx, edx);
> 
> Please do not invent relationships. Please replace the "4 - level" with
> specific index used that depends on particular cache. The cache level
> may not even be needed, just use the resource to determine the correct
> index.

This is actually my fault, I suggested Maciej could use arithmetics there.

> > +
> > +	return !ret == !sparse_masks;
> 
> Please return negative on error. Ilpo just did a big cleanup to address this.

Test failure is not same as an error. So tests should return negative for 
errors which prevent even running test at all, and 0/1 for test 
success/fail.
Reinette Chatre Jan. 9, 2024, 5:17 p.m. UTC | #3
Hi Ilpo,

On 1/9/2024 1:13 AM, Ilpo Järvinen wrote:
> On Mon, 8 Jan 2024, Reinette Chatre wrote:
> 
>> Hi Maciej,
>>
>> On 12/12/2023 6:52 AM, Maciej Wieczor-Retman wrote:
>>> Add tests for both L2 and L3 CAT to verify the return values
>>> generated by writing non-contiguous CBMs don't contradict the
>>> reported non-contiguous support information.
>>>
>>> Use a logical XOR to confirm return value of write_schemata() and
>>> non-contiguous CBMs support information match.
>>>
>>> Signed-off-by: Maciej Wieczor-Retman <maciej.wieczor-retman@intel.com>
>>> ---
>>> Changelog v2:
>>> - Redo the patch message. (Ilpo)
>>> - Tidy up __cpuid_count calls. (Ilpo)
>>> - Remove redundant AND in noncont_mask calculations (Ilpo)
>>> - Fix bit_center offset.
>>> - Add newline before function return. (Ilpo)
>>> - Group non-contiguous tests with CAT tests. (Ilpo)
>>> - Use a helper for reading sparse_masks file. (Ilpo)
>>> - Make get_cache_level() available in other source files. (Ilpo)
>>>
>>>  tools/testing/selftests/resctrl/cat_test.c    | 75 +++++++++++++++++++
>>>  tools/testing/selftests/resctrl/resctrl.h     |  3 +
>>>  .../testing/selftests/resctrl/resctrl_tests.c |  2 +
>>>  tools/testing/selftests/resctrl/resctrlfs.c   |  2 +-
>>>  4 files changed, 81 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c
>>> index 7dc7206b3b99..ecf553a89aae 100644
>>> --- a/tools/testing/selftests/resctrl/cat_test.c
>>> +++ b/tools/testing/selftests/resctrl/cat_test.c
>>> @@ -292,6 +292,65 @@ static int cat_run_test(const struct resctrl_test *test, const struct user_param
>>>  	return ret;
>>>  }
>>>  
>>> +static int noncont_cat_run_test(const struct resctrl_test *test,
>>> +				const struct user_params *uparams)
>>> +{
>>> +	unsigned long full_cache_mask, cont_mask, noncont_mask;
>>> +	unsigned int eax, ebx, ecx, edx, ret;
>>> +	int level, bit_center, sparse_masks;
>>> +	char schemata[64];
>>> +
>>> +	/* Check to compare sparse_masks content to cpuid output. */
>>
>> "cpuid" -> "CPUID" (to note it is an instruction)
>>
>>> +	sparse_masks = read_info_res_file(test->resource, "sparse_masks");
>>> +	if (sparse_masks < 0)
>>> +		return sparse_masks;
>>> +
>>> +	level = get_cache_level(test->resource);
>>> +	if (level < 0)
>>> +		return -EINVAL;
>>> +	__cpuid_count(0x10, 4 - level, eax, ebx, ecx, edx);
>>
>> Please do not invent relationships. Please replace the "4 - level" with
>> specific index used that depends on particular cache. The cache level
>> may not even be needed, just use the resource to determine the correct
>> index.
> 
> This is actually my fault, I suggested Maciej could use arithmetics there.

No problem. The math works for the current values but there is no such
relationship. If hypothetically a new cache level needs to be supported
then this computation cannot be relied upon to continue to be correct.

>>> +	return !ret == !sparse_masks;
>>
>> Please return negative on error. Ilpo just did a big cleanup to address this.
> 
> Test failure is not same as an error. So tests should return negative for 
> errors which prevent even running test at all, and 0/1 for test 
> success/fail.
> 

Thanks for catching this. I missed this subtlety in the framework.

Reinette
Reinette Chatre Jan. 17, 2024, 6:49 p.m. UTC | #4
Hi Maciej,

On 1/17/2024 12:26 AM, Maciej Wieczór-Retman wrote:
> On 2024-01-08 at 14:42:11 -0800, Reinette Chatre wrote:
>> On 12/12/2023 6:52 AM, Maciej Wieczor-Retman wrote:


>>> +
>>> +	if (sparse_masks != ((ecx >> 3) & 1))
>>> +		return -1;
>>
>> Can a message be displayed to support the debugging this test failure?
> 
> Sure, that is a very good idea. I'll add ksft_perror() here.

I do not think ksft_perror() is appropriate since perror() is for
system error messages (something that sets errno). Perhaps just
ksft_print_msg().

>>> +	bit_center = count_bits(full_cache_mask) / 2;
>>> +	cont_mask = full_cache_mask >> bit_center;
>>> +
>>> +	/* Contiguous mask write check. */
>>> +	snprintf(schemata, sizeof(schemata), "%lx", cont_mask);
>>> +	ret = write_schemata("", schemata, uparams->cpu, test->resource);
>>> +	if (ret)
>>> +		return ret;
>>
>> How will user know what failed? I am seeing this single test exercise a few scenarios
>> and it is not obvious to me if the issue will be clear if this test,
>> noncont_cat_run_test(), fails.
> 
> write_schemata() either succeeds with '0' or errors out with a negative value. If
> the contiguous mask write fails, write_schemata should print out what was wrong
> and I believe that the test will report an error rather than failure.

Right. I am trying to understand whether the user will be able to decipher what failed
in case there is an error. Seems like in this case the user is expected to look at the
source code of the test to understand what the test was trying to do at the time it
encountered the failure. In this case user may be "lucky" that this test only has
one write_schemata() call _not_ followed by a ksft_print_msg() so user can use that
reasoning to figure out which write_schemata() failed to further dig what test was
trying to do. 

Reinette
Reinette Chatre Jan. 18, 2024, 5:15 p.m. UTC | #5
Hi Maciej,

On 1/18/2024 4:02 AM, Maciej Wieczór-Retman wrote:
> On 2024-01-17 at 10:49:06 -0800, Reinette Chatre wrote:
>> On 1/17/2024 12:26 AM, Maciej Wieczór-Retman wrote:
>>> On 2024-01-08 at 14:42:11 -0800, Reinette Chatre wrote:
>>>> On 12/12/2023 6:52 AM, Maciej Wieczor-Retman wrote:

>>>>> +	bit_center = count_bits(full_cache_mask) / 2;
>>>>> +	cont_mask = full_cache_mask >> bit_center;
>>>>> +
>>>>> +	/* Contiguous mask write check. */
>>>>> +	snprintf(schemata, sizeof(schemata), "%lx", cont_mask);
>>>>> +	ret = write_schemata("", schemata, uparams->cpu, test->resource);
>>>>> +	if (ret)
>>>>> +		return ret;
>>>>
>>>> How will user know what failed? I am seeing this single test exercise a few scenarios
>>>> and it is not obvious to me if the issue will be clear if this test,
>>>> noncont_cat_run_test(), fails.
>>>
>>> write_schemata() either succeeds with '0' or errors out with a negative value. If
>>> the contiguous mask write fails, write_schemata should print out what was wrong
>>> and I believe that the test will report an error rather than failure.
>>
>> Right. I am trying to understand whether the user will be able to decipher what failed
>> in case there is an error. Seems like in this case the user is expected to look at the
>> source code of the test to understand what the test was trying to do at the time it
>> encountered the failure. In this case user may be "lucky" that this test only has
>> one write_schemata() call _not_ followed by a ksft_print_msg() so user can use that
>> reasoning to figure out which write_schemata() failed to further dig what test was
>> trying to do. 
> 
> When a write_schemata() is executed the string that is being written gets
> printed. If there are multiple calls in a single tests and one fails I'd imagine
> it would be easy for the user to figure out which one failed.

It would be easy for the user the figure out if (a) it is obvious to the user
what schema a particular write_schema() call attempted to write and (b) all the
write_schema() calls attempt to write different schema.

> On a side note I'm not sure if that's true but I'm getting a feeling that the
> harder errors (not just test failures) are more of a clue for developers working
> on the tests. Would you agree that it seems like users probably won't see
> write_schemata() fail here (if the test execution managed to get to this point
> without erroring out due to bad parameters or kernel compiled without required
> options)?

I do agree that users probably won't see such failures. I do not think these
errors are clues to developers working on the tests though, but instead clues
to resctrl developers or kernel development CI systems.

Reinette
Reinette Chatre Jan. 19, 2024, 4:39 p.m. UTC | #6
Hi Maciej,

On 1/18/2024 11:37 PM, Maciej Wieczór-Retman wrote:
> On 2024-01-18 at 09:15:46 -0800, Reinette Chatre wrote:
>> On 1/18/2024 4:02 AM, Maciej Wieczór-Retman wrote:
>>> On 2024-01-17 at 10:49:06 -0800, Reinette Chatre wrote:
>>>> On 1/17/2024 12:26 AM, Maciej Wieczór-Retman wrote:
>>>>> On 2024-01-08 at 14:42:11 -0800, Reinette Chatre wrote:
>>>>>> On 12/12/2023 6:52 AM, Maciej Wieczor-Retman wrote:
>>
>>>>>>> +	bit_center = count_bits(full_cache_mask) / 2;
>>>>>>> +	cont_mask = full_cache_mask >> bit_center;
>>>>>>> +
>>>>>>> +	/* Contiguous mask write check. */
>>>>>>> +	snprintf(schemata, sizeof(schemata), "%lx", cont_mask);
>>>>>>> +	ret = write_schemata("", schemata, uparams->cpu, test->resource);
>>>>>>> +	if (ret)
>>>>>>> +		return ret;
>>>>>>
>>>>>> How will user know what failed? I am seeing this single test exercise a few scenarios
>>>>>> and it is not obvious to me if the issue will be clear if this test,
>>>>>> noncont_cat_run_test(), fails.
>>>>>
>>>>> write_schemata() either succeeds with '0' or errors out with a negative value. If
>>>>> the contiguous mask write fails, write_schemata should print out what was wrong
>>>>> and I believe that the test will report an error rather than failure.
>>>>
>>>> Right. I am trying to understand whether the user will be able to decipher what failed
>>>> in case there is an error. Seems like in this case the user is expected to look at the
>>>> source code of the test to understand what the test was trying to do at the time it
>>>> encountered the failure. In this case user may be "lucky" that this test only has
>>>> one write_schemata() call _not_ followed by a ksft_print_msg() so user can use that
>>>> reasoning to figure out which write_schemata() failed to further dig what test was
>>>> trying to do. 
>>>
>>> When a write_schemata() is executed the string that is being written gets
>>> printed. If there are multiple calls in a single tests and one fails I'd imagine
>>> it would be easy for the user to figure out which one failed.
>>
>> It would be easy for the user the figure out if (a) it is obvious to the user
>> what schema a particular write_schema() call attempted to write and (b) all the
>> write_schema() calls attempt to write different schema.
> 
> Okay, your comment made me wonder if on error the schemata still is printed. I
> double checked in the code and whether write_schemata() fails or not it has a
> goto path where before returning it will print out the schema. So I believe that
> satisfies your (a) condition.

Let me try with an example.
Scenario 1:
The test has the following code:
	...
	write_schemata(..., "0xfff", ...);
	...
	write_schemata(..., "0xf0f", ...);
	...

Scenario 2:
The test has the following code:
	...
	write_schemata(..., schemata, ...);
	...
	write_schemata(..., schemata, ...);
	...

Any failure of write_schemata() in scenario 1 will be easy to trace. As you
state, write_schemata() prints the schemata attempted and it will thus be
easy to look at the code to see which write_schemata() call failed since it
is obvious from the code which schemata was attempted.
A failure of one of the write_schemata() in scenario 2 will not be as easy
to trace since the user first needs to determine what the value of "schemata"
is at each call and that may depend on the platform, bit shifting done in test,
and state of system state at time of test.

> As for (b) depends on what you meant. Other tests that run more than one
> write_schemata() use different ones every time (CAT, MBM, MBA). Do you suggest
> that the non-contiguous test should attempt more schematas? For example shift
> the bit hole from one side to the other? I assumed one CBM with a centered bit
> hole would be enough to check if non-contiguous CBM feature works properly and
> more CBMs would be redundant.

Let me try with an example.
Scenario 1:
The test has the following code:
	...
	write_schemata(..., "0xfff", ...);
	...
	write_schemata(..., "0xf0f", ...);
	...

Scenario 2:
The test has the following code:
	...
	write_schemata(..., "0xfff", ...);
	...
	write_schemata(..., "0xfff", ...);
	...

A failure of either write_schemata() in scenario 1 will be easy to trace since 
the schemata attempted is different in each case. The schemata printed by the
write_schemata() error message can thus easily be connected to the specific
write_schemata() call.
A failure of either write_schemata() in scenario 2 is not so obvious since they
both attempted the same schemata so the error message printed by write_schemata()
could belong to either. 

Reinette
Reinette Chatre Jan. 23, 2024, 5:42 p.m. UTC | #7
Hi Maciej,

On 1/22/2024 11:58 PM, Maciej Wieczór-Retman wrote:
> On 2024-01-22 at 08:32:36 -0800, Reinette Chatre wrote:
>> Hi Maciej,
>>
>> On 1/21/2024 11:56 PM, Maciej Wieczór-Retman wrote:
>>> Hi!
>>>
>>> On 2024-01-19 at 08:39:31 -0800, Reinette Chatre wrote:
>>>> Hi Maciej,
>>>>
>>>> On 1/18/2024 11:37 PM, Maciej Wieczór-Retman wrote:
>>>>> On 2024-01-18 at 09:15:46 -0800, Reinette Chatre wrote:
>>>>>> On 1/18/2024 4:02 AM, Maciej Wieczór-Retman wrote:
>>>>>>> On 2024-01-17 at 10:49:06 -0800, Reinette Chatre wrote:
>>>>>>>> On 1/17/2024 12:26 AM, Maciej Wieczór-Retman wrote:
>>>>>>>>> On 2024-01-08 at 14:42:11 -0800, Reinette Chatre wrote:
>>>>>>>>>> On 12/12/2023 6:52 AM, Maciej Wieczor-Retman wrote:
>>>>>>
>>>>>>>>>>> +	bit_center = count_bits(full_cache_mask) / 2;
>>>>>>>>>>> +	cont_mask = full_cache_mask >> bit_center;
>>>>>>>>>>> +
>>>>>>>>>>> +	/* Contiguous mask write check. */
>>>>>>>>>>> +	snprintf(schemata, sizeof(schemata), "%lx", cont_mask);
>>>>>>>>>>> +	ret = write_schemata("", schemata, uparams->cpu, test->resource);
>>>>>>>>>>> +	if (ret)
>>>>>>>>>>> +		return ret;
>>>>>>>>>>
>>>>>>>>>> How will user know what failed? I am seeing this single test exercise a few scenarios
>>>>>>>>>> and it is not obvious to me if the issue will be clear if this test,
>>>>>>>>>> noncont_cat_run_test(), fails.
>>>>>>>>>
>>>>>>>>> write_schemata() either succeeds with '0' or errors out with a negative value. If
>>>>>>>>> the contiguous mask write fails, write_schemata should print out what was wrong
>>>>>>>>> and I believe that the test will report an error rather than failure.
>>>>>>>>
>>>>>>>> Right. I am trying to understand whether the user will be able to decipher what failed
>>>>>>>> in case there is an error. Seems like in this case the user is expected to look at the
>>>>>>>> source code of the test to understand what the test was trying to do at the time it
>>>>>>>> encountered the failure. In this case user may be "lucky" that this test only has
>>>>>>>> one write_schemata() call _not_ followed by a ksft_print_msg() so user can use that
>>>>>>>> reasoning to figure out which write_schemata() failed to further dig what test was
>>>>>>>> trying to do. 
>>>>>>>
>>>>>>> When a write_schemata() is executed the string that is being written gets
>>>>>>> printed. If there are multiple calls in a single tests and one fails I'd imagine
>>>>>>> it would be easy for the user to figure out which one failed.
>>>>>>
>>>>>> It would be easy for the user the figure out if (a) it is obvious to the user
>>>>>> what schema a particular write_schema() call attempted to write and (b) all the
>>>>>> write_schema() calls attempt to write different schema.
>>>
>>>>> As for (b) depends on what you meant. Other tests that run more than one
>>>>> write_schemata() use different ones every time (CAT, MBM, MBA). Do you suggest
>>>>> that the non-contiguous test should attempt more schematas? For example shift
>>>>> the bit hole from one side to the other? I assumed one CBM with a centered bit
>>>>> hole would be enough to check if non-contiguous CBM feature works properly and
>>>>> more CBMs would be redundant.
>>>>
>>>> Let me try with an example.
>>>> Scenario 1:
>>>> The test has the following code:
>>>> 	...
>>>> 	write_schemata(..., "0xfff", ...);
>>>> 	...
>>>> 	write_schemata(..., "0xf0f", ...);
>>>> 	...
>>>>
>>>> Scenario 2:
>>>> The test has the following code:
>>>> 	...
>>>> 	write_schemata(..., "0xfff", ...);
>>>> 	...
>>>> 	write_schemata(..., "0xfff", ...);
>>>> 	...
>>>>
>>>> A failure of either write_schemata() in scenario 1 will be easy to trace since 
>>>> the schemata attempted is different in each case. The schemata printed by the
>>>> write_schemata() error message can thus easily be connected to the specific
>>>> write_schemata() call.
>>>> A failure of either write_schemata() in scenario 2 is not so obvious since they
>>>> both attempted the same schemata so the error message printed by write_schemata()
>>>> could belong to either. 
>>
>>> I'm sorry to drag this thread out but I want to be sure if I'm right or are you
>>> suggesting something and I missed it?
>>
>> Please just add a ksft_print_msg() to noncont_cat_run_test() when this
>> write_schemata() fails.
> 
> My point all along was that if write_schemata() fails it already prints out all
> the necessary information. I'd like to avoid adding redundant messages so please
> take a look at how it looks now:

Please consider that there may be different perspectives of "necessary information".

> I injected write_schemata() with an error so it will take a path as if write()
> failed with 'Permission denied' as a reason. Here is the output for L3
> non-contiguous CAT test:
> 
> 	[root@spr1 ~]# ./resctrl_tests -t L3_NONCONT_CAT
> 	TAP version 13
> 	# Pass: Check kernel supports resctrl filesystem
> 	# Pass: Check resctrl mountpoint "/sys/fs/resctrl" exists
> 	# resctrl filesystem not mounted
> 	# dmesg: [   18.579861] resctrl: L3 allocation detected
> 	# dmesg: [   18.590395] resctrl: L2 allocation detected
> 	# dmesg: [   18.595181] resctrl: MB allocation detected
> 	# dmesg: [   18.599963] resctrl: L3 monitoring detected
> 	1..1
> 	# Starting L3_NONCONT_CAT test ...
> 	# Mounting resctrl to "/sys/fs/resctrl"
> 	# Write schema "L3:0=ff" to resctrl FS # write() failed : Permission denied
> 	not ok 1 L3_NONCONT_CAT: test
> 	# Totals: pass:0 fail:1 xfail:0 xpass:0 skip:0 error:0

Understood.

> Of course if you still think adding a ksft_print_msg() there would be meaningful
> I'll try to write a sensible message. But I hope you can see what I meant when I
> wrote that the user could already easily see what failed.

I do still believe that it will be helpful if there is a ksft_print_msg() with
something like "Unable to write contiguous CBM" or "Write of contiguous CBM failed"
or ... ? 

Reinette
diff mbox series

Patch

diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c
index 7dc7206b3b99..ecf553a89aae 100644
--- a/tools/testing/selftests/resctrl/cat_test.c
+++ b/tools/testing/selftests/resctrl/cat_test.c
@@ -292,6 +292,65 @@  static int cat_run_test(const struct resctrl_test *test, const struct user_param
 	return ret;
 }
 
+static int noncont_cat_run_test(const struct resctrl_test *test,
+				const struct user_params *uparams)
+{
+	unsigned long full_cache_mask, cont_mask, noncont_mask;
+	unsigned int eax, ebx, ecx, edx, ret;
+	int level, bit_center, sparse_masks;
+	char schemata[64];
+
+	/* Check to compare sparse_masks content to cpuid output. */
+	sparse_masks = read_info_res_file(test->resource, "sparse_masks");
+	if (sparse_masks < 0)
+		return sparse_masks;
+
+	level = get_cache_level(test->resource);
+	if (level < 0)
+		return -EINVAL;
+	__cpuid_count(0x10, 4 - level, eax, ebx, ecx, edx);
+
+	if (sparse_masks != ((ecx >> 3) & 1))
+		return -1;
+
+	/* Write checks initialization. */
+	ret = get_full_cbm(test->resource, &full_cache_mask);
+	if (ret < 0)
+		return ret;
+	bit_center = count_bits(full_cache_mask) / 2;
+	cont_mask = full_cache_mask >> bit_center;
+
+	/* Contiguous mask write check. */
+	snprintf(schemata, sizeof(schemata), "%lx", cont_mask);
+	ret = write_schemata("", schemata, uparams->cpu, test->resource);
+	if (ret)
+		return ret;
+
+	/*
+	 * Non-contiguous mask write check. CBM has a 0xf hole approximately in the middle.
+	 * Output is compared with support information to catch any edge case errors.
+	 */
+	noncont_mask = ~(0xf << (bit_center - 2)) & full_cache_mask;
+	snprintf(schemata, sizeof(schemata), "%lx", noncont_mask);
+	ret = write_schemata("", schemata, uparams->cpu, test->resource);
+	if (ret && sparse_masks)
+		ksft_print_msg("Non-contiguous CBMs supported but write failed\n");
+	else if (ret && !sparse_masks)
+		ksft_print_msg("Non-contiguous CBMs not supported and write failed as expected\n");
+	else if (!ret && !sparse_masks)
+		ksft_print_msg("Non-contiguous CBMs not supported but write succeeded\n");
+
+	return !ret == !sparse_masks;
+}
+
+static bool noncont_cat_feature_check(const struct resctrl_test *test)
+{
+	if (!resctrl_resource_exists(test->resource))
+		return false;
+
+	return resctrl_cache_feature_exists(test->resource, "sparse_masks");
+}
+
 struct resctrl_test l3_cat_test = {
 	.name = "L3_CAT",
 	.group = "CAT",
@@ -299,3 +358,19 @@  struct resctrl_test l3_cat_test = {
 	.feature_check = test_resource_feature_check,
 	.run_test = cat_run_test,
 };
+
+struct resctrl_test l3_noncont_cat_test = {
+	.name = "L3_NONCONT_CAT",
+	.group = "CAT",
+	.resource = "L3",
+	.feature_check = noncont_cat_feature_check,
+	.run_test = noncont_cat_run_test,
+};
+
+struct resctrl_test l2_noncont_cat_test = {
+	.name = "L2_NONCONT_CAT",
+	.group = "CAT",
+	.resource = "L2",
+	.feature_check = noncont_cat_feature_check,
+	.run_test = noncont_cat_run_test,
+};
diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h
index 74041a35d4ba..7b7a48d1fddd 100644
--- a/tools/testing/selftests/resctrl/resctrl.h
+++ b/tools/testing/selftests/resctrl/resctrl.h
@@ -165,6 +165,7 @@  unsigned int count_contiguous_bits(unsigned long val, unsigned int *start);
 int get_full_cbm(const char *cache_type, unsigned long *mask);
 int get_mask_no_shareable(const char *cache_type, unsigned long *mask);
 int get_cache_size(int cpu_no, const char *cache_type, unsigned long *cache_size);
+int get_cache_level(const char *cache_type);
 int read_info_res_file(const char *resource, const char *filename);
 void ctrlc_handler(int signum, siginfo_t *info, void *ptr);
 int signal_handler_register(void);
@@ -201,5 +202,7 @@  extern struct resctrl_test mbm_test;
 extern struct resctrl_test mba_test;
 extern struct resctrl_test cmt_test;
 extern struct resctrl_test l3_cat_test;
+extern struct resctrl_test l3_noncont_cat_test;
+extern struct resctrl_test l2_noncont_cat_test;
 
 #endif /* RESCTRL_H */
diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c
index 3044179ee6e9..f3dc1b9696e7 100644
--- a/tools/testing/selftests/resctrl/resctrl_tests.c
+++ b/tools/testing/selftests/resctrl/resctrl_tests.c
@@ -19,6 +19,8 @@  static struct resctrl_test *resctrl_tests[] = {
 	&mba_test,
 	&cmt_test,
 	&l3_cat_test,
+	&l3_noncont_cat_test,
+	&l2_noncont_cat_test,
 };
 
 static int detect_vendor(void)
diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c
index 8546421f0940..8bd30973fec3 100644
--- a/tools/testing/selftests/resctrl/resctrlfs.c
+++ b/tools/testing/selftests/resctrl/resctrlfs.c
@@ -100,7 +100,7 @@  int umount_resctrlfs(void)
  *
  * Return: cache level as integer or -1 if @cache_type is invalid.
  */
-static int get_cache_level(const char *cache_type)
+int get_cache_level(const char *cache_type)
 {
 	if (!strcmp(cache_type, "L3"))
 		return 3;