diff mbox series

[v2,2/3] x86: Do not prefer ERMS for memset on Zen3+

Message ID 20240206174322.2317679-3-adhemerval.zanella@linaro.org
State New
Headers show
Series x86: Improve ERMS usage on Zen3+ | expand

Commit Message

Adhemerval Zanella Netto Feb. 6, 2024, 5:43 p.m. UTC
On the AMD Zen3+ architecture, the vectorized memset loop performs
slightly better than ERMS (rep stosb).

Checked on x86_64-linux-gnu on Zen3.
---
 sysdeps/x86/dl-cacheinfo.h | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

Comments

Noah Goldstein Feb. 6, 2024, 7:01 p.m. UTC | #1
On Tue, Feb 6, 2024 at 5:43 PM Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
>
> For AMD Zen3+ architecture, the performance of the vectorized loop is
> slightly better than ERMS.
>
> Checked on x86_64-linux-gnu on Zen3.
> ---
>  sysdeps/x86/dl-cacheinfo.h | 16 +++++++++++-----
>  1 file changed, 11 insertions(+), 5 deletions(-)
>
> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> index 74b804c5e6..f2cd6f179d 100644
> --- a/sysdeps/x86/dl-cacheinfo.h
> +++ b/sysdeps/x86/dl-cacheinfo.h
> @@ -1010,11 +1010,17 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>    if (tunable_size > minimum_rep_movsb_threshold)
>      rep_movsb_threshold = tunable_size;
>
> -  /* NB: The default value of the x86_rep_stosb_threshold tunable is the
> -     same as the default value of __x86_rep_stosb_threshold and the
> -     minimum value is fixed.  */
> -  rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
> -                                    long int, NULL);
> +  /* For AMD Zen3+ architecture, the performance of the vectorized loop is
> +     slightly better than ERMS.  */
> +  if (cpu_features->basic.kind == arch_kind_amd)
> +    rep_stosb_threshold = SIZE_MAX;
> +
> +  if (TUNABLE_IS_INITIALIZED (x86_rep_stosb_threshold))
> +    /* NB: The default value of the x86_rep_stosb_threshold tunable is the
> +       same as the default value of __x86_rep_stosb_threshold and the
> +       minimum value is fixed.  */
> +    rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
> +                                      long int, NULL);
Can we keep the unconditional
  rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
                                    long int, NULL);
for targets other than AMD?
It's easier to control the defaults that way.

>
>    TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
>    TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
> --
> 2.34.1
>
Adhemerval Zanella Netto Feb. 7, 2024, 12:14 p.m. UTC | #2
On 06/02/24 16:01, Noah Goldstein wrote:
> On Tue, Feb 6, 2024 at 5:43 PM Adhemerval Zanella
> <adhemerval.zanella@linaro.org> wrote:
>>
>> For AMD Zen3+ architecture, the performance of the vectorized loop is
>> slightly better than ERMS.
>>
>> Checked on x86_64-linux-gnu on Zen3.
>> ---
>>  sysdeps/x86/dl-cacheinfo.h | 16 +++++++++++-----
>>  1 file changed, 11 insertions(+), 5 deletions(-)
>>
>> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
>> index 74b804c5e6..f2cd6f179d 100644
>> --- a/sysdeps/x86/dl-cacheinfo.h
>> +++ b/sysdeps/x86/dl-cacheinfo.h
>> @@ -1010,11 +1010,17 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>>    if (tunable_size > minimum_rep_movsb_threshold)
>>      rep_movsb_threshold = tunable_size;
>>
>> -  /* NB: The default value of the x86_rep_stosb_threshold tunable is the
>> -     same as the default value of __x86_rep_stosb_threshold and the
>> -     minimum value is fixed.  */
>> -  rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
>> -                                    long int, NULL);
>> +  /* For AMD Zen3+ architecture, the performance of the vectorized loop is
>> +     slightly better than ERMS.  */
>> +  if (cpu_features->basic.kind == arch_kind_amd)
>> +    rep_stosb_threshold = SIZE_MAX;
>> +
>> +  if (TUNABLE_IS_INITIALIZED (x86_rep_stosb_threshold))
>> +    /* NB: The default value of the x86_rep_stosb_threshold tunable is the
>> +       same as the default value of __x86_rep_stosb_threshold and the
>> +       minimum value is fixed.  */
>> +    rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
>> +                                      long int, NULL);
> Can we keep the unconditional
>   rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
>                                     long int, NULL);
> for targets other than AMD?
> It's easier to control the defaults that way.

Alright, I will change it to:

  /* NB: The default value of the x86_rep_stosb_threshold tunable is the
     same as the default value of __x86_rep_stosb_threshold and the
     minimum value is fixed.  */
  rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
                                     long int, NULL);
  if (cpu_features->basic.kind == arch_kind_amd
      && !TUNABLE_IS_INITIALIZED (x86_rep_stosb_threshold))
    /* For AMD Zen3+ architecture, the performance of the vectorized loop is
       slightly better than ERMS.  */
    rep_stosb_threshold = SIZE_MAX;
diff mbox series

Patch

diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 74b804c5e6..f2cd6f179d 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1010,11 +1010,17 @@  dl_init_cacheinfo (struct cpu_features *cpu_features)
   if (tunable_size > minimum_rep_movsb_threshold)
     rep_movsb_threshold = tunable_size;
 
-  /* NB: The default value of the x86_rep_stosb_threshold tunable is the
-     same as the default value of __x86_rep_stosb_threshold and the
-     minimum value is fixed.  */
-  rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
-				     long int, NULL);
+  /* For AMD Zen3+ architecture, the performance of the vectorized loop is
+     slightly better than ERMS.  */
+  if (cpu_features->basic.kind == arch_kind_amd)
+    rep_stosb_threshold = SIZE_MAX;
+
+  if (TUNABLE_IS_INITIALIZED (x86_rep_stosb_threshold))
+    /* NB: The default value of the x86_rep_stosb_threshold tunable is the
+       same as the default value of __x86_rep_stosb_threshold and the
+       minimum value is fixed.  */
+    rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
+				       long int, NULL);
 
   TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
   TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);