Message ID | 20240206174322.2317679-2-adhemerval.zanella@linaro.org
---|---
State | New
Series | x86: Improve ERMS usage on Zen3+
On Tue, Feb 6, 2024 at 5:43 PM Adhemerval Zanella <adhemerval.zanella@linaro.org> wrote: > > The REP MOVSB usage on memcpy/memmove does not show much performance > improvement on Zen3/Zen4 cores compared to the vectorized loops. Also, > as from BZ 30994, if the source is aligned and the destination is not > the performance can be 20x slower. > > The performance difference is noticeable with small buffer sizes, closer > to the lower bounds limits when memcpy/memmove starts to use ERMS. The > performance of REP MOVSB is similar to vectorized instruction on the > size limit (the L2 cache). Also, there is no drawback to multiple cores > sharing the cache. > > A new tunable, glibc.cpu.x86_rep_movsb_stop_threshold, allows to set up > the higher bound size to use 'rep movsb'. > > Checked on x86_64-linux-gnu on Zen3. > --- > manual/tunables.texi | 9 +++++++ > sysdeps/x86/dl-cacheinfo.h | 50 +++++++++++++++++++++--------------- > sysdeps/x86/dl-tunables.list | 10 ++++++++ > 3 files changed, 48 insertions(+), 21 deletions(-) > > diff --git a/manual/tunables.texi b/manual/tunables.texi > index be97190d67..ee5d90b91b 100644 > --- a/manual/tunables.texi > +++ b/manual/tunables.texi > @@ -569,6 +569,15 @@ greater than zero, and currently defaults to 2048 bytes. > This tunable is specific to i386 and x86-64. > @end deftp > > +@deftp Tunable glibc.cpu.x86_rep_movsb_stop_threshold > +The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user to > +set the threshold in bytes to stop using "rep movsb". The value must be > +greater than zero, and currently, the default depends on the CPU and the > +cache size. > + > +This tunable is specific to i386 and x86-64. > +@end deftp > + > @deftp Tunable glibc.cpu.x86_rep_stosb_threshold > The @code{glibc.cpu.x86_rep_stosb_threshold} tunable allows the user to > set threshold in bytes to start using "rep stosb". 
The value must be > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h > index d5101615e3..74b804c5e6 100644 > --- a/sysdeps/x86/dl-cacheinfo.h > +++ b/sysdeps/x86/dl-cacheinfo.h > @@ -791,7 +791,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > long int data = -1; > long int shared = -1; > long int shared_per_thread = -1; > - long int core = -1; > unsigned int threads = 0; > unsigned long int level1_icache_size = -1; > unsigned long int level1_icache_linesize = -1; > @@ -809,7 +808,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > if (cpu_features->basic.kind == arch_kind_intel) > { > data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features); > - core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features); > shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features); > shared_per_thread = shared; > > @@ -822,7 +820,8 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > = handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features); > level1_dcache_linesize > = handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features); > - level2_cache_size = core; > + level2_cache_size > + = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features); > level2_cache_assoc > = handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features); > level2_cache_linesize > @@ -835,12 +834,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > level4_cache_size > = handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features); > > - get_common_cache_info (&shared, &shared_per_thread, &threads, core); > + get_common_cache_info (&shared, &shared_per_thread, &threads, > + level2_cache_size); > } > else if (cpu_features->basic.kind == arch_kind_zhaoxin) > { > data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE); > - core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE); > shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE); > shared_per_thread = shared; > > @@ -849,19 +848,19 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > level1_dcache_size = data; > level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC); > level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE); > - level2_cache_size = core; > + level2_cache_size = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE); > level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC); > level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE); > level3_cache_size = shared; > level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC); > level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE); > > - get_common_cache_info (&shared, &shared_per_thread, &threads, core); > + get_common_cache_info (&shared, &shared_per_thread, &threads, > + level2_cache_size); > } > else if (cpu_features->basic.kind == arch_kind_amd) > { > data = handle_amd (_SC_LEVEL1_DCACHE_SIZE); > - core = handle_amd (_SC_LEVEL2_CACHE_SIZE); > shared = handle_amd (_SC_LEVEL3_CACHE_SIZE); > > level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE); > @@ -869,7 +868,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > level1_dcache_size = data; > level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC); > level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE); > - level2_cache_size = core; > + level2_cache_size = handle_amd (_SC_LEVEL2_CACHE_SIZE);; > level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC); > level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE); > level3_cache_size = shared; > @@ -880,12 +879,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > if (shared <= 0) > { > /* No shared L3 cache. All we have is the L2 cache. 
*/ > - shared = core; > + shared = level2_cache_size; > } > else if (cpu_features->basic.family < 0x17) > { > /* Account for exclusive L2 and L3 caches. */ > - shared += core; > + shared += level2_cache_size; > } > > shared_per_thread = shared; > @@ -1028,16 +1027,25 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > SIZE_MAX); > > unsigned long int rep_movsb_stop_threshold; > - /* ERMS feature is implemented from AMD Zen3 architecture and it is > - performing poorly for data above L2 cache size. Henceforth, adding > - an upper bound threshold parameter to limit the usage of Enhanced > - REP MOVSB operations and setting its value to L2 cache size. */ > - if (cpu_features->basic.kind == arch_kind_amd) > - rep_movsb_stop_threshold = core; > - /* Setting the upper bound of ERMS to the computed value of > - non-temporal threshold for architectures other than AMD. */ > - else > - rep_movsb_stop_threshold = non_temporal_threshold; > + /* If the tunable is set and with a valid value (larger than the minimal > + threshold to use ERMS) use it instead of default values. */ > + rep_movsb_stop_threshold = TUNABLE_GET (x86_rep_movsb_stop_threshold, > + long int, NULL); > + if (!TUNABLE_IS_INITIALIZED (x86_rep_movsb_stop_threshold) > + || rep_movsb_stop_threshold <= rep_movsb_threshold) > + { > + /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of > + cases slower than the vectorized path (and for some alignments, > + it is really slow, check BZ #30994). */ > + if (cpu_features->basic.kind == arch_kind_amd) > + rep_movsb_stop_threshold = 0; note that if `size >= rep_movsb_threshold && size >= rep_movsb_stop_threshold` we will use NT stores, not temporal stores. Id think you would want this to be setting the `rep_movsb_threshold` -> `non_temporal_threshold` which would essentially disable `rep movsb` but continue to use the other tunables for temporal/non-temporal decisions. > + else > + /* Setting the upper bound of ERMS to the computed value of > + non-temporal threshold for architectures other than AMD. */ > + rep_movsb_stop_threshold = non_temporal_threshold; > + } > + TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, > + SIZE_MAX); > > cpu_features->data_cache_size = data; > cpu_features->shared_cache_size = shared; > diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list > index 7d82da0dec..80cf5563ab 100644 > --- a/sysdeps/x86/dl-tunables.list > +++ b/sysdeps/x86/dl-tunables.list > @@ -49,6 +49,16 @@ glibc { > # if the tunable value is set by user or not [BZ #27069]. > minval: 1 > } > + x86_rep_movsb_stop_threshold { > + # For AMD CPUs that support ERMS (Zen3+), REP MOVSB is not faster > + # than the vectorized path (and for some destination alignment it > + # is really slow, check BZ #30994). On Intel CPUs, the size limit > + # to use ERMS is [1/8, 1/2] of the size of the chip's cache, check > + # the dl-cacheinfo.h). > + # This tunable allows the caller to set the limit where to use REP > + # MOVB on memcpy/memmove. > + type: SIZE_T > + } > x86_rep_stosb_threshold { > type: SIZE_T > # Since there is overhead to set up REP STOSB operation, REP STOSB > -- > 2.34.1 >
On 06/02/24 15:36, Noah Goldstein wrote: > On Tue, Feb 6, 2024 at 5:43 PM Adhemerval Zanella > <adhemerval.zanella@linaro.org> wrote: >> >> The REP MOVSB usage on memcpy/memmove does not show much performance >> improvement on Zen3/Zen4 cores compared to the vectorized loops. Also, >> as from BZ 30994, if the source is aligned and the destination is not >> the performance can be 20x slower. >> >> The performance difference is noticeable with small buffer sizes, closer >> to the lower bounds limits when memcpy/memmove starts to use ERMS. The >> performance of REP MOVSB is similar to vectorized instruction on the >> size limit (the L2 cache). Also, there is no drawback to multiple cores >> sharing the cache. >> >> A new tunable, glibc.cpu.x86_rep_movsb_stop_threshold, allows to set up >> the higher bound size to use 'rep movsb'. >> >> Checked on x86_64-linux-gnu on Zen3. >> --- >> manual/tunables.texi | 9 +++++++ >> sysdeps/x86/dl-cacheinfo.h | 50 +++++++++++++++++++++--------------- >> sysdeps/x86/dl-tunables.list | 10 ++++++++ >> 3 files changed, 48 insertions(+), 21 deletions(-) >> >> diff --git a/manual/tunables.texi b/manual/tunables.texi >> index be97190d67..ee5d90b91b 100644 >> --- a/manual/tunables.texi >> +++ b/manual/tunables.texi >> @@ -569,6 +569,15 @@ greater than zero, and currently defaults to 2048 bytes. >> This tunable is specific to i386 and x86-64. >> @end deftp >> >> +@deftp Tunable glibc.cpu.x86_rep_movsb_stop_threshold >> +The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user to >> +set the threshold in bytes to stop using "rep movsb". The value must be >> +greater than zero, and currently, the default depends on the CPU and the >> +cache size. >> + >> +This tunable is specific to i386 and x86-64. >> +@end deftp >> + >> @deftp Tunable glibc.cpu.x86_rep_stosb_threshold >> The @code{glibc.cpu.x86_rep_stosb_threshold} tunable allows the user to >> set threshold in bytes to start using "rep stosb". 
The value must be >> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h >> index d5101615e3..74b804c5e6 100644 >> --- a/sysdeps/x86/dl-cacheinfo.h >> +++ b/sysdeps/x86/dl-cacheinfo.h >> @@ -791,7 +791,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) >> long int data = -1; >> long int shared = -1; >> long int shared_per_thread = -1; >> - long int core = -1; >> unsigned int threads = 0; >> unsigned long int level1_icache_size = -1; >> unsigned long int level1_icache_linesize = -1; >> @@ -809,7 +808,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) >> if (cpu_features->basic.kind == arch_kind_intel) >> { >> data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features); >> - core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features); >> shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features); >> shared_per_thread = shared; >> >> @@ -822,7 +820,8 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) >> = handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features); >> level1_dcache_linesize >> = handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features); >> - level2_cache_size = core; >> + level2_cache_size >> + = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features); >> level2_cache_assoc >> = handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features); >> level2_cache_linesize >> @@ -835,12 +834,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) >> level4_cache_size >> = handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features); >> >> - get_common_cache_info (&shared, &shared_per_thread, &threads, core); >> + get_common_cache_info (&shared, &shared_per_thread, &threads, >> + level2_cache_size); >> } >> else if (cpu_features->basic.kind == arch_kind_zhaoxin) >> { >> data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE); >> - core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE); >> shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE); >> shared_per_thread = shared; >> >> @@ -849,19 +848,19 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) >> level1_dcache_size = data; >> level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC); >> level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE); >> - level2_cache_size = core; >> + level2_cache_size = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE); >> level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC); >> level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE); >> level3_cache_size = shared; >> level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC); >> level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE); >> >> - get_common_cache_info (&shared, &shared_per_thread, &threads, core); >> + get_common_cache_info (&shared, &shared_per_thread, &threads, >> + level2_cache_size); >> } >> else if (cpu_features->basic.kind == arch_kind_amd) >> { >> data = handle_amd (_SC_LEVEL1_DCACHE_SIZE); >> - core = handle_amd (_SC_LEVEL2_CACHE_SIZE); >> shared = handle_amd (_SC_LEVEL3_CACHE_SIZE); >> >> level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE); >> @@ -869,7 +868,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) >> level1_dcache_size = data; >> level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC); >> level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE); >> - level2_cache_size = core; >> + level2_cache_size = handle_amd (_SC_LEVEL2_CACHE_SIZE);; >> level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC); >> level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE); >> level3_cache_size = shared; >> @@ -880,12 +879,12 @@ dl_init_cacheinfo (struct cpu_features 
*cpu_features) >> if (shared <= 0) >> { >> /* No shared L3 cache. All we have is the L2 cache. */ >> - shared = core; >> + shared = level2_cache_size; >> } >> else if (cpu_features->basic.family < 0x17) >> { >> /* Account for exclusive L2 and L3 caches. */ >> - shared += core; >> + shared += level2_cache_size; >> } >> >> shared_per_thread = shared; >> @@ -1028,16 +1027,25 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) >> SIZE_MAX); >> >> unsigned long int rep_movsb_stop_threshold; >> - /* ERMS feature is implemented from AMD Zen3 architecture and it is >> - performing poorly for data above L2 cache size. Henceforth, adding >> - an upper bound threshold parameter to limit the usage of Enhanced >> - REP MOVSB operations and setting its value to L2 cache size. */ >> - if (cpu_features->basic.kind == arch_kind_amd) >> - rep_movsb_stop_threshold = core; >> - /* Setting the upper bound of ERMS to the computed value of >> - non-temporal threshold for architectures other than AMD. */ >> - else >> - rep_movsb_stop_threshold = non_temporal_threshold; >> + /* If the tunable is set and with a valid value (larger than the minimal >> + threshold to use ERMS) use it instead of default values. */ >> + rep_movsb_stop_threshold = TUNABLE_GET (x86_rep_movsb_stop_threshold, >> + long int, NULL); >> + if (!TUNABLE_IS_INITIALIZED (x86_rep_movsb_stop_threshold) >> + || rep_movsb_stop_threshold <= rep_movsb_threshold) >> + { >> + /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of >> + cases slower than the vectorized path (and for some alignments, >> + it is really slow, check BZ #30994). */ >> + if (cpu_features->basic.kind == arch_kind_amd) >> + rep_movsb_stop_threshold = 0; > note that if `size >= rep_movsb_threshold && size >= rep_movsb_stop_threshold` > we will use NT stores, not temporal stores. > > Id think you would want this to be setting the > `rep_movsb_threshold` -> `non_temporal_threshold` > which would essentially disable `rep movsb` but continue to > use the other tunables for temporal/non-temporal decisions. My understanding is it will keep using non temporal stores, for instance with a size equal to 25165824 (x86.cpu_features.non_temporal_threshold) the code will: sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S 384 #if defined USE_MULTIARCH && IS_IN (libc) 385 L(movsb_more_2x_vec): 386 cmp __x86_rep_movsb_threshold(%rip), %RDX_LP 387 ja L(movsb) And then: 613 /* If above __x86_rep_movsb_stop_threshold most likely is 614 candidate for NT moves as well. */ 615 cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP 616 jae L(large_memcpy_2x_check) And then skipping 'rep movsb' altogether. And it will check whether to use temporal stores: 683 L(large_memcpy_2x): 684 mov __x86_shared_non_temporal_threshold(%rip), %R11_LP 685 cmp %R11_LP, %RDX_LP 686 jb L(more_8x_vec_check) Maybe one options would to set rep_movsb_stop_threshold to rep_movsb_threshold, it slight clear that the range to use ERMS is a 0 size internal. >> + else >> + /* Setting the upper bound of ERMS to the computed value of >> + non-temporal threshold for architectures other than AMD. 
*/ >> + rep_movsb_stop_threshold = non_temporal_threshold; >> + } >> + TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, >> + SIZE_MAX); >> >> cpu_features->data_cache_size = data; >> cpu_features->shared_cache_size = shared; >> diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list >> index 7d82da0dec..80cf5563ab 100644 >> --- a/sysdeps/x86/dl-tunables.list >> +++ b/sysdeps/x86/dl-tunables.list >> @@ -49,6 +49,16 @@ glibc { >> # if the tunable value is set by user or not [BZ #27069]. >> minval: 1 >> } >> + x86_rep_movsb_stop_threshold { >> + # For AMD CPUs that support ERMS (Zen3+), REP MOVSB is not faster >> + # than the vectorized path (and for some destination alignment it >> + # is really slow, check BZ #30994). On Intel CPUs, the size limit >> + # to use ERMS is [1/8, 1/2] of the size of the chip's cache, check >> + # the dl-cacheinfo.h). >> + # This tunable allows the caller to set the limit where to use REP >> + # MOVB on memcpy/memmove. >> + type: SIZE_T >> + } >> x86_rep_stosb_threshold { >> type: SIZE_T >> # Since there is overhead to set up REP STOSB operation, REP STOSB >> -- >> 2.34.1 >>
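Condensing the assembly walkthrough above, the size dispatch in memmove-vec-unaligned-erms.S can be modelled by the following illustrative C function (simplified: the real code also checks overlap, alignment, and a few other conditions before committing to a path):

#include <stddef.h>

enum copy_path { VEC_SMALL, REP_MOVSB, VEC_TEMPORAL_LOOP, NT_STORES };

/* Rough model of the dispatch order; the thresholds correspond to the
   __x86_rep_movsb_threshold, __x86_rep_movsb_stop_threshold and
   __x86_shared_non_temporal_threshold globals set by dl_init_cacheinfo ().  */
enum copy_path
classify (size_t size, size_t rep_movsb_threshold,
          size_t rep_movsb_stop_threshold, size_t non_temporal_threshold)
{
  if (size <= rep_movsb_threshold)
    return VEC_SMALL;               /* falls through at L(movsb_more_2x_vec) */
  if (size < rep_movsb_stop_threshold)
    return REP_MOVSB;               /* L(movsb) */
  /* Past the stop threshold REP MOVSB is skipped, but temporal vectorized
     copies are still used until the non-temporal cutoff.  */
  if (size < non_temporal_threshold)
    return VEC_TEMPORAL_LOOP;       /* L(more_8x_vec_check) */
  return NT_STORES;                 /* L(large_memcpy_2x) */
}

With the patch's AMD default of rep_movsb_stop_threshold = 0, every size above rep_movsb_threshold lands in the last two cases, which is the behaviour described in the reply above.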
On Wed, Feb 7, 2024 at 12:10 PM Adhemerval Zanella Netto <adhemerval.zanella@linaro.org> wrote: > > > > On 06/02/24 15:36, Noah Goldstein wrote: > > On Tue, Feb 6, 2024 at 5:43 PM Adhemerval Zanella > > <adhemerval.zanella@linaro.org> wrote: > >> > >> The REP MOVSB usage on memcpy/memmove does not show much performance > >> improvement on Zen3/Zen4 cores compared to the vectorized loops. Also, > >> as from BZ 30994, if the source is aligned and the destination is not > >> the performance can be 20x slower. > >> > >> The performance difference is noticeable with small buffer sizes, closer > >> to the lower bounds limits when memcpy/memmove starts to use ERMS. The > >> performance of REP MOVSB is similar to vectorized instruction on the > >> size limit (the L2 cache). Also, there is no drawback to multiple cores > >> sharing the cache. > >> > >> A new tunable, glibc.cpu.x86_rep_movsb_stop_threshold, allows to set up > >> the higher bound size to use 'rep movsb'. > >> > >> Checked on x86_64-linux-gnu on Zen3. > >> --- > >> manual/tunables.texi | 9 +++++++ > >> sysdeps/x86/dl-cacheinfo.h | 50 +++++++++++++++++++++--------------- > >> sysdeps/x86/dl-tunables.list | 10 ++++++++ > >> 3 files changed, 48 insertions(+), 21 deletions(-) > >> > >> diff --git a/manual/tunables.texi b/manual/tunables.texi > >> index be97190d67..ee5d90b91b 100644 > >> --- a/manual/tunables.texi > >> +++ b/manual/tunables.texi > >> @@ -569,6 +569,15 @@ greater than zero, and currently defaults to 2048 bytes. > >> This tunable is specific to i386 and x86-64. > >> @end deftp > >> > >> +@deftp Tunable glibc.cpu.x86_rep_movsb_stop_threshold > >> +The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user to > >> +set the threshold in bytes to stop using "rep movsb". The value must be > >> +greater than zero, and currently, the default depends on the CPU and the > >> +cache size. > >> + > >> +This tunable is specific to i386 and x86-64. > >> +@end deftp > >> + > >> @deftp Tunable glibc.cpu.x86_rep_stosb_threshold > >> The @code{glibc.cpu.x86_rep_stosb_threshold} tunable allows the user to > >> set threshold in bytes to start using "rep stosb". 
The value must be > >> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h > >> index d5101615e3..74b804c5e6 100644 > >> --- a/sysdeps/x86/dl-cacheinfo.h > >> +++ b/sysdeps/x86/dl-cacheinfo.h > >> @@ -791,7 +791,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > >> long int data = -1; > >> long int shared = -1; > >> long int shared_per_thread = -1; > >> - long int core = -1; > >> unsigned int threads = 0; > >> unsigned long int level1_icache_size = -1; > >> unsigned long int level1_icache_linesize = -1; > >> @@ -809,7 +808,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > >> if (cpu_features->basic.kind == arch_kind_intel) > >> { > >> data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features); > >> - core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features); > >> shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features); > >> shared_per_thread = shared; > >> > >> @@ -822,7 +820,8 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > >> = handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features); > >> level1_dcache_linesize > >> = handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features); > >> - level2_cache_size = core; > >> + level2_cache_size > >> + = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features); > >> level2_cache_assoc > >> = handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features); > >> level2_cache_linesize > >> @@ -835,12 +834,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > >> level4_cache_size > >> = handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features); > >> > >> - get_common_cache_info (&shared, &shared_per_thread, &threads, core); > >> + get_common_cache_info (&shared, &shared_per_thread, &threads, > >> + level2_cache_size); > >> } > >> else if (cpu_features->basic.kind == arch_kind_zhaoxin) > >> { > >> data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE); > >> - core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE); > >> shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE); > >> shared_per_thread = shared; > >> > >> @@ -849,19 +848,19 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > >> level1_dcache_size = data; > >> level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC); > >> level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE); > >> - level2_cache_size = core; > >> + level2_cache_size = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE); > >> level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC); > >> level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE); > >> level3_cache_size = shared; > >> level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC); > >> level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE); > >> > >> - get_common_cache_info (&shared, &shared_per_thread, &threads, core); > >> + get_common_cache_info (&shared, &shared_per_thread, &threads, > >> + level2_cache_size); > >> } > >> else if (cpu_features->basic.kind == arch_kind_amd) > >> { > >> data = handle_amd (_SC_LEVEL1_DCACHE_SIZE); > >> - core = handle_amd (_SC_LEVEL2_CACHE_SIZE); > >> shared = handle_amd (_SC_LEVEL3_CACHE_SIZE); > >> > >> level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE); > >> @@ -869,7 +868,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > >> level1_dcache_size = data; > >> level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC); > >> level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE); > >> - level2_cache_size = core; > >> + level2_cache_size = handle_amd (_SC_LEVEL2_CACHE_SIZE);; > >> level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC); > >> 
level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE); > >> level3_cache_size = shared; > >> @@ -880,12 +879,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > >> if (shared <= 0) > >> { > >> /* No shared L3 cache. All we have is the L2 cache. */ > >> - shared = core; > >> + shared = level2_cache_size; > >> } > >> else if (cpu_features->basic.family < 0x17) > >> { > >> /* Account for exclusive L2 and L3 caches. */ > >> - shared += core; > >> + shared += level2_cache_size; > >> } > >> > >> shared_per_thread = shared; > >> @@ -1028,16 +1027,25 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > >> SIZE_MAX); > >> > >> unsigned long int rep_movsb_stop_threshold; > >> - /* ERMS feature is implemented from AMD Zen3 architecture and it is > >> - performing poorly for data above L2 cache size. Henceforth, adding > >> - an upper bound threshold parameter to limit the usage of Enhanced > >> - REP MOVSB operations and setting its value to L2 cache size. */ > >> - if (cpu_features->basic.kind == arch_kind_amd) > >> - rep_movsb_stop_threshold = core; > >> - /* Setting the upper bound of ERMS to the computed value of > >> - non-temporal threshold for architectures other than AMD. */ > >> - else > >> - rep_movsb_stop_threshold = non_temporal_threshold; > >> + /* If the tunable is set and with a valid value (larger than the minimal > >> + threshold to use ERMS) use it instead of default values. */ > >> + rep_movsb_stop_threshold = TUNABLE_GET (x86_rep_movsb_stop_threshold, > >> + long int, NULL); > >> + if (!TUNABLE_IS_INITIALIZED (x86_rep_movsb_stop_threshold) > >> + || rep_movsb_stop_threshold <= rep_movsb_threshold) > >> + { > >> + /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of > >> + cases slower than the vectorized path (and for some alignments, > >> + it is really slow, check BZ #30994). */ > >> + if (cpu_features->basic.kind == arch_kind_amd) > >> + rep_movsb_stop_threshold = 0; > > note that if `size >= rep_movsb_threshold && size >= rep_movsb_stop_threshold` > > we will use NT stores, not temporal stores. > > > > Id think you would want this to be setting the > > `rep_movsb_threshold` -> `non_temporal_threshold` > > which would essentially disable `rep movsb` but continue to > > use the other tunables for temporal/non-temporal decisions. > > My understanding is it will keep using non temporal stores, for instance > with a size equal to 25165824 (x86.cpu_features.non_temporal_threshold) > the code will: > > sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S > > 384 #if defined USE_MULTIARCH && IS_IN (libc) > 385 L(movsb_more_2x_vec): > 386 cmp __x86_rep_movsb_threshold(%rip), %RDX_LP > 387 ja L(movsb) > > > And then: > > 613 /* If above __x86_rep_movsb_stop_threshold most likely is > 614 candidate for NT moves as well. */ > 615 cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP > 616 jae L(large_memcpy_2x_check) > > And then skipping 'rep movsb' altogether. And it will check whether > to use temporal stores: > > 683 L(large_memcpy_2x): > 684 mov __x86_shared_non_temporal_threshold(%rip), %R11_LP > 685 cmp %R11_LP, %RDX_LP > 686 jb L(more_8x_vec_check) > Ah you're right, forgot about that code! > > Maybe one options would to set rep_movsb_stop_threshold to rep_movsb_threshold, > it slight clear that the range to use ERMS is a 0 size internal. So never use `rep movsb`? If that where the case, I would just set `rep_movsb_threshold` to `non_temporal_threshold` as the default. 
> > > >> + else > >> + /* Setting the upper bound of ERMS to the computed value of > >> + non-temporal threshold for architectures other than AMD. */ > >> + rep_movsb_stop_threshold = non_temporal_threshold; > >> + } > >> + TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, > >> + SIZE_MAX); > >> > >> cpu_features->data_cache_size = data; > >> cpu_features->shared_cache_size = shared; > >> diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list > >> index 7d82da0dec..80cf5563ab 100644 > >> --- a/sysdeps/x86/dl-tunables.list > >> +++ b/sysdeps/x86/dl-tunables.list > >> @@ -49,6 +49,16 @@ glibc { > >> # if the tunable value is set by user or not [BZ #27069]. > >> minval: 1 > >> } > >> + x86_rep_movsb_stop_threshold { > >> + # For AMD CPUs that support ERMS (Zen3+), REP MOVSB is not faster > >> + # than the vectorized path (and for some destination alignment it > >> + # is really slow, check BZ #30994). On Intel CPUs, the size limit > >> + # to use ERMS is [1/8, 1/2] of the size of the chip's cache, check > >> + # the dl-cacheinfo.h). > >> + # This tunable allows the caller to set the limit where to use REP > >> + # MOVB on memcpy/memmove. > >> + type: SIZE_T > >> + } > >> x86_rep_stosb_threshold { > >> type: SIZE_T > >> # Since there is overhead to set up REP STOSB operation, REP STOSB > >> -- > >> 2.34.1 > >>
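A minimal sketch of the alternative suggested here, using the same stand-in style as the earlier sketch (illustrative only, not committed code):

/* Variant suggested above: make REP MOVSB unreachable on AMD by raising
   the start threshold to the non-temporal cutoff, and keep the stop
   threshold at that same cutoff for every vendor, so the temporal versus
   non-temporal decision is still governed by its own tunable.  */
void
pick_thresholds_alternative (int is_amd,
                             unsigned long non_temporal_threshold,
                             unsigned long *rep_movsb_threshold,
                             unsigned long *rep_movsb_stop_threshold)
{
  if (is_amd)
    *rep_movsb_threshold = non_temporal_threshold;
  *rep_movsb_stop_threshold = non_temporal_threshold;
}

In the dispatch model sketched earlier this leaves no size strictly between rep_movsb_threshold and rep_movsb_stop_threshold on AMD, so REP MOVSB is never selected while the temporal/NT split is untouched.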
On 07/02/24 14:39, Noah Goldstein wrote: > On Wed, Feb 7, 2024 at 12:10 PM Adhemerval Zanella Netto > <adhemerval.zanella@linaro.org> wrote: >> >> >> >> On 06/02/24 15:36, Noah Goldstein wrote: >>> On Tue, Feb 6, 2024 at 5:43 PM Adhemerval Zanella >>> <adhemerval.zanella@linaro.org> wrote: >>>> >>>> The REP MOVSB usage on memcpy/memmove does not show much performance >>>> improvement on Zen3/Zen4 cores compared to the vectorized loops. Also, >>>> as from BZ 30994, if the source is aligned and the destination is not >>>> the performance can be 20x slower. >>>> >>>> The performance difference is noticeable with small buffer sizes, closer >>>> to the lower bounds limits when memcpy/memmove starts to use ERMS. The >>>> performance of REP MOVSB is similar to vectorized instruction on the >>>> size limit (the L2 cache). Also, there is no drawback to multiple cores >>>> sharing the cache. >>>> >>>> A new tunable, glibc.cpu.x86_rep_movsb_stop_threshold, allows to set up >>>> the higher bound size to use 'rep movsb'. >>>> >>>> Checked on x86_64-linux-gnu on Zen3. >>>> --- >>>> manual/tunables.texi | 9 +++++++ >>>> sysdeps/x86/dl-cacheinfo.h | 50 +++++++++++++++++++++--------------- >>>> sysdeps/x86/dl-tunables.list | 10 ++++++++ >>>> 3 files changed, 48 insertions(+), 21 deletions(-) >>>> >>>> diff --git a/manual/tunables.texi b/manual/tunables.texi >>>> index be97190d67..ee5d90b91b 100644 >>>> --- a/manual/tunables.texi >>>> +++ b/manual/tunables.texi >>>> @@ -569,6 +569,15 @@ greater than zero, and currently defaults to 2048 bytes. >>>> This tunable is specific to i386 and x86-64. >>>> @end deftp >>>> >>>> +@deftp Tunable glibc.cpu.x86_rep_movsb_stop_threshold >>>> +The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user to >>>> +set the threshold in bytes to stop using "rep movsb". The value must be >>>> +greater than zero, and currently, the default depends on the CPU and the >>>> +cache size. >>>> + >>>> +This tunable is specific to i386 and x86-64. >>>> +@end deftp >>>> + >>>> @deftp Tunable glibc.cpu.x86_rep_stosb_threshold >>>> The @code{glibc.cpu.x86_rep_stosb_threshold} tunable allows the user to >>>> set threshold in bytes to start using "rep stosb". 
The value must be >>>> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h >>>> index d5101615e3..74b804c5e6 100644 >>>> --- a/sysdeps/x86/dl-cacheinfo.h >>>> +++ b/sysdeps/x86/dl-cacheinfo.h >>>> @@ -791,7 +791,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) >>>> long int data = -1; >>>> long int shared = -1; >>>> long int shared_per_thread = -1; >>>> - long int core = -1; >>>> unsigned int threads = 0; >>>> unsigned long int level1_icache_size = -1; >>>> unsigned long int level1_icache_linesize = -1; >>>> @@ -809,7 +808,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) >>>> if (cpu_features->basic.kind == arch_kind_intel) >>>> { >>>> data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features); >>>> - core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features); >>>> shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features); >>>> shared_per_thread = shared; >>>> >>>> @@ -822,7 +820,8 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) >>>> = handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features); >>>> level1_dcache_linesize >>>> = handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features); >>>> - level2_cache_size = core; >>>> + level2_cache_size >>>> + = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features); >>>> level2_cache_assoc >>>> = handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features); >>>> level2_cache_linesize >>>> @@ -835,12 +834,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) >>>> level4_cache_size >>>> = handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features); >>>> >>>> - get_common_cache_info (&shared, &shared_per_thread, &threads, core); >>>> + get_common_cache_info (&shared, &shared_per_thread, &threads, >>>> + level2_cache_size); >>>> } >>>> else if (cpu_features->basic.kind == arch_kind_zhaoxin) >>>> { >>>> data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE); >>>> - core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE); >>>> shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE); >>>> shared_per_thread = shared; >>>> >>>> @@ -849,19 +848,19 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) >>>> level1_dcache_size = data; >>>> level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC); >>>> level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE); >>>> - level2_cache_size = core; >>>> + level2_cache_size = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE); >>>> level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC); >>>> level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE); >>>> level3_cache_size = shared; >>>> level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC); >>>> level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE); >>>> >>>> - get_common_cache_info (&shared, &shared_per_thread, &threads, core); >>>> + get_common_cache_info (&shared, &shared_per_thread, &threads, >>>> + level2_cache_size); >>>> } >>>> else if (cpu_features->basic.kind == arch_kind_amd) >>>> { >>>> data = handle_amd (_SC_LEVEL1_DCACHE_SIZE); >>>> - core = handle_amd (_SC_LEVEL2_CACHE_SIZE); >>>> shared = handle_amd (_SC_LEVEL3_CACHE_SIZE); >>>> >>>> level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE); >>>> @@ -869,7 +868,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) >>>> level1_dcache_size = data; >>>> level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC); >>>> level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE); >>>> - level2_cache_size = core; >>>> + level2_cache_size = handle_amd (_SC_LEVEL2_CACHE_SIZE);; >>>> level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC); >>>> 
level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE); >>>> level3_cache_size = shared; >>>> @@ -880,12 +879,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) >>>> if (shared <= 0) >>>> { >>>> /* No shared L3 cache. All we have is the L2 cache. */ >>>> - shared = core; >>>> + shared = level2_cache_size; >>>> } >>>> else if (cpu_features->basic.family < 0x17) >>>> { >>>> /* Account for exclusive L2 and L3 caches. */ >>>> - shared += core; >>>> + shared += level2_cache_size; >>>> } >>>> >>>> shared_per_thread = shared; >>>> @@ -1028,16 +1027,25 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) >>>> SIZE_MAX); >>>> >>>> unsigned long int rep_movsb_stop_threshold; >>>> - /* ERMS feature is implemented from AMD Zen3 architecture and it is >>>> - performing poorly for data above L2 cache size. Henceforth, adding >>>> - an upper bound threshold parameter to limit the usage of Enhanced >>>> - REP MOVSB operations and setting its value to L2 cache size. */ >>>> - if (cpu_features->basic.kind == arch_kind_amd) >>>> - rep_movsb_stop_threshold = core; >>>> - /* Setting the upper bound of ERMS to the computed value of >>>> - non-temporal threshold for architectures other than AMD. */ >>>> - else >>>> - rep_movsb_stop_threshold = non_temporal_threshold; >>>> + /* If the tunable is set and with a valid value (larger than the minimal >>>> + threshold to use ERMS) use it instead of default values. */ >>>> + rep_movsb_stop_threshold = TUNABLE_GET (x86_rep_movsb_stop_threshold, >>>> + long int, NULL); >>>> + if (!TUNABLE_IS_INITIALIZED (x86_rep_movsb_stop_threshold) >>>> + || rep_movsb_stop_threshold <= rep_movsb_threshold) >>>> + { >>>> + /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of >>>> + cases slower than the vectorized path (and for some alignments, >>>> + it is really slow, check BZ #30994). */ >>>> + if (cpu_features->basic.kind == arch_kind_amd) >>>> + rep_movsb_stop_threshold = 0; >>> note that if `size >= rep_movsb_threshold && size >= rep_movsb_stop_threshold` >>> we will use NT stores, not temporal stores. >>> >>> Id think you would want this to be setting the >>> `rep_movsb_threshold` -> `non_temporal_threshold` >>> which would essentially disable `rep movsb` but continue to >>> use the other tunables for temporal/non-temporal decisions. >> >> My understanding is it will keep using non temporal stores, for instance >> with a size equal to 25165824 (x86.cpu_features.non_temporal_threshold) >> the code will: >> >> sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S >> >> 384 #if defined USE_MULTIARCH && IS_IN (libc) >> 385 L(movsb_more_2x_vec): >> 386 cmp __x86_rep_movsb_threshold(%rip), %RDX_LP >> 387 ja L(movsb) >> >> >> And then: >> >> 613 /* If above __x86_rep_movsb_stop_threshold most likely is >> 614 candidate for NT moves as well. */ >> 615 cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP >> 616 jae L(large_memcpy_2x_check) >> >> And then skipping 'rep movsb' altogether. And it will check whether >> to use temporal stores: >> >> 683 L(large_memcpy_2x): >> 684 mov __x86_shared_non_temporal_threshold(%rip), %R11_LP >> 685 cmp %R11_LP, %RDX_LP >> 686 jb L(more_8x_vec_check) >> > > Ah you're right, forgot about that code! > > >> >> Maybe one options would to set rep_movsb_stop_threshold to rep_movsb_threshold, >> it slight clear that the range to use ERMS is a 0 size internal. > So never use `rep movsb`? > If that where the case, I would just set `rep_movsb_threshold` to > `non_temporal_threshold` > as the default. 
It works as well; it is essentially the same as setting rep_movsb_threshold to rep_movsb_stop_threshold.
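Plugging both encodings into the classify () model sketched above shows the equivalence; neither leaves any size that selects the REP MOVSB path. The snippet below assumes that earlier classify () sketch is available; the threshold values are arbitrary examples, except the non-temporal figure quoted earlier in the thread.

#include <assert.h>
#include <stddef.h>

/* Assumes the classify () sketch and enum copy_path from the earlier model.  */
void
check_no_erms (size_t start, size_t stop, size_t nt)
{
  for (size_t size = 1; size < 2 * nt; size += 4096)
    assert (classify (size, start, stop, nt) != REP_MOVSB);
}

int
main (void)
{
  size_t nt = 25165824;            /* the x86.cpu_features.non_temporal_threshold
                                      value mentioned above */
  check_no_erms (4096, 4096, nt);  /* stop threshold == start threshold */
  check_no_erms (nt, nt, nt);      /* start threshold raised to the NT cutoff */
  return 0;
}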
On 07/02/24 15:06, Adhemerval Zanella Netto wrote: > > > On 07/02/24 14:39, Noah Goldstein wrote: >> On Wed, Feb 7, 2024 at 12:10 PM Adhemerval Zanella Netto >> <adhemerval.zanella@linaro.org> wrote: >>> >>> >>> >>> On 06/02/24 15:36, Noah Goldstein wrote: >>>> On Tue, Feb 6, 2024 at 5:43 PM Adhemerval Zanella >>>> <adhemerval.zanella@linaro.org> wrote: >>>>> >>>>> The REP MOVSB usage on memcpy/memmove does not show much performance >>>>> improvement on Zen3/Zen4 cores compared to the vectorized loops. Also, >>>>> as from BZ 30994, if the source is aligned and the destination is not >>>>> the performance can be 20x slower. >>>>> >>>>> The performance difference is noticeable with small buffer sizes, closer >>>>> to the lower bounds limits when memcpy/memmove starts to use ERMS. The >>>>> performance of REP MOVSB is similar to vectorized instruction on the >>>>> size limit (the L2 cache). Also, there is no drawback to multiple cores >>>>> sharing the cache. >>>>> >>>>> A new tunable, glibc.cpu.x86_rep_movsb_stop_threshold, allows to set up >>>>> the higher bound size to use 'rep movsb'. >>>>> >>>>> Checked on x86_64-linux-gnu on Zen3. >>>>> --- >>>>> manual/tunables.texi | 9 +++++++ >>>>> sysdeps/x86/dl-cacheinfo.h | 50 +++++++++++++++++++++--------------- >>>>> sysdeps/x86/dl-tunables.list | 10 ++++++++ >>>>> 3 files changed, 48 insertions(+), 21 deletions(-) >>>>> >>>>> diff --git a/manual/tunables.texi b/manual/tunables.texi >>>>> index be97190d67..ee5d90b91b 100644 >>>>> --- a/manual/tunables.texi >>>>> +++ b/manual/tunables.texi >>>>> @@ -569,6 +569,15 @@ greater than zero, and currently defaults to 2048 bytes. >>>>> This tunable is specific to i386 and x86-64. >>>>> @end deftp >>>>> >>>>> +@deftp Tunable glibc.cpu.x86_rep_movsb_stop_threshold >>>>> +The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user to >>>>> +set the threshold in bytes to stop using "rep movsb". The value must be >>>>> +greater than zero, and currently, the default depends on the CPU and the >>>>> +cache size. >>>>> + >>>>> +This tunable is specific to i386 and x86-64. >>>>> +@end deftp >>>>> + >>>>> @deftp Tunable glibc.cpu.x86_rep_stosb_threshold >>>>> The @code{glibc.cpu.x86_rep_stosb_threshold} tunable allows the user to >>>>> set threshold in bytes to start using "rep stosb". 
The value must be >>>>> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h >>>>> index d5101615e3..74b804c5e6 100644 >>>>> --- a/sysdeps/x86/dl-cacheinfo.h >>>>> +++ b/sysdeps/x86/dl-cacheinfo.h >>>>> @@ -791,7 +791,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) >>>>> long int data = -1; >>>>> long int shared = -1; >>>>> long int shared_per_thread = -1; >>>>> - long int core = -1; >>>>> unsigned int threads = 0; >>>>> unsigned long int level1_icache_size = -1; >>>>> unsigned long int level1_icache_linesize = -1; >>>>> @@ -809,7 +808,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) >>>>> if (cpu_features->basic.kind == arch_kind_intel) >>>>> { >>>>> data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features); >>>>> - core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features); >>>>> shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features); >>>>> shared_per_thread = shared; >>>>> >>>>> @@ -822,7 +820,8 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) >>>>> = handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features); >>>>> level1_dcache_linesize >>>>> = handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features); >>>>> - level2_cache_size = core; >>>>> + level2_cache_size >>>>> + = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features); >>>>> level2_cache_assoc >>>>> = handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features); >>>>> level2_cache_linesize >>>>> @@ -835,12 +834,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) >>>>> level4_cache_size >>>>> = handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features); >>>>> >>>>> - get_common_cache_info (&shared, &shared_per_thread, &threads, core); >>>>> + get_common_cache_info (&shared, &shared_per_thread, &threads, >>>>> + level2_cache_size); >>>>> } >>>>> else if (cpu_features->basic.kind == arch_kind_zhaoxin) >>>>> { >>>>> data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE); >>>>> - core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE); >>>>> shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE); >>>>> shared_per_thread = shared; >>>>> >>>>> @@ -849,19 +848,19 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) >>>>> level1_dcache_size = data; >>>>> level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC); >>>>> level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE); >>>>> - level2_cache_size = core; >>>>> + level2_cache_size = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE); >>>>> level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC); >>>>> level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE); >>>>> level3_cache_size = shared; >>>>> level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC); >>>>> level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE); >>>>> >>>>> - get_common_cache_info (&shared, &shared_per_thread, &threads, core); >>>>> + get_common_cache_info (&shared, &shared_per_thread, &threads, >>>>> + level2_cache_size); >>>>> } >>>>> else if (cpu_features->basic.kind == arch_kind_amd) >>>>> { >>>>> data = handle_amd (_SC_LEVEL1_DCACHE_SIZE); >>>>> - core = handle_amd (_SC_LEVEL2_CACHE_SIZE); >>>>> shared = handle_amd (_SC_LEVEL3_CACHE_SIZE); >>>>> >>>>> level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE); >>>>> @@ -869,7 +868,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) >>>>> level1_dcache_size = data; >>>>> level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC); >>>>> level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE); >>>>> - level2_cache_size = core; >>>>> + level2_cache_size = handle_amd (_SC_LEVEL2_CACHE_SIZE);; >>>>> 
level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC); >>>>> level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE); >>>>> level3_cache_size = shared; >>>>> @@ -880,12 +879,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) >>>>> if (shared <= 0) >>>>> { >>>>> /* No shared L3 cache. All we have is the L2 cache. */ >>>>> - shared = core; >>>>> + shared = level2_cache_size; >>>>> } >>>>> else if (cpu_features->basic.family < 0x17) >>>>> { >>>>> /* Account for exclusive L2 and L3 caches. */ >>>>> - shared += core; >>>>> + shared += level2_cache_size; >>>>> } >>>>> >>>>> shared_per_thread = shared; >>>>> @@ -1028,16 +1027,25 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) >>>>> SIZE_MAX); >>>>> >>>>> unsigned long int rep_movsb_stop_threshold; >>>>> - /* ERMS feature is implemented from AMD Zen3 architecture and it is >>>>> - performing poorly for data above L2 cache size. Henceforth, adding >>>>> - an upper bound threshold parameter to limit the usage of Enhanced >>>>> - REP MOVSB operations and setting its value to L2 cache size. */ >>>>> - if (cpu_features->basic.kind == arch_kind_amd) >>>>> - rep_movsb_stop_threshold = core; >>>>> - /* Setting the upper bound of ERMS to the computed value of >>>>> - non-temporal threshold for architectures other than AMD. */ >>>>> - else >>>>> - rep_movsb_stop_threshold = non_temporal_threshold; >>>>> + /* If the tunable is set and with a valid value (larger than the minimal >>>>> + threshold to use ERMS) use it instead of default values. */ >>>>> + rep_movsb_stop_threshold = TUNABLE_GET (x86_rep_movsb_stop_threshold, >>>>> + long int, NULL); >>>>> + if (!TUNABLE_IS_INITIALIZED (x86_rep_movsb_stop_threshold) >>>>> + || rep_movsb_stop_threshold <= rep_movsb_threshold) >>>>> + { >>>>> + /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of >>>>> + cases slower than the vectorized path (and for some alignments, >>>>> + it is really slow, check BZ #30994). */ >>>>> + if (cpu_features->basic.kind == arch_kind_amd) >>>>> + rep_movsb_stop_threshold = 0; >>>> note that if `size >= rep_movsb_threshold && size >= rep_movsb_stop_threshold` >>>> we will use NT stores, not temporal stores. >>>> >>>> Id think you would want this to be setting the >>>> `rep_movsb_threshold` -> `non_temporal_threshold` >>>> which would essentially disable `rep movsb` but continue to >>>> use the other tunables for temporal/non-temporal decisions. >>> >>> My understanding is it will keep using non temporal stores, for instance >>> with a size equal to 25165824 (x86.cpu_features.non_temporal_threshold) >>> the code will: >>> >>> sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S >>> >>> 384 #if defined USE_MULTIARCH && IS_IN (libc) >>> 385 L(movsb_more_2x_vec): >>> 386 cmp __x86_rep_movsb_threshold(%rip), %RDX_LP >>> 387 ja L(movsb) >>> >>> >>> And then: >>> >>> 613 /* If above __x86_rep_movsb_stop_threshold most likely is >>> 614 candidate for NT moves as well. */ >>> 615 cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP >>> 616 jae L(large_memcpy_2x_check) >>> >>> And then skipping 'rep movsb' altogether. And it will check whether >>> to use temporal stores: >>> >>> 683 L(large_memcpy_2x): >>> 684 mov __x86_shared_non_temporal_threshold(%rip), %R11_LP >>> 685 cmp %R11_LP, %RDX_LP >>> 686 jb L(more_8x_vec_check) >>> >> >> Ah you're right, forgot about that code! >> >> >>> >>> Maybe one options would to set rep_movsb_stop_threshold to rep_movsb_threshold, >>> it slight clear that the range to use ERMS is a 0 size internal. 
>> So never use `rep movsb`? >> If that where the case, I would just set `rep_movsb_threshold` to >> `non_temporal_threshold` >> as the default. > > It works as well, it is essentially the same as setting rep_movsb_threshold to > rep_movsb_stop_threshold. > And I think by just setting rep_movsb_threshold, it makes the rep_movsb_stop_threshold tunable less appealing (it would be a matter to adjust the existing x86_rep_movsb_threshold tunable to enable ERMS).
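As a concrete illustration of that point: with the start-threshold-only approach, a user who wants ERMS back on a Zen3 machine would only need something like GLIBC_TUNABLES=glibc.cpu.x86_rep_movsb_threshold=1048576 in the environment (the value is purely illustrative), whereas with the stop-threshold default from this patch the user would instead raise glibc.cpu.x86_rep_movsb_stop_threshold to re-open the REP MOVSB window.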
On Wed, Feb 7, 2024 at 6:10 PM Adhemerval Zanella Netto <adhemerval.zanella@linaro.org> wrote: > > > > On 07/02/24 15:06, Adhemerval Zanella Netto wrote: > > > > > > On 07/02/24 14:39, Noah Goldstein wrote: > >> On Wed, Feb 7, 2024 at 12:10 PM Adhemerval Zanella Netto > >> <adhemerval.zanella@linaro.org> wrote: > >>> > >>> > >>> > >>> On 06/02/24 15:36, Noah Goldstein wrote: > >>>> On Tue, Feb 6, 2024 at 5:43 PM Adhemerval Zanella > >>>> <adhemerval.zanella@linaro.org> wrote: > >>>>> > >>>>> The REP MOVSB usage on memcpy/memmove does not show much performance > >>>>> improvement on Zen3/Zen4 cores compared to the vectorized loops. Also, > >>>>> as from BZ 30994, if the source is aligned and the destination is not > >>>>> the performance can be 20x slower. > >>>>> > >>>>> The performance difference is noticeable with small buffer sizes, closer > >>>>> to the lower bounds limits when memcpy/memmove starts to use ERMS. The > >>>>> performance of REP MOVSB is similar to vectorized instruction on the > >>>>> size limit (the L2 cache). Also, there is no drawback to multiple cores > >>>>> sharing the cache. > >>>>> > >>>>> A new tunable, glibc.cpu.x86_rep_movsb_stop_threshold, allows to set up > >>>>> the higher bound size to use 'rep movsb'. > >>>>> > >>>>> Checked on x86_64-linux-gnu on Zen3. > >>>>> --- > >>>>> manual/tunables.texi | 9 +++++++ > >>>>> sysdeps/x86/dl-cacheinfo.h | 50 +++++++++++++++++++++--------------- > >>>>> sysdeps/x86/dl-tunables.list | 10 ++++++++ > >>>>> 3 files changed, 48 insertions(+), 21 deletions(-) > >>>>> > >>>>> diff --git a/manual/tunables.texi b/manual/tunables.texi > >>>>> index be97190d67..ee5d90b91b 100644 > >>>>> --- a/manual/tunables.texi > >>>>> +++ b/manual/tunables.texi > >>>>> @@ -569,6 +569,15 @@ greater than zero, and currently defaults to 2048 bytes. > >>>>> This tunable is specific to i386 and x86-64. > >>>>> @end deftp > >>>>> > >>>>> +@deftp Tunable glibc.cpu.x86_rep_movsb_stop_threshold > >>>>> +The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user to > >>>>> +set the threshold in bytes to stop using "rep movsb". The value must be > >>>>> +greater than zero, and currently, the default depends on the CPU and the > >>>>> +cache size. > >>>>> + > >>>>> +This tunable is specific to i386 and x86-64. > >>>>> +@end deftp > >>>>> + > >>>>> @deftp Tunable glibc.cpu.x86_rep_stosb_threshold > >>>>> The @code{glibc.cpu.x86_rep_stosb_threshold} tunable allows the user to > >>>>> set threshold in bytes to start using "rep stosb". 
The value must be > >>>>> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h > >>>>> index d5101615e3..74b804c5e6 100644 > >>>>> --- a/sysdeps/x86/dl-cacheinfo.h > >>>>> +++ b/sysdeps/x86/dl-cacheinfo.h > >>>>> @@ -791,7 +791,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > >>>>> long int data = -1; > >>>>> long int shared = -1; > >>>>> long int shared_per_thread = -1; > >>>>> - long int core = -1; > >>>>> unsigned int threads = 0; > >>>>> unsigned long int level1_icache_size = -1; > >>>>> unsigned long int level1_icache_linesize = -1; > >>>>> @@ -809,7 +808,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > >>>>> if (cpu_features->basic.kind == arch_kind_intel) > >>>>> { > >>>>> data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features); > >>>>> - core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features); > >>>>> shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features); > >>>>> shared_per_thread = shared; > >>>>> > >>>>> @@ -822,7 +820,8 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > >>>>> = handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features); > >>>>> level1_dcache_linesize > >>>>> = handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features); > >>>>> - level2_cache_size = core; > >>>>> + level2_cache_size > >>>>> + = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features); > >>>>> level2_cache_assoc > >>>>> = handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features); > >>>>> level2_cache_linesize > >>>>> @@ -835,12 +834,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > >>>>> level4_cache_size > >>>>> = handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features); > >>>>> > >>>>> - get_common_cache_info (&shared, &shared_per_thread, &threads, core); > >>>>> + get_common_cache_info (&shared, &shared_per_thread, &threads, > >>>>> + level2_cache_size); > >>>>> } > >>>>> else if (cpu_features->basic.kind == arch_kind_zhaoxin) > >>>>> { > >>>>> data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE); > >>>>> - core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE); > >>>>> shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE); > >>>>> shared_per_thread = shared; > >>>>> > >>>>> @@ -849,19 +848,19 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > >>>>> level1_dcache_size = data; > >>>>> level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC); > >>>>> level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE); > >>>>> - level2_cache_size = core; > >>>>> + level2_cache_size = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE); > >>>>> level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC); > >>>>> level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE); > >>>>> level3_cache_size = shared; > >>>>> level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC); > >>>>> level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE); > >>>>> > >>>>> - get_common_cache_info (&shared, &shared_per_thread, &threads, core); > >>>>> + get_common_cache_info (&shared, &shared_per_thread, &threads, > >>>>> + level2_cache_size); > >>>>> } > >>>>> else if (cpu_features->basic.kind == arch_kind_amd) > >>>>> { > >>>>> data = handle_amd (_SC_LEVEL1_DCACHE_SIZE); > >>>>> - core = handle_amd (_SC_LEVEL2_CACHE_SIZE); > >>>>> shared = handle_amd (_SC_LEVEL3_CACHE_SIZE); > >>>>> > >>>>> level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE); > >>>>> @@ -869,7 +868,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > >>>>> level1_dcache_size = data; > >>>>> level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC); > >>>>> level1_dcache_linesize = handle_amd 
(_SC_LEVEL1_DCACHE_LINESIZE); > >>>>> - level2_cache_size = core; > >>>>> + level2_cache_size = handle_amd (_SC_LEVEL2_CACHE_SIZE);; > >>>>> level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC); > >>>>> level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE); > >>>>> level3_cache_size = shared; > >>>>> @@ -880,12 +879,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > >>>>> if (shared <= 0) > >>>>> { > >>>>> /* No shared L3 cache. All we have is the L2 cache. */ > >>>>> - shared = core; > >>>>> + shared = level2_cache_size; > >>>>> } > >>>>> else if (cpu_features->basic.family < 0x17) > >>>>> { > >>>>> /* Account for exclusive L2 and L3 caches. */ > >>>>> - shared += core; > >>>>> + shared += level2_cache_size; > >>>>> } > >>>>> > >>>>> shared_per_thread = shared; > >>>>> @@ -1028,16 +1027,25 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > >>>>> SIZE_MAX); > >>>>> > >>>>> unsigned long int rep_movsb_stop_threshold; > >>>>> - /* ERMS feature is implemented from AMD Zen3 architecture and it is > >>>>> - performing poorly for data above L2 cache size. Henceforth, adding > >>>>> - an upper bound threshold parameter to limit the usage of Enhanced > >>>>> - REP MOVSB operations and setting its value to L2 cache size. */ > >>>>> - if (cpu_features->basic.kind == arch_kind_amd) > >>>>> - rep_movsb_stop_threshold = core; > >>>>> - /* Setting the upper bound of ERMS to the computed value of > >>>>> - non-temporal threshold for architectures other than AMD. */ > >>>>> - else > >>>>> - rep_movsb_stop_threshold = non_temporal_threshold; > >>>>> + /* If the tunable is set and with a valid value (larger than the minimal > >>>>> + threshold to use ERMS) use it instead of default values. */ > >>>>> + rep_movsb_stop_threshold = TUNABLE_GET (x86_rep_movsb_stop_threshold, > >>>>> + long int, NULL); > >>>>> + if (!TUNABLE_IS_INITIALIZED (x86_rep_movsb_stop_threshold) > >>>>> + || rep_movsb_stop_threshold <= rep_movsb_threshold) > >>>>> + { > >>>>> + /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of > >>>>> + cases slower than the vectorized path (and for some alignments, > >>>>> + it is really slow, check BZ #30994). */ > >>>>> + if (cpu_features->basic.kind == arch_kind_amd) > >>>>> + rep_movsb_stop_threshold = 0; > >>>> note that if `size >= rep_movsb_threshold && size >= rep_movsb_stop_threshold` > >>>> we will use NT stores, not temporal stores. > >>>> > >>>> Id think you would want this to be setting the > >>>> `rep_movsb_threshold` -> `non_temporal_threshold` > >>>> which would essentially disable `rep movsb` but continue to > >>>> use the other tunables for temporal/non-temporal decisions. > >>> > >>> My understanding is it will keep using non temporal stores, for instance > >>> with a size equal to 25165824 (x86.cpu_features.non_temporal_threshold) > >>> the code will: > >>> > >>> sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S > >>> > >>> 384 #if defined USE_MULTIARCH && IS_IN (libc) > >>> 385 L(movsb_more_2x_vec): > >>> 386 cmp __x86_rep_movsb_threshold(%rip), %RDX_LP > >>> 387 ja L(movsb) > >>> > >>> > >>> And then: > >>> > >>> 613 /* If above __x86_rep_movsb_stop_threshold most likely is > >>> 614 candidate for NT moves as well. */ > >>> 615 cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP > >>> 616 jae L(large_memcpy_2x_check) > >>> > >>> And then skipping 'rep movsb' altogether. 
> >>> And it will check whether to use temporal stores:
> >>>
> >>> 683 L(large_memcpy_2x):
> >>> 684         mov     __x86_shared_non_temporal_threshold(%rip), %R11_LP
> >>> 685         cmp     %R11_LP, %RDX_LP
> >>> 686         jb      L(more_8x_vec_check)
> >>>
> >>
> >> Ah, you're right, I forgot about that code!
> >>
> >>>
> >>> Maybe one option would be to set rep_movsb_stop_threshold to rep_movsb_threshold;
> >>> it makes it slightly clearer that the range to use ERMS is a zero-size interval.
> >> So never use `rep movsb`?
> >> If that were the case, I would just set `rep_movsb_threshold` to
> >> `non_temporal_threshold` as the default.
> >
> > It works as well; it is essentially the same as setting rep_movsb_threshold to
> > rep_movsb_stop_threshold.
> >
>
> And I think that by just setting rep_movsb_threshold, it makes the
> rep_movsb_stop_threshold tunable less appealing (it would be a matter of
> adjusting the existing x86_rep_movsb_threshold tunable to enable ERMS).

I would think that is a good thing? Fewer configs to juggle.
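The threshold ordering discussed above can be summarized with a small C sketch (illustrative only, not the glibc implementation; the real dispatch in memmove-vec-unaligned-erms.S is written in assembly and also depends on overlap, alignment and CPU features):

  #include <stddef.h>

  enum copy_path { PATH_VEC_LOOP, PATH_REP_MOVSB, PATH_NT_STORES };

  /* Mirrors the cmp/ja/jae/jb chain quoted above.  */
  enum copy_path
  choose_path (size_t size, size_t rep_movsb_threshold,
               size_t rep_movsb_stop_threshold,
               size_t non_temporal_threshold)
  {
    if (size <= rep_movsb_threshold)      /* 'ja L(movsb)' not taken.  */
      return PATH_VEC_LOOP;
    if (size < rep_movsb_stop_threshold)  /* Inside the ERMS window.  */
      return PATH_REP_MOVSB;
    if (size < non_temporal_threshold)    /* 'jb L(more_8x_vec_check)'.  */
      return PATH_VEC_LOOP;
    return PATH_NT_STORES;
  }

In this sketch, a rep_movsb_stop_threshold less than or equal to rep_movsb_threshold makes the PATH_REP_MOVSB branch unreachable, which is the zero-size ERMS interval mentioned above, while the temporal/non-temporal decision for larger copies is unaffected.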
[AMD Official Use Only - General]

On 08/02/24 3:37, Noah Goldstein wrote:
> On Wed, Feb 7, 2024 at 6:10 PM Adhemerval Zanella Netto
> <adhemerval.zanella@linaro.org> wrote:
> >
> > And I think that by just setting rep_movsb_threshold, it makes the
> > rep_movsb_stop_threshold tunable less appealing (it would be a matter of
> > adjusting the existing x86_rep_movsb_threshold tunable to enable ERMS).

I agree with you. LGTM

> I would think that is a good thing? Fewer configs to juggle.
diff --git a/manual/tunables.texi b/manual/tunables.texi
index be97190d67..ee5d90b91b 100644
--- a/manual/tunables.texi
+++ b/manual/tunables.texi
@@ -569,6 +569,15 @@ greater than zero, and currently defaults to 2048 bytes.
 This tunable is specific to i386 and x86-64.
 @end deftp
 
+@deftp Tunable glibc.cpu.x86_rep_movsb_stop_threshold
+The @code{glibc.cpu.x86_rep_movsb_stop_threshold} tunable allows the user to
+set the threshold in bytes to stop using "rep movsb".  The value must be
+greater than zero, and currently the default depends on the CPU and the
+cache size.
+
+This tunable is specific to i386 and x86-64.
+@end deftp
+
 @deftp Tunable glibc.cpu.x86_rep_stosb_threshold
 The @code{glibc.cpu.x86_rep_stosb_threshold} tunable allows the user to
 set threshold in bytes to start using "rep stosb".  The value must be
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index d5101615e3..74b804c5e6 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -791,7 +791,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   long int data = -1;
   long int shared = -1;
   long int shared_per_thread = -1;
-  long int core = -1;
   unsigned int threads = 0;
   unsigned long int level1_icache_size = -1;
   unsigned long int level1_icache_linesize = -1;
@@ -809,7 +808,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   if (cpu_features->basic.kind == arch_kind_intel)
     {
       data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
-      core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
       shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
       shared_per_thread = shared;
 
@@ -822,7 +820,8 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
         = handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
       level1_dcache_linesize
         = handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
-      level2_cache_size = core;
+      level2_cache_size
+        = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
       level2_cache_assoc
         = handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
       level2_cache_linesize
@@ -835,12 +834,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
       level4_cache_size
         = handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);
 
-      get_common_cache_info (&shared, &shared_per_thread, &threads, core);
+      get_common_cache_info (&shared, &shared_per_thread, &threads,
+                             level2_cache_size);
     }
   else if (cpu_features->basic.kind == arch_kind_zhaoxin)
     {
       data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
-      core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
       shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
       shared_per_thread = shared;
 
@@ -849,19 +848,19 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
       level1_dcache_size = data;
       level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC);
       level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE);
-      level2_cache_size = core;
+      level2_cache_size = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
       level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC);
       level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE);
       level3_cache_size = shared;
       level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
       level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);
 
-      get_common_cache_info (&shared, &shared_per_thread, &threads, core);
+      get_common_cache_info (&shared, &shared_per_thread, &threads,
+                             level2_cache_size);
     }
   else if (cpu_features->basic.kind == arch_kind_amd)
     {
       data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
-      core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
       shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
 
       level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE);
@@ -869,7 +868,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
       level1_dcache_size = data;
       level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC);
       level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE);
-      level2_cache_size = core;
+      level2_cache_size = handle_amd (_SC_LEVEL2_CACHE_SIZE);
       level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC);
       level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE);
       level3_cache_size = shared;
@@ -880,12 +879,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
       if (shared <= 0)
         {
           /* No shared L3 cache.  All we have is the L2 cache.  */
-          shared = core;
+          shared = level2_cache_size;
         }
       else if (cpu_features->basic.family < 0x17)
         {
           /* Account for exclusive L2 and L3 caches.  */
-          shared += core;
+          shared += level2_cache_size;
         }
 
       shared_per_thread = shared;
@@ -1028,16 +1027,25 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
                            SIZE_MAX);
 
   unsigned long int rep_movsb_stop_threshold;
-  /* ERMS feature is implemented from AMD Zen3 architecture and it is
-     performing poorly for data above L2 cache size. Henceforth, adding
-     an upper bound threshold parameter to limit the usage of Enhanced
-     REP MOVSB operations and setting its value to L2 cache size.  */
-  if (cpu_features->basic.kind == arch_kind_amd)
-    rep_movsb_stop_threshold = core;
-  /* Setting the upper bound of ERMS to the computed value of
-     non-temporal threshold for architectures other than AMD.  */
-  else
-    rep_movsb_stop_threshold = non_temporal_threshold;
+  /* If the tunable is set and with a valid value (larger than the minimal
+     threshold to use ERMS) use it instead of default values.  */
+  rep_movsb_stop_threshold = TUNABLE_GET (x86_rep_movsb_stop_threshold,
+                                          long int, NULL);
+  if (!TUNABLE_IS_INITIALIZED (x86_rep_movsb_stop_threshold)
+      || rep_movsb_stop_threshold <= rep_movsb_threshold)
+    {
+      /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
+         cases slower than the vectorized path (and for some alignments,
+         it is really slow, check BZ #30994).  */
+      if (cpu_features->basic.kind == arch_kind_amd)
+        rep_movsb_stop_threshold = 0;
+      else
+        /* Setting the upper bound of ERMS to the computed value of
+           non-temporal threshold for architectures other than AMD.  */
+        rep_movsb_stop_threshold = non_temporal_threshold;
+    }
+  TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
+                           SIZE_MAX);
 
   cpu_features->data_cache_size = data;
   cpu_features->shared_cache_size = shared;
diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
index 7d82da0dec..80cf5563ab 100644
--- a/sysdeps/x86/dl-tunables.list
+++ b/sysdeps/x86/dl-tunables.list
@@ -49,6 +49,16 @@ glibc {
       # if the tunable value is set by user or not [BZ #27069].
       minval: 1
     }
+    x86_rep_movsb_stop_threshold {
+      # For AMD CPUs that support ERMS (Zen3+), REP MOVSB is not faster
+      # than the vectorized path (and for some destination alignments it
+      # is really slow, check BZ #30994).  On Intel CPUs, the size limit
+      # to use ERMS is [1/8, 1/2] of the size of the chip's cache (check
+      # dl-cacheinfo.h).
+      # This tunable allows the caller to set the limit where to use REP
+      # MOVSB on memcpy/memmove.
+      type: SIZE_T
+    }
     x86_rep_stosb_threshold {
       type: SIZE_T
       # Since there is overhead to set up REP STOSB operation, REP STOSB
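As a usage illustration (the value below is arbitrary, not a recommendation), the new stop threshold could be set at run time through the standard GLIBC_TUNABLES environment variable, for example:

  GLIBC_TUNABLES=glibc.cpu.x86_rep_movsb_stop_threshold=1048576 ./application

With such a setting, memcpy/memmove would stop considering 'rep movsb' for copies of 1 MiB and larger; whether those larger copies use temporal or non-temporal stores is still governed by glibc.cpu.x86_non_temporal_threshold.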