Message ID: 20250603195332.2822499-4-adhemerval.zanella@linaro.org
State:      New
Series:     Optimize inet_ntop
On 6/3/25 12:51, Adhemerval Zanella wrote:

>  static inline char *put_uint8 (uint8_t word, char *tp)

Put the function name at the start of a line.

> +  intptr_t s = 1;

There should be no need for intptr_t; 'int' is good enough and clearer
and should generate the same machine code.

> +      tp[2] = _itoa_lower_digits[word % 10];

Don't consult an array; just add '0'.

> +  if (word >= 100)
> ...
> +  if (word >= 10)

Since this is for performance, shouldn't the code avoid conditional
branches?  Something like the following, say:

static inline char *
put_uint8 (uint8_t word, char *tp)
{
  *tp = '0' + word / 100;
  tp += 100 <= word;
  *tp = '0' + word / 10 % 10;
  tp += 10 <= word;
  *tp++ = '0' + word % 10;
  return tp;
}

Alternatively, if we're going to have conditional branches, there is no
need to test (word >= 100) if (word >= 10) is false.
Adhemerval Zanella <adhemerval.zanella@linaro.org> writes:

> The performance results on aarch64 Neoverse1 with gcc 14.2.1:
>
> * master
>
> aarch64-linux-gnu-master$ ./benchtests/bench-inet_ntop_ipv4
>  "inet_ntop_ipv4": {
>   "workload-ipv4-random": {
>    "duration": 1.43067e+09,
>    "iterations": 8e+06,
>    "reciprocal-throughput": 178.572,
>    "latency": 179.096,
>    "max-throughput": 5.59997e+06,
>    "min-throughput": 5.58359e+06
>   }
> aarch64-linux-gnu-master$ ./benchtests/bench-inet_ntop_ipv6
>  "inet_ntop_ipv6": {
>   "workload-ipv6-random": {
>    "duration": 1.68539e+09,
>    "iterations": 4e+06,
>    "reciprocal-throughput": 421.307,
>    "latency": 421.388,
>    "max-throughput": 2.37357e+06,
>    "min-throughput": 2.37311e+06
>   }
>  }
>
> * patched
>
> aarch64-linux-gnu$ ./benchtests/bench-inet_ntop_ipv4
>  "inet_ntop_ipv4": {
>   "workload-ipv4-random": {
>    "duration": 1.06324e+09,
>    "iterations": 4.4e+07,
>    "reciprocal-throughput": 24.1509,
>    "latency": 24.178,
>    "max-throughput": 4.14063e+07,
>    "min-throughput": 4.13599e+07
>   }
> aarch64-linux-gnu$ ./benchtests/bench-inet_ntop_ipv6
>  "inet_ntop_ipv6": {
>   "workload-ipv6-random": {
>    "duration": 1.08476e+09,
>    "iterations": 2.4e+07,
>    "reciprocal-throughput": 45.2794,
>    "latency": 45.1174,
>    "max-throughput": 2.20851e+07,
>    "min-throughput": 2.21644e+07
>   }
>  }
>
> Checked on aarch64-linux-gnu and x86_64-linux-gnu.
> ---
>  resolv/inet_ntop.c | 134 +++++++++++++++++++++++++++++----------------
>  1 file changed, 87 insertions(+), 47 deletions(-)

This part of the patch series looks good to me.  So:

Reviewed-by: Collin Funk <collin.funk1@gmail.com>

I see similar results as well:

$ grep '^model name' /proc/cpuinfo | sed 1q
model name	: AMD Ryzen 7 3700X 8-Core Processor
$ gcc --version | sed 1q
gcc (GCC) 15.1.1 20250521 (Red Hat 15.1.1-2)

# Pre patch
$ cat benchtests/bench.out
{"timing_type": "hp_timing",
 "functions": {
  "inet_ntop_ipv4": {
   "workload-ipv4-random": {
    "duration": 3.94723e+09,
    "iterations": 8e+06,
    "reciprocal-throughput": 492.274,
    "latency": 494.534,
    "max-throughput": 2.03139e+06,
    "min-throughput": 2.0221e+06
   }
  }
 ,
  "inet_ntop_ipv6": {
   "workload-ipv6-random": {
    "duration": 4.67659e+09,
    "iterations": 4e+06,
    "reciprocal-throughput": 1164.13,
    "latency": 1174.17,
    "max-throughput": 859010,
    "min-throughput": 851668
   }
  }
 }
}

# Post patch
$ cat benchtests/bench.out
{"timing_type": "hp_timing",
 "functions": {
  "inet_ntop_ipv4": {
   "workload-ipv4-random": {
    "duration": 3.67214e+09,
    "iterations": 8.4e+07,
    "reciprocal-throughput": 43.7862,
    "latency": 43.6457,
    "max-throughput": 2.28382e+07,
    "min-throughput": 2.29118e+07
   }
  }
 ,
  "inet_ntop_ipv6": {
   "workload-ipv6-random": {
    "duration": 3.9483e+09,
    "iterations": 3.6e+07,
    "reciprocal-throughput": 109.916,
    "latency": 109.434,
    "max-throughput": 9.09786e+06,
    "min-throughput": 9.13793e+06
   }
  }
 }
}

Thanks,
Collin
* Adhemerval Zanella:

> -static const char *
> +static inline const char *
>  inet_ntop6 (const u_char *src, char *dst, socklen_t size)
>  {
>  	/*
> @@ -108,7 +123,7 @@ inet_ntop6 (const u_char *src, char *dst, socklen_t size)
>  	 */
>  	char tmp[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255"], *tp;
>  	struct { int base, len; } best, cur;
> -	u_int words[NS_IN6ADDRSZ / NS_INT16SZ];
> +	uint16_t words[NS_IN6ADDRSZ / NS_INT16SZ] = { 0 };
>  	int i;
>
>  	/*
> @@ -116,7 +131,6 @@ inet_ntop6 (const u_char *src, char *dst, socklen_t size)
>  	 * Copy the input (bytewise) array into a wordwise array.
>  	 * Find the longest run of 0x00's in src[] for :: shorthanding.
>  	 */
> -	memset(words, '\0', sizeof words);
>  	for (i = 0; i < NS_IN6ADDRSZ; i += 2)
>  		words[i / 2] = (src[i] << 8) | src[i + 1];

The words copy shouldn't really be needed, GCC can usually combine two
byte accesses into single 16-bit loads.

It should be possible to optimize put_uint16 using SWAR techniques.  We
don't need to do byte-wise output, either.  We can use full 32-bit
writes, potentially with overlaps.

If the user-supplied output buffer has the maximum possible size, we
don't need the temporary copy.  (This applies to the IPv4 variant, too.)

Thanks,
Florian
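[Florian does not spell the SWAR variant out; purely as an illustration
(not part of the patch, and untested in the thread), a branch-free
put_uint16 along those lines could spread the four nibbles of a 16-bit
group into four bytes, convert them to hex digits in parallel, and emit
the result with plain byte stores.  It assumes at least four writable
bytes at TP, which the oversized TMP buffer in inet_ntop6 provides:]

static inline char *
put_uint16_swar (uint16_t word, char *tp)
{
  /* Spread the four nibbles into separate bytes, most significant
     nibble in the top byte.  */
  uint32_t x = ((uint32_t) (word & 0xf000) << 12)
	       | ((uint32_t) (word & 0x0f00) << 8)
	       | ((uint32_t) (word & 0x00f0) << 4)
	       | (word & 0x000f);
  /* Per byte: nibble + 6 sets bit 4 exactly when the nibble is > 9, so
     GT9 holds 0 or 1 in each byte.  Then add '0' everywhere and a
     further 'a' - '0' - 10 (0x27) where the digit is a..f.  */
  uint32_t gt9 = ((x + 0x06060606u) & 0x10101010u) >> 4;
  x += 0x30303030u + gt9 * 0x27u;
  /* One digit per four significant bits; zero still prints "0".  */
  int digits = word == 0 ? 1 : (31 - __builtin_clz (word)) / 4 + 1;
  /* Left-align the significant digits and store all four bytes; the
     bytes past DIGITS are scratch that later output overwrites.  */
  x <<= (4 - digits) * 8;
  tp[0] = x >> 24;
  tp[1] = x >> 16;
  tp[2] = x >> 8;
  tp[3] = x;
  return tp + digits;
}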
On 03/06/25 19:27, Paul Eggert wrote:
> On 6/3/25 12:51, Adhemerval Zanella wrote:
>
>> static inline char *put_uint8 (uint8_t word, char *tp)
>
> Put the function name at the start of a line.

Ack.

>> +  intptr_t s = 1;
>
> There should be no need for intptr_t; 'int' is good enough and clearer
> and should generate the same machine code.

Ack.

>> +      tp[2] = _itoa_lower_digits[word % 10];
>
> Don't consult an array; just add '0'.
>
>> +  if (word >= 100)
>> ...
>> +  if (word >= 10)
>
> Since this is for performance, shouldn't the code avoid conditional
> branches?  Something like the following, say:
>
> static inline char *
> put_uint8 (uint8_t word, char *tp)
> {
>   *tp = '0' + word / 100;
>   tp += 100 <= word;
>   *tp = '0' + word / 10 % 10;
>   tp += 10 <= word;
>   *tp++ = '0' + word % 10;
>   return tp;
> }
>

I am not sure, with gcc 14.2.1 this version is indeed faster on aarch64
(Neoverse N1), but slower on x86_64 (Zen3):

* patch

x86_64-linux-gnu $ ./benchtests/bench-inet_ntop_ipv4
 "inet_ntop_ipv4": {
  "workload-ipv4-random": {
   "duration": 3.74298e+09,
   "iterations": 1.12e+08,
   "reciprocal-throughput": 33.4216,
   "latency": 33.4173,
   "max-throughput": 2.99208e+07,
   "min-throughput": 2.99246e+07
  }
 }

* suggestion

x86_64-linux-gnu $ ./benchtests/bench-inet_ntop_ipv4
 "inet_ntop_ipv4": {
  "workload-ipv4-random": {
   "duration": 3.86481e+09,
   "iterations": 8.8e+07,
   "reciprocal-throughput": 43.8419,
   "latency": 43.9948,
   "max-throughput": 2.28092e+07,
   "min-throughput": 2.27299e+07
  }
 }

* patch

aarch64-linux-gnu$ ./benchtests/bench-inet_ntop_ipv4
 "inet_ntop_ipv4": {
  "workload-ipv4-random": {
   "duration": 1.07013e+09,
   "iterations": 4.4e+07,
   "reciprocal-throughput": 24.5301,
   "latency": 24.1121,
   "max-throughput": 4.07662e+07,
   "min-throughput": 4.14729e+07
  }
 }

* suggestion

aarch64-linux-gnu$ ./benchtests/bench-inet_ntop_ipv4
 "inet_ntop_ipv4": {
  "workload-ipv4-random": {
   "duration": 1.05304e+09,
   "iterations": 5.6e+07,
   "reciprocal-throughput": 18.8547,
   "latency": 18.7539,
   "max-throughput": 5.30373e+07,
   "min-throughput": 5.33221e+07
  }
 }

> Alternatively, if we're going to have conditional branches, there is no
> need to test (word >= 100) if (word >= 10) is false.

It indeed seems to help some chips (aarch64 Neoverse N1):

static inline char *
put_uint8 (uint8_t word, char *tp)
{
  int s = 1;
  if (word >= 10)
    {
      if (word >= 100)
	{
	  tp[2] = '0' + word % 10;
	  word /= 10;
	  s += 1;
	}
      tp[1] = '0' + word % 10;
      word /= 10;
      s += 1;
    }
  *tp = '0' + word % 10;
  return tp + s;
}

aarch64-linux-gnu$ ./benchtests/bench-inet_ntop_ipv4
 "inet_ntop_ipv4": {
  "workload-ipv4-random": {
   "duration": 1.07388e+09,
   "iterations": 5.2e+07,
   "reciprocal-throughput": 20.3728,
   "latency": 20.9305,
   "max-throughput": 4.90851e+07,
   "min-throughput": 4.77772e+07
  }
 }

I think I will go with this version since it seems to be a better
compromise.  I might check on some other chips I have access to.
On 03/06/25 19:32, Collin Funk wrote:
> Adhemerval Zanella <adhemerval.zanella@linaro.org> writes:
>
>> The performance results on aarch64 Neoverse1 with gcc 14.2.1:
>>
>> * master
>> [...]
>> * patched
>> [...]
>>
>> Checked on aarch64-linux-gnu and x86_64-linux-gnu.
>> ---
>>  resolv/inet_ntop.c | 134 +++++++++++++++++++++++++++++----------------
>>  1 file changed, 87 insertions(+), 47 deletions(-)
>
> This part of the patch series looks good to me.  So:
>
> Reviewed-by: Collin Funk <collin.funk1@gmail.com>

Thanks.

>
> I see similar results as well:
>
> $ grep '^model name' /proc/cpuinfo | sed 1q
> model name	: AMD Ryzen 7 3700X 8-Core Processor
> $ gcc --version | sed 1q
> gcc (GCC) 15.1.1 20250521 (Red Hat 15.1.1-2)
>
> # Pre patch
> $ cat benchtests/bench.out
> [...]
>
> # Post patch
> $ cat benchtests/bench.out
> [...]
>
> Thanks,
> Collin
On 04/06/25 05:09, Florian Weimer wrote:
> * Adhemerval Zanella:
>
>> -static const char *
>> +static inline const char *
>>  inet_ntop6 (const u_char *src, char *dst, socklen_t size)
>>  {
>>  	/*
>> @@ -108,7 +123,7 @@ inet_ntop6 (const u_char *src, char *dst, socklen_t size)
>>  	 */
>>  	char tmp[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255"], *tp;
>>  	struct { int base, len; } best, cur;
>> -	u_int words[NS_IN6ADDRSZ / NS_INT16SZ];
>> +	uint16_t words[NS_IN6ADDRSZ / NS_INT16SZ] = { 0 };
>>  	int i;
>>
>>  	/*
>> @@ -116,7 +131,6 @@ inet_ntop6 (const u_char *src, char *dst, socklen_t size)
>>  	 * Copy the input (bytewise) array into a wordwise array.
>>  	 * Find the longest run of 0x00's in src[] for :: shorthanding.
>>  	 */
>> -	memset(words, '\0', sizeof words);
>>  	for (i = 0; i < NS_IN6ADDRSZ; i += 2)
>>  		words[i / 2] = (src[i] << 8) | src[i + 1];
>
> The words copy shouldn't really be needed, GCC can usually combine two
> byte accesses into single 16-bit loads.

There is no need for the words array in fact, we can ntohs when
accessing the s6_addr16 for the source (and on most chips it should be
lowered efficiently).

>
> It should be possible to optimize put_uint16 using SWAR techniques.  We
> don't need to do byte-wise output, either.  We can use full 32-bit
> writes, potentially with overlaps.

Not sure about that, because the usual way is to present ipv6 in
quartets, and using 32-bit writes would require decomposing them anyway.
Also, Paul's suggestion to do overlapping writes showed mixed results on
different chips.

>
> If the user-supplied output buffer has the maximum possible size, we
> don't need the temporary copy.  (This applies to the IPv4 variant, too.)

It shows a small performance increase; I have added this change.
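[As a rough illustration of that last point — a sketch under the
thread's assumptions (put_uint8 as defined above), not the actual v2
code: when SIZE is at least the worst-case length, the digits can be
produced directly in DST, and the temporary buffer plus the final copy
disappear from the fast path:]

static const char *
inet_ntop4_direct (const u_char *src, char *dst, socklen_t size)
{
  char tmp[sizeof "255.255.255.255"];
  /* Format into DST directly when it is provably large enough;
     otherwise fall back to the bounce buffer.  */
  char *start = size >= sizeof tmp ? dst : tmp;
  char *tp = start;
  for (int i = 0; i < 4; i++)
    {
      tp = put_uint8 (src[i], tp);
      *tp++ = i < 3 ? '.' : '\0';
    }
  if (start == dst)
    return dst;
  socklen_t len = tp - tmp;
  if (len > size)
    {
      __set_errno (ENOSPC);
      return NULL;
    }
  return memcpy (dst, tmp, len);
}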
* Adhemerval Zanella Netto:

>> The words copy shouldn't really be needed, GCC can usually combine two
>> byte accesses into single 16-bit loads.
>
> There is no need for the words array in fact, we can ntohs when
> accessing the s6_addr16 for the source (and on most chips it should be
> lowered efficiently).

I believe this introduces an alignment requirement that the inet_ntop
specification does not actually provide.

Thanks,
Florian
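[For context, the conventional alignment-safe idiom is a memcpy-based
load, which compilers typically lower to a single 16-bit load (plus byte
swap) on targets that permit unaligned access — a minimal sketch, for
comparison with the packed-struct wrapper in the next message:]

static inline uint16_t
load_be16 (const unsigned char *p)
{
  uint16_t v;
  memcpy (&v, p, sizeof v);	/* No alignment assumption.  */
  return ntohs (v);
}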
On 04/06/25 14:08, Florian Weimer wrote:
> * Adhemerval Zanella Netto:
>
>>> The words copy shouldn't really be needed, GCC can usually combine two
>>> byte accesses into single 16-bit loads.
>>
>> There is no need for the words array in fact, we can ntohs when
>> accessing the s6_addr16 for the source (and on most chips it should be
>> lowered efficiently).
>
> I believe this introduces an alignment requirement that the inet_ntop
> specification does not actually provide.

Sure, and we can just use a wrapper to handle this correctly:

static inline uint16_t
in6_addr_addr16 (const struct in6_addr *src, int idx)
{
  const struct { uint16_t x; } __attribute__((__packed__)) *pptr
    = (typeof (pptr)) (&src->s6_addr16[idx]);
  return ntohs (pptr->x);
}

It uses byte loads on architectures that do not allow unaligned access
(like sparc).  It also generates slightly better code than the usual
temporary-plus-memcpy we do for other uses on sparc (it does not spill a
store-byte for the temporary).
Adhemerval Zanella Netto <adhemerval.zanella@linaro.org> writes:

>> This part of the patch series looks good to me.  So:
>>
>> Reviewed-by: Collin Funk <collin.funk1@gmail.com>
>
> Thanks.

NP.  How about I write a patch to have 'inet_ntoa' use
'inet_ntop (..., AF_INET)' once all the issues others have noticed are
addressed?

That function could benefit from the same optimization.

Collin
On 04/06/25 14:36, Collin Funk wrote:
> Adhemerval Zanella Netto <adhemerval.zanella@linaro.org> writes:
>
>>> This part of the patch series looks good to me.  So:
>>>
>>> Reviewed-by: Collin Funk <collin.funk1@gmail.com>
>>
>> Thanks.
>
> NP.  How about I write a patch to have 'inet_ntoa' use
> 'inet_ntop (..., AF_INET)' once all the issues others have noticed are
> addressed?
>
> That function could benefit from the same optimization.

Sure, and if you don't mind I can also add it on the v2.
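[A sketch of what such an inet_ntoa refactor might look like; glibc's
existing inet_ntoa.c keeps its result in an 18-byte per-thread buffer,
and "255.255.255.255" plus the terminating NUL needs only 16 bytes.
This is illustrative only, not the submitted change:]

char *
inet_ntoa (struct in_addr in)
{
  /* Per-thread result buffer, as in the current implementation.  */
  static __thread char buffer[18];
  __inet_ntop (AF_INET, &in, buffer, sizeof buffer);
  return buffer;
}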
On Wed, 4 Jun 2025, Adhemerval Zanella Netto wrote:

> >> +  if (word >= 100)
> >> ...
> >> +  if (word >= 10)
> >
> > Since this is for performance, shouldn't the code avoid conditional
> > branches?  Something like the following, say:
> >
> > static inline char *
> > put_uint8 (uint8_t word, char *tp)
> > {
> >   *tp = '0' + word / 100;
> >   tp += 100 <= word;
> >   *tp = '0' + word / 10 % 10;
> >   tp += 10 <= word;
> >   *tp++ = '0' + word % 10;
> >   return tp;
> > }
> >
>
> I am not sure, with gcc 14.2.1 this version is indeed faster on aarch64
> (Neoverse N1), but slower on x86_64 (Zen3):

 How many samples did you collect?

 Hmm, I suppose on a modern write-back D$ system consecutive plain writes
to the same memory location will be coalesced.  But do all the systems do
write allocation nowadays?  I think this approach asks for a prefetch hint
at the start of the function.

 I have benchmarked this alternative:

static inline char *
put_uint8 (uint8_t word, char *tp)
{
  unsigned int o, t, h;

  h = word;
  o = h % 10;
  h /= 10;
  t = h % 10;
  h /= 10;

  *tp = '0' + h;
  tp += !!h;
  *tp = '0' + t;
  tp += !!t;
  *tp++ = '0' + o;

  return tp;
}

on my POWER9 system and it produces results similar to Paul's proposal
(taking single samples only; there's some variation between runs and I
didn't bother taking the average as that'd require some manual effort):

"inet_ntop_ipv4": {
 "workload-ipv4-random": {
  "duration": 5.36267e+08,
  "iterations": 3.6e+07,
  "reciprocal-throughput": 14.8943,
  "latency": 14.8984,
  "max-throughput": 6.714e+07,
  "min-throughput": 6.71215e+07
 }
}

(Paul's) vs:

"inet_ntop_ipv4": {
 "workload-ipv4-random": {
  "duration": 5.38168e+08,
  "iterations": 3.6e+07,
  "reciprocal-throughput": 14.9995,
  "latency": 14.8988,
  "max-throughput": 6.66691e+07,
  "min-throughput": 6.71196e+07
 }
}

(mine).  If your benchmarking turns out good with this proposal, then I
can obtain a representative set of samples for POWER9.

 I can't benchmark Aarch64 or x86-64 easily, but I note that my proposal
produces fewer multiplications for both plus lets x86-64 take advantage
of the carry flag via SBB instead of using a discrete conditional-set+add
sequence, so chances are it'll perform better.  It produces more compact
code too, and I think it might be more readable for some, though YMMV.

 Though the dependency on the quality of the optimiser seems very fragile
here, e.g. if the locals are changed to a signed data type, then POWER9
produces an extra instruction that causes performance to drop by ~2.5%,
but x86-64 is able to convert another conditional-set+add to SBB, which
for a change likely causes a performance gain.

 Also I think you need to fold the update quoted below into 1/3, so as to
let the newly-added group of benchmarks be run on its own, or one gets an
error otherwise:

$ make BENCHSET=bench-resolv bench
The following values in BENCHSET are invalid: bench-resolv
The valid ones are: bench-math bench-pthread bench-string calloc-simple
calloc-tcache calloc-thread hash-benchset malloc-simple malloc-tcache
malloc-thread math-benchset stdio-benchset stdio-common-benchset
stdlib-benchset string-benchset wcsmbs-benchset
Makefile:485: *** Invalid BENCHSET value.  Stop.

  Maciej

---
 benchtests/Makefile | 1 +
 1 file changed, 1 insertion(+)

glibc-bench-resolv.diff
Index: glibc/benchtests/Makefile
===================================================================
--- glibc.orig/benchtests/Makefile
+++ glibc/benchtests/Makefile
@@ -462,6 +462,7 @@ ifneq ($(strip ${BENCHSET}),)
 VALIDBENCHSETNAMES := \
   bench-math \
   bench-pthread \
+  bench-resolv \
   bench-string \
   calloc-simple \
   calloc-tcache \
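[The prefetch hint Maciej mentions is not shown in the thread; as a
hypothetical, untested fragment it would presumably be a single GCC
builtin at the top of __inet_ntop, before any formatting work:]

  /* Hypothetical write prefetch of the user-supplied output buffer;
     second argument 1 = prepare for write, third argument 3 = high
     temporal locality.  Whether this helps needs per-chip benchmarks.  */
  __builtin_prefetch (dst, 1, 3);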
On Fri, 6 Jun 2025, Maciej W. Rozycki wrote:

>   *tp = '0' + t;
>   tp += !!t;

 It has to be:

  tp += !!t | !!h;

obviously here.

> "inet_ntop_ipv4": {
>  "workload-ipv4-random": {
>   "duration": 5.38168e+08,
>   "iterations": 3.6e+07,
>   "reciprocal-throughput": 14.9995,
>   "latency": 14.8988,
>   "max-throughput": 6.66691e+07,
>   "min-throughput": 6.71196e+07
>  }
> }

 And it adds an instruction back, scoring:

"inet_ntop_ipv4": {
 "workload-ipv4-random": {
  "duration": 5.66505e+08,
  "iterations": 3.6e+07,
  "reciprocal-throughput": 15.7215,
  "latency": 15.751,
  "max-throughput": 6.3607e+07,
  "min-throughput": 6.34881e+07
 }
}

so it may not be worth it on POWER9, but still seems to produce better
code on Aarch64 and x86-64.  Sorry about the confusion.

  Maciej
On 06/06/25 13:51, Maciej W. Rozycki wrote:
> On Wed, 4 Jun 2025, Adhemerval Zanella Netto wrote:
>
>>>> +  if (word >= 100)
>>>> ...
>>>> +  if (word >= 10)
>>>
>>> Since this is for performance, shouldn't the code avoid conditional
>>> branches?  Something like the following, say:
>>>
>>> static inline char *
>>> put_uint8 (uint8_t word, char *tp)
>>> {
>>>   *tp = '0' + word / 100;
>>>   tp += 100 <= word;
>>>   *tp = '0' + word / 10 % 10;
>>>   tp += 10 <= word;
>>>   *tp++ = '0' + word % 10;
>>>   return tp;
>>> }
>>>
>>
>> I am not sure, with gcc 14.2.1 this version is indeed faster on aarch64
>> (Neoverse N1), but slower on x86_64 (Zen3):
>
>  How many samples did you collect?

For both x86_64 and aarch64 I collected the samples on an idle system
where I made sure no extra OS jitter would interfere with the results,
and I don't see much variability over multiple runs (I would need to do
some statistical analysis to give a proper number).  As with the other
benchmarks, performance variability due to hardware frequency scaling
should not matter, and we do have some handling in bench-strings for
this case.

>
>  Hmm, I suppose on a modern write-back D$ system consecutive plain writes
> to the same memory location will be coalesced.  But do all the systems do
> write allocation nowadays?  I think this approach asks for a prefetch hint
> at the start of the function.
>
>  I have benchmarked this alternative:
>
> static inline char *
> put_uint8 (uint8_t word, char *tp)
> {
>   unsigned int o, t, h;
>
>   h = word;
>   o = h % 10;
>   h /= 10;
>   t = h % 10;
>   h /= 10;
>
>   *tp = '0' + h;
>   tp += !!h;
>   *tp = '0' + t;
>   tp += !!t;
>   *tp++ = '0' + o;
>
>   return tp;
> }
>
> on my POWER9 system and it produces results similar to Paul's proposal
> [...]
> (mine).  If your benchmarking turns out good with this proposal, then I
> can obtain a representative set of samples for POWER9.

The results of your suggestion are similar to Paul's on the hardware I
tested (x86_64/Zen3, AArch64/Neoverse-N1, powerpc/POWER10):

                       reciprocal-throughput
                       patch      suggestion   improvement
  x86_64/zen3          26.1951    37.4577      -43.00%
  x86_64/Westmere [1]  94.3904    89.9536        4.70%
  i686/zen3            46.5747    57.4518      -23.35%
  aarch64              19.8964    16.8005       15.56%
  power10               9.43355    7.11195      24.61%

So it really seems to be an issue with the generated code on Zen3.  I
also checked with an older gcc version (13 vs. 14), but the x86_64
results are the same; '-march=znver3 -mtune=znver3' made no difference
either.

I tried to check whether there is any perf counter to help understand
this issue (to see whether it is related to hazards from consecutive
plain writes), but it seems that
Core::X86::Pmc::Core::Store_Globally_Visible_Cancels_2 is only available
on newer Zen cores.

[1] cfarm187 from the gcc compile farm.

>
>  I can't benchmark Aarch64 or x86-64 easily, but I note that my proposal
> produces fewer multiplications for both plus lets x86-64 take advantage
> of the carry flag via SBB instead of using a discrete conditional-set+add
> sequence, so chances are it'll perform better.  It produces more compact
> code too, and I think it might be more readable for some, though YMMV.
>
>  Though the dependency on the quality of the optimiser seems very fragile
> here, e.g. if the locals are changed to a signed data type, then POWER9
> produces an extra instruction that causes performance to drop by ~2.5%,
> but x86-64 is able to convert another conditional-set+add to SBB, which
> for a change likely causes a performance gain.

I am even more inclined to stay with my current version below, mainly
because it shows less surprising results on different chips.  And we are
already getting a 10x improvement over the current implementation, so we
can fine-tune this even more once we figure out why the AMD chips behave
like this.

static inline char *
put_uint8 (uint8_t word, char *tp)
{
  int s = 1;
  if (word >= 10)
    {
      if (word >= 100)
	{
	  tp[2] = '0' + word % 10;
	  word /= 10;
	  s += 1;
	}
      tp[1] = '0' + word % 10;
      word /= 10;
      s += 1;
    }
  *tp = '0' + word;
  return tp + s;
}

>
>  Also I think you need to fold the update quoted below into 1/3, so as to
> let the newly-added group of benchmarks be run on its own, or one gets an
> error otherwise:
>
> $ make BENCHSET=bench-resolv bench
> [...]
> Makefile:485: *** Invalid BENCHSET value.  Stop.

Ack, I will fix it.

>
>   Maciej
>
> ---
>  benchtests/Makefile | 1 +
>  1 file changed, 1 insertion(+)
>
> glibc-bench-resolv.diff
> Index: glibc/benchtests/Makefile
> ===================================================================
> --- glibc.orig/benchtests/Makefile
> +++ glibc/benchtests/Makefile
> @@ -462,6 +462,7 @@ ifneq ($(strip ${BENCHSET}),)
>  VALIDBENCHSETNAMES := \
>    bench-math \
>    bench-pthread \
> +  bench-resolv \
>    bench-string \
>    calloc-simple \
>    calloc-tcache \
diff --git a/resolv/inet_ntop.c b/resolv/inet_ntop.c
index acf5f3cb88..3f2a9ddd87 100644
--- a/resolv/inet_ntop.c
+++ b/resolv/inet_ntop.c
@@ -26,45 +26,44 @@
 #include <errno.h>
 #include <stdio.h>
 #include <string.h>
-
-#ifdef SPRINTF_CHAR
-# define SPRINTF(x) strlen(sprintf/**/x)
-#else
-# define SPRINTF(x) ((size_t)sprintf x)
-#endif
+#include <intprops.h>
+#include <_itoa.h>
 
 /*
  * WARNING: Don't even consider trying to compile this on a system where
  * sizeof(int) < 4.  sizeof(int) > 4 is fine; all the world's not a VAX.
  */
 
-static const char *inet_ntop4 (const u_char *src, char *dst, socklen_t size);
-static const char *inet_ntop6 (const u_char *src, char *dst, socklen_t size);
-
-/* char *
- * __inet_ntop(af, src, dst, size)
- *	convert a network format address to presentation format.
- * return:
- *	pointer to presentation format address (`dst'), or NULL (see errno).
- * author:
- *	Paul Vixie, 1996.
- */
-const char *
-__inet_ntop (int af, const void *src, char *dst, socklen_t size)
+static inline char *put_uint8 (uint8_t word, char *tp)
 {
-	switch (af) {
-	case AF_INET:
-		return (inet_ntop4(src, dst, size));
-	case AF_INET6:
-		return (inet_ntop6(src, dst, size));
-	default:
-		__set_errno (EAFNOSUPPORT);
-		return (NULL);
-	}
-	/* NOTREACHED */
+  intptr_t s = 1;
+  if (word >= 100)
+    {
+      tp[2] = _itoa_lower_digits[word % 10];
+      word /= 10;
+      s += 1;
+    }
+  if (word >= 10)
+    {
+      tp[1] = _itoa_lower_digits[word % 10];
+      word /= 10;
+      s += 1;
+    }
+  *tp = _itoa_lower_digits[word % 10];
+  return tp + s;
+}
+
+static inline char *put_uint16 (uint16_t word, char *tp)
+{
+  if (word >= 0x1000)
+    *tp++ = _itoa_lower_digits[(word >> 12) & 0xf];
+  if (word >= 0x100)
+    *tp++ = _itoa_lower_digits[(word >> 8) & 0xf];
+  if (word >= 0x10)
+    *tp++ = _itoa_lower_digits[(word >> 4) & 0xf];
+  *tp++ = _itoa_lower_digits[word & 0xf];
+  return tp;
 }
-libc_hidden_def (__inet_ntop)
-weak_alias (__inet_ntop, inet_ntop)
 
 /* const char *
  * inet_ntop4(src, dst, size)
@@ -74,20 +73,36 @@ weak_alias (__inet_ntop, inet_ntop)
  * notes:
  *	(1) uses no statics
  *	(2) takes a u_char* not an in_addr as input
- * author:
- *	Paul Vixie, 1996.
  */
-static const char *
+static __always_inline const char *
 inet_ntop4 (const u_char *src, char *dst, socklen_t size)
 {
-	static const char fmt[] = "%u.%u.%u.%u";
-	char tmp[sizeof "255.255.255.255"];
+  enum
+  {
+    oct_size = INT_BUFSIZE_BOUND (u_char),
+    tmp_size = 4 * oct_size + 3 /* '.' */ + 1 /* '\0' */
+  };
+  char tmp[tmp_size];
+  char *tmp_r = tmp;
 
-	if (SPRINTF((tmp, fmt, src[0], src[1], src[2], src[3])) >= size) {
-		__set_errno (ENOSPC);
-		return (NULL);
-	}
-	return strcpy(dst, tmp);
+  tmp_r = put_uint8 (src[0], tmp_r);
+  *(tmp_r++) = '.';
+  tmp_r = put_uint8 (src[1], tmp_r);
+  *(tmp_r++) = '.';
+  tmp_r = put_uint8 (src[2], tmp_r);
+  *(tmp_r++) = '.';
+  tmp_r = put_uint8 (src[3], tmp_r);
+  *tmp_r++ = '\0';
+
+  socklen_t tmp_s = tmp_r - tmp;
+  if (tmp_s > size)
+    {
+      __set_errno (ENOSPC);
+      return NULL;
+    }
+  memcpy (dst, tmp, tmp_s);
+
+  return dst;
 }
 
 /* const char *
@@ -96,7 +111,7 @@ inet_ntop4 (const u_char *src, char *dst, socklen_t size)
  * author:
  *	Paul Vixie, 1996.
  */
-static const char *
+static inline const char *
 inet_ntop6 (const u_char *src, char *dst, socklen_t size)
 {
 	/*
@@ -108,7 +123,7 @@ inet_ntop6 (const u_char *src, char *dst, socklen_t size)
 	 */
 	char tmp[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255"], *tp;
 	struct { int base, len; } best, cur;
-	u_int words[NS_IN6ADDRSZ / NS_INT16SZ];
+	uint16_t words[NS_IN6ADDRSZ / NS_INT16SZ] = { 0 };
 	int i;
 
 	/*
@@ -116,7 +131,6 @@ inet_ntop6 (const u_char *src, char *dst, socklen_t size)
 	 * Copy the input (bytewise) array into a wordwise array.
 	 * Find the longest run of 0x00's in src[] for :: shorthanding.
 	 */
-	memset(words, '\0', sizeof words);
 	for (i = 0; i < NS_IN6ADDRSZ; i += 2)
 		words[i / 2] = (src[i] << 8) | src[i + 1];
 	best.base = -1;
@@ -167,7 +181,7 @@ inet_ntop6 (const u_char *src, char *dst, socklen_t size)
 			tp += strlen(tp);
 			break;
 		}
-		tp += SPRINTF((tp, "%x", words[i]));
+		tp = put_uint16 (words[i], tp);
 	}
 	/* Was it a trailing run of 0x00's? */
 	if (best.base != -1 && (best.base + best.len) ==
@@ -178,9 +192,35 @@ inet_ntop6 (const u_char *src, char *dst, socklen_t size)
 	/*
 	 * Check for overflow, copy, and we're done.
 	 */
-	if ((socklen_t)(tp - tmp) > size) {
+	socklen_t tmp_s = tp - tmp;
+	if (tmp_s > size) {
 		__set_errno (ENOSPC);
 		return (NULL);
 	}
-	return strcpy(dst, tmp);
+	return memcpy(dst, tmp, tmp_s);
 }
+
+/* char *
+ * __inet_ntop(af, src, dst, size)
+ *	convert a network format address to presentation format.
+ * return:
+ *	pointer to presentation format address (`dst'), or NULL (see errno).
+ * author:
+ *	Paul Vixie, 1996.
+ */
+const char *
+__inet_ntop (int af, const void *src, char *dst, socklen_t size)
+{
+  switch (af) {
+  case AF_INET:
+    return (inet_ntop4(src, dst, size));
+  case AF_INET6:
+    return (inet_ntop6(src, dst, size));
+  default:
+    __set_errno (EAFNOSUPPORT);
+    return (NULL);
+  }
+  /* NOTREACHED */
+}
+libc_hidden_def (__inet_ntop)
+weak_alias (__inet_ntop, inet_ntop)