[6/7] stdlib: Optimization qsort{_r} swap implementation

Message ID 1516298002-4618-7-git-send-email-adhemerval.zanella@linaro.org
State New
Headers show
Series
  • Refactor qsort implementation
Related show

Commit Message

Adhemerval Zanella Jan. 18, 2018, 5:53 p.m.
This patchs adds a optimized swap operation on qsort based in previous
msort one.  Instead of byte operation, three variants are provided:

  1. Using uint32_t loads and stores.
  2. Using uint64_t loads and stores.
  3. Generic one with a temporary buffer and memcpy/mempcpy.

The 1. and 2. option are selected only either if architecture defines
_STRING_ARCH_unaligned or if base pointer is aligned to required type.
This is due based on data for bench-qsort, usually programs calls
qsort with array with multiple of machine word as element size.

Benchmarking shows an increase performance:

Results for member size 4
  Sorted
  nmemb   |      base |   patched | diff
        32|      1401 |      1958 | 39.76
      4096|    351333 |    368533 | 4.90
     32768|   3369386 |   3131712 | -7.05
    524288|  63192972 |  59807494 | -5.36

  MostlySorted
  nmemb   |      base |   patched | diff
        32|      2391 |      2061 | -13.80
      4096|   1124074 |    961816 | -14.43
     32768|  11196607 |   9410438 | -15.95
    524288| 215908169 | 185586732 | -14.04

  Unsorted
  nmemb   |      base |   patched | diff
        32|      4993 |      2021 | -59.52
      4096|   1113860 |    963126 | -13.53
     32768|  11251293 |   9518795 | -15.40
    524288| 217252237 | 185072278 | -14.81

Results for member size 8
  Sorted
  nmemb   |      base |   patched | diff
        32|      1296 |      1267 | -2.24
      4096|    359418 |    334852 | -6.83
     32768|   3535229 |   3345157 | -5.38
    524288|  69847251 |  67029358 | -4.03

  MostlySorted
  nmemb   |      base |   patched | diff
        32|      2745 |      2340 | -14.75
      4096|   1222082 |   1014314 | -17.00
     32768|  12244800 |   9924706 | -18.95
    524288| 241557971 | 196898760 | -18.49

  Unsorted
  nmemb   |      base |   patched | diff
        32|      2972 |      2389 | -19.62
      4096|   1314861 |   1024052 | -22.12
     32768|  12397909 |  10120848 | -18.37
    524288| 241789262 | 193414824 | -20.01

Results for member size 32
  Sorted
  nmemb   |      base |   patched | diff
        32|      1305 |      1287 | -1.38
      4096|    346332 |    347979 | 0.48
     32768|   3458244 |   3408058 | -1.45
    524288|  72793445 |  69973719 | -3.87

  MostlySorted
  nmemb   |      base |   patched | diff
        32|      5435 |      4890 | -10.03
      4096|   2032260 |   1688556 | -16.91
     32768|  19909035 |  16419992 | -17.52
    524288| 390339319 | 325921585 | -16.50

  Unsorted
  nmemb   |      base |   patched | diff
        32|      5833 |      5351 | -8.26
      4096|   2022531 |   1724961 | -14.71
     32768|  19842888 |  16588545 | -16.40
    524288| 388838382 | 324102703 | -16.65

Checked on x86_64-linux-gnu.

	[BZ #19305].
	* stdlib/qsort.c (SWAP): Remove.
	(check_alignment, swap_u32, swap_u64, swap_generic,
	select_swap_func): New functions.
	(__qsort_r):
---
 stdlib/qsort.c | 77 ++++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 59 insertions(+), 18 deletions(-)

-- 
2.7.4

Comments

Paul Eggert Jan. 22, 2018, 8:27 a.m. | #1
Adhemerval Zanella wrote:
> +static inline bool

> +check_alignment (const void *base, size_t align)

> +{

> +  return _STRING_ARCH_unaligned || ((uintptr_t)base % (align - 1)) == 0;

> +}


Surely the '(align - 1)' was supposed to be 'align'. Has this been tested on an 
architecture that does not allow unaligned access?

> +static inline void

> +swap_generic (void *a, void *b, size_t size)


Why is this inline? It's used only as a function pointer, and the other 
functions so used are not declared inline.

> +static inline swap_t

> +select_swap_func (const void *base, size_t size)

> +{

> +  if (size == 4 && check_alignment (base, 4))

> +    return swap_u32;

> +  else if (size == 8 && check_alignment (base, 8))

> +    return swap_u64;

> +  return swap_generic;

> +}


The conditions aren't portable enough. Use something like this instead for 
swap_u32, and similarly for swap_u64.

   if (size == sizeof (uint32_t) && check_alignment (base, alignof (uint32_t)))
     return swap_u32;

> +static void

> +swap_u32 (void *a, void *b, size_t size)


The pointer arguments should be declared 'void *restrict'. This can help GCC 
generate better code. Similarly for the other swap functions.

> +  uint32_t tmp = *(uint32_t*) a;

> +  *(uint32_t*) a = *(uint32_t*) b;

> +  *(uint32_t*) b = tmp;


It's nicer to avoid casts when possible, as is the case here and elsewhere. This 
is because casts are too powerful in C. Something like this, say:

     uint32_t *ua = a, *ub = b, tmp = *ua;
     *ua = *ub, *ub = tmp;

> +  unsigned char tmp[128];


Why 128? A comment seems called for.

> +static inline void

> +swap_generic (void *a, void *b, size_t size)

> +{

> +  unsigned char tmp[128];

> +  do

> +    {

> +      size_t s = size > sizeof (tmp) ? sizeof (tmp) : size;

> +      memcpy (tmp, a, s);

> +      a = __mempcpy (a, b, s);

> +      b = __mempcpy (b, tmp, s);

> +      size -= s;

> +    }

> +  while (size > 0);

> +}


On my platform (GCC 7.2.1 20170915 (Red Hat 7.2.1-2) x86-64) this inlined the 
memcpy but not the mempcpy calls. How about something like this instead? It 
should let the compiler do a better job of block-move-style operations in the 
loop. If mempcpy is inlined for you, feel free to substitute it for two of the 
loop's calls to memcpy.

   static void
   swap_generic (void *restrict va, void *restrict vb, size_t size)
   {
     char *a = va, *b = vb;
     enum { n = 128 }; /* Why 128?  */
     unsigned char tmp[n];
     while (size >= n)
       {
	memcpy (tmp, a, n);
	memcpy (a, b, n);
	memcpy (b, tmp, n);
	a += n;
	b += n;
	size -= n;
       }
     memcpy (tmp, a, size);
     memcpy (a, b, size);
     memcpy (b, tmp, size);
   }
Adhemerval Zanella Jan. 22, 2018, 10:55 a.m. | #2
On 22/01/2018 06:27, Paul Eggert wrote:
> Adhemerval Zanella wrote:

>> +static inline bool

>> +check_alignment (const void *base, size_t align)

>> +{

>> +  return _STRING_ARCH_unaligned || ((uintptr_t)base % (align - 1)) == 0;

>> +}

> 

> Surely the '(align - 1)' was supposed to be 'align'. Has this been tested on an architecture that does not allow unaligned access?


Yes, I checked on sparc64 machine.  This test is similar to the Linux kernel one
at lib/sort.c.

> 

>> +static inline void

>> +swap_generic (void *a, void *b, size_t size)

> 

> Why is this inline? It's used only as a function pointer, and the other functions so used are not declared inline.


It should not be, I fixed it locally.

> 

>> +static inline swap_t

>> +select_swap_func (const void *base, size_t size)

>> +{

>> +  if (size == 4 && check_alignment (base, 4))

>> +    return swap_u32;

>> +  else if (size == 8 && check_alignment (base, 8))

>> +    return swap_u64;

>> +  return swap_generic;

>> +}

> 

> The conditions aren't portable enough. Use something like this instead for swap_u32, and similarly for swap_u64.

> 

>   if (size == sizeof (uint32_t) && check_alignment (base, alignof (uint32_t)))

>     return swap_u32;


Ack, fixed locally.

> 

>> +static void

>> +swap_u32 (void *a, void *b, size_t size)

> 

> The pointer arguments should be declared 'void *restrict'. This can help GCC generate better code. Similarly for the other swap functions.

> 

>> +  uint32_t tmp = *(uint32_t*) a;

>> +  *(uint32_t*) a = *(uint32_t*) b;

>> +  *(uint32_t*) b = tmp;

> 

> It's nicer to avoid casts when possible, as is the case here and elsewhere. This is because casts are too powerful in C. Something like this, say:

> 

>     uint32_t *ua = a, *ub = b, tmp = *ua;

>     *ua = *ub, *ub = tmp;


Right, I changed to your suggestion.

> 

>> +  unsigned char tmp[128];

> 

> Why 128? A comment seems called for.


It is indeed an arbitrary value based on some real cases usage which
covers all gcc and firefox usage (largest key size gcc uses is 56
and firefox is 40). I will add comment about it.

> 

>> +static inline void

>> +swap_generic (void *a, void *b, size_t size)

>> +{

>> +  unsigned char tmp[128];

>> +  do

>> +    {

>> +      size_t s = size > sizeof (tmp) ? sizeof (tmp) : size;

>> +      memcpy (tmp, a, s);

>> +      a = __mempcpy (a, b, s);

>> +      b = __mempcpy (b, tmp, s);

>> +      size -= s;

>> +    }

>> +  while (size > 0);

>> +}

> 

> On my platform (GCC 7.2.1 20170915 (Red Hat 7.2.1-2) x86-64) this inlined the memcpy but not the mempcpy calls. How about something like this instead? It should let the compiler do a better job of block-move-style operations in the loop. If mempcpy is inlined for you, feel free to substitute it for two of the loop's calls to memcpy.

> 

>   static void

>   swap_generic (void *restrict va, void *restrict vb, size_t size)

>   {

>     char *a = va, *b = vb;

>     enum { n = 128 }; /* Why 128?  */

>     unsigned char tmp[n];

>     while (size >= n)

>       {

>     memcpy (tmp, a, n);

>     memcpy (a, b, n);

>     memcpy (b, tmp, n);

>     a += n;

>     b += n;

>     size -= n;

>       }

>     memcpy (tmp, a, size);

>     memcpy (a, b, size);

>     memcpy (b, tmp, size);

>   }


Because for this specific code inline is not always a gain, it depends 1. which is the
minimum ISA compiler will use to generate the inline variants and 2. which ifunc variants
glibc will provide for mempcpy (if if it is exposed for internal calls).  On recent x86_64
for instance the mempcpy call will issue __memmove_avx_unaligned_erms, which is faster
than default SSE2 inline variations.  Your suggestion turns to be in fact slower (base
is current approach) on a my machine (i7-4790K):

Results for member size 32
  Sorted
  nmemb   |      base |   patched | diff
        32|      1184 |      1268 | 7.09
      4096|    325865 |    333332 | 2.29
     32768|   3331750 |   3431695 | 3.00
    524288|  69067176 |  68805735 | -0.38

  Repeated
  nmemb   |      base |   patched | diff
        32|      4813 |      5779 | 20.07
      4096|   1624137 |   1972045 | 21.42
     32768|  15896739 |  19705289 | 23.96
    524288| 316328778 | 393942797 | 24.54

  MostlySorted
  nmemb   |      base |   patched | diff
        32|      5332 |      6198 | 16.24
      4096|   1312703 |   1563919 | 19.14
     32768|  12360726 |  14990070 | 21.27
    524288| 231603294 | 283228681 | 22.29

  Unsorted
  nmemb   |      base |   patched | diff
        32|      6047 |      7115 | 17.66
      4096|   1695241 |   2010943 | 18.62
     32768|  16430388 |  19636166 | 19.51
    524288| 329496913 | 395355847 | 19.99

In fact if I use -fno-builtin to force the memcpy call to issue the ifunc I get another
speed up (and it could be faster, for x86_64 at least, if glibc is build with -mavx):

Results for member size 32
  Sorted
  nmemb   |      base |   patched | diff
        32|      1184 |      1240 | 4.73
      4096|    325865 |    326596 | 0.22
     32768|   3331750 |   3613807 | 8.47
    524288|  69067176 |  74352201 | 7.65

  Repeated
  nmemb   |      base |   patched | diff
        32|      4813 |      4133 | -14.13
      4096|   1624137 |   1707452 | 5.13
     32768|  15896739 |  13999315 | -11.94
    524288| 316328778 | 280461810 | -11.34

  MostlySorted
  nmemb   |      base |   patched | diff
        32|      5332 |      4681 | -12.21
      4096|   1312703 |   1226684 | -6.55
     32768|  12360726 |  11362772 | -8.07
    524288| 231603294 | 212250739 | -8.36

  Unsorted
  nmemb   |      base |   patched | diff
        32|      6047 |      6676 | 10.40
      4096|   1695241 |   1492257 | -11.97
     32768|  16430388 |  14799600 | -9.93
    524288| 329496913 | 303681410 | -7.83

It might be that your approach is faster for other architectures which do not have
ifunc mempcpy, however I do not want to over-engineer this code since most real
word correspond to key sizes of 4 and 8.
Alexander Monakov Jan. 22, 2018, 1:46 p.m. | #3
On Mon, 22 Jan 2018, Adhemerval Zanella wrote:
> On 22/01/2018 06:27, Paul Eggert wrote:

> > Adhemerval Zanella wrote:

> >> +static inline bool

> >> +check_alignment (const void *base, size_t align)

> >> +{

> >> +  return _STRING_ARCH_unaligned || ((uintptr_t)base % (align - 1)) == 0;

> >> +}

> > 

> > Surely the '(align - 1)' was supposed to be 'align'. Has this been tested on an architecture that does not allow unaligned access?

> 

> Yes, I checked on sparc64 machine.  This test is similar to the Linux kernel one

> at lib/sort.c.


But the kernel source correctly uses '&' there rather than '%'.

Alexander
Adhemerval Zanella Jan. 22, 2018, 3:23 p.m. | #4
> Il giorno 22 gen 2018, alle ore 11:46, Alexander Monakov <amonakov@ispras.ru> ha scritto:

> 

>> On Mon, 22 Jan 2018, Adhemerval Zanella wrote:

>>> On 22/01/2018 06:27, Paul Eggert wrote:

>>> Adhemerval Zanella wrote:

>>>> +static inline bool

>>>> +check_alignment (const void *base, size_t align)

>>>> +{

>>>> +  return _STRING_ARCH_unaligned || ((uintptr_t)base % (align - 1)) == 0;

>>>> +}

>>> 

>>> Surely the '(align - 1)' was supposed to be 'align'. Has this been tested on an architecture that does not allow unaligned access?

>> 

>> Yes, I checked on sparc64 machine.  This test is similar to the Linux kernel one

>> at lib/sort.c.

> 

> But the kernel source correctly uses '&' there rather than '%'


Indeed, I assume I got luck on sparc64 (only uses aligned buffers). I fixed it locally.
Paul Eggert Jan. 22, 2018, 5:15 p.m. | #5
On 01/22/2018 02:55 AM, Adhemerval Zanella wrote:
> On 22/01/2018 06:27, Paul Eggert wrote:

>> Adhemerval Zanella wrote:

>>> +static inline bool

>>> +check_alignment (const void *base, size_t align)

>>> +{

>>> +  return _STRING_ARCH_unaligned || ((uintptr_t)base % (align - 1)) == 0;

>>> +}

>> Surely the '(align - 1)' was supposed to be 'align'. Has this been tested on an architecture that does not allow unaligned access?

> Yes, I checked on sparc64 machine.  This test is similar to the Linux kernel one

> at lib/sort.c.


The Linux kernel lib/sort.c test is (((unsigned long)base & (align - 1)) 
== 0), which is correct. The test above uses '% (align - 1)' instead, 
which is clearly wrong; the "%" should be "&". As the tests evidently 
did not catch the error, they need to be improved to catch it.

> It might be that your approach is faster for other architectures which do not have

> ifunc mempcpy, however I do not want to over-engineer this code since most real

> word correspond to key sizes of 4 and 8.


Thanks, that all makes sense.

One other question. Would it improve performance to partially evaluate 
qsort for the case where the key size is that of a pointer, to allow the 
swap to be done inline with four insns? I would imagine that this is the 
most common case.
Adhemerval Zanella Jan. 22, 2018, 5:48 p.m. | #6
On 22/01/2018 15:15, Paul Eggert wrote:
> On 01/22/2018 02:55 AM, Adhemerval Zanella wrote:

>> On 22/01/2018 06:27, Paul Eggert wrote:

>>> Adhemerval Zanella wrote:

>>>> +static inline bool

>>>> +check_alignment (const void *base, size_t align)

>>>> +{

>>>> +  return _STRING_ARCH_unaligned || ((uintptr_t)base % (align - 1)) == 0;

>>>> +}

>>> Surely the '(align - 1)' was supposed to be 'align'. Has this been tested on an architecture that does not allow unaligned access?

>> Yes, I checked on sparc64 machine.  This test is similar to the Linux kernel one

>> at lib/sort.c.

> 

> The Linux kernel lib/sort.c test is (((unsigned long)base & (align - 1)) == 0), which is correct. The test above uses '% (align - 1)' instead, which is clearly wrong; the "%" should be "&". As the tests evidently did not catch the error, they need to be improved to catch it.


Indeed, Alexander Monakov pointed out this issue is his message and I have fixed it
locally.

> 

>> It might be that your approach is faster for other architectures which do not have

>> ifunc mempcpy, however I do not want to over-engineer this code since most real

>> word correspond to key sizes of 4 and 8.

> 

> Thanks, that all makes sense.

> 

> One other question. Would it improve performance to partially evaluate qsort for the case where the key size is that of a pointer, to allow the swap to be done inline with four insns? I would imagine that this is the most common case.

> 


I noted that, at least for x86_64, calling a function pointer it slight faster
than embedded the test in a switch (as current msort).  One option I have not
tested, and which will trade code side for performance; would parametrize
the qsort creation (as for the 7/7 patch in this set) to have qsort_uint32_t,
qsort_uint64_t, and qsort_generic for instance (which calls the swap inline).

So we will have something as:

void qsort (void *pbase, size_t total_elems, size_t size)
{
  if (size == sizeof (uint32_t)
    && check_alignment (base, sizeof (uint32_t)))
    return qsort_uint32_t (pbase, total_elems, size);
  else if (size == sizeof (uint64_t)
    && check_alignment (base, sizeof (uint64_t)))
    return qsort_uint64_t (pbase, total_elems, size);
  return qsort_generic (pbase, total_elems, size);
}
Paul Eggert Jan. 22, 2018, 6:29 p.m. | #7
On 01/22/2018 09:48 AM, Adhemerval Zanella wrote:
> One option I have not

> tested, and which will trade code side for performance; would parametrize

> the qsort creation (as for the 7/7 patch in this set) to have qsort_uint32_t,

> qsort_uint64_t, and qsort_generic for instance (which calls the swap inline).

>

> So we will have something as:

>

> void qsort (void *pbase, size_t total_elems, size_t size)

> {

>    if (size == sizeof (uint32_t)

>      && check_alignment (base, sizeof (uint32_t)))

>      return qsort_uint32_t (pbase, total_elems, size);

>    else if (size == sizeof (uint64_t)

>      && check_alignment (base, sizeof (uint64_t)))

>      return qsort_uint64_t (pbase, total_elems, size);

>    return qsort_generic (pbase, total_elems, size);

> }


Yes, that's the option I was thinking of, except I was thinking that the 
first test should be "if (size == sizeof (void *) && check_alignment 
(base, alignof (void *))) return qsort_voidptr (pbase, total_elems, 
size);" because sorting arrays of pointers is the most common. (Also, 
check_alignment's argument should use alignof not sizeof.)
Adhemerval Zanella Jan. 22, 2018, 7:33 p.m. | #8
On 22/01/2018 16:29, Paul Eggert wrote:
> On 01/22/2018 09:48 AM, Adhemerval Zanella wrote:

>> One option I have not

>> tested, and which will trade code side for performance; would parametrize

>> the qsort creation (as for the 7/7 patch in this set) to have qsort_uint32_t,

>> qsort_uint64_t, and qsort_generic for instance (which calls the swap inline).

>>

>> So we will have something as:

>>

>> void qsort (void *pbase, size_t total_elems, size_t size)

>> {

>>    if (size == sizeof (uint32_t)

>>      && check_alignment (base, sizeof (uint32_t)))

>>      return qsort_uint32_t (pbase, total_elems, size);

>>    else if (size == sizeof (uint64_t)

>>      && check_alignment (base, sizeof (uint64_t)))

>>      return qsort_uint64_t (pbase, total_elems, size);

>>    return qsort_generic (pbase, total_elems, size);

>> }

> 

> Yes, that's the option I was thinking of, except I was thinking that the first test should be "if (size == sizeof (void *) && check_alignment (base, alignof (void *))) return qsort_voidptr (pbase, total_elems, size);" because sorting arrays of pointers is the most common. (Also, check_alignment's argument should use alignof not sizeof.)

> 


I add the implementation size and the results are slight better:

Results for member size 8
  Sorted
  nmemb   |      base |   patched | diff
        32|      1173 |      1282 | 9.29
      4096|    325485 |    332451 | 2.14
     32768|   3232255 |   3293842 | 1.91
    524288|  65645381 |  66182948 | 0.82

  Repeated
  nmemb   |      base |   patched | diff
        32|      2074 |      2034 | -1.93
      4096|    948339 |    913363 | -3.69
     32768|   8906214 |   8651378 | -2.86
    524288| 173498547 | 166294093 | -4.15

  MostlySorted
  nmemb   |      base |   patched | diff
        32|      2211 |      2147 | -2.89
      4096|    757543 |    739765 | -2.35
     32768|   7785343 |   7570811 | -2.76
    524288| 133912169 | 129728791 | -3.12

  Unsorted
  nmemb   |      base |   patched | diff
        32|      2219 |      2191 | -1.26
      4096|   1017790 |    989068 | -2.82
     32768|   9747216 |   9456092 | -2.99
    524288| 191726744 | 185012121 | -3.50

At the cost of large text sizes and slight more code:

# Before
$ size stdlib/qsort.os
   text    data     bss     dec     hex filename
   2578       0       0    2578     a12 stdlib/qsort.os

# After
$ size stdlib/qsort.os
   text    data     bss     dec     hex filename
   6037       0       0    6037    1795 stdlib/qsort.os


I still prefer my version where generates shorter text segment and also
optimizes for uint32_t.
Paul Eggert Jan. 23, 2018, 6:04 a.m. | #9
Adhemerval Zanella wrote:
> At the cost of large text sizes and slight more code:


Yes, that's a common tradeoff for this sort of optimization. My guess is that 
most glibc users these days would like to spend 4 kB of text space to gain a 
2%-or-so CPU speedup. (But it's just a guess. :-)
> I still prefer my version where generates shorter text segment and also

> optimizes for uint32_t.


The more-inlined version could also optimize for uint32_t. Such an optimization 
should not change the machine code on platforms with 32-bit pointers (since 
uint32_t has the same size and alignment restrictions as void *, and GCC should 
be smart enough to figure this out) but should speed up the size-4 case on 
platforms with 64-bit pointers.

Any thoughts on why the more-inlined version is a bit slower when input is 
already sorted?
Adhemerval Zanella Jan. 23, 2018, 6:28 p.m. | #10
On 23/01/2018 04:04, Paul Eggert wrote:
> Adhemerval Zanella wrote:

>> At the cost of large text sizes and slight more code:

> 

> Yes, that's a common tradeoff for this sort of optimization. My guess is that most glibc users these days would like to spend 4 kB of text space to gain a 2%-or-so CPU speedup. (But it's just a guess. :-)

>> I still prefer my version where generates shorter text segment and also

>> optimizes for uint32_t.

> 

> The more-inlined version could also optimize for uint32_t. Such an optimization should not change the machine code on platforms with 32-bit pointers (since uint32_t has the same size and alignment restrictions as void *, and GCC should be smart enough to figure this out) but should speed up the size-4 case on platforms with 64-bit pointers.

> 

> Any thoughts on why the more-inlined version is a bit slower when input is already sorted?


Again do we really to over-engineering it? GCC profile usage shows 95% to total 
issues done with up to 9 elements and 92% of key size 8.  Firefox is somewhat 
more diverse with 72% up to 17 elements and 95% of key size 8.  I think that 
adding even more code complexity by parametrizing the qsort calls to inline 
the swap operations won't really make much difference in the aforementioned
user cases.

I would rather add specialized sort implementation such as BSD family, heapsort
and mergesort, to provide different algorithm for different constraints (mergesort
for stable-sort, heapsort/mergesort to avoid worse-case from quicksort). We might
even extend it to add something like introsort.
Paul Eggert Jan. 23, 2018, 11:37 p.m. | #11
On 01/23/2018 10:28 AM, Adhemerval Zanella wrote:

> Again do we really to over-engineering it? GCC profile usage shows 95% to total

> issues done with up to 9 elements and 92% of key size 8.  Firefox is somewhat

> more diverse with 72% up to 17 elements and 95% of key size 8.


You have a point. I assume these were on machines with 64-bit pointers. 
In that case why bother with a size-4 special case? Special-casing 
pointer-size items should suffice.

> I would rather add specialized sort implementation such as BSD family, heapsort

> and mergesort, to provide different algorithm for different constraints (mergesort

> for stable-sort, heapsort/mergesort to avoid worse-case from quicksort). We might

> even extend it to add something like introsort.


Each of us over-engineers in his own way (:-).
Adhemerval Zanella Jan. 24, 2018, 10:47 a.m. | #12
On 23/01/2018 21:37, Paul Eggert wrote:
> On 01/23/2018 10:28 AM, Adhemerval Zanella wrote:

> 

>> Again do we really to over-engineering it? GCC profile usage shows 95% to total

>> issues done with up to 9 elements and 92% of key size 8.  Firefox is somewhat

>> more diverse with 72% up to 17 elements and 95% of key size 8.

> 

> You have a point. I assume these were on machines with 64-bit pointers. In that case why bother with a size-4 special case? Special-casing pointer-size items should suffice.


Yes, I just tested on x86_64 and I added the size-4 mainly because is quite simple
in terms of code complexity and resulting code size.

> 

>> I would rather add specialized sort implementation such as BSD family, heapsort

>> and mergesort, to provide different algorithm for different constraints (mergesort

>> for stable-sort, heapsort/mergesort to avoid worse-case from quicksort). We might

>> even extend it to add something like introsort.

> 

> Each of us over-engineers in his own way (:-).

> 


I do think your points are fair, most usage of qsort is already hitting the quicksort
implementation (due the total size of array) and for these cases they will do see
a small speed up due swap optimization and the undefined behaviour fix for qsort_r.

I think if speed is the focus, there is some other idea to optimize for it like
BZ#17941.

Patch

diff --git a/stdlib/qsort.c b/stdlib/qsort.c
index b3a5102..2194003 100644
--- a/stdlib/qsort.c
+++ b/stdlib/qsort.c
@@ -23,20 +23,59 @@ 
 #include <limits.h>
 #include <stdlib.h>
 #include <string.h>
+#include <stdbool.h>
 
-/* Byte-wise swap two items of size SIZE. */
-#define SWAP(a, b, size)						      \
-  do									      \
-    {									      \
-      size_t __size = (size);						      \
-      char *__a = (a), *__b = (b);					      \
-      do								      \
-	{								      \
-	  char __tmp = *__a;						      \
-	  *__a++ = *__b;						      \
-	  *__b++ = __tmp;						      \
-	} while (--__size > 0);						      \
-    } while (0)
+/* Swap SIZE bytes between addresses A and B.  Helper to generic types
+   are provided as an optimization.  */
+
+typedef void (*swap_t)(void *, void *, size_t);
+
+static inline bool
+check_alignment (const void *base, size_t align)
+{
+  return _STRING_ARCH_unaligned || ((uintptr_t)base % (align - 1)) == 0;
+}
+
+static void
+swap_u32 (void *a, void *b, size_t size)
+{
+  uint32_t tmp = *(uint32_t*) a;
+  *(uint32_t*) a = *(uint32_t*) b;
+  *(uint32_t*) b = tmp;
+}
+
+static void
+swap_u64 (void *a, void *b, size_t size)
+{
+  uint64_t tmp = *(uint64_t*) a;
+  *(uint64_t*) a = *(uint64_t*) b;
+  *(uint64_t*) b = tmp;
+}
+
+static inline void
+swap_generic (void *a, void *b, size_t size)
+{
+  unsigned char tmp[128];
+  do
+    {
+      size_t s = size > sizeof (tmp) ? sizeof (tmp) : size;
+      memcpy (tmp, a, s);
+      a = __mempcpy (a, b, s);
+      b = __mempcpy (b, tmp, s);
+      size -= s;
+    }
+  while (size > 0);
+}
+
+static inline swap_t
+select_swap_func (const void *base, size_t size)
+{
+  if (size == 4 && check_alignment (base, 4))
+    return swap_u32;
+  else if (size == 8 && check_alignment (base, 8))
+    return swap_u64;
+  return swap_generic;
+}
 
 /* Discontinue quicksort algorithm when partition gets below this size.
    This particular magic number was chosen to work best on a Sun 4/260. */
@@ -96,6 +135,8 @@  __qsort_r (void *const pbase, size_t total_elems, size_t size,
     /* Avoid lossage with unsigned arithmetic below.  */
     return;
 
+  swap_t swap = select_swap_func (pbase, size);
+
   if (total_elems > MAX_THRESH)
     {
       char *lo = base_ptr;
@@ -119,13 +160,13 @@  __qsort_r (void *const pbase, size_t total_elems, size_t size,
 	  char *mid = lo + size * ((hi - lo) / size >> 1);
 
 	  if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
-	    SWAP (mid, lo, size);
+	    swap (mid, lo, size);
 	  if ((*cmp) ((void *) hi, (void *) mid, arg) < 0)
-	    SWAP (mid, hi, size);
+	    swap (mid, hi, size);
 	  else
 	    goto jump_over;
 	  if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
-	    SWAP (mid, lo, size);
+	    swap (mid, lo, size);
 	jump_over:;
 
 	  left_ptr  = lo + size;
@@ -144,7 +185,7 @@  __qsort_r (void *const pbase, size_t total_elems, size_t size,
 
 	      if (left_ptr < right_ptr)
 		{
-		  SWAP (left_ptr, right_ptr, size);
+		  swap (left_ptr, right_ptr, size);
 		  if (mid == left_ptr)
 		    mid = right_ptr;
 		  else if (mid == right_ptr)
@@ -216,7 +257,7 @@  __qsort_r (void *const pbase, size_t total_elems, size_t size,
         tmp_ptr = run_ptr;
 
     if (tmp_ptr != base_ptr)
-      SWAP (tmp_ptr, base_ptr, size);
+      swap (tmp_ptr, base_ptr, size);
 
     /* Insertion sort, running from left-hand-side up to right-hand-side.  */