diff mbox series

[v2,5/7] stdlib: Remove use of mergesort on qsort

Message ID 20180831204238.10626-6-adhemerval.zanella@linaro.org
State Superseded
Headers show
Series Refactor qsort implementation | expand

Commit Message

Adhemerval Zanella Aug. 31, 2018, 8:42 p.m. UTC
This patch removes the mergesort optimization from the qsort{_r}
implementation and uses quicksort instead.  The mergesort implementation
has some issues:

  - It is AS-safe only for certain type sizes (if the total size is less
    than 1 KB, with large element sizes also forcing memory allocation),
    which contradicts the function documentation.  Although not required
    by the C standard, it is preferable and doable to have an O(1) space
    implementation.

  - The malloc call for certain element sizes and element counts adds
    arbitrary latency (which might be even worse if malloc is interposed).

  - To avoid triggering swap through memory allocation, the implementation
    relies on system information that might be virtualized (for instance,
    VMs with memory overcommit), which might potentially lead to the use
    of swap if the system advertises more memory than it actually has.
    The check also has the downside of issuing syscalls where none are
    expected (although only once per execution).

  - The mergesort is suboptimal on already sorted arrays (BZ#21719).

The quicksort implementation is already optimized to use constant extra
space (due to the limit on the total number of elements imposed by the
maximum VM size) and thus can be used to avoid the malloc usage issues.

Using bench-qsort (i7-4790K, gcc 7.1.1) shows the performance difference
between mergesort (base) and quicksort (patched):

Results for member size 4
  MostlySorted
  nmemb   |      base |   patched | diff
       32 |      1791 |      2145 | 19.77
     4096 |    530267 |    902724 | 70.24
    32768 |   5319819 |   7844403 | 47.46
   524288 | 105147020 | 152809379 | 45.33

  Repeated
  nmemb   |      base |   patched | diff
       32 |      1988 |      2222 | 11.77
     4096 |    898057 |   1029244 | 14.61
    32768 |   8890765 |  10057897 | 13.13
   524288 | 178316071 | 197150076 | 10.56

  Sorted
  nmemb   |      base |   patched | diff
       32 |      1511 |      1461 | -3.31
     4096 |    277733 |    357884 | 28.86
    32768 |   2634360 |   3468080 | 31.65
   524288 |  49793076 |  67584959 | 35.73

  Unsorted
  nmemb   |      base |   patched | diff
       32 |      2070 |      2385 | 15.22
     4096 |    941830 |   1146892 | 21.77
    32768 |   9492371 |  10799397 | 13.77
   524288 | 191355021 | 212098446 | 10.84

Results for member size 8
  MostlySorted
  nmemb   |      base |   patched | diff
       32 |      1763 |      2676 | 51.79
     4096 |    510794 |    907769 | 77.72
    32768 |   5075103 |   8605499 | 69.56
   524288 | 103741137 | 154255341 | 48.69

  Repeated
  nmemb   |      base |   patched | diff
       32 |      1908 |      2230 | 16.88
     4096 |    904798 |   1129157 | 24.80
    32768 |   8954918 |  10775229 | 20.33
   524288 | 179825532 | 212935649 | 18.41

  Sorted
  nmemb   |      base |   patched | diff
       32 |      1316 |      1193 | -9.35
     4096 |    261069 |    308152 | 18.03
    32768 |   2449581 |   3022480 | 23.39
   524288 |  47772793 |  60029109 | 25.66

  Unsorted
  nmemb   |      base |   patched | diff
       32 |      2011 |      2814 | 39.93
     4096 |    953723 |   1198160 | 25.63
    32768 |   9539278 |  11678920 | 22.43
   524288 | 193690362 | 229161344 | 18.31

Results for member size 32
  MostlySorted
  nmemb   |      base |   patched | diff
       32 |      4686 |      5073 | 8.26
     4096 |   1688822 |   1572437 | -6.89
    32768 |  17633569 |  14170732 | -19.64
   524288 | 375170630 | 267001863 | -28.83

  Repeated
  nmemb   |      base |   patched | diff
       32 |      5138 |      5592 | 8.84
     4096 |   2187509 |   1890849 | -13.56
    32768 |  22271793 |  18284219 | -17.90
   524288 | 468956765 | 361847282 | -22.84

  Sorted
  nmemb   |      base |   patched | diff
       32 |      3581 |      1179 | -67.08
     4096 |    938145 |    308793 | -67.08
    32768 |   9553669 |   3017486 | -68.42
   524288 | 194239124 |  63986145 | -67.06

  Unsorted
  nmemb   |      base |   patched | diff
       32 |      5235 |      6591 | 25.90
     4096 |   2227377 |   1990681 | -10.63
    32768 |  22875769 |  19127569 | -16.39
   524288 | 484156353 | 375072780 | -22.53

The result shows an increase in latency, as expected.  Some of the
performance difference is also due to the fact that mergesort uses a
slightly better swap operation than quicksort (which a following patch
addresses).  This change also renders the BZ #21719 fix unnecessary (since
it is meant to fix the sorted input performance degradation for
mergesort).  The manual is also updated to indicate the function is now
async-cancel safe.

Checked on x86_64-linux-gnu.

	[BZ #21719]
	* stdlib/Makefile (routines): Remove msort.
	(CFLAGS-msort.c): Remove rule.
	* stdlib/msort.c: Remove file.
	* stdlib/qsort.c (_quicksort): Rename to __qsort_r and add weak_alias
	to qsort_r.
	(qsort): New symbol.
	* manual/argp.texi: Remove qsort @acu* annotation.
	* manual/locale.texi: Likewise.
	* manual/search.texi: Likewise.
---
 manual/argp.texi   |   2 +-
 manual/locale.texi |   3 +-
 manual/search.texi |   7 +-
 stdlib/Makefile    |   3 +-
 stdlib/msort.c     | 310 ---------------------------------------------
 stdlib/qsort.c     |  15 ++-
 6 files changed, 18 insertions(+), 322 deletions(-)
 delete mode 100644 stdlib/msort.c

-- 
2.17.1
diff mbox series

Patch

diff --git a/manual/argp.texi b/manual/argp.texi
index 0023441812..b77ad68285 100644
--- a/manual/argp.texi
+++ b/manual/argp.texi
@@ -735,7 +735,7 @@  for options, bad phase of the moon, etc.
 @c  hol_set_group ok
 @c   hol_find_entry ok
 @c  hol_sort @mtslocale @acucorrupt
-@c   qsort dup @acucorrupt
+@c   qsort dup
 @c    hol_entry_qcmp @mtslocale
 @c     hol_entry_cmp @mtslocale
 @c      group_cmp ok
diff --git a/manual/locale.texi b/manual/locale.texi
index dabb959f9e..15d3b7820c 100644
--- a/manual/locale.texi
+++ b/manual/locale.texi
@@ -253,7 +253,7 @@  The symbols in this section are defined in the header file @file{locale.h}.
 @c    calculate_head_size ok
 @c    __munmap ok
 @c    compute_hashval ok
-@c    qsort dup @acucorrupt
+@c    qsort dup
 @c     rangecmp ok
 @c    malloc @ascuheap @acsmem
 @c    strdup @ascuheap @acsmem
@@ -275,7 +275,6 @@  The symbols in this section are defined in the header file @file{locale.h}.
 @c      realloc @ascuheap @acsmem
 @c     realloc @ascuheap @acsmem
 @c     fclose @ascuheap @asulock @acsmem @acsfd @aculock
-@c     qsort @ascuheap @acsmem
 @c      alias_compare dup
 @c    libc_lock_unlock @aculock
 @c   _nl_explode_name @ascuheap @acsmem
diff --git a/manual/search.texi b/manual/search.texi
index 57dad7a56d..148d451701 100644
--- a/manual/search.texi
+++ b/manual/search.texi
@@ -159,7 +159,7 @@  To sort an array using an arbitrary comparison function, use the
 
 @deftypefun void qsort (void *@var{array}, size_t @var{count}, size_t @var{size}, comparison_fn_t @var{compare})
 @standards{ISO, stdlib.h}
-@safety{@prelim{}@mtsafe{}@assafe{}@acunsafe{@acucorrupt{}}}
+@safety{@prelim{}@mtsafe{}@assafe{}@acsafe{}}
 The @code{qsort} function sorts the array @var{array}.  The array
 contains @var{count} elements, each of which is of size @var{size}.
 
@@ -199,9 +199,8 @@  Functions}):
 The @code{qsort} function derives its name from the fact that it was
 originally implemented using the ``quick sort'' algorithm.
 
-The implementation of @code{qsort} in this library might not be an
-in-place sort and might thereby use an extra amount of memory to store
-the array.
+The implementation of @code{qsort} in this library is an in-place sort
+and uses a constant extra space (allocated on the stack).
 @end deftypefun
 
 @node Search/Sort Example
diff --git a/stdlib/Makefile b/stdlib/Makefile
index 4e012a865a..2c28033196 100644
--- a/stdlib/Makefile
+++ b/stdlib/Makefile
@@ -34,7 +34,7 @@  headers	:= stdlib.h bits/stdlib.h bits/stdlib-ldbl.h bits/stdlib-float.h      \
 routines	:=							      \
 	atof atoi atol atoll						      \
 	abort								      \
-	bsearch qsort msort						      \
+	bsearch qsort							      \
 	getenv putenv setenv secure-getenv				      \
 	exit on_exit atexit cxa_atexit cxa_finalize old_atexit		      \
 	quick_exit at_quick_exit cxa_at_quick_exit cxa_thread_atexit_impl     \
@@ -139,7 +139,6 @@  extra-test-objs += tst-putenvmod.os
 generated += isomac isomac.out tst-putenvmod.so
 
 CFLAGS-bsearch.c += $(uses-callbacks)
-CFLAGS-msort.c += $(uses-callbacks)
 CFLAGS-qsort.c += $(uses-callbacks)
 CFLAGS-system.c += -fexceptions
 CFLAGS-system.os = -fomit-frame-pointer
diff --git a/stdlib/msort.c b/stdlib/msort.c
deleted file mode 100644
index 266c2538c0..0000000000
--- a/stdlib/msort.c
+++ /dev/null
@@ -1,310 +0,0 @@ 
-/* An alternative to qsort, with an identical interface.
-   This file is part of the GNU C Library.
-   Copyright (C) 1992-2018 Free Software Foundation, Inc.
-   Written by Mike Haertel, September 1988.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <alloca.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <memcopy.h>
-#include <errno.h>
-#include <atomic.h>
-
-struct msort_param
-{
-  size_t s;
-  size_t var;
-  __compar_d_fn_t cmp;
-  void *arg;
-  char *t;
-};
-static void msort_with_tmp (const struct msort_param *p, void *b, size_t n);
-
-static void
-msort_with_tmp (const struct msort_param *p, void *b, size_t n)
-{
-  char *b1, *b2;
-  size_t n1, n2;
-
-  if (n <= 1)
-    return;
-
-  n1 = n / 2;
-  n2 = n - n1;
-  b1 = b;
-  b2 = (char *) b + (n1 * p->s);
-
-  msort_with_tmp (p, b1, n1);
-  msort_with_tmp (p, b2, n2);
-
-  char *tmp = p->t;
-  const size_t s = p->s;
-  __compar_d_fn_t cmp = p->cmp;
-  void *arg = p->arg;
-  switch (p->var)
-    {
-    case 0:
-      while (n1 > 0 && n2 > 0)
-	{
-	  if ((*cmp) (b1, b2, arg) <= 0)
-	    {
-	      *(uint32_t *) tmp = *(uint32_t *) b1;
-	      b1 += sizeof (uint32_t);
-	      --n1;
-	    }
-	  else
-	    {
-	      *(uint32_t *) tmp = *(uint32_t *) b2;
-	      b2 += sizeof (uint32_t);
-	      --n2;
-	    }
-	  tmp += sizeof (uint32_t);
-	}
-      break;
-    case 1:
-      while (n1 > 0 && n2 > 0)
-	{
-	  if ((*cmp) (b1, b2, arg) <= 0)
-	    {
-	      *(uint64_t *) tmp = *(uint64_t *) b1;
-	      b1 += sizeof (uint64_t);
-	      --n1;
-	    }
-	  else
-	    {
-	      *(uint64_t *) tmp = *(uint64_t *) b2;
-	      b2 += sizeof (uint64_t);
-	      --n2;
-	    }
-	  tmp += sizeof (uint64_t);
-	}
-      break;
-    case 2:
-      while (n1 > 0 && n2 > 0)
-	{
-	  unsigned long *tmpl = (unsigned long *) tmp;
-	  unsigned long *bl;
-
-	  tmp += s;
-	  if ((*cmp) (b1, b2, arg) <= 0)
-	    {
-	      bl = (unsigned long *) b1;
-	      b1 += s;
-	      --n1;
-	    }
-	  else
-	    {
-	      bl = (unsigned long *) b2;
-	      b2 += s;
-	      --n2;
-	    }
-	  while (tmpl < (unsigned long *) tmp)
-	    *tmpl++ = *bl++;
-	}
-      break;
-    case 3:
-      while (n1 > 0 && n2 > 0)
-	{
-	  if ((*cmp) (*(const void **) b1, *(const void **) b2, arg) <= 0)
-	    {
-	      *(void **) tmp = *(void **) b1;
-	      b1 += sizeof (void *);
-	      --n1;
-	    }
-	  else
-	    {
-	      *(void **) tmp = *(void **) b2;
-	      b2 += sizeof (void *);
-	      --n2;
-	    }
-	  tmp += sizeof (void *);
-	}
-      break;
-    default:
-      while (n1 > 0 && n2 > 0)
-	{
-	  if ((*cmp) (b1, b2, arg) <= 0)
-	    {
-	      tmp = (char *) __mempcpy (tmp, b1, s);
-	      b1 += s;
-	      --n1;
-	    }
-	  else
-	    {
-	      tmp = (char *) __mempcpy (tmp, b2, s);
-	      b2 += s;
-	      --n2;
-	    }
-	}
-      break;
-    }
-
-  if (n1 > 0)
-    memcpy (tmp, b1, n1 * s);
-  memcpy (b, p->t, (n - n2) * s);
-}
-
-
-void
-__qsort_r (void *b, size_t n, size_t s, __compar_d_fn_t cmp, void *arg)
-{
-  size_t size = n * s;
-  char *tmp = NULL;
-  struct msort_param p;
-
-  /* For large object sizes use indirect sorting.  */
-  if (s > 32)
-    size = 2 * n * sizeof (void *) + s;
-
-  if (size < 1024)
-    /* The temporary array is small, so put it on the stack.  */
-    p.t = __alloca (size);
-  else
-    {
-      /* We should avoid allocating too much memory since this might
-	 have to be backed up by swap space.  */
-      static long int phys_pages;
-      static int pagesize;
-
-      if (pagesize == 0)
-	{
-	  phys_pages = __sysconf (_SC_PHYS_PAGES);
-
-	  if (phys_pages == -1)
-	    /* Error while determining the memory size.  So let's
-	       assume there is enough memory.  Otherwise the
-	       implementer should provide a complete implementation of
-	       the `sysconf' function.  */
-	    phys_pages = (long int) (~0ul >> 1);
-
-	  /* The following determines that we will never use more than
-	     a quarter of the physical memory.  */
-	  phys_pages /= 4;
-
-	  /* Make sure phys_pages is written to memory.  */
-	  atomic_write_barrier ();
-
-	  pagesize = __sysconf (_SC_PAGESIZE);
-	}
-
-      /* Just a comment here.  We cannot compute
-	   phys_pages * pagesize
-	   and compare the needed amount of memory against this value.
-	   The problem is that some systems might have more physical
-	   memory then can be represented with a `size_t' value (when
-	   measured in bytes.  */
-
-      /* If the memory requirements are too high don't allocate memory.  */
-      if (size / pagesize > (size_t) phys_pages)
-	{
-	  _quicksort (b, n, s, cmp, arg);
-	  return;
-	}
-
-      /* It's somewhat large, so malloc it.  */
-      int save = errno;
-      tmp = malloc (size);
-      __set_errno (save);
-      if (tmp == NULL)
-	{
-	  /* Couldn't get space, so use the slower algorithm
-	     that doesn't need a temporary array.  */
-	  _quicksort (b, n, s, cmp, arg);
-	  return;
-	}
-      p.t = tmp;
-    }
-
-  p.s = s;
-  p.var = 4;
-  p.cmp = cmp;
-  p.arg = arg;
-
-  if (s > 32)
-    {
-      /* Indirect sorting.  */
-      char *ip = (char *) b;
-      void **tp = (void **) (p.t + n * sizeof (void *));
-      void **t = tp;
-      void *tmp_storage = (void *) (tp + n);
-
-      while ((void *) t < tmp_storage)
-	{
-	  *t++ = ip;
-	  ip += s;
-	}
-      p.s = sizeof (void *);
-      p.var = 3;
-      msort_with_tmp (&p, p.t + n * sizeof (void *), n);
-
-      /* tp[0] .. tp[n - 1] is now sorted, copy around entries of
-	 the original array.  Knuth vol. 3 (2nd ed.) exercise 5.2-10.  */
-      char *kp;
-      size_t i;
-      for (i = 0, ip = (char *) b; i < n; i++, ip += s)
-	if ((kp = tp[i]) != ip)
-	  {
-	    size_t j = i;
-	    char *jp = ip;
-	    memcpy (tmp_storage, ip, s);
-
-	    do
-	      {
-		size_t k = (kp - (char *) b) / s;
-		tp[j] = jp;
-		memcpy (jp, kp, s);
-		j = k;
-		jp = kp;
-		kp = tp[k];
-	      }
-	    while (kp != ip);
-
-	    tp[j] = jp;
-	    memcpy (jp, tmp_storage, s);
-	  }
-    }
-  else
-    {
-      if ((s & (sizeof (uint32_t) - 1)) == 0
-	  && ((char *) b - (char *) 0) % __alignof__ (uint32_t) == 0)
-	{
-	  if (s == sizeof (uint32_t))
-	    p.var = 0;
-	  else if (s == sizeof (uint64_t)
-		   && ((char *) b - (char *) 0) % __alignof__ (uint64_t) == 0)
-	    p.var = 1;
-	  else if ((s & (sizeof (unsigned long) - 1)) == 0
-		   && ((char *) b - (char *) 0)
-		      % __alignof__ (unsigned long) == 0)
-	    p.var = 2;
-	}
-      msort_with_tmp (&p, b, n);
-    }
-  free (tmp);
-}
-libc_hidden_def (__qsort_r)
-weak_alias (__qsort_r, qsort_r)
-
-
-void
-qsort (void *b, size_t n, size_t s, __compar_fn_t cmp)
-{
-  return __qsort_r (b, n, s, (__compar_d_fn_t) cmp, NULL);
-}
-libc_hidden_def (qsort)
diff --git a/stdlib/qsort.c b/stdlib/qsort.c
index 264a06b8a9..b3a5102cac 100644
--- a/stdlib/qsort.c
+++ b/stdlib/qsort.c
@@ -20,7 +20,6 @@ 
    Engineering a sort function; Jon Bentley and M. Douglas McIlroy;
    Software - Practice and Experience; Vol. 23 (11), 1249-1265, 1993.  */
 
-#include <alloca.h>
 #include <limits.h>
 #include <stdlib.h>
 #include <string.h>
@@ -86,8 +85,8 @@  typedef struct
       stack size is needed (actually O(1) in this case)!  */
 
 void
-_quicksort (void *const pbase, size_t total_elems, size_t size,
-	    __compar_d_fn_t cmp, void *arg)
+__qsort_r (void *const pbase, size_t total_elems, size_t size,
+	   __compar_d_fn_t cmp, void *arg)
 {
   char *base_ptr = (char *) pbase;
 
@@ -247,3 +246,13 @@  _quicksort (void *const pbase, size_t total_elems, size_t size,
       }
   }
 }
+
+libc_hidden_def (__qsort_r)
+weak_alias (__qsort_r, qsort_r)
+
+void
+qsort (void *b, size_t n, size_t s, __compar_fn_t cmp)
+{
+  return __qsort_r (b, n, s, (__compar_d_fn_t) cmp, NULL);
+}
+libc_hidden_def (qsort)