[7/7] stdlib: Remove undefined behavior from qsort implementation

Message ID 1516298002-4618-8-git-send-email-adhemerval.zanella@linaro.org
State New
Headers show
Series
  • Refactor qsort implementation
Related show

Commit Message

Adhemerval Zanella Jan. 18, 2018, 5:53 p.m.
Internally qsort is implemented on top of __qsort_r by casting the
function pointer to another type (__compar_fn_t tp __compar_d_fn_t)
and passing a NULL extra argument.  Casting function pointer with
different types for subsequent function call is undefined-behaviour
(C11 6.3.2.3):

  "[8] A pointer to a function of one type may be converted to a pointer
  to a function of another type and back again; the result shall compare
  equal to the original pointer. If a converted pointer is used to call
  a function whose type is not compatible with the referenced type,
  the behavior is undefined."

Also 'compatible' in this case also does not apply according to
6.7.6.3 Function declarators (including prototypes):

  "[15] For two function types to be compatible, both shall specify
  compatible return types. (146) Moreover, the parameter type lists,
  if both are present, shall agree in the number of parameters and
  in use of the ellipsis terminator; corresponding parameters shall
  have compatible types. [...]"

Although this works on all architectures glibc supports (mostly because
it casts function pointers with similar calling conventions), I think
it is worth to avoid it.  This patch fixes it by adding a common
implementation (qsort_common.c) which redefines the function based
on the required types.

For x86_64 (i7-4790K, gcc 7.2.1) shows a slight better performance
for qsort:

Results for member size 4
  Sorted
  nmemb   |      base |   patched | diff
        32|      1304 |      1257 | -3.60
      4096|    330707 |    302235 | -8.61
     32768|   3300210 |   3020728 | -8.47
    524288|  65673289 |  59306436 | -9.69

  Repeated
  nmemb   |      base |   patched | diff
        32|      1885 |      1873 | -0.64
      4096|    951490 |    904864 | -4.90
     32768|   9272366 |   8542801 | -7.87
    524288| 183337854 | 168426795 | -8.13

  MostlySorted
  nmemb   |      base |   patched | diff
        32|      1836 |      1776 | -3.27
      4096|    758359 |    709937 | -6.39
     32768|   7199982 |   6855890 | -4.78
    524288| 139242170 | 129385161 | -7.08

  Unsorted
  nmemb   |      base |   patched | diff
        32|      2073 |      1941 | -6.37
      4096|   1058383 |    969021 | -8.44
     32768|  10310116 |   9462116 | -8.22
    524288| 202427388 | 186560908 | -7.84

Results for member size 8
  Sorted
  nmemb   |      base |   patched | diff
        32|      1224 |      1205 | -1.55
      4096|    336100 |    325554 | -3.14
     32768|   3539890 |   3264125 | -7.79
    524288|  67268510 |  66107684 | -1.73

  Repeated
  nmemb   |      base |   patched | diff
        32|      2096 |      2118 | 1.05
      4096|   1015585 |    979114 | -3.59
     32768|   9871981 |   9028606 | -8.54
    524288| 189710172 | 174903867 | -7.80

  MostlySorted
  nmemb   |      base |   patched | diff
        32|      2318 |      2346 | 1.21
      4096|    805051 |    759158 | -5.70
     32768|   8346363 |   7810444 | -6.42
    524288| 143597264 | 135900146 | -5.36

  Unsorted
  nmemb   |      base |   patched | diff
        32|      2364 |      2301 | -2.66
      4096|   1076998 |   1014018 | -5.85
     32768|  10442153 |   9888078 | -5.31
    524288| 206235337 | 192479957 | -6.67

Results for member size 32
  Sorted
  nmemb   |      base |   patched | diff
        32|      1214 |      1184 | -2.47
      4096|    332449 |    325865 | -1.98
     32768|   3313274 |   3331750 | 0.56
    524288|  70786673 |  69067176 | -2.43

  Repeated
  nmemb   |      base |   patched | diff
        32|      4913 |      4813 | -2.04
      4096|   1693735 |   1624137 | -4.11
     32768|  17054760 |  15896739 | -6.79
    524288| 332149265 | 316328778 | -4.76

  MostlySorted
  nmemb   |      base |   patched | diff
        32|      5490 |      5332 | -2.88
      4096|   1394312 |   1312703 | -5.85
     32768|  12743599 |  12360726 | -3.00
    524288| 240249011 | 231603294 | -3.60

  Unsorted
  nmemb   |      base |   patched | diff
        32|      6251 |      6047 | -3.26
      4096|   1959306 |   1695241 | -13.48
     32768|  17204840 |  16430388 | -4.50
    524288| 342716199 | 329496913 | -3.86

Checked on x86_64-linux-gnu.

	* stdlib/qsort.c: Move common code to stdlib/qsort_common.c
	and parametrize the function definition based wether to use
	the '_r' variant.
	* stdlib/qsort_common.c: New file.
---
 stdlib/qsort.c        | 208 ++--------------------------------------------
 stdlib/qsort_common.c | 225 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 233 insertions(+), 200 deletions(-)
 create mode 100644 stdlib/qsort_common.c

-- 
2.7.4

Patch

diff --git a/stdlib/qsort.c b/stdlib/qsort.c
index 2194003..03ab0e5 100644
--- a/stdlib/qsort.c
+++ b/stdlib/qsort.c
@@ -16,17 +16,13 @@ 
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-/* If you consider tuning this algorithm, you should consult first:
-   Engineering a sort function; Jon Bentley and M. Douglas McIlroy;
-   Software - Practice and Experience; Vol. 23 (11), 1249-1265, 1993.  */
-
 #include <limits.h>
 #include <stdlib.h>
 #include <string.h>
 #include <stdbool.h>
 
-/* Swap SIZE bytes between addresses A and B.  Helper to generic types
-   are provided as an optimization.  */
+/* Swap SIZE bytes between addresses A and B.  These helpers are provided
+   along the generic one as an optimization.  */
 
 typedef void (*swap_t)(void *, void *, size_t);
 
@@ -98,202 +94,14 @@  typedef struct
 #define	POP(low, high)	((void) (--top, (low = top->lo), (high = top->hi)))
 #define	STACK_NOT_EMPTY	(stack < top)
 
-
-/* Order size using quicksort.  This implementation incorporates
-   four optimizations discussed in Sedgewick:
-
-   1. Non-recursive, using an explicit stack of pointer that store the
-      next array partition to sort.  To save time, this maximum amount
-      of space required to store an array of SIZE_MAX is allocated on the
-      stack.  Assuming a 32-bit (64 bit) integer for size_t, this needs
-      only 32 * sizeof(stack_node) == 256 bytes (for 64 bit: 1024 bytes).
-      Pretty cheap, actually.
-
-   2. Chose the pivot element using a median-of-three decision tree.
-      This reduces the probability of selecting a bad pivot value and
-      eliminates certain extraneous comparisons.
-
-   3. Only quicksorts TOTAL_ELEMS / MAX_THRESH partitions, leaving
-      insertion sort to order the MAX_THRESH items within each partition.
-      This is a big win, since insertion sort is faster for small, mostly
-      sorted array segments.
-
-   4. The larger of the two sub-partitions is always pushed onto the
-      stack first, with the algorithm then concentrating on the
-      smaller partition.  This *guarantees* no more than log (total_elems)
-      stack size is needed (actually O(1) in this case)!  */
-
-void
-__qsort_r (void *const pbase, size_t total_elems, size_t size,
-	   __compar_d_fn_t cmp, void *arg)
-{
-  char *base_ptr = (char *) pbase;
-
-  const size_t max_thresh = MAX_THRESH * size;
-
-  if (total_elems == 0)
-    /* Avoid lossage with unsigned arithmetic below.  */
-    return;
-
-  swap_t swap = select_swap_func (pbase, size);
-
-  if (total_elems > MAX_THRESH)
-    {
-      char *lo = base_ptr;
-      char *hi = &lo[size * (total_elems - 1)];
-      stack_node stack[STACK_SIZE];
-      stack_node *top = stack;
-
-      PUSH (NULL, NULL);
-
-      while (STACK_NOT_EMPTY)
-        {
-          char *left_ptr;
-          char *right_ptr;
-
-	  /* Select median value from among LO, MID, and HI. Rearrange
-	     LO and HI so the three values are sorted. This lowers the
-	     probability of picking a pathological pivot value and
-	     skips a comparison for both the LEFT_PTR and RIGHT_PTR in
-	     the while loops. */
-
-	  char *mid = lo + size * ((hi - lo) / size >> 1);
-
-	  if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
-	    swap (mid, lo, size);
-	  if ((*cmp) ((void *) hi, (void *) mid, arg) < 0)
-	    swap (mid, hi, size);
-	  else
-	    goto jump_over;
-	  if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
-	    swap (mid, lo, size);
-	jump_over:;
-
-	  left_ptr  = lo + size;
-	  right_ptr = hi - size;
-
-	  /* Here's the famous ``collapse the walls'' section of quicksort.
-	     Gotta like those tight inner loops!  They are the main reason
-	     that this algorithm runs much faster than others. */
-	  do
-	    {
-	      while ((*cmp) ((void *) left_ptr, (void *) mid, arg) < 0)
-		left_ptr += size;
-
-	      while ((*cmp) ((void *) mid, (void *) right_ptr, arg) < 0)
-		right_ptr -= size;
-
-	      if (left_ptr < right_ptr)
-		{
-		  swap (left_ptr, right_ptr, size);
-		  if (mid == left_ptr)
-		    mid = right_ptr;
-		  else if (mid == right_ptr)
-		    mid = left_ptr;
-		  left_ptr += size;
-		  right_ptr -= size;
-		}
-	      else if (left_ptr == right_ptr)
-		{
-		  left_ptr += size;
-		  right_ptr -= size;
-		  break;
-		}
-	    }
-	  while (left_ptr <= right_ptr);
-
-          /* Set up pointers for next iteration.  First determine whether
-             left and right partitions are below the threshold size.  If so,
-             ignore one or both.  Otherwise, push the larger partition's
-             bounds on the stack and continue sorting the smaller one. */
-
-          if ((size_t) (right_ptr - lo) <= max_thresh)
-            {
-              if ((size_t) (hi - left_ptr) <= max_thresh)
-		/* Ignore both small partitions. */
-                POP (lo, hi);
-              else
-		/* Ignore small left partition. */
-                lo = left_ptr;
-            }
-          else if ((size_t) (hi - left_ptr) <= max_thresh)
-	    /* Ignore small right partition. */
-            hi = right_ptr;
-          else if ((right_ptr - lo) > (hi - left_ptr))
-            {
-	      /* Push larger left partition indices. */
-              PUSH (lo, right_ptr);
-              lo = left_ptr;
-            }
-          else
-            {
-	      /* Push larger right partition indices. */
-              PUSH (left_ptr, hi);
-              hi = right_ptr;
-            }
-        }
-    }
-
-  /* Once the BASE_PTR array is partially sorted by quicksort the rest
-     is completely sorted using insertion sort, since this is efficient
-     for partitions below MAX_THRESH size. BASE_PTR points to the beginning
-     of the array to sort, and END_PTR points at the very last element in
-     the array (*not* one beyond it!). */
-
-#define min(x, y) ((x) < (y) ? (x) : (y))
-
-  {
-    char *const end_ptr = &base_ptr[size * (total_elems - 1)];
-    char *tmp_ptr = base_ptr;
-    char *thresh = min(end_ptr, base_ptr + max_thresh);
-    char *run_ptr;
-
-    /* Find smallest element in first threshold and place it at the
-       array's beginning.  This is the smallest array element,
-       and the operation speeds up insertion sort's inner loop. */
-
-    for (run_ptr = tmp_ptr + size; run_ptr <= thresh; run_ptr += size)
-      if ((*cmp) ((void *) run_ptr, (void *) tmp_ptr, arg) < 0)
-        tmp_ptr = run_ptr;
-
-    if (tmp_ptr != base_ptr)
-      swap (tmp_ptr, base_ptr, size);
-
-    /* Insertion sort, running from left-hand-side up to right-hand-side.  */
-
-    run_ptr = base_ptr + size;
-    while ((run_ptr += size) <= end_ptr)
-      {
-	tmp_ptr = run_ptr - size;
-	while ((*cmp) ((void *) run_ptr, (void *) tmp_ptr, arg) < 0)
-	  tmp_ptr -= size;
-
-	tmp_ptr += size;
-        if (tmp_ptr != run_ptr)
-          {
-            char *trav;
-
-	    trav = run_ptr + size;
-	    while (--trav >= run_ptr)
-              {
-                char c = *trav;
-                char *hi, *lo;
-
-                for (hi = lo = trav; (lo -= size) >= tmp_ptr; hi = lo)
-                  *hi = *lo;
-                *hi = c;
-              }
-          }
-      }
-  }
-}
+#define R_VERSION
+#define R_FUNC    __qsort_r
+#include <stdlib/qsort_common.c>
 
 libc_hidden_def (__qsort_r)
 weak_alias (__qsort_r, qsort_r)
 
-void
-qsort (void *b, size_t n, size_t s, __compar_fn_t cmp)
-{
-  return __qsort_r (b, n, s, (__compar_d_fn_t) cmp, NULL);
-}
+#define R_FUNC   qsort
+#include <stdlib/qsort_common.c>
+
 libc_hidden_def (qsort)
diff --git a/stdlib/qsort_common.c b/stdlib/qsort_common.c
new file mode 100644
index 0000000..666b195
--- /dev/null
+++ b/stdlib/qsort_common.c
@@ -0,0 +1,225 @@ 
+/* Common implementation for both qsort and qsort_r.
+   Copyright (C) 2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* If you consider tuning this algorithm, you should consult first:
+   Engineering a sort function; Jon Bentley and M. Douglas McIlroy;
+   Software - Practice and Experience; Vol. 23 (11), 1249-1265, 1993.  */
+
+#ifdef R_VERSION
+# define R_CMP_TYPE          __compar_d_fn_t
+# define R_CMP_ARG           , void *arg
+# define R_CMP(p1, p2)       cmp (p1, p2, arg)
+#else
+# define R_CMP_TYPE           __compar_fn_t
+# define R_CMP_ARG
+# define R_CMP(p1, p2)       cmp (p1, p2)
+#endif
+
+/* Order size using quicksort.  This implementation incorporates
+   four optimizations discussed in Sedgewick:
+
+   1. Non-recursive, using an explicit stack of pointer that store the
+      next array partition to sort.  To save time, this maximum amount
+      of space required to store an array of SIZE_MAX is allocated on the
+      stack.  Assuming a 32-bit (64 bit) integer for size_t, this needs
+      only 32 * sizeof(stack_node) == 256 bytes (for 64 bit: 1024 bytes).
+      Pretty cheap, actually.
+
+   2. Chose the pivot element using a median-of-three decision tree.
+      This reduces the probability of selecting a bad pivot value and
+      eliminates certain extraneous comparisons.
+
+   3. Only quicksorts TOTAL_ELEMS / MAX_THRESH partitions, leaving
+      insertion sort to order the MAX_THRESH items within each partition.
+      This is a big win, since insertion sort is faster for small, mostly
+      sorted array segments.
+
+   4. The larger of the two sub-partitions is always pushed onto the
+      stack first, with the algorithm then concentrating on the
+      smaller partition.  This *guarantees* no more than log (total_elems)
+      stack size is needed (actually O(1) in this case)!  */
+
+void
+R_FUNC (void *pbase, size_t total_elems, size_t size, R_CMP_TYPE cmp R_CMP_ARG)
+{
+  if (total_elems == 0)
+    /* Avoid lossage with unsigned arithmetic below.  */
+    return;
+
+  char *base_ptr = (char *) pbase;
+
+  const size_t max_thresh = MAX_THRESH * size;
+
+  swap_t swap = select_swap_func (pbase, size);
+
+  if (total_elems > MAX_THRESH)
+    {
+      char *lo = base_ptr;
+      char *hi = &lo[size * (total_elems - 1)];
+      stack_node stack[STACK_SIZE];
+      stack_node *top = stack;
+
+      PUSH (NULL, NULL);
+
+      while (STACK_NOT_EMPTY)
+        {
+          char *left_ptr;
+          char *right_ptr;
+
+	  /* Select median value from among LO, MID, and HI. Rearrange
+	     LO and HI so the three values are sorted. This lowers the
+	     probability of picking a pathological pivot value and
+	     skips a comparison for both the LEFT_PTR and RIGHT_PTR in
+	     the while loops. */
+
+	  char *mid = lo + size * ((hi - lo) / size >> 1);
+
+	  if (R_CMP ((void *) mid, (void *) lo) < 0)
+	    swap (mid, lo, size);
+	  if (R_CMP ((void *) hi, (void *) mid) < 0)
+	    swap (mid, hi, size);
+	  else
+	    goto jump_over;
+	  if (R_CMP ((void *) mid, (void *) lo) < 0)
+	    swap (mid, lo, size);
+	jump_over:;
+
+	  left_ptr  = lo + size;
+	  right_ptr = hi - size;
+
+	  /* Here's the famous ``collapse the walls'' section of quicksort.
+	     Gotta like those tight inner loops!  They are the main reason
+	     that this algorithm runs much faster than others. */
+	  do
+	    {
+	      while (R_CMP ((void *) left_ptr, (void *) mid) < 0)
+		left_ptr += size;
+
+	      while (R_CMP ((void *) mid, (void *) right_ptr) < 0)
+		right_ptr -= size;
+
+	      if (left_ptr < right_ptr)
+		{
+		  swap (left_ptr, right_ptr, size);
+		  if (mid == left_ptr)
+		    mid = right_ptr;
+		  else if (mid == right_ptr)
+		    mid = left_ptr;
+		  left_ptr += size;
+		  right_ptr -= size;
+		}
+	      else if (left_ptr == right_ptr)
+		{
+		  left_ptr += size;
+		  right_ptr -= size;
+		  break;
+		}
+	    }
+	  while (left_ptr <= right_ptr);
+
+          /* Set up pointers for next iteration.  First determine whether
+             left and right partitions are below the threshold size.  If so,
+             ignore one or both.  Otherwise, push the larger partition's
+             bounds on the stack and continue sorting the smaller one. */
+
+          if ((size_t) (right_ptr - lo) <= max_thresh)
+            {
+              if ((size_t) (hi - left_ptr) <= max_thresh)
+		/* Ignore both small partitions. */
+                POP (lo, hi);
+              else
+		/* Ignore small left partition. */
+                lo = left_ptr;
+            }
+          else if ((size_t) (hi - left_ptr) <= max_thresh)
+	    /* Ignore small right partition. */
+            hi = right_ptr;
+          else if ((right_ptr - lo) > (hi - left_ptr))
+            {
+	      /* Push larger left partition indices. */
+              PUSH (lo, right_ptr);
+              lo = left_ptr;
+            }
+          else
+            {
+	      /* Push larger right partition indices. */
+              PUSH (left_ptr, hi);
+              hi = right_ptr;
+            }
+        }
+    }
+
+  /* Once the BASE_PTR array is partially sorted by quicksort the rest
+     is completely sorted using insertion sort, since this is efficient
+     for partitions below MAX_THRESH size. BASE_PTR points to the beginning
+     of the array to sort, and END_PTR points at the very last element in
+     the array (*not* one beyond it!). */
+
+  {
+    char *const end_ptr = &base_ptr[size * (total_elems - 1)];
+    char *tmp_ptr = base_ptr;
+    char *thresh = end_ptr < base_ptr + max_thresh ?
+		   end_ptr : base_ptr + max_thresh;
+    char *run_ptr;
+
+    /* Find smallest element in first threshold and place it at the
+       array's beginning.  This is the smallest array element,
+       and the operation speeds up insertion sort's inner loop. */
+
+    for (run_ptr = tmp_ptr + size; run_ptr <= thresh; run_ptr += size)
+      if (R_CMP ((void *) run_ptr, (void *) tmp_ptr) < 0)
+        tmp_ptr = run_ptr;
+
+    if (tmp_ptr != base_ptr)
+      swap (tmp_ptr, base_ptr, size);
+
+    /* Insertion sort, running from left-hand-side up to right-hand-side.  */
+
+    run_ptr = base_ptr + size;
+    while ((run_ptr += size) <= end_ptr)
+      {
+	tmp_ptr = run_ptr - size;
+	while (R_CMP ((void *) run_ptr, (void *) tmp_ptr) < 0)
+	  tmp_ptr -= size;
+
+	tmp_ptr += size;
+        if (tmp_ptr != run_ptr)
+          {
+            char *trav;
+
+	    trav = run_ptr + size;
+	    while (--trav >= run_ptr)
+              {
+                char c = *trav;
+                char *hi, *lo;
+
+                for (hi = lo = trav; (lo -= size) >= tmp_ptr; hi = lo)
+                  *hi = *lo;
+                *hi = c;
+              }
+          }
+      }
+  }
+}
+
+#undef R_NAME
+#undef R_CMP_TYPE
+#undef R_CMP_ARG
+#undef R_CMP
+#undef R_FUNC
+#undef R_VERSION