Message ID | 20230914183704.1386534-6-adhemerval.zanella@linaro.org |
---|---|
State | Superseded |
Headers | show |
Series | Use introsort for qsort | expand |
On Thu, Sep 14, 2023 at 1:37 PM Adhemerval Zanella <adhemerval.zanella@linaro.org> wrote: > > This patch makes the quicksort implementation to acts as introsort, to > avoid worse-case performance (and thus making it O(nlog n)). It switch > to heapsort when the depth level reaches 2*log2(total elements). The > heapsort is a textbook implementation. > > Checked on x86_64-linux-gnu. > --- > stdlib/qsort.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++---- > 1 file changed, 80 insertions(+), 7 deletions(-) > > diff --git a/stdlib/qsort.c b/stdlib/qsort.c > index 4082f5f9c1..e160932657 100644 > --- a/stdlib/qsort.c > +++ b/stdlib/qsort.c > @@ -97,6 +97,7 @@ typedef struct > { > char *lo; > char *hi; > + size_t depth; > } stack_node; > > /* The stack needs log (total_elements) entries (we could even subtract > @@ -106,22 +107,83 @@ typedef struct > enum { STACK_SIZE = CHAR_BIT * sizeof (size_t) }; > > static inline stack_node * > -push (stack_node *top, char *lo, char *hi) > +push (stack_node *top, char *lo, char *hi, size_t depth) > { > top->lo = lo; > top->hi = hi; > + top->depth = depth; > return ++top; > } > > static inline stack_node * > -pop (stack_node *top, char **lo, char **hi) > +pop (stack_node *top, char **lo, char **hi, size_t *depth) > { > --top; > *lo = top->lo; > *hi = top->hi; > + *depth = top->depth; > return top; > } > > +/* A fast, small, non-recursive O(nlog n) heapsort, adapted from Linux > + lib/sort.c. Used on introsort implementation as a fallback routine with > + worst-case performance of O(nlog n) and worst-case space complexity of > + O(1). */ > + > +static inline void > +siftdown (void *base, size_t size, size_t k, size_t n, > + enum swap_type_t swap_type, __compar_d_fn_t cmp, void *arg) > +{ > + while (k <= n / 2) IMO a comment that `n` is an *inclusive* bound would be nice. Otherwise looks very much so like there is an out-bounds-access. Especially since we have `j < n` in the inner loop. > + { > + size_t j = 2 * k; > + if (j < n && cmp (base + (j * size), base + ((j + 1) * size), arg) < 0) > + j++; > + > + if (cmp (base + (k * size), base + (j * size), arg) >= 0) > + break; > + > + do_swap (base + (size * j), base + (k * size), size, swap_type); > + k = j; > + } > +} > + > +static inline void > +heapify (void *base, size_t size, size_t n, enum swap_type_t swap_type, > + __compar_d_fn_t cmp, void *arg) > +{ > + size_t k = n / 2; > + while (1) > + { > + siftdown (base, size, k, n, swap_type, cmp, arg); > + if (k-- == 0) > + break; > + } > +} > + > +static void > +heapsort_r (void *base, void *end, size_t size, enum swap_type_t swap_type, > + __compar_d_fn_t cmp, void *arg) > +{ > + const size_t count = ((uintptr_t) end - (uintptr_t) base) / size; > + > + if (count < 2) > + return; > + > + size_t n = count - 1; > + > + /* Build the binary heap, largest value at the base[0]. */ > + heapify (base, size, n, swap_type, cmp, arg); > + > + /* On each iteration base[0:n] is the binary heap, while base[n:count] > + is sorted. */ > + while (n > 0) > + { > + do_swap (base, base + (n * size), size, swap_type); > + n--; > + siftdown (base, size, 0, n, swap_type, cmp, arg); > + } > +} Might be nice to move `heapsort_r` to header file so it can be tested explicitly with the existing `qsort` test suite. > > static inline void > insertion_sort_qsort_partitions (void *const pbase, size_t total_elems, > @@ -207,7 +269,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size, > > const size_t max_thresh = MAX_THRESH * size; > > - if (total_elems == 0) > + if (total_elems <= 1) > /* Avoid lossage with unsigned arithmetic below. */ > return; > > @@ -219,15 +281,26 @@ _quicksort (void *const pbase, size_t total_elems, size_t size, > else > swap_type = SWAP_BYTES; > > + /* Maximum depth before quicksort switches to heapsort. */ > + size_t depth = 2 * (sizeof (size_t) * CHAR_BIT - 1 > + - __builtin_clzl (total_elems)); > + > if (total_elems > MAX_THRESH) > { > char *lo = base_ptr; > char *hi = &lo[size * (total_elems - 1)]; > stack_node stack[STACK_SIZE]; > - stack_node *top = stack + 1; > + stack_node *top = push (stack, NULL, NULL, depth); > > while (stack < top) > { > + if (depth == 0) > + { > + heapsort_r (lo, hi, size, swap_type, cmp, arg); > + top = pop (top, &lo, &hi, &depth); > + continue; > + } > + > char *left_ptr; > char *right_ptr; > > @@ -291,7 +364,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size, > { > if ((size_t) (hi - left_ptr) <= max_thresh) > /* Ignore both small partitions. */ > - top = pop (top, &lo, &hi); > + top = pop (top, &lo, &hi, &depth); > else > /* Ignore small left partition. */ > lo = left_ptr; > @@ -302,13 +375,13 @@ _quicksort (void *const pbase, size_t total_elems, size_t size, > else if ((right_ptr - lo) > (hi - left_ptr)) > { > /* Push larger left partition indices. */ > - top = push (top, lo, right_ptr); > + top = push (top, lo, right_ptr, depth - 1); > lo = left_ptr; > } > else > { > /* Push larger right partition indices. */ > - top = push (top, left_ptr, hi); > + top = push (top, left_ptr, hi, depth - 1); > hi = right_ptr; > } > } > -- > 2.34.1 >
On 15/09/23 11:42, Noah Goldstein wrote: > On Thu, Sep 14, 2023 at 1:37 PM Adhemerval Zanella > <adhemerval.zanella@linaro.org> wrote: >> >> This patch makes the quicksort implementation to acts as introsort, to >> avoid worse-case performance (and thus making it O(nlog n)). It switch >> to heapsort when the depth level reaches 2*log2(total elements). The >> heapsort is a textbook implementation. >> >> Checked on x86_64-linux-gnu. >> --- >> stdlib/qsort.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++---- >> 1 file changed, 80 insertions(+), 7 deletions(-) >> >> diff --git a/stdlib/qsort.c b/stdlib/qsort.c >> index 4082f5f9c1..e160932657 100644 >> --- a/stdlib/qsort.c >> +++ b/stdlib/qsort.c >> @@ -97,6 +97,7 @@ typedef struct >> { >> char *lo; >> char *hi; >> + size_t depth; >> } stack_node; >> >> /* The stack needs log (total_elements) entries (we could even subtract >> @@ -106,22 +107,83 @@ typedef struct >> enum { STACK_SIZE = CHAR_BIT * sizeof (size_t) }; >> >> static inline stack_node * >> -push (stack_node *top, char *lo, char *hi) >> +push (stack_node *top, char *lo, char *hi, size_t depth) >> { >> top->lo = lo; >> top->hi = hi; >> + top->depth = depth; >> return ++top; >> } >> >> static inline stack_node * >> -pop (stack_node *top, char **lo, char **hi) >> +pop (stack_node *top, char **lo, char **hi, size_t *depth) >> { >> --top; >> *lo = top->lo; >> *hi = top->hi; >> + *depth = top->depth; >> return top; >> } >> >> +/* A fast, small, non-recursive O(nlog n) heapsort, adapted from Linux >> + lib/sort.c. Used on introsort implementation as a fallback routine with >> + worst-case performance of O(nlog n) and worst-case space complexity of >> + O(1). */ >> + >> +static inline void >> +siftdown (void *base, size_t size, size_t k, size_t n, >> + enum swap_type_t swap_type, __compar_d_fn_t cmp, void *arg) >> +{ >> + while (k <= n / 2) > IMO a comment that `n` is an *inclusive* bound would be nice. Otherwise > looks very much so like there is an out-bounds-access. Especially since > we have `j < n` in the inner loop. Ack. >> + { >> + size_t j = 2 * k; >> + if (j < n && cmp (base + (j * size), base + ((j + 1) * size), arg) < 0) >> + j++; >> + >> + if (cmp (base + (k * size), base + (j * size), arg) >= 0) >> + break; >> + >> + do_swap (base + (size * j), base + (k * size), size, swap_type); >> + k = j; >> + } >> +} >> + >> +static inline void >> +heapify (void *base, size_t size, size_t n, enum swap_type_t swap_type, >> + __compar_d_fn_t cmp, void *arg) >> +{ >> + size_t k = n / 2; >> + while (1) >> + { >> + siftdown (base, size, k, n, swap_type, cmp, arg); >> + if (k-- == 0) >> + break; >> + } >> +} >> + >> +static void >> +heapsort_r (void *base, void *end, size_t size, enum swap_type_t swap_type, >> + __compar_d_fn_t cmp, void *arg) >> +{ >> + const size_t count = ((uintptr_t) end - (uintptr_t) base) / size; >> + >> + if (count < 2) >> + return; >> + >> + size_t n = count - 1; >> + >> + /* Build the binary heap, largest value at the base[0]. */ >> + heapify (base, size, n, swap_type, cmp, arg); >> + >> + /* On each iteration base[0:n] is the binary heap, while base[n:count] >> + is sorted. */ >> + while (n > 0) >> + { >> + do_swap (base, base + (n * size), size, swap_type); >> + n--; >> + siftdown (base, size, 0, n, swap_type, cmp, arg); >> + } >> +} > > Might be nice to move `heapsort_r` to header file so it can be tested > explicitly with the existing `qsort` test suite. In fact I tough about that, but I ended up improving tst-qsort3 to always trigger the heapsort by adding quicksort defeating pattern, and also added another sorting algorithm so we can check against. Not sure if we really need to add that much extra testing for qsort. >> >> static inline void >> insertion_sort_qsort_partitions (void *const pbase, size_t total_elems, >> @@ -207,7 +269,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size, >> >> const size_t max_thresh = MAX_THRESH * size; >> >> - if (total_elems == 0) >> + if (total_elems <= 1) >> /* Avoid lossage with unsigned arithmetic below. */ >> return; >> >> @@ -219,15 +281,26 @@ _quicksort (void *const pbase, size_t total_elems, size_t size, >> else >> swap_type = SWAP_BYTES; >> >> + /* Maximum depth before quicksort switches to heapsort. */ >> + size_t depth = 2 * (sizeof (size_t) * CHAR_BIT - 1 >> + - __builtin_clzl (total_elems)); >> + >> if (total_elems > MAX_THRESH) >> { >> char *lo = base_ptr; >> char *hi = &lo[size * (total_elems - 1)]; >> stack_node stack[STACK_SIZE]; >> - stack_node *top = stack + 1; >> + stack_node *top = push (stack, NULL, NULL, depth); >> >> while (stack < top) >> { >> + if (depth == 0) >> + { >> + heapsort_r (lo, hi, size, swap_type, cmp, arg); >> + top = pop (top, &lo, &hi, &depth); >> + continue; >> + } >> + >> char *left_ptr; >> char *right_ptr; >> >> @@ -291,7 +364,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size, >> { >> if ((size_t) (hi - left_ptr) <= max_thresh) >> /* Ignore both small partitions. */ >> - top = pop (top, &lo, &hi); >> + top = pop (top, &lo, &hi, &depth); >> else >> /* Ignore small left partition. */ >> lo = left_ptr; >> @@ -302,13 +375,13 @@ _quicksort (void *const pbase, size_t total_elems, size_t size, >> else if ((right_ptr - lo) > (hi - left_ptr)) >> { >> /* Push larger left partition indices. */ >> - top = push (top, lo, right_ptr); >> + top = push (top, lo, right_ptr, depth - 1); >> lo = left_ptr; >> } >> else >> { >> /* Push larger right partition indices. */ >> - top = push (top, left_ptr, hi); >> + top = push (top, left_ptr, hi, depth - 1); >> hi = right_ptr; >> } >> } >> -- >> 2.34.1 >>
On Wed, Sep 20, 2023 at 8:50 AM Adhemerval Zanella Netto <adhemerval.zanella@linaro.org> wrote: > > > > On 15/09/23 11:42, Noah Goldstein wrote: > > On Thu, Sep 14, 2023 at 1:37 PM Adhemerval Zanella > > <adhemerval.zanella@linaro.org> wrote: > >> > >> This patch makes the quicksort implementation to acts as introsort, to > >> avoid worse-case performance (and thus making it O(nlog n)). It switch > >> to heapsort when the depth level reaches 2*log2(total elements). The > >> heapsort is a textbook implementation. > >> > >> Checked on x86_64-linux-gnu. > >> --- > >> stdlib/qsort.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++---- > >> 1 file changed, 80 insertions(+), 7 deletions(-) > >> > >> diff --git a/stdlib/qsort.c b/stdlib/qsort.c > >> index 4082f5f9c1..e160932657 100644 > >> --- a/stdlib/qsort.c > >> +++ b/stdlib/qsort.c > >> @@ -97,6 +97,7 @@ typedef struct > >> { > >> char *lo; > >> char *hi; > >> + size_t depth; > >> } stack_node; > >> > >> /* The stack needs log (total_elements) entries (we could even subtract > >> @@ -106,22 +107,83 @@ typedef struct > >> enum { STACK_SIZE = CHAR_BIT * sizeof (size_t) }; > >> > >> static inline stack_node * > >> -push (stack_node *top, char *lo, char *hi) > >> +push (stack_node *top, char *lo, char *hi, size_t depth) > >> { > >> top->lo = lo; > >> top->hi = hi; > >> + top->depth = depth; > >> return ++top; > >> } > >> > >> static inline stack_node * > >> -pop (stack_node *top, char **lo, char **hi) > >> +pop (stack_node *top, char **lo, char **hi, size_t *depth) > >> { > >> --top; > >> *lo = top->lo; > >> *hi = top->hi; > >> + *depth = top->depth; > >> return top; > >> } > >> > >> +/* A fast, small, non-recursive O(nlog n) heapsort, adapted from Linux > >> + lib/sort.c. Used on introsort implementation as a fallback routine with > >> + worst-case performance of O(nlog n) and worst-case space complexity of > >> + O(1). */ > >> + > >> +static inline void > >> +siftdown (void *base, size_t size, size_t k, size_t n, > >> + enum swap_type_t swap_type, __compar_d_fn_t cmp, void *arg) > >> +{ > >> + while (k <= n / 2) > > IMO a comment that `n` is an *inclusive* bound would be nice. Otherwise > > looks very much so like there is an out-bounds-access. Especially since > > we have `j < n` in the inner loop. > > Ack. > > >> + { > >> + size_t j = 2 * k; > >> + if (j < n && cmp (base + (j * size), base + ((j + 1) * size), arg) < 0) > >> + j++; > >> + > >> + if (cmp (base + (k * size), base + (j * size), arg) >= 0) > >> + break; > >> + > >> + do_swap (base + (size * j), base + (k * size), size, swap_type); > >> + k = j; > >> + } > >> +} > >> + > >> +static inline void > >> +heapify (void *base, size_t size, size_t n, enum swap_type_t swap_type, > >> + __compar_d_fn_t cmp, void *arg) > >> +{ > >> + size_t k = n / 2; > >> + while (1) > >> + { > >> + siftdown (base, size, k, n, swap_type, cmp, arg); > >> + if (k-- == 0) > >> + break; > >> + } > >> +} > >> + > >> +static void > >> +heapsort_r (void *base, void *end, size_t size, enum swap_type_t swap_type, > >> + __compar_d_fn_t cmp, void *arg) > >> +{ > >> + const size_t count = ((uintptr_t) end - (uintptr_t) base) / size; > >> + > >> + if (count < 2) > >> + return; > >> + > >> + size_t n = count - 1; > >> + > >> + /* Build the binary heap, largest value at the base[0]. */ > >> + heapify (base, size, n, swap_type, cmp, arg); > >> + > >> + /* On each iteration base[0:n] is the binary heap, while base[n:count] > >> + is sorted. */ > >> + while (n > 0) > >> + { > >> + do_swap (base, base + (n * size), size, swap_type); > >> + n--; > >> + siftdown (base, size, 0, n, swap_type, cmp, arg); > >> + } > >> +} > > > > Might be nice to move `heapsort_r` to header file so it can be tested > > explicitly with the existing `qsort` test suite. > > In fact I tough about that, but I ended up improving tst-qsort3 to always > trigger the heapsort by adding quicksort defeating pattern, and also added > another sorting algorithm so we can check against. Not sure if we really > need to add that much extra testing for qsort. Fair enough. > > >> > >> static inline void > >> insertion_sort_qsort_partitions (void *const pbase, size_t total_elems, > >> @@ -207,7 +269,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size, > >> > >> const size_t max_thresh = MAX_THRESH * size; > >> > >> - if (total_elems == 0) > >> + if (total_elems <= 1) > >> /* Avoid lossage with unsigned arithmetic below. */ > >> return; > >> > >> @@ -219,15 +281,26 @@ _quicksort (void *const pbase, size_t total_elems, size_t size, > >> else > >> swap_type = SWAP_BYTES; > >> > >> + /* Maximum depth before quicksort switches to heapsort. */ > >> + size_t depth = 2 * (sizeof (size_t) * CHAR_BIT - 1 > >> + - __builtin_clzl (total_elems)); > >> + > >> if (total_elems > MAX_THRESH) > >> { > >> char *lo = base_ptr; > >> char *hi = &lo[size * (total_elems - 1)]; > >> stack_node stack[STACK_SIZE]; > >> - stack_node *top = stack + 1; > >> + stack_node *top = push (stack, NULL, NULL, depth); > >> > >> while (stack < top) > >> { > >> + if (depth == 0) > >> + { > >> + heapsort_r (lo, hi, size, swap_type, cmp, arg); > >> + top = pop (top, &lo, &hi, &depth); > >> + continue; > >> + } > >> + > >> char *left_ptr; > >> char *right_ptr; > >> > >> @@ -291,7 +364,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size, > >> { > >> if ((size_t) (hi - left_ptr) <= max_thresh) > >> /* Ignore both small partitions. */ > >> - top = pop (top, &lo, &hi); > >> + top = pop (top, &lo, &hi, &depth); > >> else > >> /* Ignore small left partition. */ > >> lo = left_ptr; > >> @@ -302,13 +375,13 @@ _quicksort (void *const pbase, size_t total_elems, size_t size, > >> else if ((right_ptr - lo) > (hi - left_ptr)) > >> { > >> /* Push larger left partition indices. */ > >> - top = push (top, lo, right_ptr); > >> + top = push (top, lo, right_ptr, depth - 1); > >> lo = left_ptr; > >> } > >> else > >> { > >> /* Push larger right partition indices. */ > >> - top = push (top, left_ptr, hi); > >> + top = push (top, left_ptr, hi, depth - 1); > >> hi = right_ptr; > >> } > >> } > >> -- > >> 2.34.1 > >>
diff --git a/stdlib/qsort.c b/stdlib/qsort.c index 4082f5f9c1..e160932657 100644 --- a/stdlib/qsort.c +++ b/stdlib/qsort.c @@ -97,6 +97,7 @@ typedef struct { char *lo; char *hi; + size_t depth; } stack_node; /* The stack needs log (total_elements) entries (we could even subtract @@ -106,22 +107,83 @@ typedef struct enum { STACK_SIZE = CHAR_BIT * sizeof (size_t) }; static inline stack_node * -push (stack_node *top, char *lo, char *hi) +push (stack_node *top, char *lo, char *hi, size_t depth) { top->lo = lo; top->hi = hi; + top->depth = depth; return ++top; } static inline stack_node * -pop (stack_node *top, char **lo, char **hi) +pop (stack_node *top, char **lo, char **hi, size_t *depth) { --top; *lo = top->lo; *hi = top->hi; + *depth = top->depth; return top; } +/* A fast, small, non-recursive O(nlog n) heapsort, adapted from Linux + lib/sort.c. Used on introsort implementation as a fallback routine with + worst-case performance of O(nlog n) and worst-case space complexity of + O(1). */ + +static inline void +siftdown (void *base, size_t size, size_t k, size_t n, + enum swap_type_t swap_type, __compar_d_fn_t cmp, void *arg) +{ + while (k <= n / 2) + { + size_t j = 2 * k; + if (j < n && cmp (base + (j * size), base + ((j + 1) * size), arg) < 0) + j++; + + if (cmp (base + (k * size), base + (j * size), arg) >= 0) + break; + + do_swap (base + (size * j), base + (k * size), size, swap_type); + k = j; + } +} + +static inline void +heapify (void *base, size_t size, size_t n, enum swap_type_t swap_type, + __compar_d_fn_t cmp, void *arg) +{ + size_t k = n / 2; + while (1) + { + siftdown (base, size, k, n, swap_type, cmp, arg); + if (k-- == 0) + break; + } +} + +static void +heapsort_r (void *base, void *end, size_t size, enum swap_type_t swap_type, + __compar_d_fn_t cmp, void *arg) +{ + const size_t count = ((uintptr_t) end - (uintptr_t) base) / size; + + if (count < 2) + return; + + size_t n = count - 1; + + /* Build the binary heap, largest value at the base[0]. */ + heapify (base, size, n, swap_type, cmp, arg); + + /* On each iteration base[0:n] is the binary heap, while base[n:count] + is sorted. */ + while (n > 0) + { + do_swap (base, base + (n * size), size, swap_type); + n--; + siftdown (base, size, 0, n, swap_type, cmp, arg); + } +} static inline void insertion_sort_qsort_partitions (void *const pbase, size_t total_elems, @@ -207,7 +269,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size, const size_t max_thresh = MAX_THRESH * size; - if (total_elems == 0) + if (total_elems <= 1) /* Avoid lossage with unsigned arithmetic below. */ return; @@ -219,15 +281,26 @@ _quicksort (void *const pbase, size_t total_elems, size_t size, else swap_type = SWAP_BYTES; + /* Maximum depth before quicksort switches to heapsort. */ + size_t depth = 2 * (sizeof (size_t) * CHAR_BIT - 1 + - __builtin_clzl (total_elems)); + if (total_elems > MAX_THRESH) { char *lo = base_ptr; char *hi = &lo[size * (total_elems - 1)]; stack_node stack[STACK_SIZE]; - stack_node *top = stack + 1; + stack_node *top = push (stack, NULL, NULL, depth); while (stack < top) { + if (depth == 0) + { + heapsort_r (lo, hi, size, swap_type, cmp, arg); + top = pop (top, &lo, &hi, &depth); + continue; + } + char *left_ptr; char *right_ptr; @@ -291,7 +364,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size, { if ((size_t) (hi - left_ptr) <= max_thresh) /* Ignore both small partitions. */ - top = pop (top, &lo, &hi); + top = pop (top, &lo, &hi, &depth); else /* Ignore small left partition. */ lo = left_ptr; @@ -302,13 +375,13 @@ _quicksort (void *const pbase, size_t total_elems, size_t size, else if ((right_ptr - lo) > (hi - left_ptr)) { /* Push larger left partition indices. */ - top = push (top, lo, right_ptr); + top = push (top, lo, right_ptr, depth - 1); lo = left_ptr; } else { /* Push larger right partition indices. */ - top = push (top, left_ptr, hi); + top = push (top, left_ptr, hi, depth - 1); hi = right_ptr; } }