diff mbox series

[v2,1/6] wcsmbs: Add wcscpy loop unroll option

Message ID 20190313140317.8894-1-adhemerval.zanella@linaro.org
State New
Headers show
Series [v2,1/6] wcsmbs: Add wcscpy loop unroll option | expand

Commit Message

Adhemerval Zanella Netto March 13, 2019, 2:03 p.m. UTC
This allows an architecture the use the old generic implementation
and also set explicit loop unrolling.

Checked on aarch64-linux-gnu.

	* include/loop_unroll.h: New file.
	* wcsmbs/wcscpy (__wcscpy): Add option to use loop unrolling
	besides generic implementation.
---
 include/loop_unroll.h | 78 +++++++++++++++++++++++++++++++++++++++++++
 wcsmbs/wcscpy.c       | 19 +++++++++++
 2 files changed, 97 insertions(+)
 create mode 100644 include/loop_unroll.h

-- 
2.17.1

Comments

Gabriel F. T. Gomes March 23, 2019, 3 p.m. UTC | #1
On Wed, Mar 13 2019, Adhemerval Zanella wrote:
> This allows an architecture the use the old generic implementation

> and also set explicit loop unrolling.


s/the use/to use/

> 	* include/loop_unroll.h: New file.

> 	* wcsmbs/wcscpy (__wcscpy): Add option to use loop unrolling

> 	besides generic implementation.


OK.

> +/* Loop unroll macro to be used for explicity force loop unrolling with a

> +   configurable number or iterations.  The idea is to make the loop unrolling

> +   independent of whether compiler is able to unrolling through specific

> +   optimizations options (-funroll-loops or -funroll-all-loops).

> +

> +   For instance, to implement strcpy with SRC being the source input and 

> +   DEST the destination buffer, it is expected the macro to be used in this

> +   way:

> +

> +     #define ITERATION(index)	\

> +       ({ char c = *str++; *dest++ = c; c != '\0' })

> +

> +     while (1)

> +       UNROLL_REPEAT (4, ITERATION)

> +

> +   The loop will be manually unrolled 4 times.  Another option is to do

> +   the index update after the tests:

> +

> +     #define ITERATION(index)	\

> +       ({ char c = *(str + index); *(dest + index) = c; c != '\0' })

> +     #define UPDATE(n)		\

> +       str += n; dst += n

> +

> +     while (1)

> +       UNROLL_REPEAT_UPDATE (4, ITERATION, UPDATE)

> +

> +   The loop will be manually unrolled 4 times and the SRC and DEST pointers

> +   will be update only after last iteration.

> +

> +   Currently both macros unrolls the loop 8 times at maximum.  */


Thanks for writing this detailed explanation.

> +#define UNROLL_REPEAT_1(X)    if (!X(0)) break;

> +#define UNROLL_REPEAT_2(X)    UNROLL_REPEAT_1(X) if (!X(1)) break;

> +#define UNROLL_REPEAT_3(X)    UNROLL_REPEAT_2(X) if (!X(2)) break;

> +#define UNROLL_REPEAT_4(X)    UNROLL_REPEAT_3(X) if (!X(3)) break;

> +#define UNROLL_REPEAT_5(X)    UNROLL_REPEAT_4(X) if (!X(4)) break;

> +#define UNROLL_REPEAT_6(X)    UNROLL_REPEAT_5(X) if (!X(5)) break;

> +#define UNROLL_REPEAT_7(X)    UNROLL_REPEAT_6(X) if (!X(6)) break;

> +#define UNROLL_REPEAT_8(X)    UNROLL_REPEAT_7(X) if (!X(7)) break;

                                                ^         ^
Missing space between macro call and parentheses (twice in each line).

> +#define UNROLL_REPEAT__(N, X) UNROLL_EXPAND(UNROLL_REPEAT_ ## N) (X)

                                              ^
I think that this is OK according to the style guidelines, as it just
expands the name to UNROLL_REPEAT_1, UNROLL_REPEAT_2, etc.  But maybe I
just didn't quite understand the guidelines.

> +#define UNROLL_REPEAT(N, X)                \

> +  (void) ({                                \

> +    UNROLL_REPEAT_(UNROLL_EXPAND(N), X);   \

                     ^
Missing space.

> +  /* Some architectures might have costly tail function call (powerpc

> +     for instance) where wmemcpy call overhead for smalls sizes might

> +     be costly than just unroll the main loop.  */


s/be costly than just unroll/be more costly than just unrolling/

... I think.

Looks good to me with these changes.

Reviewed-by: Gabriel F. T. Gomes <gabriel@inconstante.eti.br>
diff mbox series

Patch

diff --git a/include/loop_unroll.h b/include/loop_unroll.h
new file mode 100644
index 0000000000..f40d375257
--- /dev/null
+++ b/include/loop_unroll.h
@@ -0,0 +1,78 @@ 
+/* Macro for explicit loop unrolling.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _LOOP_UNROLL_H
+#define _LOOP_UNROLL_H
+
+/* Loop unroll macro to be used for explicity force loop unrolling with a
+   configurable number or iterations.  The idea is to make the loop unrolling
+   independent of whether compiler is able to unrolling through specific
+   optimizations options (-funroll-loops or -funroll-all-loops).
+
+   For instance, to implement strcpy with SRC being the source input and 
+   DEST the destination buffer, it is expected the macro to be used in this
+   way:
+
+     #define ITERATION(index)	\
+       ({ char c = *str++; *dest++ = c; c != '\0' })
+
+     while (1)
+       UNROLL_REPEAT (4, ITERATION)
+
+   The loop will be manually unrolled 4 times.  Another option is to do
+   the index update after the tests:
+
+     #define ITERATION(index)	\
+       ({ char c = *(str + index); *(dest + index) = c; c != '\0' })
+     #define UPDATE(n)		\
+       str += n; dst += n
+
+     while (1)
+       UNROLL_REPEAT_UPDATE (4, ITERATION, UPDATE)
+
+   The loop will be manually unrolled 4 times and the SRC and DEST pointers
+   will be update only after last iteration.
+
+   Currently both macros unrolls the loop 8 times at maximum.  */
+
+#define UNROLL_REPEAT_1(X)    if (!X(0)) break;
+#define UNROLL_REPEAT_2(X)    UNROLL_REPEAT_1(X) if (!X(1)) break;
+#define UNROLL_REPEAT_3(X)    UNROLL_REPEAT_2(X) if (!X(2)) break;
+#define UNROLL_REPEAT_4(X)    UNROLL_REPEAT_3(X) if (!X(3)) break;
+#define UNROLL_REPEAT_5(X)    UNROLL_REPEAT_4(X) if (!X(4)) break;
+#define UNROLL_REPEAT_6(X)    UNROLL_REPEAT_5(X) if (!X(5)) break;
+#define UNROLL_REPEAT_7(X)    UNROLL_REPEAT_6(X) if (!X(6)) break;
+#define UNROLL_REPEAT_8(X)    UNROLL_REPEAT_7(X) if (!X(7)) break;
+
+#define UNROLL_EXPAND(...)    __VA_ARGS__
+
+#define UNROLL_REPEAT__(N, X) UNROLL_EXPAND(UNROLL_REPEAT_ ## N) (X)
+#define UNROLL_REPEAT_(N, X)  UNROLL_REPEAT__ (N, X)
+
+#define UNROLL_REPEAT(N, X)                \
+  (void) ({                                \
+    UNROLL_REPEAT_(UNROLL_EXPAND(N), X);   \
+  })
+
+#define UNROLL_REPEAT_UPDATE(N, X, U)      \
+  (void) ({                                \
+    UNROLL_REPEAT_ (UNROLL_EXPAND(N), X);  \
+    UPDATE (N);                            \
+  })
+
+#endif
diff --git a/wcsmbs/wcscpy.c b/wcsmbs/wcscpy.c
index 6fb2969513..8e8719744e 100644
--- a/wcsmbs/wcscpy.c
+++ b/wcsmbs/wcscpy.c
@@ -17,6 +17,7 @@ 
    <http://www.gnu.org/licenses/>.  */
 
 #include <wchar.h>
+#include <loop_unroll.h>
 
 
 #ifdef WCSCPY
@@ -27,7 +28,25 @@ 
 wchar_t *
 __wcscpy (wchar_t *dest, const wchar_t *src)
 {
+#ifndef UNROLL_NTIMES
   return __wmemcpy (dest, src, __wcslen (src) + 1);
+#else
+  /* Some architectures might have costly tail function call (powerpc
+     for instance) where wmemcpy call overhead for smalls sizes might
+     be costly than just unroll the main loop.  */
+  wchar_t *wcp = dest;
+
+#define ITERATION(index)		\
+  ({					\
+     wchar_t c = *src++;		\
+     *wcp++ = c;			\
+     c != L'\0';			\
+  })
+
+  while (1)
+    UNROLL_REPEAT(UNROLL_NTIMES, ITERATION);
+  return dest;
+#endif
 }
 #ifndef WCSCPY
 weak_alias (__wcscpy, wcscpy)