diff mbox

sysdeps/arm/armv6t2/strlen.S: strlen implementation for armv6t2.

Message ID 52089516.3080304@linaro.org
State Accepted
Headers show

Commit Message

Will Newton Aug. 12, 2013, 7:56 a.m. UTC
This implementation of strlen is faster than the armv6 version for
all string lengths greater than 1 on a Cortex-A15.

ports/ChangeLog.arm:

2013-08-09  Will Newton  <will.newton@linaro.org>

	* sysdeps/arm/armv6t2/strlen.S: New file.
---
 ports/sysdeps/arm/armv6t2/strlen.S | 141 +++++++++++++++++++++++++++++++++++++
 1 file changed, 141 insertions(+)
 create mode 100644 ports/sysdeps/arm/armv6t2/strlen.S

Comments

Will Newton Aug. 27, 2013, 7:47 a.m. UTC | #1
On 12 August 2013 08:56, Will Newton <will.newton@linaro.org> wrote:
>
> This implementation of strlen is faster than the armv6 version for
> all string lengths greater than 1 on a Cortex-A15.
>
> ports/ChangeLog.arm:
>
> 2013-08-09  Will Newton  <will.newton@linaro.org>
>
>         * sysdeps/arm/armv6t2/strlen.S: New file.
> ---
>  ports/sysdeps/arm/armv6t2/strlen.S | 141 +++++++++++++++++++++++++++++++++++++
>  1 file changed, 141 insertions(+)
>  create mode 100644 ports/sysdeps/arm/armv6t2/strlen.S

Ping?
Joseph Myers Aug. 30, 2013, 12:13 a.m. UTC | #2
On Mon, 12 Aug 2013, Will Newton wrote:

> This implementation of strlen is faster than the armv6 version for
> all string lengths greater than 1 on a Cortex-A15.
> 
> ports/ChangeLog.arm:
> 
> 2013-08-09  Will Newton  <will.newton@linaro.org>
> 
> 	* sysdeps/arm/armv6t2/strlen.S: New file.

OK, presuming you've run the full glibc testsuite with this version used.
Will Newton Aug. 30, 2013, 9:06 a.m. UTC | #3
On 30 August 2013 01:13, Joseph S. Myers <joseph@codesourcery.com> wrote:
> On Mon, 12 Aug 2013, Will Newton wrote:
>
>> This implementation of strlen is faster than the armv6 version for
>> all string lengths greater than 1 on a Cortex-A15.
>>
>> ports/ChangeLog.arm:
>>
>> 2013-08-09  Will Newton  <will.newton@linaro.org>
>>
>>       * sysdeps/arm/armv6t2/strlen.S: New file.
>
> OK, presuming you've run the full glibc testsuite with this version used.

Yes, testsuite is clean. Committed.
Carlos O'Donell Aug. 30, 2013, 5:36 p.m. UTC | #4
On 08/30/2013 05:06 AM, Will Newton wrote:
> On 30 August 2013 01:13, Joseph S. Myers <joseph@codesourcery.com> wrote:
>> On Mon, 12 Aug 2013, Will Newton wrote:
>>
>>> This implementation of strlen is faster than the armv6 version for
>>> all string lengths greater than 1 on a Cortex-A15.
>>>
>>> ports/ChangeLog.arm:
>>>
>>> 2013-08-09  Will Newton  <will.newton@linaro.org>
>>>
>>>       * sysdeps/arm/armv6t2/strlen.S: New file.
>>
>> OK, presuming you've run the full glibc testsuite with this version used.
> 
> Yes, testsuite is clean. Committed.

I'm not happy seeing these kinds of patches go in without some
kind of numbers around "faster" and a reproducible way to get
those numbers.

Cheers,
Carlos.
Carlos O'Donell Aug. 30, 2013, 5:38 p.m. UTC | #5
On 08/30/2013 01:36 PM, Carlos O'Donell wrote:
> On 08/30/2013 05:06 AM, Will Newton wrote:
>> On 30 August 2013 01:13, Joseph S. Myers <joseph@codesourcery.com> wrote:
>>> On Mon, 12 Aug 2013, Will Newton wrote:
>>>
>>>> This implementation of strlen is faster than the armv6 version for
>>>> all string lengths greater than 1 on a Cortex-A15.
>>>>
>>>> ports/ChangeLog.arm:
>>>>
>>>> 2013-08-09  Will Newton  <will.newton@linaro.org>
>>>>
>>>>       * sysdeps/arm/armv6t2/strlen.S: New file.
>>>
>>> OK, presuming you've run the full glibc testsuite with this version used.
>>
>> Yes, testsuite is clean. Committed.
> 
> I'm not happy seeing these kinds of patches go in without some
> kind of numbers around "faster" and a reproducible way to get
> those numbers.

I don't say this because I'm just cranky, I want all of us to be
more accountable when it comes to the engineering rigour required
for performance patches. That way I can look back at these numbers
when users report issues with the speed of strlen on A15.

Cheers,
Carlos.
diff mbox

Patch

diff --git a/ports/sysdeps/arm/armv6t2/strlen.S b/ports/sysdeps/arm/armv6t2/strlen.S
new file mode 100644
index 0000000..a52e2e7
--- /dev/null
+++ b/ports/sysdeps/arm/armv6t2/strlen.S
@@ -0,0 +1,141 @@ 
+/* Copyright (C) 2010-2011,2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/*
+   Assumes:
+   ARMv6T2, AArch32
+
+ */
+
+#include <sysdep.h>
+
+#ifdef __ARMEB__	/* Big-endian: higher addresses are in lower-order bits.  */
+#define S2LO		lsl	/* Shift bytes toward lower addresses; unused in the code shown.  */
+#define S2HI		lsr	/* Shift bytes toward higher addresses.  */
+#else			/* Little-endian: higher addresses are in higher-order bits.  */
+#define S2LO		lsr	/* Shift bytes toward lower addresses; unused in the code shown.  */
+#define S2HI		lsl	/* Shift bytes toward higher addresses.  */
+#endif
+
+	/* This code requires Thumb.  */
+	.thumb
+	.syntax unified
+
+/* Parameters and result.  */
+#define srcin		r0	/* In: pointer to the string.  */
+#define result		r0	/* Out: length; shares r0 with srcin.  */
+
+/* Internal variables.  */
+#define src		r1	/* srcin rounded down to 8 bytes, then advanced by the loop.  */
+#define data1a		r2	/* First word of each 8-byte block, then its NUL mask.  */
+#define data1b		r3	/* Second word of each 8-byte block, then the combined mask.  */
+#define const_m1	r12	/* Set to 0xffffffff by strlen.  */
+#define const_0		r4	/* Set to 0 by strlen; r4 is saved/restored there.  */
+#define tmp1		r4		/* Overlaps const_0  */
+#define tmp2		r5	/* Scratch; r5 is saved/restored by strlen.  */
+
+	.text
+	.p2align 6
+ENTRY(strlen)		/* r0 = string pointer in, length out; scans 8 bytes per step.  */
+	pld	[srcin, #0]
+	strd	r4, r5, [sp, #-8]!		/* Save r4/r5 (const_0/tmp1, tmp2).  */
+	cfi_adjust_cfa_offset (8)
+	cfi_rel_offset (r4, 0)
+	cfi_rel_offset (r5, 4)
+	cfi_remember_state			/* Re-applied at .Lmisaligned8.  */
+	bic	src, srcin, #7			/* src = srcin rounded down to 8 bytes.  */
+	mvn	const_m1, #0			/* const_m1 = 0xffffffff.  */
+	ands	tmp1, srcin, #7		/* tmp1 = srcin & 7; Z set if 8-byte aligned.  */
+	pld	[src, #32]
+	bne.w	.Lmisaligned8
+	mov	const_0, #0
+	mov	result, #-8			/* The loop adds 8 before each block check.  */
+.Lloop_aligned:
+	/* Bytes 0-7.  */
+	ldrd	data1a, data1b, [src]
+	pld	[src, #64]
+	add	result, result, #8
+.Lstart_realigned:
+	uadd8	data1a, data1a, const_m1	/* GE<i> set iff byte i is non-zero (carry out).  */
+	sel	data1a, const_0, const_m1	/* 0xff in each NUL byte lane, 0 elsewhere.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Non-zero iff either word has a NUL; used as mask only if d1a == 0.  */
+	cbnz	data1b, .Lnull_found
+
+	/* Bytes 8-15.  */
+	ldrd	data1a, data1b, [src, #8]
+	uadd8	data1a, data1a, const_m1	/* GE<i> set iff byte i is non-zero (carry out).  */
+	add	result, result, #8
+	sel	data1a, const_0, const_m1	/* 0xff in each NUL byte lane, 0 elsewhere.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Non-zero iff either word has a NUL; used as mask only if d1a == 0.  */
+	cbnz	data1b, .Lnull_found
+
+	/* Bytes 16-23.  */
+	ldrd	data1a, data1b, [src, #16]
+	uadd8	data1a, data1a, const_m1	/* GE<i> set iff byte i is non-zero (carry out).  */
+	add	result, result, #8
+	sel	data1a, const_0, const_m1	/* 0xff in each NUL byte lane, 0 elsewhere.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Non-zero iff either word has a NUL; used as mask only if d1a == 0.  */
+	cbnz	data1b, .Lnull_found
+
+	/* Bytes 24-31.  */
+	ldrd	data1a, data1b, [src, #24]
+	add	src, src, #32			/* Advance to the next 32-byte group.  */
+	uadd8	data1a, data1a, const_m1	/* GE<i> set iff byte i is non-zero (carry out).  */
+	add	result, result, #8
+	sel	data1a, const_0, const_m1	/* 0xff in each NUL byte lane, 0 elsewhere.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Non-zero iff either word has a NUL; used as mask only if d1a == 0.  */
+	cmp	data1b, #0
+	beq	.Lloop_aligned			/* No NUL in these 32 bytes: keep scanning.  */
+
+.Lnull_found:
+	cmp	data1a, #0			/* Is the first word free of NULs?  */
+	itt	eq
+	addeq	result, result, #4		/* If so, count its 4 bytes...  */
+	moveq	data1a, data1b			/* ...and use the second word's mask.  */
+#ifndef __ARMEB__
+	rev	data1a, data1a			/* Move the lowest-address byte to the top bits for clz.  */
+#endif
+	clz	data1a, data1a			/* 8 * index of the first NUL byte in the word.  */
+	ldrd	r4, r5, [sp], #8		/* Restore r4/r5 and release the 8-byte frame.  */
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (r4)
+	cfi_restore (r5)
+	add	result, result, data1a, lsr #3	/* Bits -> Bytes.  */
+	DO_RET(lr)
+
+.Lmisaligned8:
+	cfi_restore_state			/* Back to the state where r4/r5 are saved on the stack.  */
+	ldrd	data1a, data1b, [src]		/* Load the aligned block containing the string start.  */
+	and	tmp2, tmp1, #3			/* Byte offset within a 4-byte word.  */
+	rsb	result, tmp1, #0		/* result = -(srcin & 7): discount the pre-string bytes.  */
+	lsl	tmp2, tmp2, #3			/* Bytes -> bits.  */
+	tst	tmp1, #4			/* Z clear if the string starts in the second word.  */
+	pld	[src, #64]
+	S2HI	tmp2, const_m1, tmp2		/* All ones except the (tmp1 & 3) lowest-address bytes.  */
+	orn	data1a, data1a, tmp2		/* Force those leading bytes to 0xff so they are not seen as NUL.  */
+	itt	ne
+	ornne	data1b, data1b, tmp2		/* Start is in the second word: mask its leading bytes instead...  */
+	movne	data1a, const_m1		/* ...and mark the whole first word as non-NUL.  */
+	mov	const_0, #0			/* tmp1 is no longer needed; r4 now serves as const_0.  */
+	b	.Lstart_realigned		/* Join the loop after its load and result += 8.  */
+
+END(strlen)
+libc_hidden_builtin_def (strlen)