[ARM] Optimised strchr and strlen

Message ID	20111219172122.GA10120@davesworkthinkpad
State	New
Headers	show Return-Path: <patch+caf_=linaro-patchwork=canonical.com@linaro.org> Received-SPF: neutral (google.com: 74.125.83.50 is neither permitted nor denied by best guess record for domain of david.gilbert@linaro.org) client-ip=74.125.83.50; Date: Mon, 19 Dec 2011 17:21:23 +0000 From: "Dr. David Alan Gilbert" <david.gilbert@linaro.org> To: libc-ports@sourceware.org Cc: joseph@codesourcery.com, patches@linaro.org Subject: [ARM] Optimised strchr and strlen Message-ID: <20111219172122.GA10120@davesworkthinkpad> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline User-Agent: Mutt/1.5.20 (2009-06-14)

diff -urN ports/sysdeps/arm/eabi/armv6t2/strchr.S src/ports/sysdeps/arm/eabi/armv6t2/strchr.S
--- ports/sysdeps/arm/eabi/armv6t2/strchr.S	1970-01-01 01:00:00.000000000 +0100
+++ ports/sysdeps/arm/eabi/armv6t2/strchr.S	2011-12-16 13:43:56.704694919 +0000
@@ -0,0 +1,73 @@
+/* Copyright (C) 2011 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Code contributed by Dave Gilbert <david.gilbert@linaro.org>
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA. */
+
+#include <sysdep.h>
+
+@ A very simple strchr routine, from benchmarks on A9 it's a bit faster than
+@ the current version in eglibc.
+@ While I have a version that does 8 bytes/loop and is a lot faster on long
+@ strings, it is slower on short strings, and short strings seem more common
+@ in strchr usage.
+@ Note: The use of cbz/cbnz means it's Thumb only
+
+@ 2011-02-07 david.gilbert@linaro.org
+@ Extracted from local git a5b438d861
+@ 2011-12-16 david.gilbert@linaro.org
+@ Copy from Cortex strings rev 65 and change license
+
+	.syntax unified
+
+	.text
+	.thumb
+
+@ ---------------------------------------------------------------------------
+
+	.thumb_func
+	.global strchr
+	.type strchr,%function
+ENTRY(strchr)
+	@ char *strchr (const char *s, int c)
+	@ In:   r0 = start of string
+	@       r1 = character to match (only the low 8 bits are used)
+	@ Out:  r0 = pointer to the first match, or NULL for no match
+	@ Uses: r2 (current byte) and the condition flags
+	and	r1,r1, #255
+
+1:
+	ldrb	r2,[r0],#1	@ r2 = *r0++
+	cmp	r2,r1
+	cbz	r2,10f		@ end of string; cbz does not alter the flags, so bne still sees the cmp
+	bne	1b
+
+	@ We're here if it matched
+5:
+	subs	r0,r0,#1	@ undo the post-increment so r0 points at the matching byte
+	DO_RET(lr)
+
+10:
+	@ We're here if we ran off the end
+	cmp	r1, #0		@ Corner case - you can search for the NUL and get a pointer to it
+	beq	5b		@ messy, if common we should branch at the start to a special loop
+	mov	r0,#0
+	DO_RET(lr)
+
+END(strchr)
+
+weak_alias (strchr, index)
+libc_hidden_builtin_def(strchr)
diff -urN ports/sysdeps/arm/eabi/armv6t2/strlen.S src/ports/sysdeps/arm/eabi/armv6t2/strlen.S
--- ports/sysdeps/arm/eabi/armv6t2/strlen.S	1970-01-01 01:00:00.000000000 +0100
+++ ports/sysdeps/arm/eabi/armv6t2/strlen.S	2011-12-16 13:43:01.991130183 +0000
@@ -0,0 +1,123 @@
+/* Copyright (C) 2011 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Code contributed by Dave Gilbert <david.gilbert@linaro.org>
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA. */
+
+#include <sysdep.h>
+
+@ This strlen routine is optimised on a Cortex-A9 and should work on
+@ all ARMv7 processors. This routine is reasonably fast for short
+@ strings, but is probably slower than a simple implementation if all
+@ your strings are very short
+@ Note: The use of cbz/cbnz means it's Thumb only
+
+@ 2011-02-08 david.gilbert@linaro.org
+@ Extracted from local git 6848613a
+@ 2011-12-16 david.gilbert@linaro.org
+@ Copy from Cortex strings rev 65 and change license
+@ Add cfi magic, switch to ldrd
+
+
+@ this lets us check a flag in a 00/ff byte easily in either endianness
+#ifdef __ARMEB__
+#define CHARTSTMASK(c) (1 << (31 - ((c) * 8)))
+#else
+#define CHARTSTMASK(c) (1 << ((c) * 8))
+#endif
+
+@-----------------------------------------------------------------------------
+	.syntax unified
+
+	.text
+	.thumb
+
+	.thumb_func
+	.global strlen
+	.type strlen,%function
+ENTRY(strlen)
+	@ size_t strlen (const char *s)
+	@ In:   r0 = string
+	@ Out:  r0 = count of bytes in string not including terminator
+	@ Uses: r1 = read pointer, r2/r3 = loaded data and zero-byte masks,
+	@       r4 = 0 and r6 = 0xffffffff (both saved and restored here)
+	mov	r1, r0
+	push	{ r4,r6 }
+	cfi_adjust_cfa_offset (8)
+	cfi_rel_offset (r4, 0)
+	cfi_rel_offset (r6, 4)
+
+	mvns	r6, #0		@ all F
+	movs	r4, #0
+	tst	r0, #7
+	beq	2f
+
+1:
+	@ Byte-at-a-time until r1 is 8-byte aligned
+	ldrb	r2, [r1], #1
+	tst	r1, #7		@ Hit alignment yet?
+	cbz	r2, 10f		@ Exit if we found the 0
+	bne	1b
+
+	@ So we're now aligned
+2:
+	ldrd	r2,r3,[r1],#8	@ load 8 bytes; r1 is 8-byte aligned here
+	uadd8	r2, r2, r6	@ Parallel add 0xff - sets the GE bits for anything that wasn't 0
+	sel	r2, r4, r6	@ bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
+	uadd8	r3, r3, r6	@ Parallel add 0xff - sets the GE bits for anything that wasn't 0
+	sel	r3, r2, r6	@ ff for 00 bytes, else the matching byte of r2's mask - so r3 != 0 if either word has a 00
+	cmp	r3, #0
+	beq	2b
+
+3:
+	@ One (or more) of the bytes we loaded was 0 - but which one?
+	@ r2 has the mask corresponding to the first loaded word
+	@ r3 has a combined mask of the two words - but if r2 was all-non 0
+	@ then it's just the 2nd word's
+	cmp	r2, #0
+	itte	eq
+	moveq	r2, r3		@ the end is in the 2nd word
+	subeq	r1,r1,#3
+	subne	r1,r1,#7
+
+	@ r1 currently points to the 2nd byte of the word containing the 0
+	tst	r2, # CHARTSTMASK(0)	@ 1st character
+	bne	10f
+	adds	r1,r1,#1
+	tst	r2, # CHARTSTMASK(1)	@ 2nd character
+	@ Bits 15 and 16 sit either side of the boundary between the mask bytes
+	@ of the 2nd and 3rd characters; the 2nd is known to be 00 by the time
+	@ the tsteq runs, so it tests the 3rd character in either endianness
+	ittt	eq
+	addeq	r1,r1,#1
+	tsteq	r2, # (3<<15)	@ 2nd & 3rd character
+	@ If not the 3rd must be the last one
+	addeq	r1,r1,#1
+
+10:
+	@ r0 is still at the beginning, r1 is pointing 1 byte after terminator
+	sub	r0, r1, r0
+	subs	r0, r0, #1
+	pop	{ r4, r6 }
+
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (r4)
+	cfi_restore (r6)
+
+	DO_RET(lr)
+
+END(strlen)
+libc_hidden_builtin_def (strlen)

[ARM] Optimised strchr and strlen

Commit Message

Comments

Patch