From patchwork Thu Oct 26 19:06:30 2017 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Adhemerval Zanella Netto X-Patchwork-Id: 117254 Delivered-To: patch@linaro.org Received: by 10.140.22.164 with SMTP id 33csp1124128qgn; Thu, 26 Oct 2017 12:07:48 -0700 (PDT) X-Google-Smtp-Source: ABhQp+Szeilv9oZvFDLH/U4UjT4cddVE1tAm6phXE4CmjS+88mYCEVWAaakGG/i+XI+K5ra5ej9P X-Received: by 10.98.202.74 with SMTP id n71mr6431254pfg.202.1509044868775; Thu, 26 Oct 2017 12:07:48 -0700 (PDT) ARC-Seal: i=1; a=rsa-sha256; t=1509044868; cv=none; d=google.com; s=arc-20160816; b=SCYja0ML+NCnbTg6FUspjJmrVFWu8scufPst44aPVZbdrUeIwV9cyD1FydvEvuIi4G Pg6IaoMrLTy+uqSeR8QnoIwfGNxMGMRNoWJN6RS5JsD5WGdQ7hkZk4Ex421rOsa6nc+W HgoBAcfUF2shPdul6qIFt0QNBk2bRltOIfMfkDtJmiR5uthdnU+TAkhoLT8e6bS+1X13 NM5RooRMZ3uGNZPr0VkNI9nJIo9ESjHFfgpoF9QDWR85NwGsLjy5Okw8a9QP14cJJ5IK +XmvD1ZhF5M7PANnl7fWlxUBDXZxYBQhY2yVxP7O6BdjQx1s6Yv+HrhQm9PsDxzngR9C OhVQ== ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=arc-20160816; h=references:in-reply-to:message-id:date:subject:to:from:delivered-to :sender:list-help:list-post:list-archive:list-subscribe :list-unsubscribe:list-id:precedence:mailing-list:dkim-signature :domainkey-signature:arc-authentication-results; bh=cZatx8kjES2qltP7dzWrePtRYsJVNd6/QqNzp7f/qHA=; b=mhsxnKjEKrasXQksqOHUTInBLCUsL4wANgGCgVj/s7Gs/mE0R7Cw8HtFdYM2bb5tlb ROURyt/nQzr7uXYukykmn5al1T/jE6jJ1ol469595kah/S1kxFHzzNEuC/nukJIv3rxe 6B0EJwo56rRHpZewcTdbKNe9/+pCiht9ejfYKvyipAMMGEdnjqCBqITIvO16yWDMPNVy Bo1wpdmW2CYeZUIWjwVwPJ1p5ho6TMOfgSEPwEGhjkGQIRnwcWhZkiqxKgfHDiv36xT/ /29qEkXwfAwk5bx9NfQV1wopvyxo028OXi8dN0ZfIzCB2Byis0jUEGC4z0fU1/+Q5qGz 9JhQ== ARC-Authentication-Results: i=1; mx.google.com; dkim=pass header.i=@sourceware.org header.s=default header.b=B/L2Rp19; spf=pass (google.com: domain of libc-alpha-return-86404-patch=linaro.org@sourceware.org designates 209.132.180.131 as permitted sender) smtp.mailfrom=libc-alpha-return-86404-patch=linaro.org@sourceware.org; dmarc=fail (p=NONE sp=NONE dis=NONE) header.from=linaro.org Return-Path: Received: from sourceware.org (server1.sourceware.org. [209.132.180.131]) by mx.google.com with ESMTPS id m63si3358144pld.191.2017.10.26.12.07.48 for (version=TLS1_2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128/128); Thu, 26 Oct 2017 12:07:48 -0700 (PDT) Received-SPF: pass (google.com: domain of libc-alpha-return-86404-patch=linaro.org@sourceware.org designates 209.132.180.131 as permitted sender) client-ip=209.132.180.131; Authentication-Results: mx.google.com; dkim=pass header.i=@sourceware.org header.s=default header.b=B/L2Rp19; spf=pass (google.com: domain of libc-alpha-return-86404-patch=linaro.org@sourceware.org designates 209.132.180.131 as permitted sender) smtp.mailfrom=libc-alpha-return-86404-patch=linaro.org@sourceware.org; dmarc=fail (p=NONE sp=NONE dis=NONE) header.from=linaro.org DomainKey-Signature: a=rsa-sha1; c=nofws; d=sourceware.org; h=list-id :list-unsubscribe:list-subscribe:list-archive:list-post :list-help:sender:from:to:subject:date:message-id:in-reply-to :references; q=dns; s=default; b=MFSIbGK+SvksIa/wLcA25/Jb2UpVsCS yA5cVmlWFHEKqia5M7VE9/KqsGQmy6ILlxUAB0dbnTbkDlN7+OvMdZy7tc0QJwX9 AhnU8KknstY2GjaFilCLhzFyAUqjt8XEatWVKmT9j7AqFX8qVUUGRQh+uhh0jstd d7lq8yCKP0m8= DKIM-Signature: v=1; a=rsa-sha1; c=relaxed; d=sourceware.org; h=list-id :list-unsubscribe:list-subscribe:list-archive:list-post :list-help:sender:from:to:subject:date:message-id:in-reply-to :references; s=default; bh=oBLQZcd6fqSNJ6nD/W2B6x9P+vs=; b=B/L2R p19IGGTWVo6qPgVP0FUQYzQvnuj+ZPTGjOSa1Mr7fRbhQy940vrEbPgluMXTK86j jukmmkbFElXv/XMrUs62AT2dHXfuN3P31m/zVowaN/kJVJcDhU4riF2ZoHYpaxtm E8VnH0NTLGdjNdOvNtci/tHkqYdHpObiwFicPI= Received: (qmail 97260 invoked by alias); 26 Oct 2017 19:07:10 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 97142 invoked by uid 89); 26 Oct 2017 19:07:08 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=-26.3 required=5.0 tests=BAYES_00, GIT_PATCH_0, GIT_PATCH_1, GIT_PATCH_2, GIT_PATCH_3, KAM_STOCKGEN, RCVD_IN_DNSWL_NONE, RCVD_IN_SORBS_SPAM, SPF_PASS autolearn=ham version=3.3.2 spammy=vmov, bic, sk:armv7l X-HELO: mail-qt0-f195.google.com X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:from:to:subject:date:message-id:in-reply-to :references; bh=cZatx8kjES2qltP7dzWrePtRYsJVNd6/QqNzp7f/qHA=; b=CxYOYb2Vg81oYBQ/oYSS7N5ogCsfmPCF2aDj4KUFZKXjd1B7jlTI/VaRhlaxSzj7C2 qpVyZ9TP6BT17f3FzaM7IweeM9MPHqFErsVJVOr6rWgsUHzYpV9Zvcl9ggXR8UUF+3Qz LwL1iK26zGDktZfTfTryd7oM8rcczPZxiu8N89JwJUFjKEM0SOgvAenng+sCYCZ8YXiX WCvw2M3N8F4w9m1QzioRjhpGWiKi2btzorCclf2VDrgihymDKovqYqxAENPCBFh3LvmK 6fQ46ceBX9/j24r+PJz7fioH9agfhojESDd5LNiLcHmCFVzQMVtWvML0O/NHiV4J30uG nMQA== X-Gm-Message-State: AMCzsaV26sTazpT5QdVOnmOC2CirlYzKz4OV7U9+oH42ilZ7JjI2YR15 iopO9Fqq0YCBQDr+LRnQKHF3abD0oqo= X-Received: by 10.237.35.58 with SMTP id h55mr36375953qtc.108.1509044822739; Thu, 26 Oct 2017 12:07:02 -0700 (PDT) From: Adhemerval Zanella To: libc-alpha@sourceware.org Subject: [PATCH 02/25] arm: Implement memchr ifunc selection in C Date: Thu, 26 Oct 2017 17:06:30 -0200 Message-Id: <1509044813-9951-3-git-send-email-adhemerval.zanella@linaro.org> In-Reply-To: <1509044813-9951-1-git-send-email-adhemerval.zanella@linaro.org> References: <1509044813-9951-1-git-send-email-adhemerval.zanella@linaro.org> This patch refactor ARM memchr ifunc selector to a C implementation. No functional change is expected, including ifunc resolution rules. It also reorganize the ifunc options code: 1. The memchr_impl.S is renamed to memchr_neon.S and multiple compilation options (which route to armv6t2/memchr one) is removed. The code to build if __ARM_NEON__ is defined is also simplified. 2. A memchr_noneon is added (which as build along previous ifunc resolution) and includes the armv6t2 direct. 3. Same as 2. for loader object. Alongside the aforementioned changes, it also some cleanus: - Internal memchr definition (__GI_memcpy) is now a hidden symbol. - No need to create hidden definition for the ifunc variants. Checked on armv7-linux-gnueabihf and with a build for arm-linux-gnueabi, arm-linux-gnueabihf with and without multiarch support and with both GCC 7.1 and GCC mainline. * sysdeps/arm/armv7/multiarch/Makefile [$(subdir) = string] (sysdeps_routines): Add memchr_noneon. * sysdeps/arm/armv7/multiarch/ifunc-memchr.h: New file. * sysdeps/arm/armv7/multiarch/memchr_noneon.S: Likewise. * sysdeps/arm/armv7/multiarch/rtld-memchr.S: Likewise. * sysdeps/arm/armv7/multiarch/memchr.S: Remove file. * sysdeps/arm/armv7/multiarch/memchr.c: New file. * sysdeps/arm/armv7/multiarch/memchr_impl.S: Move to ... * sysdeps/arm/armv7/multiarch/memchr_neon.S: ... here. Signed-off-by: Adhemerval Zanella --- ChangeLog | 10 ++ sysdeps/arm/armv7/multiarch/Makefile | 3 +- sysdeps/arm/armv7/multiarch/ifunc-memchr.h | 28 ++++ sysdeps/arm/armv7/multiarch/memchr.S | 59 -------- sysdeps/arm/armv7/multiarch/memchr.c | 35 +++++ sysdeps/arm/armv7/multiarch/memchr_impl.S | 219 --------------------------- sysdeps/arm/armv7/multiarch/memchr_neon.S | 221 +++++++++++++++++++++++++++- sysdeps/arm/armv7/multiarch/memchr_noneon.S | 5 + sysdeps/arm/armv7/multiarch/rtld-memchr.S | 1 + 9 files changed, 296 insertions(+), 285 deletions(-) create mode 100644 sysdeps/arm/armv7/multiarch/ifunc-memchr.h delete mode 100644 sysdeps/arm/armv7/multiarch/memchr.S create mode 100644 sysdeps/arm/armv7/multiarch/memchr.c delete mode 100644 sysdeps/arm/armv7/multiarch/memchr_impl.S create mode 100644 sysdeps/arm/armv7/multiarch/memchr_noneon.S create mode 100644 sysdeps/arm/armv7/multiarch/rtld-memchr.S -- 2.7.4 diff --git a/sysdeps/arm/armv7/multiarch/Makefile b/sysdeps/arm/armv7/multiarch/Makefile index 1e62ef9..6e5851f 100644 --- a/sysdeps/arm/armv7/multiarch/Makefile +++ b/sysdeps/arm/armv7/multiarch/Makefile @@ -1,3 +1,4 @@ ifeq ($(subdir),string) -sysdep_routines += memcpy_neon memcpy_vfp memchr_neon memcpy_arm +sysdep_routines += memcpy_neon memcpy_vfp memchr_neon memcpy_arm \ + memchr_noneon endif diff --git a/sysdeps/arm/armv7/multiarch/ifunc-memchr.h b/sysdeps/arm/armv7/multiarch/ifunc-memchr.h new file mode 100644 index 0000000..42f89fa --- /dev/null +++ b/sysdeps/arm/armv7/multiarch/ifunc-memchr.h @@ -0,0 +1,28 @@ +/* Common definition for memchr resolver. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +__typeof (REDIRECT_NAME) OPTIMIZE (neon) attribute_hidden; +__typeof (REDIRECT_NAME) OPTIMIZE (noneon) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (int hwcap) +{ + if (hwcap & HWCAP_ARM_NEON) + return OPTIMIZE (neon); + return OPTIMIZE (noneon); +} diff --git a/sysdeps/arm/armv7/multiarch/memchr.S b/sysdeps/arm/armv7/multiarch/memchr.S deleted file mode 100644 index 8e8097a..0000000 --- a/sysdeps/arm/armv7/multiarch/memchr.S +++ /dev/null @@ -1,59 +0,0 @@ -/* Multiple versions of memchr - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2013-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#include -#include - -#if IS_IN (libc) -/* Under __ARM_NEON__, memchr_neon.S defines the name memchr. */ -# ifndef __ARM_NEON__ - .text - .arm -ENTRY(memchr) - .type memchr, %gnu_indirect_function - ldr r1, .Lmemchr_noneon - tst r0, #HWCAP_ARM_NEON - ldrne r1, .Lmemchr_neon -1: - add r0, r1, pc - DO_RET(lr) - -.Lmemchr_noneon: - .long C_SYMBOL_NAME(__memchr_noneon) - 1b - 8 -.Lmemchr_neon: - .long C_SYMBOL_NAME(__memchr_neon) - 1b - 8 - -END(memchr) - -libc_hidden_builtin_def (memchr) -# endif /* Not __ARM_NEON__. */ -libc_hidden_def (__memchr_noneon) - -# undef libc_hidden_builtin_def -# define libc_hidden_builtin_def(name) -# undef weak_alias -# define weak_alias(x, y) -# undef libc_hidden_def -# define libc_hidden_def(name) - -# define memchr __memchr_noneon - -#endif - -#include "memchr_impl.S" diff --git a/sysdeps/arm/armv7/multiarch/memchr.c b/sysdeps/arm/armv7/multiarch/memchr.c new file mode 100644 index 0000000..906bcd5 --- /dev/null +++ b/sysdeps/arm/armv7/multiarch/memchr.c @@ -0,0 +1,35 @@ +/* Multiple versions of memchr. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +/* For __ARM_NEON__ memchr_neon.S defines memchr directly and ifunc + is not used. */ +#if IS_IN (libc) && !defined (__ARM_NEON__) +# define memchr __redirect_memchr +# include +# undef memchr + +# include + +# define SYMBOL_NAME memchr +# include "ifunc-memchr.h" + +arm_libc_ifunc_redirected (__redirect_memchr, memchr, IFUNC_SELECTOR); + +arm_libc_ifunc_hidden_def (__redirect_memchr, memchr); +#endif diff --git a/sysdeps/arm/armv7/multiarch/memchr_impl.S b/sysdeps/arm/armv7/multiarch/memchr_impl.S deleted file mode 100644 index e8cbb97..0000000 --- a/sysdeps/arm/armv7/multiarch/memchr_impl.S +++ /dev/null @@ -1,219 +0,0 @@ -/* memchr implemented using NEON. - Copyright (C) 2011-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library. If not, see - . */ - -#ifdef MEMCHR_NEON - -#include - - .arch armv7-a - .fpu neon - - -/* Arguments */ -#define srcin r0 -#define chrin r1 -#define cntin r2 - -/* Retval */ -#define result r0 /* Live range does not overlap with srcin */ - -/* Working registers */ -#define src r1 /* Live range does not overlap with chrin */ -#define tmp r3 -#define synd r0 /* No overlap with srcin or result */ -#define soff r12 - -/* Working NEON registers */ -#define vrepchr q0 -#define vdata0 q1 -#define vdata0_0 d2 /* Lower half of vdata0 */ -#define vdata0_1 d3 /* Upper half of vdata0 */ -#define vdata1 q2 -#define vdata1_0 d4 /* Lower half of vhas_chr0 */ -#define vdata1_1 d5 /* Upper half of vhas_chr0 */ -#define vrepmask q3 -#define vrepmask0 d6 -#define vrepmask1 d7 -#define vend q4 -#define vend0 d8 -#define vend1 d9 - -/* - * Core algorithm: - * - * For each 32-byte chunk we calculate a 32-bit syndrome value, with one bit per - * byte. Each bit is set if the relevant byte matched the requested character - * and cleared otherwise. Since the bits in the syndrome reflect exactly the - * order in which things occur in the original string, counting trailing zeros - * allows to identify exactly which byte has matched. - */ - -#ifndef NO_THUMB - .thumb_func -#else - .arm -#endif - .p2align 4,,15 - -ENTRY(memchr) - /* Use a simple loop if there are less than 8 bytes to search. */ - cmp cntin, #7 - bhi .Llargestr - and chrin, chrin, #0xff - -.Lsmallstr: - subs cntin, cntin, #1 - blo .Lnotfound /* Return not found if reached end. */ - ldrb tmp, [srcin], #1 - cmp tmp, chrin - bne .Lsmallstr /* Loop again if not found. */ - /* Otherwise fixup address and return. */ - sub result, srcin, #1 - bx lr - - -.Llargestr: - vdup.8 vrepchr, chrin /* Duplicate char across all lanes. */ - /* - * Magic constant 0x8040201008040201 allows us to identify which lane - * matches the requested byte. - */ - movw tmp, #0x0201 - movt tmp, #0x0804 - lsl soff, tmp, #4 - vmov vrepmask0, tmp, soff - vmov vrepmask1, tmp, soff - /* Work with aligned 32-byte chunks */ - bic src, srcin, #31 - ands soff, srcin, #31 - beq .Lloopintro /* Go straight to main loop if it's aligned. */ - - /* - * Input string is not 32-byte aligned. We calculate the syndrome - * value for the aligned 32 bytes block containing the first bytes - * and mask the irrelevant part. - */ - vld1.8 {vdata0, vdata1}, [src:256]! - sub tmp, soff, #32 - adds cntin, cntin, tmp - vceq.i8 vdata0, vdata0, vrepchr - vceq.i8 vdata1, vdata1, vrepchr - vand vdata0, vdata0, vrepmask - vand vdata1, vdata1, vrepmask - vpadd.i8 vdata0_0, vdata0_0, vdata0_1 - vpadd.i8 vdata1_0, vdata1_0, vdata1_1 - vpadd.i8 vdata0_0, vdata0_0, vdata1_0 - vpadd.i8 vdata0_0, vdata0_0, vdata0_0 - vmov synd, vdata0_0[0] - - /* Clear the soff lower bits */ - lsr synd, synd, soff - lsl synd, synd, soff - /* The first block can also be the last */ - bls .Lmasklast - /* Have we found something already? */ -#ifndef NO_THUMB - cbnz synd, .Ltail -#else - cmp synd, #0 - bne .Ltail -#endif - - -.Lloopintro: - vpush {vend} - /* 264/265 correspond to d8/d9 for q4 */ - cfi_adjust_cfa_offset (16) - cfi_rel_offset (264, 0) - cfi_rel_offset (265, 8) - .p2align 3,,7 -.Lloop: - vld1.8 {vdata0, vdata1}, [src:256]! - subs cntin, cntin, #32 - vceq.i8 vdata0, vdata0, vrepchr - vceq.i8 vdata1, vdata1, vrepchr - /* If we're out of data we finish regardless of the result. */ - bls .Lend - /* Use a fast check for the termination condition. */ - vorr vend, vdata0, vdata1 - vorr vend0, vend0, vend1 - vmov synd, tmp, vend0 - orrs synd, synd, tmp - /* We're not out of data, loop if we haven't found the character. */ - beq .Lloop - -.Lend: - vpop {vend} - cfi_adjust_cfa_offset (-16) - cfi_restore (264) - cfi_restore (265) - - /* Termination condition found, let's calculate the syndrome value. */ - vand vdata0, vdata0, vrepmask - vand vdata1, vdata1, vrepmask - vpadd.i8 vdata0_0, vdata0_0, vdata0_1 - vpadd.i8 vdata1_0, vdata1_0, vdata1_1 - vpadd.i8 vdata0_0, vdata0_0, vdata1_0 - vpadd.i8 vdata0_0, vdata0_0, vdata0_0 - vmov synd, vdata0_0[0] -#ifndef NO_THUMB - cbz synd, .Lnotfound - bhi .Ltail /* Uses the condition code from - subs cntin, cntin, #32 above. */ -#else - cmp synd, #0 - beq .Lnotfound - cmp cntin, #0 - bhi .Ltail -#endif - - -.Lmasklast: - /* Clear the (-cntin) upper bits to avoid out-of-bounds matches. */ - neg cntin, cntin - lsl synd, synd, cntin - lsrs synd, synd, cntin - it eq - moveq src, #0 /* If no match, set src to 0 so the retval is 0. */ - - -.Ltail: - /* Count the trailing zeros using bit reversing */ - rbit synd, synd - /* Compensate the last post-increment */ - sub src, src, #32 - /* Count the leading zeros */ - clz synd, synd - /* Compute the potential result and return */ - add result, src, synd - bx lr - - -.Lnotfound: - /* Set result to NULL if not found and return */ - mov result, #0 - bx lr - -END(memchr) -libc_hidden_builtin_def (memchr) - -#else - -#include "../../armv6t2/memchr.S" - -#endif diff --git a/sysdeps/arm/armv7/multiarch/memchr_neon.S b/sysdeps/arm/armv7/multiarch/memchr_neon.S index ee21818..a400033 100644 --- a/sysdeps/arm/armv7/multiarch/memchr_neon.S +++ b/sysdeps/arm/armv7/multiarch/memchr_neon.S @@ -1,9 +1,218 @@ -#ifdef __ARM_NEON__ -/* Under __ARM_NEON__, this file defines memchr directly. */ -libc_hidden_builtin_def (memchr) -#else +/* memchr implemented using NEON. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#include + +/* For __ARM_NEON__ this file defines memchr. */ +#ifndef __ARM_NEON__ # define memchr __memchr_neon +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(a) +#endif + + .arch armv7-a + .fpu neon + + +/* Arguments */ +#define srcin r0 +#define chrin r1 +#define cntin r2 + +/* Retval */ +#define result r0 /* Live range does not overlap with srcin */ + +/* Working registers */ +#define src r1 /* Live range does not overlap with chrin */ +#define tmp r3 +#define synd r0 /* No overlap with srcin or result */ +#define soff r12 + +/* Working NEON registers */ +#define vrepchr q0 +#define vdata0 q1 +#define vdata0_0 d2 /* Lower half of vdata0 */ +#define vdata0_1 d3 /* Upper half of vdata0 */ +#define vdata1 q2 +#define vdata1_0 d4 /* Lower half of vhas_chr0 */ +#define vdata1_1 d5 /* Upper half of vhas_chr0 */ +#define vrepmask q3 +#define vrepmask0 d6 +#define vrepmask1 d7 +#define vend q4 +#define vend0 d8 +#define vend1 d9 + +/* + * Core algorithm: + * + * For each 32-byte chunk we calculate a 32-bit syndrome value, with one bit per + * byte. Each bit is set if the relevant byte matched the requested character + * and cleared otherwise. Since the bits in the syndrome reflect exactly the + * order in which things occur in the original string, counting trailing zeros + * allows to identify exactly which byte has matched. + */ + +#ifndef NO_THUMB + .thumb_func +#else + .arm +#endif + .p2align 4,,15 + +ENTRY(memchr) + /* Use a simple loop if there are less than 8 bytes to search. */ + cmp cntin, #7 + bhi .Llargestr + and chrin, chrin, #0xff + +.Lsmallstr: + subs cntin, cntin, #1 + blo .Lnotfound /* Return not found if reached end. */ + ldrb tmp, [srcin], #1 + cmp tmp, chrin + bne .Lsmallstr /* Loop again if not found. */ + /* Otherwise fixup address and return. */ + sub result, srcin, #1 + bx lr + + +.Llargestr: + vdup.8 vrepchr, chrin /* Duplicate char across all lanes. */ + /* + * Magic constant 0x8040201008040201 allows us to identify which lane + * matches the requested byte. + */ + movw tmp, #0x0201 + movt tmp, #0x0804 + lsl soff, tmp, #4 + vmov vrepmask0, tmp, soff + vmov vrepmask1, tmp, soff + /* Work with aligned 32-byte chunks */ + bic src, srcin, #31 + ands soff, srcin, #31 + beq .Lloopintro /* Go straight to main loop if it's aligned. */ + + /* + * Input string is not 32-byte aligned. We calculate the syndrome + * value for the aligned 32 bytes block containing the first bytes + * and mask the irrelevant part. + */ + vld1.8 {vdata0, vdata1}, [src:256]! + sub tmp, soff, #32 + adds cntin, cntin, tmp + vceq.i8 vdata0, vdata0, vrepchr + vceq.i8 vdata1, vdata1, vrepchr + vand vdata0, vdata0, vrepmask + vand vdata1, vdata1, vrepmask + vpadd.i8 vdata0_0, vdata0_0, vdata0_1 + vpadd.i8 vdata1_0, vdata1_0, vdata1_1 + vpadd.i8 vdata0_0, vdata0_0, vdata1_0 + vpadd.i8 vdata0_0, vdata0_0, vdata0_0 + vmov synd, vdata0_0[0] + + /* Clear the soff lower bits */ + lsr synd, synd, soff + lsl synd, synd, soff + /* The first block can also be the last */ + bls .Lmasklast + /* Have we found something already? */ +#ifndef NO_THUMB + cbnz synd, .Ltail +#else + cmp synd, #0 + bne .Ltail #endif -#define MEMCHR_NEON -#include "memchr_impl.S" + +.Lloopintro: + vpush {vend} + /* 264/265 correspond to d8/d9 for q4 */ + cfi_adjust_cfa_offset (16) + cfi_rel_offset (264, 0) + cfi_rel_offset (265, 8) + .p2align 3,,7 +.Lloop: + vld1.8 {vdata0, vdata1}, [src:256]! + subs cntin, cntin, #32 + vceq.i8 vdata0, vdata0, vrepchr + vceq.i8 vdata1, vdata1, vrepchr + /* If we're out of data we finish regardless of the result. */ + bls .Lend + /* Use a fast check for the termination condition. */ + vorr vend, vdata0, vdata1 + vorr vend0, vend0, vend1 + vmov synd, tmp, vend0 + orrs synd, synd, tmp + /* We're not out of data, loop if we haven't found the character. */ + beq .Lloop + +.Lend: + vpop {vend} + cfi_adjust_cfa_offset (-16) + cfi_restore (264) + cfi_restore (265) + + /* Termination condition found, let's calculate the syndrome value. */ + vand vdata0, vdata0, vrepmask + vand vdata1, vdata1, vrepmask + vpadd.i8 vdata0_0, vdata0_0, vdata0_1 + vpadd.i8 vdata1_0, vdata1_0, vdata1_1 + vpadd.i8 vdata0_0, vdata0_0, vdata1_0 + vpadd.i8 vdata0_0, vdata0_0, vdata0_0 + vmov synd, vdata0_0[0] +#ifndef NO_THUMB + cbz synd, .Lnotfound + bhi .Ltail /* Uses the condition code from + subs cntin, cntin, #32 above. */ +#else + cmp synd, #0 + beq .Lnotfound + cmp cntin, #0 + bhi .Ltail +#endif + + +.Lmasklast: + /* Clear the (-cntin) upper bits to avoid out-of-bounds matches. */ + neg cntin, cntin + lsl synd, synd, cntin + lsrs synd, synd, cntin + it eq + moveq src, #0 /* If no match, set src to 0 so the retval is 0. */ + + +.Ltail: + /* Count the trailing zeros using bit reversing */ + rbit synd, synd + /* Compensate the last post-increment */ + sub src, src, #32 + /* Count the leading zeros */ + clz synd, synd + /* Compute the potential result and return */ + add result, src, synd + bx lr + + +.Lnotfound: + /* Set result to NULL if not found and return */ + mov result, #0 + bx lr + +END(memchr) +libc_hidden_builtin_def (memchr) diff --git a/sysdeps/arm/armv7/multiarch/memchr_noneon.S b/sysdeps/arm/armv7/multiarch/memchr_noneon.S new file mode 100644 index 0000000..b1fb540 --- /dev/null +++ b/sysdeps/arm/armv7/multiarch/memchr_noneon.S @@ -0,0 +1,5 @@ +#define memchr __memchr_noneon +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) + +#include diff --git a/sysdeps/arm/armv7/multiarch/rtld-memchr.S b/sysdeps/arm/armv7/multiarch/rtld-memchr.S new file mode 100644 index 0000000..ae8e5f0 --- /dev/null +++ b/sysdeps/arm/armv7/multiarch/rtld-memchr.S @@ -0,0 +1 @@ +#include