From patchwork Wed Aug  1 22:23:47 2018
X-Patchwork-Submitter: Richard Henderson
X-Patchwork-Id: 143319
From: rth@twiddle.net
To: libc-alpha@sourceware.org
Cc: marcus.shawcroft@linaro.org, szabolcs.nagy@arm.com, Richard Henderson
Subject: [PATCH 3/3] aarch64: Save and restore SVE registers in ld.so
Date: Wed, 1 Aug 2018 18:23:47 -0400
Message-Id: <20180801222347.18903-4-rth@twiddle.net>
In-Reply-To: <20180801222347.18903-1-rth@twiddle.net>
References: <20180801222347.18903-1-rth@twiddle.net>

From: Richard Henderson <rth@twiddle.net>

Add SVE versions of _dl_runtime_resolve and _dl_runtime_profile.
This honors the extended vector calling convention described in
ARM_100986_0000_00_en (SVEpcs 00bet1).

	* sysdeps/aarch64/dl-trampoline.S (_dl_runtime_resolve_sve): New.
	(_dl_runtime_profile_sve): New.
	* sysdeps/aarch64/dl-machine.h (elf_machine_runtime_setup): Use
	the new routines if HWCAP_SVE is set.
---
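Notes for review, not for the commit message: the has_sve flag below
comes from GLRO(dl_hwcap), the hwcap word that ld.so captures from the
kernel's AT_HWCAP auxiliary vector entry at startup.  For anyone who
wants to probe the same capability from application code, a rough
user-space analogue of the test looks like this (illustrative only;
getauxval is of course not usable this early inside ld.so itself):

    #include <sys/auxv.h>	/* getauxval, AT_HWCAP */

    #ifndef HWCAP_SVE		/* from <asm/hwcap.h> on AArch64 Linux */
    # define HWCAP_SVE (1 << 22)
    #endif

    /* User-space analogue of the GLRO(dl_hwcap) & HWCAP_SVE check.  */
    static int
    have_sve (void)
    {
      return (getauxval (AT_HWCAP) & HWCAP_SVE) != 0;
    }
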
 sysdeps/aarch64/dl-machine.h    |  13 +-
 sysdeps/aarch64/dl-trampoline.S | 343 ++++++++++++++++++++++++++++++++
 2 files changed, 353 insertions(+), 3 deletions(-)

-- 
2.17.1

diff --git a/sysdeps/aarch64/dl-machine.h b/sysdeps/aarch64/dl-machine.h
index 4935aa7c54..ea7c5c71d5 100644
--- a/sysdeps/aarch64/dl-machine.h
+++ b/sysdeps/aarch64/dl-machine.h
@@ -69,6 +69,9 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
   ElfW(Addr) *got;
   extern void _dl_runtime_resolve (ElfW(Word));
   extern void _dl_runtime_profile (ElfW(Word));
+  extern void _dl_runtime_resolve_sve (ElfW(Word));
+  extern void _dl_runtime_profile_sve (ElfW(Word));
+  unsigned has_sve = GLRO(dl_hwcap) & HWCAP_SVE;
 
   got = (ElfW(Addr) *) D_PTR (l, l_info[DT_PLTGOT]);
   if (got[1])
@@ -83,9 +86,11 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 	 to intercept the calls to collect information.  In this case we
 	 don't store the address in the GOT so that all future calls also
 	 end in this function.  */
-      if ( profile)
+      if (profile)
 	{
-	  got[2] = (ElfW(Addr)) &_dl_runtime_profile;
+	  got[2] = (has_sve
+		    ? (ElfW(Addr)) &_dl_runtime_profile_sve
+		    : (ElfW(Addr)) &_dl_runtime_profile);
 
 	  if (GLRO(dl_profile) != NULL
 	      && _dl_name_match_p (GLRO(dl_profile), l))
@@ -98,7 +103,9 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 	 /* This function will get called to fix up the GOT entry
 	    indicated by the offset on the stack, and then jump to
 	    the resolved address.  */
-	  got[2] = (ElfW(Addr)) &_dl_runtime_resolve;
+	  got[2] = (has_sve
+		    ? (ElfW(Addr)) &_dl_runtime_resolve_sve
+		    : (ElfW(Addr)) &_dl_runtime_resolve);
 	}
     }
 
diff --git a/sysdeps/aarch64/dl-trampoline.S b/sysdeps/aarch64/dl-trampoline.S
index 67a7c1b207..e23e5f1aad 100644
--- a/sysdeps/aarch64/dl-trampoline.S
+++ b/sysdeps/aarch64/dl-trampoline.S
@@ -280,3 +280,346 @@ _dl_runtime_profile:
 	cfi_endproc
 	.size _dl_runtime_profile, .-_dl_runtime_profile
 #endif
+
+/*
+ * For functions conforming to the procedure call standard as
+ * amended for SVE support (ARM_100986_0000_00_en (SVEpcs 00bet1)),
+ * we must save the entire contents of Z0-Z7 as well as P0-P3.
+ */
+	.arch armv8-a+sve
+
+	.globl _dl_runtime_resolve_sve
+	.type _dl_runtime_resolve_sve, #function
+	.align 2
+_dl_runtime_resolve_sve:
+	/* AArch64 we get called with:
+	   ip0		&PLTGOT[2]
+	   ip1		temp(dl resolver entry point)
+	   [sp, #8]	lr
+	   [sp, #0]	&PLTGOT[n]
+	 */
+	cfi_startproc
+	cfi_adjust_cfa_offset(16)	/* Incorporate PLT */
+	cfi_rel_offset (lr, 8)
+
+	/* Save arguments.  */
+	stp	x29, x8, [sp, #-80]!
+	cfi_adjust_cfa_offset (80)
+	cfi_rel_offset (x29, 0)
+	mov	x29, sp
+	cfi_def_cfa_register (x29)
+
+	stp	x6, x7, [sp, #16]
+	stp	x4, x5, [sp, #32]
+	stp	x2, x3, [sp, #48]
+	stp	x0, x1, [sp, #64]
+
+	/* Allocate space for, and store, Z[0-7].  */
+	addvl	sp, sp, #-8
+	str	z0, [sp, #0, mul vl]
+	str	z1, [sp, #1, mul vl]
+	str	z2, [sp, #2, mul vl]
+	str	z3, [sp, #3, mul vl]
+	str	z4, [sp, #4, mul vl]
+	str	z5, [sp, #5, mul vl]
+	str	z6, [sp, #6, mul vl]
+	str	z7, [sp, #7, mul vl]
+
+	/* Allocate space for, and store, P[0-3].  */
+	addpl	sp, sp, #-4
+	str	p0, [sp, #0, mul vl]
+	str	p1, [sp, #1, mul vl]
+	str	p2, [sp, #2, mul vl]
+	str	p3, [sp, #3, mul vl]
+
+	/* Get pointer to linker struct.  */
+	ldr	PTR_REG (0), [ip0, #-PTR_SIZE]
+
+	/* Prepare to call _dl_fixup().  */
+	ldr	x1, [x29, 80]		/* Recover &PLTGOT[n] */
+
+	sub	x1, x1, ip0
+	add	x1, x1, x1, lsl #1
+	lsl	x1, x1, #3
+	sub	x1, x1, #(RELA_SIZE<<3)
+	lsr	x1, x1, #3
+
+	/* Call fixup routine.  */
+	bl	_dl_fixup
+
+	/* Save the return.  */
+	mov	ip0, x0
+
+	/* Get arguments and return address back.  */
+	ldr	p0, [sp, #0, mul vl]
+	ldr	p1, [sp, #1, mul vl]
+	ldr	p2, [sp, #2, mul vl]
+	ldr	p3, [sp, #3, mul vl]
+	addpl	sp, sp, #4
+
+	ldr	z0, [sp, #0, mul vl]
+	ldr	z1, [sp, #1, mul vl]
+	ldr	z2, [sp, #2, mul vl]
+	ldr	z3, [sp, #3, mul vl]
+	ldr	z4, [sp, #4, mul vl]
+	ldr	z5, [sp, #5, mul vl]
+	ldr	z6, [sp, #6, mul vl]
+	ldr	z7, [sp, #7, mul vl]
+	addvl	sp, sp, #8
+
+	ldr	lr, [sp, #88]
+	ldp	x0, x1, [sp, #64]
+	ldp	x2, x3, [sp, #48]
+	ldp	x4, x5, [sp, #32]
+	ldp	x6, x7, [sp, #16]
+	ldp	x29, x8, [sp], #96
+	cfi_def_cfa (sp, 0)
+	cfi_restore (lr)
+	cfi_restore (x29)
+
+	/* Jump to the newly found address.  */
+	br	ip0
+
+	cfi_endproc
+	.size _dl_runtime_resolve_sve, .-_dl_runtime_resolve_sve
+
+#ifndef PROF
+	.globl _dl_runtime_profile_sve
+	.type _dl_runtime_profile_sve, #function
+	.align 2
+_dl_runtime_profile_sve:
+	/* AArch64 we get called with:
+	   ip0		&PLTGOT[2]
+	   ip1		temp(dl resolver entry point)
+	   [sp, #8]	lr
+	   [sp, #0]	&PLTGOT[n]
+
+	   Stack frame layout:
+	   [x29,  #...] lr
+	   [x29,  #...] &PLTGOT[n]
+	   [x29,   #96] La_aarch64_regs
+	   [x29,   #48] La_aarch64_retval
+	   [x29,   #40] frame size return from pltenter
+	   [x29,   #32] dl_profile_call saved x1
+	   [x29,   #24] dl_profile_call saved x0
+	   [x29,   #16] t1
+	   [x29,    #0] x29, lr   <- x29
+	   [x29, #-1, mul vl] full p[0-3]
+	   [x29, #-2, mul vl] full z[0-7]   <- sp
+
+	   ??? Extending the profiling hook for full SVE register export
+	   is tricky given the variable register size.  Perhaps the new
+	   La_aarch64_regs should contain pointers to Z0 and P0, and
+	   the current VL, and one infers the addresses from there.
+
+	   This one new form could be used for all, with AdvSIMD
+	   devolving into VL=16 with no predicate registers.
+
+	   In the meantime, this function simply saves the contents of
+	   the SVE registers, but only exposes the AdvSIMD portion to
+	   the profile hooks.
+	 */
+
+	cfi_startproc
+	cfi_adjust_cfa_offset(16)	/* Incorporate PLT */
+	cfi_rel_offset (lr, 8)
+
+	stp	x29, x8, [sp, #-SF_SIZE]!
+	cfi_adjust_cfa_offset (SF_SIZE)
+	cfi_rel_offset (x29, 0)
+	mov	x29, sp
+	cfi_def_cfa_register (x29)
+
+	/* Save La_aarch64_regs.  */
+	stp	x0, x1, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*0]
+	stp	x2, x3, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*1]
+	stp	x4, x5, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*2]
+	stp	x6, x7, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*3]
+	stp	d0, d1, [x29, #OFFSET_RG + DL_OFFSET_RG_D0 + 16*0]
+	stp	d2, d3, [x29, #OFFSET_RG + DL_OFFSET_RG_D0 + 16*1]
+	stp	d4, d5, [x29, #OFFSET_RG + DL_OFFSET_RG_D0 + 16*2]
+	stp	d6, d7, [x29, #OFFSET_RG + DL_OFFSET_RG_D0 + 16*3]
+
+	/* Re-save the full contents of the vector arguments.
+
+	   Note that PL = VL/8, so we can save all 4 predicates
+	   in (less than) the space of one vector; this minimizes
+	   the number of stack adjustments required, and gives a
+	   predictable place for each register.
+
+	   Despite the unfortunate assembler mnemonics, the vector
+	   stores do not overlap the preceding predicate stores.  */
+	addvl	sp, sp, #-9
+
+	str	p0, [x29, #-1, mul vl]
+	str	p1, [x29, #-2, mul vl]
+	str	p2, [x29, #-3, mul vl]
+	str	p3, [x29, #-4, mul vl]
+
+	str	z0, [x29, #-2, mul vl]
+	str	z1, [x29, #-3, mul vl]
+	str	z2, [x29, #-4, mul vl]
+	str	z3, [x29, #-5, mul vl]
+	str	z4, [x29, #-6, mul vl]
+	str	z5, [x29, #-7, mul vl]
+	str	z6, [x29, #-8, mul vl]
+	str	z7, [x29, #-9, mul vl]
+
+	add	x0, x29, #SF_SIZE + 16
+	ldr	x1, [x29, #OFFSET_LR]
+	stp	x0, x1, [x29, #OFFSET_RG + DL_OFFSET_RG_SP]
+
+	/* Get pointer to linker struct.  */
+	ldr	PTR_REG (0), [ip0, #-PTR_SIZE]
+
+	/* Prepare to call _dl_profile_fixup().  */
+	ldr	x1, [x29, OFFSET_PLTGOTN]	/* Recover &PLTGOT[n] */
+
+	sub	x1, x1, ip0
+	add	x1, x1, x1, lsl #1
+	lsl	x1, x1, #3
+	sub	x1, x1, #(RELA_SIZE<<3)
+	lsr	x1, x1, #3
+
+	stp	x0, x1, [x29, #OFFSET_SAVED_CALL_X0]
+
+	/* Set up extra args for _dl_profile_fixup.  */
+	ldr	x2, [x29, #OFFSET_LR]		/* load saved LR */
+	add	x3, x29, #OFFSET_RG		/* address of La_aarch64_regs */
+	add	x4, x29, #OFFSET_FS		/* address of framesize */
+	bl	_dl_profile_fixup
+
+	ldr	ip0l, [x29, #OFFSET_FS]		/* framesize == 0 */
+	cmp	ip0l, #0
+	bge	1f
+	cfi_remember_state
+
+	/* Save the return.  */
+	mov	ip0, x0
+
+	/* Get arguments and return address back.  */
+	ldr	p0, [x29, #-1, mul vl]
+	ldr	p1, [x29, #-2, mul vl]
+	ldr	p2, [x29, #-3, mul vl]
+	ldr	p3, [x29, #-4, mul vl]
+
+	ldr	z0, [x29, #-2, mul vl]
+	ldr	z1, [x29, #-3, mul vl]
+	ldr	z2, [x29, #-4, mul vl]
+	ldr	z3, [x29, #-5, mul vl]
+	ldr	z4, [x29, #-6, mul vl]
+	ldr	z5, [x29, #-7, mul vl]
+	ldr	z6, [x29, #-8, mul vl]
+	ldr	z7, [x29, #-9, mul vl]
+
+	ldr	lr, [x29, #OFFSET_LR]
+	ldp	x0, x1, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*0]
+	ldp	x2, x3, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*1]
+	ldp	x4, x5, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*2]
+	ldp	x6, x7, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*3]
+
+	mov	sp, x29
+	ldp	x29, x8, [sp], SF_SIZE + 16
+	cfi_def_cfa (sp, 0)
+	cfi_restore (x29)
+	cfi_restore (lr)
+
+	/* Jump to the newly found address.  */
+	br	ip0
+
+	cfi_restore_state
+	/* The new frame size is in ip0, extended for pointer size.  */
+1:	sub	x1, sp, ip0
+	and	sp, x1, #0xfffffffffffffff0
+
+	str	x0, [x29, #OFFSET_T1]
+
+	mov	x0, sp
+	add	x1, x29, #SF_SIZE + 16
+	mov	x2, ip0
+	bl	memcpy
+
+	ldr	ip0, [x29, #OFFSET_T1]
+
+	/* Reload the full arguments.  */
+	ldp	x0, x1, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*0]
+	ldp	x2, x3, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*1]
+	ldp	x4, x5, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*2]
+	ldp	x6, x7, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*3]
+	ldr	x8, [x29, 8]
+
+	ldr	p0, [x29, #-1, mul vl]
+	ldr	p1, [x29, #-2, mul vl]
+	ldr	p2, [x29, #-3, mul vl]
+	ldr	p3, [x29, #-4, mul vl]
+
+	ldr	z0, [x29, #-2, mul vl]
+	ldr	z1, [x29, #-3, mul vl]
+	ldr	z2, [x29, #-4, mul vl]
+	ldr	z3, [x29, #-5, mul vl]
+	ldr	z4, [x29, #-6, mul vl]
+	ldr	z5, [x29, #-7, mul vl]
+	ldr	z6, [x29, #-8, mul vl]
+	ldr	z7, [x29, #-9, mul vl]
+
+	/* Call the function.  */
+	blr	ip0
+
+	/* Store La_aarch64_retval, as if for the non-vector ABI.  */
+	stp	x0, x1, [x29, #OFFSET_RV + DL_OFFSET_RV_X0]
+	stp	d0, d1, [x29, #OFFSET_RV + DL_OFFSET_RV_D0 + 16*0]
+	stp	d2, d3, [x29, #OFFSET_RV + DL_OFFSET_RV_D0 + 16*1]
+
+	/* Store the full contents of the vector return.  */
+	str	p0, [x29, #-1, mul vl]
+	str	p1, [x29, #-2, mul vl]
+	str	p2, [x29, #-3, mul vl]
+	str	p3, [x29, #-4, mul vl]
+
+	str	z0, [x29, #-2, mul vl]
+	str	z1, [x29, #-3, mul vl]
+	str	z2, [x29, #-4, mul vl]
+	str	z3, [x29, #-5, mul vl]
+	str	z4, [x29, #-6, mul vl]
+	str	z5, [x29, #-7, mul vl]
+	str	z6, [x29, #-8, mul vl]
+	str	z7, [x29, #-9, mul vl]
+
+	/* Setup call to pltexit.  */
+	ldp	x0, x1, [x29, #OFFSET_SAVED_CALL_X0]
+	add	x2, x29, #OFFSET_RG
+	add	x3, x29, #OFFSET_RV
+	bl	_dl_call_pltexit
+
+	/* Reload the full return value.  */
+	ldp	x0, x1, [x29, #OFFSET_RV + DL_OFFSET_RV_X0]
+
+	ldr	p0, [x29, #-1, mul vl]
+	ldr	p1, [x29, #-2, mul vl]
+	ldr	p2, [x29, #-3, mul vl]
+	ldr	p3, [x29, #-4, mul vl]
+
+	ldr	z0, [x29, #-2, mul vl]
+	ldr	z1, [x29, #-3, mul vl]
+	ldr	z2, [x29, #-4, mul vl]
+	ldr	z3, [x29, #-5, mul vl]
+	ldr	z4, [x29, #-6, mul vl]
+	ldr	z5, [x29, #-7, mul vl]
+	ldr	z6, [x29, #-8, mul vl]
+	ldr	z7, [x29, #-9, mul vl]
+
+	/* LR from within La_aarch64_regs.  */
+	ldr	lr, [x29, #OFFSET_RG + DL_OFFSET_RG_LR]
+	mov	sp, x29
+	cfi_def_cfa_register (sp)
+	ldr	x29, [x29, #0]
+	add	sp, sp, SF_SIZE + 16
+	cfi_adjust_cfa_offset (- SF_SIZE - 16)
+	cfi_restore (x29)
+	cfi_restore (lr)
+
+	br	lr
+
+	cfi_endproc
+	.size _dl_runtime_profile_sve, .-_dl_runtime_profile_sve
+#endif
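
A note on the index arithmetic shared by both new entry points: the
sub/add/lsl/sub/lsr sequence converts &PLTGOT[n] into the byte offset
of the corresponding Elf64_Rela in .rela.plt, which is what
_dl_fixup/_dl_profile_fixup expect as reloc_arg.  A C sketch of the
same computation (the helper name is invented for illustration;
RELA_SIZE is 24, sizeof (Elf64_Rela), on AArch64):

    #include <stdint.h>

    /* Illustrative only: mirrors the trampoline arithmetic.  x1 enters
       as &PLTGOT[n] and ip0 as &PLTGOT[2]; GOT slots are 8 bytes and
       the first three GOT entries are reserved, so PLT slot n maps to
       relocation index n - 3.  */
    static inline uint64_t
    plt_reloc_offset (uint64_t pltgot_n, uint64_t pltgot_2)
    {
      uint64_t x1 = pltgot_n - pltgot_2;  /* 8 * (n - 2) bytes */
      x1 = x1 + (x1 << 1);                /* times 3: 24 * (n - 2) */
      x1 <<= 3;                           /* scale up by 8 */
      x1 -= (uint64_t) 24 << 3;           /* RELA_SIZE << 3: drop one entry */
      return x1 >> 3;                     /* 24 * (n - 3), a byte offset */
    }

Nothing in this computation depends on the vector length, so the same
sequence serves both the resolve and profile trampolines.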
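
Since the non-overlap claim in the re-save comment is load-bearing,
here is a small self-contained check of the frame geometry (assuming,
per the SVE STR addressing forms, that predicate stores scale their
immediate by PL = VL/8 and vector stores by VL):

    #include <assert.h>

    int
    main (void)
    {
      /* VL is the vector length in bytes: 16..256 in 16-byte steps.  */
      for (long vl = 16; vl <= 256; vl += 16)
        {
          long pl = vl / 8;
          /* p0-p3 occupy [x29 - 4*PL, x29); z0-z7 occupy
             [x29 - 9*VL, x29 - VL).  */
          assert (-vl <= -4 * pl);             /* z0 ends before p3 begins */
          assert (9 * vl >= 8 * vl + 4 * pl);  /* addvl #-9 covers both */
        }
      return 0;
    }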
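
Finally, to make the ??? comment concrete, one shape the extended
profiling record could take (purely hypothetical; nothing with these
names exists in glibc or in this patch):

    #include <stdint.h>

    /* Hypothetical: pointers into the trampoline's save area plus the
       active vector length, as the comment suggests.  AdvSIMD-only
       callers would present vl == 16 and no predicate area.  */
    typedef struct
    {
      void      *z_base;  /* base of Z0-Z7 save area, 8 * vl bytes */
      void      *p_base;  /* base of P0-P3 save area, 4 * vl / 8 bytes */
      uint16_t   vl;      /* vector length in bytes for this call */
    } La_aarch64_sve_regs;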