
[3/3] aarch64: Save and restore SVE registers in ld.so

Message ID 20180801222347.18903-4-rth@twiddle.net
State New
Series aarch64: Update ld.so for vector abi

Commit Message

Richard Henderson Aug. 1, 2018, 10:23 p.m. UTC
From: Richard Henderson <richard.henderson@linaro.org>


Add SVE versions of _dl_runtime_resolve and _dl_runtime_profile.
This honors the extended vector calling convention described in
ARM_100986_0000_00_en (SVEpcs 00bet1).

	* sysdeps/aarch64/dl-trampoline.S (_dl_runtime_resolve_sve): New.
	(_dl_runtime_profile_sve): New.
	* sysdeps/aarch64/dl-machine.h (elf_machine_runtime_setup): Use the
	new routines if HWCAP_SVE is set.
---
 sysdeps/aarch64/dl-machine.h    |  13 +-
 sysdeps/aarch64/dl-trampoline.S | 343 ++++++++++++++++++++++++++++++++
 2 files changed, 353 insertions(+), 3 deletions(-)

-- 
2.17.1

Comments

Florian Weimer Aug. 2, 2018, 7:29 a.m. UTC | #1
On 08/02/2018 12:23 AM, rth@twiddle.net wrote:
> +/*
> + * For functions conforming to the procedure call standard as
> + * amended for SVE support (ARM_100986_0000_00_en (SVEpcs 00bet1)),
> + * we must save the entire contents of Z0-Z7 as well as P0-P3.
> + */

What's the worst-case additional stack usage due to this change?

Thanks,
Florian
Richard Henderson Aug. 2, 2018, 1:05 p.m. UTC | #2
On 08/02/2018 03:29 AM, Florian Weimer wrote:
> On 08/02/2018 12:23 AM, rth@twiddle.net wrote:
>> +/*
>> + * For functions conforming to the procedure call standard as
>> + * amended for SVE support (ARM_100986_0000_00_en (SVEpcs 00bet1)),
>> + * we must save the entire contents of Z0-Z7 as well as P0-P3.
>> + */
>
> What's the worst-case additional stack usage due to this change?

The current architectural maximum vector size is 256 bytes,
so 2176 bytes total.
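For reference, the arithmetic behind that total, as a quick sketch (the 256-byte maximum VL and the set of saved registers are taken from the patch; predicate registers are VL/8 bytes each):

```python
# Worst-case save area for _dl_runtime_resolve_sve at the current
# architectural maximum SVE vector length (VL) of 256 bytes.
VL = 256                  # bytes per Z register (2048-bit vectors)
z_bytes = 8 * VL          # Z0-Z7 saved in full
p_bytes = 4 * (VL // 8)   # P0-P3; a predicate register is VL/8 bytes
print(z_bytes + p_bytes)  # -> 2176
```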


r~
Szabolcs Nagy Aug. 2, 2018, 1:32 p.m. UTC | #3
On 02/08/18 14:05, Richard Henderson wrote:
> On 08/02/2018 03:29 AM, Florian Weimer wrote:
>> On 08/02/2018 12:23 AM, rth@twiddle.net wrote:
>>> +/*
>>> + * For functions conforming to the procedure call standard as
>>> + * amended for SVE support (ARM_100986_0000_00_en (SVEpcs 00bet1)),
>>> + * we must save the entire contents of Z0-Z7 as well as P0-P3.
>>> + */
>>
>> What's the worst-case additional stack usage due to this change?
>
> The current architectural maximum vector size is 256 bytes,
> so 2176 bytes total.

that limit can increase in the future and saving 8 z regs is not enough,
you need to save all 32 if the resolver code may touch the 32 fp regs.
(so with 256byte regs it's 8k + 128bytes for 4 pregs)
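In the same terms as above, a full save of all 32 Z registers at the current maximum VL works out to (a sketch of the figures quoted; future architecture revisions could raise VL further):

```python
# Save area if the resolver must preserve every vector register,
# at the current architectural maximum VL of 256 bytes.
VL = 256
z_bytes = 32 * VL         # all of Z0-Z31
p_bytes = 4 * (VL // 8)   # P0-P3
print(z_bytes, p_bytes)   # -> 8192 128, i.e. "8k + 128 bytes"
```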
Carlos O'Donell Aug. 2, 2018, 2:18 p.m. UTC | #4
On 08/02/2018 09:32 AM, Szabolcs Nagy wrote:
> On 02/08/18 14:05, Richard Henderson wrote:
>> On 08/02/2018 03:29 AM, Florian Weimer wrote:
>>> On 08/02/2018 12:23 AM, rth@twiddle.net wrote:
>>>> +/*
>>>> + * For functions conforming to the procedure call standard as
>>>> + * amended for SVE support (ARM_100986_0000_00_en (SVEpcs 00bet1)),
>>>> + * we must save the entire contents of Z0-Z7 as well as P0-P3.
>>>> + */
>>>
>>> What's the worst-case additional stack usage due to this change?
>>
>> The current architectural maximum vector size is 256 bytes,
>> so 2176 bytes total.
>
> that limit can increase in the future and saving 8 z regs is not enough,
> you need to save all 32 if the resolver code may touch the 32 fp regs.
> (so with 256byte regs it's 8k + 128bytes for 4 pregs)

AArch64 has a PTHREAD_STACK_MIN of 128KiB, so this should be more than enough
to save and restore SVE registers for some simple function calling.

-- 
Cheers,
Carlos.
Florian Weimer Aug. 2, 2018, 2:25 p.m. UTC | #5
On 08/02/2018 04:18 PM, Carlos O'Donell wrote:
> On 08/02/2018 09:32 AM, Szabolcs Nagy wrote:
>> On 02/08/18 14:05, Richard Henderson wrote:
>>> On 08/02/2018 03:29 AM, Florian Weimer wrote:
>>>> On 08/02/2018 12:23 AM, rth@twiddle.net wrote:
>>>>> +/*
>>>>> + * For functions conforming to the procedure call standard as
>>>>> + * amended for SVE support (ARM_100986_0000_00_en (SVEpcs 00bet1)),
>>>>> + * we must save the entire contents of Z0-Z7 as well as P0-P3.
>>>>> + */
>>>>
>>>> What's the worst-case additional stack usage due to this change?
>>>
>>> The current architectural maximum vector size is 256 bytes,
>>> so 2176 bytes total.
>>
>> that limit can increase in the future and saving 8 z regs is not enough,
>> you need to save all 32 if the resolver code may touch the 32 fp regs.
>> (so with 256byte regs it's 8k + 128bytes for 4 pregs)
>
> AArch64 has a PTHREAD_STACK_MIN of 128KiB, so this should be more than enough
> to save and restore SVE registers for some simple function calling.

On the other hand, there is also this:

/* Minimum stack size for a signal handler.  */
#define MINSIGSTKSZ     5120

/* System default stack size.  */
#define SIGSTKSZ        16384

Since the kernel will also push >8KiB on the signal handler stack, 
SIGSTKSZ is no longer sufficient.
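A sketch of the headroom problem, taking the ">8KiB" figure above as a lower bound on the kernel's SVE signal frame:

```python
# Current AArch64 values quoted above.
MINSIGSTKSZ = 5120
SIGSTKSZ = 16384
sve_frame = 8 * 1024   # lower bound on the kernel signal frame with SVE state

# The signal frame alone already overflows MINSIGSTKSZ...
print(sve_frame > MINSIGSTKSZ)    # -> True
# ...and leaves at most this much of SIGSTKSZ for the handler itself.
print(SIGSTKSZ - sve_frame)       # -> 8192
```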

Thanks,
Florian
Carlos O'Donell Aug. 2, 2018, 2:31 p.m. UTC | #6
On 08/02/2018 10:25 AM, Florian Weimer wrote:
> On 08/02/2018 04:18 PM, Carlos O'Donell wrote:
>> On 08/02/2018 09:32 AM, Szabolcs Nagy wrote:
>>> On 02/08/18 14:05, Richard Henderson wrote:
>>>> On 08/02/2018 03:29 AM, Florian Weimer wrote:
>>>>> On 08/02/2018 12:23 AM, rth@twiddle.net wrote:
>>>>>> +/*
>>>>>> + * For functions conforming to the procedure call standard as
>>>>>> + * amended for SVE support (ARM_100986_0000_00_en (SVEpcs 00bet1)),
>>>>>> + * we must save the entire contents of Z0-Z7 as well as P0-P3.
>>>>>> + */
>>>>>
>>>>> What's the worst-case additional stack usage due to this change?
>>>>
>>>> The current architectural maximum vector size is 256 bytes,
>>>> so 2176 bytes total.
>>>
>>> that limit can increase in the future and saving 8 z regs is not enough,
>>> you need to save all 32 if the resolver code may touch the 32 fp regs.
>>> (so with 256byte regs it's 8k + 128bytes for 4 pregs)
>>
>> AArch64 has a PTHREAD_STACK_MIN of 128KiB, so this should be more than enough
>> to save and restore SVE registers for some simple function calling.
>
> On the other hand, there is also this:
>
> /* Minimum stack size for a signal handler.  */
> #define MINSIGSTKSZ     5120
>
> /* System default stack size.  */
> #define SIGSTKSZ        16384
>
> Since the kernel will also push >8KiB on the signal handler stack,
> SIGSTKSZ is no longer sufficient.

Absolutely correct.

Clearly old binaries won't work, and you have to bump these values and then
draw a line in the sand for SVE working, or not working.

We could bump SIGSTKSZ and MINSIGSTKSZ, and then use a flag day ABI change
with object style markup like x86 did for Intel CET.

You markup all of the objects, check the markup at runtime, and enable or
disable the use of SVE at runtime if you have a mixed set of objects that
can't possibly work.

-- 
Cheers,
Carlos.

Patch

diff --git a/sysdeps/aarch64/dl-machine.h b/sysdeps/aarch64/dl-machine.h
index 4935aa7c54..ea7c5c71d5 100644
--- a/sysdeps/aarch64/dl-machine.h
+++ b/sysdeps/aarch64/dl-machine.h
@@ -69,6 +69,9 @@  elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
       ElfW(Addr) *got;
       extern void _dl_runtime_resolve (ElfW(Word));
       extern void _dl_runtime_profile (ElfW(Word));
+      extern void _dl_runtime_resolve_sve (ElfW(Word));
+      extern void _dl_runtime_profile_sve (ElfW(Word));
+      unsigned has_sve = GLRO(dl_hwcap) & HWCAP_SVE;
 
       got = (ElfW(Addr) *) D_PTR (l, l_info[DT_PLTGOT]);
       if (got[1])
@@ -83,9 +86,11 @@  elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 	 to intercept the calls to collect information.  In this case we
 	 don't store the address in the GOT so that all future calls also
 	 end in this function.  */
-      if ( profile)
+      if (profile)
 	{
-	   got[2] = (ElfW(Addr)) &_dl_runtime_profile;
+	  got[2] = (has_sve
+		    ? (ElfW(Addr)) &_dl_runtime_profile_sve
+		    : (ElfW(Addr)) &_dl_runtime_profile);
 
 	  if (GLRO(dl_profile) != NULL
 	      && _dl_name_match_p (GLRO(dl_profile), l))
@@ -98,7 +103,9 @@  elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 	  /* This function will get called to fix up the GOT entry
 	     indicated by the offset on the stack, and then jump to
 	     the resolved address.  */
-	  got[2] = (ElfW(Addr)) &_dl_runtime_resolve;
+	  got[2] = (has_sve
+		    ? (ElfW(Addr)) &_dl_runtime_resolve_sve
+		    : (ElfW(Addr)) &_dl_runtime_resolve);
 	}
     }
 
diff --git a/sysdeps/aarch64/dl-trampoline.S b/sysdeps/aarch64/dl-trampoline.S
index 67a7c1b207..e23e5f1aad 100644
--- a/sysdeps/aarch64/dl-trampoline.S
+++ b/sysdeps/aarch64/dl-trampoline.S
@@ -280,3 +280,346 @@  _dl_runtime_profile:
 	cfi_endproc
 	.size _dl_runtime_profile, .-_dl_runtime_profile
 #endif
+
+/*
+ * For functions conforming to the procedure call standard as
+ * amended for SVE support (ARM_100986_0000_00_en (SVEpcs 00bet1)),
+ * we must save the entire contents of Z0-Z7 as well as P0-P3.
+ */
+        .arch   armv8-a+sve
+
+	.globl _dl_runtime_resolve_sve
+	.type _dl_runtime_resolve_sve, #function
+	.align 2
+_dl_runtime_resolve_sve:
+	/* On AArch64 we get called with:
+	   ip0		&PLTGOT[2]
+	   ip1		temp(dl resolver entry point)
+	   [sp, #8]	lr
+	   [sp, #0]	&PLTGOT[n]
+	 */
+	cfi_startproc
+	cfi_adjust_cfa_offset(16)	/* Incorporate PLT */
+	cfi_rel_offset (lr, 8)
+
+	/* Save arguments.  */
+	stp	x29, x8, [sp, #-80]!
+	cfi_adjust_cfa_offset (80)
+	cfi_rel_offset (x29, 0)
+	mov	x29, sp
+	cfi_def_cfa_register (x29)
+
+	stp	x6, x7, [sp,  #16]
+	stp	x4, x5, [sp,  #32]
+	stp	x2, x3, [sp,  #48]
+	stp	x0, x1, [sp,  #64]
+
+	/* Allocate space for, and store, Z[0-7].  */
+	addvl	sp, sp, #-8
+	str	z0, [sp, #0, mul vl]
+	str	z1, [sp, #1, mul vl]
+	str	z2, [sp, #2, mul vl]
+	str	z3, [sp, #3, mul vl]
+	str	z4, [sp, #4, mul vl]
+	str	z5, [sp, #5, mul vl]
+	str	z6, [sp, #6, mul vl]
+	str	z7, [sp, #7, mul vl]
+
+	/* Allocate space for, and store, P[0-3].  */
+	addpl	sp, sp, #-4
+	str	p0, [sp, #0, mul vl]
+	str	p1, [sp, #1, mul vl]
+	str	p2, [sp, #2, mul vl]
+	str	p3, [sp, #3, mul vl]
+
+	/* Get pointer to linker struct.  */
+	ldr	PTR_REG (0), [ip0, #-PTR_SIZE]
+
+	/* Prepare to call _dl_fixup().  */
+	ldr	x1, [x29, 80]		/* Recover &PLTGOT[n] */
+
+	sub     x1, x1, ip0
+	add     x1, x1, x1, lsl #1
+	lsl     x1, x1, #3
+	sub     x1, x1, #(RELA_SIZE<<3)
+	lsr     x1, x1, #3
+
+	/* Call fixup routine.  */
+	bl	_dl_fixup
+
+	/* Save the return.  */
+	mov	ip0, x0
+
+	/* Get arguments and return address back.  */
+	ldr	p0, [sp, #0, mul vl]
+	ldr	p1, [sp, #1, mul vl]
+	ldr	p2, [sp, #2, mul vl]
+	ldr	p3, [sp, #3, mul vl]
+	addpl	sp, sp, #4
+
+	ldr	z0, [sp, #0, mul vl]
+	ldr	z1, [sp, #1, mul vl]
+	ldr	z2, [sp, #2, mul vl]
+	ldr	z3, [sp, #3, mul vl]
+	ldr	z4, [sp, #4, mul vl]
+	ldr	z5, [sp, #5, mul vl]
+	ldr	z6, [sp, #6, mul vl]
+	ldr	z7, [sp, #7, mul vl]
+	addvl	sp, sp, #8
+
+	ldr	lr, [sp, #88]
+	ldp	x0, x1, [sp, #64]
+	ldp	x2, x3, [sp, #48]
+	ldp	x4, x5, [sp, #32]
+	ldp	x6, x7, [sp, #16]
+	ldp	x29, x8, [sp], #96
+	cfi_def_cfa (sp, 0)
+	cfi_restore (lr)
+	cfi_restore (x29)
+
+	/* Jump to the newly found address.  */
+	br	ip0
+
+	cfi_endproc
+	.size _dl_runtime_resolve_sve, .-_dl_runtime_resolve_sve
+
+#ifndef PROF
+	.globl _dl_runtime_profile_sve
+	.type _dl_runtime_profile_sve, #function
+	.align 2
+_dl_runtime_profile_sve:
+	/* On AArch64 we get called with:
+	   ip0		&PLTGOT[2]
+	   ip1		temp(dl resolver entry point)
+	   [sp, #8]	lr
+	   [sp, #0]	&PLTGOT[n]
+
+	   Stack frame layout:
+	         [x29,  #...] lr
+	         [x29,  #...] &PLTGOT[n]
+	         [x29,   #96] La_aarch64_regs
+	         [x29,   #48] La_aarch64_retval
+	         [x29,   #40] frame size return from pltenter
+	         [x29,   #32] dl_profile_call saved x1
+	         [x29,   #24] dl_profile_call saved x0
+	         [x29,   #16] t1
+	         [x29,    #0] x29, lr       <- x29
+	   [x29, #-1, mul vl] full p[0-3]
+	   [x29, #-2, mul vl] full z[0-7]   <- sp
+
+	   ??? Extending the profiling hook for full SVE register export
+	   is tricky given the variable register size.  Perhaps the new
+	   La_aarch64_regs should contain pointers to Z0 and P0, and
+	   the current VL, and one infers the addresses from there.
+
+	   This one new form could be used for all, with AdvSIMD
+	   devolving into VL=16 with no predicate registers.
+
+	   In the meantime, this function simply saves the contents of
+	   the SVE registers, but only exposes the AdvSIMD portion to
+	   the profile hooks.
+	 */
+
+	cfi_startproc
+	cfi_adjust_cfa_offset(16)	/* Incorporate PLT */
+	cfi_rel_offset (lr, 8)
+
+	stp	x29, x8, [sp, #-SF_SIZE]!
+	cfi_adjust_cfa_offset (SF_SIZE)
+	cfi_rel_offset (x29, 0)
+	mov	x29, sp
+	cfi_def_cfa_register (x29)
+
+	/* Save La_aarch64_regs.  */
+	stp	x0, x1, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*0]
+	stp	x2, x3, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*1]
+	stp	x4, x5, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*2]
+	stp	x6, x7, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*3]
+	stp	d0, d1, [x29, #OFFSET_RG + DL_OFFSET_RG_D0 + 16*0]
+	stp	d2, d3, [x29, #OFFSET_RG + DL_OFFSET_RG_D0 + 16*1]
+	stp	d4, d5, [x29, #OFFSET_RG + DL_OFFSET_RG_D0 + 16*2]
+	stp	d6, d7, [x29, #OFFSET_RG + DL_OFFSET_RG_D0 + 16*3]
+
+	/* Re-save the full contents of the vector arguments.
+
+	   Note that PL = VL/8, so we can save all 4 predicates
+	   in (less than) the space of one vector; this minimizes
+	   the number of stack adjustments required, and gives a
+	   predictable place for each register.
+
+	   Despite the unfortunate assembler mnemonics, the vector
+	   stores do not overlap the preceding predicate stores.  */
+	addvl	sp, sp, #-9
+
+	str	p0, [x29, #-1, mul vl]
+	str	p1, [x29, #-2, mul vl]
+	str	p2, [x29, #-3, mul vl]
+	str	p3, [x29, #-4, mul vl]
+
+	str	z0, [x29, #-2, mul vl]
+	str	z1, [x29, #-3, mul vl]
+	str	z2, [x29, #-4, mul vl]
+	str	z3, [x29, #-5, mul vl]
+	str	z4, [x29, #-6, mul vl]
+	str	z5, [x29, #-7, mul vl]
+	str	z6, [x29, #-8, mul vl]
+	str	z7, [x29, #-9, mul vl]
+
+	add     x0, x29, #SF_SIZE + 16
+	ldr	x1, [x29, #OFFSET_LR]
+	stp	x0, x1, [x29, #OFFSET_RG + DL_OFFSET_RG_SP]
+
+	/* Get pointer to linker struct.  */
+	ldr	PTR_REG (0), [ip0, #-PTR_SIZE]
+
+	/* Prepare to call _dl_profile_fixup().  */
+	ldr	x1, [x29, OFFSET_PLTGOTN]	/* Recover &PLTGOT[n] */
+
+	sub     x1, x1, ip0
+	add     x1, x1, x1, lsl #1
+	lsl     x1, x1, #3
+	sub     x1, x1, #(RELA_SIZE<<3)
+	lsr     x1, x1, #3
+
+	stp	x0, x1, [x29, #OFFSET_SAVED_CALL_X0]
+
+	/* Set up extra args for _dl_profile_fixup */
+	ldr	x2, [x29, #OFFSET_LR]		/* load saved LR */
+	add	x3, x29, #OFFSET_RG		/* address of La_aarch64_reg */
+	add	x4, x29, #OFFSET_FS		/* address of framesize */
+	bl	_dl_profile_fixup
+
+	ldr	ip0l, [x29, #OFFSET_FS]		/* framesize == 0 */
+	cmp	ip0l, #0
+	bge	1f
+	cfi_remember_state
+
+	/* Save the return.  */
+	mov	ip0, x0
+
+	/* Get arguments and return address back.  */
+	ldr	p0, [x29, #-1, mul vl]
+	ldr	p1, [x29, #-2, mul vl]
+	ldr	p2, [x29, #-3, mul vl]
+	ldr	p3, [x29, #-4, mul vl]
+
+	ldr	z0, [x29, #-2, mul vl]
+	ldr	z1, [x29, #-3, mul vl]
+	ldr	z2, [x29, #-4, mul vl]
+	ldr	z3, [x29, #-5, mul vl]
+	ldr	z4, [x29, #-6, mul vl]
+	ldr	z5, [x29, #-7, mul vl]
+	ldr	z6, [x29, #-8, mul vl]
+	ldr	z7, [x29, #-9, mul vl]
+
+	ldr	lr, [x29, #OFFSET_LR]
+	ldp	x0, x1, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*0]
+	ldp	x2, x3, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*1]
+	ldp	x4, x5, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*2]
+	ldp	x6, x7, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*3]
+
+	mov	sp, x29
+	ldp	x29, x8, [sp], SF_SIZE + 16
+	cfi_def_cfa (sp, 0)
+	cfi_restore(x29)
+	cfi_restore(lr)
+
+	/* Jump to the newly found address.  */
+	br	ip0
+
+	cfi_restore_state
+	/* The new frame size is in ip0, extended for pointer size.  */
+1:	sub	x1, sp, ip0
+	and	sp, x1, #0xfffffffffffffff0
+
+	str	x0, [x29, #OFFSET_T1]
+
+	mov	x0, sp
+	add	x1, x29, #SF_SIZE + 16
+	mov	x2, ip0
+	bl	memcpy
+
+	ldr	ip0, [x29, #OFFSET_T1]
+
+	/* Reload the full arguments.  */
+	ldp	x0, x1, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*0]
+	ldp	x2, x3, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*1]
+	ldp	x4, x5, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*2]
+	ldp	x6, x7, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*3]
+	ldr	x8, [x29, 8]
+
+	ldr	p0, [x29, #-1, mul vl]
+	ldr	p1, [x29, #-2, mul vl]
+	ldr	p2, [x29, #-3, mul vl]
+	ldr	p3, [x29, #-4, mul vl]
+
+	ldr	z0, [x29, #-2, mul vl]
+	ldr	z1, [x29, #-3, mul vl]
+	ldr	z2, [x29, #-4, mul vl]
+	ldr	z3, [x29, #-5, mul vl]
+	ldr	z4, [x29, #-6, mul vl]
+	ldr	z5, [x29, #-7, mul vl]
+	ldr	z6, [x29, #-8, mul vl]
+	ldr	z7, [x29, #-9, mul vl]
+
+	/* Call the function.  */
+	blr	ip0
+
+	/* Store La_aarch64_retval, as if for the non-vector ABI.  */
+	stp	x0, x1, [x29, #OFFSET_RV + DL_OFFSET_RV_X0]
+	stp	d0, d1, [x29, #OFFSET_RV + DL_OFFSET_RV_D0 + 16*0]
+	stp	d2, d3, [x29, #OFFSET_RV + DL_OFFSET_RV_D0 + 16*1]
+
+	/* Store the full contents of the vector return.  */
+	str	p0, [x29, #-1, mul vl]
+	str	p1, [x29, #-2, mul vl]
+	str	p2, [x29, #-3, mul vl]
+	str	p3, [x29, #-4, mul vl]
+
+	str	z0, [x29, #-2, mul vl]
+	str	z1, [x29, #-3, mul vl]
+	str	z2, [x29, #-4, mul vl]
+	str	z3, [x29, #-5, mul vl]
+	str	z4, [x29, #-6, mul vl]
+	str	z5, [x29, #-7, mul vl]
+	str	z6, [x29, #-8, mul vl]
+	str	z7, [x29, #-9, mul vl]
+
+	/* Setup call to pltexit  */
+	ldp	x0, x1, [x29, #OFFSET_SAVED_CALL_X0]
+	add	x2, x29, #OFFSET_RG
+	add	x3, x29, #OFFSET_RV
+	bl	_dl_call_pltexit
+
+	/* Reload the full return value.  */
+	ldp	x0, x1, [x29, #OFFSET_RV + DL_OFFSET_RV_X0]
+
+	ldr	p0, [x29, #-1, mul vl]
+	ldr	p1, [x29, #-2, mul vl]
+	ldr	p2, [x29, #-3, mul vl]
+	ldr	p3, [x29, #-4, mul vl]
+
+	ldr	z0, [x29, #-2, mul vl]
+	ldr	z1, [x29, #-3, mul vl]
+	ldr	z2, [x29, #-4, mul vl]
+	ldr	z3, [x29, #-5, mul vl]
+	ldr	z4, [x29, #-6, mul vl]
+	ldr	z5, [x29, #-7, mul vl]
+	ldr	z6, [x29, #-8, mul vl]
+	ldr	z7, [x29, #-9, mul vl]
+
+	/* LR from within La_aarch64_reg */
+	ldr	lr, [x29, #OFFSET_RG + DL_OFFSET_RG_LR]
+	mov	sp, x29
+	cfi_def_cfa_register (sp)
+	ldr	x29, [x29, #0]
+	add	sp, sp, SF_SIZE + 16
+	cfi_adjust_cfa_offset (- SF_SIZE - 16)
+	cfi_restore(x29)
+	cfi_restore(lr)
+
+	br	lr
+
+	cfi_endproc
+	.size _dl_runtime_profile_sve, .-_dl_runtime_profile_sve
+#endif