
crypto: arm64 - revert NEON yield for fast AEAD implementations

Message ID 20180729145230.6738-1-ard.biesheuvel@linaro.org
State Accepted
Commit f10dc56c64bb662822475304508c1ce99f194e70
Series: crypto: arm64 - revert NEON yield for fast AEAD implementations

Commit Message

Ard Biesheuvel July 29, 2018, 2:52 p.m. UTC
As it turns out, checking the TIF_NEED_RESCHED flag after each
iteration results in a significant performance regression (~10%)
when running fast algorithms (i.e., ones that use special instructions
and operate in the < 4 cycles per byte range) on in-order cores with
comparatively slow memory accesses such as the Cortex-A53.
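
For reference, a rough C-level sketch of what that per-iteration check
amounts to is shown below. This is only an illustration: the real code is
the if_will_cond_yield_neon / do_cond_yield_neon assembler macros removed
in the diff, and neon_walk() / process_block() are hypothetical stand-ins
for the glue code and the fused NEON transform.

static void neon_walk(int blocks)
{
	kernel_neon_begin();
	while (blocks--) {
		process_block();		/* one 16-byte block */
		if (blocks && test_thread_flag(TIF_NEED_RESCHED)) {
			/* spill live NEON state to memory ... */
			kernel_neon_end();	/* re-enables preemption */
			kernel_neon_begin();	/* ... reload and carry on */
		}
	}
	kernel_neon_end();
}

The cost comes from the check itself (an extra memory access per 16-byte
block) rather than from actually yielding, which is why it hurts most on
in-order cores with comparatively slow memory such as the A53.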

Given the speed of these ciphers, and the fact that the page-based
nature of the AEAD scatterwalk API guarantees that the core NEON
transform is never invoked with more than a single page's worth of
input, we can estimate the worst-case duration of any resulting
scheduling blackout: on a 1 GHz Cortex-A53 running with 64k pages,
processing a page's worth of input at 4 cycles per byte results in
a delay of ~250 us, which is a reasonable upper bound.
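
Spelling that out, assuming a full 64 KiB page at the quoted (worst case)
4 cycles per byte on a 1 GHz core:

    \frac{2^{16}\ \text{bytes} \times 4\ \text{cycles/byte}}{10^{9}\ \text{cycles/s}} \approx 262\ \mu\text{s}

i.e. roughly the ~250 us figure above; inputs processed at fewer than
4 cycles per byte finish correspondingly sooner.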

So let's remove the yield checks from the fused AES-CCM and AES-GCM
routines entirely.

This reverts commit 7b67ae4d5ce8e2f912377f5fbccb95811a92097f and
partially reverts commit 7c50136a8aba8784f07fb66a950cc61a7f3d2ee3.

Fixes: 7c50136a8aba ("crypto: arm64/aes-ghash - yield NEON after every ...")
Fixes: 7b67ae4d5ce8 ("crypto: arm64/aes-ccm - yield NEON after every ...")
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

---
This supersedes my series 'crypto/arm64: reduce impact of NEON yield checks'
sent out on the 24th of July.

Given the significant performance regression, we may want to treat this as
a fix (the patches in question went into v4.18).

This patch applies on top of my patch 'crypto/arm64: aes-ce-gcm - add missing
kernel_neon_begin/end pair', sent out on the 27th of July, which fixes a
data corruption bug and should also be applied as a fix.

 arch/arm64/crypto/aes-ce-ccm-core.S | 150 +++++++-------------
 arch/arm64/crypto/ghash-ce-core.S   |  76 ++++------
 2 files changed, 80 insertions(+), 146 deletions(-)

-- 
2.18.0

Comments

Herbert Xu Aug. 3, 2018, 6:14 a.m. UTC | #1
On Sun, Jul 29, 2018 at 04:52:30PM +0200, Ard Biesheuvel wrote:
> As it turns out, checking the TIF_NEED_RESCHED flag after each
> iteration results in a significant performance regression (~10%)
> when running fast algorithms (i.e., ones that use special instructions
> and operate in the < 4 cycles per byte range) on in-order cores with
> comparatively slow memory accesses such as the Cortex-A53.
>
> Given the speed of these ciphers, and the fact that the page-based
> nature of the AEAD scatterwalk API guarantees that the core NEON
> transform is never invoked with more than a single page's worth of
> input, we can estimate the worst-case duration of any resulting
> scheduling blackout: on a 1 GHz Cortex-A53 running with 64k pages,
> processing a page's worth of input at 4 cycles per byte results in
> a delay of ~250 us, which is a reasonable upper bound.
>
> So let's remove the yield checks from the fused AES-CCM and AES-GCM
> routines entirely.
>
> This reverts commit 7b67ae4d5ce8e2f912377f5fbccb95811a92097f and
> partially reverts commit 7c50136a8aba8784f07fb66a950cc61a7f3d2ee3.
>
> Fixes: 7c50136a8aba ("crypto: arm64/aes-ghash - yield NEON after every ...")
> Fixes: 7b67ae4d5ce8 ("crypto: arm64/aes-ccm - yield NEON after every ...")
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> ---
> This supersedes my series 'crypto/arm64: reduce impact of NEON yield checks'
> sent out on the 24th of July.
>
> Given the significant performance regression, we may want to treat this as
> a fix (the patches in question went into v4.18).
>
> This patch applies on top of my patch 'crypto/arm64: aes-ce-gcm - add missing
> kernel_neon_begin/end pair', sent out on the 27th of July, which fixes a
> data corruption bug and should also be applied as a fix.

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>


Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
Ard Biesheuvel Aug. 3, 2018, 7:10 a.m. UTC | #2
On 3 August 2018 at 08:14, Herbert Xu <herbert@gondor.apana.org.au> wrote:
> On Sun, Jul 29, 2018 at 04:52:30PM +0200, Ard Biesheuvel wrote:
>> As it turns out, checking the TIF_NEED_RESCHED flag after each
>> iteration results in a significant performance regression (~10%)
>> when running fast algorithms (i.e., ones that use special instructions
>> and operate in the < 4 cycles per byte range) on in-order cores with
>> comparatively slow memory accesses such as the Cortex-A53.
>>
>> Given the speed of these ciphers, and the fact that the page-based
>> nature of the AEAD scatterwalk API guarantees that the core NEON
>> transform is never invoked with more than a single page's worth of
>> input, we can estimate the worst-case duration of any resulting
>> scheduling blackout: on a 1 GHz Cortex-A53 running with 64k pages,
>> processing a page's worth of input at 4 cycles per byte results in
>> a delay of ~250 us, which is a reasonable upper bound.
>>
>> So let's remove the yield checks from the fused AES-CCM and AES-GCM
>> routines entirely.
>>
>> This reverts commit 7b67ae4d5ce8e2f912377f5fbccb95811a92097f and
>> partially reverts commit 7c50136a8aba8784f07fb66a950cc61a7f3d2ee3.
>>
>> Fixes: 7c50136a8aba ("crypto: arm64/aes-ghash - yield NEON after every ...")
>> Fixes: 7b67ae4d5ce8 ("crypto: arm64/aes-ccm - yield NEON after every ...")
>> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
>> ---
>> This supersedes my series 'crypto/arm64: reduce impact of NEON yield checks'
>> sent out on the 24th of July.
>>
>> Given the significant performance regression, we may want to treat this as
>> a fix (the patches in question went into v4.18).
>>
>> This patch applies on top of my patch 'crypto/arm64: aes-ce-gcm - add missing
>> kernel_neon_begin/end pair', sent out on the 27th of July, which fixes a
>> data corruption bug and should also be applied as a fix.
>
> Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
>

Thanks Herbert.

But I think it's too late now to take this into v4.18. Could you
please queue this (and my other two pending arm64/aes-gcm patches, if
possible) for v4.19 instead?

Thanks,
Herbert Xu Aug. 3, 2018, 8:17 a.m. UTC | #3
On Fri, Aug 03, 2018 at 09:10:08AM +0200, Ard Biesheuvel wrote:
> But I think it's too late now to take this into v4.18. Could you
> please queue this (and my other two pending arm64/aes-gcm patches, if
> possible) for v4.19 instead?

OK I'll do that.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
Ard Biesheuvel Aug. 3, 2018, 8:47 a.m. UTC | #4
On 3 August 2018 at 10:17, Herbert Xu <herbert@gondor.apana.org.au> wrote:
> On Fri, Aug 03, 2018 at 09:10:08AM +0200, Ard Biesheuvel wrote:
>> But I think it's too late now to take this into v4.18. Could you
>> please queue this (and my other two pending arm64/aes-gcm patches, if
>> possible) for v4.19 instead?
>
> OK I'll do that.
>

Thanks.

But, actually, those two pending patches are a 3-piece series now ...
(v2 of 'crypto/arm64: aes-ce-gcm - switch to 2-way aggregation')

Thanks,
Ard.
Herbert Xu Aug. 3, 2018, 3:47 p.m. UTC | #5
On Sun, Jul 29, 2018 at 04:52:30PM +0200, Ard Biesheuvel wrote:
> As it turns out, checking the TIF_NEED_RESCHED flag after each
> iteration results in a significant performance regression (~10%)
> when running fast algorithms (i.e., ones that use special instructions
> and operate in the < 4 cycles per byte range) on in-order cores with
> comparatively slow memory accesses such as the Cortex-A53.
>
> Given the speed of these ciphers, and the fact that the page-based
> nature of the AEAD scatterwalk API guarantees that the core NEON
> transform is never invoked with more than a single page's worth of
> input, we can estimate the worst-case duration of any resulting
> scheduling blackout: on a 1 GHz Cortex-A53 running with 64k pages,
> processing a page's worth of input at 4 cycles per byte results in
> a delay of ~250 us, which is a reasonable upper bound.
>
> So let's remove the yield checks from the fused AES-CCM and AES-GCM
> routines entirely.
>
> This reverts commit 7b67ae4d5ce8e2f912377f5fbccb95811a92097f and
> partially reverts commit 7c50136a8aba8784f07fb66a950cc61a7f3d2ee3.
>
> Fixes: 7c50136a8aba ("crypto: arm64/aes-ghash - yield NEON after every ...")
> Fixes: 7b67ae4d5ce8 ("crypto: arm64/aes-ccm - yield NEON after every ...")
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

Patch applied.  Thanks.
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

Patch

diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S
index 88f5aef7934c..e3a375c4cb83 100644
--- a/arch/arm64/crypto/aes-ce-ccm-core.S
+++ b/arch/arm64/crypto/aes-ce-ccm-core.S
@@ -19,33 +19,24 @@ 
 	 *			     u32 *macp, u8 const rk[], u32 rounds);
 	 */
 ENTRY(ce_aes_ccm_auth_data)
-	frame_push	7
-
-	mov	x19, x0
-	mov	x20, x1
-	mov	x21, x2
-	mov	x22, x3
-	mov	x23, x4
-	mov	x24, x5
-
-	ldr	w25, [x22]			/* leftover from prev round? */
+	ldr	w8, [x3]			/* leftover from prev round? */
 	ld1	{v0.16b}, [x0]			/* load mac */
-	cbz	w25, 1f
-	sub	w25, w25, #16
+	cbz	w8, 1f
+	sub	w8, w8, #16
 	eor	v1.16b, v1.16b, v1.16b
-0:	ldrb	w7, [x20], #1			/* get 1 byte of input */
-	subs	w21, w21, #1
-	add	w25, w25, #1
+0:	ldrb	w7, [x1], #1			/* get 1 byte of input */
+	subs	w2, w2, #1
+	add	w8, w8, #1
 	ins	v1.b[0], w7
 	ext	v1.16b, v1.16b, v1.16b, #1	/* rotate in the input bytes */
 	beq	8f				/* out of input? */
-	cbnz	w25, 0b
+	cbnz	w8, 0b
 	eor	v0.16b, v0.16b, v1.16b
-1:	ld1	{v3.4s}, [x23]			/* load first round key */
-	prfm	pldl1strm, [x20]
-	cmp	w24, #12			/* which key size? */
-	add	x6, x23, #16
-	sub	w7, w24, #2			/* modified # of rounds */
+1:	ld1	{v3.4s}, [x4]			/* load first round key */
+	prfm	pldl1strm, [x1]
+	cmp	w5, #12				/* which key size? */
+	add	x6, x4, #16
+	sub	w7, w5, #2			/* modified # of rounds */
 	bmi	2f
 	bne	5f
 	mov	v5.16b, v3.16b
@@ -64,43 +55,33 @@  ENTRY(ce_aes_ccm_auth_data)
 	ld1	{v5.4s}, [x6], #16		/* load next round key */
 	bpl	3b
 	aese	v0.16b, v4.16b
-	subs	w21, w21, #16			/* last data? */
+	subs	w2, w2, #16			/* last data? */
 	eor	v0.16b, v0.16b, v5.16b		/* final round */
 	bmi	6f
-	ld1	{v1.16b}, [x20], #16		/* load next input block */
+	ld1	{v1.16b}, [x1], #16		/* load next input block */
 	eor	v0.16b, v0.16b, v1.16b		/* xor with mac */
-	beq	6f
-
-	if_will_cond_yield_neon
-	st1	{v0.16b}, [x19]			/* store mac */
-	do_cond_yield_neon
-	ld1	{v0.16b}, [x19]			/* reload mac */
-	endif_yield_neon
-
-	b	1b
-6:	st1	{v0.16b}, [x19]			/* store mac */
+	bne	1b
+6:	st1	{v0.16b}, [x0]			/* store mac */
 	beq	10f
-	adds	w21, w21, #16
+	adds	w2, w2, #16
 	beq	10f
-	mov	w25, w21
-7:	ldrb	w7, [x20], #1
+	mov	w8, w2
+7:	ldrb	w7, [x1], #1
 	umov	w6, v0.b[0]
 	eor	w6, w6, w7
-	strb	w6, [x19], #1
-	subs	w21, w21, #1
+	strb	w6, [x0], #1
+	subs	w2, w2, #1
 	beq	10f
 	ext	v0.16b, v0.16b, v0.16b, #1	/* rotate out the mac bytes */
 	b	7b
-8:	mov	w7, w25
-	add	w25, w25, #16
+8:	mov	w7, w8
+	add	w8, w8, #16
 9:	ext	v1.16b, v1.16b, v1.16b, #1
 	adds	w7, w7, #1
 	bne	9b
 	eor	v0.16b, v0.16b, v1.16b
-	st1	{v0.16b}, [x19]
-10:	str	w25, [x22]
-
-	frame_pop
+	st1	{v0.16b}, [x0]
+10:	str	w8, [x3]
 	ret
 ENDPROC(ce_aes_ccm_auth_data)
 
@@ -145,29 +126,19 @@  ENTRY(ce_aes_ccm_final)
 ENDPROC(ce_aes_ccm_final)
 
 	.macro	aes_ccm_do_crypt,enc
-	frame_push	8
-
-	mov	x19, x0
-	mov	x20, x1
-	mov	x21, x2
-	mov	x22, x3
-	mov	x23, x4
-	mov	x24, x5
-	mov	x25, x6
-
-	ldr	x26, [x25, #8]			/* load lower ctr */
-	ld1	{v0.16b}, [x24]			/* load mac */
-CPU_LE(	rev	x26, x26		)	/* keep swabbed ctr in reg */
+	ldr	x8, [x6, #8]			/* load lower ctr */
+	ld1	{v0.16b}, [x5]			/* load mac */
+CPU_LE(	rev	x8, x8			)	/* keep swabbed ctr in reg */
 0:	/* outer loop */
-	ld1	{v1.8b}, [x25]			/* load upper ctr */
-	prfm	pldl1strm, [x20]
-	add	x26, x26, #1
-	rev	x9, x26
-	cmp	w23, #12			/* which key size? */
-	sub	w7, w23, #2			/* get modified # of rounds */
+	ld1	{v1.8b}, [x6]			/* load upper ctr */
+	prfm	pldl1strm, [x1]
+	add	x8, x8, #1
+	rev	x9, x8
+	cmp	w4, #12				/* which key size? */
+	sub	w7, w4, #2			/* get modified # of rounds */
 	ins	v1.d[1], x9			/* no carry in lower ctr */
-	ld1	{v3.4s}, [x22]			/* load first round key */
-	add	x10, x22, #16
+	ld1	{v3.4s}, [x3]			/* load first round key */
+	add	x10, x3, #16
 	bmi	1f
 	bne	4f
 	mov	v5.16b, v3.16b
@@ -194,9 +165,9 @@  CPU_LE(	rev	x26, x26		)	/* keep swabbed ctr in reg */
 	bpl	2b
 	aese	v0.16b, v4.16b
 	aese	v1.16b, v4.16b
-	subs	w21, w21, #16
-	bmi	7f				/* partial block? */
-	ld1	{v2.16b}, [x20], #16		/* load next input block */
+	subs	w2, w2, #16
+	bmi	6f				/* partial block? */
+	ld1	{v2.16b}, [x1], #16		/* load next input block */
 	.if	\enc == 1
 	eor	v2.16b, v2.16b, v5.16b		/* final round enc+mac */
 	eor	v1.16b, v1.16b, v2.16b		/* xor with crypted ctr */
@@ -205,29 +176,18 @@  CPU_LE(	rev	x26, x26		)	/* keep swabbed ctr in reg */
 	eor	v1.16b, v2.16b, v5.16b		/* final round enc */
 	.endif
 	eor	v0.16b, v0.16b, v2.16b		/* xor mac with pt ^ rk[last] */
-	st1	{v1.16b}, [x19], #16		/* write output block */
-	beq	5f
-
-	if_will_cond_yield_neon
-	st1	{v0.16b}, [x24]			/* store mac */
-	do_cond_yield_neon
-	ld1	{v0.16b}, [x24]			/* reload mac */
-	endif_yield_neon
-
-	b	0b
-5:
-CPU_LE(	rev	x26, x26			)
-	st1	{v0.16b}, [x24]			/* store mac */
-	str	x26, [x25, #8]			/* store lsb end of ctr (BE) */
-
-6:	frame_pop
-	ret
-
-7:	eor	v0.16b, v0.16b, v5.16b		/* final round mac */
+	st1	{v1.16b}, [x0], #16		/* write output block */
+	bne	0b
+CPU_LE(	rev	x8, x8			)
+	st1	{v0.16b}, [x5]			/* store mac */
+	str	x8, [x6, #8]			/* store lsb end of ctr (BE) */
+5:	ret
+
+6:	eor	v0.16b, v0.16b, v5.16b		/* final round mac */
 	eor	v1.16b, v1.16b, v5.16b		/* final round enc */
-	st1	{v0.16b}, [x24]			/* store mac */
-	add	w21, w21, #16			/* process partial tail block */
-8:	ldrb	w9, [x20], #1			/* get 1 byte of input */
+	st1	{v0.16b}, [x5]			/* store mac */
+	add	w2, w2, #16			/* process partial tail block */
+7:	ldrb	w9, [x1], #1			/* get 1 byte of input */
 	umov	w6, v1.b[0]			/* get top crypted ctr byte */
 	umov	w7, v0.b[0]			/* get top mac byte */
 	.if	\enc == 1
@@ -237,13 +197,13 @@  CPU_LE(	rev	x26, x26			)
 	eor	w9, w9, w6
 	eor	w7, w7, w9
 	.endif
-	strb	w9, [x19], #1			/* store out byte */
-	strb	w7, [x24], #1			/* store mac byte */
-	subs	w21, w21, #1
-	beq	6b
+	strb	w9, [x0], #1			/* store out byte */
+	strb	w7, [x5], #1			/* store mac byte */
+	subs	w2, w2, #1
+	beq	5b
 	ext	v0.16b, v0.16b, v0.16b, #1	/* shift out mac byte */
 	ext	v1.16b, v1.16b, v1.16b, #1	/* shift out ctr byte */
-	b	8b
+	b	7b
 	.endm
 
 	/*
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
index dcffb9e77589..c723647b37db 100644
--- a/arch/arm64/crypto/ghash-ce-core.S
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -322,55 +322,41 @@  ENDPROC(pmull_ghash_update_p8)
 	.endm
 
 	.macro		pmull_gcm_do_crypt, enc
-	frame_push	10
+	ld1		{SHASH.2d}, [x4]
+	ld1		{XL.2d}, [x1]
+	ldr		x8, [x5, #8]			// load lower counter
 
-	mov		x19, x0
-	mov		x20, x1
-	mov		x21, x2
-	mov		x22, x3
-	mov		x23, x4
-	mov		x24, x5
-	mov		x25, x6
-	mov		x26, x7
-	.if		\enc == 1
-	ldr		x27, [sp, #96]			// first stacked arg
-	.endif
-
-	ldr		x28, [x24, #8]			// load lower counter
-CPU_LE(	rev		x28, x28	)
-
-0:	mov		x0, x25
-	load_round_keys	w26, x0
-	ld1		{SHASH.2d}, [x23]
-	ld1		{XL.2d}, [x20]
+	load_round_keys	w7, x6
 
 	movi		MASK.16b, #0xe1
 	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
+CPU_LE(	rev		x8, x8		)
 	shl		MASK.2d, MASK.2d, #57
 	eor		SHASH2.16b, SHASH2.16b, SHASH.16b
 
 	.if		\enc == 1
-	ld1		{KS.16b}, [x27]
+	ldr		x10, [sp]
+	ld1		{KS.16b}, [x10]
 	.endif
 
-1:	ld1		{CTR.8b}, [x24]			// load upper counter
-	ld1		{INP.16b}, [x22], #16
-	rev		x9, x28
-	add		x28, x28, #1
-	sub		w19, w19, #1
+0:	ld1		{CTR.8b}, [x5]			// load upper counter
+	ld1		{INP.16b}, [x3], #16
+	rev		x9, x8
+	add		x8, x8, #1
+	sub		w0, w0, #1
 	ins		CTR.d[1], x9			// set lower counter
 
 	.if		\enc == 1
 	eor		INP.16b, INP.16b, KS.16b	// encrypt input
-	st1		{INP.16b}, [x21], #16
+	st1		{INP.16b}, [x2], #16
 	.endif
 
 	rev64		T1.16b, INP.16b
 
-	cmp		w26, #12
-	b.ge		4f				// AES-192/256?
+	cmp		w7, #12
+	b.ge		2f				// AES-192/256?
 
-2:	enc_round	CTR, v21
+1:	enc_round	CTR, v21
 
 	ext		T2.16b, XL.16b, XL.16b, #8
 	ext		IN1.16b, T1.16b, T1.16b, #8
@@ -425,39 +411,27 @@  CPU_LE(	rev		x28, x28	)
 
 	.if		\enc == 0
 	eor		INP.16b, INP.16b, KS.16b
-	st1		{INP.16b}, [x21], #16
+	st1		{INP.16b}, [x2], #16
 	.endif
 
-	cbz		w19, 3f
+	cbnz		w0, 0b
 
-	if_will_cond_yield_neon
-	st1		{XL.2d}, [x20]
-	.if		\enc == 1
-	st1		{KS.16b}, [x27]
-	.endif
-	do_cond_yield_neon
-	b		0b
-	endif_yield_neon
+CPU_LE(	rev		x8, x8		)
+	st1		{XL.2d}, [x1]
+	str		x8, [x5, #8]			// store lower counter
 
-	b		1b
-
-3:	st1		{XL.2d}, [x20]
 	.if		\enc == 1
-	st1		{KS.16b}, [x27]
+	st1		{KS.16b}, [x10]
 	.endif
 
-CPU_LE(	rev		x28, x28	)
-	str		x28, [x24, #8]			// store lower counter
-
-	frame_pop
 	ret
 
-4:	b.eq		5f				// AES-192?
+2:	b.eq		3f				// AES-192?
 	enc_round	CTR, v17
 	enc_round	CTR, v18
-5:	enc_round	CTR, v19
+3:	enc_round	CTR, v19
 	enc_round	CTR, v20
-	b		2b
+	b		1b
 	.endm
 
 	/*