[09/10] crypto: arm64/aes-neon-blk - tweak performance for low end cores

Message ID	1484666557-31458-10-git-send-email-ard.biesheuvel@linaro.org
State	New
Headers	show Delivered-To: patch@linaro.org Received-SPF: pass (google.com: best guess record for domain of linux-crypto-owner@vger.kernel.org designates 209.132.180.67 as permitted sender) client-ip=209.132.180.67; From: Ard Biesheuvel <ard.biesheuvel@linaro.org> To: linux-crypto@vger.kernel.org, herbert@gondor.apana.org.au Cc: linux-arm-kernel@lists.infradead.org, Ard Biesheuvel <ard.biesheuvel@linaro.org> Subject: [PATCH 09/10] crypto: arm64/aes-neon-blk - tweak performance for low end cores Date: Tue, 17 Jan 2017 15:22:36 +0000 Message-Id: <1484666557-31458-10-git-send-email-ard.biesheuvel@linaro.org> In-Reply-To: <1484666557-31458-1-git-send-email-ard.biesheuvel@linaro.org> References: <1484666557-31458-1-git-send-email-ard.biesheuvel@linaro.org> Sender: linux-crypto-owner@vger.kernel.org Precedence: bulk
Series	crypto - AES for ARM/arm64 updates for v4.11 (round #2) \| expand [00/10] crypto - AES for ARM/arm64 updates for v4.11 (round #2) [01/10] crypto: arm64/aes-neon-bs - honour iv_out requirement in CTR mode [02/10] crypto: arm/aes-ce - remove cra_alignmask [03/10] crypto: arm/chacha20 - remove cra_alignmask [04/10] crypto: arm64/aes-ce-ccm - remove cra_alignmask [05/10] crypto: arm64/aes-blk - remove cra_alignmask [06/10] crypto: arm64/chacha20 - remove cra_alignmask [07/10] crypto: arm64/aes - avoid literals for cross-module symbol references [08/10] crypto: arm64/aes - performance tweak [09/10] crypto: arm64/aes-neon-blk - tweak performance for low end cores [10/10] crypto: arm64/aes - replace scalar fallback with plain NEON fallback

diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index 8ee1fb7aaa4f..055bc3f61138 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -409,5 +409,7 @@ static int __init aes_init(void) module_cpu_feature_match(AES, aes_init); #else module_init(aes_init); +EXPORT_SYMBOL(neon_aes_ecb_encrypt); +EXPORT_SYMBOL(neon_aes_cbc_encrypt); #endif module_exit(aes_exit); diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S index 85f07ead7c5c..67c68462bc20 100644 --- a/arch/arm64/crypto/aes-neon.S +++ b/arch/arm64/crypto/aes-neon.S @@ -1,7 +1,7 @@ /* * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON * - * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> + * Copyright (C) 2013 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -25,9 +25,9 @@ /* preload the entire Sbox */ .macro prepare, sbox, shiftrows, temp adr \temp, \sbox - movi v12.16b, #0x40 + movi v12.16b, #0x1b ldr q13, \shiftrows - movi v14.16b, #0x1b + ldr q14, .Lror32by8 ld1 {v16.16b-v19.16b}, [\temp], #64 ld1 {v20.16b-v23.16b}, [\temp], #64 ld1 {v24.16b-v27.16b}, [\temp], #64 @@ -50,37 +50,33 @@ /* apply SubBytes transformation using the the preloaded Sbox */ .macro sub_bytes, in - sub v9.16b, \in\().16b, v12.16b + sub v9.16b, \in\().16b, v15.16b tbl \in\().16b, {v16.16b-v19.16b}, \in\().16b - sub v10.16b, v9.16b, v12.16b + sub v10.16b, v9.16b, v15.16b tbx \in\().16b, {v20.16b-v23.16b}, v9.16b - sub v11.16b, v10.16b, v12.16b + sub v11.16b, v10.16b, v15.16b tbx \in\().16b, {v24.16b-v27.16b}, v10.16b tbx \in\().16b, {v28.16b-v31.16b}, v11.16b .endm /* apply MixColumns transformation */ - .macro mix_columns, in - mul_by_x v10.16b, \in\().16b, v9.16b, v14.16b - rev32 v8.8h, \in\().8h - eor \in\().16b, v10.16b, \in\().16b - shl v9.4s, v8.4s, #24 - shl v11.4s, \in\().4s, #24 - sri v9.4s, v8.4s, #8 - sri v11.4s, \in\().4s, #8 - eor v9.16b, v9.16b, v8.16b - eor v10.16b, v10.16b, v9.16b - eor \in\().16b, v10.16b, v11.16b - .endm - + .macro mix_columns, in, enc + .if \enc == 0 /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */ - .macro inv_mix_columns, in - mul_by_x v11.16b, \in\().16b, v10.16b, v14.16b - mul_by_x v11.16b, v11.16b, v10.16b, v14.16b + mul_by_x v11.16b, \in\().16b, v10.16b, v12.16b + mul_by_x v11.16b, v11.16b, v10.16b, v12.16b eor \in\().16b, \in\().16b, v11.16b rev32 v11.8h, v11.8h eor \in\().16b, \in\().16b, v11.16b - mix_columns \in + .endif + + mul_by_x v10.16b, \in\().16b, v9.16b, v12.16b + rev32 v8.8h, \in\().8h + eor \in\().16b, \in\().16b, v10.16b + eor v10.16b, v10.16b, v8.16b + eor v11.16b, \in\().16b, v8.16b + tbl v11.16b, {v11.16b}, v14.16b + eor \in\().16b, v10.16b, v11.16b .endm .macro do_block, enc, in, rounds, rk, rkp, i @@ -88,16 +84,13 @@ add \rkp, \rk, #16 mov \i, \rounds 1111: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ + movi v15.16b, #0x40 tbl \in\().16b, {\in\().16b}, v13.16b /* ShiftRows */ sub_bytes \in - ld1 {v15.4s}, [\rkp], #16 subs \i, \i, #1 + ld1 {v15.4s}, [\rkp], #16 beq 2222f - .if \enc == 1 - mix_columns \in - .else - inv_mix_columns \in - .endif + mix_columns \in, \enc b 1111b 2222: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ .endm @@ -116,48 +109,48 @@ */ .macro sub_bytes_2x, in0, in1 - sub v8.16b, \in0\().16b, v12.16b - sub v9.16b, \in1\().16b, v12.16b + sub v8.16b, \in0\().16b, v15.16b tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b + sub v9.16b, \in1\().16b, v15.16b tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b - sub v10.16b, v8.16b, v12.16b - sub v11.16b, v9.16b, v12.16b + sub v10.16b, v8.16b, v15.16b tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b + sub v11.16b, v9.16b, v15.16b tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b - sub v8.16b, v10.16b, v12.16b - sub v9.16b, v11.16b, v12.16b + sub v8.16b, v10.16b, v15.16b tbx \in0\().16b, {v24.16b-v27.16b}, v10.16b + sub v9.16b, v11.16b, v15.16b tbx \in1\().16b, {v24.16b-v27.16b}, v11.16b tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b .endm .macro sub_bytes_4x, in0, in1, in2, in3 - sub v8.16b, \in0\().16b, v12.16b + sub v8.16b, \in0\().16b, v15.16b tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b - sub v9.16b, \in1\().16b, v12.16b + sub v9.16b, \in1\().16b, v15.16b tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b - sub v10.16b, \in2\().16b, v12.16b + sub v10.16b, \in2\().16b, v15.16b tbl \in2\().16b, {v16.16b-v19.16b}, \in2\().16b - sub v11.16b, \in3\().16b, v12.16b + sub v11.16b, \in3\().16b, v15.16b tbl \in3\().16b, {v16.16b-v19.16b}, \in3\().16b tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b - sub v8.16b, v8.16b, v12.16b + sub v8.16b, v8.16b, v15.16b tbx \in2\().16b, {v20.16b-v23.16b}, v10.16b - sub v9.16b, v9.16b, v12.16b + sub v9.16b, v9.16b, v15.16b tbx \in3\().16b, {v20.16b-v23.16b}, v11.16b - sub v10.16b, v10.16b, v12.16b + sub v10.16b, v10.16b, v15.16b tbx \in0\().16b, {v24.16b-v27.16b}, v8.16b - sub v11.16b, v11.16b, v12.16b + sub v11.16b, v11.16b, v15.16b tbx \in1\().16b, {v24.16b-v27.16b}, v9.16b - sub v8.16b, v8.16b, v12.16b + sub v8.16b, v8.16b, v15.16b tbx \in2\().16b, {v24.16b-v27.16b}, v10.16b - sub v9.16b, v9.16b, v12.16b + sub v9.16b, v9.16b, v15.16b tbx \in3\().16b, {v24.16b-v27.16b}, v11.16b - sub v10.16b, v10.16b, v12.16b + sub v10.16b, v10.16b, v15.16b tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b - sub v11.16b, v11.16b, v12.16b + sub v11.16b, v11.16b, v15.16b tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b tbx \in2\().16b, {v28.16b-v31.16b}, v10.16b tbx \in3\().16b, {v28.16b-v31.16b}, v11.16b @@ -174,59 +167,32 @@ eor \out1\().16b, \out1\().16b, \tmp1\().16b .endm - .macro mix_columns_2x, in0, in1 - mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14 - rev32 v10.8h, \in0\().8h - rev32 v11.8h, \in1\().8h - eor \in0\().16b, v8.16b, \in0\().16b - eor \in1\().16b, v9.16b, \in1\().16b - shl v12.4s, v10.4s, #24 - shl v13.4s, v11.4s, #24 - eor v8.16b, v8.16b, v10.16b - sri v12.4s, v10.4s, #8 - shl v10.4s, \in0\().4s, #24 - eor v9.16b, v9.16b, v11.16b - sri v13.4s, v11.4s, #8 - shl v11.4s, \in1\().4s, #24 - sri v10.4s, \in0\().4s, #8 - eor \in0\().16b, v8.16b, v12.16b - sri v11.4s, \in1\().4s, #8 - eor \in1\().16b, v9.16b, v13.16b - eor \in0\().16b, v10.16b, \in0\().16b - eor \in1\().16b, v11.16b, \in1\().16b - .endm - - .macro inv_mix_cols_2x, in0, in1 - mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14 - mul_by_x_2x v8, v9, v8, v9, v10, v11, v14 + .macro mix_columns_2x, in0, in1, enc + .if \enc == 0 + /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */ + mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v12 + mul_by_x_2x v8, v9, v8, v9, v10, v11, v12 eor \in0\().16b, \in0\().16b, v8.16b - eor \in1\().16b, \in1\().16b, v9.16b rev32 v8.8h, v8.8h + eor \in1\().16b, \in1\().16b, v9.16b rev32 v9.8h, v9.8h eor \in0\().16b, \in0\().16b, v8.16b eor \in1\().16b, \in1\().16b, v9.16b - mix_columns_2x \in0, \in1 - .endm + .endif - .macro inv_mix_cols_4x, in0, in1, in2, in3 - mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14 - mul_by_x_2x v10, v11, \in2, \in3, v12, v13, v14 - mul_by_x_2x v8, v9, v8, v9, v12, v13, v14 - mul_by_x_2x v10, v11, v10, v11, v12, v13, v14 + mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v12 + rev32 v10.8h, \in0\().8h eor \in0\().16b, \in0\().16b, v8.16b + rev32 v11.8h, \in1\().8h eor \in1\().16b, \in1\().16b, v9.16b - eor \in2\().16b, \in2\().16b, v10.16b - eor \in3\().16b, \in3\().16b, v11.16b - rev32 v8.8h, v8.8h - rev32 v9.8h, v9.8h - rev32 v10.8h, v10.8h - rev32 v11.8h, v11.8h + eor v8.16b, v8.16b, v10.16b + eor \in0\().16b, \in0\().16b, v10.16b + eor v9.16b, v9.16b, v11.16b + eor \in1\().16b, \in1\().16b, v11.16b + tbl \in0\().16b, {\in0\().16b}, v14.16b + tbl \in1\().16b, {\in1\().16b}, v14.16b eor \in0\().16b, \in0\().16b, v8.16b eor \in1\().16b, \in1\().16b, v9.16b - eor \in2\().16b, \in2\().16b, v10.16b - eor \in3\().16b, \in3\().16b, v11.16b - mix_columns_2x \in0, \in1 - mix_columns_2x \in2, \in3 .endm .macro do_block_2x, enc, in0, in1 rounds, rk, rkp, i @@ -235,20 +201,14 @@ mov \i, \rounds 1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ - sub_bytes_2x \in0, \in1 + movi v15.16b, #0x40 tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */ tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */ - ld1 {v15.4s}, [\rkp], #16 + sub_bytes_2x \in0, \in1 subs \i, \i, #1 + ld1 {v15.4s}, [\rkp], #16 beq 2222f - .if \enc == 1 - mix_columns_2x \in0, \in1 - ldr q13, .LForward_ShiftRows - .else - inv_mix_cols_2x \in0, \in1 - ldr q13, .LReverse_ShiftRows - .endif - movi v12.16b, #0x40 + mix_columns_2x \in0, \in1, \enc b 1111b 2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ @@ -262,23 +222,17 @@ eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */ eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */ - sub_bytes_4x \in0, \in1, \in2, \in3 + movi v15.16b, #0x40 tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */ tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */ tbl \in2\().16b, {\in2\().16b}, v13.16b /* ShiftRows */ tbl \in3\().16b, {\in3\().16b}, v13.16b /* ShiftRows */ - ld1 {v15.4s}, [\rkp], #16 + sub_bytes_4x \in0, \in1, \in2, \in3 subs \i, \i, #1 + ld1 {v15.4s}, [\rkp], #16 beq 2222f - .if \enc == 1 - mix_columns_2x \in0, \in1 - mix_columns_2x \in2, \in3 - ldr q13, .LForward_ShiftRows - .else - inv_mix_cols_4x \in0, \in1, \in2, \in3 - ldr q13, .LReverse_ShiftRows - .endif - movi v12.16b, #0x40 + mix_columns_2x \in0, \in1, \enc + mix_columns_2x \in2, \in3, \enc b 1111b 2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ @@ -305,19 +259,7 @@ #include "aes-modes.S" .text - .align 4 -.LForward_ShiftRows: -CPU_LE( .byte 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3 ) -CPU_LE( .byte 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb ) -CPU_BE( .byte 0xb, 0x6, 0x1, 0xc, 0x7, 0x2, 0xd, 0x8 ) -CPU_BE( .byte 0x3, 0xe, 0x9, 0x4, 0xf, 0xa, 0x5, 0x0 ) - -.LReverse_ShiftRows: -CPU_LE( .byte 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb ) -CPU_LE( .byte 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3 ) -CPU_BE( .byte 0x3, 0x6, 0x9, 0xc, 0xf, 0x2, 0x5, 0x8 ) -CPU_BE( .byte 0xb, 0xe, 0x1, 0x4, 0x7, 0xa, 0xd, 0x0 ) - + .align 6 .LForward_Sbox: .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 @@ -385,3 +327,12 @@ CPU_BE( .byte 0xb, 0xe, 0x1, 0x4, 0x7, 0xa, 0xd, 0x0 ) .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d + +.LForward_ShiftRows: + .octa 0x0b06010c07020d08030e09040f0a0500 + +.LReverse_ShiftRows: + .octa 0x0306090c0f0205080b0e0104070a0d00 + +.Lror32by8: + .octa 0x0c0f0e0d080b0a090407060500030201

[09/10] crypto: arm64/aes-neon-blk - tweak performance for low end cores

Commit Message

Patch