
[RFT,v2] crypto: arm64/gcm - implement native driver using v8 Crypto Extensions

Message ID 20170630113241.17002-1-ard.biesheuvel@linaro.org
State Superseded

Commit Message

Ard Biesheuvel June 30, 2017, 11:32 a.m. UTC
Currently, the AES-GCM implementation for arm64 systems that support the
ARMv8 Crypto Extensions is based on the generic GCM module, which combines
the AES-CTR implementation using AES instructions with the PMULL based
GHASH driver. This is suboptimal, given the fact that the input data needs
to be loaded twice, once for the encryption and again for the MAC
calculation.

On Cortex-A57 (r1p2) and other recent cores that implement micro-op fusing
for the AES instructions, AES executes at less than 1 cycle per byte, which
means that any cycles wasted on loading the data twice hurt even more.

So implement a new GCM driver that combines the AES and PMULL instructions
at the block level. This improves performance on Cortex-A57 by ~27% (from
3.5 cpb to 2.6 cpb).

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

---
v2: - rebase onto non-upstream arm64 SIMD refactoring branch
      (https://git.kernel.org/pub/scm/linux/kernel/git/ardb/linux.git/log/?h=arm64-gcm)
    - implement non-SIMD fallback
    - remove accelerated AES routines from setkey() path
    - use be32() accessors instead of open-coded array assignments
    - remove redundant round key loads

Raw numbers measured on a 2GHz AMD Overdrive B1 can be found after the patch.

 arch/arm64/crypto/Kconfig         |   3 +-
 arch/arm64/crypto/ghash-ce-core.S | 175 ++++++++
 arch/arm64/crypto/ghash-ce-glue.c | 436 ++++++++++++++++++--
 3 files changed, 587 insertions(+), 27 deletions(-)

-- 
2.9.3
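
For reference, here is a minimal, hypothetical sketch (not part of this patch,
untested, function name made up) of how a kernel-side caller would exercise the
new code via the generic AEAD API. Allocation is by algorithm name, so existing
"gcm(aes)" users can pick up the native gcm-aes-ce driver transparently once it
is the best implementation available on the system:

/*
 * Hypothetical example, not part of this patch: encrypt a buffer in place
 * using whatever "gcm(aes)" implementation the crypto API resolves to.
 * With this patch applied and the ARMv8 Crypto Extensions present, that
 * can be the new gcm-aes-ce driver (cra_priority 300).
 */
#include <crypto/aead.h>
#include <linux/err.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>

static int gcm_aes_ce_demo(u8 *buf, unsigned int ptextlen,
			   const u8 *key, unsigned int keylen, u8 iv[12])
{
	struct crypto_aead *tfm;
	struct aead_request *req;
	struct scatterlist sg;
	int err;

	tfm = crypto_alloc_aead("gcm(aes)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_aead_setkey(tfm, key, keylen);
	if (err)
		goto out_free_tfm;

	err = crypto_aead_setauthsize(tfm, 16);		/* full 16 byte tag */
	if (err)
		goto out_free_tfm;

	req = aead_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		err = -ENOMEM;
		goto out_free_tfm;
	}

	/* buf must have room for the 16 byte tag appended to the ciphertext */
	sg_init_one(&sg, buf, ptextlen + 16);

	aead_request_set_callback(req, 0, NULL, NULL);
	aead_request_set_ad(req, 0);			/* no associated data */
	aead_request_set_crypt(req, &sg, &sg, ptextlen, iv);

	/*
	 * gcm-aes-ce completes synchronously; a general purpose caller would
	 * also have to handle -EINPROGRESS/-EBUSY from async drivers.
	 */
	err = crypto_aead_encrypt(req);

	aead_request_free(req);
out_free_tfm:
	crypto_free_aead(tfm);
	return err;
}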


Generic GCM wrapper around AES-CTR and GHASH (using AES and PMULL instructions)
===============================================================================

testing speed of gcm(aes) (gcm_base(ctr-aes-ce,ghash-ce)) encryption
test 0 (128 bit key, 16 byte blocks): 1133407 operations in 1 seconds (18134512 bytes)
test 1 (128 bit key, 64 byte blocks): 1025997 operations in 1 seconds (65663808 bytes)
test 2 (128 bit key, 256 byte blocks): 768971 operations in 1 seconds (196856576 bytes)
test 3 (128 bit key, 512 byte blocks): 577197 operations in 1 seconds (295524864 bytes)
test 4 (128 bit key, 1024 byte blocks): 390516 operations in 1 seconds (399888384 bytes)
test 5 (128 bit key, 2048 byte blocks): 237002 operations in 1 seconds (485380096 bytes)
test 6 (128 bit key, 4096 byte blocks): 132590 operations in 1 seconds (543088640 bytes)
test 7 (128 bit key, 8192 byte blocks): 69495 operations in 1 seconds (569303040 bytes)
test 8 (192 bit key, 16 byte blocks): 1108665 operations in 1 seconds (17738640 bytes)
test 9 (192 bit key, 64 byte blocks): 1054793 operations in 1 seconds (67506752 bytes)
test 10 (192 bit key, 256 byte blocks): 759134 operations in 1 seconds (194338304 bytes)
test 11 (192 bit key, 512 byte blocks): 565960 operations in 1 seconds (289771520 bytes)
test 12 (192 bit key, 1024 byte blocks): 380881 operations in 1 seconds (390022144 bytes)
test 13 (192 bit key, 2048 byte blocks): 231188 operations in 1 seconds (473473024 bytes)
test 14 (192 bit key, 4096 byte blocks): 128310 operations in 1 seconds (525557760 bytes)
test 15 (192 bit key, 8192 byte blocks): 67436 operations in 1 seconds (552435712 bytes)
test 16 (256 bit key, 16 byte blocks): 1122946 operations in 1 seconds (17967136 bytes)
test 17 (256 bit key, 64 byte blocks): 1006653 operations in 1 seconds (64425792 bytes)
test 18 (256 bit key, 256 byte blocks): 744818 operations in 1 seconds (190673408 bytes)
test 19 (256 bit key, 512 byte blocks): 553923 operations in 1 seconds (283608576 bytes)
test 20 (256 bit key, 1024 byte blocks): 371402 operations in 1 seconds (380315648 bytes)
test 21 (256 bit key, 2048 byte blocks): 223312 operations in 1 seconds (457342976 bytes)
test 22 (256 bit key, 4096 byte blocks): 123945 operations in 1 seconds (507678720 bytes)
test 23 (256 bit key, 8192 byte blocks): 64935 operations in 1 seconds (531947520 bytes)

Native GCM module with block level interleave of AES-CTR and GHASH
==================================================================

testing speed of gcm(aes) (gcm-aes-ce) encryption
test 0 (128 bit key, 16 byte blocks): 1860711 operations in 1 seconds (29771376 bytes)
test 1 (128 bit key, 64 byte blocks): 1573017 operations in 1 seconds (100673088 bytes)
test 2 (128 bit key, 256 byte blocks): 1136989 operations in 1 seconds (291069184 bytes)
test 3 (128 bit key, 512 byte blocks): 840846 operations in 1 seconds (430513152 bytes)
test 4 (128 bit key, 1024 byte blocks): 548205 operations in 1 seconds (561361920 bytes)
test 5 (128 bit key, 2048 byte blocks): 328413 operations in 1 seconds (672589824 bytes)
test 6 (128 bit key, 4096 byte blocks): 181673 operations in 1 seconds (744132608 bytes)
test 7 (128 bit key, 8192 byte blocks): 94986 operations in 1 seconds (778125312 bytes)
test 8 (192 bit key, 16 byte blocks): 1837762 operations in 1 seconds (29404192 bytes)
test 9 (192 bit key, 64 byte blocks): 1537458 operations in 1 seconds (98397312 bytes)
test 10 (192 bit key, 256 byte blocks): 1087589 operations in 1 seconds (278422784 bytes)
test 11 (192 bit key, 512 byte blocks): 807194 operations in 1 seconds (413283328 bytes)
test 12 (192 bit key, 1024 byte blocks): 524966 operations in 1 seconds (537565184 bytes)
test 13 (192 bit key, 2048 byte blocks): 312338 operations in 1 seconds (639668224 bytes)
test 14 (192 bit key, 4096 byte blocks): 173324 operations in 1 seconds (709935104 bytes)
test 15 (192 bit key, 8192 byte blocks): 90857 operations in 1 seconds (744300544 bytes)
test 16 (256 bit key, 16 byte blocks): 1798971 operations in 1 seconds (28783536 bytes)
test 17 (256 bit key, 64 byte blocks): 1497989 operations in 1 seconds (95871296 bytes)
test 18 (256 bit key, 256 byte blocks): 1058926 operations in 1 seconds (271085056 bytes)
test 19 (256 bit key, 512 byte blocks): 775609 operations in 1 seconds (397111808 bytes)
test 20 (256 bit key, 1024 byte blocks): 492267 operations in 1 seconds (504081408 bytes)
test 21 (256 bit key, 2048 byte blocks): 294868 operations in 1 seconds (603889664 bytes)
test 22 (256 bit key, 4096 byte blocks): 161802 operations in 1 seconds (662740992 bytes)
test 23 (256 bit key, 8192 byte blocks): 84664 operations in 1 seconds (693567488 bytes)
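
As a back-of-the-envelope check of the cycle-per-byte figures quoted in the
commit message (assuming a single core kept fully busy at the quoted 2GHz
clock), the 128-bit key, 8192 byte results above convert to:

  generic (gcm_base):   2.0e9 cycles/s / 569303040 bytes/s ~= 3.5 cycles per byte
  native  (gcm-aes-ce): 2.0e9 cycles/s / 778125312 bytes/s ~= 2.6 cycles per byte

i.e. roughly a 27% reduction in cycles per byte, which corresponds to a ~37%
increase in throughput.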

Comments

Ard Biesheuvel June 30, 2017, 7:09 p.m. UTC | #1
On 30 June 2017 at 11:32, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> Currently, the AES-GCM implementation for arm64 systems that support the
> ARMv8 Crypto Extensions is based on the generic GCM module, which combines
> the AES-CTR implementation using AES instructions with the PMULL based
> GHASH driver. This is suboptimal, given the fact that the input data needs
> to be loaded twice, once for the encryption and again for the MAC
> calculation.
>
> On Cortex-A57 (r1p2) and other recent cores that implement micro-op fusing
> for the AES instructions, AES executes at less than 1 cycle per byte, which
> means that any cycles wasted on loading the data twice hurt even more.
>
> So implement a new GCM driver that combines the AES and PMULL instructions
> at the block level. This improves performance on Cortex-A57 by ~27% (from
> 3.5 cpb to 2.6 cpb).

37% not 27%


Patch

diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index a669dedc8767..3e5b39b79fb9 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -29,10 +29,11 @@  config CRYPTO_SHA2_ARM64_CE
 	select CRYPTO_SHA256_ARM64
 
 config CRYPTO_GHASH_ARM64_CE
-	tristate "GHASH (for GCM chaining mode) using ARMv8 Crypto Extensions"
+	tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions"
 	depends on KERNEL_MODE_NEON
 	select CRYPTO_HASH
 	select CRYPTO_GF128MUL
+	select CRYPTO_AES
 
 config CRYPTO_CRCT10DIF_ARM64_CE
 	tristate "CRCT10DIF digest algorithm using PMULL instructions"
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
index f0bb9f0b524f..cb22459eba85 100644
--- a/arch/arm64/crypto/ghash-ce-core.S
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -77,3 +77,178 @@  CPU_LE(	rev64		T1.16b, T1.16b	)
 	st1		{XL.2d}, [x1]
 	ret
 ENDPROC(pmull_ghash_update)
+
+	KS		.req	v8
+	CTR		.req	v9
+	INP		.req	v10
+
+	.macro		load_round_keys, rounds, rk
+	cmp		\rounds, #12
+	blo		2222f		/* 128 bits */
+	beq		1111f		/* 192 bits */
+	ld1		{v17.4s-v18.4s}, [\rk], #32
+1111:	ld1		{v19.4s-v20.4s}, [\rk], #32
+2222:	ld1		{v21.4s-v24.4s}, [\rk], #64
+	ld1		{v25.4s-v28.4s}, [\rk], #64
+	ld1		{v29.4s-v31.4s}, [\rk]
+	.endm
+
+	.macro		enc_round, state, key
+	aese		\state\().16b, \key\().16b
+	aesmc		\state\().16b, \state\().16b
+	.endm
+
+	.macro		enc_block, state, rounds
+	cmp		\rounds, #12
+	b.lo		2222f		/* 128 bits */
+	b.eq		1111f		/* 192 bits */
+	enc_round	\state, v17
+	enc_round	\state, v18
+1111:	enc_round	\state, v19
+	enc_round	\state, v20
+2222:	.irp		key, v21, v22, v23, v24, v25, v26, v27, v28, v29
+	enc_round	\state, \key
+	.endr
+	aese		\state\().16b, v30.16b
+	eor		\state\().16b, \state\().16b, v31.16b
+	.endm
+
+	.macro		pmull_gcm_do_crypt, enc
+	ld1		{SHASH.2d}, [x4]
+	ld1		{XL.2d}, [x1]
+	ldr		x8, [x5, #8]			// load lower counter
+
+	movi		MASK.16b, #0xe1
+	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
+CPU_LE(	rev		x8, x8		)
+	shl		MASK.2d, MASK.2d, #57
+	eor		SHASH2.16b, SHASH2.16b, SHASH.16b
+
+	.if		\enc == 1
+	ld1		{KS.16b}, [x7]
+	.endif
+
+0:	ld1		{CTR.8b}, [x5]			// load upper counter
+	ld1		{INP.16b}, [x3], #16
+	rev		x9, x8
+	add		x8, x8, #1
+	sub		w0, w0, #1
+	ins		CTR.d[1], x9			// set lower counter
+
+	.if		\enc == 1
+	eor		INP.16b, INP.16b, KS.16b	// encrypt input
+	st1		{INP.16b}, [x2], #16
+	.endif
+
+	rev64		T1.16b, INP.16b
+
+	cmp		w6, #12
+	b.ge		2f				// AES-192/256?
+
+1:	enc_round	CTR, v21
+
+	ext		T2.16b, XL.16b, XL.16b, #8
+	ext		IN1.16b, T1.16b, T1.16b, #8
+
+	enc_round	CTR, v22
+
+	eor		T1.16b, T1.16b, T2.16b
+	eor		XL.16b, XL.16b, IN1.16b
+
+	enc_round	CTR, v23
+
+	pmull2		XH.1q, SHASH.2d, XL.2d		// a1 * b1
+	eor		T1.16b, T1.16b, XL.16b
+
+	enc_round	CTR, v24
+
+	pmull		XL.1q, SHASH.1d, XL.1d		// a0 * b0
+	pmull		XM.1q, SHASH2.1d, T1.1d		// (a1 + a0)(b1 + b0)
+
+	enc_round	CTR, v25
+
+	ext		T1.16b, XL.16b, XH.16b, #8
+	eor		T2.16b, XL.16b, XH.16b
+	eor		XM.16b, XM.16b, T1.16b
+
+	enc_round	CTR, v26
+
+	eor		XM.16b, XM.16b, T2.16b
+	pmull		T2.1q, XL.1d, MASK.1d
+
+	enc_round	CTR, v27
+
+	mov		XH.d[0], XM.d[1]
+	mov		XM.d[1], XL.d[0]
+
+	enc_round	CTR, v28
+
+	eor		XL.16b, XM.16b, T2.16b
+
+	enc_round	CTR, v29
+
+	ext		T2.16b, XL.16b, XL.16b, #8
+
+	aese		CTR.16b, v30.16b
+
+	pmull		XL.1q, XL.1d, MASK.1d
+	eor		T2.16b, T2.16b, XH.16b
+
+	eor		KS.16b, CTR.16b, v31.16b
+
+	eor		XL.16b, XL.16b, T2.16b
+
+	.if		\enc == 0
+	eor		INP.16b, INP.16b, KS.16b
+	st1		{INP.16b}, [x2], #16
+	.endif
+
+	cbnz		w0, 0b
+
+CPU_LE(	rev		x8, x8		)
+	st1		{XL.2d}, [x1]
+	str		x8, [x5, #8]			// store lower counter
+
+	.if		\enc == 1
+	st1		{KS.16b}, [x7]
+	.endif
+
+	ret
+
+2:	b.eq		3f				// AES-192?
+	enc_round	CTR, v17
+	enc_round	CTR, v18
+3:	enc_round	CTR, v19
+	enc_round	CTR, v20
+	b		1b
+	.endm
+
+	/*
+	 * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
+	 *			  struct ghash_key const *k, u8 ctr[],
+	 *			  int rounds, u8 ks[])
+	 */
+ENTRY(pmull_gcm_encrypt)
+	pmull_gcm_do_crypt	1
+ENDPROC(pmull_gcm_encrypt)
+
+	/*
+	 * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
+	 *			  struct ghash_key const *k, u8 ctr[],
+	 *			  int rounds)
+	 */
+ENTRY(pmull_gcm_decrypt)
+	pmull_gcm_do_crypt	0
+ENDPROC(pmull_gcm_decrypt)
+
+	/*
+	 * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
+	 */
+ENTRY(pmull_gcm_encrypt_block)
+	cbz		x2, 0f
+	load_round_keys	w3, x2
+0:	ld1		{v0.16b}, [x1]
+	enc_block	v0, w3
+	st1		{v0.16b}, [x0]
+	ret
+ENDPROC(pmull_gcm_encrypt_block)
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c
index 30221ef56e70..85ff57e789ff 100644
--- a/arch/arm64/crypto/ghash-ce-glue.c
+++ b/arch/arm64/crypto/ghash-ce-glue.c
@@ -11,18 +11,25 @@ 
 #include <asm/neon.h>
 #include <asm/simd.h>
 #include <asm/unaligned.h>
+#include <crypto/aes.h>
+#include <crypto/algapi.h>
+#include <crypto/b128ops.h>
 #include <crypto/gf128mul.h>
+#include <crypto/internal/aead.h>
 #include <crypto/internal/hash.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/scatterwalk.h>
 #include <linux/cpufeature.h>
 #include <linux/crypto.h>
 #include <linux/module.h>
 
-MODULE_DESCRIPTION("GHASH secure hash using ARMv8 Crypto Extensions");
+MODULE_DESCRIPTION("GHASH and AES-GCM using ARMv8 Crypto Extensions");
 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
 MODULE_LICENSE("GPL v2");
 
 #define GHASH_BLOCK_SIZE	16
 #define GHASH_DIGEST_SIZE	16
+#define GCM_IV_SIZE		12
 
 struct ghash_key {
 	u64 a;
@@ -36,9 +43,25 @@  struct ghash_desc_ctx {
 	u32 count;
 };
 
+struct gcm_aes_ctx {
+	struct crypto_aes_ctx	aes_key;
+	struct ghash_key	ghash_key;
+};
+
 asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src,
 				   struct ghash_key const *k, const char *head);
 
+asmlinkage void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[],
+				  const u8 src[], struct ghash_key const *k,
+				  u8 ctr[], int rounds, u8 ks[]);
+
+asmlinkage void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[],
+				  const u8 src[], struct ghash_key const *k,
+				  u8 ctr[], int rounds);
+
+asmlinkage void pmull_gcm_encrypt_block(u8 dst[], u8 const src[],
+					u32 const rk[], int rounds);
+
 static int ghash_init(struct shash_desc *desc)
 {
 	struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
@@ -130,17 +153,11 @@  static int ghash_final(struct shash_desc *desc, u8 *dst)
 	return 0;
 }
 
-static int ghash_setkey(struct crypto_shash *tfm,
-			const u8 *inkey, unsigned int keylen)
+static int __ghash_setkey(struct ghash_key *key,
+			  const u8 *inkey, unsigned int keylen)
 {
-	struct ghash_key *key = crypto_shash_ctx(tfm);
 	u64 a, b;
 
-	if (keylen != GHASH_BLOCK_SIZE) {
-		crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
-		return -EINVAL;
-	}
-
 	/* needed for the fallback */
 	memcpy(&key->k, inkey, GHASH_BLOCK_SIZE);
 
@@ -157,31 +174,398 @@  static int ghash_setkey(struct crypto_shash *tfm,
 	return 0;
 }
 
+static int ghash_setkey(struct crypto_shash *tfm,
+			const u8 *inkey, unsigned int keylen)
+{
+	struct ghash_key *key = crypto_shash_ctx(tfm);
+
+	if (keylen != GHASH_BLOCK_SIZE) {
+		crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+
+	return __ghash_setkey(key, inkey, keylen);
+}
+
 static struct shash_alg ghash_alg = {
-	.digestsize	= GHASH_DIGEST_SIZE,
-	.init		= ghash_init,
-	.update		= ghash_update,
-	.final		= ghash_final,
-	.setkey		= ghash_setkey,
-	.descsize	= sizeof(struct ghash_desc_ctx),
-	.base		= {
-		.cra_name		= "ghash",
-		.cra_driver_name	= "ghash-ce",
-		.cra_priority		= 200,
-		.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
-		.cra_blocksize		= GHASH_BLOCK_SIZE,
-		.cra_ctxsize		= sizeof(struct ghash_key),
-		.cra_module		= THIS_MODULE,
-	},
+	.base.cra_name		= "ghash",
+	.base.cra_driver_name	= "ghash-ce",
+	.base.cra_priority	= 200,
+	.base.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
+	.base.cra_blocksize	= GHASH_BLOCK_SIZE,
+	.base.cra_ctxsize	= sizeof(struct ghash_key),
+	.base.cra_module	= THIS_MODULE,
+
+	.digestsize		= GHASH_DIGEST_SIZE,
+	.init			= ghash_init,
+	.update			= ghash_update,
+	.final			= ghash_final,
+	.setkey			= ghash_setkey,
+	.descsize		= sizeof(struct ghash_desc_ctx),
 };
 
-static int __init ghash_ce_mod_init(void)
+static int num_rounds(struct crypto_aes_ctx *ctx)
+{
+	/*
+	 * # of rounds specified by AES:
+	 * 128 bit key		10 rounds
+	 * 192 bit key		12 rounds
+	 * 256 bit key		14 rounds
+	 * => n byte key	=> 6 + (n/4) rounds
+	 */
+	return 6 + ctx->key_length / 4;
+}
+
+static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey,
+		      unsigned int keylen)
+{
+	struct gcm_aes_ctx *ctx = crypto_aead_ctx(tfm);
+	u8 key[GHASH_BLOCK_SIZE];
+	int ret;
+
+	ret = crypto_aes_expand_key(&ctx->aes_key, inkey, keylen);
+	if (ret) {
+		tfm->base.crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+		return -EINVAL;
+	}
+
+	crypto_aes_encrypt(&ctx->aes_key, key, (u8[AES_BLOCK_SIZE]){});
+
+	return __ghash_setkey(&ctx->ghash_key, key, sizeof(key));
+}
+
+static int gcm_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
+{
+	switch (authsize) {
+	case 4:
+	case 8:
+	case 12 ... 16:
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static void gcm_update_mac(u64 dg[], const u8 *src, int count, u8 buf[],
+			   int *buf_count, struct gcm_aes_ctx *ctx)
+{
+	if (*buf_count > 0) {
+		int buf_added = min(count, GHASH_BLOCK_SIZE - *buf_count);
+
+		memcpy(&buf[*buf_count], src, buf_added);
+
+		*buf_count += buf_added;
+		src += buf_added;
+		count -= buf_added;
+	}
+
+	if (count >= GHASH_BLOCK_SIZE || *buf_count == GHASH_BLOCK_SIZE) {
+		int blocks = count / GHASH_BLOCK_SIZE;
+
+		ghash_do_update(blocks, dg, src, &ctx->ghash_key,
+				*buf_count ? buf : NULL);
+
+		src += blocks * GHASH_BLOCK_SIZE;
+		count %= GHASH_BLOCK_SIZE;
+		*buf_count = 0;
+	}
+
+	if (count > 0) {
+		memcpy(buf, src, count);
+		*buf_count = count;
+	}
+}
+
+static void gcm_calculate_auth_mac(struct aead_request *req, u64 dg[])
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead);
+	u8 buf[GHASH_BLOCK_SIZE];
+	struct scatter_walk walk;
+	u32 len = req->assoclen;
+	int buf_count = 0;
+
+	scatterwalk_start(&walk, req->src);
+
+	do {
+		u32 n = scatterwalk_clamp(&walk, len);
+		u8 *p;
+
+		if (!n) {
+			scatterwalk_start(&walk, sg_next(walk.sg));
+			n = scatterwalk_clamp(&walk, len);
+		}
+		p = scatterwalk_map(&walk);
+
+		gcm_update_mac(dg, p, n, buf, &buf_count, ctx);
+		len -= n;
+
+		scatterwalk_unmap(p);
+		scatterwalk_advance(&walk, n);
+		scatterwalk_done(&walk, 0, len);
+	} while (len);
+
+	if (buf_count) {
+		memset(&buf[buf_count], 0, GHASH_BLOCK_SIZE - buf_count);
+		ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL);
+	}
+}
+
+static void gcm_final(struct aead_request *req, struct gcm_aes_ctx *ctx,
+		      u64 dg[], u8 tag[], int cryptlen)
+{
+	u8 mac[AES_BLOCK_SIZE];
+	u128 lengths;
+
+	lengths.a = cpu_to_be64(req->assoclen * 8);
+	lengths.b = cpu_to_be64(cryptlen * 8);
+
+	ghash_do_update(1, dg, (void *)&lengths, &ctx->ghash_key, NULL);
+
+	put_unaligned_be64(dg[1], mac);
+	put_unaligned_be64(dg[0], mac + 8);
+
+	crypto_xor(tag, mac, AES_BLOCK_SIZE);
+}
+
+static int gcm_encrypt(struct aead_request *req)
 {
-	return crypto_register_shash(&ghash_alg);
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead);
+	struct skcipher_walk walk;
+	u8 iv[AES_BLOCK_SIZE];
+	u8 ks[AES_BLOCK_SIZE];
+	u8 tag[AES_BLOCK_SIZE];
+	u64 dg[2] = {};
+	int err;
+
+	if (req->assoclen)
+		gcm_calculate_auth_mac(req, dg);
+
+	memcpy(iv, req->iv, GCM_IV_SIZE);
+	put_unaligned_be32(1, iv + GCM_IV_SIZE);
+
+	if (likely(may_use_simd())) {
+		kernel_neon_begin();
+
+		pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc,
+					num_rounds(&ctx->aes_key));
+		put_unaligned_be32(2, iv + GCM_IV_SIZE);
+		pmull_gcm_encrypt_block(ks, iv, NULL,
+					num_rounds(&ctx->aes_key));
+		put_unaligned_be32(3, iv + GCM_IV_SIZE);
+
+		err = skcipher_walk_aead_encrypt(&walk, req, true);
+
+		while (walk.nbytes >= AES_BLOCK_SIZE) {
+			int blocks = walk.nbytes / AES_BLOCK_SIZE;
+
+			pmull_gcm_encrypt(blocks, dg, walk.dst.virt.addr,
+					  walk.src.virt.addr, &ctx->ghash_key,
+					  iv, num_rounds(&ctx->aes_key), ks);
+
+			err = skcipher_walk_done(&walk,
+						 walk.nbytes % AES_BLOCK_SIZE);
+		}
+		kernel_neon_end();
+	} else {
+		crypto_aes_encrypt(&ctx->aes_key, tag, iv);
+		put_unaligned_be32(2, iv + GCM_IV_SIZE);
+
+		err = skcipher_walk_aead_encrypt(&walk, req, true);
+
+		while (walk.nbytes >= AES_BLOCK_SIZE) {
+			int blocks = walk.nbytes / AES_BLOCK_SIZE;
+			u8 *dst = walk.dst.virt.addr;
+			u8 *src = walk.src.virt.addr;
+
+			do {
+				crypto_aes_encrypt(&ctx->aes_key, ks, iv);
+				if (dst != src)
+					memcpy(dst, src, AES_BLOCK_SIZE);
+				crypto_xor(dst, ks, AES_BLOCK_SIZE);
+				crypto_inc(iv, AES_BLOCK_SIZE);
+
+				dst += AES_BLOCK_SIZE;
+				src += AES_BLOCK_SIZE;
+			} while (--blocks > 0);
+
+			ghash_do_update(walk.nbytes / AES_BLOCK_SIZE, dg,
+					walk.dst.virt.addr, &ctx->ghash_key,
+					NULL);
+
+			err = skcipher_walk_done(&walk,
+						 walk.nbytes % AES_BLOCK_SIZE);
+		}
+		if (walk.nbytes)
+			crypto_aes_encrypt(&ctx->aes_key, ks, iv);
+	}
+
+	/* handle the tail */
+	if (walk.nbytes) {
+		u8 buf[GHASH_BLOCK_SIZE];
+
+		if (walk.dst.virt.addr != walk.src.virt.addr)
+			memcpy(walk.dst.virt.addr, walk.src.virt.addr,
+			       walk.nbytes);
+		crypto_xor(walk.dst.virt.addr, ks, walk.nbytes);
+
+		memcpy(buf, walk.dst.virt.addr, walk.nbytes);
+		memset(buf + walk.nbytes, 0, GHASH_BLOCK_SIZE - walk.nbytes);
+		ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL);
+
+		err = skcipher_walk_done(&walk, 0);
+	}
+
+	if (err)
+		return err;
+
+	gcm_final(req, ctx, dg, tag, req->cryptlen);
+
+	/* copy authtag to end of dst */
+	scatterwalk_map_and_copy(tag, req->dst, req->assoclen + req->cryptlen,
+				 crypto_aead_authsize(aead), 1);
+
+	return 0;
+}
+
+static int gcm_decrypt(struct aead_request *req)
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead);
+	unsigned int authsize = crypto_aead_authsize(aead);
+	struct skcipher_walk walk;
+	u8 iv[AES_BLOCK_SIZE];
+	u8 tag[AES_BLOCK_SIZE];
+	u8 buf[GHASH_BLOCK_SIZE];
+	u64 dg[2] = {};
+	int err;
+
+	if (req->assoclen)
+		gcm_calculate_auth_mac(req, dg);
+
+	memcpy(iv, req->iv, GCM_IV_SIZE);
+	put_unaligned_be32(1, iv + GCM_IV_SIZE);
+
+	if (likely(may_use_simd())) {
+		kernel_neon_begin();
+
+		pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc,
+					num_rounds(&ctx->aes_key));
+		put_unaligned_be32(2, iv + GCM_IV_SIZE);
+
+		err = skcipher_walk_aead_decrypt(&walk, req, true);
+
+		while (walk.nbytes >= AES_BLOCK_SIZE) {
+			int blocks = walk.nbytes / AES_BLOCK_SIZE;
+
+			pmull_gcm_decrypt(blocks, dg, walk.dst.virt.addr,
+					  walk.src.virt.addr, &ctx->ghash_key,
+					  iv, num_rounds(&ctx->aes_key));
+
+			err = skcipher_walk_done(&walk,
+						 walk.nbytes % AES_BLOCK_SIZE);
+		}
+		if (walk.nbytes)
+			pmull_gcm_encrypt_block(iv, iv, NULL,
+						num_rounds(&ctx->aes_key));
+
+		kernel_neon_end();
+	} else {
+		crypto_aes_encrypt(&ctx->aes_key, tag, iv);
+		put_unaligned_be32(2, iv + GCM_IV_SIZE);
+
+		err = skcipher_walk_aead_decrypt(&walk, req, true);
+
+		while (walk.nbytes >= AES_BLOCK_SIZE) {
+			int blocks = walk.nbytes / AES_BLOCK_SIZE;
+			u8 *dst = walk.dst.virt.addr;
+			u8 *src = walk.src.virt.addr;
+
+			ghash_do_update(blocks, dg, walk.src.virt.addr,
+					&ctx->ghash_key, NULL);
+
+			do {
+				crypto_aes_encrypt(&ctx->aes_key, buf, iv);
+				if (dst != src)
+					memcpy(dst, src, AES_BLOCK_SIZE);
+				crypto_xor(dst, buf, AES_BLOCK_SIZE);
+				crypto_inc(iv, AES_BLOCK_SIZE);
+
+				dst += AES_BLOCK_SIZE;
+				src += AES_BLOCK_SIZE;
+			} while (--blocks > 0);
+
+			err = skcipher_walk_done(&walk,
+						 walk.nbytes % AES_BLOCK_SIZE);
+		}
+		if (walk.nbytes)
+			crypto_aes_encrypt(&ctx->aes_key, iv, iv);
+	}
+
+	/* handle the tail */
+	if (walk.nbytes) {
+		memcpy(buf, walk.src.virt.addr, walk.nbytes);
+		memset(buf + walk.nbytes, 0, GHASH_BLOCK_SIZE - walk.nbytes);
+		ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL);
+
+		if (walk.dst.virt.addr != walk.src.virt.addr)
+			memcpy(walk.dst.virt.addr, walk.src.virt.addr,
+			       walk.nbytes);
+		crypto_xor(walk.dst.virt.addr, iv, walk.nbytes);
+
+		err = skcipher_walk_done(&walk, 0);
+	}
+
+	if (err)
+		return err;
+
+	gcm_final(req, ctx, dg, tag, req->cryptlen - authsize);
+
+	/* compare calculated auth tag with the stored one */
+	scatterwalk_map_and_copy(buf, req->src,
+				 req->assoclen + req->cryptlen - authsize,
+				 authsize, 0);
+
+	if (crypto_memneq(tag, buf, authsize))
+		return -EBADMSG;
+	return 0;
+}
+
+static struct aead_alg gcm_aes_alg = {
+	.ivsize			= GCM_IV_SIZE,
+	.chunksize		= AES_BLOCK_SIZE,
+	.maxauthsize		= AES_BLOCK_SIZE,
+	.setkey			= gcm_setkey,
+	.setauthsize		= gcm_setauthsize,
+	.encrypt		= gcm_encrypt,
+	.decrypt		= gcm_decrypt,
+
+	.base.cra_name		= "gcm(aes)",
+	.base.cra_driver_name	= "gcm-aes-ce",
+	.base.cra_priority	= 300,
+	.base.cra_blocksize	= 1,
+	.base.cra_ctxsize	= sizeof(struct gcm_aes_ctx),
+	.base.cra_module	= THIS_MODULE,
+};
+
+static int __init ghash_ce_mod_init(void)
+{	int ret;
+
+	ret = crypto_register_shash(&ghash_alg);
+	if (ret)
+		return ret;
+
+	ret = crypto_register_aead(&gcm_aes_alg);
+	if (ret)
+		crypto_unregister_shash(&ghash_alg);
+	return ret;
 }
 
 static void __exit ghash_ce_mod_exit(void)
 {
+	crypto_unregister_aead(&gcm_aes_alg);
 	crypto_unregister_shash(&ghash_alg);
 }