[v2,2/3] target/arm: Implement an IMPDEF pauth algorithm

Message ID 20200813200243.3036772-3-richard.henderson@linaro.org
State Superseded
Series target/arm: Implement an IMPDEF pauth algorithm

Commit Message

Richard Henderson Aug. 13, 2020, 8:02 p.m. UTC
Without hardware acceleration, a cryptographically strong
algorithm is too expensive for pauth_computepac.

Even with hardware accel, we are not currently expecting
to link the linux-user binaries to any crypto libraries,
and doing so would generally make the --static build fail.

So choose XXH64 as a reasonably quick and decent hash.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

---
v2: Move the XXH64 bits to xxhash.h (ajb).
    Create isar_feature_aa64_pauth_arch and fixup a comment
    in isar_feature_aa64_pauth that no longer applies.
---
 include/qemu/xxhash.h     | 82 +++++++++++++++++++++++++++++++++++++++
 target/arm/cpu.h          | 15 +++++--
 target/arm/pauth_helper.c | 41 +++++++++++++++++---
 3 files changed, 129 insertions(+), 9 deletions(-)

-- 
2.25.1
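
For context, the scheme this patch implements reduces to a fixed-length XXH64
fold over four 64-bit inputs (pointer, modifier, and the two key halves): one
round per accumulator, a merge, then the avalanche. The standalone sketch below
only mirrors that shape for readers who want to experiment outside QEMU; it is
not the patch code. The lower-case helper names, the rol64 definition, the seed
value, and the main() driver are illustrative assumptions (QEMU supplies rol64
and its own QEMU_XXHASH_SEED).

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PRIME64_1   0x9E3779B185EBCA87ULL
    #define PRIME64_2   0xC2B2AE3D27D4EB4FULL
    #define PRIME64_3   0x165667B19E3779F9ULL
    #define PRIME64_4   0x85EBCA77C2B2AE63ULL
    #define PRIME64_5   0x27D4EB2F165667C5ULL

    /* Rotate left by a nonzero count (QEMU has this in qemu/bitops.h). */
    static uint64_t rol64(uint64_t x, unsigned n)
    {
        return (x << n) | (x >> (64 - n));
    }

    static uint64_t xxh64_round(uint64_t acc, uint64_t input)
    {
        return rol64(acc + input * PRIME64_2, 31) * PRIME64_1;
    }

    static uint64_t xxh64_mergeround(uint64_t acc, uint64_t val)
    {
        return (acc ^ xxh64_round(0, val)) * PRIME64_1 + PRIME64_4;
    }

    static uint64_t xxh64_avalanche(uint64_t h64)
    {
        h64 ^= h64 >> 33;
        h64 *= PRIME64_2;
        h64 ^= h64 >> 29;
        h64 *= PRIME64_3;
        h64 ^= h64 >> 32;
        return h64;
    }

    /*
     * Fold four 64-bit words with a fixed seed: one XXH64 round per
     * accumulator, merge the accumulators, then avalanche.
     */
    static uint64_t hash4x64(uint64_t seed, uint64_t a, uint64_t b,
                             uint64_t c, uint64_t d)
    {
        uint64_t v1 = xxh64_round(seed + PRIME64_1 + PRIME64_2, a);
        uint64_t v2 = xxh64_round(seed + PRIME64_2, b);
        uint64_t v3 = xxh64_round(seed + 0, c);
        uint64_t v4 = xxh64_round(seed - PRIME64_1, d);
        uint64_t h64;

        h64 = rol64(v1, 1) + rol64(v2, 7) + rol64(v3, 12) + rol64(v4, 18);
        h64 = xxh64_mergeround(h64, v1);
        h64 = xxh64_mergeround(h64, v2);
        h64 = xxh64_mergeround(h64, v3);
        h64 = xxh64_mergeround(h64, v4);

        return xxh64_avalanche(h64);
    }

    int main(void)
    {
        /* ptr, modifier, key.lo, key.hi -- arbitrary illustrative values. */
        printf("%016" PRIx64 "\n",
               hash4x64(1, 0x1234, 0x5678, 0x9abc, 0xdef0));
        return 0;
    }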

Comments

Andrew Jones Aug. 14, 2020, 9:26 a.m. UTC | #1
On Thu, Aug 13, 2020 at 01:02:42PM -0700, Richard Henderson wrote:
> Without hardware acceleration, a cryptographically strong
> algorithm is too expensive for pauth_computepac.
> 
> Even with hardware accel, we are not currently expecting
> to link the linux-user binaries to any crypto libraries,
> and doing so would generally make the --static build fail.
> 
> So choose XXH64 as a reasonably quick and decent hash.
> 
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
> v2: Move the XXH64 bits to xxhash.h (ajb).
>     Create isar_feature_aa64_pauth_arch and fixup a comment
>     in isar_feature_aa64_pauth that no longer applies.
> ---
>  include/qemu/xxhash.h     | 82 +++++++++++++++++++++++++++++++++++++++
>  target/arm/cpu.h          | 15 +++++--
>  target/arm/pauth_helper.c | 41 +++++++++++++++++---
>  3 files changed, 129 insertions(+), 9 deletions(-)
> 
> diff --git a/include/qemu/xxhash.h b/include/qemu/xxhash.h
> index 076f1f6054..93ba1a0425 100644
> --- a/include/qemu/xxhash.h
> +++ b/include/qemu/xxhash.h
> @@ -119,4 +119,86 @@ static inline uint32_t qemu_xxhash6(uint64_t ab, uint64_t cd, uint32_t e,
>      return qemu_xxhash7(ab, cd, e, f, 0);
>  }
>  
> +/*
> + * Component parts of the XXH64 algorithm from
> + * https://github.com/Cyan4973/xxHash/blob/v0.8.0/xxhash.h
> + *
> + * The complete algorithm looks like
> + *
> + *  i = 0;
> + *  if (len >= 32) {
> + *      v1 = seed + PRIME64_1 + PRIME64_2;
> + *      v2 = seed + PRIME64_2;
> + *      v3 = seed + 0;
> + *      v4 = seed - XXH_PRIME64_1;
> + *      do {
> + *          v1 = XXH64_round(v1, get64bits(input + i));
> + *          v2 = XXH64_round(v2, get64bits(input + i + 8));
> + *          v3 = XXH64_round(v3, get64bits(input + i + 16));
> + *          v4 = XXH64_round(v4, get64bits(input + i + 24));
> + *      } while ((i += 32) <= len);
> + *      h64 = XXH64_mergerounds(v1, v2, v3, v4);
> + *  } else {
> + *      h64 = seed + PRIME64_5;
> + *  }
> + *  h64 += len;
> + *
> + *  for (; i + 8 <= len; i += 8) {
> + *      h64 ^= XXH64_round(0, get64bits(input + i));
> + *      h64 = rol64(h64, 27) * PRIME64_1 + PRIME64_4;
> + *  }
> + *  for (; i + 4 <= len; i += 4) {
> + *      h64 ^= get32bits(input + i) * PRIME64_1;
> + *      h64 = rol64(h64, 23) * PRIME64_2 + PRIME64_3;
> + *  }
> + *  for (; i < len; i += 1) {
> + *      h64 ^= get8bits(input + i) * PRIME64_5;
> + *      h64 = rol64(h64, 11) * PRIME64_1;
> + *  }
> + *
> + *  return XXH64_avalanche(h64)
> + *
> + * Exposing the pieces instead allows for simplified usage when
> + * the length is a known constant and the inputs are in registers.
> + */
> +#define PRIME64_1   0x9E3779B185EBCA87ULL
> +#define PRIME64_2   0xC2B2AE3D27D4EB4FULL
> +#define PRIME64_3   0x165667B19E3779F9ULL
> +#define PRIME64_4   0x85EBCA77C2B2AE63ULL
> +#define PRIME64_5   0x27D4EB2F165667C5ULL
> +
> +static inline uint64_t XXH64_round(uint64_t acc, uint64_t input)
> +{
> +    return rol64(acc + input * PRIME64_2, 31) * PRIME64_1;
> +}
> +
> +static inline uint64_t XXH64_mergeround(uint64_t acc, uint64_t val)
> +{
> +    return (acc ^ XXH64_round(0, val)) * PRIME64_1 + PRIME64_4;
> +}
> +
> +static inline uint64_t XXH64_mergerounds(uint64_t v1, uint64_t v2,
> +                                         uint64_t v3, uint64_t v4)
> +{
> +    uint64_t h64;
> +
> +    h64 = rol64(v1, 1) + rol64(v2, 7) + rol64(v3, 12) + rol64(v4, 18);
> +    h64 = XXH64_mergeround(h64, v1);
> +    h64 = XXH64_mergeround(h64, v2);
> +    h64 = XXH64_mergeround(h64, v3);
> +    h64 = XXH64_mergeround(h64, v4);
> +
> +    return h64;
> +}
> +
> +static inline uint64_t XXH64_avalanche(uint64_t h64)
> +{
> +    h64 ^= h64 >> 33;
> +    h64 *= PRIME64_2;
> +    h64 ^= h64 >> 29;
> +    h64 *= PRIME64_3;
> +    h64 ^= h64 >> 32;
> +    return h64;
> +}
> +
>  #endif /* QEMU_XXHASH_H */
> diff --git a/target/arm/cpu.h b/target/arm/cpu.h
> index 44901923c8..776bf30cbc 100644
> --- a/target/arm/cpu.h
> +++ b/target/arm/cpu.h
> @@ -3767,10 +3767,8 @@ static inline bool isar_feature_aa64_fcma(const ARMISARegisters *id)
>  static inline bool isar_feature_aa64_pauth(const ARMISARegisters *id)
>  {
>      /*
> -     * Note that while QEMU will only implement the architected algorithm
> -     * QARMA, and thus APA+GPA, the host cpu for kvm may use implementation
> -     * defined algorithms, and thus API+GPI, and this predicate controls
> -     * migration of the 128-bit keys.
> +     * Return true if any form of pauth is enabled, as this
> +     * predicate controls migration of the 128-bit keys.
>       */
>      return (id->id_aa64isar1 &
>              (FIELD_DP64(0, ID_AA64ISAR1, APA, 0xf) |
> @@ -3779,6 +3777,15 @@ static inline bool isar_feature_aa64_pauth(const ARMISARegisters *id)
>               FIELD_DP64(0, ID_AA64ISAR1, GPI, 0xf))) != 0;
>  }
>  
> +static inline bool isar_feature_aa64_pauth_arch(const ARMISARegisters *id)
> +{
> +    /*
> +     * Return true if pauth is enabled with the architected QARMA algorithm.
> +     * QEMU will always set APA+GPA to the same value.
> +     */
> +    return FIELD_EX64(id->id_aa64isar1, ID_AA64ISAR1, APA) != 0;
> +}
> +
> +
>  static inline bool isar_feature_aa64_sb(const ARMISARegisters *id)
>  {
>      return FIELD_EX64(id->id_aa64isar1, ID_AA64ISAR1, SB) != 0;
> diff --git a/target/arm/pauth_helper.c b/target/arm/pauth_helper.c
> index 6dbab03768..6ec4f83ff0 100644
> --- a/target/arm/pauth_helper.c
> +++ b/target/arm/pauth_helper.c
> @@ -24,6 +24,7 @@
>  #include "exec/cpu_ldst.h"
>  #include "exec/helper-proto.h"
>  #include "tcg/tcg-gvec-desc.h"
> +#include "qemu/xxhash.h"
>  
>  
>  static uint64_t pac_cell_shuffle(uint64_t i)
> @@ -207,8 +208,8 @@ static uint64_t tweak_inv_shuffle(uint64_t i)
>      return o;
>  }
>  
> -static uint64_t pauth_computepac(uint64_t data, uint64_t modifier,
> -                                 ARMPACKey key)
> +static uint64_t __attribute__((noinline))
> +pauth_computepac_architected(uint64_t data, uint64_t modifier, ARMPACKey key)
>  {
>      static const uint64_t RC[5] = {
>          0x0000000000000000ull,
> @@ -272,6 +273,36 @@ static uint64_t pauth_computepac(uint64_t data, uint64_t modifier,
>      return workingval;
>  }
>  
> +/*
> + * The XXH64 algorithm from
> + * https://github.com/Cyan4973/xxHash/blob/v0.8.0/xxhash.h
> + */
> +static uint64_t __attribute__((noinline))
> +pauth_computepac_impdef(uint64_t data, uint64_t modifier, ARMPACKey key)

Out of curiosity, why do we need to make these computepac functions
noinline?

> +{
> +    uint64_t v1 = QEMU_XXHASH_SEED + PRIME64_1 + PRIME64_2;
> +    uint64_t v2 = QEMU_XXHASH_SEED + PRIME64_2;
> +    uint64_t v3 = QEMU_XXHASH_SEED + 0;
> +    uint64_t v4 = QEMU_XXHASH_SEED - PRIME64_1;
> +
> +    v1 = XXH64_round(v1, data);
> +    v2 = XXH64_round(v2, modifier);
> +    v3 = XXH64_round(v3, key.lo);
> +    v4 = XXH64_round(v4, key.hi);
> +
> +    return XXH64_avalanche(XXH64_mergerounds(v1, v2, v3, v4));
> +}
> +
> +static uint64_t pauth_computepac(CPUARMState *env, uint64_t data,
> +                                 uint64_t modifier, ARMPACKey key)
> +{
> +    if (cpu_isar_feature(aa64_pauth_arch, env_archcpu(env))) {
> +        return pauth_computepac_architected(data, modifier, key);
> +    } else {
> +        return pauth_computepac_impdef(data, modifier, key);
> +    }
> +}

I think this patch should come before the last one. As it stands, when
bisecting between the last one and this one a user could attempt to
enable pauth-impdef, but it wouldn't do anything, or it would potentially
break things. However, this patch shouldn't change anything if it comes
first.

> +
>  static uint64_t pauth_addpac(CPUARMState *env, uint64_t ptr, uint64_t modifier,
>                               ARMPACKey *key, bool data)
>  {
> @@ -292,7 +323,7 @@ static uint64_t pauth_addpac(CPUARMState *env, uint64_t ptr, uint64_t modifier,
>      bot_bit = 64 - param.tsz;
>      ext_ptr = deposit64(ptr, bot_bit, top_bit - bot_bit, ext);
>  
> -    pac = pauth_computepac(ext_ptr, modifier, *key);
> +    pac = pauth_computepac(env, ext_ptr, modifier, *key);
>  
>      /*
>       * Check if the ptr has good extension bits and corrupt the
> @@ -341,7 +372,7 @@ static uint64_t pauth_auth(CPUARMState *env, uint64_t ptr, uint64_t modifier,
>      uint64_t pac, orig_ptr, test;
>  
>      orig_ptr = pauth_original_ptr(ptr, param);
> -    pac = pauth_computepac(orig_ptr, modifier, *key);
> +    pac = pauth_computepac(env, orig_ptr, modifier, *key);
>      bot_bit = 64 - param.tsz;
>      top_bit = 64 - 8 * param.tbi;
>  
> @@ -442,7 +473,7 @@ uint64_t HELPER(pacga)(CPUARMState *env, uint64_t x, uint64_t y)
>      uint64_t pac;
>  
>      pauth_check_trap(env, arm_current_el(env), GETPC());
> -    pac = pauth_computepac(x, y, env->keys.apga);
> +    pac = pauth_computepac(env, x, y, env->keys.apga);
>  
>      return pac & 0xffffffff00000000ull;
>  }
> -- 
> 2.25.1
> 
>

Thanks,
drew
Richard Henderson Aug. 14, 2020, 4:08 p.m. UTC | #2
On 8/14/20 2:26 AM, Andrew Jones wrote:
>> +static uint64_t __attribute__((noinline))
>> +pauth_computepac_impdef(uint64_t data, uint64_t modifier, ARMPACKey key)
> 
> Out of curiosity, why do we need to make these computepac functions
> noinline?

Oh, heh.  Left over from profiling.  Will remove.

> I think this patch should come before the last one. As it stands, when
> bisecting between the last one and this one a user could attempt to
> enable pauth-impdef, but it wouldn't do anything, or it would potentially
> break things. However, this patch shouldn't change anything if it comes
> first.

The current patch ordering would enable impdef but implement that with the
architected algorithm.  Which is ok.

But you're right that the other ordering makes more sense.


r~
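
As an aside for anyone trying this out: with the rest of the series applied,
the implementation-defined algorithm is opt-in per CPU, so a TCG invocation
would look roughly like

    qemu-system-aarch64 -M virt -cpu max,pauth-impdef=on ...

The property name is taken from the discussion above; its exact spelling and
default are defined by the other patches in the series, not by this one.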

Patch

diff --git a/include/qemu/xxhash.h b/include/qemu/xxhash.h
index 076f1f6054..93ba1a0425 100644
--- a/include/qemu/xxhash.h
+++ b/include/qemu/xxhash.h
@@ -119,4 +119,86 @@ static inline uint32_t qemu_xxhash6(uint64_t ab, uint64_t cd, uint32_t e,
     return qemu_xxhash7(ab, cd, e, f, 0);
 }
 
+/*
+ * Component parts of the XXH64 algorithm from
+ * https://github.com/Cyan4973/xxHash/blob/v0.8.0/xxhash.h
+ *
+ * The complete algorithm looks like
+ *
+ *  i = 0;
+ *  if (len >= 32) {
+ *      v1 = seed + PRIME64_1 + PRIME64_2;
+ *      v2 = seed + PRIME64_2;
+ *      v3 = seed + 0;
+ *      v4 = seed - XXH_PRIME64_1;
+ *      do {
+ *          v1 = XXH64_round(v1, get64bits(input + i));
+ *          v2 = XXH64_round(v2, get64bits(input + i + 8));
+ *          v3 = XXH64_round(v3, get64bits(input + i + 16));
+ *          v4 = XXH64_round(v4, get64bits(input + i + 24));
+ *      } while ((i += 32) <= len);
+ *      h64 = XXH64_mergerounds(v1, v2, v3, v4);
+ *  } else {
+ *      h64 = seed + PRIME64_5;
+ *  }
+ *  h64 += len;
+ *
+ *  for (; i + 8 <= len; i += 8) {
+ *      h64 ^= XXH64_round(0, get64bits(input + i));
+ *      h64 = rol64(h64, 27) * PRIME64_1 + PRIME64_4;
+ *  }
+ *  for (; i + 4 <= len; i += 4) {
+ *      h64 ^= get32bits(input + i) * PRIME64_1;
+ *      h64 = rol64(h64, 23) * PRIME64_2 + PRIME64_3;
+ *  }
+ *  for (; i < len; i += 1) {
+ *      h64 ^= get8bits(input + i) * PRIME64_5;
+ *      h64 = rol64(h64, 11) * PRIME64_1;
+ *  }
+ *
+ *  return XXH64_avalanche(h64)
+ *
+ * Exposing the pieces instead allows for simplified usage when
+ * the length is a known constant and the inputs are in registers.
+ */
+#define PRIME64_1   0x9E3779B185EBCA87ULL
+#define PRIME64_2   0xC2B2AE3D27D4EB4FULL
+#define PRIME64_3   0x165667B19E3779F9ULL
+#define PRIME64_4   0x85EBCA77C2B2AE63ULL
+#define PRIME64_5   0x27D4EB2F165667C5ULL
+
+static inline uint64_t XXH64_round(uint64_t acc, uint64_t input)
+{
+    return rol64(acc + input * PRIME64_2, 31) * PRIME64_1;
+}
+
+static inline uint64_t XXH64_mergeround(uint64_t acc, uint64_t val)
+{
+    return (acc ^ XXH64_round(0, val)) * PRIME64_1 + PRIME64_4;
+}
+
+static inline uint64_t XXH64_mergerounds(uint64_t v1, uint64_t v2,
+                                         uint64_t v3, uint64_t v4)
+{
+    uint64_t h64;
+
+    h64 = rol64(v1, 1) + rol64(v2, 7) + rol64(v3, 12) + rol64(v4, 18);
+    h64 = XXH64_mergeround(h64, v1);
+    h64 = XXH64_mergeround(h64, v2);
+    h64 = XXH64_mergeround(h64, v3);
+    h64 = XXH64_mergeround(h64, v4);
+
+    return h64;
+}
+
+static inline uint64_t XXH64_avalanche(uint64_t h64)
+{
+    h64 ^= h64 >> 33;
+    h64 *= PRIME64_2;
+    h64 ^= h64 >> 29;
+    h64 *= PRIME64_3;
+    h64 ^= h64 >> 32;
+    return h64;
+}
+
 #endif /* QEMU_XXHASH_H */
diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index 44901923c8..776bf30cbc 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -3767,10 +3767,8 @@ static inline bool isar_feature_aa64_fcma(const ARMISARegisters *id)
 static inline bool isar_feature_aa64_pauth(const ARMISARegisters *id)
 {
     /*
-     * Note that while QEMU will only implement the architected algorithm
-     * QARMA, and thus APA+GPA, the host cpu for kvm may use implementation
-     * defined algorithms, and thus API+GPI, and this predicate controls
-     * migration of the 128-bit keys.
+     * Return true if any form of pauth is enabled, as this
+     * predicate controls migration of the 128-bit keys.
      */
     return (id->id_aa64isar1 &
             (FIELD_DP64(0, ID_AA64ISAR1, APA, 0xf) |
@@ -3779,6 +3777,15 @@ static inline bool isar_feature_aa64_pauth(const ARMISARegisters *id)
              FIELD_DP64(0, ID_AA64ISAR1, GPI, 0xf))) != 0;
 }
 
+static inline bool isar_feature_aa64_pauth_arch(const ARMISARegisters *id)
+{
+    /*
+     * Return true if pauth is enabled with the architected QARMA algorithm.
+     * QEMU will always set APA+GPA to the same value.
+     */
+    return FIELD_EX64(id->id_aa64isar1, ID_AA64ISAR1, APA) != 0;
+}
+
 static inline bool isar_feature_aa64_sb(const ARMISARegisters *id)
 {
     return FIELD_EX64(id->id_aa64isar1, ID_AA64ISAR1, SB) != 0;
diff --git a/target/arm/pauth_helper.c b/target/arm/pauth_helper.c
index 6dbab03768..6ec4f83ff0 100644
--- a/target/arm/pauth_helper.c
+++ b/target/arm/pauth_helper.c
@@ -24,6 +24,7 @@ 
 #include "exec/cpu_ldst.h"
 #include "exec/helper-proto.h"
 #include "tcg/tcg-gvec-desc.h"
+#include "qemu/xxhash.h"
 
 
 static uint64_t pac_cell_shuffle(uint64_t i)
@@ -207,8 +208,8 @@ static uint64_t tweak_inv_shuffle(uint64_t i)
     return o;
 }
 
-static uint64_t pauth_computepac(uint64_t data, uint64_t modifier,
-                                 ARMPACKey key)
+static uint64_t __attribute__((noinline))
+pauth_computepac_architected(uint64_t data, uint64_t modifier, ARMPACKey key)
 {
     static const uint64_t RC[5] = {
         0x0000000000000000ull,
@@ -272,6 +273,36 @@ static uint64_t pauth_computepac(uint64_t data, uint64_t modifier,
     return workingval;
 }
 
+/*
+ * The XXH64 algorithm from
+ * https://github.com/Cyan4973/xxHash/blob/v0.8.0/xxhash.h
+ */
+static uint64_t __attribute__((noinline))
+pauth_computepac_impdef(uint64_t data, uint64_t modifier, ARMPACKey key)
+{
+    uint64_t v1 = QEMU_XXHASH_SEED + PRIME64_1 + PRIME64_2;
+    uint64_t v2 = QEMU_XXHASH_SEED + PRIME64_2;
+    uint64_t v3 = QEMU_XXHASH_SEED + 0;
+    uint64_t v4 = QEMU_XXHASH_SEED - PRIME64_1;
+
+    v1 = XXH64_round(v1, data);
+    v2 = XXH64_round(v2, modifier);
+    v3 = XXH64_round(v3, key.lo);
+    v4 = XXH64_round(v4, key.hi);
+
+    return XXH64_avalanche(XXH64_mergerounds(v1, v2, v3, v4));
+}
+
+static uint64_t pauth_computepac(CPUARMState *env, uint64_t data,
+                                 uint64_t modifier, ARMPACKey key)
+{
+    if (cpu_isar_feature(aa64_pauth_arch, env_archcpu(env))) {
+        return pauth_computepac_architected(data, modifier, key);
+    } else {
+        return pauth_computepac_impdef(data, modifier, key);
+    }
+}
+
 static uint64_t pauth_addpac(CPUARMState *env, uint64_t ptr, uint64_t modifier,
                              ARMPACKey *key, bool data)
 {
@@ -292,7 +323,7 @@ static uint64_t pauth_addpac(CPUARMState *env, uint64_t ptr, uint64_t modifier,
     bot_bit = 64 - param.tsz;
     ext_ptr = deposit64(ptr, bot_bit, top_bit - bot_bit, ext);
 
-    pac = pauth_computepac(ext_ptr, modifier, *key);
+    pac = pauth_computepac(env, ext_ptr, modifier, *key);
 
     /*
      * Check if the ptr has good extension bits and corrupt the
@@ -341,7 +372,7 @@ static uint64_t pauth_auth(CPUARMState *env, uint64_t ptr, uint64_t modifier,
     uint64_t pac, orig_ptr, test;
 
     orig_ptr = pauth_original_ptr(ptr, param);
-    pac = pauth_computepac(orig_ptr, modifier, *key);
+    pac = pauth_computepac(env, orig_ptr, modifier, *key);
     bot_bit = 64 - param.tsz;
     top_bit = 64 - 8 * param.tbi;
 
@@ -442,7 +473,7 @@ uint64_t HELPER(pacga)(CPUARMState *env, uint64_t x, uint64_t y)
     uint64_t pac;
 
     pauth_check_trap(env, arm_current_el(env), GETPC());
-    pac = pauth_computepac(x, y, env->keys.apga);
+    pac = pauth_computepac(env, x, y, env->keys.apga);
 
     return pac & 0xffffffff00000000ull;
 }