diff mbox series

[07/12] RISC-V: crypto: add Zvkg accelerated GCM GHASH implementation

Message ID 20231025183644.8735-8-jerry.shih@sifive.com
State New
Headers show
Series RISC-V: provide some accelerated cryptography implementations using vector extensions | expand

Commit Message

Jerry Shih Oct. 25, 2023, 6:36 p.m. UTC
Add a gcm hash implementation using the Zvkg extension from OpenSSL
(openssl/openssl#21923).

The perlasm here is different from the original implementation in OpenSSL.
The OpenSSL assumes that the H is stored in little-endian. Thus, it needs
to convert the H to big-endian for Zvkg instructions. In kernel, we have
the big-endian H directly. There is no need for endian conversion.

Co-developed-by: Christoph Müllner <christoph.muellner@vrull.eu>
Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu>
Co-developed-by: Heiko Stuebner <heiko.stuebner@vrull.eu>
Signed-off-by: Heiko Stuebner <heiko.stuebner@vrull.eu>
Signed-off-by: Jerry Shih <jerry.shih@sifive.com>
---
 arch/riscv/crypto/Kconfig               |  14 ++
 arch/riscv/crypto/Makefile              |   7 +
 arch/riscv/crypto/ghash-riscv64-glue.c  | 191 ++++++++++++++++++++++++
 arch/riscv/crypto/ghash-riscv64-zvkg.pl | 100 +++++++++++++
 4 files changed, 312 insertions(+)
 create mode 100644 arch/riscv/crypto/ghash-riscv64-glue.c
 create mode 100644 arch/riscv/crypto/ghash-riscv64-zvkg.pl

Comments

Eric Biggers Nov. 22, 2023, 1:42 a.m. UTC | #1
On Thu, Oct 26, 2023 at 02:36:39AM +0800, Jerry Shih wrote:
> +struct riscv64_ghash_context {
> +	be128 key;
> +};
> +
> +struct riscv64_ghash_desc_ctx {
> +	be128 shash;
> +	u8 buffer[GHASH_BLOCK_SIZE];
> +	u32 bytes;
> +};

I recommend calling the first struct 'riscv64_ghash_tfm_ctx', and making the
pointers to it be named 'tctx'.  That would more clearly distinguish it from the
desc_ctx / dctx.

> +
> +typedef void (*ghash_func)(be128 *Xi, const be128 *H, const u8 *inp,
> +			   size_t len);
> +
> +static inline void ghash_blocks(const struct riscv64_ghash_context *ctx,
> +				struct riscv64_ghash_desc_ctx *dctx,
> +				const u8 *src, size_t srclen, ghash_func func)
> +	if (crypto_simd_usable()) {
> +		kernel_vector_begin();
> +		func(&dctx->shash, &ctx->key, src, srclen);
> +		kernel_vector_end();

The indirection to ghash_func is unnecessary, since the only value is
gcm_ghash_rv64i_zvkg.

This also means that ghash_update() should be folded into ghash_update_zvkg(),
and ghash_final() into ghash_final_zvkg().

> +	} else {
> +		while (srclen >= GHASH_BLOCK_SIZE) {
> +			crypto_xor((u8 *)&dctx->shash, src, GHASH_BLOCK_SIZE);
> +			gf128mul_lle(&dctx->shash, &ctx->key);
> +			srclen -= GHASH_BLOCK_SIZE;
> +			src += GHASH_BLOCK_SIZE;
> +		}
> +	}

The assembly code uses the equivalent of the following do-while loop instead:

        do {
                srclen -= GHASH_BLOCK_SIZE;
        } while (srclen);

I.e., it assumes the length here is nonzero and a multiple of 16, which it is.

To avoid confusion, I recommend making the C code use the same do-while loop.


>        const struct riscv64_ghash_context *ctx =
>               crypto_tfm_ctx(crypto_shash_tfm(desc->tfm));

crypto_tfm_ctx(crypto_shash_tfm(tfm)) should be crypto_shash_ctx(tfm)

> +static int ghash_final(struct shash_desc *desc, u8 *out, ghash_func func)
> +{
> +	const struct riscv64_ghash_context *ctx =
> +		crypto_tfm_ctx(crypto_shash_tfm(desc->tfm));
> +	struct riscv64_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
> +	int i;
> +
> +	if (dctx->bytes) {
> +		for (i = dctx->bytes; i < GHASH_BLOCK_SIZE; i++)
> +			dctx->buffer[i] = 0;
> +
> +		ghash_blocks(ctx, dctx, dctx->buffer, GHASH_BLOCK_SIZE, func);
> +		dctx->bytes = 0;
> +	}
> +

Setting dctx->bytes above is unnecessary.

> +static int ghash_init(struct shash_desc *desc)
> +{
> +	struct riscv64_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
> +
> +	*dctx = (struct riscv64_ghash_desc_ctx){};
> +
> +	return 0;
> +}
> +
> +static int ghash_update_zvkg(struct shash_desc *desc, const u8 *src,
> +			     unsigned int srclen)
> +{
> +	return ghash_update(desc, src, srclen, gcm_ghash_rv64i_zvkg);
> +}
> +
> +static int ghash_final_zvkg(struct shash_desc *desc, u8 *out)
> +{
> +	return ghash_final(desc, out, gcm_ghash_rv64i_zvkg);
> +}
> +
> +static int ghash_setkey(struct crypto_shash *tfm, const u8 *key,
> +			unsigned int keylen)
> +{
> +	struct riscv64_ghash_context *ctx =
> +		crypto_tfm_ctx(crypto_shash_tfm(tfm));
> +
> +	if (keylen != GHASH_BLOCK_SIZE)
> +		return -EINVAL;
> +
> +	memcpy(&ctx->key, key, GHASH_BLOCK_SIZE);
> +
> +	return 0;
> +}
> +
> +static struct shash_alg riscv64_ghash_alg_zvkg = {
> +	.digestsize = GHASH_DIGEST_SIZE,
> +	.init = ghash_init,
> +	.update = ghash_update_zvkg,
> +	.final = ghash_final_zvkg,
> +	.setkey = ghash_setkey,

IMO it's helpful to order the shash functions as follows, both in their
definitions and their fields in struct shash_alg:

    setkey
    init
    update
    final

That matches the order in which they're called.

- Eric
Jerry Shih Nov. 27, 2023, 2:49 a.m. UTC | #2
On Nov 22, 2023, at 09:42, Eric Biggers <ebiggers@kernel.org> wrote:
> On Thu, Oct 26, 2023 at 02:36:39AM +0800, Jerry Shih wrote:
>> +struct riscv64_ghash_context {
>> +	be128 key;
>> +};
>> +
>> +struct riscv64_ghash_desc_ctx {
>> +	be128 shash;
>> +	u8 buffer[GHASH_BLOCK_SIZE];
>> +	u32 bytes;
>> +};
> 
> I recommend calling the first struct 'riscv64_ghash_tfm_ctx', and making the
> pointers to it be named 'tctx'.  That would more clearly distinguish it from the
> desc_ctx / dctx.

Fixed.

>> +
>> +typedef void (*ghash_func)(be128 *Xi, const be128 *H, const u8 *inp,
>> +			   size_t len);
>> +
>> +static inline void ghash_blocks(const struct riscv64_ghash_context *ctx,
>> +				struct riscv64_ghash_desc_ctx *dctx,
>> +				const u8 *src, size_t srclen, ghash_func func)
>> +	if (crypto_simd_usable()) {
>> +		kernel_vector_begin();
>> +		func(&dctx->shash, &ctx->key, src, srclen);
>> +		kernel_vector_end();
> 
> The indirection to ghash_func is unnecessary, since the only value is
> gcm_ghash_rv64i_zvkg.
> 
> This also means that ghash_update() should be folded into ghash_update_zvkg(),
> and ghash_final() into ghash_final_zvkg().

Fixed. The `gcm_ghash_rv64i_zvkg()` is folded into `ghash_update_zvkg()` and
`ghash_final_zvkg()`.

>> +	} else {
>> +		while (srclen >= GHASH_BLOCK_SIZE) {
>> +			crypto_xor((u8 *)&dctx->shash, src, GHASH_BLOCK_SIZE);
>> +			gf128mul_lle(&dctx->shash, &ctx->key);
>> +			srclen -= GHASH_BLOCK_SIZE;
>> +			src += GHASH_BLOCK_SIZE;
>> +		}
>> +	}
> 
> The assembly code uses the equivalent of the following do-while loop instead:
> 
>        do {
>                srclen -= GHASH_BLOCK_SIZE;
>        } while (srclen);
> 
> I.e., it assumes the length here is nonzero and a multiple of 16, which it is.
> 
> To avoid confusion, I recommend making the C code use the same do-while loop.

Fixed.

>>       const struct riscv64_ghash_context *ctx =
>>              crypto_tfm_ctx(crypto_shash_tfm(desc->tfm));
> 
> crypto_tfm_ctx(crypto_shash_tfm(tfm)) should be crypto_shash_ctx(tfm)

Fixed.
But the original code do the same thing.

>> +static int ghash_final(struct shash_desc *desc, u8 *out, ghash_func func)
>> +{
>> +	const struct riscv64_ghash_context *ctx =
>> +		crypto_tfm_ctx(crypto_shash_tfm(desc->tfm));
>> +	struct riscv64_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
>> +	int i;
>> +
>> +	if (dctx->bytes) {
>> +		for (i = dctx->bytes; i < GHASH_BLOCK_SIZE; i++)
>> +			dctx->buffer[i] = 0;
>> +
>> +		ghash_blocks(ctx, dctx, dctx->buffer, GHASH_BLOCK_SIZE, func);
>> +		dctx->bytes = 0;
>> +	}
>> +
> 
> Setting dctx->bytes above is unnecessary.

Fixed.

>> +static int ghash_init(struct shash_desc *desc)
>> +{
>> +	struct riscv64_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
>> +
>> +	*dctx = (struct riscv64_ghash_desc_ctx){};
>> +
>> +	return 0;
>> +}
>> +
>> +static int ghash_update_zvkg(struct shash_desc *desc, const u8 *src,
>> +			     unsigned int srclen)
>> +{
>> +	return ghash_update(desc, src, srclen, gcm_ghash_rv64i_zvkg);
>> +}
>> +
>> +static int ghash_final_zvkg(struct shash_desc *desc, u8 *out)
>> +{
>> +	return ghash_final(desc, out, gcm_ghash_rv64i_zvkg);
>> +}
>> +
>> +static int ghash_setkey(struct crypto_shash *tfm, const u8 *key,
>> +			unsigned int keylen)
>> +{
>> +	struct riscv64_ghash_context *ctx =
>> +		crypto_tfm_ctx(crypto_shash_tfm(tfm));
>> +
>> +	if (keylen != GHASH_BLOCK_SIZE)
>> +		return -EINVAL;
>> +
>> +	memcpy(&ctx->key, key, GHASH_BLOCK_SIZE);
>> +
>> +	return 0;
>> +}
>> +
>> +static struct shash_alg riscv64_ghash_alg_zvkg = {
>> +	.digestsize = GHASH_DIGEST_SIZE,
>> +	.init = ghash_init,
>> +	.update = ghash_update_zvkg,
>> +	.final = ghash_final_zvkg,
>> +	.setkey = ghash_setkey,
> 
> IMO it's helpful to order the shash functions as follows, both in their
> definitions and their fields in struct shash_alg:
> 
>    setkey
>    init
>    update
>    final
> 
> That matches the order in which they're called.

I have different opinion. I reorder the initialization in the order declared.
That will help us to check whether the function/member is missed.

> - Eric


-Jerry
diff mbox series

Patch

diff --git a/arch/riscv/crypto/Kconfig b/arch/riscv/crypto/Kconfig
index dfa9d0146d26..00be7177eb1e 100644
--- a/arch/riscv/crypto/Kconfig
+++ b/arch/riscv/crypto/Kconfig
@@ -35,4 +35,18 @@  config CRYPTO_AES_BLOCK_RISCV64
 	  - Zvkg vector crypto extension (XTS)
 	  - Zvkned vector crypto extension
 
+config CRYPTO_GHASH_RISCV64
+	default y if RISCV_ISA_V
+	tristate "Hash functions: GHASH"
+	depends on 64BIT && RISCV_ISA_V
+	select CRYPTO_GCM
+	select CRYPTO_GHASH
+	select CRYPTO_HASH
+	select CRYPTO_LIB_GF128MUL
+	help
+	  GCM GHASH function (NIST SP 800-38D)
+
+	  Architecture: riscv64 using:
+	  - Zvkg vector crypto extension
+
 endmenu
diff --git a/arch/riscv/crypto/Makefile b/arch/riscv/crypto/Makefile
index 42a4e8ec79cf..532316cc1758 100644
--- a/arch/riscv/crypto/Makefile
+++ b/arch/riscv/crypto/Makefile
@@ -9,6 +9,9 @@  aes-riscv64-y := aes-riscv64-glue.o aes-riscv64-zvkned.o
 obj-$(CONFIG_CRYPTO_AES_BLOCK_RISCV64) += aes-block-riscv64.o
 aes-block-riscv64-y := aes-riscv64-block-mode-glue.o aes-riscv64-zvbb-zvkg-zvkned.o aes-riscv64-zvkb-zvkned.o
 
+obj-$(CONFIG_CRYPTO_GHASH_RISCV64) += ghash-riscv64.o
+ghash-riscv64-y := ghash-riscv64-glue.o ghash-riscv64-zvkg.o
+
 quiet_cmd_perlasm = PERLASM $@
       cmd_perlasm = $(PERL) $(<) void $(@)
 
@@ -21,6 +24,10 @@  $(obj)/aes-riscv64-zvbb-zvkg-zvkned.S: $(src)/aes-riscv64-zvbb-zvkg-zvkned.pl
 $(obj)/aes-riscv64-zvkb-zvkned.S: $(src)/aes-riscv64-zvkb-zvkned.pl
 	$(call cmd,perlasm)
 
+$(obj)/ghash-riscv64-zvkg.S: $(src)/ghash-riscv64-zvkg.pl
+	$(call cmd,perlasm)
+
 clean-files += aes-riscv64-zvkned.S
 clean-files += aes-riscv64-zvbb-zvkg-zvkned.S
 clean-files += aes-riscv64-zvkb-zvkned.S
+clean-files += ghash-riscv64-zvkg.S
diff --git a/arch/riscv/crypto/ghash-riscv64-glue.c b/arch/riscv/crypto/ghash-riscv64-glue.c
new file mode 100644
index 000000000000..d5b7f0e4f612
--- /dev/null
+++ b/arch/riscv/crypto/ghash-riscv64-glue.c
@@ -0,0 +1,191 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * RISC-V optimized GHASH routines
+ *
+ * Copyright (C) 2023 VRULL GmbH
+ * Author: Heiko Stuebner <heiko.stuebner@vrull.eu>
+ *
+ * Copyright (C) 2023 SiFive, Inc.
+ * Author: Jerry Shih <jerry.shih@sifive.com>
+ */
+
+#include <asm/simd.h>
+#include <asm/vector.h>
+#include <crypto/ghash.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+/* ghash using zvkg vector crypto extension */
+void gcm_ghash_rv64i_zvkg(be128 *Xi, const be128 *H, const u8 *inp, size_t len);
+
+struct riscv64_ghash_context {
+	be128 key;
+};
+
+struct riscv64_ghash_desc_ctx {
+	be128 shash;
+	u8 buffer[GHASH_BLOCK_SIZE];
+	u32 bytes;
+};
+
+typedef void (*ghash_func)(be128 *Xi, const be128 *H, const u8 *inp,
+			   size_t len);
+
+static inline void ghash_blocks(const struct riscv64_ghash_context *ctx,
+				struct riscv64_ghash_desc_ctx *dctx,
+				const u8 *src, size_t srclen, ghash_func func)
+{
+	if (crypto_simd_usable()) {
+		kernel_vector_begin();
+		func(&dctx->shash, &ctx->key, src, srclen);
+		kernel_vector_end();
+	} else {
+		while (srclen >= GHASH_BLOCK_SIZE) {
+			crypto_xor((u8 *)&dctx->shash, src, GHASH_BLOCK_SIZE);
+			gf128mul_lle(&dctx->shash, &ctx->key);
+			srclen -= GHASH_BLOCK_SIZE;
+			src += GHASH_BLOCK_SIZE;
+		}
+	}
+}
+
+static int ghash_update(struct shash_desc *desc, const u8 *src, size_t srclen,
+			ghash_func func)
+{
+	size_t len;
+	const struct riscv64_ghash_context *ctx =
+		crypto_tfm_ctx(crypto_shash_tfm(desc->tfm));
+	struct riscv64_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+
+	if (dctx->bytes) {
+		if (dctx->bytes + srclen < GHASH_BLOCK_SIZE) {
+			memcpy(dctx->buffer + dctx->bytes, src, srclen);
+			dctx->bytes += srclen;
+			return 0;
+		}
+		memcpy(dctx->buffer + dctx->bytes, src,
+		       GHASH_BLOCK_SIZE - dctx->bytes);
+
+		ghash_blocks(ctx, dctx, dctx->buffer, GHASH_BLOCK_SIZE, func);
+
+		src += GHASH_BLOCK_SIZE - dctx->bytes;
+		srclen -= GHASH_BLOCK_SIZE - dctx->bytes;
+		dctx->bytes = 0;
+	}
+	len = srclen & ~(GHASH_BLOCK_SIZE - 1);
+
+	if (len) {
+		ghash_blocks(ctx, dctx, src, len, func);
+		src += len;
+		srclen -= len;
+	}
+
+	if (srclen) {
+		memcpy(dctx->buffer, src, srclen);
+		dctx->bytes = srclen;
+	}
+
+	return 0;
+}
+
+static int ghash_final(struct shash_desc *desc, u8 *out, ghash_func func)
+{
+	const struct riscv64_ghash_context *ctx =
+		crypto_tfm_ctx(crypto_shash_tfm(desc->tfm));
+	struct riscv64_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+	int i;
+
+	if (dctx->bytes) {
+		for (i = dctx->bytes; i < GHASH_BLOCK_SIZE; i++)
+			dctx->buffer[i] = 0;
+
+		ghash_blocks(ctx, dctx, dctx->buffer, GHASH_BLOCK_SIZE, func);
+		dctx->bytes = 0;
+	}
+
+	memcpy(out, &dctx->shash, GHASH_DIGEST_SIZE);
+
+	return 0;
+}
+
+static int ghash_init(struct shash_desc *desc)
+{
+	struct riscv64_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+
+	*dctx = (struct riscv64_ghash_desc_ctx){};
+
+	return 0;
+}
+
+static int ghash_update_zvkg(struct shash_desc *desc, const u8 *src,
+			     unsigned int srclen)
+{
+	return ghash_update(desc, src, srclen, gcm_ghash_rv64i_zvkg);
+}
+
+static int ghash_final_zvkg(struct shash_desc *desc, u8 *out)
+{
+	return ghash_final(desc, out, gcm_ghash_rv64i_zvkg);
+}
+
+static int ghash_setkey(struct crypto_shash *tfm, const u8 *key,
+			unsigned int keylen)
+{
+	struct riscv64_ghash_context *ctx =
+		crypto_tfm_ctx(crypto_shash_tfm(tfm));
+
+	if (keylen != GHASH_BLOCK_SIZE)
+		return -EINVAL;
+
+	memcpy(&ctx->key, key, GHASH_BLOCK_SIZE);
+
+	return 0;
+}
+
+static struct shash_alg riscv64_ghash_alg_zvkg = {
+	.digestsize = GHASH_DIGEST_SIZE,
+	.init = ghash_init,
+	.update = ghash_update_zvkg,
+	.final = ghash_final_zvkg,
+	.setkey = ghash_setkey,
+	.descsize = sizeof(struct riscv64_ghash_desc_ctx),
+	.base = {
+		.cra_name = "ghash",
+		.cra_driver_name = "ghash-riscv64-zvkg",
+		.cra_priority = 303,
+		.cra_blocksize = GHASH_BLOCK_SIZE,
+		.cra_ctxsize = sizeof(struct riscv64_ghash_context),
+		.cra_module = THIS_MODULE,
+	},
+};
+
+static inline bool check_ghash_ext(void)
+{
+	return riscv_isa_extension_available(NULL, ZVKG) &&
+	       riscv_vector_vlen() >= 128;
+}
+
+static int __init riscv64_ghash_mod_init(void)
+{
+	if (check_ghash_ext())
+		return crypto_register_shash(&riscv64_ghash_alg_zvkg);
+
+	return -ENODEV;
+}
+
+static void __exit riscv64_ghash_mod_fini(void)
+{
+	if (check_ghash_ext())
+		crypto_unregister_shash(&riscv64_ghash_alg_zvkg);
+}
+
+module_init(riscv64_ghash_mod_init);
+module_exit(riscv64_ghash_mod_fini);
+
+MODULE_DESCRIPTION("GCM GHASH (RISC-V accelerated)");
+MODULE_AUTHOR("Heiko Stuebner <heiko.stuebner@vrull.eu>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("ghash");
diff --git a/arch/riscv/crypto/ghash-riscv64-zvkg.pl b/arch/riscv/crypto/ghash-riscv64-zvkg.pl
new file mode 100644
index 000000000000..4beea4ac9cbe
--- /dev/null
+++ b/arch/riscv/crypto/ghash-riscv64-zvkg.pl
@@ -0,0 +1,100 @@ 
+#! /usr/bin/env perl
+# SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause
+#
+# This file is dual-licensed, meaning that you can use it under your
+# choice of either of the following two licenses:
+#
+# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You can obtain
+# a copy in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+# or
+#
+# Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
+# Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# - RV64I
+# - RISC-V Vector ('V') with VLEN >= 128
+# - RISC-V Vector GCM/GMAC extension ('Zvkg')
+
+use strict;
+use warnings;
+
+use FindBin qw($Bin);
+use lib "$Bin";
+use lib "$Bin/../../perlasm";
+use riscv;
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+$output and open STDOUT,">$output";
+
+my $code=<<___;
+.text
+___
+
+###############################################################################
+# void gcm_ghash_rv64i_zvkg(be128 *Xi, const be128 *H, const u8 *inp, size_t len)
+#
+# input: Xi: current hash value
+#        H: hash key
+#        inp: pointer to input data
+#        len: length of input data in bytes (multiple of block size)
+# output: Xi: Xi+1 (next hash value Xi)
+{
+my ($Xi,$H,$inp,$len) = ("a0","a1","a2","a3");
+my ($vXi,$vH,$vinp,$Vzero) = ("v1","v2","v3","v4");
+
+$code .= <<___;
+.p2align 3
+.globl gcm_ghash_rv64i_zvkg
+.type gcm_ghash_rv64i_zvkg,\@function
+gcm_ghash_rv64i_zvkg:
+    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
+    @{[vle32_v $vH, $H]}
+    @{[vle32_v $vXi, $Xi]}
+
+Lstep:
+    @{[vle32_v $vinp, $inp]}
+    add $inp, $inp, 16
+    add $len, $len, -16
+    @{[vghsh_vv $vXi, $vH, $vinp]}
+    bnez $len, Lstep
+
+    @{[vse32_v $vXi, $Xi]}
+    ret
+
+.size gcm_ghash_rv64i_zvkg,.-gcm_ghash_rv64i_zvkg
+___
+}
+
+print $code;
+
+close STDOUT or die "error closing STDOUT: $!";