arm64: do_csum: implement accelerated scalar version

Message ID 20190218230842.11448-1-ard.biesheuvel@linaro.org
State New
Headers show
Series
  • arm64: do_csum: implement accelerated scalar version
Related show

Commit Message

Ard Biesheuvel Feb. 18, 2019, 11:08 p.m.
It turns out that the IP checksumming code is still exercised often,
even though one might expect that modern NICs with checksum offload
have no use for it. However, as Lingyan points out, there are
combinations of features where the network stack may still fall back
to software checksumming, and so it makes sense to provide an
optimized implementation in software as well.

So provide an implementation of do_csum() in scalar assembler, which,
unlike C, gives direct access to the carry flag, making the code run
substantially faster. The routine uses overlapping 64 byte loads for
all input sizes > 64 bytes, in order to reduce the number of branches
and improve performance on cores with deep pipelines.

On Cortex-A57, this implementation is on par with Lingyan's NEON
implementation, and roughly 7x as fast as the generic C code.

Cc: "huanglingyan (A)" <huanglingyan2@huawei.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

---
Test code after the patch.

 arch/arm64/include/asm/checksum.h |   3 +
 arch/arm64/lib/Makefile           |   2 +-
 arch/arm64/lib/csum.S             | 127 ++++++++++++++++++++
 3 files changed, 131 insertions(+), 1 deletion(-)

-- 
2.20.1

  diff --git a/lib/checksum.c b/lib/checksum.c
  index d3ec93f9e5f3..7711f1186f71 100644
  --- a/lib/checksum.c
  +++ b/lib/checksum.c
  @@ -37,7 +37,7 @@
   
   #include <asm/byteorder.h>
   
  -#ifndef do_csum
  +#if 1 //ndef do_csum
   static inline unsigned short from32to16(unsigned int x)
   {
          /* add up 16-bit and 16-bit for 16+c bit */
  @@ -47,7 +47,7 @@ static inline unsigned short from32to16(unsigned int x)
          return x;
   }
   
  -static unsigned int do_csum(const unsigned char *buff, int len)
  +static unsigned int __do_csum(const unsigned char *buff, int len)
   {
          int odd;
          unsigned int result = 0;
  @@ -206,3 +206,23 @@ __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
   }
   EXPORT_SYMBOL(csum_tcpudp_nofold);
   #endif
  +
  +extern u8 crypto_ft_tab[];
  +
  +static int __init do_selftest(void)
  +{
  +       int i, j;
  +       u16 c1, c2;
  +
  +       for (i = 0; i < 1024; i++) {
  +               for (j = i + 1; j <= 1024; j++) {
  +                       c1 = __do_csum(crypto_ft_tab + i, j - i);
  +                       c2 = do_csum(crypto_ft_tab + i, j - i);
  +
  +                       if (c1 != c2)
  +                               pr_err("######### %d %d %x %x\n", i, j, c1, c2);
  +               }
  +       }
  +       return 0;
  +}
  +late_initcall(do_selftest);

Comments

Ilias Apalodimas Feb. 19, 2019, 3:08 p.m. | #1
On Tue, Feb 19, 2019 at 12:08:42AM +0100, Ard Biesheuvel wrote:
> It turns out that the IP checksumming code is still exercised often,

> even though one might expect that modern NICs with checksum offload

> have no use for it. However, as Lingyan points out, there are

> combinations of features where the network stack may still fall back

> to software checksumming, and so it makes sense to provide an

> optimized implementation in software as well.

> 

> So provide an implementation of do_csum() in scalar assembler, which,

> unlike C, gives direct access to the carry flag, making the code run

> substantially faster. The routine uses overlapping 64 byte loads for

> all input size > 64 bytes, in order to reduce the number of branches

> and improve performance on cores with deep pipelines.

> 

> On Cortex-A57, this implementation is on par with Lingyan's NEON

> implementation, and roughly 7x as fast as the generic C code.

> 

> Cc: "huanglingyan (A)" <huanglingyan2@huawei.com>

> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

> ---

> Test code after the patch.

> 

>  arch/arm64/include/asm/checksum.h |   3 +

>  arch/arm64/lib/Makefile           |   2 +-

>  arch/arm64/lib/csum.S             | 127 ++++++++++++++++++++

>  3 files changed, 131 insertions(+), 1 deletion(-)

> 

> diff --git a/arch/arm64/include/asm/checksum.h b/arch/arm64/include/asm/checksum.h

> index 0b6f5a7d4027..e906b956c1fc 100644

> --- a/arch/arm64/include/asm/checksum.h

> +++ b/arch/arm64/include/asm/checksum.h

> @@ -46,6 +46,9 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)

>  }

>  #define ip_fast_csum ip_fast_csum

>  

> +extern unsigned int do_csum(const unsigned char *buff, int len);

> +#define do_csum do_csum

> +

>  #include <asm-generic/checksum.h>

>  

>  #endif	/* __ASM_CHECKSUM_H */

> diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile

> index 5540a1638baf..a7606007a749 100644

> --- a/arch/arm64/lib/Makefile

> +++ b/arch/arm64/lib/Makefile

> @@ -3,7 +3,7 @@ lib-y		:= clear_user.o delay.o copy_from_user.o		\

>  		   copy_to_user.o copy_in_user.o copy_page.o		\

>  		   clear_page.o memchr.o memcpy.o memmove.o memset.o	\

>  		   memcmp.o strcmp.o strncmp.o strlen.o strnlen.o	\

> -		   strchr.o strrchr.o tishift.o

> +		   strchr.o strrchr.o tishift.o csum.o

>  

>  ifeq ($(CONFIG_KERNEL_MODE_NEON), y)

>  obj-$(CONFIG_XOR_BLOCKS)	+= xor-neon.o

> diff --git a/arch/arm64/lib/csum.S b/arch/arm64/lib/csum.S

> new file mode 100644

> index 000000000000..534e2ebdc426

> --- /dev/null

> +++ b/arch/arm64/lib/csum.S

> @@ -0,0 +1,127 @@

> +/* SPDX-License-Identifier: GPL-2.0 */

> +/*

> + * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>

> + */

> +

> +#include <linux/linkage.h>

> +#include <asm/assembler.h>

> +

> +ENTRY(do_csum)

> +	adds		x2, xzr, xzr		// clear x2 and C flag

> +

> +	// 64 bytes at a time

> +	lsr		x3, x1, #6

> +	and		x1, x1, #63

> +	cbz		x3, 1f

> +

> +	// Eight 64-bit adds per iteration

> +0:	ldp		x4, x5, [x0], #64

> +	ldp		x6, x7, [x0, #-48]

> +	ldp		x8, x9, [x0, #-32]

> +	ldp		x10, x11, [x0, #-16]

> +	adcs		x2, x2, x4

> +	sub		x3, x3, #1

> +	adcs		x2, x2, x5

> +	adcs		x2, x2, x6

> +	adcs		x2, x2, x7

> +	adcs		x2, x2, x8

> +	adcs		x2, x2, x9

> +	adcs		x2, x2, x10

> +	adcs		x2, x2, x11

> +	cbnz		x3, 0b

> +	adc		x2, x2, xzr

> +

> +	cbz		x1, 7f

> +	bic		x3, x1, #1

> +	add		x12, x0, x1

> +	add		x0, x0, x3

> +	neg		x3, x3

> +	add		x3, x3, #64

> +	lsl		x3, x3, #3

> +

> +	// Handle remaining 63 bytes or less using an overlapping 64-byte load

> +	// and a branchless code path to complete the calculation

> +	ldp		x4, x5, [x0, #-64]

> +	ldp		x6, x7, [x0, #-48]

> +	ldp		x8, x9, [x0, #-32]

> +	ldp		x10, x11, [x0, #-16]

> +	ldrb		w12, [x12, #-1]

> +

> +	.irp		reg, x4, x5, x6, x7, x8, x9, x10, x11

> +	cmp		x3, #64

> +	csel		\reg, \reg, xzr, lt

> +	ccmp		x3, xzr, #0, lt

> +	csel		x13, x3, xzr, gt

> +	sub		x3, x3, #64

> +CPU_LE(	lsr		\reg, \reg, x13		)

> +CPU_BE(	lsl		\reg, \reg, x13		)

> +	.endr

> +

> +	adds		x2, x2, x4

> +	adcs		x2, x2, x5

> +	adcs		x2, x2, x6

> +	adcs		x2, x2, x7

> +	adcs		x2, x2, x8

> +	adcs		x2, x2, x9

> +	adcs		x2, x2, x10

> +	adcs		x2, x2, x11

> +	adc		x2, x2, xzr

> +

> +CPU_LE(	adds		x12, x2, x12		)

> +CPU_BE(	adds		x12, x2, x12, lsl #8	)

> +	adc		x12, x12, xzr

> +	tst		x1, #1

> +	csel		x2, x2, x12, eq

> +

> +7:	lsr		x1, x2, #32

> +	adds		w2, w2, w1

> +	adc		w2, w2, wzr

> +

> +	lsr		w1, w2, #16

> +	uxth		w2, w2

> +	add		w2, w2, w1

> +

> +	lsr		w1, w2, #16		// handle the carry by hand

> +	add		w2, w2, w1

> +

> +	uxth		w0, w2

> +	ret

> +

> +	// Handle 63 bytes or less

> +1:	tbz		x1, #5, 2f

> +	ldp		x4, x5, [x0], #32

> +	ldp		x6, x7, [x0, #-16]

> +	adds		x2, x2, x4

> +	adcs		x2, x2, x5

> +	adcs		x2, x2, x6

> +	adcs		x2, x2, x7

> +	adc		x2, x2, xzr

> +

> +2:	tbz		x1, #4, 3f

> +	ldp		x4, x5, [x0], #16

> +	adds		x2, x2, x4

> +	adcs		x2, x2, x5

> +	adc		x2, x2, xzr

> +

> +3:	tbz		x1, #3, 4f

> +	ldr		x4, [x0], #8

> +	adds		x2, x2, x4

> +	adc		x2, x2, xzr

> +

> +4:	tbz		x1, #2, 5f

> +	ldr		w4, [x0], #4

> +	adds		x2, x2, x4

> +	adc		x2, x2, xzr

> +

> +5:	tbz		x1, #1, 6f

> +	ldrh		w4, [x0], #2

> +	adds		x2, x2, x4

> +	adc		x2, x2, xzr

> +

> +6:	tbz		x1, #0, 7b

> +	ldrb		w4, [x0]

> +CPU_LE(	adds		x2, x2, x4		)

> +CPU_BE(	adds		x2, x2, x4, lsl #8	)

> +	adc		x2, x2, xzr

> +	b		7b

> +ENDPROC(do_csum)

> -- 

> 2.20.1

> 

>   diff --git a/lib/checksum.c b/lib/checksum.c

>   index d3ec93f9e5f3..7711f1186f71 100644

>   --- a/lib/checksum.c

>   +++ b/lib/checksum.c

>   @@ -37,7 +37,7 @@

>    

>    #include <asm/byteorder.h>

>    

>   -#ifndef do_csum

>   +#if 1 //ndef do_csum

>    static inline unsigned short from32to16(unsigned int x)

>    {

>           /* add up 16-bit and 16-bit for 16+c bit */

>   @@ -47,7 +47,7 @@ static inline unsigned short from32to16(unsigned int x)

>           return x;

>    }

>    

>   -static unsigned int do_csum(const unsigned char *buff, int len)

>   +static unsigned int __do_csum(const unsigned char *buff, int len)

>    {

>           int odd;

>           unsigned int result = 0;

>   @@ -206,3 +206,23 @@ __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,

>    }

>    EXPORT_SYMBOL(csum_tcpudp_nofold);

>    #endif

>   +

>   +extern u8 crypto_ft_tab[];

>   +

>   +static int __init do_selftest(void)

>   +{

>   +       int i, j;

>   +       u16 c1, c2;

>   +

>   +       for (i = 0; i < 1024; i++) {

>   +               for (j = i + 1; j <= 1024; j++) {

>   +                       c1 = __do_csum(crypto_ft_tab + i, j - i);

>   +                       c2 = do_csum(crypto_ft_tab + i, j - i);

>   +

>   +                       if (c1 != c2)

>   +                               pr_err("######### %d %d %x %x\n", i, j, c1, c2);

>   +               }

>   +       }

>   +       return 0;

>   +}

>   +late_initcall(do_selftest);



Acked-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Ard Biesheuvel Feb. 28, 2019, 2:16 p.m. | #2
(+ Catalin)

On Tue, 19 Feb 2019 at 16:08, Ilias Apalodimas
<ilias.apalodimas@linaro.org> wrote:
>

> On Tue, Feb 19, 2019 at 12:08:42AM +0100, Ard Biesheuvel wrote:

> > It turns out that the IP checksumming code is still exercised often,

> > even though one might expect that modern NICs with checksum offload

> > have no use for it. However, as Lingyan points out, there are

> > combinations of features where the network stack may still fall back

> > to software checksumming, and so it makes sense to provide an

> > optimized implementation in software as well.

> >

> > So provide an implementation of do_csum() in scalar assembler, which,

> > unlike C, gives direct access to the carry flag, making the code run

> > substantially faster. The routine uses overlapping 64 byte loads for

> > all input size > 64 bytes, in order to reduce the number of branches

> > and improve performance on cores with deep pipelines.

> >

> > On Cortex-A57, this implementation is on par with Lingyan's NEON

> > implementation, and roughly 7x as fast as the generic C code.

> >

> > Cc: "huanglingyan (A)" <huanglingyan2@huawei.com>

> > Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

...
>

> Acked-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>


Full patch here

https://lore.kernel.org/linux-arm-kernel/20190218230842.11448-1-ard.biesheuvel@linaro.org/

This was a follow-up to some discussions about Lingyan's NEON code,
CC'ed to netdev@ so people could chime in as to whether we need
accelerated checksumming code in the first place.
Robin Murphy Feb. 28, 2019, 3:13 p.m. | #3
Hi Ard,

On 28/02/2019 14:16, Ard Biesheuvel wrote:
> (+ Catalin)

> 

> On Tue, 19 Feb 2019 at 16:08, Ilias Apalodimas

> <ilias.apalodimas@linaro.org> wrote:

>>

>> On Tue, Feb 19, 2019 at 12:08:42AM +0100, Ard Biesheuvel wrote:

>>> It turns out that the IP checksumming code is still exercised often,

>>> even though one might expect that modern NICs with checksum offload

>>> have no use for it. However, as Lingyan points out, there are

>>> combinations of features where the network stack may still fall back

>>> to software checksumming, and so it makes sense to provide an

>>> optimized implementation in software as well.

>>>

>>> So provide an implementation of do_csum() in scalar assembler, which,

>>> unlike C, gives direct access to the carry flag, making the code run

>>> substantially faster. The routine uses overlapping 64 byte loads for

>>> all input size > 64 bytes, in order to reduce the number of branches

>>> and improve performance on cores with deep pipelines.

>>>

>>> On Cortex-A57, this implementation is on par with Lingyan's NEON

>>> implementation, and roughly 7x as fast as the generic C code.

>>>

>>> Cc: "huanglingyan (A)" <huanglingyan2@huawei.com>

>>> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

> ...

>>

>> Acked-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>

> 

> Full patch here

> 

> https://lore.kernel.org/linux-arm-kernel/20190218230842.11448-1-ard.biesheuvel@linaro.org/

> 

> This was a follow-up to some discussions about Lingyan's NEON code,

> CC'ed to netdev@ so people could chime in as to whether we need

> accelerated checksumming code in the first place.

FWIW ever since we did ip_fast_csum() I've been meaning to see how well 
I can do with a similar tweaked C implementation for this (mostly for 
fun). Since I've recently dug out my RK3328 box for other reasons I'll 
give this a test - that's a weedy little quad-A53 whose GbE hardware 
checksumming is slightly busted and has to be turned off, so the 
do_csum() overhead under heavy network load is comparatively massive. 
(plus it's non-EFI so I should be able to try big-endian easily too)

The asm looks pretty reasonable to me - instinct says there's *possibly* 
some value for out-of-order cores in doing the 8-way accumulations in a 
more pairwise fashion, but I guess either way the carry flag dependency 
is going to dominate, so it may well be moot. What may be more 
worthwhile is taking the effort to align the source pointer, at least 
for larger inputs, so as to be kinder to little cores - according to its 
optimisation guide, A55 is fairly sensitive to unaligned loads, so I'd 
assume that's true of its older/smaller friends too. I'll see what I can 
measure in practice - until proven otherwise I'd have no great objection 
to merging this patch as-is if the need is real. Improvements can always 
come later :)

Robin.
Ard Biesheuvel Feb. 28, 2019, 3:28 p.m. | #4
On Thu, 28 Feb 2019 at 16:14, Robin Murphy <robin.murphy@arm.com> wrote:
>

> Hi Ard,

>

> On 28/02/2019 14:16, Ard Biesheuvel wrote:

> > (+ Catalin)

> >

> > On Tue, 19 Feb 2019 at 16:08, Ilias Apalodimas

> > <ilias.apalodimas@linaro.org> wrote:

> >>

> >> On Tue, Feb 19, 2019 at 12:08:42AM +0100, Ard Biesheuvel wrote:

> >>> It turns out that the IP checksumming code is still exercised often,

> >>> even though one might expect that modern NICs with checksum offload

> >>> have no use for it. However, as Lingyan points out, there are

> >>> combinations of features where the network stack may still fall back

> >>> to software checksumming, and so it makes sense to provide an

> >>> optimized implementation in software as well.

> >>>

> >>> So provide an implementation of do_csum() in scalar assembler, which,

> >>> unlike C, gives direct access to the carry flag, making the code run

> >>> substantially faster. The routine uses overlapping 64 byte loads for

> >>> all input size > 64 bytes, in order to reduce the number of branches

> >>> and improve performance on cores with deep pipelines.

> >>>

> >>> On Cortex-A57, this implementation is on par with Lingyan's NEON

> >>> implementation, and roughly 7x as fast as the generic C code.

> >>>

> >>> Cc: "huanglingyan (A)" <huanglingyan2@huawei.com>

> >>> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

> > ...

> >>

> >> Acked-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>

> >

> > Full patch here

> >

> > https://lore.kernel.org/linux-arm-kernel/20190218230842.11448-1-ard.biesheuvel@linaro.org/

> >

> > This was a follow-up to some discussions about Lingyan's NEON code,

> > CC'ed to netdev@ so people could chime in as to whether we need

> > accelerated checksumming code in the first place.


Thanks for taking a look.

> FWIW ever since we did ip_fast_csum() I've been meaning to see how well

> I can do with a similar tweaked C implementation for this (mostly for

> fun). Since I've recently dug out my RK3328 box for other reasons I'll

> give this a test - that's a weedy little quad-A53 whose GbE hardware

> checksumming is slightly busted and has to be turned off, so the

> do_csum() overhead under heavy network load is comparatively massive.

> (plus it's non-EFI so I should be able to try big-endian easily too)

>


Yes please. I've been meaning to run this on A72 myself, but ever
since my MacchiatoBin self-combusted, I've been relying on AWS for
this, which is a bit finicky.

As for the C implementation, not having access to the carry flag is
pretty limiting, so I wonder how you intend to get around that.

> The asm looks pretty reasonable to me - instinct says there's *possibly*

> some value for out-of-order cores in doing the 8-way accumulations in a

> more pairwise fashion, but I guess either way the carry flag dependency

> is going to dominate, so it may well be moot.


Yes. In fact, I was surprised the speedup is as dramatic as it is
despite of this, but I guess they optimize for this rather well at the
uarch level.

> What may be more

> worthwhile is taking the effort to align the source pointer, at least

> for larger inputs, so as to be kinder to little cores - according to its

> optimisation guide, A55 is fairly sensitive to unaligned loads, so I'd

> assume that's true of its older/smaller friends too. I'll see what I can

> measure in practice - until proven otherwise I'd have no great objection

> to merging this patch as-is if the need is real. Improvements can always

> come later :)

>


Good point re alignment, I didn't consider that at all tbh.

I'll let the maintainers decide whether/when to merge this. I don't
feel strongly either way.
Shaokun Zhang April 12, 2019, 2:31 a.m. | #5
Hi maintainers and Ard,

Any update on it?

Thanks,
Shaokun

On 2019/2/19 7:08, Ard Biesheuvel wrote:
> It turns out that the IP checksumming code is still exercised often,

> even though one might expect that modern NICs with checksum offload

> have no use for it. However, as Lingyan points out, there are

> combinations of features where the network stack may still fall back

> to software checksumming, and so it makes sense to provide an

> optimized implementation in software as well.

> 

> So provide an implementation of do_csum() in scalar assembler, which,

> unlike C, gives direct access to the carry flag, making the code run

> substantially faster. The routine uses overlapping 64 byte loads for

> all input size > 64 bytes, in order to reduce the number of branches

> and improve performance on cores with deep pipelines.

> 

> On Cortex-A57, this implementation is on par with Lingyan's NEON

> implementation, and roughly 7x as fast as the generic C code.

> 

> Cc: "huanglingyan (A)" <huanglingyan2@huawei.com>

> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

> ---

> Test code after the patch.

> 

>  arch/arm64/include/asm/checksum.h |   3 +

>  arch/arm64/lib/Makefile           |   2 +-

>  arch/arm64/lib/csum.S             | 127 ++++++++++++++++++++

>  3 files changed, 131 insertions(+), 1 deletion(-)

> 

> diff --git a/arch/arm64/include/asm/checksum.h b/arch/arm64/include/asm/checksum.h

> index 0b6f5a7d4027..e906b956c1fc 100644

> --- a/arch/arm64/include/asm/checksum.h

> +++ b/arch/arm64/include/asm/checksum.h

> @@ -46,6 +46,9 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)

>  }

>  #define ip_fast_csum ip_fast_csum

>  

> +extern unsigned int do_csum(const unsigned char *buff, int len);

> +#define do_csum do_csum

> +

>  #include <asm-generic/checksum.h>

>  

>  #endif	/* __ASM_CHECKSUM_H */

> diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile

> index 5540a1638baf..a7606007a749 100644

> --- a/arch/arm64/lib/Makefile

> +++ b/arch/arm64/lib/Makefile

> @@ -3,7 +3,7 @@ lib-y		:= clear_user.o delay.o copy_from_user.o		\

>  		   copy_to_user.o copy_in_user.o copy_page.o		\

>  		   clear_page.o memchr.o memcpy.o memmove.o memset.o	\

>  		   memcmp.o strcmp.o strncmp.o strlen.o strnlen.o	\

> -		   strchr.o strrchr.o tishift.o

> +		   strchr.o strrchr.o tishift.o csum.o

>  

>  ifeq ($(CONFIG_KERNEL_MODE_NEON), y)

>  obj-$(CONFIG_XOR_BLOCKS)	+= xor-neon.o

> diff --git a/arch/arm64/lib/csum.S b/arch/arm64/lib/csum.S

> new file mode 100644

> index 000000000000..534e2ebdc426

> --- /dev/null

> +++ b/arch/arm64/lib/csum.S

> @@ -0,0 +1,127 @@

> +/* SPDX-License-Identifier: GPL-2.0 */

> +/*

> + * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>

> + */

> +

> +#include <linux/linkage.h>

> +#include <asm/assembler.h>

> +

> +ENTRY(do_csum)

> +	adds		x2, xzr, xzr		// clear x2 and C flag

> +

> +	// 64 bytes at a time

> +	lsr		x3, x1, #6

> +	and		x1, x1, #63

> +	cbz		x3, 1f

> +

> +	// Eight 64-bit adds per iteration

> +0:	ldp		x4, x5, [x0], #64

> +	ldp		x6, x7, [x0, #-48]

> +	ldp		x8, x9, [x0, #-32]

> +	ldp		x10, x11, [x0, #-16]

> +	adcs		x2, x2, x4

> +	sub		x3, x3, #1

> +	adcs		x2, x2, x5

> +	adcs		x2, x2, x6

> +	adcs		x2, x2, x7

> +	adcs		x2, x2, x8

> +	adcs		x2, x2, x9

> +	adcs		x2, x2, x10

> +	adcs		x2, x2, x11

> +	cbnz		x3, 0b

> +	adc		x2, x2, xzr

> +

> +	cbz		x1, 7f

> +	bic		x3, x1, #1

> +	add		x12, x0, x1

> +	add		x0, x0, x3

> +	neg		x3, x3

> +	add		x3, x3, #64

> +	lsl		x3, x3, #3

> +

> +	// Handle remaining 63 bytes or less using an overlapping 64-byte load

> +	// and a branchless code path to complete the calculation

> +	ldp		x4, x5, [x0, #-64]

> +	ldp		x6, x7, [x0, #-48]

> +	ldp		x8, x9, [x0, #-32]

> +	ldp		x10, x11, [x0, #-16]

> +	ldrb		w12, [x12, #-1]

> +

> +	.irp		reg, x4, x5, x6, x7, x8, x9, x10, x11

> +	cmp		x3, #64

> +	csel		\reg, \reg, xzr, lt

> +	ccmp		x3, xzr, #0, lt

> +	csel		x13, x3, xzr, gt

> +	sub		x3, x3, #64

> +CPU_LE(	lsr		\reg, \reg, x13		)

> +CPU_BE(	lsl		\reg, \reg, x13		)

> +	.endr

> +

> +	adds		x2, x2, x4

> +	adcs		x2, x2, x5

> +	adcs		x2, x2, x6

> +	adcs		x2, x2, x7

> +	adcs		x2, x2, x8

> +	adcs		x2, x2, x9

> +	adcs		x2, x2, x10

> +	adcs		x2, x2, x11

> +	adc		x2, x2, xzr

> +

> +CPU_LE(	adds		x12, x2, x12		)

> +CPU_BE(	adds		x12, x2, x12, lsl #8	)

> +	adc		x12, x12, xzr

> +	tst		x1, #1

> +	csel		x2, x2, x12, eq

> +

> +7:	lsr		x1, x2, #32

> +	adds		w2, w2, w1

> +	adc		w2, w2, wzr

> +

> +	lsr		w1, w2, #16

> +	uxth		w2, w2

> +	add		w2, w2, w1

> +

> +	lsr		w1, w2, #16		// handle the carry by hand

> +	add		w2, w2, w1

> +

> +	uxth		w0, w2

> +	ret

> +

> +	// Handle 63 bytes or less

> +1:	tbz		x1, #5, 2f

> +	ldp		x4, x5, [x0], #32

> +	ldp		x6, x7, [x0, #-16]

> +	adds		x2, x2, x4

> +	adcs		x2, x2, x5

> +	adcs		x2, x2, x6

> +	adcs		x2, x2, x7

> +	adc		x2, x2, xzr

> +

> +2:	tbz		x1, #4, 3f

> +	ldp		x4, x5, [x0], #16

> +	adds		x2, x2, x4

> +	adcs		x2, x2, x5

> +	adc		x2, x2, xzr

> +

> +3:	tbz		x1, #3, 4f

> +	ldr		x4, [x0], #8

> +	adds		x2, x2, x4

> +	adc		x2, x2, xzr

> +

> +4:	tbz		x1, #2, 5f

> +	ldr		w4, [x0], #4

> +	adds		x2, x2, x4

> +	adc		x2, x2, xzr

> +

> +5:	tbz		x1, #1, 6f

> +	ldrh		w4, [x0], #2

> +	adds		x2, x2, x4

> +	adc		x2, x2, xzr

> +

> +6:	tbz		x1, #0, 7b

> +	ldrb		w4, [x0]

> +CPU_LE(	adds		x2, x2, x4		)

> +CPU_BE(	adds		x2, x2, x4, lsl #8	)

> +	adc		x2, x2, xzr

> +	b		7b

> +ENDPROC(do_csum)

>
Will Deacon April 12, 2019, 9:52 a.m. | #6
On Fri, Apr 12, 2019 at 10:31:16AM +0800, Zhangshaokun wrote:
> On 2019/2/19 7:08, Ard Biesheuvel wrote:

> > It turns out that the IP checksumming code is still exercised often,

> > even though one might expect that modern NICs with checksum offload

> > have no use for it. However, as Lingyan points out, there are

> > combinations of features where the network stack may still fall back

> > to software checksumming, and so it makes sense to provide an

> > optimized implementation in software as well.

> > 

> > So provide an implementation of do_csum() in scalar assembler, which,

> > unlike C, gives direct access to the carry flag, making the code run

> > substantially faster. The routine uses overlapping 64 byte loads for

> > all input size > 64 bytes, in order to reduce the number of branches

> > and improve performance on cores with deep pipelines.

> > 

> > On Cortex-A57, this implementation is on par with Lingyan's NEON

> > implementation, and roughly 7x as fast as the generic C code.

> > 

> > Cc: "huanglingyan (A)" <huanglingyan2@huawei.com>

> > Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

> > ---

> > Test code after the patch.

>

> Hi maintainers and Ard,

> 

> Any update on it?


I'm waiting for Robin to come back with numbers for a C implementation.

Robin -- did you get anywhere with that?

Will
Robin Murphy April 15, 2019, 6:18 p.m. | #7
On 12/04/2019 10:52, Will Deacon wrote:
> On Fri, Apr 12, 2019 at 10:31:16AM +0800, Zhangshaokun wrote:

>> On 2019/2/19 7:08, Ard Biesheuvel wrote:

>>> It turns out that the IP checksumming code is still exercised often,

>>> even though one might expect that modern NICs with checksum offload

>>> have no use for it. However, as Lingyan points out, there are

>>> combinations of features where the network stack may still fall back

>>> to software checksumming, and so it makes sense to provide an

>>> optimized implementation in software as well.

>>>

>>> So provide an implementation of do_csum() in scalar assembler, which,

>>> unlike C, gives direct access to the carry flag, making the code run

>>> substantially faster. The routine uses overlapping 64 byte loads for

>>> all input size > 64 bytes, in order to reduce the number of branches

>>> and improve performance on cores with deep pipelines.

>>>

>>> On Cortex-A57, this implementation is on par with Lingyan's NEON

>>> implementation, and roughly 7x as fast as the generic C code.

>>>

>>> Cc: "huanglingyan (A)" <huanglingyan2@huawei.com>

>>> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

>>> ---

>>> Test code after the patch.

>>

>> Hi maintainers and Ard,

>>

>> Any update on it?

> 

> I'm waiting for Robin to come back with numbers for a C implementation.

> 

> Robin -- did you get anywhere with that?


Still not what I would call finished, but where I've got so far (besides 
an increasingly elaborate test rig) is as below - it still wants some 
unrolling in the middle to really fly (and actual testing on BE), but 
the worst-case performance already equals or just beats this asm version 
on Cortex-A53 with GCC 7 (by virtue of being alignment-insensitive and 
branchless except for the loop). Unfortunately, the advantage of C code 
being instrumentable does also come around to bite me...

Robin.

----->8-----

/*
 * Fold @data into the running ones'-complement sum @sum: perform the add
 * in 128 bits so the carry out of bit 63 is captured, then wrap it back
 * around into the low word. (Looks dumb, but generates nice-ish code.)
 */
static u64 accumulate(u64 sum, u64 data)
{
	__uint128_t wide = (__uint128_t)sum + data;

	return (u64)wide + (u64)(wide >> 64);
}

/*
 * Alignment-insensitive, mostly-branchless scalar checksum: round the
 * source pointer down to an 8-byte boundary, mask off the excess bytes
 * at either end, and accumulate 64 bits at a time with end-around carry.
 * Returns the 16-bit ones'-complement sum of @len bytes at @buff.
 */
unsigned int do_csum_c(const unsigned char *buff, int len)
{
	unsigned int offset, shift, sum, count;
	const u64 *ptr;
	u64 data, sum64 = 0;

	/*
	 * Bail out on zero/negative lengths up front: the tail masking
	 * below would otherwise compute a shift of 64 bits or more, which
	 * is undefined behaviour, and we would also read memory the caller
	 * never asked us to touch.
	 */
	if (len <= 0)
		return 0;

	offset = (unsigned long)buff & 0x7;
	/*
	 * This is to all intents and purposes safe, since rounding down cannot
	 * result in a different page or cache line being accessed, and @buff
	 * should absolutely not be pointing to anything read-sensitive.
	 * It does, however, piss off KASAN...
	 */
	ptr = (const u64 *)(buff - offset);
	shift = offset * 8;

	/*
	 * Head: zero out any excess leading bytes. Shifting back by the same
	 * amount should be at least as fast as any other way of handling the
	 * odd/even alignment, and means we can ignore it until the very end.
	 */
	data = *ptr++;
#ifdef __LITTLE_ENDIAN
	data = (data >> shift) << shift;
#else
	data = (data << shift) >> shift;
#endif
	count = 8 - offset;

	/* Body: straightforward aligned loads from here on... */
	//TODO: fancy stuff with larger strides and uint128s?
	while (len > count) {
		sum64 = accumulate(sum64, data);
		data = *ptr++;
		count += 8;
	}
	/*
	 * Tail: zero any over-read bytes similarly to the head, again
	 * preserving odd/even alignment. With len >= 1 guaranteed above,
	 * count - len is in [0, 7], so the shift stays below 64.
	 */
	shift = (count - len) * 8;
#ifdef __LITTLE_ENDIAN
	data = (data << shift) >> shift;
#else
	data = (data >> shift) << shift;
#endif
	sum64 = accumulate(sum64, data);

	/* Finally, folding */
	sum64 += (sum64 >> 32) | (sum64 << 32);
	sum = sum64 >> 32;
	sum += (sum >> 16) | (sum << 16);
	/* An odd start offset means every byte landed swapped; undo that. */
	if (offset & 1)
		return (u16)swab32(sum);

	return sum >> 16;
}
Will Deacon May 15, 2019, 9:47 a.m. | #8
On Mon, Apr 15, 2019 at 07:18:22PM +0100, Robin Murphy wrote:
> On 12/04/2019 10:52, Will Deacon wrote:

> > I'm waiting for Robin to come back with numbers for a C implementation.

> > 

> > Robin -- did you get anywhere with that?

> 

> Still not what I would call finished, but where I've got so far (besides an

> increasingly elaborate test rig) is as below - it still wants some unrolling

> in the middle to really fly (and actual testing on BE), but the worst-case

> performance already equals or just beats this asm version on Cortex-A53 with

> GCC 7 (by virtue of being alignment-insensitive and branchless except for

> the loop). Unfortunately, the advantage of C code being instrumentable does

> also come around to bite me...


Is there any interest from anybody in spinning a proper patch out of this?
Shaokun?

Will

> /* Looks dumb, but generates nice-ish code */

> static u64 accumulate(u64 sum, u64 data)

> {

> 	__uint128_t tmp = (__uint128_t)sum + data;

> 	return tmp + (tmp >> 64);

> }

> 

> unsigned int do_csum_c(const unsigned char *buff, int len)

> {

> 	unsigned int offset, shift, sum, count;

> 	u64 data, *ptr;

> 	u64 sum64 = 0;

> 

> 	offset = (unsigned long)buff & 0x7;

> 	/*

> 	 * This is to all intents and purposes safe, since rounding down cannot

> 	 * result in a different page or cache line being accessed, and @buff

> 	 * should absolutely not be pointing to anything read-sensitive.

> 	 * It does, however, piss off KASAN...

> 	 */

> 	ptr = (u64 *)(buff - offset);

> 	shift = offset * 8;

> 

> 	/*

> 	 * Head: zero out any excess leading bytes. Shifting back by the same

> 	 * amount should be at least as fast as any other way of handling the

> 	 * odd/even alignment, and means we can ignore it until the very end.

> 	 */

> 	data = *ptr++;

> #ifdef __LITTLE_ENDIAN

> 	data = (data >> shift) << shift;

> #else

> 	data = (data << shift) >> shift;

> #endif

> 	count = 8 - offset;

> 

> 	/* Body: straightforward aligned loads from here on... */

> 	//TODO: fancy stuff with larger strides and uint128s?

> 	while(len > count) {

> 		sum64 = accumulate(sum64, data);

> 		data = *ptr++;

> 		count += 8;

> 	}

> 	/*

> 	 * Tail: zero any over-read bytes similarly to the head, again

> 	 * preserving odd/even alignment.

> 	 */

> 	shift = (count - len) * 8;

> #ifdef __LITTLE_ENDIAN

> 	data = (data << shift) >> shift;

> #else

> 	data = (data >> shift) << shift;

> #endif

> 	sum64 = accumulate(sum64, data);

> 

> 	/* Finally, folding */

> 	sum64 += (sum64 >> 32) | (sum64 << 32);

> 	sum = sum64 >> 32;

> 	sum += (sum >> 16) | (sum << 16);

> 	if (offset & 1)

> 		return (u16)swab32(sum);

> 

> 	return sum >> 16;

> }
David Laight May 15, 2019, 10:15 a.m. | #9
...
> > 	ptr = (u64 *)(buff - offset);

> > 	shift = offset * 8;

> >

> > 	/*

> > 	 * Head: zero out any excess leading bytes. Shifting back by the same

> > 	 * amount should be at least as fast as any other way of handling the

> > 	 * odd/even alignment, and means we can ignore it until the very end.

> > 	 */

> > 	data = *ptr++;

> > #ifdef __LITTLE_ENDIAN

> > 	data = (data >> shift) << shift;

> > #else

> > 	data = (data << shift) >> shift;

> > #endif


I suspect that
#ifdef __LITTLE_ENDIAN
	data &= ~0ull << shift;
#else
	data &= ~0ull >> shift;
#endif
is likely to be better.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Robin Murphy May 15, 2019, 10:57 a.m. | #10
On 15/05/2019 11:15, David Laight wrote:
> ...

>>> 	ptr = (u64 *)(buff - offset);

>>> 	shift = offset * 8;

>>>

>>> 	/*

>>> 	 * Head: zero out any excess leading bytes. Shifting back by the same

>>> 	 * amount should be at least as fast as any other way of handling the

>>> 	 * odd/even alignment, and means we can ignore it until the very end.

>>> 	 */

>>> 	data = *ptr++;

>>> #ifdef __LITTLE_ENDIAN

>>> 	data = (data >> shift) << shift;

>>> #else

>>> 	data = (data << shift) >> shift;

>>> #endif

> 

> I suspect that

> #ifdef __LITTLE_ENDIAN

> 	data &= ~0ull << shift;

> #else

> 	data &= ~0ull >> shift;

> #endif

> is likely to be better.


Out of interest, better in which respects? For the A64 ISA at least, 
that would take 3 instructions plus an additional scratch register, e.g.:

	MOV	x2, #~0
	LSL	x2, x2, x1
	AND	x0, x0, x1

(alternatively "AND x0, x0, x1 LSL x2" to save 4 bytes of code, but that 
will typically take as many cycles if not more than just pipelining the 
two 'simple' ALU instructions)

Whereas the original is just two shift instruction in-place.

	LSR	x0, x0, x1
	LSL	x0, x0, x1

If the operation were repeated, the constant generation could certainly 
be amortised over multiple subsequent ANDs for a net win, but that isn't 
the case here.

Robin.
Robin Murphy May 15, 2019, 11:02 a.m. | #11
On 15/05/2019 10:47, Will Deacon wrote:
> On Mon, Apr 15, 2019 at 07:18:22PM +0100, Robin Murphy wrote:

>> On 12/04/2019 10:52, Will Deacon wrote:

>>> I'm waiting for Robin to come back with numbers for a C implementation.

>>>

>>> Robin -- did you get anywhere with that?

>>

>> Still not what I would call finished, but where I've got so far (besides an

>> increasingly elaborate test rig) is as below - it still wants some unrolling

>> in the middle to really fly (and actual testing on BE), but the worst-case

>> performance already equals or just beats this asm version on Cortex-A53 with

>> GCC 7 (by virtue of being alignment-insensitive and branchless except for

>> the loop). Unfortunately, the advantage of C code being instrumentable does

>> also come around to bite me...

> 

> Is there any interest from anybody in spinning a proper patch out of this?

> Shaokun?


FWIW I've learned how to fix the KASAN thing now, so I'll try giving 
this some more love while I've got other outstanding optimisation stuff 
to look at anyway.

Robin.
David Laight May 15, 2019, 11:13 a.m. | #12
From: Robin Murphy

> Sent: 15 May 2019 11:58

> To: David Laight; 'Will Deacon'

> Cc: Zhangshaokun; Ard Biesheuvel; linux-arm-kernel@lists.infradead.org; netdev@vger.kernel.org;

> ilias.apalodimas@linaro.org; huanglingyan (A); steve.capper@arm.com

> Subject: Re: [PATCH] arm64: do_csum: implement accelerated scalar version

> 

> On 15/05/2019 11:15, David Laight wrote:

> > ...

> >>> 	ptr = (u64 *)(buff - offset);

> >>> 	shift = offset * 8;

> >>>

> >>> 	/*

> >>> 	 * Head: zero out any excess leading bytes. Shifting back by the same

> >>> 	 * amount should be at least as fast as any other way of handling the

> >>> 	 * odd/even alignment, and means we can ignore it until the very end.

> >>> 	 */

> >>> 	data = *ptr++;

> >>> #ifdef __LITTLE_ENDIAN

> >>> 	data = (data >> shift) << shift;

> >>> #else

> >>> 	data = (data << shift) >> shift;

> >>> #endif

> >

> > I suspect that

> > #ifdef __LITTLE_ENDIAN

> > 	data &= ~0ull << shift;

> > #else

> > 	data &= ~0ull >> shift;

> > #endif

> > is likely to be better.

> 

> Out of interest, better in which respects? For the A64 ISA at least,

> that would take 3 instructions plus an additional scratch register, e.g.:

> 

> 	MOV	x2, #~0

> 	LSL	x2, x2, x1

> 	AND	x0, x0, x1

> 

> (alternatively "AND x0, x0, x1 LSL x2" to save 4 bytes of code, but that

> will typically take as many cycles if not more than just pipelining the

> two 'simple' ALU instructions)

> 

> Whereas the original is just two shift instruction in-place.

> 

> 	LSR	x0, x0, x1

> 	LSL	x0, x0, x1

> 

> If the operation were repeated, the constant generation could certainly

> be amortised over multiple subsequent ANDs for a net win, but that isn't

> the case here.


On a superscalar processor you reduce the register dependency
chain by one instruction.
The original code is pretty much a single dependency chain so
you are likely to be able to generate the mask 'for free'.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Robin Murphy May 15, 2019, 12:39 p.m. | #13
On 15/05/2019 12:13, David Laight wrote:
> From: Robin Murphy

>> Sent: 15 May 2019 11:58

>> To: David Laight; 'Will Deacon'

>> Cc: Zhangshaokun; Ard Biesheuvel; linux-arm-kernel@lists.infradead.org; netdev@vger.kernel.org;

>> ilias.apalodimas@linaro.org; huanglingyan (A); steve.capper@arm.com

>> Subject: Re: [PATCH] arm64: do_csum: implement accelerated scalar version

>>

>> On 15/05/2019 11:15, David Laight wrote:

>>> ...

>>>>> 	ptr = (u64 *)(buff - offset);

>>>>> 	shift = offset * 8;

>>>>>

>>>>> 	/*

>>>>> 	 * Head: zero out any excess leading bytes. Shifting back by the same

>>>>> 	 * amount should be at least as fast as any other way of handling the

>>>>> 	 * odd/even alignment, and means we can ignore it until the very end.

>>>>> 	 */

>>>>> 	data = *ptr++;

>>>>> #ifdef __LITTLE_ENDIAN

>>>>> 	data = (data >> shift) << shift;

>>>>> #else

>>>>> 	data = (data << shift) >> shift;

>>>>> #endif

>>>

>>> I suspect that

>>> #ifdef __LITTLE_ENDIAN

>>> 	data &= ~0ull << shift;

>>> #else

>>> 	data &= ~0ull >> shift;

>>> #endif

>>> is likely to be better.

>>

>> Out of interest, better in which respects? For the A64 ISA at least,

>> that would take 3 instructions plus an additional scratch register, e.g.:

>>

>> 	MOV	x2, #~0

>> 	LSL	x2, x2, x1

>> 	AND	x0, x0, x1


[That should have been "AND x0, x1, x2", obviously...]

>>

>> (alternatively "AND x0, x0, x1 LSL x2" to save 4 bytes of code, but that

>> will typically take as many cycles if not more than just pipelining the

>> two 'simple' ALU instructions)

>>

>> Whereas the original is just two shift instruction in-place.

>>

>> 	LSR	x0, x0, x1

>> 	LSL	x0, x0, x1

>>

>> If the operation were repeated, the constant generation could certainly

>> be amortised over multiple subsequent ANDs for a net win, but that isn't

>> the case here.

> 

> On a superscalar processor you reduce the register dependency

> chain by one instruction.

> The original code is pretty much a single dependency chain so

> you are likely to be able to generate the mask 'for free'.


Gotcha, although 'free' still means additional I$ and register rename 
footprint, vs. (typically) just 1 extra cycle to forward an ALU result. 
It's an interesting consideration, but in our case there are almost 
certainly far more little in-order cores out in the wild than big OoO 
ones, and the double-shift will always be objectively better for those.

Thanks,
Robin.
David Laight May 15, 2019, 1:54 p.m. | #14
From: Robin Murphy

> Sent: 15 May 2019 13:40

> On 15/05/2019 12:13, David Laight wrote:

> > From: Robin Murphy

> >> Sent: 15 May 2019 11:58

> >> To: David Laight; 'Will Deacon'

> >> Cc: Zhangshaokun; Ard Biesheuvel; linux-arm-kernel@lists.infradead.org; netdev@vger.kernel.org;

> >> ilias.apalodimas@linaro.org; huanglingyan (A); steve.capper@arm.com

> >> Subject: Re: [PATCH] arm64: do_csum: implement accelerated scalar version

> >>

> >> On 15/05/2019 11:15, David Laight wrote:

> >>> ...

> >>>>> 	ptr = (u64 *)(buff - offset);

> >>>>> 	shift = offset * 8;

> >>>>>

> >>>>> 	/*

> >>>>> 	 * Head: zero out any excess leading bytes. Shifting back by the same

> >>>>> 	 * amount should be at least as fast as any other way of handling the

> >>>>> 	 * odd/even alignment, and means we can ignore it until the very end.

> >>>>> 	 */

> >>>>> 	data = *ptr++;

> >>>>> #ifdef __LITTLE_ENDIAN

> >>>>> 	data = (data >> shift) << shift;

> >>>>> #else

> >>>>> 	data = (data << shift) >> shift;

> >>>>> #endif

> >>>

> >>> I suspect that

> >>> #ifdef __LITTLE_ENDIAN

> >>> 	data &= ~0ull << shift;

> >>> #else

> >>> 	data &= ~0ull >> shift;

> >>> #endif

> >>> is likely to be better.

> >>

> >> Out of interest, better in which respects? For the A64 ISA at least,

> >> that would take 3 instructions plus an additional scratch register, e.g.:

> >>

> >> 	MOV	x2, #~0

> >> 	LSL	x2, x2, x1

> >> 	AND	x0, x0, x1

> 

> [That should have been "AND x0, x1, x2", obviously...]

> 

> >>

> >> (alternatively "AND x0, x0, x1 LSL x2" to save 4 bytes of code, but that

> >> will typically take as many cycles if not more than just pipelining the

> >> two 'simple' ALU instructions)

> >>

> >> Whereas the original is just two shift instruction in-place.

> >>

> >> 	LSR	x0, x0, x1

> >> 	LSL	x0, x0, x1

> >>

> >> If the operation were repeated, the constant generation could certainly

> >> be amortised over multiple subsequent ANDs for a net win, but that isn't

> >> the case here.

> >

> > On a superscalar processor you reduce the register dependency

> > chain by one instruction.

> > The original code is pretty much a single dependency chain so

> > you are likely to be able to generate the mask 'for free'.

> 

> Gotcha, although 'free' still means additional I$ and register rename

> footprint, vs. (typically) just 1 extra cycle to forward an ALU result.

> It's an interesting consideration, but in our case there are almost

> certainly far more little in-order cores out in the wild than big OoO

> ones, and the double-shift will always be objectively better for those.


Is there a pipeline delay before the result of the memory read (*ptr)
can be used? (Even assuming the data is in the L1 cache??)
Even on an in-order cpu that can give you a spare cycle or two
that the code may not normally fill.

FWIW I've been known to use (++ptr)[-1] (instead of *ptr++) to move
the increment into an available delay slot (of an earlier load).

Anyway it isn't that obvious that it is the fastest way.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Shaokun Zhang May 16, 2019, 3:14 a.m. | #15
Hi Will,

On 2019/5/15 17:47, Will Deacon wrote:
> On Mon, Apr 15, 2019 at 07:18:22PM +0100, Robin Murphy wrote:

>> On 12/04/2019 10:52, Will Deacon wrote:

>>> I'm waiting for Robin to come back with numbers for a C implementation.

>>>

>>> Robin -- did you get anywhere with that?

>>

>> Still not what I would call finished, but where I've got so far (besides an

>> increasingly elaborate test rig) is as below - it still wants some unrolling

>> in the middle to really fly (and actual testing on BE), but the worst-case

>> performance already equals or just beats this asm version on Cortex-A53 with

>> GCC 7 (by virtue of being alignment-insensitive and branchless except for

>> the loop). Unfortunately, the advantage of C code being instrumentable does

>> also come around to bite me...

> 

> Is there any interest from anybody in spinning a proper patch out of this?

> Shaokun?


HiSilicon's Kunpeng920 (Hi1620) benefits from do_csum optimization, if Ard and
Robin are ok, Lingyan or I can try to do it.
Of course, if any guy posts the patch, we are happy to test it.
Any will be ok.

Thanks,
Shaokun

> 

> Will

> 

>> /* Looks dumb, but generates nice-ish code */

>> static u64 accumulate(u64 sum, u64 data)

>> {

>> 	__uint128_t tmp = (__uint128_t)sum + data;

>> 	return tmp + (tmp >> 64);

>> }

>>

>> unsigned int do_csum_c(const unsigned char *buff, int len)

>> {

>> 	unsigned int offset, shift, sum, count;

>> 	u64 data, *ptr;

>> 	u64 sum64 = 0;

>>

>> 	offset = (unsigned long)buff & 0x7;

>> 	/*

>> 	 * This is to all intents and purposes safe, since rounding down cannot

>> 	 * result in a different page or cache line being accessed, and @buff

>> 	 * should absolutely not be pointing to anything read-sensitive.

>> 	 * It does, however, piss off KASAN...

>> 	 */

>> 	ptr = (u64 *)(buff - offset);

>> 	shift = offset * 8;

>>

>> 	/*

>> 	 * Head: zero out any excess leading bytes. Shifting back by the same

>> 	 * amount should be at least as fast as any other way of handling the

>> 	 * odd/even alignment, and means we can ignore it until the very end.

>> 	 */

>> 	data = *ptr++;

>> #ifdef __LITTLE_ENDIAN

>> 	data = (data >> shift) << shift;

>> #else

>> 	data = (data << shift) >> shift;

>> #endif

>> 	count = 8 - offset;

>>

>> 	/* Body: straightforward aligned loads from here on... */

>> 	//TODO: fancy stuff with larger strides and uint128s?

>> 	while(len > count) {

>> 		sum64 = accumulate(sum64, data);

>> 		data = *ptr++;

>> 		count += 8;

>> 	}

>> 	/*

>> 	 * Tail: zero any over-read bytes similarly to the head, again

>> 	 * preserving odd/even alignment.

>> 	 */

>> 	shift = (count - len) * 8;

>> #ifdef __LITTLE_ENDIAN

>> 	data = (data << shift) >> shift;

>> #else

>> 	data = (data >> shift) << shift;

>> #endif

>> 	sum64 = accumulate(sum64, data);

>>

>> 	/* Finally, folding */

>> 	sum64 += (sum64 >> 32) | (sum64 << 32);

>> 	sum = sum64 >> 32;

>> 	sum += (sum >> 16) | (sum << 16);

>> 	if (offset & 1)

>> 		return (u16)swab32(sum);

>>

>> 	return sum >> 16;

>> }

> 

> .

>
Will Deacon Aug. 15, 2019, 4:46 p.m. | #16
On Thu, May 16, 2019 at 11:14:35AM +0800, Zhangshaokun wrote:
> On 2019/5/15 17:47, Will Deacon wrote:

> > On Mon, Apr 15, 2019 at 07:18:22PM +0100, Robin Murphy wrote:

> >> On 12/04/2019 10:52, Will Deacon wrote:

> >>> I'm waiting for Robin to come back with numbers for a C implementation.

> >>>

> >>> Robin -- did you get anywhere with that?

> >>

> >> Still not what I would call finished, but where I've got so far (besides an

> >> increasingly elaborate test rig) is as below - it still wants some unrolling

> >> in the middle to really fly (and actual testing on BE), but the worst-case

> >> performance already equals or just beats this asm version on Cortex-A53 with

> >> GCC 7 (by virtue of being alignment-insensitive and branchless except for

> >> the loop). Unfortunately, the advantage of C code being instrumentable does

> >> also come around to bite me...

> > 

> > Is there any interest from anybody in spinning a proper patch out of this?

> > Shaokun?

> 

> HiSilicon's Kunpeng920(Hi1620) benefits from do_csum optimization, if Ard and

> Robin are ok, Lingyan or I can try to do it.

> Of course, if any guy posts the patch, we are happy to test it.

> Any will be ok.


I don't mind who posts it, but Robin is super busy with SMMU stuff at the
moment so it probably makes more sense for you or Lingyan to do it.

Will
Shaokun Zhang Aug. 16, 2019, 8:15 a.m. | #17
Hi Will,

On 2019/8/16 0:46, Will Deacon wrote:
> On Thu, May 16, 2019 at 11:14:35AM +0800, Zhangshaokun wrote:

>> On 2019/5/15 17:47, Will Deacon wrote:

>>> On Mon, Apr 15, 2019 at 07:18:22PM +0100, Robin Murphy wrote:

>>>> On 12/04/2019 10:52, Will Deacon wrote:

>>>>> I'm waiting for Robin to come back with numbers for a C implementation.

>>>>>

>>>>> Robin -- did you get anywhere with that?

>>>>

>>>> Still not what I would call finished, but where I've got so far (besides an

>>>> increasingly elaborate test rig) is as below - it still wants some unrolling

>>>> in the middle to really fly (and actual testing on BE), but the worst-case

>>>> performance already equals or just beats this asm version on Cortex-A53 with

>>>> GCC 7 (by virtue of being alignment-insensitive and branchless except for

>>>> the loop). Unfortunately, the advantage of C code being instrumentable does

>>>> also come around to bite me...

>>>

>>> Is there any interest from anybody in spinning a proper patch out of this?

>>> Shaokun?

>>

>> HiSilicon's Kunpeng920(Hi1620) benefits from do_csum optimization, if Ard and

>> Robin are ok, Lingyan or I can try to do it.

>> Of course, if any guy posts the patch, we are happy to test it.

>> Any will be ok.

> 

> I don't mind who posts it, but Robin is super busy with SMMU stuff at the

> moment so it probably makes more sense for you or Lingyan to do it.


Thanks for restarting this topic, I or Lingyan will do it soon.

Thanks,
Shaokun

> 

> Will

> 

> .

>
Robin Murphy Aug. 16, 2019, 2:55 p.m. | #18
On 16/08/2019 09:15, Shaokun Zhang wrote:
> Hi Will,

> 

> On 2019/8/16 0:46, Will Deacon wrote:

>> On Thu, May 16, 2019 at 11:14:35AM +0800, Zhangshaokun wrote:

>>> On 2019/5/15 17:47, Will Deacon wrote:

>>>> On Mon, Apr 15, 2019 at 07:18:22PM +0100, Robin Murphy wrote:

>>>>> On 12/04/2019 10:52, Will Deacon wrote:

>>>>>> I'm waiting for Robin to come back with numbers for a C implementation.

>>>>>>

>>>>>> Robin -- did you get anywhere with that?

>>>>>

>>>>> Still not what I would call finished, but where I've got so far (besides an

>>>>> increasingly elaborate test rig) is as below - it still wants some unrolling

>>>>> in the middle to really fly (and actual testing on BE), but the worst-case

>>>>> performance already equals or just beats this asm version on Cortex-A53 with

>>>>> GCC 7 (by virtue of being alignment-insensitive and branchless except for

>>>>> the loop). Unfortunately, the advantage of C code being instrumentable does

>>>>> also come around to bite me...

>>>>

>>>> Is there any interest from anybody in spinning a proper patch out of this?

>>>> Shaokun?

>>>

>>> HiSilicon's Kunpeng920(Hi1620) benefits from do_csum optimization, if Ard and

>>> Robin are ok, Lingyan or I can try to do it.

>>> Of course, if any guy posts the patch, we are happy to test it.

>>> Any will be ok.

>>

>> I don't mind who posts it, but Robin is super busy with SMMU stuff at the

>> moment so it probably makes more sense for you or Lingyan to do it.

> 

> Thanks for restarting this topic, I or Lingyan will do it soon.


FWIW, I've rolled up what I had so far and dumped it into a quick 
semi-realistic patch here:

http://linux-arm.org/git?p=linux-rm.git;a=commitdiff;h=859c5566510c32ae72039aa5072e932a771a3596

So far I'd put most of the effort into the aforementioned benchmarking 
harness to compare performance and correctness for all the proposed 
implementations over all reasonable alignment/length combinations - I 
think that got pretty much finished, but as Will says I'm unlikely to 
find time to properly look at this again for several weeks.

Robin.

Patch

diff --git a/arch/arm64/include/asm/checksum.h b/arch/arm64/include/asm/checksum.h
index 0b6f5a7d4027..e906b956c1fc 100644
--- a/arch/arm64/include/asm/checksum.h
+++ b/arch/arm64/include/asm/checksum.h
@@ -46,6 +46,9 @@  static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
 }
 #define ip_fast_csum ip_fast_csum
 
+extern unsigned int do_csum(const unsigned char *buff, int len);
+#define do_csum do_csum
+
 #include <asm-generic/checksum.h>
 
 #endif	/* __ASM_CHECKSUM_H */
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 5540a1638baf..a7606007a749 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -3,7 +3,7 @@  lib-y		:= clear_user.o delay.o copy_from_user.o		\
 		   copy_to_user.o copy_in_user.o copy_page.o		\
 		   clear_page.o memchr.o memcpy.o memmove.o memset.o	\
 		   memcmp.o strcmp.o strncmp.o strlen.o strnlen.o	\
-		   strchr.o strrchr.o tishift.o
+		   strchr.o strrchr.o tishift.o csum.o
 
 ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
 obj-$(CONFIG_XOR_BLOCKS)	+= xor-neon.o
diff --git a/arch/arm64/lib/csum.S b/arch/arm64/lib/csum.S
new file mode 100644
index 000000000000..534e2ebdc426
--- /dev/null
+++ b/arch/arm64/lib/csum.S
@@ -0,0 +1,127 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ENTRY(do_csum)
+	adds		x2, xzr, xzr		// clear x2 and C flag
+
+	// 64 bytes at a time
+	lsr		x3, x1, #6
+	and		x1, x1, #63
+	cbz		x3, 1f
+
+	// Eight 64-bit adds per iteration
+0:	ldp		x4, x5, [x0], #64
+	ldp		x6, x7, [x0, #-48]
+	ldp		x8, x9, [x0, #-32]
+	ldp		x10, x11, [x0, #-16]
+	adcs		x2, x2, x4
+	sub		x3, x3, #1
+	adcs		x2, x2, x5
+	adcs		x2, x2, x6
+	adcs		x2, x2, x7
+	adcs		x2, x2, x8
+	adcs		x2, x2, x9
+	adcs		x2, x2, x10
+	adcs		x2, x2, x11
+	cbnz		x3, 0b
+	adc		x2, x2, xzr
+
+	cbz		x1, 7f
+	bic		x3, x1, #1
+	add		x12, x0, x1
+	add		x0, x0, x3
+	neg		x3, x3
+	add		x3, x3, #64
+	lsl		x3, x3, #3
+
+	// Handle remaining 63 bytes or less using an overlapping 64-byte load
+	// and a branchless code path to complete the calculation
+	ldp		x4, x5, [x0, #-64]
+	ldp		x6, x7, [x0, #-48]
+	ldp		x8, x9, [x0, #-32]
+	ldp		x10, x11, [x0, #-16]
+	ldrb		w12, [x12, #-1]
+
+	.irp		reg, x4, x5, x6, x7, x8, x9, x10, x11
+	cmp		x3, #64
+	csel		\reg, \reg, xzr, lt
+	ccmp		x3, xzr, #0, lt
+	csel		x13, x3, xzr, gt
+	sub		x3, x3, #64
+CPU_LE(	lsr		\reg, \reg, x13		)
+CPU_BE(	lsl		\reg, \reg, x13		)
+	.endr
+
+	adds		x2, x2, x4
+	adcs		x2, x2, x5
+	adcs		x2, x2, x6
+	adcs		x2, x2, x7
+	adcs		x2, x2, x8
+	adcs		x2, x2, x9
+	adcs		x2, x2, x10
+	adcs		x2, x2, x11
+	adc		x2, x2, xzr
+
+CPU_LE(	adds		x12, x2, x12		)
+CPU_BE(	adds		x12, x2, x12, lsl #8	)
+	adc		x12, x12, xzr
+	tst		x1, #1
+	csel		x2, x2, x12, eq
+
+7:	lsr		x1, x2, #32
+	adds		w2, w2, w1
+	adc		w2, w2, wzr
+
+	lsr		w1, w2, #16
+	uxth		w2, w2
+	add		w2, w2, w1
+
+	lsr		w1, w2, #16		// handle the carry by hand
+	add		w2, w2, w1
+
+	uxth		w0, w2
+	ret
+
+	// Handle 63 bytes or less
+1:	tbz		x1, #5, 2f
+	ldp		x4, x5, [x0], #32
+	ldp		x6, x7, [x0, #-16]
+	adds		x2, x2, x4
+	adcs		x2, x2, x5
+	adcs		x2, x2, x6
+	adcs		x2, x2, x7
+	adc		x2, x2, xzr
+
+2:	tbz		x1, #4, 3f
+	ldp		x4, x5, [x0], #16
+	adds		x2, x2, x4
+	adcs		x2, x2, x5
+	adc		x2, x2, xzr
+
+3:	tbz		x1, #3, 4f
+	ldr		x4, [x0], #8
+	adds		x2, x2, x4
+	adc		x2, x2, xzr
+
+4:	tbz		x1, #2, 5f
+	ldr		w4, [x0], #4
+	adds		x2, x2, x4
+	adc		x2, x2, xzr
+
+5:	tbz		x1, #1, 6f
+	ldrh		w4, [x0], #2
+	adds		x2, x2, x4
+	adc		x2, x2, xzr
+
+6:	tbz		x1, #0, 7b
+	ldrb		w4, [x0]
+CPU_LE(	adds		x2, x2, x4		)
+CPU_BE(	adds		x2, x2, x4, lsl #8	)
+	adc		x2, x2, xzr
+	b		7b
+ENDPROC(do_csum)