diff mbox

md/raid6: delta syndrome for ARM NEON

Message ID 1435164213-25410-1-git-send-email-ard.biesheuvel@linaro.org
State Accepted
Commit 0e833e697bcf4c2f3f7fb9fce39d08cd4439e5d7
Headers show

Commit Message

Ard Biesheuvel June 24, 2015, 4:43 p.m. UTC
This implements XOR syndrome calculation using NEON intrinsics.
As before, the module can be built for ARM and arm64 from the
same source.

Relative performance on a Cortex-A57 based system:

  raid6: int64x1  gen()   905 MB/s
  raid6: int64x1  xor()   881 MB/s
  raid6: int64x2  gen()  1343 MB/s
  raid6: int64x2  xor()  1286 MB/s
  raid6: int64x4  gen()  1896 MB/s
  raid6: int64x4  xor()  1321 MB/s
  raid6: int64x8  gen()  1773 MB/s
  raid6: int64x8  xor()  1165 MB/s
  raid6: neonx1   gen()  1834 MB/s
  raid6: neonx1   xor()  1278 MB/s
  raid6: neonx2   gen()  2528 MB/s
  raid6: neonx2   xor()  1942 MB/s
  raid6: neonx4   gen()  2888 MB/s
  raid6: neonx4   xor()  2334 MB/s
  raid6: neonx8   gen()  2957 MB/s
  raid6: neonx8   xor()  2232 MB/s
  raid6: using algorithm neonx8 gen() 2957 MB/s
  raid6: .... xor() 2232 MB/s, rmw enabled

Cc: Markus Stockhausen <stockhausen@collogia.de>
Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 lib/raid6/neon.c  | 13 ++++++++++++-
 lib/raid6/neon.uc | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+), 1 deletion(-)

Comments

Ard Biesheuvel June 25, 2015, 8:30 a.m. UTC | #1
On 25 June 2015 at 08:32, Markus Stockhausen <stockhausen@collogia.de> wrote:
>> Von: Ard Biesheuvel [ard.biesheuvel@linaro.org]
>> Gesendet: Mittwoch, 24. Juni 2015 18:43
>> An: linux-arm-kernel@lists.infradead.org; hpa@zytor.com
>> Cc: Ard Biesheuvel; Markus Stockhausen; Neil Brown
>> Betreff: [PATCH] md/raid6: delta syndrome for ARM NEON
>>
>> This implements XOR syndrome calculation using NEON intrinsics.
>> As before, the module can be built for ARM and arm64 from the
>> same source.
>
>> Relative performance on a Cortex-A57 based system:
>>
>>   raid6: int64x1  gen()   905 MB/s
>>   raid6: int64x1  xor()   881 MB/s
>>   raid6: int64x2  gen()  1343 MB/s
>>   raid6: int64x2  xor()  1286 MB/s
>>   raid6: int64x4  gen()  1896 MB/s
>>   raid6: int64x4  xor()  1321 MB/s
>>   raid6: int64x8  gen()  1773 MB/s
>>   raid6: int64x8  xor()  1165 MB/s
>>   raid6: neonx1   gen()  1834 MB/s
>>   raid6: neonx1   xor()  1278 MB/s
>>   raid6: neonx2   gen()  2528 MB/s
>>   raid6: neonx2   xor()  1942 MB/s
>>   raid6: neonx4   gen()  2888 MB/s
>>   raid6: neonx4   xor()  2334 MB/s
>>   raid6: neonx8   gen()  2957 MB/s
>>   raid6: neonx8   xor()  2232 MB/s
>>   raid6: using algorithm neonx8 gen() 2957 MB/s
>>   raid6: .... xor() 2232 MB/s, rmw enabled
>>
>
> Nice to see that the placeholders get filled.
> Did you have a chance to do some real tests?
>

Hello Markus,

I haven't done any real world testing yet. Can you recommend any test
tools or test suites in particular?

Thanks,
Ard.
Ard Biesheuvel June 25, 2015, 8:50 a.m. UTC | #2
On 25 June 2015 at 10:30, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> On 25 June 2015 at 08:32, Markus Stockhausen <stockhausen@collogia.de> wrote:
>>> Von: Ard Biesheuvel [ard.biesheuvel@linaro.org]
>>> Gesendet: Mittwoch, 24. Juni 2015 18:43
>>> An: linux-arm-kernel@lists.infradead.org; hpa@zytor.com
>>> Cc: Ard Biesheuvel; Markus Stockhausen; Neil Brown
>>> Betreff: [PATCH] md/raid6: delta syndrome for ARM NEON
>>>
>>> This implements XOR syndrome calculation using NEON intrinsics.
>>> As before, the module can be built for ARM and arm64 from the
>>> same source.
>>
>>> Relative performance on a Cortex-A57 based system:
>>>
>>>   raid6: int64x1  gen()   905 MB/s
>>>   raid6: int64x1  xor()   881 MB/s
>>>   raid6: int64x2  gen()  1343 MB/s
>>>   raid6: int64x2  xor()  1286 MB/s
>>>   raid6: int64x4  gen()  1896 MB/s
>>>   raid6: int64x4  xor()  1321 MB/s
>>>   raid6: int64x8  gen()  1773 MB/s
>>>   raid6: int64x8  xor()  1165 MB/s
>>>   raid6: neonx1   gen()  1834 MB/s
>>>   raid6: neonx1   xor()  1278 MB/s
>>>   raid6: neonx2   gen()  2528 MB/s
>>>   raid6: neonx2   xor()  1942 MB/s
>>>   raid6: neonx4   gen()  2888 MB/s
>>>   raid6: neonx4   xor()  2334 MB/s
>>>   raid6: neonx8   gen()  2957 MB/s
>>>   raid6: neonx8   xor()  2232 MB/s
>>>   raid6: using algorithm neonx8 gen() 2957 MB/s
>>>   raid6: .... xor() 2232 MB/s, rmw enabled
>>>
>>
>> Nice to see that the placeholders get filled.
>> Did you have a chance to do some real tests?
>>
>
> Hello Markus,
>
> I haven't done any real world testing yet. Can you recommend any test
> tools or test suites in particular?
>

I am assuming you are asking about benchmarks, right? The code passes
the raid6test tests, so the correctness part should be covered
(although more testing is always better, of course).
diff mbox

Patch

diff --git a/lib/raid6/neon.c b/lib/raid6/neon.c
index d9ad6ee284f4..7076ef1ba3dd 100644
--- a/lib/raid6/neon.c
+++ b/lib/raid6/neon.c
@@ -40,9 +40,20 @@ 
 					(unsigned long)bytes, ptrs);	\
 		kernel_neon_end();					\
 	}								\
+	static void raid6_neon ## _n ## _xor_syndrome(int disks,	\
+					int start, int stop, 		\
+					size_t bytes, void **ptrs)	\
+	{								\
+		void raid6_neon ## _n  ## _xor_syndrome_real(int,	\
+				int, int, unsigned long, void**);	\
+		kernel_neon_begin();					\
+		raid6_neon ## _n ## _xor_syndrome_real(disks,		\
+			start, stop, (unsigned long)bytes, ptrs);	\
+		kernel_neon_end();					\
+	}								\
 	struct raid6_calls const raid6_neonx ## _n = {			\
 		raid6_neon ## _n ## _gen_syndrome,			\
-		NULL,		/* XOR not yet implemented */		\
+		raid6_neon ## _n ## _xor_syndrome,			\
 		raid6_have_neon,					\
 		"neonx" #_n,						\
 		0							\
diff --git a/lib/raid6/neon.uc b/lib/raid6/neon.uc
index 1b9ed793342d..4fa51b761dd0 100644
--- a/lib/raid6/neon.uc
+++ b/lib/raid6/neon.uc
@@ -3,6 +3,7 @@ 
  *   neon.uc - RAID-6 syndrome calculation using ARM NEON instructions
  *
  *   Copyright (C) 2012 Rob Herring
+ *   Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org>
  *
  *   Based on altivec.uc:
  *     Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
@@ -78,3 +79,48 @@  void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
 		vst1q_u8(&q[d+NSIZE*$$], wq$$);
 	}
 }
+
+void raid6_neon$#_xor_syndrome_real(int disks, int start, int stop,
+				    unsigned long bytes, void **ptrs)
+{
+	uint8_t **dptr = (uint8_t **)ptrs;
+	uint8_t *p, *q;
+	int d, z, z0;
+
+	register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
+	const unative_t x1d = NBYTES(0x1d);
+
+	z0 = stop;		/* P/Q right side optimization */
+	p = dptr[disks-2];	/* XOR parity */
+	q = dptr[disks-1];	/* RS syndrome */
+
+	for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
+		wq$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);
+		wp$$ = veorq_u8(vld1q_u8(&p[d+$$*NSIZE]), wq$$);
+
+		/* P/Q data pages */
+		for ( z = z0-1 ; z >= start ; z-- ) {
+			wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]);
+			wp$$ = veorq_u8(wp$$, wd$$);
+			w2$$ = MASK(wq$$);
+			w1$$ = SHLBYTE(wq$$);
+
+			w2$$ = vandq_u8(w2$$, x1d);
+			w1$$ = veorq_u8(w1$$, w2$$);
+			wq$$ = veorq_u8(w1$$, wd$$);
+		}
+		/* P/Q left side optimization */
+		for ( z = start-1 ; z >= 0 ; z-- ) {
+			w2$$ = MASK(wq$$);
+			w1$$ = SHLBYTE(wq$$);
+
+			w2$$ = vandq_u8(w2$$, x1d);
+			wq$$ = veorq_u8(w1$$, w2$$);
+		}
+		w1$$ = vld1q_u8(&q[d+NSIZE*$$]);
+		wq$$ = veorq_u8(wq$$, w1$$);
+
+		vst1q_u8(&p[d+NSIZE*$$], wp$$);
+		vst1q_u8(&q[d+NSIZE*$$], wq$$);
+	}
+}