diff mbox series

lib/raid6: use vdupq_n_u8 to avoid endianness warnings

Message ID 20190226040345.202047-1-ndesaulniers@google.com
State Accepted
Commit 1ad3935b39da78a403e7df7a3813f866c731bc64
Headers show
Series lib/raid6: use vdupq_n_u8 to avoid endianness warnings | expand

Commit Message

Nick Desaulniers Feb. 26, 2019, 4:03 a.m. UTC
Clang warns: vector initializers are not compatible with NEON intrinsics
in big endian mode [-Wnonportable-vector-initialization]

While this is usually the case, it's not an issue for this case since
we're initializing the uint8x16_t (16x uint8_t's) with the same value.

Instead, use vdupq_n_u8 which both compilers lower into a single movi
instruction: https://godbolt.org/z/vBrgzt

This avoids the static storage for a constant value.

Link: https://github.com/ClangBuiltLinux/linux/issues/214
Suggested-by: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>

---
 lib/raid6/neon.uc            | 5 ++---
 lib/raid6/recov_neon_inner.c | 7 ++-----
 2 files changed, 4 insertions(+), 8 deletions(-)

-- 
2.21.0.rc2.261.ga7da99ff1b-goog

Comments

Ard Biesheuvel Feb. 26, 2019, 7:19 a.m. UTC | #1
On Tue, 26 Feb 2019 at 05:03, <ndesaulniers@google.com> wrote:
>

> Clang warns: vector initializers are not compatible with NEON intrinsics

> in big endian mode [-Wnonportable-vector-initialization]

>

> While this is usually the case, it's not an issue for this case since

> we're initializing the uint8x16_t (16x uint8_t's) with the same value.

>

> Instead, use vdupq_n_u8 which both compilers lower into a single movi

> instruction: https://godbolt.org/z/vBrgzt

>

> This avoids the static storage for a constant value.

>

> Link: https://github.com/ClangBuiltLinux/linux/issues/214

> Suggested-by: Nathan Chancellor <natechancellor@gmail.com>

> Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>


Much better, thanks,

Did you double check that the intrinsic exists on 32-bit ARM as well?
I assume it does, but please make sure if you haven't yet.

If so,

Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>


> ---

>  lib/raid6/neon.uc            | 5 ++---

>  lib/raid6/recov_neon_inner.c | 7 ++-----

>  2 files changed, 4 insertions(+), 8 deletions(-)

>

> diff --git a/lib/raid6/neon.uc b/lib/raid6/neon.uc

> index d5242f544551..b7c68030da4f 100644

> --- a/lib/raid6/neon.uc

> +++ b/lib/raid6/neon.uc

> @@ -28,7 +28,6 @@

>

>  typedef uint8x16_t unative_t;

>

> -#define NBYTES(x) ((unative_t){x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x})

>  #define NSIZE  sizeof(unative_t)

>

>  /*

> @@ -61,7 +60,7 @@ void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)

>         int d, z, z0;

>

>         register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;

> -       const unative_t x1d = NBYTES(0x1d);

> +       const unative_t x1d = vdupq_n_u8(0x1d);

>

>         z0 = disks - 3;         /* Highest data disk */

>         p = dptr[z0+1];         /* XOR parity */

> @@ -92,7 +91,7 @@ void raid6_neon$#_xor_syndrome_real(int disks, int start, int stop,

>         int d, z, z0;

>

>         register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;

> -       const unative_t x1d = NBYTES(0x1d);

> +       const unative_t x1d = vdupq_n_u8(0x1d);

>

>         z0 = stop;              /* P/Q right side optimization */

>         p = dptr[disks-2];      /* XOR parity */

> diff --git a/lib/raid6/recov_neon_inner.c b/lib/raid6/recov_neon_inner.c

> index 8cd20c9f834a..7d00c31a6547 100644

> --- a/lib/raid6/recov_neon_inner.c

> +++ b/lib/raid6/recov_neon_inner.c

> @@ -10,11 +10,6 @@

>

>  #include <arm_neon.h>

>

> -static const uint8x16_t x0f = {

> -       0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,

> -       0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,

> -};

> -

>  #ifdef CONFIG_ARM

>  /*

>   * AArch32 does not provide this intrinsic natively because it does not

> @@ -41,6 +36,7 @@ void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,

>         uint8x16_t pm1 = vld1q_u8(pbmul + 16);

>         uint8x16_t qm0 = vld1q_u8(qmul);

>         uint8x16_t qm1 = vld1q_u8(qmul + 16);

> +       uint8x16_t x0f = vdupq_n_u8(0x0f);

>

>         /*

>          * while ( bytes-- ) {

> @@ -87,6 +83,7 @@ void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,

>  {

>         uint8x16_t qm0 = vld1q_u8(qmul);

>         uint8x16_t qm1 = vld1q_u8(qmul + 16);

> +       uint8x16_t x0f = vdupq_n_u8(0x0f);

>

>         /*

>          * while (bytes--) {

> --

> 2.21.0.rc2.261.ga7da99ff1b-goog

>
Nick Desaulniers Feb. 26, 2019, 8:44 p.m. UTC | #2
On Mon, Feb 25, 2019 at 11:19 PM Ard Biesheuvel
<ard.biesheuvel@linaro.org> wrote:
>

> On Tue, 26 Feb 2019 at 05:03, <ndesaulniers@google.com> wrote:

> >

> > Clang warns: vector initializers are not compatible with NEON intrinsics

> > in big endian mode [-Wnonportable-vector-initialization]

> >

> > While this is usually the case, it's not an issue for this case since

> > we're initializing the uint8x16_t (16x uint8_t's) with the same value.

> >

> > Instead, use vdupq_n_u8 which both compilers lower into a single movi

> > instruction: https://godbolt.org/z/vBrgzt

> >

> > This avoids the static storage for a constant value.

> >

> > Link: https://github.com/ClangBuiltLinux/linux/issues/214

> > Suggested-by: Nathan Chancellor <natechancellor@gmail.com>

> > Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>

>

> Much better, thanks,

>

> Did you double check that the intrinsic exists on 32-bit ARM as well?

> I assume it does, but please make sure if you haven't yet.


Thanks for the review!
Looking through Clang's generated arm_neon.h, vdupq_n_u8 seems to have
2 definitions predicated on __LITTLE_ENDIAN__ (not __arch64__ or
__ARM_ARCH >= 8 like some of the other types and functions).

So NEON got some additions in v8?  Is there a doc that lists them?
http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491g/BABDBBJB.html
is where I found vdupq_n_u8, but it doesn't seem to mention
compatibility (so I assume it's been around since the introduction of
NEON?).
-- 
Thanks,
~Nick Desaulniers
Ard Biesheuvel Feb. 26, 2019, 8:52 p.m. UTC | #3
On Tue, 26 Feb 2019 at 21:44, Nick Desaulniers <ndesaulniers@google.com> wrote:
>

> On Mon, Feb 25, 2019 at 11:19 PM Ard Biesheuvel

> <ard.biesheuvel@linaro.org> wrote:

> >

> > On Tue, 26 Feb 2019 at 05:03, <ndesaulniers@google.com> wrote:

> > >

> > > Clang warns: vector initializers are not compatible with NEON intrinsics

> > > in big endian mode [-Wnonportable-vector-initialization]

> > >

> > > While this is usually the case, it's not an issue for this case since

> > > we're initializing the uint8x16_t (16x uint8_t's) with the same value.

> > >

> > > Instead, use vdupq_n_u8 which both compilers lower into a single movi

> > > instruction: https://godbolt.org/z/vBrgzt

> > >

> > > This avoids the static storage for a constant value.

> > >

> > > Link: https://github.com/ClangBuiltLinux/linux/issues/214

> > > Suggested-by: Nathan Chancellor <natechancellor@gmail.com>

> > > Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>

> >

> > Much better, thanks,

> >

> > Did you double check that the intrinsic exists on 32-bit ARM as well?

> > I assume it does, but please make sure if you haven't yet.

>

> Thanks for the review!


My pleasure.

> Looking through Clang's generated arm_neon.h, vdupq_n_u8 seems to have

> 2 definitions predicated on __LITTLE_ENDIAN__ (not __arch64__ or

> __ARM_ARCH >= 8 like some of the other types and functions).

>

> So NEON got some additions in v8?


Basically, yes. One example is right there in the NEON recovery code,
guarded by #ifdef CONFIG_ARM.

New intrinsics were also introduced for the crypto instructions,
although I think those were also added to the 32-bit version of
arm_neon.h

>  Is there a doc that lists them?


Not that I know of.

> http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491g/BABDBBJB.html

> is where I found vdupq_n_u8, but it doesn't seem to mention

> compatibility (so I assume it's been around since the introduction of

> NEON?).


Yes, it appears that the 32-bit arm_neon.h header on my box defines
this intrinsic, so this should be fine.
Catalin Marinas Feb. 28, 2019, 5:47 p.m. UTC | #4
On Mon, Feb 25, 2019 at 08:03:42PM -0800, ndesaulniers@google.com wrote:
> Clang warns: vector initializers are not compatible with NEON intrinsics

> in big endian mode [-Wnonportable-vector-initialization]

> 

> While this is usually the case, it's not an issue for this case since

> we're initializing the uint8x16_t (16x uint8_t's) with the same value.

> 

> Instead, use vdupq_n_u8 which both compilers lower into a single movi

> instruction: https://godbolt.org/z/vBrgzt

> 

> This avoids the static storage for a constant value.

> 

> Link: https://github.com/ClangBuiltLinux/linux/issues/214

> Suggested-by: Nathan Chancellor <natechancellor@gmail.com>

> Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>

> ---

>  lib/raid6/neon.uc            | 5 ++---

>  lib/raid6/recov_neon_inner.c | 7 ++-----

>  2 files changed, 4 insertions(+), 8 deletions(-)


Queued for 5.1. Thanks.

-- 
Catalin
Robin Murphy Feb. 28, 2019, 6 p.m. UTC | #5
On 26/02/2019 20:44, Nick Desaulniers wrote:
> On Mon, Feb 25, 2019 at 11:19 PM Ard Biesheuvel

> <ard.biesheuvel@linaro.org> wrote:

>>

>> On Tue, 26 Feb 2019 at 05:03, <ndesaulniers@google.com> wrote:

>>>

>>> Clang warns: vector initializers are not compatible with NEON intrinsics

>>> in big endian mode [-Wnonportable-vector-initialization]

>>>

>>> While this is usually the case, it's not an issue for this case since

>>> we're initializing the uint8x16_t (16x uint8_t's) with the same value.

>>>

>>> Instead, use vdupq_n_u8 which both compilers lower into a single movi

>>> instruction: https://godbolt.org/z/vBrgzt

>>>

>>> This avoids the static storage for a constant value.

>>>

>>> Link: https://github.com/ClangBuiltLinux/linux/issues/214

>>> Suggested-by: Nathan Chancellor <natechancellor@gmail.com>

>>> Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>

>>

>> Much better, thanks,

>>

>> Did you double check that the intrinsic exists on 32-bit ARM as well?

>> I assume it does, but please make sure if you haven't yet.

> 

> Thanks for the review!

> Looking through Clang's generated arm_neon.h, vdupq_n_u8 seems to have

> 2 definitions predicated on __LITTLE_ENDIAN__ (not __arch64__ or

> __ARM_ARCH >= 8 like some of the other types and functions).

> 

> So NEON got some additions in v8?  Is there a doc that lists them?

> http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491g/BABDBBJB.html

> is where I found vdupq_n_u8, but it doesn't seem to mention

> compatibility (so I assume it's been around since the introduction of

> NEON?).


FWIW the most recent 'proper' spec document I know of is this one:

http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073b/index.html


Apparently we have a more interactive playground on the new site, too:

https://developer.arm.com/technologies/neon/intrinsics


Robin.
Nick Desaulniers Feb. 28, 2019, 6:51 p.m. UTC | #6
On Thu, Feb 28, 2019 at 10:00 AM Robin Murphy <robin.murphy@arm.com> wrote:
>

> On 26/02/2019 20:44, Nick Desaulniers wrote:

> > On Mon, Feb 25, 2019 at 11:19 PM Ard Biesheuvel

> > <ard.biesheuvel@linaro.org> wrote:

> >>

> >> On Tue, 26 Feb 2019 at 05:03, <ndesaulniers@google.com> wrote:

> >>>

> >>> Clang warns: vector initializers are not compatible with NEON intrinsics

> >>> in big endian mode [-Wnonportable-vector-initialization]

> >>>

> >>> While this is usually the case, it's not an issue for this case since

> >>> we're initializing the uint8x16_t (16x uint8_t's) with the same value.

> >>>

> >>> Instead, use vdupq_n_u8 which both compilers lower into a single movi

> >>> instruction: https://godbolt.org/z/vBrgzt

> >>>

> >>> This avoids the static storage for a constant value.

> >>>

> >>> Link: https://github.com/ClangBuiltLinux/linux/issues/214

> >>> Suggested-by: Nathan Chancellor <natechancellor@gmail.com>

> >>> Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>

> >>

> >> Much better, thanks,

> >>

> >> Did you double check that the intrinsic exists on 32-bit ARM as well?

> >> I assume it does, but please make sure if you haven't yet.

> >

> > Thanks for the review!

> > Looking through Clang's generated arm_neon.h, vdupq_n_u8 seems to have

> > 2 definitions predicated on __LITTLE_ENDIAN__ (not __arch64__ or

> > __ARM_ARCH >= 8 like some of the other types and functions).

> >

> > So NEON got some additions in v8?  Is there a doc that lists them?

> > http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491g/BABDBBJB.html

> > is where I found vdupq_n_u8, but it doesn't seem to mention

> > compatibility (so I assume it's been around since the introduction of

> > NEON?).

>

> FWIW the most recent 'proper' spec document I know of is this one:

>

> http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073b/index.html


Bookmarked, thanks!
Ard, page 171 mentions armv7, armv8 for supported architectures for vdupq_n_u8.

>

>

> Apparently we have a more interactive playground on the new site, too:

>

> https://developer.arm.com/technologies/neon/intrinsics


Also bookmarked! I'm also super happy to see this; I'm familiar with
Intel's equivalent:
https://software.intel.com/sites/landingpage/IntrinsicsGuide/

Interactive sites like these are quite useful.  Reading a post
recently: https://www.sigarch.org/simd-instructions-considered-harmful/

"The IA-32 instruction set has grown from 80 to around 1400
instructions since 1978, largely fueled by SIMD."

reminded me how useful and almost necessary the interactive sites are
for navigating the large swathes of SIMD extensions.  (no comment on
the title of that article)

-- 
Thanks,
~Nick Desaulniers
diff mbox series

Patch

diff --git a/lib/raid6/neon.uc b/lib/raid6/neon.uc
index d5242f544551..b7c68030da4f 100644
--- a/lib/raid6/neon.uc
+++ b/lib/raid6/neon.uc
@@ -28,7 +28,6 @@ 
 
 typedef uint8x16_t unative_t;
 
-#define NBYTES(x) ((unative_t){x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x})
 #define NSIZE	sizeof(unative_t)
 
 /*
@@ -61,7 +60,7 @@  void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
 	int d, z, z0;
 
 	register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
-	const unative_t x1d = NBYTES(0x1d);
+	const unative_t x1d = vdupq_n_u8(0x1d);
 
 	z0 = disks - 3;		/* Highest data disk */
 	p = dptr[z0+1];		/* XOR parity */
@@ -92,7 +91,7 @@  void raid6_neon$#_xor_syndrome_real(int disks, int start, int stop,
 	int d, z, z0;
 
 	register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
-	const unative_t x1d = NBYTES(0x1d);
+	const unative_t x1d = vdupq_n_u8(0x1d);
 
 	z0 = stop;		/* P/Q right side optimization */
 	p = dptr[disks-2];	/* XOR parity */
diff --git a/lib/raid6/recov_neon_inner.c b/lib/raid6/recov_neon_inner.c
index 8cd20c9f834a..7d00c31a6547 100644
--- a/lib/raid6/recov_neon_inner.c
+++ b/lib/raid6/recov_neon_inner.c
@@ -10,11 +10,6 @@ 
 
 #include <arm_neon.h>
 
-static const uint8x16_t x0f = {
-	0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
-	0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
-};
-
 #ifdef CONFIG_ARM
 /*
  * AArch32 does not provide this intrinsic natively because it does not
@@ -41,6 +36,7 @@  void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,
 	uint8x16_t pm1 = vld1q_u8(pbmul + 16);
 	uint8x16_t qm0 = vld1q_u8(qmul);
 	uint8x16_t qm1 = vld1q_u8(qmul + 16);
+	uint8x16_t x0f = vdupq_n_u8(0x0f);
 
 	/*
 	 * while ( bytes-- ) {
@@ -87,6 +83,7 @@  void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
 {
 	uint8x16_t qm0 = vld1q_u8(qmul);
 	uint8x16_t qm1 = vld1q_u8(qmul + 16);
+	uint8x16_t x0f = vdupq_n_u8(0x0f);
 
 	/*
 	 * while (bytes--) {