Message ID | 20231031064553.2319688-3-xiao.w.wang@intel.com |
---|---|
State | Accepted |
Commit | 457926b253200bd9bdfae9a016a3b1d1dc661d55 |
Headers | show |
Series | riscv: Optimize bitops with Zbb extension | expand |
Hi Xiao, On Tue, Oct 31, 2023 at 7:37 AM Xiao Wang <xiao.w.wang@intel.com> wrote: > This patch leverages the alternative mechanism to dynamically optimize > bitops (including __ffs, __fls, ffs, fls) with Zbb instructions. When > Zbb ext is not supported by the runtime CPU, legacy implementation is > used. If Zbb is supported, then the optimized variants will be selected > via alternative patching. > > The legacy bitops support is taken from the generic C implementation as > fallback. > > If the parameter is a build-time constant, we leverage compiler builtin to > calculate the result directly, this approach is inspired by x86 bitops > implementation. > > EFI stub runs before the kernel, so alternative mechanism should not be > used there, this patch introduces a macro NO_ALTERNATIVE for this purpose. > > Signed-off-by: Xiao Wang <xiao.w.wang@intel.com> > Reviewed-by: Charlie Jenkins <charlie@rivosinc.com> Thanks for your patch, which is now commit 457926b253200bd9 ("riscv: Optimize bitops with Zbb extension") in riscv/for-next. > --- a/arch/riscv/include/asm/bitops.h > +++ b/arch/riscv/include/asm/bitops.h > @@ -15,13 +15,261 @@ > #include <asm/barrier.h> > #include <asm/bitsperlong.h> > > +#if !defined(CONFIG_RISCV_ISA_ZBB) || defined(NO_ALTERNATIVE) > #include <asm-generic/bitops/__ffs.h> > -#include <asm-generic/bitops/ffz.h> > -#include <asm-generic/bitops/fls.h> > #include <asm-generic/bitops/__fls.h> > +#include <asm-generic/bitops/ffs.h> > +#include <asm-generic/bitops/fls.h> > + > +#else > +#include <asm/alternative-macros.h> > +#include <asm/hwcap.h> > + > +#if (BITS_PER_LONG == 64) > +#define CTZW "ctzw " > +#define CLZW "clzw " > +#elif (BITS_PER_LONG == 32) > +#define CTZW "ctz " > +#define CLZW "clz " > +#else > +#error "Unexpected BITS_PER_LONG" > +#endif > + > +static __always_inline unsigned long variable__ffs(unsigned long word) > +{ > + int num; > + > + asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0, > + RISCV_ISA_EXT_ZBB, 1) > + : : : : legacy); > + > + asm volatile (".option push\n" > + ".option arch,+zbb\n" > + "ctz %0, %1\n" > + ".option pop\n" > + : "=r" (word) : "r" (word) :); > + > + return word; > + > +legacy: > + num = 0; > +#if BITS_PER_LONG == 64 > + if ((word & 0xffffffff) == 0) { > + num += 32; > + word >>= 32; > + } > +#endif > + if ((word & 0xffff) == 0) { > + num += 16; > + word >>= 16; > + } > + if ((word & 0xff) == 0) { > + num += 8; > + word >>= 8; > + } > + if ((word & 0xf) == 0) { > + num += 4; > + word >>= 4; > + } > + if ((word & 0x3) == 0) { > + num += 2; > + word >>= 2; > + } > + if ((word & 0x1) == 0) > + num += 1; > + return num; > +} Surely we can do better than duplicating include/asm-generic/bitops/__ffs.h? E.g. rename the generic implementation to generic___ffs(): -static __always_inline unsigned long __ffs(unsigned long word) +static __always_inline unsigned long generic__ffs(unsigned long word) { ... } +#ifndef __ffs +#define __ffs(x) generic__ffs(x) +#endif and explicitly calling the generic one here? Same comment for the other functions. Gr{oetje,eeting}s, Geert
Hi Geert, > -----Original Message----- > From: Geert Uytterhoeven <geert@linux-m68k.org> > Sent: Friday, November 10, 2023 5:25 PM > To: Wang, Xiao W <xiao.w.wang@intel.com> > Cc: paul.walmsley@sifive.com; palmer@dabbelt.com; > aou@eecs.berkeley.edu; ardb@kernel.org; anup@brainfault.org; Li, Haicheng > <haicheng.li@intel.com>; ajones@ventanamicro.com; Liu, Yujie > <yujie.liu@intel.com>; charlie@rivosinc.com; linux-riscv@lists.infradead.org; > linux-efi@vger.kernel.org; linux-kernel@vger.kernel.org > Subject: Re: [PATCH v5 2/2] riscv: Optimize bitops with Zbb extension > > Hi Xiao, > > On Tue, Oct 31, 2023 at 7:37 AM Xiao Wang <xiao.w.wang@intel.com> wrote: > > This patch leverages the alternative mechanism to dynamically optimize > > bitops (including __ffs, __fls, ffs, fls) with Zbb instructions. When > > Zbb ext is not supported by the runtime CPU, legacy implementation is > > used. If Zbb is supported, then the optimized variants will be selected > > via alternative patching. > > > > The legacy bitops support is taken from the generic C implementation as > > fallback. > > > > If the parameter is a build-time constant, we leverage compiler builtin to > > calculate the result directly, this approach is inspired by x86 bitops > > implementation. > > > > EFI stub runs before the kernel, so alternative mechanism should not be > > used there, this patch introduces a macro NO_ALTERNATIVE for this > purpose. > > > > Signed-off-by: Xiao Wang <xiao.w.wang@intel.com> > > Reviewed-by: Charlie Jenkins <charlie@rivosinc.com> > > Thanks for your patch, which is now commit 457926b253200bd9 ("riscv: > Optimize bitops with Zbb extension") in riscv/for-next. > > > --- a/arch/riscv/include/asm/bitops.h > > +++ b/arch/riscv/include/asm/bitops.h > > @@ -15,13 +15,261 @@ > > #include <asm/barrier.h> > > #include <asm/bitsperlong.h> > > > > +#if !defined(CONFIG_RISCV_ISA_ZBB) || defined(NO_ALTERNATIVE) > > #include <asm-generic/bitops/__ffs.h> > > -#include <asm-generic/bitops/ffz.h> > > -#include <asm-generic/bitops/fls.h> > > #include <asm-generic/bitops/__fls.h> > > +#include <asm-generic/bitops/ffs.h> > > +#include <asm-generic/bitops/fls.h> > > + > > +#else > > +#include <asm/alternative-macros.h> > > +#include <asm/hwcap.h> > > + > > +#if (BITS_PER_LONG == 64) > > +#define CTZW "ctzw " > > +#define CLZW "clzw " > > +#elif (BITS_PER_LONG == 32) > > +#define CTZW "ctz " > > +#define CLZW "clz " > > +#else > > +#error "Unexpected BITS_PER_LONG" > > +#endif > > + > > +static __always_inline unsigned long variable__ffs(unsigned long word) > > +{ > > + int num; > > + > > + asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0, > > + RISCV_ISA_EXT_ZBB, 1) > > + : : : : legacy); > > + > > + asm volatile (".option push\n" > > + ".option arch,+zbb\n" > > + "ctz %0, %1\n" > > + ".option pop\n" > > + : "=r" (word) : "r" (word) :); > > + > > + return word; > > + > > +legacy: > > + num = 0; > > +#if BITS_PER_LONG == 64 > > + if ((word & 0xffffffff) == 0) { > > + num += 32; > > + word >>= 32; > > + } > > +#endif > > + if ((word & 0xffff) == 0) { > > + num += 16; > > + word >>= 16; > > + } > > + if ((word & 0xff) == 0) { > > + num += 8; > > + word >>= 8; > > + } > > + if ((word & 0xf) == 0) { > > + num += 4; > > + word >>= 4; > > + } > > + if ((word & 0x3) == 0) { > > + num += 2; > > + word >>= 2; > > + } > > + if ((word & 0x1) == 0) > > + num += 1; > > + return num; > > +} > > Surely we can do better than duplicating include/asm-generic/bitops/__ffs.h? > > E.g. rename the generic implementation to generic___ffs(): > > -static __always_inline unsigned long __ffs(unsigned long word) > +static __always_inline unsigned long generic__ffs(unsigned long word) > { > ... > } > > +#ifndef __ffs > +#define __ffs(x) generic__ffs(x) > +#endif > > and explicitly calling the generic one here? > > Same comment for the other functions. Thanks for the suggestion. I just tried your above example and got build error of "__ffs" redefinition, I think we can change the macro condition to: +#ifndef __HAVE_ARCH___FFS +#define __ffs(word) generic___ffs(word) +#endif Besides, adding a "generic_" prefix to __ffs would make it as "generic___ffs". I saw similar API names in include/asm-generic/bitops/generic-non-atomic.h. I would send a patch for this. BRs, Xiao > > Gr{oetje,eeting}s, > > Geert > > -- > Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux- > m68k.org > > In personal conversations with technical people, I call myself a hacker. But > when I'm talking to journalists I just say "programmer" or something like that. > -- Linus Torvalds
diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h index 3540b690944b..b212c2708cda 100644 --- a/arch/riscv/include/asm/bitops.h +++ b/arch/riscv/include/asm/bitops.h @@ -15,13 +15,261 @@ #include <asm/barrier.h> #include <asm/bitsperlong.h> +#if !defined(CONFIG_RISCV_ISA_ZBB) || defined(NO_ALTERNATIVE) #include <asm-generic/bitops/__ffs.h> -#include <asm-generic/bitops/ffz.h> -#include <asm-generic/bitops/fls.h> #include <asm-generic/bitops/__fls.h> +#include <asm-generic/bitops/ffs.h> +#include <asm-generic/bitops/fls.h> + +#else +#include <asm/alternative-macros.h> +#include <asm/hwcap.h> + +#if (BITS_PER_LONG == 64) +#define CTZW "ctzw " +#define CLZW "clzw " +#elif (BITS_PER_LONG == 32) +#define CTZW "ctz " +#define CLZW "clz " +#else +#error "Unexpected BITS_PER_LONG" +#endif + +static __always_inline unsigned long variable__ffs(unsigned long word) +{ + int num; + + asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0, + RISCV_ISA_EXT_ZBB, 1) + : : : : legacy); + + asm volatile (".option push\n" + ".option arch,+zbb\n" + "ctz %0, %1\n" + ".option pop\n" + : "=r" (word) : "r" (word) :); + + return word; + +legacy: + num = 0; +#if BITS_PER_LONG == 64 + if ((word & 0xffffffff) == 0) { + num += 32; + word >>= 32; + } +#endif + if ((word & 0xffff) == 0) { + num += 16; + word >>= 16; + } + if ((word & 0xff) == 0) { + num += 8; + word >>= 8; + } + if ((word & 0xf) == 0) { + num += 4; + word >>= 4; + } + if ((word & 0x3) == 0) { + num += 2; + word >>= 2; + } + if ((word & 0x1) == 0) + num += 1; + return num; +} + +/** + * __ffs - find first set bit in a long word + * @word: The word to search + * + * Undefined if no set bit exists, so code should check against 0 first. + */ +#define __ffs(word) \ + (__builtin_constant_p(word) ? \ + (unsigned long)__builtin_ctzl(word) : \ + variable__ffs(word)) + +static __always_inline unsigned long variable__fls(unsigned long word) +{ + int num; + + asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0, + RISCV_ISA_EXT_ZBB, 1) + : : : : legacy); + + asm volatile (".option push\n" + ".option arch,+zbb\n" + "clz %0, %1\n" + ".option pop\n" + : "=r" (word) : "r" (word) :); + + return BITS_PER_LONG - 1 - word; + +legacy: + num = BITS_PER_LONG - 1; +#if BITS_PER_LONG == 64 + if (!(word & (~0ul << 32))) { + num -= 32; + word <<= 32; + } +#endif + if (!(word & (~0ul << (BITS_PER_LONG - 16)))) { + num -= 16; + word <<= 16; + } + if (!(word & (~0ul << (BITS_PER_LONG - 8)))) { + num -= 8; + word <<= 8; + } + if (!(word & (~0ul << (BITS_PER_LONG - 4)))) { + num -= 4; + word <<= 4; + } + if (!(word & (~0ul << (BITS_PER_LONG - 2)))) { + num -= 2; + word <<= 2; + } + if (!(word & (~0ul << (BITS_PER_LONG - 1)))) + num -= 1; + return num; +} + +/** + * __fls - find last set bit in a long word + * @word: the word to search + * + * Undefined if no set bit exists, so code should check against 0 first. + */ +#define __fls(word) \ + (__builtin_constant_p(word) ? \ + (unsigned long)(BITS_PER_LONG - 1 - __builtin_clzl(word)) : \ + variable__fls(word)) + +static __always_inline int variable_ffs(int x) +{ + int r; + + if (!x) + return 0; + + asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0, + RISCV_ISA_EXT_ZBB, 1) + : : : : legacy); + + asm volatile (".option push\n" + ".option arch,+zbb\n" + CTZW "%0, %1\n" + ".option pop\n" + : "=r" (r) : "r" (x) :); + + return r + 1; + +legacy: + r = 1; + if (!(x & 0xffff)) { + x >>= 16; + r += 16; + } + if (!(x & 0xff)) { + x >>= 8; + r += 8; + } + if (!(x & 0xf)) { + x >>= 4; + r += 4; + } + if (!(x & 3)) { + x >>= 2; + r += 2; + } + if (!(x & 1)) { + x >>= 1; + r += 1; + } + return r; +} + +/** + * ffs - find first set bit in a word + * @x: the word to search + * + * This is defined the same way as the libc and compiler builtin ffs routines. + * + * ffs(value) returns 0 if value is 0 or the position of the first set bit if + * value is nonzero. The first (least significant) bit is at position 1. + */ +#define ffs(x) (__builtin_constant_p(x) ? __builtin_ffs(x) : variable_ffs(x)) + +static __always_inline int variable_fls(unsigned int x) +{ + int r; + + if (!x) + return 0; + + asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0, + RISCV_ISA_EXT_ZBB, 1) + : : : : legacy); + + asm volatile (".option push\n" + ".option arch,+zbb\n" + CLZW "%0, %1\n" + ".option pop\n" + : "=r" (r) : "r" (x) :); + + return 32 - r; + +legacy: + r = 32; + if (!(x & 0xffff0000u)) { + x <<= 16; + r -= 16; + } + if (!(x & 0xff000000u)) { + x <<= 8; + r -= 8; + } + if (!(x & 0xf0000000u)) { + x <<= 4; + r -= 4; + } + if (!(x & 0xc0000000u)) { + x <<= 2; + r -= 2; + } + if (!(x & 0x80000000u)) { + x <<= 1; + r -= 1; + } + return r; +} + +/** + * fls - find last set bit in a word + * @x: the word to search + * + * This is defined in a similar way as ffs, but returns the position of the most + * significant set bit. + * + * fls(value) returns 0 if value is 0 or the position of the last set bit if + * value is nonzero. The last (most significant) bit is at position 32. + */ +#define fls(x) \ +({ \ + typeof(x) x_ = (x); \ + __builtin_constant_p(x_) ? \ + (int)((x_ != 0) ? (32 - __builtin_clz(x_)) : 0) \ + : \ + variable_fls(x_); \ +}) + +#endif /* !defined(CONFIG_RISCV_ISA_ZBB) || defined(NO_ALTERNATIVE) */ + +#include <asm-generic/bitops/ffz.h> #include <asm-generic/bitops/fls64.h> #include <asm-generic/bitops/sched.h> -#include <asm-generic/bitops/ffs.h> #include <asm-generic/bitops/hweight.h> diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index a1157c2a7170..d68cacd4e3af 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -28,7 +28,7 @@ cflags-$(CONFIG_ARM) += -DEFI_HAVE_STRLEN -DEFI_HAVE_STRNLEN \ -DEFI_HAVE_MEMCHR -DEFI_HAVE_STRRCHR \ -DEFI_HAVE_STRCMP -fno-builtin -fpic \ $(call cc-option,-mno-single-pic-base) -cflags-$(CONFIG_RISCV) += -fpic +cflags-$(CONFIG_RISCV) += -fpic -DNO_ALTERNATIVE cflags-$(CONFIG_LOONGARCH) += -fpie cflags-$(CONFIG_EFI_PARAMS_FROM_FDT) += -I$(srctree)/scripts/dtc/libfdt