
[5/5] ARM: asm/div64.h: adjust to generic code

Message ID 1446503610-6942-6-git-send-email-nicolas.pitre@linaro.org
State Accepted
Commit 040b323b5012b5503561ec7fe15cccd6a4bcaec2

Commit Message

Nicolas Pitre Nov. 2, 2015, 10:33 p.m. UTC
Now that the constant divisor optimization is made generic, adapt the
ARM case to it.

Signed-off-by: Nicolas Pitre <nico@linaro.org>

---
 arch/arm/include/asm/div64.h | 283 ++++++++++++++-----------------------------
 1 file changed, 93 insertions(+), 190 deletions(-)

-- 
2.4.3

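As context for the commit message above, a hedged sketch of the kind of call site the constant divisor optimization targets (kernel context assumed; ns_to_secs() is a hypothetical helper, not part of this patch). Because NSEC_PER_SEC is a compile-time constant, the generic __div64_const32() path can turn the division into the multiply sequence provided by __arch_xprod_64() in the patch below:

static inline u64 ns_to_secs(u64 ns, u32 *rem)
{
	/* NSEC_PER_SEC is constant, so this compiles down to a few
	 * multiplies by a precomputed inverse instead of a call to
	 * __do_div64 */
	*rem = do_div(ns, NSEC_PER_SEC);
	return ns;	/* do_div() left the quotient in ns */
}

A non-constant divisor still goes through the out-of-line __do_div64 helper via the __div64_32() wrapper in the patch.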

Comments

Nicolas Pitre Nov. 19, 2015, 4:42 p.m. UTC | #1
On Thu, 19 Nov 2015, Måns Rullgård wrote:

> Nicolas Pitre <nicolas.pitre@linaro.org> writes:
> 
> > +static inline uint64_t __arch_xprod_64(uint64_t m, uint64_t n, bool bias)
> > +{
> > +	unsigned long long res;
> > +	unsigned int tmp = 0;
> > +
> > +	if (!bias) {
> > +		asm (	"umull	%Q0, %R0, %Q1, %Q2\n\t"
> > +			"mov	%Q0, #0"
> > +			: "=&r" (res)
> > +			: "r" (m), "r" (n)
> > +			: "cc");
> > +	} else if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
> > +		res = m;
> > +		asm (	"umlal	%Q0, %R0, %Q1, %Q2\n\t"
> > +			"mov	%Q0, #0"
> > +			: "+&r" (res)
> > +			: "r" (m), "r" (n)
> > +			: "cc");
> > +	} else {
> > +		asm (	"umull	%Q0, %R0, %Q2, %Q3\n\t"
> > +			"cmn	%Q0, %Q2\n\t"
> > +			"adcs	%R0, %R0, %R2\n\t"
> > +			"adc	%Q0, %1, #0"
> > +			: "=&r" (res), "+&r" (tmp)
> > +			: "r" (m), "r" (n)
> 
> Why is tmp using a +r constraint here?  The register is not written, so
> using an input-only operand could/should result in better code.  That is
> also what the old code did.


No, it is worse.  gcc allocates two registers because, somehow, it
doesn't think that the first one still holds zero after the first use.
Tying tmp as a read-write operand forces the use of a single temporary
register throughout, producing better code.

I meant to have this split out in a separate patch but messed it up 
somehow.
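
(Aside: a minimal standalone illustration of the operand style under discussion. This is a hedged sketch, not kernel code; demo() and its asm are made up for this note. The zero-valued temporary is listed as a "+&r" read-write operand in both asm statements even though neither actually writes it, which is what ties it to one register for the whole sequence.)

#include <stdint.h>

/* hedged ARM32-only sketch, not from the patch */
static uint64_t demo(uint32_t a, uint32_t b)
{
	uint32_t zero = 0;
	uint64_t res;

	/* res = (uint64_t)a * b, with the zero register folded in */
	asm (	"umull	%Q0, %R0, %2, %3\n\t"
		"adds	%Q0, %Q0, %1\n\t"
		"adc	%R0, %R0, %1"
		: "=&r" (res), "+&r" (zero)
		: "r" (a), "r" (b)
		: "cc");

	/* second use of the same zero: the "+&r" tie keeps it in the
	 * register allocated above instead of a fresh one */
	asm (	"adds	%Q0, %Q0, %1\n\t"
		"adc	%R0, %R0, %1"
		: "+&r" (res), "+&r" (zero)
		:
		: "cc");

	return res;
}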

> 
> > +			: "cc");
> > +	}
> > +
> > +	if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
> > +		asm (	"umlal	%R0, %Q0, %R1, %Q2\n\t"
> > +			"umlal	%R0, %Q0, %Q1, %R2\n\t"
> > +			"mov	%R0, #0\n\t"
> > +			"umlal	%Q0, %R0, %R1, %R2"
> > +			: "+&r" (res)
> > +			: "r" (m), "r" (n)
> > +			: "cc");
> > +	} else {
> > +		asm (	"umlal	%R0, %Q0, %R2, %Q3\n\t"
> > +			"umlal	%R0, %1, %Q2, %R3\n\t"
> > +			"mov	%R0, #0\n\t"
> > +			"adds	%Q0, %1, %Q0\n\t"
> > +			"adc	%R0, %R0, #0\n\t"
> > +			"umlal	%Q0, %R0, %R2, %R3"
> > +			: "+&r" (res), "+&r" (tmp)
> > +			: "r" (m), "r" (n)
> > +			: "cc");
> > +	}
> > +
> > +	return res;
> > +}
> 
> -- 
> Måns Rullgård
> mans@mansr.com

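For readers skimming the assembly above, here is a plain-C reference for what __arch_xprod_64() is expected to compute, written with a 128-bit type for clarity. This is an illustrative sketch for a compiler providing unsigned __int128 (e.g. gcc on a 64-bit host), not something that belongs in the 32-bit ARM header; the inline assembly computes the same value using only 32 x 32 -> 64 bit multiplies (umull/umlal).

#include <stdint.h>
#include <stdbool.h>

/* reference semantics: result = ((bias ? m : 0) + m * n) >> 64 */
static inline uint64_t xprod_64_reference(uint64_t m, uint64_t n, bool bias)
{
	unsigned __int128 acc = bias ? m : 0;

	acc += (unsigned __int128)m * n;
	return (uint64_t)(acc >> 64);
}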

Patch

diff --git a/arch/arm/include/asm/div64.h b/arch/arm/include/asm/div64.h
index 662c7bd061..626bbb3671 100644
--- a/arch/arm/include/asm/div64.h
+++ b/arch/arm/include/asm/div64.h
@@ -5,9 +5,9 @@ 
 #include <asm/compiler.h>
 
 /*
- * The semantics of do_div() are:
+ * The semantics of __div64_32() are:
  *
- * uint32_t do_div(uint64_t *n, uint32_t base)
+ * uint32_t __div64_32(uint64_t *n, uint32_t base)
  * {
  * 	uint32_t remainder = *n % base;
  * 	*n = *n / base;
@@ -16,8 +16,9 @@ 
  *
  * In other words, a 64-bit dividend with a 32-bit divisor producing
  * a 64-bit result and a 32-bit remainder.  To accomplish this optimally
- * we call a special __do_div64 helper with completely non standard
- * calling convention for arguments and results (beware).
+ * we override the generic version in lib/div64.c to call our __do_div64
+ * assembly implementation with completely non standard calling convention
+ * for arguments and results (beware).
  */
 
 #ifdef __ARMEB__
@@ -28,199 +29,101 @@ 
 #define __xh "r1"
 #endif
 
-#define __do_div_asm(n, base)					\
-({								\
-	register unsigned int __base      asm("r4") = base;	\
-	register unsigned long long __n   asm("r0") = n;	\
-	register unsigned long long __res asm("r2");		\
-	register unsigned int __rem       asm(__xh);		\
-	asm(	__asmeq("%0", __xh)				\
-		__asmeq("%1", "r2")				\
-		__asmeq("%2", "r0")				\
-		__asmeq("%3", "r4")				\
-		"bl	__do_div64"				\
-		: "=r" (__rem), "=r" (__res)			\
-		: "r" (__n), "r" (__base)			\
-		: "ip", "lr", "cc");				\
-	n = __res;						\
-	__rem;							\
-})
-
-#if __GNUC__ < 4 || !defined(CONFIG_AEABI)
+static inline uint32_t __div64_32(uint64_t *n, uint32_t base)
+{
+	register unsigned int __base      asm("r4") = base;
+	register unsigned long long __n   asm("r0") = *n;
+	register unsigned long long __res asm("r2");
+	register unsigned int __rem       asm(__xh);
+	asm(	__asmeq("%0", __xh)
+		__asmeq("%1", "r2")
+		__asmeq("%2", "r0")
+		__asmeq("%3", "r4")
+		"bl	__do_div64"
+		: "=r" (__rem), "=r" (__res)
+		: "r" (__n), "r" (__base)
+		: "ip", "lr", "cc");
+	*n = __res;
+	return __rem;
+}
+#define __div64_32 __div64_32
+
+#if !defined(CONFIG_AEABI)
 
 /*
- * gcc versions earlier than 4.0 are simply too problematic for the
- * optimized implementation below. First there is gcc PR 15089 that
- * tend to trig on more complex constructs, spurious .global __udivsi3
- * are inserted even if none of those symbols are referenced in the
- * generated code, and those gcc versions are not able to do constant
- * propagation on long long values anyway.
+ * In OABI configurations, some uses of the do_div function
+ * cause gcc to run out of registers. To work around that,
+ * we can force the use of the out-of-line version for
+ * configurations that build a OABI kernel.
  */
-#define do_div(n, base) __do_div_asm(n, base)
-
-#elif __GNUC__ >= 4
+#define do_div(n, base) __div64_32(&(n), base)
 
-#include <asm/bug.h>
+#else
 
 /*
- * If the divisor happens to be constant, we determine the appropriate
- * inverse at compile time to turn the division into a few inline
- * multiplications instead which is much faster. And yet only if compiling
- * for ARMv4 or higher (we need umull/umlal) and if the gcc version is
- * sufficiently recent to perform proper long long constant propagation.
- * (It is unfortunate that gcc doesn't perform all this internally.)
+ * gcc versions earlier than 4.0 are simply too problematic for the
+ * __div64_const32() code in asm-generic/div64.h. First there is
+ * gcc PR 15089 that tend to trig on more complex constructs, spurious
+ * .global __udivsi3 are inserted even if none of those symbols are
+ * referenced in the generated code, and those gcc versions are not able
+ * to do constant propagation on long long values anyway.
  */
-#define do_div(n, base)							\
-({									\
-	unsigned int __r, __b = (base);					\
-	if (!__builtin_constant_p(__b) || __b == 0 ||			\
-	    (__LINUX_ARM_ARCH__ < 4 && (__b & (__b - 1)) != 0)) {	\
-		/* non-constant divisor (or zero): slow path */		\
-		__r = __do_div_asm(n, __b);				\
-	} else if ((__b & (__b - 1)) == 0) {				\
-		/* Trivial: __b is constant and a power of 2 */		\
-		/* gcc does the right thing with this code.  */		\
-		__r = n;						\
-		__r &= (__b - 1);					\
-		n /= __b;						\
-	} else {							\
-		/* Multiply by inverse of __b: n/b = n*(p/b)/p       */	\
-		/* We rely on the fact that most of this code gets   */	\
-		/* optimized away at compile time due to constant    */	\
-		/* propagation and only a couple inline assembly     */	\
-		/* instructions should remain. Better avoid any      */	\
-		/* code construct that might prevent that.           */	\
-		unsigned long long __res, __x, __t, __m, __n = n;	\
-		unsigned int __c, __p, __z = 0;				\
-		/* preserve low part of n for reminder computation */	\
-		__r = __n;						\
-		/* determine number of bits to represent __b */		\
-		__p = 1 << __div64_fls(__b);				\
-		/* compute __m = ((__p << 64) + __b - 1) / __b */	\
-		__m = (~0ULL / __b) * __p;				\
-		__m += (((~0ULL % __b + 1) * __p) + __b - 1) / __b;	\
-		/* compute __res = __m*(~0ULL/__b*__b-1)/(__p << 64) */	\
-		__x = ~0ULL / __b * __b - 1;				\
-		__res = (__m & 0xffffffff) * (__x & 0xffffffff);	\
-		__res >>= 32;						\
-		__res += (__m & 0xffffffff) * (__x >> 32);		\
-		__t = __res;						\
-		__res += (__x & 0xffffffff) * (__m >> 32);		\
-		__t = (__res < __t) ? (1ULL << 32) : 0;			\
-		__res = (__res >> 32) + __t;				\
-		__res += (__m >> 32) * (__x >> 32);			\
-		__res /= __p;						\
-		/* Now sanitize and optimize what we've got. */		\
-		if (~0ULL % (__b / (__b & -__b)) == 0) {		\
-			/* those cases can be simplified with: */	\
-			__n /= (__b & -__b);				\
-			__m = ~0ULL / (__b / (__b & -__b));		\
-			__p = 1;					\
-			__c = 1;					\
-		} else if (__res != __x / __b) {			\
-			/* We can't get away without a correction    */	\
-			/* to compensate for bit truncation errors.  */	\
-			/* To avoid it we'd need an additional bit   */	\
-			/* to represent __m which would overflow it. */	\
-			/* Instead we do m=p/b and n/b=(n*m+m)/p.    */	\
-			__c = 1;					\
-			/* Compute __m = (__p << 64) / __b */		\
-			__m = (~0ULL / __b) * __p;			\
-			__m += ((~0ULL % __b + 1) * __p) / __b;		\
-		} else {						\
-			/* Reduce __m/__p, and try to clear bit 31   */	\
-			/* of __m when possible otherwise that'll    */	\
-			/* need extra overflow handling later.       */	\
-			unsigned int __bits = -(__m & -__m);		\
-			__bits |= __m >> 32;				\
-			__bits = (~__bits) << 1;			\
-			/* If __bits == 0 then setting bit 31 is     */	\
-			/* unavoidable.  Simply apply the maximum    */	\
-			/* possible reduction in that case.          */	\
-			/* Otherwise the MSB of __bits indicates the */	\
-			/* best reduction we should apply.           */	\
-			if (!__bits) {					\
-				__p /= (__m & -__m);			\
-				__m /= (__m & -__m);			\
-			} else {					\
-				__p >>= __div64_fls(__bits);		\
-				__m >>= __div64_fls(__bits);		\
-			}						\
-			/* No correction needed. */			\
-			__c = 0;					\
-		}							\
-		/* Now we have a combination of 2 conditions:        */	\
-		/* 1) whether or not we need a correction (__c), and */	\
-		/* 2) whether or not there might be an overflow in   */	\
-		/*    the cross product (__m & ((1<<63) | (1<<31)))  */	\
-		/* Select the best insn combination to perform the   */	\
-		/* actual __m * __n / (__p << 64) operation.         */	\
-		if (!__c) {						\
-			asm (	"umull	%Q0, %R0, %Q1, %Q2\n\t"		\
-				"mov	%Q0, #0"			\
-				: "=&r" (__res)				\
-				: "r" (__m), "r" (__n)			\
-				: "cc" );				\
-		} else if (!(__m & ((1ULL << 63) | (1ULL << 31)))) {	\
-			__res = __m;					\
-			asm (	"umlal	%Q0, %R0, %Q1, %Q2\n\t"		\
-				"mov	%Q0, #0"			\
-				: "+&r" (__res)				\
-				: "r" (__m), "r" (__n)			\
-				: "cc" );				\
-		} else {						\
-			asm (	"umull	%Q0, %R0, %Q1, %Q2\n\t"		\
-				"cmn	%Q0, %Q1\n\t"			\
-				"adcs	%R0, %R0, %R1\n\t"		\
-				"adc	%Q0, %3, #0"			\
-				: "=&r" (__res)				\
-				: "r" (__m), "r" (__n), "r" (__z)	\
-				: "cc" );				\
-		}							\
-		if (!(__m & ((1ULL << 63) | (1ULL << 31)))) {		\
-			asm (	"umlal	%R0, %Q0, %R1, %Q2\n\t"		\
-				"umlal	%R0, %Q0, %Q1, %R2\n\t"		\
-				"mov	%R0, #0\n\t"			\
-				"umlal	%Q0, %R0, %R1, %R2"		\
-				: "+&r" (__res)				\
-				: "r" (__m), "r" (__n)			\
-				: "cc" );				\
-		} else {						\
-			asm (	"umlal	%R0, %Q0, %R2, %Q3\n\t"		\
-				"umlal	%R0, %1, %Q2, %R3\n\t"		\
-				"mov	%R0, #0\n\t"			\
-				"adds	%Q0, %1, %Q0\n\t"		\
-				"adc	%R0, %R0, #0\n\t"		\
-				"umlal	%Q0, %R0, %R2, %R3"		\
-				: "+&r" (__res), "+&r" (__z)		\
-				: "r" (__m), "r" (__n)			\
-				: "cc" );				\
-		}							\
-		__res /= __p;						\
-		/* The reminder can be computed with 32-bit regs     */	\
-		/* only, and gcc is good at that.                    */	\
-		{							\
-			unsigned int __res0 = __res;			\
-			unsigned int __b0 = __b;			\
-			__r -= __res0 * __b0;				\
-		}							\
-		/* BUG_ON(__r >= __b || __res * __b + __r != n); */	\
-		n = __res;						\
-	}								\
-	__r;								\
-})
-
-/* our own fls implementation to make sure constant propagation is fine */
-#define __div64_fls(bits)						\
-({									\
-	unsigned int __left = (bits), __nr = 0;				\
-	if (__left & 0xffff0000) __nr += 16, __left >>= 16;		\
-	if (__left & 0x0000ff00) __nr +=  8, __left >>=  8;		\
-	if (__left & 0x000000f0) __nr +=  4, __left >>=  4;		\
-	if (__left & 0x0000000c) __nr +=  2, __left >>=  2;		\
-	if (__left & 0x00000002) __nr +=  1;				\
-	__nr;								\
-})
+
+#define __div64_const32_is_OK (__GNUC__ >= 4)
+
+static inline uint64_t __arch_xprod_64(uint64_t m, uint64_t n, bool bias)
+{
+	unsigned long long res;
+	unsigned int tmp = 0;
+
+	if (!bias) {
+		asm (	"umull	%Q0, %R0, %Q1, %Q2\n\t"
+			"mov	%Q0, #0"
+			: "=&r" (res)
+			: "r" (m), "r" (n)
+			: "cc");
+	} else if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
+		res = m;
+		asm (	"umlal	%Q0, %R0, %Q1, %Q2\n\t"
+			"mov	%Q0, #0"
+			: "+&r" (res)
+			: "r" (m), "r" (n)
+			: "cc");
+	} else {
+		asm (	"umull	%Q0, %R0, %Q2, %Q3\n\t"
+			"cmn	%Q0, %Q2\n\t"
+			"adcs	%R0, %R0, %R2\n\t"
+			"adc	%Q0, %1, #0"
+			: "=&r" (res), "+&r" (tmp)
+			: "r" (m), "r" (n)
+			: "cc");
+	}
+
+	if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
+		asm (	"umlal	%R0, %Q0, %R1, %Q2\n\t"
+			"umlal	%R0, %Q0, %Q1, %R2\n\t"
+			"mov	%R0, #0\n\t"
+			"umlal	%Q0, %R0, %R1, %R2"
+			: "+&r" (res)
+			: "r" (m), "r" (n)
+			: "cc");
+	} else {
+		asm (	"umlal	%R0, %Q0, %R2, %Q3\n\t"
+			"umlal	%R0, %1, %Q2, %R3\n\t"
+			"mov	%R0, #0\n\t"
+			"adds	%Q0, %1, %Q0\n\t"
+			"adc	%R0, %R0, #0\n\t"
+			"umlal	%Q0, %R0, %R2, %R3"
+			: "+&r" (res), "+&r" (tmp)
+			: "r" (m), "r" (n)
+			: "cc");
+	}
+
+	return res;
+}
+#define __arch_xprod_64 __arch_xprod_64
+
+#include <asm-generic/div64.h>
 
 #endif
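
A closing note on the "#define __div64_32 __div64_32" and "#define __arch_xprod_64 __arch_xprod_64" lines in the patch: defining a macro with the same name as the function is how the architecture header advertises its override, so the generic code can test for the name and leave its portable fallback out of the build. A hedged sketch of the idiom with made-up names (not the actual lib/div64.c or asm-generic/div64.h contents):

#include <stdint.h>

/* "arch" header side: provide the function, then advertise it */
static inline uint32_t my_div64_32(uint64_t *n, uint32_t base)
{
	uint32_t rem = (uint32_t)(*n % base);

	*n /= base;
	return rem;
}
#define my_div64_32 my_div64_32

/* "generic" header side: only define a fallback when no override exists */
#ifndef my_div64_32
static inline uint32_t my_div64_32(uint64_t *n, uint32_t base)
{
	/* portable implementation would go here */
	uint32_t rem = (uint32_t)(*n % base);

	*n /= base;
	return rem;
}
#endif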