diff mbox series

[19/23] math: Use atan2f from CORE-MATH

Message ID 20241129132032.476978-20-adhemerval.zanella@linaro.org
State New
Headers show
Series Add remaining CORE-MATH binary32 implementations to libm | expand

Commit Message

Adhemerval Zanella Nov. 29, 2024, 1:17 p.m. UTC
The CORE-MATH implementation is correctly rounded (for any rounding mode)
and shows slightly better performance than the generic atan2f.

The code was adapted to glibc style and to use the definition of
math_config.h (to handle errno, overflow, and underflow).

Benchtest on x86_64 (Ryzen 9 5900X, gcc 14.2.1), aarch64 (Neoverse-N1,
gcc 13.3.1), and powerpc (POWER10, gcc 13.2.1):

Latency                      master        patched   improvement
x86_64                      68.1175        69.2014        -1.59%
x86_64v2                    66.9884        66.0081         1.46%
x86_64v3                    57.7034        61.6407        -6.82%
i686                       189.8690        152.7560       19.55%
aarch64 (Neoverse)          32.6151        24.5382        24.76%
power10                     21.7282        17.1896        20.89%

reciprocal-throughput        master        patched   improvement
x86_64                      34.5202        31.6155         8.41%
x86_64v2                    32.6379        30.3372         7.05%
x86_64v3                    34.3677        23.6455        31.20%
i686                       157.7290        75.8308        51.92%
aarch64 (Neoverse)          27.7788        16.2671        41.44%
power10                     15.5715         8.1588        47.60%

Signed-off-by: Alexei Sibidanov <sibid@uvic.ca>
Signed-off-by: Paul Zimmermann <Paul.Zimmermann@inria.fr>
Signed-off-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
---
 SHARED-FILES                                  |   4 +
 sysdeps/aarch64/libm-test-ulps                |   4 -
 sysdeps/alpha/fpu/libm-test-ulps              |   4 -
 sysdeps/arc/fpu/libm-test-ulps                |   4 -
 sysdeps/arc/nofpu/libm-test-ulps              |   1 -
 sysdeps/arm/libm-test-ulps                    |   4 -
 sysdeps/csky/fpu/libm-test-ulps               |   4 -
 sysdeps/csky/nofpu/libm-test-ulps             |   4 -
 sysdeps/hppa/fpu/libm-test-ulps               |   4 -
 sysdeps/i386/fpu/e_atan2f.S                   |  30 --
 sysdeps/i386/fpu/libm-test-ulps               |   3 -
 .../i386/i686/fpu/multiarch/libm-test-ulps    |   3 -
 sysdeps/ieee754/flt-32/e_atan2f.c             | 337 ++++++++++++++----
 sysdeps/loongarch/lp64/libm-test-ulps         |   4 -
 sysdeps/m68k/coldfire/fpu/libm-test-ulps      |   1 -
 sysdeps/microblaze/libm-test-ulps             |   1 -
 sysdeps/mips/mips32/libm-test-ulps            |   4 -
 sysdeps/mips/mips64/libm-test-ulps            |   4 -
 sysdeps/or1k/fpu/libm-test-ulps               |   4 -
 sysdeps/or1k/nofpu/libm-test-ulps             |   4 -
 sysdeps/powerpc/fpu/libm-test-ulps            |   4 -
 sysdeps/powerpc/nofpu/libm-test-ulps          |   4 -
 sysdeps/riscv/nofpu/libm-test-ulps            |   4 -
 sysdeps/riscv/rvd/libm-test-ulps              |   4 -
 sysdeps/s390/fpu/libm-test-ulps               |   4 -
 sysdeps/sh/libm-test-ulps                     |   2 -
 sysdeps/sparc/fpu/libm-test-ulps              |   4 -
 sysdeps/x86_64/fpu/libm-test-ulps             |   4 -
 28 files changed, 262 insertions(+), 196 deletions(-)
 delete mode 100644 sysdeps/i386/fpu/e_atan2f.S
diff mbox series

Patch

diff --git a/SHARED-FILES b/SHARED-FILES
index b9627afdfe..99f8554393 100644
--- a/SHARED-FILES
+++ b/SHARED-FILES
@@ -314,3 +314,7 @@  sysdeps/ieee754/flt-32/s_atanf.c:
   (src/binary32/atan/atanf.c in CORE-MATH)
   - The code was adapted to use glibc code style and internal
     functions to handle errno, overflow, and underflow.
+sysdeps/ieee754/flt-32/e_atan2f.c:
+  (src/binary32/atan2/atan2f.c in CORE-MATH)
+  - The code was adapted to use glibc code style and internal
+    functions to handle errno, overflow, and underflow.
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index 44934af245..f48a4d2e76 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -102,7 +102,6 @@  double: 1
 ldouble: 1
 
 Function: "atan2":
-float: 1
 ldouble: 2
 
 Function: "atan2_advsimd":
@@ -111,7 +110,6 @@  float: 2
 
 Function: "atan2_downward":
 double: 1
-float: 2
 ldouble: 2
 
 Function: "atan2_sve":
@@ -120,12 +118,10 @@  float: 2
 
 Function: "atan2_towardzero":
 double: 1
-float: 2
 ldouble: 3
 
 Function: "atan2_upward":
 double: 1
-float: 1
 ldouble: 2
 
 Function: "atan_advsimd":
diff --git a/sysdeps/alpha/fpu/libm-test-ulps b/sysdeps/alpha/fpu/libm-test-ulps
index f9c1cf7cf5..860da75a86 100644
--- a/sysdeps/alpha/fpu/libm-test-ulps
+++ b/sysdeps/alpha/fpu/libm-test-ulps
@@ -70,22 +70,18 @@  double: 1
 ldouble: 1
 
 Function: "atan2":
-float: 2
 ldouble: 2
 
 Function: "atan2_downward":
 double: 1
-float: 2
 ldouble: 2
 
 Function: "atan2_towardzero":
 double: 1
-float: 2
 ldouble: 3
 
 Function: "atan2_upward":
 double: 1
-float: 2
 ldouble: 2
 
 Function: "atan_downward":
diff --git a/sysdeps/arc/fpu/libm-test-ulps b/sysdeps/arc/fpu/libm-test-ulps
index 37b0efae66..7b5208e2f8 100644
--- a/sysdeps/arc/fpu/libm-test-ulps
+++ b/sysdeps/arc/fpu/libm-test-ulps
@@ -54,19 +54,15 @@  double: 1
 
 Function: "atan2":
 double: 7
-float: 2
 
 Function: "atan2_downward":
 double: 5
-float: 2
 
 Function: "atan2_towardzero":
 double: 5
-float: 2
 
 Function: "atan2_upward":
 double: 8
-float: 2
 
 Function: "atan_downward":
 double: 1
diff --git a/sysdeps/arc/nofpu/libm-test-ulps b/sysdeps/arc/nofpu/libm-test-ulps
index 8d283f0627..a46825b0c1 100644
--- a/sysdeps/arc/nofpu/libm-test-ulps
+++ b/sysdeps/arc/nofpu/libm-test-ulps
@@ -17,7 +17,6 @@  Function: "atan":
 double: 1
 
 Function: "atan2":
-float: 2
 
 Function: "atanh":
 double: 2
diff --git a/sysdeps/arm/libm-test-ulps b/sysdeps/arm/libm-test-ulps
index bb4ee0f2e4..5c1046b9f5 100644
--- a/sysdeps/arm/libm-test-ulps
+++ b/sysdeps/arm/libm-test-ulps
@@ -53,19 +53,15 @@  Function: "atan":
 double: 1
 
 Function: "atan2":
-float: 2
 
 Function: "atan2_downward":
 double: 1
-float: 2
 
 Function: "atan2_towardzero":
 double: 1
-float: 2
 
 Function: "atan2_upward":
 double: 1
-float: 2
 
 Function: "atan_downward":
 double: 1
diff --git a/sysdeps/csky/fpu/libm-test-ulps b/sysdeps/csky/fpu/libm-test-ulps
index 9d3fcf693d..0a18d51742 100644
--- a/sysdeps/csky/fpu/libm-test-ulps
+++ b/sysdeps/csky/fpu/libm-test-ulps
@@ -50,19 +50,15 @@  double: 3
 Function: "atan":
 
 Function: "atan2":
-float: 1
 
 Function: "atan2_downward":
 double: 1
-float: 2
 
 Function: "atan2_towardzero":
 double: 1
-float: 2
 
 Function: "atan2_upward":
 double: 1
-float: 2
 
 Function: "atan_downward":
 double: 1
diff --git a/sysdeps/csky/nofpu/libm-test-ulps b/sysdeps/csky/nofpu/libm-test-ulps
index 1bab8effc7..55be0213ec 100644
--- a/sysdeps/csky/nofpu/libm-test-ulps
+++ b/sysdeps/csky/nofpu/libm-test-ulps
@@ -50,19 +50,15 @@  double: 3
 Function: "atan":
 
 Function: "atan2":
-float: 1
 
 Function: "atan2_downward":
 double: 1
-float: 2
 
 Function: "atan2_towardzero":
 double: 1
-float: 2
 
 Function: "atan2_upward":
 double: 1
-float: 2
 
 Function: "atan_downward":
 double: 1
diff --git a/sysdeps/hppa/fpu/libm-test-ulps b/sysdeps/hppa/fpu/libm-test-ulps
index 8de00f442b..c92f46cc15 100644
--- a/sysdeps/hppa/fpu/libm-test-ulps
+++ b/sysdeps/hppa/fpu/libm-test-ulps
@@ -53,19 +53,15 @@  Function: "atan":
 double: 1
 
 Function: "atan2":
-float: 2
 
 Function: "atan2_downward":
 double: 1
-float: 2
 
 Function: "atan2_towardzero":
 double: 1
-float: 2
 
 Function: "atan2_upward":
 double: 1
-float: 2
 
 Function: "atan_downward":
 double: 1
diff --git a/sysdeps/i386/fpu/e_atan2f.S b/sysdeps/i386/fpu/e_atan2f.S
deleted file mode 100644
index 606d4a6f2d..0000000000
--- a/sysdeps/i386/fpu/e_atan2f.S
+++ /dev/null
@@ -1,30 +0,0 @@ 
-/*
- * Public domain.
- */
-
-#include <machine/asm.h>
-#include <i386-math-asm.h>
-#include <libm-alias-finite.h>
-
-RCSID("$NetBSD: e_atan2f.S,v 1.1 1995/05/08 23:35:10 jtc Exp $")
-
-DEFINE_FLT_MIN
-
-#ifdef PIC
-# define MO(op) op##@GOTOFF(%ecx)
-#else
-# define MO(op) op
-#endif
-
-	.text
-ENTRY(__ieee754_atan2f)
-#ifdef  PIC
-	LOAD_PIC_REG (cx)
-#endif
-	flds	4(%esp)
-	flds	8(%esp)
-	fpatan
-	FLT_CHECK_FORCE_UFLOW_NARROW
-	ret
-END (__ieee754_atan2f)
-libm_alias_finite (__ieee754_atan2f, __atan2f)
diff --git a/sysdeps/i386/fpu/libm-test-ulps b/sysdeps/i386/fpu/libm-test-ulps
index 31286ea178..b1f5b5f900 100644
--- a/sysdeps/i386/fpu/libm-test-ulps
+++ b/sysdeps/i386/fpu/libm-test-ulps
@@ -91,19 +91,16 @@  ldouble: 1
 
 Function: "atan2_downward":
 double: 1
-float: 1
 float128: 2
 ldouble: 1
 
 Function: "atan2_towardzero":
 double: 1
-float: 1
 float128: 3
 ldouble: 1
 
 Function: "atan2_upward":
 double: 1
-float: 1
 float128: 2
 ldouble: 1
 
diff --git a/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps b/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
index 0a872570d1..b651fd4fbf 100644
--- a/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
+++ b/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
@@ -91,19 +91,16 @@  ldouble: 1
 
 Function: "atan2_downward":
 double: 1
-float: 1
 float128: 2
 ldouble: 1
 
 Function: "atan2_towardzero":
 double: 1
-float: 1
 float128: 3
 ldouble: 1
 
 Function: "atan2_upward":
 double: 1
-float: 1
 float128: 2
 ldouble: 1
 
diff --git a/sysdeps/ieee754/flt-32/e_atan2f.c b/sysdeps/ieee754/flt-32/e_atan2f.c
index 75174062e8..836202f122 100644
--- a/sysdeps/ieee754/flt-32/e_atan2f.c
+++ b/sysdeps/ieee754/flt-32/e_atan2f.c
@@ -1,94 +1,273 @@ 
-/* e_atan2f.c -- float version of e_atan2.c.
- */
+/* Correctly-rounded arctangent function of two binary32 values.
 
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
+Copyright (c) 2022-2024 Alexei Sibidanov and Paul Zimmermann.
+
+The original version of this file was copied from the CORE-MATH
+project (file src/binary32/atan2/atan2f.c, revision 7835c5d).
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
 
 #include <math.h>
-#include <math_private.h>
+#include <stdint.h>
 #include <libm-alias-finite.h>
+#include "math_config.h"
 
-static const float
-tiny  = 1.0e-30,
-zero  = 0.0,
-pi_o_4  = 7.8539818525e-01,  /* 0x3f490fdb */
-pi_o_2  = 1.5707963705e+00,  /* 0x3fc90fdb */
-pi      = 3.1415927410e+00,  /* 0x40490fdb */
-pi_lo   = -8.7422776573e-08; /* 0xb3bbbd2e */
+static inline double
+muldd (double xh, double xl, double ch, double cl, double *l)
+{
+  double ahlh = ch * xl;
+  double alhh = cl * xh;
+  double ahhh = ch * xh;
+  double ahhl = fma (ch, xh, -ahhh);
+  ahhl += alhh + ahlh;
+  ch = ahhh + ahhl;
+  *l = (ahhh - ch) + ahhl;
+  return ch;
+}
 
-float
-__ieee754_atan2f (float y, float x)
+static double
+polydd (double xh, double xl, int n, const double c[][2], double *l)
 {
-	float z;
-	int32_t k,m,hx,hy,ix,iy;
+  int i = n - 1;
+  double ch = c[i][0];
+  double cl = c[i][1];
+  while (--i >= 0)
+    {
+      ch = muldd (xh, xl, ch, cl, &cl);
+      double th = ch + c[i][0];
+      double tl = (c[i][0] - th) + ch;
+      ch = th;
+      cl += tl + c[i][1];
+    }
+  *l = cl;
+  return ch;
+}
 
-	GET_FLOAT_WORD(hx,x);
-	ix = hx&0x7fffffff;
-	GET_FLOAT_WORD(hy,y);
-	iy = hy&0x7fffffff;
-	if((ix>0x7f800000)||
-	   (iy>0x7f800000))	/* x or y is NaN */
-	   return x+y;
-	if(hx==0x3f800000) return __atanf(y);   /* x=1.0 */
-	m = ((hy>>31)&1)|((hx>>30)&2);	/* 2*sign(x)+sign(y) */
+/* for y/x tiny, use Taylor approximation z - z^3/3 where z=y/x */
+static float
+cr_atan2f_tiny (float y, float x)
+{
+  double dy = y, dx = x;
+  double z = dy / dx;
+  double e = fma (-z, x, y);
+  /* z * x + e = y thus y/x = z + e/x */
+  static const double c = -0x1.5555555555555p-2; /* -1/3 rounded to nearest */
+  double zz = z * z;
+  double cz = c * z;
+  e = e / x + cz * zz;
+  uint64_t t = asuint64 (z);
+  if ((t & UINT64_C(0xfffffff)) == 0) /* boundary case */
+    {
+      /* If z and e are of same sign (resp. of different signs), we increase
+	 (resp. decrease) the significand of t by 1 to avoid a double-rounding
+	 issue when rounding t to binary32. */
+      if (z * e > 0)
+	t += 1;
+      else
+	t -= 1;
+    }
+  return asdouble (t);
+}
 
-    /* when y = 0 */
-	if(iy==0) {
-	    switch(m) {
-		case 0:
-		case 1: return y;	/* atan(+-0,+anything)=+-0 */
-		case 2: return  pi+tiny;/* atan(+0,-anything) = pi */
-		case 3: return -pi-tiny;/* atan(-0,-anything) =-pi */
-	    }
+float
+__ieee754_atan2f (float y, float x)
+{
+  static const double cn[] =
+    {
+      0x1p+0,               0x1.40e0698f94c35p+1, 0x1.248c5da347f0dp+1,
+      0x1.d873386572976p-1, 0x1.46fa40b20f1dp-3,  0x1.33f5e041eed0fp-7,
+      0x1.546bbf28667c5p-14
+    };
+  static const double cd[] =
+    {
+      0x1p+0,               0x1.6b8b143a3f6dap+1, 0x1.8421201d18ed5p+1,
+      0x1.8221d086914ebp+0, 0x1.670657e3a07bap-2, 0x1.0f4951fd1e72dp-5,
+      0x1.b3874b8798286p-11
+    };
+  static const double m[] = { 0, 1 };
+#define pi 0x1.921fb54442d18p+1
+#define pi2 0x1.921fb54442d18p+0
+#define pi2l 0x1.1a62633145c07p-54
+  static const double off[] = { 0.0f, pi2, pi, pi2, -0.0f, -pi2, -pi, -pi2 };
+  static const double offl[] =
+    {
+      0.0f, pi2l, 2 * pi2l, pi2l, -0.0f, -pi2l, -2 * pi2l, -pi2l
+    };
+  static const double sgn[] = { 1, -1 };
+  uint32_t ux = asuint (x);
+  uint32_t uy = asuint (y);
+  uint32_t ax = ux & (~0u >> 1);
+  uint32_t ay = uy & (~0u >> 1);
+  if (__glibc_unlikely (ay >= (0xff << 23) || ax >= (0xff << 23)))
+    {
+      /* we use x+y below so that the invalid exception is set
+	 for (x,y) = (qnan,snan) or (snan,qnan) */
+      if (ay > (0xff << 23))
+	return x + y; /* nan */
+      if (ax > (0xff << 23))
+	return x + y; /* nan */
+      bool yinf = ay == (0xff << 23);
+      bool xinf = ax == (0xff << 23);
+      if (yinf & xinf)
+	{
+	  if (ux >> 31)
+	    return 0x1.2d97c7f3321d2p+1 * sgn[uy >> 31];
+	  else
+	    return 0x1.921fb54442d18p-1 * sgn[uy >> 31];
 	}
-    /* when x = 0 */
-	if(ix==0) return (hy<0)?  -pi_o_2-tiny: pi_o_2+tiny;
-
-    /* when x is INF */
-	if(ix==0x7f800000) {
-	    if(iy==0x7f800000) {
-		switch(m) {
-		    case 0: return  pi_o_4+tiny;/* atan(+INF,+INF) */
-		    case 1: return -pi_o_4-tiny;/* atan(-INF,+INF) */
-		    case 2: return  (float)3.0*pi_o_4+tiny;/*atan(+INF,-INF)*/
-		    case 3: return (float)-3.0*pi_o_4-tiny;/*atan(-INF,-INF)*/
-		}
-	    } else {
-		switch(m) {
-		    case 0: return  zero  ;	/* atan(+...,+INF) */
-		    case 1: return -zero  ;	/* atan(-...,+INF) */
-		    case 2: return  pi+tiny  ;	/* atan(+...,-INF) */
-		    case 3: return -pi-tiny  ;	/* atan(-...,-INF) */
-		}
-	    }
+      if (xinf)
+	{
+	  if (ux >> 31)
+	    return pi * sgn[uy >> 31];
+	  else
+	    return 0.0f * sgn[uy >> 31];
+	}
+      if (yinf)
+	return pi2 * sgn[uy >> 31];
+    }
+  if (__glibc_unlikely (ay == 0))
+    {
+      if (__glibc_unlikely (!(ay | ax)))
+	{
+	  uint32_t i = (uy >> 31) * 4 + (ux >> 31) * 2;
+	  if (ux >> 31)
+	    return off[i] + offl[i];
+	  else
+	    return off[i];
 	}
-    /* when y is INF */
-	if(iy==0x7f800000) return (hy<0)? -pi_o_2-tiny: pi_o_2+tiny;
+      if (!(ux >> 31))
+	return 0.0f * sgn[uy >> 31];
+    }
+  uint32_t gt = ay > ax;
+  uint32_t i = (uy >> 31) * 4 + (ux >> 31) * 2 + gt;
 
-    /* compute y/x */
-	k = (iy-ix)>>23;
-	if(k > 60) z=pi_o_2+(float)0.5*pi_lo;	/* |y/x| >  2**60 */
-	else if(hx<0&&k<-60) z=0.0;	/* |y|/x < -2**60 */
-	else z=__atanf(fabsf(y/x));	/* safe to do y/x */
-	switch (m) {
-	    case 0: return       z  ;	/* atan(+,+) */
-	    case 1: {
-		      uint32_t zh;
-		      GET_FLOAT_WORD(zh,z);
-		      SET_FLOAT_WORD(z,zh ^ 0x80000000);
-		    }
-		    return       z  ;	/* atan(-,+) */
-	    case 2: return  pi-(z-pi_lo);/* atan(+,-) */
-	    default: /* case 3 */
-		    return  (z-pi_lo)-pi;/* atan(-,-) */
+  double zx = x;
+  double zy = y;
+  double z = (m[gt] * zx + m[1 - gt] * zy) / (m[gt] * zy + m[1 - gt] * zx);
+  /* z = x/y if |y| > |x|, and z = y/x otherwise */
+  double r;
+  int d = (int) ax - (int) ay;
+  if (__glibc_likely (d < (27 << 23) && d > (-(27 << 23))))
+    {
+      double z2 = z * z, z4 = z2 * z2, z8 = z4 * z4;
+      /* z2 cannot underflow, since for |y|=0x1p-149 and |x|=0x1.fffffep+127
+	 we get |z| > 2^-277 thus z2 > 2^-554, but z4 and z8 might underflow,
+	 which might give spurious underflow exceptions. */
+      double cn0 = cn[0] + z2 * cn[1];
+      double cn2 = cn[2] + z2 * cn[3];
+      double cn4 = cn[4] + z2 * cn[5];
+      double cn6 = cn[6];
+      cn0 += z4 * cn2;
+      cn4 += z4 * cn6;
+      cn0 += z8 * cn4;
+      double cd0 = cd[0] + z2 * cd[1];
+      double cd2 = cd[2] + z2 * cd[3];
+      double cd4 = cd[4] + z2 * cd[5];
+      double cd6 = cd[6];
+      cd0 += z4 * cd2;
+      cd4 += z4 * cd6;
+      cd0 += z8 * cd4;
+      r = cn0 / cd0;
+    }
+  else
+    r = 1;
+  z *= sgn[gt];
+  r = z * r + off[i];
+  if (__glibc_unlikely (((asuint64 (r) + 8) & 0xfffffff) <= 16))
+    {
+      /* check tiny y/x */
+      if (ay < ax && ((ax - ay) >> 23 >= 25))
+	return cr_atan2f_tiny (y, x);
+      double zh;
+      double zl;
+      if (gt == 0)
+	{
+	  zh = zy / zx;
+	  zl = fma (zh, -zx, zy) / zx;
+	}
+      else
+	{
+	  zh = zx / zy;
+	  zl = fma (zh, -zy, zx) / zy;
+	}
+      double z2l;
+      double z2h = muldd (zh, zl, zh, zl, &z2l);
+      static const double c[32][2] =
+	{
+	  {  0x1p+0,                -0x1.8c1dac5492248p-87 },
+	  { -0x1.5555555555555p-2,  -0x1.55553bf3a2abep-56 },
+	  {  0x1.999999999999ap-3,  -0x1.99deed1ec9071p-57 },
+	  { -0x1.2492492492492p-3,  -0x1.fd99c8d18269ap-58 },
+	  {  0x1.c71c71c71c717p-4,  -0x1.651eee4c4d9dp-61 },
+	  { -0x1.745d1745d1649p-4,  -0x1.632683d6c44a6p-58 },
+	  {  0x1.3b13b13b11c63p-4,   0x1.bf69c1f8af41dp-58 },
+	  { -0x1.11111110e6338p-4,   0x1.3c3e431e8bb68p-61 },
+	  {  0x1.e1e1e1dc45c4ap-5,  -0x1.be2db05c77bbfp-59 },
+	  { -0x1.af286b8164b4fp-5,   0x1.a4673491f0942p-61 },
+	  {  0x1.86185e9ad4846p-5,   0x1.e12e32d79fceep-59 },
+	  { -0x1.642c6d5161faep-5,   0x1.3ce76c1ca03fp-59 },
+	  {  0x1.47ad6f277e5bfp-5,  -0x1.abd8d85bdb714p-60 },
+	  { -0x1.2f64a2ee8896dp-5,   0x1.ef87d4b615323p-61 },
+	  {  0x1.1a6a2b31741b5p-5,   0x1.a5d9d973547eep-62 },
+	  { -0x1.07fbdad65e0a6p-5,  -0x1.65ac07f5d35f4p-61 },
+	  {  0x1.ee9932a9a5f8bp-6,   0x1.f8b9623f6f55ap-61 },
+	  { -0x1.ce8b5b9584dc6p-6,   0x1.fe5af96e8ea2dp-61 },
+	  {  0x1.ac9cb288087b7p-6,  -0x1.450cdfceaf5cap-60 },
+	  { -0x1.84b025351f3e6p-6,   0x1.579561b0d73dap-61 },
+	  {  0x1.52f5b8ecdd52bp-6,   0x1.036bd2c6fba47p-60 },
+	  { -0x1.163a8c44909dcp-6,   0x1.18f735ffb9f16p-60 },
+	  {  0x1.a400dce3eea6fp-7,  -0x1.c90569c0c1b5cp-61 },
+	  { -0x1.1caa78ae6db3ap-7,  -0x1.4c60f8161ea09p-61 },
+	  {  0x1.52672453c0731p-8,   0x1.834efb598c338p-62 },
+	  { -0x1.5850c5be137cfp-9,  -0x1.445fc150ca7f5p-63 },
+	  {  0x1.23eb98d22e1cap-10, -0x1.388fbaf1d783p-64 },
+	  { -0x1.8f4e974a40741p-12,  0x1.271198a97da34p-66 },
+	  {  0x1.a5cf2e9cf76e5p-14, -0x1.887eb4a63b665p-68 },
+	  { -0x1.420c270719e32p-16,  0x1.efd595b27888bp-71 },
+	  {  0x1.3ba2d69b51677p-19, -0x1.4fb06829cdfc7p-73 },
+	  { -0x1.29b7e6f676385p-23, -0x1.a783b6de718fbp-77 }
+	};
+      double pl;
+      double ph = polydd (z2h, z2l, 32, c, &pl);
+      zh *= sgn[gt];
+      zl *= sgn[gt];
+      ph = muldd (zh, zl, ph, pl, &pl);
+      double sh = ph + off[i];
+      double sl = ((off[i] - sh) + ph) + pl + offl[i];
+      float rf = sh;
+      double th = rf;
+      double dh = sh - th;
+      double tm = dh + sl;
+      uint64_t tth = asuint64 (th);
+      if (th + th * 0x1p-60 == th - th * 0x1p-60)
+	{
+	  tth &= UINT64_C(0x7ff) << 52;
+	  tth -= UINT64_C(24) << 52;
+	  if (fabs (tm) > asdouble (tth))
+	    tm *= 1.25;
+	  else
+	    tm *= 0.75;
 	}
+      r = th + tm;
+    }
+  return r;
 }
 libm_alias_finite (__ieee754_atan2f, __atan2f)
diff --git a/sysdeps/loongarch/lp64/libm-test-ulps b/sysdeps/loongarch/lp64/libm-test-ulps
index ff1cf6b2e4..2b726c95a9 100644
--- a/sysdeps/loongarch/lp64/libm-test-ulps
+++ b/sysdeps/loongarch/lp64/libm-test-ulps
@@ -70,22 +70,18 @@  double: 1
 ldouble: 1
 
 Function: "atan2":
-float: 2
 ldouble: 2
 
 Function: "atan2_downward":
 double: 1
-float: 2
 ldouble: 2
 
 Function: "atan2_towardzero":
 double: 1
-float: 2
 ldouble: 3
 
 Function: "atan2_upward":
 double: 1
-float: 2
 ldouble: 2
 
 Function: "atan_downward":
diff --git a/sysdeps/m68k/coldfire/fpu/libm-test-ulps b/sysdeps/m68k/coldfire/fpu/libm-test-ulps
index 7e49468421..989015ff9a 100644
--- a/sysdeps/m68k/coldfire/fpu/libm-test-ulps
+++ b/sysdeps/m68k/coldfire/fpu/libm-test-ulps
@@ -2,7 +2,6 @@ 
 
 # Maximal error of functions:
 Function: "atan2":
-float: 1
 
 Function: "atanh":
 float: 1
diff --git a/sysdeps/microblaze/libm-test-ulps b/sysdeps/microblaze/libm-test-ulps
index 5dce4c8f89..30eb8f1a0c 100644
--- a/sysdeps/microblaze/libm-test-ulps
+++ b/sysdeps/microblaze/libm-test-ulps
@@ -14,7 +14,6 @@  double: 1
 Function: "atan":
 
 Function: "atan2":
-float: 1
 
 Function: "atanh":
 double: 2
diff --git a/sysdeps/mips/mips32/libm-test-ulps b/sysdeps/mips/mips32/libm-test-ulps
index 9046a17170..6b0bf0e003 100644
--- a/sysdeps/mips/mips32/libm-test-ulps
+++ b/sysdeps/mips/mips32/libm-test-ulps
@@ -53,19 +53,15 @@  Function: "atan":
 double: 1
 
 Function: "atan2":
-float: 2
 
 Function: "atan2_downward":
 double: 1
-float: 2
 
 Function: "atan2_towardzero":
 double: 1
-float: 2
 
 Function: "atan2_upward":
 double: 1
-float: 2
 
 Function: "atan_downward":
 double: 1
diff --git a/sysdeps/mips/mips64/libm-test-ulps b/sysdeps/mips/mips64/libm-test-ulps
index 1525e55eb5..1ed68bb36d 100644
--- a/sysdeps/mips/mips64/libm-test-ulps
+++ b/sysdeps/mips/mips64/libm-test-ulps
@@ -70,22 +70,18 @@  double: 1
 ldouble: 1
 
 Function: "atan2":
-float: 2
 ldouble: 2
 
 Function: "atan2_downward":
 double: 1
-float: 2
 ldouble: 2
 
 Function: "atan2_towardzero":
 double: 1
-float: 2
 ldouble: 3
 
 Function: "atan2_upward":
 double: 1
-float: 2
 ldouble: 2
 
 Function: "atan_downward":
diff --git a/sysdeps/or1k/fpu/libm-test-ulps b/sysdeps/or1k/fpu/libm-test-ulps
index 6edadaed89..56485715bd 100644
--- a/sysdeps/or1k/fpu/libm-test-ulps
+++ b/sysdeps/or1k/fpu/libm-test-ulps
@@ -53,19 +53,15 @@  Function: "atan":
 double: 1
 
 Function: "atan2":
-float: 2
 
 Function: "atan2_downward":
 double: 5
-float: 2
 
 Function: "atan2_towardzero":
 double: 5
-float: 2
 
 Function: "atan2_upward":
 double: 8
-float: 2
 
 Function: "atan_downward":
 double: 1
diff --git a/sysdeps/or1k/nofpu/libm-test-ulps b/sysdeps/or1k/nofpu/libm-test-ulps
index aff536b890..7ece256112 100644
--- a/sysdeps/or1k/nofpu/libm-test-ulps
+++ b/sysdeps/or1k/nofpu/libm-test-ulps
@@ -53,19 +53,15 @@  Function: "atan":
 double: 1
 
 Function: "atan2":
-float: 2
 
 Function: "atan2_downward":
 double: 5
-float: 2
 
 Function: "atan2_towardzero":
 double: 5
-float: 2
 
 Function: "atan2_upward":
 double: 8
-float: 2
 
 Function: "atan_downward":
 double: 1
diff --git a/sysdeps/powerpc/fpu/libm-test-ulps b/sysdeps/powerpc/fpu/libm-test-ulps
index 342054bb72..95e4d049ac 100644
--- a/sysdeps/powerpc/fpu/libm-test-ulps
+++ b/sysdeps/powerpc/fpu/libm-test-ulps
@@ -91,25 +91,21 @@  float128: 1
 ldouble: 1
 
 Function: "atan2":
-float: 1
 float128: 2
 ldouble: 2
 
 Function: "atan2_downward":
 double: 1
-float: 2
 float128: 2
 ldouble: 5
 
 Function: "atan2_towardzero":
 double: 1
-float: 2
 float128: 3
 ldouble: 6
 
 Function: "atan2_upward":
 double: 1
-float: 1
 float128: 2
 ldouble: 3
 
diff --git a/sysdeps/powerpc/nofpu/libm-test-ulps b/sysdeps/powerpc/nofpu/libm-test-ulps
index c7242e5fec..5d6a34115e 100644
--- a/sysdeps/powerpc/nofpu/libm-test-ulps
+++ b/sysdeps/powerpc/nofpu/libm-test-ulps
@@ -74,22 +74,18 @@  double: 1
 ldouble: 1
 
 Function: "atan2":
-float: 2
 ldouble: 2
 
 Function: "atan2_downward":
 double: 1
-float: 2
 ldouble: 5
 
 Function: "atan2_towardzero":
 double: 1
-float: 2
 ldouble: 6
 
 Function: "atan2_upward":
 double: 1
-float: 2
 ldouble: 3
 
 Function: "atan_downward":
diff --git a/sysdeps/riscv/nofpu/libm-test-ulps b/sysdeps/riscv/nofpu/libm-test-ulps
index 4fa17a3da2..ff1e792986 100644
--- a/sysdeps/riscv/nofpu/libm-test-ulps
+++ b/sysdeps/riscv/nofpu/libm-test-ulps
@@ -70,22 +70,18 @@  double: 1
 ldouble: 1
 
 Function: "atan2":
-float: 2
 ldouble: 2
 
 Function: "atan2_downward":
 double: 1
-float: 2
 ldouble: 2
 
 Function: "atan2_towardzero":
 double: 1
-float: 2
 ldouble: 3
 
 Function: "atan2_upward":
 double: 1
-float: 1
 ldouble: 2
 
 Function: "atan_downward":
diff --git a/sysdeps/riscv/rvd/libm-test-ulps b/sysdeps/riscv/rvd/libm-test-ulps
index 0e3fb96ee5..1a4577cd5c 100644
--- a/sysdeps/riscv/rvd/libm-test-ulps
+++ b/sysdeps/riscv/rvd/libm-test-ulps
@@ -70,22 +70,18 @@  double: 1
 ldouble: 1
 
 Function: "atan2":
-float: 1
 ldouble: 2
 
 Function: "atan2_downward":
 double: 1
-float: 2
 ldouble: 2
 
 Function: "atan2_towardzero":
 double: 1
-float: 2
 ldouble: 3
 
 Function: "atan2_upward":
 double: 1
-float: 1
 ldouble: 2
 
 Function: "atan_downward":
diff --git a/sysdeps/s390/fpu/libm-test-ulps b/sysdeps/s390/fpu/libm-test-ulps
index 921ff284af..4ff845bc0a 100644
--- a/sysdeps/s390/fpu/libm-test-ulps
+++ b/sysdeps/s390/fpu/libm-test-ulps
@@ -70,22 +70,18 @@  double: 1
 ldouble: 1
 
 Function: "atan2":
-float: 1
 ldouble: 2
 
 Function: "atan2_downward":
 double: 1
-float: 2
 ldouble: 2
 
 Function: "atan2_towardzero":
 double: 1
-float: 2
 ldouble: 3
 
 Function: "atan2_upward":
 double: 1
-float: 1
 ldouble: 2
 
 Function: "atan_downward":
diff --git a/sysdeps/sh/libm-test-ulps b/sysdeps/sh/libm-test-ulps
index b429f42d89..d87533a5cc 100644
--- a/sysdeps/sh/libm-test-ulps
+++ b/sysdeps/sh/libm-test-ulps
@@ -26,11 +26,9 @@  double: 2
 Function: "atan":
 
 Function: "atan2":
-float: 1
 
 Function: "atan2_towardzero":
 double: 1
-float: 2
 
 Function: "atan_towardzero":
 double: 1
diff --git a/sysdeps/sparc/fpu/libm-test-ulps b/sysdeps/sparc/fpu/libm-test-ulps
index ee7eea81f9..01e9cd360a 100644
--- a/sysdeps/sparc/fpu/libm-test-ulps
+++ b/sysdeps/sparc/fpu/libm-test-ulps
@@ -70,22 +70,18 @@  double: 1
 ldouble: 1
 
 Function: "atan2":
-float: 2
 ldouble: 2
 
 Function: "atan2_downward":
 double: 1
-float: 2
 ldouble: 2
 
 Function: "atan2_towardzero":
 double: 1
-float: 2
 ldouble: 3
 
 Function: "atan2_upward":
 double: 1
-float: 2
 ldouble: 2
 
 Function: "atan_downward":
diff --git a/sysdeps/x86_64/fpu/libm-test-ulps b/sysdeps/x86_64/fpu/libm-test-ulps
index 1589403c1c..2a4b3bcfc6 100644
--- a/sysdeps/x86_64/fpu/libm-test-ulps
+++ b/sysdeps/x86_64/fpu/libm-test-ulps
@@ -164,25 +164,21 @@  float128: 1
 ldouble: 1
 
 Function: "atan2":
-float: 2
 float128: 2
 ldouble: 1
 
 Function: "atan2_downward":
 double: 1
-float: 2
 float128: 2
 ldouble: 1
 
 Function: "atan2_towardzero":
 double: 1
-float: 2
 float128: 3
 ldouble: 1
 
 Function: "atan2_upward":
 double: 1
-float: 2
 float128: 2
 ldouble: 1