glibc – nexttowardf candidate for optimization on aarch64

In my previous post, I went over the steps for finding a function that is optimized for x86_64 but not for aarch64.

After going through the list of functions, I came across s_nexttowardf.c:

[lisac@localhost glibc]$ find ./* -name "*nexttowardf*"
./math/s_nexttowardf.c
./sysdeps/x86_64/fpu/s_nexttowardf.c
./sysdeps/ia64/fpu/s_nexttowardf.S
./sysdeps/ieee754/ldbl-opt/nldbl-nexttowardf.c
./sysdeps/ieee754/ldbl-opt/s_nexttowardfd.c
./sysdeps/ieee754/ldbl-128/s_nexttowardf.c
./sysdeps/ieee754/ldbl-64-128/s_nexttowardf.c
./sysdeps/ieee754/ldbl-96/s_nexttowardf.c
./sysdeps/ieee754/ldbl-128ibm/s_nexttowardf.c
./sysdeps/i386/fpu/s_nexttowardf.c

From nexttowardf’s man page:

“The nextafter(), nextafterf(), and nextafterl() functions return the next representable floating-point value following x in the direction of y. If y is less than x, these functions will return the largest representable number less than x.

If x equals y, the functions return y.

The nexttoward(), nexttowardf(), and nexttowardl() functions do the same as the corresponding nextafter() functions, except that they have a long double second argument.”

Looking at the x86_64 optimization sysdeps/x86_64/fpu/s_nexttowardf.c:

#include <sysdeps/i386/fpu/s_nexttowardf.c>

This file contains only an include statement for the i386 optimization. So I had a look at sysdeps/i386/fpu/s_nexttowardf.c, and it is essentially identical to the original math/s_nexttowardf.c version. One thing to note is the set of macros used inside __nexttowardf:

original

math/s_nexttowardf.c:

float __nexttowardf(float x, long double y)
{
    int32_t hx,hy,ix,iy;
    u_int32_t ly;

    GET_FLOAT_WORD(hx,x);
    EXTRACT_WORDS(hy,ly,y);
    ix = hx&0x7fffffff;     /* |x| */
    iy = hy&0x7fffffff;     /* |y| */
...
...
    SET_FLOAT_WORD(x,hx);

sysdeps/generic/math_private.h:

/* Get a 32 bit int from a float.

   GET_FLOAT_WORD (i, d) stores D into the "value" member of a temporary
   ieee_float_shape_type object and then assigns that object's "word"
   member to I.  D is evaluated once.

   The #ifndef guard means that a GET_FLOAT_WORD already defined when this
   point is reached is kept and this definition is skipped.  The body is
   wrapped in do ... while (0) so the macro expands to a single statement.

   NOTE(review): ieee_float_shape_type is declared outside this excerpt;
   presumably a union overlaying a float with a 32-bit integer -- confirm.  */
#ifndef GET_FLOAT_WORD
# define GET_FLOAT_WORD(i,d)                    \
do {                                \
  ieee_float_shape_type gf_u;                   \
  gf_u.value = (d);                     \
  (i) = gf_u.word;                      \
} while (0)
#endif
/* Set a float from a 32 bit int.

   SET_FLOAT_WORD (d, i) stores I into the "word" member of a temporary
   ieee_float_shape_type object and then assigns that object's "value"
   member to D.  I is evaluated once; D must be assignable.

   The #ifndef guard means that a SET_FLOAT_WORD already defined when this
   point is reached is kept and this definition is skipped.  The body is
   wrapped in do ... while (0) so the macro expands to a single statement.

   NOTE(review): ieee_float_shape_type is declared outside this excerpt;
   presumably a union overlaying a float with a 32-bit integer -- confirm.  */
#ifndef SET_FLOAT_WORD
# define SET_FLOAT_WORD(d,i)                    \
do {                                \
  ieee_float_shape_type sf_u;                   \
  sf_u.word = (i);                      \
  (d) = sf_u.value;                     \
} while (0)
#endif
/* Get two 32 bit ints from a double.

   EXTRACT_WORDS (ix0, ix1, d) stores D into the "value" member of a
   temporary ieee_double_shape_type object, then assigns parts.msw to IX0
   and parts.lsw to IX1.  D is evaluated once.

   NOTE(review): ieee_double_shape_type is declared outside this excerpt;
   msw/lsw are presumably the most and least significant 32-bit words of
   the double -- confirm.  In the math/s_nexttowardf.c excerpt the macro
   is passed a long double argument; whether that is narrowed to double by
   the assignment to "value" depends on the type's declaration -- confirm.  */

#define EXTRACT_WORDS(ix0,ix1,d)                \
do {                                \
  ieee_double_shape_type ew_u;                  \
  ew_u.value = (d);                     \
  (ix0) = ew_u.parts.msw;                   \
  (ix1) = ew_u.parts.lsw;                   \
} while (0)

x86_64

sysdeps/i386/fpu/s_nexttowardf.c:

float __nexttowardf(float x, long double y)
{
    int32_t hx,ix,iy;
    u_int32_t hy,ly,esy;

    GET_FLOAT_WORD(hx,x);
    GET_LDOUBLE_WORDS(esy,hy,ly,y);
    ix = hx&0x7fffffff;     /* |x| */
    iy = esy&0x7fff;        /* |y| */
...
...
    SET_FLOAT_WORD(x,hx);

sysdeps/x86_64/fpu/math_private.h:

/* Direct movement of float into integer register.

   GET_FLOAT_WORD (i, d) casts D to float and passes it as the input
   operand (constraint "x") of a single inline-asm instruction whose
   mnemonic comes from the MOVD string macro.  The output operand is the
   local int i_ (constraint "=rm"), which is then assigned to I.
   The instruction text is "<MOVD> %1, %0", i.e. input operand first,
   output operand second.

   NOTE(review): MOVD is defined outside this excerpt.  "x" is presumably
   the SSE register constraint and "=rm" a general register or memory
   output -- confirm against the GCC x86 constraint documentation.  */
#define GET_FLOAT_WORD(i, d) \
  do {                                        \
    int i_;                                   \
    asm (MOVD " %1, %0" : "=rm" (i_) : "x" ((float) (d)));            \
    (i) = i_;                                     \
  } while (0)
/* And the reverse.

   SET_FLOAT_WORD (f, i) copies I into the local int i_, passes i_ as the
   input operand (constraint "rm") of a single inline-asm instruction whose
   mnemonic comes from the MOVD string macro, receives the result in the
   local float f__ (constraint "=x"), and finally assigns f__ to F.

   NOTE(review): the arguments I and F are expanded without surrounding
   parentheses here, so callers should pass simple expressions.  An
   argument that is itself named i_ or f__ would collide with the macro's
   locals.
   NOTE(review): MOVD is defined outside this excerpt.  "x" is presumably
   the SSE register constraint and "rm" a general register or memory
   input -- confirm against the GCC x86 constraint documentation.  */
#define SET_FLOAT_WORD(f, i) \
  do {                                        \
    int i_ = i;                                   \
    float f__;                                    \
    asm (MOVD " %1, %0" : "=x" (f__) : "rm" (i_));                \
    f = f__;                                      \
  } while (0)

sysdeps/x86_64/fpu/math_ldbl.h:

/* Get three integer fields from a long double.

   GET_LDOUBLE_WORDS (exp, ix0, ix1, d) stores D into the "value" member
   of a temporary ieee_long_double_shape_type object, then assigns
   parts.sign_exponent to EXP, parts.msw to IX0 and parts.lsw to IX1.
   D is evaluated once.

   NOTE(review): ieee_long_double_shape_type is declared outside this
   excerpt, so the field widths are not visible here.  The i386
   __nexttowardf excerpt receives all three values in u_int32_t variables
   and masks EXP with 0x7fff, which suggests sign_exponent carries a sign
   bit plus a 15-bit exponent -- confirm against the type's declaration.  */

#define GET_LDOUBLE_WORDS(exp,ix0,ix1,d)            \
do {                                \
  ieee_long_double_shape_type ew_u;             \
  ew_u.value = (d);                     \
  (exp) = ew_u.parts.sign_exponent;             \
  (ix0) = ew_u.parts.msw;                   \
  (ix1) = ew_u.parts.lsw;                   \
} while (0)

The x86_64 definitions of GET_FLOAT_WORD and SET_FLOAT_WORD contain inline assembly. I will try a similar approach for aarch64.