mirror of
https://github.com/freebsd/freebsd-src.git
synced 2024-11-26 20:12:44 +00:00
Update the Arm Optimized Routine library to v24.01
Sponsored by: Arm Ltd
This commit is contained in:
commit
5a02ffc32e
@ -12,7 +12,7 @@ contribution requirements are documented in README.contributors of
|
||||
the appropriate subdirectory.
|
||||
|
||||
Regular quarterly releases are tagged as vYY.MM, the latest
|
||||
release is v23.01.
|
||||
release is v24.01.
|
||||
|
||||
Source code layout:
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
# Example config.mk
|
||||
#
|
||||
# Copyright (c) 2018-2022, Arm Limited.
|
||||
# Copyright (c) 2018-2023, Arm Limited.
|
||||
# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
|
||||
# Subprojects to build
|
||||
@ -59,13 +59,14 @@ math-cflags += -ffp-contract=fast -fno-math-errno
|
||||
# Use with clang.
|
||||
#math-cflags += -ffp-contract=fast
|
||||
|
||||
# Disable vector math code
|
||||
#math-cflags += -DWANT_VMATH=0
|
||||
|
||||
# Disable/enable SVE vector math code and tests
|
||||
# Disable/enable SVE vector math code and tests.
|
||||
# If WANT_SVE_MATH is enabled, math-sve-cflags is added for SVE
|
||||
# routines only so that SVE code does not leak into scalar
|
||||
# routines. It is also necessary to add it for tools (e.g. ulp,
|
||||
# mathbench)
|
||||
WANT_SVE_MATH = 0
|
||||
ifeq ($(WANT_SVE_MATH), 1)
|
||||
math-cflags += -march=armv8.2-a+sve
|
||||
math-sve-cflags = -march=armv8-a+sve
|
||||
endif
|
||||
math-cflags += -DWANT_SVE_MATH=$(WANT_SVE_MATH)
|
||||
|
||||
|
@ -1,12 +1,14 @@
|
||||
# Makefile fragment - requires GNU make
|
||||
#
|
||||
# Copyright (c) 2019-2022, Arm Limited.
|
||||
# Copyright (c) 2019-2023, Arm Limited.
|
||||
# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
|
||||
S := $(srcdir)/math
|
||||
B := build/math
|
||||
|
||||
math-lib-srcs := $(wildcard $(S)/*.[cS])
|
||||
math-lib-srcs += $(wildcard $(S)/$(ARCH)/*.[cS])
|
||||
|
||||
math-test-srcs := \
|
||||
$(S)/test/mathtest.c \
|
||||
$(S)/test/mathbench.c \
|
||||
@ -65,6 +67,8 @@ build/lib/libmathlib.a: $(math-lib-objs)
|
||||
|
||||
$(math-host-tools): HOST_LDLIBS += -lm -lmpfr -lmpc
|
||||
$(math-tools): LDLIBS += $(math-ldlibs) -lm
|
||||
# math-sve-cflags should be empty if WANT_SVE_MATH is not enabled
|
||||
$(math-tools): CFLAGS_ALL += $(math-sve-cflags)
|
||||
|
||||
build/bin/rtest: $(math-host-objs)
|
||||
$(HOST_CC) $(HOST_CFLAGS) $(HOST_LDFLAGS) -o $@ $^ $(HOST_LDLIBS)
|
||||
|
87
contrib/arm-optimized-routines/math/aarch64/v_cos.c
Normal file
87
contrib/arm-optimized-routines/math/aarch64/v_cos.c
Normal file
@ -0,0 +1,87 @@
|
||||
/*
|
||||
* Double-precision vector cos function.
|
||||
*
|
||||
* Copyright (c) 2019-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
float64x2_t poly[7];
|
||||
float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3;
|
||||
} data = {
|
||||
/* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. */
|
||||
.poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
|
||||
V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
|
||||
V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
|
||||
V2 (-0x1.9e9540300a1p-41) },
|
||||
.inv_pi = V2 (0x1.45f306dc9c883p-2),
|
||||
.half_pi = V2 (0x1.921fb54442d18p+0),
|
||||
.pi_1 = V2 (0x1.921fb54442d18p+1),
|
||||
.pi_2 = V2 (0x1.1a62633145c06p-53),
|
||||
.pi_3 = V2 (0x1.c1cd129024e09p-106),
|
||||
.shift = V2 (0x1.8p52),
|
||||
.range_val = V2 (0x1p23)
|
||||
};
|
||||
|
||||
#define C(i) d->poly[i]
|
||||
|
||||
static float64x2_t VPCS_ATTR NOINLINE
|
||||
special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
|
||||
{
|
||||
y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
|
||||
return v_call_f64 (cos, x, y, cmp);
|
||||
}
|
||||
|
||||
float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
float64x2_t n, r, r2, r3, r4, t1, t2, t3, y;
|
||||
uint64x2_t odd, cmp;
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
r = vabsq_f64 (x);
|
||||
cmp = vcgeq_u64 (vreinterpretq_u64_f64 (r),
|
||||
vreinterpretq_u64_f64 (d->range_val));
|
||||
if (unlikely (v_any_u64 (cmp)))
|
||||
/* If fenv exceptions are to be triggered correctly, set any special lanes
|
||||
to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
|
||||
special-case handler later. */
|
||||
r = vbslq_f64 (cmp, v_f64 (1.0), r);
|
||||
#else
|
||||
cmp = vcageq_f64 (x, d->range_val);
|
||||
r = x;
|
||||
#endif
|
||||
|
||||
/* n = rint((|x|+pi/2)/pi) - 0.5. */
|
||||
n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi));
|
||||
odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
|
||||
n = vsubq_f64 (n, d->shift);
|
||||
n = vsubq_f64 (n, v_f64 (0.5));
|
||||
|
||||
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
|
||||
r = vfmsq_f64 (r, d->pi_1, n);
|
||||
r = vfmsq_f64 (r, d->pi_2, n);
|
||||
r = vfmsq_f64 (r, d->pi_3, n);
|
||||
|
||||
/* sin(r) poly approx. */
|
||||
r2 = vmulq_f64 (r, r);
|
||||
r3 = vmulq_f64 (r2, r);
|
||||
r4 = vmulq_f64 (r2, r2);
|
||||
|
||||
t1 = vfmaq_f64 (C (4), C (5), r2);
|
||||
t2 = vfmaq_f64 (C (2), C (3), r2);
|
||||
t3 = vfmaq_f64 (C (0), C (1), r2);
|
||||
|
||||
y = vfmaq_f64 (t1, C (6), r4);
|
||||
y = vfmaq_f64 (t2, y, r4);
|
||||
y = vfmaq_f64 (t3, y, r4);
|
||||
y = vfmaq_f64 (r, y, r3);
|
||||
|
||||
if (unlikely (v_any_u64 (cmp)))
|
||||
return special_case (x, y, odd, cmp);
|
||||
return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
|
||||
}
|
82
contrib/arm-optimized-routines/math/aarch64/v_cosf.c
Normal file
82
contrib/arm-optimized-routines/math/aarch64/v_cosf.c
Normal file
@ -0,0 +1,82 @@
|
||||
/*
|
||||
* Single-precision vector cos function.
|
||||
*
|
||||
* Copyright (c) 2019-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
float32x4_t poly[4];
|
||||
float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3;
|
||||
} data = {
|
||||
/* 1.886 ulp error. */
|
||||
.poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
|
||||
V4 (0x1.5b2e76p-19f) },
|
||||
|
||||
.pi_1 = V4 (0x1.921fb6p+1f),
|
||||
.pi_2 = V4 (-0x1.777a5cp-24f),
|
||||
.pi_3 = V4 (-0x1.ee59dap-49f),
|
||||
|
||||
.inv_pi = V4 (0x1.45f306p-2f),
|
||||
.shift = V4 (0x1.8p+23f),
|
||||
.half_pi = V4 (0x1.921fb6p0f),
|
||||
.range_val = V4 (0x1p20f)
|
||||
};
|
||||
|
||||
#define C(i) d->poly[i]
|
||||
|
||||
static float32x4_t VPCS_ATTR NOINLINE
|
||||
special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
|
||||
{
|
||||
/* Fall back to scalar code. */
|
||||
y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
|
||||
return v_call_f32 (cosf, x, y, cmp);
|
||||
}
|
||||
|
||||
float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
float32x4_t n, r, r2, r3, y;
|
||||
uint32x4_t odd, cmp;
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
r = vabsq_f32 (x);
|
||||
cmp = vcgeq_u32 (vreinterpretq_u32_f32 (r),
|
||||
vreinterpretq_u32_f32 (d->range_val));
|
||||
if (unlikely (v_any_u32 (cmp)))
|
||||
/* If fenv exceptions are to be triggered correctly, set any special lanes
|
||||
to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
|
||||
special-case handler later. */
|
||||
r = vbslq_f32 (cmp, v_f32 (1.0f), r);
|
||||
#else
|
||||
cmp = vcageq_f32 (x, d->range_val);
|
||||
r = x;
|
||||
#endif
|
||||
|
||||
/* n = rint((|x|+pi/2)/pi) - 0.5. */
|
||||
n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi));
|
||||
odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
|
||||
n = vsubq_f32 (n, d->shift);
|
||||
n = vsubq_f32 (n, v_f32 (0.5f));
|
||||
|
||||
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
|
||||
r = vfmsq_f32 (r, d->pi_1, n);
|
||||
r = vfmsq_f32 (r, d->pi_2, n);
|
||||
r = vfmsq_f32 (r, d->pi_3, n);
|
||||
|
||||
/* y = sin(r). */
|
||||
r2 = vmulq_f32 (r, r);
|
||||
r3 = vmulq_f32 (r2, r);
|
||||
y = vfmaq_f32 (C (2), C (3), r2);
|
||||
y = vfmaq_f32 (C (1), y, r2);
|
||||
y = vfmaq_f32 (C (0), y, r2);
|
||||
y = vfmaq_f32 (r, y, r3);
|
||||
|
||||
if (unlikely (v_any_u32 (cmp)))
|
||||
return special_case (x, y, odd, cmp);
|
||||
return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
|
||||
}
|
125
contrib/arm-optimized-routines/math/aarch64/v_exp.c
Normal file
125
contrib/arm-optimized-routines/math/aarch64/v_exp.c
Normal file
@ -0,0 +1,125 @@
|
||||
/*
|
||||
* Double-precision vector e^x function.
|
||||
*
|
||||
* Copyright (c) 2019-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
|
||||
#define N (1 << V_EXP_TABLE_BITS)
|
||||
#define IndexMask (N - 1)
|
||||
|
||||
const static volatile struct
|
||||
{
|
||||
float64x2_t poly[3];
|
||||
float64x2_t inv_ln2, ln2_hi, ln2_lo, shift;
|
||||
#if !WANT_SIMD_EXCEPT
|
||||
float64x2_t special_bound, scale_thresh;
|
||||
#endif
|
||||
} data = {
|
||||
/* maxerr: 1.88 +0.5 ulp
|
||||
rel error: 1.4337*2^-53
|
||||
abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */
|
||||
.poly = { V2 (0x1.ffffffffffd43p-2), V2 (0x1.55555c75adbb2p-3),
|
||||
V2 (0x1.55555da646206p-5) },
|
||||
#if !WANT_SIMD_EXCEPT
|
||||
.scale_thresh = V2 (163840.0), /* 1280.0 * N. */
|
||||
.special_bound = V2 (704.0),
|
||||
#endif
|
||||
.inv_ln2 = V2 (0x1.71547652b82fep7), /* N/ln2. */
|
||||
.ln2_hi = V2 (0x1.62e42fefa39efp-8), /* ln2/N. */
|
||||
.ln2_lo = V2 (0x1.abc9e3b39803f3p-63),
|
||||
.shift = V2 (0x1.8p+52)
|
||||
};
|
||||
|
||||
#define C(i) data.poly[i]
|
||||
#define Tab __v_exp_data
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
|
||||
# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511). */
|
||||
# define BigBound v_u64 (0x4080000000000000) /* asuint64 (0x1p9). */
|
||||
# define SpecialBound v_u64 (0x2080000000000000) /* BigBound - TinyBound. */
|
||||
|
||||
static float64x2_t VPCS_ATTR NOINLINE
|
||||
special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
|
||||
{
|
||||
/* If fenv exceptions are to be triggered correctly, fall back to the scalar
|
||||
routine to special lanes. */
|
||||
return v_call_f64 (exp, x, y, cmp);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
# define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */
|
||||
/* SpecialBias1 + SpecialBias1 = asuint(1.0). */
|
||||
# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
|
||||
# define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */
|
||||
|
||||
static inline float64x2_t VPCS_ATTR
|
||||
special_case (float64x2_t s, float64x2_t y, float64x2_t n)
|
||||
{
|
||||
/* 2^(n/N) may overflow, break it up into s1*s2. */
|
||||
uint64x2_t b = vandq_u64 (vcltzq_f64 (n), SpecialOffset);
|
||||
float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b));
|
||||
float64x2_t s2 = vreinterpretq_f64_u64 (
|
||||
vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b));
|
||||
uint64x2_t cmp = vcagtq_f64 (n, data.scale_thresh);
|
||||
float64x2_t r1 = vmulq_f64 (s1, s1);
|
||||
float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1);
|
||||
return vbslq_f64 (cmp, r1, r0);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
float64x2_t VPCS_ATTR V_NAME_D1 (exp) (float64x2_t x)
|
||||
{
|
||||
float64x2_t n, r, r2, s, y, z;
|
||||
uint64x2_t cmp, u, e;
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
/* If any lanes are special, mask them with 1 and retain a copy of x to allow
|
||||
special_case to fix special lanes later. This is only necessary if fenv
|
||||
exceptions are to be triggered correctly. */
|
||||
float64x2_t xm = x;
|
||||
uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
|
||||
cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), SpecialBound);
|
||||
if (unlikely (v_any_u64 (cmp)))
|
||||
x = vbslq_f64 (cmp, v_f64 (1), x);
|
||||
#else
|
||||
cmp = vcagtq_f64 (x, data.special_bound);
|
||||
#endif
|
||||
|
||||
/* n = round(x/(ln2/N)). */
|
||||
z = vfmaq_f64 (data.shift, x, data.inv_ln2);
|
||||
u = vreinterpretq_u64_f64 (z);
|
||||
n = vsubq_f64 (z, data.shift);
|
||||
|
||||
/* r = x - n*ln2/N. */
|
||||
r = x;
|
||||
r = vfmsq_f64 (r, data.ln2_hi, n);
|
||||
r = vfmsq_f64 (r, data.ln2_lo, n);
|
||||
|
||||
e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS);
|
||||
|
||||
/* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4. */
|
||||
r2 = vmulq_f64 (r, r);
|
||||
y = vfmaq_f64 (C (0), C (1), r);
|
||||
y = vfmaq_f64 (y, C (2), r2);
|
||||
y = vfmaq_f64 (r, y, r2);
|
||||
|
||||
/* s = 2^(n/N). */
|
||||
u = (uint64x2_t){ Tab[u[0] & IndexMask], Tab[u[1] & IndexMask] };
|
||||
s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
|
||||
|
||||
if (unlikely (v_any_u64 (cmp)))
|
||||
#if WANT_SIMD_EXCEPT
|
||||
return special_case (xm, vfmaq_f64 (s, y, s), cmp);
|
||||
#else
|
||||
return special_case (s, y, n);
|
||||
#endif
|
||||
|
||||
return vfmaq_f64 (s, y, s);
|
||||
}
|
113
contrib/arm-optimized-routines/math/aarch64/v_exp2f.c
Normal file
113
contrib/arm-optimized-routines/math/aarch64/v_exp2f.c
Normal file
@ -0,0 +1,113 @@
|
||||
/*
|
||||
* Single-precision vector 2^x function.
|
||||
*
|
||||
* Copyright (c) 2019-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
float32x4_t poly[5];
|
||||
uint32x4_t exponent_bias;
|
||||
#if !WANT_SIMD_EXCEPT
|
||||
float32x4_t special_bound, scale_thresh;
|
||||
#endif
|
||||
} data = {
|
||||
/* maxerr: 1.962 ulp. */
|
||||
.poly = { V4 (0x1.59977ap-10f), V4 (0x1.3ce9e4p-7f), V4 (0x1.c6bd32p-5f),
|
||||
V4 (0x1.ebf9bcp-3f), V4 (0x1.62e422p-1f) },
|
||||
.exponent_bias = V4 (0x3f800000),
|
||||
#if !WANT_SIMD_EXCEPT
|
||||
.special_bound = V4 (126.0f),
|
||||
.scale_thresh = V4 (192.0f),
|
||||
#endif
|
||||
};
|
||||
|
||||
#define C(i) d->poly[i]
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
|
||||
# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
|
||||
# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */
|
||||
# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */
|
||||
|
||||
static float32x4_t VPCS_ATTR NOINLINE
|
||||
special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
|
||||
{
|
||||
/* If fenv exceptions are to be triggered correctly, fall back to the scalar
|
||||
routine for special lanes. */
|
||||
return v_call_f32 (exp2f, x, y, cmp);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
# define SpecialOffset v_u32 (0x82000000)
|
||||
# define SpecialBias v_u32 (0x7f000000)
|
||||
|
||||
static float32x4_t VPCS_ATTR NOINLINE
|
||||
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
|
||||
float32x4_t scale, const struct data *d)
|
||||
{
|
||||
/* 2^n may overflow, break it up into s1*s2. */
|
||||
uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
|
||||
float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
|
||||
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
|
||||
uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
|
||||
float32x4_t r2 = vmulq_f32 (s1, s1);
|
||||
float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
|
||||
/* Similar to r1 but avoids double rounding in the subnormal range. */
|
||||
float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
|
||||
float32x4_t r = vbslq_f32 (cmp1, r1, r0);
|
||||
return vbslq_f32 (cmp2, r2, r);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
float32x4_t n, r, r2, scale, p, q, poly;
|
||||
uint32x4_t cmp, e;
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
/* asuint(|x|) - TinyBound >= BigBound - TinyBound. */
|
||||
uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
|
||||
cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound);
|
||||
float32x4_t xm = x;
|
||||
/* If any lanes are special, mask them with 1 and retain a copy of x to allow
|
||||
special_case to fix special lanes later. This is only necessary if fenv
|
||||
exceptions are to be triggered correctly. */
|
||||
if (unlikely (v_any_u32 (cmp)))
|
||||
x = vbslq_f32 (cmp, v_f32 (1), x);
|
||||
#endif
|
||||
|
||||
/* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
|
||||
x = n + r, with r in [-1/2, 1/2]. */
|
||||
n = vrndaq_f32 (x);
|
||||
r = vsubq_f32 (x, n);
|
||||
e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
|
||||
scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
|
||||
|
||||
#if !WANT_SIMD_EXCEPT
|
||||
cmp = vcagtq_f32 (n, d->special_bound);
|
||||
#endif
|
||||
|
||||
r2 = vmulq_f32 (r, r);
|
||||
p = vfmaq_f32 (C (1), C (0), r);
|
||||
q = vfmaq_f32 (C (3), C (2), r);
|
||||
q = vfmaq_f32 (q, p, r2);
|
||||
p = vmulq_f32 (C (4), r);
|
||||
poly = vfmaq_f32 (p, q, r2);
|
||||
|
||||
if (unlikely (v_any_u32 (cmp)))
|
||||
#if WANT_SIMD_EXCEPT
|
||||
return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp);
|
||||
#else
|
||||
return special_case (poly, n, e, cmp, scale, d);
|
||||
#endif
|
||||
|
||||
return vfmaq_f32 (scale, poly, scale);
|
||||
}
|
72
contrib/arm-optimized-routines/math/aarch64/v_exp2f_1u.c
Normal file
72
contrib/arm-optimized-routines/math/aarch64/v_exp2f_1u.c
Normal file
@ -0,0 +1,72 @@
|
||||
/*
|
||||
* Single-precision vector 2^x function.
|
||||
*
|
||||
* Copyright (c) 2019-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
|
||||
static const float Poly[] = {
|
||||
/* maxerr: 0.878 ulp. */
|
||||
0x1.416b5ep-13f, 0x1.5f082ep-10f, 0x1.3b2dep-7f, 0x1.c6af7cp-5f, 0x1.ebfbdcp-3f, 0x1.62e43p-1f
|
||||
};
|
||||
#define C0 v_f32 (Poly[0])
|
||||
#define C1 v_f32 (Poly[1])
|
||||
#define C2 v_f32 (Poly[2])
|
||||
#define C3 v_f32 (Poly[3])
|
||||
#define C4 v_f32 (Poly[4])
|
||||
#define C5 v_f32 (Poly[5])
|
||||
|
||||
#define Shift v_f32 (0x1.8p23f)
|
||||
#define InvLn2 v_f32 (0x1.715476p+0f)
|
||||
#define Ln2hi v_f32 (0x1.62e4p-1f)
|
||||
#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
|
||||
|
||||
static float32x4_t VPCS_ATTR NOINLINE
|
||||
specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn)
|
||||
{
|
||||
/* 2^n may overflow, break it up into s1*s2. */
|
||||
uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000);
|
||||
float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b);
|
||||
float32x4_t s2 = vreinterpretq_f32_u32 (e - b);
|
||||
uint32x4_t cmp = absn > v_f32 (192.0f);
|
||||
float32x4_t r1 = s1 * s1;
|
||||
float32x4_t r0 = poly * s1 * s2;
|
||||
return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1))
|
||||
| (~cmp & vreinterpretq_u32_f32 (r0)));
|
||||
}
|
||||
|
||||
float32x4_t VPCS_ATTR
|
||||
_ZGVnN4v_exp2f_1u (float32x4_t x)
|
||||
{
|
||||
float32x4_t n, r, scale, poly, absn;
|
||||
uint32x4_t cmp, e;
|
||||
|
||||
/* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
|
||||
x = n + r, with r in [-1/2, 1/2]. */
|
||||
#if 0
|
||||
float32x4_t z;
|
||||
z = x + Shift;
|
||||
n = z - Shift;
|
||||
r = x - n;
|
||||
e = vreinterpretq_u32_f32 (z) << 23;
|
||||
#else
|
||||
n = vrndaq_f32 (x);
|
||||
r = x - n;
|
||||
e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)) << 23;
|
||||
#endif
|
||||
scale = vreinterpretq_f32_u32 (e + v_u32 (0x3f800000));
|
||||
absn = vabsq_f32 (n);
|
||||
cmp = absn > v_f32 (126.0f);
|
||||
poly = vfmaq_f32 (C1, C0, r);
|
||||
poly = vfmaq_f32 (C2, poly, r);
|
||||
poly = vfmaq_f32 (C3, poly, r);
|
||||
poly = vfmaq_f32 (C4, poly, r);
|
||||
poly = vfmaq_f32 (C5, poly, r);
|
||||
poly = vfmaq_f32 (v_f32 (1.0f), poly, r);
|
||||
if (unlikely (v_any_u32 (cmp)))
|
||||
return specialcase (poly, n, e, absn);
|
||||
return scale * poly;
|
||||
}
|
146
contrib/arm-optimized-routines/math/aarch64/v_exp_data.c
Normal file
146
contrib/arm-optimized-routines/math/aarch64/v_exp_data.c
Normal file
@ -0,0 +1,146 @@
|
||||
/*
|
||||
* Lookup table for double-precision e^x vector function.
|
||||
*
|
||||
* Copyright (c) 2019-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
|
||||
# define N (1 << V_EXP_TABLE_BITS)
|
||||
|
||||
/* 2^(j/N), j=0..N. */
|
||||
const uint64_t __v_exp_data[] = {
|
||||
# if N == 128
|
||||
0x3ff0000000000000, 0x3feff63da9fb3335, 0x3fefec9a3e778061,
|
||||
0x3fefe315e86e7f85, 0x3fefd9b0d3158574, 0x3fefd06b29ddf6de,
|
||||
0x3fefc74518759bc8, 0x3fefbe3ecac6f383, 0x3fefb5586cf9890f,
|
||||
0x3fefac922b7247f7, 0x3fefa3ec32d3d1a2, 0x3fef9b66affed31b,
|
||||
0x3fef9301d0125b51, 0x3fef8abdc06c31cc, 0x3fef829aaea92de0,
|
||||
0x3fef7a98c8a58e51, 0x3fef72b83c7d517b, 0x3fef6af9388c8dea,
|
||||
0x3fef635beb6fcb75, 0x3fef5be084045cd4, 0x3fef54873168b9aa,
|
||||
0x3fef4d5022fcd91d, 0x3fef463b88628cd6, 0x3fef3f49917ddc96,
|
||||
0x3fef387a6e756238, 0x3fef31ce4fb2a63f, 0x3fef2b4565e27cdd,
|
||||
0x3fef24dfe1f56381, 0x3fef1e9df51fdee1, 0x3fef187fd0dad990,
|
||||
0x3fef1285a6e4030b, 0x3fef0cafa93e2f56, 0x3fef06fe0a31b715,
|
||||
0x3fef0170fc4cd831, 0x3feefc08b26416ff, 0x3feef6c55f929ff1,
|
||||
0x3feef1a7373aa9cb, 0x3feeecae6d05d866, 0x3feee7db34e59ff7,
|
||||
0x3feee32dc313a8e5, 0x3feedea64c123422, 0x3feeda4504ac801c,
|
||||
0x3feed60a21f72e2a, 0x3feed1f5d950a897, 0x3feece086061892d,
|
||||
0x3feeca41ed1d0057, 0x3feec6a2b5c13cd0, 0x3feec32af0d7d3de,
|
||||
0x3feebfdad5362a27, 0x3feebcb299fddd0d, 0x3feeb9b2769d2ca7,
|
||||
0x3feeb6daa2cf6642, 0x3feeb42b569d4f82, 0x3feeb1a4ca5d920f,
|
||||
0x3feeaf4736b527da, 0x3feead12d497c7fd, 0x3feeab07dd485429,
|
||||
0x3feea9268a5946b7, 0x3feea76f15ad2148, 0x3feea5e1b976dc09,
|
||||
0x3feea47eb03a5585, 0x3feea34634ccc320, 0x3feea23882552225,
|
||||
0x3feea155d44ca973, 0x3feea09e667f3bcd, 0x3feea012750bdabf,
|
||||
0x3fee9fb23c651a2f, 0x3fee9f7df9519484, 0x3fee9f75e8ec5f74,
|
||||
0x3fee9f9a48a58174, 0x3fee9feb564267c9, 0x3feea0694fde5d3f,
|
||||
0x3feea11473eb0187, 0x3feea1ed0130c132, 0x3feea2f336cf4e62,
|
||||
0x3feea427543e1a12, 0x3feea589994cce13, 0x3feea71a4623c7ad,
|
||||
0x3feea8d99b4492ed, 0x3feeaac7d98a6699, 0x3feeace5422aa0db,
|
||||
0x3feeaf3216b5448c, 0x3feeb1ae99157736, 0x3feeb45b0b91ffc6,
|
||||
0x3feeb737b0cdc5e5, 0x3feeba44cbc8520f, 0x3feebd829fde4e50,
|
||||
0x3feec0f170ca07ba, 0x3feec49182a3f090, 0x3feec86319e32323,
|
||||
0x3feecc667b5de565, 0x3feed09bec4a2d33, 0x3feed503b23e255d,
|
||||
0x3feed99e1330b358, 0x3feede6b5579fdbf, 0x3feee36bbfd3f37a,
|
||||
0x3feee89f995ad3ad, 0x3feeee07298db666, 0x3feef3a2b84f15fb,
|
||||
0x3feef9728de5593a, 0x3feeff76f2fb5e47, 0x3fef05b030a1064a,
|
||||
0x3fef0c1e904bc1d2, 0x3fef12c25bd71e09, 0x3fef199bdd85529c,
|
||||
0x3fef20ab5fffd07a, 0x3fef27f12e57d14b, 0x3fef2f6d9406e7b5,
|
||||
0x3fef3720dcef9069, 0x3fef3f0b555dc3fa, 0x3fef472d4a07897c,
|
||||
0x3fef4f87080d89f2, 0x3fef5818dcfba487, 0x3fef60e316c98398,
|
||||
0x3fef69e603db3285, 0x3fef7321f301b460, 0x3fef7c97337b9b5f,
|
||||
0x3fef864614f5a129, 0x3fef902ee78b3ff6, 0x3fef9a51fbc74c83,
|
||||
0x3fefa4afa2a490da, 0x3fefaf482d8e67f1, 0x3fefba1bee615a27,
|
||||
0x3fefc52b376bba97, 0x3fefd0765b6e4540, 0x3fefdbfdad9cbe14,
|
||||
0x3fefe7c1819e90d8, 0x3feff3c22b8f71f1,
|
||||
# elif N == 256
|
||||
0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335,
|
||||
0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc,
|
||||
0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574,
|
||||
0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836,
|
||||
0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383,
|
||||
0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85,
|
||||
0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2,
|
||||
0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e,
|
||||
0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc,
|
||||
0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e,
|
||||
0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b,
|
||||
0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f,
|
||||
0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4,
|
||||
0x3fef582f95281c6b, 0x3fef54873168b9aa, 0x3fef50e75eb44027,
|
||||
0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6,
|
||||
0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1,
|
||||
0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f,
|
||||
0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29,
|
||||
0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1,
|
||||
0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f,
|
||||
0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56,
|
||||
0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd,
|
||||
0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff,
|
||||
0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b,
|
||||
0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866,
|
||||
0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4,
|
||||
0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422,
|
||||
0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024,
|
||||
0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897,
|
||||
0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232,
|
||||
0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0,
|
||||
0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7,
|
||||
0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d,
|
||||
0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee,
|
||||
0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82,
|
||||
0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2,
|
||||
0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd,
|
||||
0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03,
|
||||
0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148,
|
||||
0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4,
|
||||
0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320,
|
||||
0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6,
|
||||
0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd,
|
||||
0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645,
|
||||
0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484,
|
||||
0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a,
|
||||
0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9,
|
||||
0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6,
|
||||
0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132,
|
||||
0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491,
|
||||
0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13,
|
||||
0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21,
|
||||
0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699,
|
||||
0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778,
|
||||
0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736,
|
||||
0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2,
|
||||
0x3feeb737b0cdc5e5, 0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f,
|
||||
0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2,
|
||||
0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090,
|
||||
0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e,
|
||||
0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33,
|
||||
0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052,
|
||||
0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf,
|
||||
0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774,
|
||||
0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666,
|
||||
0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1,
|
||||
0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47,
|
||||
0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f,
|
||||
0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09,
|
||||
0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c,
|
||||
0x3fef20ab5fffd07a, 0x3fef244778fafb22, 0x3fef27f12e57d14b,
|
||||
0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db,
|
||||
0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa,
|
||||
0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968,
|
||||
0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487,
|
||||
0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075,
|
||||
0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460,
|
||||
0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17,
|
||||
0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6,
|
||||
0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740,
|
||||
0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1,
|
||||
0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a,
|
||||
0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540,
|
||||
0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89,
|
||||
0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1,
|
||||
0x3feff9d96b2a23d9,
|
||||
# endif
|
||||
};
|
122
contrib/arm-optimized-routines/math/aarch64/v_expf.c
Normal file
122
contrib/arm-optimized-routines/math/aarch64/v_expf.c
Normal file
@ -0,0 +1,122 @@
|
||||
/*
|
||||
* Single-precision vector e^x function.
|
||||
*
|
||||
* Copyright (c) 2019-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
float32x4_t poly[5];
|
||||
float32x4_t shift, inv_ln2, ln2_hi, ln2_lo;
|
||||
uint32x4_t exponent_bias;
|
||||
#if !WANT_SIMD_EXCEPT
|
||||
float32x4_t special_bound, scale_thresh;
|
||||
#endif
|
||||
} data = {
|
||||
/* maxerr: 1.45358 +0.5 ulp. */
|
||||
.poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f),
|
||||
V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) },
|
||||
.shift = V4 (0x1.8p23f),
|
||||
.inv_ln2 = V4 (0x1.715476p+0f),
|
||||
.ln2_hi = V4 (0x1.62e4p-1f),
|
||||
.ln2_lo = V4 (0x1.7f7d1cp-20f),
|
||||
.exponent_bias = V4 (0x3f800000),
|
||||
#if !WANT_SIMD_EXCEPT
|
||||
.special_bound = V4 (126.0f),
|
||||
.scale_thresh = V4 (192.0f),
|
||||
#endif
|
||||
};
|
||||
|
||||
#define C(i) d->poly[i]
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
|
||||
# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
|
||||
# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */
|
||||
# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */
|
||||
|
||||
static float32x4_t VPCS_ATTR NOINLINE
|
||||
special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
|
||||
{
|
||||
/* If fenv exceptions are to be triggered correctly, fall back to the scalar
|
||||
routine to special lanes. */
|
||||
return v_call_f32 (expf, x, y, cmp);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
# define SpecialOffset v_u32 (0x82000000)
|
||||
# define SpecialBias v_u32 (0x7f000000)
|
||||
|
||||
static float32x4_t VPCS_ATTR NOINLINE
|
||||
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
|
||||
float32x4_t scale, const struct data *d)
|
||||
{
|
||||
/* 2^n may overflow, break it up into s1*s2. */
|
||||
uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
|
||||
float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
|
||||
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
|
||||
uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
|
||||
float32x4_t r2 = vmulq_f32 (s1, s1);
|
||||
float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
|
||||
/* Similar to r1 but avoids double rounding in the subnormal range. */
|
||||
float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
|
||||
float32x4_t r = vbslq_f32 (cmp1, r1, r0);
|
||||
return vbslq_f32 (cmp2, r2, r);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/* Single-precision vector (AdvSIMD) e^x: reduce x = n*ln2 + r, evaluate a
   polynomial in r and scale the result by 2^n.  Special lanes are deferred
   to special_case ().  */
float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x)
{
  const struct data *d = ptr_barrier (&data);
  float32x4_t n, r, r2, scale, p, q, poly, z;
  uint32x4_t cmp, e;

#if WANT_SIMD_EXCEPT
  /* asuint(x) - TinyBound >= BigBound - TinyBound. */
  cmp = vcgeq_u32 (
      vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)),
		 TinyBound),
      SpecialBound);
  float32x4_t xm = x;
  /* If any lanes are special, mask them with 1 and retain a copy of x to allow
     special case handler to fix special lanes later. This is only necessary if
     fenv exceptions are to be triggered correctly. */
  if (unlikely (v_any_u32 (cmp)))
    x = vbslq_f32 (cmp, v_f32 (1), x);
#endif

  /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
     x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
  /* Adding the large shift constant rounds x/ln2 to an integer in the low
     bits of z.  */
  z = vfmaq_f32 (d->shift, x, d->inv_ln2);
  n = vsubq_f32 (z, d->shift);
  /* r = x - n*ln2, with ln2 split into hi/lo parts for accuracy.  */
  r = vfmsq_f32 (x, n, d->ln2_hi);
  r = vfmsq_f32 (r, n, d->ln2_lo);
  e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
  scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));

#if !WANT_SIMD_EXCEPT
  cmp = vcagtq_f32 (n, d->special_bound);
#endif

  /* Pairwise (Estrin-style) polynomial evaluation in r.  */
  r2 = vmulq_f32 (r, r);
  p = vfmaq_f32 (C (1), C (0), r);
  q = vfmaq_f32 (C (3), C (2), r);
  q = vfmaq_f32 (q, p, r2);
  p = vmulq_f32 (C (4), r);
  poly = vfmaq_f32 (p, q, r2);

  if (unlikely (v_any_u32 (cmp)))
#if WANT_SIMD_EXCEPT
    return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp);
#else
    return special_case (poly, n, e, cmp, scale, d);
#endif

  return vfmaq_f32 (scale, poly, scale);
}
|
77
contrib/arm-optimized-routines/math/aarch64/v_expf_1u.c
Normal file
77
contrib/arm-optimized-routines/math/aarch64/v_expf_1u.c
Normal file
@ -0,0 +1,77 @@
|
||||
/*
|
||||
* Single-precision vector e^x function.
|
||||
*
|
||||
* Copyright (c) 2019-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
|
||||
static const float Poly[] = {
|
||||
/* maxerr: 0.36565 +0.5 ulp. */
|
||||
0x1.6a6000p-10f,
|
||||
0x1.12718ep-7f,
|
||||
0x1.555af0p-5f,
|
||||
0x1.555430p-3f,
|
||||
0x1.fffff4p-2f,
|
||||
};
|
||||
#define C0 v_f32 (Poly[0])
|
||||
#define C1 v_f32 (Poly[1])
|
||||
#define C2 v_f32 (Poly[2])
|
||||
#define C3 v_f32 (Poly[3])
|
||||
#define C4 v_f32 (Poly[4])
|
||||
|
||||
#define Shift v_f32 (0x1.8p23f)
|
||||
#define InvLn2 v_f32 (0x1.715476p+0f)
|
||||
#define Ln2hi v_f32 (0x1.62e4p-1f)
|
||||
#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
|
||||
|
||||
/* Handler for lanes where 2^n is not directly representable: split the scale
   into s1*s2 and select per lane between the split-scale result r0 and the
   saturating result r1 for very large |n|.  Uses GNU vector-extension
   operators on NEON types throughout.  */
static float32x4_t VPCS_ATTR NOINLINE
specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn)
{
  /* 2^n may overflow, break it up into s1*s2. */
  uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000);
  float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b);
  float32x4_t s2 = vreinterpretq_f32_u32 (e - b);
  uint32x4_t cmp = absn > v_f32 (192.0f);
  float32x4_t r1 = s1 * s1;
  float32x4_t r0 = poly * s1 * s2;
  /* Bitwise select: r1 where cmp is set, r0 elsewhere.  */
  return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1))
				| (~cmp & vreinterpretq_u32_f32 (r0)));
}
|
||||
|
||||
/* Single-precision vector e^x (tighter-error variant).  Reduction and Horner
   polynomial; the disabled `#else` branch keeps an equivalent
   vrndaq/vcvtaq-based reduction for reference.  */
float32x4_t VPCS_ATTR
_ZGVnN4v_expf_1u (float32x4_t x)
{
  float32x4_t n, r, scale, poly, absn, z;
  uint32x4_t cmp, e;

  /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
     x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
#if 1
  /* Shift-based rounding: adding 0x1.8p23 forces round-to-nearest-int into
     the low mantissa bits of z.  */
  z = vfmaq_f32 (Shift, x, InvLn2);
  n = z - Shift;
  r = vfmaq_f32 (x, n, -Ln2hi);
  r = vfmaq_f32 (r, n, -Ln2lo);
  e = vreinterpretq_u32_f32 (z) << 23;
#else
  z = x * InvLn2;
  n = vrndaq_f32 (z);
  r = vfmaq_f32 (x, n, -Ln2hi);
  r = vfmaq_f32 (r, n, -Ln2lo);
  e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (z)) << 23;
#endif
  scale = vreinterpretq_f32_u32 (e + v_u32 (0x3f800000));
  absn = vabsq_f32 (n);
  cmp = absn > v_f32 (126.0f);
  /* Horner evaluation of the degree-6 polynomial in r.  */
  poly = vfmaq_f32 (C1, C0, r);
  poly = vfmaq_f32 (C2, poly, r);
  poly = vfmaq_f32 (C3, poly, r);
  poly = vfmaq_f32 (C4, poly, r);
  poly = vfmaq_f32 (v_f32 (1.0f), poly, r);
  poly = vfmaq_f32 (v_f32 (1.0f), poly, r);
  if (unlikely (v_any_u32 (cmp)))
    return specialcase (poly, n, e, absn);
  return scale * poly;
}
|
100
contrib/arm-optimized-routines/math/aarch64/v_log.c
Normal file
100
contrib/arm-optimized-routines/math/aarch64/v_log.c
Normal file
@ -0,0 +1,100 @@
|
||||
/*
|
||||
* Double-precision vector log(x) function.
|
||||
*
|
||||
* Copyright (c) 2019-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
|
||||
/* Constants for the double-precision vector log below, gathered in one
   struct so a single pointer (see ptr_barrier) reaches them all.  */
static const struct data
{
  uint64x2_t min_norm;		/* Smallest normal double, as bits.  */
  uint32x4_t special_bound;
  float64x2_t poly[5];		/* log1p polynomial coefficients.  */
  float64x2_t ln2;
  uint64x2_t sign_exp_mask;	/* Sign + exponent bits.  */
} data = {
  /* Worst-case error: 1.17 + 0.5 ulp.
     Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */
  .poly = { V2 (-0x1.ffffffffffff7p-2), V2 (0x1.55555555170d4p-2),
	    V2 (-0x1.0000000399c27p-2), V2 (0x1.999b2e90e94cap-3),
	    V2 (-0x1.554e550bd501ep-3) },
  .ln2 = V2 (0x1.62e42fefa39efp-1),
  .min_norm = V2 (0x0010000000000000),
  .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */
  .sign_exp_mask = V2 (0xfff0000000000000)
};
|
||||
|
||||
#define A(i) d->poly[i]
|
||||
#define N (1 << V_LOG_TABLE_BITS)
|
||||
#define IndexMask (N - 1)
|
||||
#define Off v_u64 (0x3fe6900900000000)
|
||||
|
||||
/* One transposed pair of table rows: invc/logc for both vector lanes.  */
struct entry
{
  float64x2_t invc;
  float64x2_t logc;
};
|
||||
|
||||
/* Gather the (invc, logc) rows of __v_log_data.table for both lanes of I and
   transpose them into vector-of-invc / vector-of-logc form.  */
static inline struct entry
lookup (uint64x2_t i)
{
  /* N is a power of 2, so n % N == n & (N - 1).  */
  uint64_t idx0 = (i[0] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
  uint64_t idx1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
  float64x2_t row0 = vld1q_f64 (&__v_log_data.table[idx0].invc);
  float64x2_t row1 = vld1q_f64 (&__v_log_data.table[idx1].invc);
  struct entry e = { .invc = vuzp1q_f64 (row0, row1),
		     .logc = vuzp2q_f64 (row0, row1) };
  return e;
}
|
||||
|
||||
/* Scalar fallback: lanes flagged in CMP are recomputed with the scalar log;
   the others keep the already-combined vector result hi + y*r2.  */
static float64x2_t VPCS_ATTR NOINLINE
special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2,
	      uint32x2_t cmp)
{
  return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (cmp));
}
|
||||
|
||||
/* Double-precision vector (AdvSIMD) log: table-driven reduction followed by a
   degree-5 polynomial correction.  */
float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x)
{
  const struct data *d = ptr_barrier (&data);
  float64x2_t z, r, r2, p, y, kd, hi;
  uint64x2_t ix, iz, tmp;
  uint32x2_t cmp;
  int64x2_t k;
  struct entry e;

  ix = vreinterpretq_u64_f64 (x);
  /* Narrowing high-half compare: flags subnormal/zero/negative/inf/nan
     lanes in one unsigned range check.  */
  cmp = vcge_u32 (vsubhn_u64 (ix, d->min_norm),
		  vget_low_u32 (d->special_bound));

  /* x = 2^k z; where z is in range [Off,2*Off) and exact.
     The range is split into N subintervals.
     The ith subinterval contains z and c is near its center. */
  tmp = vsubq_u64 (ix, Off);
  k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */
  iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask));
  z = vreinterpretq_f64_u64 (iz);
  e = lookup (tmp);

  /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
  r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
  kd = vcvtq_f64_s64 (k);

  /* hi = r + log(c) + k*Ln2. */
  hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2);
  /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
  r2 = vmulq_f64 (r, r);
  y = vfmaq_f64 (A (2), A (3), r);
  p = vfmaq_f64 (A (0), A (1), r);
  y = vfmaq_f64 (y, A (4), r2);
  y = vfmaq_f64 (p, y, r2);

  if (unlikely (v_any_u32h (cmp)))
    return special_case (x, y, hi, r2, cmp);
  return vfmaq_f64 (hi, y, r2);
}
|
156
contrib/arm-optimized-routines/math/aarch64/v_log_data.c
Normal file
156
contrib/arm-optimized-routines/math/aarch64/v_log_data.c
Normal file
@ -0,0 +1,156 @@
|
||||
/*
|
||||
* Lookup table for double-precision log(x) vector function.
|
||||
*
|
||||
* Copyright (c) 2019-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
|
||||
#define N (1 << V_LOG_TABLE_BITS)
|
||||
|
||||
/* Shared lookup table for the double-precision vector log routines.
   N = 128 entries of { 1/c, log(c) }; indexed by the top mantissa bits of
   the reduced argument (see lookup () in v_log.c).  */
const struct v_log_data __v_log_data = {
  /* Algorithm:

	x = 2^k z
	log(x) = k ln2 + log(c) + poly(z/c - 1)

     where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,
     N=128) and log(c) and 1/c for the ith subinterval comes from lookup tables:

	table[i].invc = 1/c
	table[i].logc = (double)log(c)

     where c is near the center of the subinterval and is chosen by trying several
     floating point invc candidates around 1/center and selecting one for which
     the error in (double)log(c) is minimized (< 0x1p-74), except the subinterval
     that contains 1 and the previous one got tweaked to avoid cancellation. */
  .table = { { 0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2 },
	     { 0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2 },
	     { 0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2 },
	     { 0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2 },
	     { 0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2 },
	     { 0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2 },
	     { 0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2 },
	     { 0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2 },
	     { 0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2 },
	     { 0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2 },
	     { 0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2 },
	     { 0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2 },
	     { 0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2 },
	     { 0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2 },
	     { 0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2 },
	     { 0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2 },
	     { 0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2 },
	     { 0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2 },
	     { 0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2 },
	     { 0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3 },
	     { 0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3 },
	     { 0x1.446f12b278001p+0, -0x1.e52e160484698p-3 },
	     { 0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3 },
	     { 0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3 },
	     { 0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3 },
	     { 0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3 },
	     { 0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3 },
	     { 0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3 },
	     { 0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3 },
	     { 0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3 },
	     { 0x1.36987540fbf53p+0, -0x1.8be843d796044p-3 },
	     { 0x1.352166b648f61p+0, -0x1.82395ecc477edp-3 },
	     { 0x1.33adddb3eb575p+0, -0x1.7896240966422p-3 },
	     { 0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3 },
	     { 0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3 },
	     { 0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3 },
	     { 0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3 },
	     { 0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3 },
	     { 0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3 },
	     { 0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3 },
	     { 0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3 },
	     { 0x1.2734abcaa8467p+0, -0x1.23d47a721fd47p-3 },
	     { 0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3 },
	     { 0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3 },
	     { 0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3 },
	     { 0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4 },
	     { 0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4 },
	     { 0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4 },
	     { 0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4 },
	     { 0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4 },
	     { 0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4 },
	     { 0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4 },
	     { 0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4 },
	     { 0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4 },
	     { 0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4 },
	     { 0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4 },
	     { 0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4 },
	     { 0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4 },
	     { 0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4 },
	     { 0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4 },
	     { 0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5 },
	     { 0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5 },
	     { 0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5 },
	     { 0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5 },
	     { 0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5 },
	     { 0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5 },
	     { 0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5 },
	     { 0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5 },
	     { 0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6 },
	     { 0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6 },
	     { 0x1.05193497a7cc5p+0, -0x1.43183683400acp-6 },
	     { 0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6 },
	     { 0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7 },
	     { 0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7 },
	     { 0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9 },
	     /* Exact anchor entry for the subinterval containing 1.  */
	     { 1.0, 0.0 },
	     { 0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8 },
	     { 0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7 },
	     { 0x1.f44546ef18996p-1, 0x1.7bb4dd70a015bp-6 },
	     { 0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6 },
	     { 0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5 },
	     { 0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5 },
	     { 0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5 },
	     { 0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5 },
	     { 0x1.de5c72d8a8be3p-1, 0x1.165bd78d4878ep-4 },
	     { 0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4 },
	     { 0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4 },
	     { 0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4 },
	     { 0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4 },
	     { 0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4 },
	     { 0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4 },
	     { 0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4 },
	     { 0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4 },
	     { 0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3 },
	     { 0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3 },
	     { 0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3 },
	     { 0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3 },
	     { 0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3 },
	     { 0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3 },
	     { 0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3 },
	     { 0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3 },
	     { 0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3 },
	     { 0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3 },
	     { 0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3 },
	     { 0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3 },
	     { 0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3 },
	     { 0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3 },
	     { 0x1.9998e1480b618p-1, 0x1.c903161240163p-3 },
	     { 0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3 },
	     { 0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3 },
	     { 0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3 },
	     { 0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3 },
	     { 0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2 },
	     { 0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2 },
	     { 0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2 },
	     { 0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2 },
	     { 0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2 },
	     { 0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2 },
	     { 0x1.7f3fbe71cdb71p-1, 0x1.28975066318a2p-2 },
	     { 0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2 },
	     { 0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2 },
	     { 0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2 },
	     { 0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2 },
	     { 0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2 },
	     { 0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2 },
	     { 0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2 },
	     { 0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2 },
	     { 0x1.6c162fc9cbe02p-1, 0x1.5d1d758e45217p-2 } }
};
|
74
contrib/arm-optimized-routines/math/aarch64/v_logf.c
Normal file
74
contrib/arm-optimized-routines/math/aarch64/v_logf.c
Normal file
@ -0,0 +1,74 @@
|
||||
/*
|
||||
* Single-precision vector log function.
|
||||
*
|
||||
* Copyright (c) 2019-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
|
||||
/* Constants for the single-precision vector log below.  */
static const struct data
{
  uint32x4_t min_norm;		/* Smallest normal float, as bits.  */
  uint16x8_t special_bound;
  float32x4_t poly[7];		/* log(1+r) polynomial coefficients.  */
  float32x4_t ln2, tiny_bound;
  uint32x4_t off, mantissa_mask;
} data = {
  /* 3.34 ulp error. */
  .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f),
	    V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f),
	    V4 (-0x1.ffffc8p-2f) },
  .ln2 = V4 (0x1.62e43p-1f),
  .tiny_bound = V4 (0x1p-126),
  .min_norm = V4 (0x00800000),
  .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */
  .off = V4 (0x3f2aaaab), /* 0.666667. */
  .mantissa_mask = V4 (0x007fffff)
};
|
||||
|
||||
#define P(i) d->poly[7 - i]
|
||||
|
||||
/* Scalar fallback: lanes flagged in CMP are recomputed with the scalar logf;
   the others keep the already-combined vector result p + y*r2.  */
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t x, float32x4_t y, float32x4_t r2, float32x4_t p,
	      uint16x4_t cmp)
{
  /* Fall back to scalar code. */
  return v_call_f32 (logf, x, vfmaq_f32 (p, y, r2), vmovl_u16 (cmp));
}
|
||||
|
||||
/* Single-precision vector (AdvSIMD) log: exponent extraction plus a degree-7
   polynomial for log(1+r).  */
float32x4_t VPCS_ATTR V_NAME_F1 (log) (float32x4_t x)
{
  const struct data *d = ptr_barrier (&data);
  float32x4_t n, p, q, r, r2, y;
  uint32x4_t u;
  uint16x4_t cmp;

  u = vreinterpretq_u32_f32 (x);
  /* Narrowing high-half compare flags subnormal/zero/negative/inf/nan lanes
     in one unsigned range check.  */
  cmp = vcge_u16 (vsubhn_u32 (u, d->min_norm),
		  vget_low_u16 (d->special_bound));

  /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
  u = vsubq_u32 (u, d->off);
  n = vcvtq_f32_s32 (
      vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */
  u = vandq_u32 (u, d->mantissa_mask);
  u = vaddq_u32 (u, d->off);
  r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));

  /* y = log(1+r) + n*ln2. */
  r2 = vmulq_f32 (r, r);
  /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */
  p = vfmaq_f32 (P (5), P (6), r);
  q = vfmaq_f32 (P (3), P (4), r);
  y = vfmaq_f32 (P (1), P (2), r);
  p = vfmaq_f32 (p, P (7), r2);
  q = vfmaq_f32 (q, p, r2);
  y = vfmaq_f32 (y, q, r2);
  p = vfmaq_f32 (r, d->ln2, n);

  if (unlikely (v_any_u16h (cmp)))
    return special_case (x, y, r2, p, cmp);
  return vfmaq_f32 (p, y, r2);
}
|
135
contrib/arm-optimized-routines/math/aarch64/v_math.h
Normal file
135
contrib/arm-optimized-routines/math/aarch64/v_math.h
Normal file
@ -0,0 +1,135 @@
|
||||
/*
|
||||
* Vector math abstractions.
|
||||
*
|
||||
* Copyright (c) 2019-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef _V_MATH_H
|
||||
#define _V_MATH_H
|
||||
|
||||
#if !__aarch64__
|
||||
# error "Cannot build without AArch64"
|
||||
#endif
|
||||
|
||||
#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
|
||||
|
||||
#define V_NAME_F1(fun) _ZGVnN4v_##fun##f
|
||||
#define V_NAME_D1(fun) _ZGVnN2v_##fun
|
||||
#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f
|
||||
#define V_NAME_D2(fun) _ZGVnN2vv_##fun
|
||||
|
||||
#include <stdint.h>
|
||||
#include "../math_config.h"
|
||||
#include <arm_neon.h>
|
||||
|
||||
/* Shorthand helpers for declaring constants. */
|
||||
# define V2(X) { X, X }
|
||||
# define V4(X) { X, X, X, X }
|
||||
# define V8(X) { X, X, X, X, X, X, X, X }
|
||||
|
||||
/* True if any of the four 16-bit lanes of X is non-zero.  */
static inline int
v_any_u16h (uint16x4_t x)
{
  return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0;
}
|
||||
|
||||
/* Number of 32-bit lanes in a vector.  */
static inline int
v_lanes32 (void)
{
  return 4;
}
|
||||
|
||||
/* Broadcast the scalar X to all four float lanes.  */
static inline float32x4_t
v_f32 (float x)
{
  return vdupq_n_f32 (x);
}
|
||||
/* Broadcast the scalar X to all four 32-bit lanes.  */
static inline uint32x4_t
v_u32 (uint32_t x)
{
  return vdupq_n_u32 (x);
}
|
||||
/* true if any elements of a v_cond result is non-zero. */
static inline int
v_any_u32 (uint32x4_t x)
{
  /* assume elements in x are either 0 or -1u.  (The pairwise 64-bit sum of
     0/-1 lane patterns is non-zero iff any lane is set.)  */
  return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0;
}
|
||||
/* True if either 32-bit lane of the 64-bit half-vector X is non-zero.  */
static inline int
v_any_u32h (uint32x2_t x)
{
  return vget_lane_u64 (vreinterpret_u64_u32 (x), 0) != 0;
}
|
||||
/* Per-lane gather: TAB[idx] for each of the four lanes of IDX.  */
static inline float32x4_t
v_lookup_f32 (const float *tab, uint32x4_t idx)
{
  return (float32x4_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
}
|
||||
/* Per-lane gather: TAB[idx] for each of the four lanes of IDX.  */
static inline uint32x4_t
v_lookup_u32 (const uint32_t *tab, uint32x4_t idx)
{
  return (uint32x4_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
}
|
||||
/* Scalar fallback: for each lane where P is non-zero, recompute with the
   scalar function F; other lanes keep the vector result Y.  */
static inline float32x4_t
v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p)
{
  return (float32x4_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1],
		       p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]};
}
|
||||
/* Two-argument scalar fallback: lanes flagged in P are recomputed with
   F (x1, x2); other lanes keep the vector result Y.  */
static inline float32x4_t
v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2,
	     float32x4_t y, uint32x4_t p)
{
  return (float32x4_t){p[0] ? f (x1[0], x2[0]) : y[0],
		       p[1] ? f (x1[1], x2[1]) : y[1],
		       p[2] ? f (x1[2], x2[2]) : y[2],
		       p[3] ? f (x1[3], x2[3]) : y[3]};
}
|
||||
|
||||
/* Number of 64-bit lanes in a vector.  */
static inline int
v_lanes64 (void)
{
  return 2;
}
|
||||
/* Broadcast the scalar X to both double lanes.  */
static inline float64x2_t
v_f64 (double x)
{
  return vdupq_n_f64 (x);
}
|
||||
/* Broadcast the scalar X to both 64-bit lanes.  */
static inline uint64x2_t
v_u64 (uint64_t x)
{
  return vdupq_n_u64 (x);
}
|
||||
/* true if any elements of a v_cond result is non-zero. */
static inline int
v_any_u64 (uint64x2_t x)
{
  /* assume elements in x are either 0 or -1u. */
  return vpaddd_u64 (x) != 0;
}
|
||||
/* Per-lane gather: TAB[idx] for each of the two lanes of IDX.  */
static inline float64x2_t
v_lookup_f64 (const double *tab, uint64x2_t idx)
{
  return (float64x2_t){tab[idx[0]], tab[idx[1]]};
}
|
||||
/* Per-lane gather: TAB[idx] for each of the two lanes of IDX.  */
static inline uint64x2_t
v_lookup_u64 (const uint64_t *tab, uint64x2_t idx)
{
  return (uint64x2_t){tab[idx[0]], tab[idx[1]]};
}
|
||||
/* Scalar fallback for two-lane doubles: lanes flagged in P are recomputed
   with F.  Lane 1's inputs are copied to scalars before the first call
   (NOTE(review): presumably to keep them live in scalar registers across the
   call rather than re-extracting from the vector — confirm).  */
static inline float64x2_t
v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p)
{
  double p1 = p[1];
  double x1 = x[1];
  if (likely (p[0]))
    y[0] = f (x[0]);
  if (likely (p1))
    y[1] = f (x1);
  return y;
}
|
||||
|
||||
#endif
|
22
contrib/arm-optimized-routines/math/aarch64/v_pow.c
Normal file
22
contrib/arm-optimized-routines/math/aarch64/v_pow.c
Normal file
@ -0,0 +1,22 @@
|
||||
/*
|
||||
* Double-precision vector pow function.
|
||||
*
|
||||
* Copyright (c) 2020-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
|
||||
/* Double-precision vector pow, computed lane-by-lane with the scalar pow
   routine.  */
float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y)
{
  float64x2_t ret;
  for (int i = 0; i < v_lanes64 (); i++)
    ret[i] = pow (x[i], y[i]);
  return ret;
}
|
148
contrib/arm-optimized-routines/math/aarch64/v_powf.c
Normal file
148
contrib/arm-optimized-routines/math/aarch64/v_powf.c
Normal file
@ -0,0 +1,148 @@
|
||||
/*
|
||||
* Single-precision vector powf function.
|
||||
*
|
||||
* Copyright (c) 2019-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
|
||||
#define Min v_u32 (0x00800000)
|
||||
#define Max v_u32 (0x7f800000)
|
||||
#define Thresh v_u32 (0x7f000000) /* Max - Min. */
|
||||
#define MantissaMask v_u32 (0x007fffff)
|
||||
|
||||
#define A data.log2_poly
|
||||
#define C data.exp2f_poly
|
||||
|
||||
/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2). */
|
||||
#define Off v_u32 (0x3f35d000)
|
||||
|
||||
#define V_POWF_LOG2_TABLE_BITS 5
|
||||
#define V_EXP2F_TABLE_BITS 5
|
||||
#define Log2IdxMask v_u32 ((1 << V_POWF_LOG2_TABLE_BITS) - 1)
|
||||
#define Scale ((double) (1 << V_EXP2F_TABLE_BITS))
|
||||
|
||||
/* Tables and polynomial coefficients for the single-precision vector powf
   below: a log2 table/polynomial (scaled by Scale) and an exp2 table/
   polynomial (descaled by Scale).  */
static const struct
{
  struct
  {
    double invc, logc;
  } log2_tab[1 << V_POWF_LOG2_TABLE_BITS];
  double log2_poly[4];
  uint64_t exp2f_tab[1 << V_EXP2F_TABLE_BITS];
  double exp2f_poly[3];
} data = {
  .log2_tab = {{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * Scale},
	       {0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * Scale},
	       {0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * Scale},
	       {0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * Scale},
	       {0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * Scale},
	       {0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * Scale},
	       {0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * Scale},
	       {0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * Scale},
	       {0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * Scale},
	       {0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * Scale},
	       {0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * Scale},
	       {0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * Scale},
	       {0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * Scale},
	       {0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * Scale},
	       {0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * Scale},
	       {0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * Scale},
	       {0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * Scale},
	       {0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * Scale},
	       {0x1p+0, 0x0p+0 * Scale},
	       {0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * Scale},
	       {0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * Scale},
	       {0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * Scale},
	       {0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * Scale},
	       {0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * Scale},
	       {0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * Scale},
	       {0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * Scale},
	       {0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * Scale},
	       {0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * Scale},
	       {0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * Scale},
	       {0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * Scale},
	       {0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * Scale},
	       {0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * Scale},},
  .log2_poly = { /* rel err: 1.5 * 2^-30. */
		 -0x1.6ff5daa3b3d7cp-2 * Scale, 0x1.ec81d03c01aebp-2 * Scale,
		 -0x1.71547bb43f101p-1 * Scale, 0x1.7154764a815cbp0 * Scale,},
  .exp2f_tab = {0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f,
		0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa,
		0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715,
		0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d,
		0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429,
		0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74,
		0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db,
		0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d,
		0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c,
		0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f,
		0x3fefa4afa2a490da, 0x3fefd0765b6e4540,},
  .exp2f_poly = { /* rel err: 1.69 * 2^-34. */
		  0x1.c6af84b912394p-5 / Scale / Scale / Scale,
		  0x1.ebfce50fac4f3p-3 / Scale / Scale,
		  0x1.62e42ff0c52d6p-1 / Scale}};
|
||||
|
||||
/* Scalar fallback: lanes flagged in CMP are recomputed with scalar powf.  */
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t x, float32x4_t y, float32x4_t ret, uint32x4_t cmp)
{
  return v_call2_f32 (powf, x, y, ret, cmp);
}
|
||||
|
||||
/* Single-precision vector pow: table indices are computed with vector ops,
   then the log2/exp2 core runs per lane in double precision.  */
float32x4_t VPCS_ATTR V_NAME_F2 (pow) (float32x4_t x, float32x4_t y)
{
  uint32x4_t u = vreinterpretq_u32_f32 (x);
  /* Single range check flags subnormal/zero/negative/inf/nan x.  */
  uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (u, Min), Thresh);
  uint32x4_t tmp = vsubq_u32 (u, Off);
  uint32x4_t i = vandq_u32 (vshrq_n_u32 (tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
			    Log2IdxMask);
  uint32x4_t top = vbicq_u32 (tmp, MantissaMask);
  uint32x4_t iz = vsubq_u32 (u, top);
  int32x4_t k = vshrq_n_s32 (vreinterpretq_s32_u32 (top),
			     23 - V_EXP2F_TABLE_BITS); /* arithmetic shift. */

  float32x4_t ret;
  for (int lane = 0; lane < 4; lane++)
    {
      /* Use double precision for each lane. */
      double invc = data.log2_tab[i[lane]].invc;
      double logc = data.log2_tab[i[lane]].logc;
      double z = (double) asfloat (iz[lane]);

      /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */
      double r = __builtin_fma (z, invc, -1.0);
      double y0 = logc + (double) k[lane];

      /* Polynomial to approximate log1p(r)/ln2. */
      double logx = A[0];
      logx = r * logx + A[1];
      logx = r * logx + A[2];
      logx = r * logx + A[3];
      logx = r * logx + y0;
      double ylogx = y[lane] * logx;
      /* Flag the lane if y*log2(x) is large enough that exp2 may
	 overflow/underflow.  */
      cmp[lane] = (asuint64 (ylogx) >> 47 & 0xffff)
		      >= asuint64 (126.0 * (1 << V_EXP2F_TABLE_BITS)) >> 47
		  ? 1
		  : cmp[lane];

      /* N*x = k + r with r in [-1/2, 1/2]. */
      double kd = round (ylogx);
      uint64_t ki = lround (ylogx);
      r = ylogx - kd;

      /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */
      uint64_t t = data.exp2f_tab[ki % (1 << V_EXP2F_TABLE_BITS)];
      t += ki << (52 - V_EXP2F_TABLE_BITS);
      double s = asdouble (t);
      double p = C[0];
      p = __builtin_fma (p, r, C[1]);
      p = __builtin_fma (p, r, C[2]);
      p = __builtin_fma (p, s * r, s);

      ret[lane] = p;
    }
  if (unlikely (v_any_u32 (cmp)))
    return special_case (x, y, ret, cmp);
  return ret;
}
|
97
contrib/arm-optimized-routines/math/aarch64/v_sin.c
Normal file
97
contrib/arm-optimized-routines/math/aarch64/v_sin.c
Normal file
@ -0,0 +1,97 @@
|
||||
/*
|
||||
* Double-precision vector sin function.
|
||||
*
|
||||
* Copyright (c) 2019-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
|
||||
/* Constants for the double-precision vector sin below.  */
static const struct data
{
  float64x2_t poly[7];		/* sin(r) polynomial coefficients.  */
  /* pi_1/2/3: pi split into three parts for the range reduction.  */
  float64x2_t range_val, inv_pi, shift, pi_1, pi_2, pi_3;
} data = {
  .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
	    V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
	    V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
	    V2 (-0x1.9e9540300a1p-41) },

  .range_val = V2 (0x1p23),
  .inv_pi = V2 (0x1.45f306dc9c883p-2),
  .pi_1 = V2 (0x1.921fb54442d18p+1),
  .pi_2 = V2 (0x1.1a62633145c06p-53),
  .pi_3 = V2 (0x1.c1cd129024e09p-106),
  .shift = V2 (0x1.8p52),
};
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
# define TinyBound v_u64 (0x3000000000000000) /* asuint64 (0x1p-255). */
|
||||
# define Thresh v_u64 (0x1160000000000000) /* RangeVal - TinyBound. */
|
||||
#endif
|
||||
|
||||
#define C(i) d->poly[i]
|
||||
|
||||
static float64x2_t VPCS_ATTR NOINLINE
|
||||
special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
|
||||
{
|
||||
y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
|
||||
return v_call_f64 (sin, x, y, cmp);
|
||||
}
|
||||
|
||||
/* Vector (AdvSIMD) sin approximation.
   Maximum observed error in [-pi/2, pi/2], where argument is not reduced,
   is 2.87 ULP:
   _ZGVnN2v_sin (0x1.921d5c6a07142p+0) got 0x1.fffffffa7dc02p-1
				       want 0x1.fffffffa7dc05p-1
   Maximum observed error in the entire non-special domain ([-2^23, 2^23])
   is 3.22 ULP:
   _ZGVnN2v_sin (0x1.5702447b6f17bp+22) got 0x1.ffdcd125c84fbp-3
					want 0x1.ffdcd125c84f8p-3.  */
float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x)
{
  const struct data *d = ptr_barrier (&data);
  float64x2_t n, r, r2, r3, r4, y, t1, t2, t3;
  uint64x2_t odd, cmp;

#if WANT_SIMD_EXCEPT
  /* Detect |x| <= TinyBound or |x| >= RangeVal. If fenv exceptions are to be
     triggered correctly, set any special lanes to 1 (which is neutral w.r.t.
     fenv). These lanes will be fixed by special-case handler later. */
  uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x));
  cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh);
  r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x);
#else
  /* Without fenv concerns only |x| >= range_val needs the fallback.  */
  r = x;
  cmp = vcageq_f64 (x, d->range_val);
#endif

  /* n = rint(|x|/pi).  Adding the large constant shift forces the FP unit
     to round n to the nearest integer in the low mantissa bits.  */
  n = vfmaq_f64 (d->shift, d->inv_pi, r);
  /* The low bit of the shifted n is the parity of the quotient; move it to
     the sign-bit position so it can negate the result for odd n.  */
  odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
  n = vsubq_f64 (n, d->shift);

  /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2), using the
     three-part split of pi for extra precision.  */
  r = vfmsq_f64 (r, d->pi_1, n);
  r = vfmsq_f64 (r, d->pi_2, n);
  r = vfmsq_f64 (r, d->pi_3, n);

  /* sin(r) poly approx, evaluated pairwise in powers r2/r4 to shorten the
     FMA dependency chain.  */
  r2 = vmulq_f64 (r, r);
  r3 = vmulq_f64 (r2, r);
  r4 = vmulq_f64 (r2, r2);

  t1 = vfmaq_f64 (C (4), C (5), r2);
  t2 = vfmaq_f64 (C (2), C (3), r2);
  t3 = vfmaq_f64 (C (0), C (1), r2);

  y = vfmaq_f64 (t1, C (6), r4);
  y = vfmaq_f64 (t2, y, r4);
  y = vfmaq_f64 (t3, y, r4);
  /* y = r + r^3 * poly(r^2).  */
  y = vfmaq_f64 (r, y, r3);

  if (unlikely (v_any_u64 (cmp)))
    return special_case (x, y, odd, cmp);
  /* Apply the parity sign: odd multiples of pi negate sin.  */
  return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
}
|
82
contrib/arm-optimized-routines/math/aarch64/v_sinf.c
Normal file
82
contrib/arm-optimized-routines/math/aarch64/v_sinf.c
Normal file
@ -0,0 +1,82 @@
|
||||
/*
|
||||
* Single-precision vector sin function.
|
||||
*
|
||||
* Copyright (c) 2019-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
|
||||
/* Constant data for the AdvSIMD sinf kernel, grouped in one struct so the
   kernel can address all of it through a single barrier'd pointer.  */
static const struct data
{
  /* Coefficients of the sin polynomial on the reduced interval:
     sin(r) ~= r + r * r^2 * poly(r^2).  */
  float32x4_t poly[4];
  float32x4_t range_val, inv_pi, shift, pi_1, pi_2, pi_3;
} data = {
  /* 1.886 ulp error. */
  .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
            V4 (0x1.5b2e76p-19f) },

  /* pi split into three single-precision parts (pi_1 + pi_2 + pi_3 ~= pi)
     so that r = x - n*pi keeps extra precision.  */
  .pi_1 = V4 (0x1.921fb6p+1f),
  .pi_2 = V4 (-0x1.777a5cp-24f),
  .pi_3 = V4 (-0x1.ee59dap-49f),

  /* 1/pi, for computing the reduction quotient n = rint(x/pi).  */
  .inv_pi = V4 (0x1.45f306p-2f),
  /* 1.5 * 2^23 — adding it is the kernel's round-to-nearest-integer trick.  */
  .shift = V4 (0x1.8p+23f),
  /* Lanes with |x| >= range_val (2^20) are sent to the scalar fallback.  */
  .range_val = V4 (0x1p20f)
};

#if WANT_SIMD_EXCEPT
# define TinyBound v_u32 (0x21000000) /* asuint32(0x1p-61f). */
# define Thresh v_u32 (0x28800000) /* RangeVal - TinyBound. */
#endif

/* Shorthand for the polynomial coefficients.  */
#define C(i) d->poly[i]
|
||||
|
||||
static float32x4_t VPCS_ATTR NOINLINE
|
||||
special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
|
||||
{
|
||||
/* Fall back to scalar code. */
|
||||
y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
|
||||
return v_call_f32 (sinf, x, y, cmp);
|
||||
}
|
||||
|
||||
/* Vector (AdvSIMD) single-precision sin: range-reduce by multiples of pi,
   evaluate a polynomial on [-pi/2, pi/2] and restore the sign from the
   parity of the quotient.  Out-of-range lanes fall back to scalar sinf.  */
float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x)
{
  const struct data *d = ptr_barrier (&data);
  float32x4_t n, r, r2, y;
  uint32x4_t odd, cmp;

#if WANT_SIMD_EXCEPT
  /* Detect |x| <= TinyBound or |x| >= RangeVal with one unsigned compare.  */
  uint32x4_t ir = vreinterpretq_u32_f32 (vabsq_f32 (x));
  cmp = vcgeq_u32 (vsubq_u32 (ir, TinyBound), Thresh);
  /* If fenv exceptions are to be triggered correctly, set any special lanes
     to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
     special-case handler later. */
  r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x);
#else
  /* Without fenv concerns only |x| >= range_val needs the fallback.  */
  r = x;
  cmp = vcageq_f32 (x, d->range_val);
#endif

  /* n = rint(|x|/pi) — adding the large shift constant rounds to the
     nearest integer in the low mantissa bits.  */
  n = vfmaq_f32 (d->shift, d->inv_pi, r);
  /* Low bit of the shifted n is the quotient's parity; move it to the
     sign-bit position so it can negate the result for odd n.  */
  odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
  n = vsubq_f32 (n, d->shift);

  /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2), using the
     three-part split of pi for extra precision.  */
  r = vfmsq_f32 (r, d->pi_1, n);
  r = vfmsq_f32 (r, d->pi_2, n);
  r = vfmsq_f32 (r, d->pi_3, n);

  /* y = sin(r), Horner evaluation in r2 then one final r*r2 multiply:
     y = r + r * r^2 * poly(r^2).  */
  r2 = vmulq_f32 (r, r);
  y = vfmaq_f32 (C (2), C (3), r2);
  y = vfmaq_f32 (C (1), y, r2);
  y = vfmaq_f32 (C (0), y, r2);
  y = vfmaq_f32 (r, vmulq_f32 (y, r2), r);

  if (unlikely (v_any_u32 (cmp)))
    return special_case (x, y, odd, cmp);
  /* Apply the parity sign: odd multiples of pi negate sin.  */
  return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
}
|
129
contrib/arm-optimized-routines/math/exp10.c
Normal file
129
contrib/arm-optimized-routines/math/exp10.c
Normal file
@ -0,0 +1,129 @@
|
||||
/*
|
||||
* Double-precision 10^x function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "math_config.h"
|
||||
|
||||
/* Table size and index mask for the shared 2^(k/N) lookup table.  */
#define N (1 << EXP_TABLE_BITS)
#define IndexMask (N - 1)
/* Input bounds: beyond these exp10 overflows/underflows double range.  */
#define OFlowBound 0x1.34413509f79ffp8 /* log10(DBL_MAX). */
#define UFlowBound -0x1.5ep+8 /* -350. */
/* Top-12-bit (sign+exponent) thresholds used for the fast range check.  */
#define SmallTop 0x3c6 /* top12(0x1p-57). */
#define BigTop 0x407 /* top12(0x1p8). */
#define Thresh 0x41 /* BigTop - SmallTop. */
#define Shift __exp_data.shift
/* Polynomial coefficients for the 2^(r/N) correction term.  */
#define C(i) __exp_data.exp10_poly[i]
|
||||
|
||||
/* Handle results at the edge of double range: SBITS holds the (possibly
   out-of-range) bit pattern of the scale 2^(k/N), TMP the polynomial
   correction so that the result is scale + scale*tmp, KI the rounded
   scaled exponent.  */
static double
special_case (uint64_t sbits, double_t tmp, uint64_t ki)
{
  double_t scale, y;

  /* NOTE(review): this unsigned test selects the large-positive-ki
     (overflow-side) case; negative ki wraps to a huge value and falls
     through to the subnormal path below — confirm against exp.c.  */
  if (ki - (1ull << 16) < 0x80000000)
    {
      /* The exponent of scale might have overflowed by 1: halve scale
	 and double the final sum instead.  */
      sbits -= 1ull << 52;
      scale = asdouble (sbits);
      y = 2 * (scale + scale * tmp);
      return check_oflow (eval_as_double (y));
    }

  /* n < 0, need special care in the subnormal range. */
  /* Bias scale up by 2^1022 so it is normal, then scale back down at the
     end with an exact 2^-1022 multiply.  */
  sbits += 1022ull << 52;
  scale = asdouble (sbits);
  y = scale + scale * tmp;

  if (y < 1.0)
    {
      /* Round y to the right precision before scaling it into the subnormal
	 range to avoid double rounding that can cause 0.5+E/2 ulp error where
	 E is the worst-case ulp error outside the subnormal range. So this
	 is only useful if the goal is better than 1 ulp worst-case error. */
      double_t lo = scale - y + scale * tmp;
      double_t hi = 1.0 + y;
      lo = 1.0 - hi + y + lo;
      y = eval_as_double (hi + lo) - 1.0;
      /* Avoid -0.0 with downward rounding. */
      if (WANT_ROUNDING && y == 0.0)
	y = 0.0;
      /* The underflow exception needs to be signaled explicitly. */
      force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022);
    }
  y = 0x1p-1022 * y;

  return check_uflow (y);
}
|
||||
|
||||
/* Double-precision 10^x approximation. Largest observed error is ~0.513 ULP.
   Algorithm: write 10^x = 2^(k/N) * 2^(r/N) with k = round(x*N/log10(2));
   2^(k/N) comes from the shared lookup table, 2^(r/N) from a short
   polynomial.  */
double
exp10 (double x)
{
  uint64_t ix = asuint64 (x);
  /* Biased exponent bits of |x| — the sign is stripped by the mask.  */
  uint32_t abstop = (ix >> 52) & 0x7ff;

  /* One unsigned compare filters out tiny, huge, NaN and inf inputs.  */
  if (unlikely (abstop - SmallTop >= Thresh))
    {
      if (abstop - SmallTop >= 0x80000000)
	/* Avoid spurious underflow for tiny x.
	   Note: 0 is common input.  */
	return x + 1;
      if (abstop == 0x7ff)
	/* 10^-inf = 0; NaN and +inf propagate via x + 1.  */
	return ix == asuint64 (-INFINITY) ? 0.0 : x + 1.0;
      if (x >= OFlowBound)
	return __math_oflow (0);
      if (x < UFlowBound)
	return __math_uflow (0);

      /* Large x is special-cased below. */
      abstop = 0;
    }

  /* Reduce x: z = x * N / log10(2), k = round(z). */
  double_t z = __exp_data.invlog10_2N * x;
  double_t kd;
  int64_t ki;
#if TOINT_INTRINSICS
  kd = roundtoint (z);
  ki = converttoint (z);
#else
  /* Round via the add-a-big-constant trick, then convert by truncation
     (exact since kd is already an integer).  */
  kd = eval_as_double (z + Shift);
  kd -= Shift;
  ki = kd;
#endif

  /* r = x - k * log10(2), r in [-0.5, 0.5], computed with a two-part
     split of -log10(2)/N for extra precision.  */
  double_t r = x;
  r = __exp_data.neglog10_2hiN * kd + r;
  r = __exp_data.neglog10_2loN * kd + r;

  /* exp10(x) = 2^(k/N) * 2^(r/N).
     Approximate the two components separately.  */

  /* s = 2^(k/N), using lookup table: tab[2*k+1] holds the scale bits minus
     the exponent contribution, which e adds back in.  */
  uint64_t e = ki << (52 - EXP_TABLE_BITS);
  uint64_t i = (ki & IndexMask) * 2;
  uint64_t u = __exp_data.tab[i + 1];
  uint64_t sbits = u + e;

  /* tab[2*k] is the tail correction T[k] of the table entry.  */
  double_t tail = asdouble (__exp_data.tab[i]);

  /* 2^(r/N) ~= 1 + r * Poly(r). */
  double_t r2 = r * r;
  double_t p = C (0) + r * C (1);
  double_t y = C (2) + r * C (3);
  y = y + r2 * C (4);
  y = p + r2 * y;
  y = tail + y * r;

  /* abstop == 0 marks results too close to the overflow/underflow bounds
     for the plain scaling below.  */
  if (unlikely (abstop == 0))
    return special_case (sbits, y, ki);

  /* Assemble components:
     y = 2^(r/N) * 2^(k/N)
       ~= (y + 1) * s. */
  double_t s = asdouble (sbits);
  return eval_as_double (s * y + s);
}
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Shared data between exp, exp2 and pow.
|
||||
*
|
||||
* Copyright (c) 2018, Arm Limited.
|
||||
* Copyright (c) 2018-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
@ -12,6 +12,7 @@
|
||||
const struct exp_data __exp_data = {
|
||||
// N/ln2
|
||||
.invln2N = 0x1.71547652b82fep0 * N,
|
||||
.invlog10_2N = 0x1.a934f0979a371p1 * N,
|
||||
// -ln2/N
|
||||
#if N == 64
|
||||
.negln2hiN = -0x1.62e42fefa0000p-7,
|
||||
@ -26,6 +27,8 @@ const struct exp_data __exp_data = {
|
||||
.negln2hiN = -0x1.62e42fef80000p-10,
|
||||
.negln2loN = -0x1.1cf79abc9e3b4p-45,
|
||||
#endif
|
||||
.neglog10_2hiN = -0x1.3441350ap-2 / N,
|
||||
.neglog10_2loN = 0x1.0c0219dc1da99p-39 / N,
|
||||
// Used for rounding when !TOINT_INTRINSICS
|
||||
#if EXP_USE_TOINT_NARROW
|
||||
.shift = 0x1800000000.8p0,
|
||||
@ -147,6 +150,24 @@ const struct exp_data __exp_data = {
|
||||
0x1.3b2ab786ee1dap-7,
|
||||
#endif
|
||||
},
|
||||
.exp10_poly = {
|
||||
#if EXP10_POLY_WIDE
|
||||
/* Range is wider if using shift-based reduction: coeffs generated
|
||||
using Remez in [-log10(2)/128, log10(2)/128 ]. */
|
||||
0x1.26bb1bbb55515p1,
|
||||
0x1.53524c73cd32bp1,
|
||||
0x1.0470591e1a108p1,
|
||||
0x1.2bd77b12fe9a8p0,
|
||||
0x1.14289fef24b78p-1
|
||||
#else
|
||||
/* Coeffs generated using Remez in [-log10(2)/256, log10(2)/256 ]. */
|
||||
0x1.26bb1bbb55516p1,
|
||||
0x1.53524c73ce9fep1,
|
||||
0x1.0470591ce4b26p1,
|
||||
0x1.2bd76577fe684p0,
|
||||
0x1.1446eeccd0efbp-1
|
||||
#endif
|
||||
},
|
||||
// 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N)
|
||||
// tab[2*k] = asuint64(T[k])
|
||||
// tab[2*k+1] = asuint64(H[k]) - (k << 52)/N
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Public API.
|
||||
*
|
||||
* Copyright (c) 2015-2020, Arm Limited.
|
||||
* Copyright (c) 2015-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
@ -18,74 +18,33 @@ float cosf (float);
|
||||
void sincosf (float, float*, float*);
|
||||
|
||||
double exp (double);
|
||||
double exp10 (double);
|
||||
double exp2 (double);
|
||||
double log (double);
|
||||
double log2 (double);
|
||||
double pow (double, double);
|
||||
|
||||
/* Scalar functions using the vector algorithm with identical result. */
|
||||
float __s_sinf (float);
|
||||
float __s_cosf (float);
|
||||
float __s_expf (float);
|
||||
float __s_expf_1u (float);
|
||||
float __s_exp2f (float);
|
||||
float __s_exp2f_1u (float);
|
||||
float __s_logf (float);
|
||||
float __s_powf (float, float);
|
||||
double __s_sin (double);
|
||||
double __s_cos (double);
|
||||
double __s_exp (double);
|
||||
double __s_log (double);
|
||||
double __s_pow (double, double);
|
||||
|
||||
#if __aarch64__
|
||||
#if __GNUC__ >= 5
|
||||
# if __GNUC__ >= 5
|
||||
typedef __Float32x4_t __f32x4_t;
|
||||
typedef __Float64x2_t __f64x2_t;
|
||||
#elif __clang_major__*100+__clang_minor__ >= 305
|
||||
# elif __clang_major__*100+__clang_minor__ >= 305
|
||||
typedef __attribute__((__neon_vector_type__(4))) float __f32x4_t;
|
||||
typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t;
|
||||
#else
|
||||
#error Unsupported compiler
|
||||
#endif
|
||||
# else
|
||||
# error Unsupported compiler
|
||||
# endif
|
||||
|
||||
/* Vector functions following the base PCS. */
|
||||
__f32x4_t __v_sinf (__f32x4_t);
|
||||
__f32x4_t __v_cosf (__f32x4_t);
|
||||
__f32x4_t __v_expf (__f32x4_t);
|
||||
__f32x4_t __v_expf_1u (__f32x4_t);
|
||||
__f32x4_t __v_exp2f (__f32x4_t);
|
||||
__f32x4_t __v_exp2f_1u (__f32x4_t);
|
||||
__f32x4_t __v_logf (__f32x4_t);
|
||||
__f32x4_t __v_powf (__f32x4_t, __f32x4_t);
|
||||
__f64x2_t __v_sin (__f64x2_t);
|
||||
__f64x2_t __v_cos (__f64x2_t);
|
||||
__f64x2_t __v_exp (__f64x2_t);
|
||||
__f64x2_t __v_log (__f64x2_t);
|
||||
__f64x2_t __v_pow (__f64x2_t, __f64x2_t);
|
||||
|
||||
#if __GNUC__ >= 9 || __clang_major__ >= 8
|
||||
#define __vpcs __attribute__((__aarch64_vector_pcs__))
|
||||
|
||||
/* Vector functions following the vector PCS. */
|
||||
__vpcs __f32x4_t __vn_sinf (__f32x4_t);
|
||||
__vpcs __f32x4_t __vn_cosf (__f32x4_t);
|
||||
__vpcs __f32x4_t __vn_expf (__f32x4_t);
|
||||
__vpcs __f32x4_t __vn_expf_1u (__f32x4_t);
|
||||
__vpcs __f32x4_t __vn_exp2f (__f32x4_t);
|
||||
__vpcs __f32x4_t __vn_exp2f_1u (__f32x4_t);
|
||||
__vpcs __f32x4_t __vn_logf (__f32x4_t);
|
||||
__vpcs __f32x4_t __vn_powf (__f32x4_t, __f32x4_t);
|
||||
__vpcs __f64x2_t __vn_sin (__f64x2_t);
|
||||
__vpcs __f64x2_t __vn_cos (__f64x2_t);
|
||||
__vpcs __f64x2_t __vn_exp (__f64x2_t);
|
||||
__vpcs __f64x2_t __vn_log (__f64x2_t);
|
||||
__vpcs __f64x2_t __vn_pow (__f64x2_t, __f64x2_t);
|
||||
# if __GNUC__ >= 9 || __clang_major__ >= 8
|
||||
# undef __vpcs
|
||||
# define __vpcs __attribute__((__aarch64_vector_pcs__))
|
||||
|
||||
/* Vector functions following the vector PCS using ABI names. */
|
||||
__vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_expf_1u (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_exp2f_1u (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_exp2f (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4vv_powf (__f32x4_t, __f32x4_t);
|
||||
@ -94,7 +53,7 @@ __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t);
|
||||
#endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Configuration for math routines.
|
||||
*
|
||||
* Copyright (c) 2017-2020, Arm Limited.
|
||||
* Copyright (c) 2017-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
@ -92,6 +92,46 @@
|
||||
# define unlikely(x) (x)
|
||||
#endif
|
||||
|
||||
/* Return ptr but hide its value from the compiler so accesses through it
|
||||
cannot be optimized based on the contents. */
|
||||
#define ptr_barrier(ptr) \
|
||||
({ \
|
||||
__typeof (ptr) __ptr = (ptr); \
|
||||
__asm("" : "+r"(__ptr)); \
|
||||
__ptr; \
|
||||
})
|
||||
|
||||
/* Symbol renames to avoid libc conflicts. */
|
||||
#define __math_oflowf arm_math_oflowf
|
||||
#define __math_uflowf arm_math_uflowf
|
||||
#define __math_may_uflowf arm_math_may_uflowf
|
||||
#define __math_divzerof arm_math_divzerof
|
||||
#define __math_oflow arm_math_oflow
|
||||
#define __math_uflow arm_math_uflow
|
||||
#define __math_may_uflow arm_math_may_uflow
|
||||
#define __math_divzero arm_math_divzero
|
||||
#define __math_invalidf arm_math_invalidf
|
||||
#define __math_invalid arm_math_invalid
|
||||
#define __math_check_oflow arm_math_check_oflow
|
||||
#define __math_check_uflow arm_math_check_uflow
|
||||
#define __math_check_oflowf arm_math_check_oflowf
|
||||
#define __math_check_uflowf arm_math_check_uflowf
|
||||
|
||||
#define __sincosf_table arm_math_sincosf_table
|
||||
#define __inv_pio4 arm_math_inv_pio4
|
||||
#define __exp2f_data arm_math_exp2f_data
|
||||
#define __logf_data arm_math_logf_data
|
||||
#define __log2f_data arm_math_log2f_data
|
||||
#define __powf_log2_data arm_math_powf_log2_data
|
||||
#define __exp_data arm_math_exp_data
|
||||
#define __log_data arm_math_log_data
|
||||
#define __log2_data arm_math_log2_data
|
||||
#define __pow_log_data arm_math_pow_log_data
|
||||
#define __erff_data arm_math_erff_data
|
||||
#define __erf_data arm_math_erf_data
|
||||
#define __v_exp_data arm_math_v_exp_data
|
||||
#define __v_log_data arm_math_v_log_data
|
||||
|
||||
#if HAVE_FAST_ROUND
|
||||
/* When set, the roundtoint and converttoint functions are provided with
|
||||
the semantics documented below. */
|
||||
@ -381,15 +421,22 @@ extern const struct powf_log2_data
|
||||
#define EXP_USE_TOINT_NARROW 0
|
||||
#define EXP2_POLY_ORDER 5
|
||||
#define EXP2_POLY_WIDE 0
|
||||
/* Wider exp10 polynomial necessary for good precision in non-nearest rounding
|
||||
and !TOINT_INTRINSICS. */
|
||||
#define EXP10_POLY_WIDE 0
|
||||
extern const struct exp_data
|
||||
{
|
||||
double invln2N;
|
||||
double invlog10_2N;
|
||||
double shift;
|
||||
double negln2hiN;
|
||||
double negln2loN;
|
||||
double neglog10_2hiN;
|
||||
double neglog10_2loN;
|
||||
double poly[4]; /* Last four coefficients. */
|
||||
double exp2_shift;
|
||||
double exp2_poly[EXP2_POLY_ORDER];
|
||||
double exp10_poly[5];
|
||||
uint64_t tab[2*(1 << EXP_TABLE_BITS)];
|
||||
} __exp_data HIDDEN;
|
||||
|
||||
@ -459,4 +506,16 @@ extern const struct erf_data
|
||||
double erfc_poly_F[ERFC_POLY_F_NCOEFFS];
|
||||
} __erf_data HIDDEN;
|
||||
|
||||
#define V_EXP_TABLE_BITS 7
|
||||
extern const uint64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN;
|
||||
|
||||
#define V_LOG_TABLE_BITS 7
|
||||
extern const struct v_log_data
|
||||
{
|
||||
struct
|
||||
{
|
||||
double invc, logc;
|
||||
} table[1 << V_LOG_TABLE_BITS];
|
||||
} __v_log_data HIDDEN;
|
||||
|
||||
#endif
|
||||
|
@ -1,6 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#define SCALAR 1
|
||||
#include "v_cos.c"
|
@ -1,6 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#define SCALAR 1
|
||||
#include "v_cosf.c"
|
@ -1,6 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#define SCALAR 1
|
||||
#include "v_exp.c"
|
@ -1,6 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#define SCALAR 1
|
||||
#include "v_exp2f.c"
|
@ -1,6 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#define SCALAR 1
|
||||
#include "v_exp2f_1u.c"
|
@ -1,6 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#define SCALAR 1
|
||||
#include "v_expf.c"
|
@ -1,6 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#define SCALAR 1
|
||||
#include "v_expf_1u.c"
|
@ -1,6 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#define SCALAR 1
|
||||
#include "v_log.c"
|
@ -1,6 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#define SCALAR 1
|
||||
#include "v_logf.c"
|
@ -1,6 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2020, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#define SCALAR 1
|
||||
#include "v_pow.c"
|
@ -1,6 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#define SCALAR 1
|
||||
#include "v_powf.c"
|
@ -1,6 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#define SCALAR 1
|
||||
#include "v_sin.c"
|
@ -1,6 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#define SCALAR 1
|
||||
#include "v_sinf.c"
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Microbenchmark for math functions.
|
||||
*
|
||||
* Copyright (c) 2018-2022, Arm Limited.
|
||||
* Copyright (c) 2018-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
@ -15,11 +15,6 @@
|
||||
#include <math.h>
|
||||
#include "mathlib.h"
|
||||
|
||||
#ifndef WANT_VMATH
|
||||
/* Enable the build of vector math code. */
|
||||
# define WANT_VMATH 1
|
||||
#endif
|
||||
|
||||
/* Number of measurements, best result is reported. */
|
||||
#define MEASURE 60
|
||||
/* Array size. */
|
||||
@ -34,8 +29,9 @@ static float Af[N];
|
||||
static long measurecount = MEASURE;
|
||||
static long itercount = ITER;
|
||||
|
||||
#if __aarch64__ && WANT_VMATH
|
||||
typedef __f64x2_t v_double;
|
||||
#ifdef __vpcs
|
||||
#include <arm_neon.h>
|
||||
typedef float64x2_t v_double;
|
||||
|
||||
#define v_double_len() 2
|
||||
|
||||
@ -51,7 +47,7 @@ v_double_dup (double x)
|
||||
return (v_double){x, x};
|
||||
}
|
||||
|
||||
typedef __f32x4_t v_float;
|
||||
typedef float32x4_t v_float;
|
||||
|
||||
#define v_float_len() 4
|
||||
|
||||
@ -66,6 +62,19 @@ v_float_dup (float x)
|
||||
{
|
||||
return (v_float){x, x, x, x};
|
||||
}
|
||||
#else
|
||||
/* dummy definitions to make things compile. */
|
||||
typedef double v_double;
|
||||
typedef float v_float;
|
||||
#define v_double_len(x) 1
|
||||
#define v_double_load(x) (x)[0]
|
||||
#define v_double_dup(x) (x)
|
||||
#define v_float_len(x) 1
|
||||
#define v_float_load(x) (x)[0]
|
||||
#define v_float_dup(x) (x)
|
||||
|
||||
#endif
|
||||
|
||||
#if WANT_SVE_MATH
|
||||
#include <arm_sve.h>
|
||||
typedef svbool_t sv_bool;
|
||||
@ -102,17 +111,10 @@ sv_float_dup (float x)
|
||||
{
|
||||
return svdup_n_f32(x);
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
/* dummy definitions to make things compile. */
|
||||
typedef double v_double;
|
||||
typedef float v_float;
|
||||
#define v_double_len(x) 1
|
||||
#define v_double_load(x) (x)[0]
|
||||
#define v_double_dup(x) (x)
|
||||
#define v_float_len(x) 1
|
||||
#define v_float_load(x) (x)[0]
|
||||
#define v_float_dup(x) (x)
|
||||
#define sv_double_len(x) 1
|
||||
#define sv_float_len(x) 1
|
||||
#endif
|
||||
|
||||
static double
|
||||
@ -126,20 +128,6 @@ dummyf (float x)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
#if WANT_VMATH
|
||||
#if __aarch64__
|
||||
static v_double
|
||||
__v_dummy (v_double x)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
|
||||
static v_float
|
||||
__v_dummyf (v_float x)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
|
||||
#ifdef __vpcs
|
||||
__vpcs static v_double
|
||||
__vn_dummy (v_double x)
|
||||
@ -166,8 +154,6 @@ __sv_dummyf (sv_float x, sv_bool pg)
|
||||
return x;
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include "test/mathbench_wrappers.h"
|
||||
@ -183,8 +169,6 @@ static const struct fun
|
||||
{
|
||||
double (*d) (double);
|
||||
float (*f) (float);
|
||||
v_double (*vd) (v_double);
|
||||
v_float (*vf) (v_float);
|
||||
#ifdef __vpcs
|
||||
__vpcs v_double (*vnd) (v_double);
|
||||
__vpcs v_float (*vnf) (v_float);
|
||||
@ -197,18 +181,12 @@ static const struct fun
|
||||
} funtab[] = {
|
||||
#define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}},
|
||||
#define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}},
|
||||
#define VD(func, lo, hi) {#func, 'd', 'v', lo, hi, {.vd = func}},
|
||||
#define VF(func, lo, hi) {#func, 'f', 'v', lo, hi, {.vf = func}},
|
||||
#define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}},
|
||||
#define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}},
|
||||
#define SVD(func, lo, hi) {#func, 'd', 's', lo, hi, {.svd = func}},
|
||||
#define SVF(func, lo, hi) {#func, 'f', 's', lo, hi, {.svf = func}},
|
||||
D (dummy, 1.0, 2.0)
|
||||
F (dummyf, 1.0, 2.0)
|
||||
#if WANT_VMATH
|
||||
#if __aarch64__
|
||||
VD (__v_dummy, 1.0, 2.0)
|
||||
VF (__v_dummyf, 1.0, 2.0)
|
||||
#ifdef __vpcs
|
||||
VND (__vn_dummy, 1.0, 2.0)
|
||||
VNF (__vn_dummyf, 1.0, 2.0)
|
||||
@ -217,14 +195,10 @@ VNF (__vn_dummyf, 1.0, 2.0)
|
||||
SVD (__sv_dummy, 1.0, 2.0)
|
||||
SVF (__sv_dummyf, 1.0, 2.0)
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#include "test/mathbench_funcs.h"
|
||||
{0},
|
||||
#undef F
|
||||
#undef D
|
||||
#undef VF
|
||||
#undef VD
|
||||
#undef VNF
|
||||
#undef VND
|
||||
#undef SVF
|
||||
@ -327,38 +301,6 @@ runf_latency (float f (float))
|
||||
prev = f (Af[i] + prev * z);
|
||||
}
|
||||
|
||||
static void
|
||||
run_v_thruput (v_double f (v_double))
|
||||
{
|
||||
for (int i = 0; i < N; i += v_double_len ())
|
||||
f (v_double_load (A+i));
|
||||
}
|
||||
|
||||
static void
|
||||
runf_v_thruput (v_float f (v_float))
|
||||
{
|
||||
for (int i = 0; i < N; i += v_float_len ())
|
||||
f (v_float_load (Af+i));
|
||||
}
|
||||
|
||||
static void
|
||||
run_v_latency (v_double f (v_double))
|
||||
{
|
||||
v_double z = v_double_dup (zero);
|
||||
v_double prev = z;
|
||||
for (int i = 0; i < N; i += v_double_len ())
|
||||
prev = f (v_double_load (A+i) + prev * z);
|
||||
}
|
||||
|
||||
static void
|
||||
runf_v_latency (v_float f (v_float))
|
||||
{
|
||||
v_float z = v_float_dup (zero);
|
||||
v_float prev = z;
|
||||
for (int i = 0; i < N; i += v_float_len ())
|
||||
prev = f (v_float_load (Af+i) + prev * z);
|
||||
}
|
||||
|
||||
#ifdef __vpcs
|
||||
static void
|
||||
run_vn_thruput (__vpcs v_double f (v_double))
|
||||
@ -377,19 +319,21 @@ runf_vn_thruput (__vpcs v_float f (v_float))
|
||||
static void
|
||||
run_vn_latency (__vpcs v_double f (v_double))
|
||||
{
|
||||
v_double z = v_double_dup (zero);
|
||||
v_double prev = z;
|
||||
volatile uint64x2_t vsel = (uint64x2_t) { 0, 0 };
|
||||
uint64x2_t sel = vsel;
|
||||
v_double prev = v_double_dup (0);
|
||||
for (int i = 0; i < N; i += v_double_len ())
|
||||
prev = f (v_double_load (A+i) + prev * z);
|
||||
prev = f (vbslq_f64 (sel, prev, v_double_load (A+i)));
|
||||
}
|
||||
|
||||
static void
|
||||
runf_vn_latency (__vpcs v_float f (v_float))
|
||||
{
|
||||
v_float z = v_float_dup (zero);
|
||||
v_float prev = z;
|
||||
volatile uint32x4_t vsel = (uint32x4_t) { 0, 0, 0, 0 };
|
||||
uint32x4_t sel = vsel;
|
||||
v_float prev = v_float_dup (0);
|
||||
for (int i = 0; i < N; i += v_float_len ())
|
||||
prev = f (v_float_load (Af+i) + prev * z);
|
||||
prev = f (vbslq_f32 (sel, prev, v_float_load (Af+i)));
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -411,19 +355,21 @@ runf_sv_thruput (sv_float f (sv_float, sv_bool))
|
||||
static void
|
||||
run_sv_latency (sv_double f (sv_double, sv_bool))
|
||||
{
|
||||
sv_double z = sv_double_dup (zero);
|
||||
sv_double prev = z;
|
||||
volatile sv_bool vsel = svptrue_b64 ();
|
||||
sv_bool sel = vsel;
|
||||
sv_double prev = sv_double_dup (0);
|
||||
for (int i = 0; i < N; i += sv_double_len ())
|
||||
prev = f (svmad_f64_x (svptrue_b64 (), prev, z, sv_double_load (A+i)), svptrue_b64 ());
|
||||
prev = f (svsel_f64 (sel, sv_double_load (A+i), prev), svptrue_b64 ());
|
||||
}
|
||||
|
||||
static void
|
||||
runf_sv_latency (sv_float f (sv_float, sv_bool))
|
||||
{
|
||||
sv_float z = sv_float_dup (zero);
|
||||
sv_float prev = z;
|
||||
volatile sv_bool vsel = svptrue_b32 ();
|
||||
sv_bool sel = vsel;
|
||||
sv_float prev = sv_float_dup (0);
|
||||
for (int i = 0; i < N; i += sv_float_len ())
|
||||
prev = f (svmad_f32_x (svptrue_b32 (), prev, z, sv_float_load (Af+i)), svptrue_b32 ());
|
||||
prev = f (svsel_f32 (sel, sv_float_load (Af+i), prev), svptrue_b32 ());
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -458,10 +404,10 @@ bench1 (const struct fun *f, int type, double lo, double hi)
|
||||
const char *s = type == 't' ? "rthruput" : "latency";
|
||||
int vlen = 1;
|
||||
|
||||
if (f->vec && f->prec == 'd')
|
||||
vlen = v_double_len();
|
||||
else if (f->vec && f->prec == 'f')
|
||||
vlen = v_float_len();
|
||||
if (f->vec == 'n')
|
||||
vlen = f->prec == 'd' ? v_double_len() : v_float_len();
|
||||
else if (f->vec == 's')
|
||||
vlen = f->prec == 'd' ? sv_double_len() : sv_float_len();
|
||||
|
||||
if (f->prec == 'd' && type == 't' && f->vec == 0)
|
||||
TIMEIT (run_thruput, f->fun.d);
|
||||
@ -471,14 +417,6 @@ bench1 (const struct fun *f, int type, double lo, double hi)
|
||||
TIMEIT (runf_thruput, f->fun.f);
|
||||
else if (f->prec == 'f' && type == 'l' && f->vec == 0)
|
||||
TIMEIT (runf_latency, f->fun.f);
|
||||
else if (f->prec == 'd' && type == 't' && f->vec == 'v')
|
||||
TIMEIT (run_v_thruput, f->fun.vd);
|
||||
else if (f->prec == 'd' && type == 'l' && f->vec == 'v')
|
||||
TIMEIT (run_v_latency, f->fun.vd);
|
||||
else if (f->prec == 'f' && type == 't' && f->vec == 'v')
|
||||
TIMEIT (runf_v_thruput, f->fun.vf);
|
||||
else if (f->prec == 'f' && type == 'l' && f->vec == 'v')
|
||||
TIMEIT (runf_v_latency, f->fun.vf);
|
||||
#ifdef __vpcs
|
||||
else if (f->prec == 'd' && type == 't' && f->vec == 'n')
|
||||
TIMEIT (run_vn_thruput, f->fun.vnd);
|
||||
@ -503,16 +441,18 @@ bench1 (const struct fun *f, int type, double lo, double hi)
|
||||
if (type == 't')
|
||||
{
|
||||
ns100 = (100 * dt + itercount * N / 2) / (itercount * N);
|
||||
printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g]\n", f->name, s,
|
||||
printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g] vlen %d\n",
|
||||
f->name, s,
|
||||
(unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
|
||||
(unsigned long long) dt, lo, hi);
|
||||
(unsigned long long) dt, lo, hi, vlen);
|
||||
}
|
||||
else if (type == 'l')
|
||||
{
|
||||
ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen);
|
||||
printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g]\n", f->name, s,
|
||||
printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g] vlen %d\n",
|
||||
f->name, s,
|
||||
(unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
|
||||
(unsigned long long) dt, lo, hi);
|
||||
(unsigned long long) dt, lo, hi, vlen);
|
||||
}
|
||||
fflush (stdout);
|
||||
}
|
||||
|
@ -1,11 +1,13 @@
|
||||
/*
|
||||
* Function entries for mathbench.
|
||||
*
|
||||
* Copyright (c) 2022, Arm Limited.
|
||||
* Copyright (c) 2022-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
/* clang-format off */
|
||||
D (exp, -9.9, 9.9)
|
||||
D (exp, 0.5, 1.0)
|
||||
D (exp10, -9.9, 9.9)
|
||||
D (exp2, -9.9, 9.9)
|
||||
D (log, 0.01, 11.1)
|
||||
D (log, 0.999, 1.001)
|
||||
@ -42,59 +44,19 @@ F (cosf, 3.3, 33.3)
|
||||
F (cosf, 100, 1000)
|
||||
F (cosf, 1e6, 1e32)
|
||||
F (erff, -4.0, 4.0)
|
||||
#if WANT_VMATH
|
||||
D (__s_sin, -3.1, 3.1)
|
||||
D (__s_cos, -3.1, 3.1)
|
||||
D (__s_exp, -9.9, 9.9)
|
||||
D (__s_log, 0.01, 11.1)
|
||||
{"__s_pow", 'd', 0, 0.01, 11.1, {.d = xy__s_pow}},
|
||||
F (__s_expf, -9.9, 9.9)
|
||||
F (__s_expf_1u, -9.9, 9.9)
|
||||
F (__s_exp2f, -9.9, 9.9)
|
||||
F (__s_exp2f_1u, -9.9, 9.9)
|
||||
F (__s_logf, 0.01, 11.1)
|
||||
{"__s_powf", 'f', 0, 0.01, 11.1, {.f = xy__s_powf}},
|
||||
F (__s_sinf, -3.1, 3.1)
|
||||
F (__s_cosf, -3.1, 3.1)
|
||||
#if __aarch64__
|
||||
VD (__v_sin, -3.1, 3.1)
|
||||
VD (__v_cos, -3.1, 3.1)
|
||||
VD (__v_exp, -9.9, 9.9)
|
||||
VD (__v_log, 0.01, 11.1)
|
||||
{"__v_pow", 'd', 'v', 0.01, 11.1, {.vd = xy__v_pow}},
|
||||
VF (__v_expf, -9.9, 9.9)
|
||||
VF (__v_expf_1u, -9.9, 9.9)
|
||||
VF (__v_exp2f, -9.9, 9.9)
|
||||
VF (__v_exp2f_1u, -9.9, 9.9)
|
||||
VF (__v_logf, 0.01, 11.1)
|
||||
{"__v_powf", 'f', 'v', 0.01, 11.1, {.vf = xy__v_powf}},
|
||||
VF (__v_sinf, -3.1, 3.1)
|
||||
VF (__v_cosf, -3.1, 3.1)
|
||||
#ifdef __vpcs
|
||||
VND (__vn_exp, -9.9, 9.9)
|
||||
VND (_ZGVnN2v_exp, -9.9, 9.9)
|
||||
VND (__vn_log, 0.01, 11.1)
|
||||
VND (_ZGVnN2v_log, 0.01, 11.1)
|
||||
{"__vn_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy__vn_pow}},
|
||||
{"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}},
|
||||
VND (__vn_sin, -3.1, 3.1)
|
||||
VND (_ZGVnN2v_sin, -3.1, 3.1)
|
||||
VND (__vn_cos, -3.1, 3.1)
|
||||
VND (_ZGVnN2v_cos, -3.1, 3.1)
|
||||
VNF (__vn_expf, -9.9, 9.9)
|
||||
VNF (_ZGVnN4v_expf, -9.9, 9.9)
|
||||
VNF (__vn_expf_1u, -9.9, 9.9)
|
||||
VNF (__vn_exp2f, -9.9, 9.9)
|
||||
VNF (_ZGVnN4v_expf_1u, -9.9, 9.9)
|
||||
VNF (_ZGVnN4v_exp2f, -9.9, 9.9)
|
||||
VNF (__vn_exp2f_1u, -9.9, 9.9)
|
||||
VNF (__vn_logf, 0.01, 11.1)
|
||||
VNF (_ZGVnN4v_exp2f_1u, -9.9, 9.9)
|
||||
VNF (_ZGVnN4v_logf, 0.01, 11.1)
|
||||
{"__vn_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy__vn_powf}},
|
||||
{"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}},
|
||||
VNF (__vn_sinf, -3.1, 3.1)
|
||||
VNF (_ZGVnN4v_sinf, -3.1, 3.1)
|
||||
VNF (__vn_cosf, -3.1, 3.1)
|
||||
VNF (_ZGVnN4v_cosf, -3.1, 3.1)
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
/* clang-format on */
|
||||
|
@ -1,18 +1,11 @@
|
||||
/*
|
||||
* Function wrappers for mathbench.
|
||||
*
|
||||
* Copyright (c) 2022, Arm Limited.
|
||||
* Copyright (c) 2022-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#if WANT_VMATH
|
||||
#if __aarch64__
|
||||
|
||||
#ifdef __vpcs
|
||||
__vpcs static v_float
|
||||
xy__vn_powf (v_float x)
|
||||
{
|
||||
return __vn_powf (x, x);
|
||||
}
|
||||
|
||||
__vpcs static v_float
|
||||
xy_Z_powf (v_float x)
|
||||
@ -20,44 +13,13 @@ xy_Z_powf (v_float x)
|
||||
return _ZGVnN4vv_powf (x, x);
|
||||
}
|
||||
|
||||
__vpcs static v_double
|
||||
xy__vn_pow (v_double x)
|
||||
{
|
||||
return __vn_pow (x, x);
|
||||
}
|
||||
|
||||
__vpcs static v_double
|
||||
xy_Z_pow (v_double x)
|
||||
{
|
||||
return _ZGVnN2vv_pow (x, x);
|
||||
}
|
||||
#endif // __vpcs
|
||||
|
||||
static v_float
|
||||
xy__v_powf (v_float x)
|
||||
{
|
||||
return __v_powf (x, x);
|
||||
}
|
||||
|
||||
static v_double
|
||||
xy__v_pow (v_double x)
|
||||
{
|
||||
return __v_pow (x, x);
|
||||
}
|
||||
#endif // __aarch64__
|
||||
|
||||
static float
|
||||
xy__s_powf (float x)
|
||||
{
|
||||
return __s_powf (x, x);
|
||||
}
|
||||
|
||||
static double
|
||||
xy__s_pow (double x)
|
||||
{
|
||||
return __s_pow (x, x);
|
||||
}
|
||||
#endif // WANT_VMATH
|
||||
#endif
|
||||
|
||||
static double
|
||||
xypow (double x)
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* mathtest.c - test rig for mathlib
|
||||
*
|
||||
* Copyright (c) 1998-2022, Arm Limited.
|
||||
* Copyright (c) 1998-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
@ -254,6 +254,7 @@ test_func tfuncs[] = {
|
||||
TFUNCARM(at_s,rt_s, expf, 3*ULPUNIT/4),
|
||||
TFUNCARM(at_s,rt_s, exp2f, 3*ULPUNIT/4),
|
||||
TFUNC(at_s,rt_s, expm1f, ULPUNIT),
|
||||
TFUNC(at_d,rt_d, exp10, ULPUNIT),
|
||||
|
||||
/* power */
|
||||
TFUNC(at_d2,rt_d, pow, 3*ULPUNIT/4),
|
||||
@ -1021,6 +1022,7 @@ int runtest(testdetail t) {
|
||||
DO_DOP(d_arg1,op1r);
|
||||
DO_DOP(d_arg2,op2r);
|
||||
s_arg1.i = t.op1r[0]; s_arg2.i = t.op2r[0];
|
||||
s_res.i = 0;
|
||||
|
||||
/*
|
||||
* Detect NaNs, infinities and denormals on input, and set a
|
||||
@ -1155,22 +1157,25 @@ int runtest(testdetail t) {
|
||||
tresultr[0] = t.resultr[0];
|
||||
tresultr[1] = t.resultr[1];
|
||||
resultr[0] = d_res.i[dmsd]; resultr[1] = d_res.i[dlsd];
|
||||
resulti[0] = resulti[1] = 0;
|
||||
wres = 2;
|
||||
break;
|
||||
case rt_i:
|
||||
tresultr[0] = t.resultr[0];
|
||||
resultr[0] = intres;
|
||||
resulti[0] = 0;
|
||||
wres = 1;
|
||||
break;
|
||||
case rt_s:
|
||||
case rt_s2:
|
||||
tresultr[0] = t.resultr[0];
|
||||
resultr[0] = s_res.i;
|
||||
resulti[0] = 0;
|
||||
wres = 1;
|
||||
break;
|
||||
default:
|
||||
puts("unhandled rettype in runtest");
|
||||
wres = 0;
|
||||
abort ();
|
||||
}
|
||||
if(t.resultc != rc_none) {
|
||||
int err = 0;
|
||||
|
@ -2,7 +2,7 @@
|
||||
|
||||
# ULP error check script.
|
||||
#
|
||||
# Copyright (c) 2019-2022, Arm Limited.
|
||||
# Copyright (c) 2019-2023, Arm Limited.
|
||||
# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
|
||||
#set -x
|
||||
@ -72,6 +72,16 @@ t pow 0x1.ffffffffffff0p-1 0x1.0000000000008p0 x 0x1p60 0x1p68 50000
|
||||
t pow 0x1.ffffffffff000p-1 0x1p0 x 0x1p50 0x1p52 50000
|
||||
t pow -0x1.ffffffffff000p-1 -0x1p0 x 0x1p50 0x1p52 50000
|
||||
|
||||
L=0.02
|
||||
t exp10 0 0x1p-47 5000
|
||||
t exp10 -0 -0x1p-47 5000
|
||||
t exp10 0x1p-47 1 50000
|
||||
t exp10 -0x1p-47 -1 50000
|
||||
t exp10 1 0x1.34413509f79ffp8 50000
|
||||
t exp10 -1 -0x1.434e6420f4374p8 50000
|
||||
t exp10 0x1.34413509f79ffp8 inf 5000
|
||||
t exp10 -0x1.434e6420f4374p8 -inf 5000
|
||||
|
||||
L=1.0
|
||||
Ldir=0.9
|
||||
t erf 0 0xffff000000000000 10000
|
||||
@ -143,15 +153,10 @@ Ldir=0.5
|
||||
done
|
||||
|
||||
# vector functions
|
||||
|
||||
Ldir=0.5
|
||||
r='n'
|
||||
flags="${ULPFLAGS:--q}"
|
||||
runs=
|
||||
check __s_exp 1 && runs=1
|
||||
runv=
|
||||
check __v_exp 1 && runv=1
|
||||
runvn=
|
||||
check __vn_exp 1 && runvn=1
|
||||
|
||||
range_exp='
|
||||
0 0xffff000000000000 10000
|
||||
@ -177,9 +182,10 @@ range_pow='
|
||||
'
|
||||
|
||||
range_sin='
|
||||
0 0xffff000000000000 10000
|
||||
0x1p-4 0x1p4 400000
|
||||
-0x1p-23 0x1p23 400000
|
||||
0 0x1p23 500000
|
||||
-0 -0x1p23 500000
|
||||
0x1p23 inf 10000
|
||||
-0x1p23 -inf 10000
|
||||
'
|
||||
range_cos="$range_sin"
|
||||
|
||||
@ -199,9 +205,10 @@ range_logf='
|
||||
'
|
||||
|
||||
range_sinf='
|
||||
0 0xffff0000 10000
|
||||
0x1p-4 0x1p4 300000
|
||||
-0x1p-9 -0x1p9 300000
|
||||
0 0x1p20 500000
|
||||
-0 -0x1p20 500000
|
||||
0x1p20 inf 10000
|
||||
-0x1p20 -inf 10000
|
||||
'
|
||||
range_cosf="$range_sinf"
|
||||
|
||||
@ -229,9 +236,8 @@ L_sinf=1.4
|
||||
L_cosf=1.4
|
||||
L_powf=2.1
|
||||
|
||||
while read G F R D
|
||||
while read G F D
|
||||
do
|
||||
[ "$R" = 1 ] || continue
|
||||
case "$G" in \#*) continue ;; esac
|
||||
eval range="\${range_$G}"
|
||||
eval L="\${L_$G}"
|
||||
@ -251,71 +257,23 @@ do
|
||||
t $D $disable_fenv $F $X
|
||||
done << EOF
|
||||
$range
|
||||
|
||||
EOF
|
||||
done << EOF
|
||||
# group symbol run
|
||||
exp __s_exp $runs
|
||||
exp __v_exp $runv
|
||||
exp __vn_exp $runvn
|
||||
exp _ZGVnN2v_exp $runvn
|
||||
|
||||
log __s_log $runs
|
||||
log __v_log $runv
|
||||
log __vn_log $runvn
|
||||
log _ZGVnN2v_log $runvn
|
||||
|
||||
pow __s_pow $runs -f
|
||||
pow __v_pow $runv -f
|
||||
pow __vn_pow $runvn -f
|
||||
pow _ZGVnN2vv_pow $runvn -f
|
||||
|
||||
sin __s_sin $runs
|
||||
sin __v_sin $runv
|
||||
sin __vn_sin $runvn
|
||||
sin _ZGVnN2v_sin $runvn
|
||||
|
||||
cos __s_cos $runs
|
||||
cos __v_cos $runv
|
||||
cos __vn_cos $runvn
|
||||
cos _ZGVnN2v_cos $runvn
|
||||
|
||||
expf __s_expf $runs
|
||||
expf __v_expf $runv
|
||||
expf __vn_expf $runvn
|
||||
expf _ZGVnN4v_expf $runvn
|
||||
|
||||
expf_1u __s_expf_1u $runs -f
|
||||
expf_1u __v_expf_1u $runv -f
|
||||
expf_1u __vn_expf_1u $runvn -f
|
||||
|
||||
exp2f __s_exp2f $runs
|
||||
exp2f __v_exp2f $runv
|
||||
exp2f __vn_exp2f $runvn
|
||||
exp2f _ZGVnN4v_exp2f $runvn
|
||||
|
||||
exp2f_1u __s_exp2f_1u $runs -f
|
||||
exp2f_1u __v_exp2f_1u $runv -f
|
||||
exp2f_1u __vn_exp2f_1u $runvn -f
|
||||
|
||||
logf __s_logf $runs
|
||||
logf __v_logf $runv
|
||||
logf __vn_logf $runvn
|
||||
logf _ZGVnN4v_logf $runvn
|
||||
|
||||
sinf __s_sinf $runs
|
||||
sinf __v_sinf $runv
|
||||
sinf __vn_sinf $runvn
|
||||
sinf _ZGVnN4v_sinf $runvn
|
||||
|
||||
cosf __s_cosf $runs
|
||||
cosf __v_cosf $runv
|
||||
cosf __vn_cosf $runvn
|
||||
cosf _ZGVnN4v_cosf $runvn
|
||||
|
||||
powf __s_powf $runs -f
|
||||
powf __v_powf $runv -f
|
||||
powf __vn_powf $runvn -f
|
||||
powf _ZGVnN4vv_powf $runvn -f
|
||||
exp _ZGVnN2v_exp
|
||||
log _ZGVnN2v_log
|
||||
pow _ZGVnN2vv_pow -f
|
||||
sin _ZGVnN2v_sin -z
|
||||
cos _ZGVnN2v_cos
|
||||
expf _ZGVnN4v_expf
|
||||
expf_1u _ZGVnN4v_expf_1u -f
|
||||
exp2f _ZGVnN4v_exp2f
|
||||
exp2f_1u _ZGVnN4v_exp2f_1u -f
|
||||
logf _ZGVnN4v_logf
|
||||
sinf _ZGVnN4v_sinf -z
|
||||
cosf _ZGVnN4v_cosf
|
||||
powf _ZGVnN4vv_powf -f
|
||||
EOF
|
||||
|
||||
[ 0 -eq $FAIL ] || {
|
||||
|
@ -0,0 +1,15 @@
|
||||
; Directed test cases for exp10
|
||||
;
|
||||
; Copyright (c) 2023, Arm Limited.
|
||||
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
|
||||
func=exp10 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
|
||||
func=exp10 op1=fff80000.00000001 result=7ff80000.00000001 errno=0
|
||||
func=exp10 op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
|
||||
func=exp10 op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
|
||||
func=exp10 op1=7ff00000.00000000 result=7ff00000.00000000 errno=0
|
||||
func=exp10 op1=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=ox
|
||||
func=exp10 op1=fff00000.00000000 result=00000000.00000000 errno=0
|
||||
func=exp10 op1=ffefffff.ffffffff result=00000000.00000000 errno=ERANGE status=ux
|
||||
func=exp10 op1=00000000.00000000 result=3ff00000.00000000 errno=0
|
||||
func=exp10 op1=80000000.00000000 result=3ff00000.00000000 errno=0
|
@ -1,10 +1,11 @@
|
||||
/*
|
||||
* ULP error checking tool for math functions.
|
||||
*
|
||||
* Copyright (c) 2019-2022, Arm Limited.
|
||||
* Copyright (c) 2019-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#define _GNU_SOURCE
|
||||
#include <ctype.h>
|
||||
#include <fenv.h>
|
||||
#include <float.h>
|
||||
@ -23,11 +24,6 @@
|
||||
# include <mpfr.h>
|
||||
#endif
|
||||
|
||||
#ifndef WANT_VMATH
|
||||
/* Enable the build of vector math code. */
|
||||
# define WANT_VMATH 1
|
||||
#endif
|
||||
|
||||
static inline uint64_t
|
||||
asuint64 (double f)
|
||||
{
|
||||
@ -212,6 +208,7 @@ struct conf
|
||||
unsigned long long n;
|
||||
double softlim;
|
||||
double errlim;
|
||||
int ignore_zero_sign;
|
||||
};
|
||||
|
||||
/* A bit of a hack: call vector functions twice with the same
|
||||
@ -220,7 +217,7 @@ struct conf
|
||||
static int secondcall;
|
||||
|
||||
/* Wrappers for vector functions. */
|
||||
#if __aarch64__ && WANT_VMATH
|
||||
#ifdef __vpcs
|
||||
typedef __f32x4_t v_float;
|
||||
typedef __f64x2_t v_double;
|
||||
/* First element of fv and dv may be changed by -c argument. */
|
||||
@ -264,40 +261,8 @@ static inline double svretd(sv_double vec) {
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if WANT_SVE_MATH
|
||||
long double
|
||||
dummyl (long double x)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
|
||||
double
|
||||
dummy (double x)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
|
||||
static sv_double
|
||||
__sv_dummy (sv_double x)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
|
||||
static sv_float
|
||||
__sv_dummyf (sv_float x)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
#endif
|
||||
|
||||
#include "test/ulp_wrappers.h"
|
||||
|
||||
/* Wrappers for SVE functions. */
|
||||
#if WANT_SVE_MATH
|
||||
static double sv_dummy (double x) { return svretd (__sv_dummy (svargd (x))); }
|
||||
static float sv_dummyf (float x) { return svretf (__sv_dummyf (svargf (x))); }
|
||||
#endif
|
||||
|
||||
struct fun
|
||||
{
|
||||
const char *name;
|
||||
@ -358,10 +323,6 @@ static const struct fun fun[] = {
|
||||
#define ZVNF2(x) VNF2 (x) ZVF2 (x)
|
||||
#define ZVND1(x) VND1 (x) ZVD1 (x)
|
||||
#define ZVND2(x) VND2 (x) ZVD2 (x)
|
||||
#define SF1(x) F (__s_##x##f, __s_##x##f, x, mpfr_##x, 1, 1, f1, 0)
|
||||
#define SF2(x) F (__s_##x##f, __s_##x##f, x, mpfr_##x, 2, 1, f2, 0)
|
||||
#define SD1(x) F (__s_##x, __s_##x, x##l, mpfr_##x, 1, 0, d1, 0)
|
||||
#define SD2(x) F (__s_##x, __s_##x, x##l, mpfr_##x, 2, 0, d2, 0)
|
||||
/* SVE routines. */
|
||||
#define SVF1(x) F (__sv_##x##f, sv_##x##f, x, mpfr_##x, 1, 1, f1, 0)
|
||||
#define SVF2(x) F (__sv_##x##f, sv_##x##f, x, mpfr_##x, 2, 1, f2, 0)
|
||||
@ -374,11 +335,6 @@ static const struct fun fun[] = {
|
||||
|
||||
#include "test/ulp_funcs.h"
|
||||
|
||||
#if WANT_SVE_MATH
|
||||
SVD1 (dummy)
|
||||
SVF1 (dummy)
|
||||
#endif
|
||||
|
||||
#undef F
|
||||
#undef F1
|
||||
#undef F2
|
||||
@ -628,17 +584,18 @@ call_mpfr_d2 (mpfr_t y, const struct fun *f, struct args_d2 a, mpfr_rnd_t r)
|
||||
static void
|
||||
usage (void)
|
||||
{
|
||||
puts ("./ulp [-q] [-m] [-f] [-r nudz] [-l soft-ulplimit] [-e ulplimit] func "
|
||||
puts ("./ulp [-q] [-m] [-f] [-r {n|u|d|z}] [-l soft-ulplimit] [-e ulplimit] func "
|
||||
"lo [hi [x lo2 hi2] [count]]");
|
||||
puts ("Compares func against a higher precision implementation in [lo; hi].");
|
||||
puts ("-q: quiet.");
|
||||
puts ("-m: use mpfr even if faster method is available.");
|
||||
puts ("-f: disable fenv testing (rounding modes and exceptions).");
|
||||
#if __aarch64__ && WANT_VMATH
|
||||
puts ("-f: disable fenv exceptions testing.");
|
||||
#ifdef ___vpcs
|
||||
puts ("-c: neutral 'control value' to test behaviour when one lane can affect another. \n"
|
||||
" This should be different from tested input in other lanes, and non-special \n"
|
||||
" (i.e. should not trigger fenv exceptions). Default is 1.");
|
||||
#endif
|
||||
puts ("-z: ignore sign of 0.");
|
||||
puts ("Supported func:");
|
||||
for (const struct fun *f = fun; f->name; f++)
|
||||
printf ("\t%s\n", f->name);
|
||||
@ -762,6 +719,7 @@ main (int argc, char *argv[])
|
||||
conf.fenv = 1;
|
||||
conf.softlim = 0;
|
||||
conf.errlim = INFINITY;
|
||||
conf.ignore_zero_sign = 0;
|
||||
for (;;)
|
||||
{
|
||||
argc--;
|
||||
@ -801,12 +759,15 @@ main (int argc, char *argv[])
|
||||
{
|
||||
argc--;
|
||||
argv++;
|
||||
if (argc < 1)
|
||||
if (argc < 1 || argv[0][1] != '\0')
|
||||
usage ();
|
||||
conf.rc = argv[0][0];
|
||||
}
|
||||
break;
|
||||
#if __aarch64__ && WANT_VMATH
|
||||
case 'z':
|
||||
conf.ignore_zero_sign = 1;
|
||||
break;
|
||||
#ifdef __vpcs
|
||||
case 'c':
|
||||
argc--;
|
||||
argv++;
|
||||
@ -839,7 +800,19 @@ main (int argc, char *argv[])
|
||||
if (strcmp (argv[0], f->name) == 0)
|
||||
break;
|
||||
if (!f->name)
|
||||
usage ();
|
||||
{
|
||||
#ifndef __vpcs
|
||||
/* Ignore vector math functions if vector math is not supported. */
|
||||
if (strncmp (argv[0], "_ZGVnN", 6) == 0)
|
||||
exit (0);
|
||||
#endif
|
||||
#if !WANT_SVE_MATH
|
||||
if (strncmp (argv[0], "_ZGVsMxv", 8) == 0)
|
||||
exit (0);
|
||||
#endif
|
||||
printf ("math function %s not supported\n", argv[0]);
|
||||
exit (1);
|
||||
}
|
||||
if (!f->singleprec && LDBL_MANT_DIG == DBL_MANT_DIG)
|
||||
conf.mpfr = 1; /* Use mpfr if long double has no extra precision. */
|
||||
if (!USE_MPFR && conf.mpfr)
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Generic functions for ULP error estimation.
|
||||
*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* Copyright (c) 2019-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
@ -37,7 +37,8 @@ static int RT(ulpscale_mpfr) (mpfr_t x, int t)
|
||||
/* Difference between exact result and closest real number that
|
||||
gets rounded to got, i.e. error before rounding, for a correctly
|
||||
rounded result the difference is 0. */
|
||||
static double RT(ulperr) (RT(float) got, const struct RT(ret) * p, int r)
|
||||
static double RT (ulperr) (RT (float) got, const struct RT (ret) * p, int r,
|
||||
int ignore_zero_sign)
|
||||
{
|
||||
RT(float) want = p->y;
|
||||
RT(float) d;
|
||||
@ -45,10 +46,18 @@ static double RT(ulperr) (RT(float) got, const struct RT(ret) * p, int r)
|
||||
|
||||
if (RT(asuint) (got) == RT(asuint) (want))
|
||||
return 0.0;
|
||||
if (isnan (got) && isnan (want))
|
||||
/* Ignore sign of NaN. */
|
||||
return RT (issignaling) (got) == RT (issignaling) (want) ? 0 : INFINITY;
|
||||
if (signbit (got) != signbit (want))
|
||||
/* May have false positives with NaN. */
|
||||
//return isnan(got) && isnan(want) ? 0 : INFINITY;
|
||||
return INFINITY;
|
||||
{
|
||||
/* Fall through to ULP calculation if ignoring sign of zero and at
|
||||
exactly one of want and got is non-zero. */
|
||||
if (ignore_zero_sign && want == got)
|
||||
return 0.0;
|
||||
if (!ignore_zero_sign || (want != 0 && got != 0))
|
||||
return INFINITY;
|
||||
}
|
||||
if (!isfinite (want) || !isfinite (got))
|
||||
{
|
||||
if (isnan (got) != isnan (want))
|
||||
@ -114,8 +123,12 @@ static inline void T(call_fenv) (const struct fun *f, struct T(args) a, int r,
|
||||
static inline void T(call_nofenv) (const struct fun *f, struct T(args) a,
|
||||
int r, RT(float) * y, int *ex)
|
||||
{
|
||||
if (r != FE_TONEAREST)
|
||||
fesetround (r);
|
||||
*y = T(call) (f, a);
|
||||
*ex = 0;
|
||||
if (r != FE_TONEAREST)
|
||||
fesetround (FE_TONEAREST);
|
||||
}
|
||||
|
||||
static inline int T(call_long_fenv) (const struct fun *f, struct T(args) a,
|
||||
@ -155,8 +168,12 @@ static inline int T(call_long_nofenv) (const struct fun *f, struct T(args) a,
|
||||
int r, struct RT(ret) * p,
|
||||
RT(float) ygot, int exgot)
|
||||
{
|
||||
if (r != FE_TONEAREST)
|
||||
fesetround (r);
|
||||
RT(double) yl = T(call_long) (f, a);
|
||||
p->y = (RT(float)) yl;
|
||||
if (r != FE_TONEAREST)
|
||||
fesetround (FE_TONEAREST);
|
||||
if (RT(isok_nofenv) (ygot, p->y))
|
||||
return 1;
|
||||
p->ulpexp = RT(ulpscale) (p->y);
|
||||
@ -288,7 +305,7 @@ static int T(cmp) (const struct fun *f, struct gen *gen,
|
||||
if (!ok)
|
||||
{
|
||||
int print = 0;
|
||||
double err = RT(ulperr) (ygot, &want, r);
|
||||
double err = RT (ulperr) (ygot, &want, r, conf->ignore_zero_sign);
|
||||
double abserr = fabs (err);
|
||||
// TODO: count errors below accuracy limit.
|
||||
if (abserr > 0)
|
||||
|
@ -1,9 +1,10 @@
|
||||
/*
|
||||
* Function entries for ulp.
|
||||
*
|
||||
* Copyright (c) 2022, Arm Limited.
|
||||
* Copyright (c) 2022-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
/* clang-format off */
|
||||
F1 (sin)
|
||||
F1 (cos)
|
||||
F (sincosf_sinf, sincosf_sinf, sincos_sin, sincos_mpfr_sin, 1, 1, f1, 0)
|
||||
@ -15,56 +16,18 @@
|
||||
F2 (pow)
|
||||
F1 (erf)
|
||||
D1 (exp)
|
||||
D1 (exp10)
|
||||
D1 (exp2)
|
||||
D1 (log)
|
||||
D1 (log2)
|
||||
D2 (pow)
|
||||
D1 (erf)
|
||||
#if WANT_VMATH
|
||||
F (__s_sinf, __s_sinf, sin, mpfr_sin, 1, 1, f1, 0)
|
||||
F (__s_cosf, __s_cosf, cos, mpfr_cos, 1, 1, f1, 0)
|
||||
F (__s_expf_1u, __s_expf_1u, exp, mpfr_exp, 1, 1, f1, 0)
|
||||
F (__s_expf, __s_expf, exp, mpfr_exp, 1, 1, f1, 0)
|
||||
F (__s_exp2f_1u, __s_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 0)
|
||||
F (__s_exp2f, __s_exp2f, exp2, mpfr_exp2, 1, 1, f1, 0)
|
||||
F (__s_powf, __s_powf, pow, mpfr_pow, 2, 1, f2, 0)
|
||||
F (__s_logf, __s_logf, log, mpfr_log, 1, 1, f1, 0)
|
||||
F (__s_sin, __s_sin, sinl, mpfr_sin, 1, 0, d1, 0)
|
||||
F (__s_cos, __s_cos, cosl, mpfr_cos, 1, 0, d1, 0)
|
||||
F (__s_exp, __s_exp, expl, mpfr_exp, 1, 0, d1, 0)
|
||||
F (__s_log, __s_log, logl, mpfr_log, 1, 0, d1, 0)
|
||||
F (__s_pow, __s_pow, powl, mpfr_pow, 2, 0, d2, 0)
|
||||
#if __aarch64__
|
||||
F (__v_sinf, v_sinf, sin, mpfr_sin, 1, 1, f1, 1)
|
||||
F (__v_cosf, v_cosf, cos, mpfr_cos, 1, 1, f1, 1)
|
||||
F (__v_expf_1u, v_expf_1u, exp, mpfr_exp, 1, 1, f1, 1)
|
||||
F (__v_expf, v_expf, exp, mpfr_exp, 1, 1, f1, 1)
|
||||
F (__v_exp2f_1u, v_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1)
|
||||
F (__v_exp2f, v_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1)
|
||||
F (__v_logf, v_logf, log, mpfr_log, 1, 1, f1, 1)
|
||||
F (__v_powf, v_powf, pow, mpfr_pow, 2, 1, f2, 1)
|
||||
F (__v_sin, v_sin, sinl, mpfr_sin, 1, 0, d1, 1)
|
||||
F (__v_cos, v_cos, cosl, mpfr_cos, 1, 0, d1, 1)
|
||||
F (__v_exp, v_exp, expl, mpfr_exp, 1, 0, d1, 1)
|
||||
F (__v_log, v_log, logl, mpfr_log, 1, 0, d1, 1)
|
||||
F (__v_pow, v_pow, powl, mpfr_pow, 2, 0, d2, 1)
|
||||
#ifdef __vpcs
|
||||
F (__vn_sinf, vn_sinf, sin, mpfr_sin, 1, 1, f1, 1)
|
||||
F (__vn_cosf, vn_cosf, cos, mpfr_cos, 1, 1, f1, 1)
|
||||
F (__vn_expf_1u, vn_expf_1u, exp, mpfr_exp, 1, 1, f1, 1)
|
||||
F (__vn_expf, vn_expf, exp, mpfr_exp, 1, 1, f1, 1)
|
||||
F (__vn_exp2f_1u, vn_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1)
|
||||
F (__vn_exp2f, vn_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1)
|
||||
F (__vn_logf, vn_logf, log, mpfr_log, 1, 1, f1, 1)
|
||||
F (__vn_powf, vn_powf, pow, mpfr_pow, 2, 1, f2, 1)
|
||||
F (__vn_sin, vn_sin, sinl, mpfr_sin, 1, 0, d1, 1)
|
||||
F (__vn_cos, vn_cos, cosl, mpfr_cos, 1, 0, d1, 1)
|
||||
F (__vn_exp, vn_exp, expl, mpfr_exp, 1, 0, d1, 1)
|
||||
F (__vn_log, vn_log, logl, mpfr_log, 1, 0, d1, 1)
|
||||
F (__vn_pow, vn_pow, powl, mpfr_pow, 2, 0, d2, 1)
|
||||
F (_ZGVnN4v_sinf, Z_sinf, sin, mpfr_sin, 1, 1, f1, 1)
|
||||
F (_ZGVnN4v_cosf, Z_cosf, cos, mpfr_cos, 1, 1, f1, 1)
|
||||
F (_ZGVnN4v_expf_1u, Z_expf_1u, exp, mpfr_exp, 1, 1, f1, 1)
|
||||
F (_ZGVnN4v_expf, Z_expf, exp, mpfr_exp, 1, 1, f1, 1)
|
||||
F (_ZGVnN4v_exp2f_1u, Z_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1)
|
||||
F (_ZGVnN4v_exp2f, Z_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1)
|
||||
F (_ZGVnN4v_logf, Z_logf, log, mpfr_log, 1, 1, f1, 1)
|
||||
F (_ZGVnN4vv_powf, Z_powf, pow, mpfr_pow, 2, 1, f2, 1)
|
||||
@ -74,5 +37,4 @@
|
||||
F (_ZGVnN2v_log, Z_log, logl, mpfr_log, 1, 0, d1, 1)
|
||||
F (_ZGVnN2vv_pow, Z_pow, powl, mpfr_pow, 2, 0, d2, 1)
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
/* clang-format on */
|
||||
|
@ -1,10 +1,12 @@
|
||||
/*
|
||||
* Function wrappers for ulp.
|
||||
*
|
||||
* Copyright (c) 2022, Arm Limited.
|
||||
* Copyright (c) 2022-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
/* clang-format off */
|
||||
|
||||
/* Wrappers for sincos. */
|
||||
static float sincosf_sinf(float x) {(void)cosf(x); return sinf(x);}
|
||||
static float sincosf_cosf(float x) {(void)sinf(x); return cosf(x);}
|
||||
@ -16,37 +18,12 @@ static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,
|
||||
#endif
|
||||
|
||||
/* Wrappers for vector functions. */
|
||||
#if __aarch64__ && WANT_VMATH
|
||||
static float v_sinf(float x) { return __v_sinf(argf(x))[0]; }
|
||||
static float v_cosf(float x) { return __v_cosf(argf(x))[0]; }
|
||||
static float v_expf_1u(float x) { return __v_expf_1u(argf(x))[0]; }
|
||||
static float v_expf(float x) { return __v_expf(argf(x))[0]; }
|
||||
static float v_exp2f_1u(float x) { return __v_exp2f_1u(argf(x))[0]; }
|
||||
static float v_exp2f(float x) { return __v_exp2f(argf(x))[0]; }
|
||||
static float v_logf(float x) { return __v_logf(argf(x))[0]; }
|
||||
static float v_powf(float x, float y) { return __v_powf(argf(x),argf(y))[0]; }
|
||||
static double v_sin(double x) { return __v_sin(argd(x))[0]; }
|
||||
static double v_cos(double x) { return __v_cos(argd(x))[0]; }
|
||||
static double v_exp(double x) { return __v_exp(argd(x))[0]; }
|
||||
static double v_log(double x) { return __v_log(argd(x))[0]; }
|
||||
static double v_pow(double x, double y) { return __v_pow(argd(x),argd(y))[0]; }
|
||||
#ifdef __vpcs
|
||||
static float vn_sinf(float x) { return __vn_sinf(argf(x))[0]; }
|
||||
static float vn_cosf(float x) { return __vn_cosf(argf(x))[0]; }
|
||||
static float vn_expf_1u(float x) { return __vn_expf_1u(argf(x))[0]; }
|
||||
static float vn_expf(float x) { return __vn_expf(argf(x))[0]; }
|
||||
static float vn_exp2f_1u(float x) { return __vn_exp2f_1u(argf(x))[0]; }
|
||||
static float vn_exp2f(float x) { return __vn_exp2f(argf(x))[0]; }
|
||||
static float vn_logf(float x) { return __vn_logf(argf(x))[0]; }
|
||||
static float vn_powf(float x, float y) { return __vn_powf(argf(x),argf(y))[0]; }
|
||||
static double vn_sin(double x) { return __vn_sin(argd(x))[0]; }
|
||||
static double vn_cos(double x) { return __vn_cos(argd(x))[0]; }
|
||||
static double vn_exp(double x) { return __vn_exp(argd(x))[0]; }
|
||||
static double vn_log(double x) { return __vn_log(argd(x))[0]; }
|
||||
static double vn_pow(double x, double y) { return __vn_pow(argd(x),argd(y))[0]; }
|
||||
static float Z_sinf(float x) { return _ZGVnN4v_sinf(argf(x))[0]; }
|
||||
static float Z_cosf(float x) { return _ZGVnN4v_cosf(argf(x))[0]; }
|
||||
static float Z_expf_1u(float x) { return _ZGVnN4v_expf_1u(argf(x))[0]; }
|
||||
static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; }
|
||||
static float Z_exp2f_1u(float x) { return _ZGVnN4v_exp2f_1u(argf(x))[0]; }
|
||||
static float Z_exp2f(float x) { return _ZGVnN4v_exp2f(argf(x))[0]; }
|
||||
static float Z_logf(float x) { return _ZGVnN4v_logf(argf(x))[0]; }
|
||||
static float Z_powf(float x, float y) { return _ZGVnN4vv_powf(argf(x),argf(y))[0]; }
|
||||
@ -56,4 +33,5 @@ static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; }
|
||||
static double Z_log(double x) { return _ZGVnN2v_log(argd(x))[0]; }
|
||||
static double Z_pow(double x, double y) { return _ZGVnN2vv_pow(argd(x),argd(y))[0]; }
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* clang-format on */
|
||||
|
356
contrib/arm-optimized-routines/math/tgamma128.c
Normal file
356
contrib/arm-optimized-routines/math/tgamma128.c
Normal file
@ -0,0 +1,356 @@
|
||||
/*
|
||||
* Implementation of the true gamma function (as opposed to lgamma)
|
||||
* for 128-bit long double.
|
||||
*
|
||||
* Copyright (c) 2006-2024, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
/*
|
||||
* This module implements the float128 gamma function under the name
|
||||
* tgamma128. It's expected to be suitable for integration into system
|
||||
* maths libraries under the standard name tgammal, if long double is
|
||||
* 128-bit. Such a library will probably want to check the error
|
||||
* handling and optimize the initial process of extracting the
|
||||
* exponent, which is done here by simple and portable (but
|
||||
* potentially slower) methods.
|
||||
*/
|
||||
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/* Only binary128 format is supported. */
|
||||
#if LDBL_MANT_DIG == 113
|
||||
|
||||
#include "tgamma128.h"
|
||||
|
||||
#define lenof(x) (sizeof(x)/sizeof(*(x)))
|
||||
|
||||
/*
|
||||
* Helper routine to evaluate a polynomial via Horner's rule
|
||||
*/
|
||||
static long double poly(const long double *coeffs, size_t n, long double x)
|
||||
{
|
||||
long double result = coeffs[--n];
|
||||
|
||||
while (n > 0)
|
||||
result = (result * x) + coeffs[--n];
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
|
||||
* Compute sin(pi*x) / pi, for use in the reflection formula that
|
||||
* relates gamma(-x) and gamma(x).
|
||||
*/
|
||||
static long double sin_pi_x_over_pi(long double x)
|
||||
{
|
||||
int quo;
|
||||
long double fracpart = remquol(x, 0.5L, &quo);
|
||||
|
||||
long double sign = 1.0L;
|
||||
if (quo & 2)
|
||||
sign = -sign;
|
||||
quo &= 1;
|
||||
|
||||
if (quo == 0 && fabsl(fracpart) < 0x1.p-58L) {
|
||||
/* For numbers this size, sin(pi*x) is so close to pi*x that
|
||||
* sin(pi*x)/pi is indistinguishable from x in float128 */
|
||||
return sign * fracpart;
|
||||
}
|
||||
|
||||
if (quo == 0) {
|
||||
return sign * sinl(pi*fracpart) / pi;
|
||||
} else {
|
||||
return sign * cosl(pi*fracpart) / pi;
|
||||
}
|
||||
}
|
||||
|
||||
/* Return tgamma(x) on the assumption that x >= 8. */
|
||||
static long double tgamma_large(long double x,
|
||||
bool negative, long double negadjust)
|
||||
{
|
||||
/*
|
||||
* In this range we compute gamma(x) as x^(x-1/2) * e^-x * K,
|
||||
* where K is a correction factor computed as a polynomial in 1/x.
|
||||
*
|
||||
* (Vaguely inspired by the form of the Lanczos approximation, but
|
||||
* I tried the Lanczos approximation itself and it suffers badly
|
||||
* from big cancellation leading to loss of significance.)
|
||||
*/
|
||||
long double t = 1/x;
|
||||
long double p = poly(coeffs_large, lenof(coeffs_large), t);
|
||||
|
||||
/*
|
||||
* To avoid overflow in cases where x^(x-0.5) does overflow
|
||||
* but gamma(x) does not, we split x^(x-0.5) in half and
|
||||
* multiply back up _after_ multiplying the shrinking factor
|
||||
* of exp(-(x-0.5)).
|
||||
*
|
||||
* Note that computing x-0.5 and (x-0.5)/2 is exact for the
|
||||
* relevant range of x, so the only sources of error are pow
|
||||
* and exp themselves, plus the multiplications.
|
||||
*/
|
||||
long double powhalf = powl(x, (x-0.5L)/2.0L);
|
||||
long double expret = expl(-(x-0.5L));
|
||||
|
||||
if (!negative) {
|
||||
return (expret * powhalf) * powhalf * p;
|
||||
} else {
|
||||
/*
|
||||
* Apply the reflection formula as commented below, but
|
||||
* carefully: negadjust has magnitude less than 1, so it can
|
||||
* turn a case where gamma(+x) would overflow into a case
|
||||
* where gamma(-x) doesn't underflow. Not only that, but the
|
||||
* FP format has greater range in the tiny domain due to
|
||||
* denormals. For both reasons, it's not good enough to
|
||||
* compute the positive result and then adjust it.
|
||||
*/
|
||||
long double ret = 1 / ((expret * powhalf) * (x * negadjust) * p);
|
||||
return ret / powhalf;
|
||||
}
|
||||
}
|
||||
|
||||
/* Return tgamma(x) on the assumption that 0 <= x < 1/32. */
|
||||
static long double tgamma_tiny(long double x,
|
||||
bool negative, long double negadjust)
|
||||
{
|
||||
/*
|
||||
* For x near zero, we use a polynomial approximation to
|
||||
* g = 1/(x*gamma(x)), and then return 1/(g*x).
|
||||
*/
|
||||
long double g = poly(coeffs_tiny, lenof(coeffs_tiny), x);
|
||||
if (!negative)
|
||||
return 1.0L / (g*x);
|
||||
else
|
||||
return g / negadjust;
|
||||
}
|
||||
|
||||
/* Return tgamma(x) on the assumption that 0 <= x < 2^-113. */
|
||||
static long double tgamma_ultratiny(long double x, bool negative,
|
||||
long double negadjust)
|
||||
{
|
||||
/* On this interval, gamma can't even be distinguished from 1/x,
|
||||
* so we skip the polynomial evaluation in tgamma_tiny, partly to
|
||||
* save time and partly to avoid the tiny intermediate values
|
||||
* setting the underflow exception flag. */
|
||||
if (!negative)
|
||||
return 1.0L / x;
|
||||
else
|
||||
return 1.0L / negadjust;
|
||||
}
|
||||
|
||||
/* Return tgamma(x) on the assumption that 1 <= x <= 2. */
|
||||
static long double tgamma_central(long double x)
|
||||
{
|
||||
/*
|
||||
* In this central interval, our strategy is to finding the
|
||||
* difference between x and the point where gamma has a minimum,
|
||||
* and approximate based on that.
|
||||
*/
|
||||
|
||||
/* The difference between the input x and the minimum x. The first
|
||||
* subtraction is expected to be exact, since x and min_hi have
|
||||
* the same exponent (unless x=2, in which case it will still be
|
||||
* exact). */
|
||||
long double t = (x - min_x_hi) - min_x_lo;
|
||||
|
||||
/*
|
||||
* Now use two different polynomials for the intervals [1,m] and
|
||||
* [m,2].
|
||||
*/
|
||||
long double p;
|
||||
if (t < 0)
|
||||
p = poly(coeffs_central_neg, lenof(coeffs_central_neg), -t);
|
||||
else
|
||||
p = poly(coeffs_central_pos, lenof(coeffs_central_pos), t);
|
||||
|
||||
return (min_y_lo + p * (t*t)) + min_y_hi;
|
||||
}
|
||||
|
||||
/*
 * Public entry point: the true gamma function for the 128-bit long
 * double type. Classifies the input, maps negative arguments onto the
 * positive domain via Euler's reflection formula, then dispatches on
 * the magnitude of x to one of the specialised subroutines above.
 */
long double tgamma128(long double x)
{
    /*
     * Start by extracting the number's sign and exponent, and ruling
     * out cases of non-normalized numbers.
     *
     * For an implementation integrated into a system libm, it would
     * almost certainly be quicker to do this by direct bitwise access
     * to the input float128 value, using whatever is the local idiom
     * for knowing its endianness.
     *
     * Integration into a system libc may also need to worry about
     * setting errno, if that's the locally preferred way to report
     * math.h errors.
     */
    int sign = signbit(x);
    int exponent;
    switch (fpclassify(x)) {
      case FP_NAN:
        return x+x; /* propagate QNaN, make SNaN throw an exception */
      case FP_ZERO:
        return 1/x; /* divide by zero on purpose to indicate a pole */
      case FP_INFINITE:
        if (sign) {
            return x-x; /* gamma(-inf) has indeterminate sign, so provoke an
                         * IEEE invalid operation exception to indicate that */
        }
        return x; /* but gamma(+inf) is just +inf with no error */
      case FP_SUBNORMAL:
        exponent = -16384;
        break;
      default:
        frexpl(x, &exponent);
        exponent--;
        break;
    }

    bool negative = false;
    long double negadjust = 0.0L;

    if (sign) {
        /*
         * Euler's reflection formula is
         *
         *    gamma(1-x) gamma(x) = pi/sin(pi*x)
         *
         *                             pi
         *    => gamma(x) = --------------------
         *                  gamma(1-x) sin(pi*x)
         *
         * But computing 1-x is going to lose a lot of accuracy when x
         * is very small, so instead we transform using the recurrence
         * gamma(t+1)=t gamma(t). Setting t=-x, this gives us
         * gamma(1-x) = -x gamma(-x), so we now have
         *
         *                              pi
         *    gamma(x) = ----------------------
         *               -x gamma(-x) sin(pi*x)
         *
         * which relates gamma(x) to gamma(-x), which is much nicer,
         * since x can be turned into -x without rounding.
         */
        negadjust = sin_pi_x_over_pi(x);
        negative = true;
        x = -x;

        /*
         * Now the ultimate answer we want is
         *
         *    1 / (gamma(x) * x * negadjust)
         *
         * where x is the positive value we've just turned it into.
         *
         * For some of the cases below, we'll compute gamma(x)
         * normally and then compute this adjusted value afterwards.
         * But for others, we can implement the reciprocal operation
         * in this formula by _avoiding_ an inversion that the
         * sub-case was going to do anyway.
         */

        if (negadjust == 0) {
            /*
             * Special case for negative integers. Applying the
             * reflection formula would cause division by zero, but
             * standards would prefer we treat this error case as an
             * invalid operation and return NaN instead. (Possibly
             * because otherwise you'd have to decide which sign of
             * infinity to return, and unlike the x=0 case, there's no
             * sign of zero available to disambiguate.)
             */
            return negadjust / negadjust;
        }
    }

    /*
     * Split the positive domain into various cases. For cases where
     * we do the negative-number adjustment the usual way, we'll leave
     * the answer in 'g' and drop out of the if statement.
     */
    long double g;

    if (exponent >= 11) {
        /*
         * gamma of any positive value this large overflows, and gamma
         * of any negative value underflows.
         */
        if (!negative) {
            long double huge = 0x1p+12288L;
            return huge * huge; /* provoke an overflow */
        } else {
            long double tiny = 0x1p-12288L;
            return tiny * tiny * negadjust; /* underflow, of the right sign */
        }
    } else if (exponent >= 3) {
        /* Negative-number adjustment happens inside here */
        return tgamma_large(x, negative, negadjust);
    } else if (exponent < -113) {
        /* Negative-number adjustment happens inside here */
        return tgamma_ultratiny(x, negative, negadjust);
    } else if (exponent < -5) {
        /* Negative-number adjustment happens inside here */
        return tgamma_tiny(x, negative, negadjust);
    } else if (exponent == 0) {
        g = tgamma_central(x);
    } else if (exponent < 0) {
        /*
         * For x in [1/32,1) we range-reduce upwards to the interval
         * [1,2), using the inverse of the normal recurrence formula:
         * gamma(x) = gamma(x+1)/x.
         */
        g = tgamma_central(1+x) / x;
    } else {
        /*
         * For x in [2,8) we range-reduce downwards to the interval
         * [1,2) by repeated application of the recurrence formula.
         *
         * Actually multiplying (x-1) by (x-2) by (x-3) and so on
         * would introduce multiple ULPs of rounding error. We can get
         * better accuracy by writing x = (k+1/2) + t, where k is an
         * integer and |t|<1/2, and expanding out the obvious factor
         * (x-1)(x-2)...(x-k+1) as a polynomial in t.
         */
        long double mult;
        int i = x; /* truncation: this branch has x in [2,8), so i is 2..7,
                    * which guarantees one of the cases below sets 'mult' */
        if (i == 2) { /* x in [2,3) */
            mult = (x-1);
        } else {
            long double t = x - (i + 0.5L);
            switch (i) {
                /* E.g. for x=3.5+t, we want
                 * (x-1)(x-2) = (2.5+t)(1.5+t) = 3.75 + 4t + t^2 */
              case 3:
                mult = 3.75L+t*(4.0L+t);
                break;
              case 4:
                mult = 13.125L+t*(17.75L+t*(7.5L+t));
                break;
              case 5:
                mult = 59.0625L+t*(93.0L+t*(51.50L+t*(12.0L+t)));
                break;
              case 6:
                mult = 324.84375L+t*(570.5625L+t*(376.250L+t*(
                                117.5L+t*(17.5L+t))));
                break;
              case 7:
                mult = 2111.484375L+t*(4033.5L+t*(3016.1875L+t*(
                                1140.0L+t*(231.25L+t*(24.0L+t)))));
                break;
            }
        }

        g = tgamma_central(x - (i-1)) * mult;
    }

    if (!negative) {
        /* Positive domain: return g unmodified */
        return g;
    } else {
        /* Negative domain: apply the reflection formula as commented above */
        return 1.0L / (g * x * negadjust);
    }
}
|
||||
|
||||
#endif
|
141
contrib/arm-optimized-routines/math/tgamma128.h
Normal file
141
contrib/arm-optimized-routines/math/tgamma128.h
Normal file
@ -0,0 +1,141 @@
|
||||
/*
|
||||
* Polynomial coefficients and other constants for tgamma128.c.
|
||||
*
|
||||
* Copyright (c) 2006-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
/* The largest positive value for which 128-bit tgamma does not overflow. */
|
||||
static const long double max_x = 0x1.b6e3180cd66a5c4206f128ba77f4p+10L;
|
||||
|
||||
/* Coefficients of the polynomial used in the tgamma_large() subroutine */
|
||||
static const long double coeffs_large[] = {
|
||||
0x1.8535745aa79569579b9eec0f3bbcp+0L,
|
||||
0x1.0378f83c6fb8f0e51269f2b4a973p-3L,
|
||||
0x1.59f6a05094f69686c3380f4e2783p-8L,
|
||||
-0x1.0b291dee952a82764a4859b081a6p-8L,
|
||||
-0x1.6dd301b2205bf936b5a3eaad0dbbp-12L,
|
||||
0x1.387a8b5f38dd77e7f139b1021e86p-10L,
|
||||
0x1.bca46637f65b13750c728cc29e40p-14L,
|
||||
-0x1.d80401c00aef998c9e303151a51cp-11L,
|
||||
-0x1.49cb6bb09f935a2053ccc2cf3711p-14L,
|
||||
0x1.4e950204437dcaf2be77f73a6f45p-10L,
|
||||
0x1.cb711a2d65f188bf60110934d6bep-14L,
|
||||
-0x1.7d7ff4bc95dc7faefc5e767f70f1p-9L,
|
||||
-0x1.0305ab9760cddb0d833e73766836p-12L,
|
||||
0x1.3ef6c84bf1cd5c3f65ac2693bb5bp-7L,
|
||||
0x1.bb4144740ad9290123fdcea684aap-11L,
|
||||
-0x1.72ab4e88272a229bfafd192450f0p-5L,
|
||||
0x1.80c70ac6eb3b7a698983d25a62b8p-12L,
|
||||
0x1.e222791c6743ce3e3cae220fb236p-3L,
|
||||
0x1.1a2dca1c82a9326c52b465f7cb7ap-2L,
|
||||
-0x1.9d204fa235a42cd901b123d2ad47p+1L,
|
||||
0x1.55b56d1158f77ddb1c95fc44ab02p+0L,
|
||||
0x1.37f900a11dbd892abd7dde533e2dp+5L,
|
||||
-0x1.2da49f4188dd89cb958369ef2401p+7L,
|
||||
0x1.fdae5ec3ec6eb7dffc09edbe6c14p+7L,
|
||||
-0x1.61433cebe649098c9611c4c7774ap+7L,
|
||||
};
|
||||
|
||||
/* Coefficients of the polynomial used in the tgamma_tiny() subroutine */
|
||||
static const long double coeffs_tiny[] = {
|
||||
0x1.0000000000000000000000000000p+0L,
|
||||
0x1.2788cfc6fb618f49a37c7f0201fep-1L,
|
||||
-0x1.4fcf4026afa2dceb8490ade22796p-1L,
|
||||
-0x1.5815e8fa27047c8f42b5d9217244p-5L,
|
||||
0x1.5512320b43fbe5dfa771333518f7p-3L,
|
||||
-0x1.59af103c340927bffdd44f954bfcp-5L,
|
||||
-0x1.3b4af28483e210479657e5543366p-7L,
|
||||
0x1.d919c527f6070bfce9b29c2ace9cp-8L,
|
||||
-0x1.317112ce35337def3556a18aa178p-10L,
|
||||
-0x1.c364fe77a6f27677b985b1fa2e1dp-13L,
|
||||
0x1.0c8a7a19a3fd40fe1f7e867efe7bp-13L,
|
||||
-0x1.51cf9f090b5dc398ba86305e3634p-16L,
|
||||
-0x1.4e80f64c04a339740de06ca9fa4ap-20L,
|
||||
0x1.241ddc2aef2ec20e58b08f2fda17p-20L,
|
||||
};
|
||||
|
||||
/* The location within the interval [1,2] where gamma has a minimum.
|
||||
* Specified as the sum of two 128-bit values, for extra precision. */
|
||||
static const long double min_x_hi = 0x1.762d86356be3f6e1a9c8865e0a4fp+0L;
|
||||
static const long double min_x_lo = 0x1.ac54d7d218de21303a7c60f08840p-118L;
|
||||
|
||||
/* The actual minimum value that gamma takes at that location.
|
||||
* Again specified as the sum of two 128-bit values. */
|
||||
static const long double min_y_hi = 0x1.c56dc82a74aee8d8851566d40f32p-1L;
|
||||
static const long double min_y_lo = 0x1.8ed98685742c353ce55e5794686fp-114L;
|
||||
|
||||
/* Coefficients of the polynomial used in the tgamma_central() subroutine
|
||||
* for computing gamma on the interval [1,min_x] */
|
||||
static const long double coeffs_central_neg[] = {
|
||||
0x1.b6c53f7377b83839c8a292e43b69p-2L,
|
||||
0x1.0bae9f40c7d09ed76e732045850ap-3L,
|
||||
0x1.4981175e14d04c3530e51d01c5fep-3L,
|
||||
0x1.79f77aaf032c948af3a9edbd2061p-4L,
|
||||
0x1.1e97bd10821095a5b79fbfdfa1a3p-4L,
|
||||
0x1.8071ce0935e4dcf0b33b0fbec7c1p-5L,
|
||||
0x1.0b44c2f92982f887b55ec36dfdb0p-5L,
|
||||
0x1.6df1de1e178ef72ca7bd63d40870p-6L,
|
||||
0x1.f63f502bde27e81c0f5e13479b43p-7L,
|
||||
0x1.57fd67d901f40ea011353ad89a0ap-7L,
|
||||
0x1.d7151376eed187eb753e2273cafcp-8L,
|
||||
0x1.427162b5c6ff1d904c71ef53e37cp-8L,
|
||||
0x1.b954b8c3a56cf93e49ef6538928ap-9L,
|
||||
0x1.2dff2ec26a3ae5cd3aaccae7a09ep-9L,
|
||||
0x1.9d35250d9b9378d9b59df734537ap-10L,
|
||||
0x1.1b2c0c48b9855a28f6dbd6fdff3cp-10L,
|
||||
0x1.7e0db39bb99cdb52b028d9359380p-11L,
|
||||
0x1.2164b5e1d364a0b5eaf97c436aa7p-11L,
|
||||
0x1.27521cf5fd24dcdf43524e6add11p-13L,
|
||||
0x1.06461d62243bf9a826b42349672fp-10L,
|
||||
-0x1.2b852abead28209b4e0c756dc46ep-9L,
|
||||
0x1.be673c11a72c826115ec6d286c14p-8L,
|
||||
-0x1.fd9ce330c215c31fcd3cb53c42ebp-7L,
|
||||
0x1.fa362bd2dc68f41abef2d8600acdp-6L,
|
||||
-0x1.a21585b2f52f8b23855de8e452edp-5L,
|
||||
0x1.1f234431ed032052fc92e64e0493p-4L,
|
||||
-0x1.40d332476ca0199c60cdae3f9132p-4L,
|
||||
0x1.1d45dc665d86012eba2eea199cefp-4L,
|
||||
-0x1.8491016cdd08dc9be7ade9b5fef3p-5L,
|
||||
0x1.7e7e2fbc6d49ad484300d6add324p-6L,
|
||||
-0x1.e63fe3f874a37276a8d7d8b705ecp-8L,
|
||||
0x1.30a2a73944f8c84998314d69c23fp-10L,
|
||||
};
|
||||
|
||||
/* Coefficients of the polynomial used in the tgamma_central() subroutine
|
||||
* for computing gamma on the interval [min_x,2] */
|
||||
static const long double coeffs_central_pos[] = {
|
||||
0x1.b6c53f7377b83839c8a292e22aa2p-2L,
|
||||
-0x1.0bae9f40c7d09ed76e72e1c955dep-3L,
|
||||
0x1.4981175e14d04c3530ee5e1ecebcp-3L,
|
||||
-0x1.79f77aaf032c948ac983d77f3e07p-4L,
|
||||
0x1.1e97bd10821095ab7dc94936cc11p-4L,
|
||||
-0x1.8071ce0935e4d7edef8cbf2a1cf1p-5L,
|
||||
0x1.0b44c2f929837fafef7b5d9e80f1p-5L,
|
||||
-0x1.6df1de1e175fe2a51faa25cddbb4p-6L,
|
||||
0x1.f63f502be57d11aed2cfe90843ffp-7L,
|
||||
-0x1.57fd67d852f230015b9f64770273p-7L,
|
||||
0x1.d715138adc07e5fce81077070357p-8L,
|
||||
-0x1.4271618e9fda8992a667adb15f4fp-8L,
|
||||
0x1.b954d15d9eb772e80fdd760672d7p-9L,
|
||||
-0x1.2dfe391241d3cb79c8c15182843dp-9L,
|
||||
0x1.9d44396fcd48451c3ba924cee814p-10L,
|
||||
-0x1.1ac195fb99739e341589e39803e6p-10L,
|
||||
0x1.82e46127b68f002770826e25f146p-11L,
|
||||
-0x1.089dacd90d9f41493119ac178359p-11L,
|
||||
0x1.6993c007b20394a057d21f3d37f8p-12L,
|
||||
-0x1.ec43a709f4446560c099dec8e31bp-13L,
|
||||
0x1.4ba36322f4074e9add9450f003cap-13L,
|
||||
-0x1.b3f83a977965ca1b7937bf5b34cap-14L,
|
||||
0x1.10af346abc09cb25a6d9fe810b6ep-14L,
|
||||
-0x1.38d8ea1188f242f50203edc395bdp-15L,
|
||||
0x1.39add987a948ec56f62b721a4475p-16L,
|
||||
-0x1.02a4e141f286c8a967e2df9bc9adp-17L,
|
||||
0x1.433b50af22425f546e87113062d7p-19L,
|
||||
-0x1.0c7b73cb0013f00aafc103e8e382p-21L,
|
||||
0x1.b852de313ec38da2297f6deaa6b4p-25L,
|
||||
};
|
||||
|
||||
/* 128-bit float value of pi, used by the sin_pi_x_over_pi subroutine
|
||||
*/
|
||||
static const long double pi = 0x1.921fb54442d18469898cc51701b8p+1L;
|
212
contrib/arm-optimized-routines/math/tools/tgamma128_gen.jl
Normal file
212
contrib/arm-optimized-routines/math/tools/tgamma128_gen.jl
Normal file
@ -0,0 +1,212 @@
|
||||
# -*- julia -*-
|
||||
#
|
||||
# Generate tgamma128.h, containing polynomials and constants used by
|
||||
# tgamma128.c.
|
||||
#
|
||||
# Copyright (c) 2006-2023, Arm Limited.
|
||||
# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
|
||||
# This Julia program depends on the 'Remez' and 'SpecialFunctions'
|
||||
# library packages. To install them, run this at the interactive Julia
|
||||
# prompt:
|
||||
#
|
||||
# import Pkg; Pkg.add(["Remez", "SpecialFunctions"])
|
||||
#
|
||||
# Tested on Julia 1.4.1 (Ubuntu 20.04) and 1.9.0 (22.04).
|
||||
|
||||
import Printf
|
||||
import Remez
|
||||
import SpecialFunctions
|
||||
|
||||
# Round a BigFloat to 128-bit long double and format it as a C99 hex
|
||||
# float literal.
|
||||
# Round a BigFloat to 128-bit long double and format it as a C99 hex
# float literal.
function quadhex(x)
    # Leading character: "-" for negative values, a space otherwise, so
    # that columns of emitted constants line up.
    signchar = " "
    if x < 0
        signchar = "-"
        x = -x
    end

    # Clamp the exponent into the IEEE binary128 normal range at the
    # bottom; anything above the top would overflow the format.
    exp2 = BigInt(floor(log2(x)))
    exp2 = max(exp2, -16382)
    @assert(exp2 <= 16383) # else overflow

    # Scale the significand into [1,2), then shift it up by 112 bits so
    # that the fraction becomes an integer we can print in hex.
    x /= BigFloat(2)^exp2
    @assert(1 <= x < 2)
    frac = BigInt(round(x * BigFloat(2)^112))

    # 113 significand bits make 29 hex digits: one leading digit plus
    # 28 fraction digits after the point.
    digits = string(frac, base=16, pad=29)
    return Printf.@sprintf("%s0x%s.%sp%+dL", signchar, digits[1],
                           digits[2:end], exp2)
end
|
||||
|
||||
# Round a BigFloat to 128-bit long double and return it still as a
|
||||
# BigFloat.
|
||||
# Round a BigFloat to 128-bit long double and return it still as a
# BigFloat.
#
# The optional second argument selects the rounding direction:
# negative => round the mantissa down, positive => round it up,
# zero (the default) => round to nearest.
function quadval(x, round=0)
    sign = +1
    if x.sign < 0   # NOTE(review): reads MPFR's internal sign field — confirm
        sign = -1
        x = -x
    end

    exponent = BigInt(floor(log2(x)))
    exponent = max(exponent, -16382)
    @assert(exponent <= 16383) # else overflow

    # Scale the significand into [1,2), then shift up by 112 bits so
    # the binary128 mantissa becomes an integer we can round.
    x /= BigFloat(2)^exponent
    @assert(1 <= x < 2)
    x *= BigFloat(2)^112
    if round < 0
        mantissa = floor(x)
    elseif round > 0
        mantissa = ceil(x)
    else
        # Bug fix: the parameter 'round' shadows Base.round here, so the
        # original 'round(x)' tried to call the integer 0 and raised a
        # MethodError whenever the default rounding mode was used.
        mantissa = Base.round(x)
    end

    return sign * mantissa * BigFloat(2)^(exponent - 112)
end
|
||||
|
||||
# Output an array of BigFloats as a C array declaration.
|
||||
# Output an array of BigFloats as a C array declaration.
#
# 'a' is the array of values; 'name' is the C identifier to declare.
function dumparray(a, name)
    println("static const long double ", name, "[] = {")
    # Bug fix: the original iterated over the global 'N' instead of the
    # parameter 'a', so the function only worked by accident when the
    # caller happened to have just assigned the same array to N.
    for x in a
        println("    ", quadhex(x), ",")
    end
    println("};")
end
|
||||
|
||||
print("/*
|
||||
* Polynomial coefficients and other constants for tgamma128.c.
|
||||
*
|
||||
* Copyright (c) 2006,2009,2023 Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
")
|
||||
|
||||
Base.MPFR.setprecision(512)
|
||||
|
||||
e = exp(BigFloat(1))
|
||||
|
||||
print("
|
||||
/* The largest positive value for which 128-bit tgamma does not overflow. */
|
||||
")
|
||||
lo = BigFloat("1000")
|
||||
hi = BigFloat("2000")
|
||||
while true
|
||||
global lo
|
||||
global hi
|
||||
global max_x
|
||||
|
||||
mid = (lo + hi) / 2
|
||||
if mid == lo || mid == hi
|
||||
max_x = mid
|
||||
break
|
||||
end
|
||||
if SpecialFunctions.logabsgamma(mid)[1] < 16384 * log(BigFloat(2))
|
||||
lo = mid
|
||||
else
|
||||
hi = mid
|
||||
end
|
||||
end
|
||||
max_x = quadval(max_x, -1)
|
||||
println("static const long double max_x = ", quadhex(max_x), ";")
|
||||
|
||||
print("
|
||||
/* Coefficients of the polynomial used in the tgamma_large() subroutine */
|
||||
")
|
||||
N, D, E, X = Remez.ratfn_minimax(
|
||||
x -> x==0 ? sqrt(BigFloat(2)*pi/e) :
|
||||
exp(SpecialFunctions.logabsgamma(1/x)[1] +
|
||||
(1/x-0.5)*(1+log(x))),
|
||||
(0, 1/BigFloat(8)),
|
||||
24, 0,
|
||||
(x, y) -> 1/y
|
||||
)
|
||||
dumparray(N, "coeffs_large")
|
||||
|
||||
print("
|
||||
/* Coefficients of the polynomial used in the tgamma_tiny() subroutine */
|
||||
")
|
||||
N, D, E, X = Remez.ratfn_minimax(
|
||||
x -> x==0 ? 1 : 1/(x*SpecialFunctions.gamma(x)),
|
||||
(0, 1/BigFloat(32)),
|
||||
13, 0,
|
||||
)
|
||||
dumparray(N, "coeffs_tiny")
|
||||
|
||||
print("
|
||||
/* The location within the interval [1,2] where gamma has a minimum.
|
||||
* Specified as the sum of two 128-bit values, for extra precision. */
|
||||
")
|
||||
lo = BigFloat("1.4")
|
||||
hi = BigFloat("1.5")
|
||||
while true
|
||||
global lo
|
||||
global hi
|
||||
global min_x
|
||||
|
||||
mid = (lo + hi) / 2
|
||||
if mid == lo || mid == hi
|
||||
min_x = mid
|
||||
break
|
||||
end
|
||||
if SpecialFunctions.digamma(mid) < 0
|
||||
lo = mid
|
||||
else
|
||||
hi = mid
|
||||
end
|
||||
end
|
||||
min_x_hi = quadval(min_x, -1)
|
||||
println("static const long double min_x_hi = ", quadhex(min_x_hi), ";")
|
||||
println("static const long double min_x_lo = ", quadhex(min_x - min_x_hi), ";")
|
||||
|
||||
print("
|
||||
/* The actual minimum value that gamma takes at that location.
|
||||
* Again specified as the sum of two 128-bit values. */
|
||||
")
|
||||
min_y = SpecialFunctions.gamma(min_x)
|
||||
min_y_hi = quadval(min_y, -1)
|
||||
println("static const long double min_y_hi = ", quadhex(min_y_hi), ";")
|
||||
println("static const long double min_y_lo = ", quadhex(min_y - min_y_hi), ";")
|
||||
|
||||
function taylor_bodge(x)
|
||||
# Taylor series generated by Wolfram Alpha for (gamma(min_x+x)-min_y)/x^2.
|
||||
# Used in the Remez calls below for x values very near the origin, to avoid
|
||||
# significance loss problems when trying to compute it directly via that
|
||||
# formula (even in MPFR's extra precision).
|
||||
return BigFloat("0.428486815855585429730209907810650582960483696962660010556335457558784421896667728014324097132413696263704801646004585959298743677879606168187061990204432200")+x*(-BigFloat("0.130704158939785761928008749242671025181542078105370084716141350308119418619652583986015464395882363802104154017741656168641240436089858504560718773026275797")+x*(BigFloat("0.160890753325112844190519489594363387594505844658437718135952967735294789599989664428071656484587979507034160383271974554122934842441540146372016567834062876")+x*(-BigFloat("0.092277030213334350126864106458600575084335085690780082222880945224248438672595248111704471182201673989215223667543694847795410779036800385804729955729659506"))))
|
||||
end
|
||||
|
||||
print("
|
||||
/* Coefficients of the polynomial used in the tgamma_central() subroutine
|
||||
* for computing gamma on the interval [1,min_x] */
|
||||
")
|
||||
N, D, E, X = Remez.ratfn_minimax(
|
||||
x -> x < BigFloat(0x1p-64) ? taylor_bodge(-x) :
|
||||
(SpecialFunctions.gamma(min_x - x) - min_y) / (x*x),
|
||||
(0, min_x - 1),
|
||||
31, 0,
|
||||
(x, y) -> x^2,
|
||||
)
|
||||
dumparray(N, "coeffs_central_neg")
|
||||
|
||||
print("
|
||||
/* Coefficients of the polynomial used in the tgamma_central() subroutine
|
||||
* for computing gamma on the interval [min_x,2] */
|
||||
")
|
||||
N, D, E, X = Remez.ratfn_minimax(
|
||||
x -> x < BigFloat(0x1p-64) ? taylor_bodge(x) :
|
||||
(SpecialFunctions.gamma(min_x + x) - min_y) / (x*x),
|
||||
(0, 2 - min_x),
|
||||
28, 0,
|
||||
(x, y) -> x^2,
|
||||
)
|
||||
dumparray(N, "coeffs_central_pos")
|
||||
|
||||
print("
|
||||
/* 128-bit float value of pi, used by the sin_pi_x_over_pi subroutine
|
||||
*/
|
||||
")
|
||||
println("static const long double pi = ", quadhex(BigFloat(pi)), ";")
|
@ -1,95 +0,0 @@
|
||||
/*
|
||||
* Double-precision vector cos function.
|
||||
*
|
||||
* Copyright (c) 2019-2022, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
#if V_SUPPORTED
|
||||
|
||||
static const double Poly[] = {
|
||||
/* worst-case error is 3.5 ulp.
|
||||
abs error: 0x1.be222a58p-53 in [-pi/2, pi/2]. */
|
||||
-0x1.9f4a9c8b21dc9p-41,
|
||||
0x1.60e88a10163f2p-33,
|
||||
-0x1.ae6361b7254e7p-26,
|
||||
0x1.71de382e8d62bp-19,
|
||||
-0x1.a01a019aeb4ffp-13,
|
||||
0x1.111111110b25ep-7,
|
||||
-0x1.55555555554c3p-3,
|
||||
};
|
||||
|
||||
#define C7 v_f64 (Poly[0])
|
||||
#define C6 v_f64 (Poly[1])
|
||||
#define C5 v_f64 (Poly[2])
|
||||
#define C4 v_f64 (Poly[3])
|
||||
#define C3 v_f64 (Poly[4])
|
||||
#define C2 v_f64 (Poly[5])
|
||||
#define C1 v_f64 (Poly[6])
|
||||
|
||||
#define InvPi v_f64 (0x1.45f306dc9c883p-2)
|
||||
#define HalfPi v_f64 (0x1.921fb54442d18p+0)
|
||||
#define Pi1 v_f64 (0x1.921fb54442d18p+1)
|
||||
#define Pi2 v_f64 (0x1.1a62633145c06p-53)
|
||||
#define Pi3 v_f64 (0x1.c1cd129024e09p-106)
|
||||
#define Shift v_f64 (0x1.8p52)
|
||||
#define RangeVal v_f64 (0x1p23)
|
||||
#define AbsMask v_u64 (0x7fffffffffffffff)
|
||||
|
||||
VPCS_ATTR
|
||||
__attribute__ ((noinline)) static v_f64_t
|
||||
specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
|
||||
{
|
||||
return v_call_f64 (cos, x, y, cmp);
|
||||
}
|
||||
|
||||
VPCS_ATTR
|
||||
v_f64_t
|
||||
V_NAME(cos) (v_f64_t x)
|
||||
{
|
||||
v_f64_t n, r, r2, y;
|
||||
v_u64_t odd, cmp;
|
||||
|
||||
r = v_as_f64_u64 (v_as_u64_f64 (x) & AbsMask);
|
||||
cmp = v_cond_u64 (v_as_u64_f64 (r) >= v_as_u64_f64 (RangeVal));
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
if (unlikely (v_any_u64 (cmp)))
|
||||
/* If fenv exceptions are to be triggered correctly, set any special lanes
|
||||
to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
|
||||
specialcase later. */
|
||||
r = v_sel_f64 (cmp, v_f64 (1.0), r);
|
||||
#endif
|
||||
|
||||
/* n = rint((|x|+pi/2)/pi) - 0.5. */
|
||||
n = v_fma_f64 (InvPi, r + HalfPi, Shift);
|
||||
odd = v_as_u64_f64 (n) << 63;
|
||||
n -= Shift;
|
||||
n -= v_f64 (0.5);
|
||||
|
||||
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
|
||||
r = v_fma_f64 (-Pi1, n, r);
|
||||
r = v_fma_f64 (-Pi2, n, r);
|
||||
r = v_fma_f64 (-Pi3, n, r);
|
||||
|
||||
/* sin(r) poly approx. */
|
||||
r2 = r * r;
|
||||
y = v_fma_f64 (C7, r2, C6);
|
||||
y = v_fma_f64 (y, r2, C5);
|
||||
y = v_fma_f64 (y, r2, C4);
|
||||
y = v_fma_f64 (y, r2, C3);
|
||||
y = v_fma_f64 (y, r2, C2);
|
||||
y = v_fma_f64 (y, r2, C1);
|
||||
y = v_fma_f64 (y * r2, r, r);
|
||||
|
||||
/* sign. */
|
||||
y = v_as_f64_u64 (v_as_u64_f64 (y) ^ odd);
|
||||
|
||||
if (unlikely (v_any_u64 (cmp)))
|
||||
return specialcase (x, y, cmp);
|
||||
return y;
|
||||
}
|
||||
VPCS_ALIAS
|
||||
#endif
|
@ -1,84 +0,0 @@
|
||||
/*
|
||||
* Single-precision vector cos function.
|
||||
*
|
||||
* Copyright (c) 2019-2022, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
#if V_SUPPORTED
|
||||
|
||||
static const float Poly[] = {
|
||||
/* 1.886 ulp error */
|
||||
0x1.5b2e76p-19f,
|
||||
-0x1.9f42eap-13f,
|
||||
0x1.110df4p-7f,
|
||||
-0x1.555548p-3f,
|
||||
};
|
||||
#define Pi1 v_f32 (0x1.921fb6p+1f)
|
||||
#define Pi2 v_f32 (-0x1.777a5cp-24f)
|
||||
#define Pi3 v_f32 (-0x1.ee59dap-49f)
|
||||
#define A3 v_f32 (Poly[3])
|
||||
#define A5 v_f32 (Poly[2])
|
||||
#define A7 v_f32 (Poly[1])
|
||||
#define A9 v_f32 (Poly[0])
|
||||
#define RangeVal v_f32 (0x1p20f)
|
||||
#define InvPi v_f32 (0x1.45f306p-2f)
|
||||
#define Shift v_f32 (0x1.8p+23f)
|
||||
#define AbsMask v_u32 (0x7fffffff)
|
||||
#define HalfPi v_f32 (0x1.921fb6p0f)
|
||||
|
||||
VPCS_ATTR
|
||||
static v_f32_t
|
||||
specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
|
||||
{
|
||||
/* Fall back to scalar code. */
|
||||
return v_call_f32 (cosf, x, y, cmp);
|
||||
}
|
||||
|
||||
VPCS_ATTR
|
||||
v_f32_t
|
||||
V_NAME(cosf) (v_f32_t x)
|
||||
{
|
||||
v_f32_t n, r, r2, y;
|
||||
v_u32_t odd, cmp;
|
||||
|
||||
r = v_as_f32_u32 (v_as_u32_f32 (x) & AbsMask);
|
||||
cmp = v_cond_u32 (v_as_u32_f32 (r) >= v_as_u32_f32 (RangeVal));
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
if (unlikely (v_any_u32 (cmp)))
|
||||
/* If fenv exceptions are to be triggered correctly, set any special lanes
|
||||
to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
|
||||
specialcase later. */
|
||||
r = v_sel_f32 (cmp, v_f32 (1.0f), r);
|
||||
#endif
|
||||
|
||||
/* n = rint((|x|+pi/2)/pi) - 0.5 */
|
||||
n = v_fma_f32 (InvPi, r + HalfPi, Shift);
|
||||
odd = v_as_u32_f32 (n) << 31;
|
||||
n -= Shift;
|
||||
n -= v_f32 (0.5f);
|
||||
|
||||
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */
|
||||
r = v_fma_f32 (-Pi1, n, r);
|
||||
r = v_fma_f32 (-Pi2, n, r);
|
||||
r = v_fma_f32 (-Pi3, n, r);
|
||||
|
||||
/* y = sin(r) */
|
||||
r2 = r * r;
|
||||
y = v_fma_f32 (A9, r2, A7);
|
||||
y = v_fma_f32 (y, r2, A5);
|
||||
y = v_fma_f32 (y, r2, A3);
|
||||
y = v_fma_f32 (y * r2, r, r);
|
||||
|
||||
/* sign fix */
|
||||
y = v_as_f32_u32 (v_as_u32_f32 (y) ^ odd);
|
||||
|
||||
if (unlikely (v_any_u32 (cmp)))
|
||||
return specialcase (x, y, cmp);
|
||||
return y;
|
||||
}
|
||||
VPCS_ALIAS
|
||||
#endif
|
@ -1,128 +0,0 @@
|
||||
/*
|
||||
* Double-precision vector e^x function.
|
||||
*
|
||||
* Copyright (c) 2019-2022, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
#if V_SUPPORTED
|
||||
#include "v_exp.h"
|
||||
|
||||
#if V_EXP_TABLE_BITS == 7
|
||||
/* maxerr: 1.88 +0.5 ulp
|
||||
rel error: 1.4337*2^-53
|
||||
abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */
|
||||
#define C1 v_f64 (0x1.ffffffffffd43p-2)
|
||||
#define C2 v_f64 (0x1.55555c75adbb2p-3)
|
||||
#define C3 v_f64 (0x1.55555da646206p-5)
|
||||
#define InvLn2 v_f64 (0x1.71547652b82fep7) /* N/ln2. */
|
||||
#define Ln2hi v_f64 (0x1.62e42fefa39efp-8) /* ln2/N. */
|
||||
#define Ln2lo v_f64 (0x1.abc9e3b39803f3p-63)
|
||||
#elif V_EXP_TABLE_BITS == 8
|
||||
/* maxerr: 0.54 +0.5 ulp
|
||||
rel error: 1.4318*2^-58
|
||||
abs error: 1.4299*2^-58 in [ -ln2/512, ln2/512 ]. */
|
||||
#define C1 v_f64 (0x1.fffffffffffd4p-2)
|
||||
#define C2 v_f64 (0x1.5555571d6b68cp-3)
|
||||
#define C3 v_f64 (0x1.5555576a59599p-5)
|
||||
#define InvLn2 v_f64 (0x1.71547652b82fep8)
|
||||
#define Ln2hi v_f64 (0x1.62e42fefa39efp-9)
|
||||
#define Ln2lo v_f64 (0x1.abc9e3b39803f3p-64)
|
||||
#endif
|
||||
|
||||
#define N (1 << V_EXP_TABLE_BITS)
|
||||
#define Tab __v_exp_data
|
||||
#define IndexMask v_u64 (N - 1)
|
||||
#define Shift v_f64 (0x1.8p+52)
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
|
||||
#define TinyBound 0x200 /* top12 (asuint64 (0x1p-511)). */
|
||||
#define BigBound 0x408 /* top12 (asuint64 (0x1p9)). */
|
||||
|
||||
VPCS_ATTR static NOINLINE v_f64_t
|
||||
specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
|
||||
{
|
||||
/* If fenv exceptions are to be triggered correctly, fall back to the scalar
|
||||
routine to special lanes. */
|
||||
return v_call_f64 (exp, x, y, cmp);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define Thres v_f64 (704.0)
|
||||
|
||||
VPCS_ATTR
|
||||
static v_f64_t
|
||||
specialcase (v_f64_t s, v_f64_t y, v_f64_t n)
|
||||
{
|
||||
v_f64_t absn = v_abs_f64 (n);
|
||||
|
||||
/* 2^(n/N) may overflow, break it up into s1*s2. */
|
||||
v_u64_t b = v_cond_u64 (n <= v_f64 (0.0)) & v_u64 (0x6000000000000000);
|
||||
v_f64_t s1 = v_as_f64_u64 (v_u64 (0x7000000000000000) - b);
|
||||
v_f64_t s2 = v_as_f64_u64 (v_as_u64_f64 (s) - v_u64 (0x3010000000000000) + b);
|
||||
v_u64_t cmp = v_cond_u64 (absn > v_f64 (1280.0 * N));
|
||||
v_f64_t r1 = s1 * s1;
|
||||
v_f64_t r0 = v_fma_f64 (y, s2, s2) * s1;
|
||||
return v_as_f64_u64 ((cmp & v_as_u64_f64 (r1)) | (~cmp & v_as_u64_f64 (r0)));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
VPCS_ATTR
|
||||
v_f64_t
|
||||
V_NAME(exp) (v_f64_t x)
|
||||
{
|
||||
v_f64_t n, r, r2, s, y, z;
|
||||
v_u64_t cmp, u, e, i;
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
/* If any lanes are special, mask them with 1 and retain a copy of x to allow
|
||||
specialcase to fix special lanes later. This is only necessary if fenv
|
||||
exceptions are to be triggered correctly. */
|
||||
v_f64_t xm = x;
|
||||
cmp = v_cond_u64 ((v_as_u64_f64 (v_abs_f64 (x)) >> 52) - TinyBound
|
||||
>= BigBound - TinyBound);
|
||||
if (unlikely (v_any_u64 (cmp)))
|
||||
x = v_sel_f64 (cmp, v_f64 (1), x);
|
||||
#else
|
||||
cmp = v_cond_u64 (v_abs_f64 (x) > Thres);
|
||||
#endif
|
||||
|
||||
/* n = round(x/(ln2/N)). */
|
||||
z = v_fma_f64 (x, InvLn2, Shift);
|
||||
u = v_as_u64_f64 (z);
|
||||
n = z - Shift;
|
||||
|
||||
/* r = x - n*ln2/N. */
|
||||
r = x;
|
||||
r = v_fma_f64 (-Ln2hi, n, r);
|
||||
r = v_fma_f64 (-Ln2lo, n, r);
|
||||
|
||||
e = u << (52 - V_EXP_TABLE_BITS);
|
||||
i = u & IndexMask;
|
||||
|
||||
/* y = exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */
|
||||
r2 = r * r;
|
||||
y = v_fma_f64 (C2, r, C1);
|
||||
y = v_fma_f64 (C3, r2, y);
|
||||
y = v_fma_f64 (y, r2, r);
|
||||
|
||||
/* s = 2^(n/N). */
|
||||
u = v_lookup_u64 (Tab, i);
|
||||
s = v_as_f64_u64 (u + e);
|
||||
|
||||
if (unlikely (v_any_u64 (cmp)))
|
||||
#if WANT_SIMD_EXCEPT
|
||||
return specialcase (xm, v_fma_f64 (y, s, s), cmp);
|
||||
#else
|
||||
return specialcase (s, y, n);
|
||||
#endif
|
||||
|
||||
return v_fma_f64 (y, s, s);
|
||||
}
|
||||
VPCS_ALIAS
|
||||
#endif
|
@ -1,14 +0,0 @@
|
||||
/*
|
||||
* Declarations for double-precision e^x vector function.
|
||||
*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#if WANT_VMATH
|
||||
|
||||
#define V_EXP_TABLE_BITS 7
|
||||
|
||||
extern const u64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN;
|
||||
#endif
|
@ -1,117 +0,0 @@
|
||||
/*
|
||||
* Single-precision vector 2^x function.
|
||||
*
|
||||
* Copyright (c) 2019-2022, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
#if V_SUPPORTED
|
||||
|
||||
static const float Poly[] = {
|
||||
/* maxerr: 1.962 ulp. */
|
||||
0x1.59977ap-10f,
|
||||
0x1.3ce9e4p-7f,
|
||||
0x1.c6bd32p-5f,
|
||||
0x1.ebf9bcp-3f,
|
||||
0x1.62e422p-1f,
|
||||
};
|
||||
#define C0 v_f32 (Poly[0])
|
||||
#define C1 v_f32 (Poly[1])
|
||||
#define C2 v_f32 (Poly[2])
|
||||
#define C3 v_f32 (Poly[3])
|
||||
#define C4 v_f32 (Poly[4])
|
||||
|
||||
#define Shift v_f32 (0x1.8p23f)
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
|
||||
#define TinyBound 0x20000000 /* asuint (0x1p-63). */
|
||||
#define BigBound 0x42800000 /* asuint (0x1p6). */
|
||||
|
||||
VPCS_ATTR
|
||||
static NOINLINE v_f32_t
|
||||
specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
|
||||
{
|
||||
/* If fenv exceptions are to be triggered correctly, fall back to the scalar
|
||||
routine to special lanes. */
|
||||
return v_call_f32 (exp2f, x, y, cmp);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
VPCS_ATTR
|
||||
static v_f32_t
|
||||
specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale)
|
||||
{
|
||||
/* 2^n may overflow, break it up into s1*s2. */
|
||||
v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000);
|
||||
v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b);
|
||||
v_f32_t s2 = v_as_f32_u32 (e - b);
|
||||
v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f));
|
||||
v_u32_t r2 = v_as_u32_f32 (s1 * s1);
|
||||
v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1);
|
||||
/* Similar to r1 but avoids double rounding in the subnormal range. */
|
||||
v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale));
|
||||
return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
VPCS_ATTR
|
||||
v_f32_t
|
||||
V_NAME(exp2f) (v_f32_t x)
|
||||
{
|
||||
v_f32_t n, r, r2, scale, p, q, poly;
|
||||
v_u32_t cmp, e;
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
cmp = v_cond_u32 ((v_as_u32_f32 (x) & 0x7fffffff) - TinyBound
|
||||
>= BigBound - TinyBound);
|
||||
v_f32_t xm = x;
|
||||
/* If any lanes are special, mask them with 1 and retain a copy of x to allow
|
||||
specialcase to fix special lanes later. This is only necessary if fenv
|
||||
exceptions are to be triggered correctly. */
|
||||
if (unlikely (v_any_u32 (cmp)))
|
||||
x = v_sel_f32 (cmp, v_f32 (1), x);
|
||||
#endif
|
||||
|
||||
/* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
|
||||
x = n + r, with r in [-1/2, 1/2]. */
|
||||
#if 0
|
||||
v_f32_t z;
|
||||
z = x + Shift;
|
||||
n = z - Shift;
|
||||
r = x - n;
|
||||
e = v_as_u32_f32 (z) << 23;
|
||||
#else
|
||||
n = v_round_f32 (x);
|
||||
r = x - n;
|
||||
e = v_as_u32_s32 (v_round_s32 (x)) << 23;
|
||||
#endif
|
||||
scale = v_as_f32_u32 (e + v_u32 (0x3f800000));
|
||||
|
||||
#if !WANT_SIMD_EXCEPT
|
||||
v_f32_t absn = v_abs_f32 (n);
|
||||
cmp = v_cond_u32 (absn > v_f32 (126.0f));
|
||||
#endif
|
||||
|
||||
r2 = r * r;
|
||||
p = v_fma_f32 (C0, r, C1);
|
||||
q = v_fma_f32 (C2, r, C3);
|
||||
q = v_fma_f32 (p, r2, q);
|
||||
p = C4 * r;
|
||||
poly = v_fma_f32 (q, r2, p);
|
||||
|
||||
if (unlikely (v_any_u32 (cmp)))
|
||||
#if WANT_SIMD_EXCEPT
|
||||
return specialcase (xm, v_fma_f32 (poly, scale, scale), cmp);
|
||||
#else
|
||||
return specialcase (poly, n, e, absn, cmp, scale);
|
||||
#endif
|
||||
|
||||
return v_fma_f32 (poly, scale, scale);
|
||||
}
|
||||
VPCS_ALIAS
|
||||
#endif
|
@ -1,75 +0,0 @@
|
||||
/*
|
||||
* Single-precision vector 2^x function.
|
||||
*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
#if V_SUPPORTED
|
||||
|
||||
static const float Poly[] = {
|
||||
/* maxerr: 0.878 ulp. */
|
||||
0x1.416b5ep-13f, 0x1.5f082ep-10f, 0x1.3b2dep-7f, 0x1.c6af7cp-5f, 0x1.ebfbdcp-3f, 0x1.62e43p-1f
|
||||
};
|
||||
#define C0 v_f32 (Poly[0])
|
||||
#define C1 v_f32 (Poly[1])
|
||||
#define C2 v_f32 (Poly[2])
|
||||
#define C3 v_f32 (Poly[3])
|
||||
#define C4 v_f32 (Poly[4])
|
||||
#define C5 v_f32 (Poly[5])
|
||||
|
||||
#define Shift v_f32 (0x1.8p23f)
|
||||
#define InvLn2 v_f32 (0x1.715476p+0f)
|
||||
#define Ln2hi v_f32 (0x1.62e4p-1f)
|
||||
#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
|
||||
|
||||
VPCS_ATTR
|
||||
static v_f32_t
|
||||
specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn)
|
||||
{
|
||||
/* 2^n may overflow, break it up into s1*s2. */
|
||||
v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x83000000);
|
||||
v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b);
|
||||
v_f32_t s2 = v_as_f32_u32 (e - b);
|
||||
v_u32_t cmp = v_cond_u32 (absn > v_f32 (192.0f));
|
||||
v_f32_t r1 = s1 * s1;
|
||||
v_f32_t r0 = poly * s1 * s2;
|
||||
return v_as_f32_u32 ((cmp & v_as_u32_f32 (r1)) | (~cmp & v_as_u32_f32 (r0)));
|
||||
}
|
||||
|
||||
VPCS_ATTR
|
||||
v_f32_t
|
||||
V_NAME(exp2f_1u) (v_f32_t x)
|
||||
{
|
||||
v_f32_t n, r, scale, poly, absn;
|
||||
v_u32_t cmp, e;
|
||||
|
||||
/* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
|
||||
x = n + r, with r in [-1/2, 1/2]. */
|
||||
#if 0
|
||||
v_f32_t z;
|
||||
z = x + Shift;
|
||||
n = z - Shift;
|
||||
r = x - n;
|
||||
e = v_as_u32_f32 (z) << 23;
|
||||
#else
|
||||
n = v_round_f32 (x);
|
||||
r = x - n;
|
||||
e = v_as_u32_s32 (v_round_s32 (x)) << 23;
|
||||
#endif
|
||||
scale = v_as_f32_u32 (e + v_u32 (0x3f800000));
|
||||
absn = v_abs_f32 (n);
|
||||
cmp = v_cond_u32 (absn > v_f32 (126.0f));
|
||||
poly = v_fma_f32 (C0, r, C1);
|
||||
poly = v_fma_f32 (poly, r, C2);
|
||||
poly = v_fma_f32 (poly, r, C3);
|
||||
poly = v_fma_f32 (poly, r, C4);
|
||||
poly = v_fma_f32 (poly, r, C5);
|
||||
poly = v_fma_f32 (poly, r, v_f32 (1.0f));
|
||||
if (unlikely (v_any_u32 (cmp)))
|
||||
return specialcase (poly, n, e, absn);
|
||||
return scale * poly;
|
||||
}
|
||||
#endif
|
@ -1,122 +0,0 @@
|
||||
/*
|
||||
* Single-precision vector e^x function.
|
||||
*
|
||||
* Copyright (c) 2019-2022, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
#if V_SUPPORTED
|
||||
|
||||
static const float Poly[] = {
|
||||
/* maxerr: 1.45358 +0.5 ulp. */
|
||||
0x1.0e4020p-7f,
|
||||
0x1.573e2ep-5f,
|
||||
0x1.555e66p-3f,
|
||||
0x1.fffdb6p-2f,
|
||||
0x1.ffffecp-1f,
|
||||
};
|
||||
#define C0 v_f32 (Poly[0])
|
||||
#define C1 v_f32 (Poly[1])
|
||||
#define C2 v_f32 (Poly[2])
|
||||
#define C3 v_f32 (Poly[3])
|
||||
#define C4 v_f32 (Poly[4])
|
||||
|
||||
#define Shift v_f32 (0x1.8p23f)
|
||||
#define InvLn2 v_f32 (0x1.715476p+0f)
|
||||
#define Ln2hi v_f32 (0x1.62e4p-1f)
|
||||
#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
|
||||
#define TinyBound 0x20000000 /* asuint (0x1p-63). */
|
||||
#define BigBound 0x42800000 /* asuint (0x1p6). */
|
||||
|
||||
VPCS_ATTR
|
||||
static NOINLINE v_f32_t
|
||||
specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
|
||||
{
|
||||
/* If fenv exceptions are to be triggered correctly, fall back to the scalar
|
||||
routine to special lanes. */
|
||||
return v_call_f32 (expf, x, y, cmp);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
VPCS_ATTR
|
||||
static v_f32_t
|
||||
specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale)
|
||||
{
|
||||
/* 2^n may overflow, break it up into s1*s2. */
|
||||
v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000);
|
||||
v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b);
|
||||
v_f32_t s2 = v_as_f32_u32 (e - b);
|
||||
v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f));
|
||||
v_u32_t r2 = v_as_u32_f32 (s1 * s1);
|
||||
v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1);
|
||||
/* Similar to r1 but avoids double rounding in the subnormal range. */
|
||||
v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale));
|
||||
return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
VPCS_ATTR
|
||||
v_f32_t
|
||||
V_NAME(expf) (v_f32_t x)
|
||||
{
|
||||
v_f32_t n, r, r2, scale, p, q, poly, z;
|
||||
v_u32_t cmp, e;
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
cmp = v_cond_u32 ((v_as_u32_f32 (x) & 0x7fffffff) - TinyBound
|
||||
>= BigBound - TinyBound);
|
||||
v_f32_t xm = x;
|
||||
/* If any lanes are special, mask them with 1 and retain a copy of x to allow
|
||||
specialcase to fix special lanes later. This is only necessary if fenv
|
||||
exceptions are to be triggered correctly. */
|
||||
if (unlikely (v_any_u32 (cmp)))
|
||||
x = v_sel_f32 (cmp, v_f32 (1), x);
|
||||
#endif
|
||||
|
||||
/* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
|
||||
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
|
||||
#if 1
|
||||
z = v_fma_f32 (x, InvLn2, Shift);
|
||||
n = z - Shift;
|
||||
r = v_fma_f32 (n, -Ln2hi, x);
|
||||
r = v_fma_f32 (n, -Ln2lo, r);
|
||||
e = v_as_u32_f32 (z) << 23;
|
||||
#else
|
||||
z = x * InvLn2;
|
||||
n = v_round_f32 (z);
|
||||
r = v_fma_f32 (n, -Ln2hi, x);
|
||||
r = v_fma_f32 (n, -Ln2lo, r);
|
||||
e = v_as_u32_s32 (v_round_s32 (z)) << 23;
|
||||
#endif
|
||||
scale = v_as_f32_u32 (e + v_u32 (0x3f800000));
|
||||
|
||||
#if !WANT_SIMD_EXCEPT
|
||||
v_f32_t absn = v_abs_f32 (n);
|
||||
cmp = v_cond_u32 (absn > v_f32 (126.0f));
|
||||
#endif
|
||||
|
||||
r2 = r * r;
|
||||
p = v_fma_f32 (C0, r, C1);
|
||||
q = v_fma_f32 (C2, r, C3);
|
||||
q = v_fma_f32 (p, r2, q);
|
||||
p = C4 * r;
|
||||
poly = v_fma_f32 (q, r2, p);
|
||||
|
||||
if (unlikely (v_any_u32 (cmp)))
|
||||
#if WANT_SIMD_EXCEPT
|
||||
return specialcase (xm, v_fma_f32 (poly, scale, scale), cmp);
|
||||
#else
|
||||
return specialcase (poly, n, e, absn, cmp, scale);
|
||||
#endif
|
||||
|
||||
return v_fma_f32 (poly, scale, scale);
|
||||
}
|
||||
VPCS_ALIAS
|
||||
#endif
|
@ -1,80 +0,0 @@
|
||||
/*
|
||||
* Single-precision vector e^x function.
|
||||
*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
#if V_SUPPORTED
|
||||
|
||||
static const float Poly[] = {
|
||||
/* maxerr: 0.36565 +0.5 ulp. */
|
||||
0x1.6a6000p-10f,
|
||||
0x1.12718ep-7f,
|
||||
0x1.555af0p-5f,
|
||||
0x1.555430p-3f,
|
||||
0x1.fffff4p-2f,
|
||||
};
|
||||
#define C0 v_f32 (Poly[0])
|
||||
#define C1 v_f32 (Poly[1])
|
||||
#define C2 v_f32 (Poly[2])
|
||||
#define C3 v_f32 (Poly[3])
|
||||
#define C4 v_f32 (Poly[4])
|
||||
|
||||
#define Shift v_f32 (0x1.8p23f)
|
||||
#define InvLn2 v_f32 (0x1.715476p+0f)
|
||||
#define Ln2hi v_f32 (0x1.62e4p-1f)
|
||||
#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
|
||||
|
||||
VPCS_ATTR
|
||||
static v_f32_t
|
||||
specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn)
|
||||
{
|
||||
/* 2^n may overflow, break it up into s1*s2. */
|
||||
v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x83000000);
|
||||
v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b);
|
||||
v_f32_t s2 = v_as_f32_u32 (e - b);
|
||||
v_u32_t cmp = v_cond_u32 (absn > v_f32 (192.0f));
|
||||
v_f32_t r1 = s1 * s1;
|
||||
v_f32_t r0 = poly * s1 * s2;
|
||||
return v_as_f32_u32 ((cmp & v_as_u32_f32 (r1)) | (~cmp & v_as_u32_f32 (r0)));
|
||||
}
|
||||
|
||||
VPCS_ATTR
|
||||
v_f32_t
|
||||
V_NAME(expf_1u) (v_f32_t x)
|
||||
{
|
||||
v_f32_t n, r, scale, poly, absn, z;
|
||||
v_u32_t cmp, e;
|
||||
|
||||
/* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
|
||||
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
|
||||
#if 1
|
||||
z = v_fma_f32 (x, InvLn2, Shift);
|
||||
n = z - Shift;
|
||||
r = v_fma_f32 (n, -Ln2hi, x);
|
||||
r = v_fma_f32 (n, -Ln2lo, r);
|
||||
e = v_as_u32_f32 (z) << 23;
|
||||
#else
|
||||
z = x * InvLn2;
|
||||
n = v_round_f32 (z);
|
||||
r = v_fma_f32 (n, -Ln2hi, x);
|
||||
r = v_fma_f32 (n, -Ln2lo, r);
|
||||
e = v_as_u32_s32 (v_round_s32 (z)) << 23;
|
||||
#endif
|
||||
scale = v_as_f32_u32 (e + v_u32 (0x3f800000));
|
||||
absn = v_abs_f32 (n);
|
||||
cmp = v_cond_u32 (absn > v_f32 (126.0f));
|
||||
poly = v_fma_f32 (C0, r, C1);
|
||||
poly = v_fma_f32 (poly, r, C2);
|
||||
poly = v_fma_f32 (poly, r, C3);
|
||||
poly = v_fma_f32 (poly, r, C4);
|
||||
poly = v_fma_f32 (poly, r, v_f32 (1.0f));
|
||||
poly = v_fma_f32 (poly, r, v_f32 (1.0f));
|
||||
if (unlikely (v_any_u32 (cmp)))
|
||||
return specialcase (poly, n, e, absn);
|
||||
return scale * poly;
|
||||
}
|
||||
#endif
|
@ -1,104 +0,0 @@
|
||||
/*
|
||||
* Double-precision vector log(x) function.
|
||||
*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
#include "v_log.h"
|
||||
#if V_SUPPORTED
|
||||
|
||||
/* Worst-case error: 1.17 + 0.5 ulp. */
|
||||
|
||||
static const f64_t Poly[] = {
|
||||
/* rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */
|
||||
-0x1.ffffffffffff7p-2,
|
||||
0x1.55555555170d4p-2,
|
||||
-0x1.0000000399c27p-2,
|
||||
0x1.999b2e90e94cap-3,
|
||||
-0x1.554e550bd501ep-3,
|
||||
};
|
||||
|
||||
#define A0 v_f64 (Poly[0])
|
||||
#define A1 v_f64 (Poly[1])
|
||||
#define A2 v_f64 (Poly[2])
|
||||
#define A3 v_f64 (Poly[3])
|
||||
#define A4 v_f64 (Poly[4])
|
||||
#define Ln2 v_f64 (0x1.62e42fefa39efp-1)
|
||||
#define N (1 << V_LOG_TABLE_BITS)
|
||||
#define OFF v_u64 (0x3fe6900900000000)
|
||||
|
||||
struct entry
|
||||
{
|
||||
v_f64_t invc;
|
||||
v_f64_t logc;
|
||||
};
|
||||
|
||||
static inline struct entry
|
||||
lookup (v_u64_t i)
|
||||
{
|
||||
struct entry e;
|
||||
#ifdef SCALAR
|
||||
e.invc = __v_log_data[i].invc;
|
||||
e.logc = __v_log_data[i].logc;
|
||||
#else
|
||||
e.invc[0] = __v_log_data[i[0]].invc;
|
||||
e.logc[0] = __v_log_data[i[0]].logc;
|
||||
e.invc[1] = __v_log_data[i[1]].invc;
|
||||
e.logc[1] = __v_log_data[i[1]].logc;
|
||||
#endif
|
||||
return e;
|
||||
}
|
||||
|
||||
VPCS_ATTR
|
||||
__attribute__ ((noinline)) static v_f64_t
|
||||
specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
|
||||
{
|
||||
return v_call_f64 (log, x, y, cmp);
|
||||
}
|
||||
|
||||
VPCS_ATTR
|
||||
v_f64_t
|
||||
V_NAME(log) (v_f64_t x)
|
||||
{
|
||||
v_f64_t z, r, r2, p, y, kd, hi;
|
||||
v_u64_t ix, iz, tmp, top, i, cmp;
|
||||
v_s64_t k;
|
||||
struct entry e;
|
||||
|
||||
ix = v_as_u64_f64 (x);
|
||||
top = ix >> 48;
|
||||
cmp = v_cond_u64 (top - v_u64 (0x0010) >= v_u64 (0x7ff0 - 0x0010));
|
||||
|
||||
/* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
|
||||
The range is split into N subintervals.
|
||||
The ith subinterval contains z and c is near its center. */
|
||||
tmp = ix - OFF;
|
||||
i = (tmp >> (52 - V_LOG_TABLE_BITS)) % N;
|
||||
k = v_as_s64_u64 (tmp) >> 52; /* arithmetic shift */
|
||||
iz = ix - (tmp & v_u64 (0xfffULL << 52));
|
||||
z = v_as_f64_u64 (iz);
|
||||
e = lookup (i);
|
||||
|
||||
/* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
|
||||
r = v_fma_f64 (z, e.invc, v_f64 (-1.0));
|
||||
kd = v_to_f64_s64 (k);
|
||||
|
||||
/* hi = r + log(c) + k*Ln2. */
|
||||
hi = v_fma_f64 (kd, Ln2, e.logc + r);
|
||||
/* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
|
||||
r2 = r * r;
|
||||
y = v_fma_f64 (A3, r, A2);
|
||||
p = v_fma_f64 (A1, r, A0);
|
||||
y = v_fma_f64 (A4, r2, y);
|
||||
y = v_fma_f64 (y, r2, p);
|
||||
y = v_fma_f64 (y, r2, hi);
|
||||
|
||||
if (unlikely (v_any_u64 (cmp)))
|
||||
return specialcase (x, y, cmp);
|
||||
return y;
|
||||
}
|
||||
VPCS_ALIAS
|
||||
#endif
|
@ -1,18 +0,0 @@
|
||||
/*
|
||||
* Declarations for double-precision log(x) vector function.
|
||||
*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_math.h"
|
||||
#if WANT_VMATH
|
||||
|
||||
#define V_LOG_TABLE_BITS 7
|
||||
|
||||
extern const struct v_log_data
|
||||
{
|
||||
f64_t invc;
|
||||
f64_t logc;
|
||||
} __v_log_data[1 << V_LOG_TABLE_BITS] HIDDEN;
|
||||
#endif
|
@ -1,158 +0,0 @@
|
||||
/*
|
||||
* Lookup table for double-precision log(x) vector function.
|
||||
*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "v_log.h"
|
||||
#if WANT_VMATH
|
||||
|
||||
#define N (1 << V_LOG_TABLE_BITS)
|
||||
|
||||
/* Algorithm:
|
||||
|
||||
x = 2^k z
|
||||
log(x) = k ln2 + log(c) + poly(z/c - 1)
|
||||
|
||||
where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,N=128)
|
||||
and log(c) and 1/c for the ith subinterval comes from a lookup table:
|
||||
|
||||
tab[i].invc = 1/c
|
||||
tab[i].logc = (double)log(c)
|
||||
|
||||
where c is near the center of the subinterval and is chosen by trying several
|
||||
floating point invc candidates around 1/center and selecting one for which
|
||||
the error in (double)log(c) is minimized (< 0x1p-74), except the subinterval
|
||||
that contains 1 and the previous one got tweaked to avoid cancellation. */
|
||||
const struct v_log_data __v_log_data[N] = {
|
||||
{0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2},
|
||||
{0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2},
|
||||
{0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2},
|
||||
{0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2},
|
||||
{0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2},
|
||||
{0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2},
|
||||
{0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2},
|
||||
{0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2},
|
||||
{0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2},
|
||||
{0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2},
|
||||
{0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2},
|
||||
{0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2},
|
||||
{0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2},
|
||||
{0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2},
|
||||
{0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2},
|
||||
{0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2},
|
||||
{0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2},
|
||||
{0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2},
|
||||
{0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2},
|
||||
{0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3},
|
||||
{0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3},
|
||||
{0x1.446f12b278001p+0, -0x1.e52e160484698p-3},
|
||||
{0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3},
|
||||
{0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3},
|
||||
{0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3},
|
||||
{0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3},
|
||||
{0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3},
|
||||
{0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3},
|
||||
{0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3},
|
||||
{0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3},
|
||||
{0x1.36987540fbf53p+0, -0x1.8be843d796044p-3},
|
||||
{0x1.352166b648f61p+0, -0x1.82395ecc477edp-3},
|
||||
{0x1.33adddb3eb575p+0, -0x1.7896240966422p-3},
|
||||
{0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3},
|
||||
{0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3},
|
||||
{0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3},
|
||||
{0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3},
|
||||
{0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3},
|
||||
{0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3},
|
||||
{0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3},
|
||||
{0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3},
|
||||
{0x1.2734abcaa8467p+0, -0x1.23d47a721fd47p-3},
|
||||
{0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3},
|
||||
{0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3},
|
||||
{0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3},
|
||||
{0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4},
|
||||
{0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4},
|
||||
{0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4},
|
||||
{0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4},
|
||||
{0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4},
|
||||
{0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4},
|
||||
{0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4},
|
||||
{0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4},
|
||||
{0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4},
|
||||
{0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4},
|
||||
{0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4},
|
||||
{0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4},
|
||||
{0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4},
|
||||
{0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4},
|
||||
{0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4},
|
||||
{0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5},
|
||||
{0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5},
|
||||
{0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5},
|
||||
{0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5},
|
||||
{0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5},
|
||||
{0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5},
|
||||
{0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5},
|
||||
{0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5},
|
||||
{0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6},
|
||||
{0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6},
|
||||
{0x1.05193497a7cc5p+0, -0x1.43183683400acp-6},
|
||||
{0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6},
|
||||
{0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7},
|
||||
{0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7},
|
||||
{0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9},
|
||||
{1.0, 0.0},
|
||||
{0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8},
|
||||
{0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7},
|
||||
{0x1.f44546ef18996p-1, 0x1.7bb4dd70a015bp-6},
|
||||
{0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6},
|
||||
{0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5},
|
||||
{0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5},
|
||||
{0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5},
|
||||
{0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5},
|
||||
{0x1.de5c72d8a8be3p-1, 0x1.165bd78d4878ep-4},
|
||||
{0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4},
|
||||
{0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4},
|
||||
{0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4},
|
||||
{0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4},
|
||||
{0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4},
|
||||
{0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4},
|
||||
{0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4},
|
||||
{0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4},
|
||||
{0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3},
|
||||
{0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3},
|
||||
{0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3},
|
||||
{0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3},
|
||||
{0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3},
|
||||
{0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3},
|
||||
{0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3},
|
||||
{0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3},
|
||||
{0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3},
|
||||
{0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3},
|
||||
{0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3},
|
||||
{0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3},
|
||||
{0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3},
|
||||
{0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3},
|
||||
{0x1.9998e1480b618p-1, 0x1.c903161240163p-3},
|
||||
{0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3},
|
||||
{0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3},
|
||||
{0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3},
|
||||
{0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3},
|
||||
{0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2},
|
||||
{0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2},
|
||||
{0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2},
|
||||
{0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2},
|
||||
{0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2},
|
||||
{0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2},
|
||||
{0x1.7f3fbe71cdb71p-1, 0x1.28975066318a2p-2},
|
||||
{0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2},
|
||||
{0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2},
|
||||
{0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2},
|
||||
{0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2},
|
||||
{0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2},
|
||||
{0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2},
|
||||
{0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2},
|
||||
{0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2},
|
||||
{0x1.6c162fc9cbe02p-1, 0x1.5d1d758e45217p-2},
|
||||
};
|
||||
#endif
|
@ -1,73 +0,0 @@
|
||||
/*
|
||||
* Single-precision vector log function.
|
||||
*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
#if V_SUPPORTED
|
||||
|
||||
static const float Poly[] = {
|
||||
/* 3.34 ulp error */
|
||||
-0x1.3e737cp-3f, 0x1.5a9aa2p-3f, -0x1.4f9934p-3f, 0x1.961348p-3f,
|
||||
-0x1.00187cp-2f, 0x1.555d7cp-2f, -0x1.ffffc8p-2f,
|
||||
};
|
||||
#define P7 v_f32 (Poly[0])
|
||||
#define P6 v_f32 (Poly[1])
|
||||
#define P5 v_f32 (Poly[2])
|
||||
#define P4 v_f32 (Poly[3])
|
||||
#define P3 v_f32 (Poly[4])
|
||||
#define P2 v_f32 (Poly[5])
|
||||
#define P1 v_f32 (Poly[6])
|
||||
|
||||
#define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218 */
|
||||
#define Min v_u32 (0x00800000)
|
||||
#define Max v_u32 (0x7f800000)
|
||||
#define Mask v_u32 (0x007fffff)
|
||||
#define Off v_u32 (0x3f2aaaab) /* 0.666667 */
|
||||
|
||||
VPCS_ATTR
|
||||
__attribute__ ((noinline)) static v_f32_t
|
||||
specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
|
||||
{
|
||||
/* Fall back to scalar code. */
|
||||
return v_call_f32 (logf, x, y, cmp);
|
||||
}
|
||||
|
||||
VPCS_ATTR
|
||||
v_f32_t
|
||||
V_NAME(logf) (v_f32_t x)
|
||||
{
|
||||
v_f32_t n, p, q, r, r2, y;
|
||||
v_u32_t u, cmp;
|
||||
|
||||
u = v_as_u32_f32 (x);
|
||||
cmp = v_cond_u32 (u - Min >= Max - Min);
|
||||
|
||||
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3 */
|
||||
u -= Off;
|
||||
n = v_to_f32_s32 (v_as_s32_u32 (u) >> 23); /* signextend */
|
||||
u &= Mask;
|
||||
u += Off;
|
||||
r = v_as_f32_u32 (u) - v_f32 (1.0f);
|
||||
|
||||
/* y = log(1+r) + n*ln2. */
|
||||
r2 = r * r;
|
||||
/* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */
|
||||
p = v_fma_f32 (P6, r, P5);
|
||||
q = v_fma_f32 (P4, r, P3);
|
||||
y = v_fma_f32 (P2, r, P1);
|
||||
p = v_fma_f32 (P7, r2, p);
|
||||
q = v_fma_f32 (p, r2, q);
|
||||
y = v_fma_f32 (q, r2, y);
|
||||
p = v_fma_f32 (Ln2, n, r);
|
||||
y = v_fma_f32 (y, r2, p);
|
||||
|
||||
if (unlikely (v_any_u32 (cmp)))
|
||||
return specialcase (x, y, cmp);
|
||||
return y;
|
||||
}
|
||||
VPCS_ALIAS
|
||||
#endif
|
@ -1,661 +0,0 @@
|
||||
/*
|
||||
* Vector math abstractions.
|
||||
*
|
||||
* Copyright (c) 2019-2022, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef _V_MATH_H
|
||||
#define _V_MATH_H
|
||||
|
||||
#ifndef WANT_VMATH
|
||||
/* Enable the build of vector math code. */
|
||||
# define WANT_VMATH 1
|
||||
#endif
|
||||
#if WANT_VMATH
|
||||
|
||||
/* The goal of this header is to allow vector and scalar
|
||||
build of the same algorithm, the provided intrinsic
|
||||
wrappers are also vector length agnostic so they can
|
||||
be implemented for SVE too (or other simd architectures)
|
||||
and then the code should work on those targets too. */
|
||||
|
||||
#if SCALAR
|
||||
#define V_NAME(x) __s_##x
|
||||
#elif VPCS && __aarch64__
|
||||
#define V_NAME(x) __vn_##x
|
||||
#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
|
||||
#else
|
||||
#define V_NAME(x) __v_##x
|
||||
#endif
|
||||
|
||||
#ifndef VPCS_ATTR
|
||||
#define VPCS_ATTR
|
||||
#endif
|
||||
#ifndef VPCS_ALIAS
|
||||
#define VPCS_ALIAS
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include "math_config.h"
|
||||
|
||||
typedef float f32_t;
|
||||
typedef uint32_t u32_t;
|
||||
typedef int32_t s32_t;
|
||||
typedef double f64_t;
|
||||
typedef uint64_t u64_t;
|
||||
typedef int64_t s64_t;
|
||||
|
||||
/* reinterpret as type1 from type2. */
|
||||
static inline u32_t
|
||||
as_u32_f32 (f32_t x)
|
||||
{
|
||||
union { f32_t f; u32_t u; } r = {x};
|
||||
return r.u;
|
||||
}
|
||||
static inline f32_t
|
||||
as_f32_u32 (u32_t x)
|
||||
{
|
||||
union { u32_t u; f32_t f; } r = {x};
|
||||
return r.f;
|
||||
}
|
||||
static inline s32_t
|
||||
as_s32_u32 (u32_t x)
|
||||
{
|
||||
union { u32_t u; s32_t i; } r = {x};
|
||||
return r.i;
|
||||
}
|
||||
static inline u32_t
|
||||
as_u32_s32 (s32_t x)
|
||||
{
|
||||
union { s32_t i; u32_t u; } r = {x};
|
||||
return r.u;
|
||||
}
|
||||
static inline u64_t
|
||||
as_u64_f64 (f64_t x)
|
||||
{
|
||||
union { f64_t f; u64_t u; } r = {x};
|
||||
return r.u;
|
||||
}
|
||||
static inline f64_t
|
||||
as_f64_u64 (u64_t x)
|
||||
{
|
||||
union { u64_t u; f64_t f; } r = {x};
|
||||
return r.f;
|
||||
}
|
||||
static inline s64_t
|
||||
as_s64_u64 (u64_t x)
|
||||
{
|
||||
union { u64_t u; s64_t i; } r = {x};
|
||||
return r.i;
|
||||
}
|
||||
static inline u64_t
|
||||
as_u64_s64 (s64_t x)
|
||||
{
|
||||
union { s64_t i; u64_t u; } r = {x};
|
||||
return r.u;
|
||||
}
|
||||
|
||||
#if SCALAR
|
||||
#define V_SUPPORTED 1
|
||||
typedef f32_t v_f32_t;
|
||||
typedef u32_t v_u32_t;
|
||||
typedef s32_t v_s32_t;
|
||||
typedef f64_t v_f64_t;
|
||||
typedef u64_t v_u64_t;
|
||||
typedef s64_t v_s64_t;
|
||||
|
||||
static inline int
|
||||
v_lanes32 (void)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
static inline v_f32_t
|
||||
v_f32 (f32_t x)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
static inline v_u32_t
|
||||
v_u32 (u32_t x)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
static inline v_s32_t
|
||||
v_s32 (s32_t x)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
|
||||
static inline f32_t
|
||||
v_get_f32 (v_f32_t x, int i)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
static inline u32_t
|
||||
v_get_u32 (v_u32_t x, int i)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
static inline s32_t
|
||||
v_get_s32 (v_s32_t x, int i)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
|
||||
static inline void
|
||||
v_set_f32 (v_f32_t *x, int i, f32_t v)
|
||||
{
|
||||
*x = v;
|
||||
}
|
||||
static inline void
|
||||
v_set_u32 (v_u32_t *x, int i, u32_t v)
|
||||
{
|
||||
*x = v;
|
||||
}
|
||||
static inline void
|
||||
v_set_s32 (v_s32_t *x, int i, s32_t v)
|
||||
{
|
||||
*x = v;
|
||||
}
|
||||
|
||||
/* true if any elements of a v_cond result is non-zero. */
|
||||
static inline int
|
||||
v_any_u32 (v_u32_t x)
|
||||
{
|
||||
return x != 0;
|
||||
}
|
||||
/* to wrap the result of relational operators. */
|
||||
static inline v_u32_t
|
||||
v_cond_u32 (v_u32_t x)
|
||||
{
|
||||
return x ? -1 : 0;
|
||||
}
|
||||
static inline v_f32_t
|
||||
v_abs_f32 (v_f32_t x)
|
||||
{
|
||||
return __builtin_fabsf (x);
|
||||
}
|
||||
static inline v_f32_t
|
||||
v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z)
|
||||
{
|
||||
return __builtin_fmaf (x, y, z);
|
||||
}
|
||||
static inline v_f32_t
|
||||
v_round_f32 (v_f32_t x)
|
||||
{
|
||||
return __builtin_roundf (x);
|
||||
}
|
||||
static inline v_s32_t
|
||||
v_round_s32 (v_f32_t x)
|
||||
{
|
||||
return __builtin_lroundf (x); /* relies on -fno-math-errno. */
|
||||
}
|
||||
static inline v_f32_t
|
||||
v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y)
|
||||
{
|
||||
return p ? x : y;
|
||||
}
|
||||
/* convert to type1 from type2. */
|
||||
static inline v_f32_t
|
||||
v_to_f32_s32 (v_s32_t x)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
static inline v_f32_t
|
||||
v_to_f32_u32 (v_u32_t x)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
/* reinterpret as type1 from type2. */
|
||||
static inline v_u32_t
|
||||
v_as_u32_f32 (v_f32_t x)
|
||||
{
|
||||
union { v_f32_t f; v_u32_t u; } r = {x};
|
||||
return r.u;
|
||||
}
|
||||
static inline v_f32_t
|
||||
v_as_f32_u32 (v_u32_t x)
|
||||
{
|
||||
union { v_u32_t u; v_f32_t f; } r = {x};
|
||||
return r.f;
|
||||
}
|
||||
static inline v_s32_t
|
||||
v_as_s32_u32 (v_u32_t x)
|
||||
{
|
||||
union { v_u32_t u; v_s32_t i; } r = {x};
|
||||
return r.i;
|
||||
}
|
||||
static inline v_u32_t
|
||||
v_as_u32_s32 (v_s32_t x)
|
||||
{
|
||||
union { v_s32_t i; v_u32_t u; } r = {x};
|
||||
return r.u;
|
||||
}
|
||||
static inline v_f32_t
|
||||
v_lookup_f32 (const f32_t *tab, v_u32_t idx)
|
||||
{
|
||||
return tab[idx];
|
||||
}
|
||||
static inline v_u32_t
|
||||
v_lookup_u32 (const u32_t *tab, v_u32_t idx)
|
||||
{
|
||||
return tab[idx];
|
||||
}
|
||||
static inline v_f32_t
|
||||
v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p)
|
||||
{
|
||||
return f (x);
|
||||
}
|
||||
static inline v_f32_t
|
||||
v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y,
|
||||
v_u32_t p)
|
||||
{
|
||||
return f (x1, x2);
|
||||
}
|
||||
|
||||
static inline int
|
||||
v_lanes64 (void)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
static inline v_f64_t
|
||||
v_f64 (f64_t x)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
static inline v_u64_t
|
||||
v_u64 (u64_t x)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
static inline v_s64_t
|
||||
v_s64 (s64_t x)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
static inline f64_t
|
||||
v_get_f64 (v_f64_t x, int i)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
static inline void
|
||||
v_set_f64 (v_f64_t *x, int i, f64_t v)
|
||||
{
|
||||
*x = v;
|
||||
}
|
||||
/* true if any elements of a v_cond result is non-zero. */
|
||||
static inline int
|
||||
v_any_u64 (v_u64_t x)
|
||||
{
|
||||
return x != 0;
|
||||
}
|
||||
/* to wrap the result of relational operators. */
|
||||
static inline v_u64_t
|
||||
v_cond_u64 (v_u64_t x)
|
||||
{
|
||||
return x ? -1 : 0;
|
||||
}
|
||||
static inline v_f64_t
|
||||
v_abs_f64 (v_f64_t x)
|
||||
{
|
||||
return __builtin_fabs (x);
|
||||
}
|
||||
static inline v_f64_t
|
||||
v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z)
|
||||
{
|
||||
return __builtin_fma (x, y, z);
|
||||
}
|
||||
static inline v_f64_t
|
||||
v_round_f64 (v_f64_t x)
|
||||
{
|
||||
return __builtin_round (x);
|
||||
}
|
||||
static inline v_s64_t
|
||||
v_round_s64 (v_f64_t x)
|
||||
{
|
||||
return __builtin_lround (x); /* relies on -fno-math-errno. */
|
||||
}
|
||||
static inline v_f64_t
|
||||
v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y)
|
||||
{
|
||||
return p ? x : y;
|
||||
}
|
||||
/* convert to type1 from type2. */
|
||||
static inline v_f64_t
|
||||
v_to_f64_s64 (v_s64_t x)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
static inline v_f64_t
|
||||
v_to_f64_u64 (v_u64_t x)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
/* reinterpret as type1 from type2. */
|
||||
static inline v_u64_t
|
||||
v_as_u64_f64 (v_f64_t x)
|
||||
{
|
||||
union { v_f64_t f; v_u64_t u; } r = {x};
|
||||
return r.u;
|
||||
}
|
||||
static inline v_f64_t
|
||||
v_as_f64_u64 (v_u64_t x)
|
||||
{
|
||||
union { v_u64_t u; v_f64_t f; } r = {x};
|
||||
return r.f;
|
||||
}
|
||||
static inline v_s64_t
|
||||
v_as_s64_u64 (v_u64_t x)
|
||||
{
|
||||
union { v_u64_t u; v_s64_t i; } r = {x};
|
||||
return r.i;
|
||||
}
|
||||
static inline v_u64_t
|
||||
v_as_u64_s64 (v_s64_t x)
|
||||
{
|
||||
union { v_s64_t i; v_u64_t u; } r = {x};
|
||||
return r.u;
|
||||
}
|
||||
static inline v_f64_t
|
||||
v_lookup_f64 (const f64_t *tab, v_u64_t idx)
|
||||
{
|
||||
return tab[idx];
|
||||
}
|
||||
static inline v_u64_t
|
||||
v_lookup_u64 (const u64_t *tab, v_u64_t idx)
|
||||
{
|
||||
return tab[idx];
|
||||
}
|
||||
static inline v_f64_t
|
||||
v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p)
|
||||
{
|
||||
return f (x);
|
||||
}
|
||||
|
||||
#elif __aarch64__
|
||||
#define V_SUPPORTED 1
|
||||
#include <arm_neon.h>
|
||||
typedef float32x4_t v_f32_t;
|
||||
typedef uint32x4_t v_u32_t;
|
||||
typedef int32x4_t v_s32_t;
|
||||
typedef float64x2_t v_f64_t;
|
||||
typedef uint64x2_t v_u64_t;
|
||||
typedef int64x2_t v_s64_t;
|
||||
|
||||
static inline int
|
||||
v_lanes32 (void)
|
||||
{
|
||||
return 4;
|
||||
}
|
||||
|
||||
static inline v_f32_t
|
||||
v_f32 (f32_t x)
|
||||
{
|
||||
return (v_f32_t){x, x, x, x};
|
||||
}
|
||||
static inline v_u32_t
|
||||
v_u32 (u32_t x)
|
||||
{
|
||||
return (v_u32_t){x, x, x, x};
|
||||
}
|
||||
static inline v_s32_t
|
||||
v_s32 (s32_t x)
|
||||
{
|
||||
return (v_s32_t){x, x, x, x};
|
||||
}
|
||||
|
||||
static inline f32_t
|
||||
v_get_f32 (v_f32_t x, int i)
|
||||
{
|
||||
return x[i];
|
||||
}
|
||||
static inline u32_t
|
||||
v_get_u32 (v_u32_t x, int i)
|
||||
{
|
||||
return x[i];
|
||||
}
|
||||
static inline s32_t
|
||||
v_get_s32 (v_s32_t x, int i)
|
||||
{
|
||||
return x[i];
|
||||
}
|
||||
|
||||
static inline void
|
||||
v_set_f32 (v_f32_t *x, int i, f32_t v)
|
||||
{
|
||||
(*x)[i] = v;
|
||||
}
|
||||
static inline void
|
||||
v_set_u32 (v_u32_t *x, int i, u32_t v)
|
||||
{
|
||||
(*x)[i] = v;
|
||||
}
|
||||
static inline void
|
||||
v_set_s32 (v_s32_t *x, int i, s32_t v)
|
||||
{
|
||||
(*x)[i] = v;
|
||||
}
|
||||
|
||||
/* true if any elements of a v_cond result is non-zero. */
|
||||
static inline int
|
||||
v_any_u32 (v_u32_t x)
|
||||
{
|
||||
/* assume elements in x are either 0 or -1u. */
|
||||
return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0;
|
||||
}
|
||||
/* to wrap the result of relational operators. */
|
||||
static inline v_u32_t
|
||||
v_cond_u32 (v_u32_t x)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
static inline v_f32_t
|
||||
v_abs_f32 (v_f32_t x)
|
||||
{
|
||||
return vabsq_f32 (x);
|
||||
}
|
||||
static inline v_f32_t
|
||||
v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z)
|
||||
{
|
||||
return vfmaq_f32 (z, x, y);
|
||||
}
|
||||
static inline v_f32_t
|
||||
v_round_f32 (v_f32_t x)
|
||||
{
|
||||
return vrndaq_f32 (x);
|
||||
}
|
||||
static inline v_s32_t
|
||||
v_round_s32 (v_f32_t x)
|
||||
{
|
||||
return vcvtaq_s32_f32 (x);
|
||||
}
|
||||
static inline v_f32_t
|
||||
v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y)
|
||||
{
|
||||
return vbslq_f32 (p, x, y);
|
||||
}
|
||||
/* convert to type1 from type2. */
|
||||
static inline v_f32_t
|
||||
v_to_f32_s32 (v_s32_t x)
|
||||
{
|
||||
return (v_f32_t){x[0], x[1], x[2], x[3]};
|
||||
}
|
||||
static inline v_f32_t
|
||||
v_to_f32_u32 (v_u32_t x)
|
||||
{
|
||||
return (v_f32_t){x[0], x[1], x[2], x[3]};
|
||||
}
|
||||
/* reinterpret as type1 from type2. */
|
||||
static inline v_u32_t
|
||||
v_as_u32_f32 (v_f32_t x)
|
||||
{
|
||||
union { v_f32_t f; v_u32_t u; } r = {x};
|
||||
return r.u;
|
||||
}
|
||||
static inline v_f32_t
|
||||
v_as_f32_u32 (v_u32_t x)
|
||||
{
|
||||
union { v_u32_t u; v_f32_t f; } r = {x};
|
||||
return r.f;
|
||||
}
|
||||
static inline v_s32_t
|
||||
v_as_s32_u32 (v_u32_t x)
|
||||
{
|
||||
union { v_u32_t u; v_s32_t i; } r = {x};
|
||||
return r.i;
|
||||
}
|
||||
static inline v_u32_t
|
||||
v_as_u32_s32 (v_s32_t x)
|
||||
{
|
||||
union { v_s32_t i; v_u32_t u; } r = {x};
|
||||
return r.u;
|
||||
}
|
||||
static inline v_f32_t
|
||||
v_lookup_f32 (const f32_t *tab, v_u32_t idx)
|
||||
{
|
||||
return (v_f32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
|
||||
}
|
||||
static inline v_u32_t
|
||||
v_lookup_u32 (const u32_t *tab, v_u32_t idx)
|
||||
{
|
||||
return (v_u32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
|
||||
}
|
||||
static inline v_f32_t
|
||||
v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p)
|
||||
{
|
||||
return (v_f32_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1],
|
||||
p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]};
|
||||
}
|
||||
static inline v_f32_t
|
||||
v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y,
|
||||
v_u32_t p)
|
||||
{
|
||||
return (
|
||||
v_f32_t){p[0] ? f (x1[0], x2[0]) : y[0], p[1] ? f (x1[1], x2[1]) : y[1],
|
||||
p[2] ? f (x1[2], x2[2]) : y[2], p[3] ? f (x1[3], x2[3]) : y[3]};
|
||||
}
|
||||
|
||||
static inline int
|
||||
v_lanes64 (void)
|
||||
{
|
||||
return 2;
|
||||
}
|
||||
static inline v_f64_t
|
||||
v_f64 (f64_t x)
|
||||
{
|
||||
return (v_f64_t){x, x};
|
||||
}
|
||||
static inline v_u64_t
|
||||
v_u64 (u64_t x)
|
||||
{
|
||||
return (v_u64_t){x, x};
|
||||
}
|
||||
static inline v_s64_t
|
||||
v_s64 (s64_t x)
|
||||
{
|
||||
return (v_s64_t){x, x};
|
||||
}
|
||||
static inline f64_t
|
||||
v_get_f64 (v_f64_t x, int i)
|
||||
{
|
||||
return x[i];
|
||||
}
|
||||
static inline void
|
||||
v_set_f64 (v_f64_t *x, int i, f64_t v)
|
||||
{
|
||||
(*x)[i] = v;
|
||||
}
|
||||
/* true if any elements of a v_cond result is non-zero. */
|
||||
static inline int
|
||||
v_any_u64 (v_u64_t x)
|
||||
{
|
||||
/* assume elements in x are either 0 or -1u. */
|
||||
return vpaddd_u64 (x) != 0;
|
||||
}
|
||||
/* to wrap the result of relational operators. */
|
||||
static inline v_u64_t
|
||||
v_cond_u64 (v_u64_t x)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
static inline v_f64_t
|
||||
v_abs_f64 (v_f64_t x)
|
||||
{
|
||||
return vabsq_f64 (x);
|
||||
}
|
||||
static inline v_f64_t
|
||||
v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z)
|
||||
{
|
||||
return vfmaq_f64 (z, x, y);
|
||||
}
|
||||
static inline v_f64_t
|
||||
v_round_f64 (v_f64_t x)
|
||||
{
|
||||
return vrndaq_f64 (x);
|
||||
}
|
||||
static inline v_s64_t
|
||||
v_round_s64 (v_f64_t x)
|
||||
{
|
||||
return vcvtaq_s64_f64 (x);
|
||||
}
|
||||
static inline v_f64_t
|
||||
v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y)
|
||||
{
|
||||
return vbslq_f64 (p, x, y);
|
||||
}
|
||||
/* convert to type1 from type2. */
|
||||
static inline v_f64_t
|
||||
v_to_f64_s64 (v_s64_t x)
|
||||
{
|
||||
return (v_f64_t){x[0], x[1]};
|
||||
}
|
||||
static inline v_f64_t
|
||||
v_to_f64_u64 (v_u64_t x)
|
||||
{
|
||||
return (v_f64_t){x[0], x[1]};
|
||||
}
|
||||
/* reinterpret as type1 from type2. */
|
||||
static inline v_u64_t
|
||||
v_as_u64_f64 (v_f64_t x)
|
||||
{
|
||||
union { v_f64_t f; v_u64_t u; } r = {x};
|
||||
return r.u;
|
||||
}
|
||||
static inline v_f64_t
|
||||
v_as_f64_u64 (v_u64_t x)
|
||||
{
|
||||
union { v_u64_t u; v_f64_t f; } r = {x};
|
||||
return r.f;
|
||||
}
|
||||
static inline v_s64_t
|
||||
v_as_s64_u64 (v_u64_t x)
|
||||
{
|
||||
union { v_u64_t u; v_s64_t i; } r = {x};
|
||||
return r.i;
|
||||
}
|
||||
static inline v_u64_t
|
||||
v_as_u64_s64 (v_s64_t x)
|
||||
{
|
||||
union { v_s64_t i; v_u64_t u; } r = {x};
|
||||
return r.u;
|
||||
}
|
||||
static inline v_f64_t
|
||||
v_lookup_f64 (const f64_t *tab, v_u64_t idx)
|
||||
{
|
||||
return (v_f64_t){tab[idx[0]], tab[idx[1]]};
|
||||
}
|
||||
static inline v_u64_t
|
||||
v_lookup_u64 (const u64_t *tab, v_u64_t idx)
|
||||
{
|
||||
return (v_u64_t){tab[idx[0]], tab[idx[1]]};
|
||||
}
|
||||
static inline v_f64_t
|
||||
v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p)
|
||||
{
|
||||
return (v_f64_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1]};
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
#endif
|
@ -1,27 +0,0 @@
|
||||
/*
|
||||
* Double-precision vector pow function.
|
||||
*
|
||||
* Copyright (c) 2020, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
#if V_SUPPORTED
|
||||
|
||||
VPCS_ATTR
|
||||
v_f64_t
|
||||
V_NAME(pow) (v_f64_t x, v_f64_t y)
|
||||
{
|
||||
v_f64_t z;
|
||||
for (int lane = 0; lane < v_lanes64 (); lane++)
|
||||
{
|
||||
f64_t sx = v_get_f64 (x, lane);
|
||||
f64_t sy = v_get_f64 (y, lane);
|
||||
f64_t sz = pow (sx, sy);
|
||||
v_set_f64 (&z, lane, sz);
|
||||
}
|
||||
return z;
|
||||
}
|
||||
VPCS_ALIAS
|
||||
#endif
|
@ -1,235 +0,0 @@
|
||||
/*
|
||||
* Single-precision vector powf function.
|
||||
*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
#if V_SUPPORTED
|
||||
|
||||
#define Min v_u32 (0x00800000)
|
||||
#define Max v_u32 (0x7f800000)
|
||||
#define SBITS 5
|
||||
#define Tlog v__powf_log2_data.tab
|
||||
#define Texp v__exp2f_data.tab
|
||||
#define A v__powf_log2_data.poly
|
||||
#define C v__exp2f_data.poly
|
||||
#define LOGDEG 4
|
||||
|
||||
#if LOGDEG == 5
|
||||
/* 1.01 ulp */
|
||||
#define OFF v_u32 (0x3f330000)
|
||||
#define TBITS 4
|
||||
#elif LOGDEG == 4
|
||||
/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2) */
|
||||
#define OFF v_u32 (0x3f35d000)
|
||||
#define TBITS 5
|
||||
#endif
|
||||
|
||||
#define V_EXP2F_TABLE_BITS SBITS
|
||||
#define V_EXP2F_POLY_ORDER 3
|
||||
struct v_exp2f_data
|
||||
{
|
||||
uint64_t tab[1 << V_EXP2F_TABLE_BITS];
|
||||
double poly[V_EXP2F_POLY_ORDER];
|
||||
};
|
||||
|
||||
#define V_POWF_LOG2_TABLE_BITS TBITS
|
||||
#define V_POWF_LOG2_POLY_ORDER LOGDEG
|
||||
#define SCALE ((double) (1 << SBITS))
|
||||
struct v_powf_log2_data
|
||||
{
|
||||
struct
|
||||
{
|
||||
double invc, logc;
|
||||
} tab[1 << V_POWF_LOG2_TABLE_BITS];
|
||||
double poly[V_POWF_LOG2_POLY_ORDER];
|
||||
};
|
||||
|
||||
static const struct v_powf_log2_data v__powf_log2_data = {
|
||||
#if LOGDEG == 5
|
||||
.tab = {
|
||||
{ 0x1.661ec79f8f3bep+0, -0x1.efec65b963019p-2 * SCALE },
|
||||
{ 0x1.571ed4aaf883dp+0, -0x1.b0b6832d4fca4p-2 * SCALE },
|
||||
{ 0x1.49539f0f010bp+0, -0x1.7418b0a1fb77bp-2 * SCALE },
|
||||
{ 0x1.3c995b0b80385p+0, -0x1.39de91a6dcf7bp-2 * SCALE },
|
||||
{ 0x1.30d190c8864a5p+0, -0x1.01d9bf3f2b631p-2 * SCALE },
|
||||
{ 0x1.25e227b0b8eap+0, -0x1.97c1d1b3b7afp-3 * SCALE },
|
||||
{ 0x1.1bb4a4a1a343fp+0, -0x1.2f9e393af3c9fp-3 * SCALE },
|
||||
{ 0x1.12358f08ae5bap+0, -0x1.960cbbf788d5cp-4 * SCALE },
|
||||
{ 0x1.0953f419900a7p+0, -0x1.a6f9db6475fcep-5 * SCALE },
|
||||
{ 0x1p+0, 0x0p+0 * SCALE },
|
||||
{ 0x1.e608cfd9a47acp-1, 0x1.338ca9f24f53dp-4 * SCALE },
|
||||
{ 0x1.ca4b31f026aap-1, 0x1.476a9543891bap-3 * SCALE },
|
||||
{ 0x1.b2036576afce6p-1, 0x1.e840b4ac4e4d2p-3 * SCALE },
|
||||
{ 0x1.9c2d163a1aa2dp-1, 0x1.40645f0c6651cp-2 * SCALE },
|
||||
{ 0x1.886e6037841edp-1, 0x1.88e9c2c1b9ff8p-2 * SCALE },
|
||||
{ 0x1.767dcf5534862p-1, 0x1.ce0a44eb17bccp-2 * SCALE },
|
||||
},
|
||||
/* rel err: 1.46 * 2^-32 */
|
||||
.poly = {
|
||||
0x1.27616c9496e0bp-2 * SCALE, -0x1.71969a075c67ap-2 * SCALE,
|
||||
0x1.ec70a6ca7baddp-2 * SCALE, -0x1.7154748bef6c8p-1 * SCALE,
|
||||
0x1.71547652ab82bp0 * SCALE,
|
||||
}
|
||||
#elif LOGDEG == 4
|
||||
.tab = {
|
||||
{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * SCALE},
|
||||
{0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * SCALE},
|
||||
{0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * SCALE},
|
||||
{0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * SCALE},
|
||||
{0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * SCALE},
|
||||
{0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * SCALE},
|
||||
{0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * SCALE},
|
||||
{0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * SCALE},
|
||||
{0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * SCALE},
|
||||
{0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * SCALE},
|
||||
{0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * SCALE},
|
||||
{0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * SCALE},
|
||||
{0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * SCALE},
|
||||
{0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * SCALE},
|
||||
{0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * SCALE},
|
||||
{0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * SCALE},
|
||||
{0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * SCALE},
|
||||
{0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * SCALE},
|
||||
{0x1p+0, 0x0p+0 * SCALE},
|
||||
{0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * SCALE},
|
||||
{0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * SCALE},
|
||||
{0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * SCALE},
|
||||
{0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * SCALE},
|
||||
{0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * SCALE},
|
||||
{0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * SCALE},
|
||||
{0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * SCALE},
|
||||
{0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * SCALE},
|
||||
{0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * SCALE},
|
||||
{0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * SCALE},
|
||||
{0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * SCALE},
|
||||
{0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * SCALE},
|
||||
{0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * SCALE},
|
||||
},
|
||||
/* rel err: 1.5 * 2^-30 */
|
||||
.poly = {
|
||||
-0x1.6ff5daa3b3d7cp-2 * SCALE,
|
||||
0x1.ec81d03c01aebp-2 * SCALE,
|
||||
-0x1.71547bb43f101p-1 * SCALE,
|
||||
0x1.7154764a815cbp0 * SCALE,
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
static const struct v_exp2f_data v__exp2f_data = {
|
||||
.tab = {
|
||||
0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51,
|
||||
0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1,
|
||||
0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d,
|
||||
0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585,
|
||||
0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13,
|
||||
0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d,
|
||||
0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069,
|
||||
0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,
|
||||
},
|
||||
/* rel err: 1.69 * 2^-34 */
|
||||
.poly = {
|
||||
0x1.c6af84b912394p-5/SCALE/SCALE/SCALE, 0x1.ebfce50fac4f3p-3/SCALE/SCALE, 0x1.62e42ff0c52d6p-1/SCALE
|
||||
},
|
||||
};
|
||||
|
||||
VPCS_ATTR
|
||||
__attribute__ ((noinline)) static v_f32_t
|
||||
specialcase (v_f32_t x, v_f32_t y, v_f32_t ret, v_u32_t cmp)
|
||||
{
|
||||
return v_call2_f32 (powf, x, y, ret, cmp);
|
||||
}
|
||||
|
||||
VPCS_ATTR
|
||||
v_f32_t
|
||||
V_NAME(powf) (v_f32_t x, v_f32_t y)
|
||||
{
|
||||
v_u32_t u, tmp, cmp, i, top, iz;
|
||||
v_s32_t k;
|
||||
v_f32_t ret;
|
||||
|
||||
u = v_as_u32_f32 (x);
|
||||
cmp = v_cond_u32 (u - Min >= Max - Min);
|
||||
tmp = u - OFF;
|
||||
i = (tmp >> (23 - TBITS)) % (1 << TBITS);
|
||||
top = tmp & 0xff800000;
|
||||
iz = u - top;
|
||||
k = v_as_s32_u32 (top) >> (23 - SBITS); /* arithmetic shift */
|
||||
|
||||
for (int lane = 0; lane < v_lanes32 (); lane++)
|
||||
{
|
||||
uint32_t si, siz;
|
||||
int32_t sk;
|
||||
float sy;
|
||||
|
||||
/* Use double precision for each lane. */
|
||||
double invc, logc, z, r, p, y0, logx, ylogx, kd, s;
|
||||
uint64_t ki, t;
|
||||
|
||||
si = v_get_u32 (i, lane);
|
||||
siz = v_get_u32 (iz, lane);
|
||||
sk = v_get_s32 (k, lane);
|
||||
sy = v_get_f32 (y, lane);
|
||||
|
||||
invc = Tlog[si].invc;
|
||||
logc = Tlog[si].logc;
|
||||
z = (double) as_f32_u32 (siz);
|
||||
|
||||
/* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k */
|
||||
r = __builtin_fma (z, invc, -1.0);
|
||||
y0 = logc + (double) sk;
|
||||
|
||||
/* Polynomial to approximate log1p(r)/ln2. */
|
||||
#if LOGDEG == 5
|
||||
logx = A[0];
|
||||
logx = r * logx + A[1];
|
||||
logx = r * logx + A[2];
|
||||
logx = r * logx + A[3];
|
||||
logx = r * logx + A[4];
|
||||
logx = r * logx + y0;
|
||||
#elif LOGDEG == 4
|
||||
logx = A[0];
|
||||
logx = r * logx + A[1];
|
||||
logx = r * logx + A[2];
|
||||
logx = r * logx + A[3];
|
||||
logx = r * logx + y0;
|
||||
#endif
|
||||
ylogx = sy * logx;
|
||||
v_set_u32 (&cmp, lane,
|
||||
(as_u64_f64 (ylogx) >> 47 & 0xffff)
|
||||
>= as_u64_f64 (126.0 * (1 << SBITS)) >> 47
|
||||
? 1
|
||||
: v_get_u32 (cmp, lane));
|
||||
|
||||
/* N*x = k + r with r in [-1/2, 1/2] */
|
||||
#if TOINT_INTRINSICS
|
||||
kd = roundtoint (ylogx); /* k */
|
||||
ki = converttoint (ylogx);
|
||||
#else
|
||||
# define SHIFT 0x1.8p52
|
||||
kd = eval_as_double (ylogx + SHIFT);
|
||||
ki = asuint64 (kd);
|
||||
kd -= SHIFT;
|
||||
#endif
|
||||
r = ylogx - kd;
|
||||
|
||||
/* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */
|
||||
t = Texp[ki % (1 << SBITS)];
|
||||
t += ki << (52 - SBITS);
|
||||
s = as_f64_u64 (t);
|
||||
p = C[0];
|
||||
p = __builtin_fma (p, r, C[1]);
|
||||
p = __builtin_fma (p, r, C[2]);
|
||||
p = __builtin_fma (p, s * r, s);
|
||||
|
||||
v_set_f32 (&ret, lane, p);
|
||||
}
|
||||
if (unlikely (v_any_u32 (cmp)))
|
||||
return specialcase (x, y, ret, cmp);
|
||||
return ret;
|
||||
}
|
||||
VPCS_ALIAS
|
||||
#endif
|
@ -1,103 +0,0 @@
|
||||
/*
|
||||
* Double-precision vector sin function.
|
||||
*
|
||||
* Copyright (c) 2019-2022, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
#if V_SUPPORTED
|
||||
|
||||
static const double Poly[] = {
|
||||
/* worst-case error is 3.5 ulp.
|
||||
abs error: 0x1.be222a58p-53 in [-pi/2, pi/2]. */
|
||||
-0x1.9f4a9c8b21dc9p-41,
|
||||
0x1.60e88a10163f2p-33,
|
||||
-0x1.ae6361b7254e7p-26,
|
||||
0x1.71de382e8d62bp-19,
|
||||
-0x1.a01a019aeb4ffp-13,
|
||||
0x1.111111110b25ep-7,
|
||||
-0x1.55555555554c3p-3,
|
||||
};
|
||||
|
||||
#define C7 v_f64 (Poly[0])
|
||||
#define C6 v_f64 (Poly[1])
|
||||
#define C5 v_f64 (Poly[2])
|
||||
#define C4 v_f64 (Poly[3])
|
||||
#define C3 v_f64 (Poly[4])
|
||||
#define C2 v_f64 (Poly[5])
|
||||
#define C1 v_f64 (Poly[6])
|
||||
|
||||
#define InvPi v_f64 (0x1.45f306dc9c883p-2)
|
||||
#define Pi1 v_f64 (0x1.921fb54442d18p+1)
|
||||
#define Pi2 v_f64 (0x1.1a62633145c06p-53)
|
||||
#define Pi3 v_f64 (0x1.c1cd129024e09p-106)
|
||||
#define Shift v_f64 (0x1.8p52)
|
||||
#define AbsMask v_u64 (0x7fffffffffffffff)
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
#define TinyBound 0x202 /* top12 (asuint64 (0x1p-509)). */
|
||||
#define Thresh 0x214 /* top12 (asuint64 (RangeVal)) - TinyBound. */
|
||||
#else
|
||||
#define RangeVal v_f64 (0x1p23)
|
||||
#endif
|
||||
|
||||
VPCS_ATTR
|
||||
__attribute__ ((noinline)) static v_f64_t
|
||||
specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
|
||||
{
|
||||
return v_call_f64 (sin, x, y, cmp);
|
||||
}
|
||||
|
||||
VPCS_ATTR
|
||||
v_f64_t
|
||||
V_NAME(sin) (v_f64_t x)
|
||||
{
|
||||
v_f64_t n, r, r2, y;
|
||||
v_u64_t sign, odd, cmp, ir;
|
||||
|
||||
ir = v_as_u64_f64 (x) & AbsMask;
|
||||
r = v_as_f64_u64 (ir);
|
||||
sign = v_as_u64_f64 (x) & ~AbsMask;
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
/* Detect |x| <= 0x1p-509 or |x| >= RangeVal. If fenv exceptions are to be
|
||||
triggered correctly, set any special lanes to 1 (which is neutral w.r.t.
|
||||
fenv). These lanes will be fixed by specialcase later. */
|
||||
cmp = v_cond_u64 ((ir >> 52) - TinyBound >= Thresh);
|
||||
if (unlikely (v_any_u64 (cmp)))
|
||||
r = v_sel_f64 (cmp, v_f64 (1), r);
|
||||
#else
|
||||
cmp = v_cond_u64 (ir >= v_as_u64_f64 (RangeVal));
|
||||
#endif
|
||||
|
||||
/* n = rint(|x|/pi). */
|
||||
n = v_fma_f64 (InvPi, r, Shift);
|
||||
odd = v_as_u64_f64 (n) << 63;
|
||||
n -= Shift;
|
||||
|
||||
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
|
||||
r = v_fma_f64 (-Pi1, n, r);
|
||||
r = v_fma_f64 (-Pi2, n, r);
|
||||
r = v_fma_f64 (-Pi3, n, r);
|
||||
|
||||
/* sin(r) poly approx. */
|
||||
r2 = r * r;
|
||||
y = v_fma_f64 (C7, r2, C6);
|
||||
y = v_fma_f64 (y, r2, C5);
|
||||
y = v_fma_f64 (y, r2, C4);
|
||||
y = v_fma_f64 (y, r2, C3);
|
||||
y = v_fma_f64 (y, r2, C2);
|
||||
y = v_fma_f64 (y, r2, C1);
|
||||
y = v_fma_f64 (y * r2, r, r);
|
||||
|
||||
/* sign. */
|
||||
y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign ^ odd);
|
||||
|
||||
if (unlikely (v_any_u64 (cmp)))
|
||||
return specialcase (x, y, cmp);
|
||||
return y;
|
||||
}
|
||||
VPCS_ALIAS
|
||||
#endif
|
@ -1,88 +0,0 @@
|
||||
/*
|
||||
* Single-precision vector sin function.
|
||||
*
|
||||
* Copyright (c) 2019-2022, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "v_math.h"
|
||||
#if V_SUPPORTED
|
||||
|
||||
static const float Poly[] = {
|
||||
/* 1.886 ulp error */
|
||||
0x1.5b2e76p-19f,
|
||||
-0x1.9f42eap-13f,
|
||||
0x1.110df4p-7f,
|
||||
-0x1.555548p-3f,
|
||||
};
|
||||
#define Pi1 v_f32 (0x1.921fb6p+1f)
|
||||
#define Pi2 v_f32 (-0x1.777a5cp-24f)
|
||||
#define Pi3 v_f32 (-0x1.ee59dap-49f)
|
||||
#define A3 v_f32 (Poly[3])
|
||||
#define A5 v_f32 (Poly[2])
|
||||
#define A7 v_f32 (Poly[1])
|
||||
#define A9 v_f32 (Poly[0])
|
||||
#define RangeVal v_f32 (0x1p20f)
|
||||
#define TinyBound v_f32 (0x1p-61f)
|
||||
#define InvPi v_f32 (0x1.45f306p-2f)
|
||||
#define Shift v_f32 (0x1.8p+23f)
|
||||
#define AbsMask v_u32 (0x7fffffff)
|
||||
|
||||
VPCS_ATTR
|
||||
static v_f32_t
|
||||
specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
|
||||
{
|
||||
/* Fall back to scalar code. */
|
||||
return v_call_f32 (sinf, x, y, cmp);
|
||||
}
|
||||
|
||||
VPCS_ATTR
|
||||
v_f32_t
|
||||
V_NAME(sinf) (v_f32_t x)
|
||||
{
|
||||
v_f32_t n, r, r2, y;
|
||||
v_u32_t sign, odd, cmp, ir;
|
||||
|
||||
ir = v_as_u32_f32 (x) & AbsMask;
|
||||
r = v_as_f32_u32 (ir);
|
||||
sign = v_as_u32_f32 (x) & ~AbsMask;
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
cmp = v_cond_u32 ((ir - v_as_u32_f32 (TinyBound)
|
||||
>= v_as_u32_f32 (RangeVal) - v_as_u32_f32 (TinyBound)));
|
||||
if (unlikely (v_any_u32 (cmp)))
|
||||
/* If fenv exceptions are to be triggered correctly, set any special lanes
|
||||
to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
|
||||
specialcase later. */
|
||||
r = v_sel_f32 (cmp, v_f32 (1), r);
|
||||
#else
|
||||
cmp = v_cond_u32 (ir >= v_as_u32_f32 (RangeVal));
|
||||
#endif
|
||||
|
||||
/* n = rint(|x|/pi) */
|
||||
n = v_fma_f32 (InvPi, r, Shift);
|
||||
odd = v_as_u32_f32 (n) << 31;
|
||||
n -= Shift;
|
||||
|
||||
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */
|
||||
r = v_fma_f32 (-Pi1, n, r);
|
||||
r = v_fma_f32 (-Pi2, n, r);
|
||||
r = v_fma_f32 (-Pi3, n, r);
|
||||
|
||||
/* y = sin(r) */
|
||||
r2 = r * r;
|
||||
y = v_fma_f32 (A9, r2, A7);
|
||||
y = v_fma_f32 (y, r2, A5);
|
||||
y = v_fma_f32 (y, r2, A3);
|
||||
y = v_fma_f32 (y * r2, r, r);
|
||||
|
||||
/* sign fix */
|
||||
y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign ^ odd);
|
||||
|
||||
if (unlikely (v_any_u32 (cmp)))
|
||||
return specialcase (x, y, cmp);
|
||||
return y;
|
||||
}
|
||||
VPCS_ALIAS
|
||||
#endif
|
@ -1,12 +0,0 @@
|
||||
/*
|
||||
* AdvSIMD vector PCS variant of __v_cos.
|
||||
*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#include "mathlib.h"
|
||||
#ifdef __vpcs
|
||||
#define VPCS 1
|
||||
#define VPCS_ALIAS strong_alias (__vn_cos, _ZGVnN2v_cos)
|
||||
#include "v_cos.c"
|
||||
#endif
|
@ -1,12 +0,0 @@
|
||||
/*
|
||||
* AdvSIMD vector PCS variant of __v_cosf.
|
||||
*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#include "mathlib.h"
|
||||
#ifdef __vpcs
|
||||
#define VPCS 1
|
||||
#define VPCS_ALIAS strong_alias (__vn_cosf, _ZGVnN4v_cosf)
|
||||
#include "v_cosf.c"
|
||||
#endif
|
@ -1,12 +0,0 @@
|
||||
/*
|
||||
* AdvSIMD vector PCS variant of __v_exp.
|
||||
*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#include "mathlib.h"
|
||||
#ifdef __vpcs
|
||||
#define VPCS 1
|
||||
#define VPCS_ALIAS strong_alias (__vn_exp, _ZGVnN2v_exp)
|
||||
#include "v_exp.c"
|
||||
#endif
|
@ -1,12 +0,0 @@
|
||||
/*
|
||||
* AdvSIMD vector PCS variant of __v_exp2f.
|
||||
*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#include "mathlib.h"
|
||||
#ifdef __vpcs
|
||||
#define VPCS 1
|
||||
#define VPCS_ALIAS strong_alias (__vn_exp2f, _ZGVnN4v_exp2f)
|
||||
#include "v_exp2f.c"
|
||||
#endif
|
@ -1,11 +0,0 @@
|
||||
/*
|
||||
* AdvSIMD vector PCS variant of __v_exp2f_1u.
|
||||
*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#include "mathlib.h"
|
||||
#ifdef __vpcs
|
||||
#define VPCS 1
|
||||
#include "v_exp2f_1u.c"
|
||||
#endif
|
@ -1,12 +0,0 @@
|
||||
/*
|
||||
* AdvSIMD vector PCS variant of __v_expf.
|
||||
*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#include "mathlib.h"
|
||||
#ifdef __vpcs
|
||||
#define VPCS 1
|
||||
#define VPCS_ALIAS strong_alias (__vn_expf, _ZGVnN4v_expf)
|
||||
#include "v_expf.c"
|
||||
#endif
|
@ -1,11 +0,0 @@
|
||||
/*
|
||||
* AdvSIMD vector PCS variant of __v_expf_1u.
|
||||
*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#include "mathlib.h"
|
||||
#ifdef __vpcs
|
||||
#define VPCS 1
|
||||
#include "v_expf_1u.c"
|
||||
#endif
|
@ -1,12 +0,0 @@
|
||||
/*
|
||||
* AdvSIMD vector PCS variant of __v_log.
|
||||
*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#include "mathlib.h"
|
||||
#ifdef __vpcs
|
||||
#define VPCS 1
|
||||
#define VPCS_ALIAS strong_alias (__vn_log, _ZGVnN2v_log)
|
||||
#include "v_log.c"
|
||||
#endif
|
@ -1,12 +0,0 @@
|
||||
/*
|
||||
* AdvSIMD vector PCS variant of __v_logf.
|
||||
*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#include "mathlib.h"
|
||||
#ifdef __vpcs
|
||||
#define VPCS 1
|
||||
#define VPCS_ALIAS strong_alias (__vn_logf, _ZGVnN4v_logf)
|
||||
#include "v_logf.c"
|
||||
#endif
|
@ -1,12 +0,0 @@
|
||||
/*
|
||||
* AdvSIMD vector PCS variant of __v_pow.
|
||||
*
|
||||
* Copyright (c) 2020, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#include "mathlib.h"
|
||||
#ifdef __vpcs
|
||||
#define VPCS 1
|
||||
#define VPCS_ALIAS strong_alias (__vn_pow, _ZGVnN2vv_pow)
|
||||
#include "v_pow.c"
|
||||
#endif
|
@ -1,12 +0,0 @@
|
||||
/*
|
||||
* AdvSIMD vector PCS variant of __v_powf.
|
||||
*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#include "mathlib.h"
|
||||
#ifdef __vpcs
|
||||
#define VPCS 1
|
||||
#define VPCS_ALIAS strong_alias (__vn_powf, _ZGVnN4vv_powf)
|
||||
#include "v_powf.c"
|
||||
#endif
|
@ -1,12 +0,0 @@
|
||||
/*
|
||||
* AdvSIMD vector PCS variant of __v_sin.
|
||||
*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#include "mathlib.h"
|
||||
#ifdef __vpcs
|
||||
#define VPCS 1
|
||||
#define VPCS_ALIAS strong_alias (__vn_sin, _ZGVnN2v_sin)
|
||||
#include "v_sin.c"
|
||||
#endif
|
@ -1,12 +0,0 @@
|
||||
/*
|
||||
* AdvSIMD vector PCS variant of __v_sinf.
|
||||
*
|
||||
* Copyright (c) 2019, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#include "mathlib.h"
|
||||
#ifdef __vpcs
|
||||
#define VPCS 1
|
||||
#define VPCS_ALIAS strong_alias (__vn_sinf, _ZGVnN4v_sinf)
|
||||
#include "v_sinf.c"
|
||||
#endif
|
@ -1,13 +1,18 @@
|
||||
# Makefile fragment - requires GNU make
|
||||
#
|
||||
# Copyright (c) 2019-2023, Arm Limited.
|
||||
# Copyright (c) 2019-2024, Arm Limited.
|
||||
# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
|
||||
PLM := $(srcdir)/pl/math
|
||||
AOR := $(srcdir)/math
|
||||
B := build/pl/math
|
||||
|
||||
math-lib-srcs := $(wildcard $(PLM)/*.[cS])
|
||||
pl-lib-srcs := $(wildcard $(PLM)/*.[cS])
|
||||
|
||||
ifeq ($(WANT_SVE_MATH), 0)
|
||||
pl-lib-srcs := $(filter-out $(PLM)/sv_%, $(pl-lib-srcs))
|
||||
endif
|
||||
|
||||
math-test-srcs := \
|
||||
$(AOR)/test/mathtest.c \
|
||||
$(AOR)/test/mathbench.c \
|
||||
@ -15,10 +20,10 @@ math-test-srcs := \
|
||||
|
||||
math-test-host-srcs := $(wildcard $(AOR)/test/rtest/*.[cS])
|
||||
|
||||
math-includes := $(patsubst $(PLM)/%,build/pl/%,$(wildcard $(PLM)/include/*.h))
|
||||
math-test-includes := $(patsubst $(PLM)/%,build/pl/include/%,$(wildcard $(PLM)/test/*.h))
|
||||
pl-includes := $(patsubst $(PLM)/%,build/pl/%,$(wildcard $(PLM)/include/*.h))
|
||||
pl-test-includes := $(patsubst $(PLM)/%,build/pl/include/%,$(wildcard $(PLM)/test/*.h))
|
||||
|
||||
math-libs := \
|
||||
pl-libs := \
|
||||
build/pl/lib/libmathlib.so \
|
||||
build/pl/lib/libmathlib.a \
|
||||
|
||||
@ -32,37 +37,39 @@ math-tools := \
|
||||
math-host-tools := \
|
||||
build/pl/bin/rtest \
|
||||
|
||||
math-lib-objs := $(patsubst $(PLM)/%,$(B)/%.o,$(basename $(math-lib-srcs)))
|
||||
pl-lib-objs := $(patsubst $(PLM)/%,$(B)/%.o,$(basename $(pl-lib-srcs)))
|
||||
math-test-objs := $(patsubst $(AOR)/%,$(B)/%.o,$(basename $(math-test-srcs)))
|
||||
math-host-objs := $(patsubst $(AOR)/%,$(B)/%.o,$(basename $(math-test-host-srcs)))
|
||||
math-target-objs := $(math-lib-objs) $(math-test-objs)
|
||||
math-objs := $(math-target-objs) $(math-target-objs:%.o=%.os) $(math-host-objs)
|
||||
pl-target-objs := $(pl-lib-objs) $(math-test-objs)
|
||||
pl-objs := $(pl-target-objs) $(pl-target-objs:%.o=%.os) $(math-host-objs)
|
||||
|
||||
pl/math-files := \
|
||||
$(math-objs) \
|
||||
$(math-libs) \
|
||||
$(pl-objs) \
|
||||
$(pl-libs) \
|
||||
$(math-tools) \
|
||||
$(math-host-tools) \
|
||||
$(math-includes) \
|
||||
$(math-test-includes) \
|
||||
$(pl-includes) \
|
||||
$(pl-test-includes) \
|
||||
|
||||
all-pl/math: $(math-libs) $(math-tools) $(math-includes) $(math-test-includes)
|
||||
all-pl/math: $(pl-libs) $(math-tools) $(pl-includes) $(pl-test-includes)
|
||||
|
||||
$(math-objs): $(math-includes) $(math-test-includes)
|
||||
$(math-objs): CFLAGS_PL += $(math-cflags)
|
||||
$(pl-objs): $(pl-includes) $(pl-test-includes)
|
||||
$(pl-objs): CFLAGS_PL += $(math-cflags)
|
||||
$(B)/test/mathtest.o: CFLAGS_PL += -fmath-errno
|
||||
$(math-host-objs): CC = $(HOST_CC)
|
||||
$(math-host-objs): CFLAGS_PL = $(HOST_CFLAGS)
|
||||
|
||||
build/pl/include/test/ulp_funcs_gen.h: $(math-lib-srcs)
|
||||
$(B)/sv_%: CFLAGS_PL += $(math-sve-cflags)
|
||||
|
||||
build/pl/include/test/ulp_funcs_gen.h: $(pl-lib-srcs)
|
||||
# Replace PL_SIG
|
||||
cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=_Z##v##t##a(f)" -P > $@
|
||||
|
||||
build/pl/include/test/mathbench_funcs_gen.h: $(math-lib-srcs)
|
||||
build/pl/include/test/mathbench_funcs_gen.h: $(pl-lib-srcs)
|
||||
# Replace PL_SIG macros with mathbench func entries
|
||||
cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=_Z##v##t##a(f, ##__VA_ARGS__)" -P > $@
|
||||
|
||||
build/pl/include/test/ulp_wrappers_gen.h: $(math-lib-srcs)
|
||||
build/pl/include/test/ulp_wrappers_gen.h: $(pl-lib-srcs)
|
||||
# Replace PL_SIG macros with ULP wrapper declarations
|
||||
cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=Z##v##N##t##a##_WRAP(f)" -P > $@
|
||||
|
||||
@ -72,16 +79,18 @@ $(B)/test/ulp.o: CFLAGS_PL += -I build/pl/include/test
|
||||
$(B)/test/mathbench.o: build/pl/include/test/mathbench_funcs_gen.h
|
||||
$(B)/test/mathbench.o: CFLAGS_PL += -I build/pl/include/test
|
||||
|
||||
build/pl/lib/libmathlib.so: $(math-lib-objs:%.o=%.os)
|
||||
build/pl/lib/libmathlib.so: $(pl-lib-objs:%.o=%.os)
|
||||
$(CC) $(CFLAGS_PL) $(LDFLAGS) -shared -o $@ $^
|
||||
|
||||
build/pl/lib/libmathlib.a: $(math-lib-objs)
|
||||
build/pl/lib/libmathlib.a: $(pl-lib-objs)
|
||||
rm -f $@
|
||||
$(AR) rc $@ $^
|
||||
$(RANLIB) $@
|
||||
|
||||
$(math-host-tools): HOST_LDLIBS += -lm -lmpfr -lmpc
|
||||
$(math-tools): LDLIBS += $(math-ldlibs) -lm
|
||||
# math-sve-cflags should be empty if WANT_SVE_MATH is not enabled
|
||||
$(math-tools): CFLAGS_PL += $(math-sve-cflags)
|
||||
|
||||
# Some targets to build pl/math/test from math/test sources
|
||||
build/pl/math/test/%.o: $(srcdir)/math/test/%.S
|
||||
@ -145,12 +154,11 @@ check-pl/math-rtest: $(math-host-tools) $(math-tools)
|
||||
|
||||
ulp-input-dir=$(B)/test/inputs
|
||||
|
||||
math-lib-lims = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.ulp,$(basename $(math-lib-srcs)))
|
||||
math-lib-aliases = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.alias,$(basename $(math-lib-srcs)))
|
||||
math-lib-fenvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.fenv,$(basename $(math-lib-srcs)))
|
||||
math-lib-itvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.itv,$(basename $(math-lib-srcs)))
|
||||
math-lib-lims = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.ulp,$(basename $(pl-lib-srcs)))
|
||||
math-lib-fenvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.fenv,$(basename $(pl-lib-srcs)))
|
||||
math-lib-itvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.itv,$(basename $(pl-lib-srcs)))
|
||||
|
||||
ulp-inputs = $(math-lib-lims) $(math-lib-aliases) $(math-lib-fenvs) $(math-lib-itvs)
|
||||
ulp-inputs = $(math-lib-lims) $(math-lib-fenvs) $(math-lib-itvs)
|
||||
|
||||
$(ulp-inputs): CFLAGS_PL += -I$(PLM) -I$(PLM)/include $(math-cflags)
|
||||
|
||||
@ -158,10 +166,6 @@ $(ulp-input-dir)/%.ulp: $(PLM)/%.c
|
||||
mkdir -p $(@D)
|
||||
$(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep -o "PL_TEST_ULP [^ ]* [^ ]*" || true; } > $@
|
||||
|
||||
$(ulp-input-dir)/%.alias: $(PLM)/%.c
|
||||
mkdir -p $(@D)
|
||||
$(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep "PL_TEST_ALIAS" || true; } | sed "s/_x / /g"> $@
|
||||
|
||||
$(ulp-input-dir)/%.fenv: $(PLM)/%.c
|
||||
mkdir -p $(@D)
|
||||
$(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep -o "PL_TEST_EXPECT_FENV_ENABLED [^ ]*" || true; } > $@
|
||||
@ -174,38 +178,21 @@ ulp-lims := $(ulp-input-dir)/limits
|
||||
$(ulp-lims): $(math-lib-lims)
|
||||
cat $^ | sed "s/PL_TEST_ULP //g;s/^ *//g" > $@
|
||||
|
||||
ulp-aliases := $(ulp-input-dir)/aliases
|
||||
$(ulp-aliases): $(math-lib-aliases)
|
||||
cat $^ | sed "s/PL_TEST_ALIAS //g;s/^ *//g" > $@
|
||||
|
||||
fenv-exps := $(ulp-input-dir)/fenv
|
||||
$(fenv-exps): $(math-lib-fenvs)
|
||||
cat $^ | sed "s/PL_TEST_EXPECT_FENV_ENABLED //g;s/^ *//g" > $@
|
||||
|
||||
ulp-itvs-noalias := $(ulp-input-dir)/itvs_noalias
|
||||
$(ulp-itvs-noalias): $(math-lib-itvs)
|
||||
cat $^ > $@
|
||||
|
||||
rename-aliases := $(ulp-input-dir)/rename_alias.sed
|
||||
$(rename-aliases): $(ulp-aliases)
|
||||
# Build sed script for replacing aliases from generated alias file
|
||||
cat $< | awk '{ print "s/ " $$1 " / " $$2 " /g" }' > $@
|
||||
|
||||
ulp-itvs-alias := $(ulp-input-dir)/itvs_alias
|
||||
$(ulp-itvs-alias): $(ulp-itvs-noalias) $(rename-aliases)
|
||||
cat $< | sed -f $(rename-aliases) > $@
|
||||
|
||||
ulp-itvs := $(ulp-input-dir)/intervals
|
||||
$(ulp-itvs): $(ulp-itvs-alias) $(ulp-itvs-noalias)
|
||||
$(ulp-itvs): $(math-lib-itvs)
|
||||
cat $^ | sort -u | sed "s/PL_TEST_INTERVAL //g" > $@
|
||||
|
||||
check-pl/math-ulp: $(math-tools) $(ulp-lims) $(ulp-aliases) $(fenv-exps) $(ulp-itvs)
|
||||
check-pl/math-ulp: $(math-tools) $(ulp-lims) $(fenv-exps) $(ulp-itvs)
|
||||
WANT_SVE_MATH=$(WANT_SVE_MATH) \
|
||||
ULPFLAGS="$(math-ulpflags)" \
|
||||
LIMITS=../../../$(ulp-lims) \
|
||||
ALIASES=../../../$(ulp-aliases) \
|
||||
INTERVALS=../../../$(ulp-itvs) \
|
||||
FENV=../../../$(fenv-exps) \
|
||||
FUNC=$(func) \
|
||||
build/pl/bin/runulp.sh $(EMULATOR)
|
||||
|
||||
check-pl/math: check-pl/math-test check-pl/math-rtest check-pl/math-ulp
|
||||
@ -220,8 +207,8 @@ $(DESTDIR)$(includedir)/pl/%: build/pl/include/%
|
||||
$(INSTALL) -m 644 -D $< $@
|
||||
|
||||
install-pl/math: \
|
||||
$(math-libs:build/pl/lib/%=$(DESTDIR)$(libdir)/pl/%) \
|
||||
$(math-includes:build/pl/include/%=$(DESTDIR)$(includedir)/pl/%)
|
||||
$(pl-libs:build/pl/lib/%=$(DESTDIR)$(libdir)/pl/%) \
|
||||
$(pl-includes:build/pl/include/%=$(DESTDIR)$(includedir)/pl/%)
|
||||
|
||||
clean-pl/math:
|
||||
rm -f $(pl/math-files)
|
||||
|
100
contrib/arm-optimized-routines/pl/math/acos_2u.c
Normal file
100
contrib/arm-optimized-routines/pl/math/acos_2u.c
Normal file
@ -0,0 +1,100 @@
|
||||
/*
|
||||
* Double-precision acos(x) function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "math_config.h"
|
||||
#include "poly_scalar_f64.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
|
||||
#define AbsMask (0x7fffffffffffffff)
|
||||
#define Half (0x3fe0000000000000)
|
||||
#define One (0x3ff0000000000000)
|
||||
#define PiOver2 (0x1.921fb54442d18p+0)
|
||||
#define Pi (0x1.921fb54442d18p+1)
|
||||
#define Small (0x3c90000000000000) /* 2^-53. */
|
||||
#define Small16 (0x3c90)
|
||||
#define QNaN (0x7ff8)
|
||||
|
||||
/* Fast implementation of double-precision acos(x) based on polynomial
|
||||
approximation of double-precision asin(x).
|
||||
|
||||
For x < Small, approximate acos(x) by pi/2 - x. Small = 2^-53 for correct
|
||||
rounding.
|
||||
|
||||
For |x| in [Small, 0.5], use the trigonometric identity
|
||||
|
||||
acos(x) = pi/2 - asin(x)
|
||||
|
||||
and use an order 11 polynomial P such that the final approximation of asin is
|
||||
an odd polynomial: asin(x) ~ x + x^3 * P(x^2).
|
||||
|
||||
The largest observed error in this region is 1.18 ulps,
|
||||
acos(0x1.fbab0a7c460f6p-2) got 0x1.0d54d1985c068p+0
|
||||
want 0x1.0d54d1985c069p+0.
|
||||
|
||||
For |x| in [0.5, 1.0], use the following development of acos(x) near x = 1
|
||||
|
||||
acos(x) ~ pi/2 - 2 * sqrt(z) (1 + z * P(z))
|
||||
|
||||
where z = (1-x)/2, z is near 0 when x approaches 1, and P contributes to the
|
||||
approximation of asin near 0.
|
||||
|
||||
The largest observed error in this region is 1.52 ulps,
|
||||
acos(0x1.23d362722f591p-1) got 0x1.edbbedf8a7d6ep-1
|
||||
want 0x1.edbbedf8a7d6cp-1.
|
||||
|
||||
For x in [-1.0, -0.5], use this other identity to deduce the negative inputs
|
||||
from their absolute value: acos(x) = pi - acos(-x). */
|
||||
double
|
||||
acos (double x)
|
||||
{
|
||||
uint64_t ix = asuint64 (x);
|
||||
uint64_t ia = ix & AbsMask;
|
||||
uint64_t ia16 = ia >> 48;
|
||||
double ax = asdouble (ia);
|
||||
uint64_t sign = ix & ~AbsMask;
|
||||
|
||||
/* Special values and invalid range. */
|
||||
if (unlikely (ia16 == QNaN))
|
||||
return x;
|
||||
if (ia > One)
|
||||
return __math_invalid (x);
|
||||
if (ia16 < Small16)
|
||||
return PiOver2 - x;
|
||||
|
||||
/* Evaluate polynomial Q(|x|) = z + z * z2 * P(z2) with
|
||||
z2 = x ^ 2 and z = |x| , if |x| < 0.5
|
||||
z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */
|
||||
double z2 = ax < 0.5 ? x * x : fma (-0.5, ax, 0.5);
|
||||
double z = ax < 0.5 ? ax : sqrt (z2);
|
||||
|
||||
/* Use a single polynomial approximation P for both intervals. */
|
||||
double z4 = z2 * z2;
|
||||
double z8 = z4 * z4;
|
||||
double z16 = z8 * z8;
|
||||
double p = estrin_11_f64 (z2, z4, z8, z16, __asin_poly);
|
||||
|
||||
/* Finalize polynomial: z + z * z2 * P(z2). */
|
||||
p = fma (z * z2, p, z);
|
||||
|
||||
/* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5
|
||||
= pi - 2 Q(|x|), for -1.0 < x <= -0.5
|
||||
= 2 Q(|x|) , for -0.5 < x < 0.0. */
|
||||
if (ax < 0.5)
|
||||
return PiOver2 - asdouble (asuint64 (p) | sign);
|
||||
|
||||
return (x <= -0.5) ? fma (-2.0, p, Pi) : 2.0 * p;
|
||||
}
|
||||
|
||||
PL_SIG (S, D, 1, acos, -1.0, 1.0)
|
||||
PL_TEST_ULP (acos, 1.02)
|
||||
PL_TEST_INTERVAL (acos, 0, Small, 5000)
|
||||
PL_TEST_INTERVAL (acos, Small, 0.5, 50000)
|
||||
PL_TEST_INTERVAL (acos, 0.5, 1.0, 50000)
|
||||
PL_TEST_INTERVAL (acos, 1.0, 0x1p11, 50000)
|
||||
PL_TEST_INTERVAL (acos, 0x1p11, inf, 20000)
|
||||
PL_TEST_INTERVAL (acos, -0, -inf, 20000)
|
99
contrib/arm-optimized-routines/pl/math/acosf_1u4.c
Normal file
99
contrib/arm-optimized-routines/pl/math/acosf_1u4.c
Normal file
@ -0,0 +1,99 @@
|
||||
/*
|
||||
* Single-precision acos(x) function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "poly_scalar_f32.h"
|
||||
#include "math_config.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
|
||||
#define AbsMask (0x7fffffff)
|
||||
#define Half (0x3f000000)
|
||||
#define One (0x3f800000)
|
||||
#define PiOver2f (0x1.921fb6p+0f)
|
||||
#define Pif (0x1.921fb6p+1f)
|
||||
#define Small (0x32800000) /* 2^-26. */
|
||||
#define Small12 (0x328)
|
||||
#define QNaN (0x7fc)
|
||||
|
||||
/* Fast implementation of single-precision acos(x) based on polynomial
|
||||
approximation of single-precision asin(x).
|
||||
|
||||
For x < Small, approximate acos(x) by pi/2 - x. Small = 2^-26 for correct
|
||||
rounding.
|
||||
|
||||
For |x| in [Small, 0.5], use the trigonometric identity
|
||||
|
||||
acos(x) = pi/2 - asin(x)
|
||||
|
||||
and use an order 4 polynomial P such that the final approximation of asin is
|
||||
an odd polynomial: asin(x) ~ x + x^3 * P(x^2).
|
||||
|
||||
The largest observed error in this region is 1.16 ulps,
|
||||
acosf(0x1.ffbeccp-2) got 0x1.0c27f8p+0 want 0x1.0c27f6p+0.
|
||||
|
||||
For |x| in [0.5, 1.0], use the following development of acos(x) near x = 1
|
||||
|
||||
acos(x) ~ pi/2 - 2 * sqrt(z) (1 + z * P(z))
|
||||
|
||||
where z = (1-x)/2, z is near 0 when x approaches 1, and P contributes to the
|
||||
approximation of asin near 0.
|
||||
|
||||
The largest observed error in this region is 1.32 ulps,
|
||||
acosf(0x1.15ba56p-1) got 0x1.feb33p-1 want 0x1.feb32ep-1.
|
||||
|
||||
For x in [-1.0, -0.5], use this other identity to deduce the negative inputs
|
||||
from their absolute value.
|
||||
|
||||
acos(x) = pi - acos(-x)
|
||||
|
||||
The largest observed error in this region is 1.28 ulps,
|
||||
acosf(-0x1.002072p-1) got 0x1.0c1e84p+1 want 0x1.0c1e82p+1. */
|
||||
float
|
||||
acosf (float x)
|
||||
{
|
||||
uint32_t ix = asuint (x);
|
||||
uint32_t ia = ix & AbsMask;
|
||||
uint32_t ia12 = ia >> 20;
|
||||
float ax = asfloat (ia);
|
||||
uint32_t sign = ix & ~AbsMask;
|
||||
|
||||
/* Special values and invalid range. */
|
||||
if (unlikely (ia12 == QNaN))
|
||||
return x;
|
||||
if (ia > One)
|
||||
return __math_invalidf (x);
|
||||
if (ia12 < Small12)
|
||||
return PiOver2f - x;
|
||||
|
||||
/* Evaluate polynomial Q(|x|) = z + z * z2 * P(z2) with
|
||||
z2 = x ^ 2 and z = |x| , if |x| < 0.5
|
||||
z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */
|
||||
float z2 = ax < 0.5 ? x * x : fmaf (-0.5f, ax, 0.5f);
|
||||
float z = ax < 0.5 ? ax : sqrtf (z2);
|
||||
|
||||
/* Use a single polynomial approximation P for both intervals. */
|
||||
float p = horner_4_f32 (z2, __asinf_poly);
|
||||
/* Finalize polynomial: z + z * z2 * P(z2). */
|
||||
p = fmaf (z * z2, p, z);
|
||||
|
||||
/* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5
|
||||
= pi - 2 Q(|x|), for -1.0 < x <= -0.5
|
||||
= 2 Q(|x|) , for -0.5 < x < 0.0. */
|
||||
if (ax < 0.5)
|
||||
return PiOver2f - asfloat (asuint (p) | sign);
|
||||
|
||||
return (x <= -0.5) ? fmaf (-2.0f, p, Pif) : 2.0f * p;
|
||||
}
|
||||
|
||||
PL_SIG (S, F, 1, acos, -1.0, 1.0)
|
||||
PL_TEST_ULP (acosf, 0.82)
|
||||
PL_TEST_INTERVAL (acosf, 0, Small, 5000)
|
||||
PL_TEST_INTERVAL (acosf, Small, 0.5, 50000)
|
||||
PL_TEST_INTERVAL (acosf, 0.5, 1.0, 50000)
|
||||
PL_TEST_INTERVAL (acosf, 1.0, 0x1p11, 50000)
|
||||
PL_TEST_INTERVAL (acosf, 0x1p11, inf, 20000)
|
||||
PL_TEST_INTERVAL (acosf, -0, -inf, 20000)
|
106
contrib/arm-optimized-routines/pl/math/asin_3u.c
Normal file
106
contrib/arm-optimized-routines/pl/math/asin_3u.c
Normal file
@ -0,0 +1,106 @@
|
||||
/*
|
||||
* Double-precision asin(x) function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "poly_scalar_f64.h"
|
||||
#include "math_config.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
|
||||
#define AbsMask (0x7fffffffffffffff)
|
||||
#define Half (0x3fe0000000000000)
|
||||
#define One (0x3ff0000000000000)
|
||||
#define PiOver2 (0x1.921fb54442d18p+0)
|
||||
#define Small (0x3e50000000000000) /* 2^-26. */
|
||||
#define Small16 (0x3e50)
|
||||
#define QNaN (0x7ff8)
|
||||
|
||||
/* Fast implementation of double-precision asin(x) based on polynomial
|
||||
approximation.
|
||||
|
||||
For x < Small, approximate asin(x) by x. Small = 2^-26 for correct rounding.
|
||||
|
||||
For x in [Small, 0.5], use an order 11 polynomial P such that the final
|
||||
approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2).
|
||||
|
||||
The largest observed error in this region is 1.01 ulps,
|
||||
asin(0x1.da9735b5a9277p-2) got 0x1.ed78525a927efp-2
|
||||
want 0x1.ed78525a927eep-2.
|
||||
|
||||
No cheap approximation can be obtained near x = 1, since the function is not
|
||||
continuously differentiable on 1.
|
||||
|
||||
For x in [0.5, 1.0], we use a method based on a trigonometric identity
|
||||
|
||||
asin(x) = pi/2 - acos(x)
|
||||
|
||||
and a generalized power series expansion of acos(y) near y=1, that reads as
|
||||
|
||||
acos(y)/sqrt(2y) ~ 1 + 1/12 * y + 3/160 * y^2 + ... (1)
|
||||
|
||||
The Taylor series of asin(z) near z = 0, reads as
|
||||
|
||||
asin(z) ~ z + z^3 P(z^2) = z + z^3 * (1/6 + 3/40 z^2 + ...).
|
||||
|
||||
Therefore, (1) can be written in terms of P(y/2) or even asin(y/2)
|
||||
|
||||
acos(y) ~ sqrt(2y) (1 + y/2 * P(y/2)) = 2 * sqrt(y/2) (1 + y/2 * P(y/2)
|
||||
|
||||
Hence, if we write z = (1-x)/2, z is near 0 when x approaches 1 and
|
||||
|
||||
asin(x) ~ pi/2 - acos(x) ~ pi/2 - 2 * sqrt(z) (1 + z * P(z)).
|
||||
|
||||
The largest observed error in this region is 2.69 ulps,
|
||||
asin(0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1
|
||||
want 0x1.110d7e85fdd53p-1. */
|
||||
double
|
||||
asin (double x)
|
||||
{
|
||||
uint64_t ix = asuint64 (x);
|
||||
uint64_t ia = ix & AbsMask;
|
||||
uint64_t ia16 = ia >> 48;
|
||||
double ax = asdouble (ia);
|
||||
uint64_t sign = ix & ~AbsMask;
|
||||
|
||||
/* Special values and invalid range. */
|
||||
if (unlikely (ia16 == QNaN))
|
||||
return x;
|
||||
if (ia > One)
|
||||
return __math_invalid (x);
|
||||
if (ia16 < Small16)
|
||||
return x;
|
||||
|
||||
/* Evaluate polynomial Q(x) = y + y * z * P(z) with
|
||||
z2 = x ^ 2 and z = |x| , if |x| < 0.5
|
||||
z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */
|
||||
double z2 = ax < 0.5 ? x * x : fma (-0.5, ax, 0.5);
|
||||
double z = ax < 0.5 ? ax : sqrt (z2);
|
||||
|
||||
/* Use a single polynomial approximation P for both intervals. */
|
||||
double z4 = z2 * z2;
|
||||
double z8 = z4 * z4;
|
||||
double z16 = z8 * z8;
|
||||
double p = estrin_11_f64 (z2, z4, z8, z16, __asin_poly);
|
||||
|
||||
/* Finalize polynomial: z + z * z2 * P(z2). */
|
||||
p = fma (z * z2, p, z);
|
||||
|
||||
/* asin(|x|) = Q(|x|) , for |x| < 0.5
|
||||
= pi/2 - 2 Q(|x|), for |x| >= 0.5. */
|
||||
double y = ax < 0.5 ? p : fma (-2.0, p, PiOver2);
|
||||
|
||||
/* Copy sign. */
|
||||
return asdouble (asuint64 (y) | sign);
|
||||
}
|
||||
|
||||
PL_SIG (S, D, 1, asin, -1.0, 1.0)
|
||||
PL_TEST_ULP (asin, 2.19)
|
||||
PL_TEST_INTERVAL (asin, 0, Small, 5000)
|
||||
PL_TEST_INTERVAL (asin, Small, 0.5, 50000)
|
||||
PL_TEST_INTERVAL (asin, 0.5, 1.0, 50000)
|
||||
PL_TEST_INTERVAL (asin, 1.0, 0x1p11, 50000)
|
||||
PL_TEST_INTERVAL (asin, 0x1p11, inf, 20000)
|
||||
PL_TEST_INTERVAL (asin, -0, -inf, 20000)
|
19
contrib/arm-optimized-routines/pl/math/asin_data.c
Normal file
19
contrib/arm-optimized-routines/pl/math/asin_data.c
Normal file
@ -0,0 +1,19 @@
|
||||
/*
|
||||
* Coefficients for single-precision asin(x) function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "math_config.h"
|
||||
|
||||
/* Approximate asin(x) directly in [0x1p-106, 0.25]. See tools/asin.sollya
|
||||
for these coeffcients were generated. */
|
||||
const double __asin_poly[] = {
|
||||
/* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x))
|
||||
on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */
|
||||
0x1.555555555554ep-3, 0x1.3333333337233p-4, 0x1.6db6db67f6d9fp-5,
|
||||
0x1.f1c71fbd29fbbp-6, 0x1.6e8b264d467d6p-6, 0x1.1c5997c357e9dp-6,
|
||||
0x1.c86a22cd9389dp-7, 0x1.856073c22ebbep-7, 0x1.fd1151acb6bedp-8,
|
||||
0x1.087182f799c1dp-6, -0x1.6602748120927p-7, 0x1.cfa0dd1f9478p-6,
|
||||
};
|
100
contrib/arm-optimized-routines/pl/math/asinf_2u5.c
Normal file
100
contrib/arm-optimized-routines/pl/math/asinf_2u5.c
Normal file
@ -0,0 +1,100 @@
|
||||
/*
|
||||
* Single-precision asin(x) function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "poly_scalar_f32.h"
|
||||
#include "math_config.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
|
||||
#define AbsMask (0x7fffffff)
|
||||
#define Half (0x3f000000)
|
||||
#define One (0x3f800000)
|
||||
#define PiOver2f (0x1.921fb6p+0f)
|
||||
#define Small (0x39800000) /* 2^-12. */
|
||||
#define Small12 (0x398)
|
||||
#define QNaN (0x7fc)
|
||||
|
||||
/* Fast implementation of single-precision asin(x) based on polynomial
|
||||
approximation.
|
||||
|
||||
For x < Small, approximate asin(x) by x. Small = 2^-12 for correct rounding.
|
||||
|
||||
For x in [Small, 0.5], use order 4 polynomial P such that the final
|
||||
approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2).
|
||||
|
||||
The largest observed error in this region is 0.83 ulps,
|
||||
asinf(0x1.ea00f4p-2) got 0x1.fef15ep-2 want 0x1.fef15cp-2.
|
||||
|
||||
No cheap approximation can be obtained near x = 1, since the function is not
|
||||
continuously differentiable on 1.
|
||||
|
||||
For x in [0.5, 1.0], we use a method based on a trigonometric identity
|
||||
|
||||
asin(x) = pi/2 - acos(x)
|
||||
|
||||
and a generalized power series expansion of acos(y) near y=1, that reads as
|
||||
|
||||
acos(y)/sqrt(2y) ~ 1 + 1/12 * y + 3/160 * y^2 + ... (1)
|
||||
|
||||
The Taylor series of asin(z) near z = 0, reads as
|
||||
|
||||
asin(z) ~ z + z^3 P(z^2) = z + z^3 * (1/6 + 3/40 z^2 + ...).
|
||||
|
||||
Therefore, (1) can be written in terms of P(y/2) or even asin(y/2)
|
||||
|
||||
acos(y) ~ sqrt(2y) (1 + y/2 * P(y/2)) = 2 * sqrt(y/2) (1 + y/2 * P(y/2)
|
||||
|
||||
Hence, if we write z = (1-x)/2, z is near 0 when x approaches 1 and
|
||||
|
||||
asin(x) ~ pi/2 - acos(x) ~ pi/2 - 2 * sqrt(z) (1 + z * P(z)).
|
||||
|
||||
The largest observed error in this region is 2.41 ulps,
|
||||
asinf(0x1.00203ep-1) got 0x1.0c3a64p-1 want 0x1.0c3a6p-1. */
|
||||
float
|
||||
asinf (float x)
|
||||
{
|
||||
uint32_t ix = asuint (x);
|
||||
uint32_t ia = ix & AbsMask;
|
||||
uint32_t ia12 = ia >> 20;
|
||||
float ax = asfloat (ia);
|
||||
uint32_t sign = ix & ~AbsMask;
|
||||
|
||||
/* Special values and invalid range. */
|
||||
if (unlikely (ia12 == QNaN))
|
||||
return x;
|
||||
if (ia > One)
|
||||
return __math_invalidf (x);
|
||||
if (ia12 < Small12)
|
||||
return x;
|
||||
|
||||
/* Evaluate polynomial Q(x) = y + y * z * P(z) with
|
||||
z2 = x ^ 2 and z = |x| , if |x| < 0.5
|
||||
z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */
|
||||
float z2 = ax < 0.5 ? x * x : fmaf (-0.5f, ax, 0.5f);
|
||||
float z = ax < 0.5 ? ax : sqrtf (z2);
|
||||
|
||||
/* Use a single polynomial approximation P for both intervals. */
|
||||
float p = horner_4_f32 (z2, __asinf_poly);
|
||||
/* Finalize polynomial: z + z * z2 * P(z2). */
|
||||
p = fmaf (z * z2, p, z);
|
||||
|
||||
/* asin(|x|) = Q(|x|) , for |x| < 0.5
|
||||
= pi/2 - 2 Q(|x|), for |x| >= 0.5. */
|
||||
float y = ax < 0.5 ? p : fmaf (-2.0f, p, PiOver2f);
|
||||
|
||||
/* Copy sign. */
|
||||
return asfloat (asuint (y) | sign);
|
||||
}
|
||||
|
||||
PL_SIG (S, F, 1, asin, -1.0, 1.0)
|
||||
PL_TEST_ULP (asinf, 1.91)
|
||||
PL_TEST_INTERVAL (asinf, 0, Small, 5000)
|
||||
PL_TEST_INTERVAL (asinf, Small, 0.5, 50000)
|
||||
PL_TEST_INTERVAL (asinf, 0.5, 1.0, 50000)
|
||||
PL_TEST_INTERVAL (asinf, 1.0, 0x1p11, 50000)
|
||||
PL_TEST_INTERVAL (asinf, 0x1p11, inf, 20000)
|
||||
PL_TEST_INTERVAL (asinf, -0, -inf, 20000)
|
16
contrib/arm-optimized-routines/pl/math/asinf_data.c
Normal file
16
contrib/arm-optimized-routines/pl/math/asinf_data.c
Normal file
@ -0,0 +1,16 @@
|
||||
/*
|
||||
* Coefficients for single-precision asin(x) function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "math_config.h"
|
||||
|
||||
/* Approximate asinf(x) directly in [0x1p-24, 0.25]. See for tools/asinf.sollya
|
||||
for these coeffs were generated. */
|
||||
const float __asinf_poly[] = {
|
||||
/* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on
|
||||
[ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */
|
||||
0x1.55555ep-3, 0x1.33261ap-4, 0x1.70d7dcp-5, 0x1.b059dp-6, 0x1.3af7d8p-5,
|
||||
};
|
@ -4,7 +4,7 @@
|
||||
* Copyright (c) 2022-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#include "estrin.h"
|
||||
#include "poly_scalar_f64.h"
|
||||
#include "math_config.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
@ -60,8 +60,7 @@ asinh (double x)
|
||||
double z2 = x2 * x2;
|
||||
double z4 = z2 * z2;
|
||||
double z8 = z4 * z4;
|
||||
#define C(i) __asinh_data.poly[i]
|
||||
double p = ESTRIN_17 (x2, z2, z4, z8, z8 * z8, C);
|
||||
double p = estrin_17_f64 (x2, z2, z4, z8, z8 * z8, __asinh_data.poly);
|
||||
double y = fma (p, x2 * ax, ax);
|
||||
return asdouble (asuint64 (y) | sign);
|
||||
}
|
||||
|
@ -5,7 +5,7 @@
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "estrinf.h"
|
||||
#include "poly_scalar_f32.h"
|
||||
#include "math_config.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
@ -16,8 +16,6 @@
|
||||
#define One (0x3f8)
|
||||
#define ExpM12 (0x398)
|
||||
|
||||
#define C(i) __asinhf_data.coeffs[i]
|
||||
|
||||
float
|
||||
optr_aor_log_f32 (float);
|
||||
|
||||
@ -57,7 +55,7 @@ asinhf (float x)
|
||||
if (ia12 < One)
|
||||
{
|
||||
float x2 = ax * ax;
|
||||
float p = ESTRIN_7 (ax, x2, x2 * x2, C);
|
||||
float p = estrin_7_f32 (ax, x2, x2 * x2, __asinhf_data.coeffs);
|
||||
float y = fmaf (x2, p, ax);
|
||||
return asfloat (asuint (y) | sign);
|
||||
}
|
||||
|
@ -1,49 +1,33 @@
|
||||
/*
|
||||
* Double-precision polynomial evaluation function for scalar and vector atan(x)
|
||||
* and atan2(y,x).
|
||||
* Double-precision polynomial evaluation function for scalar
|
||||
* atan(x) and atan2(y,x).
|
||||
*
|
||||
* Copyright (c) 2021-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "math_config.h"
|
||||
#include "estrin.h"
|
||||
|
||||
#if V_SUPPORTED
|
||||
|
||||
#include "v_math.h"
|
||||
|
||||
#define DBL_T v_f64_t
|
||||
#define P(i) v_f64 (__atan_poly_data.poly[i])
|
||||
|
||||
#else
|
||||
|
||||
#define DBL_T double
|
||||
#define P(i) __atan_poly_data.poly[i]
|
||||
|
||||
#endif
|
||||
#include "poly_scalar_f64.h"
|
||||
|
||||
/* Polynomial used in fast atan(x) and atan2(y,x) implementations
|
||||
The order 19 polynomial P approximates (atan(sqrt(x))-sqrt(x))/x^(3/2). */
|
||||
static inline DBL_T
|
||||
eval_poly (DBL_T z, DBL_T az, DBL_T shift)
|
||||
static inline double
|
||||
eval_poly (double z, double az, double shift)
|
||||
{
|
||||
/* Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of
|
||||
full scheme to avoid underflow in x^16. */
|
||||
DBL_T z2 = z * z;
|
||||
DBL_T x2 = z2 * z2;
|
||||
DBL_T x4 = x2 * x2;
|
||||
DBL_T x8 = x4 * x4;
|
||||
DBL_T y
|
||||
= FMA (ESTRIN_11_ (z2, x2, x4, x8, P, 8), x8, ESTRIN_7 (z2, x2, x4, P));
|
||||
double z2 = z * z;
|
||||
double x2 = z2 * z2;
|
||||
double x4 = x2 * x2;
|
||||
double x8 = x4 * x4;
|
||||
double y = fma (estrin_11_f64 (z2, x2, x4, x8, __atan_poly_data.poly + 8),
|
||||
x8, estrin_7_f64 (z2, x2, x4, __atan_poly_data.poly));
|
||||
|
||||
/* Finalize. y = shift + z + z^3 * P(z^2). */
|
||||
y = FMA (y, z2 * az, az);
|
||||
y = fma (y, z2 * az, az);
|
||||
y = y + shift;
|
||||
|
||||
return y;
|
||||
}
|
||||
|
||||
#undef DBL_T
|
||||
#undef FMA
|
||||
#undef P
|
||||
|
@ -66,11 +66,7 @@ atanf (float x)
|
||||
|
||||
PL_SIG (S, F, 1, atan, -10.0, 10.0)
|
||||
PL_TEST_ULP (atanf, 2.38)
|
||||
PL_TEST_INTERVAL (atanf, 0, 0x1p-30, 5000)
|
||||
PL_TEST_INTERVAL (atanf, -0, -0x1p-30, 5000)
|
||||
PL_TEST_INTERVAL (atanf, 0x1p-30, 1, 40000)
|
||||
PL_TEST_INTERVAL (atanf, -0x1p-30, -1, 40000)
|
||||
PL_TEST_INTERVAL (atanf, 1, 0x1p30, 40000)
|
||||
PL_TEST_INTERVAL (atanf, -1, -0x1p30, 40000)
|
||||
PL_TEST_INTERVAL (atanf, 0x1p30, inf, 1000)
|
||||
PL_TEST_INTERVAL (atanf, -0x1p30, -inf, 1000)
|
||||
PL_TEST_SYM_INTERVAL (atanf, 0, 0x1p-30, 5000)
|
||||
PL_TEST_SYM_INTERVAL (atanf, 0x1p-30, 1, 40000)
|
||||
PL_TEST_SYM_INTERVAL (atanf, 1, 0x1p30, 40000)
|
||||
PL_TEST_SYM_INTERVAL (atanf, 0x1p30, inf, 1000)
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Single-precision polynomial evaluation function for scalar and vector
|
||||
* Single-precision polynomial evaluation function for scalar
|
||||
* atan(x) and atan2(y,x).
|
||||
*
|
||||
* Copyright (c) 2021-2023, Arm Limited.
|
||||
@ -10,26 +10,12 @@
|
||||
#define PL_MATH_ATANF_COMMON_H
|
||||
|
||||
#include "math_config.h"
|
||||
#include "estrinf.h"
|
||||
|
||||
#if V_SUPPORTED
|
||||
|
||||
#include "v_math.h"
|
||||
|
||||
#define FLT_T v_f32_t
|
||||
#define P(i) v_f32 (__atanf_poly_data.poly[i])
|
||||
|
||||
#else
|
||||
|
||||
#define FLT_T float
|
||||
#define P(i) __atanf_poly_data.poly[i]
|
||||
|
||||
#endif
|
||||
#include "poly_scalar_f32.h"
|
||||
|
||||
/* Polynomial used in fast atanf(x) and atan2f(y,x) implementations
|
||||
The order 7 polynomial P approximates (atan(sqrt(x))-sqrt(x))/x^(3/2). */
|
||||
static inline FLT_T
|
||||
eval_poly (FLT_T z, FLT_T az, FLT_T shift)
|
||||
static inline float
|
||||
eval_poly (float z, float az, float shift)
|
||||
{
|
||||
/* Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However,
|
||||
a standard implementation using z8 creates spurious underflow
|
||||
@ -37,15 +23,16 @@ eval_poly (FLT_T z, FLT_T az, FLT_T shift)
|
||||
Therefore, we split the last fma into a mul and and an fma.
|
||||
Horner and single-level Estrin have higher errors that exceed
|
||||
threshold. */
|
||||
FLT_T z2 = z * z;
|
||||
FLT_T z4 = z2 * z2;
|
||||
float z2 = z * z;
|
||||
float z4 = z2 * z2;
|
||||
|
||||
/* Then assemble polynomial. */
|
||||
FLT_T y = FMA (z4, z4 * ESTRIN_3_ (z2, z4, P, 4), ESTRIN_3 (z2, z4, P));
|
||||
|
||||
float y = fmaf (
|
||||
z4, z4 * pairwise_poly_3_f32 (z2, z4, __atanf_poly_data.poly + 4),
|
||||
pairwise_poly_3_f32 (z2, z4, __atanf_poly_data.poly));
|
||||
/* Finalize:
|
||||
y = shift + z * P(z^2). */
|
||||
return FMA (y, z2 * az, az) + shift;
|
||||
return fmaf (y, z2 * az, az) + shift;
|
||||
}
|
||||
|
||||
#endif // PL_MATH_ATANF_COMMON_H
|
||||
|
@ -6,7 +6,7 @@
|
||||
*/
|
||||
|
||||
#include "math_config.h"
|
||||
#include "estrin.h"
|
||||
#include "poly_scalar_f64.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
|
||||
@ -20,7 +20,6 @@
|
||||
#define OneTop12 0x3ff
|
||||
#define HfRt2Top 0x3fe6a09e /* top32(asuint64(sqrt(2)/2)). */
|
||||
#define BottomMask 0xffffffff
|
||||
#define C(i) __log1p_data.coeffs[i]
|
||||
|
||||
static inline double
|
||||
log1p_inline (double x)
|
||||
@ -46,7 +45,8 @@ log1p_inline (double x)
|
||||
double f2 = f * f;
|
||||
double f4 = f2 * f2;
|
||||
double f8 = f4 * f4;
|
||||
double p = fma (f, ESTRIN_18 (f, f2, f4, f8, f8 * f8, C) * f, f);
|
||||
double p = fma (
|
||||
f, estrin_18_f64 (f, f2, f4, f8, f8 * f8, __log1p_data.coeffs) * f, f);
|
||||
|
||||
/* Recombine log1p(x) = k*log2 + log1p(f) + c/m. */
|
||||
double kd = k;
|
||||
@ -78,9 +78,6 @@ atanh (double x)
|
||||
|
||||
PL_SIG (S, D, 1, atanh, -1.0, 1.0)
|
||||
PL_TEST_ULP (atanh, 3.00)
|
||||
PL_TEST_INTERVAL (atanh, 0, 0x1p-23, 10000)
|
||||
PL_TEST_INTERVAL (atanh, -0, -0x1p-23, 10000)
|
||||
PL_TEST_INTERVAL (atanh, 0x1p-23, 1, 90000)
|
||||
PL_TEST_INTERVAL (atanh, -0x1p-23, -1, 90000)
|
||||
PL_TEST_INTERVAL (atanh, 1, inf, 100)
|
||||
PL_TEST_INTERVAL (atanh, -1, -inf, 100)
|
||||
PL_TEST_SYM_INTERVAL (atanh, 0, 0x1p-23, 10000)
|
||||
PL_TEST_SYM_INTERVAL (atanh, 0x1p-23, 1, 90000)
|
||||
PL_TEST_SYM_INTERVAL (atanh, 1, inf, 100)
|
||||
|
@ -15,7 +15,8 @@
|
||||
#define One 0x3f800000
|
||||
#define Four 0x40800000
|
||||
#define Ln2 0x1.62e43p-1f
|
||||
#define TinyBound 0x39800000 /* 0x1p-12, below which atanhf(x) rounds to x. */
|
||||
/* asuint(0x1p-12), below which atanhf(x) rounds to x. */
|
||||
#define TinyBound 0x39800000
|
||||
|
||||
#define C(i) __log1pf_data.coeffs[i]
|
||||
|
||||
@ -80,9 +81,6 @@ atanhf (float x)
|
||||
|
||||
PL_SIG (S, F, 1, atanh, -1.0, 1.0)
|
||||
PL_TEST_ULP (atanhf, 2.59)
|
||||
PL_TEST_INTERVAL (atanhf, 0, 0x1p-12, 500)
|
||||
PL_TEST_INTERVAL (atanhf, 0x1p-12, 1, 200000)
|
||||
PL_TEST_INTERVAL (atanhf, 1, inf, 1000)
|
||||
PL_TEST_INTERVAL (atanhf, -0, -0x1p-12, 500)
|
||||
PL_TEST_INTERVAL (atanhf, -0x1p-12, -1, 200000)
|
||||
PL_TEST_INTERVAL (atanhf, -1, -inf, 1000)
|
||||
PL_TEST_SYM_INTERVAL (atanhf, 0, 0x1p-12, 500)
|
||||
PL_TEST_SYM_INTERVAL (atanhf, 0x1p-12, 1, 200000)
|
||||
PL_TEST_SYM_INTERVAL (atanhf, 1, inf, 1000)
|
||||
|
@ -31,7 +31,7 @@ cbrt (double x)
|
||||
uint64_t iax = ix & AbsMask;
|
||||
uint64_t sign = ix & ~AbsMask;
|
||||
|
||||
if (unlikely (iax == 0 || iax == 0x7f80000000000000))
|
||||
if (unlikely (iax == 0 || iax == 0x7ff0000000000000))
|
||||
return x;
|
||||
|
||||
/* |x| = m * 2^e, where m is in [0.5, 1.0].
|
||||
@ -66,5 +66,4 @@ cbrt (double x)
|
||||
}
|
||||
|
||||
PL_TEST_ULP (cbrt, 1.30)
|
||||
PL_TEST_INTERVAL (cbrt, 0, inf, 1000000)
|
||||
PL_TEST_INTERVAL (cbrt, -0, -inf, 1000000)
|
||||
PL_TEST_SYM_INTERVAL (cbrt, 0, inf, 1000000)
|
||||
|
@ -5,7 +5,7 @@
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "estrinf.h"
|
||||
#include "poly_scalar_f32.h"
|
||||
#include "math_config.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
@ -14,7 +14,6 @@
|
||||
#define SignMask 0x80000000
|
||||
#define TwoThirds 0x1.555556p-1f
|
||||
|
||||
#define C(i) __cbrtf_data.poly[i]
|
||||
#define T(i) __cbrtf_data.table[i]
|
||||
|
||||
/* Approximation for single-precision cbrt(x), using low-order polynomial and
|
||||
@ -41,7 +40,8 @@ cbrtf (float x)
|
||||
/* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is,
|
||||
the less accurate the next stage of the algorithm needs to be. An order-4
|
||||
polynomial is enough for one Newton iteration. */
|
||||
float p = ESTRIN_3 (m, m * m, C);
|
||||
float p = pairwise_poly_3_f32 (m, m * m, __cbrtf_data.poly);
|
||||
|
||||
/* One iteration of Newton's method for iteratively approximating cbrt. */
|
||||
float m_by_3 = m / 3;
|
||||
float a = fmaf (TwoThirds, p, m_by_3 / (p * p));
|
||||
@ -63,5 +63,4 @@ cbrtf (float x)
|
||||
|
||||
PL_SIG (S, F, 1, cbrt, -10.0, 10.0)
|
||||
PL_TEST_ULP (cbrtf, 1.03)
|
||||
PL_TEST_INTERVAL (cbrtf, 0, inf, 1000000)
|
||||
PL_TEST_INTERVAL (cbrtf, -0, -inf, 1000000)
|
||||
PL_TEST_SYM_INTERVAL (cbrtf, 0, inf, 1000000)
|
||||
|
@ -58,9 +58,6 @@ cosh (double x)
|
||||
|
||||
PL_SIG (S, D, 1, cosh, -10.0, 10.0)
|
||||
PL_TEST_ULP (cosh, 1.43)
|
||||
PL_TEST_INTERVAL (cosh, 0, 0x1.61da04cbafe44p+9, 100000)
|
||||
PL_TEST_INTERVAL (cosh, -0, -0x1.61da04cbafe44p+9, 100000)
|
||||
PL_TEST_INTERVAL (cosh, 0x1.61da04cbafe44p+9, 0x1p10, 1000)
|
||||
PL_TEST_INTERVAL (cosh, -0x1.61da04cbafe44p+9, -0x1p10, 1000)
|
||||
PL_TEST_INTERVAL (cosh, 0x1p10, inf, 100)
|
||||
PL_TEST_INTERVAL (cosh, -0x1p10, -inf, 100)
|
||||
PL_TEST_SYM_INTERVAL (cosh, 0, 0x1.61da04cbafe44p+9, 100000)
|
||||
PL_TEST_SYM_INTERVAL (cosh, 0x1.61da04cbafe44p+9, 0x1p10, 1000)
|
||||
PL_TEST_SYM_INTERVAL (cosh, 0x1p10, inf, 100)
|
||||
|
@ -63,9 +63,6 @@ coshf (float x)
|
||||
|
||||
PL_SIG (S, F, 1, cosh, -10.0, 10.0)
|
||||
PL_TEST_ULP (coshf, 1.89)
|
||||
PL_TEST_INTERVAL (coshf, 0, 0x1p-63, 100)
|
||||
PL_TEST_INTERVAL (coshf, 0, 0x1.5a92d8p+6, 80000)
|
||||
PL_TEST_INTERVAL (coshf, 0x1.5a92d8p+6, inf, 2000)
|
||||
PL_TEST_INTERVAL (coshf, -0, -0x1p-63, 100)
|
||||
PL_TEST_INTERVAL (coshf, -0, -0x1.5a92d8p+6, 80000)
|
||||
PL_TEST_INTERVAL (coshf, -0x1.5a92d8p+6, -inf, 2000)
|
||||
PL_TEST_SYM_INTERVAL (coshf, 0, 0x1p-63, 100)
|
||||
PL_TEST_SYM_INTERVAL (coshf, 0, 0x1.5a92d8p+6, 80000)
|
||||
PL_TEST_SYM_INTERVAL (coshf, 0x1.5a92d8p+6, inf, 2000)
|
||||
|
89
contrib/arm-optimized-routines/pl/math/cospi_3u1.c
Normal file
89
contrib/arm-optimized-routines/pl/math/cospi_3u1.c
Normal file
@ -0,0 +1,89 @@
|
||||
/*
|
||||
* Double-precision scalar cospi function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "math_config.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
#include "poly_scalar_f64.h"
|
||||
|
||||
/* Taylor series coefficients for sin(pi * x).
|
||||
C2 coefficient (originally ~=5.16771278) has been split into two parts:
|
||||
C2_hi = 4, C2_lo = C2 - C2_hi (~=1.16771278)
|
||||
This change in magnitude reduces floating point rounding errors.
|
||||
C2_hi is then reintroduced after the polynomial approximation. */
|
||||
static const double poly[]
|
||||
= { 0x1.921fb54442d184p1, -0x1.2aef39896f94bp0, 0x1.466bc6775ab16p1,
|
||||
-0x1.32d2cce62dc33p-1, 0x1.507834891188ep-4, -0x1.e30750a28c88ep-8,
|
||||
0x1.e8f48308acda4p-12, -0x1.6fc0032b3c29fp-16, 0x1.af86ae521260bp-21,
|
||||
-0x1.012a9870eeb7dp-25 };
|
||||
|
||||
#define Shift 0x1.8p+52
|
||||
|
||||
/* Approximation for scalar double-precision cospi(x).
|
||||
Maximum error: 3.13 ULP:
|
||||
cospi(0x1.160b129300112p-21) got 0x1.fffffffffd16bp-1
|
||||
want 0x1.fffffffffd16ep-1. */
|
||||
double
|
||||
cospi (double x)
|
||||
{
|
||||
if (isinf (x))
|
||||
return __math_invalid (x);
|
||||
|
||||
double ax = asdouble (asuint64 (x) & ~0x8000000000000000);
|
||||
|
||||
/* Edge cases for when cospif should be exactly 1. (Integers)
|
||||
0x1p53 is the limit for single precision to store any decimal places. */
|
||||
if (ax >= 0x1p53)
|
||||
return 1;
|
||||
|
||||
/* If x is an integer, return +- 1, based upon if x is odd. */
|
||||
uint64_t m = (uint64_t) ax;
|
||||
if (m == ax)
|
||||
return (m & 1) ? -1 : 1;
|
||||
|
||||
/* For very small inputs, squaring r causes underflow.
|
||||
Values below this threshold can be approximated via
|
||||
cospi(x) ~= 1. */
|
||||
if (ax < 0x1p-63)
|
||||
return 1;
|
||||
|
||||
/* Any non-integer values >= 0x1x51 will be int +0.5.
|
||||
These values should return exactly 0. */
|
||||
if (ax >= 0x1p51)
|
||||
return 0;
|
||||
|
||||
/* n = rint(|x|). */
|
||||
double n = ax + Shift;
|
||||
uint64_t sign = asuint64 (n) << 63;
|
||||
n = n - Shift;
|
||||
|
||||
/* We know that cospi(x) = sinpi(0.5 - x)
|
||||
range reduction and offset into sinpi range -1/2 .. 1/2
|
||||
r = 0.5 - |x - rint(x)|. */
|
||||
double r = 0.5 - fabs (ax - n);
|
||||
|
||||
/* y = sin(r). */
|
||||
double r2 = r * r;
|
||||
double y = horner_9_f64 (r2, poly);
|
||||
y = y * r;
|
||||
|
||||
/* Reintroduce C2_hi. */
|
||||
y = fma (-4 * r2, r, y);
|
||||
|
||||
/* As all values are reduced to -1/2 .. 1/2, the result of cos(x) always be
|
||||
positive, therefore, the sign must be introduced based upon if x rounds to
|
||||
odd or even. */
|
||||
return asdouble (asuint64 (y) ^ sign);
|
||||
}
|
||||
|
||||
PL_SIG (S, D, 1, cospi, -0.9, 0.9)
|
||||
PL_TEST_ULP (cospi, 2.63)
|
||||
PL_TEST_SYM_INTERVAL (cospi, 0, 0x1p-63, 5000)
|
||||
PL_TEST_SYM_INTERVAL (cospi, 0x1p-63, 0.5, 10000)
|
||||
PL_TEST_SYM_INTERVAL (cospi, 0.5, 0x1p51f, 10000)
|
||||
PL_TEST_SYM_INTERVAL (cospi, 0x1p51f, inf, 10000)
|
84
contrib/arm-optimized-routines/pl/math/cospif_2u6.c
Normal file
84
contrib/arm-optimized-routines/pl/math/cospif_2u6.c
Normal file
@ -0,0 +1,84 @@
|
||||
/*
|
||||
* Single-precision scalar cospi function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "mathlib.h"
|
||||
#include "math_config.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
|
||||
/* Taylor series coefficients for sin(pi * x). */
|
||||
#define C0 0x1.921fb6p1f
|
||||
#define C1 -0x1.4abbcep2f
|
||||
#define C2 0x1.466bc6p1f
|
||||
#define C3 -0x1.32d2ccp-1f
|
||||
#define C4 0x1.50783p-4f
|
||||
#define C5 -0x1.e30750p-8f
|
||||
|
||||
#define Shift 0x1.0p+23f
|
||||
|
||||
/* Approximation for scalar single-precision cospi(x) - cospif.
|
||||
Maximum error: 2.64 ULP:
|
||||
cospif(0x1.37e844p-4) got 0x1.f16b3p-1
|
||||
want 0x1.f16b2ap-1. */
|
||||
float
|
||||
cospif (float x)
|
||||
{
|
||||
if (isinf (x))
|
||||
return __math_invalidf (x);
|
||||
|
||||
float ax = asfloat (asuint (x) & ~0x80000000);
|
||||
|
||||
/* Edge cases for when cospif should be exactly +/- 1. (Integers)
|
||||
0x1p23 is the limit for single precision to store any decimal places. */
|
||||
if (ax >= 0x1p24f)
|
||||
return 1;
|
||||
|
||||
uint32_t m = roundf (ax);
|
||||
if (m == ax)
|
||||
return (m & 1) ? -1 : 1;
|
||||
|
||||
/* Any non-integer values >= 0x1p22f will be int +0.5.
|
||||
These values should return exactly 0. */
|
||||
if (ax >= 0x1p22f)
|
||||
return 0;
|
||||
|
||||
/* For very small inputs, squaring r causes underflow.
|
||||
Values below this threshold can be approximated via cospi(x) ~= 1 -
|
||||
(pi*x). */
|
||||
if (ax < 0x1p-31f)
|
||||
return 1 - (C0 * x);
|
||||
|
||||
/* n = rint(|x|). */
|
||||
float n = ax + Shift;
|
||||
uint32_t sign = asuint (n) << 31;
|
||||
n = n - Shift;
|
||||
|
||||
/* We know that cospi(x) = sinpi(0.5 - x)
|
||||
range reduction and offset into sinpi range -1/2 .. 1/2
|
||||
r = 0.5 - |x - rint(x)|. */
|
||||
float r = 0.5f - fabs (ax - n);
|
||||
|
||||
/* y = sin(pi * r). */
|
||||
float r2 = r * r;
|
||||
float y = fmaf (C5, r2, C4);
|
||||
y = fmaf (y, r2, C3);
|
||||
y = fmaf (y, r2, C2);
|
||||
y = fmaf (y, r2, C1);
|
||||
y = fmaf (y, r2, C0);
|
||||
|
||||
/* As all values are reduced to -1/2 .. 1/2, the result of cos(x) always be
|
||||
positive, therefore, the sign must be introduced based upon if x rounds to
|
||||
odd or even. */
|
||||
return asfloat (asuint (y * r) ^ sign);
|
||||
}
|
||||
|
||||
PL_SIG (S, F, 1, cospi, -0.9, 0.9)
|
||||
PL_TEST_ULP (cospif, 2.15)
|
||||
PL_TEST_SYM_INTERVAL (cospif, 0, 0x1p-31, 5000)
|
||||
PL_TEST_SYM_INTERVAL (cospif, 0x1p-31, 0.5, 10000)
|
||||
PL_TEST_SYM_INTERVAL (cospif, 0.5, 0x1p22f, 10000)
|
||||
PL_TEST_SYM_INTERVAL (cospif, 0x1p22f, inf, 10000)
|
102
contrib/arm-optimized-routines/pl/math/erf_2u5.c
Normal file
102
contrib/arm-optimized-routines/pl/math/erf_2u5.c
Normal file
@ -0,0 +1,102 @@
|
||||
/*
|
||||
* Double-precision erf(x) function.
|
||||
*
|
||||
* Copyright (c) 2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "math_config.h"
|
||||
#include "pl_sig.h"
|
||||
#include "pl_test.h"
|
||||
|
||||
#define TwoOverSqrtPiMinusOne 0x1.06eba8214db69p-3
|
||||
#define Shift 0x1p45
|
||||
|
||||
/* Polynomial coefficients. */
|
||||
#define OneThird 0x1.5555555555555p-2
|
||||
#define TwoThird 0x1.5555555555555p-1
|
||||
|
||||
#define TwoOverFifteen 0x1.1111111111111p-3
|
||||
#define TwoOverFive 0x1.999999999999ap-2
|
||||
#define Tenth 0x1.999999999999ap-4
|
||||
|
||||
#define TwoOverNine 0x1.c71c71c71c71cp-3
|
||||
#define TwoOverFortyFive 0x1.6c16c16c16c17p-5
|
||||
#define Sixth 0x1.555555555555p-3
|
||||
|
||||
/* Fast erf approximation based on series expansion near x rounded to
|
||||
nearest multiple of 1/128.
|
||||
Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
|
||||
|
||||
erf(x) ~ erf(r)
|
||||
+ scale * d * [
|
||||
+ 1
|
||||
- r d
|
||||
+ 1/3 (2 r^2 - 1) d^2
|
||||
- 1/6 (r (2 r^2 - 3)) d^3
|
||||
+ 1/30 (4 r^4 - 12 r^2 + 3) d^4
|
||||
- 1/90 (4 r^4 - 20 r^2 + 15) d^5
|
||||
]
|
||||
|
||||
Maximum measured error: 2.29 ULP
|
||||
erf(-0x1.00003c924e5d1p-8) got -0x1.20dd59132ebadp-8
|
||||
want -0x1.20dd59132ebafp-8. */
|
||||
double
|
||||
erf (double x)
|
||||
{
|
||||
/* Get absolute value and sign. */
|
||||
uint64_t ix = asuint64 (x);
|
||||
uint64_t ia = ix & 0x7fffffffffffffff;
|
||||
uint64_t sign = ix & ~0x7fffffffffffffff;
|
||||
|
||||
/* |x| < 0x1p-508. Triggers exceptions. */
|
||||
if (unlikely (ia < 0x2030000000000000))
|
||||
return fma (TwoOverSqrtPiMinusOne, x, x);
|
||||
|
||||
if (ia < 0x4017f80000000000) /* |x| < 6 - 1 / 128 = 5.9921875. */
|
||||
{
|
||||
/* Set r to multiple of 1/128 nearest to |x|. */
|
||||
double a = asdouble (ia);
|
||||
double z = a + Shift;
|
||||
uint64_t i = asuint64 (z) - asuint64 (Shift);
|
||||
double r = z - Shift;
|
||||
/* Lookup erf(r) and scale(r) in table.
|
||||
Set erf(r) to 0 and scale to 2/sqrt(pi) for |x| <= 0x1.cp-9. */
|
||||
double erfr = __erf_data.tab[i].erf;
|
||||
double scale = __erf_data.tab[i].scale;
|
||||
|
||||
/* erf(x) ~ erf(r) + scale * d * poly (d, r). */
|
||||
double d = a - r;
|
||||
double r2 = r * r;
|
||||
double d2 = d * d;
|
||||
|
||||
/* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p5(r) * d^5. */
|
||||
double p1 = -r;
|
||||
double p2 = fma (TwoThird, r2, -OneThird);
|
||||
double p3 = -r * fma (OneThird, r2, -0.5);
|
||||
double p4 = fma (fma (TwoOverFifteen, r2, -TwoOverFive), r2, Tenth);
|
||||
double p5
|
||||
= -r * fma (fma (TwoOverFortyFive, r2, -TwoOverNine), r2, Sixth);
|
||||
|
||||
double p34 = fma (p4, d, p3);
|
||||
double p12 = fma (p2, d, p1);
|
||||
double y = fma (p5, d2, p34);
|
||||
y = fma (y, d2, p12);
|
||||
|
||||
y = fma (fma (y, d2, d), scale, erfr);
|
||||
return asdouble (asuint64 (y) | sign);
|
||||
}
|
||||
|
||||
/* Special cases : erf(nan)=nan, erf(+inf)=+1 and erf(-inf)=-1. */
|
||||
if (unlikely (ia >= 0x7ff0000000000000))
|
||||
return (1.0 - (double) (sign >> 62)) + 1.0 / x;
|
||||
|
||||
/* Boring domain (|x| >= 6.0). */
|
||||
return asdouble (sign | asuint64 (1.0));
|
||||
}
|
||||
|
||||
PL_SIG (S, D, 1, erf, -6.0, 6.0)
|
||||
PL_TEST_ULP (erf, 1.79)
|
||||
PL_TEST_SYM_INTERVAL (erf, 0, 5.9921875, 40000)
|
||||
PL_TEST_SYM_INTERVAL (erf, 5.9921875, inf, 40000)
|
||||
PL_TEST_SYM_INTERVAL (erf, 0, inf, 40000)
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user