x86-64: Add vector atanh/atanhf implementation to libmvec

Implement vectorized atanh/atanhf containing SSE, AVX, AVX2 and
AVX512 versions for libmvec as per vector ABI.  It also contains
accuracy and ABI tests for vector atanh/atanhf with regenerated ulps.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
This commit is contained in:
Sunil K Pandey 2021-12-29 09:42:04 -08:00
parent 74265c16ab
commit 6dea4dd3da
50 changed files with 5060 additions and 1 deletions

View file

@ -252,4 +252,15 @@
#define __DECL_SIMD_log1pf32x
#define __DECL_SIMD_log1pf64x
#define __DECL_SIMD_log1pf128x
#define __DECL_SIMD_atanh
#define __DECL_SIMD_atanhf
#define __DECL_SIMD_atanhl
#define __DECL_SIMD_atanhf16
#define __DECL_SIMD_atanhf32
#define __DECL_SIMD_atanhf64
#define __DECL_SIMD_atanhf128
#define __DECL_SIMD_atanhf32x
#define __DECL_SIMD_atanhf64x
#define __DECL_SIMD_atanhf128x
#endif

View file

@ -86,7 +86,7 @@ __MATHCALL (acosh,, (_Mdouble_ __x));
/* Hyperbolic arc sine of X. */
__MATHCALL (asinh,, (_Mdouble_ __x));
/* Hyperbolic arc tangent of X. */
__MATHCALL (atanh,, (_Mdouble_ __x));
__MATHCALL_VEC (atanh,, (_Mdouble_ __x));
#endif
/* Exponential and logarithmic functions. */

View file

@ -49,6 +49,7 @@ GLIBC_2.22 _ZGVeN8vvv_sincos F
GLIBC_2.35 _ZGVbN2v_acos F
GLIBC_2.35 _ZGVbN2v_asin F
GLIBC_2.35 _ZGVbN2v_atan F
GLIBC_2.35 _ZGVbN2v_atanh F
GLIBC_2.35 _ZGVbN2v_cbrt F
GLIBC_2.35 _ZGVbN2v_cosh F
GLIBC_2.35 _ZGVbN2v_exp10 F
@ -63,6 +64,7 @@ GLIBC_2.35 _ZGVbN2vv_hypot F
GLIBC_2.35 _ZGVbN4v_acosf F
GLIBC_2.35 _ZGVbN4v_asinf F
GLIBC_2.35 _ZGVbN4v_atanf F
GLIBC_2.35 _ZGVbN4v_atanhf F
GLIBC_2.35 _ZGVbN4v_cbrtf F
GLIBC_2.35 _ZGVbN4v_coshf F
GLIBC_2.35 _ZGVbN4v_exp10f F
@ -77,6 +79,7 @@ GLIBC_2.35 _ZGVbN4vv_hypotf F
GLIBC_2.35 _ZGVcN4v_acos F
GLIBC_2.35 _ZGVcN4v_asin F
GLIBC_2.35 _ZGVcN4v_atan F
GLIBC_2.35 _ZGVcN4v_atanh F
GLIBC_2.35 _ZGVcN4v_cbrt F
GLIBC_2.35 _ZGVcN4v_cosh F
GLIBC_2.35 _ZGVcN4v_exp10 F
@ -91,6 +94,7 @@ GLIBC_2.35 _ZGVcN4vv_hypot F
GLIBC_2.35 _ZGVcN8v_acosf F
GLIBC_2.35 _ZGVcN8v_asinf F
GLIBC_2.35 _ZGVcN8v_atanf F
GLIBC_2.35 _ZGVcN8v_atanhf F
GLIBC_2.35 _ZGVcN8v_cbrtf F
GLIBC_2.35 _ZGVcN8v_coshf F
GLIBC_2.35 _ZGVcN8v_exp10f F
@ -105,6 +109,7 @@ GLIBC_2.35 _ZGVcN8vv_hypotf F
GLIBC_2.35 _ZGVdN4v_acos F
GLIBC_2.35 _ZGVdN4v_asin F
GLIBC_2.35 _ZGVdN4v_atan F
GLIBC_2.35 _ZGVdN4v_atanh F
GLIBC_2.35 _ZGVdN4v_cbrt F
GLIBC_2.35 _ZGVdN4v_cosh F
GLIBC_2.35 _ZGVdN4v_exp10 F
@ -119,6 +124,7 @@ GLIBC_2.35 _ZGVdN4vv_hypot F
GLIBC_2.35 _ZGVdN8v_acosf F
GLIBC_2.35 _ZGVdN8v_asinf F
GLIBC_2.35 _ZGVdN8v_atanf F
GLIBC_2.35 _ZGVdN8v_atanhf F
GLIBC_2.35 _ZGVdN8v_cbrtf F
GLIBC_2.35 _ZGVdN8v_coshf F
GLIBC_2.35 _ZGVdN8v_exp10f F
@ -133,6 +139,7 @@ GLIBC_2.35 _ZGVdN8vv_hypotf F
GLIBC_2.35 _ZGVeN16v_acosf F
GLIBC_2.35 _ZGVeN16v_asinf F
GLIBC_2.35 _ZGVeN16v_atanf F
GLIBC_2.35 _ZGVeN16v_atanhf F
GLIBC_2.35 _ZGVeN16v_cbrtf F
GLIBC_2.35 _ZGVeN16v_coshf F
GLIBC_2.35 _ZGVeN16v_exp10f F
@ -147,6 +154,7 @@ GLIBC_2.35 _ZGVeN16vv_hypotf F
GLIBC_2.35 _ZGVeN8v_acos F
GLIBC_2.35 _ZGVeN8v_asin F
GLIBC_2.35 _ZGVeN8v_atan F
GLIBC_2.35 _ZGVeN8v_atanh F
GLIBC_2.35 _ZGVeN8v_cbrt F
GLIBC_2.35 _ZGVeN8v_cosh F
GLIBC_2.35 _ZGVeN8v_exp10 F

View file

@ -114,6 +114,10 @@
# define __DECL_SIMD_log1p __DECL_SIMD_x86_64
# undef __DECL_SIMD_log1pf
# define __DECL_SIMD_log1pf __DECL_SIMD_x86_64
# undef __DECL_SIMD_atanh
# define __DECL_SIMD_atanh __DECL_SIMD_x86_64
# undef __DECL_SIMD_atanhf
# define __DECL_SIMD_atanhf __DECL_SIMD_x86_64
# endif
#endif

View file

@ -56,6 +56,8 @@
!GCC$ builtin (log2f) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (log1p) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (log1pf) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (atanh) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (atanhf) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (cos) attributes simd (notinbranch) if('x32')
!GCC$ builtin (cosf) attributes simd (notinbranch) if('x32')
@ -97,3 +99,5 @@
!GCC$ builtin (log2f) attributes simd (notinbranch) if('x32')
!GCC$ builtin (log1p) attributes simd (notinbranch) if('x32')
!GCC$ builtin (log1pf) attributes simd (notinbranch) if('x32')
!GCC$ builtin (atanh) attributes simd (notinbranch) if('x32')
!GCC$ builtin (atanhf) attributes simd (notinbranch) if('x32')

View file

@ -26,6 +26,7 @@ libmvec-funcs = \
asin \
atan \
atan2 \
atanh \
cbrt \
cos \
cosh \

View file

@ -17,6 +17,7 @@ libmvec {
_ZGVbN2v_acos; _ZGVcN4v_acos; _ZGVdN4v_acos; _ZGVeN8v_acos;
_ZGVbN2v_asin; _ZGVcN4v_asin; _ZGVdN4v_asin; _ZGVeN8v_asin;
_ZGVbN2v_atan; _ZGVcN4v_atan; _ZGVdN4v_atan; _ZGVeN8v_atan;
_ZGVbN2v_atanh; _ZGVcN4v_atanh; _ZGVdN4v_atanh; _ZGVeN8v_atanh;
_ZGVbN2v_cbrt; _ZGVcN4v_cbrt; _ZGVdN4v_cbrt; _ZGVeN8v_cbrt;
_ZGVbN2v_cosh; _ZGVcN4v_cosh; _ZGVdN4v_cosh; _ZGVeN8v_cosh;
_ZGVbN2v_exp10; _ZGVcN4v_exp10; _ZGVdN4v_exp10; _ZGVeN8v_exp10;
@ -31,6 +32,7 @@ libmvec {
_ZGVbN4v_acosf; _ZGVcN8v_acosf; _ZGVdN8v_acosf; _ZGVeN16v_acosf;
_ZGVbN4v_asinf; _ZGVcN8v_asinf; _ZGVdN8v_asinf; _ZGVeN16v_asinf;
_ZGVbN4v_atanf; _ZGVcN8v_atanf; _ZGVdN8v_atanf; _ZGVeN16v_atanf;
_ZGVbN4v_atanhf; _ZGVcN8v_atanhf; _ZGVdN8v_atanhf; _ZGVeN16v_atanhf;
_ZGVbN4v_cbrtf; _ZGVcN8v_cbrtf; _ZGVdN8v_cbrtf; _ZGVeN16v_cbrtf;
_ZGVbN4v_coshf; _ZGVcN8v_coshf; _ZGVdN8v_coshf; _ZGVeN16v_coshf;
_ZGVbN4v_exp10f; _ZGVcN8v_exp10f; _ZGVdN8v_exp10f; _ZGVeN16v_exp10f;

View file

@ -248,6 +248,26 @@ float: 3
float128: 4
ldouble: 5
Function: "atanh_vlen16":
float: 1
Function: "atanh_vlen2":
double: 1
Function: "atanh_vlen4":
double: 1
float: 1
Function: "atanh_vlen4_avx2":
double: 1
Function: "atanh_vlen8":
double: 1
float: 1
Function: "atanh_vlen8_avx2":
float: 1
Function: "cabs":
double: 1
float128: 1

View file

@ -0,0 +1,20 @@
/* SSE2 version of vectorized atanh, vector length is 2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Build the generic implementation under an ISA-suffixed name so the
   ifunc dispatcher can expose it as the SSE2 baseline variant.  */
#define _ZGVbN2v_atanh _ZGVbN2v_atanh_sse2
#include "../svml_d_atanh2_core.S"

View file

@ -0,0 +1,27 @@
/* Multiple versions of vectorized atanh, vector length is 2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* SYMBOL_NAME is consumed by the included header to construct the
   REDIRECT_NAME / IFUNC_SELECTOR machinery for this symbol.  */
#define SYMBOL_NAME _ZGVbN2v_atanh
#include "ifunc-mathvec-sse4_1.h"

/* Install an IFUNC resolver: at load time it selects the SSE4.1 variant
   when the CPU supports it, otherwise the SSE2 baseline.  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());

#ifdef SHARED
/* Hidden alias so intra-library calls bypass the PLT.  */
__hidden_ver1 (_ZGVbN2v_atanh, __GI__ZGVbN2v_atanh, __redirect__ZGVbN2v_atanh)
__attribute__ ((visibility ("hidden")));
#endif

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,20 @@
/* SSE version of vectorized atanh, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Build the generic implementation under an ISA-suffixed name; the ifunc
   dispatcher exposes it as the fallback (SSE wrapper) variant.  */
#define _ZGVdN4v_atanh _ZGVdN4v_atanh_sse_wrapper
#include "../svml_d_atanh4_core.S"

View file

@ -0,0 +1,27 @@
/* Multiple versions of vectorized atanh, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* SYMBOL_NAME is consumed by the included header to construct the
   REDIRECT_NAME / IFUNC_SELECTOR machinery for this symbol.  */
#define SYMBOL_NAME _ZGVdN4v_atanh
#include "ifunc-mathvec-avx2.h"

/* Install an IFUNC resolver: at load time it selects the AVX2 variant
   when the CPU supports it, otherwise the SSE wrapper fallback.  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());

#ifdef SHARED
/* Hidden alias so intra-library calls bypass the PLT.  */
__hidden_ver1 (_ZGVdN4v_atanh, __GI__ZGVdN4v_atanh, __redirect__ZGVdN4v_atanh)
__attribute__ ((visibility ("hidden")));
#endif

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,20 @@
/* AVX2 version of vectorized atanh, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Build the generic implementation under an ISA-suffixed name; the ifunc
   dispatcher exposes it as the fallback (AVX2 wrapper) variant.  */
#define _ZGVeN8v_atanh _ZGVeN8v_atanh_avx2_wrapper
#include "../svml_d_atanh8_core.S"

View file

@ -0,0 +1,27 @@
/* Multiple versions of vectorized atanh, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* SYMBOL_NAME is consumed by the included header to construct the
   REDIRECT_NAME / IFUNC_SELECTOR machinery for this symbol.  */
#define SYMBOL_NAME _ZGVeN8v_atanh
#include "ifunc-mathvec-avx512-skx.h"

/* Install an IFUNC resolver: at load time it selects the AVX-512 (SKX)
   variant when the CPU supports it, otherwise the AVX2 wrapper.  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());

#ifdef SHARED
/* Hidden alias so intra-library calls bypass the PLT.  */
__hidden_ver1 (_ZGVeN8v_atanh, __GI__ZGVeN8v_atanh, __redirect__ZGVeN8v_atanh)
__attribute__ ((visibility ("hidden")));
#endif

View file

@ -0,0 +1,401 @@
/* Function atanh vectorized with AVX-512.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/*
* ALGORITHM DESCRIPTION:
*
* Compute atanh(x) as 0.5 * log((1 + x)/(1 - x))
* using small lookup table that map to AVX-512 permute instructions
*
* Special cases:
*
* atanh(0) = 0
* atanh(+1) = +INF
* atanh(-1) = -INF
* atanh(x) = NaN if |x| > 1, or if x is a NaN or INF
*
*/
/* Offsets for data table __svml_datanh_data_internal_avx512
*/
#define Log_tbl_H 0
#define Log_tbl_L 128
#define One 256
#define AbsMask 320
#define AddB5 384
#define RcpBitMask 448
#define poly_coeff8 512
#define poly_coeff7 576
#define poly_coeff6 640
#define poly_coeff5 704
#define poly_coeff4 768
#define poly_coeff3 832
#define poly_coeff2 896
#define poly_coeff1 960
#define poly_coeff0 1024
#define Half 1088
#define L2H 1152
#define L2L 1216
#include <sysdep.h>
.text
.section .text.evex512,"ax",@progbits
ENTRY(_ZGVeN8v_atanh_skx)
/* AVX-512 atanh: 8 doubles in, 8 doubles out, both in zmm0.
   Fast path computes 0.5 * log((1+|x|)/(1-|x|)) with the sign of x
   reapplied at the end; lanes with |x| >= 1 (incl. NaN/Inf) are flagged
   in k0 and recomputed via the scalar atanh in the special path below.
   NOTE(review): "# LOE" lines are generated live-on-exit register
   bookkeeping comments, not directives.  */
pushq %rbp
cfi_def_cfa_offset(16)
movq %rsp, %rbp
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* 64-byte-align the stack and reserve 192 bytes of spill space used by
   the special-value path (input at 64(%rsp), result at 128(%rsp)).  */
andq $-64, %rsp
subq $192, %rsp
vmovups One+__svml_datanh_data_internal_avx512(%rip), %zmm15
/* round reciprocals to 1+4b mantissas */
vmovups AddB5+__svml_datanh_data_internal_avx512(%rip), %zmm6
vmovups RcpBitMask+__svml_datanh_data_internal_avx512(%rip), %zmm9
/* Keep the original input live in zmm2 for the special path.  */
vmovaps %zmm0, %zmm2
/* zmm13 = |x|.  */
vandpd AbsMask+__svml_datanh_data_internal_avx512(%rip), %zmm2, %zmm13
/* 1+y */
vaddpd {rn-sae}, %zmm15, %zmm13, %zmm0
/* 1-y */
vsubpd {rn-sae}, %zmm13, %zmm15, %zmm4
/* zmm1 = sign bit of x (x XOR |x|).  */
vxorpd %zmm13, %zmm2, %zmm1
/* Yp_high */
vsubpd {rn-sae}, %zmm15, %zmm0, %zmm7
/* -Ym_high */
vsubpd {rn-sae}, %zmm15, %zmm4, %zmm12
/* RcpP ~ 1/Yp */
vrcp14pd %zmm0, %zmm3
/* RcpM ~ 1/Ym */
vrcp14pd %zmm4, %zmm5
/* input outside (-1, 1) ? */
vcmppd $21, {sae}, %zmm15, %zmm13, %k0
vpaddq %zmm6, %zmm3, %zmm11
vpaddq %zmm6, %zmm5, %zmm10
/* Yp_low */
vsubpd {rn-sae}, %zmm7, %zmm13, %zmm8
vandpd %zmm9, %zmm11, %zmm14
vandpd %zmm9, %zmm10, %zmm3
/* Ym_low */
vaddpd {rn-sae}, %zmm12, %zmm13, %zmm12
/* Reduced argument: Rp = (RcpP*Yp - 1)+RcpP*Yp_low */
vfmsub213pd {rn-sae}, %zmm15, %zmm14, %zmm0
/* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low */
vfmsub231pd {rn-sae}, %zmm3, %zmm4, %zmm15
/* exponents */
vgetexppd {sae}, %zmm14, %zmm5
vgetexppd {sae}, %zmm3, %zmm4
/* Table lookups */
vmovups __svml_datanh_data_internal_avx512(%rip), %zmm9
vmovups Log_tbl_H+64+__svml_datanh_data_internal_avx512(%rip), %zmm13
vmovups Log_tbl_L+__svml_datanh_data_internal_avx512(%rip), %zmm7
vfmadd231pd {rn-sae}, %zmm14, %zmm8, %zmm0
vfnmadd231pd {rn-sae}, %zmm3, %zmm12, %zmm15
/* Prepare table index */
vpsrlq $48, %zmm14, %zmm11
vpsrlq $48, %zmm3, %zmm8
vmovups Log_tbl_L+64+__svml_datanh_data_internal_avx512(%rip), %zmm14
/* polynomials */
vmovups poly_coeff8+__svml_datanh_data_internal_avx512(%rip), %zmm3
/* Km-Kp */
vsubpd {rn-sae}, %zmm5, %zmm4, %zmm5
vmovups poly_coeff7+__svml_datanh_data_internal_avx512(%rip), %zmm4
/* Copy the special-lane mask to edx for the scalar fallback loop.  */
kmovw %k0, %edx
vmovaps %zmm11, %zmm10
vmovaps %zmm4, %zmm6
/* Two-source permutes implement the 16-entry log table lookup for the
   Yp (zmm10/zmm11) and Ym (zmm9/zmm7) reductions.  */
vpermi2pd %zmm13, %zmm9, %zmm10
vpermi2pd %zmm14, %zmm7, %zmm11
vpermt2pd %zmm13, %zmm8, %zmm9
vpermt2pd %zmm14, %zmm8, %zmm7
vmovups poly_coeff6+__svml_datanh_data_internal_avx512(%rip), %zmm8
vfmadd231pd {rn-sae}, %zmm0, %zmm3, %zmm6
vfmadd231pd {rn-sae}, %zmm15, %zmm3, %zmm4
vmovups poly_coeff3+__svml_datanh_data_internal_avx512(%rip), %zmm13
vmovups poly_coeff2+__svml_datanh_data_internal_avx512(%rip), %zmm14
vfmadd213pd {rn-sae}, %zmm8, %zmm0, %zmm6
vfmadd213pd {rn-sae}, %zmm8, %zmm15, %zmm4
vmovups poly_coeff0+__svml_datanh_data_internal_avx512(%rip), %zmm8
vsubpd {rn-sae}, %zmm11, %zmm7, %zmm12
/* table values */
vsubpd {rn-sae}, %zmm10, %zmm9, %zmm3
vmovups poly_coeff5+__svml_datanh_data_internal_avx512(%rip), %zmm7
vmovups poly_coeff4+__svml_datanh_data_internal_avx512(%rip), %zmm9
/* K*L2H + Th */
vmovups L2H+__svml_datanh_data_internal_avx512(%rip), %zmm10
/* K*L2L + Tl */
vmovups L2L+__svml_datanh_data_internal_avx512(%rip), %zmm11
/* Interleaved Horner evaluation of the two degree-8 polynomials:
   zmm6 accumulates PolyP(Rp), zmm4 accumulates PolyM(Rm).  */
vfmadd213pd {rn-sae}, %zmm7, %zmm0, %zmm6
vfmadd213pd {rn-sae}, %zmm7, %zmm15, %zmm4
vmovups poly_coeff1+__svml_datanh_data_internal_avx512(%rip), %zmm7
vfmadd231pd {rn-sae}, %zmm5, %zmm10, %zmm3
vfmadd213pd {rn-sae}, %zmm12, %zmm11, %zmm5
vfmadd213pd {rn-sae}, %zmm9, %zmm0, %zmm6
vfmadd213pd {rn-sae}, %zmm9, %zmm15, %zmm4
vfmadd213pd {rn-sae}, %zmm13, %zmm0, %zmm6
vfmadd213pd {rn-sae}, %zmm13, %zmm15, %zmm4
vfmadd213pd {rn-sae}, %zmm14, %zmm0, %zmm6
vfmadd213pd {rn-sae}, %zmm14, %zmm15, %zmm4
vfmadd213pd {rn-sae}, %zmm7, %zmm0, %zmm6
vfmadd213pd {rn-sae}, %zmm7, %zmm15, %zmm4
vfmadd213pd {rn-sae}, %zmm8, %zmm0, %zmm6
vfmadd213pd {rn-sae}, %zmm8, %zmm15, %zmm4
/* (K*L2L + Tl) + Rp*PolyP */
vfmadd213pd {rn-sae}, %zmm5, %zmm0, %zmm6
/* zmm0 = +-0.5 (0.5 with the sign of x OR'ed in; atanh is odd).  */
vorpd Half+__svml_datanh_data_internal_avx512(%rip), %zmm1, %zmm0
/* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM */
vfnmadd213pd {rn-sae}, %zmm6, %zmm15, %zmm4
vaddpd {rn-sae}, %zmm4, %zmm3, %zmm1
/* Final scale by +-0.5.  */
vmulpd {rn-sae}, %zmm0, %zmm1, %zmm0
testl %edx, %edx
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx r12 r13 r14 r15 edx zmm0 zmm2
/* Restore registers
 * and exit the function
 */
L(EXIT):
movq %rbp, %rsp
popq %rbp
cfi_def_cfa(7, 8)
cfi_restore(6)
ret
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* Branch to process
 * special inputs
 */
L(SPECIAL_VALUES_BRANCH):
/* Spill original input and fast-path result so the scalar loop can
   patch only the flagged lanes in place.  */
vmovups %zmm2, 64(%rsp)
vmovups %zmm0, 128(%rsp)
# LOE rbx r12 r13 r14 r15 edx zmm0
xorl %eax, %eax
# LOE rbx r12 r13 r14 r15 eax edx
vzeroupper
movq %r12, 16(%rsp)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
/* r12d = current lane index.  */
movl %eax, %r12d
movq %r13, 8(%rsp)
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
/* r13d = special-lane bit mask.  */
movl %edx, %r13d
movq %r14, (%rsp)
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE rbx r15 r12d r13d
/* Range mask
 * bits check
 */
L(RANGEMASK_CHECK):
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx r15 r12d r13d
/* Special inputs
 * processing loop
 */
L(SPECIAL_VALUES_LOOP):
incl %r12d
/* 8 double lanes per vector.  */
cmpl $8, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx r15 r12d r13d
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
/* Reload the (now fully patched) result vector.  */
vmovups 128(%rsp), %zmm0
/* Go to exit */
jmp L(EXIT)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE rbx r12 r13 r14 r15 zmm0
/* Scalar math function call
 * to process special input
 */
L(SCALAR_MATH_CALL):
movl %r12d, %r14d
/* Load lane r14 of the spilled input, call scalar atanh, store the
   result back into the spilled result vector.  */
movsd 64(%rsp,%r14,8), %xmm0
call atanh@PLT
# LOE rbx r14 r15 r12d r13d xmm0
movsd %xmm0, 128(%rsp,%r14,8)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx r15 r12d r13d
END(_ZGVeN8v_atanh_skx)
.section .rodata, "a"
.align 64
#ifdef __svml_datanh_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
__declspec(align(64)) VUINT32 Log_tbl_H[16][2];
__declspec(align(64)) VUINT32 Log_tbl_L[16][2];
__declspec(align(64)) VUINT32 One[8][2];
__declspec(align(64)) VUINT32 AbsMask[8][2];
__declspec(align(64)) VUINT32 AddB5[8][2];
__declspec(align(64)) VUINT32 RcpBitMask[8][2];
__declspec(align(64)) VUINT32 poly_coeff8[8][2];
__declspec(align(64)) VUINT32 poly_coeff7[8][2];
__declspec(align(64)) VUINT32 poly_coeff6[8][2];
__declspec(align(64)) VUINT32 poly_coeff5[8][2];
__declspec(align(64)) VUINT32 poly_coeff4[8][2];
__declspec(align(64)) VUINT32 poly_coeff3[8][2];
__declspec(align(64)) VUINT32 poly_coeff2[8][2];
__declspec(align(64)) VUINT32 poly_coeff1[8][2];
__declspec(align(64)) VUINT32 poly_coeff0[8][2];
__declspec(align(64)) VUINT32 Half[8][2];
__declspec(align(64)) VUINT32 L2H[8][2];
__declspec(align(64)) VUINT32 L2L[8][2];
} __svml_datanh_data_internal_avx512;
#endif
__svml_datanh_data_internal_avx512:
/*== Log_tbl_H ==*/
.quad 0x0000000000000000
.quad 0x3faf0a30c0100000
.quad 0x3fbe27076e2a0000
.quad 0x3fc5ff3070a80000
.quad 0x3fcc8ff7c79b0000
.quad 0x3fd1675cabab8000
.quad 0x3fd4618bc21c8000
.quad 0x3fd739d7f6bc0000
.quad 0x3fd9f323ecbf8000
.quad 0x3fdc8ff7c79a8000
.quad 0x3fdf128f5faf0000
.quad 0x3fe0be72e4254000
.quad 0x3fe1e85f5e704000
.quad 0x3fe307d7334f0000
.quad 0x3fe41d8fe8468000
.quad 0x3fe52a2d265bc000
/*== Log_tbl_L ==*/
.align 64
.quad 0x0000000000000000
.quad 0x3d662a6617cc9717
.quad 0x3d6e5cbd3d50fffc
.quad 0xbd6b0b0de3077d7e
.quad 0xbd697794f689f843
.quad 0x3d630701ce63eab9
.quad 0xbd609ec17a426426
.quad 0xbd67fcb18ed9d603
.quad 0x3d584bf2b68d766f
.quad 0x3d5a21ac25d81ef3
.quad 0x3d3bb2cd720ec44c
.quad 0xbd657d49676844cc
.quad 0x3d1a07bd8b34be7c
.quad 0x3d60be1fb590a1f5
.quad 0xbd5aa33736867a17
.quad 0x3d46abb9df22bc57
/*== One ==*/
.align 64
.quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
/*== AbsMask ==*/
.align 64
.quad 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff
/*== AddB5 ==*/
.align 64
.quad 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000
/*== RcpBitMask ==*/
.align 64
.quad 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000
/*== poly_coeff8 ==*/
.align 64
.quad 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142
/*== poly_coeff7 ==*/
.align 64
.quad 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70
/*== poly_coeff6 ==*/
.align 64
.quad 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8
/*== poly_coeff5 ==*/
.align 64
.quad 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5
/*== poly_coeff4 ==*/
.align 64
.quad 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a
/*== poly_coeff3 ==*/
.align 64
.quad 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01
/*== poly_coeff2 ==*/
.align 64
.quad 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462
/*== poly_coeff1 ==*/
.align 64
.quad 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5
/*== poly_coeff0 ==*/
.align 64
.quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
/*== Half ==*/
.align 64
.quad 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000
/*== L2H = log(2)_high ==*/
.align 64
.quad 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000
/*== L2L = log(2)_low ==*/
.align 64
.quad 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000
.align 64
.type __svml_datanh_data_internal_avx512,@object
.size __svml_datanh_data_internal_avx512,.-__svml_datanh_data_internal_avx512

View file

@ -0,0 +1,20 @@
/* AVX2 version of vectorized atanhf, vector length is 16.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Build the generic implementation under an ISA-suffixed name; the ifunc
   dispatcher exposes it as the fallback (AVX2 wrapper) variant.  */
#define _ZGVeN16v_atanhf _ZGVeN16v_atanhf_avx2_wrapper
#include "../svml_s_atanhf16_core.S"

View file

@ -0,0 +1,28 @@
/* Multiple versions of vectorized atanhf, vector length is 16.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* SYMBOL_NAME is consumed by the included header to construct the
   REDIRECT_NAME / IFUNC_SELECTOR machinery for this symbol.  */
#define SYMBOL_NAME _ZGVeN16v_atanhf
#include "ifunc-mathvec-avx512-skx.h"

/* Install an IFUNC resolver: at load time it selects the AVX-512 (SKX)
   variant when the CPU supports it, otherwise the AVX2 wrapper.  */
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());

#ifdef SHARED
/* Hidden alias so intra-library calls bypass the PLT.  */
__hidden_ver1 (_ZGVeN16v_atanhf, __GI__ZGVeN16v_atanhf,
__redirect__ZGVeN16v_atanhf)
__attribute__ ((visibility ("hidden")));
#endif

View file

@ -0,0 +1,393 @@
/* Function atanhf vectorized with AVX-512.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/*
* ALGORITHM DESCRIPTION:
*
* Compute atanh(x) as 0.5 * log((1 + x)/(1 - x))
* using small lookup table that map to AVX-512 permute instructions
*
* Special cases:
*
* atanh(0) = 0
* atanh(+1) = +INF
* atanh(-1) = -INF
* atanh(x) = NaN if |x| > 1, or if x is a NaN or INF
*
*/
/* Offsets for data table __svml_satanh_data_internal_avx512
*/
#define Log_tbl_H 0
#define Log_tbl_L 128
#define One 256
#define AbsMask 320
#define AddB5 384
#define RcpBitMask 448
#define poly_coeff3 512
#define poly_coeff2 576
#define poly_coeff1 640
#define poly_coeff0 704
#define Half 768
#define L2H 832
#define L2L 896
#include <sysdep.h>
.text
.section .text.evex512,"ax",@progbits
ENTRY(_ZGVeN16v_atanhf_skx)
/* AVX-512 atanhf: 16 floats in, 16 floats out, both in zmm0.
   Fast path computes 0.5 * log((1+|x|)/(1-|x|)) with the sign of x
   reapplied at the end; lanes with |x| >= 1 (incl. NaN/Inf) are flagged
   in k0 and recomputed via the scalar atanhf in the special path below.
   NOTE(review): "# LOE" lines are generated live-on-exit register
   bookkeeping comments, not directives.  */
pushq %rbp
cfi_def_cfa_offset(16)
movq %rsp, %rbp
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* 64-byte-align the stack and reserve 192 bytes of spill space used by
   the special-value path (input at 64(%rsp), result at 128(%rsp)).  */
andq $-64, %rsp
subq $192, %rsp
vmovups One+__svml_satanh_data_internal_avx512(%rip), %zmm4
/* round reciprocals to 1+5b mantissas */
vmovups AddB5+__svml_satanh_data_internal_avx512(%rip), %zmm14
vmovups RcpBitMask+__svml_satanh_data_internal_avx512(%rip), %zmm1
/* Keep the original input live in zmm11 for the special path.  */
vmovaps %zmm0, %zmm11
/* zmm6 = |x|.  */
vandps AbsMask+__svml_satanh_data_internal_avx512(%rip), %zmm11, %zmm6
/* 1+y */
vaddps {rn-sae}, %zmm4, %zmm6, %zmm9
/* 1-y */
vsubps {rn-sae}, %zmm6, %zmm4, %zmm8
/* zmm10 = sign bit of x (x XOR |x|).  */
vxorps %zmm6, %zmm11, %zmm10
/* Yp_high */
vsubps {rn-sae}, %zmm4, %zmm9, %zmm2
/* -Ym_high */
vsubps {rn-sae}, %zmm4, %zmm8, %zmm5
/* RcpP ~ 1/Yp */
vrcp14ps %zmm9, %zmm12
/* RcpM ~ 1/Ym */
vrcp14ps %zmm8, %zmm13
/* input outside (-1, 1) ? */
vcmpps $21, {sae}, %zmm4, %zmm6, %k0
vpaddd %zmm14, %zmm12, %zmm15
vpaddd %zmm14, %zmm13, %zmm0
/* Yp_low */
vsubps {rn-sae}, %zmm2, %zmm6, %zmm3
vandps %zmm1, %zmm15, %zmm7
vandps %zmm1, %zmm0, %zmm12
/* Ym_low */
vaddps {rn-sae}, %zmm5, %zmm6, %zmm5
/* Reduced argument: Rp = (RcpP*Yp - 1)+RcpP*Yp_low */
vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9
/* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low */
vfmsub231ps {rn-sae}, %zmm12, %zmm8, %zmm4
vmovups Log_tbl_L+__svml_satanh_data_internal_avx512(%rip), %zmm8
vmovups Log_tbl_L+64+__svml_satanh_data_internal_avx512(%rip), %zmm13
/* exponents */
vgetexpps {sae}, %zmm7, %zmm15
vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9
/* Table lookups */
vmovups __svml_satanh_data_internal_avx512(%rip), %zmm6
vgetexpps {sae}, %zmm12, %zmm14
vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm4
/* Prepare table index */
vpsrld $18, %zmm7, %zmm3
vpsrld $18, %zmm12, %zmm2
vmovups Log_tbl_H+64+__svml_satanh_data_internal_avx512(%rip), %zmm7
vmovups poly_coeff1+__svml_satanh_data_internal_avx512(%rip), %zmm12
/* Km-Kp */
vsubps {rn-sae}, %zmm15, %zmm14, %zmm1
/* Copy the special-lane mask to edx for the scalar fallback loop.  */
kmovw %k0, %edx
vmovaps %zmm3, %zmm0
/* Two-source permutes implement the 32-entry log table lookup for the
   Yp (zmm0/zmm3) and Ym (zmm6/zmm8) reductions.  */
vpermi2ps %zmm13, %zmm8, %zmm3
vpermt2ps %zmm13, %zmm2, %zmm8
vpermi2ps %zmm7, %zmm6, %zmm0
vpermt2ps %zmm7, %zmm2, %zmm6
vsubps {rn-sae}, %zmm3, %zmm8, %zmm5
/* K*L2H + Th */
vmovups L2H+__svml_satanh_data_internal_avx512(%rip), %zmm2
/* K*L2L + Tl */
vmovups L2L+__svml_satanh_data_internal_avx512(%rip), %zmm3
/* polynomials */
vmovups poly_coeff3+__svml_satanh_data_internal_avx512(%rip), %zmm7
vmovups poly_coeff0+__svml_satanh_data_internal_avx512(%rip), %zmm13
/* table values */
vsubps {rn-sae}, %zmm0, %zmm6, %zmm0
vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm0
vfmadd213ps {rn-sae}, %zmm5, %zmm3, %zmm1
vmovups poly_coeff2+__svml_satanh_data_internal_avx512(%rip), %zmm3
vmovaps %zmm3, %zmm2
/* Interleaved Horner evaluation of the two degree-3 polynomials:
   zmm2 accumulates PolyP(Rp), zmm3 accumulates PolyM(Rm).  */
vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm2
vfmadd231ps {rn-sae}, %zmm4, %zmm7, %zmm3
vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm2
vfmadd213ps {rn-sae}, %zmm12, %zmm4, %zmm3
vfmadd213ps {rn-sae}, %zmm13, %zmm9, %zmm2
vfmadd213ps {rn-sae}, %zmm13, %zmm4, %zmm3
/* (K*L2L + Tl) + Rp*PolyP */
vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm2
/* zmm9 = +-0.5 (0.5 with the sign of x OR'ed in; atanh is odd).  */
vorps Half+__svml_satanh_data_internal_avx512(%rip), %zmm10, %zmm9
/* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM */
vfnmadd213ps {rn-sae}, %zmm2, %zmm4, %zmm3
vaddps {rn-sae}, %zmm3, %zmm0, %zmm4
/* Final scale by +-0.5.  */
vmulps {rn-sae}, %zmm9, %zmm4, %zmm0
testl %edx, %edx
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx r12 r13 r14 r15 edx zmm0 zmm11
/* Restore registers
 * and exit the function
 */
L(EXIT):
movq %rbp, %rsp
popq %rbp
cfi_def_cfa(7, 8)
cfi_restore(6)
ret
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* Branch to process
 * special inputs
 */
L(SPECIAL_VALUES_BRANCH):
/* Spill original input and fast-path result so the scalar loop can
   patch only the flagged lanes in place.  */
vmovups %zmm11, 64(%rsp)
vmovups %zmm0, 128(%rsp)
# LOE rbx r12 r13 r14 r15 edx zmm0
xorl %eax, %eax
# LOE rbx r12 r13 r14 r15 eax edx
vzeroupper
movq %r12, 16(%rsp)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
/* r12d = current lane index.  */
movl %eax, %r12d
movq %r13, 8(%rsp)
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
/* r13d = special-lane bit mask.  */
movl %edx, %r13d
movq %r14, (%rsp)
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE rbx r15 r12d r13d
/* Range mask
 * bits check
 */
L(RANGEMASK_CHECK):
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx r15 r12d r13d
/* Special inputs
 * processing loop
 */
L(SPECIAL_VALUES_LOOP):
incl %r12d
/* 16 float lanes per vector.  */
cmpl $16, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx r15 r12d r13d
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
/* Reload the (now fully patched) result vector.  */
vmovups 128(%rsp), %zmm0
/* Go to exit */
jmp L(EXIT)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE rbx r12 r13 r14 r15 zmm0
/* Scalar math function call
 * to process special input
 */
L(SCALAR_MATH_CALL):
movl %r12d, %r14d
/* Load lane r14 of the spilled input, call scalar atanhf, store the
   result back into the spilled result vector.  */
movss 64(%rsp,%r14,4), %xmm0
call atanhf@PLT
# LOE rbx r14 r15 r12d r13d xmm0
movss %xmm0, 128(%rsp,%r14,4)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx r15 r12d r13d
END(_ZGVeN16v_atanhf_skx)
/* Read-only constant tables used by the AVX-512 atanhf kernel above.
   Layout must match the offset macros used by the code; do not reorder.  */
.section .rodata, "a"
.align 64
/* The #ifdef block below is never compiled; it is reference documentation
   (in C-struct form) of the table layout for maintainers.  */
#ifdef __svml_satanh_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
__declspec(align(64)) VUINT32 Log_tbl_H[32][1];
__declspec(align(64)) VUINT32 Log_tbl_L[32][1];
__declspec(align(64)) VUINT32 One[16][1];
__declspec(align(64)) VUINT32 AbsMask[16][1];
__declspec(align(64)) VUINT32 AddB5[16][1];
__declspec(align(64)) VUINT32 RcpBitMask[16][1];
__declspec(align(64)) VUINT32 poly_coeff3[16][1];
__declspec(align(64)) VUINT32 poly_coeff2[16][1];
__declspec(align(64)) VUINT32 poly_coeff1[16][1];
__declspec(align(64)) VUINT32 poly_coeff0[16][1];
__declspec(align(64)) VUINT32 Half[16][1];
__declspec(align(64)) VUINT32 L2H[16][1];
__declspec(align(64)) VUINT32 L2L[16][1];
} __svml_satanh_data_internal_avx512;
#endif
__svml_satanh_data_internal_avx512:
/*== Log_tbl_H ==*/
/* 32-entry table: high parts of log() values (single-precision bit
   patterns), indexed by the reduced-argument table lookup.  */
.long 0x00000000
.long 0x3cfc0000
.long 0x3d780000
.long 0x3db78000
.long 0x3df10000
.long 0x3e14c000
.long 0x3e300000
.long 0x3e4a8000
.long 0x3e648000
.long 0x3e7dc000
.long 0x3e8b4000
.long 0x3e974000
.long 0x3ea30000
.long 0x3eae8000
.long 0x3eb9c000
.long 0x3ec4e000
.long 0x3ecfa000
.long 0x3eda2000
.long 0x3ee48000
.long 0x3eeea000
.long 0x3ef8a000
.long 0x3f013000
.long 0x3f05f000
.long 0x3f0aa000
.long 0x3f0f4000
.long 0x3f13d000
.long 0x3f184000
.long 0x3f1ca000
.long 0x3f20f000
.long 0x3f252000
.long 0x3f295000
.long 0x3f2d7000
/*== Log_tbl_L ==*/
/* Matching low (correction) parts for the table above.  */
.align 64
.long 0x00000000
.long 0x3726c39e
.long 0x38a30c01
.long 0x37528ae5
.long 0x38e0edc5
.long 0xb8ab41f8
.long 0xb7cf8f58
.long 0x3896a73d
.long 0xb5838656
.long 0x380c36af
.long 0xb8235454
.long 0x3862bae1
.long 0x38c5e10e
.long 0x38dedfac
.long 0x38ebfb5e
.long 0xb8e63c9f
.long 0xb85c1340
.long 0x38777bcd
.long 0xb6038656
.long 0x37d40984
.long 0xb8b85028
.long 0xb8ad5a5a
.long 0x3865c84a
.long 0x38c3d2f5
.long 0x383ebce1
.long 0xb8a1ed76
.long 0xb7a332c4
.long 0xb779654f
.long 0xb8602f73
.long 0x38f85db0
.long 0x37b4996f
.long 0xb8bfb3ca
/*== One ==*/
.align 64
.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
/*== AbsMask ==*/
.align 64
.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
/*== AddB5 ==*/
.align 64
.long 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000
/*== RcpBitMask ==*/
.align 64
.long 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
/*== poly_coeff3 ==*/
.align 64
.long 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
/*== poly_coeff2 ==*/
.align 64
.long 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
/*== poly_coeff1 ==*/
.align 64
.long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
/*== poly_coeff0 ==*/
.align 64
.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
/*== Half ==*/
.align 64
.long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
/*== L2H = log(2)_high ==*/
.align 64
.long 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
/*== L2L = log(2)_low ==*/
.align 64
.long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
.align 64
.type __svml_satanh_data_internal_avx512,@object
.size __svml_satanh_data_internal_avx512,.-__svml_satanh_data_internal_avx512

View file

@ -0,0 +1,20 @@
/* SSE2 version of vectorized atanhf, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Build the generic core as the SSE2-suffixed variant used as the
   ifunc fallback on CPUs without SSE4.1.  */
#define _ZGVbN4v_atanhf _ZGVbN4v_atanhf_sse2
#include "../svml_s_atanhf4_core.S"

View file

@ -0,0 +1,28 @@
/* Multiple versions of vectorized atanhf, vector length is 4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* IFUNC dispatcher: at load time select between the _sse2 and _sse4
   implementations of _ZGVbN4v_atanhf (selector comes from
   ifunc-mathvec-sse4_1.h).  */
#define SYMBOL_NAME _ZGVbN4v_atanhf
#include "ifunc-mathvec-sse4_1.h"
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
__hidden_ver1 (_ZGVbN4v_atanhf, __GI__ZGVbN4v_atanhf,
__redirect__ZGVbN4v_atanhf)
__attribute__ ((visibility ("hidden")));
#endif

View file

@ -0,0 +1,361 @@
/* Function atanhf vectorized with SSE4.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/*
* ALGORITHM DESCRIPTION:
*
* Compute atanh(x) as 0.5 * log((1 + x)/(1 - x))
*
* Special cases:
*
* atanh(0) = 0
* atanh(+1) = +INF
* atanh(-1) = -INF
* atanh(x) = NaN if |x| > 1, or if x is a NaN or INF
*
*/
/* Offsets for data table __svml_satanh_data_internal
*/
#define SgnMask 0
#define sOne 16
#define sPoly 32
#define iBrkValue 160
#define iOffExpoMask 176
#define sHalf 192
#define sSign 208
#define sTopMask12 224
#define TinyRange 240
#define sLn2 256
#include <sysdep.h>
.text
.section .text.sse4,"ax",@progbits
/*
 * __m128 _ZGVbN4v_atanhf_sse4 (__m128 x)
 * In:  xmm0 = 4 packed floats.
 * Out: xmm0 = atanhf applied lane-wise.
 * Lanes with |x| >= 1 (including NaN) are recomputed via the scalar
 * atanhf callout on the special-values path below.
 */
ENTRY(_ZGVbN4v_atanhf_sse4)
subq $72, %rsp
cfi_def_cfa_offset(80)
movaps %xmm0, %xmm5
/* Load constants including One = 1 */
movups sOne+__svml_satanh_data_internal(%rip), %xmm4
movaps %xmm5, %xmm3
/* Strip off the sign, so treat X as positive until right at the end */
movups SgnMask+__svml_satanh_data_internal(%rip), %xmm7
movaps %xmm4, %xmm8
andps %xmm5, %xmm7
movaps %xmm4, %xmm10
movups sTopMask12+__svml_satanh_data_internal(%rip), %xmm11
movaps %xmm4, %xmm14
movaps %xmm11, %xmm9
/*
 * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
 * the upper part UHi being <= 12 bits long. Then we have
 * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
 */
movaps %xmm7, %xmm12
/*
 * Check whether |X| < 1, in which case we use the main function.
 * Otherwise set the rangemask so that the callout will get used.
 * Note that this will also use the callout for NaNs since not(NaN < 1).
 */
movaps %xmm7, %xmm6
movaps %xmm7, %xmm2
cmpnltps %xmm4, %xmm6
cmpltps TinyRange+__svml_satanh_data_internal(%rip), %xmm2
mulps %xmm5, %xmm3
subps %xmm7, %xmm8
addps %xmm7, %xmm12
movmskps %xmm6, %edx
subps %xmm8, %xmm10
addps %xmm5, %xmm3
subps %xmm7, %xmm10
andps %xmm8, %xmm9
/*
 * Now we feed into the log1p code, using H in place of _VARG1 and
 * later incorporating L into the reduced argument.
 * compute 1+x as high, low parts
 */
movaps %xmm4, %xmm7
/*
 * Now compute R = 1/(UHi+ULo) * (1 - E) and the error term E
 * The first FMR is exact (we force R to 12 bits just in case it
 * isn't already, to make absolutely sure), and since E is ~ 2^-12,
 * the rounding error in the other one is acceptable.
 */
rcpps %xmm9, %xmm15
subps %xmm9, %xmm8
andps %xmm11, %xmm15
/*
 * Split V as well into upper 12 bits and lower part, so that we can get
 * a preliminary quotient estimate without rounding error.
 */
andps %xmm12, %xmm11
mulps %xmm15, %xmm9
addps %xmm8, %xmm10
subps %xmm11, %xmm12
/* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
mulps %xmm15, %xmm11
mulps %xmm15, %xmm10
subps %xmm9, %xmm14
mulps %xmm12, %xmm15
subps %xmm10, %xmm14
/* Compute D = E + E^2 */
movaps %xmm14, %xmm13
movaps %xmm4, %xmm8
mulps %xmm14, %xmm13
/* reduction: compute r,n */
movdqu iBrkValue+__svml_satanh_data_internal(%rip), %xmm9
addps %xmm13, %xmm14
/*
 * Compute R * (VHi + VLo) * (1 + E + E^2)
 * = R * (VHi + VLo) * (1 + D)
 * = QHi + (QHi * D + QLo + QLo * D)
 */
movaps %xmm14, %xmm0
mulps %xmm15, %xmm14
mulps %xmm11, %xmm0
addps %xmm14, %xmm15
movdqu iOffExpoMask+__svml_satanh_data_internal(%rip), %xmm12
movaps %xmm4, %xmm14
/* Record the sign for eventual reincorporation. */
movups sSign+__svml_satanh_data_internal(%rip), %xmm1
addps %xmm15, %xmm0
/*
 * Now finally accumulate the high and low parts of the
 * argument to log1p, H + L, with a final compensated summation.
 */
movaps %xmm0, %xmm6
andps %xmm5, %xmm1
/* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
orps %xmm1, %xmm3
addps %xmm11, %xmm6
maxps %xmm6, %xmm7
minps %xmm6, %xmm8
subps %xmm6, %xmm11
movaps %xmm7, %xmm10
andps %xmm2, %xmm3
addps %xmm8, %xmm10
addps %xmm11, %xmm0
subps %xmm10, %xmm7
psubd %xmm9, %xmm10
addps %xmm7, %xmm8
pand %xmm10, %xmm12
psrad $23, %xmm10
cvtdq2ps %xmm10, %xmm13
addps %xmm8, %xmm0
/* final reconstruction */
mulps sLn2+__svml_satanh_data_internal(%rip), %xmm13
pslld $23, %xmm10
paddd %xmm9, %xmm12
psubd %xmm10, %xmm14
/* polynomial evaluation */
subps %xmm4, %xmm12
mulps %xmm0, %xmm14
movups sPoly+112+__svml_satanh_data_internal(%rip), %xmm0
addps %xmm12, %xmm14
mulps %xmm14, %xmm0
/* Finally, halve the result and reincorporate the sign */
movups sHalf+__svml_satanh_data_internal(%rip), %xmm4
pxor %xmm1, %xmm4
addps sPoly+96+__svml_satanh_data_internal(%rip), %xmm0
mulps %xmm14, %xmm0
addps sPoly+80+__svml_satanh_data_internal(%rip), %xmm0
mulps %xmm14, %xmm0
addps sPoly+64+__svml_satanh_data_internal(%rip), %xmm0
mulps %xmm14, %xmm0
addps sPoly+48+__svml_satanh_data_internal(%rip), %xmm0
mulps %xmm14, %xmm0
addps sPoly+32+__svml_satanh_data_internal(%rip), %xmm0
mulps %xmm14, %xmm0
addps sPoly+16+__svml_satanh_data_internal(%rip), %xmm0
mulps %xmm14, %xmm0
addps sPoly+__svml_satanh_data_internal(%rip), %xmm0
mulps %xmm14, %xmm0
mulps %xmm14, %xmm0
addps %xmm0, %xmm14
movaps %xmm2, %xmm0
addps %xmm13, %xmm14
mulps %xmm14, %xmm4
andnps %xmm4, %xmm0
orps %xmm3, %xmm0
/* %edx has one bit set per lane that needs the scalar callout */
testl %edx, %edx
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm5
/* Restore registers
 * and exit the function
 */
L(EXIT):
addq $72, %rsp
cfi_def_cfa_offset(8)
ret
cfi_def_cfa_offset(80)
/* Branch to process
 * special inputs
 */
L(SPECIAL_VALUES_BRANCH):
/* Spill input (32(%rsp)) and fast-path result (48(%rsp)) so the
   scalar loop can patch only the flagged lanes.  */
movups %xmm5, 32(%rsp)
movups %xmm0, 48(%rsp)
# LOE rbx rbp r12 r13 r14 r15 edx
xorl %eax, %eax
movq %r12, 16(%rsp)
cfi_offset(12, -64)
movl %eax, %r12d
movq %r13, 8(%rsp)
cfi_offset(13, -72)
movl %edx, %r13d
movq %r14, (%rsp)
cfi_offset(14, -80)
# LOE rbx rbp r15 r12d r13d
/* Range mask
 * bits check
 */
L(RANGEMASK_CHECK):
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx rbp r15 r12d r13d
/* Special inputs
 * processing loop
 */
L(SPECIAL_VALUES_LOOP):
incl %r12d
cmpl $4, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx rbp r15 r12d r13d
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
movups 48(%rsp), %xmm0
/* Go to exit */
jmp L(EXIT)
cfi_offset(12, -64)
cfi_offset(13, -72)
cfi_offset(14, -80)
# LOE rbx rbp r12 r13 r14 r15 xmm0
/* Scalar math function call
 * to process special input
 */
L(SCALAR_MATH_CALL):
movl %r12d, %r14d
movss 32(%rsp,%r14,4), %xmm0
call atanhf@PLT
# LOE rbx rbp r14 r15 r12d r13d xmm0
movss %xmm0, 48(%rsp,%r14,4)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx rbp r15 r12d r13d
END(_ZGVbN4v_atanhf_sse4)
/* Read-only constant tables for the SSE4 atanhf kernel above.
   Layout must match the offset macros used by the code; do not reorder.  */
.section .rodata, "a"
.align 16
/* The #ifdef block below is never compiled; it documents the table
   layout in C-struct form for maintainers.  */
#ifdef __svml_satanh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
__declspec(align(16)) VUINT32 SgnMask[4][1];
__declspec(align(16)) VUINT32 sOne[4][1];
__declspec(align(16)) VUINT32 sPoly[8][4][1];
__declspec(align(16)) VUINT32 iBrkValue[4][1];
__declspec(align(16)) VUINT32 iOffExpoMask[4][1];
__declspec(align(16)) VUINT32 sHalf[4][1];
__declspec(align(16)) VUINT32 sSign[4][1];
__declspec(align(16)) VUINT32 sTopMask12[4][1];
__declspec(align(16)) VUINT32 TinyRange[4][1];
__declspec(align(16)) VUINT32 sLn2[4][1];
} __svml_satanh_data_internal;
#endif
__svml_satanh_data_internal:
/*== SgnMask ==*/
.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
/*== sOne = SP 1.0 ==*/
.align 16
.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
/*== sPoly[] = SP polynomial ==*/
.align 16
.long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
.long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
.long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
.long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
.long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
.long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
.long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
.long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
/*== iBrkValue = SP 2/3 ==*/
.align 16
.long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
/*== iOffExpoMask = SP significand mask ==*/
.align 16
.long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
/*== sHalf ==*/
.align 16
.long 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
/*== sSign ==*/
.align 16
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000
/*== sTopMask12 ==*/
.align 16
.long 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
/*== TinyRange ==*/
.align 16
.long 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
/*== sLn2 = SP ln(2) ==*/
.align 16
.long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
.align 16
.type __svml_satanh_data_internal,@object
.size __svml_satanh_data_internal,.-__svml_satanh_data_internal

View file

@ -0,0 +1,20 @@
/* SSE version of vectorized atanhf, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Build the generic (wrapper-based) core as the fallback variant used
   by the ifunc when AVX2 is unavailable.  */
#define _ZGVdN8v_atanhf _ZGVdN8v_atanhf_sse_wrapper
#include "../svml_s_atanhf8_core.S"

View file

@ -0,0 +1,28 @@
/* Multiple versions of vectorized atanhf, vector length is 8.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* IFUNC dispatcher: at load time select between the _sse_wrapper and
   _avx2 implementations of _ZGVdN8v_atanhf (selector comes from
   ifunc-mathvec-avx2.h).  */
#define SYMBOL_NAME _ZGVdN8v_atanhf
#include "ifunc-mathvec-avx2.h"
libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
#ifdef SHARED
__hidden_ver1 (_ZGVdN8v_atanhf, __GI__ZGVdN8v_atanhf,
__redirect__ZGVdN8v_atanhf)
__attribute__ ((visibility ("hidden")));
#endif

View file

@ -0,0 +1,335 @@
/* Function atanhf vectorized with AVX2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/*
* ALGORITHM DESCRIPTION:
*
* Compute atanh(x) as 0.5 * log((1 + x)/(1 - x))
*
* Special cases:
*
* atanh(0) = 0
* atanh(+1) = +INF
* atanh(-1) = -INF
* atanh(x) = NaN if |x| > 1, or if x is a NaN or INF
*
*/
/* Offsets for data table __svml_satanh_data_internal
*/
#define SgnMask 0
#define sOne 32
#define sPoly 64
#define iBrkValue 320
#define iOffExpoMask 352
#define sHalf 384
#define sSign 416
#define sTopMask12 448
#define TinyRange 480
#define sLn2 512
#include <sysdep.h>
.text
.section .text.avx2,"ax",@progbits
/*
 * __m256 _ZGVdN8v_atanhf_avx2 (__m256 x)
 * In:  ymm0 = 8 packed floats.
 * Out: ymm0 = atanhf applied lane-wise.
 * Lanes with |x| >= 1 (including NaN) are recomputed via the scalar
 * atanhf callout on the special-values path below.
 */
ENTRY(_ZGVdN8v_atanhf_avx2)
pushq %rbp
cfi_def_cfa_offset(16)
movq %rsp, %rbp
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
andq $-32, %rsp
subq $96, %rsp
/* Load constants including One = 1 */
vmovups sOne+__svml_satanh_data_internal(%rip), %ymm5
vmovups sTopMask12+__svml_satanh_data_internal(%rip), %ymm13
vmovaps %ymm0, %ymm6
/* Strip off the sign, so treat X as positive until right at the end */
vandps SgnMask+__svml_satanh_data_internal(%rip), %ymm6, %ymm10
vsubps %ymm10, %ymm5, %ymm1
/*
 * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
 * the upper part UHi being <= 12 bits long. Then we have
 * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
 */
vaddps %ymm10, %ymm10, %ymm14
/*
 * Check whether |X| < 1, in which case we use the main function.
 * Otherwise set the rangemask so that the callout will get used.
 * Note that this will also use the callout for NaNs since not(NaN < 1).
 */
vcmpnlt_uqps %ymm5, %ymm10, %ymm7
vsubps %ymm1, %ymm5, %ymm9
vcmplt_oqps TinyRange+__svml_satanh_data_internal(%rip), %ymm10, %ymm4
vrcpps %ymm1, %ymm11
vsubps %ymm10, %ymm9, %ymm12
vandps %ymm13, %ymm11, %ymm0
/* No need to split sU when FMA is available */
vfnmadd213ps %ymm5, %ymm0, %ymm1
vmovaps %ymm6, %ymm8
vfmadd213ps %ymm6, %ymm6, %ymm8
vfnmadd231ps %ymm0, %ymm12, %ymm1
/*
 * Split V as well into upper 12 bits and lower part, so that we can get
 * a preliminary quotient estimate without rounding error.
 */
vandps %ymm13, %ymm14, %ymm15
vmovmskps %ymm7, %edx
vsubps %ymm15, %ymm14, %ymm7
/* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
vmulps %ymm15, %ymm0, %ymm10
/* Compute D = E + E^2 */
vfmadd213ps %ymm1, %ymm1, %ymm1
/* Record the sign for eventual reincorporation. */
vandps sSign+__svml_satanh_data_internal(%rip), %ymm6, %ymm3
/* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
vorps %ymm3, %ymm8, %ymm2
vmulps %ymm7, %ymm0, %ymm8
/*
 * Compute R * (VHi + VLo) * (1 + E + E^2)
 * = R * (VHi + VLo) * (1 + D)
 * = QHi + (QHi * D + QLo + QLo * D)
 */
vmulps %ymm1, %ymm10, %ymm9
vfmadd213ps %ymm8, %ymm8, %ymm1
vaddps %ymm1, %ymm9, %ymm1
/* reduction: compute r,n */
vmovups iBrkValue+__svml_satanh_data_internal(%rip), %ymm9
/*
 * Now finally accumulate the high and low parts of the
 * argument to log1p, H + L, with a final compensated summation.
 */
vaddps %ymm1, %ymm10, %ymm12
vsubps %ymm12, %ymm10, %ymm11
/*
 * Now we feed into the log1p code, using H in place of _VARG1 and
 * later incorporating L into the reduced argument.
 * compute 1+x as high, low parts
 */
vmaxps %ymm12, %ymm5, %ymm13
vminps %ymm12, %ymm5, %ymm14
vaddps %ymm11, %ymm1, %ymm0
vaddps %ymm14, %ymm13, %ymm1
vpsubd %ymm9, %ymm1, %ymm7
vsubps %ymm1, %ymm13, %ymm15
vpsrad $23, %ymm7, %ymm10
vpand iOffExpoMask+__svml_satanh_data_internal(%rip), %ymm7, %ymm8
vaddps %ymm15, %ymm14, %ymm13
vpslld $23, %ymm10, %ymm11
vpaddd %ymm9, %ymm8, %ymm15
vaddps %ymm13, %ymm0, %ymm14
vcvtdq2ps %ymm10, %ymm0
vpsubd %ymm11, %ymm5, %ymm12
/* polynomial evaluation */
vsubps %ymm5, %ymm15, %ymm5
vmulps %ymm14, %ymm12, %ymm1
vaddps %ymm5, %ymm1, %ymm5
vmovups sPoly+224+__svml_satanh_data_internal(%rip), %ymm1
vfmadd213ps sPoly+192+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
vfmadd213ps sPoly+160+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
vfmadd213ps sPoly+128+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
vfmadd213ps sPoly+96+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
vfmadd213ps sPoly+64+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
vfmadd213ps sPoly+32+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
vfmadd213ps sPoly+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
vmulps %ymm1, %ymm5, %ymm7
vfmadd213ps %ymm5, %ymm5, %ymm7
/* final reconstruction */
vfmadd132ps sLn2+__svml_satanh_data_internal(%rip), %ymm7, %ymm0
/* Finally, halve the result and reincorporate the sign */
vxorps sHalf+__svml_satanh_data_internal(%rip), %ymm3, %ymm3
vmulps %ymm0, %ymm3, %ymm0
vblendvps %ymm4, %ymm2, %ymm0, %ymm0
/* %edx has one bit set per lane that needs the scalar callout */
testl %edx, %edx
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx r12 r13 r14 r15 edx ymm0 ymm6
/* Restore registers
 * and exit the function
 */
L(EXIT):
movq %rbp, %rsp
popq %rbp
cfi_def_cfa(7, 8)
cfi_restore(6)
ret
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* Branch to process
 * special inputs
 */
L(SPECIAL_VALUES_BRANCH):
/* Spill input (32(%rsp)) and fast-path result (64(%rsp)) so the
   scalar loop can patch only the flagged lanes.  */
vmovups %ymm6, 32(%rsp)
vmovups %ymm0, 64(%rsp)
# LOE rbx r12 r13 r14 r15 edx ymm0
xorl %eax, %eax
# LOE rbx r12 r13 r14 r15 eax edx
vzeroupper
movq %r12, 16(%rsp)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
movl %eax, %r12d
movq %r13, 8(%rsp)
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
movl %edx, %r13d
movq %r14, (%rsp)
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
# LOE rbx r15 r12d r13d
/* Range mask
 * bits check
 */
L(RANGEMASK_CHECK):
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx r15 r12d r13d
/* Special inputs
 * processing loop
 */
L(SPECIAL_VALUES_LOOP):
incl %r12d
cmpl $8, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx r15 r12d r13d
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
vmovups 64(%rsp), %ymm0
/* Go to exit */
jmp L(EXIT)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
# LOE rbx r12 r13 r14 r15 ymm0
/* Scalar math function call
 * to process special input
 */
L(SCALAR_MATH_CALL):
movl %r12d, %r14d
movss 32(%rsp,%r14,4), %xmm0
call atanhf@PLT
# LOE rbx r14 r15 r12d r13d xmm0
movss %xmm0, 64(%rsp,%r14,4)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx r15 r12d r13d
END(_ZGVdN8v_atanhf_avx2)
/* Read-only constant tables for the AVX2 atanhf kernel above.
   Layout must match the offset macros used by the code; do not reorder.  */
.section .rodata, "a"
.align 32
/* The #ifdef block below is never compiled; it documents the table
   layout in C-struct form for maintainers.  */
#ifdef __svml_satanh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
__declspec(align(32)) VUINT32 SgnMask[8][1];
__declspec(align(32)) VUINT32 sOne[8][1];
__declspec(align(32)) VUINT32 sPoly[8][8][1];
__declspec(align(32)) VUINT32 iBrkValue[8][1];
__declspec(align(32)) VUINT32 iOffExpoMask[8][1];
__declspec(align(32)) VUINT32 sHalf[8][1];
__declspec(align(32)) VUINT32 sSign[8][1];
__declspec(align(32)) VUINT32 sTopMask12[8][1];
__declspec(align(32)) VUINT32 TinyRange[8][1];
__declspec(align(32)) VUINT32 sLn2[8][1];
} __svml_satanh_data_internal;
#endif
__svml_satanh_data_internal:
/*== SgnMask ==*/
.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
/*== sOne = SP 1.0 ==*/
.align 32
.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
/*== sPoly[] = SP polynomial ==*/
.align 32
.long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
.long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
.long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
.long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
.long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
.long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
.long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
.long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
/*== iBrkValue = SP 2/3 ==*/
.align 32
.long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
/*== iOffExpoMask = SP significand mask ==*/
.align 32
.long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
/*== sHalf ==*/
.align 32
.long 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
/*== sSign ==*/
.align 32
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
/*== sTopMask12 ==*/
.align 32
.long 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
/*== TinyRange ==*/
.align 32
.long 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
/*== sLn2 = SP ln(2) ==*/
.align 32
.long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
.align 32
.type __svml_satanh_data_internal,@object
.size __svml_satanh_data_internal,.-__svml_satanh_data_internal

View file

@ -0,0 +1,29 @@
/* Function atanh vectorized with SSE2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* SSE2 2-lane double atanh: loops the scalar atanh over both lanes
   via the generic wrapper macro.  */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
.text
ENTRY (_ZGVbN2v_atanh)
WRAPPER_IMPL_SSE2 atanh
END (_ZGVbN2v_atanh)
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVbN2v_atanh)
#endif

View file

@ -0,0 +1,29 @@
/* Function atanh vectorized with AVX2, wrapper version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* AVX2 4-lane double atanh, implemented as two calls to the 2-lane
   SSE variant via the generic AVX wrapper macro.  */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
.text
ENTRY (_ZGVdN4v_atanh)
WRAPPER_IMPL_AVX _ZGVbN2v_atanh
END (_ZGVdN4v_atanh)
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVdN4v_atanh)
#endif

View file

@ -0,0 +1,25 @@
/* Function atanh vectorized in AVX ISA as wrapper to SSE4 ISA version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
/* Vector-ABI entry: 4-lane double atanh for the AVX ('c' ISA) variant,
   wrapping the 128-bit _ZGVbN2v_atanh via WRAPPER_IMPL_AVX (expansion
   lives in svml_d_wrapper_impl.h).  */
.text
ENTRY (_ZGVcN4v_atanh)
WRAPPER_IMPL_AVX _ZGVbN2v_atanh
END (_ZGVcN4v_atanh)

View file

@ -0,0 +1,25 @@
/* Function atanh vectorized with AVX-512, wrapper to AVX2.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_d_wrapper_impl.h"
/* Vector-ABI entry: 8-lane (512-bit zmm) double atanh for AVX-512
   ('e' ISA), wrapping the 4-lane AVX2 variant via WRAPPER_IMPL_AVX512
   (expansion in svml_d_wrapper_impl.h).  */
.text
ENTRY (_ZGVeN8v_atanh)
WRAPPER_IMPL_AVX512 _ZGVdN4v_atanh
END (_ZGVeN8v_atanh)

View file

@ -0,0 +1,25 @@
/* Function atanhf vectorized with AVX-512. Wrapper to AVX2 version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
/* Vector-ABI entry: 16-lane (512-bit zmm) float atanhf for AVX-512
   ('e' ISA), wrapping the 8-lane AVX2 variant via WRAPPER_IMPL_AVX512
   (expansion in svml_s_wrapper_impl.h).  */
.text
ENTRY (_ZGVeN16v_atanhf)
WRAPPER_IMPL_AVX512 _ZGVdN8v_atanhf
END (_ZGVeN16v_atanhf)

View file

@ -0,0 +1,29 @@
/* Function atanhf vectorized with SSE2, wrapper version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
/* Vector-ABI entry: 4-lane (128-bit xmm) float atanhf.
   WRAPPER_IMPL_SSE2 presumably expands to per-lane calls of the scalar
   libm atanhf — see svml_s_wrapper_impl.h for the actual expansion.  */
.text
ENTRY (_ZGVbN4v_atanhf)
WRAPPER_IMPL_SSE2 atanhf
END (_ZGVbN4v_atanhf)
#ifndef USE_MULTIARCH
/* Without multiarch, this generic wrapper also provides the hidden
   intra-library alias.  */
libmvec_hidden_def (_ZGVbN4v_atanhf)
#endif

View file

@ -0,0 +1,29 @@
/* Function atanhf vectorized with AVX2, wrapper version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
/* Vector-ABI entry: 8-lane (256-bit ymm) float atanhf for AVX2 ('d' ISA),
   wrapping the 4-lane SSE variant via WRAPPER_IMPL_AVX (expansion in
   svml_s_wrapper_impl.h).  */
.text
ENTRY (_ZGVdN8v_atanhf)
WRAPPER_IMPL_AVX _ZGVbN4v_atanhf
END (_ZGVdN8v_atanhf)
#ifndef USE_MULTIARCH
/* Hidden alias for non-multiarch builds (multiarch provides its own).  */
libmvec_hidden_def (_ZGVdN8v_atanhf)
#endif

View file

@ -0,0 +1,25 @@
/* Function atanhf vectorized in AVX ISA as wrapper to SSE4 ISA version.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
/* Vector-ABI entry: 8-lane float atanhf for the AVX ('c' ISA) variant,
   wrapping the 128-bit _ZGVbN4v_atanhf via WRAPPER_IMPL_AVX (expansion
   in svml_s_wrapper_impl.h).  */
.text
ENTRY (_ZGVcN8v_atanhf)
WRAPPER_IMPL_AVX _ZGVbN4v_atanhf
END (_ZGVcN8v_atanhf)

View file

@ -0,0 +1 @@
#include "test-double-libmvec-atanh.c"

View file

@ -0,0 +1 @@
#include "test-double-libmvec-atanh.c"

View file

@ -0,0 +1 @@
#include "test-double-libmvec-atanh.c"

View file

@ -0,0 +1,3 @@
/* Instantiate the generic one-argument libmvec ABI test for the
   double-precision atanh vector variants.  */
#define LIBMVEC_TYPE double
#define LIBMVEC_FUNC atanh
#include "test-vector-abi-arg1.h"

View file

@ -41,6 +41,7 @@ VECTOR_WRAPPER_ff (WRAPPER_NAME (atan2), _ZGVbN2vv_atan2)
VECTOR_WRAPPER (WRAPPER_NAME (log10), _ZGVbN2v_log10)
VECTOR_WRAPPER (WRAPPER_NAME (log2), _ZGVbN2v_log2)
VECTOR_WRAPPER (WRAPPER_NAME (log1p), _ZGVbN2v_log1p)
/* Route test calls of atanh to the 2-lane SSE vector variant.  */
VECTOR_WRAPPER (WRAPPER_NAME (atanh), _ZGVbN2v_atanh)
#define VEC_INT_TYPE __m128i

View file

@ -44,6 +44,7 @@ VECTOR_WRAPPER_ff (WRAPPER_NAME (atan2), _ZGVdN4vv_atan2)
VECTOR_WRAPPER (WRAPPER_NAME (log10), _ZGVdN4v_log10)
VECTOR_WRAPPER (WRAPPER_NAME (log2), _ZGVdN4v_log2)
VECTOR_WRAPPER (WRAPPER_NAME (log1p), _ZGVdN4v_log1p)
/* Route test calls of atanh to the 4-lane AVX2 vector variant.  */
VECTOR_WRAPPER (WRAPPER_NAME (atanh), _ZGVdN4v_atanh)
#ifndef __ILP32__
# define VEC_INT_TYPE __m256i

View file

@ -41,6 +41,7 @@ VECTOR_WRAPPER_ff (WRAPPER_NAME (atan2), _ZGVcN4vv_atan2)
VECTOR_WRAPPER (WRAPPER_NAME (log10), _ZGVcN4v_log10)
VECTOR_WRAPPER (WRAPPER_NAME (log2), _ZGVcN4v_log2)
VECTOR_WRAPPER (WRAPPER_NAME (log1p), _ZGVcN4v_log1p)
/* Route test calls of atanh to the 4-lane AVX vector variant.  */
VECTOR_WRAPPER (WRAPPER_NAME (atanh), _ZGVcN4v_atanh)
#define VEC_INT_TYPE __m128i

View file

@ -41,6 +41,7 @@ VECTOR_WRAPPER_ff (WRAPPER_NAME (atan2), _ZGVeN8vv_atan2)
VECTOR_WRAPPER (WRAPPER_NAME (log10), _ZGVeN8v_log10)
VECTOR_WRAPPER (WRAPPER_NAME (log2), _ZGVeN8v_log2)
VECTOR_WRAPPER (WRAPPER_NAME (log1p), _ZGVeN8v_log1p)
/* Route test calls of atanh to the 8-lane AVX-512 vector variant.  */
VECTOR_WRAPPER (WRAPPER_NAME (atanh), _ZGVeN8v_atanh)
#ifndef __ILP32__
# define VEC_INT_TYPE __m512i

View file

@ -0,0 +1 @@
#include "test-float-libmvec-atanhf.c"

View file

@ -0,0 +1 @@
#include "test-float-libmvec-atanhf.c"

View file

@ -0,0 +1 @@
#include "test-float-libmvec-atanhf.c"

View file

@ -0,0 +1,3 @@
/* Instantiate the generic one-argument libmvec ABI test for the
   single-precision atanhf vector variants.  */
#define LIBMVEC_TYPE float
#define LIBMVEC_FUNC atanhf
#include "test-vector-abi-arg1.h"

View file

@ -41,6 +41,7 @@ VECTOR_WRAPPER_ff (WRAPPER_NAME (atan2f), _ZGVeN16vv_atan2f)
VECTOR_WRAPPER (WRAPPER_NAME (log10f), _ZGVeN16v_log10f)
VECTOR_WRAPPER (WRAPPER_NAME (log2f), _ZGVeN16v_log2f)
VECTOR_WRAPPER (WRAPPER_NAME (log1pf), _ZGVeN16v_log1pf)
/* Route test calls of atanhf to the 16-lane AVX-512 vector variant.  */
VECTOR_WRAPPER (WRAPPER_NAME (atanhf), _ZGVeN16v_atanhf)
#define VEC_INT_TYPE __m512i

View file

@ -41,6 +41,7 @@ VECTOR_WRAPPER_ff (WRAPPER_NAME (atan2f), _ZGVbN4vv_atan2f)
VECTOR_WRAPPER (WRAPPER_NAME (log10f), _ZGVbN4v_log10f)
VECTOR_WRAPPER (WRAPPER_NAME (log2f), _ZGVbN4v_log2f)
VECTOR_WRAPPER (WRAPPER_NAME (log1pf), _ZGVbN4v_log1pf)
/* Route test calls of atanhf to the 4-lane SSE vector variant.  */
VECTOR_WRAPPER (WRAPPER_NAME (atanhf), _ZGVbN4v_atanhf)
#define VEC_INT_TYPE __m128i

View file

@ -44,6 +44,7 @@ VECTOR_WRAPPER_ff (WRAPPER_NAME (atan2f), _ZGVdN8vv_atan2f)
VECTOR_WRAPPER (WRAPPER_NAME (log10f), _ZGVdN8v_log10f)
VECTOR_WRAPPER (WRAPPER_NAME (log2f), _ZGVdN8v_log2f)
VECTOR_WRAPPER (WRAPPER_NAME (log1pf), _ZGVdN8v_log1pf)
/* Route test calls of atanhf to the 8-lane AVX2 vector variant.  */
VECTOR_WRAPPER (WRAPPER_NAME (atanhf), _ZGVdN8v_atanhf)
/* Redefinition of wrapper to be compatible with _ZGVdN8vvv_sincosf. */
#undef VECTOR_WRAPPER_fFF

View file

@ -41,6 +41,7 @@ VECTOR_WRAPPER_ff (WRAPPER_NAME (atan2f), _ZGVcN8vv_atan2f)
VECTOR_WRAPPER (WRAPPER_NAME (log10f), _ZGVcN8v_log10f)
VECTOR_WRAPPER (WRAPPER_NAME (log2f), _ZGVcN8v_log2f)
VECTOR_WRAPPER (WRAPPER_NAME (log1pf), _ZGVcN8v_log1pf)
/* Route test calls of atanhf to the 8-lane AVX vector variant.  */
VECTOR_WRAPPER (WRAPPER_NAME (atanhf), _ZGVcN8v_atanhf)
#define VEC_INT_TYPE __m128i