glibc/sysdeps/x86_64/multiarch/ifunc-impl-list.c

/* Enumerate available IFUNC implementations of a function. x86-64 version.
Copyright (C) 2012-2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <assert.h>
#include <string.h>
#include <wchar.h>
#include <ifunc-impl-list.h>
#include <sysdep.h>
#include "init-arch.h"
/* Fill ARRAY of MAX elements with IFUNC implementations for function
NAME supported on target machine and return the number of valid
entries. Each set of implementations for a given function is sorted in
descending order by ISA level. */
size_t
__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
size_t max)
{
size_t i = max;
/* Support sysdeps/x86_64/multiarch/memcmpeq.c. */
IFUNC_IMPL (i, name, __memcmpeq,
X86_IFUNC_IMPL_ADD_V4 (array, i, __memcmpeq,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__memcmpeq_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, __memcmpeq,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)),
__memcmpeq_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, __memcmpeq,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__memcmpeq_avx2_rtm)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, __memcmpeq,
1,
__memcmpeq_sse2))
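
/* Note on the X86_IFUNC_IMPL_ADD_V{2,3,4} wrappers used throughout this
   list: the suffix is the x86-64 ISA level of the implementation
   (V2 ~ x86-64-v2, V3 ~ x86-64-v3, V4 ~ x86-64-v4).  When glibc is built
   for a minimum ISA level whose baseline already guarantees a better
   implementation, entries below that level are compiled out; every entry
   needed at a lower level must therefore have a guaranteed replacement at
   the baseline, which is why the SSE2 implementations are registered
   through the V2 wrapper.  */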
/* Support sysdeps/x86_64/multiarch/memchr.c. */
IFUNC_IMPL (i, name, memchr,
X86_IFUNC_IMPL_ADD_V4 (array, i, memchr,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__memchr_evex)
X86_IFUNC_IMPL_ADD_V4 (array, i, memchr,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__memchr_evex_rtm)
X86_IFUNC_IMPL_ADD_V3 (array, i, memchr,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)),
__memchr_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, memchr,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__memchr_avx2_rtm)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, memchr,
1,
__memchr_sse2))
/* Support sysdeps/x86_64/multiarch/memcmp.c. */
IFUNC_IMPL (i, name, memcmp,
/* NB: If any of these names change or if any new
implementations are added be sure to update
sysdeps/x86_64/memcmp-isa-default-impl.h. */
X86_IFUNC_IMPL_ADD_V4 (array, i, memcmp,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)),
__memcmp_evex_movbe)
X86_IFUNC_IMPL_ADD_V3 (array, i, memcmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)),
__memcmp_avx2_movbe)
X86_IFUNC_IMPL_ADD_V3 (array, i, memcmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)
&& CPU_FEATURE_USABLE (RTM)),
__memcmp_avx2_movbe_rtm)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, memcmp,
1,
__memcmp_sse2))
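
/* Naming note for the memmove/memset families below: "_unaligned"
   variants copy with unaligned vector loads/stores, "_unaligned_erms"
   variants additionally switch to rep movsb / rep stosb above a size
   threshold (Enhanced REP MOVSB/STOSB), "_no_vzeroupper" marks the
   AVX-512 variant that avoids vzeroupper, and "_rtm" variants are tuned
   to be safe inside Intel RTM transactions.  */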
#ifdef SHARED
/* Support sysdeps/x86_64/multiarch/memmove_chk.c. */
IFUNC_IMPL (i, name, __memmove_chk,
IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
__memmove_chk_erms)
X86_IFUNC_IMPL_ADD_V4 (array, i, __memmove_chk,
CPU_FEATURE_USABLE (AVX512F),
__memmove_chk_avx512_no_vzeroupper)
X86_IFUNC_IMPL_ADD_V4 (array, i, __memmove_chk,
CPU_FEATURE_USABLE (AVX512VL),
__memmove_chk_avx512_unaligned)
X86_IFUNC_IMPL_ADD_V4 (array, i, __memmove_chk,
CPU_FEATURE_USABLE (AVX512VL),
__memmove_chk_avx512_unaligned_erms)
X86_IFUNC_IMPL_ADD_V4 (array, i, __memmove_chk,
CPU_FEATURE_USABLE (AVX512VL),
__memmove_chk_evex_unaligned)
X86_IFUNC_IMPL_ADD_V4 (array, i, __memmove_chk,
CPU_FEATURE_USABLE (AVX512VL),
__memmove_chk_evex_unaligned_erms)
X86_IFUNC_IMPL_ADD_V3 (array, i, __memmove_chk,
CPU_FEATURE_USABLE (AVX),
__memmove_chk_avx_unaligned)
X86_IFUNC_IMPL_ADD_V3 (array, i, __memmove_chk,
CPU_FEATURE_USABLE (AVX),
__memmove_chk_avx_unaligned_erms)
X86_IFUNC_IMPL_ADD_V3 (array, i, __memmove_chk,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memmove_chk_avx_unaligned_rtm)
X86_IFUNC_IMPL_ADD_V3 (array, i, __memmove_chk,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memmove_chk_avx_unaligned_erms_rtm)
/* By V3 we assume fast aligned copy. */
X86_IFUNC_IMPL_ADD_V2 (array, i, __memmove_chk,
CPU_FEATURE_USABLE (SSSE3),
__memmove_chk_ssse3)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2 (SSSE3 is too
optimized around aligned copy to be better as general
purpose memmove). */
X86_IFUNC_IMPL_ADD_V2 (array, i, __memmove_chk, 1,
__memmove_chk_sse2_unaligned)
X86_IFUNC_IMPL_ADD_V2 (array, i, __memmove_chk, 1,
__memmove_chk_sse2_unaligned_erms))
#endif
/* Support sysdeps/x86_64/multiarch/memmove.c. */
IFUNC_IMPL (i, name, memmove,
IFUNC_IMPL_ADD (array, i, memmove, 1,
__memmove_erms)
X86_IFUNC_IMPL_ADD_V4 (array, i, memmove,
CPU_FEATURE_USABLE (AVX512F),
__memmove_avx512_no_vzeroupper)
X86_IFUNC_IMPL_ADD_V4 (array, i, memmove,
CPU_FEATURE_USABLE (AVX512VL),
__memmove_avx512_unaligned)
X86_IFUNC_IMPL_ADD_V4 (array, i, memmove,
CPU_FEATURE_USABLE (AVX512VL),
__memmove_avx512_unaligned_erms)
X86_IFUNC_IMPL_ADD_V4 (array, i, memmove,
CPU_FEATURE_USABLE (AVX512VL),
__memmove_evex_unaligned)
X86_IFUNC_IMPL_ADD_V4 (array, i, memmove,
CPU_FEATURE_USABLE (AVX512VL),
__memmove_evex_unaligned_erms)
X86_IFUNC_IMPL_ADD_V3 (array, i, memmove,
CPU_FEATURE_USABLE (AVX),
__memmove_avx_unaligned)
X86_IFUNC_IMPL_ADD_V3 (array, i, memmove,
CPU_FEATURE_USABLE (AVX),
__memmove_avx_unaligned_erms)
X86_IFUNC_IMPL_ADD_V3 (array, i, memmove,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memmove_avx_unaligned_rtm)
X86_IFUNC_IMPL_ADD_V3 (array, i, memmove,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memmove_avx_unaligned_erms_rtm)
/* By V3 we assume fast aligned copy. */
X86_IFUNC_IMPL_ADD_V2 (array, i, memmove,
CPU_FEATURE_USABLE (SSSE3),
__memmove_ssse3)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2 (SSSE3 is too
optimized around aligned copy to be better as general
purpose memmove). */
X86_IFUNC_IMPL_ADD_V2 (array, i, memmove, 1,
__memmove_sse2_unaligned)
X86_IFUNC_IMPL_ADD_V2 (array, i, memmove, 1,
__memmove_sse2_unaligned_erms))
/* Support sysdeps/x86_64/multiarch/memrchr.c. */
IFUNC_IMPL (i, name, memrchr,
X86_IFUNC_IMPL_ADD_V4 (array, i, memrchr,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (LZCNT)),
__memrchr_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, memrchr,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (LZCNT)),
__memrchr_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, memrchr,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (LZCNT)
&& CPU_FEATURE_USABLE (RTM)),
__memrchr_avx2_rtm)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, memrchr,
1,
__memrchr_sse2))
#ifdef SHARED
/* Support sysdeps/x86_64/multiarch/memset_chk.c. */
IFUNC_IMPL (i, name, __memset_chk,
IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
__memset_chk_erms)
X86_IFUNC_IMPL_ADD_V4 (array, i, __memset_chk,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__memset_chk_avx512_unaligned_erms)
X86_IFUNC_IMPL_ADD_V4 (array, i, __memset_chk,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__memset_chk_avx512_unaligned)
X86_IFUNC_IMPL_ADD_V4 (array, i, __memset_chk,
CPU_FEATURE_USABLE (AVX512F),
__memset_chk_avx512_no_vzeroupper)
X86_IFUNC_IMPL_ADD_V4 (array, i, __memset_chk,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__memset_chk_evex_unaligned)
X86_IFUNC_IMPL_ADD_V4 (array, i, __memset_chk,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__memset_chk_evex_unaligned_erms)
X86_IFUNC_IMPL_ADD_V3 (array, i, __memset_chk,
CPU_FEATURE_USABLE (AVX2),
__memset_chk_avx2_unaligned)
X86_IFUNC_IMPL_ADD_V3 (array, i, __memset_chk,
CPU_FEATURE_USABLE (AVX2),
__memset_chk_avx2_unaligned_erms)
X86_IFUNC_IMPL_ADD_V3 (array, i, __memset_chk,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__memset_chk_avx2_unaligned_rtm)
X86_IFUNC_IMPL_ADD_V3 (array, i, __memset_chk,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__memset_chk_avx2_unaligned_erms_rtm)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, __memset_chk, 1,
__memset_chk_sse2_unaligned)
X86_IFUNC_IMPL_ADD_V2 (array, i, __memset_chk, 1,
__memset_chk_sse2_unaligned_erms)
)
#endif
/* Support sysdeps/x86_64/multiarch/memset.c. */
IFUNC_IMPL (i, name, memset,
IFUNC_IMPL_ADD (array, i, memset, 1,
__memset_erms)
X86_IFUNC_IMPL_ADD_V4 (array, i, memset,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__memset_avx512_unaligned_erms)
X86_IFUNC_IMPL_ADD_V4 (array, i, memset,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__memset_avx512_unaligned)
X86_IFUNC_IMPL_ADD_V4 (array, i, memset,
CPU_FEATURE_USABLE (AVX512F),
__memset_avx512_no_vzeroupper)
X86_IFUNC_IMPL_ADD_V4 (array, i, memset,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__memset_evex_unaligned)
X86_IFUNC_IMPL_ADD_V4 (array, i, memset,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__memset_evex_unaligned_erms)
X86_IFUNC_IMPL_ADD_V3 (array, i, memset,
CPU_FEATURE_USABLE (AVX2),
__memset_avx2_unaligned)
X86_IFUNC_IMPL_ADD_V3 (array, i, memset,
CPU_FEATURE_USABLE (AVX2),
__memset_avx2_unaligned_erms)
X86_IFUNC_IMPL_ADD_V3 (array, i, memset,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__memset_avx2_unaligned_rtm)
X86_IFUNC_IMPL_ADD_V3 (array, i, memset,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__memset_avx2_unaligned_erms_rtm)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, memset, 1,
__memset_sse2_unaligned)
X86_IFUNC_IMPL_ADD_V2 (array, i, memset, 1,
__memset_sse2_unaligned_erms)
)
/* Support sysdeps/x86_64/multiarch/rawmemchr.c. */
IFUNC_IMPL (i, name, rawmemchr,
X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__rawmemchr_evex)
X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__rawmemchr_evex_rtm)
X86_IFUNC_IMPL_ADD_V3 (array, i, rawmemchr,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)),
__rawmemchr_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, rawmemchr,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__rawmemchr_avx2_rtm)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, rawmemchr,
1,
__rawmemchr_sse2))
/* Support sysdeps/x86_64/multiarch/strlen.c. */
IFUNC_IMPL (i, name, strlen,
X86_IFUNC_IMPL_ADD_V4 (array, i, strlen,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__strlen_evex)
X86_IFUNC_IMPL_ADD_V4 (array, i, strlen,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__strlen_evex512)
X86_IFUNC_IMPL_ADD_V3 (array, i, strlen,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)),
__strlen_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, strlen,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__strlen_avx2_rtm)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, strlen,
1,
__strlen_sse2))
/* Support sysdeps/x86_64/multiarch/strnlen.c. */
IFUNC_IMPL (i, name, strnlen,
X86_IFUNC_IMPL_ADD_V4 (array, i, strnlen,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__strnlen_evex)
X86_IFUNC_IMPL_ADD_V4 (array, i, strnlen,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__strnlen_evex512)
X86_IFUNC_IMPL_ADD_V3 (array, i, strnlen,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)),
__strnlen_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, strnlen,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__strnlen_avx2_rtm)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, strnlen,
1,
__strnlen_sse2))
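  /* Note on the X86_IFUNC_IMPL_ADD_V{1,2,3,4} wrappers used throughout
     this list: they register an implementation only when that
     implementation is actually built for the configured minimum ISA
     level, so variants made redundant by a higher baseline are dropped
     at build time.  A minimal sketch of the idea, assuming the
     MINIMUM_X86_ISA_LEVEL macro from glibc's isa-level machinery (this
     is illustrative, not the exact definitions glibc uses):

	#if MINIMUM_X86_ISA_LEVEL <= 2
	# define X86_IFUNC_IMPL_ADD_V2(...) IFUNC_IMPL_ADD (__VA_ARGS__)
	#else
	# define X86_IFUNC_IMPL_ADD_V2(...)
	#endif

     where the empty definition drops the entry entirely.  This is why
     the plain SSE2 implementations are wrapped as "ISA V2": there is no
     separate level-2 version, so the level-1 code must still be listed
     in an x86-64-v2 build.  */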
/* Support sysdeps/x86_64/multiarch/stpncpy.c. */
IFUNC_IMPL (i, name, stpncpy,
X86_IFUNC_IMPL_ADD_V4 (array, i, stpncpy,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
__stpncpy_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, stpncpy,
CPU_FEATURE_USABLE (AVX2),
__stpncpy_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, stpncpy,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__stpncpy_avx2_rtm)
/* ISA V2 wrapper for sse2_unaligned implementation because
the sse2_unaligned implementation is also used at ISA
level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, stpncpy,
1,
__stpncpy_sse2_unaligned))
/* Support sysdeps/x86_64/multiarch/stpcpy.c. */
IFUNC_IMPL (i, name, stpcpy,
X86_IFUNC_IMPL_ADD_V4 (array, i, stpcpy,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
__stpcpy_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, stpcpy,
CPU_FEATURE_USABLE (AVX2),
__stpcpy_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, stpcpy,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__stpcpy_avx2_rtm)
/* ISA V2 wrapper for sse2_unaligned implementation because
the sse2_unaligned implementation is also used at ISA
level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, stpcpy,
1,
__stpcpy_sse2_unaligned)
X86_IFUNC_IMPL_ADD_V1 (array, i, stpcpy,
1,
__stpcpy_sse2))
/* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
IFUNC_IMPL (i, name, strcasecmp,
X86_IFUNC_IMPL_ADD_V4 (array, i, strcasecmp,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__strcasecmp_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, strcasecmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)),
__strcasecmp_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, strcasecmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__strcasecmp_avx2_rtm)
X86_IFUNC_IMPL_ADD_V2 (array, i, strcasecmp,
CPU_FEATURE_USABLE (SSE4_2),
__strcasecmp_sse42)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, strcasecmp,
1,
__strcasecmp_sse2))
/* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
IFUNC_IMPL (i, name, strcasecmp_l,
X86_IFUNC_IMPL_ADD_V4 (array, i, strcasecmp,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__strcasecmp_l_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, strcasecmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)),
__strcasecmp_l_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, strcasecmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__strcasecmp_l_avx2_rtm)
X86_IFUNC_IMPL_ADD_V2 (array, i, strcasecmp_l,
CPU_FEATURE_USABLE (SSE4_2),
__strcasecmp_l_sse42)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, strcasecmp_l,
1,
__strcasecmp_l_sse2))
/* Support sysdeps/x86_64/multiarch/strcat.c. */
IFUNC_IMPL (i, name, strcat,
X86_IFUNC_IMPL_ADD_V4 (array, i, strcat,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
__strcat_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, strcat,
CPU_FEATURE_USABLE (AVX2),
__strcat_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, strcat,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__strcat_avx2_rtm)
/* ISA V2 wrapper for sse2_unaligned implementation because
the sse2_unaligned implementation is also used at ISA
level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, strcat,
1,
__strcat_sse2_unaligned)
X86_IFUNC_IMPL_ADD_V1 (array, i, strcat,
1,
__strcat_sse2))
/* Support sysdeps/x86_64/multiarch/strchr.c. */
IFUNC_IMPL (i, name, strchr,
X86_IFUNC_IMPL_ADD_V4 (array, i, strchr,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__strchr_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, strchr,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)),
__strchr_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, strchr,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__strchr_avx2_rtm)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, strchr,
1,
__strchr_sse2)
X86_IFUNC_IMPL_ADD_V1 (array, i, strchr,
1,
__strchr_sse2_no_bsf))
/* Support sysdeps/x86_64/multiarch/strchrnul.c. */
IFUNC_IMPL (i, name, strchrnul,
X86_IFUNC_IMPL_ADD_V4 (array, i, strchrnul,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__strchrnul_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, strchrnul,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)),
__strchrnul_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, strchrnul,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__strchrnul_avx2_rtm)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, strchrnul,
1,
__strchrnul_sse2))
/* Support sysdeps/x86_64/multiarch/strrchr.c. */
IFUNC_IMPL (i, name, strrchr,
X86_IFUNC_IMPL_ADD_V4 (array, i, strrchr,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI1)
&& CPU_FEATURE_USABLE (BMI2)),
__strrchr_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, strrchr,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI1)
&& CPU_FEATURE_USABLE (BMI2)),
__strrchr_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, strrchr,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI1)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__strrchr_avx2_rtm)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, strrchr,
1,
__strrchr_sse2))
/* Support sysdeps/x86_64/multiarch/strcmp.c. */
IFUNC_IMPL (i, name, strcmp,
X86_IFUNC_IMPL_ADD_V4 (array, i, strcmp,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__strcmp_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, strcmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)),
__strcmp_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, strcmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__strcmp_avx2_rtm)
X86_IFUNC_IMPL_ADD_V2 (array, i, strcmp,
CPU_FEATURE_USABLE (SSE4_2),
__strcmp_sse42)
/* ISA V2 wrapper for SSE2 implementations because the SSE2
implementations are also used at ISA level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, strcmp,
1,
__strcmp_sse2_unaligned)
X86_IFUNC_IMPL_ADD_V2 (array, i, strcmp,
1,
__strcmp_sse2))
/* Support sysdeps/x86_64/multiarch/strcpy.c. */
IFUNC_IMPL (i, name, strcpy,
X86_IFUNC_IMPL_ADD_V4 (array, i, strcpy,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
__strcpy_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, strcpy,
CPU_FEATURE_USABLE (AVX2),
__strcpy_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, strcpy,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__strcpy_avx2_rtm)
/* ISA V2 wrapper for sse2_unaligned implementation because
the sse2_unaligned implementation is also used at ISA
level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, strcpy,
1,
__strcpy_sse2_unaligned)
X86_IFUNC_IMPL_ADD_V1 (array, i, strcpy,
1,
__strcpy_sse2))
/* Support sysdeps/x86_64/multiarch/strcspn.c. */
IFUNC_IMPL (i, name, strcspn,
/* All implementations of strcspn are built at all ISA
levels. */
IFUNC_IMPL_ADD (array, i, strcspn, CPU_FEATURE_USABLE (SSE4_2),
__strcspn_sse42)
IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_generic))
/* Support sysdeps/x86_64/multiarch/strncase_l.c. */
IFUNC_IMPL (i, name, strncasecmp,
X86_IFUNC_IMPL_ADD_V4 (array, i, strncasecmp,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__strncasecmp_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, strncasecmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)),
__strncasecmp_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, strncasecmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__strncasecmp_avx2_rtm)
X86_IFUNC_IMPL_ADD_V2 (array, i, strncasecmp,
CPU_FEATURE_USABLE (SSE4_2),
__strncasecmp_sse42)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, strncasecmp,
1,
__strncasecmp_sse2))
/* Support sysdeps/x86_64/multiarch/strncase_l.c. */
IFUNC_IMPL (i, name, strncasecmp_l,
X86_IFUNC_IMPL_ADD_V4 (array, i, strncasecmp,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__strncasecmp_l_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, strncasecmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)),
__strncasecmp_l_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, strncasecmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__strncasecmp_l_avx2_rtm)
X86_IFUNC_IMPL_ADD_V2 (array, i, strncasecmp_l,
CPU_FEATURE_USABLE (SSE4_2),
__strncasecmp_l_sse42)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, strncasecmp_l,
1,
__strncasecmp_l_sse2))
/* Support sysdeps/x86_64/multiarch/strncat.c. */
IFUNC_IMPL (i, name, strncat,
X86_IFUNC_IMPL_ADD_V4 (array, i, strncat,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
__strncat_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, strncat,
CPU_FEATURE_USABLE (AVX2),
__strncat_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, strncat,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__strncat_avx2_rtm)
/* ISA V2 wrapper for sse2_unaligned implementation because
the sse2_unaligned implementation is also used at ISA
level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, strncat,
1,
__strncat_sse2_unaligned))
/* Support sysdeps/x86_64/multiarch/strncpy.c. */
IFUNC_IMPL (i, name, strncpy,
X86_IFUNC_IMPL_ADD_V4 (array, i, strncpy,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
__strncpy_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, strncpy,
CPU_FEATURE_USABLE (AVX2),
__strncpy_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, strncpy,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__strncpy_avx2_rtm)
/* ISA V2 wrapper for sse2_unaligned implementation because
the sse2_unaligned implementation is also used at ISA
level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, strncpy,
1,
__strncpy_sse2_unaligned))
/* Support sysdeps/x86_64/multiarch/strpbrk.c. */
IFUNC_IMPL (i, name, strpbrk,
/* All implementations of strpbrk are built at all ISA
levels. */
IFUNC_IMPL_ADD (array, i, strpbrk, CPU_FEATURE_USABLE (SSE4_2),
__strpbrk_sse42)
IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_generic))
/* Support sysdeps/x86_64/multiarch/strspn.c. */
IFUNC_IMPL (i, name, strspn,
/* All implementations of strspn are built at all ISA
levels. */
IFUNC_IMPL_ADD (array, i, strspn, CPU_FEATURE_USABLE (SSE4_2),
__strspn_sse42)
IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_generic))
/* Support sysdeps/x86_64/multiarch/strstr.c. */
IFUNC_IMPL (i, name, strstr,
IFUNC_IMPL_ADD (array, i, strstr,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (AVX512DQ)
&& CPU_FEATURE_USABLE (BMI2)),
__strstr_avx512)
IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_generic))
/* Support sysdeps/x86_64/multiarch/wcschr.c. */
IFUNC_IMPL (i, name, wcschr,
X86_IFUNC_IMPL_ADD_V4 (array, i, wcschr,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wcschr_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, wcschr,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)),
__wcschr_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, wcschr,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__wcschr_avx2_rtm)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, wcschr,
1,
__wcschr_sse2))
/* Support sysdeps/x86_64/multiarch/wcsrchr.c. */
IFUNC_IMPL (i, name, wcsrchr,
X86_IFUNC_IMPL_ADD_V4 (array, i, wcsrchr,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI1)
&& CPU_FEATURE_USABLE (BMI2)),
__wcsrchr_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, wcsrchr,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI1)
&& CPU_FEATURE_USABLE (BMI2)),
__wcsrchr_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, wcsrchr,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI1)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__wcsrchr_avx2_rtm)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, wcsrchr,
1,
__wcsrchr_sse2))
/* Support sysdeps/x86_64/multiarch/wcscmp.c. */
IFUNC_IMPL (i, name, wcscmp,
X86_IFUNC_IMPL_ADD_V4 (array, i, wcscmp,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wcscmp_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, wcscmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)),
__wcscmp_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, wcscmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__wcscmp_avx2_rtm)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, wcscmp,
1,
__wcscmp_sse2))
/* Support sysdeps/x86_64/multiarch/wcsncmp.c. */
IFUNC_IMPL (i, name, wcsncmp,
X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncmp,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wcsncmp_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)),
__wcsncmp_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__wcsncmp_avx2_rtm)
/* ISA V2 wrapper for GENERIC implementation because the
GENERIC implementation is also used at ISA level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, wcsncmp,
1,
__wcsncmp_generic))
/* Support sysdeps/x86_64/multiarch/wcscpy.c. */
IFUNC_IMPL (i, name, wcscpy,
/* ISA V4 wrapper for SSSE3 implementation because
the SSSE3 implementation is also used at ISA
level 3/4. */
X86_IFUNC_IMPL_ADD_V4 (array, i, wcscpy,
CPU_FEATURE_USABLE (SSSE3),
__wcscpy_ssse3)
X86_IFUNC_IMPL_ADD_V1 (array, i, wcscpy,
1,
__wcscpy_generic))
/* Support sysdeps/x86_64/multiarch/wcslen.c. */
IFUNC_IMPL (i, name, wcslen,
X86_IFUNC_IMPL_ADD_V4 (array, i, wcslen,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wcslen_evex)
X86_IFUNC_IMPL_ADD_V4 (array, i, wcslen,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wcslen_evex512)
X86_IFUNC_IMPL_ADD_V3 (array, i, wcslen,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)),
__wcslen_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, wcslen,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__wcslen_avx2_rtm)
X86_IFUNC_IMPL_ADD_V2 (array, i, wcslen,
CPU_FEATURE_USABLE (SSE4_1),
__wcslen_sse4_1)
X86_IFUNC_IMPL_ADD_V1 (array, i, wcslen,
1,
__wcslen_sse2))
/* Support sysdeps/x86_64/multiarch/wcsnlen.c. */
IFUNC_IMPL (i, name, wcsnlen,
X86_IFUNC_IMPL_ADD_V4 (array, i, wcsnlen,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wcsnlen_evex)
X86_IFUNC_IMPL_ADD_V4 (array, i, wcsnlen,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wcsnlen_evex512)
X86_IFUNC_IMPL_ADD_V3 (array, i, wcsnlen,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)),
__wcsnlen_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, wcsnlen,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__wcsnlen_avx2_rtm)
X86_IFUNC_IMPL_ADD_V2 (array, i, wcsnlen,
CPU_FEATURE_USABLE (SSE4_1),
__wcsnlen_sse4_1)
X86_IFUNC_IMPL_ADD_V1 (array, i, wcsnlen,
1,
__wcsnlen_generic))
/* Support sysdeps/x86_64/multiarch/wmemchr.c. */
IFUNC_IMPL (i, name, wmemchr,
X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wmemchr_evex)
X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wmemchr_evex_rtm)
X86_IFUNC_IMPL_ADD_V3 (array, i, wmemchr,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)),
__wmemchr_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, wmemchr,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__wmemchr_avx2_rtm)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, wmemchr,
1,
__wmemchr_sse2))
/* Support sysdeps/x86_64/multiarch/wmemcmp.c. */
IFUNC_IMPL (i, name, wmemcmp,
X86_IFUNC_IMPL_ADD_V4 (array, i, wmemcmp,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)),
__wmemcmp_evex_movbe)
X86_IFUNC_IMPL_ADD_V3 (array, i, wmemcmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)),
__wmemcmp_avx2_movbe)
X86_IFUNC_IMPL_ADD_V3 (array, i, wmemcmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)
&& CPU_FEATURE_USABLE (RTM)),
__wmemcmp_avx2_movbe_rtm)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, wmemcmp,
1,
__wmemcmp_sse2))
/* Support sysdeps/x86_64/multiarch/wmemset.c. */
IFUNC_IMPL (i, name, wmemset,
X86_IFUNC_IMPL_ADD_V4 (array, i, wmemset,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wmemset_evex_unaligned)
X86_IFUNC_IMPL_ADD_V4 (array, i, wmemset,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wmemset_avx512_unaligned)
X86_IFUNC_IMPL_ADD_V3 (array, i, wmemset,
CPU_FEATURE_USABLE (AVX2),
__wmemset_avx2_unaligned)
X86_IFUNC_IMPL_ADD_V3 (array, i, wmemset,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__wmemset_avx2_unaligned_rtm)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, wmemset, 1,
__wmemset_sse2_unaligned))
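  /* Usage sketch: the table built here is enumerated by glibc's own
     string tests and benchtests through __libc_ifunc_impl_list (a
     glibc-internal interface declared in <ifunc-impl-list.h>).  A rough
     illustration, not part of this file's logic (buffer size is
     arbitrary and puts needs <stdio.h>):

	struct libc_ifunc_impl impls[64];
	size_t n = __libc_ifunc_impl_list ("wmemset", impls, 64);
	for (size_t j = 0; j < n; j++)
	  if (impls[j].usable)
	    puts (impls[j].name);
  */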
#ifdef SHARED
/* Support sysdeps/x86_64/multiarch/memcpy_chk.c. */
IFUNC_IMPL (i, name, __memcpy_chk,
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
__memcpy_chk_erms)
X86_IFUNC_IMPL_ADD_V4 (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (AVX512F),
__memcpy_chk_avx512_no_vzeroupper)
X86_IFUNC_IMPL_ADD_V4 (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (AVX512VL),
__memcpy_chk_avx512_unaligned)
X86_IFUNC_IMPL_ADD_V4 (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (AVX512VL),
__memcpy_chk_avx512_unaligned_erms)
X86_IFUNC_IMPL_ADD_V4 (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (AVX512VL),
__memcpy_chk_evex_unaligned)
X86_IFUNC_IMPL_ADD_V4 (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (AVX512VL),
__memcpy_chk_evex_unaligned_erms)
X86_IFUNC_IMPL_ADD_V3 (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (AVX),
__memcpy_chk_avx_unaligned)
X86_IFUNC_IMPL_ADD_V3 (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (AVX),
__memcpy_chk_avx_unaligned_erms)
X86_IFUNC_IMPL_ADD_V3 (array, i, __memcpy_chk,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memcpy_chk_avx_unaligned_rtm)
X86_IFUNC_IMPL_ADD_V3 (array, i, __memcpy_chk,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memcpy_chk_avx_unaligned_erms_rtm)
/* By V3 we assume fast aligned copy. */
X86_IFUNC_IMPL_ADD_V2 (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (SSSE3),
__memcpy_chk_ssse3)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2 (SSSE3 is too
optimized around aligned copy to be better as general
purpose memmove). */
X86_IFUNC_IMPL_ADD_V2 (array, i, __memcpy_chk, 1,
__memcpy_chk_sse2_unaligned)
X86_IFUNC_IMPL_ADD_V2 (array, i, __memcpy_chk, 1,
__memcpy_chk_sse2_unaligned_erms))
#endif
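  /* The __memcpy_chk entries above (SHARED builds only) back the
     _FORTIFY_SOURCE wrapper: with fortification enabled, a call such as
     memcpy (dst, src, n) is typically emitted by the compiler as

	__memcpy_chk (dst, src, n, __builtin_object_size (dst, 0));

     so the checked entry point goes through the same IFUNC selection as
     plain memcpy.  (Illustrative; the exact rewrite depends on the
     compiler and headers.)  */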
/* Support sysdeps/x86_64/multiarch/memcpy.c. */
IFUNC_IMPL (i, name, memcpy,
IFUNC_IMPL_ADD (array, i, memcpy, 1,
__memcpy_erms)
X86_IFUNC_IMPL_ADD_V4 (array, i, memcpy,
CPU_FEATURE_USABLE (AVX512F),
__memcpy_avx512_no_vzeroupper)
X86_IFUNC_IMPL_ADD_V4 (array, i, memcpy,
CPU_FEATURE_USABLE (AVX512VL),
__memcpy_avx512_unaligned)
X86_IFUNC_IMPL_ADD_V4 (array, i, memcpy,
CPU_FEATURE_USABLE (AVX512VL),
__memcpy_avx512_unaligned_erms)
X86_IFUNC_IMPL_ADD_V4 (array, i, memcpy,
CPU_FEATURE_USABLE (AVX512VL),
__memcpy_evex_unaligned)
X86_IFUNC_IMPL_ADD_V4 (array, i, memcpy,
CPU_FEATURE_USABLE (AVX512VL),
__memcpy_evex_unaligned_erms)
X86_IFUNC_IMPL_ADD_V3 (array, i, memcpy,
CPU_FEATURE_USABLE (AVX),
__memcpy_avx_unaligned)
X86_IFUNC_IMPL_ADD_V3 (array, i, memcpy,
CPU_FEATURE_USABLE (AVX),
__memcpy_avx_unaligned_erms)
X86_IFUNC_IMPL_ADD_V3 (array, i, memcpy,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memcpy_avx_unaligned_rtm)
X86_IFUNC_IMPL_ADD_V3 (array, i, memcpy,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memcpy_avx_unaligned_erms_rtm)
/* By V3 we assume fast aligned copy. */
X86_IFUNC_IMPL_ADD_V2 (array, i, memcpy,
CPU_FEATURE_USABLE (SSSE3),
__memcpy_ssse3)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2 (SSSE3 is too
optimized around aligned copy to be better as general
purpose memmove). */
X86_IFUNC_IMPL_ADD_V2 (array, i, memcpy, 1,
__memcpy_sse2_unaligned)
X86_IFUNC_IMPL_ADD_V2 (array, i, memcpy, 1,
__memcpy_sse2_unaligned_erms))
#ifdef SHARED
/* Support sysdeps/x86_64/multiarch/mempcpy_chk.c. */
IFUNC_IMPL (i, name, __mempcpy_chk,
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
__mempcpy_chk_erms)
X86_IFUNC_IMPL_ADD_V4 (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (AVX512F),
__mempcpy_chk_avx512_no_vzeroupper)
X86_IFUNC_IMPL_ADD_V4 (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_chk_avx512_unaligned)
X86_IFUNC_IMPL_ADD_V4 (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_chk_avx512_unaligned_erms)
X86_IFUNC_IMPL_ADD_V4 (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_chk_evex_unaligned)
X86_IFUNC_IMPL_ADD_V4 (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_chk_evex_unaligned_erms)
X86_IFUNC_IMPL_ADD_V3 (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (AVX),
__mempcpy_chk_avx_unaligned)
X86_IFUNC_IMPL_ADD_V3 (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (AVX),
__mempcpy_chk_avx_unaligned_erms)
X86_IFUNC_IMPL_ADD_V3 (array, i, __mempcpy_chk,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__mempcpy_chk_avx_unaligned_rtm)
X86_IFUNC_IMPL_ADD_V3 (array, i, __mempcpy_chk,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__mempcpy_chk_avx_unaligned_erms_rtm)
/* By V3 we assume fast aligned copy. */
X86_IFUNC_IMPL_ADD_V2 (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (SSSE3),
__mempcpy_chk_ssse3)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2 (SSSE3 is too
optimized around aligned copy to be better as general
purpose memmove). */
X86_IFUNC_IMPL_ADD_V2 (array, i, __mempcpy_chk, 1,
__mempcpy_chk_sse2_unaligned)
X86_IFUNC_IMPL_ADD_V2 (array, i, __mempcpy_chk, 1,
__mempcpy_chk_sse2_unaligned_erms))
#endif
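
  /* Note that the *_chk fortification lists appear only under #ifdef
     SHARED: in the static library the __*_chk entry points are provided
     by the corresponding *_chk-nonshared.S files rather than through an
     IFUNC selector, presumably because IFUNCs cannot be used for
     functions that may be called before IFUNC resolution has been set
     up in libc.a.  */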
/* Support sysdeps/x86_64/multiarch/mempcpy.c. */
IFUNC_IMPL (i, name, mempcpy,
IFUNC_IMPL_ADD (array, i, mempcpy, 1,
__mempcpy_erms)
X86_IFUNC_IMPL_ADD_V4 (array, i, mempcpy,
CPU_FEATURE_USABLE (AVX512F),
__mempcpy_avx512_no_vzeroupper)
X86_IFUNC_IMPL_ADD_V4 (array, i, mempcpy,
CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_avx512_unaligned)
X86_IFUNC_IMPL_ADD_V4 (array, i, mempcpy,
CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_avx512_unaligned_erms)
X86_IFUNC_IMPL_ADD_V4 (array, i, mempcpy,
CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_evex_unaligned)
X86_IFUNC_IMPL_ADD_V4 (array, i, mempcpy,
CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_evex_unaligned_erms)
X86_IFUNC_IMPL_ADD_V3 (array, i, mempcpy,
CPU_FEATURE_USABLE (AVX),
__mempcpy_avx_unaligned)
X86_IFUNC_IMPL_ADD_V3 (array, i, mempcpy,
CPU_FEATURE_USABLE (AVX),
__mempcpy_avx_unaligned_erms)
X86_IFUNC_IMPL_ADD_V3 (array, i, mempcpy,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__mempcpy_avx_unaligned_rtm)
X86_IFUNC_IMPL_ADD_V3 (array, i, mempcpy,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__mempcpy_avx_unaligned_erms_rtm)
/* By V3 we assume fast aligned copy. */
X86_IFUNC_IMPL_ADD_V2 (array, i, mempcpy,
CPU_FEATURE_USABLE (SSSE3),
__mempcpy_ssse3)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2 (SSSE3 is too
optimized around aligned copy to be better as general
purpose memmove). */
X86_IFUNC_IMPL_ADD_V2 (array, i, mempcpy, 1,
__mempcpy_sse2_unaligned)
X86_IFUNC_IMPL_ADD_V2 (array, i, mempcpy, 1,
__mempcpy_sse2_unaligned_erms))
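
  /* The memcpy, mempcpy and *_chk lists above intentionally mirror one
     another: the listed symbols are expected to be aliases of the
     corresponding memmove-family implementations (built from
     memmove-vec-unaligned-erms.S and the related memmove-*.S files), so
     a new variant normally has to be added to every one of these lists
     at once.  */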
/* Support sysdeps/x86_64/multiarch/strncmp.c. */
IFUNC_IMPL (i, name, strncmp,
X86_IFUNC_IMPL_ADD_V4 (array, i, strncmp,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__strncmp_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, strncmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)),
__strncmp_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, strncmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__strncmp_avx2_rtm)
X86_IFUNC_IMPL_ADD_V2 (array, i, strncmp,
CPU_FEATURE_USABLE (SSE4_2),
__strncmp_sse42)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, strncmp,
1,
__strncmp_sse2))
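
  /* Here, as in the copy routines above, a feature predicate of plain 1
     means the implementation is unconditionally usable: SSE2 is part of
     the x86-64 baseline, so __strncmp_sse2 only needs the V2 wrapper so
     that it is still emitted for ISA level 2 builds.  */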
#ifdef SHARED
/* Support sysdeps/x86_64/multiarch/wmemset_chk.c. */
IFUNC_IMPL (i, name, __wmemset_chk,
X86_IFUNC_IMPL_ADD_V4 (array, i, __wmemset_chk,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wmemset_chk_evex_unaligned)
X86_IFUNC_IMPL_ADD_V4 (array, i, __wmemset_chk,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wmemset_chk_avx512_unaligned)
X86_IFUNC_IMPL_ADD_V3 (array, i, __wmemset_chk,
CPU_FEATURE_USABLE (AVX2),
__wmemset_chk_avx2_unaligned)
X86_IFUNC_IMPL_ADD_V3 (array, i, __wmemset_chk,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__wmemset_chk_avx2_unaligned_rtm)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
implementation is also used at ISA level 2. */
X86_IFUNC_IMPL_ADD_V2 (array, i, __wmemset_chk, 1,
__wmemset_chk_sse2_unaligned))
#endif
return 0;
}