Improve 64bit memcpy/memmove for Atom, Core 2 and Core i7

This patch includes optimized 64bit memcpy/memmove for Atom, Core 2 and
Core i7.  It improves memcpy by up to 3X on Atom, up to 4X on Core 2 and
up to 1X on Core i7.  It also improves memmove by up to 3X on Atom, up to
4X on Core 2 and up to 2X on Core i7.
This commit is contained in:
H.J. Lu 2010-06-30 08:26:11 -07:00 committed by Ulrich Drepper
parent d85f8ff667
commit 6fb8cbcb58
21 changed files with 6681 additions and 10 deletions

View file

@ -1,3 +1,35 @@
2010-06-25 H.J. Lu <hongjiu.lu@intel.com>
* debug/memmove_chk.c (__memmove_chk): Renamed to ...
(MEMMOVE_CHK): ...this. Default to __memmove_chk.
* string/memmove.c (memmove): Renamed to ...
(MEMMOVE): ...this. Default to memmove.
* sysdeps/x86_64/memcpy.S: Use ENTRY_CHK and END_CHK.
* sysdeps/x86_64/sysdep.h (ENTRY_CHK): Define.
(END_CHK): Define.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
memcpy-ssse3 mempcpy-ssse3 memmove-ssse3 memcpy-ssse3-back
mempcpy-ssse3-back memmove-ssse3-back.
* sysdeps/x86_64/multiarch/bcopy.S: New file.
* sysdeps/x86_64/multiarch/memcpy-ssse3-back.S: New file.
* sysdeps/x86_64/multiarch/memcpy-ssse3.S: New file.
* sysdeps/x86_64/multiarch/memcpy.S: New file.
* sysdeps/x86_64/multiarch/memcpy_chk.S: New file.
* sysdeps/x86_64/multiarch/memmove-ssse3-back.S: New file.
* sysdeps/x86_64/multiarch/memmove-ssse3.S: New file.
* sysdeps/x86_64/multiarch/memmove.c: New file.
* sysdeps/x86_64/multiarch/memmove_chk.c: New file.
* sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S: New file.
* sysdeps/x86_64/multiarch/mempcpy-ssse3.S: New file.
* sysdeps/x86_64/multiarch/mempcpy.S: New file.
* sysdeps/x86_64/multiarch/mempcpy_chk.S: New file.
* sysdeps/x86_64/multiarch/init-arch.h (bit_Fast_Copy_Backward):
Define.
(index_Fast_Copy_Backward): Define.
(HAS_ARCH_FEATURE): Define.
(HAS_FAST_REP_STRING): Define.
(HAS_FAST_COPY_BACKWARD): Define.
2010-06-21 Andreas Schwab <schwab@redhat.com>
* sysdeps/unix/sysv/linux/getlogin_r.c (__getlogin_r_loginuid):

View file

@ -23,8 +23,12 @@
#include <memcopy.h>
#include <pagecopy.h>
/* Name under which the checking memmove below is defined.  A file that
   includes this one may define MEMMOVE_CHK first to build the same code
   under a different symbol name; otherwise the public name is used.  */
#ifndef MEMMOVE_CHK
# define MEMMOVE_CHK __memmove_chk
#endif
void *
__memmove_chk (dest, src, len, destlen)
MEMMOVE_CHK (dest, src, len, destlen)
void *dest;
const void *src;
size_t len;

View file

@ -37,9 +37,12 @@
#define rettype void *
#endif
/* Name under which the function below is defined.  A file that includes
   this one may define MEMMOVE first to build the same code under a
   different symbol name; otherwise the public name is used.  */
#ifndef MEMMOVE
#define MEMMOVE memmove
#endif
rettype
memmove (a1, a2, len)
MEMMOVE (a1, a2, len)
a1const void *a1;
a2const void *a2;
size_t len;

View file

@ -40,12 +40,12 @@
.text
#if defined PIC && !defined NOT_IN_libc
ENTRY (__memcpy_chk)
ENTRY_CHK (__memcpy_chk)
cmpq %rdx, %rcx
jb HIDDEN_JUMPTARGET (__chk_fail)
END (__memcpy_chk)
END_CHK (__memcpy_chk)
#endif
ENTRY(memcpy) /* (void *, const void*, size_t) */

View file

@ -5,7 +5,9 @@ endif
ifeq ($(subdir),string)
sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
strend-sse4 memcmp-sse4
strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
memmove-ssse3-back
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
CFLAGS-strcspn-c.c += -msse4

View file

@ -0,0 +1,7 @@
#include <sysdep.h>
.text
/* void bcopy (const void *src, void *dest, size_t n)

   bcopy takes its two pointer arguments in the opposite order from
   memmove (dest, src, n).  Swap the first two argument registers and
   tail-jump to memmove: the third argument register is not touched,
   and because this is a jmp rather than a call, memmove returns
   straight to bcopy's caller.  */
ENTRY(bcopy)
xchg %rdi, %rsi
jmp HIDDEN_BUILTIN_JUMPTARGET(memmove)
END(bcopy)

View file

@ -78,10 +78,13 @@ __init_cpu_features (void)
case 0x25:
case 0x2e:
case 0x2f:
/* Rep string instructions are fast on Intel Core i3, i5
and i7. */
/* Rep string instructions and copy backward are fast on
Intel Core i3, i5 and i7. */
#if index_Fast_Rep_String != index_Fast_Copy_Backward
# error index_Fast_Rep_String != index_Fast_Copy_Backward
#endif
__cpu_features.feature[index_Fast_Rep_String]
|= bit_Fast_Rep_String;
|= bit_Fast_Rep_String | bit_Fast_Copy_Backward;
break;
}
}

View file

@ -17,6 +17,7 @@
02111-1307 USA. */
#define bit_Fast_Rep_String (1 << 0)
#define bit_Fast_Copy_Backward (1 << 1)
#ifdef __ASSEMBLER__
@ -32,7 +33,8 @@
# define index_SSE4_1 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
# define index_SSE4_2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
#define index_Fast_Rep_String FEATURE_INDEX_1*FEATURE_SIZE
# define index_Fast_Rep_String FEATURE_INDEX_1*FEATURE_SIZE
# define index_Fast_Copy_Backward FEATURE_INDEX_1*FEATURE_SIZE
#else /* __ASSEMBLER__ */
@ -102,6 +104,16 @@ extern const struct cpu_features *__get_cpu_features (void)
# define HAS_SSE4_2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 20)
# define HAS_FMA HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 12)
# define index_Fast_Rep_String FEATURE_INDEX_1
# define index_Fast_Rep_String FEATURE_INDEX_1
# define index_Fast_Copy_Backward FEATURE_INDEX_1
#define HAS_ARCH_FEATURE(idx, bit) \
((__get_cpu_features ()->feature[idx] & (bit)) != 0)
#define HAS_FAST_REP_STRING \
HAS_ARCH_FEATURE (index_Fast_Rep_String, bit_Fast_Rep_String)
#define HAS_FAST_COPY_BACKWARD \
HAS_ARCH_FEATURE (index_Fast_Copy_Backward, bit_Fast_Copy_Backward)
#endif /* __ASSEMBLER__ */

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,73 @@
/* Multiple versions of memcpy
Copyright (C) 2010 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
#include <sysdep.h>
#include <init-arch.h>
/* Define multiple versions only for the definition in libc and for
the DSO. In static binaries we need memcpy before the initialization
has happened. */
#if defined SHARED && !defined NOT_IN_libc
.text
/* IFUNC resolver for memcpy.  It copies nothing; it returns in %rax the
   address of the implementation that the memcpy symbol resolves to:
     __memcpy_sse2        when the SSSE3 bit is clear,
     __memcpy_ssse3       when the SSSE3 bit is set,
     __memcpy_ssse3_back  when SSSE3 and Fast_Copy_Backward are both set.  */
ENTRY(memcpy)
.type memcpy, @gnu_indirect_function
/* Call __init_cpu_features if the field of __cpu_features at
   KIND_OFFSET is still zero (presumably "not initialized yet" --
   confirm against the cpu_features definition).  */
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
jne 1f
call __init_cpu_features
/* Start from the baseline version and upgrade while feature bits allow.  */
1: leaq __memcpy_sse2(%rip), %rax
testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
jz 2f
leaq __memcpy_ssse3(%rip), %rax
/* Fast_Copy_Backward is only consulted once SSSE3 is known to be set.  */
testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
jz 2f
leaq __memcpy_ssse3_back(%rip), %rax
2: ret
END(memcpy)
/* Redirect the entry/exit macros so that the ../memcpy.S included at
   the end of this file emits its code as __memcpy_sse2 rather than as
   memcpy.  The `name' argument is ignored on purpose.  No .globl is
   emitted for __memcpy_sse2, unlike for the _chk entry point below.  */
# undef ENTRY
# define ENTRY(name) \
.type __memcpy_sse2, @function; \
.p2align 4; \
__memcpy_sse2: cfi_startproc; \
CALL_MCOUNT
# undef END
# define END(name) \
cfi_endproc; .size __memcpy_sse2, .-__memcpy_sse2
/* Same renaming for the checking entry point.  __memcpy_chk_sse2 is
   made global because the __memcpy_chk resolver, which lives in
   memcpy_chk.S, takes its address.  */
# undef ENTRY_CHK
# define ENTRY_CHK(name) \
.type __memcpy_chk_sse2, @function; \
.globl __memcpy_chk_sse2; \
.p2align 4; \
__memcpy_chk_sse2: cfi_startproc; \
CALL_MCOUNT
# undef END_CHK
# define END_CHK(name) \
cfi_endproc; .size __memcpy_chk_sse2, .-__memcpy_chk_sse2
# undef libc_hidden_builtin_def
/* It doesn't make sense to send libc-internal memcpy calls through a PLT.
The speedup we get from using SSSE3 instruction is likely eaten away
by the indirect call in the PLT. */
/* Hence the libc-internal alias __GI_memcpy is bound directly to the
   baseline __memcpy_sse2 instead of to the IFUNC symbol.  */
# define libc_hidden_builtin_def(name) \
.globl __GI_memcpy; __GI_memcpy = __memcpy_sse2
#endif
#include "../memcpy.S"

View file

@ -0,0 +1,47 @@
/* Multiple versions of __memcpy_chk
Copyright (C) 2010 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
#include <sysdep.h>
#include <init-arch.h>
/* Define multiple versions only for the definition in lib and for
DSO. There are no multiarch memcpy functions for static binaries.
*/
#ifndef NOT_IN_libc
# ifdef SHARED
.text
/* IFUNC resolver for __memcpy_chk.  It returns in %rax the address of
   the implementation that the __memcpy_chk symbol resolves to:
     __memcpy_chk_sse2        when the SSSE3 bit is clear,
     __memcpy_chk_ssse3       when the SSSE3 bit is set,
     __memcpy_chk_ssse3_back  when SSSE3 and Fast_Copy_Backward are both set.  */
ENTRY(__memcpy_chk)
.type __memcpy_chk, @gnu_indirect_function
/* Call __init_cpu_features if the field of __cpu_features at
   KIND_OFFSET is still zero (presumably "not initialized yet" --
   confirm against the cpu_features definition).  */
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
jne 1f
call __init_cpu_features
/* Start from the baseline version and upgrade while feature bits allow.  */
1: leaq __memcpy_chk_sse2(%rip), %rax
testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
jz 2f
leaq __memcpy_chk_ssse3(%rip), %rax
/* Fast_Copy_Backward is only consulted once SSSE3 is known to be set.  */
testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
jz 2f
leaq __memcpy_chk_ssse3_back(%rip), %rax
2: ret
END(__memcpy_chk)
# else
# include "../memcpy_chk.S"
# endif
#endif

View file

@ -0,0 +1,4 @@
/* Build memcpy-ssse3-back.S as the memmove variant: pick the symbol
   names for the plain and the _chk entry points, then include the
   shared source.  USE_AS_MEMMOVE is presumably tested inside that file
   to select memmove behaviour -- the file is not shown here, confirm
   there.  */
#define USE_AS_MEMMOVE
#define MEMCPY __memmove_ssse3_back
#define MEMCPY_CHK __memmove_chk_ssse3_back
#include "memcpy-ssse3-back.S"

View file

@ -0,0 +1,4 @@
/* Build memcpy-ssse3.S as the memmove variant: pick the symbol names
   for the plain and the _chk entry points, then include the shared
   source.  USE_AS_MEMMOVE is presumably tested inside that file to
   select memmove behaviour -- the file is not shown here, confirm
   there.  */
#define USE_AS_MEMMOVE
#define MEMCPY __memmove_ssse3
#define MEMCPY_CHK __memmove_chk_ssse3
#include "memcpy-ssse3.S"

View file

@ -0,0 +1,24 @@
/* Multiple versions of memmove.  Inside libc the generic C
   implementation from string/memmove.c is compiled under the name
   __memmove_sse2, and the public memmove symbol becomes an IFUNC that
   picks between it and the two SSSE3 versions.  Outside libc
   (NOT_IN_libc defined) the generic code is compiled unchanged as
   memmove.  */
#ifndef NOT_IN_libc
#include "init-arch.h"
/* string/memmove.c defines its function under the name MEMMOVE.  */
#define MEMMOVE __memmove_sse2
#ifdef SHARED
/* Replace the hidden-alias macro used by the included file.
   NOTE(review): __hidden_ver1 presumably binds the libc-internal alias
   __GI_memmove to __memmove_sse2 so that internal calls do not go
   through the IFUNC -- confirm against its definition in libc-symbols.h.  */
# undef libc_hidden_builtin_def
# define libc_hidden_builtin_def(name) \
__hidden_ver1 (__memmove_sse2, __GI_memmove, __memmove_sse2);
#endif
#endif
#include "string/memmove.c"
#ifndef NOT_IN_libc
/* All three candidates share the prototype of the C version just compiled.  */
extern __typeof (__memmove_sse2) __memmove_sse2 attribute_hidden;
extern __typeof (__memmove_sse2) __memmove_ssse3 attribute_hidden;
extern __typeof (__memmove_sse2) __memmove_ssse3_back attribute_hidden;
/* Selection: without SSSE3 use __memmove_sse2; with SSSE3 use
   __memmove_ssse3_back if HAS_FAST_COPY_BACKWARD holds, else
   __memmove_ssse3.  */
libc_ifunc (memmove,
HAS_SSSE3
? (HAS_FAST_COPY_BACKWARD
? __memmove_ssse3_back : __memmove_ssse3)
: __memmove_sse2);
#endif

View file

@ -0,0 +1,15 @@
/* Multiple versions of __memmove_chk.  The generic C implementation
   from debug/memmove_chk.c is compiled under the name
   __memmove_chk_sse2, and the public __memmove_chk symbol becomes an
   IFUNC that picks between it and the two SSSE3 versions.  */
#include "init-arch.h"
/* debug/memmove_chk.c defines its function under the name MEMMOVE_CHK.  */
#define MEMMOVE_CHK __memmove_chk_sse2
#include "debug/memmove_chk.c"
/* All three candidates share the prototype of the C version just compiled.  */
extern __typeof (__memmove_chk_sse2) __memmove_chk_sse2 attribute_hidden;
extern __typeof (__memmove_chk_sse2) __memmove_chk_ssse3 attribute_hidden;
extern __typeof (__memmove_chk_sse2) __memmove_chk_ssse3_back attribute_hidden;
/* Selection: without SSSE3 use __memmove_chk_sse2; with SSSE3 use
   __memmove_chk_ssse3_back if HAS_FAST_COPY_BACKWARD holds, else
   __memmove_chk_ssse3.  */
libc_ifunc (__memmove_chk,
HAS_SSSE3
? (HAS_FAST_COPY_BACKWARD
? __memmove_chk_ssse3_back : __memmove_chk_ssse3)
: __memmove_chk_sse2);

View file

@ -0,0 +1,4 @@
/* Build memcpy-ssse3-back.S as the mempcpy variant: pick the symbol
   names for the plain and the _chk entry points, then include the
   shared source.  USE_AS_MEMPCPY is presumably tested inside that file
   to select mempcpy behaviour -- the file is not shown here, confirm
   there.  */
#define USE_AS_MEMPCPY
#define MEMCPY __mempcpy_ssse3_back
#define MEMCPY_CHK __mempcpy_chk_ssse3_back
#include "memcpy-ssse3-back.S"

View file

@ -0,0 +1,4 @@
/* Build memcpy-ssse3.S as the mempcpy variant: pick the symbol names
   for the plain and the _chk entry points, then include the shared
   source.  USE_AS_MEMPCPY is presumably tested inside that file to
   select mempcpy behaviour -- the file is not shown here, confirm
   there.  */
#define USE_AS_MEMPCPY
#define MEMCPY __mempcpy_ssse3
#define MEMCPY_CHK __mempcpy_chk_ssse3
#include "memcpy-ssse3.S"

View file

@ -0,0 +1,75 @@
/* Multiple versions of mempcpy
Copyright (C) 2010 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
#include <sysdep.h>
#include <init-arch.h>
/* Define multiple versions only for the definition in libc and for
the DSO. In static binaries we need mempcpy before the initialization
has happened. */
#if defined SHARED && !defined NOT_IN_libc
/* IFUNC resolver for __mempcpy.  It copies nothing; it returns in %rax
   the address of the implementation that the __mempcpy symbol resolves
   to:
     __mempcpy_sse2        when the SSSE3 bit is clear,
     __mempcpy_ssse3       when the SSSE3 bit is set,
     __mempcpy_ssse3_back  when SSSE3 and Fast_Copy_Backward are both set.

   The section is selected explicitly, as the memcpy, __memcpy_chk and
   __mempcpy_chk resolvers do, instead of relying on whatever section
   happens to be current after the headers.  */
.text
ENTRY(__mempcpy)
.type __mempcpy, @gnu_indirect_function
/* Call __init_cpu_features if the field of __cpu_features at
   KIND_OFFSET is still zero (presumably "not initialized yet" --
   confirm against the cpu_features definition).  */
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
jne 1f
call __init_cpu_features
/* Start from the baseline version and upgrade while feature bits allow.  */
1: leaq __mempcpy_sse2(%rip), %rax
testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
jz 2f
leaq __mempcpy_ssse3(%rip), %rax
/* Fast_Copy_Backward is only consulted once SSSE3 is known to be set.  */
testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
jz 2f
leaq __mempcpy_ssse3_back(%rip), %rax
2: ret
END(__mempcpy)
/* Redirect the entry/exit macros so that the ../mempcpy.S included at
   the end of this file emits its code as __mempcpy_sse2 rather than
   under its usual name.  The `name' argument is ignored on purpose.
   No .globl is emitted for __mempcpy_sse2, unlike for the _chk entry
   point below.  */
# undef ENTRY
# define ENTRY(name) \
.type __mempcpy_sse2, @function; \
.p2align 4; \
__mempcpy_sse2: cfi_startproc; \
CALL_MCOUNT
# undef END
# define END(name) \
cfi_endproc; .size __mempcpy_sse2, .-__mempcpy_sse2
/* Same renaming for the checking entry point.  __mempcpy_chk_sse2 is
   made global because the __mempcpy_chk resolver, which lives in
   mempcpy_chk.S, takes its address.  */
# undef ENTRY_CHK
# define ENTRY_CHK(name) \
.type __mempcpy_chk_sse2, @function; \
.globl __mempcpy_chk_sse2; \
.p2align 4; \
__mempcpy_chk_sse2: cfi_startproc; \
CALL_MCOUNT
# undef END_CHK
# define END_CHK(name) \
cfi_endproc; .size __mempcpy_chk_sse2, .-__mempcpy_chk_sse2
# undef libc_hidden_def
# undef libc_hidden_builtin_def
/* It doesn't make sense to send libc-internal mempcpy calls through a PLT.
The speedup we get from using SSSE3 instruction is likely eaten away
by the indirect call in the PLT. */
/* Hence both libc-internal aliases, __GI_mempcpy and __GI___mempcpy,
   are bound directly to the baseline __mempcpy_sse2 instead of to the
   IFUNC symbol.  */
# define libc_hidden_def(name) \
.globl __GI_mempcpy; __GI_mempcpy = __mempcpy_sse2
# define libc_hidden_builtin_def(name) \
.globl __GI___mempcpy; __GI___mempcpy = __mempcpy_sse2
#endif
#include "../mempcpy.S"

View file

@ -0,0 +1,47 @@
/* Multiple versions of __mempcpy_chk
Copyright (C) 2010 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
#include <sysdep.h>
#include <init-arch.h>
/* Define multiple versions only for the definition in lib and for
DSO. There are no multiarch mempcpy functions for static binaries.
*/
#ifndef NOT_IN_libc
# ifdef SHARED
.text
/* IFUNC resolver for __mempcpy_chk.  It returns in %rax the address of
   the implementation that the __mempcpy_chk symbol resolves to:
     __mempcpy_chk_sse2        when the SSSE3 bit is clear,
     __mempcpy_chk_ssse3       when the SSSE3 bit is set,
     __mempcpy_chk_ssse3_back  when SSSE3 and Fast_Copy_Backward are both set.  */
ENTRY(__mempcpy_chk)
.type __mempcpy_chk, @gnu_indirect_function
/* Call __init_cpu_features if the field of __cpu_features at
   KIND_OFFSET is still zero (presumably "not initialized yet" --
   confirm against the cpu_features definition).  */
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
jne 1f
call __init_cpu_features
/* Start from the baseline version and upgrade while feature bits allow.  */
1: leaq __mempcpy_chk_sse2(%rip), %rax
testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
jz 2f
leaq __mempcpy_chk_ssse3(%rip), %rax
/* Fast_Copy_Backward is only consulted once SSSE3 is known to be set.  */
testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
jz 2f
leaq __mempcpy_chk_ssse3_back(%rip), %rax
2: ret
END(__mempcpy_chk)
# else
# include "../mempcpy_chk.S"
# endif
#endif

View file

@ -58,6 +58,9 @@
cfi_endproc; \
ASM_SIZE_DIRECTIVE(name)
/* Entry/exit markers for the _chk entry point that precedes a function.
   By default they are the same as ENTRY/END; a file that includes an
   implementation may redefine them to emit the _chk entry point under
   a different symbol name than the main one.  */
#define ENTRY_CHK(name) ENTRY (name)
#define END_CHK(name) END (name)
/* If compiled for profiling, call `mcount' at the start of each function. */
#ifdef PROF
/* The mcount code relies on a normal frame pointer being on the stack