Faster strchr implementation.

Ondřej Bílka 2013-09-11 17:07:38 +02:00
parent b987c77672
commit 5905e7b3e2
5 changed files with 170 additions and 201 deletions

ChangeLog

@@ -1,3 +1,12 @@
2013-09-11 Ondřej Bílka <neleai@seznam.cz>
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Remove: __strchr_sse42.
* sysdeps/x86_64/multiarch/strchr.S (__strchr_sse42): Remove.
(strchr): Remove __strchr_sse42 ifunc selection.
* sysdeps/x86_64/strchr.S (strchr): Use optimized implementation.
* sysdeps/x86_64/strchrnul.S: Include sysdeps/x86_64/strchr.S.
2013-09-11 Will Newton <will.newton@linaro.org>
* benchtests/bench-timing.h (TIMING_INIT): Rename ITERS
@@ -35,7 +44,7 @@
* malloc/malloc.c (__libc_pvalloc): Check the value of bytes
does not overflow.
2013-09-10 Ondřej Bílka <neleai@seznam.cz>
* sysdeps/ieee754/dbl-64/e_j0.c: Remove DO_NOT_USE_THIS conditionals.
* sysdeps/ieee754/dbl-64/e_j1.c: Likewise.

sysdeps/x86_64/multiarch/ifunc-impl-list.c

@@ -110,7 +110,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strchr.S. */
IFUNC_IMPL (i, name, strchr,
IFUNC_IMPL_ADD (array, i, strchr, HAS_SSE4_2, __strchr_sse42)
IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf)
IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2))

sysdeps/x86_64/multiarch/strchr.S

@@ -29,12 +29,6 @@ ENTRY(strchr)
jne 1f
call __init_cpu_features
1: leaq __strchr_sse2(%rip), %rax
testl $bit_Slow_SSE4_2, __cpu_features+FEATURE_OFFSET+index_Slow_SSE4_2(%rip)
jnz 2f
testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
jz 2f
leaq __strchr_sse42(%rip), %rax
ret
2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
jz 3f
leaq __strchr_sse2_no_bsf(%rip), %rax
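For orientation, the dispatch that survives this hunk defaults to __strchr_sse2 and only switches to __strchr_sse2_no_bsf on CPUs with the Slow_BSF feature bit. A minimal C sketch of that selection, with hypothetical helper names (the real resolver reads __cpu_features directly, as above):

#include <stdbool.h>

extern char *__strchr_sse2 (const char *, int);
extern char *__strchr_sse2_no_bsf (const char *, int);

/* Hypothetical stand-in for the Slow_BSF test against __cpu_features.  */
static bool cpu_has_slow_bsf (void) { return false; /* placeholder */ }

typedef char *(*strchr_fn) (const char *, int);

static strchr_fn
select_strchr (void)
{
  /* SSE4.2 is no longer considered after this commit.  */
  return cpu_has_slow_bsf () ? __strchr_sse2_no_bsf : __strchr_sse2;
}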
@@ -42,127 +36,6 @@ ENTRY(strchr)
END(strchr)
/*
This implementation uses SSE4 instructions to compare up to 16 bytes
at a time looking for the first occurrence of the character c in the
string s:
char *strchr (const char *s, int c);
We use 0xa:
_SIDD_SBYTE_OPS
| _SIDD_CMP_EQUAL_EACH
| _SIDD_LEAST_SIGNIFICANT
on pcmpistri to compare xmm/mem128
0 1 2 3 4 5 6 7 8 9 A B C D E F
X X X X X X X X X X X X X X X X
against xmm
0 1 2 3 4 5 6 7 8 9 A B C D E F
C C C C C C C C C C C C C C C C
to find out if the first 16-byte data element has a byte C and the
offset of the first byte. There are 3 cases:
1. The first 16-byte data element has the byte C at the offset X.
2. The first 16-byte data element has EOS and doesn't have the byte C.
3. The first 16-byte data element is valid and doesn't have the byte C.
Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases:
case ECX CFlag ZFlag SFlag
1 X 1 0/1 0
2 16 0 1 0
3 16 0 0 0
We exit from the loop for cases 1 and 2 with jbe which branches
when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset
X for case 1. */
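For readers who prefer intrinsics, here is a minimal sketch of the loop this comment describes, using the 0xa mode listed above. It is illustrative only, not the removed glibc code: it assumes s is 16-byte aligned and c != 0, both of which the real implementation handles separately.

#include <nmmintrin.h>  /* SSE4.2 */
#include <stddef.h>

#define MODE (_SIDD_SBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_LEAST_SIGNIFICANT)

static const char *
strchr_sse42_sketch (const char *s, int c)
{
  __m128i needle = _mm_set1_epi8 ((char) c);  /* C in all 16 bytes */
  for (;; s += 16)
    {
      __m128i chunk = _mm_load_si128 ((const __m128i *) s);
      /* Case 1: CFlag set; the intrinsic's result is the match offset.  */
      if (_mm_cmpistrc (needle, chunk, MODE))
        return s + _mm_cmpistri (needle, chunk, MODE);
      /* Case 2: ZFlag set; the chunk contains EOS and no match.  */
      if (_mm_cmpistrz (needle, chunk, MODE))
        return NULL;
    }
}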
.section .text.sse4.2,"ax",@progbits
.align 16
.type __strchr_sse42, @function
.globl __strchr_sse42
.hidden __strchr_sse42
__strchr_sse42:
cfi_startproc
CALL_MCOUNT
testb %sil, %sil
je __strend_sse4
pxor %xmm2, %xmm2
movd %esi, %xmm1
movl %edi, %ecx
pshufb %xmm2, %xmm1
andl $15, %ecx
movq %rdi, %r8
je L(aligned_start)
/* Handle unaligned string. */
andq $-16, %r8
movdqa (%r8), %xmm0
pcmpeqb %xmm0, %xmm2
pcmpeqb %xmm1, %xmm0
/* Find where NULL is. */
pmovmskb %xmm2, %edx
/* Check if there is a match. */
pmovmskb %xmm0, %esi
/* Remove the leading bytes. */
sarl %cl, %edx
sarl %cl, %esi
testl %esi, %esi
je L(unaligned_no_match)
/* Check which byte is a match. */
bsfl %esi, %eax
/* Is there a NULL? */
testl %edx, %edx
je L(unaligned_match)
bsfl %edx, %esi
cmpl %esi, %eax
/* Return NULL if NULL comes first. */
ja L(return_null)
L(unaligned_match):
addq %rdi, %rax
ret
.p2align 4
L(unaligned_no_match):
testl %edx, %edx
jne L(return_null)
/* Loop start on aligned string. */
L(loop):
addq $16, %r8
L(aligned_start):
pcmpistri $0x2, (%r8), %xmm1
jbe L(wrap)
addq $16, %r8
pcmpistri $0x2, (%r8), %xmm1
jbe L(wrap)
addq $16, %r8
pcmpistri $0x2, (%r8), %xmm1
jbe L(wrap)
addq $16, %r8
pcmpistri $0x2, (%r8), %xmm1
jbe L(wrap)
jmp L(loop)
L(wrap):
jc L(loop_exit)
/* Return NULL. */
L(return_null):
xorl %eax, %eax
ret
/* Loop exit. */
.p2align 4
L(loop_exit):
leaq (%r8,%rcx), %rax
ret
cfi_endproc
.size __strchr_sse42, .-__strchr_sse42
# undef ENTRY
# define ENTRY(name) \

sysdeps/x86_64/strchr.S

@@ -19,51 +19,174 @@
#include <sysdep.h>
# ifndef ALIGN
# define ALIGN(n) .p2align n
# endif
.text
ENTRY (strchr)
movd %esi, %xmm1
movq %rdi, %rcx
movl %edi, %eax
andl $4095, %eax
punpcklbw %xmm1, %xmm1
andq $~15, %rdi
pxor %xmm2, %xmm2
punpcklbw %xmm1, %xmm1
orl $0xffffffff, %esi
movdqa (%rdi), %xmm0
cmpl $4032, %eax
punpcklwd %xmm1, %xmm1
pshufd $0, %xmm1, %xmm1
subq %rdi, %rcx
movdqa %xmm0, %xmm3
leaq 16(%rdi), %rdi
jg L(cross_page)
movdqu (%rdi), %xmm0
pxor %xmm3, %xmm3
movdqa %xmm0, %xmm4
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm3
shl %cl, %esi
pmovmskb %xmm0, %edx
pmovmskb %xmm3, %ecx
andl %esi, %edx
andl %esi, %ecx
orl %edx, %ecx
jnz 1f
pcmpeqb %xmm3, %xmm4
por %xmm4, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
je L(next_48_bytes)
bsf %eax, %eax
#ifdef AS_STRCHRNUL
leaq (%rdi,%rax), %rax
#else
movl $0, %edx
leaq (%rdi,%rax), %rax
cmpb %sil, (%rax)
cmovne %rdx, %rax
#endif
ret
2: movdqa (%rdi), %xmm0
leaq 16(%rdi), %rdi
movdqa %xmm0, %xmm3
ALIGN(3)
L(next_48_bytes):
movdqu 16(%rdi), %xmm0
movdqa %xmm0, %xmm4
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm3
pmovmskb %xmm0, %edx
pmovmskb %xmm3, %ecx
orl %edx, %ecx
jz 2b
pcmpeqb %xmm3, %xmm4
por %xmm4, %xmm0
pmovmskb %xmm0, %ecx
movdqu 32(%rdi), %xmm0
movdqa %xmm0, %xmm4
pcmpeqb %xmm1, %xmm0
salq $16, %rcx
pcmpeqb %xmm3, %xmm4
por %xmm4, %xmm0
pmovmskb %xmm0, %eax
movdqu 48(%rdi), %xmm0
pcmpeqb %xmm0, %xmm3
salq $32, %rax
pcmpeqb %xmm1, %xmm0
orq %rcx, %rax
por %xmm3, %xmm0
pmovmskb %xmm0, %ecx
salq $48, %rcx
orq %rcx, %rax
testq %rax, %rax
jne L(return)
L(loop_start):
/* We use this alignment to force the loop to be aligned to 8 but not
16 bytes. This gives better scheduling on AMD processors. */
ALIGN(4)
pxor %xmm6, %xmm6
andq $-64, %rdi
ALIGN(3)
L(loop64):
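/* Each iteration checks 64 bytes at once: x ^ c has a zero byte exactly
   where x has the byte c, and pminub with x itself also sends NUL bytes
   to zero, so the single pcmpeqb against zero below detects both a match
   and the end of the string.  */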
addq $64, %rdi
movdqa (%rdi), %xmm5
movdqa 16(%rdi), %xmm2
movdqa 32(%rdi), %xmm3
pxor %xmm1, %xmm5
movdqa 48(%rdi), %xmm4
pxor %xmm1, %xmm2
pxor %xmm1, %xmm3
pminub (%rdi), %xmm5
pxor %xmm1, %xmm4
pminub 16(%rdi), %xmm2
pminub 32(%rdi), %xmm3
pminub %xmm2, %xmm5
pminub 48(%rdi), %xmm4
pminub %xmm3, %xmm5
pminub %xmm4, %xmm5
pcmpeqb %xmm6, %xmm5
pmovmskb %xmm5, %eax
testl %eax, %eax
je L(loop64)
movdqa (%rdi), %xmm5
movdqa %xmm5, %xmm0
pcmpeqb %xmm1, %xmm5
pcmpeqb %xmm6, %xmm0
por %xmm0, %xmm5
pcmpeqb %xmm6, %xmm2
pcmpeqb %xmm6, %xmm3
pcmpeqb %xmm6, %xmm4
pmovmskb %xmm5, %ecx
pmovmskb %xmm2, %eax
salq $16, %rax
pmovmskb %xmm3, %r8d
pmovmskb %xmm4, %edx
salq $32, %r8
orq %r8, %rax
orq %rcx, %rax
salq $48, %rdx
orq %rdx, %rax
ALIGN(3)
L(return):
bsfq %rax, %rax
#ifdef AS_STRCHRNUL
leaq (%rdi,%rax), %rax
#else
movl $0, %edx
leaq (%rdi,%rax), %rax
cmpb %sil, (%rax)
cmovne %rdx, %rax
#endif
ret
ALIGN(4)
L(cross_page):
movq %rdi, %rdx
pxor %xmm2, %xmm2
andq $-64, %rdx
movdqa %xmm1, %xmm0
movdqa (%rdx), %xmm3
movdqa %xmm3, %xmm4
pcmpeqb %xmm1, %xmm3
pcmpeqb %xmm2, %xmm4
por %xmm4, %xmm3
pmovmskb %xmm3, %r8d
movdqa 16(%rdx), %xmm3
movdqa %xmm3, %xmm4
pcmpeqb %xmm1, %xmm3
pcmpeqb %xmm2, %xmm4
por %xmm4, %xmm3
pmovmskb %xmm3, %eax
movdqa 32(%rdx), %xmm3
movdqa %xmm3, %xmm4
pcmpeqb %xmm1, %xmm3
salq $16, %rax
pcmpeqb %xmm2, %xmm4
por %xmm4, %xmm3
pmovmskb %xmm3, %r9d
movdqa 48(%rdx), %xmm3
pcmpeqb %xmm3, %xmm2
salq $32, %r9
pcmpeqb %xmm3, %xmm0
orq %r9, %rax
orq %r8, %rax
por %xmm2, %xmm0
pmovmskb %xmm0, %ecx
salq $48, %rcx
orq %rcx, %rax
movl %edi, %ecx
subb %dl, %cl
shrq %cl, %rax
testq %rax, %rax
jne L(return)
jmp L(loop_start)
1: bsfl %edx, %edx
jz 4f
bsfl %ecx, %ecx
leaq -16(%rdi,%rdx), %rax
cmpl %edx, %ecx
je 5f
4: xorl %eax, %eax
5: ret
END (strchr)
#ifndef AS_STRCHRNUL
weak_alias (strchr, index)
libc_hidden_builtin_def (strchr)
#endif
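Two ideas in the new implementation are easy to miss in the assembly. Below is a hedged C sketch of each; the function names are illustrative, not glibc's.

#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>

/* The L(loop64) fusion: min (x ^ c, x) has a zero byte exactly where x
   has a NUL byte or a byte equal to c, so one comparison against zero
   tests both loop-exit conditions at once.  */
static int
nul_or_match_mask (__m128i chunk, __m128i needle /* c in every byte */)
{
  __m128i xored = _mm_xor_si128 (chunk, needle);  /* 0 where byte == c */
  __m128i fused = _mm_min_epu8 (xored, chunk);    /* 0 where c or NUL */
  return _mm_movemask_epi8 (_mm_cmpeq_epi8 (fused, _mm_setzero_si128 ()));
}

/* The entry test 'andl $4095, %eax; cmpl $4032, %eax; jg L(cross_page)':
   if s lies in the last 64 bytes of its page, the unaligned 64-byte probe
   of the fast path could touch the next, possibly unmapped, page, so the
   code takes the careful aligned path instead.  */
static int
crosses_page (const char *s)
{
  return ((uintptr_t) s & 4095) > 4096 - 64;
}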

sysdeps/x86_64/strchrnul.S

@@ -20,43 +20,8 @@
#include <sysdep.h>
.text
ENTRY (__strchrnul)
movd %esi, %xmm1
movq %rdi, %rcx
punpcklbw %xmm1, %xmm1
andq $~15, %rdi
pxor %xmm2, %xmm2
punpcklbw %xmm1, %xmm1
orl $0xffffffff, %esi
movdqa (%rdi), %xmm0
pshufd $0, %xmm1, %xmm1
subq %rdi, %rcx
movdqa %xmm0, %xmm3
leaq 16(%rdi), %rdi
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm3
shl %cl, %esi
pmovmskb %xmm0, %edx
pmovmskb %xmm3, %ecx
orl %edx, %ecx
andl %esi, %ecx
jnz 1f
2: movdqa (%rdi), %xmm0
leaq 16(%rdi), %rdi
movdqa %xmm0, %xmm3
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm3
pmovmskb %xmm0, %edx
pmovmskb %xmm3, %ecx
orl %edx, %ecx
jz 2b
1: bsfl %ecx, %edx
leaq -16(%rdi,%rdx), %rax
ret
END (__strchrnul)
#define strchr __strchrnul
#define AS_STRCHRNUL
#include "strchr.S"
weak_alias (__strchrnul, strchrnul)
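The three lines above are the point of this rewritten file: strchrnul is now built by including the new strchr.S with AS_STRCHRNUL defined, which only changes what the shared code returns when c is absent. A small usage example of that difference (strchrnul is a GNU extension):

#define _GNU_SOURCE
#include <string.h>
#include <assert.h>

int
main (void)
{
  const char *s = "hello";
  /* No 'z' in s: strchr reports failure with NULL, while strchrnul
     returns a pointer to the terminating NUL byte instead.  */
  assert (strchr (s, 'z') == NULL);
  assert (strchrnul (s, 'z') == s + 5);
  return 0;
}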