IFUNC for Cavium ThunderX2

* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
	Add memcpy_thunderx2.
	* sysdeps/aarch64/multiarch/ifunc-impl-list.c (MAX_IFUNC):
	Increment to 4.
	(__libc_ifunc_impl_list): Add __memcpy_thunderx2.
	* sysdeps/aarch64/multiarch/memcpy.c (libc_ifunc): Add IS_THUNDERX2
	and IS_THUNDERX2PA checks.
	* sysdeps/aarch64/multiarch/memcpy_thunderx.S (USE_THUNDERX2):
	Use macro to set name appropriately.
	(memcpy): Use USE_THUNDERX2 macro to modify prefetches.
	* sysdeps/aarch64/multiarch/memcpy_thunderx2.S: New file.
	* sysdeps/unix/sysv/linux/aarch64/cpu-features.h (IS_THUNDERX2PA):
	New macro.
	(IS_THUNDERX2): New macro.
This commit is contained in:
Steve Ellcey 2018-02-22 08:38:47 -08:00
parent da81ae645d
commit e9537dddc7
7 changed files with 73 additions and 10 deletions

View file

@ -1,3 +1,20 @@
2018-02-22 Steve Ellcey <sellcey@cavium.com>
* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
Add memcpy_thunderx2.
* sysdeps/aarch64/multiarch/ifunc-impl-list.c (MAX_IFUNC):
Increment to 4.
(__libc_ifunc_impl_list): Add __memcpy_thunderx2.
* sysdeps/aarch64/multiarch/memcpy.c (libc_ifunc): Add IS_THUNDERX2
and IS_THUNDERX2PA checks.
* sysdeps/aarch64/multiarch/memcpy_thunderx.S (USE_THUNDERX2):
Use macro to set name appropriately.
(memcpy): Use USE_THUNDERX2 macro to modify prefetches.
* sysdeps/aarch64/multiarch/memcpy_thunderx2.S: New file.
* sysdeps/unix/sysv/linux/aarch64/cpu-features.h (IS_THUNDERX2PA):
New macro.
(IS_THUNDERX2): New macro.
2018-02-22 Stefan Liebler <stli@linux.vnet.ibm.com>
* sysdeps/s390/fpu/libm-test-ulps: Regenerated.

View file

@ -1,4 +1,4 @@
ifeq ($(subdir),string)
sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor \
memmove_falkor memset_generic memset_falkor
sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
memcpy_falkor memmove_falkor memset_generic memset_falkor
endif

View file

@ -25,7 +25,7 @@
#include <stdio.h>
/* Maximum number of IFUNC implementations. */
#define MAX_IFUNC 3
#define MAX_IFUNC 4
size_t
__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
@ -40,6 +40,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/aarch64/multiarch/memcpy.c and memmove.c. */
IFUNC_IMPL (i, name, memcpy,
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx2)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
IFUNC_IMPL (i, name, memmove,

View file

@ -30,6 +30,7 @@ extern __typeof (__redirect_memcpy) __libc_memcpy;
extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
libc_ifunc (__libc_memcpy,
@ -37,7 +38,9 @@ libc_ifunc (__libc_memcpy,
? __memcpy_thunderx
: (IS_FALKOR (midr)
? __memcpy_falkor
: __memcpy_generic)));
: (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
? __memcpy_thunderx2
: __memcpy_generic))));
# undef memcpy
strong_alias (__libc_memcpy, memcpy);

View file

@ -74,11 +74,13 @@
#if IS_IN (libc)
# undef MEMCPY
# define MEMCPY __memcpy_thunderx
# undef MEMMOVE
# define MEMMOVE __memmove_thunderx
# define USE_THUNDERX
# ifndef USE_THUNDERX2
# undef MEMCPY
# define MEMCPY __memcpy_thunderx
# undef MEMMOVE
# define MEMMOVE __memmove_thunderx
# define USE_THUNDERX
# endif
ENTRY_ALIGN (MEMMOVE, 6)
@ -180,7 +182,7 @@ L(copy96):
.p2align 4
L(copy_long):
# ifdef USE_THUNDERX
# if defined(USE_THUNDERX) || defined (USE_THUNDERX2)
/* On thunderx, large memcpy's are helped by software prefetching.
This loop is identical to the one below it but with prefetching
@ -194,7 +196,11 @@ L(copy_long):
bic dst, dstin, 15
ldp D_l, D_h, [src]
sub src, src, tmp1
# if defined(USE_THUNDERX)
prfm pldl1strm, [src, 384]
# elif defined(USE_THUNDERX2)
prfm pldl1strm, [src, 256]
# endif
add count, count, tmp1 /* Count is now 16 too large. */
ldp A_l, A_h, [src, 16]
stp D_l, D_h, [dstin]
@ -204,9 +210,13 @@ L(copy_long):
subs count, count, 128 + 16 /* Test and readjust count. */
L(prefetch_loop64):
# if defined(USE_THUNDERX)
tbz src, #6, 1f
prfm pldl1strm, [src, 512]
1:
# elif defined(USE_THUNDERX2)
prfm pldl1strm, [src, 256]
# endif
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [src, 16]
stp B_l, B_h, [dst, 32]

View file

@ -0,0 +1,27 @@
/* A Thunderx2 Optimized memcpy implementation for AARCH64.
Copyright (C) 2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
/* The actual code in this memcpy and memmove is in memcpy_thunderx.S.
The only real differences are with the prefetching instructions. */
#define MEMCPY __memcpy_thunderx2
#define MEMMOVE __memmove_thunderx2
#define USE_THUNDERX2
#include "memcpy_thunderx.S"

View file

@ -41,6 +41,11 @@
#define IS_THUNDERX(midr) (MIDR_IMPLEMENTOR(midr) == 'C' \
&& MIDR_PARTNUM(midr) == 0x0a1)
#define IS_THUNDERX2PA(midr) (MIDR_IMPLEMENTOR(midr) == 'B' \
&& MIDR_PARTNUM(midr) == 0x516)
#define IS_THUNDERX2(midr) (MIDR_IMPLEMENTOR(midr) == 'C' \
&& MIDR_PARTNUM(midr) == 0xaf)
#define IS_FALKOR(midr) (MIDR_IMPLEMENTOR(midr) == 'Q' \
&& MIDR_PARTNUM(midr) == 0xc00)