x86-64: Create microbenchmark infrastructure for libmvec

Add python script to generate libmvec microbenchmark from the input
values for each libmvec function using skeleton benchmark template.

Creates double and float benchmarks with vector length 1, 2, 4, 8,
and 16 for each libmvec function.  Vector length 1 corresponds to
scalar version of function and is included for vector function perf
comparison.

Co-authored-by: Haochen Jiang <haochen.jiang@intel.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
This commit is contained in:
Sunil K Pandey 2021-11-05 00:56:47 -07:00
parent d8c2e8e043
commit a43c0b5483
4 changed files with 642 additions and 0 deletions

View File

@ -29,6 +29,23 @@ libmvec-funcs = \
sin \
sincos \
# Define libmvec function for benchtests directory.
libmvec-bench-funcs = \
bench-libmvec-double = \
$(addprefix double-vlen1-, $(libmvec-bench-funcs)) \
$(addprefix double-vlen2-, $(libmvec-bench-funcs)) \
$(addprefix double-vlen4-, $(libmvec-bench-funcs)) \
$(addprefix double-vlen4-avx2-, $(libmvec-bench-funcs)) \
$(addprefix double-vlen8-, $(libmvec-bench-funcs)) \
bench-libmvec-float = \
$(addsuffix f, $(addprefix float-vlen1-, $(libmvec-bench-funcs))) \
$(addsuffix f, $(addprefix float-vlen4-, $(libmvec-bench-funcs))) \
$(addsuffix f, $(addprefix float-vlen8-, $(libmvec-bench-funcs))) \
$(addsuffix f, $(addprefix float-vlen8-avx2-, $(libmvec-bench-funcs))) \
$(addsuffix f, $(addprefix float-vlen16-, $(libmvec-bench-funcs))) \
# The base libmvec ABI tests.
libmvec-abi-func-tests = \
$(addprefix test-double-libmvec-,$(libmvec-funcs)) \
@ -83,5 +100,23 @@ $(common-objpfx)libmvec.mk: $(common-objpfx)config.make
echo " \$$(float-vlen16-arch-ext-cflags)"; \
echo; \
done; \
echo "endif"; \
echo "ifeq (\$$(subdir),benchtests)"; \
for t in $(libmvec-bench-funcs); do \
echo "CFLAGS-bench-double-vlen4-$$t.c = \\"; \
echo " \$$(double-vlen4-arch-ext-cflags)"; \
echo "CFLAGS-bench-double-vlen4-avx2-$$t.c = \\"; \
echo " \$$(double-vlen4-arch-ext2-cflags)"; \
echo "CFLAGS-bench-double-vlen8-$$t.c = \\"; \
echo " \$$(double-vlen8-arch-ext-cflags)"; \
echo; \
echo "CFLAGS-bench-float-vlen8-$${t}f.c = \\"; \
echo " \$$(float-vlen8-arch-ext-cflags)"; \
echo "CFLAGS-bench-float-vlen8-avx2-$${t}f.c = \\"; \
echo " \$$(float-vlen8-arch-ext2-cflags)"; \
echo "CFLAGS-bench-float-vlen16-$${t}f.c = \\"; \
echo " \$$(float-vlen16-arch-ext-cflags)"; \
echo; \
done; \
echo "endif") > $@T
mv -f $@T $@

View File

@ -72,3 +72,43 @@ ifeq ($(subdir)$(config-cflags-mprefer-vector-width),mathyes)
# performance of sin and cos by more than 40% on Skylake.
CFLAGS-branred.c = -mprefer-vector-width=128
endif
ifeq ($(subdir),benchtests)
double-vlen4-arch-ext-cflags = -mavx
double-vlen4-arch-ext2-cflags = -mavx2
double-vlen8-arch-ext-cflags = -mavx512f
float-vlen8-arch-ext-cflags = -mavx
float-vlen8-arch-ext2-cflags = -mavx2
float-vlen16-arch-ext-cflags = -mavx512f
bench-libmvec := $(bench-libmvec-double) $(bench-libmvec-float)
ifeq (${BENCHSET},)
bench += $(bench-libmvec)
endif
ifeq (${STATIC-BENCHTESTS},yes)
libmvec-benchtests = $(common-objpfx)mathvec/libmvec.a $(common-objpfx)math/libm.a
else
libmvec-benchtests = $(libmvec) $(libm)
endif
$(addprefix $(objpfx)bench-,$(bench-libmvec-double)): $(libmvec-benchtests)
$(addprefix $(objpfx)bench-,$(bench-libmvec-float)): $(libmvec-benchtests)
bench-libmvec-deps = $(..)sysdeps/x86_64/fpu/bench-libmvec-skeleton.c bench-timing.h Makefile
$(objpfx)bench-float-%.c: $(bench-libmvec-deps)
{ if [ -n "$($*-INCLUDE)" ]; then \
cat $($*-INCLUDE); \
fi; \
$(PYTHON) $(..)sysdeps/x86_64/fpu/scripts/bench_libmvec.py $(basename $(@F)); } > $@-tmp
mv -f $@-tmp $@
$(objpfx)bench-double-%.c: $(bench-libmvec-deps)
{ if [ -n "$($*-INCLUDE)" ]; then \
cat $($*-INCLUDE); \
fi; \
$(PYTHON) $(..)sysdeps/x86_64/fpu/scripts/bench_libmvec.py $(basename $(@F)); } > $@-tmp
mv -f $@-tmp $@
endif

View File

@ -0,0 +1,103 @@
/* Skeleton for libmvec benchmark programs.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <string.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>
#include <inttypes.h>
#include <bench-timing.h>
#include <json-lib.h>
#include <bench-util.h>
#include <math-tests-arch.h>
#include <bench-util.c>
#define D_ITERS 10000
int
main (int argc, char **argv)
{
unsigned long i, k;
timing_t start, end;
json_ctx_t json_ctx;
#if defined REQUIRE_AVX
if (!CPU_FEATURE_ACTIVE (AVX))
{
printf ("AVX not supported.\n");
return 0;
}
#elif defined REQUIRE_AVX2
if (!CPU_FEATURE_ACTIVE (AVX2))
{
printf ("AVX2 not supported.\n");
return 0;
}
#elif defined REQUIRE_AVX512F
if (!CPU_FEATURE_ACTIVE (AVX512F))
{
printf ("AVX512F not supported.\n");
return 0;
}
#endif
bench_start ();
#ifdef BENCH_INIT
BENCH_INIT ();
#endif
json_init (&json_ctx, 2, stdout);
/* Begin function. */
json_attr_object_begin (&json_ctx, FUNCNAME);
for (int v = 0; v < NUM_VARIANTS; v++)
{
double d_total_time = 0;
timing_t cur;
for (k = 0; k < D_ITERS; k++)
{
TIMING_NOW (start);
for (i = 0; i < NUM_SAMPLES (v); i++)
BENCH_FUNC (v, i);
TIMING_NOW (end);
TIMING_DIFF (cur, start, end);
TIMING_ACCUM (d_total_time, cur);
}
double d_total_data_set = D_ITERS * NUM_SAMPLES (v) * STRIDE;
/* Begin variant. */
json_attr_object_begin (&json_ctx, VARIANT (v));
json_attr_double (&json_ctx, "duration", d_total_time);
json_attr_double (&json_ctx, "iterations", d_total_data_set);
json_attr_double (&json_ctx, "mean", d_total_time / d_total_data_set);
/* End variant. */
json_attr_object_end (&json_ctx);
}
/* End function. */
json_attr_object_end (&json_ctx);
return 0;
}

View File

@ -0,0 +1,464 @@
#!/usr/bin/python3
# Copyright (C) 2021 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.
"""Benchmark program generator script
This script takes a function name as input and generates a program using
an libmvec input file located in the sysdeps/x86_64/fpu directory. The
name of the input file should be of the form libmvec-foo-inputs where
'foo' is the name of the function.
"""
from __future__ import print_function
import sys
import os
import itertools
import re
# Macro definitions for functions that take no arguments. For functions
# that take arguments, the STRUCT_TEMPLATE, ARGS_TEMPLATE and
# VARIANTS_TEMPLATE are used instead.
DEFINES_TEMPLATE = '''
#define CALL_BENCH_FUNC(v, i) %(func)s ()
#define NUM_VARIANTS (1)
#define NUM_SAMPLES(v) (1)
#define VARIANT(v) FUNCNAME "()"
'''
# Structures to store arguments for the function call. A function may
# have its inputs partitioned to represent distinct performance
# characteristics or distinct flavors of the function. Each such
# variant is represented by the _VARIANT structure. The ARGS structure
# represents a single set of arguments.
BENCH_VEC_TEMPLATE = '''
#define CALL_BENCH_FUNC(v, i) (__extension__ ({ \\
%(defs)s mx0 = %(func)s (%(func_args)s); \\
mx0; }))
'''
BENCH_SCALAR_TEMPLATE = '''
#define CALL_BENCH_FUNC(v, i) %(func)s (%(func_args)s)
'''
STRUCT_TEMPLATE = '''struct args
{
%(args)s
double timing;
};
struct _variants
{
const char *name;
int count;
struct args *in;
};
'''
# The actual input arguments.
ARGS_TEMPLATE = '''struct args in%(argnum)d[%(num_args)d] = {
%(args)s
};
'''
# The actual variants, along with macros defined to access the variants.
VARIANTS_TEMPLATE = '''struct _variants variants[%(num_variants)d] = {
%(variants)s
};
#define NUM_VARIANTS %(num_variants)d
#define NUM_SAMPLES(i) (variants[i].count)
#define VARIANT(i) (variants[i].name)
'''
# Epilogue for the generated source file.
EPILOGUE = '''
#define BENCH_FUNC(i, j) ({%(getret)s CALL_BENCH_FUNC (i, j);})
#define FUNCNAME "%(func)s"
#include <bench-libmvec-skeleton.c>'''
def gen_source(func_types, directives, all_vals):
"""Generate source for the function
Generate the C source for the function from the values and
directives.
Args:
func: The function name
directives: A dictionary of directives applicable to this function
all_vals: A dictionary input values
"""
# The includes go in first.
for header in directives['includes']:
print('#include <%s>' % header)
for header in directives['include-sources']:
print('#include "%s"' % header)
argtype_vtable = {
2: '128',
4: '256',
8: '512'
}
prefix_vtable = {
2: 'b',
4: 'c',
8: 'e'
}
# Get all the function properties
funcname_argtype = ''
float_flag = False
if func_types[1] == 'float':
float_flag = True
avx_flag = False
if func_types[3] == 'avx2':
avx_flag = True
funcname_stride = int(func_types[2][4:])
funcname_origin = func_types[-1]
if float_flag:
funcname_origin = funcname_origin[:-1]
if funcname_stride == 1:
# Prepare for scalar functions file generation
funcname_prefix = ''
funcname_prefix_1 = ''
funcname_argtype = 'double'
if float_flag:
funcname_argtype = 'float'
else:
# Prepare for libmvec functions file generation
funcname_prefix_1 = len(directives['args']) * 'v' + '_'
aligned_stride = funcname_stride
if float_flag:
aligned_stride /= 2
funcname_prefix = '_ZGV'
if (avx_flag and (aligned_stride == 4)):
funcname_prefix += 'd'
else:
funcname_prefix += prefix_vtable[aligned_stride]
funcname_prefix = funcname_prefix + 'N' + func_types[2][4:]
funcname_argtype = '__m' + argtype_vtable[aligned_stride]
if not float_flag:
funcname_argtype += 'd'
# Include x86intrin.h for vector functions
if not funcname_stride == 1:
print('#include <x86intrin.h>')
if (avx_flag and (aligned_stride == 4)):
# For bench-float-vlen8-avx2* and bench-double-vlen4-avx2*
print('#define REQUIRE_AVX2')
elif aligned_stride == 8:
# For bench-float-vlen16* and bench-double-vlen8*
print('#define REQUIRE_AVX512F')
elif aligned_stride == 4:
# For bench-float-vlen8* and bench-double-vlen4* without avx2
print('#define REQUIRE_AVX')
else:
print('#define FUNCTYPE %s' % funcname_argtype)
print('#define STRIDE %d ' % funcname_stride)
funcname = funcname_prefix + funcname_prefix_1 + funcname_origin
if float_flag:
funcname += 'f'
funcname_rettype = funcname_argtype
if directives['ret'] == '':
funcname_rettype = 'void'
funcname_inputtype = []
for arg, i in zip(directives['args'], itertools.count()):
funcname_inputtype.append(funcname_argtype)
if arg[0] == '<' and arg[-1] == '>':
pos = arg.rfind('*')
if pos == -1:
die('Output argument must be a pointer type')
funcname_inputtype[i] += ' *'
if not funcname_stride == 1:
if len(directives['args']) == 2:
print('extern %s %s (%s, %s);' % (funcname_rettype, funcname, funcname_inputtype[0], funcname_inputtype[1]))
elif len(directives['args']) == 3:
print('extern %s %s (%s, %s, %s);' % (funcname_rettype, funcname, funcname_inputtype[0], funcname_inputtype[1], funcname_inputtype[2]))
else:
print('extern %s %s (%s);' % (funcname_rettype, funcname, funcname_inputtype[0]))
# Print macros. This branches out to a separate routine if
# the function takes arguments.
if not directives['args']:
print(DEFINES_TEMPLATE % {'funcname': funcname})
outargs = []
else:
outargs = _print_arg_data(funcname, float_flag, funcname_argtype, funcname_stride, directives, all_vals)
# Print the output variable definitions if necessary.
for out in outargs:
print(out)
# If we have a return value from the function, make sure it is
# assigned to prevent the compiler from optimizing out the
# call.
getret = ''
if directives['ret']:
if funcname_argtype != '':
print('static %s volatile ret;' % funcname_argtype)
getret = 'ret ='
else:
print('static %s volatile ret;' % directives['ret'])
getret = 'ret ='
# Test initialization.
if directives['init']:
print('#define BENCH_INIT %s' % directives['init'])
print(EPILOGUE % {'getret': getret, 'func': funcname})
def _print_arg_data(func, float_flag, funcname_argtype, funcname_stride, directives, all_vals):
"""Print argument data
This is a helper function for gen_source that prints structure and
values for arguments and their variants and returns output arguments
if any are found.
Args:
func: Function name
float_flag: True if function is float type
funcname_argtype: Type for vector variants
funcname_stride: Vector Length
directives: A dictionary of directives applicable to this function
all_vals: A dictionary input values
Returns:
Returns a list of definitions for function arguments that act as
output parameters.
"""
# First, all of the definitions. We process writing of
# CALL_BENCH_FUNC, struct args and also the output arguments
# together in a single traversal of the arguments list.
func_args = []
_func_args = []
arg_struct = []
outargs = []
# Conversion function for each type
vtable = {
'__m128d': '_mm_loadu_pd',
'__m256d': '_mm256_loadu_pd',
'__m512d': '_mm512_loadu_pd',
'__m128': '_mm_loadu_ps',
'__m256': '_mm256_loadu_ps',
'__m512': '_mm512_loadu_ps',
'double': '',
'float': ''
}
# For double max_vlen=8, for float max_vlen=16.
if float_flag == True:
max_vlen = 16
else:
max_vlen = 8
for arg, i in zip(directives['args'], itertools.count()):
if arg[0] == '<' and arg[-1] == '>':
outargs.append('static %s out%d __attribute__((used));' % (funcname_argtype, i))
func_args.append('&out%d' % i)
_func_args.append('&out%d' % i)
else:
arg_struct.append(' %s arg%d[STRIDE];' % (arg, i))
func_args.append('%s (variants[v].in[i].arg%d)' %
(vtable[funcname_argtype], i))
_func_args.append('variants[v].in[i].arg%d[0]' % i)
if funcname_stride == 1:
print(BENCH_SCALAR_TEMPLATE % {'func': func,
'func_args': ', '.join(_func_args)})
elif directives['ret'] == '':
print(BENCH_SCALAR_TEMPLATE % {'func': func,
'func_args': ', '.join(func_args)})
else:
print(BENCH_VEC_TEMPLATE % {'func': func, 'func_args': ', '.join(func_args),
'defs': funcname_argtype})
print(STRUCT_TEMPLATE % {'args': '\n'.join(arg_struct)})
# Now print the values.
variants = []
for (k, _vals), i in zip(all_vals.items(), itertools.count()):
vals = []
temp_vals = []
j = 0
temp_j = 0
result_v = ['', '', '']
for _v in _vals:
nums = _v.split(',')
for l in range(0, len(nums)):
result_v[l] = result_v[l] + nums[l].strip() + ','
j += 1
temp_j += 1
if temp_j == funcname_stride:
final_result = ''
for l in range(0, len(nums)):
final_result = final_result + '{' + result_v[l][:-1] + '},'
temp_vals.append(final_result[:-1])
temp_j = 0
result_v = ['', '', '']
# Make sure amount of test data is multiple of max_vlen
# to keep data size same for all vector length.
if j == max_vlen:
vals.extend(temp_vals)
temp_vals = []
j = 0
out = [' {%s, 0},' % v for v in vals]
# Members for the variants structure list that we will
# print later.
variants.append(' {"%s", %d, in%d},' % (k, len(vals), i))
print(ARGS_TEMPLATE % {'argnum': i, 'num_args': len(vals),
'args': '\n'.join(out)})
# Print the variants and the last set of macros.
print(VARIANTS_TEMPLATE % {'num_variants': len(all_vals),
'variants': '\n'.join(variants)})
return outargs
def _process_directive(d_name, d_val, func_args):
"""Process a directive.
Evaluate the directive name and value passed and return the
processed value. This is a helper function for parse_file.
Args:
d_name: Name of the directive
d_val: The string value to process
Returns:
The processed value, which may be the string as it is or an object
that describes the directive.
"""
# Process the directive values if necessary. name and ret don't
# need any processing.
if d_name.startswith('include'):
d_val = d_val.split(',')
elif d_name == 'args':
d_val = d_val.split(':')
# Check if args type match
if not d_val[0] == func_args:
die("Args mismatch, should be %s, but get %s" % (d_val[0], func_args))
# Return the values.
return d_val
def parse_file(func_types):
"""Parse an input file
Given a function name, open and parse an input file for the function
and get the necessary parameters for the generated code and the list
of inputs.
Args:
func: The function name
Returns:
A tuple of two elements, one a dictionary of directives and the
other a dictionary of all input values.
"""
all_vals = {}
# Valid directives.
directives = {
'name': '',
'args': [],
'includes': [],
'include-sources': [],
'ret': '',
'init': ''
}
func = func_types[-1]
try:
with open('../sysdeps/x86_64/fpu/libmvec-%s-inputs' % func) as f:
for line in f:
# Look for directives and parse it if found.
if line.startswith('##'):
try:
d_name, d_val = line[2:].split(':', 1)
d_name = d_name.strip()
d_val = d_val.strip()
directives[d_name] = _process_directive(d_name, d_val, func_types[1])
except (IndexError, KeyError):
die('Invalid directive: %s' % line[2:])
# Skip blank lines and comments.
line = line.split('#', 1)[0].rstrip()
if not line:
continue
# Otherwise, we're an input. Add to the appropriate
# input set.
cur_name = directives['name']
all_vals.setdefault(cur_name, [])
all_vals[cur_name].append(line)
except IOError as ex:
die("Failed to open input file (%s): %s" % (ex.filename, ex.strerror))
return directives, all_vals
def die(msg):
"""Exit with an error
Prints an error message to the standard error stream and exits with
a non-zero status.
Args:
msg: The error message to print to standard error
"""
print('%s\n' % msg, file=sys.stderr)
sys.exit(os.EX_DATAERR)
def main(args):
"""Main function
Use the first command line argument as function name and parse its
input file to generate C source that calls the function repeatedly
for the input.
Args:
args: The command line arguments with the program name dropped
Returns:
os.EX_USAGE on error and os.EX_OK on success.
"""
if len(args) != 1:
print('Usage: %s <function>' % sys.argv[0])
return os.EX_USAGE
func_types = args[0].split('-')
directives, all_vals = parse_file(func_types)
gen_source(func_types, directives, all_vals)
return os.EX_OK
if __name__ == '__main__':
sys.exit(main(sys.argv[1:]))