Use HP_TIMING for benchmarks if available
HP_TIMING uses native timestamping instructions where they are available, which greatly reduces the overhead of recording start and end times for function calls. Architectures that don't have HP_TIMING available fall back to clock_gettime. One may also force the clock_gettime path by invoking the benchmark as `make USE_CLOCK_GETTIME=1 bench`, which produces the benchmark results using clock_gettime. One has to run `make bench-clean` first to ensure that the benchmark programs are rebuilt.
This commit is contained in:
parent
0f7d347bd0
commit
43fe811b73
|
@ -1,5 +1,12 @@
|
|||
2013-05-13 Siddhesh Poyarekar <siddhesh@redhat.com>
|
||||
|
||||
* benchtests/Makefile (CPPFLAGS-nonlib): Add
|
||||
-DUSE_CLOCK_GETTIME if USE_CLOCK_GETTIME is defined.
|
||||
(bench-deps): Add bench-timing.h.
|
||||
* benchtests/bench-skeleton.c: Include bench-timing.h.
|
||||
(main): Use TIMING_* macros instead of clock_gettime.
|
||||
* benchtests/bench-timing.h: New file.
|
||||
|
||||
[BZ #14582]
|
||||
* sysdeps/ieee754/s_lib_version.c (_LIB_VERSION_INTERNAL):
|
||||
Renamed from _LIB_VERSION.
|
||||
|
|
|
@ -86,13 +86,19 @@ endif
|
|||
|
||||
CPPFLAGS-nonlib = -DDURATION=$(BENCH_DURATION)
|
||||
|
||||
# Use clock_gettime to measure performance of functions. The default is to use
|
||||
# HP_TIMING if it is available.
|
||||
ifdef USE_CLOCK_GETTIME
|
||||
CPPFLAGS-nonlib += -DUSE_CLOCK_GETTIME
|
||||
endif
|
||||
|
||||
# This makes sure CPPFLAGS-nonlib and CFLAGS-nonlib are passed
|
||||
# for all these modules.
|
||||
cpp-srcs-left := $(binaries-bench:=.c)
|
||||
lib := nonlib
|
||||
include $(patsubst %,$(..)cppflags-iterator.mk,$(cpp-srcs-left))
|
||||
|
||||
bench-deps := bench-skeleton.c Makefile
|
||||
bench-deps := bench-skeleton.c bench-timing.h Makefile
|
||||
|
||||
run-bench = $(test-wrapper-env) \
|
||||
GCONV_PATH=$(common-objpfx)iconvdata LC_ALL=C \
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
#include <stdio.h>
|
||||
#include <time.h>
|
||||
#include <inttypes.h>
|
||||
#include "bench-timing.h"
|
||||
|
||||
volatile unsigned int dontoptimize = 0;
|
||||
|
||||
|
@ -45,21 +46,16 @@ int
|
|||
main (int argc, char **argv)
|
||||
{
|
||||
unsigned long i, k;
|
||||
struct timespec start, end, runtime;
|
||||
struct timespec runtime;
|
||||
timing_t start, end;
|
||||
|
||||
startup();
|
||||
|
||||
memset (&runtime, 0, sizeof (runtime));
|
||||
memset (&start, 0, sizeof (start));
|
||||
memset (&end, 0, sizeof (end));
|
||||
|
||||
clock_getres (CLOCK_PROCESS_CPUTIME_ID, &start);
|
||||
unsigned long iters;
|
||||
|
||||
/* Measure 1000 times the resolution of the clock. So for a 1ns resolution
|
||||
clock, we measure 1000 iterations of the function call at a time.
|
||||
Measurements close to the minimum clock resolution won't make much sense,
|
||||
but it's better than having nothing at all. */
|
||||
unsigned long iters = 1000 * start.tv_nsec;
|
||||
TIMING_INIT (iters);
|
||||
|
||||
for (int v = 0; v < NUM_VARIANTS; v++)
|
||||
{
|
||||
|
@ -68,19 +64,18 @@ main (int argc, char **argv)
|
|||
runtime.tv_sec += DURATION;
|
||||
|
||||
double d_total_i = 0;
|
||||
uint64_t total = 0, max = 0, min = 0x7fffffffffffffff;
|
||||
timing_t total = 0, max = 0, min = 0x7fffffffffffffff;
|
||||
while (1)
|
||||
{
|
||||
for (i = 0; i < NUM_SAMPLES (v); i++)
|
||||
{
|
||||
clock_gettime (CLOCK_PROCESS_CPUTIME_ID, &start);
|
||||
uint64_t cur;
|
||||
TIMING_NOW (start);
|
||||
for (k = 0; k < iters; k++)
|
||||
BENCH_FUNC (v, i);
|
||||
clock_gettime (CLOCK_PROCESS_CPUTIME_ID, &end);
|
||||
TIMING_NOW (end);
|
||||
|
||||
uint64_t cur = (end.tv_nsec - start.tv_nsec
|
||||
+ ((end.tv_sec - start.tv_sec)
|
||||
* (uint64_t) 1000000000));
|
||||
TIMING_DIFF (cur, start, end);
|
||||
|
||||
if (cur > max)
|
||||
max = cur;
|
||||
|
@ -88,7 +83,7 @@ main (int argc, char **argv)
|
|||
if (cur < min)
|
||||
min = cur;
|
||||
|
||||
total += cur;
|
||||
TIMING_ACCUM (total, cur);
|
||||
|
||||
d_total_i += iters;
|
||||
}
|
||||
|
@ -104,13 +99,11 @@ main (int argc, char **argv)
|
|||
double d_iters;
|
||||
|
||||
done:
|
||||
d_total_s = total * 1e-9;
|
||||
d_total_s = total;
|
||||
d_iters = iters;
|
||||
|
||||
printf ("%s: ITERS:%g: TOTAL:%gs, MAX:%gns, MIN:%gns, %g iter/s\n",
|
||||
VARIANT (v),
|
||||
d_total_i, d_total_s, max / d_iters, min / d_iters,
|
||||
d_total_i / d_total_s);
|
||||
TIMING_PRINT_STATS (VARIANT (v), d_total_s, d_iters, d_total_i, max,
|
||||
min);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
|
72
benchtests/bench-timing.h
Normal file
72
benchtests/bench-timing.h
Normal file
|
@ -0,0 +1,72 @@
|
|||
/* Define timing macros.
|
||||
Copyright (C) 2013 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
#include <hp-timing.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#if HP_TIMING_AVAIL && !defined USE_CLOCK_GETTIME
|
||||
# define GL(x) _##x
|
||||
# define GLRO(x) _##x
|
||||
hp_timing_t _dl_hp_timing_overhead;
|
||||
typedef hp_timing_t timing_t;
|
||||
|
||||
# define TIMING_INIT(iters) \
|
||||
({ \
|
||||
HP_TIMING_DIFF_INIT(); \
|
||||
(iters) = 1000; \
|
||||
})
|
||||
|
||||
# define TIMING_NOW(var) HP_TIMING_NOW (var)
|
||||
# define TIMING_DIFF(diff, start, end) HP_TIMING_DIFF ((diff), (start), (end))
|
||||
# define TIMING_ACCUM(sum, diff) HP_TIMING_ACCUM_NT ((sum), (diff))
|
||||
|
||||
/* Print one result line for FUNC, labelling the figures in cycles: the
   iteration count D_TOTAL_I, the total D_TOTAL_S scaled by 1e-6 ("Mcy"),
   MAX and MIN each divided by D_ITERS, and D_TOTAL_I per million units of
   D_TOTAL_S.  Every value is printed with %g, so D_TOTAL_S, D_ITERS and
   D_TOTAL_I are expected to be doubles.

   The expansion is a single printf expression with no trailing semicolon,
   matching the clock_gettime variant of this macro; the caller supplies
   the semicolon, so the macro is safe in an un-braced if/else and may be
   used where an expression is required.  */
# define TIMING_PRINT_STATS(func, d_total_s, d_iters, d_total_i, max, min) \
  printf ("%s: ITERS:%g: TOTAL:%gMcy, MAX:%gcy, MIN:%gcy, %g calls/Mcy\n", \
	  (func), (d_total_i), (d_total_s) * 1e-6, (max) / (d_iters),	     \
	  (min) / (d_iters), 1e6 * (d_total_i) / (d_total_s))
|
||||
|
||||
#else
|
||||
typedef uint64_t timing_t;
|
||||
|
||||
/* Measure 1000 times the resolution of the clock. So for a 1ns
|
||||
resolution clock, we measure 1000 iterations of the function call at a
|
||||
time. Measurements close to the minimum clock resolution won't make
|
||||
much sense, but it's better than having nothing at all. */
|
||||
# define TIMING_INIT(iters) \
|
||||
({ \
|
||||
struct timespec start; \
|
||||
clock_getres (CLOCK_PROCESS_CPUTIME_ID, &start); \
|
||||
(iters) = 1000 * start.tv_nsec; \
|
||||
})
|
||||
|
||||
# define TIMING_NOW(var) \
|
||||
({ \
|
||||
struct timespec tv; \
|
||||
clock_gettime (CLOCK_PROCESS_CPUTIME_ID, &tv); \
|
||||
(var) = (uint64_t) (tv.tv_nsec + (uint64_t) 1000000000 * tv.tv_sec); \
|
||||
})
|
||||
|
||||
/* TIMING_DIFF stores END - START in DIFF; TIMING_ACCUM adds DIFF to SUM.
   Each expansion is wrapped in parentheses so that it behaves as a single
   expression: without them, an operator next to the macro call (for
   example `TIMING_DIFF (d, a, b) == x') would bind to part of the
   assignment instead of to its result.  */
# define TIMING_DIFF(diff, start, end) ((diff) = (end) - (start))
# define TIMING_ACCUM(sum, diff) ((sum) += (diff))
|
||||
|
||||
# define TIMING_PRINT_STATS(func, d_total_s, d_iters, d_total_i, max, min) \
|
||||
printf ("%s: ITERS:%g: TOTAL:%gs, MAX:%gns, MIN:%gns, %g iter/s\n", \
|
||||
(func), (d_total_i), (d_total_s) * 1e-9, (max) / (d_iters), \
|
||||
(min) / (d_iters), 1e9 * (d_total_i) / (d_total_s))
|
||||
|
||||
#endif
|
Loading…
Reference in a new issue