malloc: Add Huge Page support for mmap

With the morecore hook removed, there is no easy way to provide huge
page support with the glibc allocator without resorting to transparent
huge pages.  And some users and programs do prefer to use huge pages
directly instead of THP for multiple reasons: no splitting or re-merging
by the VM, no TLB shootdowns for running processes, fast allocation
from the reserve pool, no competition with the rest of the processes
unlike THP, no swapping at all, etc.

This patch extends the 'glibc.malloc.hugetlb' tunable: the value
'2' means to use huge pages directly with the system default size,
while a larger positive value means a specific page size that is matched
against the ones supported by the system.

Currently only memory allocated on sysmalloc() is handled; the arenas
still use the default system page size.

To test it, a new rule, tests-malloc-hugetlb2, is added, which runs the
added tests with the required GLIBC_TUNABLES setting.  On systems without
a reserved huge pages pool, it just stresses the mmap(MAP_HUGETLB)
allocation failure.  To improve test coverage it is required to create
a pool with some allocated pages.

Checked on x86_64-linux-gnu.

Reviewed-by: DJ Delorie <dj@redhat.com>
This commit is contained in:
Adhemerval Zanella 2021-08-16 15:08:27 -03:00
parent 6cc3ccc67e
commit 98d5fcb8d0
11 changed files with 207 additions and 15 deletions

8
NEWS
View file

@ -93,9 +93,11 @@ Major new features:
configuration.
* On Linux, a new tunable, glibc.malloc.hugetlb, can be used to
make malloc issue madvise plus MADV_HUGEPAGE on mmap and sbrk calls.
Setting this might improve performance with Transparent Huge Pages madvise
mode depending of the workload.
either make malloc issue madvise plus MADV_HUGEPAGE on mmap and sbrk
or to use huge pages directly with mmap calls with the MAP_HUGETLB
flag. The former can improve performance when Transparent Huge Pages
is set to 'madvise' mode while the latter uses the system reserved
huge pages.
Deprecated and removed features, and other changes affecting compatibility:

17
Rules
View file

@ -158,6 +158,7 @@ tests: $(tests:%=$(objpfx)%.out) $(tests-internal:%=$(objpfx)%.out) \
$(tests-mcheck:%=$(objpfx)%-mcheck.out) \
$(tests-malloc-check:%=$(objpfx)%-malloc-check.out) \
$(tests-malloc-hugetlb1:%=$(objpfx)%-malloc-hugetlb1.out) \
$(tests-malloc-hugetlb2:%=$(objpfx)%-malloc-hugetlb2.out) \
$(tests-special) $(tests-printers-out)
xtests: tests $(xtests:%=$(objpfx)%.out) $(xtests-special)
endif
@ -170,6 +171,7 @@ else
tests-expected = $(tests) $(tests-internal) $(tests-printers) \
$(tests-container) $(tests-malloc-check:%=%-malloc-check) \
$(tests-malloc-hugetlb1:%=%-malloc-hugetlb1) \
$(tests-malloc-hugetlb2:%=%-malloc-hugetlb2) \
$(tests-mcheck:%=%-mcheck)
endif
tests:
@ -199,6 +201,7 @@ endif
binaries-mcheck-tests = $(tests-mcheck:%=%-mcheck)
binaries-malloc-check-tests = $(tests-malloc-check:%=%-malloc-check)
binaries-malloc-hugetlb1-tests = $(tests-malloc-hugetlb1:%=%-malloc-hugetlb1)
binaries-malloc-hugetlb2-tests = $(tests-malloc-hugetlb2:%=%-malloc-hugetlb2)
else
binaries-all-notests =
binaries-all-tests = $(tests) $(tests-internal) $(xtests) $(test-srcs)
@ -211,6 +214,7 @@ binaries-pie-notests =
binaries-mcheck-tests =
binaries-malloc-check-tests =
binaries-malloc-hugetlb1-tests =
binaries-malloc-hugetlb2-tests =
endif
binaries-pie = $(binaries-pie-tests) $(binaries-pie-notests)
@ -259,6 +263,14 @@ $(addprefix $(objpfx),$(binaries-malloc-hugetlb1-tests)): %-malloc-hugetlb1: %.o
$(+link-tests)
endif
ifneq "$(strip $(binaries-malloc-hugetlb2-tests))" ""
$(addprefix $(objpfx),$(binaries-malloc-hugetlb2-tests)): %-malloc-hugetlb2: %.o \
$(link-extra-libs-tests) \
$(sort $(filter $(common-objpfx)lib%,$(link-libc))) \
$(addprefix $(csu-objpfx),start.o) $(+preinit) $(+postinit)
$(+link-tests)
endif
ifneq "$(strip $(binaries-pie-tests))" ""
$(addprefix $(objpfx),$(binaries-pie-tests)): %: %.o \
$(link-extra-libs-tests) \
@ -302,6 +314,11 @@ $(1)-malloc-hugetlb1-ENV += GLIBC_TUNABLES=glibc.malloc.hugetlb=1
endef
$(foreach t,$(tests-malloc-hugetlb1),$(eval $(call malloc-hugetlb1-ENVS,$(t))))
# All malloc-hugetlb2 tests will be run with GLIBC_TUNABLE=glibc.malloc.hugetlb=2
define malloc-hugetlb2-ENVS
$(1)-malloc-hugetlb2-ENV += GLIBC_TUNABLES=glibc.malloc.hugetlb=2
endef
$(foreach t,$(tests-malloc-hugetlb2),$(eval $(call malloc-hugetlb2-ENVS,$(t))))
# mcheck tests need the debug DSO to support -lmcheck.
define mcheck-ENVS

View file

@ -93,9 +93,8 @@ glibc {
security_level: SXID_IGNORE
}
hugetlb {
type: INT_32
type: SIZE_T
minval: 0
maxval: 1
}
}
cpu {

View file

@ -1,7 +1,7 @@
glibc.malloc.arena_max: 0x0 (min: 0x1, max: 0x[f]+)
glibc.malloc.arena_test: 0x0 (min: 0x1, max: 0x[f]+)
glibc.malloc.check: 0 (min: 0, max: 3)
glibc.malloc.hugetlb: 0 (min: 0, max: 1)
glibc.malloc.hugetlb: 0x0 (min: 0x0, max: 0x[f]+)
glibc.malloc.mmap_max: 0 (min: 0, max: 2147483647)
glibc.malloc.mmap_threshold: 0x0 (min: 0x0, max: 0x[f]+)
glibc.malloc.mxfast: 0x0 (min: 0x0, max: 0x[f]+)

View file

@ -78,9 +78,9 @@ tests-exclude-malloc-check = tst-malloc-check tst-malloc-usable \
tests-malloc-check = $(filter-out $(tests-exclude-malloc-check) \
$(tests-static),$(tests))
# Run all testes with GLIBC_TUNABLES=glibc.malloc.hugetlb=1 that check the
# Transparent Huge Pages support. We need exclude some tests that define
# the ENV vars.
# Run all tests with GLIBC_TUNABLES=glibc.malloc.hugetlb={1,2} which check
# the Transparent Huge Pages support (1) or automatic huge page support (2).
# We need to exclude some tests that define the ENV vars.
tests-exclude-hugetlb1 = \
tst-compathooks-off \
tst-compathooks-on \
@ -93,6 +93,8 @@ tests-exclude-hugetlb1 = \
tst-mallocstate
tests-malloc-hugetlb1 = \
$(filter-out $(tests-exclude-hugetlb1), $(tests))
tests-malloc-hugetlb2 = \
$(filter-out $(tests-exclude-hugetlb1), $(tests))
# -lmcheck needs __malloc_initialize_hook, which was deprecated in 2.24.
ifeq ($(have-GLIBC_2.23)$(build-shared),yesyes)

View file

@ -230,7 +230,7 @@ TUNABLE_CALLBACK_FNDECL (set_tcache_count, size_t)
TUNABLE_CALLBACK_FNDECL (set_tcache_unsorted_limit, size_t)
#endif
TUNABLE_CALLBACK_FNDECL (set_mxfast, size_t)
TUNABLE_CALLBACK_FNDECL (set_hugetlb, int32_t)
TUNABLE_CALLBACK_FNDECL (set_hugetlb, size_t)
#else
/* Initialization routine. */
#include <string.h>
@ -331,7 +331,7 @@ ptmalloc_init (void)
TUNABLE_CALLBACK (set_tcache_unsorted_limit));
# endif
TUNABLE_GET (mxfast, size_t, TUNABLE_CALLBACK (set_mxfast));
TUNABLE_GET (hugetlb, int32_t, TUNABLE_CALLBACK (set_hugetlb));
TUNABLE_GET (hugetlb, size_t, TUNABLE_CALLBACK (set_hugetlb));
#else
if (__glibc_likely (_environ != NULL))
{

View file

@ -1883,6 +1883,10 @@ struct malloc_par
#if HAVE_TUNABLES
/* Transparent Large Page support. */
INTERNAL_SIZE_T thp_pagesize;
/* A value different than 0 means to align mmap allocations to hp_pagesize
and to add hp_flags to the mmap flags. */
INTERNAL_SIZE_T hp_pagesize;
int hp_flags;
#endif
/* Memory map support */
@ -2440,7 +2444,10 @@ sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av)
if (mm == MAP_FAILED)
return mm;
madvise_thp (mm, size);
#ifdef MAP_HUGETLB
if (!(extra_flags & MAP_HUGETLB))
madvise_thp (mm, size);
#endif
/*
The offset to the start of the mmapped region is stored in the prev_size
@ -2528,7 +2535,18 @@ sysmalloc (INTERNAL_SIZE_T nb, mstate av)
|| ((unsigned long) (nb) >= (unsigned long) (mp_.mmap_threshold)
&& (mp_.n_mmaps < mp_.n_mmaps_max)))
{
char *mm = sysmalloc_mmap (nb, pagesize, 0, av);
char *mm;
#if HAVE_TUNABLES
if (mp_.hp_pagesize > 0 && nb >= mp_.hp_pagesize)
{
/* There is no need to issue the THP madvise call if Huge Pages are
used directly. */
mm = sysmalloc_mmap (nb, mp_.hp_pagesize, mp_.hp_flags, av);
if (mm != MAP_FAILED)
return mm;
}
#endif
mm = sysmalloc_mmap (nb, pagesize, 0, av);
if (mm != MAP_FAILED)
return mm;
tried_mmap = true;
@ -2609,7 +2627,9 @@ sysmalloc (INTERNAL_SIZE_T nb, mstate av)
}
else if (!tried_mmap)
{
/* We can at least try to use to mmap memory. */
/* We can at least try to use to mmap memory. If new_heap fails
it is unlikely that trying to allocate huge pages will
succeed. */
char *mm = sysmalloc_mmap (nb, pagesize, 0, av);
if (mm != MAP_FAILED)
return mm;
@ -5383,7 +5403,7 @@ do_set_mxfast (size_t value)
#if HAVE_TUNABLES
static __always_inline int
do_set_hugetlb (int32_t value)
do_set_hugetlb (size_t value)
{
if (value == 1)
{
@ -5395,6 +5415,9 @@ do_set_hugetlb (int32_t value)
if (thp_mode == malloc_thp_mode_madvise)
mp_.thp_pagesize = __malloc_default_thp_pagesize ();
}
else if (value >= 2)
__malloc_hugepage_config (value == 2 ? 0 : value, &mp_.hp_pagesize,
&mp_.hp_flags);
return 0;
}
#endif

View file

@ -278,6 +278,13 @@ default value is @code{0}, which disables any additional support on
Setting its value to @code{1} enables the use of @code{madvise} with
@code{MADV_HUGEPAGE} after memory allocation with @code{mmap}. It is enabled
only if the system supports Transparent Huge Page (currently only on Linux).
Setting its value to @code{2} enables the use of huge pages directly with
@code{mmap} by passing the @code{MAP_HUGETLB} flag. The huge page size
to use will be the default one provided by the system. A value larger than
@code{2} specifies the huge page size, which will be matched against the
sizes supported by the system. If the provided value is invalid,
@code{MAP_HUGETLB} will not be used.
@end deftp
@node Dynamic Linking Tunables

View file

@ -29,3 +29,11 @@ __malloc_thp_mode (void)
{
return malloc_thp_mode_not_supported;
}
/* Huge page configuration for this variant: REQUESTED is ignored and both
   outputs are cleared.  A PAGESIZE of 0 means no huge page size is
   reported, and FLAGS of 0 means no extra mmap flags are requested.  */
void
__malloc_hugepage_config (size_t requested, size_t *pagesize, int *flags)
{
  *flags = 0;
  *pagesize = 0;
}

View file

@ -34,4 +34,11 @@ enum malloc_thp_mode_t
enum malloc_thp_mode_t __malloc_thp_mode (void) attribute_hidden;
/* Return the supported huge page size matching the REQUESTED size on
PAGESIZE, along with the required extra mmap flags on FLAGS.  Requesting
the value of 0 returns the default huge page size, otherwise the value
will be matched against the sizes supported by the system. */
void __malloc_hugepage_config (size_t requested, size_t *pagesize, int *flags)
attribute_hidden;
#endif /* _MALLOC_HUGEPAGES_H */

View file

@ -17,8 +17,10 @@
not, see <https://www.gnu.org/licenses/>. */
#include <intprops.h>
#include <dirent.h>
#include <malloc-hugepages.h>
#include <not-cancel.h>
#include <sys/mman.h>
unsigned long int
__malloc_default_thp_pagesize (void)
@ -72,3 +74,128 @@ __malloc_thp_mode (void)
}
return malloc_thp_mode_not_supported;
}
/* Return the system default huge page size in bytes, taken from the
   "Hugepagesize:" line of /proc/meminfo.  Return 0 if the file cannot be
   opened or the line is not found.  */
static size_t
malloc_default_hugepage_size (void)
{
  int fd = __open64_nocancel ("/proc/meminfo", O_RDONLY);
  if (fd == -1)
    return 0;

  size_t hpsize = 0;

  /* The file is scanned through a small window; OFF is the file offset at
     which the next window starts.  */
  char buf[512];
  off64_t off = 0;
  while (1)
    {
      ssize_t r = __pread64_nocancel (fd, buf, sizeof (buf) - 1, off);
      if (r <= 0)
        break;
      buf[r] = '\0';

      const char *s = strstr (buf, "Hugepagesize:");
      if (s == NULL)
        {
          /* The tag is not in this window.  Restart right after the last
             complete line, so that a tag split across two windows is seen
             whole on the next read.  */
          char *nl = strrchr (buf, '\n');
          if (nl == NULL)
            break;
          off += (nl + 1) - buf;
          continue;
        }

      /* The tag was found, but its line may have been cut by the end of
         the window, which would yield a truncated number (or no number at
         all).  Re-read starting at the tag so the whole line is in the
         window.  S != BUF guarantees the offset advances; if the tag is
         already at the start of the window there is nothing more to gain
         from re-reading.  */
      if (s != buf && strchr (s, '\n') == NULL)
        {
          off += s - buf;
          continue;
        }

      /* The default huge page size is in the form:
         Hugepagesize:       NUMBER kB
         Only the tag itself is skipped here (the loop below ignores the
         padding blanks), so S never moves past the terminating NUL.  */
      s += sizeof ("Hugepagesize:") - 1;
      for (int i = 0; (s[i] >= '0' && s[i] <= '9') || s[i] == ' '; i++)
        {
          if (s[i] == ' ')
            continue;
          hpsize *= 10;
          hpsize += s[i] - '0';
        }
      /* The value is reported in kB.  */
      hpsize *= 1024;
      break;
    }

  __close_nocancel (fd);

  return hpsize;
}
/* Return the extra mmap flags that request huge pages of PAGESIZE bytes:
   MAP_HUGETLB plus the number of trailing zero bits of PAGESIZE (its
   base-2 logarithm when PAGESIZE is a power of two) stored at
   MAP_HUGE_SHIFT.  PAGESIZE must not be 0, since the count of trailing
   zeros is undefined for it.  */
static inline int
hugepage_flags (size_t pagesize)
{
  /* Do the shift on an unsigned value: for large page sizes (e.g. 16 GiB,
     34 << 26) the encoded field does not fit in a positive int and a
     signed left shift would overflow.  */
  unsigned int log2size = (unsigned int) __builtin_ctzll (pagesize);
  return (int) (MAP_HUGETLB | (log2size << MAP_HUGE_SHIFT));
}
/* Select the huge page configuration for malloc.  On return *PAGESIZE is
   the huge page size to use (0 if none) and *FLAGS the extra mmap flags
   for it (0 if none).  A REQUESTED value of 0 asks for the system default
   huge page size; any other value is looked up among the
   hugepages-<size>kB entries of /sys/kernel/mm/hugepages and is accepted
   only on an exact match.  */
void
__malloc_hugepage_config (size_t requested, size_t *pagesize, int *flags)
{
  enum { prefix_len = sizeof ("hugepages-") - 1 };

  /* Start from "no huge pages"; only a successful lookup overrides it.  */
  *pagesize = 0;
  *flags = 0;

  if (requested == 0)
    {
      *pagesize = malloc_default_hugepage_size ();
      if (*pagesize != 0)
        *flags = hugepage_flags (*pagesize);
      return;
    }

  /* Each entry represents a supported huge page in the form of:
     hugepages-<size>kB.  */
  int dirfd = __open64_nocancel ("/sys/kernel/mm/hugepages",
                                 O_RDONLY | O_DIRECTORY, 0);
  if (dirfd == -1)
    return;

#if !IS_IN(libc)
# define __getdents64 getdents64
#endif

  bool matched = false;
  char buffer[1024];
  while (!matched)
    {
      ssize_t nread = __getdents64 (dirfd, buffer, sizeof (buffer));
      /* Stop on error as well as at the end of the directory.  */
      if (nread == -1 || nread == 0)
        break;

      char *cur = buffer;
      char *const limit = buffer + nread;
      while (cur != limit)
        {
          /* The records are packed in a char buffer, so the record length
             is copied out instead of being read through a struct
             pointer.  */
          unsigned short int reclen;
          memcpy (&reclen, cur + offsetof (struct dirent64, d_reclen),
                  sizeof (reclen));
          const char *name = cur + offsetof (struct dirent64, d_name);
          cur += reclen;

          if (name[0] == '.'
              || strncmp (name, "hugepages-", prefix_len) != 0)
            continue;

          /* Decode the decimal size that follows the prefix; it is
             expressed in kB.  */
          size_t candidate = 0;
          for (const char *p = name + prefix_len; *p >= '0' && *p <= '9'; p++)
            candidate = candidate * 10 + (*p - '0');
          candidate *= 1024;

          if (candidate == requested)
            {
              *pagesize = candidate;
              *flags = hugepage_flags (*pagesize);
              matched = true;
              break;
            }
        }
    }

  __close_nocancel (dirfd);
}