From a8b627aaed409a15260c25988970c795bf963812 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 11 Oct 2018 18:23:26 +0200 Subject: [PATCH] main: bump fs.nr_open + fs.file-max to their largest possible values After discussions with kernel folks, a system with memcg really shouldn't need extra hard limits on file descriptors anymore, as they are properly accounted for by memcg anyway. Hence, let's bump these values to their maximums. This also adds a build time option to turn this off, to cover those users who do not want to use memcg. --- NEWS | 11 +++++++ meson.build | 3 ++ meson_options.txt | 4 +++ src/core/main.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 102 insertions(+) diff --git a/NEWS b/NEWS index c8e6152c2e..d378b08b70 100644 --- a/NEWS +++ b/NEWS @@ -52,6 +52,17 @@ CHANGES WITH 240 in spe: anymore (and neither can any shared library they use — or any shared library used by any shared library they use and so on). + * The fs.nr_open and fs.file-max sysctls are now automatically bumped + to the highest possible values, as separate accounting of file + descriptors is no longer necessary, as memcg tracks them correctly as + part of the memory accounting anyway. Thus, from the four limits on + file descriptors currently enforced (fs.file-max, fs.nr_open, + RLIMIT_NOFILE hard, RLIMIT_NOFILE soft) we turn off the first two, + and keep only the latter two. A set of build-time options + (-Dbump-proc-sys-fs-file-max=no and -Dbump-proc-sys-fs-nr-open=no) + has been added to revert this change in behaviour, which might be + an option for systems that turn off memcg in the kernel. 
+ CHANGES WITH 239: * NETWORK INTERFACE DEVICE NAMING CHANGES: systemd-udevd's "net_id" diff --git a/meson.build b/meson.build index 30834c86e3..ee8ab1ae29 100644 --- a/meson.build +++ b/meson.build @@ -73,6 +73,9 @@ sysvrcnd_path = get_option('sysvrcnd-path') conf.set10('HAVE_SYSV_COMPAT', sysvinit_path != '' and sysvrcnd_path != '', description : 'SysV init scripts and rcN.d links are supported') +conf.set10('BUMP_PROC_SYS_FS_FILE_MAX', get_option('bump-proc-sys-fs-file-max')) +conf.set10('BUMP_PROC_SYS_FS_NR_OPEN', get_option('bump-proc-sys-fs-nr-open')) + # join_paths ignore the preceding arguments if an absolute component is # encountered, so this should canonicalize various paths when they are # absolute or relative. diff --git a/meson_options.txt b/meson_options.txt index 83ade5bea4..b5a20fb0e2 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -49,6 +49,10 @@ option('debug-extra', type : 'array', choices : ['hashmap', 'mmap-cache'], value description : 'enable extra debugging') option('memory-accounting-default', type : 'boolean', description : 'enable MemoryAccounting= by default') +option('bump-proc-sys-fs-file-max', type : 'boolean', + description : 'bump /proc/sys/fs/file-max to ULONG_MAX') +option('bump-proc-sys-fs-nr-open', type : 'boolean', + description : 'bump /proc/sys/fs/nr_open to INT_MAX') option('valgrind', type : 'boolean', value : false, description : 'do extra operations to avoid valgrind warnings') option('log-trace', type : 'boolean', value : false, diff --git a/src/core/main.c b/src/core/main.c index ace0bbb15d..6b910fc91a 100644 --- a/src/core/main.c +++ b/src/core/main.c @@ -73,6 +73,7 @@ #include "stdio-util.h" #include "strv.h" #include "switch-root.h" +#include "sysctl-util.h" #include "terminal-util.h" #include "umask-util.h" #include "user-util.h" @@ -1162,6 +1163,88 @@ static int prepare_reexecute( return 0; } +static void bump_file_max_and_nr_open(void) { + + /* Let's bump fs.file-max and fs.nr_open to their respective 
maximums. On current kernels large numbers of file + * descriptors are no longer a performance problem and their memory is properly tracked by memcg, thus counting + * them and limiting them in another two layers of limits is unnecessary and just complicates things. This + * function hence turns off 2 of the 4 levels of limits on file descriptors, and makes RLIMIT_NOFILE (soft + + * hard) the only ones that really matter. */ + +#if BUMP_PROC_SYS_FS_FILE_MAX || BUMP_PROC_SYS_FS_NR_OPEN + _cleanup_free_ char *t = NULL; + int r; +#endif + +#if BUMP_PROC_SYS_FS_FILE_MAX + /* I so wanted to use STRINGIFY(ULONG_MAX) here, but alas we can't as glibc/gcc define that as + * "(0x7fffffffffffffffL * 2UL + 1UL)". Seriously. 😢 */ + if (asprintf(&t, "%lu\n", ULONG_MAX) < 0) { + log_oom(); + return; + } + + r = sysctl_write("fs/file-max", t); + if (r < 0) + log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, "Failed to bump fs.file-max, ignoring: %m"); +#endif + +#if BUMP_PROC_SYS_FS_FILE_MAX && BUMP_PROC_SYS_FS_NR_OPEN + t = mfree(t); +#endif + +#if BUMP_PROC_SYS_FS_NR_OPEN + int v = INT_MAX; + + /* Arg! The kernel enforces maximum and minimum values on fs.nr_open, but we don't really know what they + * are. The expression by which the maximum is determined is dependent on the architecture, and is something we + * don't really want to copy to userspace, as it is dependent on implementation details of the kernel. Since + * the kernel doesn't expose the maximum value to us, we can only try and hope. Hence, let's start with + * INT_MAX, and then keep halving the value until we find one that works. Ugly? Yes, absolutely, but kernel + * APIs are kernel APIs, so what can we do... 
🤯 */ + + for (;;) { + int k; + + v &= ~(__SIZEOF_POINTER__ - 1); /* Round down to next multiple of the pointer size */ + if (v < 1024) { + log_warning("Can't bump fs.nr_open, value too small."); + break; + } + + k = read_nr_open(); + if (k < 0) { + log_error_errno(k, "Failed to read fs.nr_open: %m"); + break; + } + if (k >= v) { /* Already larger */ + log_debug("Skipping bump, value is already larger."); + break; + } + + if (asprintf(&t, "%i\n", v) < 0) { + log_oom(); + return; + } + + r = sysctl_write("fs/nr_open", t); + t = mfree(t); + if (r == -EINVAL) { + log_debug("Couldn't write fs.nr_open as %i, halving it.", v); + v /= 2; + continue; + } + if (r < 0) { + log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, "Failed to bump fs.nr_open, ignoring: %m"); + break; + } + + log_debug("Successfully bumped fs.nr_open to %i", v); + break; + } +#endif +} + static int bump_rlimit_nofile(struct rlimit *saved_rlimit) { int r, nr; @@ -1883,6 +1966,7 @@ static int initialize_runtime( machine_id_setup(NULL, arg_machine_id, NULL); loopback_setup(); bump_unix_max_dgram_qlen(); + bump_file_max_and_nr_open(); test_usr(); write_container_id(); }