tree-wide: add size limits for tmpfs mounts

Limit size of various tmpfs mounts to 10% of RAM, except volatile root and /var
to 25%. Another exception is made for /dev (also /devs for PrivateDevices) and
/sys/fs/cgroup since no (or very few) regular files are expected to be used.

In addition, since directories, symbolic links, device specials and xattrs are
not counted towards the size= limit, number of inodes is also limited
correspondingly: 4MB size translates to 1k of inodes (assuming 4k each), 10% of
RAM (using 16GB of RAM as baseline) translates to 400k and 25% to 1M inodes.

Because nr_inodes option can't use ratios like size option, there's an
unfortunate side effect that with small memory systems the limit may be on the
too large side. Also, on an extremely small device with only 256MB of RAM, 10%
of RAM for /run may not be enough for re-exec of PID1 because 16MB of free
space is required.
This commit is contained in:
Topi Miettinen 2020-04-14 16:39:36 +03:00 committed by Lennart Poettering
parent 7cc8fb3ef0
commit 7d85383edb
8 changed files with 63 additions and 45 deletions

View File

@ -23,6 +23,7 @@
#include "macro.h"
#include "mkdir.h"
#include "mount-setup.h"
#include "mount-util.h"
#include "mountpoint-util.h"
#include "nulstr-util.h"
#include "path-util.h"
@ -60,51 +61,51 @@ typedef struct MountPoint {
#endif
static const MountPoint mount_table[] = {
{ "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
NULL, MNT_FATAL|MNT_IN_CONTAINER },
{ "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
NULL, MNT_FATAL|MNT_IN_CONTAINER },
{ "devtmpfs", "/dev", "devtmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_STRICTATIME,
{ "devtmpfs", "/dev", "devtmpfs", "mode=755" TMPFS_LIMITS_DEV, MS_NOSUID|MS_NOEXEC|MS_STRICTATIME,
NULL, MNT_FATAL|MNT_IN_CONTAINER },
{ "securityfs", "/sys/kernel/security", "securityfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "securityfs", "/sys/kernel/security", "securityfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
NULL, MNT_NONE },
#if ENABLE_SMACK
{ "smackfs", "/sys/fs/smackfs", "smackfs", "smackfsdef=*", MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "smackfs", "/sys/fs/smackfs", "smackfs", "smackfsdef=*", MS_NOSUID|MS_NOEXEC|MS_NODEV,
mac_smack_use, MNT_FATAL },
{ "tmpfs", "/dev/shm", "tmpfs", "mode=1777,smackfsroot=*", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
{ "tmpfs", "/dev/shm", "tmpfs", "mode=1777,smackfsroot=*" TMPFS_LIMITS_DEV_SHM, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
mac_smack_use, MNT_FATAL },
#endif
{ "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
{ "tmpfs", "/dev/shm", "tmpfs", "mode=1777" TMPFS_LIMITS_DEV_SHM, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
NULL, MNT_FATAL|MNT_IN_CONTAINER },
{ "devpts", "/dev/pts", "devpts", "mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC,
{ "devpts", "/dev/pts", "devpts", "mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC,
NULL, MNT_IN_CONTAINER },
#if ENABLE_SMACK
{ "tmpfs", "/run", "tmpfs", "mode=755,smackfsroot=*", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
{ "tmpfs", "/run", "tmpfs", "mode=755,smackfsroot=*" TMPFS_LIMITS_RUN, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
mac_smack_use, MNT_FATAL },
#endif
{ "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
{ "tmpfs", "/run", "tmpfs", "mode=755" TMPFS_LIMITS_RUN, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
NULL, MNT_FATAL|MNT_IN_CONTAINER },
{ "cgroup2", "/sys/fs/cgroup", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "cgroup2", "/sys/fs/cgroup", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV,
cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
{ "cgroup2", "/sys/fs/cgroup", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "cgroup2", "/sys/fs/cgroup", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
{ "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
{ "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755" TMPFS_LIMITS_SYS_FS_CGROUP, MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
{ "cgroup2", "/sys/fs/cgroup/unified", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "cgroup2", "/sys/fs/cgroup/unified", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV,
cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
{ "cgroup2", "/sys/fs/cgroup/unified", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "cgroup2", "/sys/fs/cgroup/unified", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
{ "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd,xattr", MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd,xattr", MS_NOSUID|MS_NOEXEC|MS_NODEV,
cg_is_legacy_wanted, MNT_IN_CONTAINER },
{ "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd", MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd", MS_NOSUID|MS_NOEXEC|MS_NODEV,
cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
{ "pstore", "/sys/fs/pstore", "pstore", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "pstore", "/sys/fs/pstore", "pstore", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
NULL, MNT_NONE },
#if ENABLE_EFI
{ "efivarfs", "/sys/firmware/efi/efivars", "efivarfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "efivarfs", "/sys/firmware/efi/efivars", "efivarfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
is_efi_boot, MNT_NONE },
#endif
{ "bpf", "/sys/fs/bpf", "bpf", "mode=700", MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "bpf", "/sys/fs/bpf", "bpf", "mode=700", MS_NOSUID|MS_NOEXEC|MS_NODEV,
NULL, MNT_NONE, },
};
@ -352,7 +353,7 @@ int mount_cgroup_controllers(void) {
}
/* Now that we mounted everything, let's make the tmpfs the cgroup file systems are mounted into read-only. */
(void) mount("tmpfs", "/sys/fs/cgroup", "tmpfs", MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
(void) mount("tmpfs", "/sys/fs/cgroup", "tmpfs", MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755" TMPFS_LIMITS_SYS_FS_CGROUP);
return 0;
}

View File

@ -130,9 +130,9 @@ static const MountEntry protect_home_read_only_table[] = {
/* ProtectHome=tmpfs table */
static const MountEntry protect_home_tmpfs_table[] = {
{ "/home", TMPFS, true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
{ "/run/user", TMPFS, true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
{ "/root", TMPFS, true, .read_only = true, .options_const = "mode=0700", .flags = MS_NODEV|MS_STRICTATIME },
{ "/home", TMPFS, true, .read_only = true, .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME },
{ "/run/user", TMPFS, true, .read_only = true, .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME },
{ "/root", TMPFS, true, .read_only = true, .options_const = "mode=0700" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME },
};
/* ProtectHome=yes table */
@ -295,7 +295,7 @@ static int append_empty_dir_mounts(MountEntry **p, char **strv) {
.mode = EMPTY_DIR,
.ignore = false,
.read_only = true,
.options_const = "mode=755",
.options_const = "mode=755" TMPFS_LIMITS_EMPTY_OR_ALMOST,
.flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
};
}
@ -341,7 +341,7 @@ static int append_tmpfs_mounts(MountEntry **p, const TemporaryFileSystem *tmpfs,
"Path is not absolute: %s",
t->path);
str = strjoin("mode=0755,", t->options);
str = strjoin("mode=0755" TMPFS_LIMITS_TEMPORARY_FS ",", t->options);
if (!str)
return -ENOMEM;
@ -686,7 +686,7 @@ static int mount_private_dev(MountEntry *m) {
dev = strjoina(temporary_mount, "/dev");
(void) mkdir(dev, 0755);
if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755" TMPFS_LIMITS_DEV) < 0) {
r = log_debug_errno(errno, "Failed to mount tmpfs on '%s': %m", dev);
goto fail;
}

View File

@ -809,7 +809,7 @@ static int add_volatile_var(void) {
"/var",
NULL,
"tmpfs",
"mode=0755",
"mode=0755" TMPFS_LIMITS_VAR,
0,
0,
SPECIAL_LOCAL_FS_TARGET,

View File

@ -319,7 +319,7 @@ static int mount_legacy_cgns_supported(
* uid/gid as seen from e.g. /proc/1/mountinfo. So we simply
* pass uid 0 and not uid_shift to tmpfs_patch_options().
*/
r = tmpfs_patch_options("mode=755", 0, selinux_apifs_context, &options);
r = tmpfs_patch_options("mode=755" TMPFS_LIMITS_SYS_FS_CGROUP, 0, selinux_apifs_context, &options);
if (r < 0)
return log_oom();
@ -421,7 +421,7 @@ static int mount_legacy_cgns_unsupported(
if (r == 0) {
_cleanup_free_ char *options = NULL;
r = tmpfs_patch_options("mode=755", uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &options);
r = tmpfs_patch_options("mode=755" TMPFS_LIMITS_SYS_FS_CGROUP, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &options);
if (r < 0)
return log_oom();

View File

@ -602,25 +602,25 @@ int mount_all(const char *dest,
MOUNT_IN_USERNS|MOUNT_MKDIR },
/* Then we list outer child mounts (i.e. mounts applied *before* entering user namespacing) */
{ "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
{ "tmpfs", "/tmp", "tmpfs", "mode=1777" TMPFS_LIMITS_TMP, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
MOUNT_FATAL|MOUNT_APPLY_TMPFS_TMP|MOUNT_MKDIR },
{ "tmpfs", "/sys", "tmpfs", "mode=555", MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "tmpfs", "/sys", "tmpfs", "mode=555" TMPFS_LIMITS_SYS, MS_NOSUID|MS_NOEXEC|MS_NODEV,
MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS|MOUNT_MKDIR },
{ "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV,
MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO|MOUNT_MKDIR }, /* skipped if above was mounted */
{ "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
MOUNT_FATAL|MOUNT_MKDIR }, /* skipped if above was mounted */
{ "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,
{ "tmpfs", "/dev", "tmpfs", "mode=755" TMPFS_LIMITS_DEV, MS_NOSUID|MS_STRICTATIME,
MOUNT_FATAL|MOUNT_MKDIR },
{ "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
{ "tmpfs", "/dev/shm", "tmpfs", "mode=1777" TMPFS_LIMITS_DEV_SHM, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
MOUNT_FATAL|MOUNT_MKDIR },
{ "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
{ "tmpfs", "/run", "tmpfs", "mode=755" TMPFS_LIMITS_RUN, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
MOUNT_FATAL|MOUNT_MKDIR },
#if HAVE_SELINUX
{ "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,
{ "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,
0 }, /* Bind mount first */
{ NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
{ NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
0 }, /* Then, make it r/o */
#endif
};
@ -1023,7 +1023,7 @@ static int setup_volatile_state(const char *directory, uid_t uid_shift, const ch
if (r < 0 && errno != EEXIST)
return log_error_errno(errno, "Failed to create %s: %m", directory);
options = "mode=755";
options = "mode=755" TMPFS_LIMITS_VOLATILE_STATE;
r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf);
if (r < 0)
return log_oom();
@ -1068,7 +1068,7 @@ static int setup_volatile_yes(const char *directory, uid_t uid_shift, const char
if (!mkdtemp(template))
return log_error_errno(errno, "Failed to create temporary directory: %m");
options = "mode=755";
options = "mode=755" TMPFS_LIMITS_ROOTFS;
r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf);
if (r < 0)
goto fail;
@ -1135,7 +1135,7 @@ static int setup_volatile_overlay(const char *directory, uid_t uid_shift, const
if (!mkdtemp(template))
return log_error_errno(errno, "Failed to create temporary directory: %m");
options = "mode=755";
options = "mode=755" TMPFS_LIMITS_ROOTFS;
r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf);
if (r < 0)
goto finish;

View File

@ -6,6 +6,23 @@
#include "macro.h"
/* 4MB for contents of regular files, 64k inodes for directories, symbolic links and device specials,
using large storage array systems as a baseline */
#define TMPFS_LIMITS_DEV ",size=4m,nr_inodes=64k"
/* Very little, if any use expected */
#define TMPFS_LIMITS_EMPTY_OR_ALMOST ",size=4m,nr_inodes=1k"
#define TMPFS_LIMITS_SYS TMPFS_LIMITS_EMPTY_OR_ALMOST
#define TMPFS_LIMITS_SYS_FS_CGROUP TMPFS_LIMITS_EMPTY_OR_ALMOST
/* 10% of RAM (using 16GB of RAM as a baseline) translates to 400k inodes (assuming 4k each) and 25%
translates to 1M inodes */
#define TMPFS_LIMITS_TMP ",size=10%,nr_inodes=400k"
#define TMPFS_LIMITS_DEV_SHM TMPFS_LIMITS_TMP
#define TMPFS_LIMITS_RUN TMPFS_LIMITS_TMP
#define TMPFS_LIMITS_TEMPORARY_FS TMPFS_LIMITS_TMP
#define TMPFS_LIMITS_VAR ",size=25%,nr_inodes=1m"
#define TMPFS_LIMITS_ROOTFS TMPFS_LIMITS_VAR
#define TMPFS_LIMITS_VOLATILE_STATE TMPFS_LIMITS_VAR
int repeat_unmount(const char *path, int flags);
int umount_recursive(const char *target, int flags);
int bind_remount_recursive(const char *prefix, unsigned long new_flags, unsigned long flags_mask, char **blacklist);

View File

@ -29,7 +29,7 @@ static int make_volatile(const char *path) {
if (r < 0)
return log_error_errno(r, "Couldn't generate volatile sysroot directory: %m");
r = mount_verbose(LOG_ERR, "tmpfs", "/run/systemd/volatile-sysroot", "tmpfs", MS_STRICTATIME, "mode=755");
r = mount_verbose(LOG_ERR, "tmpfs", "/run/systemd/volatile-sysroot", "tmpfs", MS_STRICTATIME, "mode=755" TMPFS_LIMITS_ROOTFS);
if (r < 0)
goto finish_rmdir;
@ -80,7 +80,7 @@ static int make_overlay(const char *path) {
if (r < 0)
return log_error_errno(r, "Couldn't create overlay sysroot directory: %m");
r = mount_verbose(LOG_ERR, "tmpfs", "/run/systemd/overlay-sysroot", "tmpfs", MS_STRICTATIME, "mode=755");
r = mount_verbose(LOG_ERR, "tmpfs", "/run/systemd/overlay-sysroot", "tmpfs", MS_STRICTATIME, "mode=755" TMPFS_LIMITS_ROOTFS);
if (r < 0)
goto finish;

View File

@ -22,4 +22,4 @@ After=swap.target
What=tmpfs
Where=/tmp
Type=tmpfs
Options=mode=1777,strictatime,nosuid,nodev
Options=mode=1777,strictatime,nosuid,nodev,size=10%,nr_inodes=400k