From 7d85383edbab73274dc81cc888d884bb01070bc2 Mon Sep 17 00:00:00 2001 From: Topi Miettinen Date: Tue, 14 Apr 2020 16:39:36 +0300 Subject: [PATCH] tree-wide: add size limits for tmpfs mounts Limit size of various tmpfs mounts to 10% of RAM, except volatile root and /var to 25%. Another exception is made for /dev (also /devs for PrivateDevices) and /sys/fs/cgroup since no (or very few) regular files are expected to be used. In addition, since directories, symbolic links, device specials and xattrs are not counted towards the size= limit, number of inodes is also limited correspondingly: 4MB size translates to 1k of inodes (assuming 4k each), 10% of RAM (using 16GB of RAM as baseline) translates to 400k and 25% to 1M inodes. Because nr_inodes option can't use ratios like size option, there's an unfortunate side effect that with small memory systems the limit may be on the too large side. Also, on an extremely small device with only 256MB of RAM, 10% of RAM for /run may not be enough for re-exec of PID1 because 16MB of free space is required. --- src/core/mount-setup.c | 43 ++++++++++++++------------- src/core/namespace.c | 12 ++++---- src/fstab-generator/fstab-generator.c | 2 +- src/nspawn/nspawn-cgroup.c | 4 +-- src/nspawn/nspawn-mount.c | 24 +++++++-------- src/shared/mount-util.h | 17 +++++++++++ src/volatile-root/volatile-root.c | 4 +-- units/tmp.mount | 2 +- 8 files changed, 63 insertions(+), 45 deletions(-) diff --git a/src/core/mount-setup.c b/src/core/mount-setup.c index ffe3d4cc64..dd44d5eb92 100644 --- a/src/core/mount-setup.c +++ b/src/core/mount-setup.c @@ -23,6 +23,7 @@ #include "macro.h" #include "mkdir.h" #include "mount-setup.h" +#include "mount-util.h" #include "mountpoint-util.h" #include "nulstr-util.h" #include "path-util.h" @@ -60,51 +61,51 @@ typedef struct MountPoint { #endif static const MountPoint mount_table[] = { - { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, + { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL, MNT_FATAL|MNT_IN_CONTAINER }, - { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, + { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL, MNT_FATAL|MNT_IN_CONTAINER }, - { "devtmpfs", "/dev", "devtmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_STRICTATIME, + { "devtmpfs", "/dev", "devtmpfs", "mode=755" TMPFS_LIMITS_DEV, MS_NOSUID|MS_NOEXEC|MS_STRICTATIME, NULL, MNT_FATAL|MNT_IN_CONTAINER }, - { "securityfs", "/sys/kernel/security", "securityfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, + { "securityfs", "/sys/kernel/security", "securityfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL, MNT_NONE }, #if ENABLE_SMACK - { "smackfs", "/sys/fs/smackfs", "smackfs", "smackfsdef=*", MS_NOSUID|MS_NOEXEC|MS_NODEV, + { "smackfs", "/sys/fs/smackfs", "smackfs", "smackfsdef=*", MS_NOSUID|MS_NOEXEC|MS_NODEV, mac_smack_use, MNT_FATAL }, - { "tmpfs", "/dev/shm", "tmpfs", "mode=1777,smackfsroot=*", MS_NOSUID|MS_NODEV|MS_STRICTATIME, + { "tmpfs", "/dev/shm", "tmpfs", "mode=1777,smackfsroot=*" TMPFS_LIMITS_DEV_SHM, MS_NOSUID|MS_NODEV|MS_STRICTATIME, mac_smack_use, MNT_FATAL }, #endif - { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, + { "tmpfs", "/dev/shm", "tmpfs", "mode=1777" TMPFS_LIMITS_DEV_SHM, MS_NOSUID|MS_NODEV|MS_STRICTATIME, NULL, MNT_FATAL|MNT_IN_CONTAINER }, - { "devpts", "/dev/pts", "devpts", "mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, + { "devpts", "/dev/pts", "devpts", "mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, NULL, MNT_IN_CONTAINER }, #if ENABLE_SMACK - { "tmpfs", "/run", "tmpfs", "mode=755,smackfsroot=*", MS_NOSUID|MS_NODEV|MS_STRICTATIME, + { "tmpfs", "/run", "tmpfs", "mode=755,smackfsroot=*" TMPFS_LIMITS_RUN, MS_NOSUID|MS_NODEV|MS_STRICTATIME, mac_smack_use, MNT_FATAL }, #endif - { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, + { "tmpfs", "/run", "tmpfs", "mode=755" TMPFS_LIMITS_RUN, MS_NOSUID|MS_NODEV|MS_STRICTATIME, NULL, MNT_FATAL|MNT_IN_CONTAINER }, - { "cgroup2", "/sys/fs/cgroup", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV, + { "cgroup2", "/sys/fs/cgroup", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV, cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE }, - { "cgroup2", "/sys/fs/cgroup", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, + { "cgroup2", "/sys/fs/cgroup", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE }, - { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, + { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755" TMPFS_LIMITS_SYS_FS_CGROUP, MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER }, - { "cgroup2", "/sys/fs/cgroup/unified", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV, + { "cgroup2", "/sys/fs/cgroup/unified", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV, cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE }, - { "cgroup2", "/sys/fs/cgroup/unified", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, + { "cgroup2", "/sys/fs/cgroup/unified", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE }, - { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd,xattr", MS_NOSUID|MS_NOEXEC|MS_NODEV, + { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd,xattr", MS_NOSUID|MS_NOEXEC|MS_NODEV, cg_is_legacy_wanted, MNT_IN_CONTAINER }, - { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd", MS_NOSUID|MS_NOEXEC|MS_NODEV, + { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd", MS_NOSUID|MS_NOEXEC|MS_NODEV, cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER }, - { "pstore", "/sys/fs/pstore", "pstore", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, + { "pstore", "/sys/fs/pstore", "pstore", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL, MNT_NONE }, #if ENABLE_EFI - { "efivarfs", "/sys/firmware/efi/efivars", "efivarfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, + { "efivarfs", "/sys/firmware/efi/efivars", "efivarfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, is_efi_boot, MNT_NONE }, #endif - { "bpf", "/sys/fs/bpf", "bpf", "mode=700", MS_NOSUID|MS_NOEXEC|MS_NODEV, + { "bpf", "/sys/fs/bpf", "bpf", "mode=700", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL, MNT_NONE, }, }; @@ -352,7 +353,7 @@ int mount_cgroup_controllers(void) { } /* Now that we mounted everything, let's make the tmpfs the cgroup file systems are mounted into read-only. */ - (void) mount("tmpfs", "/sys/fs/cgroup", "tmpfs", MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755"); + (void) mount("tmpfs", "/sys/fs/cgroup", "tmpfs", MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755" TMPFS_LIMITS_SYS_FS_CGROUP); return 0; } diff --git a/src/core/namespace.c b/src/core/namespace.c index 36bc95a2df..34eb469fb8 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -130,9 +130,9 @@ static const MountEntry protect_home_read_only_table[] = { /* ProtectHome=tmpfs table */ static const MountEntry protect_home_tmpfs_table[] = { - { "/home", TMPFS, true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME }, - { "/run/user", TMPFS, true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME }, - { "/root", TMPFS, true, .read_only = true, .options_const = "mode=0700", .flags = MS_NODEV|MS_STRICTATIME }, + { "/home", TMPFS, true, .read_only = true, .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME }, + { "/run/user", TMPFS, true, .read_only = true, .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME }, + { "/root", TMPFS, true, .read_only = true, .options_const = "mode=0700" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME }, }; /* ProtectHome=yes table */ @@ -295,7 +295,7 @@ static int append_empty_dir_mounts(MountEntry **p, char **strv) { .mode = EMPTY_DIR, .ignore = false, .read_only = true, - .options_const = "mode=755", + .options_const = "mode=755" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, }; } @@ -341,7 +341,7 @@ static int append_tmpfs_mounts(MountEntry **p, const TemporaryFileSystem *tmpfs, "Path is not absolute: %s", t->path); - str = strjoin("mode=0755,", t->options); + str = strjoin("mode=0755" TMPFS_LIMITS_TEMPORARY_FS ",", t->options); if (!str) return -ENOMEM; @@ -686,7 +686,7 @@ static int mount_private_dev(MountEntry *m) { dev = strjoina(temporary_mount, "/dev"); (void) mkdir(dev, 0755); - if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) { + if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755" TMPFS_LIMITS_DEV) < 0) { r = log_debug_errno(errno, "Failed to mount tmpfs on '%s': %m", dev); goto fail; } diff --git a/src/fstab-generator/fstab-generator.c b/src/fstab-generator/fstab-generator.c index 08c7b76dba..bef2f27d3a 100644 --- a/src/fstab-generator/fstab-generator.c +++ b/src/fstab-generator/fstab-generator.c @@ -809,7 +809,7 @@ static int add_volatile_var(void) { "/var", NULL, "tmpfs", - "mode=0755", + "mode=0755" TMPFS_LIMITS_VAR, 0, 0, SPECIAL_LOCAL_FS_TARGET, diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c index dcee851e93..a16ee5c60a 100644 --- a/src/nspawn/nspawn-cgroup.c +++ b/src/nspawn/nspawn-cgroup.c @@ -319,7 +319,7 @@ static int mount_legacy_cgns_supported( * uid/gid as seen from e.g. /proc/1/mountinfo. So we simply * pass uid 0 and not uid_shift to tmpfs_patch_options(). */ - r = tmpfs_patch_options("mode=755", 0, selinux_apifs_context, &options); + r = tmpfs_patch_options("mode=755" TMPFS_LIMITS_SYS_FS_CGROUP, 0, selinux_apifs_context, &options); if (r < 0) return log_oom(); @@ -421,7 +421,7 @@ static int mount_legacy_cgns_unsupported( if (r == 0) { _cleanup_free_ char *options = NULL; - r = tmpfs_patch_options("mode=755", uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &options); + r = tmpfs_patch_options("mode=755" TMPFS_LIMITS_SYS_FS_CGROUP, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &options); if (r < 0) return log_oom(); diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c index 59bd73d2ea..33cc19a425 100644 --- a/src/nspawn/nspawn-mount.c +++ b/src/nspawn/nspawn-mount.c @@ -602,25 +602,25 @@ int mount_all(const char *dest, MOUNT_IN_USERNS|MOUNT_MKDIR }, /* Then we list outer child mounts (i.e. mounts applied *before* entering user namespacing) */ - { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, + { "tmpfs", "/tmp", "tmpfs", "mode=1777" TMPFS_LIMITS_TMP, MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL|MOUNT_APPLY_TMPFS_TMP|MOUNT_MKDIR }, - { "tmpfs", "/sys", "tmpfs", "mode=555", MS_NOSUID|MS_NOEXEC|MS_NODEV, + { "tmpfs", "/sys", "tmpfs", "mode=555" TMPFS_LIMITS_SYS, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS|MOUNT_MKDIR }, - { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, + { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO|MOUNT_MKDIR }, /* skipped if above was mounted */ - { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, + { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_MKDIR }, /* skipped if above was mounted */ - { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, + { "tmpfs", "/dev", "tmpfs", "mode=755" TMPFS_LIMITS_DEV, MS_NOSUID|MS_STRICTATIME, MOUNT_FATAL|MOUNT_MKDIR }, - { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, + { "tmpfs", "/dev/shm", "tmpfs", "mode=1777" TMPFS_LIMITS_DEV_SHM, MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL|MOUNT_MKDIR }, - { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, + { "tmpfs", "/run", "tmpfs", "mode=755" TMPFS_LIMITS_RUN, MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL|MOUNT_MKDIR }, #if HAVE_SELINUX - { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, + { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, 0 }, /* Bind mount first */ - { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, + { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, 0 }, /* Then, make it r/o */ #endif }; @@ -1023,7 +1023,7 @@ static int setup_volatile_state(const char *directory, uid_t uid_shift, const ch if (r < 0 && errno != EEXIST) return log_error_errno(errno, "Failed to create %s: %m", directory); - options = "mode=755"; + options = "mode=755" TMPFS_LIMITS_VOLATILE_STATE; r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf); if (r < 0) return log_oom(); @@ -1068,7 +1068,7 @@ static int setup_volatile_yes(const char *directory, uid_t uid_shift, const char if (!mkdtemp(template)) return log_error_errno(errno, "Failed to create temporary directory: %m"); - options = "mode=755"; + options = "mode=755" TMPFS_LIMITS_ROOTFS; r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf); if (r < 0) goto fail; @@ -1135,7 +1135,7 @@ static int setup_volatile_overlay(const char *directory, uid_t uid_shift, const if (!mkdtemp(template)) return log_error_errno(errno, "Failed to create temporary directory: %m"); - options = "mode=755"; + options = "mode=755" TMPFS_LIMITS_ROOTFS; r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf); if (r < 0) goto finish; diff --git a/src/shared/mount-util.h b/src/shared/mount-util.h index 06fddacf16..024787912b 100644 --- a/src/shared/mount-util.h +++ b/src/shared/mount-util.h @@ -6,6 +6,23 @@ #include "macro.h" +/* 4MB for contents of regular files, 64k inodes for directories, symbolic links and device specials, + using large storage array systems as a baseline */ +#define TMPFS_LIMITS_DEV ",size=4m,nr_inodes=64k" +/* Very little, if any use expected */ +#define TMPFS_LIMITS_EMPTY_OR_ALMOST ",size=4m,nr_inodes=1k" +#define TMPFS_LIMITS_SYS TMPFS_LIMITS_EMPTY_OR_ALMOST +#define TMPFS_LIMITS_SYS_FS_CGROUP TMPFS_LIMITS_EMPTY_OR_ALMOST +/* 10% of RAM (using 16GB of RAM as a baseline) translates to 400k inodes (assuming 4k each) and 25% + translates to 1M inodes */ +#define TMPFS_LIMITS_TMP ",size=10%,nr_inodes=400k" +#define TMPFS_LIMITS_DEV_SHM TMPFS_LIMITS_TMP +#define TMPFS_LIMITS_RUN TMPFS_LIMITS_TMP +#define TMPFS_LIMITS_TEMPORARY_FS TMPFS_LIMITS_TMP +#define TMPFS_LIMITS_VAR ",size=25%,nr_inodes=1m" +#define TMPFS_LIMITS_ROOTFS TMPFS_LIMITS_VAR +#define TMPFS_LIMITS_VOLATILE_STATE TMPFS_LIMITS_VAR + int repeat_unmount(const char *path, int flags); int umount_recursive(const char *target, int flags); int bind_remount_recursive(const char *prefix, unsigned long new_flags, unsigned long flags_mask, char **blacklist); diff --git a/src/volatile-root/volatile-root.c b/src/volatile-root/volatile-root.c index af78a87d6f..e55864d6cc 100644 --- a/src/volatile-root/volatile-root.c +++ b/src/volatile-root/volatile-root.c @@ -29,7 +29,7 @@ static int make_volatile(const char *path) { if (r < 0) return log_error_errno(r, "Couldn't generate volatile sysroot directory: %m"); - r = mount_verbose(LOG_ERR, "tmpfs", "/run/systemd/volatile-sysroot", "tmpfs", MS_STRICTATIME, "mode=755"); + r = mount_verbose(LOG_ERR, "tmpfs", "/run/systemd/volatile-sysroot", "tmpfs", MS_STRICTATIME, "mode=755" TMPFS_LIMITS_ROOTFS); if (r < 0) goto finish_rmdir; @@ -80,7 +80,7 @@ static int make_overlay(const char *path) { if (r < 0) return log_error_errno(r, "Couldn't create overlay sysroot directory: %m"); - r = mount_verbose(LOG_ERR, "tmpfs", "/run/systemd/overlay-sysroot", "tmpfs", MS_STRICTATIME, "mode=755"); + r = mount_verbose(LOG_ERR, "tmpfs", "/run/systemd/overlay-sysroot", "tmpfs", MS_STRICTATIME, "mode=755" TMPFS_LIMITS_ROOTFS); if (r < 0) goto finish; diff --git a/units/tmp.mount b/units/tmp.mount index 27bd0f235c..7066e52261 100644 --- a/units/tmp.mount +++ b/units/tmp.mount @@ -22,4 +22,4 @@ After=swap.target What=tmpfs Where=/tmp Type=tmpfs -Options=mode=1777,strictatime,nosuid,nodev +Options=mode=1777,strictatime,nosuid,nodev,size=10%,nr_inodes=400k