From f3ab1d39b1a5766f4dbbca5fe652df7e23e3c8fe Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 30 Apr 2018 19:38:41 +0200 Subject: [PATCH 1/6] mkosi: add mkosi snippet for ubuntu, too --- .mkosi/mkosi.ubuntu | 72 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 .mkosi/mkosi.ubuntu diff --git a/.mkosi/mkosi.ubuntu b/.mkosi/mkosi.ubuntu new file mode 100644 index 0000000000..bbda5dd054 --- /dev/null +++ b/.mkosi/mkosi.ubuntu @@ -0,0 +1,72 @@ +# SPDX-License-Identifier: LGPL-2.1+ +# +# This file is part of systemd. +# +# Copyright 2016 Daniel Rusek + +# This is a settings file for OS image generation using mkosi (https://github.com/systemd/mkosi). +# Simply invoke "mkosi" in the project directory to build an OS image. + +[Distribution] +Distribution=ubuntu +Release=bionic +Repositories=main,universe + +[Output] +Format=raw_btrfs +Bootable=no + +[Partitions] +RootSize=2G + +[Packages] +BuildPackages= + acl + docbook-xml + docbook-xsl + gcc + gettext + git + gnu-efi + gperf + iptables-dev + libacl1-dev + libaudit-dev + libblkid-dev + libbz2-dev + libcap-dev + libcryptsetup-dev + libcurl4-gnutls-dev + libdbus-1-dev + libdw-dev + libfdisk-dev + libgcrypt20-dev + libgnutls28-dev + libidn2-0-dev + libkmod-dev + liblz4-dev + liblz4-tool + liblzma-dev + libmicrohttpd-dev + libmount-dev + libpam0g-dev + libqrencode-dev + libseccomp-dev + libsmartcols-dev + libtool + libxkbcommon-dev + m4 + meson + pkg-config + python3 + python3-lxml + tree + tzdata + uuid-dev + xsltproc + xz-utils + +Packages= + libqrencode3 + locales + libidn2-0 From 705268414f6ba6aa96c56d6c39b5ebf74426e847 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 18 Apr 2018 21:19:54 +0200 Subject: [PATCH 2/6] seccomp: add new system call filter, suitable as default whitelist for system services Currently we employ mostly system call blacklisting for our system services. 
Let's add a new system call filter group @system-service that helps turning this around into a whitelist by default. The new group is very similar to nspawn's default filter list, but in some ways more restricted (as sethostname() and suchlike shouldn't be available to most system services just like that) and in others more relaxed (for example @keyring is blocked in nspawn since it's not properly virtualized yet in the kernel, but is fine for regular system services). --- man/systemd.exec.xml | 12 +++++++ src/shared/seccomp-util.c | 69 +++++++++++++++++++++++++++++++++++++++ src/shared/seccomp-util.h | 1 + src/test/test-seccomp.c | 3 +- 4 files changed, 84 insertions(+), 1 deletion(-) diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index a17db8d850..3bd790b485 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1490,6 +1490,10 @@ RestrictNamespaces=~cgroup net @sync Synchronizing files and memory to disk: (fsync2, msync2, and related calls) + + @system-service + A reasonable set of system calls used by common system services, excluding any special purpose calls. This is the recommended starting point for whitelisting system calls for system services, as it contains what is typically needed by system services, but excludes overly specific interfaces. For example, the following APIs are excluded: @clock, @mount, @swap, @reboot. + @timer System calls for scheduling operations by time (alarm2, timer_create2, …) @@ -1504,6 +1508,14 @@ RestrictNamespaces=~cgroup net systemd-analyze syscall-filter to list the actual list of system calls in each filter. + Generally, whitelisting system calls (rather than blacklisting) is the safer mode of operation. It is + recommended to enforce system call whitelists for all long-running system services. 
Specifically, the + following lines are a relatively safe basic choice for the majority of system services: + + [Service] +SystemCallFilter=@system-service +SystemCallErrorNumber=EPERM + It is recommended to combine the file system namespacing related options with SystemCallFilter=~@mount, in order to prohibit the unit's processes to undo the mappings. Specifically these are the options PrivateTmp=, diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index 517a1b4509..4a02d8c35f 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -756,6 +756,75 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { "sync_file_range\0" "syncfs\0" }, + [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = { + .name = "@system-service", + .help = "General system service operations", + .value = + "@aio\0" + "@basic-io\0" + "@chown\0" + "@default\0" + "@file-system\0" + "@io-event\0" + "@ipc\0" + "@keyring\0" + "@memlock\0" + "@network-io\0" + "@process\0" + "@resources\0" + "@setuid\0" + "@signal\0" + "@sync\0" + "@timer\0" + "brk\0" + "capget\0" + "capset\0" + "copy_file_range\0" + "fadvise64\0" + "fadvise64_64\0" + "flock\0" + "get_mempolicy\0" + "getcpu\0" + "getpriority\0" + "getrandom\0" + "ioctl\0" + "ioprio_get\0" + "kcmp\0" + "madvise\0" + "mincore\0" + "mprotect\0" + "mremap\0" + "name_to_handle_at\0" + "oldolduname\0" + "olduname\0" + "personality\0" + "readahead\0" + "readdir\0" + "remap_file_pages\0" + "sched_get_priority_max\0" + "sched_get_priority_min\0" + "sched_getaffinity\0" + "sched_getattr\0" + "sched_getparam\0" + "sched_getscheduler\0" + "sched_rr_get_interval\0" + "sched_yield\0" + "sendfile\0" + "sendfile64\0" + "setfsgid\0" + "setfsgid32\0" + "setfsuid\0" + "setfsuid32\0" + "setpgid\0" + "setsid\0" + "splice\0" + "sysinfo\0" + "tee\0" + "umask\0" + "uname\0" + "userfaultfd\0" + "vmsplice\0" + }, [SYSCALL_FILTER_SET_TIMER] = { .name = "@timer", .help = "Schedule operations by time", diff --git a/src/shared/seccomp-util.h 
b/src/shared/seccomp-util.h index 7dfff9df78..eac857afb9 100644 --- a/src/shared/seccomp-util.h +++ b/src/shared/seccomp-util.h @@ -47,6 +47,7 @@ enum { SYSCALL_FILTER_SET_SIGNAL, SYSCALL_FILTER_SET_SWAP, SYSCALL_FILTER_SET_SYNC, + SYSCALL_FILTER_SET_SYSTEM_SERVICE, SYSCALL_FILTER_SET_TIMER, _SYSCALL_FILTER_SET_MAX }; diff --git a/src/test/test-seccomp.c b/src/test/test-seccomp.c index 33ec680753..d82cb5c1c5 100644 --- a/src/test/test-seccomp.c +++ b/src/test/test-seccomp.c @@ -104,7 +104,8 @@ static void test_filter_sets(void) { if (pid == 0) { /* Child? */ int fd; - if (i == SYSCALL_FILTER_SET_DEFAULT) /* if we look at the default set, whitelist instead of blacklist */ + /* if we look at the default set (or one that includes it), whitelist instead of blacklist */ + if (IN_SET(i, SYSCALL_FILTER_SET_DEFAULT, SYSCALL_FILTER_SET_SYSTEM_SERVICE)) r = seccomp_load_syscall_filter_set(SCMP_ACT_ERRNO(EUCLEAN), syscall_filter_sets + i, SCMP_ACT_ALLOW); else r = seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + i, SCMP_ACT_ERRNO(EUCLEAN)); From e05ee49b144110b1ecff030cdadc439604152f16 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 18 Apr 2018 21:45:44 +0200 Subject: [PATCH 3/6] seccomp: explain why we use setuid rather than @setuid in @privileged --- src/shared/seccomp-util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index 4a02d8c35f..c433cb90dc 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -632,7 +632,7 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { "setresuid32\0" "setreuid\0" "setreuid32\0" - "setuid\0" + "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */ "setuid32\0" "vhangup\0" }, From ee8f26180d01e3ddd4e5f20b03b81e5e737657ae Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 19 Apr 2018 11:04:17 +0200 Subject: [PATCH 
4/6] units: switch from system call blacklist to whitelist This is generally the safer approach, and is what container managers (including nspawn) do, hence let's move to this too for our own services. This is particularly useful as this means the new @system-service system call filter group will get serious real-life testing quickly. This also switches from firing SIGSYS on unexpected syscalls to returning EPERM. This would have probably been a better default anyway, but it's hard to change that these days. When whitelisting system calls SIGSYS is highly problematic as system calls that are newly introduced to Linux become minefields for services otherwise. Note that this enables a system call filter for udev for the first time, and will block @clock, @mount and @swap from it. Some downstream distributions might want to revert this locally if they want to permit unsafe operations on udev rules, but in general this should be mostly safe, as we already set MountFlags=shared for udevd, hence at least @mount won't change anything. 
--- units/systemd-coredump@.service.in | 3 ++- units/systemd-hostnamed.service.in | 3 ++- units/systemd-importd.service.in | 3 ++- units/systemd-journald.service.in | 3 ++- units/systemd-localed.service.in | 3 ++- units/systemd-logind.service.in | 3 ++- units/systemd-machined.service.in | 3 ++- units/systemd-networkd.service.in | 3 ++- units/systemd-resolved.service.in | 3 ++- units/systemd-timedated.service.in | 3 ++- units/systemd-timesyncd.service.in | 3 ++- units/systemd-udevd.service.in | 2 ++ 12 files changed, 24 insertions(+), 11 deletions(-) diff --git a/units/systemd-coredump@.service.in b/units/systemd-coredump@.service.in index 68fa55c807..215696ecd1 100644 --- a/units/systemd-coredump@.service.in +++ b/units/systemd-coredump@.service.in @@ -33,7 +33,8 @@ MemoryDenyWriteExecute=yes RestrictRealtime=yes RestrictNamespaces=yes RestrictAddressFamilies=AF_UNIX -SystemCallFilter=~@clock @cpu-emulation @debug @keyring @module @mount @obsolete @raw-io @reboot @swap +SystemCallFilter=@system-service +SystemCallErrorNumber=EPERM SystemCallArchitectures=native LockPersonality=yes IPAddressDeny=any diff --git a/units/systemd-hostnamed.service.in b/units/systemd-hostnamed.service.in index 993134f3d6..da74b4fe8b 100644 --- a/units/systemd-hostnamed.service.in +++ b/units/systemd-hostnamed.service.in @@ -29,7 +29,8 @@ MemoryDenyWriteExecute=yes RestrictRealtime=yes RestrictNamespaces=yes RestrictAddressFamilies=AF_UNIX -SystemCallFilter=~@clock @cpu-emulation @debug @keyring @module @mount @obsolete @raw-io @reboot @swap +SystemCallFilter=@system-service sethostname +SystemCallErrorNumber=EPERM SystemCallArchitectures=native LockPersonality=yes IPAddressDeny=any diff --git a/units/systemd-importd.service.in b/units/systemd-importd.service.in index 8e93c2d524..20704a8232 100644 --- a/units/systemd-importd.service.in +++ b/units/systemd-importd.service.in @@ -23,6 +23,7 @@ MemoryDenyWriteExecute=yes RestrictRealtime=yes RestrictNamespaces=net 
RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6 -SystemCallFilter=~@clock @cpu-emulation @debug @keyring @module @obsolete @raw-io @reboot @swap +SystemCallFilter=@system-service @mount +SystemCallErrorNumber=EPERM SystemCallArchitectures=native LockPersonality=yes diff --git a/units/systemd-journald.service.in b/units/systemd-journald.service.in index df76fe4226..52939e6820 100644 --- a/units/systemd-journald.service.in +++ b/units/systemd-journald.service.in @@ -29,7 +29,8 @@ MemoryDenyWriteExecute=yes RestrictRealtime=yes RestrictNamespaces=yes RestrictAddressFamilies=AF_UNIX AF_NETLINK -SystemCallFilter=~@clock @cpu-emulation @debug @keyring @module @mount @obsolete @raw-io @reboot @swap +SystemCallFilter=@system-service +SystemCallErrorNumber=EPERM SystemCallArchitectures=native LockPersonality=yes IPAddressDeny=any diff --git a/units/systemd-localed.service.in b/units/systemd-localed.service.in index ba8a08f3b4..a24e61a0cd 100644 --- a/units/systemd-localed.service.in +++ b/units/systemd-localed.service.in @@ -29,7 +29,8 @@ MemoryDenyWriteExecute=yes RestrictRealtime=yes RestrictNamespaces=yes RestrictAddressFamilies=AF_UNIX -SystemCallFilter=~@clock @cpu-emulation @debug @keyring @module @mount @obsolete @raw-io @reboot @swap +SystemCallFilter=@system-service +SystemCallErrorNumber=EPERM SystemCallArchitectures=native LockPersonality=yes IPAddressDeny=any diff --git a/units/systemd-logind.service.in b/units/systemd-logind.service.in index 168fc007b0..5e090bcf23 100644 --- a/units/systemd-logind.service.in +++ b/units/systemd-logind.service.in @@ -30,7 +30,8 @@ MemoryDenyWriteExecute=yes RestrictRealtime=yes RestrictNamespaces=yes RestrictAddressFamilies=AF_UNIX AF_NETLINK -SystemCallFilter=~@clock @cpu-emulation @debug @keyring @module @mount @obsolete @raw-io @reboot @swap +SystemCallFilter=@system-service +SystemCallErrorNumber=EPERM SystemCallArchitectures=native LockPersonality=yes IPAddressDeny=any diff --git a/units/systemd-machined.service.in 
b/units/systemd-machined.service.in index f9e789db42..1200a90a61 100644 --- a/units/systemd-machined.service.in +++ b/units/systemd-machined.service.in @@ -23,7 +23,8 @@ CapabilityBoundingSet=CAP_KILL CAP_SYS_PTRACE CAP_SYS_ADMIN CAP_SETGID CAP_SYS_C MemoryDenyWriteExecute=yes RestrictRealtime=yes RestrictAddressFamilies=AF_UNIX AF_NETLINK AF_INET AF_INET6 -SystemCallFilter=~@clock @cpu-emulation @debug @keyring @module @obsolete @raw-io @reboot @swap +SystemCallFilter=@system-service @mount +SystemCallErrorNumber=EPERM SystemCallArchitectures=native LockPersonality=yes IPAddressDeny=any diff --git a/units/systemd-networkd.service.in b/units/systemd-networkd.service.in index adb219a01d..371ab3a9cf 100644 --- a/units/systemd-networkd.service.in +++ b/units/systemd-networkd.service.in @@ -35,7 +35,8 @@ MemoryDenyWriteExecute=yes RestrictRealtime=yes RestrictNamespaces=yes RestrictAddressFamilies=AF_UNIX AF_NETLINK AF_INET AF_INET6 AF_PACKET -SystemCallFilter=~@clock @cpu-emulation @debug @keyring @module @mount @obsolete @raw-io @reboot @swap +SystemCallFilter=@system-service +SystemCallErrorNumber=EPERM SystemCallArchitectures=native LockPersonality=yes RuntimeDirectory=systemd/netif diff --git a/units/systemd-resolved.service.in b/units/systemd-resolved.service.in index 7b92735f19..9982ecebff 100644 --- a/units/systemd-resolved.service.in +++ b/units/systemd-resolved.service.in @@ -38,7 +38,8 @@ MemoryDenyWriteExecute=yes RestrictRealtime=yes RestrictNamespaces=yes RestrictAddressFamilies=AF_UNIX AF_NETLINK AF_INET AF_INET6 -SystemCallFilter=~@clock @cpu-emulation @debug @keyring @module @mount @obsolete @raw-io @reboot @swap +SystemCallFilter=@system-service +SystemCallErrorNumber=EPERM SystemCallArchitectures=native LockPersonality=yes RuntimeDirectory=systemd/resolve diff --git a/units/systemd-timedated.service.in b/units/systemd-timedated.service.in index cf13e40ced..906bb4326c 100644 --- a/units/systemd-timedated.service.in +++ 
b/units/systemd-timedated.service.in @@ -27,7 +27,8 @@ MemoryDenyWriteExecute=yes RestrictRealtime=yes RestrictNamespaces=yes RestrictAddressFamilies=AF_UNIX -SystemCallFilter=~@cpu-emulation @debug @keyring @module @mount @obsolete @raw-io @reboot @swap +SystemCallFilter=@system-service @clock +SystemCallErrorNumber=EPERM SystemCallArchitectures=native LockPersonality=yes IPAddressDeny=any diff --git a/units/systemd-timesyncd.service.in b/units/systemd-timesyncd.service.in index 6bfe28627b..4a490b6e16 100644 --- a/units/systemd-timesyncd.service.in +++ b/units/systemd-timesyncd.service.in @@ -38,7 +38,8 @@ RestrictRealtime=yes RestrictNamespaces=yes RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6 RuntimeDirectory=systemd/timesync -SystemCallFilter=~@cpu-emulation @debug @keyring @module @mount @obsolete @raw-io @reboot @swap +SystemCallFilter=@system-service @clock +SystemCallErrorNumber=EPERM SystemCallArchitectures=native LockPersonality=yes StateDirectory=systemd/timesync diff --git a/units/systemd-udevd.service.in b/units/systemd-udevd.service.in index 2b9fa69d9b..6a3814e5d9 100644 --- a/units/systemd-udevd.service.in +++ b/units/systemd-udevd.service.in @@ -29,6 +29,8 @@ PrivateMounts=yes MemoryDenyWriteExecute=yes RestrictRealtime=yes RestrictAddressFamilies=AF_UNIX AF_NETLINK AF_INET AF_INET6 +SystemCallFilter=@system-service @module @raw-io +SystemCallErrorNumber=EPERM SystemCallArchitectures=native LockPersonality=yes IPAddressDeny=any From 6f659e5075a5da1ffb1a3e30f38451a524cd7472 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 7 Jun 2018 17:47:53 +0200 Subject: [PATCH 5/6] portable: add SystemCallFilter=@system-service to the three main portable service profiles MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … but leave the "trusted" profile unmodified, it shall have full access to all system calls, as before. 
--- src/portable/profile/default/service.conf | 2 ++ src/portable/profile/nonetwork/service.conf | 2 ++ src/portable/profile/strict/service.conf | 2 ++ 3 files changed, 6 insertions(+) diff --git a/src/portable/profile/default/service.conf b/src/portable/profile/default/service.conf index 993d351638..792be50229 100644 --- a/src/portable/profile/default/service.conf +++ b/src/portable/profile/default/service.conf @@ -27,4 +27,6 @@ LockPersonality=yes MemoryDenyWriteExecute=yes RestrictRealtime=yes RestrictNamespaces=yes +SystemCallFilter=@system-service +SystemCallErrorNumber=EPERM SystemCallArchitectures=native diff --git a/src/portable/profile/nonetwork/service.conf b/src/portable/profile/nonetwork/service.conf index 0d9c5a38d8..c81cebe03f 100644 --- a/src/portable/profile/nonetwork/service.conf +++ b/src/portable/profile/nonetwork/service.conf @@ -25,6 +25,8 @@ LockPersonality=yes MemoryDenyWriteExecute=yes RestrictRealtime=yes RestrictNamespaces=yes +SystemCallFilter=@system-service +SystemCallErrorNumber=EPERM SystemCallArchitectures=native PrivateNetwork=yes IPAddressDeny=any diff --git a/src/portable/profile/strict/service.conf b/src/portable/profile/strict/service.conf index d12620fc99..d10fb5a1e8 100644 --- a/src/portable/profile/strict/service.conf +++ b/src/portable/profile/strict/service.conf @@ -23,6 +23,8 @@ NoNewPrivileges=yes MemoryDenyWriteExecute=yes RestrictRealtime=yes RestrictNamespaces=yes +SystemCallFilter=@system-service +SystemCallErrorNumber=EPERM SystemCallArchitectures=native PrivateNetwork=yes IPAddressDeny=any From e01d9e2193ad4699a0507fc631613b5666d4d897 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 19 Apr 2018 16:51:04 +0200 Subject: [PATCH 6/6] update NEWS --- NEWS | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/NEWS b/NEWS index cca6692c4b..03fe0eca83 100644 --- a/NEWS +++ b/NEWS @@ -46,6 +46,15 @@ CHANGES WITH 239 in spe: both runtime and persistent enablement/masking, i.e. 
it will remove any relevant symlinks both in /run and /etc. + * Note that all long-running system services shipped with systemd will + now default to a system call whitelist (rather than a blacklist, as + before). In particular, systemd-udevd will now enforce one too. For + most cases this should be safe, however downstream distributions + which disabled sandboxing of systemd-udevd (specifically the + MountFlags= setting), might want to disable this security feature + too, as the default whitelisting will prohibit all mount, swap, + reboot and clock changing operations from udev rules. + * sd-boot acquired new loader configuration settings to optionally turn off Windows and MacOS boot partition discovery as well as reboot-into-firmware menu items. It is also able to pick a better