core: add new ReadOnlySystem= and ProtectedHome= settings for service units

ReadOnlySystem= uses fs namespaces to mount /usr and /boot read-only for
a service.

ProtectedHome= uses fs namespaces to mount /home and /run/user
inaccessible or read-only for a service.

This patch also enables these settings for all our long-running services.

Together they should be a good building block for a minimal service
sandbox, removing the ability for services to modify the operating
system or access the user's private data.
This commit is contained in:
Lennart Poettering 2014-06-03 23:41:44 +02:00
parent 85b5673b33
commit 417116f234
21 changed files with 187 additions and 4 deletions

View File

@ -764,7 +764,7 @@
capability sets as documented in
<citerefentry><refentrytitle>cap_from_text</refentrytitle><manvolnum>3</manvolnum></citerefentry>.
Note that these capability sets are
usually influenced by the capabilities
usually influenced (and filtered) by the capabilities
attached to the executed file. Due to
that
<varname>CapabilityBoundingSet=</varname>
@ -934,6 +934,63 @@
accessible).</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>ReadOnlySystem=</varname></term>
<listitem><para>Takes a boolean
argument. If true, mounts the
<filename>/usr</filename> and
<filename>/boot</filename> directories
read-only for processes invoked by
this unit. This setting ensures that
any modification of the vendor
supplied operating system is
prohibited for the service. It is
recommended to enable this setting for
all long-running services, unless they
are involved with system updates or
need to modify the operating system in
other ways. Note however, that
processes retaining the CAP_SYS_ADMIN
capability can undo the effect of this
setting. This setting is hence
particularly useful for daemons which
have this capability removed, for
example with
<varname>CapabilityBoundingSet=</varname>. Defaults
to off.</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>ProtectedHome=</varname></term>
<listitem><para>Takes a boolean
argument or
<literal>read-only</literal>. If true,
the directories
<filename>/home</filename> and
<filename>/run/user</filename> are
made inaccessible and empty for
processes invoked by this unit. If set
to <literal>read-only</literal> the
two directories are made read-only
instead. It is recommended to enable
this setting for all long-running
services (in particular network-facing
ones), to ensure they cannot get access
to private user data, unless the
services actually require access to
the user's private data. Note however,
that processes retaining the
CAP_SYS_ADMIN capability can undo the
effect of this setting. This setting
is hence particularly useful for
daemons which have this capability
removed, for example with
<varname>CapabilityBoundingSet=</varname>. Defaults
to off.</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>MountFlags=</varname></term>
@ -968,6 +1025,8 @@
namespace related options
(<varname>PrivateTmp=</varname>,
<varname>PrivateDevices=</varname>,
<varname>ReadOnlySystem=</varname>,
<varname>ProtectedHome=</varname>,
<varname>ReadOnlyDirectories=</varname>,
<varname>InaccessibleDirectories=</varname>
and

View File

@ -35,6 +35,7 @@
#include "capability.h"
#include "env-util.h"
#include "af-list.h"
#include "namespace.h"
#ifdef HAVE_SECCOMP
#include "seccomp-util.h"
@ -44,6 +45,8 @@ BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_exec_output, exec_output, ExecOutp
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_input, exec_input, ExecInput);
static BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_protected_home, protected_home, ProtectedHome);
static int property_get_environment_files(
sd_bus *bus,
const char *path,
@ -626,6 +629,8 @@ const sd_bus_vtable bus_exec_vtable[] = {
SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_bool, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateDevices", "b", bus_property_get_bool, offsetof(ExecContext, private_devices), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectedHome", "s", bus_property_get_protected_home, offsetof(ExecContext, protected_home), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ReadOnlySystem", "b", bus_property_get_bool, offsetof(ExecContext, read_only_system), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("SameProcessGroup", "b", bus_property_get_bool, offsetof(ExecContext, same_pgrp), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("UtmpIdentifier", "s", NULL, offsetof(ExecContext, utmp_id), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("SELinuxContext", "(bs)", property_get_selinux_context, 0, SD_BUS_VTABLE_PROPERTY_CONST),

View File

@ -1569,7 +1569,9 @@ int exec_spawn(ExecCommand *command,
!strv_isempty(context->inaccessible_dirs) ||
context->mount_flags != 0 ||
(context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir)) ||
context->private_devices) {
context->private_devices ||
context->read_only_system ||
context->protected_home != PROTECTED_HOME_NO) {
char *tmp = NULL, *var = NULL;
@ -1593,8 +1595,9 @@ int exec_spawn(ExecCommand *command,
tmp,
var,
context->private_devices,
context->protected_home,
context->read_only_system,
context->mount_flags);
if (err < 0) {
r = EXIT_NAMESPACE;
goto fail_child;
@ -2111,6 +2114,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
"%sPrivateTmp: %s\n"
"%sPrivateNetwork: %s\n"
"%sPrivateDevices: %s\n"
"%sProtectedHome: %s\n"
"%sReadOnlySystem: %s\n"
"%sIgnoreSIGPIPE: %s\n",
prefix, c->umask,
prefix, c->working_directory ? c->working_directory : "/",
@ -2119,6 +2124,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
prefix, yes_no(c->private_tmp),
prefix, yes_no(c->private_network),
prefix, yes_no(c->private_devices),
prefix, protected_home_to_string(c->protected_home),
prefix, yes_no(c->read_only_system),
prefix, yes_no(c->ignore_sigpipe));
STRV_FOREACH(e, c->environment)

View File

@ -39,6 +39,7 @@ typedef struct ExecRuntime ExecRuntime;
#include "set.h"
#include "fdset.h"
#include "missing.h"
#include "namespace.h"
typedef enum ExecInput {
EXEC_INPUT_NULL,
@ -156,6 +157,8 @@ struct ExecContext {
bool private_tmp;
bool private_network;
bool private_devices;
bool read_only_system;
ProtectedHome protected_home;
bool no_new_privileges;

View File

@ -80,6 +80,8 @@ $1.InaccessibleDirectories, config_parse_namespace_path_strv, 0,
$1.PrivateTmp, config_parse_bool, 0, offsetof($1, exec_context.private_tmp)
$1.PrivateNetwork, config_parse_bool, 0, offsetof($1, exec_context.private_network)
$1.PrivateDevices, config_parse_bool, 0, offsetof($1, exec_context.private_devices)
$1.ReadOnlySystem, config_parse_bool, 0, offsetof($1, exec_context.read_only_system)
$1.ProtectedHome, config_parse_protected_home, 0, offsetof($1, exec_context)
$1.MountFlags, config_parse_exec_mount_flags, 0, offsetof($1, exec_context)
$1.Personality, config_parse_personality, 0, offsetof($1, exec_context.personality)
$1.RuntimeDirectoryMode, config_parse_mode, 0, offsetof($1, exec_context.runtime_directory_mode)

View File

@ -3044,6 +3044,49 @@ int config_parse_no_new_privileges(
return 0;
}
/* Config parser for the ProtectedHome= unit setting.
 *
 * "data" points to the ExecContext whose protected_home field is updated.
 * "rvalue" is first tried as a boolean (true -> PROTECTED_HOME_YES,
 * false -> PROTECTED_HOME_NO) and otherwise looked up with
 * protected_home_from_string(). A value that matches neither is logged and
 * ignored, leaving the previously configured value untouched.
 *
 * Always returns 0, so an unparsable value never makes the caller fail. */
int config_parse_protected_home(
                const char* unit,
                const char *filename,
                unsigned line,
                const char *section,
                unsigned section_line,
                const char *lvalue,
                int ltype,
                const char *rvalue,
                void *data,
                void *userdata) {

        ExecContext *c = data;
        int k;

        assert(filename);
        assert(lvalue);
        assert(rvalue);
        assert(data);

        /* Our enum shall be a superset of booleans, hence first try
         * to parse as boolean, and then as enum */

        k = parse_boolean(rvalue);
        if (k > 0)
                c->protected_home = PROTECTED_HOME_YES;
        else if (k == 0)
                c->protected_home = PROTECTED_HOME_NO;
        else {
                ProtectedHome h;

                h = protected_home_from_string(rvalue);
                if (h < 0) {
                        /* h is a negative enum value here (presumably
                         * _PROTECTED_HOME_INVALID, i.e. -1), not a negative
                         * errno. Passing -h would report errno 1 (EPERM) for
                         * what is really a syntax error, so report EINVAL. */
                        log_syntax(unit, LOG_ERR, filename, line, EINVAL, "Failed to parse protected home value, ignoring: %s", rvalue);
                        return 0;
                }

                c->protected_home = h;
        }

        return 0;
}
#define FOLLOW_MAX 8
static int open_follow(char **filename, FILE **_f, Set *names, char **_final) {

View File

@ -97,6 +97,7 @@ int config_parse_set_status(const char *unit, const char *filename, unsigned lin
int config_parse_namespace_path_strv(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
int config_parse_no_new_privileges(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
int config_parse_cpu_quota(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
int config_parse_protected_home(const char* unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
/* gperf prototypes */
const struct ConfigPerfItem* load_fragment_gperf_lookup(const char *key, unsigned length);

View File

@ -331,6 +331,8 @@ int setup_namespace(
char* tmp_dir,
char* var_tmp_dir,
bool private_dev,
ProtectedHome protected_home,
bool read_only_system,
unsigned mount_flags) {
BindMount *m, *mounts = NULL;
@ -347,7 +349,9 @@ int setup_namespace(
strv_length(read_write_dirs) +
strv_length(read_only_dirs) +
strv_length(inaccessible_dirs) +
private_dev;
private_dev +
(protected_home != PROTECTED_HOME_NO ? 2 : 0) +
(read_only_system ? 2 : 0);
if (n > 0) {
m = mounts = (BindMount *) alloca(n * sizeof(BindMount));
@ -381,6 +385,18 @@ int setup_namespace(
m++;
}
if (protected_home != PROTECTED_HOME_NO) {
r = append_mounts(&m, STRV_MAKE("-/home", "-/run/user"), protected_home == PROTECTED_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
if (r < 0)
return r;
}
if (read_only_system) {
r = append_mounts(&m, STRV_MAKE("/usr", "-/boot"), READONLY);
if (r < 0)
return r;
}
assert(mounts + n == m);
qsort(mounts, n, sizeof(BindMount), mount_path_compare);
@ -581,3 +597,11 @@ fail:
return r;
}
/* String representation of each ProtectedHome value, indexed by the enum
 * value. "no" and "yes" are deliberately also valid boolean spellings. */
static const char *const protected_home_table[_PROTECTED_HOME_MAX] = {
[PROTECTED_HOME_NO] = "no",
[PROTECTED_HOME_YES] = "yes",
[PROTECTED_HOME_READ_ONLY] = "read-only",
};
/* NOTE(review): presumably expands to the definitions of
 * protected_home_to_string() and protected_home_from_string() based on the
 * table above -- confirm against the macro definition. */
DEFINE_STRING_TABLE_LOOKUP(protected_home, ProtectedHome);

View File

@ -23,12 +23,24 @@
#include <stdbool.h>
#include "macro.h"
/* Modes of the ProtectedHome= setting, i.e. how /home and /run/user are
 * presented to a service inside its mount namespace. */
typedef enum ProtectedHome {
/* No protection: the directories are left as they are. */
PROTECTED_HOME_NO,
/* The directories are made inaccessible. */
PROTECTED_HOME_YES,
/* The directories are mounted read-only. */
PROTECTED_HOME_READ_ONLY,
/* Number of valid modes; used to size the string lookup table. */
_PROTECTED_HOME_MAX,
/* Negative sentinel; NOTE(review): presumably what
 * protected_home_from_string() returns for unknown strings -- confirm. */
_PROTECTED_HOME_INVALID = -1
} ProtectedHome;
/* Sets up the mount namespace for a service.
 *
 * protected_home: unless PROTECTED_HOME_NO, /home and /run/user are added to
 *                 the mount list, read-only for PROTECTED_HOME_READ_ONLY and
 *                 inaccessible otherwise.
 * read_only_system: if true, /usr and /boot are added to the mount list as
 *                   read-only.
 *
 * Returns a negative value on failure. */
int setup_namespace(char **read_write_dirs,
char **read_only_dirs,
char **inaccessible_dirs,
char *tmp_dir,
char *var_tmp_dir,
bool private_dev,
ProtectedHome protected_home,
bool read_only_system,
unsigned mount_flags);
int setup_tmp_dirs(const char *id,
@ -36,3 +48,6 @@ int setup_tmp_dirs(const char *id,
char **var_tmp_dir);
int setup_netns(int netns_storage_socket[2]);
/* Conversion between ProtectedHome values and their string names
 * ("no", "yes", "read-only"). Callers treat a negative result of
 * protected_home_from_string() as "string not recognized". */
const char* protected_home_to_string(ProtectedHome p) _const_;
ProtectedHome protected_home_from_string(const char *s) _pure_;

View File

@ -60,6 +60,8 @@ int main(int argc, char *argv[]) {
tmp_dir,
var_tmp_dir,
true,
PROTECTED_HOME_NO,
false,
0);
if (r < 0) {
log_error("Failed to setup namespace: %s", strerror(-r));

View File

@ -18,3 +18,5 @@ WatchdogSec=1min
PrivateTmp=yes
PrivateDevices=yes
PrivateNetwork=yes
ReadOnlySystem=yes
ProtectedHome=yes

View File

@ -17,6 +17,8 @@ SupplementaryGroups=systemd-journal
PrivateTmp=yes
PrivateDevices=yes
PrivateNetwork=yes
ReadOnlySystem=yes
ProtectedHome=yes
[Install]
Also=systemd-journal-gatewayd.socket

View File

@ -20,6 +20,8 @@ RestartSec=0
NotifyAccess=all
StandardOutput=null
CapabilityBoundingSet=CAP_SYS_ADMIN CAP_DAC_OVERRIDE CAP_SYS_PTRACE CAP_SYSLOG CAP_AUDIT_CONTROL CAP_CHOWN CAP_DAC_READ_SEARCH CAP_FOWNER CAP_SETUID CAP_SETGID
ReadOnlySystem=yes
ProtectedHome=yes
WatchdogSec=1min
# Increase the default a bit in order to allow many simultaneous

View File

@ -18,3 +18,5 @@ WatchdogSec=1min
PrivateTmp=yes
PrivateDevices=yes
PrivateNetwork=yes
ReadOnlySystem=yes
ProtectedHome=yes

View File

@ -25,6 +25,8 @@ RestartSec=0
BusName=org.freedesktop.login1
CapabilityBoundingSet=CAP_SYS_ADMIN CAP_AUDIT_CONTROL CAP_CHOWN CAP_KILL CAP_DAC_READ_SEARCH CAP_DAC_OVERRIDE CAP_FOWNER CAP_SYS_TTY_CONFIG
WatchdogSec=1min
ReadOnlySystem=yes
ProtectedHome=yes
# Increase the default a bit in order to allow many simultaneous
# logins since we keep one fd open per session.

View File

@ -20,3 +20,5 @@ WatchdogSec=1min
PrivateTmp=yes
PrivateDevices=yes
PrivateNetwork=yes
ReadOnlySystem=yes
ProtectedHome=yes

View File

@ -20,6 +20,8 @@ Restart=always
RestartSec=0
ExecStart=@rootlibexecdir@/systemd-networkd
CapabilityBoundingSet=CAP_NET_ADMIN CAP_NET_BIND_SERVICE CAP_NET_BROADCAST CAP_NET_RAW CAP_SETUID CAP_SETGID CAP_SETPCAP CAP_CHOWN CAP_DAC_OVERRIDE CAP_FOWNER
ReadOnlySystem=yes
ProtectedHome=yes
WatchdogSec=1min
[Install]

View File

@ -16,6 +16,8 @@ Restart=always
RestartSec=0
ExecStart=@rootlibexecdir@/systemd-resolved
CapabilityBoundingSet=CAP_SETUID CAP_SETGID CAP_SETPCAP CAP_CHOWN CAP_DAC_OVERRIDE CAP_FOWNER
ReadOnlySystem=yes
ProtectedHome=yes
[Install]
WantedBy=multi-user.target

View File

@ -16,3 +16,5 @@ BusName=org.freedesktop.timedate1
CapabilityBoundingSet=CAP_SYS_TIME
WatchdogSec=1min
PrivateTmp=yes
ReadOnlySystem=yes
ProtectedHome=yes

View File

@ -23,6 +23,8 @@ ExecStart=@rootlibexecdir@/systemd-timesyncd
CapabilityBoundingSet=CAP_SYS_TIME CAP_SETUID CAP_SETGID CAP_SETPCAP CAP_CHOWN CAP_DAC_OVERRIDE CAP_FOWNER
PrivateTmp=yes
PrivateDevices=yes
ReadOnlySystem=yes
ProtectedHome=yes
WatchdogSec=1min
[Install]

View File

@ -22,3 +22,5 @@ Restart=always
RestartSec=0
ExecStart=@rootlibexecdir@/systemd-udevd
MountFlags=slave
ReadOnlySystem=yes
ProtectedHome=yes