core: simplify ProtectSystem= implementation

ProtectSystem=, in all of its modes, is orthogonal to other options such as
PrivateDevices=, ProtectKernelTunables= and ProtectHome=; however, that is
currently hard to see from the implementation. Simplify it by giving each
mode its own table that lists all of its paths, with references to the other
Protect options.

With this change some entries are duplicated, but that is harmless:
duplicate mounts are first sorted by the most restrictive mode and then
cleaned up.
This commit is contained in:
Djalal Harouni 2016-09-25 12:21:25 +02:00
parent 49accde7bd
commit f471b2afa1
1 changed files with 114 additions and 59 deletions

View File

@ -70,6 +70,14 @@ typedef struct TargetMount {
bool ignore; /* Ignore if path does not exist */
} TargetMount;
/*
 * The following Protect tables protect paths by marking some of them
 * READONLY. If a path is already covered by an option from another table, it
 * is marked READWRITE in the current table and the more restrictive mode is
 * applied from that other table. This way all options can be combined in a
 * way that is safe and comprehensible for users.
 */
/* ProtectKernelTunables= option and the related filesystem APIs */
static const TargetMount protect_kernel_tunables_table[] = {
{ "/proc/sys", READONLY, false },
@ -89,6 +97,45 @@ static const TargetMount protect_kernel_tunables_table[] = {
{ "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */
};
/* ProtectSystem=yes table: the paths marked READONLY in this mode.
 * Each entry gives the path, its mount mode and the ignore flag; an entry
 * with ignore set is ignored if the path does not exist. */
static const TargetMount protect_system_yes_table[] = {
{ "/usr", READONLY, false },
{ "/boot", READONLY, true }, /* ignored if missing */
{ "/efi", READONLY, true }, /* ignored if missing */
};
/* ProtectSystem=full table: the same entries as ProtectSystem=yes, plus
 * /etc. The shared entries are repeated here rather than referenced, so
 * that this mode has a self-contained table. */
static const TargetMount protect_system_full_table[] = {
{ "/usr", READONLY, false },
{ "/boot", READONLY, true }, /* ignored if missing */
{ "/efi", READONLY, true }, /* ignored if missing */
{ "/etc", READONLY, false }, /* the only addition over ProtectSystem=yes */
};
/*
 * ProtectSystem=strict table. In strict mode everything is mounted
 * read-only, except for the kernel API VFS /proc, /dev and /sys. Those are
 * left writable here because PrivateDevices= and ProtectKernelTunables=
 * protect them, and these options should be fully orthogonal.
 * /home and friends are likewise left writable, since ProtectHome= manages
 * them orthogonally.
 *
 * The trailing comment on each READWRITE entry names the option that is
 * responsible for that path.
 */
static const TargetMount protect_system_strict_table[] = {
{ "/", READONLY, false },
{ "/proc", READWRITE, false }, /* ProtectKernelTunables= */
{ "/sys", READWRITE, false }, /* ProtectKernelTunables= */
{ "/dev", READWRITE, false }, /* PrivateDevices= */
{ "/home", READWRITE, true }, /* ProtectHome= */
{ "/run/user", READWRITE, true }, /* ProtectHome= */
{ "/root", READWRITE, true }, /* ProtectHome= */
};
/* Fill in the BindMount entry that *p currently points at with the given
 * path, mode and ignore flag. The cursor *p itself is not advanced, and the
 * path pointer is stored as-is rather than copied. */
static void set_bind_mount(BindMount **p, const char *path, MountMode mode, bool ignore) {
        BindMount *entry = *p;

        entry->ignore = ignore;
        entry->mode = mode;
        entry->path = path;
}
static int append_mounts(BindMount **p, char **strv, MountMode mode) {
char **i;
@ -105,27 +152,71 @@ static int append_mounts(BindMount **p, char **strv, MountMode mode) {
if (!path_is_absolute(*i))
return -EINVAL;
(*p)->path = *i;
(*p)->mode = mode;
(*p)->ignore = ignore;
set_bind_mount(p, *i, mode, ignore);
(*p)++;
}
return 0;
}
static void append_protect_kernel_tunables(BindMount **p, const char *root_directory) {
unsigned int i;
/* Append every entry of a TargetMount table to the BindMount array that *p
 * points into, prefixing each table path with root_directory through
 * prefix_roota(). *p is advanced past each entry that is written.
 *
 * The ignore flag is taken from the table entry exactly as declared; a
 * leading "-" on a table path is not interpreted here.
 *
 * Returns 0 on success, or -EINVAL as soon as a prefixed path is not
 * absolute (entries appended before that point are left in place).
 *
 * NOTE(review): the pointer stored in each entry is whatever prefix_roota()
 * yields. If that can be storage belonging to this function's stack frame,
 * it would not outlive this call -- confirm against prefix_roota(). */
static int append_target_mounts(BindMount **p, const char *root_directory, const TargetMount *mounts, const size_t size) {
        const TargetMount *m, *end;

        assert(p);
        assert(mounts);

        end = mounts + size;

        for (m = mounts; m != end; m++) {
                const char *path = prefix_roota(root_directory, m->path);

                if (!path_is_absolute(path))
                        return -EINVAL;

                set_bind_mount(p, path, m->mode, m->ignore);
                (*p)++;
        }

        return 0;
}
/* Append the ProtectKernelTunables= mount table, prefixed with
 * root_directory, to the array that *p points into. Returns whatever
 * append_target_mounts() returns. */
static int append_protect_kernel_tunables(BindMount **p, const char *root_directory) {
        const size_t n_entries = ELEMENTSOF(protect_kernel_tunables_table);

        assert(p);

        return append_target_mounts(p, root_directory, protect_kernel_tunables_table, n_entries);
}
/*
 * Append the mount table that matches the requested ProtectSystem= mode
 * (strict, yes or full) through append_target_mounts().
 *
 * PROTECT_SYSTEM_NO appends nothing and returns 0. Any other value that is
 * not one of the three handled modes yields -EINVAL. Otherwise the return
 * value of append_target_mounts() is passed through unchanged.
 */
static int append_protect_system(BindMount **p, const char *root_directory, ProtectSystem protect_system) {
int r = 0;
assert(p);
for (i = 0; i < ELEMENTSOF(protect_kernel_tunables_table); i++) {
const TargetMount *t = &protect_kernel_tunables_table[i];
(*p)->path = prefix_roota(root_directory, t->path);
(*p)->mode = t->mode;
(*p)->ignore = t->ignore;
(*p)++;
/* Nothing to protect in this mode */
if (protect_system == PROTECT_SYSTEM_NO)
return 0;
switch (protect_system) {
case PROTECT_SYSTEM_STRICT:
r = append_target_mounts(p, root_directory, protect_system_strict_table,
ELEMENTSOF(protect_system_strict_table));
break;
case PROTECT_SYSTEM_YES:
r = append_target_mounts(p, root_directory, protect_system_yes_table,
ELEMENTSOF(protect_system_yes_table));
break;
case PROTECT_SYSTEM_FULL:
r = append_target_mounts(p, root_directory, protect_system_full_table,
ELEMENTSOF(protect_system_full_table));
break;
default:
/* Not one of the ProtectSystem= modes handled above */
r = -EINVAL;
break;
}
return r;
}
static int mount_path_compare(const void *a, const void *b) {
@ -538,6 +629,14 @@ static unsigned namespace_calculate_mounts(
ProtectHome protect_home,
ProtectSystem protect_system) {
unsigned protect_system_cnt =
(protect_system == PROTECT_SYSTEM_STRICT ?
ELEMENTSOF(protect_system_strict_table) :
((protect_system == PROTECT_SYSTEM_FULL) ?
ELEMENTSOF(protect_system_full_table) :
((protect_system == PROTECT_SYSTEM_YES) ?
ELEMENTSOF(protect_system_yes_table) : 0)));
return !!tmp_dir + !!var_tmp_dir +
strv_length(read_write_paths) +
strv_length(read_only_paths) +
@ -546,10 +645,7 @@ static unsigned namespace_calculate_mounts(
(protect_sysctl ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
(protect_cgroups ? 1 : 0) +
(protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT ? 3 : 0) +
(protect_system == PROTECT_SYSTEM_STRICT ?
(2 + !private_dev + !protect_sysctl) :
((protect_system != PROTECT_SYSTEM_NO ? 3 : 0) +
(protect_system == PROTECT_SYSTEM_FULL ? 1 : 0)));
protect_system_cnt;
}
int setup_namespace(
@ -648,50 +744,9 @@ int setup_namespace(
return r;
}
if (protect_system == PROTECT_SYSTEM_STRICT) {
/* In strict mode, we mount everything read-only, except for /proc, /dev, /sys which are the
* kernel API VFS, which are left writable, but PrivateDevices= + ProtectKernelTunables=
* protect those, and these options should be fully orthogonal. (And of course /home and
* friends are also left writable, as ProtectHome= shall manage those, orthogonally, see
* above). */
m->path = prefix_roota(root_directory, "/");
m->mode = READONLY;
m++;
m->path = prefix_roota(root_directory, "/proc");
m->mode = READWRITE;
m++;
if (!private_dev) {
m->path = prefix_roota(root_directory, "/dev");
m->mode = READWRITE;
m++;
}
if (!protect_sysctl) {
m->path = prefix_roota(root_directory, "/sys");
m->mode = READWRITE;
m++;
}
} else if (protect_system != PROTECT_SYSTEM_NO) {
const char *usr_dir, *boot_dir, *efi_dir, *etc_dir;
/* In any other mode we simply mark the relevant three directories ready-only. */
usr_dir = prefix_roota(root_directory, "/usr");
boot_dir = prefix_roota(root_directory, "/boot");
boot_dir = strjoina("-", boot_dir);
efi_dir = prefix_roota(root_directory, "/efi");
efi_dir = strjoina("-", efi_dir);
etc_dir = prefix_roota(root_directory, "/etc");
r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL
? STRV_MAKE(usr_dir, boot_dir, efi_dir, etc_dir)
: STRV_MAKE(usr_dir, boot_dir, efi_dir), READONLY);
if (r < 0)
return r;
}
r = append_protect_system(&m, root_directory, protect_system);
if (r < 0)
return r;
assert(mounts + n == m);