2020-11-09 05:23:58 +01:00
|
|
|
/* SPDX-License-Identifier: LGPL-2.1-or-later */
|
2012-07-18 19:07:51 +02:00
|
|
|
#pragma once
|
2010-04-21 22:15:06 +02:00
|
|
|
|
|
|
|
/***
|
2018-06-12 19:00:24 +02:00
|
|
|
Copyright © 2016 Djalal Harouni
|
2010-04-21 22:15:06 +02:00
|
|
|
***/
|
|
|
|
|
2017-10-10 09:49:20 +02:00
|
|
|
/* Forward declarations: introduce the typedef names of the four structs that are
 * defined further down in this header, so they can be referred to before their
 * bodies appear. */
typedef struct NamespaceInfo NamespaceInfo;
|
2016-11-23 22:21:40 +01:00
|
|
|
typedef struct BindMount BindMount;
|
2018-02-21 01:17:52 +01:00
|
|
|
typedef struct TemporaryFileSystem TemporaryFileSystem;
|
2020-07-14 17:18:41 +02:00
|
|
|
typedef struct MountImage MountImage;
|
2016-10-12 14:11:16 +02:00
|
|
|
|
2010-04-21 22:15:06 +02:00
|
|
|
#include <stdbool.h>
|
|
|
|
|
2016-12-23 14:26:05 +01:00
|
|
|
#include "dissect-image.h"
|
2020-06-28 19:54:49 +02:00
|
|
|
#include "fs-util.h"
|
2014-06-03 23:41:44 +02:00
|
|
|
#include "macro.h"
|
2020-06-28 19:54:49 +02:00
|
|
|
#include "string-util.h"
|
2014-06-03 23:41:44 +02:00
|
|
|
|
2014-06-04 18:07:55 +02:00
|
|
|
/* ProtectHome modes: no / yes / read-only / tmpfs.
 *
 * The real values are numbered from 0. _PROTECT_HOME_MAX directly follows the last
 * real value and therefore equals the number of real values; _PROTECT_HOME_INVALID
 * is an explicit negative sentinel.
 * NOTE(review): presumably _PROTECT_HOME_INVALID is what protect_home_from_string()
 * yields for an unknown string — confirm against its definition. */
typedef enum ProtectHome {
|
|
|
|
PROTECT_HOME_NO,
|
|
|
|
PROTECT_HOME_YES,
|
|
|
|
PROTECT_HOME_READ_ONLY,
|
2018-02-21 01:13:11 +01:00
|
|
|
PROTECT_HOME_TMPFS,
|
2014-06-04 18:07:55 +02:00
|
|
|
_PROTECT_HOME_MAX, /* count of real values, not a mode itself */
|
|
|
|
_PROTECT_HOME_INVALID = -1
|
|
|
|
} ProtectHome;
|
|
|
|
|
2017-10-10 09:46:13 +02:00
|
|
|
/* Identifiers for the namespace kinds this header distinguishes: mount, cgroup,
 * UTS, IPC, user, PID and network.
 *
 * The real values are numbered from 0. _NAMESPACE_TYPE_MAX directly follows the
 * last real value and therefore equals the number of real values;
 * _NAMESPACE_TYPE_INVALID is an explicit negative sentinel. */
typedef enum NamespaceType {
|
|
|
|
NAMESPACE_MOUNT,
|
|
|
|
NAMESPACE_CGROUP,
|
|
|
|
NAMESPACE_UTS,
|
|
|
|
NAMESPACE_IPC,
|
|
|
|
NAMESPACE_USER,
|
|
|
|
NAMESPACE_PID,
|
|
|
|
NAMESPACE_NET,
|
|
|
|
_NAMESPACE_TYPE_MAX, /* count of real values, not a namespace type itself */
|
|
|
|
_NAMESPACE_TYPE_INVALID = -1,
|
|
|
|
} NamespaceType;
|
|
|
|
|
2014-06-04 18:07:55 +02:00
|
|
|
/* ProtectSystem modes: no / yes / full / strict.
 *
 * The real values are numbered from 0. _PROTECT_SYSTEM_MAX directly follows the
 * last real value and therefore equals the number of real values;
 * _PROTECT_SYSTEM_INVALID is an explicit negative sentinel.
 * NOTE(review): the exact set of paths each mode covers is not visible in this
 * header — see the implementation before relying on any particular meaning. */
typedef enum ProtectSystem {
|
|
|
|
PROTECT_SYSTEM_NO,
|
|
|
|
PROTECT_SYSTEM_YES,
|
|
|
|
PROTECT_SYSTEM_FULL,
|
2016-08-25 15:57:21 +02:00
|
|
|
PROTECT_SYSTEM_STRICT,
|
2014-06-04 18:07:55 +02:00
|
|
|
_PROTECT_SYSTEM_MAX, /* count of real values, not a mode itself */
|
|
|
|
_PROTECT_SYSTEM_INVALID = -1
|
|
|
|
} ProtectSystem;
|
2014-06-03 23:41:44 +02:00
|
|
|
|
2020-08-06 12:51:50 +02:00
|
|
|
/* ProtectProc modes. Each non-default value is annotated with the "hidepid="
 * setting it stands for; PROTECT_PROC_DEFAULT carries no such annotation.
 *
 * The real values are numbered from 0. _PROTECT_PROC_MAX directly follows the last
 * real value and therefore equals the number of real values;
 * _PROTECT_PROC_INVALID is an explicit negative sentinel. */
typedef enum ProtectProc {
|
|
|
|
PROTECT_PROC_DEFAULT,
|
|
|
|
PROTECT_PROC_NOACCESS, /* hidepid=noaccess */
|
|
|
|
PROTECT_PROC_INVISIBLE, /* hidepid=invisible */
|
|
|
|
PROTECT_PROC_PTRACEABLE, /* hidepid=ptraceable */
|
|
|
|
_PROTECT_PROC_MAX, /* count of real values, not a mode itself */
|
|
|
|
_PROTECT_PROC_INVALID = -1,
|
|
|
|
} ProtectProc;
|
|
|
|
|
|
|
|
/* ProcSubset modes: PROC_SUBSET_ALL, or PROC_SUBSET_PID which is annotated as
 * standing for the "subset=pid" setting.
 *
 * The real values are numbered from 0. _PROC_SUBSET_MAX directly follows the last
 * real value and therefore equals the number of real values;
 * _PROC_SUBSET_INVALID is an explicit negative sentinel. */
typedef enum ProcSubset {
|
|
|
|
PROC_SUBSET_ALL,
|
|
|
|
PROC_SUBSET_PID, /* subset=pid */
|
|
|
|
_PROC_SUBSET_MAX, /* count of real values, not a mode itself */
|
|
|
|
_PROC_SUBSET_INVALID = -1,
|
|
|
|
} ProcSubset;
|
|
|
|
|
2017-10-10 09:49:20 +02:00
|
|
|
/* Settings bundle handed to setup_namespace() through its
 * "const NamespaceInfo *ns_info" parameter: nine independent boolean switches
 * followed by one value each of the ProtectHome, ProtectSystem, ProtectProc and
 * ProcSubset enums declared above.
 *
 * The booleans are plain "bool" members rather than bitfields.
 * NOTE(review): what each switch enables is decided by the implementation, which
 * is not visible from this header — check there before relying on a field. */
struct NamespaceInfo {
|
core/namespace: drop bitfield annotations from boolean fields
Such microoptimization makes sense when the structure is used in many many copies,
but here's it's not, and the few bytes we save are not worth the extra code the
compiler has to generate:
return ns_info->mount_apivfs ||
ns_info->protect_control_groups ||
ns_info->protect_kernel_tunables ||
...
before:
49b187: 48 8b 45 f8 mov -0x8(%rbp),%rax
49b18b: 0f b6 00 movzbl (%rax),%eax
49b18e: 83 e0 80 and $0xffffff80,%eax
49b191: 84 c0 test %al,%al
49b193: 75 32 jne 49b1c7 <namespace_info_mount_apivfs+0x80>
49b195: 48 8b 45 f8 mov -0x8(%rbp),%rax
49b199: 0f b6 00 movzbl (%rax),%eax
49b19c: 83 e0 08 and $0x8,%eax
49b19f: 84 c0 test %al,%al
49b1a1: 75 24 jne 49b1c7 <namespace_info_mount_apivfs+0x80>
49b1a3: 48 8b 45 f8 mov -0x8(%rbp),%rax
49b1a7: 0f b6 00 movzbl (%rax),%eax
49b1aa: 83 e0 10 and $0x10,%eax
49b1ad: 84 c0 test %al,%al
49b1af: 75 16 jne 49b1c7 <namespace_info_mount_apivfs+0x80>
after:
49b024: 48 8b 45 f8 mov -0x8(%rbp),%rax
49b028: 0f b6 40 07 movzbl 0x7(%rax),%eax
49b02c: 84 c0 test %al,%al
49b02e: 75 2e jne 49b05e <namespace_info_mount_apivfs+0x7a>
49b030: 48 8b 45 f8 mov -0x8(%rbp),%rax
49b034: 0f b6 40 03 movzbl 0x3(%rax),%eax
49b038: 84 c0 test %al,%al
49b03a: 75 22 jne 49b05e <namespace_info_mount_apivfs+0x7a>
49b03c: 48 8b 45 f8 mov -0x8(%rbp),%rax
49b040: 0f b6 40 04 movzbl 0x4(%rax),%eax
49b044: 84 c0 test %al,%al
49b046: 75 16 jne 49b05e <namespace_info_mount_apivfs+0x7a>
2020-09-22 12:48:25 +02:00
|
|
|
bool ignore_protect_paths;
|
|
|
|
bool private_dev;
|
|
|
|
bool private_mounts;
|
|
|
|
bool protect_control_groups;
|
|
|
|
bool protect_kernel_tunables;
|
|
|
|
bool protect_kernel_modules;
|
|
|
|
bool protect_kernel_logs;
|
|
|
|
bool mount_apivfs;
|
|
|
|
bool protect_hostname;
|
2020-08-06 11:32:53 +02:00
|
|
|
ProtectHome protect_home;
|
|
|
|
ProtectSystem protect_system;
|
2020-08-06 12:51:50 +02:00
|
|
|
ProtectProc protect_proc;
|
|
|
|
ProcSubset proc_subset;
|
2016-10-12 14:11:16 +02:00
|
|
|
};
|
|
|
|
|
2016-11-23 22:21:40 +01:00
|
|
|
/* One bind-mount entry: a source string, a destination string and four boolean
 * flags (read_only, nosuid, recursive, ignore_enoent).
 *
 * setup_namespace() accepts an array of these together with its element count
 * (bind_mounts / n_bind_mounts); bind_mount_add() and bind_mount_free_many()
 * operate on such arrays.
 * NOTE(review): source/destination are presumably file system paths, and
 * ignore_enoent presumably tolerates a missing source — confirm in the
 * implementation. */
struct BindMount {
|
|
|
|
char *source;
|
|
|
|
char *destination;
|
core/namespace: drop bitfield annotations from boolean fields
Such microoptimization makes sense when the structure is used in many many copies,
but here's it's not, and the few bytes we save are not worth the extra code the
compiler has to generate:
return ns_info->mount_apivfs ||
ns_info->protect_control_groups ||
ns_info->protect_kernel_tunables ||
...
before:
49b187: 48 8b 45 f8 mov -0x8(%rbp),%rax
49b18b: 0f b6 00 movzbl (%rax),%eax
49b18e: 83 e0 80 and $0xffffff80,%eax
49b191: 84 c0 test %al,%al
49b193: 75 32 jne 49b1c7 <namespace_info_mount_apivfs+0x80>
49b195: 48 8b 45 f8 mov -0x8(%rbp),%rax
49b199: 0f b6 00 movzbl (%rax),%eax
49b19c: 83 e0 08 and $0x8,%eax
49b19f: 84 c0 test %al,%al
49b1a1: 75 24 jne 49b1c7 <namespace_info_mount_apivfs+0x80>
49b1a3: 48 8b 45 f8 mov -0x8(%rbp),%rax
49b1a7: 0f b6 00 movzbl (%rax),%eax
49b1aa: 83 e0 10 and $0x10,%eax
49b1ad: 84 c0 test %al,%al
49b1af: 75 16 jne 49b1c7 <namespace_info_mount_apivfs+0x80>
after:
49b024: 48 8b 45 f8 mov -0x8(%rbp),%rax
49b028: 0f b6 40 07 movzbl 0x7(%rax),%eax
49b02c: 84 c0 test %al,%al
49b02e: 75 2e jne 49b05e <namespace_info_mount_apivfs+0x7a>
49b030: 48 8b 45 f8 mov -0x8(%rbp),%rax
49b034: 0f b6 40 03 movzbl 0x3(%rax),%eax
49b038: 84 c0 test %al,%al
49b03a: 75 22 jne 49b05e <namespace_info_mount_apivfs+0x7a>
49b03c: 48 8b 45 f8 mov -0x8(%rbp),%rax
49b040: 0f b6 40 04 movzbl 0x4(%rax),%eax
49b044: 84 c0 test %al,%al
49b046: 75 16 jne 49b05e <namespace_info_mount_apivfs+0x7a>
2020-09-22 12:48:25 +02:00
|
|
|
bool read_only;
|
|
|
|
bool nosuid;
|
|
|
|
bool recursive;
|
|
|
|
bool ignore_enoent;
|
2016-11-23 22:21:40 +01:00
|
|
|
};
|
|
|
|
|
2018-02-21 01:17:52 +01:00
|
|
|
/* One temporary-file-system entry: a path string and an options string.
 *
 * setup_namespace() accepts an array of these together with its element count
 * (temporary_filesystems / n_temporary_filesystems); temporary_filesystem_add()
 * appends an entry from a (path, options) pair.
 * NOTE(review): "options" is presumably a mount option string — confirm in the
 * implementation. */
struct TemporaryFileSystem {
|
|
|
|
char *path;
|
|
|
|
char *options;
|
|
|
|
};
|
|
|
|
|
2020-07-14 17:18:41 +02:00
|
|
|
/* One mount-image entry: a source string, a destination string, a mount_options
 * member declared through the LIST_HEAD macro, and an ignore_enoent flag.
 *
 * setup_namespace() accepts an array of these together with its element count
 * (mount_images / n_mount_images).
 * NOTE(review): LIST_HEAD is a project macro not defined in this header;
 * presumably mount_options is the head of a list of MountOptions items — confirm
 * against the macro's definition. */
struct MountImage {
|
|
|
|
char *source;
|
|
|
|
char *destination;
|
2020-07-31 16:06:15 +02:00
|
|
|
LIST_HEAD(MountOptions, mount_options);
|
2020-07-14 17:18:41 +02:00
|
|
|
bool ignore_enoent;
|
|
|
|
};
|
|
|
|
|
2016-11-23 22:21:40 +01:00
|
|
|
int setup_namespace(
|
|
|
|
const char *root_directory,
|
2016-12-23 14:26:05 +01:00
|
|
|
const char *root_image,
|
2020-06-29 14:19:31 +02:00
|
|
|
const MountOptions *root_image_options,
|
2017-10-10 09:49:20 +02:00
|
|
|
const NamespaceInfo *ns_info,
|
2016-11-23 22:21:40 +01:00
|
|
|
char **read_write_paths,
|
|
|
|
char **read_only_paths,
|
|
|
|
char **inaccessible_paths,
|
execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=
Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.
Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:
/var/lib/foo (created as directory)
Now, if DynamicUser=1 is set, we'll instead get this on the host:
/var/lib/private (created as directory with mode 0700, root:root)
/var/lib/private/foo (created as directory)
/var/lib/foo → private/foo (created as symlink)
And from inside the unit:
/var/lib/private (a tmpfs mount with mode 0755, root:root)
/var/lib/private/foo (bind mounted from the host)
/var/lib/foo → private/foo (the same symlink as above)
This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in. This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.
This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.
With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too working similar to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.
2017-09-28 18:55:45 +02:00
|
|
|
char **empty_directories,
|
2016-11-23 22:21:40 +01:00
|
|
|
const BindMount *bind_mounts,
|
tree-wide: be more careful with the type of array sizes
Previously we were a bit sloppy with the index and size types of arrays,
we'd regularly use unsigned. While I don't think this ever resulted in
real issues I think we should be more careful there and follow a
stricter regime: unless there's a strong reason not to use size_t for
array sizes and indexes, size_t it should be. Any allocations we do
ultimately will use size_t anyway, and converting forth and back between
unsigned and size_t will always be a source of problems.
Note that on 32bit machines "unsigned" and "size_t" are equivalent, and
on 64bit machines our arrays shouldn't grow that large anyway, and if
they do we have a problem, however that kind of overly large allocation
we have protections for usually, but for overflows we do not have that
so much, hence let's add it.
So yeah, it's a story of the current code being already "good enough",
but I think some extra type hygiene is better.
This patch tries to be comprehensive, but it probably isn't and I missed
a few cases. But I guess we can cover that later as we notice it. Among
smaller fixes, this changes:
1. strv_length()'s return type becomes size_t
2. the unit file changes array size becomes size_t
3. DNS answer and query array sizes become size_t
Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=76745
2018-04-27 14:09:31 +02:00
|
|
|
size_t n_bind_mounts,
|
2018-02-21 01:17:52 +01:00
|
|
|
const TemporaryFileSystem *temporary_filesystems,
|
tree-wide: be more careful with the type of array sizes
Previously we were a bit sloppy with the index and size types of arrays,
we'd regularly use unsigned. While I don't think this ever resulted in
real issues I think we should be more careful there and follow a
stricter regime: unless there's a strong reason not to use size_t for
array sizes and indexes, size_t it should be. Any allocations we do
ultimately will use size_t anyway, and converting forth and back between
unsigned and size_t will always be a source of problems.
Note that on 32bit machines "unsigned" and "size_t" are equivalent, and
on 64bit machines our arrays shouldn't grow that large anyway, and if
they do we have a problem, however that kind of overly large allocation
we have protections for usually, but for overflows we do not have that
so much, hence let's add it.
So yeah, it's a story of the current code being already "good enough",
but I think some extra type hygiene is better.
This patch tries to be comprehensive, but it probably isn't and I missed
a few cases. But I guess we can cover that later as we notice it. Among
smaller fixes, this changes:
1. strv_length()'s return type becomes size_t
2. the unit file changes array size becomes size_t
3. DNS answer and query array sizes become size_t
Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=76745
2018-04-27 14:09:31 +02:00
|
|
|
size_t n_temporary_filesystems,
|
2020-07-14 17:18:41 +02:00
|
|
|
const MountImage *mount_images,
|
|
|
|
size_t n_mount_images,
|
2016-11-23 22:21:40 +01:00
|
|
|
const char *tmp_dir,
|
|
|
|
const char *var_tmp_dir,
|
2020-08-14 15:54:48 +02:00
|
|
|
const char *creds_path,
|
2019-11-25 16:22:45 +01:00
|
|
|
const char *log_namespace,
|
2016-12-23 14:26:05 +01:00
|
|
|
unsigned long mount_flags,
|
2020-06-03 10:50:45 +02:00
|
|
|
const void *root_hash,
|
|
|
|
size_t root_hash_size,
|
|
|
|
const char *root_hash_path,
|
2020-06-08 15:02:55 +02:00
|
|
|
const void *root_hash_sig,
|
|
|
|
size_t root_hash_sig_size,
|
|
|
|
const char *root_hash_sig_path,
|
2020-06-03 10:50:45 +02:00
|
|
|
const char *root_verity,
|
2019-05-21 20:02:34 +02:00
|
|
|
DissectImageFlags dissected_image_flags,
|
|
|
|
char **error_path);
|
2016-11-23 22:21:40 +01:00
|
|
|
|
2020-06-28 19:54:49 +02:00
|
|
|
/* Special directory path. namespace_cleanup_tmpdir() compares its argument against
 * this string and does not rmdir() it when they are equal. */
#define RUN_SYSTEMD_EMPTY "/run/systemd/empty"
|
|
|
|
|
|
|
|
/* Disposes of a temporary directory path string.
 *
 * Unless p equals RUN_SYSTEMD_EMPTY, rmdir() is attempted on it; the result of
 * rmdir() is explicitly discarded. The string p is then passed to free() in
 * either case, so the caller must not use p afterwards.
 *
 * NOTE(review): PROTECT_ERRNO is a project macro not defined here; presumably it
 * preserves the caller's errno across this function — confirm in macro.h.
 * NOTE(review): streq_ptr() is a project helper; presumably a NULL-tolerant string
 * comparison — confirm in string-util.h. */
static inline void namespace_cleanup_tmpdir(char *p) {
|
|
|
|
PROTECT_ERRNO;
|
|
|
|
if (!streq_ptr(p, RUN_SYSTEMD_EMPTY))
|
|
|
|
(void) rmdir(p);
|
|
|
|
free(p);
|
|
|
|
}
|
|
|
|
/* NOTE(review): project macro; presumably generates a pointer-taking wrapper
 * around namespace_cleanup_tmpdir() for use with cleanup attributes — confirm
 * against the macro's definition. */
DEFINE_TRIVIAL_CLEANUP_FUNC(char*, namespace_cleanup_tmpdir);
|
|
|
|
|
2016-11-23 22:21:40 +01:00
|
|
|
/* Declaration only; the definition lives elsewhere.
 *
 * Takes an id string and two "char **" parameters, tmp_dir and var_tmp_dir, and
 * returns an int.
 * NOTE(review): presumably tmp_dir/var_tmp_dir are output parameters receiving
 * newly allocated path strings (suitable for namespace_cleanup_tmpdir()), and the
 * return value is 0 or a negative errno-style code — confirm in the definition. */
int setup_tmp_dirs(
|
|
|
|
const char *id,
|
|
|
|
char **tmp_dir,
|
|
|
|
char **var_tmp_dir);
|
2013-11-27 20:23:18 +01:00
|
|
|
|
2019-07-12 16:39:07 +02:00
|
|
|
/* Declarations only; the definitions live elsewhere.
 *
 * Both functions take netns_storage_socket, declared "[static 2]": the caller must
 * pass a pointer to at least two ints. open_netns_path() additionally takes a
 * path string.
 * NOTE(review): presumably the two ints are a socket pair used to store a network
 * namespace reference, and the return value is an errno-style int — confirm in
 * the definitions. */
int setup_netns(const int netns_storage_socket[static 2]);
|
|
|
|
int open_netns_path(const int netns_storage_socket[static 2], const char *path);
|
2014-06-03 23:41:44 +02:00
|
|
|
|
2014-06-04 18:07:55 +02:00
|
|
|
/* Conversions between ProtectHome values and strings (declarations only).
 * NOTE(review): _const_ and _pure_ are project attribute macros not defined here.
 * Presumably the from_string variant yields _PROTECT_HOME_INVALID for an unknown
 * string and the to_string variant yields NULL for an out-of-range value —
 * confirm in the definitions. */
const char* protect_home_to_string(ProtectHome p) _const_;
|
|
|
|
ProtectHome protect_home_from_string(const char *s) _pure_;
|
|
|
|
|
|
|
|
/* Conversions between ProtectSystem values and strings (declarations only).
 * NOTE(review): _const_ and _pure_ are project attribute macros not defined here.
 * Presumably the from_string variant yields _PROTECT_SYSTEM_INVALID for an unknown
 * string and the to_string variant yields NULL for an out-of-range value —
 * confirm in the definitions. */
const char* protect_system_to_string(ProtectSystem p) _const_;
|
|
|
|
ProtectSystem protect_system_from_string(const char *s) _pure_;
|
2016-11-23 22:21:40 +01:00
|
|
|
|
2020-08-06 12:51:50 +02:00
|
|
|
/* Conversions between ProtectProc values and strings (declarations only).
 * NOTE(review): _const_ and _pure_ are project attribute macros not defined here.
 * Presumably the from_string variant yields _PROTECT_PROC_INVALID for an unknown
 * string and the to_string variant yields NULL for an out-of-range value —
 * confirm in the definitions. */
const char* protect_proc_to_string(ProtectProc i) _const_;
|
|
|
|
ProtectProc protect_proc_from_string(const char *s) _pure_;
|
|
|
|
|
|
|
|
/* Conversions between ProcSubset values and strings (declarations only).
 * NOTE(review): _const_ and _pure_ are project attribute macros not defined here.
 * Presumably the from_string variant yields _PROC_SUBSET_INVALID for an unknown
 * string and the to_string variant yields NULL for an out-of-range value —
 * confirm in the definitions. */
const char* proc_subset_to_string(ProcSubset i) _const_;
|
|
|
|
ProcSubset proc_subset_from_string(const char *s) _pure_;
|
|
|
|
|
tree-wide: be more careful with the type of array sizes
Previously we were a bit sloppy with the index and size types of arrays,
we'd regularly use unsigned. While I don't think this ever resulted in
real issues I think we should be more careful there and follow a
stricter regime: unless there's a strong reason not to use size_t for
array sizes and indexes, size_t it should be. Any allocations we do
ultimately will use size_t anyway, and converting forth and back between
unsigned and size_t will always be a source of problems.
Note that on 32bit machines "unsigned" and "size_t" are equivalent, and
on 64bit machines our arrays shouldn't grow that large anyway, and if
they do we have a problem, however that kind of overly large allocation
we have protections for usually, but for overflows we do not have that
so much, hence let's add it.
So yeah, it's a story of the current code being already "good enough",
but I think some extra type hygiene is better.
This patch tries to be comprehensive, but it probably isn't and I missed
a few cases. But I guess we can cover that later as we notice it. Among
smaller fixes, this changes:
1. strv_length()'s return type becomes size_t
2. the unit file changes array size becomes size_t
3. DNS answer and query array sizes become size_t
Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=76745
2018-04-27 14:09:31 +02:00
|
|
|
/* Helpers for arrays of BindMount (declarations only).
 *
 * bind_mount_free_many() takes the array and its element count by value.
 * bind_mount_add() takes the array pointer and the count by reference, plus a
 * const item, and returns an int.
 * NOTE(review): presumably bind_mount_add() grows *b, stores a copy of *item and
 * increments *n, returning 0 or a negative errno-style code — confirm in the
 * definition. */
void bind_mount_free_many(BindMount *b, size_t n);
|
|
|
|
int bind_mount_add(BindMount **b, size_t *n, const BindMount *item);
|
2017-10-10 09:46:13 +02:00
|
|
|
|
tree-wide: be more careful with the type of array sizes
Previously we were a bit sloppy with the index and size types of arrays,
we'd regularly use unsigned. While I don't think this ever resulted in
real issues I think we should be more careful there and follow a
stricter regime: unless there's a strong reason not to use size_t for
array sizes and indexes, size_t it should be. Any allocations we do
ultimately will use size_t anyway, and converting forth and back between
unsigned and size_t will always be a source of problems.
Note that on 32bit machines "unsigned" and "size_t" are equivalent, and
on 64bit machines our arrays shouldn't grow that large anyway, and if
they do we have a problem, however that kind of overly large allocation
we have protections for usually, but for overflows we do not have that
so much, hence let's add it.
So yeah, it's a story of the current code being already "good enough",
but I think some extra type hygiene is better.
This patch tries to be comprehensive, but it probably isn't and I missed
a few cases. But I guess we can cover that later as we notice it. Among
smaller fixes, this changes:
1. strv_length()'s return type becomes size_t
2. the unit file changes array size becomes size_t
3. DNS answer and query array sizes become size_t
Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=76745
2018-04-27 14:09:31 +02:00
|
|
|
/* Helpers for arrays of TemporaryFileSystem (declarations only).
 *
 * temporary_filesystem_free_many() takes the array and its element count by value.
 * temporary_filesystem_add() takes the array pointer and the count by reference,
 * plus a path string and an options string, and returns an int.
 * NOTE(review): presumably temporary_filesystem_add() grows *t, stores copies of
 * the two strings and increments *n, returning 0 or a negative errno-style code —
 * confirm in the definition. */
void temporary_filesystem_free_many(TemporaryFileSystem *t, size_t n);
|
|
|
|
int temporary_filesystem_add(TemporaryFileSystem **t, size_t *n,
|
2018-02-21 01:17:52 +01:00
|
|
|
const char *path, const char *options);
|
|
|
|
|
2020-07-14 17:18:41 +02:00
|
|
|
/* Helpers for arrays of MountImage (declarations only).
 *
 * Note the signature difference from bind_mount_free_many() and
 * temporary_filesystem_free_many(): mount_image_free_many() receives the element
 * count through a pointer and returns a MountImage*.
 * NOTE(review): presumably it resets *n to 0 and returns NULL so the caller can
 * assign the result back to its array variable — confirm in the definition.
 * mount_image_add() takes the array pointer and the count by reference, plus a
 * const item, and returns an int (presumably 0 or a negative errno-style code). */
MountImage* mount_image_free_many(MountImage *m, size_t *n);
|
|
|
|
int mount_image_add(MountImage **m, size_t *n, const MountImage *item);
|
|
|
|
|
2017-10-10 09:46:13 +02:00
|
|
|
/* Conversions between NamespaceType values and strings (declarations only).
 * NOTE(review): _const_ and _pure_ are project attribute macros not defined here.
 * Presumably the from_string variant yields _NAMESPACE_TYPE_INVALID for an unknown
 * string and the to_string variant yields NULL for an out-of-range value —
 * confirm in the definitions. */
const char* namespace_type_to_string(NamespaceType t) _const_;
|
|
|
|
NamespaceType namespace_type_from_string(const char *s) _pure_;
|
|
|
|
|
|
|
|
/* Declaration only: takes a NamespaceType and returns a bool.
 * NOTE(review): presumably reports whether the running kernel supports the given
 * namespace type — confirm in the definition. */
bool ns_type_supported(NamespaceType type);
|