Systemd/src/core/namespace.h

115 lines
3.3 KiB
C
Raw Normal View History

/* SPDX-License-Identifier: LGPL-2.1+ */
#pragma once
/***
Copyright 2016 Djalal Harouni
***/
typedef struct NamespaceInfo NamespaceInfo;
typedef struct BindMount BindMount;
typedef struct TemporaryFileSystem TemporaryFileSystem;
#include <stdbool.h>
#include "dissect-image.h"
#include "macro.h"
typedef enum ProtectHome {
PROTECT_HOME_NO,
PROTECT_HOME_YES,
PROTECT_HOME_READ_ONLY,
PROTECT_HOME_TMPFS,
_PROTECT_HOME_MAX,
_PROTECT_HOME_INVALID = -1
} ProtectHome;
typedef enum NamespaceType {
NAMESPACE_MOUNT,
NAMESPACE_CGROUP,
NAMESPACE_UTS,
NAMESPACE_IPC,
NAMESPACE_USER,
NAMESPACE_PID,
NAMESPACE_NET,
_NAMESPACE_TYPE_MAX,
_NAMESPACE_TYPE_INVALID = -1,
} NamespaceType;
typedef enum ProtectSystem {
PROTECT_SYSTEM_NO,
PROTECT_SYSTEM_YES,
PROTECT_SYSTEM_FULL,
PROTECT_SYSTEM_STRICT,
_PROTECT_SYSTEM_MAX,
_PROTECT_SYSTEM_INVALID = -1
} ProtectSystem;
struct NamespaceInfo {
bool ignore_protect_paths:1;
bool private_dev:1;
core: add new PrivateMounts= unit setting This new setting is supposed to be useful in most cases where "MountFlags=slave" is currently used, i.e. as an explicit way to run a service in its own mount namespace and decouple propagation from all mounts of the new mount namespace towards the host. The effect of MountFlags=slave and PrivateMounts=yes is mostly the same, as both cause a CLONE_NEWNS namespace to be opened, and both will result in all mounts within it to be mounted MS_SLAVE. The difference is mostly on the conceptual/philosophical level: configuring the propagation mode is nothing people should have to think about, in particular as the matter is not precisely easyto grok. Moreover, MountFlags= allows configuration of "private" and "slave" modes which don't really make much sense to use in real-life and are quite confusing. In particular PrivateMounts=private means mounts made on the host stay pinned for good by the service which is particularly nasty for removable media mount. And PrivateMounts=shared is in most ways a NOP when used a alone... The main technical difference between setting only MountFlags=slave or only PrivateMounts=yes in a unit file is that the former remounts all mounts to MS_SLAVE and leaves them there, while that latter remounts them to MS_SHARED again right after. The latter is generally a nicer approach, since it disables propagation, while MS_SHARED is afterwards in effect, which is really nice as that means further namespacing down the tree will get MS_SHARED logic by default and we unify how applications see our mounts as we always pass them as MS_SHARED regardless whether any mount namespacing is used or not. The effect of PrivateMounts=yes was implied already by all the other mount namespacing options. With this new option we add an explicit knob for it, to request it without any other option used as well. See: #4393
2018-06-01 11:10:49 +02:00
bool private_mounts:1;
bool protect_control_groups:1;
bool protect_kernel_tunables:1;
bool protect_kernel_modules:1;
bool mount_apivfs:1;
};
struct BindMount {
char *source;
char *destination;
bool read_only:1;
bool recursive:1;
bool ignore_enoent:1;
};
struct TemporaryFileSystem {
char *path;
char *options;
};
int setup_namespace(
const char *root_directory,
const char *root_image,
const NamespaceInfo *ns_info,
char **read_write_paths,
char **read_only_paths,
char **inaccessible_paths,
execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage= Let's clean up the interaction of StateDirectory= (and friends) to DynamicUser=1: instead of creating these directories directly below /var/lib, place them in /var/lib/private instead if DynamicUser=1 is set, making that directory 0700 and owned by root:root. This way, if a dynamic UID is later reused, access to the old run's state directory is prohibited for that user. Then, use file system namespacing inside the service to make /var/lib/private a readable tmpfs, hiding all state directories that are not listed in StateDirectory=, and making access to the actual state directory possible. Mount all directories listed in StateDirectory= to the same places inside the service (which means they'll now be mounted into the tmpfs instance). Finally, add a symlink from the state directory name in /var/lib/ to the one in /var/lib/private, so that both the host and the service can access the path under the same location. Here's an example: let's say a service runs with StateDirectory=foo. When DynamicUser=0 is set, it will get the following setup, and no difference between what the unit and what the host sees: /var/lib/foo (created as directory) Now, if DynamicUser=1 is set, we'll instead get this on the host: /var/lib/private (created as directory with mode 0700, root:root) /var/lib/private/foo (created as directory) /var/lib/foo → private/foo (created as symlink) And from inside the unit: /var/lib/private (a tmpfs mount with mode 0755, root:root) /var/lib/private/foo (bind mounted from the host) /var/lib/foo → private/foo (the same symlink as above) This takes inspiration from how container trees are protected below /var/lib/machines: they generally reuse UIDs/GIDs of the host, but because /var/lib/machines itself is set to 0700 host users cannot access files in the container tree even if the UIDs/GIDs are reused. However, for this commit we add one further trick: inside and outside of the unit /var/lib/private is a different thing: outside it is a plain, inaccessible directory, and inside it is a world-readable tmpfs mount with only the whitelisted subdirs below it, bind mounte din. This means, from the outside the dir acts as an access barrier, but from the inside it does not. And the symlink created in /var/lib/foo itself points across the barrier in both cases, so that root and the unit's user always have access to these dirs without knowing the details of this mounting magic. This logic resolves a major shortcoming of DynamicUser=1 units: previously they couldn't safely store persistant data. With this change they can have their own private state, log and data directories, which they can write to, but which are protected from UID recycling. With this change, if RootDirectory= or RootImage= are used it is ensured that the specified state/log/cache directories are always mounted in from the host. This change of semantics I think is much preferable since this means the root directory/image logic can be used easily for read-only resource bundling (as all writable data resides outside of the image). Note that this is a change of behaviour, but given that we haven't released any systemd version with StateDirectory= and friends implemented this should be a safe change to make (in particular as previously it wasn't clear what would actually happen when used in combination). Moreover, by making this change we can later add a "+" modifier to these setings too working similar to the same modifier in ReadOnlyPaths= and friends, making specified paths relative to the container itself.
2017-09-28 18:55:45 +02:00
char **empty_directories,
const BindMount *bind_mounts,
size_t n_bind_mounts,
const TemporaryFileSystem *temporary_filesystems,
size_t n_temporary_filesystems,
const char *tmp_dir,
const char *var_tmp_dir,
ProtectHome protect_home,
ProtectSystem protect_system,
unsigned long mount_flags,
DissectImageFlags dissected_image_flags);
int setup_tmp_dirs(
const char *id,
char **tmp_dir,
char **var_tmp_dir);
int setup_netns(int netns_storage_socket[2]);
const char* protect_home_to_string(ProtectHome p) _const_;
ProtectHome protect_home_from_string(const char *s) _pure_;
ProtectHome protect_home_or_bool_from_string(const char *s);
const char* protect_system_to_string(ProtectSystem p) _const_;
ProtectSystem protect_system_from_string(const char *s) _pure_;
ProtectSystem protect_system_or_bool_from_string(const char *s);
void bind_mount_free_many(BindMount *b, size_t n);
int bind_mount_add(BindMount **b, size_t *n, const BindMount *item);
void temporary_filesystem_free_many(TemporaryFileSystem *t, size_t n);
int temporary_filesystem_add(TemporaryFileSystem **t, size_t *n,
const char *path, const char *options);
const char* namespace_type_to_string(NamespaceType t) _const_;
NamespaceType namespace_type_from_string(const char *s) _pure_;
bool ns_type_supported(NamespaceType type);