Systemd/src/core/execute.h

387 lines
11 KiB
C
Raw Normal View History

/* SPDX-License-Identifier: LGPL-2.1+ */
#pragma once
typedef struct ExecStatus ExecStatus;
typedef struct ExecCommand ExecCommand;
typedef struct ExecContext ExecContext;
typedef struct ExecRuntime ExecRuntime;
typedef struct ExecParameters ExecParameters;
typedef struct Manager Manager;
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>
#include <sys/capability.h>
#include "cgroup-util.h"
#include "fdset.h"
#include "list.h"
#include "missing.h"
#include "namespace.h"
#include "nsflags.h"
#define EXEC_STDIN_DATA_MAX (64U*1024U*1024U)
typedef enum ExecUtmpMode {
EXEC_UTMP_INIT,
EXEC_UTMP_LOGIN,
EXEC_UTMP_USER,
_EXEC_UTMP_MODE_MAX,
_EXEC_UTMP_MODE_INVALID = -1
} ExecUtmpMode;
typedef enum ExecInput {
EXEC_INPUT_NULL,
EXEC_INPUT_TTY,
EXEC_INPUT_TTY_FORCE,
EXEC_INPUT_TTY_FAIL,
EXEC_INPUT_SOCKET,
EXEC_INPUT_NAMED_FD,
EXEC_INPUT_DATA,
EXEC_INPUT_FILE,
_EXEC_INPUT_MAX,
_EXEC_INPUT_INVALID = -1
} ExecInput;
2010-01-28 02:06:20 +01:00
typedef enum ExecOutput {
EXEC_OUTPUT_INHERIT,
EXEC_OUTPUT_NULL,
EXEC_OUTPUT_TTY,
EXEC_OUTPUT_SYSLOG,
EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
EXEC_OUTPUT_KMSG,
EXEC_OUTPUT_KMSG_AND_CONSOLE,
EXEC_OUTPUT_JOURNAL,
EXEC_OUTPUT_JOURNAL_AND_CONSOLE,
EXEC_OUTPUT_SOCKET,
EXEC_OUTPUT_NAMED_FD,
EXEC_OUTPUT_FILE,
EXEC_OUTPUT_FILE_APPEND,
_EXEC_OUTPUT_MAX,
_EXEC_OUTPUT_INVALID = -1
2010-01-28 02:06:20 +01:00
} ExecOutput;
typedef enum ExecPreserveMode {
EXEC_PRESERVE_NO,
EXEC_PRESERVE_YES,
EXEC_PRESERVE_RESTART,
_EXEC_PRESERVE_MODE_MAX,
_EXEC_PRESERVE_MODE_INVALID = -1
} ExecPreserveMode;
typedef enum ExecKeyringMode {
EXEC_KEYRING_INHERIT,
EXEC_KEYRING_PRIVATE,
EXEC_KEYRING_SHARED,
_EXEC_KEYRING_MODE_MAX,
_EXEC_KEYRING_MODE_INVALID = -1,
} ExecKeyringMode;
struct ExecStatus {
dual_timestamp start_timestamp;
dual_timestamp exit_timestamp;
pid_t pid;
2010-01-24 00:39:29 +01:00
int code; /* as in siginfo_t::si_code */
int status; /* as in sigingo_t::si_status */
};
typedef enum ExecCommandFlags {
EXEC_COMMAND_IGNORE_FAILURE = 1,
EXEC_COMMAND_FULLY_PRIVILEGED = 2,
EXEC_COMMAND_NO_SETUID = 4,
EXEC_COMMAND_AMBIENT_MAGIC = 8,
} ExecCommandFlags;
struct ExecCommand {
char *path;
char **argv;
ExecStatus exec_status;
ExecCommandFlags flags;
LIST_FIELDS(ExecCommand, command); /* useful for chaining commands */
};
struct ExecRuntime {
int n_ref;
Manager *manager;
/* unit id of the owner */
char *id;
char *tmp_dir;
char *var_tmp_dir;
/* An AF_UNIX socket pair, that contains a datagram containing a file descriptor referring to the network
* namespace. */
int netns_storage_socket[2];
};
typedef enum ExecDirectoryType {
EXEC_DIRECTORY_RUNTIME = 0,
EXEC_DIRECTORY_STATE,
EXEC_DIRECTORY_CACHE,
EXEC_DIRECTORY_LOGS,
EXEC_DIRECTORY_CONFIGURATION,
_EXEC_DIRECTORY_TYPE_MAX,
_EXEC_DIRECTORY_TYPE_INVALID = -1,
} ExecDirectoryType;
typedef struct ExecDirectory {
char **paths;
mode_t mode;
} ExecDirectory;
struct ExecContext {
char **environment;
char **environment_files;
char **pass_environment;
char **unset_environment;
struct rlimit *rlimit[_RLIMIT_MAX];
char *working_directory, *root_directory, *root_image;
bool working_directory_missing_ok;
bool working_directory_home;
mode_t umask;
int oom_score_adjust;
int nice;
int ioprio;
int cpu_sched_policy;
int cpu_sched_priority;
2010-07-04 16:44:58 +02:00
cpu_set_t *cpuset;
unsigned cpuset_ncpus;
ExecInput std_input;
ExecOutput std_output;
ExecOutput std_error;
char *stdio_fdname[3];
char *stdio_file[3];
void *stdin_data;
size_t stdin_data_size;
nsec_t timer_slack_nsec;
2010-01-28 02:06:20 +01:00
bool stdio_as_fds;
char *tty_path;
bool tty_reset;
bool tty_vhangup;
bool tty_vt_disallocate;
2012-02-09 03:18:04 +01:00
bool ignore_sigpipe;
/* Since resolving these names might involve socket
* connections and we don't want to deadlock ourselves these
* names are resolved on execution only and in the child
* process. */
char *user;
char *group;
char **supplementary_groups;
char *pam_name;
char *utmp_id;
ExecUtmpMode utmp_mode;
bool selinux_context_ignore;
char *selinux_context;
bool apparmor_profile_ignore;
char *apparmor_profile;
bool smack_process_label_ignore;
char *smack_process_label;
ExecKeyringMode keyring_mode;
char **read_write_paths, **read_only_paths, **inaccessible_paths;
unsigned long mount_flags;
BindMount *bind_mounts;
size_t n_bind_mounts;
TemporaryFileSystem *temporary_filesystems;
size_t n_temporary_filesystems;
uint64_t capability_bounding_set;
uint64_t capability_ambient_set;
int secure_bits;
int syslog_priority;
char *syslog_identifier;
bool syslog_level_prefix;
core: implement /run/systemd/units/-based path for passing unit info from PID 1 to journald And let's make use of it to implement two new unit settings with it: 1. LogLevelMax= is a new per-unit setting that may be used to configure log priority filtering: set it to LogLevelMax=notice and only messages of level "notice" and lower (i.e. more important) will be processed, all others are dropped. 2. LogExtraFields= is a new per-unit setting for configuring per-unit journal fields, that are implicitly included in every log record generated by the unit's processes. It takes field/value pairs in the form of FOO=BAR. Also, related to this, one exisiting unit setting is ported to this new facility: 3. The invocation ID is now pulled from /run/systemd/units/ instead of cgroupfs xattrs. This substantially relaxes requirements of systemd on the kernel version and the privileges it runs with (specifically, cgroupfs xattrs are not available in containers, since they are stored in kernel memory, and hence are unsafe to permit to lesser privileged code). /run/systemd/units/ is a new directory, which contains a number of files and symlinks encoding the above information. PID 1 creates and manages these files, and journald reads them from there. Note that this is supposed to be a direct path between PID 1 and the journal only, due to the special runtime environment the journal runs in. Normally, today we shouldn't introduce new interfaces that (mis-)use a file system as IPC framework, and instead just an IPC system, but this is very hard to do between the journal and PID 1, as long as the IPC system is a subject PID 1 manages, and itself a client to the journal. This patch cleans up a couple of types used in journal code: specifically we switch to size_t for a couple of memory-sizing values, as size_t is the right choice for everything that is memory. Fixes: #4089 Fixes: #3041 Fixes: #4441
2017-11-02 19:43:32 +01:00
int log_level_max;
struct iovec* log_extra_fields;
size_t n_log_extra_fields;
bool cpu_sched_reset_on_fork;
bool non_blocking;
bool private_tmp;
bool private_network;
bool private_devices;
bool private_users;
core: add new PrivateMounts= unit setting This new setting is supposed to be useful in most cases where "MountFlags=slave" is currently used, i.e. as an explicit way to run a service in its own mount namespace and decouple propagation from all mounts of the new mount namespace towards the host. The effect of MountFlags=slave and PrivateMounts=yes is mostly the same, as both cause a CLONE_NEWNS namespace to be opened, and both will result in all mounts within it to be mounted MS_SLAVE. The difference is mostly on the conceptual/philosophical level: configuring the propagation mode is nothing people should have to think about, in particular as the matter is not precisely easyto grok. Moreover, MountFlags= allows configuration of "private" and "slave" modes which don't really make much sense to use in real-life and are quite confusing. In particular PrivateMounts=private means mounts made on the host stay pinned for good by the service which is particularly nasty for removable media mount. And PrivateMounts=shared is in most ways a NOP when used a alone... The main technical difference between setting only MountFlags=slave or only PrivateMounts=yes in a unit file is that the former remounts all mounts to MS_SLAVE and leaves them there, while that latter remounts them to MS_SHARED again right after. The latter is generally a nicer approach, since it disables propagation, while MS_SHARED is afterwards in effect, which is really nice as that means further namespacing down the tree will get MS_SHARED logic by default and we unify how applications see our mounts as we always pass them as MS_SHARED regardless whether any mount namespacing is used or not. The effect of PrivateMounts=yes was implied already by all the other mount namespacing options. With this new option we add an explicit knob for it, to request it without any other option used as well. See: #4393
2018-06-01 11:10:49 +02:00
bool private_mounts;
ProtectSystem protect_system;
ProtectHome protect_home;
bool protect_kernel_tunables;
bool protect_kernel_modules;
bool protect_control_groups;
bool mount_apivfs;
bool no_new_privileges;
bool dynamic_user;
bool remove_ipc;
/* This is not exposed to the user but available
* internally. We need it to make sure that whenever we spawn
* /usr/bin/mount it is run in the same process group as us so
* that the autofs logic detects that it belongs to us and we
* don't enter a trigger loop. */
bool same_pgrp;
unsigned long personality;
bool lock_personality;
unsigned long restrict_namespaces; /* The CLONE_NEWxyz flags permitted to the unit's processes */
Hashmap *syscall_filter;
Set *syscall_archs;
int syscall_errno;
bool syscall_whitelist:1;
Set *address_families;
bool address_families_whitelist:1;
ExecPreserveMode runtime_directory_preserve_mode;
ExecDirectory directories[_EXEC_DIRECTORY_TYPE_MAX];
bool memory_deny_write_execute;
bool restrict_realtime;
bool oom_score_adjust_set:1;
bool nice_set:1;
bool ioprio_set:1;
bool cpu_sched_set:1;
};
static inline bool exec_context_restrict_namespaces_set(const ExecContext *c) {
assert(c);
return (c->restrict_namespaces & NAMESPACE_FLAGS_ALL) != NAMESPACE_FLAGS_ALL;
}
typedef enum ExecFlags {
EXEC_APPLY_SANDBOXING = 1 << 0,
EXEC_APPLY_CHROOT = 1 << 1,
EXEC_APPLY_TTY_STDIN = 1 << 2,
EXEC_NEW_KEYRING = 1 << 3,
EXEC_PASS_LOG_UNIT = 1 << 4, /* Whether to pass the unit name to the service's journal stream connection */
EXEC_CHOWN_DIRECTORIES = 1 << 5, /* chown() the runtime/state/cache/log directories to the user we run as, under all conditions */
EXEC_NSS_BYPASS_BUS = 1 << 6, /* Set the SYSTEMD_NSS_BYPASS_BUS environment variable, to disable nss-systemd for dbus */
EXEC_CGROUP_DELEGATE = 1 << 7,
/* The following are not used by execute.c, but by consumers internally */
EXEC_PASS_FDS = 1 << 8,
EXEC_IS_CONTROL = 1 << 9,
EXEC_SETENV_RESULT = 1 << 10,
EXEC_SET_WATCHDOG = 1 << 11,
} ExecFlags;
struct ExecParameters {
char **argv;
char **environment;
int *fds;
char **fd_names;
size_t n_storage_fds;
size_t n_socket_fds;
ExecFlags flags;
bool selinux_context_net:1;
core: unified cgroup hierarchy support This patch set adds full support the new unified cgroup hierarchy logic of modern kernels. A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is added. If specified the unified hierarchy is mounted to /sys/fs/cgroup instead of a tmpfs. No further hierarchies are mounted. The kernel command line option defaults to off. We can turn it on by default as soon as the kernel's APIs regarding this are stabilized (but even then downstream distros might want to turn this off, as this will break any tools that access cgroupfs directly). It is possibly to choose for each boot individually whether the unified or the legacy hierarchy is used. nspawn will by default provide the legacy hierarchy to containers if the host is using it, and the unified otherwise. However it is possible to run containers with the unified hierarchy on a legacy host and vice versa, by setting the $UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0, respectively. The unified hierarchy provides reliable cgroup empty notifications for the first time, via inotify. To make use of this we maintain one manager-wide inotify fd, and each cgroup to it. This patch also removes cg_delete() which is unused now. On kernel 4.2 only the "memory" controller is compatible with the unified hierarchy, hence that's the only controller systemd exposes when booted in unified heirarchy mode. This introduces a new enum for enumerating supported controllers, plus a related enum for the mask bits mapping to it. The core is changed to make use of this everywhere. This moves PID 1 into a new "init.scope" implicit scope unit in the root slice. This is necessary since on the unified hierarchy cgroups may either contain subgroups or processes but not both. PID 1 hence has to move out of the root cgroup (strictly speaking the root cgroup is the only one where processes and subgroups are still allowed, but in order to support containers nicey, we move PID 1 into the new scope in all cases.) This new unit is also used on legacy hierarchy setups. It's actually pretty useful on all systems, as it can then be used to filter journal messages coming from PID 1, and so on. The root slice ("-.slice") is now implicitly created and started (and does not require a unit file on disk anymore), since that's where "init.scope" is located and the slice needs to be started before the scope can. To check whether we are in unified or legacy hierarchy mode we use statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in legacy mode, if it reports cgroupfs we are in unified mode. This patch set carefuly makes sure that cgls and cgtop continue to work as desired. When invoking nspawn as a service it will implicitly create two subcgroups in the cgroup it is using, one to move the nspawn process into, the other to move the actual container processes into. This is done because of the requirement that cgroups may either contain processes or other subgroups.
2015-09-01 19:22:36 +02:00
CGroupMask cgroup_supported;
const char *cgroup_path;
char **prefix;
const char *confirm_spawn;
usec_t watchdog_usec;
int *idle_pipe;
int stdin_fd;
int stdout_fd;
int stderr_fd;
};
#include "unit.h"
#include "dynamic-user.h"
core,network: major per-object logging rework This changes log_unit_info() (and friends) to take a real Unit* object insted of just a unit name as parameter. The call will now prefix all logged messages with the unit name, thus allowing the unit name to be dropped from the various passed romat strings, simplifying invocations drastically, and unifying log output across messages. Also, UNIT= vs. USER_UNIT= is now derived from the Manager object attached to the Unit object, instead of getpid(). This has the benefit of correcting the field for --test runs. Also contains a couple of other logging improvements: - Drops a couple of strerror() invocations in favour of using %m. - Not only .mount units now warn if a symlinks exist for the mount point already, .automount units do that too, now. - A few invocations of log_struct() that didn't actually pass any additional structured data have been replaced by simpler invocations of log_unit_info() and friends. - For structured data a new LOG_UNIT_MESSAGE() macro has been added, that works like LOG_MESSAGE() but prefixes the message with the unit name. Similar, there's now LOG_LINK_MESSAGE() and LOG_NETDEV_MESSAGE(). - For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(), LOG_NETDEV_INTERFACE() macros have been added that generate the necessary per object fields. The old log_unit_struct() call has been removed in favour of these new macros used in raw log_struct() invocations. In addition to removing one more function call this allows generated structured log messages that contain two object fields, as necessary for example for network interfaces that are joined into another network interface, and whose messages shall be indexed by both. - The LOG_ERRNO() macro has been removed, in favour of log_struct_errno(). The latter has the benefit of ensuring that %m in format strings is properly resolved to the specified error number. - A number of logging messages have been converted to use log_unit_info() instead of log_info() - The client code in sysv-generator no longer #includes core code from src/core/. - log_unit_full_errno() has been removed, log_unit_full() instead takes an errno now, too. - log_unit_info(), log_link_info(), log_netdev_info() and friends, now avoid double evaluation of their parameters
2015-05-11 20:38:21 +02:00
int exec_spawn(Unit *unit,
ExecCommand *command,
const ExecContext *context,
const ExecParameters *exec_params,
ExecRuntime *runtime,
DynamicCreds *dynamic_creds,
pid_t *ret);
void exec_command_done_array(ExecCommand *c, size_t n);
ExecCommand* exec_command_free_list(ExecCommand *c);
void exec_command_free_array(ExecCommand **c, size_t n);
2010-01-26 07:02:51 +01:00
void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix);
void exec_command_append_list(ExecCommand **l, ExecCommand *e);
int exec_command_set(ExecCommand *c, const char *path, ...);
int exec_command_append(ExecCommand *c, const char *path, ...);
2010-01-26 07:02:51 +01:00
void exec_context_init(ExecContext *c);
void exec_context_done(ExecContext *c);
void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix);
int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_root);
const char* exec_context_fdname(const ExecContext *c, int fd_index);
bool exec_context_may_touch_console(const ExecContext *c);
bool exec_context_maintains_privileges(const ExecContext *c);
int exec_context_get_effective_ioprio(const ExecContext *c);
core: implement /run/systemd/units/-based path for passing unit info from PID 1 to journald And let's make use of it to implement two new unit settings with it: 1. LogLevelMax= is a new per-unit setting that may be used to configure log priority filtering: set it to LogLevelMax=notice and only messages of level "notice" and lower (i.e. more important) will be processed, all others are dropped. 2. LogExtraFields= is a new per-unit setting for configuring per-unit journal fields, that are implicitly included in every log record generated by the unit's processes. It takes field/value pairs in the form of FOO=BAR. Also, related to this, one exisiting unit setting is ported to this new facility: 3. The invocation ID is now pulled from /run/systemd/units/ instead of cgroupfs xattrs. This substantially relaxes requirements of systemd on the kernel version and the privileges it runs with (specifically, cgroupfs xattrs are not available in containers, since they are stored in kernel memory, and hence are unsafe to permit to lesser privileged code). /run/systemd/units/ is a new directory, which contains a number of files and symlinks encoding the above information. PID 1 creates and manages these files, and journald reads them from there. Note that this is supposed to be a direct path between PID 1 and the journal only, due to the special runtime environment the journal runs in. Normally, today we shouldn't introduce new interfaces that (mis-)use a file system as IPC framework, and instead just an IPC system, but this is very hard to do between the journal and PID 1, as long as the IPC system is a subject PID 1 manages, and itself a client to the journal. This patch cleans up a couple of types used in journal code: specifically we switch to size_t for a couple of memory-sizing values, as size_t is the right choice for everything that is memory. Fixes: #4089 Fixes: #3041 Fixes: #4441
2017-11-02 19:43:32 +01:00
void exec_context_free_log_extra_fields(ExecContext *c);
2010-07-04 18:49:58 +02:00
void exec_status_start(ExecStatus *s, pid_t pid);
void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status);
void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix);
int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *name, bool create, ExecRuntime **ret);
ExecRuntime *exec_runtime_unref(ExecRuntime *r, bool destroy);
int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds);
int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds);
void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds);
void exec_runtime_vacuum(Manager *m);
const char* exec_output_to_string(ExecOutput i) _const_;
ExecOutput exec_output_from_string(const char *s) _pure_;
const char* exec_input_to_string(ExecInput i) _const_;
ExecInput exec_input_from_string(const char *s) _pure_;
const char* exec_utmp_mode_to_string(ExecUtmpMode i) _const_;
ExecUtmpMode exec_utmp_mode_from_string(const char *s) _pure_;
const char* exec_preserve_mode_to_string(ExecPreserveMode i) _const_;
ExecPreserveMode exec_preserve_mode_from_string(const char *s) _pure_;
const char* exec_keyring_mode_to_string(ExecKeyringMode i) _const_;
ExecKeyringMode exec_keyring_mode_from_string(const char *s) _pure_;
const char* exec_directory_type_to_string(ExecDirectoryType i) _const_;
ExecDirectoryType exec_directory_type_from_string(const char *s) _pure_;