Systemd/src/core/unit.h

857 lines
32 KiB
C
Raw Normal View History

/* SPDX-License-Identifier: LGPL-2.1+ */
#pragma once
2010-01-26 21:39:06 +01:00
#include <stdbool.h>
#include <stdlib.h>
#include <unistd.h>
2010-01-26 21:39:06 +01:00
#include "bpf-program.h"
#include "condition.h"
#include "emergency-action.h"
2011-07-31 18:28:02 +02:00
#include "install.h"
#include "list.h"
#include "unit-name.h"
#include "cgroup.h"
2010-01-26 21:39:06 +01:00
typedef struct UnitRef UnitRef;
/* Kinds of kill operation. The regular enumerators are numbered consecutively starting at 0, so that
 * _KILL_OPERATION_MAX equals the number of valid operations; _KILL_OPERATION_INVALID is a negative sentinel that
 * never collides with a valid operation. */
typedef enum KillOperation {
        KILL_TERMINATE = 0,
        KILL_TERMINATE_AND_LOG,
        KILL_KILL,
        KILL_WATCHDOG,

        /* Not operations themselves: element count and "no valid value" marker */
        _KILL_OPERATION_MAX,
        _KILL_OPERATION_INVALID = -1,
} KillOperation;
/* Selects a garbage collection mode for a unit (stored in Unit.collect_mode). The regular enumerators are numbered
 * consecutively starting at 0, so that _COLLECT_MODE_MAX equals the number of valid modes; _COLLECT_MODE_INVALID is
 * a negative sentinel. */
typedef enum CollectMode {
        COLLECT_INACTIVE = 0,
        COLLECT_INACTIVE_OR_FAILED,

        /* Not modes themselves: element count and "no valid value" marker */
        _COLLECT_MODE_MAX,
        _COLLECT_MODE_INVALID = -1,
} CollectMode;
2010-01-26 21:39:06 +01:00
/* Returns true if the active state t is UNIT_ACTIVE or UNIT_RELOADING, false for every other state. */
static inline bool UNIT_IS_ACTIVE_OR_RELOADING(UnitActiveState t) {
        return IN_SET(t, UNIT_ACTIVE, UNIT_RELOADING);
}
/* Returns true if the active state t is UNIT_ACTIVE, UNIT_ACTIVATING or UNIT_RELOADING, false for every other
 * state. Note that, despite the name, a reloading unit is accepted here too. */
static inline bool UNIT_IS_ACTIVE_OR_ACTIVATING(UnitActiveState t) {
        return IN_SET(t, UNIT_ACTIVE, UNIT_ACTIVATING, UNIT_RELOADING);
}
/* Returns true if the active state t is UNIT_INACTIVE, UNIT_FAILED or UNIT_DEACTIVATING, false for every other
 * state. Note that, despite the name, a failed unit is accepted here too. */
static inline bool UNIT_IS_INACTIVE_OR_DEACTIVATING(UnitActiveState t) {
        return t == UNIT_INACTIVE ||
               t == UNIT_FAILED ||
               t == UNIT_DEACTIVATING;
}
/* Returns true if the active state t is UNIT_INACTIVE or UNIT_FAILED, false for every other state. */
static inline bool UNIT_IS_INACTIVE_OR_FAILED(UnitActiveState t) {
        return IN_SET(t, UNIT_INACTIVE, UNIT_FAILED);
}
/* Bit mask recording the 'reason' a dependency exists, i.e. which configuration source created it. This allows us to
 * selectively flush out the dependencies that stem from one particular source again. A single dependency may have
 * been created for several "reasons" at once, hence a bit mask rather than a plain enumeration. */
typedef enum UnitDependencyMask {
        /* Configured directly by the unit file, a .wants/.requires symlink or a drop-in, or as an immediate result
         * of a non-dependency option configured that way. */
        UNIT_DEPENDENCY_FILE               = 1 << 0,

        /* An unconditional implicit dependency (not affected by unit configuration — except by the unit name and
         * type). */
        UNIT_DEPENDENCY_IMPLICIT           = 1 << 1,

        /* A dependency effected by DefaultDependencies=yes. Conceptually dependencies marked this way are just a
         * subset of UNIT_DEPENDENCY_FILE, since DefaultDependencies= is itself a setting that can only be set in
         * unit files. The separate bit exists only to help debugging how dependencies came to be. */
        UNIT_DEPENDENCY_DEFAULT            = 1 << 2,

        /* A dependency created from udev rules. */
        UNIT_DEPENDENCY_UDEV               = 1 << 3,

        /* A dependency created because of some unit's RequiresMountsFor= setting. */
        UNIT_DEPENDENCY_PATH               = 1 << 4,

        /* A dependency created because of data read from /proc/self/mountinfo and no other configuration source. */
        UNIT_DEPENDENCY_MOUNTINFO_IMPLICIT = 1 << 5,

        /* A dependency created because of data read from /proc/self/mountinfo, but conditionalized by
         * DefaultDependencies= and thus also involving configuration from UNIT_DEPENDENCY_FILE sources. */
        UNIT_DEPENDENCY_MOUNTINFO_DEFAULT  = 1 << 6,

        /* A dependency created because of data read from /proc/swaps and no other configuration source. */
        UNIT_DEPENDENCY_PROC_SWAP          = 1 << 7,

        /* All eight bits above set */
        _UNIT_DEPENDENCY_MASK_FULL         = (1 << 8) - 1,
} UnitDependencyMask;
/* The Unit's dependencies[] hashmaps use this structure as value. It has the same size as a void pointer, and thus can
* be stored directly as hashmap value, without any indirection. Note that this stores two masks, as both the origin
* and the destination of a dependency might have created it. */
typedef union UnitDependencyInfo {
/* The pointer-sized view of the value, i.e. the form in which it is stored as hashmap value */
void *data;
/* The structured view: two 16 bit wide UnitDependencyMask bit-fields (32 bits in total), one recording the reasons
* contributed by the origin of the dependency, the other those contributed by its destination. The struct is packed;
* NOTE(review): the "same size as a void pointer" statement above relies on pointers being at least 32 bits wide. */
struct {
UnitDependencyMask origin_mask:16;
UnitDependencyMask destination_mask:16;
} _packed_;
} UnitDependencyInfo;
2010-01-29 06:04:08 +01:00
#include "job.h"
struct UnitRef {
/* Keeps track of references to a unit. This is useful so
* that we can merge two units if necessary and correct all
* references to them */
/* The two ends of the reference. NOTE(review): presumably 'source' is the unit holding the reference and 'target'
* the unit being referenced — confirm against the users of this struct. */
Unit *source, *target;
/* List linkage; Unit carries the matching LIST_HEAD(UnitRef, refs_by_target) collecting the references to it */
LIST_FIELDS(UnitRef, refs_by_target);
};
typedef struct Unit {
2010-01-26 21:39:06 +01:00
Manager *manager;
2010-04-06 02:43:58 +02:00
2010-01-26 21:39:06 +01:00
UnitType type;
UnitLoadState load_state;
2010-04-06 02:43:58 +02:00
Unit *merged_into;
2010-01-26 21:39:06 +01:00
char *id; /* One name is special because we use it for identification. Points to an entry in the names set */
2010-04-15 03:11:11 +02:00
char *instance;
2010-01-26 21:39:06 +01:00
Set *names;
/* For each dependency type we maintain a Hashmap whose key is the Unit* object, and the value encodes why the
* dependency exists, using the UnitDependencyInfo type */
Hashmap *dependencies[_UNIT_DEPENDENCY_MAX];
/* Similar, for RequiresMountsFor= path dependencies. The key is the path, the value the UnitDependencyInfo type */
Hashmap *requires_mounts_for;
2010-01-26 21:39:06 +01:00
char *description;
char **documentation;
char *fragment_path; /* if loaded from a config file this is the primary path to it */
char *source_path; /* if converted, the source file */
char **dropin_paths;
usec_t fragment_mtime;
usec_t source_mtime;
usec_t dropin_mtime;
2010-01-26 21:39:06 +01:00
/* If this is a transient unit we are currently writing, this is where we are writing it to */
FILE *transient_file;
core: add NOP jobs, job type collapsing Two of our current job types are special: JOB_TRY_RESTART, JOB_RELOAD_OR_START. They differ from other job types by being sensitive to the unit active state. They perform some action when the unit is active and some other action otherwise. This raises a question: when exactly should the unit state be checked to make the decision? Currently the unit state is checked when the job becomes runnable. It's more sensible to check the state immediately when the job is added by the user. When the user types "systemctl try-restart foo.service", he really intends to restart the service if it's running right now. If it isn't running right now, the restart is pointless. Consider the example (from Bugzilla[1]): sleep.service takes some time to start. hello.service has After=sleep.service. Both services get started. Two jobs will appear: hello.service/start waiting sleep.service/start running Then someone runs "systemctl try-restart hello.service". Currently the try-restart operation will block and wait for sleep.service/start to complete. The correct result is to complete the try-restart operation immediately with success, because hello.service is not running. The two original jobs must not be disturbed by this. To fix this we introduce two new concepts: - a new job type: JOB_NOP A JOB_NOP job does not do anything to the unit. It does not pull in any dependencies. It is always immediately runnable. When installed to a unit, it sits in a special slot (u->nop_job) where it never conflicts with the installed job (u->job) of a different type. It never merges with jobs of other types, but it can merge into an already installed JOB_NOP job. 
- "collapsing" of job types When a job of one of the two special types is added, the state of the unit is checked immediately and the job type changes: JOB_TRY_RESTART -> JOB_RESTART or JOB_NOP JOB_RELOAD_OR_START -> JOB_RELOAD or JOB_START Should a job type JOB_RELOAD_OR_START appear later during job merging, it collapses immediately afterwards. Collapsing actually makes some things simpler, because there are now fewer job types that are allowed in the transaction. [1] Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=753586
2012-04-25 11:58:27 +02:00
/* If there is something to do with this unit, then this is the installed job for it */
2010-01-26 21:39:06 +01:00
Job *job;
core: add NOP jobs, job type collapsing Two of our current job types are special: JOB_TRY_RESTART, JOB_RELOAD_OR_START. They differ from other job types by being sensitive to the unit active state. They perform some action when the unit is active and some other action otherwise. This raises a question: when exactly should the unit state be checked to make the decision? Currently the unit state is checked when the job becomes runnable. It's more sensible to check the state immediately when the job is added by the user. When the user types "systemctl try-restart foo.service", he really intends to restart the service if it's running right now. If it isn't running right now, the restart is pointless. Consider the example (from Bugzilla[1]): sleep.service takes some time to start. hello.service has After=sleep.service. Both services get started. Two jobs will appear: hello.service/start waiting sleep.service/start running Then someone runs "systemctl try-restart hello.service". Currently the try-restart operation will block and wait for sleep.service/start to complete. The correct result is to complete the try-restart operation immediately with success, because hello.service is not running. The two original jobs must not be disturbed by this. To fix this we introduce two new concepts: - a new job type: JOB_NOP A JOB_NOP job does not do anything to the unit. It does not pull in any dependencies. It is always immediately runnable. When installed to a unit, it sits in a special slot (u->nop_job) where it never conflicts with the installed job (u->job) of a different type. It never merges with jobs of other types, but it can merge into an already installed JOB_NOP job. 
- "collapsing" of job types When a job of one of the two special types is added, the state of the unit is checked immediately and the job type changes: JOB_TRY_RESTART -> JOB_RESTART or JOB_NOP JOB_RELOAD_OR_START -> JOB_RELOAD or JOB_START Should a job type JOB_RELOAD_OR_START appear later during job merging, it collapses immediately afterwards. Collapsing actually makes some things simpler, because there are now fewer job types that are allowed in the transaction. [1] Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=753586
2012-04-25 11:58:27 +02:00
/* JOB_NOP jobs are special and can be installed without disturbing the real job. */
Job *nop_job;
/* The slot used for watching NameOwnerChanged signals */
sd_bus_slot *match_bus_slot;
/* References to this unit from clients */
sd_bus_track *bus_track;
char **deserialized_refs;
/* Job timeout and action to take */
usec_t job_timeout;
usec_t job_running_timeout;
bool job_running_timeout_set:1;
EmergencyAction job_timeout_action;
char *job_timeout_reboot_arg;
/* References to this */
LIST_HEAD(UnitRef, refs_by_target);
/* Conditions to check */
LIST_HEAD(Condition, conditions);
LIST_HEAD(Condition, asserts);
dual_timestamp condition_timestamp;
dual_timestamp assert_timestamp;
/* Updated whenever the low-level state changes */
dual_timestamp state_change_timestamp;
/* Updated whenever the (high-level) active state enters or leaves the active or inactive states */
dual_timestamp inactive_exit_timestamp;
dual_timestamp active_enter_timestamp;
dual_timestamp active_exit_timestamp;
dual_timestamp inactive_enter_timestamp;
2010-01-26 21:39:06 +01:00
UnitRef slice;
2010-01-29 06:04:08 +01:00
/* Per type list */
LIST_FIELDS(Unit, units_by_type);
/* Load queue */
LIST_FIELDS(Unit, load_queue);
/* D-Bus queue */
LIST_FIELDS(Unit, dbus_queue);
2010-04-06 02:43:58 +02:00
/* Cleanup queue */
LIST_FIELDS(Unit, cleanup_queue);
/* GC queue */
LIST_FIELDS(Unit, gc_queue);
/* CGroup realize members queue */
LIST_FIELDS(Unit, cgroup_realize_queue);
/* cgroup empty queue */
LIST_FIELDS(Unit, cgroup_empty_queue);
/* Target dependencies queue */
LIST_FIELDS(Unit, target_deps_queue);
/* Queue of units with StopWhenUnneeded set that shell be checked for clean-up. */
LIST_FIELDS(Unit, stop_when_unneeded_queue);
/* PIDs we keep an eye on. Note that a unit might have many
* more, but these are the ones we care enough about to
* process SIGCHLD for */
Set *pids;
core: rework how we track which PIDs to watch for a unit Previously, we'd maintain two hashmaps keyed by PIDs, pointing to Unit interested in SIGCHLD events for them. This scheme allowed a specific PID to be watched by exactly 0, 1 or 2 units. With this rework this is replaced by a single hashmap which is primarily keyed by the PID and points to a Unit interested in it. However, it optionally also keyed by the negated PID, in which case it points to a NULL terminated array of additional Unit objects also interested. This scheme means arbitrary numbers of Units may now watch the same PID. Runtime and memory behaviour should not be impact by this change, as for the common case (i.e. each PID only watched by a single unit) behaviour stays the same, but for the uncommon case (a PID watched by more than one unit) we only pay with a single additional memory allocation for the array. Why this all? Primarily, because allowing exactly two units to watch a specific PID is not sufficient for some niche cases, as processes can belong to more than one unit these days: 1. sd_notify() with MAINPID= can be used to attach a process from a different cgroup to multiple units. 2. Similar, the PIDFile= setting in unit files can be used for similar setups, 3. By creating a scope unit a main process of a service may join a different unit, too. 4. On cgroupsv1 we frequently end up watching all processes remaining in a scope, and if a process opens lots of scopes one after the other it might thus end up being watch by many of them. This patch hence removes the 2-unit-per-PID limit. It also makes a couple of other changes, some of them quite relevant: - manager_get_unit_by_pid() (and the bus call wrapping it) when there's ambiguity will prefer returning the Unit the process belongs to based on cgroup membership, and only check the watch-pids hashmap if that fails. 
This change in logic is probably more in line with what people expect and makes things more stable as each process can belong to exactly one cgroup only. - Every SIGCHLD event is now dispatched to all units interested in its PID. Previously, there was some magic conditionalization: the SIGCHLD would only be dispatched to the unit if it was only interested in a single PID only, or the PID belonged to the control or main PID or we didn't dispatch a signle SIGCHLD to the unit in the current event loop iteration yet. These rules were quite arbitrary and also redundant as the the per-unit handlers would filter the PIDs anyway a second time. With this change we'll hence relax the rules: all we do now is dispatch every SIGCHLD event exactly once to each unit interested in it, and it's up to the unit to then use or ignore this. We use a generation counter in the unit to ensure that we only invoke the unit handler once for each event, protecting us from confusion if a unit is both associated with a specific PID through cgroup membership and through the "watch_pids" logic. It also protects us from being confused if the "watch_pids" hashmap is altered while we are dispatching to it (which is a very likely case). - sd_notify() message dispatching has been reworked to be very similar to SIGCHLD handling now. A generation counter is used for dispatching as well. This also adds a new test that validates that "watch_pid" registration and unregstration works correctly.
2018-01-12 13:41:05 +01:00
/* Used in SIGCHLD and sd_notify() message event invocation logic to avoid that we dispatch the same event
* multiple times on the same unit. */
unsigned sigchldgen;
unsigned notifygen;
/* Used during GC sweeps */
2010-04-23 18:47:49 +02:00
unsigned gc_marker;
/* Error code when we didn't manage to load the unit (negative) */
int load_error;
/* Put a ratelimit on unit starting */
RateLimit start_limit;
EmergencyAction start_limit_action;
/* What to do on failure or success */
EmergencyAction success_action, failure_action;
int success_action_exit_status, failure_action_exit_status;
char *reboot_arg;
/* Make sure we never enter endless loops with the check unneeded logic, or the BindsTo= logic */
RateLimit auto_stop_ratelimit;
/* Reference to a specific UID/GID */
uid_t ref_uid;
gid_t ref_gid;
/* Cached unit file state and preset */
2011-07-31 18:28:02 +02:00
UnitFileState unit_file_state;
int unit_file_preset;
2011-07-31 18:28:02 +02:00
/* Where the cpu.stat or cpuacct.usage was at the time the unit was started */
nsec_t cpu_usage_base;
nsec_t cpu_usage_last; /* the most recently read value */
/* Counterparts in the cgroup filesystem */
char *cgroup_path;
CGroupMask cgroup_realized_mask; /* In which hierarchies does this unit's cgroup exist? (only relevant on cgroup v1) */
CGroupMask cgroup_enabled_mask; /* Which controllers are enabled (or more correctly: enabled for the children) for this unit's cgroup? (only relevant on cgroup v2) */
CGroupMask cgroup_invalidated_mask; /* A mask specifiying controllers which shall be considered invalidated, and require re-realization */
CGroupMask cgroup_members_mask; /* A cache for the controllers required by all children of this cgroup (only relevant for slice units) */
core: unified cgroup hierarchy support This patch set adds full support the new unified cgroup hierarchy logic of modern kernels. A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is added. If specified the unified hierarchy is mounted to /sys/fs/cgroup instead of a tmpfs. No further hierarchies are mounted. The kernel command line option defaults to off. We can turn it on by default as soon as the kernel's APIs regarding this are stabilized (but even then downstream distros might want to turn this off, as this will break any tools that access cgroupfs directly). It is possibly to choose for each boot individually whether the unified or the legacy hierarchy is used. nspawn will by default provide the legacy hierarchy to containers if the host is using it, and the unified otherwise. However it is possible to run containers with the unified hierarchy on a legacy host and vice versa, by setting the $UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0, respectively. The unified hierarchy provides reliable cgroup empty notifications for the first time, via inotify. To make use of this we maintain one manager-wide inotify fd, and each cgroup to it. This patch also removes cg_delete() which is unused now. On kernel 4.2 only the "memory" controller is compatible with the unified hierarchy, hence that's the only controller systemd exposes when booted in unified heirarchy mode. This introduces a new enum for enumerating supported controllers, plus a related enum for the mask bits mapping to it. The core is changed to make use of this everywhere. This moves PID 1 into a new "init.scope" implicit scope unit in the root slice. This is necessary since on the unified hierarchy cgroups may either contain subgroups or processes but not both. 
PID 1 hence has to move out of the root cgroup (strictly speaking the root cgroup is the only one where processes and subgroups are still allowed, but in order to support containers nicey, we move PID 1 into the new scope in all cases.) This new unit is also used on legacy hierarchy setups. It's actually pretty useful on all systems, as it can then be used to filter journal messages coming from PID 1, and so on. The root slice ("-.slice") is now implicitly created and started (and does not require a unit file on disk anymore), since that's where "init.scope" is located and the slice needs to be started before the scope can. To check whether we are in unified or legacy hierarchy mode we use statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in legacy mode, if it reports cgroupfs we are in unified mode. This patch set carefuly makes sure that cgls and cgtop continue to work as desired. When invoking nspawn as a service it will implicitly create two subcgroups in the cgroup it is using, one to move the nspawn process into, the other to move the actual container processes into. This is done because of the requirement that cgroups may either contain processes or other subgroups.
2015-09-01 19:22:36 +02:00
int cgroup_inotify_wd;
/* Device Controller BPF program */
BPFProgram *bpf_device_control_installed;
/* IP BPF Firewalling/accounting */
int ip_accounting_ingress_map_fd;
int ip_accounting_egress_map_fd;
int ipv4_allow_map_fd;
int ipv6_allow_map_fd;
int ipv4_deny_map_fd;
int ipv6_deny_map_fd;
bpf: rework how we keep track and attach cgroup bpf programs So, the kernel's management of cgroup/BPF programs is a bit misdesigned: if you attach a BPF program to a cgroup and close the fd for it it will stay pinned to the cgroup with no chance of ever removing it again (or otherwise getting ahold of it again), because the fd is used for selecting which BPF program to detach. The only way to get rid of the program again is to destroy the cgroup itself. This is particularly bad for root the cgroup (and in fact any other cgroup that we cannot realistically remove during runtime, such as /system.slice, /init.scope or /system.slice/dbus.service) as getting rid of the program only works by rebooting the system. To counter this let's closely keep track to which cgroup a BPF program is attached and let's implicitly detach the BPF program when we are about to close the BPF fd. This hence changes the bpf_program_cgroup_attach() function to track where we attached the program and changes bpf_program_cgroup_detach() to use this information. Moreover bpf_program_unref() will now implicitly call bpf_program_cgroup_detach(). In order to simplify things, bpf_program_cgroup_attach() will now implicitly invoke bpf_program_load_kernel() when necessary, simplifying the caller's side. Finally, this adds proper reference counting to BPF programs. This is useful for working with two BPF programs in parallel: the BPF program we are preparing for installation and the BPF program we so far installed, shortening the window when we detach the old one and reattach the new one.
2018-02-20 19:28:24 +01:00
BPFProgram *ip_bpf_ingress, *ip_bpf_ingress_installed;
BPFProgram *ip_bpf_egress, *ip_bpf_egress_installed;
uint64_t ip_accounting_extra[_CGROUP_IP_ACCOUNTING_METRIC_MAX];
core: rework how we track service and scope PIDs This reworks how systemd tracks processes on cgroupv1 systems where cgroup notification is not reliable. Previously, whenever we had reason to believe that new processes showed up or got removed we'd scan the cgroup of the scope or service unit for new processes, and would tidy up the list of PIDs previously watched. This scanning is relatively slow, and does not scale well. With this change behaviour is changed: instead of scanning for new/removed processes right away we do this work in a per-unit deferred event loop job. This event source is scheduled at a very low priority, so that it is executed when we have time but does not starve other event sources. This has two benefits: this expensive work is coalesced, if events happen in quick succession, and we won't delay SIGCHLD handling for too long. This patch basically replaces all direct invocation of unit_watch_all_pids() in scope.c and service.c with invocations of the new unit_enqueue_rewatch_pids() call which just enqueues a request of watching/tidying up the PID sets (with one exception: in scope_enter_signal() and service_enter_signal() we'll still do unit_watch_all_pids() synchronously first, since we really want to know all processes we are about to kill so that we can track them properly. Moreover, all direct invocations of unit_tidy_watch_pids() and unit_synthesize_cgroup_empty_event() are removed too, when the unit_enqueue_rewatch_pids() call is invoked, as the queued job will run those operations too. All of this is done on cgroupsv1 systems only, and is disabled on cgroupsv2 systems as cgroup-empty notifications are reliable there, and we do not need SIGCHLD events to track processes there. Fixes: #9138
2018-05-31 15:41:59 +02:00
/* Low-priority event source which is used to remove watched PIDs that have gone away, and subscribe to any new
* ones which might have appeared. */
sd_event_source *rewatch_pids_event_source;
/* How to start OnFailure units */
JobMode on_failure_job_mode;
/* Tweaking the GC logic */
CollectMode collect_mode;
core: add "invocation ID" concept to service manager This adds a new invocation ID concept to the service manager. The invocation ID identifies each runtime cycle of a unit uniquely. A new randomized 128bit ID is generated each time a unit moves from and inactive to an activating or active state. The primary usecase for this concept is to connect the runtime data PID 1 maintains about a service with the offline data the journal stores about it. Previously we'd use the unit name plus start/stop times, which however is highly racy since the journal will generally process log data after the service already ended. The "invocation ID" kinda matches the "boot ID" concept of the Linux kernel, except that it applies to an individual unit instead of the whole system. The invocation ID is passed to the activated processes as environment variable. It is additionally stored as extended attribute on the cgroup of the unit. The latter is used by journald to automatically retrieve it for each log logged message and attach it to the log entry. The environment variable is very easily accessible, even for unprivileged services. OTOH the extended attribute is only accessible to privileged processes (this is because cgroupfs only supports the "trusted." xattr namespace, not "user."). The environment variable may be altered by services, the extended attribute may not be, hence is the better choice for the journal. Note that reading the invocation ID off the extended attribute from journald is racy, similar to the way reading the unit name for a logging process is. This patch adds APIs to read the invocation ID to sd-id128: sd_id128_get_invocation() may be used in a similar fashion to sd_id128_get_boot(). PID1's own logging is updated to always include the invocation ID when it logs information about a unit. A new bus call GetUnitByInvocationID() is added that allows retrieving a bus path to a unit by its invocation ID. 
The bus path is built using the invocation ID, thus providing a path for referring to a unit that is valid only for the current runtime cycleof it. Outlook for the future: should the kernel eventually allow passing of cgroup information along AF_UNIX/SOCK_DGRAM messages via a unique cgroup id, then we can alter the invocation ID to be generated as hash from that rather than entirely randomly. This way we can derive the invocation race-freely from the messages.
2016-08-30 23:18:46 +02:00
/* The current invocation ID */
sd_id128_t invocation_id;
char invocation_id_string[SD_ID128_STRING_MAX]; /* useful when logging */
/* Garbage collect us we nobody wants or requires us anymore */
bool stop_when_unneeded;
/* Create default dependencies */
bool default_dependencies;
/* Refuse manual starting, allow starting only indirectly via dependency. */
bool refuse_manual_start;
/* Don't allow the user to stop this unit manually, allow stopping only indirectly via dependency. */
bool refuse_manual_stop;
2010-08-30 22:45:46 +02:00
/* Allow isolation requests */
bool allow_isolate;
/* Ignore this unit when isolating */
bool ignore_on_isolate;
2012-07-15 10:41:40 +02:00
/* Did the last condition check succeed? */
bool condition_result;
bool assert_result;
/* Is this a transient unit? */
bool transient;
/* Is this a unit that is always running and cannot be stopped? */
bool perpetual;
2018-11-22 22:25:27 +01:00
/* Booleans indicating membership of this unit in the various queues */
bool in_load_queue:1;
bool in_dbus_queue:1;
bool in_cleanup_queue:1;
bool in_gc_queue:1;
bool in_cgroup_realize_queue:1;
bool in_cgroup_empty_queue:1;
bool in_target_deps_queue:1;
bool in_stop_when_unneeded_queue:1;
bool sent_dbus_new_signal:1;
bool in_audit:1;
bool on_console:1;
bool cgroup_realized:1;
bool cgroup_members_mask_valid:1;
/* Reset cgroup accounting next time we fork something off */
bool reset_accounting:1;
bool start_limit_hit:1;
/* Did we already invoke unit_coldplug() for this unit? */
2015-05-19 16:03:01 +02:00
bool coldplugged:1;
/* For transient units: whether to add a bus track reference after creating the unit */
bool bus_track_add:1;
core: implement /run/systemd/units/-based path for passing unit info from PID 1 to journald And let's make use of it to implement two new unit settings with it: 1. LogLevelMax= is a new per-unit setting that may be used to configure log priority filtering: set it to LogLevelMax=notice and only messages of level "notice" and lower (i.e. more important) will be processed, all others are dropped. 2. LogExtraFields= is a new per-unit setting for configuring per-unit journal fields, that are implicitly included in every log record generated by the unit's processes. It takes field/value pairs in the form of FOO=BAR. Also, related to this, one exisiting unit setting is ported to this new facility: 3. The invocation ID is now pulled from /run/systemd/units/ instead of cgroupfs xattrs. This substantially relaxes requirements of systemd on the kernel version and the privileges it runs with (specifically, cgroupfs xattrs are not available in containers, since they are stored in kernel memory, and hence are unsafe to permit to lesser privileged code). /run/systemd/units/ is a new directory, which contains a number of files and symlinks encoding the above information. PID 1 creates and manages these files, and journald reads them from there. Note that this is supposed to be a direct path between PID 1 and the journal only, due to the special runtime environment the journal runs in. Normally, today we shouldn't introduce new interfaces that (mis-)use a file system as IPC framework, and instead just an IPC system, but this is very hard to do between the journal and PID 1, as long as the IPC system is a subject PID 1 manages, and itself a client to the journal. This patch cleans up a couple of types used in journal code: specifically we switch to size_t for a couple of memory-sizing values, as size_t is the right choice for everything that is memory. Fixes: #4089 Fixes: #3041 Fixes: #4441
2017-11-02 19:43:32 +01:00
/* Remember which unit state files we created */
bool exported_invocation_id:1;
bool exported_log_level_max:1;
bool exported_log_extra_fields:1;
bool exported_log_rate_limit_interval:1;
bool exported_log_rate_limit_burst:1;
/* When writing transient unit files, stores which section we stored last. If < 0, we didn't write any yet. If
* == 0 we are in the [Unit] section, if > 0 we are in the unit type-specific section. */
signed int last_section_private:2;
} Unit;
2010-01-26 21:39:06 +01:00
/* A set of string tables for unit status messages. NOTE(review): judging by the type name these are format strings
* for status output — confirm how they are consumed by the callers. */
typedef struct UnitStatusMessageFormats {
/* Two entries; NOTE(review): presumably one for starting and one for stopping, in that order — confirm */
const char *starting_stopping[2];
/* One entry per job result (the arrays are sized by _JOB_RESULT_MAX), for finished start and stop jobs respectively */
const char *finished_start_job[_JOB_RESULT_MAX];
const char *finished_stop_job[_JOB_RESULT_MAX];
} UnitStatusMessageFormats;
/* Bit flags controlling how drop-in files or transient unit files are written. Each flag occupies its own bit, so
 * that they may be OR-ed together freely. */
typedef enum UnitWriteFlags {
        /* Write a runtime unit file or drop-in (i.e. one below /run) */
        UNIT_RUNTIME           = 1 << 0,

        /* Write a persistent drop-in (i.e. one below /etc) */
        UNIT_PERSISTENT        = 1 << 1,

        /* Place this item in the per-unit-type private section, instead of [Unit] */
        UNIT_PRIVATE           = 1 << 2,

        /* Apply specifier escaping before writing */
        UNIT_ESCAPE_SPECIFIERS = 1 << 3,

        /* Apply C escaping before writing */
        UNIT_ESCAPE_C          = 1 << 4,
} UnitWriteFlags;
/* Tells whether the flags request no storage at all: if neither UNIT_RUNTIME nor UNIT_PERSISTENT is set this is a
 * check invocation only, and true is returned. All other flag bits are ignored. */
static inline bool UNIT_WRITE_FLAGS_NOOP(UnitWriteFlags flags) {
        const bool wants_storage = (flags & UNIT_RUNTIME) != 0 || (flags & UNIT_PERSISTENT) != 0;

        return !wants_storage;
}
#include "kill.h"
typedef struct UnitVTable {
/* How much memory does an object of this unit type need */
size_t object_size;
/* If greater than 0, the offset into the object where
* ExecContext is found, if the unit type has that */
size_t exec_context_offset;
/* If greater than 0, the offset into the object where
* CGroupContext is found, if the unit type has that */
size_t cgroup_context_offset;
/* If greater than 0, the offset into the object where
* KillContext is found, if the unit type has that */
size_t kill_context_offset;
/* If greater than 0, the offset into the object where the
* pointer to ExecRuntime is found, if the unit type has
* that */
size_t exec_runtime_offset;
/* If greater than 0, the offset into the object where the pointer to DynamicCreds is found, if the unit type
* has that. */
size_t dynamic_creds_offset;
/* The name of the configuration file section with the private settings of this unit */
const char *private_section;
/* Config file sections this unit type understands, separated
* by NUL chars */
const char *sections;
/* This should reset all type-specific variables. This should
2010-04-21 03:27:44 +02:00
* not allocate memory, and is called with zero-initialized
* data. It should hence only initialize variables that need
* to be set != 0. */
void (*init)(Unit *u);
2010-04-21 03:27:44 +02:00
/* This should free all type-specific variables. It should be
* idempotent. */
void (*done)(Unit *u);
/* Actually load data from disk. This may fail, and should set
* load_state to UNIT_LOADED, UNIT_MERGED or leave it at
* UNIT_STUB if no configuration could be found. */
int (*load)(Unit *u);
/* During deserialization we only record the intended state to return to. With coldplug() we actually put the
* deserialized state in effect. This is where unit_notify() should be called to start things up. Note that
* this callback is invoked *before* we leave the reloading state of the manager, i.e. *before* we consider the
* reloading to be complete. Thus, this callback should just restore the exact same state for any unit that was
* in effect before the reload, i.e. units should not catch up with changes happened during the reload. That's
* what catchup() below is for. */
int (*coldplug)(Unit *u);
2010-01-26 21:39:06 +01:00
/* This is called shortly after all units' coldplug() call was invoked, and *after* the manager left the
* reloading state. It's supposed to catch up with state changes due to external events we missed so far (for
* example because they took place while we were reloading/reexecing) */
void (*catchup)(Unit *u);
2010-01-26 21:39:06 +01:00
void (*dump)(Unit *u, FILE *f, const char *prefix);
int (*start)(Unit *u);
int (*stop)(Unit *u);
int (*reload)(Unit *u);
int (*kill)(Unit *u, KillWho w, int signo, sd_bus_error *error);
2010-10-22 16:11:50 +02:00
2010-01-26 21:39:06 +01:00
bool (*can_reload)(Unit *u);
2010-04-21 03:27:44 +02:00
/* Write all data that cannot be restored from other sources
* away using unit_serialize_item() */
int (*serialize)(Unit *u, FILE *f, FDSet *fds);
/* Restore one item from the serialization */
int (*deserialize_item)(Unit *u, const char *key, const char *data, FDSet *fds);
/* Try to match up fds with what we need for this unit */
void (*distribute_fds)(Unit *u, FDSet *fds);
2010-01-26 21:39:06 +01:00
/* Boils down the more complex internal state of this unit to
* a simpler one that the engine can understand */
UnitActiveState (*active_state)(Unit *u);
/* Returns the substate specific to this unit type as
* string. This is purely information so that we can give the
* user a more fine grained explanation in which actual state a
* unit is in. */
const char* (*sub_state_to_string)(Unit *u);
/* Additionally to UnitActiveState determine whether unit is to be restarted. */
bool (*will_restart)(Unit *u);
/* Return false when there is a reason to prevent this unit from being gc'ed
* even though nothing references it and it isn't active in any way. */
bool (*may_gc)(Unit *u);
/* When the unit is not running and no job for it queued we shall release its runtime resources */
void (*release_resources)(Unit *u);
/* Invoked on every child that died */
2010-01-26 21:39:06 +01:00
void (*sigchld_event)(Unit *u, pid_t pid, int code, int status);
/* Reset failed state if we are in failed state */
void (*reset_failed)(Unit *u);
/* Called whenever any of the cgroups this unit watches for
* ran empty */
void (*notify_cgroup_empty)(Unit *u);
2010-03-31 16:29:55 +02:00
/* Called whenever a process of this unit sends us a message */
void (*notify_message)(Unit *u, const struct ucred *ucred, char **tags, FDSet *fds);
/* Called whenever a name this Unit registered for comes or goes away. */
void (*bus_name_owner_change)(Unit *u, const char *name, const char *old_owner, const char *new_owner);
/* Called for each property that is being set */
int (*bus_set_property)(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
/* Called after at least one property got changed to apply the necessary change */
int (*bus_commit_properties)(Unit *u);
/* Return the unit this unit is following */
Unit *(*following)(Unit *u);
/* Return the set of units that are following each other */
int (*following_set)(Unit *u, Set **s);
/* Invoked each time a unit this unit is triggering changes
* state or gains/loses a job */
void (*trigger_notify)(Unit *u, Unit *trigger);
/* Called whenever CLOCK_REALTIME made a jump */
void (*time_change)(Unit *u);
/* Called whenever /etc/localtime was modified */
void (*timezone_change)(Unit *u);
/* Returns the next timeout of a unit */
int (*get_timeout)(Unit *u, usec_t *timeout);
/* Returns the main PID if there is any defined, or 0. */
pid_t (*main_pid)(Unit *u);
/* Returns the main PID if there is any defined, or 0. */
pid_t (*control_pid)(Unit *u);
/* Returns true if the unit currently needs access to the console */
bool (*needs_console)(Unit *u);
/* Returns the exit status to propagate in case of FailureAction=exit/SuccessAction=exit; usually returns the
* exit code of the "main" process of the service or similar. */
int (*exit_status)(Unit *u);
/* Like the enumerate() callback further down, but only enumerates the perpetual units, i.e. all units that
* unconditionally exist and are always active. The main reason to keep both enumeration functions separate is
* philosophical: the state of perpetual units should be put in place by coldplug(), while the state of those
* discovered through regular enumeration should be put in place by catchup(), see below. */
void (*enumerate_perpetual)(Manager *m);
/* This is called for each unit type and should be used to enumerate units already existing in the system
* internally and load them. However, everything that is loaded here should still stay in inactive state. It is
* the job of the catchup() call above to put the units into the discovered state. */
void (*enumerate)(Manager *m);
2010-01-29 03:18:09 +01:00
/* Type specific cleanups. */
void (*shutdown)(Manager *m);
/* If this function is set and return false all jobs for units
* of this type will immediately fail. */
bool (*supported)(void);
/* The bus vtable */
const sd_bus_vtable *bus_vtable;
/* The strings to print in status messages */
UnitStatusMessageFormats status_message_formats;
/* True if transient units of this type are OK */
bool can_transient:1;
/* True if cgroup delegation is permissible */
bool can_delegate:1;
/* True if units of this type shall be startable only once and then never again */
bool once_only:1;
/* True if queued jobs of this type should be GC'ed if no other job needs them anymore */
bool gc_jobs:1;
} UnitVTable;
/* One vtable pointer per unit type; the array has _UNIT_TYPE_MAX entries */
extern const UnitVTable * const unit_vtable[_UNIT_TYPE_MAX];

/* Looks up the vtable that belongs to the type of unit u, using u->type as index into unit_vtable[].
 * u is dereferenced unconditionally and hence must not be NULL. */
static inline const UnitVTable* UNIT_VTABLE(Unit *u) {
        const UnitVTable *vtable;

        vtable = unit_vtable[u->type];
        return vtable;
}
/* For casting a unit into the various unit types. DEFINE_CAST(FOO, Foo) expands to a static inline function
 * FOO(Unit *u) which returns u cast to Foo*, or NULL if u is NULL or if u->type is not UNIT_FOO. */
#define DEFINE_CAST(UPPERCASE, MixedCase)                               \
        static inline MixedCase* UPPERCASE(Unit *u) {                   \
                if (_unlikely_(!u || u->type != UNIT_##UPPERCASE))      \
                        return NULL;                                    \
                                                                        \
                return (MixedCase*) u;                                  \
        }
/* For casting the various unit types into a unit. The argument is evaluated exactly once (it is copied into
 * the local _u_ first); the macro yields a pointer to the 'meta' member of the object, or NULL if the
 * argument is NULL. Assigning to the Unit* local _w_ makes the result type Unit*. Relies on the GNU C
 * statement expression and typeof extensions. */
#define UNIT(u) \
({ \
typeof(u) _u_ = (u); \
Unit *_w_ = _u_ ? &(_u_)->meta : NULL; \
_w_; \
})
/* Tell whether the type of unit u carries an ExecContext, CGroupContext or KillContext respectively, by
 * checking whether the matching offset field of the unit type's vtable is greater than 0. u is handed to
 * UNIT_VTABLE(), which dereferences it, hence u must not be NULL. */
#define UNIT_HAS_EXEC_CONTEXT(u) (UNIT_VTABLE(u)->exec_context_offset > 0)
#define UNIT_HAS_CGROUP_CONTEXT(u) (UNIT_VTABLE(u)->cgroup_context_offset > 0)
#define UNIT_HAS_KILL_CONTEXT(u) (UNIT_VTABLE(u)->kill_context_offset > 0)
/* Returns whatever hashmap_first_key() reports as first key of the UNIT_TRIGGERS dependency hashmap of
 * unit u. u is dereferenced unconditionally and hence must not be NULL. */
static inline Unit* UNIT_TRIGGER(Unit *u) {
        Unit *first;

        first = hashmap_first_key(u->dependencies[UNIT_TRIGGERS]);
        return first;
}
Unit *unit_new(Manager *m, size_t size);
2010-01-26 21:39:06 +01:00
void unit_free(Unit *u);
2018-03-09 21:34:28 +01:00
DEFINE_TRIVIAL_CLEANUP_FUNC(Unit *, unit_free);
2010-01-26 21:39:06 +01:00
int unit_new_for_name(Manager *m, size_t size, const char *name, Unit **ret);
2010-01-26 21:39:06 +01:00
int unit_add_name(Unit *u, const char *name);
2010-04-15 03:11:11 +02:00
int unit_add_dependency(Unit *u, UnitDependency d, Unit *other, bool add_reference, UnitDependencyMask mask);
int unit_add_two_dependencies(Unit *u, UnitDependency d, UnitDependency e, Unit *other, bool add_reference, UnitDependencyMask mask);
int unit_add_dependency_by_name(Unit *u, UnitDependency d, const char *name, bool add_reference, UnitDependencyMask mask);
int unit_add_two_dependencies_by_name(Unit *u, UnitDependency d, UnitDependency e, const char *name, bool add_reference, UnitDependencyMask mask);
2010-04-06 02:43:58 +02:00
int unit_add_exec_dependencies(Unit *u, ExecContext *c);
int unit_choose_id(Unit *u, const char *name);
2010-01-29 03:18:09 +01:00
int unit_set_description(Unit *u, const char *description);
2010-01-26 21:39:06 +01:00
bool unit_may_gc(Unit *u);
2010-01-26 21:39:06 +01:00
/* Queueing helpers. NOTE(review): which queue object each call appends to, and when the queues are
 * dispatched, is not visible in this header. */
void unit_add_to_load_queue(Unit *u);
void unit_add_to_dbus_queue(Unit *u);
void unit_add_to_cleanup_queue(Unit *u);
void unit_add_to_gc_queue(Unit *u);
void unit_add_to_target_deps_queue(Unit *u);
void unit_submit_to_stop_when_unneeded_queue(Unit *u);
/* Merging of units */
int unit_merge(Unit *u, Unit *other);
int unit_merge_by_name(Unit *u, const char *other);

Unit *unit_follow_merge(Unit *u) _pure_;

/* Loading of unit configuration */
int unit_load_fragment_and_dropin(Unit *u);
int unit_load_fragment_and_dropin_optional(Unit *u);
int unit_load(Unit *unit);

/* Slice assignment */
int unit_set_slice(Unit *u, Unit *slice);
int unit_set_default_slice(Unit *u);

const char *unit_description(Unit *u) _pure_;
bool unit_has_name(const Unit *u, const char *name);
2010-01-26 21:39:06 +01:00
UnitActiveState unit_active_state(Unit *u);
const char* unit_sub_state_to_string(Unit *u);
2010-01-26 21:39:06 +01:00
void unit_dump(Unit *u, FILE *f, const char *prefix);
bool unit_can_reload(Unit *u) _pure_;
bool unit_can_start(Unit *u) _pure_;
bool unit_can_stop(Unit *u) _pure_;
bool unit_can_isolate(Unit *u) _pure_;
2010-01-26 21:39:06 +01:00
/* Generic entry points for the basic unit operations; their signatures mirror the start(), stop(),
 * reload() and kill() callbacks of UnitVTable. */
int unit_start(Unit *u);
int unit_stop(Unit *u);
int unit_reload(Unit *u);

int unit_kill(Unit *u, KillWho w, int signo, sd_bus_error *error);
int unit_kill_common(Unit *u, KillWho who, int signo, pid_t main_pid, pid_t control_pid, sd_bus_error *error);
/* Bit flags for the 'flags' argument of unit_notify() */
typedef enum UnitNotifyFlags {
        UNIT_NOTIFY_RELOAD_FAILURE    = 1 << 0,
        UNIT_NOTIFY_WILL_AUTO_RESTART = 1 << 1,
} UnitNotifyFlags;

/* NOTE(review): 'os' and 'ns' are presumably the old and the new active state of the unit; confirm against
 * the implementation. */
void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, UnitNotifyFlags flags);
/* PID watching */
int unit_watch_pid(Unit *u, pid_t pid);
void unit_unwatch_pid(Unit *u, pid_t pid);
void unit_unwatch_all_pids(Unit *u);

/* Used for tracking service and scope PIDs on cgroupv1 systems, where cgroup notification is not reliable:
 * instead of scanning the cgroup for new/removed processes right away, unit_enqueue_rewatch_pids() just
 * enqueues a request for watching/tidying up the PID sets. The work is then done in a per-unit deferred
 * event loop job which is scheduled at a very low priority, so that requests arriving in quick succession
 * are coalesced and SIGCHLD handling is not delayed for too long. This is done on cgroupsv1 systems only. */
int unit_enqueue_rewatch_pids(Unit *u);
void unit_dequeue_rewatch_pids(Unit *u);
int unit_install_bus_match(Unit *u, sd_bus *bus, const char *name);
int unit_watch_bus_name(Unit *u, const char *name);
void unit_unwatch_bus_name(Unit *u, const char *name);
2010-01-26 21:39:06 +01:00
bool unit_job_is_applicable(Unit *u, JobType j);
2010-01-27 00:15:56 +01:00
int set_unit_path(const char *p);
2010-04-08 00:52:14 +02:00
char *unit_dbus_path(Unit *u);
core: add "invocation ID" concept to service manager This adds a new invocation ID concept to the service manager. The invocation ID identifies each runtime cycle of a unit uniquely. A new randomized 128bit ID is generated each time a unit moves from and inactive to an activating or active state. The primary usecase for this concept is to connect the runtime data PID 1 maintains about a service with the offline data the journal stores about it. Previously we'd use the unit name plus start/stop times, which however is highly racy since the journal will generally process log data after the service already ended. The "invocation ID" kinda matches the "boot ID" concept of the Linux kernel, except that it applies to an individual unit instead of the whole system. The invocation ID is passed to the activated processes as environment variable. It is additionally stored as extended attribute on the cgroup of the unit. The latter is used by journald to automatically retrieve it for each log logged message and attach it to the log entry. The environment variable is very easily accessible, even for unprivileged services. OTOH the extended attribute is only accessible to privileged processes (this is because cgroupfs only supports the "trusted." xattr namespace, not "user."). The environment variable may be altered by services, the extended attribute may not be, hence is the better choice for the journal. Note that reading the invocation ID off the extended attribute from journald is racy, similar to the way reading the unit name for a logging process is. This patch adds APIs to read the invocation ID to sd-id128: sd_id128_get_invocation() may be used in a similar fashion to sd_id128_get_boot(). PID1's own logging is updated to always include the invocation ID when it logs information about a unit. A new bus call GetUnitByInvocationID() is added that allows retrieving a bus path to a unit by its invocation ID. 
The bus path is built using the invocation ID, thus providing a path for referring to a unit that is valid only for the current runtime cycleof it. Outlook for the future: should the kernel eventually allow passing of cgroup information along AF_UNIX/SOCK_DGRAM messages via a unique cgroup id, then we can alter the invocation ID to be generated as hash from that rather than entirely randomly. This way we can derive the invocation race-freely from the messages.
2016-08-30 23:18:46 +02:00
char *unit_dbus_path_invocation_id(Unit *u);
2010-04-08 00:52:14 +02:00
int unit_load_related_unit(Unit *u, const char *type, Unit **_found);

/* Serialization and deserialization of unit state */
bool unit_can_serialize(Unit *u) _pure_;

int unit_serialize(Unit *u, FILE *f, FDSet *fds, bool serialize_jobs);
int unit_deserialize(Unit *u, FILE *f, FDSet *fds);
int unit_deserialize_skip(FILE *f);
int unit_add_node_dependency(Unit *u, const char *what, bool wants, UnitDependency d, UnitDependencyMask mask);
int unit_coldplug(Unit *u);
void unit_catchup(Unit *u);
void unit_status_printf(Unit *u, const char *status, const char *unit_status_msg_format) _printf_(3, 0);
bool unit_need_daemon_reload(Unit *u);
void unit_reset_failed(Unit *u);
Unit *unit_following(Unit *u);
int unit_following_set(Unit *u, Set **s);
const char *unit_slice_name(Unit *u);
bool unit_stop_pending(Unit *u) _pure_;
bool unit_inactive_or_pending(Unit *u) _pure_;
bool unit_active_or_pending(Unit *u);
bool unit_will_restart(Unit *u);
2010-09-01 03:35:04 +02:00
int unit_add_default_target_dependency(Unit *u, Unit *target);
void unit_start_on_failure(Unit *u);
void unit_trigger_notify(Unit *u);
2011-07-31 18:28:02 +02:00
UnitFileState unit_get_unit_file_state(Unit *u);
int unit_get_unit_file_preset(Unit *u);
2011-07-31 18:28:02 +02:00
/* UnitRef helpers. UNIT_DEREF() yields the 'target' member of a UnitRef object (passed by value, not by
 * pointer), UNIT_ISSET() yields whether that member is non-NULL. */
Unit* unit_ref_set(UnitRef *ref, Unit *source, Unit *target);
void unit_ref_unset(UnitRef *ref);
#define UNIT_DEREF(ref) ((ref).target)
#define UNIT_ISSET(ref) (!!(ref).target)
int unit_patch_contexts(Unit *u);
/* Accessors for the optional per-type contexts. NOTE(review): presumably these are located through the
 * *_offset fields of UnitVTable; confirm in the implementation. */
ExecContext *unit_get_exec_context(Unit *u) _pure_;
KillContext *unit_get_kill_context(Unit *u) _pure_;
CGroupContext *unit_get_cgroup_context(Unit *u) _pure_;
ExecRuntime *unit_get_exec_runtime(Unit *u) _pure_;
int unit_setup_exec_runtime(Unit *u);
int unit_setup_dynamic_creds(Unit *u);
/* Helpers for writing settings; they take UnitWriteFlags. unit_write_settingf() takes a printf-style
 * format string as fourth parameter (_printf_(4,5)). */
char* unit_escape_setting(const char *s, UnitWriteFlags flags, char **buf);
char* unit_concat_strv(char **l, UnitWriteFlags flags);
int unit_write_setting(Unit *u, UnitWriteFlags flags, const char *name, const char *data);
int unit_write_settingf(Unit *u, UnitWriteFlags mode, const char *name, const char *format, ...) _printf_(4,5);
int unit_kill_context(Unit *u, KillContext *c, KillOperation k, pid_t main_pid, pid_t control_pid, bool main_pid_alien);
int unit_make_transient(Unit *u);
int unit_require_mounts_for(Unit *u, const char *path, UnitDependencyMask mask);
/* Tells whether a unit type is supported; used by unit_supported() */
bool unit_type_supported(UnitType t);
bool unit_is_pristine(Unit *u);
bool unit_is_unneeded(Unit *u);
/* NOTE(review): presumably the generic counterparts of the control_pid()/main_pid() vtable callbacks */
pid_t unit_control_pid(Unit *u);
pid_t unit_main_pid(Unit *u);
/* Tells whether the type of unit u is supported, by handing u->type to unit_type_supported(). u is
 * dereferenced unconditionally and hence must not be NULL. */
static inline bool unit_supported(Unit *u) {
        bool supported;

        supported = unit_type_supported(u->type);
        return supported;
}
void unit_warn_if_dir_nonempty(Unit *u, const char* where);
int unit_fail_if_noncanonical(Unit *u, const char* where);
int unit_start_limit_test(Unit *u);
void unit_unref_uid(Unit *u, bool destroy_now);
int unit_ref_uid(Unit *u, uid_t uid, bool clean_ipc);
void unit_unref_gid(Unit *u, bool destroy_now);
int unit_ref_gid(Unit *u, gid_t gid, bool clean_ipc);
int unit_ref_uid_gid(Unit *u, uid_t uid, gid_t gid);
void unit_unref_uid_gid(Unit *u, bool destroy_now);
void unit_notify_user_lookup(Unit *u, uid_t uid, gid_t gid);
core: add "invocation ID" concept to service manager This adds a new invocation ID concept to the service manager. The invocation ID identifies each runtime cycle of a unit uniquely. A new randomized 128bit ID is generated each time a unit moves from and inactive to an activating or active state. The primary usecase for this concept is to connect the runtime data PID 1 maintains about a service with the offline data the journal stores about it. Previously we'd use the unit name plus start/stop times, which however is highly racy since the journal will generally process log data after the service already ended. The "invocation ID" kinda matches the "boot ID" concept of the Linux kernel, except that it applies to an individual unit instead of the whole system. The invocation ID is passed to the activated processes as environment variable. It is additionally stored as extended attribute on the cgroup of the unit. The latter is used by journald to automatically retrieve it for each log logged message and attach it to the log entry. The environment variable is very easily accessible, even for unprivileged services. OTOH the extended attribute is only accessible to privileged processes (this is because cgroupfs only supports the "trusted." xattr namespace, not "user."). The environment variable may be altered by services, the extended attribute may not be, hence is the better choice for the journal. Note that reading the invocation ID off the extended attribute from journald is racy, similar to the way reading the unit name for a logging process is. This patch adds APIs to read the invocation ID to sd-id128: sd_id128_get_invocation() may be used in a similar fashion to sd_id128_get_boot(). PID1's own logging is updated to always include the invocation ID when it logs information about a unit. A new bus call GetUnitByInvocationID() is added that allows retrieving a bus path to a unit by its invocation ID. 
The bus path is built using the invocation ID, thus providing a path for referring to a unit that is valid only for the current runtime cycleof it. Outlook for the future: should the kernel eventually allow passing of cgroup information along AF_UNIX/SOCK_DGRAM messages via a unique cgroup id, then we can alter the invocation ID to be generated as hash from that rather than entirely randomly. This way we can derive the invocation race-freely from the messages.
2016-08-30 23:18:46 +02:00
int unit_set_invocation_id(Unit *u, sd_id128_t id);
int unit_acquire_invocation_id(Unit *u);
bool unit_shall_confirm_spawn(Unit *u);
int unit_set_exec_params(Unit *s, ExecParameters *p);
int unit_fork_helper_process(Unit *u, const char *name, pid_t *ret);
void unit_remove_dependencies(Unit *u, UnitDependencyMask mask);
core: implement /run/systemd/units/-based path for passing unit info from PID 1 to journald And let's make use of it to implement two new unit settings with it: 1. LogLevelMax= is a new per-unit setting that may be used to configure log priority filtering: set it to LogLevelMax=notice and only messages of level "notice" and lower (i.e. more important) will be processed, all others are dropped. 2. LogExtraFields= is a new per-unit setting for configuring per-unit journal fields, that are implicitly included in every log record generated by the unit's processes. It takes field/value pairs in the form of FOO=BAR. Also, related to this, one exisiting unit setting is ported to this new facility: 3. The invocation ID is now pulled from /run/systemd/units/ instead of cgroupfs xattrs. This substantially relaxes requirements of systemd on the kernel version and the privileges it runs with (specifically, cgroupfs xattrs are not available in containers, since they are stored in kernel memory, and hence are unsafe to permit to lesser privileged code). /run/systemd/units/ is a new directory, which contains a number of files and symlinks encoding the above information. PID 1 creates and manages these files, and journald reads them from there. Note that this is supposed to be a direct path between PID 1 and the journal only, due to the special runtime environment the journal runs in. Normally, today we shouldn't introduce new interfaces that (mis-)use a file system as IPC framework, and instead just an IPC system, but this is very hard to do between the journal and PID 1, as long as the IPC system is a subject PID 1 manages, and itself a client to the journal. This patch cleans up a couple of types used in journal code: specifically we switch to size_t for a couple of memory-sizing values, as size_t is the right choice for everything that is memory. Fixes: #4089 Fixes: #3041 Fixes: #4441
2017-11-02 19:43:32 +01:00
void unit_export_state_files(Unit *u);
void unit_unlink_state_files(Unit *u);
int unit_prepare_exec(Unit *u);
void unit_warn_leftover_processes(Unit *u);
bool unit_needs_console(Unit *u);
const char *unit_label_path(Unit *u);
int unit_pid_attachable(Unit *unit, pid_t pid, sd_bus_error *error);
void unit_log_success(Unit *u);
void unit_log_failure(Unit *u, const char *result);
/* Logs the outcome of unit u: calls unit_log_success(u) if 'success' is true, and otherwise
 * unit_log_failure(u, result). 'result' is only used in the failure case. */
static inline void unit_log_result(Unit *u, bool success, const char *result) {
        if (!success) {
                unit_log_failure(u, result);
                return;
        }

        unit_log_success(u);
}
/* Logging of process exits. NOTE(review): 'code' and 'status' presumably follow the same convention as the
 * sigchld_event() vtable callback; confirm against the callers. */
void unit_log_process_exit(Unit *u, int level, const char *kind, const char *command, int code, int status);
/* NOTE(review): presumably the generic counterpart of the exit_status() vtable callback, which returns the
 * exit status to propagate in case of FailureAction=exit/SuccessAction=exit; confirm in the implementation. */
int unit_exit_status(Unit *u);
int unit_success_action_exit_status(Unit *u);
int unit_failure_action_exit_status(Unit *u);
/* Macros which append UNIT= or USER_UNIT= to the message */

/* Logs a message on behalf of a unit. 'unit' is evaluated exactly once (it is copied into the local _u
 * first). If it is non-NULL the message is passed to log_object_internal() together with the manager's
 * unit_log_field plus the unit's id, and the manager's invocation_log_field plus the unit's
 * invocation_id_string; the log field is hence derived from the Manager object the unit is attached to.
 * If 'unit' is NULL the message is passed to plain log_internal() instead. 'error' is the errno-style
 * error to log along with the message. The value of the macro is the value returned by whichever of the two
 * functions was called. Relies on the GNU C statement expression and ##__VA_ARGS__ extensions. */
#define log_unit_full(unit, level, error, ...)                          \
        ({                                                              \
                const Unit *_u = (unit);                                \
                _u ? log_object_internal(level, error, __FILE__, __LINE__, __func__, _u->manager->unit_log_field, _u->id, _u->manager->invocation_log_field, _u->invocation_id_string, ##__VA_ARGS__) : \
                        log_internal(level, error, __FILE__, __LINE__, __func__, ##__VA_ARGS__); \
        })
/* Convenience wrappers around log_unit_full() with a fixed log level. The plain variants pass 0 as error,
 * the _errno variants pass the caller-supplied error on. */
#define log_unit_debug(unit, ...) log_unit_full(unit, LOG_DEBUG, 0, ##__VA_ARGS__)
#define log_unit_info(unit, ...) log_unit_full(unit, LOG_INFO, 0, ##__VA_ARGS__)
#define log_unit_notice(unit, ...) log_unit_full(unit, LOG_NOTICE, 0, ##__VA_ARGS__)
#define log_unit_warning(unit, ...) log_unit_full(unit, LOG_WARNING, 0, ##__VA_ARGS__)
#define log_unit_error(unit, ...) log_unit_full(unit, LOG_ERR, 0, ##__VA_ARGS__)
#define log_unit_debug_errno(unit, error, ...) log_unit_full(unit, LOG_DEBUG, error, ##__VA_ARGS__)
#define log_unit_info_errno(unit, error, ...) log_unit_full(unit, LOG_INFO, error, ##__VA_ARGS__)
#define log_unit_notice_errno(unit, error, ...) log_unit_full(unit, LOG_NOTICE, error, ##__VA_ARGS__)
#define log_unit_warning_errno(unit, error, ...) log_unit_full(unit, LOG_WARNING, error, ##__VA_ARGS__)
#define log_unit_error_errno(unit, error, ...) log_unit_full(unit, LOG_ERR, error, ##__VA_ARGS__)
/* Helpers that expand to comma-separated argument lists (a format string followed by its arguments), for
 * use in the argument list of a structured logging call. Unlike log_unit_full() these dereference 'unit'
 * unconditionally, hence it must not be NULL.
 *
 * LOG_UNIT_MESSAGE() prefixes the message with the unit id ("MESSAGE=<id>: ..."); 'fmt' must be a string
 * literal since it is concatenated with the prefix. LOG_UNIT_ID() and LOG_UNIT_INVOCATION_ID() expand to
 * the manager's respective format string followed by the unit's id or invocation_id_string. */
#define LOG_UNIT_MESSAGE(unit, fmt, ...) "MESSAGE=%s: " fmt, (unit)->id, ##__VA_ARGS__
#define LOG_UNIT_ID(unit) (unit)->manager->unit_log_format_string, (unit)->id
#define LOG_UNIT_INVOCATION_ID(unit) (unit)->manager->invocation_log_format_string, (unit)->invocation_id_string
/* Conversion between CollectMode values and their string names. NOTE(review): behaviour for values or
 * strings that are not recognized is not visible in this header (presumably NULL and
 * _COLLECT_MODE_INVALID respectively); confirm in the implementation. */
const char* collect_mode_to_string(CollectMode m) _const_;
CollectMode collect_mode_from_string(const char *s) _pure_;